summaryrefslogtreecommitdiffstats
path: root/include
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-03-24 12:35:59 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2022-03-24 12:35:59 -0700
commit7403e6d8263937dea206dd201fed1ceed190ca18 (patch)
tree72e84c7bc56998c9998e95a4f14ebdc252dded41 /include
parent66711cfea642a162bc99abdefe1645b26dbd778f (diff)
parentf621eb13facb7681a79f4fec8ec6553ae160da76 (diff)
downloadlinux-7403e6d8263937dea206dd201fed1ceed190ca18.tar.gz
linux-7403e6d8263937dea206dd201fed1ceed190ca18.tar.bz2
linux-7403e6d8263937dea206dd201fed1ceed190ca18.zip
Merge tag 'vfio-v5.18-rc1' of https://github.com/awilliam/linux-vfio
Pull VFIO updates from Alex Williamson: - Introduce new device migration uAPI and implement device specific mlx5 vfio-pci variant driver supporting new protocol (Jason Gunthorpe, Yishai Hadas, Leon Romanovsky) - New HiSilicon acc vfio-pci variant driver, also supporting migration interface (Shameer Kolothum, Longfang Liu) - D3hot fixes for vfio-pci-core (Abhishek Sahu) - Document new vfio-pci variant driver acceptance criteria (Alex Williamson) - Fix UML build unresolved ioport_{un}map() functions (Alex Williamson) - Fix MAINTAINERS due to header movement (Lukas Bulwahn) * tag 'vfio-v5.18-rc1' of https://github.com/awilliam/linux-vfio: (31 commits) vfio-pci: Provide reviewers and acceptance criteria for variant drivers MAINTAINERS: adjust entry for header movement in hisilicon qm driver hisi_acc_vfio_pci: Use its own PCI reset_done error handler hisi_acc_vfio_pci: Add support for VFIO live migration crypto: hisilicon/qm: Set the VF QM state register hisi_acc_vfio_pci: Add helper to retrieve the struct pci_driver hisi_acc_vfio_pci: Restrict access to VF dev BAR2 migration region hisi_acc_vfio_pci: add new vfio_pci driver for HiSilicon ACC devices hisi_acc_qm: Move VF PCI device IDs to common header crypto: hisilicon/qm: Move few definitions to common header crypto: hisilicon/qm: Move the QM header to include/linux vfio/mlx5: Fix to not use 0 as NULL pointer PCI/IOV: Fix wrong kernel-doc identifier vfio/mlx5: Use its own PCI reset_done error handler vfio/pci: Expose vfio_pci_core_aer_err_detected() vfio/mlx5: Implement vfio_pci driver for mlx5 devices vfio/mlx5: Expose migration commands over mlx5 device vfio: Remove migration protocol v1 documentation vfio: Extend the device migration protocol with RUNNING_P2P vfio: Define device migration protocol v2 ...
Diffstat (limited to 'include')
-rw-r--r--include/linux/hisi_acc_qm.h490
-rw-r--r--include/linux/mlx5/driver.h3
-rw-r--r--include/linux/mlx5/mlx5_ifc.h147
-rw-r--r--include/linux/pci.h15
-rw-r--r--include/linux/pci_ids.h3
-rw-r--r--include/linux/vfio.h53
-rw-r--r--include/linux/vfio_pci_core.h13
-rw-r--r--include/uapi/linux/vfio.h406
8 files changed, 917 insertions, 213 deletions
diff --git a/include/linux/hisi_acc_qm.h b/include/linux/hisi_acc_qm.h
new file mode 100644
index 000000000000..177f7b7cd414
--- /dev/null
+++ b/include/linux/hisi_acc_qm.h
@@ -0,0 +1,490 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (c) 2019 HiSilicon Limited. */
+#ifndef HISI_ACC_QM_H
+#define HISI_ACC_QM_H
+
+#include <linux/bitfield.h>
+#include <linux/debugfs.h>
+#include <linux/iopoll.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+
+#define QM_QNUM_V1 4096
+#define QM_QNUM_V2 1024
+#define QM_MAX_VFS_NUM_V2 63
+
+/* qm user domain */
+#define QM_ARUSER_M_CFG_1 0x100088
+#define AXUSER_SNOOP_ENABLE BIT(30)
+#define AXUSER_CMD_TYPE GENMASK(14, 12)
+#define AXUSER_CMD_SMMU_NORMAL 1
+#define AXUSER_NS BIT(6)
+#define AXUSER_NO BIT(5)
+#define AXUSER_FP BIT(4)
+#define AXUSER_SSV BIT(0)
+#define AXUSER_BASE (AXUSER_SNOOP_ENABLE | \
+ FIELD_PREP(AXUSER_CMD_TYPE, \
+ AXUSER_CMD_SMMU_NORMAL) | \
+ AXUSER_NS | AXUSER_NO | AXUSER_FP)
+#define QM_ARUSER_M_CFG_ENABLE 0x100090
+#define ARUSER_M_CFG_ENABLE 0xfffffffe
+#define QM_AWUSER_M_CFG_1 0x100098
+#define QM_AWUSER_M_CFG_ENABLE 0x1000a0
+#define AWUSER_M_CFG_ENABLE 0xfffffffe
+#define QM_WUSER_M_CFG_ENABLE 0x1000a8
+#define WUSER_M_CFG_ENABLE 0xffffffff
+
+/* mailbox */
+#define QM_MB_CMD_SQC 0x0
+#define QM_MB_CMD_CQC 0x1
+#define QM_MB_CMD_EQC 0x2
+#define QM_MB_CMD_AEQC 0x3
+#define QM_MB_CMD_SQC_BT 0x4
+#define QM_MB_CMD_CQC_BT 0x5
+#define QM_MB_CMD_SQC_VFT_V2 0x6
+#define QM_MB_CMD_STOP_QP 0x8
+#define QM_MB_CMD_SRC 0xc
+#define QM_MB_CMD_DST 0xd
+
+#define QM_MB_CMD_SEND_BASE 0x300
+#define QM_MB_EVENT_SHIFT 8
+#define QM_MB_BUSY_SHIFT 13
+#define QM_MB_OP_SHIFT 14
+#define QM_MB_CMD_DATA_ADDR_L 0x304
+#define QM_MB_CMD_DATA_ADDR_H 0x308
+#define QM_MB_MAX_WAIT_CNT 6000
+
+/* doorbell */
+#define QM_DOORBELL_CMD_SQ 0
+#define QM_DOORBELL_CMD_CQ 1
+#define QM_DOORBELL_CMD_EQ 2
+#define QM_DOORBELL_CMD_AEQ 3
+
+#define QM_DOORBELL_SQ_CQ_BASE_V2 0x1000
+#define QM_DOORBELL_EQ_AEQ_BASE_V2 0x2000
+#define QM_QP_MAX_NUM_SHIFT 11
+#define QM_DB_CMD_SHIFT_V2 12
+#define QM_DB_RAND_SHIFT_V2 16
+#define QM_DB_INDEX_SHIFT_V2 32
+#define QM_DB_PRIORITY_SHIFT_V2 48
+#define QM_VF_STATE 0x60
+
+/* qm cache */
+#define QM_CACHE_CTL 0x100050
+#define SQC_CACHE_ENABLE BIT(0)
+#define CQC_CACHE_ENABLE BIT(1)
+#define SQC_CACHE_WB_ENABLE BIT(4)
+#define SQC_CACHE_WB_THRD GENMASK(10, 5)
+#define CQC_CACHE_WB_ENABLE BIT(11)
+#define CQC_CACHE_WB_THRD GENMASK(17, 12)
+#define QM_AXI_M_CFG 0x1000ac
+#define AXI_M_CFG 0xffff
+#define QM_AXI_M_CFG_ENABLE 0x1000b0
+#define AM_CFG_SINGLE_PORT_MAX_TRANS 0x300014
+#define AXI_M_CFG_ENABLE 0xffffffff
+#define QM_PEH_AXUSER_CFG 0x1000cc
+#define QM_PEH_AXUSER_CFG_ENABLE 0x1000d0
+#define PEH_AXUSER_CFG 0x401001
+#define PEH_AXUSER_CFG_ENABLE 0xffffffff
+
+#define QM_AXI_RRESP BIT(0)
+#define QM_AXI_BRESP BIT(1)
+#define QM_ECC_MBIT BIT(2)
+#define QM_ECC_1BIT BIT(3)
+#define QM_ACC_GET_TASK_TIMEOUT BIT(4)
+#define QM_ACC_DO_TASK_TIMEOUT BIT(5)
+#define QM_ACC_WB_NOT_READY_TIMEOUT BIT(6)
+#define QM_SQ_CQ_VF_INVALID BIT(7)
+#define QM_CQ_VF_INVALID BIT(8)
+#define QM_SQ_VF_INVALID BIT(9)
+#define QM_DB_TIMEOUT BIT(10)
+#define QM_OF_FIFO_OF BIT(11)
+#define QM_DB_RANDOM_INVALID BIT(12)
+#define QM_MAILBOX_TIMEOUT BIT(13)
+#define QM_FLR_TIMEOUT BIT(14)
+
+#define QM_BASE_NFE (QM_AXI_RRESP | QM_AXI_BRESP | QM_ECC_MBIT | \
+ QM_ACC_GET_TASK_TIMEOUT | QM_DB_TIMEOUT | \
+ QM_OF_FIFO_OF | QM_DB_RANDOM_INVALID | \
+ QM_MAILBOX_TIMEOUT | QM_FLR_TIMEOUT)
+#define QM_BASE_CE QM_ECC_1BIT
+
+#define QM_Q_DEPTH 1024
+#define QM_MIN_QNUM 2
+#define HISI_ACC_SGL_SGE_NR_MAX 255
+#define QM_SHAPER_CFG 0x100164
+#define QM_SHAPER_ENABLE BIT(30)
+#define QM_SHAPER_TYPE1_OFFSET 10
+
+/* page number for queue file region */
+#define QM_DOORBELL_PAGE_NR 1
+
+/* uacce mode of the driver */
+#define UACCE_MODE_NOUACCE 0 /* don't use uacce */
+#define UACCE_MODE_SVA 1 /* use uacce sva mode */
+#define UACCE_MODE_DESC "0(default) means only register to crypto, 1 means both register to crypto and uacce"
+
+enum qm_stop_reason {
+ QM_NORMAL,
+ QM_SOFT_RESET,
+ QM_FLR,
+};
+
+enum qm_state {
+ QM_INIT = 0,
+ QM_START,
+ QM_CLOSE,
+ QM_STOP,
+};
+
+enum qp_state {
+ QP_INIT = 1,
+ QP_START,
+ QP_STOP,
+ QP_CLOSE,
+};
+
+enum qm_hw_ver {
+ QM_HW_UNKNOWN = -1,
+ QM_HW_V1 = 0x20,
+ QM_HW_V2 = 0x21,
+ QM_HW_V3 = 0x30,
+};
+
+enum qm_fun_type {
+ QM_HW_PF,
+ QM_HW_VF,
+};
+
+enum qm_debug_file {
+ CURRENT_QM,
+ CURRENT_Q,
+ CLEAR_ENABLE,
+ DEBUG_FILE_NUM,
+};
+
+enum qm_vf_state {
+ QM_READY = 0,
+ QM_NOT_READY,
+};
+
+struct qm_dfx {
+ atomic64_t err_irq_cnt;
+ atomic64_t aeq_irq_cnt;
+ atomic64_t abnormal_irq_cnt;
+ atomic64_t create_qp_err_cnt;
+ atomic64_t mb_err_cnt;
+};
+
+struct debugfs_file {
+ enum qm_debug_file index;
+ struct mutex lock;
+ struct qm_debug *debug;
+};
+
+struct qm_debug {
+ u32 curr_qm_qp_num;
+ u32 sqe_mask_offset;
+ u32 sqe_mask_len;
+ struct qm_dfx dfx;
+ struct dentry *debug_root;
+ struct dentry *qm_d;
+ struct debugfs_file files[DEBUG_FILE_NUM];
+};
+
+struct qm_shaper_factor {
+ u32 func_qos;
+ u64 cir_b;
+ u64 cir_u;
+ u64 cir_s;
+ u64 cbs_s;
+};
+
+struct qm_dma {
+ void *va;
+ dma_addr_t dma;
+ size_t size;
+};
+
+struct hisi_qm_status {
+ u32 eq_head;
+ bool eqc_phase;
+ u32 aeq_head;
+ bool aeqc_phase;
+ atomic_t flags;
+ int stop_reason;
+};
+
+struct hisi_qm;
+
+struct hisi_qm_err_info {
+ char *acpi_rst;
+ u32 msi_wr_port;
+ u32 ecc_2bits_mask;
+ u32 dev_ce_mask;
+ u32 ce;
+ u32 nfe;
+ u32 fe;
+};
+
+struct hisi_qm_err_status {
+ u32 is_qm_ecc_mbit;
+ u32 is_dev_ecc_mbit;
+};
+
+struct hisi_qm_err_ini {
+ int (*hw_init)(struct hisi_qm *qm);
+ void (*hw_err_enable)(struct hisi_qm *qm);
+ void (*hw_err_disable)(struct hisi_qm *qm);
+ u32 (*get_dev_hw_err_status)(struct hisi_qm *qm);
+ void (*clear_dev_hw_err_status)(struct hisi_qm *qm, u32 err_sts);
+ void (*open_axi_master_ooo)(struct hisi_qm *qm);
+ void (*close_axi_master_ooo)(struct hisi_qm *qm);
+ void (*open_sva_prefetch)(struct hisi_qm *qm);
+ void (*close_sva_prefetch)(struct hisi_qm *qm);
+ void (*log_dev_hw_err)(struct hisi_qm *qm, u32 err_sts);
+ void (*err_info_init)(struct hisi_qm *qm);
+};
+
+struct hisi_qm_list {
+ struct mutex lock;
+ struct list_head list;
+ int (*register_to_crypto)(struct hisi_qm *qm);
+ void (*unregister_from_crypto)(struct hisi_qm *qm);
+};
+
+struct hisi_qm {
+ enum qm_hw_ver ver;
+ enum qm_fun_type fun_type;
+ const char *dev_name;
+ struct pci_dev *pdev;
+ void __iomem *io_base;
+ void __iomem *db_io_base;
+ u32 sqe_size;
+ u32 qp_base;
+ u32 qp_num;
+ u32 qp_in_used;
+ u32 ctrl_qp_num;
+ u32 max_qp_num;
+ u32 vfs_num;
+ u32 db_interval;
+ struct list_head list;
+ struct hisi_qm_list *qm_list;
+
+ struct qm_dma qdma;
+ struct qm_sqc *sqc;
+ struct qm_cqc *cqc;
+ struct qm_eqe *eqe;
+ struct qm_aeqe *aeqe;
+ dma_addr_t sqc_dma;
+ dma_addr_t cqc_dma;
+ dma_addr_t eqe_dma;
+ dma_addr_t aeqe_dma;
+
+ struct hisi_qm_status status;
+ const struct hisi_qm_err_ini *err_ini;
+ struct hisi_qm_err_info err_info;
+ struct hisi_qm_err_status err_status;
+ unsigned long misc_ctl; /* driver removing and reset sched */
+
+ struct rw_semaphore qps_lock;
+ struct idr qp_idr;
+ struct hisi_qp *qp_array;
+
+ struct mutex mailbox_lock;
+
+ const struct hisi_qm_hw_ops *ops;
+
+ struct qm_debug debug;
+
+ u32 error_mask;
+
+ struct workqueue_struct *wq;
+ struct work_struct work;
+ struct work_struct rst_work;
+ struct work_struct cmd_process;
+
+ const char *algs;
+ bool use_sva;
+ bool is_frozen;
+
+ /* doorbell isolation enable */
+ bool use_db_isolation;
+ resource_size_t phys_base;
+ resource_size_t db_phys_base;
+ struct uacce_device *uacce;
+ int mode;
+ struct qm_shaper_factor *factor;
+ u32 mb_qos;
+ u32 type_rate;
+};
+
+struct hisi_qp_status {
+ atomic_t used;
+ u16 sq_tail;
+ u16 cq_head;
+ bool cqc_phase;
+ atomic_t flags;
+};
+
+struct hisi_qp_ops {
+ int (*fill_sqe)(void *sqe, void *q_parm, void *d_parm);
+};
+
+struct hisi_qp {
+ u32 qp_id;
+ u8 alg_type;
+ u8 req_type;
+
+ struct qm_dma qdma;
+ void *sqe;
+ struct qm_cqe *cqe;
+ dma_addr_t sqe_dma;
+ dma_addr_t cqe_dma;
+
+ struct hisi_qp_status qp_status;
+ struct hisi_qp_ops *hw_ops;
+ void *qp_ctx;
+ void (*req_cb)(struct hisi_qp *qp, void *data);
+ void (*event_cb)(struct hisi_qp *qp);
+
+ struct hisi_qm *qm;
+ bool is_resetting;
+ bool is_in_kernel;
+ u16 pasid;
+ struct uacce_queue *uacce_q;
+};
+
+static inline int q_num_set(const char *val, const struct kernel_param *kp,
+ unsigned int device)
+{
+ struct pci_dev *pdev = pci_get_device(PCI_VENDOR_ID_HUAWEI,
+ device, NULL);
+ u32 n, q_num;
+ int ret;
+
+ if (!val)
+ return -EINVAL;
+
+ if (!pdev) {
+ q_num = min_t(u32, QM_QNUM_V1, QM_QNUM_V2);
+ pr_info("No device found currently, suppose queue number is %u\n",
+ q_num);
+ } else {
+ if (pdev->revision == QM_HW_V1)
+ q_num = QM_QNUM_V1;
+ else
+ q_num = QM_QNUM_V2;
+ }
+
+ ret = kstrtou32(val, 10, &n);
+ if (ret || n < QM_MIN_QNUM || n > q_num)
+ return -EINVAL;
+
+ return param_set_int(val, kp);
+}
+
+static inline int vfs_num_set(const char *val, const struct kernel_param *kp)
+{
+ u32 n;
+ int ret;
+
+ if (!val)
+ return -EINVAL;
+
+ ret = kstrtou32(val, 10, &n);
+ if (ret < 0)
+ return ret;
+
+ if (n > QM_MAX_VFS_NUM_V2)
+ return -EINVAL;
+
+ return param_set_int(val, kp);
+}
+
+static inline int mode_set(const char *val, const struct kernel_param *kp)
+{
+ u32 n;
+ int ret;
+
+ if (!val)
+ return -EINVAL;
+
+ ret = kstrtou32(val, 10, &n);
+ if (ret != 0 || (n != UACCE_MODE_SVA &&
+ n != UACCE_MODE_NOUACCE))
+ return -EINVAL;
+
+ return param_set_int(val, kp);
+}
+
+static inline int uacce_mode_set(const char *val, const struct kernel_param *kp)
+{
+ return mode_set(val, kp);
+}
+
+static inline void hisi_qm_init_list(struct hisi_qm_list *qm_list)
+{
+ INIT_LIST_HEAD(&qm_list->list);
+ mutex_init(&qm_list->lock);
+}
+
+int hisi_qm_init(struct hisi_qm *qm);
+void hisi_qm_uninit(struct hisi_qm *qm);
+int hisi_qm_start(struct hisi_qm *qm);
+int hisi_qm_stop(struct hisi_qm *qm, enum qm_stop_reason r);
+struct hisi_qp *hisi_qm_create_qp(struct hisi_qm *qm, u8 alg_type);
+int hisi_qm_start_qp(struct hisi_qp *qp, unsigned long arg);
+int hisi_qm_stop_qp(struct hisi_qp *qp);
+void hisi_qm_release_qp(struct hisi_qp *qp);
+int hisi_qp_send(struct hisi_qp *qp, const void *msg);
+int hisi_qm_get_free_qp_num(struct hisi_qm *qm);
+int hisi_qm_get_vft(struct hisi_qm *qm, u32 *base, u32 *number);
+void hisi_qm_debug_init(struct hisi_qm *qm);
+enum qm_hw_ver hisi_qm_get_hw_version(struct pci_dev *pdev);
+void hisi_qm_debug_regs_clear(struct hisi_qm *qm);
+int hisi_qm_sriov_enable(struct pci_dev *pdev, int max_vfs);
+int hisi_qm_sriov_disable(struct pci_dev *pdev, bool is_frozen);
+int hisi_qm_sriov_configure(struct pci_dev *pdev, int num_vfs);
+void hisi_qm_dev_err_init(struct hisi_qm *qm);
+void hisi_qm_dev_err_uninit(struct hisi_qm *qm);
+pci_ers_result_t hisi_qm_dev_err_detected(struct pci_dev *pdev,
+ pci_channel_state_t state);
+pci_ers_result_t hisi_qm_dev_slot_reset(struct pci_dev *pdev);
+void hisi_qm_reset_prepare(struct pci_dev *pdev);
+void hisi_qm_reset_done(struct pci_dev *pdev);
+
+int hisi_qm_wait_mb_ready(struct hisi_qm *qm);
+int hisi_qm_mb(struct hisi_qm *qm, u8 cmd, dma_addr_t dma_addr, u16 queue,
+ bool op);
+
+struct hisi_acc_sgl_pool;
+struct hisi_acc_hw_sgl *hisi_acc_sg_buf_map_to_hw_sgl(struct device *dev,
+ struct scatterlist *sgl, struct hisi_acc_sgl_pool *pool,
+ u32 index, dma_addr_t *hw_sgl_dma);
+void hisi_acc_sg_buf_unmap(struct device *dev, struct scatterlist *sgl,
+ struct hisi_acc_hw_sgl *hw_sgl);
+struct hisi_acc_sgl_pool *hisi_acc_create_sgl_pool(struct device *dev,
+ u32 count, u32 sge_nr);
+void hisi_acc_free_sgl_pool(struct device *dev,
+ struct hisi_acc_sgl_pool *pool);
+int hisi_qm_alloc_qps_node(struct hisi_qm_list *qm_list, int qp_num,
+ u8 alg_type, int node, struct hisi_qp **qps);
+void hisi_qm_free_qps(struct hisi_qp **qps, int qp_num);
+void hisi_qm_dev_shutdown(struct pci_dev *pdev);
+void hisi_qm_wait_task_finish(struct hisi_qm *qm, struct hisi_qm_list *qm_list);
+int hisi_qm_alg_register(struct hisi_qm *qm, struct hisi_qm_list *qm_list);
+void hisi_qm_alg_unregister(struct hisi_qm *qm, struct hisi_qm_list *qm_list);
+int hisi_qm_resume(struct device *dev);
+int hisi_qm_suspend(struct device *dev);
+void hisi_qm_pm_uninit(struct hisi_qm *qm);
+void hisi_qm_pm_init(struct hisi_qm *qm);
+int hisi_qm_get_dfx_access(struct hisi_qm *qm);
+void hisi_qm_put_dfx_access(struct hisi_qm *qm);
+void hisi_qm_regs_dump(struct seq_file *s, struct debugfs_regset32 *regset);
+
+/* Used by VFIO ACC live migration driver */
+struct pci_driver *hisi_sec_get_pf_driver(void);
+struct pci_driver *hisi_hpre_get_pf_driver(void);
+struct pci_driver *hisi_zip_get_pf_driver(void);
+#endif
diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h
index 78655d8d13a7..319322a8ff94 100644
--- a/include/linux/mlx5/driver.h
+++ b/include/linux/mlx5/driver.h
@@ -1143,6 +1143,9 @@ int mlx5_dm_sw_icm_alloc(struct mlx5_core_dev *dev, enum mlx5_sw_icm_type type,
int mlx5_dm_sw_icm_dealloc(struct mlx5_core_dev *dev, enum mlx5_sw_icm_type type,
u64 length, u16 uid, phys_addr_t addr, u32 obj_id);
+struct mlx5_core_dev *mlx5_vf_get_core_dev(struct pci_dev *pdev);
+void mlx5_vf_put_core_dev(struct mlx5_core_dev *mdev);
+
#ifdef CONFIG_MLX5_CORE_IPOIB
struct net_device *mlx5_rdma_netdev_alloc(struct mlx5_core_dev *mdev,
struct ib_device *ibdev,
diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
index 49a48d7709ac..0235054fe55c 100644
--- a/include/linux/mlx5/mlx5_ifc.h
+++ b/include/linux/mlx5/mlx5_ifc.h
@@ -127,6 +127,11 @@ enum {
MLX5_CMD_OP_QUERY_SF_PARTITION = 0x111,
MLX5_CMD_OP_ALLOC_SF = 0x113,
MLX5_CMD_OP_DEALLOC_SF = 0x114,
+ MLX5_CMD_OP_SUSPEND_VHCA = 0x115,
+ MLX5_CMD_OP_RESUME_VHCA = 0x116,
+ MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE = 0x117,
+ MLX5_CMD_OP_SAVE_VHCA_STATE = 0x118,
+ MLX5_CMD_OP_LOAD_VHCA_STATE = 0x119,
MLX5_CMD_OP_CREATE_MKEY = 0x200,
MLX5_CMD_OP_QUERY_MKEY = 0x201,
MLX5_CMD_OP_DESTROY_MKEY = 0x202,
@@ -1757,7 +1762,9 @@ struct mlx5_ifc_cmd_hca_cap_bits {
u8 reserved_at_682[0x1];
u8 log_max_sf[0x5];
u8 apu[0x1];
- u8 reserved_at_689[0x7];
+ u8 reserved_at_689[0x4];
+ u8 migration[0x1];
+ u8 reserved_at_68e[0x2];
u8 log_min_sf_size[0x8];
u8 max_num_sf_partitions[0x8];
@@ -11518,4 +11525,142 @@ enum {
MLX5_MTT_PERM_RW = MLX5_MTT_PERM_READ | MLX5_MTT_PERM_WRITE,
};
+enum {
+ MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR = 0x0,
+ MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER = 0x1,
+};
+
+struct mlx5_ifc_suspend_vhca_in_bits {
+ u8 opcode[0x10];
+ u8 uid[0x10];
+
+ u8 reserved_at_20[0x10];
+ u8 op_mod[0x10];
+
+ u8 reserved_at_40[0x10];
+ u8 vhca_id[0x10];
+
+ u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_suspend_vhca_out_bits {
+ u8 status[0x8];
+ u8 reserved_at_8[0x18];
+
+ u8 syndrome[0x20];
+
+ u8 reserved_at_40[0x40];
+};
+
+enum {
+ MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_RESPONDER = 0x0,
+ MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR = 0x1,
+};
+
+struct mlx5_ifc_resume_vhca_in_bits {
+ u8 opcode[0x10];
+ u8 uid[0x10];
+
+ u8 reserved_at_20[0x10];
+ u8 op_mod[0x10];
+
+ u8 reserved_at_40[0x10];
+ u8 vhca_id[0x10];
+
+ u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_resume_vhca_out_bits {
+ u8 status[0x8];
+ u8 reserved_at_8[0x18];
+
+ u8 syndrome[0x20];
+
+ u8 reserved_at_40[0x40];
+};
+
+struct mlx5_ifc_query_vhca_migration_state_in_bits {
+ u8 opcode[0x10];
+ u8 uid[0x10];
+
+ u8 reserved_at_20[0x10];
+ u8 op_mod[0x10];
+
+ u8 reserved_at_40[0x10];
+ u8 vhca_id[0x10];
+
+ u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_query_vhca_migration_state_out_bits {
+ u8 status[0x8];
+ u8 reserved_at_8[0x18];
+
+ u8 syndrome[0x20];
+
+ u8 reserved_at_40[0x40];
+
+ u8 required_umem_size[0x20];
+
+ u8 reserved_at_a0[0x160];
+};
+
+struct mlx5_ifc_save_vhca_state_in_bits {
+ u8 opcode[0x10];
+ u8 uid[0x10];
+
+ u8 reserved_at_20[0x10];
+ u8 op_mod[0x10];
+
+ u8 reserved_at_40[0x10];
+ u8 vhca_id[0x10];
+
+ u8 reserved_at_60[0x20];
+
+ u8 va[0x40];
+
+ u8 mkey[0x20];
+
+ u8 size[0x20];
+};
+
+struct mlx5_ifc_save_vhca_state_out_bits {
+ u8 status[0x8];
+ u8 reserved_at_8[0x18];
+
+ u8 syndrome[0x20];
+
+ u8 actual_image_size[0x20];
+
+ u8 reserved_at_60[0x20];
+};
+
+struct mlx5_ifc_load_vhca_state_in_bits {
+ u8 opcode[0x10];
+ u8 uid[0x10];
+
+ u8 reserved_at_20[0x10];
+ u8 op_mod[0x10];
+
+ u8 reserved_at_40[0x10];
+ u8 vhca_id[0x10];
+
+ u8 reserved_at_60[0x20];
+
+ u8 va[0x40];
+
+ u8 mkey[0x20];
+
+ u8 size[0x20];
+};
+
+struct mlx5_ifc_load_vhca_state_out_bits {
+ u8 status[0x8];
+ u8 reserved_at_8[0x18];
+
+ u8 syndrome[0x20];
+
+ u8 reserved_at_40[0x40];
+};
+
#endif /* MLX5_IFC_H */
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 8253a5413d7c..60d423d8f0c4 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -2166,7 +2166,8 @@ void __iomem *pci_ioremap_wc_bar(struct pci_dev *pdev, int bar);
#ifdef CONFIG_PCI_IOV
int pci_iov_virtfn_bus(struct pci_dev *dev, int id);
int pci_iov_virtfn_devfn(struct pci_dev *dev, int id);
-
+int pci_iov_vf_id(struct pci_dev *dev);
+void *pci_iov_get_pf_drvdata(struct pci_dev *dev, struct pci_driver *pf_driver);
int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn);
void pci_disable_sriov(struct pci_dev *dev);
@@ -2194,6 +2195,18 @@ static inline int pci_iov_virtfn_devfn(struct pci_dev *dev, int id)
{
return -ENOSYS;
}
+
+static inline int pci_iov_vf_id(struct pci_dev *dev)
+{
+ return -ENOSYS;
+}
+
+static inline void *pci_iov_get_pf_drvdata(struct pci_dev *dev,
+ struct pci_driver *pf_driver)
+{
+ return ERR_PTR(-EINVAL);
+}
+
static inline int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn)
{ return -ENODEV; }
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index aad54c666407..31dee2b65a62 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -2529,6 +2529,9 @@
#define PCI_DEVICE_ID_KORENIX_JETCARDF3 0x17ff
#define PCI_VENDOR_ID_HUAWEI 0x19e5
+#define PCI_DEVICE_ID_HUAWEI_ZIP_VF 0xa251
+#define PCI_DEVICE_ID_HUAWEI_SEC_VF 0xa256
+#define PCI_DEVICE_ID_HUAWEI_HPRE_VF 0xa259
#define PCI_VENDOR_ID_NETRONOME 0x19ee
#define PCI_DEVICE_ID_NETRONOME_NFP4000 0x4000
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 76191d7abed1..66dda06ec42d 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -33,6 +33,7 @@ struct vfio_device {
struct vfio_group *group;
struct vfio_device_set *dev_set;
struct list_head dev_set_list;
+ unsigned int migration_flags;
/* Members below here are private, not for driver use */
refcount_t refcount;
@@ -55,6 +56,17 @@ struct vfio_device {
* @match: Optional device name match callback (return: 0 for no-match, >0 for
* match, -errno for abort (ex. match with insufficient or incorrect
* additional args)
+ * @device_feature: Optional, fill in the VFIO_DEVICE_FEATURE ioctl
+ * @migration_set_state: Optional callback to change the migration state for
+ * devices that support migration. It's mandatory for
+ * VFIO_DEVICE_FEATURE_MIGRATION migration support.
+ * The returned FD is used for data transfer according to the FSM
+ * definition. The driver is responsible to ensure that FD reaches end
+ * of stream or error whenever the migration FSM leaves a data transfer
+ * state or before close_device() returns.
+ * @migration_get_state: Optional callback to get the migration state for
+ * devices that support migration. It's mandatory for
+ * VFIO_DEVICE_FEATURE_MIGRATION migration support.
*/
struct vfio_device_ops {
char *name;
@@ -69,8 +81,44 @@ struct vfio_device_ops {
int (*mmap)(struct vfio_device *vdev, struct vm_area_struct *vma);
void (*request)(struct vfio_device *vdev, unsigned int count);
int (*match)(struct vfio_device *vdev, char *buf);
+ int (*device_feature)(struct vfio_device *device, u32 flags,
+ void __user *arg, size_t argsz);
+ struct file *(*migration_set_state)(
+ struct vfio_device *device,
+ enum vfio_device_mig_state new_state);
+ int (*migration_get_state)(struct vfio_device *device,
+ enum vfio_device_mig_state *curr_state);
};
+/**
+ * vfio_check_feature - Validate user input for the VFIO_DEVICE_FEATURE ioctl
+ * @flags: Arg from the device_feature op
+ * @argsz: Arg from the device_feature op
+ * @supported_ops: Combination of VFIO_DEVICE_FEATURE_GET and SET the driver
+ * supports
+ * @minsz: Minimum data size the driver accepts
+ *
+ * For use in a driver's device_feature op. Checks that the inputs to the
+ * VFIO_DEVICE_FEATURE ioctl are correct for the driver's feature. Returns 1 if
+ * the driver should execute the get or set, otherwise the relevant
+ * value should be returned.
+ */
+static inline int vfio_check_feature(u32 flags, size_t argsz, u32 supported_ops,
+ size_t minsz)
+{
+ if ((flags & (VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_SET)) &
+ ~supported_ops)
+ return -EINVAL;
+ if (flags & VFIO_DEVICE_FEATURE_PROBE)
+ return 0;
+ /* Without PROBE one of GET or SET must be requested */
+ if (!(flags & (VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_SET)))
+ return -EINVAL;
+ if (argsz < minsz)
+ return -EINVAL;
+ return 1;
+}
+
void vfio_init_group_dev(struct vfio_device *device, struct device *dev,
const struct vfio_device_ops *ops);
void vfio_uninit_group_dev(struct vfio_device *device);
@@ -82,6 +130,11 @@ extern void vfio_device_put(struct vfio_device *device);
int vfio_assign_device_set(struct vfio_device *device, void *set_id);
+int vfio_mig_get_next_state(struct vfio_device *device,
+ enum vfio_device_mig_state cur_fsm,
+ enum vfio_device_mig_state new_fsm,
+ enum vfio_device_mig_state *next_fsm);
+
/*
* External user API
*/
diff --git a/include/linux/vfio_pci_core.h b/include/linux/vfio_pci_core.h
index ef9a44b6cf5d..74a4a0f17b28 100644
--- a/include/linux/vfio_pci_core.h
+++ b/include/linux/vfio_pci_core.h
@@ -159,8 +159,17 @@ extern ssize_t vfio_pci_config_rw(struct vfio_pci_core_device *vdev,
extern ssize_t vfio_pci_bar_rw(struct vfio_pci_core_device *vdev, char __user *buf,
size_t count, loff_t *ppos, bool iswrite);
+#ifdef CONFIG_VFIO_PCI_VGA
extern ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev, char __user *buf,
size_t count, loff_t *ppos, bool iswrite);
+#else
+static inline ssize_t vfio_pci_vga_rw(struct vfio_pci_core_device *vdev,
+ char __user *buf, size_t count,
+ loff_t *ppos, bool iswrite)
+{
+ return -EINVAL;
+}
+#endif
extern long vfio_pci_ioeventfd(struct vfio_pci_core_device *vdev, loff_t offset,
uint64_t data, int count, int fd);
@@ -220,6 +229,8 @@ int vfio_pci_core_sriov_configure(struct pci_dev *pdev, int nr_virtfn);
extern const struct pci_error_handlers vfio_pci_core_err_handlers;
long vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd,
unsigned long arg);
+int vfio_pci_core_ioctl_feature(struct vfio_device *device, u32 flags,
+ void __user *arg, size_t argsz);
ssize_t vfio_pci_core_read(struct vfio_device *core_vdev, char __user *buf,
size_t count, loff_t *ppos);
ssize_t vfio_pci_core_write(struct vfio_device *core_vdev, const char __user *buf,
@@ -230,6 +241,8 @@ int vfio_pci_core_match(struct vfio_device *core_vdev, char *buf);
int vfio_pci_core_enable(struct vfio_pci_core_device *vdev);
void vfio_pci_core_disable(struct vfio_pci_core_device *vdev);
void vfio_pci_core_finish_enable(struct vfio_pci_core_device *vdev);
+pci_ers_result_t vfio_pci_core_aer_err_detected(struct pci_dev *pdev,
+ pci_channel_state_t state);
static inline bool vfio_pci_is_vga(struct pci_dev *pdev)
{
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index ef33ea002b0b..fea86061b44e 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -323,7 +323,7 @@ struct vfio_region_info_cap_type {
#define VFIO_REGION_TYPE_PCI_VENDOR_MASK (0xffff)
#define VFIO_REGION_TYPE_GFX (1)
#define VFIO_REGION_TYPE_CCW (2)
-#define VFIO_REGION_TYPE_MIGRATION (3)
+#define VFIO_REGION_TYPE_MIGRATION_DEPRECATED (3)
/* sub-types for VFIO_REGION_TYPE_PCI_* */
@@ -405,225 +405,29 @@ struct vfio_region_gfx_edid {
#define VFIO_REGION_SUBTYPE_CCW_CRW (3)
/* sub-types for VFIO_REGION_TYPE_MIGRATION */
-#define VFIO_REGION_SUBTYPE_MIGRATION (1)
-
-/*
- * The structure vfio_device_migration_info is placed at the 0th offset of
- * the VFIO_REGION_SUBTYPE_MIGRATION region to get and set VFIO device related
- * migration information. Field accesses from this structure are only supported
- * at their native width and alignment. Otherwise, the result is undefined and
- * vendor drivers should return an error.
- *
- * device_state: (read/write)
- * - The user application writes to this field to inform the vendor driver
- * about the device state to be transitioned to.
- * - The vendor driver should take the necessary actions to change the
- * device state. After successful transition to a given state, the
- * vendor driver should return success on write(device_state, state)
- * system call. If the device state transition fails, the vendor driver
- * should return an appropriate -errno for the fault condition.
- * - On the user application side, if the device state transition fails,
- * that is, if write(device_state, state) returns an error, read
- * device_state again to determine the current state of the device from
- * the vendor driver.
- * - The vendor driver should return previous state of the device unless
- * the vendor driver has encountered an internal error, in which case
- * the vendor driver may report the device_state VFIO_DEVICE_STATE_ERROR.
- * - The user application must use the device reset ioctl to recover the
- * device from VFIO_DEVICE_STATE_ERROR state. If the device is
- * indicated to be in a valid device state by reading device_state, the
- * user application may attempt to transition the device to any valid
- * state reachable from the current state or terminate itself.
- *
- * device_state consists of 3 bits:
- * - If bit 0 is set, it indicates the _RUNNING state. If bit 0 is clear,
- * it indicates the _STOP state. When the device state is changed to
- * _STOP, driver should stop the device before write() returns.
- * - If bit 1 is set, it indicates the _SAVING state, which means that the
- * driver should start gathering device state information that will be
- * provided to the VFIO user application to save the device's state.
- * - If bit 2 is set, it indicates the _RESUMING state, which means that
- * the driver should prepare to resume the device. Data provided through
- * the migration region should be used to resume the device.
- * Bits 3 - 31 are reserved for future use. To preserve them, the user
- * application should perform a read-modify-write operation on this
- * field when modifying the specified bits.
- *
- * +------- _RESUMING
- * |+------ _SAVING
- * ||+----- _RUNNING
- * |||
- * 000b => Device Stopped, not saving or resuming
- * 001b => Device running, which is the default state
- * 010b => Stop the device & save the device state, stop-and-copy state
- * 011b => Device running and save the device state, pre-copy state
- * 100b => Device stopped and the device state is resuming
- * 101b => Invalid state
- * 110b => Error state
- * 111b => Invalid state
- *
- * State transitions:
- *
- * _RESUMING _RUNNING Pre-copy Stop-and-copy _STOP
- * (100b) (001b) (011b) (010b) (000b)
- * 0. Running or default state
- * |
- *
- * 1. Normal Shutdown (optional)
- * |------------------------------------->|
- *
- * 2. Save the state or suspend
- * |------------------------->|---------->|
- *
- * 3. Save the state during live migration
- * |----------->|------------>|---------->|
- *
- * 4. Resuming
- * |<---------|
- *
- * 5. Resumed
- * |--------->|
- *
- * 0. Default state of VFIO device is _RUNNING when the user application starts.
- * 1. During normal shutdown of the user application, the user application may
- * optionally change the VFIO device state from _RUNNING to _STOP. This
- * transition is optional. The vendor driver must support this transition but
- * must not require it.
- * 2. When the user application saves state or suspends the application, the
- * device state transitions from _RUNNING to stop-and-copy and then to _STOP.
- * On state transition from _RUNNING to stop-and-copy, driver must stop the
- * device, save the device state and send it to the application through the
- * migration region. The sequence to be followed for such transition is given
- * below.
- * 3. In live migration of user application, the state transitions from _RUNNING
- * to pre-copy, to stop-and-copy, and to _STOP.
- * On state transition from _RUNNING to pre-copy, the driver should start
- * gathering the device state while the application is still running and send
- * the device state data to application through the migration region.
- * On state transition from pre-copy to stop-and-copy, the driver must stop
- * the device, save the device state and send it to the user application
- * through the migration region.
- * Vendor drivers must support the pre-copy state even for implementations
- * where no data is provided to the user before the stop-and-copy state. The
- * user must not be required to consume all migration data before the device
- * transitions to a new state, including the stop-and-copy state.
- * The sequence to be followed for above two transitions is given below.
- * 4. To start the resuming phase, the device state should be transitioned from
- * the _RUNNING to the _RESUMING state.
- * In the _RESUMING state, the driver should use the device state data
- * received through the migration region to resume the device.
- * 5. After providing saved device data to the driver, the application should
- * change the state from _RESUMING to _RUNNING.
- *
- * reserved:
- * Reads on this field return zero and writes are ignored.
- *
- * pending_bytes: (read only)
- * The number of pending bytes still to be migrated from the vendor driver.
- *
- * data_offset: (read only)
- * The user application should read data_offset field from the migration
- * region. The user application should read the device data from this
- * offset within the migration region during the _SAVING state or write
- * the device data during the _RESUMING state. See below for details of
- * sequence to be followed.
- *
- * data_size: (read/write)
- * The user application should read data_size to get the size in bytes of
- * the data copied in the migration region during the _SAVING state and
- * write the size in bytes of the data copied in the migration region
- * during the _RESUMING state.
- *
- * The format of the migration region is as follows:
- * ------------------------------------------------------------------
- * |vfio_device_migration_info| data section |
- * | | /////////////////////////////// |
- * ------------------------------------------------------------------
- * ^ ^
- * offset 0-trapped part data_offset
- *
- * The structure vfio_device_migration_info is always followed by the data
- * section in the region, so data_offset will always be nonzero. The offset
- * from where the data is copied is decided by the kernel driver. The data
- * section can be trapped, mmapped, or partitioned, depending on how the kernel
- * driver defines the data section. The data section partition can be defined
- * as mapped by the sparse mmap capability. If mmapped, data_offset must be
- * page aligned, whereas initial section which contains the
- * vfio_device_migration_info structure, might not end at the offset, which is
- * page aligned. The user is not required to access through mmap regardless
- * of the capabilities of the region mmap.
- * The vendor driver should determine whether and how to partition the data
- * section. The vendor driver should return data_offset accordingly.
- *
- * The sequence to be followed while in pre-copy state and stop-and-copy state
- * is as follows:
- * a. Read pending_bytes, indicating the start of a new iteration to get device
- * data. Repeated read on pending_bytes at this stage should have no side
- * effects.
- * If pending_bytes == 0, the user application should not iterate to get data
- * for that device.
- * If pending_bytes > 0, perform the following steps.
- * b. Read data_offset, indicating that the vendor driver should make data
- * available through the data section. The vendor driver should return this
- * read operation only after data is available from (region + data_offset)
- * to (region + data_offset + data_size).
- * c. Read data_size, which is the amount of data in bytes available through
- * the migration region.
- * Read on data_offset and data_size should return the offset and size of
- * the current buffer if the user application reads data_offset and
- * data_size more than once here.
- * d. Read data_size bytes of data from (region + data_offset) from the
- * migration region.
- * e. Process the data.
- * f. Read pending_bytes, which indicates that the data from the previous
- * iteration has been read. If pending_bytes > 0, go to step b.
- *
- * The user application can transition from the _SAVING|_RUNNING
- * (pre-copy state) to the _SAVING (stop-and-copy) state regardless of the
- * number of pending bytes. The user application should iterate in _SAVING
- * (stop-and-copy) until pending_bytes is 0.
- *
- * The sequence to be followed while _RESUMING device state is as follows:
- * While data for this device is available, repeat the following steps:
- * a. Read data_offset from where the user application should write data.
- * b. Write migration data starting at the migration region + data_offset for
- * the length determined by data_size from the migration source.
- * c. Write data_size, which indicates to the vendor driver that data is
- * written in the migration region. Vendor driver must return this write
- * operations on consuming data. Vendor driver should apply the
- * user-provided migration region data to the device resume state.
- *
- * If an error occurs during the above sequences, the vendor driver can return
- * an error code for next read() or write() operation, which will terminate the
- * loop. The user application should then take the next necessary action, for
- * example, failing migration or terminating the user application.
- *
- * For the user application, data is opaque. The user application should write
- * data in the same order as the data is received and the data should be of
- * same transaction size at the source.
- */
+#define VFIO_REGION_SUBTYPE_MIGRATION_DEPRECATED (1)
struct vfio_device_migration_info {
__u32 device_state; /* VFIO device state */
-#define VFIO_DEVICE_STATE_STOP (0)
-#define VFIO_DEVICE_STATE_RUNNING (1 << 0)
-#define VFIO_DEVICE_STATE_SAVING (1 << 1)
-#define VFIO_DEVICE_STATE_RESUMING (1 << 2)
-#define VFIO_DEVICE_STATE_MASK (VFIO_DEVICE_STATE_RUNNING | \
- VFIO_DEVICE_STATE_SAVING | \
- VFIO_DEVICE_STATE_RESUMING)
+#define VFIO_DEVICE_STATE_V1_STOP (0)
+#define VFIO_DEVICE_STATE_V1_RUNNING (1 << 0)
+#define VFIO_DEVICE_STATE_V1_SAVING (1 << 1)
+#define VFIO_DEVICE_STATE_V1_RESUMING (1 << 2)
+#define VFIO_DEVICE_STATE_MASK (VFIO_DEVICE_STATE_V1_RUNNING | \
+ VFIO_DEVICE_STATE_V1_SAVING | \
+ VFIO_DEVICE_STATE_V1_RESUMING)
#define VFIO_DEVICE_STATE_VALID(state) \
- (state & VFIO_DEVICE_STATE_RESUMING ? \
- (state & VFIO_DEVICE_STATE_MASK) == VFIO_DEVICE_STATE_RESUMING : 1)
+ (state & VFIO_DEVICE_STATE_V1_RESUMING ? \
+ (state & VFIO_DEVICE_STATE_MASK) == VFIO_DEVICE_STATE_V1_RESUMING : 1)
#define VFIO_DEVICE_STATE_IS_ERROR(state) \
- ((state & VFIO_DEVICE_STATE_MASK) == (VFIO_DEVICE_STATE_SAVING | \
- VFIO_DEVICE_STATE_RESUMING))
+ ((state & VFIO_DEVICE_STATE_MASK) == (VFIO_DEVICE_STATE_V1_SAVING | \
+ VFIO_DEVICE_STATE_V1_RESUMING))
#define VFIO_DEVICE_STATE_SET_ERROR(state) \
- ((state & ~VFIO_DEVICE_STATE_MASK) | VFIO_DEVICE_SATE_SAVING | \
- VFIO_DEVICE_STATE_RESUMING)
+ ((state & ~VFIO_DEVICE_STATE_MASK) | VFIO_DEVICE_STATE_V1_SAVING | \
+ VFIO_DEVICE_STATE_V1_RESUMING)
__u32 reserved;
__u64 pending_bytes;
@@ -1002,6 +806,186 @@ struct vfio_device_feature {
*/
#define VFIO_DEVICE_FEATURE_PCI_VF_TOKEN (0)
+/*
+ * Indicates the device can support the migration API through
+ * VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE. If this GET succeeds, the RUNNING and
+ * ERROR states are always supported. Support for additional states is
+ * indicated via the flags field; at least VFIO_MIGRATION_STOP_COPY must be
+ * set.
+ *
+ * VFIO_MIGRATION_STOP_COPY means that STOP, STOP_COPY and
+ * RESUMING are supported.
+ *
+ * VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P means that RUNNING_P2P
+ * is supported in addition to the STOP_COPY states.
+ *
+ * Other combinations of flags have behavior to be defined in the future.
+ */
+struct vfio_device_feature_migration {
+ __aligned_u64 flags;
+#define VFIO_MIGRATION_STOP_COPY (1 << 0)
+#define VFIO_MIGRATION_P2P (1 << 1)
+};
+#define VFIO_DEVICE_FEATURE_MIGRATION 1
+
+/*
+ * Upon VFIO_DEVICE_FEATURE_SET, execute a migration state change on the VFIO
+ * device. The new state is supplied in device_state, see enum
+ * vfio_device_mig_state for details
+ *
+ * The kernel migration driver must fully transition the device to the new state
+ * value before the operation returns to the user.
+ *
+ * The kernel migration driver must not generate asynchronous device state
+ * transitions outside of manipulation by the user or the VFIO_DEVICE_RESET
+ * ioctl as described above.
+ *
+ * If this function fails then current device_state may be the original
+ * operating state or some other state along the combination transition path.
+ * The user can then decide if it should execute a VFIO_DEVICE_RESET, attempt
+ * to return to the original state, or attempt to return to some other state
+ * such as RUNNING or STOP.
+ *
+ * If the new_state starts a new data transfer session then the FD associated
+ * with that session is returned in data_fd. The user is responsible to close
+ * this FD when it is finished. The user must consider the migration data stream
+ * carried over the FD to be opaque and must preserve the byte order of the
+ * stream. The user is not required to preserve buffer segmentation when writing
+ * the data stream during the RESUMING operation.
+ *
+ * Upon VFIO_DEVICE_FEATURE_GET, get the current migration state of the VFIO
+ * device, data_fd will be -1.
+ */
+struct vfio_device_feature_mig_state {
+ __u32 device_state; /* From enum vfio_device_mig_state */
+ __s32 data_fd;
+};
+#define VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE 2
+
+/*
+ * The device migration Finite State Machine is described by the enum
+ * vfio_device_mig_state. Some of the FSM arcs will create a migration data
+ * transfer session by returning a FD, in this case the migration data will
+ * flow over the FD using read() and write() as discussed below.
+ *
+ * There are 5 states to support VFIO_MIGRATION_STOP_COPY:
+ * RUNNING - The device is running normally
+ * STOP - The device does not change the internal or external state
+ * STOP_COPY - The device internal state can be read out
+ * RESUMING - The device is stopped and is loading a new internal state
+ * ERROR - The device has failed and must be reset
+ *
+ * And 1 optional state to support VFIO_MIGRATION_P2P:
+ * RUNNING_P2P - RUNNING, except the device cannot do peer to peer DMA
+ *
+ * The FSM takes actions on the arcs between FSM states. The driver implements
+ * the following behavior for the FSM arcs:
+ *
+ * RUNNING_P2P -> STOP
+ * STOP_COPY -> STOP
+ * While in STOP the device must stop the operation of the device. The device
+ * must not generate interrupts, DMA, or any other change to external state.
+ * It must not change its internal state. When stopped the device and kernel
+ * migration driver must accept and respond to interaction to support external
+ * subsystems in the STOP state, for example PCI MSI-X and PCI config space.
+ * Failure by the user to restrict device access while in STOP must not result
+ * in error conditions outside the user context (ex. host system faults).
+ *
+ * The STOP_COPY arc will terminate a data transfer session.
+ *
+ * RESUMING -> STOP
+ * Leaving RESUMING terminates a data transfer session and indicates the
+ * device should complete processing of the data delivered by write(). The
+ * kernel migration driver should complete the incorporation of data written
+ * to the data transfer FD into the device internal state and perform
+ * final validity and consistency checking of the new device state. If the
+ * user provided data is found to be incomplete, inconsistent, or otherwise
+ * invalid, the migration driver must fail the SET_STATE ioctl and
+ * optionally go to the ERROR state as described below.
+ *
+ * While in STOP the device has the same behavior as other STOP states
+ * described above.
+ *
+ * To abort a RESUMING session the device must be reset.
+ *
+ * RUNNING_P2P -> RUNNING
+ * While in RUNNING the device is fully operational, the device may generate
+ * interrupts, DMA, respond to MMIO, all vfio device regions are functional,
+ * and the device may advance its internal state.
+ *
+ * RUNNING -> RUNNING_P2P
+ * STOP -> RUNNING_P2P
+ * While in RUNNING_P2P the device is partially running in the P2P quiescent
+ * state defined below.
+ *
+ * STOP -> STOP_COPY
+ * This arc begin the process of saving the device state and will return a
+ * new data_fd.
+ *
+ * While in the STOP_COPY state the device has the same behavior as STOP
+ * with the addition that the data transfers session continues to stream the
+ * migration state. End of stream on the FD indicates the entire device
+ * state has been transferred.
+ *
+ * The user should take steps to restrict access to vfio device regions while
+ * the device is in STOP_COPY or risk corruption of the device migration data
+ * stream.
+ *
+ * STOP -> RESUMING
+ * Entering the RESUMING state starts a process of restoring the device state
+ * and will return a new data_fd. The data stream fed into the data_fd should
+ * be taken from the data transfer output of a single FD during saving from
+ * a compatible device. The migration driver may alter/reset the internal
+ * device state for this arc if required to prepare the device to receive the
+ * migration data.
+ *
+ * any -> ERROR
+ * ERROR cannot be specified as a device state, however any transition request
+ * can be failed with an errno return and may then move the device_state into
+ * ERROR. In this case the device was unable to execute the requested arc and
+ * was also unable to restore the device to any valid device_state.
+ * To recover from ERROR VFIO_DEVICE_RESET must be used to return the
+ * device_state back to RUNNING.
+ *
+ * The optional peer to peer (P2P) quiescent state is intended to be a quiescent
+ * state for the device for the purposes of managing multiple devices within a
+ * user context where peer-to-peer DMA between devices may be active. The
+ * RUNNING_P2P states must prevent the device from initiating
+ * any new P2P DMA transactions. If the device can identify P2P transactions
+ * then it can stop only P2P DMA, otherwise it must stop all DMA. The migration
+ * driver must complete any such outstanding operations prior to completing the
+ * FSM arc into a P2P state. For the purpose of specification the states
+ * behave as though the device was fully running if not supported. Like while in
+ * STOP or STOP_COPY the user must not touch the device, otherwise the state
+ * can be exited.
+ *
+ * The remaining possible transitions are interpreted as combinations of the
+ * above FSM arcs. As there are multiple paths through the FSM arcs the path
+ * should be selected based on the following rules:
+ * - Select the shortest path.
+ * Refer to vfio_mig_get_next_state() for the result of the algorithm.
+ *
+ * The automatic transit through the FSM arcs that make up the combination
+ * transition is invisible to the user. When working with combination arcs the
+ * user may see any step along the path in the device_state if SET_STATE
+ * fails. When handling these types of errors users should anticipate future
+ * revisions of this protocol using new states and those states becoming
+ * visible in this case.
+ *
+ * The optional states cannot be used with SET_STATE if the device does not
+ * support them. The user can discover if these states are supported by using
+ * VFIO_DEVICE_FEATURE_MIGRATION. By using combination transitions the user can
+ * avoid knowing about these optional states if the kernel driver supports them.
+ */
+enum vfio_device_mig_state {
+ VFIO_DEVICE_STATE_ERROR = 0,
+ VFIO_DEVICE_STATE_STOP = 1,
+ VFIO_DEVICE_STATE_RUNNING = 2,
+ VFIO_DEVICE_STATE_STOP_COPY = 3,
+ VFIO_DEVICE_STATE_RESUMING = 4,
+ VFIO_DEVICE_STATE_RUNNING_P2P = 5,
+};
+
/* -------- API for Type1 VFIO IOMMU -------- */
/**