qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH V6 21/27] vfio-pci: cpr part 3 (intx)


From: Fam Zheng
Subject: Re: [PATCH V6 21/27] vfio-pci: cpr part 3 (intx)
Date: Tue, 29 Mar 2022 12:03:26 +0100

On 2021-08-06 14:43, Steve Sistare wrote:
> Preserve vfio INTX state across cpr restart.  Preserve VFIOINTx fields as
> follows:
>   pin : Recover this from the vfio config in kernel space
>   interrupt : Preserve its eventfd descriptor across exec.
>   unmask : Ditto
>   route.irq : This could perhaps be recovered in vfio_pci_post_load by
>     calling pci_device_route_intx_to_irq(pin), whose implementation reads
>     config space for a bridge device such as ich9.  However, there is no
>     guarantee that the bridge vmstate is read before vfio vmstate.  Rather
>     than fiddling with MigrationPriority for vmstate handlers, explicitly
>     save route.irq in vfio vmstate.
>   pending : save in vfio vmstate.
>   mmap_timeout, mmap_timer : Re-initialize
>   bool kvm_accel : Re-initialize
> 
> In vfio_realize, defer calling vfio_intx_enable until the vmstate
> is available, in vfio_pci_post_load.  Modify vfio_intx_enable and
> vfio_intx_kvm_enable to skip vfio initialization, but still perform
> kvm initialization.
> 
> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>

Hi Steve,

Not directly related to this patch, but since the context is close: it looks
like this series only takes care of exec restart mode of vfio-pci, have you had
any thoughts on kexec reboot mode with vfio-pci?

The general idea is if DMAR context is not lost during kexec, we should be able
to set up irqfds again and things will just work?

Fam

--

PS some more info below:

I have some local kernel patches to kexec reboot most part of the host kernel
while keeping IOMMU DMAR tables in a valid state; with that, not many extra
things are needed in addition to restore it. A PoC is like below (I can share
more details of the kernel changes if this patch makes any sense):


commit f8951e58be86bd6e37f816394a9a73f28d8059fc
Author: Fam Zheng <fam.zheng@bytedance.com>
Date:   Mon Mar 21 13:19:49 2022 +0000

    cpr: Add live-update support to vfio-pci devices
    
    In cpr-save, always serialize the vfio-pci states.
    
    In cpr-load, add a '-restore' mode that will do
    VFIO_GROUP_GET_DEVICE_FD_INTACT and skip DMAR setup, somewhat similar to
    the current cpr exec mode.
    
    Signed-off-by: Fam Zheng <fam.zheng@bytedance.com>

diff --git a/hw/pci/msix.c b/hw/pci/msix.c
index 73f4259556..e36f0ef97d 100644
--- a/hw/pci/msix.c
+++ b/hw/pci/msix.c
@@ -584,10 +584,15 @@ void msix_init_vector_notifiers(PCIDevice *dev,
                                 MSIVectorReleaseNotifier release_notifier,
                                 MSIVectorPollNotifier poll_notifier)
 {
+    int vector;
+
     assert(use_notifier && release_notifier);
     dev->msix_vector_use_notifier = use_notifier;
     dev->msix_vector_release_notifier = release_notifier;
     dev->msix_vector_poll_notifier = poll_notifier;
+    for (vector = 0; vector < dev->msix_entries_nr; ++vector) {
+        msix_handle_mask_update(dev, vector, true);
+    }
 }
 
 int msix_set_vector_notifiers(PCIDevice *dev,
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 605ffbb5d0..f1240410a8 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -2066,6 +2066,9 @@ static int vfio_connect_container(VFIOGroup *group, 
AddressSpace *as,
     bool reused;
     VFIOAddressSpace *space;
 
+    if (restore) {
+        return 0;
+    }
     space = vfio_get_address_space(as);
     fd = cpr_find_fd("vfio_container_for_group", group->groupid);
     reused = (fd > 0);
@@ -2486,7 +2489,8 @@ int vfio_get_device(VFIOGroup *group, const char *name,
     fd = cpr_find_fd(name, 0);
     reused = (fd >= 0);
     if (!reused) {
-        fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
+        int op = restore ? VFIO_GROUP_GET_DEVICE_FD_INTACT : 
VFIO_GROUP_GET_DEVICE_FD;
+        fd = ioctl(group->fd, op, name);
     }
 
     if (fd < 0) {
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index e32513c668..9da5f93228 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -361,7 +361,7 @@ static int vfio_intx_enable(VFIOPCIDevice *vdev, Error 
**errp)
      * Do not alter interrupt state during vfio_realize and cpr-load.  The
      * reused flag is cleared thereafter.
      */
-    if (!vdev->pdev.reused) {
+    if (!vdev->pdev.reused && !restore) {
         vfio_disable_interrupts(vdev);
     }
 
@@ -388,7 +388,7 @@ static int vfio_intx_enable(VFIOPCIDevice *vdev, Error 
**errp)
     fd = event_notifier_get_fd(&vdev->intx.interrupt);
     qemu_set_fd_handler(fd, vfio_intx_interrupt, NULL, vdev);
 
-    if (vdev->pdev.reused) {
+    if (vdev->pdev.reused && !restore) {
         vfio_intx_reenable_kvm(vdev, &err);
         goto finish;
     }
@@ -2326,6 +2326,9 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool 
single)
     int ret, i, count;
     bool multi = false;
 
+    if (restore) {
+        return 0;
+    }
     trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi");
 
     if (!single) {
@@ -3185,7 +3188,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
         kvm_irqchip_add_change_notifier(&vdev->irqchip_change_notifier);
 
         /* Wait until cpr-load reads intx routing data to enable */
-        if (!pdev->reused) {
+        if (!pdev->reused && !restore) {
             ret = vfio_intx_enable(vdev, errp);
             if (ret) {
                 goto out_deregister;
@@ -3295,7 +3298,7 @@ static void vfio_pci_reset(DeviceState *dev)
     VFIOPCIDevice *vdev = VFIO_PCI(dev);
 
     /* Do not reset the device during qemu_system_reset prior to cpr-load */
-    if (vdev->pdev.reused) {
+    if (vdev->pdev.reused || restore) {
         return;
     }
 
@@ -3429,33 +3432,40 @@ static void vfio_merge_config(VFIOPCIDevice *vdev)
 
 static void vfio_claim_vectors(VFIOPCIDevice *vdev, int nr_vectors, bool msix)
 {
-    int i, fd;
+    int i, fd, ret;
     bool pending = false;
     PCIDevice *pdev = &vdev->pdev;
 
+    pdev->msix_entries_nr = nr_vectors;
     vdev->nr_vectors = nr_vectors;
     vdev->msi_vectors = g_new0(VFIOMSIVector, nr_vectors);
     vdev->interrupt = msix ? VFIO_INT_MSIX : VFIO_INT_MSI;
 
-    for (i = 0; i < nr_vectors; i++) {
-        VFIOMSIVector *vector = &vdev->msi_vectors[i];
-
-        fd = load_event_fd(vdev, "interrupt", i);
-        if (fd >= 0) {
-            vfio_vector_init(vdev, i);
-            qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL, vector);
+    if (restore) {
+        ret = vfio_enable_vectors(vdev, true);
+        if (ret) {
+            error_report("vfio: failed to enable vectors, %d", ret);
         }
+    } else {
+        for (i = 0; i < nr_vectors; i++) {
+            VFIOMSIVector *vector = &vdev->msi_vectors[i];
 
-        if (load_event_fd(vdev, "kvm_interrupt", i) >= 0) {
-            vfio_add_kvm_msi_virq(vdev, vector, i, msix);
-        }
+            fd = load_event_fd(vdev, "interrupt", i);
+            if (fd >= 0) {
+                vfio_vector_init(vdev, i);
+                qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL, vector);
+            }
 
-        if (msix && msix_is_pending(pdev, i) && msix_is_masked(pdev, i)) {
-            set_bit(i, vdev->msix->pending);
-            pending = true;
+            if (load_event_fd(vdev, "kvm_interrupt", i) >= 0) {
+                vfio_add_kvm_msi_virq(vdev, vector, i, msix);
+            }
+
+            if (msix && msix_is_pending(pdev, i) && msix_is_masked(pdev, i)) {
+                set_bit(i, vdev->msix->pending);
+                pending = true;
+            }
         }
     }
-
     if (msix) {
         memory_region_set_enabled(&pdev->msix_pba_mmio, pending);
     }
@@ -3534,7 +3544,7 @@ static const VMStateDescription vfio_intx_vmstate = {
 
 static bool vfio_pci_needed(void *opaque)
 {
-    return cpr_get_mode() == CPR_MODE_RESTART;
+    return 1;
 }
 
 static const VMStateDescription vfio_pci_vmstate = {
diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
index 6241c20fb1..0179b0aa90 100644
--- a/include/sysemu/sysemu.h
+++ b/include/sysemu/sysemu.h
@@ -26,6 +26,7 @@ void configure_rtc(QemuOpts *opts);
 void qemu_init_subsystems(void);
 
 extern int autostart;
+extern int restore;
 
 typedef enum {
     VGA_NONE, VGA_STD, VGA_CIRRUS, VGA_VMWARE, VGA_XENFB, VGA_QXL,
diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
index e680594f27..65c3bab074 100644
--- a/linux-headers/linux/vfio.h
+++ b/linux-headers/linux/vfio.h
@@ -188,6 +188,8 @@ struct vfio_group_status {
  */
 #define VFIO_GROUP_GET_DEVICE_FD       _IO(VFIO_TYPE, VFIO_BASE + 6)
 
+#define VFIO_GROUP_GET_DEVICE_FD_INTACT        _IO(VFIO_TYPE, VFIO_BASE + 21)
+
 /* --------------- IOCTLs for DEVICE file descriptors --------------- */
 
 /**
diff --git a/qemu-options.hx b/qemu-options.hx
index 8b90d04cb9..03666a59b3 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -3984,6 +3984,10 @@ SRST
     option is experimental.
 ERST
 
+DEF("restore", 0, QEMU_OPTION_restore, \
+    "-restore              restore mode",
+    QEMU_ARCH_ALL)
+
 DEF("S", 0, QEMU_OPTION_S, \
     "-S              freeze CPU at startup (use 'c' to start execution)\n",
     QEMU_ARCH_ALL)
diff --git a/softmmu/globals.c b/softmmu/globals.c
index a18fd8dcf3..6fcb5846b4 100644
--- a/softmmu/globals.c
+++ b/softmmu/globals.c
@@ -41,6 +41,7 @@ bool enable_cpu_pm;
 int nb_nics;
 NICInfo nd_table[MAX_NICS];
 int autostart = 1;
+int restore;
 int vga_interface_type = VGA_NONE;
 Chardev *parallel_hds[MAX_PARALLEL_PORTS];
 int win2k_install_hack;
diff --git a/softmmu/vl.c b/softmmu/vl.c
index f14e29e622..fba6b577cb 100644
--- a/softmmu/vl.c
+++ b/softmmu/vl.c
@@ -3088,6 +3088,9 @@ void qemu_init(int argc, char **argv, char **envp)
             case QEMU_OPTION_S:
                 autostart = 0;
                 break;
+            case QEMU_OPTION_restore:
+                restore = 1;
+                break;
             case QEMU_OPTION_k:
                 keyboard_layout = optarg;
                 break;



reply via email to

[Prev in Thread] Current Thread [Next in Thread]