[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [PATCH V6 21/27] vfio-pci: cpr part 3 (intx)
From: |
Fam Zheng |
Subject: |
Re: [PATCH V6 21/27] vfio-pci: cpr part 3 (intx) |
Date: |
Tue, 29 Mar 2022 12:03:26 +0100 |
On 2021-08-06 14:43, Steve Sistare wrote:
> Preserve vfio INTX state across cpr restart. Preserve VFIOINTx fields as
> follows:
> pin : Recover this from the vfio config in kernel space
> interrupt : Preserve its eventfd descriptor across exec.
> unmask : Ditto
> route.irq : This could perhaps be recovered in vfio_pci_post_load by
> calling pci_device_route_intx_to_irq(pin), whose implementation reads
> config space for a bridge device such as ich9. However, there is no
> guarantee that the bridge vmstate is read before vfio vmstate. Rather
> than fiddling with MigrationPriority for vmstate handlers, explicitly
> save route.irq in vfio vmstate.
> pending : save in vfio vmstate.
> mmap_timeout, mmap_timer : Re-initialize
> bool kvm_accel : Re-initialize
>
> In vfio_realize, defer calling vfio_intx_enable until the vmstate
> is available, in vfio_pci_post_load. Modify vfio_intx_enable and
> vfio_intx_kvm_enable to skip vfio initialization, but still perform
> kvm initialization.
>
> Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Hi Steve,
Not directly related to this patch, but since the context is close: it looks
like this series only takes care of the exec restart mode of vfio-pci. Have you
had any thoughts on the kexec reboot mode with vfio-pci?
The general idea is that if the DMAR context is not lost during kexec, we should
be able to set up the irqfds again and things will just work?
Fam
--
PS some more info below:
I have some local kernel patches to kexec reboot most parts of the host kernel
while keeping the IOMMU DMAR tables in a valid state; with that, not many
additional changes are needed to restore it. A PoC is below (I can share
more details of the kernel changes if this patch makes any sense):
commit f8951e58be86bd6e37f816394a9a73f28d8059fc
Author: Fam Zheng <fam.zheng@bytedance.com>
Date: Mon Mar 21 13:19:49 2022 +0000
cpr: Add live-update support to vfio-pci devices
In cpr-save, always serialize the vfio-pci states.
In cpr-load, add a '-restore' mode that will do
VFIO_GROUP_GET_DEVICE_FD_INTACT and skip DMAR setup, somewhat similar to
the current cpr exec mode.
Signed-off-by: Fam Zheng <fam.zheng@bytedance.com>
diff --git a/hw/pci/msix.c b/hw/pci/msix.c
index 73f4259556..e36f0ef97d 100644
--- a/hw/pci/msix.c
+++ b/hw/pci/msix.c
@@ -584,10 +584,15 @@ void msix_init_vector_notifiers(PCIDevice *dev,
MSIVectorReleaseNotifier release_notifier,
MSIVectorPollNotifier poll_notifier)
{
+ int vector;
+
assert(use_notifier && release_notifier);
dev->msix_vector_use_notifier = use_notifier;
dev->msix_vector_release_notifier = release_notifier;
dev->msix_vector_poll_notifier = poll_notifier;
+ for (vector = 0; vector < dev->msix_entries_nr; ++vector) {
+ msix_handle_mask_update(dev, vector, true);
+ }
}
int msix_set_vector_notifiers(PCIDevice *dev,
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 605ffbb5d0..f1240410a8 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -2066,6 +2066,9 @@ static int vfio_connect_container(VFIOGroup *group,
AddressSpace *as,
bool reused;
VFIOAddressSpace *space;
+ if (restore) {
+ return 0;
+ }
space = vfio_get_address_space(as);
fd = cpr_find_fd("vfio_container_for_group", group->groupid);
reused = (fd > 0);
@@ -2486,7 +2489,8 @@ int vfio_get_device(VFIOGroup *group, const char *name,
fd = cpr_find_fd(name, 0);
reused = (fd >= 0);
if (!reused) {
- fd = ioctl(group->fd, VFIO_GROUP_GET_DEVICE_FD, name);
+ int op = restore ? VFIO_GROUP_GET_DEVICE_FD_INTACT :
VFIO_GROUP_GET_DEVICE_FD;
+ fd = ioctl(group->fd, op, name);
}
if (fd < 0) {
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index e32513c668..9da5f93228 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -361,7 +361,7 @@ static int vfio_intx_enable(VFIOPCIDevice *vdev, Error
**errp)
* Do not alter interrupt state during vfio_realize and cpr-load. The
* reused flag is cleared thereafter.
*/
- if (!vdev->pdev.reused) {
+ if (!vdev->pdev.reused && !restore) {
vfio_disable_interrupts(vdev);
}
@@ -388,7 +388,7 @@ static int vfio_intx_enable(VFIOPCIDevice *vdev, Error
**errp)
fd = event_notifier_get_fd(&vdev->intx.interrupt);
qemu_set_fd_handler(fd, vfio_intx_interrupt, NULL, vdev);
- if (vdev->pdev.reused) {
+ if (vdev->pdev.reused && !restore) {
vfio_intx_reenable_kvm(vdev, &err);
goto finish;
}
@@ -2326,6 +2326,9 @@ static int vfio_pci_hot_reset(VFIOPCIDevice *vdev, bool
single)
int ret, i, count;
bool multi = false;
+ if (restore) {
+ return 0;
+ }
trace_vfio_pci_hot_reset(vdev->vbasedev.name, single ? "one" : "multi");
if (!single) {
@@ -3185,7 +3188,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
kvm_irqchip_add_change_notifier(&vdev->irqchip_change_notifier);
/* Wait until cpr-load reads intx routing data to enable */
- if (!pdev->reused) {
+ if (!pdev->reused && !restore) {
ret = vfio_intx_enable(vdev, errp);
if (ret) {
goto out_deregister;
@@ -3295,7 +3298,7 @@ static void vfio_pci_reset(DeviceState *dev)
VFIOPCIDevice *vdev = VFIO_PCI(dev);
/* Do not reset the device during qemu_system_reset prior to cpr-load */
- if (vdev->pdev.reused) {
+ if (vdev->pdev.reused || restore) {
return;
}
@@ -3429,33 +3432,40 @@ static void vfio_merge_config(VFIOPCIDevice *vdev)
static void vfio_claim_vectors(VFIOPCIDevice *vdev, int nr_vectors, bool msix)
{
- int i, fd;
+ int i, fd, ret;
bool pending = false;
PCIDevice *pdev = &vdev->pdev;
+ pdev->msix_entries_nr = nr_vectors;
vdev->nr_vectors = nr_vectors;
vdev->msi_vectors = g_new0(VFIOMSIVector, nr_vectors);
vdev->interrupt = msix ? VFIO_INT_MSIX : VFIO_INT_MSI;
- for (i = 0; i < nr_vectors; i++) {
- VFIOMSIVector *vector = &vdev->msi_vectors[i];
-
- fd = load_event_fd(vdev, "interrupt", i);
- if (fd >= 0) {
- vfio_vector_init(vdev, i);
- qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL, vector);
+ if (restore) {
+ ret = vfio_enable_vectors(vdev, true);
+ if (ret) {
+ error_report("vfio: failed to enable vectors, %d", ret);
}
+ } else {
+ for (i = 0; i < nr_vectors; i++) {
+ VFIOMSIVector *vector = &vdev->msi_vectors[i];
- if (load_event_fd(vdev, "kvm_interrupt", i) >= 0) {
- vfio_add_kvm_msi_virq(vdev, vector, i, msix);
- }
+ fd = load_event_fd(vdev, "interrupt", i);
+ if (fd >= 0) {
+ vfio_vector_init(vdev, i);
+ qemu_set_fd_handler(fd, vfio_msi_interrupt, NULL, vector);
+ }
- if (msix && msix_is_pending(pdev, i) && msix_is_masked(pdev, i)) {
- set_bit(i, vdev->msix->pending);
- pending = true;
+ if (load_event_fd(vdev, "kvm_interrupt", i) >= 0) {
+ vfio_add_kvm_msi_virq(vdev, vector, i, msix);
+ }
+
+ if (msix && msix_is_pending(pdev, i) && msix_is_masked(pdev, i)) {
+ set_bit(i, vdev->msix->pending);
+ pending = true;
+ }
}
}
-
if (msix) {
memory_region_set_enabled(&pdev->msix_pba_mmio, pending);
}
@@ -3534,7 +3544,7 @@ static const VMStateDescription vfio_intx_vmstate = {
static bool vfio_pci_needed(void *opaque)
{
- return cpr_get_mode() == CPR_MODE_RESTART;
+ return 1;
}
static const VMStateDescription vfio_pci_vmstate = {
diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
index 6241c20fb1..0179b0aa90 100644
--- a/include/sysemu/sysemu.h
+++ b/include/sysemu/sysemu.h
@@ -26,6 +26,7 @@ void configure_rtc(QemuOpts *opts);
void qemu_init_subsystems(void);
extern int autostart;
+extern int restore;
typedef enum {
VGA_NONE, VGA_STD, VGA_CIRRUS, VGA_VMWARE, VGA_XENFB, VGA_QXL,
diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h
index e680594f27..65c3bab074 100644
--- a/linux-headers/linux/vfio.h
+++ b/linux-headers/linux/vfio.h
@@ -188,6 +188,8 @@ struct vfio_group_status {
*/
#define VFIO_GROUP_GET_DEVICE_FD _IO(VFIO_TYPE, VFIO_BASE + 6)
+#define VFIO_GROUP_GET_DEVICE_FD_INTACT _IO(VFIO_TYPE, VFIO_BASE + 21)
+
/* --------------- IOCTLs for DEVICE file descriptors --------------- */
/**
diff --git a/qemu-options.hx b/qemu-options.hx
index 8b90d04cb9..03666a59b3 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -3984,6 +3984,10 @@ SRST
option is experimental.
ERST
+DEF("restore", 0, QEMU_OPTION_restore, \
+ "-restore restore mode",
+ QEMU_ARCH_ALL)
+
DEF("S", 0, QEMU_OPTION_S, \
"-S freeze CPU at startup (use 'c' to start execution)\n",
QEMU_ARCH_ALL)
diff --git a/softmmu/globals.c b/softmmu/globals.c
index a18fd8dcf3..6fcb5846b4 100644
--- a/softmmu/globals.c
+++ b/softmmu/globals.c
@@ -41,6 +41,7 @@ bool enable_cpu_pm;
int nb_nics;
NICInfo nd_table[MAX_NICS];
int autostart = 1;
+int restore;
int vga_interface_type = VGA_NONE;
Chardev *parallel_hds[MAX_PARALLEL_PORTS];
int win2k_install_hack;
diff --git a/softmmu/vl.c b/softmmu/vl.c
index f14e29e622..fba6b577cb 100644
--- a/softmmu/vl.c
+++ b/softmmu/vl.c
@@ -3088,6 +3088,9 @@ void qemu_init(int argc, char **argv, char **envp)
case QEMU_OPTION_S:
autostart = 0;
break;
+ case QEMU_OPTION_restore:
+ restore = 1;
+ break;
case QEMU_OPTION_k:
keyboard_layout = optarg;
break;
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- Re: [PATCH V6 21/27] vfio-pci: cpr part 3 (intx),
Fam Zheng <=