qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [Xen-devel] [PATCH V5 10/10] Introduce Xen PCI Passthro


From: Konrad Rzeszutek Wilk
Subject: Re: [Qemu-devel] [Xen-devel] [PATCH V5 10/10] Introduce Xen PCI Passthrough, MSI (3/3)
Date: Mon, 5 Dec 2011 20:02:40 -0400
User-agent: Mutt/1.5.9i

On Thu, Nov 24, 2011 at 05:44:39PM +0000, Anthony PERARD wrote:
> From: Jiang Yunhong <address@hidden>
> 
> A more complete history can be found here:
> git://xenbits.xensource.com/qemu-xen-unstable.git
> 
> Signed-off-by: Jiang Yunhong <address@hidden>
> Signed-off-by: Shan Haitao <address@hidden>
> Signed-off-by: Anthony PERARD <address@hidden>
> ---
>  Makefile.target                      |    1 +
>  hw/apic-msidef.h                     |    2 +
>  hw/xen_pci_passthrough.c             |   60 +++-
>  hw/xen_pci_passthrough.h             |   55 +++
>  hw/xen_pci_passthrough_config_init.c |  505 +++++++++++++++++++++++++-
>  hw/xen_pci_passthrough_msi.c         |  678 
> ++++++++++++++++++++++++++++++++++
>  6 files changed, 1294 insertions(+), 7 deletions(-)
>  create mode 100644 hw/xen_pci_passthrough_msi.c
> 
> diff --git a/Makefile.target b/Makefile.target
> index 33435a3..81cff70 100644
> --- a/Makefile.target
> +++ b/Makefile.target
> @@ -223,6 +223,7 @@ obj-i386-$(CONFIG_XEN) += xen_platform.o
>  obj-i386-$(CONFIG_XEN_PCI_PASSTHROUGH) += host-pci-device.o
>  obj-i386-$(CONFIG_XEN_PCI_PASSTHROUGH) += xen_pci_passthrough.o
>  obj-i386-$(CONFIG_XEN_PCI_PASSTHROUGH) += xen_pci_passthrough_config_init.o
> +obj-i386-$(CONFIG_XEN_PCI_PASSTHROUGH) += xen_pci_passthrough_msi.o
>  
>  # Inter-VM PCI shared memory
>  CONFIG_IVSHMEM =
> diff --git a/hw/apic-msidef.h b/hw/apic-msidef.h
> index 3182f0b..6e2eb71 100644
> --- a/hw/apic-msidef.h
> +++ b/hw/apic-msidef.h
> @@ -22,6 +22,8 @@
>  
>  #define MSI_ADDR_DEST_MODE_SHIFT        2
>  
> +#define MSI_ADDR_REDIRECTION_SHIFT      3
> +
>  #define MSI_ADDR_DEST_ID_SHIFT          12
>  #define  MSI_ADDR_DEST_ID_MASK          0x00ffff0
>  
> diff --git a/hw/xen_pci_passthrough.c b/hw/xen_pci_passthrough.c
> index c816ed5..cd7e3c7 100644
> --- a/hw/xen_pci_passthrough.c
> +++ b/hw/xen_pci_passthrough.c
> @@ -38,6 +38,39 @@
>   *     Write '1'
>   *       <ptdev->msi_trans_en is false>
>   *         - Set real bit to '1'.
> + *
> + * MSI-INTx translation.
> + *   Initialize(xc_physdev_map_pirq_msi/pt_msi_setup)
> + *     Bind MSI-INTx(xc_domain_bind_pt_irq)
> + *       <fail>
> + *         - Unmap MSI.
> + *           <success>
> + *             - Set dev->msi->pirq to '-1'.
> + *           <fail>
> + *             - Do nothing.
> + *
> + *   Write to Interrupt Disable bit by guest software(pt_cmd_reg_write)
> + *     Write '0'
> + *       <ptdev->msi_trans_en is true>
> + *         - Set MSI Enable bit to '1'.
> + *
> + *     Write '1'
> + *       <ptdev->msi_trans_en is true>
> + *         - Set MSI Enable bit to '0'.
> + *
> + * MSI interrupt:
> + *   Initialize MSI register(pt_msi_setup, pt_msi_update)
> + *     Bind MSI(xc_domain_update_msi_irq)
> + *       <fail>
> + *         - Unmap MSI.
> + *         - Set dev->msi->pirq to '-1'.
> + *
> + * MSI-X interrupt:
> + *   Initialize MSI-X register(pt_msix_update_one)
> + *     Bind MSI-X(xc_domain_update_msi_irq)
> + *       <fail>
> + *         - Unmap MSI-X.
> + *         - Set entry->pirq to '-1'.
>   */
>  
>  #include <sys/ioctl.h>
> @@ -389,6 +422,7 @@ static void pt_iomem_map(XenPCIPassthroughState *s, int i,
>      }
>  
>      if (!first_map && old_ebase != PT_PCI_BAR_UNMAPPED) {
> +        pt_add_msix_mapping(s, i);
>          /* Remove old mapping */
>          memory_region_del_subregion(r->address_space,
>                                      r->memory);
> @@ -417,6 +451,15 @@ static void pt_iomem_map(XenPCIPassthroughState *s, int 
> i,
>          if (ret != 0) {
>              PT_ERR(&s->dev, "create new mapping failed!\n");
>          }
> +
> +        ret = pt_remove_msix_mapping(s, i);
> +        if (ret != 0) {
> +            PT_ERR(&s->dev, "Remove MSI-X MMIO mapping failed!\n");
> +        }
> +
> +        if (old_ebase != e_phys && old_ebase != -1) {
> +            pt_msix_update_remap(s, i);
> +        }
>      }
>  }
>  
> @@ -744,6 +787,9 @@ static int pt_initfn(PCIDevice *pcidev)
>          mapped_machine_irq[machine_irq]++;
>      }
>  
> +    /* setup MSI-INTx translation if support */
> +    rc = pt_enable_msi_translate(s);
> +
>      /* bind machine_irq to device */
>      if (rc < 0 && machine_irq != 0) {
>          uint8_t e_device = PCI_SLOT(s->dev.devfn);
> @@ -773,7 +819,8 @@ static int pt_initfn(PCIDevice *pcidev)
>  
>  out:
>      PT_LOG(pcidev, "Real physical device %02x:%02x.%x registered 
> successfuly!"
> -           "\nIRQ type = %s\n", bus, slot, func, "INTx");
> +           "\nIRQ type = %s\n", bus, slot, func,
> +           s->msi_trans_en ? "MSI-INTx" : "INTx");
>  
>      return 0;
>  }
> @@ -790,7 +837,7 @@ static int pt_unregister_device(PCIDevice *pcidev)
>      e_intx = pci_intx(s);
>      machine_irq = s->machine_irq;
>  
> -    if (machine_irq) {
> +    if (s->msi_trans_en == 0 && machine_irq) {

It is defined as 'bool' so you could do !s->msi_trans_en instead?

>          rc = xc_domain_unbind_pt_irq(xen_xc, xen_domid, machine_irq,
>                                       PT_IRQ_TYPE_PCI, 0, e_device, e_intx, 
> 0);
>          if (rc < 0) {
> @@ -798,6 +845,13 @@ static int pt_unregister_device(PCIDevice *pcidev)
>          }
>      }
>  
> +    if (s->msi) {
> +        pt_msi_disable(s);
> +    }
> +    if (s->msix) {
> +        pt_msix_disable(s);
> +    }
> +
>      if (machine_irq) {
>          mapped_machine_irq[machine_irq]--;
>  
> @@ -832,6 +886,8 @@ static PCIDeviceInfo xen_pci_passthrough = {
>      .is_express = 0,
>      .qdev.props = (Property[]) {
>          DEFINE_PROP_STRING("hostaddr", XenPCIPassthroughState, hostaddr),
> +        DEFINE_PROP_BIT("msitranslate", XenPCIPassthroughState, 
> msi_trans_cap,
> +                        0, true),
>          DEFINE_PROP_BIT("power-mgmt", XenPCIPassthroughState, power_mgmt,
>                          0, false),
>          DEFINE_PROP_END_OF_LIST(),
> diff --git a/hw/xen_pci_passthrough.h b/hw/xen_pci_passthrough.h
> index 110325c..acbbab5 100644
> --- a/hw/xen_pci_passthrough.h
> +++ b/hw/xen_pci_passthrough.h
> @@ -74,6 +74,10 @@ typedef int (*conf_byte_restore)
>  #define PT_PCI_BAR_UNMAPPED (-1)
>  #define PT_UNASSIGNED_PIRQ (-1)
>  
> +/* MSI-X */
> +#define PT_MSI_FLAG_UNINIT 0x1000
> +#define PT_MSI_FLAG_MAPPED 0x2000


Can you explain what those two defines do please?

> +
>  
>  typedef enum {
>      GRP_TYPE_HARDWIRED = 0,                     /* 0 Hardwired reg group */
> @@ -177,6 +181,34 @@ typedef struct XenPTRegGroup {
>  } XenPTRegGroup;
>  
>  
> +typedef struct XenPTMSI {
> +    uint32_t flags;
> +    uint32_t ctrl_offset; /* saved control offset */
> +    int pirq;          /* guest pirq corresponding */
> +    uint32_t addr_lo;  /* guest message address */
> +    uint32_t addr_hi;  /* guest message upper address */
> +    uint16_t data;     /* guest message data */
> +} XenPTMSI;
> +
> +typedef struct XenMSIXEntry {
> +    int pirq;        /* -1 means unmapped */
> +    int flags;       /* flags indicting whether MSI ADDR or DATA is updated 
> */
> +    uint32_t io_mem[4];

Can you document what is in each position? Perhaps some enums or
#defines for them?

> +} XenMSIXEntry;
> +typedef struct XenPTMSIX {
> +    uint32_t ctrl_offset;
> +    int enabled;
> +    int total_entries;
> +    int bar_index;
> +    uint64_t table_base;
> +    uint32_t table_off;
> +    uint32_t table_offset_adjust; /* page align mmap */
> +    uint64_t mmio_base_addr;
> +    int mmio_index;
> +    void *phys_iomem_base;
> +    XenMSIXEntry msix_entry[0];
> +} XenPTMSIX;
> +
>  typedef struct XenPTPM {
>      QEMUTimer *pm_timer;  /* QEMUTimer struct */
>      int no_soft_reset;    /* No Soft Reset flags */
> @@ -200,6 +232,13 @@ struct XenPCIPassthroughState {
>  
>      uint32_t machine_irq;
>  
> +    XenPTMSI *msi;
> +    XenPTMSIX *msix;
> +
> +    /* Physical MSI to guest INTx translation when possible */

Meaning that the MSI vector is going to show up as INTx in the guest?

This is by default enabled, right? At least that is what
txt/misc/vtd.txt

tells me. So would it be better than to have 'msi_trans_off' as flag
as by default we would enable it? So would it be better than to have
'msi_trans_off' as flag?

> +    uint32_t msi_trans_cap;
> +    bool msi_trans_en;
> +
>      uint32_t power_mgmt;
>      XenPTPM *pm_state;
>  
> @@ -279,4 +318,20 @@ static inline uint8_t pci_intx(XenPCIPassthroughState *s)
>      return r_val;
>  }
>  
> +/* MSI/MSI-X */
> +void pt_msi_set_enable(XenPCIPassthroughState *s, int en);

You could make it a bool. So "bool en" instead?
> +int pt_msi_setup(XenPCIPassthroughState *s);
> +int pt_msi_update(XenPCIPassthroughState *d);
> +void pt_msi_disable(XenPCIPassthroughState *s);
> +int pt_enable_msi_translate(XenPCIPassthroughState *s);
> +void pt_disable_msi_translate(XenPCIPassthroughState *s);
> +
> +int pt_msix_init(XenPCIPassthroughState *s, int pos);

uint32_t as pos?
> +void pt_msix_delete(XenPCIPassthroughState *s);
> +int pt_msix_update(XenPCIPassthroughState *s);
> +int pt_msix_update_remap(XenPCIPassthroughState *s, int bar_index);

uint32_t as bar_index?
> +void pt_msix_disable(XenPCIPassthroughState *s);
> +int pt_add_msix_mapping(XenPCIPassthroughState *s, int bar_index);
> +int pt_remove_msix_mapping(XenPCIPassthroughState *s, int bar_index);
> +
>  #endif /* !QEMU_HW_XEN_PCI_PASSTHROUGH_H */
> diff --git a/hw/xen_pci_passthrough_config_init.c 
> b/hw/xen_pci_passthrough_config_init.c
> index ae64544..28523f1 100644
> --- a/hw/xen_pci_passthrough_config_init.c
> +++ b/hw/xen_pci_passthrough_config_init.c
> @@ -398,11 +398,19 @@ static int pt_cmd_reg_write(XenPCIPassthroughState *s, 
> XenPTReg *cfg_entry,
>      throughable_mask = ~emu_mask & valid_mask;
>  
>      if (*value & PCI_COMMAND_INTX_DISABLE) {
> -        throughable_mask |= PCI_COMMAND_INTX_DISABLE;
> -    } else {
> -        if (s->machine_irq) {
> +        if (s->msi_trans_en) {
> +            pt_msi_set_enable(s, 0);

Can you put a comment explaining the '0' argument?
> +        } else {
>              throughable_mask |= PCI_COMMAND_INTX_DISABLE;
>          }
> +    } else {
> +        if (s->msi_trans_en) {
> +            pt_msi_set_enable(s, 1);
> +        } else {
> +            if (s->machine_irq) {
> +                throughable_mask |= PCI_COMMAND_INTX_DISABLE;
> +            }
> +        }
>      }
>  
>      *value = PT_MERGE_VALUE(*value, dev_value, throughable_mask);
> @@ -1282,13 +1290,21 @@ static void 
> pt_reset_interrupt_and_io_mapping(XenPCIPassthroughState *s)
>      e_device = PCI_SLOT(s->dev.devfn);
>      e_intx = pci_intx(s);
>  
> -    if (s->machine_irq) {
> +    if (s->msi_trans_en == 0 && s->machine_irq) {
>          if (xc_domain_unbind_pt_irq(xen_xc, xen_domid, s->machine_irq,
>                                      PT_IRQ_TYPE_PCI, 0, e_device, e_intx, 
> 0)) {
>              PT_ERR(d, "Unbinding of interrupt failed!\n");
>          }
>      }
>  
> +    /* disable MSI/MSI-X and MSI-INTx translation */
> +    if (s->msi) {
> +        pt_msi_disable(s);
> +    }
> +    if (s->msix) {
> +        pt_msix_disable(s);
> +    }
> +
>      /* clear all virtual region address */
>      for (i = 0; i < PCI_NUM_REGIONS; i++) {
>          r = &d->io_regions[i];
> @@ -1536,6 +1552,415 @@ static XenPTRegInfo pt_emu_reg_pm_tbl[] = {
>      },
>  };
>  
> +/********************************
> + * MSI Capability
> + */
> +
> +/* Message Control register */
> +static int pt_msgctrl_reg_init(XenPCIPassthroughState *s,
> +                               XenPTRegInfo *reg, uint32_t real_offset,
> +                               uint32_t *data)
> +{
> +    PCIDevice *d = &s->dev;
> +    uint16_t reg_field = 0;
> +
> +    /* use I/O device register's value as initial value */
> +    reg_field = pci_get_word(d->config + real_offset);

There is no 'pci_get_halfword' operations?

> +
> +    if (reg_field & PCI_MSI_FLAGS_ENABLE) {
> +        PT_LOG(&s->dev, "MSI already enabled, disable it first\n");

.. "disabling it first."

> +        host_pci_set_word(s->real_device, real_offset,
> +                          reg_field & ~PCI_MSI_FLAGS_ENABLE);
> +    }
> +    s->msi->flags |= reg_field | PT_MSI_FLAG_UNINIT;

There is no chance that you PT_MSI_FLAG_UNINIT would shadow what
is in the register? You are reading a word after all.. ah, the
reg_field is a 16-bit, so you are populating the unused fields.
Perhaps you could use bit-fields?


> +    s->msi->ctrl_offset = real_offset;
> +
> +    *data = reg->init_val;
> +    return 0;
> +}
> +static int pt_msgctrl_reg_write(XenPCIPassthroughState *s, XenPTReg 
> *cfg_entry,
> +                                uint16_t *value, uint16_t dev_value,
> +                                uint16_t valid_mask)
> +{
> +    XenPTRegInfo *reg = cfg_entry->reg;
> +    uint16_t writable_mask = 0;
> +    uint16_t throughable_mask = 0;
> +    uint16_t val;
> +
> +    /* Currently no support for multi-vector */
> +    if (*value & PCI_MSI_FLAGS_QSIZE) {
> +        PT_WARN(&s->dev, "try to set more than 1 vector ctrl %x\n", *value);


But no return -ENOSPC?

The message should probably read: "Tries to set more.."
> +    }
> +
> +    /* modify emulate register */
> +    writable_mask = reg->emu_mask & ~reg->ro_mask & valid_mask;
> +    cfg_entry->data = PT_MERGE_VALUE(*value, cfg_entry->data, writable_mask);
> +    /* update the msi_info too */

err, msi_info -> msi->flags?

> +    s->msi->flags |= cfg_entry->data &
> +        ~(PT_MSI_FLAG_UNINIT | PT_MSI_FLAG_MAPPED | PCI_MSI_FLAGS_ENABLE);
> +
> +    /* create value for writing to I/O device register */
> +    val = *value;
> +    throughable_mask = ~reg->emu_mask & valid_mask;
> +    *value = PT_MERGE_VALUE(*value, dev_value, throughable_mask);
> +
> +    /* update MSI */
> +    if (val & PCI_MSI_FLAGS_ENABLE) {
> +        /* setup MSI pirq for the first time */
> +        if (s->msi->flags & PT_MSI_FLAG_UNINIT) {
> +            if (s->msi_trans_en) {
> +                PT_LOG(&s->dev,
> +                       "guest enabling MSI, disable MSI-INTx translation\n");
> +                pt_disable_msi_translate(s);
> +            } else {
> +                /* Init physical one */
> +                PT_LOG(&s->dev, "setup MSI\n");
> +                if (pt_msi_setup(s)) {
> +                    /* We do not broadcast the error to the framework code, 
> so
> +                     * that MSI errors are contained in MSI emulation code 
> and
> +                     * QEMU can go on running.
> +                     * Guest MSI would be actually not working.

Not sure if the last sentence is needed?

> +                     */
> +                    *value &= ~PCI_MSI_FLAGS_ENABLE;
> +                    PT_WARN(&s->dev, "Can not map MSI.\n");
> +                    return 0;
> +                }
> +            }
> +            if (pt_msi_update(s)) {
> +                *value &= ~PCI_MSI_FLAGS_ENABLE;
> +                PT_WARN(&s->dev, "Can not bind MSI\n");
> +                return 0;
> +            }
> +            s->msi->flags &= ~PT_MSI_FLAG_UNINIT;
> +            s->msi->flags |= PT_MSI_FLAG_MAPPED;
> +        }
> +        s->msi->flags |= PCI_MSI_FLAGS_ENABLE;
> +    } else {
> +        s->msi->flags &= ~PCI_MSI_FLAGS_ENABLE;
> +    }
> +
> +    /* pass through MSI_ENABLE bit when no MSI-INTx translation */
                                          ^- "there is"


> +    if (!s->msi_trans_en) {
> +        *value &= ~PCI_MSI_FLAGS_ENABLE;
> +        *value |= val & PCI_MSI_FLAGS_ENABLE;
> +    }
> +
> +    return 0;
> +}
> +
> +/* initialize Message Upper Address register */
> +static int pt_msgaddr64_reg_init(XenPCIPassthroughState *ptdev,
> +                                 XenPTRegInfo *reg, uint32_t real_offset,
> +                                 uint32_t *data)
> +{
> +    /* no need to initialize in case of 32 bit type */
> +    if (!(ptdev->msi->flags & PCI_MSI_FLAGS_64BIT)) {
> +        *data = PT_INVALID_REG;
> +    } else {
> +        *data = reg->init_val;
> +    }
> +
> +    return 0;
> +}
> +/* this function will be called twice (for 32 bit and 64 bit type) */
> +/* initialize Message Data register */
> +static int pt_msgdata_reg_init(XenPCIPassthroughState *ptdev,
> +                               XenPTRegInfo *reg, uint32_t real_offset,
> +                               uint32_t *data)
> +{
> +    uint32_t flags = ptdev->msi->flags;
> +    uint32_t offset = reg->offset;
> +
> +    /* check the offset whether matches the type or not */
> +    if (((offset == PCI_MSI_DATA_64) &&  (flags & PCI_MSI_FLAGS_64BIT)) ||
> +        ((offset == PCI_MSI_DATA_32) && !(flags & PCI_MSI_FLAGS_64BIT))) {

This looks like it could be made in a helper function..

> +        *data = reg->init_val;
> +    } else {
> +        *data = PT_INVALID_REG;
> +    }
> +    return 0;
> +}
> +
> +/* write Message Address register */
> +static int pt_msgaddr32_reg_write(XenPCIPassthroughState *s,
> +                                  XenPTReg *cfg_entry, uint32_t *value,
> +                                  uint32_t dev_value, uint32_t valid_mask)
> +{
> +    XenPTRegInfo *reg = cfg_entry->reg;
> +    uint32_t writable_mask = 0;
> +    uint32_t throughable_mask = 0;
> +    uint32_t old_addr = cfg_entry->data;
> +
> +    /* modify emulate register */
> +    writable_mask = reg->emu_mask & ~reg->ro_mask & valid_mask;
> +    cfg_entry->data = PT_MERGE_VALUE(*value, cfg_entry->data, writable_mask);
> +    /* update the msi_info too */
> +    s->msi->addr_lo = cfg_entry->data;
> +
> +    /* create value for writing to I/O device register */
> +    throughable_mask = ~reg->emu_mask & valid_mask;
> +    *value = PT_MERGE_VALUE(*value, dev_value, throughable_mask);
> +
> +    /* update MSI */
> +    if (cfg_entry->data != old_addr) {
> +        if (s->msi->flags & PT_MSI_FLAG_MAPPED) {
> +            pt_msi_update(s);
> +        }
> +    }
> +
> +    return 0;
> +}
> +/* write Message Upper Address register */
> +static int pt_msgaddr64_reg_write(XenPCIPassthroughState *s,
> +                                  XenPTReg *cfg_entry, uint32_t *value,
> +                                  uint32_t dev_value, uint32_t valid_mask)
> +{
> +    XenPTRegInfo *reg = cfg_entry->reg;
> +    uint32_t writable_mask = 0;
> +    uint32_t throughable_mask = 0;
> +    uint32_t old_addr = cfg_entry->data;
> +
> +    /* check whether the type is 64 bit or not */
> +    if (!(s->msi->flags & PCI_MSI_FLAGS_64BIT)) {
> +        PT_ERR(&s->dev, "Write to the Upper Address without 64 bit 
> support\n");

"Can't write to the ..."
> +        return -1;

-ENOSPC?

> +    }
> +
> +    /* modify emulate register */
> +    writable_mask = reg->emu_mask & ~reg->ro_mask & valid_mask;
> +    cfg_entry->data = PT_MERGE_VALUE(*value, cfg_entry->data, writable_mask);
> +    /* update the msi_info too */
> +    s->msi->addr_hi = cfg_entry->data;
> +
> +    /* create value for writing to I/O device register */
> +    throughable_mask = ~reg->emu_mask & valid_mask;
> +    *value = PT_MERGE_VALUE(*value, dev_value, throughable_mask);
> +
> +    /* update MSI */
> +    if (cfg_entry->data != old_addr) {
> +        if (s->msi->flags & PT_MSI_FLAG_MAPPED) {
> +            pt_msi_update(s);
> +        }
> +    }
> +
> +    return 0;
> +}
> +
> +
> +/* this function will be called twice (for 32 bit and 64 bit type) */
> +/* write Message Data register */
> +static int pt_msgdata_reg_write(XenPCIPassthroughState *s, XenPTReg 
> *cfg_entry,
> +                                uint16_t *value, uint16_t dev_value,
> +                                uint16_t valid_mask)
> +{
> +    XenPTRegInfo *reg = cfg_entry->reg;
> +    uint16_t writable_mask = 0;
> +    uint16_t throughable_mask = 0;
> +    uint16_t old_data = cfg_entry->data;
> +    uint32_t flags = s->msi->flags;
> +    uint32_t offset = reg->offset;
> +
> +    /* check the offset whether matches the type or not */
> +    if (!((offset == PCI_MSI_DATA_64) &&  (flags & PCI_MSI_FLAGS_64BIT)) &&
> +        !((offset == PCI_MSI_DATA_32) && !(flags & PCI_MSI_FLAGS_64BIT))) {
> +        /* exit I/O emulator */
> +        PT_ERR(&s->dev, "the offset does not match the 32/64 bit type!\n");
> +        return -1;

-ENOSPC
> +    }
> +
> +    /* modify emulate register */
> +    writable_mask = reg->emu_mask & ~reg->ro_mask & valid_mask;
> +    cfg_entry->data = PT_MERGE_VALUE(*value, cfg_entry->data, writable_mask);
> +    /* update the msi_info too */
> +    s->msi->data = cfg_entry->data;
> +
> +    /* create value for writing to I/O device register */
> +    throughable_mask = ~reg->emu_mask & valid_mask;
> +    *value = PT_MERGE_VALUE(*value, dev_value, throughable_mask);
> +
> +    /* update MSI */
> +    if (cfg_entry->data != old_data) {
> +        if (flags & PT_MSI_FLAG_MAPPED) {
> +            pt_msi_update(s);
> +        }
> +    }
> +
> +    return 0;
> +}
> +
> +/* MSI Capability Structure reg static infomation table */
> +static XenPTRegInfo pt_emu_reg_msi_tbl[] = {
> +    /* Next Pointer reg */
> +    {
> +        .offset     = PCI_CAP_LIST_NEXT,
> +        .size       = 1,
> +        .init_val   = 0x00,
> +        .ro_mask    = 0xFF,
> +        .emu_mask   = 0xFF,
> +        .init       = pt_ptr_reg_init,
> +        .u.b.read   = pt_byte_reg_read,
> +        .u.b.write  = pt_byte_reg_write,
> +        .u.b.restore  = NULL,
> +    },
> +    /* Message Control reg */
> +    {
> +        .offset     = PCI_MSI_FLAGS,
> +        .size       = 2,
> +        .init_val   = 0x0000,
> +        .ro_mask    = 0xFF8E,
> +        .emu_mask   = 0x007F,
> +        .init       = pt_msgctrl_reg_init,
> +        .u.w.read   = pt_word_reg_read,
> +        .u.w.write  = pt_msgctrl_reg_write,
> +        .u.w.restore  = NULL,
> +    },
> +    /* Message Address reg */
> +    {
> +        .offset     = PCI_MSI_ADDRESS_LO,
> +        .size       = 4,
> +        .init_val   = 0x00000000,
> +        .ro_mask    = 0x00000003,
> +        .emu_mask   = 0xFFFFFFFF,
> +        .no_wb      = 1,
> +        .init       = pt_common_reg_init,
> +        .u.dw.read  = pt_long_reg_read,
> +        .u.dw.write = pt_msgaddr32_reg_write,
> +        .u.dw.restore = NULL,
> +    },
> +    /* Message Upper Address reg (if PCI_MSI_FLAGS_64BIT set) */
> +    {
> +        .offset     = PCI_MSI_ADDRESS_HI,
> +        .size       = 4,
> +        .init_val   = 0x00000000,
> +        .ro_mask    = 0x00000000,
> +        .emu_mask   = 0xFFFFFFFF,
> +        .no_wb      = 1,
> +        .init       = pt_msgaddr64_reg_init,
> +        .u.dw.read  = pt_long_reg_read,
> +        .u.dw.write = pt_msgaddr64_reg_write,
> +        .u.dw.restore = NULL,
> +    },
> +    /* Message Data reg (16 bits of data for 32-bit devices) */
> +    {
> +        .offset     = PCI_MSI_DATA_32,
> +        .size       = 2,
> +        .init_val   = 0x0000,
> +        .ro_mask    = 0x0000,
> +        .emu_mask   = 0xFFFF,
> +        .no_wb      = 1,
> +        .init       = pt_msgdata_reg_init,
> +        .u.w.read   = pt_word_reg_read,
> +        .u.w.write  = pt_msgdata_reg_write,
> +        .u.w.restore  = NULL,
> +    },
> +    /* Message Data reg (16 bits of data for 64-bit devices) */
> +    {
> +        .offset     = PCI_MSI_DATA_64,
> +        .size       = 2,
> +        .init_val   = 0x0000,
> +        .ro_mask    = 0x0000,
> +        .emu_mask   = 0xFFFF,
> +        .no_wb      = 1,
> +        .init       = pt_msgdata_reg_init,
> +        .u.w.read   = pt_word_reg_read,
> +        .u.w.write  = pt_msgdata_reg_write,
> +        .u.w.restore  = NULL,
> +    },
> +    {
> +        .size = 0,
> +    },
> +};
> +
> +
> +/**************************************
> + * MSI-X Capability
> + */
> +
> +/* Message Control register for MSI-X */
> +static int pt_msixctrl_reg_init(XenPCIPassthroughState *s,
> +                                XenPTRegInfo *reg, uint32_t real_offset,
> +                                uint32_t *data)
> +{
> +    PCIDevice *d = &s->dev;
> +    uint16_t reg_field = 0;
> +
> +    /* use I/O device register's value as initial value */
> +    reg_field = pci_get_word(d->config + real_offset);
> +
> +    if (reg_field & PCI_MSIX_FLAGS_ENABLE) {
> +        PT_LOG(d, "MSIX already enabled, disable it first\n");

.. disabling it first..
> +        host_pci_set_word(s->real_device, real_offset,
> +                          reg_field & ~PCI_MSIX_FLAGS_ENABLE);
> +    }
> +
> +    s->msix->ctrl_offset = real_offset;
> +
> +    *data = reg->init_val;
> +    return 0;
> +}
> +static int pt_msixctrl_reg_write(XenPCIPassthroughState *s,
> +                                 XenPTReg *cfg_entry, uint16_t *value,
> +                                 uint16_t dev_value, uint16_t valid_mask)
> +{
> +    XenPTRegInfo *reg = cfg_entry->reg;
> +    uint16_t writable_mask = 0;
> +    uint16_t throughable_mask = 0;
> +
> +    /* modify emulate register */
> +    writable_mask = reg->emu_mask & ~reg->ro_mask & valid_mask;
> +    cfg_entry->data = PT_MERGE_VALUE(*value, cfg_entry->data, writable_mask);
> +
> +    /* create value for writing to I/O device register */
> +    throughable_mask = ~reg->emu_mask & valid_mask;
> +    *value = PT_MERGE_VALUE(*value, dev_value, throughable_mask);
> +
> +    /* update MSI-X */
> +    if ((*value & PCI_MSIX_FLAGS_ENABLE)
> +        && !(*value & PCI_MSIX_FLAGS_MASKALL)) {
> +        if (s->msi_trans_en) {
> +            PT_LOG(&s->dev,
> +                   "guest enabling MSI-X, disable MSI-INTx translation\n");

... disabling
> +            pt_disable_msi_translate(s);
> +        }
> +        pt_msix_update(s);
> +    }
> +
> +    s->msix->enabled = !!(*value & PCI_MSIX_FLAGS_ENABLE);
> +
> +    return 0;
> +}
> +
> +/* MSI-X Capability Structure reg static infomation table */
> +static XenPTRegInfo pt_emu_reg_msix_tbl[] = {
> +    /* Next Pointer reg */
> +    {
> +        .offset     = PCI_CAP_LIST_NEXT,
> +        .size       = 1,
> +        .init_val   = 0x00,
> +        .ro_mask    = 0xFF,
> +        .emu_mask   = 0xFF,
> +        .init       = pt_ptr_reg_init,
> +        .u.b.read   = pt_byte_reg_read,
> +        .u.b.write  = pt_byte_reg_write,
> +        .u.b.restore  = NULL,
> +    },
> +    /* Message Control reg */
> +    {
> +        .offset     = PCI_MSI_FLAGS,
> +        .size       = 2,
> +        .init_val   = 0x0000,
> +        .ro_mask    = 0x3FFF,
> +        .emu_mask   = 0x0000,
> +        .init       = pt_msixctrl_reg_init,
> +        .u.w.read   = pt_word_reg_read,
> +        .u.w.write  = pt_msixctrl_reg_write,
> +        .u.w.restore  = NULL,
> +    },
> +    {
> +        .size = 0,
> +    },
> +};
> +
>  
>  /****************************
>   * Capabilities
> @@ -1709,6 +2134,49 @@ static int pt_pcie_size_init(XenPCIPassthroughState *s,
>      *size = pcie_size;
>      return 0;
>  }
> +/* get MSI Capability Structure register group size */
> +static int pt_msi_size_init(XenPCIPassthroughState *s,
> +                            const XenPTRegGroupInfo *grp_reg,
> +                            uint32_t base_offset, uint8_t *size)
> +{
> +    PCIDevice *d = &s->dev;
> +    uint16_t msg_ctrl = 0;
> +    uint8_t msi_size = 0xa;
> +
> +    msg_ctrl = pci_get_word(d->config + (base_offset + PCI_MSI_FLAGS));
> +
> +    /* check 64 bit address capable & Per-vector masking capable */

.. 'check if 64-bit address is capable of per-vector masking.'

> +    if (msg_ctrl & PCI_MSI_FLAGS_64BIT) {
> +        msi_size += 4;
> +    }
> +    if (msg_ctrl & PCI_MSI_FLAGS_MASKBIT) {
> +        msi_size += 10;
> +    }
> +
> +    s->msi = g_new0(XenPTMSI, 1);
> +    s->msi->pirq = PT_UNASSIGNED_PIRQ;
> +
> +    *size = msi_size;
> +    return 0;
> +}
> +/* get MSI-X Capability Structure register group size */
> +static int pt_msix_size_init(XenPCIPassthroughState *s,
> +                             const XenPTRegGroupInfo *grp_reg,
> +                             uint32_t base_offset, uint8_t *size)
> +{
> +    int rc = 0;
> +
> +    rc = pt_msix_init(s, base_offset);
> +
> +    if (rc == -1) {

if (rc < 0) ..

> +        PT_ERR(&s->dev, "Internal error: Invalid pt_msix_init.\n");
> +        return rc;
> +    }
> +
> +    *size = grp_reg->grp_size;
> +    return 0;
> +}
> +
>  
>  static const XenPTRegGroupInfo pt_emu_reg_grp_tbl[] = {
>      /* Header Type0 reg group */
> @@ -1749,6 +2217,14 @@ static const XenPTRegGroupInfo pt_emu_reg_grp_tbl[] = {
>          .grp_size   = 0x04,
>          .size_init  = pt_reg_grp_size_init,
>      },
> +    /* MSI Capability Structure reg group */
> +    {
> +        .grp_id      = PCI_CAP_ID_MSI,
> +        .grp_type    = GRP_TYPE_EMU,
> +        .grp_size    = 0xFF,
> +        .size_init   = pt_msi_size_init,
> +        .emu_reg_tbl = pt_emu_reg_msi_tbl,
> +    },
>      /* PCI-X Capabilities List Item reg group */
>      {
>          .grp_id     = PCI_CAP_ID_PCIX,
> @@ -1793,6 +2269,14 @@ static const XenPTRegGroupInfo pt_emu_reg_grp_tbl[] = {
>          .size_init   = pt_pcie_size_init,
>          .emu_reg_tbl = pt_emu_reg_pcie_tbl,
>      },
> +    /* MSI-X Capability Structure reg group */
> +    {
> +        .grp_id      = PCI_CAP_ID_MSIX,
> +        .grp_type    = GRP_TYPE_EMU,
> +        .grp_size    = 0x0C,
> +        .size_init   = pt_msix_size_init,
> +        .emu_reg_tbl = pt_emu_reg_msix_tbl,
> +    },
>      {
>          .grp_size = 0,
>      },
> @@ -1965,8 +2449,11 @@ static int pt_init_pci_config(XenPCIPassthroughState 
> *s)
>          return rc;
>      }
>  
> +    /* setup MSI-INTx translation if support */

.. if supported.

> +    rc = pt_enable_msi_translate(s);
> +
>      /* rebind machine_irq to device */
> -    if (s->machine_irq != 0) {
> +    if (rc < 0 && s->machine_irq != 0) {
>          uint8_t e_device = PCI_SLOT(s->dev.devfn);
>          uint8_t e_intx = pci_intx(s);
>  
> @@ -2117,6 +2604,14 @@ void pt_config_delete(XenPCIPassthroughState *s)
>      struct XenPTRegGroup *reg_group, *next_grp;
>      struct XenPTReg *reg, *next_reg;
>  
> +    /* free MSI/MSI-X info table */
> +    if (s->msix) {
> +        pt_msix_delete(s);
> +    }
> +    if (s->msi) {
> +        g_free(s->msi);
> +    }
> +
>      /* free Power Management info table */
>      if (s->pm_state) {
>          if (s->pm_state->pm_timer) {
> diff --git a/hw/xen_pci_passthrough_msi.c b/hw/xen_pci_passthrough_msi.c
> new file mode 100644
> index 0000000..73d3d9b
> --- /dev/null
> +++ b/hw/xen_pci_passthrough_msi.c
> @@ -0,0 +1,678 @@
> +/*
> + * Copyright (c) 2007, Intel Corporation.
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2.  See
> + * the COPYING file in the top-level directory.
> + *
> + * Jiang Yunhong <address@hidden>
> + *
> + * This file implements direct PCI assignment to a HVM guest
> + */
> +
> +#include <sys/mman.h>
> +
> +#include "xen_backend.h"
> +#include "xen_pci_passthrough.h"
> +#include "apic-msidef.h"
> +
> +
> +#define AUTO_ASSIGN -1
> +
> +/* shift count for gflags */
> +#define GFLAGS_SHIFT_DEST_ID        0
> +#define GFLAGS_SHIFT_RH             8
> +#define GFLAGS_SHIFT_DM             9
> +#define GLFAGS_SHIFT_DELIV_MODE     12
> +#define GLFAGS_SHIFT_TRG_MODE       15
> +
> +
> +/*
> + * Helpers
> + */
> +
> +static uint32_t get_msi_gflags(uint32_t data, uint64_t addr)
> +{
> +    uint32_t result = 0;
> +    int rh, dm, dest_id, deliv_mode, trig_mode;
> +
> +    rh = (addr >> MSI_ADDR_REDIRECTION_SHIFT) & 0x1;
> +    dm = (addr >> MSI_ADDR_DEST_MODE_SHIFT) & 0x1;
> +    dest_id = (addr >> MSI_ADDR_DEST_ID_SHIFT) & 0xff;
> +    deliv_mode = (data >> MSI_DATA_DELIVERY_MODE_SHIFT) & 0x7;
> +    trig_mode = (data >> MSI_DATA_TRIGGER_SHIFT) & 0x1;
> +
> +    result = dest_id | (rh << GFLAGS_SHIFT_RH) | (dm << GFLAGS_SHIFT_DM) |
> +             (deliv_mode << GLFAGS_SHIFT_DELIV_MODE) |
> +             (trig_mode << GLFAGS_SHIFT_TRG_MODE);
> +
> +    return result;
> +}
> +
> +
> +/*
> + * MSI virtualization functions
> + */
> +
> +void pt_msi_set_enable(XenPCIPassthroughState *s, int en)

Usually these functions that 'enable' something return a value.
Perhaps this function should do that?

> +{
> +    uint16_t val = 0;
> +    uint32_t address = 0;
> +
> +    PT_LOG(&s->dev, "enable: %i\n", en);

Uh, why not: "PT_LOG(&s->dev,"%s.\n", en ? "enabling" : "disabling");

> +
> +    if (!s->msi) {
> +        return;
> +    }
> +
> +    address = s->msi->ctrl_offset;
> +    if (!address) {
> +        return;
> +    }
> +
> +    host_pci_get_word(s->real_device, address, &val);
> +    val &= ~PCI_MSI_FLAGS_ENABLE;
> +    val |= en & PCI_MSI_FLAGS_ENABLE;
> +    host_pci_set_word(s->real_device, address, val);
> +}
> +
> +/* setup physical msi, but don't enable it */
> +int pt_msi_setup(XenPCIPassthroughState *s)
> +{
> +    int pirq = PT_UNASSIGNED_PIRQ;
> +    uint8_t gvec = 0;
> +    int rc = 0;
> +
> +    if (!(s->msi->flags & PT_MSI_FLAG_UNINIT)) {
> +        PT_ERR(&s->dev, "Setup physical MSI when it's already 
> initialized.\n");

Eh? I think you mean "Setup physical MSI when it has been properly
initialized." ?

> +        return -1;
> +    }
> +
> +    gvec = s->msi->data & 0xFF;
> +    if (!gvec) {
> +        /* if gvec is 0, the guest is asking for a particular pirq that
> +         * is passed as dest_id */
> +        pirq = (s->msi->addr_hi & 0xffffff00) |
> +               ((s->msi->addr_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff);

Please macrofy this.

> +        if (!pirq) {
> +            /* this probably identifies an misconfiguration of the guest,
> +             * try the emulated path */
> +            pirq = PT_UNASSIGNED_PIRQ;
> +        } else {
> +            PT_LOG(&s->dev, "requested pirq: %d\n", pirq);
> +        }
> +    }
> +
> +    rc = xc_physdev_map_pirq_msi(xen_xc, xen_domid, AUTO_ASSIGN, &pirq,
> +                                 PCI_DEVFN(s->real_device->dev,
> +                                           s->real_device->func),
> +                                 s->real_device->bus, 0, 0);

What are the two '0' at the end for? Can you provide a comment in there?

> +    if (rc) {
> +        PT_ERR(&s->dev, "Mapping of MSI failed. (rc: %d,"
> +               " assigned device: %02x:%02x.%x)\n", rc,
> +               s->real_device->dev, s->real_device->func, 
> s->real_device->bus);

You might want to include the 'pirq' value that was requested.

> +        return -1;
> +    }
> +
> +    if (pirq < 0) {
> +        PT_ERR(&s->dev, "Invalid pirq number.\n");
> +        return -1;
> +    }
> +
> +    s->msi->pirq = pirq;
> +    PT_LOG(&s->dev, "MSI mapped with pirq %d.\n", pirq);
> +
> +    return 0;
> +}
> +
> +int pt_msi_update(XenPCIPassthroughState *s)
> +{
> +    uint8_t gvec = 0;
> +    uint32_t gflags = 0;
> +    uint64_t addr = 0;
> +    int rc = 0;
> +
> +    /* get vector, address, flags info, etc. */
> +    gvec = s->msi->data & 0xFF;
> +    addr = (uint64_t)s->msi->addr_hi << 32 | s->msi->addr_lo;
> +    gflags = get_msi_gflags(s->msi->data, addr);
> +
> +    PT_LOG(&s->dev, "Update MSI with pirq %d gvec 0x%x gflags 0x%x\n",

Updating
> +           s->msi->pirq, gvec, gflags);
> +
> +    rc = xc_domain_update_msi_irq(xen_xc, xen_domid, gvec,
> +                                  s->msi->pirq, gflags, 0);
> +    if (rc) {
> +        PT_ERR(&s->dev, "Binding of MSI failed. (rc: %d)\n", rc);

Not binding.. Updating
> +
> +        if (xc_physdev_unmap_pirq(xen_xc, xen_domid, s->msi->pirq)) {
> +            PT_ERR(&s->dev, "Unmapping of MSI pirq %d failed.\n",
> +                   s->msi->pirq);
> +        }
> +        s->msi->pirq = PT_UNASSIGNED_PIRQ;
> +    }
> +    return rc;
> +}
> +
> +void pt_msi_disable(XenPCIPassthroughState *s)
> +{
> +    PCIDevice *d = &s->dev;
> +    uint8_t gvec = 0;
> +    uint32_t gflags = 0;
> +    uint64_t addr = 0;
> +    uint8_t e_device = 0;
> +    uint8_t e_intx = 0;
> +
> +    pt_msi_set_enable(s, 0);
> +
> +    e_device = PCI_SLOT(d->devfn);
> +    e_intx = pci_intx(s);
> +
> +    if (s->msi_trans_en) {
> +        if (xc_domain_unbind_pt_irq(xen_xc, xen_domid, s->msi->pirq,
> +                                    PT_IRQ_TYPE_MSI_TRANSLATE, 0,
> +                                    e_device, e_intx, 0)) {
> +            PT_ERR(d, "Unbinding pt irq for MSI-INTx failed!\n");

Please include the e_intx and pirq that was being unbinded.

> +            goto out;
> +        }
> +    } else if (!(s->msi->flags & PT_MSI_FLAG_UNINIT)) {
> +        /* get vector, address, flags info, etc. */
> +        gvec = s->msi->data & 0xFF;
> +        addr = (uint64_t)s->msi->addr_hi << 32 | s->msi->addr_lo;
> +        gflags = get_msi_gflags(s->msi->data, addr);
> +
> +        PT_ERR(&s->dev, "Unbind MSI with pirq %x, gvec %x\n",
> +               s->msi->pirq, gvec);
> +
> +        if (xc_domain_unbind_msi_irq(xen_xc, xen_domid, gvec,
> +                                     s->msi->pirq, gflags)) {
> +            PT_ERR(&s->dev, "Unbinding of MSI failed. (pirq: %d)\n",
> +                   s->msi->pirq);
> +            goto out;
> +        }
> +    }
> +
> +    if (s->msi->pirq != PT_UNASSIGNED_PIRQ) {
> +        PT_LOG(&s->dev, "Unmap msi with pirq %d\n", s->msi->pirq);
> +
> +        if (xc_physdev_unmap_pirq(xen_xc, xen_domid, s->msi->pirq)) {
> +            PT_ERR(&s->dev, "Unmapping of MSI failed. (pirq: %d)\n",
> +                   s->msi->pirq);
> +            goto out;
> +        }
> +    }
> +
> +out:
> +    /* clear msi info */
> +    s->msi->flags = 0;
> +    s->msi->pirq = -1;
> +    s->msi_trans_en = 0;
> +}
> +
> +/*
> + * MSI-INTx translation virtualization functions
> + */
> +
> +int pt_enable_msi_translate(XenPCIPassthroughState *s)
> +{
> +    uint8_t e_device = 0;
> +    uint8_t e_intx = 0;
> +
> +    if (!(s->msi && s->msi_trans_cap)) {
> +        return -1;

-ENODEV ?

> +    }
> +
> +    pt_msi_set_enable(s, 0);
> +    s->msi_trans_en = 0;
> +
> +    if (pt_msi_setup(s)) {

You can save the return value from pt_msi_setup..
> +        PT_ERR(&s->dev, "MSI-INTx translation MSI setup failed, fallback\n");

..falling back. But not sure what it is actually falling back to? Just
to normal INTx injecting? If so, you might want to say that.


.. and the return value from pt_msi_setup could be returned here.

> +        return -1;
> +    }
> +
> +    e_device = PCI_SLOT(s->dev.devfn);
> +    /* fix virtual interrupt pin to INTA# */
> +    e_intx = pci_intx(s);
> +
> +    if (xc_domain_bind_pt_irq(xen_xc, xen_domid, s->msi->pirq,
> +                              PT_IRQ_TYPE_MSI_TRANSLATE, 0,
> +                              e_device, e_intx, 0)) {

Please document those '0'.

> +        PT_ERR(&s->dev, "MSI-INTx translation bind failed, fallback\n");
> +
> +        if (xc_physdev_unmap_pirq(xen_xc, xen_domid, s->msi->pirq)) {
> +            PT_ERR(&s->dev, "Unmapping of MSI failed.\n");

Mention which PIRQ failed.

> +        }
> +        s->msi->pirq = PT_UNASSIGNED_PIRQ;
> +        return -1;

You can return the error value returned from xc_domain_bind_pt_irq..
> +    }
> +
> +    pt_msi_set_enable(s, 1);

Should really have pt_msi_set_enable return a value and then check it.

Or do we really don't care about its value? If so, please mention
that in the pt_msi_set_enable.

> +    s->msi_trans_en = 1;
> +
> +    return 0;
> +}
> +
> +void pt_disable_msi_translate(XenPCIPassthroughState *s)
> +{
> +    uint8_t e_device = 0;
> +    uint8_t e_intx = 0;
> +
> +    /* MSI_ENABLE bit should be disabed until the new handler is set */

disabled.

> +    pt_msi_set_enable(s, 0);
> +
> +    e_device = PCI_SLOT(s->dev.devfn);
> +    e_intx = pci_intx(s);
> +
> +    if (xc_domain_unbind_pt_irq(xen_xc, xen_domid, s->msi->pirq,
> +                                PT_IRQ_TYPE_MSI_TRANSLATE, 0,
> +                                e_device, e_intx, 0)) {
> +        PT_ERR(&s->dev, "Unbinding pt irq for MSI-INTx failed!\n");

Can you include the error code please?
> +    }
> +
> +    if (s->machine_irq) {
> +        if (xc_domain_bind_pt_pci_irq(xen_xc, xen_domid, s->machine_irq,
> +                                      0, e_device, e_intx)) {
> +            PT_ERR(&s->dev, "Rebinding of interrupt failed!\n");

And here as well.. And also the machine_irq and e_intx value.

> +        }
> +    }
> +
> +    s->msi_trans_en = 0;
> +}
> +
> +/*
> + * MSI-X virtualization functions
> + */
> +
> +static void msix_set_enable(XenPCIPassthroughState *s, int en)

No return value? The 'int en' can also be a bool.

> +{
> +    uint16_t val = 0;
> +    uint32_t address = 0;
> +
> +    if (!s->msix) {
> +        return;
> +    }
> +
> +    address = s->msix->ctrl_offset;
> +    if (!address) {
> +        return;
> +    }
> +
> +    host_pci_get_word(s->real_device, address, &val);
> +    val &= ~PCI_MSIX_FLAGS_ENABLE;
> +    if (en) {
> +        val |= PCI_MSIX_FLAGS_ENABLE;
> +    }
> +    host_pci_set_word(s->real_device, address, val);
> +}
> +
> +static void mask_physical_msix_entry(XenPCIPassthroughState *s,
> +                                     int entry_nr, int mask)
> +{
> +    void *phys_off;
> +
> +    phys_off = s->msix->phys_iomem_base + 16 * entry_nr + 12;

Can you provide a comment explaining this math please?

> +    *(uint32_t *)phys_off = mask;
> +}
> +
> +static int pt_msix_update_one(XenPCIPassthroughState *s, int entry_nr)
> +{
> +    XenMSIXEntry *entry = &s->msix->msix_entry[entry_nr];

You should be checking whether entry_nr is outside the bounds of
msix_entry just in case.


> +    int pirq = entry->pirq;
> +    int gvec = entry->io_mem[2] & 0xff;
> +    uint64_t gaddr = *(uint64_t *)&entry->io_mem[0];
> +    uint32_t gflags = get_msi_gflags(entry->io_mem[2], gaddr);
> +    int rc;
> +
> +    if (!entry->flags) {
> +        return 0;
> +    }
> +
> +    if (!gvec) {
> +        /* if gvec is 0, the guest is asking for a particular pirq that
> +         * is passed as dest_id */
> +        pirq = ((gaddr >> 32) & 0xffffff00) |
> +               (((gaddr & 0xffffffff) >> MSI_ADDR_DEST_ID_SHIFT) & 0xff);
> +        if (!pirq) {
> +            /* this probably identifies an misconfiguration of the guest,
> +             * try the emulated path */
> +            pirq = PT_UNASSIGNED_PIRQ;
> +        } else {
> +            PT_LOG(&s->dev, "requested pirq: %d\n", pirq);
> +        }
> +    }
> +
> +    /* Check if this entry is already mapped */
> +    if (entry->pirq == PT_UNASSIGNED_PIRQ) {
> +        rc = xc_physdev_map_pirq_msi(xen_xc, xen_domid, AUTO_ASSIGN, &pirq,
> +                                     PCI_DEVFN(s->real_device->dev,
> +                                               s->real_device->func),
> +                                     s->real_device->bus, entry_nr,
> +                                     s->msix->table_base);
> +        if (rc) {
> +            PT_ERR(&s->dev, "Mapping msix entry %x\n", entry_nr);
> +            return rc;
> +        }
> +        entry->pirq = pirq;
> +    }
> +
> +    PT_LOG(&s->dev, "Update msix entry 0x%x with pirq %d gvec 0x%x\n",
> +            entry_nr, pirq, gvec);
> +
> +    rc = xc_domain_update_msi_irq(xen_xc, xen_domid, gvec, pirq, gflags,
> +                                  s->msix->mmio_base_addr);
> +    if (rc) {
> +        PT_ERR(&s->dev, "Updating msix irq %d info for entry %d\n",
> +               pirq, entry_nr);
> +
> +        if (xc_physdev_unmap_pirq(xen_xc, xen_domid, entry->pirq)) {
> +            PT_ERR(&s->dev, "Unmapping of MSI-X failed. (pirq: %d)\n",
> +                   entry->pirq);
> +        }
> +        entry->pirq = PT_UNASSIGNED_PIRQ;
> +        return rc;
> +    }
> +
> +    entry->flags = 0;

This whole function looks so similar to msi one.. Is there no chance of
collapsing them together? And perhaps having an extra argument to check
if it is MSI or MSI-X and where to get the gflags and such.

> +
> +    return 0;
> +}
> +
> +int pt_msix_update(XenPCIPassthroughState *s)
> +{
> +    XenPTMSIX *msix = s->msix;
> +    int i;
> +
> +    for (i = 0; i < msix->total_entries; i++) {
> +        pt_msix_update_one(s, i);
> +    }
> +
> +    return 0;
> +}
> +
> +void pt_msix_disable(XenPCIPassthroughState *s)
> +{
> +    PCIDevice *d = &s->dev;
> +    uint8_t gvec = 0;
> +    uint32_t gflags = 0;
> +    uint64_t addr = 0;
> +    int i = 0;
> +    XenMSIXEntry *entry = NULL;
> +
> +    msix_set_enable(s, 0);
> +
> +    for (i = 0; i < s->msix->total_entries; i++) {
> +        entry = &s->msix->msix_entry[i];
> +
> +        if (entry->pirq == PT_UNASSIGNED_PIRQ) {
> +            continue;
> +        }
> +
> +        gvec = entry->io_mem[2] & 0xff;
> +        addr = *(uint64_t *)&entry->io_mem[0];
> +        gflags = get_msi_gflags(entry->io_mem[2], addr);
> +
> +        PT_LOG(d, "Unbind msix with pirq %d, gvec 0x%x\n",
> +                entry->pirq, gvec);
> +
> +        if (xc_domain_unbind_msi_irq(xen_xc, xen_domid, gvec,
> +                                        entry->pirq, gflags)) {
> +            PT_ERR(d, "Unbinding of MSI-X failed.\n");
> +        } else {
> +            PT_LOG(d, "Unmap msix with pirq %d\n", entry->pirq);
> +
> +            if (xc_physdev_unmap_pirq(xen_xc, xen_domid, entry->pirq)) {
> +                PT_ERR(d, "Unmapping of MSI-X failed.\n");
> +            }
> +        }
> +        /* clear msi-x info */
> +        entry->pirq = PT_UNASSIGNED_PIRQ;
> +        entry->flags = 0;
> +    }

This looks like a candidate for squashing with the MSI as well. If
you can manage to abstract the gflags, addr, etc.

> +}
> +
> +int pt_msix_update_remap(XenPCIPassthroughState *s, int bar_index)
> +{
> +    XenMSIXEntry *entry;
> +    int i, ret;
> +
> +    if (!(s->msix && s->msix->bar_index == bar_index)) {
> +        return 0;
> +    }
> +
> +    for (i = 0; i < s->msix->total_entries; i++) {
> +        entry = &s->msix->msix_entry[i];
> +        if (entry->pirq != PT_UNASSIGNED_PIRQ) {
> +            ret = xc_domain_unbind_pt_irq(xen_xc, xen_domid, entry->pirq,
> +                                          PT_IRQ_TYPE_MSI, 0, 0, 0, 0);

Lots of zeros. Can you document those please?

> +            if (ret) {
> +                PT_ERR(&s->dev, "unbind MSI-X entry %d failed\n", 
> entry->pirq);
> +            }
> +            entry->flags = 1;

Could use a #define for the '1' value.

> +        }
> +    }
> +    pt_msix_update(s);
> +
> +    return 0;
> +}
> +
> +static void pci_msix_invalid_write(void *opaque, target_phys_addr_t addr,
> +                                   uint32_t val)
> +{
> +    XenPCIPassthroughState *s = opaque;
> +    PT_ERR(&s->dev, "Invalid write to MSI-X table,"
> +           " only dword access is allowed.\n");
> +}
> +
> +static void pci_msix_writel(void *opaque, target_phys_addr_t addr,
> +                            uint32_t val)
> +{
> +    XenPCIPassthroughState *s = opaque;
> +    XenPTMSIX *msix = s->msix;
> +    XenMSIXEntry *entry;
> +    int entry_nr, offset;
> +    void *phys_off;
> +    uint32_t vec_ctrl;
> +
> +    if (addr % 4) {
> +        PT_ERR(&s->dev, "Unaligned dword access to MSI-X table, "
> +                "addr 0x"TARGET_FMT_plx"\n", addr);
> +        return;
> +    }
> +
> +    entry_nr = addr / 16;
> +    entry = &msix->msix_entry[entry_nr];

You should double-check that that entry_nr is actually allocated.

> +    offset = (addr % 16) / 4;
> +
> +    /*
> +     * If Xen intercepts the mask bit access, io_mem[3] may not be
> +     * up-to-date. Read from hardware directly.
> +     */
> +    phys_off = s->msix->phys_iomem_base + 16 * entry_nr + 12;
> +    vec_ctrl = *(uint32_t *)phys_off;
> +
> +    if (offset != 3 && msix->enabled && !(vec_ctrl & 0x1)) {
> +        PT_ERR(&s->dev, "Can't update msix entry %d since MSI-X is already "
> +                "enabled.\n", entry_nr);

Well, and a whole bunch of other things are set. Would it make sense to
provide 'offset' and 'vec_ctrl' contents?

> +        return;
> +    }
> +
> +    if (offset != 3 && entry->io_mem[offset] != val) {

What is the magical '3' value? Can you document it a bit please? Perhaps
along with the io_mem[x] definition?

> +        entry->flags = 1;
> +    }
> +    entry->io_mem[offset] = val;
> +
> +    if (offset == 3) {
> +        if (msix->enabled && !(val & 0x1)) {
> +            pt_msix_update_one(s, entry_nr);
> +        }
> +        mask_physical_msix_entry(s, entry_nr, entry->io_mem[3] & 0x1);
> +    }
> +}
> +
> +static CPUWriteMemoryFunc *pci_msix_write[] = {
> +    pci_msix_invalid_write,
> +    pci_msix_invalid_write,
> +    pci_msix_writel
> +};
> +
> +static uint32_t pci_msix_invalid_read(void *opaque, target_phys_addr_t addr)
> +{
> +    XenPCIPassthroughState *s = opaque;
> +    PT_ERR(&s->dev, "Invalid read to MSI-X table,"
> +           " only dword access is allowed.\n");
> +    return 0;
> +}
> +
> +static uint32_t pci_msix_readl(void *opaque, target_phys_addr_t addr)
> +{
> +    XenPCIPassthroughState *s = opaque;
> +    XenPTMSIX *msix = s->msix;
> +    int entry_nr, offset;
> +
> +    if (addr % 4) {
> +        PT_ERR(&s->dev, "Unaligned dword access to MSI-X table, "
> +                "addr 0x"TARGET_FMT_plx"\n", addr);
> +        return 0;
> +    }
> +
> +    entry_nr = addr / 16;
> +    offset = (addr % 16) / 4;
> +
> +    return msix->msix_entry[entry_nr].io_mem[offset];
> +}
> +
> +static CPUReadMemoryFunc *pci_msix_read[] = {
> +    pci_msix_invalid_read,
> +    pci_msix_invalid_read,
> +    pci_msix_readl
> +};
> +
> +int pt_add_msix_mapping(XenPCIPassthroughState *s, int bar_index)
> +{
> +    if (!(s->msix && s->msix->bar_index == bar_index)) {
> +        return 0;
> +    }
> +
> +    return xc_domain_memory_mapping(xen_xc, xen_domid,
> +         s->msix->mmio_base_addr >> XC_PAGE_SHIFT,
> +         (s->bases[bar_index].access.maddr + s->msix->table_off)
> +             >> XC_PAGE_SHIFT,
> +         (s->msix->total_entries * 16 + XC_PAGE_SIZE - 1) >> XC_PAGE_SHIFT,
> +         DPCI_ADD_MAPPING);
> +}
> +
> +int pt_remove_msix_mapping(XenPCIPassthroughState *s, int bar_index)
> +{
> +    if (!(s->msix && s->msix->bar_index == bar_index)) {
> +        return 0;
> +    }
> +
> +    s->msix->mmio_base_addr = s->bases[bar_index].e_physbase
> +        + s->msix->table_off;
> +
> +    cpu_register_physical_memory(s->msix->mmio_base_addr,
> +                                 s->msix->total_entries * 16,
> +                                 s->msix->mmio_index);
> +
> +    return xc_domain_memory_mapping(xen_xc, xen_domid,
> +         s->msix->mmio_base_addr >> XC_PAGE_SHIFT,
> +         (s->bases[bar_index].access.maddr + s->msix->table_off)
> +             >> XC_PAGE_SHIFT,
> +         (s->msix->total_entries * 16 + XC_PAGE_SIZE - 1) >> XC_PAGE_SHIFT,
> +         DPCI_REMOVE_MAPPING);
> +}
> +
> +int pt_msix_init(XenPCIPassthroughState *s, int base)
> +{
> +    uint8_t id = 0;
> +    uint16_t control = 0;
> +    uint32_t table_off = 0;
> +    int i, total_entries, bar_index;
> +    HostPCIDevice *d = s->real_device;
> +    int fd;
> +
> +    if (host_pci_get_byte(d, base + PCI_CAP_LIST_ID, &id)) {
> +        return -1;

-ENOSPC?

> +    }
> +
> +    if (id != PCI_CAP_ID_MSIX) {
> +        PT_ERR(&s->dev, "Invalid id %#x base %#x\n", id, base);
> +        return -1;
> +    }
> +
> +    host_pci_get_word(d, base + 2, &control);
> +    total_entries = control & 0x7ff;
> +    total_entries += 1;
> +
> +    s->msix = g_malloc0(sizeof (XenPTMSIX)
> +                        + total_entries * sizeof (XenMSIXEntry));
> +
> +    s->msix->total_entries = total_entries;
> +    for (i = 0; i < total_entries; i++) {
> +        s->msix->msix_entry[i].pirq = PT_UNASSIGNED_PIRQ;
> +    }
> +
> +    s->msix->mmio_index =
> +        cpu_register_io_memory(pci_msix_read, pci_msix_write,
> +                               s, DEVICE_NATIVE_ENDIAN);
> +
> +    host_pci_get_long(d, base + PCI_MSIX_TABLE, &table_off);
> +    bar_index = s->msix->bar_index = table_off & PCI_MSIX_FLAGS_BIRMASK;
> +    table_off = s->msix->table_off = table_off & ~PCI_MSIX_FLAGS_BIRMASK;
> +    s->msix->table_base = s->real_device->io_regions[bar_index].base_addr;
> +    PT_LOG(&s->dev, "get MSI-X table BAR base 0x%"PRIx64"\n",
> +           s->msix->table_base);

Can you also include the amount of entries it can have?
> +
> +    fd = open("/dev/mem", O_RDWR);
> +    if (fd == -1) {
> +        PT_ERR(&s->dev, "Can't open /dev/mem: %s\n", strerror(errno));
> +        goto error_out;
> +    }
> +    PT_LOG(&s->dev, "table_off = %#x, total_entries = %d\n",
> +           table_off, total_entries);
> +    s->msix->table_offset_adjust = table_off & 0x0fff;
> +    s->msix->phys_iomem_base =
> +        mmap(0,
> +             total_entries * 16 + s->msix->table_offset_adjust,
> +             PROT_WRITE | PROT_READ,
> +             MAP_SHARED | MAP_LOCKED,
> +             fd,
> +             s->msix->table_base + table_off - s->msix->table_offset_adjust);
> +
> +    if (s->msix->phys_iomem_base == MAP_FAILED) {
> +        PT_ERR(&s->dev, "Can't map physical MSI-X table: %s\n",
> +               strerror(errno));
> +        close(fd);
> +        goto error_out;
> +    }
> +    s->msix->phys_iomem_base = (char *)s->msix->phys_iomem_base
> +        + s->msix->table_offset_adjust;
> +
> +    close(fd);
> +
> +    PT_LOG(&s->dev, "mapping physical MSI-X table to %p\n",
> +           s->msix->phys_iomem_base);
> +    return 0;
> +
> +error_out:
> +    g_free(s->msix);
> +    s->msix = NULL;
> +    return -1;
> +}
> +
> +void pt_msix_delete(XenPCIPassthroughState *s)
> +{
> +    /* unmap the MSI-X memory mapped register area */
> +    if (s->msix->phys_iomem_base) {
> +        PT_LOG(&s->dev, "unmapping physical MSI-X table from %p\n",
> +           s->msix->phys_iomem_base);
> +        munmap(s->msix->phys_iomem_base, s->msix->total_entries * 16 +
> +           s->msix->table_offset_adjust);
> +    }
> +
> +    if (s->msix->mmio_index > 0) {
> +        cpu_unregister_io_memory(s->msix->mmio_index);
> +    }
> +
> +    g_free(s->msix);
> +    s->msix = NULL;
> +}
> -- 
> Anthony PERARD
> 
> 
> _______________________________________________
> Xen-devel mailing list
> address@hidden
> http://lists.xensource.com/xen-devel



reply via email to

[Prev in Thread] Current Thread [Next in Thread]