qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH 1/2] KVM userspace: Add PCI device passthrough support


From: Fabrice Bellard
Subject: Re: [Qemu-devel] [PATCH 1/2] KVM userspace: Add PCI device passthrough support
Date: Thu, 08 Nov 2007 10:19:17 +0100
User-agent: Thunderbird 2.0.0.5 (X11/20070727)

Hi,

Some remarks:

- rename the option to -pcidevice.

- Remove the passthrough directory and put the file directly in hw/. Rename the file to something more explicit, such as pci_passthrough*.

- Remove the files passthrough.h and neo_pci_tree.h.

- pt_init should be called only if passthrough devices have actually been specified. Moreover, the code should be disabled for non-Linux hosts.

Can this support work if KVM is not used for CPU emulation? Can it work for non-x86 targets?

Regards,

Fabrice.

Amit Shah wrote:
This patch introduces support for device passthrough
from the host to a paravirtualized guest.

A new command-line option, -passthrough, is added.
For example, to invoke it for an Ethernet device sitting at
PCI bus:dev.fn 04:08.0 with host IRQ 18, use this:

-passthrough Ethernet/04:08.0-18

The host driver is to be removed before doing the passthrough.

Signed-off-by: Amit Shah <address@hidden>
---
 qemu/Makefile                      |    6 +-
 qemu/Makefile.target               |    4 +-
 qemu/exec.c                        |    1 +
 qemu/hw/apic.c                     |    2 +
 qemu/hw/passthrough/neo_pci_tree.h |   44 +++
 qemu/hw/passthrough/passthrough.c  |  604 ++++++++++++++++++++++++++++++++++++
 qemu/hw/passthrough/passthrough.h  |   64 ++++
 qemu/hw/pc.c                       |    3 +
 qemu/hw/pci.c                      |    5 +
 qemu/hw/piix_pci.c                 |    6 +
 qemu/vl.c                          |    6 +
 tools/pci_barsize.c                |   53 ++++
 tools/pci_mmio.c                   |   82 +++++
 13 files changed, 876 insertions(+), 4 deletions(-)
 create mode 100644 qemu/hw/passthrough/neo_pci_tree.h
 create mode 100644 qemu/hw/passthrough/passthrough.c
 create mode 100644 qemu/hw/passthrough/passthrough.h
 create mode 100644 tools/pci_barsize.c
 create mode 100644 tools/pci_mmio.c

diff --git a/qemu/Makefile b/qemu/Makefile
index 053c88c..3e599f3 100644
--- a/qemu/Makefile
+++ b/qemu/Makefile
@@ -37,7 +37,7 @@ qemu-img$(EXESUF): qemu-img.c cutils.c block.c block-raw.c 
block-cow.c block-qco
 dyngen$(EXESUF): dyngen.c
        $(HOST_CC) $(CFLAGS) $(CPPFLAGS) $(BASE_CFLAGS) -o $@ $^
-clean:
+clean:
 # avoid old build problems by removing potentially incorrect old files
        rm -f config.mak config.h op-i386.h opc-i386.h gen-op-i386.h op-arm.h 
opc-arm.h gen-op-arm.h
        rm -f *.o *.a $(TOOLS) dyngen$(EXESUF) TAGS cscope.* *.pod *~ */*~
@@ -88,8 +88,8 @@ endif
 test speed test2: all
        $(MAKE) -C tests $@
-TAGS:
-       etags *.[ch] tests/*.[ch]
+TAGS:
+       etags *.[ch] tests/*.[ch] hw/passthrough/*.[ch]
 cscope:
        rm -f ./cscope.*
diff --git a/qemu/Makefile.target b/qemu/Makefile.target
index 65f449e..9a96011 100644
--- a/qemu/Makefile.target
+++ b/qemu/Makefile.target
@@ -24,7 +24,7 @@ ifeq ($(TARGET_ARCH), sparc64)
 TARGET_BASE_ARCH:=sparc
 endif
 TARGET_PATH=$(SRC_PATH)/target-$(TARGET_BASE_ARCH)
-VPATH=$(SRC_PATH):$(TARGET_PATH):$(SRC_PATH)/hw:$(SRC_PATH)/audio
+VPATH=$(SRC_PATH):$(TARGET_PATH):$(SRC_PATH)/hw:$(SRC_PATH)/hw/passthrough:$(SRC_PATH)/audio
 CPPFLAGS=-I. -I.. -I$(TARGET_PATH) -I$(SRC_PATH)
 ifdef CONFIG_DARWIN_USER
 VPATH+=:$(SRC_PATH)/darwin-user
@@ -454,6 +454,8 @@ VL_OBJS+= ide.o pckbd.o ps2.o vga.o $(SOUND_HW) dma.o 
$(AUDIODRV)
 VL_OBJS+= fdc.o mc146818rtc.o serial.o i8259.o i8254.o pcspk.o pc.o
 VL_OBJS+= cirrus_vga.o apic.o parallel.o acpi.o piix_pci.o
 VL_OBJS+= usb-uhci.o smbus_eeprom.o vmmouse.o vmport.o vmware_vga.o
+# passthrough support
+VL_OBJS+= passthrough.o
 CPPFLAGS += -DHAS_AUDIO -DHAS_AUDIO_CHOICE
 endif
 ifeq ($(TARGET_BASE_ARCH), ppc)
diff --git a/qemu/exec.c b/qemu/exec.c
index 3e588d5..7a21ca5 100644
--- a/qemu/exec.c
+++ b/qemu/exec.c
@@ -2484,6 +2484,7 @@ int cpu_register_io_memory(int io_index,
         if (io_mem_nb >= IO_MEM_NB_ENTRIES)
             return -1;
         io_index = io_mem_nb++;
+       fprintf(stderr, "iomem index %d out of %d\n", io_index, 
IO_MEM_NB_ENTRIES);
     } else {
         if (io_index >= IO_MEM_NB_ENTRIES)
             return -1;
diff --git a/qemu/hw/apic.c b/qemu/hw/apic.c
index 60d31fa..5b1bdf4 100644
--- a/qemu/hw/apic.c
+++ b/qemu/hw/apic.c
@@ -349,6 +349,7 @@ static void apic_eoi(APICState *s)
     /* XXX: send the EOI packet to the APIC bus to allow the I/O APIC to
             set the remote IRR bit for level triggered interrupts. */
     apic_update_irq(s);
+    pt_ack_mirq(isrv);
 }
static void apic_get_delivery_bitmask(uint32_t *deliver_bitmask,
@@ -1122,6 +1123,7 @@ static void ioapic_mem_writel(void *opaque, 
target_phys_addr_t addr, uint32_t va
                     } else {
                         s->ioredtbl[index] &= ~0xffffffffULL;
                         s->ioredtbl[index] |= val;
+                        pt_set_vector(index, (val << 24) >> 24);
                     }
                     ioapic_service(s);
                 }
diff --git a/qemu/hw/passthrough/neo_pci_tree.h 
b/qemu/hw/passthrough/neo_pci_tree.h
new file mode 100644
index 0000000..79adef9
--- /dev/null
+++ b/qemu/hw/passthrough/neo_pci_tree.h
@@ -0,0 +1,44 @@
+/*************************************************************************************************
+
+    Some data structures to save the result of the PCI probing.
+
+    Copyright (c) 2007, Neocleus: Guy Zana, Alex Novik
+
+**************************************************************************************************/
+
+#ifndef __XC_NEO_PCI_TREE_H__
+#define __XC_NEO_PCI_TREE_H__
+
+#include <linux/types.h>
+
+typedef __u8 u8;
+typedef __u16 u16;
+typedef __u32 u32;
+typedef __u64 u64;
+
+
+/************************************ Data Types / Structures 
************************************/
+
+typedef u32 pciaddr_t;
+
+#define MAX_IO_REGIONS                 (6)
+
+typedef struct pci_region_s {
+       int type;       /* Memory or port I/O */
+       int valid;
+       pciaddr_t base_addr;
+       pciaddr_t size;         /* size of the region */
+       int resource_fd;
+} pci_region_t;
+
+typedef struct neo_pci_dev_s {
+       u8 bus, dev, func;      /* Bus inside domain, device and function */
+       int irq;                /* IRQ number */
+       u16 region_number;      /* number of active regions */
+
+       /* Port I/O or MMIO Regions */
+       pci_region_t regions[MAX_IO_REGIONS];
+       int config_fd;
+} neo_pci_dev_t;
+
+#endif                         /* __XC_NEO_PCI_TREE_H__ */
diff --git a/qemu/hw/passthrough/passthrough.c 
b/qemu/hw/passthrough/passthrough.c
new file mode 100644
index 0000000..42540a7
--- /dev/null
+++ b/qemu/hw/passthrough/passthrough.c
@@ -0,0 +1,604 @@
+/******************************************************************************
+
+    PCI config handling, MMIO & PIO access through dom0 is done for
+    debugging needs.
+
+    Copyright (c) 2007, Neocleus, Alex Novik (address@hidden)
+    Copyright (c) 2007, Neocleus, Guy Zana (address@hidden)
+
+******************************************************************************/
+#include <stdio.h>
+#include <pthread.h>
+#include <sys/io.h>
+#include <sys/ioctl.h>
+
+#include "neo_pci_tree.h"
+
+typedef u64 resource_size_t;
+#define __deprecated
+
+#include <linux/ioport.h>
+#include "vl.h"
+#include "passthrough.h"
+
+#ifdef USE_KVM
+#include "qemu-kvm.h"
+#include <linux/kvm_para.h>
+extern kvm_context_t kvm_context;
+#endif
+extern FILE *logfile;
+
+CPUReadMemoryFunc *pt_mmio_read_cb[3] = {
+       pt_mmio_readb,
+       pt_mmio_readw,
+       pt_mmio_readl
+};
+
+CPUWriteMemoryFunc *pt_mmio_write_cb[3] = {
+       pt_mmio_writeb,
+       pt_mmio_writew,
+       pt_mmio_writel
+};
+
+//#define PT_DEBUG
+
+#ifdef PT_DEBUG
+#define DEBUG(fmt, args...) fprintf(stderr, "%s: " fmt, __FUNCTION__ , ## args)
+#else
+#define DEBUG(fmt, args...)
+#endif
+
+#define pt_mmio_write(suffix, type)                                    \
+void pt_mmio_write##suffix(void *opaque, target_phys_addr_t e_phys,    \
+                               uint32_t value)                         \
+{                                                                      \
+       pt_region_t *r_access = (pt_region_t *)opaque;                  \
+       void *r_virt = (u8 *)r_access->r_virtbase +                  \
+                       (e_phys - r_access->e_physbase);             \
+       if (r_access->debug & PT_DEBUG_MMIO) {                           \
+               fprintf(logfile, "pt_mmio_write" #suffix              \
+                       ": e_physbase=%p e_phys=%p r_virt=%p value=%08x\n", \
+                       (void *)r_access->e_physbase, (void *)e_phys,        \
+                       r_virt, value);                                 \
+       }                                                               \
+       *(type *)r_virt = (type)value;                                  \
+}
+
+pt_mmio_write(b, u8)
+pt_mmio_write(w, u16)
+pt_mmio_write(l, u32)
+
+#define pt_mmio_read(suffix, type)                                     \
+uint32_t pt_mmio_read##suffix(void *opaque, target_phys_addr_t e_phys) \
+{                                                                      \
+       pt_region_t *r_access = (pt_region_t *)opaque;                  \
+       void *r_virt = (u8 *)r_access->r_virtbase +                  \
+                       (e_phys - r_access->e_physbase);             \
+       uint32_t value = (u32) (*(type *) r_virt);                      \
+       if (r_access->debug & PT_DEBUG_MMIO) {                           \
+               fprintf(logfile,                                        \
+                       "pt_mmio_read" #suffix ": e_physbase=%p "   \
+                       "e_phys=%p r_virt=%p value=%08x\n",           \
+                       (void *)r_access->e_physbase,                        \
+                       (void *)e_phys, r_virt, value);                 \
+       }                                                               \
+       return value;                                                   \
+}
+
+pt_mmio_read(b, u8)
+pt_mmio_read(w, u16)
+pt_mmio_read(l, u32)
+
+#define pt_ioport_write(suffix)                                                
\
+void pt_ioport_write##suffix(void *opaque, uint32_t addr, uint32_t value) \
+{                                                                      \
+       pt_region_t *r_access = (pt_region_t *)opaque;                  \
+       uint32_t r_pio = (unsigned long)r_access->r_virtbase         \
+                        + (addr - r_access->e_physbase);            \
+       if (r_access->debug & PT_DEBUG_PIO) {                            \
+               fprintf(logfile, "pt_ioport_write" #suffix            \
+                       ": r_pio=%08x e_physbase=%08x"                        \
+                       " r_virtbase=%08lx value=%08x\n",             \
+                       r_pio, (int)r_access->e_physbase,            \
+                       (unsigned long)r_access->r_virtbase, value); \
+       }                                                               \
+       out##suffix(value, r_pio);                                      \
+}
+
+pt_ioport_write(b)
+pt_ioport_write(w)
+pt_ioport_write(l)
+
+#define pt_ioport_read(suffix)                                         \
+uint32_t pt_ioport_read##suffix(void *opaque, uint32_t addr)           \
+{                                                                      \
+       pt_region_t *r_access = (pt_region_t *)opaque;                  \
+       uint32_t r_pio = (addr - r_access->e_physbase)                       \
+                       + (unsigned long)r_access->r_virtbase;               \
+       uint32_t value = in##suffix(r_pio);                             \
+       if (r_access->debug & PT_DEBUG_PIO) {                            \
+               fprintf(logfile, "pt_ioport_read" #suffix             \
+                       ": r_pio=%08x e_physbase=%08x r_virtbase=%08lx "\
+                       "value=%08x\n",                               \
+                       r_pio, (int)r_access->e_physbase,            \
+                       (unsigned long)r_access->r_virtbase, value); \
+       }                                                               \
+       return (value);                                                 \
+}
+
+pt_ioport_read(b)
+pt_ioport_read(w)
+pt_ioport_read(l)
+
+static void pt_iomem_map(PCIDevice * d, int region_num,
+                        uint32_t e_phys, uint32_t e_size, int type)
+{
+       pt_dev_t *r_dev = (pt_dev_t *) d;
+
+       r_dev->v_addrs[region_num].e_physbase = e_phys;
+
+       DEBUG("e_phys=%08x r_virt=%p type=%d len=%08x region_num=%d \n",
+             e_phys, r_dev->v_addrs[region_num].r_virtbase, type, e_size,
+             region_num);
+
+       cpu_register_physical_memory(e_phys,
+                                    r_dev->dev.io_regions[region_num].size,
+                                    r_dev->v_addrs[region_num].memory_index);
+}
+
+
+static void pt_ioport_map(PCIDevice * pci_dev, int region_num,
+                         uint32_t addr, uint32_t size, int type)
+{
+       pt_dev_t *r_dev = (pt_dev_t *) pci_dev;
+       int i;
+       uint32_t ((*rf[])(void *, uint32_t)) =  { pt_ioport_readb,
+                                                 pt_ioport_readw,
+                                                 pt_ioport_readl
+                                               };
+       void ((*wf[])(void *, uint32_t, uint32_t)) = { pt_ioport_writeb,
+                                                      pt_ioport_writew,
+                                                      pt_ioport_writel
+                                                    };
+
+       r_dev->v_addrs[region_num].e_physbase = addr;
+       fprintf(logfile, "pt_ioport_map: address=0x%x type=0x%x len=%d"
+               "region_num=%d \n", addr, type, size, region_num);
+
+       for (i = 0; i < 3; i++) {
+               register_ioport_write(addr, size, 1<<i, wf[i],
+                                     (void *) (r_dev->v_addrs + region_num));
+               register_ioport_read(addr, size, 1<<i, rf[i],
+                                    (void *) (r_dev->v_addrs + region_num));
+       }
+}
+
+static void pt_pci_write_config(PCIDevice * d, uint32_t address, uint32_t val,
+                               int len)
+{
+       int fd;
+
+       DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
+             ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7), (uint16_t) address,
+             val, len);
+
+       if (address == 0x4)
+               pci_default_write_config(d, address, val, len);
+
+       if ((address >= 0x10 && address <= 0x24) || address == 0x34 ||
+           address == 0x3c || address == 0x3d) {
+         /* used for update-mappings (BAR emulation) */
+               pci_default_write_config(d, address, val, len);
+               return;
+       }
+
+       DEBUG("NON BAR (%x.%x): address=%04x val=0x%08x len=%d\n",
+             ((d->devfn >> 3) & 0x1F), (d->devfn & 0x7), (uint16_t) address,
+             val, len);
+       fd = ((pt_dev_t *)d)->real_device.config_fd;
+       lseek(fd, address, SEEK_SET);
+       write(fd, &val, len);
+}
+
+static uint32_t pt_pci_read_config(PCIDevice *d, uint32_t address, int len)
+{
+       uint32_t val = 0;
+       int fd;
+
+       if ((address >= 0x10 && address <= 0x24) || address == 0x34 ||
+           address == 0x3c || address == 0x3d) {
+               val = pci_default_read_config(d, address, len);
+               DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
+                     (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val,
+                     len);
+               return (val);
+       }
+
+       /* vga specific, remove later */
+       if (address == 0xFC)
+               goto do_log;
+
+       fd = ((pt_dev_t *)d)->real_device.config_fd;
+       lseek(fd, address, SEEK_SET);
+       read(fd, &val, len);
+
+      do_log:
+       DEBUG("(%x.%x): address=%04x val=0x%08x len=%d\n",
+             (d->devfn >> 3) & 0x1F, (d->devfn & 0x7), address, val, len);
+
+       /* kill the special capabilities */
+       if (address == 4 && len == 4)
+               val &= ~0x100000;
+       else if (address == 6)
+               val &= ~0x10;
+
+       return (val);
+}
+
+
+int pt_register_regions(pci_region_t * io_regions,
+                       unsigned long regions_num, pt_dev_t * pci_dev)
+{
+       uint32_t i;
+       pci_region_t *cur_region = io_regions;
+
+       for (i = 0; i < regions_num; i++, cur_region++) {
+               if (!cur_region->valid)
+                       continue;
+#ifdef PT_DEBUG
+               pci_dev->v_addrs[i].debug |= PT_DEBUG_MMIO | PT_DEBUG_PIO;
+#endif
+               pci_dev->v_addrs[i].num = i;
+
+               /* handle memory io regions */
+               if (cur_region->type & IORESOURCE_MEM) {
+                       int t = cur_region->type & IORESOURCE_PREFETCH ? 
PCI_ADDRESS_SPACE_MEM_PREFETCH : PCI_ADDRESS_SPACE_MEM;
+
+                       /* map physical memory */
+                       pci_dev->v_addrs[i].e_physbase = cur_region->base_addr;
+                       pci_dev->v_addrs[i].r_virtbase =
+                               mmap(NULL, (cur_region->size + 0xFFF) & 
0xFFFFF000, PROT_WRITE | PROT_READ,
+                                           MAP_SHARED, 
cur_region->resource_fd, (off_t) 0);
+
+                       if ((void *) -1 ==
+                           pci_dev->v_addrs[i].r_virtbase) {
+                               fprintf(logfile, "NEO: Error: Couldn't mmap 
0x%x!\n",
+                                       (uint32_t) (cur_region->base_addr));
+                               return (-1);
+                       }
+
+                       /* add offset */
+                       pci_dev->v_addrs[i].r_virtbase += (cur_region->base_addr 
& 0xFFF);
+
+                       pci_register_io_region((PCIDevice *) pci_dev, i, 
cur_region->size, t, pt_iomem_map);
+
+                       pci_dev->v_addrs[i].memory_index =
+                           cpu_register_io_memory(0, pt_mmio_read_cb, 
pt_mmio_write_cb,
+                                               (void *) 
&(pci_dev->v_addrs[i]));
+
+                       continue;
+               }
+               /* handle port io regions */
+
+               pci_register_io_region((PCIDevice *) pci_dev, i, 
cur_region->size, PCI_ADDRESS_SPACE_IO,
+                                      pt_ioport_map);
+
+               pci_dev->v_addrs[i].e_physbase = cur_region->base_addr;
+               pci_dev->v_addrs[i].r_virtbase = (void 
*)(long)cur_region->base_addr;
+               pci_dev->v_addrs[i].memory_index = 0;        // not relevant 
for port io
+       }
+
+       /* success */
+       return (0);
+
+}
+
+int
+pt_get_real_device(pt_dev_t *pci_dev, uint8_t r_bus, uint8_t r_dev,
+                  uint8_t r_func)
+{
+       char dir[128], name[128], comp[16];
+       int fd, r = 0;
+       FILE *f;
+       unsigned long long start, end, size, flags;
+       pci_region_t *rp;
+       neo_pci_dev_t *dev = &pci_dev->real_device;
+
+       dev->region_number = 0;
+
+       sprintf(dir, "/sys/bus/pci/devices/0000:%02x:%02x.%x/",
+               r_bus, r_dev, r_func);
+       strcpy(name, dir);
+       strcat(name, "config");
+       if ((fd = open(name, O_RDWR)) == -1) {
+               fprintf(logfile, "%s: %m\n", name);
+               return 1;
+       }
+       dev->config_fd = fd;
+       read(fd, pci_dev->dev.config, sizeof pci_dev->dev.config);
+
+       strcpy(name, dir);
+       strcat(name, "resource");
+       if ((f = fopen(name, "r")) == NULL) {
+               fprintf(logfile, "%s: %m\n", name);
+               return 1;
+       }
+
+       for (r = 0; fscanf(f, "%lli %lli %lli\n", &start, &end, &flags) == 3; 
r++) {
+               rp = dev->regions + r;
+               rp->valid = 0;
+               size = end - start + 1;
+               flags &= IORESOURCE_IO | IORESOURCE_MEM | IORESOURCE_PREFETCH;
+               if (size == 0 || (flags & ~IORESOURCE_PREFETCH) == 0)
+                       continue;
+               if (flags & IORESOURCE_MEM) {
+                       flags &= ~IORESOURCE_IO;
+                       sprintf(comp, "resource%d", r);
+                       strcpy(name, dir);
+                       strcat(name, comp);
+                       if ((fd = open(name, O_RDWR)) == -1)
+                               continue;               // probably ROM
+                       rp->resource_fd = fd;
+               } else
+                       flags &= ~IORESOURCE_PREFETCH;
+
+               rp->type = flags;
+               rp->valid = 1;
+               rp->base_addr = start;
+               rp->size = size;
+               fprintf(logfile, "region %d size %d start 0x%x type %d resource_fd 
%d\n", r, rp->size, start, rp->type, rp->resource_fd);
+       }
+       fclose(f);
+       
+       dev->region_number = r;
+       return 0;
+}
+
+/* From include/linux/pci.h in the kernel sources */
+#define PCI_DEVFN(slot,func)   ((((slot) & 0x1f) << 3) | ((func) & 0x07))
+
+pt_dev_t *register_real_device(PCIBus * e_bus, const char *e_dev_name,
+                              int e_devfn, uint8_t r_bus, uint8_t r_dev,
+                              uint8_t r_func, uint32_t machine_irq)
+{
+       int rc;
+       pt_dev_t *pci_dev;
+       uint8_t e_device, e_intx;
+       struct kvm_pv_passthrough_dev pv_pci_dev;
+
+       fprintf(logfile, "register_real_device: Registering real physical device %s 
(devfn=0x%x)\n", e_dev_name, e_devfn);
+
+       pci_dev = (pt_dev_t *) pci_register_device(e_bus, e_dev_name, 
sizeof(pt_dev_t), e_devfn,
+                                                  pt_pci_read_config, 
pt_pci_write_config);
+
+       if (NULL == pci_dev) {
+               fprintf(logfile, "register_real_device: Error: Couldn't register 
real device %s\n", e_dev_name);
+               return (NULL);
+       }
+       if (pt_get_real_device(pci_dev, r_bus, r_dev, r_func)) {
+               fprintf(logfile, "register_real_device: Error: Couldn't get real 
device (%s)!\n", e_dev_name);
+               return NULL;
+       }
+
+       /* handle real device's MMIO/PIO BARs */
+       if (pt_register_regions(pci_dev->real_device.regions, 
pci_dev->real_device.region_number, pci_dev))
+               return (NULL);
+
+       /* handle interrupt routing */
+       e_device = (pci_dev->dev.devfn >> 3) & 0x1f;
+       e_intx = pci_dev->dev.config[0x3d] - 1;
+       pci_dev->intpin = e_intx;
+       pci_dev->run = 0;
+       pci_dev->mirq = machine_irq;
+
+       /* bind machine_irq to device */
+       if (machine_irq) {
+               fprintf(logfile, "Binding mirq %u to device=0x%x intpin=0x%x\n",
+                               machine_irq, e_device, pci_dev->intpin);
+               rc = pt_bind_mirq(r_bus, r_dev, r_func);
+               if (rc) {
+                       fprintf(logfile, "pt_bind %d failed rc=%d\n", 
pci_dev->mirq, rc);
+                       return NULL;
+               }
+               sprintf(pci_dev->sirq, "%d", pci_dev->mirq);
+       }
+
+#ifdef USE_KVM
+       /* Let the host kernel know we'll dealing with this device hereafter */
+       pv_pci_dev.guest.busnr = pci_bus_num(e_bus);
+       pv_pci_dev.guest.devfn = PCI_DEVFN(e_device, r_func);
+       pv_pci_dev.mach.busnr  = r_bus;
+       pv_pci_dev.mach.devfn  = PCI_DEVFN(r_dev, r_func);
+
+       rc = ioctl(kvm_get_vm_fd(kvm_context), KVM_ASSIGN_PV_PCI_DEV,
+                  &pv_pci_dev);
+       if (rc == -1) {
+               fprintf(stderr, "Could not notify kernel about passthrough "
+                       "device\n");
+               perror("pt-ioctl:");
+               return NULL;
+       }
+#endif
+       fprintf(logfile, "register_real_device: Real physical device (%02x:%02x.%x) 
\"%s\" registered successfully!\n", r_bus, r_dev, r_func, e_dev_name);
+
+       return (pci_dev);
+}
+
+#define        MAX_PTDEVS 4
+struct {
+       char name[128];
+       int bus;
+       int dev;
+       int func;
+       int irq;
+       pt_dev_t *ptdev;
+} ptdevs[MAX_PTDEVS];
+
+int nptdevs;
+
+static QEMUBH *ptbh;
+static int irqfd;
+static pt_dev_t **apicv[0xfe]; //0x10 - 0xfe according to intel IOAPIC spec
+#define IRQHOOK_DEV "/dev/irqhook"
+static pthread_t irqthread;
+
+void pt_irq(void *arg)
+{
+       char buf[20];
+       int irq;
+       int i;
+       pt_dev_t *dev;
+
+       if (!irqfd) {
+               fprintf(stderr, "pt_irq: irqfd %d, exiting\n", irqfd);
+               exit(-1);
+       }
+
+       for (;;) {
+               if (read(irqfd, buf, 20) == -1) {
+                       if (errno == EINTR) continue;
+                       perror("irq read: ");
+               }
+
+               irq = atoi(buf);
+               DEBUG("read irq %d\n", irq);
+               if (!irq) continue;
+               
+               for (i = 0; i < nptdevs; i++) if ((dev = ptdevs[i].ptdev) && 
dev->mirq == irq) dev->run = 1;
+               qemu_bh_schedule(ptbh);
+       }
+}
+
+static void pt_bh(void *p)
+{
+       int i;
+       pt_dev_t *dev;
+       for (i = 0; i < nptdevs; i++)
+               if ((dev = ptdevs[i].ptdev) && dev->run) {
+                       qemu_set_irq(dev->dev.irq[dev->intpin], 1);
+                       dev->run = 0;
+                       if (cpu_single_env) cpu_interrupt(cpu_single_env, 
CPU_INTERRUPT_EXIT);
+               }
+}
+
+int pt_init(PCIBus * bus)
+{
+       pt_dev_t *dev;
+       int i, ret = 0;
+
+       iopl(3);
+
+       if (!(ptbh = qemu_bh_new(pt_bh, 0))) {
+               fprintf(logfile, "Couldn't register PT callback\n");
+               return -1;
+       }
+
+       if (!(irqfd = open(IRQHOOK_DEV, O_RDWR))) {
+               fprintf(logfile, "Couldn't open PT irqhook dev\n");
+               return -1;
+       }
+
+       if (pthread_create(&irqthread, 0, pt_irq, 0)) {
+               fprintf(logfile, "Couldn't create IRQ thread\n");
+               return -1;
+       }
+
+       for (i = 0; i < nptdevs; i++) {
+               dev = register_real_device(bus, ptdevs[i].name, -1, 
ptdevs[i].bus, ptdevs[i].dev, ptdevs[i].func, ptdevs[i].irq);
+
+               if (dev == NULL) {
+                       fprintf(logfile, "NEO: Error: Couldn't register %s\n", 
"AUDIO_0");
+                       ret = -1;
+               }
+               ptdevs[i].ptdev = dev;
+       }
+
+       /* success */
+       return (ret);
+}
+
+void
+add_passthrough_device(char *arg)
+{
+       /* name/bus:dev.func-intr */
+       char *cp, *cp1;
+
+       if (nptdevs >= MAX_PTDEVS) {
+               fprintf(logfile, "Too many passthrough devices (max %d)\n", 
MAX_PTDEVS);
+               return;
+       }
+       strcpy(ptdevs[nptdevs].name, arg);
+       cp = strchr(ptdevs[nptdevs].name, '/');
+       if (cp == NULL)
+               goto bad;
+       *cp++ = 0;
+
+       ptdevs[nptdevs].bus = strtoul(cp, &cp1, 16);
+       if (*cp1 != ':')
+               goto bad;
+       cp = cp1 + 1;
+
+       ptdevs[nptdevs].dev = strtoul(cp, &cp1, 16);
+       if (*cp1 != '.')
+               goto bad;
+       cp = cp1 + 1;
+
+       ptdevs[nptdevs].func = strtoul(cp, &cp1, 16);
+       if (*cp1 != '-')
+               goto bad;
+       cp = cp1 + 1;
+
+       ptdevs[nptdevs].irq = strtoul(cp, &cp1, 0);
+       if (*cp1 != 0)
+               goto bad;
+
+       nptdevs++;
+       return;
+    bad:
+       fprintf(logfile, "passthrough arg (%s) not in the form of 
name/bus:dev.func-intr\n", arg);
+}
+
+void pt_ack_mirq(int vector)
+{
+       pt_dev_t **p = apicv[vector];
+       if (!p) return;
+
+       for (; *p; *p++) {
+               write(irqfd, (*p)->sirq, strlen((*p)->sirq));
+               qemu_set_irq((*p)->dev.irq[(*p)->intpin], 0);
+       }
+}
+
+int pt_bind_mirq(int bus, int dev, int fn)
+{
+       char s[64];
+       sprintf(s, "+%d:%d.%d", bus, dev, fn);
+       if (write(irqfd, s, strlen(s)) != strlen(s)) {
+               perror("pt_bind_mirq:");
+               exit(-1);
+       }
+       return 0;
+}
+
+void pt_set_vector(int irq, int vector)
+{
+       int i, j;
+       int pin = piix3_get_pin(irq);
+       pt_dev_t *pt, **p;
+
+       DEBUG("irq %d vector %d\n", irq, vector);
+       if (vector > 0xfe) return;
+       for (i = 0; i < nptdevs; i++) {
+               pt = ptdevs[i].ptdev;
+               if (!pt || pt->bound) continue;
+               if (pci_map_irq(&pt->dev, pt->intpin) == pin) {
+                       for (j = 1, p = apicv[vector]; p; j++, *p++);
+                       apicv[vector] = realloc(apicv[vector], j * sizeof pt);
+                       p = &apicv[vector][j];
+                       *(p-1) = pt;
+                       *p = 0;
+                       pt->bound = 1;
+               }
+       }
+       DEBUG("done\n");
+}
diff --git a/qemu/hw/passthrough/passthrough.h 
b/qemu/hw/passthrough/passthrough.h
new file mode 100644
index 0000000..3d8542d
--- /dev/null
+++ b/qemu/hw/passthrough/passthrough.h
@@ -0,0 +1,64 @@
+/*************************************************************************************************
+
+    PCI config handling, MMIO & PIO access through dom0 is done for debugging 
needs.
+
+    Copyright (c) 2007, Neocleus, Alex Novik (address@hidden)
+    Copyright (c) 2007, Neocleus, Guy Zana (address@hidden)
+
+**************************************************************************************************/
+
+#include <sys/mman.h>
+#include "vl.h"
+
+#ifndef __PASSTHROUGH_H__
+#define __PASSTHROUGH_H__
+
+#define PT_DEBUG_PIO   (0x01)
+#define PT_DEBUG_MMIO  (0x02)
+
+typedef struct pt_region_s {
+       target_phys_addr_t e_physbase;
+       uint32_t memory_index;
+       void *r_virtbase;       /* mmapped access address */
+       int num;                /* our index within v_addrs[] */
+       uint32_t debug;
+} pt_region_t;
+
+typedef struct pt_dev_s {
+       PCIDevice dev;
+       int intpin;
+       uint8_t debug_flags;
+       pt_region_t v_addrs[PCI_NUM_REGIONS];
+       neo_pci_dev_t real_device;
+       int run;
+       int mirq;
+       char sirq[4];
+       int bound;
+} pt_dev_t;
+
+
+/* MMIO access functions */
+uint32_t pt_mmio_readb(void *opaque, target_phys_addr_t e_phys);
+uint32_t pt_mmio_readw(void *opaque, target_phys_addr_t e_phys);
+uint32_t pt_mmio_readl(void *opaque, target_phys_addr_t e_phys);
+void pt_mmio_writeb(void *opaque, target_phys_addr_t e_phys, uint32_t value);
+void pt_mmio_writew(void *opaque, target_phys_addr_t e_phys, uint32_t value);
+void pt_mmio_writel(void *opaque, target_phys_addr_t e_phys, uint32_t value);
+
+/* PIO access functions */
+uint32_t pt_ioport_readb(void *opaque, uint32_t addr);
+uint32_t pt_ioport_readw(void *opaque, uint32_t addr);
+uint32_t pt_ioport_readl(void *opaque, uint32_t addr);
+void pt_ioport_writeb(void *opaque, uint32_t addr, uint32_t value);
+void pt_ioport_writew(void *opaque, uint32_t addr, uint32_t value);
+void pt_ioport_writel(void *opaque, uint32_t addr, uint32_t value);
+
+/* Registration functions */
+int register_pt_pio_region(uint32_t pio_start, uint32_t length,
+                          uint8_t do_logging);
+int register_pt_mmio_region(uint32_t mmio_addr, uint32_t length,
+                           uint8_t do_logging);
+
+#define logfile stderr
+
+#endif                         /* __PASSTHROUGH_H__ */
diff --git a/qemu/hw/pc.c b/qemu/hw/pc.c
index 8aae814..d7892e0 100644
--- a/qemu/hw/pc.c
+++ b/qemu/hw/pc.c
@@ -888,6 +888,9 @@ static void pc_init1(ram_addr_t ram_size, int vga_ram_size, 
int boot_device,
         }
     }
+    /* Initialize pass-through */
+    pt_init(pci_bus);
+
     rtc_state = rtc_init(0x70, i8259[8]);
register_ioport_read(0x92, 1, 1, ioport92_read, NULL);
diff --git a/qemu/hw/pci.c b/qemu/hw/pci.c
index 7e8adc4..8be3645 100644
--- a/qemu/hw/pci.c
+++ b/qemu/hw/pci.c
@@ -457,6 +457,11 @@ static void pci_set_irq(void *opaque, int irq_num, int level)
     bus->set_irq(bus->irq_opaque, irq_num, bus->irq_count[irq_num] != 0);
 }
+/*
+ * Invoke the map_irq hook of the bus that pci_dev is attached to, passing
+ * the device and pin through unchanged, and return whatever the hook
+ * returns.
+ */
+int pci_map_irq(PCIDevice *pci_dev, int pin)
+{
+       return pci_dev->bus->map_irq(pci_dev, pin);
+}
+
 /***********************************************************/
 /* monitor info on PCI */
diff --git a/qemu/hw/piix_pci.c b/qemu/hw/piix_pci.c
index 8c00f0d..a9d87bd 100644
--- a/qemu/hw/piix_pci.c
+++ b/qemu/hw/piix_pci.c
@@ -225,6 +225,12 @@ static void piix3_set_irq(qemu_irq *pic, int irq_num, int level)
     }
 }
+/*
+ * Find which of the four bytes at PCI config offsets 0x60-0x63 of piix3_dev
+ * (the PIRQx route control registers, per the PIIX3 datasheet) holds
+ * pic_irq.
+ *
+ * Returns the index (0-3) of the first matching byte, or -1 when none of
+ * them equals pic_irq.
+ */
+int piix3_get_pin(int pic_irq)
+{
+    int i;
+
+    for (i = 0; i < 4; i++) {
+        if (piix3_dev->config[0x60 + i] == pic_irq)
+            return i;
+    }
+
+    /*
+     * No match: return a defined sentinel instead of falling off the end of
+     * a non-void function, which leaves the caller with an undefined value.
+     */
+    return -1;
+}
+
 static void piix3_reset(PCIDevice *d)
 {
     uint8_t *pci_conf = d->config;
diff --git a/qemu/vl.c b/qemu/vl.c
index 634fb34..21b3d47 100644
--- a/qemu/vl.c
+++ b/qemu/vl.c
@@ -1182,6 +1182,7 @@ static void host_alarm_handler(int host_signum)
         SetEvent(data->host_alarm);
 #endif
         CPUState *env = cpu_single_env;
+
         if (env) {
             /* stop the currently executing cpu because a timer occured */
             cpu_interrupt(env, CPU_INTERRUPT_EXIT);
@@ -7532,6 +7533,7 @@ enum {
     QEMU_OPTION_vnc,
     QEMU_OPTION_no_acpi,
     QEMU_OPTION_no_kvm,
+    QEMU_OPTION_passthrough,
     QEMU_OPTION_no_kvm_irqchip,
     QEMU_OPTION_no_reboot,
     QEMU_OPTION_show_cursor,
@@ -7611,6 +7613,7 @@ const QEMUOption qemu_options[] = {
 #endif
 #ifdef USE_KVM
     { "no-kvm", 0, QEMU_OPTION_no_kvm },
+    { "passthrough", HAS_ARG, QEMU_OPTION_passthrough },
     { "no-kvm-irqchip", 0, QEMU_OPTION_no_kvm_irqchip },
 #endif
 #if defined(TARGET_PPC) || defined(TARGET_SPARC)
@@ -8427,6 +8430,9 @@ int main(int argc, char **argv)
            case QEMU_OPTION_no_kvm:
                kvm_allowed = 0;
                break;
+           case QEMU_OPTION_passthrough:
+               add_passthrough_device(optarg);
+               break;
            case QEMU_OPTION_no_kvm_irqchip:
                kvm_irqchip = 0;
                break;
diff --git a/tools/pci_barsize.c b/tools/pci_barsize.c
new file mode 100644
index 0000000..dd230c9
--- /dev/null
+++ b/tools/pci_barsize.c
@@ -0,0 +1,53 @@
+#include <stdio.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdlib.h>
+
+/*
+ * Print msg followed by the text for the current errno value (perror), then
+ * terminate the process with exit status 1.
+ *
+ * The function never returns, so the int return type carries no value.
+ * Because perror always appends the errno text, the output is only
+ * meaningful right after a failed call that set errno.
+ */
+int
+panic(char *msg)
+{
+       perror(msg);
+       exit(1);
+}
+
+/* Report a failure that has no errno behind it and exit with status 1. */
+static void
+die(const char *msg)
+{
+       fprintf(stderr, "%s\n", msg);
+       exit(1);
+}
+
+/*
+ * Read the 32-bit register at byte offset offs of fd into *val.
+ * Returns 0 on success, -1 (after printing a diagnostic) on failure.
+ */
+static int
+cfg_read(int fd, int offs, unsigned *val)
+{
+       ssize_t n;
+
+       if (lseek(fd, offs, SEEK_SET) < 0) {
+               perror("lseek");
+               return -1;
+       }
+       n = read(fd, val, sizeof(*val));
+       if (n < 0) {
+               perror("read");
+               return -1;
+       }
+       if ((size_t)n != sizeof(*val)) {
+               fprintf(stderr, "short read\n");
+               return -1;
+       }
+       return 0;
+}
+
+/*
+ * Write the 32-bit value val to byte offset offs of fd.
+ * Returns 0 on success, -1 (after printing a diagnostic) on failure.
+ */
+static int
+cfg_write(int fd, int offs, unsigned val)
+{
+       ssize_t n;
+
+       if (lseek(fd, offs, SEEK_SET) < 0) {
+               perror("lseek");
+               return -1;
+       }
+       n = write(fd, &val, sizeof(val));
+       if (n < 0) {
+               perror("write");
+               return -1;
+       }
+       if ((size_t)n != sizeof(val)) {
+               fprintf(stderr, "short write\n");
+               return -1;
+       }
+       return 0;
+}
+
+/*
+ * Print the current value and the decoded size of one PCI base address
+ * register (BAR).
+ *
+ * argv[1] names a file that gives read/write access to the device's PCI
+ * configuration space; the optional argv[2] selects the BAR (0-5, default
+ * 0).  The BAR is sized the usual way: save it, write all ones, read back
+ * which bits stuck, then restore the saved value.
+ *
+ * Only one 32-bit register is probed, so the upper half of a 64-bit memory
+ * BAR is not taken into account.  The register is read in host byte order.
+ *
+ * Exit status is 0 on success and 1 on any error.
+ */
+int
+main(int argc, char **argv)
+{
+       unsigned l, b, sz;
+       int fd, ismem, bar = 0, offs, failed;
+
+       /* Usage and range errors have no errno, so they bypass panic(). */
+       if (argc < 2)
+               die("usage: pci_barsize <file> [bar no]");
+
+       if (argc > 2) {
+               char *end;
+               long v = strtol(argv[2], &end, 0);
+
+               /* Reject garbage rather than silently probing BAR 0. */
+               if (end == argv[2] || *end != '\0' || v < 0 || v > 5)
+                       die("bar range 0-5");
+               bar = (int)v;
+       }
+
+       if ((fd = open(argv[1], O_RDWR)) < 0)
+               panic("open");
+
+       offs = 0x10 + bar * 4;
+       if (cfg_read(fd, offs, &l) < 0)
+               return 1;
+       printf("bar %d (offs 0x%x) == %x\n", bar, (unsigned)offs, l);
+
+       /* Bit 0 set marks an I/O BAR, clear marks a memory BAR. */
+       ismem = !(l & 0x01);
+
+       /* Size probe: write all ones and read back the address mask. */
+       if (cfg_write(fd, offs, ~0u) < 0)
+               return 1;
+       failed = cfg_read(fd, offs, &b) < 0;
+
+       /*
+        * Restore the original value before doing anything else, so the
+        * device is not left with a bogus BAR even if the read-back failed.
+        */
+       if (cfg_write(fd, offs, l) < 0)
+               failed = 1;
+       if (failed)
+               return 1;
+
+       /* The low 4 bits of a memory BAR and low 2 bits of an I/O BAR are flags. */
+       sz = ~(b & (ismem ? ~0xfu : ~0x3u)) + 1;
+       /*
+        * An I/O BAR may hardwire its upper 16 bits to zero; those bits
+        * must then be ignored or the computed size comes out huge.
+        */
+       if (!ismem && !(b & 0xffff0000u))
+               sz &= 0xffffu;
+       printf("bar %d %s size 0x%x == %uKB (%x)\n",
+               bar, ismem ? "memory" : "IO", sz, sz / 1024, b);
+
+       close(fd);
+       return 0;
+}
diff --git a/tools/pci_mmio.c b/tools/pci_mmio.c
new file mode 100644
index 0000000..6e91571
--- /dev/null
+++ b/tools/pci_mmio.c
@@ -0,0 +1,82 @@
+#include <stdio.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+
+/*
+ * Print msg followed by the text for the current errno value (perror), then
+ * terminate the process with exit status 1.
+ *
+ * The function never returns, so the int return type carries no value.
+ * Because perror always appends the errno text, the output is only
+ * meaningful right after a failed call that set errno.
+ */
+int
+panic(char *msg)
+{
+       perror(msg);
+       exit(1);
+}
+
+/* Report a failure that has no errno behind it and exit with status 1. */
+static void
+die(const char *msg)
+{
+       fprintf(stderr, "%s\n", msg);
+       exit(1);
+}
+
+/*
+ * Parse s as a complete unsigned number (decimal, 0x hex or 0 octal).
+ * Exits with errmsg when s is empty or has trailing garbage.
+ */
+static unsigned long
+parse_num(const char *s, const char *errmsg)
+{
+       char *end;
+       unsigned long v = strtoul(s, &end, 0);
+
+       if (end == s || *end != '\0')
+               die(errmsg);
+       return v;
+}
+
+/*
+ * Dump bytes of a memory-mappable file (e.g. a PCI resource file) to stdout.
+ *
+ * argv[1] is the file to map; the optional argv[2] is the starting byte
+ * offset (default 0) and the optional argv[3] the byte count (default: the
+ * whole file).  Counts of 1 or 2 are read with a single access of that
+ * width; anything larger is read as a sequence of aligned 4-byte accesses.
+ * The raw bytes go to stdout and progress messages to stderr.
+ *
+ * Exit status is 0 on success and 1 on any error.
+ */
+int
+main(int argc, char **argv)
+{
+       unsigned long sz, cnt, rsz, offs = 0;
+       int fd;
+       void *map;
+       volatile unsigned char *base;
+       struct stat st;
+
+       /* Usage and range errors have no errno, so they bypass panic(). */
+       if (argc < 2)
+               die("usage: pci_mmio <resource-file> [offset [count]]");
+
+       if ((fd = open(argv[1], O_RDWR)) < 0)
+               panic("open");
+
+       if (fstat(fd, &st) < 0)
+               panic("fstat");
+       cnt = sz = st.st_size;
+
+       if (argc > 2)
+               offs = parse_num(argv[2], "bad offset");
+       if (argc > 3)
+               cnt = parse_num(argv[3], "bad count");
+
+       if (cnt > sz)
+               die("bad count");
+       if (offs > sz)
+               die("bad offset");
+       if (cnt > sz - offs) {
+               cnt = sz - offs;
+               fprintf(stderr, "count truncated to %lu\n", cnt);
+       }
+       /* Every 4-byte access must be aligned, including a single one. */
+       if (cnt >= 4 && offs % 4)
+               die("reads of 4 bytes or more must be 4 bytes aligned");
+       if (cnt == 2 && offs % 2)
+               die("2 bytes read must be 2 bytes aligned");
+       if (cnt != 1 && cnt != 2 && cnt != 4 && cnt % 4)
+               die("counts must be 1, 2, 4 or 4*n");
+
+       fprintf(stderr, "reading %s [%lu:%lu]\n", argv[1], offs, offs + cnt);
+       map = mmap(NULL, sz, PROT_WRITE | PROT_READ, MAP_SHARED, fd, 0);
+
+       /* mmap reports failure with MAP_FAILED, never with a null pointer. */
+       if (map == MAP_FAILED)
+               panic("mmap");
+       base = map;
+
+       rsz = cnt > 4 ? 4 : cnt;
+       fprintf(stderr, "rsz: %lu cnt %lu\n", rsz, cnt);
+       while (cnt > 0) {
+               unsigned char v8;
+               unsigned short v16;
+               unsigned int v32;
+               const void *out;
+               ssize_t n;
+
+               /*
+                * Index into the mapping at the byte offset.  The volatile
+                * access keeps the compiler from merging or splitting the
+                * device read.
+                */
+               switch (rsz) {
+               case 1:
+                       v8 = *(base + offs);
+                       out = &v8;
+                       break;
+               case 2:
+                       v16 = *(volatile unsigned short *)(base + offs);
+                       out = &v16;
+                       break;
+               default:        /* 4: the only other width the checks allow */
+                       v32 = *(volatile unsigned int *)(base + offs);
+                       out = &v32;
+                       break;
+               }
+
+               n = write(1, out, rsz);
+               if (n < 0)
+                       panic("write");
+               if ((unsigned long)n != rsz)
+                       die("short write");
+
+               offs += rsz;
+               cnt -= rsz;
+       }
+       fprintf(stderr, "done\n");
+
+       munmap(map, sz);
+       close(fd);
+       return 0;
+}





reply via email to

[Prev in Thread] Current Thread [Next in Thread]