qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH v6] e1000: add interrupt mitigation support


From: Alexey Kardashevskiy
Subject: Re: [Qemu-devel] [PATCH v6] e1000: add interrupt mitigation support
Date: Tue, 01 Apr 2014 17:21:38 +1100
User-agent: Mozilla/5.0 (X11; Linux i686 on x86_64; rv:24.0) Gecko/20100101 Thunderbird/24.4.0

On 03/31/2014 06:52 PM, Alexey Kardashevskiy wrote:
> On Sat, Aug 3, 2013 at 2:30 AM, Vincenzo Maffione <address@hidden>wrote:
> 
>> This patch partially implements the e1000 interrupt mitigation mechanisms.
>> Using a single QEMUTimer, it emulates the ITR register (which is the newer
>> mitigation register, recommended by Intel) and approximately emulates
>> RADV and TADV registers. TIDV and RDTR register functionalities are not
>> emulated (RDTR is only used to validate RADV, according to the e1000
>> specs).
>>
> 
> 
> I have been debugging e1000 today and discovered that e1000 does not
> survive migration.


Sorry, false alarm :)
The problem is in ppc64-kvm and not e1000-specific (but somehow it is a lot
easier to trigger by using e1000).



> The systems are PPC64 (POWER7), machine names are vpl1, vpl2, the guest
> name is aiktest0.
> QEMU command line is:
>     /home/aik/qemu-system-ppc64 \
>     -enable-kvm \
>     -m 1024 \
>     -machine pseries \
>     -nographic \
>     -vga none \
>     -initrd 1.cpio \
>     -kernel vml310 \
>     -netdev
> tap,id=id0,ifname=tap-testmig,script=ifup.sh,downscript=ifdown.sh \
>     -device e1000,id=id1,netdev=id0,mitigation=off,mac=C0:41:49:4b:00:00 \
>     -incoming tcp:vpl2:4000 \
> 
> What I do:
> 1. Run a guest.
> 2. Do dhclient
> 3. Start "ping -f vpl2" in the guest.
> 4. Start "ping -f aiktest0" on vpl1 (once) and vpl2 (twice, to create
> enough load).
> 5. Migrate.
> 
> Normally after migration, s->mit_irq_level is zero at the exit from
> e1000_post_load() and destination guest continues working fine. However
> sometime it is (s->mit_irq_level == 1). If this is the case, the device
> will stay dead and will not recover, periodically printing weird messages
> and trying to reset the adapter. RDH becomes equal to RDT, e1000_has_rxbufs
> returns false and that's it.
> 
> Running both guests with -device e1000,...mitigation=off helps.
> 
> Any idea how to fix this properly? Thanks.
> 
> 
> 
>>
>> RADV, TADV, TIDV and RDTR registers make up the older e1000 mitigation
>> mechanism and would need a timer each to be completely emulated. However,
>> a single timer has been used in order to reach a good compromise between
>> emulation accuracy and simplicity/efficiency.
>>
>> The implemented mechanism can be enabled/disabled specifying the command
>> line e1000-specific boolean parameter "mitigation", e.g.
>>
>>     qemu-system-x86_64 -device e1000,mitigation=on,... ...
>>
>> For more information, see the Software developer's manual at
>> http://download.intel.com/design/network/manuals/8254x_GBe_SDM.pdf.
>>
>> Interrupt mitigation boosts performance when the guest suffers from
>> an high interrupt rate (i.e. receiving short UDP packets at high packet
>> rate). For some numerical results see the following link
>> http://info.iet.unipi.it/~luigi/papers/20130520-rizzo-vm.pdf
>>
>> Signed-off-by: Vincenzo Maffione <address@hidden>
>> ---
>> Added pc-*-1.7 machines (default machine moved to pc-i440fx-1.7).
>>
>>  hw/i386/pc_piix.c    |  18 ++++++-
>>  hw/i386/pc_q35.c     |  16 ++++++-
>>  hw/net/e1000.c       | 131
>> +++++++++++++++++++++++++++++++++++++++++++++++++--
>>  include/hw/i386/pc.h |   8 ++++
>>  4 files changed, 167 insertions(+), 6 deletions(-)
>>
>> diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
>> index ab25458..f5183d4 100644
>> --- a/hw/i386/pc_piix.c
>> +++ b/hw/i386/pc_piix.c
>> @@ -336,8 +336,8 @@ static void pc_xen_hvm_init(QEMUMachineInitArgs *args)
>>  }
>>  #endif
>>
>> -static QEMUMachine pc_i440fx_machine_v1_6 = {
>> -    .name = "pc-i440fx-1.6",
>> +static QEMUMachine pc_i440fx_machine_v1_7 = {
>> +    .name = "pc-i440fx-1.7",
>>      .alias = "pc",
>>      .desc = "Standard PC (i440FX + PIIX, 1996)",
>>      .init = pc_init_pci,
>> @@ -347,6 +347,19 @@ static QEMUMachine pc_i440fx_machine_v1_6 = {
>>      DEFAULT_MACHINE_OPTIONS,
>>  };
>>
>> +static QEMUMachine pc_i440fx_machine_v1_6 = {
>> +    .name = "pc-i440fx-1.6",
>> +    .desc = "Standard PC (i440FX + PIIX, 1996)",
>> +    .init = pc_init_pci,
>> +    .hot_add_cpu = pc_hot_add_cpu,
>> +    .max_cpus = 255,
>> +    .compat_props = (GlobalProperty[]) {
>> +        PC_COMPAT_1_6,
>> +        { /* end of list */ }
>> +    },
>> +    DEFAULT_MACHINE_OPTIONS,
>> +};
>> +
>>  static QEMUMachine pc_i440fx_machine_v1_5 = {
>>      .name = "pc-i440fx-1.5",
>>      .desc = "Standard PC (i440FX + PIIX, 1996)",
>> @@ -769,6 +782,7 @@ static QEMUMachine xenfv_machine = {
>>
>>  static void pc_machine_init(void)
>>  {
>> +    qemu_register_machine(&pc_i440fx_machine_v1_7);
>>      qemu_register_machine(&pc_i440fx_machine_v1_6);
>>      qemu_register_machine(&pc_i440fx_machine_v1_5);
>>      qemu_register_machine(&pc_i440fx_machine_v1_4);
>> diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
>> index 2f35d12..09b0a54 100644
>> --- a/hw/i386/pc_q35.c
>> +++ b/hw/i386/pc_q35.c
>> @@ -230,13 +230,26 @@ static void pc_q35_init_1_4(QEMUMachineInitArgs
>> *args)
>>      pc_q35_init_1_5(args);
>>  }
>>
>> +static QEMUMachine pc_q35_machine_v1_7 = {
>> +    .name = "pc-q35-1.7",
>> +    .alias = "q35",
>> +    .desc = "Standard PC (Q35 + ICH9, 2009)",
>> +    .init = pc_q35_init,
>> +    .hot_add_cpu = pc_hot_add_cpu,
>> +    .max_cpus = 255,
>> +    DEFAULT_MACHINE_OPTIONS,
>> +};
>> +
>>  static QEMUMachine pc_q35_machine_v1_6 = {
>>      .name = "pc-q35-1.6",
>> -    .alias = "q35",
>>      .desc = "Standard PC (Q35 + ICH9, 2009)",
>>      .init = pc_q35_init,
>>      .hot_add_cpu = pc_hot_add_cpu,
>>      .max_cpus = 255,
>> +    .compat_props = (GlobalProperty[]) {
>> +        PC_COMPAT_1_6,
>> +        { /* end of list */ }
>> +    },
>>      DEFAULT_MACHINE_OPTIONS,
>>  };
>>
>> @@ -267,6 +280,7 @@ static QEMUMachine pc_q35_machine_v1_4 = {
>>
>>  static void pc_q35_machine_init(void)
>>  {
>> +    qemu_register_machine(&pc_q35_machine_v1_7);
>>      qemu_register_machine(&pc_q35_machine_v1_6);
>>      qemu_register_machine(&pc_q35_machine_v1_5);
>>      qemu_register_machine(&pc_q35_machine_v1_4);
>> diff --git a/hw/net/e1000.c b/hw/net/e1000.c
>> index fdb1f89..32014c2 100644
>> --- a/hw/net/e1000.c
>> +++ b/hw/net/e1000.c
>> @@ -135,9 +135,16 @@ typedef struct E1000State_st {
>>
>>      QEMUTimer *autoneg_timer;
>>
>> +    QEMUTimer *mit_timer;      /* Mitigation timer. */
>> +    bool mit_timer_on;         /* Mitigation timer is running. */
>> +    bool mit_irq_level;        /* Tracks interrupt pin level. */
>> +    uint32_t mit_ide;          /* Tracks E1000_TXD_CMD_IDE bit. */
>> +
>>  /* Compatibility flags for migration to/from qemu 1.3.0 and older */
>>  #define E1000_FLAG_AUTONEG_BIT 0
>> +#define E1000_FLAG_MIT_BIT 1
>>  #define E1000_FLAG_AUTONEG (1 << E1000_FLAG_AUTONEG_BIT)
>> +#define E1000_FLAG_MIT (1 << E1000_FLAG_MIT_BIT)
>>      uint32_t compat_flags;
>>  } E1000State;
>>
>> @@ -158,7 +165,8 @@ enum {
>>      defreg(TORH),      defreg(TORL),   defreg(TOTH),   defreg(TOTL),
>>      defreg(TPR),       defreg(TPT),    defreg(TXDCTL), defreg(WUFC),
>>      defreg(RA),                defreg(MTA),
>>  defreg(CRCERRS),defreg(VFTA),
>> -    defreg(VET),
>> +    defreg(VET),        defreg(RDTR),   defreg(RADV),   defreg(TADV),
>> +    defreg(ITR),
>>  };
>>
>>  static void
>> @@ -245,10 +253,21 @@ static const uint32_t mac_reg_init[] = {
>>                  E1000_MANC_RMCP_EN,
>>  };
>>
>> +/* Helper function, *curr == 0 means the value is not set */
>> +static inline void
>> +mit_update_delay(uint32_t *curr, uint32_t value)
>> +{
>> +    if (value && (*curr == 0 || value < *curr)) {
>> +        *curr = value;
>> +    }
>> +}
>> +
>>  static void
>>  set_interrupt_cause(E1000State *s, int index, uint32_t val)
>>  {
>>      PCIDevice *d = PCI_DEVICE(s);
>> +    uint32_t pending_ints;
>> +    uint32_t mit_delay;
>>
>>      if (val && (E1000_DEVID >= E1000_DEV_ID_82547EI_MOBILE)) {
>>          /* Only for 8257x */
>> @@ -266,7 +285,57 @@ set_interrupt_cause(E1000State *s, int index,
>> uint32_t val)
>>       */
>>      s->mac_reg[ICS] = val;
>>
>> -    qemu_set_irq(d->irq[0], (s->mac_reg[IMS] & s->mac_reg[ICR]) != 0);
>> +    pending_ints = (s->mac_reg[IMS] & s->mac_reg[ICR]);
>> +    if (!s->mit_irq_level && pending_ints) {
>> +        /*
>> +         * Here we detect a potential raising edge. We postpone raising
>> the
>> +         * interrupt line if we are inside the mitigation delay window
>> +         * (s->mit_timer_on == 1).
>> +         * We provide a partial implementation of interrupt mitigation,
>> +         * emulating only RADV, TADV and ITR (lower 16 bits, 1024ns units
>> for
>> +         * RADV and TADV, 256ns units for ITR). RDTR is only used to
>> enable
>> +         * RADV; relative timers based on TIDV and RDTR are not
>> implemented.
>> +         */
>> +        if (s->mit_timer_on) {
>> +            return;
>> +        }
>> +        if (s->compat_flags & E1000_FLAG_MIT) {
>> +            /* Compute the next mitigation delay according to pending
>> +             * interrupts and the current values of RADV (provided
>> +             * RDTR!=0), TADV and ITR.
>> +             * Then rearm the timer.
>> +             */
>> +            mit_delay = 0;
>> +            if (s->mit_ide &&
>> +                    (pending_ints & (E1000_ICR_TXQE | E1000_ICR_TXDW))) {
>> +                mit_update_delay(&mit_delay, s->mac_reg[TADV] * 4);
>> +            }
>> +            if (s->mac_reg[RDTR] && (pending_ints & E1000_ICS_RXT0)) {
>> +                mit_update_delay(&mit_delay, s->mac_reg[RADV] * 4);
>> +            }
>> +            mit_update_delay(&mit_delay, s->mac_reg[ITR]);
>> +
>> +            if (mit_delay) {
>> +                s->mit_timer_on = 1;
>> +                qemu_mod_timer(s->mit_timer,
>> +                        qemu_get_clock_ns(vm_clock) + mit_delay * 256);
>> +            }
>> +            s->mit_ide = 0;
>> +        }
>> +    }
>> +
>> +    s->mit_irq_level = (pending_ints != 0);
>> +    qemu_set_irq(d->irq[0], s->mit_irq_level);
>> +}
>> +
>> +static void
>> +e1000_mit_timer(void *opaque)
>> +{
>> +    E1000State *s = opaque;
>> +
>> +    s->mit_timer_on = 0;
>> +    /* Call set_interrupt_cause to update the irq level (if necessary). */
>> +    set_interrupt_cause(s, 0, s->mac_reg[ICR]);
>>  }
>>
>>  static void
>> @@ -307,6 +376,10 @@ static void e1000_reset(void *opaque)
>>      int i;
>>
>>      qemu_del_timer(d->autoneg_timer);
>> +    qemu_del_timer(d->mit_timer);
>> +    d->mit_timer_on = 0;
>> +    d->mit_irq_level = 0;
>> +    d->mit_ide = 0;
>>      memset(d->phy_reg, 0, sizeof d->phy_reg);
>>      memmove(d->phy_reg, phy_reg_init, sizeof phy_reg_init);
>>      memset(d->mac_reg, 0, sizeof d->mac_reg);
>> @@ -572,6 +645,7 @@ process_tx_desc(E1000State *s, struct e1000_tx_desc
>> *dp)
>>      struct e1000_context_desc *xp = (struct e1000_context_desc *)dp;
>>      struct e1000_tx *tp = &s->tx;
>>
>> +    s->mit_ide |= (txd_lower & E1000_TXD_CMD_IDE);
>>      if (dtype == E1000_TXD_CMD_DEXT) { // context descriptor
>>          op = le32_to_cpu(xp->cmd_and_length);
>>          tp->ipcss = xp->lower_setup.ip_fields.ipcss;
>> @@ -1047,7 +1121,8 @@ static uint32_t (*macreg_readops[])(E1000State *,
>> int) = {
>>      getreg(TORL),      getreg(TOTL),   getreg(IMS),    getreg(TCTL),
>>      getreg(RDH),       getreg(RDT),    getreg(VET),    getreg(ICS),
>>      getreg(TDBAL),     getreg(TDBAH),  getreg(RDBAH),  getreg(RDBAL),
>> -    getreg(TDLEN),     getreg(RDLEN),
>> +    getreg(TDLEN),      getreg(RDLEN),  getreg(RDTR),   getreg(RADV),
>> +    getreg(TADV),       getreg(ITR),
>>
>>      [TOTH] = mac_read_clr8,    [TORH] = mac_read_clr8, [GPRC] =
>> mac_read_clr4,
>>      [GPTC] = mac_read_clr4,    [TPR] = mac_read_clr4,  [TPT] =
>> mac_read_clr4,
>> @@ -1069,6 +1144,8 @@ static void (*macreg_writeops[])(E1000State *, int,
>> uint32_t) = {
>>      [TDH] = set_16bit, [RDH] = set_16bit,      [RDT] = set_rdt,
>>      [IMC] = set_imc,   [IMS] = set_ims,        [ICR] = set_icr,
>>      [EECD] = set_eecd, [RCTL] = set_rx_control, [CTRL] = set_ctrl,
>> +    [RDTR] = set_16bit, [RADV] = set_16bit,     [TADV] = set_16bit,
>> +    [ITR] = set_16bit,
>>      [RA ... RA+31] = &mac_writereg,
>>      [MTA ... MTA+127] = &mac_writereg,
>>      [VFTA ... VFTA+127] = &mac_writereg,
>> @@ -1150,6 +1227,11 @@ static void e1000_pre_save(void *opaque)
>>      E1000State *s = opaque;
>>      NetClientState *nc = qemu_get_queue(s->nic);
>>
>> +    /* If the mitigation timer is active, emulate a timeout now. */
>> +    if (s->mit_timer_on) {
>> +        e1000_mit_timer(s);
>> +    }
>> +
>>      if (!(s->compat_flags & E1000_FLAG_AUTONEG)) {
>>          return;
>>      }
>> @@ -1171,6 +1253,14 @@ static int e1000_post_load(void *opaque, int
>> version_id)
>>      E1000State *s = opaque;
>>      NetClientState *nc = qemu_get_queue(s->nic);
>>
>> +    if (!(s->compat_flags & E1000_FLAG_MIT)) {
>> +        s->mac_reg[ITR] = s->mac_reg[RDTR] = s->mac_reg[RADV] =
>> +            s->mac_reg[TADV] = 0;
>> +        s->mit_irq_level = false;
>> +    }
>> +    s->mit_ide = 0;
>> +    s->mit_timer_on = false;
>> +
>>      /* nc.link_down can't be migrated, so infer link_down according
>>       * to link status bit in mac_reg[STATUS].
>>       * Alternatively, restart link negotiation if it was in progress. */
>> @@ -1190,6 +1280,28 @@ static int e1000_post_load(void *opaque, int
>> version_id)
>>      return 0;
>>  }
>>
>> +static bool e1000_mit_state_needed(void *opaque)
>> +{
>> +    E1000State *s = opaque;
>> +
>> +    return s->compat_flags & E1000_FLAG_MIT;
>> +}
>> +
>> +static const VMStateDescription vmstate_e1000_mit_state = {
>> +    .name = "e1000/mit_state",
>> +    .version_id = 1,
>> +    .minimum_version_id = 1,
>> +    .minimum_version_id_old = 1,
>> +    .fields    = (VMStateField[]) {
>> +        VMSTATE_UINT32(mac_reg[RDTR], E1000State),
>> +        VMSTATE_UINT32(mac_reg[RADV], E1000State),
>> +        VMSTATE_UINT32(mac_reg[TADV], E1000State),
>> +        VMSTATE_UINT32(mac_reg[ITR], E1000State),
>> +        VMSTATE_BOOL(mit_irq_level, E1000State),
>> +        VMSTATE_END_OF_LIST()
>> +    }
>> +};
>> +
>>  static const VMStateDescription vmstate_e1000 = {
>>      .name = "e1000",
>>      .version_id = 2,
>> @@ -1267,6 +1379,14 @@ static const VMStateDescription vmstate_e1000 = {
>>          VMSTATE_UINT32_SUB_ARRAY(mac_reg, E1000State, MTA, 128),
>>          VMSTATE_UINT32_SUB_ARRAY(mac_reg, E1000State, VFTA, 128),
>>          VMSTATE_END_OF_LIST()
>> +    },
>> +    .subsections = (VMStateSubsection[]) {
>> +        {
>> +            .vmsd = &vmstate_e1000_mit_state,
>> +            .needed = e1000_mit_state_needed,
>> +        }, {
>> +            /* empty */
>> +        }
>>      }
>>  };
>>
>> @@ -1316,6 +1436,8 @@ pci_e1000_uninit(PCIDevice *dev)
>>
>>      qemu_del_timer(d->autoneg_timer);
>>      qemu_free_timer(d->autoneg_timer);
>> +    qemu_del_timer(d->mit_timer);
>> +    qemu_free_timer(d->mit_timer);
>>      memory_region_destroy(&d->mmio);
>>      memory_region_destroy(&d->io);
>>      qemu_del_nic(d->nic);
>> @@ -1371,6 +1493,7 @@ static int pci_e1000_init(PCIDevice *pci_dev)
>>      add_boot_device_path(d->conf.bootindex, dev, "/address@hidden");
>>
>>      d->autoneg_timer = qemu_new_timer_ms(vm_clock, e1000_autoneg_timer,
>> d);
>> +    d->mit_timer = qemu_new_timer_ns(vm_clock, e1000_mit_timer, d);
>>
>>      return 0;
>>  }
>> @@ -1385,6 +1508,8 @@ static Property e1000_properties[] = {
>>      DEFINE_NIC_PROPERTIES(E1000State, conf),
>>      DEFINE_PROP_BIT("autonegotiation", E1000State,
>>                      compat_flags, E1000_FLAG_AUTONEG_BIT, true),
>> +    DEFINE_PROP_BIT("mitigation", E1000State,
>> +                    compat_flags, E1000_FLAG_MIT_BIT, true),
>>      DEFINE_PROP_END_OF_LIST(),
>>  };
>>
>> diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
>> index 3a0c4e3..894c124 100644
>> --- a/include/hw/i386/pc.h
>> +++ b/include/hw/i386/pc.h
>> @@ -214,7 +214,15 @@ void pvpanic_init(ISABus *bus);
>>
>>  int e820_add_entry(uint64_t, uint64_t, uint32_t);
>>
>> +#define PC_COMPAT_1_6 \
>> +        {\
>> +            .driver   = "e1000",\
>> +            .property = "mitigation",\
>> +            .value    = "off",\
>> +        }
>> +
>>  #define PC_COMPAT_1_5 \
>> +        PC_COMPAT_1_6, \
>>          {\
>>              .driver   = "Conroe-" TYPE_X86_CPU,\
>>              .property = "model",\
>> --
>> 1.8.3.4
>>
>>
>>
> 
> 


-- 
Alexey



reply via email to

[Prev in Thread] Current Thread [Next in Thread]