qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH 07/10] virtio: combine the read of a descriptor


From: Paolo Bonzini
Subject: Re: [Qemu-devel] [PATCH 07/10] virtio: combine the read of a descriptor
Date: Wed, 3 Feb 2016 14:40:56 +0100
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Thunderbird/38.5.0


On 03/02/2016 13:34, Gonglei (Arei) wrote:
> Hi,
> 
>> Subject: [Qemu-devel] [PATCH 07/10] virtio: combine the read of a descriptor
>>
>> Compared to vring, virtio has a performance penalty of 10%.  Fix it
>> by combining all the reads for a descriptor in a single address_space_read
>> call.  This also simplifies the code nicely.
>>
>> Reviewed-by: Cornelia Huck <address@hidden>
>> Signed-off-by: Paolo Bonzini <address@hidden>
>> ---
>>  hw/virtio/virtio.c | 86 ++++++++++++++++++++++--------------------------------
>>  1 file changed, 35 insertions(+), 51 deletions(-)
>>
> 
> Unbelievable! After applying this patch, the virtio-crypto speed can reach
> 74MB/sec, and the host CPU overhead is 180% (the main thread 100% and vcpu
> threads 80%)

The three patches from Vincenzo will help too.  What was it like before?

Also, are you using ioeventfd or dataplane?  virtio-crypto sounds like
something that could be very easily run outside the "big QEMU lock".

Paolo

> Testing AES-128-CBC cipher: 
>         Encrypting in chunks of 256 bytes: done. 371.94 MiB in 5.02 secs: 
> 74.12 MiB/sec (1523475 packets)
>         Encrypting in chunks of 256 bytes: done. 369.85 MiB in 5.01 secs: 
> 73.88 MiB/sec (1514900 packets)
>         Encrypting in chunks of 256 bytes: done. 371.07 MiB in 5.02 secs: 
> 73.97 MiB/sec (1519914 packets)
>         Encrypting in chunks of 256 bytes: done. 371.66 MiB in 5.02 secs: 
> 74.09 MiB/sec (1522309 packets)
>         Encrypting in chunks of 256 bytes: done. 371.79 MiB in 5.02 secs: 
> 74.12 MiB/sec (1522868 packets)
>         Encrypting in chunks of 256 bytes: done. 371.94 MiB in 5.02 secs: 
> 74.15 MiB/sec (1523457 packets)
>         Encrypting in chunks of 256 bytes: done. 371.90 MiB in 5.02 secs: 
> 74.14 MiB/sec (1523317 packets)
>         Encrypting in chunks of 256 bytes: done. 371.71 MiB in 5.02 secs: 
> 74.10 MiB/sec (1522522 packets)
> 
> 15.95%  qemu-kvm                 [.] address_space_translate
>   6.98%  qemu-kvm                 [.] qemu_get_ram_ptr
>   4.87%  libpthread-2.19.so       [.] __pthread_mutex_unlock_usercnt
>   4.40%  qemu-kvm                 [.] qemu_ram_addr_from_host
>   3.79%  qemu-kvm                 [.] address_space_map
>   3.41%  libc-2.19.so             [.] _int_malloc
>   3.29%  libc-2.19.so             [.] _int_free
>   3.07%  libc-2.19.so             [.] malloc
>   2.95%  libpthread-2.19.so       [.] pthread_mutex_lock
>   2.94%  qemu-kvm                 [.] phys_page_find
>   2.73%  qemu-kvm                 [.] address_space_translate_internal
>   2.65%  libc-2.19.so             [.] malloc_consolidate
>   2.35%  libc-2.19.so             [.] __memcpy_sse2_unaligned
>   1.72%  qemu-kvm                 [.] find_next_zero_bit
>   1.38%  qemu-kvm                 [.] address_space_rw
>   1.34%  qemu-kvm                 [.] object_unref
>   1.30%  qemu-kvm                 [.] object_ref
>   1.28%  qemu-kvm                 [.] virtqueue_pop
>   1.20%  libc-2.19.so             [.] memset
>   1.11%  qemu-kvm                 [.] virtio_notify
> 
> Thank you so much!
> 
> Regards,
> -Gonglei
> 
>> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
>> index 79a635f..2433866 100644
>> --- a/hw/virtio/virtio.c
>> +++ b/hw/virtio/virtio.c
>> @@ -107,35 +107,15 @@ void virtio_queue_update_rings(VirtIODevice *vdev,
>> int n)
>>                                vring->align);
>>  }
>>
>> -static inline uint64_t vring_desc_addr(VirtIODevice *vdev, hwaddr desc_pa,
>> -                                       int i)
>> +static void vring_desc_read(VirtIODevice *vdev, VRingDesc *desc,
>> +                            hwaddr desc_pa, int i)
>>  {
>> -    hwaddr pa;
>> -    pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, addr);
>> -    return virtio_ldq_phys(vdev, pa);
>> -}
>> -
>> -static inline uint32_t vring_desc_len(VirtIODevice *vdev, hwaddr desc_pa, 
>> int
>> i)
>> -{
>> -    hwaddr pa;
>> -    pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, len);
>> -    return virtio_ldl_phys(vdev, pa);
>> -}
>> -
>> -static inline uint16_t vring_desc_flags(VirtIODevice *vdev, hwaddr desc_pa,
>> -                                        int i)
>> -{
>> -    hwaddr pa;
>> -    pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, flags);
>> -    return virtio_lduw_phys(vdev, pa);
>> -}
>> -
>> -static inline uint16_t vring_desc_next(VirtIODevice *vdev, hwaddr desc_pa,
>> -                                       int i)
>> -{
>> -    hwaddr pa;
>> -    pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, next);
>> -    return virtio_lduw_phys(vdev, pa);
>> +    address_space_read(&address_space_memory, desc_pa + i *
>> sizeof(VRingDesc),
>> +                       MEMTXATTRS_UNSPECIFIED, (void *)desc,
>> sizeof(VRingDesc));
>> +    virtio_tswap64s(vdev, &desc->addr);
>> +    virtio_tswap32s(vdev, &desc->len);
>> +    virtio_tswap16s(vdev, &desc->flags);
>> +    virtio_tswap16s(vdev, &desc->next);
>>  }
>>
>>  static inline uint16_t vring_avail_flags(VirtQueue *vq)
>> @@ -345,18 +325,18 @@ static unsigned int virtqueue_get_head(VirtQueue
>> *vq, unsigned int idx)
>>      return head;
>>  }
>>
>> -static unsigned virtqueue_next_desc(VirtIODevice *vdev, hwaddr desc_pa,
>> -                                    unsigned int i, unsigned int max)
>> +static unsigned virtqueue_read_next_desc(VirtIODevice *vdev, VRingDesc
>> *desc,
>> +                                         hwaddr desc_pa, unsigned
>> int max)
>>  {
>>      unsigned int next;
>>
>>      /* If this descriptor says it doesn't chain, we're done. */
>> -    if (!(vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_NEXT)) {
>> +    if (!(desc->flags & VRING_DESC_F_NEXT)) {
>>          return max;
>>      }
>>
>>      /* Check they're not leading us off end of descriptors. */
>> -    next = vring_desc_next(vdev, desc_pa, i);
>> +    next = desc->next;
>>      /* Make sure compiler knows to grab that: we don't want it changing! */
>>      smp_wmb();
>>
>> @@ -365,6 +345,7 @@ static unsigned virtqueue_next_desc(VirtIODevice
>> *vdev, hwaddr desc_pa,
>>          exit(1);
>>      }
>>
>> +    vring_desc_read(vdev, desc, desc_pa, next);
>>      return next;
>>  }
>>
>> @@ -381,6 +362,7 @@ void virtqueue_get_avail_bytes(VirtQueue *vq,
>> unsigned int *in_bytes,
>>      while (virtqueue_num_heads(vq, idx)) {
>>          VirtIODevice *vdev = vq->vdev;
>>          unsigned int max, num_bufs, indirect = 0;
>> +        VRingDesc desc;
>>          hwaddr desc_pa;
>>          int i;
>>
>> @@ -388,9 +370,10 @@ void virtqueue_get_avail_bytes(VirtQueue *vq,
>> unsigned int *in_bytes,
>>          num_bufs = total_bufs;
>>          i = virtqueue_get_head(vq, idx++);
>>          desc_pa = vq->vring.desc;
>> +        vring_desc_read(vdev, &desc, desc_pa, i);
>>
>> -        if (vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_INDIRECT) {
>> -            if (vring_desc_len(vdev, desc_pa, i) % sizeof(VRingDesc)) {
>> +        if (desc.flags & VRING_DESC_F_INDIRECT) {
>> +            if (desc.len % sizeof(VRingDesc)) {
>>                  error_report("Invalid size for indirect buffer table");
>>                  exit(1);
>>              }
>> @@ -403,9 +386,10 @@ void virtqueue_get_avail_bytes(VirtQueue *vq,
>> unsigned int *in_bytes,
>>
>>              /* loop over the indirect descriptor table */
>>              indirect = 1;
>> -            max = vring_desc_len(vdev, desc_pa, i) / sizeof(VRingDesc);
>> -            desc_pa = vring_desc_addr(vdev, desc_pa, i);
>> +            max = desc.len / sizeof(VRingDesc);
>> +            desc_pa = desc.addr;
>>              num_bufs = i = 0;
>> +            vring_desc_read(vdev, &desc, desc_pa, i);
>>          }
>>
>>          do {
>> @@ -415,15 +399,15 @@ void virtqueue_get_avail_bytes(VirtQueue *vq,
>> unsigned int *in_bytes,
>>                  exit(1);
>>              }
>>
>> -            if (vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_WRITE)
>> {
>> -                in_total += vring_desc_len(vdev, desc_pa, i);
>> +            if (desc.flags & VRING_DESC_F_WRITE) {
>> +                in_total += desc.len;
>>              } else {
>> -                out_total += vring_desc_len(vdev, desc_pa, i);
>> +                out_total += desc.len;
>>              }
>>              if (in_total >= max_in_bytes && out_total >= max_out_bytes) {
>>                  goto done;
>>              }
>> -        } while ((i = virtqueue_next_desc(vdev, desc_pa, i, max)) != max);
>> +        } while ((i = virtqueue_read_next_desc(vdev, &desc, desc_pa,
>> max)) != max);
>>
>>          if (!indirect)
>>              total_bufs = num_bufs;
>> @@ -545,6 +529,7 @@ void *virtqueue_pop(VirtQueue *vq, size_t sz)
>>      unsigned out_num, in_num;
>>      hwaddr addr[VIRTQUEUE_MAX_SIZE];
>>      struct iovec iov[VIRTQUEUE_MAX_SIZE];
>> +    VRingDesc desc;
>>
>>      if (!virtqueue_num_heads(vq, vq->last_avail_idx)) {
>>          return NULL;
>> @@ -560,33 +545,32 @@ void *virtqueue_pop(VirtQueue *vq, size_t sz)
>>          vring_set_avail_event(vq, vq->last_avail_idx);
>>      }
>>
>> -    if (vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_INDIRECT) {
>> -        if (vring_desc_len(vdev, desc_pa, i) % sizeof(VRingDesc)) {
>> +    vring_desc_read(vdev, &desc, desc_pa, i);
>> +    if (desc.flags & VRING_DESC_F_INDIRECT) {
>> +        if (desc.len % sizeof(VRingDesc)) {
>>              error_report("Invalid size for indirect buffer table");
>>              exit(1);
>>          }
>>
>>          /* loop over the indirect descriptor table */
>> -        max = vring_desc_len(vdev, desc_pa, i) / sizeof(VRingDesc);
>> -        desc_pa = vring_desc_addr(vdev, desc_pa, i);
>> +        max = desc.len / sizeof(VRingDesc);
>> +        desc_pa = desc.addr;
>>          i = 0;
>> +        vring_desc_read(vdev, &desc, desc_pa, i);
>>      }
>>
>>      /* Collect all the descriptors */
>>      do {
>> -        hwaddr pa = vring_desc_addr(vdev, desc_pa, i);
>> -        size_t len = vring_desc_len(vdev, desc_pa, i);
>> -
>> -        if (vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_WRITE) {
>> +        if (desc.flags & VRING_DESC_F_WRITE) {
>>              virtqueue_map_desc(&in_num, addr + out_num, iov +
>> out_num,
>> -                               VIRTQUEUE_MAX_SIZE - out_num, true,
>> pa, len);
>> +                               VIRTQUEUE_MAX_SIZE - out_num, true,
>> desc.addr, desc.len);
>>          } else {
>>              if (in_num) {
>>                  error_report("Incorrect order for descriptors");
>>                  exit(1);
>>              }
>>              virtqueue_map_desc(&out_num, addr, iov,
>> -                               VIRTQUEUE_MAX_SIZE, false, pa, len);
>> +                               VIRTQUEUE_MAX_SIZE, false,
>> desc.addr, desc.len);
>>          }
>>
>>          /* If we've got too many, that implies a descriptor loop. */
>> @@ -594,7 +578,7 @@ void *virtqueue_pop(VirtQueue *vq, size_t sz)
>>              error_report("Looped descriptor");
>>              exit(1);
>>          }
>> -    } while ((i = virtqueue_next_desc(vdev, desc_pa, i, max)) != max);
>> +    } while ((i = virtqueue_read_next_desc(vdev, &desc, desc_pa, max)) !=
>> max);
>>
>>      /* Now copy what we have collected and mapped */
>>      elem = virtqueue_alloc_element(sz, out_num, in_num);
>> --
>> 2.5.0
>>
>>
> 
> 
> 



reply via email to

[Prev in Thread] Current Thread [Next in Thread]