qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [v2 1/2] cutils: add avx2 instruction optimization


From: Li, Liang Z
Subject: Re: [Qemu-devel] [v2 1/2] cutils: add avx2 instruction optimization
Date: Fri, 13 Nov 2015 02:49:56 +0000

> > This patch uses the ifunc mechanism to select the proper function at
> > run time: on platforms that support AVX2, execute the AVX2 instructions;
> > otherwise, execute the original code.
> >
> > Signed-off-by: Liang Li <address@hidden>
> > ---
> >  include/qemu-common.h | 28 +++++++++++++++------
> >  util/Makefile.objs    |  2 ++
> >  util/avx2.c           | 69
> +++++++++++++++++++++++++++++++++++++++++++++++++++
> >  util/cutils.c         | 53 +++++++++++++++++++++++++++++++++++++--
> >  4 files changed, 143 insertions(+), 9 deletions(-)  create mode
> > 100644 util/avx2.c
> >
> > diff --git a/include/qemu-common.h b/include/qemu-common.h index
> > 2f74540..9fa7501 100644
> > --- a/include/qemu-common.h
> > +++ b/include/qemu-common.h
> > @@ -484,15 +484,29 @@ void qemu_hexdump(const char *buf, FILE *fp,
> > const char *prefix, size_t size);  #endif
> >
> >  #define BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR 8 -static inline
> > bool -can_use_buffer_find_nonzero_offset(const void *buf, size_t len)
> > -{
> > -    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
> > -                   * sizeof(VECTYPE)) == 0
> > -            && ((uintptr_t) buf) % sizeof(VECTYPE) == 0);
> > -}
> > +bool can_use_buffer_find_nonzero_offset(const void *buf, size_t len);
> > +
> >  size_t buffer_find_nonzero_offset(const void *buf, size_t len);
> >
> > +extern bool
> > +can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len);
> > +
> > +extern size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t
> > +len);
> > +
> > +extern bool
> > +can_use_buffer_find_nonzero_offset_inner(const void *buf, size_t
> > +len);
> > +
> > +extern size_t buffer_find_nonzero_offset_inner(const void *buf,
> > +size_t len);
> > +
> > +__asm__(".type can_use_buffer_find_nonzero_offset,
> > +\%gnu_indirect_function"); __asm__(".type buffer_find_nonzero_offset,
> > +\%gnu_indirect_function");
> > +
> > +
> > +void *can_use_buffer_find_nonzero_offset_ifunc(void) \
> > +                     __asm__("can_use_buffer_find_nonzero_offset");
> > +
> > +void *buffer_find_nonzero_offset_ifunc(void) \
> > +                     __asm__("buffer_find_nonzero_offset");
> >  /*
> >   * helper to parse debug environment variables
> >   */
> > diff --git a/util/Makefile.objs b/util/Makefile.objs index
> > d7cc399..6aacad7 100644
> > --- a/util/Makefile.objs
> > +++ b/util/Makefile.objs
> > @@ -1,4 +1,5 @@
> >  util-obj-y = osdep.o cutils.o unicode.o qemu-timer-common.o
> > +util-obj-y += avx2.o
> >  util-obj-$(CONFIG_POSIX) += compatfd.o
> >  util-obj-$(CONFIG_POSIX) += event_notifier-posix.o
> >  util-obj-$(CONFIG_POSIX) += mmap-alloc.o @@ -29,3 +30,4 @@ util-obj-y
> > += qemu-coroutine.o qemu-coroutine-lock.o qemu-coroutine-io.o
> > util-obj-y += qemu-coroutine-sleep.o  util-obj-y +=
> > coroutine-$(CONFIG_COROUTINE_BACKEND).o
> >  util-obj-y += buffer.o
> > +avx2.o-cflags      := $(AVX2_CFLAGS)
> > diff --git a/util/avx2.c b/util/avx2.c new file mode 100644 index
> > 0000000..0e6915a
> > --- /dev/null
> > +++ b/util/avx2.c
> > @@ -0,0 +1,69 @@
> > +#include "qemu-common.h"
> > +
> > +#ifdef __AVX2__
> > +#include <immintrin.h>
> > +#define AVX2_VECTYPE        __m256i
> > +#define AVX2_SPLAT(p)       _mm256_set1_epi8(*(p))
> > +#define AVX2_ALL_EQ(v1, v2) \
> > +    (_mm256_movemask_epi8(_mm256_cmpeq_epi8(v1, v2)) == 0xFFFFFFFF)
> > +#define AVX2_VEC_OR(v1, v2) (_mm256_or_si256(v1, v2))
> > +
> > +inline bool
> > +can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
> > +{
> > +    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
> > +                   * sizeof(AVX2_VECTYPE)) == 0
> > +            && ((uintptr_t) buf) % sizeof(AVX2_VECTYPE) == 0); }
> > +
> > +size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t len) {
> > +    const AVX2_VECTYPE *p = buf;
> > +    const AVX2_VECTYPE zero = (AVX2_VECTYPE){0};
> > +    size_t i;
> > +
> > +    assert(can_use_buffer_find_nonzero_offset_avx2(buf, len));
> > +
> > +    if (!len) {
> > +        return 0;
> > +    }
> > +
> > +    for (i = 0; i < BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR; i++) {
> > +        if (!AVX2_ALL_EQ(p[i], zero)) {
> > +            return i * sizeof(AVX2_VECTYPE);
> > +        }
> > +    }
> > +
> > +    for (i = BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR;
> > +         i < len / sizeof(AVX2_VECTYPE);
> > +         i += BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR) {
> > +        AVX2_VECTYPE tmp0 = AVX2_VEC_OR(p[i + 0], p[i + 1]);
> > +        AVX2_VECTYPE tmp1 = AVX2_VEC_OR(p[i + 2], p[i + 3]);
> > +        AVX2_VECTYPE tmp2 = AVX2_VEC_OR(p[i + 4], p[i + 5]);
> > +        AVX2_VECTYPE tmp3 = AVX2_VEC_OR(p[i + 6], p[i + 7]);
> > +        AVX2_VECTYPE tmp01 = AVX2_VEC_OR(tmp0, tmp1);
> > +        AVX2_VECTYPE tmp23 = AVX2_VEC_OR(tmp2, tmp3);
> > +        if (!AVX2_ALL_EQ(AVX2_VEC_OR(tmp01, tmp23), zero)) {
> > +            break;
> > +        }
> > +    }
> > +
> > +    return i * sizeof(AVX2_VECTYPE);
> > +}
> > +
> > +#else
> > +/* use the original functions if avx2 is not enabled when buiding*/
> > +
> > +inline bool
> > +can_use_buffer_find_nonzero_offset_avx2(const void *buf, size_t len)
> > +{
> > +    return can_use_buffer_find_nonzero_offset_inner(buf, len); }
> > +
> > +inline size_t buffer_find_nonzero_offset_avx2(const void *buf, size_t
> > +len) {
> > +    return buffer_find_nonzero_offset_inner(buf, len); }
> > +
> > +#endif
> > +
> > diff --git a/util/cutils.c b/util/cutils.c index cfeb848..cd478ce
> > 100644
> > --- a/util/cutils.c
> > +++ b/util/cutils.c
> > @@ -26,6 +26,7 @@
> >  #include <math.h>
> >  #include <limits.h>
> >  #include <errno.h>
> > +#include <cpuid.h>
> >
> >  #include "qemu/sockets.h"
> >  #include "qemu/iov.h"
> > @@ -161,6 +162,54 @@ int qemu_fdatasync(int fd)  #endif  }
> >
> > +/* old compiler maynot define bit_AVX2 */ #ifndef bit_AVX2 #define
> > +bit_AVX2 (1 << 5) #endif
> > +
> > +static inline bool avx2_support(void) {
> > +    int a, b, c, d;
> > +
> > +    if (__get_cpuid_max(0, NULL) < 7) {
> > +        printf("max cpuid < 7\n");
> > +        return false;
> > +    }
> > +
> > +    __cpuid_count(7, 0, a, b, c, d);
> > +    printf("b = %x\n", b);
> > +    return b & bit_AVX2;
> > +}
> > +
> > +void *buffer_find_nonzero_offset_ifunc(void)
> > +{
> > +    printf("deciding %s\n", __func__);
> > +
> > +    typeof(buffer_find_nonzero_offset) *func = (avx2_support()) ?
> > +        buffer_find_nonzero_offset_avx2 :
> > + buffer_find_nonzero_offset_inner;
> > +
> > +    return func;
> > +}
> > +
> > +void *can_use_buffer_find_nonzero_offset_ifunc(void)
> > +{
> > +    printf("deciding %s\n", __func__);
> > +
> > +    typeof(can_use_buffer_find_nonzero_offset) *func = (avx2_support()) ?
> > +        can_use_buffer_find_nonzero_offset_avx2 :
> > +        can_use_buffer_find_nonzero_offset_inner;
> > +
> > +    return func;
> > +}
> > +
> > +inline bool
> > +can_use_buffer_find_nonzero_offset_inner(const void *buf, size_t len)
> > +{
> > +    return (len % (BUFFER_FIND_NONZERO_OFFSET_UNROLL_FACTOR
> > +                   * sizeof(VECTYPE)) == 0
> > +            && ((uintptr_t) buf) % sizeof(VECTYPE) == 0); }
> > +
> >  /*
> >   * Searches for an area with non-zero content in a buffer
> >   *
> > @@ -181,13 +230,13 @@ int qemu_fdatasync(int fd)
> >   * If the buffer is all zero the return value is equal to len.
> >   */
> >
> > -size_t buffer_find_nonzero_offset(const void *buf, size_t len)
> > +size_t buffer_find_nonzero_offset_inner(const void *buf, size_t len)
> >  {
> >      const VECTYPE *p = buf;
> >      const VECTYPE zero = (VECTYPE){0};
> >      size_t i;
> >
> > -    assert(can_use_buffer_find_nonzero_offset(buf, len));
> > +    assert(can_use_buffer_find_nonzero_offset_inner(buf, len));
> >
> >      if (!len) {
> >          return 0;
> >
> 
> The main issue here is that you are not testing whether the compiler supports
> gnu_indirect_function.
> 
> I suggest that you start by moving the functions to util/buffer-zero.c
> 
> Then the structure should be something like
> 
> #ifdef CONFIG_HAVE_AVX2
> #include <immintrin.h>
> #endif
> 
> ... define buffer_find_nonzero_offset_inner ...
> ... define can_use_buffer_find_nonzero_offset_inner ...

> #if defined CONFIG_HAVE_GNU_IFUNC && defined CONFIG_HAVE_AVX2
> ... define buffer_find_nonzero_offset_avx2 ...
> ... define can_use_buffer_find_nonzero_offset_avx2 ...
> ... define the indirect functions ...
> #else
> ... define buffer_find_nonzero_offset that just calls
> buffer_find_nonzero_offset_inner ...
> ... define can_use_buffer_find_nonzero_offset that just calls
> can_use_buffer_find_nonzero_offset_inner ...
> #endif
> 
> Thanks,
> 
> Paolo

buffer_find_nonzero_offset_inner and buffer_find_nonzero_offset_avx2 can't be
defined in the same .c file. Otherwise, if '-mavx2' is enabled for that file,
buffer_find_nonzero_offset_inner() will also be compiled to AVX2 instructions.

Liang


reply via email to

[Prev in Thread] Current Thread [Next in Thread]