qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [Qemu-devel] [PATCH] main-loop: Use epoll on Linux


From: Fam Zheng
Subject: Re: [Qemu-devel] [PATCH] main-loop: Use epoll on Linux
Date: Mon, 29 Sep 2014 17:17:48 +0800
User-agent: Mutt/1.5.23 (2014-03-12)

On Mon, 09/29 13:26, Fam Zheng wrote:
> A new implementation for qemu_poll_ns based on epoll is introduced here
> to address the slowness of g_poll and ppoll when the number of fds is
> high.
> 
> On my laptop this reduces the response time of virtio-blk on top of a
> null-aio device from 32 us to 29 us with few fds (~10), and from 48 us
> to 32 us with more fds (for example when virtio-serial is plugged and
> ~64 more io handlers are enabled).
> 
> Signed-off-by: Fam Zheng <address@hidden>
> ---
>  Makefile.objs            |   1 +
>  include/qemu/main-loop.h |   1 +
>  qemu-epoll.c             | 165 
> +++++++++++++++++++++++++++++++++++++++++++++++
>  qemu-timer.c             |   4 +-
>  tests/Makefile           |   2 +-
>  5 files changed, 171 insertions(+), 2 deletions(-)
>  create mode 100644 qemu-epoll.c
> 
> diff --git a/Makefile.objs b/Makefile.objs
> index 97db978..52ee086 100644
> --- a/Makefile.objs
> +++ b/Makefile.objs
> @@ -9,6 +9,7 @@ util-obj-y = util/ qobject/ qapi/ qapi-types.o qapi-visit.o 
> qapi-event.o
>  block-obj-y = async.o thread-pool.o
>  block-obj-y += nbd.o block.o blockjob.o
>  block-obj-y += main-loop.o iohandler.o qemu-timer.o
> +block-obj-$(CONFIG_LINUX) += qemu-epoll.o
>  block-obj-$(CONFIG_POSIX) += aio-posix.o
>  block-obj-$(CONFIG_WIN32) += aio-win32.o
>  block-obj-y += block/
> diff --git a/include/qemu/main-loop.h b/include/qemu/main-loop.h
> index 62c68c0..eb01b95 100644
> --- a/include/qemu/main-loop.h
> +++ b/include/qemu/main-loop.h
> @@ -307,5 +307,6 @@ void qemu_iohandler_poll(GArray *pollfds, int rc);
>  
>  QEMUBH *qemu_bh_new(QEMUBHFunc *cb, void *opaque);
>  void qemu_bh_schedule_idle(QEMUBH *bh);
> +int qemu_epoll(GPollFD *fds, guint nfds, int64_t timeout);
>  
>  #endif
> diff --git a/qemu-epoll.c b/qemu-epoll.c
> new file mode 100644
> index 0000000..89ec12a
> --- /dev/null
> +++ b/qemu-epoll.c
> @@ -0,0 +1,165 @@
> +/*
> + * QEMU Event Loop
> + *
> + * Copyright (c) 2014 Red Hat, Inc.
> + *
> + * Authors:
> + *      Fam Zheng <address@hidden>
> + *
> + * Permission is hereby granted, free of charge, to any person obtaining a 
> copy
> + * of this software and associated documentation files (the "Software"), to 
> deal
> + * in the Software without restriction, including without limitation the 
> rights
> + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
> + * copies of the Software, and to permit persons to whom the Software is
> + * furnished to do so, subject to the following conditions:
> + *
> + * The above copyright notice and this permission notice shall be included in
> + * all copies or substantial portions of the Software.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
> FROM,
> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
> + * THE SOFTWARE.
> + */
> +
> +#include <sys/epoll.h>
> +#include "qemu/main-loop.h"
> +
> +static bool g_poll_fds_changed(const GPollFD *fds_a, const guint nfds_a,
> +                               const GPollFD *fds_b, const guint nfds_b)
> +{
> +    int i;
> +
> +    if (nfds_a != nfds_b) {
> +        return true;
> +    }
> +    if (!!fds_a != !!fds_b) {
> +        return true;
> +    }
> +    for (i = 0; i < nfds_a; i++) {
> +        if (fds_a[i].fd != fds_b[i].fd ||
> +            fds_a[i].events != fds_b[i].events) {
> +            return true;
> +        }
> +    }
> +    return false;
> +}
> +
> +static inline int g_io_condition_from_epoll_events(int e)
> +{
> +    return (e & EPOLLIN  ? G_IO_IN : 0) |
> +           (e & EPOLLOUT ? G_IO_OUT : 0) |
> +           (e & EPOLLERR ? G_IO_ERR : 0) |
> +           (e & EPOLLHUP ? G_IO_HUP : 0);
> +}
> +
> +static inline void epoll_event_from_g_poll_fd(struct epoll_event *event,
> +                                              GPollFD *fd)
> +{
> +    int e = fd->events;
> +
> +    event->events = (e & G_IO_IN  ? EPOLLIN : 0) |
> +                    (e & G_IO_OUT ? EPOLLOUT : 0) |
> +                    (e & G_IO_ERR ? EPOLLERR : 0) |
> +                    (e & G_IO_HUP ? EPOLLHUP : 0);
> +    event->data.ptr = fd;
> +}
> +
> +static int epoll_prepare(int epollfd,
> +                         GPollFD *fds, guint nfds,
> +                         GPollFD **g_poll_fds,
> +                         guint *g_poll_nfds,
> +                         int **g_poll_fd_idx)
> +{
> +    int i;
> +
> +    GPollFD *pfds = NULL;
> +    int npfds = 0;
> +    int *idx = NULL;
> +
> +    for (i = 0; i < nfds; i++) {
> +        int r;
> +        struct epoll_event event;
> +        epoll_event_from_g_poll_fd(&event, &fds[i]);
> +
> +        r = epoll_ctl(epollfd, EPOLL_CTL_ADD, fds[i].fd, &event);
> +        if (r) {
> +            /* Some fds may not support epoll, fall back and add them to
> +             * ppoll_fds */
> +            pfds = g_renew(GPollFD, pfds, npfds + 1);
> +            pfds[npfds] = fds[i];
> +            idx = g_renew(int, idx, npfds + 1);
> +            idx[npfds] = i;
> +            npfds++;
> +        }
> +    }
> +
> +    g_free(*g_poll_fds);
> +    *g_poll_fds = pfds;
> +    *g_poll_nfds = npfds;
> +    *g_poll_fd_idx = idx;
> +
> +    return epollfd;
> +}
> +
> +int qemu_epoll(GPollFD *fds, guint nfds, int64_t timeout)
> +{
> +    /* A copy of last fd array, used to skip epoll_prepare when nothing
> +     * changed. */
> +    static GPollFD *last_fds;
> +    static guint last_nfds;
> +    /* An array of fds that failed epoll_ctl and fall back to ppoll. Rare 
> case
> +     * too.  */
> +    static GPollFD *g_poll_fds;
> +    static guint g_poll_nfds;
> +    static int *g_poll_fd_idx;
> +    static int epollfd = -1;
> +    const int max_events = 40;
> +    struct epoll_event events[max_events];
> +    int ret = 0;
> +    int r, i;
> +
> +    if (!last_fds || g_poll_fds_changed(fds, nfds, last_fds, last_nfds)) {
> +        if (epollfd >= 0) {
> +            close(epollfd);
> +        }
> +        epollfd = epoll_create(1);
> +        if (epollfd < 0) {
> +            perror("epoll_create");
> +            abort();
> +        }
> +        epollfd = epoll_prepare(epollfd, fds, nfds, &g_poll_fds, 
> &g_poll_nfds,
> +                                &g_poll_fd_idx);
> +        last_fds = g_memdup(fds, nfds * sizeof(GPollFD));

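g_poll_fd_idx and last_fds are both leaked: each time the fd set changes,
the previous arrays are overwritten without being freed (epoll_prepare only
frees the old g_poll_fds).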

Fam

> +        last_nfds = nfds;
> +    }
> +    if (g_poll_nfds) {
> +        ret = g_poll(g_poll_fds, g_poll_nfds, 
> qemu_timeout_ns_to_ms(timeout));
> +        if (ret < 0) {
> +            return ret;
> +        }
> +        /* Sync revents back to original fds */
> +        for (i = 0; i < ret; i++) {
> +            GPollFD *fd = &fds[g_poll_fd_idx[i]];
> +            assert(fd->fd == g_poll_fds[i].fd);
> +            fd->revents = g_poll_fds[i].revents;
> +        }
> +    }
> +
> +    r = epoll_wait(epollfd, events, max_events,
> +                   qemu_timeout_ns_to_ms(timeout));
> +    if (r < 0) {
> +        return r;
> +    }
> +
> +    for (i = 0; i < r; i++) {
> +        GPollFD *gpfd = events[i].data.ptr;
> +        gpfd->revents = g_io_condition_from_epoll_events(events[i].events);
> +    }
> +
> +    ret += r;
> +    return ret;
> +}



reply via email to

[Prev in Thread] Current Thread [Next in Thread]