qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH] console: make QMP screendump use coroutine


From: Dr. David Alan Gilbert
Subject: Re: [PATCH] console: make QMP screendump use coroutine
Date: Thu, 20 Feb 2020 20:11:55 +0000
User-agent: Mutt/1.13.3 (2020-01-12)

* Markus Armbruster (address@hidden) wrote:
> Cc: David for questions regarding the HMP core.  David, please look for
> "Is HMP blocking the main loop a problem?"
> 
> Marc-André Lureau <address@hidden> writes:
> 
> > Hi
> >
> > On Thu, Feb 20, 2020 at 8:49 AM Markus Armbruster <address@hidden> wrote:
> >>
> >> Marc-André Lureau <address@hidden> writes:
> >>
> >> > Thanks to the QMP coroutine support, the screendump handler can
> >> > trigger a graphic_hw_update(), yield and let the main loop run until
> >> > update is done. Then the handler is resumed, and the ppm_save() will
> >> > write the screen image to disk in the coroutine context (thus
> >> > non-blocking).
> >> >
> >> > For now, HMP doesn't have coroutine support, so it remains potentially
> >> > outdated or glitched.
> >> >
> >> > Fixes:
> >> > https://bugzilla.redhat.com/show_bug.cgi?id=1230527
> >> >
> >> > Based-on: <address@hidden>
> >> >
> >> > Cc: Kevin Wolf <address@hidden>
> >> > Signed-off-by: Marc-André Lureau <address@hidden>
> >> > ---
> >> >  qapi/ui.json    |  3 ++-
> >> >  ui/console.c    | 35 +++++++++++++++++++++++++++--------
> >> >  ui/trace-events |  2 +-
> >> >  3 files changed, 30 insertions(+), 10 deletions(-)
> >> >
> >> > diff --git a/qapi/ui.json b/qapi/ui.json
> >> > index e04525d8b4..d941202f34 100644
> >> > --- a/qapi/ui.json
> >> > +++ b/qapi/ui.json
> >> > @@ -96,7 +96,8 @@
> >> >  #
> >> >  ##
> >> >  { 'command': 'screendump',
> >> > -  'data': {'filename': 'str', '*device': 'str', '*head': 'int'} }
> >> > +  'data': {'filename': 'str', '*device': 'str', '*head': 'int'},
> >> > +  'coroutine': true }
> >> >
> >> >  ##
> >> >  # == Spice
> >> > diff --git a/ui/console.c b/ui/console.c
> >> > index ac79d679f5..db184b473f 100644
> >> > --- a/ui/console.c
> >> > +++ b/ui/console.c
> >> > @@ -167,6 +167,7 @@ struct QemuConsole {
> >> >      QEMUFIFO out_fifo;
> >> >      uint8_t out_fifo_buf[16];
> >> >      QEMUTimer *kbd_timer;
> >> > +    Coroutine *screendump_co;
> >> >
> >> >      QTAILQ_ENTRY(QemuConsole) next;
> >> >  };
> >> > @@ -194,7 +195,6 @@ static void dpy_refresh(DisplayState *s);
> >> >  static DisplayState *get_alloc_displaystate(void);
> >> >  static void text_console_update_cursor_timer(void);
> >> >  static void text_console_update_cursor(void *opaque);
> >> > -static bool ppm_save(int fd, DisplaySurface *ds, Error **errp);
> >> >
> >> >  static void gui_update(void *opaque)
> >> >  {
> >> > @@ -263,6 +263,9 @@ static void gui_setup_refresh(DisplayState *ds)
> >> >
> >> >  void graphic_hw_update_done(QemuConsole *con)
> >> >  {
> >> > +    if (con && con->screendump_co) {
> >>
> >> How can !con happen?
> >
> > I don't think it can happen anymore (the patch evolved over several
> > years, this is probably a left-over). In any case, it doesn't hurt.
> 
> I hate such dead checks, because they make me assume they can actually
> happen.  Incorrect assumptions breed bugs.
> 
> But I'm willing to defer to the maintainer here.  Gerd?
> 
> >> > +        aio_co_wake(con->screendump_co);
> >> > +    }
> >> >  }
> >> >
> >> >  void graphic_hw_update(QemuConsole *con)
> >> > @@ -310,16 +313,16 @@ void graphic_hw_invalidate(QemuConsole *con)
> >> >      }
> >> >  }
> >> >
> >> > -static bool ppm_save(int fd, DisplaySurface *ds, Error **errp)
> >> > +static bool ppm_save(int fd, pixman_image_t *image, Error **errp)
> >> >  {
> >> > -    int width = pixman_image_get_width(ds->image);
> >> > -    int height = pixman_image_get_height(ds->image);
> >> > +    int width = pixman_image_get_width(image);
> >> > +    int height = pixman_image_get_height(image);
> >> >      g_autoptr(Object) ioc = OBJECT(qio_channel_file_new_fd(fd));
> >> >      g_autofree char *header = NULL;
> >> >      g_autoptr(pixman_image_t) linebuf = NULL;
> >> >      int y;
> >> >
> >> > -    trace_ppm_save(fd, ds);
> >> > +    trace_ppm_save(fd, image);
> >> >
> >> >      header = g_strdup_printf("P6\n%d %d\n%d\n", width, height, 255);
> >> >      if (qio_channel_write_all(QIO_CHANNEL(ioc),
> >> > @@ -329,7 +332,7 @@ static bool ppm_save(int fd, DisplaySurface *ds, 
> >> > Error **errp)
> >> >
> >> >      linebuf = qemu_pixman_linebuf_create(PIXMAN_BE_r8g8b8, width);
> >> >      for (y = 0; y < height; y++) {
> >> > -        qemu_pixman_linebuf_fill(linebuf, ds->image, width, 0, y);
> >> > +        qemu_pixman_linebuf_fill(linebuf, image, width, 0, y);
> >> >          if (qio_channel_write_all(QIO_CHANNEL(ioc),
> >> >                                    (char 
> >> > *)pixman_image_get_data(linebuf),
> >> >                                    pixman_image_get_stride(linebuf), 
> >> > errp) < 0) {
> >>
> >> Looks like an unrelated optimization / simplification.  If I was
> >> maintainer, I'd ask for a separate patch.
> >
> > I can be split, but it's related. We should pass a reference to
> > pixman_image_t, rather than a pointer to DisplaySurface, as the
> > underlying image may change over time, and would result in corrupted
> > coroutine save or worse.
> 
> Work that into your commit message, please.  Might be easier if you
> split, but that's between you and the maintainer :)
> 
> >> > @@ -340,11 +343,18 @@ static bool ppm_save(int fd, DisplaySurface *ds, 
> >> > Error **errp)
> >> >      return true;
> >> >  }
> >> >
> >> > +static void graphic_hw_update_bh(void *con)
> >> > +{
> >> > +    graphic_hw_update(con);
> >> > +}
> >> > +
> >> > +/* may be called in coroutine context or not */
> >>
> >> Hmm.
> >>
> >> Even though the QMP core always calls in coroutine context, the comment
> >> is correct: hmp_screendump() calls it outside coroutine context.
> >> Because of that...
> >>
> >> >  void qmp_screendump(const char *filename, bool has_device, const char 
> >> > *device,
> >> >                      bool has_head, int64_t head, Error **errp)
> >> >  {
> >> >      QemuConsole *con;
> >> >      DisplaySurface *surface;
> >> > +    g_autoptr(pixman_image_t) image = NULL;
> >> >      int fd;
> >> >
> >> >      if (has_device) {
> >> > @@ -365,7 +375,15 @@ void qmp_screendump(const char *filename, bool 
> >> > has_device, const char *device,
> >> >          }
> >> >      }
> >> >
> >> > -    graphic_hw_update(con);
> >> > +    if (qemu_in_coroutine()) {
> >> > +        assert(!con->screendump_co);
> >>
> >> What if multiple QMP monitors simultaneously screendump?  Hmm, it works
> >> because all execute one after another in the same coroutine
> >> qmp_dispatcher_co.  Implicit mutual exclusion.
> >>
> >> Executing them one after another is bad, because it lets an ill-behaved
> >> QMP command starve *all* QMP monitors.  We do it only out of
> >> (reasonable!) fear of implicit mutual exclusion requirements like the
> >> one you add.
> >>
> >> Let's not add more if we can help it.
> >
> > The situation is not worse than the current blocking handling.
> 
> Really?
> 
> What makes executing multiple qmp_screendump() concurrently (in separate
> threads) or interleaved (in separate coroutines in the same thread)
> unsafe before this patch?
> 
> >> Your screendump_co is per QemuConsole instead of per QMP monitor only
> >> because you need to find the coroutine in graphic_hw_update_done().  Can
> >> we somehow pass it via function arguments?
> >
> > I think it could be done later, so I suggest a TODO.
> 
> We should avoid making our dependence on implicit mutual exclusion
> worse.  When we do it anyway, a big, fat, ugly comment is definitely
> called for.
> 
> >> In case avoiding the mutual exclusion is impractical: please explain it
> >> in a comment to make it somewhat less implicit.
> 
> It is anything but: see appended patch.
> 
> >> > +        con->screendump_co = qemu_coroutine_self();
> >> > +        aio_bh_schedule_oneshot(qemu_get_aio_context(),
> >> > +                                graphic_hw_update_bh, con);
> >> > +        qemu_coroutine_yield();
> >> > +        con->screendump_co = NULL;
> >> > +    }
> >> > +
> >>
> >> ... the command handler needs extra code to cope with either.  Is this
> >> really what we want for coroutine QMP command handlers?  We'll acquire
> >> more of them, and I'd hate to make each one run both in and outside
> >> coroutine context.  Shouldn't we let the HMP core take care of this?  Or
> >> at least have some common infrastructure these handlers can use?
> >
> > We have several functions that have this dual support, for ex QIO.
> >
> > Changing both QMP & HMP commands to run in coroutine is likely
> > additional work that we may not care at this point.
> 
> If it wasn't for non-QMP calls (typically HMP, but also CLI), then
> handlers for QMP commands with 'coroutine': true could be coroutine_fn.
> 
> So far, coroutine_fn is merely documentation.  Perhaps it can guide a
> checker for "do coroutine stuff only in coroutines" some day.  Would be
> nice, because the coroutine calls are often buried deep, and far away
> from the code that ensures they run in a coroutine.
> 
> My point is: marking functions coroutine_fn is good.  We should do it
> more.  We should try to avoid stuff that hinders doing it more.
> 
> > I propose to leave a TODO, once we have several similar QMP & HMP mix
> > cases we can try to find a common HMP solution to make the code
> > simpler in QMP handler.
> 
> Collecting several users before building infrastructure makes sense when
> the design of the infrastructure isn't obvious, or when the need for it
> is in doubt.
> 
> Neither is the case for running QMP handlers in a coroutine: QMP
> commands blocking the main loop is without doubt a problem we need to
> solve, and the way to solve it was obvious enough for Kevin to do it
> with one user: block_resize.  A second one quickly followed: screendump.
> 
> The only part that's different for HMP, I think, is "need".
> 
> Is HMP blocking the main loop a problem?
> 
> If yes, is it serious enough to justify solving it?

I don't mind if HMP blocks for a small time while doing something, but
not if it can hang if the guest (or something else like it) misbehaves.
Not if it's something you might need to issue another command to recover
from.

Dave

> If yes, then putting workarounds into QMP handlers now so we can put off
> solving it some more is taking on technical debt.
> 
> > I don't know if this is going to be a common pattern, we may end up
> > with conversions that can run both without explicit handling (like the
> > ppm_save() function, thanks to QIO).
> 
> Yes, such handlers may exist.  Running them out of coroutine context
> would throw away their capability not to block the event loop, though,
> wouldn't it?
> 
> >> Why is it okay not to call graphic_hw_update() anymore when
> >> !qemu_in_coroutine()?
> >
> > You could call it, but then you should wait for completion by
> > reentering the main loop (that was the point of my earlier qapi-async
> > series)
> 
> Possibly stupid question: why is it necessary before this patch
> (assuming it is, since we call it), and why is it no longer necessary
> after?
> 
> Oh, see below.
> 
> >>
> >> If qemu_in_coroutine(), we now run graphic_hw_update() in a bottom half,
> >> then yield until the update completes (see graphic_hw_update_done()
> >> above).  Can you explain the need for the bottom half?
> >
> > At least spice rendering is done in a separate thread, completion is async.
> 
> When I ask a stupid question like this one, I'm really after the "for
> dummies" explanation.  I may be able to figure it out myself with some
> effort, but having to put that kind of effort into patch review makes me
> grumpy, and once I'm sufficiently grumpy, I don't want to review patches
> anymore, let alone merge them.
> 
> Oh well, let me try.  We're in the main loop.  We want to trigger a
> "graphics update" (whatever that is, doesn't matter) and wait for it to
> complete without blocking the main loop.
> 
> "Without blocking the main loop" means the QMP coroutine yields.  I'd
> naively expect
> 
>     QMP coroutine: schedule the job; yield
>     whatever gets scheduled: complete the job; wake up QMP coroutine
> 
> Now let's examine the "graphics update" interface.
> 
> GraphicHwOps callback gfx_update() comes in two flavours:
> 
> * synchronous: complete the job, return
> 
> * asynchronous: start the job, return immediately,
>   graphic_hw_update_done() will get called on job completion
> 
> graphic_hw_update() partly hides the difference:
> 
> * synchronous: complete the job, call graphic_hw_update_done()
> 
> * asynchronous: start the job, return immediately,
>   graphic_hw_update_done() will get called on job completion
> 
> This lets you treat the synchronous case more like the asynchronous
> case.
> 
> You use graphic_hw_update_done() to wake up the QMP coroutine.
> 
> I think I can now answer my question "why is it okay not to call
> graphic_hw_update() anymore when !qemu_in_coroutine()?"
> 
> Before the patch, both QMP and HMP:
> 
> * with synchronous gfx_update(): we update before we write out the
>   screendump.  The screendump is up-to-date.  Both update and write out
>   block the main loop.
> 
> * with asynchronous gfx_update(): we start updating, but don't wait for
>   it to complete before we write out.  This is wrong.  Write out blocks
>   the main loop, but update does not.
> 
> After the patch:
> 
> * QMP with either gfx_update(): we update before we write out the
>   screendump.  The screendump is up-to-date.  Neither update nor write
>   out block the main loop.  Improvement.
> 
> * HMP with either gfx_update(): we don't update before we write out.
>   Similarly wrong for asynchronous gfx_update(), regression for
>   synchronous gfx_update().  Write out blocks the main loop as before.
> 
> Why is the regression okay?
> 
> Back to the bottom half.  The way graphic_hw_update() works, the QMP
> coroutine can't schedule then yield.  It *has* to yield before
> graphic_hw_update() runs.  That means we need a bottom half.
> 
> Alright, I'm officially grumpy now.
> 
> Please explain the need for a bottom half in a comment.
> 
> >> >      surface = qemu_console_surface(con);
> >> >      if (!surface) {
> >> >          error_setg(errp, "no surface");
> >> > @@ -379,7 +397,8 @@ void qmp_screendump(const char *filename, bool 
> >> > has_device, const char *device,
> >> >          return;
> >> >      }
> >> >
> >> > -    if (!ppm_save(fd, surface, errp)) {
> >> > +    image = pixman_image_ref(surface->image);
> >> > +    if (!ppm_save(fd, image, errp)) {
> >> >          qemu_unlink(filename);
> >> >      }
> >> >  }
> >> > diff --git a/ui/trace-events b/ui/trace-events
> >> > index 0dcda393c1..e8726fc969 100644
> >> > --- a/ui/trace-events
> >> > +++ b/ui/trace-events
> >> > @@ -15,7 +15,7 @@ displaysurface_create_pixman(void *display_surface) 
> >> > "surface=%p"
> >> >  displaysurface_free(void *display_surface) "surface=%p"
> >> >  displaychangelistener_register(void *dcl, const char *name) "%p [ %s ]"
> >> >  displaychangelistener_unregister(void *dcl, const char *name) "%p [ %s 
> >> > ]"
> >> > -ppm_save(int fd, void *display_surface) "fd=%d surface=%p"
> >> > +ppm_save(int fd, void *image) "fd=%d image=%p"
> >> >
> >> >  # gtk.c
> >> >  # gtk-gl-area.c
> 
> 
> diff --git a/ui/console.c b/ui/console.c
> index 57df3a5439..c5aabf5a5f 100644
> --- a/ui/console.c
> +++ b/ui/console.c
> @@ -167,7 +167,6 @@ struct QemuConsole {
>      QEMUFIFO out_fifo;
>      uint8_t out_fifo_buf[16];
>      QEMUTimer *kbd_timer;
> -    Coroutine *screendump_co;
>  
>      QTAILQ_ENTRY(QemuConsole) next;
>  };
> @@ -261,14 +260,20 @@ static void gui_setup_refresh(DisplayState *ds)
>      ds->have_text = have_text;
>  }
>  
> -void graphic_hw_update_done(QemuConsole *con)
> +static void graphic_hw_update_done_with_kick(QemuConsole *con,
> +                                             Coroutine *kick_me)
>  {
> -    if (con && con->screendump_co) {
> -        aio_co_wake(con->screendump_co);
> +    if (kick_me) {
> +        aio_co_wake(kick_me);
>      }
>  }
>  
> -void graphic_hw_update(QemuConsole *con)
> +void graphic_hw_update_done(QemuConsole *con)
> +{
> +    graphic_hw_update_done_with_kick(con, NULL);
> +}
> +
> +static void graphic_hw_update_with_kick(QemuConsole *con, Coroutine *kick_me)
>  {
>      bool async = false;
>      if (!con) {
> @@ -279,10 +284,15 @@ void graphic_hw_update(QemuConsole *con)
>          async = con->hw_ops->gfx_update_async;
>      }
>      if (!async) {
> -        graphic_hw_update_done(con);
> +        graphic_hw_update_done_with_kick(con, kick_me);
>      }
>  }
>  
> +void graphic_hw_update(QemuConsole *con)
> +{
> +    graphic_hw_update_with_kick(con, NULL);
> +}
> +
>  void graphic_hw_gl_block(QemuConsole *con, bool block)
>  {
>      assert(con != NULL);
> @@ -343,9 +353,16 @@ static bool ppm_save(int fd, pixman_image_t *image, 
> Error **errp)
>      return true;
>  }
>  
> -static void graphic_hw_update_bh(void *con)
> +typedef struct {
> +    QemuConsole *con;
> +    Coroutine *kick_me;
> +} UpdateBHargs;
> +
> +static void graphic_hw_update_bh(void *p)
>  {
> -    graphic_hw_update(con);
> +    UpdateBHargs *args = p;
> +
> +    graphic_hw_update_with_kick(args->con, args->kick_me);
>  }
>  
>  /* may be called in coroutine context or not */
> @@ -376,12 +393,13 @@ void qmp_screendump(const char *filename, bool 
> has_device, const char *device,
>      }
>  
>      if (qemu_in_coroutine()) {
> -        assert(!con->screendump_co);
> -        con->screendump_co = qemu_coroutine_self();
> +        UpdateBHargs args = {
> +            .con = con,
> +            .kick_me = qemu_coroutine_self(),
> +        };
>          aio_bh_schedule_oneshot(qemu_get_aio_context(),
> -                                graphic_hw_update_bh, con);
> +                                graphic_hw_update_bh, &args);
>          qemu_coroutine_yield();
> -        con->screendump_co = NULL;
>      }
>  
>      surface = qemu_console_surface(con);
--
Dr. David Alan Gilbert / address@hidden / Manchester, UK




reply via email to

[Prev in Thread] Current Thread [Next in Thread]