qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: [PATCH v2 for-7.1 4/9] nbd: keep send_mutex/free_sema handling outsi


From: Vladimir Sementsov-Ogievskiy
Subject: Re: [PATCH v2 for-7.1 4/9] nbd: keep send_mutex/free_sema handling outside nbd_co_do_establish_connection
Date: Sat, 16 Apr 2022 14:54:46 +0300
User-agent: Mozilla/5.0 (X11; Linux x86_64; rv:91.0) Gecko/20100101 Thunderbird/91.5.0

14.04.2022 20:57, Paolo Bonzini wrote:
Elevate s->in_flight early so that other incoming requests will wait
on the CoQueue in nbd_co_send_request; restart them after getting back
from nbd_reconnect_attempt.  This could be after the reconnect timer or
nbd_cancel_in_flight have cancelled the attempt, so there is no
need anymore to cancel the requests there.

nbd_co_send_request now handles both stopping and restarting pending
requests after a successful connection, and there is no need to
hold send_mutex in nbd_co_do_establish_connection.  The current setup
is confusing because nbd_co_do_establish_connection is called both with
send_mutex taken and without it.  Before the patch it uses free_sema which
(at least in theory...) is protected by send_mutex, after the patch it
does not anymore.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>

Seems good to me. Still, again, it is hard to tell whether it can lead to some new
deadlocks or not. It seems not — at least I don't see a problem. We will see.

Reviewed-by: Vladimir Sementsov-Ogievskiy <vsementsov@openvz.org>

---
  block/coroutines.h |  4 +--
  block/nbd.c        | 66 ++++++++++++++++++++++------------------------
  2 files changed, 33 insertions(+), 37 deletions(-)

diff --git a/block/coroutines.h b/block/coroutines.h
index b293e943c8..8f6e438ef3 100644
--- a/block/coroutines.h
+++ b/block/coroutines.h
@@ -59,7 +59,7 @@ int coroutine_fn bdrv_co_writev_vmstate(BlockDriverState *bs,
                                          QEMUIOVector *qiov, int64_t pos);
int coroutine_fn
-nbd_co_do_establish_connection(BlockDriverState *bs, Error **errp);
+nbd_co_do_establish_connection(BlockDriverState *bs, bool blocking, Error 
**errp);
int coroutine_fn
@@ -109,7 +109,7 @@ bdrv_common_block_status_above(BlockDriverState *bs,
                                 BlockDriverState **file,
                                 int *depth);
  int generated_co_wrapper
-nbd_do_establish_connection(BlockDriverState *bs, Error **errp);
+nbd_do_establish_connection(BlockDriverState *bs, bool blocking, Error **errp);
int generated_co_wrapper
  blk_do_preadv(BlockBackend *blk, int64_t offset, int64_t bytes,
diff --git a/block/nbd.c b/block/nbd.c
index a30603ce87..62dd338ef3 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -187,9 +187,6 @@ static void reconnect_delay_timer_cb(void *opaque)
      if (qatomic_load_acquire(&s->state) == NBD_CLIENT_CONNECTING_WAIT) {
          s->state = NBD_CLIENT_CONNECTING_NOWAIT;
          nbd_co_establish_connection_cancel(s->conn);
-        while (qemu_co_enter_next(&s->free_sema, NULL)) {
-            /* Resume all queued requests */
-        }
      }
reconnect_delay_timer_del(s);

It seems removing the timer is not needed here either. We do 
nbd_co_establish_connection_cancel(), and the timer will be removed in 
nbd_reconnect_attempt() anyway.

Moreover, we don't need to keep the timer in the state at all: it should become a local thing 
for nbd_reconnect_attempt() — a kind of call to nbd_co_do_establish_connection() with a timeout. 
That could be refactored later, in the same way as in patches 4-5 of my "[PATCH v4 
0/7] copy-before-write: on-cbw-error and cbw-timeout" series.

@@ -310,11 +307,10 @@ static int nbd_handle_updated_info(BlockDriverState *bs, 
Error **errp)
  }
int coroutine_fn nbd_co_do_establish_connection(BlockDriverState *bs,
-                                                Error **errp)
+                                                bool blocking, Error **errp)
  {
      BDRVNBDState *s = (BDRVNBDState *)bs->opaque;
      int ret;
-    bool blocking = nbd_client_connecting_wait(s);
      IO_CODE();
assert(!s->ioc);
@@ -350,7 +346,6 @@ int coroutine_fn 
nbd_co_do_establish_connection(BlockDriverState *bs,
/* successfully connected */
      s->state = NBD_CLIENT_CONNECTED;
-    qemu_co_queue_restart_all(&s->free_sema);
return 0;
  }
@@ -358,25 +353,25 @@ int coroutine_fn 
nbd_co_do_establish_connection(BlockDriverState *bs,
  /* called under s->send_mutex */
  static coroutine_fn void nbd_reconnect_attempt(BDRVNBDState *s)
  {
-    assert(nbd_client_connecting(s));
-    assert(s->in_flight == 0);
-
-    if (nbd_client_connecting_wait(s) && s->reconnect_delay &&
-        !s->reconnect_delay_timer)
-    {
-        /*
-         * It's first reconnect attempt after switching to
-         * NBD_CLIENT_CONNECTING_WAIT
-         */
-        reconnect_delay_timer_init(s,
-            qemu_clock_get_ns(QEMU_CLOCK_REALTIME) +
-            s->reconnect_delay * NANOSECONDS_PER_SECOND);
-    }
+    bool blocking = nbd_client_connecting_wait(s);
/*
       * Now we are sure that nobody is accessing the channel, and no one will
       * try until we set the state to CONNECTED.
       */
+    assert(nbd_client_connecting(s));
+    assert(s->in_flight == 1);
+
+    if (blocking && !s->reconnect_delay_timer) {
+        /*
+         * It's the first reconnect attempt after switching to
+         * NBD_CLIENT_CONNECTING_WAIT
+         */
+        g_assert(s->reconnect_delay);
+        reconnect_delay_timer_init(s,
+            qemu_clock_get_ns(QEMU_CLOCK_REALTIME) +
+            s->reconnect_delay * NANOSECONDS_PER_SECOND);
+    }
/* Finalize previous connection if any */
      if (s->ioc) {
@@ -387,7 +382,9 @@ static coroutine_fn void nbd_reconnect_attempt(BDRVNBDState 
*s)
          s->ioc = NULL;
      }
- nbd_co_do_establish_connection(s->bs, NULL);
+    qemu_co_mutex_unlock(&s->send_mutex);
+    nbd_co_do_establish_connection(s->bs, blocking, NULL);
+    qemu_co_mutex_lock(&s->send_mutex);


Hmm. So, that breaks a critical section.

We can do it because in_flight=1 and we are not connected => all other requests 
will wait in the queue.

Still, during nbd_co_do_establish_connection() the state may become CONNECTED. That 
theoretically means that parallel requests may start.

Is it bad? Seems not. The bad thing that comes to my mind is that a parallel 
request fails, goes to reconnect, and starts its own timer, but we then remove that timer 
after locking the mutex back. But that's not possible, as a parallel request 
should wait for in_flight=0 before starting its own reconnect attempt. And that is not 
possible, as we keep holding our in-flight reference.


/*
       * The reconnect attempt is done (maybe successfully, maybe not), so
@@ -472,21 +469,21 @@ static int coroutine_fn 
nbd_co_send_request(BlockDriverState *bs,
      qemu_co_mutex_lock(&s->send_mutex);
while (s->in_flight == MAX_NBD_REQUESTS ||
-           (!nbd_client_connected(s) && s->in_flight > 0))
-    {
+           (!nbd_client_connected(s) && s->in_flight > 0)) {
          qemu_co_queue_wait(&s->free_sema, &s->send_mutex);
      }
- if (nbd_client_connecting(s)) {
-        nbd_reconnect_attempt(s);
-    }
-
-    if (!nbd_client_connected(s)) {
-        rc = -EIO;
-        goto err;
-    }
-
      s->in_flight++;

I like how this works.

+    if (!nbd_client_connected(s)) {
+        if (nbd_client_connecting(s)) {
+            nbd_reconnect_attempt(s);
+            qemu_co_queue_restart_all(&s->free_sema);
+        }
+        if (!nbd_client_connected(s)) {
+            rc = -EIO;
+            goto err;
+        }
+    }
for (i = 0; i < MAX_NBD_REQUESTS; i++) {
          if (s->requests[i].coroutine == NULL) {
@@ -526,8 +523,8 @@ err:
          nbd_channel_error(s, rc);
          if (i != -1) {
              s->requests[i].coroutine = NULL;
-            s->in_flight--;
          }
+        s->in_flight--;
          qemu_co_queue_next(&s->free_sema);
      }
      qemu_co_mutex_unlock(&s->send_mutex);
@@ -1883,7 +1880,7 @@ static int nbd_open(BlockDriverState *bs, QDict *options, 
int flags,
      }
s->state = NBD_CLIENT_CONNECTING_WAIT;
-    ret = nbd_do_establish_connection(bs, errp);
+    ret = nbd_do_establish_connection(bs, true, errp);
      if (ret < 0) {
          goto fail;
      }
@@ -2049,7 +2046,6 @@ static void nbd_cancel_in_flight(BlockDriverState *bs)
if (s->state == NBD_CLIENT_CONNECTING_WAIT) {
          s->state = NBD_CLIENT_CONNECTING_NOWAIT;
-        qemu_co_queue_restart_all(&s->free_sema);

As I understand it, we always have one request running (or no requests at all, but 
that's OK too), and it will wake all the others (all at once, or they will wake each 
other in a chain). So we don't need to wake them here.

      }
nbd_co_establish_connection_cancel(s->conn);


--
Best regards,
Vladimir



reply via email to

[Prev in Thread] Current Thread [Next in Thread]