qemu-devel
[Top][All Lists]
Advanced

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

Re: 9pfs: Twalk crash


From: Christian Schoenebeck
Subject: Re: 9pfs: Twalk crash
Date: Wed, 01 Sep 2021 14:49:37 +0200

On Dienstag, 31. August 2021 19:04:54 CEST Greg Kurz wrote:
> > Ok, I made a quick "is same thread" assertion check:
> > 
> > diff --git a/hw/9pfs/virtio-9p-device.c b/hw/9pfs/virtio-9p-device.c
> > index 54ee93b71f..bb6ebd16aa 100644
> > --- a/hw/9pfs/virtio-9p-device.c
> > +++ b/hw/9pfs/virtio-9p-device.c
> > @@ -28,12 +28,25 @@
> > 
> >  #include "qemu/module.h"
> >  #include "sysemu/qtest.h"
> > 
> > +static void assert_thread(void) {
> > +    static QemuThread thread;
> > +    static bool calledBefore;
> > +    if (!calledBefore) {
> > +        calledBefore = 1;
> > +        qemu_thread_get_self(&thread);
> > +        return;
> > +    }
> > +    assert(qemu_thread_is_self(&thread));
> > +}
> > +
[...]
> > And it triggered, however I am not sure if some of those functions I
> > asserted above are indeed allowed to be executed on a different thread
> > than main thread:
> > 
> > Program terminated with signal SIGABRT, Aborted.
> > #0  __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:50
> > 50      ../sysdeps/unix/sysv/linux/raise.c: No such file or directory.
> > [Current thread is 1 (Thread 0x7fd0bcef1700 (LWP 6470))]
> 
> Based in the thread number, it seems that the signal was raised by
> the main event thread...

No, it was not main thread actually, gdb's "current thread is 1" output is 
misleading.

Following the thread id trace, I extended the thread assertion checks over to 
v9fs_walk() as well, like this:

static void coroutine_fn v9fs_walk(void *opaque)
{
    ...
    assert_thread();
    v9fs_co_run_in_worker({
        ...
    });
    assert_thread();
    ...
}

and made sure the reference thread id to be compared is really the main 
thread.

And what happens here is before v9fs_co_run_in_worker() is entered, 
v9fs_walk() runs on main thread, but after returning from 
v9fs_co_run_in_worker() it runs on a different thread for some reason, not on 
main thread as it would be expected at that point.

This is my test patch:

----

diff --git a/hw/9pfs/9p.c b/hw/9pfs/9p.c
index 2815257f42..fa706c10e6 100644
--- a/hw/9pfs/9p.c
+++ b/hw/9pfs/9p.c
@@ -1717,6 +1717,8 @@ static void coroutine_fn v9fs_walk(void *opaque)
     V9fsState *s = pdu->s;
     V9fsQID qid;
 
+    assert_thread();
+
     err = pdu_unmarshal(pdu, offset, "ddw", &fid, &newfid, &nwnames);
     if (err < 0) {
         pdu_complete(pdu, err);
@@ -1724,10 +1726,15 @@ static void coroutine_fn v9fs_walk(void *opaque)
     }
     offset += err;
 
+    assert_thread();
+
     trace_v9fs_walk(pdu->tag, pdu->id, fid, newfid, nwnames);
 
+    assert_thread();
+
     if (nwnames > P9_MAXWELEM) {
         err = -EINVAL;
+        assert_thread();
         goto out_nofid;
     }
     if (nwnames) {
@@ -1738,10 +1745,12 @@ static void coroutine_fn v9fs_walk(void *opaque)
         for (i = 0; i < nwnames; i++) {
             err = pdu_unmarshal(pdu, offset, "s", &wnames[i]);
             if (err < 0) {
+                assert_thread();
                 goto out_nofid;
             }
             if (name_is_illegal(wnames[i].data)) {
                 err = -ENOENT;
+                assert_thread();
                 goto out_nofid;
             }
             offset += err;
@@ -1750,6 +1759,7 @@ static void coroutine_fn v9fs_walk(void *opaque)
     fidp = get_fid(pdu, fid);
     if (fidp == NULL) {
         err = -ENOENT;
+        assert_thread();
         goto out_nofid;
     }
 
@@ -1767,6 +1777,7 @@ static void coroutine_fn v9fs_walk(void *opaque)
      * Twalk client request) as small as possible, run all the required fs
      * driver code altogether inside the following block.
      */
+    assert_thread();
     v9fs_co_run_in_worker({
         if (v9fs_request_cancelled(pdu)) {
             err = -EINTR;
@@ -1807,6 +1818,7 @@ static void coroutine_fn v9fs_walk(void *opaque)
             }
         }
     });
+    assert_thread();
     /*
      * Handle all the rest of this Twalk request on main thread ...
      */
diff --git a/hw/9pfs/virtio-9p-device.c b/hw/9pfs/virtio-9p-device.c
index 54ee93b71f..abf8a33b3b 100644
--- a/hw/9pfs/virtio-9p-device.c
+++ b/hw/9pfs/virtio-9p-device.c
@@ -27,6 +27,21 @@
 #include "qemu/iov.h"
 #include "qemu/module.h"
 #include "sysemu/qtest.h"
+#include <sys/types.h>
+#include <sys/syscall.h>
+
+static pid_t main_thread_id = -1;
+
+void assert_thread(void) {
+    assert(main_thread_id != -1);
+    pid_t tid = syscall(__NR_gettid);
+    if (tid != main_thread_id) {
+        printf("Unexpected thread id %d (should be %d) - process id is %d\n",
+               tid, main_thread_id, getpid());
+        fflush(stdout);
+    }
+    assert(tid == main_thread_id);
+}
 
 static void virtio_9p_push_and_notify(V9fsPDU *pdu)
 {
@@ -34,6 +49,8 @@ static void virtio_9p_push_and_notify(V9fsPDU *pdu)
     V9fsVirtioState *v = container_of(s, V9fsVirtioState, state);
     VirtQueueElement *elem = v->elems[pdu->idx];
 
+    assert_thread();
+
     /* push onto queue and notify */
     virtqueue_push(v->vq, elem, pdu->size);
     g_free(elem);
@@ -51,6 +68,8 @@ static void handle_9p_output(VirtIODevice *vdev, VirtQueue 
*vq)
     ssize_t len;
     VirtQueueElement *elem;
 
+    assert_thread();
+
     while ((pdu = pdu_alloc(s))) {
         P9MsgHeader out;
 
@@ -125,6 +144,8 @@ static ssize_t virtio_pdu_vmarshal(V9fsPDU *pdu, size_t 
offset,
     VirtQueueElement *elem = v->elems[pdu->idx];
     ssize_t ret;
 
+    assert_thread();
+
     ret = v9fs_iov_vmarshal(elem->in_sg, elem->in_num, offset, 1, fmt, ap);
     if (ret < 0) {
         VirtIODevice *vdev = VIRTIO_DEVICE(v);
@@ -143,6 +164,8 @@ static ssize_t virtio_pdu_vunmarshal(V9fsPDU *pdu, size_t 
offset,
     VirtQueueElement *elem = v->elems[pdu->idx];
     ssize_t ret;
 
+    assert_thread();
+
     ret = v9fs_iov_vunmarshal(elem->out_sg, elem->out_num, offset, 1, fmt, 
ap);
     if (ret < 0) {
         VirtIODevice *vdev = VIRTIO_DEVICE(v);
@@ -160,6 +183,8 @@ static void virtio_init_in_iov_from_pdu(V9fsPDU *pdu, 
struct iovec **piov,
     VirtQueueElement *elem = v->elems[pdu->idx];
     size_t buf_size = iov_size(elem->in_sg, elem->in_num);
 
+    assert_thread();
+
     if (buf_size < size) {
         VirtIODevice *vdev = VIRTIO_DEVICE(v);
 
@@ -180,6 +205,8 @@ static void virtio_init_out_iov_from_pdu(V9fsPDU *pdu, 
struct iovec **piov,
     VirtQueueElement *elem = v->elems[pdu->idx];
     size_t buf_size = iov_size(elem->out_sg, elem->out_num);
 
+    assert_thread();
+
     if (buf_size < size) {
         VirtIODevice *vdev = VIRTIO_DEVICE(v);
 
@@ -251,6 +278,10 @@ static Property virtio_9p_properties[] = {
 
 static void virtio_9p_class_init(ObjectClass *klass, void *data)
 {
+    main_thread_id = syscall(__NR_gettid);
+    printf("Main thread ID is %d\n", main_thread_id);
+    fflush(stdout);
+
     DeviceClass *dc = DEVICE_CLASS(klass);
     VirtioDeviceClass *vdc = VIRTIO_DEVICE_CLASS(klass);
 
diff --git a/hw/9pfs/virtio-9p.h b/hw/9pfs/virtio-9p.h
index 20fa118f3a..eb7d1293cb 100644
--- a/hw/9pfs/virtio-9p.h
+++ b/hw/9pfs/virtio-9p.h
@@ -17,4 +17,6 @@ struct V9fsVirtioState {
 #define TYPE_VIRTIO_9P "virtio-9p-device"
 OBJECT_DECLARE_SIMPLE_TYPE(V9fsVirtioState, VIRTIO_9P)
 
+void assert_thread(void);
+
 #endif


----

Output:

Unexpected thread id 10780 (should be 10754) - process id is 10754
qemu-system-x86_64: ../hw/9pfs/virtio-9p-device.c:43: assert_thread: Assertion 
`tid == main_thread_id' failed.
Aborted (core dumped)

So the initially captured thread ID 10754 at process startup matches the 
process ID, hence 10754 was the main thread, and hence the assertion was 
triggered on a thread that was not main thread.

Backtrace:

Program terminated with signal SIGABRT, Aborted.
#0  __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:50
50      ../sysdeps/unix/sysv/linux/raise.c: No such file or directory.
[Current thread is 1 (Thread 0x7fb4637fe700 (LWP 10780))]
(gdb) bt
#0  0x00007fb7ac6d87bb in __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/
sysv/linux/raise.c:50
#1  0x00007fb7ac6c3535 in __GI_abort () at abort.c:79
#2  0x00007fb7ac6c340f in __assert_fail_base
    (fmt=0x7fb7ac825ee0 "%s%s%s:%u: %s%sAssertion `%s' failed.\n%n", 
assertion=0x562e6126072f "tid == main_thread_id", file=0x562e612606fc "../hw/2
#3  0x00007fb7ac6d1102 in __GI___assert_fail
    (assertion=assertion@entry=0x562e6126072f "tid == main_thread_id", 
file=file@entry=0x562e612606fc "../hw/9pfs/virtio-9p-device.c", line=line@e1
#4  0x0000562e60ef3965 in assert_thread () at ../hw/9pfs/virtio-9p-device.c:43
#5  0x0000562e60ef3965 in assert_thread () at ../hw/9pfs/virtio-9p-device.c:35
#6  0x0000562e60cef125 in v9fs_walk (opaque=0x562e6405c288) at ../hw/9pfs/
9p.c:1821
#7  0x0000562e6114f79b in coroutine_trampoline (i0=<optimized out>, 
i1=<optimized out>) at ../util/coroutine-ucontext.c:173
#8  0x00007fb7ac6edb50 in __correctly_grouped_prefixwc
    (begin=0x2 <error: Cannot access memory at address 0x2>, end=0x0, 
thousands=0 L'\000', grouping=0x7fb7ac6d87bb <__GI_raise+267> "H\213\214$\b\1
#9  0x0000000000000000 in  ()

And 9p.c:1821 is exactly:

    assert_thread();
    v9fs_co_run_in_worker({
        if (v9fs_request_cancelled(pdu)) {
            err = -EINTR;
            break;
        }
        err = s->ops->lstat(&s->ctx, &dpath, &fidst);
        if (err < 0) {
            err = -errno;
            break;
        }
        stbuf = fidst;
        for (name_idx = 0; name_idx < nwnames; name_idx++) {
            if (v9fs_request_cancelled(pdu)) {
                err = -EINTR;
                break;
            }
            if (!same_stat_id(&pdu->s->root_st, &stbuf) ||
                strcmp("..", wnames[name_idx].data))
            {
                err = s->ops->name_to_path(&s->ctx, &dpath,
                                        wnames[name_idx].data, &path);
                if (err < 0) {
                    err = -errno;
                    break;
                }
                if (v9fs_request_cancelled(pdu)) {
                    err = -EINTR;
                    break;
                }
                err = s->ops->lstat(&s->ctx, &path, &stbuf);
                if (err < 0) {
                    err = -errno;
                    break;
                }
                stbufs[name_idx] = stbuf;
                v9fs_path_copy(&dpath, &path);
                v9fs_path_copy(&pathes[name_idx], &path);
            }
        }
    });
    assert_thread(); //<--- HERE

> > > > [Current thread is 1 (Thread 0x7f3bddd2ac40 (LWP 7811))]
> > > > (gdb) bt full
> > > > #0  0x000055a934dfb9a7 in virtio_pdu_vunmarshal (pdu=0x55a93717cde8,
> > > > offset=7, fmt=0x55a9352766d1 "ddw", ap=0x7f38a9ad9cd0) at
> > > > ../hw/9pfs/virtio-9p-device.c: 146
> > > > 
> > > >         s = 0x55a93717b4b8
> > > >         v = 0x55a93717aee0
> > > >         elem = 0x0
> > > 
> > > So this is v->elems[pdu->idx]... what's the value of pdu->idx ?
> > 
> > In that originally posted core dump it was 113:
> > 
> > #0  virtio_pdu_vunmarshal (pdu=0x55a93717cde8, offset=7,
> > fmt=0x55a9352766d1
> > "ddw", ap=0x7f38a9ad9cd0) at ../hw/9pfs/virtio-9p-device.c:146
> > 146         ret = v9fs_iov_vunmarshal(elem->out_sg, elem->out_num, offset,
> > 1, fmt, ap);
> > [Current thread is 1 (Thread 0x7f3bddd2ac40 (LWP 7811))]
> > (gdb) p pdu->idx
> > $1 = 113
> 
> Ok, so it is a valid index (not over MAX_REQ). So it would mean
> that someone cleared the slot in our back ?

Yes, the slot is within valid boundaries. Assertion checks for that in 
virtio-9p-device.c in future wouldn't hurt though.

Best regards,
Christian Schoenebeck





reply via email to

[Prev in Thread] Current Thread [Next in Thread]