[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Qemu-devel] [PATCH] linux-user: add signalfd/signalfd4 syscalls
From: |
Laurent Vivier |
Subject: |
Re: [Qemu-devel] [PATCH] linux-user: add signalfd/signalfd4 syscalls |
Date: |
Tue, 14 Jul 2015 23:13:28 +0200 |
User-agent: |
Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Thunderbird/38.0.1 |
Thank you for your comments.
I'm currently writing a v2 of this patch with what I've learned working
on the netlink interface. I will fix it accordingly all your comments.
[ Now, I can boot a PPC64 container with systemd on an x86_64 host:
https://asciinema.org/a/7sgwvok72m34v7hg08hfspg4s ]
Laurent
Le 14/07/2015 22:50, Peter Maydell a écrit :
> On 29 May 2015 at 23:50, Laurent Vivier <address@hidden> wrote:
>> This patch introduces a system very similar to the one used in the kernel
>> to attach specific functions to a given file descriptor.
>>
>> In this case, we attach a specific "read()" to the fd returned by
>> signalfd() to be able to byte-swap the signalfd_siginfo structure
>> provided by read().
>>
>> This system could be also used to introduce netlink sockets.
>
> Thanks for this patch; apologies for the delayed review.
>
>> This patch allows to execute the example program given by
>> man signalfd(2):
>>
>> do { perror(msg); exit(EXIT_FAILURE); } while (0)
>
> Your commit message seems to have lost all the lines beginning
> '#' in this example program. I blame git :-)
>
>> int
>> main(int argc, char *argv[])
>> {
>> sigset_t mask;
>> int sfd;
>> struct signalfd_siginfo fdsi;
>> ssize_t s;
>>
>> sigemptyset(&mask);
>> sigaddset(&mask, SIGINT);
>> sigaddset(&mask, SIGQUIT);
>>
>> /* Block signals so that they aren't handled
>> according to their default dispositions */
>>
>> if (sigprocmask(SIG_BLOCK, &mask, NULL) == -1)
>> handle_error("sigprocmask");
>>
>> sfd = signalfd(-1, &mask, 0);
>> if (sfd == -1)
>> handle_error("signalfd");
>>
>> for (;;) {
>> s = read(sfd, &fdsi, sizeof(struct signalfd_siginfo));
>> if (s != sizeof(struct signalfd_siginfo))
>> handle_error("read");
>>
>> if (fdsi.ssi_signo == SIGINT) {
>> printf("Got SIGINT\n");
>> } else if (fdsi.ssi_signo == SIGQUIT) {
>> printf("Got SIGQUIT\n");
>> exit(EXIT_SUCCESS);
>> } else {
>> printf("Read unexpected signal\n");
>> }
>> }
>> }
>>
>> $ ./signalfd_demo
>> ^CGot SIGINT
>> ^CGot SIGINT
>> ^\Got SIGQUIT
>>
>> Signed-off-by: Laurent Vivier <address@hidden>
>> ---
>> linux-user/syscall.c | 117
>> ++++++++++++++++++++++++++++++++++++++++++++++++++-
>> 1 file changed, 116 insertions(+), 1 deletion(-)
>>
>> diff --git a/linux-user/syscall.c b/linux-user/syscall.c
>> index 1622ad6..c72c440 100644
>> --- a/linux-user/syscall.c
>> +++ b/linux-user/syscall.c
>> @@ -60,6 +60,7 @@ int __clone2(int (*fn)(void *), void *child_stack_base,
>> #include <sys/statfs.h>
>> #include <utime.h>
>> #include <sys/sysinfo.h>
>> +#include <sys/signalfd.h>
>> //#include <sys/user.h>
>> #include <netinet/ip.h>
>> #include <netinet/tcp.h>
>> @@ -294,6 +295,15 @@ static bitmask_transtbl fcntl_flags_tbl[] = {
>> { 0, 0, 0, 0 }
>> };
>>
>> +struct target_fd_ops {
>> + abi_long (*read) (int, void *, size_t);
>
> Maybe could use a brief comment defining the API here (for instance
> that you need to return -target_errno isn't necessarily obvious).
>
>> +};
>> +
>> +typedef struct target_fd_ops target_fd_ops_t;
>
> QEMU style generally prefers CamelCase for struct and
> struct typedef names.
>
>> +
>> +static int target_fd_max;
>> +static target_fd_ops_t **target_fd_ops;
>> +
>> static int sys_getcwd1(char *buf, size_t size)
>> {
>> if (getcwd(buf, size) == NULL) {
>> @@ -4878,6 +4888,7 @@ void syscall_init(void)
>> const argtype *arg_type;
>> int size;
>> int i;
>> + struct rlimit rlim;
>>
>> #define STRUCT(name, ...) thunk_register_struct(STRUCT_ ## name, #name,
>> struct_ ## name ## _def);
>> #define STRUCT_SPECIAL(name) thunk_register_struct_direct(STRUCT_ ## name,
>> #name, &struct_ ## name ## _def);
>> @@ -4885,6 +4896,14 @@ void syscall_init(void)
>> #undef STRUCT
>> #undef STRUCT_SPECIAL
>>
>> + /* allocate an array to store fd ops */
>> +
>> + if (getrlimit(RLIMIT_NOFILE, &rlim) == 0) {
>> + target_fd_max = rlim.rlim_cur;
>> + target_fd_ops = g_malloc0(target_fd_max * sizeof(target_fd_ops_t
>> *));
>> + }
>
> ...and what if the guest raises the rlimit after we've started?
> I think it would be better to just dynamically allocate/reallocate
> the table when the guest does a signalfd syscall, rather than
> trying to allocate it once at startup.
>
>> +
>> +
>> /* Build target_to_host_errno_table[] table from
>> * host_to_target_errno_table[]. */
>> for (i = 0; i < ERRNO_TABLE_SIZE; i++) {
>> @@ -5512,6 +5531,51 @@ static target_timer_t get_timer_id(abi_long arg)
>> return timerid;
>> }
>>
>> +/* signalfd siginfo conversion */
>> +
>> +static void
>> +host_to_target_signalfd_siginfo(struct signalfd_siginfo *tinfo,
>> + const struct signalfd_siginfo *info)
>> +{
>> + int sig = host_to_target_signal(info->ssi_signo);
>> + tinfo->ssi_signo = tswap32(sig);
>> + tinfo->ssi_errno = 0; /* unused */
>
> (a) Why not swap it anyway?
> (b) the kernel seems to write something to this field:
> http://lxr.free-electrons.com/source/fs/signalfd.c#L97
>
>> + tinfo->ssi_code = tswap32(info->ssi_code);
>> + tinfo->ssi_pid = tswap32(info->ssi_pid);
>> + tinfo->ssi_uid = tswap32(info->ssi_uid);
>> + tinfo->ssi_fd = tswap32(info->ssi_fd);
>> + tinfo->ssi_tid = tswap32(info->ssi_tid);
>> + tinfo->ssi_band = tswap32(info->ssi_band);
>> + tinfo->ssi_overrun = tswap32(info->ssi_overrun);
>> + tinfo->ssi_trapno = tswap32(info->ssi_trapno);
>> + tinfo->ssi_status = tswap32(info->ssi_status);
>> + tinfo->ssi_int = tswap32(info->ssi_int);
>> + tinfo->ssi_ptr = tswap64(info->ssi_ptr);
>> + tinfo->ssi_utime = tswap64(info->ssi_utime);
>> + tinfo->ssi_stime = tswap64(info->ssi_stime);
>> + tinfo->ssi_addr = tswap64(info->ssi_addr);
>
> Missing
> tinfo->ssi_addr_lsb = tswap16(info->ssi_addr_lsb);
>
>> +}
>> +
>> +static abi_long target_signalfd_read(int fd, void *buf, size_t count)
>> +{
>> + int ret;
>> + int i;
>> +
>> + ret = get_errno(read(fd, buf, count));
>
> The return value from read might not fit into an int...
>
>> + if (ret < 0) {
>> + return ret;
>> + }
>> +
>> + for (i = 0; i < ret; i += sizeof(struct signalfd_siginfo)) {
>> + host_to_target_signalfd_siginfo(buf + i, buf + i);
>
> This is implicitly assuming that none of the fields in
> the target and host arrangements of signalfd_siginfo
> overlap, so that all we're doing is swapping each one
> in place. That's true, but it's a bit subtle...
>
>> + }
>> + return ret;
>> +}
>> +
>> +static target_fd_ops_t target_signalfd_ops = {
>> + .read = target_signalfd_read,
>> +};
>> +
>> /* do_syscall() should always have a single exit point at the end so
>> that actions, such as logging of syscall results, can be performed.
>> All errnos that do_syscall() returns must be -TARGET_<errcode>. */
>> @@ -5571,7 +5635,13 @@ abi_long do_syscall(void *cpu_env, int num, abi_long
>> arg1,
>> else {
>> if (!(p = lock_user(VERIFY_WRITE, arg2, arg3, 0)))
>> goto efault;
>> - ret = get_errno(read(arg1, p, arg3));
>> + if (arg1 < target_fd_max &&
>> + target_fd_ops[arg1] &&
>> + target_fd_ops[arg1]->read) {
>> + ret = target_fd_ops[arg1]->read(arg1, p, arg3);
>> + } else {
>> + ret = get_errno(read(arg1, p, arg3));
>> + }
>
> This won't do the right thing in the case of:
> emulated-under-QEMU program A does a signalfd() with CLOEXEC clear
> program A exec()s program B, also emulated under QEMU
> program B reads its inherited signalfd descriptor
>
> That's a bit of an edge case, of course, and practically impossible
> to get right. It is mentioned specifically as working in the
> signalfd manpage, though; maybe worth a comment somewhere that we
> don't do the right thing.
>
>> unlock_user(p, arg2, ret);
>> }
>> break;
>> @@ -5598,6 +5668,10 @@ abi_long do_syscall(void *cpu_env, int num, abi_long
>> arg1,
>> unlock_user(p, arg2, 0);
>> break;
>> case TARGET_NR_close:
>> + if (arg1 < target_fd_max &&
>> + target_fd_ops[arg1]) {
>> + target_fd_ops[arg1] = NULL;
>> + }
>
> This isn't the only place an fd can get closed; for instance
> dup2() can implicitly close an fd. I think you need a wider set
> of changes to either (a) NULL out the entries in all the places
> where we can close an fd or (b) NULL out the entries in all the
> places where we can open an fd.
>
>> ret = get_errno(close(arg1));
>> break;
>> case TARGET_NR_brk:
>> @@ -9451,6 +9525,26 @@ abi_long do_syscall(void *cpu_env, int num, abi_long
>> arg1,
>> break;
>> #endif
>> #endif
>> +#if defined(TARGET_NR_signalfd4)
>> + case TARGET_NR_signalfd4:
>> + {
>> + target_sigset_t *target_mask;
>> + sigset_t mask;
>> + if (!lock_user_struct(VERIFY_READ, target_mask, arg2, 1)) {
>> + goto efault;
>> + }
>> +
>> + target_to_host_sigset(&mask, target_mask);
>> +
>> + ret = get_errno(signalfd(arg1, &mask, arg4));
>
> The signalfd flags are SFD_NONBLOCK and SFD_CLOEXEC, which the
> kernel defines (and asserts) to be equal to O_NONBLOCK and
> O_CLOEXEC. Those are not the same for all architectures, so we
> need to convert the flags argument here.
>
>> + if (ret >= 0) {
>> + target_fd_ops[ret] = &target_signalfd_ops;
>
> We checked the fd against target_fds_max earlier, but we don't here?
> (Moot if you take my suggestion of auto-reallocating the fd_ops
> array here.)
>
>> + }
>> +
>> + unlock_user_struct(target_mask, arg2, 0);
>
> You might as well do the unlock as soon as you've done
> the target_to_host_sigset() conversion.
>
>> + }
>> + break;
>> +#endif
>> #if defined(CONFIG_EPOLL)
>> #if defined(TARGET_NR_epoll_create)
>> case TARGET_NR_epoll_create:
>> @@ -9743,6 +9837,27 @@ abi_long do_syscall(void *cpu_env, int num, abi_long
>> arg1,
>> }
>> #endif
>>
>> +#if defined(TARGET_NR_signalfd)
>
> Can we put this next to the signalfd4 code rather than separated by
> other cases?
>
>> + case TARGET_NR_signalfd:
>> + {
>> + target_sigset_t *target_mask;
>> + sigset_t mask;
>> + if (!lock_user_struct(VERIFY_READ, target_mask, arg2, 1)) {
>> + goto efault;
>> + }
>> +
>> + target_to_host_sigset(&mask, target_mask);
>> +
>> + ret = get_errno(signalfd(arg1, &mask, arg3));
>
> The syscall signalfd() has the arguments (fd, mask, sizemask),
> whereas the libc function takes (fd, mask, flags), so passing arg3
> here looks wrong. Should be const 0 ?
>
>> + if (ret >= 0) {
>> + target_fd_ops[ret] = &target_signalfd_ops;
>> + }
>> +
>> + unlock_user_struct(target_mask, arg2, 0);
>
> This is very similar to the signalfd4 implementation,
> unsurprisingly. I would suggest factoring it out into a
> do_signalfd4() function to be called from both places.
>
>> + }
>> + break;
>> +#endif
>> +
>> #if defined(TARGET_NR_timerfd_create) && defined(CONFIG_TIMERFD)
>> case TARGET_NR_timerfd_create:
>> ret = get_errno(timerfd_create(arg1,
>> --
>> 2.4.1
>
> thanks
> -- PMM
>