[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [PATCH 4/5 (v4)] x86_64: add 64-bit syscall entry point
From: Samuel Thibault
Subject: Re: [PATCH 4/5 (v4)] x86_64: add 64-bit syscall entry point
Date: Mon, 1 May 2023 02:00:43 +0200
User-agent: NeoMutt/20170609 (1.8.3)
Applied, thanks!
Luca Dariz, le mer. 19 avril 2023 21:47:02 +0200, a ecrit:
> While theoretically we could still use the same call gate as for
> 32-bit userspace, it doesn't seem very common, and gcc does not seem
> to encode the instruction properly. Instead we use syscall/sysret,
> like other kernels (e.g. XNU, Linux). This version still has some
> limitations, but should be enough to start working on the 64-bit user
> space.
>
> * i386/i386/i386asm.sym: add more constants to fill pcb->iss
> * i386/i386/ldt.c: configure 64-bit syscall entry point. We can just
> check for the SEP bit as MSR are always available on x86_64.
> * i386/i386/ldt.h: swap CS/DS segments order if !USER32 as required by
> sysret
> * i386/i386/locore.h: add syscall64 prototype
> * i386/i386/msr.h: add MSR definitions and C read/write helpers
> * i386/include/mach/i386/syscall_sw.h: remove old BSD_TRAP
> * x86_64/Makefrag.am: selectively install syscall_sw.h depending on
> USER32
> * x86_64/include/syscall_sw.h: add entry point template from user
> space
> * x86_64/locore.S: implement syscall64 entry point and use it when a
> 64-bit user-space is configured
> ---
> i386/i386/i386asm.sym | 15 +++
> i386/i386/ldt.c | 16 ++-
> i386/i386/ldt.h | 9 +-
> i386/i386/locore.h | 1 +
> i386/i386/msr.h | 56 ++++++++++
> i386/include/mach/i386/syscall_sw.h | 12 +--
> x86_64/Makefrag.am | 7 +-
> x86_64/include/syscall_sw.h | 40 +++++++
> x86_64/locore.S | 158 +++++++++++++++++++++++++++-
> 9 files changed, 294 insertions(+), 20 deletions(-)
> create mode 100644 i386/i386/msr.h
> create mode 100644 x86_64/include/syscall_sw.h
>
> diff --git a/i386/i386/i386asm.sym b/i386/i386/i386asm.sym
> index 8317db6c..1b9b40bb 100644
> --- a/i386/i386/i386asm.sym
> +++ b/i386/i386/i386asm.sym
> @@ -52,6 +52,8 @@ expr CALL_SINGLE_FUNCTION_BASE
>
> offset ApicLocalUnit lu apic_id APIC_ID
>
> +offset pcb pcb iss
> +
> offset thread th pcb
> offset thread th task
> offset thread th recover
> @@ -82,16 +84,29 @@ size i386_kernel_state iks
>
> size i386_exception_link iel
>
> +offset i386_saved_state r gs
> +offset i386_saved_state r fs
> offset i386_saved_state r cs
> offset i386_saved_state r uesp
> offset i386_saved_state r eax
> +offset i386_saved_state r ebx
> +offset i386_saved_state r ecx
> +offset i386_saved_state r edx
> +offset i386_saved_state r ebp
> offset i386_saved_state r trapno
> offset i386_saved_state r err
> offset i386_saved_state r efl R_EFLAGS
> offset i386_saved_state r eip
> offset i386_saved_state r cr2
> offset i386_saved_state r edi
> +offset i386_saved_state r esi
> #ifdef __x86_64__
> +offset i386_saved_state r r8
> +offset i386_saved_state r r9
> +offset i386_saved_state r r10
> +offset i386_saved_state r r12
> +offset i386_saved_state r r13
> +offset i386_saved_state r r14
> offset i386_saved_state r r15
> #endif
>
> diff --git a/i386/i386/ldt.c b/i386/i386/ldt.c
> index b86a0e3c..4d7ec19a 100644
> --- a/i386/i386/ldt.c
> +++ b/i386/i386/ldt.c
> @@ -31,6 +31,7 @@
> #include <mach/xen.h>
>
> #include <intel/pmap.h>
> +#include <kern/debug.h>
>
> #include "vm_param.h"
> #include "seg.h"
> @@ -38,6 +39,7 @@
> #include "ldt.h"
> #include "locore.h"
> #include "mp_desc.h"
> +#include "msr.h"
>
> #ifdef MACH_PV_DESCRIPTORS
> /* It is actually defined in xen_boothdr.S */
> @@ -65,10 +67,22 @@ ldt_fill(struct real_descriptor *myldt, struct real_descriptor *mygdt)
> ACC_PL_K|ACC_LDT, 0);
> #endif /* MACH_PV_DESCRIPTORS */
>
> - /* Initialize the 32bit LDT descriptors. */
> + /* Initialize the syscall entry point */
> +#if defined(__x86_64__) && ! defined(USER32)
> + if (!CPU_HAS_FEATURE(CPU_FEATURE_SEP))
> + panic("syscall support is missing on 64 bit");
> + /* Enable 64-bit syscalls */
> + wrmsr(MSR_REG_EFER, rdmsr(MSR_REG_EFER) | MSR_EFER_SCE);
> + wrmsr(MSR_REG_LSTAR, (vm_offset_t)syscall64);
> +	wrmsr(MSR_REG_STAR, ((((long)USER_CS - 16) << 16) | (long)KERNEL_CS) << 32);
> + wrmsr(MSR_REG_FMASK, 0); // ?
> +#else /* defined(__x86_64__) && ! defined(USER32) */
> fill_ldt_gate(myldt, USER_SCALL,
> (vm_offset_t)&syscall, KERNEL_CS,
> ACC_PL_U|ACC_CALL_GATE, 0);
> +#endif /* defined(__x86_64__) && ! defined(USER32) */
> +
> + /* Initialize the 32bit LDT descriptors. */
> fill_ldt_descriptor(myldt, USER_CS,
> VM_MIN_USER_ADDRESS,
> VM_MAX_USER_ADDRESS-VM_MIN_USER_ADDRESS-4096,
> diff --git a/i386/i386/ldt.h b/i386/i386/ldt.h
> index b15f11a5..51867f47 100644
> --- a/i386/i386/ldt.h
> +++ b/i386/i386/ldt.h
> @@ -43,11 +43,16 @@
> * User descriptors for Mach - 32-bit flat address space
> */
> #define USER_SCALL 0x07 /* system call gate */
> -#ifdef __x86_64__
> +#if defined(__x86_64__) && ! defined(USER32)
> /* Call gate needs two entries */
> -#endif
> +
> +/* The sysret instruction puts some constraints on the user segment indexes */
> +#define USER_CS 0x1f /* user code segment */
> +#define USER_DS 0x17 /* user data segment */
> +#else
> #define USER_CS 0x17 /* user code segment */
> #define USER_DS 0x1f /* user data segment */
> +#endif
>
> #define LDTSZ 4
>
> diff --git a/i386/i386/locore.h b/i386/i386/locore.h
> index a8807dbf..4388ea28 100644
> --- a/i386/i386/locore.h
> +++ b/i386/i386/locore.h
> @@ -57,6 +57,7 @@ extern int inst_fetch (int eip, int cs);
> extern void cpu_shutdown (void);
>
> extern int syscall (void);
> +extern int syscall64 (void);
>
> extern unsigned int cpu_features[2];
>
> diff --git a/i386/i386/msr.h b/i386/i386/msr.h
> new file mode 100644
> index 00000000..8f09b80b
> --- /dev/null
> +++ b/i386/i386/msr.h
> @@ -0,0 +1,56 @@
> +/*
> + * Copyright (C) 2023 Free Software Foundation
> + *
> + * This program is free software ; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation ; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY ; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with the program ; if not, write to the Free Software
> + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
> + */
> +
> +#ifndef _MACHINE_MSR_H_
> +#define _MACHINE_MSR_H_
> +
> +#define MSR_REG_EFER 0xC0000080
> +#define MSR_REG_STAR 0xC0000081
> +#define MSR_REG_LSTAR 0xC0000082
> +#define MSR_REG_CSTAR 0xC0000083
> +#define MSR_REG_FMASK 0xC0000084
> +#define MSR_REG_FSBASE 0xC0000100
> +#define MSR_REG_GSBASE 0xC0000101
> +
> +#define MSR_EFER_SCE 0x00000001
> +
> +#ifndef __ASSEMBLER__
> +
> +static inline void wrmsr(uint32_t regaddr, uint64_t value)
> +{
> + uint32_t low = (uint32_t) value, high = ((uint32_t) (value >> 32));
> + asm volatile("wrmsr"
> + :
> + : "c" (regaddr), "a" (low), "d" (high)
> + : "memory" /* wrmsr may cause a read from memory, so
> + * make the compiler flush any changes */
> + );
> +}
> +
> +static inline uint64_t rdmsr(uint32_t regaddr)
> +{
> + uint32_t low, high;
> + asm volatile("rdmsr"
> + : "=a" (low), "=d" (high)
> + : "c" (regaddr)
> + );
> + return ((uint64_t)high << 32) | low;
> +}
> +#endif /* __ASSEMBLER__ */
> +
> +#endif /* _MACHINE_MSR_H_ */
> diff --git a/i386/include/mach/i386/syscall_sw.h b/i386/include/mach/i386/syscall_sw.h
> index 86f6ff2f..9eeb2939 100644
> --- a/i386/include/mach/i386/syscall_sw.h
> +++ b/i386/include/mach/i386/syscall_sw.h
> @@ -29,21 +29,11 @@
>
> #include <mach/machine/asm.h>
>
> -#if BSD_TRAP
> -#define kernel_trap(trap_name,trap_number,number_args) \
> +#define kernel_trap(trap_name,trap_number,number_args) \
> ENTRY(trap_name) \
> movl $ trap_number,%eax; \
> SVC; \
> - jb LCL(cerror); \
> ret; \
> END(trap_name)
> -#else
> -#define kernel_trap(trap_name,trap_number,number_args) \
> -ENTRY(trap_name) \
> - movl $ trap_number,%eax; \
> - SVC; \
> - ret; \
> -END(trap_name)
> -#endif
>
> #endif /* _MACH_I386_SYSCALL_SW_H_ */
> diff --git a/x86_64/Makefrag.am b/x86_64/Makefrag.am
> index d3735890..fb225aa5 100644
> --- a/x86_64/Makefrag.am
> +++ b/x86_64/Makefrag.am
> @@ -175,11 +175,16 @@ include_mach_x86_64_HEADERS = \
> i386/include/mach/i386/mach_i386_types.h \
> i386/include/mach/i386/machine_types.defs \
> i386/include/mach/i386/multiboot.h \
> - i386/include/mach/i386/syscall_sw.h \
> i386/include/mach/i386/thread_status.h \
> i386/include/mach/i386/trap.h \
> i386/include/mach/i386/vm_param.h \
> i386/include/mach/i386/vm_types.h
> +
> +if enable_user32
> +include_mach_x86_64_HEADERS += i386/include/mach/i386/syscall_sw.h
> +else
> +include_mach_x86_64_HEADERS += x86_64/include/syscall_sw.h
> +endif
>
> #
> # Platform specific parts.
> diff --git a/x86_64/include/syscall_sw.h b/x86_64/include/syscall_sw.h
> new file mode 100644
> index 00000000..4e03f28c
> --- /dev/null
> +++ b/x86_64/include/syscall_sw.h
> @@ -0,0 +1,40 @@
> +/*
> + * Mach Operating System
> + * Copyright (c) 1991,1990,1989,1988 Carnegie Mellon University
> + * All Rights Reserved.
> + *
> + * Permission to use, copy, modify and distribute this software and its
> + * documentation is hereby granted, provided that both the copyright
> + * notice and this permission notice appear in all copies of the
> + * software, derivative works or modified versions, and any portions
> + * thereof, and that both notices appear in supporting documentation.
> + *
> + * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
> + * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
> + * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
> + *
> + * Carnegie Mellon requests users of this software to return to
> + *
> + * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
> + * School of Computer Science
> + * Carnegie Mellon University
> + * Pittsburgh PA 15213-3890
> + *
> + * any improvements or extensions that they make and grant Carnegie Mellon
> + * the rights to redistribute these changes.
> + */
> +
> +#ifndef _MACH_X86_64_SYSCALL_SW_H_
> +#define _MACH_X86_64_SYSCALL_SW_H_
> +
> +#include <mach/machine/asm.h>
> +
> +#define kernel_trap(trap_name,trap_number,number_args) \
> +ENTRY(trap_name) \
> + movq $ trap_number,%rax; \
> + movq %rcx,%r10; \
> + syscall; \
> + ret; \
> +END(trap_name)
> +
> +#endif /* _MACH_X86_64_SYSCALL_SW_H_ */
> diff --git a/x86_64/locore.S b/x86_64/locore.S
> index ea5c71d6..1b17d921 100644
> --- a/x86_64/locore.S
> +++ b/x86_64/locore.S
> @@ -423,13 +423,17 @@ ENTRY(t_debug)
> /* Note: handling KERNEL_RING value by hand */
> testq $2,8(%rsp) /* is trap from kernel mode? */
> jnz 0f /* if so: */
> +#ifdef USER32
> cmpq $syscall_entry,(%rsp) /* system call entry? */
> jne 0f /* if so: */
> /* flags are sitting where syscall */
> /* wants them */
> addq $32,%rsp /* remove eip/cs */
> jmp syscall_entry_2 /* continue system call entry */
> -
> +#else
> + // TODO: implement the 64-bit case
> + ud2
> +#endif
> 0: pushq $0 /* otherwise: */
> pushq $(T_DEBUG) /* handle as normal */
> jmp EXT(alltraps) /* debug fault */
> @@ -497,12 +501,12 @@ trap_from_user:
> _take_trap:
> movq %rbx,%rdi /* pass register save area to trap */
> call EXT(user_trap) /* call user trap routine */
> -
> +#ifdef USER32
> orq %rax,%rax /* emulated syscall? */
> jz 1f /* no, just return */
> movq R_EAX(%rbx),%rax /* yes, get syscall number */
> jmp syscall_entry_3 /* and emulate it */
> -
> +#endif
> 1:
> movq (%rsp),%rsp /* switch back to PCB stack */
>
> @@ -1055,6 +1059,7 @@ ud2
>
> #endif /* MACH_TTD */
>
> +#ifdef USER32
> /*
> * System call enters through a call gate. Flags are not saved -
> * we must shuffle stack to look like trap save area.
> @@ -1269,7 +1274,152 @@ syscall_addr:
> movq $(T_PF_USER),R_ERR(%rbx)
> /* set error code - read user space */
> jmp _take_trap /* treat as a trap */
> +END(syscall)
> +
> +#else /* USER32 */
> +
> +/* Entry point for 64-bit syscalls.
> + * On entry we're still on the user stack, so better not use it. Instead we
> + * save the thread state immediately in thread->pcb->iss, then try to invoke
> + * the syscall.
> + * Note: emulated syscalls seem to not be used anymore in GNU/Hurd, so they
> + * are not handled here.
> + * TODO:
> + - for now we assume the return address is canonical, but apparently there
> +   can be cases where it's not (see how Linux handles this). Does it apply
> +   here?
> + - do we need to check for ast on syscalls? Maybe on interrupts is enough
> + - check that the case where a task is suspended, and later returns via
> + iretq from return_from_trap, works fine in all combinations
> + */
> +ENTRY(syscall64)
> + /* RFLAGS[32:63] are reserved, so combine syscall num (32 bit) and
> + * eflags in RAX to allow using r11 as temporary register
> + */
> + shlq $32,%r11
> + shlq $32,%rax /* make sure bits 32:63 of %rax are zero */
> + shrq $32,%rax
> + or %r11,%rax
> +
> + /* Save thread state in pcb->iss, as on exception entry.
> + * Since this is triggered synchronously from userspace, we could
> + * save only the callee-preserved status according to the C ABI,
> + * plus RIP and EFLAGS for sysret
> + */
> + CPU_NUMBER(%r11)
> + movq CX(EXT(active_threads),%r11),%r11 /* point to current thread */
> + movq TH_PCB(%r11),%r11 /* point to pcb */
> + addq $ PCB_ISS,%r11 /* point to saved state */
> +
> + mov %rsp,R_UESP(%r11) /* callee-preserved register */
> + mov %rcx,R_EIP(%r11) /* syscall places user RIP in RCX */
> + mov %rbx,R_EBX(%r11) /* callee-preserved register */
> + mov %rax,%rbx /* Now we can unpack eflags again */
> + shr $32,%rbx
> + mov %rbx,R_EFLAGS(%r11) /* ... and save them in pcb as well */
> + mov %rbp,R_EBP(%r11) /* callee-preserved register */
> + mov %r12,R_R12(%r11) /* callee-preserved register */
> + mov %r13,R_R13(%r11) /* callee-preserved register */
> + mov %r14,R_R14(%r11) /* callee-preserved register */
> + mov %r15,R_R15(%r11) /* callee-preserved register */
> +
> +	/* Save syscall number and args for SYSCALL_EXAMINE/MSG_EXAMINE in glibc.
> + * Note: syscall number is only 32 bit, in EAX, so we sign-extend it in
> + * RAX to mask the EFLAGS bits.
> + */
> + cdqe /* sign-extend EAX in RAX */
> + mov %rax,R_EAX(%r11) /* syscall number */
> + mov %rdi,R_EDI(%r11) /* syscall arg0 */
> + mov %rsi,R_ESI(%r11) /* syscall arg1 */
> + mov %rdx,R_EDX(%r11) /* syscall arg2 */
> + mov %r10,R_R10(%r11) /* syscall arg3 */
> + mov %r8,R_R8(%r11) /* syscall arg4 */
> + mov %r9,R_R9(%r11) /* syscall arg5 */
> +
> + mov %r11,%rbx /* prepare for error handling */
> +	mov	%r10,%rcx	/* fix arg3 location according to C ABI */
> +
> + /* switch to kernel stack */
> + CPU_NUMBER(%r11)
> + movq CX(EXT(kernel_stack),%r11),%rsp
> +
> + /* Now we have saved state and args 1-6 are in place.
> + * Before invoking the syscall we do some bound checking and,
> + * if we have more that 6 arguments, we need to copy the
> + * remaining ones to the kernel stack, handling page faults when
> + * accessing the user stack.
> + */
> + negl %eax /* get system call number */
> + jl _syscall64_range /* out of range if it was positive */
> +	cmpl	EXT(mach_trap_count),%eax	/* check system call table bounds */
> + jg _syscall64_range /* error if out of range */
> + shll $5,%eax /* manual indexing of mach_trap_t */
> +
> + /* check if we need to place some arguments on the stack */
> +_syscall64_args_stack:
> + mov EXT(mach_trap_table)(%rax),%r10 /* get number of arguments */
> +	subq	$6,%r10			/* the first 6 args are already in place */
> +	jle	_syscall64_call		/* skip argument copy if num args <= 6 */
> +
> + movq R_UESP(%rbx),%r11 /* get user stack pointer */
> + addq $8,%r11 /* Skip user return address */
>
> + lea (%r11,%r10,8),%r11 /* point past last argument */
> +
> +0: subq $8,%r11
> + RECOVER(_syscall64_addr_push)
> + mov (%r11),%r12
> + pushq %r12 /* push argument on stack */
> + dec %r10
> + jnz 0b /* loop for all remaining arguments */
> +
> +_syscall64_call:
> + call *EXT(mach_trap_table)+8(%rax) /* call procedure */
> + // XXX: check ast on exit?
> +
> + /* Restore thread state and return to user using sysret. */
> + CPU_NUMBER(%r11)
> + movq CX(EXT(active_threads),%r11),%r11 /* point to current thread */
> + movq TH_PCB(%r11),%r11 /* point to pcb */
> + addq $ PCB_ISS,%r11 /* point to saved state */
> +
> + /* Restore syscall args. Note: we can't restore the syscall number in
> + * RAX because it needs to hold the return value.*/
> + mov R_EDI(%r11),%rdi /* syscall arg0 */
> + mov R_ESI(%r11),%rsi /* syscall arg1 */
> + mov R_EDX(%r11),%rdx /* syscall arg2 */
> + mov R_R10(%r11),%r10 /* syscall arg3 */
> + mov R_R8(%r11),%r8 /* syscall arg4 */
> + mov R_R9(%r11),%r9 /* syscall arg5 */
> +
> + mov R_UESP(%r11),%rsp /* callee-preserved register,
> + * also switch back to user stack */
> + mov R_EIP(%r11),%rcx /* sysret convention */
> + mov R_EBX(%r11),%rbx /* callee-preserved register */
> + mov R_EBP(%r11),%rbp /* callee-preserved register */
> + mov R_R12(%r11),%r12 /* callee-preserved register */
> + mov R_R13(%r11),%r13 /* callee-preserved register */
> + mov R_R14(%r11),%r14 /* callee-preserved register */
> + mov R_R15(%r11),%r15 /* callee-preserved register */
> + mov R_EFLAGS(%r11),%r11 /* sysret convention */
> +
> + sysretq /* fast return to user-space, the thread didn't block */
> +
> +/* Error handling fragments, from here we jump directly to the trap handler */
> +_syscall64_addr_push:
> + movq %r11,R_CR2(%rbx) /* set fault address */
> + movq $(T_PAGE_FAULT),R_TRAPNO(%rbx) /* set page-fault trap */
> + movq $(T_PF_USER),R_ERR(%rbx) /* set error code - read user space */
> + jmp _take_trap /* treat as a trap */
> +
> +_syscall64_range:
> + movq $(T_INVALID_OPCODE),R_TRAPNO(%rbx)
> + /* set invalid-operation trap */
> + movq $0,R_ERR(%rbx) /* clear error code */
> + jmp _take_trap /* treat as a trap */
> +
> +END(syscall64)
> +#endif /* USER32 */
>
> .data
> DATA(cpu_features)
> @@ -1279,8 +1429,6 @@ DATA(cpu_features_ecx)
> .long 0
> .text
>
> -END(syscall)
> -
> /* Discover what kind of cpu we have; return the family number
> (3, 4, 5, 6, for 386, 486, 586, 686 respectively). */
> ENTRY(discover_x86_cpu_type)
> --
> 2.30.2
>
>
--
Samuel
---
Pour une évaluation indépendante, transparente et rigoureuse !
Je soutiens la Commission d'Évaluation de l'Inria.
- Re: [PATCH 5/5] add setting gs/fsbase, (continued)
- Re: [PATCH 5/5] add setting gs/fsbase, Luca Dariz, 2023/04/24
- Re: [PATCH 5/5] add setting gs/fsbase, Sergey Bugaev, 2023/04/24
- Re: [PATCH 5/5] add setting gs/fsbase, Samuel Thibault, 2023/04/24
- Re: [PATCH 5/5] add setting gs/fsbase, Sergey Bugaev, 2023/04/25
- Re: [PATCH 5/5] add setting gs/fsbase, Sergey Bugaev, 2023/04/25
- Re: [PATCH 5/5] add setting gs/fsbase, Samuel Thibault, 2023/04/25
- Re: [PATCH 5/5] add setting gs/fsbase, Sergey Bugaev, 2023/04/26
[PATCH 2/5] fix copyoutmsg for 64-bit userspace, Luca Dariz, 2023/04/19
[PATCH 4/5 (v4)] x86_64: add 64-bit syscall entry point, Luca Dariz, 2023/04/19
- Re: [PATCH 4/5 (v4)] x86_64: add 64-bit syscall entry point, Samuel Thibault <=
Re: [PATCH 1/5] fix address fault for 32-on-64-bit syscall, Samuel Thibault, 2023/04/20