[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [PATCH 4/5] x86_64: refactor segment register handling
From: |
Samuel Thibault |
Subject: |
Re: [PATCH 4/5] x86_64: refactor segment register handling |
Date: |
Fri, 4 Aug 2023 23:46:19 +0200 |
User-agent: |
NeoMutt/20170609 (1.8.3) |
Applied, thanks!
Luca Dariz, le sam. 29 juil. 2023 19:47:52 +0200, a ecrit:
> The actual values are not saved together with the rest of the thread
> state, both because it would be quite espensive (reading MSR, unless
> rdfsbase instructions are supported, but that's optional) and not
> really needed. The only way the user has to change its value is with a
> specific RPC, so we can intercept the change easily. Furthermore,
> Leaving the values there exposes them to being corrupted in case of a
> double interruption, e.g. an irq is handled just before iretq but
> after starting to restore the thread state. This solution was
> suggested by Sergey Bugaev.
>
> * i386/i386/db_trace.c: remove fsbase/gsbase from the registers
> available
> * i386/i386/debug_i386.c: remove fsbase/gsbase from the printed thread
> state
> * i386/i386/i386asm.sym: remove fsbase/gsbase as it's not needed in
> asm anymore
> * i386/i386/pcb.c: point fsbase/gsbase to the new location
> * i386/i386/thread.h: move fsbase/gsbase to the machine state
> * x86_64/locore.S: generalize segment-handling including es/ds/gs/fs
> and remove fsbase/gsbase handling. Also, factor out kernel segment
> selector setting to a macro.
> ---
> i386/i386/db_trace.c | 2 -
> i386/i386/debug_i386.c | 2 -
> i386/i386/i386asm.sym | 2 -
> i386/i386/pcb.c | 12 +--
> i386/i386/thread.h | 22 +++-
> x86_64/locore.S | 228 +++++++++++++++--------------------------
> 6 files changed, 108 insertions(+), 160 deletions(-)
>
> diff --git a/i386/i386/db_trace.c b/i386/i386/db_trace.c
> index 2b6ad741..8bd86fa5 100644
> --- a/i386/i386/db_trace.c
> +++ b/i386/i386/db_trace.c
> @@ -78,8 +78,6 @@ struct db_variable db_regs[] = {
> { "r13",(long *)&ddb_regs.r13, db_i386_reg_value },
> { "r14",(long *)&ddb_regs.r14, db_i386_reg_value },
> { "r15",(long *)&ddb_regs.r15, db_i386_reg_value },
> - { "fsb",(long *)&ddb_regs.fsbase,db_i386_reg_value },
> - { "gsb",(long *)&ddb_regs.gsbase,db_i386_reg_value },
> #endif
> };
> struct db_variable *db_eregs = db_regs + sizeof(db_regs)/sizeof(db_regs[0]);
> diff --git a/i386/i386/debug_i386.c b/i386/i386/debug_i386.c
> index b5465796..41d032e3 100644
> --- a/i386/i386/debug_i386.c
> +++ b/i386/i386/debug_i386.c
> @@ -40,8 +40,6 @@ void dump_ss(const struct i386_saved_state *st)
> st->r8, st->r9, st->r10, st->r11);
> printf("R12 %016lx R13 %016lx R14 %016lx R15 %016lx\n",
> st->r12, st->r13, st->r14, st->r15);
> - printf("FSBASE %016lx GSBASE %016lx\n",
> - st->fsbase, st->gsbase);
> printf("RIP %016lx EFLAGS %08lx\n", st->eip, st->efl);
> #else
> printf("EAX %08lx EBX %08lx ECX %08lx EDX %08lx\n",
> diff --git a/i386/i386/i386asm.sym b/i386/i386/i386asm.sym
> index fd0be557..1b9b40bb 100644
> --- a/i386/i386/i386asm.sym
> +++ b/i386/i386/i386asm.sym
> @@ -108,8 +108,6 @@ offset i386_saved_state r r12
> offset i386_saved_state r r13
> offset i386_saved_state r r14
> offset i386_saved_state r r15
> -offset i386_saved_state r fsbase
> -offset i386_saved_state r gsbase
> #endif
>
> offset i386_interrupt_state i eip
> diff --git a/i386/i386/pcb.c b/i386/i386/pcb.c
> index d1c5fb50..1cf87eb1 100644
> --- a/i386/i386/pcb.c
> +++ b/i386/i386/pcb.c
> @@ -229,8 +229,8 @@ void switch_ktss(pcb_t pcb)
> #endif /* MACH_PV_DESCRIPTORS */
>
> #if defined(__x86_64__) && !defined(USER32)
> - wrmsr(MSR_REG_FSBASE, pcb->iss.fsbase);
> - wrmsr(MSR_REG_GSBASE, pcb->iss.gsbase);
> + wrmsr(MSR_REG_FSBASE, pcb->ims.sbs.fsbase);
> + wrmsr(MSR_REG_GSBASE, pcb->ims.sbs.gsbase);
> #endif
>
> db_load_context(pcb);
> @@ -687,8 +687,8 @@ kern_return_t thread_setstatus(
> return KERN_INVALID_ARGUMENT;
>
> state = (struct i386_fsgs_base_state *) tstate;
> - thread->pcb->iss.fsbase = state->fs_base;
> - thread->pcb->iss.gsbase = state->gs_base;
> + thread->pcb->ims.sbs.fsbase = state->fs_base;
> + thread->pcb->ims.sbs.gsbase = state->gs_base;
> if (thread == current_thread()) {
> wrmsr(MSR_REG_FSBASE, state->fs_base);
> wrmsr(MSR_REG_GSBASE, state->gs_base);
> @@ -889,8 +889,8 @@ kern_return_t thread_getstatus(
> return KERN_INVALID_ARGUMENT;
>
> state = (struct i386_fsgs_base_state *) tstate;
> - state->fs_base = thread->pcb->iss.fsbase;
> - state->gs_base = thread->pcb->iss.gsbase;
> + state->fs_base = thread->pcb->ims.sbs.fsbase;
> + state->gs_base = thread->pcb->ims.sbs.gsbase;
> *count = i386_FSGS_BASE_STATE_COUNT;
> break;
> }
> diff --git a/i386/i386/thread.h b/i386/i386/thread.h
> index 2378154f..86a44098 100644
> --- a/i386/i386/thread.h
> +++ b/i386/i386/thread.h
> @@ -51,10 +51,6 @@
> */
>
> struct i386_saved_state {
> -#ifdef __x86_64__
> - unsigned long fsbase;
> - unsigned long gsbase;
> -#endif
> unsigned long gs;
> unsigned long fs;
> unsigned long es;
> @@ -162,6 +158,13 @@ struct v86_assist_state {
> #define V86_IF_PENDING 0x8000 /* unused bit */
> #endif
>
> +#if defined(__x86_64__) && !defined(USER32)
> +struct i386_segment_base_state {
> + unsigned long fsbase;
> + unsigned long gsbase;
> +};
> +#endif
> +
> /*
> * i386_interrupt_state:
> *
> @@ -206,14 +209,23 @@ struct i386_machine_state {
> #endif
> struct real_descriptor user_gdt[USER_GDT_SLOTS];
> struct i386_debug_state ids;
> +#if defined(__x86_64__) && !defined(USER32)
> + struct i386_segment_base_state sbs;
> +#endif
> };
>
> typedef struct pcb {
> + /* START of the exception stack.
> + * NOTE: this area is used as exception stack when switching
> + * CPL, and it MUST be big enough to save the thread state and
> + * switch to a proper stack area, even considering recursive
> + * exceptions, otherwise it could corrupt nearby memory */
> struct i386_interrupt_state iis[2]; /* interrupt and NMI */
> #ifdef __x86_64__
> - unsigned long pad; /* ensure exception stack is aligned to 16 */
> + unsigned long pad; /* ensure exception stack is aligned to 16 */
> #endif
> struct i386_saved_state iss;
> + /* END of exception stack*/
> struct i386_machine_state ims;
> decl_simple_lock_data(, lock)
> unsigned short init_control; /* Initial FPU control to set */
> diff --git a/x86_64/locore.S b/x86_64/locore.S
> index 7127957b..66a9436a 100644
> --- a/x86_64/locore.S
> +++ b/x86_64/locore.S
> @@ -39,6 +39,10 @@
> #include <i386/i386/cpu_number.h>
> #include <i386/i386/xen.h>
>
> +
> +/*
> + * Helpers for thread state as saved in the pcb area, during trap or irq
> handling
> + */
> #define pusha \
> pushq %rax ;\
> pushq %rcx ;\
> @@ -75,45 +79,74 @@
> popq %rcx ;\
> popq %rax
>
> +/*
> + * Note that we have to load the kernel segment registers even if this
> + * is a trap from the kernel, because the kernel uses user segment
> + * registers for copyin/copyout.
> + * (XXX Would it be smarter just to use fs or gs for that?)
> + */
> #ifdef USER32
> -#define PUSH_FSGS \
> +#define PUSH_SEGMENTS(reg) \
> + movq %ds,reg ;\
> + pushq reg ;\
> + movq %es,reg ;\
> + pushq reg ;\
> pushq %fs ;\
> - pushq %gs ;\
> - subq $16,%rsp
> + pushq %gs
> #else
> -#define PUSH_FSGS \
> +#define PUSH_SEGMENTS(reg) \
> subq $32,%rsp
> #endif
>
> #ifdef USER32
> -#define POP_FSGS \
> +#define POP_SEGMENTS(reg) \
> popq %gs ;\
> popq %fs ;\
> - addq $16,%rsp
> + popq reg ;\
> + movq reg,%es ;\
> + popq reg ;\
> + movq reg,%ds
> #else
> -#define POP_FSGS \
> +#define POP_SEGMENTS(reg) \
> addq $32,%rsp
> #endif
>
> #ifdef USER32
> -#define PUSH_FSGS_ISR \
> +#define PUSH_SEGMENTS_ISR(reg) \
> + movq %ds,reg ;\
> + pushq reg ;\
> + movq %es,reg ;\
> + pushq reg ;\
> pushq %fs ;\
> pushq %gs
> #else
> -#define PUSH_FSGS_ISR \
> - subq $16,%rsp
> +#define PUSH_SEGMENTS_ISR(reg) \
> + subq $32,%rsp
> #endif
>
> #ifdef USER32
> -#define POP_FSGS_ISR \
> +#define POP_SEGMENTS_ISR(reg) \
> popq %gs ;\
> - popq %fs
> + popq %fs ;\
> + popq reg ;\
> + movq reg,%es ;\
> + popq reg ;\
> + movq reg,%ds
> #else
> -#define POP_FSGS_ISR \
> - addq $16,%rsp
> +#define POP_SEGMENTS_ISR(reg) \
> + addq $32,%rsp
> #endif
>
> -
> +#ifdef USER32
> +#define SET_KERNEL_SEGMENTS \
> + mov %ss,%dx /* switch to kernel segments */ ;\
> + mov %dx,%ds ;\
> + mov %dx,%es ;\
> + mov %dx,%fs ;\
> + mov %dx,%gs
> +#else
> +#define SET_KERNEL_SEGMENTS
> +#endif
>
> /*
> * Fault recovery.
> @@ -350,32 +383,17 @@ ENTRY(start_timer)
> /*
> * Trap/interrupt entry points.
> *
> - * All traps must create the following save area on the kernel stack:
> - *
> - * gs
> - * fs
> - * es
> - * ds
> - * edi
> - * esi
> - * ebp
> - * cr2 if page fault - otherwise unused
> - * ebx
> - * edx
> - * ecx
> - * eax
> - * trap number
> - * error code
> - * eip
> - * cs
> - * eflags
> - * user rsp - if from user
> - * user ss - if from user
> - * es - if from V86 thread
> - * ds - if from V86 thread
> - * fs - if from V86 thread
> - * gs - if from V86 thread
> + * All traps must create the i386_saved_state struct on the stack on
> + * entry. Note that:
> + * - CR2 is only used if the trap is a page fault
> + * - user_rsp/user_ss are only used if entering from user space
> + * - v86_regs are used only from V86 threads
> + * (TODO check if V86 is still used with USER32)
> *
> + * Depending the CPL before entry, the stack might be switched or not;
> + * if entering from user-space the CPU loads TSS->RSP0 in RSP,
> + * otherwise RSP is unchanged. After this, the cpu pushes
> + * SS/RSP/RFLAFS/CS/RIP and optionally ErrorCode and executes the handler.
> */
>
> /* Try to save/show some information when a double fault happens
> @@ -426,16 +444,16 @@ trap_check_kernel_exit:
> /* check for the kernel exit sequence */
> cmpq $_kret_iret,16(%rsp) /* on IRET? */
> je fault_iret
> -#if 0
> +#ifdef USER32
> cmpq $_kret_popl_ds,16(%rsp) /* popping DS? */
> je fault_popl_ds
> cmpq $_kret_popl_es,16(%rsp) /* popping ES? */
> je fault_popl_es
> -#endif
> cmpq $_kret_popl_fs,16(%rsp) /* popping FS? */
> je fault_popl_fs
> cmpq $_kret_popl_gs,16(%rsp) /* popping GS? */
> je fault_popl_gs
> +#endif
> take_fault: /* if none of the above: */
> jmp EXT(alltraps) /* treat as normal trap. */
>
> @@ -464,6 +482,7 @@ fault_iret:
> popq %rax /* restore eax */
> jmp EXT(alltraps) /* take fault */
>
> +#ifdef USER32
> /*
> * Fault restoring a segment register. The user's registers are still
> * saved on the stack. The offending segment register has not been
> @@ -499,6 +518,7 @@ push_gs:
> push_gsbase:
> pushq $0
> pushq $0
> +#endif
> push_segregs:
> movq %rax,R_TRAPNO(%rsp) /* set trap number */
> movq %rdx,R_ERR(%rsp) /* set error code */
> @@ -562,23 +582,8 @@ ENTRY(t_page_fault)
> ENTRY(alltraps)
> pusha /* save the general registers */
> trap_push_segs:
> - movq %ds,%rax /* and the segment registers */
> - pushq %rax
> - movq %es,%rax /* and the segment registers */
> - pushq %rax
> - PUSH_FSGS
> -
> - /* Note that we have to load the segment registers
> - even if this is a trap from the kernel,
> - because the kernel uses user segment registers for copyin/copyout.
> - (XXX Would it be smarter just to use fs or gs for that?) */
> - mov %ss,%ax /* switch to kernel data segment */
> - mov %ax,%ds /* (same as kernel stack segment) */
> - mov %ax,%es
> -#ifdef USER32
> - mov %ax,%fs
> - mov %ax,%gs
> -#endif
> + PUSH_SEGMENTS(%rax)
> + SET_KERNEL_SEGMENTS
> trap_set_segs:
> cld /* clear direction flag */
> #ifdef USER32
> @@ -634,23 +639,20 @@ _return_to_user:
> */
>
> _return_from_kernel:
> - addq $16,%rsp /* skip FS/GS base */
> #ifndef USER32
> -_kret_popl_gs:
> -_kret_popl_fs:
> - addq $16,%rsp /* skip FS/GS selector */
> + addq $32,%rsp /* skip FS/GS selector */
> #else
> _kret_popl_gs:
> popq %gs /* restore segment registers */
> _kret_popl_fs:
> popq %fs
> -#endif
> _kret_popl_es:
> popq %rax
> movq %rax,%es
> _kret_popl_ds:
> popq %rax
> movq %rax,%ds
> +#endif
> popa /* restore general registers */
> addq $16,%rsp /* discard trap number and error code */
> _kret_iret:
> @@ -777,8 +779,11 @@ INTERRUPT(255)
> /* XXX handle NMI - at least print a warning like Linux does. */
>
> /*
> - * All interrupts enter here.
> - * old %eax on stack; interrupt number in %eax.
> + * All interrupts enter here. The cpu might have loaded a new RSP,
> + * depending on the previous CPL, as in alltraps.
> + * Old %eax on stack, interrupt number in %eax; we need to fill the remaining
> + * fields of struct i386_interrupt_state, which might be in the pcb or in the
> + * interrupt stack.
> */
> ENTRY(all_intrs)
> pushq %rcx /* save registers */
> @@ -791,24 +796,15 @@ ENTRY(all_intrs)
> pushq %r11
> cld /* clear direction flag */
>
> - movq %ds,%rdx /* save segment registers */
> - pushq %rdx
> - movq %es,%rdx
> - pushq %rdx
> - PUSH_FSGS_ISR
> + PUSH_SEGMENTS_ISR(%rdx)
>
> movq %rsp,%rdx /* on an interrupt stack? */
> and $(~(INTSTACK_SIZE-1)),%rdx
> cmpq %ss:EXT(int_stack_base),%rdx
> je int_from_intstack /* if not: */
>
> - mov %ss,%dx /* switch to kernel segments */
> - mov %dx,%ds
> - mov %dx,%es
> -#ifdef USER32
> - mov %dx,%fs
> - mov %dx,%gs
> -#endif
> + SET_KERNEL_SEGMENTS
> +
> CPU_NUMBER(%edx)
>
> movq CX(EXT(int_stack_top),%edx),%rcx
> @@ -849,11 +845,7 @@ LEXT(return_to_iret) /* ( label for
> kdb_kintr and hardclock) */
> cmpq $0,CX(EXT(need_ast),%edx)
> jnz ast_from_interrupt /* take it if so */
> 1:
> - POP_FSGS_ISR
> - pop %rdx
> - mov %rdx,%es
> - pop %rdx
> - mov %rdx,%ds
> + POP_SEGMENTS_ISR(%rdx)
> pop %r11
> pop %r10
> pop %r9
> @@ -871,12 +863,7 @@ int_from_intstack:
> jb stack_overflowed /* if not: */
> call EXT(interrupt) /* call interrupt routine */
> _return_to_iret_i: /* ( label for kdb_kintr) */
> - POP_FSGS_ISR
> - pop %rdx
> - mov %rdx,%es
> - pop %rdx
> - mov %rdx,%ds
> -
> + POP_SEGMENTS_ISR(%rdx)
> pop %r11
> pop %r10
> pop %r9
> @@ -909,11 +896,7 @@ stack_overflowed:
> * ss
> */
> ast_from_interrupt:
> - POP_FSGS_ISR
> - pop %rdx
> - mov %rdx,%es
> - pop %rdx
> - mov %rdx,%ds
> + POP_SEGMENTS_ISR(%rdx)
> popq %r11
> popq %r10
> popq %r9
> @@ -926,19 +909,8 @@ ast_from_interrupt:
> pushq $0 /* zero code */
> pushq $0 /* zero trap number */
> pusha /* save general registers */
> - mov %ds,%rdx /* save segment registers */
> - push %rdx
> - mov %es,%rdx
> - push %rdx
> - PUSH_FSGS_ISR
> -
> - mov %ss,%dx /* switch to kernel segments */
> - mov %dx,%ds
> - mov %dx,%es
> -#ifdef USER32
> - mov %dx,%fs
> - mov %dx,%gs
> -#endif
> + PUSH_SEGMENTS_ISR(%rdx)
> + SET_KERNEL_SEGMENTS
> CPU_NUMBER(%edx)
> TIME_TRAP_UENTRY
>
> @@ -1056,20 +1028,12 @@ kdb_from_iret_i: /* on interrupt
> stack */
> pushq $0 /* zero error code */
> pushq $0 /* zero trap number */
> pusha /* save general registers */
> - mov %ds,%rdx /* save segment registers */
> - push %rdx
> - mov %es,%rdx
> - push %rdx
> - PUSH_FSGS
> + PUSH_SEGMENTS(%rdx)
> movq %rsp,%rdx /* pass regs, */
> movq $0,%rsi /* code, */
> movq $-1,%rdi /* type to kdb */
> call EXT(kdb_trap)
> - POP_FSGS
> - pop %rdx
> - mov %rdx,%es
> - pop %rdx
> - mov %rdx,%ds
> + POP_SEGMENTS(%rdx)
> popa /* restore general registers */
> addq $16,%rsp
>
> @@ -1144,23 +1108,13 @@ ttd_from_iret_i: /* on interrupt
> stack */
> pushq $0 /* zero error code */
> pushq $0 /* zero trap number */
> pusha /* save general registers */
> - mov %ds,%rdx /* save segment registers */
> - push %rdx
> - mov %es,%rdx
> - push %rdx
> - push %fs
> - push %gs
> + PUSH_SEGMENTS_ISR(%rdx)
> ud2 // TEST it
> movq %rsp,%rdx /* pass regs, */
> movq $0,%rsi /* code, */
> movq $-1,%rdi /* type to kdb */
> call _kttd_trap
> - pop %gs /* restore segment registers */
> - pop %fs
> - pop %rdx
> - mov %rdx,%es
> - pop %rdx
> - mov %rdx,%ds
> + POP_SEGMENTS_ISR(%rdx)
> popa /* restore general registers */
> addq $16,%rsp
>
> @@ -1193,20 +1147,8 @@ syscall_entry_2:
> pushq $0 /* clear trap number slot */
>
> pusha /* save the general registers */
> - movq %ds,%rdx /* and the segment registers */
> - pushq %rdx
> - movq %es,%rdx
> - pushq %rdx
> - pushq %fs
> - pushq %gs
> - pushq $0 // gsbase
> - pushq $0 // fsbase
> -
> - mov %ss,%dx /* switch to kernel data segment */
> - mov %dx,%ds
> - mov %dx,%es
> - mov %dx,%fs
> - mov %dx,%gs
> + PUSH_SEGMENTS(%rdx)
> + SET_KERNEL_SEGMENTS
>
> /*
> * Shuffle eflags,eip,cs into proper places
> --
> 2.39.2
>
>
--
Samuel
---
Pour une évaluation indépendante, transparente et rigoureuse !
Je soutiens la Commission d'Évaluation de l'Inria.
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- Re: [PATCH 4/5] x86_64: refactor segment register handling,
Samuel Thibault <=