From: Hans-Jörg Höxer
Subject: Re: [EXT] Re: SEV-ES guest: early GHCB allocation and early #VC trap handling 1/3
To:
Cc:
Date: Tue, 1 Jul 2025 10:29:31 +0200

Hi,

thanks for the fixup & committing.  Answers inline.

On Fri, Jun 27, 2025 at 08:03:18PM +0200, Alexander Bluhm wrote:
> ...
> I have committed the first diff with the comment suggested by mlarkin.
>
> Three remarks inline...
>
> > diff --git a/sys/arch/amd64/amd64/ghcb.c b/sys/arch/amd64/amd64/ghcb.c
> > index ae5aecf13f9..bb1027f82ae 100644
> > --- a/sys/arch/amd64/amd64/ghcb.c
> > +++ b/sys/arch/amd64/amd64/ghcb.c
> > @@ -28,6 +28,9 @@ const uint64_t ghcb_sz_masks[] = {
> >  	0x00000000ffffffffULL, 0xffffffffffffffffULL
> >  };
> >
> > +vaddr_t ghcb_vaddr;
> > +paddr_t ghcb_paddr;
> > +
> >  /*
> >   * ghcb_clear
> >   *
> > diff --git a/sys/arch/amd64/amd64/locore0.S b/sys/arch/amd64/amd64/locore0.S
> > index 3f79af0d3cc..b14e053785c 100644
> > --- a/sys/arch/amd64/amd64/locore0.S
> > +++ b/sys/arch/amd64/amd64/locore0.S
> > @@ -439,7 +439,9 @@ cont:
> >  #define PROC0_DMP2_OFF	(PROC0_DMP3_OFF + NDML3_ENTRIES * NBPG)
> >  #define TABLESIZE \
> >      ((NKL4_KIMG_ENTRIES + TABLE_L3_ENTRIES + TABLE_L2_ENTRIES + 1 + UPAGES + \
> > -    NDML3_ENTRIES + NDML2_ENTRIES + 3) * NBPG)
> > +    NDML3_ENTRIES + NDML2_ENTRIES + 5) * NBPG)
> > +#define PROC0_GHCB_OFF	(TABLESIZE - 5 * NBPG)
> > +#define GHCB_SIZE	(2 * NBPG)
> >
> >  #define fillkpt \
> >  	pushl %ebp			;	/* save */ \
> > @@ -463,6 +465,17 @@ cont:
> >  	loop 1b				;	/* till finished */ \
> >  	popl %ebp
> >
> > +
> > +#define fillkpt_nx_nc \
> > +	pushl %ebp			;	/* save */ \
> > +	movl RELOC((pg_nx + 4)), %ebp	;	/* NX bit? */ \
> > +1:	movl %eax,(%ebx)		;	/* store phys addr */ \
> > +	movl %ebp,4(%ebx)		;	/* upper 32 bits */ \
> > +	addl $8,%ebx			;	/* next pte/pde */ \
> > +	addl $NBPG,%eax			;	/* next phys page */ \
> > +	loop 1b				;	/* till finished */ \
> > +	popl %ebp
> > +
> >  	/* Find end of kernel image. */
> >  	movl $RELOC(end),%edi
> >  #if (NKSYMS || defined(DDB))
> > @@ -569,6 +582,19 @@ map_tables:
> >  	shrl $PGSHIFT,%ecx
> >  	fillkpt_nx
> >
> > +	/* When in SEV-ES guest mode, remap GHCB shared (i.e. unencrypted) */
> > +	movl RELOC(cpu_sev_guestmode),%eax
> > +	testl $SEV_STAT_ES_ENABLED,%eax
> > +	jz .Lnoghcb
> > +	pushl %ebx			/* save current slot */
> > +	subl $(5 << 3),%ebx		/* move back to slot of GHCB */
> > +	leal (PROC0_GHCB_OFF)(%esi),%eax
> > +	orl $(PG_V|PG_KW), %eax
> > +	movl $(GHCB_SIZE>>PGSHIFT), %ecx
> > +	fillkpt_nx_nc
> > +	popl %ebx			/* continue with slot saved above */
> > +
> > +.Lnoghcb:
> >  	/* Map ISA I/O mem (later atdevbase) RW, NX */
> >  	movl $(IOM_BEGIN|PG_V|PG_KW/*|PG_N*/),%eax
> >  	movl $(IOM_SIZE>>PGSHIFT),%ecx
> > @@ -778,6 +804,15 @@ longmode_hi:
> >  	addq %rsi,%rdx
> >  	movq %rdx,atdevbase(%rip)
> >
> > +	/* Relocate GHCB. */
> > +	movq cpu_sev_guestmode(%rip),%rax
> > +	testq $SEV_STAT_ES_ENABLED,%rax
> > +	jz .Lnoghcbreloc
> > +	movq $(PROC0_GHCB_OFF+KERNBASE),%rdx
> > +	addq %rsi,%rdx
> > +	movq %rdx,ghcb_vaddr(%rip)
> > +
> > +.Lnoghcbreloc:
> >  	/* Record start of symbols */
> >  	movq $__kernel_bss_end, ssym(%rip)
> >
> > @@ -800,7 +835,7 @@ longmode_hi:
> >  	movw %ax,%fs
> >
> >  	leaq TABLESIZE(%rsi),%rdi
> > -	subq $(NBPG*3), %rdi
> > +	subq $(NBPG*5), %rdi
> >
> >  	/* XXX merge these */
> >  	call init_x86_64
> > diff --git a/sys/arch/amd64/amd64/machdep.c b/sys/arch/amd64/amd64/machdep.c
> > index d9b34f2cde4..bc454f03ba3 100644
> > --- a/sys/arch/amd64/amd64/machdep.c
> > +++ b/sys/arch/amd64/amd64/machdep.c
> > @@ -100,6 +100,7 @@
> >  #include
> >  #include
> >  #include
> > +#include
> >
> >  #include
> >  #include
> > @@ -491,6 +492,7 @@ bios_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
> >  extern int tsc_is_invariant;
> >  extern int amd64_has_xcrypt;
> >  extern int need_retpoline;
> > +extern int cpu_sev_guestmode;
> >
> >  const struct sysctl_bounded_args cpuctl_vars[] = {
> >  	{ CPU_LIDACTION, &lid_action, -1, 2 },
> > @@ -1314,6 +1316,37 @@ cpu_init_idt(void)
> >  	lidt(&region);
> >  }
> >
> > +uint64_t early_gdt[GDT_SIZE / 8];
> > +
> > +void
> > +cpu_init_early_vctrap(paddr_t addr)
> > +{
> > +	struct region_descriptor region;
> > +
> > +	extern void Xvctrap_early(void);
> > +
> > +	/* Setup temporary "early" longmode GDT, will be reset soon */
> > +	memset(early_gdt, 0, sizeof(early_gdt));
> > +	set_mem_segment(GDT_ADDR_MEM(early_gdt, GCODE_SEL), 0, 0xfffff,
> > +	    SDT_MEMERA, SEL_KPL, 1, 0, 1);
> > +	set_mem_segment(GDT_ADDR_MEM(early_gdt, GDATA_SEL), 0, 0xfffff,
> > +	    SDT_MEMRWA, SEL_KPL, 1, 0, 1);
> > +	setregion(&region, early_gdt, GDT_SIZE - 1);
> > +	lgdt(&region);
> > +
> > +	/* Setup temporary "early" longmode #VC entry, will be reset soon */
> > +	idt = early_idt;
> > +	memset((void *)idt, 0, NIDT * sizeof(idt[0]));
> > +	setgate(&idt[T_VC], Xvctrap_early, 0, SDT_SYS386IGT, SEL_KPL,
> > +	    GSEL(GCODE_SEL, SEL_KPL));
> > +	cpu_init_idt();
> > +
> > +	/* Tell vmm(4) about our GHCB. */
> > +	ghcb_paddr = addr;
> > +	memset((void *)ghcb_vaddr, 0, 2 * PAGE_SIZE);
> > +	wrmsr(MSR_SEV_GHCB, ghcb_paddr);
> > +}
> > +
> >  void
> >  cpu_init_extents(void)
> >  {
> > @@ -1434,6 +1467,13 @@ init_x86_64(paddr_t first_avail)
> >  	int x, ist;
> >  	uint64_t max_dm_size = ((uint64_t)512 * NUM_L4_SLOT_DIRECT) << 30;
> >
> > +	/*
> > +	 * locore0 mapped 2 pages for use as GHCB before pmap is initialized.
> > +	 */
> > +	if (ISSET(cpu_sev_guestmode, SEV_STAT_ES_ENABLED))
> > +		cpu_init_early_vctrap(first_avail);
> > +	first_avail += 2 * NBPG;
>
> The GHCB is always allocated, so we skip two pages unconditionally,
> right?  Can we keep these pages usable when running without SEV-ES?

I guess so.  Right now we are wasting two pages in the non-SEV guest
case.  I will improve this.
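Untested sketch of what I have in mind: only skip the pages when they
are actually used as GHCB.  The pages are part of the bootstrap
mapping either way, so in the non-SEV case they should simply be
usable as ordinary early pages:

	/* Only reserve the early GHCB pages when SEV-ES is active. */
	if (ISSET(cpu_sev_guestmode, SEV_STAT_ES_ENABLED)) {
		cpu_init_early_vctrap(first_avail);
		first_avail += 2 * NBPG;
	}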
> > +
> >  	/*
> >  	 * locore0 mapped 3 pages for use before the pmap is initialized
> >  	 * starting at first_avail. These pages are currently used by
> > diff --git a/sys/arch/amd64/amd64/trap.c b/sys/arch/amd64/amd64/trap.c
> > index ff432b7af6d..bb445a60894 100644
> > --- a/sys/arch/amd64/amd64/trap.c
> > +++ b/sys/arch/amd64/amd64/trap.c
> > @@ -86,6 +86,8 @@
> >  #include
> >  #include
> >  #include
> > +#include
> > +#include
> >  #ifdef DDB
> >  #include
> >  #include
> > @@ -95,6 +97,7 @@
> >
> >  int	upageflttrap(struct trapframe *, uint64_t);
> >  int	kpageflttrap(struct trapframe *, uint64_t);
> > +int	vctrap(struct trapframe *);
> >  void	kerntrap(struct trapframe *);
> >  void	usertrap(struct trapframe *);
> >  void	ast(struct trapframe *);
> > @@ -123,6 +126,7 @@ const char * const trap_type[] = {
> >  	"SSE FP exception",		/* 19 T_XMM */
> >  	"virtualization exception",	/* 20 T_VE */
> >  	"control protection exception",	/* 21 T_CP */
> > +	"VMM communication exception",	/* 29 T_VC */
> >  };
> >  const int	trap_types = nitems(trap_type);
> >
> > @@ -297,6 +301,52 @@ kpageflttrap(struct trapframe *frame, uint64_t cr2)
> >  	return 1;
> >  }
> >
> > +int
> > +vctrap(struct trapframe *frame)
> > +{
> > +	uint64_t sw_exitcode, sw_exitinfo1, sw_exitinfo2;
> > +	struct ghcb_sync syncout, syncin;
> > +	struct ghcb_sa *ghcb;
> > +
> > +	intr_disable();
>
> We were debugging some races before.  Back then, interrupts were
> enabled at this point.  So we modified the trap entry point.  I
> think interrupts are always disabled now, and this should be an
> assertion instead of an additional disable.

ack.  The intr_disable() is not needed, interrupts are always
disabled at this point.  And must be!  So an assert is a good idea.
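Sketch, using read_rflags() and PSL_I:

	/* Trap entry runs with interrupts off, and they must stay off. */
	KASSERT((read_rflags() & PSL_I) == 0);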
> > +
> > +	memset(&syncout, 0, sizeof(syncout));
> > +	memset(&syncin, 0, sizeof(syncin));
> > +
> > +	sw_exitcode = frame->tf_err;
> > +	sw_exitinfo1 = 0;
> > +	sw_exitinfo2 = 0;
> > +
> > +	switch (sw_exitcode) {
> > +	default:
> > +		panic("invalid exit code 0x%llx", sw_exitcode);
> > +	}
> > +
> > +	/* Always required */
> > +	ghcb_sync_val(GHCB_SW_EXITCODE, GHCB_SZ64, &syncout);
> > +	ghcb_sync_val(GHCB_SW_EXITINFO1, GHCB_SZ64, &syncout);
> > +	ghcb_sync_val(GHCB_SW_EXITINFO2, GHCB_SZ64, &syncout);
> > +
> > +	/* Sync out to GHCB */
> > +	ghcb = (struct ghcb_sa *)ghcb_vaddr;
> > +	ghcb_sync_out(frame, sw_exitcode, sw_exitinfo1, sw_exitinfo2, ghcb,
> > +	    &syncout);
> > +
> > +	/* Call hypervisor. */
> > +	vmgexit();
> > +
> > +	/* Verify response */
> > +	if (ghcb_verify_bm(ghcb->valid_bitmap, syncin.valid_bitmap)) {
> > +		ghcb_clear(ghcb);
> > +		panic("invalid hypervisor response");
> > +	}
> > +
> > +	/* Sync in from GHCB */
> > +	ghcb_sync_in(frame, ghcb, &syncin);
> > +
> > +	return 1;
> > +}
> > +
> >
> >  /*
> >   * kerntrap(frame):
> > @@ -348,6 +398,11 @@ kerntrap(struct trapframe *frame)
> >  	else
> >  		return;
> >  #endif /* NISA > 0 */
> > +
> > +	case T_VC:
> > +		if (vctrap(frame))
> > +			return;
> > +		goto we_re_toast;
>
> This looks like privileged instructions are possible from userland
> now.  As vctrap() only calls panic and SEV-ES is not possible yet,
> this should not be a security problem.  I think a follow-up diff
> handles this situation correctly.

cpuid is not privileged and is allowed from userland.  That follow-up
diff will only allow cpuid from userland.  rdmsr/wrmsr and in/out
will not be allowed.
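Roughly like this in usertrap(), assuming the exit code pushed in
tf_err can be compared against SVM_VMEXIT_CPUID from vmmvar.h (sketch,
untested):

	case T_VC:
		if (frame->tf_err != SVM_VMEXIT_CPUID) {
			/* Everything but cpuid gets a SIGILL. */
			sig = SIGILL;
			code = ILL_PRVOPC;
			sv.sival_ptr = (void *)frame->tf_rip;
			break;
		}
		vctrap(frame);
		goto out;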
> >  	}
> >  }
> >
> > @@ -427,6 +482,9 @@ usertrap(struct trapframe *frame)
> >  		code = (frame->tf_err & 0x7fff) < 4 ? ILL_BTCFI
> >  		    : ILL_BADSTK;
> >  		break;
> > +	case T_VC:
> > +		vctrap(frame);
> > +		goto out;
> >
> >  	case T_PAGEFLT:		/* page fault */
> >  		if (!uvm_map_inentry(p, &p->p_spinentry, PROC_STACK(p),
> > diff --git a/sys/arch/amd64/amd64/vector.S b/sys/arch/amd64/amd64/vector.S
> > index 59c786483c3..4a106b0e8e7 100644
> > --- a/sys/arch/amd64/amd64/vector.S
> > +++ b/sys/arch/amd64/amd64/vector.S
> > @@ -513,6 +513,15 @@ END(alltraps_kern)
> >  END(alltraps_kern_meltdown)
> >  KTEXT_PAGE_END
> >
> > +/* #VC trap entry for early bootstrap */
> > +IDTVEC(vctrap_early)
> > +	pushq $T_VC
> > +	TRAP_ENTRY_KERN		/* early #VC has to be in kernel mode */
> > +	cld
> > +	movq %rsp, %rdi
> > +	call vctrap
> > +	movq $0,-8(%rsp)
> > +	INTRFASTEXIT
> >
> >  /*
> >   * Macros for interrupt entry, call to handler, and exit.
> > diff --git a/sys/arch/amd64/include/cpufunc.h b/sys/arch/amd64/include/cpufunc.h
> > index e4c8d6924d2..9f01b3cb989 100644
> > --- a/sys/arch/amd64/include/cpufunc.h
> > +++ b/sys/arch/amd64/include/cpufunc.h
> > @@ -439,6 +439,13 @@ breakpoint(void)
> >  	__asm volatile("int $3");
> >  }
> >
> > +/* VMGEXIT */
> > +static __inline void
> > +vmgexit(void)
> > +{
> > +	__asm volatile("rep; vmmcall");
> > +}
> > +
> >  void amd64_errata(struct cpu_info *);
> >  void cpu_ucode_setup(void);
> >  void cpu_ucode_apply(struct cpu_info *);
> > diff --git a/sys/arch/amd64/include/ghcb.h b/sys/arch/amd64/include/ghcb.h
> > index 5c23b310fa2..ed4bad5dda1 100644
> > --- a/sys/arch/amd64/include/ghcb.h
> > +++ b/sys/arch/amd64/include/ghcb.h
> > @@ -109,6 +109,9 @@ struct ghcb_sync {
> >
> >  #ifndef _LOCORE
> >
> > +extern vaddr_t ghcb_vaddr;
> > +extern paddr_t ghcb_paddr;
> > +
> >  void	ghcb_clear(struct ghcb_sa *);
> >  int	ghcb_valbm_set(uint8_t *, int);
> >  int	ghcb_valbm_isset(uint8_t *, int);
> > diff --git a/sys/arch/amd64/include/segments.h b/sys/arch/amd64/include/segments.h
> > index 308639a6baa..ce284262faa 100644
> > --- a/sys/arch/amd64/include/segments.h
> > +++ b/sys/arch/amd64/include/segments.h
> > @@ -151,6 +151,7 @@ struct region_descriptor {
> >
> >  #ifdef _KERNEL
> >  extern struct gate_descriptor *idt;
> > +extern struct gate_descriptor early_idt[];
> >
> >  void	setgate(struct gate_descriptor *, void *, int, int, int, int);
> >  void	unsetgate(struct gate_descriptor *);
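One side note on vmgexit(): VMGEXIT is encoded as a REP prefix on
VMMCALL (0xf3 0x0f 0x01 0xd9), so "rep; vmmcall" is the spelling that
works with assemblers that do not know the vmgexit mnemonic.  Should
we ever want to be explicit about it, emitting the raw bytes ought to
produce the identical instruction (untested):

	static __inline void
	vmgexit(void)
	{
		/* VMGEXIT, i.e. REP-prefixed VMMCALL */
		__asm volatile(".byte 0xf3, 0x0f, 0x01, 0xd9");
	}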