From: Hans-Jörg Höxer
Subject: SEV-ES guest: early GHCB allocation and early #VC trap handling 1/3
To:
Date: Thu, 26 Jun 2025 14:20:00 +0200

Hi,

these are the last three diffs missing for running SEV-ES enabled guests
on vmm(4)/vmd(8).  This is 1/3.

When we are a SEV-ES guest, claim two pages to be used for GHCB
communication with vmm(4).  As we want to share these two pages with
vmm(4), we re-map them as plain text.  We do this as soon as long mode
is reached.

Arriving in init_x86_64(), reset the IDT we used during locore0 and
install a new #VC trap handler.  This is the actual handler we will use
from now on.  However, as we are not fully up and running yet, we use a
simple and kernel-only entry stub for the #VC exception.  At the end of
early bootstrap, init_x86_64() will install the actual and final IDT.
Note: For now this will reset and disable the #VC handler.  Therefore,
SEV-ES enabled guests are not working yet.

The #VC handler vctrap() is also hooked into kerntrap() and usertrap().
Communication with vmm(4) using the GHCB protocol is also ready.
However, the actual handling of cpuid, in/out, wrmsr/rdmsr is left out
for now.  Again, SEV-ES enabled guests are not working yet.

Take care,
HJ.

--

commit 03c18bb384d7016050aaae9f9f7f2babd2ed4616
Author: Hans-Joerg Hoexer
Date:   Mon Jan 13 15:17:37 2025 +0100

    SEV-ES guest: early GHCB allocation and early #VC trap handling

    When we are a SEV-ES guest, claim two pages to be used for GHCB
    communication with vmm(4).  As we want to share these two pages
    with vmm(4), we re-map them as plain text.  We do this as soon as
    long mode is reached.

    Arriving in init_x86_64(), reset the IDT we used during locore0 and
    install a new #VC trap handler.  This is the actual handler we will
    use from now on.  However, as we are not fully up and running yet,
    we use a simple and kernel-only entry stub for the #VC exception.
    At the end of early bootstrap, init_x86_64() will install the
    actual and final IDT.  Note: For now this will reset and disable
    the #VC handler.  Therefore, SEV-ES enabled guests are not working
    yet.

    The #VC handler vctrap() is also hooked into kerntrap() and
    usertrap().  Communication with vmm(4) using the GHCB protocol is
    also ready.  However, the actual handling of cpuid, in/out,
    wrmsr/rdmsr is left out for now.  Again, SEV-ES enabled guests are
    not working yet.

diff --git a/sys/arch/amd64/amd64/ghcb.c b/sys/arch/amd64/amd64/ghcb.c
index ae5aecf13f9..bb1027f82ae 100644
--- a/sys/arch/amd64/amd64/ghcb.c
+++ b/sys/arch/amd64/amd64/ghcb.c
@@ -28,6 +28,9 @@ const uint64_t ghcb_sz_masks[] = {
 	0x00000000ffffffffULL, 0xffffffffffffffffULL
 };

+vaddr_t ghcb_vaddr;
+paddr_t ghcb_paddr;
+
 /*
  * ghcb_clear
  *
diff --git a/sys/arch/amd64/amd64/locore0.S b/sys/arch/amd64/amd64/locore0.S
index 3f79af0d3cc..b14e053785c 100644
--- a/sys/arch/amd64/amd64/locore0.S
+++ b/sys/arch/amd64/amd64/locore0.S
@@ -439,7 +439,9 @@ cont:
 #define PROC0_DMP2_OFF	(PROC0_DMP3_OFF + NDML3_ENTRIES * NBPG)
 #define TABLESIZE \
     ((NKL4_KIMG_ENTRIES + TABLE_L3_ENTRIES + TABLE_L2_ENTRIES + 1 + UPAGES + \
-    NDML3_ENTRIES + NDML2_ENTRIES + 3) * NBPG)
+    NDML3_ENTRIES + NDML2_ENTRIES + 5) * NBPG)
+#define PROC0_GHCB_OFF	(TABLESIZE - 5 * NBPG)
+#define GHCB_SIZE	(2 * NBPG)

 #define fillkpt \
 	pushl	%ebp			;	/* save */ \
 1:	movl	%eax,(%ebx)		;	/* store phys addr */ \
 	movl	%ebp,4(%ebx)		;	/* upper 32 bits */ \
 	addl	$8,%ebx			;	/* next pte/pde */ \
 	addl	$NBPG,%eax		;	/* next phys page */ \
 	loop	1b			;	/* till finished */ \
 	popl	%ebp
@@ -463,6 +465,17 @@ cont:
 	loop	1b			;	/* till finished */ \
 	popl	%ebp

+
+#define fillkpt_nx_nc \
+	pushl	%ebp			;	/* save */ \
+	movl	RELOC((pg_nx + 4)), %ebp	;	/* NX bit? */ \
+1:	movl	%eax,(%ebx)		;	/* store phys addr */ \
+	movl	%ebp,4(%ebx)		;	/* upper 32 bits */ \
+	addl	$8,%ebx			;	/* next pte/pde */ \
+	addl	$NBPG,%eax		;	/* next phys page */ \
+	loop	1b			;	/* till finished */ \
+	popl	%ebp
+
 /* Find end of kernel image. */
 	movl	$RELOC(end),%edi
 #if (NKSYMS || defined(DDB))
@@ -569,6 +582,19 @@ map_tables:
 	shrl	$PGSHIFT,%ecx
 	fillkpt_nx

+	/* When in SEV-ES guestmode, remap GHCB shared (ie. unencrypted) */
+	movl	RELOC(cpu_sev_guestmode),%eax
+	testl	$SEV_STAT_ES_ENABLED,%eax
+	jz	.Lnoghcb
+	pushl	%ebx			/* save current slot */
+	subl	$(5 << 3),%ebx		/* move back to slot of GHCB */
+	leal	(PROC0_GHCB_OFF)(%esi),%eax
+	orl	$(PG_V|PG_KW), %eax
+	movl	$(GHCB_SIZE>>PGSHIFT), %ecx
+	fillkpt_nx_nc
+	popl	%ebx			/* continue with slot saved above */
+
+.Lnoghcb:
 	/* Map ISA I/O mem (later atdevbase) RW, NX */
 	movl	$(IOM_BEGIN|PG_V|PG_KW/*|PG_N*/),%eax
 	movl	$(IOM_SIZE>>PGSHIFT),%ecx
@@ -778,6 +804,15 @@ longmode_hi:
 	addq	%rsi,%rdx
 	movq	%rdx,atdevbase(%rip)

+	/* Relocate GHCB. */
+	movq	cpu_sev_guestmode(%rip),%rax
+	testq	$SEV_STAT_ES_ENABLED,%rax
+	jz	.Lnoghcbreloc
+	movq	$(PROC0_GHCB_OFF+KERNBASE),%rdx
+	addq	%rsi,%rdx
+	movq	%rdx,ghcb_vaddr(%rip)
+
+.Lnoghcbreloc:
 	/* Record start of symbols */
 	movq	$__kernel_bss_end, ssym(%rip)

@@ -800,7 +835,7 @@ longmode_hi:
 	movw	%ax,%fs

 	leaq	TABLESIZE(%rsi),%rdi
-	subq	$(NBPG*3), %rdi
+	subq	$(NBPG*5), %rdi

 	/* XXX merge these */
 	call	init_x86_64
diff --git a/sys/arch/amd64/amd64/machdep.c b/sys/arch/amd64/amd64/machdep.c
index d9b34f2cde4..bc454f03ba3 100644
--- a/sys/arch/amd64/amd64/machdep.c
+++ b/sys/arch/amd64/amd64/machdep.c
@@ -100,6 +100,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
@@ -491,6 +492,7 @@ bios_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
 extern int tsc_is_invariant;
 extern int amd64_has_xcrypt;
 extern int need_retpoline;
+extern int cpu_sev_guestmode;

 const struct sysctl_bounded_args cpuctl_vars[] = {
 	{ CPU_LIDACTION, &lid_action, -1, 2 },
@@ -1314,6 +1316,37 @@ cpu_init_idt(void)
 	lidt(&region);
 }

+uint64_t early_gdt[GDT_SIZE / 8];
+
+void
+cpu_init_early_vctrap(paddr_t addr)
+{
+	struct region_descriptor region;
+
+	extern void Xvctrap_early(void);
+
+	/* Setup temporary "early" longmode GDT, will be reset soon */
+	memset(early_gdt, 0, sizeof(early_gdt));
+	set_mem_segment(GDT_ADDR_MEM(early_gdt, GCODE_SEL), 0, 0xfffff,
+	    SDT_MEMERA, SEL_KPL, 1, 0, 1);
+	set_mem_segment(GDT_ADDR_MEM(early_gdt, GDATA_SEL), 0, 0xfffff,
+	    SDT_MEMRWA, SEL_KPL, 1, 0, 1);
+	setregion(&region, early_gdt, GDT_SIZE - 1);
+	lgdt(&region);
+
+	/* Setup temporary "early" longmode #VC entry, will be reset soon */
+	idt = early_idt;
+	memset((void *)idt, 0, NIDT * sizeof(idt[0]));
+	setgate(&idt[T_VC], Xvctrap_early, 0, SDT_SYS386IGT, SEL_KPL,
+	    GSEL(GCODE_SEL, SEL_KPL));
+	cpu_init_idt();
+
+	/* Tell vmm(4) about our GHCB. */
+	ghcb_paddr = addr;
+	memset((void *)ghcb_vaddr, 0, 2 * PAGE_SIZE);
+	wrmsr(MSR_SEV_GHCB, ghcb_paddr);
+}
+
 void
 cpu_init_extents(void)
 {
@@ -1434,6 +1467,13 @@ init_x86_64(paddr_t first_avail)
 	int x, ist;
 	uint64_t max_dm_size = ((uint64_t)512 * NUM_L4_SLOT_DIRECT) << 30;

+	/*
+	 * locore0 mapped 2 pages for use as GHCB before pmap is initialized.
+	 */
+	if (ISSET(cpu_sev_guestmode, SEV_STAT_ES_ENABLED))
+		cpu_init_early_vctrap(first_avail);
+	first_avail += 2 * NBPG;
+
 	/*
 	 * locore0 mapped 3 pages for use before the pmap is initialized
 	 * starting at first_avail. These pages are currently used by
diff --git a/sys/arch/amd64/amd64/trap.c b/sys/arch/amd64/amd64/trap.c
index ff432b7af6d..bb445a60894 100644
--- a/sys/arch/amd64/amd64/trap.c
+++ b/sys/arch/amd64/amd64/trap.c
@@ -86,6 +86,8 @@
 #include
 #include
 #include
+#include
+#include
 #ifdef DDB
 #include
 #include
@@ -95,6 +97,7 @@
 int	upageflttrap(struct trapframe *, uint64_t);
 int	kpageflttrap(struct trapframe *, uint64_t);
+int	vctrap(struct trapframe *);
 void	kerntrap(struct trapframe *);
 void	usertrap(struct trapframe *);
 void	ast(struct trapframe *);
@@ -123,6 +126,7 @@ const char * const trap_type[] = {
 	"SSE FP exception",			/* 19 T_XMM */
 	"virtualization exception",		/* 20 T_VE */
 	"control protection exception",		/* 21 T_CP */
+	"VMM communication exception",		/* 29 T_VC */
 };
 const int	trap_types = nitems(trap_type);

@@ -297,6 +301,52 @@ kpageflttrap(struct trapframe *frame, uint64_t cr2)
 	return 1;
 }

+int
+vctrap(struct trapframe *frame)
+{
+	uint64_t sw_exitcode, sw_exitinfo1, sw_exitinfo2;
+	struct ghcb_sync syncout, syncin;
+	struct ghcb_sa *ghcb;
+
+	intr_disable();
+
+	memset(&syncout, 0, sizeof(syncout));
+	memset(&syncin, 0, sizeof(syncin));
+
+	sw_exitcode = frame->tf_err;
+	sw_exitinfo1 = 0;
+	sw_exitinfo2 = 0;
+
+	switch (sw_exitcode) {
+	default:
+		panic("invalid exit code 0x%llx", sw_exitcode);
+	}
+
+	/* Always required */
+	ghcb_sync_val(GHCB_SW_EXITCODE, GHCB_SZ64, &syncout);
+	ghcb_sync_val(GHCB_SW_EXITINFO1, GHCB_SZ64, &syncout);
+	ghcb_sync_val(GHCB_SW_EXITINFO2, GHCB_SZ64, &syncout);
+
+	/* Sync out to GHCB */
+	ghcb = (struct ghcb_sa *)ghcb_vaddr;
+	ghcb_sync_out(frame, sw_exitcode, sw_exitinfo1, sw_exitinfo2, ghcb,
+	    &syncout);
+
+	/* Call hypervisor. */
+	vmgexit();
+
+	/* Verify response */
+	if (ghcb_verify_bm(ghcb->valid_bitmap, syncin.valid_bitmap)) {
+		ghcb_clear(ghcb);
+		panic("invalid hypervisor response");
+	}
+
+	/* Sync in from GHCB */
+	ghcb_sync_in(frame, ghcb, &syncin);
+
+	return 1;
+}
+
 /*
  * kerntrap(frame):
@@ -348,6 +398,11 @@ kerntrap(struct trapframe *frame)
 		else
 			return;
 #endif /* NISA > 0 */
+
+	case T_VC:
+		if (vctrap(frame))
+			return;
+		goto we_re_toast;
 	}
 }

@@ -427,6 +482,9 @@ usertrap(struct trapframe *frame)
 		code = (frame->tf_err & 0x7fff) < 4 ? ILL_BTCFI : ILL_BADSTK;
 		break;
+	case T_VC:
+		vctrap(frame);
+		goto out;
 	case T_PAGEFLT:			/* page fault */
 		if (!uvm_map_inentry(p, &p->p_spinentry, PROC_STACK(p),
diff --git a/sys/arch/amd64/amd64/vector.S b/sys/arch/amd64/amd64/vector.S
index 59c786483c3..4a106b0e8e7 100644
--- a/sys/arch/amd64/amd64/vector.S
+++ b/sys/arch/amd64/amd64/vector.S
@@ -513,6 +513,15 @@ END(alltraps_kern)
 END(alltraps_kern_meltdown)
 KTEXT_PAGE_END

+/* #VC trap entry for early bootstrap */
+IDTVEC(vctrap_early)
+	pushq	$T_VC
+	TRAP_ENTRY_KERN		/* early #VC has to be in kernel mode */
+	cld
+	movq	%rsp, %rdi
+	call	vctrap
+	movq	$0,-8(%rsp)
+	INTRFASTEXIT

 /*
  * Macros for interrupt entry, call to handler, and exit.
diff --git a/sys/arch/amd64/include/cpufunc.h b/sys/arch/amd64/include/cpufunc.h
index e4c8d6924d2..9f01b3cb989 100644
--- a/sys/arch/amd64/include/cpufunc.h
+++ b/sys/arch/amd64/include/cpufunc.h
@@ -439,6 +439,13 @@ breakpoint(void)
 	__asm volatile("int $3");
 }

+/* VMGEXIT */
+static __inline void
+vmgexit(void)
+{
+	__asm volatile("rep; vmmcall");
+}
+
 void	amd64_errata(struct cpu_info *);
 void	cpu_ucode_setup(void);
 void	cpu_ucode_apply(struct cpu_info *);
diff --git a/sys/arch/amd64/include/ghcb.h b/sys/arch/amd64/include/ghcb.h
index 5c23b310fa2..ed4bad5dda1 100644
--- a/sys/arch/amd64/include/ghcb.h
+++ b/sys/arch/amd64/include/ghcb.h
@@ -109,6 +109,9 @@ struct ghcb_sync {

 #ifndef _LOCORE

+extern vaddr_t ghcb_vaddr;
+extern paddr_t ghcb_paddr;
+
 void	ghcb_clear(struct ghcb_sa *);
 int	ghcb_valbm_set(uint8_t *, int);
 int	ghcb_valbm_isset(uint8_t *, int);
diff --git a/sys/arch/amd64/include/segments.h b/sys/arch/amd64/include/segments.h
index 308639a6baa..ce284262faa 100644
--- a/sys/arch/amd64/include/segments.h
+++ b/sys/arch/amd64/include/segments.h
@@ -151,6 +151,7 @@ struct region_descriptor {

 #ifdef _KERNEL
 extern struct gate_descriptor *idt;
+extern struct gate_descriptor early_idt[];

 void setgate(struct gate_descriptor *, void *, int, int, int, int);
 void unsetgate(struct gate_descriptor *);
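
PS: Not part of the diff above, but to give an idea of where the empty
switch in vctrap() is heading once the actual exit handling gets added:
a cpuid exit could be wired up roughly as sketched below.  The raw exit
code 0x72 is the SVM CPUID exit code from the GHCB specification; the
GHCB_RAX/RBX/RCX/RDX sync indices are placeholders of mine and may not
match the names ghcb.h actually uses.

	case 0x72:	/* SVM CPUID exit code, placeholder for the real define */
		/*
		 * cpuid needs no instruction decoding: leaf and subleaf
		 * are already in the trapframe.  Hand RAX and RCX to
		 * vmm(4) and ask for RAX, RBX, RCX and RDX back.
		 */
		ghcb_sync_val(GHCB_RAX, GHCB_SZ64, &syncout);
		ghcb_sync_val(GHCB_RCX, GHCB_SZ64, &syncout);
		ghcb_sync_val(GHCB_RAX, GHCB_SZ64, &syncin);
		ghcb_sync_val(GHCB_RBX, GHCB_SZ64, &syncin);
		ghcb_sync_val(GHCB_RCX, GHCB_SZ64, &syncin);
		ghcb_sync_val(GHCB_RDX, GHCB_SZ64, &syncin);
		/*
		 * The hypervisor does not advance the guest %rip for such
		 * exits; cpuid is two bytes, so skip it on return (unless
		 * ghcb_sync_in() already takes care of that).
		 */
		frame->tf_rip += 2;
		break;

in/out and rdmsr/wrmsr follow the same sync-out/sync-in pattern, but
additionally have to decode the faulting instruction to build
sw_exitinfo1 (port, size and direction for I/O; read vs. write for MSR
accesses), which is why they are left out here.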