Download raw body.
SEV-ES guest: early GHCB allocation and early #VC trap handling 1/3
SEV-ES guest: early GHCB allocation and early #VC trap handling 1/3
SEV-ES guest: early GHCB allocation and early #VC trap handling 1/3
On Thu, Jun 26, 2025 at 02:20:00PM +0200, Hans-Jörg Höxer wrote:
> Hi,
>
> these are the last three diffs missing for running SEV-ES enabled guests
> on vmm(4)/vmd(8). This is 1/3.
>
> When we are a SEV-ES guest claim two pages to be used for GHCB
> communication with vmm(4). As we want to share these two pages
> with vmm(4) we re-map them plain text. We do this as soon as long
> mode reached.
>
> Arriving in init_x86_64() reset the IDT we used during locore0 and
> install a new #VC trap handler. This is the actual handler we will
> use from now on. However, as we are not fully up and running yet,
> we use a simple and kernel-only entry stub for the #VC exception.
> At the end of early bootstrap init_x86_64() will install the actual
> and final IDT.
>
> Note: For now this will reset and disable the #VC handler. Therefore,
> SEV-ES enabled guest are not working yet.
>
> The #VC handler vctrap() is also hooked into kerntrap() and usertrap().
> Communication with vmm(4) using the GHCB protocol is also ready.
> However, the actual handling of cpuid, in/out, wrmsr/rdmsr is left
> out for now. Again, SEV-ES enabled guest are not working yet.
>
> Take care,
> HJ.
Couple minor nits/questions below, but ok mlarkin nonetheless.
-ml
>
> --
> commit 03c18bb384d7016050aaae9f9f7f2babd2ed4616
> Author: Hans-Joerg Hoexer <hshoexer@genua.de>
> Date: Mon Jan 13 15:17:37 2025 +0100
>
> SEV-ES guest: early GHCB allocation and early #VC trap handling
>
> When we are a SEV-ES guest claim two pages to be used for GHCB
> communication with vmm(4). As we want to share these two pages
> with vmm(4) we re-map them plain text. We do this as soon as long
> mode reached.
>
> Arriving in init_x86_64() reset the IDT we used during locore0 and
> install a new #VC trap handler. This is the actual handler we will
> use from now on. However, as we are not fully up and running yet,
> we use a simple and kernel-only entry stub for the #VC exception.
> At the end of early bootstrap init_x86_64() will install the actual
> and final IDT.
>
> Note: For now this will reset and disable the #VC handler. Therefore,
> SEV-ES enabled guest are not working yet.
>
> The #VC handler vctrap() is also hooked into kerntrap() and usertrap().
> Communication with vmm(4) using the GHCB protocol is also ready.
> However, the actual handling of cpuid, in/out, wrmsr/rdmsr is left
> out for now. Again, SEV-ES enabled guest are not working yet.
>
> diff --git a/sys/arch/amd64/amd64/ghcb.c b/sys/arch/amd64/amd64/ghcb.c
> index ae5aecf13f9..bb1027f82ae 100644
> --- a/sys/arch/amd64/amd64/ghcb.c
> +++ b/sys/arch/amd64/amd64/ghcb.c
> @@ -28,6 +28,9 @@ const uint64_t ghcb_sz_masks[] = {
> 0x00000000ffffffffULL, 0xffffffffffffffffULL
> };
>
> +vaddr_t ghcb_vaddr;
> +paddr_t ghcb_paddr;
> +
> /*
> * ghcb_clear
> *
> diff --git a/sys/arch/amd64/amd64/locore0.S b/sys/arch/amd64/amd64/locore0.S
> index 3f79af0d3cc..b14e053785c 100644
> --- a/sys/arch/amd64/amd64/locore0.S
> +++ b/sys/arch/amd64/amd64/locore0.S
> @@ -439,7 +439,9 @@ cont:
> #define PROC0_DMP2_OFF (PROC0_DMP3_OFF + NDML3_ENTRIES * NBPG)
> #define TABLESIZE \
> ((NKL4_KIMG_ENTRIES + TABLE_L3_ENTRIES + TABLE_L2_ENTRIES + 1 + UPAGES + \
> - NDML3_ENTRIES + NDML2_ENTRIES + 3) * NBPG)
> + NDML3_ENTRIES + NDML2_ENTRIES + 5) * NBPG)
> +#define PROC0_GHCB_OFF (TABLESIZE - 5 * NBPG)
> +#define GHCB_SIZE (2 * NBPG)
>
> #define fillkpt \
> pushl %ebp ; /* save */ \
> @@ -463,6 +465,17 @@ cont:
> loop 1b ; /* till finished */ \
> popl %ebp
>
> +
> +#define fillkpt_nx_nc \
> + pushl %ebp ; /* save */ \
> + movl RELOC((pg_nx + 4)), %ebp ; /* NX bit? */ \
> +1: movl %eax,(%ebx) ; /* store phys addr */ \
> + movl %ebp,4(%ebx) ; /* upper 32 bits */ \
> + addl $8,%ebx ; /* next pte/pde */ \
> + addl $NBPG,%eax ; /* next phys page */ \
> + loop 1b ; /* till finished */ \
> + popl %ebp
> +
> /* Find end of kernel image. */
> movl $RELOC(end),%edi
> #if (NKSYMS || defined(DDB))
> @@ -569,6 +582,19 @@ map_tables:
> shrl $PGSHIFT,%ecx
> fillkpt_nx
>
> + /* When in SEV-ES guestmode, remap GHCB shared (ie. unencrypted) */
> + movl RELOC(cpu_sev_guestmode),%eax
> + testl $SEV_STAT_ES_ENABLED,%eax
> + jz .Lnoghcb
> + pushl %ebx /* save current slot */
> + subl $(5 << 3),%ebx /* move back to slot of GHCB */
> + leal (PROC0_GHCB_OFF)(%esi),%eax
> + orl $(PG_V|PG_KW), %eax
> + movl $(GHCB_SIZE>>PGSHIFT), %ecx
> + fillkpt_nx_nc
> + popl %ebx /* continue with slot saved above */
> +
> +.Lnoghcb:
> /* Map ISA I/O mem (later atdevbase) RW, NX */
> movl $(IOM_BEGIN|PG_V|PG_KW/*|PG_N*/),%eax
> movl $(IOM_SIZE>>PGSHIFT),%ecx
> @@ -778,6 +804,15 @@ longmode_hi:
> addq %rsi,%rdx
> movq %rdx,atdevbase(%rip)
>
> + /* Relocate GHCB. */
> + movq cpu_sev_guestmode(%rip),%rax
> + testq $SEV_STAT_ES_ENABLED,%rax
> + jz .Lnoghcbreloc
> + movq $(PROC0_GHCB_OFF+KERNBASE),%rdx
> + addq %rsi,%rdx
> + movq %rdx,ghcb_vaddr(%rip)
> +
> +.Lnoghcbreloc:
> /* Record start of symbols */
> movq $__kernel_bss_end, ssym(%rip)
>
> @@ -800,7 +835,7 @@ longmode_hi:
> movw %ax,%fs
>
> leaq TABLESIZE(%rsi),%rdi
> - subq $(NBPG*3), %rdi
> + subq $(NBPG*5), %rdi
>
> /* XXX merge these */
> call init_x86_64
> diff --git a/sys/arch/amd64/amd64/machdep.c b/sys/arch/amd64/amd64/machdep.c
> index d9b34f2cde4..bc454f03ba3 100644
> --- a/sys/arch/amd64/amd64/machdep.c
> +++ b/sys/arch/amd64/amd64/machdep.c
> @@ -100,6 +100,7 @@
> #include <machine/mpbiosvar.h>
> #include <machine/kcore.h>
> #include <machine/tss.h>
> +#include <machine/ghcb.h>
>
> #include <dev/isa/isareg.h>
> #include <dev/ic/i8042reg.h>
> @@ -491,6 +492,7 @@ bios_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
> extern int tsc_is_invariant;
> extern int amd64_has_xcrypt;
> extern int need_retpoline;
> +extern int cpu_sev_guestmode;
>
> const struct sysctl_bounded_args cpuctl_vars[] = {
> { CPU_LIDACTION, &lid_action, -1, 2 },
> @@ -1314,6 +1316,37 @@ cpu_init_idt(void)
> lidt(®ion);
> }
>
> +uint64_t early_gdt[GDT_SIZE / 8];
> +
> +void
> +cpu_init_early_vctrap(paddr_t addr)
> +{
> + struct region_descriptor region;
> +
> + extern void Xvctrap_early(void);
> +
> + /* Setup temporary "early" longmode GDT, will be reset soon */
> + memset(early_gdt, 0, sizeof(early_gdt));
> + set_mem_segment(GDT_ADDR_MEM(early_gdt, GCODE_SEL), 0, 0xfffff,
> + SDT_MEMERA, SEL_KPL, 1, 0, 1);
> + set_mem_segment(GDT_ADDR_MEM(early_gdt, GDATA_SEL), 0, 0xfffff,
> + SDT_MEMRWA, SEL_KPL, 1, 0, 1);
> + setregion(®ion, early_gdt, GDT_SIZE - 1);
> + lgdt(®ion);
> +
> + /* Setup temporary "early" longmode #VC entry, will be reset soon */
> + idt = early_idt;
> + memset((void *)idt, 0, NIDT * sizeof(idt[0]));
> + setgate(&idt[T_VC], Xvctrap_early, 0, SDT_SYS386IGT, SEL_KPL,
> + GSEL(GCODE_SEL, SEL_KPL));
> + cpu_init_idt();
> +
> + /* Tell vmm(4) about our GHCB. */
> + ghcb_paddr = addr;
> + memset((void *)ghcb_vaddr, 0, 2 * PAGE_SIZE);
> + wrmsr(MSR_SEV_GHCB, ghcb_paddr);
> +}
> +
> void
> cpu_init_extents(void)
> {
> @@ -1434,6 +1467,13 @@ init_x86_64(paddr_t first_avail)
> int x, ist;
> uint64_t max_dm_size = ((uint64_t)512 * NUM_L4_SLOT_DIRECT) << 30;
>
> + /*
> + * locore0 mapped 2 pages for use as GHCB before pmap is initialized.
> + */
> + if (ISSET(cpu_sev_guestmode, SEV_STAT_ES_ENABLED))
> + cpu_init_early_vctrap(first_avail);
> + first_avail += 2 * NBPG;
> +
> /*
> * locore0 mapped 3 pages for use before the pmap is initialized
> * starting at first_avail. These pages are currently used by
> diff --git a/sys/arch/amd64/amd64/trap.c b/sys/arch/amd64/amd64/trap.c
> index ff432b7af6d..bb445a60894 100644
> --- a/sys/arch/amd64/amd64/trap.c
> +++ b/sys/arch/amd64/amd64/trap.c
> @@ -86,6 +86,8 @@
> #include <machine/fpu.h>
> #include <machine/psl.h>
> #include <machine/trap.h>
> +#include <machine/ghcb.h>
> +#include <machine/vmmvar.h>
> #ifdef DDB
> #include <ddb/db_output.h>
> #include <machine/db_machdep.h>
> @@ -95,6 +97,7 @@
>
> int upageflttrap(struct trapframe *, uint64_t);
> int kpageflttrap(struct trapframe *, uint64_t);
> +int vctrap(struct trapframe *);
> void kerntrap(struct trapframe *);
> void usertrap(struct trapframe *);
> void ast(struct trapframe *);
> @@ -123,6 +126,7 @@ const char * const trap_type[] = {
> "SSE FP exception", /* 19 T_XMM */
> "virtualization exception", /* 20 T_VE */
> "control protection exception", /* 21 T_CP */
> + "VMM communication exception", /* 29 T_VC */
> };
> const int trap_types = nitems(trap_type);
>
> @@ -297,6 +301,52 @@ kpageflttrap(struct trapframe *frame, uint64_t cr2)
> return 1;
> }
>
> +int
> +vctrap(struct trapframe *frame)
> +{
> + uint64_t sw_exitcode, sw_exitinfo1, sw_exitinfo2;
> + struct ghcb_sync syncout, syncin;
> + struct ghcb_sa *ghcb;
> +
> + intr_disable();
> +
> + memset(&syncout, 0, sizeof(syncout));
> + memset(&syncin, 0, sizeof(syncin));
> +
> + sw_exitcode = frame->tf_err;
> + sw_exitinfo1 = 0;
> + sw_exitinfo2 = 0;
> +
> + switch (sw_exitcode) {
> + default:
> + panic("invalid exit code 0x%llx", sw_exitcode);
> + }
I was scratching my head on this until tb@ pointed out that this is filled out
in a later diff. What are your thoughts around combining the diffs together
so that if we need to revert diff #3 later for some reason, we aren't left with
a halfway working vctrap()? Eg, "all or nothing"? I'll defer to you and bluhm@.
> +
> + /* Always required */
> + ghcb_sync_val(GHCB_SW_EXITCODE, GHCB_SZ64, &syncout);
> + ghcb_sync_val(GHCB_SW_EXITINFO1, GHCB_SZ64, &syncout);
> + ghcb_sync_val(GHCB_SW_EXITINFO2, GHCB_SZ64, &syncout);
> +
> + /* Sync out to GHCB */
> + ghcb = (struct ghcb_sa *)ghcb_vaddr;
> + ghcb_sync_out(frame, sw_exitcode, sw_exitinfo1, sw_exitinfo2, ghcb,
> + &syncout);
> +
> + /* Call hypervisor. */
> + vmgexit();
> +
> + /* Verify response */
> + if (ghcb_verify_bm(ghcb->valid_bitmap, syncin.valid_bitmap)) {
> + ghcb_clear(ghcb);
> + panic("invalid hypervisor response");
> + }
> +
> + /* Sync in from GHCB */
> + ghcb_sync_in(frame, ghcb, &syncin);
> +
> + return 1;
> +}
> +
>
> /*
> * kerntrap(frame):
> @@ -348,6 +398,11 @@ kerntrap(struct trapframe *frame)
> else
> return;
> #endif /* NISA > 0 */
> +
> + case T_VC:
> + if (vctrap(frame))
> + return;
> + goto we_re_toast;
> }
> }
>
> @@ -427,6 +482,9 @@ usertrap(struct trapframe *frame)
> code = (frame->tf_err & 0x7fff) < 4 ? ILL_BTCFI
> : ILL_BADSTK;
> break;
> + case T_VC:
> + vctrap(frame);
> + goto out;
>
> case T_PAGEFLT: /* page fault */
> if (!uvm_map_inentry(p, &p->p_spinentry, PROC_STACK(p),
> diff --git a/sys/arch/amd64/amd64/vector.S b/sys/arch/amd64/amd64/vector.S
> index 59c786483c3..4a106b0e8e7 100644
> --- a/sys/arch/amd64/amd64/vector.S
> +++ b/sys/arch/amd64/amd64/vector.S
> @@ -513,6 +513,15 @@ END(alltraps_kern)
> END(alltraps_kern_meltdown)
> KTEXT_PAGE_END
>
> +/* #VC trap entry for early bootstrap */
> +IDTVEC(vctrap_early)
> + pushq $T_VC
> + TRAP_ENTRY_KERN /* early #VC has to be in kernel mode */
> + cld
> + movq %rsp, %rdi
> + call vctrap
> + movq $0,-8(%rsp)
> + INTRFASTEXIT
>
> /*
> * Macros for interrupt entry, call to handler, and exit.
> diff --git a/sys/arch/amd64/include/cpufunc.h b/sys/arch/amd64/include/cpufunc.h
> index e4c8d6924d2..9f01b3cb989 100644
> --- a/sys/arch/amd64/include/cpufunc.h
> +++ b/sys/arch/amd64/include/cpufunc.h
> @@ -439,6 +439,13 @@ breakpoint(void)
> __asm volatile("int $3");
> }
>
> +/* VMGEXIT */
> +static __inline void
> +vmgexit(void)
> +{
> + __asm volatile("rep; vmmcall");
A comment here about the conversation we had the other day would be useful for
posterity:
/* rep; vmmcall encodes the vmgexit instruction */
> +}
> +
> void amd64_errata(struct cpu_info *);
> void cpu_ucode_setup(void);
> void cpu_ucode_apply(struct cpu_info *);
> diff --git a/sys/arch/amd64/include/ghcb.h b/sys/arch/amd64/include/ghcb.h
> index 5c23b310fa2..ed4bad5dda1 100644
> --- a/sys/arch/amd64/include/ghcb.h
> +++ b/sys/arch/amd64/include/ghcb.h
> @@ -109,6 +109,9 @@ struct ghcb_sync {
>
> #ifndef _LOCORE
>
> +extern vaddr_t ghcb_vaddr;
> +extern paddr_t ghcb_paddr;
> +
> void ghcb_clear(struct ghcb_sa *);
> int ghcb_valbm_set(uint8_t *, int);
> int ghcb_valbm_isset(uint8_t *, int);
> diff --git a/sys/arch/amd64/include/segments.h b/sys/arch/amd64/include/segments.h
> index 308639a6baa..ce284262faa 100644
> --- a/sys/arch/amd64/include/segments.h
> +++ b/sys/arch/amd64/include/segments.h
> @@ -151,6 +151,7 @@ struct region_descriptor {
>
> #ifdef _KERNEL
> extern struct gate_descriptor *idt;
> +extern struct gate_descriptor early_idt[];
>
> void setgate(struct gate_descriptor *, void *, int, int, int, int);
> void unsetgate(struct gate_descriptor *);
SEV-ES guest: early GHCB allocation and early #VC trap handling 1/3
SEV-ES guest: early GHCB allocation and early #VC trap handling 1/3
SEV-ES guest: early GHCB allocation and early #VC trap handling 1/3