Index | Thread | Search

From:
Hans-Jörg Höxer <hshoexer@genua.de>
Subject:
SEV-ES guest: early GHCB allocation and early #VC trap handling 1/3
To:
<tech@openbsd.org>
Date:
Thu, 26 Jun 2025 14:20:00 +0200

Download raw body.

Thread
Hi,

these are the last three diffs missing for running SEV-ES enabled guests
on vmm(4)/vmd(8).  This is 1/3.

When we are a SEV-ES guest, claim two pages to be used for GHCB
communication with vmm(4).  As we want to share these two pages
with vmm(4), we re-map them as plain text (unencrypted).  We do
this as soon as long mode is reached.

On arriving in init_x86_64(), reset the IDT we used during locore0
and install a new #VC trap handler.  This is the actual handler we
will use from now on.  However, as we are not fully up and running
yet, we use a simple, kernel-only entry stub for the #VC exception.
At the end of early bootstrap, init_x86_64() will install the actual
and final IDT.

Note: For now, installing the final IDT resets and thereby disables
the #VC handler.  Therefore, SEV-ES enabled guests are not working yet.

The #VC handler vctrap() is also hooked into kerntrap() and usertrap().
Communication with vmm(4) using the GHCB protocol is also ready.
However, the actual handling of cpuid, in/out, wrmsr/rdmsr is left
out for now.  Again, SEV-ES enabled guests are not working yet.

Take care,
HJ.

-- 
commit 03c18bb384d7016050aaae9f9f7f2babd2ed4616
Author: Hans-Joerg Hoexer <hshoexer@genua.de>
Date:   Mon Jan 13 15:17:37 2025 +0100

    SEV-ES guest: early GHCB allocation and early #VC trap handling
    
    When we are a SEV-ES guest, claim two pages to be used for GHCB
    communication with vmm(4).  As we want to share these two pages
    with vmm(4), we re-map them as plain text (unencrypted).  We do
    this as soon as long mode is reached.
    
    On arriving in init_x86_64(), reset the IDT we used during locore0
    and install a new #VC trap handler.  This is the actual handler we
    will use from now on.  However, as we are not fully up and running
    yet, we use a simple, kernel-only entry stub for the #VC exception.
    At the end of early bootstrap, init_x86_64() will install the actual
    and final IDT.
    
    Note: For now, installing the final IDT resets and thereby disables
    the #VC handler.  Therefore, SEV-ES enabled guests are not working yet.
    
    The #VC handler vctrap() is also hooked into kerntrap() and usertrap().
    Communication with vmm(4) using the GHCB protocol is also ready.
    However, the actual handling of cpuid, in/out, wrmsr/rdmsr is left
    out for now.  Again, SEV-ES enabled guests are not working yet.

diff --git a/sys/arch/amd64/amd64/ghcb.c b/sys/arch/amd64/amd64/ghcb.c
index ae5aecf13f9..bb1027f82ae 100644
--- a/sys/arch/amd64/amd64/ghcb.c
+++ b/sys/arch/amd64/amd64/ghcb.c
@@ -28,6 +28,9 @@ const uint64_t ghcb_sz_masks[] = {
     0x00000000ffffffffULL, 0xffffffffffffffffULL
 };
 
+vaddr_t ghcb_vaddr;	/* kernel VA of the GHCB pages, set in locore0 */
+paddr_t ghcb_paddr;	/* PA of the GHCB, set in cpu_init_early_vctrap() */
+
 /*
  * ghcb_clear
  *
diff --git a/sys/arch/amd64/amd64/locore0.S b/sys/arch/amd64/amd64/locore0.S
index 3f79af0d3cc..b14e053785c 100644
--- a/sys/arch/amd64/amd64/locore0.S
+++ b/sys/arch/amd64/amd64/locore0.S
@@ -439,7 +439,9 @@ cont:
 #define PROC0_DMP2_OFF	(PROC0_DMP3_OFF + NDML3_ENTRIES * NBPG)
 #define TABLESIZE \
     ((NKL4_KIMG_ENTRIES + TABLE_L3_ENTRIES + TABLE_L2_ENTRIES + 1 + UPAGES + \
-	NDML3_ENTRIES + NDML2_ENTRIES + 3) * NBPG)
+	NDML3_ENTRIES + NDML2_ENTRIES + 5) * NBPG)
+#define PROC0_GHCB_OFF	(TABLESIZE - 5 * NBPG)	/* first of last 5 pages */
+#define GHCB_SIZE	(2 * NBPG)		/* GHCB spans two pages */
 
 #define fillkpt \
 	pushl	%ebp				;	/* save */ \
@@ -463,6 +465,17 @@ cont:
 	loop	1b				;	/* till finished */ \
 	popl	%ebp
 
+/* Fill %ecx PTEs at (%ebx): NX set; "nc" presumably = not encrypted. */
+#define fillkpt_nx_nc \
+	pushl	%ebp				;	/* save */ \
+	movl	RELOC((pg_nx + 4)), %ebp	;	/* NX bit, upper half */ \
+1:	movl	%eax,(%ebx)			;	/* store phys addr */ \
+	movl	%ebp,4(%ebx)			;	/* upper 32 bits */ \
+	addl	$8,%ebx				;	/* next pte/pde */ \
+	addl	$NBPG,%eax			;	/* next phys page */ \
+	loop	1b				;	/* until %ecx is 0 */ \
+	popl	%ebp
+
 	/* Find end of kernel image. */
 	movl	$RELOC(end),%edi
 #if (NKSYMS || defined(DDB))
@@ -569,6 +582,19 @@ map_tables:
 	shrl	$PGSHIFT,%ecx
 	fillkpt_nx
 
+	/* When in SEV-ES guestmode, remap GHCB shared (ie. unencrypted) */
+	movl	RELOC(cpu_sev_guestmode),%eax
+	testl	$SEV_STAT_ES_ENABLED,%eax
+	jz	.Lnoghcb
+	pushl	%ebx			/* save current slot */
+	subl	$(5 << 3),%ebx		/* move back to slot of GHCB */
+	leal	(PROC0_GHCB_OFF)(%esi),%eax
+	orl	$(PG_V|PG_KW), %eax
+	movl	$(GHCB_SIZE>>PGSHIFT), %ecx
+	fillkpt_nx_nc
+	popl	%ebx			/* continue with slot saved above */
+
+.Lnoghcb:
 	/* Map ISA I/O mem (later atdevbase) RW, NX */
 	movl	$(IOM_BEGIN|PG_V|PG_KW/*|PG_N*/),%eax
 	movl	$(IOM_SIZE>>PGSHIFT),%ecx
@@ -778,6 +804,15 @@ longmode_hi:
 	addq	%rsi,%rdx
 	movq	%rdx,atdevbase(%rip)
 
+	/* Relocate GHCB. */
+	movq	cpu_sev_guestmode(%rip),%rax
+	testq	$SEV_STAT_ES_ENABLED,%rax
+	jz	.Lnoghcbreloc
+	movq	$(PROC0_GHCB_OFF+KERNBASE),%rdx
+	addq	%rsi,%rdx
+	movq	%rdx,ghcb_vaddr(%rip)
+
+.Lnoghcbreloc:
 	/* Record start of symbols */
 	movq	$__kernel_bss_end, ssym(%rip)
 
@@ -800,7 +835,7 @@ longmode_hi:
 	movw	%ax,%fs
 
 	leaq	TABLESIZE(%rsi),%rdi
-	subq	$(NBPG*3), %rdi
+	subq	$(NBPG*5), %rdi
 
 	/* XXX merge these */
 	call	init_x86_64
diff --git a/sys/arch/amd64/amd64/machdep.c b/sys/arch/amd64/amd64/machdep.c
index d9b34f2cde4..bc454f03ba3 100644
--- a/sys/arch/amd64/amd64/machdep.c
+++ b/sys/arch/amd64/amd64/machdep.c
@@ -100,6 +100,7 @@
 #include <machine/mpbiosvar.h>
 #include <machine/kcore.h>
 #include <machine/tss.h>
+#include <machine/ghcb.h>
 
 #include <dev/isa/isareg.h>
 #include <dev/ic/i8042reg.h>
@@ -491,6 +492,7 @@ bios_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
 extern int tsc_is_invariant;
 extern int amd64_has_xcrypt;
 extern int need_retpoline;
+extern int cpu_sev_guestmode;
 
 const struct sysctl_bounded_args cpuctl_vars[] = {
 	{ CPU_LIDACTION, &lid_action, -1, 2 },
@@ -1314,6 +1316,37 @@ cpu_init_idt(void)
 	lidt(&region);
 }
 
+uint64_t early_gdt[GDT_SIZE / 8];
+
+void
+cpu_init_early_vctrap(paddr_t addr)
+{
+	struct region_descriptor region;
+	/* addr: physical address of the two GHCB pages (see ghcb_vaddr). */
+	extern void Xvctrap_early(void);
+
+	/* Temporary early GDT with only kernel code/data; replaced later */
+	memset(early_gdt, 0, sizeof(early_gdt));
+	set_mem_segment(GDT_ADDR_MEM(early_gdt, GCODE_SEL), 0, 0xfffff,
+	    SDT_MEMERA, SEL_KPL, 1, 0, 1);
+	set_mem_segment(GDT_ADDR_MEM(early_gdt, GDATA_SEL), 0, 0xfffff,
+	    SDT_MEMRWA, SEL_KPL, 1, 0, 1);
+	setregion(&region, early_gdt, GDT_SIZE - 1);
+	lgdt(&region);
+
+	/* Temporary early IDT: cleared except for the #VC gate; replaced later */
+	idt = early_idt;
+	memset((void *)idt, 0, NIDT * sizeof(idt[0]));
+	setgate(&idt[T_VC], Xvctrap_early, 0, SDT_SYS386IGT, SEL_KPL,
+	    GSEL(GCODE_SEL, SEL_KPL));
+	cpu_init_idt();
+
+	/* Clear the GHCB pages and tell vmm(4) their physical address. */
+	ghcb_paddr = addr;
+	memset((void *)ghcb_vaddr, 0, 2 * PAGE_SIZE);
+	wrmsr(MSR_SEV_GHCB, ghcb_paddr);
+}
+
 void
 cpu_init_extents(void)
 {
@@ -1434,6 +1467,13 @@ init_x86_64(paddr_t first_avail)
 	int x, ist;
 	uint64_t max_dm_size = ((uint64_t)512 * NUM_L4_SLOT_DIRECT) << 30;
 
+	/*
+	 * locore0 mapped 2 pages for use as GHCB before pmap is initialized.
+	 */
+	if (ISSET(cpu_sev_guestmode, SEV_STAT_ES_ENABLED))
+		cpu_init_early_vctrap(first_avail);
+	first_avail += 2 * NBPG;
+
 	/*
 	 * locore0 mapped 3 pages for use before the pmap is initialized
 	 * starting at first_avail. These pages are currently used by
diff --git a/sys/arch/amd64/amd64/trap.c b/sys/arch/amd64/amd64/trap.c
index ff432b7af6d..bb445a60894 100644
--- a/sys/arch/amd64/amd64/trap.c
+++ b/sys/arch/amd64/amd64/trap.c
@@ -86,6 +86,8 @@
 #include <machine/fpu.h>
 #include <machine/psl.h>
 #include <machine/trap.h>
+#include <machine/ghcb.h>
+#include <machine/vmmvar.h>
 #ifdef DDB
 #include <ddb/db_output.h>
 #include <machine/db_machdep.h>
@@ -95,6 +97,7 @@
 
 int	upageflttrap(struct trapframe *, uint64_t);
 int	kpageflttrap(struct trapframe *, uint64_t);
+int	vctrap(struct trapframe *);
 void	kerntrap(struct trapframe *);
 void	usertrap(struct trapframe *);
 void	ast(struct trapframe *);
@@ -123,6 +126,7 @@ const char * const trap_type[] = {
 	"SSE FP exception",			/* 19 T_XMM */
 	"virtualization exception",		/* 20 T_VE */
 	"control protection exception",		/* 21 T_CP */
+	"VMM communication exception",		/* 29 T_VC */
 };
 const int	trap_types = nitems(trap_type);
 
@@ -297,6 +301,52 @@ kpageflttrap(struct trapframe *frame, uint64_t cr2)
 	return 1;
 }
 
+int
+vctrap(struct trapframe *frame)
+{
+	uint64_t	 sw_exitcode, sw_exitinfo1, sw_exitinfo2;
+	struct ghcb_sync syncout, syncin;
+	struct ghcb_sa	*ghcb;
+	/* #VC handler: hand the exit to the hypervisor via the GHCB. */
+	intr_disable();
+
+	memset(&syncout, 0, sizeof(syncout));
+	memset(&syncin, 0, sizeof(syncin));
+
+	sw_exitcode = frame->tf_err;	/* #VC error code is the exit code */
+	sw_exitinfo1 = 0;
+	sw_exitinfo2 = 0;
+
+	switch (sw_exitcode) {
+	default:	/* no exit code is handled yet, so every #VC panics */
+		panic("invalid exit code 0x%llx", sw_exitcode);
+	}
+
+	/* Exit code and exit info 1/2 are always synced out */
+	ghcb_sync_val(GHCB_SW_EXITCODE, GHCB_SZ64, &syncout);
+	ghcb_sync_val(GHCB_SW_EXITINFO1, GHCB_SZ64, &syncout);
+	ghcb_sync_val(GHCB_SW_EXITINFO2, GHCB_SZ64, &syncout);
+
+	/* Sync out to the GHCB mapped at ghcb_vaddr */
+	ghcb = (struct ghcb_sa *)ghcb_vaddr;
+	ghcb_sync_out(frame, sw_exitcode, sw_exitinfo1, sw_exitinfo2, ghcb,
+	    &syncout);
+
+	/* Call hypervisor. */
+	vmgexit();
+
+	/* Verify response: valid bitmap must pass the check against syncin */
+	if (ghcb_verify_bm(ghcb->valid_bitmap, syncin.valid_bitmap)) {
+		ghcb_clear(ghcb);
+		panic("invalid hypervisor response");
+	}
+
+	/* Sync in from GHCB */
+	ghcb_sync_in(frame, ghcb, &syncin);
+
+	return 1;	/* always reports the trap as handled */
+}
+
 
 /*
  * kerntrap(frame):
@@ -348,6 +398,11 @@ kerntrap(struct trapframe *frame)
 		else
 			return;
 #endif /* NISA > 0 */
+
+	case T_VC:
+		if (vctrap(frame))
+			return;
+		goto we_re_toast;
 	}
 }
 
@@ -427,6 +482,9 @@ usertrap(struct trapframe *frame)
 		code = (frame->tf_err & 0x7fff) < 4 ? ILL_BTCFI
 		    : ILL_BADSTK;
 		break;
+	case T_VC:
+		vctrap(frame);
+		goto out;
 
 	case T_PAGEFLT:			/* page fault */
 		if (!uvm_map_inentry(p, &p->p_spinentry, PROC_STACK(p),
diff --git a/sys/arch/amd64/amd64/vector.S b/sys/arch/amd64/amd64/vector.S
index 59c786483c3..4a106b0e8e7 100644
--- a/sys/arch/amd64/amd64/vector.S
+++ b/sys/arch/amd64/amd64/vector.S
@@ -513,6 +513,15 @@ END(alltraps_kern)
 END(alltraps_kern_meltdown)
 KTEXT_PAGE_END
 
+/* #VC trap entry for early bootstrap: kernel-mode only, calls vctrap() */
+IDTVEC(vctrap_early)
+	pushq	$T_VC			/* presumably the frame's trap number */
+	TRAP_ENTRY_KERN		/* early #VC has to be in kernel mode */
+	cld
+	movq	%rsp, %rdi		/* arg 0: trapframe pointer */
+	call	vctrap
+	movq	$0,-8(%rsp)		/* zap return address left by call */
+	INTRFASTEXIT
 
 /*
  * Macros for interrupt entry, call to handler, and exit.
diff --git a/sys/arch/amd64/include/cpufunc.h b/sys/arch/amd64/include/cpufunc.h
index e4c8d6924d2..9f01b3cb989 100644
--- a/sys/arch/amd64/include/cpufunc.h
+++ b/sys/arch/amd64/include/cpufunc.h
@@ -439,6 +439,13 @@ breakpoint(void)
 	__asm volatile("int $3");
 }
 
+/* VMGEXIT: exit to the hypervisor; "memory" orders GHCB accesses around it */
+static __inline void
+vmgexit(void)
+{
+	__asm volatile("rep; vmmcall" : : : "memory");
+}
+
 void amd64_errata(struct cpu_info *);
 void cpu_ucode_setup(void);
 void cpu_ucode_apply(struct cpu_info *);
diff --git a/sys/arch/amd64/include/ghcb.h b/sys/arch/amd64/include/ghcb.h
index 5c23b310fa2..ed4bad5dda1 100644
--- a/sys/arch/amd64/include/ghcb.h
+++ b/sys/arch/amd64/include/ghcb.h
@@ -109,6 +109,9 @@ struct ghcb_sync {
 
 #ifndef _LOCORE
 
+extern vaddr_t ghcb_vaddr;
+extern paddr_t ghcb_paddr;
+
 void	ghcb_clear(struct ghcb_sa *);
 int	ghcb_valbm_set(uint8_t *, int);
 int	ghcb_valbm_isset(uint8_t *, int);
diff --git a/sys/arch/amd64/include/segments.h b/sys/arch/amd64/include/segments.h
index 308639a6baa..ce284262faa 100644
--- a/sys/arch/amd64/include/segments.h
+++ b/sys/arch/amd64/include/segments.h
@@ -151,6 +151,7 @@ struct region_descriptor {
 
 #ifdef _KERNEL
 extern struct gate_descriptor *idt;
+extern struct gate_descriptor early_idt[];
 
 void setgate(struct gate_descriptor *, void *, int, int, int, int);
 void unsetgate(struct gate_descriptor *);