Index | Thread | Search

From:
Hans-Jörg Höxer <hshoexer@genua.de>
Subject:
SEV-ES guest: locore #VC trap handling
To:
<tech@openbsd.org>
Date:
Wed, 21 May 2025 17:10:27 +0200

Download raw body.

Thread
Hi,

this change deals with locore for SEV-ES enabled guests.  The approach
might be a bit controversial.  And it requires a diff for vmd(8), that
I've also attached, to simplify the discussion:

    SEV-ES guest: locore #VC trap handling
    
    When locore is executed by a SEV-ES enabled guest the first cpuid
    instruction will raise a #VC trap that will need to be handled.
    However, at that point in time the guest does not know whether it's
    a guest at all, if it is running on an AMD CPU with SEV-ES enabled,
    etc.
    
    To resolve this chicken-and-egg situation we unconditionally set up a
    temporary #VC trap handler; it will be reset later by init_x86_64().
    
    As vmd(8) configures the runtime for locore to be in 32 bit
    compatibility mode, a raised #VC exception will switch to long mode.
    And the CPU will expect a 64 bit entry in the IDT.  When running
    on e.g. KVM, locore is executed in 32 bit legacy mode.  There the
    CPU will expect a 32 bit entry in the IDT.
    
    To accommodate both situations, we set up both 64 and 32 bit handlers
    in the IDT.
    
    Additionally, vmd(8) has to setup a long mode segment in the GDT.
    
    Both #VC trap handlers use the MSR protocol to talk to the hypervisor
    to emulate CPUID.  The MSR protocol only supports "simple" CPUID
    without subfunctions.
    
    Note:  When SEV-ES is enabled, the hypervisor cannot intercept
    writes to EFER beforehand, only after the write.  Thus on vmm(4)
    with a directly executed kernel we are in compatibility mode and
    EFER_LMA is set.  As resetting EFER_LMA raises #GP we have to
    preserve it.

Take care,
HJ.

--------------------------------------------------------------------------
commit 588a7de9576a84062110b29c2c15b9f2cb9ea4c0
Author: Hans-Joerg Hoexer <hshoexer@genua.de>
Date:   Tue Aug 6 17:56:55 2024 +0200

    SEV-ES guest: locore #VC trap handling
    
    When locore is executed by a SEV-ES enabled guest the first cpuid
    instruction will raise a #VC trap that will need to be handled.
    However, at that point in time the guest does not know whether it's
    a guest at all, if it is running on an AMD CPU with SEV-ES enabled,
    etc.
    
    To resolve this chicken-and-egg situation we unconditionally set up a
    temporary #VC trap handler; it will be reset later by init_x86_64().
    
    As vmd(8) configures the runtime for locore to be in 32 bit
    compatibility mode, a raised #VC exception will switch to long mode.
    And the CPU will expect a 64 bit entry in the IDT.  When running
    on e.g. KVM, locore is executed in 32 bit legacy mode.  There the
    CPU will expect a 32 bit entry in the IDT.
    
    To accommodate both situations, we set up both 64 and 32 bit handlers
    in the IDT.
    
    Additionally, vmd(8) has to setup a long mode segment in the GDT.
    
    Both #VC trap handlers use the MSR protocol to talk to the hypervisor
    to emulate CPUID.  The MSR protocol only supports "simple" CPUID
    without subfunctions.
    
    Note:  When SEV-ES is enabled, the hypervisor cannot intercept
    writes to EFER beforehand, only after the write.  Thus on vmm(4)
    with a directly executed kernel we are in compatibility mode and
    EFER_LMA is set.  As resetting EFER_LMA raises #GP we have to
    preserve it.

diff --git a/sys/arch/amd64/amd64/locore0.S b/sys/arch/amd64/amd64/locore0.S
index 4ef1892c7cc..6b4111717a8 100644
--- a/sys/arch/amd64/amd64/locore0.S
+++ b/sys/arch/amd64/amd64/locore0.S
@@ -111,6 +111,9 @@
 #include <machine/param.h>
 #include <machine/segments.h>
 #include <machine/specialreg.h>
+#include <machine/trap.h>
+#include <machine/ghcb.h>
+#include <machine/vmmvar.h>
 
 /*
  * override user-land alignment before including asm.h
@@ -193,6 +196,58 @@ bi_size_ok:
 	pushl	$PSL_MBO
 	popfl
 
+	/*
+	 * Setup temporary #VC trap handler, in case we are running
+	 * on an AMD CPU in SEV-ES guest mode.  Will be reset by
+	 * init_x86_64().
+	 * We are setting up two handlers:
+	 *
+	 * 1) locore_vc_trap32:  Triggered when we are running in
+	 *    32-bit legacy mode.
+	 *
+	 * 2) locore_vc_trap64:  Triggered when we are running in
+	 *    32-bit compatibility mode.
+	 *
+	 * The latter one is used by vmd(8).
+	 */
+	movl	$RELOC(early_idt), %ecx
+	movl	$T_VC, %edx
+	leal	(%ecx, %edx, 8), %ecx		/* 32bit #VC IDT slot */
+
+	pushl	%cs				/* get current %cs */
+	popl	%ebx
+	shll	$16, %ebx
+
+	movl	$RELOC(locore_vc_trap32), %eax
+	andl	$0x0000ffff, %eax
+	orl	%ebx, %eax			/* use current %cs */
+	movl	%eax, (%ecx)
+
+	movl	$RELOC(locore_vc_trap32), %eax
+	andl	$0xffff0000, %eax
+	orl	$((0x80 | SDT_SYS386IGT) << 8), %eax
+	movl	%eax, 4(%ecx)
+
+	movl	$RELOC(early_idt), %ecx
+	movl	$(2 * T_VC), %edx
+	leal	(%ecx, %edx, 8), %ecx		/* 64bit #VC IDT slot */
+
+	movl	$RELOC(locore_vc_trap64), %eax
+	andl	$0x0000ffff, %eax
+	orl	$(GSEL(3, SEL_KPL) << 16), %eax
+	movl	%eax, (%ecx)
+
+	movl	$RELOC(locore_vc_trap64), %eax
+	andl	$0xffff0000, %eax
+	orl	$((0x80 | SDT_SYS386IGT) << 8), %eax
+	movl	%eax, 4(%ecx)
+	xorl	%eax, %eax
+	movl	%eax, 8(%ecx)
+	movl	%eax, 12(%ecx)
+
+	movl	$RELOC(idtlc), %eax
+	lidt	(%eax)
+
 	/* Reset debug control registers */
 	xorl	%eax,%eax
 	movl	%eax,%dr6
@@ -631,8 +686,14 @@ store_pte:
 	 */
 	movl	$MSR_EFER,%ecx
 	rdmsr
+	movl	%eax,%ebx
 	xorl	%eax,%eax	/* XXX */
 	orl	$(EFER_LME|EFER_SCE),%eax
+	/* If set, preserve LMA */
+	testl	$EFER_LMA,%ebx
+	jz	efer_nxe
+	orl	$EFER_LMA,%eax
+efer_nxe:
 	movl	RELOC((pg_nx + 4)), %ebx
 	cmpl	$0, %ebx
 	je	write_efer
@@ -745,6 +806,118 @@ longmode_hi:
 	call	init_x86_64
 	call	main
 
+vc_cpuid64:
+	shll	$30, %eax		/* requested register */
+	orl	$MSR_PROTO_CPUID_REQ, %eax
+	movl	%ebx, %edx		/* CPUID function */
+	movl	$MSR_SEV_GHCB, %ecx
+	wrmsr
+	rep vmmcall
+	rdmsr
+	ret
+
+	.globl	locore_vc_trap64
+locore_vc_trap64:
+	pushq	%rax
+	pushq	%rbx
+	pushq	%rcx
+	pushq	%rdx
+
+	cmpl	$SVM_VMEXIT_CPUID, 32(%rsp)
+	jne	.Lterminate64
+
+	movl	%eax, %ebx		/* save CPUID function */
+
+	movl	$0, %eax		/* request cpuid, get %eax */
+	call	vc_cpuid64
+	movq	%rdx, 24(%rsp)
+
+	movl	$1, %eax		/* get %ebx */
+	call	vc_cpuid64
+	movq	%rdx, 16(%rsp)
+
+	movl	$2, %eax		/* get %ecx */
+	call	vc_cpuid64
+	movq	%rdx, 8(%rsp)
+
+	movl	$3, %eax		/* get %edx */
+	call	vc_cpuid64
+	movq	%rdx, 0(%rsp)
+
+	popq	%rdx
+	popq	%rcx
+	popq	%rbx
+	popq	%rax
+	addq	$8, %rsp
+	addq	$2, (%rsp)
+	iretq
+
+.Lterminate64:
+	movl	$MSR_PROTO_TERMINATE, %eax
+	movl	$MSR_SEV_GHCB, %ecx
+	wrmsr
+	rep vmmcall
+.Lterm_loop64:
+	hlt
+	jmp	.Lterm_loop64
+
+	.code32
+vc_cpuid32:
+	shll	$30, %eax		/* requested register */
+	orl	$MSR_PROTO_CPUID_REQ, %eax
+	movl	%ebx, %edx		/* CPUID function */
+	movl	$MSR_SEV_GHCB, %ecx
+	wrmsr
+	rep vmmcall
+	rdmsr
+	ret
+
+	.globl	locore_vc_trap32
+locore_vc_trap32:
+	pushl	%eax
+	pushl	%ebx
+	pushl	%ecx
+	pushl	%edx
+
+	cmpl	$SVM_VMEXIT_CPUID, 16(%esp)
+	jne	.Lterminate32
+
+	movl	%eax, %ebx		/* save CPUID function */
+
+	movl	$0, %eax		/* request cpuid, get %eax */
+	call	vc_cpuid32
+	movl	%edx, 12(%esp)
+
+	movl	$1, %eax		/* get %ebx */
+	call	vc_cpuid32
+	movl	%edx, 8(%esp)
+
+	movl	$2, %eax		/* get %ecx */
+	call	vc_cpuid32
+	movl	%edx, 4(%esp)
+
+	movl	$3, %eax		/* get %edx */
+	call	vc_cpuid32
+	movl	%edx, 0(%esp)
+
+	popl	%edx
+	popl	%ecx
+	popl	%ebx
+	popl	%eax
+	addl	$4, %esp
+	addl	$2, (%esp)
+	iret
+
+.Lterminate32:
+	movl	$MSR_PROTO_TERMINATE, %eax
+	movl	$MSR_SEV_GHCB, %ecx
+	wrmsr
+	rep vmmcall
+.Lterm_loop32:
+	hlt
+	jmp	.Lterm_loop32
+
+
 	.section .codepatch,"a"
 	.align	8, 0xcc
 	.globl codepatch_begin
@@ -757,6 +930,20 @@ codepatch_end:
 	.previous
 
 	.data
+	.globl	idtlc			/* temporary locore IDT */
+idtlc:
+	.word	early_idt_end-early_idt-1
+	.long	_RELOC(early_idt)
+	.align 64, 0xcc
+
+	.globl	early_idt
+early_idt:
+	.rept	NIDT
+	.quad	0x0000000000000000
+	.quad	0x0000000000000000
+	.endr
+early_idt_end:
+
 	.globl	gdt64
 gdt64:
 	.word	gdt64_end-gdt64_start-1
diff --git a/sys/arch/amd64/include/ghcb.h b/sys/arch/amd64/include/ghcb.h
index 954e1fa3e3b..fb4ff389ac4 100644
--- a/sys/arch/amd64/include/ghcb.h
+++ b/sys/arch/amd64/include/ghcb.h
@@ -19,6 +19,8 @@
 #ifndef _MACHINE_GHCB_H_
 #define _MACHINE_GHCB_H_
 
+#ifndef _LOCORE
+
 #include <machine/frame.h>
 
 #define GHCB_OFFSET(m)			((m) / 8)
@@ -99,6 +101,7 @@ struct ghcb_sync {
 		int			sz_c;
 		int			sz_d;
 };
+#endif /* !_LOCORE */
 
 
 /* Definitions used with the MSR protocol */
@@ -107,6 +110,8 @@ struct ghcb_sync {
 #define MSR_PROTO_TERMINATE	0x100
 
 
+#ifndef _LOCORE
+
 void	ghcb_clear(struct ghcb_sa *);
 int	ghcb_valbm_set(uint8_t *, int);
 int	ghcb_valbm_isset(uint8_t *, int);
@@ -118,4 +123,6 @@ void	ghcb_sync_out(struct trapframe *, uint64_t, uint64_t, uint64_t,
 	    struct ghcb_sa *, struct ghcb_sync *);
 void	ghcb_sync_in(struct trapframe *, struct ghcb_sa *, struct ghcb_sync *);
 
+#endif /* !_LOCORE */
+
 #endif /* !_MACHINE_GHCB_H_ */
diff --git a/sys/arch/amd64/include/trap.h b/sys/arch/amd64/include/trap.h
index fa322ba9566..7506dddf804 100644
--- a/sys/arch/amd64/include/trap.h
+++ b/sys/arch/amd64/include/trap.h
@@ -62,3 +62,4 @@
 #define	T_XMM		19	/* SSE FP exception */
 #define	T_VE		20	/* virtualization exception */
 #define	T_CP		21	/* control protection exception */
+#define	T_VC		29	/* VMM communication exception */
diff --git a/sys/arch/amd64/include/vmmvar.h b/sys/arch/amd64/include/vmmvar.h
index 92f3eade605..cfbfa604902 100644
--- a/sys/arch/amd64/include/vmmvar.h
+++ b/sys/arch/amd64/include/vmmvar.h
@@ -21,6 +21,8 @@
 #ifndef _MACHINE_VMMVAR_H_
 #define _MACHINE_VMMVAR_H_
 
+#ifndef _LOCORE
+
 #define VMM_HV_SIGNATURE 	"OpenBSDVMM58"
 
 /* VMX: Basic Exit Reasons */
@@ -94,6 +96,8 @@
 #define VMX_MAX_CR3_TARGETS			256
 #define VMX_VMCS_PA_CLEAR			0xFFFFFFFFFFFFFFFFUL
 
+#endif	/* ! _LOCORE */
+
 /*
  * SVM: Intercept codes (exit reasons)
  */
@@ -262,6 +266,8 @@
 #define SVM_VMEXIT_VMGEXIT			0x403
 #define SVM_VMEXIT_INVALID			-1
 
+#ifndef _LOCORE
+
 /*
  * Exception injection vectors (these correspond to the CPU exception types
  * defined in the SDM.)
@@ -1057,4 +1063,6 @@ int	svm_get_vmsa_pa(uint32_t, uint32_t, uint64_t *);
 
 #endif /* _KERNEL */
 
+#endif	/* ! _LOCORE */
+
 #endif /* ! _MACHINE_VMMVAR_H_ */
--------------------------------------------------------------------------

commit c29b30056940a3f8b2acfd18b734daf60257656a
Author: Hans-Joerg Hoexer <hshoexer@genua.de>
Date:   Wed Nov 20 11:16:48 2024 +0100

    vmd(8): Setup long mode code segment for SEV-ES guests
    
    Right now vmd(8) starts a kernel image in compatibility mode.
    However, SEV-ES enabled guest will trigger #VC traps during locore.
    To be able to run such a trap handler, we need a long mode segment.

diff --git a/usr.sbin/vmd/loadfile_elf.c b/usr.sbin/vmd/loadfile_elf.c
index 2b62ca07565..73c2010397d 100644
--- a/usr.sbin/vmd/loadfile_elf.c
+++ b/usr.sbin/vmd/loadfile_elf.c
@@ -110,7 +110,7 @@ union {
 } hdr;
 
 static void setsegment(struct mem_segment_descriptor *, uint32_t,
-    size_t, int, int, int, int);
+    size_t, int, int, int, int, int);
 static int elf32_exec(gzFile, Elf32_Ehdr *, u_long *, int);
 static int elf64_exec(gzFile, Elf64_Ehdr *, u_long *, int);
 static size_t create_bios_memmap(struct vm_create_params *, bios_memmap_t *);
@@ -148,7 +148,7 @@ uint64_t pg_crypt = 0;
  */
 static void
 setsegment(struct mem_segment_descriptor *sd, uint32_t base, size_t limit,
-    int type, int dpl, int def32, int gran)
+    int type, int dpl, int def32, int gran, int lm)
 {
 	sd->sd_lolimit = (int)limit;
 	sd->sd_lobase = (int)base;
@@ -157,7 +157,7 @@ setsegment(struct mem_segment_descriptor *sd, uint32_t base, size_t limit,
 	sd->sd_p = 1;
 	sd->sd_hilimit = (int)limit >> 16;
 	sd->sd_avl = 0;
-	sd->sd_long = 0;
+	sd->sd_long = lm;
 	sd->sd_def32 = def32;
 	sd->sd_gran = gran;
 	sd->sd_hibase = (int)base >> 24;
@@ -185,11 +185,13 @@ push_gdt(void)
 	 * Create three segment descriptors:
 	 *
 	 * GDT[0] : null descriptor. "Created" via memset above.
-	 * GDT[1] (selector @ 0x8): Executable segment, for CS
+	 * GDT[1] (selector @ 0x8): Executable segment (compat mode), for CS
 	 * GDT[2] (selector @ 0x10): RW Data segment, for DS/ES/SS
+	 * GDT[3] (selector @ 0x18): Executable segment (long mode), for CS
 	 */
-	setsegment(&sd[1], 0, 0xffffffff, SDT_MEMERA, SEL_KPL, 1, 1);
-	setsegment(&sd[2], 0, 0xffffffff, SDT_MEMRWA, SEL_KPL, 1, 1);
+	setsegment(&sd[1], 0, 0xffffffff, SDT_MEMERA, SEL_KPL, 1, 1, 0);
+	setsegment(&sd[2], 0, 0xffffffff, SDT_MEMRWA, SEL_KPL, 1, 1, 0);
+	setsegment(&sd[3], 0, 0xffffffff, SDT_MEMERA, SEL_KPL, 0, 1, 1);
 
 	write_mem(GDT_PAGE, gdtpage, PAGE_SIZE);
 	sev_register_encryption(GDT_PAGE, PAGE_SIZE);