SEV-ES guest: locore #VC trap handling
On Wed, May 21, 2025 at 05:10:27PM +0200, Hans-Jörg Höxer wrote:
> Hi,
>
> this change deals with locore for SEV-ES enabled guests. The approach
> might be a bit controversial. And it requires a diff for vmd(8), which
> I've also attached, to simplify the discussion:
>
> SEV-ES guest: locore #VC trap handling
>
> When locore is executed by a SEV-ES enabled guest, the first cpuid
> instruction will raise a #VC trap that needs to be handled.
> However, at that point the guest does not yet know whether it is
> a guest at all, whether it is running on an AMD CPU with SEV-ES
> enabled, etc.
>
> To resolve this chicken-and-egg situation we unconditionally set up
> a temporary #VC trap handler in locore; it is reset later by
> init_x86_64().
>
> As vmd(8) configures the runtime environment for locore to be 32-bit
> compatibility mode, a raised #VC exception will switch the CPU to
> 64-bit mode, and the CPU will expect a 64-bit entry in the IDT.
> When running on e.g. KVM, locore is executed in 32-bit legacy mode,
> where the CPU will expect a 32-bit entry in the IDT.
>
> To accommodate both situations, we set up both a 64-bit and a 32-bit
> handler in the IDT.
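>
> As an aside, both gates can live in the same early_idt because the gate
> sizes differ: in 32-bit legacy mode the CPU fetches 8-byte IDT gates,
> while in long mode (which includes the compatibility-mode case under
> vmd(8)) it fetches 16-byte gates, so vector T_VC lands on different
> slots. A rough C sketch of the layout; the NIDT value is assumed here:
>
> #include <stdint.h>
>
> #define T_VC    29              /* VMM communication exception */
> #define NIDT    256             /* number of IDT vectors; assumed */
>
> /* early_idt reserves two quads (16 bytes) per vector, as in locore0.S. */
> static uint8_t early_idt[NIDT * 16];
>
> /* 32-bit legacy mode: gates are 8 bytes, so vector T_VC sits at T_VC * 8. */
> static uint8_t *vc_gate32 = &early_idt[T_VC * 8];
>
> /* Long mode (including compatibility mode): gates are 16 bytes, so the
>  * same vector is fetched from T_VC * 16 -- hence the (2 * T_VC) indexing. */
> static uint8_t *vc_gate64 = &early_idt[T_VC * 16];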
>
> Additionally, vmd(8) has to set up a long mode code segment in the GDT.
>
> Both #VC trap handlers use the MSR protocol to talk to the hypervisor
> to emulate CPUID. The MSR protocol only supports "simple" CPUID
> without subfunctions.
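>
> Roughly, each CPUID result register costs one write/VMGEXIT/read round
> trip over the GHCB MSR. A hypothetical C sketch of that exchange, where
> wrmsr64(), vmgexit() and rdmsr64() stand in for the wrmsr, "rep vmmcall"
> and rdmsr instructions and the MSR_* constants are the ones used in the
> patch:
>
> #include <stdint.h>
>
> /* reg selects the returned register: 0 = %eax, 1 = %ebx, 2 = %ecx, 3 = %edx */
> static uint32_t
> msr_proto_cpuid(uint32_t leaf, uint32_t reg)
> {
>         uint64_t req, resp;
>
>         /* GHCBData: [63:32] = CPUID leaf, [31:30] = register, plus request code */
>         req = ((uint64_t)leaf << 32) | ((uint64_t)reg << 30) | MSR_PROTO_CPUID_REQ;
>         wrmsr64(MSR_SEV_GHCB, req);     /* publish the request in the GHCB MSR */
>         vmgexit();                      /* "rep vmmcall": exit to the hypervisor */
>         resp = rdmsr64(MSR_SEV_GHCB);   /* read back the hypervisor's answer */
>
>         /* The requested register value is returned in the upper 32 bits. */
>         return (uint32_t)(resp >> 32);
> }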
>
> Note: When SEV-ES is enabled, the hypervisor cannot intercept
> writes to EFER beforehand, only after the write. Thus on vmm(4)
> with a directly executed kernel we are in compatibility mode and
> EFER_LMA is set. As clearing EFER_LMA raises #GP, we have to
> preserve it.
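>
> In pseudo-C, the EFER update in the locore diff below amounts to the
> following; rdmsr64()/wrmsr64() are hypothetical helpers, pg_nx is
> passed in for illustration, and the MSR_EFER/EFER_* names are the
> machine/specialreg.h ones:
>
> #include <stdint.h>
>
> static void
> locore_write_efer(uint64_t pg_nx)
> {
>         uint64_t efer = rdmsr64(MSR_EFER);
>         uint64_t want = EFER_LME | EFER_SCE;
>
>         if (efer & EFER_LMA)            /* already in long mode: SEV-ES on vmm(4) */
>                 want |= EFER_LMA;       /* clearing LMA would raise #GP */
>         if (pg_nx >> 32)                /* the asm tests the high word of pg_nx */
>                 want |= EFER_NXE;
>         wrmsr64(MSR_EFER, want);
> }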
>
> Take care,
> HJ.
>
This one probably needs a lot of testing. What tests have you guys done so
far? My SEV-ES machine is available now; I will test this and let you know,
but we probably want a lot of testing from different host/guest combos.
-ml
> --------------------------------------------------------------------------
> commit 588a7de9576a84062110b29c2c15b9f2cb9ea4c0
> Author: Hans-Joerg Hoexer <hshoexer@genua.de>
> Date: Tue Aug 6 17:56:55 2024 +0200
>
> SEV-ES guest: locore #VC trap handling
>
> When locore is executed by a SEV-ES enabled guest, the first cpuid
> instruction will raise a #VC trap that needs to be handled.
> However, at that point the guest does not yet know whether it is
> a guest at all, whether it is running on an AMD CPU with SEV-ES
> enabled, etc.
>
> To resolve this chicken-and-egg situation we unconditionally set up
> a temporary #VC trap handler in locore; it is reset later by
> init_x86_64().
>
> As vmd(8) configures the runtime environment for locore to be 32-bit
> compatibility mode, a raised #VC exception will switch the CPU to
> 64-bit mode, and the CPU will expect a 64-bit entry in the IDT.
> When running on e.g. KVM, locore is executed in 32-bit legacy mode,
> where the CPU will expect a 32-bit entry in the IDT.
>
> To accommodate both situations, we set up both a 64-bit and a 32-bit
> handler in the IDT.
>
> Additionally, vmd(8) has to set up a long mode code segment in the GDT.
>
> Both #VC trap handlers use the MSR protocol to talk to the hypervisor
> to emulate CPUID. The MSR protocol only supports "simple" CPUID
> without subfunctions.
>
> Note: When SEV-ES is enabled, the hypervisor cannot intercept
> writes to EFER beforehand, only after the write. Thus on vmm(4)
> with a directly executed kernel we are in compatibility mode and
> EFER_LMA is set. As clearing EFER_LMA raises #GP, we have to
> preserve it.
>
> diff --git a/sys/arch/amd64/amd64/locore0.S b/sys/arch/amd64/amd64/locore0.S
> index 4ef1892c7cc..6b4111717a8 100644
> --- a/sys/arch/amd64/amd64/locore0.S
> +++ b/sys/arch/amd64/amd64/locore0.S
> @@ -111,6 +111,9 @@
> #include <machine/param.h>
> #include <machine/segments.h>
> #include <machine/specialreg.h>
> +#include <machine/trap.h>
> +#include <machine/ghcb.h>
> +#include <machine/vmmvar.h>
>
> /*
> * override user-land alignment before including asm.h
> @@ -193,6 +196,58 @@ bi_size_ok:
> pushl $PSL_MBO
> popfl
>
> + /*
> + * Setup temporary #VC trap handler, in case we are running
> + * on an AMD CPU in SEV-ES guest mode. Will be reset by
> + * init_x86_64().
> + * We are setting up two handlers:
> + *
> + * 1) locore_vc_trap32: Triggered when we are running in
> + * 32-bit legacy mode.
> + *
> + * 2) locore_vc_trap64: Triggered when we are running in
> + * 32-bit compatibility mode.
> + *
> + * The latter one is used by vmd(8).
> + */
> + movl $RELOC(early_idt), %ecx
> + movl $T_VC, %edx
> + leal (%ecx, %edx, 8), %ecx /* 32bit #VC IDT slot */
> +
> + pushl %cs /* get current %cs */
> + popl %ebx
> + shll $16, %ebx
> +
> + movl $RELOC(locore_vc_trap32), %eax
> + andl $0x0000ffff, %eax
> + orl %ebx, %eax /* use current %cs */
> + movl %eax, (%ecx)
> +
> + movl $RELOC(locore_vc_trap32), %eax
> + andl $0xffff0000, %eax
> + orl $((0x80 | SDT_SYS386IGT) << 8), %eax
> + movl %eax, 4(%ecx)
> +
> + movl $RELOC(early_idt), %ecx
> + movl $(2 * T_VC), %edx
> + leal (%ecx, %edx, 8), %ecx /* 64bit #VC IDT slot */
> +
> + movl $RELOC(locore_vc_trap64), %eax
> + andl $0x0000ffff, %eax
> + orl $(GSEL(3, SEL_KPL) << 16), %eax
> + movl %eax, (%ecx)
> +
> + movl $RELOC(locore_vc_trap64), %eax
> + andl $0xffff0000, %eax
> + orl $((0x80 | SDT_SYS386IGT) << 8), %eax
> + movl %eax, 4(%ecx)
> + xorl %eax, %eax
> + movl %eax, 8(%ecx)
> + movl %eax, 12(%ecx)
> +
> + movl $RELOC(idtlc), %eax
> + lidt (%eax)
> +
> /* Reset debug control registers */
> xorl %eax,%eax
> movl %eax,%dr6
> @@ -631,8 +686,14 @@ store_pte:
> */
> movl $MSR_EFER,%ecx
> rdmsr
> + movl %eax,%ebx
> xorl %eax,%eax /* XXX */
> orl $(EFER_LME|EFER_SCE),%eax
> + /* If set, preserve LMA */
> + testl $EFER_LMA,%ebx
> + jz efer_nxe
> + orl $EFER_LMA,%eax
> +efer_nxe:
> movl RELOC((pg_nx + 4)), %ebx
> cmpl $0, %ebx
> je write_efer
> @@ -745,6 +806,118 @@ longmode_hi:
> call init_x86_64
> call main
>
> +vc_cpuid64:
> + shll $30, %eax /* requested register */
> + orl $MSR_PROTO_CPUID_REQ, %eax
> + movl %ebx, %edx /* CPUID function */
> + movl $MSR_SEV_GHCB, %ecx
> + wrmsr
> + rep vmmcall
> + rdmsr
> + ret
> +
> + .globl locore_vc_trap64
> +locore_vc_trap64:
> + pushq %rax
> + pushq %rbx
> + pushq %rcx
> + pushq %rdx
> +
> + cmpl $SVM_VMEXIT_CPUID, 32(%rsp)
> + jne .Lterminate64
> +
> + movl %eax, %ebx /* save CPUID function */
> +
> + movl $0, %eax /* request cpuid, get %eax */
> + call vc_cpuid64
> + movq %rdx, 24(%rsp)
> +
> + movl $1, %eax /* get %ebx */
> + call vc_cpuid64
> + movq %rdx, 16(%rsp)
> +
> + movl $2, %eax /* get %ecx */
> + call vc_cpuid64
> + movq %rdx, 8(%rsp)
> +
> + movl $3, %eax /* get %edx */
> + call vc_cpuid64
> + movq %rdx, 0(%rsp)
> +
> + popq %rdx
> + popq %rcx
> + popq %rbx
> + popq %rax
> + addq $8, %rsp
> + addq $2, (%rsp)
> + iretq
> +
> +.Lterminate64:
> + movl $MSR_PROTO_TERMINATE, %eax
> + movl $MSR_SEV_GHCB, %ecx
> + wrmsr
> + rep vmmcall
> +.Lterm_loop64:
> + hlt
> + jmp .Lterm_loop64
> +
> + .code32
> +vc_cpuid32:
> + shll $30, %eax /* requested register */
> + orl $MSR_PROTO_CPUID_REQ, %eax
> + movl %ebx, %edx /* CPUID function */
> + movl $MSR_SEV_GHCB, %ecx
> + wrmsr
> + rep vmmcall
> + rdmsr
> + ret
> +
> + .globl locore_vc_trap32
> +locore_vc_trap32:
> + pushl %eax
> + pushl %ebx
> + pushl %ecx
> + pushl %edx
> +
> + cmpl $SVM_VMEXIT_CPUID, 16(%esp)
> + jne .Lterminate32
> +
> + movl %eax, %ebx /* save CPUID function */
> +
> + movl $0, %eax /* request cpuid, get %eax */
> + call vc_cpuid32
> + movl %edx, 12(%esp)
> +
> + movl $1, %eax /* get %ebx */
> + call vc_cpuid32
> + movl %edx, 8(%esp)
> +
> + movl $2, %eax /* get %ecx */
> + call vc_cpuid32
> + movl %edx, 4(%esp)
> +
> + movl $3, %eax /* get %edx */
> + call vc_cpuid32
> + movl %edx, 0(%esp)
> +
> + popl %edx
> + popl %ecx
> + popl %ebx
> + popl %eax
> + addl $4, %esp
> + addl $2, (%esp)
> + iret
> +
> +.Lterminate32:
> + movl $MSR_PROTO_TERMINATE, %eax
> + movl $MSR_SEV_GHCB, %ecx
> + wrmsr
> + rep vmmcall
> +.Lterm_loop32:
> + hlt
> + jmp .Lterm_loop32
> +
> +
> .section .codepatch,"a"
> .align 8, 0xcc
> .globl codepatch_begin
> @@ -757,6 +930,20 @@ codepatch_end:
> .previous
>
> .data
> + .globl idtlc /* temporary locore IDT */
> +idtlc:
> + .word early_idt_end-early_idt-1
> + .long _RELOC(early_idt)
> + .align 64, 0xcc
> +
> + .globl early_idt
> +early_idt:
> + .rept NIDT
> + .quad 0x0000000000000000
> + .quad 0x0000000000000000
> + .endr
> +early_idt_end:
> +
> .globl gdt64
> gdt64:
> .word gdt64_end-gdt64_start-1
> diff --git a/sys/arch/amd64/include/ghcb.h b/sys/arch/amd64/include/ghcb.h
> index 954e1fa3e3b..fb4ff389ac4 100644
> --- a/sys/arch/amd64/include/ghcb.h
> +++ b/sys/arch/amd64/include/ghcb.h
> @@ -19,6 +19,8 @@
> #ifndef _MACHINE_GHCB_H_
> #define _MACHINE_GHCB_H_
>
> +#ifndef _LOCORE
> +
> #include <machine/frame.h>
>
> #define GHCB_OFFSET(m) ((m) / 8)
> @@ -99,6 +101,7 @@ struct ghcb_sync {
> int sz_c;
> int sz_d;
> };
> +#endif /* !_LOCORE */
>
>
> /* Definitions used with the MSR protocol */
> @@ -107,6 +110,8 @@ struct ghcb_sync {
> #define MSR_PROTO_TERMINATE 0x100
>
>
> +#ifndef _LOCORE
> +
> void ghcb_clear(struct ghcb_sa *);
> int ghcb_valbm_set(uint8_t *, int);
> int ghcb_valbm_isset(uint8_t *, int);
> @@ -118,4 +123,6 @@ void ghcb_sync_out(struct trapframe *, uint64_t, uint64_t, uint64_t,
> struct ghcb_sa *, struct ghcb_sync *);
> void ghcb_sync_in(struct trapframe *, struct ghcb_sa *, struct ghcb_sync *);
>
> +#endif /* !_LOCORE */
> +
> #endif /* !_MACHINE_GHCB_H_ */
> diff --git a/sys/arch/amd64/include/trap.h b/sys/arch/amd64/include/trap.h
> index fa322ba9566..7506dddf804 100644
> --- a/sys/arch/amd64/include/trap.h
> +++ b/sys/arch/amd64/include/trap.h
> @@ -62,3 +62,4 @@
> #define T_XMM 19 /* SSE FP exception */
> #define T_VE 20 /* virtualization exception */
> #define T_CP 21 /* control protection exception */
> +#define T_VC 29 /* VMM communication exception */
> diff --git a/sys/arch/amd64/include/vmmvar.h b/sys/arch/amd64/include/vmmvar.h
> index 92f3eade605..cfbfa604902 100644
> --- a/sys/arch/amd64/include/vmmvar.h
> +++ b/sys/arch/amd64/include/vmmvar.h
> @@ -21,6 +21,8 @@
> #ifndef _MACHINE_VMMVAR_H_
> #define _MACHINE_VMMVAR_H_
>
> +#ifndef _LOCORE
> +
> #define VMM_HV_SIGNATURE "OpenBSDVMM58"
>
> /* VMX: Basic Exit Reasons */
> @@ -94,6 +96,8 @@
> #define VMX_MAX_CR3_TARGETS 256
> #define VMX_VMCS_PA_CLEAR 0xFFFFFFFFFFFFFFFFUL
>
> +#endif /* ! _LOCORE */
> +
> /*
> * SVM: Intercept codes (exit reasons)
> */
> @@ -262,6 +266,8 @@
> #define SVM_VMEXIT_VMGEXIT 0x403
> #define SVM_VMEXIT_INVALID -1
>
> +#ifndef _LOCORE
> +
> /*
> * Exception injection vectors (these correspond to the CPU exception types
> * defined in the SDM.)
> @@ -1057,4 +1063,6 @@ int svm_get_vmsa_pa(uint32_t, uint32_t, uint64_t *);
>
> #endif /* _KERNEL */
>
> +#endif /* ! _LOCORE */
> +
> #endif /* ! _MACHINE_VMMVAR_H_ */
> --------------------------------------------------------------------------
>
> commit c29b30056940a3f8b2acfd18b734daf60257656a
> Author: Hans-Joerg Hoexer <hshoexer@genua.de>
> Date: Wed Nov 20 11:16:48 2024 +0100
>
> vmd(8): Setup long mode code segment for SEV-ES guests
>
> Right now vmd(8) starts a kernel image in compatibility mode.
> However, a SEV-ES enabled guest will trigger #VC traps during locore.
> To be able to run such a trap handler, we need a long mode code segment.
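>
> For reference, the new descriptor is GDT[3], i.e. selector 0x18, which
> is what the 64-bit #VC gate set up in locore0.S selects via
> GSEL(3, SEL_KPL). A small sketch of the selector arithmetic, assuming
> the usual definitions from machine/segments.h:
>
> #define SEL_KPL         0                       /* kernel privilege level */
> #define GSEL(s, r)      (((s) << 3) | (r))      /* GDT index -> selector */
>
> /* GDT[3] from push_gdt() below is the code segment the 64-bit gate loads. */
> _Static_assert(GSEL(3, SEL_KPL) == 0x18, "long mode CS selector");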
>
> diff --git a/usr.sbin/vmd/loadfile_elf.c b/usr.sbin/vmd/loadfile_elf.c
> index 2b62ca07565..73c2010397d 100644
> --- a/usr.sbin/vmd/loadfile_elf.c
> +++ b/usr.sbin/vmd/loadfile_elf.c
> @@ -110,7 +110,7 @@ union {
> } hdr;
>
> static void setsegment(struct mem_segment_descriptor *, uint32_t,
> - size_t, int, int, int, int);
> + size_t, int, int, int, int, int);
> static int elf32_exec(gzFile, Elf32_Ehdr *, u_long *, int);
> static int elf64_exec(gzFile, Elf64_Ehdr *, u_long *, int);
> static size_t create_bios_memmap(struct vm_create_params *, bios_memmap_t *);
> @@ -148,7 +148,7 @@ uint64_t pg_crypt = 0;
> */
> static void
> setsegment(struct mem_segment_descriptor *sd, uint32_t base, size_t limit,
> - int type, int dpl, int def32, int gran)
> + int type, int dpl, int def32, int gran, int lm)
> {
> sd->sd_lolimit = (int)limit;
> sd->sd_lobase = (int)base;
> @@ -157,7 +157,7 @@ setsegment(struct mem_segment_descriptor *sd, uint32_t base, size_t limit,
> sd->sd_p = 1;
> sd->sd_hilimit = (int)limit >> 16;
> sd->sd_avl = 0;
> - sd->sd_long = 0;
> + sd->sd_long = lm;
> sd->sd_def32 = def32;
> sd->sd_gran = gran;
> sd->sd_hibase = (int)base >> 24;
> @@ -185,11 +185,13 @@ push_gdt(void)
> * Create three segment descriptors:
> *
> * GDT[0] : null descriptor. "Created" via memset above.
> - * GDT[1] (selector @ 0x8): Executable segment, for CS
> + * GDT[1] (selector @ 0x8): Executable segment (compat mode), for CS
> * GDT[2] (selector @ 0x10): RW Data segment, for DS/ES/SS
> + * GDT[3] (selector @ 0x18): Executable segment (long mode), for CS
> */
> - setsegment(&sd[1], 0, 0xffffffff, SDT_MEMERA, SEL_KPL, 1, 1);
> - setsegment(&sd[2], 0, 0xffffffff, SDT_MEMRWA, SEL_KPL, 1, 1);
> + setsegment(&sd[1], 0, 0xffffffff, SDT_MEMERA, SEL_KPL, 1, 1, 0);
> + setsegment(&sd[2], 0, 0xffffffff, SDT_MEMRWA, SEL_KPL, 1, 1, 0);
> + setsegment(&sd[3], 0, 0xffffffff, SDT_MEMERA, SEL_KPL, 0, 1, 1);
>
> write_mem(GDT_PAGE, gdtpage, PAGE_SIZE);
> sev_register_encryption(GDT_PAGE, PAGE_SIZE);