From: Hans-Jörg Höxer
Subject: SEV-ES guest: locore #VC trap handling
To:
Date: Wed, 21 May 2025 17:10:27 +0200

Hi,

this change deals with locore for SEV-ES enabled guests.  The approach
might be a bit controversial.  And it requires a diff for vmd(8), which
I've also attached, to simplify the discussion:

SEV-ES guest: locore #VC trap handling

When locore is executed by a SEV-ES enabled guest the first cpuid
instruction will raise a #VC trap that will need to be handled.
However, at that point in time the guest does not know whether it is a
guest at all, if it is running on an AMD CPU with SEV-ES enabled, etc.
To resolve this chicken-and-egg situation we unconditionally set up a
temporary #VC trap handler; it is reset later by init_x86_64().

As vmd(8) configures the runtime environment for locore to be 32-bit
compatibility mode, a raised #VC exception will switch to long mode,
and the CPU will expect a 64-bit entry in the IDT.  When running on
e.g. KVM, locore is executed in 32-bit legacy mode; there the CPU will
expect a 32-bit entry in the IDT.  To accommodate both situations, we
set up both a 64- and a 32-bit handler in the IDT.  Additionally,
vmd(8) has to set up a long mode segment in the GDT.

Both #VC trap handlers use the MSR protocol to talk to the hypervisor
to emulate CPUID.  The MSR protocol only supports "simple" CPUID
without subfunctions.

Note: When SEV-ES is enabled, the hypervisor cannot intercept writes
to EFER beforehand, only after the write.  Thus, on vmm(4) with a
directly executed kernel we are already in compatibility mode and
EFER_LMA is set.  As clearing EFER_LMA raises #GP, we have to
preserve it.
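To illustrate, here is the MSR-protocol exchange that the new
vc_cpuid32/vc_cpuid64 helpers in the diff below implement, as a rough
C sketch.  This is not part of the diff: ghcb_msr_exchange() is made
up for illustration, and the MSR_PROTO_CPUID_REQ/_RESP values are my
assumption per the AMD GHCB specification (the diff only shows
MSR_PROTO_TERMINATE):

    /*
     * Sketch: one GHCB MSR protocol CPUID round trip.
     */
    #include <stdint.h>

    #define MSR_SEV_GHCB		0xc0010130	/* GHCB MSR */
    #define MSR_PROTO_CPUID_REQ		0x004	/* assumed, per GHCB spec */
    #define MSR_PROTO_CPUID_RESP	0x005	/* assumed, per GHCB spec */

    static inline uint64_t
    ghcb_msr_exchange(uint64_t req)
    {
    	uint32_t a = (uint32_t)req, d = (uint32_t)(req >> 32);

    	/* wrmsr + "rep vmmcall" (VMGEXIT) + rdmsr, as in the asm */
    	__asm volatile("wrmsr" :: "c"(MSR_SEV_GHCB), "a"(a), "d"(d));
    	__asm volatile("rep; vmmcall" ::: "memory");
    	__asm volatile("rdmsr" : "=a"(a), "=d"(d) : "c"(MSR_SEV_GHCB));
    	return ((uint64_t)d << 32) | a;
    }

    /* reg: 0 = %eax, 1 = %ebx, 2 = %ecx, 3 = %edx */
    static uint32_t
    vc_cpuid(uint32_t func, uint32_t reg)
    {
    	uint64_t resp;

    	/* bits 63:32: CPUID function, 31:30: register, 11:0: request */
    	resp = ghcb_msr_exchange(((uint64_t)func << 32) |
    	    ((uint64_t)reg << 30) | MSR_PROTO_CPUID_REQ);

    	/* response carries the requested register in bits 63:32 */
    	return (uint32_t)(resp >> 32);
    }

The trap handlers do this once per output register, i.e. four
VMGEXITs per intercepted cpuid.  Since the request encodes only the
function in bits 63:32, there is no room for a subfunction in %ecx,
hence "simple" CPUID only.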
Take care,
HJ.

--------------------------------------------------------------------------
commit 588a7de9576a84062110b29c2c15b9f2cb9ea4c0
Author: Hans-Joerg Hoexer
Date:   Tue Aug 6 17:56:55 2024 +0200

    SEV-ES guest: locore #VC trap handling

    When locore is executed by a SEV-ES enabled guest the first cpuid
    instruction will raise a #VC trap that will need to be handled.
    However, at that point in time the guest does not know whether it
    is a guest at all, if it is running on an AMD CPU with SEV-ES
    enabled, etc.  To resolve this chicken-and-egg situation we
    unconditionally set up a temporary #VC trap handler; it is reset
    later by init_x86_64().

    As vmd(8) configures the runtime environment for locore to be
    32-bit compatibility mode, a raised #VC exception will switch to
    long mode, and the CPU will expect a 64-bit entry in the IDT.
    When running on e.g. KVM, locore is executed in 32-bit legacy
    mode; there the CPU will expect a 32-bit entry in the IDT.  To
    accommodate both situations, we set up both a 64- and a 32-bit
    handler in the IDT.  Additionally, vmd(8) has to set up a long
    mode segment in the GDT.

    Both #VC trap handlers use the MSR protocol to talk to the
    hypervisor to emulate CPUID.  The MSR protocol only supports
    "simple" CPUID without subfunctions.

    Note: When SEV-ES is enabled, the hypervisor cannot intercept
    writes to EFER beforehand, only after the write.  Thus, on vmm(4)
    with a directly executed kernel we are already in compatibility
    mode and EFER_LMA is set.  As clearing EFER_LMA raises #GP, we
    have to preserve it.

diff --git a/sys/arch/amd64/amd64/locore0.S b/sys/arch/amd64/amd64/locore0.S
index 4ef1892c7cc..6b4111717a8 100644
--- a/sys/arch/amd64/amd64/locore0.S
+++ b/sys/arch/amd64/amd64/locore0.S
@@ -111,6 +111,9 @@
 #include <...>
 #include <...>
 #include <...>
+#include <machine/ghcb.h>
+#include <machine/trap.h>
+#include <machine/vmmvar.h>
 /*
  * override user-land alignment before including asm.h
  */
@@ -193,6 +196,58 @@ bi_size_ok:
 	pushl	$PSL_MBO
 	popfl
 
+	/*
+	 * Setup temporary #VC trap handler, in case we are running
+	 * on an AMD CPU in SEV-ES guest mode.  Will be reset by
+	 * init_x86_64().
+	 *
+	 * We are setting up two handlers:
+	 *
+	 * 1) locore_vc_trap32: Triggered when we are running in
+	 *    32-bit legacy mode.
+	 *
+	 * 2) locore_vc_trap64: Triggered when we are running in
+	 *    32-bit compatibility mode.
+	 *
+	 * The latter one is used by vmd(8).
+	 */
+	movl	$RELOC(early_idt), %ecx
+	movl	$T_VC, %edx
+	leal	(%ecx, %edx, 8), %ecx		/* 32bit #VC IDT slot */
+
+	pushl	%cs				/* get current %cs */
+	popl	%ebx
+	shll	$16, %ebx
+
+	movl	$RELOC(locore_vc_trap32), %eax
+	andl	$0x0000ffff, %eax
+	orl	%ebx, %eax			/* use current %cs */
+	movl	%eax, (%ecx)
+
+	movl	$RELOC(locore_vc_trap32), %eax
+	andl	$0xffff0000, %eax
+	orl	$((0x80 | SDT_SYS386IGT) << 8), %eax
+	movl	%eax, 4(%ecx)
+
+	movl	$RELOC(early_idt), %ecx
+	movl	$(2 * T_VC), %edx
+	leal	(%ecx, %edx, 8), %ecx		/* 64bit #VC IDT slot */
+
+	movl	$RELOC(locore_vc_trap64), %eax
+	andl	$0x0000ffff, %eax
+	orl	$(GSEL(3, SEL_KPL) << 16), %eax
+	movl	%eax, (%ecx)
+
+	movl	$RELOC(locore_vc_trap64), %eax
+	andl	$0xffff0000, %eax
+	orl	$((0x80 | SDT_SYS386IGT) << 8), %eax
+	movl	%eax, 4(%ecx)
+	xorl	%eax, %eax
+	movl	%eax, 8(%ecx)
+	movl	%eax, 12(%ecx)
+
+	movl	$RELOC(idtlc), %eax
+	lidt	(%eax)
+
 	/* Reset debug control registers */
 	xorl	%eax,%eax
 	movl	%eax,%dr6
@@ -631,8 +686,14 @@ store_pte:
 	 */
 	movl	$MSR_EFER,%ecx
 	rdmsr
+	movl	%eax,%ebx
 	xorl	%eax,%eax	/* XXX */
 	orl	$(EFER_LME|EFER_SCE),%eax
+	/* If set, preserve LMA */
+	testl	$EFER_LMA,%ebx
+	jz	efer_nxe
+	orl	$EFER_LMA,%eax
+efer_nxe:
 	movl	RELOC((pg_nx + 4)), %ebx
 	cmpl	$0, %ebx
 	je	write_efer
@@ -745,6 +806,118 @@ longmode_hi:
 	call	init_x86_64
 	call	main
 
+vc_cpuid64:
+	shll	$30, %eax		/* requested register */
+	orl	$MSR_PROTO_CPUID_REQ, %eax
+	movl	%ebx, %edx		/* CPUID function */
+	movl	$MSR_SEV_GHCB, %ecx
+	wrmsr
+	rep vmmcall			/* VMGEXIT */
+	rdmsr
+	ret
+
+	.globl	locore_vc_trap64
+locore_vc_trap64:
+	pushq	%rax
+	pushq	%rbx
+	pushq	%rcx
+	pushq	%rdx
+
+	/* the #VC error code is the exit reason */
+	cmpl	$SVM_VMEXIT_CPUID, 32(%rsp)
+	jne	.Lterminate64
+
+	movl	%eax, %ebx		/* save CPUID function */
+
+	movl	$0, %eax		/* request cpuid, get %eax */
+	call	vc_cpuid64
+	movq	%rdx, 24(%rsp)
+
+	movl	$1, %eax		/* get %ebx */
+	call	vc_cpuid64
+	movq	%rdx, 16(%rsp)
+
+	movl	$2, %eax		/* get %ecx */
+	call	vc_cpuid64
+	movq	%rdx, 8(%rsp)
+
+	movl	$3, %eax		/* get %edx */
+	call	vc_cpuid64
+	movq	%rdx, 0(%rsp)
+
+	popq	%rdx
+	popq	%rcx
+	popq	%rbx
+	popq	%rax
+	addq	$8, %rsp		/* drop error code */
+	addq	$2, (%rsp)		/* skip the 2-byte cpuid insn */
+	iretq
+
+.Lterminate64:
+	movl	$MSR_PROTO_TERMINATE, %eax
+	movl	$MSR_SEV_GHCB, %ecx
+	wrmsr
+	rep vmmcall
+.Lterm_loop64:
+	hlt
+	jmp	.Lterm_loop64
+
+	.code32
+vc_cpuid32:
+	shll	$30, %eax		/* requested register */
+	orl	$MSR_PROTO_CPUID_REQ, %eax
+	movl	%ebx, %edx		/* CPUID function */
+	movl	$MSR_SEV_GHCB, %ecx
+	wrmsr
+	rep vmmcall			/* VMGEXIT */
+	rdmsr
+	ret
+
+	.globl	locore_vc_trap32
+locore_vc_trap32:
+	pushl	%eax
+	pushl	%ebx
+	pushl	%ecx
+	pushl	%edx
+
+	/* the #VC error code is the exit reason */
+	cmpl	$SVM_VMEXIT_CPUID, 16(%esp)
+	jne	.Lterminate32
+
+	movl	%eax, %ebx		/* save CPUID function */
+
+	movl	$0, %eax		/* request cpuid, get %eax */
+	call	vc_cpuid32
+	movl	%edx, 12(%esp)
+
+	movl	$1, %eax		/* get %ebx */
+	call	vc_cpuid32
+	movl	%edx, 8(%esp)
+
+	movl	$2, %eax		/* get %ecx */
+	call	vc_cpuid32
+	movl	%edx, 4(%esp)
+
+	movl	$3, %eax		/* get %edx */
+	call	vc_cpuid32
+	movl	%edx, 0(%esp)
+
+	popl	%edx
+	popl	%ecx
+	popl	%ebx
+	popl	%eax
+	addl	$4, %esp		/* drop error code */
+	addl	$2, (%esp)		/* skip the 2-byte cpuid insn */
+	iret
+
+.Lterminate32:
+	movl	$MSR_PROTO_TERMINATE, %eax
+	movl	$MSR_SEV_GHCB, %ecx
+	wrmsr
+	rep vmmcall
+.Lterm_loop32:
+	hlt
+	jmp	.Lterm_loop32
+
 	.section .codepatch,"a"
 	.align	8, 0xcc
 	.globl	codepatch_begin
@@ -757,6 +930,20 @@ codepatch_end:
 .previous
 
 	.data
+	.globl	idtlc		/* temporary locore IDT */
+idtlc:
+	.word	early_idt_end-early_idt-1
+	.long	_RELOC(early_idt)
+	.align	64, 0xcc
+
+	.globl	early_idt
+early_idt:
+	.rept	NIDT
+	.quad	0x0000000000000000
+	.quad	0x0000000000000000
+	.endr
+early_idt_end:
+
 	.globl	gdt64
 gdt64:
 	.word	gdt64_end-gdt64_start-1
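For reviewers, the two gate formats assembled by hand above, written
out as C structs.  This is a sketch, not part of the diff; the field
layout follows the AMD64 architecture manual:

    #include <stdint.h>

    struct gate32 {		/* 8 bytes; legacy mode IDT slot */
    	uint16_t offset_lo;	/* handler bits 15:0 */
    	uint16_t selector;	/* current %cs, copied at setup time */
    	uint8_t  zero;
    	uint8_t  type_attr;	/* 0x80 | SDT_SYS386IGT: present, int gate */
    	uint16_t offset_hi;	/* handler bits 31:16 */
    } __attribute__((packed));

    struct gate64 {		/* 16 bytes; long mode IDT slot */
    	uint16_t offset_lo;
    	uint16_t selector;	/* GSEL(3, SEL_KPL): the long mode CS */
    	uint8_t  ist;		/* 0: no IST */
    	uint8_t  type_attr;	/* 0x80 | SDT_SYS386IGT */
    	uint16_t offset_mid;	/* handler bits 31:16 */
    	uint32_t offset_hi;	/* bits 63:32; zero, locore is low */
    	uint32_t reserved;	/* zeroed via 8(%ecx)/12(%ecx) */
    } __attribute__((packed));

Legacy mode indexes the IDT in 8-byte slots, long mode in 16-byte
slots.  That is why the 64-bit gate is written at slot 2 * T_VC: the
long mode descriptor for vector 29 sits at byte offset 29 * 16, which
equals (2 * 29) * 8 in the very same early_idt table, so the two #VC
entries do not overlap.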
diff --git a/sys/arch/amd64/include/ghcb.h b/sys/arch/amd64/include/ghcb.h
index 954e1fa3e3b..fb4ff389ac4 100644
--- a/sys/arch/amd64/include/ghcb.h
+++ b/sys/arch/amd64/include/ghcb.h
@@ -19,6 +19,8 @@
 #ifndef _MACHINE_GHCB_H_
 #define _MACHINE_GHCB_H_
 
+#ifndef _LOCORE
+
 #include <...>
 
 #define GHCB_OFFSET(m)	((m) / 8)
@@ -99,6 +101,7 @@ struct ghcb_sync {
 	int			sz_c;
 	int			sz_d;
 };
+#endif /* !_LOCORE */
 
 /* Definitions used with the MSR protocol */
 
@@ -107,6 +110,8 @@ struct ghcb_sync {
 
 #define MSR_PROTO_TERMINATE	0x100
 
+#ifndef _LOCORE
+
 void	ghcb_clear(struct ghcb_sa *);
 int	ghcb_valbm_set(uint8_t *, int);
 int	ghcb_valbm_isset(uint8_t *, int);
@@ -118,4 +123,6 @@ void	ghcb_sync_out(struct trapframe *, uint64_t, uint64_t, uint64_t,
 	    struct ghcb_sa *, struct ghcb_sync *);
 void	ghcb_sync_in(struct trapframe *, struct ghcb_sa *, struct ghcb_sync *);
 
+#endif /* !_LOCORE */
+
 #endif /* !_MACHINE_GHCB_H_ */
diff --git a/sys/arch/amd64/include/trap.h b/sys/arch/amd64/include/trap.h
index fa322ba9566..7506dddf804 100644
--- a/sys/arch/amd64/include/trap.h
+++ b/sys/arch/amd64/include/trap.h
@@ -62,3 +62,4 @@
 #define	T_XMM		19	/* SSE FP exception */
 #define	T_VE		20	/* virtualization exception */
 #define	T_CP		21	/* control protection exception */
+#define	T_VC		29	/* VMM communication exception */
diff --git a/sys/arch/amd64/include/vmmvar.h b/sys/arch/amd64/include/vmmvar.h
index 92f3eade605..cfbfa604902 100644
--- a/sys/arch/amd64/include/vmmvar.h
+++ b/sys/arch/amd64/include/vmmvar.h
@@ -21,6 +21,8 @@
 #ifndef _MACHINE_VMMVAR_H_
 #define _MACHINE_VMMVAR_H_
 
+#ifndef _LOCORE
+
 #define VMM_HV_SIGNATURE	"OpenBSDVMM58"
 
 /* VMX: Basic Exit Reasons */
@@ -94,6 +96,8 @@
 #define VMX_MAX_CR3_TARGETS	256
 #define VMX_VMCS_PA_CLEAR	0xFFFFFFFFFFFFFFFFUL
 
+#endif /* ! _LOCORE */
+
 /*
  * SVM: Intercept codes (exit reasons)
  */
@@ -262,6 +266,8 @@
 #define SVM_VMEXIT_VMGEXIT	0x403
 #define SVM_VMEXIT_INVALID	-1
 
+#ifndef _LOCORE
+
 /*
  * Exception injection vectors (these correspond to the CPU exception types
  * defined in the SDM.)
@@ -1057,4 +1063,6 @@ int	svm_get_vmsa_pa(uint32_t, uint32_t, uint64_t *);
 
 #endif /* _KERNEL */
 
+#endif /* ! _LOCORE */
+
 #endif /* ! _MACHINE_VMMVAR_H_ */
--------------------------------------------------------------------------
commit c29b30056940a3f8b2acfd18b734daf60257656a
Author: Hans-Joerg Hoexer
Date:   Wed Nov 20 11:16:48 2024 +0100

    vmd(8): Setup long mode code segment for SEV-ES guests

    Right now vmd(8) starts a kernel image in compatibility mode.
    However, a SEV-ES enabled guest will trigger #VC traps during
    locore.  To be able to run such a trap handler, we need a long
    mode segment.
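For context, a quick sketch (not vmd code) of the selector math that
connects the two diffs: a long mode code segment is distinguished by
L=1 and D=0, base and limit being ignored in long mode, and the new
descriptor goes into GDT slot 3, so its kernel selector is 0x18,
exactly the GSEL(3, SEL_KPL) that the 64-bit IDT gate in locore loads
into %cs.  GSEL and SEL_KPL are redefined here as in amd64
segments.h:

    #include <assert.h>

    #define SEL_KPL		0		/* kernel privilege, ring 0 */
    #define GSEL(s, pl)	(((s) << 3) | (pl))	/* as in segments.h */

    int
    main(void)
    {
    	/* GDT[3] below is the long mode CS used by locore_vc_trap64 */
    	assert(GSEL(3, SEL_KPL) == 0x18);
    	return 0;
    }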
diff --git a/usr.sbin/vmd/loadfile_elf.c b/usr.sbin/vmd/loadfile_elf.c
index 2b62ca07565..73c2010397d 100644
--- a/usr.sbin/vmd/loadfile_elf.c
+++ b/usr.sbin/vmd/loadfile_elf.c
@@ -110,7 +110,7 @@ union {
 } hdr;
 
 static void setsegment(struct mem_segment_descriptor *, uint32_t,
-    size_t, int, int, int, int);
+    size_t, int, int, int, int, int);
 static int elf32_exec(gzFile, Elf32_Ehdr *, u_long *, int);
 static int elf64_exec(gzFile, Elf64_Ehdr *, u_long *, int);
 static size_t create_bios_memmap(struct vm_create_params *, bios_memmap_t *);
@@ -148,7 +148,7 @@ uint64_t pg_crypt = 0;
  */
 static void
 setsegment(struct mem_segment_descriptor *sd, uint32_t base, size_t limit,
-    int type, int dpl, int def32, int gran)
+    int type, int dpl, int def32, int gran, int lm)
 {
 	sd->sd_lolimit = (int)limit;
 	sd->sd_lobase = (int)base;
@@ -157,7 +157,7 @@ setsegment(struct mem_segment_descriptor *sd, uint32_t base, size_t limit,
 	sd->sd_p = 1;
 	sd->sd_hilimit = (int)limit >> 16;
 	sd->sd_avl = 0;
-	sd->sd_long = 0;
+	sd->sd_long = lm;
 	sd->sd_def32 = def32;
 	sd->sd_gran = gran;
 	sd->sd_hibase = (int)base >> 24;
@@ -185,11 +185,13 @@ push_gdt(void)
 	 * Create three segment descriptors:
 	 *
 	 * GDT[0] : null descriptor. "Created" via memset above.
-	 * GDT[1] (selector @ 0x8): Executable segment, for CS
+	 * GDT[1] (selector @ 0x8): Executable segment (compat mode), for CS
 	 * GDT[2] (selector @ 0x10): RW Data segment, for DS/ES/SS
+	 * GDT[3] (selector @ 0x18): Executable segment (long mode), for CS
 	 */
-	setsegment(&sd[1], 0, 0xffffffff, SDT_MEMERA, SEL_KPL, 1, 1);
-	setsegment(&sd[2], 0, 0xffffffff, SDT_MEMRWA, SEL_KPL, 1, 1);
+	setsegment(&sd[1], 0, 0xffffffff, SDT_MEMERA, SEL_KPL, 1, 1, 0);
+	setsegment(&sd[2], 0, 0xffffffff, SDT_MEMRWA, SEL_KPL, 1, 1, 0);
+	setsegment(&sd[3], 0, 0xffffffff, SDT_MEMERA, SEL_KPL, 0, 1, 1);
 
 	write_mem(GDT_PAGE, gdtpage, PAGE_SIZE);
 	sev_register_encryption(GDT_PAGE, PAGE_SIZE);
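As an aside, the EFER handling from the locore diff above, restated
in C.  A sketch only: rdmsr()/wrmsr() stand in for the instructions,
pg_nx_supported stands in for the RELOC((pg_nx + 4)) check, and the
bit values are from the AMD manuals:

    #include <stdint.h>

    #define MSR_EFER	0xc0000080
    #define EFER_SCE	0x00000001	/* syscall extensions */
    #define EFER_LME	0x00000100	/* long mode enable */
    #define EFER_LMA	0x00000400	/* long mode active */
    #define EFER_NXE	0x00000800	/* no-execute enable */

    uint64_t	rdmsr(uint32_t);	/* assumed primitives */
    void	wrmsr(uint32_t, uint64_t);
    extern int	pg_nx_supported;

    void
    set_efer(void)
    {
    	uint64_t old = rdmsr(MSR_EFER);
    	uint64_t val = EFER_LME | EFER_SCE;

    	/*
    	 * Under SEV-ES the EFER write that entered long mode could
    	 * not be undone by the hypervisor, so LMA may already be
    	 * set; clearing it would raise #GP, so carry it over.
    	 */
    	if (old & EFER_LMA)
    		val |= EFER_LMA;
    	if (pg_nx_supported)
    		val |= EFER_NXE;
    	wrmsr(MSR_EFER, val);
    }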