From: Hans-Jörg Höxer Subject: Re: SEV-ES guest: locore #VC trap handling To: Date: Wed, 21 May 2025 17:19:14 +0200 Hi, On Wed, May 21, 2025 at 05:10:27PM +0200, Hans-Jörg Höxer wrote: > Hi, > > this change deals with locore for SEV-ES enabled guests. The approach > might be a bit controversial. And it requires a diff for vmd(8), that > I've also attached, to simplify the discussion: > > SEV-ES guest: locore #VC trap handling > > When locore is executed by a SEV-ES enabled guest the first cpuid > instruction will raise a #VC trap that will need to be handled. > However, at that point in time the guest does not know whether it's > a guest at all, if it is running on an AMD cpu with SEV-ES enabled, > etc. > > To resolve this chicken-egg situation we unconditionally setup a that paragraph got butchered. Sorry for that. It should read: To resolve this chicken-egg situation we unconditionally set up a #VC trap handler. If we are not a SEV-ES enabled guest, or are running on some non-AMD CPU, we will not raise #VC (hopefully). On Intel CPUs the vector for #VC is reserved. > As vmd(8) configures the runtime for locore to be in 32 bit > compatibility mode a raised #VC exception will switch to long mode. > And the CPU will expect a 64 bit entry in the IDT. When running > on e.g. KVM locore is executed in 32 bit legacy mode. There the > CPU will expect a 32 bit entry in the IDT. > > To accommodate both situations, we set up both 64 and 32 bit handlers > in the IDT. > > Additionally, vmd(8) has to set up a long mode segment in the GDT. > > Both #VC trap handlers use the MSR protocol to talk to the hypervisor > to emulate CPUID. The MSR protocol only supports "simple" CPUID > without subfunctions. > > Note: When SEV-ES is enabled, the hypervisor cannot intercept > writes to EFER beforehand, only after the write. Thus on vmm(4) > with directly executed kernel we are in compatibility mode and > EFER_LMA is set. As resetting EFER_LMA raises #GP we have to > preserve it. > > Take care, > HJ. 
> > -------------------------------------------------------------------------- > commit 588a7de9576a84062110b29c2c15b9f2cb9ea4c0 > Author: Hans-Joerg Hoexer > Date: Tue Aug 6 17:56:55 2024 +0200 > > SEV-ES guest: locore #VC trap handling > > When locore is executed by a SEV-ES enabled guest the first cpuid > instruction will raise a #VC trap that will need to be handled. > However, at that point in time the guest does not know whether it's > a guest at all, if it is running on an AMD cpu with SEV-ES enabled, > etc. > > To resolve this chicken-egg situation we unconditionally set up a > #VC trap handler. If we are not a SEV-ES enabled guest, or are running > on some non-AMD CPU, we will not raise #VC (hopefully). On Intel CPUs > the vector for #VC is reserved. > > As vmd(8) configures the runtime for locore to be in 32 bit > compatibility mode a raised #VC exception will switch to long mode. > And the CPU will expect a 64 bit entry in the IDT. When running > on e.g. KVM locore is executed in 32 bit legacy mode. There the > CPU will expect a 32 bit entry in the IDT. > > To accommodate both situations, we set up both 64 and 32 bit handlers > in the IDT. > > Additionally, vmd(8) has to set up a long mode segment in the GDT. > > Both #VC trap handlers use the MSR protocol to talk to the hypervisor > to emulate CPUID. The MSR protocol only supports "simple" CPUID > without subfunctions. > > Note: When SEV-ES is enabled, the hypervisor cannot intercept > writes to EFER beforehand, only after the write. Thus on vmm(4) > with directly executed kernel we are in compatibility mode and > EFER_LMA is set. As resetting EFER_LMA raises #GP we have to > preserve it. 
> > diff --git a/sys/arch/amd64/amd64/locore0.S b/sys/arch/amd64/amd64/locore0.S > index 4ef1892c7cc..6b4111717a8 100644 > --- a/sys/arch/amd64/amd64/locore0.S > +++ b/sys/arch/amd64/amd64/locore0.S > @@ -111,6 +111,9 @@ > #include > #include > #include > +#include > +#include > +#include > > /* > * override user-land alignment before including asm.h > @@ -193,6 +196,58 @@ bi_size_ok: > pushl $PSL_MBO > popfl > > + /* > + * Setup temporary #VC trap handler, in case we are running > + * on an AMD CPU in SEV-ES guest mode. Will be reset by > + * init_x86_64(). > + * We are setting up two handlers: > + * > + * 1) locore_vc_trap32: Triggered when we are running in > + * 32-bit legacy mode. > + * > + * 2) locore_vc_trap64: Triggered when we are running in > + * 32-bit compatibility mode. > + * > + * The latter one is used by vmd(8). > + */ > + movl $RELOC(early_idt), %ecx > + movl $T_VC, %edx > + leal (%ecx, %edx, 8), %ecx /* 32bit #VC IDT slot */ > + > + pushl %cs /* get current %cs */ > + popl %ebx > + shll $16, %ebx > + > + movl $RELOC(locore_vc_trap32), %eax > + andl $0x0000ffff, %eax > + orl %ebx, %eax /* use current %cs */ > + movl %eax, (%ecx) > + > + movl $RELOC(locore_vc_trap32), %eax > + andl $0xffff0000, %eax > + orl $((0x80 | SDT_SYS386IGT) << 8), %eax > + movl %eax, 4(%ecx) > + > + movl $RELOC(early_idt), %ecx > + movl $(2 * T_VC), %edx > + leal (%ecx, %edx, 8), %ecx /* 64bit #VC IDT slot */ > + > + movl $RELOC(locore_vc_trap64), %eax > + andl $0x0000ffff, %eax > + orl $(GSEL(3, SEL_KPL) << 16), %eax > + movl %eax, (%ecx) > + > + movl $RELOC(locore_vc_trap64), %eax > + andl $0xffff0000, %eax > + orl $((0x80 | SDT_SYS386IGT) << 8), %eax > + movl %eax, 4(%ecx) > + xorl %eax, %eax > + movl %eax, 8(%ecx) > + movl %eax, 12(%ecx) > + > + movl $RELOC(idtlc), %eax > + lidt (%eax) > + > /* Reset debug control registers */ > xorl %eax,%eax > movl %eax,%dr6 > @@ -631,8 +686,14 @@ store_pte: > */ > movl $MSR_EFER,%ecx > rdmsr > + movl %eax,%ebx > xorl %eax,%eax /* XXX 
*/ > orl $(EFER_LME|EFER_SCE),%eax > + /* If set, preserve LMA */ > + testl $EFER_LMA,%ebx > + jz efer_nxe > + orl $EFER_LMA,%eax > +efer_nxe: > movl RELOC((pg_nx + 4)), %ebx > cmpl $0, %ebx > je write_efer > @@ -745,6 +806,118 @@ longmode_hi: > call init_x86_64 > call main > > +vc_cpuid64: > + shll $30, %eax /* requested register */ > + orl $MSR_PROTO_CPUID_REQ, %eax > + movl %ebx, %edx /* CPUID function */ > + movl $MSR_SEV_GHCB, %ecx > + wrmsr > + rep vmmcall > + rdmsr > + ret > + > + .globl locore_vc_trap64 > +locore_vc_trap64: > + pushq %rax > + pushq %rbx > + pushq %rcx > + pushq %rdx > + > + cmpl $SVM_VMEXIT_CPUID, 32(%rsp) > + jne .Lterminate64 > + > + movl %eax, %ebx /* save CPUID function */ > + > + movl $0, %eax /* request cpuid, get %eax */ > + call vc_cpuid64 > + movq %rdx, 24(%rsp) > + > + movl $1, %eax /* get %ebx */ > + call vc_cpuid64 > + movq %rdx, 16(%rsp) > + > + movl $2, %eax /* get %ecx */ > + call vc_cpuid64 > + movq %rdx, 8(%rsp) > + > + movl $3, %eax /* get %edx */ > + call vc_cpuid64 > + movq %rdx, 0(%rsp) > + > + popq %rdx > + popq %rcx > + popq %rbx > + popq %rax > + addq $8, %rsp > + addq $2, (%rsp) > + iretq > + > +.Lterminate64: > + movl $MSR_PROTO_TERMINATE, %eax > + movl $MSR_SEV_GHCB, %ecx > + wrmsr > + rep vmmcall > +.Lterm_loop64: > + hlt > + jmp .Lterm_loop64 > + > + .code32 > +vc_cpuid32: > + shll $30, %eax /* requested register */ > + orl $MSR_PROTO_CPUID_REQ, %eax > + movl %ebx, %edx /* CPUID function */ > + movl $MSR_SEV_GHCB, %ecx > + wrmsr > + rep vmmcall > + rdmsr > + ret > + > + .globl locore_vc_trap32 > +locore_vc_trap32: > + pushl %eax > + pushl %ebx > + pushl %ecx > + pushl %edx > + > + cmpl $SVM_VMEXIT_CPUID, 16(%esp) > + jne .Lterminate32 > + > + movl %eax, %ebx /* save CPUID function */ > + > + movl $0, %eax /* request cpuid, get %eax */ > + call vc_cpuid32 > + movl %edx, 12(%esp) > + > + movl $1, %eax /* get %ebx */ > + call vc_cpuid32 > + movl %edx, 8(%esp) > + > + movl $2, %eax /* get %ecx */ > + call vc_cpuid32 
> + movl %edx, 4(%esp) > + > + movl $3, %eax /* get %edx */ > + call vc_cpuid32 > + movl %edx, 0(%esp) > + > + popl %edx > + popl %ecx > + popl %ebx > + popl %eax > + addl $4, %esp > + addl $2, (%esp) > + iret > + > +.Lterminate32: > + movl $MSR_PROTO_TERMINATE, %eax > + movl $MSR_SEV_GHCB, %ecx > + wrmsr > + rep vmmcall > +.Lterm_loop32: > + hlt > + jmp .Lterm_loop32 > + > + > .section .codepatch,"a" > .align 8, 0xcc > .globl codepatch_begin > @@ -757,6 +930,20 @@ codepatch_end: > .previous > > .data > + .globl idtlc /* temporary locore IDT */ > +idtlc: > + .word early_idt_end-early_idt-1 > + .long _RELOC(early_idt) > + .align 64, 0xcc > + > + .globl early_idt > +early_idt: > + .rept NIDT > + .quad 0x0000000000000000 > + .quad 0x0000000000000000 > + .endr > +early_idt_end: > + > .globl gdt64 > gdt64: > .word gdt64_end-gdt64_start-1 > diff --git a/sys/arch/amd64/include/ghcb.h b/sys/arch/amd64/include/ghcb.h > index 954e1fa3e3b..fb4ff389ac4 100644 > --- a/sys/arch/amd64/include/ghcb.h > +++ b/sys/arch/amd64/include/ghcb.h > @@ -19,6 +19,8 @@ > #ifndef _MACHINE_GHCB_H_ > #define _MACHINE_GHCB_H_ > > +#ifndef _LOCORE > + > #include > > #define GHCB_OFFSET(m) ((m) / 8) > @@ -99,6 +101,7 @@ struct ghcb_sync { > int sz_c; > int sz_d; > }; > +#endif /* !_LOCORE */ > > > /* Definitions used with the MSR protocol */ > @@ -107,6 +110,8 @@ struct ghcb_sync { > #define MSR_PROTO_TERMINATE 0x100 > > > +#ifndef _LOCORE > + > void ghcb_clear(struct ghcb_sa *); > int ghcb_valbm_set(uint8_t *, int); > int ghcb_valbm_isset(uint8_t *, int); > @@ -118,4 +123,6 @@ void ghcb_sync_out(struct trapframe *, uint64_t, uint64_t, uint64_t, > struct ghcb_sa *, struct ghcb_sync *); > void ghcb_sync_in(struct trapframe *, struct ghcb_sa *, struct ghcb_sync *); > > +#endif /* !_LOCORE */ > + > #endif /* !_MACHINE_GHCB_H_ */ > diff --git a/sys/arch/amd64/include/trap.h b/sys/arch/amd64/include/trap.h > index fa322ba9566..7506dddf804 100644 > --- a/sys/arch/amd64/include/trap.h > +++ 
b/sys/arch/amd64/include/trap.h > @@ -62,3 +62,4 @@ > #define T_XMM 19 /* SSE FP exception */ > #define T_VE 20 /* virtualization exception */ > #define T_CP 21 /* control protection exception */ > +#define T_VC 29 /* VMM communication exception */ > diff --git a/sys/arch/amd64/include/vmmvar.h b/sys/arch/amd64/include/vmmvar.h > index 92f3eade605..cfbfa604902 100644 > --- a/sys/arch/amd64/include/vmmvar.h > +++ b/sys/arch/amd64/include/vmmvar.h > @@ -21,6 +21,8 @@ > #ifndef _MACHINE_VMMVAR_H_ > #define _MACHINE_VMMVAR_H_ > > +#ifndef _LOCORE > + > #define VMM_HV_SIGNATURE "OpenBSDVMM58" > > /* VMX: Basic Exit Reasons */ > @@ -94,6 +96,8 @@ > #define VMX_MAX_CR3_TARGETS 256 > #define VMX_VMCS_PA_CLEAR 0xFFFFFFFFFFFFFFFFUL > > +#endif /* ! _LOCORE */ > + > /* > * SVM: Intercept codes (exit reasons) > */ > @@ -262,6 +266,8 @@ > #define SVM_VMEXIT_VMGEXIT 0x403 > #define SVM_VMEXIT_INVALID -1 > > +#ifndef _LOCORE > + > /* > * Exception injection vectors (these correspond to the CPU exception types > * defined in the SDM.) > @@ -1057,4 +1063,6 @@ int svm_get_vmsa_pa(uint32_t, uint32_t, uint64_t *); > > #endif /* _KERNEL */ > > +#endif /* ! _LOCORE */ > + > #endif /* ! _MACHINE_VMMVAR_H_ */ > -------------------------------------------------------------------------- > > commit c29b30056940a3f8b2acfd18b734daf60257656a > Author: Hans-Joerg Hoexer > Date: Wed Nov 20 11:16:48 2024 +0100 > > vmd(8): Setup long mode code segment for SEV-ES guests > > Right now vmd(8) starts a kernel image in compatibility mode. > However, SEV-ES enabled guests will trigger #VC traps during locore. > To be able to run such a trap handler, we need a long mode segment. 
> > diff --git a/usr.sbin/vmd/loadfile_elf.c b/usr.sbin/vmd/loadfile_elf.c > index 2b62ca07565..73c2010397d 100644 > --- a/usr.sbin/vmd/loadfile_elf.c > +++ b/usr.sbin/vmd/loadfile_elf.c > @@ -110,7 +110,7 @@ union { > } hdr; > > static void setsegment(struct mem_segment_descriptor *, uint32_t, > - size_t, int, int, int, int); > + size_t, int, int, int, int, int); > static int elf32_exec(gzFile, Elf32_Ehdr *, u_long *, int); > static int elf64_exec(gzFile, Elf64_Ehdr *, u_long *, int); > static size_t create_bios_memmap(struct vm_create_params *, bios_memmap_t *); > @@ -148,7 +148,7 @@ uint64_t pg_crypt = 0; > */ > static void > setsegment(struct mem_segment_descriptor *sd, uint32_t base, size_t limit, > - int type, int dpl, int def32, int gran) > + int type, int dpl, int def32, int gran, int lm) > { > sd->sd_lolimit = (int)limit; > sd->sd_lobase = (int)base; > @@ -157,7 +157,7 @@ setsegment(struct mem_segment_descriptor *sd, uint32_t base, size_t limit, > sd->sd_p = 1; > sd->sd_hilimit = (int)limit >> 16; > sd->sd_avl = 0; > - sd->sd_long = 0; > + sd->sd_long = lm; > sd->sd_def32 = def32; > sd->sd_gran = gran; > sd->sd_hibase = (int)base >> 24; > @@ -185,11 +185,13 @@ push_gdt(void) > * Create three segment descriptors: > * > * GDT[0] : null descriptor. "Created" via memset above. 
> - * GDT[1] (selector @ 0x8): Executable segment, for CS > + * GDT[1] (selector @ 0x8): Executable segment (compat mode), for CS > * GDT[2] (selector @ 0x10): RW Data segment, for DS/ES/SS > + * GDT[3] (selector @ 0x18): Executable segment (long mode), for CS > */ > - setsegment(&sd[1], 0, 0xffffffff, SDT_MEMERA, SEL_KPL, 1, 1); > - setsegment(&sd[2], 0, 0xffffffff, SDT_MEMRWA, SEL_KPL, 1, 1); > + setsegment(&sd[1], 0, 0xffffffff, SDT_MEMERA, SEL_KPL, 1, 1, 0); > + setsegment(&sd[2], 0, 0xffffffff, SDT_MEMRWA, SEL_KPL, 1, 1, 0); > + setsegment(&sd[3], 0, 0xffffffff, SDT_MEMERA, SEL_KPL, 0, 1, 1); > > write_mem(GDT_PAGE, gdtpage, PAGE_SIZE); > sev_register_encryption(GDT_PAGE, PAGE_SIZE); > > > > > > > Julia Template > > Betreff : SEV-ES guest: locore #VC trap handling > Sender : owner-tech+M103616=hshoexer=genua.de@openbsd.org > Envelope Sender : owner-tech+M103616=hshoexer=genua.de@openbsd.org > Sender Name : Hans-Jörg Höxer > Sender Domain : openbsd.org > Message ID : > Mail Size : 24038 > Time : 21.05.2025 17:11:14 (Mi 21 Mai 2025 17:11:14 CEST) > Julia Commands : Keine Kommandos verwendet > > Die Nachricht war signiert. > > Allgemeine Informationen zur Signatur: > Die Signatur der Nachricht war gültig. > Der Nachrichtenumschlag war S/MIME signiert. > > S/MIME-Engine Antworten: > Envelope Signer : /C=DE/O=genua GmbH/CN=Hans-J\xC3\xB6rg H\xC3\xB6xer/L=Kirchheim b. 
M\xC3\xBCnchen/GN=Hans-J\xC3\xB6rg/SN=H\xC3\xB6xer/organizationIdentifier=NTRDE-D2601V.HRB98238/serialNumber=CSM043478563/ST=Bayern > > Info Signatur : CMS Inhaltstyp : pkcs7-signedData (1.2.840.113549.1.7.2) > Verschlüsselter Content-Type : pkcs7-data (1.2.840.113549.1.7.1) > Signatur Zertifikat Fingerprint SHA1: 8A:6A:1A:A3:1A:B4:EA:25:18:45:2E:6E:E2:CE:DC:64:F0:3D:9B:26 > Signatur Zertifikat Schlüssellänge: 4096 (RSA) > Signatur Zertifikat Seriennummer: 74:36:DA:14:8D:EC:3B:D4:F6:35:7A:F3:88:82:CB:FE > Signatur Zertifikat gültig ab: Sep 17 14:21:48 2024 GMT > Signatur Zertifikat gültig bis: Sep 20 14:21:48 2025 GMT > Signatur Zertifikat gültig für die Verschlüsselung: Ja > Signatur Zertifikat gültig für die Signatur: Ja > Signatur Zertifikats CA /C=DE/O=D-Trust GmbH/CN=D-TRUST Application Certificates CA 3-1 2013 > Signatur MD : sha256 (2.16.840.1.101.3.4.2.1) > Signaturalgorithmus : rsaEncryption (1.2.840.113549.1.1.1) > Signaturzeitpunkt: May 21 15:10:29 2025 GMT > Signaturzertifikat enthält nicht die Email-Adresse des Absenders 'owner-tech+M103616=hshoexer=genua.de@openbsd.org' > > MD Signatur : : sha256 (2.16.840.1.101.3.4.2.1) > > Signature Engine Response : > Verify Engine Response : > Verification OK (0) > > Qualified Verify Engine Response : > > > > > > > > Ende des Julia-Templates -- Dr. Hans-Jörg Höxer Hans-Joerg_Hoexer@genua.de Senior Expert Kryptographie eXtreme Kernel and Crypto Development genua GmbH Domagkstrasse 7, 85551 Kirchheim bei München tel +49 89 991950-0, fax -999, www.genua.eu Geschäftsführer: Matthias Ochs, Marc Tesch Amtsgericht München HRB 98238 genua ist ein Unternehmen der Bundesdruckerei-Gruppe.