Download raw body.
SEV-ES guest: locore #VC trap handling
Hi,
this change deals with locore for SEV-ES enabled guests. The approach
might be a bit controversial. And it requires a diff for vmd(8), that
I've also attached, to simplify the discussion:
SEV-ES guest: locore #VC trap handling
When locore is executed by a SEV-ES enabled guest the first cpuid
instruction will raise a #VC trap that will need to be handled.
However, at that point in time the guest does not know whether it's
a guest at all, if it is running on an AMD cpu with SEV-ES enabled,
etc.
To resolve this chicken-egg situation we unconditionally set up a #VC trap handler.
As vmd(8) configures the runtime for locore to be in 32 bit
compatibility mode a raised #VC exception will switch to long mode.
And the CPU will expect a 64 bit entry in the IDT. When running
on e.g. KVM locore is executed in 32 bit legacy mode. There the
CPU will expect a 32 bit entry in the IDT.
To accommodate both situations, we set up both 64 and 32 bit handlers
in the IDT.
Additionally, vmd(8) has to setup a long mode segment in the GDT.
Both #VC trap handlers use the MSR protocol to talk to the hypervisor
to emulate CPUID. The MSR protocol only supports "simple" CPUID
without subfunctions.
Note: When SEV-ES is enabled, the hypervisor can not intercept
writes to EFER beforehand, only after the write. Thus on vmm(4)
with directly executed kernel we are in compatibility mode and
EFER_LMA is set. As resetting EFER_LMA raises #GP we have to
preserve it.
Take care,
HJ.
--------------------------------------------------------------------------
commit 588a7de9576a84062110b29c2c15b9f2cb9ea4c0
Author: Hans-Joerg Hoexer <hshoexer@genua.de>
Date: Tue Aug 6 17:56:55 2024 +0200
SEV-ES guest: locore #VC trap handling
When locore is executed by a SEV-ES enabled guest the first cpuid
instruction will raise a #VC trap that will need to be handled.
However, at that point in time the guest does not know whether it's
a guest at all, if it is running on an AMD cpu with SEV-ES enabled,
etc.
To resolve this chicken-egg situation we unconditionally set up a #VC trap handler.
As vmd(8) configures the runtime for locore to be in 32 bit
compatibility mode a raised #VC exception will switch to long mode.
And the CPU will expect a 64 bit entry in the IDT. When running
on e.g. KVM locore is executed in 32 bit legacy mode. There the
CPU will expect a 32 bit entry in the IDT.
To accommodate both situations, we set up both 64 and 32 bit handlers
in the IDT.
Additionally, vmd(8) has to setup a long mode segment in the GDT.
Both #VC trap handlers use the MSR protocol to talk to the hypervisor
to emulate CPUID. The MSR protocol only supports "simple" CPUID
without subfunctions.
Note: When SEV-ES is enabled, the hypervisor can not intercept
writes to EFER beforehand, only after the write. Thus on vmm(4)
with directly executed kernel we are in compatibility mode and
EFER_LMA is set. As resetting EFER_LMA raises #GP we have to
preserve it.
diff --git a/sys/arch/amd64/amd64/locore0.S b/sys/arch/amd64/amd64/locore0.S
index 4ef1892c7cc..6b4111717a8 100644
--- a/sys/arch/amd64/amd64/locore0.S
+++ b/sys/arch/amd64/amd64/locore0.S
@@ -111,6 +111,9 @@
#include <machine/param.h>
#include <machine/segments.h>
#include <machine/specialreg.h>
+#include <machine/trap.h>
+#include <machine/ghcb.h>
+#include <machine/vmmvar.h>
/*
* override user-land alignment before including asm.h
@@ -193,6 +196,58 @@ bi_size_ok:
pushl $PSL_MBO
popfl
+ /*
+ * Setup temporary #VC trap handler, in case we are running
+ * on an AMD CPU in SEV-ES guest mode. Will be reset by
+ * init_x86_64().
+ * We are setting up two handlers:
+ *
+ * 1) locore_vc_trap32: Triggered when we are running in
+ * 32-bit legacy mode.
+ *
+ * 2) locore_vc_trap64: Triggered when we are running in
+ * 32-bit compatibility mode.
+ *
+ * The latter one is used by vmd(8).
+ */
+ movl $RELOC(early_idt), %ecx
+ movl $T_VC, %edx
+ leal (%ecx, %edx, 8), %ecx /* 32bit #VC IDT slot */
+
+ pushl %cs /* get current %cs */
+ popl %ebx
+ shll $16, %ebx
+
+ movl $RELOC(locore_vc_trap32), %eax
+ andl $0x0000ffff, %eax
+ orl %ebx, %eax /* use current %cs */
+ movl %eax, (%ecx)
+
+ movl $RELOC(locore_vc_trap32), %eax
+ andl $0xffff0000, %eax
+ orl $((0x80 | SDT_SYS386IGT) << 8), %eax
+ movl %eax, 4(%ecx)
+
+ movl $RELOC(early_idt), %ecx
+ movl $(2 * T_VC), %edx
+ leal (%ecx, %edx, 8), %ecx /* 64bit #VC IDT slot */
+
+ movl $RELOC(locore_vc_trap64), %eax
+ andl $0x0000ffff, %eax
+ orl $(GSEL(3, SEL_KPL) << 16), %eax
+ movl %eax, (%ecx)
+
+ movl $RELOC(locore_vc_trap64), %eax
+ andl $0xffff0000, %eax
+ orl $((0x80 | SDT_SYS386IGT) << 8), %eax
+ movl %eax, 4(%ecx)
+ xorl %eax, %eax
+ movl %eax, 8(%ecx)
+ movl %eax, 12(%ecx)
+
+ movl $RELOC(idtlc), %eax
+ lidt (%eax)
+
/* Reset debug control registers */
xorl %eax,%eax
movl %eax,%dr6
@@ -631,8 +686,14 @@ store_pte:
*/
movl $MSR_EFER,%ecx
rdmsr
+ movl %eax,%ebx
xorl %eax,%eax /* XXX */
orl $(EFER_LME|EFER_SCE),%eax
+ /* If set, preserve LMA */
+ testl $EFER_LMA,%ebx
+ jz efer_nxe
+ orl $EFER_LMA,%eax
+efer_nxe:
movl RELOC((pg_nx + 4)), %ebx
cmpl $0, %ebx
je write_efer
@@ -745,6 +806,118 @@ longmode_hi:
call init_x86_64
call main
+vc_cpuid64:
+ shll $30, %eax /* requested register */
+ orl $MSR_PROTO_CPUID_REQ, %eax
+ movl %ebx, %edx /* CPUID function */
+ movl $MSR_SEV_GHCB, %ecx
+ wrmsr
+ rep vmmcall
+ rdmsr
+ ret
+
+ .globl locore_vc_trap64
+locore_vc_trap64:
+ pushq %rax
+ pushq %rbx
+ pushq %rcx
+ pushq %rdx
+
+ cmpl $SVM_VMEXIT_CPUID, 32(%rsp)
+ jne .Lterminate64
+
+ movl %eax, %ebx /* save CPUID function */
+
+ movl $0, %eax /* request cpuid, get %eax */
+ call vc_cpuid64
+ movq %rdx, 24(%rsp)
+
+ movl $1, %eax /* get %ebx */
+ call vc_cpuid64
+ movq %rdx, 16(%rsp)
+
+ movl $2, %eax /* get %ecx */
+ call vc_cpuid64
+ movq %rdx, 8(%rsp)
+
+ movl $3, %eax /* get %edx */
+ call vc_cpuid64
+ movq %rdx, 0(%rsp)
+
+ popq %rdx
+ popq %rcx
+ popq %rbx
+ popq %rax
+ addq $8, %rsp
+ addq $2, (%rsp)
+ iretq
+
+.Lterminate64:
+ movl $MSR_PROTO_TERMINATE, %eax
+ movl $MSR_SEV_GHCB, %ecx
+ wrmsr
+ rep vmmcall
+.Lterm_loop64:
+ hlt
+ jmp .Lterm_loop64
+
+ .code32
+vc_cpuid32:
+ shll $30, %eax /* requested register */
+ orl $MSR_PROTO_CPUID_REQ, %eax
+ movl %ebx, %edx /* CPUID function */
+ movl $MSR_SEV_GHCB, %ecx
+ wrmsr
+ rep vmmcall
+ rdmsr
+ ret
+
+ .globl locore_vc_trap32
+locore_vc_trap32:
+ pushl %eax
+ pushl %ebx
+ pushl %ecx
+ pushl %edx
+
+ cmpl $SVM_VMEXIT_CPUID, 16(%esp)
+ jne .Lterminate32
+
+ movl %eax, %ebx /* save CPUID function */
+
+ movl $0, %eax /* request cpuid, get %eax */
+ call vc_cpuid32
+ movl %edx, 12(%esp)
+
+ movl $1, %eax /* get %ebx */
+ call vc_cpuid32
+ movl %edx, 8(%esp)
+
+ movl $2, %eax /* get %ecx */
+ call vc_cpuid32
+ movl %edx, 4(%esp)
+
+ movl $3, %eax /* get %edx */
+ call vc_cpuid32
+ movl %edx, 0(%esp)
+
+ popl %edx
+ popl %ecx
+ popl %ebx
+ popl %eax
+ addl $4, %esp
+ addl $2, (%esp)
+ iret
+
+.Lterminate32:
+ movl $MSR_PROTO_TERMINATE, %eax
+ movl $MSR_SEV_GHCB, %ecx
+ wrmsr
+ rep vmmcall
+.Lterm_loop32:
+ hlt
+ jmp .Lterm_loop32
+
+
.section .codepatch,"a"
.align 8, 0xcc
.globl codepatch_begin
@@ -757,6 +930,20 @@ codepatch_end:
.previous
.data
+ .globl idtlc /* temporary locore IDT */
+idtlc:
+ .word early_idt_end-early_idt-1
+ .long _RELOC(early_idt)
+ .align 64, 0xcc
+
+ .globl early_idt
+early_idt:
+ .rept NIDT
+ .quad 0x0000000000000000
+ .quad 0x0000000000000000
+ .endr
+early_idt_end:
+
.globl gdt64
gdt64:
.word gdt64_end-gdt64_start-1
diff --git a/sys/arch/amd64/include/ghcb.h b/sys/arch/amd64/include/ghcb.h
index 954e1fa3e3b..fb4ff389ac4 100644
--- a/sys/arch/amd64/include/ghcb.h
+++ b/sys/arch/amd64/include/ghcb.h
@@ -19,6 +19,8 @@
#ifndef _MACHINE_GHCB_H_
#define _MACHINE_GHCB_H_
+#ifndef _LOCORE
+
#include <machine/frame.h>
#define GHCB_OFFSET(m) ((m) / 8)
@@ -99,6 +101,7 @@ struct ghcb_sync {
int sz_c;
int sz_d;
};
+#endif /* !_LOCORE */
/* Definitions used with the MSR protocol */
@@ -107,6 +110,8 @@ struct ghcb_sync {
#define MSR_PROTO_TERMINATE 0x100
+#ifndef _LOCORE
+
void ghcb_clear(struct ghcb_sa *);
int ghcb_valbm_set(uint8_t *, int);
int ghcb_valbm_isset(uint8_t *, int);
@@ -118,4 +123,6 @@ void ghcb_sync_out(struct trapframe *, uint64_t, uint64_t, uint64_t,
struct ghcb_sa *, struct ghcb_sync *);
void ghcb_sync_in(struct trapframe *, struct ghcb_sa *, struct ghcb_sync *);
+#endif /* !_LOCORE */
+
#endif /* !_MACHINE_GHCB_H_ */
diff --git a/sys/arch/amd64/include/trap.h b/sys/arch/amd64/include/trap.h
index fa322ba9566..7506dddf804 100644
--- a/sys/arch/amd64/include/trap.h
+++ b/sys/arch/amd64/include/trap.h
@@ -62,3 +62,4 @@
#define T_XMM 19 /* SSE FP exception */
#define T_VE 20 /* virtualization exception */
#define T_CP 21 /* control protection exception */
+#define T_VC 29 /* VMM communication exception */
diff --git a/sys/arch/amd64/include/vmmvar.h b/sys/arch/amd64/include/vmmvar.h
index 92f3eade605..cfbfa604902 100644
--- a/sys/arch/amd64/include/vmmvar.h
+++ b/sys/arch/amd64/include/vmmvar.h
@@ -21,6 +21,8 @@
#ifndef _MACHINE_VMMVAR_H_
#define _MACHINE_VMMVAR_H_
+#ifndef _LOCORE
+
#define VMM_HV_SIGNATURE "OpenBSDVMM58"
/* VMX: Basic Exit Reasons */
@@ -94,6 +96,8 @@
#define VMX_MAX_CR3_TARGETS 256
#define VMX_VMCS_PA_CLEAR 0xFFFFFFFFFFFFFFFFUL
+#endif /* ! _LOCORE */
+
/*
* SVM: Intercept codes (exit reasons)
*/
@@ -262,6 +266,8 @@
#define SVM_VMEXIT_VMGEXIT 0x403
#define SVM_VMEXIT_INVALID -1
+#ifndef _LOCORE
+
/*
* Exception injection vectors (these correspond to the CPU exception types
* defined in the SDM.)
@@ -1057,4 +1063,6 @@ int svm_get_vmsa_pa(uint32_t, uint32_t, uint64_t *);
#endif /* _KERNEL */
+#endif /* ! _LOCORE */
+
#endif /* ! _MACHINE_VMMVAR_H_ */
--------------------------------------------------------------------------
commit c29b30056940a3f8b2acfd18b734daf60257656a
Author: Hans-Joerg Hoexer <hshoexer@genua.de>
Date: Wed Nov 20 11:16:48 2024 +0100
vmd(8): Setup long mode code segment for SEV-ES guests
Right now vmd(8) starts a kernel image in compatibility mode.
However, SEV-ES enabled guest will trigger #VC traps during locore.
To be able to run such a trap handler, we need a long mode segment.
diff --git a/usr.sbin/vmd/loadfile_elf.c b/usr.sbin/vmd/loadfile_elf.c
index 2b62ca07565..73c2010397d 100644
--- a/usr.sbin/vmd/loadfile_elf.c
+++ b/usr.sbin/vmd/loadfile_elf.c
@@ -110,7 +110,7 @@ union {
} hdr;
static void setsegment(struct mem_segment_descriptor *, uint32_t,
- size_t, int, int, int, int);
+ size_t, int, int, int, int, int);
static int elf32_exec(gzFile, Elf32_Ehdr *, u_long *, int);
static int elf64_exec(gzFile, Elf64_Ehdr *, u_long *, int);
static size_t create_bios_memmap(struct vm_create_params *, bios_memmap_t *);
@@ -148,7 +148,7 @@ uint64_t pg_crypt = 0;
*/
static void
setsegment(struct mem_segment_descriptor *sd, uint32_t base, size_t limit,
- int type, int dpl, int def32, int gran)
+ int type, int dpl, int def32, int gran, int lm)
{
sd->sd_lolimit = (int)limit;
sd->sd_lobase = (int)base;
@@ -157,7 +157,7 @@ setsegment(struct mem_segment_descriptor *sd, uint32_t base, size_t limit,
sd->sd_p = 1;
sd->sd_hilimit = (int)limit >> 16;
sd->sd_avl = 0;
- sd->sd_long = 0;
+ sd->sd_long = lm;
sd->sd_def32 = def32;
sd->sd_gran = gran;
sd->sd_hibase = (int)base >> 24;
@@ -185,11 +185,13 @@ push_gdt(void)
* Create three segment descriptors:
*
* GDT[0] : null descriptor. "Created" via memset above.
- * GDT[1] (selector @ 0x8): Executable segment, for CS
+ * GDT[1] (selector @ 0x8): Executable segment (compat mode), for CS
* GDT[2] (selector @ 0x10): RW Data segment, for DS/ES/SS
+ * GDT[3] (selector @ 0x18): Executable segment (long mode), for CS
*/
- setsegment(&sd[1], 0, 0xffffffff, SDT_MEMERA, SEL_KPL, 1, 1);
- setsegment(&sd[2], 0, 0xffffffff, SDT_MEMRWA, SEL_KPL, 1, 1);
+ setsegment(&sd[1], 0, 0xffffffff, SDT_MEMERA, SEL_KPL, 1, 1, 0);
+ setsegment(&sd[2], 0, 0xffffffff, SDT_MEMRWA, SEL_KPL, 1, 1, 0);
+ setsegment(&sd[3], 0, 0xffffffff, SDT_MEMERA, SEL_KPL, 0, 1, 1);
write_mem(GDT_PAGE, gdtpage, PAGE_SIZE);
sev_register_encryption(GDT_PAGE, PAGE_SIZE);
SEV-ES guest: locore #VC trap handling