SEV-ES guest: locore #VC trap handling
On Sun, Jun 15, 2025 at 04:55:12PM +0200, Alexander Bluhm wrote:
> On Sat, Jun 14, 2025 at 10:23:53PM -0700, Mike Larkin wrote:
> > On Wed, May 21, 2025 at 05:10:27PM +0200, Hans-Jörg Höxer wrote:
> > > Hi,
> > >
> > > this change deals with locore for SEV-ES enabled guests. The approach
> > > might be a bit controversial. And it requires a diff for vmd(8), that
> > > I've also attached, to simplify the discussion:
> > >
> > > SEV-ES guest: locore #VC trap handling
> > >
> > > When locore is executed by a SEV-ES enabled guest, the first cpuid
> > > instruction will raise a #VC trap that needs to be handled.
> > > However, at that point the guest does not yet know whether it is
> > > a guest at all, whether it is running on an AMD CPU with SEV-ES
> > > enabled, etc.
> > >
> > > To resolve this chicken-and-egg situation, we unconditionally set up
> > > a temporary #VC trap handler in the IDT.
> > >
> > > As vmd(8) configures the runtime environment for locore to be 32-bit
> > > compatibility mode, a raised #VC exception will switch to long mode,
> > > and the CPU will expect a 64-bit entry in the IDT. When running on
> > > e.g. KVM, locore is executed in 32-bit legacy mode; there the CPU
> > > will expect a 32-bit entry in the IDT.
> > >
> > > To accommodate both situations, we set up both a 64-bit and a 32-bit
> > > handler in the IDT.
> > >
> > > Additionally, vmd(8) has to set up a long mode segment in the GDT.
> > >
> > > Both #VC trap handlers use the MSR protocol to talk to the hypervisor
> > > to emulate CPUID. The MSR protocol only supports "simple" CPUID
> > > without subfunctions.
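For reference, the handlers added below build the request like this: the
CPUID function number goes in the upper 32 bits of the GHCB MSR, the
requested register index (0=eax .. 3=edx) in bits 31:30, and the request
code 0x4 in the low bits; the answer comes back in the upper 32 bits. A
rough C equivalent of one round trip (the function name is mine; wrmsr,
rdmsr, vmgexit and the constants are from the diff):

	static uint64_t
	msrproto_cpuid(uint32_t func, int reg)	/* reg: 0=eax .. 3=edx */
	{
		uint64_t req;

		req = ((uint64_t)func << 32) |		/* CPUID function */
		    ((uint64_t)reg << 30) |		/* register index */
		    MSRPROTO_CPUID_REQ;			/* request code 0x4 */
		wrmsr(MSR_SEV_GHCB, req);
		vmgexit();				/* rep vmmcall */
		return (rdmsr(MSR_SEV_GHCB) >> 32);	/* result */
	}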
> > >
> > > Note: When SEV-ES is enabled, the hypervisor cannot intercept
> > > writes to EFER beforehand, only after the write. Thus on vmm(4)
> > > with a directly executed kernel we are in compatibility mode and
> > > EFER_LMA is set. As clearing EFER_LMA raises #GP, we have to
> > > preserve it.
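In C terms that boils down to a read-modify-write instead of a blind
write; a sketch of what the locore0.S hunk below now does:

	uint64_t efer;

	efer = rdmsr(MSR_EFER);		/* EFER_LMA may already be set */
	wrmsr(MSR_EFER, efer | EFER_LME | EFER_SCE);	/* keep LMA */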
> > >
> > > Take care,
> > > HJ.
> > >
> >
> > This one probably needs a lot of testing. What tests have you guys done so
> > far? My SEV-ES machine is available now; I will test this and let you
> > know, but we probably want a lot of testing from different host/guest
> > combos.
>
> I know that this part does not break SEV (without ES) on vmm/vmd
> and kvm/qemu.
>
> Below is the complete diff that hshoexer@ sent some months ago,
> rebased to current. With that I can run an OpenBSD guest with SEV-ES
> in vmm/vmd. On kvm/qemu SEV still works, but I cannot get a guest
> running with SEV-ES.
>
> So if you agree with the locore0 part that was in the original
> mail of this thread, I would like to commit it.
>
> bluhm
>
I think this diff needs a fair amount of work before it can go in.
See below for comments. This is going to require a ton of testing. How do
you want to proceed? For example, I think this needs to be tested as a
guest on a number of SEV/SEV-ES capable hypervisors and a wide variety
of hardware. You already mentioned one breakage above, right?
-ml
> Index: sys/arch/amd64/amd64/ghcb.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/arch/amd64/amd64/ghcb.c,v
> diff -u -p -r1.1 ghcb.c
> --- sys/arch/amd64/amd64/ghcb.c 24 May 2025 12:47:00 -0000 1.1
> +++ sys/arch/amd64/amd64/ghcb.c 15 Jun 2025 11:03:02 -0000
> @@ -28,6 +28,9 @@ const uint64_t ghcb_sz_masks[] = {
> 0x00000000ffffffffULL, 0xffffffffffffffffULL
> };
>
> +vaddr_t ghcb_vaddr;
> +paddr_t ghcb_paddr;
> +
> /*
> * ghcb_clear
> *
> @@ -92,13 +95,28 @@ ghcb_valid(struct ghcb_sa *ghcb)
> /*
> * ghcb_verify_bm
> *
> - * To be verified positive, the given expected bitmap must be at
> - * least a subset of the provided valid bitmap.
> - * Used by host and guest.
> + * To be verified positive, the given valid bitmap must exactly
> + * match the expected bitmap.
> + * Used by host only.
> */
> int
> ghcb_verify_bm(uint8_t *valid_bm, uint8_t *expected_bm)
> {
> + return (memcmp(valid_bm, expected_bm, GHCB_VB_SZ));
> +}
> +
> +/*
> + * ghcb_verify_bm_guest
> + *
> + * To be verified positive, the given expected bitmap must be at
> + * least a subset of the provided valid bitmap. This ensures, the
> + * host provides at least the information requested by the guest.
> + * Used by guest only.
> + * This is required for running on a Linux/KVM host.
> + */
> +int
> +ghcb_verify_bm_guest(uint8_t *valid_bm, uint8_t *expected_bm)
> +{
> return ((ghcb_valbm_isset(expected_bm, GHCB_RAX) &&
> !ghcb_valbm_isset(valid_bm, GHCB_RAX)) ||
> (ghcb_valbm_isset(expected_bm, GHCB_RBX) &&
> @@ -107,18 +125,10 @@ ghcb_verify_bm(uint8_t *valid_bm, uint8_
> !ghcb_valbm_isset(valid_bm, GHCB_RCX)) ||
> (ghcb_valbm_isset(expected_bm, GHCB_RDX) &&
> !ghcb_valbm_isset(valid_bm, GHCB_RDX)) ||
> - (ghcb_valbm_isset(expected_bm, GHCB_SW_EXITCODE) &&
> - !ghcb_valbm_isset(valid_bm, GHCB_SW_EXITCODE)) ||
> (ghcb_valbm_isset(expected_bm, GHCB_SW_EXITINFO1) &&
> !ghcb_valbm_isset(valid_bm, GHCB_SW_EXITINFO1)) ||
> (ghcb_valbm_isset(expected_bm, GHCB_SW_EXITINFO2) &&
> - !ghcb_valbm_isset(valid_bm, GHCB_SW_EXITINFO2)) ||
> - (ghcb_valbm_isset(expected_bm, GHCB_SW_SCRATCH) &&
> - !ghcb_valbm_isset(valid_bm, GHCB_SW_SCRATCH)) ||
> - (ghcb_valbm_isset(expected_bm, GHCB_XCR0) &&
> - !ghcb_valbm_isset(valid_bm, GHCB_XCR0)) ||
> - (ghcb_valbm_isset(expected_bm, GHCB_XSS) &&
> - !ghcb_valbm_isset(valid_bm, GHCB_XSS)));
> + !ghcb_valbm_isset(valid_bm, GHCB_SW_EXITINFO2)));
> }
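A concrete example of the asymmetry: if the guest expects {RAX} and the
host marks {RAX,RBX} valid, ghcb_verify_bm_guest() accepts it (a superset
is fine), while the host-side memcmp() in ghcb_verify_bm() would reject
the same pair. That seems to be exactly what a Linux/KVM host needs.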
>
> /*
> Index: sys/arch/amd64/amd64/identcpu.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/arch/amd64/amd64/identcpu.c,v
> diff -u -p -r1.150 identcpu.c
> --- sys/arch/amd64/amd64/identcpu.c 29 Apr 2025 20:19:48 -0000 1.150
> +++ sys/arch/amd64/amd64/identcpu.c 15 Jun 2025 11:03:01 -0000
> @@ -67,6 +67,7 @@ int cpuspeed;
>
> int amd64_has_xcrypt;
> int amd64_pos_cbit; /* C bit position for SEV */
> +/* Minimum ASID value for an SEV enabled, SEV-ES disabled guest. */
> int amd64_min_noes_asid;
> int has_rdrand;
> int has_rdseed;
> @@ -712,6 +713,10 @@ identifycpu(struct cpu_info *ci)
> CPUID_AMDSEV_EDX_BITS);
> amd64_pos_cbit = (ci->ci_feature_amdsev_ebx & 0x3f);
> amd64_min_noes_asid = ci->ci_feature_amdsev_edx;
> + if (cpu_sev_guestmode && CPU_IS_PRIMARY(ci))
> + printf("\n%s: SEV%s guest mode", ci->ci_dev->dv_xname,
> + ISSET(cpu_sev_guestmode, SEV_STAT_ES_ENABLED) ?
> + "-ES" : "");
> }
>
> printf("\n");
> Index: sys/arch/amd64/amd64/locore0.S
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/arch/amd64/amd64/locore0.S,v
> diff -u -p -r1.27 locore0.S
> --- sys/arch/amd64/amd64/locore0.S 5 May 2025 23:02:39 -0000 1.27
> +++ sys/arch/amd64/amd64/locore0.S 15 Jun 2025 11:03:01 -0000
> @@ -111,6 +111,7 @@
> #include <machine/param.h>
> #include <machine/segments.h>
> #include <machine/specialreg.h>
> +#include <machine/trap.h>
>
> /*
> * override user-land alignment before including asm.h
> @@ -193,6 +194,58 @@ bi_size_ok:
> pushl $PSL_MBO
> popfl
>
> + /*
> + * Setup temporary #VC trap handler, in case we are running
> + * on an AMD CPU in SEV-ES guest mode. Will be reset by
> + * init_x86_64().
> + * We are setting up two handlers:
> + *
> + * 1) locore_vc_trap32: Triggered when we are running in
> + * 32-bit legacy mode.
> + *
> + * 2) locore_vc_trap64: Triggered when we are running in
> + * 32-bit compatibility mode.
> + *
> + * The latter one is used by vmd(8).
Please clarify; *when* is this used by vmd? I believe you mean when
we do a direct kernel launch? If not, then why do we need the 32 bit
one?
> + */
> + movl $RELOC(early_idt), %ecx
> + movl $T_VC, %edx
> + leal (%ecx, %edx, 8), %ecx /* 32bit #VC IDT slot */
> +
> + pushl %cs /* get current %cs */
> + popl %ebx
> + shll $16, %ebx
> +
> + movl $RELOC(locore_vc_trap32), %eax
> + andl $0x0000ffff, %eax
> + orl %ebx, %eax /* use current %cs */
> + movl %eax, (%ecx)
> +
> + movl $RELOC(locore_vc_trap32), %eax
> + andl $0xffff0000, %eax
> + orl $((0x80 | SDT_SYS386IGT) << 8), %eax
> + movl %eax, 4(%ecx)
> +
> + movl $RELOC(early_idt), %ecx
> + movl $(2 * T_VC), %edx
> + leal (%ecx, %edx, 8), %ecx /* 64bit #VC IDT slot */
> +
> + movl $RELOC(locore_vc_trap64), %eax
> + andl $0x0000ffff, %eax
> + orl $(GSEL(3, SEL_KPL) << 16), %eax
> + movl %eax, (%ecx)
> +
> + movl $RELOC(locore_vc_trap64), %eax
> + andl $0xffff0000, %eax
> + orl $((0x80 | SDT_SYS386IGT) << 8), %eax
> + movl %eax, 4(%ecx)
> + xorl %eax, %eax
> + movl %eax, 8(%ecx)
> + movl %eax, 12(%ecx)
> +
> + movl $RELOC(idtlc), %eax
> + lidt (%eax)
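(The $(2 * T_VC) indexing works because long mode IDT descriptors are 16
bytes rather than 8, so the 64-bit gate for vector T_VC sits at byte
offset T_VC * 16 = (2 * T_VC) * 8 in the same table. The two zeroed
words are the upper 32 bits of the handler offset and the reserved field
of the 16-byte gate.)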
> +
> /* Reset debug control registers */
> xorl %eax,%eax
> movl %eax,%dr6
> @@ -293,8 +346,9 @@ cont:
> /* Are we in guest mode with SEV enabled? */
> movl $MSR_SEV_STATUS, %ecx
> rdmsr
> - andl $SEV_STAT_ENABLED, %eax
> + testl $SEV_STAT_ENABLED, %eax
> jz .Lno_sev
> + movl %eax, RELOC(cpu_sev_guestmode) /* we are a SEV guest */
>
> /* Determine C bit position */
> movl %ebx, %ecx /* %ebx from previous cpuid */
> @@ -337,8 +391,6 @@ cont:
> andl %eax, RELOC(pg_frame + 4) /* apply mask */
> andl %eax, RELOC(pg_lgframe + 4)
>
> - movl $0x1, RELOC(cpu_sev_guestmode) /* we are a SEV guest */
> -
> .Lno_sev:
>
> /*
> @@ -384,7 +436,9 @@ cont:
> #define PROC0_DMP2_OFF (PROC0_DMP3_OFF + NDML3_ENTRIES * NBPG)
> #define TABLESIZE \
> ((NKL4_KIMG_ENTRIES + TABLE_L3_ENTRIES + TABLE_L2_ENTRIES + 1 + UPAGES + \
> - NDML3_ENTRIES + NDML2_ENTRIES + 3) * NBPG)
> + NDML3_ENTRIES + NDML2_ENTRIES + 2 + 3) * NBPG)
Can this just be '5' ?
> +#define PROC0_GHCB_OFF (TABLESIZE - 5 * NBPG)
> +#define GHCB_SIZE (2 * NBPG)
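(Reading the layout: the "2" is the two new GHCB pages and the "3" the
pre-existing trailing pages, so PROC0_GHCB_OFF at TABLESIZE - 5 * NBPG
puts the GHCB directly below those three pages. Numerically 2 + 3 is of
course 5; presumably it is spelled out to keep the two allocations
distinct.)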
>
> #define fillkpt \
> pushl %ebp ; /* save */ \
> @@ -408,6 +462,17 @@ cont:
> loop 1b ; /* till finished */ \
> popl %ebp
>
> +
> +#define fillkpt_nx_nc \
> + pushl %ebp ; /* save */ \
> + movl RELOC((pg_nx + 4)), %ebp ; /* NX bit? */ \
> +1: movl %eax,(%ebx) ; /* store phys addr */ \
> + movl %ebp,4(%ebx) ; /* upper 32 bits */ \
> + addl $8,%ebx ; /* next pte/pde */ \
> + addl $NBPG,%eax ; /* next phys page */ \
> + loop 1b ; /* till finished */ \
> + popl %ebp
> +
> /* Find end of kernel image. */
> movl $RELOC(end),%edi
> #if (NKSYMS || defined(DDB))
> @@ -514,6 +579,16 @@ map_tables:
> shrl $PGSHIFT,%ecx
> fillkpt_nx
>
> + /* Re-Map GHCB shared (ie. unencrypted) */
> + /* XXX hshoexer: Only in SEV-ES guestmode. */
Can we fix the XXXs in this diff before committing please?
> + pushl %ebx /* save current slot */
> + subl $(5 << 3),%ebx /* move back to slot of GHCB */
> + leal (PROC0_GHCB_OFF)(%esi),%eax
> + orl $(PG_V|PG_KW), %eax
> + movl $(GHCB_SIZE>>PGSHIFT), %ecx
> + fillkpt_nx_nc
> + popl %ebx /* continue with slot saved above */
> +
> /* Map ISA I/O mem (later atdevbase) RW, NX */
> movl $(IOM_BEGIN|PG_V|PG_KW/*|PG_N*/),%eax
> movl $(IOM_SIZE>>PGSHIFT),%ecx
> @@ -631,7 +706,6 @@ store_pte:
> */
> movl $MSR_EFER,%ecx
> rdmsr
> - xorl %eax,%eax /* XXX */
> orl $(EFER_LME|EFER_SCE),%eax
> movl RELOC((pg_nx + 4)), %ebx
> cmpl $0, %ebx
> @@ -717,6 +791,12 @@ longmode_hi:
> addq %rsi,%rdx
> movq %rdx,atdevbase(%rip)
>
> + /* Relocate GHCB. */
> + /* XXX hshoexer: Only in SEV-ES guestmode. */
See previous.
> + movq $(PROC0_GHCB_OFF+KERNBASE),%rdx
> + addq %rsi,%rdx
> + movq %rdx,ghcb_vaddr(%rip)
> +
> /* Record start of symbols */
> movq $__kernel_bss_end, ssym(%rip)
>
> @@ -739,12 +819,131 @@ longmode_hi:
> movw %ax,%fs
>
> leaq TABLESIZE(%rsi),%rdi
> + subq $(NBPG*2), %rdi
> subq $(NBPG*3), %rdi
Can these be combined to NBPG * 5 ?
>
> /* XXX merge these */
> call init_x86_64
> call main
>
> + /* MSR Protocol Request Codes */
> +#define MSRPROTO_CPUID_REQ 0x4
> +#define MSRPROTO_TERM_REQ 0x100
> +
> +vc_cpuid64:
> + shll $30, %eax /* requested register */
> + orl $MSRPROTO_CPUID_REQ, %eax
> + movl %ebx, %edx /* CPUID function */
> + movl $MSR_SEV_GHCB, %ecx
> + wrmsr
> + rep vmmcall
Out of curiosity, why is the rep prefix needed here?
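(Presumably because the AMD APM encodes VMGEXIT as REP VMMCALL, i.e.
F3 0F 01 D9; assemblers without the VMGEXIT mnemonic need the prefixed
form, and without SEV-ES it simply acts as a plain VMMCALL.)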
> + rdmsr
> + ret
> +
> + .globl locore_vc_trap64
> +locore_vc_trap64:
> + pushq %rax
> + pushq %rbx
> + pushq %rcx
> + pushq %rdx
> +
> +#define SVM_VMEXIT_CPUID 0x72
> + cmpl $SVM_VMEXIT_CPUID, 32(%rsp)
> + jne .Lterminate64
> +
> + movl %eax, %ebx /* save CPUID function */
> +
> + movl $0, %eax /* request cpuid, get %eax */
> + call vc_cpuid64
> + movq %rdx, 24(%rsp)
> +
> + movl $1, %eax /* get %ebx */
> + call vc_cpuid64
> + movq %rdx, 16(%rsp)
> +
> + movl $2, %eax /* get %ecx */
> + call vc_cpuid64
> + movq %rdx, 8(%rsp)
> +
> + movl $3, %eax /* get %edx */
> + call vc_cpuid64
> + movq %rdx, 0(%rsp)
> +
> + popq %rdx
> + popq %rcx
> + popq %rbx
> + popq %rax
> + addq $8, %rsp
> + addq $2, (%rsp)
> + iretq
> +
> +.Lterminate64:
> + movl $MSRPROTO_TERM_REQ, %eax
> + movl $MSR_SEV_GHCB, %ecx
> + wrmsr
> + rep vmmcall
> +.Lterm_loop64:
> + hlt
> + jmp .Lterm_loop64
> +
> + .code32
> +vc_cpuid32:
> + shll $30, %eax /* requested register */
> + orl $MSRPROTO_CPUID_REQ, %eax
> + movl %ebx, %edx /* CPUID function */
> + movl $MSR_SEV_GHCB, %ecx
> + wrmsr
> + rep vmmcall
> + rdmsr
> + ret
> +
> + .globl locore_vc_trap32
> +locore_vc_trap32:
> + pushl %eax
> + pushl %ebx
> + pushl %ecx
> + pushl %edx
> +
> +#define SVM_VMEXIT_CPUID 0x72
> + cmpl $SVM_VMEXIT_CPUID, 16(%esp)
> + jne .Lterminate32
> +
> + movl %eax, %ebx /* save CPUID function */
> +
> + movl $0, %eax /* request cpuid, get %eax */
> + call vc_cpuid32
> + movl %edx, 12(%esp)
> +
> + movl $1, %eax /* get %ebx */
> + call vc_cpuid32
> + movl %edx, 8(%esp)
> +
> + movl $2, %eax /* get %ecx */
> + call vc_cpuid32
> + movl %edx, 4(%esp)
> +
> + movl $3, %eax /* get %edx */
> + call vc_cpuid32
> + movl %edx, 0(%esp)
> +
> + popl %edx
> + popl %ecx
> + popl %ebx
> + popl %eax
> + addl $4, %esp
> + addl $2, (%esp)
> + iret
> +
> +.Lterminate32:
> + movl $MSRPROTO_TERM_REQ, %eax
> + movl $MSR_SEV_GHCB, %ecx
> + wrmsr
> + rep vmmcall
> +.Lterm_loop32:
> + hlt
> + jmp .Lterm_loop32
> +
> +
> .section .codepatch,"a"
> .align 8, 0xcc
> .globl codepatch_begin
> @@ -757,6 +956,20 @@ codepatch_end:
> .previous
>
> .data
> + .globl idtlc /* temporary locore IDT */
> +idtlc:
> + .word early_idt_end-early_idt-1
> + .long _RELOC(early_idt)
> + .align 64, 0xcc
> +
> + .globl early_idt
> +early_idt:
> + .rept NIDT
> + .quad 0x0000000000000000
> + .quad 0x0000000000000000
> + .endr
> +early_idt_end:
> +
> .globl gdt64
> gdt64:
> .word gdt64_end-gdt64_start-1
> Index: sys/arch/amd64/amd64/machdep.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/arch/amd64/amd64/machdep.c,v
> diff -u -p -r1.299 machdep.c
> --- sys/arch/amd64/amd64/machdep.c 21 May 2025 04:11:57 -0000 1.299
> +++ sys/arch/amd64/amd64/machdep.c 15 Jun 2025 11:03:01 -0000
> @@ -100,6 +100,7 @@
> #include <machine/mpbiosvar.h>
> #include <machine/kcore.h>
> #include <machine/tss.h>
> +#include <machine/ghcb.h>
>
> #include <dev/isa/isareg.h>
> #include <dev/ic/i8042reg.h>
> @@ -491,6 +492,7 @@ bios_sysctl(int *name, u_int namelen, vo
> extern int tsc_is_invariant;
> extern int amd64_has_xcrypt;
> extern int need_retpoline;
> +extern int cpu_sev_guestmode;
>
> const struct sysctl_bounded_args cpuctl_vars[] = {
> { CPU_LIDACTION, &lid_action, -1, 2 },
> @@ -500,6 +502,7 @@ const struct sysctl_bounded_args cpuctl_
> { CPU_XCRYPT, &amd64_has_xcrypt, SYSCTL_INT_READONLY },
> { CPU_INVARIANTTSC, &tsc_is_invariant, SYSCTL_INT_READONLY },
> { CPU_RETPOLINE, &need_retpoline, SYSCTL_INT_READONLY },
> + { CPU_SEVGUESTMODE, &cpu_sev_guestmode, SYSCTL_INT_READONLY },
> };
>
> /*
> @@ -1314,6 +1317,38 @@ cpu_init_idt(void)
> 	lidt(&region);
> }
>
> +uint64_t early_gdt[GDT_SIZE / 8];
> +
> +void
> +cpu_init_early_vctrap(paddr_t addr)
> +{
> + struct region_descriptor region;
> +
> + extern struct region_descriptor gdt64;
> + extern struct gate_descriptor early_idt[NIDT];
> + extern void Xvctrap_early(void);
> +
> + /* Setup temporary "early" longmode GDT, will be reset soon */
> + memset(early_gdt, 0, sizeof(early_gdt));
> + set_mem_segment(GDT_ADDR_MEM(early_gdt, GCODE_SEL), 0, 0xfffff,
> + SDT_MEMERA, SEL_KPL, 1, 0, 1);
> + set_mem_segment(GDT_ADDR_MEM(early_gdt, GDATA_SEL), 0, 0xfffff,
> + SDT_MEMRWA, SEL_KPL, 1, 0, 1);
> +	setregion(&region, early_gdt, GDT_SIZE - 1);
> +	lgdt(&region);
> +
> + /* Setup temporary "early" longmode #VC entry, will be reset soon */
> + setgate(&early_idt[T_VC], Xvctrap_early, 0, SDT_SYS386IGT,
> + SEL_KPL, GSEL(GCODE_SEL, SEL_KPL));
> +	setregion(&region, early_idt, NIDT * sizeof(idt[0]) - 1);
> +	lidt(&region);
> +
> + /* Tell vmm(4) about our GHCB. */
> + ghcb_paddr = addr;
> + memset((void *)ghcb_vaddr, 0, 2 * PAGE_SIZE);
> + wrmsr(MSR_SEV_GHCB, ghcb_paddr);
> +}
> +
> void
> cpu_init_extents(void)
> {
> @@ -1433,6 +1468,13 @@ init_x86_64(paddr_t first_avail)
> bios_memmap_t *bmp;
> int x, ist;
> uint64_t max_dm_size = ((uint64_t)512 * NUM_L4_SLOT_DIRECT) << 30;
> +
> + /*
> + * locore0 mapped 2 pages for use as GHCB before pmap is initialized.
> + */
> + if (ISSET(cpu_sev_guestmode, SEV_STAT_ES_ENABLED))
> + cpu_init_early_vctrap(first_avail);
> + first_avail += 2 * NBPG;
Wasn't 'first_avail' already biased by the subq instructions in locore0 though?
Why do we need to adjust again?
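(If I follow the locore0 change: after the two subq instructions,
first_avail points at PROC0_GHCB_OFF, i.e. at the GHCB pages themselves,
which is what cpu_init_early_vctrap() records as ghcb_paddr. The
+= 2 * NBPG then steps past the GHCB so the existing 3-page adjustment
below still lines up; it looks consistent rather than double-counted.)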
>
> /*
> * locore0 mapped 3 pages for use before the pmap is initialized
> Index: sys/arch/amd64/amd64/trap.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/arch/amd64/amd64/trap.c,v
> diff -u -p -r1.107 trap.c
> --- sys/arch/amd64/amd64/trap.c 5 May 2025 23:02:39 -0000 1.107
> +++ sys/arch/amd64/amd64/trap.c 15 Jun 2025 11:03:01 -0000
> @@ -86,6 +86,8 @@
> #include <machine/fpu.h>
> #include <machine/psl.h>
> #include <machine/trap.h>
> +#include <machine/ghcb.h>
> +#include <machine/vmmvar.h>
> #ifdef DDB
> #include <ddb/db_output.h>
> #include <machine/db_machdep.h>
> @@ -95,6 +97,7 @@
>
> int upageflttrap(struct trapframe *, uint64_t);
> int kpageflttrap(struct trapframe *, uint64_t);
> +int vctrap(struct trapframe *, int);
> void kerntrap(struct trapframe *);
> void usertrap(struct trapframe *);
> void ast(struct trapframe *);
> @@ -123,6 +126,7 @@ const char * const trap_type[] = {
> "SSE FP exception", /* 19 T_XMM */
> "virtualization exception", /* 20 T_VE */
> "control protection exception", /* 21 T_CP */
> + "VMM communication exception", /* 29 T_VC */
> };
> const int trap_types = nitems(trap_type);
>
> @@ -297,6 +301,150 @@ kpageflttrap(struct trapframe *frame, ui
> return 1;
> }
>
> +int
> +vctrap(struct trapframe *frame, int user)
> +{
> + uint64_t sw_exitcode, sw_exitinfo1, sw_exitinfo2;
> + uint8_t *rip = (uint8_t *)(frame->tf_rip);
> + uint16_t port;
> + struct ghcb_sync syncout, syncin;
> + struct ghcb_sa *ghcb;
> +
> + intr_disable();
> +
> + memset(&syncout, 0, sizeof(syncout));
> + memset(&syncin, 0, sizeof(syncin));
> +
> + sw_exitcode = frame->tf_err;
> + sw_exitinfo1 = 0;
> + sw_exitinfo2 = 0;
> +
> + switch (sw_exitcode) {
> + case SVM_VMEXIT_CPUID:
> + ghcb_sync_val(GHCB_RAX, GHCB_SZ32, &syncout);
> + ghcb_sync_val(GHCB_RCX, GHCB_SZ32, &syncout);
> + ghcb_sync_val(GHCB_RAX, GHCB_SZ32, &syncin);
> + ghcb_sync_val(GHCB_RBX, GHCB_SZ32, &syncin);
> + ghcb_sync_val(GHCB_RCX, GHCB_SZ32, &syncin);
> + ghcb_sync_val(GHCB_RDX, GHCB_SZ32, &syncin);
> + frame->tf_rip += 2;
> + break;
> + case SVM_VMEXIT_MSR: {
> + if (user)
> + return 0; /* not allowed from userspace */
> + if (*rip == 0x0f && *(rip + 1) == 0x30) {
Doesn't this break with XO kernels?
> + /* WRMSR */
> + ghcb_sync_val(GHCB_RAX, GHCB_SZ32, &syncout);
> + ghcb_sync_val(GHCB_RCX, GHCB_SZ32, &syncout);
> + ghcb_sync_val(GHCB_RDX, GHCB_SZ32, &syncout);
> + sw_exitinfo1 = 1;
> + } else if (*rip == 0x0f && *(rip + 1) == 0x32) {
see above
> + /* RDMSR */
> + ghcb_sync_val(GHCB_RCX, GHCB_SZ32, &syncout);
> + ghcb_sync_val(GHCB_RAX, GHCB_SZ32, &syncin);
> + ghcb_sync_val(GHCB_RDX, GHCB_SZ32, &syncin);
> + } else
> + panic("failed to decode MSR");
> + frame->tf_rip += 2;
> + break;
> + }
> + case SVM_VMEXIT_IOIO: {
> + if (user)
> + return 0; /* not allowed from userspace */
> + switch (*rip) {
> + case 0x66: {
> + switch (*(rip + 1)) {
> + case 0xef: /* out %ax,(%dx) */
> + ghcb_sync_val(GHCB_RAX, GHCB_SZ16, &syncout);
> + port = (uint16_t)frame->tf_rdx;
> + sw_exitinfo1 = (port << 16) |
> + (1ULL << 5);
> + frame->tf_rip += 2;
> + break;
> + case 0xed: /* in (%dx),%ax */
> + ghcb_sync_val(GHCB_RAX, GHCB_SZ16, &syncin);
> + port = (uint16_t)frame->tf_rdx;
> + sw_exitinfo1 = (port << 16) |
> + (1ULL << 5) | (1ULL << 0);
> + frame->tf_rip += 2;
> + break;
> + default:
> + panic("failed to decode prefixed IOIO");
> + }
> + break;
> + }
> + case 0xe4: /* in $0x71,%al */
I don't understand this assumption. Opcode byte 0xe4 does not imply
"in $0x71, %al"; what is meant by this comment? (Also the next one.)
> + ghcb_sync_val(GHCB_RAX, GHCB_SZ8, &syncin);
> + port = *(rip + 1);
> + sw_exitinfo1 = (port << 16) | (1ULL << 4) |
> + (1ULL << 0);
> + frame->tf_rip += 2;
> + break;
> + case 0xe6: /* outb %al,$0x43 */
> + ghcb_sync_val(GHCB_RAX, GHCB_SZ8, &syncout);
> + port = *(rip + 1);
> + sw_exitinfo1 = (port << 16) | (1ULL << 4);
> + frame->tf_rip += 2;
> + break;
> + case 0xec: /* in (%dx),%al */
> + ghcb_sync_val(GHCB_RAX, GHCB_SZ8, &syncin);
> + port = (uint16_t)frame->tf_rdx;
> + sw_exitinfo1 = (port << 16) | (1ULL << 4) |
> + (1ULL << 0);
> + frame->tf_rip += 1;
> + break;
> + case 0xed: /* in (%dx),%eax */
> + ghcb_sync_val(GHCB_RAX, GHCB_SZ32, &syncin);
> + port = (uint16_t)frame->tf_rdx;
> + sw_exitinfo1 = (port << 16) | (1ULL << 6) |
> + (1ULL << 0);
> + frame->tf_rip += 1;
> + break;
> + case 0xee: /* out %al,(%dx) */
> + ghcb_sync_val(GHCB_RAX, GHCB_SZ8, &syncout);
> + port = (uint16_t)frame->tf_rdx;
> + sw_exitinfo1 = (port << 16) | (1ULL << 4);
> + frame->tf_rip += 1;
> + break;
> + case 0xef: /* out %eax,(%dx) */
> + ghcb_sync_val(GHCB_RAX, GHCB_SZ32, &syncout);
> + port = (uint16_t)frame->tf_rdx;
> + sw_exitinfo1 = (port << 16) | (1ULL << 6);
> + frame->tf_rip += 1;
> + break;
> + default:
> + panic("failed to decode IOIO");
> + }
> + break;
> + }
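The sw_exitinfo1 values built above follow the GHCB spec's IOIO
encoding; spelled out as a sketch (the macro names are mine, not in the
diff):

	#define IOIO_TYPE_IN	(1ULL << 0)	/* 0 = OUT, 1 = IN */
	#define IOIO_SZ8	(1ULL << 4)
	#define IOIO_SZ16	(1ULL << 5)
	#define IOIO_SZ32	(1ULL << 6)
	#define IOIO_PORT(p)	((uint64_t)(p) << 16)

	/* e.g. the "in (%dx),%eax" case above is then: */
	sw_exitinfo1 = IOIO_PORT((uint16_t)frame->tf_rdx) |
	    IOIO_SZ32 | IOIO_TYPE_IN;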
> + default:
> + panic("invalid exit code 0x%llx", sw_exitcode);
> + }
> +
> + /* Always required */
> + ghcb_sync_val(GHCB_SW_EXITCODE, GHCB_SZ64, &syncout);
> + ghcb_sync_val(GHCB_SW_EXITINFO1, GHCB_SZ64, &syncout);
> + ghcb_sync_val(GHCB_SW_EXITINFO2, GHCB_SZ64, &syncout);
> +
> + /* Sync out to GHCB */
> + ghcb = (struct ghcb_sa *)ghcb_vaddr;
> + ghcb_sync_out(frame, sw_exitcode, sw_exitinfo1, sw_exitinfo2, ghcb,
> + &syncout);
> +
> + /* Call hypervisor. */
> + vmgexit();
> +
> + /* Verify response */
> + if (ghcb_verify_bm_guest(ghcb->valid_bitmap, syncin.valid_bitmap)) {
> + ghcb_clear(ghcb);
> + panic("invalid hypervisor response");
> + }
> +
> + /* Sync in from GHCB */
> + ghcb_sync_in(frame, ghcb, &syncin);
> +
> + return 1;
> +}
>
> /*
> * kerntrap(frame):
> @@ -348,6 +496,11 @@ kerntrap(struct trapframe *frame)
> else
> return;
> #endif /* NISA > 0 */
> +
> + case T_VC:
> + if (vctrap(frame, 0))
> + return;
> + goto we_re_toast;
> }
> }
>
> @@ -427,7 +580,12 @@ usertrap(struct trapframe *frame)
> code = (frame->tf_err & 0x7fff) < 4 ? ILL_BTCFI
> : ILL_BADSTK;
> break;
> -
> + case T_VC:
> + if (vctrap(frame, 1))
> + goto out;
> + sig = SIGILL;
> + code = ILL_PRVOPC;
> + break;
> case T_PAGEFLT: /* page fault */
> if (!uvm_map_inentry(p, &p->p_spinentry, PROC_STACK(p),
> "[%s]%d/%d sp=%lx inside %lx-%lx: not MAP_STACK\n",
> Index: sys/arch/amd64/amd64/vector.S
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/arch/amd64/amd64/vector.S,v
> diff -u -p -r1.100 vector.S
> --- sys/arch/amd64/amd64/vector.S 23 Apr 2025 15:08:05 -0000 1.100
> +++ sys/arch/amd64/amd64/vector.S 15 Jun 2025 11:03:01 -0000
> @@ -373,6 +373,43 @@ IDTVEC(trap14)
> ZTRAP(T_VE)
> IDTVEC(trap15)
> TRAP(T_CP)
> +
> +IDTVEC(trap1d)
> + /*
> + * #VC is AMD CPU specific, thus we don't use any Intel Meltdown
> + * workarounds.
> + *
> + * We handle #VC different from other traps, as we do not want
> + * to re-enable interrupts. #VC might happen during IRQ handling
> + * before a specific hardware interrupt gets masked. Re-enabling
> + * interrupts in the trap handler might cause nested IRQs of
> + * the same level. Thus keep interrupts disabled.
> + *
> + * On Intel CPUs we could use code patch to reset this entry.
> + */
> + pushq $T_VC
> + testb $SEL_RPL,24(%rsp)
> + je vctrap_kern
> + swapgs
> + FENCE_SWAPGS_MIS_TAKEN
> + movq %rax,CPUVAR(SCRATCH)
> +
> + /* #VC from userspace */
> + TRAP_ENTRY_USER
> + cld
> + SMAP_CLAC
> + /* shortcut to regular path, but with interrupts disabled */
> + jmp recall_trap
> +
> + /* #VC from kernspace */
> +vctrap_kern:
> + FENCE_NO_SAFE_SMAP
> + TRAP_ENTRY_KERN
> + cld
> + SMAP_CLAC
> + /* shortcut to regular path, but with interrupts disabled */
> + jmp .Lreal_kern_trap
> +
> IDTVEC(trap1f)
> IDTVEC_ALIAS(trap16, trap1f)
> IDTVEC_ALIAS(trap17, trap1f)
> @@ -381,7 +418,6 @@ IDTVEC_ALIAS(trap19, trap1f)
> IDTVEC_ALIAS(trap1a, trap1f)
> IDTVEC_ALIAS(trap1b, trap1f)
> IDTVEC_ALIAS(trap1c, trap1f)
> -IDTVEC_ALIAS(trap1d, trap1f)
> IDTVEC_ALIAS(trap1e, trap1f)
> /* 22 - 31 reserved for future exp */
> ZTRAP(T_RESERVED)
> @@ -513,6 +549,16 @@ END(alltraps_kern)
> END(alltraps_kern_meltdown)
> KTEXT_PAGE_END
>
> +/* #VC trap entry for early bootstrap */
> +IDTVEC(vctrap_early)
> + pushq $T_VC
> + TRAP_ENTRY_KERN /* early #VC has to be in kernel mode */
> + cld
> + movq %rsp, %rdi
> + movq $0x0, %rsi
> + call vctrap
> + movq $0,-8(%rsp)
> + INTRFASTEXIT
>
> /*
> * Macros for interrupt entry, call to handler, and exit.
> Index: sys/arch/amd64/amd64/vmm_machdep.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/arch/amd64/amd64/vmm_machdep.c,v
> diff -u -p -r1.57 vmm_machdep.c
> --- sys/arch/amd64/amd64/vmm_machdep.c 3 Jun 2025 19:15:29 -0000 1.57
> +++ sys/arch/amd64/amd64/vmm_machdep.c 15 Jun 2025 11:03:02 -0000
> @@ -1588,15 +1588,15 @@ vcpu_reset_regs_svm(struct vcpu *vcpu, s
> SVM_INTERCEPT_MWAIT_UNCOND | SVM_INTERCEPT_MONITOR |
> SVM_INTERCEPT_MWAIT_COND | SVM_INTERCEPT_RDTSCP;
>
> - /* With SEV-ES we cannot force access XCR0, thus no intercept */
> - if (xsave_mask && !vcpu->vc_seves)
> + if (xsave_mask && !vcpu->vc_seves) /* XXX hshoexer */
What does this XXX mean, and can we fix before commit?
> vmcb->v_intercept2 |= SVM_INTERCEPT_XSETBV;
>
> if (vcpu->vc_seves) {
> - /* With SEV-ES also intercept post EFER and CR[04] writes */
> + /* With SEV-ES also intercept post EFER and CR[048] writes */
> vmcb->v_intercept2 |= SVM_INTERCEPT_EFER_WRITE;
> vmcb->v_intercept2 |= SVM_INTERCEPT_CR0_WRITE_POST;
> vmcb->v_intercept2 |= SVM_INTERCEPT_CR4_WRITE_POST;
> + vmcb->v_intercept2 |= SVM_INTERCEPT_CR8_WRITE_POST;
> }
>
> /* Setup I/O bitmap */
> @@ -1617,22 +1617,13 @@ vcpu_reset_regs_svm(struct vcpu *vcpu, s
> svm_setmsrbrw(vcpu, MSR_FSBASE);
> svm_setmsrbrw(vcpu, MSR_GSBASE);
> svm_setmsrbrw(vcpu, MSR_KERNELGSBASE);
> + svm_setmsrbrw(vcpu, MSR_SEV_GHCB);
>
> /* allow reading SEV status */
> svm_setmsrbrw(vcpu, MSR_SEV_STATUS);
>
> if (vcpu->vc_seves) {
> - /* Allow read/write GHCB guest physical address */
> - svm_setmsrbrw(vcpu, MSR_SEV_GHCB);
> -
> - /* Allow reading MSR_XSS; for CPUID Extended State Enum. */
> - svm_setmsrbr(vcpu, MSR_XSS);
> -
> - /*
> - * With SEV-ES SVME can't be modified by the guest;
> - * host can only intercept post-write (see
> - * SVM_INTERCEPT_EFER_WRITE above).
> - */
> + /* With SEV-ES SVME can not be modified by the guest */
> svm_setmsrbrw(vcpu, MSR_EFER);
> } else {
> /* EFER is R/O so we can ensure the guest always has SVME */
> @@ -1650,7 +1641,10 @@ vcpu_reset_regs_svm(struct vcpu *vcpu, s
> vmcb->v_asid = vcpu->vc_vpid;
>
> /* TLB Control - First time in, flush all*/
> - vmcb->v_tlb_control = SVM_TLB_CONTROL_FLUSH_ALL;
> + if (vcpu->vc_seves)
> + vmcb->v_tlb_control = SVM_TLB_CONTROL_FLUSH_ASID; /* XXX hshoexer */
> + else
> + vmcb->v_tlb_control = SVM_TLB_CONTROL_FLUSH_ALL;
>
> /* INTR masking */
> vmcb->v_intr_masking = 1;
> @@ -1676,13 +1670,23 @@ vcpu_reset_regs_svm(struct vcpu *vcpu, s
>
> /* Set VMSA. */
> vmcb->v_vmsa_pa = vcpu->vc_svm_vmsa_pa;
> +
> + /* XXX hshoexer: LBR: guest_state_protected flag? */
Can we fix this XXX before commit please?
> + svm_setmsrbrw(vcpu, MSR_DEBUGCTLMSR);
> + svm_setmsrbrw(vcpu, MSR_LASTBRANCHFROMIP);
> + svm_setmsrbrw(vcpu, MSR_LASTBRANCHTOIP);
> + svm_setmsrbrw(vcpu, MSR_LASTINTFROMIP);
> + svm_setmsrbrw(vcpu, MSR_LASTINTTOIP);
> +
> + /* XXX hshoexer: virt vmload/vmsave */
> + vmcb->v_lbr_virt_enable |= 0x2;
> }
>
> /* Enable SVME in EFER (must always be set) */
> vmcb->v_efer |= EFER_SVME;
>
> if ((ret = vcpu_writeregs_svm(vcpu, VM_RWREGS_ALL, vrs)) != 0)
> - return ret;
> + goto exit;
>
> /* xcr0 power on default sets bit 0 (x87 state) */
> vcpu->vc_gueststate.vg_xcr0 = XFEATURE_X87 & xsave_mask;
> @@ -1691,6 +1695,7 @@ vcpu_reset_regs_svm(struct vcpu *vcpu, s
>
> ret = vcpu_svm_init_vmsa(vcpu, vrs);
>
> +exit:
> return ret;
> }
>
> @@ -1709,6 +1714,9 @@ vcpu_svm_init_vmsa(struct vcpu *vcpu, st
> if (!vcpu->vc_seves)
> return 0;
>
> + if (vmcb->v_dr7 & ~0x00000400) /* XXX hshoexer? */
> + return 1;
> +
> vmsa = (struct vmsa *)vcpu->vc_svm_vmsa_va;
> memcpy(vmsa, &vmcb->vmcb_layout, sizeof(vmcb->vmcb_layout));
>
> @@ -2889,6 +2897,28 @@ vcpu_init_svm(struct vcpu *vcpu, struct
> (uint64_t)vcpu->vc_svm_hsa_va,
> (uint64_t)vcpu->vc_svm_hsa_pa);
>
> +
> + /* Allocate VM save area VA */
> + vcpu->vc_svm_vmsa_va = (vaddr_t)km_alloc(PAGE_SIZE, &kv_page,
> + &kp_zero, &kd_waitok);
> +
> + if (!vcpu->vc_svm_vmsa_va) {
> + ret = ENOMEM;
> + goto exit;
> + }
> +
> + /* Compute VM save area PA */
> + if (!pmap_extract(pmap_kernel(), vcpu->vc_svm_vmsa_va,
> + &vcpu->vc_svm_vmsa_pa)) {
> + ret = ENOMEM;
> + goto exit;
> + }
> +
> + DPRINTF("%s: VMSA va @ 0x%llx, pa @ 0x%llx\n", __func__,
> + (uint64_t)vcpu->vc_svm_vmsa_va,
> + (uint64_t)vcpu->vc_svm_vmsa_pa);
> +
> +
> /* Allocate IOIO area VA (3 pages) */
> vcpu->vc_svm_ioio_va = (vaddr_t)km_alloc(3 * PAGE_SIZE, &kv_any,
> &vmm_kp_contig, &kd_waitok);
> @@ -2909,27 +2939,9 @@ vcpu_init_svm(struct vcpu *vcpu, struct
> (uint64_t)vcpu->vc_svm_ioio_va,
> (uint64_t)vcpu->vc_svm_ioio_pa);
>
> - if (vcpu->vc_seves) {
> - /* Allocate VM save area VA */
> - vcpu->vc_svm_vmsa_va = (vaddr_t)km_alloc(PAGE_SIZE, &kv_page,
> - &kp_zero, &kd_waitok);
> -
> - if (!vcpu->vc_svm_vmsa_va) {
> - ret = ENOMEM;
> - goto exit;
> - }
> -
> - /* Compute VM save area PA */
> - if (!pmap_extract(pmap_kernel(), vcpu->vc_svm_vmsa_va,
> - &vcpu->vc_svm_vmsa_pa)) {
> - ret = ENOMEM;
> - goto exit;
> - }
> -
> - DPRINTF("%s: VMSA va @ 0x%llx, pa @ 0x%llx\n", __func__,
> - (uint64_t)vcpu->vc_svm_vmsa_va,
> - (uint64_t)vcpu->vc_svm_vmsa_pa);
> - }
> + /* Shall we enable SEV? */
> + vcpu->vc_sev = vcp->vcp_sev;
> + vcpu->vc_seves = vcp->vcp_seves;
>
> /* Inform vmd(8) about ASID and C bit position. */
> vcp->vcp_poscbit = amd64_pos_cbit;
> @@ -4285,6 +4297,7 @@ svm_handle_exit(struct vcpu *vcpu)
> case SVM_VMEXIT_EFER_WRITE_TRAP:
> case SVM_VMEXIT_CR0_WRITE_TRAP:
> case SVM_VMEXIT_CR4_WRITE_TRAP:
> + case SVM_VMEXIT_CR8_WRITE_TRAP:
> ret = svm_handle_efercr(vcpu, exit_reason);
> update_rip = 0;
> break;
> @@ -4330,8 +4343,10 @@ svm_vmgexit_sync_host(struct vcpu *vcpu)
> if (!vcpu->vc_seves)
> return (0);
>
> - if (vcpu->vc_svm_ghcb_va == 0)
> + if (vcpu->vc_svm_ghcb_va == 0) {
> + printf("%s: GHCB not set\n", __func__);
> return (0);
> + }
>
> ghcb = (struct ghcb_sa *)vcpu->vc_svm_ghcb_va;
> if (!ghcb_valid(ghcb))
> @@ -4608,6 +4623,8 @@ svm_handle_efercr(struct vcpu *vcpu, uin
> case SVM_VMEXIT_CR4_WRITE_TRAP:
> vmcb->v_cr4 = vmcb->v_exitinfo1;
> break;
> + /* XXX hshoexer: no state for CR8? */
Can we fix this XXX please before commit?
> + break;
> default:
> return (EINVAL);
> }
> @@ -6767,6 +6784,8 @@ vcpu_run_svm(struct vcpu *vcpu, struct v
> * On exit, interrupts are disabled, and we are running with
> * the guest FPU state still possibly on the CPU. Save the FPU
> * state before re-enabling interrupts.
> + *
> + * XXX hshoexer: With SEV-ES we should be able to skip this.
Can we fix this XXX please before commit?
> */
> vmm_fpusave(vcpu);
>
> @@ -7362,7 +7381,7 @@ svm_get_vmsa_pa(uint32_t vmid, uint32_t
> return (error);
>
> vcpu = vm_find_vcpu(vm, vcpuid);
> - if (vcpu == NULL || !vcpu->vc_seves) {
> + if (vcpu == NULL) {
> ret = ENOENT;
> goto out;
> }
> Index: sys/arch/amd64/include/cpu.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/arch/amd64/include/cpu.h,v
> diff -u -p -r1.180 cpu.h
> --- sys/arch/amd64/include/cpu.h 28 Apr 2025 16:18:25 -0000 1.180
> +++ sys/arch/amd64/include/cpu.h 15 Jun 2025 11:03:01 -0000
> @@ -502,7 +502,8 @@ void mp_setperf_init(void);
> #define CPU_INVARIANTTSC 17 /* has invariant TSC */
> #define CPU_PWRACTION 18 /* action caused by power button */
> #define CPU_RETPOLINE 19 /* cpu requires retpoline pattern */
> -#define CPU_MAXID 20 /* number of valid machdep ids */
> +#define CPU_SEVGUESTMODE 20 /* running as SEV guest */
> +#define CPU_MAXID 21 /* number of valid machdep ids */
>
> #define CTL_MACHDEP_NAMES { \
> { 0, 0 }, \
> @@ -525,6 +526,7 @@ void mp_setperf_init(void);
> { "invarianttsc", CTLTYPE_INT }, \
> { "pwraction", CTLTYPE_INT }, \
> { "retpoline", CTLTYPE_INT }, \
> + { "sevguestmode", CTLTYPE_INT}, \
> }
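(So this becomes readable as "sysctl machdep.sevguestmode". Note the
value is now the raw SEV_STATUS MSR bits rather than a plain boolean,
given the locore0 change that stores %eax directly.)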
>
> #endif /* !_MACHINE_CPU_H_ */
> Index: sys/arch/amd64/include/cpufunc.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/arch/amd64/include/cpufunc.h,v
> diff -u -p -r1.44 cpufunc.h
> --- sys/arch/amd64/include/cpufunc.h 5 May 2025 23:02:39 -0000 1.44
> +++ sys/arch/amd64/include/cpufunc.h 15 Jun 2025 11:03:01 -0000
> @@ -439,6 +439,27 @@ breakpoint(void)
> __asm volatile("int $3");
> }
>
> +/* VMGEXIT */
> +static __inline void
> +vmgexit(void)
> +{
> + __asm volatile("rep; vmmcall");
> +}
> +
> +/* Request VM termination from hypervisor. */
> +static __inline void
> +vmterminate(void)
> +{
> + __asm volatile(
> + " movl $MSRPROTO_TERM_REQ, %%rdx ;"
> + " movl $MSR_SEV_GHCB, %%rcx ;"
> + " wrmsr ;"
> + " rep vmmcall ;"
> + "1: hlt ;"
> + " jmp 1b ;"
> + : :);
> +}
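As written I don't think vmterminate() assembles: the macro names won't
expand inside an asm string literal, movl cannot take %rdx/%rcx, and the
request code belongs in %eax (wrmsr consumes %edx:%eax). Something along
these lines seems closer, reusing MSR_PROTO_TERMINATE from ghcb.h and
the helpers above (sketch):

	/* Request VM termination from hypervisor. */
	static __inline void
	vmterminate(void)
	{
		wrmsr(MSR_SEV_GHCB, MSR_PROTO_TERMINATE);
		vmgexit();		/* rep vmmcall */
		for (;;)
			__asm volatile("hlt");
	}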
> +
> void amd64_errata(struct cpu_info *);
> void cpu_ucode_setup(void);
> void cpu_ucode_apply(struct cpu_info *);
> Index: sys/arch/amd64/include/ghcb.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/arch/amd64/include/ghcb.h,v
> diff -u -p -r1.2 ghcb.h
> --- sys/arch/amd64/include/ghcb.h 28 May 2025 07:59:05 -0000 1.2
> +++ sys/arch/amd64/include/ghcb.h 15 Jun 2025 11:03:01 -0000
> @@ -19,13 +19,10 @@
> #ifndef _MACHINE_GHCB_H_
> #define _MACHINE_GHCB_H_
>
> -#include <machine/frame.h>
> -
> #define GHCB_OFFSET(m) ((m) / 8)
> #define GHCB_IDX(m) (GHCB_OFFSET((m)) / 8)
> #define GHCB_BIT(m) (GHCB_OFFSET((m)) % 8)
>
> -#define GHCB_XSS 0x140
> #define GHCB_RAX 0x1F8
> #define GHCB_RBX 0x318
> #define GHCB_RCX 0x308
> @@ -33,8 +30,6 @@
> #define GHCB_SW_EXITCODE 0x390
> #define GHCB_SW_EXITINFO1 0x398
> #define GHCB_SW_EXITINFO2 0x3A0
> -#define GHCB_SW_SCRATCH 0x3A8
> -#define GHCB_XCR0 0x3E8
>
> #define GHCB_MAX 0xFFF
>
> @@ -104,11 +99,15 @@ struct ghcb_sync {
> #define MSR_PROTO_CPUID_RESP 0x5
> #define MSR_PROTO_TERMINATE 0x100
>
> +extern vaddr_t ghcb_vaddr;
> +extern paddr_t ghcb_paddr;
> +
> void ghcb_clear(struct ghcb_sa *);
> int ghcb_valbm_set(uint8_t *, int);
> int ghcb_valbm_isset(uint8_t *, int);
> -int ghcb_verify_bm(uint8_t *, uint8_t *);
> int ghcb_valid(struct ghcb_sa *);
> +int ghcb_verify_bm(uint8_t *, uint8_t *);
> +int ghcb_verify_bm_guest(uint8_t *, uint8_t *);
>
> void ghcb_sync_val(int, int, struct ghcb_sync *);
> void ghcb_sync_out(struct trapframe *, uint64_t, uint64_t, uint64_t,
> Index: sys/arch/amd64/include/specialreg.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/arch/amd64/include/specialreg.h,v
> diff -u -p -r1.117 specialreg.h
> --- sys/arch/amd64/include/specialreg.h 19 May 2025 08:36:36 -0000 1.117
> +++ sys/arch/amd64/include/specialreg.h 15 Jun 2025 11:03:01 -0000
> @@ -729,6 +729,7 @@
>
> #define MSR_SEV_STATUS 0xc0010131
> #define SEV_STAT_ENABLED 0x00000001
> +#define SEV_STAT_ES_ENABLED 0x00000002
>
> #define MSR_LS_CFG 0xc0011020
> #define LS_CFG_DIS_LS2_SQUISH 0x02000000
> Index: sys/arch/amd64/include/trap.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/arch/amd64/include/trap.h,v
> diff -u -p -r1.5 trap.h
> --- sys/arch/amd64/include/trap.h 15 Apr 2023 01:22:50 -0000 1.5
> +++ sys/arch/amd64/include/trap.h 15 Jun 2025 11:03:01 -0000
> @@ -62,3 +62,4 @@
> #define T_XMM 19 /* SSE FP exception */
> #define T_VE 20 /* virtualization exception */
> #define T_CP 21 /* control protection exception */
> +#define T_VC 29 /* VMM communication exception */
> Index: sys/dev/ic/psp.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/dev/ic/psp.c,v
> diff -u -p -r1.18 psp.c
> --- sys/dev/ic/psp.c 20 May 2025 07:02:20 -0000 1.18
> +++ sys/dev/ic/psp.c 15 Jun 2025 11:03:01 -0000
> @@ -861,6 +861,10 @@ pspioctl(dev_t dev, u_long cmd, caddr_t
> error = psp_launch_update_data(sc,
> (struct psp_launch_update_data *)data, p);
> break;
> + case PSP_IOC_LAUNCH_UPDATE_VMSA:
> + error = psp_launch_update_vmsa(sc,
> + (struct psp_launch_update_vmsa *)data);
> + break;
> case PSP_IOC_LAUNCH_MEASURE:
> error = psp_launch_measure(sc,
> (struct psp_launch_measure *)data);
> Index: sys/dev/ic/pspvar.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/dev/ic/pspvar.h,v
> diff -u -p -r1.7 pspvar.h
> --- sys/dev/ic/pspvar.h 25 Apr 2025 19:10:50 -0000 1.7
> +++ sys/dev/ic/pspvar.h 15 Jun 2025 11:03:01 -0000
> @@ -275,6 +275,8 @@ struct psp_snp_platform_status {
> #define PSP_IOC_SNP_GET_PSTATUS _IOR('P', 11, struct psp_snp_platform_status)
> #define PSP_IOC_INIT _IO('P', 12)
> #define PSP_IOC_SHUTDOWN _IO('P', 13)
> +#define PSP_IOC_LAUNCH_UPDATE_VMSA \
> + _IOW('P', 14, struct psp_launch_update_vmsa)
> #define PSP_IOC_ENCRYPT_STATE _IOW('P', 254, struct psp_encrypt_state)
> #define PSP_IOC_GUEST_SHUTDOWN _IOW('P', 255, struct psp_guest_shutdown)
>
> Index: sys/dev/vmm/vmm.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/dev/vmm/vmm.c,v
> diff -u -p -r1.5 vmm.c
> --- sys/dev/vmm/vmm.c 20 May 2025 13:51:27 -0000 1.5
> +++ sys/dev/vmm/vmm.c 15 Jun 2025 11:03:01 -0000
> @@ -455,6 +455,8 @@ vm_create(struct vm_create_params *vcp,
> vcpu->vc_parent = vm;
> vcpu->vc_id = vm->vm_vcpu_ct;
> vm->vm_vcpu_ct++;
> + vcpu->vc_sev = vcp->vcp_sev;
> + vcpu->vc_seves = vcp->vcp_seves;
> if ((ret = vcpu_init(vcpu, vcp)) != 0) {
> printf("failed to init vcpu %d for vm %p\n", i, vm);
> vm_teardown(&vm);
> Index: usr.sbin/vmd/loadfile_elf.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/usr.sbin/vmd/loadfile_elf.c,v
> diff -u -p -r1.50 loadfile_elf.c
> --- usr.sbin/vmd/loadfile_elf.c 26 Sep 2024 01:45:13 -0000 1.50
> +++ usr.sbin/vmd/loadfile_elf.c 15 Jun 2025 11:02:41 -0000
> @@ -110,7 +110,7 @@ union {
> } hdr;
>
> static void setsegment(struct mem_segment_descriptor *, uint32_t,
> - size_t, int, int, int, int);
> + size_t, int, int, int, int, int);
> static int elf32_exec(gzFile, Elf32_Ehdr *, u_long *, int);
> static int elf64_exec(gzFile, Elf64_Ehdr *, u_long *, int);
> static size_t create_bios_memmap(struct vm_create_params *, bios_memmap_t *);
> @@ -148,7 +148,7 @@ uint64_t pg_crypt = 0;
> */
> static void
> setsegment(struct mem_segment_descriptor *sd, uint32_t base, size_t limit,
> - int type, int dpl, int def32, int gran)
> + int type, int dpl, int def32, int gran, int lm)
> {
> sd->sd_lolimit = (int)limit;
> sd->sd_lobase = (int)base;
> @@ -157,7 +157,7 @@ setsegment(struct mem_segment_descriptor
> sd->sd_p = 1;
> sd->sd_hilimit = (int)limit >> 16;
> sd->sd_avl = 0;
> - sd->sd_long = 0;
> + sd->sd_long = lm;
> sd->sd_def32 = def32;
> sd->sd_gran = gran;
> sd->sd_hibase = (int)base >> 24;
> @@ -185,11 +185,13 @@ push_gdt(void)
> * Create three segment descriptors:
> *
> * GDT[0] : null descriptor. "Created" via memset above.
> - * GDT[1] (selector @ 0x8): Executable segment, for CS
> + * GDT[1] (selector @ 0x8): Executable segment (compat mode), for CS
> * GDT[2] (selector @ 0x10): RW Data segment, for DS/ES/SS
> + * GDT[3] (selector @ 0x18): Executable segment (long mode), for CS
> */
> - setsegment(&sd[1], 0, 0xffffffff, SDT_MEMERA, SEL_KPL, 1, 1);
> - setsegment(&sd[2], 0, 0xffffffff, SDT_MEMRWA, SEL_KPL, 1, 1);
> + setsegment(&sd[1], 0, 0xffffffff, SDT_MEMERA, SEL_KPL, 1, 1, 0);
> + setsegment(&sd[2], 0, 0xffffffff, SDT_MEMRWA, SEL_KPL, 1, 1, 0);
> + setsegment(&sd[3], 0, 0xffffffff, SDT_MEMERA, SEL_KPL, 0, 1, 1);
>
> write_mem(GDT_PAGE, gdtpage, PAGE_SIZE);
> sev_register_encryption(GDT_PAGE, PAGE_SIZE);
>