Index | Thread | Search

From:
Mike Larkin <mlarkin@nested.page>
Subject:
Re: [EXT] Re: SEV-ES guest: early GHCB allocation and early #VC trap handling 1/3
To:
tech@openbsd.org, Hans-Joerg_Hoexer@genua.de
Date:
Wed, 2 Jul 2025 11:37:14 -0700

Download raw body.

Thread
On Tue, Jul 01, 2025 at 10:29:31AM +0200, Hans-Jörg Höxer wrote:
> Hi,
>
> thanks for fixup & committing.  Answers inline.
>
> On Fri, Jun 27, 2025 at 08:03:18PM +0200, Alexander Bluhm wrote:
> > ...
> > I have commited the first diff with the comment suggested by mlarkin.
> >
> > Three remarks inline...
> >
> > > diff --git a/sys/arch/amd64/amd64/ghcb.c b/sys/arch/amd64/amd64/ghcb.c
> > > index ae5aecf13f9..bb1027f82ae 100644
> > > --- a/sys/arch/amd64/amd64/ghcb.c
> > > +++ b/sys/arch/amd64/amd64/ghcb.c
> > > @@ -28,6 +28,9 @@ const uint64_t ghcb_sz_masks[] = {
> > >      0x00000000ffffffffULL, 0xffffffffffffffffULL
> > >  };
> > >
> > > +vaddr_t ghcb_vaddr;
> > > +paddr_t ghcb_paddr;
> > > +
> > >  /*
> > >   * ghcb_clear
> > >   *
> > > diff --git a/sys/arch/amd64/amd64/locore0.S b/sys/arch/amd64/amd64/locore0.S
> > > index 3f79af0d3cc..b14e053785c 100644
> > > --- a/sys/arch/amd64/amd64/locore0.S
> > > +++ b/sys/arch/amd64/amd64/locore0.S
> > > @@ -439,7 +439,9 @@ cont:
> > >  #define PROC0_DMP2_OFF	(PROC0_DMP3_OFF + NDML3_ENTRIES * NBPG)
> > >  #define TABLESIZE \
> > >      ((NKL4_KIMG_ENTRIES + TABLE_L3_ENTRIES + TABLE_L2_ENTRIES + 1 + UPAGES + \
> > > -	NDML3_ENTRIES + NDML2_ENTRIES + 3) * NBPG)
> > > +	NDML3_ENTRIES + NDML2_ENTRIES + 5) * NBPG)
> > > +#define PROC0_GHCB_OFF	(TABLESIZE - 5 * NBPG)
> > > +#define GHCB_SIZE	(2 * NBPG)
> > >
> > >  #define fillkpt \
> > >  	pushl	%ebp				;	/* save */ \
> > > @@ -463,6 +465,17 @@ cont:
> > >  	loop	1b				;	/* till finished */ \
> > >  	popl	%ebp
> > >
> > > +
> > > +#define fillkpt_nx_nc \
> > > +	pushl	%ebp				;	/* save */ \
> > > +	movl	RELOC((pg_nx + 4)), %ebp	;	/* NX bit? */ \
> > > +1:	movl	%eax,(%ebx)			;	/* store phys addr */ \
> > > +	movl	%ebp,4(%ebx)			;	/* upper 32 bits */ \
> > > +	addl	$8,%ebx				;	/* next pte/pde */ \
> > > +	addl	$NBPG,%eax			;	/* next phys page */ \
> > > +	loop	1b				;	/* till finished */ \
> > > +	popl	%ebp
> > > +
> > >  	/* Find end of kernel image. */
> > >  	movl	$RELOC(end),%edi
> > >  #if (NKSYMS || defined(DDB))
> > > @@ -569,6 +582,19 @@ map_tables:
> > >  	shrl	$PGSHIFT,%ecx
> > >  	fillkpt_nx
> > >
> > > +	/* When in SEV-ES guestmode, remap GHCB shared (ie. unencrypted) */
> > > +	movl	RELOC(cpu_sev_guestmode),%eax
> > > +	testl	$SEV_STAT_ES_ENABLED,%eax
> > > +	jz	.Lnoghcb
> > > +	pushl	%ebx			/* save current slot */
> > > +	subl	$(5 << 3),%ebx		/* move back to slot of GHCB */
> > > +	leal	(PROC0_GHCB_OFF)(%esi),%eax
> > > +	orl	$(PG_V|PG_KW), %eax
> > > +	movl	$(GHCB_SIZE>>PGSHIFT), %ecx
> > > +	fillkpt_nx_nc
> > > +	popl	%ebx			/* continue with slot saved above */
> > > +
> > > +.Lnoghcb:
> > >  	/* Map ISA I/O mem (later atdevbase) RW, NX */
> > >  	movl	$(IOM_BEGIN|PG_V|PG_KW/*|PG_N*/),%eax
> > >  	movl	$(IOM_SIZE>>PGSHIFT),%ecx
> > > @@ -778,6 +804,15 @@ longmode_hi:
> > >  	addq	%rsi,%rdx
> > >  	movq	%rdx,atdevbase(%rip)
> > >
> > > +	/* Relocate GHCB. */
> > > +	movq	cpu_sev_guestmode(%rip),%rax
> > > +	testq	$SEV_STAT_ES_ENABLED,%rax
> > > +	jz	.Lnoghcbreloc
> > > +	movq	$(PROC0_GHCB_OFF+KERNBASE),%rdx
> > > +	addq	%rsi,%rdx
> > > +	movq	%rdx,ghcb_vaddr(%rip)
> > > +
> > > +.Lnoghcbreloc:
> > >  	/* Record start of symbols */
> > >  	movq	$__kernel_bss_end, ssym(%rip)
> > >
> > > @@ -800,7 +835,7 @@ longmode_hi:
> > >  	movw	%ax,%fs
> > >
> > >  	leaq	TABLESIZE(%rsi),%rdi
> > > -	subq	$(NBPG*3), %rdi
> > > +	subq	$(NBPG*5), %rdi
> > >
> > >  	/* XXX merge these */
> > >  	call	init_x86_64
> > > diff --git a/sys/arch/amd64/amd64/machdep.c b/sys/arch/amd64/amd64/machdep.c
> > > index d9b34f2cde4..bc454f03ba3 100644
> > > --- a/sys/arch/amd64/amd64/machdep.c
> > > +++ b/sys/arch/amd64/amd64/machdep.c
> > > @@ -100,6 +100,7 @@
> > >  #include <machine/mpbiosvar.h>
> > >  #include <machine/kcore.h>
> > >  #include <machine/tss.h>
> > > +#include <machine/ghcb.h>
> > >
> > >  #include <dev/isa/isareg.h>
> > >  #include <dev/ic/i8042reg.h>
> > > @@ -491,6 +492,7 @@ bios_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
> > >  extern int tsc_is_invariant;
> > >  extern int amd64_has_xcrypt;
> > >  extern int need_retpoline;
> > > +extern int cpu_sev_guestmode;
> > >
> > >  const struct sysctl_bounded_args cpuctl_vars[] = {
> > >  	{ CPU_LIDACTION, &lid_action, -1, 2 },
> > > @@ -1314,6 +1316,37 @@ cpu_init_idt(void)
> > >  	lidt(&region);
> > >  }
> > >
> > > +uint64_t early_gdt[GDT_SIZE / 8];
> > > +
> > > +void
> > > +cpu_init_early_vctrap(paddr_t addr)
> > > +{
> > > +	struct region_descriptor region;
> > > +
> > > +	extern void Xvctrap_early(void);
> > > +
> > > +	/* Setup temporary "early" longmode GDT, will be reset soon */
> > > +	memset(early_gdt, 0, sizeof(early_gdt));
> > > +	set_mem_segment(GDT_ADDR_MEM(early_gdt, GCODE_SEL), 0, 0xfffff,
> > > +	    SDT_MEMERA, SEL_KPL, 1, 0, 1);
> > > +	set_mem_segment(GDT_ADDR_MEM(early_gdt, GDATA_SEL), 0, 0xfffff,
> > > +	    SDT_MEMRWA, SEL_KPL, 1, 0, 1);
> > > +	setregion(&region, early_gdt, GDT_SIZE - 1);
> > > +	lgdt(&region);
> > > +
> > > +	/* Setup temporary "early" longmode #VC entry, will be reset soon */
> > > +	idt = early_idt;
> > > +	memset((void *)idt, 0, NIDT * sizeof(idt[0]));
> > > +	setgate(&idt[T_VC], Xvctrap_early, 0, SDT_SYS386IGT, SEL_KPL,
> > > +	    GSEL(GCODE_SEL, SEL_KPL));
> > > +	cpu_init_idt();
> > > +
> > > +	/* Tell vmm(4) about our GHCB. */
> > > +	ghcb_paddr = addr;
> > > +	memset((void *)ghcb_vaddr, 0, 2 * PAGE_SIZE);
> > > +	wrmsr(MSR_SEV_GHCB, ghcb_paddr);
> > > +}
> > > +
> > >  void
> > >  cpu_init_extents(void)
> > >  {
> > > @@ -1434,6 +1467,13 @@ init_x86_64(paddr_t first_avail)
> > >  	int x, ist;
> > >  	uint64_t max_dm_size = ((uint64_t)512 * NUM_L4_SLOT_DIRECT) << 30;
> > >
> > > +	/*
> > > +	 * locore0 mapped 2 pages for use as GHCB before pmap is initialized.
> > > +	 */
> > > +	if (ISSET(cpu_sev_guestmode, SEV_STAT_ES_ENABLED))
> > > +		cpu_init_early_vctrap(first_avail);
> > > +	first_avail += 2 * NBPG;
> >
> > The GHCB is always allocated to we skip two pages uncoditionally,
> > right?  Can we keep these pages usable when running without SEV-ES?
>
> I guess so.  Right now we are wasting two pages in the non-SEV guest case.
> I will improve this.
>

Fellas, it's 4 pages. I'd say the complexity to reclaim those pages in the non
SEV case probably isn't worth it.

-ml

> > > +
> > >  	/*
> > >  	 * locore0 mapped 3 pages for use before the pmap is initialized
> > >  	 * starting at first_avail. These pages are currently used by
> > > diff --git a/sys/arch/amd64/amd64/trap.c b/sys/arch/amd64/amd64/trap.c
> > > index ff432b7af6d..bb445a60894 100644
> > > --- a/sys/arch/amd64/amd64/trap.c
> > > +++ b/sys/arch/amd64/amd64/trap.c
> > > @@ -86,6 +86,8 @@
> > >  #include <machine/fpu.h>
> > >  #include <machine/psl.h>
> > >  #include <machine/trap.h>
> > > +#include <machine/ghcb.h>
> > > +#include <machine/vmmvar.h>
> > >  #ifdef DDB
> > >  #include <ddb/db_output.h>
> > >  #include <machine/db_machdep.h>
> > > @@ -95,6 +97,7 @@
> > >
> > >  int	upageflttrap(struct trapframe *, uint64_t);
> > >  int	kpageflttrap(struct trapframe *, uint64_t);
> > > +int	vctrap(struct trapframe *);
> > >  void	kerntrap(struct trapframe *);
> > >  void	usertrap(struct trapframe *);
> > >  void	ast(struct trapframe *);
> > > @@ -123,6 +126,7 @@ const char * const trap_type[] = {
> > >  	"SSE FP exception",			/* 19 T_XMM */
> > >  	"virtualization exception",		/* 20 T_VE */
> > >  	"control protection exception",		/* 21 T_CP */
> > > +	"VMM communication exception",		/* 29 T_VC */
> > >  };
> > >  const int	trap_types = nitems(trap_type);
> > >
> > > @@ -297,6 +301,52 @@ kpageflttrap(struct trapframe *frame, uint64_t cr2)
> > >  	return 1;
> > >  }
> > >
> > > +int
> > > +vctrap(struct trapframe *frame)
> > > +{
> > > +	uint64_t	 sw_exitcode, sw_exitinfo1, sw_exitinfo2;
> > > +	struct ghcb_sync syncout, syncin;
> > > +	struct ghcb_sa	*ghcb;
> > > +
> > > +	intr_disable();
> >
> > We were debugging some races before.  Back then, interrupts were
> > enabled at this point.  So we modified the trap entry point.  I
> > think interrupts are always disabled now, and this should be an
> > assertion instead of an additional disable.
>
> ack.  The intr_disable() is not needed, interrupts are always disabled
> at this point.  And must be!  So an assert is a good idea.
>
> > > +
> > > +	memset(&syncout, 0, sizeof(syncout));
> > > +	memset(&syncin, 0, sizeof(syncin));
> > > +
> > > +	sw_exitcode = frame->tf_err;
> > > +	sw_exitinfo1 = 0;
> > > +	sw_exitinfo2 = 0;
> > > +
> > > +	switch (sw_exitcode) {
> > > +	default:
> > > +		panic("invalid exit code 0x%llx", sw_exitcode);
> > > +	}
> > > +
> > > +	/* Always required */
> > > +	ghcb_sync_val(GHCB_SW_EXITCODE, GHCB_SZ64, &syncout);
> > > +	ghcb_sync_val(GHCB_SW_EXITINFO1, GHCB_SZ64, &syncout);
> > > +	ghcb_sync_val(GHCB_SW_EXITINFO2, GHCB_SZ64, &syncout);
> > > +
> > > +	/* Sync out to GHCB */
> > > +	ghcb = (struct ghcb_sa *)ghcb_vaddr;
> > > +	ghcb_sync_out(frame, sw_exitcode, sw_exitinfo1, sw_exitinfo2, ghcb,
> > > +	    &syncout);
> > > +
> > > +	/* Call hypervisor. */
> > > +	vmgexit();
> > > +
> > > +	/* Verify response */
> > > +	if (ghcb_verify_bm(ghcb->valid_bitmap, syncin.valid_bitmap)) {
> > > +		ghcb_clear(ghcb);
> > > +		panic("invalid hypervisor response");
> > > +	}
> > > +
> > > +	/* Sync in from GHCB */
> > > +	ghcb_sync_in(frame, ghcb, &syncin);
> > > +
> > > +	return 1;
> > > +}
> > > +
> > >
> > >  /*
> > >   * kerntrap(frame):
> > > @@ -348,6 +398,11 @@ kerntrap(struct trapframe *frame)
> > >  		else
> > >  			return;
> > >  #endif /* NISA > 0 */
> > > +
> > > +	case T_VC:
> > > +		if (vctrap(frame))
> > > +			return;
> > > +		goto we_re_toast;
> >
> > This looks like priviledged instructions are prossible from userland
> > now.  As vctrap() only calls panic and SEV-ES is not possible yet,
> > this should not be security problem.  I think a follow up diff
> > handles this situation correctly.
>
> cpuid is not privileged and is allowed from userland.  That follow-up
> diff will only allow cpuid from userland.  rdmsr/wrmsr and in/out will
> not be allowed.
>
> > >  	}
> > >  }
> > >
> > > @@ -427,6 +482,9 @@ usertrap(struct trapframe *frame)
> > >  		code = (frame->tf_err & 0x7fff) < 4 ? ILL_BTCFI
> > >  		    : ILL_BADSTK;
> > >  		break;
> > > +	case T_VC:
> > > +		vctrap(frame);
> > > +		goto out;
> > >
> > >  	case T_PAGEFLT:			/* page fault */
> > >  		if (!uvm_map_inentry(p, &p->p_spinentry, PROC_STACK(p),
> > > diff --git a/sys/arch/amd64/amd64/vector.S b/sys/arch/amd64/amd64/vector.S
> > > index 59c786483c3..4a106b0e8e7 100644
> > > --- a/sys/arch/amd64/amd64/vector.S
> > > +++ b/sys/arch/amd64/amd64/vector.S
> > > @@ -513,6 +513,15 @@ END(alltraps_kern)
> > >  END(alltraps_kern_meltdown)
> > >  KTEXT_PAGE_END
> > >
> > > +/* #VC trap entry for early bootstrap */
> > > +IDTVEC(vctrap_early)
> > > +	pushq	$T_VC
> > > +	TRAP_ENTRY_KERN		/* early #VC has to be in kernel mode */
> > > +	cld
> > > +	movq	%rsp, %rdi
> > > +	call	vctrap
> > > +	movq	$0,-8(%rsp)
> > > +	INTRFASTEXIT
> > >
> > >  /*
> > >   * Macros for interrupt entry, call to handler, and exit.
> > > diff --git a/sys/arch/amd64/include/cpufunc.h b/sys/arch/amd64/include/cpufunc.h
> > > index e4c8d6924d2..9f01b3cb989 100644
> > > --- a/sys/arch/amd64/include/cpufunc.h
> > > +++ b/sys/arch/amd64/include/cpufunc.h
> > > @@ -439,6 +439,13 @@ breakpoint(void)
> > >  	__asm volatile("int $3");
> > >  }
> > >
> > > +/* VMGEXIT */
> > > +static __inline void
> > > +vmgexit(void)
> > > +{
> > > +	__asm volatile("rep; vmmcall");
> > > +}
> > > +
> > >  void amd64_errata(struct cpu_info *);
> > >  void cpu_ucode_setup(void);
> > >  void cpu_ucode_apply(struct cpu_info *);
> > > diff --git a/sys/arch/amd64/include/ghcb.h b/sys/arch/amd64/include/ghcb.h
> > > index 5c23b310fa2..ed4bad5dda1 100644
> > > --- a/sys/arch/amd64/include/ghcb.h
> > > +++ b/sys/arch/amd64/include/ghcb.h
> > > @@ -109,6 +109,9 @@ struct ghcb_sync {
> > >
> > >  #ifndef _LOCORE
> > >
> > > +extern vaddr_t ghcb_vaddr;
> > > +extern paddr_t ghcb_paddr;
> > > +
> > >  void	ghcb_clear(struct ghcb_sa *);
> > >  int	ghcb_valbm_set(uint8_t *, int);
> > >  int	ghcb_valbm_isset(uint8_t *, int);
> > > diff --git a/sys/arch/amd64/include/segments.h b/sys/arch/amd64/include/segments.h
> > > index 308639a6baa..ce284262faa 100644
> > > --- a/sys/arch/amd64/include/segments.h
> > > +++ b/sys/arch/amd64/include/segments.h
> > > @@ -151,6 +151,7 @@ struct region_descriptor {
> > >
> > >  #ifdef _KERNEL
> > >  extern struct gate_descriptor *idt;
> > > +extern struct gate_descriptor early_idt[];
> > >
> > >  void setgate(struct gate_descriptor *, void *, int, int, int, int);
> > >  void unsetgate(struct gate_descriptor *);