
From: Mark Kettenis <mark.kettenis@xs4all.nl>
Subject: Re: SEV-ES multiprocessor support
To: Stefan Fritsch <sf@sfritsch.de>
Cc: tech@openbsd.org, mlarkin@nested.page
Date: Thu, 18 Sep 2025 13:22:15 +0200

> Date: Thu, 18 Sep 2025 13:17:32 +0200 (CEST)
> From: Stefan Fritsch <sf@sfritsch.de>
> 
> Hi,
> 
> one remaining problem with SEV-ES is that we don't support multiprocessor
> operation yet, and booting OpenBSD in a SEV-ES VM that has several VCPUs
> hangs at
> 
> cpu3 at mainbus0: apid 3 (application processor)
> cpu3: failed to become ready
> cpu3: failed to identify
> 
> Sometimes it continues after some time, but often it does not. I am not
> sure if the problem is on our side or if there is some error handling
> missing in qemu/KVM. Even if it does not hang, some things do not work
> correctly: sysctl hw.ncpu reports the wrong value, top prints warnings,
> and so on.
> 
> In any case, I think this should be fixed somehow before the release, in 
> order to avoid support requests on the lists. There are two ways forward:
> 
> 1) try to get SEV-ES MP support finished before the release.
> 
> 2) commit some workaround that prevents OpenBSD from trying to use the
> application processors if SEV-ES is enabled, likely in cpu_match().
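> 
> Something along these lines at the top of cpu_match() might do; this is
> an untested sketch, not part of the diff below:
> 
> 	struct cpu_attach_args *caa = aux;
> 
> 	/*
> 	 * Hypothetical workaround: with SEV-ES enabled there is no MP
> 	 * bringup yet, so refuse to match the application processors.
> 	 */
> 	if (ISSET(cpu_sev_guestmode, SEV_STAT_ES_ENABLED) &&
> 	    caa->cpu_role == CPU_ROLE_AP)
> 		return (0);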
> 
> The diff that implements MP support is attached below. With this diff,
> OpenBSD works for me in a 4-VCPU VM with SEV-ES enabled.
> 
> There is also the question if we actually need MP support for SEV-ES. 
> SEV-ES is just an intermediate step and, in the end, most people will want
> to use SEV-SNP (supported in Zen 3 EPYC CPUs and later). MP CPU bringup is
> again a bit different with SEV-SNP compared to SEV-ES, though the larger 
> part of the diff is needed for both variants. In my opinion, skipping MP 
> support for SEV-ES and only implementing it for SEV-SNP later is also an 
> option.
> 
> I doubt there is enough time for option 1). But I could start splitting
> the diff into reviewable parts, and we will see how far we get.
> 
> What do you think?

I think no.  If it isn't finished, it isn't finished.  There will be
time to do things properly after the tree unlocks.

> diff --git a/sys/arch/amd64/amd64/cpu.c b/sys/arch/amd64/amd64/cpu.c
> index 2611859f3f5..247f7b8cff1 100644
> --- a/sys/arch/amd64/amd64/cpu.c
> +++ b/sys/arch/amd64/amd64/cpu.c
> @@ -95,6 +95,7 @@
>  #include <machine/gdt.h>
>  #include <machine/pio.h>
>  #include <machine/vmmvar.h>
> +#include <machine/ghcb.h>
>  
>  #if NLAPIC > 0
>  #include <machine/i82489reg.h>
> @@ -438,6 +439,10 @@ int mp_cpu_start(struct cpu_info *);
>  void mp_cpu_start_cleanup(struct cpu_info *);
>  struct cpu_functions mp_cpu_funcs = { mp_cpu_start, NULL,
>  				      mp_cpu_start_cleanup };
> +int mp_sev_es_cpu_start(struct cpu_info *);
> +void mp_sev_es_cpu_start_cleanup(struct cpu_info *);
> +struct cpu_functions mp_sev_es_cpu_funcs = { mp_sev_es_cpu_start, NULL,
> +					     mp_sev_es_cpu_start_cleanup };
>  #endif /* MULTIPROCESSOR */
>  
>  const struct cfattach cpu_ca = {
> @@ -606,6 +611,27 @@ cpu_attach(struct device *parent, struct device *self, void *aux)
>  		ci->ci_tlog_base = malloc(sizeof(struct tlog),
>  		    M_DEVBUF, M_WAITOK);
>  #endif
> +
> +		if (ISSET(cpu_sev_guestmode, SEV_STAT_ES_ENABLED)) {
> +			struct ghcb_sa *ghcb_va = NULL;
> +			struct vm_page *ghcb_page;
> +
> +			ghcb_page = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
> +			if (ghcb_page == NULL)
> +				panic("failed to allocate GHCB page");
> +
> +			ghcb_va = km_alloc(PAGE_SIZE, &kv_any, &kp_none, &kd_nowait);
> +			if (ghcb_va == NULL)
> +				panic("failed to allocate virtual GHCB address");
> +
> +			pmap_kenter_pa((vaddr_t)ghcb_va, ghcb_page->phys_addr | PMAP_NOCRYPT,
> +				PROT_READ | PROT_WRITE);
> +
> +			ci->ci_ghcb_paddr = ghcb_page->phys_addr;
> +			ci->ci_ghcb = ghcb_va;
> +
> +			memset(ghcb_va, 0, PAGE_SIZE);
> +		}
>  	} else {
>  		ci = &cpu_info_primary;
>  #if defined(MULTIPROCESSOR)
> @@ -1031,6 +1057,24 @@ cpu_hatch(void *v)
>  	struct cpu_info *ci = (struct cpu_info *)v;
>  	int s;
>  
> +	/* We need the GSBASE MSR for the vctrap handler to work.
> +	 * CPUID will trap into the #VC trap handler on AMD SEV-ES.
> +	 */
> +	cpu_init_msrs(ci);
> +
> +	if (ISSET(cpu_sev_guestmode, SEV_STAT_ES_ENABLED)) {
> +		extern int x2apic_enabled;
> +
> +		/* Load IDT early for #VC handler */
> +		cpu_init_idt();
> +		if (x2apic_enabled) {
> +			/* Now that we have a #VC handler, we are able
> +			 * to enable x2APIC.
> +			 */
> +			wrmsr(MSR_APICBASE, rdmsr(MSR_APICBASE) | APICBASE_ENABLE_X2APIC);
> +		}
> +	}
> +
>  	{
>  		uint32_t vendor[4];
>  		int level;
> @@ -1040,7 +1084,6 @@ cpu_hatch(void *v)
>  		cpu_set_vendor(ci, level, (const char *)vendor);
>  	}
>  
> -	cpu_init_msrs(ci);
>  
>  #ifdef DEBUG
>  	if (ci->ci_flags & CPUF_PRESENT)
> @@ -1205,6 +1248,60 @@ mp_cpu_start_cleanup(struct cpu_info *ci)
>  	outb(IO_RTC, NVRAM_RESET);
>  	outb(IO_RTC+1, NVRAM_RESET_RST);
>  }
> +
> +paddr_t sev_es_jmp_tbl_addr;
> +
> +int mp_sev_es_cpu_start(struct cpu_info *ci)
> +{
> +	struct {
> +		uint16_t	reset_ip;
> +		uint16_t	reset_cs;
> +	} *jmp_tbl;
> +
> +	if (sev_es_jmp_tbl_addr == 0) {
> +		paddr_t jmp_tbl_paddr;
> +
> +		if (!ghcb_get_ap_jump_table(&jmp_tbl_paddr))
> +			sev_es_jmp_tbl_addr = jmp_tbl_paddr & ~PAGE_MASK;
> +		else
> +			panic("failed to get AP jump table address");
> +
> +		/* Update the AP jump table only once */
> +		jmp_tbl = km_alloc(PAGE_SIZE, &kv_any, &kp_none, &kd_nowait);
> +		if (jmp_tbl == NULL)
> +			panic("failed to allocate virtual address");
> +
> +		pmap_kenter_pa((vaddr_t)jmp_tbl, sev_es_jmp_tbl_addr,
> +		    PROT_READ | PROT_WRITE);
> +
> +		jmp_tbl->reset_ip = 0;
> +		jmp_tbl->reset_cs = MP_TRAMPOLINE >> 4;
> +
> +		pmap_kremove((vaddr_t)jmp_tbl, PAGE_SIZE);
> +		km_free(jmp_tbl, PAGE_SIZE, &kv_any, &kp_none);
> +	}
> +
> +	if (ci->ci_flags & CPUF_AP) {
> +		x86_ipi_init(ci->ci_apicid);
> +
> +		delay(10000);
> +
> +		if (cpu_feature & CPUID_APIC) {
> +			x86_ipi(0, ci->ci_apicid, LAPIC_DLMODE_STARTUP);
> +			delay(200);
> +
> +			x86_ipi(0, ci->ci_apicid, LAPIC_DLMODE_STARTUP);
> +			delay(200);
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +void mp_sev_es_cpu_start_cleanup(struct cpu_info *ci)
> +{
> +	(void)ci;
> +}
>  #endif	/* MULTIPROCESSOR */
>  
>  typedef void (vector)(void);
> diff --git a/sys/arch/amd64/amd64/ghcb.c b/sys/arch/amd64/amd64/ghcb.c
> index 2b0fa809570..aace7f28303 100644
> --- a/sys/arch/amd64/amd64/ghcb.c
> +++ b/sys/arch/amd64/amd64/ghcb.c
> @@ -47,9 +47,6 @@ const uint64_t ghcb_sz_clear_masks[] = {
>      0xffffffffffffffffULL, 0xffffffffffffffffULL
>  };
>  
> -vaddr_t ghcb_vaddr;
> -paddr_t ghcb_paddr;
> -
>  /*
>   * ghcb_clear
>   *
> @@ -254,6 +251,11 @@ ghcb_sync_in(struct trapframe *frame, struct ghcb_extra_regs *regs,
>  		frame->tf_rdx |= (ghcb->v_rdx & ghcb_sz_masks[gsin->sz_d]);
>  	}
>  
> +	if (ghcb_valbm_isset(gsin->valid_bitmap, GHCB_SW_EXITINFO1))
> +		regs->exitinfo1 = ghcb->v_sw_exitinfo1;
> +	if (ghcb_valbm_isset(gsin->valid_bitmap, GHCB_SW_EXITINFO2))
> +		regs->exitinfo2 = ghcb->v_sw_exitinfo2;
> +
>  	if (regs && regs->data) {
>  		data_sz = regs->data_sz;
>  		KASSERT(data_sz <= sizeof(ghcb->v_sharedbuf));
> @@ -303,14 +305,14 @@ _ghcb_mem_rw(vaddr_t addr, int valsz, void *val, bool read)
>  		ghcb_regs.exitcode = SEV_VMGEXIT_MMIO_READ;
>  		ghcb_regs.exitinfo1 = paddr;
>  		ghcb_regs.exitinfo2 = size;
> -		ghcb_regs.scratch = ghcb_paddr + offsetof(struct ghcb_sa,
> -		    v_sharedbuf);
> +		ghcb_regs.scratch = curcpu()->ci_ghcb_paddr +
> +		    offsetof(struct ghcb_sa, v_sharedbuf);
>  	} else {
>  		ghcb_regs.exitcode = SEV_VMGEXIT_MMIO_WRITE;
>  		ghcb_regs.exitinfo1 = paddr;
>  		ghcb_regs.exitinfo2 = size;
> -		ghcb_regs.scratch = ghcb_paddr + offsetof(struct ghcb_sa,
> -		    v_sharedbuf);
> +		ghcb_regs.scratch = curcpu()->ci_ghcb_paddr +
> +		    offsetof(struct ghcb_sa, v_sharedbuf);
>  		ghcb_regs.data = val;
>  		ghcb_regs.data_sz = size;
>  	}
> @@ -322,10 +324,10 @@ _ghcb_mem_rw(vaddr_t addr, int valsz, void *val, bool read)
>  
>  	s = intr_disable();
>  
> -	ghcb = (struct ghcb_sa *)ghcb_vaddr;
> -	ghcb_sync_out(NULL, &ghcb_regs, ghcb, &syncout);
> +	wrmsr(MSR_SEV_GHCB, curcpu()->ci_ghcb_paddr);
>  
> -	wrmsr(MSR_SEV_GHCB, ghcb_paddr);
> +	ghcb = curcpu()->ci_ghcb;
> +	ghcb_sync_out(NULL, &ghcb_regs, ghcb, &syncout);
>  
>  	vmgexit();
>  
> @@ -399,10 +401,10 @@ _ghcb_io_rw(uint16_t port, int valsz, uint32_t *val, bool read)
>  
>  	s = intr_disable();
>  
> -	ghcb = (struct ghcb_sa *)ghcb_vaddr;
> +	ghcb = curcpu()->ci_ghcb;
>  	ghcb_sync_out(&frame, &ghcb_regs, ghcb, &syncout);
>  
> -	wrmsr(MSR_SEV_GHCB, ghcb_paddr);
> +	wrmsr(MSR_SEV_GHCB, curcpu()->ci_ghcb_paddr);
>  
>  	vmgexit();
>  
> @@ -418,3 +420,55 @@ _ghcb_io_rw(uint16_t port, int valsz, uint32_t *val, bool read)
>  	if (read)
>  		*val = frame.tf_rax;
>  }
> +
> +#ifdef MULTIPROCESSOR
> +int
> +ghcb_get_ap_jump_table(paddr_t *jmp_tbl_addr)
> +{
> +	struct ghcb_sa *ghcb;
> +	struct ghcb_sync syncout, syncin;
> +	struct ghcb_extra_regs ghcb_regs;
> +	unsigned long s;
> +
> +	memset(&syncout, 0, sizeof(syncout));
> +	memset(&syncin, 0, sizeof(syncin));
> +	memset(&ghcb_regs, 0, sizeof(ghcb_regs));
> +
> +	ghcb_regs.exitcode = SEV_VMGEXIT_AP_JUMP_TABLE;
> +	ghcb_sync_val(GHCB_SW_EXITCODE, GHCB_SZ64, &syncout);
> +	ghcb_regs.exitinfo1 = 1;	/* GET */
> +	ghcb_sync_val(GHCB_SW_EXITINFO1, GHCB_SZ64, &syncout);
> +	ghcb_regs.exitinfo2 = 0;
> +	ghcb_sync_val(GHCB_SW_EXITINFO2, GHCB_SZ64, &syncout);
> +
> +	ghcb_sync_val(GHCB_SW_EXITINFO1, GHCB_SZ64, &syncin);
> +	ghcb_sync_val(GHCB_SW_EXITINFO2, GHCB_SZ64, &syncin);
> +
> +	s = intr_disable();
> +
> +	wrmsr(MSR_SEV_GHCB, curcpu()->ci_ghcb_paddr);
> +
> +	ghcb = curcpu()->ci_ghcb;
> +	ghcb_sync_out(NULL, &ghcb_regs, ghcb, &syncout);
> +
> +	vmgexit();
> +
> +	if (ghcb_verify_bm(ghcb->valid_bitmap, syncin.valid_bitmap)) {
> +		ghcb_clear(ghcb);
> +		panic("invalid hypervisor response");
> +	}
> +
> +	memset(&ghcb_regs, 0, sizeof(ghcb_regs));
> +
> +	ghcb_sync_in(NULL, &ghcb_regs, ghcb, &syncin);
> +
> +	intr_restore(s);
> +
> +	if (ghcb_regs.exitinfo1 == 0) {
> +		*jmp_tbl_addr = ghcb_regs.exitinfo2;
> +		return 0;
> +	} else {
> +		return 1;
> +	}
> +}
> +#endif
> diff --git a/sys/arch/amd64/amd64/lapic.c b/sys/arch/amd64/amd64/lapic.c
> index f7fdb81ccca..80436294e6f 100644
> --- a/sys/arch/amd64/amd64/lapic.c
> +++ b/sys/arch/amd64/amd64/lapic.c
> @@ -99,6 +99,7 @@ struct pic local_pic = {
>  };
>  
>  extern int x2apic_eoi;
> +extern int x2apic_eoi_swapgs;
>  int x2apic_enabled = 0;
>  
>  u_int32_t x2apic_readreg(int reg);
> @@ -207,6 +208,10 @@ lapic_map(paddr_t lapic_base)
>  #endif
>  		x2apic_enabled = 1;
>  		codepatch_call(CPTAG_EOI, &x2apic_eoi);
> +		if (ISSET(cpu_sev_guestmode, SEV_STAT_ES_ENABLED))
> +			codepatch_call(CPTAG_EOI_FAST_IPI, &x2apic_eoi_swapgs);
> +		else
> +			codepatch_call(CPTAG_EOI_FAST_IPI, &x2apic_eoi);
>  
>  		va = (vaddr_t)&local_apic;
>  	} else {
> @@ -222,6 +227,9 @@ lapic_map(paddr_t lapic_base)
>  		pte = kvtopte(va);
>  		*pte = lapic_base | PG_RW | PG_V | PG_N | PG_G | pg_nx;
>  		invlpg(va);
> +
> +		if (ISSET(cpu_sev_guestmode, SEV_STAT_ES_ENABLED))
> +			panic("xAPIC mode not implemented for SEV-ES");
>  	}
>  
>  	/*
> diff --git a/sys/arch/amd64/amd64/locore0.S b/sys/arch/amd64/amd64/locore0.S
> index 4533b19df2f..951da60b1d2 100644
> --- a/sys/arch/amd64/amd64/locore0.S
> +++ b/sys/arch/amd64/amd64/locore0.S
> @@ -804,15 +804,6 @@ longmode_hi:
>  	addq	%rsi,%rdx
>  	movq	%rdx,atdevbase(%rip)
>  
> -	/* Relocate GHCB. */
> -	movq	cpu_sev_guestmode(%rip),%rax
> -	testq	$SEV_STAT_ES_ENABLED,%rax
> -	jz	.Lnoghcbreloc
> -	movq	$(PROC0_GHCB_OFF+KERNBASE),%rdx
> -	addq	%rsi,%rdx
> -	movq	%rdx,ghcb_vaddr(%rip)
> -
> -.Lnoghcbreloc:
>  	/* Record start of symbols */
>  	movq	$__kernel_bss_end, ssym(%rip)
>  
> diff --git a/sys/arch/amd64/amd64/machdep.c b/sys/arch/amd64/amd64/machdep.c
> index 3de32b26354..d86ec85db67 100644
> --- a/sys/arch/amd64/amd64/machdep.c
> +++ b/sys/arch/amd64/amd64/machdep.c
> @@ -1342,9 +1342,10 @@ cpu_init_early_vctrap(paddr_t addr)
>  	cpu_init_idt();
>  
>  	/* Tell vmm(4) about our GHCB. */
> -	ghcb_paddr = addr;
> -	memset((void *)ghcb_vaddr, 0, 2 * PAGE_SIZE);
> -	wrmsr(MSR_SEV_GHCB, ghcb_paddr);
> +	cpu_info_primary.ci_ghcb_paddr = addr;
> +	cpu_info_primary.ci_ghcb = (struct ghcb_sa *)(addr + KERNBASE);
> +	memset(cpu_info_primary.ci_ghcb, 0, 2 * PAGE_SIZE);
> +	wrmsr(MSR_SEV_GHCB, cpu_info_primary.ci_ghcb_paddr);
>  }
>  
>  void
> @@ -1388,6 +1389,7 @@ map_tramps(void)
>  	extern u_char mp_tramp_data_start[];
>  	extern u_char mp_tramp_data_end[];
>  	extern u_int32_t mp_pdirpa;
> +	extern u_int32_t mp_sev_guestmode;
>  #endif
>  
>  	/*
> @@ -1429,6 +1431,13 @@ map_tramps(void)
>  	 */
>  	mp_pdirpa = tramp_pdirpa;
>  
> +	/*
> +	 * We need to introduce and set mp_sev_guestmode since the
> +	 * global cpu_sev_guestmode variable may not be accessible in
> +	 * 16- or 32-bit mode.
> +	 */
> +	mp_sev_guestmode = cpu_sev_guestmode;
> +
>  	/* Unmap, will be remapped in cpu_start_secondary */
>  	pmap_kremove(MP_TRAMPOLINE, PAGE_SIZE);
>  	pmap_kremove(MP_TRAMP_DATA, PAGE_SIZE);
> diff --git a/sys/arch/amd64/amd64/mptramp.S b/sys/arch/amd64/amd64/mptramp.S
> index 96247c8e890..838168843bf 100644
> --- a/sys/arch/amd64/amd64/mptramp.S
> +++ b/sys/arch/amd64/amd64/mptramp.S
> @@ -143,6 +143,14 @@ _TRMP_LABEL(.Lmp_startup)
>  	rdmsr
>  	movl	%edx, %edi		# %edx is needed by wrmsr below
>  
> +	# If SEV is enabled, we can assume that NXE is supported and we cannot
> +	# do cpuid yet.
> +	movl	$mp_sev_guestmode, %edx
> +	movl	(%edx), %edx
> +	xorl	%eax, %eax
> +	testl	%edx, %edx
> +	jnz	4f
> +
>  	# Check if we need to enable NXE
>  	movl	$0x80000001, %eax
>  	cpuid
> @@ -150,6 +158,7 @@ _TRMP_LABEL(.Lmp_startup)
>  	xorl	%eax,%eax
>  	testl	%edx, %edx
>  	jz	1f
> +4:
>  	orl	$EFER_NXE, %eax
>  1:
>  	orl	$(EFER_LME|EFER_SCE), %eax
> @@ -192,6 +201,31 @@ END(cpu_spinup_trampoline)
>  
>  	.text
>  GENTRY(cpu_spinup_finish)
> +	movl	$mp_sev_guestmode, %eax
> +	movl	(%eax), %eax
> +	testl	$SEV_STAT_ES_ENABLED, %eax
> +	jz	5f
> +
> +	# We are in SEV-ES mode. MSR or MMIO access is only possible
> +	# through a GHCB. Query the APIC ID via CPUID leaf 1, EBX.
> +	movl	$1, %edx
> +	# register index 1 (EBX), GHCB MSR protocol function 4 (cpuid request)
> +	movl	$(1 << 30 | 4), %eax
> +	movl	$MSR_SEV_GHCB, %ecx
> +	wrmsr
> +	rep vmmcall
> +	rdmsr
> +	# Make sure the query was successful
> +	cmpl	$(1 << 30 | 5), %eax
> +	jne	.Lsev_es_terminate
> +
> +	movl	%edx, %eax
> +	shrl	$24, %eax
> +	# Skip x2apic initialization if running on SEV-ES or higher.
> +	# We cannot do rdmsr/wrmsr without a GHCB; it is done later in cpu_hatch().
> +	jmp	2f
> +
> +5:
>  	movl	x2apic_enabled,%eax
>  	testl	%eax,%eax
>  	jz	1f
> @@ -234,9 +268,18 @@ GENTRY(cpu_spinup_finish)
>  	movq	%rax,%cr0
>  	call	cpu_hatch
>  	movq	$0,-8(%rsp)
> -END(cpu_spinup_finish)
>  	/* NOTREACHED */
>  
> +.Lsev_es_terminate:
> +	xorl	%edx, %edx
> +	movl	$0x100, %eax
> +	movl	$MSR_SEV_GHCB, %ecx
> +	wrmsr
> +	rep vmmcall
> +	hlt
> +	jmp	.Lsev_es_terminate
> +END(cpu_spinup_finish)
> +
>  	.section .rodata
>  	.type	mp_tramp_data_start,@object
>  mp_tramp_data_start:
> @@ -250,6 +293,12 @@ _TRMP_DATA_LABEL(mp_pdirpa)
>  	.long 0
>  	.size	mp_pdirpa,4
>  
> +	.global mp_sev_guestmode
> +	.type	mp_sev_guestmode,@object
> +_TRMP_DATA_LABEL(mp_sev_guestmode)
> +	.long	0
> +	.size	mp_sev_guestmode,4
> +
>  
>  _TRMP_DATA_LABEL(.Lmptramp_gdt32)
>  	.quad 0x0000000000000000
> diff --git a/sys/arch/amd64/amd64/trap.c b/sys/arch/amd64/amd64/trap.c
> index 396366de89b..49c7c0ffb70 100644
> --- a/sys/arch/amd64/amd64/trap.c
> +++ b/sys/arch/amd64/amd64/trap.c
> @@ -426,6 +426,17 @@ vctrap(struct trapframe *frame, int user)
>  		}
>  		break;
>  	    }
> +	case SVM_VMEXIT_WBINVD:
> +		/* There is no special GHCB request for WBNOINVD.
> +		 * Signal WBINVD to emulate WBNOINVD.
> +		 */
> +		if (*rip == 0xf3)
> +			frame->tf_rip += 3;
> +		else
> +			frame->tf_rip += 2;
> +		break;
> +	case SVM_VMEXIT_NPF:
> +		panic("Unexptected SEV nested page fault");
>  	default:
>  		panic("invalid exit code 0x%llx", ghcb_regs.exitcode);
>  	}
> @@ -436,10 +447,10 @@ vctrap(struct trapframe *frame, int user)
>  	ghcb_sync_val(GHCB_SW_EXITINFO2, GHCB_SZ64, &syncout);
>  
>  	/* Sync out to GHCB */
> -	ghcb = (struct ghcb_sa *)ghcb_vaddr;
> +	ghcb = curcpu()->ci_ghcb;
>  	ghcb_sync_out(frame, &ghcb_regs, ghcb, &syncout);
>  
> -	wrmsr(MSR_SEV_GHCB, ghcb_paddr);
> +	wrmsr(MSR_SEV_GHCB, curcpu()->ci_ghcb_paddr);
>  
>  	/* Call hypervisor. */
>  	vmgexit();
> diff --git a/sys/arch/amd64/amd64/vector.S b/sys/arch/amd64/amd64/vector.S
> index 8b82db6b4f6..cbfe817ea9c 100644
> --- a/sys/arch/amd64/amd64/vector.S
> +++ b/sys/arch/amd64/amd64/vector.S
> @@ -590,6 +590,40 @@ KUENTRY(x2apic_eoi)
>  	lfence
>  END(x2apic_eoi)
>  
> +/*
> + * With SEV-ES the wrmsr instruction traps into the #VC handler which
> + * needs the kernel GS_BASE. So if we come from userland, we need to
> + * do swapgs. The fast IPI handler does not perform swapgs, so we need
> + * to do it here. In order to detect whether we come from user or kernel
> + * land, this function MUST be called before %rsp is modified.
> + */
> +KUENTRY(x2apic_eoi_swapgs)
> +	/* If we come from userland, do swapgs so curcpu() works */
> +	testb	$SEL_RPL,16(%rsp)
> +	jz	1f
> +	swapgs
> +	FENCE_SWAPGS_MIS_TAKEN
> +1:
> +	pushq	%rax
> +	pushq	%rcx
> +	pushq	%rdx
> +	mov     $MSR_X2APIC_EOI,%ecx
> +	mov     $0,%eax
> +	mov     $0,%edx
> +	wrmsr
> +	popq	%rdx
> +	popq	%rcx
> +	popq	%rax
> +
> +	testb	$SEL_RPL,16(%rsp)
> +	jz	2f
> +	swapgs
> +	FENCE_SWAPGS_MIS_TAKEN
> +2:
> +	retq
> +	lfence
> +END(x2apic_eoi_swapgs)
> +
>  #if NLAPIC > 0
>  #ifdef MULTIPROCESSOR
>  KIDTVEC(recurse_lapic_ipi)
> @@ -629,9 +663,9 @@ END(Xresume_lapic_ipi)
>   */
>  /* invalidate the entire TLB, no PCIDs version */
>  IDTVEC(ipi_invltlb)
> -	pushq	%rax
> +	ioapic_asm_ack_fast_ipi()
>  
> -	ioapic_asm_ack()
> +	pushq	%rax
>  
>  	movq	%cr3, %rax
>  	movq	%rax, %cr3
> @@ -646,11 +680,11 @@ END(Xipi_invltlb)
>  #if NVMM > 0
>  /* Invalidate VMX EPT */
>  IDTVEC(ipi_invept)
> +	ioapic_asm_ack_fast_ipi()
> +
>  	pushq	%rax
>  	pushq	%rdx
>  
> -	ioapic_asm_ack()
> -
>  	movq	$ept_shoot_vid, %rax
>  	movq	ept_shoot_mode, %rdx
>  	invept	(%rax), %rdx
> @@ -666,9 +700,9 @@ END(Xipi_invept)
>  
>  /* invalidate a single page, no PCIDs version */
>  IDTVEC(ipi_invlpg)
> -	pushq	%rax
> +	ioapic_asm_ack_fast_ipi()
>  
> -	ioapic_asm_ack()
> +	pushq	%rax
>  
>  	movq	tlb_shoot_addr1, %rax
>  	invlpg	(%rax)
> @@ -682,11 +716,11 @@ END(Xipi_invlpg)
>  
>  /* invalidate a range of pages, no PCIDs version */
>  IDTVEC(ipi_invlrange)
> +	ioapic_asm_ack_fast_ipi()
> +
>  	pushq	%rax
>  	pushq	%rdx
>  
> -	ioapic_asm_ack()
> -
>  	movq	tlb_shoot_addr1, %rax
>  	movq	tlb_shoot_addr2, %rdx
>  1:	invlpg	(%rax)
> @@ -706,9 +740,9 @@ END(Xipi_invlrange)
>   * Invalidate the userspace PCIDs.
>   */
>  IDTVEC(ipi_invltlb_pcid)
> -	pushq	%rax
> +	ioapic_asm_ack_fast_ipi()
>  
> -	ioapic_asm_ack()
> +	pushq	%rax
>  
>  	/* set the type */
>  	movl	$INVPCID_PCID,%eax
> @@ -740,9 +774,9 @@ END(Xipi_invltlb_pcid)
>   * while userspace VAs are present in PCIDs 1 and 2.
>   */
>  IDTVEC(ipi_invlpg_pcid)
> -	pushq	%rax
> +	ioapic_asm_ack_fast_ipi()
>  
> -	ioapic_asm_ack()
> +	pushq	%rax
>  
>  	/* space for the INVPCID descriptor */
>  	subq	$16,%rsp
> @@ -777,12 +811,12 @@ END(Xipi_invlpg_pcid)
>   * PCIDs 0 and 1, while userspace VAs are present in PCIDs 1 and 2.
>   */
>  IDTVEC(ipi_invlrange_pcid)
> +	ioapic_asm_ack_fast_ipi()
> +
>  	pushq	%rax
>  	pushq	%rdx
>  	pushq	%rcx
>  
> -	ioapic_asm_ack()
> -
>  	/* space for the INVPCID descriptor */
>  	subq	$16,%rsp
>  
> @@ -817,7 +851,7 @@ IDTVEC(ipi_invlrange_pcid)
>  END(Xipi_invlrange_pcid)
>  
>  IDTVEC(ipi_wbinvd)
> -	ioapic_asm_ack()
> +	ioapic_asm_ack_fast_ipi()
>  
>  	wbinvd
>  
> diff --git a/sys/arch/amd64/include/codepatch.h b/sys/arch/amd64/include/codepatch.h
> index 2ccb638a8e8..6b6bfee62e1 100644
> --- a/sys/arch/amd64/include/codepatch.h
> +++ b/sys/arch/amd64/include/codepatch.h
> @@ -70,6 +70,7 @@ void codepatch_disable(void);
>  #define CPTAG_RETPOLINE_R11		15
>  #define CPTAG_RETPOLINE_R13		16
>  #define CPTAG_IBPB_NOP			17
> +#define CPTAG_EOI_FAST_IPI		18
>  
>  /*
>   * stac/clac SMAP instructions have lfence like semantics.  Let's
> diff --git a/sys/arch/amd64/include/cpu.h b/sys/arch/amd64/include/cpu.h
> index 8c71c424a8f..6b725ff796a 100644
> --- a/sys/arch/amd64/include/cpu.h
> +++ b/sys/arch/amd64/include/cpu.h
> @@ -107,6 +107,7 @@ enum cpu_vendor {
>   */
>  struct x86_64_tss;
>  struct vcpu;
> +struct ghcb_sa;
>  struct cpu_info {
>  	/*
>  	 * The beginning of this structure in mapped in the userspace "u-k"
> @@ -219,6 +220,9 @@ struct cpu_info {
>  	struct uvm_pmr_cache	ci_uvm;		/* [o] page cache */
>  #endif
>  
> +	struct ghcb_sa	*ci_ghcb;
> +	paddr_t		ci_ghcb_paddr;
> +
>  	struct ksensordev	ci_sensordev;
>  	struct ksensor		ci_sensor;
>  	struct ksensor		ci_hz_sensor;
> diff --git a/sys/arch/amd64/include/cpuvar.h b/sys/arch/amd64/include/cpuvar.h
> index fb1de0cb1b1..5b2669a36aa 100644
> --- a/sys/arch/amd64/include/cpuvar.h
> +++ b/sys/arch/amd64/include/cpuvar.h
> @@ -71,6 +71,7 @@ struct cpu_functions {
>  };
>  
>  extern struct cpu_functions mp_cpu_funcs;
> +extern struct cpu_functions mp_sev_es_cpu_funcs;
>  
>  #define CPU_ROLE_SP	0
>  #define CPU_ROLE_BP	1
> diff --git a/sys/arch/amd64/include/ghcb.h b/sys/arch/amd64/include/ghcb.h
> index bac63968d24..a39d5a9401f 100644
> --- a/sys/arch/amd64/include/ghcb.h
> +++ b/sys/arch/amd64/include/ghcb.h
> @@ -111,9 +111,6 @@ struct ghcb_sync {
>  
>  #ifndef _LOCORE
>  
> -extern vaddr_t ghcb_vaddr;
> -extern paddr_t ghcb_paddr;
> -
>  struct ghcb_extra_regs {
>  	uint64_t	 exitcode;
>  	uint64_t	 exitinfo1;
> @@ -136,6 +133,9 @@ void	ghcb_sync_in(struct trapframe *, struct ghcb_extra_regs *,
>  	    struct ghcb_sa *, struct ghcb_sync *);
>  void	_ghcb_mem_rw(vaddr_t, int, void *, bool);
>  void	_ghcb_io_rw(uint16_t, int, uint32_t *, bool);
> +#ifdef MULTIPROCESSOR
> +int	ghcb_get_ap_jump_table(paddr_t *);
> +#endif
>  
>  static inline uint8_t
>  ghcb_mem_read_1(vaddr_t addr)
> diff --git a/sys/arch/amd64/include/i82093reg.h b/sys/arch/amd64/include/i82093reg.h
> index 99b22923499..3288176fb22 100644
> --- a/sys/arch/amd64/include/i82093reg.h
> +++ b/sys/arch/amd64/include/i82093reg.h
> @@ -114,7 +114,21 @@
>  
>  #include <machine/codepatch.h>
>  
> -#define ioapic_asm_ack(num) 					 \
> +/*
> + * This macro must also work if swapgs has not been called on entry
> + * from userland.
> + */
> +#define ioapic_asm_ack_fast_ipi(num)				\
> +	CODEPATCH_START						;\
> +	movl	$0,(local_apic+LAPIC_EOI)(%rip)			;\
> +	CODEPATCH_END(CPTAG_EOI_FAST_IPI)
> +
> +
> +/*
> + * This macro assumes that swapgs has already been called (e.g. by
> + * INTRENTRY).
> + */
> +#define ioapic_asm_ack(num)					 \
>  	CODEPATCH_START						;\
>  	movl	$0,(local_apic+LAPIC_EOI)(%rip)			;\
>  	CODEPATCH_END(CPTAG_EOI)
> diff --git a/sys/arch/amd64/include/vmmvar.h b/sys/arch/amd64/include/vmmvar.h
> index 25b1618ad1f..23ee60eb465 100644
> --- a/sys/arch/amd64/include/vmmvar.h
> +++ b/sys/arch/amd64/include/vmmvar.h
> @@ -271,6 +271,7 @@
>   */
>  #define SEV_VMGEXIT_MMIO_READ			0x80000001
>  #define SEV_VMGEXIT_MMIO_WRITE			0x80000002
> +#define SEV_VMGEXIT_AP_JUMP_TABLE		0x80000005
>  
>  #ifndef _LOCORE
>  
> diff --git a/sys/dev/acpi/acpimadt.c b/sys/dev/acpi/acpimadt.c
> index 275f2b1e6ce..f9f3a0a6538 100644
> --- a/sys/dev/acpi/acpimadt.c
> +++ b/sys/dev/acpi/acpimadt.c
> @@ -263,6 +263,10 @@ acpimadt_attach(struct device *parent, struct device *self, void *aux)
>  			caa.cpu_acpi_proc_id = entry->madt_lapic.acpi_proc_id;
>  #ifdef MULTIPROCESSOR
>  			caa.cpu_func = &mp_cpu_funcs;
> +#ifdef __amd64__
> +			if (ISSET(cpu_sev_guestmode, SEV_STAT_ES_ENABLED))
> +				caa.cpu_func = &mp_sev_es_cpu_funcs;
> +#endif
>  #endif
>  #ifdef __i386__
>  			/*
> @@ -318,6 +322,10 @@ acpimadt_attach(struct device *parent, struct device *self, void *aux)
>  			caa.cpu_acpi_proc_id = entry->madt_x2apic.acpi_proc_uid;
>  #ifdef MULTIPROCESSOR
>  			caa.cpu_func = &mp_cpu_funcs;
> +#ifdef __amd64__
> +			if (ISSET(cpu_sev_guestmode, SEV_STAT_ES_ENABLED))
> +				caa.cpu_func = &mp_sev_es_cpu_funcs;
> +#endif
>  #endif
>  #ifdef __i386__
>  			/*
> 
>
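> 
> PS: for anyone not familiar with the GHCB MSR protocol that
> cpu_spinup_finish uses above, here is a rough C equivalent of the
> cpuid request sequence. This is a sketch only: ghcb_msr_cpuid() and
> the GHCB_MSR_CPUID_* constants are illustrative names, not part of
> the diff; wrmsr(), rdmsr() and vmgexit() are the existing kernel
> helpers.
> 
> 	#define GHCB_MSR_CPUID_REQ	0x004ULL
> 	#define GHCB_MSR_CPUID_RESP	0x005ULL
> 
> 	/* Ask the hypervisor for one cpuid register (0=EAX..3=EDX). */
> 	static int
> 	ghcb_msr_cpuid(uint32_t leaf, uint64_t reg, uint32_t *out)
> 	{
> 		uint64_t resp;
> 
> 		wrmsr(MSR_SEV_GHCB, GHCB_MSR_CPUID_REQ | (reg << 30) |
> 		    ((uint64_t)leaf << 32));
> 		vmgexit();
> 		resp = rdmsr(MSR_SEV_GHCB);
> 
> 		/* Low dword must echo the register index + code 0x005. */
> 		if ((uint32_t)resp != (GHCB_MSR_CPUID_RESP | (reg << 30)))
> 			return (1);
> 		*out = resp >> 32;
> 		return (0);
> 	}
> 
> The trampoline asks for cpuid leaf 1, register EBX, and takes bits
> 31:24 of the result as the initial APIC ID, i.e. something like
> ghcb_msr_cpuid(1, 1, &ebx) followed by apicid = ebx >> 24.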