Index | Thread | Search

From:
Hans-Jörg Höxer <hshoexer@genua.de>
Subject:
vmd(8): Use 32-bit direct kernel launch for both amd64 and i386
To:
<tech@openbsd.org>
Date:
Wed, 20 Aug 2025 14:24:34 +0200

Download raw body.

Thread
Hi,

When booted by /boot (or EFI boot loaders) both amd64 and i386 kernels
start in a 32-bit mode.  Both kernels use 32-bit entry code.

When launching a kernel directly (vmctl start -b <path>) vmd(8) configures
a flat 64-bit register set as default register set.  The GDT provides
a 32-bit flat code segment.

For the i386 kernel the default register set is reconfigured to 32-bit
legacy mode; paging is enabled and uses 4 MB pages.  This is different
from i386 being booted by /boot.  /boot launches the i386 kernel with
paging disabled.

The amd64 kernel uses the default register set, i.e. long mode is enabled
in EFER.  However, it uses the 32-bit code segment of the GDT.  Thus the
kernel is effectively running in 32-bit compatibility mode.

This has implications when using SEV-ES as #VC traps are delivered
by 64-bit rules.  When booting an amd64 kernel on Linux/KVM, the kernel
is actually running in 32-bit legacy mode, thus #VC traps are delivered
by 32-bit rules.  Therefore, we have two #VC trap handlers for locore0,
a 32-bit and a 64-bit one.

To simplify this, I'd suggest to actually start both i386 and amd64
in 32-bit legacy mode with paging disabled.  The latter is needed,
as amd64 configures PAE (64 bit PTEs) in CR4 before enabling paging.
When we are running in 32-bit legacy mode with paging enabled, we double
fault when enabling PAE.

All in all with this diff the run time configuration is similar to what
/boot provides for both amd64 and i386.

In a later diff #VC trap handling in locore0 can be simplified.

Note:  When we will have a native 64-bit entry for amd64 kernels, the
removed code could be pulled from the attic again.

The diff can be tested with amd64 and i386 ramdisk kernels like this:

# vmctl start -c -b i386/bsd.rd myvm
# vmctl start -c -b amd64/bsd.rd myvm

Using a BIOS boot image (e.g. /etc/firmware/vmm-bios) is not affected by
this change.

What do you think? oks?

Take care,
HJ.
----------------------------------------------------------------------
diff --git a/usr.sbin/vmd/loadfile_elf.c b/usr.sbin/vmd/loadfile_elf.c
index 5f67953fb50..015609087c8 100644
--- a/usr.sbin/vmd/loadfile_elf.c
+++ b/usr.sbin/vmd/loadfile_elf.c
@@ -110,15 +110,14 @@ union {
 } hdr;
 
 static void setsegment(struct mem_segment_descriptor *, uint32_t,
-    size_t, int, int, int, int, int);
+    size_t, int, int, int, int);
 static int elf32_exec(gzFile, Elf32_Ehdr *, u_long *, int);
 static int elf64_exec(gzFile, Elf64_Ehdr *, u_long *, int);
 static size_t create_bios_memmap(struct vm_create_params *, bios_memmap_t *);
 static uint32_t push_bootargs(bios_memmap_t *, size_t, bios_bootmac_t *);
 static size_t push_stack(uint32_t, uint32_t);
 static void push_gdt(void);
-static void push_pt_32(void);
-static void push_pt_64(void);
+static void push_pt(void);
 static void marc4random_buf(paddr_t, int);
 static void mbzero(paddr_t, int);
 static void mbcopy(void *, paddr_t, int);
@@ -126,8 +125,6 @@ static void mbcopy(void *, paddr_t, int);
 extern char *__progname;
 extern int vm_id;
 
-uint64_t pg_crypt = 0;
-
 /*
  * setsegment
  *
@@ -148,7 +145,7 @@ uint64_t pg_crypt = 0;
  */
 static void
 setsegment(struct mem_segment_descriptor *sd, uint32_t base, size_t limit,
-    int type, int dpl, int def32, int gran, int lm)
+    int type, int dpl, int def32, int gran)
 {
 	sd->sd_lolimit = (int)limit;
 	sd->sd_lobase = (int)base;
@@ -157,7 +154,7 @@ setsegment(struct mem_segment_descriptor *sd, uint32_t base, size_t limit,
 	sd->sd_p = 1;
 	sd->sd_hilimit = (int)limit >> 16;
 	sd->sd_avl = 0;
-	sd->sd_long = lm;
+	sd->sd_long = 0;
 	sd->sd_def32 = def32;
 	sd->sd_gran = gran;
 	sd->sd_hibase = (int)base >> 24;
@@ -185,27 +182,25 @@ push_gdt(void)
 	 * Create three segment descriptors:
 	 *
 	 * GDT[0] : null descriptor. "Created" via memset above.
-	 * GDT[1] (selector @ 0x8): Executable segment (compat mode), for CS
+	 * GDT[1] (selector @ 0x8): Executable segment, for CS
 	 * GDT[2] (selector @ 0x10): RW Data segment, for DS/ES/SS
-	 * GDT[3] (selector @ 0x18): Executable segment (long mode), for CS
 	 */
-	setsegment(&sd[1], 0, 0xffffffff, SDT_MEMERA, SEL_KPL, 1, 1, 0);
-	setsegment(&sd[2], 0, 0xffffffff, SDT_MEMRWA, SEL_KPL, 1, 1, 0);
-	setsegment(&sd[3], 0, 0xffffffff, SDT_MEMERA, SEL_KPL, 0, 1, 1);
+	setsegment(&sd[1], 0, 0xffffffff, SDT_MEMERA, SEL_KPL, 1, 1);
+	setsegment(&sd[2], 0, 0xffffffff, SDT_MEMRWA, SEL_KPL, 1, 1);
 
 	write_mem(GDT_PAGE, gdtpage, PAGE_SIZE);
 	sev_register_encryption(GDT_PAGE, PAGE_SIZE);
 }
 
 /*
- * push_pt_32
+ * push_pt
  *
  * Create an identity-mapped page directory hierarchy mapping the first
  * 4GB of physical memory. This is used during bootstrapping i386 VMs on
  * CPUs without unrestricted guest capability.
  */
 static void
-push_pt_32(void)
+push_pt(void)
 {
 	uint32_t ptes[1024], i;
 
@@ -216,40 +211,6 @@ push_pt_32(void)
 	write_mem(PML3_PAGE, ptes, PAGE_SIZE);
 }
 
-/*
- * push_pt_64
- *
- * Create an identity-mapped page directory hierarchy mapping the first
- * 1GB of physical memory. This is used during bootstrapping 64 bit VMs on
- * CPUs without unrestricted guest capability.
- */
-static void
-push_pt_64(void)
-{
-	uint64_t ptes[512], i;
-
-	/* PDPDE0 - first 1GB */
-	memset(ptes, 0, sizeof(ptes));
-	ptes[0] = pg_crypt | PG_V | PML3_PAGE;
-	write_mem(PML4_PAGE, ptes, PAGE_SIZE);
-	sev_register_encryption(PML4_PAGE, PAGE_SIZE);
-
-	/* PDE0 - first 1GB */
-	memset(ptes, 0, sizeof(ptes));
-	ptes[0] = pg_crypt | PG_V | PG_RW | PG_u | PML2_PAGE;
-	write_mem(PML3_PAGE, ptes, PAGE_SIZE);
-	sev_register_encryption(PML3_PAGE, PAGE_SIZE);
-
-	/* First 1GB (in 2MB pages) */
-	memset(ptes, 0, sizeof(ptes));
-	for (i = 0 ; i < 512; i++) {
-		ptes[i] = pg_crypt | PG_V | PG_RW | PG_u | PG_PS |
-		    ((2048 * 1024) * i);
-	}
-	write_mem(PML2_PAGE, ptes, PAGE_SIZE);
-	sev_register_encryption(PML2_PAGE, PAGE_SIZE);
-}
-
 /*
  * loadfile_elf
  *
@@ -271,7 +232,7 @@ int
 loadfile_elf(gzFile fp, struct vmd_vm *vm, struct vcpu_reg_state *vrs,
     unsigned int bootdevice)
 {
-	int r, is_i386 = 0;
+	int r;
 	uint32_t bootargsz;
 	size_t n, stacksize;
 	u_long marks[MARK_MAX];
@@ -286,7 +247,6 @@ loadfile_elf(gzFile fp, struct vmd_vm *vm, struct vcpu_reg_state *vrs,
 	if (memcmp(hdr.elf32.e_ident, ELFMAG, SELFMAG) == 0 &&
 	    hdr.elf32.e_ident[EI_CLASS] == ELFCLASS32) {
 		r = elf32_exec(fp, &hdr.elf32, marks, LOAD_ALL);
-		is_i386 = 1;
 	} else if (memcmp(hdr.elf64.e_ident, ELFMAG, SELFMAG) == 0 &&
 	    hdr.elf64.e_ident[EI_CLASS] == ELFCLASS64) {
 		r = elf64_exec(fp, &hdr.elf64, marks, LOAD_ALL);
@@ -298,25 +258,17 @@ loadfile_elf(gzFile fp, struct vmd_vm *vm, struct vcpu_reg_state *vrs,
 
 	push_gdt();
 
-	if (is_i386) {
-		push_pt_32();
-		/* Reconfigure the default flat-64 register set for 32 bit */
-		vrs->vrs_crs[VCPU_REGS_CR3] = PML3_PAGE;
-		vrs->vrs_crs[VCPU_REGS_CR4] = CR4_PSE;
-		vrs->vrs_msrs[VCPU_REGS_EFER] = 0ULL;
-	}
-	else {
-		if (vcp->vcp_sev) {
-			if (vcp->vcp_poscbit == 0) {
-				log_warnx("SEV enabled but no C-bit reported");
-				return 1;
-			}
-			pg_crypt = (1ULL << vcp->vcp_poscbit);
-			log_debug("%s: poscbit %d pg_crypt 0x%016llx",
-			    __func__, vcp->vcp_poscbit, pg_crypt);
-		}
-		push_pt_64();
-	}
+	push_pt();
+
+	/*
+	 * As both amd64 and i386 kernels are launched in 32 bit
+	 * protected mode with paging disabled reconfigure the default
+	 * flat-64 register set.
+	 */
+	vrs->vrs_crs[VCPU_REGS_CR3] = PML3_PAGE;
+	vrs->vrs_crs[VCPU_REGS_CR4] = CR4_PSE;
+	vrs->vrs_msrs[VCPU_REGS_EFER] = 0ULL;
+	vrs->vrs_crs[VCPU_REGS_CR0] = CR0_ET | CR0_PE;
 
 	if (bootdevice == VMBOOTDEV_NET) {
 		bootmac = &bm;