From: Dave Voutila Subject: Re: vmd(8): Use 32-bit direct kernel launch for both amd64 and i386 To: tech@openbsd.org Date: Wed, 20 Aug 2025 10:44:40 -0400 Hans-Jörg Höxer writes: > Hi, > > when booted by /boot (or EFI boot loaders) both amd64 and i386 kernels > start in a 32-bit mode. Both kernels use 32-bit entry code. > > When launching a kernel directly (vmctl start -b ) vmd(8) configures > a flat 64-bit register set as default register set. The GDT provides > a 32-bit flat code segment. > > For the i386 kernel the default register set is reconfigured to 32-bit > legacy mode; paging is enabled and uses 4 MB pages. This is different > to i386 being booted by /boot. /boot launches the i386 kernel with > paging disabled. > > The amd64 kernel uses the default register set, i.e. long mode is enabled > in EFER. However, it uses the 32-bit code segment of the GDT. Thus the > kernel is effectively running in 32-bit compatibility mode. > > This has implications when using SEV-ES as #VC traps are delivered > by 64-bit rules. Booting an amd64 kernel on Linux/KVM the kernel is > actually running in 32-bit legacy mode, thus #VC traps are delivered > by 32-bit rules. Therefore, we have two #VC trap handlers for locore0, > a 32-bit and a 64-bit one. > > To simplify this, I'd suggest to actually start both i386 and amd64 > in 32-bit legacy mode with paging disabled. The latter is needed, > as amd64 configures PAE (64 bit PTEs) in CR4 before enabling paging. > When we are running in 32-bit legacy mode with paging enabled, we double > fault when enabling PAE. > > All in all with this diff the run time configuration is similar to what > /boot provides for both amd64 and i386. > > In a later diff #VC trap handling in locore0 can be simplified. > > Note: When we will have a native 64-bit entry for amd64 kernels, the > removed code could be pulled from the attic again. 
> > The diff can be tested with amd64 and i386 ramdisk kernels like this: > > # vmctl start -c -b i386/bsd.rd myvm > # vmctl start -c -b amd64/bsd.rd myvm > > Using a BIOS boot image (eg. /etc/firmware/vmm-bios) is not affected by > this change. > > What do you think? oks? I'm not opposed and this makes sense. Given it's end of August it's best we make this change as soon as possible to give enough time in snapshots before a release. I'll take a more detailed look at the diff this week, but a small comment below. > > Take care, > HJ. > ---------------------------------------------------------------------- > diff --git a/usr.sbin/vmd/loadfile_elf.c b/usr.sbin/vmd/loadfile_elf.c > index 5f67953fb50..015609087c8 100644 > --- a/usr.sbin/vmd/loadfile_elf.c > +++ b/usr.sbin/vmd/loadfile_elf.c > @@ -110,15 +110,14 @@ union { > } hdr; > > static void setsegment(struct mem_segment_descriptor *, uint32_t, > - size_t, int, int, int, int, int); > + size_t, int, int, int, int); > static int elf32_exec(gzFile, Elf32_Ehdr *, u_long *, int); > static int elf64_exec(gzFile, Elf64_Ehdr *, u_long *, int); > static size_t create_bios_memmap(struct vm_create_params *, bios_memmap_t *); > static uint32_t push_bootargs(bios_memmap_t *, size_t, bios_bootmac_t *); > static size_t push_stack(uint32_t, uint32_t); > static void push_gdt(void); > -static void push_pt_32(void); > -static void push_pt_64(void); > +static void push_pt(void); If we expect we'll bring back launching in 64-bit mode, maybe to cut down on changes keep push_pt_32 named as it is for now. Makes this diff smaller and if we bring back 64-bit support won't require another rename. 
> static void marc4random_buf(paddr_t, int); > static void mbzero(paddr_t, int); > static void mbcopy(void *, paddr_t, int); > @@ -126,8 +125,6 @@ static void mbcopy(void *, paddr_t, int); > extern char *__progname; > extern int vm_id; > > -uint64_t pg_crypt = 0; > - > /* > * setsegment > * > @@ -148,7 +145,7 @@ uint64_t pg_crypt = 0; > */ > static void > setsegment(struct mem_segment_descriptor *sd, uint32_t base, size_t limit, > - int type, int dpl, int def32, int gran, int lm) > + int type, int dpl, int def32, int gran) > { > sd->sd_lolimit = (int)limit; > sd->sd_lobase = (int)base; > @@ -157,7 +154,7 @@ setsegment(struct mem_segment_descriptor *sd, uint32_t base, size_t limit, > sd->sd_p = 1; > sd->sd_hilimit = (int)limit >> 16; > sd->sd_avl = 0; > - sd->sd_long = lm; > + sd->sd_long = 0; > sd->sd_def32 = def32; > sd->sd_gran = gran; > sd->sd_hibase = (int)base >> 24; > @@ -185,27 +182,25 @@ push_gdt(void) > * Create three segment descriptors: > * > * GDT[0] : null descriptor. "Created" via memset above. > - * GDT[1] (selector @ 0x8): Executable segment (compat mode), for CS > + * GDT[1] (selector @ 0x8): Executable segment, for CS > * GDT[2] (selector @ 0x10): RW Data segment, for DS/ES/SS > - * GDT[3] (selector @ 0x18): Executable segment (long mode), for CS > */ > - setsegment(&sd[1], 0, 0xffffffff, SDT_MEMERA, SEL_KPL, 1, 1, 0); > - setsegment(&sd[2], 0, 0xffffffff, SDT_MEMRWA, SEL_KPL, 1, 1, 0); > - setsegment(&sd[3], 0, 0xffffffff, SDT_MEMERA, SEL_KPL, 0, 1, 1); > + setsegment(&sd[1], 0, 0xffffffff, SDT_MEMERA, SEL_KPL, 1, 1); > + setsegment(&sd[2], 0, 0xffffffff, SDT_MEMRWA, SEL_KPL, 1, 1); > > write_mem(GDT_PAGE, gdtpage, PAGE_SIZE); > sev_register_encryption(GDT_PAGE, PAGE_SIZE); > } > > /* > - * push_pt_32 > + * push_pt > * > * Create an identity-mapped page directory hierarchy mapping the first > * 4GB of physical memory. This is used during bootstrapping i386 VMs on > * CPUs without unrestricted guest capability. 
> */ > static void > -push_pt_32(void) > +push_pt(void) > { > uint32_t ptes[1024], i; > > @@ -216,40 +211,6 @@ push_pt_32(void) > write_mem(PML3_PAGE, ptes, PAGE_SIZE); > } > > -/* > - * push_pt_64 > - * > - * Create an identity-mapped page directory hierarchy mapping the first > - * 1GB of physical memory. This is used during bootstrapping 64 bit VMs on > - * CPUs without unrestricted guest capability. > - */ > -static void > -push_pt_64(void) > -{ > - uint64_t ptes[512], i; > - > - /* PDPDE0 - first 1GB */ > - memset(ptes, 0, sizeof(ptes)); > - ptes[0] = pg_crypt | PG_V | PML3_PAGE; > - write_mem(PML4_PAGE, ptes, PAGE_SIZE); > - sev_register_encryption(PML4_PAGE, PAGE_SIZE); > - > - /* PDE0 - first 1GB */ > - memset(ptes, 0, sizeof(ptes)); > - ptes[0] = pg_crypt | PG_V | PG_RW | PG_u | PML2_PAGE; > - write_mem(PML3_PAGE, ptes, PAGE_SIZE); > - sev_register_encryption(PML3_PAGE, PAGE_SIZE); > - > - /* First 1GB (in 2MB pages) */ > - memset(ptes, 0, sizeof(ptes)); > - for (i = 0 ; i < 512; i++) { > - ptes[i] = pg_crypt | PG_V | PG_RW | PG_u | PG_PS | > - ((2048 * 1024) * i); > - } > - write_mem(PML2_PAGE, ptes, PAGE_SIZE); > - sev_register_encryption(PML2_PAGE, PAGE_SIZE); > -} > - > /* > * loadfile_elf > * > @@ -271,7 +232,7 @@ int > loadfile_elf(gzFile fp, struct vmd_vm *vm, struct vcpu_reg_state *vrs, > unsigned int bootdevice) > { > - int r, is_i386 = 0; > + int r; > uint32_t bootargsz; > size_t n, stacksize; > u_long marks[MARK_MAX]; > @@ -286,7 +247,6 @@ loadfile_elf(gzFile fp, struct vmd_vm *vm, struct vcpu_reg_state *vrs, > if (memcmp(hdr.elf32.e_ident, ELFMAG, SELFMAG) == 0 && > hdr.elf32.e_ident[EI_CLASS] == ELFCLASS32) { > r = elf32_exec(fp, &hdr.elf32, marks, LOAD_ALL); > - is_i386 = 1; > } else if (memcmp(hdr.elf64.e_ident, ELFMAG, SELFMAG) == 0 && > hdr.elf64.e_ident[EI_CLASS] == ELFCLASS64) { > r = elf64_exec(fp, &hdr.elf64, marks, LOAD_ALL); > @@ -298,25 +258,17 @@ loadfile_elf(gzFile fp, struct vmd_vm *vm, struct vcpu_reg_state *vrs, > > push_gdt(); 
> > - if (is_i386) { > - push_pt_32(); > - /* Reconfigure the default flat-64 register set for 32 bit */ > - vrs->vrs_crs[VCPU_REGS_CR3] = PML3_PAGE; > - vrs->vrs_crs[VCPU_REGS_CR4] = CR4_PSE; > - vrs->vrs_msrs[VCPU_REGS_EFER] = 0ULL; > - } > - else { > - if (vcp->vcp_sev) { > - if (vcp->vcp_poscbit == 0) { > - log_warnx("SEV enabled but no C-bit reported"); > - return 1; > - } > - pg_crypt = (1ULL << vcp->vcp_poscbit); > - log_debug("%s: poscbit %d pg_crypt 0x%016llx", > - __func__, vcp->vcp_poscbit, pg_crypt); > - } > - push_pt_64(); > - } > + push_pt(); > + > + /* > + * As both amd64 and i386 kernels are launched in 32 bit > + * protected mode with paging disabled reconfigure the default > + * flat-64 register set. > + */ > + vrs->vrs_crs[VCPU_REGS_CR3] = PML3_PAGE; > + vrs->vrs_crs[VCPU_REGS_CR4] = CR4_PSE; > + vrs->vrs_msrs[VCPU_REGS_EFER] = 0ULL; > + vrs->vrs_crs[VCPU_REGS_CR0] = CR0_ET | CR0_PE; > > if (bootdevice == VMBOOTDEV_NET) { > bootmac = &bm;