From: Hans-Jörg Höxer Subject: vmd(8): Use 32-bit direct kernel launch for both amd64 and i386 To: Date: Wed, 20 Aug 2025 14:24:34 +0200 Hi, when booted by /boot (or EFI boot loaders) both amd64 and i386 kernels start in a 32-bit mode. Both kernels use 32-bit entry code. When launching a kernel directly (vmctl start -b <kernel>) vmd(8) configures a flat 64-bit register set as default register set. The GDT provides a 32-bit flat code segment. For the i386 kernel the default register set is reconfigured to 32-bit legacy mode; paging is enabled and uses 4 MB pages. This is different from i386 being booted by /boot. /boot launches the i386 kernel with paging disabled. The amd64 kernel uses the default register set, i.e. long mode is enabled in EFER. However, it uses the 32-bit code segment of the GDT. Thus the kernel is effectively running in 32-bit compatibility mode. This has implications when using SEV-ES as #VC traps are delivered by 64-bit rules. Booting an amd64 kernel on Linux/KVM the kernel is actually running in 32-bit legacy mode, thus #VC traps are delivered by 32-bit rules. Therefore, we have two #VC trap handlers for locore0, a 32-bit and a 64-bit one. To simplify this, I'd suggest actually starting both i386 and amd64 in 32-bit legacy mode with paging disabled. The latter is needed, as amd64 configures PAE (64-bit PTEs) in CR4 before enabling paging. When we are running in 32-bit legacy mode with paging enabled, we double fault when enabling PAE. All in all, with this diff the run time configuration is similar to what /boot provides for both amd64 and i386. In a later diff #VC trap handling in locore0 can be simplified. Note: When we have a native 64-bit entry for amd64 kernels, the removed code could be pulled from the attic again. The diff can be tested with amd64 and i386 ramdisk kernels like this: # vmctl start -c -b i386/bsd.rd myvm # vmctl start -c -b amd64/bsd.rd myvm Using a BIOS boot image (e.g. /etc/firmware/vmm-bios) is not affected by this change. 
What do you think? oks? Take care, HJ. ---------------------------------------------------------------------- diff --git a/usr.sbin/vmd/loadfile_elf.c b/usr.sbin/vmd/loadfile_elf.c index 5f67953fb50..015609087c8 100644 --- a/usr.sbin/vmd/loadfile_elf.c +++ b/usr.sbin/vmd/loadfile_elf.c @@ -110,15 +110,14 @@ union { } hdr; static void setsegment(struct mem_segment_descriptor *, uint32_t, - size_t, int, int, int, int, int); + size_t, int, int, int, int); static int elf32_exec(gzFile, Elf32_Ehdr *, u_long *, int); static int elf64_exec(gzFile, Elf64_Ehdr *, u_long *, int); static size_t create_bios_memmap(struct vm_create_params *, bios_memmap_t *); static uint32_t push_bootargs(bios_memmap_t *, size_t, bios_bootmac_t *); static size_t push_stack(uint32_t, uint32_t); static void push_gdt(void); -static void push_pt_32(void); -static void push_pt_64(void); +static void push_pt(void); static void marc4random_buf(paddr_t, int); static void mbzero(paddr_t, int); static void mbcopy(void *, paddr_t, int); @@ -126,8 +125,6 @@ static void mbcopy(void *, paddr_t, int); extern char *__progname; extern int vm_id; -uint64_t pg_crypt = 0; - /* * setsegment * @@ -148,7 +145,7 @@ uint64_t pg_crypt = 0; */ static void setsegment(struct mem_segment_descriptor *sd, uint32_t base, size_t limit, - int type, int dpl, int def32, int gran, int lm) + int type, int dpl, int def32, int gran) { sd->sd_lolimit = (int)limit; sd->sd_lobase = (int)base; @@ -157,7 +154,7 @@ setsegment(struct mem_segment_descriptor *sd, uint32_t base, size_t limit, sd->sd_p = 1; sd->sd_hilimit = (int)limit >> 16; sd->sd_avl = 0; - sd->sd_long = lm; + sd->sd_long = 0; sd->sd_def32 = def32; sd->sd_gran = gran; sd->sd_hibase = (int)base >> 24; @@ -185,27 +182,25 @@ push_gdt(void) * Create three segment descriptors: * * GDT[0] : null descriptor. "Created" via memset above. 
- * GDT[1] (selector @ 0x8): Executable segment (compat mode), for CS + * GDT[1] (selector @ 0x8): Executable segment, for CS * GDT[2] (selector @ 0x10): RW Data segment, for DS/ES/SS - * GDT[3] (selector @ 0x18): Executable segment (long mode), for CS */ - setsegment(&sd[1], 0, 0xffffffff, SDT_MEMERA, SEL_KPL, 1, 1, 0); - setsegment(&sd[2], 0, 0xffffffff, SDT_MEMRWA, SEL_KPL, 1, 1, 0); - setsegment(&sd[3], 0, 0xffffffff, SDT_MEMERA, SEL_KPL, 0, 1, 1); + setsegment(&sd[1], 0, 0xffffffff, SDT_MEMERA, SEL_KPL, 1, 1); + setsegment(&sd[2], 0, 0xffffffff, SDT_MEMRWA, SEL_KPL, 1, 1); write_mem(GDT_PAGE, gdtpage, PAGE_SIZE); sev_register_encryption(GDT_PAGE, PAGE_SIZE); } /* - * push_pt_32 + * push_pt * * Create an identity-mapped page directory hierarchy mapping the first * 4GB of physical memory. This is used during bootstrapping i386 VMs on * CPUs without unrestricted guest capability. */ static void -push_pt_32(void) +push_pt(void) { uint32_t ptes[1024], i; @@ -216,40 +211,6 @@ push_pt_32(void) write_mem(PML3_PAGE, ptes, PAGE_SIZE); } -/* - * push_pt_64 - * - * Create an identity-mapped page directory hierarchy mapping the first - * 1GB of physical memory. This is used during bootstrapping 64 bit VMs on - * CPUs without unrestricted guest capability. 
- */ -static void -push_pt_64(void) -{ - uint64_t ptes[512], i; - - /* PDPDE0 - first 1GB */ - memset(ptes, 0, sizeof(ptes)); - ptes[0] = pg_crypt | PG_V | PML3_PAGE; - write_mem(PML4_PAGE, ptes, PAGE_SIZE); - sev_register_encryption(PML4_PAGE, PAGE_SIZE); - - /* PDE0 - first 1GB */ - memset(ptes, 0, sizeof(ptes)); - ptes[0] = pg_crypt | PG_V | PG_RW | PG_u | PML2_PAGE; - write_mem(PML3_PAGE, ptes, PAGE_SIZE); - sev_register_encryption(PML3_PAGE, PAGE_SIZE); - - /* First 1GB (in 2MB pages) */ - memset(ptes, 0, sizeof(ptes)); - for (i = 0 ; i < 512; i++) { - ptes[i] = pg_crypt | PG_V | PG_RW | PG_u | PG_PS | - ((2048 * 1024) * i); - } - write_mem(PML2_PAGE, ptes, PAGE_SIZE); - sev_register_encryption(PML2_PAGE, PAGE_SIZE); -} - /* * loadfile_elf * @@ -271,7 +232,7 @@ int loadfile_elf(gzFile fp, struct vmd_vm *vm, struct vcpu_reg_state *vrs, unsigned int bootdevice) { - int r, is_i386 = 0; + int r; uint32_t bootargsz; size_t n, stacksize; u_long marks[MARK_MAX]; @@ -286,7 +247,6 @@ loadfile_elf(gzFile fp, struct vmd_vm *vm, struct vcpu_reg_state *vrs, if (memcmp(hdr.elf32.e_ident, ELFMAG, SELFMAG) == 0 && hdr.elf32.e_ident[EI_CLASS] == ELFCLASS32) { r = elf32_exec(fp, &hdr.elf32, marks, LOAD_ALL); - is_i386 = 1; } else if (memcmp(hdr.elf64.e_ident, ELFMAG, SELFMAG) == 0 && hdr.elf64.e_ident[EI_CLASS] == ELFCLASS64) { r = elf64_exec(fp, &hdr.elf64, marks, LOAD_ALL); @@ -298,25 +258,17 @@ loadfile_elf(gzFile fp, struct vmd_vm *vm, struct vcpu_reg_state *vrs, push_gdt(); - if (is_i386) { - push_pt_32(); - /* Reconfigure the default flat-64 register set for 32 bit */ - vrs->vrs_crs[VCPU_REGS_CR3] = PML3_PAGE; - vrs->vrs_crs[VCPU_REGS_CR4] = CR4_PSE; - vrs->vrs_msrs[VCPU_REGS_EFER] = 0ULL; - } - else { - if (vcp->vcp_sev) { - if (vcp->vcp_poscbit == 0) { - log_warnx("SEV enabled but no C-bit reported"); - return 1; - } - pg_crypt = (1ULL << vcp->vcp_poscbit); - log_debug("%s: poscbit %d pg_crypt 0x%016llx", - __func__, vcp->vcp_poscbit, pg_crypt); - } - push_pt_64(); - } 
+ push_pt(); + + /* + * As both amd64 and i386 kernels are launched in 32 bit + * protected mode with paging disabled reconfigure the default + * flat-64 register set. + */ + vrs->vrs_crs[VCPU_REGS_CR3] = PML3_PAGE; + vrs->vrs_crs[VCPU_REGS_CR4] = CR4_PSE; + vrs->vrs_msrs[VCPU_REGS_EFER] = 0ULL; + vrs->vrs_crs[VCPU_REGS_CR0] = CR0_ET | CR0_PE; if (bootdevice == VMBOOTDEV_NET) { bootmac = &bm;