vmd(8): Use 32-bit direct kernel launch for both amd64 and i386
Hi,
when booted by /boot (or the EFI boot loaders), both the amd64 and the
i386 kernel start in 32-bit mode. Both kernels use 32-bit entry code.
When launching a kernel directly (vmctl start -b <path>), vmd(8)
configures a flat 64-bit register set as the default register set. The
GDT provides a 32-bit flat code segment.
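
To make that concrete, here is a minimal standalone sketch (not vmd
code) that encodes flat 4 GB 32-bit code and data descriptors the way
setsegment() in loadfile_elf.c does:

    #include <stdint.h>
    #include <stdio.h>

    /*
     * Encode an x86 segment descriptor; cf. setsegment(). type
     * includes the S bit (e.g. SDT_MEMERA = 0x1b, SDT_MEMRWA = 0x13).
     */
    static uint64_t
    mkseg(uint32_t base, uint32_t limit, int type, int dpl, int def32,
        int gran)
    {
    	uint64_t d = 0;

    	d |= limit & 0xffff;				/* limit 15:0 */
    	d |= (uint64_t)(base & 0xffffff) << 16;		/* base 23:0 */
    	d |= (uint64_t)(type & 0x1f) << 40;		/* type + S */
    	d |= (uint64_t)(dpl & 0x3) << 45;		/* priv level */
    	d |= 1ULL << 47;				/* present */
    	d |= (uint64_t)((limit >> 16) & 0xf) << 48;	/* limit 19:16 */
    	/* bit 53 (L) stays 0: 32-bit segment, not long mode */
    	d |= (uint64_t)(def32 & 0x1) << 54;		/* D/B: 32-bit */
    	d |= (uint64_t)(gran & 0x1) << 55;		/* 4K granularity */
    	d |= (uint64_t)((base >> 24) & 0xff) << 56;	/* base 31:24 */
    	return d;
    }

    int
    main(void)
    {
    	/* flat 4 GB code and data, as in push_gdt() */
    	printf("CS 0x%016llx\n",
    	    (unsigned long long)mkseg(0, 0xfffff, 0x1b, 0, 1, 1));
    	printf("DS 0x%016llx\n",
    	    (unsigned long long)mkseg(0, 0xfffff, 0x13, 0, 1, 1));
    	return 0;
    }

This prints the classic flat descriptors 0x00cf9b000000ffff and
0x00cf93000000ffff.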
For the i386 kernel the default register set is reconfigured to 32-bit
legacy mode; paging is enabled and uses 4 MB pages. This differs from
an i386 kernel booted by /boot: /boot launches the i386 kernel with
paging disabled.
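
For reference, this i386-only reconfiguration is the branch removed by
the diff below:

    /* Reconfigure the default flat-64 register set for 32 bit */
    vrs->vrs_crs[VCPU_REGS_CR3] = PML3_PAGE;	/* identity-mapped PD */
    vrs->vrs_crs[VCPU_REGS_CR4] = CR4_PSE;	/* enable 4 MB pages */
    vrs->vrs_msrs[VCPU_REGS_EFER] = 0ULL;	/* long mode off */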
The amd64 kernel uses the default register set, i.e. long mode is
enabled in EFER. However, it uses the 32-bit code segment of the GDT.
Thus the kernel is effectively running in 32-bit compatibility mode.
This has implications when using SEV-ES, as #VC traps are delivered by
64-bit rules in this mode. When booting an amd64 kernel on Linux/KVM,
the kernel actually runs in 32-bit legacy mode, so there #VC traps are
delivered by 32-bit rules. Therefore, we have two #VC trap handlers
for locore0, a 32-bit and a 64-bit one.
To simplify this, I'd suggest actually starting both i386 and amd64 in
32-bit legacy mode with paging disabled. The latter is needed because
amd64 configures PAE (64-bit PTEs) in CR4 before enabling paging: when
running in 32-bit legacy mode with paging already enabled, we double
fault when enabling PAE.
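
Roughly, the amd64 entry sequence needs this ordering, which only works
if the VM starts with CR0_PG clear (a C sketch of the constraint, not
the actual locore0 assembly; initial_pml4_pa is a hypothetical symbol):

    lcr4(rcr4() | CR4_PAE);	/* switch to 64-bit PTE format; with
				 * paging already on, this is where we
				 * double fault */
    wrmsr(MSR_EFER, rdmsr(MSR_EFER) | EFER_LME);
    lcr3(initial_pml4_pa);	/* physical address of the boot PML4 */
    lcr0(rcr0() | CR0_PG);	/* only now enable paging */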
All in all, with this diff the run-time configuration is similar to
what /boot provides, for both amd64 and i386.
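
The resulting common start state, quoted from the new code in the diff
below with the intent of each register annotated:

    vrs->vrs_crs[VCPU_REGS_CR3] = PML3_PAGE;	/* for CPUs without
						 * unrestricted guest */
    vrs->vrs_crs[VCPU_REGS_CR4] = CR4_PSE;	/* 4 MB pages, if used */
    vrs->vrs_msrs[VCPU_REGS_EFER] = 0ULL;	/* no long mode */
    vrs->vrs_crs[VCPU_REGS_CR0] = CR0_ET | CR0_PE;	/* protected mode,
						 * paging disabled */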
In a later diff #VC trap handling in locore0 can be simplified.
Note: Once we have a native 64-bit entry for amd64 kernels, the
removed code can be pulled from the attic again.
The diff can be tested with amd64 and i386 ramdisk kernels like this:
# vmctl start -c -b i386/bsd.rd myvm
# vmctl start -c -b amd64/bsd.rd myvm
Using a BIOS boot image (e.g. /etc/firmware/vmm-bios) is not affected
by this change.
What do you think? oks?
Take care,
HJ.
----------------------------------------------------------------------
diff --git a/usr.sbin/vmd/loadfile_elf.c b/usr.sbin/vmd/loadfile_elf.c
index 5f67953fb50..015609087c8 100644
--- a/usr.sbin/vmd/loadfile_elf.c
+++ b/usr.sbin/vmd/loadfile_elf.c
@@ -110,15 +110,14 @@ union {
} hdr;
static void setsegment(struct mem_segment_descriptor *, uint32_t,
- size_t, int, int, int, int, int);
+ size_t, int, int, int, int);
static int elf32_exec(gzFile, Elf32_Ehdr *, u_long *, int);
static int elf64_exec(gzFile, Elf64_Ehdr *, u_long *, int);
static size_t create_bios_memmap(struct vm_create_params *, bios_memmap_t *);
static uint32_t push_bootargs(bios_memmap_t *, size_t, bios_bootmac_t *);
static size_t push_stack(uint32_t, uint32_t);
static void push_gdt(void);
-static void push_pt_32(void);
-static void push_pt_64(void);
+static void push_pt(void);
static void marc4random_buf(paddr_t, int);
static void mbzero(paddr_t, int);
static void mbcopy(void *, paddr_t, int);
@@ -126,8 +125,6 @@ static void mbcopy(void *, paddr_t, int);
extern char *__progname;
extern int vm_id;
-uint64_t pg_crypt = 0;
-
/*
* setsegment
*
@@ -148,7 +145,7 @@ uint64_t pg_crypt = 0;
*/
static void
setsegment(struct mem_segment_descriptor *sd, uint32_t base, size_t limit,
- int type, int dpl, int def32, int gran, int lm)
+ int type, int dpl, int def32, int gran)
{
sd->sd_lolimit = (int)limit;
sd->sd_lobase = (int)base;
@@ -157,7 +154,7 @@ setsegment(struct mem_segment_descriptor *sd, uint32_t base, size_t limit,
sd->sd_p = 1;
sd->sd_hilimit = (int)limit >> 16;
sd->sd_avl = 0;
- sd->sd_long = lm;
+ sd->sd_long = 0;
sd->sd_def32 = def32;
sd->sd_gran = gran;
sd->sd_hibase = (int)base >> 24;
@@ -185,27 +182,25 @@ push_gdt(void)
* Create three segment descriptors:
*
* GDT[0] : null descriptor. "Created" via memset above.
- * GDT[1] (selector @ 0x8): Executable segment (compat mode), for CS
+ * GDT[1] (selector @ 0x8): Executable segment, for CS
* GDT[2] (selector @ 0x10): RW Data segment, for DS/ES/SS
- * GDT[3] (selector @ 0x18): Executable segment (long mode), for CS
*/
- setsegment(&sd[1], 0, 0xffffffff, SDT_MEMERA, SEL_KPL, 1, 1, 0);
- setsegment(&sd[2], 0, 0xffffffff, SDT_MEMRWA, SEL_KPL, 1, 1, 0);
- setsegment(&sd[3], 0, 0xffffffff, SDT_MEMERA, SEL_KPL, 0, 1, 1);
+ setsegment(&sd[1], 0, 0xffffffff, SDT_MEMERA, SEL_KPL, 1, 1);
+ setsegment(&sd[2], 0, 0xffffffff, SDT_MEMRWA, SEL_KPL, 1, 1);
write_mem(GDT_PAGE, gdtpage, PAGE_SIZE);
sev_register_encryption(GDT_PAGE, PAGE_SIZE);
}
/*
- * push_pt_32
+ * push_pt
*
* Create an identity-mapped page directory hierarchy mapping the first
* 4GB of physical memory. This is used during bootstrapping i386 VMs on
* CPUs without unrestricted guest capability.
*/
static void
-push_pt_32(void)
+push_pt(void)
{
uint32_t ptes[1024], i;
@@ -216,40 +211,6 @@ push_pt_32(void)
write_mem(PML3_PAGE, ptes, PAGE_SIZE);
}
-/*
- * push_pt_64
- *
- * Create an identity-mapped page directory hierarchy mapping the first
- * 1GB of physical memory. This is used during bootstrapping 64 bit VMs on
- * CPUs without unrestricted guest capability.
- */
-static void
-push_pt_64(void)
-{
- uint64_t ptes[512], i;
-
- /* PDPDE0 - first 1GB */
- memset(ptes, 0, sizeof(ptes));
- ptes[0] = pg_crypt | PG_V | PML3_PAGE;
- write_mem(PML4_PAGE, ptes, PAGE_SIZE);
- sev_register_encryption(PML4_PAGE, PAGE_SIZE);
-
- /* PDE0 - first 1GB */
- memset(ptes, 0, sizeof(ptes));
- ptes[0] = pg_crypt | PG_V | PG_RW | PG_u | PML2_PAGE;
- write_mem(PML3_PAGE, ptes, PAGE_SIZE);
- sev_register_encryption(PML3_PAGE, PAGE_SIZE);
-
- /* First 1GB (in 2MB pages) */
- memset(ptes, 0, sizeof(ptes));
- for (i = 0 ; i < 512; i++) {
- ptes[i] = pg_crypt | PG_V | PG_RW | PG_u | PG_PS |
- ((2048 * 1024) * i);
- }
- write_mem(PML2_PAGE, ptes, PAGE_SIZE);
- sev_register_encryption(PML2_PAGE, PAGE_SIZE);
-}
-
/*
* loadfile_elf
*
@@ -271,7 +232,7 @@ int
loadfile_elf(gzFile fp, struct vmd_vm *vm, struct vcpu_reg_state *vrs,
unsigned int bootdevice)
{
- int r, is_i386 = 0;
+ int r;
uint32_t bootargsz;
size_t n, stacksize;
u_long marks[MARK_MAX];
@@ -286,7 +247,6 @@ loadfile_elf(gzFile fp, struct vmd_vm *vm, struct vcpu_reg_state *vrs,
if (memcmp(hdr.elf32.e_ident, ELFMAG, SELFMAG) == 0 &&
hdr.elf32.e_ident[EI_CLASS] == ELFCLASS32) {
r = elf32_exec(fp, &hdr.elf32, marks, LOAD_ALL);
- is_i386 = 1;
} else if (memcmp(hdr.elf64.e_ident, ELFMAG, SELFMAG) == 0 &&
hdr.elf64.e_ident[EI_CLASS] == ELFCLASS64) {
r = elf64_exec(fp, &hdr.elf64, marks, LOAD_ALL);
@@ -298,25 +258,17 @@ loadfile_elf(gzFile fp, struct vmd_vm *vm, struct vcpu_reg_state *vrs,
push_gdt();
- if (is_i386) {
- push_pt_32();
- /* Reconfigure the default flat-64 register set for 32 bit */
- vrs->vrs_crs[VCPU_REGS_CR3] = PML3_PAGE;
- vrs->vrs_crs[VCPU_REGS_CR4] = CR4_PSE;
- vrs->vrs_msrs[VCPU_REGS_EFER] = 0ULL;
- }
- else {
- if (vcp->vcp_sev) {
- if (vcp->vcp_poscbit == 0) {
- log_warnx("SEV enabled but no C-bit reported");
- return 1;
- }
- pg_crypt = (1ULL << vcp->vcp_poscbit);
- log_debug("%s: poscbit %d pg_crypt 0x%016llx",
- __func__, vcp->vcp_poscbit, pg_crypt);
- }
- push_pt_64();
- }
+ push_pt();
+
+	/*
+	 * As both amd64 and i386 kernels are launched in 32 bit
+	 * protected mode with paging disabled, reconfigure the
+	 * default flat-64 register set.
+	 */
+ vrs->vrs_crs[VCPU_REGS_CR3] = PML3_PAGE;
+ vrs->vrs_crs[VCPU_REGS_CR4] = CR4_PSE;
+ vrs->vrs_msrs[VCPU_REGS_EFER] = 0ULL;
+ vrs->vrs_crs[VCPU_REGS_CR0] = CR0_ET | CR0_PE;
if (bootdevice == VMBOOTDEV_NET) {
bootmac = &bm;