From: Alexander Bluhm
Subject: Re: Reworking VMM's nested paging & guest memory (de-vmspace-ification)
To: Dave Voutila
Cc: tech@openbsd.org, mlarkin@openbsd.org
Date: Wed, 23 Apr 2025 23:33:44 +0200

On Wed, Apr 16, 2025 at 09:48:29PM -0400, Dave Voutila wrote:
> Some positive test reports and so far it looks like bluhm's issue was
> hardware related, but would still like confirmation that this *works*
> with SEV mode enabled on a guest vm.

I have fixed my BIOS settings and can run SEV again.  Diff works on
AMD and Intel.  AMD-SEV also tested.

cpu0: AMD EPYC 73F3 16-Core Processor, 3500.00 MHz, 19-01-01, patch 0a0011d5
cpu0: Intel(R) Xeon(R) CPU E5-2620 v3 @ 2.40GHz, 2600.09 MHz, 06-3f-02, patch 00000049

bluhm

> diff refs/heads/master refs/heads/vmm-uobj-v5
> commit - 4c8bb646499097485d416c4552a733b35394749d
> commit + fd3509d9f67be2e036257c8829605d6fdeb1de6d
> blob - e3205f48eedda8f4158d9a8d42e6164534921844
> blob + e58730ec23702850524db522dd82d750589d4c18
> --- sys/arch/amd64/amd64/vmm_machdep.c
> +++ sys/arch/amd64/amd64/vmm_machdep.c
> @@ -115,6 +115,7 @@ int vmm_inject_db(struct vcpu *);
>  void vmx_handle_intr(struct vcpu *);
>  void vmx_handle_misc_enable_msr(struct vcpu *);
>  int vmm_get_guest_memtype(struct vm *, paddr_t);
> +vaddr_t vmm_translate_gpa(struct vm *, paddr_t);
>  int vmx_get_guest_faulttype(void);
>  int svm_get_guest_faulttype(struct vmcb *);
>  int vmx_get_exit_qualification(uint64_t *);
> @@ -905,54 +906,19 @@ vmx_remote_vmclear(struct cpu_info *ci, struct vcpu *v
>  int
>  vm_impl_init(struct vm *vm, struct proc *p)
>  {
> -	int i, mode, ret;
> -	vaddr_t mingpa, maxgpa;
> -	struct vm_mem_range *vmr;
> -
>  	/* If not EPT or RVI, nothing to do here */
>  	switch (vmm_softc->mode) {
>  	case VMM_MODE_EPT:
> -		mode = PMAP_TYPE_EPT;
> +		pmap_convert(vm->vm_pmap, PMAP_TYPE_EPT);
>  		break;
>  	case VMM_MODE_RVI:
> -		mode = PMAP_TYPE_RVI;
> +		pmap_convert(vm->vm_pmap, PMAP_TYPE_RVI);
>  		break;
>  	default:
>  		printf("%s: invalid vmm mode %d\n", __func__, vmm_softc->mode);
>  		return (EINVAL);
>  	}
> 
> -	vmr = &vm->vm_memranges[0];
> -	mingpa = vmr->vmr_gpa;
> -	vmr = &vm->vm_memranges[vm->vm_nmemranges - 1];
> -	maxgpa = vmr->vmr_gpa + vmr->vmr_size;
> -
> -	/*
> -	 * uvmspace_alloc (currently) always returns a valid vmspace
> -	 */
> -	vm->vm_vmspace = uvmspace_alloc(mingpa, maxgpa, TRUE, FALSE);
> -	vm->vm_map = &vm->vm_vmspace->vm_map;
> -
> -	/* Map the new map with an anon */
> -	DPRINTF("%s: created vm_map @ %p\n", __func__, vm->vm_map);
> -	for (i = 0; i < vm->vm_nmemranges; i++) {
> -		vmr = &vm->vm_memranges[i];
> -		ret = uvm_share(vm->vm_map, vmr->vmr_gpa,
> -		    PROT_READ | PROT_WRITE | PROT_EXEC,
> -		    &p->p_vmspace->vm_map, vmr->vmr_va, vmr->vmr_size);
> -		if (ret) {
> -			printf("%s: uvm_share failed (%d)\n", __func__, ret);
> -			/* uvmspace_free calls pmap_destroy for us */
> -			KERNEL_LOCK();
> -			uvmspace_free(vm->vm_vmspace);
> -			vm->vm_vmspace = NULL;
> -			KERNEL_UNLOCK();
> -			return (ENOMEM);
> -		}
> -	}
> -
> -	pmap_convert(vm->vm_map->pmap, mode);
> -
>  	return (0);
>  }
> 
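
If I read the diff correctly, the nested pmap now lives its whole life
outside of UVM: vm_create() allocates it, vm_impl_init() only picks the
pmap type, and the vcpu setup code takes the nested paging root straight
from it.  Roughly (a condensed sketch, not the literal code):

	vm->vm_pmap = pmap_create();			/* vm_create() */
	pmap_convert(vm->vm_pmap, PMAP_TYPE_EPT);	/* vm_impl_init() */
	...
	vmcb->v_n_cr3 = vcpu->vc_parent->vm_pmap->pm_pdirpa;	/* SVM nCR3 */
	eptp = vcpu->vc_parent->vm_pmap->pm_pdirpa;		/* VMX EPTP */
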
> @@ -1666,7 +1632,7 @@ vcpu_reset_regs_svm(struct vcpu *vcpu, struct vcpu_reg
> 
>  	/* NPT */
>  	vmcb->v_np_enable = SVM_ENABLE_NP;
> -	vmcb->v_n_cr3 = vcpu->vc_parent->vm_map->pmap->pm_pdirpa;
> +	vmcb->v_n_cr3 = vcpu->vc_parent->vm_pmap->pm_pdirpa;
> 
>  	/* SEV */
>  	if (vcpu->vc_sev)
> @@ -1680,7 +1646,7 @@ vcpu_reset_regs_svm(struct vcpu *vcpu, struct vcpu_reg
>  	/* xcr0 power on default sets bit 0 (x87 state) */
>  	vcpu->vc_gueststate.vg_xcr0 = XFEATURE_X87 & xsave_mask;
> 
> -	vcpu->vc_parent->vm_map->pmap->eptp = 0;
> +	vcpu->vc_parent->vm_pmap->eptp = 0;
> 
>  	return ret;
>  }
> @@ -2601,7 +2567,7 @@ vcpu_init_vmx(struct vcpu *vcpu)
>  	}
> 
>  	/* Configure EPT Pointer */
> -	eptp = vcpu->vc_parent->vm_map->pmap->pm_pdirpa;
> +	eptp = vcpu->vc_parent->vm_pmap->pm_pdirpa;
>  	msr = rdmsr(IA32_VMX_EPT_VPID_CAP);
>  	if (msr & IA32_EPT_VPID_CAP_PAGE_WALK_4) {
>  		/* Page walk length 4 supported */
> @@ -2625,7 +2591,7 @@ vcpu_init_vmx(struct vcpu *vcpu)
>  		goto exit;
>  	}
> 
> -	vcpu->vc_parent->vm_map->pmap->eptp = eptp;
> +	vcpu->vc_parent->vm_pmap->eptp = eptp;
> 
>  	/* Host CR0 */
>  	cr0 = rcr0() & ~CR0_TS;
> @@ -3490,7 +3456,7 @@ vmm_translate_gva(struct vcpu *vcpu, uint64_t va, uint
> 
>  		DPRINTF("%s: read pte level %d @ GPA 0x%llx\n", __func__,
>  		    level, pte_paddr);
> -		if (!pmap_extract(vcpu->vc_parent->vm_map->pmap, pte_paddr,
> +		if (!pmap_extract(vcpu->vc_parent->vm_pmap, pte_paddr,
>  		    &hpa)) {
>  			DPRINTF("%s: cannot extract HPA for GPA 0x%llx\n",
>  			    __func__, pte_paddr);
> @@ -3671,11 +3637,11 @@ vcpu_run_vmx(struct vcpu *vcpu, struct vm_run_params *
> 
>  			/* We're now using this vcpu's EPT pmap on this cpu. */
>  			atomic_swap_ptr(&ci->ci_ept_pmap,
> -			    vcpu->vc_parent->vm_map->pmap);
> +			    vcpu->vc_parent->vm_pmap);
> 
>  			/* Invalidate EPT cache. */
>  			vid_ept.vid_reserved = 0;
> -			vid_ept.vid_eptp = vcpu->vc_parent->vm_map->pmap->eptp;
> +			vid_ept.vid_eptp = vcpu->vc_parent->vm_pmap->eptp;
>  			if (invept(ci->ci_vmm_cap.vcc_vmx.vmx_invept_mode,
>  			    &vid_ept)) {
>  				printf("%s: invept\n", __func__);
> @@ -4461,6 +4427,29 @@ vmm_get_guest_memtype(struct vm *vm, paddr_t gpa)
>  	return (VMM_MEM_TYPE_UNKNOWN);
>  }
> 
> +vaddr_t
> +vmm_translate_gpa(struct vm *vm, paddr_t gpa)
> +{
> +	int i = 0;
> +	vaddr_t hva = 0;
> +	struct vm_mem_range *vmr = NULL;
> +
> +	/*
> +	 * Translate GPA -> userland HVA. Find the memory range containing
> +	 * gpa and use it to translate to the HVA.
> +	 */
> +	for (i = 0; i < vm->vm_nmemranges; i++) {
> +		vmr = &vm->vm_memranges[i];
> +		if (gpa >= vmr->vmr_gpa && gpa < vmr->vmr_gpa + vmr->vmr_size) {
> +			hva = vmr->vmr_va + (gpa - vmr->vmr_gpa);
> +			break;
> +		}
> +	}
> +
> +	return (hva);
> +}
> +
> +
>  /*
>   * vmx_get_exit_qualification
>   *
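
Note that vmm_translate_gpa() returns 0 for any GPA that falls outside
the memory ranges, so the fault handlers below treat an hva of 0 as "no
such guest page".  With a made-up range of vmr_gpa=0x1000,
vmr_size=0x2000, vmr_va=0x7f7e56100000 this would give:

	hva = vmm_translate_gpa(vm, 0x1800);	/* 0x7f7e56100800 */
	hva = vmm_translate_gpa(vm, 0x4000);	/* 0, outside all ranges */
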
> @@ -4542,15 +4531,45 @@ svm_get_guest_faulttype(struct vmcb *vmcb)
>  int
>  svm_fault_page(struct vcpu *vcpu, paddr_t gpa)
>  {
> -	paddr_t pa = trunc_page(gpa);
> -	int ret;
> +	struct proc *p = curproc;
> +	paddr_t hpa, pa = trunc_page(gpa);
> +	vaddr_t hva;
> +	int ret = 1;
> 
> -	ret = uvm_fault_wire(vcpu->vc_parent->vm_map, pa, pa + PAGE_SIZE,
> -	    PROT_READ | PROT_WRITE | PROT_EXEC);
> -	if (ret)
> -		printf("%s: uvm_fault returns %d, GPA=0x%llx, rip=0x%llx\n",
> -		    __func__, ret, (uint64_t)gpa, vcpu->vc_gueststate.vg_rip);
> +	hva = vmm_translate_gpa(vcpu->vc_parent, pa);
> +	if (hva == 0) {
> +		printf("%s: unable to translate gpa 0x%llx\n", __func__,
> +		    (uint64_t)pa);
> +		return (EINVAL);
> +	}
> 
> +	/* If we don't already have a backing page... */
> +	if (!pmap_extract(p->p_vmspace->vm_map.pmap, hva, &hpa)) {
> +		/* ...fault a RW page into the proc's address space... */
> +		ret = uvm_fault_wire(&p->p_vmspace->vm_map, hva,
> +		    hva + PAGE_SIZE, PROT_READ | PROT_WRITE);
> +		if (ret) {
> +			printf("%s: uvm_fault failed %d hva=0x%llx\n", __func__,
> +			    ret, (uint64_t)hva);
> +			return (ret);
> +		}
> +
> +		/* ...and then get the mapping. */
> +		if (!pmap_extract(p->p_vmspace->vm_map.pmap, hva, &hpa)) {
> +			printf("%s: failed to extract hpa for hva 0x%llx\n",
> +			    __func__, (uint64_t)hva);
> +			return (EINVAL);
> +		}
> +	}
> +
> +	/* Now we insert a RWX mapping into the guest's RVI pmap. */
> +	ret = pmap_enter(vcpu->vc_parent->vm_pmap, pa, hpa,
> +	    PROT_READ | PROT_WRITE | PROT_EXEC, 0);
> +	if (ret) {
> +		printf("%s: pmap_enter failed pa=0x%llx, hpa=0x%llx\n",
> +		    __func__, (uint64_t)pa, (uint64_t)hpa);
> +	}
> +
>  	return (ret);
>  }
> 
> @@ -4617,8 +4636,10 @@ svm_handle_np_fault(struct vcpu *vcpu)
>  int
>  vmx_fault_page(struct vcpu *vcpu, paddr_t gpa)
>  {
> +	struct proc *p = curproc;
>  	int fault_type, ret;
> -	paddr_t pa = trunc_page(gpa);
> +	paddr_t hpa, pa = trunc_page(gpa);
> +	vaddr_t hva;
> 
>  	fault_type = vmx_get_guest_faulttype();
>  	switch (fault_type) {
> @@ -4633,19 +4654,45 @@ vmx_fault_page(struct vcpu *vcpu, paddr_t gpa)
>  		break;
>  	}
> 
> -	/* We may sleep during uvm_fault_wire(), so reload VMCS. */
> -	vcpu->vc_last_pcpu = curcpu();
> -	ret = uvm_fault_wire(vcpu->vc_parent->vm_map, pa, pa + PAGE_SIZE,
> -	    PROT_READ | PROT_WRITE | PROT_EXEC);
> -	if (vcpu_reload_vmcs_vmx(vcpu)) {
> -		printf("%s: failed to reload vmcs\n", __func__);
> +	hva = vmm_translate_gpa(vcpu->vc_parent, pa);
> +	if (hva == 0) {
> +		printf("%s: unable to translate gpa 0x%llx\n", __func__,
> +		    (uint64_t)pa);
>  		return (EINVAL);
>  	}
> 
> -	if (ret)
> -		printf("%s: uvm_fault returns %d, GPA=0x%llx, rip=0x%llx\n",
> -		    __func__, ret, (uint64_t)gpa, vcpu->vc_gueststate.vg_rip);
> +	/* If we don't already have a backing page... */
> +	if (!pmap_extract(p->p_vmspace->vm_map.pmap, hva, &hpa)) {
> +		/* ...fault a RW page into the proc's address space... */
> +		vcpu->vc_last_pcpu = curcpu();	/* uvm_fault may sleep. */
> +		ret = uvm_fault_wire(&p->p_vmspace->vm_map, hva,
> +		    hva + PAGE_SIZE, PROT_READ | PROT_WRITE);
> +		if (ret) {
> +			printf("%s: uvm_fault failed %d hva=0x%llx\n", __func__,
> +			    ret, (uint64_t)hva);
> +			return (ret);
> +		}
> +		if (vcpu_reload_vmcs_vmx(vcpu)) {
> +			printf("%s: failed to reload vmcs\n", __func__);
> +			return (EINVAL);
> +		}
> 
> +		/* ...and then get the mapping. */
> +		if (!pmap_extract(p->p_vmspace->vm_map.pmap, hva, &hpa)) {
> +			printf("%s: failed to extract hpa for hva 0x%llx\n",
> +			    __func__, (uint64_t)hva);
> +			return (EINVAL);
> +		}
> +	}
> +
> +	/* Now we insert a RWX mapping into the guest's EPT pmap. */
> +	ret = pmap_enter(vcpu->vc_parent->vm_pmap, pa, hpa,
> +	    PROT_READ | PROT_WRITE | PROT_EXEC, 0);
> +	if (ret) {
> +		printf("%s: pmap_enter failed pa=0x%llx, hpa=0x%llx\n",
> +		    __func__, (uint64_t)pa, (uint64_t)hpa);
> +	}
> +
>  	return (ret);
>  }
> 
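
So svm_fault_page() and vmx_fault_page() now share the same three
steps; condensed (sketch, error paths and the VMX-only VMCS reload
elided):

	/* 1. GPA -> HVA via the memory ranges. */
	hva = vmm_translate_gpa(vcpu->vc_parent, pa);

	/* 2. Make sure the proc has a backing page and get its HPA. */
	if (!pmap_extract(p->p_vmspace->vm_map.pmap, hva, &hpa)) {
		uvm_fault_wire(&p->p_vmspace->vm_map, hva, hva + PAGE_SIZE,
		    PROT_READ | PROT_WRITE);
		pmap_extract(p->p_vmspace->vm_map.pmap, hva, &hpa);
	}

	/* 3. Enter the GPA -> HPA translation into the nested pmap. */
	pmap_enter(vcpu->vc_parent->vm_pmap, pa, hpa,
	    PROT_READ | PROT_WRITE | PROT_EXEC, 0);
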
> @@ -4930,7 +4977,7 @@ vmx_load_pdptes(struct vcpu *vcpu)
>  		return (EINVAL);
>  	}
> 
> -	if (!pmap_extract(vcpu->vc_parent->vm_map->pmap, (vaddr_t)cr3,
> +	if (!pmap_extract(vcpu->vc_parent->vm_pmap, (vaddr_t)cr3,
>  	    (paddr_t *)&cr3_host_phys)) {
>  		DPRINTF("%s: nonmapped guest CR3, setting PDPTEs to 0\n",
>  		    __func__);
> @@ -6459,7 +6506,7 @@ vmm_update_pvclock(struct vcpu *vcpu)
> 
>  	if (vcpu->vc_pvclock_system_gpa & PVCLOCK_SYSTEM_TIME_ENABLE) {
>  		pvclock_gpa = vcpu->vc_pvclock_system_gpa & 0xFFFFFFFFFFFFFFF0;
> -		if (!pmap_extract(vm->vm_map->pmap, pvclock_gpa, &pvclock_hpa))
> +		if (!pmap_extract(vm->vm_pmap, pvclock_gpa, &pvclock_hpa))
>  			return (EINVAL);
>  		pvclock_ti = (void*) PMAP_DIRECT_MAP(pvclock_hpa);
> 
> blob - 0a86ddbecd3fbd3627336902c7209f6d023bc3f0
> blob + bc03a35c8cf161d3e4164f46d159325b9ef34690
> --- sys/dev/vmm/vmm.c
> +++ sys/dev/vmm/vmm.c
> @@ -25,6 +25,9 @@
>  #include 
>  #include 
> 
> +#include 
> +#include 
> +
>  #include 
> 
>  #include 
> @@ -356,6 +359,9 @@ vm_create(struct vm_create_params *vcp, struct proc *p
>  	size_t memsize;
>  	struct vm *vm;
>  	struct vcpu *vcpu;
> +	struct uvm_object *uao;
> +	struct vm_mem_range *vmr;
> +	unsigned int uvmflags = 0;
> 
>  	memsize = vm_create_check_mem_ranges(vcp);
>  	if (memsize == 0)
> @@ -378,13 +384,58 @@ vm_create(struct vm_create_params *vcp, struct proc *p
>  	/* Instantiate and configure the new vm. */
>  	vm = pool_get(&vm_pool, PR_WAITOK | PR_ZERO);
> 
> +	/* Create the VM's identity. */
>  	vm->vm_creator_pid = p->p_p->ps_pid;
> +	strncpy(vm->vm_name, vcp->vcp_name, VMM_MAX_NAME_LEN - 1);
> +
> +	/* Create the pmap for nested paging. */
> +	vm->vm_pmap = pmap_create();
> +
> +	/* Initialize memory slots. */
>  	vm->vm_nmemranges = vcp->vcp_nmemranges;
>  	memcpy(vm->vm_memranges, vcp->vcp_memranges,
>  	    vm->vm_nmemranges * sizeof(vm->vm_memranges[0]));
> -	vm->vm_memory_size = memsize;
> -	strncpy(vm->vm_name, vcp->vcp_name, VMM_MAX_NAME_LEN - 1);
> +	vm->vm_memory_size = memsize;	/* Calculated above. */
> 
> +	uvmflags = UVM_MAPFLAG(PROT_READ | PROT_WRITE, PROT_READ | PROT_WRITE,
> +	    MAP_INHERIT_NONE, MADV_NORMAL, UVM_FLAG_CONCEAL);
> +	for (i = 0; i < vm->vm_nmemranges; i++) {
> +		vmr = &vm->vm_memranges[i];
> +
> +		if (vmr->vmr_type == VM_MEM_MMIO)
> +			continue;
> +
> +		uao = NULL;
> +		uao = uao_create(vmr->vmr_size, UAO_FLAG_CANFAIL);
> +		if (uao == NULL) {
> +			printf("%s: failed to initialize memory slot\n",
> +			    __func__);
> +			vm_teardown(&vm);
> +			return (ENOMEM);
> +		}
> +
> +		/* Map the UVM aobj into the process. */
> +		uao_reference(uao);
> +		ret = uvm_map(&p->p_vmspace->vm_map, &vmr->vmr_va,
> +		    vmr->vmr_size, uao, 0, 0, uvmflags);
> +		if (ret) {
> +			printf("%s: uvm_map failed: %d\n", __func__, ret);
> +			uao_detach(uao);
> +			vm_teardown(&vm);
> +			return (ENOMEM);
> +		}
> +		vm->vm_memory_slot[i] = uao;
> +
> +		ret = uvm_map_immutable(&p->p_vmspace->vm_map, vmr->vmr_va,
> +		    vmr->vmr_va + vmr->vmr_size, 1);
> +		if (ret) {
> +			printf("%s: uvm_map_immutable failed: %d\n", __func__,
> +			    ret);
> +			vm_teardown(&vm);
> +			return (ret);
> +		}
> +	}
> +
>  	if (vm_impl_init(vm, p)) {
>  		printf("failed to init arch-specific features for vm %p\n", vm);
>  		vm_teardown(&vm);
> @@ -433,6 +484,10 @@ vm_create(struct vm_create_params *vcp, struct proc *p
>  	vmm_softc->vm_ct++;
>  	vmm_softc->vcpu_ct += vm->vm_vcpu_ct;
> 
> +	/* Update the userland process's view of guest memory. */
> +	memcpy(vcp->vcp_memranges, vm->vm_memranges,
> +	    vcp->vcp_nmemranges * sizeof(vcp->vcp_memranges[0]));
> +
>  	rw_exit_write(&vmm_softc->vm_lock);
> 
>  	return (0);
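
If I read the reference counting right, each slot ends up with two
references: the one returned by uao_create() stays with
vm_memory_slot[i] for the VM's lifetime, and the extra uao_reference()
is handed over to the map entry uvm_map() creates, so the process
unmapping or exiting can never free guest memory behind vmm's back.
Per slot the pattern is (sketch):

	uao = uao_create(size, UAO_FLAG_CANFAIL);  /* ref 1: the slot */
	uao_reference(uao);			   /* ref 2: the map entry */
	uvm_map(&p->p_vmspace->vm_map, &va, size, uao, 0, 0, uvmflags);
	uvm_map_immutable(&p->p_vmspace->vm_map, va, va + size, 1);
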
> @@ -482,19 +537,6 @@ vm_create_check_mem_ranges(struct vm_create_params *vc
>  	}
> 
>  	/*
> -	 * Make sure that all virtual addresses are within the address
> -	 * space of the process and that they do not wrap around.
> -	 * Calling uvm_share() when creating the VM will take care of
> -	 * further checks.
> -	 */
> -	if (vmr->vmr_va < VM_MIN_ADDRESS ||
> -	    vmr->vmr_va >= VM_MAXUSER_ADDRESS ||
> -	    vmr->vmr_size >= VM_MAXUSER_ADDRESS - vmr->vmr_va) {
> -		DPRINTF("guest va not within range or wraps\n");
> -		return (0);
> -	}
> -
> -	/*
>  	 * Make sure that guest physical memory ranges do not overlap
>  	 * and that they are ascending.
>  	 */
> @@ -529,10 +571,11 @@ vm_create_check_mem_ranges(struct vm_create_params *vc
>  void
>  vm_teardown(struct vm **target)
>  {
> -	size_t nvcpu = 0;
> +	size_t i, nvcpu = 0;
> +	vaddr_t sva, eva;
>  	struct vcpu *vcpu, *tmp;
>  	struct vm *vm = *target;
> -	struct vmspace *vm_vmspace;
> +	struct uvm_object *uao;
> 
>  	KERNEL_ASSERT_UNLOCKED();
> 
> @@ -540,22 +583,29 @@ vm_teardown(struct vm **target)
>  	SLIST_FOREACH_SAFE(vcpu, &vm->vm_vcpu_list, vc_vcpu_link, tmp) {
>  		SLIST_REMOVE(&vm->vm_vcpu_list, vcpu, vcpu, vc_vcpu_link);
>  		vcpu_deinit(vcpu);
> -
>  		pool_put(&vcpu_pool, vcpu);
>  		nvcpu++;
>  	}
> 
> -	vm_impl_deinit(vm);
> +	/* Remove guest mappings from our nested page tables. */
> +	for (i = 0; i < vm->vm_nmemranges; i++) {
> +		sva = vm->vm_memranges[i].vmr_gpa;
> +		eva = sva + vm->vm_memranges[i].vmr_size - 1;
> +		pmap_remove(vm->vm_pmap, sva, eva);
> +	}
> 
> -	/* teardown guest vmspace */
> -	KERNEL_LOCK();
> -	vm_vmspace = vm->vm_vmspace;
> -	if (vm_vmspace != NULL) {
> -		vm->vm_vmspace = NULL;
> -		uvmspace_free(vm_vmspace);
> +	/* Release UVM anon objects backing our guest memory. */
> +	for (i = 0; i < vm->vm_nmemranges; i++) {
> +		uao = vm->vm_memory_slot[i];
> +		vm->vm_memory_slot[i] = NULL;
> +		if (uao != NULL)
> +			uao_detach(uao);
>  	}
> -	KERNEL_UNLOCK();
> 
> +	/* At this point, no UVM-managed pages should reference our pmap. */
> +	pmap_destroy(vm->vm_pmap);
> +	vm->vm_pmap = NULL;
> +
>  	pool_put(&vm_pool, vm);
>  	*target = NULL;
>  }
> @@ -612,7 +662,7 @@ vm_get_info(struct vm_info_params *vip)
> 
>  		out[i].vir_memory_size = vm->vm_memory_size;
>  		out[i].vir_used_size =
> -		    pmap_resident_count(vm->vm_map->pmap) * PAGE_SIZE;
> +		    pmap_resident_count(vm->vm_pmap) * PAGE_SIZE;
>  		out[i].vir_ncpus = vm->vm_vcpu_ct;
>  		out[i].vir_id = vm->vm_id;
>  		out[i].vir_creator_pid = vm->vm_creator_pid;
> @@ -793,9 +843,7 @@ vcpu_must_stop(struct vcpu *vcpu)
>   * Return values:
>   *  0: if successful
>   *  ENOENT: if the vm cannot be found by vm_find
> - *  EPERM: if the vm cannot be accessed by the current process
> - *  EINVAL: if the provide memory ranges fail checks
> - *  ENOMEM: if uvm_share fails to find available memory in the destination map
> + *  other errno on uvm_map or uvm_map_immutable failures
>   */
>  int
>  vm_share_mem(struct vm_sharemem_params *vsp, struct proc *p)
> @@ -804,6 +852,8 @@ vm_share_mem(struct vm_sharemem_params *vsp, struct pr
>  	size_t i, n;
>  	struct vm *vm;
>  	struct vm_mem_range *src, *dst;
> +	struct uvm_object *uao;
> +	unsigned int uvmflags;
> 
>  	ret = vm_find(vsp->vsp_vm_id, &vm);
>  	if (ret)
> @@ -830,33 +880,41 @@ vm_share_mem(struct vm_sharemem_params *vsp, struct pr
>  		if (src->vmr_size != dst->vmr_size)
>  			goto out;
> 
> -		/* Check our intended destination is page-aligned. */
> -		if (dst->vmr_va & PAGE_MASK)
> +		/* The virtual addresses will be chosen by uvm_map(). */
> +		if (vsp->vsp_va[i] != 0)
>  			goto out;
>  	}
> 
>  	/*
> -	 * Share each range individually with the calling process. We do
> -	 * not need PROC_EXEC as the emulated devices do not need to execute
> -	 * instructions from guest memory.
> +	 * Share each UVM aobj with the calling process. We do not need
> +	 * PROC_EXEC as the emulated devices do not execute guest code.
>  	 */
> +	uvmflags = UVM_MAPFLAG(PROT_READ | PROT_WRITE, PROT_READ | PROT_WRITE,
> +	    MAP_INHERIT_NONE, MADV_NORMAL, UVM_FLAG_CONCEAL);
>  	for (i = 0; i < n; i++) {
> -		src = &vm->vm_memranges[i];
>  		dst = &vsp->vsp_memranges[i];
> -
> -		/* Skip MMIO range. */
> -		if (src->vmr_type == VM_MEM_MMIO)
> +		if (dst->vmr_type == VM_MEM_MMIO)
>  			continue;
> 
> -		DPRINTF("sharing gpa=0x%lx for pid %d @ va=0x%lx\n",
> -		    src->vmr_gpa, p->p_p->ps_pid, dst->vmr_va);
> -		ret = uvm_share(&p->p_vmspace->vm_map, dst->vmr_va,
> -		    PROT_READ | PROT_WRITE, vm->vm_map, src->vmr_gpa,
> -		    src->vmr_size);
> +		uao = vm->vm_memory_slot[i];
> +		KASSERT(uao != NULL);
> +
> +		uao_reference(uao);
> +		ret = uvm_map(&p->p_p->ps_vmspace->vm_map, &vsp->vsp_va[i],
> +		    dst->vmr_size, uao, 0, 0, uvmflags);
>  		if (ret) {
> -			printf("%s: uvm_share failed (%d)\n", __func__, ret);
> -			break;
> +			uao_detach(uao);
> +			printf("%s: uvm_map failed: %d\n", __func__, ret);
> +			goto out;
>  		}
> +
> +		ret = uvm_map_immutable(&p->p_p->ps_vmspace->vm_map,
> +		    vsp->vsp_va[i], vsp->vsp_va[i] + dst->vmr_size, 1);
> +		if (ret) {
> +			printf("%s: uvm_map_immutable failed: %d\n",
> +			    __func__, ret);
> +			goto out;
> +		}
>  	}
>  	ret = 0;
>  out:
> blob - ca5a152f550c3795714128f1a0d764dd2e593e59
> blob + 46fe5ff93413a329065196ad59343759c44aae33
> --- sys/dev/vmm/vmm.h
> +++ sys/dev/vmm/vmm.h
> @@ -94,6 +94,9 @@ struct vm_sharemem_params {
>  	uint32_t vsp_vm_id;
>  	size_t vsp_nmemranges;
>  	struct vm_mem_range vsp_memranges[VMM_MAX_MEM_RANGES];
> +
> +	/* Output parameters from VMM_IOC_SHAREMEM */
> +	vaddr_t vsp_va[VMM_MAX_MEM_RANGES];
>  };
> 
>  struct vm_run_params {
> @@ -137,7 +140,7 @@ struct vm_rwvmparams_params {
>  #define VMM_IOC_READVMPARAMS _IOWR('V', 9, struct vm_rwvmparams_params)
>  /* Set VM params */
>  #define VMM_IOC_WRITEVMPARAMS _IOW('V', 10, struct vm_rwvmparams_params)
> -#define VMM_IOC_SHAREMEM _IOW('V', 11, struct vm_sharemem_params)
> +#define VMM_IOC_SHAREMEM _IOWR('V', 11, struct vm_sharemem_params)
> 
>  #ifdef _KERNEL
> 
> @@ -168,14 +171,17 @@ enum {
>   * V	vmm_softc's vm_lock
>   */
>  struct vm {
> -	struct vmspace *vm_vmspace;		/* [K] */
> -	vm_map_t vm_map;			/* [K] */
> +	pmap_t vm_pmap;				/* [r] */
> +
>  	uint32_t vm_id;				/* [I] */
>  	pid_t vm_creator_pid;			/* [I] */
> +
>  	size_t vm_nmemranges;			/* [I] */
>  	size_t vm_memory_size;			/* [I] */
> -	char vm_name[VMM_MAX_NAME_LEN];
>  	struct vm_mem_range vm_memranges[VMM_MAX_MEM_RANGES];
> +	struct uvm_object *vm_memory_slot[VMM_MAX_MEM_RANGES];	/* [I] */
> +
> +	char vm_name[VMM_MAX_NAME_LEN];
>  	struct refcnt vm_refcnt;		/* [a] */
> 
>  	struct vcpu_head vm_vcpu_list;		/* [v] */
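
With VMM_IOC_SHAREMEM turned into an _IOWR and vsp_va[] acting as
output parameters, the device processes no longer have to reserve
address space with a throwaway mmap() first.  The caller contract is
now (sketch):

	memset(&vsp, 0, sizeof(vsp));	/* vsp_va[] must be 0 on entry */
	/* ...fill in vsp_vm_id, vsp_nmemranges, vsp_memranges... */
	if (ioctl(vmm_fd, VMM_IOC_SHAREMEM, &vsp) == -1)
		return (errno);
	/* vsp.vsp_va[i] now holds the kernel-chosen mapping address. */

remap_guest_mem() at the end of the diff reduces to exactly that.
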
> blob - e399c0c0439a02117bc53c62cbb59c950b176773
> blob + 30b12b0e30936bddf75484aeedf60a34b1296d89
> --- usr.sbin/vmd/vm.c
> +++ usr.sbin/vmd/vm.c
> @@ -49,7 +49,6 @@ static void vm_dispatch_vmm(int, short, void *);
>  static void *event_thread(void *);
>  static void *vcpu_run_loop(void *);
>  static int vmm_create_vm(struct vmd_vm *);
> -static int alloc_guest_mem(struct vmd_vm *);
>  static int send_vm(int, struct vmd_vm *);
>  static int dump_vmr(int , struct vm_mem_range *);
>  static int dump_mem(int, struct vmd_vm *);
> @@ -201,7 +200,8 @@ start_vm(struct vmd_vm *vm, int fd)
>  	if (!(vm->vm_state & VM_STATE_RECEIVED))
>  		create_memory_map(vcp);
> 
> -	ret = alloc_guest_mem(vm);
> +	/* Create the vm in vmm(4). */
> +	ret = vmm_create_vm(vm);
>  	if (ret) {
>  		struct rlimit lim;
>  		char buf[FMT_SCALED_STRSIZE];
> @@ -209,15 +209,11 @@
>  			if (fmt_scaled(lim.rlim_cur, buf) == 0)
>  				fatalx("could not allocate guest memory (data "
>  				    "limit is %s)", buf);
> +		} else {
> +			errno = ret;
> +			log_warn("could not create vm");
>  		}
> -		errno = ret;
> -		log_warn("could not allocate guest memory");
> -		return (ret);
> -	}
> 
> -	/* We've allocated guest memory, so now create the vm in vmm(4). */
> -	ret = vmm_create_vm(vm);
> -	if (ret) {
>  		/* Let the vmm process know we failed by sending a 0 vm id. */
>  		vcp->vcp_id = 0;
>  		atomicio(vwrite, fd, &vcp->vcp_id, sizeof(vcp->vcp_id));
> @@ -756,62 +752,6 @@ vcpu_reset(uint32_t vmid, uint32_t vcpu_id, struct vcp
>  }
> 
>  /*
> - * alloc_guest_mem
> - *
> - * Allocates memory for the guest.
> - * Instead of doing a single allocation with one mmap(), we allocate memory
> - * separately for every range for the following reasons:
> - * - ASLR for the individual ranges
> - * - to reduce memory consumption in the UVM subsystem: if vmm(4) had to
> - *   map the single mmap'd userspace memory to the individual guest physical
> - *   memory ranges, the underlying amap of the single mmap'd range would have
> - *   to allocate per-page reference counters. The reason is that the
> - *   individual guest physical ranges would reference the single mmap'd region
> - *   only partially. However, if every guest physical range has its own
> - *   corresponding mmap'd userspace allocation, there are no partial
> - *   references: every guest physical range fully references an mmap'd
> - *   range => no per-page reference counters have to be allocated.
> - *
> - * Return values:
> - *  0: success
> - *  !0: failure - errno indicating the source of the failure
> - */
> -int
> -alloc_guest_mem(struct vmd_vm *vm)
> -{
> -	void *p;
> -	int ret = 0;
> -	size_t i, j;
> -	struct vm_create_params *vcp = &vm->vm_params.vmc_params;
> -	struct vm_mem_range *vmr;
> -
> -	for (i = 0; i < vcp->vcp_nmemranges; i++) {
> -		vmr = &vcp->vcp_memranges[i];
> -
> -		/*
> -		 * We only need R/W as userland. vmm(4) will use R/W/X in its
> -		 * mapping.
> -		 *
> -		 * We must use MAP_SHARED so emulated devices will be able
> -		 * to generate shared mappings.
> -		 */
> -		p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE,
> -		    MAP_ANON | MAP_CONCEAL | MAP_SHARED, -1, 0);
> -		if (p == MAP_FAILED) {
> -			ret = errno;
> -			for (j = 0; j < i; j++) {
> -				vmr = &vcp->vcp_memranges[j];
> -				munmap((void *)vmr->vmr_va, vmr->vmr_size);
> -			}
> -			return (ret);
> -		}
> -		vmr->vmr_va = (vaddr_t)p;
> -	}
> -
> -	return (ret);
> -}
> -
> -/*
>   * vmm_create_vm
>   *
>   * Requests vmm(4) to create a new VM using the supplied creation
> @@ -1384,87 +1324,35 @@ vm_pipe_recv(struct vm_dev_pipe *p)
>  }
> 
>  /*
> - * Re-map the guest address space using vmm(4)'s VMM_IOC_SHARE
> + * Re-map the guest address space using vmm(4)'s VMM_IOC_SHAREMEM
>   *
> - * Returns 0 on success, non-zero in event of failure.
> + * Returns 0 on success or an errno in event of failure.
>   */
>  int
>  remap_guest_mem(struct vmd_vm *vm, int vmm_fd)
>  {
> -	struct vm_create_params *vcp;
> -	struct vm_mem_range *vmr;
> +	size_t i;
> +	struct vm_create_params *vcp = &vm->vm_params.vmc_params;
>  	struct vm_sharemem_params vsp;
> -	size_t i, j;
> -	void *p = NULL;
> -	int ret;
> 
>  	if (vm == NULL)
> -		return (1);
> +		return (EINVAL);
> 
> -	vcp = &vm->vm_params.vmc_params;
> -
> -	/*
> -	 * Initialize our VM shared memory request using our original
> -	 * creation parameters. We'll overwrite the va's after mmap(2).
> -	 */
> +	/* Initialize using our original creation parameters. */
>  	memset(&vsp, 0, sizeof(vsp));
>  	vsp.vsp_nmemranges = vcp->vcp_nmemranges;
>  	vsp.vsp_vm_id = vcp->vcp_id;
>  	memcpy(&vsp.vsp_memranges, &vcp->vcp_memranges,
>  	    sizeof(vsp.vsp_memranges));
> 
> -	/*
> -	 * Use mmap(2) to identify virtual address space for our mappings.
> -	 */
> -	for (i = 0; i < VMM_MAX_MEM_RANGES; i++) {
> -		if (i < vsp.vsp_nmemranges) {
> -			vmr = &vsp.vsp_memranges[i];
> -
> -			/* Ignore any MMIO ranges. */
> -			if (vmr->vmr_type == VM_MEM_MMIO) {
> -				vmr->vmr_va = 0;
> -				vcp->vcp_memranges[i].vmr_va = 0;
> -				continue;
> -			}
> -
> -			/* Make initial mappings for the memrange. */
> -			p = mmap(NULL, vmr->vmr_size, PROT_READ, MAP_ANON, -1,
> -			    0);
> -			if (p == MAP_FAILED) {
> -				ret = errno;
> -				log_warn("%s: mmap", __func__);
> -				for (j = 0; j < i; j++) {
> -					vmr = &vcp->vcp_memranges[j];
> -					munmap((void *)vmr->vmr_va,
> -					    vmr->vmr_size);
> -				}
> -				return (ret);
> -			}
> -			vmr->vmr_va = (vaddr_t)p;
> -			vcp->vcp_memranges[i].vmr_va = vmr->vmr_va;
> -		}
> -	}
> -
> -	/*
> -	 * munmap(2) now that we have va's and ranges that don't overlap. vmm
> -	 * will use the va's and sizes to recreate the mappings for us.
> -	 */
> -	for (i = 0; i < vsp.vsp_nmemranges; i++) {
> -		vmr = &vsp.vsp_memranges[i];
> -		if (vmr->vmr_type == VM_MEM_MMIO)
> -			continue;
> -		if (munmap((void*)vmr->vmr_va, vmr->vmr_size) == -1)
> -			fatal("%s: munmap", __func__);
> -	}
> -
> -	/*
> -	 * Ask vmm to enter the shared mappings for us. They'll point
> -	 * to the same host physical memory, but will have a randomized
> -	 * virtual address for the calling process.
> -	 */
> +	/* Ask vmm(4) to enter a shared mapping to guest memory. */
>  	if (ioctl(vmm_fd, VMM_IOC_SHAREMEM, &vsp) == -1)
>  		return (errno);
> 
> +	/* Update with the location of the new mappings. */
> +	for (i = 0; i < vsp.vsp_nmemranges; i++)
> +		vcp->vcp_memranges[i].vmr_va = vsp.vsp_va[i];
> +
>  	return (0);
>  }
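
Same pattern on create: vm_create() copies the kernel-chosen vmr_va
values back into vcp_memranges before returning, so on the vmd(8) side
guest memory simply appears after the create ioctl (sketch;
vmm_create_vm() already issues it):

	if (ioctl(env->vmd_fd, VMM_IOC_CREATE, vcp) == -1)
		return (errno);
	/* vcp->vcp_memranges[i].vmr_va now point at guest memory. */

With alloc_guest_mem() gone, vmd never mmap()s guest RAM itself.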