Reworking VMM's nested paging & guest memory (de-vmspace-ification)
On Wed, Apr 16, 2025 at 09:48:29PM -0400, Dave Voutila wrote:
> Some positive test reports and so far it looks like bluhm's issue was
> hardware related, but would still like confirmation that this *works*
> with SEV mode enabled on a guest vm.
I have fixed my BIOS settings and can run SEV again. Diff works on
AMD and Intel. AMD-SEV also tested.
cpu0: AMD EPYC 73F3 16-Core Processor, 3500.00 MHz, 19-01-01, patch 0a0011d5
cpu0: Intel(R) Xeon(R) CPU E5-2620 v3 @ 2.40GHz, 2600.09 MHz, 06-3f-02, patch 00000049
bluhm
> diff refs/heads/master refs/heads/vmm-uobj-v5
> commit - 4c8bb646499097485d416c4552a733b35394749d
> commit + fd3509d9f67be2e036257c8829605d6fdeb1de6d
> blob - e3205f48eedda8f4158d9a8d42e6164534921844
> blob + e58730ec23702850524db522dd82d750589d4c18
> --- sys/arch/amd64/amd64/vmm_machdep.c
> +++ sys/arch/amd64/amd64/vmm_machdep.c
> @@ -115,6 +115,7 @@ int vmm_inject_db(struct vcpu *);
> void vmx_handle_intr(struct vcpu *);
> void vmx_handle_misc_enable_msr(struct vcpu *);
> int vmm_get_guest_memtype(struct vm *, paddr_t);
> +vaddr_t vmm_translate_gpa(struct vm *, paddr_t);
> int vmx_get_guest_faulttype(void);
> int svm_get_guest_faulttype(struct vmcb *);
> int vmx_get_exit_qualification(uint64_t *);
> @@ -905,54 +906,19 @@ vmx_remote_vmclear(struct cpu_info *ci, struct vcpu *v
> int
> vm_impl_init(struct vm *vm, struct proc *p)
> {
> - int i, mode, ret;
> - vaddr_t mingpa, maxgpa;
> - struct vm_mem_range *vmr;
> -
> /* If not EPT or RVI, nothing to do here */
> switch (vmm_softc->mode) {
> case VMM_MODE_EPT:
> - mode = PMAP_TYPE_EPT;
> + pmap_convert(vm->vm_pmap, PMAP_TYPE_EPT);
> break;
> case VMM_MODE_RVI:
> - mode = PMAP_TYPE_RVI;
> + pmap_convert(vm->vm_pmap, PMAP_TYPE_RVI);
> break;
> default:
> printf("%s: invalid vmm mode %d\n", __func__, vmm_softc->mode);
> return (EINVAL);
> }
>
> - vmr = &vm->vm_memranges[0];
> - mingpa = vmr->vmr_gpa;
> - vmr = &vm->vm_memranges[vm->vm_nmemranges - 1];
> - maxgpa = vmr->vmr_gpa + vmr->vmr_size;
> -
> - /*
> - * uvmspace_alloc (currently) always returns a valid vmspace
> - */
> - vm->vm_vmspace = uvmspace_alloc(mingpa, maxgpa, TRUE, FALSE);
> - vm->vm_map = &vm->vm_vmspace->vm_map;
> -
> - /* Map the new map with an anon */
> - DPRINTF("%s: created vm_map @ %p\n", __func__, vm->vm_map);
> - for (i = 0; i < vm->vm_nmemranges; i++) {
> - vmr = &vm->vm_memranges[i];
> - ret = uvm_share(vm->vm_map, vmr->vmr_gpa,
> - PROT_READ | PROT_WRITE | PROT_EXEC,
> - &p->p_vmspace->vm_map, vmr->vmr_va, vmr->vmr_size);
> - if (ret) {
> - printf("%s: uvm_share failed (%d)\n", __func__, ret);
> - /* uvmspace_free calls pmap_destroy for us */
> - KERNEL_LOCK();
> - uvmspace_free(vm->vm_vmspace);
> - vm->vm_vmspace = NULL;
> - KERNEL_UNLOCK();
> - return (ENOMEM);
> - }
> - }
> -
> - pmap_convert(vm->vm_map->pmap, mode);
> -
> return (0);
> }
>
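
The vmspace round-trip is gone: the nested pmap is now created directly
in vm_create() and only converted here. As a rough sketch, paraphrasing
the diff rather than quoting it, the pmap lifecycle now reads:

	vm->vm_pmap = pmap_create();               /* vm_create() */
	pmap_convert(vm->vm_pmap, PMAP_TYPE_EPT);  /* vm_impl_init(), or RVI */
	/* per nested page fault: */
	pmap_enter(vm->vm_pmap, gpa, hpa,
	    PROT_READ | PROT_WRITE | PROT_EXEC, 0);
	/* vm_teardown(), per memrange: */
	pmap_remove(vm->vm_pmap, sva, eva);
	pmap_destroy(vm->vm_pmap);
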
> @@ -1666,7 +1632,7 @@ vcpu_reset_regs_svm(struct vcpu *vcpu, struct vcpu_reg
>
> /* NPT */
> vmcb->v_np_enable = SVM_ENABLE_NP;
> - vmcb->v_n_cr3 = vcpu->vc_parent->vm_map->pmap->pm_pdirpa;
> + vmcb->v_n_cr3 = vcpu->vc_parent->vm_pmap->pm_pdirpa;
>
> /* SEV */
> if (vcpu->vc_sev)
> @@ -1680,7 +1646,7 @@ vcpu_reset_regs_svm(struct vcpu *vcpu, struct vcpu_reg
> /* xcr0 power on default sets bit 0 (x87 state) */
> vcpu->vc_gueststate.vg_xcr0 = XFEATURE_X87 & xsave_mask;
>
> - vcpu->vc_parent->vm_map->pmap->eptp = 0;
> + vcpu->vc_parent->vm_pmap->eptp = 0;
>
> return ret;
> }
> @@ -2601,7 +2567,7 @@ vcpu_init_vmx(struct vcpu *vcpu)
> }
>
> /* Configure EPT Pointer */
> - eptp = vcpu->vc_parent->vm_map->pmap->pm_pdirpa;
> + eptp = vcpu->vc_parent->vm_pmap->pm_pdirpa;
> msr = rdmsr(IA32_VMX_EPT_VPID_CAP);
> if (msr & IA32_EPT_VPID_CAP_PAGE_WALK_4) {
> /* Page walk length 4 supported */
> @@ -2625,7 +2591,7 @@ vcpu_init_vmx(struct vcpu *vcpu)
> goto exit;
> }
>
> - vcpu->vc_parent->vm_map->pmap->eptp = eptp;
> + vcpu->vc_parent->vm_pmap->eptp = eptp;
>
> /* Host CR0 */
> cr0 = rcr0() & ~CR0_TS;
> @@ -3490,7 +3456,7 @@ vmm_translate_gva(struct vcpu *vcpu, uint64_t va, uint
>
> DPRINTF("%s: read pte level %d @ GPA 0x%llx\n", __func__,
> level, pte_paddr);
> - if (!pmap_extract(vcpu->vc_parent->vm_map->pmap, pte_paddr,
> + if (!pmap_extract(vcpu->vc_parent->vm_pmap, pte_paddr,
> &hpa)) {
> DPRINTF("%s: cannot extract HPA for GPA 0x%llx\n",
> __func__, pte_paddr);
> @@ -3671,11 +3637,11 @@ vcpu_run_vmx(struct vcpu *vcpu, struct vm_run_params *
>
> /* We're now using this vcpu's EPT pmap on this cpu. */
> atomic_swap_ptr(&ci->ci_ept_pmap,
> - vcpu->vc_parent->vm_map->pmap);
> + vcpu->vc_parent->vm_pmap);
>
> /* Invalidate EPT cache. */
> vid_ept.vid_reserved = 0;
> - vid_ept.vid_eptp = vcpu->vc_parent->vm_map->pmap->eptp;
> + vid_ept.vid_eptp = vcpu->vc_parent->vm_pmap->eptp;
> if (invept(ci->ci_vmm_cap.vcc_vmx.vmx_invept_mode,
> &vid_ept)) {
> printf("%s: invept\n", __func__);
> @@ -4461,6 +4427,29 @@ vmm_get_guest_memtype(struct vm *vm, paddr_t gpa)
> return (VMM_MEM_TYPE_UNKNOWN);
> }
>
> +vaddr_t
> +vmm_translate_gpa(struct vm *vm, paddr_t gpa)
> +{
> + int i = 0;
> + vaddr_t hva = 0;
> + struct vm_mem_range *vmr = NULL;
> +
> + /*
> + * Translate GPA -> userland HVA. Find the memory range
> + * containing gpa and use it to compute the HVA.
> + */
> + for (i = 0; i < vm->vm_nmemranges; i++) {
> + vmr = &vm->vm_memranges[i];
> + if (gpa >= vmr->vmr_gpa && gpa < vmr->vmr_gpa + vmr->vmr_size) {
> + hva = vmr->vmr_va + (gpa - vmr->vmr_gpa);
> + break;
> + }
> + }
> +
> + return (hva);
> +}
> +
> +
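
The helper is plain offset arithmetic over the memrange table. With
made-up numbers: for a range with vmr_gpa = 0x100000, vmr_size =
0x200000 and vmr_va = 0x7f7e12340000, a fault at gpa = 0x180000 gives

	hva = vmr_va + (gpa - vmr_gpa)
	    = 0x7f7e12340000 + 0x80000
	    = 0x7f7e123c0000

while a gpa outside every range returns 0, which the callers treat as
a translation failure.
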
> /*
> * vmx_get_exit_qualification
> *
> @@ -4542,15 +4531,45 @@ svm_get_guest_faulttype(struct vmcb *vmcb)
> int
> svm_fault_page(struct vcpu *vcpu, paddr_t gpa)
> {
> - paddr_t pa = trunc_page(gpa);
> - int ret;
> + struct proc *p = curproc;
> + paddr_t hpa, pa = trunc_page(gpa);
> + vaddr_t hva;
> + int ret = 1;
>
> - ret = uvm_fault_wire(vcpu->vc_parent->vm_map, pa, pa + PAGE_SIZE,
> - PROT_READ | PROT_WRITE | PROT_EXEC);
> - if (ret)
> - printf("%s: uvm_fault returns %d, GPA=0x%llx, rip=0x%llx\n",
> - __func__, ret, (uint64_t)gpa, vcpu->vc_gueststate.vg_rip);
> + hva = vmm_translate_gpa(vcpu->vc_parent, pa);
> + if (hva == 0) {
> + printf("%s: unable to translate gpa 0x%llx\n", __func__,
> + (uint64_t)pa);
> + return (EINVAL);
> + }
>
> + /* If we don't already have a backing page... */
> + if (!pmap_extract(p->p_vmspace->vm_map.pmap, hva, &hpa)) {
> + /* ...fault a RW page into the p's address space... */
> + ret = uvm_fault_wire(&p->p_vmspace->vm_map, hva,
> + hva + PAGE_SIZE, PROT_READ | PROT_WRITE);
> + if (ret) {
> + printf("%s: uvm_fault failed %d hva=0x%llx\n", __func__,
> + ret, (uint64_t)hva);
> + return (ret);
> + }
> +
> + /* ...and then get the mapping. */
> + if (!pmap_extract(p->p_vmspace->vm_map.pmap, hva, &hpa)) {
> + printf("%s: failed to extract hpa for hva 0x%llx\n",
> + __func__, (uint64_t)hva);
> + return (EINVAL);
> + }
> + }
> +
> + /* Now we insert a RWX mapping into the guest's RVI pmap. */
> + ret = pmap_enter(vcpu->vc_parent->vm_pmap, pa, hpa,
> + PROT_READ | PROT_WRITE | PROT_EXEC, 0);
> + if (ret) {
> + printf("%s: pmap_enter failed pa=0x%llx, hpa=0x%llx\n",
> + __func__, (uint64_t)pa, (uint64_t)hpa);
> + }
> +
> return (ret);
> }
>
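
Both fault paths now follow the same shape: translate GPA -> HVA via
the memranges, make sure the process map has a backing page (faulting
one in if absent), then mirror the HPA into the nested pmap. Condensed
as a sketch, with error handling elided:

	hva = vmm_translate_gpa(vm, pa);	/* pa = trunc_page(gpa) */
	if (!pmap_extract(p->p_vmspace->vm_map.pmap, hva, &hpa)) {
		/* Fault in and wire a RW page in the process map... */
		uvm_fault_wire(&p->p_vmspace->vm_map, hva,
		    hva + PAGE_SIZE, PROT_READ | PROT_WRITE);
		/* ...then pick up the physical page it landed on. */
		pmap_extract(p->p_vmspace->vm_map.pmap, hva, &hpa);
	}
	/* The guest sees the same page RWX through EPT/RVI. */
	pmap_enter(vm->vm_pmap, pa, hpa,
	    PROT_READ | PROT_WRITE | PROT_EXEC, 0);

The vmx variant below does the same, but reloads the VMCS after
uvm_fault_wire() since the fault may sleep.
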
> @@ -4617,8 +4636,10 @@ svm_handle_np_fault(struct vcpu *vcpu)
> int
> vmx_fault_page(struct vcpu *vcpu, paddr_t gpa)
> {
> + struct proc *p = curproc;
> int fault_type, ret;
> - paddr_t pa = trunc_page(gpa);
> + paddr_t hpa, pa = trunc_page(gpa);
> + vaddr_t hva;
>
> fault_type = vmx_get_guest_faulttype();
> switch (fault_type) {
> @@ -4633,19 +4654,45 @@ vmx_fault_page(struct vcpu *vcpu, paddr_t gpa)
> break;
> }
>
> - /* We may sleep during uvm_fault_wire(), so reload VMCS. */
> - vcpu->vc_last_pcpu = curcpu();
> - ret = uvm_fault_wire(vcpu->vc_parent->vm_map, pa, pa + PAGE_SIZE,
> - PROT_READ | PROT_WRITE | PROT_EXEC);
> - if (vcpu_reload_vmcs_vmx(vcpu)) {
> - printf("%s: failed to reload vmcs\n", __func__);
> + hva = vmm_translate_gpa(vcpu->vc_parent, pa);
> + if (hva == 0) {
> + printf("%s: unable to translate gpa 0x%llx\n", __func__,
> + (uint64_t)pa);
> return (EINVAL);
> }
>
> - if (ret)
> - printf("%s: uvm_fault returns %d, GPA=0x%llx, rip=0x%llx\n",
> - __func__, ret, (uint64_t)gpa, vcpu->vc_gueststate.vg_rip);
> + /* If we don't already have a backing page... */
> + if (!pmap_extract(p->p_vmspace->vm_map.pmap, hva, &hpa)) {
> + /* ...fault a RW page into the p's address space... */
> + vcpu->vc_last_pcpu = curcpu(); /* uvm_fault may sleep. */
> + ret = uvm_fault_wire(&p->p_vmspace->vm_map, hva,
> + hva + PAGE_SIZE, PROT_READ | PROT_WRITE);
> + if (ret) {
> + printf("%s: uvm_fault failed %d hva=0x%llx\n", __func__,
> + ret, (uint64_t)hva);
> + return (ret);
> + }
> + if (vcpu_reload_vmcs_vmx(vcpu)) {
> + printf("%s: failed to reload vmcs\n", __func__);
> + return (EINVAL);
> + }
>
> + /* ...and then get the mapping. */
> + if (!pmap_extract(p->p_vmspace->vm_map.pmap, hva, &hpa)) {
> + printf("%s: failed to extract hpa for hva 0x%llx\n",
> + __func__, (uint64_t)hva);
> + return (EINVAL);
> + }
> + }
> +
> + /* Now we insert a RWX mapping into the guest's EPT pmap. */
> + ret = pmap_enter(vcpu->vc_parent->vm_pmap, pa, hpa,
> + PROT_READ | PROT_WRITE | PROT_EXEC, 0);
> + if (ret) {
> + printf("%s: pmap_enter failed pa=0x%llx, hpa=0x%llx\n",
> + __func__, (uint64_t)pa, (uint64_t)hpa);
> + }
> +
> return (ret);
> }
>
> @@ -4930,7 +4977,7 @@ vmx_load_pdptes(struct vcpu *vcpu)
> return (EINVAL);
> }
>
> - if (!pmap_extract(vcpu->vc_parent->vm_map->pmap, (vaddr_t)cr3,
> + if (!pmap_extract(vcpu->vc_parent->vm_pmap, (vaddr_t)cr3,
> (paddr_t *)&cr3_host_phys)) {
> DPRINTF("%s: nonmapped guest CR3, setting PDPTEs to 0\n",
> __func__);
> @@ -6459,7 +6506,7 @@ vmm_update_pvclock(struct vcpu *vcpu)
>
> if (vcpu->vc_pvclock_system_gpa & PVCLOCK_SYSTEM_TIME_ENABLE) {
> pvclock_gpa = vcpu->vc_pvclock_system_gpa & 0xFFFFFFFFFFFFFFF0;
> - if (!pmap_extract(vm->vm_map->pmap, pvclock_gpa, &pvclock_hpa))
> + if (!pmap_extract(vm->vm_pmap, pvclock_gpa, &pvclock_hpa))
> return (EINVAL);
> pvclock_ti = (void*) PMAP_DIRECT_MAP(pvclock_hpa);
>
> blob - 0a86ddbecd3fbd3627336902c7209f6d023bc3f0
> blob + bc03a35c8cf161d3e4164f46d159325b9ef34690
> --- sys/dev/vmm/vmm.c
> +++ sys/dev/vmm/vmm.c
> @@ -25,6 +25,9 @@
> #include <sys/malloc.h>
> #include <sys/signalvar.h>
>
> +#include <uvm/uvm_extern.h>
> +#include <uvm/uvm_aobj.h>
> +
> #include <machine/vmmvar.h>
>
> #include <dev/vmm/vmm.h>
> @@ -356,6 +359,9 @@ vm_create(struct vm_create_params *vcp, struct proc *p
> size_t memsize;
> struct vm *vm;
> struct vcpu *vcpu;
> + struct uvm_object *uao;
> + struct vm_mem_range *vmr;
> + unsigned int uvmflags = 0;
>
> memsize = vm_create_check_mem_ranges(vcp);
> if (memsize == 0)
> @@ -378,13 +384,58 @@ vm_create(struct vm_create_params *vcp, struct proc *p
> /* Instantiate and configure the new vm. */
> vm = pool_get(&vm_pool, PR_WAITOK | PR_ZERO);
>
> + /* Create the VM's identity. */
> vm->vm_creator_pid = p->p_p->ps_pid;
> + strncpy(vm->vm_name, vcp->vcp_name, VMM_MAX_NAME_LEN - 1);
> +
> + /* Create the pmap for nested paging. */
> + vm->vm_pmap = pmap_create();
> +
> + /* Initialize memory slots. */
> vm->vm_nmemranges = vcp->vcp_nmemranges;
> memcpy(vm->vm_memranges, vcp->vcp_memranges,
> vm->vm_nmemranges * sizeof(vm->vm_memranges[0]));
> - vm->vm_memory_size = memsize;
> - strncpy(vm->vm_name, vcp->vcp_name, VMM_MAX_NAME_LEN - 1);
> + vm->vm_memory_size = memsize; /* Calculated above. */
>
> + uvmflags = UVM_MAPFLAG(PROT_READ | PROT_WRITE, PROT_READ | PROT_WRITE,
> + MAP_INHERIT_NONE, MADV_NORMAL, UVM_FLAG_CONCEAL);
> + for (i = 0; i < vm->vm_nmemranges; i++) {
> + vmr = &vm->vm_memranges[i];
> +
> + if (vmr->vmr_type == VM_MEM_MMIO)
> + continue;
> +
> + uao = NULL;
> + uao = uao_create(vmr->vmr_size, UAO_FLAG_CANFAIL);
> + if (uao == NULL) {
> + printf("%s: failed to initialize memory slot\n",
> + __func__);
> + vm_teardown(&vm);
> + return (ENOMEM);
> + }
> +
> + /* Map the UVM aobj into the process. */
> + uao_reference(uao);
> + ret = uvm_map(&p->p_vmspace->vm_map, &vmr->vmr_va,
> + vmr->vmr_size, uao, 0, 0, uvmflags);
> + if (ret) {
> + printf("%s: uvm_map failed: %d\n", __func__, ret);
> + uao_detach(uao);
> + vm_teardown(&vm);
> + return (ENOMEM);
> + }
> + vm->vm_memory_slot[i] = uao;
> +
> + ret = uvm_map_immutable(&p->p_vmspace->vm_map, vmr->vmr_va,
> + vmr->vmr_va + vmr->vmr_size, 1);
> + if (ret) {
> + printf("%s: uvm_map_immutable failed: %d\n", __func__,
> + ret);
> + vm_teardown(&vm);
> + return (ret);
> + }
> + }
> +
> if (vm_impl_init(vm, p)) {
> printf("failed to init arch-specific features for vm %p\n", vm);
> vm_teardown(&vm);
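
A note on object ownership in the loop above, as I read the usual UVM
semantics (the diff does not spell this out): uao_create() hands back
one reference, kept in vm_memory_slot[i]; the uao_reference() before
uvm_map() is the reference the process mapping consumes, and it is
dropped again with uao_detach() if uvm_map() fails:

	/*
	 * Per slot:
	 *   uao_create()    -> ref held by vm->vm_memory_slot[i]
	 *   uao_reference() -> ref consumed by the uvm_map() mapping
	 * vm_teardown() detaches the slot's reference; the last unmap
	 * of the process mapping drops the other, freeing the pages.
	 */
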
> @@ -433,6 +484,10 @@ vm_create(struct vm_create_params *vcp, struct proc *p
> vmm_softc->vm_ct++;
> vmm_softc->vcpu_ct += vm->vm_vcpu_ct;
>
> + /* Update the userland process's view of guest memory. */
> + memcpy(vcp->vcp_memranges, vm->vm_memranges,
> + vcp->vcp_nmemranges * sizeof(vcp->vcp_memranges[0]));
> +
> rw_exit_write(&vmm_softc->vm_lock);
>
> return (0);
> @@ -482,19 +537,6 @@ vm_create_check_mem_ranges(struct vm_create_params *vc
> }
>
> /*
> - * Make sure that all virtual addresses are within the address
> - * space of the process and that they do not wrap around.
> - * Calling uvm_share() when creating the VM will take care of
> - * further checks.
> - */
> - if (vmr->vmr_va < VM_MIN_ADDRESS ||
> - vmr->vmr_va >= VM_MAXUSER_ADDRESS ||
> - vmr->vmr_size >= VM_MAXUSER_ADDRESS - vmr->vmr_va) {
> - DPRINTF("guest va not within range or wraps\n");
> - return (0);
> - }
> -
> - /*
> * Make sure that guest physical memory ranges do not overlap
> * and that they are ascending.
> */
> @@ -529,10 +571,11 @@ vm_create_check_mem_ranges(struct vm_create_params *vc
> void
> vm_teardown(struct vm **target)
> {
> - size_t nvcpu = 0;
> + size_t i, nvcpu = 0;
> + vaddr_t sva, eva;
> struct vcpu *vcpu, *tmp;
> struct vm *vm = *target;
> - struct vmspace *vm_vmspace;
> + struct uvm_object *uao;
>
> KERNEL_ASSERT_UNLOCKED();
>
> @@ -540,22 +583,29 @@ vm_teardown(struct vm **target)
> SLIST_FOREACH_SAFE(vcpu, &vm->vm_vcpu_list, vc_vcpu_link, tmp) {
> SLIST_REMOVE(&vm->vm_vcpu_list, vcpu, vcpu, vc_vcpu_link);
> vcpu_deinit(vcpu);
> -
> pool_put(&vcpu_pool, vcpu);
> nvcpu++;
> }
>
> - vm_impl_deinit(vm);
> + /* Remove guest mappings from our nested page tables. */
> + for (i = 0; i < vm->vm_nmemranges; i++) {
> + sva = vm->vm_memranges[i].vmr_gpa;
> + eva = sva + vm->vm_memranges[i].vmr_size - 1;
> + pmap_remove(vm->vm_pmap, sva, eva);
> + }
>
> - /* teardown guest vmspace */
> - KERNEL_LOCK();
> - vm_vmspace = vm->vm_vmspace;
> - if (vm_vmspace != NULL) {
> - vm->vm_vmspace = NULL;
> - uvmspace_free(vm_vmspace);
> + /* Release UVM anon objects backing our guest memory. */
> + for (i = 0; i < vm->vm_nmemranges; i++) {
> + uao = vm->vm_memory_slot[i];
> + vm->vm_memory_slot[i] = NULL;
> + if (uao != NULL)
> + uao_detach(uao);
> }
> - KERNEL_UNLOCK();
>
> + /* At this point, no UVM-managed pages should reference our pmap. */
> + pmap_destroy(vm->vm_pmap);
> + vm->vm_pmap = NULL;
> +
> pool_put(&vm_pool, vm);
> *target = NULL;
> }
> @@ -612,7 +662,7 @@ vm_get_info(struct vm_info_params *vip)
>
> out[i].vir_memory_size = vm->vm_memory_size;
> out[i].vir_used_size =
> - pmap_resident_count(vm->vm_map->pmap) * PAGE_SIZE;
> + pmap_resident_count(vm->vm_pmap) * PAGE_SIZE;
> out[i].vir_ncpus = vm->vm_vcpu_ct;
> out[i].vir_id = vm->vm_id;
> out[i].vir_creator_pid = vm->vm_creator_pid;
> @@ -793,9 +843,7 @@ vcpu_must_stop(struct vcpu *vcpu)
> * Return values:
> * 0: if successful
> * ENOENT: if the vm cannot be found by vm_find
> - * EPERM: if the vm cannot be accessed by the current process
> - * EINVAL: if the provide memory ranges fail checks
> - * ENOMEM: if uvm_share fails to find available memory in the destination map
> + * other errno on uvm_map or uvm_map_immutable failures
> */
> int
> vm_share_mem(struct vm_sharemem_params *vsp, struct proc *p)
> @@ -804,6 +852,8 @@ vm_share_mem(struct vm_sharemem_params *vsp, struct pr
> size_t i, n;
> struct vm *vm;
> struct vm_mem_range *src, *dst;
> + struct uvm_object *uao;
> + unsigned int uvmflags;
>
> ret = vm_find(vsp->vsp_vm_id, &vm);
> if (ret)
> @@ -830,33 +880,41 @@ vm_share_mem(struct vm_sharemem_params *vsp, struct pr
> if (src->vmr_size != dst->vmr_size)
> goto out;
>
> - /* Check our intended destination is page-aligned. */
> - if (dst->vmr_va & PAGE_MASK)
> + /* The virtual addresses will be chosen by uvm_map(). */
> + if (vsp->vsp_va[i] != 0)
> goto out;
> }
>
> /*
> - * Share each range individually with the calling process. We do
> - * not need PROC_EXEC as the emulated devices do not need to execute
> - * instructions from guest memory.
> + * Share each UVM aobj with the calling process. We do not
> + * need PROC_EXEC as the emulated devices do not execute
> + * instructions from guest memory.
> */
> + uvmflags = UVM_MAPFLAG(PROT_READ | PROT_WRITE, PROT_READ | PROT_WRITE,
> + MAP_INHERIT_NONE, MADV_NORMAL, UVM_FLAG_CONCEAL);
> for (i = 0; i < n; i++) {
> - src = &vm->vm_memranges[i];
> dst = &vsp->vsp_memranges[i];
> -
> - /* Skip MMIO range. */
> - if (src->vmr_type == VM_MEM_MMIO)
> + if (dst->vmr_type == VM_MEM_MMIO)
> continue;
>
> - DPRINTF("sharing gpa=0x%lx for pid %d @ va=0x%lx\n",
> - src->vmr_gpa, p->p_p->ps_pid, dst->vmr_va);
> - ret = uvm_share(&p->p_vmspace->vm_map, dst->vmr_va,
> - PROT_READ | PROT_WRITE, vm->vm_map, src->vmr_gpa,
> - src->vmr_size);
> + uao = vm->vm_memory_slot[i];
> + KASSERT(uao != NULL);
> +
> + uao_reference(uao);
> + ret = uvm_map(&p->p_p->ps_vmspace->vm_map, &vsp->vsp_va[i],
> + dst->vmr_size, uao, 0, 0, uvmflags);
> if (ret) {
> - printf("%s: uvm_share failed (%d)\n", __func__, ret);
> - break;
> + uao_detach(uao);
> + printf("%s: uvm_map failed: %d\n", __func__, ret);
> + goto out;
> }
> +
> + ret = uvm_map_immutable(&p->p_p->ps_vmspace->vm_map,
> + vsp->vsp_va[i], vsp->vsp_va[i] + dst->vmr_size, 1);
> + if (ret) {
> + printf("%s: uvm_map_immutable failed: %d\n",
> + __func__, ret);
> + goto out;
> + }
> }
> ret = 0;
> out:
> blob - ca5a152f550c3795714128f1a0d764dd2e593e59
> blob + 46fe5ff93413a329065196ad59343759c44aae33
> --- sys/dev/vmm/vmm.h
> +++ sys/dev/vmm/vmm.h
> @@ -94,6 +94,9 @@ struct vm_sharemem_params {
> uint32_t vsp_vm_id;
> size_t vsp_nmemranges;
> struct vm_mem_range vsp_memranges[VMM_MAX_MEM_RANGES];
> +
> + /* Output parameters from VMM_IOC_SHAREMEM */
> + vaddr_t vsp_va[VMM_MAX_MEM_RANGES];
> };
>
> struct vm_run_params {
> @@ -137,7 +140,7 @@ struct vm_rwvmparams_params {
> #define VMM_IOC_READVMPARAMS _IOWR('V', 9, struct vm_rwvmparams_params)
> /* Set VM params */
> #define VMM_IOC_WRITEVMPARAMS _IOW('V', 10, struct vm_rwvmparams_params)
> -#define VMM_IOC_SHAREMEM _IOW('V', 11, struct vm_sharemem_params)
> +#define VMM_IOC_SHAREMEM _IOWR('V', 11, struct vm_sharemem_params)
>
> #ifdef _KERNEL
>
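
With the ioctl now _IOWR, the kernel returns the va's it picked through
vsp_va[]. A minimal userland sketch (assuming a populated struct
vm_sharemem_params vsp and an open vmm(4) fd; use_mapping() is a
placeholder, compare remap_guest_mem() below):

	if (ioctl(vmm_fd, VMM_IOC_SHAREMEM, &vsp) == -1)
		err(1, "VMM_IOC_SHAREMEM");
	for (i = 0; i < vsp.vsp_nmemranges; i++) {
		if (vsp.vsp_memranges[i].vmr_type == VM_MEM_MMIO)
			continue;
		/* vsp.vsp_va[i] is the new shared mapping of slot i. */
		use_mapping(vsp.vsp_va[i], vsp.vsp_memranges[i].vmr_size);
	}
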
> @@ -168,14 +171,17 @@ enum {
> * V vmm_softc's vm_lock
> */
> struct vm {
> - struct vmspace *vm_vmspace; /* [K] */
> - vm_map_t vm_map; /* [K] */
> + pmap_t vm_pmap; /* [r] */
> +
> uint32_t vm_id; /* [I] */
> pid_t vm_creator_pid; /* [I] */
> +
> size_t vm_nmemranges; /* [I] */
> size_t vm_memory_size; /* [I] */
> - char vm_name[VMM_MAX_NAME_LEN];
> struct vm_mem_range vm_memranges[VMM_MAX_MEM_RANGES];
> + struct uvm_object *vm_memory_slot[VMM_MAX_MEM_RANGES]; /* [I] */
> +
> + char vm_name[VMM_MAX_NAME_LEN];
> struct refcnt vm_refcnt; /* [a] */
>
> struct vcpu_head vm_vcpu_list; /* [v] */
> blob - e399c0c0439a02117bc53c62cbb59c950b176773
> blob + 30b12b0e30936bddf75484aeedf60a34b1296d89
> --- usr.sbin/vmd/vm.c
> +++ usr.sbin/vmd/vm.c
> @@ -49,7 +49,6 @@ static void vm_dispatch_vmm(int, short, void *);
> static void *event_thread(void *);
> static void *vcpu_run_loop(void *);
> static int vmm_create_vm(struct vmd_vm *);
> -static int alloc_guest_mem(struct vmd_vm *);
> static int send_vm(int, struct vmd_vm *);
> static int dump_vmr(int , struct vm_mem_range *);
> static int dump_mem(int, struct vmd_vm *);
> @@ -201,7 +200,8 @@ start_vm(struct vmd_vm *vm, int fd)
> if (!(vm->vm_state & VM_STATE_RECEIVED))
> create_memory_map(vcp);
>
> - ret = alloc_guest_mem(vm);
> + /* Create the vm in vmm(4). */
> + ret = vmm_create_vm(vm);
> if (ret) {
> struct rlimit lim;
> char buf[FMT_SCALED_STRSIZE];
> @@ -209,15 +209,11 @@ start_vm(struct vmd_vm *vm, int fd)
> if (fmt_scaled(lim.rlim_cur, buf) == 0)
> fatalx("could not allocate guest memory (data "
> "limit is %s)", buf);
> + } else {
> + errno = ret;
> + log_warn("could not create vm");
> }
> - errno = ret;
> - log_warn("could not allocate guest memory");
> - return (ret);
> - }
>
> - /* We've allocated guest memory, so now create the vm in vmm(4). */
> - ret = vmm_create_vm(vm);
> - if (ret) {
> /* Let the vmm process know we failed by sending a 0 vm id. */
> vcp->vcp_id = 0;
> atomicio(vwrite, fd, &vcp->vcp_id, sizeof(vcp->vcp_id));
> @@ -756,62 +752,6 @@ vcpu_reset(uint32_t vmid, uint32_t vcpu_id, struct vcp
> }
>
> /*
> - * alloc_guest_mem
> - *
> - * Allocates memory for the guest.
> - * Instead of doing a single allocation with one mmap(), we allocate memory
> - * separately for every range for the following reasons:
> - * - ASLR for the individual ranges
> - * - to reduce memory consumption in the UVM subsystem: if vmm(4) had to
> - * map the single mmap'd userspace memory to the individual guest physical
> - * memory ranges, the underlying amap of the single mmap'd range would have
> - * to allocate per-page reference counters. The reason is that the
> - * individual guest physical ranges would reference the single mmap'd region
> - * only partially. However, if every guest physical range has its own
> - * corresponding mmap'd userspace allocation, there are no partial
> - * references: every guest physical range fully references an mmap'd
> - * range => no per-page reference counters have to be allocated.
> - *
> - * Return values:
> - * 0: success
> - * !0: failure - errno indicating the source of the failure
> - */
> -int
> -alloc_guest_mem(struct vmd_vm *vm)
> -{
> - void *p;
> - int ret = 0;
> - size_t i, j;
> - struct vm_create_params *vcp = &vm->vm_params.vmc_params;
> - struct vm_mem_range *vmr;
> -
> - for (i = 0; i < vcp->vcp_nmemranges; i++) {
> - vmr = &vcp->vcp_memranges[i];
> -
> - /*
> - * We only need R/W as userland. vmm(4) will use R/W/X in its
> - * mapping.
> - *
> - * We must use MAP_SHARED so emulated devices will be able
> - * to generate shared mappings.
> - */
> - p = mmap(NULL, vmr->vmr_size, PROT_READ | PROT_WRITE,
> - MAP_ANON | MAP_CONCEAL | MAP_SHARED, -1, 0);
> - if (p == MAP_FAILED) {
> - ret = errno;
> - for (j = 0; j < i; j++) {
> - vmr = &vcp->vcp_memranges[j];
> - munmap((void *)vmr->vmr_va, vmr->vmr_size);
> - }
> - return (ret);
> - }
> - vmr->vmr_va = (vaddr_t)p;
> - }
> -
> - return (ret);
> -}
> -
> -/*
> * vmm_create_vm
> *
> * Requests vmm(4) to create a new VM using the supplied creation
> @@ -1384,87 +1324,35 @@ vm_pipe_recv(struct vm_dev_pipe *p)
> }
>
> /*
> - * Re-map the guest address space using vmm(4)'s VMM_IOC_SHARE
> + * Re-map the guest address space using vmm(4)'s VMM_IOC_SHAREMEM
> *
> - * Returns 0 on success, non-zero in event of failure.
> + * Returns 0 on success or an errno in event of failure.
> */
> int
> remap_guest_mem(struct vmd_vm *vm, int vmm_fd)
> {
> - struct vm_create_params *vcp;
> - struct vm_mem_range *vmr;
> + size_t i;
> + struct vm_create_params *vcp = &vm->vm_params.vmc_params;
> struct vm_sharemem_params vsp;
> - size_t i, j;
> - void *p = NULL;
> - int ret;
>
> if (vm == NULL)
> - return (1);
> + return (EINVAL);
>
> - vcp = &vm->vm_params.vmc_params;
> -
> - /*
> - * Initialize our VM shared memory request using our original
> - * creation parameters. We'll overwrite the va's after mmap(2).
> - */
> + /* Initialize using our original creation parameters. */
> memset(&vsp, 0, sizeof(vsp));
> vsp.vsp_nmemranges = vcp->vcp_nmemranges;
> vsp.vsp_vm_id = vcp->vcp_id;
> memcpy(&vsp.vsp_memranges, &vcp->vcp_memranges,
> sizeof(vsp.vsp_memranges));
>
> - /*
> - * Use mmap(2) to identify virtual address space for our mappings.
> - */
> - for (i = 0; i < VMM_MAX_MEM_RANGES; i++) {
> - if (i < vsp.vsp_nmemranges) {
> - vmr = &vsp.vsp_memranges[i];
> -
> - /* Ignore any MMIO ranges. */
> - if (vmr->vmr_type == VM_MEM_MMIO) {
> - vmr->vmr_va = 0;
> - vcp->vcp_memranges[i].vmr_va = 0;
> - continue;
> - }
> -
> - /* Make initial mappings for the memrange. */
> - p = mmap(NULL, vmr->vmr_size, PROT_READ, MAP_ANON, -1,
> - 0);
> - if (p == MAP_FAILED) {
> - ret = errno;
> - log_warn("%s: mmap", __func__);
> - for (j = 0; j < i; j++) {
> - vmr = &vcp->vcp_memranges[j];
> - munmap((void *)vmr->vmr_va,
> - vmr->vmr_size);
> - }
> - return (ret);
> - }
> - vmr->vmr_va = (vaddr_t)p;
> - vcp->vcp_memranges[i].vmr_va = vmr->vmr_va;
> - }
> - }
> -
> - /*
> - * munmap(2) now that we have va's and ranges that don't overlap. vmm
> - * will use the va's and sizes to recreate the mappings for us.
> - */
> - for (i = 0; i < vsp.vsp_nmemranges; i++) {
> - vmr = &vsp.vsp_memranges[i];
> - if (vmr->vmr_type == VM_MEM_MMIO)
> - continue;
> - if (munmap((void*)vmr->vmr_va, vmr->vmr_size) == -1)
> - fatal("%s: munmap", __func__);
> - }
> -
> - /*
> - * Ask vmm to enter the shared mappings for us. They'll point
> - * to the same host physical memory, but will have a randomized
> - * virtual address for the calling process.
> - */
> + /* Ask vmm(4) to enter a shared mapping to guest memory. */
> if (ioctl(vmm_fd, VMM_IOC_SHAREMEM, &vsp) == -1)
> return (errno);
>
> + /* Update with the location of the new mappings. */
> + for (i = 0; i < vsp.vsp_nmemranges; i++)
> + vcp->vcp_memranges[i].vmr_va = vsp.vsp_va[i];
> +
> return (0);
> }