UVM amap & vmmap rbtree
The use of an rb-tree in the current vmmap implementation is responsible
for the poor page fault performance we're all experiencing.
To reduce wasted memory in UVM, large anonymous segments are chunked into
64k pieces. For example, right after starting firefox on this laptop, the
main process has the following:
$ doas procmap -p 9434|wc -l
6849
$ doas procmap -p 9434|grep " 64k"|wc -l
457
Chunking a segment via UVM_MAP_CLIP_START/END was O(1) before we switched
to an rbtree; now it requires rebalancing the tree. This is not only bad
for single-threaded performance, it completely kills multi-threaded
performance because this operation requires an exclusive lock.
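To make the cost concrete, here is a simplified sketch of what splitting an
entry implies with the rbtree; this is not the actual uvm_map_clip_start()
code, the helper names and the missing offset/amap adjustments are
approximations:

/*
 * Simplified sketch, not the real uvm_map_clip_start(): splitting a map
 * entry at `addr' means allocating a second entry and inserting it into
 * the per-map rbtree, under the exclusive vm_map lock.
 */
void
clip_sketch(struct vm_map *map, struct vm_map_entry *entry, vaddr_t addr)
{
	struct vm_map_entry *new;

	/* Second half of the split: copy the entry, adjust boundaries. */
	new = uvm_mapent_alloc(map, 0);
	*new = *entry;
	new->start = addr;
	entry->end = addr;

	/*
	 * With the old linked list this was an O(1) insertion after
	 * `entry'; with the rbtree every split is an O(log n) insert
	 * plus rebalancing.
	 */
	if (RBT_INSERT(uvm_map_addr, &map->addr, new) != NULL)
		panic("clip_sketch: duplicate entry");
}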
The diff below allows multi-threaded processes to perform CoW in parallel.
Sadly it doesn't gain much, because chunking a segment still needs an
exclusive lock.
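In short, the fault path changes to the pattern below (condensed from the
uvm_fault_check() hunk further down, with the write-access test and the
counters trimmed): take the map lock shared, upgrade to exclusive only when
the entry has to be chunked, and let amap_set() serialize the needs_copy
update between siblings.

/*
 * Condensed sketch of the new uvm_fault_check() flow from the diff
 * below; the write-access test and some error handling are omitted.
 */
int
fault_needscopy_sketch(struct uvm_faultinfo *ufi)
{
	boolean_t write_locked = FALSE;
	boolean_t canchunk;
	vsize_t len;
	int error;

lookup:
	/* Start with a shared (read) lock on the map. */
	if (uvmfault_lookup(ufi, write_locked) == FALSE)
		return EFAULT;

	if (UVM_ET_ISNEEDSCOPY(ufi->entry)) {
		len = ufi->entry->end - ufi->entry->start;
		canchunk = (atop(len) >= UVM_AMAP_LARGE) &&
		    !UVM_ET_ISSTACK(ufi->entry);

		/* Chunking clips the entry, which needs the write lock. */
		if (canchunk && !write_locked) {
			write_locked = TRUE;
			if (!vm_map_upgrade(ufi->map)) {
				uvmfault_unlockmaps(ufi, FALSE);
				goto lookup;
			}
		}

		/*
		 * The copy itself runs under the shared lock; A_LOCK
		 * tells amap_set() to serialize the needs_copy update
		 * between siblings on ps_mtx.
		 */
		error = amap_copy(ufi->map, ufi->entry, M_NOWAIT|A_LOCK,
		    canchunk, ufi->orig_rvaddr, ufi->orig_rvaddr + 1);
		if (error) {
			uvmfault_unlockmaps(ufi, write_locked);
			uvm_wait("fltamapcopy");
			return ERESTART;
		}
	}

	if (write_locked)
		vm_map_downgrade(ufi->map);

	return 0;
}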
Clearly segment chunking is not compatible with the use of an rbtree to
store addresses. Getting rid of chunking would mean following the Solaris
way and allocating a page-table for every segment, or following the
Linux/FreeBSD way and going back to a tree of anonymous objects. Do you
see another option?
On the other hand, why did we pick an rbtree for vmmap? Is it really
needed? I doubt it makes sense to keep this data structure as memory
spaces become bigger and bigger.
Comments? Ideas? Inputs?
diff --git sys/uvm/uvm_amap.c sys/uvm/uvm_amap.c
index cc9a5a12587..0f819647d1a 100644
--- sys/uvm/uvm_amap.c
+++ sys/uvm/uvm_amap.c
@@ -38,6 +38,7 @@
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/pool.h>
+#include <sys/proc.h>
#include <sys/atomic.h>
#include <uvm/uvm.h>
@@ -68,6 +69,8 @@ static char amap_small_pool_names[UVM_AMAP_CHUNK][9];
static struct vm_amap *amap_alloc1(int, int, int);
static inline void amap_list_insert(struct vm_amap *);
static inline void amap_list_remove(struct vm_amap *);
+int amap_set(struct vm_map *, struct vm_map_entry *, struct vm_amap *,
+ boolean_t);
struct vm_amap_chunk *amap_chunk_get(struct vm_amap *, int, int, int);
void amap_chunk_free(struct vm_amap *, struct vm_amap_chunk *);
@@ -511,12 +514,50 @@ amap_wipeout(struct vm_amap *amap)
amap_free(amap);
}
+
+/*
+ * amap_set: ensure that a map entry's "needs_copy" flag is false
+ * by associating it with `amap' if necessary.
+ *
+ * `needlock' is TRUE only if called from the fault handler with a shared
+ * vm_map lock.
+ */
+int
+amap_set(struct vm_map *map, struct vm_map_entry *entry, struct vm_amap *amap,
+ boolean_t needlock)
+{
+ int locked = 0, rv = -1;
+
+ if (needlock) {
+ if (P_HASSIBLING(curproc)) {
+ mtx_enter(&curproc->p_p->ps_mtx);
+ locked = 1;
+ }
+ } else
+ vm_map_assert_wrlock(map);
+
+ if (UVM_ET_ISNEEDSCOPY(entry)) {
+ /* Install new amap. */
+ if (amap != NULL) {
+ entry->aref.ar_pageoff = 0;
+ entry->aref.ar_amap = amap;
+ }
+ entry->etype &= ~UVM_ET_NEEDSCOPY;
+ rv = 0;
+ }
+ if (locked)
+ mtx_leave(&curproc->p_p->ps_mtx);
+
+ return rv;
+}
+
/*
- * amap_copy: ensure that a map entry's "needs_copy" flag is false
- * by copying the amap if necessary.
+ * amap_copy: if necessary, copy the amap of an entry with "needs_copy" flag
*
- * => an entry with a null amap pointer will get a new (blank) one.
- * => the map that the map entry belongs to must be locked by caller.
+ * => return a new (blank) amap if `entry' has a null amap pointer.
+ return the existing amap pointer if `entry' has a non-shared amap.
+ return a copy of the existing amap pointer otherwise.
+ * => the map that entry belongs to must be locked by caller.
* => the amap currently attached to "entry" (if any) must be unlocked.
* => if canchunk is true, then we may clip the entry into a chunk
* => "startva" and "endva" are used only if canchunk is true. they are
@@ -524,24 +565,32 @@ amap_wipeout(struct vm_amap *amap)
* know you are going to need to allocate amaps for, there is no point
* in allowing that to be chunked)
*/
-
-void
+int
amap_copy(struct vm_map *map, struct vm_map_entry *entry, int waitf,
boolean_t canchunk, vaddr_t startva, vaddr_t endva)
{
- struct vm_amap *amap, *srcamap;
+ struct vm_amap *amap = NULL, *srcamap;
int slots, lcv, lazyalloc = 0;
vaddr_t chunksize;
- int i, j, k, n, srcslot;
+ int i, j, k, n, srcslot, pageoff;
struct vm_amap_chunk *chunk = NULL, *srcchunk = NULL;
struct vm_anon *anon;
+ boolean_t needlock;
+ vsize_t len;
KASSERT(map != kernel_map); /* we use sleeping locks */
+ needlock = waitf & A_LOCK;
+ waitf &= ~A_LOCK;
+
+ len = entry->end - entry->start;
+ srcamap = entry->aref.ar_amap;
+ pageoff = entry->aref.ar_pageoff;
+
/*
* Is there an amap to copy? If not, create one.
*/
- if (entry->aref.ar_amap == NULL) {
+ if (srcamap == NULL) {
/*
* Check to see if we have a large amap that we can
* chunk. We align startva/endva to chunk-sized
@@ -551,7 +600,7 @@ amap_copy(struct vm_map *map, struct vm_map_entry *entry, int waitf,
* that makes it grow or shrink dynamically with
* the number of slots.
*/
- if (atop(entry->end - entry->start) >= UVM_AMAP_LARGE) {
+ if (atop(len) >= UVM_AMAP_LARGE) {
if (canchunk) {
/* convert slots to bytes */
chunksize = UVM_AMAP_CHUNK << PAGE_SHIFT;
@@ -565,12 +614,20 @@ amap_copy(struct vm_map *map, struct vm_map_entry *entry, int waitf,
lazyalloc = 1;
}
- entry->aref.ar_pageoff = 0;
- entry->aref.ar_amap = amap_alloc(entry->end - entry->start,
- waitf, lazyalloc);
- if (entry->aref.ar_amap != NULL)
- entry->etype &= ~UVM_ET_NEEDSCOPY;
- return;
+ /*
+ * Allocate an initialised amap.
+ * Note: we must update the length after clipping.
+ */
+ len = entry->end - entry->start;
+ amap = amap_alloc(len, waitf, lazyalloc);
+ if (amap == NULL)
+ return ENOMEM;
+ if (amap_set(map, entry, amap, needlock)) {
+ /* Lost the race against sibling. */
+ KASSERT(needlock);
+ amap_unref(amap, 0, atop(len), 0);
+ }
+ return 0;
}
/*
@@ -581,31 +638,30 @@ amap_copy(struct vm_map *map, struct vm_map_entry *entry, int waitf,
* to the amap (via our locked map). If the value is greater than
* one, then allocate amap and re-check the value.
*/
- if (entry->aref.ar_amap->am_ref == 1) {
- entry->etype &= ~UVM_ET_NEEDSCOPY;
- return;
+ if (srcamap->am_ref == 1) {
+ amap_set(map, entry, NULL, needlock);
+ return 0;
}
/*
* Allocate a new amap (note: not initialised, etc).
*/
- AMAP_B2SLOT(slots, entry->end - entry->start);
- if (!UVM_AMAP_SMALL(entry->aref.ar_amap) &&
- entry->aref.ar_amap->am_hashshift != 0)
+ AMAP_B2SLOT(slots, len);
+ if (!UVM_AMAP_SMALL(srcamap) && srcamap->am_hashshift != 0)
lazyalloc = 1;
amap = amap_alloc1(slots, waitf, lazyalloc);
if (amap == NULL)
- return;
- srcamap = entry->aref.ar_amap;
+ return ENOMEM;
- /*
- * Make the new amap share the source amap's lock, and then lock
- * both.
- */
- amap->am_lock = srcamap->am_lock;
- rw_obj_hold(amap->am_lock);
-
- amap_lock(srcamap, RW_WRITE);
+ amap_lock(srcamap, RW_READ);
+ /* Re-check the source amap. If it has changed we lost the race. */
+ if (entry->aref.ar_amap != srcamap) {
+ amap_unlock(srcamap);
+ /* Destroy the new (unused) amap. */
+ amap->am_ref--;
+ amap_free(amap);
+ return 0;
+ }
/*
* Re-check the reference count with the lock held. If it has
@@ -613,14 +669,21 @@ amap_copy(struct vm_map *map, struct vm_map_entry *entry, int waitf,
*/
if (srcamap->am_ref == 1) {
/* Just take over the existing amap. */
- entry->etype &= ~UVM_ET_NEEDSCOPY;
+ amap_set(map, entry, NULL, needlock);
amap_unlock(srcamap);
/* Destroy the new (unused) amap. */
amap->am_ref--;
amap_free(amap);
- return;
+ return 0;
}
+ /*
+ * Make the new amap share the source amap's lock, and then lock
+ * both.
+ */
+ amap->am_lock = srcamap->am_lock;
+ rw_obj_hold(amap->am_lock);
+
/*
* Copy the slots.
*/
@@ -645,8 +708,8 @@ amap_copy(struct vm_map *map, struct vm_map_entry *entry, int waitf,
amap_unlock(srcamap);
/* Destroy the new amap. */
amap->am_ref--;
- amap_free(amap);
- return;
+ amap_free(amap); /* XXX leaks anon. */
+ return ENOMEM;
}
for (k = 0; k < n; i++, j++, k++) {
@@ -662,20 +725,7 @@ amap_copy(struct vm_map *map, struct vm_map_entry *entry, int waitf,
}
}
- /*
- * Drop our reference to the old amap (srcamap) and unlock.
- * Since the reference count on srcamap is greater than one,
- * (we checked above), it cannot drop to zero while it is locked.
- */
- srcamap->am_ref--;
- KASSERT(srcamap->am_ref > 0);
-
- if (srcamap->am_ref == 1 && (srcamap->am_flags & AMAP_SHARED) != 0)
- srcamap->am_flags &= ~AMAP_SHARED; /* clear shared flag */
- if (srcamap->am_ppref && srcamap->am_ppref != PPREF_NONE) {
- amap_pp_adjref(srcamap, entry->aref.ar_pageoff,
- (entry->end - entry->start) >> PAGE_SHIFT, -1);
- }
+ KASSERT(srcamap->am_ref > 1);
/*
* If we referenced any anons, then share the source amap's lock.
@@ -691,14 +741,18 @@ amap_copy(struct vm_map *map, struct vm_map_entry *entry, int waitf,
if (amap->am_lock == NULL)
amap_lock_alloc(amap);
- /*
- * Install new amap.
- */
- entry->aref.ar_pageoff = 0;
- entry->aref.ar_amap = amap;
- entry->etype &= ~UVM_ET_NEEDSCOPY;
-
amap_list_insert(amap);
+
+ if (amap_set(map, entry, amap, TRUE)) {
+ /* Lost the race against sibling. */
+ KASSERT(needlock);
+ amap_unref(amap, 0, atop(len), 0);
+ return 0;
+ }
+
+ /* Drop our reference to the old amap (srcamap). */
+ amap_unref(srcamap, pageoff, atop(len), 0);
+ return 0;
}
/*
diff --git sys/uvm/uvm_amap.h sys/uvm/uvm_amap.h
index 3097e5b3a48..c7ca5786d70 100644
--- sys/uvm/uvm_amap.h
+++ sys/uvm/uvm_amap.h
@@ -70,7 +70,7 @@ int amap_add(struct vm_aref *, vaddr_t, struct vm_anon *,
/* allocate a new amap */
struct vm_amap *amap_alloc(vaddr_t, int, int);
/* clear amap needs-copy flag */
-void amap_copy(vm_map_t, vm_map_entry_t, int, boolean_t, vaddr_t,
+int amap_copy(vm_map_t, vm_map_entry_t, int, boolean_t, vaddr_t,
vaddr_t);
/* resolve all COW faults now */
void amap_cow_now(vm_map_t, vm_map_entry_t);
@@ -102,6 +102,11 @@ boolean_t amap_swap_off(int, int);
#define AMAP_REFALL 0x2 /* amap_ref: reference entire amap */
#define AMAP_SWAPOFF 0x4 /* amap_swap_off() is in progress */
+/*
+ * amap_copy() flags
+ */
+#define A_LOCK 0x10 /* amap_set() needs a lock */
+
#endif /* _KERNEL */
/**********************************************************************/
diff --git sys/uvm/uvm_fault.c sys/uvm/uvm_fault.c
index b3b4d8aecc4..ac6da7f4e7e 100644
--- sys/uvm/uvm_fault.c
+++ sys/uvm/uvm_fault.c
@@ -158,7 +158,6 @@ static struct uvm_advice uvmadvice[MADV_MASK + 1];
/*
* private prototypes
*/
-static void uvmfault_amapcopy(struct uvm_faultinfo *);
static inline void uvmfault_anonflush(struct vm_anon **, int);
void uvmfault_unlockmaps(struct uvm_faultinfo *, boolean_t);
void uvmfault_update_stats(struct uvm_faultinfo *);
@@ -214,48 +213,6 @@ uvmfault_init(void)
}
}
-/*
- * uvmfault_amapcopy: clear "needs_copy" in a map.
- *
- * => called with VM data structures unlocked (usually, see below)
- * => we get a write lock on the maps and clear needs_copy for a VA
- * => if we are out of RAM we sleep (waiting for more)
- */
-static void
-uvmfault_amapcopy(struct uvm_faultinfo *ufi)
-{
- for (;;) {
- /*
- * no mapping? give up.
- */
- if (uvmfault_lookup(ufi, TRUE) == FALSE)
- return;
-
- /*
- * copy if needed.
- */
- if (UVM_ET_ISNEEDSCOPY(ufi->entry))
- amap_copy(ufi->map, ufi->entry, M_NOWAIT,
- UVM_ET_ISSTACK(ufi->entry) ? FALSE : TRUE,
- ufi->orig_rvaddr, ufi->orig_rvaddr + 1);
-
- /*
- * didn't work? must be out of RAM. unlock and sleep.
- */
- if (UVM_ET_ISNEEDSCOPY(ufi->entry)) {
- uvmfault_unlockmaps(ufi, TRUE);
- uvm_wait("fltamapcopy");
- continue;
- }
-
- /*
- * got it! unlock and return.
- */
- uvmfault_unlockmaps(ufi, TRUE);
- return;
- }
- /*NOTREACHED*/
-}
/*
* uvmfault_anonget: get data in an anon into a non-busy, non-released
@@ -659,13 +616,8 @@ uvm_fault(vm_map_t orig_map, vaddr_t vaddr, vm_fault_t fault_type,
flt.access_type = access_type;
flt.narrow = FALSE; /* assume normal fault for now */
flt.wired = FALSE; /* assume non-wired fault for now */
-#if notyet
flt.upper_lock_type = RW_READ;
flt.lower_lock_type = RW_READ; /* shared lock for now */
-#else
- flt.upper_lock_type = RW_WRITE;
- flt.lower_lock_type = RW_WRITE; /* exclusive lock for now */
-#endif
error = ERESTART;
while (error == ERESTART) { /* ReFault: */
@@ -731,14 +683,15 @@ uvm_fault_check(struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
struct vm_amap *amap;
struct uvm_object *uobj;
int nback, nforw;
+ boolean_t write_locked = FALSE;
/*
* lookup and lock the maps
*/
- if (uvmfault_lookup(ufi, FALSE) == FALSE) {
+lookup:
+ if (uvmfault_lookup(ufi, write_locked) == FALSE) {
return EFAULT;
}
- /* locked: maps(read) */
#ifdef DIAGNOSTIC
if ((ufi->map->flags & VM_MAP_PAGEABLE) == 0)
@@ -776,11 +729,35 @@ uvm_fault_check(struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
if (UVM_ET_ISNEEDSCOPY(ufi->entry)) {
if ((flt->access_type & PROT_WRITE) ||
(ufi->entry->object.uvm_obj == NULL)) {
- /* need to clear */
- uvmfault_unlockmaps(ufi, FALSE);
- uvmfault_amapcopy(ufi);
+ boolean_t canchunk = FALSE;
+ vsize_t len;
+ int error;
+
+ /* large amaps are chunked to reduce fragmentation. */
+ len = ufi->entry->end - ufi->entry->start;
+ if ((atop(len) >= UVM_AMAP_LARGE) &&
+ !UVM_ET_ISSTACK(ufi->entry))
+ canchunk = TRUE;
+
+ /* chunking `ufi->entry' requires write lock */
+ if (canchunk && !write_locked) {
+ write_locked = TRUE;
+ if (!vm_map_upgrade(ufi->map)) {
+ uvmfault_unlockmaps(ufi, FALSE);
+ goto lookup;
+ }
+ }
+
+ error = amap_copy(ufi->map, ufi->entry, M_NOWAIT|A_LOCK,
+ canchunk, ufi->orig_rvaddr, ufi->orig_rvaddr + 1);
+ /* didn't work? must be out of RAM. */
+ if (error) {
+ uvmfault_unlockmaps(ufi, write_locked);
+ uvm_wait("fltamapcopy");
+ return ERESTART;
+ }
+
counters_inc(uvmexp_counters, flt_amcopy);
- return ERESTART;
} else {
/*
* ensure that we pmap_enter page R/O since
@@ -790,6 +767,12 @@ uvm_fault_check(struct uvm_faultinfo *ufi, struct uvm_faultctx *flt,
}
}
+ /* We lost the "needs_copy" race against a sibling. */
+ if (write_locked) {
+ vm_map_downgrade(ufi->map);
+ write_locked = FALSE;
+ }
+
/*
* identify the players
*/
diff --git sys/uvm/uvm_map.c sys/uvm/uvm_map.c
index bbbf4455d09..b2a83d294e6 100644
--- sys/uvm/uvm_map.c
+++ sys/uvm/uvm_map.c
@@ -4247,15 +4247,10 @@ uvm_map_extract(struct vm_map *srcmap, vaddr_t start, vsize_t len,
for (entry = first; entry != NULL && entry->start < end;
entry = RBT_NEXT(uvm_map_addr, entry)) {
if (UVM_ET_ISNEEDSCOPY(entry))
- amap_copy(srcmap, entry, M_NOWAIT,
+ error = amap_copy(srcmap, entry, M_NOWAIT,
UVM_ET_ISSTACK(entry) ? FALSE : TRUE, start, end);
- if (UVM_ET_ISNEEDSCOPY(entry)) {
- /*
- * amap_copy failure
- */
- error = ENOMEM;
+ if (error)
goto fail;
- }
}
/* Lock destination map (kernel_map). */
@@ -5217,6 +5212,56 @@ vm_map_unlock_read_ln(struct vm_map *map, char *file, int line)
mtx_leave(&map->mtx);
}
+boolean_t
+vm_map_upgrade_ln(struct vm_map *map, char *file, int line)
+{
+ int rv;
+
+ if (map->flags & VM_MAP_INTRSAFE) {
+ MUTEX_ASSERT_LOCKED(&map->mtx);
+ } else {
+ struct proc *busy;
+
+ mtx_enter(&map->flags_lock);
+ busy = map->busy;
+ mtx_leave(&map->flags_lock);
+ if (busy != NULL && busy != curproc)
+ return FALSE;
+
+ rv = rw_enter(&map->lock, RW_UPGRADE|RW_NOSLEEP);
+ if (rv != 0)
+ return FALSE;
+ }
+
+ map->timestamp++;
+ LPRINTF(("map upgrade: %p (at %s %d)\n", map, file, line));
+ uvm_tree_sanity(map, file, line);
+ uvm_tree_size_chk(map, file, line);
+
+ return TRUE;
+}
+
+boolean_t
+vm_map_downgrade_ln(struct vm_map *map, char *file, int line)
+{
+ int rv;
+
+ if (map->flags & VM_MAP_INTRSAFE) {
+ MUTEX_ASSERT_LOCKED(&map->mtx);
+ } else {
+ rv = rw_enter(&map->lock, RW_DOWNGRADE);
+ if (rv != 0)
+ return FALSE;
+
+ }
+
+ map->timestamp++;
+ LPRINTF(("map downgrade: %p (at %s %d)\n", map, file, line));
+ uvm_tree_sanity(map, file, line);
+ uvm_tree_size_chk(map, file, line);
+
+ return TRUE;
+}
void
vm_map_busy_ln(struct vm_map *map, char *file, int line)
{
diff --git sys/uvm/uvm_map.h sys/uvm/uvm_map.h
index cc83fc69956..73eca9a5959 100644
--- sys/uvm/uvm_map.h
+++ sys/uvm/uvm_map.h
@@ -402,6 +402,8 @@ void vm_map_lock_ln(struct vm_map*, char*, int);
void vm_map_lock_read_ln(struct vm_map*, char*, int);
void vm_map_unlock_ln(struct vm_map*, char*, int);
void vm_map_unlock_read_ln(struct vm_map*, char*, int);
+boolean_t vm_map_upgrade_ln(struct vm_map*, char*, int);
+boolean_t vm_map_downgrade_ln(struct vm_map*, char*, int);
void vm_map_busy_ln(struct vm_map*, char*, int);
void vm_map_unbusy_ln(struct vm_map*, char*, int);
void vm_map_assert_anylock_ln(struct vm_map*, char*, int);
@@ -413,6 +415,8 @@ void vm_map_assert_wrlock_ln(struct vm_map*, char*, int);
#define vm_map_lock_read(map) vm_map_lock_read_ln(map, __FILE__, __LINE__)
#define vm_map_unlock(map) vm_map_unlock_ln(map, __FILE__, __LINE__)
#define vm_map_unlock_read(map) vm_map_unlock_read_ln(map, __FILE__, __LINE__)
+#define vm_map_upgrade(map) vm_map_upgrade_ln(map, __FILE__, __LINE__)
+#define vm_map_downgrade(map) vm_map_downgrade_ln(map, __FILE__, __LINE__)
#define vm_map_busy(map) vm_map_busy_ln(map, __FILE__, __LINE__)
#define vm_map_unbusy(map) vm_map_unbusy_ln(map, __FILE__, __LINE__)
#define vm_map_assert_anylock(map) \