use the SMR api instead of SRPs in art/rtable code
On Sat, Jun 28, 2025 at 10:22:14AM +1000, David Gwynne wrote:
> On Thu, Jun 26, 2025 at 10:33:50PM +0200, Alexander Bluhm wrote:
> > On Fri, Jun 13, 2025 at 04:44:56PM +1000, David Gwynne wrote:
> > > On Fri, Jun 13, 2025 at 12:03:37PM +1000, David Gwynne wrote:
> > > > there's other changes too, but moving to SMR is the important one.
> > > >
> > > > SRPs complicated this code massively, so moving it to SMR is almost like
> > > > backing the SRP changes out. it becomes a lot more readable and similar
> > > > to other ART implementations, which can't hurt.
> > > >
> > > > another change is strip non-ART code out of art.c, specifically the
> > > > rwlock and route sourceaddr info. these features have been moved
> > > > up a layer into rtable.c and are now stored as part of a new struct
> > > > rtable. again, this makes the art code easier to read and compare.
> > > >
> > > > the other big change is to walking over an ART. there's now a
> > > > reentrant and non-recursive iterator api which art_walk and rtable_walk
> > > > are now built on top of. rtable_walk can basically do a foreach
> > > > over the ART directly rather than having to pass a bunch of state
> > > > into callbacks to apply with art_walk. we can still do that if we
> > > > want, i just find this easier to deal with in rtable_walk directly.
> > > >
> > > > i modified how the heaps are traversed inside ART as well to try and
> > > > minimise the pointer accesses as much as possible.
> > > >
> > > > i'm sorry the diff is so big. i don't expect code reviews. it took
> > > > me 4 years and at least 3 attempts to do this diff. however, tests
> > > > would be much appreciated.
> > > >
> > > > it looks like the generated code is smaller too.
> > >
> > > apologies, i sent a slightly old version that missed some fixes around
> > > reprio and mpath insertion. i didn't use scp enough before sending it
> > > out.
> >
> > This time it crashes in regress/sbin/ifconfig
> >
> > ==== run-ether-inet6-contiguous-netmask ====
> > /sbin/ifconfig vether99 destroy 2>/dev/null || true
> > /sbin/ifconfig vether99 create
> > /sbin/ifconfig vether99 inet6 fdd7:e83e:66bc:254::74 netmask ffff:ffff:ffff:ffff:ffff::
> > /sbin/ifconfig vether99 inet6 fdd7:e83e:66bc:254::74 delete
> > ! /sbin/ifconfig vether99 inet6 fdd7:e83e:66bc:254::74 netmask ffff:ffff:ffff:ffff:ffff:4000::
> > ifconfig: SIOCAIFADDR: Invalid argument
> > Timeout, server ot29 not responding.
> >
> > panic: kernel diagnostic assertion "curcpu()->ci_schedstate.spc_smrdepth == 0" failed: file "/usr/src/sys/kern/subr_xxx.c", line 157
> > Stopped at db_enter+0x14: popq %rbp
> > TID PID UID PRFLAGS PFLAGS CPU COMMAND
> > 57177 76970 0 0x14000 0x200 4 sensors
> > db_enter() at db_enter+0x14
> > panic(ffffffff825ee372) at panic+0xd5
> > __assert(ffffffff8262ac13,ffffffff8256b2e4,9d,ffffffff825e06cd) at __assert+0x29
> > assertwaitok() at assertwaitok+0xdc
> > mi_switch() at mi_switch+0x218
> > sched_idle(ffff8000491d2ff0) at sched_idle+0x13a
> > end trace frame: 0x0, count: 9
> > https://www.openbsd.org/ddb.html describes the minimum info required in bug
> > reports. Insufficient info makes it difficult to find and fix bugs.
>
> ...
>
> > Can you reproduce it? Anything else I should collect from ddb?
>
> That's great, thank you.
>
> This:
>
> > panic: kernel diagnostic assertion "curcpu()->ci_schedstate.spc_smrdepth == 0" failed: file "/usr/src/sys/kern/subr_xxx.c", line 157
>
> makes it easy to find.
>
> i had this in my diff in rtable_lookup:
>
> smr_read_enter();
> if (mask == NULL) {
> /* No need for a perfect match. */
> an = art_match(r->r_art, addr);
> } else {
> plen = rtable_satoplen(dst->sa_family, mask);
> if (plen == -1)
> return (NULL);
>
> an = art_lookup(r->r_art, addr, plen);
> }
> if (an == NULL)
> goto out;
> ...
> out:
> smr_read_leave();
>
> that return in the smr critical section should be a goto out.
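with the return changed to a goto out, every path out of the smr
critical section now goes through smr_read_leave(). the fixed tail of
rtable_lookup in the diff below reads:

	smr_read_enter();
	if (mask == NULL) {
		/* No need for a perfect match. */
		an = art_match(r->r_art, addr);
	} else {
		plen = rtable_satoplen(dst->sa_family, mask);
		if (plen == -1)
			goto out;

		an = art_lookup(r->r_art, addr, plen);
	}
	if (an == NULL)
		goto out;
	...
out:
	smr_read_leave();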
matthieu pointed out that netstat needs tweaks too.
this moves some data structure info back to art.h from art.c because
netstat needs to understand it.
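as an example of the new iterator api, art_walk in the diff below is
now just a loop over ART_FOREACH:

	int
	art_walk(struct art *art, int (*f)(struct art_node *, void *), void *arg)
	{
		struct art_iter ai;
		struct art_node *an;
		int error = 0;

		ART_FOREACH(an, art, &ai) {
			error = f(an, arg);
			if (error != 0) {
				art_iter_close(&ai);
				return (error);
			}
		}

		return (0);
	}

the iterator keeps table refs between calls, so code built on it (eg,
rtable_walk) can release locks between art_iter_open and art_iter_next
calls.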
Index: sys/net/art.c
===================================================================
RCS file: /cvs/src/sys/net/art.c,v
diff -u -p -r1.31 art.c
--- sys/net/art.c 11 Nov 2023 12:17:50 -0000 1.31
+++ sys/net/art.c 28 Jun 2025 23:15:37 -0000
@@ -22,6 +22,58 @@
*
* Yoichi Hariguchi paper can be found at:
* http://www.hariguchi.org/art/art.pdf
+ *
+ * This implementation is tweaked to minimise pointer traversal
+ * during lookups by only accessing the heaps. It avoids following
+ * pointers to or through the art_table structures.
+ *
+ * The heap is an array of art_heap_entry values which have different
+ * meanings depending on their location. The majority of entries in
+ * the heap are used as pointers to nodes (struct an_node, aka leaf
+ * entries), or the next heap to traverse. These pointers are typed/
+ * tagged by the low bit in their heap value, and must be masked
+ * appropriately before use.
+ *
+ * The first two entries in the heap are not used by the search
+ * algorithm, so they are used to store data associated with this
+ * part of the heap. The first entry (heap[0]) stores a pointer to
+ * an art_table struct associated with this heap. The second entry
+ * (heap[1]) stores the node pointer which would be in the parent
+ * heap if this table wasn't using it. This node pointer is also known
+ * as the default entry/route for the current table in the ART.
+ *
+ * Nodes contain the exact information about the prefix they represent
+ * in the ART, ie, the address and prefix length, and a pointer to
+ * data associated with that prefix (an_value).
+ *
+ * The relationships between the separate data structures look
+ * like the following:
+ *
+ * +------------------+
+ * | struct art |
+ * +------------------+ NULL
+ * | ^
+ * | art_root | at_parent
+ * v heap[0] |
+ * +------------------+ ----------> +------------------+
+ * | heap | | struct art_table |
+ * +------------------+ <---------- +------------------+
+ * | at_heap ^
+ * | heap entry | at_parent
+ * v heap[0] |
+ * +------------------+ ----------> +------------------+
+ * | heap | | struct art_table |
+ * +------------------+ <---------- +------------------+
+ * | at_heap
+ * | node entry
+ * v
+ * +------------------+
+ * | struct art_node |
+ * +------------------+
+ * |
+ * | an_value
+ * v
+ * "user" data
*/
#ifndef _KERNEL
@@ -33,43 +85,38 @@
#include <sys/pool.h>
#include <sys/task.h>
#include <sys/socket.h>
+#include <sys/smr.h>
#endif
#include <net/art.h>
-int art_bindex(struct art_table *, const uint8_t *, int);
-void art_allot(struct art_table *at, int, struct art_node *,
- struct art_node *);
-struct art_table *art_table_get(struct art_root *, struct art_table *,
- int);
-struct art_table *art_table_put(struct art_root *, struct art_table *);
-struct art_node *art_table_insert(struct art_root *, struct art_table *,
+static void art_allot(struct art_table *at, unsigned int,
+ art_heap_entry, art_heap_entry);
+struct art_table *art_table_get(struct art *, struct art_table *,
+ unsigned int);
+struct art_table *art_table_put(struct art *, struct art_table *);
+struct art_node *art_table_insert(struct art *, struct art_table *,
int, struct art_node *);
-struct art_node *art_table_delete(struct art_root *, struct art_table *,
+struct art_node *art_table_delete(struct art *, struct art_table *,
int, struct art_node *);
-struct art_table *art_table_ref(struct art_root *, struct art_table *);
-int art_table_free(struct art_root *, struct art_table *);
-int art_table_walk(struct art_root *, struct art_table *,
- int (*f)(struct art_node *, void *), void *);
-int art_walk_apply(struct art_root *,
- struct art_node *, struct art_node *,
- int (*f)(struct art_node *, void *), void *);
+struct art_table *art_table_ref(struct art *, struct art_table *);
+int art_table_free(struct art *, struct art_table *);
void art_table_gc(void *);
void art_gc(void *);
-struct pool an_pool, at_pool, at_heap_4_pool, at_heap_8_pool;
+static struct pool an_pool, at_pool, at_heap_4_pool, at_heap_8_pool;
-struct art_table *art_table_gc_list = NULL;
-struct mutex art_table_gc_mtx = MUTEX_INITIALIZER(IPL_SOFTNET);
-struct task art_table_gc_task =
+static struct art_table *art_table_gc_list = NULL;
+static struct mutex art_table_gc_mtx = MUTEX_INITIALIZER(IPL_SOFTNET);
+static struct task art_table_gc_task =
TASK_INITIALIZER(art_table_gc, NULL);
-struct art_node *art_node_gc_list = NULL;
-struct mutex art_node_gc_mtx = MUTEX_INITIALIZER(IPL_SOFTNET);
-struct task art_node_gc_task = TASK_INITIALIZER(art_gc, NULL);
+static struct art_node *art_node_gc_list = NULL;
+static struct mutex art_node_gc_mtx = MUTEX_INITIALIZER(IPL_SOFTNET);
+static struct task art_node_gc_task = TASK_INITIALIZER(art_gc, NULL);
void
-art_init(void)
+art_boot(void)
{
pool_init(&an_pool, sizeof(struct art_node), 0, IPL_SOFTNET, 0,
"art_node", NULL);
@@ -84,56 +131,74 @@ art_init(void)
/*
* Per routing table initialization API function.
*/
-struct art_root *
-art_alloc(unsigned int rtableid, unsigned int alen, unsigned int off)
-{
- struct art_root *ar;
- int i;
- ar = malloc(sizeof(*ar), M_RTABLE, M_NOWAIT|M_ZERO);
- if (ar == NULL)
- return (NULL);
+static const unsigned int art_plen32_levels[] = {
+ 8, 4, 4, 4, 4, 4, 4
+};
+
+static const unsigned int art_plen128_levels[] = {
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4
+};
+
+static const unsigned int art_plen20_levels[] = {
+ 4, 4, 4, 4, 4
+};
+
+void
+art_init(struct art *art, unsigned int alen)
+{
+ const unsigned int *levels;
+ unsigned int nlevels;
+#ifdef DIAGNOSTIC
+ unsigned int i;
+ unsigned int bits = 0;
+#endif
switch (alen) {
case 32:
- ar->ar_alen = 32;
- ar->ar_nlvl = 7;
- ar->ar_bits[0] = 8;
- for (i = 1; i < ar->ar_nlvl; i++)
- ar->ar_bits[i] = 4;
+ levels = art_plen32_levels;
+ nlevels = nitems(art_plen32_levels);
break;
case 128:
- ar->ar_alen = 128;
- ar->ar_nlvl = 32;
- for (i = 0; i < ar->ar_nlvl; i++)
- ar->ar_bits[i] = 4;
+ levels = art_plen128_levels;
+ nlevels = nitems(art_plen128_levels);
+ break;
+ case 20:
+ levels = art_plen20_levels;
+ nlevels = nitems(art_plen20_levels);
break;
default:
- printf("%s: incorrect address length %u\n", __func__, alen);
- free(ar, M_RTABLE, sizeof(*ar));
- return (NULL);
+ panic("no configuration for alen %u", alen);
+ /* NOTREACHED */
}
- ar->ar_off = off;
- rw_init(&ar->ar_lock, "art");
+#ifdef DIAGNOSTIC
+ for (i = 0; i < nlevels; i++)
+ bits += levels[i];
+
+ if (alen != bits)
+ panic("sum of levels %u != address len %u", bits, alen);
+#endif /* DIAGNOSTIC */
- return (ar);
+ art->art_root = 0;
+ art->art_levels = levels;
+ art->art_nlevels = nlevels;
+ art->art_alen = alen;
}
-/*
- * Return 1 if ``old'' and ``new`` are identical, 0 otherwise.
- */
-static inline int
-art_check_duplicate(struct art_root *ar, struct art_node *old,
- struct art_node *new)
+struct art *
+art_alloc(unsigned int alen)
{
- if (old == NULL)
- return (0);
+ struct art *art;
- if (old->an_plen == new->an_plen)
- return (1);
+ art = malloc(sizeof(*art), M_RTABLE, M_NOWAIT|M_ZERO);
+ if (art == NULL)
+ return (NULL);
- return (0);
+ art_init(art, alen);
+
+ return (art);
}
/*
@@ -148,30 +213,31 @@ art_check_duplicate(struct art_root *ar,
* 8bit-long tables, there's a maximum of 4 base indexes if the
* prefix length is > 24.
*/
-int
-art_bindex(struct art_table *at, const uint8_t *addr, int plen)
+static unsigned int __pure
+art_bindex(unsigned int offset, unsigned int bits,
+ const uint8_t *addr, unsigned int plen)
{
- uint8_t boff, bend;
+ unsigned int boff, bend;
uint32_t k;
- if (plen < at->at_offset || plen > (at->at_offset + at->at_bits))
- return (-1);
+ KASSERT(plen >= offset);
+ KASSERT(plen <= (offset + bits));
/*
* We are only interested in the part of the prefix length
* corresponding to the range of this table.
*/
- plen -= at->at_offset;
+ plen -= offset;
/*
* Jump to the first byte of the address containing bits
* covered by this table.
*/
- addr += (at->at_offset / 8);
+ addr += (offset / 8);
/* ``at'' covers the bit range between ``boff'' & ``bend''. */
- boff = (at->at_offset % 8);
- bend = (at->at_bits + boff);
+ boff = (offset % 8);
+ bend = (bits + boff);
KASSERT(bend <= 32);
@@ -188,25 +254,13 @@ art_bindex(struct art_table *at, const u
k = (addr[0] & ((1 << (8 - boff)) - 1)) << (bend - 8);
k |= addr[1] >> (16 - bend);
} else {
- k = (addr[0] >> (8 - bend)) & ((1 << at->at_bits) - 1);
+ k = (addr[0] >> (8 - bend)) & ((1 << bits) - 1);
}
/*
* Single level base index formula:
*/
- return ((k >> (at->at_bits - plen)) + (1 << plen));
-}
-
-/*
- * Single level lookup function.
- *
- * Return the fringe index of the part of ``addr''
- * corresponding to the range covered by the table ``at''.
- */
-static inline int
-art_findex(struct art_table *at, const uint8_t *addr)
-{
- return art_bindex(at, addr, (at->at_offset + at->at_bits));
+ return ((k >> (bits - plen)) + (1 << plen));
}
/*
@@ -215,60 +269,94 @@ art_findex(struct art_table *at, const u
* Return the best existing match for a destination.
*/
struct art_node *
-art_match(struct art_root *ar, const void *addr, struct srp_ref *nsr)
+art_match(struct art *art, const void *addr)
{
- struct srp_ref dsr, ndsr;
- void *entry;
- struct art_table *at;
- struct art_node *dflt, *ndflt;
- int j;
-
- entry = srp_enter(nsr, &ar->ar_root);
- at = entry;
+ unsigned int offset = 0;
+ unsigned int level = 0;
+ art_heap_entry *heap;
+ art_heap_entry ahe, dahe = 0;
+ struct art_node *an;
+ int j;
- if (at == NULL)
- goto done;
+ SMR_ASSERT_CRITICAL();
- /*
- * Remember the default route of each table we visit in case
- * we do not find a better matching route.
- */
- dflt = srp_enter(&dsr, &at->at_default);
+ heap = SMR_PTR_GET(&art->art_root);
+ if (heap == NULL)
+ return (NULL);
/*
* Iterate until we find a leaf.
*/
- while (1) {
+ for (;;) {
+ unsigned int bits = art->art_levels[level];
+ unsigned int p = offset + bits;
+
+ /*
+ * Remember the default route of each table we visit in case
+ * we do not find a better matching route.
+ */
+ ahe = SMR_PTR_GET(&heap[ART_HEAP_IDX_DEFAULT]);
+ if (ahe != 0)
+ dahe = ahe;
+
/* Do a single level route lookup. */
- j = art_findex(at, addr);
- entry = srp_follow(nsr, &at->at_heap[j].node);
+ j = art_bindex(offset, bits, addr, p);
+ ahe = SMR_PTR_GET(&heap[j]);
/* If this is a leaf (NULL is a leaf) we're done. */
- if (ISLEAF(entry))
+ if (art_heap_entry_is_node(ahe))
break;
- at = SUBTABLE(entry);
+ heap = art_heap_entry_to_heap(ahe);
+ offset = p;
+ level++;
+
+ KASSERT(level < art->art_nlevels);
+ }
+
+ an = art_heap_entry_to_node(ahe);
+ if (an != NULL)
+ return (an);
+
+ return art_heap_entry_to_node(dahe);
+}
+
+static int
+art_node_check(const struct art_node *an,
+ const uint8_t *addr, unsigned int plen)
+{
+ const uint32_t *wan = (const uint32_t *)an->an_addr;
+ const uint32_t *waddr = (const uint32_t *)addr;
+ unsigned int i = 0;
+ unsigned int shift;
+
+ if (an->an_plen != plen)
+ return (0);
+
+ while (plen >= 32) {
+ if (wan[i] != waddr[i])
+ return (0);
+
+ i++;
+ plen -= 32;
+ }
+ if (plen == 0)
+ return (1);
- ndflt = srp_enter(&ndsr, &at->at_default);
- if (ndflt != NULL) {
- srp_leave(&dsr);
- dsr = ndsr;
- dflt = ndflt;
- } else
- srp_leave(&ndsr);
- }
-
- if (entry == NULL) {
- srp_leave(nsr);
- *nsr = dsr;
- KASSERT(ISLEAF(dflt));
- return (dflt);
- }
-
- srp_leave(&dsr);
-done:
- KASSERT(ISLEAF(entry));
- return (entry);
+ i *= 4;
+ while (plen >= 8) {
+ if (an->an_addr[i] != addr[i])
+ return (0);
+
+ i++;
+ plen -= 8;
+ }
+ if (plen == 0)
+ return (1);
+
+ shift = 8 - plen;
+
+ return ((an->an_addr[i] >> shift) == (addr[i] >> shift));
}
/*
@@ -278,24 +366,28 @@ done:
* it does not exist.
*/
struct art_node *
-art_lookup(struct art_root *ar, const void *addr, int plen, struct srp_ref *nsr)
+art_lookup(struct art *art, const void *addr, unsigned int plen)
{
- void *entry;
- struct art_table *at;
- int i, j;
+ unsigned int offset = 0;
+ unsigned int bits, p;
+ unsigned int level = 0;
+ art_heap_entry *heap;
+ art_heap_entry ahe;
+ struct art_node *an;
+ unsigned int i, j;
- KASSERT(plen >= 0 && plen <= ar->ar_alen);
+ KASSERT(plen <= art->art_alen);
- entry = srp_enter(nsr, &ar->ar_root);
- at = entry;
+ SMR_ASSERT_CRITICAL();
- if (at == NULL)
- goto done;
+ heap = SMR_PTR_GET(&art->art_root);
+ if (heap == NULL)
+ return (NULL);
/* Default route */
if (plen == 0) {
- entry = srp_follow(nsr, &at->at_default);
- goto done;
+ ahe = SMR_PTR_GET(&heap[ART_HEAP_IDX_DEFAULT]);
+ return art_heap_entry_to_node(ahe);
}
/*
@@ -303,31 +395,47 @@ art_lookup(struct art_root *ar, const vo
* the stride length at this level the entry must
* be in the current table.
*/
- while (plen > (at->at_offset + at->at_bits)) {
+ for (;;) {
+ bits = art->art_levels[level];
+ p = offset + bits;
+ if (plen <= p)
+ break;
+
/* Do a single level route lookup. */
- j = art_findex(at, addr);
- entry = srp_follow(nsr, &at->at_heap[j].node);
+ j = art_bindex(offset, bits, addr, p);
+ ahe = SMR_PTR_GET(&heap[j]);
/* A leaf is a match, but not a perfect one, or NULL */
- if (ISLEAF(entry))
+ if (art_heap_entry_is_node(ahe))
return (NULL);
- at = SUBTABLE(entry);
+ heap = art_heap_entry_to_heap(ahe);
+ offset = p;
+ level++;
+
+ KASSERT(level < art->art_nlevels);
}
- i = art_bindex(at, addr, plen);
- if (i == -1)
- return (NULL);
+ i = art_bindex(offset, bits, addr, plen);
+ ahe = SMR_PTR_GET(&heap[i]);
+ if (!art_heap_entry_is_node(ahe)) {
+ heap = art_heap_entry_to_heap(ahe);
+ ahe = SMR_PTR_GET(&heap[ART_HEAP_IDX_DEFAULT]);
+ }
- entry = srp_follow(nsr, &at->at_heap[i].node);
- if (!ISLEAF(entry))
- entry = srp_follow(nsr, &SUBTABLE(entry)->at_default);
-
-done:
- KASSERT(ISLEAF(entry));
- return (entry);
+ /* Make sure we've got a perfect match */
+ an = art_heap_entry_to_node(ahe);
+ if (an != NULL && art_node_check(an, addr, plen))
+ return (an);
+
+ return (NULL);
}
+int
+art_is_empty(struct art *art)
+{
+ return (SMR_PTR_GET_LOCKED(&art->art_root) == NULL);
+}
/*
* Insertion API function.
@@ -336,32 +444,38 @@ done:
* same destination/mask pair is already present.
*/
struct art_node *
-art_insert(struct art_root *ar, struct art_node *an, const void *addr, int plen)
+art_insert(struct art *art, struct art_node *an)
{
- struct art_table *at, *child;
- struct art_node *node;
- int i, j;
-
- rw_assert_wrlock(&ar->ar_lock);
- KASSERT(plen >= 0 && plen <= ar->ar_alen);
-
- at = srp_get_locked(&ar->ar_root);
- if (at == NULL) {
- at = art_table_get(ar, NULL, -1);
+ unsigned int p;
+ art_heap_entry ahe, oahe, *ahep;
+ art_heap_entry *heap;
+ struct art_node *oan;
+ struct art_table *at;
+ unsigned int i, j;
+
+ KASSERT(an->an_plen <= art->art_alen);
+
+ heap = SMR_PTR_GET_LOCKED(&art->art_root);
+ if (heap == NULL) {
+ at = art_table_get(art, NULL, -1);
if (at == NULL)
return (NULL);
- srp_swap_locked(&ar->ar_root, at);
- }
+ heap = at->at_heap;
+ SMR_PTR_SET_LOCKED(&art->art_root, heap);
+ } else
+ at = art_heap_to_table(heap);
/* Default route */
- if (plen == 0) {
- node = srp_get_locked(&at->at_default);
- if (node != NULL)
- return (node);
+ if (an->an_plen == 0) {
+ ahep = &heap[ART_HEAP_IDX_DEFAULT];
+ oahe = SMR_PTR_GET_LOCKED(ahep);
+ oan = art_heap_entry_to_node(oahe);
+ if (oan != NULL)
+ return (oan);
- art_table_ref(ar, at);
- srp_swap_locked(&at->at_default, an);
+ art_table_ref(art, at);
+ SMR_PTR_SET_LOCKED(ahep, art_node_to_heap_entry(an));
return (an);
}
@@ -370,10 +484,11 @@ art_insert(struct art_root *ar, struct a
* the stride length at this level the entry must
* be in the current table.
*/
- while (plen > (at->at_offset + at->at_bits)) {
+ while ((p = at->at_offset + at->at_bits) < an->an_plen) {
/* Do a single level route lookup. */
- j = art_findex(at, addr);
- node = srp_get_locked(&at->at_heap[j].node);
+ j = art_bindex(at->at_offset, at->at_bits, an->an_addr, p);
+ ahep = &heap[j];
+ ahe = SMR_PTR_GET_LOCKED(ahep);
/*
* If the node corresponding to the fringe index is
@@ -381,84 +496,86 @@ art_insert(struct art_root *ar, struct a
* entry of this node will then become the default
* route of the subtable.
*/
- if (ISLEAF(node)) {
- child = art_table_get(ar, at, j);
+ if (art_heap_entry_is_node(ahe)) {
+ struct art_table *child = art_table_get(art, at, j);
if (child == NULL)
return (NULL);
- art_table_ref(ar, at);
- srp_swap_locked(&at->at_heap[j].node, ASNODE(child));
+ art_table_ref(art, at);
+
at = child;
- } else
- at = SUBTABLE(node);
+ heap = at->at_heap;
+ SMR_PTR_SET_LOCKED(&heap[ART_HEAP_IDX_DEFAULT], ahe);
+ SMR_PTR_SET_LOCKED(ahep, art_heap_to_heap_entry(heap));
+ } else {
+ heap = art_heap_entry_to_heap(ahe);
+ at = art_heap_to_table(heap);
+ }
}
- i = art_bindex(at, addr, plen);
- if (i == -1)
- return (NULL);
-
- return (art_table_insert(ar, at, i, an));
-}
-
-/*
- * Single level insertion.
- */
-struct art_node *
-art_table_insert(struct art_root *ar, struct art_table *at, int i,
- struct art_node *an)
-{
- struct art_node *prev, *node;
-
- node = srp_get_locked(&at->at_heap[i].node);
- if (!ISLEAF(node))
- prev = srp_get_locked(&SUBTABLE(node)->at_default);
- else
- prev = node;
-
- if (art_check_duplicate(ar, prev, an))
- return (prev);
+ i = art_bindex(at->at_offset, at->at_bits, an->an_addr, an->an_plen);
+ ahep = &heap[i];
+ oahe = SMR_PTR_GET_LOCKED(ahep);
+ if (!art_heap_entry_is_node(oahe)) {
+ heap = art_heap_entry_to_heap(oahe);
+ ahep = &heap[ART_HEAP_IDX_DEFAULT];
+ oahe = SMR_PTR_GET_LOCKED(ahep);
+ }
- art_table_ref(ar, at);
+ /* Check if there's an existing node */
+ oan = art_heap_entry_to_node(oahe);
+ if (oan != NULL && art_node_check(oan, an->an_addr, an->an_plen))
+ return (oan);
/*
* If the index `i' of the route that we are inserting is not
* a fringe index, we need to allot this new route pointer to
* all the corresponding fringe indices.
*/
+ art_table_ref(art, at);
+ ahe = art_node_to_heap_entry(an);
if (i < at->at_minfringe)
- art_allot(at, i, prev, an);
- else if (!ISLEAF(node))
- srp_swap_locked(&SUBTABLE(node)->at_default, an);
+ art_allot(at, i, oahe, ahe);
else
- srp_swap_locked(&at->at_heap[i].node, an);
+ SMR_PTR_SET_LOCKED(ahep, ahe);
return (an);
}
-
/*
* Deletion API function.
*/
struct art_node *
-art_delete(struct art_root *ar, struct art_node *an, const void *addr, int plen)
+art_delete(struct art *art, const void *addr, unsigned int plen)
{
+ unsigned int p;
+ art_heap_entry *heap;
struct art_table *at;
- struct art_node *node;
- int i, j;
+ art_heap_entry ahe, pahe, *ahep;
+ struct art_node *an;
+ unsigned int i, j;
- rw_assert_wrlock(&ar->ar_lock);
- KASSERT(plen >= 0 && plen <= ar->ar_alen);
+ KASSERT(plen <= art->art_alen);
- at = srp_get_locked(&ar->ar_root);
- if (at == NULL)
+ heap = SMR_PTR_GET_LOCKED(&art->art_root);
+ if (heap == NULL)
return (NULL);
+ at = art_heap_to_table(heap);
+
/* Default route */
if (plen == 0) {
- node = srp_get_locked(&at->at_default);
- srp_swap_locked(&at->at_default, NULL);
- art_table_free(ar, at);
- return (node);
+ ahep = &heap[ART_HEAP_IDX_DEFAULT];
+
+ ahe = SMR_PTR_GET_LOCKED(ahep);
+ an = art_heap_entry_to_node(ahe);
+ if (an == NULL)
+ return (NULL);
+
+ SMR_PTR_SET_LOCKED(ahep, 0);
+ art_table_free(art, at);
+
+ return (an);
}
/*
@@ -466,53 +583,37 @@ art_delete(struct art_root *ar, struct a
* the stride length at this level the entry must
* be in the current table.
*/
- while (plen > (at->at_offset + at->at_bits)) {
+ while ((p = at->at_offset + at->at_bits) < plen) {
/* Do a single level route lookup. */
- j = art_findex(at, addr);
- node = srp_get_locked(&at->at_heap[j].node);
+ j = art_bindex(at->at_offset, at->at_bits, addr, p);
+ ahe = SMR_PTR_GET_LOCKED(&heap[j]);
/* If this is a leaf, there is no route to delete. */
- if (ISLEAF(node))
+ if (art_heap_entry_is_node(ahe))
return (NULL);
- at = SUBTABLE(node);
+ heap = art_heap_entry_to_heap(ahe);
+ at = art_heap_to_table(heap);
}
- i = art_bindex(at, addr, plen);
- if (i == -1)
+ i = art_bindex(at->at_offset, at->at_bits, addr, plen);
+ ahep = &heap[i];
+ ahe = SMR_PTR_GET_LOCKED(ahep);
+ if (!art_heap_entry_is_node(ahe)) {
+ art_heap_entry *nheap = art_heap_entry_to_heap(ahe);
+ ahep = &nheap[ART_HEAP_IDX_DEFAULT];
+ ahe = SMR_PTR_GET_LOCKED(ahep);
+ }
+
+ an = art_heap_entry_to_node(ahe);
+ if (an == NULL) {
+ /* No route to delete */
return (NULL);
-
- return (art_table_delete(ar, at, i, an));
-}
-
-/*
- * Single level deletion.
- */
-struct art_node *
-art_table_delete(struct art_root *ar, struct art_table *at, int i,
- struct art_node *an)
-{
- struct art_node *next, *node;
-
-#ifdef DIAGNOSTIC
- struct art_node *prev;
-#endif
-
- node = srp_get_locked(&at->at_heap[i].node);
-#ifdef DIAGNOSTIC
- if (!ISLEAF(node))
- prev = srp_get_locked(&SUBTABLE(node)->at_default);
- else
- prev = node;
-
- KASSERT(prev == an);
-#endif
+ }
/* Get the next most specific route for the index `i'. */
- if ((i >> 1) > 1)
- next = srp_get_locked(&at->at_heap[i >> 1].node);
- else
- next = NULL;
+ j = i >> 1;
+ pahe = (j > 1) ? SMR_PTR_GET_LOCKED(&heap[j]) : 0;
/*
* If the index `i' of the route that we are removing is not
@@ -520,20 +621,18 @@ art_table_delete(struct art_root *ar, st
* route pointer to all the corresponding fringe indices.
*/
if (i < at->at_minfringe)
- art_allot(at, i, an, next);
- else if (!ISLEAF(node))
- srp_swap_locked(&SUBTABLE(node)->at_default, next);
+ art_allot(at, i, ahe, pahe);
else
- srp_swap_locked(&at->at_heap[i].node, next);
+ SMR_PTR_SET_LOCKED(ahep, pahe);
/* We have removed an entry from this table. */
- art_table_free(ar, at);
+ art_table_free(art, at);
return (an);
}
struct art_table *
-art_table_ref(struct art_root *ar, struct art_table *at)
+art_table_ref(struct art *art, struct art_table *at)
{
at->at_refcnt++;
return (at);
@@ -544,12 +643,11 @@ art_table_rele(struct art_table *at)
{
if (at == NULL)
return (0);
-
return (--at->at_refcnt == 0);
}
int
-art_table_free(struct art_root *ar, struct art_table *at)
+art_table_free(struct art *art, struct art_table *at)
{
if (art_table_rele(at)) {
/*
@@ -557,7 +655,7 @@ art_table_free(struct art_root *ar, stru
* that are empty.
*/
do {
- at = art_table_put(ar, at);
+ at = art_table_put(art, at);
} while (art_table_rele(at));
return (1);
@@ -569,116 +667,187 @@ art_table_free(struct art_root *ar, stru
/*
* Iteration API function.
*/
-int
-art_walk(struct art_root *ar, int (*f)(struct art_node *, void *), void *arg)
+
+static struct art_node *
+art_iter_descend(struct art_iter *ai, art_heap_entry *heap,
+ art_heap_entry pahe)
{
- struct srp_ref sr;
- struct art_table *at;
- struct art_node *node;
- int error = 0;
+ struct art_table *at;
+ art_heap_entry ahe;
- rw_enter_write(&ar->ar_lock);
- at = srp_get_locked(&ar->ar_root);
- if (at != NULL) {
- art_table_ref(ar, at);
+ at = art_heap_to_table(heap);
+ ai->ai_table = art_table_ref(ai->ai_art, at);
- /*
- * The default route should be processed here because the root
- * table does not have a parent.
- */
- node = srp_enter(&sr, &at->at_default);
- error = art_walk_apply(ar, node, NULL, f, arg);
- srp_leave(&sr);
+ /*
+ * Start looking at non-fringe nodes.
+ */
+ ai->ai_j = 1;
- if (error == 0)
- error = art_table_walk(ar, at, f, arg);
+ /*
+ * The default route (index 1) is processed by the
+ * parent table (where it belongs) otherwise it could
+ * be processed more than once.
+ */
+ ai->ai_i = 2;
+
+ /*
+ * Process the default route now.
+ */
+ ahe = SMR_PTR_GET_LOCKED(&heap[ART_HEAP_IDX_DEFAULT]);
+ if (ahe != 0 && ahe != pahe)
+ return (art_heap_entry_to_node(ahe));
- art_table_free(ar, at);
+ /*
+ * Tell the caller to proceed with art_iter_next.
+ */
+ return (NULL);
+}
+
+struct art_node *
+art_iter_open(struct art *art, struct art_iter *ai)
+{
+ art_heap_entry *heap = SMR_PTR_GET(&art->art_root);
+ struct art_node *an;
+
+ ai->ai_art = art;
+
+ if (heap == NULL) {
+ /* empty, we're already done */
+ ai->ai_table = NULL;
+ return (NULL);
}
- rw_exit_write(&ar->ar_lock);
- return (error);
+ an = art_iter_descend(ai, heap, 0);
+ if (an != NULL)
+ return (an); /* default route */
+
+ return (art_iter_next(ai));
}
-int
-art_table_walk(struct art_root *ar, struct art_table *at,
- int (*f)(struct art_node *, void *), void *arg)
+struct art_node *
+art_iter_next(struct art_iter *ai)
{
- struct srp_ref sr;
- struct art_node *node, *next;
- struct art_table *nat;
- int i, j, error = 0;
- uint32_t maxfringe = (at->at_minfringe << 1);
+ struct art_table *at = ai->ai_table;
+ art_heap_entry *heap = at->at_heap;
+ art_heap_entry ahe, pahe;
+ unsigned int i;
+descend:
/*
* Iterate non-fringe nodes in ``natural'' order.
*/
- for (j = 1; j < at->at_minfringe; j += 2) {
- /*
- * The default route (index 1) is processed by the
- * parent table (where it belongs) otherwise it could
- * be processed more than once.
- */
- for (i = max(j, 2); i < at->at_minfringe; i <<= 1) {
- next = srp_get_locked(&at->at_heap[i >> 1].node);
-
- node = srp_enter(&sr, &at->at_heap[i].node);
- error = art_walk_apply(ar, node, next, f, arg);
- srp_leave(&sr);
+ if (ai->ai_j < at->at_minfringe) {
+ for (;;) {
+ while ((i = ai->ai_i) < at->at_minfringe) {
+ ai->ai_i = i << 1;
+
+ pahe = SMR_PTR_GET_LOCKED(&heap[i >> 1]);
+ ahe = SMR_PTR_GET_LOCKED(&heap[i]);
+ if (ahe != 0 && ahe != pahe)
+ return (art_heap_entry_to_node(ahe));
+ }
- if (error != 0)
- return (error);
+ ai->ai_j += 2;
+ if (ai->ai_j < at->at_minfringe)
+ ai->ai_i = ai->ai_j;
+ else {
+ /* Set up the fringe loop */
+ ai->ai_i = at->at_minfringe;
+ break;
+ }
}
}
/*
- * Iterate fringe nodes.
+ * Descendant tables are only found on fringe nodes, and
+ * they record where they were placed in their parent. This
+ * allows the iterator to know where to resume traversal when
+ * it ascends back to the parent table. By keeping the table
+ * refs when going down the tree, the topology is preserved
+ * even if all the nodes are removed.
*/
- for (i = at->at_minfringe; i < maxfringe; i++) {
- next = srp_get_locked(&at->at_heap[i >> 1].node);
- node = srp_enter(&sr, &at->at_heap[i].node);
- if (!ISLEAF(node)) {
- nat = art_table_ref(ar, SUBTABLE(node));
- node = srp_follow(&sr, &nat->at_default);
- } else
- nat = NULL;
+ for (;;) {
+ unsigned int maxfringe = at->at_minfringe << 1;
+ struct art_table *parent;
- error = art_walk_apply(ar, node, next, f, arg);
- srp_leave(&sr);
+ /*
+ * Iterate fringe nodes.
+ */
+ while ((i = ai->ai_i) < maxfringe) {
+ ai->ai_i = i + 1;
- if (error != 0) {
- art_table_free(ar, nat);
- return (error);
+ pahe = SMR_PTR_GET_LOCKED(&heap[i >> 1]);
+ ahe = SMR_PTR_GET_LOCKED(&heap[i]);
+ if (art_heap_entry_is_node(ahe)) {
+ if (ahe != 0 && ahe != pahe)
+ return (art_heap_entry_to_node(ahe));
+ } else {
+ struct art_node *an;
+
+ heap = art_heap_entry_to_heap(ahe);
+
+ an = art_iter_descend(ai, heap, pahe);
+ if (an != NULL) /* default route? */
+ return (an);
+
+ /* Start looping over the child table */
+ at = art_heap_to_table(heap);
+ goto descend;
+ }
}
- if (nat != NULL) {
- error = art_table_walk(ar, nat, f, arg);
- art_table_free(ar, nat);
- if (error != 0)
- return (error);
+ /*
+ * Ascend back up to the parent
+ */
+ parent = at->at_parent;
+ ai->ai_i = at->at_index + 1;
+ art_table_free(ai->ai_art, at);
+
+ ai->ai_table = parent;
+ if (parent == NULL) {
+ /* The root table has no parent */
+ break;
}
+
+ at = parent;
+ ai->ai_j = at->at_minfringe;
+ heap = at->at_heap;
}
- return (0);
+ return (NULL);
}
-int
-art_walk_apply(struct art_root *ar,
- struct art_node *an, struct art_node *next,
- int (*f)(struct art_node *, void *), void *arg)
+void
+art_iter_close(struct art_iter *ai)
{
- int error = 0;
+ struct art_table *at, *parent;
- if ((an != NULL) && (an != next)) {
- rw_exit_write(&ar->ar_lock);
- error = (*f)(an, arg);
- rw_enter_write(&ar->ar_lock);
+ for (at = ai->ai_table; at != NULL; at = parent) {
+ parent = at->at_parent;
+ art_table_free(ai->ai_art, at);
}
- return (error);
+ ai->ai_table = NULL;
}
+int
+art_walk(struct art *art, int (*f)(struct art_node *, void *), void *arg)
+{
+ struct art_iter ai;
+ struct art_node *an;
+ int error = 0;
+
+ ART_FOREACH(an, art, &ai) {
+ error = f(an, arg);
+ if (error != 0) {
+ art_iter_close(&ai);
+ return (error);
+ }
+ }
+
+ return (0);
+}
/*
* Create a table and use the given index to set its default route.
@@ -686,89 +855,90 @@ art_walk_apply(struct art_root *ar,
* Note: This function does not modify the root or the parent.
*/
struct art_table *
-art_table_get(struct art_root *ar, struct art_table *parent, int j)
+art_table_get(struct art *art, struct art_table *parent, unsigned int j)
{
struct art_table *at;
- struct art_node *node;
- void *at_heap;
- uint32_t lvl;
+ art_heap_entry *heap;
+ unsigned int level;
+ unsigned int bits;
- KASSERT(j != 0 && j != 1);
+ KASSERT(j != ART_HEAP_IDX_TABLE && j != ART_HEAP_IDX_DEFAULT);
KASSERT(parent != NULL || j == -1);
- if (parent != NULL)
- lvl = parent->at_level + 1;
- else
- lvl = 0;
-
- KASSERT(lvl < ar->ar_nlvl);
+ level = (parent != NULL) ? parent->at_level + 1 : 0;
+ KASSERT(level < art->art_nlevels);
at = pool_get(&at_pool, PR_NOWAIT|PR_ZERO);
if (at == NULL)
return (NULL);
- switch (AT_HEAPSIZE(ar->ar_bits[lvl])) {
- case AT_HEAPSIZE(4):
- at_heap = pool_get(&at_heap_4_pool, PR_NOWAIT|PR_ZERO);
+ bits = art->art_levels[level];
+ switch (bits) {
+ case 4:
+ heap = pool_get(&at_heap_4_pool, PR_NOWAIT|PR_ZERO);
break;
- case AT_HEAPSIZE(8):
- at_heap = pool_get(&at_heap_8_pool, PR_NOWAIT|PR_ZERO);
+ case 8:
+ heap = pool_get(&at_heap_8_pool, PR_NOWAIT|PR_ZERO);
break;
default:
- panic("incorrect stride length %u", ar->ar_bits[lvl]);
+ panic("incorrect stride length %u", bits);
}
- if (at_heap == NULL) {
+ if (heap == NULL) {
pool_put(&at_pool, at);
return (NULL);
}
+ heap[ART_HEAP_IDX_TABLE] = (art_heap_entry)at;
+
at->at_parent = parent;
at->at_index = j;
- at->at_minfringe = (1 << ar->ar_bits[lvl]);
- at->at_level = lvl;
- at->at_bits = ar->ar_bits[lvl];
- at->at_heap = at_heap;
+ at->at_minfringe = (1 << bits);
+ at->at_level = level;
+ at->at_bits = bits;
+ at->at_heap = heap;
at->at_refcnt = 0;
if (parent != NULL) {
- node = srp_get_locked(&parent->at_heap[j].node);
- /* node isn't being deleted, no srp_finalize needed */
- srp_swap_locked(&at->at_default, node);
+ art_heap_entry ahe;
+
at->at_offset = (parent->at_offset + parent->at_bits);
+
+ ahe = SMR_PTR_GET_LOCKED(&parent->at_heap[j]);
+ SMR_PTR_SET_LOCKED(&heap[ART_HEAP_IDX_DEFAULT], ahe);
}
return (at);
}
-
/*
* Delete a table and use its index to restore its parent's default route.
*
* Note: Modify its parent to unlink the table from it.
*/
struct art_table *
-art_table_put(struct art_root *ar, struct art_table *at)
+art_table_put(struct art *art, struct art_table *at)
{
struct art_table *parent = at->at_parent;
- struct art_node *node;
- uint32_t j = at->at_index;
+ unsigned int j = at->at_index;
KASSERT(at->at_refcnt == 0);
KASSERT(j != 0 && j != 1);
if (parent != NULL) {
+ art_heap_entry ahe;
+
KASSERT(j != -1);
KASSERT(at->at_level == parent->at_level + 1);
KASSERT(parent->at_refcnt >= 1);
/* Give the route back to its parent. */
- node = srp_get_locked(&at->at_default);
- srp_swap_locked(&parent->at_heap[j].node, node);
+ ahe = SMR_PTR_GET_LOCKED(&at->at_heap[ART_HEAP_IDX_DEFAULT]);
+ SMR_PTR_SET_LOCKED(&parent->at_heap[j], ahe);
} else {
KASSERT(j == -1);
KASSERT(at->at_level == 0);
- srp_swap_locked(&ar->ar_root, NULL);
+ SMR_PTR_SET_LOCKED(&art->art_root, NULL);
}
mtx_enter(&art_table_gc_mtx);
@@ -791,19 +961,16 @@ art_table_gc(void *null)
art_table_gc_list = NULL;
mtx_leave(&art_table_gc_mtx);
+ smr_barrier();
+
while (at != NULL) {
next = at->at_parent;
- if (at->at_level == 0)
- srp_finalize(at, "arttfini");
- else
- srp_finalize(ASNODE(at), "arttfini");
-
- switch (AT_HEAPSIZE(at->at_bits)) {
- case AT_HEAPSIZE(4):
+ switch (at->at_bits) {
+ case 4:
pool_put(&at_heap_4_pool, at->at_heap);
break;
- case AT_HEAPSIZE(8):
+ case 8:
pool_put(&at_heap_8_pool, at->at_heap);
break;
default:
@@ -839,40 +1006,45 @@ art_table_gc(void *null)
* art_allot(at, k, old, new);
* }
*/
-void
-art_allot(struct art_table *at, int i, struct art_node *old,
- struct art_node *new)
-{
- struct art_node *node, *dflt;
- int k = i;
+static void
+art_allot(struct art_table *at, unsigned int i,
+ art_heap_entry oahe, art_heap_entry nahe)
+{
+ art_heap_entry ahe, *ahep;
+ art_heap_entry *heap;
+ unsigned int k = i;
KASSERT(i < at->at_minfringe);
+ heap = at->at_heap;
+
again:
k <<= 1;
if (k < at->at_minfringe)
goto nonfringe;
/* Change fringe nodes. */
- while (1) {
- node = srp_get_locked(&at->at_heap[k].node);
- if (!ISLEAF(node)) {
- dflt = srp_get_locked(&SUBTABLE(node)->at_default);
- if (dflt == old) {
- srp_swap_locked(&SUBTABLE(node)->at_default,
- new);
- }
- } else if (node == old) {
- srp_swap_locked(&at->at_heap[k].node, new);
+ for (;;) {
+ ahep = &heap[k];
+ ahe = SMR_PTR_GET_LOCKED(ahep);
+
+ if (!art_heap_entry_is_node(ahe)) {
+ art_heap_entry *child = art_heap_entry_to_heap(ahe);
+ ahep = &child[ART_HEAP_IDX_DEFAULT];
+ ahe = SMR_PTR_GET_LOCKED(ahep);
}
+
+ if (ahe == oahe)
+ SMR_PTR_SET_LOCKED(ahep, nahe);
+
if (k % 2)
goto moveup;
k++;
}
nonfringe:
- node = srp_get_locked(&at->at_heap[k].node);
- if (node == old)
+ ahe = SMR_PTR_GET_LOCKED(&heap[k]);
+ if (ahe == oahe)
goto again;
moveon:
if (k % 2)
@@ -881,7 +1053,7 @@ moveon:
goto nonfringe;
moveup:
k >>= 1;
- srp_swap_locked(&at->at_heap[k].node, new);
+ SMR_PTR_SET_LOCKED(&heap[k], nahe);
/* Change non-fringe node. */
if (k != i)
@@ -889,7 +1061,7 @@ moveup:
}
struct art_node *
-art_get(uint8_t plen)
+art_get(const uint8_t *addr, unsigned int plen)
{
struct art_node *an;
@@ -897,17 +1069,25 @@ art_get(uint8_t plen)
if (an == NULL)
return (NULL);
- an->an_plen = plen;
- SRPL_INIT(&an->an_rtlist);
+ art_node_init(an, addr, plen);
return (an);
}
void
-art_put(struct art_node *an)
+art_node_init(struct art_node *an, const uint8_t *addr, unsigned int plen)
{
- KASSERT(SRPL_EMPTY_LOCKED(&an->an_rtlist));
+ size_t len;
+
+ len = roundup(plen, 8) / 8;
+ KASSERT(len <= sizeof(an->an_addr));
+ memcpy(an->an_addr, addr, len);
+ an->an_plen = plen;
+}
+void
+art_put(struct art_node *an)
+{
mtx_enter(&art_node_gc_mtx);
an->an_gc = art_node_gc_list;
art_node_gc_list = an;
@@ -919,17 +1099,17 @@ art_put(struct art_node *an)
void
art_gc(void *null)
{
- struct art_node *an, *next;
+ struct art_node *an;
mtx_enter(&art_node_gc_mtx);
an = art_node_gc_list;
art_node_gc_list = NULL;
mtx_leave(&art_node_gc_mtx);
- while (an != NULL) {
- next = an->an_gc;
+ smr_barrier();
- srp_finalize(an, "artnfini");
+ while (an != NULL) {
+ struct art_node *next = an->an_gc;
pool_put(&an_pool, an);
Index: sys/net/art.h
===================================================================
RCS file: /cvs/src/sys/net/art.h,v
diff -u -p -r1.25 art.h
--- sys/net/art.h 11 Nov 2023 12:17:50 -0000 1.25
+++ sys/net/art.h 28 Jun 2025 23:15:37 -0000
@@ -19,94 +19,151 @@
#ifndef _NET_ART_H_
#define _NET_ART_H_
-#include <sys/rwlock.h>
-#include <sys/srp.h>
+/*
+ * Allotment Routing Table (ART)
+ *
+ * Yoichi Hariguchi paper can be found at:
+ * http://www.hariguchi.org/art/art.pdf
+ *
+ * Locking:
+ *
+ * Modification (ie, art_insert or art_delete) and iteration
+ * (art_iter_next, etc) over the ART must be serialised by the caller.
+ * Lookups (ie, art_match and art_lookup) run within an SMR critical
+ * section.
+ *
+ * Iteration requires serialisation as it manipulates the reference
+ * counts on tables as it traverses the tree. The iterator maintains
+ * these references until it runs out of entries. This allows code
+ * iterating over the ART to release locks in between calls to
+ * art_iter_open and art_iter_next. The references may be dropped
+ * early with art_iter_close.
+ *
+ * Note, the iterator does not hold a reference to the art_node
+ * structure or the data hanging off the an_value pointer; they must
+ * be accounted for separately or their use must be serialised with
+ * art_delete.
+ */
-#define ART_MAXLVL 32 /* We currently use 32 levels for IPv6. */
+typedef uintptr_t art_heap_entry;
/*
- * Root of the ART tables, equivalent to the radix head.
- *
- * Locks used to protect struct members in this file:
- * I immutable after creation
- * l root's `ar_lock'
- * K kernel lock
- * For SRP related structures that allow lock-free reads, the write lock
- * is indicated below.
+ * Root of the ART, equivalent to the radix head.
*/
-struct art_root {
- struct srp ar_root; /* [l] First table */
- struct rwlock ar_lock; /* [] Serialise modifications */
- uint8_t ar_bits[ART_MAXLVL]; /* [I] Per level stride */
- uint8_t ar_nlvl; /* [I] Number of levels */
- uint8_t ar_alen; /* [I] Address length in bits */
- uint8_t ar_off; /* [I] Offset of key in bytes */
- struct sockaddr *ar_source; /* [N] use optional src addr */
-};
-#define ISLEAF(e) (((unsigned long)(e) & 1) == 0)
-#define SUBTABLE(e) ((struct art_table *)((unsigned long)(e) & ~1))
-#define ASNODE(t) ((struct art_node *)((unsigned long)(t) | 1))
+struct art {
+ art_heap_entry *art_root;
+ const unsigned int *art_levels;
+ unsigned int art_nlevels;
+ unsigned int art_alen;
+};
/*
* Allotment Table.
*/
struct art_table {
+ art_heap_entry *at_heap;
struct art_table *at_parent; /* Parent table */
- uint32_t at_index; /* Index in the parent table */
- uint32_t at_minfringe; /* Index that fringe begins */
- uint32_t at_level; /* Level of the table */
- uint8_t at_bits; /* Stride length of the table */
- uint8_t at_offset; /* Sum of parents' stride len */
-
- /*
- * Items stored in the heap are pointers to nodes, in the leaf
- * case, or tables otherwise. One exception is index 0 which
- * is a route counter.
- */
- union {
- struct srp node;
- unsigned long count;
- } *at_heap; /* Array of 2^(slen+1) items */
+
+ unsigned int at_index; /* Index in the parent table */
+ unsigned int at_minfringe; /* Index that fringe begins */
+
+ unsigned int at_level; /* Level of the table */
+ unsigned int at_bits; /* Stride length of the table */
+ unsigned int at_offset; /* Sum of parents' stride len */
+
+ unsigned int at_refcnt;
};
-#define at_refcnt at_heap[0].count/* Refcounter (1 per different route) */
-#define at_default at_heap[1].node /* Default route (was in parent heap) */
-/* Heap size for an ART table of stride length ``slen''. */
-#define AT_HEAPSIZE(slen) ((1 << ((slen) + 1)) * sizeof(void *))
+#define ART_HEAP_IDX_TABLE 0
+#define ART_HEAP_IDX_DEFAULT 1
-/*
- * Forward declaration needed for the list of mpath routes
- * attached to a single ART node.
- */
-struct rtentry;
+#define AT_HEAPSIZE(bits) ((1 << ((bits) + 1)) * sizeof(art_heap_entry))
/*
* A node is the internal representation of a route entry.
*/
struct art_node {
+ void *an_value;
union {
- SRPL_HEAD(, rtentry) an__rtlist; /* Route related to this node */
- struct art_node *an__gc; /* Entry on GC list */
- } an_pointer;
- uint8_t an_plen; /* Prefix length */
+ struct art_node *an__gc;
+ uint8_t an__addr[16];
+ } an__u;
+#define an_gc an__u.an__gc
+#define an_addr an__u.an__addr
+ unsigned int an_plen;
};
-#define an_rtlist an_pointer.an__rtlist
-#define an_gc an_pointer.an__gc
-void art_init(void);
-struct art_root *art_alloc(unsigned int, unsigned int, unsigned int);
-struct art_node *art_insert(struct art_root *, struct art_node *, const void *,
- int);
-struct art_node *art_delete(struct art_root *, struct art_node *, const void *,
- int);
-struct art_node *art_match(struct art_root *, const void *, struct srp_ref *);
-struct art_node *art_lookup(struct art_root *, const void *, int,
- struct srp_ref *);
-int art_walk(struct art_root *,
+static inline struct art_table *
+art_heap_to_table(art_heap_entry *heap)
+{
+ return ((struct art_table *)heap[ART_HEAP_IDX_TABLE]);
+}
+
+static inline int
+art_heap_entry_is_node(art_heap_entry ahe)
+{
+ return ((ahe & 1UL) == 0);
+}
+
+static inline struct art_node *
+art_heap_entry_to_node(art_heap_entry ahe)
+{
+ return ((struct art_node *)ahe);
+}
+
+static inline art_heap_entry *
+art_heap_entry_to_heap(art_heap_entry ahe)
+{
+ return ((art_heap_entry *)(ahe & ~1UL));
+}
+
+static inline art_heap_entry
+art_node_to_heap_entry(struct art_node *an)
+{
+ return ((art_heap_entry)an);
+}
+
+static inline art_heap_entry
+art_heap_to_heap_entry(art_heap_entry *heap)
+{
+ return ((art_heap_entry)heap | 1UL);
+}
+
+#ifdef _KERNEL
+void art_boot(void);
+struct art *art_alloc(unsigned int);
+void art_init(struct art *, unsigned int);
+struct art_node *art_insert(struct art *, struct art_node *);
+struct art_node *art_delete(struct art *, const void *, unsigned int);
+struct art_node *art_match(struct art *, const void *);
+struct art_node *art_lookup(struct art *, const void *, unsigned int);
+int art_is_empty(struct art *);
+
+struct art_node *art_get(const uint8_t *, unsigned int);
+void art_node_init(struct art_node *,
+ const uint8_t *, unsigned int);
+void art_put(struct art_node *);
+
+struct art_iter {
+ struct art *ai_art;
+ struct art_table *ai_table;
+ unsigned int ai_j;
+ unsigned int ai_i;
+};
+
+struct art_node *art_iter_open(struct art *, struct art_iter *);
+struct art_node *art_iter_next(struct art_iter *);
+void art_iter_close(struct art_iter *);
+
+#define ART_FOREACH(_an, _art, _ai) \
+ for ((_an) = art_iter_open((_art), (_ai)); \
+ (_an) != NULL; \
+ (_an) = art_iter_next((_ai)))
+
+int art_walk(struct art *,
int (*)(struct art_node *, void *), void *);
-struct art_node *art_get(uint8_t);
-void art_put(struct art_node *);
+#endif /* _KERNEL */
#endif /* _NET_ART_H_ */
Index: sys/net/if_wg.c
===================================================================
RCS file: /cvs/src/sys/net/if_wg.c,v
diff -u -p -r1.42 if_wg.c
--- sys/net/if_wg.c 17 Apr 2025 12:42:50 -0000 1.42
+++ sys/net/if_wg.c 28 Jun 2025 23:15:37 -0000
@@ -31,6 +31,7 @@
#include <sys/ioctl.h>
#include <sys/mbuf.h>
#include <sys/syslog.h>
+#include <sys/smr.h>
#include <net/if.h>
#include <net/if_var.h>
@@ -251,10 +252,11 @@ struct wg_softc {
struct socket *sc_so6;
#endif
+ struct rwlock sc_aip_lock;
size_t sc_aip_num;
- struct art_root *sc_aip4;
+ struct art *sc_aip4;
#ifdef INET6
- struct art_root *sc_aip6;
+ struct art *sc_aip6;
#endif
struct rwlock sc_peer_lock;
@@ -290,7 +292,7 @@ void wg_peer_counters_add(struct wg_peer
int wg_aip_add(struct wg_softc *, struct wg_peer *, struct wg_aip_io *);
struct wg_peer *
- wg_aip_lookup(struct art_root *, void *);
+ wg_aip_lookup(struct art *, void *);
int wg_aip_remove(struct wg_softc *, struct wg_peer *,
struct wg_aip_io *);
@@ -609,7 +611,7 @@ wg_peer_counters_add(struct wg_peer *pee
int
wg_aip_add(struct wg_softc *sc, struct wg_peer *peer, struct wg_aip_io *d)
{
- struct art_root *root;
+ struct art *root;
struct art_node *node;
struct wg_aip *aip;
int ret = 0;
@@ -625,8 +627,10 @@ wg_aip_add(struct wg_softc *sc, struct w
if ((aip = pool_get(&wg_aip_pool, PR_NOWAIT|PR_ZERO)) == NULL)
return ENOBUFS;
- rw_enter_write(&root->ar_lock);
- node = art_insert(root, &aip->a_node, &d->a_addr, d->a_cidr);
+ art_node_init(&aip->a_node, &d->a_addr.addr_bytes, d->a_cidr);
+
+ rw_enter_write(&sc->sc_aip_lock);
+ node = art_insert(root, &aip->a_node);
if (node == &aip->a_node) {
aip->a_peer = peer;
@@ -642,18 +646,18 @@ wg_aip_add(struct wg_softc *sc, struct w
aip->a_peer = peer;
}
}
- rw_exit_write(&root->ar_lock);
+ rw_exit_write(&sc->sc_aip_lock);
return ret;
}
struct wg_peer *
-wg_aip_lookup(struct art_root *root, void *addr)
+wg_aip_lookup(struct art *root, void *addr)
{
- struct srp_ref sr;
struct art_node *node;
- node = art_match(root, addr, &sr);
- srp_leave(&sr);
+ smr_read_enter();
+ node = art_match(root, addr);
+ smr_read_leave();
return node == NULL ? NULL : ((struct wg_aip *) node)->a_peer;
}
@@ -661,8 +665,7 @@ wg_aip_lookup(struct art_root *root, voi
int
wg_aip_remove(struct wg_softc *sc, struct wg_peer *peer, struct wg_aip_io *d)
{
- struct srp_ref sr;
- struct art_root *root;
+ struct art *root;
struct art_node *node;
struct wg_aip *aip;
int ret = 0;
@@ -675,23 +678,21 @@ wg_aip_remove(struct wg_softc *sc, struc
default: return EAFNOSUPPORT;
}
- rw_enter_write(&root->ar_lock);
- if ((node = art_lookup(root, &d->a_addr, d->a_cidr, &sr)) == NULL) {
+ rw_enter_write(&sc->sc_aip_lock);
+ if ((node = art_lookup(root, &d->a_addr, d->a_cidr)) == NULL) {
ret = ENOENT;
} else if (((struct wg_aip *) node)->a_peer != peer) {
ret = EXDEV;
} else {
aip = (struct wg_aip *)node;
- if (art_delete(root, node, &d->a_addr, d->a_cidr) == NULL)
+ if (art_delete(root, &d->a_addr, d->a_cidr) == NULL)
panic("art_delete failed to delete node %p", node);
sc->sc_aip_num--;
LIST_REMOVE(aip, a_entry);
pool_put(&wg_aip_pool, aip);
}
-
- srp_leave(&sr);
- rw_exit_write(&root->ar_lock);
+ rw_exit_write(&sc->sc_aip_lock);
return ret;
}
@@ -2726,10 +2727,11 @@ wg_clone_create(struct if_clone *ifc, in
#endif
sc->sc_aip_num = 0;
- if ((sc->sc_aip4 = art_alloc(0, 32, 0)) == NULL)
+ rw_init(&sc->sc_aip_lock, "wgaip");
+ if ((sc->sc_aip4 = art_alloc(32)) == NULL)
goto ret_02;
#ifdef INET6
- if ((sc->sc_aip6 = art_alloc(0, 128, 0)) == NULL)
+ if ((sc->sc_aip6 = art_alloc(128)) == NULL)
goto ret_03;
#endif
Index: sys/net/if_wg.h
===================================================================
RCS file: /cvs/src/sys/net/if_wg.h,v
diff -u -p -r1.6 if_wg.h
--- sys/net/if_wg.h 13 Oct 2024 00:53:21 -0000 1.6
+++ sys/net/if_wg.h 28 Jun 2025 23:15:37 -0000
@@ -46,6 +46,7 @@ struct wg_aip_io {
sa_family_t a_af;
int a_cidr;
union wg_aip_addr {
+ uint8_t addr_bytes;
struct in_addr addr_ipv4;
struct in6_addr addr_ipv6;
} a_addr;
Index: sys/net/route.h
===================================================================
RCS file: /cvs/src/sys/net/route.h,v
diff -u -p -r1.216 route.h
--- sys/net/route.h 16 Mar 2025 21:58:08 -0000 1.216
+++ sys/net/route.h 28 Jun 2025 23:15:37 -0000
@@ -40,7 +40,7 @@
* I immutable after creation
* N net lock
* X exclusive net lock, or shared net lock + kernel lock
- * R art (rtable) lock
+ * R rtable lock
* r per route entry mutex rt_mtx
* L arp/nd6/etc lock for updates, net lock for reads
* T rttimer_mtx route timer lists
@@ -117,7 +117,7 @@ struct rttimer;
struct rtentry {
struct sockaddr *rt_dest; /* [I] destination */
- SRPL_ENTRY(rtentry) rt_next; /* [R] next mpath entry to our dst */
+ struct rtentry *rt_next; /* [R] next mpath entry to our dst */
struct sockaddr *rt_gateway; /* [X] gateway address */
struct ifaddr *rt_ifa; /* [N] interface addr to use */
caddr_t rt_llinfo; /* [L] pointer to link level info or
Index: sys/net/rtable.c
===================================================================
RCS file: /cvs/src/sys/net/rtable.c,v
diff -u -p -r1.87 rtable.c
--- sys/net/rtable.c 9 Apr 2024 12:53:08 -0000 1.87
+++ sys/net/rtable.c 28 Jun 2025 23:15:37 -0000
@@ -26,12 +26,21 @@
#include <sys/queue.h>
#include <sys/domain.h>
#include <sys/srp.h>
+#include <sys/smr.h>
#endif
#include <net/rtable.h>
#include <net/route.h>
#include <net/art.h>
+struct rtable {
+ struct rwlock r_lock;
+ struct art *r_art; /* [I] */
+ unsigned int r_off; /* [I] Offset of key in bytes */
+
+ struct sockaddr *r_source; /* [N] use optional src addr */
+};
+
/*
* Structures used by rtable_get() to retrieve the corresponding
* routing table for a given pair of ``af'' and ``rtableid''.
@@ -85,8 +94,8 @@ void rtmap_dtor(void *, void *);
struct srp_gc rtmap_gc = SRP_GC_INITIALIZER(rtmap_dtor, NULL);
void rtable_init_backend(void);
-void *rtable_alloc(unsigned int, unsigned int, unsigned int);
-void *rtable_get(unsigned int, sa_family_t);
+struct rtable *rtable_alloc(unsigned int, unsigned int, unsigned int);
+struct rtable *rtable_get(unsigned int, sa_family_t);
void
rtmap_init(void)
@@ -193,7 +202,7 @@ int
rtable_add(unsigned int id)
{
const struct domain *dp;
- void *tbl;
+ struct rtable *tbl;
struct rtmap *map;
struct dommp *dmm;
sa_family_t af;
@@ -244,11 +253,11 @@ out:
return (error);
}
-void *
+struct rtable *
rtable_get(unsigned int rtableid, sa_family_t af)
{
struct rtmap *map;
- void *tbl = NULL;
+ struct rtable *tbl = NULL;
struct srp_ref sr;
if (af >= nitems(af2idx) || af2idx[af] == 0)
@@ -286,7 +295,7 @@ rtable_empty(unsigned int rtableid)
{
const struct domain *dp;
int i;
- struct art_root *tbl;
+ struct rtable *tbl;
for (i = 0; (dp = domains[i]) != NULL; i++) {
if (dp->dom_rtoffset == 0)
@@ -295,7 +304,7 @@ rtable_empty(unsigned int rtableid)
tbl = rtable_get(rtableid, dp->dom_family);
if (tbl == NULL)
continue;
- if (tbl->ar_root.ref != NULL)
+ if (!art_is_empty(tbl->r_art))
return (0);
}
@@ -350,40 +359,51 @@ rtable_l2set(unsigned int rtableid, unsi
}
-static inline const uint8_t *satoaddr(struct art_root *,
+static inline const uint8_t *satoaddr(struct rtable *,
const struct sockaddr *);
-int an_match(struct art_node *, const struct sockaddr *, int);
-void rtentry_ref(void *, void *);
-void rtentry_unref(void *, void *);
-
void rtable_mpath_insert(struct art_node *, struct rtentry *);
-struct srpl_rc rt_rc = SRPL_RC_INITIALIZER(rtentry_ref, rtentry_unref, NULL);
-
void
rtable_init_backend(void)
{
- art_init();
+ art_boot();
}
-void *
+struct rtable *
rtable_alloc(unsigned int rtableid, unsigned int alen, unsigned int off)
{
- return (art_alloc(rtableid, alen, off));
+ struct rtable *r;
+
+ r = malloc(sizeof(*r), M_RTABLE, M_NOWAIT|M_ZERO);
+ if (r == NULL)
+ return (r);
+
+
+ r->r_art = art_alloc(alen);
+ if (r->r_art == NULL) {
+ free(r, M_RTABLE, sizeof(*r));
+ return (NULL);
+ }
+
+ rw_init(&r->r_lock, "rtable");
+ r->r_off = off;
+ r->r_source = NULL;
+
+ return (r);
}
int
rtable_setsource(unsigned int rtableid, int af, struct sockaddr *src)
{
- struct art_root *ar;
+ struct rtable *r;
NET_ASSERT_LOCKED_EXCLUSIVE();
- if ((ar = rtable_get(rtableid, af)) == NULL)
+ if ((r = rtable_get(rtableid, af)) == NULL)
return (EAFNOSUPPORT);
- ar->ar_source = src;
+ r->r_source = src;
return (0);
}
@@ -391,15 +411,15 @@ rtable_setsource(unsigned int rtableid,
struct sockaddr *
rtable_getsource(unsigned int rtableid, int af)
{
- struct art_root *ar;
+ struct rtable *r;
NET_ASSERT_LOCKED();
- ar = rtable_get(rtableid, af);
- if (ar == NULL)
+ r = rtable_get(rtableid, af);
+ if (r == NULL)
return (NULL);
- return (ar->ar_source);
+ return (r->r_source);
}
void
@@ -419,37 +439,34 @@ struct rtentry *
rtable_lookup(unsigned int rtableid, const struct sockaddr *dst,
const struct sockaddr *mask, const struct sockaddr *gateway, uint8_t prio)
{
- struct art_root *ar;
+ struct rtable *r;
struct art_node *an;
struct rtentry *rt = NULL;
- struct srp_ref sr, nsr;
const uint8_t *addr;
int plen;
- ar = rtable_get(rtableid, dst->sa_family);
- if (ar == NULL)
+ r = rtable_get(rtableid, dst->sa_family);
+ if (r == NULL)
return (NULL);
- addr = satoaddr(ar, dst);
+ addr = satoaddr(r, dst);
- /* No need for a perfect match. */
+ smr_read_enter();
if (mask == NULL) {
- an = art_match(ar, addr, &nsr);
- if (an == NULL)
- goto out;
+ /* No need for a perfect match. */
+ an = art_match(r->r_art, addr);
} else {
plen = rtable_satoplen(dst->sa_family, mask);
if (plen == -1)
- return (NULL);
-
- an = art_lookup(ar, addr, plen, &nsr);
-
- /* Make sure we've got a perfect match. */
- if (!an_match(an, dst, plen))
goto out;
+
+ an = art_lookup(r->r_art, addr, plen);
}
+ if (an == NULL)
+ goto out;
- SRPL_FOREACH(rt, &sr, &an->an_rtlist, rt_next) {
+ for (rt = SMR_PTR_GET(&an->an_value); rt != NULL;
+ rt = SMR_PTR_GET(&rt->rt_next)) {
if (prio != RTP_ANY &&
(rt->rt_priority & RTP_MASK) != (prio & RTP_MASK))
continue;
@@ -464,9 +481,8 @@ rtable_lookup(unsigned int rtableid, con
if (rt != NULL)
rtref(rt);
- SRPL_LEAVE(&sr);
out:
- srp_leave(&nsr);
+ smr_read_leave();
return (rt);
}
@@ -474,44 +490,41 @@ out:
struct rtentry *
rtable_match(unsigned int rtableid, const struct sockaddr *dst, uint32_t *src)
{
- struct art_root *ar;
+ struct rtable *r;
struct art_node *an;
struct rtentry *rt = NULL;
- struct srp_ref sr, nsr;
const uint8_t *addr;
int hash;
+ uint8_t prio;
- ar = rtable_get(rtableid, dst->sa_family);
- if (ar == NULL)
+ r = rtable_get(rtableid, dst->sa_family);
+ if (r == NULL)
return (NULL);
- addr = satoaddr(ar, dst);
+ addr = satoaddr(r, dst);
- an = art_match(ar, addr, &nsr);
+ smr_read_enter();
+ an = art_match(r->r_art, addr);
if (an == NULL)
goto out;
- rt = SRPL_FIRST(&sr, &an->an_rtlist);
- if (rt == NULL) {
- SRPL_LEAVE(&sr);
- goto out;
- }
- rtref(rt);
- SRPL_LEAVE(&sr);
+ rt = SMR_PTR_GET(&an->an_value);
+ KASSERT(rt != NULL);
+ prio = rt->rt_priority;
/* Gateway selection by Hash-Threshold (RFC 2992) */
if ((hash = rt_hash(rt, dst, src)) != -1) {
struct rtentry *mrt;
- int threshold, npaths = 0;
+ int threshold, npaths = 1;
KASSERT(hash <= 0xffff);
- SRPL_FOREACH(mrt, &sr, &an->an_rtlist, rt_next) {
- /* Only count nexthops with the same priority. */
- if (mrt->rt_priority == rt->rt_priority)
+ /* Only count nexthops with the same priority. */
+ mrt = rt;
+ while ((mrt = SMR_PTR_GET(&mrt->rt_next)) != NULL) {
+ if (mrt->rt_priority == prio)
npaths++;
}
- SRPL_LEAVE(&sr);
threshold = (0xffff / npaths) + 1;
@@ -520,27 +533,22 @@ rtable_match(unsigned int rtableid, cons
* route list attached to the node, so we won't necessarily
* have the same number of routes. for most modifications,
* we'll pick a route that we wouldn't have if we only saw the
- * list before or after the change. if we were going to use
- * the last available route, but it got removed, we'll hit
- * the end of the list and then pick the first route.
+ * list before or after the change.
*/
-
- mrt = SRPL_FIRST(&sr, &an->an_rtlist);
- while (hash > threshold && mrt != NULL) {
- if (mrt->rt_priority == rt->rt_priority)
+ mrt = rt;
+ while (hash > threshold) {
+ if (mrt->rt_priority == prio) {
+ rt = mrt;
hash -= threshold;
- mrt = SRPL_FOLLOW(&sr, mrt, rt_next);
- }
-
- if (mrt != NULL) {
- rtref(mrt);
- rtfree(rt);
- rt = mrt;
+ }
+ mrt = SMR_PTR_GET(&mrt->rt_next);
+ if (mrt == NULL)
+ break;
}
- SRPL_LEAVE(&sr);
}
+ rtref(rt);
out:
- srp_leave(&nsr);
+ smr_read_leave();
return (rt);
}
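
the hash-threshold selection is easier to see with concrete numbers.
the same arithmetic as a standalone program (ht_select is made up for
illustration):

	#include <stdio.h>
	#include <stdint.h>

	/* pick which of npaths equal-priority nexthops serves a
	 * 16-bit flow hash, as in rtable_match above. */
	static int
	ht_select(uint16_t hash, int npaths)
	{
		int threshold = (0xffff / npaths) + 1;
		int idx = 0;

		while (hash > threshold) {
			hash -= threshold;
			idx++;
		}
		return (idx);
	}

	int
	main(void)
	{
		/* 3 paths carve the 16-bit hash space into thirds */
		printf("%d %d %d\n",
		    ht_select(0x1000, 3),	/* 0 */
		    ht_select(0x8000, 3),	/* 1 */
		    ht_select(0xf000, 3));	/* 2 */
		return (0);
	}
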
@@ -549,35 +557,59 @@ rtable_insert(unsigned int rtableid, str
const struct sockaddr *mask, const struct sockaddr *gateway, uint8_t prio,
struct rtentry *rt)
{
- struct rtentry *mrt;
- struct srp_ref sr;
- struct art_root *ar;
+ struct rtable *r;
struct art_node *an, *prev;
const uint8_t *addr;
int plen;
unsigned int rt_flags;
int error = 0;
- ar = rtable_get(rtableid, dst->sa_family);
- if (ar == NULL)
+ r = rtable_get(rtableid, dst->sa_family);
+ if (r == NULL)
return (EAFNOSUPPORT);
- addr = satoaddr(ar, dst);
+ addr = satoaddr(r, dst);
plen = rtable_satoplen(dst->sa_family, mask);
if (plen == -1)
return (EINVAL);
- rtref(rt); /* guarantee rtfree won't do anything during insert */
- rw_enter_write(&ar->ar_lock);
+ an = art_get(addr, plen);
+ if (an == NULL)
+ return (ENOMEM);
+
+ /* prepare for immediate operation if insert succeeds */
+ rt_flags = rt->rt_flags;
+ rt->rt_flags &= ~RTF_MPATH;
+ rt->rt_dest = dst;
+ rt->rt_plen = plen;
+ rt->rt_next = NULL;
+
+ rtref(rt); /* take a ref for the table */
+ an->an_value = rt;
+
+ rw_enter_write(&r->r_lock);
+ prev = art_insert(r->r_art, an);
+ if (prev == NULL) {
+ error = ENOMEM;
+ goto put;
+ }
+
+ if (prev != an) {
+ struct rtentry *mrt;
+ int mpathok = ISSET(rt_flags, RTF_MPATH);
+ int mpath = 0;
- /* Do not permit exactly the same dst/mask/gw pair. */
- an = art_lookup(ar, addr, plen, &sr);
- srp_leave(&sr); /* an can't go away while we have the lock */
- if (an_match(an, dst, plen)) {
- struct rtentry *mrt;
- int mpathok = ISSET(rt->rt_flags, RTF_MPATH);
+ /*
+ * An ART node with the same destination/netmask already
+ * exists.
+ */
+ art_put(an);
+ an = prev;
- SRPL_FOREACH_LOCKED(mrt, &an->an_rtlist, rt_next) {
+ /* Do not permit exactly the same dst/mask/gw pair. */
+ for (mrt = SMR_PTR_GET_LOCKED(&an->an_value);
+ mrt != NULL;
+ mrt = SMR_PTR_GET_LOCKED(&mrt->rt_next)) {
if (prio != RTP_ANY &&
(mrt->rt_priority & RTP_MASK) != (prio & RTP_MASK))
continue;
@@ -589,63 +621,34 @@ rtable_insert(unsigned int rtableid, str
error = EEXIST;
goto leave;
}
+ mpath = RTF_MPATH;
}
- }
-
- an = art_get(plen);
- if (an == NULL) {
- error = ENOBUFS;
- goto leave;
- }
-
- /* prepare for immediate operation if insert succeeds */
- rt_flags = rt->rt_flags;
- rt->rt_flags &= ~RTF_MPATH;
- rt->rt_dest = dst;
- rt->rt_plen = plen;
- SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist, rt, rt_next);
-
- prev = art_insert(ar, an, addr, plen);
- if (prev != an) {
- SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry,
- rt_next);
- rt->rt_flags = rt_flags;
- art_put(an);
-
- if (prev == NULL) {
- error = ESRCH;
- goto leave;
- }
-
- an = prev;
- mrt = SRPL_FIRST_LOCKED(&an->an_rtlist);
- KASSERT(mrt != NULL);
- KASSERT((rt->rt_flags & RTF_MPATH) || mrt->rt_priority != prio);
+ /* The new route can be added to the list. */
+ if (mpath) {
+ SET(rt->rt_flags, RTF_MPATH);
+
+ for (mrt = SMR_PTR_GET_LOCKED(&an->an_value);
+ mrt != NULL;
+ mrt = SMR_PTR_GET_LOCKED(&mrt->rt_next)) {
+ if ((mrt->rt_priority & RTP_MASK) !=
+ (prio & RTP_MASK))
+ continue;
- /*
- * An ART node with the same destination/netmask already
- * exists, MPATH conflict must have been already checked.
- */
- if (rt->rt_flags & RTF_MPATH) {
- /*
- * Only keep the RTF_MPATH flag if two routes have
- * the same gateway.
- */
- rt->rt_flags &= ~RTF_MPATH;
- SRPL_FOREACH_LOCKED(mrt, &an->an_rtlist, rt_next) {
- if (mrt->rt_priority == prio) {
- mrt->rt_flags |= RTF_MPATH;
- rt->rt_flags |= RTF_MPATH;
- }
+ SET(mrt->rt_flags, RTF_MPATH);
}
}
/* Put newly inserted entry at the right place. */
rtable_mpath_insert(an, rt);
}
+ rw_exit_write(&r->r_lock);
+ return (error);
+
+put:
+ art_put(an);
leave:
- rw_exit_write(&ar->ar_lock);
+ rw_exit_write(&r->r_lock);
rtfree(rt);
return (error);
}
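
the ordering in the insert path matters: an and an_value are fully
initialised before art_insert can make the node reachable, so a
reader can never see a half-built entry. the publish inside art.c
presumably boils down to something like this (publish_node and slotp
are hypothetical, just to show the shape):

	static void
	publish_node(struct art_node **slotp, struct art_node *n,
	    struct rtentry *rt)
	{
		n->an_value = rt;		/* init everything first */
		SMR_PTR_SET_LOCKED(slotp, n);	/* then make it visible */
	}

iirc SMR_PTR_SET_LOCKED issues membar_producer() before the store,
which is what keeps the init-then-publish order intact for readers.
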
@@ -654,118 +657,136 @@ int
rtable_delete(unsigned int rtableid, const struct sockaddr *dst,
const struct sockaddr *mask, struct rtentry *rt)
{
- struct art_root *ar;
+ struct rtable *r;
struct art_node *an;
- struct srp_ref sr;
const uint8_t *addr;
int plen;
struct rtentry *mrt;
- int npaths = 0;
- int error = 0;
- ar = rtable_get(rtableid, dst->sa_family);
- if (ar == NULL)
+ r = rtable_get(rtableid, dst->sa_family);
+ if (r == NULL)
return (EAFNOSUPPORT);
- addr = satoaddr(ar, dst);
+ addr = satoaddr(r, dst);
plen = rtable_satoplen(dst->sa_family, mask);
if (plen == -1)
return (EINVAL);
- rtref(rt); /* guarantee rtfree won't do anything under ar_lock */
- rw_enter_write(&ar->ar_lock);
- an = art_lookup(ar, addr, plen, &sr);
- srp_leave(&sr); /* an can't go away while we have the lock */
-
- /* Make sure we've got a perfect match. */
- if (!an_match(an, dst, plen)) {
- error = ESRCH;
- goto leave;
+ rw_enter_write(&r->r_lock);
+ smr_read_enter();
+ an = art_lookup(r->r_art, addr, plen);
+ smr_read_leave();
+ if (an == NULL) {
+ rw_exit_write(&r->r_lock);
+ return (ESRCH);
}
- /*
- * If other multipath route entries are still attached to
- * this ART node we only have to unlink it.
- */
- SRPL_FOREACH_LOCKED(mrt, &an->an_rtlist, rt_next)
- npaths++;
-
- if (npaths > 1) {
- KASSERT(refcnt_read(&rt->rt_refcnt) >= 1);
- SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry,
- rt_next);
-
- mrt = SRPL_FIRST_LOCKED(&an->an_rtlist);
- if (npaths == 2)
- mrt->rt_flags &= ~RTF_MPATH;
+ /* If this is the only route in the list, we can delete the node. */
+ if (SMR_PTR_GET_LOCKED(&an->an_value) == rt &&
+ SMR_PTR_GET_LOCKED(&rt->rt_next) == NULL) {
+ struct art_node *oan;
+ oan = art_delete(r->r_art, addr, plen);
+ if (oan != an)
+ panic("art %p changed shape during delete", r->r_art);
+ art_put(an);
+ /*
+ * XXX an and the rt ref could still be alive on other cpus.
+ * this currently works because of the NET_LOCK/KERNEL_LOCK
+ * but should be fixed if we want to do route lookups outside
+ * these locks. - dlg@
+ */
+ } else {
+ struct rtentry **prt;
+ struct rtentry *nrt;
+ unsigned int found = 0;
+ unsigned int npaths = 0;
- goto leave;
+ /*
+ * If other multipath route entries are still attached to
+ * this ART node, we only have to unlink this route.
+ */
+ prt = (struct rtentry **)&an->an_value;
+ while ((mrt = SMR_PTR_GET_LOCKED(prt)) != NULL) {
+ if (mrt == rt) {
+ found = 1;
+ SMR_PTR_SET_LOCKED(prt,
+ SMR_PTR_GET_LOCKED(&mrt->rt_next));
+ } else if ((mrt->rt_priority & RTP_MASK) ==
+ (rt->rt_priority & RTP_MASK)) {
+ npaths++;
+ nrt = mrt;
+ }
+ prt = &mrt->rt_next;
+ }
+ if (!found)
+ panic("removing non-existent route");
+ if (npaths == 1)
+ CLR(nrt->rt_flags, RTF_MPATH);
}
-
- if (art_delete(ar, an, addr, plen) == NULL)
- panic("art_delete failed to find node %p", an);
-
KASSERT(refcnt_read(&rt->rt_refcnt) >= 1);
- SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist, rt, rtentry, rt_next);
- art_put(an);
-
-leave:
- rw_exit_write(&ar->ar_lock);
+ rw_exit_write(&r->r_lock);
rtfree(rt);
- return (error);
-}
-
-struct rtable_walk_cookie {
- int (*rwc_func)(struct rtentry *, void *, unsigned int);
- void *rwc_arg;
- struct rtentry **rwc_prt;
- unsigned int rwc_rid;
-};
-
-/*
- * Helper for rtable_walk to keep the ART code free from any "struct rtentry".
- */
-int
-rtable_walk_helper(struct art_node *an, void *xrwc)
-{
- struct srp_ref sr;
- struct rtable_walk_cookie *rwc = xrwc;
- struct rtentry *rt;
- int error = 0;
-
- SRPL_FOREACH(rt, &sr, &an->an_rtlist, rt_next) {
- error = (*rwc->rwc_func)(rt, rwc->rwc_arg, rwc->rwc_rid);
- if (error != 0)
- break;
- }
- if (rwc->rwc_prt != NULL && rt != NULL) {
- rtref(rt);
- *rwc->rwc_prt = rt;
- }
- SRPL_LEAVE(&sr);
-
- return (error);
+ return (0);
}
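
re the XXX above: once lookups move out from under the
NET_LOCK/KERNEL_LOCK, one option is to wait out the readers before
retiring the node, eg (a sketch of a possible follow-up, not part of
this diff):

	oan = art_delete(r->r_art, addr, plen);
	KASSERT(oan == an);
	rw_exit_write(&r->r_lock);

	smr_barrier();	/* wait for current smr read sections to end */
	art_put(an);	/* no reader can still see the node */
	rtfree(rt);

smr_call() with a callback is the non-sleeping alternative if
blocking in the delete path turns out to hurt.
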
int
rtable_walk(unsigned int rtableid, sa_family_t af, struct rtentry **prt,
int (*func)(struct rtentry *, void *, unsigned int), void *arg)
{
- struct art_root *ar;
- struct rtable_walk_cookie rwc;
+ struct rtable *r;
+ struct art_iter ai;
+ struct art_node *an;
- int error;
+ int error = 0;
- ar = rtable_get(rtableid, af);
- if (ar == NULL)
+ r = rtable_get(rtableid, af);
+ if (r == NULL)
return (EAFNOSUPPORT);
- rwc.rwc_func = func;
- rwc.rwc_arg = arg;
- rwc.rwc_prt = prt;
- rwc.rwc_rid = rtableid;
+ rw_enter_write(&r->r_lock);
+ ART_FOREACH(an, r->r_art, &ai) {
+ /*
+ * ART nodes have a list of rtentries.
+ *
+ * art_iter holds references to the topology so the
+ * structure won't change under us, but it does not hold
+ * the art_node or its rtentries.
+ */
+ struct rtentry *rt = SMR_PTR_GET_LOCKED(&an->an_value);
+ rtref(rt);
- error = art_walk(ar, rtable_walk_helper, &rwc);
+ rw_exit_write(&r->r_lock);
+ do {
+ struct rtentry *nrt;
+
+ smr_read_enter();
+ /* Get ready for the next entry. */
+ nrt = SMR_PTR_GET(&rt->rt_next);
+ if (nrt != NULL)
+ rtref(nrt);
+ smr_read_leave();
+
+ error = func(rt, arg, rtableid);
+ if (error != 0) {
+ if (prt != NULL)
+ *prt = rt;
+ else
+ rtfree(rt);
+
+ if (nrt != NULL)
+ rtfree(nrt);
+
+ rw_enter_write(&r->r_lock);
+ art_iter_close(&ai);
+ rw_exit_write(&r->r_lock);
+ return (error);
+ }
+
+ rtfree(rt);
+ rt = nrt;
+ } while (rt != NULL);
+ rw_enter_write(&r->r_lock);
+ }
+ rw_exit_write(&r->r_lock);
return (error);
}
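
rtable_walk callers keep the same contract as before: func gets
called once per rtentry and a non-zero return stops the walk. a toy
caller to show the shape (count_route and count_af_inet are made up):

	static int
	count_route(struct rtentry *rt, void *arg, unsigned int rtableid)
	{
		unsigned int *count = arg;

		(*count)++;
		return (0);	/* non-zero stops the walk */
	}

	static int
	count_af_inet(unsigned int rtableid, unsigned int *count)
	{
		*count = 0;
		return (rtable_walk(rtableid, AF_INET, NULL,
		    count_route, count));
	}

note func now runs with the table lock dropped and a ref held on the
rtentry, so it is allowed to sleep.
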
@@ -774,12 +795,12 @@ struct rtentry *
rtable_iterate(struct rtentry *rt0)
{
struct rtentry *rt = NULL;
- struct srp_ref sr;
- rt = SRPL_NEXT(&sr, rt0, rt_next);
+ smr_read_enter();
+ rt = SMR_PTR_GET(&rt0->rt_next);
if (rt != NULL)
rtref(rt);
- SRPL_LEAVE(&sr);
+ smr_read_leave();
rtfree(rt0);
return (rt);
}
@@ -794,27 +815,25 @@ int
rtable_mpath_reprio(unsigned int rtableid, struct sockaddr *dst,
int plen, uint8_t prio, struct rtentry *rt)
{
- struct art_root *ar;
+ struct rtable *r;
struct art_node *an;
- struct srp_ref sr;
const uint8_t *addr;
int error = 0;
- ar = rtable_get(rtableid, dst->sa_family);
- if (ar == NULL)
+ r = rtable_get(rtableid, dst->sa_family);
+ if (r == NULL)
return (EAFNOSUPPORT);
- addr = satoaddr(ar, dst);
-
- rw_enter_write(&ar->ar_lock);
- an = art_lookup(ar, addr, plen, &sr);
- srp_leave(&sr); /* an can't go away while we have the lock */
+ addr = satoaddr(r, dst);
- /* Make sure we've got a perfect match. */
- if (!an_match(an, dst, plen)) {
+ rw_enter_write(&r->r_lock);
+ smr_read_enter();
+ an = art_lookup(r->r_art, addr, plen);
+ smr_read_leave();
+ if (an == NULL) {
error = ESRCH;
- } else if (SRPL_FIRST_LOCKED(&an->an_rtlist) == rt &&
- SRPL_NEXT_LOCKED(rt, rt_next) == NULL) {
+ } else if (SMR_PTR_GET_LOCKED(&an->an_value) == rt &&
+ SMR_PTR_GET_LOCKED(&rt->rt_next) == NULL) {
/*
* If there's only one entry on the list do not go
* through an insert/remove cycle. This is done to
@@ -823,15 +842,23 @@ rtable_mpath_reprio(unsigned int rtablei
*/
rt->rt_priority = prio;
} else {
- rtref(rt); /* keep rt alive in between remove and insert */
- SRPL_REMOVE_LOCKED(&rt_rc, &an->an_rtlist,
- rt, rtentry, rt_next);
+ struct rtentry **prt;
+ struct rtentry *mrt;
+
+ prt = (struct rtentry **)&an->an_value;
+ while ((mrt = SMR_PTR_GET_LOCKED(prt)) != NULL) {
+ if (mrt == rt)
+ break;
+ prt = &mrt->rt_next;
+ }
+ KASSERT(mrt != NULL);
+
+ SMR_PTR_SET_LOCKED(prt, SMR_PTR_GET_LOCKED(&rt->rt_next));
rt->rt_priority = prio;
rtable_mpath_insert(an, rt);
- rtfree(rt);
error = EAGAIN;
}
- rw_exit_write(&ar->ar_lock);
+ rw_exit_write(&r->r_lock);
return (error);
}
@@ -839,63 +866,21 @@ rtable_mpath_reprio(unsigned int rtablei
void
rtable_mpath_insert(struct art_node *an, struct rtentry *rt)
{
- struct rtentry *mrt, *prt = NULL;
+ struct rtentry *mrt, **prt;
uint8_t prio = rt->rt_priority;
- if ((mrt = SRPL_FIRST_LOCKED(&an->an_rtlist)) == NULL) {
- SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist, rt, rt_next);
- return;
- }
-
/* Iterate until we find the route to be placed after ``rt''. */
- while (mrt->rt_priority <= prio && SRPL_NEXT_LOCKED(mrt, rt_next)) {
- prt = mrt;
- mrt = SRPL_NEXT_LOCKED(mrt, rt_next);
- }
- if (mrt->rt_priority <= prio) {
- SRPL_INSERT_AFTER_LOCKED(&rt_rc, mrt, rt, rt_next);
- } else if (prt != NULL) {
- SRPL_INSERT_AFTER_LOCKED(&rt_rc, prt, rt, rt_next);
- } else {
- SRPL_INSERT_HEAD_LOCKED(&rt_rc, &an->an_rtlist, rt, rt_next);
- }
-}
-
-/*
- * Returns 1 if ``an'' perfectly matches (``dst'', ``plen''), 0 otherwise.
- */
-int
-an_match(struct art_node *an, const struct sockaddr *dst, int plen)
-{
- struct rtentry *rt;
- struct srp_ref sr;
- int match;
-
- if (an == NULL || an->an_plen != plen)
- return (0);
-
- rt = SRPL_FIRST(&sr, &an->an_rtlist);
- match = (rt != NULL && memcmp(rt->rt_dest, dst, dst->sa_len) == 0);
- SRPL_LEAVE(&sr);
-
- return (match);
-}
-
-void
-rtentry_ref(void *null, void *xrt)
-{
- struct rtentry *rt = xrt;
-
- rtref(rt);
-}
+ prt = (struct rtentry **)&an->an_value;
+ while ((mrt = SMR_PTR_GET_LOCKED(prt)) != NULL) {
+ if (mrt->rt_priority >= prio)
+ break;
-void
-rtentry_unref(void *null, void *xrt)
-{
- struct rtentry *rt = xrt;
+ prt = &mrt->rt_next;
+ }
- rtfree(rt);
+ SMR_PTR_SET_LOCKED(&rt->rt_next, mrt);
+ SMR_PTR_SET_LOCKED(prt, rt);
}
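
the indirect prt walk is the usual trick for a sorted singly-linked
insert with no special case for the head. the same idiom minus the
smr wrappers, as a standalone sketch (struct node and sorted_insert
are made up):

	#include <stddef.h>

	struct node {
		struct node *next;
		int prio;
	};

	static void
	sorted_insert(struct node **head, struct node *n)
	{
		struct node **pp = head, *m;

		while ((m = *pp) != NULL) {
			if (m->prio >= n->prio)
				break;
			pp = &m->next;
		}
		n->next = m;	/* NULL when inserting at the tail */
		*pp = n;
	}
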
/*
@@ -904,9 +889,9 @@ rtentry_unref(void *null, void *xrt)
* of "struct sockaddr" used by this routing table.
*/
static inline const uint8_t *
-satoaddr(struct art_root *at, const struct sockaddr *sa)
+satoaddr(struct rtable *r, const struct sockaddr *sa)
{
- return (((const uint8_t *)sa) + at->ar_off);
+ return (((const uint8_t *)sa) + r->r_off);
}
/*
Index: usr.bin/netstat/route.c
===================================================================
RCS file: /cvs/src/usr.bin/netstat/route.c,v
diff -u -p -r1.110 route.c
--- usr.bin/netstat/route.c 14 Nov 2023 10:31:22 -0000 1.110
+++ usr.bin/netstat/route.c 28 Jun 2025 23:15:37 -0000
@@ -33,13 +33,11 @@
#include <sys/types.h>
#include <sys/protosw.h>
#include <sys/socket.h>
+#include <sys/refcnt.h>
+#include <sys/srp.h>
#include <net/if.h>
#include <net/if_types.h>
-#define _KERNEL
-#include <net/art.h>
-#include <net/route.h>
-#undef _KERNEL
#include <netinet/ip_ipsp.h>
#include <netinet/in.h>
#include <arpa/inet.h>
@@ -51,10 +49,16 @@
#include <netdb.h>
#include <stdio.h>
#include <stdlib.h>
+#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <ifaddrs.h>
+#define _KERNEL
+#include <net/art.h>
+#include <net/route.h>
+#undef _KERNEL
+
#include "netstat.h"
static union {
@@ -67,8 +71,8 @@ struct rtentry rtentry;
static struct sockaddr *kgetsa(struct sockaddr *);
static struct sockaddr *plentosa(sa_family_t, int, struct sockaddr *);
-static struct art_node *getdefault(struct art_table *);
-static void p_table(struct art_table *);
+static struct art_node *getdefault(art_heap_entry *);
+static void p_table(art_heap_entry *);
static void p_artnode(struct art_node *);
static void p_krtentry(struct rtentry *);
@@ -78,7 +82,7 @@ static void p_krtentry(struct rtentry *)
void
routepr(u_long afmap, u_long af2idx, u_long af2idx_max, u_int tableid)
{
- struct art_root ar;
+ struct art art;
struct art_node *node;
struct srp *afm_head, *afm;
struct {
@@ -123,21 +127,21 @@ routepr(u_long afmap, u_long af2idx, u_l
if (tbl[tableid] == NULL)
continue;
- kread((u_long)tbl[tableid], &ar, sizeof(ar));
+ kread((u_long)tbl[tableid], &art, sizeof(art));
free(tbl);
- if (ar.ar_root.ref == NULL)
+ if (art.art_root == NULL)
continue;
pr_family(i);
pr_rthdr(i, Aflag);
- node = getdefault(ar.ar_root.ref);
+ node = getdefault(art.art_root);
if (node != NULL)
p_artnode(node);
- p_table(ar.ar_root.ref);
+ p_table(art.art_root);
}
free(afm);
@@ -196,64 +200,52 @@ plentosa(sa_family_t af, int plen, struc
}
static struct art_node *
-getdefault(struct art_table *at)
+getdefault(art_heap_entry *heap)
{
- struct art_node *node;
- struct art_table table;
- union {
- struct srp node;
- unsigned long count;
- } *heap;
-
- kread((u_long)at, &table, sizeof(table));
- heap = calloc(1, AT_HEAPSIZE(table.at_bits));
- kread((u_long)table.at_heap, heap, AT_HEAPSIZE(table.at_bits));
-
- node = heap[1].node.ref;
-
- free(heap);
-
- return (node);
+ art_heap_entry entry;
+ kread((u_long)(heap + ART_HEAP_IDX_DEFAULT), &entry, sizeof(entry));
+ return (art_heap_entry_to_node(entry));
}
static void
-p_table(struct art_table *at)
+p_table(art_heap_entry *kheap)
{
+ art_heap_entry entry, *heap, *nheap;
struct art_node *next, *node;
- struct art_table *nat, table;
- union {
- struct srp node;
- unsigned long count;
- } *heap;
+ struct art_table table;
int i, j;
- kread((u_long)at, &table, sizeof(table));
+ kread((u_long)(kheap + ART_HEAP_IDX_TABLE), &entry, sizeof(entry));
+ kread((u_long)entry, &table, sizeof(table));
heap = calloc(1, AT_HEAPSIZE(table.at_bits));
- kread((u_long)table.at_heap, heap, AT_HEAPSIZE(table.at_bits));
+ kread((u_long)kheap, heap, AT_HEAPSIZE(table.at_bits));
for (j = 1; j < table.at_minfringe; j += 2) {
for (i = (j > 2) ? j : 2; i < table.at_minfringe; i <<= 1) {
- next = heap[i >> 1].node.ref;
- node = heap[i].node.ref;
+ next = art_heap_entry_to_node(heap[i >> 1]);
+ node = art_heap_entry_to_node(heap[i]);
if (node != NULL && node != next)
p_artnode(node);
}
}
for (i = table.at_minfringe; i < table.at_minfringe << 1; i++) {
- next = heap[i >> 1].node.ref;
- node = heap[i].node.ref;
- if (!ISLEAF(node)) {
- nat = SUBTABLE(node);
- node = getdefault(nat);
- } else
- nat = NULL;
+ next = art_heap_entry_to_node(heap[i >> 1]);
+
+ entry = heap[i];
+ if (art_heap_entry_is_node(entry)) {
+ nheap = NULL;
+ node = art_heap_entry_to_node(entry);
+ } else {
+ nheap = art_heap_entry_to_heap(entry);
+ node = getdefault(nheap);
+ }
if (node != NULL && node != next)
p_artnode(node);
- if (nat != NULL)
- p_table(nat);
+ if (nheap != NULL)
+ p_table(nheap);
}
free(heap);
@@ -266,14 +258,14 @@ p_artnode(struct art_node *an)
struct rtentry *rt;
kread((u_long)an, &node, sizeof(node));
- rt = node.an_rtlist.sl_head.ref;
+ rt = node.an_value;
while (rt != NULL) {
kread((u_long)rt, &rtentry, sizeof(rtentry));
if (Aflag)
printf("%-16p ", rt);
p_krtentry(&rtentry);
- rt = rtentry.rt_next.se_next.ref;
+ rt = rtentry.rt_next;
}
}
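
for the netstat bits i'm assuming art_heap_entry is a tagged word
that holds either an art_node or a pointer to a child heap. minimal
helpers along those lines could look like this (an assumption for
illustration only; the real definitions live in net/art.h and the
tag choice here is made up):

	#include <stdint.h>

	struct art_node;

	typedef uintptr_t art_heap_entry;

	#define AHE_HEAP_TAG	0x1UL	/* made-up tag for child heaps */

	static inline int
	art_heap_entry_is_node(art_heap_entry ahe)
	{
		return ((ahe & AHE_HEAP_TAG) == 0);
	}

	static inline struct art_node *
	art_heap_entry_to_node(art_heap_entry ahe)
	{
		return ((struct art_node *)ahe);
	}

	static inline art_heap_entry *
	art_heap_entry_to_heap(art_heap_entry ahe)
	{
		return ((art_heap_entry *)(ahe & ~AHE_HEAP_TAG));
	}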