From: Theo Buehler Subject: Re: bgpd: introduce pending attr and prefix queues To: tech@openbsd.org Date: Wed, 10 Dec 2025 11:08:23 +0100 On Thu, Dec 04, 2025 at 05:14:15PM +0100, Claudio Jeker wrote: > Implement a per-peer pending prefix queue and lookup table and > a pending attribute queue and lookup table. > > Withdraws just end up in the peer pending prefix queue while for updates > the prefix is queued on a pending attribute entry which itself is queued > on the peer. This allows to aggregate multiple prefixes into a single > UPDATE message. When prefixes are added check the lookup table if there > is already an object. In such a case the prefix is first dequeued and then > readded. pend_prefix_add() is therefor a bit fiddly. > > If the attr pointer in struct pend_prefix is NULL then it is a withdraw. > The pend_attr needs to hold the aid so prefixes end up on the right queue. > If the attrs pointer is NULL the pend_attr is actually the End-of-RIB > marker. > > This replaces the red-black tree contraption used right now. Which is a > big preformance bottleneck. It's a tricky diff with all the decoupling deep in the guts of everything, but overall it looks like a clear win. Not only due to replacing expensive RB operations but also because largely independent logic becomes more isolated and therefore easier to follow. pend_prefix_add() is a bit of an exception there and indeed fiddly. Also, fewer magic flags is always good. There are a ton of possible side effects that are hard to evaluate in full detail but I failed to spot issues. I've spent quite a bit of time on the various parts of this diff and what the changes do and I think I've crossed the point of diminishing returns. Let's get it in and see how it works out. ok tb A few very minor things inline. 
> -- > :wq Claudio > > Index: bgpctl/output.c > =================================================================== > RCS file: /cvs/src/usr.sbin/bgpctl/output.c,v > diff -u -p -r1.64 output.c > --- bgpctl/output.c 2 Dec 2025 13:03:54 -0000 1.64 > +++ bgpctl/output.c 4 Dec 2025 15:01:57 -0000 > @@ -1097,6 +1097,12 @@ show_rib_mem(struct rde_memstats *stats) > stats->attr_refs); > printf("%10lld BGP attributes using %s of memory\n", > stats->attr_dcnt, fmt_mem(stats->attr_data)); > + printf("%10lld pending attribute entries using %s of memory\n", > + stats->pend_attr_cnt, fmt_mem(stats->pend_attr_cnt * > + sizeof(struct pend_attr))); > + printf("%10lld pending prefix entries using %s of memory\n", > + stats->pend_prefix_cnt, fmt_mem(stats->pend_prefix_cnt * > + sizeof(struct pend_prefix))); > printf("%10lld as-set elements in %lld tables using " > "%s of memory\n", stats->aset_nmemb, stats->aset_cnt, > fmt_mem(stats->aset_size)); > @@ -1104,6 +1110,10 @@ show_rib_mem(struct rde_memstats *stats) > stats->pset_cnt, fmt_mem(stats->pset_size)); > printf("RIB using %s of memory\n", fmt_mem(pts + > stats->prefix_cnt * sizeof(struct prefix) + > + stats->adjout_prefix_cnt * sizeof(struct adjout_prefix) + > + stats->adjout_attr_cnt * sizeof(struct adjout_attr) + > + stats->pend_prefix_cnt * sizeof(struct pend_prefix) + > + stats->pend_attr_cnt * sizeof(struct pend_attr) + > stats->rib_cnt * sizeof(struct rib_entry) + > stats->path_cnt * sizeof(struct rde_aspath) + > stats->aspath_size + stats->attr_cnt * sizeof(struct attr) + > Index: bgpctl/output_json.c > =================================================================== > RCS file: /cvs/src/usr.sbin/bgpctl/output_json.c,v > diff -u -p -r1.55 output_json.c > --- bgpctl/output_json.c 2 Dec 2025 13:03:54 -0000 1.55 > +++ bgpctl/output_json.c 4 Dec 2025 15:01:57 -0000 > @@ -909,6 +909,10 @@ json_rib_mem(struct rde_memstats *stats) > json_rib_mem_element("adjout_attr", stats->adjout_attr_cnt, > stats->adjout_attr_cnt * 
sizeof(struct adjout_attr), > stats->adjout_attr_refs); > + json_rib_mem_element("pend_attr", stats->pend_attr_cnt, > + stats->pend_attr_cnt * sizeof(struct pend_attr), UINT64_MAX); > + json_rib_mem_element("pend_prefix", stats->pend_prefix_cnt, > + stats->pend_prefix_cnt * sizeof(struct pend_prefix), UINT64_MAX); > json_rib_mem_element("rde_aspath", stats->path_cnt, > stats->path_cnt * sizeof(struct rde_aspath), > stats->path_refs); > @@ -924,6 +928,10 @@ json_rib_mem(struct rde_memstats *stats) > stats->attr_data, UINT64_MAX); > json_rib_mem_element("total", UINT64_MAX, > pts + stats->prefix_cnt * sizeof(struct prefix) + > + stats->adjout_prefix_cnt * sizeof(struct adjout_prefix) + > + stats->adjout_attr_cnt * sizeof(struct adjout_attr) + > + stats->pend_prefix_cnt * sizeof(struct pend_prefix) + > + stats->pend_attr_cnt * sizeof(struct pend_attr) + > stats->rib_cnt * sizeof(struct rib_entry) + > stats->path_cnt * sizeof(struct rde_aspath) + > stats->aspath_size + stats->attr_cnt * sizeof(struct attr) + > Index: bgpctl/output_ometric.c > =================================================================== > RCS file: /cvs/src/usr.sbin/bgpctl/output_ometric.c,v > diff -u -p -r1.19 output_ometric.c > --- bgpctl/output_ometric.c 2 Dec 2025 13:03:54 -0000 1.19 > +++ bgpctl/output_ometric.c 4 Dec 2025 15:01:57 -0000 > @@ -296,6 +296,10 @@ ometric_rib_mem(struct rde_memstats *sta > ometric_rib_mem_element("adjout_prefix", stats->adjout_prefix_cnt, > stats->adjout_prefix_cnt * sizeof(struct adjout_prefix), > UINT64_MAX); > + ometric_rib_mem_element("pend_attr", stats->pend_attr_cnt, > + stats->pend_attr_cnt * sizeof(struct pend_attr), UINT64_MAX); > + ometric_rib_mem_element("pend_prefix", stats->pend_prefix_cnt, > + stats->pend_prefix_cnt * sizeof(struct pend_prefix), UINT64_MAX); > ometric_rib_mem_element("adjout_attr", stats->adjout_attr_cnt, > stats->adjout_attr_cnt * sizeof(struct adjout_attr), > stats->adjout_attr_refs); > @@ -315,6 +319,10 @@ ometric_rib_mem(struct 
rde_memstats *sta > > ometric_rib_mem_element("total", UINT64_MAX, > pts + stats->prefix_cnt * sizeof(struct prefix) + > + stats->adjout_prefix_cnt * sizeof(struct adjout_prefix) + > + stats->adjout_attr_cnt * sizeof(struct adjout_attr) + > + stats->pend_prefix_cnt * sizeof(struct pend_prefix) + > + stats->pend_attr_cnt * sizeof(struct pend_attr) + > stats->rib_cnt * sizeof(struct rib_entry) + > stats->path_cnt * sizeof(struct rde_aspath) + > stats->aspath_size + stats->attr_cnt * sizeof(struct attr) + > Index: bgpd/bgpd.h > =================================================================== > RCS file: /cvs/src/usr.sbin/bgpd/bgpd.h,v > diff -u -p -r1.524 bgpd.h > --- bgpd/bgpd.h 3 Dec 2025 12:20:19 -0000 1.524 > +++ bgpd/bgpd.h 4 Dec 2025 15:01:57 -0000 > @@ -1392,6 +1392,8 @@ struct rde_memstats { > long long path_refs; > long long prefix_cnt; > long long adjout_prefix_cnt; > + long long pend_prefix_cnt; > + long long pend_attr_cnt; > long long rib_cnt; > long long pt_cnt[AID_MAX]; > long long pt_size[AID_MAX]; > Index: bgpd/rde.c > =================================================================== > RCS file: /cvs/src/usr.sbin/bgpd/rde.c,v > diff -u -p -r1.673 rde.c > --- bgpd/rde.c 3 Dec 2025 12:20:19 -0000 1.673 > +++ bgpd/rde.c 4 Dec 2025 15:04:40 -0000 > @@ -2938,16 +2938,18 @@ rde_dump_rib_as(struct prefix *p, struct > > static void > rde_dump_adjout_as(struct rde_peer *peer, struct adjout_prefix *p, > - struct rde_aspath *asp, pid_t pid, int flags) > + struct adjout_attr *attrs, pid_t pid, int flags) > { > struct ctl_show_rib rib; > struct ibuf *wbuf; > struct attr *a; > + struct rde_aspath *asp; > struct nexthop *nexthop; > size_t aslen; > uint8_t l; > > - nexthop = adjout_prefix_nexthop(p); > + nexthop = attrs->nexthop; > + asp = attrs->aspath; > memset(&rib, 0, sizeof(rib)); > rib.local_pref = asp->lpref; > rib.med = asp->med; > @@ -2989,7 +2991,7 @@ rde_dump_adjout_as(struct rde_peer *peer > imsg_close(ibuf_se_ctl, wbuf); > > if (flags & F_CTL_DETAIL) 
{ > - struct rde_community *comm = adjout_prefix_communities(p); > + struct rde_community *comm = attrs->communities; > size_t len = comm->nentries * sizeof(struct community); > if (comm->nentries > 0) { > if (imsg_compose(ibuf_se_ctl, > @@ -3073,14 +3075,14 @@ rde_dump_filter(struct prefix *p, struct > > static void > rde_dump_adjout_filter(struct rde_peer *peer, struct adjout_prefix *p, > - struct ctl_show_rib_request *req) > + struct adjout_attr *attrs, struct ctl_show_rib_request *req) > { > struct rde_aspath *asp; > > if (!rde_match_peer(peer, &req->neighbor)) > return; > > - asp = adjout_prefix_aspath(p); > + asp = attrs->aspath; > if ((req->flags & F_CTL_HAS_PATHID)) { > /* Match against the transmit path id if adjout is used. */ > if (req->path_id != p->path_id_tx) > @@ -3090,12 +3092,11 @@ rde_dump_adjout_filter(struct rde_peer * > !aspath_match(asp->aspath, &req->as, 0)) > return; > if (req->community.flags != 0) { > - if (!community_match(adjout_prefix_communities(p), > - &req->community, NULL)) > + if (!community_match(attrs->communities, &req->community, NULL)) > return; > } > /* in the adj-rib-out, skip matching against roa and aspa state */ > - rde_dump_adjout_as(peer, p, asp, req->pid, req->flags); > + rde_dump_adjout_as(peer, p, attrs, req->pid, req->flags); > } > > static void > @@ -3115,13 +3116,13 @@ rde_dump_adjout_upcall(struct adjout_pre > { > struct rde_dump_ctx *ctx = ptr; > struct rde_peer *peer; > + struct adjout_attr *attrs; > > if ((peer = peer_get(ctx->peerid)) == NULL) > return; > - if (p->flags & PREFIX_ADJOUT_FLAG_WITHDRAW) > - return; > > - rde_dump_adjout_filter(peer, p, &ctx->req); > + attrs = p->attrs; > + rde_dump_adjout_filter(peer, p, attrs, &ctx->req); I'm not sure what the attrs variable buys us here, but maybe that's preparing an upcoming change. You do the same in peer_blast_upcall() and it looks a bit odd to me. 
> } > > static int > @@ -3561,8 +3562,8 @@ rde_update_queue_pending(void) > if (peer->throttled) > continue; > for (aid = AID_MIN; aid < AID_MAX; aid++) { > - if (!RB_EMPTY(&peer->updates[aid]) || > - !RB_EMPTY(&peer->withdraws[aid])) > + if (!TAILQ_EMPTY(&peer->updates[aid]) || > + !TAILQ_EMPTY(&peer->withdraws[aid])) > return 1; > } > } > @@ -3585,7 +3586,7 @@ rde_update_queue_runner(uint8_t aid) > continue; > if (peer->throttled) > continue; > - if (RB_EMPTY(&peer->withdraws[aid])) > + if (TAILQ_EMPTY(&peer->withdraws[aid])) > continue; > > up_dump_withdraws(ibuf_se, peer, aid); > @@ -3605,7 +3606,7 @@ rde_update_queue_runner(uint8_t aid) > continue; > if (peer->throttled) > continue; > - if (RB_EMPTY(&peer->updates[aid])) > + if (TAILQ_EMPTY(&peer->updates[aid])) > continue; > > if (up_is_eor(peer, aid)) { > Index: bgpd/rde.h > =================================================================== > RCS file: /cvs/src/usr.sbin/bgpd/rde.h,v > diff -u -p -r1.327 rde.h > --- bgpd/rde.h 2 Dec 2025 13:03:35 -0000 1.327 > +++ bgpd/rde.h 4 Dec 2025 15:58:51 -0000 > @@ -27,6 +27,7 @@ > > #include "bgpd.h" > #include "log.h" > +#include "chash.h" I'd sort these alphabetically > > /* rde internal structures */ > > @@ -71,9 +72,13 @@ struct rib { > * Currently I assume that we can do that with the neighbor_ip... 
> */ > RB_HEAD(peer_tree, rde_peer); > -RB_HEAD(prefix_tree, adjout_prefix); > RB_HEAD(prefix_index, adjout_prefix); > > +CH_HEAD(pend_prefix_hash, pend_prefix); > +TAILQ_HEAD(pend_prefix_queue, pend_prefix); > +CH_HEAD(pend_attr_hash, pend_prefix); > +TAILQ_HEAD(pend_attr_queue, pend_attr); Typo in the second CH_HEAD: should be CH_HEAD(pend_attr_hash, pend_attr) - pend_prefix looks like a copy-paste slip. CH_PROTOTYPE and CH_GENERATE for pend_attr_hash in rde_adjout.c use pend_attr. > + > struct rde_peer { > RB_ENTRY(rde_peer) entry; > struct peer_config conf; > @@ -84,8 +89,10 @@ struct rde_peer { > struct capabilities capa; > struct addpath_eval eval; > struct prefix_index adj_rib_out; > - struct prefix_tree updates[AID_MAX]; > - struct prefix_tree withdraws[AID_MAX]; > + struct pend_prefix_hash pend_prefixes; > + struct pend_attr_hash pend_attrs; > + struct pend_attr_queue updates[AID_MAX]; > + struct pend_prefix_queue withdraws[AID_MAX]; The order is slightly strange but I understand why you did it this way. > struct filter_head *out_rules; > struct ibufqueue *ibufq; > monotime_t staletime[AID_MAX]; > @@ -311,20 +318,31 @@ struct adjout_attr { > }; > > struct adjout_prefix { > - RB_ENTRY(adjout_prefix) index, update; > + RB_ENTRY(adjout_prefix) index; > struct pt_entry *pt; > struct adjout_attr *attrs; > uint32_t path_id_tx; > uint8_t flags; > }; > -#define PREFIX_ADJOUT_FLAG_WITHDRAW 0x01 /* enqueued on withdraw queue */ > -#define PREFIX_ADJOUT_FLAG_UPDATE 0x02 /* enqueued on update queue */ > #define PREFIX_ADJOUT_FLAG_DEAD 0x04 /* locked but removed */ > #define PREFIX_ADJOUT_FLAG_STALE 0x08 /* stale entry (for addpath) */ > #define PREFIX_ADJOUT_FLAG_MASK 0x0f /* mask for the prefix types */ > -#define PREFIX_ADJOUT_FLAG_EOR 0x10 /* prefix is EoR */ > #define PREFIX_ADJOUT_FLAG_LOCKED 0x20 /* locked by rib walker */ > + > +struct pend_attr { > + TAILQ_ENTRY(pend_attr) entry; > + struct pend_prefix_queue prefixes; > + struct adjout_attr *attrs; > + uint8_t aid; > +}; > + > +struct pend_prefix { > + TAILQ_ENTRY(pend_prefix) entry; > + struct pt_entry *pt; > + struct pend_attr *attrs; > + uint32_t path_id_tx; > +}; > + > struct filterstate { > struct 
rde_aspath aspath; > struct rde_community communities; > @@ -637,8 +655,6 @@ struct prefix *prefix_bypeer(struct rib_ > uint32_t); > void prefix_destroy(struct prefix *); > > -RB_PROTOTYPE(prefix_tree, adjout_prefix, entry, prefix_cmp) > - > static inline struct rde_peer * > prefix_peer(struct prefix *p) > { > @@ -750,24 +766,14 @@ int adjout_prefix_dump_subtree(struct > struct bgpd_addr *, uint8_t, unsigned int, void *, > void (*)(struct adjout_prefix *, void *), > void (*)(void *, uint8_t), int (*)(void *)); > +void adjout_peer_init(struct rde_peer *); > > -static inline struct rde_aspath * > -adjout_prefix_aspath(struct adjout_prefix *p) > -{ > - return (p->attrs->aspath); > -} > - > -static inline struct rde_community * > -adjout_prefix_communities(struct adjout_prefix *p) > -{ > - return (p->attrs->communities); > -} > - > -static inline struct nexthop * > -adjout_prefix_nexthop(struct adjout_prefix *p) > -{ > - return (p->attrs->nexthop); > -} > +void pend_attr_done(struct pend_attr *, struct rde_peer *); > +void pend_eor_add(struct rde_peer *, uint8_t); > +void pend_prefix_add(struct rde_peer *, struct adjout_attr *, > + struct pt_entry *, uint32_t); > +void pend_prefix_free(struct pend_prefix *, > + struct pend_prefix_queue *, struct rde_peer *); > > /* rde_update.c */ > void up_generate_updates(struct rde_peer *, struct rib_entry *); > Index: bgpd/rde_adjout.c > =================================================================== > RCS file: /cvs/src/usr.sbin/bgpd/rde_adjout.c,v > diff -u -p -r1.8 rde_adjout.c > --- bgpd/rde_adjout.c 2 Dec 2025 13:03:35 -0000 1.8 > +++ bgpd/rde_adjout.c 4 Dec 2025 16:09:14 -0000 > @@ -30,6 +30,228 @@ > #include "log.h" > #include "chash.h" > > +static struct adjout_attr *adjout_attr_ref(struct adjout_attr *); > +static void adjout_attr_unref(struct adjout_attr *); > + > +static uint64_t pendkey; > + > +static inline uint64_t > +pend_prefix_hash(const struct pend_prefix *pp) > +{ > + uint64_t h = pendkey; > + > + h = 
ch_qhash64(h, (uintptr_t)pp->pt); > + h = ch_qhash64(h, pp->path_id_tx); > + return h; > +} > + > +static inline uint64_t > +pend_attr_hash(const struct pend_attr *pa) > +{ > + uint64_t h = pendkey; > + > + h = ch_qhash64(h, (uintptr_t)pa->attrs); > + h = ch_qhash64(h, pa->aid); > + return h; > +} > + > +CH_PROTOTYPE(pend_prefix_hash, pend_prefix, pend_prefix_hash); > +CH_PROTOTYPE(pend_attr_hash, pend_attr, pend_attr_hash); > + > +/* pending prefix queue functions */ > +static struct pend_attr * > +pend_attr_alloc(struct adjout_attr *attrs, uint8_t aid, > + struct rde_peer *peer) > +{ > + struct pend_attr *pa; > + > + if ((pa = calloc(1, sizeof(*pa))) == NULL) > + fatal(__func__); > + rdemem.pend_attr_cnt++; > + TAILQ_INIT(&pa->prefixes); > + if (attrs) > + pa->attrs = adjout_attr_ref(attrs); > + pa->aid = aid; > + > + TAILQ_INSERT_TAIL(&peer->updates[aid], pa, entry); > + if (CH_INSERT(pend_attr_hash, &peer->pend_attrs, pa, NULL) != 1) > + fatalx("corrupted pending attr hash table"); > + return pa; > +} > + > +static void > +pend_attr_free(struct pend_attr *pa, struct rde_peer *peer) > +{ > + if (!TAILQ_EMPTY(&pa->prefixes)) { > + log_warnx("freeing not empty pending attribute"); > + abort(); > + } > + > + TAILQ_REMOVE(&peer->updates[pa->aid], pa, entry); > + CH_REMOVE(pend_attr_hash, &peer->pend_attrs, pa); > + > + if (pa->attrs != NULL) > + adjout_attr_unref(pa->attrs); > + > + rdemem.pend_attr_cnt--; > + free(pa); > +} > + > +void > +pend_attr_done(struct pend_attr *pa, struct rde_peer *peer) > +{ > + if (pa == NULL) > + return; > + if (TAILQ_EMPTY(&pa->prefixes)) > + pend_attr_free(pa, peer); > +} > + > +static struct pend_attr * > +pend_attr_lookup(struct rde_peer *peer, struct adjout_attr *attrs, uint8_t aid) > +{ > + struct pend_attr needle = { .attrs = attrs, .aid = aid }; > + > + return CH_FIND(pend_attr_hash, &peer->pend_attrs, &needle); > +} > + > +static inline int > +pend_attr_eq(const struct pend_attr *a, const struct pend_attr *b) > +{ > + if 
(a->attrs != b->attrs) > + return 0; > + if (a->aid != b->aid) > + return 0; > + return 1; > +} > + > +CH_GENERATE(pend_attr_hash, pend_attr, pend_attr_eq, pend_attr_hash); > + > +/* > + * Insert an End-of-RIB marker into the update queue. > + */ > +void > +pend_eor_add(struct rde_peer *peer, uint8_t aid) > +{ > + struct pend_attr *pa; > + > + pa = pend_attr_lookup(peer, NULL, aid); > + if (pa == NULL) > + pa = pend_attr_alloc(NULL, aid, peer); > +} > + > + > +static struct pend_prefix *pend_prefix_alloc(struct pend_attr *, > + struct pt_entry *, uint32_t); > + > +static struct pend_prefix * > +pend_prefix_lookup(struct rde_peer *peer, struct pt_entry *pt, > + uint32_t path_id_tx) > +{ > + struct pend_prefix needle = { .pt = pt, .path_id_tx = path_id_tx }; > + > + return CH_FIND(pend_prefix_hash, &peer->pend_prefixes, &needle); > +} > + > +static void > +pend_prefix_remove(struct pend_prefix *pp, struct pend_prefix_queue *head, > + struct rde_peer *peer) > +{ > + if (CH_REMOVE(pend_prefix_hash, &peer->pend_prefixes, pp) != pp) { > + log_warnx("missing pending prefix in hash table"); > + abort(); > + } > + TAILQ_REMOVE(head, pp, entry); > + > + if (pp->attrs == NULL) { > + peer->stats.pending_withdraw--; > + } else { > + peer->stats.pending_update--; > + } > + pp->attrs = NULL; > +} > + > +void > +pend_prefix_add(struct rde_peer *peer, struct adjout_attr *attrs, > + struct pt_entry *pt, uint32_t path_id_tx) > +{ > + struct pend_attr *pa = NULL, *oldpa = NULL; > + struct pend_prefix *pp; > + struct pend_prefix_queue *head; > + > + if (attrs != NULL) { > + pa = pend_attr_lookup(peer, attrs, pt->aid); > + if (pa == NULL) > + pa = pend_attr_alloc(attrs, pt->aid, peer); > + } > + > + pp = pend_prefix_lookup(peer, pt, path_id_tx); > + if (pp == NULL) { > + pp = pend_prefix_alloc(pa, pt, path_id_tx); > + } else { > + if (pp->attrs == NULL) > + head = &peer->withdraws[pt->aid]; > + else > + head = &pp->attrs->prefixes; > + oldpa = pp->attrs; > + pend_prefix_remove(pp, head, 
peer); > + pp->attrs = pa; > + } > + > + if (pa == NULL) { > + head = &peer->withdraws[pt->aid]; > + peer->stats.pending_withdraw++; > + } else { > + head = &pa->prefixes; > + peer->stats.pending_update++; > + } > + > + TAILQ_INSERT_TAIL(head, pp, entry); > + if (CH_INSERT(pend_prefix_hash, &peer->pend_prefixes, pp, NULL) != 1) { > + log_warnx("corrupted pending prefix hash table"); > + abort(); > + } > + > + pend_attr_done(oldpa, peer); > +} > + > +static struct pend_prefix * > +pend_prefix_alloc(struct pend_attr *attrs, struct pt_entry *pt, > + uint32_t path_id_tx) > +{ > + struct pend_prefix *pp; > + > + if ((pp = calloc(1, sizeof(*pp))) == NULL) > + fatal(__func__); > + rdemem.pend_prefix_cnt++; > + pp->pt = pt_ref(pt); > + pp->attrs = attrs; > + pp->path_id_tx = path_id_tx; > + > + return pp; > +} > + > +void > +pend_prefix_free(struct pend_prefix *pp, struct pend_prefix_queue *head, > + struct rde_peer *peer) > +{ > + pend_prefix_remove(pp, head, peer); > + pt_unref(pp->pt); > + rdemem.pend_prefix_cnt--; > + free(pp); > +} > + > +static inline int > +pend_prefix_eq(const struct pend_prefix *a, const struct pend_prefix *b) > +{ > + if (a->pt != b->pt) > + return 0; > + if (a->path_id_tx != b->path_id_tx) > + return 0; > + return 1; > +} > + > +CH_GENERATE(pend_prefix_hash, pend_prefix, pend_prefix_eq, pend_prefix_hash); > + > /* adj-rib-out specific functions */ > static uint64_t attrkey; > > @@ -68,6 +290,7 @@ void > adjout_init(void) > { > arc4random_buf(&attrkey, sizeof(attrkey)); > + arc4random_buf(&pendkey, sizeof(pendkey)); > } > > /* Alloc, init and add a new entry into the has table. May not fail. 
*/ > @@ -77,8 +300,7 @@ adjout_attr_alloc(struct rde_aspath *asp > { > struct adjout_attr *a; > > - a = calloc(1, sizeof(*a)); > - if (a == NULL) > + if ((a = calloc(1, sizeof(*a))) == NULL) > fatal(__func__); > rdemem.adjout_attr_cnt++; > > @@ -115,7 +337,7 @@ adjout_attr_free(struct adjout_attr *a) > } > > static struct adjout_attr * > -adjout_attr_ref(struct adjout_attr *attrs, struct rde_peer *peer) > +adjout_attr_ref(struct adjout_attr *attrs) > { > attrs->refcnt++; > rdemem.adjout_attr_refs++; > @@ -123,7 +345,7 @@ adjout_attr_ref(struct adjout_attr *attr > } > > static void > -adjout_attr_unref(struct adjout_attr *attrs, struct rde_peer *peer) > +adjout_attr_unref(struct adjout_attr *attrs) > { > attrs->refcnt--; > rdemem.adjout_attr_refs--; > @@ -217,22 +439,6 @@ prefix_index_cmp(struct adjout_prefix *a > return 0; > } > > -static inline int > -prefix_cmp(struct adjout_prefix *a, struct adjout_prefix *b) > -{ > - if ((a->flags & PREFIX_ADJOUT_FLAG_EOR) != > - (b->flags & PREFIX_ADJOUT_FLAG_EOR)) > - return (a->flags & PREFIX_ADJOUT_FLAG_EOR) ? 1 : -1; > - /* if EOR marker no need to check the rest */ > - if (a->flags & PREFIX_ADJOUT_FLAG_EOR) > - return 0; > - > - if (a->attrs != b->attrs) > - return (a->attrs > b->attrs ? 1 : -1); > - return prefix_index_cmp(a, b); > -} > - > -RB_GENERATE(prefix_tree, adjout_prefix, update, prefix_cmp) > RB_GENERATE_STATIC(prefix_index, adjout_prefix, index, prefix_index_cmp) > > /* > @@ -328,22 +534,6 @@ adjout_prefix_match(struct rde_peer *pee > } > > /* > - * Insert an End-of-RIB marker into the update queue. 
> - */ > -void > -prefix_add_eor(struct rde_peer *peer, uint8_t aid) > -{ > - struct adjout_prefix *p; > - > - p = adjout_prefix_alloc(); > - p->flags = PREFIX_ADJOUT_FLAG_UPDATE | PREFIX_ADJOUT_FLAG_EOR; > - if (RB_INSERT(prefix_tree, &peer->updates[aid], p) != NULL) > - /* no need to add if EoR marker already present */ > - adjout_prefix_free(p); > - /* EOR marker is not inserted into the adj_rib_out index */ > -} > - > -/* > * Put a prefix from the Adj-RIB-Out onto the update queue. > */ > void > @@ -364,38 +554,28 @@ adjout_prefix_update(struct adjout_prefi > fatalx("%s: RB index invariant violated", __func__); > } > > - if ((p->flags & (PREFIX_ADJOUT_FLAG_WITHDRAW | > - PREFIX_ADJOUT_FLAG_DEAD)) == 0) { > + if ((p->flags & (PREFIX_ADJOUT_FLAG_DEAD)) == 0) { extra parens around PREFIX_ADJOUT_FLAG_DEAD > /* > * XXX for now treat a different path_id_tx like different > * attributes and force out an update. It is unclear how > * common it is to have equivalent updates from alternative > * paths. 
> */ > + attrs = p->attrs; > if (p->path_id_tx == path_id_tx && > - adjout_prefix_nexthop(p) == state->nexthop && > + attrs->nexthop == state->nexthop && > communities_equal(&state->communities, > - adjout_prefix_communities(p)) && > - path_equal(&state->aspath, adjout_prefix_aspath(p))) { > + attrs->communities) && > + path_equal(&state->aspath, attrs->aspath)) { > /* nothing changed */ > p->flags &= ~PREFIX_ADJOUT_FLAG_STALE; > return; > } > > - /* if pending update unhook it before it is unlinked */ > - if (p->flags & PREFIX_ADJOUT_FLAG_UPDATE) { > - RB_REMOVE(prefix_tree, &peer->updates[pte->aid], p); > - peer->stats.pending_update--; > - } > - > /* unlink prefix so it can be relinked below */ > adjout_prefix_unlink(p, peer); > peer->stats.prefix_out_cnt--; > } > - if (p->flags & PREFIX_ADJOUT_FLAG_WITHDRAW) { > - RB_REMOVE(prefix_tree, &peer->withdraws[pte->aid], p); > - peer->stats.pending_withdraw--; > - } > > /* nothing needs to be done for PREFIX_ADJOUT_FLAG_DEAD and STALE */ > p->flags &= ~PREFIX_ADJOUT_FLAG_MASK; > @@ -417,10 +597,7 @@ adjout_prefix_update(struct adjout_prefi > if (p->flags & PREFIX_ADJOUT_FLAG_MASK) > fatalx("%s: bad flags %x", __func__, p->flags); > if (peer_is_up(peer)) { > - p->flags |= PREFIX_ADJOUT_FLAG_UPDATE; > - if (RB_INSERT(prefix_tree, &peer->updates[pte->aid], p) != NULL) > - fatalx("%s: RB tree invariant violated", __func__); > - peer->stats.pending_update++; > + pend_prefix_add(peer, p->attrs, p->pt, p->path_id_tx); > } > } > > @@ -431,59 +608,17 @@ adjout_prefix_update(struct adjout_prefi > void > adjout_prefix_withdraw(struct rde_peer *peer, struct adjout_prefix *p) > { > - /* already a withdraw, shortcut */ > - if (p->flags & PREFIX_ADJOUT_FLAG_WITHDRAW) { > - p->flags &= ~PREFIX_ADJOUT_FLAG_STALE; > - return; > - } > - /* pending update just got withdrawn */ > - if (p->flags & PREFIX_ADJOUT_FLAG_UPDATE) { > - RB_REMOVE(prefix_tree, &peer->updates[p->pt->aid], p); > - peer->stats.pending_update--; > - } > - /* unlink 
prefix if it was linked (not a withdraw or dead) */ > - if ((p->flags & (PREFIX_ADJOUT_FLAG_WITHDRAW | > - PREFIX_ADJOUT_FLAG_DEAD)) == 0) { > - adjout_prefix_unlink(p, peer); > - peer->stats.prefix_out_cnt--; > - } > + if (peer_is_up(peer)) > + pend_prefix_add(peer, NULL, p->pt, p->path_id_tx); > > - /* nothing needs to be done for PREFIX_ADJOUT_FLAG_DEAD and STALE */ > - p->flags &= ~PREFIX_ADJOUT_FLAG_MASK; > - > - if (peer_is_up(peer)) { > - p->flags |= PREFIX_ADJOUT_FLAG_WITHDRAW; > - if (RB_INSERT(prefix_tree, &peer->withdraws[p->pt->aid], > - p) != NULL) > - fatalx("%s: RB tree invariant violated", __func__); > - peer->stats.pending_withdraw++; > - } else { > - /* mark prefix dead to skip unlink on destroy */ > - p->flags |= PREFIX_ADJOUT_FLAG_DEAD; > - adjout_prefix_destroy(peer, p); > - } > + adjout_prefix_destroy(peer, p); > } > > void > adjout_prefix_destroy(struct rde_peer *peer, struct adjout_prefix *p) > { > - if (p->flags & PREFIX_ADJOUT_FLAG_EOR) { > - /* EOR marker is not linked in the index */ > - adjout_prefix_free(p); > - return; > - } > - > - if (p->flags & PREFIX_ADJOUT_FLAG_WITHDRAW) { > - RB_REMOVE(prefix_tree, &peer->withdraws[p->pt->aid], p); > - peer->stats.pending_withdraw--; > - } > - if (p->flags & PREFIX_ADJOUT_FLAG_UPDATE) { > - RB_REMOVE(prefix_tree, &peer->updates[p->pt->aid], p); > - peer->stats.pending_update--; > - } > - /* unlink prefix if it was linked (not a withdraw or dead) */ > - if ((p->flags & (PREFIX_ADJOUT_FLAG_WITHDRAW | > - PREFIX_ADJOUT_FLAG_DEAD)) == 0) { > + /* unlink prefix if it was linked (not dead) */ > + if ((p->flags & PREFIX_ADJOUT_FLAG_DEAD) == 0) { > adjout_prefix_unlink(p, peer); > peer->stats.prefix_out_cnt--; > } > @@ -505,21 +640,19 @@ adjout_prefix_destroy(struct rde_peer *p > void > adjout_prefix_flush_pending(struct rde_peer *peer) > { > - struct adjout_prefix *p, *np; > + struct pend_attr *pa, *npa; > + struct pend_prefix *pp, *npp; > uint8_t aid; > > for (aid = AID_MIN; aid < AID_MAX; aid++) { > 
- RB_FOREACH_SAFE(p, prefix_tree, &peer->withdraws[aid], np) { > - adjout_prefix_destroy(peer, p); > + TAILQ_FOREACH_SAFE(pp, &peer->withdraws[aid], entry, npp) { > + pend_prefix_free(pp, &peer->withdraws[aid], peer); > } > - RB_FOREACH_SAFE(p, prefix_tree, &peer->updates[aid], np) { > - p->flags &= ~PREFIX_ADJOUT_FLAG_UPDATE; > - RB_REMOVE(prefix_tree, &peer->updates[aid], p); > - if (p->flags & PREFIX_ADJOUT_FLAG_EOR) { > - adjout_prefix_destroy(peer, p); > - } else { > - peer->stats.pending_update--; > + TAILQ_FOREACH_SAFE(pa, &peer->updates[aid], entry, npa) { > + TAILQ_FOREACH_SAFE(pp, &pa->prefixes, entry, npp) { > + pend_prefix_free(pp, &pa->prefixes, peer); > } > + pend_attr_done(pa, peer); > } > } > } > @@ -690,7 +823,7 @@ static void > adjout_prefix_link(struct adjout_prefix *p, struct rde_peer *peer, > struct adjout_attr *attrs, struct pt_entry *pt, uint32_t path_id_tx) > { > - p->attrs = adjout_attr_ref(attrs, peer); > + p->attrs = adjout_attr_ref(attrs); > p->pt = pt_ref(pt); > p->path_id_tx = path_id_tx; > } > @@ -702,7 +835,7 @@ static void > adjout_prefix_unlink(struct adjout_prefix *p, struct rde_peer *peer) > { > /* destroy all references to other objects */ > - adjout_attr_unref(p->attrs, peer); > + adjout_attr_unref(p->attrs); > p->attrs = NULL; > pt_unref(p->pt); > /* must keep p->pt valid since there is an extra ref */ > @@ -727,4 +860,17 @@ adjout_prefix_free(struct adjout_prefix > { > rdemem.adjout_prefix_cnt--; > free(p); > +} > + > +void > +adjout_peer_init(struct rde_peer *peer) > +{ > + unsigned int i; > + > + CH_INIT(pend_attr_hash, &peer->pend_attrs); > + CH_INIT(pend_prefix_hash, &peer->pend_prefixes); > + for (i = 0; i < nitems(peer->updates); i++) > + TAILQ_INIT(&peer->updates[i]); > + for (i = 0; i < nitems(peer->withdraws); i++) > + TAILQ_INIT(&peer->withdraws[i]); > } > Index: bgpd/rde_peer.c > =================================================================== > RCS file: /cvs/src/usr.sbin/bgpd/rde_peer.c,v > diff -u -p -r1.59 
rde_peer.c > --- bgpd/rde_peer.c 2 Dec 2025 13:03:35 -0000 1.59 > +++ bgpd/rde_peer.c 4 Dec 2025 15:03:48 -0000 > @@ -175,6 +175,7 @@ peer_add(uint32_t id, struct peer_config > if ((peer->ibufq = ibufq_new()) == NULL) > fatal(NULL); > > + adjout_peer_init(peer); > peer_apply_out_filter(peer, rules); > > /* > @@ -520,15 +521,11 @@ static void > peer_blast_upcall(struct adjout_prefix *p, void *ptr) > { > struct rde_peer *peer = ptr; > + struct adjout_attr *attrs = NULL; > > - if ((p->flags & PREFIX_ADJOUT_FLAG_MASK) == 0) { > - /* put entries on the update queue if not already on a queue */ > - p->flags |= PREFIX_ADJOUT_FLAG_UPDATE; > - if (RB_INSERT(prefix_tree, &peer->updates[p->pt->aid], > - p) != NULL) > - fatalx("%s: RB tree invariant violated", __func__); > - peer->stats.pending_update++; > - } > + attrs = p->attrs; > + > + pend_prefix_add(peer, attrs, p->pt, p->path_id_tx); > } > > /* > @@ -543,7 +540,7 @@ peer_blast_done(void *ptr, uint8_t aid) > /* Adj-RIB-Out ready, unthrottle peer and inject EOR */ > peer->throttled = 0; > if (peer->capa.grestart.restart) > - prefix_add_eor(peer, aid); > + pend_eor_add(peer, aid); > } > > /* > Index: bgpd/rde_update.c > =================================================================== > RCS file: /cvs/src/usr.sbin/bgpd/rde_update.c,v > diff -u -p -r1.187 rde_update.c > --- bgpd/rde_update.c 3 Dec 2025 10:00:15 -0000 1.187 > +++ bgpd/rde_update.c 4 Dec 2025 15:59:28 -0000 > @@ -799,17 +799,11 @@ up_generate_attr(struct ibuf *buf, struc > int > up_is_eor(struct rde_peer *peer, uint8_t aid) > { > - struct adjout_prefix *p; > + struct pend_attr *pa; > > - p = RB_MIN(prefix_tree, &peer->updates[aid]); > - if (p != NULL && (p->flags & PREFIX_ADJOUT_FLAG_EOR)) { > - /* > - * Need to remove eor from update tree because > - * adjout_prefix_destroy() can't handle that. 
> - */ > - RB_REMOVE(prefix_tree, &peer->updates[aid], p); > - p->flags &= ~PREFIX_ADJOUT_FLAG_UPDATE; > - adjout_prefix_destroy(peer, p); > + pa = TAILQ_FIRST(&peer->updates[aid]); > + if (pa != NULL && (uintptr_t)pa->attrs < AID_MAX) { > + pend_attr_done(pa, peer); > return 1; > } > return 0; > @@ -818,36 +812,19 @@ up_is_eor(struct rde_peer *peer, uint8_t > /* minimal buffer size > withdraw len + attr len + attr hdr + afi/safi */ > #define MIN_UPDATE_LEN 16 > > -static void > -up_prefix_free(struct prefix_tree *prefix_head, struct adjout_prefix *p, > - struct rde_peer *peer, int withdraw) > -{ > - if (withdraw) { > - /* prefix no longer needed, remove it */ > - adjout_prefix_destroy(peer, p); > - peer->stats.prefix_sent_withdraw++; > - } else { > - /* prefix still in Adj-RIB-Out, keep it */ > - RB_REMOVE(prefix_tree, prefix_head, p); > - p->flags &= ~PREFIX_ADJOUT_FLAG_UPDATE; > - peer->stats.pending_update--; > - peer->stats.prefix_sent_update++; > - } > -} > - > /* > * Write prefixes to buffer until either there is no more space or > * the next prefix has no longer the same ASPATH attributes. > * Returns -1 if no prefix was written else 0. 
> */ > static int > -up_dump_prefix(struct ibuf *buf, struct prefix_tree *prefix_head, > +up_dump_prefix(struct ibuf *buf, struct pend_prefix_queue *prefix_head, > struct rde_peer *peer, int withdraw) > { > - struct adjout_prefix *p, *np; > - int done = 0, has_ap = -1, rv = -1; > + struct pend_prefix *p, *np; > + int has_ap = -1, rv = -1; > > - RB_FOREACH_SAFE(p, prefix_tree, prefix_head, np) { > + TAILQ_FOREACH_SAFE(p, prefix_head, entry, np) { > if (has_ap == -1) > has_ap = peer_has_add_path(peer, p->pt->aid, > CAPA_AP_SEND); > @@ -855,24 +832,21 @@ up_dump_prefix(struct ibuf *buf, struct > -1) > break; > > - /* make sure we only dump prefixes which belong together */ > - if (np == NULL || > - np->attrs != p->attrs || > - (np->flags & PREFIX_ADJOUT_FLAG_EOR)) > - done = 1; > - > rv = 0; > - up_prefix_free(prefix_head, p, peer, withdraw); > - if (done) > - break; > + if (withdraw) > + peer->stats.prefix_sent_withdraw++; > + else > + peer->stats.prefix_sent_update++; > + pend_prefix_free(p, prefix_head, peer); > } > return rv; > } > > static int > up_generate_mp_reach(struct ibuf *buf, struct rde_peer *peer, > - struct nexthop *nh, uint8_t aid) > + struct pend_attr *pa, uint8_t aid) > { > + struct nexthop *nh = pa->attrs->nexthop; > struct bgpd_addr *nexthop; > size_t off, nhoff; > uint16_t len, afi; > @@ -969,7 +943,7 @@ up_generate_mp_reach(struct ibuf *buf, s > if (ibuf_add_zero(buf, 1) == -1) /* Reserved must be 0 */ > return -1; > > - if (up_dump_prefix(buf, &peer->updates[aid], peer, 0) == -1) > + if (up_dump_prefix(buf, &pa->prefixes, peer, 0) == -1) > /* no prefixes written, fail update */ > return -1; > > @@ -1084,8 +1058,8 @@ up_dump_withdraws(struct imsgbuf *imsg, > * Withdraw a single prefix after an error. 
> */ > static int > -up_dump_withdraw_one(struct rde_peer *peer, struct adjout_prefix *p, > - struct ibuf *buf) > +up_dump_withdraw_one(struct rde_peer *peer, struct pt_entry *pt, > + uint32_t path_id_tx, struct ibuf *buf) > { > size_t off; > int has_ap; > @@ -1100,7 +1074,7 @@ up_dump_withdraw_one(struct rde_peer *pe > if (ibuf_add_zero(buf, sizeof(len)) == -1) > return -1; > > - if (p->pt->aid != AID_INET) { > + if (pt->aid != AID_INET) { > /* reserve space for 2-byte path attribute length */ > off = ibuf_size(buf); > if (ibuf_add_zero(buf, sizeof(len)) == -1) > @@ -1115,7 +1089,7 @@ up_dump_withdraw_one(struct rde_peer *pe > return -1; > > /* afi & safi */ > - if (aid2afi(p->pt->aid, &afi, &safi) == -1) > + if (aid2afi(pt->aid, &afi, &safi) == -1) > fatalx("%s: bad AID", __func__); > if (ibuf_add_n16(buf, afi) == -1) > return -1; > @@ -1123,8 +1097,8 @@ up_dump_withdraw_one(struct rde_peer *pe > return -1; > } > > - has_ap = peer_has_add_path(peer, p->pt->aid, CAPA_AP_SEND); > - if (pt_writebuf(buf, p->pt, 1, has_ap, p->path_id_tx) == -1) > + has_ap = peer_has_add_path(peer, pt->aid, CAPA_AP_SEND); > + if (pt_writebuf(buf, pt, 1, has_ap, path_id_tx) == -1) > return -1; > > /* update length field (either withdrawn routes or attribute length) */ > @@ -1132,7 +1106,7 @@ up_dump_withdraw_one(struct rde_peer *pe > if (ibuf_set_n16(buf, off, len) == -1) > return -1; > > - if (p->pt->aid != AID_INET) { > + if (pt->aid != AID_INET) { > /* write MP_UNREACH_NLRI attribute length (always extended) */ > len -= 4; /* skip attribute header */ > if (ibuf_set_n16(buf, off + sizeof(len) + 2, len) == -1) > @@ -1158,17 +1132,18 @@ up_dump_update(struct imsgbuf *imsg, str > { > struct ibuf *buf; > struct bgpd_addr addr; > - struct adjout_prefix *p; > + struct pend_attr *pa; > + struct pend_prefix *pp; > size_t off, pkgsize = MAX_PKTSIZE; > uint16_t len; > int force_ip4mp = 0; > > - p = RB_MIN(prefix_tree, &peer->updates[aid]); > - if (p == NULL) > + pa = 
TAILQ_FIRST(&peer->updates[aid]); > + if (pa == NULL) > return; > > if (aid == AID_INET && peer_has_ext_nexthop(peer, AID_INET)) { > - struct nexthop *nh = adjout_prefix_nexthop(p); > + struct nexthop *nh = pa->attrs->nexthop; > if (nh != NULL && nh->exit_nexthop.aid == AID_INET6) > force_ip4mp = 1; > } > @@ -1191,8 +1166,8 @@ up_dump_update(struct imsgbuf *imsg, str > if (ibuf_add_zero(buf, sizeof(len)) == -1) > goto fail; > > - if (up_generate_attr(buf, peer, adjout_prefix_aspath(p), > - adjout_prefix_communities(p), adjout_prefix_nexthop(p), aid) == -1) > + if (up_generate_attr(buf, peer, pa->attrs->aspath, > + pa->attrs->communities, pa->attrs->nexthop, aid) == -1) > goto drop; > > if (aid != AID_INET || force_ip4mp) { > @@ -1204,8 +1179,7 @@ up_dump_update(struct imsgbuf *imsg, str > * merge the attributes together in reverse order of > * creation. > */ > - if (up_generate_mp_reach(buf, peer, adjout_prefix_nexthop(p), > - aid) == -1) > + if (up_generate_mp_reach(buf, peer, pa, aid) == -1) > goto drop; > } > > @@ -1216,28 +1190,32 @@ up_dump_update(struct imsgbuf *imsg, str > > if (aid == AID_INET && !force_ip4mp) { > /* last but not least dump the IPv4 nlri */ > - if (up_dump_prefix(buf, &peer->updates[aid], peer, 0) == -1) > + if (up_dump_prefix(buf, &pa->prefixes, peer, 0) == -1) > goto drop; > } > > + pend_attr_done(pa, peer); > + > imsg_close(imsg, buf); > return; > > drop: > /* Not enough space. Drop current prefix, it will never fit. 
*/ > - p = RB_MIN(prefix_tree, &peer->updates[aid]); > - pt_getaddr(p->pt, &addr); > + pp = TAILQ_FIRST(&pa->prefixes); > + pt_getaddr(pp->pt, &addr); > log_peer_warnx(&peer->conf, "generating update failed, " > - "prefix %s/%d dropped", log_addr(&addr), p->pt->prefixlen); > + "prefix %s/%d dropped", log_addr(&addr), pp->pt->prefixlen); > > - up_prefix_free(&peer->updates[aid], p, peer, 0); > - if (up_dump_withdraw_one(peer, p, buf) == -1) > + if (up_dump_withdraw_one(peer, pp->pt, pp->path_id_tx, buf) == -1) > goto fail; > + pend_prefix_free(pp, &pa->prefixes, peer); > + pend_attr_done(pa, peer); > imsg_close(imsg, buf); > return; > > fail: > /* something went horribly wrong */ > + pend_attr_done(pa, peer); > log_peer_warn(&peer->conf, "generating update failed, peer desynced"); > ibuf_free(buf); > } >