From: Claudio Jeker Subject: bgpd: introduce pending attr and prefix queues To: tech@openbsd.org Date: Thu, 4 Dec 2025 17:14:15 +0100 Implement a per-peer pending prefix queue and lookup table and a pending attribute queue and lookup table. Withdraws just end up in the peer's pending prefix queue, while for updates the prefix is queued on a pending attribute entry which itself is queued on the peer. This allows aggregating multiple prefixes into a single UPDATE message. When a prefix is added, the lookup table is checked for an already existing object. In such a case the prefix is first dequeued and then re-added. pend_prefix_add() is therefore a bit fiddly. If the attrs pointer in struct pend_prefix is NULL then it is a withdraw. The pend_attr needs to hold the aid so prefixes end up on the right queue. If the attrs pointer in struct pend_attr is NULL, the pend_attr is actually the End-of-RIB marker. This replaces the red-black tree contraption used right now, which is a big performance bottleneck. -- :wq Claudio Index: bgpctl/output.c =================================================================== RCS file: /cvs/src/usr.sbin/bgpctl/output.c,v diff -u -p -r1.64 output.c --- bgpctl/output.c 2 Dec 2025 13:03:54 -0000 1.64 +++ bgpctl/output.c 4 Dec 2025 15:01:57 -0000 @@ -1097,6 +1097,12 @@ show_rib_mem(struct rde_memstats *stats) stats->attr_refs); printf("%10lld BGP attributes using %s of memory\n", stats->attr_dcnt, fmt_mem(stats->attr_data)); + printf("%10lld pending attribute entries using %s of memory\n", + stats->pend_attr_cnt, fmt_mem(stats->pend_attr_cnt * + sizeof(struct pend_attr))); + printf("%10lld pending prefix entries using %s of memory\n", + stats->pend_prefix_cnt, fmt_mem(stats->pend_prefix_cnt * + sizeof(struct pend_prefix))); printf("%10lld as-set elements in %lld tables using " "%s of memory\n", stats->aset_nmemb, stats->aset_cnt, fmt_mem(stats->aset_size)); @@ -1104,6 +1110,10 @@ show_rib_mem(struct rde_memstats *stats) stats->pset_cnt, fmt_mem(stats->pset_size)); 
printf("RIB using %s of memory\n", fmt_mem(pts + stats->prefix_cnt * sizeof(struct prefix) + + stats->adjout_prefix_cnt * sizeof(struct adjout_prefix) + + stats->adjout_attr_cnt * sizeof(struct adjout_attr) + + stats->pend_prefix_cnt * sizeof(struct pend_prefix) + + stats->pend_attr_cnt * sizeof(struct pend_attr) + stats->rib_cnt * sizeof(struct rib_entry) + stats->path_cnt * sizeof(struct rde_aspath) + stats->aspath_size + stats->attr_cnt * sizeof(struct attr) + Index: bgpctl/output_json.c =================================================================== RCS file: /cvs/src/usr.sbin/bgpctl/output_json.c,v diff -u -p -r1.55 output_json.c --- bgpctl/output_json.c 2 Dec 2025 13:03:54 -0000 1.55 +++ bgpctl/output_json.c 4 Dec 2025 15:01:57 -0000 @@ -909,6 +909,10 @@ json_rib_mem(struct rde_memstats *stats) json_rib_mem_element("adjout_attr", stats->adjout_attr_cnt, stats->adjout_attr_cnt * sizeof(struct adjout_attr), stats->adjout_attr_refs); + json_rib_mem_element("pend_attr", stats->pend_attr_cnt, + stats->pend_attr_cnt * sizeof(struct pend_attr), UINT64_MAX); + json_rib_mem_element("pend_prefix", stats->pend_prefix_cnt, + stats->pend_prefix_cnt * sizeof(struct pend_prefix), UINT64_MAX); json_rib_mem_element("rde_aspath", stats->path_cnt, stats->path_cnt * sizeof(struct rde_aspath), stats->path_refs); @@ -924,6 +928,10 @@ json_rib_mem(struct rde_memstats *stats) stats->attr_data, UINT64_MAX); json_rib_mem_element("total", UINT64_MAX, pts + stats->prefix_cnt * sizeof(struct prefix) + + stats->adjout_prefix_cnt * sizeof(struct adjout_prefix) + + stats->adjout_attr_cnt * sizeof(struct adjout_attr) + + stats->pend_prefix_cnt * sizeof(struct pend_prefix) + + stats->pend_attr_cnt * sizeof(struct pend_attr) + stats->rib_cnt * sizeof(struct rib_entry) + stats->path_cnt * sizeof(struct rde_aspath) + stats->aspath_size + stats->attr_cnt * sizeof(struct attr) + Index: bgpctl/output_ometric.c =================================================================== RCS file: 
/cvs/src/usr.sbin/bgpctl/output_ometric.c,v diff -u -p -r1.19 output_ometric.c --- bgpctl/output_ometric.c 2 Dec 2025 13:03:54 -0000 1.19 +++ bgpctl/output_ometric.c 4 Dec 2025 15:01:57 -0000 @@ -296,6 +296,10 @@ ometric_rib_mem(struct rde_memstats *sta ometric_rib_mem_element("adjout_prefix", stats->adjout_prefix_cnt, stats->adjout_prefix_cnt * sizeof(struct adjout_prefix), UINT64_MAX); + ometric_rib_mem_element("pend_attr", stats->pend_attr_cnt, + stats->pend_attr_cnt * sizeof(struct pend_attr), UINT64_MAX); + ometric_rib_mem_element("pend_prefix", stats->pend_prefix_cnt, + stats->pend_prefix_cnt * sizeof(struct pend_prefix), UINT64_MAX); ometric_rib_mem_element("adjout_attr", stats->adjout_attr_cnt, stats->adjout_attr_cnt * sizeof(struct adjout_attr), stats->adjout_attr_refs); @@ -315,6 +319,10 @@ ometric_rib_mem(struct rde_memstats *sta ometric_rib_mem_element("total", UINT64_MAX, pts + stats->prefix_cnt * sizeof(struct prefix) + + stats->adjout_prefix_cnt * sizeof(struct adjout_prefix) + + stats->adjout_attr_cnt * sizeof(struct adjout_attr) + + stats->pend_prefix_cnt * sizeof(struct pend_prefix) + + stats->pend_attr_cnt * sizeof(struct pend_attr) + stats->rib_cnt * sizeof(struct rib_entry) + stats->path_cnt * sizeof(struct rde_aspath) + stats->aspath_size + stats->attr_cnt * sizeof(struct attr) + Index: bgpd/bgpd.h =================================================================== RCS file: /cvs/src/usr.sbin/bgpd/bgpd.h,v diff -u -p -r1.524 bgpd.h --- bgpd/bgpd.h 3 Dec 2025 12:20:19 -0000 1.524 +++ bgpd/bgpd.h 4 Dec 2025 15:01:57 -0000 @@ -1392,6 +1392,8 @@ struct rde_memstats { long long path_refs; long long prefix_cnt; long long adjout_prefix_cnt; + long long pend_prefix_cnt; + long long pend_attr_cnt; long long rib_cnt; long long pt_cnt[AID_MAX]; long long pt_size[AID_MAX]; Index: bgpd/rde.c =================================================================== RCS file: /cvs/src/usr.sbin/bgpd/rde.c,v diff -u -p -r1.673 rde.c --- bgpd/rde.c 3 Dec 2025 
12:20:19 -0000 1.673 +++ bgpd/rde.c 4 Dec 2025 15:04:40 -0000 @@ -2938,16 +2938,18 @@ rde_dump_rib_as(struct prefix *p, struct static void rde_dump_adjout_as(struct rde_peer *peer, struct adjout_prefix *p, - struct rde_aspath *asp, pid_t pid, int flags) + struct adjout_attr *attrs, pid_t pid, int flags) { struct ctl_show_rib rib; struct ibuf *wbuf; struct attr *a; + struct rde_aspath *asp; struct nexthop *nexthop; size_t aslen; uint8_t l; - nexthop = adjout_prefix_nexthop(p); + nexthop = attrs->nexthop; + asp = attrs->aspath; memset(&rib, 0, sizeof(rib)); rib.local_pref = asp->lpref; rib.med = asp->med; @@ -2989,7 +2991,7 @@ rde_dump_adjout_as(struct rde_peer *peer imsg_close(ibuf_se_ctl, wbuf); if (flags & F_CTL_DETAIL) { - struct rde_community *comm = adjout_prefix_communities(p); + struct rde_community *comm = attrs->communities; size_t len = comm->nentries * sizeof(struct community); if (comm->nentries > 0) { if (imsg_compose(ibuf_se_ctl, @@ -3073,14 +3075,14 @@ rde_dump_filter(struct prefix *p, struct static void rde_dump_adjout_filter(struct rde_peer *peer, struct adjout_prefix *p, - struct ctl_show_rib_request *req) + struct adjout_attr *attrs, struct ctl_show_rib_request *req) { struct rde_aspath *asp; if (!rde_match_peer(peer, &req->neighbor)) return; - asp = adjout_prefix_aspath(p); + asp = attrs->aspath; if ((req->flags & F_CTL_HAS_PATHID)) { /* Match against the transmit path id if adjout is used. 
*/ if (req->path_id != p->path_id_tx) @@ -3090,12 +3092,11 @@ rde_dump_adjout_filter(struct rde_peer * !aspath_match(asp->aspath, &req->as, 0)) return; if (req->community.flags != 0) { - if (!community_match(adjout_prefix_communities(p), - &req->community, NULL)) + if (!community_match(attrs->communities, &req->community, NULL)) return; } /* in the adj-rib-out, skip matching against roa and aspa state */ - rde_dump_adjout_as(peer, p, asp, req->pid, req->flags); + rde_dump_adjout_as(peer, p, attrs, req->pid, req->flags); } static void @@ -3115,13 +3116,13 @@ rde_dump_adjout_upcall(struct adjout_pre { struct rde_dump_ctx *ctx = ptr; struct rde_peer *peer; + struct adjout_attr *attrs; if ((peer = peer_get(ctx->peerid)) == NULL) return; - if (p->flags & PREFIX_ADJOUT_FLAG_WITHDRAW) - return; - rde_dump_adjout_filter(peer, p, &ctx->req); + attrs = p->attrs; + rde_dump_adjout_filter(peer, p, attrs, &ctx->req); } static int @@ -3561,8 +3562,8 @@ rde_update_queue_pending(void) if (peer->throttled) continue; for (aid = AID_MIN; aid < AID_MAX; aid++) { - if (!RB_EMPTY(&peer->updates[aid]) || - !RB_EMPTY(&peer->withdraws[aid])) + if (!TAILQ_EMPTY(&peer->updates[aid]) || + !TAILQ_EMPTY(&peer->withdraws[aid])) return 1; } } @@ -3585,7 +3586,7 @@ rde_update_queue_runner(uint8_t aid) continue; if (peer->throttled) continue; - if (RB_EMPTY(&peer->withdraws[aid])) + if (TAILQ_EMPTY(&peer->withdraws[aid])) continue; up_dump_withdraws(ibuf_se, peer, aid); @@ -3605,7 +3606,7 @@ rde_update_queue_runner(uint8_t aid) continue; if (peer->throttled) continue; - if (RB_EMPTY(&peer->updates[aid])) + if (TAILQ_EMPTY(&peer->updates[aid])) continue; if (up_is_eor(peer, aid)) { Index: bgpd/rde.h =================================================================== RCS file: /cvs/src/usr.sbin/bgpd/rde.h,v diff -u -p -r1.327 rde.h --- bgpd/rde.h 2 Dec 2025 13:03:35 -0000 1.327 +++ bgpd/rde.h 4 Dec 2025 15:58:51 -0000 @@ -27,6 +27,7 @@ #include "bgpd.h" #include "log.h" +#include "chash.h" /* rde 
internal structures */ @@ -71,9 +72,13 @@ struct rib { * Currently I assume that we can do that with the neighbor_ip... */ RB_HEAD(peer_tree, rde_peer); -RB_HEAD(prefix_tree, adjout_prefix); RB_HEAD(prefix_index, adjout_prefix); +CH_HEAD(pend_prefix_hash, pend_prefix); +TAILQ_HEAD(pend_prefix_queue, pend_prefix); +CH_HEAD(pend_attr_hash, pend_prefix); +TAILQ_HEAD(pend_attr_queue, pend_attr); + struct rde_peer { RB_ENTRY(rde_peer) entry; struct peer_config conf; @@ -84,8 +89,10 @@ struct rde_peer { struct capabilities capa; struct addpath_eval eval; struct prefix_index adj_rib_out; - struct prefix_tree updates[AID_MAX]; - struct prefix_tree withdraws[AID_MAX]; + struct pend_prefix_hash pend_prefixes; + struct pend_attr_hash pend_attrs; + struct pend_attr_queue updates[AID_MAX]; + struct pend_prefix_queue withdraws[AID_MAX]; struct filter_head *out_rules; struct ibufqueue *ibufq; monotime_t staletime[AID_MAX]; @@ -311,20 +318,31 @@ struct adjout_attr { }; struct adjout_prefix { - RB_ENTRY(adjout_prefix) index, update; + RB_ENTRY(adjout_prefix) index; struct pt_entry *pt; struct adjout_attr *attrs; uint32_t path_id_tx; uint8_t flags; }; -#define PREFIX_ADJOUT_FLAG_WITHDRAW 0x01 /* enqueued on withdraw queue */ -#define PREFIX_ADJOUT_FLAG_UPDATE 0x02 /* enqueued on update queue */ #define PREFIX_ADJOUT_FLAG_DEAD 0x04 /* locked but removed */ #define PREFIX_ADJOUT_FLAG_STALE 0x08 /* stale entry (for addpath) */ #define PREFIX_ADJOUT_FLAG_MASK 0x0f /* mask for the prefix types */ -#define PREFIX_ADJOUT_FLAG_EOR 0x10 /* prefix is EoR */ #define PREFIX_ADJOUT_FLAG_LOCKED 0x20 /* locked by rib walker */ +struct pend_attr { + TAILQ_ENTRY(pend_attr) entry; + struct pend_prefix_queue prefixes; + struct adjout_attr *attrs; + uint8_t aid; +}; + +struct pend_prefix { + TAILQ_ENTRY(pend_prefix) entry; + struct pt_entry *pt; + struct pend_attr *attrs; + uint32_t path_id_tx; +}; + struct filterstate { struct rde_aspath aspath; struct rde_community communities; @@ -637,8 +655,6 @@ 
struct prefix *prefix_bypeer(struct rib_ uint32_t); void prefix_destroy(struct prefix *); -RB_PROTOTYPE(prefix_tree, adjout_prefix, entry, prefix_cmp) - static inline struct rde_peer * prefix_peer(struct prefix *p) { @@ -750,24 +766,14 @@ int adjout_prefix_dump_subtree(struct struct bgpd_addr *, uint8_t, unsigned int, void *, void (*)(struct adjout_prefix *, void *), void (*)(void *, uint8_t), int (*)(void *)); +void adjout_peer_init(struct rde_peer *); -static inline struct rde_aspath * -adjout_prefix_aspath(struct adjout_prefix *p) -{ - return (p->attrs->aspath); -} - -static inline struct rde_community * -adjout_prefix_communities(struct adjout_prefix *p) -{ - return (p->attrs->communities); -} - -static inline struct nexthop * -adjout_prefix_nexthop(struct adjout_prefix *p) -{ - return (p->attrs->nexthop); -} +void pend_attr_done(struct pend_attr *, struct rde_peer *); +void pend_eor_add(struct rde_peer *, uint8_t); +void pend_prefix_add(struct rde_peer *, struct adjout_attr *, + struct pt_entry *, uint32_t); +void pend_prefix_free(struct pend_prefix *, + struct pend_prefix_queue *, struct rde_peer *); /* rde_update.c */ void up_generate_updates(struct rde_peer *, struct rib_entry *); Index: bgpd/rde_adjout.c =================================================================== RCS file: /cvs/src/usr.sbin/bgpd/rde_adjout.c,v diff -u -p -r1.8 rde_adjout.c --- bgpd/rde_adjout.c 2 Dec 2025 13:03:35 -0000 1.8 +++ bgpd/rde_adjout.c 4 Dec 2025 16:09:14 -0000 @@ -30,6 +30,228 @@ #include "log.h" #include "chash.h" +static struct adjout_attr *adjout_attr_ref(struct adjout_attr *); +static void adjout_attr_unref(struct adjout_attr *); + +static uint64_t pendkey; + +static inline uint64_t +pend_prefix_hash(const struct pend_prefix *pp) +{ + uint64_t h = pendkey; + + h = ch_qhash64(h, (uintptr_t)pp->pt); + h = ch_qhash64(h, pp->path_id_tx); + return h; +} + +static inline uint64_t +pend_attr_hash(const struct pend_attr *pa) +{ + uint64_t h = pendkey; + + h = ch_qhash64(h, 
(uintptr_t)pa->attrs); + h = ch_qhash64(h, pa->aid); + return h; +} + +CH_PROTOTYPE(pend_prefix_hash, pend_prefix, pend_prefix_hash); +CH_PROTOTYPE(pend_attr_hash, pend_attr, pend_attr_hash); + +/* pending prefix queue functions */ +static struct pend_attr * +pend_attr_alloc(struct adjout_attr *attrs, uint8_t aid, + struct rde_peer *peer) +{ + struct pend_attr *pa; + + if ((pa = calloc(1, sizeof(*pa))) == NULL) + fatal(__func__); + rdemem.pend_attr_cnt++; + TAILQ_INIT(&pa->prefixes); + if (attrs) + pa->attrs = adjout_attr_ref(attrs); + pa->aid = aid; + + TAILQ_INSERT_TAIL(&peer->updates[aid], pa, entry); + if (CH_INSERT(pend_attr_hash, &peer->pend_attrs, pa, NULL) != 1) + fatalx("corrupted pending attr hash table"); + return pa; +} + +static void +pend_attr_free(struct pend_attr *pa, struct rde_peer *peer) +{ + if (!TAILQ_EMPTY(&pa->prefixes)) { + log_warnx("freeing not empty pending attribute"); + abort(); + } + + TAILQ_REMOVE(&peer->updates[pa->aid], pa, entry); + CH_REMOVE(pend_attr_hash, &peer->pend_attrs, pa); + + if (pa->attrs != NULL) + adjout_attr_unref(pa->attrs); + + rdemem.pend_attr_cnt--; + free(pa); +} + +void +pend_attr_done(struct pend_attr *pa, struct rde_peer *peer) +{ + if (pa == NULL) + return; + if (TAILQ_EMPTY(&pa->prefixes)) + pend_attr_free(pa, peer); +} + +static struct pend_attr * +pend_attr_lookup(struct rde_peer *peer, struct adjout_attr *attrs, uint8_t aid) +{ + struct pend_attr needle = { .attrs = attrs, .aid = aid }; + + return CH_FIND(pend_attr_hash, &peer->pend_attrs, &needle); +} + +static inline int +pend_attr_eq(const struct pend_attr *a, const struct pend_attr *b) +{ + if (a->attrs != b->attrs) + return 0; + if (a->aid != b->aid) + return 0; + return 1; +} + +CH_GENERATE(pend_attr_hash, pend_attr, pend_attr_eq, pend_attr_hash); + +/* + * Insert an End-of-RIB marker into the update queue. 
+ */ +void +pend_eor_add(struct rde_peer *peer, uint8_t aid) +{ + struct pend_attr *pa; + + pa = pend_attr_lookup(peer, NULL, aid); + if (pa == NULL) + pa = pend_attr_alloc(NULL, aid, peer); +} + + +static struct pend_prefix *pend_prefix_alloc(struct pend_attr *, + struct pt_entry *, uint32_t); + +static struct pend_prefix * +pend_prefix_lookup(struct rde_peer *peer, struct pt_entry *pt, + uint32_t path_id_tx) +{ + struct pend_prefix needle = { .pt = pt, .path_id_tx = path_id_tx }; + + return CH_FIND(pend_prefix_hash, &peer->pend_prefixes, &needle); +} + +static void +pend_prefix_remove(struct pend_prefix *pp, struct pend_prefix_queue *head, + struct rde_peer *peer) +{ + if (CH_REMOVE(pend_prefix_hash, &peer->pend_prefixes, pp) != pp) { + log_warnx("missing pending prefix in hash table"); + abort(); + } + TAILQ_REMOVE(head, pp, entry); + + if (pp->attrs == NULL) { + peer->stats.pending_withdraw--; + } else { + peer->stats.pending_update--; + } + pp->attrs = NULL; +} + +void +pend_prefix_add(struct rde_peer *peer, struct adjout_attr *attrs, + struct pt_entry *pt, uint32_t path_id_tx) +{ + struct pend_attr *pa = NULL, *oldpa = NULL; + struct pend_prefix *pp; + struct pend_prefix_queue *head; + + if (attrs != NULL) { + pa = pend_attr_lookup(peer, attrs, pt->aid); + if (pa == NULL) + pa = pend_attr_alloc(attrs, pt->aid, peer); + } + + pp = pend_prefix_lookup(peer, pt, path_id_tx); + if (pp == NULL) { + pp = pend_prefix_alloc(pa, pt, path_id_tx); + } else { + if (pp->attrs == NULL) + head = &peer->withdraws[pt->aid]; + else + head = &pp->attrs->prefixes; + oldpa = pp->attrs; + pend_prefix_remove(pp, head, peer); + pp->attrs = pa; + } + + if (pa == NULL) { + head = &peer->withdraws[pt->aid]; + peer->stats.pending_withdraw++; + } else { + head = &pa->prefixes; + peer->stats.pending_update++; + } + + TAILQ_INSERT_TAIL(head, pp, entry); + if (CH_INSERT(pend_prefix_hash, &peer->pend_prefixes, pp, NULL) != 1) { + log_warnx("corrupted pending prefix hash table"); + abort(); + 
} + + pend_attr_done(oldpa, peer); +} + +static struct pend_prefix * +pend_prefix_alloc(struct pend_attr *attrs, struct pt_entry *pt, + uint32_t path_id_tx) +{ + struct pend_prefix *pp; + + if ((pp = calloc(1, sizeof(*pp))) == NULL) + fatal(__func__); + rdemem.pend_prefix_cnt++; + pp->pt = pt_ref(pt); + pp->attrs = attrs; + pp->path_id_tx = path_id_tx; + + return pp; +} + +void +pend_prefix_free(struct pend_prefix *pp, struct pend_prefix_queue *head, + struct rde_peer *peer) +{ + pend_prefix_remove(pp, head, peer); + pt_unref(pp->pt); + rdemem.pend_prefix_cnt--; + free(pp); +} + +static inline int +pend_prefix_eq(const struct pend_prefix *a, const struct pend_prefix *b) +{ + if (a->pt != b->pt) + return 0; + if (a->path_id_tx != b->path_id_tx) + return 0; + return 1; +} + +CH_GENERATE(pend_prefix_hash, pend_prefix, pend_prefix_eq, pend_prefix_hash); + /* adj-rib-out specific functions */ static uint64_t attrkey; @@ -68,6 +290,7 @@ void adjout_init(void) { arc4random_buf(&attrkey, sizeof(attrkey)); + arc4random_buf(&pendkey, sizeof(pendkey)); } /* Alloc, init and add a new entry into the has table. May not fail. 
*/ @@ -77,8 +300,7 @@ adjout_attr_alloc(struct rde_aspath *asp { struct adjout_attr *a; - a = calloc(1, sizeof(*a)); - if (a == NULL) + if ((a = calloc(1, sizeof(*a))) == NULL) fatal(__func__); rdemem.adjout_attr_cnt++; @@ -115,7 +337,7 @@ adjout_attr_free(struct adjout_attr *a) } static struct adjout_attr * -adjout_attr_ref(struct adjout_attr *attrs, struct rde_peer *peer) +adjout_attr_ref(struct adjout_attr *attrs) { attrs->refcnt++; rdemem.adjout_attr_refs++; @@ -123,7 +345,7 @@ adjout_attr_ref(struct adjout_attr *attr } static void -adjout_attr_unref(struct adjout_attr *attrs, struct rde_peer *peer) +adjout_attr_unref(struct adjout_attr *attrs) { attrs->refcnt--; rdemem.adjout_attr_refs--; @@ -217,22 +439,6 @@ prefix_index_cmp(struct adjout_prefix *a return 0; } -static inline int -prefix_cmp(struct adjout_prefix *a, struct adjout_prefix *b) -{ - if ((a->flags & PREFIX_ADJOUT_FLAG_EOR) != - (b->flags & PREFIX_ADJOUT_FLAG_EOR)) - return (a->flags & PREFIX_ADJOUT_FLAG_EOR) ? 1 : -1; - /* if EOR marker no need to check the rest */ - if (a->flags & PREFIX_ADJOUT_FLAG_EOR) - return 0; - - if (a->attrs != b->attrs) - return (a->attrs > b->attrs ? 1 : -1); - return prefix_index_cmp(a, b); -} - -RB_GENERATE(prefix_tree, adjout_prefix, update, prefix_cmp) RB_GENERATE_STATIC(prefix_index, adjout_prefix, index, prefix_index_cmp) /* @@ -328,22 +534,6 @@ adjout_prefix_match(struct rde_peer *pee } /* - * Insert an End-of-RIB marker into the update queue. - */ -void -prefix_add_eor(struct rde_peer *peer, uint8_t aid) -{ - struct adjout_prefix *p; - - p = adjout_prefix_alloc(); - p->flags = PREFIX_ADJOUT_FLAG_UPDATE | PREFIX_ADJOUT_FLAG_EOR; - if (RB_INSERT(prefix_tree, &peer->updates[aid], p) != NULL) - /* no need to add if EoR marker already present */ - adjout_prefix_free(p); - /* EOR marker is not inserted into the adj_rib_out index */ -} - -/* * Put a prefix from the Adj-RIB-Out onto the update queue. 
*/ void @@ -364,38 +554,28 @@ adjout_prefix_update(struct adjout_prefi fatalx("%s: RB index invariant violated", __func__); } - if ((p->flags & (PREFIX_ADJOUT_FLAG_WITHDRAW | - PREFIX_ADJOUT_FLAG_DEAD)) == 0) { + if ((p->flags & (PREFIX_ADJOUT_FLAG_DEAD)) == 0) { /* * XXX for now treat a different path_id_tx like different * attributes and force out an update. It is unclear how * common it is to have equivalent updates from alternative * paths. */ + attrs = p->attrs; if (p->path_id_tx == path_id_tx && - adjout_prefix_nexthop(p) == state->nexthop && + attrs->nexthop == state->nexthop && communities_equal(&state->communities, - adjout_prefix_communities(p)) && - path_equal(&state->aspath, adjout_prefix_aspath(p))) { + attrs->communities) && + path_equal(&state->aspath, attrs->aspath)) { /* nothing changed */ p->flags &= ~PREFIX_ADJOUT_FLAG_STALE; return; } - /* if pending update unhook it before it is unlinked */ - if (p->flags & PREFIX_ADJOUT_FLAG_UPDATE) { - RB_REMOVE(prefix_tree, &peer->updates[pte->aid], p); - peer->stats.pending_update--; - } - /* unlink prefix so it can be relinked below */ adjout_prefix_unlink(p, peer); peer->stats.prefix_out_cnt--; } - if (p->flags & PREFIX_ADJOUT_FLAG_WITHDRAW) { - RB_REMOVE(prefix_tree, &peer->withdraws[pte->aid], p); - peer->stats.pending_withdraw--; - } /* nothing needs to be done for PREFIX_ADJOUT_FLAG_DEAD and STALE */ p->flags &= ~PREFIX_ADJOUT_FLAG_MASK; @@ -417,10 +597,7 @@ adjout_prefix_update(struct adjout_prefi if (p->flags & PREFIX_ADJOUT_FLAG_MASK) fatalx("%s: bad flags %x", __func__, p->flags); if (peer_is_up(peer)) { - p->flags |= PREFIX_ADJOUT_FLAG_UPDATE; - if (RB_INSERT(prefix_tree, &peer->updates[pte->aid], p) != NULL) - fatalx("%s: RB tree invariant violated", __func__); - peer->stats.pending_update++; + pend_prefix_add(peer, p->attrs, p->pt, p->path_id_tx); } } @@ -431,59 +608,17 @@ adjout_prefix_update(struct adjout_prefi void adjout_prefix_withdraw(struct rde_peer *peer, struct adjout_prefix *p) { - /* 
already a withdraw, shortcut */ - if (p->flags & PREFIX_ADJOUT_FLAG_WITHDRAW) { - p->flags &= ~PREFIX_ADJOUT_FLAG_STALE; - return; - } - /* pending update just got withdrawn */ - if (p->flags & PREFIX_ADJOUT_FLAG_UPDATE) { - RB_REMOVE(prefix_tree, &peer->updates[p->pt->aid], p); - peer->stats.pending_update--; - } - /* unlink prefix if it was linked (not a withdraw or dead) */ - if ((p->flags & (PREFIX_ADJOUT_FLAG_WITHDRAW | - PREFIX_ADJOUT_FLAG_DEAD)) == 0) { - adjout_prefix_unlink(p, peer); - peer->stats.prefix_out_cnt--; - } + if (peer_is_up(peer)) + pend_prefix_add(peer, NULL, p->pt, p->path_id_tx); - /* nothing needs to be done for PREFIX_ADJOUT_FLAG_DEAD and STALE */ - p->flags &= ~PREFIX_ADJOUT_FLAG_MASK; - - if (peer_is_up(peer)) { - p->flags |= PREFIX_ADJOUT_FLAG_WITHDRAW; - if (RB_INSERT(prefix_tree, &peer->withdraws[p->pt->aid], - p) != NULL) - fatalx("%s: RB tree invariant violated", __func__); - peer->stats.pending_withdraw++; - } else { - /* mark prefix dead to skip unlink on destroy */ - p->flags |= PREFIX_ADJOUT_FLAG_DEAD; - adjout_prefix_destroy(peer, p); - } + adjout_prefix_destroy(peer, p); } void adjout_prefix_destroy(struct rde_peer *peer, struct adjout_prefix *p) { - if (p->flags & PREFIX_ADJOUT_FLAG_EOR) { - /* EOR marker is not linked in the index */ - adjout_prefix_free(p); - return; - } - - if (p->flags & PREFIX_ADJOUT_FLAG_WITHDRAW) { - RB_REMOVE(prefix_tree, &peer->withdraws[p->pt->aid], p); - peer->stats.pending_withdraw--; - } - if (p->flags & PREFIX_ADJOUT_FLAG_UPDATE) { - RB_REMOVE(prefix_tree, &peer->updates[p->pt->aid], p); - peer->stats.pending_update--; - } - /* unlink prefix if it was linked (not a withdraw or dead) */ - if ((p->flags & (PREFIX_ADJOUT_FLAG_WITHDRAW | - PREFIX_ADJOUT_FLAG_DEAD)) == 0) { + /* unlink prefix if it was linked (not dead) */ + if ((p->flags & PREFIX_ADJOUT_FLAG_DEAD) == 0) { adjout_prefix_unlink(p, peer); peer->stats.prefix_out_cnt--; } @@ -505,21 +640,19 @@ adjout_prefix_destroy(struct rde_peer *p 
void adjout_prefix_flush_pending(struct rde_peer *peer) { - struct adjout_prefix *p, *np; + struct pend_attr *pa, *npa; + struct pend_prefix *pp, *npp; uint8_t aid; for (aid = AID_MIN; aid < AID_MAX; aid++) { - RB_FOREACH_SAFE(p, prefix_tree, &peer->withdraws[aid], np) { - adjout_prefix_destroy(peer, p); + TAILQ_FOREACH_SAFE(pp, &peer->withdraws[aid], entry, npp) { + pend_prefix_free(pp, &peer->withdraws[aid], peer); } - RB_FOREACH_SAFE(p, prefix_tree, &peer->updates[aid], np) { - p->flags &= ~PREFIX_ADJOUT_FLAG_UPDATE; - RB_REMOVE(prefix_tree, &peer->updates[aid], p); - if (p->flags & PREFIX_ADJOUT_FLAG_EOR) { - adjout_prefix_destroy(peer, p); - } else { - peer->stats.pending_update--; + TAILQ_FOREACH_SAFE(pa, &peer->updates[aid], entry, npa) { + TAILQ_FOREACH_SAFE(pp, &pa->prefixes, entry, npp) { + pend_prefix_free(pp, &pa->prefixes, peer); } + pend_attr_done(pa, peer); } } } @@ -690,7 +823,7 @@ static void adjout_prefix_link(struct adjout_prefix *p, struct rde_peer *peer, struct adjout_attr *attrs, struct pt_entry *pt, uint32_t path_id_tx) { - p->attrs = adjout_attr_ref(attrs, peer); + p->attrs = adjout_attr_ref(attrs); p->pt = pt_ref(pt); p->path_id_tx = path_id_tx; } @@ -702,7 +835,7 @@ static void adjout_prefix_unlink(struct adjout_prefix *p, struct rde_peer *peer) { /* destroy all references to other objects */ - adjout_attr_unref(p->attrs, peer); + adjout_attr_unref(p->attrs); p->attrs = NULL; pt_unref(p->pt); /* must keep p->pt valid since there is an extra ref */ @@ -727,4 +860,17 @@ adjout_prefix_free(struct adjout_prefix { rdemem.adjout_prefix_cnt--; free(p); +} + +void +adjout_peer_init(struct rde_peer *peer) +{ + unsigned int i; + + CH_INIT(pend_attr_hash, &peer->pend_attrs); + CH_INIT(pend_prefix_hash, &peer->pend_prefixes); + for (i = 0; i < nitems(peer->updates); i++) + TAILQ_INIT(&peer->updates[i]); + for (i = 0; i < nitems(peer->withdraws); i++) + TAILQ_INIT(&peer->withdraws[i]); } Index: bgpd/rde_peer.c 
=================================================================== RCS file: /cvs/src/usr.sbin/bgpd/rde_peer.c,v diff -u -p -r1.59 rde_peer.c --- bgpd/rde_peer.c 2 Dec 2025 13:03:35 -0000 1.59 +++ bgpd/rde_peer.c 4 Dec 2025 15:03:48 -0000 @@ -175,6 +175,7 @@ peer_add(uint32_t id, struct peer_config if ((peer->ibufq = ibufq_new()) == NULL) fatal(NULL); + adjout_peer_init(peer); peer_apply_out_filter(peer, rules); /* @@ -520,15 +521,11 @@ static void peer_blast_upcall(struct adjout_prefix *p, void *ptr) { struct rde_peer *peer = ptr; + struct adjout_attr *attrs = NULL; - if ((p->flags & PREFIX_ADJOUT_FLAG_MASK) == 0) { - /* put entries on the update queue if not already on a queue */ - p->flags |= PREFIX_ADJOUT_FLAG_UPDATE; - if (RB_INSERT(prefix_tree, &peer->updates[p->pt->aid], - p) != NULL) - fatalx("%s: RB tree invariant violated", __func__); - peer->stats.pending_update++; - } + attrs = p->attrs; + + pend_prefix_add(peer, attrs, p->pt, p->path_id_tx); } /* @@ -543,7 +540,7 @@ peer_blast_done(void *ptr, uint8_t aid) /* Adj-RIB-Out ready, unthrottle peer and inject EOR */ peer->throttled = 0; if (peer->capa.grestart.restart) - prefix_add_eor(peer, aid); + pend_eor_add(peer, aid); } /* Index: bgpd/rde_update.c =================================================================== RCS file: /cvs/src/usr.sbin/bgpd/rde_update.c,v diff -u -p -r1.187 rde_update.c --- bgpd/rde_update.c 3 Dec 2025 10:00:15 -0000 1.187 +++ bgpd/rde_update.c 4 Dec 2025 15:59:28 -0000 @@ -799,17 +799,11 @@ up_generate_attr(struct ibuf *buf, struc int up_is_eor(struct rde_peer *peer, uint8_t aid) { - struct adjout_prefix *p; + struct pend_attr *pa; - p = RB_MIN(prefix_tree, &peer->updates[aid]); - if (p != NULL && (p->flags & PREFIX_ADJOUT_FLAG_EOR)) { - /* - * Need to remove eor from update tree because - * adjout_prefix_destroy() can't handle that. 
- */ - RB_REMOVE(prefix_tree, &peer->updates[aid], p); - p->flags &= ~PREFIX_ADJOUT_FLAG_UPDATE; - adjout_prefix_destroy(peer, p); + pa = TAILQ_FIRST(&peer->updates[aid]); + if (pa != NULL && (uintptr_t)pa->attrs < AID_MAX) { + pend_attr_done(pa, peer); return 1; } return 0; @@ -818,36 +812,19 @@ up_is_eor(struct rde_peer *peer, uint8_t /* minimal buffer size > withdraw len + attr len + attr hdr + afi/safi */ #define MIN_UPDATE_LEN 16 -static void -up_prefix_free(struct prefix_tree *prefix_head, struct adjout_prefix *p, - struct rde_peer *peer, int withdraw) -{ - if (withdraw) { - /* prefix no longer needed, remove it */ - adjout_prefix_destroy(peer, p); - peer->stats.prefix_sent_withdraw++; - } else { - /* prefix still in Adj-RIB-Out, keep it */ - RB_REMOVE(prefix_tree, prefix_head, p); - p->flags &= ~PREFIX_ADJOUT_FLAG_UPDATE; - peer->stats.pending_update--; - peer->stats.prefix_sent_update++; - } -} - /* * Write prefixes to buffer until either there is no more space or * the next prefix has no longer the same ASPATH attributes. * Returns -1 if no prefix was written else 0. 
*/ static int -up_dump_prefix(struct ibuf *buf, struct prefix_tree *prefix_head, +up_dump_prefix(struct ibuf *buf, struct pend_prefix_queue *prefix_head, struct rde_peer *peer, int withdraw) { - struct adjout_prefix *p, *np; - int done = 0, has_ap = -1, rv = -1; + struct pend_prefix *p, *np; + int has_ap = -1, rv = -1; - RB_FOREACH_SAFE(p, prefix_tree, prefix_head, np) { + TAILQ_FOREACH_SAFE(p, prefix_head, entry, np) { if (has_ap == -1) has_ap = peer_has_add_path(peer, p->pt->aid, CAPA_AP_SEND); @@ -855,24 +832,21 @@ up_dump_prefix(struct ibuf *buf, struct -1) break; - /* make sure we only dump prefixes which belong together */ - if (np == NULL || - np->attrs != p->attrs || - (np->flags & PREFIX_ADJOUT_FLAG_EOR)) - done = 1; - rv = 0; - up_prefix_free(prefix_head, p, peer, withdraw); - if (done) - break; + if (withdraw) + peer->stats.prefix_sent_withdraw++; + else + peer->stats.prefix_sent_update++; + pend_prefix_free(p, prefix_head, peer); } return rv; } static int up_generate_mp_reach(struct ibuf *buf, struct rde_peer *peer, - struct nexthop *nh, uint8_t aid) + struct pend_attr *pa, uint8_t aid) { + struct nexthop *nh = pa->attrs->nexthop; struct bgpd_addr *nexthop; size_t off, nhoff; uint16_t len, afi; @@ -969,7 +943,7 @@ up_generate_mp_reach(struct ibuf *buf, s if (ibuf_add_zero(buf, 1) == -1) /* Reserved must be 0 */ return -1; - if (up_dump_prefix(buf, &peer->updates[aid], peer, 0) == -1) + if (up_dump_prefix(buf, &pa->prefixes, peer, 0) == -1) /* no prefixes written, fail update */ return -1; @@ -1084,8 +1058,8 @@ up_dump_withdraws(struct imsgbuf *imsg, * Withdraw a single prefix after an error. 
*/ static int -up_dump_withdraw_one(struct rde_peer *peer, struct adjout_prefix *p, - struct ibuf *buf) +up_dump_withdraw_one(struct rde_peer *peer, struct pt_entry *pt, + uint32_t path_id_tx, struct ibuf *buf) { size_t off; int has_ap; @@ -1100,7 +1074,7 @@ up_dump_withdraw_one(struct rde_peer *pe if (ibuf_add_zero(buf, sizeof(len)) == -1) return -1; - if (p->pt->aid != AID_INET) { + if (pt->aid != AID_INET) { /* reserve space for 2-byte path attribute length */ off = ibuf_size(buf); if (ibuf_add_zero(buf, sizeof(len)) == -1) @@ -1115,7 +1089,7 @@ up_dump_withdraw_one(struct rde_peer *pe return -1; /* afi & safi */ - if (aid2afi(p->pt->aid, &afi, &safi) == -1) + if (aid2afi(pt->aid, &afi, &safi) == -1) fatalx("%s: bad AID", __func__); if (ibuf_add_n16(buf, afi) == -1) return -1; @@ -1123,8 +1097,8 @@ up_dump_withdraw_one(struct rde_peer *pe return -1; } - has_ap = peer_has_add_path(peer, p->pt->aid, CAPA_AP_SEND); - if (pt_writebuf(buf, p->pt, 1, has_ap, p->path_id_tx) == -1) + has_ap = peer_has_add_path(peer, pt->aid, CAPA_AP_SEND); + if (pt_writebuf(buf, pt, 1, has_ap, path_id_tx) == -1) return -1; /* update length field (either withdrawn routes or attribute length) */ @@ -1132,7 +1106,7 @@ up_dump_withdraw_one(struct rde_peer *pe if (ibuf_set_n16(buf, off, len) == -1) return -1; - if (p->pt->aid != AID_INET) { + if (pt->aid != AID_INET) { /* write MP_UNREACH_NLRI attribute length (always extended) */ len -= 4; /* skip attribute header */ if (ibuf_set_n16(buf, off + sizeof(len) + 2, len) == -1) @@ -1158,17 +1132,18 @@ up_dump_update(struct imsgbuf *imsg, str { struct ibuf *buf; struct bgpd_addr addr; - struct adjout_prefix *p; + struct pend_attr *pa; + struct pend_prefix *pp; size_t off, pkgsize = MAX_PKTSIZE; uint16_t len; int force_ip4mp = 0; - p = RB_MIN(prefix_tree, &peer->updates[aid]); - if (p == NULL) + pa = TAILQ_FIRST(&peer->updates[aid]); + if (pa == NULL) return; if (aid == AID_INET && peer_has_ext_nexthop(peer, AID_INET)) { - struct nexthop *nh = 
adjout_prefix_nexthop(p); + struct nexthop *nh = pa->attrs->nexthop; if (nh != NULL && nh->exit_nexthop.aid == AID_INET6) force_ip4mp = 1; } @@ -1191,8 +1166,8 @@ up_dump_update(struct imsgbuf *imsg, str if (ibuf_add_zero(buf, sizeof(len)) == -1) goto fail; - if (up_generate_attr(buf, peer, adjout_prefix_aspath(p), - adjout_prefix_communities(p), adjout_prefix_nexthop(p), aid) == -1) + if (up_generate_attr(buf, peer, pa->attrs->aspath, + pa->attrs->communities, pa->attrs->nexthop, aid) == -1) goto drop; if (aid != AID_INET || force_ip4mp) { @@ -1204,8 +1179,7 @@ up_dump_update(struct imsgbuf *imsg, str * merge the attributes together in reverse order of * creation. */ - if (up_generate_mp_reach(buf, peer, adjout_prefix_nexthop(p), - aid) == -1) + if (up_generate_mp_reach(buf, peer, pa, aid) == -1) goto drop; } @@ -1216,28 +1190,32 @@ up_dump_update(struct imsgbuf *imsg, str if (aid == AID_INET && !force_ip4mp) { /* last but not least dump the IPv4 nlri */ - if (up_dump_prefix(buf, &peer->updates[aid], peer, 0) == -1) + if (up_dump_prefix(buf, &pa->prefixes, peer, 0) == -1) goto drop; } + pend_attr_done(pa, peer); + imsg_close(imsg, buf); return; drop: /* Not enough space. Drop current prefix, it will never fit. */ - p = RB_MIN(prefix_tree, &peer->updates[aid]); - pt_getaddr(p->pt, &addr); + pp = TAILQ_FIRST(&pa->prefixes); + pt_getaddr(pp->pt, &addr); log_peer_warnx(&peer->conf, "generating update failed, " - "prefix %s/%d dropped", log_addr(&addr), p->pt->prefixlen); + "prefix %s/%d dropped", log_addr(&addr), pp->pt->prefixlen); - up_prefix_free(&peer->updates[aid], p, peer, 0); - if (up_dump_withdraw_one(peer, p, buf) == -1) + if (up_dump_withdraw_one(peer, pp->pt, pp->path_id_tx, buf) == -1) goto fail; + pend_prefix_free(pp, &pa->prefixes, peer); + pend_attr_done(pa, peer); imsg_close(imsg, buf); return; fail: /* something went horribly wrong */ + pend_attr_done(pa, peer); log_peer_warn(&peer->conf, "generating update failed, peer desynced"); ibuf_free(buf); }