From: Theo Buehler
Subject: Re: bgpd: split up processing of updates
To: tech@openbsd.org
Date: Sun, 28 Dec 2025 13:06:31 +0100

On Sat, Dec 27, 2025 at 05:04:31PM +0100, Claudio Jeker wrote:
> One of the biggest latency-inducing bottlenecks in bgpd is that UPDATE
> messages are processed to completion. Mainly we do the input processing,
> update the local-RIB and then also handle the updates for the adj-rib-out
> at the same time.
>
> The adj-rib-out is a loop over all peers and is therefore super heavy.
> By taking this part of the pipeline into its own step we can drop the
> latency inside the main poll loop by a lot.
>
> For this we introduce a per-peer update queue that enqueues the rib
> entries and then handles those one by one. This also has the benefit that
> a single peer cannot monopolize the processing in bgpd. So a single
> flapping peer should result in far less noticeable delays for UPDATEs
> from other peers.
>
> For now I had to disable the addpath send all optimisation since for that
> we need to keep an extra queue of updates. Instead we just use the regular
> addpath codepath that is less optimised but produces the same results.
>
> I will add the needed code later on.

See a couple of comments below.

> --
> :wq Claudio
>
> Index: rde.c
> ===================================================================
> RCS file: /cvs/src/usr.sbin/bgpd/rde.c,v
> diff -u -p -r1.678 rde.c
> --- rde.c	24 Dec 2025 07:59:55 -0000	1.678
> +++ rde.c	27 Dec 2025 15:47:29 -0000
> @@ -326,6 +326,7 @@ rde_main(int debug, int verbose)
>  			    monotime_to_usec(monotime_sub(io_end, loop_start));
>
>  		peer_foreach(rde_dispatch_imsg_peer, NULL);
> +		peer_foreach(peer_process_updates, NULL);
>
>  		peer_end = getmonotime();
>  		rdemem.rde_event_peer_usec +=
> Index: rde.h
> ===================================================================
> RCS file: /cvs/src/usr.sbin/bgpd/rde.h,v
> diff -u -p -r1.336 rde.h
> --- rde.h	24 Dec 2025 07:59:55 -0000	1.336
> +++ rde.h	27 Dec 2025 15:47:29 -0000
> @@ -41,13 +41,19 @@ enum peer_state {
>  LIST_HEAD(prefix_list, prefix);
>  TAILQ_HEAD(prefix_queue, prefix);
>  RB_HEAD(rib_tree, rib_entry);
> +TAILQ_HEAD(rib_queue, rib_entry);
> +LIST_HEAD(rib_pq_head, rib_pq);
>
>  struct rib_entry {
>  	RB_ENTRY(rib_entry)	 rib_e;
> +	TAILQ_ENTRY(rib_entry)	 rib_queue;
>  	struct prefix_queue	 prefix_h;
>  	struct pt_entry		*prefix;
> +	struct rib_pq_head	 rib_pq_list;
> +	uint32_t		 pq_peer_id;
>  	uint16_t		 rib_id;
> -	uint16_t		 lock;
> +	uint8_t			 lock;
> +	uint8_t			 pq_mode;
>  };
>
>  struct rib {
> @@ -93,6 +99,7 @@ struct rde_peer {
>  	struct pend_prefix_hash		 pend_prefixes;
>  	struct filter_head		*out_rules;
>  	struct ibufqueue		*ibufq;
> +	struct rib_queue		 rib_pq_head;
>  	monotime_t			 staletime[AID_MAX];
>  	uint32_t			 adjout_bid;
>  	uint32_t			 remote_bgpid;
> @@ -350,9 +357,9 @@ struct filterstate {
>  };
>
>  enum eval_mode {
> +	EVAL_RECONF,
>  	EVAL_DEFAULT,
>  	EVAL_ALL,
> -	EVAL_RECONF,

I found this change confusing. I think you handle things correctly, but
using 0 (== EVAL_RECONF) as a sentinel value feels a bit fragile. I mean
the if (re->pq_mode) { ... } in rde_generate_updates(), the
return re->pq_mode != 0 in re_is_queued() and rib_dequeue() doing
re->pq_mode = 0. Should 0 not be EVAL_UNQUEUED or something?
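Something like this would read better to me. EVAL_UNQUEUED is just a
strawman name, pick whatever you prefer:

	enum eval_mode {
		EVAL_UNQUEUED = 0,	/* rib_entry is on no per-peer queue */
		EVAL_RECONF,
		EVAL_DEFAULT,
		EVAL_ALL,
	};

Then re_is_queued() can return (re->pq_mode != EVAL_UNQUEUED) and
rib_dequeue() can set re->pq_mode = EVAL_UNQUEUED, which spells out the
special meaning of 0 instead of relying on EVAL_RECONF happening to be
first.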
>  };
>
>  struct rib_context {
> @@ -414,6 +421,7 @@ struct filter_head	*peer_apply_out_filte
>
>  void		 rde_generate_updates(struct rib_entry *, struct prefix *,
>  		    uint32_t, enum eval_mode);
> +void		 peer_process_updates(struct rde_peer *, void *);
>
>  void		 peer_up(struct rde_peer *, struct session_up *);
>  void		 peer_down(struct rde_peer *);
> @@ -612,6 +620,7 @@ int	 rib_dump_subtree(uint16_t, struct
>  		    void (*)(void *, uint8_t),
>  		    int (*)(void *));
>  void		 rib_dump_terminate(void *);
> +void		 rib_dequeue(struct rib_entry *);
>
>  extern struct rib flowrib;
>
> Index: rde_peer.c
> ===================================================================
> RCS file: /cvs/src/usr.sbin/bgpd/rde_peer.c,v
> diff -u -p -r1.64 rde_peer.c
> --- rde_peer.c	24 Dec 2025 07:59:55 -0000	1.64
> +++ rde_peer.c	27 Dec 2025 15:53:38 -0000
> @@ -26,6 +26,12 @@
>  #include "bgpd.h"
>  #include "rde.h"
>
> +struct rib_pq {
> +	LIST_ENTRY(rib_pq)	 entry;
> +	struct prefix		*new;
> +	uint32_t		 old_pathid_tx;
> +};
> +
>  struct peer_tree	 peertable = RB_INITIALIZER(&peertable);
>  struct peer_tree	 zombietable = RB_INITIALIZER(&zombietable);
>  struct rde_peer		*peerself;
> @@ -161,6 +167,7 @@ peer_add(uint32_t id, struct peer_config
>  	if (peer == NULL)
>  		fatal("peer_add");
>
> +	TAILQ_INIT(&peer->rib_pq_head);
>  	memcpy(&peer->conf, p_conf, sizeof(struct peer_config));
>  	peer->remote_bgpid = 0;
>  	peer->loc_rib_id = rib_find(peer->conf.rib);
> @@ -264,16 +271,20 @@ peer_generate_update(struct rde_peer *pe
>  	    peer->export_type == EXPORT_DEFAULT_ROUTE)
>  		return;
>
> -	/* if reconf skip peers which don't need to reconfigure */
> -	if (mode == EVAL_RECONF && peer->reconf_out == 0)
> -		return;
> -
>  	/* handle peers with add-path */
>  	if (peer_has_add_path(peer, aid, CAPA_AP_SEND)) {
> +#if NOTYET
> +	/* XXX skip addpath all optimisation for now until the queue
> +	 * is extended to allow this mode to work again.
> +	 * This mode is not very common and so taking the slow path
> +	 * here like for all other addpath send modes is not that
> +	 * big of an issue right now.
> +	 */

Comment wants more indent.

>  		if (peer->eval.mode == ADDPATH_EVAL_ALL)
>  			up_generate_addpath_all(peer, re, newpath,
>  			    old_pathid_tx);

Consider a return here and dropping the else, then the
up_generate_addpath() call doesn't need the weird extra indent. See the
sketch at the end of this mail.

>  		else
> +#endif
>  			up_generate_addpath(peer, re);
>  		return;
>  	}
> @@ -290,8 +301,64 @@ rde_generate_updates(struct rib_entry *r
>  {
>  	struct rde_peer *peer;
>
> -	RB_FOREACH(peer, peer_tree, &peertable)
> -		peer_generate_update(peer, re, newpath, old_pathid_tx, mode);
> +	switch (mode) {
> +	case EVAL_RECONF:
> +		/* skip peers which don't need to reconfigure */
> +		RB_FOREACH(peer, peer_tree, &peertable) {
> +			if (peer->reconf_out == 0)
> +				continue;
> +			peer_generate_update(peer, re, NULL, 0, EVAL_RECONF);
> +		}
> +		return;
> +	case EVAL_DEFAULT:
> +		break;
> +	case EVAL_ALL:
> +		/*
> +		 * EVAL_DEFAULT is triggered when a new best path is selected.
> +		 * EVAL_ALL is sent for any other update (needed for peers with
> +		 * addpath or evaluate all set).
> +		 * There can be only one EVAL_DEFAULT queued, it replaces the
> +		 * previous one. A flag is enough.
> +		 * A path can only exist once in the queue (old or new).
> +		 */
> +		if (re->pq_mode == EVAL_DEFAULT)
> +			/* already a best path update pending, nothing to do */
> +			return;
> +
> +		break;
> +	}
> +
> +	if (re->pq_mode) {
> +		peer = peer_get(re->pq_peer_id);
> +		TAILQ_REMOVE(&peer->rib_pq_head, re, rib_queue);
> +	}
> +	if (newpath != NULL)
> +		peer = prefix_peer(newpath);
> +	else
> +		peer = peerself;
> +	re->pq_mode = mode;
> +	re->pq_peer_id = peer->conf.id;
> +	TAILQ_INSERT_TAIL(&peer->rib_pq_head, re, rib_queue);
> +}
> +
> +void
> +peer_process_updates(struct rde_peer *peer, void *bula)
> +{
> +	struct rib_entry *re;
> +	struct rde_peer *p;
> +	enum eval_mode mode;
> +
> +	re = TAILQ_FIRST(&peer->rib_pq_head);
> +	if (re == NULL)
> +		return;
> +	TAILQ_REMOVE(&peer->rib_pq_head, re, rib_queue);
> +
> +	mode = re->pq_mode;
> +
> +	RB_FOREACH(p, peer_tree, &peertable)
> +		peer_generate_update(p, re, NULL, 0, mode);
> +
> +	rib_dequeue(re);
>  }
>
>  /*
> @@ -658,6 +725,8 @@ peer_work_pending(void)
>
>  	RB_FOREACH(p, peer_tree, &peertable) {
>  		if (ibufq_queuelen(p->ibufq) != 0)
> +			return 1;
> +		if (!TAILQ_EMPTY(&p->rib_pq_head))
>  			return 1;
>  	}
>
> Index: rde_rib.c
> ===================================================================
> RCS file: /cvs/src/usr.sbin/bgpd/rde_rib.c,v
> diff -u -p -r1.285 rde_rib.c
> --- rde_rib.c	24 Dec 2025 07:59:55 -0000	1.285
> +++ rde_rib.c	27 Dec 2025 15:47:29 -0000
> @@ -77,6 +77,12 @@ re_is_locked(struct rib_entry *re)
>  	return (re->lock != 0);
>  }
>
> +static inline int
> +re_is_queued(struct rib_entry *re)
> +{
> +	return (re->pq_mode != 0);
> +}
> +
>  static inline struct rib_tree *
>  rib_tree(struct rib *rib)
>  {
> @@ -334,8 +340,8 @@ rib_remove(struct rib_entry *re)
>  	if (!rib_empty(re))
>  		fatalx("rib_remove: entry not empty");
>
> -	if (re_is_locked(re))
> -		/* entry is locked, don't free it. */
> +	if (re_is_locked(re) || re_is_queued(re))
> +		/* entry is locked or queued, don't free it. */
>  		return;
>
>  	pt_unref(re->prefix);
> @@ -351,6 +357,14 @@ static inline int
>  rib_empty(struct rib_entry *re)
>  {
>  	return TAILQ_EMPTY(&re->prefix_h);
> +}
> +
> +void
> +rib_dequeue(struct rib_entry *re)
> +{
> +	re->pq_mode = 0;
> +	if (rib_empty(re))
> +		rib_remove(re);
>  }
>
>  static struct rib_entry *
>
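For the addpath hunk in peer_generate_update() this is roughly what I had
in mind (untested sketch, XXX comment elided):

	/* handle peers with add-path */
	if (peer_has_add_path(peer, aid, CAPA_AP_SEND)) {
#if NOTYET
		/* XXX skip addpath all optimisation for now ... */
		if (peer->eval.mode == ADDPATH_EVAL_ALL) {
			up_generate_addpath_all(peer, re, newpath,
			    old_pathid_tx);
			return;
		}
#endif
		up_generate_addpath(peer, re);
		return;
	}

With the return inside the ADDPATH_EVAL_ALL branch the else goes away and
up_generate_addpath() ends up at its normal indent outside the #if block.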