From: Claudio Jeker
Subject: Re: bgpd: split up processing of updates
To: Theo Buehler
Cc: tech@openbsd.org
Date: Sun, 28 Dec 2025 17:34:23 +0100

On Sun, Dec 28, 2025 at 01:06:31PM +0100, Theo Buehler wrote:
> On Sat, Dec 27, 2025 at 05:04:31PM +0100, Claudio Jeker wrote:
> > One of the biggest latency-inducing bottlenecks in bgpd is that UPDATE
> > messages are processed to completion. Mainly we do the input processing,
> > update the local-RIB and then also handle the updates for the adj-rib-out
> > at the same time.
> > 
> > The adj-rib-out is a loop over all peers and is therefore super heavy.
> > By taking this part of the pipeline into its own step we can drop the
> > latency inside the main poll loop by a lot.
> > 
> > For this we introduce a per-peer update queue that enqueues the rib
> > entries and then handles those one by one. This also has the benefit that
> > a single peer cannot monopolize the processing in bgpd. So a single
> > flapping peer should result in far less noticeable delays on UPDATES
> > from all other peers.
> > 
> > For now I had to disable the addpath send all optimisation since for that
> > we need to keep an extra queue of updates. Instead we just use the regular
> > addpath codepath that is less optimised but produces the same results.
> > 
> > I will add the needed code later on.
> 
> See a couple comments below.
> 
> >  enum eval_mode {
> > +	EVAL_RECONF,
> >  	EVAL_DEFAULT,
> >  	EVAL_ALL,
> > -	EVAL_RECONF,
> 
> I found this change confusing. I think you handle things correctly,
> but using 0 (== EVAL_RECONF) as a canary value feels a bit fragile.
> I mean the
> 
> 	if (re->pq_mode) {
> 		...
> 	}
> 
> in peer_generate_update(), the return re->pq_mode != 0 in re_is_queued()
> and rib_dequeue() doing re->pq_mode = 0.
> 
> Should 0 not be EVAL_UNQUEUED or something?

See the diff below; I used EVAL_NONE since we have a few other NONE enums
in bgpd. I think I replaced all places that used 0 with EVAL_NONE and also
added it in rib_add() to be explicit.

> > +#if NOTYET
> > +	/* XXX skip addpath all optimisation for now until the queue
> > +	 * is extended to allow this mode to work again.
> > +	 * This mode is not very common and so taking the slow path
> > +	 * here like for all other addpath send modes is not that
> > +	 * big of an issue right now.
> > +	 */
> 
> Comment wants more indent.
> 
> >  		if (peer->eval.mode == ADDPATH_EVAL_ALL)
> >  			up_generate_addpath_all(peer, re, newpath,
> >  			    old_pathid_tx);
> 
> consider a return here and dropping the else, then the up_generate_addpath()
> can be without weird extra indent.

All done. New diff below.
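As an aside, for anyone skimming: stripped of all bgpd specifics, the
round-robin idea behind the per-peer queue boils down to something like the
toy program below. struct entry, struct peer and process_one() are made-up
names for illustration only, not the real types.

/*
 * Toy sketch of the per-peer work queue idea, with invented minimal
 * types (not the bgpd structs): every peer owns a TAILQ of pending
 * entries and the loop pops at most one entry per peer per pass, so
 * a flapping peer cannot starve the others.
 */
#include <sys/queue.h>
#include <stdio.h>

struct entry {
	TAILQ_ENTRY(entry)	 queue;
	int			 id;
};
TAILQ_HEAD(entry_queue, entry);

struct peer {
	struct entry_queue	 pq;
	const char		*name;
};

/* handle at most one queued entry for this peer */
static void
process_one(struct peer *p)
{
	struct entry	*e;

	if ((e = TAILQ_FIRST(&p->pq)) == NULL)
		return;
	TAILQ_REMOVE(&p->pq, e, queue);
	printf("%s: processed entry %d\n", p->name, e->id);
}

int
main(void)
{
	struct peer a = { .name = "peer A" };
	struct peer b = { .name = "peer B" };
	struct entry e[4] = { { .id = 0 }, { .id = 1 }, { .id = 2 }, { .id = 3 } };
	int round;

	TAILQ_INIT(&a.pq);
	TAILQ_INIT(&b.pq);

	/* peer A is "flapping" and queues three entries, peer B just one */
	TAILQ_INSERT_TAIL(&a.pq, &e[0], queue);
	TAILQ_INSERT_TAIL(&a.pq, &e[1], queue);
	TAILQ_INSERT_TAIL(&a.pq, &e[2], queue);
	TAILQ_INSERT_TAIL(&b.pq, &e[3], queue);

	/* round robin: peer B's single entry already goes out in round 0 */
	for (round = 0; round < 3; round++) {
		process_one(&a);
		process_one(&b);
	}
	return 0;
}

The real code does the equivalent in peer_process_updates() in the diff,
except that each dequeued rib_entry is then offered to every peer via
peer_generate_update().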
-- 
:wq Claudio

Index: rde.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde.c,v
diff -u -p -r1.678 rde.c
--- rde.c	24 Dec 2025 07:59:55 -0000	1.678
+++ rde.c	27 Dec 2025 15:47:29 -0000
@@ -326,6 +326,7 @@ rde_main(int debug, int verbose)
 			    monotime_to_usec(monotime_sub(io_end, loop_start));
 
 		peer_foreach(rde_dispatch_imsg_peer, NULL);
+		peer_foreach(peer_process_updates, NULL);
 		peer_end = getmonotime();
 		rdemem.rde_event_peer_usec +=
Index: rde.h
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde.h,v
diff -u -p -r1.336 rde.h
--- rde.h	24 Dec 2025 07:59:55 -0000	1.336
+++ rde.h	28 Dec 2025 16:25:11 -0000
@@ -41,13 +41,19 @@ enum peer_state {
 LIST_HEAD(prefix_list, prefix);
 TAILQ_HEAD(prefix_queue, prefix);
 RB_HEAD(rib_tree, rib_entry);
+TAILQ_HEAD(rib_queue, rib_entry);
+LIST_HEAD(rib_pq_head, rib_pq);
 
 struct rib_entry {
 	RB_ENTRY(rib_entry)	 rib_e;
+	TAILQ_ENTRY(rib_entry)	 rib_queue;
 	struct prefix_queue	 prefix_h;
 	struct pt_entry		*prefix;
+	struct rib_pq_head	 rib_pq_list;
+	uint32_t		 pq_peer_id;
 	uint16_t		 rib_id;
-	uint16_t		 lock;
+	uint8_t			 lock;
+	uint8_t			 pq_mode;
 };
 
 struct rib {
@@ -93,6 +99,7 @@ struct rde_peer {
 	struct pend_prefix_hash		 pend_prefixes;
 	struct filter_head		*out_rules;
 	struct ibufqueue		*ibufq;
+	struct rib_queue		 rib_pq_head;
 	monotime_t			 staletime[AID_MAX];
 	uint32_t			 adjout_bid;
 	uint32_t			 remote_bgpid;
@@ -350,9 +357,10 @@ struct filterstate {
 };
 
 enum eval_mode {
+	EVAL_NONE,
+	EVAL_RECONF,
 	EVAL_DEFAULT,
 	EVAL_ALL,
-	EVAL_RECONF,
 };
 
 struct rib_context {
@@ -414,6 +422,7 @@ struct filter_head	*peer_apply_out_filte
 
 void		 rde_generate_updates(struct rib_entry *, struct prefix *,
 		    uint32_t, enum eval_mode);
+void		 peer_process_updates(struct rde_peer *, void *);
 void		 peer_up(struct rde_peer *, struct session_up *);
 void		 peer_down(struct rde_peer *);
 
@@ -612,6 +621,7 @@ int		 rib_dump_subtree(uint16_t, struct
 		    void (*)(void *, uint8_t), int (*)(void *));
 void		 rib_dump_terminate(void *);
+void		 rib_dequeue(struct rib_entry *);
 
 extern struct rib flowrib;
Index: rde_peer.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde_peer.c,v
diff -u -p -r1.64 rde_peer.c
--- rde_peer.c	24 Dec 2025 07:59:55 -0000	1.64
+++ rde_peer.c	28 Dec 2025 16:29:22 -0000
@@ -26,6 +26,12 @@
 #include "bgpd.h"
 #include "rde.h"
 
+struct rib_pq {
+	LIST_ENTRY(rib_pq)	 entry;
+	struct prefix		*new;
+	uint32_t		 old_pathid_tx;
+};
+
 struct peer_tree	 peertable = RB_INITIALIZER(&peertable);
 struct peer_tree	 zombietable = RB_INITIALIZER(&zombietable);
 struct rde_peer		*peerself;
@@ -161,6 +167,7 @@ peer_add(uint32_t id, struct peer_config
 	if (peer == NULL)
 		fatal("peer_add");
 
+	TAILQ_INIT(&peer->rib_pq_head);
 	memcpy(&peer->conf, p_conf, sizeof(struct peer_config));
 	peer->remote_bgpid = 0;
 	peer->loc_rib_id = rib_find(peer->conf.rib);
@@ -264,17 +271,22 @@ peer_generate_update(struct rde_peer *pe
 	    peer->export_type == EXPORT_DEFAULT_ROUTE)
 		return;
 
-	/* if reconf skip peers which don't need to reconfigure */
-	if (mode == EVAL_RECONF && peer->reconf_out == 0)
-		return;
-
 	/* handle peers with add-path */
 	if (peer_has_add_path(peer, aid, CAPA_AP_SEND)) {
-		if (peer->eval.mode == ADDPATH_EVAL_ALL)
+#if NOTYET
+		/* XXX skip addpath all optimisation for now until the queue
+		 * is extended to allow this mode to work again.
+		 * This mode is not very common and so taking the slow path
+		 * here like for all other addpath send modes is not that
+		 * big of an issue right now.
+		 */
+		if (peer->eval.mode == ADDPATH_EVAL_ALL) {
 			up_generate_addpath_all(peer, re, newpath,
 			    old_pathid_tx);
-		else
-			up_generate_addpath(peer, re);
+			return;
+		}
+#endif
+		up_generate_addpath(peer, re);
 		return;
 	}
 
@@ -290,8 +302,66 @@ rde_generate_updates(struct rib_entry *r
 {
 	struct rde_peer	*peer;
 
-	RB_FOREACH(peer, peer_tree, &peertable)
-		peer_generate_update(peer, re, newpath, old_pathid_tx, mode);
+	switch (mode) {
+	case EVAL_RECONF:
+		/* skip peers which don't need to reconfigure */
+		RB_FOREACH(peer, peer_tree, &peertable) {
+			if (peer->reconf_out == 0)
+				continue;
+			peer_generate_update(peer, re, NULL, 0, EVAL_RECONF);
+		}
+		return;
+	case EVAL_DEFAULT:
+		break;
+	case EVAL_ALL:
+		/*
+		 * EVAL_DEFAULT is triggered when a new best path is selected.
+		 * EVAL_ALL is sent for any other update (needed for peers with
+		 * addpath or evaluate all set).
+		 * There can be only one EVAL_DEFAULT queued, it replaces the
+		 * previous one. A flag is enough.
+		 * A path can only exist once in the queue (old or new).
+		 */
+		if (re->pq_mode == EVAL_DEFAULT)
+			/* already a best path update pending, nothing to do */
+			return;
+
+		break;
+	case EVAL_NONE:
+		fatalx("bad eval mode in %s", __func__);
+	}
+
+	if (re->pq_mode != EVAL_NONE) {
+		peer = peer_get(re->pq_peer_id);
+		TAILQ_REMOVE(&peer->rib_pq_head, re, rib_queue);
+	}
+	if (newpath != NULL)
+		peer = prefix_peer(newpath);
+	else
+		peer = peerself;
+	re->pq_mode = mode;
+	re->pq_peer_id = peer->conf.id;
+	TAILQ_INSERT_TAIL(&peer->rib_pq_head, re, rib_queue);
+}
+
+void
+peer_process_updates(struct rde_peer *peer, void *bula)
+{
+	struct rib_entry *re;
+	struct rde_peer *p;
+	enum eval_mode mode;
+
+	re = TAILQ_FIRST(&peer->rib_pq_head);
+	if (re == NULL)
+		return;
+	TAILQ_REMOVE(&peer->rib_pq_head, re, rib_queue);
+
+	mode = re->pq_mode;
+
+	RB_FOREACH(p, peer_tree, &peertable)
+		peer_generate_update(p, re, NULL, 0, mode);
+
+	rib_dequeue(re);
 }
 
 /*
@@ -658,6 +728,8 @@ peer_work_pending(void)
 
 	RB_FOREACH(p, peer_tree, &peertable) {
 		if (ibufq_queuelen(p->ibufq) != 0)
+			return 1;
+		if (!TAILQ_EMPTY(&p->rib_pq_head))
 			return 1;
 	}
 
Index: rde_rib.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde_rib.c,v
diff -u -p -r1.286 rde_rib.c
--- rde_rib.c	27 Dec 2025 19:30:46 -0000	1.286
+++ rde_rib.c	28 Dec 2025 16:29:07 -0000
@@ -78,6 +78,12 @@ re_is_locked(struct rib_entry *re)
 	return (re->lock != 0);
 }
 
+static inline int
+re_is_queued(struct rib_entry *re)
+{
+	return (re->pq_mode != EVAL_NONE);
+}
+
 static inline struct rib_tree *
 rib_tree(struct rib *rib)
 {
@@ -317,6 +323,7 @@ rib_add(struct rib *rib, struct pt_entry
 	TAILQ_INIT(&re->prefix_h);
 	re->prefix = pt_ref(pte);
 	re->rib_id = rib->id;
+	re->pq_mode = EVAL_NONE;
 
 	if (RB_INSERT(rib_tree, rib_tree(rib), re) != NULL) {
 		log_warnx("rib_add: insert failed");
@@ -335,8 +342,8 @@ rib_remove(struct rib_entry *re)
 	if (!rib_empty(re))
 		fatalx("rib_remove: entry not empty");
 
-	if (re_is_locked(re))
-		/* entry is locked, don't free it. */
+	if (re_is_locked(re) || re_is_queued(re))
+		/* entry is locked or queued, don't free it. */
 		return;
 
 	pt_unref(re->prefix);
@@ -352,6 +359,14 @@ static inline int
 rib_empty(struct rib_entry *re)
 {
 	return TAILQ_EMPTY(&re->prefix_h);
+}
+
+void
+rib_dequeue(struct rib_entry *re)
+{
+	re->pq_mode = EVAL_NONE;
+	if (rib_empty(re))
+		rib_remove(re);
 }
 
 static struct rib_entry *
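To make the queue collapse rule in rde_generate_updates() easier to follow,
here is a tiny detached model of just the pq_mode bookkeeping. merge_mode()
is an invented helper for illustration, not part of the diff, and EVAL_RECONF
is left out because it is handled synchronously and never queued.

#include <assert.h>

enum eval_mode {
	EVAL_NONE,		/* sentinel: entry is not queued */
	EVAL_RECONF,
	EVAL_DEFAULT,
	EVAL_ALL,
};

/* mode the entry ends up queued with after a new event arrives */
static enum eval_mode
merge_mode(enum eval_mode queued, enum eval_mode incoming)
{
	/* a pending best-path update already covers any EVAL_ALL event */
	if (incoming == EVAL_ALL && queued == EVAL_DEFAULT)
		return EVAL_DEFAULT;
	return incoming;
}

int
main(void)
{
	/* EVAL_ALL cannot downgrade a pending best-path update */
	assert(merge_mode(EVAL_DEFAULT, EVAL_ALL) == EVAL_DEFAULT);
	/* a best-path update replaces a pending EVAL_ALL */
	assert(merge_mode(EVAL_ALL, EVAL_DEFAULT) == EVAL_DEFAULT);
	/* an unqueued entry simply takes the incoming mode */
	assert(merge_mode(EVAL_NONE, EVAL_ALL) == EVAL_ALL);
	return 0;
}

On top of that rule, the diff also unlinks an already queued entry from its
previous peer's queue and requeues it at the tail with the new mode and
owning peer id.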