From: Claudio Jeker
Subject: bgpd: split up processing of updates
To: tech@openbsd.org
Date: Sat, 27 Dec 2025 17:04:31 +0100

One of the biggest latency-inducing bottlenecks in bgpd is that UPDATE
messages are processed to completion. Mainly we do the input processing,
update the local-RIB, and then also handle the updates for the
adj-rib-out, all at the same time. The adj-rib-out part is a loop over
all peers and is therefore super heavy. By splitting this part of the
pipeline into its own step we can drop the latency inside the main poll
loop by a lot.

For this we introduce a per-peer update queue that enqueues the rib
entries and then handles them one by one. This also has the benefit that
a single peer cannot monopolize the processing in bgpd, so a single
flapping peer should cause far less noticeable delay for UPDATEs from
all other peers.

For now I had to disable the addpath "send all" optimisation since for
that we need to keep an extra queue of updates. Instead we just use the
regular addpath code path, which is less optimised but produces the same
results. I will add the needed code later on.
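
Roughly, the scheme works like the stand-alone sketch below. This is a
simplified illustration only, not the code from the diff: struct peer,
enqueue_update() and process_one_update() are made-up stand-ins. The
point it shows is that each poll loop iteration drains at most one
queued rib entry per originating peer, and only that entry is then
fanned out to all peers.

/*
 * Simplified stand-alone sketch of the per-peer update queue, not the
 * actual bgpd code. Updates are queued on the peer that originated the
 * path; each poll loop iteration drains at most one entry per peer and
 * that entry is then fanned out to all peers (the adj-rib-out work).
 */
#include <sys/queue.h>
#include <stdio.h>

struct rib_entry {
	TAILQ_ENTRY(rib_entry)	 rib_queue;	/* per-peer queue linkage */
	int			 id;
};

TAILQ_HEAD(rib_queue, rib_entry);

struct peer {
	struct rib_queue	 rib_pq_head;	/* pending update queue */
	const char		*name;
};

/* called when an UPDATE changed this rib entry: remember it for later */
static void
enqueue_update(struct peer *p, struct rib_entry *re)
{
	TAILQ_INSERT_TAIL(&p->rib_pq_head, re, rib_queue);
}

/* drain at most ONE pending entry per call */
static void
process_one_update(struct peer *p)
{
	struct rib_entry *re;

	re = TAILQ_FIRST(&p->rib_pq_head);
	if (re == NULL)
		return;
	TAILQ_REMOVE(&p->rib_pq_head, re, rib_queue);
	/* here bgpd would loop over all peers and do the adj-rib-out work */
	printf("%s: rib entry %d -> generate updates for all peers\n",
	    p->name, re->id);
}

int
main(void)
{
	struct rib_entry re[5] = {
		{ .id = 1 }, { .id = 2 }, { .id = 3 }, { .id = 4 }, { .id = 5 }
	};
	struct peer flappy = { .name = "flappy" }, quiet = { .name = "quiet" };
	int i;

	TAILQ_INIT(&flappy.rib_pq_head);
	TAILQ_INIT(&quiet.rib_pq_head);

	/* a flapping peer queues many updates, the other peer just one */
	for (i = 0; i < 4; i++)
		enqueue_update(&flappy, &re[i]);
	enqueue_update(&quiet, &re[4]);

	/* main poll loop: one entry per peer per iteration */
	for (i = 1; i <= 4; i++) {
		printf("poll iteration %d\n", i);
		process_one_update(&flappy);
		process_one_update(&quiet);
	}
	return 0;
}

In the diff below the enqueue side is rde_generate_updates() and the
drain side is peer_process_updates(), called once per peer from the main
loop via peer_foreach().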

--
:wq Claudio

Index: rde.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde.c,v
diff -u -p -r1.678 rde.c
--- rde.c	24 Dec 2025 07:59:55 -0000	1.678
+++ rde.c	27 Dec 2025 15:47:29 -0000
@@ -326,6 +326,7 @@ rde_main(int debug, int verbose)
 		    monotime_to_usec(monotime_sub(io_end, loop_start));
 
 		peer_foreach(rde_dispatch_imsg_peer, NULL);
+		peer_foreach(peer_process_updates, NULL);
 
 		peer_end = getmonotime();
 		rdemem.rde_event_peer_usec +=
Index: rde.h
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde.h,v
diff -u -p -r1.336 rde.h
--- rde.h	24 Dec 2025 07:59:55 -0000	1.336
+++ rde.h	27 Dec 2025 15:47:29 -0000
@@ -41,13 +41,19 @@ enum peer_state {
 LIST_HEAD(prefix_list, prefix);
 TAILQ_HEAD(prefix_queue, prefix);
 RB_HEAD(rib_tree, rib_entry);
+TAILQ_HEAD(rib_queue, rib_entry);
+LIST_HEAD(rib_pq_head, rib_pq);
 
 struct rib_entry {
 	RB_ENTRY(rib_entry)	 rib_e;
+	TAILQ_ENTRY(rib_entry)	 rib_queue;
 	struct prefix_queue	 prefix_h;
 	struct pt_entry		*prefix;
+	struct rib_pq_head	 rib_pq_list;
+	uint32_t		 pq_peer_id;
 	uint16_t		 rib_id;
-	uint16_t		 lock;
+	uint8_t			 lock;
+	uint8_t			 pq_mode;
 };
 
 struct rib {
@@ -93,6 +99,7 @@ struct rde_peer {
 	struct pend_prefix_hash		 pend_prefixes;
 	struct filter_head		*out_rules;
 	struct ibufqueue		*ibufq;
+	struct rib_queue		 rib_pq_head;
 	monotime_t			 staletime[AID_MAX];
 	uint32_t			 adjout_bid;
 	uint32_t			 remote_bgpid;
@@ -350,9 +357,9 @@ struct filterstate {
 };
 
 enum eval_mode {
+	EVAL_RECONF,
 	EVAL_DEFAULT,
 	EVAL_ALL,
-	EVAL_RECONF,
 };
 
 struct rib_context {
@@ -414,6 +421,7 @@ struct filter_head *peer_apply_out_filte
 
 void		 rde_generate_updates(struct rib_entry *, struct prefix *,
 		     uint32_t, enum eval_mode);
+void		 peer_process_updates(struct rde_peer *, void *);
 
 void		 peer_up(struct rde_peer *, struct session_up *);
 void		 peer_down(struct rde_peer *);
@@ -612,6 +620,7 @@ int rib_dump_subtree(uint16_t, struct
 		     void (*)(void *, uint8_t), int (*)(void *));
 void		 rib_dump_terminate(void *);
+void		 rib_dequeue(struct rib_entry *);
 
 extern struct rib flowrib;
Index: rde_peer.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde_peer.c,v
diff -u -p -r1.64 rde_peer.c
--- rde_peer.c	24 Dec 2025 07:59:55 -0000	1.64
+++ rde_peer.c	27 Dec 2025 15:53:38 -0000
@@ -26,6 +26,12 @@
 #include "bgpd.h"
 #include "rde.h"
 
+struct rib_pq {
+	LIST_ENTRY(rib_pq)	 entry;
+	struct prefix		*new;
+	uint32_t		 old_pathid_tx;
+};
+
 struct peer_tree	 peertable = RB_INITIALIZER(&peertable);
 struct peer_tree	 zombietable = RB_INITIALIZER(&zombietable);
 struct rde_peer		*peerself;
@@ -161,6 +167,7 @@ peer_add(uint32_t id, struct peer_config
 	if (peer == NULL)
 		fatal("peer_add");
 
+	TAILQ_INIT(&peer->rib_pq_head);
 	memcpy(&peer->conf, p_conf, sizeof(struct peer_config));
 	peer->remote_bgpid = 0;
 	peer->loc_rib_id = rib_find(peer->conf.rib);
@@ -264,16 +271,20 @@ peer_generate_update(struct rde_peer *pe
 	    peer->export_type == EXPORT_DEFAULT_ROUTE)
 		return;
 
-	/* if reconf skip peers which don't need to reconfigure */
-	if (mode == EVAL_RECONF && peer->reconf_out == 0)
-		return;
-
 	/* handle peers with add-path */
 	if (peer_has_add_path(peer, aid, CAPA_AP_SEND)) {
+#if NOTYET
+		/* XXX skip addpath all optimisation for now until the queue
+		 * is extended to allow this mode to work again.
+		 * This mode is not very common and so taking the slow path
+		 * here like for all other addpath send modes is not that
+		 * big of an issue right now.
+		 */
 		if (peer->eval.mode == ADDPATH_EVAL_ALL)
 			up_generate_addpath_all(peer, re, newpath,
 			    old_pathid_tx);
 		else
+#endif
 			up_generate_addpath(peer, re);
 		return;
 	}
@@ -290,8 +301,64 @@ rde_generate_updates(struct rib_entry *r
 {
 	struct rde_peer	*peer;
 
-	RB_FOREACH(peer, peer_tree, &peertable)
-		peer_generate_update(peer, re, newpath, old_pathid_tx, mode);
+	switch (mode) {
+	case EVAL_RECONF:
+		/* skip peers which don't need to reconfigure */
+		RB_FOREACH(peer, peer_tree, &peertable) {
+			if (peer->reconf_out == 0)
+				continue;
+			peer_generate_update(peer, re, NULL, 0, EVAL_RECONF);
+		}
+		return;
+	case EVAL_DEFAULT:
+		break;
+	case EVAL_ALL:
+		/*
+		 * EVAL_DEFAULT is triggered when a new best path is selected.
+		 * EVAL_ALL is sent for any other update (needed for peers with
+		 * addpath or evaluate all set).
+		 * There can be only one EVAL_DEFAULT queued, it replaces the
+		 * previous one. A flag is enough.
+		 * A path can only exist once in the queue (old or new).
+		 */
+		if (re->pq_mode == EVAL_DEFAULT)
+			/* already a best path update pending, nothing to do */
+			return;
+
+		break;
+	}
+
+	if (re->pq_mode) {
+		peer = peer_get(re->pq_peer_id);
+		TAILQ_REMOVE(&peer->rib_pq_head, re, rib_queue);
+	}
+	if (newpath != NULL)
+		peer = prefix_peer(newpath);
+	else
+		peer = peerself;
+	re->pq_mode = mode;
+	re->pq_peer_id = peer->conf.id;
+	TAILQ_INSERT_TAIL(&peer->rib_pq_head, re, rib_queue);
+}
+
+void
+peer_process_updates(struct rde_peer *peer, void *bula)
+{
+	struct rib_entry	*re;
+	struct rde_peer		*p;
+	enum eval_mode		 mode;
+
+	re = TAILQ_FIRST(&peer->rib_pq_head);
+	if (re == NULL)
+		return;
+	TAILQ_REMOVE(&peer->rib_pq_head, re, rib_queue);
+
+	mode = re->pq_mode;
+
+	RB_FOREACH(p, peer_tree, &peertable)
+		peer_generate_update(p, re, NULL, 0, mode);
+
+	rib_dequeue(re);
 }
 
 /*
@@ -658,6 +725,8 @@ peer_work_pending(void)
 
 	RB_FOREACH(p, peer_tree, &peertable) {
 		if (ibufq_queuelen(p->ibufq) != 0)
+			return 1;
+		if (!TAILQ_EMPTY(&p->rib_pq_head))
 			return 1;
 	}
 
Index: rde_rib.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde_rib.c,v
diff -u -p -r1.285 rde_rib.c
--- rde_rib.c	24 Dec 2025 07:59:55 -0000	1.285
+++ rde_rib.c	27 Dec 2025 15:47:29 -0000
@@ -77,6 +77,12 @@ re_is_locked(struct rib_entry *re)
 	return (re->lock != 0);
 }
 
+static inline int
+re_is_queued(struct rib_entry *re)
+{
+	return (re->pq_mode != 0);
+}
+
 static inline struct rib_tree *
 rib_tree(struct rib *rib)
 {
@@ -334,8 +340,8 @@ rib_remove(struct rib_entry *re)
 	if (!rib_empty(re))
 		fatalx("rib_remove: entry not empty");
 
-	if (re_is_locked(re))
-		/* entry is locked, don't free it. */
+	if (re_is_locked(re) || re_is_queued(re))
+		/* entry is locked or queued, don't free it. */
 		return;
 
 	pt_unref(re->prefix);
@@ -351,6 +357,14 @@ static inline int
 rib_empty(struct rib_entry *re)
 {
 	return TAILQ_EMPTY(&re->prefix_h);
+}
+
+void
+rib_dequeue(struct rib_entry *re)
+{
+	re->pq_mode = 0;
+	if (rib_empty(re))
+		rib_remove(re);
 }
 
 static struct rib_entry *
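
One detail worth spelling out from the last hunks: while a rib_entry
sits on a peer's update queue (nonzero pq_mode) it must not be freed,
even if its last prefix is withdrawn, so rib_remove() now defers and
rib_dequeue() re-checks rib_empty() after clearing the mark. This is
also why the enum eval_mode reorder puts EVAL_RECONF in the zero slot:
EVAL_RECONF is handled synchronously and never enqueued, so a nonzero
pq_mode can double as the "is queued" flag. Below is a tiny stand-alone
sketch of that lifetime rule; entry_remove() and entry_dequeue() are
made-up stand-ins that only mimic the roles of rib_remove() and
rib_dequeue(), they are not the code from the diff.

/*
 * Simplified stand-alone sketch of the queued-entry lifetime, not the
 * actual bgpd code.
 */
#include <stdio.h>
#include <stdlib.h>

struct entry {
	int	 nprefixes;	/* stand-in for the prefix_h list */
	int	 pq_mode;	/* 0 = not queued, nonzero = queued */
};

static int
entry_empty(struct entry *e)
{
	return (e->nprefixes == 0);
}

/* like rib_remove(): never free an entry that is still queued */
static void
entry_remove(struct entry *e)
{
	if (!entry_empty(e))
		return;
	if (e->pq_mode != 0) {
		printf("empty but still queued, defer the free\n");
		return;
	}
	printf("freeing entry\n");
	free(e);
}

/* like rib_dequeue(): clear the queue mark, then retry the removal */
static void
entry_dequeue(struct entry *e)
{
	e->pq_mode = 0;
	if (entry_empty(e))
		entry_remove(e);
}

int
main(void)
{
	struct entry *e;

	if ((e = calloc(1, sizeof(*e))) == NULL)
		return 1;
	e->nprefixes = 1;
	e->pq_mode = 2;		/* queued for an adj-rib-out run */

	e->nprefixes = 0;	/* last prefix withdrawn while queued */
	entry_remove(e);	/* deferred: the entry is still on a queue */

	entry_dequeue(e);	/* after processing: now it gets freed */
	return 0;
}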