From: Claudio Jeker
Subject: bgpd: split up processing of updates
To: tech@openbsd.org
Date: Sat, 27 Dec 2025 17:04:31 +0100

One of the biggest latency-inducing bottlenecks in bgpd is that UPDATE
messages are processed to completion. Mainly we do the input processing,
update the local-RIB, and then also handle the updates for the
adj-rib-out, all at the same time. The adj-rib-out part is a loop over
all peers and is therefore super heavy. By splitting this part of the
pipeline into its own step we can drop the latency inside the main poll
loop by a lot.

For this we introduce a per-peer update queue that enqueues the rib
entries and then handles them one by one. This also has the benefit that
a single peer cannot monopolize the processing in bgpd, so a single
flapping peer should cause far less noticeable delay for UPDATEs from
all other peers.

For now I had to disable the addpath "send all" optimisation since for
that we need to keep an extra queue of updates. Instead we just use the
regular addpath code path, which is less optimised but produces the same
results. I will add the needed code later on.
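
Roughly, the scheme works like the stand-alone sketch below. This is a
simplified illustration only, not the code from the diff: struct peer,
enqueue_update() and process_one_update() are made-up stand-ins. The
point it shows is that each poll loop iteration drains at most one
queued rib entry per originating peer, and only that entry is then
fanned out to all peers.

/*
 * Simplified stand-alone sketch of the per-peer update queue, not the
 * actual bgpd code. Updates are queued on the peer that originated the
 * path; each poll loop iteration drains at most one entry per peer and
 * that entry is then fanned out to all peers (the adj-rib-out work).
 */
#include <sys/queue.h>
#include <stdio.h>

struct rib_entry {
	TAILQ_ENTRY(rib_entry)	 rib_queue;	/* per-peer queue linkage */
	int			 id;
};

TAILQ_HEAD(rib_queue, rib_entry);

struct peer {
	struct rib_queue	 rib_pq_head;	/* pending update queue */
	const char		*name;
};

/* called when an UPDATE changed this rib entry: remember it for later */
static void
enqueue_update(struct peer *p, struct rib_entry *re)
{
	TAILQ_INSERT_TAIL(&p->rib_pq_head, re, rib_queue);
}

/* drain at most ONE pending entry per call */
static void
process_one_update(struct peer *p)
{
	struct rib_entry *re;

	re = TAILQ_FIRST(&p->rib_pq_head);
	if (re == NULL)
		return;
	TAILQ_REMOVE(&p->rib_pq_head, re, rib_queue);
	/* here bgpd would loop over all peers and do the adj-rib-out work */
	printf("%s: rib entry %d -> generate updates for all peers\n",
	    p->name, re->id);
}

int
main(void)
{
	struct rib_entry re[5] = {
		{ .id = 1 }, { .id = 2 }, { .id = 3 }, { .id = 4 }, { .id = 5 }
	};
	struct peer flappy = { .name = "flappy" }, quiet = { .name = "quiet" };
	int i;

	TAILQ_INIT(&flappy.rib_pq_head);
	TAILQ_INIT(&quiet.rib_pq_head);

	/* a flapping peer queues many updates, the other peer just one */
	for (i = 0; i < 4; i++)
		enqueue_update(&flappy, &re[i]);
	enqueue_update(&quiet, &re[4]);

	/* main poll loop: one entry per peer per iteration */
	for (i = 1; i <= 4; i++) {
		printf("poll iteration %d\n", i);
		process_one_update(&flappy);
		process_one_update(&quiet);
	}
	return 0;
}

In the diff below the enqueue side is rde_generate_updates() and the
drain side is peer_process_updates(), called once per peer from the main
loop via peer_foreach().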

--
:wq Claudio

Index: rde.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde.c,v
diff -u -p -r1.678 rde.c
--- rde.c	24 Dec 2025 07:59:55 -0000	1.678
+++ rde.c	27 Dec 2025 15:47:29 -0000
@@ -326,6 +326,7 @@ rde_main(int debug, int verbose)
 		    monotime_to_usec(monotime_sub(io_end, loop_start));
 
 		peer_foreach(rde_dispatch_imsg_peer, NULL);
+		peer_foreach(peer_process_updates, NULL);
 
 		peer_end = getmonotime();
 		rdemem.rde_event_peer_usec +=
Index: rde.h
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde.h,v
diff -u -p -r1.336 rde.h
--- rde.h	24 Dec 2025 07:59:55 -0000	1.336
+++ rde.h	27 Dec 2025 15:47:29 -0000
@@ -41,13 +41,19 @@ enum peer_state {
 LIST_HEAD(prefix_list, prefix);
 TAILQ_HEAD(prefix_queue, prefix);
 RB_HEAD(rib_tree, rib_entry);
+TAILQ_HEAD(rib_queue, rib_entry);
+LIST_HEAD(rib_pq_head, rib_pq);
 
 struct rib_entry {
 	RB_ENTRY(rib_entry)	 rib_e;
+	TAILQ_ENTRY(rib_entry)	 rib_queue;
 	struct prefix_queue	 prefix_h;
 	struct pt_entry		*prefix;
+	struct rib_pq_head	 rib_pq_list;
+	uint32_t		 pq_peer_id;
 	uint16_t		 rib_id;
-	uint16_t		 lock;
+	uint8_t			 lock;
+	uint8_t			 pq_mode;
 };
 
 struct rib {
@@ -93,6 +99,7 @@ struct rde_peer {
 	struct pend_prefix_hash		 pend_prefixes;
 	struct filter_head		*out_rules;
 	struct ibufqueue		*ibufq;
+	struct rib_queue		 rib_pq_head;
 	monotime_t			 staletime[AID_MAX];
 	uint32_t			 adjout_bid;
 	uint32_t			 remote_bgpid;
@@ -350,9 +357,9 @@ struct filterstate {
 };
 
 enum eval_mode {
+	EVAL_RECONF,
 	EVAL_DEFAULT,
 	EVAL_ALL,
-	EVAL_RECONF,
 };
 
 struct rib_context {
@@ -414,6 +421,7 @@ struct filter_head *peer_apply_out_filte
 
 void		 rde_generate_updates(struct rib_entry *, struct prefix *,
 		     uint32_t, enum eval_mode);
+void		 peer_process_updates(struct rde_peer *, void *);
 
 void		 peer_up(struct rde_peer *, struct session_up *);
 void		 peer_down(struct rde_peer *);
@@ -612,6 +620,7 @@ int rib_dump_subtree(uint16_t, struct
 		     void (*)(void *, uint8_t), int (*)(void *));
 void		 rib_dump_terminate(void *);
+void		 rib_dequeue(struct rib_entry *);
 
 extern struct rib flowrib;
Index: rde_peer.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde_peer.c,v
diff -u -p -r1.64 rde_peer.c
--- rde_peer.c	24 Dec 2025 07:59:55 -0000	1.64
+++ rde_peer.c	27 Dec 2025 15:53:38 -0000
@@ -26,6 +26,12 @@
 #include "bgpd.h"
 #include "rde.h"
 
+struct rib_pq {
+	LIST_ENTRY(rib_pq)	 entry;
+	struct prefix		*new;
+	uint32_t		 old_pathid_tx;
+};
+
 struct peer_tree	 peertable = RB_INITIALIZER(&peertable);
 struct peer_tree	 zombietable = RB_INITIALIZER(&zombietable);
 struct rde_peer		*peerself;
@@ -161,6 +167,7 @@ peer_add(uint32_t id, struct peer_config
 	if (peer == NULL)
 		fatal("peer_add");
 
+	TAILQ_INIT(&peer->rib_pq_head);
 	memcpy(&peer->conf, p_conf, sizeof(struct peer_config));
 	peer->remote_bgpid = 0;
 	peer->loc_rib_id = rib_find(peer->conf.rib);
@@ -264,16 +271,20 @@ peer_generate_update(struct rde_peer *pe
 	    peer->export_type == EXPORT_DEFAULT_ROUTE)
 		return;
 
-	/* if reconf skip peers which don't need to reconfigure */
-	if (mode == EVAL_RECONF && peer->reconf_out == 0)
-		return;
-
 	/* handle peers with add-path */
 	if (peer_has_add_path(peer, aid, CAPA_AP_SEND)) {
+#if NOTYET
+		/* XXX skip addpath all optimisation for now until the queue
+		 * is extended to allow this mode to work again.
+		 * This mode is not very common and so taking the slow path
+		 * here like for all other addpath send modes is not that
+		 * big of an issue right now.
+		 */
 		if (peer->eval.mode == ADDPATH_EVAL_ALL)
 			up_generate_addpath_all(peer, re, newpath,
 			    old_pathid_tx);
 		else
+#endif
 			up_generate_addpath(peer, re);
 		return;
 	}
@@ -290,8 +301,64 @@ rde_generate_updates(struct rib_entry *r
 {
 	struct rde_peer	*peer;
 
-	RB_FOREACH(peer, peer_tree, &peertable)
-		peer_generate_update(peer, re, newpath, old_pathid_tx, mode);
+	switch (mode) {
+	case EVAL_RECONF:
+		/* skip peers which don't need to reconfigure */
+		RB_FOREACH(peer, peer_tree, &peertable) {
+			if (peer->reconf_out == 0)
+				continue;
+			peer_generate_update(peer, re, NULL, 0, EVAL_RECONF);
+		}
+		return;
+	case EVAL_DEFAULT:
+		break;
+	case EVAL_ALL:
+		/*
+		 * EVAL_DEFAULT is triggered when a new best path is selected.
+		 * EVAL_ALL is sent for any other update (needed for peers with
+		 * addpath or evaluate all set).
+		 * There can be only one EVAL_DEFAULT queued, it replaces the
+		 * previous one. A flag is enough.
+		 * A path can only exist once in the queue (old or new).
+		 */
+		if (re->pq_mode == EVAL_DEFAULT)
+			/* already a best path update pending, nothing to do */
+			return;
+
+		break;
+	}
+
+	if (re->pq_mode) {
+		peer = peer_get(re->pq_peer_id);
+		TAILQ_REMOVE(&peer->rib_pq_head, re, rib_queue);
+	}
+	if (newpath != NULL)
+		peer = prefix_peer(newpath);
+	else
+		peer = peerself;
+	re->pq_mode = mode;
+	re->pq_peer_id = peer->conf.id;
+	TAILQ_INSERT_TAIL(&peer->rib_pq_head, re, rib_queue);
+}
+
+void
+peer_process_updates(struct rde_peer *peer, void *bula)
+{
+	struct rib_entry	*re;
+	struct rde_peer		*p;
+	enum eval_mode		 mode;
+
+	re = TAILQ_FIRST(&peer->rib_pq_head);
+	if (re == NULL)
+		return;
+	TAILQ_REMOVE(&peer->rib_pq_head, re, rib_queue);
+
+	mode = re->pq_mode;
+
+	RB_FOREACH(p, peer_tree, &peertable)
+		peer_generate_update(p, re, NULL, 0, mode);
+
+	rib_dequeue(re);
 }
 
 /*
@@ -658,6 +725,8 @@ peer_work_pending(void)
 
 	RB_FOREACH(p, peer_tree, &peertable) {
 		if (ibufq_queuelen(p->ibufq) != 0)
+			return 1;
+		if (!TAILQ_EMPTY(&p->rib_pq_head))
 			return 1;
 	}
 
Index: rde_rib.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde_rib.c,v
diff -u -p -r1.285 rde_rib.c
--- rde_rib.c	24 Dec 2025 07:59:55 -0000	1.285
+++ rde_rib.c	27 Dec 2025 15:47:29 -0000
@@ -77,6 +77,12 @@ re_is_locked(struct rib_entry *re)
 	return (re->lock != 0);
 }
 
+static inline int
+re_is_queued(struct rib_entry *re)
+{
+	return (re->pq_mode != 0);
+}
+
 static inline struct rib_tree *
 rib_tree(struct rib *rib)
 {
@@ -334,8 +340,8 @@ rib_remove(struct rib_entry *re)
 	if (!rib_empty(re))
 		fatalx("rib_remove: entry not empty");
 
-	if (re_is_locked(re))
-		/* entry is locked, don't free it. */
+	if (re_is_locked(re) || re_is_queued(re))
+		/* entry is locked or queued, don't free it. */
 		return;
 
 	pt_unref(re->prefix);
@@ -351,6 +357,14 @@ static inline int
 rib_empty(struct rib_entry *re)
 {
 	return TAILQ_EMPTY(&re->prefix_h);
+}
+
+void
+rib_dequeue(struct rib_entry *re)
+{
+	re->pq_mode = 0;
+	if (rib_empty(re))
+		rib_remove(re);
 }
 
 static struct rib_entry *
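
One detail worth spelling out from the last hunks: while a rib_entry
sits on a peer's update queue (nonzero pq_mode) it must not be freed,
even if its last prefix is withdrawn, so rib_remove() now defers and
rib_dequeue() re-checks rib_empty() after clearing the mark. This is
also why the enum eval_mode reorder puts EVAL_RECONF in the zero slot:
EVAL_RECONF is handled synchronously and never enqueued, so a nonzero
pq_mode can double as the "is queued" flag. Below is a tiny stand-alone
sketch of that lifetime rule; entry_remove() and entry_dequeue() are
made-up stand-ins that only mimic the roles of rib_remove() and
rib_dequeue(), they are not the code from the diff.

/*
 * Simplified stand-alone sketch of the queued-entry lifetime, not the
 * actual bgpd code.
 */
#include <stdio.h>
#include <stdlib.h>

struct entry {
	int	 nprefixes;	/* stand-in for the prefix_h list */
	int	 pq_mode;	/* 0 = not queued, nonzero = queued */
};

static int
entry_empty(struct entry *e)
{
	return (e->nprefixes == 0);
}

/* like rib_remove(): never free an entry that is still queued */
static void
entry_remove(struct entry *e)
{
	if (!entry_empty(e))
		return;
	if (e->pq_mode != 0) {
		printf("empty but still queued, defer the free\n");
		return;
	}
	printf("freeing entry\n");
	free(e);
}

/* like rib_dequeue(): clear the queue mark, then retry the removal */
static void
entry_dequeue(struct entry *e)
{
	e->pq_mode = 0;
	if (entry_empty(e))
		entry_remove(e);
}

int
main(void)
{
	struct entry *e;

	if ((e = calloc(1, sizeof(*e))) == NULL)
		return 1;
	e->nprefixes = 1;
	e->pq_mode = 2;		/* queued for an adj-rib-out run */

	e->nprefixes = 0;	/* last prefix withdrawn while queued */
	entry_remove(e);	/* deferred: the entry is still on a queue */

	entry_dequeue(e);	/* after processing: now it gets freed */
	return 0;
}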