
From: Claudio Jeker <cjeker@diehard.n-r-g.com>
Subject: Re: bgpd: split up processing of updates
To: Theo Buehler <tb@theobuehler.org>
Cc: tech@openbsd.org
Date: Sun, 28 Dec 2025 17:34:23 +0100

On Sun, Dec 28, 2025 at 01:06:31PM +0100, Theo Buehler wrote:
> On Sat, Dec 27, 2025 at 05:04:31PM +0100, Claudio Jeker wrote:
> > One of the biggest latency-inducing bottlenecks in bgpd is that UPDATE
> > messages are processed to completion: we do the input processing,
> > update the local-RIB, and also handle the updates for the adj-rib-out,
> > all in the same step.
> > 
> > The adj-rib-out update is a loop over all peers and is therefore super
> > heavy. By splitting this part of the pipeline into its own step we can
> > drop the latency inside the main poll loop by a lot.
> > 
> > For this we introduce a per-peer update queue that enqueues the rib
> > entries and then handles them one by one. This also has the benefit
> > that a single peer cannot monopolize the processing in bgpd, so a
> > flapping peer should cause far less noticeable delays for the UPDATES
> > of all other peers.
> > 
> > For now I had to disable the addpath "send all" optimisation since for
> > that we need to keep an extra queue of updates. Instead we just use the
> > regular addpath codepath, which is less optimised but produces the same
> > results.
> > 
> > I will add the needed code later on.
> 
> See a couple comments below.


> >  enum eval_mode {
> > +	EVAL_RECONF,
> >  	EVAL_DEFAULT,
> >  	EVAL_ALL,
> > -	EVAL_RECONF,
> 
> I found this change confusing. I think you handle things correctly,
> but using 0 (== EVAL_RECONF) as a sentinel value feels a bit fragile.
> I mean the
> 
> 	if (re->pq_mode) {
> 	...
> 	}
> 
> in peer_generate_update(), the "return re->pq_mode != 0" in re_is_queued(),
> and rib_dequeue() setting re->pq_mode = 0.
> 
> Should 0 not be EVAL_UNQUEUED or something?

See the diff below; I used EVAL_NONE since we have a few other NONE enums
in bgpd. I think I replaced all the places that used 0 with EVAL_NONE and
also set it explicitly in rib_add().
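
Spelled out, the point is just that 0 now has a name and a calloc'd rib
entry starts out unqueued. Roughly like this (a stand-in struct and helper
name, not the real struct rib_entry / re_is_queued()):

enum eval_mode {
	EVAL_NONE,	/* not on any per-peer update queue */
	EVAL_RECONF,
	EVAL_DEFAULT,
	EVAL_ALL,
};

struct entry {			/* stand-in for struct rib_entry */
	enum eval_mode	 pq_mode;
};

/* same check as re_is_queued() in the diff, with the 0 spelled out */
static inline int
entry_is_queued(struct entry *e)
{
	return (e->pq_mode != EVAL_NONE);
}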
 
> > +#if NOTYET
> > +	/* XXX skip addpath all optimisation for now until the queue
> > +	 * is extended to allow this mode to work again.
> > +	 * This mode is not very common and so taking the slow path
> > +	 * here like for all other addpath send modes is not that
> > +	 * big of an issue right now.
> > +	 */
> 
> Comment wants more indent.
> 
> >  		if (peer->eval.mode == ADDPATH_EVAL_ALL)
> >  			up_generate_addpath_all(peer, re, newpath,
> >  			    old_pathid_tx);
> 
> Consider a return here and dropping the else; then the up_generate_addpath()
> call can do without the weird extra indent.
> 

All done.
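
For anyone skimming the thread, the queueing itself boils down to the
sketch below. It is heavily simplified: stand-in types and names, the
originating peer passed in directly, and none of the addpath or
EVAL_DEFAULT/EVAL_ALL handling; the real code also records pq_peer_id so
an entry can move between per-peer queues.

#include <sys/queue.h>
#include <stddef.h>

enum eval_mode { EVAL_NONE, EVAL_RECONF, EVAL_DEFAULT, EVAL_ALL };

struct entry {				/* stand-in for struct rib_entry */
	TAILQ_ENTRY(entry)	 rib_queue;
	enum eval_mode		 pq_mode;	/* EVAL_NONE == not queued */
};
TAILQ_HEAD(entry_queue, entry);

struct peer {				/* stand-in for struct rde_peer */
	struct entry_queue	 rib_pq_head;	/* must be TAILQ_INIT'ed */
};

/* enqueue side, what rde_generate_updates() does after sorting out modes */
static void
generate_updates(struct peer *p, struct entry *e, enum eval_mode mode)
{
	/* an entry sits on at most one queue; requeue it at the tail */
	if (e->pq_mode != EVAL_NONE)
		TAILQ_REMOVE(&p->rib_pq_head, e, rib_queue);
	e->pq_mode = mode;
	TAILQ_INSERT_TAIL(&p->rib_pq_head, e, rib_queue);
}

/* drain side, what peer_process_updates() does once per poll iteration */
static void
process_updates(struct peer *p)
{
	struct entry	*e;

	/* handle a single entry per call to keep the poll loop latency low */
	if ((e = TAILQ_FIRST(&p->rib_pq_head)) == NULL)
		return;
	TAILQ_REMOVE(&p->rib_pq_head, e, rib_queue);

	/* ... adj-rib-out generation for all peers happens here ... */

	e->pq_mode = EVAL_NONE;		/* rib_dequeue() in the real diff */
}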
 
New diff below.
-- 
:wq Claudio

Index: rde.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde.c,v
diff -u -p -r1.678 rde.c
--- rde.c	24 Dec 2025 07:59:55 -0000	1.678
+++ rde.c	27 Dec 2025 15:47:29 -0000
@@ -326,6 +326,7 @@ rde_main(int debug, int verbose)
 		    monotime_to_usec(monotime_sub(io_end, loop_start));
 
 		peer_foreach(rde_dispatch_imsg_peer, NULL);
+		peer_foreach(peer_process_updates, NULL);
 
 		peer_end = getmonotime();
 		rdemem.rde_event_peer_usec +=
Index: rde.h
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde.h,v
diff -u -p -r1.336 rde.h
--- rde.h	24 Dec 2025 07:59:55 -0000	1.336
+++ rde.h	28 Dec 2025 16:25:11 -0000
@@ -41,13 +41,19 @@ enum peer_state {
 LIST_HEAD(prefix_list, prefix);
 TAILQ_HEAD(prefix_queue, prefix);
 RB_HEAD(rib_tree, rib_entry);
+TAILQ_HEAD(rib_queue, rib_entry);
+LIST_HEAD(rib_pq_head, rib_pq);
 
 struct rib_entry {
 	RB_ENTRY(rib_entry)	 rib_e;
+	TAILQ_ENTRY(rib_entry)	 rib_queue;
 	struct prefix_queue	 prefix_h;
 	struct pt_entry		*prefix;
+	struct rib_pq_head	 rib_pq_list;
+	uint32_t		 pq_peer_id;
 	uint16_t		 rib_id;
-	uint16_t		 lock;
+	uint8_t			 lock;
+	uint8_t			 pq_mode;
 };
 
 struct rib {
@@ -93,6 +99,7 @@ struct rde_peer {
 	struct pend_prefix_hash		 pend_prefixes;
 	struct filter_head		*out_rules;
 	struct ibufqueue		*ibufq;
+	struct rib_queue		 rib_pq_head;
 	monotime_t			 staletime[AID_MAX];
 	uint32_t			 adjout_bid;
 	uint32_t			 remote_bgpid;
@@ -350,9 +357,10 @@ struct filterstate {
 };
 
 enum eval_mode {
+	EVAL_NONE,
+	EVAL_RECONF,
 	EVAL_DEFAULT,
 	EVAL_ALL,
-	EVAL_RECONF,
 };
 
 struct rib_context {
@@ -414,6 +422,7 @@ struct filter_head	*peer_apply_out_filte
 
 void		 rde_generate_updates(struct rib_entry *, struct prefix *,
 		    uint32_t, enum eval_mode);
+void		 peer_process_updates(struct rde_peer *, void *);
 
 void		 peer_up(struct rde_peer *, struct session_up *);
 void		 peer_down(struct rde_peer *);
@@ -612,6 +621,7 @@ int		 rib_dump_subtree(uint16_t, struct 
 		    void (*)(void *, uint8_t),
 		    int (*)(void *));
 void		 rib_dump_terminate(void *);
+void		 rib_dequeue(struct rib_entry *);
 
 extern struct rib flowrib;
 
Index: rde_peer.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde_peer.c,v
diff -u -p -r1.64 rde_peer.c
--- rde_peer.c	24 Dec 2025 07:59:55 -0000	1.64
+++ rde_peer.c	28 Dec 2025 16:29:22 -0000
@@ -26,6 +26,12 @@
 #include "bgpd.h"
 #include "rde.h"
 
+struct rib_pq {
+	LIST_ENTRY(rib_pq)	entry;
+	struct prefix		*new;
+	uint32_t		old_pathid_tx;
+};
+
 struct peer_tree	 peertable = RB_INITIALIZER(&peertable);
 struct peer_tree	 zombietable = RB_INITIALIZER(&zombietable);
 struct rde_peer		*peerself;
@@ -161,6 +167,7 @@ peer_add(uint32_t id, struct peer_config
 	if (peer == NULL)
 		fatal("peer_add");
 
+	TAILQ_INIT(&peer->rib_pq_head);
 	memcpy(&peer->conf, p_conf, sizeof(struct peer_config));
 	peer->remote_bgpid = 0;
 	peer->loc_rib_id = rib_find(peer->conf.rib);
@@ -264,17 +271,22 @@ peer_generate_update(struct rde_peer *pe
 	    peer->export_type == EXPORT_DEFAULT_ROUTE)
 		return;
 
-	/* if reconf skip peers which don't need to reconfigure */
-	if (mode == EVAL_RECONF && peer->reconf_out == 0)
-		return;
-
 	/* handle peers with add-path */
 	if (peer_has_add_path(peer, aid, CAPA_AP_SEND)) {
-		if (peer->eval.mode == ADDPATH_EVAL_ALL)
+#if NOTYET
+		/* XXX skip addpath all optimisation for now until the queue
+		 * is extended to allow this mode to work again.
+		 * This mode is not very common and so taking the slow path
+		 * here like for all other addpath send modes is not that
+		 * big of an issue right now.
+		 */
+		if (peer->eval.mode == ADDPATH_EVAL_ALL) {
 			up_generate_addpath_all(peer, re, newpath,
 			    old_pathid_tx);
-		else
-			up_generate_addpath(peer, re);
+			return;
+		}
+#endif
+		up_generate_addpath(peer, re);
 		return;
 	}
 
@@ -290,8 +302,66 @@ rde_generate_updates(struct rib_entry *r
 {
 	struct rde_peer	*peer;
 
-	RB_FOREACH(peer, peer_tree, &peertable)
-		peer_generate_update(peer, re, newpath, old_pathid_tx, mode);
+	switch (mode) {
+	case EVAL_RECONF:
+		/* skip peers which don't need to reconfigure */
+		RB_FOREACH(peer, peer_tree, &peertable) {
+			if (peer->reconf_out == 0)
+				continue;
+			peer_generate_update(peer, re, NULL, 0, EVAL_RECONF);
+		}
+		return;
+	case EVAL_DEFAULT:
+		break;
+	case EVAL_ALL:
+		/*
+		 * EVAL_DEFAULT is triggered when a new best path is selected.
+		 * EVAL_ALL is sent for any other update (needed for peers with
+		 * addpath or evaluate all set).
+		 * There can be only one EVAL_DEFAULT queued, it replaces the
+		 * previous one. A flag is enough.
+		 * A path can only exist once in the queue (old or new).
+		 */
+		if (re->pq_mode == EVAL_DEFAULT)
+			/* already a best path update pending, nothing to do */
+			return;
+
+		break;
+	case EVAL_NONE:
+		fatalx("bad eval mode in %s", __func__);
+	}
+
+	if (re->pq_mode != EVAL_NONE) {
+		peer = peer_get(re->pq_peer_id);
+		TAILQ_REMOVE(&peer->rib_pq_head, re, rib_queue);
+	}
+	if (newpath != NULL)
+		peer = prefix_peer(newpath);
+	else
+		peer = peerself;
+	re->pq_mode = mode;
+	re->pq_peer_id = peer->conf.id;
+	TAILQ_INSERT_TAIL(&peer->rib_pq_head, re, rib_queue);
+}
+
+void
+peer_process_updates(struct rde_peer *peer, void *bula)
+{
+	struct rib_entry *re;
+	struct rde_peer *p;
+	enum eval_mode mode;
+
+	re = TAILQ_FIRST(&peer->rib_pq_head);
+	if (re == NULL)
+		return;
+	TAILQ_REMOVE(&peer->rib_pq_head, re, rib_queue);
+
+	mode = re->pq_mode;
+
+	RB_FOREACH(p, peer_tree, &peertable)
+		peer_generate_update(p, re, NULL, 0, mode);
+
+	rib_dequeue(re);
 }
 
 /*
@@ -658,6 +728,8 @@ peer_work_pending(void)
 
 	RB_FOREACH(p, peer_tree, &peertable) {
 		if (ibufq_queuelen(p->ibufq) != 0)
+			return 1;
+		if (!TAILQ_EMPTY(&p->rib_pq_head))
 			return 1;
 	}
 
Index: rde_rib.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde_rib.c,v
diff -u -p -r1.286 rde_rib.c
--- rde_rib.c	27 Dec 2025 19:30:46 -0000	1.286
+++ rde_rib.c	28 Dec 2025 16:29:07 -0000
@@ -78,6 +78,12 @@ re_is_locked(struct rib_entry *re)
 	return (re->lock != 0);
 }
 
+static inline int
+re_is_queued(struct rib_entry *re)
+{
+	return (re->pq_mode != EVAL_NONE);
+}
+
 static inline struct rib_tree *
 rib_tree(struct rib *rib)
 {
@@ -317,6 +323,7 @@ rib_add(struct rib *rib, struct pt_entry
 	TAILQ_INIT(&re->prefix_h);
 	re->prefix = pt_ref(pte);
 	re->rib_id = rib->id;
+	re->pq_mode = EVAL_NONE;
 
 	if (RB_INSERT(rib_tree, rib_tree(rib), re) != NULL) {
 		log_warnx("rib_add: insert failed");
@@ -335,8 +342,8 @@ rib_remove(struct rib_entry *re)
 	if (!rib_empty(re))
 		fatalx("rib_remove: entry not empty");
 
-	if (re_is_locked(re))
-		/* entry is locked, don't free it. */
+	if (re_is_locked(re) || re_is_queued(re))
+		/* entry is locked or queued, don't free it. */
 		return;
 
 	pt_unref(re->prefix);
@@ -352,6 +359,14 @@ static inline int
 rib_empty(struct rib_entry *re)
 {
 	return TAILQ_EMPTY(&re->prefix_h);
+}
+
+void
+rib_dequeue(struct rib_entry *re)
+{
+	re->pq_mode = EVAL_NONE;
+	if (rib_empty(re))
+		rib_remove(re);
 }
 
 static struct rib_entry *