bgpd: split up processing of updates
On Sun, Dec 28, 2025 at 01:06:31PM +0100, Theo Buehler wrote:
> On Sat, Dec 27, 2025 at 05:04:31PM +0100, Claudio Jeker wrote:
> > One of the biggest latency-inducing bottlenecks in bgpd is that UPDATE
> > messages are processed to completion: we do the input processing,
> > update the local-RIB and then also handle the updates for the
> > adj-rib-out, all at the same time.
> >
> > The adj-rib-out update is a loop over all peers and is therefore super
> > heavy. By splitting this part of the pipeline into its own step we can
> > drop the latency inside the main poll loop by a lot.
> >
> > For this we introduce a per-peer update queue that enqueues the rib
> > entries and then handles them one by one. This also has the benefit
> > that a single peer cannot monopolize the processing in bgpd, so a
> > single flapping peer should result in far less noticeable delays for
> > UPDATES from other peers.
> >
> > For now I had to disable the addpath "send all" optimisation since for
> > that we need to keep an extra queue of updates. Instead we just use the
> > regular addpath codepath, which is less optimised but produces the same
> > results.
> >
> > I will add the needed code later on.
>
> See a couple of comments below.
> > enum eval_mode {
> > + EVAL_RECONF,
> > EVAL_DEFAULT,
> > EVAL_ALL,
> > - EVAL_RECONF,
>
> I found this change confusing. I think you handle things correctly,
> but using 0 (== EVAL_RECONF) as a canary value feels a bit fragile.
> I mean the
>
> if (re->pq_mode) {
> ...
> }
>
> in peer_generate_update(), the return re->pq_mode != 0 in re_is_queued()
> and rib_dequeue() doing re->pq_mode = 0.
>
> Should 0 not be EVAL_UNQUEUED or something?
See diff below. I used EVAL_NONE since we have a few other NONE enums in
bgpd. I think I replaced all places that used 0 with EVAL_NONE and also
added it in rib_add() to be explicit.
> > +#if NOTYET
> > + /* XXX skip addpath all optimisation for now until the queue
> > + * is extended to allow this mode to work again.
> > + * This mode is not very common and so taking the slow path
> > + * here like for all other addpath send modes is not that
> > + * big of an issue right now.
> > + */
>
> Comment wants more indent.
>
> > if (peer->eval.mode == ADDPATH_EVAL_ALL)
> > up_generate_addpath_all(peer, re, newpath,
> > old_pathid_tx);
>
> consider a return here and dropping the else; then the
> up_generate_addpath() call can do without the weird extra indent.
>
All done.
New diff below.
--
:wq Claudio
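To illustrate the design from the quoted cover letter before the diff
itself: a minimal standalone sketch of the per-peer queue, with one entry
processed per poll-loop iteration. This is a simplified toy under obvious
assumptions (a plain flag instead of the eval modes, a printf instead of
the adj-rib-out work); the real structures and functions are in the diff
that follows.

/*
 * Standalone sketch of a per-peer update queue (not bgpd code).
 * Entries are enqueued on the queue of the peer they originated from and
 * each poll-loop iteration handles at most one entry per peer.
 */
#include <sys/queue.h>
#include <stdio.h>

struct entry {
	TAILQ_ENTRY(entry)	 rib_queue;
	const char		*prefix;
	int			 queued;	/* 0 == not on any queue */
};

struct peer {
	TAILQ_HEAD(, entry)	 rib_pq_head;
	const char		*name;
};

/* enqueue an update on the queue of the peer it originated from */
static void
enqueue_update(struct peer *origin, struct entry *e)
{
	if (e->queued)
		return;		/* toy version: already pending, nothing to do */
	e->queued = 1;
	TAILQ_INSERT_TAIL(&origin->rib_pq_head, e, rib_queue);
}

/*
 * Called once per poll-loop iteration and per peer: take a single entry
 * off that peer's queue and fan the update out to all peers.  Handling
 * only one entry per iteration is what keeps a single flapping peer from
 * monopolizing the loop.
 */
static void
process_one_update(struct peer *origin, struct peer *peers, size_t npeers)
{
	struct entry	*e;
	size_t		 i;

	if ((e = TAILQ_FIRST(&origin->rib_pq_head)) == NULL)
		return;
	TAILQ_REMOVE(&origin->rib_pq_head, e, rib_queue);
	e->queued = 0;

	for (i = 0; i < npeers; i++)	/* the expensive adj-rib-out part */
		printf("update %s -> peer %s\n", e->prefix, peers[i].name);
}

int
main(void)
{
	struct peer	 peers[2] = { { .name = "A" }, { .name = "B" } };
	struct entry	 e = { .prefix = "192.0.2.0/24" };
	size_t		 i;

	for (i = 0; i < 2; i++)
		TAILQ_INIT(&peers[i].rib_pq_head);

	enqueue_update(&peers[0], &e);
	for (i = 0; i < 2; i++)
		process_one_update(&peers[i], peers, 2);
	return 0;
}

The one-entry-per-iteration behaviour of the toy is the same property
peer_process_updates() has in the diff.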
Index: rde.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde.c,v
diff -u -p -r1.678 rde.c
--- rde.c 24 Dec 2025 07:59:55 -0000 1.678
+++ rde.c 27 Dec 2025 15:47:29 -0000
@@ -326,6 +326,7 @@ rde_main(int debug, int verbose)
monotime_to_usec(monotime_sub(io_end, loop_start));
peer_foreach(rde_dispatch_imsg_peer, NULL);
+ peer_foreach(peer_process_updates, NULL);
peer_end = getmonotime();
rdemem.rde_event_peer_usec +=
Index: rde.h
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde.h,v
diff -u -p -r1.336 rde.h
--- rde.h 24 Dec 2025 07:59:55 -0000 1.336
+++ rde.h 28 Dec 2025 16:25:11 -0000
@@ -41,13 +41,19 @@ enum peer_state {
LIST_HEAD(prefix_list, prefix);
TAILQ_HEAD(prefix_queue, prefix);
RB_HEAD(rib_tree, rib_entry);
+TAILQ_HEAD(rib_queue, rib_entry);
+LIST_HEAD(rib_pq_head, rib_pq);
struct rib_entry {
RB_ENTRY(rib_entry) rib_e;
+ TAILQ_ENTRY(rib_entry) rib_queue;
struct prefix_queue prefix_h;
struct pt_entry *prefix;
+ struct rib_pq_head rib_pq_list;
+ uint32_t pq_peer_id;
uint16_t rib_id;
- uint16_t lock;
+ uint8_t lock;
+ uint8_t pq_mode;
};
struct rib {
@@ -93,6 +99,7 @@ struct rde_peer {
struct pend_prefix_hash pend_prefixes;
struct filter_head *out_rules;
struct ibufqueue *ibufq;
+ struct rib_queue rib_pq_head;
monotime_t staletime[AID_MAX];
uint32_t adjout_bid;
uint32_t remote_bgpid;
@@ -350,9 +357,10 @@ struct filterstate {
};
enum eval_mode {
+ EVAL_NONE,
+ EVAL_RECONF,
EVAL_DEFAULT,
EVAL_ALL,
- EVAL_RECONF,
};
struct rib_context {
@@ -414,6 +422,7 @@ struct filter_head *peer_apply_out_filte
void rde_generate_updates(struct rib_entry *, struct prefix *,
uint32_t, enum eval_mode);
+void peer_process_updates(struct rde_peer *, void *);
void peer_up(struct rde_peer *, struct session_up *);
void peer_down(struct rde_peer *);
@@ -612,6 +621,7 @@ int rib_dump_subtree(uint16_t, struct
void (*)(void *, uint8_t),
int (*)(void *));
void rib_dump_terminate(void *);
+void rib_dequeue(struct rib_entry *);
extern struct rib flowrib;
Index: rde_peer.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde_peer.c,v
diff -u -p -r1.64 rde_peer.c
--- rde_peer.c 24 Dec 2025 07:59:55 -0000 1.64
+++ rde_peer.c 28 Dec 2025 16:29:22 -0000
@@ -26,6 +26,12 @@
#include "bgpd.h"
#include "rde.h"
+struct rib_pq {
+ LIST_ENTRY(rib_pq) entry;
+ struct prefix *new;
+ uint32_t old_pathid_tx;
+};
+
struct peer_tree peertable = RB_INITIALIZER(&peertable);
struct peer_tree zombietable = RB_INITIALIZER(&zombietable);
struct rde_peer *peerself;
@@ -161,6 +167,7 @@ peer_add(uint32_t id, struct peer_config
if (peer == NULL)
fatal("peer_add");
+ TAILQ_INIT(&peer->rib_pq_head);
memcpy(&peer->conf, p_conf, sizeof(struct peer_config));
peer->remote_bgpid = 0;
peer->loc_rib_id = rib_find(peer->conf.rib);
@@ -264,17 +271,22 @@ peer_generate_update(struct rde_peer *pe
peer->export_type == EXPORT_DEFAULT_ROUTE)
return;
- /* if reconf skip peers which don't need to reconfigure */
- if (mode == EVAL_RECONF && peer->reconf_out == 0)
- return;
-
/* handle peers with add-path */
if (peer_has_add_path(peer, aid, CAPA_AP_SEND)) {
- if (peer->eval.mode == ADDPATH_EVAL_ALL)
+#if NOTYET
+ /* XXX skip addpath all optimisation for now until the queue
+ * is extended to allow this mode to work again.
+ * This mode is not very common and so taking the slow path
+ * here like for all other addpath send modes is not that
+ * big of an issue right now.
+ */
+ if (peer->eval.mode == ADDPATH_EVAL_ALL) {
up_generate_addpath_all(peer, re, newpath,
old_pathid_tx);
- else
- up_generate_addpath(peer, re);
+ return;
+ }
+#endif
+ up_generate_addpath(peer, re);
return;
}
@@ -290,8 +302,66 @@ rde_generate_updates(struct rib_entry *r
{
struct rde_peer *peer;
- RB_FOREACH(peer, peer_tree, &peertable)
- peer_generate_update(peer, re, newpath, old_pathid_tx, mode);
+ switch (mode) {
+ case EVAL_RECONF:
+ /* skip peers which don't need to reconfigure */
+ RB_FOREACH(peer, peer_tree, &peertable) {
+ if (peer->reconf_out == 0)
+ continue;
+ peer_generate_update(peer, re, NULL, 0, EVAL_RECONF);
+ }
+ return;
+ case EVAL_DEFAULT:
+ break;
+ case EVAL_ALL:
+ /*
+ * EVAL_DEFAULT is triggered when a new best path is selected.
+ * EVAL_ALL is sent for any other update (needed for peers with
+ * addpath or evaluate all set).
+ * There can be only one EVAL_DEFAULT queued, it replaces the
+ * previous one. A flag is enough.
+ * A path can only exist once in the queue (old or new).
+ */
+ if (re->pq_mode == EVAL_DEFAULT)
+ /* already a best path update pending, nothing to do */
+ return;
+
+ break;
+ case EVAL_NONE:
+ fatalx("bad eval mode in %s", __func__);
+ }
+
+ if (re->pq_mode != EVAL_NONE) {
+ peer = peer_get(re->pq_peer_id);
+ TAILQ_REMOVE(&peer->rib_pq_head, re, rib_queue);
+ }
+ if (newpath != NULL)
+ peer = prefix_peer(newpath);
+ else
+ peer = peerself;
+ re->pq_mode = mode;
+ re->pq_peer_id = peer->conf.id;
+ TAILQ_INSERT_TAIL(&peer->rib_pq_head, re, rib_queue);
+}
+
+void
+peer_process_updates(struct rde_peer *peer, void *bula)
+{
+ struct rib_entry *re;
+ struct rde_peer *p;
+ enum eval_mode mode;
+
+ re = TAILQ_FIRST(&peer->rib_pq_head);
+ if (re == NULL)
+ return;
+ TAILQ_REMOVE(&peer->rib_pq_head, re, rib_queue);
+
+ mode = re->pq_mode;
+
+ RB_FOREACH(p, peer_tree, &peertable)
+ peer_generate_update(p, re, NULL, 0, mode);
+
+ rib_dequeue(re);
}
/*
@@ -658,6 +728,8 @@ peer_work_pending(void)
RB_FOREACH(p, peer_tree, &peertable) {
if (ibufq_queuelen(p->ibufq) != 0)
+ return 1;
+ if (!TAILQ_EMPTY(&p->rib_pq_head))
return 1;
}
Index: rde_rib.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde_rib.c,v
diff -u -p -r1.286 rde_rib.c
--- rde_rib.c 27 Dec 2025 19:30:46 -0000 1.286
+++ rde_rib.c 28 Dec 2025 16:29:07 -0000
@@ -78,6 +78,12 @@ re_is_locked(struct rib_entry *re)
return (re->lock != 0);
}
+static inline int
+re_is_queued(struct rib_entry *re)
+{
+ return (re->pq_mode != EVAL_NONE);
+}
+
static inline struct rib_tree *
rib_tree(struct rib *rib)
{
@@ -317,6 +323,7 @@ rib_add(struct rib *rib, struct pt_entry
TAILQ_INIT(&re->prefix_h);
re->prefix = pt_ref(pte);
re->rib_id = rib->id;
+ re->pq_mode = EVAL_NONE;
if (RB_INSERT(rib_tree, rib_tree(rib), re) != NULL) {
log_warnx("rib_add: insert failed");
@@ -335,8 +342,8 @@ rib_remove(struct rib_entry *re)
if (!rib_empty(re))
fatalx("rib_remove: entry not empty");
- if (re_is_locked(re))
- /* entry is locked, don't free it. */
+ if (re_is_locked(re) || re_is_queued(re))
+ /* entry is locked or queued, don't free it. */
return;
pt_unref(re->prefix);
@@ -352,6 +359,14 @@ static inline int
rib_empty(struct rib_entry *re)
{
return TAILQ_EMPTY(&re->prefix_h);
+}
+
+void
+rib_dequeue(struct rib_entry *re)
+{
+ re->pq_mode = EVAL_NONE;
+ if (rib_empty(re))
+ rib_remove(re);
}
static struct rib_entry *