bgpd: split up processing of updates
On Sat, Dec 27, 2025 at 05:04:31PM +0100, Claudio Jeker wrote:
> One of the biggest latency-inducing bottlenecks in bgpd is that UPDATE
> messages are processed to completion. Mainly we do the input processing,
> update the local-RIB and then also handle the updates for the adj-rib-out
> at the same time.
>
> The adj-rib-out step is a loop over all peers and is therefore super heavy.
> By taking this part of the pipeline into its own step we can drop the
> latency inside the main poll loop by a lot.
>
> For this we introduce a per-peer update queue that enqueues the rib
> entries and then handles those one by one. This also has the benefit
> that a single peer cannot monopolize the processing in bgpd, so a
> single flapping peer should result in far less noticeable delays for
> UPDATEs from other peers.
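
FWIW the fairness property here is just the classic round-robin drain:
each pass over the peers takes at most one queued entry per peer. A toy
illustration with made-up names (not bgpd code, just to show the shape):

	#include <sys/queue.h>
	#include <stdio.h>

	/* stand-in for a queued rib_entry */
	struct work {
		TAILQ_ENTRY(work) entry;
		int id;
	};
	TAILQ_HEAD(work_queue, work);

	/* stand-in for an rde_peer with its own pending-update queue */
	struct toy_peer {
		struct work_queue q;
		const char *name;
	};

	/* drain at most one item per peer per pass */
	static int
	process_one(struct toy_peer *p)
	{
		struct work *w;

		if ((w = TAILQ_FIRST(&p->q)) == NULL)
			return 0;
		TAILQ_REMOVE(&p->q, w, entry);
		printf("%s: item %d\n", p->name, w->id);
		return 1;
	}

	int
	main(void)
	{
		static struct work items[4];
		struct toy_peer a = { .name = "flapping peer" };
		struct toy_peer b = { .name = "quiet peer" };
		int i, pending;

		TAILQ_INIT(&a.q);
		TAILQ_INIT(&b.q);
		/* the flapping peer floods three items, the quiet one has one */
		for (i = 0; i < 3; i++) {
			items[i].id = i;
			TAILQ_INSERT_TAIL(&a.q, &items[i], entry);
		}
		items[3].id = 3;
		TAILQ_INSERT_TAIL(&b.q, &items[3], entry);

		/* the quiet peer's item goes out on the very first pass */
		do {
			pending = process_one(&a);
			pending |= process_one(&b);
		} while (pending);
		return 0;
	}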
>
> For now I had to disable the addpath send-all optimisation, since for that
> we need to keep an extra queue of updates. Instead we just use the regular
> addpath codepath, which is less optimised but produces the same results.
>
> I will add the needed code later on.
See a couple of comments below.
> --
> :wq Claudio
>
> Index: rde.c
> ===================================================================
> RCS file: /cvs/src/usr.sbin/bgpd/rde.c,v
> diff -u -p -r1.678 rde.c
> --- rde.c 24 Dec 2025 07:59:55 -0000 1.678
> +++ rde.c 27 Dec 2025 15:47:29 -0000
> @@ -326,6 +326,7 @@ rde_main(int debug, int verbose)
> monotime_to_usec(monotime_sub(io_end, loop_start));
>
> peer_foreach(rde_dispatch_imsg_peer, NULL);
> + peer_foreach(peer_process_updates, NULL);
>
> peer_end = getmonotime();
> rdemem.rde_event_peer_usec +=
> Index: rde.h
> ===================================================================
> RCS file: /cvs/src/usr.sbin/bgpd/rde.h,v
> diff -u -p -r1.336 rde.h
> --- rde.h 24 Dec 2025 07:59:55 -0000 1.336
> +++ rde.h 27 Dec 2025 15:47:29 -0000
> @@ -41,13 +41,19 @@ enum peer_state {
> LIST_HEAD(prefix_list, prefix);
> TAILQ_HEAD(prefix_queue, prefix);
> RB_HEAD(rib_tree, rib_entry);
> +TAILQ_HEAD(rib_queue, rib_entry);
> +LIST_HEAD(rib_pq_head, rib_pq);
>
> struct rib_entry {
> RB_ENTRY(rib_entry) rib_e;
> + TAILQ_ENTRY(rib_entry) rib_queue;
> struct prefix_queue prefix_h;
> struct pt_entry *prefix;
> + struct rib_pq_head rib_pq_list;
> + uint32_t pq_peer_id;
> uint16_t rib_id;
> - uint16_t lock;
> + uint8_t lock;
> + uint8_t pq_mode;
> };
>
> struct rib {
> @@ -93,6 +99,7 @@ struct rde_peer {
> struct pend_prefix_hash pend_prefixes;
> struct filter_head *out_rules;
> struct ibufqueue *ibufq;
> + struct rib_queue rib_pq_head;
> monotime_t staletime[AID_MAX];
> uint32_t adjout_bid;
> uint32_t remote_bgpid;
> @@ -350,9 +357,9 @@ struct filterstate {
> };
>
> enum eval_mode {
> + EVAL_RECONF,
> EVAL_DEFAULT,
> EVAL_ALL,
> - EVAL_RECONF,
I found this change confusing. I think you handle things correctly,
but using 0 (== EVAL_RECONF) as a canary value feels a bit fragile.
I mean the

	if (re->pq_mode) {
		...
	}

check in rde_generate_updates(), the return re->pq_mode != 0 in
re_is_queued(), and rib_dequeue() doing re->pq_mode = 0.
Shouldn't 0 be EVAL_UNQUEUED or something?
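I.e. something along these lines (just a sketch):

	enum eval_mode {
		EVAL_UNQUEUED,	/* 0: rib_entry is not on any per-peer queue */
		EVAL_RECONF,
		EVAL_DEFAULT,
		EVAL_ALL,
	};

Then re_is_queued() becomes re->pq_mode != EVAL_UNQUEUED and rib_dequeue()
sets re->pq_mode = EVAL_UNQUEUED, which makes the intent obvious.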
> };
>
> struct rib_context {
> @@ -414,6 +421,7 @@ struct filter_head *peer_apply_out_filte
>
> void rde_generate_updates(struct rib_entry *, struct prefix *,
> uint32_t, enum eval_mode);
> +void peer_process_updates(struct rde_peer *, void *);
>
> void peer_up(struct rde_peer *, struct session_up *);
> void peer_down(struct rde_peer *);
> @@ -612,6 +620,7 @@ int rib_dump_subtree(uint16_t, struct
> void (*)(void *, uint8_t),
> int (*)(void *));
> void rib_dump_terminate(void *);
> +void rib_dequeue(struct rib_entry *);
>
> extern struct rib flowrib;
>
> Index: rde_peer.c
> ===================================================================
> RCS file: /cvs/src/usr.sbin/bgpd/rde_peer.c,v
> diff -u -p -r1.64 rde_peer.c
> --- rde_peer.c 24 Dec 2025 07:59:55 -0000 1.64
> +++ rde_peer.c 27 Dec 2025 15:53:38 -0000
> @@ -26,6 +26,12 @@
> #include "bgpd.h"
> #include "rde.h"
>
> +struct rib_pq {
> + LIST_ENTRY(rib_pq) entry;
> + struct prefix *new;
> + uint32_t old_pathid_tx;
> +};
> +
> struct peer_tree peertable = RB_INITIALIZER(&peertable);
> struct peer_tree zombietable = RB_INITIALIZER(&zombietable);
> struct rde_peer *peerself;
> @@ -161,6 +167,7 @@ peer_add(uint32_t id, struct peer_config
> if (peer == NULL)
> fatal("peer_add");
>
> + TAILQ_INIT(&peer->rib_pq_head);
> memcpy(&peer->conf, p_conf, sizeof(struct peer_config));
> peer->remote_bgpid = 0;
> peer->loc_rib_id = rib_find(peer->conf.rib);
> @@ -264,16 +271,20 @@ peer_generate_update(struct rde_peer *pe
> peer->export_type == EXPORT_DEFAULT_ROUTE)
> return;
>
> - /* if reconf skip peers which don't need to reconfigure */
> - if (mode == EVAL_RECONF && peer->reconf_out == 0)
> - return;
> -
> /* handle peers with add-path */
> if (peer_has_add_path(peer, aid, CAPA_AP_SEND)) {
> +#if NOTYET
> + /* XXX skip addpath all optimisation for now until the queue
> + * is extended to allow this mode to work again.
> + * This mode is not very common and so taking the slow path
> + * here like for all other addpath send modes is not that
> + * big of an issue right now.
> + */
Comment wants more indent.
> if (peer->eval.mode == ADDPATH_EVAL_ALL)
> up_generate_addpath_all(peer, re, newpath,
> old_pathid_tx);
Consider a return here and dropping the else; then up_generate_addpath()
can be called without the weird extra indent.
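I.e. roughly (untested, just to show the shape):

	if (peer->eval.mode == ADDPATH_EVAL_ALL) {
		up_generate_addpath_all(peer, re, newpath,
		    old_pathid_tx);
		return;
	}
#endif
	up_generate_addpath(peer, re);
	return;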
> else
> +#endif
> up_generate_addpath(peer, re);
> return;
> }
> @@ -290,8 +301,64 @@ rde_generate_updates(struct rib_entry *r
> {
> struct rde_peer *peer;
>
> - RB_FOREACH(peer, peer_tree, &peertable)
> - peer_generate_update(peer, re, newpath, old_pathid_tx, mode);
> + switch (mode) {
> + case EVAL_RECONF:
> + /* skip peers which don't need to reconfigure */
> + RB_FOREACH(peer, peer_tree, &peertable) {
> + if (peer->reconf_out == 0)
> + continue;
> + peer_generate_update(peer, re, NULL, 0, EVAL_RECONF);
> + }
> + return;
> + case EVAL_DEFAULT:
> + break;
> + case EVAL_ALL:
> + /*
> + * EVAL_DEFAULT is triggered when a new best path is selected.
> + * EVAL_ALL is sent for any other update (needed for peers with
> + * addpath or evaluate all set).
> + * There can be only one EVAL_DEFAULT queued, it replaces the
> + * previous one. A flag is enough.
> + * A path can only exist once in the queue (old or new).
> + */
> + if (re->pq_mode == EVAL_DEFAULT)
> + /* already a best path update pending, nothing to do */
> + return;
> +
> + break;
> + }
> +
> + if (re->pq_mode) {
> + peer = peer_get(re->pq_peer_id);
> + TAILQ_REMOVE(&peer->rib_pq_head, re, rib_queue);
> + }
> + if (newpath != NULL)
> + peer = prefix_peer(newpath);
> + else
> + peer = peerself;
> + re->pq_mode = mode;
> + re->pq_peer_id = peer->conf.id;
> + TAILQ_INSERT_TAIL(&peer->rib_pq_head, re, rib_queue);
> +}
> +
> +void
> +peer_process_updates(struct rde_peer *peer, void *bula)
> +{
> + struct rib_entry *re;
> + struct rde_peer *p;
> + enum eval_mode mode;
> +
> + re = TAILQ_FIRST(&peer->rib_pq_head);
> + if (re == NULL)
> + return;
> + TAILQ_REMOVE(&peer->rib_pq_head, re, rib_queue);
> +
> + mode = re->pq_mode;
> +
> + RB_FOREACH(p, peer_tree, &peertable)
> + peer_generate_update(p, re, NULL, 0, mode);
> +
> + rib_dequeue(re);
> }
>
> /*
> @@ -658,6 +725,8 @@ peer_work_pending(void)
>
> RB_FOREACH(p, peer_tree, &peertable) {
> if (ibufq_queuelen(p->ibufq) != 0)
> + return 1;
> + if (!TAILQ_EMPTY(&p->rib_pq_head))
> return 1;
> }
>
> Index: rde_rib.c
> ===================================================================
> RCS file: /cvs/src/usr.sbin/bgpd/rde_rib.c,v
> diff -u -p -r1.285 rde_rib.c
> --- rde_rib.c 24 Dec 2025 07:59:55 -0000 1.285
> +++ rde_rib.c 27 Dec 2025 15:47:29 -0000
> @@ -77,6 +77,12 @@ re_is_locked(struct rib_entry *re)
> return (re->lock != 0);
> }
>
> +static inline int
> +re_is_queued(struct rib_entry *re)
> +{
> + return (re->pq_mode != 0);
> +}
> +
> static inline struct rib_tree *
> rib_tree(struct rib *rib)
> {
> @@ -334,8 +340,8 @@ rib_remove(struct rib_entry *re)
> if (!rib_empty(re))
> fatalx("rib_remove: entry not empty");
>
> - if (re_is_locked(re))
> - /* entry is locked, don't free it. */
> + if (re_is_locked(re) || re_is_queued(re))
> + /* entry is locked or queued, don't free it. */
> return;
>
> pt_unref(re->prefix);
> @@ -351,6 +357,14 @@ static inline int
> rib_empty(struct rib_entry *re)
> {
> return TAILQ_EMPTY(&re->prefix_h);
> +}
> +
> +void
> +rib_dequeue(struct rib_entry *re)
> +{
> + re->pq_mode = 0;
> + if (rib_empty(re))
> + rib_remove(re);
> }
>
> static struct rib_entry *
>