From: Theo Buehler
Subject: Re: bgpd: split up processing of updates
To: tech@openbsd.org
Date: Sun, 28 Dec 2025 13:06:31 +0100

On Sat, Dec 27, 2025 at 05:04:31PM +0100, Claudio Jeker wrote:
> One of the biggest latency-inducing bottlenecks in bgpd is that UPDATE
> messages are processed to completion. Mainly we do the input processing,
> update the local-RIB and then also handle the updates for the adj-rib-out
> at the same time.
>
> The adj-rib-out is a loop over all peers and is therefore super heavy.
> By taking this part of the pipeline into its own step we can drop the
> latency inside the main poll loop by a lot.
>
> For this we introduce a per-peer update queue that enqueues the rib
> entries and then handles those one by one. This also has the benefit that
> a single peer cannot monopolize the processing in bgpd. So a single
> flapping peer should result in far less noticeable delays for UPDATEs
> from other peers.
>
> For now I had to disable the addpath send all optimisation since for that
> we need to keep an extra queue of updates. Instead we just use the regular
> addpath codepath that is less optimised but produces the same results.
>
> I will add the needed code later on.

See a couple of comments below.

> --
> :wq Claudio
>
> Index: rde.c
> ===================================================================
> RCS file: /cvs/src/usr.sbin/bgpd/rde.c,v
> diff -u -p -r1.678 rde.c
> --- rde.c	24 Dec 2025 07:59:55 -0000	1.678
> +++ rde.c	27 Dec 2025 15:47:29 -0000
> @@ -326,6 +326,7 @@ rde_main(int debug, int verbose)
>  			    monotime_to_usec(monotime_sub(io_end, loop_start));
>
>  		peer_foreach(rde_dispatch_imsg_peer, NULL);
> +		peer_foreach(peer_process_updates, NULL);
>
>  		peer_end = getmonotime();
>  		rdemem.rde_event_peer_usec +=
> Index: rde.h
> ===================================================================
> RCS file: /cvs/src/usr.sbin/bgpd/rde.h,v
> diff -u -p -r1.336 rde.h
> --- rde.h	24 Dec 2025 07:59:55 -0000	1.336
> +++ rde.h	27 Dec 2025 15:47:29 -0000
> @@ -41,13 +41,19 @@ enum peer_state {
>  LIST_HEAD(prefix_list, prefix);
>  TAILQ_HEAD(prefix_queue, prefix);
>  RB_HEAD(rib_tree, rib_entry);
> +TAILQ_HEAD(rib_queue, rib_entry);
> +LIST_HEAD(rib_pq_head, rib_pq);
>
>  struct rib_entry {
>  	RB_ENTRY(rib_entry)	 rib_e;
> +	TAILQ_ENTRY(rib_entry)	 rib_queue;
>  	struct prefix_queue	 prefix_h;
>  	struct pt_entry		*prefix;
> +	struct rib_pq_head	 rib_pq_list;
> +	uint32_t		 pq_peer_id;
>  	uint16_t		 rib_id;
> -	uint16_t		 lock;
> +	uint8_t			 lock;
> +	uint8_t			 pq_mode;
>  };
>
>  struct rib {
> @@ -93,6 +99,7 @@ struct rde_peer {
>  	struct pend_prefix_hash		 pend_prefixes;
>  	struct filter_head		*out_rules;
>  	struct ibufqueue		*ibufq;
> +	struct rib_queue		 rib_pq_head;
>  	monotime_t			 staletime[AID_MAX];
>  	uint32_t			 adjout_bid;
>  	uint32_t			 remote_bgpid;
> @@ -350,9 +357,9 @@ struct filterstate {
>  };
>
>  enum eval_mode {
> +	EVAL_RECONF,
>  	EVAL_DEFAULT,
>  	EVAL_ALL,
> -	EVAL_RECONF,

I found this change confusing. I think you handle things correctly, but
using 0 (== EVAL_RECONF) as a sentinel value feels a bit fragile. I mean
the if (re->pq_mode) { ... } in rde_generate_updates(), the
return re->pq_mode != 0 in re_is_queued() and rib_dequeue() doing
re->pq_mode = 0. Should 0 not be EVAL_UNQUEUED or something?
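Something like this would read better to me. EVAL_UNQUEUED is just a
strawman name, pick whatever you prefer:

	enum eval_mode {
		EVAL_UNQUEUED = 0,	/* rib_entry is on no per-peer queue */
		EVAL_RECONF,
		EVAL_DEFAULT,
		EVAL_ALL,
	};

Then re_is_queued() can return (re->pq_mode != EVAL_UNQUEUED) and
rib_dequeue() can set re->pq_mode = EVAL_UNQUEUED, which spells out the
special meaning of 0 instead of relying on EVAL_RECONF happening to be
first.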
>  };
>
>  struct rib_context {
> @@ -414,6 +421,7 @@ struct filter_head	*peer_apply_out_filte
>
>  void		 rde_generate_updates(struct rib_entry *, struct prefix *,
>  		    uint32_t, enum eval_mode);
> +void		 peer_process_updates(struct rde_peer *, void *);
>
>  void		 peer_up(struct rde_peer *, struct session_up *);
>  void		 peer_down(struct rde_peer *);
> @@ -612,6 +620,7 @@ int	 rib_dump_subtree(uint16_t, struct
>  		    void (*)(void *, uint8_t),
>  		    int (*)(void *));
>  void		 rib_dump_terminate(void *);
> +void		 rib_dequeue(struct rib_entry *);
>
>  extern struct rib flowrib;
>
> Index: rde_peer.c
> ===================================================================
> RCS file: /cvs/src/usr.sbin/bgpd/rde_peer.c,v
> diff -u -p -r1.64 rde_peer.c
> --- rde_peer.c	24 Dec 2025 07:59:55 -0000	1.64
> +++ rde_peer.c	27 Dec 2025 15:53:38 -0000
> @@ -26,6 +26,12 @@
>  #include "bgpd.h"
>  #include "rde.h"
>
> +struct rib_pq {
> +	LIST_ENTRY(rib_pq)	 entry;
> +	struct prefix		*new;
> +	uint32_t		 old_pathid_tx;
> +};
> +
>  struct peer_tree	 peertable = RB_INITIALIZER(&peertable);
>  struct peer_tree	 zombietable = RB_INITIALIZER(&zombietable);
>  struct rde_peer		*peerself;
> @@ -161,6 +167,7 @@ peer_add(uint32_t id, struct peer_config
>  	if (peer == NULL)
>  		fatal("peer_add");
>
> +	TAILQ_INIT(&peer->rib_pq_head);
>  	memcpy(&peer->conf, p_conf, sizeof(struct peer_config));
>  	peer->remote_bgpid = 0;
>  	peer->loc_rib_id = rib_find(peer->conf.rib);
> @@ -264,16 +271,20 @@ peer_generate_update(struct rde_peer *pe
>  	    peer->export_type == EXPORT_DEFAULT_ROUTE)
>  		return;
>
> -	/* if reconf skip peers which don't need to reconfigure */
> -	if (mode == EVAL_RECONF && peer->reconf_out == 0)
> -		return;
> -
>  	/* handle peers with add-path */
>  	if (peer_has_add_path(peer, aid, CAPA_AP_SEND)) {
> +#if NOTYET
> +	/* XXX skip addpath all optimisation for now until the queue
> +	 * is extended to allow this mode to work again.
> +	 * This mode is not very common and so taking the slow path
> +	 * here like for all other addpath send modes is not that
> +	 * big of an issue right now.
> +	 */

Comment wants more indent.

>  		if (peer->eval.mode == ADDPATH_EVAL_ALL)
>  			up_generate_addpath_all(peer, re, newpath,
>  			    old_pathid_tx);

Consider a return here and dropping the else, then the
up_generate_addpath() call doesn't need the weird extra indent. See the
sketch at the end of this mail.

>  		else
> +#endif
>  			up_generate_addpath(peer, re);
>  		return;
>  	}
> @@ -290,8 +301,64 @@ rde_generate_updates(struct rib_entry *r
>  {
>  	struct rde_peer *peer;
>
> -	RB_FOREACH(peer, peer_tree, &peertable)
> -		peer_generate_update(peer, re, newpath, old_pathid_tx, mode);
> +	switch (mode) {
> +	case EVAL_RECONF:
> +		/* skip peers which don't need to reconfigure */
> +		RB_FOREACH(peer, peer_tree, &peertable) {
> +			if (peer->reconf_out == 0)
> +				continue;
> +			peer_generate_update(peer, re, NULL, 0, EVAL_RECONF);
> +		}
> +		return;
> +	case EVAL_DEFAULT:
> +		break;
> +	case EVAL_ALL:
> +		/*
> +		 * EVAL_DEFAULT is triggered when a new best path is selected.
> +		 * EVAL_ALL is sent for any other update (needed for peers with
> +		 * addpath or evaluate all set).
> +		 * There can be only one EVAL_DEFAULT queued, it replaces the
> +		 * previous one. A flag is enough.
> +		 * A path can only exist once in the queue (old or new).
> +		 */
> +		if (re->pq_mode == EVAL_DEFAULT)
> +			/* already a best path update pending, nothing to do */
> +			return;
> +
> +		break;
> +	}
> +
> +	if (re->pq_mode) {
> +		peer = peer_get(re->pq_peer_id);
> +		TAILQ_REMOVE(&peer->rib_pq_head, re, rib_queue);
> +	}
> +	if (newpath != NULL)
> +		peer = prefix_peer(newpath);
> +	else
> +		peer = peerself;
> +	re->pq_mode = mode;
> +	re->pq_peer_id = peer->conf.id;
> +	TAILQ_INSERT_TAIL(&peer->rib_pq_head, re, rib_queue);
> +}
> +
> +void
> +peer_process_updates(struct rde_peer *peer, void *bula)
> +{
> +	struct rib_entry *re;
> +	struct rde_peer *p;
> +	enum eval_mode mode;
> +
> +	re = TAILQ_FIRST(&peer->rib_pq_head);
> +	if (re == NULL)
> +		return;
> +	TAILQ_REMOVE(&peer->rib_pq_head, re, rib_queue);
> +
> +	mode = re->pq_mode;
> +
> +	RB_FOREACH(p, peer_tree, &peertable)
> +		peer_generate_update(p, re, NULL, 0, mode);
> +
> +	rib_dequeue(re);
>  }
>
>  /*
> @@ -658,6 +725,8 @@ peer_work_pending(void)
>
>  	RB_FOREACH(p, peer_tree, &peertable) {
>  		if (ibufq_queuelen(p->ibufq) != 0)
> +			return 1;
> +		if (!TAILQ_EMPTY(&p->rib_pq_head))
>  			return 1;
>  	}
>
> Index: rde_rib.c
> ===================================================================
> RCS file: /cvs/src/usr.sbin/bgpd/rde_rib.c,v
> diff -u -p -r1.285 rde_rib.c
> --- rde_rib.c	24 Dec 2025 07:59:55 -0000	1.285
> +++ rde_rib.c	27 Dec 2025 15:47:29 -0000
> @@ -77,6 +77,12 @@ re_is_locked(struct rib_entry *re)
>  	return (re->lock != 0);
>  }
>
> +static inline int
> +re_is_queued(struct rib_entry *re)
> +{
> +	return (re->pq_mode != 0);
> +}
> +
>  static inline struct rib_tree *
>  rib_tree(struct rib *rib)
>  {
> @@ -334,8 +340,8 @@ rib_remove(struct rib_entry *re)
>  	if (!rib_empty(re))
>  		fatalx("rib_remove: entry not empty");
>
> -	if (re_is_locked(re))
> -		/* entry is locked, don't free it. */
> +	if (re_is_locked(re) || re_is_queued(re))
> +		/* entry is locked or queued, don't free it. */
>  		return;
>
>  	pt_unref(re->prefix);
> @@ -351,6 +357,14 @@ static inline int
>  rib_empty(struct rib_entry *re)
>  {
>  	return TAILQ_EMPTY(&re->prefix_h);
> +}
> +
> +void
> +rib_dequeue(struct rib_entry *re)
> +{
> +	re->pq_mode = 0;
> +	if (rib_empty(re))
> +		rib_remove(re);
>  }
>
>  static struct rib_entry *
>
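For the addpath hunk in peer_generate_update() this is roughly what I had
in mind (untested sketch, XXX comment elided):

	/* handle peers with add-path */
	if (peer_has_add_path(peer, aid, CAPA_AP_SEND)) {
#if NOTYET
		/* XXX skip addpath all optimisation for now ... */
		if (peer->eval.mode == ADDPATH_EVAL_ALL) {
			up_generate_addpath_all(peer, re, newpath,
			    old_pathid_tx);
			return;
		}
#endif
		up_generate_addpath(peer, re);
		return;
	}

With the return inside the ADDPATH_EVAL_ALL branch the else goes away and
up_generate_addpath() ends up at its normal indent outside the #if block.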