Index | Thread | Search

From:
Theo Buehler <tb@theobuehler.org>
Subject:
Re: bgpd: cache Adj-RIB-Out when a peer goes down
To:
tech@openbsd.org
Date:
Thu, 12 Dec 2024 14:34:57 +0100

Download raw body.

Thread
On Thu, Dec 12, 2024 at 01:46:33PM +0100, Claudio Jeker wrote:
> When a session goes down there is no real reason to remove the Adj-RIB-Out
> instead it is actually better to keep it synced so that when the peer
> reconnects there is no need to repopulate the full thing.
> 
> This diff does exactly that. It caches the Adj-RIB-Out for up to 1h.
> Now when the session is reestablished the peer settings are checked to see
> if any params that matter changed. If so a full sync is needed else the
> RDE can just start blasting out the RIB.
> 
> For this introduce a IMSG_SESSION_DELETE that tells the RDE to remove the
> peer and split peer_down into a part that takes the session down (and
> clears the Adj-RIB-In) and a part the frees the peer (peer_delete).
> The SE now sends an IMSG_SESSION_ADD command on first connect and skips
> that imsg on later connects unless IMSG_SESSION_DELETE was called before.
> During config reload the IMSG_SESSION_ADD calls only need to happen when
> the RDE actually has that information. For all this to work I added a
> rdesession member to struct peer to keep track of the RDE session state.
> 
> Most of the heavy lifting was already committed so this diff is reasonably
> simple :)
> -- 
> :wq Claudio

Thanks for going the extra mile of making this pleasant and reasonably
easy to review. Much appreciated.

There's one really stupid typo/bug below, otherwise ok.

> Index: bgpd.h
> ===================================================================
> RCS file: /cvs/src/usr.sbin/bgpd/bgpd.h,v
> diff -u -p -r1.504 bgpd.h
> --- bgpd.h	11 Dec 2024 09:19:44 -0000	1.504
> +++ bgpd.h	11 Dec 2024 09:20:17 -0000
> @@ -683,6 +683,7 @@ enum imsg_type {
>  	IMSG_SESSION_ADD,
>  	IMSG_SESSION_UP,
>  	IMSG_SESSION_DOWN,
> +	IMSG_SESSION_DELETE,
>  	IMSG_SESSION_STALE,
>  	IMSG_SESSION_NOGRACE,
>  	IMSG_SESSION_FLUSH,
> Index: rde.c
> ===================================================================
> RCS file: /cvs/src/usr.sbin/bgpd/rde.c,v
> diff -u -p -r1.645 rde.c
> --- rde.c	11 Dec 2024 09:19:44 -0000	1.645
> +++ rde.c	11 Dec 2024 10:34:35 -0000
> @@ -433,6 +433,12 @@ rde_dispatch_imsg_session(struct imsgbuf
>  			}
>  			peer_down(peer);
>  			break;
> +		case IMSG_SESSION_DELETE:
> +			/* silently ignore deletes for unknown peers */
> +			if ((peer = peer_get(peerid)) == NULL)
> +				break;
> +			peer_delete(peer);
> +			break;
>  		case IMSG_SESSION_STALE:
>  		case IMSG_SESSION_NOGRACE:
>  		case IMSG_SESSION_FLUSH:
> Index: rde.h
> ===================================================================
> RCS file: /cvs/src/usr.sbin/bgpd/rde.h,v
> diff -u -p -r1.308 rde.h
> --- rde.h	11 Dec 2024 09:19:44 -0000	1.308
> +++ rde.h	11 Dec 2024 09:20:52 -0000
> @@ -289,7 +289,7 @@ struct prefix {
>  #define	PREFIX_FLAG_WITHDRAW	0x0001	/* enqueued on withdraw queue */
>  #define	PREFIX_FLAG_UPDATE	0x0002	/* enqueued on update queue */
>  #define	PREFIX_FLAG_DEAD	0x0004	/* locked but removed */
> -#define	PREFIX_FLAG_STALE	0x0008	/* stale entry (graceful reload) */
> +#define	PREFIX_FLAG_STALE	0x0008	/* stale entry (for addpath) */
>  #define	PREFIX_FLAG_MASK	0x000f	/* mask for the prefix types */
>  #define	PREFIX_FLAG_ADJOUT	0x0010	/* prefix is in the adj-out rib */
>  #define	PREFIX_FLAG_EOR		0x0020	/* prefix is EoR */
> @@ -369,6 +369,7 @@ void		 rde_generate_updates(struct rib_e
>  
>  void		 peer_up(struct rde_peer *, struct session_up *);
>  void		 peer_down(struct rde_peer *);
> +void		 peer_delete(struct rde_peer *);
>  void		 peer_flush(struct rde_peer *, uint8_t, time_t);
>  void		 peer_stale(struct rde_peer *, uint8_t, int);
>  void		 peer_blast(struct rde_peer *, uint8_t);
> @@ -603,6 +604,7 @@ void		 prefix_adjout_update(struct prefi
>  		    struct filterstate *, struct pt_entry *, uint32_t);
>  void		 prefix_adjout_withdraw(struct prefix *);
>  void		 prefix_adjout_destroy(struct prefix *);
> +void		 prefix_adjout_flush_pending(struct rde_peer *);
>  int		 prefix_adjout_reaper(struct rde_peer *);
>  int		 prefix_dump_new(struct rde_peer *, uint8_t, unsigned int,
>  		    void *, void (*)(struct prefix *, void *),
> Index: rde_peer.c
> ===================================================================
> RCS file: /cvs/src/usr.sbin/bgpd/rde_peer.c,v
> diff -u -p -r1.41 rde_peer.c
> --- rde_peer.c	11 Dec 2024 09:19:44 -0000	1.41
> +++ rde_peer.c	11 Dec 2024 09:22:44 -0000
> @@ -82,7 +82,7 @@ peer_shutdown(void)
>  	struct rde_peer *peer, *np;
>  
>  	RB_FOREACH_SAFE(peer, peer_tree, &peertable, np)
> -		peer_down(peer);
> +		peer_delete(peer);
>  
>  	while (!RB_EMPTY(&zombietable))
>  		peer_reaper(NULL);
> @@ -241,7 +241,8 @@ peer_generate_update(struct rde_peer *pe
>  	/* skip ourself */
>  	if (peer == peerself)
>  		return;
> -	if (!peer_is_up(peer))
> +	/* skip peers that never had a session open */
> +	if (peer->state == PEER_NONE)
>  		return;
>  	/* skip peers using a different rib */
>  	if (peer->loc_rib_id != re->rib_id)
> @@ -286,28 +287,6 @@ rde_generate_updates(struct rib_entry *r
>  /*
>   * Various RIB walker callbacks.
>   */
> -static void
> -peer_adjout_clear_upcall(struct prefix *p, void *arg)
> -{
> -	prefix_adjout_destroy(p);
> -}
> -
> -static void
> -peer_adjout_stale_upcall(struct prefix *p, void *arg)
> -{
> -	if (p->flags & PREFIX_FLAG_DEAD) {
> -		return;
> -	} else if (p->flags & PREFIX_FLAG_WITHDRAW) {
> -		/* no need to keep stale withdraws, they miss all attributes */
> -		prefix_adjout_destroy(p);
> -		return;
> -	} else if (p->flags & PREFIX_FLAG_UPDATE) {
> -		RB_REMOVE(prefix_tree, &prefix_peer(p)->updates[p->pt->aid], p);
> -		p->flags &= ~PREFIX_FLAG_UPDATE;
> -	}
> -	p->flags |= PREFIX_FLAG_STALE;
> -}
> -
>  struct peer_flush {
>  	struct rde_peer *peer;
>  	time_t		 staletime;
> @@ -361,6 +340,7 @@ void
>  peer_up(struct rde_peer *peer, struct session_up *sup)
>  {
>  	uint8_t	 i;
> +	int force_sync = 1;
>  
>  	if (peer->state == PEER_ERR) {
>  		/*
> @@ -369,21 +349,33 @@ peer_up(struct rde_peer *peer, struct se
>  		 */
>  		rib_dump_terminate(peer);
>  		peer_imsg_flush(peer);
> -		if (prefix_dump_new(peer, AID_UNSPEC, 0, NULL,
> -		    peer_adjout_clear_upcall, NULL, NULL) == -1)
> -			fatal("%s: prefix_dump_new", __func__);
>  		peer_flush(peer, AID_UNSPEC, 0);
>  		peer->stats.prefix_cnt = 0;
> -		peer->stats.prefix_out_cnt = 0;
>  		peer->state = PEER_DOWN;
>  	}
> -	peer->remote_bgpid = sup->remote_bgpid;
> -	peer->short_as = sup->short_as;
> +
> +	/*
> +	 * Check if no value changed during flap to decide if the RIB
> +	 * is in sync. The capa check is maybe too strict but it should
> +	 * not matter for normal operation.
> +	 */
> +	if (memcmp(&peer->remote_addr, &sup->remote_addr,
> +	    sizeof(sup->remote_addr)) == 0 &&
> +	    memcmp(&peer->local_v4_addr, &sup->local_v4_addr,
> +	    sizeof(sup->local_v4_addr)) == 0 &&
> +	    memcmp(&peer->local_v6_addr, &sup->local_v6_addr,
> +	    sizeof(sup->local_v6_addr)) == 0 &&
> +	    memcpy(&peer->capa, &sup->capa, sizeof(peer->capa)) == 0)

memcmp, not memcpy. Also use sizeof(sup->capa) to match the others.

> +		force_sync = 0;
> +
>  	peer->remote_addr = sup->remote_addr;
>  	peer->local_v4_addr = sup->local_v4_addr;
>  	peer->local_v6_addr = sup->local_v6_addr;
> -	peer->local_if_scope = sup->if_scope;
>  	memcpy(&peer->capa, &sup->capa, sizeof(peer->capa));
> +	/* the Adj-RIB-Out does not depend on those */
> +	peer->remote_bgpid = sup->remote_bgpid;
> +	peer->local_if_scope = sup->if_scope;
> +	peer->short_as = sup->short_as;
>  
>  	/* clear eor markers depending on GR flags */
>  	if (peer->capa.grestart.restart) {
> @@ -396,9 +388,16 @@ peer_up(struct rde_peer *peer, struct se
>  	}
>  	peer->state = PEER_UP;
>  
> -	for (i = AID_MIN; i < AID_MAX; i++) {
> -		if (peer->capa.mp[i])
> -			peer_dump(peer, i);
> +	if (!force_sync) {
> +		for (i = AID_MIN; i < AID_MAX; i++) {
> +			if (peer->capa.mp[i])
> +				peer_blast(peer, i);
> +		}
> +	} else {
> +		for (i = AID_MIN; i < AID_MAX; i++) {
> +			if (peer->capa.mp[i])
> +				peer_dump(peer, i);
> +		}
>  	}
>  }
>  
> @@ -416,16 +415,24 @@ peer_down(struct rde_peer *peer)
>  	 * and flush all pending imsg from the SE.
>  	 */
>  	rib_dump_terminate(peer);
> +	prefix_adjout_flush_pending(peer);
>  	peer_imsg_flush(peer);
>  
>  	/* flush Adj-RIB-In */
>  	peer_flush(peer, AID_UNSPEC, 0);
>  	peer->stats.prefix_cnt = 0;
> +}
> +
> +void
> +peer_delete(struct rde_peer *peer)
> +{
> +	if (peer->state != PEER_DOWN)
> +		peer_down(peer);
>  
>  	/* free filters */
>  	filterlist_free(peer->out_rules);
> -	RB_REMOVE(peer_tree, &peertable, peer);
>  
> +	RB_REMOVE(peer_tree, &peertable, peer);
>  	while (RB_INSERT(peer_tree, &zombietable, peer) != NULL) {
>  		log_warnx("zombie peer conflict");
>  		peer->conf.id = arc4random();
> @@ -481,17 +488,12 @@ peer_stale(struct rde_peer *peer, uint8_
>  	 * and flush all pending imsg from the SE.
>  	 */
>  	rib_dump_terminate(peer);
> +	prefix_adjout_flush_pending(peer);
>  	peer_imsg_flush(peer);
>  
>  	if (flushall)
>  		peer_flush(peer, aid, 0);
>  
> -	/* XXX this is not quite correct */
> -	/* mark Adj-RIB-Out stale for this peer */
> -	if (prefix_dump_new(peer, aid, 0, NULL,
> -	    peer_adjout_stale_upcall, NULL, NULL) == -1)
> -		fatal("%s: prefix_dump_new", __func__);
> -
>  	/* make sure new prefixes start on a higher timestamp */
>  	while (now >= getmonotime())
>  		sleep(1);
> @@ -504,10 +506,7 @@ peer_stale(struct rde_peer *peer, uint8_
>  static void
>  peer_blast_upcall(struct prefix *p, void *ptr)
>  {
> -	if (p->flags & PREFIX_FLAG_STALE) {
> -		/* remove stale entries */
> -		prefix_adjout_destroy(p);
> -	} else if (p->flags & PREFIX_FLAG_DEAD) {
> +	if (p->flags & PREFIX_FLAG_DEAD) {
>  		/* ignore dead prefixes, they will go away soon */
>  	} else if ((p->flags & PREFIX_FLAG_MASK) == 0) {
>  		/* put entries on the update queue if not already on a queue */
> Index: rde_rib.c
> ===================================================================
> RCS file: /cvs/src/usr.sbin/bgpd/rde_rib.c,v
> diff -u -p -r1.265 rde_rib.c
> --- rde_rib.c	11 Dec 2024 09:19:44 -0000	1.265
> +++ rde_rib.c	11 Dec 2024 09:23:04 -0000
> @@ -1273,7 +1273,6 @@ prefix_adjout_update(struct prefix *p, s
>  			/* nothing changed */
>  			p->validation_state = state->vstate;
>  			p->lastchange = getmonotime();
> -			p->flags &= ~PREFIX_FLAG_STALE;
>  			return;
>  		}
>  
> @@ -1344,7 +1343,6 @@ prefix_adjout_withdraw(struct prefix *p)
>  	/* already a withdraw, shortcut */
>  	if (p->flags & PREFIX_FLAG_WITHDRAW) {
>  		p->lastchange = getmonotime();
> -		p->flags &= ~PREFIX_FLAG_STALE;
>  		return;
>  	}
>  	/* pending update just got withdrawn */
> @@ -1414,6 +1412,24 @@ prefix_adjout_destroy(struct prefix *p)
>  		/* remove the last prefix reference before free */
>  		pt_unref(p->pt);
>  		prefix_free(p);
> +	}
> +}
> +
> +void
> +prefix_adjout_flush_pending(struct rde_peer *peer)
> +{
> +	struct prefix *p, *np;
> +	uint8_t aid;
> +
> +	for (aid = AID_MIN; aid < AID_MAX; aid++) {
> +		RB_FOREACH_SAFE(p, prefix_tree, &peer->withdraws[aid], np) {
> +			prefix_adjout_destroy(p);
> +		}
> +		RB_FOREACH_SAFE(p, prefix_tree, &peer->updates[aid], np) {
> +			p->flags &= ~PREFIX_FLAG_UPDATE;
> +			RB_REMOVE(prefix_tree, &peer->updates[aid], p);
> +			peer->stats.pending_update--;
> +		}
>  	}
>  }
>  
> Index: session.c
> ===================================================================
> RCS file: /cvs/src/usr.sbin/bgpd/session.c,v
> diff -u -p -r1.502 session.c
> --- session.c	10 Dec 2024 14:34:51 -0000	1.502
> +++ session.c	12 Dec 2024 10:49:44 -0000
> @@ -270,6 +270,9 @@ session_main(int debug, int verbose)
>  					    NULL);
>  					timer_remove_all(&p->timers);
>  					tcp_md5_del_listener(conf, p);
> +					if (imsg_rde(IMSG_SESSION_DELETE,
> +					    p->conf.id, NULL, 0) == -1)
> +						fatalx("imsg_compose error");
>  					msgbuf_free(p->wbuf);
>  					RB_REMOVE(peer_head, &conf->peers, p);
>  					log_peer_warnx(&p->conf, "removed");
> @@ -405,6 +408,12 @@ session_main(int debug, int verbose)
>  				case Timer_SessionDown:
>  					timer_stop(&p->timers,
>  					    Timer_SessionDown);
> +
> +					if (imsg_rde(IMSG_SESSION_DELETE,
> +					    p->conf.id, NULL, 0) == -1)
> +						fatalx("imsg_compose error");
> +					p->rdesession = 0;
> +
>  					/* finally delete this cloned peer */
>  					if (p->template)
>  						p->reconf_action =
> @@ -3462,9 +3471,13 @@ session_up(struct peer *p)
>  
>  	timer_stop(&p->timers, Timer_SessionDown);
>  
> -	if (imsg_rde(IMSG_SESSION_ADD, p->conf.id,
> -	    &p->conf, sizeof(p->conf)) == -1)
> -		fatalx("imsg_compose error");
> +	if (!p->rdesession) {
> +		/* inform rde about new peer */
> +		if (imsg_rde(IMSG_SESSION_ADD, p->conf.id,
> +		    &p->conf, sizeof(p->conf)) == -1)
> +			fatalx("imsg_compose error");
> +		p->rdesession = 1;
> +	}
>  
>  	if (p->local.aid == AID_INET) {
>  		sup.local_v4_addr = p->local;
> @@ -3632,10 +3645,15 @@ merge_peers(struct bgpd_config *c, struc
>  			imsg_compose(ibuf_main, IMSG_PFKEY_RELOAD,
>  			    p->conf.id, 0, -1, NULL, 0);
>  
> -		/* sync the RDE in case we keep the peer */
> -		if (imsg_rde(IMSG_SESSION_ADD, p->conf.id,
> -		    &p->conf, sizeof(struct peer_config)) == -1)
> -			fatalx("imsg_compose error");
> +		/*
> +		 * If the session is established or the SessionDown timer is
> +		 * running sync with the RDE
> +		 */
> +		if (p->rdesession) {
> +			if (imsg_rde(IMSG_SESSION_ADD, p->conf.id,
> +			    &p->conf, sizeof(struct peer_config)) == -1)
> +				fatalx("imsg_compose error");
> +		}
>  
>  		/* apply the config to all clones of a template */
>  		if (p->conf.template) {
> @@ -3645,9 +3663,13 @@ merge_peers(struct bgpd_config *c, struc
>  					continue;
>  				session_template_clone(xp, NULL, xp->conf.id,
>  				    xp->conf.remote_as);
> -				if (imsg_rde(IMSG_SESSION_ADD, xp->conf.id,
> -				    &xp->conf, sizeof(xp->conf)) == -1)
> -					fatalx("imsg_compose error");
> +
> +				if (p->rdesession) {
> +					if (imsg_rde(IMSG_SESSION_ADD,
> +					    xp->conf.id, &xp->conf,
> +					    sizeof(xp->conf)) == -1)
> +						fatalx("imsg_compose error");
> +				}
>  			}
>  		}
>  	}
> Index: session.h
> ===================================================================
> RCS file: /cvs/src/usr.sbin/bgpd/session.h,v
> diff -u -p -r1.181 session.h
> --- session.h	10 Dec 2024 14:34:51 -0000	1.181
> +++ session.h	12 Dec 2024 10:47:14 -0000
> @@ -228,6 +228,7 @@ struct peer {
>  	uint8_t			 passive;
>  	uint8_t			 throttled;
>  	uint8_t			 rpending;
> +	uint8_t			 rdesession;
>  };
>  
>  extern time_t		 pauseaccept;
>