Index | Thread | Search

From:
Claudio Jeker <cjeker@diehard.n-r-g.com>
Subject:
bgpd: split session.c in two
To:
tech@openbsd.org
Date:
Tue, 25 Feb 2025 15:31:29 +0100

Download raw body.

Thread
session.c is too big, and I would like to reuse code for another project.
So I decided to split the BGP protocol bits out into session_bgp.c, while
session.c keeps the rest of the code (especially the imsg bits).

Sorry, the diff is huge and hard to review.
-- 
:wq Claudio

Index: Makefile
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/Makefile,v
diff -u -p -r1.40 Makefile
--- Makefile	20 Feb 2025 19:47:31 -0000	1.40
+++ Makefile	24 Feb 2025 15:56:57 -0000
@@ -1,17 +1,47 @@
 #	$OpenBSD: Makefile,v 1.40 2025/02/20 19:47:31 claudio Exp $
 
 PROG=	bgpd
-SRCS=	bgpd.c session.c log.c logmsg.c parse.y config.c monotime.c \
-	rde.c rde_rib.c rde_decide.c rde_prefix.c mrt.c kroute.c control.c \
-	pfkey.c rde_update.c rde_attr.c rde_community.c printconf.c \
-	rde_filter.c rde_sets.c rde_aspa.c rde_trie.c pftable.c name2id.c \
-	util.c carp.c timer.c rde_peer.c rtr.c rtr_proto.c flowspec.c
+SRCS=	bgpd.c
+SRCS+=	carp.c
+SRCS+=	config.c
+SRCS+=	control.c
+SRCS+=	flowspec.c
+SRCS+=	kroute.c
+SRCS+=	log.c
+SRCS+=	logmsg.c
+SRCS+=	monotime.c
+SRCS+=	mrt.c
+SRCS+=	name2id.c
+SRCS+=	parse.y
+SRCS+=	pfkey.c
+SRCS+=	pftable.c
+SRCS+=	printconf.c
+SRCS+=	rde.c
+SRCS+=	rde_aspa.c
+SRCS+=	rde_attr.c
+SRCS+=	rde_community.c
+SRCS+=	rde_decide.c
+SRCS+=	rde_filter.c
+SRCS+=	rde_peer.c
+SRCS+=	rde_prefix.c
+SRCS+=	rde_rib.c
+SRCS+=	rde_sets.c
+SRCS+=	rde_trie.c
+SRCS+=	rde_update.c
+SRCS+=	rtr.c
+SRCS+=	rtr_proto.c
+SRCS+=	session.c
+SRCS+=	session_bgp.c
+SRCS+=	timer.c
+SRCS+=	util.c
+
 CFLAGS+= -Wall -I${.CURDIR}
 CFLAGS+= -Wstrict-prototypes -Wmissing-prototypes
 CFLAGS+= -Wmissing-declarations
 CFLAGS+= -Wshadow -Wpointer-arith -Wcast-qual
 CFLAGS+= -Wsign-compare
 YFLAGS=
+
 LDADD+=	-lutil
 DPADD+= ${LIBUTIL}
 MAN= bgpd.8 bgpd.conf.5
Index: session.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/session.c,v
diff -u -p -r1.518 session.c
--- session.c	20 Feb 2025 19:47:31 -0000	1.518
+++ session.c	25 Feb 2025 14:12:24 -0000
@@ -60,47 +60,17 @@
 void	session_sighdlr(int);
 int	setup_listeners(u_int *);
 void	init_peer(struct peer *, struct bgpd_config *);
-void	start_timer_holdtime(struct peer *);
-void	start_timer_sendholdtime(struct peer *);
-void	start_timer_keepalive(struct peer *);
-void	session_close_connection(struct peer *);
-void	change_state(struct peer *, enum session_state, enum session_events);
 int	session_setup_socket(struct peer *);
 void	session_accept(int);
-int	session_connect(struct peer *);
-void	session_tcp_established(struct peer *);
-int	session_capa_add(struct ibuf *, uint8_t, uint8_t);
-struct ibuf	*session_newmsg(enum msg_type, uint16_t);
-void	session_sendmsg(struct ibuf *, struct peer *, enum msg_type);
-void	session_open(struct peer *);
-void	session_keepalive(struct peer *);
-void	session_update(struct peer *, struct ibuf *);
-void	session_notification(struct peer *, uint8_t, uint8_t, struct ibuf *);
-void	session_notification_data(struct peer *, uint8_t, uint8_t, void *,
-	    size_t);
-void	session_rrefresh(struct peer *, uint8_t, uint8_t);
 int	session_graceful_restart(struct peer *);
 int	session_graceful_stop(struct peer *);
-int	session_dispatch_msg(struct pollfd *, struct peer *);
-void	session_process_msg(struct peer *);
-struct ibuf	*parse_header(struct ibuf *, void *, int *);
-int	parse_open(struct peer *, struct ibuf *);
-int	parse_update(struct peer *, struct ibuf *);
-int	parse_rrefresh(struct peer *, struct ibuf *);
-void	parse_notification(struct peer *, struct ibuf *);
-int	parse_capabilities(struct peer *, struct ibuf *, uint32_t *);
-int	capa_neg_calc(struct peer *);
 void	session_dispatch_imsg(struct imsgbuf *, int, u_int *);
-void	session_up(struct peer *);
-void	session_down(struct peer *);
 int	imsg_rde(int, uint32_t, void *, uint16_t);
-void	session_demote(struct peer *, int);
 void	merge_peers(struct bgpd_config *, struct bgpd_config *);
 
-int		 la_cmp(struct listen_addr *, struct listen_addr *);
-void		 session_template_clone(struct peer *, struct sockaddr *,
-		    uint32_t, uint32_t);
-int		 session_match_mask(struct peer *, struct bgpd_addr *);
+void	session_template_clone(struct peer *, struct sockaddr *,
+	    uint32_t, uint32_t);
+int	session_match_mask(struct peer *, struct bgpd_addr *);
 
 static struct bgpd_config	*conf, *nconf;
 static struct imsgbuf		*ibuf_rde;
@@ -116,11 +86,6 @@ u_int			 peer_cnt;
 struct mrt_head		 mrthead;
 monotime_t		 pauseaccept;
 
-static const uint8_t	 marker[MSGSIZE_HEADER_MARKER] = {
-	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
-};
-
 static inline int
 peer_compare(const struct peer *a, const struct peer *b)
 {
@@ -429,6 +394,26 @@ session_main(int debug, int verbose)
 					timeout = nextaction;
 			}
 
+			/* check if peer needs throttling or not */
+			if (!p->throttled &&
+			    msgbuf_queuelen(p->wbuf) > SESS_MSG_HIGH_MARK) {
+				if (imsg_rde(IMSG_XOFF, p->conf.id, NULL, 0) ==
+				    -1)
+					log_peer_warn(&p->conf,
+					    "imsg_compose XOFF");
+				else
+					p->throttled = 1;
+			}
+			if (p->throttled &&
+			    msgbuf_queuelen(p->wbuf) < SESS_MSG_LOW_MARK) {
+				if (imsg_rde(IMSG_XON, p->conf.id, NULL, 0) ==
+				    -1)
+					log_peer_warn(&p->conf,
+					    "imsg_compose XON");
+				else
+					p->throttled = 0;
+			}
+
 			/* are we waiting for a write? */
 			events = POLLIN;
 			if (msgbuf_queuelen(p->wbuf) > 0 ||
@@ -600,6 +585,8 @@ init_peer(struct peer *p, struct bgpd_co
 		p->conf.holdtime = c->holdtime;
 	if (p->conf.min_holdtime == 0)
 		p->conf.min_holdtime = c->min_holdtime;
+	p->connectretry = c->connectretry;
+	p->local_bgpid = c->bgpid;
 
 	peer_cnt++;
 
@@ -620,397 +607,85 @@ init_peer(struct peer *p, struct bgpd_co
 		session_demote(p, +1);
 }
 
-void
-bgp_fsm(struct peer *peer, enum session_events event, struct ibuf *msg)
+int
+session_dispatch_msg(struct pollfd *pfd, struct peer *p)
 {
-	switch (peer->state) {
-	case STATE_NONE:
-		/* nothing */
-		break;
-	case STATE_IDLE:
-		switch (event) {
-		case EVNT_START:
-			timer_stop(&peer->timers, Timer_Hold);
-			timer_stop(&peer->timers, Timer_SendHold);
-			timer_stop(&peer->timers, Timer_Keepalive);
-			timer_stop(&peer->timers, Timer_IdleHold);
-
-			if (!peer->depend_ok)
-				timer_stop(&peer->timers, Timer_ConnectRetry);
-			else if (peer->passive || peer->conf.passive ||
-			    peer->conf.template) {
-				change_state(peer, STATE_ACTIVE, event);
-				timer_stop(&peer->timers, Timer_ConnectRetry);
-			} else {
-				change_state(peer, STATE_CONNECT, event);
-				timer_set(&peer->timers, Timer_ConnectRetry,
-				    conf->connectretry);
-				session_connect(peer);
+	socklen_t	len;
+	int		error;
+
+	if (p->state == STATE_CONNECT) {
+		if (pfd->revents & POLLOUT) {
+			if (pfd->revents & POLLIN) {
+				/* error occurred */
+				len = sizeof(error);
+				if (getsockopt(pfd->fd, SOL_SOCKET, SO_ERROR,
+				    &error, &len) == -1 || error) {
+					if (error)
+						errno = error;
+					if (errno != p->lasterr) {
+						log_peer_warn(&p->conf,
+						    "socket error");
+						p->lasterr = errno;
+					}
+					bgp_fsm(p, EVNT_CON_OPENFAIL, NULL);
+					return (1);
+				}
 			}
-			peer->passive = 0;
-			break;
-		case EVNT_STOP:
-			timer_stop(&peer->timers, Timer_IdleHold);
-			break;
-		default:
-			/* ignore */
-			break;
-		}
-		break;
-	case STATE_CONNECT:
-		switch (event) {
-		case EVNT_START:
-			/* ignore */
-			break;
-		case EVNT_CON_OPEN:
-			session_tcp_established(peer);
-			session_open(peer);
-			timer_stop(&peer->timers, Timer_ConnectRetry);
-			peer->holdtime = INTERVAL_HOLD_INITIAL;
-			start_timer_holdtime(peer);
-			change_state(peer, STATE_OPENSENT, event);
-			break;
-		case EVNT_CON_OPENFAIL:
-			timer_set(&peer->timers, Timer_ConnectRetry,
-			    conf->connectretry);
-			session_close_connection(peer);
-			change_state(peer, STATE_ACTIVE, event);
-			break;
-		case EVNT_TIMER_CONNRETRY:
-			timer_set(&peer->timers, Timer_ConnectRetry,
-			    conf->connectretry);
-			session_connect(peer);
-			break;
-		default:
-			change_state(peer, STATE_IDLE, event);
-			break;
-		}
-		break;
-	case STATE_ACTIVE:
-		switch (event) {
-		case EVNT_START:
-			/* ignore */
-			break;
-		case EVNT_CON_OPEN:
-			session_tcp_established(peer);
-			session_open(peer);
-			timer_stop(&peer->timers, Timer_ConnectRetry);
-			peer->holdtime = INTERVAL_HOLD_INITIAL;
-			start_timer_holdtime(peer);
-			change_state(peer, STATE_OPENSENT, event);
-			break;
-		case EVNT_CON_OPENFAIL:
-			timer_set(&peer->timers, Timer_ConnectRetry,
-			    conf->connectretry);
-			session_close_connection(peer);
-			change_state(peer, STATE_ACTIVE, event);
-			break;
-		case EVNT_TIMER_CONNRETRY:
-			timer_set(&peer->timers, Timer_ConnectRetry,
-			    peer->holdtime);
-			change_state(peer, STATE_CONNECT, event);
-			session_connect(peer);
-			break;
-		default:
-			change_state(peer, STATE_IDLE, event);
-			break;
-		}
-		break;
-	case STATE_OPENSENT:
-		switch (event) {
-		case EVNT_START:
-			/* ignore */
-			break;
-		case EVNT_STOP:
-			change_state(peer, STATE_IDLE, event);
-			break;
-		case EVNT_CON_CLOSED:
-			session_close_connection(peer);
-			timer_set(&peer->timers, Timer_ConnectRetry,
-			    conf->connectretry);
-			change_state(peer, STATE_ACTIVE, event);
-			break;
-		case EVNT_CON_FATAL:
-			change_state(peer, STATE_IDLE, event);
-			break;
-		case EVNT_TIMER_HOLDTIME:
-			session_notification(peer, ERR_HOLDTIMEREXPIRED,
-			    0, NULL);
-			change_state(peer, STATE_IDLE, event);
-			break;
-		case EVNT_TIMER_SENDHOLD:
-			session_notification(peer, ERR_SENDHOLDTIMEREXPIRED,
-			    0, NULL);
-			change_state(peer, STATE_IDLE, event);
-			break;
-		case EVNT_RCVD_OPEN:
-			/* parse_open calls change_state itself on failure */
-			if (parse_open(peer, msg))
-				break;
-			session_keepalive(peer);
-			change_state(peer, STATE_OPENCONFIRM, event);
-			break;
-		case EVNT_RCVD_NOTIFICATION:
-			parse_notification(peer, msg);
-			break;
-		default:
-			session_notification(peer,
-			    ERR_FSM, ERR_FSM_UNEX_OPENSENT, NULL);
-			change_state(peer, STATE_IDLE, event);
-			break;
+			bgp_fsm(p, EVNT_CON_OPEN, NULL);
+			return (1);
 		}
-		break;
-	case STATE_OPENCONFIRM:
-		switch (event) {
-		case EVNT_START:
-			/* ignore */
-			break;
-		case EVNT_STOP:
-			change_state(peer, STATE_IDLE, event);
-			break;
-		case EVNT_CON_CLOSED:
-		case EVNT_CON_FATAL:
-			change_state(peer, STATE_IDLE, event);
-			break;
-		case EVNT_TIMER_HOLDTIME:
-			session_notification(peer, ERR_HOLDTIMEREXPIRED,
-			    0, NULL);
-			change_state(peer, STATE_IDLE, event);
-			break;
-		case EVNT_TIMER_SENDHOLD:
-			session_notification(peer, ERR_SENDHOLDTIMEREXPIRED,
-			    0, NULL);
-			change_state(peer, STATE_IDLE, event);
-			break;
-		case EVNT_TIMER_KEEPALIVE:
-			session_keepalive(peer);
-			break;
-		case EVNT_RCVD_KEEPALIVE:
-			start_timer_holdtime(peer);
-			change_state(peer, STATE_ESTABLISHED, event);
-			break;
-		case EVNT_RCVD_NOTIFICATION:
-			parse_notification(peer, msg);
-			break;
-		default:
-			session_notification(peer,
-			    ERR_FSM, ERR_FSM_UNEX_OPENCONFIRM, NULL);
-			change_state(peer, STATE_IDLE, event);
-			break;
+		if (pfd->revents & POLLHUP) {
+			bgp_fsm(p, EVNT_CON_OPENFAIL, NULL);
+			return (1);
 		}
-		break;
-	case STATE_ESTABLISHED:
-		switch (event) {
-		case EVNT_START:
-			/* ignore */
-			break;
-		case EVNT_STOP:
-			change_state(peer, STATE_IDLE, event);
-			break;
-		case EVNT_CON_CLOSED:
-		case EVNT_CON_FATAL:
-			change_state(peer, STATE_IDLE, event);
-			break;
-		case EVNT_TIMER_HOLDTIME:
-			session_notification(peer, ERR_HOLDTIMEREXPIRED,
-			    0, NULL);
-			change_state(peer, STATE_IDLE, event);
-			break;
-		case EVNT_TIMER_SENDHOLD:
-			session_notification(peer, ERR_SENDHOLDTIMEREXPIRED,
-			    0, NULL);
-			change_state(peer, STATE_IDLE, event);
-			break;
-		case EVNT_TIMER_KEEPALIVE:
-			session_keepalive(peer);
-			break;
-		case EVNT_RCVD_KEEPALIVE:
-			start_timer_holdtime(peer);
-			break;
-		case EVNT_RCVD_UPDATE:
-			start_timer_holdtime(peer);
-			if (parse_update(peer, msg))
-				change_state(peer, STATE_IDLE, event);
-			else
-				start_timer_holdtime(peer);
-			break;
-		case EVNT_RCVD_NOTIFICATION:
-			parse_notification(peer, msg);
-			break;
-		default:
-			session_notification(peer,
-			    ERR_FSM, ERR_FSM_UNEX_ESTABLISHED, NULL);
-			change_state(peer, STATE_IDLE, event);
-			break;
+		if (pfd->revents & (POLLERR|POLLNVAL)) {
+			bgp_fsm(p, EVNT_CON_FATAL, NULL);
+			return (1);
 		}
-		break;
+		return (0);
 	}
-}
-
-void
-start_timer_holdtime(struct peer *peer)
-{
-	if (peer->holdtime > 0)
-		timer_set(&peer->timers, Timer_Hold, peer->holdtime);
-	else
-		timer_stop(&peer->timers, Timer_Hold);
-}
-
-void
-start_timer_sendholdtime(struct peer *peer)
-{
-	uint16_t holdtime = INTERVAL_HOLD;
-
-	if (peer->holdtime > INTERVAL_HOLD)
-		holdtime = peer->holdtime;
-
-	if (peer->holdtime > 0)
-		timer_set(&peer->timers, Timer_SendHold, holdtime);
-	else
-		timer_stop(&peer->timers, Timer_SendHold);
-}
-
-void
-start_timer_keepalive(struct peer *peer)
-{
-	if (peer->holdtime > 0)
-		timer_set(&peer->timers, Timer_Keepalive, peer->holdtime / 3);
-	else
-		timer_stop(&peer->timers, Timer_Keepalive);
-}
 
-void
-session_close_connection(struct peer *peer)
-{
-	if (peer->fd != -1) {
-		close(peer->fd);
-		pauseaccept = monotime_clear();
+	if (pfd->revents & POLLHUP) {
+		bgp_fsm(p, EVNT_CON_CLOSED, NULL);
+		return (1);
+	}
+	if (pfd->revents & (POLLERR|POLLNVAL)) {
+		bgp_fsm(p, EVNT_CON_FATAL, NULL);
+		return (1);
 	}
-	peer->fd = -1;
-}
-
-void
-change_state(struct peer *peer, enum session_state state,
-    enum session_events event)
-{
-	switch (state) {
-	case STATE_IDLE:
-		/* carp demotion first. new peers handled in init_peer */
-		if (peer->state == STATE_ESTABLISHED &&
-		    peer->conf.demote_group[0] && !peer->demoted)
-			session_demote(peer, +1);
-
-		/*
-		 * try to write out what's buffered (maybe a notification),
-		 * don't bother if it fails
-		 */
-		if (peer->state >= STATE_OPENSENT &&
-		    msgbuf_queuelen(peer->wbuf) > 0)
-			ibuf_write(peer->fd, peer->wbuf);
 
-		/*
-		 * we must start the timer for the next EVNT_START
-		 * if we are coming here due to an error and the
-		 * session was not established successfully before, the
-		 * starttimerinterval needs to be exponentially increased
-		 */
-		if (peer->IdleHoldTime == 0)
-			peer->IdleHoldTime = INTERVAL_IDLE_HOLD_INITIAL;
-		peer->holdtime = INTERVAL_HOLD_INITIAL;
-		timer_stop(&peer->timers, Timer_ConnectRetry);
-		timer_stop(&peer->timers, Timer_Keepalive);
-		timer_stop(&peer->timers, Timer_Hold);
-		timer_stop(&peer->timers, Timer_SendHold);
-		timer_stop(&peer->timers, Timer_IdleHold);
-		timer_stop(&peer->timers, Timer_IdleHoldReset);
-		session_close_connection(peer);
-		msgbuf_clear(peer->wbuf);
-		peer->rpending = 0;
-		memset(&peer->capa.peer, 0, sizeof(peer->capa.peer));
-		if (!peer->template)
-			imsg_compose(ibuf_main, IMSG_PFKEY_RELOAD,
-			    peer->conf.id, 0, -1, NULL, 0);
-
-		if (peer->state == STATE_ESTABLISHED) {
-			if (peer->capa.neg.grestart.restart == 2 &&
-			    (event == EVNT_CON_CLOSED ||
-			    event == EVNT_CON_FATAL ||
-			    (peer->capa.neg.grestart.grnotification &&
-			    (event == EVNT_RCVD_GRACE_NOTIFICATION ||
-			    event == EVNT_TIMER_HOLDTIME ||
-			    event == EVNT_TIMER_SENDHOLD)))) {
-				/* don't punish graceful restart */
-				timer_set(&peer->timers, Timer_IdleHold, 0);
-				session_graceful_restart(peer);
-			} else if (event != EVNT_STOP) {
-				timer_set(&peer->timers, Timer_IdleHold,
-				    peer->IdleHoldTime);
-				if (event != EVNT_NONE &&
-				    peer->IdleHoldTime < MAX_IDLE_HOLD/2)
-					peer->IdleHoldTime *= 2;
-				session_down(peer);
-			} else {
-				session_down(peer);
-			}
-		} else if (event != EVNT_STOP) {
-			timer_set(&peer->timers, Timer_IdleHold,
-			    peer->IdleHoldTime);
-			if (event != EVNT_NONE &&
-			    peer->IdleHoldTime < MAX_IDLE_HOLD / 2)
-				peer->IdleHoldTime *= 2;
+	if (pfd->revents & POLLOUT && msgbuf_queuelen(p->wbuf) > 0) {
+		if (ibuf_write(p->fd, p->wbuf) == -1) {
+			if (errno == EPIPE)
+				log_peer_warnx(&p->conf, "Connection closed");
+			else
+				log_peer_warn(&p->conf, "write error");
+			bgp_fsm(p, EVNT_CON_FATAL, NULL);
+			return (1);
 		}
+		p->stats.last_write = getmonotime();
+		start_timer_sendholdtime(p);
+		if (!(pfd->revents & POLLIN))
+			return (1);
+	}
 
-		if (peer->state == STATE_NONE ||
-		    peer->state == STATE_ESTABLISHED) {
-			/* initialize capability negotiation structures */
-			memcpy(&peer->capa.ann, &peer->conf.capabilities,
-			    sizeof(peer->capa.ann));
-		}
-		break;
-	case STATE_CONNECT:
-		if (peer->state == STATE_ESTABLISHED &&
-		    peer->capa.neg.grestart.restart == 2) {
-			/* do the graceful restart dance */
-			session_graceful_restart(peer);
-			peer->holdtime = INTERVAL_HOLD_INITIAL;
-			timer_stop(&peer->timers, Timer_ConnectRetry);
-			timer_stop(&peer->timers, Timer_Keepalive);
-			timer_stop(&peer->timers, Timer_Hold);
-			timer_stop(&peer->timers, Timer_SendHold);
-			timer_stop(&peer->timers, Timer_IdleHold);
-			timer_stop(&peer->timers, Timer_IdleHoldReset);
-			session_close_connection(peer);
-			msgbuf_clear(peer->wbuf);
-			memset(&peer->capa.peer, 0, sizeof(peer->capa.peer));
+	if (p->fd != -1 && pfd->revents & POLLIN) {
+		switch (ibuf_read(p->fd, p->wbuf)) {
+		case -1:
+			if (p->state == STATE_IDLE)
+				/* error already handled before */
+				return (1);
+			log_peer_warn(&p->conf, "read error");
+			bgp_fsm(p, EVNT_CON_FATAL, NULL);
+			return (1);
+		case 0:
+			bgp_fsm(p, EVNT_CON_CLOSED, NULL);
+			return (1);
 		}
-		break;
-	case STATE_ACTIVE:
-		if (!peer->template)
-			imsg_compose(ibuf_main, IMSG_PFKEY_RELOAD,
-			    peer->conf.id, 0, -1, NULL, 0);
-		break;
-	case STATE_OPENSENT:
-		break;
-	case STATE_OPENCONFIRM:
-		break;
-	case STATE_ESTABLISHED:
-		timer_set(&peer->timers, Timer_IdleHoldReset,
-		    peer->IdleHoldTime);
-		if (peer->demoted)
-			timer_set(&peer->timers, Timer_CarpUndemote,
-			    INTERVAL_HOLD_DEMOTED);
-		session_up(peer);
-		break;
-	default:		/* something seriously fucked */
-		break;
+		p->stats.last_read = getmonotime();
+		return (1);
 	}
-
-	log_statechange(peer, state, event);
-
-	session_mrt_dump_state(peer, peer->state, state);
-
-	peer->prev_state = peer->state;
-	peer->state = state;
+	return (0);
 }
 
 void
@@ -1047,7 +722,7 @@ session_accept(int listenfd)
 	    (p->state == STATE_CONNECT || p->state == STATE_ACTIVE)) {
 		if (p->fd != -1) {
 			if (p->state == STATE_CONNECT)
-				session_close_connection(p);
+				session_close(p);
 			else {
 				close(connfd);
 				return;
@@ -1136,15 +811,17 @@ session_connect(struct peer *peer)
 
 	sa = addr2sa(&peer->conf.remote_addr, peer->conf.remote_port, &sa_len);
 	if (connect(peer->fd, sa, sa_len) == -1) {
-		if (errno != EINPROGRESS) {
-			if (errno != peer->lasterr)
-				log_peer_warn(&peer->conf, "connect");
-			peer->lasterr = errno;
-			bgp_fsm(peer, EVNT_CON_OPENFAIL, NULL);
-			return (-1);
-		}
-	} else
-		bgp_fsm(peer, EVNT_CON_OPEN, NULL);
+		if (errno == EINPROGRESS)
+			return (0);
+
+		if (errno != peer->lasterr)
+			log_peer_warn(&peer->conf, "connect");
+		peer->lasterr = errno;
+		bgp_fsm(peer, EVNT_CON_OPENFAIL, NULL);
+		return (-1);
+	}
+
+	bgp_fsm(peer, EVNT_CON_OPEN, NULL);
 
 	return (0);
 }
@@ -1243,6 +920,16 @@ session_setup_socket(struct peer *p)
 	return (0);
 }
 
+void
+session_close(struct peer *peer)
+{
+	if (peer->fd != -1) {
+		close(peer->fd);
+		pauseaccept = monotime_clear();
+	}
+	peer->fd = -1;
+}
+
 /*
  * compare the bgpd_addr with the sockaddr by converting the latter into
  * a bgpd_addr. Return true if the two are equal, including any scope
@@ -1256,7 +943,7 @@ sa_equal(struct bgpd_addr *ba, struct so
 	return (memcmp(ba, &bb, sizeof(*ba)) == 0);
 }
 
-static void
+void
 get_alternate_addr(struct bgpd_addr *local, struct bgpd_addr *remote,
     struct bgpd_addr *alt, unsigned int *scope)
 {
@@ -1339,1679 +1026,95 @@ get_alternate_addr(struct bgpd_addr *loc
 	freeifaddrs(ifap);
 }
 
-void
-session_tcp_established(struct peer *peer)
-{
-	struct sockaddr_storage	ss;
-	socklen_t		len;
-
-	len = sizeof(ss);
-	if (getsockname(peer->fd, (struct sockaddr *)&ss, &len) == -1)
-		log_warn("getsockname");
-	sa2addr((struct sockaddr *)&ss, &peer->local, &peer->local_port);
-	len = sizeof(ss);
-	if (getpeername(peer->fd, (struct sockaddr *)&ss, &len) == -1)
-		log_warn("getpeername");
-	sa2addr((struct sockaddr *)&ss, &peer->remote, &peer->remote_port);
-
-	get_alternate_addr(&peer->local, &peer->remote, &peer->local_alt,
-	    &peer->if_scope);
-}
-
 int
-session_capa_add(struct ibuf *opb, uint8_t capa_code, uint8_t capa_len)
+session_hanlde_update(struct peer *peer, struct ibuf *msg)
 {
-	int errs = 0;
-
-	errs += ibuf_add_n8(opb, capa_code);
-	errs += ibuf_add_n8(opb, capa_len);
-	return (errs);
+	/*
+	 * we pass the message verbatim to the rde.
+	 * in case of errors the whole session is reset with a
+	 * notification anyway, we only need to know the peer
+	 */
+	if (imsg_rde(IMSG_UPDATE, peer->conf.id, ibuf_data(msg),
+	    ibuf_size(msg)) == -1)
+		return (-1);
+	return (0);
 }
 
-static int
-session_capa_add_mp(struct ibuf *buf, uint8_t aid)
+int
+session_handle_rrefresh(struct peer *peer, struct route_refresh *rr)
 {
-	uint16_t		 afi;
-	uint8_t			 safi;
-	int			 errs = 0;
-
-	if (aid2afi(aid, &afi, &safi) == -1) {
-		log_warn("%s: bad AID", __func__);
+	if (imsg_rde(IMSG_REFRESH, peer->conf.id, rr, sizeof(*rr)) == -1)
 		return (-1);
-	}
-
-	errs += ibuf_add_n16(buf, afi);
-	errs += ibuf_add_zero(buf, 1);
-	errs += ibuf_add_n8(buf, safi);
-
-	return (errs);
+	return (0);
 }
 
-static int
-session_capa_add_afi(struct ibuf *b, uint8_t aid, uint8_t flags)
+int
+session_graceful_restart(struct peer *p)
 {
-	int		errs = 0;
-	uint16_t	afi;
-	uint8_t		safi;
+	uint8_t	i;
+	uint16_t staletime = conf->staletime;
 
-	if (aid2afi(aid, &afi, &safi)) {
-		log_warn("%s: bad AID", __func__);
-		return (-1);
-	}
+	if (p->conf.staletime)
+		staletime = p->conf.staletime;
 
-	errs += ibuf_add_n16(b, afi);
-	errs += ibuf_add_n8(b, safi);
-	errs += ibuf_add_n8(b, flags);
+	/* RFC 8538: enforce configurable upper bound of the stale timer */
+	if (staletime > p->capa.neg.grestart.timeout)
+		staletime = p->capa.neg.grestart.timeout;
+	timer_set(&p->timers, Timer_RestartTimeout, staletime);
 
-	return (errs);
+	for (i = AID_MIN; i < AID_MAX; i++) {
+		if (p->capa.neg.grestart.flags[i] & CAPA_GR_PRESENT) {
+			if (imsg_rde(IMSG_SESSION_STALE, p->conf.id,
+			    &i, sizeof(i)) == -1)
+				return -1;
+			log_peer_warnx(&p->conf,
+			    "graceful restart of %s, keeping routes",
+			    aid2str(i));
+			p->capa.neg.grestart.flags[i] |= CAPA_GR_RESTARTING;
+		} else if (p->capa.neg.mp[i]) {
+			if (imsg_rde(IMSG_SESSION_NOGRACE, p->conf.id,
+			    &i, sizeof(i)) == -1)
+				return -1;
+			log_peer_warnx(&p->conf,
+			    "graceful restart of %s, flushing routes",
+			    aid2str(i));
+		}
+	}
+	return 0;
 }
 
-static int
-session_capa_add_ext_nh(struct ibuf *b, uint8_t aid)
+int
+session_graceful_stop(struct peer *p)
 {
-	int		errs = 0;
-	uint16_t	afi;
-	uint8_t		safi;
+	uint8_t	i;
 
-	if (aid2afi(aid, &afi, &safi)) {
-		log_warn("%s: bad AID", __func__);
-		return (-1);
+	for (i = AID_MIN; i < AID_MAX; i++) {
+		/*
+		 * Only flush if the peer is restarting and the timeout fired.
+		 * In all other cases the session was already flushed when the
+		 * session went down or when the new open message was parsed.
+		 */
+		if (p->capa.neg.grestart.flags[i] & CAPA_GR_RESTARTING)
+			if (session_graceful_flush(p, i, "time-out") == -1)
+				return -1;
+		p->capa.neg.grestart.flags[i] &= ~CAPA_GR_RESTARTING;
 	}
-
-	errs += ibuf_add_n16(b, afi);
-	errs += ibuf_add_n16(b, safi);
-	errs += ibuf_add_n16(b, AFI_IPv6);
-
-	return (errs);
+	return 0;
 }
 
-struct ibuf *
-session_newmsg(enum msg_type msgtype, uint16_t len)
+int
+session_graceful_flush(struct peer *p, uint8_t aid, const char *why)
 {
-	struct ibuf		*buf;
-	int			 errs = 0;
-
-	if ((buf = ibuf_open(len)) == NULL)
-		return (NULL);
-
-	errs += ibuf_add(buf, marker, sizeof(marker));
-	errs += ibuf_add_n16(buf, len);
-	errs += ibuf_add_n8(buf, msgtype);
-
-	if (errs) {
-		ibuf_free(buf);
-		return (NULL);
-	}
-
-	return (buf);
-}
+	log_peer_warnx(&p->conf, "graceful restart of %s, %s, flushing",
+	    aid2str(aid), why);
+	if (imsg_rde(IMSG_SESSION_FLUSH, p->conf.id, &aid, sizeof(aid)) == -1)
+		return -1;
+	return 0;
+}	
 
 void
-session_sendmsg(struct ibuf *msg, struct peer *p, enum msg_type msgtype)
-{
-	session_mrt_dump_bgp_msg(p, msg, msgtype, DIR_OUT);
-
-	ibuf_close(p->wbuf, msg);
-	if (!p->throttled && msgbuf_queuelen(p->wbuf) > SESS_MSG_HIGH_MARK) {
-		if (imsg_rde(IMSG_XOFF, p->conf.id, NULL, 0) == -1)
-			log_peer_warn(&p->conf, "imsg_compose XOFF");
-		else
-			p->throttled = 1;
-	}
-}
-
-/*
- * Translate between internal roles and the value expected by RFC 9234.
- */
-static uint8_t
-role2capa(enum role role)
-{
-	switch (role) {
-	case ROLE_CUSTOMER:
-		return CAPA_ROLE_CUSTOMER;
-	case ROLE_PROVIDER:
-		return CAPA_ROLE_PROVIDER;
-	case ROLE_RS:
-		return CAPA_ROLE_RS;
-	case ROLE_RS_CLIENT:
-		return CAPA_ROLE_RS_CLIENT;
-	case ROLE_PEER:
-		return CAPA_ROLE_PEER;
-	default:
-		fatalx("Unsupported role for role capability");
-	}
-}
-
-static enum role
-capa2role(uint8_t val)
-{
-	switch (val) {
-	case CAPA_ROLE_PROVIDER:
-		return ROLE_PROVIDER;
-	case CAPA_ROLE_RS:
-		return ROLE_RS;
-	case CAPA_ROLE_RS_CLIENT:
-		return ROLE_RS_CLIENT;
-	case CAPA_ROLE_CUSTOMER:
-		return ROLE_CUSTOMER;
-	case CAPA_ROLE_PEER:
-		return ROLE_PEER;
-	default:
-		return ROLE_NONE;
-	}
-}
-
-void
-session_open(struct peer *p)
-{
-	struct ibuf		*buf, *opb;
-	size_t			 len, optparamlen;
-	uint8_t			 i;
-	int			 errs = 0, extlen = 0;
-	int			 mpcapa = 0;
-
-
-	if ((opb = ibuf_dynamic(0, MAX_PKTSIZE - MSGSIZE_OPEN_MIN - 6)) ==
-	    NULL) {
-		bgp_fsm(p, EVNT_CON_FATAL, NULL);
-		return;
-	}
-
-	/* multiprotocol extensions, RFC 4760 */
-	for (i = AID_MIN; i < AID_MAX; i++)
-		if (p->capa.ann.mp[i]) {	/* 4 bytes data */
-			errs += session_capa_add(opb, CAPA_MP, 4);
-			errs += session_capa_add_mp(opb, i);
-			mpcapa++;
-		}
-
-	/* route refresh, RFC 2918 */
-	if (p->capa.ann.refresh)	/* no data */
-		errs += session_capa_add(opb, CAPA_REFRESH, 0);
-
-	/* extended nexthop encoding, RFC 8950 */
-	if (p->capa.ann.ext_nh[AID_INET]) {
-		uint8_t enhlen = 0;
-
-		if (p->capa.ann.mp[AID_INET])
-			enhlen += 6;
-		if (p->capa.ann.mp[AID_VPN_IPv4])
-			enhlen += 6;
-		errs += session_capa_add(opb, CAPA_EXT_NEXTHOP, enhlen);
-		if (p->capa.ann.mp[AID_INET])
-			errs += session_capa_add_ext_nh(opb, AID_INET);
-		if (p->capa.ann.mp[AID_VPN_IPv4])
-			errs += session_capa_add_ext_nh(opb, AID_VPN_IPv4);
-	}
-
-	/* extended message support, RFC 8654 */
-	if (p->capa.ann.ext_msg)	/* no data */
-		errs += session_capa_add(opb, CAPA_EXT_MSG, 0);
-
-	/* BGP open policy, RFC 9234, only for ebgp sessions */
-	if (p->conf.ebgp && p->capa.ann.policy &&
-	    p->conf.role != ROLE_NONE &&
-	    (p->capa.ann.mp[AID_INET] || p->capa.ann.mp[AID_INET6] ||
-	    mpcapa == 0)) {
-		errs += session_capa_add(opb, CAPA_ROLE, 1);
-		errs += ibuf_add_n8(opb, role2capa(p->conf.role));
-	}
-
-	/* graceful restart and End-of-RIB marker, RFC 4724 */
-	if (p->capa.ann.grestart.restart) {
-		int		rst = 0;
-		uint16_t	hdr = 0;
-
-		for (i = AID_MIN; i < AID_MAX; i++) {
-			if (p->capa.neg.grestart.flags[i] & CAPA_GR_RESTARTING)
-				rst++;
-		}
-
-		/* Only set the R-flag if no graceful restart is ongoing */
-		if (!rst)
-			hdr |= CAPA_GR_R_FLAG;
-		if (p->capa.ann.grestart.grnotification)
-			hdr |= CAPA_GR_N_FLAG;
-		errs += session_capa_add(opb, CAPA_RESTART, sizeof(hdr));
-		errs += ibuf_add_n16(opb, hdr);
-	}
-
-	/* 4-bytes AS numbers, RFC6793 */
-	if (p->capa.ann.as4byte) {	/* 4 bytes data */
-		errs += session_capa_add(opb, CAPA_AS4BYTE, sizeof(uint32_t));
-		errs += ibuf_add_n32(opb, p->conf.local_as);
-	}
-
-	/* advertisement of multiple paths, RFC7911 */
-	if (p->capa.ann.add_path[AID_MIN]) {	/* variable */
-		uint8_t	aplen;
-
-		if (mpcapa)
-			aplen = 4 * mpcapa;
-		else	/* AID_INET */
-			aplen = 4;
-		errs += session_capa_add(opb, CAPA_ADD_PATH, aplen);
-		if (mpcapa) {
-			for (i = AID_MIN; i < AID_MAX; i++) {
-				if (p->capa.ann.mp[i]) {
-					errs += session_capa_add_afi(opb,
-					    i, p->capa.ann.add_path[i] &
-					    CAPA_AP_MASK);
-				}
-			}
-		} else {	/* AID_INET */
-			errs += session_capa_add_afi(opb, AID_INET,
-			    p->capa.ann.add_path[AID_INET] & CAPA_AP_MASK);
-		}
-	}
-
-	/* enhanced route-refresh, RFC7313 */
-	if (p->capa.ann.enhanced_rr)	/* no data */
-		errs += session_capa_add(opb, CAPA_ENHANCED_RR, 0);
-
-	if (errs) {
-		ibuf_free(opb);
-		bgp_fsm(p, EVNT_CON_FATAL, NULL);
-		return;
-	}
-
-	optparamlen = ibuf_size(opb);
-	len = MSGSIZE_OPEN_MIN + optparamlen;
-	if (optparamlen == 0) {
-		/* nothing */
-	} else if (optparamlen + 2 >= 255) {
-		/* RFC9072: use 255 as magic size and request extra header */
-		optparamlen = 255;
-		extlen = 1;
-		/* 3 byte OPT_PARAM_EXT_LEN and OPT_PARAM_CAPABILITIES */
-		len += 2 * 3;
-	} else {
-		/* regular capabilities header */
-		optparamlen += 2;
-		len += 2;
-	}
-
-	if ((buf = session_newmsg(BGP_OPEN, len)) == NULL) {
-		ibuf_free(opb);
-		bgp_fsm(p, EVNT_CON_FATAL, NULL);
-		return;
-	}
-
-	errs += ibuf_add_n8(buf, 4);
-	errs += ibuf_add_n16(buf, p->conf.local_short_as);
-	errs += ibuf_add_n16(buf, p->conf.holdtime);
-	/* is already in network byte order */
-	errs += ibuf_add_n32(buf, conf->bgpid);
-	errs += ibuf_add_n8(buf, optparamlen);
-
-	if (extlen) {
-		/* RFC9072 extra header which spans over the capabilities hdr */
-		errs += ibuf_add_n8(buf, OPT_PARAM_EXT_LEN);
-		errs += ibuf_add_n16(buf, ibuf_size(opb) + 1 + 2);
-	}
-
-	if (optparamlen) {
-		errs += ibuf_add_n8(buf, OPT_PARAM_CAPABILITIES);
-
-		if (extlen) {
-			/* RFC9072: 2-byte extended length */
-			errs += ibuf_add_n16(buf, ibuf_size(opb));
-		} else {
-			errs += ibuf_add_n8(buf, ibuf_size(opb));
-		}
-		errs += ibuf_add_ibuf(buf, opb);
-	}
-
-	ibuf_free(opb);
-
-	if (errs) {
-		ibuf_free(buf);
-		bgp_fsm(p, EVNT_CON_FATAL, NULL);
-		return;
-	}
-
-	session_sendmsg(buf, p, BGP_OPEN);
-	p->stats.msg_sent_open++;
-}
-
-void
-session_keepalive(struct peer *p)
-{
-	struct ibuf		*buf;
-
-	if ((buf = session_newmsg(BGP_KEEPALIVE, MSGSIZE_KEEPALIVE)) == NULL) {
-		bgp_fsm(p, EVNT_CON_FATAL, NULL);
-		return;
-	}
-
-	session_sendmsg(buf, p, BGP_KEEPALIVE);
-	start_timer_keepalive(p);
-	p->stats.msg_sent_keepalive++;
-}
-
-void
-session_update(struct peer *p, struct ibuf *ibuf)
-{
-	struct ibuf	*buf;
-	size_t		 len, maxsize = MAX_PKTSIZE;
-
-	if (p->state != STATE_ESTABLISHED)
-		return;
-
-	if (p->capa.neg.ext_msg)
-		maxsize = MAX_EXT_PKTSIZE;
-	len = ibuf_size(ibuf);
-	if (len < MSGSIZE_UPDATE_MIN - MSGSIZE_HEADER ||
-	    len > maxsize - MSGSIZE_HEADER) {
-		log_peer_warnx(&p->conf, "bad UPDATE from RDE");
-		return;
-	}
-
-	if ((buf = session_newmsg(BGP_UPDATE, MSGSIZE_HEADER + len)) == NULL) {
-		bgp_fsm(p, EVNT_CON_FATAL, NULL);
-		return;
-	}
-
-	if (ibuf_add_ibuf(buf, ibuf)) {
-		ibuf_free(buf);
-		bgp_fsm(p, EVNT_CON_FATAL, NULL);
-		return;
-	}
-
-	session_sendmsg(buf, p, BGP_UPDATE);
-	start_timer_keepalive(p);
-	p->stats.msg_sent_update++;
-}
-
-/* Return 1 if a hard reset should be issued, 0 for a graceful notification */
-static int
-session_req_hard_reset(enum err_codes errcode, uint8_t subcode)
-{
-	switch (errcode) {
-	case ERR_HEADER:
-	case ERR_OPEN:
-	case ERR_UPDATE:
-	case ERR_FSM:
-	case ERR_RREFRESH:
-		/*
-		 * Protocol errors trigger a hard reset. The peer
-		 * is not trustworthy and so there is no realistic
-		 * hope that forwarding can continue.
-		 */
-		break;
-	case ERR_HOLDTIMEREXPIRED:
-	case ERR_SENDHOLDTIMEREXPIRED:
-		/* Keep forwarding and hope the other side is back soon. */
-		return 0;
-	case ERR_CEASE:
-		switch (subcode) {
-		case ERR_CEASE_CONN_REJECT:
-		case ERR_CEASE_OTHER_CHANGE:
-		case ERR_CEASE_COLLISION:
-		case ERR_CEASE_RSRC_EXHAUST:
-			/* Per RFC8538 suggestion make these graceful. */
-			return 0;
-		}
-		break;
-	}
-	return 1;
-}
-
-void
-session_notification_data(struct peer *p, uint8_t errcode, uint8_t subcode,
-    void *data, size_t datalen)
-{
-	struct ibuf ibuf;
-
-	ibuf_from_buffer(&ibuf, data, datalen);
-	session_notification(p, errcode, subcode, &ibuf);
-}
-
-void
-session_notification(struct peer *p, uint8_t errcode, uint8_t subcode,
-    struct ibuf *ibuf)
-{
-	struct ibuf		*buf;
-	const char		*reason = "sending";
-	int			 errs = 0, need_hard_reset = 0;
-	size_t			 datalen = 0;
-
-	switch (p->state) {
-	case STATE_OPENSENT:
-	case STATE_OPENCONFIRM:
-	case STATE_ESTABLISHED:
-		break;
-	default:
-		/* session not open, no need to send notification */
-		log_notification(p, errcode, subcode, ibuf, "dropping");
-		return;
-	}
-
-	if (p->capa.neg.grestart.grnotification) {
-		if (session_req_hard_reset(errcode, subcode)) {
-			need_hard_reset = 1;
-			datalen += 2;
-			reason = "sending hard-reset";
-		} else {
-			reason = "sending graceful";
-		}
-	}
-
-	log_notification(p, errcode, subcode, ibuf, reason);
-
-	/* cap to maximum size */
-	if (ibuf != NULL) {
-		if (ibuf_size(ibuf) >
-		    MAX_PKTSIZE - MSGSIZE_NOTIFICATION_MIN - datalen) {
-			log_peer_warnx(&p->conf,
-			    "oversized notification, data trunkated");
-			ibuf_truncate(ibuf, MAX_PKTSIZE -
-			    MSGSIZE_NOTIFICATION_MIN - datalen);
-		}
-		datalen += ibuf_size(ibuf);
-	}
-
-	if ((buf = session_newmsg(BGP_NOTIFICATION,
-	    MSGSIZE_NOTIFICATION_MIN + datalen)) == NULL) {
-		bgp_fsm(p, EVNT_CON_FATAL, NULL);
-		return;
-	}
-
-	if (need_hard_reset) {
-		errs += ibuf_add_n8(buf, ERR_CEASE);
-		errs += ibuf_add_n8(buf, ERR_CEASE_HARD_RESET);
-	}
-
-	errs += ibuf_add_n8(buf, errcode);
-	errs += ibuf_add_n8(buf, subcode);
-
-	if (ibuf != NULL)
-		errs += ibuf_add_ibuf(buf, ibuf);
-
-	if (errs) {
-		ibuf_free(buf);
-		bgp_fsm(p, EVNT_CON_FATAL, NULL);
-		return;
-	}
-
-	session_sendmsg(buf, p, BGP_NOTIFICATION);
-	p->stats.msg_sent_notification++;
-	p->stats.last_sent_errcode = errcode;
-	p->stats.last_sent_suberr = subcode;
-}
-
-int
-session_neighbor_rrefresh(struct peer *p)
-{
-	uint8_t	i;
-
-	if (!(p->capa.neg.refresh || p->capa.neg.enhanced_rr))
-		return (-1);
-
-	for (i = AID_MIN; i < AID_MAX; i++) {
-		if (p->capa.neg.mp[i] != 0)
-			session_rrefresh(p, i, ROUTE_REFRESH_REQUEST);
-	}
-
-	return (0);
-}
-
-void
-session_rrefresh(struct peer *p, uint8_t aid, uint8_t subtype)
-{
-	struct ibuf		*buf;
-	int			 errs = 0;
-	uint16_t		 afi;
-	uint8_t			 safi;
-
-	switch (subtype) {
-	case ROUTE_REFRESH_REQUEST:
-		p->stats.refresh_sent_req++;
-		break;
-	case ROUTE_REFRESH_BEGIN_RR:
-	case ROUTE_REFRESH_END_RR:
-		/* requires enhanced route refresh */
-		if (!p->capa.neg.enhanced_rr)
-			return;
-		if (subtype == ROUTE_REFRESH_BEGIN_RR)
-			p->stats.refresh_sent_borr++;
-		else
-			p->stats.refresh_sent_eorr++;
-		break;
-	default:
-		fatalx("session_rrefresh: bad subtype %d", subtype);
-	}
-
-	if (aid2afi(aid, &afi, &safi) == -1)
-		fatalx("session_rrefresh: bad afi/safi pair");
-
-	if ((buf = session_newmsg(BGP_RREFRESH, MSGSIZE_RREFRESH)) == NULL) {
-		bgp_fsm(p, EVNT_CON_FATAL, NULL);
-		return;
-	}
-
-	errs += ibuf_add_n16(buf, afi);
-	errs += ibuf_add_n8(buf, subtype);
-	errs += ibuf_add_n8(buf, safi);
-
-	if (errs) {
-		ibuf_free(buf);
-		bgp_fsm(p, EVNT_CON_FATAL, NULL);
-		return;
-	}
-
-	session_sendmsg(buf, p, BGP_RREFRESH);
-	p->stats.msg_sent_rrefresh++;
-}
-
-int
-session_graceful_restart(struct peer *p)
-{
-	uint8_t	i;
-	uint16_t staletime = conf->staletime;
-
-	if (p->conf.staletime)
-		staletime = p->conf.staletime;
-
-	/* RFC 8538: enforce configurable upper bound of the stale timer */
-	if (staletime > p->capa.neg.grestart.timeout)
-		staletime = p->capa.neg.grestart.timeout;
-	timer_set(&p->timers, Timer_RestartTimeout, staletime);
-
-	for (i = AID_MIN; i < AID_MAX; i++) {
-		if (p->capa.neg.grestart.flags[i] & CAPA_GR_PRESENT) {
-			if (imsg_rde(IMSG_SESSION_STALE, p->conf.id,
-			    &i, sizeof(i)) == -1)
-				return (-1);
-			log_peer_warnx(&p->conf,
-			    "graceful restart of %s, keeping routes",
-			    aid2str(i));
-			p->capa.neg.grestart.flags[i] |= CAPA_GR_RESTARTING;
-		} else if (p->capa.neg.mp[i]) {
-			if (imsg_rde(IMSG_SESSION_NOGRACE, p->conf.id,
-			    &i, sizeof(i)) == -1)
-				return (-1);
-			log_peer_warnx(&p->conf,
-			    "graceful restart of %s, flushing routes",
-			    aid2str(i));
-		}
-	}
-	return (0);
-}
-
-int
-session_graceful_stop(struct peer *p)
-{
-	uint8_t	i;
-
-	for (i = AID_MIN; i < AID_MAX; i++) {
-		/*
-		 * Only flush if the peer is restarting and the timeout fired.
-		 * In all other cases the session was already flushed when the
-		 * session went down or when the new open message was parsed.
-		 */
-		if (p->capa.neg.grestart.flags[i] & CAPA_GR_RESTARTING) {
-			log_peer_warnx(&p->conf, "graceful restart of %s, "
-			    "time-out, flushing", aid2str(i));
-			if (imsg_rde(IMSG_SESSION_FLUSH, p->conf.id,
-			    &i, sizeof(i)) == -1)
-				return (-1);
-		}
-		p->capa.neg.grestart.flags[i] &= ~CAPA_GR_RESTARTING;
-	}
-	return (0);
-}
-
-int
-session_dispatch_msg(struct pollfd *pfd, struct peer *p)
-{
-	socklen_t	len;
-	int		error;
-
-	if (p->state == STATE_CONNECT) {
-		if (pfd->revents & POLLOUT) {
-			if (pfd->revents & POLLIN) {
-				/* error occurred */
-				len = sizeof(error);
-				if (getsockopt(pfd->fd, SOL_SOCKET, SO_ERROR,
-				    &error, &len) == -1 || error) {
-					if (error)
-						errno = error;
-					if (errno != p->lasterr) {
-						log_peer_warn(&p->conf,
-						    "socket error");
-						p->lasterr = errno;
-					}
-					bgp_fsm(p, EVNT_CON_OPENFAIL, NULL);
-					return (1);
-				}
-			}
-			bgp_fsm(p, EVNT_CON_OPEN, NULL);
-			return (1);
-		}
-		if (pfd->revents & POLLHUP) {
-			bgp_fsm(p, EVNT_CON_OPENFAIL, NULL);
-			return (1);
-		}
-		if (pfd->revents & (POLLERR|POLLNVAL)) {
-			bgp_fsm(p, EVNT_CON_FATAL, NULL);
-			return (1);
-		}
-		return (0);
-	}
-
-	if (pfd->revents & POLLHUP) {
-		bgp_fsm(p, EVNT_CON_CLOSED, NULL);
-		return (1);
-	}
-	if (pfd->revents & (POLLERR|POLLNVAL)) {
-		bgp_fsm(p, EVNT_CON_FATAL, NULL);
-		return (1);
-	}
-
-	if (pfd->revents & POLLOUT && msgbuf_queuelen(p->wbuf) > 0) {
-		if (ibuf_write(p->fd, p->wbuf) == -1) {
-			if (errno == EPIPE)
-				log_peer_warnx(&p->conf, "Connection closed");
-			else
-				log_peer_warn(&p->conf, "write error");
-			bgp_fsm(p, EVNT_CON_FATAL, NULL);
-			return (1);
-		}
-		p->stats.last_write = getmonotime();
-		start_timer_sendholdtime(p);
-		if (p->throttled &&
-		    msgbuf_queuelen(p->wbuf) < SESS_MSG_LOW_MARK) {
-			if (imsg_rde(IMSG_XON, p->conf.id, NULL, 0) == -1)
-				log_peer_warn(&p->conf, "imsg_compose XON");
-			else
-				p->throttled = 0;
-		}
-		if (!(pfd->revents & POLLIN))
-			return (1);
-	}
-
-	if (p->fd != -1 && pfd->revents & POLLIN) {
-		switch (ibuf_read(p->fd, p->wbuf)) {
-		case -1:
-			if (p->state == STATE_IDLE)
-				/* error already handled before */
-				return (1);
-			log_peer_warn(&p->conf, "read error");
-			bgp_fsm(p, EVNT_CON_FATAL, NULL);
-			return (1);
-		case 0:
-			bgp_fsm(p, EVNT_CON_CLOSED, NULL);
-			return (1);
-		}
-		p->stats.last_read = getmonotime();
-		return (1);
-	}
-	return (0);
-}
-
-void
-session_process_msg(struct peer *p)
-{
-	struct ibuf	*msg;
-	int		processed = 0;
-	uint8_t		msgtype;
-
-	p->rpending = 0;
-	if (p->wbuf == NULL)
-		return;
-
-	/*
-	 * session might drop to IDLE -> all buffers are flushed
-	 */
-	while ((msg = msgbuf_get(p->wbuf)) != NULL) {
-		/* skip msg header and extract type */
-		if (ibuf_skip(msg, MSGSIZE_HEADER_MARKER) == -1 ||
-		    ibuf_skip(msg, sizeof(uint16_t)) == -1 ||
-		    ibuf_get_n8(msg, &msgtype) == -1) {
-			log_peer_warn(&p->conf, "process message failed");
-			bgp_fsm(p, EVNT_CON_FATAL, NULL);
-			ibuf_free(msg);
-			return;
-		}
-		ibuf_rewind(msg);
-
-		session_mrt_dump_bgp_msg(p, msg, msgtype, DIR_IN);
-
-		ibuf_skip(msg, MSGSIZE_HEADER);
-
-		switch (msgtype) {
-		case BGP_OPEN:
-			bgp_fsm(p, EVNT_RCVD_OPEN, msg);
-			p->stats.msg_rcvd_open++;
-			break;
-		case BGP_UPDATE:
-			bgp_fsm(p, EVNT_RCVD_UPDATE, msg);
-			p->stats.msg_rcvd_update++;
-			break;
-		case BGP_NOTIFICATION:
-			bgp_fsm(p, EVNT_RCVD_NOTIFICATION, msg);
-			p->stats.msg_rcvd_notification++;
-			break;
-		case BGP_KEEPALIVE:
-			bgp_fsm(p, EVNT_RCVD_KEEPALIVE, msg);
-			p->stats.msg_rcvd_keepalive++;
-			break;
-		case BGP_RREFRESH:
-			parse_rrefresh(p, msg);
-			p->stats.msg_rcvd_rrefresh++;
-			break;
-		default:	/* cannot happen */
-			session_notification_data(p, ERR_HEADER, ERR_HDR_TYPE,
-			    &msgtype, 1);
-			log_peer_warnx(&p->conf,
-			    "received message with unknown type %u", msgtype);
-			bgp_fsm(p, EVNT_CON_FATAL, NULL);
-		}
-		ibuf_free(msg);
-		if (++processed > MSG_PROCESS_LIMIT) {
-			p->rpending = 1;
-			break;
-		}
-	}
-}
-
-struct ibuf *
-parse_header(struct ibuf *msg, void *arg, int *fd)
-{
-	struct peer		*peer = arg;
-	struct ibuf		*b;
-	u_char			 m[MSGSIZE_HEADER_MARKER];
-	uint16_t		 len, maxlen = MAX_PKTSIZE;
-	uint8_t			 type;
-
-	if (ibuf_get(msg, m, sizeof(m)) == -1 ||
-	    ibuf_get_n16(msg, &len) == -1 ||
-	    ibuf_get_n8(msg, &type) == -1)
-		return (NULL);
-	/* caller MUST make sure we are getting 19 bytes! */
-	if (memcmp(m, marker, sizeof(marker))) {
-		log_peer_warnx(&peer->conf, "sync error");
-		session_notification(peer, ERR_HEADER, ERR_HDR_SYNC, NULL);
-		bgp_fsm(peer, EVNT_CON_FATAL, NULL);
-		errno = EINVAL;
-		return (NULL);
-	}
-
-	if (peer->capa.ann.ext_msg)
-		maxlen = MAX_EXT_PKTSIZE;
-
-	if (len < MSGSIZE_HEADER || len > maxlen) {
-		log_peer_warnx(&peer->conf,
-		    "received message: illegal length: %u byte", len);
-		goto badlen;
-	}
-
-	switch (type) {
-	case BGP_OPEN:
-		if (len < MSGSIZE_OPEN_MIN || len > MAX_PKTSIZE) {
-			log_peer_warnx(&peer->conf,
-			    "received OPEN: illegal len: %u byte", len);
-			goto badlen;
-		}
-		break;
-	case BGP_NOTIFICATION:
-		if (len < MSGSIZE_NOTIFICATION_MIN) {
-			log_peer_warnx(&peer->conf,
-			    "received NOTIFICATION: illegal len: %u byte", len);
-			goto badlen;
-		}
-		break;
-	case BGP_UPDATE:
-		if (len < MSGSIZE_UPDATE_MIN) {
-			log_peer_warnx(&peer->conf,
-			    "received UPDATE: illegal len: %u byte", len);
-			goto badlen;
-		}
-		break;
-	case BGP_KEEPALIVE:
-		if (len != MSGSIZE_KEEPALIVE) {
-			log_peer_warnx(&peer->conf,
-			    "received KEEPALIVE: illegal len: %u byte", len);
-			goto badlen;
-		}
-		break;
-	case BGP_RREFRESH:
-		if (len < MSGSIZE_RREFRESH_MIN) {
-			log_peer_warnx(&peer->conf,
-			    "received RREFRESH: illegal len: %u byte", len);
-			goto badlen;
-		}
-		break;
-	default:
-		log_peer_warnx(&peer->conf,
-		    "received msg with unknown type %u", type);
-		session_notification_data(peer, ERR_HEADER, ERR_HDR_TYPE,
-		    &type, sizeof(type));
-		bgp_fsm(peer, EVNT_CON_FATAL, NULL);
-		errno = EINVAL;
-		return (NULL);
-	}
-
-	if ((b = ibuf_open(len)) == NULL)
-		return (NULL);
-	return (b);
-
- badlen:
-	len = htons(len);
-	session_notification_data(peer, ERR_HEADER, ERR_HDR_LEN,
-	    &len, sizeof(len));
-	bgp_fsm(peer, EVNT_CON_FATAL, NULL);
-	errno = ERANGE;
-	return (NULL);
-}
-
-int
-parse_open(struct peer *peer, struct ibuf *msg)
-{
-	uint8_t		 version, rversion;
-	uint16_t	 short_as;
-	uint16_t	 holdtime;
-	uint32_t	 as, bgpid;
-	uint8_t		 optparamlen;
-
-	if (ibuf_get_n8(msg, &version) == -1 ||
-	    ibuf_get_n16(msg, &short_as) == -1 ||
-	    ibuf_get_n16(msg, &holdtime) == -1 ||
-	    ibuf_get_n32(msg, &bgpid) == -1 ||
-	    ibuf_get_n8(msg, &optparamlen) == -1)
-		goto bad_len;
-
-	if (version != BGP_VERSION) {
-		log_peer_warnx(&peer->conf,
-		    "peer wants unrecognized version %u", version);
-		if (version > BGP_VERSION)
-			rversion = version - BGP_VERSION;
-		else
-			rversion = BGP_VERSION;
-		session_notification_data(peer, ERR_OPEN, ERR_OPEN_VERSION,
-		    &rversion, sizeof(rversion));
-		change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
-		return (-1);
-	}
-
-	as = peer->short_as = short_as;
-	if (as == 0) {
-		log_peer_warnx(&peer->conf,
-		    "peer requests unacceptable AS %u", as);
-		session_notification(peer, ERR_OPEN, ERR_OPEN_AS, NULL);
-		change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
-		return (-1);
-	}
-
-	if (holdtime != 0 && holdtime < peer->conf.min_holdtime) {
-		log_peer_warnx(&peer->conf,
-		    "peer requests unacceptable holdtime %u", holdtime);
-		session_notification(peer, ERR_OPEN, ERR_OPEN_HOLDTIME, NULL);
-		change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
-		return (-1);
-	}
-
-	if (holdtime < peer->conf.holdtime)
-		peer->holdtime = holdtime;
-	else
-		peer->holdtime = peer->conf.holdtime;
-
-	/* check bgpid for validity - just disallow 0 */
-	if (bgpid == 0) {
-		log_peer_warnx(&peer->conf, "peer BGPID 0 unacceptable");
-		session_notification(peer, ERR_OPEN, ERR_OPEN_BGPID, NULL);
-		change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
-		return (-1);
-	}
-	peer->remote_bgpid = bgpid;
-
-	if (optparamlen != 0) {
-		struct ibuf oparams, op;
-		uint8_t ext_type, op_type;
-		uint16_t ext_len, op_len;
-
-		ibuf_from_ibuf(&oparams, msg);
-
-		/* check for RFC9072 encoding */
-		if (ibuf_get_n8(&oparams, &ext_type) == -1)
-			goto bad_len;
-		if (ext_type == OPT_PARAM_EXT_LEN) {
-			if (ibuf_get_n16(&oparams, &ext_len) == -1)
-				goto bad_len;
-			/* skip RFC9072 header */
-			if (ibuf_skip(msg, 3) == -1)
-				goto bad_len;
-		} else {
-			ext_len = optparamlen;
-			ibuf_rewind(&oparams);
-		}
-
-		if (ibuf_truncate(&oparams, ext_len) == -1 ||
-		    ibuf_skip(msg, ext_len) == -1)
-			goto bad_len;
-
-		while (ibuf_size(&oparams) > 0) {
-			if (ibuf_get_n8(&oparams, &op_type) == -1)
-				goto bad_len;
-
-			if (ext_type == OPT_PARAM_EXT_LEN) {
-				if (ibuf_get_n16(&oparams, &op_len) == -1)
-					goto bad_len;
-			} else {
-				uint8_t tmp;
-				if (ibuf_get_n8(&oparams, &tmp) == -1)
-					goto bad_len;
-				op_len = tmp;
-			}
-
-			if (ibuf_get_ibuf(&oparams, op_len, &op) == -1)
-				goto bad_len;
-
-			switch (op_type) {
-			case OPT_PARAM_CAPABILITIES:		/* RFC 3392 */
-				if (parse_capabilities(peer, &op, &as) == -1) {
-					session_notification(peer, ERR_OPEN, 0,
-					    NULL);
-					change_state(peer, STATE_IDLE,
-					    EVNT_RCVD_OPEN);
-					return (-1);
-				}
-				break;
-			case OPT_PARAM_AUTH:			/* deprecated */
-			default:
-				/*
-				 * unsupported type
-				 * the RFCs tell us to leave the data section
-				 * empty and notify the peer with ERR_OPEN,
-				 * ERR_OPEN_OPT. How the peer should know
-				 * _which_ optional parameter we don't support
-				 * is beyond me.
-				 */
-				log_peer_warnx(&peer->conf,
-				    "received OPEN message with unsupported "
-				    "optional parameter: type %u", op_type);
-				session_notification(peer, ERR_OPEN,
-				    ERR_OPEN_OPT, NULL);
-				change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
-				return (-1);
-			}
-		}
-	}
-
-	if (ibuf_size(msg) != 0) {
- bad_len:
-		log_peer_warnx(&peer->conf,
-		    "corrupt OPEN message received: length mismatch");
-		session_notification(peer, ERR_OPEN, 0, NULL);
-		change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
-		return (-1);
-	}
-
-	/*
-	 * if remote-as is zero and it's a cloned neighbor, accept any
-	 * but only on the first connect, after that the remote-as needs
-	 * to remain the same.
-	 */
-	if (peer->template && !peer->conf.remote_as && as != AS_TRANS) {
-		peer->conf.remote_as = as;
-		peer->conf.ebgp = (peer->conf.remote_as != peer->conf.local_as);
-		if (!peer->conf.ebgp)
-			/* force enforce_as off for iBGP sessions */
-			peer->conf.enforce_as = ENFORCE_AS_OFF;
-	}
-
-	if (peer->conf.remote_as != as) {
-		log_peer_warnx(&peer->conf, "peer sent wrong AS %s",
-		    log_as(as));
-		session_notification(peer, ERR_OPEN, ERR_OPEN_AS, NULL);
-		change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
-		return (-1);
-	}
-
-	/* on iBGP sessions check for bgpid collision */
-	if (!peer->conf.ebgp && peer->remote_bgpid == conf->bgpid) {
-		struct in_addr ina;
-		ina.s_addr = htonl(bgpid);
-		log_peer_warnx(&peer->conf, "peer BGPID %s conflicts with ours",
-		    inet_ntoa(ina));
-		session_notification(peer, ERR_OPEN, ERR_OPEN_BGPID, NULL);
-		change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
-		return (-1);
-	}
-
-	if (capa_neg_calc(peer) == -1) {
-		change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
-		return (-1);
-	}
-
-	return (0);
-}
-
-int
-parse_update(struct peer *peer, struct ibuf *msg)
-{
-	/*
-	 * we pass the message verbatim to the rde.
-	 * in case of errors the whole session is reset with a
-	 * notification anyway, we only need to know the peer
-	 */
-	if (imsg_rde(IMSG_UPDATE, peer->conf.id, ibuf_data(msg),
-	    ibuf_size(msg)) == -1)
-		return (-1);
-
-	return (0);
-}
-
-int
-parse_rrefresh(struct peer *peer, struct ibuf *msg)
-{
-	struct route_refresh rr;
-	uint16_t afi, datalen;
-	uint8_t aid, safi, subtype;
-
-	datalen = ibuf_size(msg) + MSGSIZE_HEADER;
-
-	if (ibuf_get_n16(msg, &afi) == -1 ||
-	    ibuf_get_n8(msg, &subtype) == -1 ||
-	    ibuf_get_n8(msg, &safi) == -1) {
-		/* minimum size checked in session_process_msg() */
-		fatalx("%s: message too small", __func__);
-	}
-
-	/* check subtype if peer announced enhanced route refresh */
-	if (peer->capa.neg.enhanced_rr) {
-		switch (subtype) {
-		case ROUTE_REFRESH_REQUEST:
-			/* no ORF support, so no oversized RREFRESH msgs */
-			if (datalen != MSGSIZE_RREFRESH) {
-				log_peer_warnx(&peer->conf,
-				    "received RREFRESH: illegal len: %u byte",
-				    datalen);
-				datalen = htons(datalen);
-				session_notification_data(peer, ERR_HEADER,
-				    ERR_HDR_LEN, &datalen, sizeof(datalen));
-				bgp_fsm(peer, EVNT_CON_FATAL, NULL);
-				return (-1);
-			}
-			peer->stats.refresh_rcvd_req++;
-			break;
-		case ROUTE_REFRESH_BEGIN_RR:
-		case ROUTE_REFRESH_END_RR:
-			/* special handling for RFC7313 */
-			if (datalen != MSGSIZE_RREFRESH) {
-				log_peer_warnx(&peer->conf,
-				    "received RREFRESH: illegal len: %u byte",
-				    datalen);
-				ibuf_rewind(msg);
-				session_notification(peer, ERR_RREFRESH,
-				    ERR_RR_INV_LEN, msg);
-				bgp_fsm(peer, EVNT_CON_FATAL, NULL);
-				return (-1);
-			}
-			if (subtype == ROUTE_REFRESH_BEGIN_RR)
-				peer->stats.refresh_rcvd_borr++;
-			else
-				peer->stats.refresh_rcvd_eorr++;
-			break;
-		default:
-			log_peer_warnx(&peer->conf, "peer sent bad refresh, "
-			    "bad subtype %d", subtype);
-			return (0);
-		}
-	} else {
-		/* force subtype to default */
-		subtype = ROUTE_REFRESH_REQUEST;
-		peer->stats.refresh_rcvd_req++;
-	}
-
-	/* afi/safi unchecked -	unrecognized values will be ignored anyway */
-	if (afi2aid(afi, safi, &aid) == -1) {
-		log_peer_warnx(&peer->conf, "peer sent bad refresh, "
-		    "invalid afi/safi pair");
-		return (0);
-	}
-
-	if (!peer->capa.neg.refresh && !peer->capa.neg.enhanced_rr) {
-		log_peer_warnx(&peer->conf, "peer sent unexpected refresh");
-		return (0);
-	}
-
-	rr.aid = aid;
-	rr.subtype = subtype;
-
-	if (imsg_rde(IMSG_REFRESH, peer->conf.id, &rr, sizeof(rr)) == -1)
-		return (-1);
-
-	return (0);
-}
-
-void
-parse_notification(struct peer *peer, struct ibuf *msg)
-{
-	const char		*reason = "received";
-	uint8_t			 errcode, subcode;
-	uint8_t			 reason_len;
-	enum session_events	 event = EVNT_RCVD_NOTIFICATION;
-
-	if (ibuf_get_n8(msg, &errcode) == -1 ||
-	    ibuf_get_n8(msg, &subcode) == -1) {
-		log_peer_warnx(&peer->conf, "received bad notification");
-		goto done;
-	}
-
-	/* RFC8538: check for hard-reset or graceful notification */
-	if (peer->capa.neg.grestart.grnotification) {
-		if (errcode == ERR_CEASE && subcode == ERR_CEASE_HARD_RESET) {
-			if (ibuf_get_n8(msg, &errcode) == -1 ||
-			    ibuf_get_n8(msg, &subcode) == -1) {
-				log_peer_warnx(&peer->conf,
-				    "received bad hard-reset notification");
-				goto done;
-			}
-			reason = "received hard-reset";
-		} else {
-			reason = "received graceful";
-			event = EVNT_RCVD_GRACE_NOTIFICATION;
-		}
-	}
-
-	peer->errcnt++;
-	peer->stats.last_rcvd_errcode = errcode;
-	peer->stats.last_rcvd_suberr = subcode;
-
-	log_notification(peer, errcode, subcode, msg, reason);
-
-	CTASSERT(sizeof(peer->stats.last_reason) > UINT8_MAX);
-	memset(peer->stats.last_reason, 0, sizeof(peer->stats.last_reason));
-	if (errcode == ERR_CEASE &&
-	    (subcode == ERR_CEASE_ADMIN_DOWN ||
-	     subcode == ERR_CEASE_ADMIN_RESET)) {
-		/* check if shutdown reason is included */
-		if (ibuf_get_n8(msg, &reason_len) != -1 && reason_len != 0) {
-			if (ibuf_get(msg, peer->stats.last_reason,
-			    reason_len) == -1)
-				log_peer_warnx(&peer->conf,
-				    "received truncated shutdown reason");
-		}
-	}
-
-done:
-	change_state(peer, STATE_IDLE, event);
-}
-
-int
-parse_capabilities(struct peer *peer, struct ibuf *buf, uint32_t *as)
-{
-	struct ibuf	 capabuf;
-	uint16_t	 afi, nhafi, gr_header;
-	uint8_t		 capa_code, capa_len;
-	uint8_t		 safi, aid, role, flags;
-
-	while (ibuf_size(buf) > 0) {
-		if (ibuf_get_n8(buf, &capa_code) == -1 ||
-		    ibuf_get_n8(buf, &capa_len) == -1) {
-			log_peer_warnx(&peer->conf, "Bad capabilities attr "
-			    "length: too short");
-			return (-1);
-		}
-		if (ibuf_get_ibuf(buf, capa_len, &capabuf) == -1) {
-			log_peer_warnx(&peer->conf,
-			    "Received bad capabilities attr length: "
-			    "len %zu smaller than capa_len %u",
-			    ibuf_size(buf), capa_len);
-			return (-1);
-		}
-
-		switch (capa_code) {
-		case CAPA_MP:			/* RFC 4760 */
-			if (capa_len != 4 ||
-			    ibuf_get_n16(&capabuf, &afi) == -1 ||
-			    ibuf_skip(&capabuf, 1) == -1 ||
-			    ibuf_get_n8(&capabuf, &safi) == -1) {
-				log_peer_warnx(&peer->conf,
-				    "Received bad multi protocol capability");
-				break;
-			}
-			if (afi2aid(afi, safi, &aid) == -1) {
-				log_peer_warnx(&peer->conf,
-				    "Received multi protocol capability: "
-				    " unknown AFI %u, safi %u pair",
-				    afi, safi);
-				peer->capa.peer.mp[AID_UNSPEC] = 1;
-				break;
-			}
-			peer->capa.peer.mp[aid] = 1;
-			break;
-		case CAPA_REFRESH:
-			peer->capa.peer.refresh = 1;
-			break;
-		case CAPA_EXT_NEXTHOP:
-			while (ibuf_size(&capabuf) > 0) {
-				uint16_t tmp16;
-				if (ibuf_get_n16(&capabuf, &afi) == -1 ||
-				    ibuf_get_n16(&capabuf, &tmp16) == -1 ||
-				    ibuf_get_n16(&capabuf, &nhafi) == -1) {
-					log_peer_warnx(&peer->conf,
-					    "Received bad %s capability",
-					    log_capability(CAPA_EXT_NEXTHOP));
-					memset(peer->capa.peer.ext_nh, 0,
-					    sizeof(peer->capa.peer.ext_nh));
-					break;
-				}
-				safi = tmp16;
-				if (afi2aid(afi, safi, &aid) == -1 ||
-				    !(aid == AID_INET || aid == AID_VPN_IPv4)) {
-					log_peer_warnx(&peer->conf,
-					    "Received %s capability: "
-					    " unsupported AFI %u, safi %u pair",
-					    log_capability(CAPA_EXT_NEXTHOP),
-					    afi, safi);
-					continue;
-				}
-				if (nhafi != AFI_IPv6) {
-					log_peer_warnx(&peer->conf,
-					    "Received %s capability: "
-					    " unsupported nexthop AFI %u",
-					    log_capability(CAPA_EXT_NEXTHOP),
-					    nhafi);
-					continue;
-				}
-				peer->capa.peer.ext_nh[aid] = 1;
-			}
-			break;
-		case CAPA_EXT_MSG:
-			peer->capa.peer.ext_msg = 1;
-			break;
-		case CAPA_ROLE:
-			if (capa_len != 1 ||
-			    ibuf_get_n8(&capabuf, &role) == -1) {
-				log_peer_warnx(&peer->conf,
-				    "Received bad role capability");
-				break;
-			}
-			if (!peer->conf.ebgp) {
-				log_peer_warnx(&peer->conf,
-				    "Received role capability on iBGP session");
-				break;
-			}
-			peer->capa.peer.policy = 1;
-			peer->remote_role = capa2role(role);
-			break;
-		case CAPA_RESTART:
-			if (capa_len == 2) {
-				/* peer only supports EoR marker */
-				peer->capa.peer.grestart.restart = 1;
-				peer->capa.peer.grestart.timeout = 0;
-				break;
-			} else if (capa_len % 4 != 2) {
-				log_peer_warnx(&peer->conf,
-				    "Bad graceful restart capability");
-				peer->capa.peer.grestart.restart = 0;
-				peer->capa.peer.grestart.timeout = 0;
-				break;
-			}
-
-			if (ibuf_get_n16(&capabuf, &gr_header) == -1) {
- bad_gr_restart:
-				log_peer_warnx(&peer->conf,
-				    "Bad graceful restart capability");
-				peer->capa.peer.grestart.restart = 0;
-				peer->capa.peer.grestart.timeout = 0;
-				break;
-			}
-
-			peer->capa.peer.grestart.timeout =
-			    gr_header & CAPA_GR_TIMEMASK;
-			if (peer->capa.peer.grestart.timeout == 0) {
-				log_peer_warnx(&peer->conf, "Received "
-				    "graceful restart with zero timeout");
-				peer->capa.peer.grestart.restart = 0;
-				break;
-			}
-
-			while (ibuf_size(&capabuf) > 0) {
-				if (ibuf_get_n16(&capabuf, &afi) == -1 ||
-				    ibuf_get_n8(&capabuf, &safi) == -1 ||
-				    ibuf_get_n8(&capabuf, &flags) == -1)
-					goto bad_gr_restart;
-				if (afi2aid(afi, safi, &aid) == -1) {
-					log_peer_warnx(&peer->conf,
-					    "Received graceful restart capa: "
-					    " unknown AFI %u, safi %u pair",
-					    afi, safi);
-					continue;
-				}
-				peer->capa.peer.grestart.flags[aid] |=
-				    CAPA_GR_PRESENT;
-				if (flags & CAPA_GR_F_FLAG)
-					peer->capa.peer.grestart.flags[aid] |=
-					    CAPA_GR_FORWARD;
-				if (gr_header & CAPA_GR_R_FLAG)
-					peer->capa.peer.grestart.flags[aid] |=
-					    CAPA_GR_RESTART;
-				peer->capa.peer.grestart.restart = 2;
-			}
-			if (gr_header & CAPA_GR_N_FLAG)
-				peer->capa.peer.grestart.grnotification = 1;
-			break;
-		case CAPA_AS4BYTE:
-			if (capa_len != 4 ||
-			    ibuf_get_n32(&capabuf, as) == -1) {
-				log_peer_warnx(&peer->conf,
-				    "Received bad AS4BYTE capability");
-				peer->capa.peer.as4byte = 0;
-				break;
-			}
-			if (*as == 0) {
-				log_peer_warnx(&peer->conf,
-				    "peer requests unacceptable AS %u", *as);
-				session_notification(peer, ERR_OPEN,
-				    ERR_OPEN_AS, NULL);
-				change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
-				return (-1);
-			}
-			peer->capa.peer.as4byte = 1;
-			break;
-		case CAPA_ADD_PATH:
-			if (capa_len % 4 != 0) {
- bad_add_path:
-				log_peer_warnx(&peer->conf,
-				    "Received bad ADD-PATH capability");
-				memset(peer->capa.peer.add_path, 0,
-				    sizeof(peer->capa.peer.add_path));
-				break;
-			}
-			while (ibuf_size(&capabuf) > 0) {
-				if (ibuf_get_n16(&capabuf, &afi) == -1 ||
-				    ibuf_get_n8(&capabuf, &safi) == -1 ||
-				    ibuf_get_n8(&capabuf, &flags) == -1)
-					goto bad_add_path;
-				if (afi2aid(afi, safi, &aid) == -1) {
-					log_peer_warnx(&peer->conf,
-					    "Received ADD-PATH capa: "
-					    " unknown AFI %u, safi %u pair",
-					    afi, safi);
-					memset(peer->capa.peer.add_path, 0,
-					    sizeof(peer->capa.peer.add_path));
-					break;
-				}
-				if (flags & ~CAPA_AP_BIDIR) {
-					log_peer_warnx(&peer->conf,
-					    "Received ADD-PATH capa: "
-					    " bad flags %x", flags);
-					memset(peer->capa.peer.add_path, 0,
-					    sizeof(peer->capa.peer.add_path));
-					break;
-				}
-				peer->capa.peer.add_path[aid] = flags;
-			}
-			break;
-		case CAPA_ENHANCED_RR:
-			peer->capa.peer.enhanced_rr = 1;
-			break;
-		default:
-			break;
-		}
-	}
-
-	return (0);
-}
-
-int
-capa_neg_calc(struct peer *p)
-{
-	struct ibuf *ebuf;
-	uint8_t	i, hasmp = 0, capa_code, capa_len, capa_aid = 0;
-
-	/* a capability is accepted only if both sides announced it */
-
-	p->capa.neg.refresh =
-	    (p->capa.ann.refresh && p->capa.peer.refresh) != 0;
-	p->capa.neg.enhanced_rr =
-	    (p->capa.ann.enhanced_rr && p->capa.peer.enhanced_rr) != 0;
-	p->capa.neg.as4byte =
-	    (p->capa.ann.as4byte && p->capa.peer.as4byte) != 0;
-	p->capa.neg.ext_msg =
-	    (p->capa.ann.ext_msg && p->capa.peer.ext_msg) != 0;
-
-	/* MP: both side must agree on the AFI,SAFI pair */
-	if (p->capa.peer.mp[AID_UNSPEC])
-		hasmp = 1;
-	for (i = AID_MIN; i < AID_MAX; i++) {
-		if (p->capa.ann.mp[i] && p->capa.peer.mp[i])
-			p->capa.neg.mp[i] = 1;
-		else
-			p->capa.neg.mp[i] = 0;
-		if (p->capa.ann.mp[i] || p->capa.peer.mp[i])
-			hasmp = 1;
-	}
-	/* if no MP capability present default to IPv4 unicast mode */
-	if (!hasmp)
-		p->capa.neg.mp[AID_INET] = 1;
-
-	/*
-	 * graceful restart: the peer capabilities are of interest here.
-	 * It is necessary to compare the new values with the previous ones
-	 * and act accordingly. AFI/SAFI that are not part in the MP capability
-	 * are treated as not being present.
-	 * Also make sure that a flush happens if the session stopped
-	 * supporting graceful restart.
-	 */
-
-	for (i = AID_MIN; i < AID_MAX; i++) {
-		int8_t	negflags;
-
-		/* disable GR if the AFI/SAFI is not present */
-		if ((p->capa.peer.grestart.flags[i] & CAPA_GR_PRESENT &&
-		    p->capa.neg.mp[i] == 0))
-			p->capa.peer.grestart.flags[i] = 0;	/* disable */
-		/* look at current GR state and decide what to do */
-		negflags = p->capa.neg.grestart.flags[i];
-		p->capa.neg.grestart.flags[i] = p->capa.peer.grestart.flags[i];
-		if (negflags & CAPA_GR_RESTARTING) {
-			if (p->capa.ann.grestart.restart != 0 &&
-			    p->capa.peer.grestart.flags[i] & CAPA_GR_FORWARD) {
-				p->capa.neg.grestart.flags[i] |=
-				    CAPA_GR_RESTARTING;
-			} else {
-				if (imsg_rde(IMSG_SESSION_FLUSH, p->conf.id,
-				    &i, sizeof(i)) == -1) {
-					log_peer_warnx(&p->conf,
-					    "imsg send failed");
-					return (-1);
-				}
-				log_peer_warnx(&p->conf, "graceful restart of "
-				    "%s, not restarted, flushing", aid2str(i));
-			}
-		}
-	}
-	p->capa.neg.grestart.timeout = p->capa.peer.grestart.timeout;
-	p->capa.neg.grestart.restart = p->capa.peer.grestart.restart;
-	if (p->capa.ann.grestart.restart == 0)
-		p->capa.neg.grestart.restart = 0;
-
-	/* RFC 8538 graceful notification: both sides need to agree */
-	p->capa.neg.grestart.grnotification =
-	    (p->capa.ann.grestart.grnotification &&
-	    p->capa.peer.grestart.grnotification) != 0;
-
-	/* RFC 8950 extended nexthop encoding: both sides need to agree */
-	memset(p->capa.neg.ext_nh, 0, sizeof(p->capa.neg.ext_nh));
-	for (i = AID_MIN; i < AID_MAX; i++) {
-		if (p->capa.neg.mp[i] == 0)
-			continue;
-		if (p->capa.ann.ext_nh[i] && p->capa.peer.ext_nh[i]) {
-			p->capa.neg.ext_nh[i] = 1;
-		}
-	}
-
-	/*
-	 * ADD-PATH: set only those bits where both sides agree.
-	 * For this compare our send bit with the recv bit from the peer
-	 * and vice versa.
-	 * The flags are stored from this systems view point.
-	 * At index 0 the flags are set if any per-AID flag is set.
-	 */
-	memset(p->capa.neg.add_path, 0, sizeof(p->capa.neg.add_path));
-	for (i = AID_MIN; i < AID_MAX; i++) {
-		if (p->capa.neg.mp[i] == 0)
-			continue;
-		if ((p->capa.ann.add_path[i] & CAPA_AP_RECV) &&
-		    (p->capa.peer.add_path[i] & CAPA_AP_SEND)) {
-			p->capa.neg.add_path[i] |= CAPA_AP_RECV;
-			p->capa.neg.add_path[0] |= CAPA_AP_RECV;
-		}
-		if ((p->capa.ann.add_path[i] & CAPA_AP_SEND) &&
-		    (p->capa.peer.add_path[i] & CAPA_AP_RECV)) {
-			p->capa.neg.add_path[i] |= CAPA_AP_SEND;
-			p->capa.neg.add_path[0] |= CAPA_AP_SEND;
-		}
-	}
-
-	/*
-	 * Open policy: check that the policy is sensible.
-	 *
-	 * Make sure that the roles match and set the negotiated capability
-	 * to the role of the peer. So the RDE can inject the OTC attribute.
-	 * See RFC 9234, section 4.2.
-	 * These checks should only happen on ebgp sessions.
-	 */
-	if (p->capa.ann.policy != 0 && p->capa.peer.policy != 0 &&
-	    p->conf.ebgp) {
-		switch (p->conf.role) {
-		case ROLE_PROVIDER:
-			if (p->remote_role != ROLE_CUSTOMER)
-				goto policyfail;
-			break;
-		case ROLE_RS:
-			if (p->remote_role != ROLE_RS_CLIENT)
-				goto policyfail;
-			break;
-		case ROLE_RS_CLIENT:
-			if (p->remote_role != ROLE_RS)
-				goto policyfail;
-			break;
-		case ROLE_CUSTOMER:
-			if (p->remote_role != ROLE_PROVIDER)
-				goto policyfail;
-			break;
-		case ROLE_PEER:
-			if (p->remote_role != ROLE_PEER)
-				goto policyfail;
-			break;
-		default:
- policyfail:
-			log_peer_warnx(&p->conf, "open policy role mismatch: "
-			    "our role %s, their role %s",
-			    log_policy(p->conf.role),
-			    log_policy(p->remote_role));
-			session_notification(p, ERR_OPEN, ERR_OPEN_ROLE, NULL);
-			return (-1);
-		}
-		p->capa.neg.policy = 1;
-	}
-
-	/* enforce presence of open policy role capability */
-	if (p->capa.ann.policy == 2 && p->capa.peer.policy == 0 &&
-	    p->conf.ebgp) {
-		log_peer_warnx(&p->conf, "open policy role enforced but "
-		    "not present");
-		session_notification(p, ERR_OPEN, ERR_OPEN_ROLE, NULL);
-		return (-1);
-	}
-
-	/* enforce presence of other capabilities */
-	if (p->capa.ann.refresh == 2 && p->capa.neg.refresh == 0) {
-		capa_code = CAPA_REFRESH;
-		capa_len = 0;
-		goto fail;
-	}
-	/* enforce presence of other capabilities */
-	if (p->capa.ann.ext_msg == 2 && p->capa.neg.ext_msg == 0) {
-		capa_code = CAPA_EXT_MSG;
-		capa_len = 0;
-		goto fail;
-	}
-	if (p->capa.ann.enhanced_rr == 2 && p->capa.neg.enhanced_rr == 0) {
-		capa_code = CAPA_ENHANCED_RR;
-		capa_len = 0;
-		goto fail;
-	}
-	if (p->capa.ann.as4byte == 2 && p->capa.neg.as4byte == 0) {
-		capa_code = CAPA_AS4BYTE;
-		capa_len = 4;
-		goto fail;
-	}
-	if (p->capa.ann.grestart.restart == 2 &&
-	    p->capa.neg.grestart.restart == 0) {
-		capa_code = CAPA_RESTART;
-		capa_len = 2;
-		goto fail;
-	}
-	for (i = AID_MIN; i < AID_MAX; i++) {
-		if (p->capa.ann.mp[i] == 2 && p->capa.neg.mp[i] == 0) {
-			capa_code = CAPA_MP;
-			capa_len = 4;
-			capa_aid = i;
-			goto fail;
-		}
-	}
-
-	for (i = AID_MIN; i < AID_MAX; i++) {
-		if (p->capa.neg.mp[i] == 0)
-			continue;
-		if ((p->capa.ann.add_path[i] & CAPA_AP_RECV_ENFORCE) &&
-		    (p->capa.neg.add_path[i] & CAPA_AP_RECV) == 0) {
-			capa_code = CAPA_ADD_PATH;
-			capa_len = 4;
-			capa_aid = i;
-			goto fail;
-		}
-		if ((p->capa.ann.add_path[i] & CAPA_AP_SEND_ENFORCE) &&
-		    (p->capa.neg.add_path[i] & CAPA_AP_SEND) == 0) {
-			capa_code = CAPA_ADD_PATH;
-			capa_len = 4;
-			capa_aid = i;
-			goto fail;
-		}
-	}
-
-	for (i = AID_MIN; i < AID_MAX; i++) {
-		if (p->capa.neg.mp[i] == 0)
-			continue;
-		if (p->capa.ann.ext_nh[i] == 2 &&
-		    p->capa.neg.ext_nh[i] == 0) {
-			capa_code = CAPA_EXT_NEXTHOP;
-			capa_len = 6;
-			capa_aid = i;
-			goto fail;
-		}
-	}
-	return (0);
-
- fail:
-	if ((ebuf = ibuf_dynamic(2, 256)) == NULL)
-		return (-1);
-	/* best effort, no problem if it fails */
-	session_capa_add(ebuf, capa_code, capa_len);
-	if (capa_code == CAPA_MP)
-		session_capa_add_mp(ebuf, capa_aid);
-	else if (capa_code == CAPA_ADD_PATH)
-		session_capa_add_afi(ebuf, capa_aid, 0);
-	else if (capa_code == CAPA_EXT_NEXTHOP)
-		session_capa_add_ext_nh(ebuf, capa_aid);
-	else if (capa_len > 0)
-		ibuf_add_zero(ebuf, capa_len);
-
-	session_notification(p, ERR_OPEN, ERR_OPEN_CAPA, ebuf);
-	ibuf_free(ebuf);
-	return (-1);
-}
-
-void
-session_mrt_dump_state(struct peer *p, enum session_state oldstate,
-    enum session_state newstate)
+session_mrt_dump_state(struct peer *p, enum session_state oldstate,
+    enum session_state newstate)
 {
 	struct mrt		*mrt;
 
@@ -3050,6 +1153,41 @@ session_mrt_dump_bgp_msg(struct peer *p,
 	}
 }
 
+static int
+la_cmp(struct listen_addr *a, struct listen_addr *b)
+{
+	struct sockaddr_in	*in_a, *in_b;
+	struct sockaddr_in6	*in6_a, *in6_b;
+
+	if (a->sa.ss_family != b->sa.ss_family)
+		return (1);
+
+	switch (a->sa.ss_family) {
+	case AF_INET:
+		in_a = (struct sockaddr_in *)&a->sa;
+		in_b = (struct sockaddr_in *)&b->sa;
+		if (in_a->sin_addr.s_addr != in_b->sin_addr.s_addr)
+			return (1);
+		if (in_a->sin_port != in_b->sin_port)
+			return (1);
+		break;
+	case AF_INET6:
+		in6_a = (struct sockaddr_in6 *)&a->sa;
+		in6_b = (struct sockaddr_in6 *)&b->sa;
+		if (memcmp(&in6_a->sin6_addr, &in6_b->sin6_addr,
+		    sizeof(struct in6_addr)))
+			return (1);
+		if (in6_a->sin6_port != in6_b->sin6_port)
+			return (1);
+		break;
+	default:
+		fatal("king bula sez: unknown address family");
+		/* NOTREACHED */
+	}
+
+	return (0);
+}
+
 void
 session_dispatch_imsg(struct imsgbuf *imsgbuf, int idx, u_int *listener_cnt)
 {
@@ -3467,41 +1605,6 @@ session_dispatch_imsg(struct imsgbuf *im
 	}
 }
 
-int
-la_cmp(struct listen_addr *a, struct listen_addr *b)
-{
-	struct sockaddr_in	*in_a, *in_b;
-	struct sockaddr_in6	*in6_a, *in6_b;
-
-	if (a->sa.ss_family != b->sa.ss_family)
-		return (1);
-
-	switch (a->sa.ss_family) {
-	case AF_INET:
-		in_a = (struct sockaddr_in *)&a->sa;
-		in_b = (struct sockaddr_in *)&b->sa;
-		if (in_a->sin_addr.s_addr != in_b->sin_addr.s_addr)
-			return (1);
-		if (in_a->sin_port != in_b->sin_port)
-			return (1);
-		break;
-	case AF_INET6:
-		in6_a = (struct sockaddr_in6 *)&a->sa;
-		in6_b = (struct sockaddr_in6 *)&b->sa;
-		if (memcmp(&in6_a->sin6_addr, &in6_b->sin6_addr,
-		    sizeof(struct in6_addr)))
-			return (1);
-		if (in6_a->sin6_port != in6_b->sin6_port)
-			return (1);
-		break;
-	default:
-		fatal("king bula sez: unknown address family");
-		/* NOTREACHED */
-	}
-
-	return (0);
-}
-
 struct peer *
 getpeerbydesc(struct bgpd_config *c, const char *descr)
 {
@@ -3765,6 +1868,15 @@ session_demote(struct peer *p, int level
 }
 
 void
+session_md5_reload(struct peer *p)
+{
+	if (!p->template)
+		if (imsg_compose(ibuf_main, IMSG_PFKEY_RELOAD,
+		    p->conf.id, 0, -1, NULL, 0) == -1)
+			fatalx("imsg_compose error");
+}
+
+void
 session_stop(struct peer *peer, uint8_t subcode, const char *reason)
 {
 	struct ibuf *ibuf;
@@ -3853,6 +1965,8 @@ merge_peers(struct bgpd_config *c, struc
 			p->conf.holdtime = conf->holdtime;
 		if (p->conf.min_holdtime == 0)
 			p->conf.min_holdtime = conf->min_holdtime;
+		p->connectretry = conf->connectretry;
+		p->local_bgpid = conf->bgpid;
 
 		/* had demotion, is demoted, demote removed? */
 		if (p->demoted && !p->conf.demote_group[0])
Index: session.h
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/session.h,v
diff -u -p -r1.187 session.h
--- session.h	20 Feb 2025 19:47:31 -0000	1.187
+++ session.h	25 Feb 2025 14:17:48 -0000
@@ -216,6 +216,7 @@ struct peer {
 	u_int			 errcnt;
 	u_int			 IdleHoldTime;
 	unsigned int		 if_scope;	/* interface scope for IPv6 */
+	uint32_t		 local_bgpid;
 	uint32_t		 remote_bgpid;
 	enum session_state	 state;
 	enum session_state	 prev_state;
@@ -225,6 +226,7 @@ struct peer {
 	uint16_t		 holdtime;
 	uint16_t		 local_port;
 	uint16_t		 remote_port;
+	uint16_t		 connectretry;
 	uint8_t			 depend_ok;
 	uint8_t			 demoted;
 	uint8_t			 passive;
@@ -328,11 +330,16 @@ void	rtr_recalc(void);
 RB_PROTOTYPE(peer_head, peer, entry, peer_compare);
 
 void		 session_main(int, int);
-void		 bgp_fsm(struct peer *, enum session_events, struct ibuf *);
 int		 session_neighbor_rrefresh(struct peer *p);
+void		 get_alternate_addr(struct bgpd_addr *, struct bgpd_addr *,
+		    struct bgpd_addr *, unsigned int *);
 struct peer	*getpeerbydesc(struct bgpd_config *, const char *);
 struct peer	*getpeerbyip(struct bgpd_config *, struct sockaddr *);
 struct peer	*getpeerbyid(struct bgpd_config *, uint32_t);
+int		 session_hanlde_update(struct peer *, struct ibuf *);
+int		 session_handle_rrefresh(struct peer *, struct route_refresh *);
+int		 session_graceful_restart(struct peer *);
+int		 session_graceful_flush(struct peer *, uint8_t, const char *);
 void		 session_mrt_dump_state(struct peer *, enum session_state,
 		    enum session_state);
 void		 session_mrt_dump_bgp_msg(struct peer *, struct ibuf *,
@@ -341,8 +348,31 @@ int		 peer_matched(struct peer *, struct
 int		 imsg_ctl_parent(struct imsg *);
 int		 imsg_ctl_rde(struct imsg *);
 int		 imsg_ctl_rde_msg(int, uint32_t, pid_t);
+int		 session_connect(struct peer *);
+void		 session_close(struct peer *);
+void		 session_up(struct peer *);
+void		 session_down(struct peer *);
+void		 session_demote(struct peer *, int);
+void		 session_md5_reload(struct peer *);
 void		 session_stop(struct peer *, uint8_t, const char *);
 struct bgpd_addr *session_localaddr(struct peer *);
+
+/* session_bgp.c */
+void	session_open(struct peer *);
+void	session_keepalive(struct peer *);
+void	session_update(struct peer *, struct ibuf *);
+void	session_notification(struct peer *, uint8_t, uint8_t, struct ibuf *);
+void	session_notification_data(struct peer *, uint8_t, uint8_t, void *,
+	    size_t);
+void	session_rrefresh(struct peer *, uint8_t, uint8_t);
+int	session_dispatch_msg(struct pollfd *, struct peer *);
+void	session_process_msg(struct peer *);
+
+struct ibuf	*parse_header(struct ibuf *, void *, int *);
+
+void	start_timer_sendholdtime(struct peer *);
+void	bgp_fsm(struct peer *, enum session_events, struct ibuf *);
+void    change_state(struct peer *, enum session_state, enum session_events);
 
 /* timer.c */
 struct timer	*timer_get(struct timer_head *, enum Timer);
Index: session_bgp.c
===================================================================
RCS file: session_bgp.c
diff -N session_bgp.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ session_bgp.c	25 Feb 2025 14:16:47 -0000
@@ -0,0 +1,1930 @@
+/*	$OpenBSD$ */
+
+/*
+ * Copyright (c) 2004 - 2025 Claudio Jeker <claudio@openbsd.org>
+ * Copyright (c) 2003, 2004, 2005 Henning Brauer <henning@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <sys/types.h>
+
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "bgpd.h"
+#include "session.h"
+#include "log.h"
+
+static void	start_timer_holdtime(struct peer *);
+static void	start_timer_keepalive(struct peer *);
+struct ibuf	*session_newmsg(enum msg_type, uint16_t);
+void	session_sendmsg(struct ibuf *, struct peer *, enum msg_type);
+void	session_open(struct peer *);
+void	session_keepalive(struct peer *);
+void	session_update(struct peer *, struct ibuf *);
+void	session_notification(struct peer *, uint8_t, uint8_t, struct ibuf *);
+void	session_notification_data(struct peer *, uint8_t, uint8_t, void *,
+	    size_t);
+void	session_rrefresh(struct peer *, uint8_t, uint8_t);
+int	capa_neg_calc(struct peer *);
+
+static const uint8_t	 marker[MSGSIZE_HEADER_MARKER] = {
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+};
+
+struct ibuf *
+session_newmsg(enum msg_type msgtype, uint16_t len)
+{
+	struct ibuf		*buf;
+	int			 errs = 0;
+
+	if ((buf = ibuf_open(len)) == NULL)
+		return (NULL);
+
+	errs += ibuf_add(buf, marker, sizeof(marker));
+	errs += ibuf_add_n16(buf, len);
+	errs += ibuf_add_n8(buf, msgtype);
+
+	if (errs) {
+		ibuf_free(buf);
+		return (NULL);
+	}
+
+	return (buf);
+}
+
+void
+session_sendmsg(struct ibuf *msg, struct peer *p, enum msg_type msgtype)
+{
+	session_mrt_dump_bgp_msg(p, msg, msgtype, DIR_OUT);
+
+	ibuf_close(p->wbuf, msg);
+}
+
+/*
+ * Translate between internal roles and the value expected by RFC 9234.
+ */
+static uint8_t
+role2capa(enum role role)
+{
+	switch (role) {
+	case ROLE_CUSTOMER:
+		return CAPA_ROLE_CUSTOMER;
+	case ROLE_PROVIDER:
+		return CAPA_ROLE_PROVIDER;
+	case ROLE_RS:
+		return CAPA_ROLE_RS;
+	case ROLE_RS_CLIENT:
+		return CAPA_ROLE_RS_CLIENT;
+	case ROLE_PEER:
+		return CAPA_ROLE_PEER;
+	default:
+		fatalx("Unsupported role for role capability");
+	}
+}
+
+static enum role
+capa2role(uint8_t val)
+{
+	switch (val) {
+	case CAPA_ROLE_PROVIDER:
+		return ROLE_PROVIDER;
+	case CAPA_ROLE_RS:
+		return ROLE_RS;
+	case CAPA_ROLE_RS_CLIENT:
+		return ROLE_RS_CLIENT;
+	case CAPA_ROLE_CUSTOMER:
+		return ROLE_CUSTOMER;
+	case CAPA_ROLE_PEER:
+		return ROLE_PEER;
+	default:
+		return ROLE_NONE;
+	}
+}
+
+static int
+session_capa_add(struct ibuf *opb, uint8_t capa_code, uint8_t capa_len)
+{
+	int errs = 0;
+
+	errs += ibuf_add_n8(opb, capa_code);
+	errs += ibuf_add_n8(opb, capa_len);
+	return (errs);
+}
+
+/* Append the 4-byte MP capability value (AFI, zero byte, SAFI) for aid. */
+static int
+session_capa_add_mp(struct ibuf *buf, uint8_t aid)
+{
+	uint16_t		 afi;
+	uint8_t			 safi;
+	int			 errs = 0;
+
+	if (aid2afi(aid, &afi, &safi) == -1) {
+		log_warnx("%s: bad AID", __func__);
+		return (-1);
+	}
+
+	errs += ibuf_add_n16(buf, afi);
+	errs += ibuf_add_zero(buf, 1);
+	errs += ibuf_add_n8(buf, safi);
+	return (errs);
+}
+
+/* Append AFI, SAFI and a flags byte for aid (used for ADD-PATH tuples). */
+static int
+session_capa_add_afi(struct ibuf *b, uint8_t aid, uint8_t flags)
+{
+	int		errs = 0;
+	uint16_t	afi;
+	uint8_t		safi;
+
+	if (aid2afi(aid, &afi, &safi)) {
+		log_warnx("%s: bad AID", __func__);
+		return (-1);
+	}
+
+	errs += ibuf_add_n16(b, afi);
+	errs += ibuf_add_n8(b, safi);
+	errs += ibuf_add_n8(b, flags);
+	return (errs);
+}
+
+/* Append an extended nexthop tuple (AFI, SAFI, nexthop AFI_IPv6) for aid. */
+static int
+session_capa_add_ext_nh(struct ibuf *b, uint8_t aid)
+{
+	int		errs = 0;
+	uint16_t	afi;
+	uint8_t		safi;
+
+	if (aid2afi(aid, &afi, &safi)) {
+		log_warnx("%s: bad AID", __func__);
+		return (-1);
+	}
+
+	errs += ibuf_add_n16(b, afi);
+	errs += ibuf_add_n16(b, safi);
+	errs += ibuf_add_n16(b, AFI_IPv6);
+	return (errs);
+}
+
+void
+session_open(struct peer *p)
+{
+	struct ibuf		*buf, *opb;
+	size_t			 len, optparamlen;
+	uint8_t			 i;
+	int			 errs = 0, extlen = 0;
+	int			 mpcapa = 0;
+
+
+	if ((opb = ibuf_dynamic(0, MAX_PKTSIZE - MSGSIZE_OPEN_MIN - 6)) ==
+	    NULL) {
+		bgp_fsm(p, EVNT_CON_FATAL, NULL);
+		return;
+	}
+
+	/* multiprotocol extensions, RFC 4760 */
+	for (i = AID_MIN; i < AID_MAX; i++)
+		if (p->capa.ann.mp[i]) {	/* 4 bytes data */
+			errs += session_capa_add(opb, CAPA_MP, 4);
+			errs += session_capa_add_mp(opb, i);
+			mpcapa++;
+		}
+
+	/* route refresh, RFC 2918 */
+	if (p->capa.ann.refresh)	/* no data */
+		errs += session_capa_add(opb, CAPA_REFRESH, 0);
+
+	/* extended nexthop encoding, RFC 8950 */
+	if (p->capa.ann.ext_nh[AID_INET]) {
+		uint8_t enhlen = 0;
+
+		if (p->capa.ann.mp[AID_INET])
+			enhlen += 6;
+		if (p->capa.ann.mp[AID_VPN_IPv4])
+			enhlen += 6;
+		errs += session_capa_add(opb, CAPA_EXT_NEXTHOP, enhlen);
+		if (p->capa.ann.mp[AID_INET])
+			errs += session_capa_add_ext_nh(opb, AID_INET);
+		if (p->capa.ann.mp[AID_VPN_IPv4])
+			errs += session_capa_add_ext_nh(opb, AID_VPN_IPv4);
+	}
+
+	/* extended message support, RFC 8654 */
+	if (p->capa.ann.ext_msg)	/* no data */
+		errs += session_capa_add(opb, CAPA_EXT_MSG, 0);
+
+	/* BGP open policy, RFC 9234, only for ebgp sessions */
+	if (p->conf.ebgp && p->capa.ann.policy &&
+	    p->conf.role != ROLE_NONE &&
+	    (p->capa.ann.mp[AID_INET] || p->capa.ann.mp[AID_INET6] ||
+	    mpcapa == 0)) {
+		errs += session_capa_add(opb, CAPA_ROLE, 1);
+		errs += ibuf_add_n8(opb, role2capa(p->conf.role));
+	}
+
+	/* graceful restart and End-of-RIB marker, RFC 4724 */
+	if (p->capa.ann.grestart.restart) {
+		int		rst = 0;
+		uint16_t	hdr = 0;
+
+		for (i = AID_MIN; i < AID_MAX; i++) {
+			if (p->capa.neg.grestart.flags[i] & CAPA_GR_RESTARTING)
+				rst++;
+		}
+
+		/* Only set the R-flag if no graceful restart is ongoing */
+		if (!rst)
+			hdr |= CAPA_GR_R_FLAG;
+		if (p->capa.ann.grestart.grnotification)
+			hdr |= CAPA_GR_N_FLAG;
+		errs += session_capa_add(opb, CAPA_RESTART, sizeof(hdr));
+		errs += ibuf_add_n16(opb, hdr);
+	}
+
+	/* 4-bytes AS numbers, RFC6793 */
+	if (p->capa.ann.as4byte) {	/* 4 bytes data */
+		errs += session_capa_add(opb, CAPA_AS4BYTE, sizeof(uint32_t));
+		errs += ibuf_add_n32(opb, p->conf.local_as);
+	}
+
+	/* advertisement of multiple paths, RFC7911 */
+	if (p->capa.ann.add_path[AID_MIN]) {	/* variable */
+		uint8_t	aplen;
+
+		if (mpcapa)
+			aplen = 4 * mpcapa;
+		else	/* AID_INET */
+			aplen = 4;
+		errs += session_capa_add(opb, CAPA_ADD_PATH, aplen);
+		if (mpcapa) {
+			for (i = AID_MIN; i < AID_MAX; i++) {
+				if (p->capa.ann.mp[i]) {
+					errs += session_capa_add_afi(opb,
+					    i, p->capa.ann.add_path[i] &
+					    CAPA_AP_MASK);
+				}
+			}
+		} else {	/* AID_INET */
+			errs += session_capa_add_afi(opb, AID_INET,
+			    p->capa.ann.add_path[AID_INET] & CAPA_AP_MASK);
+		}
+	}
+
+	/* enhanced route-refresh, RFC7313 */
+	if (p->capa.ann.enhanced_rr)	/* no data */
+		errs += session_capa_add(opb, CAPA_ENHANCED_RR, 0);
+
+	if (errs) {
+		ibuf_free(opb);
+		bgp_fsm(p, EVNT_CON_FATAL, NULL);
+		return;
+	}
+
+	optparamlen = ibuf_size(opb);
+	len = MSGSIZE_OPEN_MIN + optparamlen;
+	if (optparamlen == 0) {
+		/* nothing */
+	} else if (optparamlen + 2 >= 255) {
+		/* RFC9072: use 255 as magic size and request extra header */
+		optparamlen = 255;
+		extlen = 1;
+		/* 3 byte OPT_PARAM_EXT_LEN and OPT_PARAM_CAPABILITIES */
+		len += 2 * 3;
+	} else {
+		/* regular capabilities header */
+		optparamlen += 2;
+		len += 2;
+	}
+
+	if ((buf = session_newmsg(BGP_OPEN, len)) == NULL) {
+		ibuf_free(opb);
+		bgp_fsm(p, EVNT_CON_FATAL, NULL);
+		return;
+	}
+
+	errs += ibuf_add_n8(buf, 4);
+	errs += ibuf_add_n16(buf, p->conf.local_short_as);
+	errs += ibuf_add_n16(buf, p->conf.holdtime);
+	/* bgpid is kept in host byte order, ibuf_add_n32() converts it */
+	errs += ibuf_add_n32(buf, p->local_bgpid);
+	errs += ibuf_add_n8(buf, optparamlen);
+
+	if (extlen) {
+		/* RFC9072 extra header which spans over the capabilities hdr */
+		errs += ibuf_add_n8(buf, OPT_PARAM_EXT_LEN);
+		errs += ibuf_add_n16(buf, ibuf_size(opb) + 1 + 2);
+	}
+
+	if (optparamlen) {
+		errs += ibuf_add_n8(buf, OPT_PARAM_CAPABILITIES);
+
+		if (extlen) {
+			/* RFC9072: 2-byte extended length */
+			errs += ibuf_add_n16(buf, ibuf_size(opb));
+		} else {
+			errs += ibuf_add_n8(buf, ibuf_size(opb));
+		}
+		errs += ibuf_add_ibuf(buf, opb);
+	}
+
+	ibuf_free(opb);
+
+	if (errs) {
+		ibuf_free(buf);
+		bgp_fsm(p, EVNT_CON_FATAL, NULL);
+		return;
+	}
+
+	session_sendmsg(buf, p, BGP_OPEN);
+	p->stats.msg_sent_open++;
+}
+
+void
+session_keepalive(struct peer *p)
+{
+	struct ibuf		*buf;
+
+	if ((buf = session_newmsg(BGP_KEEPALIVE, MSGSIZE_KEEPALIVE)) == NULL) {
+		bgp_fsm(p, EVNT_CON_FATAL, NULL);
+		return;
+	}
+
+	session_sendmsg(buf, p, BGP_KEEPALIVE);
+	start_timer_keepalive(p);
+	p->stats.msg_sent_keepalive++;
+}
+
+void
+session_update(struct peer *p, struct ibuf *ibuf)
+{
+	struct ibuf	*buf;
+	size_t		 len, maxsize = MAX_PKTSIZE;
+
+	if (p->state != STATE_ESTABLISHED)
+		return;
+
+	if (p->capa.neg.ext_msg)
+		maxsize = MAX_EXT_PKTSIZE;
+	len = ibuf_size(ibuf);
+	if (len < MSGSIZE_UPDATE_MIN - MSGSIZE_HEADER ||
+	    len > maxsize - MSGSIZE_HEADER) {
+		log_peer_warnx(&p->conf, "bad UPDATE from RDE");
+		return;
+	}
+
+	if ((buf = session_newmsg(BGP_UPDATE, MSGSIZE_HEADER + len)) == NULL) {
+		bgp_fsm(p, EVNT_CON_FATAL, NULL);
+		return;
+	}
+
+	if (ibuf_add_ibuf(buf, ibuf)) {
+		ibuf_free(buf);
+		bgp_fsm(p, EVNT_CON_FATAL, NULL);
+		return;
+	}
+
+	session_sendmsg(buf, p, BGP_UPDATE);
+	start_timer_keepalive(p);
+	p->stats.msg_sent_update++;
+}
+
+/* Return 1 if a hard reset should be issued, 0 for a graceful notification */
+static int
+session_req_hard_reset(enum err_codes errcode, uint8_t subcode)
+{
+	switch (errcode) {
+	case ERR_HEADER:
+	case ERR_OPEN:
+	case ERR_UPDATE:
+	case ERR_FSM:
+	case ERR_RREFRESH:
+		/*
+		 * Protocol errors trigger a hard reset. The peer
+		 * is not trustworthy and so there is no realistic
+		 * hope that forwarding can continue.
+		 */
+		break;
+	case ERR_HOLDTIMEREXPIRED:
+	case ERR_SENDHOLDTIMEREXPIRED:
+		/* Keep forwarding and hope the other side is back soon. */
+		return 0;
+	case ERR_CEASE:
+		switch (subcode) {
+		case ERR_CEASE_CONN_REJECT:
+		case ERR_CEASE_OTHER_CHANGE:
+		case ERR_CEASE_COLLISION:
+		case ERR_CEASE_RSRC_EXHAUST:
+			/* Per RFC8538 suggestion make these graceful. */
+			return 0;
+		}
+		break;
+	}
+	return 1;
+}
+
+void
+session_notification_data(struct peer *p, uint8_t errcode, uint8_t subcode,
+    void *data, size_t datalen)
+{
+	struct ibuf ibuf;
+
+	ibuf_from_buffer(&ibuf, data, datalen);
+	session_notification(p, errcode, subcode, &ibuf);
+}
+
+/* Send a NOTIFICATION to p; ibuf (may be NULL) holds the optional data. */
+void
+session_notification(struct peer *p, uint8_t errcode, uint8_t subcode,
+    struct ibuf *ibuf)
+{
+	struct ibuf		*buf;
+	const char		*reason = "sending";
+	int			 errs = 0, need_hard_reset = 0;
+	size_t			 datalen = 0;
+
+	switch (p->state) {
+	case STATE_OPENSENT:
+	case STATE_OPENCONFIRM:
+	case STATE_ESTABLISHED:
+		break;
+	default:
+		/* session not open, no need to send notification */
+		log_notification(p, errcode, subcode, ibuf, "dropping");
+		return;
+	}
+
+	if (p->capa.neg.grestart.grnotification) {
+		if (session_req_hard_reset(errcode, subcode)) {
+			need_hard_reset = 1;
+			datalen += 2;
+			reason = "sending hard-reset";
+		} else {
+			reason = "sending graceful";
+		}
+	}
+
+	log_notification(p, errcode, subcode, ibuf, reason);
+
+	/* cap to maximum size */
+	if (ibuf != NULL) {
+		if (ibuf_size(ibuf) >
+		    MAX_PKTSIZE - MSGSIZE_NOTIFICATION_MIN - datalen) {
+			log_peer_warnx(&p->conf,
+			    "oversized notification, data truncated");
+			ibuf_truncate(ibuf, MAX_PKTSIZE -
+			    MSGSIZE_NOTIFICATION_MIN - datalen);
+		}
+		datalen += ibuf_size(ibuf);
+	}
+
+	if ((buf = session_newmsg(BGP_NOTIFICATION,
+	    MSGSIZE_NOTIFICATION_MIN + datalen)) == NULL) {
+		bgp_fsm(p, EVNT_CON_FATAL, NULL);
+		return;
+	}
+
+	if (need_hard_reset) {
+		/* the real error follows a Cease/Hard Reset prefix */
+		errs += ibuf_add_n8(buf, ERR_CEASE);
+		errs += ibuf_add_n8(buf, ERR_CEASE_HARD_RESET);
+	}
+	errs += ibuf_add_n8(buf, errcode);
+	errs += ibuf_add_n8(buf, subcode);
+	if (ibuf != NULL)
+		errs += ibuf_add_ibuf(buf, ibuf);
+
+	if (errs) {
+		ibuf_free(buf);
+		bgp_fsm(p, EVNT_CON_FATAL, NULL);
+		return;
+	}
+
+	session_sendmsg(buf, p, BGP_NOTIFICATION);
+	p->stats.msg_sent_notification++;
+	p->stats.last_sent_errcode = errcode;
+	p->stats.last_sent_suberr = subcode;
+}
+
+int
+session_neighbor_rrefresh(struct peer *p)
+{
+	uint8_t	i;
+
+	if (!(p->capa.neg.refresh || p->capa.neg.enhanced_rr))
+		return (-1);
+
+	for (i = AID_MIN; i < AID_MAX; i++) {
+		if (p->capa.neg.mp[i] != 0)
+			session_rrefresh(p, i, ROUTE_REFRESH_REQUEST);
+	}
+
+	return (0);
+}
+
+void
+session_rrefresh(struct peer *p, uint8_t aid, uint8_t subtype)
+{
+	struct ibuf		*buf;
+	int			 errs = 0;
+	uint16_t		 afi;
+	uint8_t			 safi;
+
+	switch (subtype) {
+	case ROUTE_REFRESH_REQUEST:
+		p->stats.refresh_sent_req++;
+		break;
+	case ROUTE_REFRESH_BEGIN_RR:
+	case ROUTE_REFRESH_END_RR:
+		/* requires enhanced route refresh */
+		if (!p->capa.neg.enhanced_rr)
+			return;
+		if (subtype == ROUTE_REFRESH_BEGIN_RR)
+			p->stats.refresh_sent_borr++;
+		else
+			p->stats.refresh_sent_eorr++;
+		break;
+	default:
+		fatalx("session_rrefresh: bad subtype %d", subtype);
+	}
+
+	if (aid2afi(aid, &afi, &safi) == -1)
+		fatalx("session_rrefresh: bad afi/safi pair");
+
+	if ((buf = session_newmsg(BGP_RREFRESH, MSGSIZE_RREFRESH)) == NULL) {
+		bgp_fsm(p, EVNT_CON_FATAL, NULL);
+		return;
+	}
+
+	errs += ibuf_add_n16(buf, afi);
+	errs += ibuf_add_n8(buf, subtype);
+	errs += ibuf_add_n8(buf, safi);
+
+	if (errs) {
+		ibuf_free(buf);
+		bgp_fsm(p, EVNT_CON_FATAL, NULL);
+		return;
+	}
+
+	session_sendmsg(buf, p, BGP_RREFRESH);
+	p->stats.msg_sent_rrefresh++;
+}
+
+struct ibuf *
+parse_header(struct ibuf *msg, void *arg, int *fd)
+{
+	struct peer		*peer = arg;
+	struct ibuf		*b;
+	u_char			 m[MSGSIZE_HEADER_MARKER];
+	uint16_t		 len, maxlen = MAX_PKTSIZE;
+	uint8_t			 type;
+
+	if (ibuf_get(msg, m, sizeof(m)) == -1 ||
+	    ibuf_get_n16(msg, &len) == -1 ||
+	    ibuf_get_n8(msg, &type) == -1)
+		return (NULL);
+	/* caller MUST make sure we are getting 19 bytes! */
+	if (memcmp(m, marker, sizeof(marker))) {
+		log_peer_warnx(&peer->conf, "sync error");
+		session_notification(peer, ERR_HEADER, ERR_HDR_SYNC, NULL);
+		bgp_fsm(peer, EVNT_CON_FATAL, NULL);
+		errno = EINVAL;
+		return (NULL);
+	}
+
+	if (peer->capa.ann.ext_msg)
+		maxlen = MAX_EXT_PKTSIZE;
+
+	if (len < MSGSIZE_HEADER || len > maxlen) {
+		log_peer_warnx(&peer->conf,
+		    "received message: illegal length: %u byte", len);
+		goto badlen;
+	}
+
+	switch (type) {
+	case BGP_OPEN:
+		if (len < MSGSIZE_OPEN_MIN || len > MAX_PKTSIZE) {
+			log_peer_warnx(&peer->conf,
+			    "received OPEN: illegal len: %u byte", len);
+			goto badlen;
+		}
+		break;
+	case BGP_NOTIFICATION:
+		if (len < MSGSIZE_NOTIFICATION_MIN) {
+			log_peer_warnx(&peer->conf,
+			    "received NOTIFICATION: illegal len: %u byte", len);
+			goto badlen;
+		}
+		break;
+	case BGP_UPDATE:
+		if (len < MSGSIZE_UPDATE_MIN) {
+			log_peer_warnx(&peer->conf,
+			    "received UPDATE: illegal len: %u byte", len);
+			goto badlen;
+		}
+		break;
+	case BGP_KEEPALIVE:
+		if (len != MSGSIZE_KEEPALIVE) {
+			log_peer_warnx(&peer->conf,
+			    "received KEEPALIVE: illegal len: %u byte", len);
+			goto badlen;
+		}
+		break;
+	case BGP_RREFRESH:
+		if (len < MSGSIZE_RREFRESH_MIN) {
+			log_peer_warnx(&peer->conf,
+			    "received RREFRESH: illegal len: %u byte", len);
+			goto badlen;
+		}
+		break;
+	default:
+		log_peer_warnx(&peer->conf,
+		    "received msg with unknown type %u", type);
+		session_notification_data(peer, ERR_HEADER, ERR_HDR_TYPE,
+		    &type, sizeof(type));
+		bgp_fsm(peer, EVNT_CON_FATAL, NULL);
+		errno = EINVAL;
+		return (NULL);
+	}
+
+	if ((b = ibuf_open(len)) == NULL)
+		return (NULL);
+	return (b);
+
+ badlen:
+	len = htons(len);
+	session_notification_data(peer, ERR_HEADER, ERR_HDR_LEN,
+	    &len, sizeof(len));
+	bgp_fsm(peer, EVNT_CON_FATAL, NULL);
+	errno = ERANGE;
+	return (NULL);
+}
+
+static int
+parse_capabilities(struct peer *peer, struct ibuf *buf, uint32_t *as)
+{
+	struct ibuf	 capabuf;
+	uint16_t	 afi, nhafi, gr_header;
+	uint8_t		 capa_code, capa_len;
+	uint8_t		 safi, aid, role, flags;
+
+	while (ibuf_size(buf) > 0) {
+		if (ibuf_get_n8(buf, &capa_code) == -1 ||
+		    ibuf_get_n8(buf, &capa_len) == -1) {
+			log_peer_warnx(&peer->conf, "Bad capabilities attr "
+			    "length: too short");
+			return (-1);
+		}
+		if (ibuf_get_ibuf(buf, capa_len, &capabuf) == -1) {
+			log_peer_warnx(&peer->conf,
+			    "Received bad capabilities attr length: "
+			    "len %zu smaller than capa_len %u",
+			    ibuf_size(buf), capa_len);
+			return (-1);
+		}
+
+		switch (capa_code) {
+		case CAPA_MP:			/* RFC 4760 */
+			if (capa_len != 4 ||
+			    ibuf_get_n16(&capabuf, &afi) == -1 ||
+			    ibuf_skip(&capabuf, 1) == -1 ||
+			    ibuf_get_n8(&capabuf, &safi) == -1) {
+				log_peer_warnx(&peer->conf,
+				    "Received bad multi protocol capability");
+				break;
+			}
+			if (afi2aid(afi, safi, &aid) == -1) {
+				log_peer_warnx(&peer->conf,
+				    "Received multi protocol capability: "
+				    " unknown AFI %u, safi %u pair",
+				    afi, safi);
+				peer->capa.peer.mp[AID_UNSPEC] = 1;
+				break;
+			}
+			peer->capa.peer.mp[aid] = 1;
+			break;
+		case CAPA_REFRESH:
+			peer->capa.peer.refresh = 1;
+			break;
+		case CAPA_EXT_NEXTHOP:
+			while (ibuf_size(&capabuf) > 0) {
+				uint16_t tmp16;
+				if (ibuf_get_n16(&capabuf, &afi) == -1 ||
+				    ibuf_get_n16(&capabuf, &tmp16) == -1 ||
+				    ibuf_get_n16(&capabuf, &nhafi) == -1) {
+					log_peer_warnx(&peer->conf,
+					    "Received bad %s capability",
+					    log_capability(CAPA_EXT_NEXTHOP));
+					memset(peer->capa.peer.ext_nh, 0,
+					    sizeof(peer->capa.peer.ext_nh));
+					break;
+				}
+				safi = tmp16;
+				if (afi2aid(afi, safi, &aid) == -1 ||
+				    !(aid == AID_INET || aid == AID_VPN_IPv4)) {
+					log_peer_warnx(&peer->conf,
+					    "Received %s capability: "
+					    " unsupported AFI %u, safi %u pair",
+					    log_capability(CAPA_EXT_NEXTHOP),
+					    afi, safi);
+					continue;
+				}
+				if (nhafi != AFI_IPv6) {
+					log_peer_warnx(&peer->conf,
+					    "Received %s capability: "
+					    " unsupported nexthop AFI %u",
+					    log_capability(CAPA_EXT_NEXTHOP),
+					    nhafi);
+					continue;
+				}
+				peer->capa.peer.ext_nh[aid] = 1;
+			}
+			break;
+		case CAPA_EXT_MSG:
+			peer->capa.peer.ext_msg = 1;
+			break;
+		case CAPA_ROLE:
+			if (capa_len != 1 ||
+			    ibuf_get_n8(&capabuf, &role) == -1) {
+				log_peer_warnx(&peer->conf,
+				    "Received bad role capability");
+				break;
+			}
+			if (!peer->conf.ebgp) {
+				log_peer_warnx(&peer->conf,
+				    "Received role capability on iBGP session");
+				break;
+			}
+			peer->capa.peer.policy = 1;
+			peer->remote_role = capa2role(role);
+			break;
+		case CAPA_RESTART:
+			if (capa_len == 2) {
+				/* peer only supports EoR marker */
+				peer->capa.peer.grestart.restart = 1;
+				peer->capa.peer.grestart.timeout = 0;
+				break;
+			} else if (capa_len % 4 != 2) {
+				log_peer_warnx(&peer->conf,
+				    "Bad graceful restart capability");
+				peer->capa.peer.grestart.restart = 0;
+				peer->capa.peer.grestart.timeout = 0;
+				break;
+			}
+
+			if (ibuf_get_n16(&capabuf, &gr_header) == -1) {
+ bad_gr_restart:
+				log_peer_warnx(&peer->conf,
+				    "Bad graceful restart capability");
+				peer->capa.peer.grestart.restart = 0;
+				peer->capa.peer.grestart.timeout = 0;
+				break;
+			}
+
+			peer->capa.peer.grestart.timeout =
+			    gr_header & CAPA_GR_TIMEMASK;
+			if (peer->capa.peer.grestart.timeout == 0) {
+				log_peer_warnx(&peer->conf, "Received "
+				    "graceful restart with zero timeout");
+				peer->capa.peer.grestart.restart = 0;
+				break;
+			}
+
+			while (ibuf_size(&capabuf) > 0) {
+				if (ibuf_get_n16(&capabuf, &afi) == -1 ||
+				    ibuf_get_n8(&capabuf, &safi) == -1 ||
+				    ibuf_get_n8(&capabuf, &flags) == -1)
+					goto bad_gr_restart;
+				if (afi2aid(afi, safi, &aid) == -1) {
+					log_peer_warnx(&peer->conf,
+					    "Received graceful restart capa: "
+					    " unknown AFI %u, safi %u pair",
+					    afi, safi);
+					continue;
+				}
+				peer->capa.peer.grestart.flags[aid] |=
+				    CAPA_GR_PRESENT;
+				if (flags & CAPA_GR_F_FLAG)
+					peer->capa.peer.grestart.flags[aid] |=
+					    CAPA_GR_FORWARD;
+				if (gr_header & CAPA_GR_R_FLAG)
+					peer->capa.peer.grestart.flags[aid] |=
+					    CAPA_GR_RESTART;
+				peer->capa.peer.grestart.restart = 2;
+			}
+			if (gr_header & CAPA_GR_N_FLAG)
+				peer->capa.peer.grestart.grnotification = 1;
+			break;
+		case CAPA_AS4BYTE:
+			if (capa_len != 4 ||
+			    ibuf_get_n32(&capabuf, as) == -1) {
+				log_peer_warnx(&peer->conf,
+				    "Received bad AS4BYTE capability");
+				peer->capa.peer.as4byte = 0;
+				break;
+			}
+			if (*as == 0) {
+				log_peer_warnx(&peer->conf,
+				    "peer requests unacceptable AS %u", *as);
+				session_notification(peer, ERR_OPEN,
+				    ERR_OPEN_AS, NULL);
+				change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
+				return (-1);
+			}
+			peer->capa.peer.as4byte = 1;
+			break;
+		case CAPA_ADD_PATH:
+			if (capa_len % 4 != 0) {
+ bad_add_path:
+				log_peer_warnx(&peer->conf,
+				    "Received bad ADD-PATH capability");
+				memset(peer->capa.peer.add_path, 0,
+				    sizeof(peer->capa.peer.add_path));
+				break;
+			}
+			while (ibuf_size(&capabuf) > 0) {
+				if (ibuf_get_n16(&capabuf, &afi) == -1 ||
+				    ibuf_get_n8(&capabuf, &safi) == -1 ||
+				    ibuf_get_n8(&capabuf, &flags) == -1)
+					goto bad_add_path;
+				if (afi2aid(afi, safi, &aid) == -1) {
+					log_peer_warnx(&peer->conf,
+					    "Received ADD-PATH capa: "
+					    " unknown AFI %u, safi %u pair",
+					    afi, safi);
+					memset(peer->capa.peer.add_path, 0,
+					    sizeof(peer->capa.peer.add_path));
+					break;
+				}
+				if (flags & ~CAPA_AP_BIDIR) {
+					log_peer_warnx(&peer->conf,
+					    "Received ADD-PATH capa: "
+					    " bad flags %x", flags);
+					memset(peer->capa.peer.add_path, 0,
+					    sizeof(peer->capa.peer.add_path));
+					break;
+				}
+				peer->capa.peer.add_path[aid] = flags;
+			}
+			break;
+		case CAPA_ENHANCED_RR:
+			peer->capa.peer.enhanced_rr = 1;
+			break;
+		default:
+			break;
+		}
+	}
+
+	return (0);
+}
+
+static int
+parse_open(struct peer *peer, struct ibuf *msg)
+{
+	uint8_t		 version, rversion;
+	uint16_t	 short_as;
+	uint16_t	 holdtime;
+	uint32_t	 as, bgpid;
+	uint8_t		 optparamlen;
+
+	if (ibuf_get_n8(msg, &version) == -1 ||
+	    ibuf_get_n16(msg, &short_as) == -1 ||
+	    ibuf_get_n16(msg, &holdtime) == -1 ||
+	    ibuf_get_n32(msg, &bgpid) == -1 ||
+	    ibuf_get_n8(msg, &optparamlen) == -1)
+		goto bad_len;
+
+	if (version != BGP_VERSION) {
+		log_peer_warnx(&peer->conf,
+		    "peer wants unrecognized version %u", version);
+		if (version > BGP_VERSION)
+			rversion = version - BGP_VERSION;
+		else
+			rversion = BGP_VERSION;
+		session_notification_data(peer, ERR_OPEN, ERR_OPEN_VERSION,
+		    &rversion, sizeof(rversion));
+		change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
+		return (-1);
+	}
+
+	as = peer->short_as = short_as;
+	if (as == 0) {
+		log_peer_warnx(&peer->conf,
+		    "peer requests unacceptable AS %u", as);
+		session_notification(peer, ERR_OPEN, ERR_OPEN_AS, NULL);
+		change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
+		return (-1);
+	}
+
+	if (holdtime != 0 && holdtime < peer->conf.min_holdtime) {
+		log_peer_warnx(&peer->conf,
+		    "peer requests unacceptable holdtime %u", holdtime);
+		session_notification(peer, ERR_OPEN, ERR_OPEN_HOLDTIME, NULL);
+		change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
+		return (-1);
+	}
+
+	if (holdtime < peer->conf.holdtime)
+		peer->holdtime = holdtime;
+	else
+		peer->holdtime = peer->conf.holdtime;
+
+	/* check bgpid for validity - just disallow 0 */
+	if (bgpid == 0) {
+		log_peer_warnx(&peer->conf, "peer BGPID 0 unacceptable");
+		session_notification(peer, ERR_OPEN, ERR_OPEN_BGPID, NULL);
+		change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
+		return (-1);
+	}
+	peer->remote_bgpid = bgpid;
+
+	if (optparamlen != 0) {
+		struct ibuf oparams, op;
+		uint8_t ext_type, op_type;
+		uint16_t ext_len, op_len;
+
+		ibuf_from_ibuf(&oparams, msg);
+
+		/* check for RFC9072 encoding */
+		if (ibuf_get_n8(&oparams, &ext_type) == -1)
+			goto bad_len;
+		if (ext_type == OPT_PARAM_EXT_LEN) {
+			if (ibuf_get_n16(&oparams, &ext_len) == -1)
+				goto bad_len;
+			/* skip RFC9072 header */
+			if (ibuf_skip(msg, 3) == -1)
+				goto bad_len;
+		} else {
+			ext_len = optparamlen;
+			ibuf_rewind(&oparams);
+		}
+
+		if (ibuf_truncate(&oparams, ext_len) == -1 ||
+		    ibuf_skip(msg, ext_len) == -1)
+			goto bad_len;
+
+		while (ibuf_size(&oparams) > 0) {
+			if (ibuf_get_n8(&oparams, &op_type) == -1)
+				goto bad_len;
+
+			if (ext_type == OPT_PARAM_EXT_LEN) {
+				if (ibuf_get_n16(&oparams, &op_len) == -1)
+					goto bad_len;
+			} else {
+				uint8_t tmp;
+				if (ibuf_get_n8(&oparams, &tmp) == -1)
+					goto bad_len;
+				op_len = tmp;
+			}
+
+			if (ibuf_get_ibuf(&oparams, op_len, &op) == -1)
+				goto bad_len;
+
+			switch (op_type) {
+			case OPT_PARAM_CAPABILITIES:		/* RFC 3392 */
+				if (parse_capabilities(peer, &op, &as) == -1) {
+					session_notification(peer, ERR_OPEN, 0,
+					    NULL);
+					change_state(peer, STATE_IDLE,
+					    EVNT_RCVD_OPEN);
+					return (-1);
+				}
+				break;
+			case OPT_PARAM_AUTH:			/* deprecated */
+			default:
+				/*
+				 * unsupported type
+				 * the RFCs tell us to leave the data section
+				 * empty and notify the peer with ERR_OPEN,
+				 * ERR_OPEN_OPT. How the peer should know
+				 * _which_ optional parameter we don't support
+				 * is beyond me.
+				 */
+				log_peer_warnx(&peer->conf,
+				    "received OPEN message with unsupported "
+				    "optional parameter: type %u", op_type);
+				session_notification(peer, ERR_OPEN,
+				    ERR_OPEN_OPT, NULL);
+				change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
+				return (-1);
+			}
+		}
+	}
+
+	if (ibuf_size(msg) != 0) {
+ bad_len:
+		log_peer_warnx(&peer->conf,
+		    "corrupt OPEN message received: length mismatch");
+		session_notification(peer, ERR_OPEN, 0, NULL);
+		change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
+		return (-1);
+	}
+
+	/*
+	 * if remote-as is zero and it's a cloned neighbor, accept any
+	 * but only on the first connect, after that the remote-as needs
+	 * to remain the same.
+	 */
+	if (peer->template && !peer->conf.remote_as && as != AS_TRANS) {
+		peer->conf.remote_as = as;
+		peer->conf.ebgp = (peer->conf.remote_as != peer->conf.local_as);
+		if (!peer->conf.ebgp)
+			/* force enforce_as off for iBGP sessions */
+			peer->conf.enforce_as = ENFORCE_AS_OFF;
+	}
+
+	if (peer->conf.remote_as != as) {
+		log_peer_warnx(&peer->conf, "peer sent wrong AS %s",
+		    log_as(as));
+		session_notification(peer, ERR_OPEN, ERR_OPEN_AS, NULL);
+		change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
+		return (-1);
+	}
+
+	/* on iBGP sessions check for bgpid collision */
+	if (!peer->conf.ebgp && peer->remote_bgpid == peer->local_bgpid) {
+		struct in_addr ina;
+		ina.s_addr = htonl(bgpid);
+		log_peer_warnx(&peer->conf, "peer BGPID %s conflicts with ours",
+		    inet_ntoa(ina));
+		session_notification(peer, ERR_OPEN, ERR_OPEN_BGPID, NULL);
+		change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
+		return (-1);
+	}
+
+	if (capa_neg_calc(peer) == -1) {
+		change_state(peer, STATE_IDLE, EVNT_RCVD_OPEN);
+		return (-1);
+	}
+
+	return (0);
+}
+
+static int
+parse_update(struct peer *peer, struct ibuf *msg)
+{	/* NOTE(review): "hanlde" looks like a typo; fix with its prototype */
+	return (session_hanlde_update(peer, msg));	/* result passed on */
+}
+
+static int
+parse_rrefresh(struct peer *peer, struct ibuf *msg)
+{	/* validate a received ROUTE-REFRESH message and pass it on */
+	struct route_refresh rr;
+	uint16_t afi, datalen;
+	uint8_t aid, safi, subtype;
+
+	datalen = ibuf_size(msg) + MSGSIZE_HEADER;	/* incl. header */
+
+	if (ibuf_get_n16(msg, &afi) == -1 ||
+	    ibuf_get_n8(msg, &subtype) == -1 ||
+	    ibuf_get_n8(msg, &safi) == -1) {
+		/* minimum size checked in session_process_msg() */
+		fatalx("%s: message too small", __func__);
+	}
+
+	/* the subtype only counts with negotiated enhanced route refresh */
+	if (peer->capa.neg.enhanced_rr) {
+		switch (subtype) {
+		case ROUTE_REFRESH_REQUEST:
+			/* no ORF support, so no oversized RREFRESH msgs */
+			if (datalen != MSGSIZE_RREFRESH) {
+				log_peer_warnx(&peer->conf,
+				    "received RREFRESH: illegal len: %u byte",
+				    datalen);
+				datalen = htons(datalen);	/* network order */
+				session_notification_data(peer, ERR_HEADER,
+				    ERR_HDR_LEN, &datalen, sizeof(datalen));
+				bgp_fsm(peer, EVNT_CON_FATAL, NULL);
+				return (-1);
+			}
+			peer->stats.refresh_rcvd_req++;
+			break;
+		case ROUTE_REFRESH_BEGIN_RR:
+		case ROUTE_REFRESH_END_RR:
+			/* RFC7313: a bad length gets its own error code */
+			if (datalen != MSGSIZE_RREFRESH) {
+				log_peer_warnx(&peer->conf,
+				    "received RREFRESH: illegal len: %u byte",
+				    datalen);
+				ibuf_rewind(msg);	/* pass msg from its start */
+				session_notification(peer, ERR_RREFRESH,
+				    ERR_RR_INV_LEN, msg);
+				bgp_fsm(peer, EVNT_CON_FATAL, NULL);
+				return (-1);
+			}
+			if (subtype == ROUTE_REFRESH_BEGIN_RR)
+				peer->stats.refresh_rcvd_borr++;
+			else
+				peer->stats.refresh_rcvd_eorr++;
+			break;
+		default:
+			log_peer_warnx(&peer->conf, "peer sent bad refresh, "
+			    "bad subtype %d", subtype);
+			return (0);	/* only logged, no FSM event */
+		}
+	} else {
+		/* force subtype to default */
+		subtype = ROUTE_REFRESH_REQUEST;
+		peer->stats.refresh_rcvd_req++;
+	}
+
+	/* an afi/safi pair without a matching AID is logged and dropped */
+	if (afi2aid(afi, safi, &aid) == -1) {
+		log_peer_warnx(&peer->conf, "peer sent bad refresh, "
+		    "invalid afi/safi pair");
+		return (0);
+	}
+
+	if (!peer->capa.neg.refresh && !peer->capa.neg.enhanced_rr) {
+		log_peer_warnx(&peer->conf, "peer sent unexpected refresh");
+		return (0);	/* neither refresh flavour was negotiated */
+	}
+
+	rr.aid = aid;
+	rr.subtype = subtype;
+
+	return session_handle_rrefresh(peer, &rr);
+}
+
+static void
+parse_notification(struct peer *peer, struct ibuf *msg)
+{	/* log and record a received NOTIFICATION, then go to STATE_IDLE */
+	const char		*reason = "received";
+	uint8_t			 errcode, subcode;
+	uint8_t			 reason_len;
+	enum session_events	 event = EVNT_RCVD_NOTIFICATION;
+
+	if (ibuf_get_n8(msg, &errcode) == -1 ||
+	    ibuf_get_n8(msg, &subcode) == -1) {
+		log_peer_warnx(&peer->conf, "received bad notification");
+		goto done;
+	}
+
+	/* RFC8538: a hard reset wraps the real code, the rest is graceful */
+	if (peer->capa.neg.grestart.grnotification) {
+		if (errcode == ERR_CEASE && subcode == ERR_CEASE_HARD_RESET) {
+			if (ibuf_get_n8(msg, &errcode) == -1 ||
+			    ibuf_get_n8(msg, &subcode) == -1) {
+				log_peer_warnx(&peer->conf,
+				    "received bad hard-reset notification");
+				goto done;
+			}
+			reason = "received hard-reset";
+		} else {
+			reason = "received graceful";
+			event = EVNT_RCVD_GRACE_NOTIFICATION;
+		}
+	}
+
+	peer->errcnt++;
+	peer->stats.last_rcvd_errcode = errcode;
+	peer->stats.last_rcvd_suberr = subcode;
+
+	log_notification(peer, errcode, subcode, msg, reason);
+
+	CTASSERT(sizeof(peer->stats.last_reason) > UINT8_MAX);
+	memset(peer->stats.last_reason, 0, sizeof(peer->stats.last_reason));
+	if (errcode == ERR_CEASE &&
+	    (subcode == ERR_CEASE_ADMIN_DOWN ||
+	     subcode == ERR_CEASE_ADMIN_RESET)) {
+		/* optional shutdown reason: one length byte, then the text */
+		if (ibuf_get_n8(msg, &reason_len) != -1 && reason_len != 0) {
+			if (ibuf_get(msg, peer->stats.last_reason,
+			    reason_len) == -1)
+				log_peer_warnx(&peer->conf,
+				    "received truncated shutdown reason");
+		}
+	}
+
+done:
+	change_state(peer, STATE_IDLE, event);	/* also on parse errors */
+}
+
+void
+session_process_msg(struct peer *p)
+{	/* feed the messages queued in p->wbuf to the FSM, in batches */
+	struct ibuf	*msg;
+	int		processed = 0;
+	uint8_t		msgtype;
+
+	p->rpending = 0;
+	if (p->wbuf == NULL)
+		return;
+
+	/*
+	 * session might drop to IDLE -> all buffers are flushed
+	 */
+	while ((msg = msgbuf_get(p->wbuf)) != NULL) {
+		/* skip msg header and extract type */
+		if (ibuf_skip(msg, MSGSIZE_HEADER_MARKER) == -1 ||
+		    ibuf_skip(msg, sizeof(uint16_t)) == -1 ||
+		    ibuf_get_n8(msg, &msgtype) == -1) {
+			log_peer_warn(&p->conf, "process message failed");
+			bgp_fsm(p, EVNT_CON_FATAL, NULL);
+			ibuf_free(msg);
+			return;
+		}
+		ibuf_rewind(msg);	/* the MRT dump gets the full message */
+
+		session_mrt_dump_bgp_msg(p, msg, msgtype, DIR_IN);
+
+		ibuf_skip(msg, MSGSIZE_HEADER);	/* handlers start past it */
+
+		switch (msgtype) {
+		case BGP_OPEN:
+			bgp_fsm(p, EVNT_RCVD_OPEN, msg);
+			p->stats.msg_rcvd_open++;
+			break;
+		case BGP_UPDATE:
+			bgp_fsm(p, EVNT_RCVD_UPDATE, msg);
+			p->stats.msg_rcvd_update++;
+			break;
+		case BGP_NOTIFICATION:
+			bgp_fsm(p, EVNT_RCVD_NOTIFICATION, msg);
+			p->stats.msg_rcvd_notification++;
+			break;
+		case BGP_KEEPALIVE:
+			bgp_fsm(p, EVNT_RCVD_KEEPALIVE, msg);
+			p->stats.msg_rcvd_keepalive++;
+			break;
+		case BGP_RREFRESH:
+			parse_rrefresh(p, msg);	/* not routed via bgp_fsm() */
+			p->stats.msg_rcvd_rrefresh++;
+			break;
+		default:	/* cannot happen */
+			session_notification_data(p, ERR_HEADER, ERR_HDR_TYPE,
+			    &msgtype, 1);
+			log_peer_warnx(&p->conf,
+			    "received message with unknown type %u", msgtype);
+			bgp_fsm(p, EVNT_CON_FATAL, NULL);
+		}
+		ibuf_free(msg);
+		if (++processed > MSG_PROCESS_LIMIT) {
+			p->rpending = 1;	/* there may be more queued */
+			break;
+		}
+	}
+}
+
+int
+capa_neg_calc(struct peer *p)
+{	/* fill p->capa.neg from capa.ann and capa.peer; 0 or -1 on error */
+	struct ibuf *ebuf;
+	uint8_t	i, hasmp = 0, capa_code, capa_len, capa_aid = 0;
+
+	/* a capability is accepted only if both sides announced it */
+
+	p->capa.neg.refresh =
+	    (p->capa.ann.refresh && p->capa.peer.refresh) != 0;
+	p->capa.neg.enhanced_rr =
+	    (p->capa.ann.enhanced_rr && p->capa.peer.enhanced_rr) != 0;
+	p->capa.neg.as4byte =
+	    (p->capa.ann.as4byte && p->capa.peer.as4byte) != 0;
+	p->capa.neg.ext_msg =
+	    (p->capa.ann.ext_msg && p->capa.peer.ext_msg) != 0;
+
+	/* MP: both side must agree on the AFI,SAFI pair */
+	if (p->capa.peer.mp[AID_UNSPEC])
+		hasmp = 1;
+	for (i = AID_MIN; i < AID_MAX; i++) {
+		if (p->capa.ann.mp[i] && p->capa.peer.mp[i])
+			p->capa.neg.mp[i] = 1;
+		else
+			p->capa.neg.mp[i] = 0;
+		if (p->capa.ann.mp[i] || p->capa.peer.mp[i])
+			hasmp = 1;
+	}
+	/* if no MP capability present default to IPv4 unicast mode */
+	if (!hasmp)
+		p->capa.neg.mp[AID_INET] = 1;
+
+	/*
+	 * graceful restart: the peer capabilities are of interest here.
+	 * It is necessary to compare the new values with the previous ones
+	 * and act accordingly. AFI/SAFI that are not part in the MP capability
+	 * are treated as not being present.
+	 * Also make sure that a flush happens if the session stopped
+	 * supporting graceful restart.
+	 */
+
+	for (i = AID_MIN; i < AID_MAX; i++) {
+		int8_t	negflags;	/* neg flags before the update */
+
+		/* disable GR if the AFI/SAFI is not present */
+		if ((p->capa.peer.grestart.flags[i] & CAPA_GR_PRESENT &&
+		    p->capa.neg.mp[i] == 0))
+			p->capa.peer.grestart.flags[i] = 0;	/* disable */
+		/* look at current GR state and decide what to do */
+		negflags = p->capa.neg.grestart.flags[i];
+		p->capa.neg.grestart.flags[i] = p->capa.peer.grestart.flags[i];
+		if (negflags & CAPA_GR_RESTARTING) {
+			if (p->capa.ann.grestart.restart != 0 &&
+			    p->capa.peer.grestart.flags[i] & CAPA_GR_FORWARD) {
+				p->capa.neg.grestart.flags[i] |=
+				    CAPA_GR_RESTARTING;
+			} else {
+				if (session_graceful_flush(p, i,
+				    "not restarted") == -1)
+					return (-1);
+			}
+		}
+	}
+	p->capa.neg.grestart.timeout = p->capa.peer.grestart.timeout;
+	p->capa.neg.grestart.restart = p->capa.peer.grestart.restart;
+	if (p->capa.ann.grestart.restart == 0)
+		p->capa.neg.grestart.restart = 0;
+
+	/* RFC 8538 graceful notification: both sides need to agree */
+	p->capa.neg.grestart.grnotification =
+	    (p->capa.ann.grestart.grnotification &&
+	    p->capa.peer.grestart.grnotification) != 0;
+
+	/* RFC 8950 extended nexthop encoding: both sides need to agree */
+	memset(p->capa.neg.ext_nh, 0, sizeof(p->capa.neg.ext_nh));
+	for (i = AID_MIN; i < AID_MAX; i++) {
+		if (p->capa.neg.mp[i] == 0)
+			continue;
+		if (p->capa.ann.ext_nh[i] && p->capa.peer.ext_nh[i]) {
+			p->capa.neg.ext_nh[i] = 1;
+		}
+	}
+
+	/*
+	 * ADD-PATH: set only those bits where both sides agree.
+	 * For this compare our send bit with the recv bit from the peer
+	 * and vice versa.
+	 * The flags are stored from this systems view point.
+	 * At index 0 the flags are set if any per-AID flag is set.
+	 */
+	memset(p->capa.neg.add_path, 0, sizeof(p->capa.neg.add_path));
+	for (i = AID_MIN; i < AID_MAX; i++) {
+		if (p->capa.neg.mp[i] == 0)
+			continue;
+		if ((p->capa.ann.add_path[i] & CAPA_AP_RECV) &&
+		    (p->capa.peer.add_path[i] & CAPA_AP_SEND)) {
+			p->capa.neg.add_path[i] |= CAPA_AP_RECV;
+			p->capa.neg.add_path[0] |= CAPA_AP_RECV;
+		}
+		if ((p->capa.ann.add_path[i] & CAPA_AP_SEND) &&
+		    (p->capa.peer.add_path[i] & CAPA_AP_RECV)) {
+			p->capa.neg.add_path[i] |= CAPA_AP_SEND;
+			p->capa.neg.add_path[0] |= CAPA_AP_SEND;
+		}
+	}
+
+	/*
+	 * Open policy: check that the policy is sensible.
+	 *
+	 * Make sure that the roles match and set the negotiated capability
+	 * to the role of the peer. So the RDE can inject the OTC attribute.
+	 * See RFC 9234, section 4.2.
+	 * These checks should only happen on ebgp sessions.
+	 */
+	if (p->capa.ann.policy != 0 && p->capa.peer.policy != 0 &&
+	    p->conf.ebgp) {
+		switch (p->conf.role) {
+		case ROLE_PROVIDER:
+			if (p->remote_role != ROLE_CUSTOMER)
+				goto policyfail;
+			break;
+		case ROLE_RS:
+			if (p->remote_role != ROLE_RS_CLIENT)
+				goto policyfail;
+			break;
+		case ROLE_RS_CLIENT:
+			if (p->remote_role != ROLE_RS)
+				goto policyfail;
+			break;
+		case ROLE_CUSTOMER:
+			if (p->remote_role != ROLE_PROVIDER)
+				goto policyfail;
+			break;
+		case ROLE_PEER:
+			if (p->remote_role != ROLE_PEER)
+				goto policyfail;
+			break;
+		default:
+ policyfail:
+			log_peer_warnx(&p->conf, "open policy role mismatch: "
+			    "our role %s, their role %s",
+			    log_policy(p->conf.role),
+			    log_policy(p->remote_role));
+			session_notification(p, ERR_OPEN, ERR_OPEN_ROLE, NULL);
+			return (-1);
+		}
+		p->capa.neg.policy = 1;
+	}
+
+	/* enforce presence of open policy role capability */
+	if (p->capa.ann.policy == 2 && p->capa.peer.policy == 0 &&
+	    p->conf.ebgp) {
+		log_peer_warnx(&p->conf, "open policy role enforced but "
+		    "not present");
+		session_notification(p, ERR_OPEN, ERR_OPEN_ROLE, NULL);
+		return (-1);
+	}
+
+	/* enforce presence of other capabilities (ann value 2 = enforce) */
+	if (p->capa.ann.refresh == 2 && p->capa.neg.refresh == 0) {
+		capa_code = CAPA_REFRESH;
+		capa_len = 0;
+		goto fail;
+	}
+	/* each check below records code and length for the fail path */
+	if (p->capa.ann.ext_msg == 2 && p->capa.neg.ext_msg == 0) {
+		capa_code = CAPA_EXT_MSG;
+		capa_len = 0;
+		goto fail;
+	}
+	if (p->capa.ann.enhanced_rr == 2 && p->capa.neg.enhanced_rr == 0) {
+		capa_code = CAPA_ENHANCED_RR;
+		capa_len = 0;
+		goto fail;
+	}
+	if (p->capa.ann.as4byte == 2 && p->capa.neg.as4byte == 0) {
+		capa_code = CAPA_AS4BYTE;
+		capa_len = 4;
+		goto fail;
+	}
+	if (p->capa.ann.grestart.restart == 2 &&
+	    p->capa.neg.grestart.restart == 0) {
+		capa_code = CAPA_RESTART;
+		capa_len = 2;
+		goto fail;
+	}
+	for (i = AID_MIN; i < AID_MAX; i++) {
+		if (p->capa.ann.mp[i] == 2 && p->capa.neg.mp[i] == 0) {
+			capa_code = CAPA_MP;
+			capa_len = 4;
+			capa_aid = i;
+			goto fail;
+		}
+	}
+
+	for (i = AID_MIN; i < AID_MAX; i++) {
+		if (p->capa.neg.mp[i] == 0)
+			continue;
+		if ((p->capa.ann.add_path[i] & CAPA_AP_RECV_ENFORCE) &&
+		    (p->capa.neg.add_path[i] & CAPA_AP_RECV) == 0) {
+			capa_code = CAPA_ADD_PATH;
+			capa_len = 4;
+			capa_aid = i;
+			goto fail;
+		}
+		if ((p->capa.ann.add_path[i] & CAPA_AP_SEND_ENFORCE) &&
+		    (p->capa.neg.add_path[i] & CAPA_AP_SEND) == 0) {
+			capa_code = CAPA_ADD_PATH;
+			capa_len = 4;
+			capa_aid = i;
+			goto fail;
+		}
+	}
+
+	for (i = AID_MIN; i < AID_MAX; i++) {
+		if (p->capa.neg.mp[i] == 0)
+			continue;
+		if (p->capa.ann.ext_nh[i] == 2 &&
+		    p->capa.neg.ext_nh[i] == 0) {
+			capa_code = CAPA_EXT_NEXTHOP;
+			capa_len = 6;
+			capa_aid = i;
+			goto fail;
+		}
+	}
+	return (0);
+
+ fail:
+	if ((ebuf = ibuf_dynamic(2, 256)) == NULL)
+		return (-1);
+	/* best effort, no problem if it fails */
+	session_capa_add(ebuf, capa_code, capa_len);
+	if (capa_code == CAPA_MP)
+		session_capa_add_mp(ebuf, capa_aid);
+	else if (capa_code == CAPA_ADD_PATH)
+		session_capa_add_afi(ebuf, capa_aid, 0);
+	else if (capa_code == CAPA_EXT_NEXTHOP)
+		session_capa_add_ext_nh(ebuf, capa_aid);
+	else if (capa_len > 0)
+		ibuf_add_zero(ebuf, capa_len);	/* zero filled value */
+
+	session_notification(p, ERR_OPEN, ERR_OPEN_CAPA, ebuf);
+	ibuf_free(ebuf);
+	return (-1);
+}
+
+static void
+session_tcp_established(struct peer *peer)
+{	/* record both endpoint addresses of the socket in peer->fd */
+	struct sockaddr_storage	 addr;
+	struct sockaddr		*sa = (struct sockaddr *)&addr;
+	socklen_t		 salen = sizeof(addr);
+
+	if (getsockname(peer->fd, sa, &salen) == -1)	/* our end */
+		log_warn("getsockname");
+	sa2addr(sa, &peer->local, &peer->local_port);
+	salen = sizeof(addr);
+	if (getpeername(peer->fd, sa, &salen) == -1)	/* their end */
+		log_warn("getpeername");
+	sa2addr(sa, &peer->remote, &peer->remote_port);
+
+	get_alternate_addr(&peer->local, &peer->remote, &peer->local_alt,
+	    &peer->if_scope);
+}
+
+void
+bgp_fsm(struct peer *peer, enum session_events event, struct ibuf *msg)
+{	/* BGP state machine: handle event according to peer->state */
+	switch (peer->state) {
+	case STATE_NONE:
+		/* nothing: no event is handled in STATE_NONE */
+		break;
+	case STATE_IDLE:
+		switch (event) {
+		case EVNT_START:
+			timer_stop(&peer->timers, Timer_Hold);
+			timer_stop(&peer->timers, Timer_SendHold);
+			timer_stop(&peer->timers, Timer_Keepalive);
+			timer_stop(&peer->timers, Timer_IdleHold);
+
+			if (!peer->depend_ok)
+				timer_stop(&peer->timers, Timer_ConnectRetry);
+			else if (peer->passive || peer->conf.passive ||
+			    peer->conf.template) {
+				/* no connect attempt from our side */
+				change_state(peer, STATE_ACTIVE, event);
+				timer_stop(&peer->timers, Timer_ConnectRetry);
+			} else {
+				change_state(peer, STATE_CONNECT, event);
+				timer_set(&peer->timers, Timer_ConnectRetry,
+				    peer->connectretry);
+				session_connect(peer);
+			}
+			peer->passive = 0;
+			break;
+		case EVNT_STOP:
+			timer_stop(&peer->timers, Timer_IdleHold);
+			break;
+		default:
+			/* ignore */
+			break;
+		}
+		break;
+	case STATE_CONNECT:
+		switch (event) {
+		case EVNT_START:
+			/* ignore */
+			break;
+		case EVNT_CON_OPEN:
+			session_tcp_established(peer);
+			session_open(peer);
+			timer_stop(&peer->timers, Timer_ConnectRetry);
+			peer->holdtime = INTERVAL_HOLD_INITIAL;
+			start_timer_holdtime(peer);
+			change_state(peer, STATE_OPENSENT, event);
+			break;
+		case EVNT_CON_OPENFAIL:
+			timer_set(&peer->timers, Timer_ConnectRetry,
+			    peer->connectretry);
+			session_close(peer);
+			change_state(peer, STATE_ACTIVE, event);
+			break;
+		case EVNT_TIMER_CONNRETRY:
+			timer_set(&peer->timers, Timer_ConnectRetry,
+			    peer->connectretry);
+			session_connect(peer);
+			break;
+		default:
+			change_state(peer, STATE_IDLE, event);
+			break;
+		}
+		break;
+	case STATE_ACTIVE:
+		switch (event) {
+		case EVNT_START:
+			/* ignore */
+			break;
+		case EVNT_CON_OPEN:
+			session_tcp_established(peer);
+			session_open(peer);
+			timer_stop(&peer->timers, Timer_ConnectRetry);
+			peer->holdtime = INTERVAL_HOLD_INITIAL;
+			start_timer_holdtime(peer);
+			change_state(peer, STATE_OPENSENT, event);
+			break;
+		case EVNT_CON_OPENFAIL:
+			timer_set(&peer->timers, Timer_ConnectRetry,
+			    peer->connectretry);
+			session_close(peer);
+			change_state(peer, STATE_ACTIVE, event);
+			break;
+		case EVNT_TIMER_CONNRETRY:
+			timer_set(&peer->timers, Timer_ConnectRetry,
+			    peer->holdtime);	/* NOTE(review): not connectretry? */
+			change_state(peer, STATE_CONNECT, event);
+			session_connect(peer);
+			break;
+		default:
+			change_state(peer, STATE_IDLE, event);
+			break;
+		}
+		break;
+	case STATE_OPENSENT:
+		switch (event) {
+		case EVNT_START:
+			/* ignore */
+			break;
+		case EVNT_STOP:
+			change_state(peer, STATE_IDLE, event);
+			break;
+		case EVNT_CON_CLOSED:
+			session_close(peer);
+			timer_set(&peer->timers, Timer_ConnectRetry,
+			    peer->connectretry);
+			change_state(peer, STATE_ACTIVE, event);
+			break;
+		case EVNT_CON_FATAL:
+			change_state(peer, STATE_IDLE, event);
+			break;
+		case EVNT_TIMER_HOLDTIME:
+			session_notification(peer, ERR_HOLDTIMEREXPIRED,
+			    0, NULL);
+			change_state(peer, STATE_IDLE, event);
+			break;
+		case EVNT_TIMER_SENDHOLD:
+			session_notification(peer, ERR_SENDHOLDTIMEREXPIRED,
+			    0, NULL);
+			change_state(peer, STATE_IDLE, event);
+			break;
+		case EVNT_RCVD_OPEN:
+			/* parse_open calls change_state itself on failure */
+			if (parse_open(peer, msg))
+				break;
+			session_keepalive(peer);
+			change_state(peer, STATE_OPENCONFIRM, event);
+			break;
+		case EVNT_RCVD_NOTIFICATION:
+			parse_notification(peer, msg);
+			break;
+		default:
+			session_notification(peer,
+			    ERR_FSM, ERR_FSM_UNEX_OPENSENT, NULL);
+			change_state(peer, STATE_IDLE, event);
+			break;
+		}
+		break;
+	case STATE_OPENCONFIRM:
+		switch (event) {
+		case EVNT_START:
+			/* ignore */
+			break;
+		case EVNT_STOP:
+			change_state(peer, STATE_IDLE, event);
+			break;
+		case EVNT_CON_CLOSED:
+		case EVNT_CON_FATAL:
+			change_state(peer, STATE_IDLE, event);
+			break;
+		case EVNT_TIMER_HOLDTIME:
+			session_notification(peer, ERR_HOLDTIMEREXPIRED,
+			    0, NULL);
+			change_state(peer, STATE_IDLE, event);
+			break;
+		case EVNT_TIMER_SENDHOLD:
+			session_notification(peer, ERR_SENDHOLDTIMEREXPIRED,
+			    0, NULL);
+			change_state(peer, STATE_IDLE, event);
+			break;
+		case EVNT_TIMER_KEEPALIVE:
+			session_keepalive(peer);
+			break;
+		case EVNT_RCVD_KEEPALIVE:
+			start_timer_holdtime(peer);
+			change_state(peer, STATE_ESTABLISHED, event);
+			break;
+		case EVNT_RCVD_NOTIFICATION:
+			parse_notification(peer, msg);
+			break;
+		default:
+			session_notification(peer,
+			    ERR_FSM, ERR_FSM_UNEX_OPENCONFIRM, NULL);
+			change_state(peer, STATE_IDLE, event);
+			break;
+		}
+		break;
+	case STATE_ESTABLISHED:
+		switch (event) {
+		case EVNT_START:
+			/* ignore */
+			break;
+		case EVNT_STOP:
+			change_state(peer, STATE_IDLE, event);
+			break;
+		case EVNT_CON_CLOSED:
+		case EVNT_CON_FATAL:
+			change_state(peer, STATE_IDLE, event);
+			break;
+		case EVNT_TIMER_HOLDTIME:
+			session_notification(peer, ERR_HOLDTIMEREXPIRED,
+			    0, NULL);
+			change_state(peer, STATE_IDLE, event);
+			break;
+		case EVNT_TIMER_SENDHOLD:
+			session_notification(peer, ERR_SENDHOLDTIMEREXPIRED,
+			    0, NULL);
+			change_state(peer, STATE_IDLE, event);
+			break;
+		case EVNT_TIMER_KEEPALIVE:
+			session_keepalive(peer);
+			break;
+		case EVNT_RCVD_KEEPALIVE:
+			start_timer_holdtime(peer);
+			break;
+		case EVNT_RCVD_UPDATE:
+			start_timer_holdtime(peer);
+			if (parse_update(peer, msg))
+				change_state(peer, STATE_IDLE, event);
+			else	/* NOTE(review): repeats the call above */
+				start_timer_holdtime(peer);
+			break;
+		case EVNT_RCVD_NOTIFICATION:
+			parse_notification(peer, msg);
+			break;
+		default:
+			session_notification(peer,
+			    ERR_FSM, ERR_FSM_UNEX_ESTABLISHED, NULL);
+			change_state(peer, STATE_IDLE, event);
+			break;
+		}
+		break;
+	}
+}
+
+static void
+start_timer_holdtime(struct peer *peer)
+{
+	if (peer->holdtime <= 0)	/* no holdtime: hold timer stays off */
+		timer_stop(&peer->timers, Timer_Hold);
+	else
+		timer_set(&peer->timers, Timer_Hold, peer->holdtime);
+}
+
+void
+start_timer_sendholdtime(struct peer *peer)
+{
+	/* the send hold timer never runs shorter than INTERVAL_HOLD */
+	uint16_t sendhold = peer->holdtime > INTERVAL_HOLD ?
+	    peer->holdtime : INTERVAL_HOLD;
+
+	/* without a holdtime the timer is stopped instead of armed */
+	if (peer->holdtime <= 0)
+		timer_stop(&peer->timers, Timer_SendHold);
+	else
+		timer_set(&peer->timers, Timer_SendHold, sendhold);
+}
+
+static void
+start_timer_keepalive(struct peer *peer)
+{
+	if (peer->holdtime <= 0)	/* no holdtime: stop the timer */
+		timer_stop(&peer->timers, Timer_Keepalive);
+	else	/* otherwise fire after a third of the holdtime */
+		timer_set(&peer->timers, Timer_Keepalive, peer->holdtime / 3);
+}
+
+void
+change_state(struct peer *peer, enum session_state state,
+    enum session_events event)
+{	/* perform the side effects of entering state, then switch to it */
+	switch (state) {
+	case STATE_IDLE:
+		/* carp demotion first. new peers handled in init_peer */
+		if (peer->state == STATE_ESTABLISHED &&
+		    peer->conf.demote_group[0] && !peer->demoted)
+			session_demote(peer, +1);
+
+		/*
+		 * try to write out what's buffered (maybe a notification),
+		 * don't bother if it fails
+		 */
+		if (peer->state >= STATE_OPENSENT &&
+		    msgbuf_queuelen(peer->wbuf) > 0)
+			ibuf_write(peer->fd, peer->wbuf);
+
+		/*
+		 * we must start the timer for the next EVNT_START
+		 * if we are coming here due to an error and the
+		 * session was not established successfully before, the
+		 * starttimerinterval needs to be exponentially increased
+		 */
+		if (peer->IdleHoldTime == 0)
+			peer->IdleHoldTime = INTERVAL_IDLE_HOLD_INITIAL;
+		peer->holdtime = INTERVAL_HOLD_INITIAL;
+		timer_stop(&peer->timers, Timer_ConnectRetry);
+		timer_stop(&peer->timers, Timer_Keepalive);
+		timer_stop(&peer->timers, Timer_Hold);
+		timer_stop(&peer->timers, Timer_SendHold);
+		timer_stop(&peer->timers, Timer_IdleHold);
+		timer_stop(&peer->timers, Timer_IdleHoldReset);
+		session_close(peer);
+		msgbuf_clear(peer->wbuf);
+		peer->rpending = 0;
+		memset(&peer->capa.peer, 0, sizeof(peer->capa.peer));
+		session_md5_reload(peer);
+
+		if (peer->state == STATE_ESTABLISHED) {
+			if (peer->capa.neg.grestart.restart == 2 &&
+			    (event == EVNT_CON_CLOSED ||
+			    event == EVNT_CON_FATAL ||
+			    (peer->capa.neg.grestart.grnotification &&
+			    (event == EVNT_RCVD_GRACE_NOTIFICATION ||
+			    event == EVNT_TIMER_HOLDTIME ||
+			    event == EVNT_TIMER_SENDHOLD)))) {
+				/* don't punish graceful restart */
+				timer_set(&peer->timers, Timer_IdleHold, 0);
+				session_graceful_restart(peer);
+			} else if (event != EVNT_STOP) {
+				timer_set(&peer->timers, Timer_IdleHold,
+				    peer->IdleHoldTime);
+				if (event != EVNT_NONE &&
+				    peer->IdleHoldTime < MAX_IDLE_HOLD/2)
+					peer->IdleHoldTime *= 2;
+				session_down(peer);
+			} else {	/* EVNT_STOP: no idle hold timer is set */
+				session_down(peer);
+			}
+		} else if (event != EVNT_STOP) {
+			timer_set(&peer->timers, Timer_IdleHold,
+			    peer->IdleHoldTime);
+			if (event != EVNT_NONE &&
+			    peer->IdleHoldTime < MAX_IDLE_HOLD / 2)
+				peer->IdleHoldTime *= 2;
+		}
+
+		if (peer->state == STATE_NONE ||
+		    peer->state == STATE_ESTABLISHED) {
+			/* initialize capability negotiation structures */
+			memcpy(&peer->capa.ann, &peer->conf.capabilities,
+			    sizeof(peer->capa.ann));
+		}
+		break;
+	case STATE_CONNECT:
+		if (peer->state == STATE_ESTABLISHED &&
+		    peer->capa.neg.grestart.restart == 2) {
+			/* do the graceful restart dance */
+			session_graceful_restart(peer);
+			peer->holdtime = INTERVAL_HOLD_INITIAL;
+			timer_stop(&peer->timers, Timer_ConnectRetry);
+			timer_stop(&peer->timers, Timer_Keepalive);
+			timer_stop(&peer->timers, Timer_Hold);
+			timer_stop(&peer->timers, Timer_SendHold);
+			timer_stop(&peer->timers, Timer_IdleHold);
+			timer_stop(&peer->timers, Timer_IdleHoldReset);
+			session_close(peer);
+			msgbuf_clear(peer->wbuf);
+			memset(&peer->capa.peer, 0, sizeof(peer->capa.peer));
+		}
+		break;
+	case STATE_ACTIVE:
+		session_md5_reload(peer);
+		break;
+	case STATE_OPENSENT:
+		break;
+	case STATE_OPENCONFIRM:
+		break;
+	case STATE_ESTABLISHED:
+		timer_set(&peer->timers, Timer_IdleHoldReset,
+		    peer->IdleHoldTime);
+		if (peer->demoted)
+			timer_set(&peer->timers, Timer_CarpUndemote,
+			    INTERVAL_HOLD_DEMOTED);
+		session_up(peer);
+		break;
+	default:		/* unknown target state: nothing to set up */
+		break;
+	}
+
+	log_statechange(peer, state, event);
+
+	session_mrt_dump_state(peer, peer->state, state);	/* old, new */
+
+	peer->prev_state = peer->state;
+	peer->state = state;
+}