Download raw body.
bgpd: graceful notification rfc8538
This diff implements rfc8538 (hopefully, it is hard to test for me).
This is an extension to graceful restart (which is already something I
very much dislike). I implemented the NOTIFICATION bits as specified but
decided to take a much more strict stand when to send a graceful reset.
bgpd only sends graceful notifications for a few cease cases (same as in
the RFC) and for the holdtimer and sendholdtimer errors. Everything else
is a hard error because the other side is not trustworthy.
The logic in change_state() for handling graceful restart is IMO not quite
correct. Especially the inclusion of EVNT_CON_FATAL bothers me since that
error is used in many places. Since this is already like this I did not
bother to fix this nicer. I will have to circle back to this.
bgpd only implement the procedures for the Receiving Speaker and for the
time being I don't want to change that. I do not beleive in the concept of
graceful restart and think it is better to use GRACEFUL_SHUTDOWN and
localpref 0 to route traffic around the system before doing the restart.
--
:wq Claudio
? bgpctl/obj
? bgpd/obj
Index: bgpctl/output.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpctl/output.c,v
diff -u -p -r1.57 output.c
--- bgpctl/output.c 9 Dec 2024 10:52:27 -0000 1.57
+++ bgpctl/output.c 13 Dec 2024 06:54:45 -0000
@@ -182,9 +182,11 @@ show_neighbor_capa_restart(struct capabi
int comma;
uint8_t i;
- printf(" Graceful Restart");
+ printf(" Graceful Restart: ");
if (capa->grestart.timeout)
- printf(": Timeout: %d, ", capa->grestart.timeout);
+ printf("timeout: %d, ", capa->grestart.timeout);
+ if (capa->grestart.grnotification)
+ printf("graceful notification, ");
for (i = AID_MIN, comma = 0; i < AID_MAX; i++)
if (capa->grestart.flags[i] & CAPA_GR_PRESENT) {
if (!comma &&
Index: bgpctl/output_json.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpctl/output_json.c,v
diff -u -p -r1.49 output_json.c
--- bgpctl/output_json.c 9 Dec 2024 10:52:27 -0000 1.49
+++ bgpctl/output_json.c 13 Dec 2024 06:54:45 -0000
@@ -82,6 +82,8 @@ json_neighbor_capabilities(struct capabi
if (capa->grestart.timeout)
json_do_uint("timeout", capa->grestart.timeout);
+ if (capa->grestart.grnotification)
+ json_do_bool("graceful_notification", 1);
if (present) {
json_do_array("protocols");
Index: bgpd/bgpd.8
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/bgpd.8,v
diff -u -p -r1.80 bgpd.8
--- bgpd/bgpd.8 9 Dec 2024 10:51:46 -0000 1.80
+++ bgpd/bgpd.8 13 Dec 2024 06:54:45 -0000
@@ -478,6 +478,16 @@ has been started.
.Re
.Pp
.Rs
+.%A K. Patel
+.%A R. Fernando
+.%A J. Scudder
+.%A J. Haas
+.%D March 2019
+.%R RFC 8538
+.%T Notification Message Support for BGP Graceful Restart
+.Re
+.Pp
+.Rs
.%A R. Bush
.%A K. Patel
.%A D. Ward
Index: bgpd/bgpd.conf.5
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/bgpd.conf.5,v
diff -u -p -r1.244 bgpd.conf.5
--- bgpd/bgpd.conf.5 9 Dec 2024 11:38:38 -0000 1.244
+++ bgpd/bgpd.conf.5 13 Dec 2024 06:54:46 -0000
@@ -397,6 +397,10 @@ is the routing domain in which
has been started.
By default, no restricted socket is created.
.Pp
+.It Ic staletime Ar seconds
+Set the upper bound in seconds stale routes are kept during graceful restart.
+The default is 180 seconds.
+.Pp
.It Xo
.Ic transparent-as
.Pq Ic yes Ns | Ns Ic no
@@ -1107,6 +1111,18 @@ The default is
.Ic no .
.Pp
.It Xo
+.Ic announce graceful notification
+.Pq Ic yes Ns | Ns Ic no
+.Xc
+If set to
+.Ic yes ,
+the graceful notification extension to graceful restart is announced.
+The default is
+.Ic no .
+.Ic announce refresh
+must be enabled to enable graceful notifications.
+.Pp
+.It Xo
.Ic announce policy
.Pq Ic yes Ns | Ns Ic no Ns | Ns Ic enforce
.Xc
@@ -1534,6 +1550,10 @@ and
.Ic nexthop self .
These sets are rewritten into filter rules and can be viewed with
.Dq bgpd -nv .
+.Pp
+.It Ic staletime Ar seconds
+Set the upper bound stale time in seconds for graceful restart.
+Inherited from the global configuration if not given.
.Pp
.It Ic tcp md5sig password Ar secret
.It Ic tcp md5sig key Ar secret
Index: bgpd/bgpd.h
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/bgpd.h,v
diff -u -p -r1.505 bgpd.h
--- bgpd/bgpd.h 12 Dec 2024 20:19:03 -0000 1.505
+++ bgpd/bgpd.h 13 Dec 2024 06:54:46 -0000
@@ -319,6 +319,7 @@ struct bgpd_config {
uint16_t holdtime;
uint16_t min_holdtime;
uint16_t connectretry;
+ uint16_t staletime;
uint8_t fib_priority;
uint8_t filtered_in_locrib;
};
@@ -406,6 +407,7 @@ struct capabilities {
int16_t timeout; /* graceful restart timeout */
int8_t flags[AID_MAX]; /* graceful restart per AID flags */
int8_t restart; /* graceful restart, RFC 4724 */
+ int8_t grnotification; /* graceful notification, RFC 8538 */
} grestart;
int8_t mp[AID_MAX]; /* multiprotocol extensions, RFC 4760 */
int8_t add_path[AID_MAX]; /* ADD_PATH, RFC 7911 */
@@ -435,6 +437,7 @@ enum capa_codes {
#define CAPA_GR_RESTARTING 0x08
#define CAPA_GR_TIMEMASK 0x0fff
#define CAPA_GR_R_FLAG 0x8000
+#define CAPA_GR_N_FLAG 0x4000
#define CAPA_GR_F_FLAG 0x80
/* flags for RFC 7911 - enhanced router refresh */
@@ -478,6 +481,7 @@ struct peer_config {
uint16_t max_out_prefix_restart;
uint16_t holdtime;
uint16_t min_holdtime;
+ uint16_t staletime;
uint16_t local_short_as;
uint16_t remote_port;
uint8_t template;
@@ -1662,7 +1666,8 @@ static const char * const eventnames[] =
"OPEN message received",
"KEEPALIVE message received",
"UPDATE message received",
- "NOTIFICATION received"
+ "NOTIFICATION received",
+ "graceful NOTIFICATION received",
};
static const char * const errnames[] = {
Index: bgpd/config.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/config.c,v
diff -u -p -r1.112 config.c
--- bgpd/config.c 1 Oct 2024 11:49:24 -0000 1.112
+++ bgpd/config.c 13 Dec 2024 06:54:46 -0000
@@ -85,6 +85,7 @@ copy_config(struct bgpd_config *to, stru
to->short_as = from->short_as;
to->holdtime = from->holdtime;
to->min_holdtime = from->min_holdtime;
+ to->staletime = from->staletime;
to->connectretry = from->connectretry;
to->fib_priority = from->fib_priority;
to->filtered_in_locrib = from->filtered_in_locrib;
Index: bgpd/parse.y
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/parse.y,v
diff -u -p -r1.472 parse.y
--- bgpd/parse.y 10 Dec 2024 16:29:07 -0000 1.472
+++ bgpd/parse.y 13 Dec 2024 06:54:46 -0000
@@ -243,13 +243,13 @@ typedef struct {
%token AS ROUTERID HOLDTIME YMIN LISTEN ON FIBUPDATE FIBPRIORITY RTABLE
%token NONE UNICAST VPN RD EXPORT EXPORTTRGT IMPORTTRGT DEFAULTROUTE
-%token RDE RIB EVALUATE IGNORE COMPARE RTR PORT MINVERSION
+%token RDE RIB EVALUATE IGNORE COMPARE RTR PORT MINVERSION STALETIME
%token GROUP NEIGHBOR NETWORK
%token EBGP IBGP
%token FLOWSPEC PROTO FLAGS FRAGMENT TOS LENGTH ICMPTYPE CODE
%token LOCALAS REMOTEAS DESCR LOCALADDR MULTIHOP PASSIVE MAXPREFIX RESTART
%token ANNOUNCE REFRESH AS4BYTE CONNECTRETRY ENHANCED ADDPATH EXTENDED
-%token SEND RECV PLUS POLICY ROLE
+%token SEND RECV PLUS POLICY ROLE GRACEFU NOTIFICATIONL
%token DEMOTE ENFORCE NEIGHBORAS ASOVERRIDE REFLECTOR DEPEND DOWN
%token DUMP IN OUT SOCKET RESTRICTED
%token LOG TRANSPARENT FILTERED
@@ -775,6 +775,14 @@ conf_main : AS as4number {
}
conf->min_holdtime = $3;
}
+ | STALETIME NUMBER {
+ if ($2 < MIN_HOLDTIME || $2 > USHRT_MAX) {
+ yyerror("staletime must be between %u and %u",
+ MIN_HOLDTIME, USHRT_MAX);
+ YYERROR;
+ }
+ conf->staletime = $2;
+ }
| LISTEN ON address {
struct listen_addr *la;
struct sockaddr *sa;
@@ -1913,6 +1921,14 @@ peeropts : REMOTEAS as4number {
}
curpeer->conf.min_holdtime = $3;
}
+ | STALETIME NUMBER {
+ if ($2 < MIN_HOLDTIME || $2 > USHRT_MAX) {
+ yyerror("staletime must be between %u and %u",
+ MIN_HOLDTIME, USHRT_MAX);
+ YYERROR;
+ }
+ curpeer->conf.staletime = $2;
+ }
| ANNOUNCE af safi enforce {
uint8_t aid, safi;
uint16_t afi;
@@ -1944,6 +1960,9 @@ peeropts : REMOTEAS as4number {
| ANNOUNCE RESTART yesnoenforce {
curpeer->conf.capabilities.grestart.restart = $3;
}
+ | ANNOUNCE GRACEFUL NOTIFICATION yesno {
+ curpeer->conf.capabilities.grestart.grnotification = $4;
+ }
| ANNOUNCE AS4BYTE yesnoenforce {
curpeer->conf.capabilities.as4byte = $3;
}
@@ -3547,6 +3566,7 @@ lookup(char *s)
{ "flowspec", FLOWSPEC },
{ "fragment", FRAGMENT },
{ "from", FROM },
+ { "graceful", GRACEFUL },
{ "group", GROUP },
{ "holdtime", HOLDTIME },
{ "ibgp", IBGP },
@@ -3586,6 +3606,7 @@ lookup(char *s)
{ "nexthop", NEXTHOP },
{ "no-modify", NOMODIFY },
{ "none", NONE },
+ { "notification", NOTIFICATION },
{ "on", ON },
{ "or-longer", LONGER },
{ "origin", ORIGIN },
@@ -3631,6 +3652,7 @@ lookup(char *s)
{ "socket", SOCKET },
{ "source-as", SOURCEAS },
{ "spi", SPI },
+ { "staletime", STALETIME },
{ "static", STATIC },
{ "tcp", TCP },
{ "to", TO },
@@ -4029,6 +4051,7 @@ init_config(struct bgpd_config *c)
c->min_holdtime = MIN_HOLDTIME;
c->holdtime = INTERVAL_HOLD;
+ c->staletime = INTERVAL_STALE;
c->connectretry = INTERVAL_CONNECTRETRY;
c->bgpid = get_bgpid();
c->fib_priority = kr_default_prio();
Index: bgpd/printconf.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/printconf.c,v
diff -u -p -r1.177 printconf.c
--- bgpd/printconf.c 9 Dec 2024 10:51:46 -0000 1.177
+++ bgpd/printconf.c 13 Dec 2024 06:54:46 -0000
@@ -394,6 +394,8 @@ print_mainconf(struct bgpd_config *conf)
printf("holdtime min %u\n", conf->min_holdtime);
if (conf->connectretry != INTERVAL_CONNECTRETRY)
printf("connect-retry %u\n", conf->connectretry);
+ if (conf->staletime != INTERVAL_STALE)
+ printf("staletime %u\n", conf->staletime);
if (conf->flags & BGPD_FLAG_DECISION_ROUTEAGE)
printf("rde route-age evaluate\n");
@@ -817,6 +819,8 @@ print_peer(struct peer *peer, struct bgp
printf("%s\tholdtime %u\n", c, p->holdtime);
if (p->min_holdtime)
printf("%s\tholdtime min %u\n", c, p->min_holdtime);
+ if (p->staletime)
+ printf("%s\tstaletime %u\n", c, p->staletime);
if (p->export_type == EXPORT_NONE)
printf("%s\texport none\n", c);
else if (p->export_type == EXPORT_DEFAULT_ROUTE)
@@ -954,6 +958,10 @@ print_announce(struct peer_config *p, co
printf("%s\tannounce restart enforce\n", c);
else if (p->capabilities.grestart.restart == 0)
printf("%s\tannounce restart no\n", c);
+
+ if (p->capabilities.grestart.restart != 0 &&
+ p->capabilities.grestart.grnotification)
+ printf("%s\tannounce graceful notification yes\n", c);
if (p->capabilities.as4byte == 2)
printf("%s\tannounce as4byte enforce\n", c);
Index: bgpd/session.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/session.c,v
diff -u -p -r1.503 session.c
--- bgpd/session.c 12 Dec 2024 20:19:03 -0000 1.503
+++ bgpd/session.c 13 Dec 2024 06:54:46 -0000
@@ -922,24 +922,33 @@ change_state(struct peer *peer, enum ses
imsg_compose(ibuf_main, IMSG_PFKEY_RELOAD,
peer->conf.id, 0, -1, NULL, 0);
- if (event != EVNT_STOP) {
- timer_set(&peer->timers, Timer_IdleHold,
- peer->IdleHoldTime);
- if (event != EVNT_NONE &&
- peer->IdleHoldTime < MAX_IDLE_HOLD/2)
- peer->IdleHoldTime *= 2;
- }
if (peer->state == STATE_ESTABLISHED) {
if (peer->capa.neg.grestart.restart == 2 &&
(event == EVNT_CON_CLOSED ||
- event == EVNT_CON_FATAL)) {
+ event == EVNT_CON_FATAL ||
+ (peer->capa.neg.grestart.grnotification &&
+ (event == EVNT_RCVD_GRACE_NOTIFICATION ||
+ event == EVNT_TIMER_HOLDTIME ||
+ event == EVNT_TIMER_SENDHOLD)))) {
/* don't punish graceful restart */
timer_set(&peer->timers, Timer_IdleHold, 0);
- peer->IdleHoldTime /= 2;
session_graceful_restart(peer);
- } else
+ } else {
+ timer_set(&peer->timers, Timer_IdleHold,
+ peer->IdleHoldTime);
+ if (event != EVNT_NONE &&
+ peer->IdleHoldTime < MAX_IDLE_HOLD/2)
+ peer->IdleHoldTime *= 2;
session_down(peer);
+ }
+ } else if (event != EVNT_STOP) {
+ timer_set(&peer->timers, Timer_IdleHold,
+ peer->IdleHoldTime);
+ if (event != EVNT_NONE &&
+ peer->IdleHoldTime < MAX_IDLE_HOLD / 2)
+ peer->IdleHoldTime *= 2;
}
+
if (peer->state == STATE_NONE ||
peer->state == STATE_ESTABLISHED) {
/* initialize capability negotiation structures */
@@ -1532,6 +1541,8 @@ session_open(struct peer *p)
/* Only set the R-flag if no graceful restart is ongoing */
if (!rst)
hdr |= CAPA_GR_R_FLAG;
+ if (p->capa.ann.grestart.grnotification)
+ hdr |= CAPA_GR_N_FLAG;
errs += session_capa_add(opb, CAPA_RESTART, sizeof(hdr));
errs += ibuf_add_n16(opb, hdr);
}
@@ -1694,6 +1705,38 @@ session_update(uint32_t peerid, struct i
p->stats.msg_sent_update++;
}
+static int
+session_req_hard_reset(enum err_codes errcode, uint8_t subcode)
+{
+ switch (errcode) {
+ case ERR_HEADER:
+ case ERR_OPEN:
+ case ERR_UPDATE:
+ case ERR_FSM:
+ case ERR_RREFRESH:
+ /*
+ * Protocol errors trigger a hard reset. The peer
+ * is not trustworthy and so there is no realistic
+ * hope that forwarding can continue.
+ */
+ return 1;
+ case ERR_HOLDTIMEREXPIRED:
+ case ERR_SENDHOLDTIMEREXPIRED:
+ /* Keep forwarding and hope the other side is back soon. */
+ return 0;
+ case ERR_CEASE:
+ switch (subcode) {
+ case ERR_CEASE_CONN_REJECT:
+ case ERR_CEASE_OTHER_CHANGE:
+ case ERR_CEASE_COLLISION:
+ case ERR_CEASE_RSRC_EXHAUST:
+ /* Per RFC8538 suggestion make these graceful. */
+ return 0;
+ }
+ return 1;
+ }
+}
+
void
session_notification_data(struct peer *p, uint8_t errcode, uint8_t subcode,
void *data, size_t datalen)
@@ -1709,7 +1752,8 @@ session_notification(struct peer *p, uin
struct ibuf *ibuf)
{
struct ibuf *buf;
- int errs = 0;
+ const char *reason = "sending";
+ int errs = 0, need_hard_reset = 0;
size_t datalen = 0;
switch (p->state) {
@@ -1723,18 +1767,28 @@ session_notification(struct peer *p, uin
return;
}
- log_notification(p, errcode, subcode, ibuf, "sending");
+ if (p->capa.neg.grestart.grnotification) {
+ if (session_req_hard_reset(errcode, subcode)) {
+ need_hard_reset = 1;
+ datalen += 2;
+ reason = "sending hard-reset";
+ } else {
+ reason = "sending graceful";
+ }
+ }
+
+ log_notification(p, errcode, subcode, ibuf, reason);
/* cap to maximum size */
if (ibuf != NULL) {
if (ibuf_size(ibuf) >
- MAX_PKTSIZE - MSGSIZE_NOTIFICATION_MIN) {
+ MAX_PKTSIZE - MSGSIZE_NOTIFICATION_MIN - datalen) {
log_peer_warnx(&p->conf,
"oversized notification, data trunkated");
ibuf_truncate(ibuf, MAX_PKTSIZE -
- MSGSIZE_NOTIFICATION_MIN);
+ MSGSIZE_NOTIFICATION_MIN - datalen);
}
- datalen = ibuf_size(ibuf);
+ datalen += ibuf_size(ibuf);
}
if ((buf = session_newmsg(NOTIFICATION,
@@ -1743,6 +1797,11 @@ session_notification(struct peer *p, uin
return;
}
+ if (need_hard_reset) {
+ errs += ibuf_add_n8(buf, ERR_CEASE);
+ errs += ibuf_add_n8(buf, ERR_CEASE_HARD_RESET);
+ }
+
errs += ibuf_add_n8(buf, errcode);
errs += ibuf_add_n8(buf, subcode);
@@ -1829,9 +1888,15 @@ int
session_graceful_restart(struct peer *p)
{
uint8_t i;
+ uint16_t staletime = conf->staletime;
- timer_set(&p->timers, Timer_RestartTimeout,
- p->capa.neg.grestart.timeout);
+ if (p->conf.staletime)
+ staletime = p->conf.staletime;
+
+ /* RFC 8538: enforce configurable upper bound of the stale timer */
+ if (staletime > p->capa.neg.grestart.timeout)
+ staletime = p->capa.neg.grestart.timeout;
+ timer_set(&p->timers, Timer_RestartTimeout, staletime);
for (i = AID_MIN; i < AID_MAX; i++) {
if (p->capa.neg.grestart.flags[i] & CAPA_GR_PRESENT) {
@@ -2417,8 +2482,10 @@ parse_rrefresh(struct peer *peer, struct
void
parse_notification(struct peer *peer, struct ibuf *msg)
{
- uint8_t errcode, subcode;
- uint8_t reason_len;
+ const char *reason = "received";
+ uint8_t errcode, subcode;
+ uint8_t reason_len;
+ enum session_events event = EVNT_RCVD_NOTIFICATION;
if (ibuf_get_n8(msg, &errcode) == -1 ||
ibuf_get_n8(msg, &subcode) == -1) {
@@ -2426,11 +2493,27 @@ parse_notification(struct peer *peer, st
goto done;
}
+ /* RFC8538: check for hard-reset or graceful notification */
+ if (peer->capa.neg.grestart.grnotification) {
+ if (errcode == ERR_CEASE && subcode == ERR_CEASE_HARD_RESET) {
+ if (ibuf_get_n8(msg, &errcode) == -1 ||
+ ibuf_get_n8(msg, &subcode) == -1) {
+ log_peer_warnx(&peer->conf,
+ "received bad hard-reset notification");
+ goto done;
+ }
+ reason = "received hard-reset";
+ } else {
+ reason = "received graceful";
+ event = EVNT_RCVD_GRACE_NOTIFICATION;
+ }
+ }
+
peer->errcnt++;
peer->stats.last_rcvd_errcode = errcode;
peer->stats.last_rcvd_suberr = subcode;
- log_notification(peer, errcode, subcode, msg, "received");
+ log_notification(peer, errcode, subcode, msg, reason);
CTASSERT(sizeof(peer->stats.last_reason) > UINT8_MAX);
memset(peer->stats.last_reason, 0, sizeof(peer->stats.last_reason));
@@ -2447,7 +2530,7 @@ parse_notification(struct peer *peer, st
}
done:
- change_state(peer, STATE_IDLE, EVNT_RCVD_NOTIFICATION);
+ change_state(peer, STATE_IDLE, event);
}
int
@@ -2568,6 +2651,8 @@ parse_capabilities(struct peer *peer, st
CAPA_GR_RESTART;
peer->capa.peer.grestart.restart = 2;
}
+ if (gr_header & CAPA_GR_N_FLAG)
+ peer->capa.peer.grestart.grnotification = 1;
break;
case CAPA_AS4BYTE:
if (capa_len != 4 ||
@@ -2704,6 +2789,11 @@ capa_neg_calc(struct peer *p)
p->capa.neg.grestart.restart = p->capa.peer.grestart.restart;
if (p->capa.ann.grestart.restart == 0)
p->capa.neg.grestart.restart = 0;
+
+ /* RFC 8538 graceful notification: both sides need to agree */
+ p->capa.neg.grestart.grnotification =
+ (p->capa.ann.grestart.grnotification &&
+ p->capa.peer.grestart.grnotification) != 0;
/*
* ADD-PATH: set only those bits where both sides agree.
Index: bgpd/session.h
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/session.h,v
diff -u -p -r1.182 session.h
--- bgpd/session.h 12 Dec 2024 20:19:03 -0000 1.182
+++ bgpd/session.h 13 Dec 2024 06:54:46 -0000
@@ -26,6 +26,7 @@
#define INTERVAL_HOLD 90
#define INTERVAL_IDLE_HOLD_INITIAL 30
#define INTERVAL_HOLD_DEMOTED 60
+#define INTERVAL_STALE 180
#define INTERVAL_SESSION_DOWN 3600
#define MAX_IDLE_HOLD 3600
#define MSGSIZE_HEADER 19
@@ -64,7 +65,8 @@ enum session_events {
EVNT_RCVD_OPEN,
EVNT_RCVD_KEEPALIVE,
EVNT_RCVD_UPDATE,
- EVNT_RCVD_NOTIFICATION
+ EVNT_RCVD_NOTIFICATION,
+ EVNT_RCVD_GRACE_NOTIFICATION,
};
enum msg_type {
bgpd: graceful notification rfc8538