From: Claudio Jeker Subject: bgpd: graceful notification rfc8538 To: tech@openbsd.org Date: Fri, 13 Dec 2024 08:04:01 +0100 This diff implements rfc8538 (hopefully, it is hard to test for me). This is an extension to graceful restart (which is already something I very much dislike). I implemented the NOTIFICATION bits as specified but decided to take a much more strict stand when to send a graceful reset. bgpd only sends graceful notifications for a few cease cases (same as in the RFC) and for the holdtimer and sendholdtimer errors. Everything else is a hard error because the other side is not trustworthy. The logic in change_state() for handling graceful restart is IMO not quite correct. Especially the inclusion of EVNT_CON_FATAL bothers me since that error is used in many places. Since this is already like this I did not bother to fix this nicer. I will have to circle back to this. bgpd only implement the procedures for the Receiving Speaker and for the time being I don't want to change that. I do not beleive in the concept of graceful restart and think it is better to use GRACEFUL_SHUTDOWN and localpref 0 to route traffic around the system before doing the restart. -- :wq Claudio ? bgpctl/obj ? bgpd/obj Index: bgpctl/output.c =================================================================== RCS file: /cvs/src/usr.sbin/bgpctl/output.c,v diff -u -p -r1.57 output.c --- bgpctl/output.c 9 Dec 2024 10:52:27 -0000 1.57 +++ bgpctl/output.c 13 Dec 2024 06:54:45 -0000 @@ -182,9 +182,11 @@ show_neighbor_capa_restart(struct capabi int comma; uint8_t i; - printf(" Graceful Restart"); + printf(" Graceful Restart: "); if (capa->grestart.timeout) - printf(": Timeout: %d, ", capa->grestart.timeout); + printf("timeout: %d, ", capa->grestart.timeout); + if (capa->grestart.grnotification) + printf("graceful notification, "); for (i = AID_MIN, comma = 0; i < AID_MAX; i++) if (capa->grestart.flags[i] & CAPA_GR_PRESENT) { if (!comma && Index: bgpctl/output_json.c =================================================================== RCS file: /cvs/src/usr.sbin/bgpctl/output_json.c,v diff -u -p -r1.49 output_json.c --- bgpctl/output_json.c 9 Dec 2024 10:52:27 -0000 1.49 +++ bgpctl/output_json.c 13 Dec 2024 06:54:45 -0000 @@ -82,6 +82,8 @@ json_neighbor_capabilities(struct capabi if (capa->grestart.timeout) json_do_uint("timeout", capa->grestart.timeout); + if (capa->grestart.grnotification) + json_do_bool("graceful_notification", 1); if (present) { json_do_array("protocols"); Index: bgpd/bgpd.8 =================================================================== RCS file: /cvs/src/usr.sbin/bgpd/bgpd.8,v diff -u -p -r1.80 bgpd.8 --- bgpd/bgpd.8 9 Dec 2024 10:51:46 -0000 1.80 +++ bgpd/bgpd.8 13 Dec 2024 06:54:45 -0000 @@ -478,6 +478,16 @@ has been started. .Re .Pp .Rs +.%A K. Patel +.%A R. Fernando +.%A J. Scudder +.%A J. Haas +.%D March 2019 +.%R RFC 8538 +.%T Notification Message Support for BGP Graceful Restart +.Re +.Pp +.Rs .%A R. Bush .%A K. Patel .%A D. Ward Index: bgpd/bgpd.conf.5 =================================================================== RCS file: /cvs/src/usr.sbin/bgpd/bgpd.conf.5,v diff -u -p -r1.244 bgpd.conf.5 --- bgpd/bgpd.conf.5 9 Dec 2024 11:38:38 -0000 1.244 +++ bgpd/bgpd.conf.5 13 Dec 2024 06:54:46 -0000 @@ -397,6 +397,10 @@ is the routing domain in which has been started. By default, no restricted socket is created. .Pp +.It Ic staletime Ar seconds +Set the upper bound in seconds stale routes are kept during graceful restart. +The default is 180 seconds. +.Pp .It Xo .Ic transparent-as .Pq Ic yes Ns | Ns Ic no @@ -1107,6 +1111,18 @@ The default is .Ic no . .Pp .It Xo +.Ic announce graceful notification +.Pq Ic yes Ns | Ns Ic no +.Xc +If set to +.Ic yes , +the graceful notification extension to graceful restart is announced. +The default is +.Ic no . +.Ic announce refresh +must be enabled to enable graceful notifications. +.Pp +.It Xo .Ic announce policy .Pq Ic yes Ns | Ns Ic no Ns | Ns Ic enforce .Xc @@ -1534,6 +1550,10 @@ and .Ic nexthop self . These sets are rewritten into filter rules and can be viewed with .Dq bgpd -nv . +.Pp +.It Ic staletime Ar seconds +Set the upper bound stale time in seconds for graceful restart. +Inherited from the global configuration if not given. .Pp .It Ic tcp md5sig password Ar secret .It Ic tcp md5sig key Ar secret Index: bgpd/bgpd.h =================================================================== RCS file: /cvs/src/usr.sbin/bgpd/bgpd.h,v diff -u -p -r1.505 bgpd.h --- bgpd/bgpd.h 12 Dec 2024 20:19:03 -0000 1.505 +++ bgpd/bgpd.h 13 Dec 2024 06:54:46 -0000 @@ -319,6 +319,7 @@ struct bgpd_config { uint16_t holdtime; uint16_t min_holdtime; uint16_t connectretry; + uint16_t staletime; uint8_t fib_priority; uint8_t filtered_in_locrib; }; @@ -406,6 +407,7 @@ struct capabilities { int16_t timeout; /* graceful restart timeout */ int8_t flags[AID_MAX]; /* graceful restart per AID flags */ int8_t restart; /* graceful restart, RFC 4724 */ + int8_t grnotification; /* graceful notification, RFC 8538 */ } grestart; int8_t mp[AID_MAX]; /* multiprotocol extensions, RFC 4760 */ int8_t add_path[AID_MAX]; /* ADD_PATH, RFC 7911 */ @@ -435,6 +437,7 @@ enum capa_codes { #define CAPA_GR_RESTARTING 0x08 #define CAPA_GR_TIMEMASK 0x0fff #define CAPA_GR_R_FLAG 0x8000 +#define CAPA_GR_N_FLAG 0x4000 #define CAPA_GR_F_FLAG 0x80 /* flags for RFC 7911 - enhanced router refresh */ @@ -478,6 +481,7 @@ struct peer_config { uint16_t max_out_prefix_restart; uint16_t holdtime; uint16_t min_holdtime; + uint16_t staletime; uint16_t local_short_as; uint16_t remote_port; uint8_t template; @@ -1662,7 +1666,8 @@ static const char * const eventnames[] = "OPEN message received", "KEEPALIVE message received", "UPDATE message received", - "NOTIFICATION received" + "NOTIFICATION received", + "graceful NOTIFICATION received", }; static const char * const errnames[] = { Index: bgpd/config.c =================================================================== RCS file: /cvs/src/usr.sbin/bgpd/config.c,v diff -u -p -r1.112 config.c --- bgpd/config.c 1 Oct 2024 11:49:24 -0000 1.112 +++ bgpd/config.c 13 Dec 2024 06:54:46 -0000 @@ -85,6 +85,7 @@ copy_config(struct bgpd_config *to, stru to->short_as = from->short_as; to->holdtime = from->holdtime; to->min_holdtime = from->min_holdtime; + to->staletime = from->staletime; to->connectretry = from->connectretry; to->fib_priority = from->fib_priority; to->filtered_in_locrib = from->filtered_in_locrib; Index: bgpd/parse.y =================================================================== RCS file: /cvs/src/usr.sbin/bgpd/parse.y,v diff -u -p -r1.472 parse.y --- bgpd/parse.y 10 Dec 2024 16:29:07 -0000 1.472 +++ bgpd/parse.y 13 Dec 2024 06:54:46 -0000 @@ -243,13 +243,13 @@ typedef struct { %token AS ROUTERID HOLDTIME YMIN LISTEN ON FIBUPDATE FIBPRIORITY RTABLE %token NONE UNICAST VPN RD EXPORT EXPORTTRGT IMPORTTRGT DEFAULTROUTE -%token RDE RIB EVALUATE IGNORE COMPARE RTR PORT MINVERSION +%token RDE RIB EVALUATE IGNORE COMPARE RTR PORT MINVERSION STALETIME %token GROUP NEIGHBOR NETWORK %token EBGP IBGP %token FLOWSPEC PROTO FLAGS FRAGMENT TOS LENGTH ICMPTYPE CODE %token LOCALAS REMOTEAS DESCR LOCALADDR MULTIHOP PASSIVE MAXPREFIX RESTART %token ANNOUNCE REFRESH AS4BYTE CONNECTRETRY ENHANCED ADDPATH EXTENDED -%token SEND RECV PLUS POLICY ROLE +%token SEND RECV PLUS POLICY ROLE GRACEFU NOTIFICATIONL %token DEMOTE ENFORCE NEIGHBORAS ASOVERRIDE REFLECTOR DEPEND DOWN %token DUMP IN OUT SOCKET RESTRICTED %token LOG TRANSPARENT FILTERED @@ -775,6 +775,14 @@ conf_main : AS as4number { } conf->min_holdtime = $3; } + | STALETIME NUMBER { + if ($2 < MIN_HOLDTIME || $2 > USHRT_MAX) { + yyerror("staletime must be between %u and %u", + MIN_HOLDTIME, USHRT_MAX); + YYERROR; + } + conf->staletime = $2; + } | LISTEN ON address { struct listen_addr *la; struct sockaddr *sa; @@ -1913,6 +1921,14 @@ peeropts : REMOTEAS as4number { } curpeer->conf.min_holdtime = $3; } + | STALETIME NUMBER { + if ($2 < MIN_HOLDTIME || $2 > USHRT_MAX) { + yyerror("staletime must be between %u and %u", + MIN_HOLDTIME, USHRT_MAX); + YYERROR; + } + curpeer->conf.staletime = $2; + } | ANNOUNCE af safi enforce { uint8_t aid, safi; uint16_t afi; @@ -1944,6 +1960,9 @@ peeropts : REMOTEAS as4number { | ANNOUNCE RESTART yesnoenforce { curpeer->conf.capabilities.grestart.restart = $3; } + | ANNOUNCE GRACEFUL NOTIFICATION yesno { + curpeer->conf.capabilities.grestart.grnotification = $4; + } | ANNOUNCE AS4BYTE yesnoenforce { curpeer->conf.capabilities.as4byte = $3; } @@ -3547,6 +3566,7 @@ lookup(char *s) { "flowspec", FLOWSPEC }, { "fragment", FRAGMENT }, { "from", FROM }, + { "graceful", GRACEFUL }, { "group", GROUP }, { "holdtime", HOLDTIME }, { "ibgp", IBGP }, @@ -3586,6 +3606,7 @@ lookup(char *s) { "nexthop", NEXTHOP }, { "no-modify", NOMODIFY }, { "none", NONE }, + { "notification", NOTIFICATION }, { "on", ON }, { "or-longer", LONGER }, { "origin", ORIGIN }, @@ -3631,6 +3652,7 @@ lookup(char *s) { "socket", SOCKET }, { "source-as", SOURCEAS }, { "spi", SPI }, + { "staletime", STALETIME }, { "static", STATIC }, { "tcp", TCP }, { "to", TO }, @@ -4029,6 +4051,7 @@ init_config(struct bgpd_config *c) c->min_holdtime = MIN_HOLDTIME; c->holdtime = INTERVAL_HOLD; + c->staletime = INTERVAL_STALE; c->connectretry = INTERVAL_CONNECTRETRY; c->bgpid = get_bgpid(); c->fib_priority = kr_default_prio(); Index: bgpd/printconf.c =================================================================== RCS file: /cvs/src/usr.sbin/bgpd/printconf.c,v diff -u -p -r1.177 printconf.c --- bgpd/printconf.c 9 Dec 2024 10:51:46 -0000 1.177 +++ bgpd/printconf.c 13 Dec 2024 06:54:46 -0000 @@ -394,6 +394,8 @@ print_mainconf(struct bgpd_config *conf) printf("holdtime min %u\n", conf->min_holdtime); if (conf->connectretry != INTERVAL_CONNECTRETRY) printf("connect-retry %u\n", conf->connectretry); + if (conf->staletime != INTERVAL_STALE) + printf("staletime %u\n", conf->staletime); if (conf->flags & BGPD_FLAG_DECISION_ROUTEAGE) printf("rde route-age evaluate\n"); @@ -817,6 +819,8 @@ print_peer(struct peer *peer, struct bgp printf("%s\tholdtime %u\n", c, p->holdtime); if (p->min_holdtime) printf("%s\tholdtime min %u\n", c, p->min_holdtime); + if (p->staletime) + printf("%s\tstaletime %u\n", c, p->staletime); if (p->export_type == EXPORT_NONE) printf("%s\texport none\n", c); else if (p->export_type == EXPORT_DEFAULT_ROUTE) @@ -954,6 +958,10 @@ print_announce(struct peer_config *p, co printf("%s\tannounce restart enforce\n", c); else if (p->capabilities.grestart.restart == 0) printf("%s\tannounce restart no\n", c); + + if (p->capabilities.grestart.restart != 0 && + p->capabilities.grestart.grnotification) + printf("%s\tannounce graceful notification yes\n", c); if (p->capabilities.as4byte == 2) printf("%s\tannounce as4byte enforce\n", c); Index: bgpd/session.c =================================================================== RCS file: /cvs/src/usr.sbin/bgpd/session.c,v diff -u -p -r1.503 session.c --- bgpd/session.c 12 Dec 2024 20:19:03 -0000 1.503 +++ bgpd/session.c 13 Dec 2024 06:54:46 -0000 @@ -922,24 +922,33 @@ change_state(struct peer *peer, enum ses imsg_compose(ibuf_main, IMSG_PFKEY_RELOAD, peer->conf.id, 0, -1, NULL, 0); - if (event != EVNT_STOP) { - timer_set(&peer->timers, Timer_IdleHold, - peer->IdleHoldTime); - if (event != EVNT_NONE && - peer->IdleHoldTime < MAX_IDLE_HOLD/2) - peer->IdleHoldTime *= 2; - } if (peer->state == STATE_ESTABLISHED) { if (peer->capa.neg.grestart.restart == 2 && (event == EVNT_CON_CLOSED || - event == EVNT_CON_FATAL)) { + event == EVNT_CON_FATAL || + (peer->capa.neg.grestart.grnotification && + (event == EVNT_RCVD_GRACE_NOTIFICATION || + event == EVNT_TIMER_HOLDTIME || + event == EVNT_TIMER_SENDHOLD)))) { /* don't punish graceful restart */ timer_set(&peer->timers, Timer_IdleHold, 0); - peer->IdleHoldTime /= 2; session_graceful_restart(peer); - } else + } else { + timer_set(&peer->timers, Timer_IdleHold, + peer->IdleHoldTime); + if (event != EVNT_NONE && + peer->IdleHoldTime < MAX_IDLE_HOLD/2) + peer->IdleHoldTime *= 2; session_down(peer); + } + } else if (event != EVNT_STOP) { + timer_set(&peer->timers, Timer_IdleHold, + peer->IdleHoldTime); + if (event != EVNT_NONE && + peer->IdleHoldTime < MAX_IDLE_HOLD / 2) + peer->IdleHoldTime *= 2; } + if (peer->state == STATE_NONE || peer->state == STATE_ESTABLISHED) { /* initialize capability negotiation structures */ @@ -1532,6 +1541,8 @@ session_open(struct peer *p) /* Only set the R-flag if no graceful restart is ongoing */ if (!rst) hdr |= CAPA_GR_R_FLAG; + if (p->capa.ann.grestart.grnotification) + hdr |= CAPA_GR_N_FLAG; errs += session_capa_add(opb, CAPA_RESTART, sizeof(hdr)); errs += ibuf_add_n16(opb, hdr); } @@ -1694,6 +1705,38 @@ session_update(uint32_t peerid, struct i p->stats.msg_sent_update++; } +static int +session_req_hard_reset(enum err_codes errcode, uint8_t subcode) +{ + switch (errcode) { + case ERR_HEADER: + case ERR_OPEN: + case ERR_UPDATE: + case ERR_FSM: + case ERR_RREFRESH: + /* + * Protocol errors trigger a hard reset. The peer + * is not trustworthy and so there is no realistic + * hope that forwarding can continue. + */ + return 1; + case ERR_HOLDTIMEREXPIRED: + case ERR_SENDHOLDTIMEREXPIRED: + /* Keep forwarding and hope the other side is back soon. */ + return 0; + case ERR_CEASE: + switch (subcode) { + case ERR_CEASE_CONN_REJECT: + case ERR_CEASE_OTHER_CHANGE: + case ERR_CEASE_COLLISION: + case ERR_CEASE_RSRC_EXHAUST: + /* Per RFC8538 suggestion make these graceful. */ + return 0; + } + return 1; + } +} + void session_notification_data(struct peer *p, uint8_t errcode, uint8_t subcode, void *data, size_t datalen) @@ -1709,7 +1752,8 @@ session_notification(struct peer *p, uin struct ibuf *ibuf) { struct ibuf *buf; - int errs = 0; + const char *reason = "sending"; + int errs = 0, need_hard_reset = 0; size_t datalen = 0; switch (p->state) { @@ -1723,18 +1767,28 @@ session_notification(struct peer *p, uin return; } - log_notification(p, errcode, subcode, ibuf, "sending"); + if (p->capa.neg.grestart.grnotification) { + if (session_req_hard_reset(errcode, subcode)) { + need_hard_reset = 1; + datalen += 2; + reason = "sending hard-reset"; + } else { + reason = "sending graceful"; + } + } + + log_notification(p, errcode, subcode, ibuf, reason); /* cap to maximum size */ if (ibuf != NULL) { if (ibuf_size(ibuf) > - MAX_PKTSIZE - MSGSIZE_NOTIFICATION_MIN) { + MAX_PKTSIZE - MSGSIZE_NOTIFICATION_MIN - datalen) { log_peer_warnx(&p->conf, "oversized notification, data trunkated"); ibuf_truncate(ibuf, MAX_PKTSIZE - - MSGSIZE_NOTIFICATION_MIN); + MSGSIZE_NOTIFICATION_MIN - datalen); } - datalen = ibuf_size(ibuf); + datalen += ibuf_size(ibuf); } if ((buf = session_newmsg(NOTIFICATION, @@ -1743,6 +1797,11 @@ session_notification(struct peer *p, uin return; } + if (need_hard_reset) { + errs += ibuf_add_n8(buf, ERR_CEASE); + errs += ibuf_add_n8(buf, ERR_CEASE_HARD_RESET); + } + errs += ibuf_add_n8(buf, errcode); errs += ibuf_add_n8(buf, subcode); @@ -1829,9 +1888,15 @@ int session_graceful_restart(struct peer *p) { uint8_t i; + uint16_t staletime = conf->staletime; - timer_set(&p->timers, Timer_RestartTimeout, - p->capa.neg.grestart.timeout); + if (p->conf.staletime) + staletime = p->conf.staletime; + + /* RFC 8538: enforce configurable upper bound of the stale timer */ + if (staletime > p->capa.neg.grestart.timeout) + staletime = p->capa.neg.grestart.timeout; + timer_set(&p->timers, Timer_RestartTimeout, staletime); for (i = AID_MIN; i < AID_MAX; i++) { if (p->capa.neg.grestart.flags[i] & CAPA_GR_PRESENT) { @@ -2417,8 +2482,10 @@ parse_rrefresh(struct peer *peer, struct void parse_notification(struct peer *peer, struct ibuf *msg) { - uint8_t errcode, subcode; - uint8_t reason_len; + const char *reason = "received"; + uint8_t errcode, subcode; + uint8_t reason_len; + enum session_events event = EVNT_RCVD_NOTIFICATION; if (ibuf_get_n8(msg, &errcode) == -1 || ibuf_get_n8(msg, &subcode) == -1) { @@ -2426,11 +2493,27 @@ parse_notification(struct peer *peer, st goto done; } + /* RFC8538: check for hard-reset or graceful notification */ + if (peer->capa.neg.grestart.grnotification) { + if (errcode == ERR_CEASE && subcode == ERR_CEASE_HARD_RESET) { + if (ibuf_get_n8(msg, &errcode) == -1 || + ibuf_get_n8(msg, &subcode) == -1) { + log_peer_warnx(&peer->conf, + "received bad hard-reset notification"); + goto done; + } + reason = "received hard-reset"; + } else { + reason = "received graceful"; + event = EVNT_RCVD_GRACE_NOTIFICATION; + } + } + peer->errcnt++; peer->stats.last_rcvd_errcode = errcode; peer->stats.last_rcvd_suberr = subcode; - log_notification(peer, errcode, subcode, msg, "received"); + log_notification(peer, errcode, subcode, msg, reason); CTASSERT(sizeof(peer->stats.last_reason) > UINT8_MAX); memset(peer->stats.last_reason, 0, sizeof(peer->stats.last_reason)); @@ -2447,7 +2530,7 @@ parse_notification(struct peer *peer, st } done: - change_state(peer, STATE_IDLE, EVNT_RCVD_NOTIFICATION); + change_state(peer, STATE_IDLE, event); } int @@ -2568,6 +2651,8 @@ parse_capabilities(struct peer *peer, st CAPA_GR_RESTART; peer->capa.peer.grestart.restart = 2; } + if (gr_header & CAPA_GR_N_FLAG) + peer->capa.peer.grestart.grnotification = 1; break; case CAPA_AS4BYTE: if (capa_len != 4 || @@ -2704,6 +2789,11 @@ capa_neg_calc(struct peer *p) p->capa.neg.grestart.restart = p->capa.peer.grestart.restart; if (p->capa.ann.grestart.restart == 0) p->capa.neg.grestart.restart = 0; + + /* RFC 8538 graceful notification: both sides need to agree */ + p->capa.neg.grestart.grnotification = + (p->capa.ann.grestart.grnotification && + p->capa.peer.grestart.grnotification) != 0; /* * ADD-PATH: set only those bits where both sides agree. Index: bgpd/session.h =================================================================== RCS file: /cvs/src/usr.sbin/bgpd/session.h,v diff -u -p -r1.182 session.h --- bgpd/session.h 12 Dec 2024 20:19:03 -0000 1.182 +++ bgpd/session.h 13 Dec 2024 06:54:46 -0000 @@ -26,6 +26,7 @@ #define INTERVAL_HOLD 90 #define INTERVAL_IDLE_HOLD_INITIAL 30 #define INTERVAL_HOLD_DEMOTED 60 +#define INTERVAL_STALE 180 #define INTERVAL_SESSION_DOWN 3600 #define MAX_IDLE_HOLD 3600 #define MSGSIZE_HEADER 19 @@ -64,7 +65,8 @@ enum session_events { EVNT_RCVD_OPEN, EVNT_RCVD_KEEPALIVE, EVNT_RCVD_UPDATE, - EVNT_RCVD_NOTIFICATION + EVNT_RCVD_NOTIFICATION, + EVNT_RCVD_GRACE_NOTIFICATION, }; enum msg_type {