From: Mark Johnston <markj@freebsd.org>
Subject: fix relayd desynchronization upon config reload
To: tech@openbsd.org
Date: Tue, 16 Jul 2024 15:33:38 -0400

Hello,

Apologies in advance for the long email.  The patch below fixes a
problem we occasionally see where relayd exits with an assertion failure
following a SIGHUP or relayctl reload.  Before that, I give some
background and explain how my patch tries to solve the crashes.  I'd be
interested in any feedback on the bug or the patch.  Thanks in advance
to anyone willing to take a look.

I can reproduce the problem by running relayd with some health checks
enabled, and spamming SIGHUP or "relayctl reload" in a loop with a bit
of delay in between (something like
'sleep $(awk "BEGIN{srand(); print(rand() / 10)}"')).
Eventually, relayd terminates with one of several possible error
messages, e.g.,:

pfe: pfe_dispatch_hce: invalid host id
hce exiting, pid 99370
lost child: pid 58165 exited abnormally
...

relayd has a worker process, hce, which performs periodic health checks
based on relayd.conf configuration.  When a health check completes, it
sends an IMSG_HOST_STATUS message to the pfe, a different process.  The
message includes the ID of the host in question.  When the pfe receives
this message and can't a host with the ID in question, it aborts and the
whole relayd process exits.

When parsing configuration, relayd creates a set of objects representing
hosts, tables, redirection rules, etc..  These all get unique 32-bit
IDs, assigned from a monotonic counter.  When relayd reloads its
configuration, all of these objects are discarded, and new objects are
created and assigned new IDs.  We can thus refer to "configuration
epochs": each object belongs to the epoch in which it was created, and
reloading configuration advances the epoch counter.

To implement a config reload, the parent process send IMSG_CTL_RESET to
worker processes, causing them to discard their state.  It then
re-parses relayd.conf and instantiates objects, copies of which then get
sent to worker processes via IMSG_{TABLE,HOST,...} messages.  Finally it
sends IMSG_CFG_DONE to indicate that the reload is complete.  These
messages can be interleaved with messages between the worker processes,
which is the root of the problem.  For example, after a config reload,
the pfe might process a IMSG_HOST_STATUS message that was sent by the
hce during the previous config epoch.

The solution taken by the patch is to make the notion of config epoch
explicit, and include the epoch number in various messages.  The
receiver compares the number in the message with its own notion of the
current epoch and discards the message if they do not match.  A simpler
approach would be to ignore messages if they refer to objects (hosts,
etc.) which do not exist, but that solution might hide real bugs.

In more detail, each worker process now has a sc_config_gen field which
gets incremented when it receives when IMSG_CTL_RESET is received.  A
few message structures now carry the generation number in which they
were transmitted, and receivers check it before doing anything with the
message.  I also added a sc_started flag, set to 0 during the
window between receipt of IMSG_CTL_RESET and IMSG_CTL_START, used to
prevent "relayctl poll" from starting a health check while the
configuration state is inconsistent.

diff --git a/usr.sbin/relayd/check_script.c b/usr.sbin/relayd/check_script.c
index d0cd1f6fc7f..e162a0942ee 100644
--- a/usr.sbin/relayd/check_script.c
+++ b/usr.sbin/relayd/check_script.c
@@ -48,6 +48,7 @@ check_script(struct relayd *env, struct host *host)
 	host->flags &= ~(F_CHECK_SENT|F_CHECK_DONE);
 
 	scr.host = host->conf.id;
+	scr.config_gen = env->sc_config_gen;
 	if ((strlcpy(scr.name, host->conf.name,sizeof(scr.name)) >=
 	    sizeof(scr.name)) ||
 	    (strlcpy(scr.path, table->conf.path, sizeof(scr.path)) >=
@@ -65,6 +66,12 @@ script_done(struct relayd *env, struct ctl_script *scr)
 {
 	struct host		*host;
 
+	if (scr->config_gen != env->sc_config_gen) {
+		log_debug("script check for %s interrupted, ignoring",
+		    scr->name);
+		return;
+	}
+
 	if ((host = host_find(env, scr->host)) == NULL)
 		fatalx("%s: invalid host id", __func__);
 
diff --git a/usr.sbin/relayd/config.c b/usr.sbin/relayd/config.c
index 3024c0768f0..ea99adefc19 100644
--- a/usr.sbin/relayd/config.c
+++ b/usr.sbin/relayd/config.c
@@ -150,6 +150,8 @@ config_purge(struct relayd *env, u_int reset)
 	struct keyname		*keyname;
 	u_int			 what;
 
+	env->sc_started = 0;
+
 	what = ps->ps_what[privsep_process] & reset;
 
 	if (what & CONFIG_TABLES && env->sc_tables != NULL) {
@@ -260,6 +262,8 @@ config_getreset(struct relayd *env, struct imsg *imsg)
 
 	config_purge(env, mode);
 
+	env->sc_config_gen++;
+
 	return (0);
 }
 
diff --git a/usr.sbin/relayd/hce.c b/usr.sbin/relayd/hce.c
index e4db3c96c91..f9ffcaa8a35 100644
--- a/usr.sbin/relayd/hce.c
+++ b/usr.sbin/relayd/hce.c
@@ -79,11 +79,10 @@ hce_setup_events(void)
 	struct timeval	 tv;
 	struct table	*table;
 
-	if (!event_initialized(&env->sc_ev)) {
+	if (!event_initialized(&env->sc_ev))
 		evtimer_set(&env->sc_ev, hce_launch_checks, env);
-		bzero(&tv, sizeof(tv));
-		evtimer_add(&env->sc_ev, &tv);
-	}
+	bzero(&tv, sizeof(tv));
+	evtimer_add(&env->sc_ev, &tv);
 
 	if (env->sc_conf.flags & F_TLS) {
 		TAILQ_FOREACH(table, env->sc_tables, entry) {
@@ -97,6 +96,8 @@ hce_setup_events(void)
 			tls_config_insecure_noverifyname(table->tls_cfg);
 		}
 	}
+
+	env->sc_started = 1;
 }
 
 void
@@ -167,6 +168,7 @@ hce_launch_checks(int fd, short event, void *arg)
 				continue;
 			bcopy(&tv, &host->cte.tv_start,
 			    sizeof(host->cte.tv_start));
+			host->config_gen = env->sc_config_gen;
 			switch (table->conf.check) {
 			case CHECK_ICMP:
 				schedule_icmp(env, host);
@@ -239,6 +241,7 @@ hce_notify_done(struct host *host, enum host_error he)
 	st.up = host->up;
 	st.check_cnt = host->check_cnt;
 	st.retry_cnt = host->retry_cnt;
+	st.config_gen = host->config_gen;
 	st.he = he;
 	host->flags |= (F_CHECK_SENT|F_CHECK_DONE);
 	msg = host_error(he);
@@ -324,10 +327,12 @@ hce_dispatch_pfe(int fd, struct privsep_proc *p, struct imsg *imsg)
 			host->up = HOST_UNKNOWN;
 		break;
 	case IMSG_CTL_POLL:
-		evtimer_del(&env->sc_ev);
-		TAILQ_FOREACH(table, env->sc_tables, entry)
-			table->skipped = 0;
-		hce_launch_checks(-1, EV_TIMEOUT, env);
+		if (env->sc_started) {
+			evtimer_del(&env->sc_ev);
+			TAILQ_FOREACH(table, env->sc_tables, entry)
+				table->skipped = 0;
+			hce_launch_checks(-1, EV_TIMEOUT, env);
+		}
 		break;
 	default:
 		return (-1);
@@ -360,6 +365,7 @@ hce_dispatch_parent(int fd, struct privsep_proc *p, struct imsg *imsg)
 		hce_setup_events();
 		break;
 	case IMSG_CTL_RESET:
+		hce_disable_events();
 		config_getreset(env, imsg);
 		break;
 	default:
diff --git a/usr.sbin/relayd/pfe.c b/usr.sbin/relayd/pfe.c
index 3aba811eed5..47c0c001a0a 100644
--- a/usr.sbin/relayd/pfe.c
+++ b/usr.sbin/relayd/pfe.c
@@ -103,11 +103,12 @@ pfe_setup_events(void)
 	struct timeval	 tv;
 
 	/* Schedule statistics timer */
-	if (!event_initialized(&env->sc_statev)) {
+	if (!event_initialized(&env->sc_statev))
 		evtimer_set(&env->sc_statev, pfe_statistics, NULL);
-		bcopy(&env->sc_conf.statinterval, &tv, sizeof(tv));
-		evtimer_add(&env->sc_statev, &tv);
-	}
+	bcopy(&env->sc_conf.statinterval, &tv, sizeof(tv));
+	evtimer_add(&env->sc_statev, &tv);
+
+	env->sc_started = 1;
 }
 
 void
@@ -129,6 +130,10 @@ pfe_dispatch_hce(int fd, struct privsep_proc *p, struct imsg *imsg)
 	case IMSG_HOST_STATUS:
 		IMSG_SIZE_CHECK(imsg, &st);
 		memcpy(&st, imsg->data, sizeof(st));
+		if (st.config_gen != env->sc_config_gen) {
+			log_debug("%s: reload gen mismatch", __func__);
+			break;
+		}
 		if ((host = host_find(env, st.id)) == NULL)
 			fatalx("%s: invalid host id", __func__);
 		host->he = st.he;
diff --git a/usr.sbin/relayd/relay.c b/usr.sbin/relayd/relay.c
index c29f3917152..a72e7f0f01d 100644
--- a/usr.sbin/relayd/relay.c
+++ b/usr.sbin/relayd/relay.c
@@ -1895,6 +1895,8 @@ relay_dispatch_pfe(int fd, struct privsep_proc *p, struct imsg *imsg)
 	case IMSG_HOST_STATUS:
 		IMSG_SIZE_CHECK(imsg, &st);
 		memcpy(&st, imsg->data, sizeof(st));
+		if (st.config_gen != env->sc_config_gen)
+			break;
 		if ((host = host_find(env, st.id)) == NULL)
 			fatalx("%s: invalid host id", __func__);
 		if (host->flags & F_DISABLE)
diff --git a/usr.sbin/relayd/relayd.h b/usr.sbin/relayd/relayd.h
index 9195be57ab0..bede386649c 100644
--- a/usr.sbin/relayd/relayd.h
+++ b/usr.sbin/relayd/relayd.h
@@ -121,6 +121,7 @@ struct ctl_status {
 	int		 up;
 	int		 retry_cnt;
 	u_long		 check_cnt;
+	u_int64_t	 config_gen;
 	u_int16_t	 he;
 };
 
@@ -152,6 +153,7 @@ struct ctl_relayfd {
 
 struct ctl_script {
 	objid_t		 host;
+	u_int64_t	 config_gen;
 	int		 retval;
 	struct timeval	 timeout;
 	char		 name[HOST_NAME_MAX+1];
@@ -441,6 +443,7 @@ struct host {
 	u_long			 up_cnt;
 	int			 retry_cnt;
 	int			 idx;
+	u_int64_t		 config_gen;
 	u_int16_t		 he;
 	int			 code;
 	struct ctl_tcp_event	 cte;
@@ -1137,6 +1140,8 @@ struct relayd {
 
 	struct privsep		*sc_ps;
 	int			 sc_reload;
+	int			 sc_started;
+	u_int64_t		 sc_config_gen;
 };
 
 #define RELAYD_OPT_VERBOSE		0x01