Index | Thread | Search

From:
Job Snijders <job@bsd.nl>
Subject:
rpki-client: backoff retry for persistently non-functional CAs
To:
tech@openbsd.org
Date:
Fri, 15 May 2026 11:56:17 +0000

Download raw body.

Thread
  • Job Snijders:

    rpki-client: backoff retry for persistently non-functional CAs

Dear all,

I had an itch to scratch related to slowdowns caused by persistently
non-functional RPKI CAs: such CAs oftentimes represent a 'dangling'
reference of sorts, in that the CA's repository content is hosted on an
RRDP/Rsync server that's down, or, more perversely, doesn't even contain
any material relevant to the CA at hand!

Currently there are about 120 non-functional CAs in the ecosystem. Some
of these CAs appear to have been broken for more than 3 years now, see
https://console.rpki-client.org/nonfunc.html for my recorded history.

While some large CAs (e.g., RIPE & APNIC) have adopted a policy of
eventually revoking persistently non-functional CAs, such a practice is
not universally adopted among RIRs and NIRs (nor is it expected to ever be).

With the below changeset, the existing detection mechanism for
non-functional CAs is extended into a stateful backoff retry mechanism.
While there, expose the newly gathered state in the JSON output and fix
failure attribution in the openmetrics output (by counting detected
non-func CAs towards the parent repo instead of the broken CA's own
repo).

I picked the retry timer strategy based on my understanding of the
dynamics of the RPKI ecosystem, accommodating, for example, CAs which
very recently became non-functional which might respond to out-of-band
alerting and quickly restore their certification service (warranting
more retries early on in the outage), but also taking into account
how most RP deployments initiate a validation run somewhere on the order
of every 15 to 60 minutes. Eventually settling on retrying once per day
seemed a good lower bound and easy to reason about. The schedule is
reset if a given non-functional CA is discovered to be in working order
again (i.e., backoff is not applied to healthy CAs).

For me an interesting side-effect of this diff is that it appears to
obviate a need I had for a manually curated (and therefore, easily
outdated) /etc/rpki/skiplist. I like automation. It also appears to
result in a modest but measurable reduction in processing time and
bandwidth consumption.

OK? Feedback?

Kind regards,

Job

Index: cert.c
===================================================================
RCS file: /cvs/src/usr.sbin/rpki-client/cert.c,v
diff -u -p -r1.236 cert.c
--- cert.c	2 May 2026 10:36:21 -0000	1.236
+++ cert.c	15 May 2026 00:47:48 -0000
@@ -2049,7 +2049,8 @@ RB_GENERATE(brk_tree, brk, entry, brkcmp
  * Add each CA cert into the non-functional CA tree.
  */
 void
-cert_insert_nca(struct nca_tree *tree, const struct cert *cert, struct repo *rp)
+cert_insert_nca(struct nca_tree *tree, const struct cert *cert, time_t since,
+    time_t last_attempt, int attempts)
 {
 	struct nonfunc_ca *nca;
 
@@ -2064,11 +2065,14 @@ cert_insert_nca(struct nca_tree *tree, c
 	if ((nca->ski = strdup(cert->ski)) == NULL)
 		err(1, NULL);
 	nca->certid = cert->certid;
+	nca->repoid = cert->repoid;
 	nca->talid = cert->talid;
+	nca->since = since;
+	nca->last_attempt = last_attempt;
+	nca->attempts = attempts;
 
 	if (RB_INSERT(nca_tree, tree, nca) != NULL)
 		errx(1, "non-functional CA tree corrupted");
-	repo_stat_inc(rp, nca->talid, RTYPE_CER, STYPE_NONFUNC);
 }
 
 void
@@ -2078,7 +2082,6 @@ cert_remove_nca(struct nca_tree *tree, i
 
 	if ((found = RB_FIND(nca_tree, tree, &needle)) != NULL) {
 		RB_REMOVE(nca_tree, tree, found);
-		repo_stat_inc(rp, found->talid, RTYPE_CER, STYPE_FUNC);
 		free(found->location);
 		free(found->carepo);
 		free(found->mfturi);
Index: extern.h
===================================================================
RCS file: /cvs/src/usr.sbin/rpki-client/extern.h,v
diff -u -p -r1.279 extern.h
--- extern.h	1 May 2026 11:22:24 -0000	1.279
+++ extern.h	15 May 2026 00:47:49 -0000
@@ -147,6 +147,15 @@ struct cert {
 	unsigned char	 mfthash[SHA256_DIGEST_LENGTH]; /* of the parent mft */
 };
 
+struct nca_hist {
+	RB_ENTRY(nca_hist)	 entry;
+	char			*location;	/* lookup key, with ski */
+	char			*ski;		/* lookup key, with location */
+	time_t			 since;		/* when first recorded */
+	time_t			 last_attempt;	/* time of last sync attempt */
+	int			 attempts;	/* sync attempts so far */
+};
+
 /*
  * Non-functional CA tree element.
  * Initially all CA and TA certs are added to this tree.
@@ -159,7 +168,11 @@ struct nonfunc_ca {
 	char			*mfturi;
 	char			*ski;
 	int			 certid;
+	unsigned int		 repoid;
 	int			 talid;
+	time_t			 since;
+	time_t			 last_attempt;
+	int			 attempts;
 };
 
 /*
@@ -698,6 +711,7 @@ extern int filemode;
 extern int excludeaspa;
 extern int experimental;
 extern int excludeas0;
+extern int retrynonfunc;
 extern const char *tals[];
 extern const char *taldescs[];
 extern unsigned int talrepocnt[];
@@ -726,7 +740,7 @@ struct cert	*ta_validate(const char *, s
 struct cert	*cert_read(struct ibuf *);
 void		 cert_insert_brks(struct brk_tree *, struct cert *);
 void		 cert_insert_nca(struct nca_tree *, const struct cert *,
-		    struct repo *);
+		    time_t, time_t, int);
 void		 cert_remove_nca(struct nca_tree *, int, struct repo *);
 
 enum rtype	 rtype_from_file_extension(const char *);
@@ -895,6 +909,7 @@ void		 repo_cleanup(struct filepath_tree
 int		 repo_check_timeout(int);
 void		 repostats_new_files_inc(struct repo *, const char *);
 void		 repo_stat_inc(struct repo *, int, enum rtype, enum stype);
+void		 repo_stat_inc_nca(unsigned int, int);
 void		 repo_tal_stats_collect(void (*)(const struct repo *,
 		    const struct repotalstats *, void *), int, void *);
 void		 repo_stats_collect(void (*)(const struct repo *,
Index: main.c
===================================================================
RCS file: /cvs/src/usr.sbin/rpki-client/main.c,v
diff -u -p -r1.306 main.c
--- main.c	9 May 2026 01:22:32 -0000	1.306
+++ main.c	15 May 2026 00:47:49 -0000
@@ -20,6 +20,7 @@
 #include <sys/queue.h>
 #include <sys/resource.h>
 #include <sys/socket.h>
+#include <sys/stat.h>
 #include <sys/statvfs.h>
 #include <sys/time.h>
 #include <sys/tree.h>
@@ -73,6 +74,7 @@ int	shortlistmode;
 int	rrdpon = 1;
 int	repo_timeout;
 int	experimental;
+int	retrynonfunc = 0;
 time_t	deadline;
 
 /* 9999-12-31 23:59:59 UTC */
@@ -93,6 +95,31 @@ LIST_HEAD(fqdns, fqdnlistentry);
 struct fqdns shortlist = LIST_HEAD_INITIALIZER(fqdns);
 struct fqdns skiplist = LIST_HEAD_INITIALIZER(fqdns);
 
+static RB_HEAD(nca_hist_tree, nca_hist)	ncas_hist = RB_INITIALIZER(&ncas_hist);
+
+/*
+ * Comparison function for the nca_hist_tree red-black tree: order by SKI,
+ * then by location.  Returns -1, 0 or 1.
+ */
+static inline int
+nca_hist_cmp(struct nca_hist *a, struct nca_hist *b)
+{
+	int rv;
+
+	/* The location only matters when both SKIs are identical. */
+	if ((rv = strcmp(a->ski, b->ski)) == 0)
+		rv = strcmp(a->location, b->location);
+
+	if (rv < 0)
+		return -1;
+	if (rv > 0)
+		return 1;
+	return 0;
+}
+
+RB_PROTOTYPE_STATIC(nca_hist_tree, nca_hist, entry, nca_hist_cmp);
+RB_GENERATE_STATIC(nca_hist_tree, nca_hist, entry, nca_hist_cmp);
+
 /*
  * Log a message to stderr if and only if "verbose" is non-zero.
  * This uses the err(3) functionality.
@@ -519,6 +546,9 @@ queue_add_from_cert(const struct cert *c
 	const char		*uri, *repouri, *file;
 	size_t			 hostsz, repourisz;
 	int			 shortlisted = 0;
+	struct nca_hist		*nca_hist, needle;
+	time_t			 since, last_attempt;
+	int			 attempts;
 
 	if (strncmp(cert->repo, RSYNC_PROTO, RSYNC_PROTO_LEN) != 0)
 		errx(1, "unexpected protocol");
@@ -546,6 +576,38 @@ queue_add_from_cert(const struct cert *c
 		return;
 	}
 
+	needle.location = cert->path;
+	needle.ski = cert->ski;
+	if ((nca_hist = RB_FIND(nca_hist_tree, &ncas_hist, &needle)) != NULL) {
+		since = nca_hist->since;
+		last_attempt = nca_hist->last_attempt;
+		attempts = nca_hist->attempts;
+	} else {
+		since = last_attempt = time(NULL);
+		attempts = 0;
+	}
+
+	/*
+	 * Control the rate of synchronization attempts for non-functional CAs.
+	 * First just retry a few times consecutively, then (during the first
+	 * day of the outage) insert 90 minute pauses between retries, and
+	 * finally settle on retrying only once per day.
+	 * Discovery of a valid manifest resets the schedule back to normal.
+	 */
+	if (retrynonfunc || attempts < 3 ||
+	    ((time(NULL) - since <= 24 * 60 * 60) &&
+	    (time(NULL) - last_attempt > 90 * 60)) ||
+	    (time(NULL) - last_attempt > 24 * 60 * 60)) {
+		last_attempt = time(NULL);
+		cert_insert_nca(ncas, cert, since, last_attempt, ++attempts);
+	} else {
+		if (verbose > 1)
+			warnx("%s: deferring sync, non-functional since %s",
+			    cert->path, time2str(since));
+		cert_insert_nca(ncas, cert, since, last_attempt, attempts);
+		return;
+	}
+
 	repo = repo_lookup(cert->talid, cert->repo,
 	    rrdpon ? cert->notify : NULL);
 	if (repo == NULL)
@@ -575,7 +637,6 @@ queue_add_from_cert(const struct cert *c
 			err(1, NULL);
 	}
 
-	cert_insert_nca(ncas, cert, repo);
 	entityq_add(npath, nfile, RTYPE_MFT, DIR_UNKNOWN, repo, NULL, 0,
 	    cert->talid, cert->certid, NULL);
 }
@@ -927,6 +988,110 @@ load_skiplist(const char *slf)
 	free(line);
 }
 
+/*
+ * Load per-CA backoff state ("ski since last_attempt attempts location").
+ */
+static void
+load_nca_history(void)
+{
+	struct nca_hist *nca_hist;
+	FILE *f;
+	char *line = NULL, *p, **ap, *argv[5];
+	size_t linesize = 0;
+	const char *errstr;
+
+	if ((f = fopen(".nca_history", "r")) == NULL) {
+		if (errno == ENOENT)
+			return;
+		err(1, "failed to open .nca_history");
+	}
+
+	while (getline(&line, &linesize, f) != -1) {
+		line[strcspn(line, "\n")] = '\0';
+		/* strsep() advances p; keep line intact for getline(). */
+		for (p = line, ap = argv; ap < &argv[5] &&
+		    (*ap = strsep(&p, " ")) != NULL;) {
+			if (**ap != '\0')
+				ap++;
+		}
+		if (ap != &argv[5])
+			errx(1, ".nca_history: malformed line");
+
+		if ((nca_hist = calloc(1, sizeof(*nca_hist))) == NULL)
+			err(1, NULL);
+		if ((nca_hist->ski = strdup(argv[0])) == NULL ||
+		    (nca_hist->location = strdup(argv[4])) == NULL)
+			err(1, NULL);
+
+		nca_hist->since = strtonum(argv[1], 1, LLONG_MAX, &errstr);
+		if (errstr != NULL)
+			errx(1, ".nca_history %s %s", errstr, argv[1]);
+		nca_hist->last_attempt = strtonum(argv[2], 1, LLONG_MAX,
+		    &errstr);
+		if (errstr != NULL)
+			errx(1, ".nca_history %s %s", errstr, argv[2]);
+		nca_hist->attempts = strtonum(argv[3], 0, INT_MAX, &errstr);
+		if (errstr != NULL)
+			errx(1, ".nca_history %s %s", errstr, argv[3]);
+
+		if (RB_INSERT(nca_hist_tree, &ncas_hist, nca_hist) != NULL)
+			errx(1, "ncas_hist_tree corrupted");
+	}
+	if (ferror(f))
+		warn("error reading .nca_history");
+
+	fclose(f);
+	free(line);
+}
+
+/* Update the non-functional CA stats and persist the backoff state. */
+static void
+nca_history_save(struct nca_tree *ncas)
+{
+	char temp[] = ".nca_history.XXXXXXXX";
+	FILE *f = NULL;
+	int fd;
+	struct nonfunc_ca *nca;
+
+	/* Count first, so that the stats don't depend on file I/O success. */
+	RB_FOREACH(nca, nca_tree, ncas)
+		repo_stat_inc_nca(nca->repoid, nca->talid);
+
+	if (RB_EMPTY(ncas)) {
+		unlink(".nca_history");
+		return;
+	}
+
+	if ((fd = mkostemp(temp, O_CLOEXEC)) == -1)
+		goto fail;
+	(void)fchmod(fd, 0644);
+	if ((f = fdopen(fd, "w")) == NULL) {
+		close(fd);
+		goto fail;
+	}
+	RB_FOREACH(nca, nca_tree, ncas) {
+		if (fprintf(f, "%s %lld %lld %d %s\n", nca->ski,
+		    (long long)nca->since, (long long)nca->last_attempt,
+		    nca->attempts, nca->location) < 0)
+			goto fail;
+	}
+	if (fclose(f) != 0) {
+		f = NULL;
+		goto fail;
+	}
+	if (rename(temp, ".nca_history") == -1) {
+		warn("error renaming %s to .nca_history", temp);
+		unlink(temp);
+	}
+	return;
+
+ fail:
+	warn("error saving nca history to %s", temp);
+	if (f != NULL)
+		fclose(f);
+	unlink(temp);
+}
+
 /*
  * Load shortlist entries.
  */
@@ -1064,7 +1229,7 @@ main(int argc, char *argv[])
 		err(1, "pledge");
 
 	while ((c =
-	    getopt(argc, argv, "0Ab:Bcd:e:fH:jmnop:P:Rs:S:t:vVx")) != -1)
+	    getopt(argc, argv, "0Ab:Bcd:e:fH:jmNnop:P:Rs:S:t:vVx")) != -1)
 		switch (c) {
 		case '0':
 			excludeas0 = 0;
@@ -1101,6 +1266,9 @@ main(int argc, char *argv[])
 		case 'm':
 			outformats |= FORMAT_OMETRIC;
 			break;
+		case 'N':
+			retrynonfunc = 1;
+			break;
 		case 'n':
 			noop = 1;
 			break;
@@ -1348,6 +1516,8 @@ main(int argc, char *argv[])
 	if (fchdir(cachefd) == -1)
 		err(1, "fchdir");
 
+	load_nca_history();
+
 	while (entity_queue > 0 && !killme) {
 		int polltim;
 
@@ -1561,6 +1731,8 @@ main(int argc, char *argv[])
 
 	vd.buildtime = get_current_time();
 
+	nca_history_save(&vd.ncas);
+
 	/* change working directory to the output directory */
 	if (fchdir(outdirfd) == -1)
 		err(1, "fchdir output dir");
@@ -1632,7 +1804,7 @@ main(int argc, char *argv[])
 
 usage:
 	fprintf(stderr,
-	    "usage: rpki-client [-0ABcjmnoRVvx] [-b sourceaddr] [-d cachedir]"
+	    "usage: rpki-client [-0ABcjmNnoRVvx] [-b sourceaddr] [-d cachedir]"
 	    " [-e rsync_prog]\n"
 	    "                   [-H fqdn] [-P posix-seconds] [-p threads]"
 	    " [-S skiplist]\n"
Index: output-json.c
===================================================================
RCS file: /cvs/src/usr.sbin/rpki-client/output-json.c,v
diff -u -p -r1.59 output-json.c
--- output-json.c	13 Nov 2025 15:18:53 -0000	1.59
+++ output-json.c	15 May 2026 00:47:49 -0000
@@ -193,6 +193,9 @@ output_json(FILE *out, struct validation
 		json_do_string("caRepository", nca->carepo);
 		json_do_string("rpkiManifest", nca->mfturi);
 		json_do_string("ski", nca->ski);
+		json_do_int("since", (long long)nca->since);
+		json_do_int("last_attempt", (long long)nca->last_attempt);
+		json_do_int("total_attempts", nca->attempts);
 		json_do_end();
 	}
 	json_do_end();
Index: repo.c
===================================================================
RCS file: /cvs/src/usr.sbin/rpki-client/repo.c,v
diff -u -p -r1.81 repo.c
--- repo.c	13 May 2026 04:38:42 -0000	1.81
+++ repo.c	15 May 2026 00:47:49 -0000
@@ -1537,6 +1537,19 @@ repostats_new_files_inc(struct repo *rp,
 		rp->repostats.new_files++;
 }
 
+/* Bump the certs_nonfunc counter of repository id for the given TAL. */
+void
+repo_stat_inc_nca(unsigned int id, int talid)
+{
+	struct repo *rp;
+
+	SLIST_FOREACH(rp, &repos, entry)
+		if (rp->id == id)
+			break;
+	if (rp != NULL)
+		rp->stats[talid].certs_nonfunc++;
+}
+
 /*
  * Update stats object of repository depending on rtype and subtype.
  */
@@ -1552,10 +1565,6 @@ repo_stat_inc(struct repo *rp, int talid
 			rp->stats[talid].certs++;
 		if (subtype == STYPE_FAIL)
 			rp->stats[talid].certs_fail++;
-		if (subtype == STYPE_NONFUNC)
-			rp->stats[talid].certs_nonfunc++;
-		if (subtype == STYPE_FUNC)
-			rp->stats[talid].certs_nonfunc--;
 		if (subtype == STYPE_BGPSEC) {
 			rp->stats[talid].certs--;
 			rp->stats[talid].brks++;
@@ -1854,6 +1863,9 @@ repo_cleanup_entry(FTSENT *e, struct fil
 	path = skip_dotslash(e->fts_path);
 	switch (e->fts_info) {
 	case FTS_NSOK:
+		if (e->fts_level == 1 && fts_state.type == BASE_DIR &&
+		    strcmp(e->fts_name, ".nca_history") == 0)
+			break;
 		if (filepath_exists(tree, path)) {
 			e->fts_parent->fts_number++;
 			break;
Index: rpki-client.8
===================================================================
RCS file: /cvs/src/usr.sbin/rpki-client/rpki-client.8,v
diff -u -p -r1.139 rpki-client.8
--- rpki-client.8	17 Feb 2026 13:54:42 -0000	1.139
+++ rpki-client.8	15 May 2026 00:47:49 -0000
@@ -22,7 +22,7 @@
 .Nd RPKI validator to support BGP routing security
 .Sh SYNOPSIS
 .Nm
-.Op Fl 0ABcjmnoRVvx
+.Op Fl 0ABcjmNnoRVvx
 .Op Fl b Ar sourceaddr
 .Op Fl d Ar cachedir
 .Op Fl e Ar rsync_prog
@@ -153,6 +153,12 @@ for a description of the fields.
 Create output in the file
 .Pa metrics
 in the output directory in OpenMetrics format.
+.It Fl N
+Disable the backoff retry mechanism applied to non-functional CAs.
+By default
+.Nm
+progressively decreases the synchronization frequency for persistently
+non-functional CAs, eventually settling on retrying once per day.
 .It Fl n
 Offline mode.
 Validate the contents of