Download raw body.
rpki-client: backoff retry for persistently non-functional CAs
Dear all,
I had an itch to scratch related to slowdowns caused by persistently
non-functional RPKI CAs: such CAs oftentimes represent a 'dangling'
reference of sorts, in that the CA's repository content is hosted on a
RRDP/Rsync server that's down, or, more perversely, doesn't even contain
any material relevant to the CA at hand!
Currently there are about 120 non-functional CAs in the ecosystem. Some
of these CAs appear to have been broken for more than 3 years now, see
https://console.rpki-client.org/nonfunc.html for my recorded history.
While some large CAs (e.g., RIPE & APNIC) have adopted a policy of
eventually revoking persistently non-functional CAs, such a practice is
not universally adopted among RIRs and NIRs (nor is it expected to ever be).
With the below changeset, the existing detection mechanism for
non-functional CAs is extended into a stateful backoff retry mechanism.
While there, expose the newly gathered state in the JSON output and fix
failure attribution in the openmetrics output (by counting detected
non-func CAs towards the parent repo instead of the broken CA's own
repo).
I picked the retry timer strategy based on my understanding of the
dynamics of the RPKI ecosystem, accommodating, for example, CAs which
very recently became non-functional and might respond to out-of-band
alerting and quickly restore their certification service (warranting
more retries early on in the outage), but also taking into account
how most RP deployments initiate a validation run somewhere on the order
of every 15 to 60 minutes. Eventually settling on retrying once per day
seemed a good lower bound and easy to reason about. The schedule is
reset if a given non-functional CA is discovered to be in working order
again (i.e., backoff is not applied to healthy CAs).
For me an interesting side-effect of this diff is that it appears to
obviate a need I had for a manually curated (and therefore, easily
outdated) /etc/rpki/skiplist. I like automation. It also appears to
result in a modest but measurable reduction in processing time and
bandwidth consumption.
OK? Feedback?
Kind regards,
Job
Index: cert.c
===================================================================
RCS file: /cvs/src/usr.sbin/rpki-client/cert.c,v
diff -u -p -r1.236 cert.c
--- cert.c 2 May 2026 10:36:21 -0000 1.236
+++ cert.c 15 May 2026 00:47:48 -0000
@@ -2049,7 +2049,8 @@ RB_GENERATE(brk_tree, brk, entry, brkcmp
* Add each CA cert into the non-functional CA tree.
*/
void
-cert_insert_nca(struct nca_tree *tree, const struct cert *cert, struct repo *rp)
+cert_insert_nca(struct nca_tree *tree, const struct cert *cert, time_t since,
+ time_t last_attempt, int attempts)
{
struct nonfunc_ca *nca;
@@ -2064,11 +2065,14 @@ cert_insert_nca(struct nca_tree *tree, c
if ((nca->ski = strdup(cert->ski)) == NULL)
err(1, NULL);
nca->certid = cert->certid;
+ nca->repoid = cert->repoid;
nca->talid = cert->talid;
+ nca->since = since;
+ nca->last_attempt = last_attempt;
+ nca->attempts = attempts;
if (RB_INSERT(nca_tree, tree, nca) != NULL)
errx(1, "non-functional CA tree corrupted");
- repo_stat_inc(rp, nca->talid, RTYPE_CER, STYPE_NONFUNC);
}
void
@@ -2078,7 +2082,6 @@ cert_remove_nca(struct nca_tree *tree, i
if ((found = RB_FIND(nca_tree, tree, &needle)) != NULL) {
RB_REMOVE(nca_tree, tree, found);
- repo_stat_inc(rp, found->talid, RTYPE_CER, STYPE_FUNC);
free(found->location);
free(found->carepo);
free(found->mfturi);
Index: extern.h
===================================================================
RCS file: /cvs/src/usr.sbin/rpki-client/extern.h,v
diff -u -p -r1.279 extern.h
--- extern.h 1 May 2026 11:22:24 -0000 1.279
+++ extern.h 15 May 2026 00:47:49 -0000
@@ -147,6 +147,15 @@ struct cert {
unsigned char mfthash[SHA256_DIGEST_LENGTH]; /* of the parent mft */
};
+/*
+ * History record of a non-functional CA as stored in the .nca_history file.
+ * Records are kept in a tree ordered by ski first and location second.
+ */
+struct nca_hist {
+ RB_ENTRY(nca_hist) entry;
+ char *location; /* second lookup key, matched against the cert path */
+ char *ski; /* first lookup key, matched against the cert SKI */
+ time_t since; /* start of tracking, logged as "non-functional since" */
+ time_t last_attempt; /* time of the most recent synchronization attempt */
+ int attempts; /* number of synchronization attempts made so far */
+};
+
/*
* Non-functional CA tree element.
* Initially all CA and TA certs are added to this tree.
@@ -159,7 +168,11 @@ struct nonfunc_ca {
char *mfturi;
char *ski;
int certid;
+ unsigned int repoid;
int talid;
+ time_t since;
+ time_t last_attempt;
+ int attempts;
};
/*
@@ -698,6 +711,7 @@ extern int filemode;
extern int excludeaspa;
extern int experimental;
extern int excludeas0;
+extern int retrynonfunc;
extern const char *tals[];
extern const char *taldescs[];
extern unsigned int talrepocnt[];
@@ -726,7 +740,7 @@ struct cert *ta_validate(const char *, s
struct cert *cert_read(struct ibuf *);
void cert_insert_brks(struct brk_tree *, struct cert *);
void cert_insert_nca(struct nca_tree *, const struct cert *,
- struct repo *);
+ time_t, time_t, int);
void cert_remove_nca(struct nca_tree *, int, struct repo *);
enum rtype rtype_from_file_extension(const char *);
@@ -895,6 +909,7 @@ void repo_cleanup(struct filepath_tree
int repo_check_timeout(int);
void repostats_new_files_inc(struct repo *, const char *);
void repo_stat_inc(struct repo *, int, enum rtype, enum stype);
+void repo_stat_inc_nca(unsigned int, int);
void repo_tal_stats_collect(void (*)(const struct repo *,
const struct repotalstats *, void *), int, void *);
void repo_stats_collect(void (*)(const struct repo *,
Index: main.c
===================================================================
RCS file: /cvs/src/usr.sbin/rpki-client/main.c,v
diff -u -p -r1.306 main.c
--- main.c 9 May 2026 01:22:32 -0000 1.306
+++ main.c 15 May 2026 00:47:49 -0000
@@ -20,6 +20,7 @@
#include <sys/queue.h>
#include <sys/resource.h>
#include <sys/socket.h>
+#include <sys/stat.h>
#include <sys/statvfs.h>
#include <sys/time.h>
#include <sys/tree.h>
@@ -73,6 +74,7 @@ int shortlistmode;
int rrdpon = 1;
int repo_timeout;
int experimental;
+int retrynonfunc = 0;
time_t deadline;
/* 9999-12-31 23:59:59 UTC */
@@ -93,6 +95,31 @@ LIST_HEAD(fqdns, fqdnlistentry);
struct fqdns shortlist = LIST_HEAD_INITIALIZER(fqdns);
struct fqdns skiplist = LIST_HEAD_INITIALIZER(fqdns);
+static RB_HEAD(nca_hist_tree, nca_hist) ncas_hist = RB_INITIALIZER(&ncas_hist);
+
+/*
+ * Order history records by SKI and, for equal SKIs, by location.
+ * Returns -1, 0 or 1.
+ */
+static inline int
+nca_hist_cmp(struct nca_hist *a, struct nca_hist *b)
+{
+	int	 rv;
+
+	/* The location only breaks ties between identical SKIs. */
+	if ((rv = strcmp(a->ski, b->ski)) == 0)
+		rv = strcmp(a->location, b->location);
+
+	/* Normalize whatever strcmp() returned to -1, 0 or 1. */
+	return (rv > 0) - (rv < 0);
+}
+
+RB_PROTOTYPE_STATIC(nca_hist_tree, nca_hist, entry, nca_hist_cmp);
+RB_GENERATE_STATIC(nca_hist_tree, nca_hist, entry, nca_hist_cmp);
+
/*
* Log a message to stderr if and only if "verbose" is non-zero.
* This uses the err(3) functionality.
@@ -519,6 +546,9 @@ queue_add_from_cert(const struct cert *c
const char *uri, *repouri, *file;
size_t hostsz, repourisz;
int shortlisted = 0;
+ struct nca_hist *nca_hist, needle;
+ time_t since, last_attempt;
+ int attempts;
if (strncmp(cert->repo, RSYNC_PROTO, RSYNC_PROTO_LEN) != 0)
errx(1, "unexpected protocol");
@@ -546,6 +576,38 @@ queue_add_from_cert(const struct cert *c
return;
}
+ needle.location = cert->path;
+ needle.ski = cert->ski;
+ if ((nca_hist = RB_FIND(nca_hist_tree, &ncas_hist, &needle)) != NULL) {
+ since = nca_hist->since;
+ last_attempt = nca_hist->last_attempt;
+ attempts = nca_hist->attempts;
+ } else {
+ since = last_attempt = time(NULL);
+ attempts = 0;
+ }
+
+	/*
+	 * Control the rate of synchronization attempts for non-functional CAs.
+	 * First just retry a few times consecutively, then insert 90 minute
+	 * pauses between retries during the first 24 hours, and then finally
+	 * settle on retrying only once per day.
+	 * Discovery of a valid manifest resets the schedule back to normal.
+	 *
+	 * The 90 minute rule has to be confined to the first day: left
+	 * unconditional it is satisfied long before the once-per-day rule
+	 * and the schedule would never slow down beyond 90 minutes.
+	 */
+	if (retrynonfunc || attempts < 3 ||
+	    (time(NULL) - since <= 24 * 60 * 60 &&
+	    time(NULL) - last_attempt > 90 * 60) ||
+	    time(NULL) - last_attempt > 24 * 60 * 60) {
+		/* Due for a retry: record the attempt and carry on syncing. */
+		last_attempt = time(NULL);
+		cert_insert_nca(ncas, cert, since, last_attempt, ++attempts);
+	} else {
+		/* Not due yet: keep the CA listed with its state untouched. */
+		if (verbose > 1)
+			warnx("%s: deferring sync, non-functional since %s",
+			    cert->path, time2str(since));
+		cert_insert_nca(ncas, cert, since, last_attempt, attempts);
+		return;
+	}
+
repo = repo_lookup(cert->talid, cert->repo,
rrdpon ? cert->notify : NULL);
if (repo == NULL)
@@ -575,7 +637,6 @@ queue_add_from_cert(const struct cert *c
err(1, NULL);
}
- cert_insert_nca(ncas, cert, repo);
entityq_add(npath, nfile, RTYPE_MFT, DIR_UNKNOWN, repo, NULL, 0,
cert->talid, cert->certid, NULL);
}
@@ -927,6 +988,110 @@ load_skiplist(const char *slf)
free(line);
}
+/*
+ * Load the non-functional CA history from the .nca_history file in the
+ * current working directory into the ncas_hist tree.
+ * Every line holds five space separated fields: SKI, since, last attempt,
+ * number of attempts and location; the location runs to the end of line.
+ * A missing file is not an error, a malformed file is fatal.
+ */
+static void
+load_nca_history(void)
+{
+	struct nca_hist	*nca_hist;
+	FILE		*f;
+	char		*line = NULL, *cp, *tok;
+	size_t		 linesize = 0;
+	char		*argv[4];
+	const char	*errstr;
+	int		 argc;
+
+	if ((f = fopen(".nca_history", "r")) == NULL) {
+		if (errno == ENOENT)
+			return;
+		err(1, "failed to open .nca_history");
+	}
+
+	while (getline(&line, &linesize, f) != -1) {
+		line[strcspn(line, "\n")] = '\0';
+
+		/*
+		 * Tokenize through a cursor: strsep() advances the pointer
+		 * it is given, while getline() and free() need the start of
+		 * the buffer.
+		 */
+		cp = line;
+		argc = 0;
+		while (argc < 4 && (tok = strsep(&cp, " ")) != NULL) {
+			if (*tok != '\0')
+				argv[argc++] = tok;
+		}
+
+		/* Whatever follows the fourth field is the location. */
+		while (cp != NULL && *cp == ' ')
+			cp++;
+		if (argc != 4 || cp == NULL || *cp == '\0')
+			errx(1, ".nca_history: malformed line");
+
+		if ((nca_hist = calloc(1, sizeof(*nca_hist))) == NULL)
+			err(1, NULL);
+
+		if ((nca_hist->ski = strdup(argv[0])) == NULL)
+			err(1, NULL);
+
+		nca_hist->since = strtonum(argv[1], 1, LLONG_MAX, &errstr);
+		if (errstr != NULL)
+			errx(1, ".nca_history %s %s", errstr, argv[1]);
+
+		nca_hist->last_attempt = strtonum(argv[2], 1, LLONG_MAX,
+		    &errstr);
+		if (errstr != NULL)
+			errx(1, ".nca_history %s %s", errstr, argv[2]);
+
+		/* attempts is stored in an int, so bound it as one. */
+		nca_hist->attempts = strtonum(argv[3], 0, INT_MAX, &errstr);
+		if (errstr != NULL)
+			errx(1, ".nca_history %s %s", errstr, argv[3]);
+
+		if ((nca_hist->location = strdup(cp)) == NULL)
+			err(1, NULL);
+
+		/* A duplicate key is a data problem, errno says nothing. */
+		if (RB_INSERT(nca_hist_tree, &ncas_hist, nca_hist) != NULL)
+			errx(1, "ncas_hist_tree corrupted");
+	}
+
+	if (ferror(f))
+		warn("error reading .nca_history");
+
+	fclose(f);
+	free(line);
+}
+
+/*
+ * Account for all remaining non-functional CAs in the repository stats
+ * and save their retry state to the .nca_history file in the current
+ * working directory.
+ * The file is written to a temporary name and renamed into place.
+ * With no non-functional CAs left the history file is removed.
+ * Failing to save is not fatal, only a warning is issued.
+ */
+static void
+nca_history_save(struct nca_tree *ncas)
+{
+	char			 temp[] = ".nca_history.XXXXXXXX";
+	FILE			*f = NULL;
+	int			 fd;
+	struct nonfunc_ca	*nca;
+
+	if (RB_EMPTY(ncas)) {
+		unlink(".nca_history");
+		return;
+	}
+
+	/*
+	 * Do the stats accounting up front so that the metrics do not
+	 * depend on whether the history file can be written.
+	 */
+	RB_FOREACH(nca, nca_tree, ncas)
+		repo_stat_inc_nca(nca->repoid, nca->talid);
+
+	if ((fd = mkostemp(temp, O_CLOEXEC)) == -1)
+		goto fail;
+	(void)fchmod(fd, 0644);
+	if ((f = fdopen(fd, "w")) == NULL) {
+		/* No stream took ownership of fd, close it by hand. */
+		close(fd);
+		goto fail;
+	}
+
+	RB_FOREACH(nca, nca_tree, ncas) {
+		if (fprintf(f, "%s %lld %lld %d %s\n", nca->ski,
+		    (long long)nca->since, (long long)nca->last_attempt,
+		    nca->attempts, nca->location) < 0)
+			goto fail;
+	}
+
+	/* fclose() flushes, a failure here means the file is incomplete. */
+	if (fclose(f) != 0) {
+		f = NULL;
+		goto fail;
+	}
+
+	if (rename(temp, ".nca_history") == -1) {
+		warn("error renaming %s to .nca_history", temp);
+		unlink(temp);
+	}
+
+	return;
+
+ fail:
+	warn("error saving nca history to %s", temp);
+	if (f != NULL)
+		fclose(f);
+	unlink(temp);
+}
+
/*
* Load shortlist entries.
*/
@@ -1064,7 +1229,7 @@ main(int argc, char *argv[])
err(1, "pledge");
while ((c =
- getopt(argc, argv, "0Ab:Bcd:e:fH:jmnop:P:Rs:S:t:vVx")) != -1)
+ getopt(argc, argv, "0Ab:Bcd:e:fH:jmNnop:P:Rs:S:t:vVx")) != -1)
switch (c) {
case '0':
excludeas0 = 0;
@@ -1101,6 +1266,9 @@ main(int argc, char *argv[])
case 'm':
outformats |= FORMAT_OMETRIC;
break;
+ case 'N':
+ retrynonfunc = 1;
+ break;
case 'n':
noop = 1;
break;
@@ -1348,6 +1516,8 @@ main(int argc, char *argv[])
if (fchdir(cachefd) == -1)
err(1, "fchdir");
+ load_nca_history();
+
while (entity_queue > 0 && !killme) {
int polltim;
@@ -1561,6 +1731,8 @@ main(int argc, char *argv[])
vd.buildtime = get_current_time();
+ nca_history_save(&vd.ncas);
+
/* change working directory to the output directory */
if (fchdir(outdirfd) == -1)
err(1, "fchdir output dir");
@@ -1632,7 +1804,7 @@ main(int argc, char *argv[])
usage:
fprintf(stderr,
- "usage: rpki-client [-0ABcjmnoRVvx] [-b sourceaddr] [-d cachedir]"
+ "usage: rpki-client [-0ABcjmNnoRVvx] [-b sourceaddr] [-d cachedir]"
" [-e rsync_prog]\n"
" [-H fqdn] [-P posix-seconds] [-p threads]"
" [-S skiplist]\n"
Index: output-json.c
===================================================================
RCS file: /cvs/src/usr.sbin/rpki-client/output-json.c,v
diff -u -p -r1.59 output-json.c
--- output-json.c 13 Nov 2025 15:18:53 -0000 1.59
+++ output-json.c 15 May 2026 00:47:49 -0000
@@ -193,6 +193,9 @@ output_json(FILE *out, struct validation
json_do_string("caRepository", nca->carepo);
json_do_string("rpkiManifest", nca->mfturi);
json_do_string("ski", nca->ski);
+ json_do_int("since", (long long)nca->since);
+ json_do_int("last_attempt", (long long)nca->last_attempt);
+ json_do_int("total_attempts", nca->attempts);
json_do_end();
}
json_do_end();
Index: repo.c
===================================================================
RCS file: /cvs/src/usr.sbin/rpki-client/repo.c,v
diff -u -p -r1.81 repo.c
--- repo.c 13 May 2026 04:38:42 -0000 1.81
+++ repo.c 15 May 2026 00:47:49 -0000
@@ -1537,6 +1537,19 @@ repostats_new_files_inc(struct repo *rp,
rp->repostats.new_files++;
}
+/*
+ * Increment the non-functional CA counter for the given TAL on the
+ * repository with the matching id.
+ * Nothing happens if no repository has that id.
+ */
+void
+repo_stat_inc_nca(unsigned int id, int talid)
+{
+	struct repo	*rp;
+
+	SLIST_FOREACH(rp, &repos, entry) {
+		if (rp->id != id)
+			continue;
+		/* Only the first repository with this id is updated. */
+		rp->stats[talid].certs_nonfunc++;
+		return;
+	}
+}
+
/*
* Update stats object of repository depending on rtype and subtype.
*/
@@ -1552,10 +1565,6 @@ repo_stat_inc(struct repo *rp, int talid
rp->stats[talid].certs++;
if (subtype == STYPE_FAIL)
rp->stats[talid].certs_fail++;
- if (subtype == STYPE_NONFUNC)
- rp->stats[talid].certs_nonfunc++;
- if (subtype == STYPE_FUNC)
- rp->stats[talid].certs_nonfunc--;
if (subtype == STYPE_BGPSEC) {
rp->stats[talid].certs--;
rp->stats[talid].brks++;
@@ -1854,6 +1863,9 @@ repo_cleanup_entry(FTSENT *e, struct fil
path = skip_dotslash(e->fts_path);
switch (e->fts_info) {
case FTS_NSOK:
+ if (e->fts_level == 1 && fts_state.type == BASE_DIR &&
+ strcmp(e->fts_name, ".nca_history") == 0)
+ break;
if (filepath_exists(tree, path)) {
e->fts_parent->fts_number++;
break;
Index: rpki-client.8
===================================================================
RCS file: /cvs/src/usr.sbin/rpki-client/rpki-client.8,v
diff -u -p -r1.139 rpki-client.8
--- rpki-client.8 17 Feb 2026 13:54:42 -0000 1.139
+++ rpki-client.8 15 May 2026 00:47:49 -0000
@@ -22,7 +22,7 @@
.Nd RPKI validator to support BGP routing security
.Sh SYNOPSIS
.Nm
-.Op Fl 0ABcjmnoRVvx
+.Op Fl 0ABcjmNnoRVvx
.Op Fl b Ar sourceaddr
.Op Fl d Ar cachedir
.Op Fl e Ar rsync_prog
@@ -153,6 +153,12 @@ for a description of the fields.
Create output in the file
.Pa metrics
in the output directory in OpenMetrics format.
+.It Fl N
+Disable the backoff retry mechanism applied to non-functional CAs.
+By default
+.Nm
+progressively decreases the synchronization frequency for persistently
+non-functional CAs, eventually settling on retrying once per day.
.It Fl n
Offline mode.
Validate the contents of
rpki-client: backoff retry for persistently non-functional CAs