Index | Thread | Search

From:
Alexander Bluhm <bluhm@openbsd.org>
Subject:
ARP list iterator
To:
tech@openbsd.org
Date:
Fri, 29 Aug 2025 20:12:47 +0200

Download raw body.

Thread
Hi,

I recently fixed a crash when ARP timer is traversing the list.

----------------------------
revision 1.449
date: 2025/08/13 13:00:29;  author: bluhm;  state: Exp;  lines: +3 -2;  commitid: p3MMPGGqpHWz2Pbr;
Clear RTF_MPATH flag for cloned routes.

If two CPUs concurrently send to the same destination IP, both will
call rtalloc_mpath() and rt_clone().  The resulting cloned routes
inherit the RTF_MPATH flag from the cloning route so two rtentry
ARP entries are added to the routing table and arp_list.  Later,
when the ARP entries expire in arptimer(), the function arptfree(rt)
will call rtdeletemsg(rt) which uses rtrequest_delete() in order
to delete the expired rtentry.  However, the expired rtentry is not
directly passed to rtrequest_delete(), so it will re-lookup the
entry based on lookup keys and might delete the other matching
entry.  When arptimer() continues looping over the arp_list, it
will access the already released second entry and crash due to
use-after-free.  Prevent this problem by clearing the RTF_MPATH
flag for RTM_RESOLVE in rtrequest().  This way only one ARP entry
can be created.

from markus@
----------------------------

Problem with that simple fix is that it breaks multipath link-layer
entries.  regress/sbin/route reports the problem and it looks like
a established feature.

==== rttest32 ====
# Use vether(4) and vlan(4) because we need IFT_ETHER interfaces
# for the auto-magic RTF_CLONING routes.
ifconfig vether99 rdomain 5 lladdr fe:e1:ba:d4:c8:1d up
ifconfig vether99 130.102.71.68/27
ifconfig vlan99 rdomain 5 parent vether99 vnetid 3 up
ifconfig vlan99 130.102.71.70/27
# Inserting such route generate the insertion of a RTF_CLONED entry
# attached to the specified interface
/sbin/route -T 5 -n add -net 192.168.67.0/25 130.102.71.67 -priority 9 -ifp vether99
add net 192.168.67.0/25: gateway 130.102.71.67
/sbin/route -T 5 -n add -net 192.168.67.128/25 130.102.71.67 -priority 3 -ifp vlan99
add net 192.168.67.128/25: gateway 130.102.71.67: File exists
*** Error 1 in . (Makefile:377 'rttest32')
FAILED

New idea is to make the loop in arptimer() MP safe.  I use an
iterator like mvs@ has implemented it for pcb tailq.  With that it
will not matter anymore when arptfree(rt) removes another entry
from the list.

I guess the NET_LOCK() could be a NET_LOCK_SHARED() now, but that
is a commit for another day.

ok?

bluhm

Index: netinet/if_ether.c
===================================================================
RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/if_ether.c,v
diff -u -p -r1.276 if_ether.c
--- netinet/if_ether.c	17 Jul 2025 17:31:45 -0000	1.276
+++ netinet/if_ether.c	28 Aug 2025 12:38:22 -0000
@@ -73,8 +73,10 @@
  */
 
 struct llinfo_arp {
-	LIST_ENTRY(llinfo_arp)	 la_list;	/* [mN] global arp_list */
+	LIST_ENTRY(llinfo_arp)	 la_list;	/* [m] global arp_list */
 	struct rtentry		*la_rt;		/* [I] backpointer to rtentry */
+	/* keep fields above in sync with struct llinfo_arp_iterator */
+	struct refcnt		 la_refcnt;	/* entry refereced by list */
 	struct mbuf_queue	 la_mq;		/* packet hold queue */
 	time_t			 la_refreshed;	/* when was refresh sent */
 	int			 la_asked;	/* number of queries sent */
@@ -82,6 +84,12 @@ struct llinfo_arp {
 #define LA_HOLD_QUEUE 10
 #define LA_HOLD_TOTAL 100
 
+struct llinfo_arp_iterator {
+	LIST_ENTRY(llinfo_arp)   la_list;       /* [m] global arp_list */
+	struct rtentry          *la_rt;         /* [I] always NULL */
+	/* keep fields above in sync with struct llinfo_arp */
+};
+
 /* timer values */
 int	arpt_prune = (5 * 60);	/* [I] walk list every 5 minutes */
 int	arpt_keep = (20 * 60);	/* [a] once resolved, cache for 20 minutes */
@@ -103,8 +111,8 @@ struct niqueue arpinq = NIQUEUE_INITIALI
 /* llinfo_arp live time, rt_llinfo and RTF_LLINFO are protected by arp_mtx */
 struct mutex arp_mtx = MUTEX_INITIALIZER(IPL_SOFTNET);
 
-LIST_HEAD(, llinfo_arp) arp_list =
-    LIST_HEAD_INITIALIZER(arp_list);	/* [mN] list of llinfo_arp structures */
+LIST_HEAD(, llinfo_arp) arp_list = LIST_HEAD_INITIALIZER(arp_list);
+				/* [m] list of llinfo_arp structures */
 struct	pool arp_pool;		/* [I] pool for llinfo_arp structures */
 int	arp_maxtries = 5;	/* [I] arp requests before set to rejected */
 unsigned int	la_hold_total;	/* [a] packets currently in the arp queue */
@@ -116,27 +124,63 @@ int revarp_finished;
 unsigned int revarp_ifidx;
 #endif /* NFSCLIENT */
 
+static struct llinfo_arp *
+arpiterator(struct llinfo_arp *la, struct llinfo_arp_iterator *iter)
+{
+	struct llinfo_arp *tmp;
+
+	MUTEX_ASSERT_LOCKED(&arp_mtx);
+
+	if (la)
+		tmp = LIST_NEXT((struct llinfo_arp *)iter, la_list);
+	else
+		tmp = LIST_FIRST(&arp_list);
+
+	while (tmp && tmp->la_rt == NULL)
+		tmp = LIST_NEXT(tmp, la_list);
+
+	if (la) {
+		LIST_REMOVE((struct llinfo_arp *)iter, la_list);
+		if (refcnt_rele(&la->la_refcnt))
+			pool_put(&arp_pool, la);
+	}
+	if (tmp) {
+		LIST_INSERT_AFTER(tmp, (struct llinfo_arp *)iter, la_list);
+		refcnt_take(&tmp->la_refcnt);
+	}
+
+	return tmp;
+}
+
 /*
- * Timeout routine.  Age arp_tab entries periodically.
+ * Timeout routine.  Age arp table entries periodically.
  */
 void
 arptimer(void *arg)
 {
 	struct timeout *to = arg;
-	struct llinfo_arp *la, *nla;
+	struct llinfo_arp_iterator iter = { .la_rt = NULL };
+	struct llinfo_arp *la = NULL;
 	time_t uptime;
 
-	NET_LOCK();
 	uptime = getuptime();
 	timeout_add_sec(to, arpt_prune);
-	/* Net lock is exclusive, no arp mutex needed for arp_list here. */
-	LIST_FOREACH_SAFE(la, &arp_list, la_list, nla) {
+
+	mtx_enter(&arp_mtx);
+	while ((la = arpiterator(la, &iter)) != NULL) {
 		struct rtentry *rt = la->la_rt;
 
-		if (rt->rt_expire && rt->rt_expire < uptime)
+		rtref(rt);
+		mtx_leave(&arp_mtx);
+		if (rt->rt_expire && rt->rt_expire < uptime) {
+			NET_LOCK();
 			arptfree(rt); /* timer has expired; clear */
+			NET_UNLOCK();
+		}
+		rtfree(rt);
+		mtx_enter(&arp_mtx);
 	}
-	NET_UNLOCK();
+	mtx_leave(&arp_mtx);
 }
 
 void
@@ -210,6 +254,7 @@ arp_rtrequest(struct ifnet *ifp, int req
 			pool_put(&arp_pool, la);
 			break;
 		}
+		refcnt_init(&la->la_refcnt);
 		mq_init(&la->la_mq, LA_HOLD_QUEUE, IPL_SOFTNET);
 		rt->rt_llinfo = (caddr_t)la;
 		la->la_rt = rt;
@@ -235,7 +280,8 @@ arp_rtrequest(struct ifnet *ifp, int req
 		atomic_sub_int(&la_hold_total, mq_purge(&la->la_mq));
 		mtx_leave(&arp_mtx);
 
-		pool_put(&arp_pool, la);
+		if (refcnt_rele(&la->la_refcnt))
+			pool_put(&arp_pool, la);
 		break;
 
 	case RTM_INVALIDATE: