From: Alexander Bluhm Subject: ARP list iterator To: tech@openbsd.org Date: Fri, 29 Aug 2025 20:12:47 +0200 Hi, I recently fixed a crash when ARP timer is traversing the list. ---------------------------- revision 1.449 date: 2025/08/13 13:00:29; author: bluhm; state: Exp; lines: +3 -2; commitid: p3MMPGGqpHWz2Pbr; Clear RTF_MPATH flag for cloned routes. If two CPUs concurrently send to the same destination IP, both will call rtalloc_mpath() and rt_clone(). The resulting cloned routes inherit the RTF_MPATH flag from the cloning route so two rtentry ARP entries are added to the routing table and arp_list. Later, when the ARP entries expire in arptimer(), the function arptfree(rt) will call rtdeletemsg(rt) which uses rtrequest_delete() in order to delete the expired rtentry. However, the expired rtentry is not directly passed to rtrequest_delete(), so it will re-lookup the entry based on lookup keys and might delete the other matching entry. When arptimer() continues looping over the arp_list, it will access the already released second entry and crash due to use-after-free. Prevent this problem by clearing the RTF_MPATH flag for RTM_RESOLVE in rtrequest(). This way only one ARP entry can be created. from markus@ ---------------------------- Problem with that simple fix is that it breaks multipath link-layer entries. regress/sbin/route reports the problem and it looks like a established feature. ==== rttest32 ==== # Use vether(4) and vlan(4) because we need IFT_ETHER interfaces # for the auto-magic RTF_CLONING routes. ifconfig vether99 rdomain 5 lladdr fe:e1:ba:d4:c8:1d up ifconfig vether99 130.102.71.68/27 ifconfig vlan99 rdomain 5 parent vether99 vnetid 3 up ifconfig vlan99 130.102.71.70/27 # Inserting such route generate the insertion of a RTF_CLONED entry # attached to the specified interface /sbin/route -T 5 -n add -net 192.168.67.0/25 130.102.71.67 -priority 9 -ifp vether99 add net 192.168.67.0/25: gateway 130.102.71.67 /sbin/route -T 5 -n add -net 192.168.67.128/25 130.102.71.67 -priority 3 -ifp vlan99 add net 192.168.67.128/25: gateway 130.102.71.67: File exists *** Error 1 in . (Makefile:377 'rttest32') FAILED New idea is to make the loop in arptimer() MP safe. I use an iterator like mvs@ has implemented it for pcb tailq. With that it will not matter anymore when arptfree(rt) removes another entry from the list. I guess the NET_LOCK() could be a NET_LOCK_SHARED() now, but that is a commit for another day. ok? bluhm Index: netinet/if_ether.c =================================================================== RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/if_ether.c,v diff -u -p -r1.276 if_ether.c --- netinet/if_ether.c 17 Jul 2025 17:31:45 -0000 1.276 +++ netinet/if_ether.c 28 Aug 2025 12:38:22 -0000 @@ -73,8 +73,10 @@ */ struct llinfo_arp { - LIST_ENTRY(llinfo_arp) la_list; /* [mN] global arp_list */ + LIST_ENTRY(llinfo_arp) la_list; /* [m] global arp_list */ struct rtentry *la_rt; /* [I] backpointer to rtentry */ + /* keep fields above in sync with struct llinfo_arp_iterator */ + struct refcnt la_refcnt; /* entry refereced by list */ struct mbuf_queue la_mq; /* packet hold queue */ time_t la_refreshed; /* when was refresh sent */ int la_asked; /* number of queries sent */ @@ -82,6 +84,12 @@ struct llinfo_arp { #define LA_HOLD_QUEUE 10 #define LA_HOLD_TOTAL 100 +struct llinfo_arp_iterator { + LIST_ENTRY(llinfo_arp) la_list; /* [m] global arp_list */ + struct rtentry *la_rt; /* [I] always NULL */ + /* keep fields above in sync with struct llinfo_arp */ +}; + /* timer values */ int arpt_prune = (5 * 60); /* [I] walk list every 5 minutes */ int arpt_keep = (20 * 60); /* [a] once resolved, cache for 20 minutes */ @@ -103,8 +111,8 @@ struct niqueue arpinq = NIQUEUE_INITIALI /* llinfo_arp live time, rt_llinfo and RTF_LLINFO are protected by arp_mtx */ struct mutex arp_mtx = MUTEX_INITIALIZER(IPL_SOFTNET); -LIST_HEAD(, llinfo_arp) arp_list = - LIST_HEAD_INITIALIZER(arp_list); /* [mN] list of llinfo_arp structures */ +LIST_HEAD(, llinfo_arp) arp_list = LIST_HEAD_INITIALIZER(arp_list); + /* [m] list of llinfo_arp structures */ struct pool arp_pool; /* [I] pool for llinfo_arp structures */ int arp_maxtries = 5; /* [I] arp requests before set to rejected */ unsigned int la_hold_total; /* [a] packets currently in the arp queue */ @@ -116,27 +124,63 @@ int revarp_finished; unsigned int revarp_ifidx; #endif /* NFSCLIENT */ +static struct llinfo_arp * +arpiterator(struct llinfo_arp *la, struct llinfo_arp_iterator *iter) +{ + struct llinfo_arp *tmp; + + MUTEX_ASSERT_LOCKED(&arp_mtx); + + if (la) + tmp = LIST_NEXT((struct llinfo_arp *)iter, la_list); + else + tmp = LIST_FIRST(&arp_list); + + while (tmp && tmp->la_rt == NULL) + tmp = LIST_NEXT(tmp, la_list); + + if (la) { + LIST_REMOVE((struct llinfo_arp *)iter, la_list); + if (refcnt_rele(&la->la_refcnt)) + pool_put(&arp_pool, la); + } + if (tmp) { + LIST_INSERT_AFTER(tmp, (struct llinfo_arp *)iter, la_list); + refcnt_take(&tmp->la_refcnt); + } + + return tmp; +} + /* - * Timeout routine. Age arp_tab entries periodically. + * Timeout routine. Age arp table entries periodically. */ void arptimer(void *arg) { struct timeout *to = arg; - struct llinfo_arp *la, *nla; + struct llinfo_arp_iterator iter = { .la_rt = NULL }; + struct llinfo_arp *la = NULL; time_t uptime; - NET_LOCK(); uptime = getuptime(); timeout_add_sec(to, arpt_prune); - /* Net lock is exclusive, no arp mutex needed for arp_list here. */ - LIST_FOREACH_SAFE(la, &arp_list, la_list, nla) { + + mtx_enter(&arp_mtx); + while ((la = arpiterator(la, &iter)) != NULL) { struct rtentry *rt = la->la_rt; - if (rt->rt_expire && rt->rt_expire < uptime) + rtref(rt); + mtx_leave(&arp_mtx); + if (rt->rt_expire && rt->rt_expire < uptime) { + NET_LOCK(); arptfree(rt); /* timer has expired; clear */ + NET_UNLOCK(); + } + rtfree(rt); + mtx_enter(&arp_mtx); } - NET_UNLOCK(); + mtx_leave(&arp_mtx); } void @@ -210,6 +254,7 @@ arp_rtrequest(struct ifnet *ifp, int req pool_put(&arp_pool, la); break; } + refcnt_init(&la->la_refcnt); mq_init(&la->la_mq, LA_HOLD_QUEUE, IPL_SOFTNET); rt->rt_llinfo = (caddr_t)la; la->la_rt = rt; @@ -235,7 +280,8 @@ arp_rtrequest(struct ifnet *ifp, int req atomic_sub_int(&la_hold_total, mq_purge(&la->la_mq)); mtx_leave(&arp_mtx); - pool_put(&arp_pool, la); + if (refcnt_rele(&la->la_refcnt)) + pool_put(&arp_pool, la); break; case RTM_INVALIDATE: