Download raw body.
ARP list iterator
On Fri, Aug 29, 2025 at 08:12:47PM +0200, Alexander Bluhm wrote:
> Hi,
>
> I recently fixed a crash when ARP timer is traversing the list.
>
> ----------------------------
> revision 1.449
> date: 2025/08/13 13:00:29; author: bluhm; state: Exp; lines: +3 -2; commitid: p3MMPGGqpHWz2Pbr;
> Clear RTF_MPATH flag for cloned routes.
>
> If two CPUs concurrently send to the same destination IP, both will
> call rtalloc_mpath() and rt_clone(). The resulting cloned routes
> inherit the RTF_MPATH flag from the cloning route so two rtentry
> ARP entries are added to the routing table and arp_list. Later,
> when the ARP entries expire in arptimer(), the function arptfree(rt)
> will call rtdeletemsg(rt) which uses rtrequest_delete() in order
> to delete the expired rtentry. However, the expired rtentry is not
> directly passed to rtrequest_delete(), so it will re-lookup the
> entry based on lookup keys and might delete the other matching
> entry. When arptimer() continues looping over the arp_list, it
> will access the already released second entry and crash due to
> use-after-free. Prevent this problem by clearing the RTF_MPATH
> flag for RTM_RESOLVE in rtrequest(). This way only one ARP entry
> can be created.
>
> from markus@
> ----------------------------
>
> Problem with that simple fix is that it breaks multipath link-layer
> entries. regress/sbin/route reports the problem and it looks like
> a established feature.
>
> ==== rttest32 ====
> # Use vether(4) and vlan(4) because we need IFT_ETHER interfaces
> # for the auto-magic RTF_CLONING routes.
> ifconfig vether99 rdomain 5 lladdr fe:e1:ba:d4:c8:1d up
> ifconfig vether99 130.102.71.68/27
> ifconfig vlan99 rdomain 5 parent vether99 vnetid 3 up
> ifconfig vlan99 130.102.71.70/27
> # Inserting such route generate the insertion of a RTF_CLONED entry
> # attached to the specified interface
> /sbin/route -T 5 -n add -net 192.168.67.0/25 130.102.71.67 -priority 9 -ifp vether99
> add net 192.168.67.0/25: gateway 130.102.71.67
> /sbin/route -T 5 -n add -net 192.168.67.128/25 130.102.71.67 -priority 3 -ifp vlan99
> add net 192.168.67.128/25: gateway 130.102.71.67: File exists
> *** Error 1 in . (Makefile:377 'rttest32')
> FAILED
>
> New idea is to make the loop in arptimer() MP safe. I use an
> iterator like mvs@ has implemented it for pcb tailq. With that it
> will not matter anymore when arptfree(rt) removes another entry
> from the list.
>
> I guess the NET_LOCK() could be a NET_LOCK_SHARED() now, but that
> is a commit for another day.
>
> ok?
>
I like to put refcounting and `arp_mtx' release down to the 'if'
block. You don't need to do this all the times. With this fix diff is OK
mvs.
> - /* Net lock is exclusive, no arp mutex needed for arp_list here. */
> - LIST_FOREACH_SAFE(la, &arp_list, la_list, nla) {
> +
> + mtx_enter(&arp_mtx);
> + while ((la = arpiterator(la, &iter)) != NULL) {
> struct rtentry *rt = la->la_rt;
>
> - if (rt->rt_expire && rt->rt_expire < uptime)
> + rtref(rt);
> + mtx_leave(&arp_mtx);
> + if (rt->rt_expire && rt->rt_expire < uptime) {
> + NET_LOCK();
> arptfree(rt); /* timer has expired; clear */
> + NET_UNLOCK();
> + }
> + rtfree(rt);
> + mtx_enter(&arp_mtx);
> bluhm
>
> Index: netinet/if_ether.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/if_ether.c,v
> diff -u -p -r1.276 if_ether.c
> --- netinet/if_ether.c 17 Jul 2025 17:31:45 -0000 1.276
> +++ netinet/if_ether.c 28 Aug 2025 12:38:22 -0000
> @@ -73,8 +73,10 @@
> */
>
> struct llinfo_arp {
> - LIST_ENTRY(llinfo_arp) la_list; /* [mN] global arp_list */
> + LIST_ENTRY(llinfo_arp) la_list; /* [m] global arp_list */
> struct rtentry *la_rt; /* [I] backpointer to rtentry */
> + /* keep fields above in sync with struct llinfo_arp_iterator */
> + struct refcnt la_refcnt; /* entry refereced by list */
> struct mbuf_queue la_mq; /* packet hold queue */
> time_t la_refreshed; /* when was refresh sent */
> int la_asked; /* number of queries sent */
> @@ -82,6 +84,12 @@ struct llinfo_arp {
> #define LA_HOLD_QUEUE 10
> #define LA_HOLD_TOTAL 100
>
> +struct llinfo_arp_iterator {
> + LIST_ENTRY(llinfo_arp) la_list; /* [m] global arp_list */
> + struct rtentry *la_rt; /* [I] always NULL */
> + /* keep fields above in sync with struct llinfo_arp */
> +};
> +
> /* timer values */
> int arpt_prune = (5 * 60); /* [I] walk list every 5 minutes */
> int arpt_keep = (20 * 60); /* [a] once resolved, cache for 20 minutes */
> @@ -103,8 +111,8 @@ struct niqueue arpinq = NIQUEUE_INITIALI
> /* llinfo_arp live time, rt_llinfo and RTF_LLINFO are protected by arp_mtx */
> struct mutex arp_mtx = MUTEX_INITIALIZER(IPL_SOFTNET);
>
> -LIST_HEAD(, llinfo_arp) arp_list =
> - LIST_HEAD_INITIALIZER(arp_list); /* [mN] list of llinfo_arp structures */
> +LIST_HEAD(, llinfo_arp) arp_list = LIST_HEAD_INITIALIZER(arp_list);
> + /* [m] list of llinfo_arp structures */
> struct pool arp_pool; /* [I] pool for llinfo_arp structures */
> int arp_maxtries = 5; /* [I] arp requests before set to rejected */
> unsigned int la_hold_total; /* [a] packets currently in the arp queue */
> @@ -116,27 +124,63 @@ int revarp_finished;
> unsigned int revarp_ifidx;
> #endif /* NFSCLIENT */
>
> +static struct llinfo_arp *
> +arpiterator(struct llinfo_arp *la, struct llinfo_arp_iterator *iter)
> +{
> + struct llinfo_arp *tmp;
> +
> + MUTEX_ASSERT_LOCKED(&arp_mtx);
> +
> + if (la)
> + tmp = LIST_NEXT((struct llinfo_arp *)iter, la_list);
> + else
> + tmp = LIST_FIRST(&arp_list);
> +
> + while (tmp && tmp->la_rt == NULL)
> + tmp = LIST_NEXT(tmp, la_list);
> +
> + if (la) {
> + LIST_REMOVE((struct llinfo_arp *)iter, la_list);
> + if (refcnt_rele(&la->la_refcnt))
> + pool_put(&arp_pool, la);
> + }
> + if (tmp) {
> + LIST_INSERT_AFTER(tmp, (struct llinfo_arp *)iter, la_list);
> + refcnt_take(&tmp->la_refcnt);
> + }
> +
> + return tmp;
> +}
> +
> /*
> - * Timeout routine. Age arp_tab entries periodically.
> + * Timeout routine. Age arp table entries periodically.
> */
> void
> arptimer(void *arg)
> {
> struct timeout *to = arg;
> - struct llinfo_arp *la, *nla;
> + struct llinfo_arp_iterator iter = { .la_rt = NULL };
> + struct llinfo_arp *la = NULL;
> time_t uptime;
>
> - NET_LOCK();
> uptime = getuptime();
> timeout_add_sec(to, arpt_prune);
> - /* Net lock is exclusive, no arp mutex needed for arp_list here. */
> - LIST_FOREACH_SAFE(la, &arp_list, la_list, nla) {
> +
> + mtx_enter(&arp_mtx);
> + while ((la = arpiterator(la, &iter)) != NULL) {
> struct rtentry *rt = la->la_rt;
>
> - if (rt->rt_expire && rt->rt_expire < uptime)
> + rtref(rt);
> + mtx_leave(&arp_mtx);
> + if (rt->rt_expire && rt->rt_expire < uptime) {
> + NET_LOCK();
> arptfree(rt); /* timer has expired; clear */
> + NET_UNLOCK();
> + }
> + rtfree(rt);
> + mtx_enter(&arp_mtx);
> }
> - NET_UNLOCK();
> + mtx_leave(&arp_mtx);
> }
>
> void
> @@ -210,6 +254,7 @@ arp_rtrequest(struct ifnet *ifp, int req
> pool_put(&arp_pool, la);
> break;
> }
> + refcnt_init(&la->la_refcnt);
> mq_init(&la->la_mq, LA_HOLD_QUEUE, IPL_SOFTNET);
> rt->rt_llinfo = (caddr_t)la;
> la->la_rt = rt;
> @@ -235,7 +280,8 @@ arp_rtrequest(struct ifnet *ifp, int req
> atomic_sub_int(&la_hold_total, mq_purge(&la->la_mq));
> mtx_leave(&arp_mtx);
>
> - pool_put(&arp_pool, la);
> + if (refcnt_rele(&la->la_refcnt))
> + pool_put(&arp_pool, la);
> break;
>
> case RTM_INVALIDATE:
>
ARP list iterator