Index | Thread | Search

From:
Claudio Jeker <cjeker@diehard.n-r-g.com>
Subject:
Re: route gwroute mutex
To:
Alexander Bluhm <bluhm@openbsd.org>
Cc:
tech@openbsd.org
Date:
Thu, 6 Mar 2025 15:17:54 +0100

Download raw body.

Thread
On Thu, Mar 06, 2025 at 02:47:10PM +0100, Alexander Bluhm wrote:
> Hi,
> 
> I am still trying to make the leaves of the routing tree MP safe.
> 
> A small step would be to protect rt_cachecnt and rt_gwroute writers
> with a per route mutex.  I am aware that this is not complete, e.g.
> rt_flags also looks fishy.  The READ_ONCE() in rt_getll() does not
> protect from use-after-free.  I want to avoid mutex in the hot path.
> A previous attempt with SMR failed when running BGP.  Let's make
> this first step and find a smarter solition later.  route_cleargateway()
> is not in the hot packet path, so I added the mutex there.
> 
> I have moved the rt_cachecnt from rt_setgwroute() to rt_putgwroute()
> to have the RTF_CACHED logic in one function.
> 
> ok?

I'm begrudgingly OK with this. Right now the situation is clearly
incorrect and the mutex solves this right now.
Long term we should move towards making struct rtentry more RCU friendly
so we can drop the mutex. But lets tackle that after 7.7 
 
> bluhm
> 
> Index: net/route.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/net/route.c,v
> diff -u -p -r1.442 route.c
> --- net/route.c	21 Feb 2025 22:21:20 -0000	1.442
> +++ net/route.c	3 Mar 2025 23:50:31 -0000
> @@ -324,10 +324,12 @@ rtisvalid(struct rtentry *rt)
>  		return (0);
>  
>  	if (ISSET(rt->rt_flags, RTF_GATEWAY)) {
> -		if (rt->rt_gwroute == NULL)
> +		struct rtentry *gwroute = READ_ONCE(rt->rt_gwroute);
> +
> +		if (gwroute == NULL)
>  			return (0);
> -		KASSERT(!ISSET(rt->rt_gwroute->rt_flags, RTF_GATEWAY));
> -		if (!ISSET(rt->rt_gwroute->rt_flags, RTF_UP))
> +		KASSERT(!ISSET(gwroute->rt_flags, RTF_GATEWAY));
> +		if (!ISSET(gwroute->rt_flags, RTF_UP))
>  			return (0);
>  	}
>  
> @@ -567,15 +569,6 @@ rt_setgwroute(struct rtentry *rt, const 
>  			atomic_cas_uint(&rt->rt_mtu, mtu, nhmtu);
>  	}
>  
> -	/*
> -	 * To avoid reference counting problems when writing link-layer
> -	 * addresses in an outgoing packet, we ensure that the lifetime
> -	 * of a cached entry is greater than the bigger lifetime of the
> -	 * gateway entries it is pointed by.
> -	 */
> -	nhrt->rt_flags |= RTF_CACHED;
> -	nhrt->rt_cachecnt++;
> -
>  	/* commit */
>  	rt_putgwroute(rt, nhrt);
>  
> @@ -595,17 +588,32 @@ rt_putgwroute(struct rtentry *rt, struct
>  	if (!ISSET(rt->rt_flags, RTF_GATEWAY))
>  		return;
>  
> -	/* this is protected as per [X] in route.h */
> +	/*
> +	 * To avoid reference counting problems when writing link-layer
> +	 * addresses in an outgoing packet, we ensure that the lifetime
> +	 * of a cached entry is greater than the bigger lifetime of the
> +	 * gateway entries it is pointed by.
> +	 */
> +	if (nhrt != NULL) {
> +		mtx_enter(&nhrt->rt_mtx);
> +		SET(nhrt->rt_flags, RTF_CACHED);
> +		nhrt->rt_cachecnt++;
> +		mtx_leave(&nhrt->rt_mtx);
> +	}
> +
> +	mtx_enter(&rt->rt_mtx);
>  	onhrt = rt->rt_gwroute;
>  	rt->rt_gwroute = nhrt;
> +	mtx_leave(&rt->rt_mtx);
>  
>  	if (onhrt != NULL) {
> +		mtx_enter(&onhrt->rt_mtx);
>  		KASSERT(onhrt->rt_cachecnt > 0);
>  		KASSERT(ISSET(onhrt->rt_flags, RTF_CACHED));
> -
> -		--onhrt->rt_cachecnt;
> +		onhrt->rt_cachecnt--;
>  		if (onhrt->rt_cachecnt == 0)
>  			CLR(onhrt->rt_flags, RTF_CACHED);
> +		mtx_leave(&onhrt->rt_mtx);
>  
>  		rtfree(onhrt);
>  	}
> @@ -1004,6 +1012,7 @@ rtrequest(int req, struct rt_addrinfo *i
>  			return (ENOBUFS);
>  		}
>  
> +		mtx_init_flags(&rt->rt_mtx, IPL_SOFTNET, "rtentry", 0);
>  		refcnt_init_trace(&rt->rt_refcnt, DT_REFCNT_IDX_RTENTRY);
>  		rt->rt_flags = info->rti_flags | RTF_UP;
>  		rt->rt_priority = prio;	/* init routing priority */
> @@ -1165,7 +1174,7 @@ rt_getll(struct rtentry *rt)
>  {
>  	if (ISSET(rt->rt_flags, RTF_GATEWAY)) {
>  	 	/* We may return NULL here. */
> -		return (rt->rt_gwroute);
> +		return (READ_ONCE(rt->rt_gwroute));
>  	}
>  
>  	return (rt);
> Index: net/route.h
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/net/route.h,v
> diff -u -p -r1.213 route.h
> --- net/route.h	16 Feb 2025 11:39:28 -0000	1.213
> +++ net/route.h	3 Mar 2025 23:50:31 -0000
> @@ -41,6 +41,7 @@
>   *	N	net lock
>   *	X	exclusive net lock, or shared net lock + kernel lock
>   *	R	art (rtable) lock
> + *	r	per route entry mutex	rt_mtx
>   *	L	arp/nd6/etc lock for updates, net lock for reads
>   *	T	rttimer_mtx		route timer lists
>   */
> @@ -114,6 +115,7 @@ struct rttimer;
>   */
>  
>  struct rtentry {
> +	struct mutex	 rt_mtx;
>  	struct sockaddr	*rt_dest;	/* [I] destination */
>  	SRPL_ENTRY(rtentry) rt_next;	/* [R] next mpath entry to our dst */
>  	struct sockaddr	*rt_gateway;	/* [X] gateway address */
> @@ -121,8 +123,8 @@ struct rtentry {
>  	caddr_t		 rt_llinfo;	/* [L] pointer to link level info or
>  					   an MPLS structure */
>  	union {
> -		struct rtentry	*_nh;	/* [X] rtentry for rt_gateway */
> -		unsigned int	 _ref;	/* [X] # gateway rtentry refs */
> +		struct rtentry	*_nh;	/* [r] rtentry for rt_gateway */
> +		unsigned int	 _ref;	/* [r] # gateway rtentry refs */
>  	} RT_gw;
>  #define rt_gwroute	 RT_gw._nh
>  #define rt_cachecnt	 RT_gw._ref
> Index: net/rtsock.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/net/rtsock.c,v
> diff -u -p -r1.381 rtsock.c
> --- net/rtsock.c	16 Feb 2025 11:39:28 -0000	1.381
> +++ net/rtsock.c	3 Mar 2025 23:50:31 -0000
> @@ -1343,9 +1343,11 @@ route_cleargateway(struct rtentry *rt, v
>  {
>  	struct rtentry *nhrt = arg;
>  
> +	mtx_enter(&rt->rt_mtx);
>  	if (ISSET(rt->rt_flags, RTF_GATEWAY) && rt->rt_gwroute == nhrt &&
>  	    !ISSET(rt->rt_locks, RTV_MTU))
>  		atomic_store_int(&rt->rt_mtu, 0);
> +	mtx_leave(&rt->rt_mtx);
>  
>  	return (0);
>  }
> 

-- 
:wq Claudio