Download raw body.
sysctl: unlock IP{,V6}CTL_MULTIPATH
On Sat, Jun 21, 2025 at 10:21:02PM +0300, Vitaliy Makkoveev wrote:
> They both are the same and both do `rtgeneration' update. As I
> understand the code, we don't need to serialize `ipmultipath' and
> `rtgeneration' update with the netstack, however, want to increment
> `rtgeneration' before `ipmultipath' update. IIRC, atomic operations will
> not be reordered and we don't need any extra barriers here.
This is architecture dependent. membar_enter_after_atomic() is not
defined everywhere. I think membar_consumer() is needed here. See
comments inline.
bluhm
> Index: sys/net/pf.c
> ===================================================================
> RCS file: /cvs/src/sys/net/pf.c,v
> retrieving revision 1.1213
> diff -u -p -r1.1213 pf.c
> --- sys/net/pf.c 9 Jun 2025 20:34:08 -0000 1.1213
> +++ sys/net/pf.c 21 Jun 2025 19:20:26 -0000
> @@ -6501,7 +6501,7 @@ pf_routable(struct pf_addr *addr, sa_fam
> dst->sin_family = AF_INET;
> dst->sin_len = sizeof(*dst);
> dst->sin_addr = addr->v4;
> - if (ipmultipath)
> + if (atomic_load_int(&ipmultipath))
> check_mpath = 1;
> break;
> #ifdef INET6
> @@ -6516,7 +6516,7 @@ pf_routable(struct pf_addr *addr, sa_fam
> dst6->sin6_family = AF_INET6;
> dst6->sin6_len = sizeof(*dst6);
> dst6->sin6_addr = addr->v6;
> - if (ip6_multipath)
> + if (atomic_load_int(&ip6_multipath))
> check_mpath = 1;
> break;
> #endif /* INET6 */
> Index: sys/net/route.c
> ===================================================================
> RCS file: /cvs/src/sys/net/route.c,v
> retrieving revision 1.444
> diff -u -p -r1.444 route.c
> --- sys/net/route.c 16 Mar 2025 23:45:06 -0000 1.444
> +++ sys/net/route.c 21 Jun 2025 19:20:26 -0000
> @@ -215,7 +215,7 @@ route_cache(struct route *ro, const stru
> ro->ro_tableid == rtableid &&
> ro->ro_dstsa.sa_family == AF_INET &&
> ro->ro_dstsin.sin_addr.s_addr == dst->s_addr) {
> - if (src == NULL || !ipmultipath ||
> + if (src == NULL || !atomic_load_int(&ipmultipath) ||
> !ISSET(ro->ro_rt->rt_flags, RTF_MPATH) ||
> (ro->ro_srcin.s_addr != INADDR_ANY &&
> ro->ro_srcin.s_addr == src->s_addr)) {
> @@ -272,7 +272,7 @@ route6_cache(struct route *ro, const str
> ro->ro_tableid == rtableid &&
> ro->ro_dstsa.sa_family == AF_INET6 &&
> IN6_ARE_ADDR_EQUAL(&ro->ro_dstsin6.sin6_addr, dst)) {
> - if (src == NULL || !ip6_multipath ||
> + if (src == NULL || !atomic_load_int(&ip6_multipath) ||
> !ISSET(ro->ro_rt->rt_flags, RTF_MPATH) ||
> (!IN6_IS_ADDR_UNSPECIFIED(&ro->ro_srcin6) &&
> IN6_ARE_ADDR_EQUAL(&ro->ro_srcin6, src))) {
> @@ -426,7 +426,7 @@ rt_hash(struct rtentry *rt, const struct
> {
> const struct sockaddr_in *sin;
>
> - if (!ipmultipath)
> + if (!atomic_load_int(&ipmultipath))
> return (-1);
>
> sin = satosin_const(dst);
> @@ -440,7 +440,7 @@ rt_hash(struct rtentry *rt, const struct
> {
> const struct sockaddr_in6 *sin6;
>
> - if (!ip6_multipath)
> + if (!atomic_load_int(&ip6_multipath))
> return (-1);
>
> sin6 = satosin6_const(dst);
> Index: sys/netinet/ip_input.c
> ===================================================================
> RCS file: /cvs/src/sys/netinet/ip_input.c,v
> retrieving revision 1.410
> diff -u -p -r1.410 ip_input.c
> --- sys/netinet/ip_input.c 21 Jun 2025 14:21:17 -0000 1.410
> +++ sys/netinet/ip_input.c 21 Jun 2025 19:20:26 -0000
> @@ -93,7 +93,7 @@
> /* values controllable via sysctl */
> int ip_forwarding = 0; /* [a] */
> int ipmforwarding = 0;
> -int ipmultipath = 0;
> +int ipmultipath = 0; /* [a] */
> int ip_sendredirects = 1; /* [a] */
> int ip_dosourceroute = 0; /* [a] */
> int ip_defttl = IPDEFTTL;
> @@ -1817,13 +1817,18 @@ ip_sysctl(int *name, u_int namelen, void
> return (EOPNOTSUPP);
> #endif
> case IPCTL_MULTIPATH:
> - NET_LOCK();
> - oldval = ipmultipath;
> + oldval = newval = atomic_load_int(&ipmultipath);
> error = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
> - &ipmultipath, 0, 1);
> - if (oldval != ipmultipath)
> - atomic_inc_long(&rtgeneration);
> - NET_UNLOCK();
> + &newval, 0, 1);
> + if (error == 0 && oldval != newval) {
> + rw_enter_write(&ip_sysctl_lock);
> + if (newval != atomic_load_int(&ipmultipath)) {
> + atomic_inc_long(&rtgeneration);
> + atomic_store_int(&ipmultipath, newval);
> + }
> + rw_exit_write(&ip_sysctl_lock);
The correct interaction between ipmultipath and rtgeneration
is a memory barrier.
atomic_store_int(&ipmultipath, newval);
membar_producer();
atomic_inc_long(&rtgeneration);
This ensures that route cache gets invalid after ipmultipath changes.
You need that as rt_hash() result is based on ipmultipath. That
way the ipmultipath loads in rt_hash() and route_cache() can be
kept independent. No lock needed.
> + }
> +
> return (error);
> case IPCTL_FORWARDING:
> case IPCTL_SENDREDIRECTS:
> Index: sys/netinet6/in6_proto.c
> ===================================================================
> RCS file: /cvs/src/sys/netinet6/in6_proto.c,v
> retrieving revision 1.130
> diff -u -p -r1.130 in6_proto.c
> --- sys/netinet6/in6_proto.c 21 Jun 2025 14:21:17 -0000 1.130
> +++ sys/netinet6/in6_proto.c 21 Jun 2025 19:20:26 -0000
> @@ -359,7 +359,7 @@ const struct domain inet6domain = {
> */
> int ip6_forwarding = 0; /* [a] no forwarding unless sysctl to enable */
> int ip6_mforwarding = 0; /* no multicast forwarding unless ... */
> -int ip6_multipath = 0; /* no using multipath routes unless ... */
> +int ip6_multipath = 0; /* [a] no using multipath routes unless ... */
> int ip6_sendredirects = 1; /* [a] */
> int ip6_defhlim = IPV6_DEFHLIM;
> int ip6_defmcasthlim = IPV6_DEFAULT_MULTICAST_HOPS;
> Index: sys/netinet6/ip6_input.c
> ===================================================================
> RCS file: /cvs/src/sys/netinet6/ip6_input.c,v
> retrieving revision 1.274
> diff -u -p -r1.274 ip6_input.c
> --- sys/netinet6/ip6_input.c 21 Jun 2025 14:21:17 -0000 1.274
> +++ sys/netinet6/ip6_input.c 21 Jun 2025 19:20:26 -0000
> @@ -1513,6 +1513,9 @@ int
> ip6_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp,
> void *newp, size_t newlen)
> {
> +#ifndef SMALL_KERNEL
> + int oldval, newval;
> +#endif
deraadt@ has tried hard to avoid too many #ifndef SMALL_KERNEL.
Move the declarations back into case statements.
> int error;
>
> /* Almost all sysctl names at this level are terminal. */
> @@ -1547,9 +1550,7 @@ ip6_sysctl(int *name, u_int namelen, voi
> case IPV6CTL_MRTMFC:
> return (EOPNOTSUPP);
> #endif
> - case IPV6CTL_MTUDISCTIMEOUT: {
> - int oldval, newval;
> -
> + case IPV6CTL_MTUDISCTIMEOUT:
> oldval = newval = atomic_load_int(&ip6_mtudisc_timeout);
> error = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
> &newval, 0, INT_MAX);
> @@ -1561,20 +1562,23 @@ ip6_sysctl(int *name, u_int namelen, voi
> }
>
> return (error);
> - }
> case IPV6CTL_IFQUEUE:
> return (sysctl_niq(name + 1, namelen - 1,
> oldp, oldlenp, newp, newlen, &ip6intrq));
> - case IPV6CTL_MULTIPATH: {
> - NET_LOCK();
> - int oldval = ip6_multipath;
> + case IPV6CTL_MULTIPATH:
> + oldval = newval = atomic_load_int(&ip6_multipath);
> error = sysctl_int_bounded(oldp, oldlenp, newp, newlen,
> - &ip6_multipath, 0, 1);
> - if (oldval != ip6_multipath)
> - atomic_inc_long(&rtgeneration);
> - NET_UNLOCK();
> + &newval, 0, 1);
> + if (error == 0 && oldval != newval) {
> + rw_enter_write(&ip_sysctl_lock);
> + if (newval != atomic_load_int(&ip6_multipath)) {
> + atomic_inc_long(&rtgeneration);
> + atomic_store_int(&ip6_multipath, newval);
> + }
> + rw_exit_write(&ip_sysctl_lock);
No lock needed. Use memory barrier.
> + }
> +
> return (error);
> - }
> case IPV6CTL_FORWARDING:
> case IPV6CTL_SENDREDIRECTS:
> return (sysctl_bounded_arr(
sysctl: unlock IP{,V6}CTL_MULTIPATH