From: Alexander Bluhm Subject: Re: route mtu atomic To: tech@openbsd.org Date: Fri, 3 Jan 2025 14:19:54 +0100 anyone? On Thu, Dec 26, 2024 at 12:11:50PM +0100, Alexander Bluhm wrote: > Hi, > > With TCP unlocking we run into the problem that path MTU discovery > happens in parallel. To keep route mtu consistent, I want to make > access to rt_mtu atomic. > > The compare-and-swap function is used to detect if another thread > is modifying the mtu. In this case skip updating rt_mtu. > > ok? > > bluhm > > Index: net/if_loop.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/net/if_loop.c,v > diff -u -p -r1.98 if_loop.c > --- net/if_loop.c 29 Dec 2023 11:43:04 -0000 1.98 > +++ net/if_loop.c 25 Dec 2024 17:22:50 -0000 > @@ -277,8 +277,8 @@ looutput(struct ifnet *ifp, struct mbuf > void > lortrequest(struct ifnet *ifp, int cmd, struct rtentry *rt) > { > - if (rt && rt->rt_mtu == 0) > - rt->rt_mtu = LOMTU; > + if (rt != NULL) > + atomic_cas_uint(&rt->rt_mtu, 0, LOMTU); > } > > /* > Index: net/route.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/net/route.c,v > diff -u -p -r1.437 route.c > --- net/route.c 20 Sep 2024 02:00:46 -0000 1.437 > +++ net/route.c 25 Dec 2024 17:35:50 -0000 > @@ -557,8 +557,14 @@ rt_setgwroute(struct rtentry *rt, const > * If the MTU of next hop is 0, this will reset the MTU of the > * route to run PMTUD again from scratch. > */ > - if (!ISSET(rt->rt_locks, RTV_MTU) && (rt->rt_mtu > nhrt->rt_mtu)) > - rt->rt_mtu = nhrt->rt_mtu; > + if (!ISSET(rt->rt_locks, RTV_MTU)) { > + u_int mtu, nhmtu; > + > + mtu = atomic_load_int(&rt->rt_mtu); > + nhmtu = atomic_load_int(&nhrt->rt_mtu); > + if (mtu > nhmtu) > + atomic_cas_uint(&rt->rt_mtu, mtu, nhmtu); > + } > > /* > * To avoid reference counting problems when writing link-layer > Index: net/route.h > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/net/route.h,v > diff -u -p -r1.210 route.h > --- net/route.h 31 Mar 2024 15:53:12 -0000 1.210 > +++ net/route.h 25 Dec 2024 17:24:16 -0000 > @@ -61,7 +61,7 @@ struct rt_kmetrics { > u_int64_t rmx_pksent; /* packets sent using this route */ > int64_t rmx_expire; /* lifetime for route, e.g. redirect */ > u_int rmx_locks; /* Kernel must leave these values */ > - u_int rmx_mtu; /* MTU for this path */ > + u_int rmx_mtu; /* [a] MTU for this path */ > }; > #endif > > Index: net/rtsock.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/net/rtsock.c,v > diff -u -p -r1.375 rtsock.c > --- net/rtsock.c 12 Jul 2024 17:20:18 -0000 1.375 > +++ net/rtsock.c 25 Dec 2024 17:39:32 -0000 > @@ -1345,7 +1345,7 @@ route_cleargateway(struct rtentry *rt, v > > if (ISSET(rt->rt_flags, RTF_GATEWAY) && rt->rt_gwroute == nhrt && > !ISSET(rt->rt_locks, RTV_MTU)) > - rt->rt_mtu = 0; > + atomic_store_int(&rt->rt_mtu, 0); > > return (0); > } > @@ -1393,7 +1393,7 @@ rtm_setmetrics(u_long which, const struc > int64_t expire; > > if (which & RTV_MTU) > - out->rmx_mtu = in->rmx_mtu; > + atomic_store_int(&out->rmx_mtu, in->rmx_mtu); > if (which & RTV_EXPIRE) { > expire = in->rmx_expire; > if (expire != 0) { > @@ -1421,7 +1421,7 @@ rtm_getmetrics(const struct rtentry *rt, > > bzero(out, sizeof(*out)); > out->rmx_locks = in->rmx_locks; > - out->rmx_mtu = in->rmx_mtu; > + out->rmx_mtu = atomic_load_int(&in->rmx_mtu); > out->rmx_expire = expire; > out->rmx_pksent = in->rmx_pksent; > } > Index: netinet/ip_icmp.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_icmp.c,v > diff -u -p -r1.197 ip_icmp.c > --- netinet/ip_icmp.c 4 Dec 2024 22:24:11 -0000 1.197 > +++ netinet/ip_icmp.c 25 Dec 2024 17:46:46 -0000 > @@ -1008,6 +1008,7 @@ icmp_mtudisc(struct icmp *icp, u_int rta > { > struct rtentry *rt; > struct ifnet *ifp; > + u_int rtmtu; > u_long mtu = ntohs(icp->icmp_nextmtu); /* Why a long? IPv6 */ > > rt = icmp_mtudisc_clone(icp->icmp_ip.ip_dst, rtableid, 0); > @@ -1020,17 +1021,18 @@ icmp_mtudisc(struct icmp *icp, u_int rta > return; > } > > + rtmtu = atomic_load_int(&rt->rt_mtu); > if (mtu == 0) { > int i = 0; > > mtu = ntohs(icp->icmp_ip.ip_len); > /* Some 4.2BSD-based routers incorrectly adjust the ip_len */ > - if (mtu > rt->rt_mtu && rt->rt_mtu != 0) > + if (mtu > rtmtu && rtmtu != 0) > mtu -= (icp->icmp_ip.ip_hl << 2); > > /* If we still can't guess a value, try the route */ > if (mtu == 0) { > - mtu = rt->rt_mtu; > + mtu = rtmtu; > > /* If no route mtu, default to the interface mtu */ > > @@ -1055,8 +1057,8 @@ icmp_mtudisc(struct icmp *icp, u_int rta > if ((rt->rt_locks & RTV_MTU) == 0) { > if (mtu < 296 || mtu > ifp->if_mtu) > rt->rt_locks |= RTV_MTU; > - else if (rt->rt_mtu > mtu || rt->rt_mtu == 0) > - rt->rt_mtu = mtu; > + else if (rtmtu > mtu || rtmtu == 0) > + atomic_cas_uint(&rt->rt_mtu, rtmtu, mtu); > } > > if_put(ifp); > @@ -1089,7 +1091,7 @@ icmp_mtudisc_timeout(struct rtentry *rt, > rtableid, NULL); > } else { > if ((rt->rt_locks & RTV_MTU) == 0) > - rt->rt_mtu = 0; > + atomic_store_int(&rt->rt_mtu, 0); > } > > if_put(ifp); > Index: netinet/ip_input.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_input.c,v > diff -u -p -r1.402 ip_input.c > --- netinet/ip_input.c 21 Nov 2024 20:15:44 -0000 1.402 > +++ netinet/ip_input.c 25 Dec 2024 17:49:03 -0000 > @@ -1650,8 +1650,11 @@ ip_forward(struct mbuf *m, struct ifnet > type = ICMP_UNREACH; > code = ICMP_UNREACH_NEEDFRAG; > if (rt != NULL) { > - if (rt->rt_mtu) { > - destmtu = rt->rt_mtu; > + u_int rtmtu; > + > + rtmtu = atomic_load_int(&rt->rt_mtu); > + if (rtmtu != 0) { > + destmtu = rtmtu; > } else { > struct ifnet *destifp; > > Index: netinet/ip_output.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_output.c,v > diff -u -p -r1.401 ip_output.c > --- netinet/ip_output.c 2 Jul 2024 18:33:47 -0000 1.401 > +++ netinet/ip_output.c 25 Dec 2024 18:55:12 -0000 > @@ -211,7 +211,8 @@ reroute: > error = EHOSTUNREACH; > goto bad; > } > - if ((mtu = ro->ro_rt->rt_mtu) == 0) > + mtu = atomic_load_int(&ro->ro_rt->rt_mtu); > + if (mtu == 0) > mtu = ifp->if_mtu; > > if (ro->ro_rt->rt_flags & RTF_GATEWAY) > @@ -470,9 +471,14 @@ sendit: > */ > if (rtisvalid(ro->ro_rt) && > ISSET(ro->ro_rt->rt_flags, RTF_HOST) && > - !(ro->ro_rt->rt_locks & RTV_MTU) && > - (ro->ro_rt->rt_mtu > ifp->if_mtu)) { > - ro->ro_rt->rt_mtu = ifp->if_mtu; > + !(ro->ro_rt->rt_locks & RTV_MTU)) { > + u_int rtmtu; > + > + rtmtu = atomic_load_int(&ro->ro_rt->rt_mtu); > + if (rtmtu > ifp->if_mtu) { > + atomic_cas_uint(&ro->ro_rt->rt_mtu, rtmtu, > + ifp->if_mtu); > + } > } > ipstat_inc(ips_cantfrag); > goto bad; > @@ -558,7 +564,7 @@ ip_output_ipsec_pmtu_update(struct tdb * > DPRINTF("spi %08x mtu %d rt %p cloned %d", > ntohl(tdb->tdb_spi), tdb->tdb_mtu, rt, rt_mtucloned); > if (rt != NULL) { > - rt->rt_mtu = tdb->tdb_mtu; > + atomic_store_int(&rt->rt_mtu, tdb->tdb_mtu); > if (ro != NULL && ro->ro_rt != NULL) { > rtfree(ro->ro_rt); > ro->ro_rt = rtalloc(&ro->ro_dstsa, RT_RESOLVE, > Index: netinet/tcp_input.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_input.c,v > diff -u -p -r1.412 tcp_input.c > --- netinet/tcp_input.c 24 Dec 2024 19:16:53 -0000 1.412 > +++ netinet/tcp_input.c 25 Dec 2024 18:56:37 -0000 > @@ -2835,7 +2835,7 @@ tcp_mss(struct tcpcb *tp, int offer) > * if there's an mtu associated with the route and we support > * path MTU discovery for the underlying protocol family, use it. > */ > - rtmtu = READ_ONCE(rt->rt_mtu); > + rtmtu = atomic_load_int(&rt->rt_mtu); > if (rtmtu) { > /* > * One may wish to lower MSS to take into account options, > Index: netinet6/icmp6.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/icmp6.c,v > diff -u -p -r1.255 icmp6.c > --- netinet6/icmp6.c 12 Aug 2024 11:25:27 -0000 1.255 > +++ netinet6/icmp6.c 25 Dec 2024 19:00:50 -0000 > @@ -1016,16 +1016,20 @@ icmp6_mtudisc_update(struct ip6ctlparam > rt = icmp6_mtudisc_clone(&sin6, m->m_pkthdr.ph_rtableid, 0); > > if (rt != NULL && ISSET(rt->rt_flags, RTF_HOST) && > - !(rt->rt_locks & RTV_MTU) && > - (rt->rt_mtu > mtu || rt->rt_mtu == 0)) { > - struct ifnet *ifp; > - > - ifp = if_get(rt->rt_ifidx); > - if (ifp != NULL && mtu < ifp->if_mtu) { > - icmp6stat_inc(icp6s_pmtuchg); > - rt->rt_mtu = mtu; > + !(rt->rt_locks & RTV_MTU)) { > + u_int rtmtu; > + > + rtmtu = atomic_load_int(&rt->rt_mtu); > + if (rtmtu > mtu || rtmtu == 0) { > + struct ifnet *ifp; > + > + ifp = if_get(rt->rt_ifidx); > + if (ifp != NULL && mtu < ifp->if_mtu) { > + icmp6stat_inc(icp6s_pmtuchg); > + atomic_cas_uint(&rt->rt_mtu, rtmtu, mtu); > + } > + if_put(ifp); > } > - if_put(ifp); > } > rtfree(rt); > > @@ -1848,7 +1852,7 @@ icmp6_mtudisc_timeout(struct rtentry *rt > rtdeletemsg(rt, ifp, rtableid); > } else { > if (!(rt->rt_locks & RTV_MTU)) > - rt->rt_mtu = 0; > + atomic_store_int(&rt->rt_mtu, 0); > } > > if_put(ifp); > Index: netinet6/ip6_forward.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/ip6_forward.c,v > diff -u -p -r1.124 ip6_forward.c > --- netinet6/ip6_forward.c 19 Jul 2024 16:58:32 -0000 1.124 > +++ netinet6/ip6_forward.c 25 Dec 2024 19:02:20 -0000 > @@ -399,8 +399,11 @@ senderr: > case EMSGSIZE: > type = ICMP6_PACKET_TOO_BIG; > if (rt != NULL) { > - if (rt->rt_mtu) { > - destmtu = rt->rt_mtu; > + u_int rtmtu; > + > + rtmtu = atomic_load_int(&rt->rt_mtu); > + if (rtmtu != 0) { > + destmtu = rtmtu; > } else { > struct ifnet *destifp; > > Index: netinet6/ip6_output.c > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/ip6_output.c,v > diff -u -p -r1.292 ip6_output.c > --- netinet6/ip6_output.c 4 Jul 2024 12:50:08 -0000 1.292 > +++ netinet6/ip6_output.c 25 Dec 2024 19:16:03 -0000 > @@ -156,8 +156,8 @@ struct idgen32_ctx ip6_id_ctx; > * The mbuf chain containing the packet will be freed. > * The mbuf opt, if present, will not be freed. > * > - * type of "mtu": rt_mtu is u_long, ifnet.ifr_mtu is int. > - * We use u_long to hold largest one, * which is rt_mtu. > + * type of "mtu": rt_mtu is u_int, ifnet.ifr_mtu is int. > + * We use u_long to hold largest one. XXX should be u_int > */ > int > ip6_output(struct mbuf *m, struct ip6_pktopts *opt, struct route *ro, > @@ -1027,11 +1027,11 @@ ip6_insertfraghdr(struct mbuf *m0, struc > int > ip6_getpmtu(struct rtentry *rt, struct ifnet *ifp, u_long *mtup) > { > - u_int32_t mtu = 0; > + u_int mtu, rtmtu; > int error = 0; > > if (rt != NULL) { > - mtu = rt->rt_mtu; > + mtu = rtmtu = atomic_load_int(&rt->rt_mtu); > if (mtu == 0) > mtu = ifp->if_mtu; > else if (mtu < IPV6_MMTU) { > @@ -1048,7 +1048,7 @@ ip6_getpmtu(struct rtentry *rt, struct i > */ > mtu = ifp->if_mtu; > if (!(rt->rt_locks & RTV_MTU)) > - rt->rt_mtu = mtu; > + atomic_cas_uint(&rt->rt_mtu, rtmtu, mtu); > } > } else { > mtu = ifp->if_mtu; > @@ -2812,7 +2812,7 @@ ip6_output_ipsec_pmtu_update(struct tdb > DPRINTF("spi %08x mtu %d rt %p cloned %d", > ntohl(tdb->tdb_spi), tdb->tdb_mtu, rt, rt_mtucloned); > if (rt != NULL) { > - rt->rt_mtu = tdb->tdb_mtu; > + atomic_store_int(&rt->rt_mtu, tdb->tdb_mtu); > if (ro != NULL && ro->ro_rt != NULL) { > rtfree(ro->ro_rt); > ro->ro_rt = rtalloc(&ro->ro_dstsa, RT_RESOLVE, > Index: sys/atomic.h > =================================================================== > RCS file: /data/mirror/openbsd/cvs/src/sys/sys/atomic.h,v > diff -u -p -r1.9 atomic.h > --- sys/atomic.h 21 Mar 2022 05:45:52 -0000 1.9 > +++ sys/atomic.h 25 Dec 2024 17:38:09 -0000 > @@ -202,13 +202,13 @@ atomic_sub_long_nv(volatile unsigned lon > */ > > static inline unsigned int > -atomic_load_int(volatile unsigned int *p) > +atomic_load_int(volatile const unsigned int *p) > { > return *p; > } > > static inline unsigned long > -atomic_load_long(volatile unsigned long *p) > +atomic_load_long(volatile const unsigned long *p) > { > return *p; > }