From: Vitaliy Makkoveev Subject: Re: route mtu atomic To: Alexander Bluhm Cc: OpenBSD tech Date: Fri, 3 Jan 2025 18:58:54 +0300 ok mvs@ > On 3 Jan 2025, at 16:19, Alexander Bluhm wrote: > > anyone? > > On Thu, Dec 26, 2024 at 12:11:50PM +0100, Alexander Bluhm wrote: >> Hi, >> >> With TCP unlocking we run into the problem that path MTU discovery >> happens in parallel. To keep route mtu consistent, I want to make >> access to rt_mtu atomic. >> >> The compare-and-swap function is used to detect if another thread >> is modifying the mtu. In this case skip updating rt_mtu. >> >> ok? >> >> bluhm >> >> Index: net/if_loop.c >> =================================================================== >> RCS file: /data/mirror/openbsd/cvs/src/sys/net/if_loop.c,v >> diff -u -p -r1.98 if_loop.c >> --- net/if_loop.c 29 Dec 2023 11:43:04 -0000 1.98 >> +++ net/if_loop.c 25 Dec 2024 17:22:50 -0000 >> @@ -277,8 +277,8 @@ looutput(struct ifnet *ifp, struct mbuf >> void >> lortrequest(struct ifnet *ifp, int cmd, struct rtentry *rt) >> { >> - if (rt && rt->rt_mtu == 0) >> - rt->rt_mtu = LOMTU; >> + if (rt != NULL) >> + atomic_cas_uint(&rt->rt_mtu, 0, LOMTU); >> } >> >> /* >> Index: net/route.c >> =================================================================== >> RCS file: /data/mirror/openbsd/cvs/src/sys/net/route.c,v >> diff -u -p -r1.437 route.c >> --- net/route.c 20 Sep 2024 02:00:46 -0000 1.437 >> +++ net/route.c 25 Dec 2024 17:35:50 -0000 >> @@ -557,8 +557,14 @@ rt_setgwroute(struct rtentry *rt, const >> * If the MTU of next hop is 0, this will reset the MTU of the >> * route to run PMTUD again from scratch. >> */ >> - if (!ISSET(rt->rt_locks, RTV_MTU) && (rt->rt_mtu > nhrt->rt_mtu)) >> - rt->rt_mtu = nhrt->rt_mtu; >> + if (!ISSET(rt->rt_locks, RTV_MTU)) { >> + u_int mtu, nhmtu; >> + >> + mtu = atomic_load_int(&rt->rt_mtu); >> + nhmtu = atomic_load_int(&nhrt->rt_mtu); >> + if (mtu > nhmtu) >> + atomic_cas_uint(&rt->rt_mtu, mtu, nhmtu); >> + } >> >> /* >> * To avoid reference counting problems when writing link-layer >> Index: net/route.h >> =================================================================== >> RCS file: /data/mirror/openbsd/cvs/src/sys/net/route.h,v >> diff -u -p -r1.210 route.h >> --- net/route.h 31 Mar 2024 15:53:12 -0000 1.210 >> +++ net/route.h 25 Dec 2024 17:24:16 -0000 >> @@ -61,7 +61,7 @@ struct rt_kmetrics { >> u_int64_t rmx_pksent; /* packets sent using this route */ >> int64_t rmx_expire; /* lifetime for route, e.g. redirect */ >> u_int rmx_locks; /* Kernel must leave these values */ >> - u_int rmx_mtu; /* MTU for this path */ >> + u_int rmx_mtu; /* [a] MTU for this path */ >> }; >> #endif >> >> Index: net/rtsock.c >> =================================================================== >> RCS file: /data/mirror/openbsd/cvs/src/sys/net/rtsock.c,v >> diff -u -p -r1.375 rtsock.c >> --- net/rtsock.c 12 Jul 2024 17:20:18 -0000 1.375 >> +++ net/rtsock.c 25 Dec 2024 17:39:32 -0000 >> @@ -1345,7 +1345,7 @@ route_cleargateway(struct rtentry *rt, v >> >> if (ISSET(rt->rt_flags, RTF_GATEWAY) && rt->rt_gwroute == nhrt && >> !ISSET(rt->rt_locks, RTV_MTU)) >> - rt->rt_mtu = 0; >> + atomic_store_int(&rt->rt_mtu, 0); >> >> return (0); >> } >> @@ -1393,7 +1393,7 @@ rtm_setmetrics(u_long which, const struc >> int64_t expire; >> >> if (which & RTV_MTU) >> - out->rmx_mtu = in->rmx_mtu; >> + atomic_store_int(&out->rmx_mtu, in->rmx_mtu); >> if (which & RTV_EXPIRE) { >> expire = in->rmx_expire; >> if (expire != 0) { >> @@ -1421,7 +1421,7 @@ rtm_getmetrics(const struct rtentry *rt, >> >> bzero(out, sizeof(*out)); >> out->rmx_locks = in->rmx_locks; >> - out->rmx_mtu = in->rmx_mtu; >> + out->rmx_mtu = atomic_load_int(&in->rmx_mtu); >> out->rmx_expire = expire; >> out->rmx_pksent = in->rmx_pksent; >> } >> Index: netinet/ip_icmp.c >> =================================================================== >> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_icmp.c,v >> diff -u -p -r1.197 ip_icmp.c >> --- netinet/ip_icmp.c 4 Dec 2024 22:24:11 -0000 1.197 >> +++ netinet/ip_icmp.c 25 Dec 2024 17:46:46 -0000 >> @@ -1008,6 +1008,7 @@ icmp_mtudisc(struct icmp *icp, u_int rta >> { >> struct rtentry *rt; >> struct ifnet *ifp; >> + u_int rtmtu; >> u_long mtu = ntohs(icp->icmp_nextmtu); /* Why a long? IPv6 */ >> >> rt = icmp_mtudisc_clone(icp->icmp_ip.ip_dst, rtableid, 0); >> @@ -1020,17 +1021,18 @@ icmp_mtudisc(struct icmp *icp, u_int rta >> return; >> } >> >> + rtmtu = atomic_load_int(&rt->rt_mtu); >> if (mtu == 0) { >> int i = 0; >> >> mtu = ntohs(icp->icmp_ip.ip_len); >> /* Some 4.2BSD-based routers incorrectly adjust the ip_len */ >> - if (mtu > rt->rt_mtu && rt->rt_mtu != 0) >> + if (mtu > rtmtu && rtmtu != 0) >> mtu -= (icp->icmp_ip.ip_hl << 2); >> >> /* If we still can't guess a value, try the route */ >> if (mtu == 0) { >> - mtu = rt->rt_mtu; >> + mtu = rtmtu; >> >> /* If no route mtu, default to the interface mtu */ >> >> @@ -1055,8 +1057,8 @@ icmp_mtudisc(struct icmp *icp, u_int rta >> if ((rt->rt_locks & RTV_MTU) == 0) { >> if (mtu < 296 || mtu > ifp->if_mtu) >> rt->rt_locks |= RTV_MTU; >> - else if (rt->rt_mtu > mtu || rt->rt_mtu == 0) >> - rt->rt_mtu = mtu; >> + else if (rtmtu > mtu || rtmtu == 0) >> + atomic_cas_uint(&rt->rt_mtu, rtmtu, mtu); >> } >> >> if_put(ifp); >> @@ -1089,7 +1091,7 @@ icmp_mtudisc_timeout(struct rtentry *rt, >> rtableid, NULL); >> } else { >> if ((rt->rt_locks & RTV_MTU) == 0) >> - rt->rt_mtu = 0; >> + atomic_store_int(&rt->rt_mtu, 0); >> } >> >> if_put(ifp); >> Index: netinet/ip_input.c >> =================================================================== >> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_input.c,v >> diff -u -p -r1.402 ip_input.c >> --- netinet/ip_input.c 21 Nov 2024 20:15:44 -0000 1.402 >> +++ netinet/ip_input.c 25 Dec 2024 17:49:03 -0000 >> @@ -1650,8 +1650,11 @@ ip_forward(struct mbuf *m, struct ifnet >> type = ICMP_UNREACH; >> code = ICMP_UNREACH_NEEDFRAG; >> if (rt != NULL) { >> - if (rt->rt_mtu) { >> - destmtu = rt->rt_mtu; >> + u_int rtmtu; >> + >> + rtmtu = atomic_load_int(&rt->rt_mtu); >> + if (rtmtu != 0) { >> + destmtu = rtmtu; >> } else { >> struct ifnet *destifp; >> >> Index: netinet/ip_output.c >> =================================================================== >> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/ip_output.c,v >> diff -u -p -r1.401 ip_output.c >> --- netinet/ip_output.c 2 Jul 2024 18:33:47 -0000 1.401 >> +++ netinet/ip_output.c 25 Dec 2024 18:55:12 -0000 >> @@ -211,7 +211,8 @@ reroute: >> error = EHOSTUNREACH; >> goto bad; >> } >> - if ((mtu = ro->ro_rt->rt_mtu) == 0) >> + mtu = atomic_load_int(&ro->ro_rt->rt_mtu); >> + if (mtu == 0) >> mtu = ifp->if_mtu; >> >> if (ro->ro_rt->rt_flags & RTF_GATEWAY) >> @@ -470,9 +471,14 @@ sendit: >> */ >> if (rtisvalid(ro->ro_rt) && >> ISSET(ro->ro_rt->rt_flags, RTF_HOST) && >> - !(ro->ro_rt->rt_locks & RTV_MTU) && >> - (ro->ro_rt->rt_mtu > ifp->if_mtu)) { >> - ro->ro_rt->rt_mtu = ifp->if_mtu; >> + !(ro->ro_rt->rt_locks & RTV_MTU)) { >> + u_int rtmtu; >> + >> + rtmtu = atomic_load_int(&ro->ro_rt->rt_mtu); >> + if (rtmtu > ifp->if_mtu) { >> + atomic_cas_uint(&ro->ro_rt->rt_mtu, rtmtu, >> + ifp->if_mtu); >> + } >> } >> ipstat_inc(ips_cantfrag); >> goto bad; >> @@ -558,7 +564,7 @@ ip_output_ipsec_pmtu_update(struct tdb * >> DPRINTF("spi %08x mtu %d rt %p cloned %d", >> ntohl(tdb->tdb_spi), tdb->tdb_mtu, rt, rt_mtucloned); >> if (rt != NULL) { >> - rt->rt_mtu = tdb->tdb_mtu; >> + atomic_store_int(&rt->rt_mtu, tdb->tdb_mtu); >> if (ro != NULL && ro->ro_rt != NULL) { >> rtfree(ro->ro_rt); >> ro->ro_rt = rtalloc(&ro->ro_dstsa, RT_RESOLVE, >> Index: netinet/tcp_input.c >> =================================================================== >> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet/tcp_input.c,v >> diff -u -p -r1.412 tcp_input.c >> --- netinet/tcp_input.c 24 Dec 2024 19:16:53 -0000 1.412 >> +++ netinet/tcp_input.c 25 Dec 2024 18:56:37 -0000 >> @@ -2835,7 +2835,7 @@ tcp_mss(struct tcpcb *tp, int offer) >> * if there's an mtu associated with the route and we support >> * path MTU discovery for the underlying protocol family, use it. >> */ >> - rtmtu = READ_ONCE(rt->rt_mtu); >> + rtmtu = atomic_load_int(&rt->rt_mtu); >> if (rtmtu) { >> /* >> * One may wish to lower MSS to take into account options, >> Index: netinet6/icmp6.c >> =================================================================== >> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/icmp6.c,v >> diff -u -p -r1.255 icmp6.c >> --- netinet6/icmp6.c 12 Aug 2024 11:25:27 -0000 1.255 >> +++ netinet6/icmp6.c 25 Dec 2024 19:00:50 -0000 >> @@ -1016,16 +1016,20 @@ icmp6_mtudisc_update(struct ip6ctlparam >> rt = icmp6_mtudisc_clone(&sin6, m->m_pkthdr.ph_rtableid, 0); >> >> if (rt != NULL && ISSET(rt->rt_flags, RTF_HOST) && >> - !(rt->rt_locks & RTV_MTU) && >> - (rt->rt_mtu > mtu || rt->rt_mtu == 0)) { >> - struct ifnet *ifp; >> - >> - ifp = if_get(rt->rt_ifidx); >> - if (ifp != NULL && mtu < ifp->if_mtu) { >> - icmp6stat_inc(icp6s_pmtuchg); >> - rt->rt_mtu = mtu; >> + !(rt->rt_locks & RTV_MTU)) { >> + u_int rtmtu; >> + >> + rtmtu = atomic_load_int(&rt->rt_mtu); >> + if (rtmtu > mtu || rtmtu == 0) { >> + struct ifnet *ifp; >> + >> + ifp = if_get(rt->rt_ifidx); >> + if (ifp != NULL && mtu < ifp->if_mtu) { >> + icmp6stat_inc(icp6s_pmtuchg); >> + atomic_cas_uint(&rt->rt_mtu, rtmtu, mtu); >> + } >> + if_put(ifp); >> } >> - if_put(ifp); >> } >> rtfree(rt); >> >> @@ -1848,7 +1852,7 @@ icmp6_mtudisc_timeout(struct rtentry *rt >> rtdeletemsg(rt, ifp, rtableid); >> } else { >> if (!(rt->rt_locks & RTV_MTU)) >> - rt->rt_mtu = 0; >> + atomic_store_int(&rt->rt_mtu, 0); >> } >> >> if_put(ifp); >> Index: netinet6/ip6_forward.c >> =================================================================== >> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/ip6_forward.c,v >> diff -u -p -r1.124 ip6_forward.c >> --- netinet6/ip6_forward.c 19 Jul 2024 16:58:32 -0000 1.124 >> +++ netinet6/ip6_forward.c 25 Dec 2024 19:02:20 -0000 >> @@ -399,8 +399,11 @@ senderr: >> case EMSGSIZE: >> type = ICMP6_PACKET_TOO_BIG; >> if (rt != NULL) { >> - if (rt->rt_mtu) { >> - destmtu = rt->rt_mtu; >> + u_int rtmtu; >> + >> + rtmtu = atomic_load_int(&rt->rt_mtu); >> + if (rtmtu != 0) { >> + destmtu = rtmtu; >> } else { >> struct ifnet *destifp; >> >> Index: netinet6/ip6_output.c >> =================================================================== >> RCS file: /data/mirror/openbsd/cvs/src/sys/netinet6/ip6_output.c,v >> diff -u -p -r1.292 ip6_output.c >> --- netinet6/ip6_output.c 4 Jul 2024 12:50:08 -0000 1.292 >> +++ netinet6/ip6_output.c 25 Dec 2024 19:16:03 -0000 >> @@ -156,8 +156,8 @@ struct idgen32_ctx ip6_id_ctx; >> * The mbuf chain containing the packet will be freed. >> * The mbuf opt, if present, will not be freed. >> * >> - * type of "mtu": rt_mtu is u_long, ifnet.ifr_mtu is int. >> - * We use u_long to hold largest one, * which is rt_mtu. >> + * type of "mtu": rt_mtu is u_int, ifnet.ifr_mtu is int. >> + * We use u_long to hold largest one. XXX should be u_int >> */ >> int >> ip6_output(struct mbuf *m, struct ip6_pktopts *opt, struct route *ro, >> @@ -1027,11 +1027,11 @@ ip6_insertfraghdr(struct mbuf *m0, struc >> int >> ip6_getpmtu(struct rtentry *rt, struct ifnet *ifp, u_long *mtup) >> { >> - u_int32_t mtu = 0; >> + u_int mtu, rtmtu; >> int error = 0; >> >> if (rt != NULL) { >> - mtu = rt->rt_mtu; >> + mtu = rtmtu = atomic_load_int(&rt->rt_mtu); >> if (mtu == 0) >> mtu = ifp->if_mtu; >> else if (mtu < IPV6_MMTU) { >> @@ -1048,7 +1048,7 @@ ip6_getpmtu(struct rtentry *rt, struct i >> */ >> mtu = ifp->if_mtu; >> if (!(rt->rt_locks & RTV_MTU)) >> - rt->rt_mtu = mtu; >> + atomic_cas_uint(&rt->rt_mtu, rtmtu, mtu); >> } >> } else { >> mtu = ifp->if_mtu; >> @@ -2812,7 +2812,7 @@ ip6_output_ipsec_pmtu_update(struct tdb >> DPRINTF("spi %08x mtu %d rt %p cloned %d", >> ntohl(tdb->tdb_spi), tdb->tdb_mtu, rt, rt_mtucloned); >> if (rt != NULL) { >> - rt->rt_mtu = tdb->tdb_mtu; >> + atomic_store_int(&rt->rt_mtu, tdb->tdb_mtu); >> if (ro != NULL && ro->ro_rt != NULL) { >> rtfree(ro->ro_rt); >> ro->ro_rt = rtalloc(&ro->ro_dstsa, RT_RESOLVE, >> Index: sys/atomic.h >> =================================================================== >> RCS file: /data/mirror/openbsd/cvs/src/sys/sys/atomic.h,v >> diff -u -p -r1.9 atomic.h >> --- sys/atomic.h 21 Mar 2022 05:45:52 -0000 1.9 >> +++ sys/atomic.h 25 Dec 2024 17:38:09 -0000 >> @@ -202,13 +202,13 @@ atomic_sub_long_nv(volatile unsigned lon >> */ >> >> static inline unsigned int >> -atomic_load_int(volatile unsigned int *p) >> +atomic_load_int(volatile const unsigned int *p) >> { >> return *p; >> } >> >> static inline unsigned long >> -atomic_load_long(volatile unsigned long *p) >> +atomic_load_long(volatile const unsigned long *p) >> { >> return *p; >> } >