From: Vitaliy Makkoveev Subject: Re: sysctl: unlock IPCTL_MTUDISC To: Alexander Bluhm Cc: tech@openbsd.org Date: Mon, 23 Jun 2025 16:51:08 +0300 On Mon, Jun 23, 2025 at 03:26:58PM +0200, Alexander Bluhm wrote: > On Mon, Jun 23, 2025 at 03:06:23PM +0300, Vitaliy Makkoveev wrote: > > `ip_mtudisc' is atomically accessed boolean, so allow only 0 and 1 > > values to set. Also, while `ip_mtudisc' is 0, the rt_timer_queue_flush() > > will be triggered all the times even if we do read access. There is no > > reason for that, so flush the queue only if this thread successfully > > assigned 0 value. The rt_timer_queue_flush() requires to be serialized > > with the netlock. > > I would guess shared netlock would be enough for rt_timer_queue_flush(). > I think rtdeletemsg() needs it. But there are also custom callbacks. > Better safe than sorry and use exclusive netlock. Routing code is > fragile enough. > rtdeletemsg() takes kernel lock within, so hypothetically the shared netlock is enough, but some (*rtq_func)() handlers are complicated, because they call another handlers. So I prefer to keep it exclusively locked. In the most cases the 'net.inet.ip.mtudisc: 1 -> 0' setting could happen only once through /etc/sysctl.conf processing, and after this exclusively locked section will never be triggered. > OK bluhm@ > > > Index: sys/net/if_bridge.c > > =================================================================== > > RCS file: /cvs/src/sys/net/if_bridge.c,v > > diff -u -p -r1.374 if_bridge.c > > --- sys/net/if_bridge.c 2 Mar 2025 21:28:31 -0000 1.374 > > +++ sys/net/if_bridge.c 23 Jun 2025 11:47:14 -0000 > > @@ -1640,7 +1640,8 @@ bridge_ipsec(struct ifnet *ifp, struct e > > > > ip = mtod(m, struct ip *); > > if ((af == AF_INET) && > > - ip_mtudisc && (ip->ip_off & htons(IP_DF)) && > > + atomic_load_int(&ip_mtudisc) && > > + (ip->ip_off & htons(IP_DF)) && > > tdb->tdb_mtu && ntohs(ip->ip_len) > tdb->tdb_mtu && > > tdb->tdb_mtutimeout > gettime()) { > > bridge_send_icmp_err(ifp, eh, m, > > Index: sys/net/pf.c > > =================================================================== > > RCS file: /cvs/src/sys/net/pf.c,v > > diff -u -p -r1.1214 pf.c > > --- sys/net/pf.c 23 Jun 2025 09:16:32 -0000 1.1214 > > +++ sys/net/pf.c 23 Jun 2025 11:47:14 -0000 > > @@ -3326,7 +3326,7 @@ pf_build_tcp(const struct pf_rule *r, sa > > h->ip_hl = sizeof(*h) >> 2; > > h->ip_tos = IPTOS_LOWDELAY; > > h->ip_len = htons(len); > > - h->ip_off = htons(ip_mtudisc ? IP_DF : 0); > > + h->ip_off = htons(atomic_load_int(&ip_mtudisc) ? IP_DF : 0); > > h->ip_ttl = ttl ? ttl : ip_defttl; > > h->ip_sum = 0; > > h->ip_src.s_addr = saddr->v4.s_addr; > > Index: sys/netinet/ip_input.c > > =================================================================== > > RCS file: /cvs/src/sys/netinet/ip_input.c,v > > diff -u -p -r1.411 ip_input.c > > --- sys/netinet/ip_input.c 23 Jun 2025 09:16:32 -0000 1.411 > > +++ sys/netinet/ip_input.c 23 Jun 2025 11:47:14 -0000 > > @@ -97,7 +97,7 @@ int ipmultipath = 0; /* [a] */ > > int ip_sendredirects = 1; /* [a] */ > > int ip_dosourceroute = 0; /* [a] */ > > int ip_defttl = IPDEFTTL; > > -int ip_mtudisc = 1; > > +int ip_mtudisc = 1; /* [a] */ > > int ip_mtudisc_timeout = IPMTUDISCTIMEOUT; /* [a] */ > > int ip_directedbcast = 0; /* [a] */ > > > > @@ -1743,12 +1743,18 @@ ip_sysctl(int *name, u_int namelen, void > > return (sysctl_securelevel_int(oldp, oldlenp, newp, newlen, > > &ip_dosourceroute)); > > case IPCTL_MTUDISC: > > - NET_LOCK(); > > - error = sysctl_int(oldp, oldlenp, newp, newlen, &ip_mtudisc); > > - if (ip_mtudisc == 0) > > + oldval = newval = atomic_load_int(&ip_mtudisc); > > + error = sysctl_int_bounded(oldp, oldlenp, newp, newlen, > > + &newval, 0, 1); > > + if (error == 0 && oldval != newval && > > + oldval == atomic_cas_uint(&ip_mtudisc, oldval, newval) && > > + newval == 0) { > > + NET_LOCK(); > > rt_timer_queue_flush(&ip_mtudisc_timeout_q); > > - NET_UNLOCK(); > > - return error; > > + NET_UNLOCK(); > > + } > > + > > + return (error); > > case IPCTL_MTUDISCTIMEOUT: > > oldval = newval = atomic_load_int(&ip_mtudisc_timeout); > > error = sysctl_int_bounded(oldp, oldlenp, newp, newlen, > > Index: sys/netinet/ip_output.c > > =================================================================== > > RCS file: /cvs/src/sys/netinet/ip_output.c,v > > diff -u -p -r1.409 ip_output.c > > --- sys/netinet/ip_output.c 14 May 2025 14:32:15 -0000 1.409 > > +++ sys/netinet/ip_output.c 23 Jun 2025 11:47:14 -0000 > > @@ -445,7 +445,7 @@ reroute: > > */ > > if (ip->ip_off & htons(IP_DF)) { > > #ifdef IPSEC > > - if (ip_mtudisc) > > + if (atomic_load_int(&ip_mtudisc)) > > ipsec_adjust_mtu(m, ifp->if_mtu); > > #endif > > error = EMSGSIZE; > > @@ -583,7 +583,8 @@ ip_output_ipsec_send(struct tdb *tdb, st > > struct ip *ip; > > struct in_addr dst; > > u_int len; > > - int error, tso = 0; > > + int tso = 0, ip_mtudisc_local = atomic_load_int(&ip_mtudisc); > > + int error; > > > > #if NPF > 0 > > /* > > @@ -616,7 +617,7 @@ ip_output_ipsec_send(struct tdb *tdb, st > > > > /* Check if we are allowed to fragment */ > > dst = ip->ip_dst; > > - if (ip_mtudisc && (ip->ip_off & htons(IP_DF)) && tdb->tdb_mtu && > > + if (ip_mtudisc_local && (ip->ip_off & htons(IP_DF)) && tdb->tdb_mtu && > > len > tdb->tdb_mtu && tdb->tdb_mtutimeout > gettime()) { > > ip_output_ipsec_pmtu_update(tdb, ro, dst, rtableid); > > ipsec_adjust_mtu(m, tdb->tdb_mtu); > > @@ -624,7 +625,7 @@ ip_output_ipsec_send(struct tdb *tdb, st > > return EMSGSIZE; > > } > > /* propagate IP_DF for v4-over-v6 */ > > - if (ip_mtudisc && ip->ip_off & htons(IP_DF)) > > + if (ip_mtudisc_local && ip->ip_off & htons(IP_DF)) > > SET(m->m_pkthdr.csum_flags, M_IPV6_DF_OUT); > > > > /* > > @@ -661,7 +662,7 @@ ip_output_ipsec_send(struct tdb *tdb, st > > } > > if (!error && tso) > > tcpstat_inc(tcps_outswtso); > > - if (ip_mtudisc && error == EMSGSIZE) > > + if (ip_mtudisc_local && error == EMSGSIZE) > > ip_output_ipsec_pmtu_update(tdb, ro, dst, rtableid); > > return error; > > } > > Index: sys/netinet/ipsec_input.c > > =================================================================== > > RCS file: /cvs/src/sys/netinet/ipsec_input.c,v > > diff -u -p -r1.219 ipsec_input.c > > --- sys/netinet/ipsec_input.c 21 Jun 2025 14:21:17 -0000 1.219 > > +++ sys/netinet/ipsec_input.c 23 Jun 2025 11:47:14 -0000 > > @@ -941,7 +941,8 @@ ipsec_common_ctlinput(u_int rdomain, int > > { > > struct ip *ip = v; > > > > - if (cmd == PRC_MSGSIZE && ip && ip_mtudisc && ip->ip_v == 4) { > > + if (cmd == PRC_MSGSIZE && ip && atomic_load_int(&ip_mtudisc) && > > + ip->ip_v == 4) { > > struct tdb *tdbp; > > struct sockaddr_in dst; > > struct icmp *icp; > > Index: sys/netinet/tcp_input.c > > =================================================================== > > RCS file: /cvs/src/sys/netinet/tcp_input.c,v > > diff -u -p -r1.453 tcp_input.c > > --- sys/netinet/tcp_input.c 18 Jun 2025 16:15:46 -0000 1.453 > > +++ sys/netinet/tcp_input.c 23 Jun 2025 11:47:14 -0000 > > @@ -2952,7 +2952,7 @@ tcp_mss(struct tcpcb *tp, int offer) > > } else if (ifp->if_flags & IFF_LOOPBACK) { > > mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); > > } else if (tp->pf == AF_INET) { > > - if (ip_mtudisc) > > + if (atomic_load_int(&ip_mtudisc)) > > mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr); > > } > > #ifdef INET6 > > @@ -4284,7 +4284,7 @@ syn_cache_respond(struct syn_cache *sc, > > ip->ip_tos = inp->inp_ip.ip_tos; > > > > error = ip_output(m, sc->sc_ipopts, &sc->sc_route, > > - (ip_mtudisc ? IP_MTUDISC : 0), NULL, > > + (atomic_load_int(&ip_mtudisc) ? IP_MTUDISC : 0), NULL, > > inp ? &inp->inp_seclevel : NULL, 0); > > break; > > #ifdef INET6 > > Index: sys/netinet/tcp_output.c > > =================================================================== > > RCS file: /cvs/src/sys/netinet/tcp_output.c,v > > diff -u -p -r1.154 tcp_output.c > > --- sys/netinet/tcp_output.c 21 Apr 2025 09:54:53 -0000 1.154 > > +++ sys/netinet/tcp_output.c 23 Jun 2025 11:47:14 -0000 > > @@ -1095,7 +1095,7 @@ send: > > } > > error = ip_output(m, tp->t_inpcb->inp_options, > > &tp->t_inpcb->inp_route, > > - (ip_mtudisc ? IP_MTUDISC : 0), NULL, > > + (atomic_load_int(&ip_mtudisc) ? IP_MTUDISC : 0), NULL, > > &tp->t_inpcb->inp_seclevel, 0); > > break; > > #ifdef INET6 > > Index: sys/netinet/tcp_subr.c > > =================================================================== > > RCS file: /cvs/src/sys/netinet/tcp_subr.c,v > > diff -u -p -r1.212 tcp_subr.c > > --- sys/netinet/tcp_subr.c 8 Jun 2025 17:06:19 -0000 1.212 > > +++ sys/netinet/tcp_subr.c 23 Jun 2025 11:47:14 -0000 > > @@ -412,7 +412,7 @@ tcp_respond(struct tcpcb *tp, caddr_t te > > ip->ip_tos = 0; > > ip_output(m, NULL, > > tp ? &tp->t_inpcb->inp_route : NULL, > > - ip_mtudisc ? IP_MTUDISC : 0, NULL, > > + atomic_load_int(&ip_mtudisc) ? IP_MTUDISC : 0, NULL, > > tp ? &tp->t_inpcb->inp_seclevel : NULL, 0); > > break; > > } > > @@ -744,7 +744,7 @@ tcp_ctlinput(int cmd, struct sockaddr *s > > return; > > else if (PRC_IS_REDIRECT(cmd)) > > notify = in_pcbrtchange, ip = NULL; > > - else if (cmd == PRC_MSGSIZE && ip_mtudisc && ip) { > > + else if (cmd == PRC_MSGSIZE && atomic_load_int(&ip_mtudisc) && ip) { > > struct inpcb *inp; > > struct socket *so = NULL; > > struct tcpcb *tp = NULL; > > Index: sys/netinet/tcp_timer.c > > =================================================================== > > RCS file: /cvs/src/sys/netinet/tcp_timer.c,v > > diff -u -p -r1.85 tcp_timer.c > > --- sys/netinet/tcp_timer.c 8 Jun 2025 17:06:19 -0000 1.85 > > +++ sys/netinet/tcp_timer.c 23 Jun 2025 11:47:14 -0000 > > @@ -277,7 +277,7 @@ tcp_timer_rexmt(void *arg) > > * lots more sophisticated searching to find the right > > * value here... > > */ > > - if (ip_mtudisc && > > + if (atomic_load_int(&ip_mtudisc) && > > TCPS_HAVEESTABLISHED(tp->t_state) && > > tp->t_rxtshift > TCP_MAXRXTSHIFT / 6) { > > struct rtentry *rt = NULL; > > Index: sys/netinet/udp_usrreq.c > > =================================================================== > > RCS file: /cvs/src/sys/netinet/udp_usrreq.c,v > > diff -u -p -r1.344 udp_usrreq.c > > --- sys/netinet/udp_usrreq.c 18 Jun 2025 16:15:46 -0000 1.344 > > +++ sys/netinet/udp_usrreq.c 23 Jun 2025 11:47:14 -0000 > > @@ -919,7 +919,7 @@ udp_ctlinput(int cmd, struct sockaddr *s > > uhp = (struct udphdr *)((caddr_t)ip + (ip->ip_hl << 2)); > > #ifdef IPSEC > > /* PMTU discovery for udpencap */ > > - if (cmd == PRC_MSGSIZE && ip_mtudisc && > > + if (cmd == PRC_MSGSIZE && atomic_load_int(&ip_mtudisc) && > > atomic_load_int(&udpencap_enable) && udpencap_port_local && > > uhp->uh_sport == udpencap_port_local) { > > udpencap_ctlinput(cmd, sa, rdomain, v); > > Index: sys/netinet6/ip6_output.c > > =================================================================== > > RCS file: /cvs/src/sys/netinet6/ip6_output.c,v > > diff -u -p -r1.299 ip6_output.c > > --- sys/netinet6/ip6_output.c 14 May 2025 14:32:15 -0000 1.299 > > +++ sys/netinet6/ip6_output.c 23 Jun 2025 11:47:14 -0000 > > @@ -693,7 +693,7 @@ reroute: > > > > if (dontfrag && tlen > ifp->if_mtu) { /* case 2-b */ > > #ifdef IPSEC > > - if (ip_mtudisc) > > + if (atomic_load_int(&ip_mtudisc)) > > ipsec_adjust_mtu(m, mtu); > > #endif > > error = EMSGSIZE; > > @@ -2836,7 +2836,8 @@ ip6_output_ipsec_send(struct tdb *tdb, s > > struct ip6_hdr *ip6; > > struct in6_addr dst; > > u_int len; > > - int error, ifidx, tso = 0; > > + int ifidx, tso = 0, ip_mtudisc_local = atomic_load_int(&ip_mtudisc); > > + int error; > > > > #if NPF > 0 > > /* > > @@ -2870,7 +2871,7 @@ ip6_output_ipsec_send(struct tdb *tdb, s > > /* Check if we are allowed to fragment */ > > dst = ip6->ip6_dst; > > ifidx = m->m_pkthdr.ph_ifidx; > > - if (ip_mtudisc && tdb->tdb_mtu && > > + if (ip_mtudisc_local && tdb->tdb_mtu && > > len > tdb->tdb_mtu && tdb->tdb_mtutimeout > gettime()) { > > int transportmode; > > > > @@ -2889,7 +2890,7 @@ ip6_output_ipsec_send(struct tdb *tdb, s > > return EMSGSIZE; > > } > > /* propagate don't fragment for v6-over-v6 */ > > - if (ip_mtudisc) > > + if (ip_mtudisc_local) > > SET(m->m_pkthdr.csum_flags, M_IPV6_DF_OUT); > > > > /* > > @@ -2926,7 +2927,7 @@ ip6_output_ipsec_send(struct tdb *tdb, s > > } > > if (!error && tso) > > tcpstat_inc(tcps_outswtso); > > - if (ip_mtudisc && error == EMSGSIZE) > > + if (ip_mtudisc_local && error == EMSGSIZE) > > ip6_output_ipsec_pmtu_update(tdb, ro, &dst, ifidx, rtableid, 0); > > return error; > > }