From: Vitaliy Makkoveev Subject: Re: tcp, sysctl: unlock TCPCTL_REASS_LIMIT and TCPCTL_SACKHOLE_LIMIT To: Alexander Bluhm Cc: tech@openbsd.org Date: Tue, 20 May 2025 02:23:05 +0300 On Mon, May 19, 2025 at 11:10:48AM +0900, Alexander Bluhm wrote: > On Tue, May 13, 2025 at 01:45:36PM +0300, Vitaliy Makkoveev wrote: > > On Sat, May 10, 2025 at 08:00:27AM +0300, Vitaliy Makkoveev wrote: > > > They are identical, so unlock them both. The temporary `tcp_sysctl_lock' > > > and the pool_sethardlimit() serialization are useless because we are > > > serialized with sysctl_lock rwlock(9). However I want to do this > > > serialization for consistency reason. I will replace `tcp_sysctl_lock' > > > with `sysctl_lock' after tcp_sysctl() unlocked. > > I am not sure why the serialized with sysctl_lock is actually needed. > Are you preventing that two userland threads call sysctl() and > modify pool_sethardlimit() in parallel? Or is it about the oval > that should be consistent for these threads? > > What concerns me a bit is that pool_sethardlimit() checks (n < > pp->pr_nout) while another thread working with pools may modify it. > Also pool_sethardlimit() is setting a bunch of pr_hardlimit values > without lock. > > Should we add pl_enter(pp, &pp->pr_lock) in pool_sethardlimit()? > It is not the hot path, would prevent modifying the limits in > parallel and prevents using inconsistent limits while setting them. > > Then some of your serialization code in tcp_sysctl() can go away. > This should be the no problem, because the "pp->pr_nout < pp->pr_hardlimit" is the normal case and the code handles it. However, I'm fine to do this check and modification locked. Please note, the `pr_hardlimit_warning', `pr_hardlimit_ratecap' and `pr_hardlimit_warning_last' are not used. I want to delete them, with another diff. > > + if (error == 0 && oval != nval) { > > + extern struct rwlock sysctl_lock; > > + > > + error = rw_enter(&sysctl_lock, RW_WRITE | RW_INTR); > > + if (error) > > + return (error); > > + if (nval != atomic_load_int(var)) { > > + error = pool_sethardlimit(pool, nval, > > + NULL, 0); > > + if (error == 0) > > + atomic_store_int(var, nval); > > + } > > + rw_exit(&sysctl_lock); > > } I want to keep the `sysctl_lock' as is. I can't imagine something worse than pool_sethardlimit() dry-run, but I prefer to keep this serialized even with the pool_sethardlimit() serialized within. This is not the fast path, the interruptible `sysctl_lock' doesn't make it worse, but we 100% sure there is no race effects here. Index: sys/kern/subr_pool.c =================================================================== RCS file: /cvs/src/sys/kern/subr_pool.c,v retrieving revision 1.237 diff -u -p -r1.237 subr_pool.c --- sys/kern/subr_pool.c 4 Jan 2025 09:26:01 -0000 1.237 +++ sys/kern/subr_pool.c 19 May 2025 23:22:01 -0000 @@ -1096,6 +1096,8 @@ pool_sethardlimit(struct pool *pp, u_int { int error = 0; + pl_enter(pp, &pp->pr_lock); + if (n < pp->pr_nout) { error = EINVAL; goto done; @@ -1106,8 +1108,9 @@ pool_sethardlimit(struct pool *pp, u_int pp->pr_hardlimit_ratecap.tv_sec = ratecap; pp->pr_hardlimit_warning_last.tv_sec = 0; pp->pr_hardlimit_warning_last.tv_usec = 0; - done: + pl_leave(pp, &pp->pr_lock); + return (error); } Index: sys/netinet/in_proto.c =================================================================== RCS file: /cvs/src/sys/netinet/in_proto.c,v retrieving revision 1.122 diff -u -p -r1.122 in_proto.c --- sys/netinet/in_proto.c 26 Apr 2025 13:58:08 -0000 1.122 +++ sys/netinet/in_proto.c 19 May 2025 23:22:01 -0000 @@ -198,7 +198,7 @@ const struct protosw inetsw[] = { .pr_domain = &inetdomain, .pr_protocol = IPPROTO_TCP, .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD|PR_ABRTACPTDIS|PR_SPLICE| - PR_MPINPUT, + PR_MPINPUT|PR_MPSYSCTL, .pr_input = tcp_input, .pr_ctlinput = tcp_ctlinput, .pr_ctloutput = tcp_ctloutput, Index: sys/netinet/tcp_usrreq.c =================================================================== RCS file: /cvs/src/sys/netinet/tcp_usrreq.c,v retrieving revision 1.246 diff -u -p -r1.246 tcp_usrreq.c --- sys/netinet/tcp_usrreq.c 13 May 2025 09:17:41 -0000 1.246 +++ sys/netinet/tcp_usrreq.c 19 May 2025 23:22:01 -0000 @@ -1460,29 +1460,38 @@ tcp_sysctl(int *name, u_int namelen, voi return tcp_ident(oldp, oldlenp, newp, newlen, 1); case TCPCTL_REASS_LIMIT: - NET_LOCK(); - nval = tcp_reass_limit; - error = sysctl_int(oldp, oldlenp, newp, newlen, &nval); - if (!error && nval != tcp_reass_limit) { - error = pool_sethardlimit(&tcpqe_pool, nval, NULL, 0); - if (!error) - tcp_reass_limit = nval; + case TCPCTL_SACKHOLE_LIMIT: { + struct pool *pool; + int *var; + + if (name[0] == TCPCTL_REASS_LIMIT) { + pool = &tcpqe_pool; + var = &tcp_reass_limit; + } else { + pool = &sackhl_pool; + var = &tcp_sackhole_limit; } - NET_UNLOCK(); - return (error); - case TCPCTL_SACKHOLE_LIMIT: - NET_LOCK(); - nval = tcp_sackhole_limit; + oval = nval = atomic_load_int(var); error = sysctl_int(oldp, oldlenp, newp, newlen, &nval); - if (!error && nval != tcp_sackhole_limit) { - error = pool_sethardlimit(&sackhl_pool, nval, NULL, 0); - if (!error) - tcp_sackhole_limit = nval; + + if (error == 0 && oval != nval) { + extern struct rwlock sysctl_lock; + + error = rw_enter(&sysctl_lock, RW_WRITE | RW_INTR); + if (error) + return (error); + if (nval != atomic_load_int(var)) { + error = pool_sethardlimit(pool, nval, + NULL, 0); + if (error == 0) + atomic_store_int(var, nval); + } + rw_exit(&sysctl_lock); } - NET_UNLOCK(); - return (error); + return (error); + } case TCPCTL_STATS: return (tcp_sysctl_tcpstat(oldp, oldlenp, newp)); Index: sys/netinet6/in6_proto.c =================================================================== RCS file: /cvs/src/sys/netinet6/in6_proto.c,v retrieving revision 1.125 diff -u -p -r1.125 in6_proto.c --- sys/netinet6/in6_proto.c 26 Apr 2025 13:58:08 -0000 1.125 +++ sys/netinet6/in6_proto.c 19 May 2025 23:22:01 -0000 @@ -148,7 +148,7 @@ const struct protosw inet6sw[] = { .pr_domain = &inet6domain, .pr_protocol = IPPROTO_TCP, .pr_flags = PR_CONNREQUIRED|PR_WANTRCVD|PR_ABRTACPTDIS|PR_SPLICE| - PR_MPINPUT, + PR_MPINPUT|PR_MPSYSCTL, .pr_input = tcp_input, .pr_ctlinput = tcp6_ctlinput, .pr_ctloutput = tcp_ctloutput,