From: Vitaliy Makkoveev Subject: Re: Unlock sysctl kern.maxclusters To: Mark Kettenis Cc: Alexander Bluhm , tech@openbsd.org Date: Tue, 3 Jun 2025 17:10:49 +0300 On Tue, Jun 03, 2025 at 03:07:31PM +0200, Mark Kettenis wrote: > > Date: Tue, 3 Jun 2025 11:51:11 +0200 > > From: Alexander Bluhm > > > > Hi, > > > > Sysctl KERN_MAXCLUSTERS only needs a mutex to keep nmbclust and > > mbuf_mem_limit in sync. All other read access happens atomically. > > > > ok? > > This is questionable. Either it doesn't matter that these variables > are in sync, or it doesn't. And if it does matter I'd expect to see > the mutex used to access the variables. > > Now what I think you're trying to prevent here is two processes doing > sysctl kern.maxclusters=xxx racing eachother producing resulting > inconsistent values of nmbclust and mbuf_mem_limit. There is a much > better way to achieve that though: get rid of one of the variables and > calculate it from the other one when you need it. I'd probably get > rid of mbuf_mem_limit in this case. > `nmbclust' used only in sysctl path and only for `mbuf_mem_limit' calculation. Why the `nmbclust' and `mbuf_mem_limit' serialization is not enough? Why do we need to move "nmbclust * MCLBYTES" to the hot path and use it instead of pre-calculated `mbuf_mem_limit'? The only thing I don't like is the non serialized pool_wakeup() calls, but mostly for hypothetical pl_enter() for the pool_runqueue() dry runs. That's why I propose to use existing `sysctl_lock' rwlock to serialize the whole nmbclust_update(). But how often do we modify kern.maxclusters in the most common case? Once per /etc/sysctl.conf or never? > > > Index: kern/kern_sysctl.c > > =================================================================== > > RCS file: /data/mirror/openbsd/cvs/src/sys/kern/kern_sysctl.c,v > > diff -u -p -r1.469 kern_sysctl.c > > --- kern/kern_sysctl.c 1 Jun 2025 03:43:48 -0000 1.469 > > +++ kern/kern_sysctl.c 2 Jun 2025 18:27:49 -0000 > > @@ -553,6 +553,15 @@ kern_sysctl(int *name, u_int namelen, vo > > microboottime(&bt); > > return (sysctl_rdstruct(oldp, oldlenp, newp, &bt, sizeof bt)); > > } > > + case KERN_MAXCLUSTERS: { > > + int oval, nval; > > + > > + oval = nval = atomic_load_long(&nmbclust); > > + error = sysctl_int(oldp, oldlenp, newp, newlen, &nval); > > + if (error == 0 && oval != nval) > > + error = nmbclust_update(nval); > > + return (error); > > + } > > case KERN_MBSTAT: { > > uint64_t counters[MBSTAT_COUNT]; > > struct mbstat mbs; > > @@ -758,13 +767,6 @@ kern_sysctl_locked(int *name, u_int name > > return (EINVAL); > > stackgap_random = stackgap; > > return (0); > > - case KERN_MAXCLUSTERS: { > > - int val = nmbclust; > > - error = sysctl_int(oldp, oldlenp, newp, newlen, &val); > > - if (error == 0 && val != nmbclust) > > - error = nmbclust_update(val); > > - return (error); > > - } > > case KERN_CACHEPCT: { > > u_int64_t dmapages; > > int opct, pgs; > > Index: kern/uipc_mbuf.c > > =================================================================== > > RCS file: /data/mirror/openbsd/cvs/src/sys/kern/uipc_mbuf.c,v > > diff -u -p -r1.296 uipc_mbuf.c > > --- kern/uipc_mbuf.c 1 Jan 2025 13:44:22 -0000 1.296 > > +++ kern/uipc_mbuf.c 2 Jun 2025 18:27:49 -0000 > > @@ -129,6 +129,9 @@ struct mutex m_extref_mtx = MUTEX_INITIA > > void m_extfree(struct mbuf *); > > void m_zero(struct mbuf *); > > > > +/* keep nmbclust and mbuf_mem_limit in sync */ > > +struct mutex mbuf_nmbclust_mtx = MUTEX_INITIALIZER(IPL_NET); > > + > > unsigned long mbuf_mem_limit; /* [a] how much memory can be allocated */ > > unsigned long mbuf_mem_alloc; /* [a] how much memory has been allocated */ > > > > @@ -218,8 +221,10 @@ nmbclust_update(long newval) > > if (newval <= 0 || newval > LONG_MAX / MCLBYTES) > > return ERANGE; > > /* update the global mbuf memory limit */ > > - nmbclust = newval; > > - atomic_store_long(&mbuf_mem_limit, nmbclust * MCLBYTES); > > + mtx_enter(&mbuf_nmbclust_mtx); > > + atomic_store_long(&nmbclust, newval); > > + atomic_store_long(&mbuf_mem_limit, newval * MCLBYTES); > > + mtx_leave(&mbuf_nmbclust_mtx); > > > > pool_wakeup(&mbpool); > > for (i = 0; i < nitems(mclsizes); i++) > > Index: net/pf_ioctl.c > > =================================================================== > > RCS file: /data/mirror/openbsd/cvs/src/sys/net/pf_ioctl.c,v > > diff -u -p -r1.421 pf_ioctl.c > > --- net/pf_ioctl.c 22 May 2025 06:34:03 -0000 1.421 > > +++ net/pf_ioctl.c 2 Jun 2025 18:27:49 -0000 > > @@ -2134,7 +2134,8 @@ pfioctl(dev_t dev, u_long cmd, caddr_t a > > goto fail; > > } > > /* Fragments reference mbuf clusters. */ > > - if (pl->index == PF_LIMIT_FRAGS && pl->limit > nmbclust) { > > + if (pl->index == PF_LIMIT_FRAGS && pl->limit > > > + (long)atomic_load_long(&nmbclust)) { > > error = EINVAL; > > PF_UNLOCK(); > > goto fail; > > > > >