From: Vitaliy Makkoveev Subject: Re: sysctl(2): push locks down to vfs_sysctl() To: tech@openbsd.org Date: Fri, 8 Nov 2024 20:36:43 +0300 anyone? > On 31 Oct 2024, at 15:27, Vitaliy Makkoveev wrote: > > 1. VFS_MAXTYPENUM > > `maxvfsconf' is immutable, after vfsinit() updates it. > > 2. VFS_CONF > > The registered file systems array `vfsconflist' is immutable, so no > locks required for vfs_bytypenum() and `vfsp' dereference. > > Except atomically modified `vfc_refcount', 'vfsconf' structure is > immutable so use atomic_load_int(9) to load `vfc_refcount' value > into `tmpvfsp' local copy. > > 3. VFS_BCACHESTAT > > The `tmpbcstats' buffer cache statistics is mutable, but kernel lock > is enough to store it to local copy. So follow VFS_CONF way and > allocate temporary `tmpbcstats' to pass statistics to userland. > > 4. Non generic filesystem case > > The optional (*vfs_sysctl)() handler returns EINVAL for all the cases > except fusefs_sysctl(), nfs_sysctl() and ffs_sysctl(). > > 1. fusefs_sysctl() does read-only access to array of integers > `fusefs_vars', so no locking required. > > 2. nfs_sysctl() is complicated, so it was wrapped by sysctl_vslock() > to keep original locking. > > 3. ffs_sysctl() modifies array of integers `ffs_vars', so kernel lock > is enough because sysctl_bounded_arr() does context switch safe > load/store. > > Index: sys/kern/kern_sysctl.c > =================================================================== > RCS file: /cvs/src/sys/kern/kern_sysctl.c,v > diff -u -p -r1.451 kern_sysctl.c > --- sys/kern/kern_sysctl.c 31 Oct 2024 10:06:51 -0000 1.451 > +++ sys/kern/kern_sysctl.c 31 Oct 2024 12:21:07 -0000 > @@ -266,6 +266,7 @@ sys_sysctl(struct proc *p, void *v, regi > fn = fs_sysctl; > break; > case CTL_VFS: > + dolock = 0; > fn = vfs_sysctl; > break; > case CTL_MACHDEP: > Index: sys/kern/vfs_init.c > =================================================================== > RCS file: /cvs/src/sys/kern/vfs_init.c,v > diff -u -p -r1.44 vfs_init.c > --- sys/kern/vfs_init.c 20 May 2024 09:11:21 -0000 1.44 > +++ sys/kern/vfs_init.c 31 Oct 2024 12:21:07 -0000 > @@ -44,13 +44,18 @@ > #include > #include > > +/* > + * Locks used to protect data: > + * I immutable after creation > + */ > + > struct pool namei_pool; > > /* This defines the root filesystem. */ > struct vnode *rootvnode; > > /* Set up the filesystem operations for vnodes. */ > -static struct vfsconf vfsconflist[] = { > +static struct vfsconf vfsconflist[] = { /* [I] */ > #ifdef FFS > { &ffs_vfsops, MOUNT_FFS, 1, 0, MNT_LOCAL | MNT_SWAPPABLE, > sizeof(struct ufs_args) }, > @@ -107,7 +112,7 @@ static struct vfsconf vfsconflist[] = { > * Initially the size of the list, vfsinit will set maxvfsconf > * to the highest defined type number. > */ > -int maxvfsconf = sizeof(vfsconflist) / sizeof(struct vfsconf); > +int maxvfsconf = sizeof(vfsconflist) / sizeof(struct vfsconf); /* [I] */ > > /* Initialize the vnode structures and initialize each file system type. */ > void > Index: sys/kern/vfs_subr.c > =================================================================== > RCS file: /cvs/src/sys/kern/vfs_subr.c,v > diff -u -p -r1.325 vfs_subr.c > --- sys/kern/vfs_subr.c 31 Oct 2024 10:06:51 -0000 1.325 > +++ sys/kern/vfs_subr.c 31 Oct 2024 12:21:07 -0000 > @@ -1375,6 +1375,7 @@ vfs_sysctl(int *name, u_int namelen, voi > size_t newlen, struct proc *p) > { > struct vfsconf *vfsp, *tmpvfsp; > + struct bcachestats *tmpbcstats; > int ret; > > /* all sysctl names at this level are at least name and field */ > @@ -1402,10 +1403,14 @@ vfs_sysctl(int *name, u_int namelen, voi > if (vfsp == NULL) > return (EOPNOTSUPP); > > - /* Make a copy, clear out kernel pointers */ > + /* > + * Make a copy, clear out kernel pointers, > + * atomically load refcount > + */ > tmpvfsp = malloc(sizeof(*tmpvfsp), M_TEMP, M_WAITOK|M_ZERO); > memcpy(tmpvfsp, vfsp, sizeof(*tmpvfsp)); > tmpvfsp->vfc_vfsops = NULL; > + tmpvfsp->vfc_refcount = atomic_load_int(&vfsp->vfc_refcount); > > ret = sysctl_rdstruct(oldp, oldlenp, newp, tmpvfsp, > sizeof(struct vfsconf)); > @@ -1413,9 +1418,17 @@ vfs_sysctl(int *name, u_int namelen, voi > free(tmpvfsp, M_TEMP, sizeof(*tmpvfsp)); > return (ret); > case VFS_BCACHESTAT: /* buffer cache statistics */ > - ret = sysctl_rdstruct(oldp, oldlenp, newp, &bcstats, > - sizeof(struct bcachestats)); > - return(ret); > + tmpbcstats = malloc(sizeof(*tmpbcstats), M_TEMP, > + M_WAITOK|M_ZERO); > + KERNEL_LOCK(); > + memcpy(tmpbcstats, &bcstats, sizeof(*tmpbcstats)); > + KERNEL_UNLOCK(); > + > + ret = sysctl_rdstruct(oldp, oldlenp, newp, tmpbcstats, > + sizeof(*tmpbcstats)); > + > + free(tmpbcstats, M_TEMP, sizeof(*tmpbcstats)); > + return (ret); > } > return (EOPNOTSUPP); > } > Index: sys/nfs/nfs_vfsops.c > =================================================================== > RCS file: /cvs/src/sys/nfs/nfs_vfsops.c,v > diff -u -p -r1.132 nfs_vfsops.c > --- sys/nfs/nfs_vfsops.c 30 Oct 2024 06:16:27 -0000 1.132 > +++ sys/nfs/nfs_vfsops.c 31 Oct 2024 12:21:07 -0000 > @@ -65,6 +65,8 @@ > extern struct nfsstats nfsstats; > extern int nfs_ticks; > > +int nfs_sysctl_locked(int *, u_int, void *, size_t *, void *, size_t, > + struct proc *); > int nfs_sysctl(int *, u_int, void *, size_t *, void *, size_t, > struct proc *); > int nfs_checkexp(struct mount *, struct mbuf *, int *, struct ucred **); > @@ -844,8 +846,8 @@ nfs_vget(struct mount *mp, ino_t ino, st > * Do that sysctl thang... > */ > int > -nfs_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, > - size_t newlen, struct proc *p) > +nfs_sysctl_locked(int *name, u_int namelen, void *oldp, size_t *oldlenp, > + void *newp, size_t newlen, struct proc *p) > { > int rv; > > @@ -892,6 +894,21 @@ nfs_sysctl(int *name, u_int namelen, voi > } > } > > +int > +nfs_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, > + size_t newlen, struct proc *p) > +{ > + size_t savelen = *oldlenp; > + int error; > + > + if ((error = sysctl_vslock(oldp, savelen))) > + return error; > + error = nfs_sysctl_locked(name, namelen, oldp, oldlenp, > + newp, newlen, p); > + sysctl_vsunlock(oldp, savelen); > + > + return error; > +} > > /* > * At this point, this should never happen > Index: sys/sys/mount.h > =================================================================== > RCS file: /cvs/src/sys/sys/mount.h,v > diff -u -p -r1.151 mount.h > --- sys/sys/mount.h 3 Feb 2024 18:51:58 -0000 1.151 > +++ sys/sys/mount.h 31 Oct 2024 12:21:07 -0000 > @@ -42,6 +42,12 @@ > #include > #include > > +/* > + * Locks used to protect data: > + * I immutable after creation > + * a atomic > + */ > + > typedef struct { int32_t val[2]; } fsid_t; /* file system id type */ > > /* > @@ -456,12 +462,13 @@ typedef struct fhandle fhandle_t; > * mount time to identify the requested filesystem. > */ > struct vfsconf { > - const struct vfsops *vfc_vfsops; /* filesystem operations vector */ > - char vfc_name[MFSNAMELEN]; /* filesystem type name */ > - int vfc_typenum; /* historic filesystem type number */ > - u_int vfc_refcount; /* number mounted of this type */ > - int vfc_flags; /* permanent flags */ > - size_t vfc_datasize; /* size of data args */ > + const struct vfsops *vfc_vfsops; /* [I] filesystem operations vector */ > + char vfc_name[MFSNAMELEN]; /* [I] filesystem type name */ > + int vfc_typenum; /* [I] historic filesystem type > + number */ > + u_int vfc_refcount; /* [a] number mounted of this type */ > + int vfc_flags; /* [I] permanent flags */ > + size_t vfc_datasize; /* [I] size of data args */ > }; > > /* buffer cache statistics */ > Index: sys/ufs/ffs/ffs_vfsops.c > =================================================================== > RCS file: /cvs/src/sys/ufs/ffs/ffs_vfsops.c,v > diff -u -p -r1.198 ffs_vfsops.c > --- sys/ufs/ffs/ffs_vfsops.c 3 Feb 2024 18:51:58 -0000 1.198 > +++ sys/ufs/ffs/ffs_vfsops.c 31 Oct 2024 12:21:07 -0000 > @@ -1458,6 +1458,12 @@ int > ffs_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, > size_t newlen, struct proc *p) > { > - return sysctl_bounded_arr(ffs_vars, nitems(ffs_vars), name, > + int error; > + > + KERNEL_LOCK(); > + error = sysctl_bounded_arr(ffs_vars, nitems(ffs_vars), name, > namelen, oldp, oldlenp, newp, newlen); > + KERNEL_UNLOCK(); > + > + return (error); > } >