From: Vitaliy Makkoveev Subject: Re: fstat sysctl crashes kernel To: Vitaliy Makkoveev Cc: Alexander Bluhm , tech@openbsd.org Date: Thu, 8 Aug 2024 14:28:09 +0300 On Thu, Aug 08, 2024 at 02:03:19AM +0300, Vitaliy Makkoveev wrote: > > On 8 Aug 2024, at 01:49, Alexander Bluhm wrote: > > > > Hi, > > > > There is this long lasting problem that fstat(1) in a loop during > > heavy network load can crash the kernel. Bug is that while traversing > > allprocess in sysctl KERN_FILE_BYPID the list may change if fill_file() > > is sleeping. Conclusion at previous hackaton was that the single > > sysctl is doing too much. > > > > So I changed fstat to first call kvm_getprocs() getting all pids, > > and then kvm_getfiles(KERN_FILE_BYPID) for each pid. This allows > > to abort the list traversal after a matching process has been found. > > So we don't have to deal with dead next pointer. Of course fstat > > gets slower as it does multiple sysctl with linear process search, > > but slow is better than crash. > > > > It is necessary to lock the current process, otherwise fd_getfile(fdp) > > may use a freed struct filedesc. I am not happy with using > > &pr->ps_lock in exit1() as it adds additional sleeping points. With > > the diff below fork+exec, tcpbench, fstat in loops is stable. To > > avoid sleeps in allprocess loop I use RW_SLEEPFAIL. > > > > Note that only KERN_FILE_BYPID with given pid is safe. After we > > remove all other sysctl in userland, we can reject them in the > > kernel. > > > > Any idea how we can lock the process in a nicer way? > > > > As I remember, when I tried to introduce allprocess rwlock(9), the > only problem was false (?) witness(4) report. I like to follow this > way. I don’t mean various allprocess list iterators, as it was exposed > this was bad idea. > The minimal diff. Since both fork1() and exit1() are kernel locked, only sysctl_file() `allprocess' loops protected with `allprocess_lock' rwlock(9). Continious "make -j 16" with simultaneous fstat(1) loops and some network load are stable on my amd64. Index: sys/kern/kern_exit.c =================================================================== RCS file: /cvs/src/sys/kern/kern_exit.c,v diff -u -p -r1.230 kern_exit.c --- sys/kern/kern_exit.c 6 Aug 2024 18:41:20 -0000 1.230 +++ sys/kern/kern_exit.c 8 Aug 2024 10:55:09 -0000 @@ -210,6 +210,8 @@ exit1(struct proc *p, int xexit, int xsi #endif if ((p->p_flag & P_THREAD) == 0) { + rw_enter_write(&allprocess_lock); + if (pr->ps_flags & PS_PROFIL) stopprofclock(pr); @@ -291,6 +293,8 @@ exit1(struct proc *p, int xexit, int xsi */ freepid(pr->ps_pid); } + + rw_exit_write(&allprocess_lock); /* * Reparent children to their original parent, in case Index: sys/kern/kern_fork.c =================================================================== RCS file: /cvs/src/sys/kern/kern_fork.c,v diff -u -p -r1.261 kern_fork.c --- sys/kern/kern_fork.c 6 Aug 2024 08:44:54 -0000 1.261 +++ sys/kern/kern_fork.c 8 Aug 2024 10:55:09 -0000 @@ -392,6 +392,8 @@ fork1(struct proc *curp, int flags, void return (ENOMEM); } + rw_enter_write(&allprocess_lock); + /* * From now on, we're committed to the fork and cannot fail. */ @@ -503,6 +505,14 @@ fork1(struct proc *curp, int flags, void *rnewprocp = p; /* + * Return child pid to parent process + */ + if (retval != NULL) + *retval = pr->ps_pid; + + rw_exit_write(&allprocess_lock); + + /* * Preserve synchronization semantics of vfork. If waiting for * child to exec or exit, set PS_PPWAIT on child and PS_ISPWAIT * on ourselves, and sleep on our process for the latter flag @@ -519,11 +529,6 @@ fork1(struct proc *curp, int flags, void if ((flags & FORK_PTRACE) && (curpr->ps_flags & PS_TRACED)) psignal(curp, SIGTRAP); - /* - * Return child pid to parent process - */ - if (retval != NULL) - *retval = pr->ps_pid; return (0); } Index: sys/kern/kern_proc.c =================================================================== RCS file: /cvs/src/sys/kern/kern_proc.c,v diff -u -p -r1.99 kern_proc.c --- sys/kern/kern_proc.c 8 Jul 2024 13:17:12 -0000 1.99 +++ sys/kern/kern_proc.c 8 Aug 2024 10:55:09 -0000 @@ -49,6 +49,7 @@ * U uidinfolk */ +struct rwlock allprocess_lock; struct rwlock uidinfolk; #define UIHASH(uid) (&uihashtbl[(uid) & uihash]) LIST_HEAD(uihashhead, uidinfo) *uihashtbl; /* [U] */ @@ -92,6 +93,7 @@ procinit(void) LIST_INIT(&zombprocess); LIST_INIT(&allproc); + rw_init(&allprocess_lock, "allpslk"); rw_init(&uidinfolk, "uidinfo"); tidhashtbl = hashinit(maxthread / 4, M_PROC, M_NOWAIT, &tidhash); Index: sys/kern/kern_sysctl.c =================================================================== RCS file: /cvs/src/sys/kern/kern_sysctl.c,v diff -u -p -r1.435 kern_sysctl.c --- sys/kern/kern_sysctl.c 8 Aug 2024 10:25:00 -0000 1.435 +++ sys/kern/kern_sysctl.c 8 Aug 2024 10:55:09 -0000 @@ -1675,6 +1675,7 @@ sysctl_file(int *name, u_int namelen, ch break; } matched = 0; + rw_enter_read(&allprocess_lock); LIST_FOREACH(pr, &allprocess, ps_list) { /* * skip system, exiting, embryonic and undead @@ -1703,10 +1704,12 @@ sysctl_file(int *name, u_int namelen, ch FRELE(fp, p); } } + rw_exit_read(&allprocess_lock); if (!matched) error = ESRCH; break; case KERN_FILE_BYUID: + rw_enter_read(&allprocess_lock); LIST_FOREACH(pr, &allprocess, ps_list) { /* * skip system, exiting, embryonic and undead @@ -1732,6 +1735,7 @@ sysctl_file(int *name, u_int namelen, ch FRELE(fp, p); } } + rw_exit_read(&allprocess_lock); break; default: error = EINVAL; Index: sys/sys/proc.h =================================================================== RCS file: /cvs/src/sys/sys/proc.h,v diff -u -p -r1.367 proc.h --- sys/sys/proc.h 6 Aug 2024 08:44:54 -0000 1.367 +++ sys/sys/proc.h 8 Aug 2024 10:55:09 -0000 @@ -503,6 +503,8 @@ void uid_release(struct uidinfo *); #define EXIT_THREAD 0x00000002 #define EXIT_THREAD_NOCHECK 0x00000003 +extern struct rwlock allprocess_lock; + #define TIDHASH(tid) (&tidhashtbl[(tid) & tidhash]) extern LIST_HEAD(tidhashhead, proc) *tidhashtbl; extern u_long tidhash;