Index | Thread | Search

From:
Vitaliy Makkoveev <mvs@openbsd.org>
Subject:
Re: fstat sysctl crashes kernel
To:
Vitaliy Makkoveev <otto@bsdbox.dev>
Cc:
Alexander Bluhm <bluhm@openbsd.org>, tech@openbsd.org
Date:
Thu, 8 Aug 2024 14:28:09 +0300

Download raw body.

Thread
  • Vitaliy Makkoveev:

    fstat sysctl crashes kernel

  • Vitaliy Makkoveev:

    fstat sysctl crashes kernel

  • On Thu, Aug 08, 2024 at 02:03:19AM +0300, Vitaliy Makkoveev wrote:
    > > On 8 Aug 2024, at 01:49, Alexander Bluhm <bluhm@openbsd.org> wrote:
    > > 
    > > Hi,
    > > 
    > > There is this long lasting problem that fstat(1) in a loop during
    > > heavy network load can crash the kernel.  Bug is that while traversing
    > > allprocess in sysctl KERN_FILE_BYPID the list may change if fill_file()
    > > is sleeping.  Conclusion at previous hackaton was that the single
    > > sysctl is doing too much.
    > > 
    > > So I changed fstat to first call kvm_getprocs() getting all pids,
    > > and then kvm_getfiles(KERN_FILE_BYPID) for each pid.  This allows
    > > to abort the list traversal after a matching process has been found.
    > > So we don't have to deal with dead next pointer.  Of course fstat
    > > gets slower as it does multiple sysctl with linear process search,
    > > but slow is better than crash.
    > > 
    > > It is necessary to lock the current process, otherwise fd_getfile(fdp)
    > > may use a freed struct filedesc.  I am not happy with using
    > > &pr->ps_lock in exit1() as it adds additional sleeping points.  With
    > > the diff below fork+exec, tcpbench, fstat in loops is stable.  To
    > > avoid sleeps in allprocess loop I use RW_SLEEPFAIL.
    > > 
    > > Note that only KERN_FILE_BYPID with given pid is safe.  After we
    > > remove all other sysctl in userland, we can reject them in the
    > > kernel.
    > > 
    > > Any idea how we can lock the process in a nicer way?
    > > 
    > 
    > As I remember, when I tried to introduce allprocess rwlock(9), the
    > only problem was false (?) witness(4) report. I like to follow this
    > way. I don’t mean various allprocess list iterators, as it was exposed
    > this was bad idea.
    > 
    
    The minimal diff. Since both fork1() and exit1() are kernel locked, only
    sysctl_file() `allprocess' loops protected with `allprocess_lock'
    rwlock(9).
    
    Continious "make -j 16" with simultaneous fstat(1) loops and some
    network load are stable on my amd64.
    
    Index: sys/kern/kern_exit.c
    ===================================================================
    RCS file: /cvs/src/sys/kern/kern_exit.c,v
    diff -u -p -r1.230 kern_exit.c
    --- sys/kern/kern_exit.c	6 Aug 2024 18:41:20 -0000	1.230
    +++ sys/kern/kern_exit.c	8 Aug 2024 10:55:09 -0000
    @@ -210,6 +210,8 @@ exit1(struct proc *p, int xexit, int xsi
     #endif
     
     	if ((p->p_flag & P_THREAD) == 0) {
    +		rw_enter_write(&allprocess_lock);
    +
     		if (pr->ps_flags & PS_PROFIL)
     			stopprofclock(pr);
     
    @@ -291,6 +293,8 @@ exit1(struct proc *p, int xexit, int xsi
     			 */
     			freepid(pr->ps_pid);
     		}
    +
    +		rw_exit_write(&allprocess_lock);
     
     		/*
     		 * Reparent children to their original parent, in case
    Index: sys/kern/kern_fork.c
    ===================================================================
    RCS file: /cvs/src/sys/kern/kern_fork.c,v
    diff -u -p -r1.261 kern_fork.c
    --- sys/kern/kern_fork.c	6 Aug 2024 08:44:54 -0000	1.261
    +++ sys/kern/kern_fork.c	8 Aug 2024 10:55:09 -0000
    @@ -392,6 +392,8 @@ fork1(struct proc *curp, int flags, void
     		return (ENOMEM);
     	}
     
    +	rw_enter_write(&allprocess_lock);
    +
     	/*
     	 * From now on, we're committed to the fork and cannot fail.
     	 */
    @@ -503,6 +505,14 @@ fork1(struct proc *curp, int flags, void
     		*rnewprocp = p;
     
     	/*
    +	 * Return child pid to parent process
    +	 */
    +	if (retval != NULL)
    +		*retval = pr->ps_pid;
    +
    +	rw_exit_write(&allprocess_lock);
    +
    +	/*
     	 * Preserve synchronization semantics of vfork.  If waiting for
     	 * child to exec or exit, set PS_PPWAIT on child and PS_ISPWAIT
     	 * on ourselves, and sleep on our process for the latter flag
    @@ -519,11 +529,6 @@ fork1(struct proc *curp, int flags, void
     	if ((flags & FORK_PTRACE) && (curpr->ps_flags & PS_TRACED))
     		psignal(curp, SIGTRAP);
     
    -	/*
    -	 * Return child pid to parent process
    -	 */
    -	if (retval != NULL)
    -		*retval = pr->ps_pid;
     	return (0);
     }
     
    Index: sys/kern/kern_proc.c
    ===================================================================
    RCS file: /cvs/src/sys/kern/kern_proc.c,v
    diff -u -p -r1.99 kern_proc.c
    --- sys/kern/kern_proc.c	8 Jul 2024 13:17:12 -0000	1.99
    +++ sys/kern/kern_proc.c	8 Aug 2024 10:55:09 -0000
    @@ -49,6 +49,7 @@
      *	U	uidinfolk
      */
     
    +struct rwlock allprocess_lock;
     struct rwlock uidinfolk;
     #define	UIHASH(uid)	(&uihashtbl[(uid) & uihash])
     LIST_HEAD(uihashhead, uidinfo) *uihashtbl;	/* [U] */
    @@ -92,6 +93,7 @@ procinit(void)
     	LIST_INIT(&zombprocess);
     	LIST_INIT(&allproc);
     
    +	rw_init(&allprocess_lock, "allpslk");
     	rw_init(&uidinfolk, "uidinfo");
     
     	tidhashtbl = hashinit(maxthread / 4, M_PROC, M_NOWAIT, &tidhash);
    Index: sys/kern/kern_sysctl.c
    ===================================================================
    RCS file: /cvs/src/sys/kern/kern_sysctl.c,v
    diff -u -p -r1.435 kern_sysctl.c
    --- sys/kern/kern_sysctl.c	8 Aug 2024 10:25:00 -0000	1.435
    +++ sys/kern/kern_sysctl.c	8 Aug 2024 10:55:09 -0000
    @@ -1675,6 +1675,7 @@ sysctl_file(int *name, u_int namelen, ch
     			break;
     		}
     		matched = 0;
    +		rw_enter_read(&allprocess_lock);
     		LIST_FOREACH(pr, &allprocess, ps_list) {
     			/*
     			 * skip system, exiting, embryonic and undead
    @@ -1703,10 +1704,12 @@ sysctl_file(int *name, u_int namelen, ch
     				FRELE(fp, p);
     			}
     		}
    +		rw_exit_read(&allprocess_lock);
     		if (!matched)
     			error = ESRCH;
     		break;
     	case KERN_FILE_BYUID:
    +		rw_enter_read(&allprocess_lock);
     		LIST_FOREACH(pr, &allprocess, ps_list) {
     			/*
     			 * skip system, exiting, embryonic and undead
    @@ -1732,6 +1735,7 @@ sysctl_file(int *name, u_int namelen, ch
     				FRELE(fp, p);
     			}
     		}
    +		rw_exit_read(&allprocess_lock);
     		break;
     	default:
     		error = EINVAL;
    Index: sys/sys/proc.h
    ===================================================================
    RCS file: /cvs/src/sys/sys/proc.h,v
    diff -u -p -r1.367 proc.h
    --- sys/sys/proc.h	6 Aug 2024 08:44:54 -0000	1.367
    +++ sys/sys/proc.h	8 Aug 2024 10:55:09 -0000
    @@ -503,6 +503,8 @@ void uid_release(struct uidinfo *);
     #define EXIT_THREAD		0x00000002
     #define EXIT_THREAD_NOCHECK	0x00000003
     
    +extern struct rwlock allprocess_lock;
    +
     #define	TIDHASH(tid)	(&tidhashtbl[(tid) & tidhash])
     extern LIST_HEAD(tidhashhead, proc) *tidhashtbl;
     extern u_long tidhash;
    
    
    
  • Vitaliy Makkoveev:

    fstat sysctl crashes kernel

  • Vitaliy Makkoveev:

    fstat sysctl crashes kernel