Index | Thread | Search

From:
Vitaliy Makkoveev <otto@bsdbox.dev>
Subject:
Re: fstat sysctl crashes kernel
To:
Alexander Bluhm <bluhm@openbsd.org>
Cc:
tech@openbsd.org
Date:
Thu, 8 Aug 2024 21:27:36 +0300

Download raw body.

Thread
> On 8 Aug 2024, at 20:46, Alexander Bluhm <bluhm@openbsd.org> wrote:
> 
> On Thu, Aug 08, 2024 at 09:47:27AM -0600, Theo de Raadt wrote:
>> Todd C. Miller <millert@openbsd.org> wrote:
>> 
>>> On Thu, 08 Aug 2024 13:21:54 +0200, Alexander Bluhm wrote:
>>> 
>>>> Independent of that, can we commit the chunk below?  When searching
>>>> for a specific process, there is not need to traverse the list to
>>>> the end.  Also I check for arg >= 0 as this is cosistent with the
>>>> arg < -1 check before.  Then you don't have to think about what is
>>>> special about process 0.  There is no functional difference as
>>>> process 0 has PS_SYSTEM set and is skipped anyway.
>>> 
>>> Yes, this makes sense. OK millert@
>> 
>> No matter what we do, we don't get an atomic view.  Punting the problem
>> to userland sidesteps the difficult problem in the kernel.
> 
> For me is important that we commit something instead of discussing
> pro and con of various diffs.
> 
> The idea with the marker sounds like the correct thing, but adds a
> lot of complexity.  mvs@ already worked on something simmilar.  The
> easiest fix without locking concerns is my userland fstat diff.
> Locking the whole process list while doing copyout over NFS may
> open another can of worms.
> 

Userland pages are wired at this moment, you can’t stuck in swap
on nfs here, sysctl_vslock() cares about that.

> Both bluhm@ and mvs@ diff survive my stress test for hours.  Usually
> the test crashes or hangs within minutes.  Currently I am testing
> with witness to find fallout.
> 

Nice to hear. Personal me, I prefer allprocess rwlock(9). It is much
simpler and left userland as is. Also, is allows to remove kernel
lock around some process stuff like prfind(). For example, from the 
filt_procattach() where claudio@ left it to make ps dereference safe.

Also, as I can see, others have similar lock around their process
stuff, so this could be good practice.

> Below is my diff merged with current.
> 
> bluhm
> 
> Index: sys/kern/kern_exit.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/kern/kern_exit.c,v
> diff -u -p -r1.230 kern_exit.c
> --- sys/kern/kern_exit.c	6 Aug 2024 18:41:20 -0000	1.230
> +++ sys/kern/kern_exit.c	8 Aug 2024 17:20:17 -0000
> @@ -137,7 +137,9 @@ exit1(struct proc *p, int xexit, int xsi
> 		if (pr->ps_pid == 1)
> 			panic("init died (signal %d, exit %d)", xsig, xexit);
> 
> +		rw_enter_write(&pr->ps_lock);
> 		atomic_setbits_int(&pr->ps_flags, PS_EXITING);
> +		rw_exit_write(&pr->ps_lock);
> 		pr->ps_xexit = xexit;
> 		pr->ps_xsig  = xsig;
> 
> Index: sys/kern/kern_sysctl.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/sys/kern/kern_sysctl.c,v
> diff -u -p -r1.436 kern_sysctl.c
> --- sys/kern/kern_sysctl.c	8 Aug 2024 15:02:36 -0000	1.436
> +++ sys/kern/kern_sysctl.c	8 Aug 2024 17:20:17 -0000
> @@ -1675,15 +1675,22 @@ sysctl_file(int *name, u_int namelen, ch
> 			break;
> 		}
> 		matched = 0;
> + restart:
> 		LIST_FOREACH(pr, &allprocess, ps_list) {
> +			if (arg >= 0 && pr->ps_pid != (pid_t)arg) {
> +				/* not the pid we are looking for */
> +				continue;
> +			}
> 			/*
> 			 * skip system, exiting, embryonic and undead
> 			 * processes
> 			 */
> -			if (pr->ps_flags & (PS_SYSTEM | PS_EMBRYO | PS_EXITING))
> -				continue;
> -			if (arg >= 0 && pr->ps_pid != (pid_t)arg) {
> -				/* not the pid we are looking for */
> +			if (rw_enter(&pr->ps_lock,
> +			    arg >= 0 ? RW_READ | RW_SLEEPFAIL : RW_READ))
> +				goto restart;
> +			if (ISSET(pr->ps_flags,
> +			    PS_SYSTEM | PS_EMBRYO | PS_EXITING)) {
> +				rw_exit_read(&pr->ps_lock);
> 				continue;
> 			}
> 			matched = 1;
> @@ -1702,6 +1709,7 @@ sysctl_file(int *name, u_int namelen, ch
> 				FILLIT(fp, fdp, i, NULL, pr);
> 				FRELE(fp, p);
> 			}
> +			rw_exit_read(&pr->ps_lock);
> 			/* pid is unique, stop searching */
> 			if (arg >= 0)
> 				break;
> Index: usr.bin/fstat/fstat.c
> ===================================================================
> RCS file: /data/mirror/openbsd/cvs/src/usr.bin/fstat/fstat.c,v
> diff -u -p -r1.103 fstat.c
> --- usr.bin/fstat/fstat.c	20 Jun 2022 01:39:44 -0000	1.103
> +++ usr.bin/fstat/fstat.c	7 Aug 2024 20:19:33 -0000
> @@ -142,12 +142,13 @@ hide(void *p)
> int
> main(int argc, char *argv[])
> {
> +	struct kinfo_proc *kp, *kplast;
> 	struct kinfo_file *kf, *kflast;
> 	int ch;
> 	char *memf, *nlistf, *optstr;
> 	char buf[_POSIX2_LINE_MAX];
> 	const char *errstr;
> -	int cnt, flags;
> +	int pcnt, fcnt, flags;
> 
> 	hideroot = getuid();
> 
> @@ -282,14 +283,27 @@ main(int argc, char *argv[])
> 		checkfile = 1;
> 	}
> 
> -	if (nfilter == 1) {
> -		if ((kf = kvm_getfiles(kd, filter[0].what, filter[0].arg,
> -		    sizeof(*kf), &cnt)) == NULL)
> -			errx(1, "%s", kvm_geterr(kd));
> +	if (nfilter == 1 && filter[0].what == KERN_FILE_BYPID) {
> +		kp = kvm_getprocs(kd, KERN_PROC_PID, filter[0].arg,
> +		    sizeof(*kp), &pcnt);
> 	} else {
> -		if ((kf = kvm_getfiles(kd, KERN_FILE_BYPID, -1, sizeof(*kf),
> -		    &cnt)) == NULL)
> -			errx(1, "%s", kvm_geterr(kd));
> +		kp = kvm_getprocs(kd, KERN_PROC_ALL, -1, sizeof(*kp), &pcnt);
> +	}
> +	if (kp == NULL)
> +		errx(1, "%s", kvm_geterr(kd));
> +	kf = NULL;
> +	fcnt = 0;
> +	for (kplast = &kp[pcnt]; kp < kplast; ++kp) {
> +		struct kinfo_file *kfpid;
> +		int cnt;
> +
> +		if ((kfpid = kvm_getfiles(kd, KERN_FILE_BYPID, kp->p_pid,
> +		    sizeof(*kf), &cnt)) == NULL)
> +			continue;
> +		if ((kf = reallocarray(kf, fcnt + cnt, sizeof(*kf))) == NULL)
> +			err(1, NULL);
> +		memcpy(&kf[fcnt], kfpid, sizeof(*kf) * cnt);
> +		fcnt += cnt;
> 	}
> 
> 	if (fuser) {
> @@ -317,10 +331,10 @@ main(int argc, char *argv[])
> 			err(1, "pledge");
> 	}
> 
> -	find_splices(kf, cnt);
> +	find_splices(kf, fcnt);
> 	if (!fuser)
> 		fstat_header();
> -	for (kflast = &kf[cnt]; kf < kflast; ++kf) {
> +	for (kflast = &kf[fcnt]; kf < kflast; ++kf) {
> 		if (fuser)
> 			fuser_check(kf);
> 		else
>