
From: Claudio Jeker <cjeker@diehard.n-r-g.com>
Subject: Re: Another attempt to get rid of the reaper
To: Christian Ludwig <cludwig@mailbox.org>
Cc: tech@openbsd.org
Date: Tue, 16 Sep 2025 11:34:24 +0200

On Sun, Sep 14, 2025 at 10:36:51PM +0200, Christian Ludwig wrote:
> Hi,
> 
> this is another attempt to get rid of the dedicated reaper thread.

Why is this a goal? What problem are you trying to solve with this?

In my opinion this diff makes the current exit situation worse. Instead of
having a clear reaper process that does the cleanup of the proc and
process, we now end up delegating this work to init(8) or the parent
process. Neither is really ideal for this work.

You cannot assume the parent will be sitting in wait(2) / dowait6() when
a child exits.  In fact, you cannot assume that the parent will ever call
wait(2), so this change would allow many fat zombies to accumulate for no
good reason.
wait(2) only needs a few bits of information (rusage and tusage, plus the
signal and exit status) to work, so things like the uarea really should be
freed early on and not linger around until the zombie is collected.
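
To put a number on "a few bits": everything the zombie has to keep
around for wait(2) is roughly this much state (a sketch only; the
struct and field names are illustrative, not a proposal):

	/*
	 * Sketch: the few bits wait(2) actually consumes.  Everything
	 * else (uarea, vmspace, file descriptors, ...) can be torn
	 * down at exit time instead of lingering on the zombie.
	 */
	struct zombie_info {
		struct rusage	zi_ru;		/* accumulated rusage */
		struct tusage	zi_tu;		/* accumulated tusage */
		int		zi_xexit;	/* exit status */
		int		zi_xsig;	/* terminating signal */
	};

That is a few hundred bytes at most, nothing like a full uarea.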

The reaper right now is no longer a bottleneck; it uses very little CPU
time.  I agree that moving more work from the reaper into exit1() is a
good thing, like the wakeup signaling to the parent process. But I think
this diff goes a few steps too far and introduces complex problems for
very little benefit.

For me the reaper thread by itself is not an issue; it helps to finish up
the tricky bits of cleanup on exit quickly.

> The main part of the diff is moving all the code around parent wakeup
> into the exit path. Now the parent wakes up earlier, when the child is
> still running. The parent cannot free the child's resources yet. That's
> why the parent now waits before zapping the child. The scheduler wakes
> it up after the exiting child is off the CPU. With this, I see no reason
> why we can't sleep for most of exit1().
> 
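If I read the diff right, the off-CPU handshake it introduces condenses
to this (pieced together from the hunks below):

	/* exiting thread, in sched_exit(): publish its cond on this CPU */
	spc->spc_deadcond = p->p_deadcond;
	/* ... cpu_switchto() to whatever runs next ... */

	/* next thread on that CPU, in mi_switch()/proc_trampoline_mi(): */
	if (spc->spc_deadcond) {
		cond_signal(spc->spc_deadcond);	/* old proc is off the CPU */
		spc->spc_deadcond = NULL;
	}

	/* reaping side (parent or init), before touching uarea/stack: */
	cond_wait(p->p_deadcond, "pdead");

So every reap now blocks on a per-thread condition variable that is
signaled from the scheduler's switch path.
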
> Most of the remaining cleanup that is performed in the reaper directly
> is pushed into the parent. init(8) reaps non-zombies from the wait path.
> proc_free() runs unlocked in the reaper already. I unlocked it in the
> wait path, too, along with most of the freeing in process_zap(), which
> is MP-safe IMHO.
> 
> This diff does not force-switch to idle after exit anymore. That needs a
> small fix in amd64's cpu_switch(), since now we can switch to a non-
> system thread.
> 
> I am unsure how to deal with the new proc.p_deadcond member. I kept it
> an opaque pointer. That saves us from exposing struct cond to userspace.
> I'm also not sure that I got the accounting completely right. And there
> is room for cleanups in exit1.
> 
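On p_deadcond: the opaque pointer works because proc.h only needs the
forward declaration; condensed from the diff:

	/* sys/sys/proc.h: userland never sees the definition */
	struct cond;
	...
	struct cond	*p_deadcond;	/* Sync wrt. reaping us. */

	/* kern_fork.c, thread_new(): */
	p->p_deadcond = malloc(sizeof(*p->p_deadcond), M_SUBPROC, M_WAITOK);
	cond_init(p->p_deadcond);

	/* kern_exit.c, proc_free(): */
	free(p->p_deadcond, M_SUBPROC, sizeof(*p->p_deadcond));

The cost is an extra malloc/free per thread just to keep struct cond out
of the userland-visible part of struct proc.
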
> I have tested this on amd64 and i386. I'd appreciate test reports for a
> variety of workloads and architectures.
> 
> 
>  - Christian
> 
> ---
>  sys/arch/amd64/amd64/locore.S |   5 +-
>  sys/kern/init_main.c          |   5 -
>  sys/kern/kern_exit.c          | 208 +++++++++++++++++++++---------------------
>  sys/kern/kern_fork.c          |  47 +++++++---
>  sys/kern/kern_sched.c         |  50 +++++-----
>  sys/kern/sched_bsd.c          |   6 ++
>  sys/sys/proc.h                |   5 +-
>  sys/sys/sched.h               |   2 +-
>  sys/uvm/uvm_glue.c            |   2 +-
>  9 files changed, 178 insertions(+), 152 deletions(-)
> 
> diff --git a/sys/arch/amd64/amd64/locore.S b/sys/arch/amd64/amd64/locore.S
> index 2c19fbf0a309..a35b1f998bd6 100644
> --- a/sys/arch/amd64/amd64/locore.S
> +++ b/sys/arch/amd64/amd64/locore.S
> @@ -400,13 +400,14 @@ restore_saved:
>  	cmpq	%rcx,CPUVAR(PROC_PMAP)
>  	jnz	.Lbogus_proc_pmap
>  #endif
> -	/* record which pmap this CPU should get IPIs for */
> -	movq	%rbx,CPUVAR(PROC_PMAP)
>  
>  .Lset_cr3:
>  	movq	%rax,%cr3			/* %rax used below too */
>  
>  .Lsame_cr3:
> +	/* record which pmap this CPU should get IPIs for */
> +	movq	%rbx,CPUVAR(PROC_PMAP)
> +
>  	/*
>  	 * If we switched from a userland thread with a shallow call stack
>  	 * (e.g interrupt->ast->mi_ast->prempt->mi_switch->cpu_switchto)
> diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c
> index 9719b2666c5b..242af43d0c27 100644
> --- a/sys/kern/init_main.c
> +++ b/sys/kern/init_main.c
> @@ -117,7 +117,6 @@ struct	plimit limit0;
>  struct	vmspace vmspace0;
>  struct	sigacts sigacts0;
>  struct	process *initprocess;
> -struct	proc *reaperproc;
>  
>  extern	struct user *proc0paddr;
>  
> @@ -501,10 +500,6 @@ main(void *framep)
>  	if (kthread_create(uvm_pageout, NULL, NULL, "pagedaemon"))
>  		panic("fork pagedaemon");
>  
> -	/* Create the reaper daemon kernel thread. */
> -	if (kthread_create(reaper, NULL, &reaperproc, "reaper"))
> -		panic("fork reaper");
> -
>  	/* Create the cleaner daemon kernel thread. */
>  	if (kthread_create(buf_daemon, NULL, &cleanerproc, "cleaner"))
>  		panic("fork cleaner");
> diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c
> index df04c4270eac..3a49c9b53509 100644
> --- a/sys/kern/kern_exit.c
> +++ b/sys/kern/kern_exit.c
> @@ -69,8 +69,11 @@
>  #include <sys/kcov.h>
>  #endif
>  
> +void	exit2(struct proc *);
>  void	proc_finish_wait(struct proc *, struct process *);
>  void	process_clear_orphan(struct process *);
> +void	proc_reap(struct proc *);
> +void	process_remove(struct process *);
>  void	process_zap(struct process *);
>  void	proc_free(struct proc *);
>  void	unveil_destroy(struct process *ps);
> @@ -118,6 +121,7 @@ exit1(struct proc *p, int xexit, int xsig, int flags)
>  {
>  	struct process *pr, *qr, *nqr;
>  	struct rusage *rup;
> +	struct vnode *otvp;
>  
>  	atomic_setbits_int(&p->p_flag, P_WEXIT);
>  
> @@ -259,6 +263,14 @@ exit1(struct proc *p, int xexit, int xsig, int flags)
>  			uvm_purge();
>  			KERNEL_LOCK();
>  		}
> +
> +		/*
> +		 * Release reference to text vnode
> +		 */
> +		otvp = pr->ps_textvp;
> +		pr->ps_textvp = NULL;
> +		if (otvp)
> +			vrele(otvp);
>  	}
>  
>  	p->p_fd = NULL;		/* zap the thread's copy */
> @@ -276,15 +288,10 @@ exit1(struct proc *p, int xexit, int xsig, int flags)
>  	 * Remove proc from pidhash chain and allproc so looking
>  	 * it up won't work.  We will put the proc on the
>  	 * deadproc list later (using the p_runq member), and
> -	 * wake up the reaper when we do.  If this is the last
> +	 * wake up the reaping process when we do.  If this is the last
>  	 * thread of a process that isn't PS_NOZOMBIE, we'll put
>  	 * the process on the zombprocess list below.
>  	 */
> -	/*
> -	 * NOTE: WE ARE NO LONGER ALLOWED TO SLEEP!
> -	 */
> -	p->p_stat = SDEAD;
> -
>  	LIST_REMOVE(p, p_hash);
>  	LIST_REMOVE(p, p_list);
>  
> @@ -359,6 +366,8 @@ exit1(struct proc *p, int xexit, int xsig, int flags)
>  	p->p_pctcpu = 0;
>  
>  	if ((p->p_flag & P_THREAD) == 0) {
> +		struct process *pptr = pr->ps_pptr;
> +
>  		/*
>  		 * Final thread has died, so add on our children's rusage
>  		 * and calculate the total times.
> @@ -369,6 +378,9 @@ exit1(struct proc *p, int xexit, int xsig, int flags)
>  		rup->ru_isrss = pr->ps_tu.tu_isrss;
>  		ruadd(rup, &pr->ps_cru);
>  
> +		/* Notify listeners of our demise and clean up. */
> +		knote_processexit(pr);
> +
>  		/*
>  		 * Notify parent that we're gone.  If we're not going to
>  		 * become a zombie, reparent to process 1 (init) so that
> @@ -376,13 +388,16 @@ exit1(struct proc *p, int xexit, int xsig, int flags)
>  		 * wait4() to return ECHILD.
>  		 */
>  		mtx_enter(&pr->ps_mtx);
> -		if (pr->ps_flags & PS_NOZOMBIE) {
> -			struct process *ppr = pr->ps_pptr;
> +		if (pr->ps_flags & PS_NOZOMBIE)
>  			process_reparent(pr, initprocess);
> -			atomic_setbits_int(&ppr->ps_flags, PS_WAITEVENT);
> -			wakeup(ppr);
> +		else {
> +			/* Process is now a true zombie. */
> +			atomic_setbits_int(&pr->ps_flags, PS_ZOMBIE);
> +			prsignal(pptr, SIGCHLD);
>  		}
>  		mtx_leave(&pr->ps_mtx);
> +		atomic_setbits_int(&pptr->ps_flags, PS_WAITEVENT);
> +		wakeup(pptr);
>  	}
>  
>  	/* just a thread? check if last one standing. */
> @@ -396,9 +411,18 @@ exit1(struct proc *p, int xexit, int xsig, int flags)
>  	}
>  
>  	/*
> -	 * Other substructures are freed from reaper and wait().
> +	 * Other substructures are freed from wait().
>  	 */
>  
> +	/*
> +	 * NOTE: WE ARE NO LONGER ALLOWED TO SLEEP!
> +	 */
> +	p->p_stat = SDEAD;
> +
> +	/* Do not feed zombies to init(8). */
> +	if ((p->p_flag & P_THREAD) || (pr->ps_flags & PS_NOZOMBIE))
> +		exit2(p);
> +
>  	/*
>  	 * Finally, call machine-dependent code.
>  	 */
> @@ -408,10 +432,6 @@ exit1(struct proc *p, int xexit, int xsig, int flags)
>  	 * Deactivate the exiting address space before the vmspace
>  	 * is freed.  Note that we will continue to run on this
>  	 * vmspace's context until the switch to idle in sched_exit().
> -	 *
> -	 * Once we are no longer using the dead process's vmspace and
> -	 * stack, exit2() will be called to schedule those resources
> -	 * to be released by the reaper thread.
>  	 */
>  	pmap_deactivate(p);
>  	sched_exit(p);
> @@ -419,109 +439,67 @@ exit1(struct proc *p, int xexit, int xsig, int flags)
>  }
>  
>  /*
> - * Locking of this prochead is special; it's accessed in a
> - * critical section of process exit, and thus locking it can't
> - * modify interrupt state.  We use a simple spin lock for this
> - * prochead.  We use the p_runq member to linkup to deadproc.
> + * We re-use the p_runq member to linkup to deadproc.
> + * The dead proc cannot sleep anymore.
>   */
> -struct mutex deadproc_mutex =
> -    MUTEX_INITIALIZER_FLAGS(IPL_NONE, "deadproc", MTX_NOWITNESS);
> +struct mutex deadproc_mutex = MUTEX_INITIALIZER(IPL_SCHED);
>  struct prochead deadproc = TAILQ_HEAD_INITIALIZER(deadproc);
>  
>  /*
> - * We are called from sched_idle() once it is safe to schedule the
> - * dead process's resources to be freed. So this is not allowed to sleep.
> - *
>   * We lock the deadproc list, place the proc on that list (using
> - * the p_runq member), and wake up the reaper.
> + * the p_runq member), and wake up init as the reaping process.
>   */
>  void
>  exit2(struct proc *p)
>  {
> -	/* account the remainder of time spent in exit1() */
> -	mtx_enter(&p->p_p->ps_mtx);
> -	tuagg_add_process(p->p_p, p);
> -	mtx_leave(&p->p_p->ps_mtx);
> -
>  	mtx_enter(&deadproc_mutex);
>  	TAILQ_INSERT_TAIL(&deadproc, p, p_runq);
>  	mtx_leave(&deadproc_mutex);
>  
> -	wakeup(&deadproc);
> +	atomic_setbits_int(&initprocess->ps_flags, PS_WAITEVENT);
> +	wakeup(initprocess);
>  }
>  
>  void
>  proc_free(struct proc *p)
>  {
> +	WITNESS_THREAD_EXIT(p);
> +
> +	uvm_uarea_free(p);
> +	p->p_vmspace = NULL;		/* zap the thread's copy */
> +
> +	free(p->p_deadcond, M_SUBPROC, sizeof(*p->p_deadcond));
>  	crfree(p->p_ucred);
>  	pool_put(&proc_pool, p);
>  	atomic_dec_int(&nthreads);
>  }
>  
>  /*
> - * Process reaper.  This is run by a kernel thread to free the resources
> - * of a dead process.  Once the resources are free, the process becomes
> - * a zombie, and the parent is allowed to read the undead's status.
> + * Free proc's resources.
>   */
>  void
> -reaper(void *arg)
> +proc_reap(struct proc *p)
>  {
> -	struct proc *p;
> +	/* Wait for the thread to be scheduled off the CPU. */
> +	cond_wait(p->p_deadcond, "pdead");
>  
> -	KERNEL_UNLOCK();
> +	/*
> +	 * Free the VM resources we're still holding on to.
> +	 * We must do this from a valid thread because doing
> +	 * so may block.
> +	 */
> +	if (p->p_flag & P_THREAD) {
> +		/* Just a thread */
> +		proc_free(p);
> +	} else {
> +		struct process *pr = p->p_p;
>  
> -	SCHED_ASSERT_UNLOCKED();
> +		KERNEL_LOCK();
> +		/* No one will wait for us, just zap it. */
> +		process_remove(pr);
> +		KERNEL_UNLOCK();
>  
> -	for (;;) {
> -		mtx_enter(&deadproc_mutex);
> -		while ((p = TAILQ_FIRST(&deadproc)) == NULL)
> -			msleep_nsec(&deadproc, &deadproc_mutex, PVM, "reaper",
> -			    INFSLP);
> -
> -		/* Remove us from the deadproc list. */
> -		TAILQ_REMOVE(&deadproc, p, p_runq);
> -		mtx_leave(&deadproc_mutex);
> -
> -		WITNESS_THREAD_EXIT(p);
> -
> -		/*
> -		 * Free the VM resources we're still holding on to.
> -		 * We must do this from a valid thread because doing
> -		 * so may block.
> -		 */
> -		uvm_uarea_free(p);
> -		p->p_vmspace = NULL;		/* zap the thread's copy */
> -
> -		if (p->p_flag & P_THREAD) {
> -			/* Just a thread */
> -			proc_free(p);
> -		} else {
> -			struct process *pr = p->p_p;
> -
> -			/* Release the rest of the process's vmspace */
> -			uvm_exit(pr);
> -
> -			KERNEL_LOCK();
> -			if ((pr->ps_flags & PS_NOZOMBIE) == 0) {
> -				/* Process is now a true zombie. */
> -				atomic_setbits_int(&pr->ps_flags, PS_ZOMBIE);
> -			}
> -
> -			/* Notify listeners of our demise and clean up. */
> -			knote_processexit(pr);
> -
> -			if (pr->ps_flags & PS_ZOMBIE) {
> -				/* Post SIGCHLD and wake up parent. */
> -				prsignal(pr->ps_pptr, SIGCHLD);
> -				atomic_setbits_int(&pr->ps_pptr->ps_flags,
> -				    PS_WAITEVENT);
> -				wakeup(pr->ps_pptr);
> -			} else {
> -				/* No one will wait for us, just zap it. */
> -				process_zap(pr);
> -			}
> -			KERNEL_UNLOCK();
> -		}
> +		process_zap(pr);
>  	}
>  }
>  
> @@ -667,6 +645,26 @@ loop:
>  			break;
>  		}
>  	}
> +	/* init(8) accounts for cleaning up deadprocs. */
> +	if (q->p_p == initprocess) {
> +		struct proc *p;
> +		struct schedstate_percpu *spc;
> +
> +		KERNEL_UNLOCK();
> +		mtx_enter(&deadproc_mutex);
> +		while ((p = TAILQ_FIRST(&deadproc)) != NULL) {
> +			TAILQ_REMOVE(&deadproc, p, p_runq);
> +			mtx_leave(&deadproc_mutex);
> +			proc_reap(p);
> +			mtx_enter(&deadproc_mutex);
> +
> +			spc = &curcpu()->ci_schedstate;
> +			if (spc->spc_schedflags & SPCF_SHOULDYIELD)
> +				break;
> +		}
> +		mtx_leave(&deadproc_mutex);
> +		KERNEL_LOCK();
> +	}
>  	if (nfound == 0)
>  		return (ECHILD);
>  	if (options & WNOHANG) {
> @@ -787,12 +785,20 @@ proc_finish_wait(struct proc *waiter, struct process *pr)
>  		wakeup(tr);
>  	} else {
>  		mtx_leave(&pr->ps_mtx);
> +		/* Wait until the proc is off of its CPU. */
> +		cond_wait(pr->ps_mainproc->p_deadcond, "pdead");
> +
>  		scheduler_wait_hook(waiter, pr->ps_mainproc);
>  		rup = &waiter->p_p->ps_cru;
>  		ruadd(rup, pr->ps_ru);
>  		LIST_REMOVE(pr, ps_list);	/* off zombprocess */
>  		freepid(pr->ps_pid);
> +		process_remove(pr);
> +		KERNEL_UNLOCK();
> +
>  		process_zap(pr);
> +
> +		KERNEL_LOCK();
>  	}
>  }
>  
> @@ -857,32 +863,30 @@ process_reparent(struct process *child, struct process *parent)
>  }
>  
>  void
> -process_zap(struct process *pr)
> +process_remove(struct process *pr)
>  {
> -	struct vnode *otvp;
> -	struct proc *p = pr->ps_mainproc;
> -
>  	/*
>  	 * Finally finished with old proc entry.
> -	 * Unlink it from its process group and free it.
> +	 * Unlink it from its process group.
>  	 */
>  	leavepgrp(pr);
>  	LIST_REMOVE(pr, ps_sibling);
>  	process_clear_orphan(pr);
> +}
> +
> +void
> +process_zap(struct process *pr)
> +{
> +	struct proc *p = pr->ps_mainproc;
> +
> +	/* Release the rest of the process's vmspace */
> +	uvm_exit(pr);
>  
>  	/*
>  	 * Decrement the count of procs running with this uid.
>  	 */
>  	(void)chgproccnt(pr->ps_ucred->cr_ruid, -1);
>  
> -	/*
> -	 * Release reference to text vnode
> -	 */
> -	otvp = pr->ps_textvp;
> -	pr->ps_textvp = NULL;
> -	if (otvp)
> -		vrele(otvp);
> -
>  	KASSERT(pr->ps_threadcnt == 0);
>  	KASSERT(pr->ps_exitcnt == 1);
>  	if (pr->ps_ptstat != NULL)
> @@ -893,7 +897,7 @@ process_zap(struct process *pr)
>  	lim_free(pr->ps_limit);
>  	crfree(pr->ps_ucred);
>  	pool_put(&process_pool, pr);
> -	nprocesses--;
> +	atomic_dec_int(&nprocesses);
>  
>  	proc_free(p);
>  }
> diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c
> index 5b925518234d..e1fbdfa14b66 100644
> --- a/sys/kern/kern_fork.c
> +++ b/sys/kern/kern_fork.c
> @@ -78,7 +78,8 @@ void unveil_copy(struct process *parent, struct process *child);
>  
>  struct proc *thread_new(struct proc *_parent, vaddr_t _uaddr);
>  struct process *process_new(struct proc *, struct process *, int);
> -int fork_check_maxthread(uid_t _uid);
> +int fork_check_maxthread(uid_t);
> +int fork_check_maxprocess(uid_t);
>  
>  void
>  fork_return(void *arg)
> @@ -164,6 +165,8 @@ thread_new(struct proc *parent, vaddr_t uaddr)
>  	    (caddr_t)&p->p_endcopy - (caddr_t)&p->p_startcopy);
>  	crhold(p->p_ucred);
>  	p->p_addr = (struct user *)uaddr;
> +	p->p_deadcond = malloc(sizeof(*p->p_deadcond), M_SUBPROC, M_WAITOK);
> +	cond_init(p->p_deadcond);
>  
>  	/*
>  	 * Initialize the timeouts.
> @@ -333,6 +336,25 @@ fork_check_maxthread(uid_t uid)
>  	return 0;
>  }
>  
> +int
> +fork_check_maxprocess(uid_t uid)
> +{
> +	int maxprocess_local, val;
> +
> +	maxprocess_local = atomic_load_int(&maxprocess);
> +	val = atomic_inc_int_nv(&nprocesses);
> +	if ((val > maxprocess_local - 5 && uid != 0) ||
> +	    val > maxprocess_local) {
> +		static struct timeval lasttfm;
> +
> +		if (ratecheck(&lasttfm, &fork_tfmrate))
> +			tablefull("process");
> +		atomic_dec_int(&nprocesses);
> +		return EAGAIN;
> +	}
> +	return 0;
> +}
> +
>  static inline void
>  fork_thread_start(struct proc *p, struct proc *parent, int flags)
>  {
> @@ -355,7 +377,7 @@ fork1(struct proc *curp, int flags, void (*func)(void *), void *arg,
>  	struct proc *p;
>  	uid_t uid = curp->p_ucred->cr_ruid;
>  	struct vmspace *vm;
> -	int count, maxprocess_local;
> +	int count;
>  	vaddr_t uaddr;
>  	int error;
>  	struct  ptrace_state *newptstat = NULL;
> @@ -368,17 +390,10 @@ fork1(struct proc *curp, int flags, void (*func)(void *), void *arg,
>  	if ((error = fork_check_maxthread(uid)))
>  		return error;
>  
> -	maxprocess_local = atomic_load_int(&maxprocess);
> -	if ((nprocesses >= maxprocess_local - 5 && uid != 0) ||
> -	    nprocesses >= maxprocess_local) {
> -		static struct timeval lasttfm;
> -
> -		if (ratecheck(&lasttfm, &fork_tfmrate))
> -			tablefull("process");
> +	if ((error = fork_check_maxprocess(uid))) {
>  		atomic_dec_int(&nthreads);
> -		return EAGAIN;
> +		return error;
>  	}
> -	nprocesses++;
>  
>  	/*
>  	 * Increment the count of processes running with this uid.
> @@ -387,7 +402,7 @@ fork1(struct proc *curp, int flags, void (*func)(void *), void *arg,
>  	count = chgproccnt(uid, 1);
>  	if (uid != 0 && count > lim_cur(RLIMIT_NPROC)) {
>  		(void)chgproccnt(uid, -1);
> -		nprocesses--;
> +		atomic_dec_int(&nprocesses);
>  		atomic_dec_int(&nthreads);
>  		return EAGAIN;
>  	}
> @@ -395,7 +410,7 @@ fork1(struct proc *curp, int flags, void (*func)(void *), void *arg,
>  	uaddr = uvm_uarea_alloc();
>  	if (uaddr == 0) {
>  		(void)chgproccnt(uid, -1);
> -		nprocesses--;
> +		atomic_dec_int(&nprocesses);
>  		atomic_dec_int(&nthreads);
>  		return (ENOMEM);
>  	}
> @@ -714,6 +729,12 @@ proc_trampoline_mi(void)
>  	assertwaitok();
>  	smr_idle();
>  
> +	/* Signal that the previous proc is off the CPU now. */
> +	if (spc->spc_deadcond) {
> +		cond_signal(spc->spc_deadcond);
> +		spc->spc_deadcond = NULL;
> +	}
> +
>  	/* Start any optional clock interrupts needed by the thread. */
>  	if (ISSET(p->p_p->ps_flags, PS_ITIMER)) {
>  		atomic_setbits_int(&spc->spc_schedflags, SPCF_ITIMER);
> diff --git a/sys/kern/kern_sched.c b/sys/kern/kern_sched.c
> index 74183e6bb681..6a80146a36c2 100644
> --- a/sys/kern/kern_sched.c
> +++ b/sys/kern/kern_sched.c
> @@ -34,6 +34,7 @@ void sched_kthreads_create(void *);
>  
>  int sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p);
>  struct proc *sched_steal_proc(struct cpu_info *);
> +void sched_to(struct proc *);
>  
>  /*
>   * To help choosing which cpu should run which process we keep track
> @@ -107,7 +108,6 @@ sched_init_cpu(struct cpu_info *ci)
>  
>  	kthread_create_deferred(sched_kthreads_create, ci);
>  
> -	TAILQ_INIT(&spc->spc_deadproc);
>  	SIMPLEQ_INIT(&spc->spc_deferred);
>  
>  	/*
> @@ -170,16 +170,9 @@ sched_idle(void *v)
>  
>  	while (1) {
>  		while (spc->spc_whichqs != 0) {
> -			struct proc *dead;
> -
>  			SCHED_LOCK();
>  			p->p_stat = SSLEEP;
>  			mi_switch();
> -
> -			while ((dead = TAILQ_FIRST(&spc->spc_deadproc))) {
> -				TAILQ_REMOVE(&spc->spc_deadproc, dead, p_runq);
> -				exit2(dead);
> -			}
>  		}
>  
>  		splassert(IPL_NONE);
> @@ -209,31 +202,28 @@ sched_idle(void *v)
>  
>  /*
>   * To free our address space we have to jump through a few hoops.
> - * The freeing is done by the reaper, but until we have one reaper
> - * per cpu, we have no way of putting this proc on the deadproc list
> - * and waking up the reaper without risking having our address space and
> - * stack torn from under us before we manage to switch to another proc.
> - * Therefore we have a per-cpu list of dead processes where we put this
> - * proc and have idle clean up that list and move it to the reaper list.
> + * The freeing is done by the reaper.  We make sure that this proc
> + * gets freed only after switching to another proc with the spc_deadcond
> + * signal. The reaping process waits for it.
>   */
>  void
>  sched_exit(struct proc *p)
>  {
>  	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
>  
> -	TAILQ_INSERT_TAIL(&spc->spc_deadproc, p, p_runq);
> +	KASSERT(spc->spc_deadcond == NULL);
> +	spc->spc_deadcond = p->p_deadcond;
>  
>  	tuagg_add_runtime();
>  
> -	KERNEL_ASSERT_LOCKED();
> -	sched_toidle();
> +	SCHED_LOCK();
> +	sched_to(sched_chooseproc());
>  }
>  
>  void
> -sched_toidle(void)
> +sched_to(struct proc *nextproc)
>  {
>  	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
> -	struct proc *idle;
>  
>  #ifdef MULTIPROCESSOR
>  	/* This process no longer needs to hold the kernel lock. */
> @@ -252,18 +242,28 @@ sched_toidle(void)
>  
>  	atomic_clearbits_int(&spc->spc_schedflags, SPCF_SWITCHCLEAR);
>  
> -	SCHED_LOCK();
> -	idle = spc->spc_idleproc;
> -	idle->p_stat = SRUN;
> +	SCHED_ASSERT_LOCKED();
>  
>  	uvmexp.swtch++;
>  	if (curproc != NULL)
> -		TRACEPOINT(sched, off__cpu, idle->p_tid + THREAD_PID_OFFSET,
> -		    idle->p_p->ps_pid);
> -	cpu_switchto(NULL, idle);
> +		TRACEPOINT(sched, off__cpu, nextproc->p_tid + THREAD_PID_OFFSET,
> +		    nextproc->p_p->ps_pid);
> +	cpu_switchto(NULL, nextproc);
>  	panic("cpu_switchto returned");
>  }
>  
> +void
> +sched_toidle(void)
> +{
> +	struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
> +	struct proc *idle;
> +
> +	SCHED_LOCK();
> +	idle = spc->spc_idleproc;
> +	idle->p_stat = SRUN;
> +	sched_to(idle);
> +}
> +
>  void
>  setrunqueue(struct cpu_info *ci, struct proc *p, uint8_t prio)
>  {
> diff --git a/sys/kern/sched_bsd.c b/sys/kern/sched_bsd.c
> index 998e8e8b7897..e00b0b0dd420 100644
> --- a/sys/kern/sched_bsd.c
> +++ b/sys/kern/sched_bsd.c
> @@ -423,6 +423,12 @@ mi_switch(void)
>  	KASSERT(p->p_cpu == curcpu());
>  	spc = &p->p_cpu->ci_schedstate;
>  
> +	/* Signal that the previous proc is off the CPU now. */
> +	if (spc->spc_deadcond) {
> +		cond_signal(spc->spc_deadcond);
> +		spc->spc_deadcond = NULL;
> +	}
> +
>  	/* Start any optional clock interrupts needed by the thread. */
>  	if (ISSET(p->p_p->ps_flags, PS_ITIMER)) {
>  		atomic_setbits_int(&spc->spc_schedflags, SPCF_ITIMER);
> diff --git a/sys/sys/proc.h b/sys/sys/proc.h
> index c7fe5a03cf44..45dd7cf33d3e 100644
> --- a/sys/sys/proc.h
> +++ b/sys/sys/proc.h
> @@ -339,6 +339,7 @@ struct process {
>  #define PS_FLAGS_INHERITED_ON_FORK \
>      (PS_SUGID | PS_SUGIDEXEC | PS_PLEDGE | PS_EXECPLEDGE | PS_CHROOT)
>  
> +struct cond;
>  struct kcov_dev;
>  struct lock_list_entry;
>  struct kqueue;
> @@ -364,6 +365,7 @@ struct proc {
>  
>  	struct	process *p_p;		/* [I] The process of this thread. */
>  	TAILQ_ENTRY(proc) p_thr_link;	/* [K|m] Threads in a process linkage. */
> +	struct cond	*p_deadcond;	/* Sync wrt. reaping us. */
>  
>  	/* substructures: */
>  	struct	filedesc *p_fd;		/* copy of p_p->ps_fd */
> @@ -554,7 +556,6 @@ extern struct processlist zombprocess;	/* List of zombie processes. */
>  extern struct proclist allproc;		/* List of all threads. */
>  
>  extern struct process *initprocess;	/* Process slot for init. */
> -extern struct proc *reaperproc;		/* Thread slot for reaper. */
>  extern struct proc *syncerproc;		/* filesystem syncer daemon */
>  
>  extern struct pool process_pool;	/* memory pool for processes */
> @@ -588,9 +589,7 @@ void	setrunnable(struct proc *);
>  void	endtsleep(void *);
>  int	wakeup_proc(struct proc *);
>  void	unsleep(struct proc *);
> -void	reaper(void *);
>  __dead void exit1(struct proc *, int, int, int);
> -void	exit2(struct proc *);
>  void	cpu_fork(struct proc *_curp, struct proc *_child, void *_stack,
>  	    void *_tcb, void (*_func)(void *), void *_arg);
>  void	cpu_exit(struct proc *);
> diff --git a/sys/sys/sched.h b/sys/sys/sched.h
> index 64c7044204c3..9fb9c1e9e2fa 100644
> --- a/sys/sys/sched.h
> +++ b/sys/sys/sched.h
> @@ -110,7 +110,7 @@ struct smr_entry;
>  struct schedstate_percpu {
>  	struct proc *spc_idleproc;	/* idle proc for this cpu */
>  	TAILQ_HEAD(prochead, proc) spc_qs[SCHED_NQS];
> -	TAILQ_HEAD(,proc) spc_deadproc;
> +	struct cond	*spc_deadcond;	/* [o] Proc is off the CPU condition */
>  	struct timespec spc_runtime;	/* time curproc started running */
>  	volatile int spc_schedflags;	/* flags; see below */
>  	u_int spc_schedticks;		/* ticks for schedclock() */
> diff --git a/sys/uvm/uvm_glue.c b/sys/uvm/uvm_glue.c
> index 748937af5c9d..f0cb8e7827b9 100644
> --- a/sys/uvm/uvm_glue.c
> +++ b/sys/uvm/uvm_glue.c
> @@ -295,7 +295,7 @@ uvm_uarea_alloc(void)
>   * uvm_uarea_free: free a dead thread's stack
>   *
>   * - the thread passed to us is a dead thread; we
> - *   are running on a different context now (the reaper).
> + *   are running on a different context now.
>   */
>  void
>  uvm_uarea_free(struct proc *p)
> 

-- 
:wq Claudio