Another attempt to get rid of the reaper
On Sun, Sep 14, 2025 at 10:36:51PM +0200, Christian Ludwig wrote:
> Hi,
>
> this is another attempt to get rid of the dedicated reaper thread.
>
> The main part of the diff moves all the code around parent wakeup into
> the exit path. The parent now wakes up earlier, while the child is
> still running. The parent cannot free the child's resources yet, which
> is why it now waits before zapping the child; the scheduler wakes it
> up once the exiting child is off the CPU. With this, I see no reason
> why we can't sleep for most of exit1().
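>
> To illustrate, a minimal sketch of the handshake, condensed from the
> diff below (locking and error paths omitted):
>
>	/* exiting child, in sched_exit(): publish, then leave the CPU */
>	spc->spc_deadcond = p->p_deadcond;
>	cpu_switchto(NULL, nextproc);
>
>	/* next thread on that CPU, in mi_switch()/proc_trampoline_mi() */
>	if (spc->spc_deadcond) {
>		cond_signal(spc->spc_deadcond);	/* child is off the CPU */
>		spc->spc_deadcond = NULL;
>	}
>
>	/* parent (or init), in the wait path, before zapping */
>	cond_wait(p->p_deadcond, "pdead");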
>
> Most of the cleanup that the reaper used to perform directly is pushed
> into the parent. init(8) reaps non-zombies from the wait path.
> proc_free() already runs unlocked in the reaper, so I unlocked it in
> the wait path too, along with most of the freeing in process_zap(),
> which is MP safe IMHO.
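>
> In init(8)'s case the wait path now drains the deadproc list, roughly
> (a condensed view of the dowait hunk below, yield check omitted):
>
>	mtx_enter(&deadproc_mutex);
>	while ((p = TAILQ_FIRST(&deadproc)) != NULL) {
>		TAILQ_REMOVE(&deadproc, p, p_runq);
>		mtx_leave(&deadproc_mutex);
>		proc_reap(p);		/* may sleep in cond_wait() */
>		mtx_enter(&deadproc_mutex);
>	}
>	mtx_leave(&deadproc_mutex);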
>
> This diff no longer force-switches to idle after exit. That needs a
> small fix in amd64's cpu_switch(), since we can now switch to a
> non-system thread.
>
> I am unsure how to deal with the new proc.p_deadcond member. I kept it
> an opaque pointer, which saves us from exposing struct cond to
> userspace. I'm also not sure that I got the accounting completely
> right, and there is room for cleanups in exit1().
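>
> Keeping it opaque means sys/proc.h only carries a forward declaration,
> i.e. something like:
>
>	struct cond;				/* layout stays kernel-private */
>
>	struct proc {
>		...
>		struct cond *p_deadcond;	/* Sync wrt. reaping us. */
>		...
>	};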
>
> I have tested this on amd64 and i386. I'd appreciate test reports for a
> variety of workloads and architectures.
>
Please say specifically what you want tested. "a variety of workloads
and architectures" doesn't tell us:
1. what you want tested
2. what we should be looking for
3. what feedback you want
For example:
"I'd appreciate someone running a bulk build on armv7 and telling me if this
results in hangs. If it doesn't, did performance improve or degrade, and by
how much".
Otherwise, people are just going to apply the diff, boot, and say "yeah
it works", but I don't think that's what you want. Or is that what you
want? It's not clear from the email.
-ml
>
> - Christian
>
> ---
> sys/arch/amd64/amd64/locore.S | 5 +-
> sys/kern/init_main.c | 5 -
> sys/kern/kern_exit.c | 208 +++++++++++++++++++++---------------------
> sys/kern/kern_fork.c | 47 +++++++---
> sys/kern/kern_sched.c | 50 +++++-----
> sys/kern/sched_bsd.c | 6 ++
> sys/sys/proc.h | 5 +-
> sys/sys/sched.h | 2 +-
> sys/uvm/uvm_glue.c | 2 +-
> 9 files changed, 178 insertions(+), 152 deletions(-)
>
> diff --git a/sys/arch/amd64/amd64/locore.S b/sys/arch/amd64/amd64/locore.S
> index 2c19fbf0a309..a35b1f998bd6 100644
> --- a/sys/arch/amd64/amd64/locore.S
> +++ b/sys/arch/amd64/amd64/locore.S
> @@ -400,13 +400,14 @@ restore_saved:
> cmpq %rcx,CPUVAR(PROC_PMAP)
> jnz .Lbogus_proc_pmap
> #endif
> - /* record which pmap this CPU should get IPIs for */
> - movq %rbx,CPUVAR(PROC_PMAP)
>
> .Lset_cr3:
> movq %rax,%cr3 /* %rax used below too */
>
> .Lsame_cr3:
> + /* record which pmap this CPU should get IPIs for */
> + movq %rbx,CPUVAR(PROC_PMAP)
> +
> /*
> * If we switched from a userland thread with a shallow call stack
> * (e.g interrupt->ast->mi_ast->prempt->mi_switch->cpu_switchto)
> diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c
> index 9719b2666c5b..242af43d0c27 100644
> --- a/sys/kern/init_main.c
> +++ b/sys/kern/init_main.c
> @@ -117,7 +117,6 @@ struct plimit limit0;
> struct vmspace vmspace0;
> struct sigacts sigacts0;
> struct process *initprocess;
> -struct proc *reaperproc;
>
> extern struct user *proc0paddr;
>
> @@ -501,10 +500,6 @@ main(void *framep)
> if (kthread_create(uvm_pageout, NULL, NULL, "pagedaemon"))
> panic("fork pagedaemon");
>
> - /* Create the reaper daemon kernel thread. */
> - if (kthread_create(reaper, NULL, &reaperproc, "reaper"))
> - panic("fork reaper");
> -
> /* Create the cleaner daemon kernel thread. */
> if (kthread_create(buf_daemon, NULL, &cleanerproc, "cleaner"))
> panic("fork cleaner");
> diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c
> index df04c4270eac..3a49c9b53509 100644
> --- a/sys/kern/kern_exit.c
> +++ b/sys/kern/kern_exit.c
> @@ -69,8 +69,11 @@
> #include <sys/kcov.h>
> #endif
>
> +void exit2(struct proc *);
> void proc_finish_wait(struct proc *, struct process *);
> void process_clear_orphan(struct process *);
> +void proc_reap(struct proc *);
> +void process_remove(struct process *);
> void process_zap(struct process *);
> void proc_free(struct proc *);
> void unveil_destroy(struct process *ps);
> @@ -118,6 +121,7 @@ exit1(struct proc *p, int xexit, int xsig, int flags)
> {
> struct process *pr, *qr, *nqr;
> struct rusage *rup;
> + struct vnode *otvp;
>
> atomic_setbits_int(&p->p_flag, P_WEXIT);
>
> @@ -259,6 +263,14 @@ exit1(struct proc *p, int xexit, int xsig, int flags)
> uvm_purge();
> KERNEL_LOCK();
> }
> +
> + /*
> + * Release reference to text vnode
> + */
> + otvp = pr->ps_textvp;
> + pr->ps_textvp = NULL;
> + if (otvp)
> + vrele(otvp);
> }
>
> p->p_fd = NULL; /* zap the thread's copy */
> @@ -276,15 +288,10 @@ exit1(struct proc *p, int xexit, int xsig, int flags)
> * Remove proc from pidhash chain and allproc so looking
> * it up won't work. We will put the proc on the
> * deadproc list later (using the p_runq member), and
> - * wake up the reaper when we do. If this is the last
> + * wake up the reaping process when we do. If this is the last
> * thread of a process that isn't PS_NOZOMBIE, we'll put
> * the process on the zombprocess list below.
> */
> - /*
> - * NOTE: WE ARE NO LONGER ALLOWED TO SLEEP!
> - */
> - p->p_stat = SDEAD;
> -
> LIST_REMOVE(p, p_hash);
> LIST_REMOVE(p, p_list);
>
> @@ -359,6 +366,8 @@ exit1(struct proc *p, int xexit, int xsig, int flags)
> p->p_pctcpu = 0;
>
> if ((p->p_flag & P_THREAD) == 0) {
> + struct process *pptr = pr->ps_pptr;
> +
> /*
> * Final thread has died, so add on our children's rusage
> * and calculate the total times.
> @@ -369,6 +378,9 @@ exit1(struct proc *p, int xexit, int xsig, int flags)
> rup->ru_isrss = pr->ps_tu.tu_isrss;
> ruadd(rup, &pr->ps_cru);
>
> + /* Notify listeners of our demise and clean up. */
> + knote_processexit(pr);
> +
> /*
> * Notify parent that we're gone. If we're not going to
> * become a zombie, reparent to process 1 (init) so that
> @@ -376,13 +388,16 @@ exit1(struct proc *p, int xexit, int xsig, int flags)
> * wait4() to return ECHILD.
> */
> mtx_enter(&pr->ps_mtx);
> - if (pr->ps_flags & PS_NOZOMBIE) {
> - struct process *ppr = pr->ps_pptr;
> + if (pr->ps_flags & PS_NOZOMBIE)
> process_reparent(pr, initprocess);
> - atomic_setbits_int(&ppr->ps_flags, PS_WAITEVENT);
> - wakeup(ppr);
> + else {
> + /* Process is now a true zombie. */
> + atomic_setbits_int(&pr->ps_flags, PS_ZOMBIE);
> + prsignal(pptr, SIGCHLD);
> }
> mtx_leave(&pr->ps_mtx);
> + atomic_setbits_int(&pptr->ps_flags, PS_WAITEVENT);
> + wakeup(pptr);
> }
>
> /* just a thread? check if last one standing. */
> @@ -396,9 +411,18 @@ exit1(struct proc *p, int xexit, int xsig, int flags)
> }
>
> /*
> - * Other substructures are freed from reaper and wait().
> + * Other substructures are freed from wait().
> */
>
> + /*
> + * NOTE: WE ARE NO LONGER ALLOWED TO SLEEP!
> + */
> + p->p_stat = SDEAD;
> +
> + /* Do not feed zombies to init(8). */
> + if ((p->p_flag & P_THREAD) || (pr->ps_flags & PS_NOZOMBIE))
> + exit2(p);
> +
> /*
> * Finally, call machine-dependent code.
> */
> @@ -408,10 +432,6 @@ exit1(struct proc *p, int xexit, int xsig, int flags)
> * Deactivate the exiting address space before the vmspace
> * is freed. Note that we will continue to run on this
> * vmspace's context until the switch to idle in sched_exit().
> - *
> - * Once we are no longer using the dead process's vmspace and
> - * stack, exit2() will be called to schedule those resources
> - * to be released by the reaper thread.
> */
> pmap_deactivate(p);
> sched_exit(p);
> @@ -419,109 +439,67 @@ exit1(struct proc *p, int xexit, int xsig, int flags)
> }
>
> /*
> - * Locking of this prochead is special; it's accessed in a
> - * critical section of process exit, and thus locking it can't
> - * modify interrupt state. We use a simple spin lock for this
> - * prochead. We use the p_runq member to linkup to deadproc.
> + * We re-use the p_runq member to linkup to deadproc.
> + * The dead proc cannot sleep anymore.
> */
> -struct mutex deadproc_mutex =
> - MUTEX_INITIALIZER_FLAGS(IPL_NONE, "deadproc", MTX_NOWITNESS);
> +struct mutex deadproc_mutex = MUTEX_INITIALIZER(IPL_SCHED);
> struct prochead deadproc = TAILQ_HEAD_INITIALIZER(deadproc);
>
> /*
> - * We are called from sched_idle() once it is safe to schedule the
> - * dead process's resources to be freed. So this is not allowed to sleep.
> - *
> * We lock the deadproc list, place the proc on that list (using
> - * the p_runq member), and wake up the reaper.
> + * the p_runq member), and wake up init as the reaping process.
> */
> void
> exit2(struct proc *p)
> {
> - /* account the remainder of time spent in exit1() */
> - mtx_enter(&p->p_p->ps_mtx);
> - tuagg_add_process(p->p_p, p);
> - mtx_leave(&p->p_p->ps_mtx);
> -
> mtx_enter(&deadproc_mutex);
> TAILQ_INSERT_TAIL(&deadproc, p, p_runq);
> mtx_leave(&deadproc_mutex);
>
> - wakeup(&deadproc);
> + atomic_setbits_int(&initprocess->ps_flags, PS_WAITEVENT);
> + wakeup(initprocess);
> }
>
> void
> proc_free(struct proc *p)
> {
> + WITNESS_THREAD_EXIT(p);
> +
> + uvm_uarea_free(p);
> + p->p_vmspace = NULL; /* zap the thread's copy */
> +
> + free(p->p_deadcond, M_SUBPROC, sizeof(*p->p_deadcond));
> crfree(p->p_ucred);
> pool_put(&proc_pool, p);
> atomic_dec_int(&nthreads);
> }
>
> /*
> - * Process reaper. This is run by a kernel thread to free the resources
> - * of a dead process. Once the resources are free, the process becomes
> - * a zombie, and the parent is allowed to read the undead's status.
> + * Free proc's resources.
> */
> void
> -reaper(void *arg)
> +proc_reap(struct proc *p)
> {
> - struct proc *p;
> + /* Wait for the thread to be scheduled off the CPU. */
> + cond_wait(p->p_deadcond, "pdead");
>
> - KERNEL_UNLOCK();
> + /*
> + * Free the VM resources we're still holding on to.
> + * We must do this from a valid thread because doing
> + * so may block.
> + */
> + if (p->p_flag & P_THREAD) {
> + /* Just a thread */
> + proc_free(p);
> + } else {
> + struct process *pr = p->p_p;
>
> - SCHED_ASSERT_UNLOCKED();
> + KERNEL_LOCK();
> + /* No one will wait for us, just zap it. */
> + process_remove(pr);
> + KERNEL_UNLOCK();
>
> - for (;;) {
> - mtx_enter(&deadproc_mutex);
> - while ((p = TAILQ_FIRST(&deadproc)) == NULL)
> - msleep_nsec(&deadproc, &deadproc_mutex, PVM, "reaper",
> - INFSLP);
> -
> - /* Remove us from the deadproc list. */
> - TAILQ_REMOVE(&deadproc, p, p_runq);
> - mtx_leave(&deadproc_mutex);
> -
> - WITNESS_THREAD_EXIT(p);
> -
> - /*
> - * Free the VM resources we're still holding on to.
> - * We must do this from a valid thread because doing
> - * so may block.
> - */
> - uvm_uarea_free(p);
> - p->p_vmspace = NULL; /* zap the thread's copy */
> -
> - if (p->p_flag & P_THREAD) {
> - /* Just a thread */
> - proc_free(p);
> - } else {
> - struct process *pr = p->p_p;
> -
> - /* Release the rest of the process's vmspace */
> - uvm_exit(pr);
> -
> - KERNEL_LOCK();
> - if ((pr->ps_flags & PS_NOZOMBIE) == 0) {
> - /* Process is now a true zombie. */
> - atomic_setbits_int(&pr->ps_flags, PS_ZOMBIE);
> - }
> -
> - /* Notify listeners of our demise and clean up. */
> - knote_processexit(pr);
> -
> - if (pr->ps_flags & PS_ZOMBIE) {
> - /* Post SIGCHLD and wake up parent. */
> - prsignal(pr->ps_pptr, SIGCHLD);
> - atomic_setbits_int(&pr->ps_pptr->ps_flags,
> - PS_WAITEVENT);
> - wakeup(pr->ps_pptr);
> - } else {
> - /* No one will wait for us, just zap it. */
> - process_zap(pr);
> - }
> - KERNEL_UNLOCK();
> - }
> + process_zap(pr);
> }
> }
>
> @@ -667,6 +645,26 @@ loop:
> break;
> }
> }
> + /* init(8) accounts for cleaning up deadprocs. */
> + if (q->p_p == initprocess) {
> + struct proc *p;
> + struct schedstate_percpu *spc;
> +
> + KERNEL_UNLOCK();
> + mtx_enter(&deadproc_mutex);
> + while ((p = TAILQ_FIRST(&deadproc)) != NULL) {
> + TAILQ_REMOVE(&deadproc, p, p_runq);
> + mtx_leave(&deadproc_mutex);
> + proc_reap(p);
> + mtx_enter(&deadproc_mutex);
> +
> + spc = &curcpu()->ci_schedstate;
> + if (spc->spc_schedflags & SPCF_SHOULDYIELD)
> + break;
> + }
> + mtx_leave(&deadproc_mutex);
> + KERNEL_LOCK();
> + }
> if (nfound == 0)
> return (ECHILD);
> if (options & WNOHANG) {
> @@ -787,12 +785,20 @@ proc_finish_wait(struct proc *waiter, struct process *pr)
> wakeup(tr);
> } else {
> mtx_leave(&pr->ps_mtx);
> + /* Wait until the proc is off of its CPU. */
> + cond_wait(pr->ps_mainproc->p_deadcond, "pdead");
> +
> scheduler_wait_hook(waiter, pr->ps_mainproc);
> rup = &waiter->p_p->ps_cru;
> ruadd(rup, pr->ps_ru);
> LIST_REMOVE(pr, ps_list); /* off zombprocess */
> freepid(pr->ps_pid);
> + process_remove(pr);
> + KERNEL_UNLOCK();
> +
> process_zap(pr);
> +
> + KERNEL_LOCK();
> }
> }
>
> @@ -857,32 +863,30 @@ process_reparent(struct process *child, struct process *parent)
> }
>
> void
> -process_zap(struct process *pr)
> +process_remove(struct process *pr)
> {
> - struct vnode *otvp;
> - struct proc *p = pr->ps_mainproc;
> -
> /*
> * Finally finished with old proc entry.
> - * Unlink it from its process group and free it.
> + * Unlink it from its process group.
> */
> leavepgrp(pr);
> LIST_REMOVE(pr, ps_sibling);
> process_clear_orphan(pr);
> +}
> +
> +void
> +process_zap(struct process *pr)
> +{
> + struct proc *p = pr->ps_mainproc;
> +
> + /* Release the rest of the process's vmspace */
> + uvm_exit(pr);
>
> /*
> * Decrement the count of procs running with this uid.
> */
> (void)chgproccnt(pr->ps_ucred->cr_ruid, -1);
>
> - /*
> - * Release reference to text vnode
> - */
> - otvp = pr->ps_textvp;
> - pr->ps_textvp = NULL;
> - if (otvp)
> - vrele(otvp);
> -
> KASSERT(pr->ps_threadcnt == 0);
> KASSERT(pr->ps_exitcnt == 1);
> if (pr->ps_ptstat != NULL)
> @@ -893,7 +897,7 @@ process_zap(struct process *pr)
> lim_free(pr->ps_limit);
> crfree(pr->ps_ucred);
> pool_put(&process_pool, pr);
> - nprocesses--;
> + atomic_dec_int(&nprocesses);
>
> proc_free(p);
> }
> diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c
> index 5b925518234d..e1fbdfa14b66 100644
> --- a/sys/kern/kern_fork.c
> +++ b/sys/kern/kern_fork.c
> @@ -78,7 +78,8 @@ void unveil_copy(struct process *parent, struct process *child);
>
> struct proc *thread_new(struct proc *_parent, vaddr_t _uaddr);
> struct process *process_new(struct proc *, struct process *, int);
> -int fork_check_maxthread(uid_t _uid);
> +int fork_check_maxthread(uid_t);
> +int fork_check_maxprocess(uid_t);
>
> void
> fork_return(void *arg)
> @@ -164,6 +165,8 @@ thread_new(struct proc *parent, vaddr_t uaddr)
> (caddr_t)&p->p_endcopy - (caddr_t)&p->p_startcopy);
> crhold(p->p_ucred);
> p->p_addr = (struct user *)uaddr;
> + p->p_deadcond = malloc(sizeof(*p->p_deadcond), M_SUBPROC, M_WAITOK);
> + cond_init(p->p_deadcond);
>
> /*
> * Initialize the timeouts.
> @@ -333,6 +336,25 @@ fork_check_maxthread(uid_t uid)
> return 0;
> }
>
> +int
> +fork_check_maxprocess(uid_t uid)
> +{
> + int maxprocess_local, val;
> +
> + maxprocess_local = atomic_load_int(&maxprocess);
> + val = atomic_inc_int_nv(&nprocesses);
> + if ((val > maxprocess_local - 5 && uid != 0) ||
> + val > maxprocess_local) {
> + static struct timeval lasttfm;
> +
> + if (ratecheck(&lasttfm, &fork_tfmrate))
> + tablefull("process");
> + atomic_dec_int(&nprocesses);
> + return EAGAIN;
> + }
> + return 0;
> +}
> +
> static inline void
> fork_thread_start(struct proc *p, struct proc *parent, int flags)
> {
> @@ -355,7 +377,7 @@ fork1(struct proc *curp, int flags, void (*func)(void *), void *arg,
> struct proc *p;
> uid_t uid = curp->p_ucred->cr_ruid;
> struct vmspace *vm;
> - int count, maxprocess_local;
> + int count;
> vaddr_t uaddr;
> int error;
> struct ptrace_state *newptstat = NULL;
> @@ -368,17 +390,10 @@ fork1(struct proc *curp, int flags, void (*func)(void *), void *arg,
> if ((error = fork_check_maxthread(uid)))
> return error;
>
> - maxprocess_local = atomic_load_int(&maxprocess);
> - if ((nprocesses >= maxprocess_local - 5 && uid != 0) ||
> - nprocesses >= maxprocess_local) {
> - static struct timeval lasttfm;
> -
> - if (ratecheck(&lasttfm, &fork_tfmrate))
> - tablefull("process");
> + if ((error = fork_check_maxprocess(uid))) {
> atomic_dec_int(&nthreads);
> - return EAGAIN;
> + return error;
> }
> - nprocesses++;
>
> /*
> * Increment the count of processes running with this uid.
> @@ -387,7 +402,7 @@ fork1(struct proc *curp, int flags, void (*func)(void *), void *arg,
> count = chgproccnt(uid, 1);
> if (uid != 0 && count > lim_cur(RLIMIT_NPROC)) {
> (void)chgproccnt(uid, -1);
> - nprocesses--;
> + atomic_dec_int(&nprocesses);
> atomic_dec_int(&nthreads);
> return EAGAIN;
> }
> @@ -395,7 +410,7 @@ fork1(struct proc *curp, int flags, void (*func)(void *), void *arg,
> uaddr = uvm_uarea_alloc();
> if (uaddr == 0) {
> (void)chgproccnt(uid, -1);
> - nprocesses--;
> + atomic_dec_int(&nprocesses);
> atomic_dec_int(&nthreads);
> return (ENOMEM);
> }
> @@ -714,6 +729,12 @@ proc_trampoline_mi(void)
> assertwaitok();
> smr_idle();
>
> + /* Signal that the previous proc is off the CPU now. */
> + if (spc->spc_deadcond) {
> + cond_signal(spc->spc_deadcond);
> + spc->spc_deadcond = NULL;
> + }
> +
> /* Start any optional clock interrupts needed by the thread. */
> if (ISSET(p->p_p->ps_flags, PS_ITIMER)) {
> atomic_setbits_int(&spc->spc_schedflags, SPCF_ITIMER);
> diff --git a/sys/kern/kern_sched.c b/sys/kern/kern_sched.c
> index 74183e6bb681..6a80146a36c2 100644
> --- a/sys/kern/kern_sched.c
> +++ b/sys/kern/kern_sched.c
> @@ -34,6 +34,7 @@ void sched_kthreads_create(void *);
>
> int sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p);
> struct proc *sched_steal_proc(struct cpu_info *);
> +void sched_to(struct proc *);
>
> /*
> * To help choosing which cpu should run which process we keep track
> @@ -107,7 +108,6 @@ sched_init_cpu(struct cpu_info *ci)
>
> kthread_create_deferred(sched_kthreads_create, ci);
>
> - TAILQ_INIT(&spc->spc_deadproc);
> SIMPLEQ_INIT(&spc->spc_deferred);
>
> /*
> @@ -170,16 +170,9 @@ sched_idle(void *v)
>
> while (1) {
> while (spc->spc_whichqs != 0) {
> - struct proc *dead;
> -
> SCHED_LOCK();
> p->p_stat = SSLEEP;
> mi_switch();
> -
> - while ((dead = TAILQ_FIRST(&spc->spc_deadproc))) {
> - TAILQ_REMOVE(&spc->spc_deadproc, dead, p_runq);
> - exit2(dead);
> - }
> }
>
> splassert(IPL_NONE);
> @@ -209,31 +202,28 @@ sched_idle(void *v)
>
> /*
> * To free our address space we have to jump through a few hoops.
> - * The freeing is done by the reaper, but until we have one reaper
> - * per cpu, we have no way of putting this proc on the deadproc list
> - * and waking up the reaper without risking having our address space and
> - * stack torn from under us before we manage to switch to another proc.
> - * Therefore we have a per-cpu list of dead processes where we put this
> - * proc and have idle clean up that list and move it to the reaper list.
> + * The freeing is done by the reaper. We make sure that this proc
> + * gets freed only after switching to another proc with the spc_deadcond
> + * signal. The reaping process waits for it.
> */
> void
> sched_exit(struct proc *p)
> {
> struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
>
> - TAILQ_INSERT_TAIL(&spc->spc_deadproc, p, p_runq);
> + KASSERT(spc->spc_deadcond == NULL);
> + spc->spc_deadcond = p->p_deadcond;
>
> tuagg_add_runtime();
>
> - KERNEL_ASSERT_LOCKED();
> - sched_toidle();
> + SCHED_LOCK();
> + sched_to(sched_chooseproc());
> }
>
> void
> -sched_toidle(void)
> +sched_to(struct proc *nextproc)
> {
> struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
> - struct proc *idle;
>
> #ifdef MULTIPROCESSOR
> /* This process no longer needs to hold the kernel lock. */
> @@ -252,18 +242,28 @@ sched_toidle(void)
>
> atomic_clearbits_int(&spc->spc_schedflags, SPCF_SWITCHCLEAR);
>
> - SCHED_LOCK();
> - idle = spc->spc_idleproc;
> - idle->p_stat = SRUN;
> + SCHED_ASSERT_LOCKED();
>
> uvmexp.swtch++;
> if (curproc != NULL)
> - TRACEPOINT(sched, off__cpu, idle->p_tid + THREAD_PID_OFFSET,
> - idle->p_p->ps_pid);
> - cpu_switchto(NULL, idle);
> + TRACEPOINT(sched, off__cpu, nextproc->p_tid + THREAD_PID_OFFSET,
> + nextproc->p_p->ps_pid);
> + cpu_switchto(NULL, nextproc);
> panic("cpu_switchto returned");
> }
>
> +void
> +sched_toidle(void)
> +{
> + struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
> + struct proc *idle;
> +
> + SCHED_LOCK();
> + idle = spc->spc_idleproc;
> + idle->p_stat = SRUN;
> + sched_to(idle);
> +}
> +
> void
> setrunqueue(struct cpu_info *ci, struct proc *p, uint8_t prio)
> {
> diff --git a/sys/kern/sched_bsd.c b/sys/kern/sched_bsd.c
> index 998e8e8b7897..e00b0b0dd420 100644
> --- a/sys/kern/sched_bsd.c
> +++ b/sys/kern/sched_bsd.c
> @@ -423,6 +423,12 @@ mi_switch(void)
> KASSERT(p->p_cpu == curcpu());
> spc = &p->p_cpu->ci_schedstate;
>
> + /* Signal that the previous proc is off the CPU now. */
> + if (spc->spc_deadcond) {
> + cond_signal(spc->spc_deadcond);
> + spc->spc_deadcond = NULL;
> + }
> +
> /* Start any optional clock interrupts needed by the thread. */
> if (ISSET(p->p_p->ps_flags, PS_ITIMER)) {
> atomic_setbits_int(&spc->spc_schedflags, SPCF_ITIMER);
> diff --git a/sys/sys/proc.h b/sys/sys/proc.h
> index c7fe5a03cf44..45dd7cf33d3e 100644
> --- a/sys/sys/proc.h
> +++ b/sys/sys/proc.h
> @@ -339,6 +339,7 @@ struct process {
> #define PS_FLAGS_INHERITED_ON_FORK \
> (PS_SUGID | PS_SUGIDEXEC | PS_PLEDGE | PS_EXECPLEDGE | PS_CHROOT)
>
> +struct cond;
> struct kcov_dev;
> struct lock_list_entry;
> struct kqueue;
> @@ -364,6 +365,7 @@ struct proc {
>
> struct process *p_p; /* [I] The process of this thread. */
> TAILQ_ENTRY(proc) p_thr_link; /* [K|m] Threads in a process linkage. */
> + struct cond *p_deadcond; /* Sync wrt. reaping us. */
>
> /* substructures: */
> struct filedesc *p_fd; /* copy of p_p->ps_fd */
> @@ -554,7 +556,6 @@ extern struct processlist zombprocess; /* List of zombie processes. */
> extern struct proclist allproc; /* List of all threads. */
>
> extern struct process *initprocess; /* Process slot for init. */
> -extern struct proc *reaperproc; /* Thread slot for reaper. */
> extern struct proc *syncerproc; /* filesystem syncer daemon */
>
> extern struct pool process_pool; /* memory pool for processes */
> @@ -588,9 +589,7 @@ void setrunnable(struct proc *);
> void endtsleep(void *);
> int wakeup_proc(struct proc *);
> void unsleep(struct proc *);
> -void reaper(void *);
> __dead void exit1(struct proc *, int, int, int);
> -void exit2(struct proc *);
> void cpu_fork(struct proc *_curp, struct proc *_child, void *_stack,
> void *_tcb, void (*_func)(void *), void *_arg);
> void cpu_exit(struct proc *);
> diff --git a/sys/sys/sched.h b/sys/sys/sched.h
> index 64c7044204c3..9fb9c1e9e2fa 100644
> --- a/sys/sys/sched.h
> +++ b/sys/sys/sched.h
> @@ -110,7 +110,7 @@ struct smr_entry;
> struct schedstate_percpu {
> struct proc *spc_idleproc; /* idle proc for this cpu */
> TAILQ_HEAD(prochead, proc) spc_qs[SCHED_NQS];
> - TAILQ_HEAD(,proc) spc_deadproc;
> + struct cond *spc_deadcond; /* [o] Proc is off the CPU condition */
> struct timespec spc_runtime; /* time curproc started running */
> volatile int spc_schedflags; /* flags; see below */
> u_int spc_schedticks; /* ticks for schedclock() */
> diff --git a/sys/uvm/uvm_glue.c b/sys/uvm/uvm_glue.c
> index 748937af5c9d..f0cb8e7827b9 100644
> --- a/sys/uvm/uvm_glue.c
> +++ b/sys/uvm/uvm_glue.c
> @@ -295,7 +295,7 @@ uvm_uarea_alloc(void)
> * uvm_uarea_free: free a dead thread's stack
> *
> * - the thread passed to us is a dead thread; we
> - * are running on a different context now (the reaper).
> + * are running on a different context now.
> */
> void
> uvm_uarea_free(struct proc *p)
>