Another attempt to get rid of the reaper
Hey Christian,
On 14/09/25(Sun) 22:36, Christian Ludwig wrote:
> Hi,
>
> this is another attempt to get rid of the dedicated reaper thread.
This is lovely. I tried it on arm64 and it mostly works. cc(1) died
with SIGSEGV twice while building a kernel. Unfortunately I couldn't
get any useful information out of egdb(1) and the core file.
I suppose that's because the "pdead" barrier is in the wrong place and
should be moved into process_zap(). Otherwise a zombie's stack can be
unmapped before the thread is off the CPU.
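Something along these lines (untested sketch, just to show where I
think the barrier belongs):

	void
	process_zap(struct process *pr)
	{
		struct proc *p = pr->ps_mainproc;

		/*
		 * Make sure the main thread is off its CPU before we
		 * tear down the vmspace that still holds its stack.
		 */
		cond_wait(p->p_deadcond, "pdead");

		/* Release the rest of the process's vmspace */
		uvm_exit(pr);

		/* ... rest as in your diff ... */
	}

The thread case in proc_reap() would still need its own barrier before
uvm_uarea_free(), though.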
> The main part of the diff is moving all the code around parent wakeup
> into the exit path. Now the parent wakes up earlier, when the child is
> still running. The parent cannot free the child's resources, yet. That's
> why the parent now waits before zapping the child. The scheduler wakes
> it up after the exiting child is off the CPU. With this, I see no reason
> why we can't sleep for most of exit1().
Is it possible to delay the wakeup until after the child is off the CPU?
> Most of the remaining cleanup that is performed in the reaper directly
> is pushed into the parent. init(8) reaps non-zombies from the wait path.
> proc_free() runs unlocked in the reaper already. I unlocked it in the
> wait path, too. Along with most of the freeing in process_zap(), which
> is MP safe IMHO.
I like it. I'd suggest splitting out the bits that should be safe and
trying to commit them first. See below.
> This diff does not force-switch to idle after exit anymore. That needs a
> small fix in amd64's cpu_switch(), since now we can switch to a non-
> system thread.
Indeed.
> I am unsure how to deal with the new proc.p_deadcond member. I kept it
> an opaque pointer. That saves us from exposing struct cond to userspace.
> I'm also not sure that I got the accounting completely right. And there
> is room for cleanups in exit1.
I'm not very happy about stuffing per-thread data into the per-CPU `spc'
descriptor. I wish we could make cpu_switchto(9) return a pointer to
the previous thread; that would also help us with locking and tracing.
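I.e. something like this in mi_switch(), with a hypothetical
cpu_switchto() that returns the previous thread:

	struct proc *prev;

	prev = cpu_switchto(p, nextproc);

	/* We now know exactly which thread left this CPU. */
	if (prev != NULL && prev->p_stat == SDEAD)
		cond_signal(prev->p_deadcond);

That would make the `spc_deadcond' handover unnecessary and let the
off-cpu tracepoint report the previous thread directly.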
That said, it is good enough for now.
> I have tested this on amd64 and i386. I'd appreciate test reports for a
> variety of workloads and architectures.
Comments below.
> ---
> sys/arch/amd64/amd64/locore.S | 5 +-
> sys/kern/init_main.c | 5 -
> sys/kern/kern_exit.c | 208 +++++++++++++++++++++---------------------
> sys/kern/kern_fork.c | 47 +++++++---
> sys/kern/kern_sched.c | 50 +++++-----
> sys/kern/sched_bsd.c | 6 ++
> sys/sys/proc.h | 5 +-
> sys/sys/sched.h | 2 +-
> sys/uvm/uvm_glue.c | 2 +-
> 9 files changed, 178 insertions(+), 152 deletions(-)
>
> diff --git a/sys/arch/amd64/amd64/locore.S b/sys/arch/amd64/amd64/locore.S
> index 2c19fbf0a309..a35b1f998bd6 100644
> --- a/sys/arch/amd64/amd64/locore.S
> +++ b/sys/arch/amd64/amd64/locore.S
> @@ -400,13 +400,14 @@ restore_saved:
> cmpq %rcx,CPUVAR(PROC_PMAP)
> jnz .Lbogus_proc_pmap
> #endif
> - /* record which pmap this CPU should get IPIs for */
> - movq %rbx,CPUVAR(PROC_PMAP)
>
> .Lset_cr3:
> movq %rax,%cr3 /* %rax used below too */
>
> .Lsame_cr3:
> + /* record which pmap this CPU should get IPIs for */
> + movq %rbx,CPUVAR(PROC_PMAP)
> +
> /*
> * If we switched from a userland thread with a shallow call stack
> * (e.g interrupt->ast->mi_ast->prempt->mi_switch->cpu_switchto)
> diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c
> index 9719b2666c5b..242af43d0c27 100644
> --- a/sys/kern/init_main.c
> +++ b/sys/kern/init_main.c
> @@ -117,7 +117,6 @@ struct plimit limit0;
> struct vmspace vmspace0;
> struct sigacts sigacts0;
> struct process *initprocess;
> -struct proc *reaperproc;
>
> extern struct user *proc0paddr;
>
> @@ -501,10 +500,6 @@ main(void *framep)
> if (kthread_create(uvm_pageout, NULL, NULL, "pagedaemon"))
> panic("fork pagedaemon");
>
> - /* Create the reaper daemon kernel thread. */
> - if (kthread_create(reaper, NULL, &reaperproc, "reaper"))
> - panic("fork reaper");
> -
> /* Create the cleaner daemon kernel thread. */
> if (kthread_create(buf_daemon, NULL, &cleanerproc, "cleaner"))
> panic("fork cleaner");
> diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c
> index df04c4270eac..3a49c9b53509 100644
> --- a/sys/kern/kern_exit.c
> +++ b/sys/kern/kern_exit.c
> @@ -69,8 +69,11 @@
> #include <sys/kcov.h>
> #endif
>
> +void exit2(struct proc *);
> void proc_finish_wait(struct proc *, struct process *);
> void process_clear_orphan(struct process *);
> +void proc_reap(struct proc *);
> +void process_remove(struct process *);
> void process_zap(struct process *);
> void proc_free(struct proc *);
> void unveil_destroy(struct process *ps);
> @@ -118,6 +121,7 @@ exit1(struct proc *p, int xexit, int xsig, int flags)
> {
> struct process *pr, *qr, *nqr;
> struct rusage *rup;
> + struct vnode *otvp;
>
> atomic_setbits_int(&p->p_flag, P_WEXIT);
>
> @@ -259,6 +263,14 @@ exit1(struct proc *p, int xexit, int xsig, int flags)
> uvm_purge();
> KERNEL_LOCK();
> }
> +
> + /*
> + * Release reference to text vnode
> + */
> + otvp = pr->ps_textvp;
> + pr->ps_textvp = NULL;
> + if (otvp)
> + vrele(otvp);
> }
Can this be committed independently? What are the requirements of this
change?
> p->p_fd = NULL; /* zap the thread's copy */
> @@ -276,15 +288,10 @@ exit1(struct proc *p, int xexit, int xsig, int flags)
> * Remove proc from pidhash chain and allproc so looking
> * it up won't work. We will put the proc on the
> * deadproc list later (using the p_runq member), and
> - * wake up the reaper when we do. If this is the last
> + * wake up the reaping process when we do. If this is the last
> * thread of a process that isn't PS_NOZOMBIE, we'll put
> * the process on the zombprocess list below.
> */
> - /*
> - * NOTE: WE ARE NO LONGER ALLOWED TO SLEEP!
> - */
> - p->p_stat = SDEAD;
> -
> LIST_REMOVE(p, p_hash);
> LIST_REMOVE(p, p_list);
>
> @@ -359,6 +366,8 @@ exit1(struct proc *p, int xexit, int xsig, int flags)
> p->p_pctcpu = 0;
>
> if ((p->p_flag & P_THREAD) == 0) {
> + struct process *pptr = pr->ps_pptr;
> +
> /*
> * Final thread has died, so add on our children's rusage
> * and calculate the total times.
> @@ -369,6 +378,9 @@ exit1(struct proc *p, int xexit, int xsig, int flags)
> rup->ru_isrss = pr->ps_tu.tu_isrss;
> ruadd(rup, &pr->ps_cru);
>
> + /* Notify listeners of our demise and clean up. */
> + knote_processexit(pr);
> +
This depends on the SDEAD chunk above being delayed, because
knote_processexit() can sleep. Are we sure it is safe to delay it? If
so, can this be extracted and committed independently?
> /*
> * Notify parent that we're gone. If we're not going to
> * become a zombie, reparent to process 1 (init) so that
> @@ -376,13 +388,16 @@ exit1(struct proc *p, int xexit, int xsig, int flags)
> * wait4() to return ECHILD.
> */
> mtx_enter(&pr->ps_mtx);
> - if (pr->ps_flags & PS_NOZOMBIE) {
> - struct process *ppr = pr->ps_pptr;
> + if (pr->ps_flags & PS_NOZOMBIE)
> process_reparent(pr, initprocess);
> - atomic_setbits_int(&ppr->ps_flags, PS_WAITEVENT);
> - wakeup(ppr);
> + else {
> + /* Process is now a true zombie. */
> + atomic_setbits_int(&pr->ps_flags, PS_ZOMBIE);
> + prsignal(pptr, SIGCHLD);
> }
> mtx_leave(&pr->ps_mtx);
> + atomic_setbits_int(&pptr->ps_flags, PS_WAITEVENT);
> + wakeup(pptr);
> }
>
> /* just a thread? check if last one standing. */
> @@ -396,9 +411,18 @@ exit1(struct proc *p, int xexit, int xsig, int flags)
> }
>
> /*
> - * Other substructures are freed from reaper and wait().
> + * Other substructures are freed from wait().
> */
>
> + /*
> + * NOTE: WE ARE NO LONGER ALLOWED TO SLEEP!
> + */
> + p->p_stat = SDEAD;
> +
> + /* Do not feed zombies to init(8). */
> + if ((p->p_flag & P_THREAD) || (pr->ps_flags & PS_NOZOMBIE))
> + exit2(p);
Dead processes, except zombies, have been adopted by `initprocess' just
above. So we can kill the `deadproc' list and its mutex and instead
iterate `ps_children'. This would reduce the diff by getting rid of
exit2().
Note that dowait6() currently skips PS_NOZOMBIE, which should be
relaxed when the caller is `initprocess'.
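Roughly, for the process side (untested, and the "fully exited" check
is hand-waved):

	struct process *pr, *tpr;

	LIST_FOREACH_SAFE(pr, &initprocess->ps_children, ps_sibling, tpr) {
		/* Only fully exited PS_NOZOMBIE children need reaping. */
		if ((pr->ps_flags & (PS_NOZOMBIE | PS_EXITING)) !=
		    (PS_NOZOMBIE | PS_EXITING))
			continue;
		proc_reap(pr->ps_mainproc);
	}

Dead threads (P_THREAD) would still need a path of their own.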
> /*
> * Finally, call machine-dependent code.
> */
> @@ -408,10 +432,6 @@ exit1(struct proc *p, int xexit, int xsig, int flags)
> * Deactivate the exiting address space before the vmspace
> * is freed. Note that we will continue to run on this
> * vmspace's context until the switch to idle in sched_exit().
> - *
> - * Once we are no longer using the dead process's vmspace and
> - * stack, exit2() will be called to schedule those resources
> - * to be released by the reaper thread.
> */
> pmap_deactivate(p);
> sched_exit(p);
> @@ -419,109 +439,67 @@ exit1(struct proc *p, int xexit, int xsig, int flags)
> }
>
> /*
> - * Locking of this prochead is special; it's accessed in a
> - * critical section of process exit, and thus locking it can't
> - * modify interrupt state. We use a simple spin lock for this
> - * prochead. We use the p_runq member to linkup to deadproc.
> + * We re-use the p_runq member to linkup to deadproc.
> + * The dead proc cannot sleep anymore.
> */
> -struct mutex deadproc_mutex =
> - MUTEX_INITIALIZER_FLAGS(IPL_NONE, "deadproc", MTX_NOWITNESS);
> +struct mutex deadproc_mutex = MUTEX_INITIALIZER(IPL_SCHED);
> struct prochead deadproc = TAILQ_HEAD_INITIALIZER(deadproc);
>
> /*
> - * We are called from sched_idle() once it is safe to schedule the
> - * dead process's resources to be freed. So this is not allowed to sleep.
> - *
> * We lock the deadproc list, place the proc on that list (using
> - * the p_runq member), and wake up the reaper.
> + * the p_runq member), and wake up init as the reaping process.
> */
> void
> exit2(struct proc *p)
> {
> - /* account the remainder of time spent in exit1() */
> - mtx_enter(&p->p_p->ps_mtx);
> - tuagg_add_process(p->p_p, p);
> - mtx_leave(&p->p_p->ps_mtx);
> -
> mtx_enter(&deadproc_mutex);
> TAILQ_INSERT_TAIL(&deadproc, p, p_runq);
> mtx_leave(&deadproc_mutex);
>
> - wakeup(&deadproc);
> + atomic_setbits_int(&initprocess->ps_flags, PS_WAITEVENT);
> + wakeup(initprocess);
> }
>
> void
> proc_free(struct proc *p)
> {
> + WITNESS_THREAD_EXIT(p);
> +
> + uvm_uarea_free(p);
> + p->p_vmspace = NULL; /* zap the thread's copy */
> +
> + free(p->p_deadcond, M_SUBPROC, sizeof(*p->p_deadcond));
> crfree(p->p_ucred);
> pool_put(&proc_pool, p);
> atomic_dec_int(&nthreads);
> }
>
> /*
> - * Process reaper. This is run by a kernel thread to free the resources
> - * of a dead process. Once the resources are free, the process becomes
> - * a zombie, and the parent is allowed to read the undead's status.
> + * Free proc's ressources.
> */
> void
> -reaper(void *arg)
> +proc_reap(struct proc *p)
> {
> - struct proc *p;
> + /* Wait for the thread to be scheduled off the CPU. */
> + cond_wait(p->p_deadcond, "pdead");
>
> - KERNEL_UNLOCK();
> + /*
> + * Free the VM resources we're still holding on to.
> + * We must do this from a valid thread because doing
> + * so may block.
> + */
> + if (p->p_flag & P_THREAD) {
> + /* Just a thread */
> + proc_free(p);
> + } else {
> + struct process *pr = p->p_p;
>
> - SCHED_ASSERT_UNLOCKED();
> + KERNEL_LOCK();
> + /* No one will wait for us, just zap it. */
> + process_remove(pr);
> + KERNEL_UNLOCK();
>
> - for (;;) {
> - mtx_enter(&deadproc_mutex);
> - while ((p = TAILQ_FIRST(&deadproc)) == NULL)
> - msleep_nsec(&deadproc, &deadproc_mutex, PVM, "reaper",
> - INFSLP);
> -
> - /* Remove us from the deadproc list. */
> - TAILQ_REMOVE(&deadproc, p, p_runq);
> - mtx_leave(&deadproc_mutex);
> -
> - WITNESS_THREAD_EXIT(p);
> -
> - /*
> - * Free the VM resources we're still holding on to.
> - * We must do this from a valid thread because doing
> - * so may block.
> - */
> - uvm_uarea_free(p);
> - p->p_vmspace = NULL; /* zap the thread's copy */
> -
> - if (p->p_flag & P_THREAD) {
> - /* Just a thread */
> - proc_free(p);
> - } else {
> - struct process *pr = p->p_p;
> -
> - /* Release the rest of the process's vmspace */
> - uvm_exit(pr);
> -
> - KERNEL_LOCK();
> - if ((pr->ps_flags & PS_NOZOMBIE) == 0) {
> - /* Process is now a true zombie. */
> - atomic_setbits_int(&pr->ps_flags, PS_ZOMBIE);
> - }
> -
> - /* Notify listeners of our demise and clean up. */
> - knote_processexit(pr);
> -
> - if (pr->ps_flags & PS_ZOMBIE) {
> - /* Post SIGCHLD and wake up parent. */
> - prsignal(pr->ps_pptr, SIGCHLD);
> - atomic_setbits_int(&pr->ps_pptr->ps_flags,
> - PS_WAITEVENT);
> - wakeup(pr->ps_pptr);
> - } else {
> - /* No one will wait for us, just zap it. */
> - process_zap(pr);
> - }
> - KERNEL_UNLOCK();
> - }
> + process_zap(pr);
> }
> }
>
> @@ -667,6 +645,26 @@ loop:
> break;
> }
> }
> + /* init(8) accounts for cleaning up deadprocs. */
> + if (q->p_p == initprocess) {
> + struct proc *p;
> + struct schedstate_percpu *spc;
> +
> + KERNEL_UNLOCK();
> + mtx_enter(&deadproc_mutex);
> + while ((p = TAILQ_FIRST(&deadproc)) != NULL) {
> + TAILQ_REMOVE(&deadproc, p, p_runq);
> + mtx_leave(&deadproc_mutex);
> + proc_reap(p);
> + mtx_enter(&deadproc_mutex);
> +
> + spc = &curcpu()->ci_schedstate;
> + if (spc->spc_schedflags & SPCF_SHOULDYIELD)
> + break;
> + }
> + mtx_leave(&deadproc_mutex);
> + KERNEL_LOCK();
> + }
> if (nfound == 0)
> return (ECHILD);
> if (options & WNOHANG) {
> @@ -787,12 +785,20 @@ proc_finish_wait(struct proc *waiter, struct process *pr)
> wakeup(tr);
> } else {
> mtx_leave(&pr->ps_mtx);
> + /* Wait until the proc is off of its CPU. */
> + cond_wait(pr->ps_mainproc->p_deadcond, "pdead");
> +
> scheduler_wait_hook(waiter, pr->ps_mainproc);
> rup = &waiter->p_p->ps_cru;
> ruadd(rup, pr->ps_ru);
> LIST_REMOVE(pr, ps_list); /* off zombprocess */
> freepid(pr->ps_pid);
> + process_remove(pr);
> + KERNEL_UNLOCK();
> +
> process_zap(pr);
> +
> }
> }
>
> @@ -857,32 +863,30 @@ process_reparent(struct process *child, struct process *parent)
> }
>
> void
> -process_zap(struct process *pr)
> +process_remove(struct process *pr)
> {
> - struct vnode *otvp;
> - struct proc *p = pr->ps_mainproc;
> -
> /*
> * Finally finished with old proc entry.
> - * Unlink it from its process group and free it.
> + * Unlink it from its process group.
> */
> leavepgrp(pr);
> LIST_REMOVE(pr, ps_sibling);
> process_clear_orphan(pr);
> +}
> +
> +void
> +process_zap(struct process *pr)
> +{
> + struct proc *p = pr->ps_mainproc;
> +
> + /* Release the rest of the process's vmspace */
> + uvm_exit(pr);
>
> /*
> * Decrement the count of procs running with this uid.
> */
> (void)chgproccnt(pr->ps_ucred->cr_ruid, -1);
>
> - /*
> - * Release reference to text vnode
> - */
> - otvp = pr->ps_textvp;
> - pr->ps_textvp = NULL;
> - if (otvp)
> - vrele(otvp);
> -
> KASSERT(pr->ps_threadcnt == 0);
> KASSERT(pr->ps_exitcnt == 1);
> if (pr->ps_ptstat != NULL)
> @@ -893,7 +897,7 @@ process_zap(struct process *pr)
> lim_free(pr->ps_limit);
> crfree(pr->ps_ucred);
> pool_put(&process_pool, pr);
> - nprocesses--;
> + atomic_dec_int(&nprocesses);
>
> proc_free(p);
> }
> diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c
> index 5b925518234d..e1fbdfa14b66 100644
> --- a/sys/kern/kern_fork.c
> +++ b/sys/kern/kern_fork.c
> @@ -78,7 +78,8 @@ void unveil_copy(struct process *parent, struct process *child);
>
> struct proc *thread_new(struct proc *_parent, vaddr_t _uaddr);
> struct process *process_new(struct proc *, struct process *, int);
> -int fork_check_maxthread(uid_t _uid);
> +int fork_check_maxthread(uid_t);
> +int fork_check_maxprocess(uid_t);
>
> void
> fork_return(void *arg)
> @@ -164,6 +165,8 @@ thread_new(struct proc *parent, vaddr_t uaddr)
> (caddr_t)&p->p_endcopy - (caddr_t)&p->p_startcopy);
> crhold(p->p_ucred);
> p->p_addr = (struct user *)uaddr;
> + p->p_deadcond = malloc(sizeof(*p->p_deadcond), M_SUBPROC, M_WAITOK);
> + cond_init(p->p_deadcond);
>
> /*
> * Initialize the timeouts.
> @@ -333,6 +336,25 @@ fork_check_maxthread(uid_t uid)
> return 0;
> }
>
> +int
> +fork_check_maxprocess(uid_t uid)
> +{
> + int maxprocess_local, val;
> +
> + maxprocess_local = atomic_load_int(&maxprocess);
> + val = atomic_inc_int_nv(&nprocesses);
> + if ((val > maxprocess_local - 5 && uid != 0) ||
> + val > maxprocess_local) {
> + static struct timeval lasttfm;
> +
> + if (ratecheck(&lasttfm, &fork_tfmrate))
> + tablefull("process");
> + atomic_dec_int(&nprocesses);
> + return EAGAIN;
> + }
> + return 0;
> +}
> +
This could already be extracted and easily committed.
> static inline void
> fork_thread_start(struct proc *p, struct proc *parent, int flags)
> {
> @@ -355,7 +377,7 @@ fork1(struct proc *curp, int flags, void (*func)(void *), void *arg,
> struct proc *p;
> uid_t uid = curp->p_ucred->cr_ruid;
> struct vmspace *vm;
> - int count, maxprocess_local;
> + int count;
> vaddr_t uaddr;
> int error;
> struct ptrace_state *newptstat = NULL;
> @@ -368,17 +390,10 @@ fork1(struct proc *curp, int flags, void (*func)(void *), void *arg,
> if ((error = fork_check_maxthread(uid)))
> return error;
>
> - maxprocess_local = atomic_load_int(&maxprocess);
> - if ((nprocesses >= maxprocess_local - 5 && uid != 0) ||
> - nprocesses >= maxprocess_local) {
> - static struct timeval lasttfm;
> -
> - if (ratecheck(&lasttfm, &fork_tfmrate))
> - tablefull("process");
> + if ((error = fork_check_maxprocess(uid))) {
> atomic_dec_int(&nthreads);
> - return EAGAIN;
> + return error;
> }
> - nprocesses++;
>
> /*
> * Increment the count of processes running with this uid.
> @@ -387,7 +402,7 @@ fork1(struct proc *curp, int flags, void (*func)(void *), void *arg,
> count = chgproccnt(uid, 1);
> if (uid != 0 && count > lim_cur(RLIMIT_NPROC)) {
> (void)chgproccnt(uid, -1);
> - nprocesses--;
> + atomic_dec_int(&nprocesses);
> atomic_dec_int(&nthreads);
> return EAGAIN;
> }
> @@ -395,7 +410,7 @@ fork1(struct proc *curp, int flags, void (*func)(void *), void *arg,
> uaddr = uvm_uarea_alloc();
> if (uaddr == 0) {
> (void)chgproccnt(uid, -1);
> - nprocesses--;
> + atomic_dec_int(&nprocesses);
> atomic_dec_int(&nthreads);
> return (ENOMEM);
> }
> @@ -714,6 +729,12 @@ proc_trampoline_mi(void)
> assertwaitok();
> smr_idle();
>
> + /* Signal that the previous proc is off the CPU now. */
> + if (spc->spc_deadcond) {
> + cond_signal(spc->spc_deadcond);
> + spc->spc_deadcond = NULL;
> + }
> +
> /* Start any optional clock interrupts needed by the thread. */
> if (ISSET(p->p_p->ps_flags, PS_ITIMER)) {
> atomic_setbits_int(&spc->spc_schedflags, SPCF_ITIMER);
> diff --git a/sys/kern/kern_sched.c b/sys/kern/kern_sched.c
> index 74183e6bb681..6a80146a36c2 100644
> --- a/sys/kern/kern_sched.c
> +++ b/sys/kern/kern_sched.c
> @@ -34,6 +34,7 @@ void sched_kthreads_create(void *);
>
> int sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p);
> struct proc *sched_steal_proc(struct cpu_info *);
> +void sched_to(struct proc *);
>
> /*
> * To help choosing which cpu should run which process we keep track
> @@ -107,7 +108,6 @@ sched_init_cpu(struct cpu_info *ci)
>
> kthread_create_deferred(sched_kthreads_create, ci);
>
> - TAILQ_INIT(&spc->spc_deadproc);
> SIMPLEQ_INIT(&spc->spc_deferred);
>
> /*
> @@ -170,16 +170,9 @@ sched_idle(void *v)
>
> while (1) {
> while (spc->spc_whichqs != 0) {
> - struct proc *dead;
> -
> SCHED_LOCK();
> p->p_stat = SSLEEP;
> mi_switch();
> -
> - while ((dead = TAILQ_FIRST(&spc->spc_deadproc))) {
> - TAILQ_REMOVE(&spc->spc_deadproc, dead, p_runq);
> - exit2(dead);
> - }
> }
>
> splassert(IPL_NONE);
> @@ -209,31 +202,28 @@ sched_idle(void *v)
>
> /*
> * To free our address space we have to jump through a few hoops.
> - * The freeing is done by the reaper, but until we have one reaper
> - * per cpu, we have no way of putting this proc on the deadproc list
> - * and waking up the reaper without risking having our address space and
> - * stack torn from under us before we manage to switch to another proc.
> - * Therefore we have a per-cpu list of dead processes where we put this
> - * proc and have idle clean up that list and move it to the reaper list.
> + * The freeing is done by the reaper. We make sure that this proc
> + * gets freed only after switching to another proc with the spc_deadcond
> + * signal. The reaping process waits for it.
> */
> void
> sched_exit(struct proc *p)
> {
> struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
>
> - TAILQ_INSERT_TAIL(&spc->spc_deadproc, p, p_runq);
> + KASSERT(spc->spc_deadcond == NULL);
> + spc->spc_deadcond = p->p_deadcond;
>
> tuagg_add_runtime();
>
> - KERNEL_ASSERT_LOCKED();
> - sched_toidle();
> + SCHED_LOCK();
> + sched_to(sched_chooseproc());
> }
>
> void
> -sched_toidle(void)
> +sched_to(struct proc *nextproc)
> {
> struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
> - struct proc *idle;
>
> #ifdef MULTIPROCESSOR
> /* This process no longer needs to hold the kernel lock. */
> @@ -252,18 +242,28 @@ sched_toidle(void)
>
> atomic_clearbits_int(&spc->spc_schedflags, SPCF_SWITCHCLEAR);
>
> - SCHED_LOCK();
> - idle = spc->spc_idleproc;
> - idle->p_stat = SRUN;
> + SCHED_ASSERT_LOCKED();
>
> uvmexp.swtch++;
> if (curproc != NULL)
> - TRACEPOINT(sched, off__cpu, idle->p_tid + THREAD_PID_OFFSET,
> - idle->p_p->ps_pid);
> - cpu_switchto(NULL, idle);
> + TRACEPOINT(sched, off__cpu, nextproc->p_tid + THREAD_PID_OFFSET,
> + nextproc->p_p->ps_pid);
> + cpu_switchto(NULL, nextproc);
> panic("cpu_switchto returned");
> }
>
> +void
> +sched_toidle(void)
> +{
> + struct schedstate_percpu *spc = &curcpu()->ci_schedstate;
> + struct proc *idle;
> +
> + SCHED_LOCK();
> + idle = spc->spc_idleproc;
> + idle->p_stat = SRUN;
> + sched_to(idle);
> +}
> +
> void
> setrunqueue(struct cpu_info *ci, struct proc *p, uint8_t prio)
> {
> diff --git a/sys/kern/sched_bsd.c b/sys/kern/sched_bsd.c
> index 998e8e8b7897..e00b0b0dd420 100644
> --- a/sys/kern/sched_bsd.c
> +++ b/sys/kern/sched_bsd.c
> @@ -423,6 +423,12 @@ mi_switch(void)
> KASSERT(p->p_cpu == curcpu());
> spc = &p->p_cpu->ci_schedstate;
>
> + /* Signal that the previous proc is off the CPU now. */
> + if (spc->spc_deadcond) {
> + cond_signal(spc->spc_deadcond);
> + spc->spc_deadcond = NULL;
> + }
> +
> /* Start any optional clock interrupts needed by the thread. */
> if (ISSET(p->p_p->ps_flags, PS_ITIMER)) {
> atomic_setbits_int(&spc->spc_schedflags, SPCF_ITIMER);
> diff --git a/sys/sys/proc.h b/sys/sys/proc.h
> index c7fe5a03cf44..45dd7cf33d3e 100644
> --- a/sys/sys/proc.h
> +++ b/sys/sys/proc.h
> @@ -339,6 +339,7 @@ struct process {
> #define PS_FLAGS_INHERITED_ON_FORK \
> (PS_SUGID | PS_SUGIDEXEC | PS_PLEDGE | PS_EXECPLEDGE | PS_CHROOT)
>
> +struct cond;
> struct kcov_dev;
> struct lock_list_entry;
> struct kqueue;
> @@ -364,6 +365,7 @@ struct proc {
>
> struct process *p_p; /* [I] The process of this thread. */
> TAILQ_ENTRY(proc) p_thr_link; /* [K|m] Threads in a process linkage. */
> + struct cond *p_deadcond; /* Sync wrt. reaping us. */
>
> /* substructures: */
> struct filedesc *p_fd; /* copy of p_p->ps_fd */
> @@ -554,7 +556,6 @@ extern struct processlist zombprocess; /* List of zombie processes. */
> extern struct proclist allproc; /* List of all threads. */
>
> extern struct process *initprocess; /* Process slot for init. */
> -extern struct proc *reaperproc; /* Thread slot for reaper. */
> extern struct proc *syncerproc; /* filesystem syncer daemon */
>
> extern struct pool process_pool; /* memory pool for processes */
> @@ -588,9 +589,7 @@ void setrunnable(struct proc *);
> void endtsleep(void *);
> int wakeup_proc(struct proc *);
> void unsleep(struct proc *);
> -void reaper(void *);
> __dead void exit1(struct proc *, int, int, int);
> -void exit2(struct proc *);
> void cpu_fork(struct proc *_curp, struct proc *_child, void *_stack,
> void *_tcb, void (*_func)(void *), void *_arg);
> void cpu_exit(struct proc *);
> diff --git a/sys/sys/sched.h b/sys/sys/sched.h
> index 64c7044204c3..9fb9c1e9e2fa 100644
> --- a/sys/sys/sched.h
> +++ b/sys/sys/sched.h
> @@ -110,7 +110,7 @@ struct smr_entry;
> struct schedstate_percpu {
> struct proc *spc_idleproc; /* idle proc for this cpu */
> TAILQ_HEAD(prochead, proc) spc_qs[SCHED_NQS];
> - TAILQ_HEAD(,proc) spc_deadproc;
> + struct cond *spc_deadcond; /* [o] Proc is off the CPU condition */
> struct timespec spc_runtime; /* time curproc started running */
> volatile int spc_schedflags; /* flags; see below */
> u_int spc_schedticks; /* ticks for schedclock() */
> diff --git a/sys/uvm/uvm_glue.c b/sys/uvm/uvm_glue.c
> index 748937af5c9d..f0cb8e7827b9 100644
> --- a/sys/uvm/uvm_glue.c
> +++ b/sys/uvm/uvm_glue.c
> @@ -295,7 +295,7 @@ uvm_uarea_alloc(void)
> * uvm_uarea_free: free a dead thread's stack
> *
> * - the thread passed to us is a dead thread; we
> - * are running on a different context now (the reaper).
> + * are running on a different context now.
> */
> void
> uvm_uarea_free(struct proc *p)
>