From: Claudio Jeker Subject: Re: One reaper per CPU To: Christian Ludwig Cc: "mark.kettenis@xs4all.nl" , "tech@openbsd.org" Date: Wed, 10 Jul 2024 20:25:00 +0200 On Wed, Jul 10, 2024 at 05:36:02PM +0000, Christian Ludwig wrote: > Hi, > > On Wed, 2024-07-10 at 17:42 +0200, Mark Kettenis wrote: > > > Date: Wed, 10 Jul 2024 17:21:35 +0200 > > > From: Christian Ludwig > > > > Hi Christian, > > > > > Hi, > > > > > > This implements one reaper process per CPU. The benefit is that we > > > can > > > switch to a safe stack (the reaper) on process teardown without the > > > need > > > to lock data structures between sched_exit() and the local reaper. > > > That > > > means the global deadproc mutex goes away. And there is also no > > > need to > > > go through idle anymore, because we know that the local reaper is > > > not > > > currently running when the dying proc enlists itself. > > > > > > I have tested it lightly. It does not fall apart immediately. I do > > > not > > > see any performance regressions in my test case, which is > > > recompiling > > > the kernel over and over again, but your mileage may vary. So please > > > give > > > this some serious testing on your boxes. 
> > > > > > So long, > > > > > >  - Christian > > > > > > --- > > >  sys/kern/init_main.c  |  5 ---- > > >  sys/kern/kern_exit.c  | 59 ++++++++++++--------------------------- > > > ---- > > >  sys/kern/kern_sched.c | 43 ++++++++++++++++++++----------- > > >  sys/sys/proc.h        |  1 - > > >  4 files changed, 44 insertions(+), 64 deletions(-) > > > > > > diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c > > > index d571b0855cff..83a03a14a042 100644 > > > --- a/sys/kern/init_main.c > > > +++ b/sys/kern/init_main.c > > > @@ -117,7 +117,6 @@ struct      plimit limit0; > > >  struct vmspace vmspace0; > > >  struct sigacts sigacts0; > > >  struct process *initprocess; > > > -struct proc *reaperproc; > > >   > > >  extern struct user *proc0paddr; > > >   > > > @@ -495,10 +494,6 @@ main(void *framep) > > >         if (kthread_create(uvm_pageout, NULL, NULL, "pagedaemon")) > > >                 panic("fork pagedaemon"); > > >   > > > -       /* Create the reaper daemon kernel thread. */ > > > -       if (kthread_create(reaper, NULL, &reaperproc, "reaper")) > > > -               panic("fork reaper"); > > > - > > >         /* Create the cleaner daemon kernel thread. */ > > >         if (kthread_create(buf_daemon, NULL, &cleanerproc, > > > "cleaner")) > > >                 panic("fork cleaner"); > > > diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c > > > index ba87dcd42cd0..a5cc37ac0f74 100644 > > > --- a/sys/kern/kern_exit.c > > > +++ b/sys/kern/kern_exit.c > > > @@ -378,11 +378,10 @@ exit1(struct proc *p, int xexit, int xsig, > > > int flags) > > >          */ > > >   > > >         /* > > > -        * Finally, call machine-dependent code to switch to a new > > > -        * context (possibly the idle context).  Once we are no > > > longer > > > -        * using the dead process's vmspace and stack, exit2() will > > > be > > > -        * called to schedule those resources to be released by the > > > -        * reaper thread. 
> > > +        * Finally, call machine-dependent code to switch away.  > > > Then, > > > +        * when we are no longer using the dead process's vmspace > > > and > > > +        * stack, the current CPU's reaper thread releases those > > > +        * resources. > > >          * > > >          * Note that cpu_exit() will end with a call equivalent to > > >          * cpu_switch(), finishing our execution (pun intended). > > > @@ -392,38 +391,6 @@ exit1(struct proc *p, int xexit, int xsig, int > > > flags) > > >         panic("cpu_exit returned"); > > >  } > > >   > > > -/* > > > - * Locking of this proclist is special; it's accessed in a > > > - * critical section of process exit, and thus locking it can't > > > - * modify interrupt state.  We use a simple spin lock for this > > > - * proclist.  We use the p_hash member to linkup to deadproc. > > > - */ > > > -struct mutex deadproc_mutex = > > > -    MUTEX_INITIALIZER_FLAGS(IPL_NONE, "deadproc", MTX_NOWITNESS); > > > -struct proclist deadproc = LIST_HEAD_INITIALIZER(deadproc); > > > - > > > -/* > > > - * We are called from sched_idle() once it is safe to schedule the > > > - * dead process's resources to be freed. So this is not allowed to > > > sleep. > > > - * > > > - * We lock the deadproc list, place the proc on that list (using > > > - * the p_hash member), and wake up the reaper. 
> > > - */ > > > -void > > > -exit2(struct proc *p) > > > -{ > > > -       /* account the remainder of time spent in exit1() */ > > > -       mtx_enter(&p->p_p->ps_mtx); > > > -       tuagg_add_process(p->p_p, p); > > > -       mtx_leave(&p->p_p->ps_mtx); > > > - > > > -       mtx_enter(&deadproc_mutex); > > > -       LIST_INSERT_HEAD(&deadproc, p, p_hash); > > > -       mtx_leave(&deadproc_mutex); > > > - > > > -       wakeup(&deadproc); > > > -} > > > - > > >  void > > >  proc_free(struct proc *p) > > >  { > > > @@ -441,20 +408,26 @@ void > > >  reaper(void *arg) > > >  { > > >         struct proc *p; > > > +       struct cpu_info *ci = arg; > > > +       struct schedstate_percpu *spc; > > >   > > >         KERNEL_UNLOCK(); > > > - > > >         SCHED_ASSERT_UNLOCKED(); > > >   > > > +       sched_peg_curproc(ci); > > > +       spc = &ci->ci_schedstate; > > > + > > > +       KASSERT(ci == curcpu()); > > > + > > >         for (;;) { > > > -               mtx_enter(&deadproc_mutex); > > > -               while ((p = LIST_FIRST(&deadproc)) == NULL) > > > -                       msleep_nsec(&deadproc, &deadproc_mutex, > > > PVM, "reaper", > > > -                           INFSLP); > > > +               while ((p = LIST_FIRST(&spc->spc_deadproc)) == > > > NULL) { > > > +                       KERNEL_LOCK(); > > > +                       tsleep_nsec(&spc->spc_deadproc, PVM, > > > "reaper", INFSLP); > > > +                       KERNEL_UNLOCK(); > > > +               } > > > > I don't think going back to taking the kernel lock here makes sense. > > > > But if you now have a reaper per-CPU, the spc_deadproc list is only > > ever touched by the CPU that owns it, isn't it?  So there would be no > > need to do any locking... > > Correct! I put it in, because tsleep() does not know about that case. > It has the following assertion that triggers if the kernel lock is not > held. 
> > KASSERT(ident == &nowake || timo || _kernel_lock_held()); > > We can unroll tsleep() here without holding the kernel lock. I tested > it, and it works. > You actually don't need a tsleep at all. This code needs no wakeup_one() either. You need to think of this like the idleproc. Its constant state is SSLEEP, and then by calling sched_toproc(&spc->...) it goes SONPROC, runs, does its work, and just calls mi_switch() at the end. -- :wq Claudio