Index | Thread | Search

From:
Martin Pieuchot <mpi@openbsd.org>
Subject:
Re: dt(4), hardclock: move profiling entry points to dedicated clockintrs
To:
Scott Cheloha <scottcheloha@gmail.com>
Cc:
tech@openbsd.org, claudio@openbsd.org, mlarkin@openbsd.org
Date:
Tue, 6 Feb 2024 15:24:20 +0100

Download raw body.

Thread
On 05/02/24(Mon) 19:37, Scott Cheloha wrote:
> On Fri, Jan 26, 2024 at 12:28:12PM -0600, Scott Cheloha wrote:
> > This patch moves the dt(4) profiling entry points from hardclock()
> > into dedicated clockintr callbacks.  It depends on the clockintr_unbind
> > patch I sent out separately, so be sure to apply that first.
> > 
> > - In struct dt_pcb, replace dp_maxtick and dp_nticks with dp_nsecs.
> >   This is the PCB's profiling period.
> > 
> > - Each struct dt_pcb gets its own struct clockintr, dp_clockintr.
> >   dp_clockintr is bound during dt_prov_profile_alloc().  We stagger
> >   dp_clockintr to simulate the current behavior under hardclock().
> > 
> >   Note that dp_clockintr is given a reference to its enclosing
> >   PCB when it is bound.  More on this in a second.
> > 
> > - Replace dt_prov_profile_enter/dt_prov_interval_enter with
> >   dt_prov_profile_clock/dt_prov_profile_interval.  These are
> >   your clockintr callback functions.  Because the prototypes
> >   now don't match the other *_enter() functions I figure
> >   changing the names is reasonable.
> > 
> >   Because dp_clockintr has a copy of the PCB pointer we no longer
> >   need to search for the PCB with SMR_SLIST_FOREACH.  This reduces
> >   profiling overhead, especially on larger systems.
> > 
> > - dt_prov_profile_fire() now takes a count of profiling ticks and
> >   runs a loop to grab as many EVTs as it needs.  If the callback
> >   is delayed, it will assign multiple "hits" to the current PC,
> >   just like we do now under hardclock().
> > 
> > - dt_pcb_free() unbinds the clockintr.  Because dp_clockintr has
> >   a reference to the enclosing PCB we set an execution barrier
> >   here to ensure it is safe to free the PCB.  In practice this
> >   will rarely block, but you need to set the barrier to be sure.
> > 
> > DT_FA_PROFILE has been updated for amd64.  The unpruned stack now
> > looks like this:
> > 
> > dt_pcb_ring_get
> > dt_prov_profile_clock
> > clockintr_dispatch
> > lapic_clockintr
> > Xresume_lapic_ltimer
> > acpicpu_idle
> > sched_idle
> > proc_trampoline
> > kernel
> > 
> > I can test other platforms and update the DT_FA_PROFILE values
> > for everything but powerpc64 before committing.  The new value
> > for powerpc64 is probably 4, but somebody else needs to confirm.
> 
> Updated patch:
> 
> - Don't advance/cancel dp_clockintr when dp_nsecs is zero.
> - Update DT_FA_PROFILE for i386, macppc

I can't test those. 

Comments below.

> Index: sys/dev/dt/dt_prov_profile.c
> ===================================================================
> RCS file: /cvs/src/sys/dev/dt/dt_prov_profile.c,v
> diff -u -p -r1.5 dt_prov_profile.c
> --- sys/dev/dt/dt_prov_profile.c	26 Apr 2023 16:53:59 -0000	1.5
> +++ sys/dev/dt/dt_prov_profile.c	6 Feb 2024 01:34:45 -0000
> @@ -20,6 +20,7 @@
>  #include <sys/systm.h>
>  #include <sys/param.h>
>  #include <sys/atomic.h>
> +#include <sys/clockintr.h>
>  
>  #include <dev/dt/dtvar.h>
>  
> @@ -31,13 +32,13 @@ struct dt_probe	*dtpp_interval;		/* glob
>  
>  int	dt_prov_profile_alloc(struct dt_probe *, struct dt_softc *,
>  	    struct dt_pcb_list *, struct dtioc_req *);
> -int	dt_prov_profile_enter(struct dt_provider *, ...);
> -int	dt_prov_interval_enter(struct dt_provider *, ...);
> +void	dt_prov_profile_clock(struct clockrequest *, void *, void *);
> +void	dt_prov_interval_clock(struct clockrequest *, void *, void *);
>  
>  struct dt_provider dt_prov_profile = {
>  	.dtpv_name	= "profile",
>  	.dtpv_alloc	= dt_prov_profile_alloc,
> -	.dtpv_enter	= dt_prov_profile_enter,
> +	.dtpv_enter	= NULL,
>  	.dtpv_leave	= NULL,
>  	.dtpv_dealloc	= NULL,
>  };
> @@ -45,7 +46,7 @@ struct dt_provider dt_prov_profile = {
>  struct dt_provider dt_prov_interval = {
>  	.dtpv_name	= "interval",
>  	.dtpv_alloc	= dt_prov_profile_alloc,
> -	.dtpv_enter	= dt_prov_interval_enter,
> +	.dtpv_enter	= NULL,
>  	.dtpv_leave	= NULL,
>  	.dtpv_dealloc	= NULL,
>  };
> @@ -70,6 +71,7 @@ dt_prov_profile_alloc(struct dt_probe *d
>  {
>  	struct dt_pcb *dp;
>  	struct cpu_info *ci;
> +	void (*callback)(struct clockrequest *, void *, void *);
>  	CPU_INFO_ITERATOR cii;
>  	extern int hz;
>  
> @@ -80,6 +82,11 @@ dt_prov_profile_alloc(struct dt_probe *d
>  	if (dtrq->dtrq_rate <= 0 || dtrq->dtrq_rate > hz)
>  		return EOPNOTSUPP;
>  
> +	if (dtp == dtpp_interval)
> +		callback = dt_prov_interval_clock;
> +	else
> +		callback = dt_prov_profile_clock;

I see no difference between the two callbacks, can we get rid of
dt_prov_interval_clock()?

>  	CPU_INFO_FOREACH(cii, ci) {
>  		if (!CPU_IS_PRIMARY(ci) && (dtp == dtpp_interval))
>  			continue;
> @@ -90,8 +97,11 @@ dt_prov_profile_alloc(struct dt_probe *d
>  			return ENOMEM;
>  		}
>  
> -		dp->dp_maxtick = hz / dtrq->dtrq_rate;
> +		dp->dp_nsecs = SEC_TO_NSEC(1) / dtrq->dtrq_rate;
>  		dp->dp_cpuid = ci->ci_cpuid;

`dp_cpuid' is no longer used now that clockintr_stagger() bind a request
to a given CPU.  You can get rid of it.

> +		clockintr_bind(&dp->dp_clockintr, ci, callback, dp);
> +		clockintr_stagger(&dp->dp_clockintr, dp->dp_nsecs,
> +		    CPU_INFO_UNIT(ci), MAXCPUS);
>  
>  		dp->dp_filter = dtrq->dtrq_filter;
>  		dp->dp_evtflags = dtrq->dtrq_evtflags & DTEVT_PROV_PROFILE;
> @@ -102,50 +112,35 @@ dt_prov_profile_alloc(struct dt_probe *d
>  }
>  
>  static inline void
> -dt_prov_profile_fire(struct dt_pcb *dp)
> +dt_prov_profile_fire(struct dt_pcb *dp, uint64_t count)
>  {
> +	uint64_t i;
>  	struct dt_evt *dtev;
>  
> -	if (++dp->dp_nticks < dp->dp_maxtick)
> -		return;
> -
> -	dtev = dt_pcb_ring_get(dp, 1);
> -	if (dtev == NULL)
> -		return;
> -	dt_pcb_ring_consume(dp, dtev);
> -	dp->dp_nticks = 0;
> +	for (i = 0; i < count; i++) {
> +		dtev = dt_pcb_ring_get(dp, 1);
> +		if (dtev == NULL)
> +			return;
> +		dt_pcb_ring_consume(dp, dtev);
> +	}
>  }
>  
> -int
> -dt_prov_profile_enter(struct dt_provider *dtpv, ...)
> +void
> +dt_prov_profile_clock(struct clockrequest *cr, void *cf, void *arg)
>  {
> -	struct cpu_info *ci = curcpu();
> -	struct dt_pcb *dp;
> -
> -	KASSERT(dtpv == &dt_prov_profile);
> -
> -	smr_read_enter();
> -	SMR_SLIST_FOREACH(dp, &dtpp_profile->dtp_pcbs, dp_pnext) {
> -		if (dp->dp_cpuid != ci->ci_cpuid)
> -			continue;
> +	uint64_t count;
> +	struct dt_pcb *dp = arg;
>  
> -		dt_prov_profile_fire(dp);
> -	}
> -	smr_read_leave();
> -	return 0;
> +	count = clockrequest_advance(cr, dp->dp_nsecs);
> +	dt_prov_profile_fire(dp, count);
>  }
>  
> -int
> -dt_prov_interval_enter(struct dt_provider *dtpv, ...)
> +void
> +dt_prov_interval_clock(struct clockrequest *cr, void *cf, void *arg)
>  {
> -	struct dt_pcb *dp;
> +	uint64_t count;
> +	struct dt_pcb *dp = arg;
>  
> -	KASSERT(dtpv == &dt_prov_interval);
> -
> -	smr_read_enter();
> -	SMR_SLIST_FOREACH(dp, &dtpp_interval->dtp_pcbs, dp_pnext) {
> -		dt_prov_profile_fire(dp);
> -	}
> -	smr_read_leave();
> -	return 0;
> +	count = clockrequest_advance(cr, dp->dp_nsecs);
> +	dt_prov_profile_fire(dp, count);

>  }
> Index: sys/dev/dt/dt_dev.c
> ===================================================================
> RCS file: /cvs/src/sys/dev/dt/dt_dev.c,v
> diff -u -p -r1.29 dt_dev.c
> --- sys/dev/dt/dt_dev.c	2 Jan 2024 16:32:48 -0000	1.29
> +++ sys/dev/dt/dt_dev.c	6 Feb 2024 01:34:45 -0000
> @@ -19,6 +19,7 @@
>  #include <sys/types.h>
>  #include <sys/systm.h>
>  #include <sys/param.h>
> +#include <sys/clockintr.h>
>  #include <sys/device.h>
>  #include <sys/exec_elf.h>
>  #include <sys/malloc.h>
> @@ -56,13 +57,13 @@
>   *	proc_trampoline+0x1c
>   */
>  #if defined(__amd64__)
> -#define DT_FA_PROFILE	7
> +#define DT_FA_PROFILE	5
>  #define DT_FA_STATIC	2
>  #elif defined(__i386__)
> -#define DT_FA_PROFILE	8
> +#define DT_FA_PROFILE	5
>  #define DT_FA_STATIC	2
>  #elif defined(__macppc__)
> -#define DT_FA_PROFILE  7
> +#define DT_FA_PROFILE  5
>  #define DT_FA_STATIC   2
>  #elif defined(__octeon__)
>  #define DT_FA_PROFILE	6
> @@ -492,6 +493,8 @@ dt_ioctl_record_start(struct dt_softc *s
>  		SMR_SLIST_INSERT_HEAD_LOCKED(&dtp->dtp_pcbs, dp, dp_pnext);
>  		dtp->dtp_recording++;
>  		dtp->dtp_prov->dtpv_recording++;
> +		if (dp->dp_nsecs != 0)
> +			clockintr_advance(&dp->dp_clockintr, dp->dp_nsecs);
>  	}
>  	rw_exit_write(&dt_lock);
>  
> @@ -518,6 +521,8 @@ dt_ioctl_record_stop(struct dt_softc *sc
>  	TAILQ_FOREACH(dp, &sc->ds_pcbs, dp_snext) {
>  		struct dt_probe *dtp = dp->dp_dtp;
>  
> +		if (dp->dp_nsecs != 0)
> +			clockintr_cancel(&dp->dp_clockintr);

Here this is complex.  Since we are no longer using SMR in the callback
to get a reference to `dp' we should not do the following before we are
sure the callback won't run.  That would imply clockintr_unbind(), no?
Some caution should be taken with clockintr_unbind() because we are
holding a lock.

>  		dtp->dtp_recording--;
>  		dtp->dtp_prov->dtpv_recording--;
>  		SMR_SLIST_REMOVE_LOCKED(&dtp->dtp_pcbs, dp, dt_pcb, dp_pnext);
> @@ -679,6 +684,15 @@ dt_pcb_free(struct dt_pcb *dp)
>  {
>  	if (dp == NULL)
>  		return;
> +
> +	/*
> +	 * dp_clockintr is bound if dp_nsecs is non-zero.  Its callback
> +	 * has a reference to dp, so set an execution barrier and block
> +	 * until it is no longer running before freeing dp.
> +	 */
> +	if (dp->dp_nsecs != 0)
> +		clockintr_unbind(&dp->dp_clockintr, CL_BARRIER);
> +

This is too late and should not be necessary.  dtclose() calls 
ctl_record_stop() which should take care of the barrier before
decrementing counters.

>  	free(dp->dp_ring, M_DT, DT_EVTRING_SIZE * sizeof(*dp->dp_ring));
>  	free(dp, M_DT, sizeof(*dp));
>  }
> Index: sys/dev/dt/dtvar.h
> ===================================================================
> RCS file: /cvs/src/sys/dev/dt/dtvar.h,v
> diff -u -p -r1.17 dtvar.h
> --- sys/dev/dt/dtvar.h	26 Apr 2023 16:53:59 -0000	1.17
> +++ sys/dev/dt/dtvar.h	6 Feb 2024 01:34:46 -0000
> @@ -197,9 +197,9 @@ struct dt_pcb {
>  	struct dt_filter	 dp_filter;	/* [I] filter to match */
>  
>  	/* Provider specific fields. */
> +	struct clockintr	 dp_clockintr;	/* [?] profiling handle */

If you don't know it's the kernel lock K.

> +	uint64_t		 dp_nsecs;	/* [I] profiling period */
>  	unsigned int		 dp_cpuid;	/* [I] on which CPU */
> -	unsigned int		 dp_maxtick;	/* [I] freq. of profiling */
> -	unsigned int		 dp_nticks;	/* [c] current tick count */
>  
>  	/* Counters */
>  	uint64_t		 dp_dropevt;	/* [m] # dropped event */
> Index: sys/kern/kern_clock.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_clock.c,v
> diff -u -p -r1.121 kern_clock.c
> --- sys/kern/kern_clock.c	17 Oct 2023 00:04:02 -0000	1.121
> +++ sys/kern/kern_clock.c	6 Feb 2024 01:34:46 -0000
> @@ -50,11 +50,6 @@
>  #include <sys/sched.h>
>  #include <sys/timetc.h>
>  
> -#include "dt.h"
> -#if NDT > 0
> -#include <dev/dt/dtvar.h>
> -#endif
> -
>  /*
>   * Clock handling routines.
>   *
> @@ -145,12 +140,6 @@ initclocks(void)
>  void
>  hardclock(struct clockframe *frame)
>  {
> -#if NDT > 0
> -	DT_ENTER(profile, NULL);
> -	if (CPU_IS_PRIMARY(curcpu()))
> -		DT_ENTER(interval, NULL);
> -#endif
> -
>  	/*
>  	 * If we are not the primary CPU, we're not allowed to do
>  	 * any more work.
>