Index | Thread | Search

From:
Scott Cheloha <scottcheloha@gmail.com>
Subject:
Re: dt(4), hardclock: move profiling entry points to dedicated clockintrs
To:
tech@openbsd.org
Cc:
claudio@openbsd.org, mlarkin@openbsd.org, mpi@openbsd.org
Date:
Mon, 5 Feb 2024 19:37:54 -0600

Download raw body.

Thread
On Fri, Jan 26, 2024 at 12:28:12PM -0600, Scott Cheloha wrote:
> This patch moves the dt(4) profiling entry points from hardclock()
> into dedicated clockintr callbacks.  It depends on the clockintr_unbind
> patch I sent out separately, so be sure to apply that first.
> 
> - In struct dt_pcb, replace dp_maxtick and dp_nticks with dp_nsecs.
>   This is the PCB's profiling period.
> 
> - Each struct dt_pcb gets its own struct clockintr, dp_clockintr.
>   dp_clockintr is bound during dt_prov_profile_alloc().  We stagger
>   dp_clockintr to simulate the current behavior under hardclock().
> 
>   Note that dp_clockintr is given a reference to its enclosing
>   PCB when it is bound.  More on this in a second.
> 
> - Replace dt_prov_profile_enter/dt_prov_interval_enter with
>   dt_prov_profile_clock/dt_prov_profile_interval.  These are
>   your clockintr callback functions.  Because the prototypes
>   now don't match the other *_enter() functions I figure
>   changing the names is reasonable.
> 
>   Because dp_clockintr has a copy of the PCB pointer we no longer
>   need to search for the PCB with SMR_SLIST_FOREACH.  This reduces
>   profiling overhead, especially on larger systems.
> 
> - dt_prov_profile_fire() now takes a count of profiling ticks and
>   runs a loop to grab as many EVTs as it needs.  If the callback
>   is delayed, it will assign multiple "hits" to the current PC,
>   just like we do now under hardclock().
> 
> - dt_pcb_free() unbinds the clockintr.  Because dp_clockintr has
>   a reference to the enclosing PCB we set an execution barrier
>   here to ensure it is safe to free the PCB.  In practice this
>   will rarely block, but you need to set the barrier to be sure.
> 
> DT_FA_PROFILE has been updated for amd64.  The unpruned stack now
> looks like this:
> 
> dt_pcb_ring_get
> dt_prov_profile_clock
> clockintr_dispatch
> lapic_clockintr
> Xresume_lapic_ltimer
> acpicpu_idle
> sched_idle
> proc_trampoline
> kernel
> 
> I can test other platforms and update the DT_FA_PROFILE values
> for everything but powerpc64 before committing.  The new value
> for powerpc64 is probably 4, but somebody else needs to confirm.

Updated patch:

- Don't advance/cancel dp_clockintr when dp_nsecs is zero.
- Update DT_FA_PROFILE for i386, macppc

Index: sys/dev/dt/dt_prov_profile.c
===================================================================
RCS file: /cvs/src/sys/dev/dt/dt_prov_profile.c,v
diff -u -p -r1.5 dt_prov_profile.c
--- sys/dev/dt/dt_prov_profile.c	26 Apr 2023 16:53:59 -0000	1.5
+++ sys/dev/dt/dt_prov_profile.c	6 Feb 2024 01:34:45 -0000
@@ -20,6 +20,7 @@
 #include <sys/systm.h>
 #include <sys/param.h>
 #include <sys/atomic.h>
+#include <sys/clockintr.h>
 
 #include <dev/dt/dtvar.h>
 
@@ -31,13 +32,13 @@ struct dt_probe	*dtpp_interval;		/* glob
 
 int	dt_prov_profile_alloc(struct dt_probe *, struct dt_softc *,
 	    struct dt_pcb_list *, struct dtioc_req *);
-int	dt_prov_profile_enter(struct dt_provider *, ...);
-int	dt_prov_interval_enter(struct dt_provider *, ...);
+void	dt_prov_profile_clock(struct clockrequest *, void *, void *);
+void	dt_prov_interval_clock(struct clockrequest *, void *, void *);
 
 struct dt_provider dt_prov_profile = {
 	.dtpv_name	= "profile",
 	.dtpv_alloc	= dt_prov_profile_alloc,
-	.dtpv_enter	= dt_prov_profile_enter,
+	.dtpv_enter	= NULL,
 	.dtpv_leave	= NULL,
 	.dtpv_dealloc	= NULL,
 };
@@ -45,7 +46,7 @@ struct dt_provider dt_prov_profile = {
 struct dt_provider dt_prov_interval = {
 	.dtpv_name	= "interval",
 	.dtpv_alloc	= dt_prov_profile_alloc,
-	.dtpv_enter	= dt_prov_interval_enter,
+	.dtpv_enter	= NULL,
 	.dtpv_leave	= NULL,
 	.dtpv_dealloc	= NULL,
 };
@@ -70,6 +71,7 @@ dt_prov_profile_alloc(struct dt_probe *d
 {
 	struct dt_pcb *dp;
 	struct cpu_info *ci;
+	void (*callback)(struct clockrequest *, void *, void *);
 	CPU_INFO_ITERATOR cii;
 	extern int hz;
 
@@ -80,6 +82,11 @@ dt_prov_profile_alloc(struct dt_probe *d
 	if (dtrq->dtrq_rate <= 0 || dtrq->dtrq_rate > hz)
 		return EOPNOTSUPP;
 
+	if (dtp == dtpp_interval)
+		callback = dt_prov_interval_clock;
+	else
+		callback = dt_prov_profile_clock;
+
 	CPU_INFO_FOREACH(cii, ci) {
 		if (!CPU_IS_PRIMARY(ci) && (dtp == dtpp_interval))
 			continue;
@@ -90,8 +97,11 @@ dt_prov_profile_alloc(struct dt_probe *d
 			return ENOMEM;
 		}
 
-		dp->dp_maxtick = hz / dtrq->dtrq_rate;
+		dp->dp_nsecs = SEC_TO_NSEC(1) / dtrq->dtrq_rate;
 		dp->dp_cpuid = ci->ci_cpuid;
+		clockintr_bind(&dp->dp_clockintr, ci, callback, dp);
+		clockintr_stagger(&dp->dp_clockintr, dp->dp_nsecs,
+		    CPU_INFO_UNIT(ci), MAXCPUS);
 
 		dp->dp_filter = dtrq->dtrq_filter;
 		dp->dp_evtflags = dtrq->dtrq_evtflags & DTEVT_PROV_PROFILE;
@@ -102,50 +112,35 @@ dt_prov_profile_alloc(struct dt_probe *d
 }
 
 static inline void
-dt_prov_profile_fire(struct dt_pcb *dp)
+dt_prov_profile_fire(struct dt_pcb *dp, uint64_t count)
 {
+	uint64_t i;
 	struct dt_evt *dtev;
 
-	if (++dp->dp_nticks < dp->dp_maxtick)
-		return;
-
-	dtev = dt_pcb_ring_get(dp, 1);
-	if (dtev == NULL)
-		return;
-	dt_pcb_ring_consume(dp, dtev);
-	dp->dp_nticks = 0;
+	for (i = 0; i < count; i++) {
+		dtev = dt_pcb_ring_get(dp, 1);
+		if (dtev == NULL)
+			return;
+		dt_pcb_ring_consume(dp, dtev);
+	}
 }
 
-int
-dt_prov_profile_enter(struct dt_provider *dtpv, ...)
+void
+dt_prov_profile_clock(struct clockrequest *cr, void *cf, void *arg)
 {
-	struct cpu_info *ci = curcpu();
-	struct dt_pcb *dp;
-
-	KASSERT(dtpv == &dt_prov_profile);
-
-	smr_read_enter();
-	SMR_SLIST_FOREACH(dp, &dtpp_profile->dtp_pcbs, dp_pnext) {
-		if (dp->dp_cpuid != ci->ci_cpuid)
-			continue;
+	uint64_t count;
+	struct dt_pcb *dp = arg;
 
-		dt_prov_profile_fire(dp);
-	}
-	smr_read_leave();
-	return 0;
+	count = clockrequest_advance(cr, dp->dp_nsecs);
+	dt_prov_profile_fire(dp, count);
 }
 
-int
-dt_prov_interval_enter(struct dt_provider *dtpv, ...)
+void
+dt_prov_interval_clock(struct clockrequest *cr, void *cf, void *arg)
 {
-	struct dt_pcb *dp;
+	uint64_t count;
+	struct dt_pcb *dp = arg;
 
-	KASSERT(dtpv == &dt_prov_interval);
-
-	smr_read_enter();
-	SMR_SLIST_FOREACH(dp, &dtpp_interval->dtp_pcbs, dp_pnext) {
-		dt_prov_profile_fire(dp);
-	}
-	smr_read_leave();
-	return 0;
+	count = clockrequest_advance(cr, dp->dp_nsecs);
+	dt_prov_profile_fire(dp, count);
 }
Index: sys/dev/dt/dt_dev.c
===================================================================
RCS file: /cvs/src/sys/dev/dt/dt_dev.c,v
diff -u -p -r1.29 dt_dev.c
--- sys/dev/dt/dt_dev.c	2 Jan 2024 16:32:48 -0000	1.29
+++ sys/dev/dt/dt_dev.c	6 Feb 2024 01:34:45 -0000
@@ -19,6 +19,7 @@
 #include <sys/types.h>
 #include <sys/systm.h>
 #include <sys/param.h>
+#include <sys/clockintr.h>
 #include <sys/device.h>
 #include <sys/exec_elf.h>
 #include <sys/malloc.h>
@@ -56,13 +57,13 @@
  *	proc_trampoline+0x1c
  */
 #if defined(__amd64__)
-#define DT_FA_PROFILE	7
+#define DT_FA_PROFILE	5
 #define DT_FA_STATIC	2
 #elif defined(__i386__)
-#define DT_FA_PROFILE	8
+#define DT_FA_PROFILE	5
 #define DT_FA_STATIC	2
 #elif defined(__macppc__)
-#define DT_FA_PROFILE  7
+#define DT_FA_PROFILE  5
 #define DT_FA_STATIC   2
 #elif defined(__octeon__)
 #define DT_FA_PROFILE	6
@@ -492,6 +493,8 @@ dt_ioctl_record_start(struct dt_softc *s
 		SMR_SLIST_INSERT_HEAD_LOCKED(&dtp->dtp_pcbs, dp, dp_pnext);
 		dtp->dtp_recording++;
 		dtp->dtp_prov->dtpv_recording++;
+		if (dp->dp_nsecs != 0)
+			clockintr_advance(&dp->dp_clockintr, dp->dp_nsecs);
 	}
 	rw_exit_write(&dt_lock);
 
@@ -518,6 +521,8 @@ dt_ioctl_record_stop(struct dt_softc *sc
 	TAILQ_FOREACH(dp, &sc->ds_pcbs, dp_snext) {
 		struct dt_probe *dtp = dp->dp_dtp;
 
+		if (dp->dp_nsecs != 0)
+			clockintr_cancel(&dp->dp_clockintr);
 		dtp->dtp_recording--;
 		dtp->dtp_prov->dtpv_recording--;
 		SMR_SLIST_REMOVE_LOCKED(&dtp->dtp_pcbs, dp, dt_pcb, dp_pnext);
@@ -679,6 +684,15 @@ dt_pcb_free(struct dt_pcb *dp)
 {
 	if (dp == NULL)
 		return;
+
+	/*
+	 * dp_clockintr is bound if dp_nsecs is non-zero.  Its callback
+	 * has a reference to dp, so set an execution barrier and block
+	 * until it is no longer running before freeing dp.
+	 */
+	if (dp->dp_nsecs != 0)
+		clockintr_unbind(&dp->dp_clockintr, CL_BARRIER);
+
 	free(dp->dp_ring, M_DT, DT_EVTRING_SIZE * sizeof(*dp->dp_ring));
 	free(dp, M_DT, sizeof(*dp));
 }
Index: sys/dev/dt/dtvar.h
===================================================================
RCS file: /cvs/src/sys/dev/dt/dtvar.h,v
diff -u -p -r1.17 dtvar.h
--- sys/dev/dt/dtvar.h	26 Apr 2023 16:53:59 -0000	1.17
+++ sys/dev/dt/dtvar.h	6 Feb 2024 01:34:46 -0000
@@ -197,9 +197,9 @@ struct dt_pcb {
 	struct dt_filter	 dp_filter;	/* [I] filter to match */
 
 	/* Provider specific fields. */
+	struct clockintr	 dp_clockintr;	/* [?] profiling handle */
+	uint64_t		 dp_nsecs;	/* [I] profiling period */
 	unsigned int		 dp_cpuid;	/* [I] on which CPU */
-	unsigned int		 dp_maxtick;	/* [I] freq. of profiling */
-	unsigned int		 dp_nticks;	/* [c] current tick count */
 
 	/* Counters */
 	uint64_t		 dp_dropevt;	/* [m] # dropped event */
Index: sys/kern/kern_clock.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_clock.c,v
diff -u -p -r1.121 kern_clock.c
--- sys/kern/kern_clock.c	17 Oct 2023 00:04:02 -0000	1.121
+++ sys/kern/kern_clock.c	6 Feb 2024 01:34:46 -0000
@@ -50,11 +50,6 @@
 #include <sys/sched.h>
 #include <sys/timetc.h>
 
-#include "dt.h"
-#if NDT > 0
-#include <dev/dt/dtvar.h>
-#endif
-
 /*
  * Clock handling routines.
  *
@@ -145,12 +140,6 @@ initclocks(void)
 void
 hardclock(struct clockframe *frame)
 {
-#if NDT > 0
-	DT_ENTER(profile, NULL);
-	if (CPU_IS_PRIMARY(curcpu()))
-		DT_ENTER(interval, NULL);
-#endif
-
 	/*
 	 * If we are not the primary CPU, we're not allowed to do
 	 * any more work.