Index | Thread | Search

From:
Scott Cheloha <scottcheloha@gmail.com>
Subject:
dt(4), hardclock: move profiling entry points to dedicated clockintrs
To:
tech@openbsd.org
Cc:
claudio@openbsd.org, mlarkin@openbsd.org, mpi@openbsd.org
Date:
Fri, 26 Jan 2024 12:28:12 -0600

Download raw body.

Thread
This patch moves the dt(4) profiling entry points from hardclock()
into dedicated clockintr callbacks.  It depends on the clockintr_unbind
patch I sent out separately, so be sure to apply that first.

- In struct dt_pcb, replace dp_maxtick and dp_nticks with dp_nsecs.
  This is the PCB's profiling period.

- Each struct dt_pcb gets its own struct clockintr, dp_clockintr.
  dp_clockintr is bound during dt_prov_profile_alloc().  We stagger
  dp_clockintr to simulate the current behavior under hardclock().

  Note that dp_clockintr is given a reference to its enclosing
  PCB when it is bound.  More on this in a second.

- Replace dt_prov_profile_enter/dt_prov_interval_enter with
  dt_prov_profile_clock/dt_prov_profile_interval.  These are
  your clockintr callback functions.  Because the prototypes
  now don't match the other *_enter() functions I figure
  changing the names is reasonable.

  Because dp_clockintr has a copy of the PCB pointer we no longer
  need to search for the PCB with SMR_SLIST_FOREACH.  This reduces
  profiling overhead, especially on larger systems.

- dt_prov_profile_fire() now takes a count of profiling ticks and
  runs a loop to grab as many EVTs as it needs.  If the callback
  is delayed, it will assign multiple "hits" to the current PC,
  just like we do now under hardclock().

- dt_pcb_free() unbinds the clockintr.  Because dp_clockintr has
  a reference to the enclosing PCB we set an execution barrier
  here to ensure it is safe to free the PCB.  In practice this
  will rarely block, but you need to set the barrier to be sure.

DT_FA_PROFILE has been updated for amd64.  The unpruned stack now
looks like this:

dt_pcb_ring_get
dt_prov_profile_clock
clockintr_dispatch
lapic_clockintr
Xresume_lapic_ltimer
acpicpu_idle
sched_idle
proc_trampoline
kernel

I can test other platforms and update the DT_FA_PROFILE values
for everything but powerpc64 before committing.  The new value
for powerpc64 is probably 4, but somebody else needs to confirm.

ok?

Index: sys/dev/dt/dt_prov_profile.c
===================================================================
RCS file: /cvs/src/sys/dev/dt/dt_prov_profile.c,v
diff -u -p -r1.5 dt_prov_profile.c
--- sys/dev/dt/dt_prov_profile.c	26 Apr 2023 16:53:59 -0000	1.5
+++ sys/dev/dt/dt_prov_profile.c	26 Jan 2024 17:45:09 -0000
@@ -20,6 +20,7 @@
 #include <sys/systm.h>
 #include <sys/param.h>
 #include <sys/atomic.h>
+#include <sys/clockintr.h>
 
 #include <dev/dt/dtvar.h>
 
@@ -31,13 +32,13 @@ struct dt_probe	*dtpp_interval;		/* glob
 
 int	dt_prov_profile_alloc(struct dt_probe *, struct dt_softc *,
 	    struct dt_pcb_list *, struct dtioc_req *);
-int	dt_prov_profile_enter(struct dt_provider *, ...);
-int	dt_prov_interval_enter(struct dt_provider *, ...);
+void	dt_prov_profile_clock(struct clockrequest *, void *, void *);
+void	dt_prov_interval_clock(struct clockrequest *, void *, void *);
 
 struct dt_provider dt_prov_profile = {
 	.dtpv_name	= "profile",
 	.dtpv_alloc	= dt_prov_profile_alloc,
-	.dtpv_enter	= dt_prov_profile_enter,
+	.dtpv_enter	= NULL,
 	.dtpv_leave	= NULL,
 	.dtpv_dealloc	= NULL,
 };
@@ -45,7 +46,7 @@ struct dt_provider dt_prov_profile = {
 struct dt_provider dt_prov_interval = {
 	.dtpv_name	= "interval",
 	.dtpv_alloc	= dt_prov_profile_alloc,
-	.dtpv_enter	= dt_prov_interval_enter,
+	.dtpv_enter	= NULL,
 	.dtpv_leave	= NULL,
 	.dtpv_dealloc	= NULL,
 };
@@ -70,6 +71,7 @@ dt_prov_profile_alloc(struct dt_probe *d
 {
 	struct dt_pcb *dp;
 	struct cpu_info *ci;
+	void (*callback)(struct clockrequest *, void *, void *);
 	CPU_INFO_ITERATOR cii;
 	extern int hz;
 
@@ -80,6 +82,11 @@ dt_prov_profile_alloc(struct dt_probe *d
 	if (dtrq->dtrq_rate <= 0 || dtrq->dtrq_rate > hz)
 		return EOPNOTSUPP;
 
+	if (dtp == dtpp_interval)
+		callback = dt_prov_interval_clock;
+	else
+		callback = dt_prov_profile_clock;
+
 	CPU_INFO_FOREACH(cii, ci) {
 		if (!CPU_IS_PRIMARY(ci) && (dtp == dtpp_interval))
 			continue;
@@ -90,8 +97,11 @@ dt_prov_profile_alloc(struct dt_probe *d
 			return ENOMEM;
 		}
 
-		dp->dp_maxtick = hz / dtrq->dtrq_rate;
+		dp->dp_nsecs = SEC_TO_NSEC(1) / dtrq->dtrq_rate;
 		dp->dp_cpuid = ci->ci_cpuid;
+		clockintr_bind(&dp->dp_clockintr, ci, callback, dp);
+		clockintr_stagger(&dp->dp_clockintr, dp->dp_nsecs,
+		    CPU_INFO_UNIT(ci), MAXCPUS);
 
 		dp->dp_filter = dtrq->dtrq_filter;
 		dp->dp_evtflags = dtrq->dtrq_evtflags & DTEVT_PROV_PROFILE;
@@ -102,50 +112,35 @@ dt_prov_profile_alloc(struct dt_probe *d
 }
 
 static inline void
-dt_prov_profile_fire(struct dt_pcb *dp)
+dt_prov_profile_fire(struct dt_pcb *dp, uint64_t count)
 {
+	uint64_t i;
 	struct dt_evt *dtev;
 
-	if (++dp->dp_nticks < dp->dp_maxtick)
-		return;
-
-	dtev = dt_pcb_ring_get(dp, 1);
-	if (dtev == NULL)
-		return;
-	dt_pcb_ring_consume(dp, dtev);
-	dp->dp_nticks = 0;
+	for (i = 0; i < count; i++) {
+		dtev = dt_pcb_ring_get(dp, 1);
+		if (dtev == NULL)
+			return;
+		dt_pcb_ring_consume(dp, dtev);
+	}
 }
 
-int
-dt_prov_profile_enter(struct dt_provider *dtpv, ...)
+void
+dt_prov_profile_clock(struct clockrequest *cr, void *cf, void *arg)
 {
-	struct cpu_info *ci = curcpu();
-	struct dt_pcb *dp;
-
-	KASSERT(dtpv == &dt_prov_profile);
-
-	smr_read_enter();
-	SMR_SLIST_FOREACH(dp, &dtpp_profile->dtp_pcbs, dp_pnext) {
-		if (dp->dp_cpuid != ci->ci_cpuid)
-			continue;
+	uint64_t count;
+	struct dt_pcb *dp = arg;
 
-		dt_prov_profile_fire(dp);
-	}
-	smr_read_leave();
-	return 0;
+	count = clockrequest_advance(cr, dp->dp_nsecs);
+	dt_prov_profile_fire(dp, count);
 }
 
-int
-dt_prov_interval_enter(struct dt_provider *dtpv, ...)
+void
+dt_prov_interval_clock(struct clockrequest *cr, void *cf, void *arg)
 {
-	struct dt_pcb *dp;
+	uint64_t count;
+	struct dt_pcb *dp = arg;
 
-	KASSERT(dtpv == &dt_prov_interval);
-
-	smr_read_enter();
-	SMR_SLIST_FOREACH(dp, &dtpp_interval->dtp_pcbs, dp_pnext) {
-		dt_prov_profile_fire(dp);
-	}
-	smr_read_leave();
-	return 0;
+	count = clockrequest_advance(cr, dp->dp_nsecs);
+	dt_prov_profile_fire(dp, count);
 }
Index: sys/dev/dt/dt_dev.c
===================================================================
RCS file: /cvs/src/sys/dev/dt/dt_dev.c,v
diff -u -p -r1.29 dt_dev.c
--- sys/dev/dt/dt_dev.c	2 Jan 2024 16:32:48 -0000	1.29
+++ sys/dev/dt/dt_dev.c	26 Jan 2024 17:45:09 -0000
@@ -19,6 +19,7 @@
 #include <sys/types.h>
 #include <sys/systm.h>
 #include <sys/param.h>
+#include <sys/clockintr.h>
 #include <sys/device.h>
 #include <sys/exec_elf.h>
 #include <sys/malloc.h>
@@ -56,7 +57,7 @@
  *	proc_trampoline+0x1c
  */
 #if defined(__amd64__)
-#define DT_FA_PROFILE	7
+#define DT_FA_PROFILE	5
 #define DT_FA_STATIC	2
 #elif defined(__i386__)
 #define DT_FA_PROFILE	8
@@ -492,6 +493,7 @@ dt_ioctl_record_start(struct dt_softc *s
 		SMR_SLIST_INSERT_HEAD_LOCKED(&dtp->dtp_pcbs, dp, dp_pnext);
 		dtp->dtp_recording++;
 		dtp->dtp_prov->dtpv_recording++;
+		clockintr_advance(&dp->dp_clockintr, dp->dp_nsecs);
 	}
 	rw_exit_write(&dt_lock);
 
@@ -518,6 +520,7 @@ dt_ioctl_record_stop(struct dt_softc *sc
 	TAILQ_FOREACH(dp, &sc->ds_pcbs, dp_snext) {
 		struct dt_probe *dtp = dp->dp_dtp;
 
+		clockintr_cancel(&dp->dp_clockintr);
 		dtp->dtp_recording--;
 		dtp->dtp_prov->dtpv_recording--;
 		SMR_SLIST_REMOVE_LOCKED(&dtp->dtp_pcbs, dp, dt_pcb, dp_pnext);
@@ -679,6 +682,15 @@ dt_pcb_free(struct dt_pcb *dp)
 {
 	if (dp == NULL)
 		return;
+
+	/*
+	 * dp_clockintr is bound if dp_nsecs is non-zero.  Its callback
+	 * has a reference to dp, so set an execution barrier and block
+	 * until it is no longer running before freeing dp.
+	 */
+	if (dp->dp_nsecs != 0)
+		clockintr_unbind(&dp->dp_clockintr, CL_BARRIER);
+
 	free(dp->dp_ring, M_DT, DT_EVTRING_SIZE * sizeof(*dp->dp_ring));
 	free(dp, M_DT, sizeof(*dp));
 }
Index: sys/dev/dt/dtvar.h
===================================================================
RCS file: /cvs/src/sys/dev/dt/dtvar.h,v
diff -u -p -r1.17 dtvar.h
--- sys/dev/dt/dtvar.h	26 Apr 2023 16:53:59 -0000	1.17
+++ sys/dev/dt/dtvar.h	26 Jan 2024 17:45:09 -0000
@@ -197,9 +197,9 @@ struct dt_pcb {
 	struct dt_filter	 dp_filter;	/* [I] filter to match */
 
 	/* Provider specific fields. */
+	struct clockintr	 dp_clockintr;	/* [?] profiling handle */
+	uint64_t		 dp_nsecs;	/* [I] profiling period */
 	unsigned int		 dp_cpuid;	/* [I] on which CPU */
-	unsigned int		 dp_maxtick;	/* [I] freq. of profiling */
-	unsigned int		 dp_nticks;	/* [c] current tick count */
 
 	/* Counters */
 	uint64_t		 dp_dropevt;	/* [m] # dropped event */
Index: sys/kern/kern_clock.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_clock.c,v
diff -u -p -r1.121 kern_clock.c
--- sys/kern/kern_clock.c	17 Oct 2023 00:04:02 -0000	1.121
+++ sys/kern/kern_clock.c	26 Jan 2024 17:45:09 -0000
@@ -50,11 +50,6 @@
 #include <sys/sched.h>
 #include <sys/timetc.h>
 
-#include "dt.h"
-#if NDT > 0
-#include <dev/dt/dtvar.h>
-#endif
-
 /*
  * Clock handling routines.
  *
@@ -145,12 +140,6 @@ initclocks(void)
 void
 hardclock(struct clockframe *frame)
 {
-#if NDT > 0
-	DT_ENTER(profile, NULL);
-	if (CPU_IS_PRIMARY(curcpu()))
-		DT_ENTER(interval, NULL);
-#endif
-
 	/*
 	 * If we are not the primary CPU, we're not allowed to do
 	 * any more work.