From: Scott Cheloha Subject: dt(4), hardclock: move profiling entry points to dedicated clockintrs To: tech@openbsd.org Cc: claudio@openbsd.org, mlarkin@openbsd.org, mpi@openbsd.org Date: Fri, 26 Jan 2024 12:28:12 -0600 This patch moves the dt(4) profiling entry points from hardclock() into dedicated clockintr callbacks. It depends on the clockintr_unbind patch I sent out separately, so be sure to apply that first. - In struct dt_pcb, replace dp_maxtick and dp_nticks with dp_nsecs. This is the PCB's profiling period. - Each struct dt_pcb gets its own struct clockintr, dp_clockintr. dp_clockintr is bound during dt_prov_profile_alloc(). We stagger dp_clockintr to simulate the current behavior under hardclock(). Note that dp_clockintr is given a reference to its enclosing PCB when it is bound. More on this in a second. - Replace dt_prov_profile_enter/dt_prov_interval_enter with dt_prov_profile_clock/dt_prov_profile_interval. These are your clockintr callback functions. Because the prototypes now don't match the other *_enter() functions I figure changing the names is reasonable. Because dp_clockintr has a copy of the PCB pointer we no longer need to search for the PCB with SMR_SLIST_FOREACH. This reduces profiling overhead, especially on larger systems. - dt_prov_profile_fire() now takes a count of profiling ticks and runs a loop to grab as many EVTs as it needs. If the callback is delayed, it will assign multiple "hits" to the current PC, just like we do now under hardclock(). - dt_pcb_free() unbinds the clockintr. Because dp_clockintr has a reference to the enclosing PCB we set an execution barrier here to ensure it is safe to free the PCB. In practice this will rarely block, but you need to set the barrier to be sure. DT_FA_PROFILE has been updated for amd64. The unpruned stack now looks like this: dt_pcb_ring_get dt_prov_profile_clock clockintr_dispatch lapic_clockintr Xresume_lapic_ltimer acpicpu_idle sched_idle proc_trampoline kernel I can test other platforms and update the DT_FA_PROFILE values for everything but powerpc64 before committing. The new value for powerpc64 is probably 4, but somebody else needs to confirm. ok? Index: sys/dev/dt/dt_prov_profile.c =================================================================== RCS file: /cvs/src/sys/dev/dt/dt_prov_profile.c,v diff -u -p -r1.5 dt_prov_profile.c --- sys/dev/dt/dt_prov_profile.c 26 Apr 2023 16:53:59 -0000 1.5 +++ sys/dev/dt/dt_prov_profile.c 26 Jan 2024 17:45:09 -0000 @@ -20,6 +20,7 @@ #include #include #include +#include #include @@ -31,13 +32,13 @@ struct dt_probe *dtpp_interval; /* glob int dt_prov_profile_alloc(struct dt_probe *, struct dt_softc *, struct dt_pcb_list *, struct dtioc_req *); -int dt_prov_profile_enter(struct dt_provider *, ...); -int dt_prov_interval_enter(struct dt_provider *, ...); +void dt_prov_profile_clock(struct clockrequest *, void *, void *); +void dt_prov_interval_clock(struct clockrequest *, void *, void *); struct dt_provider dt_prov_profile = { .dtpv_name = "profile", .dtpv_alloc = dt_prov_profile_alloc, - .dtpv_enter = dt_prov_profile_enter, + .dtpv_enter = NULL, .dtpv_leave = NULL, .dtpv_dealloc = NULL, }; @@ -45,7 +46,7 @@ struct dt_provider dt_prov_profile = { struct dt_provider dt_prov_interval = { .dtpv_name = "interval", .dtpv_alloc = dt_prov_profile_alloc, - .dtpv_enter = dt_prov_interval_enter, + .dtpv_enter = NULL, .dtpv_leave = NULL, .dtpv_dealloc = NULL, }; @@ -70,6 +71,7 @@ dt_prov_profile_alloc(struct dt_probe *d { struct dt_pcb *dp; struct cpu_info *ci; + void (*callback)(struct clockrequest *, void *, void *); CPU_INFO_ITERATOR cii; extern int hz; @@ -80,6 +82,11 @@ dt_prov_profile_alloc(struct dt_probe *d if (dtrq->dtrq_rate <= 0 || dtrq->dtrq_rate > hz) return EOPNOTSUPP; + if (dtp == dtpp_interval) + callback = dt_prov_interval_clock; + else + callback = dt_prov_profile_clock; + CPU_INFO_FOREACH(cii, ci) { if (!CPU_IS_PRIMARY(ci) && (dtp == dtpp_interval)) continue; @@ -90,8 +97,11 @@ dt_prov_profile_alloc(struct dt_probe *d return ENOMEM; } - dp->dp_maxtick = hz / dtrq->dtrq_rate; + dp->dp_nsecs = SEC_TO_NSEC(1) / dtrq->dtrq_rate; dp->dp_cpuid = ci->ci_cpuid; + clockintr_bind(&dp->dp_clockintr, ci, callback, dp); + clockintr_stagger(&dp->dp_clockintr, dp->dp_nsecs, + CPU_INFO_UNIT(ci), MAXCPUS); dp->dp_filter = dtrq->dtrq_filter; dp->dp_evtflags = dtrq->dtrq_evtflags & DTEVT_PROV_PROFILE; @@ -102,50 +112,35 @@ dt_prov_profile_alloc(struct dt_probe *d } static inline void -dt_prov_profile_fire(struct dt_pcb *dp) +dt_prov_profile_fire(struct dt_pcb *dp, uint64_t count) { + uint64_t i; struct dt_evt *dtev; - if (++dp->dp_nticks < dp->dp_maxtick) - return; - - dtev = dt_pcb_ring_get(dp, 1); - if (dtev == NULL) - return; - dt_pcb_ring_consume(dp, dtev); - dp->dp_nticks = 0; + for (i = 0; i < count; i++) { + dtev = dt_pcb_ring_get(dp, 1); + if (dtev == NULL) + return; + dt_pcb_ring_consume(dp, dtev); + } } -int -dt_prov_profile_enter(struct dt_provider *dtpv, ...) +void +dt_prov_profile_clock(struct clockrequest *cr, void *cf, void *arg) { - struct cpu_info *ci = curcpu(); - struct dt_pcb *dp; - - KASSERT(dtpv == &dt_prov_profile); - - smr_read_enter(); - SMR_SLIST_FOREACH(dp, &dtpp_profile->dtp_pcbs, dp_pnext) { - if (dp->dp_cpuid != ci->ci_cpuid) - continue; + uint64_t count; + struct dt_pcb *dp = arg; - dt_prov_profile_fire(dp); - } - smr_read_leave(); - return 0; + count = clockrequest_advance(cr, dp->dp_nsecs); + dt_prov_profile_fire(dp, count); } -int -dt_prov_interval_enter(struct dt_provider *dtpv, ...) +void +dt_prov_interval_clock(struct clockrequest *cr, void *cf, void *arg) { - struct dt_pcb *dp; + uint64_t count; + struct dt_pcb *dp = arg; - KASSERT(dtpv == &dt_prov_interval); - - smr_read_enter(); - SMR_SLIST_FOREACH(dp, &dtpp_interval->dtp_pcbs, dp_pnext) { - dt_prov_profile_fire(dp); - } - smr_read_leave(); - return 0; + count = clockrequest_advance(cr, dp->dp_nsecs); + dt_prov_profile_fire(dp, count); } Index: sys/dev/dt/dt_dev.c =================================================================== RCS file: /cvs/src/sys/dev/dt/dt_dev.c,v diff -u -p -r1.29 dt_dev.c --- sys/dev/dt/dt_dev.c 2 Jan 2024 16:32:48 -0000 1.29 +++ sys/dev/dt/dt_dev.c 26 Jan 2024 17:45:09 -0000 @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -56,7 +57,7 @@ * proc_trampoline+0x1c */ #if defined(__amd64__) -#define DT_FA_PROFILE 7 +#define DT_FA_PROFILE 5 #define DT_FA_STATIC 2 #elif defined(__i386__) #define DT_FA_PROFILE 8 @@ -492,6 +493,7 @@ dt_ioctl_record_start(struct dt_softc *s SMR_SLIST_INSERT_HEAD_LOCKED(&dtp->dtp_pcbs, dp, dp_pnext); dtp->dtp_recording++; dtp->dtp_prov->dtpv_recording++; + clockintr_advance(&dp->dp_clockintr, dp->dp_nsecs); } rw_exit_write(&dt_lock); @@ -518,6 +520,7 @@ dt_ioctl_record_stop(struct dt_softc *sc TAILQ_FOREACH(dp, &sc->ds_pcbs, dp_snext) { struct dt_probe *dtp = dp->dp_dtp; + clockintr_cancel(&dp->dp_clockintr); dtp->dtp_recording--; dtp->dtp_prov->dtpv_recording--; SMR_SLIST_REMOVE_LOCKED(&dtp->dtp_pcbs, dp, dt_pcb, dp_pnext); @@ -679,6 +682,15 @@ dt_pcb_free(struct dt_pcb *dp) { if (dp == NULL) return; + + /* + * dp_clockintr is bound if dp_nsecs is non-zero. Its callback + * has a reference to dp, so set an execution barrier and block + * until it is no longer running before freeing dp. + */ + if (dp->dp_nsecs != 0) + clockintr_unbind(&dp->dp_clockintr, CL_BARRIER); + free(dp->dp_ring, M_DT, DT_EVTRING_SIZE * sizeof(*dp->dp_ring)); free(dp, M_DT, sizeof(*dp)); } Index: sys/dev/dt/dtvar.h =================================================================== RCS file: /cvs/src/sys/dev/dt/dtvar.h,v diff -u -p -r1.17 dtvar.h --- sys/dev/dt/dtvar.h 26 Apr 2023 16:53:59 -0000 1.17 +++ sys/dev/dt/dtvar.h 26 Jan 2024 17:45:09 -0000 @@ -197,9 +197,9 @@ struct dt_pcb { struct dt_filter dp_filter; /* [I] filter to match */ /* Provider specific fields. */ + struct clockintr dp_clockintr; /* [?] profiling handle */ + uint64_t dp_nsecs; /* [I] profiling period */ unsigned int dp_cpuid; /* [I] on which CPU */ - unsigned int dp_maxtick; /* [I] freq. of profiling */ - unsigned int dp_nticks; /* [c] current tick count */ /* Counters */ uint64_t dp_dropevt; /* [m] # dropped event */ Index: sys/kern/kern_clock.c =================================================================== RCS file: /cvs/src/sys/kern/kern_clock.c,v diff -u -p -r1.121 kern_clock.c --- sys/kern/kern_clock.c 17 Oct 2023 00:04:02 -0000 1.121 +++ sys/kern/kern_clock.c 26 Jan 2024 17:45:09 -0000 @@ -50,11 +50,6 @@ #include #include -#include "dt.h" -#if NDT > 0 -#include -#endif - /* * Clock handling routines. * @@ -145,12 +140,6 @@ initclocks(void) void hardclock(struct clockframe *frame) { -#if NDT > 0 - DT_ENTER(profile, NULL); - if (CPU_IS_PRIMARY(curcpu())) - DT_ENTER(interval, NULL); -#endif - /* * If we are not the primary CPU, we're not allowed to do * any more work.