From: Scott Cheloha Subject: Re: dt(4), hardclock: move profiling entry points to dedicated clockintrs To: tech@openbsd.org Cc: claudio@openbsd.org, mlarkin@openbsd.org, mpi@openbsd.org Date: Mon, 5 Feb 2024 19:37:54 -0600 On Fri, Jan 26, 2024 at 12:28:12PM -0600, Scott Cheloha wrote: > This patch moves the dt(4) profiling entry points from hardclock() > into dedicated clockintr callbacks. It depends on the clockintr_unbind > patch I sent out separately, so be sure to apply that first. > > - In struct dt_pcb, replace dp_maxtick and dp_nticks with dp_nsecs. > This is the PCB's profiling period. > > - Each struct dt_pcb gets its own struct clockintr, dp_clockintr. > dp_clockintr is bound during dt_prov_profile_alloc(). We stagger > dp_clockintr to simulate the current behavior under hardclock(). > > Note that dp_clockintr is given a reference to its enclosing > PCB when it is bound. More on this in a second. > > - Replace dt_prov_profile_enter/dt_prov_interval_enter with > dt_prov_profile_clock/dt_prov_profile_interval. These are > your clockintr callback functions. Because the prototypes > now don't match the other *_enter() functions I figure > changing the names is reasonable. > > Because dp_clockintr has a copy of the PCB pointer we no longer > need to search for the PCB with SMR_SLIST_FOREACH. This reduces > profiling overhead, especially on larger systems. > > - dt_prov_profile_fire() now takes a count of profiling ticks and > runs a loop to grab as many EVTs as it needs. If the callback > is delayed, it will assign multiple "hits" to the current PC, > just like we do now under hardclock(). > > - dt_pcb_free() unbinds the clockintr. Because dp_clockintr has > a reference to the enclosing PCB we set an execution barrier > here to ensure it is safe to free the PCB. In practice this > will rarely block, but you need to set the barrier to be sure. > > DT_FA_PROFILE has been updated for amd64. The unpruned stack now > looks like this: > > dt_pcb_ring_get > dt_prov_profile_clock > clockintr_dispatch > lapic_clockintr > Xresume_lapic_ltimer > acpicpu_idle > sched_idle > proc_trampoline > kernel > > I can test other platforms and update the DT_FA_PROFILE values > for everything but powerpc64 before committing. The new value > for powerpc64 is probably 4, but somebody else needs to confirm. Updated patch: - Don't advance/cancel dp_clockintr when dp_nsecs is zero. - Update DT_FA_PROFILE for i386, macppc Index: sys/dev/dt/dt_prov_profile.c =================================================================== RCS file: /cvs/src/sys/dev/dt/dt_prov_profile.c,v diff -u -p -r1.5 dt_prov_profile.c --- sys/dev/dt/dt_prov_profile.c 26 Apr 2023 16:53:59 -0000 1.5 +++ sys/dev/dt/dt_prov_profile.c 6 Feb 2024 01:34:45 -0000 @@ -20,6 +20,7 @@ #include #include #include +#include #include @@ -31,13 +32,13 @@ struct dt_probe *dtpp_interval; /* glob int dt_prov_profile_alloc(struct dt_probe *, struct dt_softc *, struct dt_pcb_list *, struct dtioc_req *); -int dt_prov_profile_enter(struct dt_provider *, ...); -int dt_prov_interval_enter(struct dt_provider *, ...); +void dt_prov_profile_clock(struct clockrequest *, void *, void *); +void dt_prov_interval_clock(struct clockrequest *, void *, void *); struct dt_provider dt_prov_profile = { .dtpv_name = "profile", .dtpv_alloc = dt_prov_profile_alloc, - .dtpv_enter = dt_prov_profile_enter, + .dtpv_enter = NULL, .dtpv_leave = NULL, .dtpv_dealloc = NULL, }; @@ -45,7 +46,7 @@ struct dt_provider dt_prov_profile = { struct dt_provider dt_prov_interval = { .dtpv_name = "interval", .dtpv_alloc = dt_prov_profile_alloc, - .dtpv_enter = dt_prov_interval_enter, + .dtpv_enter = NULL, .dtpv_leave = NULL, .dtpv_dealloc = NULL, }; @@ -70,6 +71,7 @@ dt_prov_profile_alloc(struct dt_probe *d { struct dt_pcb *dp; struct cpu_info *ci; + void (*callback)(struct clockrequest *, void *, void *); CPU_INFO_ITERATOR cii; extern int hz; @@ -80,6 +82,11 @@ dt_prov_profile_alloc(struct dt_probe *d if (dtrq->dtrq_rate <= 0 || dtrq->dtrq_rate > hz) return EOPNOTSUPP; + if (dtp == dtpp_interval) + callback = dt_prov_interval_clock; + else + callback = dt_prov_profile_clock; + CPU_INFO_FOREACH(cii, ci) { if (!CPU_IS_PRIMARY(ci) && (dtp == dtpp_interval)) continue; @@ -90,8 +97,11 @@ dt_prov_profile_alloc(struct dt_probe *d return ENOMEM; } - dp->dp_maxtick = hz / dtrq->dtrq_rate; + dp->dp_nsecs = SEC_TO_NSEC(1) / dtrq->dtrq_rate; dp->dp_cpuid = ci->ci_cpuid; + clockintr_bind(&dp->dp_clockintr, ci, callback, dp); + clockintr_stagger(&dp->dp_clockintr, dp->dp_nsecs, + CPU_INFO_UNIT(ci), MAXCPUS); dp->dp_filter = dtrq->dtrq_filter; dp->dp_evtflags = dtrq->dtrq_evtflags & DTEVT_PROV_PROFILE; @@ -102,50 +112,35 @@ dt_prov_profile_alloc(struct dt_probe *d } static inline void -dt_prov_profile_fire(struct dt_pcb *dp) +dt_prov_profile_fire(struct dt_pcb *dp, uint64_t count) { + uint64_t i; struct dt_evt *dtev; - if (++dp->dp_nticks < dp->dp_maxtick) - return; - - dtev = dt_pcb_ring_get(dp, 1); - if (dtev == NULL) - return; - dt_pcb_ring_consume(dp, dtev); - dp->dp_nticks = 0; + for (i = 0; i < count; i++) { + dtev = dt_pcb_ring_get(dp, 1); + if (dtev == NULL) + return; + dt_pcb_ring_consume(dp, dtev); + } } -int -dt_prov_profile_enter(struct dt_provider *dtpv, ...) +void +dt_prov_profile_clock(struct clockrequest *cr, void *cf, void *arg) { - struct cpu_info *ci = curcpu(); - struct dt_pcb *dp; - - KASSERT(dtpv == &dt_prov_profile); - - smr_read_enter(); - SMR_SLIST_FOREACH(dp, &dtpp_profile->dtp_pcbs, dp_pnext) { - if (dp->dp_cpuid != ci->ci_cpuid) - continue; + uint64_t count; + struct dt_pcb *dp = arg; - dt_prov_profile_fire(dp); - } - smr_read_leave(); - return 0; + count = clockrequest_advance(cr, dp->dp_nsecs); + dt_prov_profile_fire(dp, count); } -int -dt_prov_interval_enter(struct dt_provider *dtpv, ...) +void +dt_prov_interval_clock(struct clockrequest *cr, void *cf, void *arg) { - struct dt_pcb *dp; + uint64_t count; + struct dt_pcb *dp = arg; - KASSERT(dtpv == &dt_prov_interval); - - smr_read_enter(); - SMR_SLIST_FOREACH(dp, &dtpp_interval->dtp_pcbs, dp_pnext) { - dt_prov_profile_fire(dp); - } - smr_read_leave(); - return 0; + count = clockrequest_advance(cr, dp->dp_nsecs); + dt_prov_profile_fire(dp, count); } Index: sys/dev/dt/dt_dev.c =================================================================== RCS file: /cvs/src/sys/dev/dt/dt_dev.c,v diff -u -p -r1.29 dt_dev.c --- sys/dev/dt/dt_dev.c 2 Jan 2024 16:32:48 -0000 1.29 +++ sys/dev/dt/dt_dev.c 6 Feb 2024 01:34:45 -0000 @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -56,13 +57,13 @@ * proc_trampoline+0x1c */ #if defined(__amd64__) -#define DT_FA_PROFILE 7 +#define DT_FA_PROFILE 5 #define DT_FA_STATIC 2 #elif defined(__i386__) -#define DT_FA_PROFILE 8 +#define DT_FA_PROFILE 5 #define DT_FA_STATIC 2 #elif defined(__macppc__) -#define DT_FA_PROFILE 7 +#define DT_FA_PROFILE 5 #define DT_FA_STATIC 2 #elif defined(__octeon__) #define DT_FA_PROFILE 6 @@ -492,6 +493,8 @@ dt_ioctl_record_start(struct dt_softc *s SMR_SLIST_INSERT_HEAD_LOCKED(&dtp->dtp_pcbs, dp, dp_pnext); dtp->dtp_recording++; dtp->dtp_prov->dtpv_recording++; + if (dp->dp_nsecs != 0) + clockintr_advance(&dp->dp_clockintr, dp->dp_nsecs); } rw_exit_write(&dt_lock); @@ -518,6 +521,8 @@ dt_ioctl_record_stop(struct dt_softc *sc TAILQ_FOREACH(dp, &sc->ds_pcbs, dp_snext) { struct dt_probe *dtp = dp->dp_dtp; + if (dp->dp_nsecs != 0) + clockintr_cancel(&dp->dp_clockintr); dtp->dtp_recording--; dtp->dtp_prov->dtpv_recording--; SMR_SLIST_REMOVE_LOCKED(&dtp->dtp_pcbs, dp, dt_pcb, dp_pnext); @@ -679,6 +684,15 @@ dt_pcb_free(struct dt_pcb *dp) { if (dp == NULL) return; + + /* + * dp_clockintr is bound if dp_nsecs is non-zero. Its callback + * has a reference to dp, so set an execution barrier and block + * until it is no longer running before freeing dp. + */ + if (dp->dp_nsecs != 0) + clockintr_unbind(&dp->dp_clockintr, CL_BARRIER); + free(dp->dp_ring, M_DT, DT_EVTRING_SIZE * sizeof(*dp->dp_ring)); free(dp, M_DT, sizeof(*dp)); } Index: sys/dev/dt/dtvar.h =================================================================== RCS file: /cvs/src/sys/dev/dt/dtvar.h,v diff -u -p -r1.17 dtvar.h --- sys/dev/dt/dtvar.h 26 Apr 2023 16:53:59 -0000 1.17 +++ sys/dev/dt/dtvar.h 6 Feb 2024 01:34:46 -0000 @@ -197,9 +197,9 @@ struct dt_pcb { struct dt_filter dp_filter; /* [I] filter to match */ /* Provider specific fields. */ + struct clockintr dp_clockintr; /* [?] profiling handle */ + uint64_t dp_nsecs; /* [I] profiling period */ unsigned int dp_cpuid; /* [I] on which CPU */ - unsigned int dp_maxtick; /* [I] freq. of profiling */ - unsigned int dp_nticks; /* [c] current tick count */ /* Counters */ uint64_t dp_dropevt; /* [m] # dropped event */ Index: sys/kern/kern_clock.c =================================================================== RCS file: /cvs/src/sys/kern/kern_clock.c,v diff -u -p -r1.121 kern_clock.c --- sys/kern/kern_clock.c 17 Oct 2023 00:04:02 -0000 1.121 +++ sys/kern/kern_clock.c 6 Feb 2024 01:34:46 -0000 @@ -50,11 +50,6 @@ #include #include -#include "dt.h" -#if NDT > 0 -#include -#endif - /* * Clock handling routines. * @@ -145,12 +140,6 @@ initclocks(void) void hardclock(struct clockframe *frame) { -#if NDT > 0 - DT_ENTER(profile, NULL); - if (CPU_IS_PRIMARY(curcpu())) - DT_ENTER(interval, NULL); -#endif - /* * If we are not the primary CPU, we're not allowed to do * any more work.