From: Martin Pieuchot Subject: Re: dt(4), hardclock: move profiling entry points to dedicated clockintrs To: Scott Cheloha Cc: tech@openbsd.org, claudio@openbsd.org, mlarkin@openbsd.org Date: Tue, 6 Feb 2024 15:24:20 +0100 On 05/02/24(Mon) 19:37, Scott Cheloha wrote: > On Fri, Jan 26, 2024 at 12:28:12PM -0600, Scott Cheloha wrote: > > This patch moves the dt(4) profiling entry points from hardclock() > > into dedicated clockintr callbacks. It depends on the clockintr_unbind > > patch I sent out separately, so be sure to apply that first. > > > > - In struct dt_pcb, replace dp_maxtick and dp_nticks with dp_nsecs. > > This is the PCB's profiling period. > > > > - Each struct dt_pcb gets its own struct clockintr, dp_clockintr. > > dp_clockintr is bound during dt_prov_profile_alloc(). We stagger > > dp_clockintr to simulate the current behavior under hardclock(). > > > > Note that dp_clockintr is given a reference to its enclosing > > PCB when it is bound. More on this in a second. > > > > - Replace dt_prov_profile_enter/dt_prov_interval_enter with > > dt_prov_profile_clock/dt_prov_profile_interval. These are > > your clockintr callback functions. Because the prototypes > > now don't match the other *_enter() functions I figure > > changing the names is reasonable. > > > > Because dp_clockintr has a copy of the PCB pointer we no longer > > need to search for the PCB with SMR_SLIST_FOREACH. This reduces > > profiling overhead, especially on larger systems. > > > > - dt_prov_profile_fire() now takes a count of profiling ticks and > > runs a loop to grab as many EVTs as it needs. If the callback > > is delayed, it will assign multiple "hits" to the current PC, > > just like we do now under hardclock(). > > > > - dt_pcb_free() unbinds the clockintr. Because dp_clockintr has > > a reference to the enclosing PCB we set an execution barrier > > here to ensure it is safe to free the PCB. In practice this > > will rarely block, but you need to set the barrier to be sure. > > > > DT_FA_PROFILE has been updated for amd64. The unpruned stack now > > looks like this: > > > > dt_pcb_ring_get > > dt_prov_profile_clock > > clockintr_dispatch > > lapic_clockintr > > Xresume_lapic_ltimer > > acpicpu_idle > > sched_idle > > proc_trampoline > > kernel > > > > I can test other platforms and update the DT_FA_PROFILE values > > for everything but powerpc64 before committing. The new value > > for powerpc64 is probably 4, but somebody else needs to confirm. > > Updated patch: > > - Don't advance/cancel dp_clockintr when dp_nsecs is zero. > - Update DT_FA_PROFILE for i386, macppc I can't test those. Comments below. > Index: sys/dev/dt/dt_prov_profile.c > =================================================================== > RCS file: /cvs/src/sys/dev/dt/dt_prov_profile.c,v > diff -u -p -r1.5 dt_prov_profile.c > --- sys/dev/dt/dt_prov_profile.c 26 Apr 2023 16:53:59 -0000 1.5 > +++ sys/dev/dt/dt_prov_profile.c 6 Feb 2024 01:34:45 -0000 > @@ -20,6 +20,7 @@ > #include > #include > #include > +#include > > #include > > @@ -31,13 +32,13 @@ struct dt_probe *dtpp_interval; /* glob > > int dt_prov_profile_alloc(struct dt_probe *, struct dt_softc *, > struct dt_pcb_list *, struct dtioc_req *); > -int dt_prov_profile_enter(struct dt_provider *, ...); > -int dt_prov_interval_enter(struct dt_provider *, ...); > +void dt_prov_profile_clock(struct clockrequest *, void *, void *); > +void dt_prov_interval_clock(struct clockrequest *, void *, void *); > > struct dt_provider dt_prov_profile = { > .dtpv_name = "profile", > .dtpv_alloc = dt_prov_profile_alloc, > - .dtpv_enter = dt_prov_profile_enter, > + .dtpv_enter = NULL, > .dtpv_leave = NULL, > .dtpv_dealloc = NULL, > }; > @@ -45,7 +46,7 @@ struct dt_provider dt_prov_profile = { > struct dt_provider dt_prov_interval = { > .dtpv_name = "interval", > .dtpv_alloc = dt_prov_profile_alloc, > - .dtpv_enter = dt_prov_interval_enter, > + .dtpv_enter = NULL, > .dtpv_leave = NULL, > .dtpv_dealloc = NULL, > }; > @@ -70,6 +71,7 @@ dt_prov_profile_alloc(struct dt_probe *d > { > struct dt_pcb *dp; > struct cpu_info *ci; > + void (*callback)(struct clockrequest *, void *, void *); > CPU_INFO_ITERATOR cii; > extern int hz; > > @@ -80,6 +82,11 @@ dt_prov_profile_alloc(struct dt_probe *d > if (dtrq->dtrq_rate <= 0 || dtrq->dtrq_rate > hz) > return EOPNOTSUPP; > > + if (dtp == dtpp_interval) > + callback = dt_prov_interval_clock; > + else > + callback = dt_prov_profile_clock; I see no difference between the two callbacks, can we get rid of dt_prov_interval_clock()? > CPU_INFO_FOREACH(cii, ci) { > if (!CPU_IS_PRIMARY(ci) && (dtp == dtpp_interval)) > continue; > @@ -90,8 +97,11 @@ dt_prov_profile_alloc(struct dt_probe *d > return ENOMEM; > } > > - dp->dp_maxtick = hz / dtrq->dtrq_rate; > + dp->dp_nsecs = SEC_TO_NSEC(1) / dtrq->dtrq_rate; > dp->dp_cpuid = ci->ci_cpuid; `dp_cpuid' is no longer used now that clockintr_stagger() bind a request to a given CPU. You can get rid of it. > + clockintr_bind(&dp->dp_clockintr, ci, callback, dp); > + clockintr_stagger(&dp->dp_clockintr, dp->dp_nsecs, > + CPU_INFO_UNIT(ci), MAXCPUS); > > dp->dp_filter = dtrq->dtrq_filter; > dp->dp_evtflags = dtrq->dtrq_evtflags & DTEVT_PROV_PROFILE; > @@ -102,50 +112,35 @@ dt_prov_profile_alloc(struct dt_probe *d > } > > static inline void > -dt_prov_profile_fire(struct dt_pcb *dp) > +dt_prov_profile_fire(struct dt_pcb *dp, uint64_t count) > { > + uint64_t i; > struct dt_evt *dtev; > > - if (++dp->dp_nticks < dp->dp_maxtick) > - return; > - > - dtev = dt_pcb_ring_get(dp, 1); > - if (dtev == NULL) > - return; > - dt_pcb_ring_consume(dp, dtev); > - dp->dp_nticks = 0; > + for (i = 0; i < count; i++) { > + dtev = dt_pcb_ring_get(dp, 1); > + if (dtev == NULL) > + return; > + dt_pcb_ring_consume(dp, dtev); > + } > } > > -int > -dt_prov_profile_enter(struct dt_provider *dtpv, ...) > +void > +dt_prov_profile_clock(struct clockrequest *cr, void *cf, void *arg) > { > - struct cpu_info *ci = curcpu(); > - struct dt_pcb *dp; > - > - KASSERT(dtpv == &dt_prov_profile); > - > - smr_read_enter(); > - SMR_SLIST_FOREACH(dp, &dtpp_profile->dtp_pcbs, dp_pnext) { > - if (dp->dp_cpuid != ci->ci_cpuid) > - continue; > + uint64_t count; > + struct dt_pcb *dp = arg; > > - dt_prov_profile_fire(dp); > - } > - smr_read_leave(); > - return 0; > + count = clockrequest_advance(cr, dp->dp_nsecs); > + dt_prov_profile_fire(dp, count); > } > > -int > -dt_prov_interval_enter(struct dt_provider *dtpv, ...) > +void > +dt_prov_interval_clock(struct clockrequest *cr, void *cf, void *arg) > { > - struct dt_pcb *dp; > + uint64_t count; > + struct dt_pcb *dp = arg; > > - KASSERT(dtpv == &dt_prov_interval); > - > - smr_read_enter(); > - SMR_SLIST_FOREACH(dp, &dtpp_interval->dtp_pcbs, dp_pnext) { > - dt_prov_profile_fire(dp); > - } > - smr_read_leave(); > - return 0; > + count = clockrequest_advance(cr, dp->dp_nsecs); > + dt_prov_profile_fire(dp, count); > } > Index: sys/dev/dt/dt_dev.c > =================================================================== > RCS file: /cvs/src/sys/dev/dt/dt_dev.c,v > diff -u -p -r1.29 dt_dev.c > --- sys/dev/dt/dt_dev.c 2 Jan 2024 16:32:48 -0000 1.29 > +++ sys/dev/dt/dt_dev.c 6 Feb 2024 01:34:45 -0000 > @@ -19,6 +19,7 @@ > #include > #include > #include > +#include > #include > #include > #include > @@ -56,13 +57,13 @@ > * proc_trampoline+0x1c > */ > #if defined(__amd64__) > -#define DT_FA_PROFILE 7 > +#define DT_FA_PROFILE 5 > #define DT_FA_STATIC 2 > #elif defined(__i386__) > -#define DT_FA_PROFILE 8 > +#define DT_FA_PROFILE 5 > #define DT_FA_STATIC 2 > #elif defined(__macppc__) > -#define DT_FA_PROFILE 7 > +#define DT_FA_PROFILE 5 > #define DT_FA_STATIC 2 > #elif defined(__octeon__) > #define DT_FA_PROFILE 6 > @@ -492,6 +493,8 @@ dt_ioctl_record_start(struct dt_softc *s > SMR_SLIST_INSERT_HEAD_LOCKED(&dtp->dtp_pcbs, dp, dp_pnext); > dtp->dtp_recording++; > dtp->dtp_prov->dtpv_recording++; > + if (dp->dp_nsecs != 0) > + clockintr_advance(&dp->dp_clockintr, dp->dp_nsecs); > } > rw_exit_write(&dt_lock); > > @@ -518,6 +521,8 @@ dt_ioctl_record_stop(struct dt_softc *sc > TAILQ_FOREACH(dp, &sc->ds_pcbs, dp_snext) { > struct dt_probe *dtp = dp->dp_dtp; > > + if (dp->dp_nsecs != 0) > + clockintr_cancel(&dp->dp_clockintr); Here this is complex. Since we are no longer using SMR in the callback to get a reference to `dp' we should not do the following before we are sure the callback won't run. That would imply clockintr_unbind(), no? Some caution should be taken with clockintr_unbind() because we are holding a lock. > dtp->dtp_recording--; > dtp->dtp_prov->dtpv_recording--; > SMR_SLIST_REMOVE_LOCKED(&dtp->dtp_pcbs, dp, dt_pcb, dp_pnext); > @@ -679,6 +684,15 @@ dt_pcb_free(struct dt_pcb *dp) > { > if (dp == NULL) > return; > + > + /* > + * dp_clockintr is bound if dp_nsecs is non-zero. Its callback > + * has a reference to dp, so set an execution barrier and block > + * until it is no longer running before freeing dp. > + */ > + if (dp->dp_nsecs != 0) > + clockintr_unbind(&dp->dp_clockintr, CL_BARRIER); > + This is too late and should not be necessary. dtclose() calls ctl_record_stop() which should take care of the barrier before decrementing counters. > free(dp->dp_ring, M_DT, DT_EVTRING_SIZE * sizeof(*dp->dp_ring)); > free(dp, M_DT, sizeof(*dp)); > } > Index: sys/dev/dt/dtvar.h > =================================================================== > RCS file: /cvs/src/sys/dev/dt/dtvar.h,v > diff -u -p -r1.17 dtvar.h > --- sys/dev/dt/dtvar.h 26 Apr 2023 16:53:59 -0000 1.17 > +++ sys/dev/dt/dtvar.h 6 Feb 2024 01:34:46 -0000 > @@ -197,9 +197,9 @@ struct dt_pcb { > struct dt_filter dp_filter; /* [I] filter to match */ > > /* Provider specific fields. */ > + struct clockintr dp_clockintr; /* [?] profiling handle */ If you don't know it's the kernel lock K. > + uint64_t dp_nsecs; /* [I] profiling period */ > unsigned int dp_cpuid; /* [I] on which CPU */ > - unsigned int dp_maxtick; /* [I] freq. of profiling */ > - unsigned int dp_nticks; /* [c] current tick count */ > > /* Counters */ > uint64_t dp_dropevt; /* [m] # dropped event */ > Index: sys/kern/kern_clock.c > =================================================================== > RCS file: /cvs/src/sys/kern/kern_clock.c,v > diff -u -p -r1.121 kern_clock.c > --- sys/kern/kern_clock.c 17 Oct 2023 00:04:02 -0000 1.121 > +++ sys/kern/kern_clock.c 6 Feb 2024 01:34:46 -0000 > @@ -50,11 +50,6 @@ > #include > #include > > -#include "dt.h" > -#if NDT > 0 > -#include > -#endif > - > /* > * Clock handling routines. > * > @@ -145,12 +140,6 @@ initclocks(void) > void > hardclock(struct clockframe *frame) > { > -#if NDT > 0 > - DT_ENTER(profile, NULL); > - if (CPU_IS_PRIMARY(curcpu())) > - DT_ENTER(interval, NULL); > -#endif > - > /* > * If we are not the primary CPU, we're not allowed to do > * any more work. >