From: Scott Cheloha Subject: Re: dt(4), hardclock: move profiling entry points to dedicated clockintrs To: mpi@openbsd.org Cc: tech@openbsd.org Date: Thu, 8 Feb 2024 22:08:13 -0600 On Thu, Feb 08, 2024 at 05:19:40PM +0100, Martin Pieuchot wrote: > On 07/02/24(Wed) 20:22, Scott Cheloha wrote: > > On Tue, Feb 06, 2024 at 03:24:20PM +0100, Martin Pieuchot wrote: > > > On 05/02/24(Mon) 19:37, Scott Cheloha wrote: > > > > On Fri, Jan 26, 2024 at 12:28:12PM -0600, Scott Cheloha wrote: > > > > > This patch moves the dt(4) profiling entry points from hardclock() > > > > > into dedicated clockintr callbacks. It depends on the clockintr_unbind > > > > > patch I sent out separately, so be sure to apply that first. > > > > > > > > > > [...] > > > > > > > > Updated patch: > > > > > > > > - Don't advance/cancel dp_clockintr when dp_nsecs is zero. > > > > - Update DT_FA_PROFILE for i386, macppc > > > > > > I can't test those. > > [...] > > We can combine them, but in multiple previous emails you have > > requested that they remain separate. > > Because they were reasons to do so which are now gone. > > > So, which do you want? Either works for me. > > Merge them. > > > > [...] > > > Here this is complex. Since we are no longer using SMR in the callback > > > to get a reference to `dp' we should not do the following before we are > > > sure the callback won't run. That would imply clockintr_unbind(), no? > > > Some caution should be taken with clockintr_unbind() because we are > > > holding a lock. > > > > > > > dtp->dtp_recording--; > > > > dtp->dtp_prov->dtpv_recording--; > > > > SMR_SLIST_REMOVE_LOCKED(&dtp->dtp_pcbs, dp, dt_pcb, dp_pnext); > > > > We should not unbind here. All we need to do is cancel dp_clockintr. > > No, we must ensure no other CPU has a valid reference on the descriptor > before decrementing the counter. That's what I already said in my > previous email in different words. > > > With the patch, dt_prov_profile_clock() doesn't walk the SMR list > > (dtp_pcbs) anymore and it doesn't check dtpv_recording or > > dtp_recording anymore, so we don't need to do anything but > > advance/cancel dp_clockintr when starting/stopping recording when > > dp_nsecs is zero. Check the updated patch. Is that acceptable? > > No it's not. [...] Here's a revised patch based on our discussion off-list. It's not exactly what we talked about re. dtpv_dealloc(), but I think it's pretty close to a workable starting point. We can iterate on it once it's in the tree. - Merge the profile/interval callbacks and dt_prov_profile_fire() into a single callback, dt_clock(). - Move the clockintr_bind() and clockintr_stagger() calls from dtpv_alloc() to dt_ioctl_record_start(). - Add a dp_cpu member to struct dt_pcb so dt_ioctl_record_start() knows where to bind each dp_clockintr. - In dt_ioctl_record_stop(), replace clockintr_cancel() with clockintr_unbind(). The execution barrier ensures the reference to the enclosing PCB is dead before the caller returns. As we discussed, it's not ideal to block while holding dt_lock, but in practice blocking here is rare and dt_clock() does not run for very long. - Update the locking annotation for dp_clockintr. Both clockintr_bind() and clockintr_unbind() are now called within the global dt_lock ('D'). Regress passes. DT_FA_PROFILE values still yield correct-looking traces. Index: sys/dev/dt/dt_prov_profile.c =================================================================== RCS file: /cvs/src/sys/dev/dt/dt_prov_profile.c,v diff -u -p -r1.5 dt_prov_profile.c --- sys/dev/dt/dt_prov_profile.c 26 Apr 2023 16:53:59 -0000 1.5 +++ sys/dev/dt/dt_prov_profile.c 8 Feb 2024 23:03:44 -0000 @@ -20,6 +20,7 @@ #include #include #include +#include #include @@ -31,13 +32,11 @@ struct dt_probe *dtpp_interval; /* glob int dt_prov_profile_alloc(struct dt_probe *, struct dt_softc *, struct dt_pcb_list *, struct dtioc_req *); -int dt_prov_profile_enter(struct dt_provider *, ...); -int dt_prov_interval_enter(struct dt_provider *, ...); struct dt_provider dt_prov_profile = { .dtpv_name = "profile", .dtpv_alloc = dt_prov_profile_alloc, - .dtpv_enter = dt_prov_profile_enter, + .dtpv_enter = NULL, .dtpv_leave = NULL, .dtpv_dealloc = NULL, }; @@ -45,7 +44,7 @@ struct dt_provider dt_prov_profile = { struct dt_provider dt_prov_interval = { .dtpv_name = "interval", .dtpv_alloc = dt_prov_profile_alloc, - .dtpv_enter = dt_prov_interval_enter, + .dtpv_enter = NULL, .dtpv_leave = NULL, .dtpv_dealloc = NULL, }; @@ -90,8 +89,8 @@ dt_prov_profile_alloc(struct dt_probe *d return ENOMEM; } - dp->dp_maxtick = hz / dtrq->dtrq_rate; - dp->dp_cpuid = ci->ci_cpuid; + dp->dp_nsecs = SEC_TO_NSEC(1) / dtrq->dtrq_rate; + dp->dp_cpu = ci; dp->dp_filter = dtrq->dtrq_filter; dp->dp_evtflags = dtrq->dtrq_evtflags & DTEVT_PROV_PROFILE; @@ -101,51 +100,18 @@ dt_prov_profile_alloc(struct dt_probe *d return 0; } -static inline void -dt_prov_profile_fire(struct dt_pcb *dp) +void +dt_clock(struct clockrequest *cr, void *cf, void *arg) { + uint64_t count, i; struct dt_evt *dtev; + struct dt_pcb *dp = arg; - if (++dp->dp_nticks < dp->dp_maxtick) - return; - - dtev = dt_pcb_ring_get(dp, 1); - if (dtev == NULL) - return; - dt_pcb_ring_consume(dp, dtev); - dp->dp_nticks = 0; -} - -int -dt_prov_profile_enter(struct dt_provider *dtpv, ...) -{ - struct cpu_info *ci = curcpu(); - struct dt_pcb *dp; - - KASSERT(dtpv == &dt_prov_profile); - - smr_read_enter(); - SMR_SLIST_FOREACH(dp, &dtpp_profile->dtp_pcbs, dp_pnext) { - if (dp->dp_cpuid != ci->ci_cpuid) - continue; - - dt_prov_profile_fire(dp); + count = clockrequest_advance(cr, dp->dp_nsecs); + for (i = 0; i < count; i++) { + dtev = dt_pcb_ring_get(dp, 1); + if (dtev == NULL) + return; + dt_pcb_ring_consume(dp, dtev); } - smr_read_leave(); - return 0; -} - -int -dt_prov_interval_enter(struct dt_provider *dtpv, ...) -{ - struct dt_pcb *dp; - - KASSERT(dtpv == &dt_prov_interval); - - smr_read_enter(); - SMR_SLIST_FOREACH(dp, &dtpp_interval->dtp_pcbs, dp_pnext) { - dt_prov_profile_fire(dp); - } - smr_read_leave(); - return 0; } Index: sys/dev/dt/dt_dev.c =================================================================== RCS file: /cvs/src/sys/dev/dt/dt_dev.c,v diff -u -p -r1.29 dt_dev.c --- sys/dev/dt/dt_dev.c 2 Jan 2024 16:32:48 -0000 1.29 +++ sys/dev/dt/dt_dev.c 8 Feb 2024 23:03:44 -0000 @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -56,13 +57,13 @@ * proc_trampoline+0x1c */ #if defined(__amd64__) -#define DT_FA_PROFILE 7 +#define DT_FA_PROFILE 5 #define DT_FA_STATIC 2 #elif defined(__i386__) -#define DT_FA_PROFILE 8 +#define DT_FA_PROFILE 5 #define DT_FA_STATIC 2 #elif defined(__macppc__) -#define DT_FA_PROFILE 7 +#define DT_FA_PROFILE 5 #define DT_FA_STATIC 2 #elif defined(__octeon__) #define DT_FA_PROFILE 6 @@ -492,6 +493,14 @@ dt_ioctl_record_start(struct dt_softc *s SMR_SLIST_INSERT_HEAD_LOCKED(&dtp->dtp_pcbs, dp, dp_pnext); dtp->dtp_recording++; dtp->dtp_prov->dtpv_recording++; + + if (dp->dp_nsecs != 0) { + clockintr_bind(&dp->dp_clockintr, dp->dp_cpu, dt_clock, + dp); + clockintr_stagger(&dp->dp_clockintr, dp->dp_nsecs, + CPU_INFO_UNIT(dp->dp_cpu), MAXCPUS); + clockintr_advance(&dp->dp_clockintr, dp->dp_nsecs); + } } rw_exit_write(&dt_lock); @@ -517,6 +526,13 @@ dt_ioctl_record_stop(struct dt_softc *sc rw_enter_write(&dt_lock); TAILQ_FOREACH(dp, &sc->ds_pcbs, dp_snext) { struct dt_probe *dtp = dp->dp_dtp; + + /* + * Set an execution barrier to ensure dt_clock() has + * released its shared reference to dp. + */ + if (dp->dp_nsecs != 0) + clockintr_unbind(&dp->dp_clockintr, CL_BARRIER); dtp->dtp_recording--; dtp->dtp_prov->dtpv_recording--; Index: sys/dev/dt/dtvar.h =================================================================== RCS file: /cvs/src/sys/dev/dt/dtvar.h,v diff -u -p -r1.17 dtvar.h --- sys/dev/dt/dtvar.h 26 Apr 2023 16:53:59 -0000 1.17 +++ sys/dev/dt/dtvar.h 8 Feb 2024 23:03:44 -0000 @@ -175,6 +175,7 @@ int dtioc_req_isvalid(struct dtioc_req * userland read(2)s them. * * Locks used to protect struct members in this file: + * D dt_lock * I immutable after creation * K kernel lock * K,S kernel lock for writing and SMR for reading @@ -197,9 +198,9 @@ struct dt_pcb { struct dt_filter dp_filter; /* [I] filter to match */ /* Provider specific fields. */ - unsigned int dp_cpuid; /* [I] on which CPU */ - unsigned int dp_maxtick; /* [I] freq. of profiling */ - unsigned int dp_nticks; /* [c] current tick count */ + struct clockintr dp_clockintr; /* [D] profiling handle */ + uint64_t dp_nsecs; /* [I] profiling period */ + struct cpu_info *dp_cpu; /* [I] on which CPU */ /* Counters */ uint64_t dp_dropevt; /* [m] # dropped event */ @@ -270,6 +271,7 @@ struct dt_probe *dt_dev_alloc_probe(cons struct dt_provider *); void dt_dev_register_probe(struct dt_probe *); +void dt_clock(struct clockrequest *, void *, void *); extern volatile uint32_t dt_tracing; /* currently tracing? */ Index: sys/kern/kern_clock.c =================================================================== RCS file: /cvs/src/sys/kern/kern_clock.c,v diff -u -p -r1.121 kern_clock.c --- sys/kern/kern_clock.c 17 Oct 2023 00:04:02 -0000 1.121 +++ sys/kern/kern_clock.c 8 Feb 2024 23:03:44 -0000 @@ -50,11 +50,6 @@ #include #include -#include "dt.h" -#if NDT > 0 -#include -#endif - /* * Clock handling routines. * @@ -145,12 +140,6 @@ initclocks(void) void hardclock(struct clockframe *frame) { -#if NDT > 0 - DT_ENTER(profile, NULL); - if (CPU_IS_PRIMARY(curcpu())) - DT_ENTER(interval, NULL); -#endif - /* * If we are not the primary CPU, we're not allowed to do * any more work.