dt(4), hardclock: move profiling entry points to dedicated clockintrs
On 05/02/24(Mon) 19:37, Scott Cheloha wrote:
> On Fri, Jan 26, 2024 at 12:28:12PM -0600, Scott Cheloha wrote:
> > This patch moves the dt(4) profiling entry points from hardclock()
> > into dedicated clockintr callbacks. It depends on the clockintr_unbind
> > patch I sent out separately, so be sure to apply that first.
> >
> > - In struct dt_pcb, replace dp_maxtick and dp_nticks with dp_nsecs.
> > This is the PCB's profiling period.
> >
> > - Each struct dt_pcb gets its own struct clockintr, dp_clockintr.
> > dp_clockintr is bound during dt_prov_profile_alloc(). We stagger
> > dp_clockintr to simulate the current behavior under hardclock().
> >
> > Note that dp_clockintr is given a reference to its enclosing
> > PCB when it is bound. More on this in a second.
> >
> > - Replace dt_prov_profile_enter/dt_prov_interval_enter with
> > dt_prov_profile_clock/dt_prov_profile_interval. These are
> > your clockintr callback functions. Because the prototypes
> > now don't match the other *_enter() functions I figure
> > changing the names is reasonable.
> >
> > Because dp_clockintr has a copy of the PCB pointer we no longer
> > need to search for the PCB with SMR_SLIST_FOREACH. This reduces
> > profiling overhead, especially on larger systems.
> >
> > - dt_prov_profile_fire() now takes a count of profiling ticks and
> > runs a loop to grab as many EVTs as it needs. If the callback
> > is delayed, it will assign multiple "hits" to the current PC,
> > just like we do now under hardclock().
> >
> > - dt_pcb_free() unbinds the clockintr. Because dp_clockintr has
> > a reference to the enclosing PCB we set an execution barrier
> > here to ensure it is safe to free the PCB. In practice this
> > will rarely block, but you need to set the barrier to be sure.
> >
> > DT_FA_PROFILE has been updated for amd64. The unpruned stack now
> > looks like this:
> >
> > dt_pcb_ring_get
> > dt_prov_profile_clock
> > clockintr_dispatch
> > lapic_clockintr
> > Xresume_lapic_ltimer
> > acpicpu_idle
> > sched_idle
> > proc_trampoline
> > kernel
> >
> > I can test other platforms and update the DT_FA_PROFILE values
> > for everything but powerpc64 before committing. The new value
> > for powerpc64 is probably 4, but somebody else needs to confirm.
>
> Updated patch:
>
> - Don't advance/cancel dp_clockintr when dp_nsecs is zero.
> - Update DT_FA_PROFILE for i386, macppc
I can't test those.
Comments below.
> Index: sys/dev/dt/dt_prov_profile.c
> ===================================================================
> RCS file: /cvs/src/sys/dev/dt/dt_prov_profile.c,v
> diff -u -p -r1.5 dt_prov_profile.c
> --- sys/dev/dt/dt_prov_profile.c 26 Apr 2023 16:53:59 -0000 1.5
> +++ sys/dev/dt/dt_prov_profile.c 6 Feb 2024 01:34:45 -0000
> @@ -20,6 +20,7 @@
> #include <sys/systm.h>
> #include <sys/param.h>
> #include <sys/atomic.h>
> +#include <sys/clockintr.h>
>
> #include <dev/dt/dtvar.h>
>
> @@ -31,13 +32,13 @@ struct dt_probe *dtpp_interval; /* glob
>
> int dt_prov_profile_alloc(struct dt_probe *, struct dt_softc *,
> struct dt_pcb_list *, struct dtioc_req *);
> -int dt_prov_profile_enter(struct dt_provider *, ...);
> -int dt_prov_interval_enter(struct dt_provider *, ...);
> +void dt_prov_profile_clock(struct clockrequest *, void *, void *);
> +void dt_prov_interval_clock(struct clockrequest *, void *, void *);
>
> struct dt_provider dt_prov_profile = {
> .dtpv_name = "profile",
> .dtpv_alloc = dt_prov_profile_alloc,
> - .dtpv_enter = dt_prov_profile_enter,
> + .dtpv_enter = NULL,
> .dtpv_leave = NULL,
> .dtpv_dealloc = NULL,
> };
> @@ -45,7 +46,7 @@ struct dt_provider dt_prov_profile = {
> struct dt_provider dt_prov_interval = {
> .dtpv_name = "interval",
> .dtpv_alloc = dt_prov_profile_alloc,
> - .dtpv_enter = dt_prov_interval_enter,
> + .dtpv_enter = NULL,
> .dtpv_leave = NULL,
> .dtpv_dealloc = NULL,
> };
> @@ -70,6 +71,7 @@ dt_prov_profile_alloc(struct dt_probe *d
> {
> struct dt_pcb *dp;
> struct cpu_info *ci;
> + void (*callback)(struct clockrequest *, void *, void *);
> CPU_INFO_ITERATOR cii;
> extern int hz;
>
> @@ -80,6 +82,11 @@ dt_prov_profile_alloc(struct dt_probe *d
> if (dtrq->dtrq_rate <= 0 || dtrq->dtrq_rate > hz)
> return EOPNOTSUPP;
>
> + if (dtp == dtpp_interval)
> + callback = dt_prov_interval_clock;
> + else
> + callback = dt_prov_profile_clock;
I see no difference between the two callbacks; can we get rid of
dt_prov_interval_clock()?
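To be concrete, here is an untested sketch of what I mean: keep a single
callback for both providers and always bind that one.

	void
	dt_prov_profile_clock(struct clockrequest *cr, void *cf, void *arg)
	{
		uint64_t count;
		struct dt_pcb *dp = arg;

		/* Same work for "profile" and "interval" PCBs. */
		count = clockrequest_advance(cr, dp->dp_nsecs);
		dt_prov_profile_fire(dp, count);
	}

and in dt_prov_profile_alloc():

		clockintr_bind(&dp->dp_clockintr, ci, dt_prov_profile_clock, dp);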
> CPU_INFO_FOREACH(cii, ci) {
> if (!CPU_IS_PRIMARY(ci) && (dtp == dtpp_interval))
> continue;
> @@ -90,8 +97,11 @@ dt_prov_profile_alloc(struct dt_probe *d
> return ENOMEM;
> }
>
> - dp->dp_maxtick = hz / dtrq->dtrq_rate;
> + dp->dp_nsecs = SEC_TO_NSEC(1) / dtrq->dtrq_rate;
> dp->dp_cpuid = ci->ci_cpuid;
`dp_cpuid' is no longer used now that clockintr_bind() ties the clockintr
to a given CPU.  You can get rid of it.
> + clockintr_bind(&dp->dp_clockintr, ci, callback, dp);
> + clockintr_stagger(&dp->dp_clockintr, dp->dp_nsecs,
> + CPU_INFO_UNIT(ci), MAXCPUS);
>
> dp->dp_filter = dtrq->dtrq_filter;
> dp->dp_evtflags = dtrq->dtrq_evtflags & DTEVT_PROV_PROFILE;
> @@ -102,50 +112,35 @@ dt_prov_profile_alloc(struct dt_probe *d
> }
>
> static inline void
> -dt_prov_profile_fire(struct dt_pcb *dp)
> +dt_prov_profile_fire(struct dt_pcb *dp, uint64_t count)
> {
> + uint64_t i;
> struct dt_evt *dtev;
>
> - if (++dp->dp_nticks < dp->dp_maxtick)
> - return;
> -
> - dtev = dt_pcb_ring_get(dp, 1);
> - if (dtev == NULL)
> - return;
> - dt_pcb_ring_consume(dp, dtev);
> - dp->dp_nticks = 0;
> + for (i = 0; i < count; i++) {
> + dtev = dt_pcb_ring_get(dp, 1);
> + if (dtev == NULL)
> + return;
> + dt_pcb_ring_consume(dp, dtev);
> + }
> }
>
> -int
> -dt_prov_profile_enter(struct dt_provider *dtpv, ...)
> +void
> +dt_prov_profile_clock(struct clockrequest *cr, void *cf, void *arg)
> {
> - struct cpu_info *ci = curcpu();
> - struct dt_pcb *dp;
> -
> - KASSERT(dtpv == &dt_prov_profile);
> -
> - smr_read_enter();
> - SMR_SLIST_FOREACH(dp, &dtpp_profile->dtp_pcbs, dp_pnext) {
> - if (dp->dp_cpuid != ci->ci_cpuid)
> - continue;
> + uint64_t count;
> + struct dt_pcb *dp = arg;
>
> - dt_prov_profile_fire(dp);
> - }
> - smr_read_leave();
> - return 0;
> + count = clockrequest_advance(cr, dp->dp_nsecs);
> + dt_prov_profile_fire(dp, count);
> }
>
> -int
> -dt_prov_interval_enter(struct dt_provider *dtpv, ...)
> +void
> +dt_prov_interval_clock(struct clockrequest *cr, void *cf, void *arg)
> {
> - struct dt_pcb *dp;
> + uint64_t count;
> + struct dt_pcb *dp = arg;
>
> - KASSERT(dtpv == &dt_prov_interval);
> -
> - smr_read_enter();
> - SMR_SLIST_FOREACH(dp, &dtpp_interval->dtp_pcbs, dp_pnext) {
> - dt_prov_profile_fire(dp);
> - }
> - smr_read_leave();
> - return 0;
> + count = clockrequest_advance(cr, dp->dp_nsecs);
> + dt_prov_profile_fire(dp, count);
> }
> Index: sys/dev/dt/dt_dev.c
> ===================================================================
> RCS file: /cvs/src/sys/dev/dt/dt_dev.c,v
> diff -u -p -r1.29 dt_dev.c
> --- sys/dev/dt/dt_dev.c 2 Jan 2024 16:32:48 -0000 1.29
> +++ sys/dev/dt/dt_dev.c 6 Feb 2024 01:34:45 -0000
> @@ -19,6 +19,7 @@
> #include <sys/types.h>
> #include <sys/systm.h>
> #include <sys/param.h>
> +#include <sys/clockintr.h>
> #include <sys/device.h>
> #include <sys/exec_elf.h>
> #include <sys/malloc.h>
> @@ -56,13 +57,13 @@
> * proc_trampoline+0x1c
> */
> #if defined(__amd64__)
> -#define DT_FA_PROFILE 7
> +#define DT_FA_PROFILE 5
> #define DT_FA_STATIC 2
> #elif defined(__i386__)
> -#define DT_FA_PROFILE 8
> +#define DT_FA_PROFILE 5
> #define DT_FA_STATIC 2
> #elif defined(__macppc__)
> -#define DT_FA_PROFILE 7
> +#define DT_FA_PROFILE 5
> #define DT_FA_STATIC 2
> #elif defined(__octeon__)
> #define DT_FA_PROFILE 6
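(Side note for context: DT_FA_PROFILE is the number of frames skipped when
the stack is captured for a profiling event, which is why the value shrinks
now that the callback is reached through a shallower call path.  If I
remember the call site correctly it is roughly:

	/* In dt_pcb_ring_get(): drop the profiling glue frames. */
	if (profiling)
		stacktrace_save_at(&dtev->dtev_kstack, DT_FA_PROFILE);

so the constant has to match the number of frames listed in the comment
above the defines.)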
> @@ -492,6 +493,8 @@ dt_ioctl_record_start(struct dt_softc *s
> SMR_SLIST_INSERT_HEAD_LOCKED(&dtp->dtp_pcbs, dp, dp_pnext);
> dtp->dtp_recording++;
> dtp->dtp_prov->dtpv_recording++;
> + if (dp->dp_nsecs != 0)
> + clockintr_advance(&dp->dp_clockintr, dp->dp_nsecs);
> }
> rw_exit_write(&dt_lock);
>
> @@ -518,6 +521,8 @@ dt_ioctl_record_stop(struct dt_softc *sc
> TAILQ_FOREACH(dp, &sc->ds_pcbs, dp_snext) {
> struct dt_probe *dtp = dp->dp_dtp;
>
> + if (dp->dp_nsecs != 0)
> + clockintr_cancel(&dp->dp_clockintr);
This part is more delicate.  Since we no longer use SMR in the callback
to get a reference to `dp', we should not do any of the following before
we are sure the callback can no longer run.  That would imply calling
clockintr_unbind() here, no?  Some caution is needed with
clockintr_unbind(), though, because we are holding a lock at this point.
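Roughly what I have in mind, as an untested sketch; it assumes calling
clockintr_unbind() with CL_BARRIER is safe while `dt_lock' is held
write-locked, which is exactly the point to double-check:

	TAILQ_FOREACH(dp, &sc->ds_pcbs, dp_snext) {
		struct dt_probe *dtp = dp->dp_dtp;

		/*
		 * The callback dereferences `dp' directly, so wait
		 * until it can no longer run before unlinking the
		 * PCB: the barrier replaces the plain cancel.
		 */
		if (dp->dp_nsecs != 0)
			clockintr_unbind(&dp->dp_clockintr, CL_BARRIER);

		dtp->dtp_recording--;
		dtp->dtp_prov->dtpv_recording--;
		SMR_SLIST_REMOVE_LOCKED(&dtp->dtp_pcbs, dp, dt_pcb, dp_pnext);
	}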
> dtp->dtp_recording--;
> dtp->dtp_prov->dtpv_recording--;
> SMR_SLIST_REMOVE_LOCKED(&dtp->dtp_pcbs, dp, dt_pcb, dp_pnext);
> @@ -679,6 +684,15 @@ dt_pcb_free(struct dt_pcb *dp)
> {
> if (dp == NULL)
> return;
> +
> + /*
> + * dp_clockintr is bound if dp_nsecs is non-zero. Its callback
> + * has a reference to dp, so set an execution barrier and block
> + * until it is no longer running before freeing dp.
> + */
> + if (dp->dp_nsecs != 0)
> + clockintr_unbind(&dp->dp_clockintr, CL_BARRIER);
> +
This is too late and should not be necessary.  dtclose() calls
dt_ioctl_record_stop(), which should take care of the barrier before
decrementing the counters.
> free(dp->dp_ring, M_DT, DT_EVTRING_SIZE * sizeof(*dp->dp_ring));
> free(dp, M_DT, sizeof(*dp));
> }
> Index: sys/dev/dt/dtvar.h
> ===================================================================
> RCS file: /cvs/src/sys/dev/dt/dtvar.h,v
> diff -u -p -r1.17 dtvar.h
> --- sys/dev/dt/dtvar.h 26 Apr 2023 16:53:59 -0000 1.17
> +++ sys/dev/dt/dtvar.h 6 Feb 2024 01:34:46 -0000
> @@ -197,9 +197,9 @@ struct dt_pcb {
> struct dt_filter dp_filter; /* [I] filter to match */
>
> /* Provider specific fields. */
> + struct clockintr dp_clockintr; /* [?] profiling handle */
If you don't know what serializes it, it's the kernel lock, so the
annotation should be [K] rather than [?].
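That is, assuming it really is only touched under the kernel lock:

	struct clockintr	 dp_clockintr;	/* [K] profiling handle */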
> + uint64_t dp_nsecs; /* [I] profiling period */
> unsigned int dp_cpuid; /* [I] on which CPU */
> - unsigned int dp_maxtick; /* [I] freq. of profiling */
> - unsigned int dp_nticks; /* [c] current tick count */
>
> /* Counters */
> uint64_t dp_dropevt; /* [m] # dropped event */
> Index: sys/kern/kern_clock.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_clock.c,v
> diff -u -p -r1.121 kern_clock.c
> --- sys/kern/kern_clock.c 17 Oct 2023 00:04:02 -0000 1.121
> +++ sys/kern/kern_clock.c 6 Feb 2024 01:34:46 -0000
> @@ -50,11 +50,6 @@
> #include <sys/sched.h>
> #include <sys/timetc.h>
>
> -#include "dt.h"
> -#if NDT > 0
> -#include <dev/dt/dtvar.h>
> -#endif
> -
> /*
> * Clock handling routines.
> *
> @@ -145,12 +140,6 @@ initclocks(void)
> void
> hardclock(struct clockframe *frame)
> {
> -#if NDT > 0
> - DT_ENTER(profile, NULL);
> - if (CPU_IS_PRIMARY(curcpu()))
> - DT_ENTER(interval, NULL);
> -#endif
> -
> /*
> * If we are not the primary CPU, we're not allowed to do
> * any more work.
>