Move p_estcpu accumulation from schedclock to tuagg_add_runtime
Currently, p_estcpu is incremented in schedclock() under SCHED_LOCK for the running process. This diff moves the accumulation to tuagg_add_runtime(), which is called on every context switch to track per-thread runtime.
This change:
- Removes estcpu increment from the periodic clock interrupt path
- Bases estcpu growth on measured runtime instead of tick count
- Prepares for future work on usrpri caching and SCHED_LOCK reduction
setpriority() no longer writes p_estcpu; it only computes p_usrpri from the estcpu value passed in. schedclock() now merely recomputes usrpri so the cached value stays current. All accesses to p_estcpu now go through READ_ONCE/WRITE_ONCE.
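For reference on the accessors: READ_ONCE/WRITE_ONCE force a single, untorn load or store through a volatile cast, so unlocked readers such as the sysctl kinfo copy in the last hunk see either the old or the new value, never a partial one. Below is a rough user-space sketch of the pattern used in tuagg_add_runtime(); the macro definitions and the ESTCPULIM bound are simplified stand-ins, not the kernel's real ones.

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for the kernel's READ_ONCE/WRITE_ONCE. */
#define READ_ONCE(x)		(*(volatile __typeof__(x) *)&(x))
#define WRITE_ONCE(x, v)	(*(volatile __typeof__(x) *)&(x) = (v))

/* Illustrative clamp only; the real ESTCPULIM bound differs. */
#define ESTCPULIM(e)		((e) > 255 ? 255 : (e))

struct fake_proc {
	uint32_t p_estcpu;
};

int
main(void)
{
	struct fake_proc p = { .p_estcpu = 40 };
	uint32_t oldcpu, newcpu;

	/* Same shape as the update in tuagg_add_runtime(). */
	oldcpu = READ_ONCE(p.p_estcpu);
	newcpu = ESTCPULIM(oldcpu + 3);
	WRITE_ONCE(p.p_estcpu, newcpu);

	printf("estcpu %u -> %u\n", oldcpu, newcpu);
	return 0;
}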
There is a slight accounting change: usage is rounded up to the nearest 10 ms, matching the previous tick-based granularity.
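To make the rounding concrete: the increment is the ceiling of delta_ms / 10, so even 1 ms of runtime charges one unit, the same as one old-style tick would have. A trivial stand-alone check of the arithmetic (plain C, nothing kernel-specific assumed):

#include <assert.h>
#include <stdint.h>

/* Round runtime in milliseconds up to 10 ms units, as in the diff. */
static uint64_t
estcpu_increment(uint64_t delta_ms)
{
	return (delta_ms + 9) / 10;
}

int
main(void)
{
	assert(estcpu_increment(1) == 1);	/* 1 ms still charges one unit */
	assert(estcpu_increment(10) == 1);	/* exactly one 10 ms tick */
	assert(estcpu_increment(11) == 2);	/* rounded up, not truncated */
	assert(estcpu_increment(95) == 10);
	return 0;
}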
Tested with kernel compiles at -j1, -j6, -j8, and -j300 on a 6-core amd64 VM; no measurable performance difference.
Tim
--
diff --git a/sys/kern/kern_resource.c b/sys/kern/kern_resource.c
--- a/sys/kern/kern_resource.c
+++ b/sys/kern/kern_resource.c
@@ -440,6 +440,8 @@ tuagg_add_runtime(void)
struct proc *p = curproc;
struct timespec ts, delta;
unsigned int gen;
+ uint64_t delta_ms;
+ uint32_t oldcpu, newcpu;
/*
* Compute the amount of time during which the current
@@ -463,6 +465,17 @@ tuagg_add_runtime(void)
gen = tu_enter(&p->p_tu);
timespecadd(&p->p_tu.tu_runtime, &delta, &p->p_tu.tu_runtime);
tu_leave(&p->p_tu, gen);
+
+ /*
+ * Increase scheduler's estcpu estimate from runtime.
+ * Any runtime, even 1ms, increments estcpu by at least 1.
+ */
+ delta_ms = (uint64_t)delta.tv_sec * 1000 + delta.tv_nsec / 1000000;
+ if (delta_ms > 0) {
+ oldcpu = READ_ONCE(p->p_estcpu);
+ newcpu = ESTCPULIM(oldcpu + (delta_ms + 9) / 10);
+ WRITE_ONCE(p->p_estcpu, newcpu);
+ }
}
/*
diff --git a/sys/kern/sched_bsd.c b/sys/kern/sched_bsd.c
--- a/sys/kern/sched_bsd.c
+++ b/sys/kern/sched_bsd.c
@@ -271,7 +271,8 @@ schedcpu(void *unused)
#endif
p->p_pctcpu = pctcpu;
p->p_cpticks2 = cpt;
- newcpu = (u_int) decay_cpu(loadfac, p->p_estcpu);
+ newcpu = (u_int) decay_cpu(loadfac, READ_ONCE(p->p_estcpu));
+ WRITE_ONCE(p->p_estcpu, newcpu);
setpriority(p, newcpu, p->p_p->ps_nice);
if (p->p_stat == SRUN &&
@@ -493,7 +494,8 @@ setrunnable(struct proc *p)
if (p->p_slptime > 1) {
uint32_t newcpu;
- newcpu = decay_aftersleep(p->p_estcpu, p->p_slptime);
+ newcpu = decay_aftersleep(READ_ONCE(p->p_estcpu), p->p_slptime);
+ WRITE_ONCE(p->p_estcpu, newcpu);
setpriority(p, newcpu, pr->ps_nice);
}
p->p_slptime = 0;
@@ -510,7 +512,6 @@ setpriority(struct proc *p, uint32_t newcpu, uint8_t nice)
newprio = min((PUSER + newcpu + NICE_WEIGHT * (nice - NZERO)), MAXPRI);
SCHED_ASSERT_LOCKED();
- p->p_estcpu = newcpu;
p->p_usrpri = newprio;
}
@@ -533,14 +534,12 @@ schedclock(struct proc *p)
{
struct cpu_info *ci = curcpu();
struct schedstate_percpu *spc = &ci->ci_schedstate;
- uint32_t newcpu;
if (p == spc->spc_idleproc || spc->spc_spinning)
return;
SCHED_LOCK();
- newcpu = ESTCPULIM(p->p_estcpu + 1);
- setpriority(p, newcpu, p->p_p->ps_nice);
+ setpriority(p, READ_ONCE(p->p_estcpu), READ_ONCE(p->p_p->ps_nice));
SCHED_UNLOCK();
}
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -357,6 +357,7 @@ struct p_inentry {
* l read only reference, see lim_read_enter()
* o owned (modified only) by this thread
* m this proc's `p->p_p->ps_mtx'
+ * a atomic operations
*/
struct proc {
TAILQ_ENTRY(proc) p_runq; /* [S] current run/sleep queue */
@@ -414,7 +415,7 @@ struct proc {
char p_name[_MAXCOMLEN]; /* thread name, incl NUL */
u_char p_slppri; /* [S] Sleeping priority */
u_char p_usrpri; /* [S] Priority based on p_estcpu & ps_nice */
- u_int p_estcpu; /* [S] Time averaged val of p_cpticks */
+ u_int p_estcpu; /* [a] Scheduler's CPU usage metric */
int p_pledge_syscall; /* Cache of current syscall */
uint64_t p_pledge; /* [o] copy of p_p->ps_pledge */
diff --git a/sys/sys/sched.h b/sys/sys/sched.h
--- a/sys/sys/sched.h
+++ b/sys/sys/sched.h
@@ -194,7 +194,10 @@ void remrunqueue(struct proc *);
/* Chargeback parents for the sins of their children. */
#define scheduler_wait_hook(parent, child) do { \
- (parent)->p_estcpu = ESTCPULIM((parent)->p_estcpu + (child)->p_estcpu);\
+ uint32_t __p_estcpu = READ_ONCE((parent)->p_estcpu); \
+ uint32_t __c_estcpu = READ_ONCE((child)->p_estcpu); \
+ WRITE_ONCE((parent)->p_estcpu, \
+ ESTCPULIM(__p_estcpu + __c_estcpu)); \
} while (0)
/* Allow other processes to progress */
diff --git a/sys/sys/sysctl.h b/sys/sys/sysctl.h
--- a/sys/sys/sysctl.h
+++ b/sys/sys/sysctl.h
@@ -625,7 +625,7 @@ do { \
\
(kp)->p_jobc = (pg)->pg_jobc; \
\
- (kp)->p_estcpu = (p)->p_estcpu; \
+ (kp)->p_estcpu = READ_ONCE((p)->p_estcpu); \
if (isthread) { \
(kp)->p_tid = (p)->p_tid + THREAD_PID_OFFSET; \
strlcpy((kp)->p_name, (p)->p_name, sizeof((kp)->p_name)); \