Move p_estcpu accumulation from schedclock to tuagg_add_runtime
Currently, p_estcpu is incremented in schedclock() under SCHED_LOCK for the running process. This diff moves the accumulation to tuagg_add_runtime(), which is called on every context switch to track per-thread runtime.
This change:
- Removes estcpu increment from the periodic clock interrupt path
- Bases estcpu growth on measured runtime instead of tick count
- Prepares for future work on usrpri caching and SCHED_LOCK reduction
setpriority() no longer writes p_estcpu; it only computes p_usrpri from the estcpu value passed in. schedclock() now merely recomputes usrpri so the cached value stays current. All accesses to p_estcpu now go through READ_ONCE/WRITE_ONCE.
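For reference on the accessors: READ_ONCE/WRITE_ONCE force a single, untorn load or store through a volatile cast, so unlocked readers such as the sysctl kinfo copy in the last hunk see either the old or the new value, never a partial one. Below is a rough user-space sketch of the pattern used in tuagg_add_runtime(); the macro definitions and the ESTCPULIM bound are simplified stand-ins, not the kernel's real ones.

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for the kernel's READ_ONCE/WRITE_ONCE. */
#define READ_ONCE(x)		(*(volatile __typeof__(x) *)&(x))
#define WRITE_ONCE(x, v)	(*(volatile __typeof__(x) *)&(x) = (v))

/* Illustrative clamp only; the real ESTCPULIM bound differs. */
#define ESTCPULIM(e)		((e) > 255 ? 255 : (e))

struct fake_proc {
	uint32_t p_estcpu;
};

int
main(void)
{
	struct fake_proc p = { .p_estcpu = 40 };
	uint32_t oldcpu, newcpu;

	/* Same shape as the update in tuagg_add_runtime(). */
	oldcpu = READ_ONCE(p.p_estcpu);
	newcpu = ESTCPULIM(oldcpu + 3);
	WRITE_ONCE(p.p_estcpu, newcpu);

	printf("estcpu %u -> %u\n", oldcpu, newcpu);
	return 0;
}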
There is a slight accounting change: usage is rounded up to the nearest 10 ms, matching the previous tick-based granularity.
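To make the rounding concrete: the increment is the ceiling of delta_ms / 10, so even 1 ms of runtime charges one unit, the same as one old-style tick would have. A trivial stand-alone check of the arithmetic (plain C, nothing kernel-specific assumed):

#include <assert.h>
#include <stdint.h>

/* Round runtime in milliseconds up to 10 ms units, as in the diff. */
static uint64_t
estcpu_increment(uint64_t delta_ms)
{
	return (delta_ms + 9) / 10;
}

int
main(void)
{
	assert(estcpu_increment(1) == 1);	/* 1 ms still charges one unit */
	assert(estcpu_increment(10) == 1);	/* exactly one 10 ms tick */
	assert(estcpu_increment(11) == 2);	/* rounded up, not truncated */
	assert(estcpu_increment(95) == 10);
	return 0;
}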
Tested with kernel compiles at -j1, -j6, -j8, and -j300 on a 6-core amd64 VM; no measurable performance difference.
Tim
--
diff --git a/sys/kern/kern_resource.c b/sys/kern/kern_resource.c
--- a/sys/kern/kern_resource.c
+++ b/sys/kern/kern_resource.c
@@ -440,6 +440,8 @@ tuagg_add_runtime(void)
struct proc *p = curproc;
struct timespec ts, delta;
unsigned int gen;
+ uint64_t delta_ms;
+ uint32_t oldcpu, newcpu;
/*
* Compute the amount of time during which the current
@@ -463,6 +465,17 @@ tuagg_add_runtime(void)
gen = tu_enter(&p->p_tu);
timespecadd(&p->p_tu.tu_runtime, &delta, &p->p_tu.tu_runtime);
tu_leave(&p->p_tu, gen);
+
+ /*
+ * Increase scheduler's estcpu estimate from runtime.
+ * Any runtime, even 1ms, increments estcpu by at least 1.
+ */
+ delta_ms = (uint64_t)delta.tv_sec * 1000 + delta.tv_nsec / 1000000;
+ if (delta_ms > 0) {
+ oldcpu = READ_ONCE(p->p_estcpu);
+ newcpu = ESTCPULIM(oldcpu + (delta_ms + 9) / 10);
+ WRITE_ONCE(p->p_estcpu, newcpu);
+ }
}
/*
diff --git a/sys/kern/sched_bsd.c b/sys/kern/sched_bsd.c
--- a/sys/kern/sched_bsd.c
+++ b/sys/kern/sched_bsd.c
@@ -271,7 +271,8 @@ schedcpu(void *unused)
#endif
p->p_pctcpu = pctcpu;
p->p_cpticks2 = cpt;
- newcpu = (u_int) decay_cpu(loadfac, p->p_estcpu);
+ newcpu = (u_int) decay_cpu(loadfac, READ_ONCE(p->p_estcpu));
+ WRITE_ONCE(p->p_estcpu, newcpu);
setpriority(p, newcpu, p->p_p->ps_nice);
if (p->p_stat == SRUN &&
@@ -493,7 +494,8 @@ setrunnable(struct proc *p)
if (p->p_slptime > 1) {
uint32_t newcpu;
- newcpu = decay_aftersleep(p->p_estcpu, p->p_slptime);
+ newcpu = decay_aftersleep(READ_ONCE(p->p_estcpu), p->p_slptime);
+ WRITE_ONCE(p->p_estcpu, newcpu);
setpriority(p, newcpu, pr->ps_nice);
}
p->p_slptime = 0;
@@ -510,7 +512,6 @@ setpriority(struct proc *p, uint32_t newcpu, uint8_t nice)
newprio = min((PUSER + newcpu + NICE_WEIGHT * (nice - NZERO)), MAXPRI);
SCHED_ASSERT_LOCKED();
- p->p_estcpu = newcpu;
p->p_usrpri = newprio;
}
@@ -533,14 +534,12 @@ schedclock(struct proc *p)
{
struct cpu_info *ci = curcpu();
struct schedstate_percpu *spc = &ci->ci_schedstate;
- uint32_t newcpu;
if (p == spc->spc_idleproc || spc->spc_spinning)
return;
SCHED_LOCK();
- newcpu = ESTCPULIM(p->p_estcpu + 1);
- setpriority(p, newcpu, p->p_p->ps_nice);
+ setpriority(p, READ_ONCE(p->p_estcpu), READ_ONCE(p->p_p->ps_nice));
SCHED_UNLOCK();
}
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -357,6 +357,7 @@ struct p_inentry {
* l read only reference, see lim_read_enter()
* o owned (modified only) by this thread
* m this proc's `p->p_p->ps_mtx'
+ * a atomic operations
*/
struct proc {
TAILQ_ENTRY(proc) p_runq; /* [S] current run/sleep queue */
@@ -414,7 +415,7 @@ struct proc {
char p_name[_MAXCOMLEN]; /* thread name, incl NUL */
u_char p_slppri; /* [S] Sleeping priority */
u_char p_usrpri; /* [S] Priority based on p_estcpu & ps_nice */
- u_int p_estcpu; /* [S] Time averaged val of p_cpticks */
+ u_int p_estcpu; /* [a] Scheduler's CPU usage metric */
int p_pledge_syscall; /* Cache of current syscall */
uint64_t p_pledge; /* [o] copy of p_p->ps_pledge */
diff --git a/sys/sys/sched.h b/sys/sys/sched.h
--- a/sys/sys/sched.h
+++ b/sys/sys/sched.h
@@ -194,7 +194,10 @@ void remrunqueue(struct proc *);
/* Chargeback parents for the sins of their children. */
#define scheduler_wait_hook(parent, child) do { \
- (parent)->p_estcpu = ESTCPULIM((parent)->p_estcpu + (child)->p_estcpu);\
+ uint32_t __p_estcpu = READ_ONCE((parent)->p_estcpu); \
+ uint32_t __c_estcpu = READ_ONCE((child)->p_estcpu); \
+ WRITE_ONCE((parent)->p_estcpu, \
+ ESTCPULIM(__p_estcpu + __c_estcpu)); \
} while (0)
/* Allow other processes to progress */
diff --git a/sys/sys/sysctl.h b/sys/sys/sysctl.h
--- a/sys/sys/sysctl.h
+++ b/sys/sys/sysctl.h
@@ -625,7 +625,7 @@ do { \
\
(kp)->p_jobc = (pg)->pg_jobc; \
\
- (kp)->p_estcpu = (p)->p_estcpu; \
+ (kp)->p_estcpu = READ_ONCE((p)->p_estcpu); \
if (isthread) { \
(kp)->p_tid = (p)->p_tid + THREAD_PID_OFFSET; \
strlcpy((kp)->p_name, (p)->p_name, sizeof((kp)->p_name)); \