
From: Tim Leslie <tleslie@protonmail.com>
Subject: Move p_estcpu accumulation from schedclock to tuagg_add_runtime
To: "tech@openbsd.org" <tech@openbsd.org>
Date: Thu, 05 Feb 2026 00:56:22 +0000

Currently p_estcpu is incremented in schedclock() under SCHED_LOCK for the running process. This diff moves the accumulation to tuagg_add_runtime(), which is called on every context switch to track per-thread runtime.

This change:
- Removes estcpu increment from the periodic clock interrupt path
- Bases estcpu growth on measured runtime instead of tick count
- Prepares for future work on usrpri caching and SCHED_LOCK reduction

setpriority() no longer writes p_estcpu; it only computes usrpri from the proc's estcpu field. schedclock() now only recomputes usrpri to update the cached value. All accesses to p_estcpu now use READ_ONCE/WRITE_ONCE.
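
For reference, the usrpri computation that setpriority() keeps doing is the min() expression visible in the sched_bsd.c hunk below. Here is a standalone sketch of that arithmetic; the constant values are illustrative stand-ins of mine, not the real definitions from the kernel headers:

#include <stdint.h>
#include <stdio.h>

#define PUSER		50	/* illustrative stand-in */
#define NICE_WEIGHT	2	/* illustrative stand-in */
#define NZERO		20	/* illustrative stand-in */
#define MAXPRI		127	/* illustrative stand-in */

/* Rough userland model of the usrpri calculation kept in setpriority(). */
static uint32_t
usrpri_of(uint32_t estcpu, int nice)
{
	uint32_t prio = PUSER + estcpu + NICE_WEIGHT * (nice - NZERO);

	return prio < MAXPRI ? prio : MAXPRI;
}

int
main(void)
{
	/*
	 * A thread that has accumulated more estcpu gets a numerically
	 * larger, i.e. worse, user priority.
	 */
	printf("%u %u\n", (unsigned)usrpri_of(0, NZERO),
	    (unsigned)usrpri_of(36, NZERO));
	return 0;
}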

There is a slight accounting change: usage is rounded up to the nearest 10ms, matching the previous tick-based granularity.
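
Concretely, the mapping from measured runtime to estcpu in the tuagg_add_runtime() hunk is a ceiling division by 10ms. A minimal userland sketch of the same arithmetic (the helper name is mine, not from the diff):

#include <stdint.h>
#include <stdio.h>

/* Ceiling division as in the diff: any nonzero runtime costs at least 1. */
static uint32_t
estcpu_increment(uint64_t delta_ms)
{
	return (uint32_t)((delta_ms + 9) / 10);
}

int
main(void)
{
	/*
	 * 1ms -> 1, 10ms -> 1, 11ms -> 2, 95ms -> 10: usage is charged
	 * in whole 10ms units, rounded up.
	 */
	printf("%u %u %u %u\n", estcpu_increment(1), estcpu_increment(10),
	    estcpu_increment(11), estcpu_increment(95));
	return 0;
}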

Tested with kernel compiles at -j1, -j6, -j8, and -j300 on an amd64 6-core VM, with no measurable performance difference.

Tim 

--

diff --git a/sys/kern/kern_resource.c b/sys/kern/kern_resource.c
--- a/sys/kern/kern_resource.c
+++ b/sys/kern/kern_resource.c
@@ -440,6 +440,8 @@ tuagg_add_runtime(void)
 	struct proc *p = curproc;
 	struct timespec ts, delta;
 	unsigned int gen;
+	uint64_t delta_ms;
+	uint32_t oldcpu, newcpu;

 	/*
 	 * Compute the amount of time during which the current
@@ -463,6 +465,17 @@ tuagg_add_runtime(void)
 	gen = tu_enter(&p->p_tu);
 	timespecadd(&p->p_tu.tu_runtime, &delta, &p->p_tu.tu_runtime);
 	tu_leave(&p->p_tu, gen);
+
+	/*
+	 * Increase scheduler's estcpu estimate from runtime.
+	 * Any runtime, even 1ms, increments estcpu by at least 1.
+	 */
+	delta_ms = (uint64_t)delta.tv_sec * 1000 + delta.tv_nsec / 1000000;
+	if (delta_ms > 0) {
+		oldcpu = READ_ONCE(p->p_estcpu);
+		newcpu = ESTCPULIM(oldcpu + (delta_ms + 9) / 10);
+		WRITE_ONCE(p->p_estcpu, newcpu);
+	}
 }

 /*
diff --git a/sys/kern/sched_bsd.c b/sys/kern/sched_bsd.c
--- a/sys/kern/sched_bsd.c
+++ b/sys/kern/sched_bsd.c
@@ -271,7 +271,8 @@ schedcpu(void *unused)
 #endif
 		p->p_pctcpu = pctcpu;
 		p->p_cpticks2 = cpt;
-		newcpu = (u_int) decay_cpu(loadfac, p->p_estcpu);
+		newcpu = (u_int) decay_cpu(loadfac, READ_ONCE(p->p_estcpu));
+		WRITE_ONCE(p->p_estcpu, newcpu);
 		setpriority(p, newcpu, p->p_p->ps_nice);

 		if (p->p_stat == SRUN &&
@@ -493,7 +494,8 @@ setrunnable(struct proc *p)
 	if (p->p_slptime > 1) {
 		uint32_t newcpu;

-		newcpu = decay_aftersleep(p->p_estcpu, p->p_slptime);
+		newcpu = decay_aftersleep(READ_ONCE(p->p_estcpu), p->p_slptime);
+		WRITE_ONCE(p->p_estcpu, newcpu);
 		setpriority(p, newcpu, pr->ps_nice);
 	}
 	p->p_slptime = 0;
@@ -510,7 +512,6 @@ setpriority(struct proc *p, uint32_t newcpu, uint8_t nice)
 	newprio = min((PUSER + newcpu + NICE_WEIGHT * (nice - NZERO)), MAXPRI);

 	SCHED_ASSERT_LOCKED();
-	p->p_estcpu = newcpu;
 	p->p_usrpri = newprio;
 }

@@ -533,14 +534,12 @@ schedclock(struct proc *p)
 {
 	struct cpu_info *ci = curcpu();
 	struct schedstate_percpu *spc = &ci->ci_schedstate;
-	uint32_t newcpu;

 	if (p == spc->spc_idleproc || spc->spc_spinning)
 		return;

 	SCHED_LOCK();
-	newcpu = ESTCPULIM(p->p_estcpu + 1);
-	setpriority(p, newcpu, p->p_p->ps_nice);
+	setpriority(p, READ_ONCE(p->p_estcpu), READ_ONCE(p->p_p->ps_nice));
 	SCHED_UNLOCK();
 }

diff --git a/sys/sys/proc.h b/sys/sys/proc.h
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -357,6 +357,7 @@ struct p_inentry {
  *	l	read only reference, see lim_read_enter()
  *	o	owned (modified only) by this thread
  *	m	this proc's' `p->p_p->ps_mtx'
+ *	a	atomic operations
  */
 struct proc {
 	TAILQ_ENTRY(proc) p_runq;	/* [S] current run/sleep queue */
@@ -414,7 +415,7 @@ struct proc {
 	char	p_name[_MAXCOMLEN];	/* thread name, incl NUL */
 	u_char	p_slppri;		/* [S] Sleeping priority */
 	u_char	p_usrpri;	/* [S] Priority based on p_estcpu & ps_nice */
-	u_int	p_estcpu;		/* [S] Time averaged val of p_cpticks */
+	u_int	p_estcpu;		/* [a] Scheduler's CPU usage metric */
 	int	p_pledge_syscall;	/* Cache of current syscall */
 	uint64_t p_pledge;		/* [o] copy of p_p->ps_pledge */

diff --git a/sys/sys/sched.h b/sys/sys/sched.h
--- a/sys/sys/sched.h
+++ b/sys/sys/sched.h
@@ -194,7 +194,10 @@ void remrunqueue(struct proc *);

 /* Chargeback parents for the sins of their children.  */
 #define scheduler_wait_hook(parent, child) do {				\
-	(parent)->p_estcpu = ESTCPULIM((parent)->p_estcpu + (child)->p_estcpu);\
+	uint32_t __p_estcpu = READ_ONCE((parent)->p_estcpu);		\
+	uint32_t __c_estcpu = READ_ONCE((child)->p_estcpu);			\
+	WRITE_ONCE((parent)->p_estcpu,								\
+	    ESTCPULIM(__p_estcpu + __c_estcpu));					\
 } while (0)

 /* Allow other processes to progress */

diff --git a/sys/sys/sysctl.h b/sys/sys/sysctl.h
--- a/sys/sys/sysctl.h
+++ b/sys/sys/sysctl.h
@@ -625,7 +625,7 @@ do {									\
 									\
 	(kp)->p_jobc = (pg)->pg_jobc;					\
 									\
-	(kp)->p_estcpu = (p)->p_estcpu;					\
+	(kp)->p_estcpu = READ_ONCE((p)->p_estcpu);					\
 	if (isthread) {							\
 		(kp)->p_tid = (p)->p_tid + THREAD_PID_OFFSET;		\
 		strlcpy((kp)->p_name, (p)->p_name, sizeof((kp)->p_name)); \