
From: Claudio Jeker <cjeker@diehard.n-r-g.com>
Subject: Re: Move p_estcpu accumulation from schedclock to tuagg_add_runtime
To: Tim Leslie <tleslie@protonmail.com>
Cc: "tech@openbsd.org" <tech@openbsd.org>
Date: Thu, 5 Feb 2026 07:15:11 +0100

On Thu, Feb 05, 2026 at 12:56:22AM +0000, Tim Leslie wrote:
> Currently p_estcpu is incremented in schedclock() under SCHED_LOCK for the running process. This moves the accumulation to tuagg_add_runtime(), which is called on every context switch to track per-thread runtime.

What happens if there is no context switch? Right now the priority of the
process goes up on every schedclock() run and so another process sitting
on the run queue will force a context switch at some point.

E.g. something like md5 -tttttttt tends to hog a cpu since there is
nothing going on that would force a context switch.
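A user-space toy model of that feedback loop (not kernel code; the PUSER/MAXPRI
values and the estcpu-to-usrpri mapping below are simplified assumptions, nice
is ignored):

/*
 * Old path: schedclock() bumps estcpu on every run, so a cpu hog's usrpri
 * keeps getting worse and a runnable process with a better priority will
 * preempt it sooner or later.  New path: estcpu only changes in
 * tuagg_add_runtime(), i.e. at mi_switch(), so without a context switch
 * the hog's usrpri never degrades.
 */
#include <stdio.h>

#define PUSER	50
#define MAXPRI	127

int
main(void)
{
	unsigned int estcpu = 0, usrpri = PUSER;
	int tick;

	for (tick = 1; tick <= 40; tick++) {
		estcpu++;			/* schedclock() on curproc */
		usrpri = PUSER + estcpu;	/* setpriority(), nice = 0 */
		if (usrpri > MAXPRI)
			usrpri = MAXPRI;
	}
	printf("old: after %d ticks usrpri degraded to %u\n", tick - 1, usrpri);
	printf("new: without a switch usrpri stays at %d\n", PUSER);
	return 0;
}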
 
> This change:
> - Removes estcpu increment from the periodic clock interrupt path
> - Bases estcpu growth on measured runtime instead of tick count
> - Prepares for future work on usrpri caching and SCHED_LOCK reduction
> 
> setpriority() no longer writes p_estcpu; it only computes usrpri from the estcpu value passed in. schedclock() now only recomputes usrpri to update the cached value. All accesses to p_estcpu now use READ_ONCE/WRITE_ONCE.
> 
> There is a slight accounting change: usage is rounded up to the nearest 10ms, matching the previous tick-based granularity.
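For reference, the round-up is just a ceiling division by 10; a standalone
snippet with a few worked values (it mirrors the (delta_ms + 9) / 10
expression from the tuagg_add_runtime() hunk below):

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t ms[] = { 1, 9, 10, 11, 23 };	/* measured runtime in ms */
	size_t i;

	/* prints 1->1, 9->1, 10->1, 11->2, 23->3 estcpu increments */
	for (i = 0; i < sizeof(ms) / sizeof(ms[0]); i++)
		printf("%llu ms -> +%llu\n", (unsigned long long)ms[i],
		    (unsigned long long)((ms[i] + 9) / 10));
	return 0;
}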
> 
> Tested with kernel compiles at -j1, -j6, -j8, -j300 on an amd64 6-core VM with no measurable performance difference.
> 
> Tim 
> 
> --
> 
> diff --git a/sys/kern/kern_resource.c b/sys/kern/kern_resource.c
> --- a/sys/kern/kern_resource.c
> +++ b/sys/kern/kern_resource.c
> @@ -440,6 +440,8 @@ tuagg_add_runtime(void)
>  	struct proc *p = curproc;
>  	struct timespec ts, delta;
>  	unsigned int gen;
> +	uint64_t delta_ms;
> +	uint32_t oldcpu, newcpu;
> 
>  	/*
>  	 * Compute the amount of time during which the current
> @@ -463,6 +465,17 @@ tuagg_add_runtime(void)
>  	gen = tu_enter(&p->p_tu);
>  	timespecadd(&p->p_tu.tu_runtime, &delta, &p->p_tu.tu_runtime);
>  	tu_leave(&p->p_tu, gen);
> +
> +	/*
> +	 * Increase scheduler's estcpu estimate from runtime.
> +	 * Any runtime, even 1ms, increments estcpu by at least 1.
> +	 */
> +	delta_ms = (uint64_t)delta.tv_sec * 1000 + delta.tv_nsec / 1000000;
> +	if (delta_ms > 0) {
> +		oldcpu = READ_ONCE(p->p_estcpu);
> +		newcpu = ESTCPULIM(oldcpu + (delta_ms + 9) / 10);
> +		WRITE_ONCE(p->p_estcpu, newcpu);
> +	}
>  }
> 
>  /*
> diff --git a/sys/kern/sched_bsd.c b/sys/kern/sched_bsd.c
> --- a/sys/kern/sched_bsd.c
> +++ b/sys/kern/sched_bsd.c
> @@ -271,7 +271,8 @@ schedcpu(void *unused)
>  #endif
>  		p->p_pctcpu = pctcpu;
>  		p->p_cpticks2 = cpt;
> -		newcpu = (u_int) decay_cpu(loadfac, p->p_estcpu);
> +		newcpu = (u_int) decay_cpu(loadfac, READ_ONCE(p->p_estcpu));
> +		WRITE_ONCE(p->p_estcpu, newcpu);
>  		setpriority(p, newcpu, p->p_p->ps_nice);
> 
>  		if (p->p_stat == SRUN &&
> @@ -493,7 +494,8 @@ setrunnable(struct proc *p)
>  	if (p->p_slptime > 1) {
>  		uint32_t newcpu;
> 
> -		newcpu = decay_aftersleep(p->p_estcpu, p->p_slptime);
> +		newcpu = decay_aftersleep(READ_ONCE(p->p_estcpu), p->p_slptime);
> +		WRITE_ONCE(p->p_estcpu, newcpu);
>  		setpriority(p, newcpu, pr->ps_nice);
>  	}
>  	p->p_slptime = 0;
> @@ -510,7 +512,6 @@ setpriority(struct proc *p, uint32_t newcpu, uint8_t nice)
>  	newprio = min((PUSER + newcpu + NICE_WEIGHT * (nice - NZERO)), MAXPRI);
> 
>  	SCHED_ASSERT_LOCKED();
> -	p->p_estcpu = newcpu;
>  	p->p_usrpri = newprio;
>  }
> 
> @@ -533,14 +534,12 @@ schedclock(struct proc *p)
>  {
>  	struct cpu_info *ci = curcpu();
>  	struct schedstate_percpu *spc = &ci->ci_schedstate;
> -	uint32_t newcpu;
> 
>  	if (p == spc->spc_idleproc || spc->spc_spinning)
>  		return;
> 
>  	SCHED_LOCK();
> -	newcpu = ESTCPULIM(p->p_estcpu + 1);
> -	setpriority(p, newcpu, p->p_p->ps_nice);
> +	setpriority(p, READ_ONCE(p->p_estcpu), READ_ONCE(p->p_p->ps_nice));
>  	SCHED_UNLOCK();
>  }
> 
> diff --git a/sys/sys/proc.h b/sys/sys/proc.h
> --- a/sys/sys/proc.h
> +++ b/sys/sys/proc.h
> @@ -357,6 +357,7 @@ struct p_inentry {
>   *	l	read only reference, see lim_read_enter()
>   *	o	owned (modified only) by this thread
>   *	m	this proc's' `p->p_p->ps_mtx'
>   */
>  struct proc {
>  	TAILQ_ENTRY(proc) p_runq;	/* [S] current run/sleep queue */
> @@ -414,7 +415,7 @@ struct proc {
>  	char	p_name[_MAXCOMLEN];	/* thread name, incl NUL */
>  	u_char	p_slppri;		/* [S] Sleeping priority */
>  	u_char	p_usrpri;	/* [S] Priority based on p_estcpu & ps_nice */
> -	u_int	p_estcpu;		/* [S] Time averaged val of p_cpticks */
> +	u_int	p_estcpu;		/* [a] Scheduler's CPU usage metric */
>  	int	p_pledge_syscall;	/* Cache of current syscall */
>  	uint64_t p_pledge;		/* [o] copy of p_p->ps_pledge */
> 
> diff --git a/sys/sys/sched.h b/sys/sys/sched.h
> --- a/sys/sys/sched.h
> +++ b/sys/sys/sched.h
> @@ -194,7 +194,10 @@ void remrunqueue(struct proc *);
> 
>  /* Chargeback parents for the sins of their children.  */
>  #define scheduler_wait_hook(parent, child) do {				\
> -	(parent)->p_estcpu = ESTCPULIM((parent)->p_estcpu + (child)->p_estcpu);\
> +	uint32_t __p_estcpu = READ_ONCE((parent)->p_estcpu);		\
> +	uint32_t __c_estcpu = READ_ONCE((child)->p_estcpu);			\
> +	WRITE_ONCE((parent)->p_estcpu,								\
> +	    ESTCPULIM(__p_estcpu + __c_estcpu));					\
>  } while (0)
> 
>  /* Allow other processes to progress */
> 
> diff --git a/sys/sys/sysctl.h b/sys/sys/sysctl.h
> --- a/sys/sys/sysctl.h
> +++ b/sys/sys/sysctl.h
> @@ -625,7 +625,7 @@ do {									\
>  									\
>  	(kp)->p_jobc = (pg)->pg_jobc;					\
>  									\
> -	(kp)->p_estcpu = (p)->p_estcpu;					\
> +	(kp)->p_estcpu = READ_ONCE((p)->p_estcpu);					\
>  	if (isthread) {							\
>  		(kp)->p_tid = (p)->p_tid + THREAD_PID_OFFSET;		\
>  		strlcpy((kp)->p_name, (p)->p_name, sizeof((kp)->p_name)); \
> 

-- 
:wq Claudio