From: Tim Leslie <tleslie@protonmail.com>
Subject: Shift several proc.h fields out of SCHED_LOCK
To: "tech@openbsd.org" <tech@openbsd.org>
Date: Sun, 02 Nov 2025 14:34:46 +0000

@tech,

This patch moves the proc.h fields p_cpu, p_estcpu, p_slptime, p_slppri, p_runpri and p_usrpri out from under SCHED_LOCK. To avoid torn reads, the multi-byte fields get explicit atomic stores: p_estcpu via atomic_store_int(), and p_cpu by casting the pointer value to unsigned long for atomic_store_long(). The single-byte fields (p_slppri, p_runpri, p_usrpri) are written directly, since byte stores cannot tear; p_slptime likewise keeps its plain u_int store.

Without the lock there are small windows in which cross-CPU scheduling decisions can see stale data. That tradeoff should have minimal impact: a priority or CPU pointer that is stale for a single tick at worst causes a slightly sub-optimal scheduling decision, not a correctness problem such as a crash or deadlock.
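For illustration only (not part of the diff): a hypothetical cross-CPU reader that wants an untorn snapshot of these fields would pair the new stores with the matching loads from <sys/atomic.h>, along the lines of:

	struct cpu_info *ci;
	u_int estcpu;
	u_char prio;

	/* hypothetical reader, not in this patch */
	ci = (struct cpu_info *)atomic_load_long((unsigned long *)&p->p_cpu);
	estcpu = atomic_load_int(&p->p_estcpu);
	prio = p->p_usrpri;	/* single byte, cannot tear */

Either value may be up to a tick stale, which is the window described above.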

Since setpriority() no longer requires SCHED_LOCK, schedclock() and donice() in kern_resource.c can drop it entirely, and schedcpu() now takes it over a narrower region. The contention reduction is minimal, but non-zero.

SCHED_LOCK is now limited to protecting the run queues, the sleep queues, and p_stat/p_wchan/p_wmesg.

-- 
Tim Leslie



diff --git a/sys/kern/kern_resource.c b/sys/kern/kern_resource.c
--- a/sys/kern/kern_resource.c
+++ b/sys/kern/kern_resource.c
@@ -211,11 +211,9 @@ donice(struct proc *curp, struct process *chgpr, int n)
 		return (EACCES);
 	chgpr->ps_nice = n;
 	mtx_enter(&chgpr->ps_mtx);
-	SCHED_LOCK();
 	TAILQ_FOREACH(p, &chgpr->ps_threads, p_thr_link) {
 		setpriority(p, p->p_estcpu, n);
 	}
-	SCHED_UNLOCK();
 	mtx_leave(&chgpr->ps_mtx);
 	return (0);
 }
diff --git a/sys/kern/kern_sched.c b/sys/kern/kern_sched.c
--- a/sys/kern/kern_sched.c
+++ b/sys/kern/kern_sched.c
@@ -278,7 +278,7 @@ setrunqueue(struct cpu_info *ci, struct proc *p, uint8_t prio)
 	KASSERT(p->p_wchan == NULL);
 	KASSERT(!ISSET(p->p_flag, P_INSCHED));
 
-	p->p_cpu = ci;
+	atomic_store_long((unsigned long *)&p->p_cpu, (unsigned long)ci);
 	p->p_stat = SRUN;
 	p->p_runpri = prio;
 
@@ -537,7 +537,7 @@ sched_steal_proc(struct cpu_info *self)
 	    best->p_p->ps_pid, CPU_INFO_UNIT(self));
 
 	remrunqueue(best);
-	best->p_cpu = self;
+	atomic_store_long((unsigned long *)&best->p_cpu, (unsigned long)self);
 
 	sched_stolen++;
 #endif
diff --git a/sys/kern/sched_bsd.c b/sys/kern/sched_bsd.c
--- a/sys/kern/sched_bsd.c
+++ b/sys/kern/sched_bsd.c
@@ -255,7 +255,6 @@ schedcpu(void *unused)
 			p->p_pctcpu = pctcpu;
 			continue;
 		}
-		SCHED_LOCK();
 		/*
 		 * p_pctcpu is only for diagnostic tools such as ps.
 		 */
@@ -274,6 +273,7 @@ schedcpu(void *unused)
 		newcpu = (u_int) decay_cpu(loadfac, p->p_estcpu);
 		setpriority(p, newcpu, p->p_p->ps_nice);
 
+		SCHED_LOCK();
 		if (p->p_stat == SRUN &&
 		    (p->p_runpri / SCHED_PPQ) != (p->p_usrpri / SCHED_PPQ)) {
 			remrunqueue(p);
@@ -509,8 +509,7 @@ setpriority(struct proc *p, uint32_t newcpu, uint8_t nice)
 
 	newprio = min((PUSER + newcpu + NICE_WEIGHT * (nice - NZERO)), MAXPRI);
 
-	SCHED_ASSERT_LOCKED();
-	p->p_estcpu = newcpu;
+	atomic_store_int(&p->p_estcpu, newcpu);
 	p->p_usrpri = newprio;
 }
 
@@ -538,10 +537,8 @@ schedclock(struct proc *p)
 	if (p == spc->spc_idleproc || spc->spc_spinning)
 		return;
 
-	SCHED_LOCK();
 	newcpu = ESTCPULIM(p->p_estcpu + 1);
 	setpriority(p, newcpu, p->p_p->ps_nice);
-	SCHED_UNLOCK();
 }
 
 void (*cpu_setperf)(int);
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -373,7 +373,7 @@ struct proc {
 	int	p_flag;			/* P_* flags. */
 	u_char	p_spare;		/* unused */
 	char	p_stat;			/* [S] S* process status. */
-	u_char	p_runpri;		/* [S] Runqueue priority */
+	u_char	p_runpri;		/* Runqueue priority */
 	u_char	p_descfd;		/* if not 255, fdesc permits this fd */
 
 	pid_t	p_tid;			/* Thread identifier. */
@@ -390,8 +390,8 @@ struct proc {
 	struct	timeout p_sleep_to;/* timeout for tsleep() */
 	const char *p_wmesg;		/* [S] Reason for sleep. */
 	volatile fixpt_t p_pctcpu;	/* [a] %cpu for this thread */
-	u_int	p_slptime;		/* [S] Time since last blocked. */
-	struct	cpu_info * volatile p_cpu; /* [S] CPU we're running on. */
+	u_int	p_slptime;		/* Time since last blocked. */
+	struct	cpu_info * volatile p_cpu; /* [a] CPU we're running on. */
 
 	struct	rusage p_ru;		/* Statistics */
 	struct	tusage p_tu;		/* [o] accumulated times. */
@@ -412,9 +412,9 @@ struct proc {
 	sigset_t p_sigmask;		/* [o] Current signal mask */
 
 	char	p_name[_MAXCOMLEN];	/* thread name, incl NUL */
-	u_char	p_slppri;		/* [S] Sleeping priority */
-	u_char	p_usrpri;	/* [S] Priority based on p_estcpu & ps_nice */
-	u_int	p_estcpu;		/* [S] Time averaged val of p_cpticks */
+	u_char	p_slppri;		/* Sleeping priority */
+	u_char	p_usrpri;	/* Priority based on p_estcpu & ps_nice */
+	u_int	p_estcpu;		/* [a] Time averaged val of p_cpticks */
 	int	p_pledge_syscall;	/* Cache of current syscall */
 	uint64_t p_pledge;		/* [o] copy of p_p->ps_pledge */