From: Tim Leslie
Subject: Shift several proc.h fields out of SCHED_LOCK
To: "tech@openbsd.org"
Date: Sun, 02 Nov 2025 14:34:46 +0000

@tech,

This patch takes the proc.h fields p_cpu, p_estcpu, p_slptime, p_slppri,
p_runpri and p_usrpri out from under SCHED_LOCK.

The multi-byte fields are written with atomic stores to prevent torn
reads: atomic_store_int() for p_estcpu, and atomic_store_long() for
p_cpu, with the pointer value cast to unsigned long.  The single-byte
fields (p_slppri, p_runpri, p_usrpri) are written directly, since byte
stores are atomic by nature.  p_slptime is a u_int rather than a byte;
it is also written directly, on the assumption that naturally aligned
word-sized stores do not tear.

There may be small windows where cross-CPU scheduling decisions see
stale data.  This tradeoff should have minimal impact: a priority or
CPU assignment that is stale for a single tick leads at worst to a
mildly suboptimal scheduling decision, not to a correctness problem
such as a crash or deadlock.

With setpriority() no longer requiring SCHED_LOCK, schedclock() and
donice() in kern_resource.c can drop the lock entirely, and schedcpu()
holds it over a narrower region.  The contention reduction is small
but non-zero.  SCHED_LOCK is now limited to protecting the runqueues,
the sleep queues, and p_stat/p_wchan/p_wmesg.
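To make the read side concrete, lock-free consumers on other CPUs would
pair with these stores roughly as below.  This is a sketch only, not
part of the diff, and the helper names are hypothetical:

/*
 * Hypothetical helpers sketching the lock-free read side.
 * atomic_load_int()/atomic_load_long() come from <sys/atomic.h>.
 */
static inline struct cpu_info *
proc_cpu_read(struct proc *p)
{
	/* Multi-byte: atomic load pairs with atomic_store_long(). */
	return ((struct cpu_info *)atomic_load_long(
	    (unsigned long *)&p->p_cpu));
}

static inline u_int
proc_estcpu_read(struct proc *p)
{
	/* Multi-byte: atomic load pairs with atomic_store_int(). */
	return (atomic_load_int(&p->p_estcpu));
}

static inline u_char
proc_usrpri_read(struct proc *p)
{
	/* Single byte: a plain load cannot be torn. */
	return (p->p_usrpri);
}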
— Tim Leslie

diff --git a/sys/kern/kern_resource.c b/sys/kern/kern_resource.c
--- a/sys/kern/kern_resource.c
+++ b/sys/kern/kern_resource.c
@@ -211,11 +211,9 @@ donice(struct proc *curp, struct process *chgpr, int n)
 		return (EACCES);
 	chgpr->ps_nice = n;
 	mtx_enter(&chgpr->ps_mtx);
-	SCHED_LOCK();
 	TAILQ_FOREACH(p, &chgpr->ps_threads, p_thr_link) {
 		setpriority(p, p->p_estcpu, n);
 	}
-	SCHED_UNLOCK();
 	mtx_leave(&chgpr->ps_mtx);
 	return (0);
 }
diff --git a/sys/kern/kern_sched.c b/sys/kern/kern_sched.c
--- a/sys/kern/kern_sched.c
+++ b/sys/kern/kern_sched.c
@@ -278,7 +278,7 @@ setrunqueue(struct cpu_info *ci, struct proc *p, uint8_t prio)
 	KASSERT(p->p_wchan == NULL);
 	KASSERT(!ISSET(p->p_flag, P_INSCHED));
 
-	p->p_cpu = ci;
+	atomic_store_long((unsigned long *)&p->p_cpu, (unsigned long)ci);
 	p->p_stat = SRUN;
 	p->p_runpri = prio;
 
@@ -537,7 +537,7 @@ sched_steal_proc(struct cpu_info *self)
 	    best->p_p->ps_pid, CPU_INFO_UNIT(self));
 
 	remrunqueue(best);
-	best->p_cpu = self;
+	atomic_store_long((unsigned long *)&best->p_cpu, (unsigned long)self);
 
 	sched_stolen++;
 #endif
diff --git a/sys/kern/sched_bsd.c b/sys/kern/sched_bsd.c
--- a/sys/kern/sched_bsd.c
+++ b/sys/kern/sched_bsd.c
@@ -255,7 +255,6 @@ schedcpu(void *unused)
 			p->p_pctcpu = pctcpu;
 			continue;
 		}
-		SCHED_LOCK();
 		/*
 		 * p_pctcpu is only for diagnostic tools such as ps.
 		 */
@@ -274,6 +273,7 @@ schedcpu(void *unused)
 		newcpu = (u_int) decay_cpu(loadfac, p->p_estcpu);
 		setpriority(p, newcpu, p->p_p->ps_nice);
 
+		SCHED_LOCK();
 		if (p->p_stat == SRUN &&
 		    (p->p_runpri / SCHED_PPQ) != (p->p_usrpri / SCHED_PPQ)) {
 			remrunqueue(p);
@@ -509,8 +509,7 @@ setpriority(struct proc *p, uint32_t newcpu, uint8_t nice)
 	newprio = min((PUSER + newcpu + NICE_WEIGHT * (nice - NZERO)), MAXPRI);
 
-	SCHED_ASSERT_LOCKED();
-	p->p_estcpu = newcpu;
+	atomic_store_int(&p->p_estcpu, newcpu);
 	p->p_usrpri = newprio;
 }
@@ -538,10 +537,8 @@ schedclock(struct proc *p)
 	if (p == spc->spc_idleproc || spc->spc_spinning)
 		return;
 
-	SCHED_LOCK();
 	newcpu = ESTCPULIM(p->p_estcpu + 1);
 	setpriority(p, newcpu, p->p_p->ps_nice);
-	SCHED_UNLOCK();
 }
 
 void	(*cpu_setperf)(int);
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -373,7 +373,7 @@ struct proc {
 	int	p_flag;			/* P_* flags. */
 	u_char	p_spare;		/* unused */
 	char	p_stat;			/* [S] S* process status. */
-	u_char	p_runpri;		/* [S] Runqueue priority */
+	u_char	p_runpri;		/* Runqueue priority */
 	u_char	p_descfd;		/* if not 255, fdesc permits this fd */
 
 	pid_t	p_tid;			/* Thread identifier. */
@@ -390,8 +390,8 @@ struct proc {
 	struct	timeout p_sleep_to;/* timeout for tsleep() */
 	const char *p_wmesg;		/* [S] Reason for sleep. */
 	volatile fixpt_t p_pctcpu;	/* [a] %cpu for this thread */
-	u_int	p_slptime;		/* [S] Time since last blocked. */
-	struct	cpu_info * volatile p_cpu; /* [S] CPU we're running on. */
+	u_int	p_slptime;		/* Time since last blocked. */
+	struct	cpu_info * volatile p_cpu; /* [a] CPU we're running on. */
 
 	struct	rusage p_ru;		/* Statistics */
 	struct	tusage p_tu;		/* [o] accumulated times. */
@@ -412,9 +412,9 @@ struct proc {
 	sigset_t p_sigmask;		/* [o] Current signal mask */
 
 	char	p_name[_MAXCOMLEN];	/* thread name, incl NUL */
 
-	u_char	p_slppri;		/* [S] Sleeping priority */
-	u_char	p_usrpri;		/* [S] Priority based on p_estcpu & ps_nice */
-	u_int	p_estcpu;		/* [S] Time averaged val of p_cpticks */
+	u_char	p_slppri;		/* Sleeping priority */
+	u_char	p_usrpri;		/* Priority based on p_estcpu & ps_nice */
+	u_int	p_estcpu;		/* [a] Time averaged val of p_cpticks */
 	int	p_pledge_syscall;	/* Cache of current syscall */
 	uint64_t p_pledge;		/* [o] copy of p_p->ps_pledge */
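P.S. To make the stale-data window concrete: once schedclock() and
schedcpu() both run without SCHED_LOCK, their read-modify-write cycles
on p_estcpu can interleave.  An illustration only, not part of the
diff:

/*
 * CPU0: schedcpu()                    CPU1: schedclock()
 *
 * newcpu = decay_cpu(loadfac,         newcpu = ESTCPULIM(p->p_estcpu + 1);
 *     p->p_estcpu);
 * setpriority(p, newcpu, nice);       setpriority(p, newcpu, nice);
 *
 * Both paths end in atomic_store_int(&p->p_estcpu, newcpu); whichever
 * store lands last wins, and the other contribution is dropped until
 * the next pass.  The field is briefly stale, never torn.
 */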