Shift several proc.h fields out of SCHED_LOCK
tech@,
This patch moves the proc.h fields p_cpu, p_estcpu, p_slptime, p_slppri, p_runpri, and p_usrpri out from under SCHED_LOCK. To prevent torn reads, the multi-byte fields are written with atomic stores: atomic_store_int() for p_estcpu, and atomic_store_long() for p_cpu, with the pointer value cast to unsigned long. The single-byte fields (p_slppri, p_runpri, p_usrpri) are written directly, since byte stores cannot tear; p_slptime is a u_int, but a naturally aligned word store is likewise untorn on our platforms, so it too is written directly.

The trade-off is small windows in which cross-CPU scheduling decisions may see stale data. The impact should be minimal: a priority or CPU ID that is stale for a single tick can at worst produce a slightly sub-optimal placement, not a correctness problem such as a crash or deadlock.
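For illustration, here is roughly how an unlocked cross-CPU reader would pair with these stores. This is a sketch only, not part of the diff; it assumes a valid struct proc *p and uses the existing atomic_load_int()/atomic_load_long() primitives from <sys/atomic.h>:

	/*
	 * Sketch: untorn loads of the multi-byte fields.  The values
	 * may be up to a tick stale, which is acceptable here.
	 */
	u_int estcpu = atomic_load_int(&p->p_estcpu);
	struct cpu_info *ci =
	    (struct cpu_info *)atomic_load_long((unsigned long *)&p->p_cpu);
	u_char usrpri = p->p_usrpri;	/* byte load, untorn by nature */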
With setpriority() no longer requiring SCHED_LOCK, schedclock() and donice() in kern_resource.c can run without it, and the locked region in schedcpu() shrinks. The contention reduction is minimal but non-zero.
SCHED_LOCK's scope narrows to protecting the runqueues, the sleep queues, and p_stat/p_wchan/p_wmesg.
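To make the resulting discipline concrete, an illustrative sketch (not from the diff): fields still annotated [S] keep taking the lock, while the fields moved out of it may be read directly:

	SCHED_LOCK();
	char stat = p->p_stat;		/* [S] still requires the lock */
	const char *wmesg = p->p_wmesg;	/* [S] */
	SCHED_UNLOCK();

	u_char prio = p->p_usrpri;	/* unlocked read is now fine */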
--
Tim Leslie
diff --git a/sys/kern/kern_resource.c b/sys/kern/kern_resource.c
--- a/sys/kern/kern_resource.c
+++ b/sys/kern/kern_resource.c
@@ -211,11 +211,9 @@ donice(struct proc *curp, struct process *chgpr, int n)
return (EACCES);
chgpr->ps_nice = n;
mtx_enter(&chgpr->ps_mtx);
- SCHED_LOCK();
TAILQ_FOREACH(p, &chgpr->ps_threads, p_thr_link) {
setpriority(p, p->p_estcpu, n);
}
- SCHED_UNLOCK();
mtx_leave(&chgpr->ps_mtx);
return (0);
}
diff --git a/sys/kern/kern_sched.c b/sys/kern/kern_sched.c
--- a/sys/kern/kern_sched.c
+++ b/sys/kern/kern_sched.c
@@ -278,7 +278,7 @@ setrunqueue(struct cpu_info *ci, struct proc *p, uint8_t prio)
KASSERT(p->p_wchan == NULL);
KASSERT(!ISSET(p->p_flag, P_INSCHED));
- p->p_cpu = ci;
+ atomic_store_long((unsigned long *)&p->p_cpu, (unsigned long)ci);
p->p_stat = SRUN;
p->p_runpri = prio;
@@ -537,7 +537,7 @@ sched_steal_proc(struct cpu_info *self)
best->p_p->ps_pid, CPU_INFO_UNIT(self));
remrunqueue(best);
- best->p_cpu = self;
+ atomic_store_long((unsigned long *)&best->p_cpu, (unsigned long)self);
sched_stolen++;
#endif
diff --git a/sys/kern/sched_bsd.c b/sys/kern/sched_bsd.c
--- a/sys/kern/sched_bsd.c
+++ b/sys/kern/sched_bsd.c
@@ -255,7 +255,6 @@ schedcpu(void *unused)
p->p_pctcpu = pctcpu;
continue;
}
- SCHED_LOCK();
/*
* p_pctcpu is only for diagnostic tools such as ps.
*/
@@ -274,6 +273,7 @@ schedcpu(void *unused)
newcpu = (u_int) decay_cpu(loadfac, p->p_estcpu);
setpriority(p, newcpu, p->p_p->ps_nice);
+ SCHED_LOCK();
if (p->p_stat == SRUN &&
(p->p_runpri / SCHED_PPQ) != (p->p_usrpri / SCHED_PPQ)) {
remrunqueue(p);
@@ -509,8 +509,7 @@ setpriority(struct proc *p, uint32_t newcpu, uint8_t nice)
newprio = min((PUSER + newcpu + NICE_WEIGHT * (nice - NZERO)), MAXPRI);
- SCHED_ASSERT_LOCKED();
- p->p_estcpu = newcpu;
+ atomic_store_int(&p->p_estcpu, newcpu);
p->p_usrpri = newprio;
}
@@ -538,10 +537,8 @@ schedclock(struct proc *p)
if (p == spc->spc_idleproc || spc->spc_spinning)
return;
- SCHED_LOCK();
newcpu = ESTCPULIM(p->p_estcpu + 1);
setpriority(p, newcpu, p->p_p->ps_nice);
- SCHED_UNLOCK();
}
void (*cpu_setperf)(int);
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -373,7 +373,7 @@ struct proc {
int p_flag; /* P_* flags. */
u_char p_spare; /* unused */
char p_stat; /* [S] S* process status. */
- u_char p_runpri; /* [S] Runqueue priority */
+ u_char p_runpri; /* Runqueue priority */
u_char p_descfd; /* if not 255, fdesc permits this fd */
pid_t p_tid; /* Thread identifier. */
@@ -390,8 +390,8 @@ struct proc {
struct timeout p_sleep_to;/* timeout for tsleep() */
const char *p_wmesg; /* [S] Reason for sleep. */
volatile fixpt_t p_pctcpu; /* [a] %cpu for this thread */
- u_int p_slptime; /* [S] Time since last blocked. */
- struct cpu_info * volatile p_cpu; /* [S] CPU we're running on. */
+ u_int p_slptime; /* Time since last blocked. */
+ struct cpu_info * volatile p_cpu; /* [a] CPU we're running on. */
struct rusage p_ru; /* Statistics */
struct tusage p_tu; /* [o] accumulated times. */
@@ -412,9 +412,9 @@ struct proc {
sigset_t p_sigmask; /* [o] Current signal mask */
char p_name[_MAXCOMLEN]; /* thread name, incl NUL */
- u_char p_slppri; /* [S] Sleeping priority */
- u_char p_usrpri; /* [S] Priority based on p_estcpu & ps_nice */
- u_int p_estcpu; /* [S] Time averaged val of p_cpticks */
+ u_char p_slppri; /* Sleeping priority */
+ u_char p_usrpri; /* Priority based on p_estcpu & ps_nice */
+ u_int p_estcpu; /* [a] Time averaged val of p_cpticks */
int p_pledge_syscall; /* Cache of current syscall */
uint64_t p_pledge; /* [o] copy of p_p->ps_pledge */