Patch: Refactor scheduler choosecpu function
All,
Attached below is a patch for the scheduler. Most substantially, it removes the complex cpu transfer cost calculation and instead uses run-queue length (the number of runnable threads) as the heuristic for both process placement and steal decisions. It also tweaks sched_choosecpu() to preserve process-cpu affinity and to grab the already-idle current cpu faster, and adds a few more comments. No sched locks were harmed in the making of this patch.
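To make the heuristic concrete, here is a toy userland model (the names cpu_nrun, choose_cpu and steal_victim are invented for illustration; the real code reads spc_nrun per cpu): placement picks the cpu with the shortest run queue, and an idle cpu steals from the longest one.

/*
 * Toy model of the new heuristic, in plain userland C.
 * Not kernel code; all names here are made up.
 */
#include <stdio.h>

#define NCPU 6

/* Pretend per-cpu run-queue lengths. */
static int cpu_nrun[NCPU] = { 3, 0, 2, 5, 1, 4 };

/* Placement: pick the cpu with the shortest run queue. */
static int
choose_cpu(void)
{
	int i, best = 0;

	for (i = 1; i < NCPU; i++)
		if (cpu_nrun[i] < cpu_nrun[best])
			best = i;
	return (best);
}

/* Stealing: an idle cpu raids the longest non-empty run queue. */
static int
steal_victim(int self)
{
	int i, best = -1;

	for (i = 0; i < NCPU; i++) {
		if (i == self || cpu_nrun[i] == 0)
			continue;
		if (best == -1 || cpu_nrun[i] > cpu_nrun[best])
			best = i;
	}
	return (best);
}

int
main(void)
{
	/* With the table above: place on cpu1, steal from cpu3. */
	printf("place on cpu%d, steal from cpu%d\n",
	    choose_cpu(), steal_victim(1));
	return (0);
}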
To get some numbers, I compiled the kernel on a 6-core VM, comparing the current (c) and patched (p) versions:
-j1
c: 955.55 real, 639.12 user, 268.73 sys
p: 927.49 real, 634.63 user, 191.79 sys
-j2
c: 530.12 real, 658.38 user, 312.95 sys
p: 515.89 real, 656.91 user, 242.68 sys
-j3
c: 389.31 real, 688.62 user, 388.38 sys
p: 378.53 real, 666.87 user, 272.60 sys
-j4
c: 319.53 real, 679.70 user, 459.68 sys
p: 302.75 real, 677.95 user, 318.58 sys
-j5
c: 288.11 real, 718.70 user, 501.19 sys
p: 263.33 real, 704.05 user, 362.25 sys
-j6 (1 process per cpu)
c: 275.47 real, 789.24 user, 559.34 sys
p: 255.25 real, 771.28 user, 389.47 sys
-j8 (lowest clock time for current)
c: 267.78 real, 781.21 user, 577.53 sys
p: 252.92 real, 802.24 user, 456.29 sys
-j10
c: 280.30 real, 808.41 user, 614.31 sys
p: 251.93 real, 808.56 user, 475.82 sys
-j12 (lowest clock time for patched)
c: 277.11 real, 798.15 user, 600.36 sys
p: 250.84 real, 808.21 user, 470.69 sys
-j16
c: 281.15 real, 793.20 user, 619.98 sys
p: 253.20 real, 804.40 user, 489.67 sys
-j24
c: 283.27 real, 794.86 user, 626.90 sys
p: 252.64 real, 804.47 user, 478.16 sys
-j48
c: 284.13 real, 784.67 user, 625.37 sys
p: 258.54 real, 807.94 user, 492.51 sys
-j72
c: 289.20 real, 795.76 user, 640.93 sys
p: 261.72 real, 803.68 user, 505.93 sys
The patched version is a few percent faster in real time when lightly loaded (at least one idle core), and ~10% faster when moderately or highly loaded, i.e. whenever the current code runs the cost calculation for every enqueue while the patched version just reads the thread count. Sys times drop by 20% or more. Real-time scaling from -j1 to -j6 improves from 3.47x (current) to 3.63x (patched), and when job counts are pushed past the core count, real times degrade less with the patch.
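(Those scaling figures are just the -j1/-j6 real-time ratios from the table above: 955.55 / 275.47 = 3.47 for current, 927.49 / 255.25 = 3.63 for patched.)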
Scheduler modifications are tricky territory, so wider testing with more varied workloads would be appreciated.
Tim
--
diff --git a/sys/kern/kern_sched.c b/sys/kern/kern_sched.c
index 74183e6bb68..5c9e60b29e8 100644
--- a/sys/kern/kern_sched.c
+++ b/sys/kern/kern_sched.c
@@ -32,7 +32,6 @@
void sched_kthreads_create(void *);
-int sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p);
struct proc *sched_steal_proc(struct cpu_info *);
/*
@@ -422,56 +421,59 @@ sched_choosecpu_fork(struct proc *parent, int flags)
#endif
return (curcpu());
}
-
struct cpu_info *
sched_choosecpu(struct proc *p)
{
#ifdef MULTIPROCESSOR
struct cpu_info *choice = NULL;
- int last_cost = INT_MAX;
struct cpu_info *ci;
struct cpuset set;
+ int bestcost = INT_MAX;
/*
- * If pegged to a cpu, don't allow it to move.
+ * Fast path: if the process is pegged to a CPU, or has a valid assignment
+ * that's online and not halting, keep it there.
*/
- if (p->p_flag & P_CPUPEG)
- return (p->p_cpu);
+ if (p->p_cpu != NULL) {
+ if ((p->p_flag & P_CPUPEG) ||
+ (cpu_is_online(p->p_cpu) &&
+ (p->p_cpu->ci_schedstate.spc_schedflags & SPCF_SHOULDHALT) == 0))
+ return (p->p_cpu);
+ }
sched_choose++;
/*
- * Look at all cpus that are currently idle and have nothing queued.
- * If there are none, pick the cheapest of those.
- * (idle + queued could mean that the cpu is handling an interrupt
- * at this moment and haven't had time to leave idle yet).
+ * Local idle: if the current CPU's run-queue is empty and not halting, stay here.
*/
- cpuset_complement(&set, &sched_queued_cpus, &sched_idle_cpus);
- cpuset_intersection(&set, &set, &sched_all_cpus);
+ ci = curcpu();
+ if (ci->ci_schedstate.spc_nrun == 0 &&
+ (ci->ci_schedstate.spc_schedflags & SPCF_SHOULDHALT) == 0) {
+ sched_wasidle++;
+ return (ci);
+ }
/*
- * First, just check if our current cpu is in that set, if it is,
- * this is simple.
- * Also, our cpu might not be idle, but if it's the current cpu
- * and it has nothing else queued and we're curproc, take it.
+ * Global idle search: pick any CPU whose run-queue is empty and eligible.
*/
- if (cpuset_isset(&set, p->p_cpu) ||
- (p->p_cpu == curcpu() && p->p_cpu->ci_schedstate.spc_nrun == 0 &&
- (p->p_cpu->ci_schedstate.spc_schedflags & SPCF_SHOULDHALT) == 0 &&
- curproc == p)) {
+ cpuset_copy(&set, &sched_idle_cpus);
+ cpuset_intersection(&set, &set, &sched_all_cpus);
+ if ((ci = cpuset_first(&set)) != NULL) {
sched_wasidle++;
- return (p->p_cpu);
+ return (ci);
}
- if (cpuset_first(&set) == NULL)
- cpuset_copy(&set, &sched_all_cpus);
+ /*
+ * Fallback: no idle CPU; pick the one with the shortest run queue.
+ */
+ cpuset_copy(&set, &sched_all_cpus);
while ((ci = cpuset_first(&set)) != NULL) {
- int cost = sched_proc_to_cpu_cost(ci, p);
+ int run = ci->ci_schedstate.spc_nrun;
- if (choice == NULL || cost < last_cost) {
+ if (choice == NULL || run < bestcost) {
choice = ci;
- last_cost = cost;
+ bestcost = run;
}
cpuset_del(&set, ci);
}
@@ -496,7 +498,7 @@ sched_steal_proc(struct cpu_info *self)
struct proc *best = NULL;
#ifdef MULTIPROCESSOR
struct schedstate_percpu *spc;
- int bestcost = INT_MAX;
+ int bestcost = 0;
struct cpu_info *ci;
struct cpuset set;
@@ -511,7 +513,7 @@ sched_steal_proc(struct cpu_info *self)
while ((ci = cpuset_first(&set)) != NULL) {
struct proc *p;
int queue;
- int cost;
+ int run;
cpuset_del(&set, ci);
@@ -522,11 +524,11 @@ sched_steal_proc(struct cpu_info *self)
if (p->p_flag & P_CPUPEG)
continue;
- cost = sched_proc_to_cpu_cost(self, p);
+ run = ci->ci_schedstate.spc_nrun;
- if (best == NULL || cost < bestcost) {
+ if (best == NULL || run > bestcost) {
best = p;
- bestcost = cost;
+ bestcost = run;
}
}
}
@@ -544,82 +546,6 @@ sched_steal_proc(struct cpu_info *self)
return (best);
}
-#ifdef MULTIPROCESSOR
-/*
- * Base 2 logarithm of an int. returns 0 for 0 (yeye, I know).
- */
-static int
-log2(unsigned int i)
-{
- int ret = 0;
-
- while (i >>= 1)
- ret++;
-
- return (ret);
-}
-
-/*
- * Calculate the cost of moving the proc to this cpu.
- *
- * What we want is some guesstimate of how much "performance" it will
- * cost us to move the proc here. Not just for caches and TLBs and NUMA
- * memory, but also for the proc itself. A highly loaded cpu might not
- * be the best candidate for this proc since it won't get run.
- *
- * Just total guesstimates for now.
- */
-
-int sched_cost_priority = 1;
-int sched_cost_runnable = 3;
-int sched_cost_resident = 1;
-#endif
-
-int
-sched_proc_to_cpu_cost(struct cpu_info *ci, struct proc *p)
-{
- int cost = 0;
-#ifdef MULTIPROCESSOR
- struct schedstate_percpu *spc;
- int l2resident = 0;
-
- spc = &ci->ci_schedstate;
-
- /*
- * First, account for the priority of the proc we want to move.
- * More willing to move, the lower the priority of the destination
- * and the higher the priority of the proc.
- */
- if (!cpuset_isset(&sched_idle_cpus, ci)) {
- cost += (p->p_usrpri - spc->spc_curpriority) *
- sched_cost_priority;
- cost += sched_cost_runnable;
- }
- if (cpuset_isset(&sched_queued_cpus, ci))
- cost += spc->spc_nrun * sched_cost_runnable;
-
- /*
- * Try to avoid the primary cpu as it handles hardware interrupts.
- *
- * XXX Needs to be revisited when we distribute interrupts
- * over cpus.
- */
- if (CPU_IS_PRIMARY(ci))
- cost += sched_cost_runnable;
-
- /*
- * If the proc is on this cpu already, lower the cost by how much
- * it has been running and an estimate of its footprint.
- */
- if (p->p_cpu == ci && p->p_slptime == 0) {
- l2resident =
- log2(pmap_resident_count(p->p_vmspace->vm_map.pmap));
- cost -= l2resident * sched_cost_resident;
- }
-#endif
- return (cost);
-}
-
/*
* Peg a proc to a cpu.
*/