From: Bryan Steele
Subject: Re: sys/sysctl: make hw.setperf MP safe
To: tech@openbsd.org
Date: Wed, 21 May 2025 10:17:10 -0400

On Wed, May 21, 2025 at 02:13:25PM +0200, Kirill A. Korinsky wrote:
> On Wed, 21 May 2025 13:45:22 +0200,
> "Lorenz (xha)" wrote:
> >
> > Hi Kirill,
> >
> > On Wed, May 21, 2025 at 12:37:39PM +0200, Kirill A. Korinsky wrote:
> > > tech@,
> > >
> > > Here is a diff which makes hw.setperf MP safe.
> > >
> > > Tested on -current/amd64.
> > >
> > > Feedback? Ok?
> >
> > i think this is wrong -- cpu_setperf() doesn't seem thread-safe on
> > at least arm64. also, there are some races that could happen in
> > setperf_auto with your diff (loading perflevel multiple times, which
> > could yield different values; not really the end of the world in
> > that case, but still).
> >
> > i would suggest using a mutex to protect setting hw.setperf? it
> > doesn't make much sense to set hw.setperf concurrently either way?
> >
>
> Well, setperf_auto is scheduled via a timeout declared with
> TIMEOUT_INITIALIZER, and timeout callbacks run under the kernel lock
> unless TIMEOUT_MPSAFE is used.
>
> So, I doubt that any race condition can be triggered here.
>
> But wrapping cpu_setperf() inside sysctl_hwsetperf() makes sense, and
> here is an updated diff.
>
> Thanks for catching it.
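>
> For reference, this is roughly how that timeout is declared in
> sched_bsd.c today, versus what it would take for setperf_auto to run
> without the kernel lock (a sketch from memory, not part of the diff):
>
>	/* current declaration: setperf_auto runs under the kernel lock */
>	struct timeout setperf_to = TIMEOUT_INITIALIZER(setperf_auto, NULL);
>
>	/*
>	 * hypothetical MP-safe declaration: only then would setperf_auto
>	 * itself have to worry about racing other CPUs
>	 */
>	struct timeout setperf_to = TIMEOUT_INITIALIZER_FLAGS(setperf_auto,
>	    NULL, KCLOCK_NONE, TIMEOUT_MPSAFE);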
>
> Index: sys/arch/amd64/amd64/est.c
> ===================================================================
> RCS file: /home/cvs/src/sys/arch/amd64/amd64/est.c,v
> diff -u -p -r1.42 est.c
> --- sys/arch/amd64/amd64/est.c	12 Aug 2021 15:16:23 -0000	1.42
> +++ sys/arch/amd64/amd64/est.c	21 May 2025 12:01:46 -0000
> @@ -55,6 +55,7 @@
>
>  #include
>  #include
> +#include <sys/atomic.h>
>  #include
>  #include
>
> @@ -96,8 +97,13 @@ struct fqlist {
>
>  static struct fqlist *est_fqlist;
>
> +/*
> + * Locks used to protect data:
> + *	a	atomic
> + */
> +
>  extern int setperf_prio;
> -extern int perflevel;
> +extern int perflevel;		/* [a] */
>
>  int bus_clock;
>
> @@ -324,7 +330,7 @@ est_acpi_pss_changed(struct acpicpu_pss
>  	est_fqlist = acpilist;
>
>  	if (needtran) {
> -		est_setperf(perflevel);
> +		est_setperf(atomic_load_int(&perflevel));
>  	}
>  }
>  #endif
> @@ -457,7 +463,7 @@ est_init(struct cpu_info *ci)
>  	if (low == high)
>  		goto nospeedstep;
>
> -	perflevel = (cpuspeed - low) * 100 / (high - low);
> +	atomic_store_int(&perflevel, (cpuspeed - low) * 100 / (high - low));
>
>  	printf("%s: Enhanced SpeedStep %d MHz", cpu_device, cpuspeed);
>
> Index: sys/arch/amd64/amd64/powernow-k8.c
> ===================================================================
> RCS file: /home/cvs/src/sys/arch/amd64/amd64/powernow-k8.c,v
> diff -u -p -r1.29 powernow-k8.c
> --- sys/arch/amd64/amd64/powernow-k8.c	21 Feb 2022 08:16:08 -0000	1.29
> +++ sys/arch/amd64/amd64/powernow-k8.c	21 May 2025 12:01:46 -0000
> @@ -28,6 +28,7 @@
>
>  #include
>  #include
> +#include <sys/atomic.h>
>  #include
>  #include
>
> @@ -47,8 +48,13 @@
>  #define BIOS_START 0xe0000
>  #define BIOS_LEN 0x20000
>
> +/*
> + * Locks used to protect data:
> + *	a	atomic
> + */
> +
>  extern int setperf_prio;
> -extern int perflevel;
> +extern int perflevel;		/* [a] */
>
>  /*
>   * MSRs and bits used by PowerNow technology
> @@ -364,7 +370,7 @@ k8pnow_acpi_pss_changed(struct acpicpu_p
>  		 * Our current operating state is not among
>  		 * the ones found the new PSS.
>  		 */
> -		curs = ((perflevel * npss) + 1) / 101;
> +		curs = ((atomic_load_int(&perflevel) * npss) + 1) / 101;
>  		if (curs >= npss)
>  			curs = npss - 1;
>  		needtran = 1;
> Index: sys/arch/i386/i386/apm.c
> ===================================================================
> RCS file: /home/cvs/src/sys/arch/i386/i386/apm.c,v
> diff -u -p -r1.133 apm.c
> --- sys/arch/i386/i386/apm.c	7 Oct 2024 01:31:22 -0000	1.133
> +++ sys/arch/i386/i386/apm.c	21 May 2025 12:01:46 -0000
> @@ -38,6 +38,7 @@
>
>  #include
>  #include
> +#include <sys/atomic.h>
>  #include
>  #include
>  #include
> @@ -232,7 +233,12 @@ apm_perror(const char *str, struct apmre
>  void
>  apm_suspend(int state)
>  {
> -	extern int perflevel;
> +	/*
> +	 * Locks used to protect data:
> +	 *	a	atomic
> +	 */
> +
> +	extern int perflevel;	/* [a] */
>  	int s;
>
>  #if NWSDISPLAY > 0
> @@ -286,7 +292,7 @@ apm_suspend(int state)
>
>  	/* restore hw.setperf */
>  	if (cpu_setperf != NULL)
> -		cpu_setperf(perflevel);
> +		cpu_setperf(atomic_load_int(&perflevel));
>  }
>
>  void
> Index: sys/arch/i386/i386/powernow-k8.c
> ===================================================================
> RCS file: /home/cvs/src/sys/arch/i386/i386/powernow-k8.c,v
> diff -u -p -r1.35 powernow-k8.c
> --- sys/arch/i386/i386/powernow-k8.c	30 Jan 2023 10:49:05 -0000	1.35
> +++ sys/arch/i386/i386/powernow-k8.c	21 May 2025 12:01:46 -0000
> @@ -138,7 +138,6 @@ struct pst_s {
>
>  struct k8pnow_cpu_state *k8pnow_current_state = NULL;
>  extern int setperf_prio;
> -extern int perflevel;
>
>  int k8pnow_read_pending_wait(uint64_t *);
>  int k8pnow_decode_pst(struct k8pnow_cpu_state *, uint8_t *);
> Index: sys/arch/macppc/dev/dfs.c
> ===================================================================
> RCS file: /home/cvs/src/sys/arch/macppc/dev/dfs.c,v
> diff -u -p -r1.4 dfs.c
> --- sys/arch/macppc/dev/dfs.c	13 Mar 2022 12:33:01 -0000	1.4
> +++ sys/arch/macppc/dev/dfs.c	21 May 2025 12:01:46 -0000
> @@ -17,6 +17,7 @@
>
>  #include
>  #include
> +#include <sys/atomic.h>
>  #include
>  #include
>
> @@ -27,7 +28,12 @@
>  #include
>  #include
>
> -extern int perflevel;
> +/*
> + * Locks used to protect data:
> + *	a	atomic
> + */
> +
> +extern int perflevel;		/* [a] */
>
>  struct dfs_softc {
>  	struct device	sc_dev;
> @@ -85,10 +91,10 @@ dfs_attach(struct device *parent, struct
>
>  	if (hid1 & HID1_DFS4) {
>  		ppc_curfreq = ppc_maxfreq / 4;
> -		perflevel = 25;
> +		atomic_store_int(&perflevel, 25);
>  	} else if (hid1 & HID1_DFS2) {
>  		ppc_curfreq = ppc_maxfreq / 2;
> -		perflevel = 50;
> +		atomic_store_int(&perflevel, 50);
>  	}
>
>  	cpu_setperf = dfs_setperf;
> Index: sys/arch/macppc/macppc/cpu.c
> ===================================================================
> RCS file: /home/cvs/src/sys/arch/macppc/macppc/cpu.c,v
> diff -u -p -r1.90 cpu.c
> --- sys/arch/macppc/macppc/cpu.c	24 Oct 2024 17:37:06 -0000	1.90
> +++ sys/arch/macppc/macppc/cpu.c	21 May 2025 12:01:46 -0000
> @@ -35,6 +35,7 @@
>
>  #include
>  #include
> +#include <sys/atomic.h>
>  #include
>  #include
>  #include
> @@ -154,7 +155,7 @@ ppc64_scale_frequency(u_int freq_scale)
>  	ppc_intr_enable(s);
>  }
>
> -extern int perflevel;
> +extern int perflevel;		/* [a] */
>
>  struct task ppc64_setperf_task;
>  int ppc64_perflevel;
> Index: sys/arch/powerpc64/dev/opal.c
> ===================================================================
> RCS file: /home/cvs/src/sys/arch/powerpc64/dev/opal.c,v
> diff -u -p -r1.14 opal.c
> --- sys/arch/powerpc64/dev/opal.c	12 Oct 2022 13:39:50 -0000	1.14
> +++ sys/arch/powerpc64/dev/opal.c	21 May 2025 12:01:46 -0000
> @@ -17,6 +17,7 @@
>
>  #include
>  #include
> +#include <sys/atomic.h>
>  #include
>  #include
>  #include
> @@ -79,7 +80,12 @@ int opal_settime(struct todr_chip_handle
>  void	opal_configure_idle_states(struct opal_softc *, int);
>  void	opal_found_stop_state(struct opal_softc *, uint64_t);
>
> -extern int perflevel;
> +/*
> + * Locks used to protect data:
> + *	a	atomic
> + */
> +
> +extern int perflevel;		/* [a] */
>
>  void	opalpm_init(struct opal_softc *, int);
>  int	opalpm_find_index(struct opal_softc *);
> @@ -437,7 +443,8 @@ opalpm_init(struct opal_softc *sc, int n
>  	OF_getprop(node, "ibm,pstate-frequencies-mhz", sc->sc_freq, len);
>
>  	if ((i = opalpm_find_index(sc)) != -1)
> -		perflevel = (sc->sc_npstate - 1 - i) * 100 / sc->sc_npstate;
> +		atomic_store_int(&perflevel,
> +		    (sc->sc_npstate - 1 - i) * 100 / sc->sc_npstate);
>  	cpu_cpuspeed = opalpm_cpuspeed;
>  #ifndef MULTIPROCESSOR
>  	cpu_setperf = opalpm_setperf;
> Index: sys/arch/riscv64/riscv64/cpu.c
> ===================================================================
> RCS file: /home/cvs/src/sys/arch/riscv64/riscv64/cpu.c,v
> diff -u -p -r1.21 cpu.c
> --- sys/arch/riscv64/riscv64/cpu.c	10 Nov 2024 06:51:59 -0000	1.21
> +++ sys/arch/riscv64/riscv64/cpu.c	21 May 2025 12:01:46 -0000
> @@ -495,7 +495,12 @@ cpu_unidle(struct cpu_info *ci)
>   * Dynamic voltage and frequency scaling implementation.
>   */
>
> -extern int perflevel;
> +/*
> + * Locks used to protect data:
> + *	a	atomic
> + */
> +
> +extern int perflevel;		/* [a] */
>
>  struct opp {
>  	uint64_t opp_hz;
> @@ -691,8 +696,8 @@ cpu_opp_mountroot(struct device *self)
>  		task_set(&cpu_opp_task, cpu_opp_dotask, NULL);
>  		cpu_setperf = cpu_opp_setperf;
>
> -		perflevel = (level > 0) ? level : 0;
> -		cpu_setperf(perflevel);
> +		atomic_store_int(&perflevel, (level > 0) ? level : 0);
> +		cpu_setperf(atomic_load_int(&perflevel));
>  	}
>  }
>
> Index: sys/dev/acpi/acpitz.c
> ===================================================================
> RCS file: /home/cvs/src/sys/dev/acpi/acpitz.c,v
> diff -u -p -r1.61 acpitz.c
> --- sys/dev/acpi/acpitz.c	5 Feb 2025 11:03:36 -0000	1.61
> +++ sys/dev/acpi/acpitz.c	21 May 2025 12:01:46 -0000
> @@ -21,6 +21,7 @@
>  #include
>  #include
>  #include
> +#include <sys/atomic.h>
>  #include
>  #include
>  #include
> @@ -88,7 +89,13 @@ void	acpitz_init(struct acpitz_softc *,
>  void	(*acpitz_cpu_setperf)(int);
>  int	acpitz_perflevel = -1;
>  extern void (*cpu_setperf)(int);
> -extern int perflevel;
> +
> +/*
> + * Locks used to protect data:
> + *	a	atomic
> + */
> +
> +extern int perflevel;		/* [a] */
>  #define PERFSTEP 10
>
>  #define ACPITZ_TRIPS	(1L << 0)
> @@ -99,7 +106,7 @@ void
>  acpitz_init_perf(void *arg)
>  {
>  	if (acpitz_perflevel == -1)
> -		acpitz_perflevel = perflevel;
> +		acpitz_perflevel = atomic_load_int(&perflevel);
>
>  	if (cpu_setperf != acpitz_setperf) {
>  		acpitz_cpu_setperf = cpu_setperf;
> @@ -412,7 +419,7 @@ acpitz_refresh(void *arg)
>
>  	/* clamp passive cooling request */
>  	if (nperf > perflevel)
> -		nperf = perflevel;
> +		nperf = atomic_load_int(&perflevel);
>
>  	/* Perform CPU setperf */
>  	if (acpitz_cpu_setperf && nperf != acpitz_perflevel) {
> Index: sys/kern/kern_sysctl.c
> ===================================================================
> RCS file: /home/cvs/src/sys/kern/kern_sysctl.c,v
> diff -u -p -r1.468 kern_sysctl.c
> --- sys/kern/kern_sysctl.c	9 May 2025 14:53:22 -0000	1.468
> +++ sys/kern/kern_sysctl.c	21 May 2025 12:01:46 -0000
> @@ -849,7 +849,6 @@ hw_sysctl(int *name, u_int namelen, void
>  	case HW_CPUSPEED:
>  #ifndef SMALL_KERNEL
>  	case HW_SENSORS:
> -	case HW_SETPERF:
>  	case HW_PERFPOLICY:
>  	case HW_BATTERY:
>  #endif /* !SMALL_KERNEL */
> @@ -867,6 +866,10 @@ hw_sysctl(int *name, u_int namelen, void
>  		sysctl_vsunlock(oldp, savelen);
>  		return (err);
>  	}
> +#ifndef SMALL_KERNEL
> +	case HW_SETPERF:
> +		return (sysctl_hwsetperf(oldp, oldlenp, newp, newlen));
> +#endif /* !SMALL_KERNEL */
>  	case HW_VENDOR:
>  		if (hw_vendor)
>  			return (sysctl_rdstring(oldp, oldlenp, newp,
> @@ -940,8 +943,6 @@ hw_sysctl_locked(int *name, u_int namele
>  	case HW_SENSORS:
>  		return (sysctl_sensors(name + 1, namelen - 1, oldp, oldlenp,
>  		    newp, newlen));
> -	case HW_SETPERF:
> -		return (sysctl_hwsetperf(oldp, oldlenp, newp, newlen));
>  	case HW_PERFPOLICY:
>  		return (sysctl_hwperfpolicy(oldp, oldlenp, newp, newlen));
>  #endif /* !SMALL_KERNEL */
> Index: sys/kern/sched_bsd.c
> ===================================================================
> RCS file: /home/cvs/src/sys/kern/sched_bsd.c,v
> diff -u -p -r1.99 sched_bsd.c
> --- sys/kern/sched_bsd.c	10 Mar 2025 09:28:56 -0000	1.99
> +++ sys/kern/sched_bsd.c	21 May 2025 12:03:15 -0000
> @@ -548,7 +548,13 @@ void (*cpu_setperf)(int);
>  #define PERFPOL_MANUAL 0
>  #define PERFPOL_AUTO 1
>  #define PERFPOL_HIGH 2
> -int perflevel = 100;
> +
> +/*
> + * Locks used to protect data:
> + *	a	atomic
> + */
> +
> +int perflevel = 100;		/* [a] */
>  int perfpolicy_on_ac = PERFPOL_HIGH;
>  int perfpolicy_on_battery = PERFPOL_AUTO;
>
> @@ -630,13 +636,14 @@ setperf_auto(void *v)
>  	if (speedup && downbeats < 5)
>  		downbeats++;
>
> -	if (speedup && perflevel != 100) {
> +	if (speedup && atomic_load_int(&perflevel) != 100) {
>  faster:
> -		perflevel = 100;
> -		cpu_setperf(perflevel);
> -	} else if (!speedup && perflevel != 0 && --downbeats <= 0) {
> -		perflevel = 0;
> -		cpu_setperf(perflevel);
> +		atomic_store_int(&perflevel, 100);
> +		cpu_setperf(atomic_load_int(&perflevel));
> +	} else if (!speedup && atomic_load_int(&perflevel) != 0 &&
> +	    --downbeats <= 0) {
> +		atomic_store_int(&perflevel, 0);
> +		cpu_setperf(atomic_load_int(&perflevel));
>  	}
>
>  	timeout_add_msec(&setperf_to, 100);
> @@ -658,8 +665,11 @@ sysctl_hwsetperf(void *oldp, size_t *old
>  	if (err)
>  		return err;
>
> -	if (newp != NULL)
> -		cpu_setperf(perflevel);
> +	if (newp != NULL) {
> +		KERNEL_LOCK();
> +		cpu_setperf(atomic_load_int(&perflevel));
> +		KERNEL_UNLOCK();
> +	}
>
>  	return 0;
>  }
> @@ -729,8 +739,8 @@ sysctl_hwperfpolicy(void *oldp, size_t *
>  	}
>
>  	if (current_perfpolicy() == PERFPOL_HIGH) {
> -		perflevel = 100;
> -		cpu_setperf(perflevel);
> +		atomic_store_int(&perflevel, 100);
> +		cpu_setperf(atomic_load_int(&perflevel));
>  	}
>
>  	if (perfpolicy_dynamic())
> Index: sys/kern/subr_suspend.c
> ===================================================================
> RCS file: /home/cvs/src/sys/kern/subr_suspend.c,v
> diff -u -p -r1.18 subr_suspend.c
> --- sys/kern/subr_suspend.c	28 May 2024 09:40:40 -0000	1.18
> +++ sys/kern/subr_suspend.c	21 May 2025 12:01:46 -0000
> @@ -18,6 +18,7 @@
>
>  #include
>  #include
> +#include <sys/atomic.h>
>  #include
>  #include
>  #include
> @@ -48,8 +49,13 @@ device_register_wakeup(struct device *de
>  int
>  sleep_state(void *v, int sleepmode)
>  {
> +	/*
> +	 * Locks used to protect data:
> +	 *	a	atomic
> +	 */
> +
>  	int error, s;
> -	extern int perflevel;
> +	extern int perflevel;	/* [a] */
>  	size_t rndbuflen;
>  	char *rndbuf;
>  #ifdef GPROF
> @@ -210,7 +216,8 @@ fail_hiballoc:
>  #endif
>  	sys_sync(curproc, NULL, NULL);
>  	if (cpu_setperf != NULL)
> -		cpu_setperf(perflevel);	/* Restore hw.setperf */
> +		/* Restore hw.setperf */
> +		cpu_setperf(atomic_load_int(&perflevel));
>  	if (suspend_finish(v) == EAGAIN)
>  		goto top;
>  	return (error);
>

I think this is missing k1x-pstate.c for modern AMD CPUs on amd64/i386.

-Bryan.
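
P.S. Untested sketch from memory: I'd expect the k1x-pstate.c hunks (on
both amd64 and i386) to mirror the est.c/powernow-k8.c ones, i.e.:

	#include <sys/atomic.h>

	extern int setperf_prio;
	extern int perflevel;		/* [a] */

	/*
	 * ...and any direct read of perflevel, e.g. the pstate
	 * recalculation in the ACPI PSS-changed path (assuming it
	 * follows the k8pnow pattern), would become:
	 */
	curs = ((atomic_load_int(&perflevel) * npss) + 1) / 101;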