Download raw body.
sys/sysctl: make hw.setperf MP safe
On Wed, May 21, 2025 at 02:13:25PM +0200, Kirill A. Korinsky wrote:
> On Wed, 21 May 2025 13:45:22 +0200,
> "Lorenz (xha)" <me@xha.li> wrote:
> >
> > Hi Kirill,
> >
> > On Wed, May 21, 2025 at 12:37:39PM +0200, Kirill A. Korinsky wrote:
> > > tech@,
> > >
> > > Here a diff which makes hw.setperf MP safe.
> > >
> > > Tested on -current/amd64.
> > >
> > > Feedback? Ok?
> >
> > i think this is wrong -- cpu_setperf() doesn't seem thread-safe on
> > at least arm64. also, there are some races that could happen in
> > setperf_auto with your diff (loading perflevel multiple times which
> > could yield different values, not really the end of the world in
> > that case but still).
> >
> > i would suggest using a mutex to protect setting hw.setperf? it
> > doesn't make much sense to set hw.setperf concurrently either way?
> >
>
> Well, setperf_auto is scheduled to run by TIMEOUT_INITIALIZER and it runs
> all callbacks under the kernel lock unless TIMEOUT_MPSAFE is used.
>
> So, I doubt that any race condition can be triggered here.
>
> But wrapping cpu_setperf() inside sysctl_hwsetperf() makes sense and here is
> an updated diff.
>
> Thanks for catching it.
>
>
> Index: sys/arch/amd64/amd64/est.c
> ===================================================================
> RCS file: /home/cvs/src/sys/arch/amd64/amd64/est.c,v
> diff -u -p -r1.42 est.c
> --- sys/arch/amd64/amd64/est.c 12 Aug 2021 15:16:23 -0000 1.42
> +++ sys/arch/amd64/amd64/est.c 21 May 2025 12:01:46 -0000
> @@ -55,6 +55,7 @@
>
> #include <sys/param.h>
> #include <sys/systm.h>
> +#include <sys/atomic.h>
> #include <sys/sysctl.h>
> #include <sys/malloc.h>
>
> @@ -96,8 +97,13 @@ struct fqlist {
>
> static struct fqlist *est_fqlist;
>
> +/*
> + * Locks used to protect data:
> + * a atomic
> + */
> +
> extern int setperf_prio;
> -extern int perflevel;
> +extern int perflevel; /* [a] */
>
> int bus_clock;
>
> @@ -324,7 +330,7 @@ est_acpi_pss_changed(struct acpicpu_pss
> est_fqlist = acpilist;
>
> if (needtran) {
> - est_setperf(perflevel);
> + est_setperf(atomic_load_int(&perflevel));
> }
> }
> #endif
> @@ -457,7 +463,7 @@ est_init(struct cpu_info *ci)
> if (low == high)
> goto nospeedstep;
>
> - perflevel = (cpuspeed - low) * 100 / (high - low);
> + atomic_store_int(&perflevel, (cpuspeed - low) * 100 / (high - low));
>
> printf("%s: Enhanced SpeedStep %d MHz", cpu_device, cpuspeed);
>
> Index: sys/arch/amd64/amd64/powernow-k8.c
> ===================================================================
> RCS file: /home/cvs/src/sys/arch/amd64/amd64/powernow-k8.c,v
> diff -u -p -r1.29 powernow-k8.c
> --- sys/arch/amd64/amd64/powernow-k8.c 21 Feb 2022 08:16:08 -0000 1.29
> +++ sys/arch/amd64/amd64/powernow-k8.c 21 May 2025 12:01:46 -0000
> @@ -28,6 +28,7 @@
>
> #include <sys/param.h>
> #include <sys/systm.h>
> +#include <sys/atomic.h>
> #include <sys/malloc.h>
> #include <sys/sysctl.h>
>
> @@ -47,8 +48,13 @@
> #define BIOS_START 0xe0000
> #define BIOS_LEN 0x20000
>
> +/*
> + * Locks used to protect data:
> + * a atomic
> + */
> +
> extern int setperf_prio;
> -extern int perflevel;
> +extern int perflevel; /* [a] */
>
> /*
> * MSRs and bits used by PowerNow technology
> @@ -364,7 +370,7 @@ k8pnow_acpi_pss_changed(struct acpicpu_p
> * Our current operating state is not among
> * the ones found the new PSS.
> */
> - curs = ((perflevel * npss) + 1) / 101;
> + curs = ((atomic_load_int(&perflevel) * npss) + 1) / 101;
> if (curs >= npss)
> curs = npss - 1;
> needtran = 1;
> Index: sys/arch/i386/i386/apm.c
> ===================================================================
> RCS file: /home/cvs/src/sys/arch/i386/i386/apm.c,v
> diff -u -p -r1.133 apm.c
> --- sys/arch/i386/i386/apm.c 7 Oct 2024 01:31:22 -0000 1.133
> +++ sys/arch/i386/i386/apm.c 21 May 2025 12:01:46 -0000
> @@ -38,6 +38,7 @@
>
> #include <sys/param.h>
> #include <sys/systm.h>
> +#include <sys/atomic.h>
> #include <sys/kernel.h>
> #include <sys/kthread.h>
> #include <sys/rwlock.h>
> @@ -232,7 +233,12 @@ apm_perror(const char *str, struct apmre
> void
> apm_suspend(int state)
> {
> - extern int perflevel;
> + /*
> + * Locks used to protect data:
> + * a atomic
> + */
> +
> + extern int perflevel; /* [a] */
> int s;
>
> #if NWSDISPLAY > 0
> @@ -286,7 +292,7 @@ apm_suspend(int state)
>
> /* restore hw.setperf */
> if (cpu_setperf != NULL)
> - cpu_setperf(perflevel);
> + cpu_setperf(atomic_load_int(&perflevel));
> }
>
> void
> Index: sys/arch/i386/i386/powernow-k8.c
> ===================================================================
> RCS file: /home/cvs/src/sys/arch/i386/i386/powernow-k8.c,v
> diff -u -p -r1.35 powernow-k8.c
> --- sys/arch/i386/i386/powernow-k8.c 30 Jan 2023 10:49:05 -0000 1.35
> +++ sys/arch/i386/i386/powernow-k8.c 21 May 2025 12:01:46 -0000
> @@ -138,7 +138,6 @@ struct pst_s {
>
> struct k8pnow_cpu_state *k8pnow_current_state = NULL;
> extern int setperf_prio;
> -extern int perflevel;
>
> int k8pnow_read_pending_wait(uint64_t *);
> int k8pnow_decode_pst(struct k8pnow_cpu_state *, uint8_t *);
> Index: sys/arch/macppc/dev/dfs.c
> ===================================================================
> RCS file: /home/cvs/src/sys/arch/macppc/dev/dfs.c,v
> diff -u -p -r1.4 dfs.c
> --- sys/arch/macppc/dev/dfs.c 13 Mar 2022 12:33:01 -0000 1.4
> +++ sys/arch/macppc/dev/dfs.c 21 May 2025 12:01:46 -0000
> @@ -17,6 +17,7 @@
>
> #include <sys/param.h>
> #include <sys/systm.h>
> +#include <sys/atomic.h>
> #include <sys/proc.h>
> #include <sys/sysctl.h>
>
> @@ -27,7 +28,12 @@
> #include <macppc/pci/macobio.h>
> #include <powerpc/hid.h>
>
> -extern int perflevel;
> +/*
> + * Locks used to protect data:
> + * a atomic
> + */
> +
> +extern int perflevel; /* [a] */
>
> struct dfs_softc {
> struct device sc_dev;
> @@ -85,10 +91,10 @@ dfs_attach(struct device *parent, struct
>
> if (hid1 & HID1_DFS4) {
> ppc_curfreq = ppc_maxfreq / 4;
> - perflevel = 25;
> + atomic_store_int(&perflevel, 25);
> } else if (hid1 & HID1_DFS2) {
> ppc_curfreq = ppc_maxfreq / 2;
> - perflevel = 50;
> + atomic_store_int(&perflevel, 50);
> }
>
> cpu_setperf = dfs_setperf;
> Index: sys/arch/macppc/macppc/cpu.c
> ===================================================================
> RCS file: /home/cvs/src/sys/arch/macppc/macppc/cpu.c,v
> diff -u -p -r1.90 cpu.c
> --- sys/arch/macppc/macppc/cpu.c 24 Oct 2024 17:37:06 -0000 1.90
> +++ sys/arch/macppc/macppc/cpu.c 21 May 2025 12:01:46 -0000
> @@ -35,6 +35,7 @@
>
> #include <sys/param.h>
> #include <sys/systm.h>
> +#include <sys/atomic.h>
> #include <sys/proc.h>
> #include <sys/sysctl.h>
> #include <sys/task.h>
> @@ -154,7 +155,7 @@ ppc64_scale_frequency(u_int freq_scale)
> ppc_intr_enable(s);
> }
>
> -extern int perflevel;
> +extern int perflevel; /* [a] */
>
> struct task ppc64_setperf_task;
> int ppc64_perflevel;
> Index: sys/arch/powerpc64/dev/opal.c
> ===================================================================
> RCS file: /home/cvs/src/sys/arch/powerpc64/dev/opal.c,v
> diff -u -p -r1.14 opal.c
> --- sys/arch/powerpc64/dev/opal.c 12 Oct 2022 13:39:50 -0000 1.14
> +++ sys/arch/powerpc64/dev/opal.c 21 May 2025 12:01:46 -0000
> @@ -17,6 +17,7 @@
>
> #include <sys/param.h>
> #include <sys/device.h>
> +#include <sys/atomic.h>
> #include <sys/malloc.h>
> #include <sys/sysctl.h>
> #include <sys/systm.h>
> @@ -79,7 +80,12 @@ int opal_settime(struct todr_chip_handle
> void opal_configure_idle_states(struct opal_softc *, int);
> void opal_found_stop_state(struct opal_softc *, uint64_t);
>
> -extern int perflevel;
> +/*
> + * Locks used to protect data:
> + * a atomic
> + */
> +
> +extern int perflevel; /* [a] */
>
> void opalpm_init(struct opal_softc *, int);
> int opalpm_find_index(struct opal_softc *);
> @@ -437,7 +443,8 @@ opalpm_init(struct opal_softc *sc, int n
> OF_getprop(node, "ibm,pstate-frequencies-mhz", sc->sc_freq, len);
>
> if ((i = opalpm_find_index(sc)) != -1)
> - perflevel = (sc->sc_npstate - 1 - i) * 100 / sc->sc_npstate;
> + atomic_store_int(&perflevel,
> +		    (sc->sc_npstate - 1 - i) * 100 / sc->sc_npstate);
> cpu_cpuspeed = opalpm_cpuspeed;
> #ifndef MULTIPROCESSOR
> cpu_setperf = opalpm_setperf;
> Index: sys/arch/riscv64/riscv64/cpu.c
> ===================================================================
> RCS file: /home/cvs/src/sys/arch/riscv64/riscv64/cpu.c,v
> diff -u -p -r1.21 cpu.c
> --- sys/arch/riscv64/riscv64/cpu.c 10 Nov 2024 06:51:59 -0000 1.21
> +++ sys/arch/riscv64/riscv64/cpu.c 21 May 2025 12:01:46 -0000
> @@ -495,7 +495,12 @@ cpu_unidle(struct cpu_info *ci)
> * Dynamic voltage and frequency scaling implementation.
> */
>
> -extern int perflevel;
> +/*
> + * Locks used to protect data:
> + * a atomic
> + */
> +
> +extern int perflevel; /* [a] */
>
> struct opp {
> uint64_t opp_hz;
> @@ -691,8 +696,8 @@ cpu_opp_mountroot(struct device *self)
> task_set(&cpu_opp_task, cpu_opp_dotask, NULL);
> cpu_setperf = cpu_opp_setperf;
>
> - perflevel = (level > 0) ? level : 0;
> - cpu_setperf(perflevel);
> + atomic_store_int(&perflevel, (level > 0) ? level : 0);
> + cpu_setperf(atomic_load_int(&perflevel));
> }
> }
>
> Index: sys/dev/acpi/acpitz.c
> ===================================================================
> RCS file: /home/cvs/src/sys/dev/acpi/acpitz.c,v
> diff -u -p -r1.61 acpitz.c
> --- sys/dev/acpi/acpitz.c 5 Feb 2025 11:03:36 -0000 1.61
> +++ sys/dev/acpi/acpitz.c 21 May 2025 12:01:46 -0000
> @@ -21,6 +21,7 @@
> #include <sys/signalvar.h>
> #include <sys/systm.h>
> #include <sys/device.h>
> +#include <sys/atomic.h>
> #include <sys/malloc.h>
> #include <sys/kernel.h>
> #include <sys/kthread.h>
> @@ -88,7 +89,13 @@ void acpitz_init(struct acpitz_softc *,
> void (*acpitz_cpu_setperf)(int);
> int acpitz_perflevel = -1;
> extern void (*cpu_setperf)(int);
> -extern int perflevel;
> +
> +/*
> + * Locks used to protect data:
> + * a atomic
> + */
> +
> +extern int perflevel; /* [a] */
> #define PERFSTEP 10
>
> #define ACPITZ_TRIPS (1L << 0)
> @@ -99,7 +106,7 @@ void
> acpitz_init_perf(void *arg)
> {
> if (acpitz_perflevel == -1)
> - acpitz_perflevel = perflevel;
> + acpitz_perflevel = atomic_load_int(&perflevel);
>
> if (cpu_setperf != acpitz_setperf) {
> acpitz_cpu_setperf = cpu_setperf;
> @@ -412,7 +419,7 @@ acpitz_refresh(void *arg)
>
> /* clamp passive cooling request */
> if (nperf > perflevel)
> - nperf = perflevel;
> + nperf = atomic_load_int(&perflevel);
>
> /* Perform CPU setperf */
> if (acpitz_cpu_setperf && nperf != acpitz_perflevel) {
> Index: sys/kern/kern_sysctl.c
> ===================================================================
> RCS file: /home/cvs/src/sys/kern/kern_sysctl.c,v
> diff -u -p -r1.468 kern_sysctl.c
> --- sys/kern/kern_sysctl.c 9 May 2025 14:53:22 -0000 1.468
> +++ sys/kern/kern_sysctl.c 21 May 2025 12:01:46 -0000
> @@ -849,7 +849,6 @@ hw_sysctl(int *name, u_int namelen, void
> case HW_CPUSPEED:
> #ifndef SMALL_KERNEL
> case HW_SENSORS:
> - case HW_SETPERF:
> case HW_PERFPOLICY:
> case HW_BATTERY:
> #endif /* !SMALL_KERNEL */
> @@ -867,6 +866,10 @@ hw_sysctl(int *name, u_int namelen, void
> sysctl_vsunlock(oldp, savelen);
> return (err);
> }
> +#ifndef SMALL_KERNEL
> + case HW_SETPERF:
> + return (sysctl_hwsetperf(oldp, oldlenp, newp, newlen));
> +#endif /* !SMALL_KERNEL */
> case HW_VENDOR:
> if (hw_vendor)
> return (sysctl_rdstring(oldp, oldlenp, newp,
> @@ -940,8 +943,6 @@ hw_sysctl_locked(int *name, u_int namele
> case HW_SENSORS:
> return (sysctl_sensors(name + 1, namelen - 1, oldp, oldlenp,
> newp, newlen));
> - case HW_SETPERF:
> - return (sysctl_hwsetperf(oldp, oldlenp, newp, newlen));
> case HW_PERFPOLICY:
> return (sysctl_hwperfpolicy(oldp, oldlenp, newp, newlen));
> #endif /* !SMALL_KERNEL */
> Index: sys/kern/sched_bsd.c
> ===================================================================
> RCS file: /home/cvs/src/sys/kern/sched_bsd.c,v
> diff -u -p -r1.99 sched_bsd.c
> --- sys/kern/sched_bsd.c 10 Mar 2025 09:28:56 -0000 1.99
> +++ sys/kern/sched_bsd.c 21 May 2025 12:03:15 -0000
> @@ -548,7 +548,13 @@ void (*cpu_setperf)(int);
> #define PERFPOL_MANUAL 0
> #define PERFPOL_AUTO 1
> #define PERFPOL_HIGH 2
> -int perflevel = 100;
> +
> +/*
> + * Locks used to protect data:
> + * a atomic
> + */
> +
> +int perflevel = 100; /* [a] */
> int perfpolicy_on_ac = PERFPOL_HIGH;
> int perfpolicy_on_battery = PERFPOL_AUTO;
>
> @@ -630,13 +636,14 @@ setperf_auto(void *v)
> if (speedup && downbeats < 5)
> downbeats++;
>
> - if (speedup && perflevel != 100) {
> + if (speedup && atomic_load_int(&perflevel) != 100) {
> faster:
> - perflevel = 100;
> - cpu_setperf(perflevel);
> - } else if (!speedup && perflevel != 0 && --downbeats <= 0) {
> - perflevel = 0;
> - cpu_setperf(perflevel);
> + atomic_store_int(&perflevel, 100);
> + cpu_setperf(atomic_load_int(&perflevel));
> + } else if (!speedup && atomic_load_int(&perflevel) != 0 &&
> + --downbeats <= 0) {
> + atomic_store_int(&perflevel, 0);
> + cpu_setperf(atomic_load_int(&perflevel));
> }
>
> timeout_add_msec(&setperf_to, 100);
> @@ -658,8 +665,11 @@ sysctl_hwsetperf(void *oldp, size_t *old
> if (err)
> return err;
>
> - if (newp != NULL)
> - cpu_setperf(perflevel);
> + if (newp != NULL) {
> + KERNEL_LOCK();
> + cpu_setperf(atomic_load_int(&perflevel));
> + KERNEL_UNLOCK();
> + }
>
> return 0;
> }
> @@ -729,8 +739,8 @@ sysctl_hwperfpolicy(void *oldp, size_t *
> }
>
> if (current_perfpolicy() == PERFPOL_HIGH) {
> - perflevel = 100;
> - cpu_setperf(perflevel);
> + atomic_store_int(&perflevel, 100);
> + cpu_setperf(atomic_load_int(&perflevel));
> }
>
> if (perfpolicy_dynamic())
> Index: sys/kern/subr_suspend.c
> ===================================================================
> RCS file: /home/cvs/src/sys/kern/subr_suspend.c,v
> diff -u -p -r1.18 subr_suspend.c
> --- sys/kern/subr_suspend.c 28 May 2024 09:40:40 -0000 1.18
> +++ sys/kern/subr_suspend.c 21 May 2025 12:01:46 -0000
> @@ -18,6 +18,7 @@
>
> #include <sys/param.h>
> #include <sys/systm.h>
> +#include <sys/atomic.h>
> #include <sys/buf.h>
> #include <sys/clockintr.h>
> #include <sys/reboot.h>
> @@ -48,8 +49,13 @@ device_register_wakeup(struct device *de
> int
> sleep_state(void *v, int sleepmode)
> {
> + /*
> + * Locks used to protect data:
> + * a atomic
> + */
> +
> int error, s;
> - extern int perflevel;
> + extern int perflevel; /* [a] */
> size_t rndbuflen;
> char *rndbuf;
> #ifdef GPROF
> @@ -210,7 +216,8 @@ fail_hiballoc:
> #endif
> sys_sync(curproc, NULL, NULL);
> if (cpu_setperf != NULL)
> - cpu_setperf(perflevel); /* Restore hw.setperf */
> + /* Restore hw.setperf */
> + cpu_setperf(atomic_load_int(&perflevel));
> if (suspend_finish(v) == EAGAIN)
> goto top;
> return (error);
>
I think this is missing k1x-pstate.c for modern AMD CPUs on amd64/i386.
-Bryan.
sys/sysctl: make hw.setperf MP safe