From: Kirill A. Korinsky Subject: Re: powersave CPU policy To: Philip Guenther Cc: "Lorenz (xha)" , OpenBSD tech Date: Thu, 23 May 2024 11:25:56 +0100 On Wed, 15 May 2024 08:19:12 +0100, Philip Guenther wrote: > > acpicpi_idle() has nothing to tell it why the cpu is being idle so it will > use the same logic for this as it does for when a CPU goes idle because > your web browser is waiting for a response from a web-server...which is > just a really simple (aka dumb) decaying average of how long it stayed idle > when this function was previously called compared to the C-state latencies, > such that it was idling longer recently then it'll idle deeper now. > > (With dynamic clock interrupts, that may actually result in it going to > deep C-states only after some activity so it can wake up and see that it > was idle for a long time. :-| Indeed, it really needs to calculate a > function of energy use and latency choice vs how long to the next clock > alarm to see which state is best to drop into...and by how much it should > ask for an early alarm so that it can come out of the C-state in time for > when the *real* target time is.) > > Here is an updated version of the patch where I've enforced the deepest supported state inside acpicpu_idle. I've used it for the last few days and it feels like it has won some additional time on my machine. For example, 25% battery is enough for about an hour of work. User testing, review and feedback welcome. diff --git lib/libc/sys/sysctl.2 lib/libc/sys/sysctl.2 index 78c69ff5813..f0f1b613a72 100644 --- lib/libc/sys/sysctl.2 +++ lib/libc/sys/sysctl.2 @@ -293,6 +293,7 @@ privileges may change the value. .It Dv HW_PHYSMEM Ta "integer" Ta "no" .It Dv HW_PHYSMEM64 Ta "int64_t" Ta "no" .It Dv HW_POWER Ta "integer" Ta "no" +.It Dv HW_POWERSAVELIMIT Ta "integer" Ta "yes" .It Dv HW_PRODUCT Ta "string" Ta "no" .It Dv HW_SENSORS Ta "node" Ta "not applicable" .It Dv HW_SETPERF Ta "integer" Ta "yes" @@ -384,8 +385,19 @@ The performance policy for power management. 
Can be one of .Dq manual , .Dq auto , +.Dq powersaving , or .Dq high . +.Pp +The difference between +.Dq auto +and +.Dq powersaving +policies is that the latter trades off some system responsiveness for +reduced power consumption. In addition, it has an upper limit on CPU +performance that can be adjusted by +.Dv HW_POWERSAVELIMIT , +and it offlines unused CPU cores. .It Dv HW_PHYSMEM The total physical memory, in bytes. This variable is deprecated; use @@ -395,6 +407,10 @@ instead. The total physical memory, in bytes. .It Dv HW_POWER Pq Va hw.power Machine has wall-power. +.It Dv HW_POWERSAVELIMIT Pq Va hw.powersavelimit +An upper limit on CPU performance in the +.Dq powersaving +policy. .It Dv HW_PRODUCT Pq Va hw.product The product name of the machine. .It Dv HW_SENSORS Pq Va hw.sensors diff --git sys/dev/acpi/acpicpu.c sys/dev/acpi/acpicpu.c index 0a5ee1fea7d..aa581c3c467 100644 --- sys/dev/acpi/acpicpu.c +++ sys/dev/acpi/acpicpu.c @@ -1155,6 +1155,7 @@ acpicpu_idle(void) struct acpicpu_softc *sc = (struct acpicpu_softc *)ci->ci_acpicpudev; struct acpi_cstate *best, *cx; unsigned long itime; + int is_halted; if (sc == NULL) { __asm volatile("sti"); @@ -1168,11 +1169,27 @@ acpicpu_idle(void) } else if (ci->ci_mwait & MWAIT_ONLY) atomic_clearbits_int(&ci->ci_mwait, MWAIT_ONLY); +#ifdef MULTIPROCESSOR + is_halted = ci->ci_schedstate.spc_schedflags & SPCF_HALTED; +#else + is_halted = 0; +#endif + + best = cx = SLIST_FIRST(&sc->sc_cstates); + + /* When CPU is halted use the deepest supported sleep state. */ + if (is_halted) { + while ((cx = SLIST_NEXT(cx, link)) != NULL) { + if (cx->state > best->state) + best = cx; + } + goto bypass_latency_search; + } + + /* * Find the first state with a latency we'll accept, ignoring - * states marked skippable + * states marked skippable. 
*/ - best = cx = SLIST_FIRST(&sc->sc_cstates); while ((cx->flags & CST_FLAG_SKIP) || cx->latency * 3 > sc->sc_prev_sleep) { if ((cx = SLIST_NEXT(cx, link)) == NULL) @@ -1196,7 +1213,7 @@ acpicpu_idle(void) best = cx; } - +bypass_latency_search: atomic_inc_long(&cst_stats[best->state]); itime = tick / 2; @@ -1272,8 +1289,11 @@ acpicpu_idle(void) } - sc->sc_last_itime = itime; - itime >>= 1; - sc->sc_prev_sleep = (sc->sc_prev_sleep + (sc->sc_prev_sleep >> 1) - + itime) >> 1; + /* ignore latency statistics for halted CPU */ + if (!is_halted) { + sc->sc_last_itime = itime; + itime >>= 1; + sc->sc_prev_sleep = (sc->sc_prev_sleep + (sc->sc_prev_sleep >> 1) + + itime) >> 1; + } } diff --git sys/kern/kern_sched.c sys/kern/kern_sched.c index dd54f98d4b8..ec2ad9eb7ce 100644 --- sys/kern/kern_sched.c +++ sys/kern/kern_sched.c @@ -639,6 +639,36 @@ sched_peg_curproc(struct cpu_info *ci) #ifdef MULTIPROCESSOR +void +sched_start_cpu(struct cpu_info *ci) +{ + struct schedstate_percpu *spc = &ci->ci_schedstate; + + atomic_clearbits_int(&spc->spc_schedflags, + SPCF_SHOULDHALT | SPCF_HALTED); + cpuset_add(&sched_all_cpus, ci); +} + +void sched_stop_cpu(struct cpu_info *ci) +{ + struct schedstate_percpu *spc = &ci->ci_schedstate; + + cpuset_del(&sched_all_cpus, ci); + atomic_setbits_int(&spc->spc_schedflags, SPCF_SHOULDHALT); + + need_resched(ci); +} + +void sched_wait_halted_cpu(struct cpu_info *ci) +{ + struct schedstate_percpu *spc = &ci->ci_schedstate; + + while ((spc->spc_schedflags & SPCF_HALTED) == 0) { + sleep_setup(spc, PZERO, "schedstate"); + sleep_finish(0, (spc->spc_schedflags & SPCF_HALTED) == 0); + } +} + void sched_start_secondary_cpus(void) { @@ -646,17 +676,13 @@ sched_start_secondary_cpus(void) struct cpu_info *ci; CPU_INFO_FOREACH(cii, ci) { - struct schedstate_percpu *spc = &ci->ci_schedstate; - if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci)) continue; - atomic_clearbits_int(&spc->spc_schedflags, - SPCF_SHOULDHALT | SPCF_HALTED); #ifdef __HAVE_CPU_TOPOLOGY if 
(!sched_smt && ci->ci_smt_id > 0) continue; #endif - cpuset_add(&sched_all_cpus, ci); + sched_start_cpu(ci); } } @@ -670,23 +696,14 @@ sched_stop_secondary_cpus(void) * Make sure we stop the secondary CPUs. */ CPU_INFO_FOREACH(cii, ci) { - struct schedstate_percpu *spc = &ci->ci_schedstate; - if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci)) continue; - cpuset_del(&sched_all_cpus, ci); - atomic_setbits_int(&spc->spc_schedflags, SPCF_SHOULDHALT); + sched_stop_cpu(ci); } CPU_INFO_FOREACH(cii, ci) { - struct schedstate_percpu *spc = &ci->ci_schedstate; - if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci)) continue; - while ((spc->spc_schedflags & SPCF_HALTED) == 0) { - sleep_setup(spc, PZERO, "schedstate"); - sleep_finish(0, - (spc->spc_schedflags & SPCF_HALTED) == 0); - } + sched_wait_halted_cpu(ci); } } @@ -883,9 +900,9 @@ sysctl_hwsmt(void *oldp, size_t *oldlenp, void *newp, size_t newlen) if (ci->ci_smt_id == 0) continue; if (sched_smt) - cpuset_add(&sched_all_cpus, ci); + sched_start_cpu(ci); else - cpuset_del(&sched_all_cpus, ci); + sched_stop_cpu(ci); } return 0; diff --git sys/kern/kern_sysctl.c sys/kern/kern_sysctl.c index 5d0d1a8a851..0c55bb3da67 100644 --- sys/kern/kern_sysctl.c +++ sys/kern/kern_sysctl.c @@ -733,6 +733,8 @@ hw_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp, return (sysctl_hwsetperf(oldp, oldlenp, newp, newlen)); case HW_PERFPOLICY: return (sysctl_hwperfpolicy(oldp, oldlenp, newp, newlen)); + case HW_POWERSAVELIMIT: + return (sysctl_powersavelimit(oldp, oldlenp, newp, newlen)); #endif /* !SMALL_KERNEL */ case HW_VENDOR: if (hw_vendor) diff --git sys/kern/sched_bsd.c sys/kern/sched_bsd.c index 25b221c1ee2..ecf17a0f334 100644 --- sys/kern/sched_bsd.c +++ sys/kern/sched_bsd.c @@ -67,6 +67,10 @@ uint32_t decay_aftersleep(uint32_t, uint32_t); extern struct cpuset sched_idle_cpus; +#ifdef __HAVE_CPU_TOPOLOGY +extern int sched_smt; +#endif + /* * constants for averages over 1, 5, and 15 minutes when sampling at * 5 second 
intervals. @@ -573,8 +577,11 @@ void (*cpu_setperf)(int); #define PERFPOL_MANUAL 0 #define PERFPOL_AUTO 1 #define PERFPOL_HIGH 2 +#define PERFPOL_POWERSAVING 4 int perflevel = 100; int perfpolicy = PERFPOL_AUTO; +/* avoid turbo frequency by default */ +int powersavelimit = 75; #ifndef SMALL_KERNEL /* @@ -583,7 +590,9 @@ int perfpolicy = PERFPOL_AUTO; #include void setperf_auto(void *); +void setperf_powersaving(void *); struct timeout setperf_to = TIMEOUT_INITIALIZER(setperf_auto, NULL); +struct timeout setperf_to_powersaving = TIMEOUT_INITIALIZER(setperf_powersaving, NULL); extern int hw_power; void @@ -653,6 +662,91 @@ faster: timeout_add_msec(&setperf_to, 100); } +void +setperf_powersaving(void *v) +{ + static uint64_t *idleticks, *totalticks; + static int downbeats; + int i, j = 0; + int speedup = 0; + CPU_INFO_ITERATOR cii; + struct cpu_info *ci, *firstoffline = NULL, *lastidle = NULL; + uint64_t idle, total, allidle = 0, alltotal = 0; + + /* policy was changed, online all CPU cores */ + if (perfpolicy != PERFPOL_POWERSAVING) { + CPU_INFO_FOREACH(cii, ci) { + if (!cpu_is_online(ci)) { +#ifdef __HAVE_CPU_TOPOLOGY + if (!(sched_smt || ci->ci_smt_id == 0)) + continue; +#endif + sched_start_cpu(ci); + } + } + return; + } + + if (!idleticks) + if (!(idleticks = mallocarray(ncpusfound, sizeof(*idleticks), + M_DEVBUF, M_NOWAIT | M_ZERO))) + return; + if (!totalticks) + if (!(totalticks = mallocarray(ncpusfound, sizeof(*totalticks), + M_DEVBUF, M_NOWAIT | M_ZERO))) { + free(idleticks, M_DEVBUF, + sizeof(*idleticks) * ncpusfound); + return; + } + CPU_INFO_FOREACH(cii, ci) { + if (!cpu_is_online(ci)) { + if (!firstoffline) +#ifdef __HAVE_CPU_TOPOLOGY + if (sched_smt || ci->ci_smt_id == 0) +#endif + firstoffline = ci; + continue; + } + total = 0; + for (i = 0; i < CPUSTATES; i++) { + total += ci->ci_schedstate.spc_cp_time[i]; + } + total -= totalticks[j]; + idle = ci->ci_schedstate.spc_cp_time[CP_IDLE] - idleticks[j]; + if (idle < total / 3) + speedup = 1; + alltotal += 
total; + allidle += idle; + idleticks[j] += idle; + totalticks[j] += total; + /* it should keep at least one CPU online */ + if (j++ && cpuset_isset(&sched_idle_cpus, ci)) + lastidle = ci; + } + if (allidle < alltotal / 3) + speedup = 1; + if (speedup) + /* twice as long here because we check every 200ms */ + downbeats = 1; + + if (speedup && cpu_setperf && perflevel != powersavelimit) { + perflevel = powersavelimit; + cpu_setperf(perflevel); + } else if (speedup && firstoffline) { + sched_start_cpu(firstoffline); + } else if (!speedup && cpu_setperf && perflevel != 0 && --downbeats <= 0) { + perflevel = 0; + cpu_setperf(perflevel); + } else if (!speedup && lastidle) { + sched_stop_cpu(lastidle); + } + + /* every 200ms to have a better resolution of the load */ + timeout_add_msec(&setperf_to_powersaving, 200); + return; +} + + int sysctl_hwsetperf(void *oldp, size_t *oldlenp, void *newp, size_t newlen) { @@ -691,6 +785,9 @@ sysctl_hwperfpolicy(void *oldp, size_t *oldlenp, void *newp, size_t newlen) case PERFPOL_AUTO: strlcpy(policy, "auto", sizeof(policy)); break; + case PERFPOL_POWERSAVING: + strlcpy(policy, "powersaving", sizeof(policy)); + break; case PERFPOL_HIGH: strlcpy(policy, "high", sizeof(policy)); break; @@ -709,6 +806,8 @@ sysctl_hwperfpolicy(void *oldp, size_t *oldlenp, void *newp, size_t newlen) perfpolicy = PERFPOL_MANUAL; else if (strcmp(policy, "auto") == 0) perfpolicy = PERFPOL_AUTO; + else if (strcmp(policy, "powersaving") == 0) + perfpolicy = PERFPOL_POWERSAVING; else if (strcmp(policy, "high") == 0) perfpolicy = PERFPOL_HIGH; else @@ -716,12 +815,24 @@ sysctl_hwperfpolicy(void *oldp, size_t *oldlenp, void *newp, size_t newlen) if (perfpolicy == PERFPOL_AUTO) { timeout_add_msec(&setperf_to, 200); + } else if (perfpolicy == PERFPOL_POWERSAVING) { + timeout_add_msec(&setperf_to_powersaving, 200); } else if (perfpolicy == PERFPOL_HIGH) { perflevel = 100; cpu_setperf(perflevel); } return 0; } + +int +sysctl_powersavelimit(void *oldp, size_t *oldlenp, 
void *newp, size_t newlen) +{ + if (!cpu_setperf) + return EOPNOTSUPP; + + return sysctl_int_bounded(oldp, oldlenp, newp, newlen, + &powersavelimit, 0, 100); +} #endif /* diff --git sys/sys/sched.h sys/sys/sched.h index ac6dad2a8bb..844bad4834e 100644 --- sys/sys/sched.h +++ sys/sys/sched.h @@ -173,10 +173,14 @@ void sched_barrier(struct cpu_info *ci); int sysctl_hwsetperf(void *, size_t *, void *, size_t); int sysctl_hwperfpolicy(void *, size_t *, void *, size_t); +int sysctl_powersavelimit(void *, size_t *, void *, size_t); int sysctl_hwsmt(void *, size_t *, void *, size_t); int sysctl_hwncpuonline(void); #ifdef MULTIPROCESSOR +void sched_start_cpu(struct cpu_info *ci); +void sched_stop_cpu(struct cpu_info *ci); +void sched_wait_halted_cpu(struct cpu_info *ci); void sched_start_secondary_cpus(void); void sched_stop_secondary_cpus(void); #endif diff --git sys/sys/sysctl.h sys/sys/sysctl.h index e695f1d95cd..4b8ec5d2487 100644 --- sys/sys/sysctl.h +++ sys/sys/sysctl.h @@ -950,6 +950,7 @@ struct kinfo_file { #define HW_POWER 26 /* int: machine has wall-power */ #define HW_BATTERY 27 /* node: battery */ #define HW_UCOMNAMES 28 /* strings: ucom names */ +#define HW_POWERSAVELIMIT 29 /* set the limit for powersave policy */ #define HW_MAXID 30 /* number of valid hw ids */ #define CTL_HW_NAMES { \ @@ -982,6 +983,7 @@ struct kinfo_file { { "power", CTLTYPE_INT }, \ { "battery", CTLTYPE_NODE }, \ { "ucomnames", CTLTYPE_STRING }, \ + { "powersavelimit", CTLTYPE_INT }, \ } /* diff --git usr.sbin/apm/apm.8 usr.sbin/apm/apm.8 index ed110e11f14..6b25228bfd1 100644 --- usr.sbin/apm/apm.8 +++ usr.sbin/apm/apm.8 @@ -90,6 +90,7 @@ If charging, the estimated time to fully charge is displayed instead. Display the performance adjustment mode. 0 means manual mode. 1 means automatic mode. +2 means powersaving mode. .It Fl S Put the system into stand-by (light sleep) state. 
.It Fl v diff --git usr.sbin/apmd/apm-proto.h usr.sbin/apmd/apm-proto.h index 867d0afbd70..166618e996f 100644 --- usr.sbin/apmd/apm-proto.h +++ usr.sbin/apmd/apm-proto.h @@ -38,6 +38,7 @@ enum apm_action { SETPERF_LOW, SETPERF_HIGH, SETPERF_AUTO, + SETPERF_POWERSAVING, }; enum apm_state { @@ -51,6 +52,7 @@ enum apm_perfmode { PERF_NONE = -1, PERF_MANUAL, PERF_AUTO, + PERF_POWERSAVING, }; struct apm_command { diff --git usr.sbin/apmd/apmd.8 usr.sbin/apmd/apmd.8 index 7c02bb10c8a..8c54efc1a69 100644 --- usr.sbin/apmd/apmd.8 +++ usr.sbin/apmd/apmd.8 @@ -34,7 +34,7 @@ .Nd Advanced Power Management daemon .Sh SYNOPSIS .Nm apmd -.Op Fl AadHLs +.Op Fl AadHLPs .Op Fl f Ar devname .Op Fl S Ar sockname .Op Fl t Ar seconds @@ -94,6 +94,12 @@ Start in manual performance adjustment mode, initialising .Va hw.setperf to 0. +.It Fl P +Start +.Nm +in powersaving mode. This flag can be used with +.Fl A +to use auto mode on AC power, and powersaving on battery. .It Fl S Ar sockname Specify an alternate socket name, .Ar sockname . 
diff --git usr.sbin/apmd/apmd.c usr.sbin/apmd/apmd.c index f29d5c9a081..af01acc7a0b 100644 --- usr.sbin/apmd/apmd.c +++ usr.sbin/apmd/apmd.c @@ -58,6 +58,7 @@ #define AUTO_HIBERNATE 2 int debug = 0; +int use_powersaving = 0; extern char *__progname; @@ -100,7 +101,7 @@ void usage(void) { fprintf(stderr, - "usage: %s [-AadHLs] [-f devname] [-S sockname] [-t seconds] " + "usage: %s [-AadHLPs] [-f devname] [-S sockname] [-t seconds] " "[-Z percent] [-z percent]\n", __progname); exit(1); } @@ -139,6 +140,10 @@ power_status(int fd, int force, struct apm_power_info *pinfo) static struct apm_power_info last; int acon = 0, priority = LOG_NOTICE; + int perfpol_mib[] = { CTL_HW, HW_PERFPOLICY }; + char perfpol[32]; + size_t perfpol_sz = sizeof(perfpol); + if (fd == -1) { if (pinfo) { bstate.battery_state = 255; @@ -151,11 +156,19 @@ power_status(int fd, int force, struct apm_power_info *pinfo) return 0; } + if (sysctl(perfpol_mib, 2, perfpol, &perfpol_sz, NULL, 0) == -1) + logmsg(LOG_INFO, "cannot read hw.perfpolicy"); + if (ioctl(fd, APM_IOC_GETPOWER, &bstate) == 0) { /* various conditions under which we report status: something changed * enough since last report, or asked to force a print */ - if (bstate.ac_state == APM_AC_ON) + if (bstate.ac_state == APM_AC_ON) { acon = 1; + if(use_powersaving && strcmp(perfpol, "powersaving") == 0) + setperfpolicy("auto"); + } else if(use_powersaving && strcmp(perfpol, "auto") == 0) + setperfpolicy("powersaving"); + if (bstate.battery_state == APM_BATT_CRITICAL && bstate.battery_state != last.battery_state) priority = LOG_EMERG; @@ -282,6 +295,11 @@ handle_client(int sock_fd, int ctl_fd) logmsg(LOG_NOTICE, "setting hw.perfpolicy to high"); setperfpolicy("high"); break; + case SETPERF_POWERSAVING: + reply.newstate = NORMAL; + logmsg(LOG_NOTICE, "setting hw.perfpolicy to powersaving"); + setperfpolicy("powersaving"); + break; case SETPERF_AUTO: reply.newstate = NORMAL; logmsg(LOG_NOTICE, "setting hw.perfpolicy to auto"); @@ -299,8 +317,10 
@@ handle_client(int sock_fd, int ctl_fd) if (strcmp(perfpol, "manual") == 0 || strcmp(perfpol, "high") == 0) { reply.perfmode = PERF_MANUAL; - } else if (strcmp(perfpol, "auto") == 0) + } else if (strcmp(perfpol, "auto") == 0) { reply.perfmode = PERF_AUTO; + } else if (strcmp(perfpol, "powersaving") == 0) + reply.perfmode = PERF_POWERSAVING; } if (sysctl(cpuspeed_mib, 2, &cpuspeed, &cpuspeed_sz, NULL, 0) == -1) { @@ -423,7 +443,7 @@ main(int argc, char *argv[]) struct kevent ev[2]; int doperf = PERF_NONE; - while ((ch = getopt(argc, argv, "aACdHLsf:t:S:z:Z:")) != -1) + while ((ch = getopt(argc, argv, "aACdHLPsf:t:S:z:Z:")) != -1) switch(ch) { case 'a': noacsleep = 1; @@ -448,7 +468,9 @@ main(int argc, char *argv[]) break; case 'A': case 'C': - if (doperf != PERF_NONE) + if (doperf == PERF_POWERSAVING) + use_powersaving = 1; + else if (doperf != PERF_NONE) usage(); doperf = PERF_AUTO; setperfpolicy("auto"); @@ -459,6 +481,15 @@ main(int argc, char *argv[]) doperf = PERF_MANUAL; setperfpolicy("low"); break; + case 'P': + if (doperf == PERF_NONE) { + doperf = PERF_POWERSAVING; + setperfpolicy("powersaving"); + } else if (doperf != PERF_AUTO) + usage(); + else + use_powersaving = 1; + break; case 'H': if (doperf != PERF_NONE) usage(); diff --git usr.sbin/apmd/apmsubr.c usr.sbin/apmd/apmsubr.c index 6504fe823bb..a1dbcfb2c61 100644 --- usr.sbin/apmd/apmsubr.c +++ usr.sbin/apmd/apmsubr.c @@ -79,6 +79,8 @@ perf_mode(int mode) return "manual"; case PERF_AUTO: return "auto"; + case PERF_POWERSAVING: + return "powersaving"; default: return "invalid"; } -- wbr, Kirill