Index | Thread | Search

From:
Kirill A. Korinsky <kirill@korins.ky>
Subject:
Re: powersave CPU policy
To:
"Lorenz (xha)" <me@xha.li>, OpenBSD tech <tech@openbsd.org>
Date:
Wed, 15 May 2024 00:47:29 +0100

Download raw body.

Thread
tech@,

I'd like to suggest an updated version of patch that introduced powersaving
cpu policies that trade off some of the system's responsiveness for battery
life. My use case allows to go from ~2 to ~3 hours on a journey 100% -> 15%.

This feature only works when ampd is started with the -P flags. Also, it can
be mixed with auto mode by using both -A and -P flags, which uses auto on AC
power, and powersave on the battery.

This patch dynamically removes CPU cores when the system is idle to prevent
the scheduler from using them for any task, i.e. assuming the hardware puts
unused CPU to sleep to save some power.

I also localize all CPU start and stop to a clean functions, instead bit magic.

It also introduce an upper limit on CPU frequency, which can be controlled
by the hw.powersavelimit at sysctl. I need this to prevent the CPU from
running in turbo mode, which consumes a lot of power. So the default setting
is 75, which is kind a guess, but which is good for the i5-10210U which I'm
using.

I have been using this patch for two weeks I guess, without any issue and
mainly enjoy extended life of daily driver.
    
Kernel + includes + sbin/sysctl + usr.sbin/apm{,d} recompilation required
    
Thus, this work is based on Solene's patch which she announced at
https://marc.info/?l=openbsd-tech&m=163259444331471&w=2
    
diff --git lib/libc/sys/sysctl.2 lib/libc/sys/sysctl.2
index 78c69ff5813..f0f1b613a72 100644
--- lib/libc/sys/sysctl.2
+++ lib/libc/sys/sysctl.2
@@ -293,6 +293,7 @@ privileges may change the value.
 .It Dv HW_PHYSMEM Ta "integer" Ta "no"
 .It Dv HW_PHYSMEM64 Ta "int64_t" Ta "no"
 .It Dv HW_POWER Ta "integer" Ta "no"
+.It Dv HW_POWERSAVELIMIT Ta "integer" Ta "yes"
 .It Dv HW_PRODUCT Ta "string" Ta "no"
 .It Dv HW_SENSORS Ta "node" Ta "not applicable"
 .It Dv HW_SETPERF Ta "integer" Ta "yes"
@@ -384,8 +385,19 @@ The performance policy for power management.
 Can be one of
 .Dq manual ,
 .Dq auto ,
+.Dq powersaving ,
 or
 .Dq high .
+.Pp
+The differences between
+.Dq auto
+and
+.Dq powersaving
+policies are that the latter trades off some system responsiveness for
+reduced power consumption. In addition, it has an upper limit on CPU
+performance that can be adjusted by
+.Dv HW_POWERSAVELIMIT ,
+and it offlines unused CPU cores.
 .It Dv HW_PHYSMEM
 The total physical memory, in bytes.
 This variable is deprecated; use
@@ -395,6 +407,10 @@ instead.
 The total physical memory, in bytes.
 .It Dv HW_POWER Pq Va hw.power
 Machine has wall-power.
+.It Dv HW_POWERSAVELIMIT Pq Va hw.powersavelimit
+An upper limit of CPU performance in
+.Dq powersaving
+policy.
 .It Dv HW_PRODUCT Pq Va hw.product
 The product name of the machine.
 .It Dv HW_SENSORS Pq Va hw.sensors
diff --git sys/kern/kern_sched.c sys/kern/kern_sched.c
index dd54f98d4b8..ec2ad9eb7ce 100644
--- sys/kern/kern_sched.c
+++ sys/kern/kern_sched.c
@@ -639,6 +639,36 @@ sched_peg_curproc(struct cpu_info *ci)
 
 #ifdef MULTIPROCESSOR
 
+void
+sched_start_cpu(struct cpu_info *ci)
+{
+	struct schedstate_percpu *spc = &ci->ci_schedstate;
+
+	atomic_clearbits_int(&spc->spc_schedflags,
+		SPCF_SHOULDHALT | SPCF_HALTED);
+	cpuset_add(&sched_all_cpus, ci);
+}
+
+void sched_stop_cpu(struct cpu_info *ci)
+{
+	struct schedstate_percpu *spc = &ci->ci_schedstate;
+
+	cpuset_del(&sched_all_cpus, ci);
+	atomic_setbits_int(&spc->spc_schedflags, SPCF_SHOULDHALT);
+
+	need_resched(ci);
+}
+
+void sched_wait_halted_cpu(struct cpu_info *ci)
+{
+	struct schedstate_percpu *spc = &ci->ci_schedstate;
+
+	while ((spc->spc_schedflags & SPCF_HALTED) == 0) {
+		sleep_setup(spc, PZERO, "schedstate");
+		sleep_finish(0, (spc->spc_schedflags & SPCF_HALTED) == 0);
+	}
+}
+
 void
 sched_start_secondary_cpus(void)
 {
@@ -646,17 +676,13 @@ sched_start_secondary_cpus(void)
 	struct cpu_info *ci;
 
 	CPU_INFO_FOREACH(cii, ci) {
-		struct schedstate_percpu *spc = &ci->ci_schedstate;
-
 		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
 			continue;
-		atomic_clearbits_int(&spc->spc_schedflags,
-		    SPCF_SHOULDHALT | SPCF_HALTED);
 #ifdef __HAVE_CPU_TOPOLOGY
 		if (!sched_smt && ci->ci_smt_id > 0)
 			continue;
 #endif
-		cpuset_add(&sched_all_cpus, ci);
+		sched_start_cpu(ci);
 	}
 }
 
@@ -670,23 +696,14 @@ sched_stop_secondary_cpus(void)
 	 * Make sure we stop the secondary CPUs.
 	 */
 	CPU_INFO_FOREACH(cii, ci) {
-		struct schedstate_percpu *spc = &ci->ci_schedstate;
-
 		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
 			continue;
-		cpuset_del(&sched_all_cpus, ci);
-		atomic_setbits_int(&spc->spc_schedflags, SPCF_SHOULDHALT);
+		sched_stop_cpu(ci);
 	}
 	CPU_INFO_FOREACH(cii, ci) {
-		struct schedstate_percpu *spc = &ci->ci_schedstate;
-
 		if (CPU_IS_PRIMARY(ci) || !CPU_IS_RUNNING(ci))
 			continue;
-		while ((spc->spc_schedflags & SPCF_HALTED) == 0) {
-			sleep_setup(spc, PZERO, "schedstate");
-			sleep_finish(0,
-			    (spc->spc_schedflags & SPCF_HALTED) == 0);
-		}
+		sched_wait_halted_cpu(ci);
 	}
 }
 
@@ -883,9 +900,9 @@ sysctl_hwsmt(void *oldp, size_t *oldlenp, void *newp, size_t newlen)
 		if (ci->ci_smt_id == 0)
 			continue;
 		if (sched_smt)
-			cpuset_add(&sched_all_cpus, ci);
+			sched_start_cpu(ci);
 		else
-			cpuset_del(&sched_all_cpus, ci);
+			sched_stop_cpu(ci);
 	}
 
 	return 0;
diff --git sys/kern/kern_sysctl.c sys/kern/kern_sysctl.c
index 5d0d1a8a851..0c55bb3da67 100644
--- sys/kern/kern_sysctl.c
+++ sys/kern/kern_sysctl.c
@@ -733,6 +733,8 @@ hw_sysctl(int *name, u_int namelen, void *oldp, size_t *oldlenp, void *newp,
 		return (sysctl_hwsetperf(oldp, oldlenp, newp, newlen));
 	case HW_PERFPOLICY:
 		return (sysctl_hwperfpolicy(oldp, oldlenp, newp, newlen));
+	case HW_POWERSAVELIMIT:
+		return (sysctl_powersavelimit(oldp, oldlenp, newp, newlen));
 #endif /* !SMALL_KERNEL */
 	case HW_VENDOR:
 		if (hw_vendor)
diff --git sys/kern/sched_bsd.c sys/kern/sched_bsd.c
index 25b221c1ee2..ecf17a0f334 100644
--- sys/kern/sched_bsd.c
+++ sys/kern/sched_bsd.c
@@ -67,6 +67,10 @@ uint32_t		decay_aftersleep(uint32_t, uint32_t);
 
 extern struct cpuset sched_idle_cpus;
 
+#ifdef __HAVE_CPU_TOPOLOGY
+extern int sched_smt;
+#endif
+
 /*
  * constants for averages over 1, 5, and 15 minutes when sampling at
  * 5 second intervals.
@@ -573,8 +577,11 @@ void (*cpu_setperf)(int);
 #define PERFPOL_MANUAL 0
 #define PERFPOL_AUTO 1
 #define PERFPOL_HIGH 2
+#define PERFPOL_POWERSAVING 4
 int perflevel = 100;
 int perfpolicy = PERFPOL_AUTO;
+/* avoid turbo frequency by default */
+int powersavelimit = 75;
 
 #ifndef SMALL_KERNEL
 /*
@@ -583,7 +590,9 @@ int perfpolicy = PERFPOL_AUTO;
 #include <sys/sysctl.h>
 
 void setperf_auto(void *);
+void setperf_powersaving(void *);
 struct timeout setperf_to = TIMEOUT_INITIALIZER(setperf_auto, NULL);
+struct timeout setperf_to_powersaving = TIMEOUT_INITIALIZER(setperf_powersaving, NULL);
 extern int hw_power;
 
 void
@@ -653,6 +662,91 @@ faster:
 	timeout_add_msec(&setperf_to, 100);
 }
 
+void
+setperf_powersaving(void *v)
+{
+	static uint64_t *idleticks, *totalticks;
+	static int downbeats;
+	int i, j = 0;
+	int speedup = 0;
+	CPU_INFO_ITERATOR cii;
+	struct cpu_info *ci, *firstoffline = NULL, *lastidle = NULL;
+	uint64_t idle, total, allidle = 0, alltotal = 0;
+
+	/* policy was changed, online all CPU cores */
+	if (perfpolicy != PERFPOL_POWERSAVING) {
+		CPU_INFO_FOREACH(cii, ci) {
+			if (!cpu_is_online(ci)) {
+#ifdef __HAVE_CPU_TOPOLOGY
+				if (!(sched_smt || ci->ci_smt_id == 0))
+					continue;
+#endif
+				sched_start_cpu(ci);
+			}
+		}
+		return;
+	}
+
+	if (!idleticks)
+		if (!(idleticks = mallocarray(ncpusfound, sizeof(*idleticks),
+		    M_DEVBUF, M_NOWAIT | M_ZERO)))
+			return;
+	if (!totalticks)
+		if (!(totalticks = mallocarray(ncpusfound, sizeof(*totalticks),
+		    M_DEVBUF, M_NOWAIT | M_ZERO))) {
+			free(idleticks, M_DEVBUF,
+			    sizeof(*idleticks) * ncpusfound);
+			return;
+		}
+	CPU_INFO_FOREACH(cii, ci) {
+		if (!cpu_is_online(ci)) {
+			if (!firstoffline)
+#ifdef __HAVE_CPU_TOPOLOGY
+				if (sched_smt || ci->ci_smt_id == 0)
+#endif
+					firstoffline = ci;
+			continue;
+		}
+		total = 0;
+		for (i = 0; i < CPUSTATES; i++) {
+			total += ci->ci_schedstate.spc_cp_time[i];
+		}
+		total -= totalticks[j];
+		idle = ci->ci_schedstate.spc_cp_time[CP_IDLE] - idleticks[j];
+		if (idle < total / 3)
+			speedup = 1;
+		alltotal += total;
+		allidle += idle;
+		idleticks[j] += idle;
+		totalticks[j] += total;
+		/* it shoul keep at least one CPU online */
+		if (j++ && cpuset_isset(&sched_idle_cpus, ci))
+			lastidle = ci;
+	}
+	if (allidle < alltotal / 3)
+		speedup = 1;
+	if (speedup)
+		/* twice as long here because we check every 200ms */
+		downbeats = 1;
+
+	if (speedup && cpu_setperf && perflevel != powersavelimit) {
+		perflevel = powersavelimit;
+		cpu_setperf(perflevel);
+	} else if (speedup && firstoffline) {
+		sched_start_cpu(firstoffline);
+	} else if (!speedup && cpu_setperf && perflevel != 0 && --downbeats <= 0) {
+		perflevel = 0;
+		cpu_setperf(perflevel);
+	} else if (!speedup && lastidle) {
+		sched_stop_cpu(lastidle);
+	}
+
+	/* every 200ms to have a better resolution of the load */
+	timeout_add_msec(&setperf_to_powersaving, 200);
+	return;
+}
+
+
 int
 sysctl_hwsetperf(void *oldp, size_t *oldlenp, void *newp, size_t newlen)
 {
@@ -691,6 +785,9 @@ sysctl_hwperfpolicy(void *oldp, size_t *oldlenp, void *newp, size_t newlen)
 	case PERFPOL_AUTO:
 		strlcpy(policy, "auto", sizeof(policy));
 		break;
+	case PERFPOL_POWERSAVING:
+		strlcpy(policy, "powersaving", sizeof(policy));
+		break;
 	case PERFPOL_HIGH:
 		strlcpy(policy, "high", sizeof(policy));
 		break;
@@ -709,6 +806,8 @@ sysctl_hwperfpolicy(void *oldp, size_t *oldlenp, void *newp, size_t newlen)
 		perfpolicy = PERFPOL_MANUAL;
 	else if (strcmp(policy, "auto") == 0)
 		perfpolicy = PERFPOL_AUTO;
+	else if (strcmp(policy, "powersaving") == 0)
+		perfpolicy = PERFPOL_POWERSAVING;
 	else if (strcmp(policy, "high") == 0)
 		perfpolicy = PERFPOL_HIGH;
 	else
@@ -716,12 +815,24 @@ sysctl_hwperfpolicy(void *oldp, size_t *oldlenp, void *newp, size_t newlen)
 
 	if (perfpolicy == PERFPOL_AUTO) {
 		timeout_add_msec(&setperf_to, 200);
+	} else if (perfpolicy == PERFPOL_POWERSAVING) {
+		timeout_add_msec(&setperf_to_powersaving, 200);
 	} else if (perfpolicy == PERFPOL_HIGH) {
 		perflevel = 100;
 		cpu_setperf(perflevel);
 	}
 	return 0;
 }
+
+int
+sysctl_powersavelimit(void *oldp, size_t *oldlenp, void *newp, size_t newlen)
+{
+	if (!cpu_setperf)
+		return EOPNOTSUPP;
+
+	return sysctl_int_bounded(oldp, oldlenp, newp, newlen,
+	    &powersavelimit, 0, 100);
+}
 #endif
 
 /*
diff --git sys/sys/sched.h sys/sys/sched.h
index ac6dad2a8bb..844bad4834e 100644
--- sys/sys/sched.h
+++ sys/sys/sched.h
@@ -173,10 +173,14 @@ void sched_barrier(struct cpu_info *ci);
 
 int sysctl_hwsetperf(void *, size_t *, void *, size_t);
 int sysctl_hwperfpolicy(void *, size_t *, void *, size_t);
+int sysctl_powersavelimit(void *, size_t *, void *, size_t);
 int sysctl_hwsmt(void *, size_t *, void *, size_t);
 int sysctl_hwncpuonline(void);
 
 #ifdef MULTIPROCESSOR
+void sched_start_cpu(struct cpu_info *ci);
+void sched_stop_cpu(struct cpu_info *ci);
+void sched_wait_halted_cpu(struct cpu_info *ci);
 void sched_start_secondary_cpus(void);
 void sched_stop_secondary_cpus(void);
 #endif
diff --git sys/sys/sysctl.h sys/sys/sysctl.h
index e695f1d95cd..4b8ec5d2487 100644
--- sys/sys/sysctl.h
+++ sys/sys/sysctl.h
@@ -950,6 +950,7 @@ struct kinfo_file {
 #define	HW_POWER		26	/* int: machine has wall-power */
 #define	HW_BATTERY		27	/* node: battery */
 #define	HW_UCOMNAMES		28	/* strings: ucom names */
+#define	HW_POWERSAVELIMIT	29	/* set the limit for powersave policy */
 #define	HW_MAXID		30	/* number of valid hw ids */
 
 #define	CTL_HW_NAMES { \
@@ -982,6 +983,7 @@ struct kinfo_file {
 	{ "power", CTLTYPE_INT }, \
 	{ "battery", CTLTYPE_NODE }, \
 	{ "ucomnames", CTLTYPE_STRING }, \
+	{ "powersavelimit", CTLTYPE_INT }, \
 }
 
 /*
diff --git usr.sbin/apm/apm.8 usr.sbin/apm/apm.8
index ed110e11f14..6b25228bfd1 100644
--- usr.sbin/apm/apm.8
+++ usr.sbin/apm/apm.8
@@ -90,6 +90,7 @@ If charging, the estimated time to fully charge is displayed instead.
 Display the performance adjustment mode.
 0 means manual mode.
 1 means automatic mode.
+2 means powersaving mode.
 .It Fl S
 Put the system into stand-by (light sleep) state.
 .It Fl v
diff --git usr.sbin/apmd/apm-proto.h usr.sbin/apmd/apm-proto.h
index 867d0afbd70..166618e996f 100644
--- usr.sbin/apmd/apm-proto.h
+++ usr.sbin/apmd/apm-proto.h
@@ -38,6 +38,7 @@ enum apm_action {
 	SETPERF_LOW,
 	SETPERF_HIGH,
 	SETPERF_AUTO,
+	SETPERF_POWERSAVING,
 };
 
 enum apm_state {
@@ -51,6 +52,7 @@ enum apm_perfmode {
 	PERF_NONE = -1,
 	PERF_MANUAL,
 	PERF_AUTO,
+	PERF_POWERSAVING,
 };
 
 struct apm_command {
diff --git usr.sbin/apmd/apmd.8 usr.sbin/apmd/apmd.8
index 7c02bb10c8a..8c54efc1a69 100644
--- usr.sbin/apmd/apmd.8
+++ usr.sbin/apmd/apmd.8
@@ -34,7 +34,7 @@
 .Nd Advanced Power Management daemon
 .Sh SYNOPSIS
 .Nm apmd
-.Op Fl AadHLs
+.Op Fl AadHLPs
 .Op Fl f Ar devname
 .Op Fl S Ar sockname
 .Op Fl t Ar seconds
@@ -94,6 +94,12 @@ Start
 in manual performance adjustment mode, initialising
 .Va hw.setperf
 to 0.
+.It Fl P
+Start
+.Nm
+in powersaving mode. This flag can be used with
+.Fl A
+to use auto mode on AC power, and powersaving on battery.
 .It Fl S Ar sockname
 Specify an alternate socket name,
 .Ar sockname .
diff --git usr.sbin/apmd/apmd.c usr.sbin/apmd/apmd.c
index f29d5c9a081..af01acc7a0b 100644
--- usr.sbin/apmd/apmd.c
+++ usr.sbin/apmd/apmd.c
@@ -58,6 +58,7 @@
 #define AUTO_HIBERNATE 2
 
 int debug = 0;
+int use_powersaving = 0;
 
 extern char *__progname;
 
@@ -100,7 +101,7 @@ void
 usage(void)
 {
 	fprintf(stderr,
-	    "usage: %s [-AadHLs] [-f devname] [-S sockname] [-t seconds] "
+	    "usage: %s [-AadHLPs] [-f devname] [-S sockname] [-t seconds] "
 		"[-Z percent] [-z percent]\n", __progname);
 	exit(1);
 }
@@ -139,6 +140,10 @@ power_status(int fd, int force, struct apm_power_info *pinfo)
 	static struct apm_power_info last;
 	int acon = 0, priority = LOG_NOTICE;
 
+	int perfpol_mib[] = { CTL_HW, HW_PERFPOLICY };
+	char perfpol[32];
+	size_t perfpol_sz = sizeof(perfpol);
+
 	if (fd == -1) {
 		if (pinfo) {
 			bstate.battery_state = 255;
@@ -151,11 +156,19 @@ power_status(int fd, int force, struct apm_power_info *pinfo)
 		return 0;
 	}
 
+	if (sysctl(perfpol_mib, 2, perfpol, &perfpol_sz, NULL, 0) == -1)
+		logmsg(LOG_INFO, "cannot read hw.perfpolicy");
+
 	if (ioctl(fd, APM_IOC_GETPOWER, &bstate) == 0) {
 	/* various conditions under which we report status:  something changed
 	 * enough since last report, or asked to force a print */
-		if (bstate.ac_state == APM_AC_ON)
+		if (bstate.ac_state == APM_AC_ON) {
 			acon = 1;
+			if(use_powersaving && strcmp(perfpol, "powersaving") == 0)
+				setperfpolicy("auto");
+		} else if(use_powersaving && strcmp(perfpol, "auto") == 0)
+				setperfpolicy("powersaving");
+
 		if (bstate.battery_state == APM_BATT_CRITICAL &&
 		    bstate.battery_state != last.battery_state)
 			priority = LOG_EMERG;
@@ -282,6 +295,11 @@ handle_client(int sock_fd, int ctl_fd)
 		logmsg(LOG_NOTICE, "setting hw.perfpolicy to high");
 		setperfpolicy("high");
 		break;
+	case SETPERF_POWERSAVING:
+		reply.newstate = NORMAL;
+		logmsg(LOG_NOTICE, "setting hw.perfpolicy to powersaving");
+		setperfpolicy("powersaving");
+		break;
 	case SETPERF_AUTO:
 		reply.newstate = NORMAL;
 		logmsg(LOG_NOTICE, "setting hw.perfpolicy to auto");
@@ -299,8 +317,10 @@ handle_client(int sock_fd, int ctl_fd)
 		if (strcmp(perfpol, "manual") == 0 ||
 		    strcmp(perfpol, "high") == 0) {
 			reply.perfmode = PERF_MANUAL;
-		} else if (strcmp(perfpol, "auto") == 0)
+		} else if (strcmp(perfpol, "auto") == 0) {
 			reply.perfmode = PERF_AUTO;
+		} else if (strcmp(perfpol, "powersaving") == 0)
+			reply.perfmode = PERF_POWERSAVING;
 	}
 
 	if (sysctl(cpuspeed_mib, 2, &cpuspeed, &cpuspeed_sz, NULL, 0) == -1) {
@@ -423,7 +443,7 @@ main(int argc, char *argv[])
 	struct kevent ev[2];
 	int doperf = PERF_NONE;
 
-	while ((ch = getopt(argc, argv, "aACdHLsf:t:S:z:Z:")) != -1)
+	while ((ch = getopt(argc, argv, "aACdHLPsf:t:S:z:Z:")) != -1)
 		switch(ch) {
 		case 'a':
 			noacsleep = 1;
@@ -448,7 +468,9 @@ main(int argc, char *argv[])
 			break;
 		case 'A':
 		case 'C':
-			if (doperf != PERF_NONE)
+			if (doperf == PERF_POWERSAVING)
+				use_powersaving = 1;
+			else if (doperf != PERF_NONE)
 				usage();
 			doperf = PERF_AUTO;
 			setperfpolicy("auto");
@@ -459,6 +481,15 @@ main(int argc, char *argv[])
 			doperf = PERF_MANUAL;
 			setperfpolicy("low");
 			break;
+		case 'P':
+			if (doperf == PERF_NONE) {
+				doperf = PERF_POWERSAVING;
+				setperfpolicy("powersaving");
+			} else if (doperf != PERF_AUTO)
+				usage();
+			else
+				use_powersaving = 1;
+			break;
 		case 'H':
 			if (doperf != PERF_NONE)
 				usage();
diff --git usr.sbin/apmd/apmsubr.c usr.sbin/apmd/apmsubr.c
index 6504fe823bb..a1dbcfb2c61 100644
--- usr.sbin/apmd/apmsubr.c
+++ usr.sbin/apmd/apmsubr.c
@@ -79,6 +79,8 @@ perf_mode(int mode)
 		return "manual";
 	case PERF_AUTO:
 		return "auto";
+	case PERF_POWERSAVING:
+		return "powersaving";
 	default:
 		return "invalid";
 	}


-- 
wbr, Kirill