Index | Thread | Search

From:
Stefan Fritsch <sf@sfritsch.de>
Subject:
Re: Mostly-Inverted CPU-Core ordering
To:
Greg Schaefer <gsgs7878@proton.me>
Cc:
"tech@openbsd.org" <tech@openbsd.org>, visa@openbsd.org
Date:
Tue, 13 Jan 2026 18:34:39 +0100

Download raw body.

Thread
visa@ Theo suggested to cc you because of mips64

On Tue, 13 Jan 2026, Stefan Fritsch wrote:

> On Mon, 12 Jan 2026, Greg Schaefer wrote:
> 
> > While debugging another issue, I noticed that amd64/cpu_attach (and others
> > it appears) reorder the ACPI x2APIC CPU list. In a hybrid world, think that
> > Intel recommends x2APIC ordering by "CPU power" (P-cores first, E-cores next,
> > LP-cores last). cpu_info_list contains a static primary (boot-CPU) record
> > and a linked-list to the remaining application processors. The boot core is
> > copied to the static allocation, but the remaining cores are head-inserted
> > (reversed) into the linked-list.
> > 
> >         case CPU_ROLE_AP:
> > #if defined(MULTIPROCESSOR)
> >                 cpu_intr_init(ci);
> >                 cpu_start_secondary(ci);
> >                 clockqueue_init(&ci->ci_queue);
> >                 sched_init_cpu(ci);
> >                 ncpus++;
> >                 if (ci->ci_flags & CPUF_PRESENT) {
> >  **                      ci->ci_next = cpu_info_list->ci_next;
> >  **                      cpu_info_list->ci_next = ci;
> > 
> > Using the example of a hybrid process with 2 P-cores, 2 E-cores and 2 LP-cores,
> > the resulting ordering is P-Core 0, LP-Core 1, LP-Core 0, E-Core 1, E-core 0,
> > P-core 0. When NICs setup their interrupt queues (often power-of-two), via
> > intrmap_create(), they are striped across the cores in that mostly-reversed
> > order.
> > 
> > Unclear what, if any, performance impact this has, but it can make debugging
> > interesting. The simple fix is appending to cpu_info_list instead of reverse
> > head-inserting. Please ignore if a known/non-issue.
> 
> I can confirm your analysis. I don't have access to a hybrid CPU system
> at the moment, but I have Linux cpuinfo and acpi table infos from two
> Dell laptops with Core i7-1270P and Core i7-1370P and they seem to have
> the P-cores before the E-cores in the MADT. Also, if I enable INTRDEBUG
> in amd64/intr.c, I see that CPU_INFO_FOREACH traverses the CPUs in the
> order 0,5,4,3,2,1 on a 6 VCPU VM and intrmap_create() uses
> CPU_INFO_FOREACH.
> 
> I also agree that one probably wants to have multi-queue NICs use the
> CPUs with the highest performance and re-ordering the cpu_info_list
> seems to have that effect at least on intel.  The amd64 diff below puts
> them in the correct order and seems to work. But the same code seems to
> be in most (all?) other MP architectures.
> 
> I don't know what effect the different ordering could have on non-hybrid
> systems. On multi-socket systems, the interrupt distribution over the
> sockets may change, but I don't see a problem with that. The order in
> which IPIs are send will also change, and the scheduler will likely also
> do things slightly differently, but I have no idea if that would have a
> measurable impact. Anything else?

Updated diff below that should fix all architectures. sparc64 already has 
the correct ordering. Boot tested on amd64, i386, arm64.


diff --git a/sys/arch/alpha/alpha/cpu.c b/sys/arch/alpha/alpha/cpu.c
index 23b25c94219..38425f1d905 100644
--- a/sys/arch/alpha/alpha/cpu.c
+++ b/sys/arch/alpha/alpha/cpu.c
@@ -419,7 +419,7 @@ cpu_announce_extensions(struct cpu_info *ci)
 void
 cpu_boot_secondary_processors(void)
 {
-	struct cpu_info *ci;
+	struct cpu_info *ci, *ci_last;
 	u_long i;
 
 	for (i = 0; i < ALPHA_MAXPROCS; i++) {
@@ -434,8 +434,10 @@ cpu_boot_secondary_processors(void)
 		/*
 		 * Link the processor into the list, and launch it.
 		 */
-		ci->ci_next = cpu_info_list->ci_next;
-		cpu_info_list->ci_next = ci;
+		ci_last = cpu_info_list;
+		while (ci_last->ci_next != NULL)
+			ci_last = ci_last->ci_next;
+		ci_last->ci_next = ci;
 		atomic_setbits_ulong(&ci->ci_flags, CPUF_RUNNING);
 		atomic_setbits_ulong(&cpus_running, (1UL << i));
 		ncpus++;
diff --git a/sys/arch/amd64/amd64/cpu.c b/sys/arch/amd64/amd64/cpu.c
index 18cfb8d6377..b2e59ae800f 100644
--- a/sys/arch/amd64/amd64/cpu.c
+++ b/sys/arch/amd64/amd64/cpu.c
@@ -583,7 +583,7 @@ cpu_attach(struct device *parent, struct device *self, void *aux)
 {
 	struct cpu_softc *sc = (void *) self;
 	struct cpu_attach_args *caa = aux;
-	struct cpu_info *ci;
+	struct cpu_info *ci, *ci_last;
 #if defined(MULTIPROCESSOR)
 	int cpunum = sc->sc_dev.dv_unit;
 	vaddr_t kstack;
@@ -729,8 +729,10 @@ cpu_attach(struct device *parent, struct device *self, void *aux)
 		sched_init_cpu(ci);
 		ncpus++;
 		if (ci->ci_flags & CPUF_PRESENT) {
-			ci->ci_next = cpu_info_list->ci_next;
-			cpu_info_list->ci_next = ci;
+			ci_last = cpu_info_list;
+			while (ci_last->ci_next != NULL)
+				ci_last = ci_last->ci_next;
+			ci_last->ci_next = ci;
 		}
 #else
 		printf("%s: not started\n", sc->sc_dev.dv_xname);
diff --git a/sys/arch/arm/arm/cpu.c b/sys/arch/arm/arm/cpu.c
index f51404f73be..0ce7f92fbf2 100644
--- a/sys/arch/arm/arm/cpu.c
+++ b/sys/arch/arm/arm/cpu.c
@@ -342,7 +342,7 @@ void
 cpu_attach(struct device *parent, struct device *dev, void *aux)
 {
 	struct fdt_attach_args *faa = aux;
-	struct cpu_info *ci;
+	struct cpu_info *ci, *ci_last;
 	uint32_t mpidr;
 	uint32_t opp;
 
@@ -356,8 +356,10 @@ cpu_attach(struct device *parent, struct device *dev, void *aux)
 	} else {
 		ci = malloc(sizeof(*ci), M_DEVBUF, M_WAITOK | M_ZERO);
 		cpu_info[dev->dv_unit] = ci;
-		ci->ci_next = cpu_info_list->ci_next;
-		cpu_info_list->ci_next = ci;
+		ci_last = cpu_info_list;
+		while (ci_last->ci_next != NULL)
+			ci_last = ci_last->ci_next;
+		ci_last->ci_next = ci;
 		ci->ci_flags |= CPUF_AP;
 		ncpus++;
 	}
diff --git a/sys/arch/arm64/arm64/cpu.c b/sys/arch/arm64/arm64/cpu.c
index f9784e432db..5dabc020a9c 100644
--- a/sys/arch/arm64/arm64/cpu.c
+++ b/sys/arch/arm64/arm64/cpu.c
@@ -1478,7 +1478,7 @@ void
 cpu_attach(struct device *parent, struct device *dev, void *aux)
 {
 	struct fdt_attach_args *faa = aux;
-	struct cpu_info *ci;
+	struct cpu_info *ci, *ci_last;
 	void *kstack;
 #ifdef MULTIPROCESSOR
 	uint64_t mpidr = READ_SPECIALREG(mpidr_el1);
@@ -1494,8 +1494,10 @@ cpu_attach(struct device *parent, struct device *dev, void *aux)
 	} else {
 		ci = malloc(sizeof(*ci), M_DEVBUF, M_WAITOK | M_ZERO);
 		cpu_info[dev->dv_unit] = ci;
-		ci->ci_next = cpu_info_list->ci_next;
-		cpu_info_list->ci_next = ci;
+		ci_last = cpu_info_list;
+		while (ci_last->ci_next != NULL)
+			ci_last = ci_last->ci_next;
+		ci_last->ci_next = ci;
 		ci->ci_flags |= CPUF_AP;
 		ncpus++;
 	}
diff --git a/sys/arch/i386/i386/cpu.c b/sys/arch/i386/i386/cpu.c
index c5f18dd4433..1de18768a37 100644
--- a/sys/arch/i386/i386/cpu.c
+++ b/sys/arch/i386/i386/cpu.c
@@ -231,7 +231,7 @@ cpu_attach(struct device *parent, struct device *self, void *aux)
 {
 	struct cpu_softc *sc = (void *)self;
 	struct cpu_attach_args *caa = (struct cpu_attach_args *)aux;
-	struct cpu_info *ci;
+	struct cpu_info *ci, *ci_last;
 
 #ifdef MULTIPROCESSOR
 	int cpunum = sc->sc_dev.dv_unit;
@@ -365,8 +365,10 @@ cpu_attach(struct device *parent, struct device *self, void *aux)
 		identifycpu(ci);
 		clockqueue_init(&ci->ci_queue);
 		sched_init_cpu(ci);
-		ci->ci_next = cpu_info_list->ci_next;
-		cpu_info_list->ci_next = ci;
+		ci_last = cpu_info_list;
+		while (ci_last->ci_next != NULL)
+			ci_last = ci_last->ci_next;
+		ci_last->ci_next = ci;
 		ncpus++;
 #endif
 		break;
diff --git a/sys/arch/mips64/mips64/cpu.c b/sys/arch/mips64/mips64/cpu.c
index 040669d397b..309f671b117 100644
--- a/sys/arch/mips64/mips64/cpu.c
+++ b/sys/arch/mips64/mips64/cpu.c
@@ -78,7 +78,7 @@ cpuattach(struct device *parent, struct device *dev, void *aux)
 {
 	struct cpu_attach_args *caa = aux;
 	struct cpu_hwinfo *ch = caa->caa_hw;
-	struct cpu_info *ci;
+	struct cpu_info *ci, *ci_last;
 	int cpuno = dev->dv_unit;
 	int fptype, vers_maj, vers_min;
 	int displayver;
@@ -96,8 +96,10 @@ cpuattach(struct device *parent, struct device *dev, void *aux)
 		}
 	} else {
 		ci = &cpu_info_secondaries[cpuno - 1];
-		ci->ci_next = cpu_info_list->ci_next;
-		cpu_info_list->ci_next = ci;
+		ci_last = cpu_info_list;
+		while (ci_last->ci_next != NULL)
+			ci_last = ci_last->ci_next;
+		ci_last->ci_next = ci;
 		ci->ci_flags |= CPUF_PRESENT;
 	}
 #else
diff --git a/sys/arch/riscv64/riscv64/cpu.c b/sys/arch/riscv64/riscv64/cpu.c
index e8400d4eed3..fe75ab407cd 100644
--- a/sys/arch/riscv64/riscv64/cpu.c
+++ b/sys/arch/riscv64/riscv64/cpu.c
@@ -183,7 +183,7 @@ void
 cpu_attach(struct device *parent, struct device *dev, void *aux)
 {
 	struct fdt_attach_args *faa = aux;
-	struct cpu_info *ci;
+	struct cpu_info *ci, *ci_last;
 	int node, level;
 	uint32_t opp;
 
@@ -197,8 +197,10 @@ cpu_attach(struct device *parent, struct device *dev, void *aux)
 	} else {
 		ci = malloc(sizeof(*ci), M_DEVBUF, M_WAITOK | M_ZERO);
 		cpu_info[dev->dv_unit] = ci;
-		ci->ci_next = cpu_info_list->ci_next;
-		cpu_info_list->ci_next = ci;
+		ci_last = cpu_info_list;
+		while (ci_last->ci_next != NULL)
+			ci_last = ci_last->ci_next;
+		ci_last->ci_next = ci;
 		ci->ci_flags |= CPUF_AP;
 		ncpus++;
 	}