From: Mateusz Guzik <mjguzik@gmail.com>
Subject: [PATCH] amd64: import optimized memcmp from FreeBSD
To: tech@openbsd.org
Cc: Mateusz Guzik <mjguzik@gmail.com>
Date: Fri, 29 Nov 2024 02:01:45 +0100

The rep-prefixed cmps is incredibly slow even on modern CPUs.

The new implementation uses regular cmp instructions instead.

The code was augmented to account for retguard; otherwise it matches FreeBSD.
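
For anyone who does not read amd64 assembly, here is a rough C-level sketch
of the idea (an illustration only -- sketch_memcmp is a made-up name, and the
real assembly additionally special-cases lengths up to 32 bytes with
overlapping loads from both ends rather than always looping):

	#include <stddef.h>
	#include <string.h>

	int
	sketch_memcmp(const void *s1, const void *s2, size_t n)
	{
		const unsigned char *p1 = s1, *p2 = s2;

		/* Compare 8 bytes at a time with plain loads, no rep prefix. */
		while (n >= 8) {
			unsigned long long a, b;

			memcpy(&a, p1, 8);
			memcpy(&b, p2, 8);
			if (a != b)
				break;	/* narrow down to the byte below */
			p1 += 8;
			p2 += 8;
			n -= 8;
		}

		/* At most 8 bytes left, or a word known to mismatch. */
		while (n-- > 0) {
			if (*p1 != *p2)
				return (*p1 - *p2);	/* unsigned byte difference */
			p1++;
			p2++;
		}
		return (0);
	}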

On Sapphire Rapids, open1 (open + close in a loop) from will-it-scale, in ops/s:
before:	436177
after:	464670 (+6.5%)

Profiling the stock kernel with btrace -e 'profile:hz:99 { @[kstack] = count(); }'
shows the routine at the top of the profile; it drops to the bottom with the patch.

The new top is memmove, which was also optimized in FreeBSD.

The other routines can receive similar treatment (including for libc), but I
have no interest in doing it. I guarantee there is quite a bit of extra
single-threaded perf up for grabs with this alone -- the old string ops are total crap.

cheers
---
 sys/lib/libkern/arch/amd64/memcmp.S | 244 ++++++++++++++++++++++++----
 1 file changed, 214 insertions(+), 30 deletions(-)

diff --git a/sys/lib/libkern/arch/amd64/memcmp.S b/sys/lib/libkern/arch/amd64/memcmp.S
index b9944361869..f76552af1b4 100644
--- a/sys/lib/libkern/arch/amd64/memcmp.S
+++ b/sys/lib/libkern/arch/amd64/memcmp.S
@@ -1,39 +1,223 @@
-/*
- * Written by J.T. Conklin <jtc@netbsd.org>.
- * Public domain.
- * Adapted for NetBSD/x86_64 by Frank van der Linden <fvdl@wasabisystems.com>
+/*-
+ * Copyright (c) 2018 The FreeBSD Foundation
+ *
+ * This software was developed by Mateusz Guzik <mjg@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
  */
 
 #include <machine/asm.h>
 
+#define ALIGN_TEXT      .p2align 4,0x90 /* 16-byte alignment, nop filled */
+
 ENTRY(memcmp)
 	RETGUARD_SETUP(memcmp, r11)
-	movq	%rdx,%rcx		/* compare by longs */
-	shrq	$3,%rcx
-	repe
-	cmpsq
-	jne	5f			/* do we match so far? */
-
-	movq	%rdx,%rcx		/* compare remainder by bytes */
-	andq	$7,%rcx
-	repe
-	cmpsb
-	jne	6f			/* do we match? */
-
-	xorl	%eax,%eax		/* we match, return zero	*/
-	jmp	7f
-
-5:	movl	$8,%ecx			/* We know that one of the next	*/
-	subq	%rcx,%rdi		/* eight pairs of bytes do not	*/
-	subq	%rcx,%rsi		/* match.			*/
-	repe
-	cmpsb
-6:	xorl	%eax,%eax		/* Perform unsigned comparison	*/
-	movb	-1(%rdi),%al
-	xorl	%edx,%edx
-	movb	-1(%rsi),%dl
-	subl    %edx,%eax
-7:	RETGUARD_CHECK(memcmp, r11)
+	xorl	%eax,%eax
+10:
+	cmpq	$16,%rdx
+	ja	101632f
+
+	cmpb	$8,%dl
+	jg	100816f
+
+	cmpb	$4,%dl
+	jg	100408f
+
+	cmpb	$2,%dl
+	jge	100204f
+
+	cmpb	$1,%dl
+	jl	100000f
+	movzbl	(%rdi),%eax
+	movzbl	(%rsi),%r8d
+	subl	%r8d,%eax
+100000:
+	RETGUARD_CHECK(memcmp, r11)
+	ret
+	lfence
+
+	ALIGN_TEXT
+100816:
+	movq	(%rdi),%r8
+	movq	(%rsi),%r9
+	cmpq	%r8,%r9
+	jne	80f
+	movq	-8(%rdi,%rdx),%r8
+	movq	-8(%rsi,%rdx),%r9
+	cmpq	%r8,%r9
+	jne	10081608f
+	RETGUARD_CHECK(memcmp, r11)
+	ret
+	lfence
+	ALIGN_TEXT
+100408:
+	movl	(%rdi),%r8d
+	movl	(%rsi),%r9d
+	cmpl	%r8d,%r9d
+	jne	80f
+	movl	-4(%rdi,%rdx),%r8d
+	movl	-4(%rsi,%rdx),%r9d
+	cmpl	%r8d,%r9d
+	jne	10040804f
+	RETGUARD_CHECK(memcmp, r11)
+	ret
+	lfence
+	ALIGN_TEXT
+100204:
+	movzwl	(%rdi),%r8d
+	movzwl	(%rsi),%r9d
+	cmpl	%r8d,%r9d
+	jne	1f
+	movzwl	-2(%rdi,%rdx),%r8d
+	movzwl	-2(%rsi,%rdx),%r9d
+	cmpl	%r8d,%r9d
+	jne	1f
+	RETGUARD_CHECK(memcmp, r11)
+	ret
+	lfence
+	ALIGN_TEXT
+101632:
+	cmpq	$32,%rdx
+	ja	103200f
+	movq	(%rdi),%r8
+	movq	(%rsi),%r9
+	cmpq	%r8,%r9
+	jne	80f
+	movq	8(%rdi),%r8
+	movq	8(%rsi),%r9
+	cmpq	%r8,%r9
+	jne	10163208f
+	movq	-16(%rdi,%rdx),%r8
+	movq	-16(%rsi,%rdx),%r9
+	cmpq	%r8,%r9
+	jne	10163216f
+	movq	-8(%rdi,%rdx),%r8
+	movq	-8(%rsi,%rdx),%r9
+	cmpq	%r8,%r9
+	jne	10163224f
+	RETGUARD_CHECK(memcmp, r11)
+	ret
+	lfence
+	ALIGN_TEXT
+103200:
+	movq	(%rdi),%r8
+	movq	8(%rdi),%r9
+	subq	(%rsi),%r8
+	subq	8(%rsi),%r9
+	orq	%r8,%r9
+	jnz	10320000f
+
+	movq    16(%rdi),%r8
+	movq    24(%rdi),%r9
+	subq    16(%rsi),%r8
+	subq    24(%rsi),%r9
+	orq	%r8,%r9
+	jnz     10320016f
+
+	leaq	32(%rdi),%rdi
+	leaq	32(%rsi),%rsi
+	subq	$32,%rdx
+	cmpq	$32,%rdx
+	jae	103200b
+	cmpb	$0,%dl
+	jne	10b
+	RETGUARD_CHECK(memcmp, r11)
+	ret
+	lfence
+
+/*
+ * Mismatch was found.
+ *
+ * We need to compute the difference between strings.
+ * Start with narrowing the range down (16 -> 8 -> 4 bytes).
+ */
+	ALIGN_TEXT
+10320016:
+	leaq	16(%rdi),%rdi
+	leaq	16(%rsi),%rsi
+10320000:
+	movq	(%rdi),%r8
+	movq	(%rsi),%r9
+	cmpq	%r8,%r9
+	jne	80f
+	leaq	8(%rdi),%rdi
+	leaq	8(%rsi),%rsi
+	jmp	80f
+	ALIGN_TEXT
+10081608:
+10163224:
+	leaq	-8(%rdi,%rdx),%rdi
+	leaq	-8(%rsi,%rdx),%rsi
+	jmp	80f
+	ALIGN_TEXT
+10163216:
+	leaq	-16(%rdi,%rdx),%rdi
+	leaq	-16(%rsi,%rdx),%rsi
+	jmp	80f
+	ALIGN_TEXT
+10163208:
+	leaq	8(%rdi),%rdi
+	leaq	8(%rsi),%rsi
+	jmp	80f
+	ALIGN_TEXT
+10040804:
+	leaq	-4(%rdi,%rdx),%rdi
+	leaq	-4(%rsi,%rdx),%rsi
+	jmp	1f
+
+	ALIGN_TEXT
+80:
+	movl	(%rdi),%r8d
+	movl	(%rsi),%r9d
+	cmpl	%r8d,%r9d
+	jne	1f
+	leaq	4(%rdi),%rdi
+	leaq	4(%rsi),%rsi
+
+/*
+ * We have up to 4 bytes to inspect.
+ */
+1:
+	movzbl	(%rdi),%eax
+	movzbl	(%rsi),%r8d
+	cmpb	%r8b,%al
+	jne	2f
+
+	movzbl	1(%rdi),%eax
+	movzbl	1(%rsi),%r8d
+	cmpb	%r8b,%al
+	jne	2f
+
+	movzbl	2(%rdi),%eax
+	movzbl	2(%rsi),%r8d
+	cmpb	%r8b,%al
+	jne	2f
+
+	movzbl	3(%rdi),%eax
+	movzbl	3(%rsi),%r8d
+2:
+	subl	%r8d,%eax
+	RETGUARD_CHECK(memcmp, r11)
 	ret
 	lfence
 END(memcmp)
-- 
2.44.0