[PATCH] amd64: import optimized memcmp from FreeBSD
I just noticed that I've still got this diff in my tree and wondered
if it's going to go anywhere.
The thread from last year got derailed into discussion about libc
(partly my fault) and SIMD (not likely to be really useful in the
kernel) - https://marc.info/?t=173284225400001&r=1&w=2 - but I don't
think there were any particular objections to the diff as presented,
and there were good test reports.
----- Forwarded message from Mateusz Guzik <mjguzik@gmail.com> -----
From: Mateusz Guzik <mjguzik@gmail.com>
Date: Fri, 29 Nov 2024 02:01:45 +0100
To: tech@openbsd.org
Cc: Mateusz Guzik <mjguzik@gmail.com>
X-Mailer: git-send-email 2.43.0
Subject: [PATCH] amd64: import optimized memcmp from FreeBSD
The rep-prefixed cmps instruction is incredibly slow even on modern CPUs.
The new implementation uses regular cmp instructions instead.
The code was augmented to account for retguard; otherwise it matches FreeBSD.
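Roughly, in C terms, the 8..16 byte case becomes two possibly-overlapping
8-byte loads. Illustrative sketch only (the names here are made up, and the
real code narrows a mismatch down to the exact byte in assembly rather than
falling back to a byte loop):

#include <stdint.h>
#include <string.h>

/* Return the difference of the first differing byte pair, or 0. */
static int
bytewise(const unsigned char *p, const unsigned char *q, size_t n)
{
	size_t i;

	for (i = 0; i < n; i++)
		if (p[i] != q[i])
			return (int)p[i] - (int)q[i];
	return 0;
}

/* memcmp for 8 <= n <= 16: first 8 bytes, then the last 8 (may overlap). */
static int
cmp_8_to_16(const void *s1, const void *s2, size_t n)
{
	const unsigned char *p = s1, *q = s2;
	uint64_t a, b;

	memcpy(&a, p, 8);
	memcpy(&b, q, 8);
	if (a != b)
		return bytewise(p, q, 8);
	memcpy(&a, p + n - 8, 8);
	memcpy(&b, q + n - 8, 8);
	if (a != b)
		return bytewise(p + n - 8, q + n - 8, 8);
	return 0;
}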
On Sapphire Rapids, open1 (open + close in a loop) from will-it-scale (ops/s):
before: 436177
after: 464670 (+6.5%)
Profiling the stock kernel with btrace -e 'profile:hz:99 { @[kstack] = count(); }'
shows the routine at the top of the profile; it drops to the bottom with the patch.
The new top is memmove which also got optimized in FreeBSD.
The other routines can receive similar treatment (including for libc), but I
have no interest in doing it. I guarantee there is quite a bit of extra
single-threaded perf up for grabs with this alone -- the old string ops are total crap.
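If you want a rough userspace feel for the gap, a sketch along these lines
works (caveats: it exercises libc's memcmp rather than the kernel routine,
and the numbers are machine-dependent; the volatile function pointer just
keeps the compiler from expanding the call inline):

#include <stdio.h>
#include <string.h>
#include <time.h>

int
main(void)
{
	/* Equal buffers force a full-length comparison on every call. */
	static unsigned char a[64], b[64];
	int (*volatile cmp)(const void *, const void *, size_t) = memcmp;
	volatile int sink = 0;
	struct timespec t0, t1;
	long i, iters = 100000000L;
	double secs;

	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (i = 0; i < iters; i++)
		sink += cmp(a, b, 32);
	clock_gettime(CLOCK_MONOTONIC, &t1);
	secs = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
	printf("%.2f ns/call (sink %d)\n", secs / iters * 1e9, sink);
	return 0;
}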
cheers
---
sys/lib/libkern/arch/amd64/memcmp.S | 244 ++++++++++++++++++++++++----
1 file changed, 214 insertions(+), 30 deletions(-)
diff --git a/sys/lib/libkern/arch/amd64/memcmp.S b/sys/lib/libkern/arch/amd64/memcmp.S
index b9944361869..f76552af1b4 100644
--- a/sys/lib/libkern/arch/amd64/memcmp.S
+++ b/sys/lib/libkern/arch/amd64/memcmp.S
@@ -1,39 +1,223 @@
-/*
- * Written by J.T. Conklin <jtc@netbsd.org>.
- * Public domain.
- * Adapted for NetBSD/x86_64 by Frank van der Linden <fvdl@wasabisystems.com>
+/*-
+ * Copyright (c) 2018 The FreeBSD Foundation
+ *
+ * This software was developed by Mateusz Guzik <mjg@FreeBSD.org>
+ * under sponsorship from the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
*/
#include <machine/asm.h>
+#define ALIGN_TEXT .p2align 4,0x90 /* 16-byte alignment, nop filled */
+
ENTRY(memcmp)
RETGUARD_SETUP(memcmp, r11)
- movq %rdx,%rcx /* compare by longs */
- shrq $3,%rcx
- repe
- cmpsq
- jne 5f /* do we match so far? */
-
- movq %rdx,%rcx /* compare remainder by bytes */
- andq $7,%rcx
- repe
- cmpsb
- jne 6f /* do we match? */
-
- xorl %eax,%eax /* we match, return zero */
- jmp 7f
-
-5: movl $8,%ecx /* We know that one of the next */
- subq %rcx,%rdi /* eight pairs of bytes do not */
- subq %rcx,%rsi /* match. */
- repe
- cmpsb
-6: xorl %eax,%eax /* Perform unsigned comparison */
- movb -1(%rdi),%al
- xorl %edx,%edx
- movb -1(%rsi),%dl
- subl %edx,%eax
-7: RETGUARD_CHECK(memcmp, r11)
+ xorl %eax,%eax
+10:
+ cmpq $16,%rdx
+ ja 101632f
+
+ cmpb $8,%dl
+ jg 100816f
+
+ cmpb $4,%dl
+ jg 100408f
+
+ cmpb $2,%dl
+ jge 100204f
+
+ cmpb $1,%dl
+ jl 100000f
+ movzbl (%rdi),%eax
+ movzbl (%rsi),%r8d
+ subl %r8d,%eax
+100000:
+ RETGUARD_CHECK(memcmp, r11)
+ ret
+ lfence
+
+ ALIGN_TEXT
+100816:
+ movq (%rdi),%r8
+ movq (%rsi),%r9
+ cmpq %r8,%r9
+ jne 80f
+ movq -8(%rdi,%rdx),%r8
+ movq -8(%rsi,%rdx),%r9
+ cmpq %r8,%r9
+ jne 10081608f
+ RETGUARD_CHECK(memcmp, r11)
+ ret
+ lfence
+ ALIGN_TEXT
+100408:
+ movl (%rdi),%r8d
+ movl (%rsi),%r9d
+ cmpl %r8d,%r9d
+ jne 80f
+ movl -4(%rdi,%rdx),%r8d
+ movl -4(%rsi,%rdx),%r9d
+ cmpl %r8d,%r9d
+ jne 10040804f
+ RETGUARD_CHECK(memcmp, r11)
+ ret
+ lfence
+ ALIGN_TEXT
+100204:
+ movzwl (%rdi),%r8d
+ movzwl (%rsi),%r9d
+ cmpl %r8d,%r9d
+ jne 1f
+ movzwl -2(%rdi,%rdx),%r8d
+ movzwl -2(%rsi,%rdx),%r9d
+ cmpl %r8d,%r9d
+ jne 1f
+ RETGUARD_CHECK(memcmp, r11)
+ ret
+ lfence
+ ALIGN_TEXT
+101632:
+ cmpq $32,%rdx
+ ja 103200f
+ movq (%rdi),%r8
+ movq (%rsi),%r9
+ cmpq %r8,%r9
+ jne 80f
+ movq 8(%rdi),%r8
+ movq 8(%rsi),%r9
+ cmpq %r8,%r9
+ jne 10163208f
+ movq -16(%rdi,%rdx),%r8
+ movq -16(%rsi,%rdx),%r9
+ cmpq %r8,%r9
+ jne 10163216f
+ movq -8(%rdi,%rdx),%r8
+ movq -8(%rsi,%rdx),%r9
+ cmpq %r8,%r9
+ jne 10163224f
+ RETGUARD_CHECK(memcmp, r11)
+ ret
+ lfence
+ ALIGN_TEXT
+103200:
+ movq (%rdi),%r8
+ movq 8(%rdi),%r9
+ subq (%rsi),%r8
+ subq 8(%rsi),%r9
+ orq %r8,%r9
+ jnz 10320000f
+
+ movq 16(%rdi),%r8
+ movq 24(%rdi),%r9
+ subq 16(%rsi),%r8
+ subq 24(%rsi),%r9
+ orq %r8,%r9
+ jnz 10320016f
+
+ leaq 32(%rdi),%rdi
+ leaq 32(%rsi),%rsi
+ subq $32,%rdx
+ cmpq $32,%rdx
+ jae 103200b
+ cmpb $0,%dl
+ jne 10b
+ RETGUARD_CHECK(memcmp, r11)
+ ret
+ lfence
+
+/*
+ * Mismatch was found.
+ *
+ * We need to compute the difference between strings.
+ * Start with narrowing the range down (16 -> 8 -> 4 bytes).
+ */
+ ALIGN_TEXT
+10320016:
+ leaq 16(%rdi),%rdi
+ leaq 16(%rsi),%rsi
+10320000:
+ movq (%rdi),%r8
+ movq (%rsi),%r9
+ cmpq %r8,%r9
+ jne 80f
+ leaq 8(%rdi),%rdi
+ leaq 8(%rsi),%rsi
+ jmp 80f
+ ALIGN_TEXT
+10081608:
+10163224:
+ leaq -8(%rdi,%rdx),%rdi
+ leaq -8(%rsi,%rdx),%rsi
+ jmp 80f
+ ALIGN_TEXT
+10163216:
+ leaq -16(%rdi,%rdx),%rdi
+ leaq -16(%rsi,%rdx),%rsi
+ jmp 80f
+ ALIGN_TEXT
+10163208:
+ leaq 8(%rdi),%rdi
+ leaq 8(%rsi),%rsi
+ jmp 80f
+ ALIGN_TEXT
+10040804:
+ leaq -4(%rdi,%rdx),%rdi
+ leaq -4(%rsi,%rdx),%rsi
+ jmp 1f
+
+ ALIGN_TEXT
+80:
+ movl (%rdi),%r8d
+ movl (%rsi),%r9d
+ cmpl %r8d,%r9d
+ jne 1f
+ leaq 4(%rdi),%rdi
+ leaq 4(%rsi),%rsi
+
+/*
+ * We have up to 4 bytes to inspect.
+ */
+1:
+ movzbl (%rdi),%eax
+ movzbl (%rsi),%r8d
+ cmpb %r8b,%al
+ jne 2f
+
+ movzbl 1(%rdi),%eax
+ movzbl 1(%rsi),%r8d
+ cmpb %r8b,%al
+ jne 2f
+
+ movzbl 2(%rdi),%eax
+ movzbl 2(%rsi),%r8d
+ cmpb %r8b,%al
+ jne 2f
+
+ movzbl 3(%rdi),%eax
+ movzbl 3(%rsi),%r8d
+2:
+ subl %r8d,%eax
+ RETGUARD_CHECK(memcmp, r11)
ret
lfence
END(memcmp)
--
2.44.0
----- End forwarded message -----
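For anyone tracing the mismatch path in the new code (the 10320016/80/1
labels): in C terms it amounts to the following narrowing, sketched here
with made-up names; the callers guarantee a difference exists somewhere in
the range handed down, so the last subtraction is taken unconditionally,
just as the assembly does at label 2:

#include <stdint.h>
#include <string.h>

/* At most 4 bytes left: find the first differing pair. */
static int
mismatch_4(const unsigned char *p, const unsigned char *q)
{
	int i;

	for (i = 0; i < 3; i++)
		if (p[i] != q[i])
			return (int)p[i] - (int)q[i];
	return (int)p[3] - (int)q[3];
}

/* A differing 8-byte chunk: pick the differing 4-byte half. */
static int
mismatch_8(const unsigned char *p, const unsigned char *q)
{
	uint32_t a, b;

	memcpy(&a, p, 4);
	memcpy(&b, q, 4);
	return (a != b) ? mismatch_4(p, q) : mismatch_4(p + 4, q + 4);
}

/* A differing 16-byte chunk: pick the differing 8-byte half. */
static int
mismatch_16(const unsigned char *p, const unsigned char *q)
{
	uint64_t a, b;

	memcpy(&a, p, 8);
	memcpy(&b, q, 8);
	return (a != b) ? mismatch_8(p, q) : mismatch_8(p + 8, q + 8);
}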