From: Mike Larkin
Subject: Re: [PATCH] amd64: import optimized memcmp from FreeBSD
To: Mateusz Guzik
Cc: tech@openbsd.org
Date: Fri, 29 Nov 2024 16:04:52 -0800

On Fri, Nov 29, 2024 at 02:01:45AM +0100, Mateusz Guzik wrote:
> The rep-prefixed cmps is incredibly slow even on modern CPUs.
>
> The new implementation uses regular cmp to do it.
>
> The code got augmented to account for retguard; otherwise it matches
> FreeBSD.
>
> On Sapphire Rapids, open1 (open + close in a loop) from will-it-scale
> (ops/s):
> before: 436177
> after:  464670 (+6.5%)
>
> Profiling the stock kernel with
> 	btrace -e 'profile:hz:99 { @[kstack] = count(); }'
> shows the routine at the top of the profile; with the patch it drops
> to the bottom.
>
> The new top is memmove, which also got optimized in FreeBSD.
>
> The other routines can receive similar treatment (including for
> libc), but I have no interest in doing it. I guarantee there is quite
> a bit of extra single-threaded perf up for grabs with this alone --
> the old string ops are total crap.
>
> cheers
> ---
>  sys/lib/libkern/arch/amd64/memcmp.S | 244 ++++++++++++++++++++++++----
>  1 file changed, 214 insertions(+), 30 deletions(-)
>

can you please explain what the asm labels mean? e.g. what is the
difference between 100816, 101632, 100408, etc?
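My guess is that they encode the length range a given block handles:
"10" plus the two bounds, so 100816 would be lengths 8..16 (first and
last 8 bytes, overlapping), 101632 lengths 16..32, 103200 the 32+ loop,
with the longer ones like 10163216 tacking on the offset of the load
that caught the mismatch. If that is the scheme, the dispatch reads
roughly like this (hypothetical C, just to check my understanding):

	#include <stddef.h>

	/* which label would handle a given length, per my reading of the jumps */
	static const char *
	dispatch_label(size_t len)
	{
		if (len > 32)
			return ("103200");	/* 32 bytes/iter, tail re-enters at 10: */
		if (len > 16)
			return ("101632");	/* 8-byte loads at 0, 8, len-16, len-8 */
		if (len > 8)
			return ("100816");	/* 8-byte loads at 0 and len-8 */
		if (len > 4)
			return ("100408");	/* 4-byte loads at 0 and len-4 */
		if (len >= 2)
			return ("100204");	/* 2-byte loads at 0 and len-2 */
		if (len == 1)
			return ("(inline single byte compare)");
		return ("100000");		/* len == 0: return 0 */
	}

If that is right, a comment in the file spelling it out would save the
next reader the same question.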
> diff --git a/sys/lib/libkern/arch/amd64/memcmp.S b/sys/lib/libkern/arch/amd64/memcmp.S
> index b9944361869..f76552af1b4 100644
> --- a/sys/lib/libkern/arch/amd64/memcmp.S
> +++ b/sys/lib/libkern/arch/amd64/memcmp.S
> @@ -1,39 +1,223 @@
> -/*
> - * Written by J.T. Conklin.
> - * Public domain.
> - * Adapted for NetBSD/x86_64 by Frank van der Linden.
> +/*-
> + * Copyright (c) 2018 The FreeBSD Foundation
> + *
> + * This software was developed by Mateusz Guzik
> + * under sponsorship from the FreeBSD Foundation.
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + * 1. Redistributions of source code must retain the above copyright
> + *    notice, this list of conditions and the following disclaimer.
> + * 2. Redistributions in binary form must reproduce the above copyright
> + *    notice, this list of conditions and the following disclaimer in the
> + *    documentation and/or other materials provided with the distribution.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
> + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
> + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
> + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
> + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
> + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
> + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
> + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
> + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
> + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
> + * SUCH DAMAGE.
>   */
>  
>  #include <machine/asm.h>
>  
> +#define ALIGN_TEXT	.p2align 4,0x90	/* 16-byte alignment, nop filled */
> +
>  ENTRY(memcmp)
>  	RETGUARD_SETUP(memcmp, r11)
> -	movq	%rdx,%rcx		/* compare by longs */
> -	shrq	$3,%rcx
> -	repe
> -	cmpsq
> -	jne	5f			/* do we match so far? */
> -
> -	movq	%rdx,%rcx		/* compare remainder by bytes */
> -	andq	$7,%rcx
> -	repe
> -	cmpsb
> -	jne	6f			/* do we match? */
> -
> -	xorl	%eax,%eax		/* we match, return zero */
> -	jmp	7f
> -
> -5:	movl	$8,%ecx			/* We know that one of the next */
> -	subq	%rcx,%rdi		/* eight pairs of bytes do not */
> -	subq	%rcx,%rsi		/* match. */
> -	repe
> -	cmpsb
> -6:	xorl	%eax,%eax		/* Perform unsigned comparison */
> -	movb	-1(%rdi),%al
> -	xorl	%edx,%edx
> -	movb	-1(%rsi),%dl
> -	subl	%edx,%eax
> -7:	RETGUARD_CHECK(memcmp, r11)
> +	xorl	%eax,%eax
> +10:
> +	cmpq	$16,%rdx
> +	ja	101632f
> +
> +	cmpb	$8,%dl
> +	jg	100816f
> +
> +	cmpb	$4,%dl
> +	jg	100408f
> +
> +	cmpb	$2,%dl
> +	jge	100204f
> +
> +	cmpb	$1,%dl
> +	jl	100000f
> +	movzbl	(%rdi),%eax
> +	movzbl	(%rsi),%r8d
> +	subl	%r8d,%eax
> +100000:
> +	RETGUARD_CHECK(memcmp, r11)
> +	ret
> +	lfence
> +
> +	ALIGN_TEXT
> +100816:
> +	movq	(%rdi),%r8
> +	movq	(%rsi),%r9
> +	cmpq	%r8,%r9
> +	jne	80f
> +	movq	-8(%rdi,%rdx),%r8
> +	movq	-8(%rsi,%rdx),%r9
> +	cmpq	%r8,%r9
> +	jne	10081608f
> +	RETGUARD_CHECK(memcmp, r11)
> +	ret
> +	lfence
> +	ALIGN_TEXT
> +100408:
> +	movl	(%rdi),%r8d
> +	movl	(%rsi),%r9d
> +	cmpl	%r8d,%r9d
> +	jne	80f
> +	movl	-4(%rdi,%rdx),%r8d
> +	movl	-4(%rsi,%rdx),%r9d
> +	cmpl	%r8d,%r9d
> +	jne	10040804f
> +	RETGUARD_CHECK(memcmp, r11)
> +	ret
> +	lfence
> +	ALIGN_TEXT
> +100204:
> +	movzwl	(%rdi),%r8d
> +	movzwl	(%rsi),%r9d
> +	cmpl	%r8d,%r9d
> +	jne	1f
> +	movzwl	-2(%rdi,%rdx),%r8d
> +	movzwl	-2(%rsi,%rdx),%r9d
> +	cmpl	%r8d,%r9d
> +	jne	1f
> +	RETGUARD_CHECK(memcmp, r11)
> +	ret
> +	lfence
> +	ALIGN_TEXT
> +101632:
> +	cmpq	$32,%rdx
> +	ja	103200f
> +	movq	(%rdi),%r8
> +	movq	(%rsi),%r9
> +	cmpq	%r8,%r9
> +	jne	80f
> +	movq	8(%rdi),%r8
> +	movq	8(%rsi),%r9
> +	cmpq	%r8,%r9
> +	jne	10163208f
> +	movq	-16(%rdi,%rdx),%r8
> +	movq	-16(%rsi,%rdx),%r9
> +	cmpq	%r8,%r9
> +	jne	10163216f
> +	movq	-8(%rdi,%rdx),%r8
> +	movq	-8(%rsi,%rdx),%r9
> +	cmpq	%r8,%r9
> +	jne	10163224f
> +	RETGUARD_CHECK(memcmp, r11)
> +	ret
> +	lfence
> +	ALIGN_TEXT
> +103200:
> +	movq	(%rdi),%r8
> +	movq	8(%rdi),%r9
> +	subq	(%rsi),%r8
> +	subq	8(%rsi),%r9
> +	orq	%r8,%r9
> +	jnz	10320000f
> +
> +	movq	16(%rdi),%r8
> +	movq	24(%rdi),%r9
> +	subq	16(%rsi),%r8
> +	subq	24(%rsi),%r9
> +	orq	%r8,%r9
> +	jnz	10320016f
> +
> +	leaq	32(%rdi),%rdi
> +	leaq	32(%rsi),%rsi
> +	subq	$32,%rdx
> +	cmpq	$32,%rdx
> +	jae	103200b
> +	cmpb	$0,%dl
> +	jne	10b
> +	RETGUARD_CHECK(memcmp, r11)
> +	ret
> +	lfence
> +
> +/*
> + * Mismatch was found.
> + *
> + * We need to compute the difference between strings.
> + * Start with narrowing the range down (16 -> 8 -> 4 bytes).
> + */
> +	ALIGN_TEXT
> +10320016:
> +	leaq	16(%rdi),%rdi
> +	leaq	16(%rsi),%rsi
> +10320000:
> +	movq	(%rdi),%r8
> +	movq	(%rsi),%r9
> +	cmpq	%r8,%r9
> +	jne	80f
> +	leaq	8(%rdi),%rdi
> +	leaq	8(%rsi),%rsi
> +	jmp	80f
> +	ALIGN_TEXT
> +10081608:
> +10163224:
> +	leaq	-8(%rdi,%rdx),%rdi
> +	leaq	-8(%rsi,%rdx),%rsi
> +	jmp	80f
> +	ALIGN_TEXT
> +10163216:
> +	leaq	-16(%rdi,%rdx),%rdi
> +	leaq	-16(%rsi,%rdx),%rsi
> +	jmp	80f
> +	ALIGN_TEXT
> +10163208:
> +	leaq	8(%rdi),%rdi
> +	leaq	8(%rsi),%rsi
> +	jmp	80f
> +	ALIGN_TEXT
> +10040804:
> +	leaq	-4(%rdi,%rdx),%rdi
> +	leaq	-4(%rsi,%rdx),%rsi
> +	jmp	1f
> +
> +	ALIGN_TEXT
> +80:
> +	movl	(%rdi),%r8d
> +	movl	(%rsi),%r9d
> +	cmpl	%r8d,%r9d
> +	jne	1f
> +	leaq	4(%rdi),%rdi
> +	leaq	4(%rsi),%rsi
> +
> +/*
> + * We have up to 4 bytes to inspect.
> + */
> +1:
> +	movzbl	(%rdi),%eax
> +	movzbl	(%rsi),%r8d
> +	cmpb	%r8b,%al
> +	jne	2f
> +
> +	movzbl	1(%rdi),%eax
> +	movzbl	1(%rsi),%r8d
> +	cmpb	%r8b,%al
> +	jne	2f
> +
> +	movzbl	2(%rdi),%eax
> +	movzbl	2(%rsi),%r8d
> +	cmpb	%r8b,%al
> +	jne	2f
> +
> +	movzbl	3(%rdi),%eax
> +	movzbl	3(%rsi),%r8d
> +2:
> +	subl	%r8d,%eax
>  	RETGUARD_CHECK(memcmp, r11)
>  	ret
>  	lfence
>  END(memcmp)
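fwiw, the mismatch path took me a minute: once one of the wide compares
fails, the code re-walks that window in smaller chunks until it can
return a plain byte difference. If I read labels 80/1/2 right, the tail
end is equivalent to this (hypothetical C, with memcmp(3) standing in
for the 4-byte cmpl):

	#include <string.h>

	/*
	 * Narrow a mismatch known to be within the 8 bytes at a/b
	 * down to the first differing byte.
	 */
	static int
	narrow_mismatch(const unsigned char *a, const unsigned char *b)
	{
		int i;

		/* 80: if the first 4-byte word matches, step past it */
		if (memcmp(a, b, 4) == 0) {
			a += 4;
			b += 4;
		}

		/* 1: find the first differing byte among the last 4 */
		for (i = 0; i < 3; i++)
			if (a[i] != b[i])
				break;

		/* 2: subl %r8d,%eax -- difference of the bytes as
		 * unsigned values, which gives memcmp's sign convention */
		return ((int)a[i] - (int)b[i]);
	}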
> --
> 2.44.0
>
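Also, the overlapping-load trick is easy to get wrong right at the size
boundaries, so it would be good to hammer the routine against libc in
userland before this goes in. Something along these lines (hypothetical
harness; assumes the new code is linked into the test as test_memcmp):

	#include <err.h>
	#include <stdlib.h>
	#include <string.h>

	int	test_memcmp(const void *, const void *, size_t);

	static int
	sign(int v)
	{
		return ((v > 0) - (v < 0));
	}

	int
	main(void)
	{
		unsigned char a[64], b[64];
		size_t len;
		int i;

		for (i = 0; i < 1000000; i++) {
			/* exercise every length 0..64, boundaries included */
			len = arc4random_uniform(sizeof(a) + 1);
			arc4random_buf(a, sizeof(a));
			memcpy(b, a, sizeof(b));
			/* usually flip one byte inside the compared window */
			if (len > 0 && arc4random_uniform(4) != 0)
				b[arc4random_uniform(len)] ^= 1 + arc4random_uniform(255);
			/* only the sign of the result is contractual */
			if (sign(test_memcmp(a, b, len)) != sign(memcmp(a, b, len)))
				errx(1, "mismatch at len %zu", len);
		}
		return (0);
	}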