From: Mateusz Guzik Subject: Re: [PATCH] amd64: import optimized memcmp from FreeBSD To: Mike Larkin Cc: tech@openbsd.org Date: Sat, 30 Nov 2024 03:49:33 +0100 On Sat, Nov 30, 2024 at 1:04 AM Mike Larkin wrote: > > On Fri, Nov 29, 2024 at 02:01:45AM +0100, Mateusz Guzik wrote: > > The rep-prefixed cmps is incredibly slow even on modern CPUs. > > > > The new implementation uses regular cmp to do it. > > > > The code got augmented to account for retguard, otherwise it matches FreeBSD. > > > > On Sapphire Rapids open1 (open + close in a loop) from will-it-scale (ops/s): > > before: 436177 > > after: 464670 (+6.5%) > > > > Profiling the stock kernel with btrace -e 'profile:hz:99 { @[kstack] = count(); }' > > shows the routine at top of the profile, it drops to the bottom with the patch. > > > > The new top is memmove which also got optimized in FreeBSD. > > > > The other routines can receive similar treatment (including for libc), but I > > have no interest in doing it. I guarantee there is quite a bit of extra > > single-threaded perf up for grabs with this alone -- the old string ops are total crap. > > > > cheers > > --- > > sys/lib/libkern/arch/amd64/memcmp.S | 244 ++++++++++++++++++++++++---- > > 1 file changed, 214 insertions(+), 30 deletions(-) > > > > can you please explain what the asm labels mean? > > eg what is the difference between 10080816, 10001632, 10000408, etc? > They indicate what sizes are handled by given chunk (e.g. 0408 handles between 4 and 8 bytes). I would use a more descriptive name, but I kept it like that for consistency with the other routines in FreeBSD memmove got implemented as a macro for easy reuse for copyin/copyout and that needed numeric labels instead of ascii names, thus I picked something which encodes sizes. > > diff --git a/sys/lib/libkern/arch/amd64/memcmp.S b/sys/lib/libkern/arch/amd64/memcmp.S > > index b9944361869..f76552af1b4 100644 > > --- a/sys/lib/libkern/arch/amd64/memcmp.S > > +++ b/sys/lib/libkern/arch/amd64/memcmp.S > > @@ -1,39 +1,223 @@ > > -/* > > - * Written by J.T. Conklin . > > - * Public domain. > > - * Adapted for NetBSD/x86_64 by Frank van der Linden > > +/*- > > + * Copyright (c) 2018 The FreeBSD Foundation > > + * > > + * This software was developed by Mateusz Guzik > > + * under sponsorship from the FreeBSD Foundation. > > + * > > + * Redistribution and use in source and binary forms, with or without > > + * modification, are permitted provided that the following conditions > > + * are met: > > + * 1. Redistributions of source code must retain the above copyright > > + * notice, this list of conditions and the following disclaimer. > > + * 2. Redistributions in binary form must reproduce the above copyright > > + * notice, this list of conditions and the following disclaimer in the > > + * documentation and/or other materials provided with the distribution. > > + * > > + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND > > + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE > > + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE > > + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE > > + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL > > + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS > > + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) > > + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT > > + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY > > + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF > > + * SUCH DAMAGE. > > */ > > > > #include > > > > +#define ALIGN_TEXT .p2align 4,0x90 /* 16-byte alignment, nop filled */ > > + > > ENTRY(memcmp) > > RETGUARD_SETUP(memcmp, r11) > > - movq %rdx,%rcx /* compare by longs */ > > - shrq $3,%rcx > > - repe > > - cmpsq > > - jne 5f /* do we match so far? */ > > - > > - movq %rdx,%rcx /* compare remainder by bytes */ > > - andq $7,%rcx > > - repe > > - cmpsb > > - jne 6f /* do we match? */ > > - > > - xorl %eax,%eax /* we match, return zero */ > > - jmp 7f > > - > > -5: movl $8,%ecx /* We know that one of the next */ > > - subq %rcx,%rdi /* eight pairs of bytes do not */ > > - subq %rcx,%rsi /* match. */ > > - repe > > - cmpsb > > -6: xorl %eax,%eax /* Perform unsigned comparison */ > > - movb -1(%rdi),%al > > - xorl %edx,%edx > > - movb -1(%rsi),%dl > > - subl %edx,%eax > > -7: RETGUARD_CHECK(memcmp, r11) > > + xorl %eax,%eax > > +10: > > + cmpq $16,%rdx > > + ja 101632f > > + > > + cmpb $8,%dl > > + jg 100816f > > + > > + cmpb $4,%dl > > + jg 100408f > > + > > + cmpb $2,%dl > > + jge 100204f > > + > > + cmpb $1,%dl > > + jl 100000f > > + movzbl (%rdi),%eax > > + movzbl (%rsi),%r8d > > + subl %r8d,%eax > > +100000: > > + RETGUARD_CHECK(memcmp, r11) > > + ret > > + lfence > > + > > + ALIGN_TEXT > > +100816: > > + movq (%rdi),%r8 > > + movq (%rsi),%r9 > > + cmpq %r8,%r9 > > + jne 80f > > + movq -8(%rdi,%rdx),%r8 > > + movq -8(%rsi,%rdx),%r9 > > + cmpq %r8,%r9 > > + jne 10081608f > > + RETGUARD_CHECK(memcmp, r11) > > + ret > > + lfence > > + ALIGN_TEXT > > +100408: > > + movl (%rdi),%r8d > > + movl (%rsi),%r9d > > + cmpl %r8d,%r9d > > + jne 80f > > + movl -4(%rdi,%rdx),%r8d > > + movl -4(%rsi,%rdx),%r9d > > + cmpl %r8d,%r9d > > + jne 10040804f > > + RETGUARD_CHECK(memcmp, r11) > > + ret > > + lfence > > + ALIGN_TEXT > > +100204: > > + movzwl (%rdi),%r8d > > + movzwl (%rsi),%r9d > > + cmpl %r8d,%r9d > > + jne 1f > > + movzwl -2(%rdi,%rdx),%r8d > > + movzwl -2(%rsi,%rdx),%r9d > > + cmpl %r8d,%r9d > > + jne 1f > > + RETGUARD_CHECK(memcmp, r11) > > + ret > > + lfence > > + ALIGN_TEXT > > +101632: > > + cmpq $32,%rdx > > + ja 103200f > > + movq (%rdi),%r8 > > + movq (%rsi),%r9 > > + cmpq %r8,%r9 > > + jne 80f > > + movq 8(%rdi),%r8 > > + movq 8(%rsi),%r9 > > + cmpq %r8,%r9 > > + jne 10163208f > > + movq -16(%rdi,%rdx),%r8 > > + movq -16(%rsi,%rdx),%r9 > > + cmpq %r8,%r9 > > + jne 10163216f > > + movq -8(%rdi,%rdx),%r8 > > + movq -8(%rsi,%rdx),%r9 > > + cmpq %r8,%r9 > > + jne 10163224f > > + RETGUARD_CHECK(memcmp, r11) > > + ret > > + lfence > > + ALIGN_TEXT > > +103200: > > + movq (%rdi),%r8 > > + movq 8(%rdi),%r9 > > + subq (%rsi),%r8 > > + subq 8(%rsi),%r9 > > + orq %r8,%r9 > > + jnz 10320000f > > + > > + movq 16(%rdi),%r8 > > + movq 24(%rdi),%r9 > > + subq 16(%rsi),%r8 > > + subq 24(%rsi),%r9 > > + orq %r8,%r9 > > + jnz 10320016f > > + > > + leaq 32(%rdi),%rdi > > + leaq 32(%rsi),%rsi > > + subq $32,%rdx > > + cmpq $32,%rdx > > + jae 103200b > > + cmpb $0,%dl > > + jne 10b > > + RETGUARD_CHECK(memcmp, r11) > > + ret > > + lfence > > + > > +/* > > + * Mismatch was found. > > +* > > + * We need to compute the difference between strings. > > + * Start with narrowing the range down (16 -> 8 -> 4 bytes). > > + */ > > + ALIGN_TEXT > > +10320016: > > + leaq 16(%rdi),%rdi > > + leaq 16(%rsi),%rsi > > +10320000: > > + movq (%rdi),%r8 > > + movq (%rsi),%r9 > > + cmpq %r8,%r9 > > + jne 80f > > + leaq 8(%rdi),%rdi > > + leaq 8(%rsi),%rsi > > + jmp 80f > > + ALIGN_TEXT > > +10081608: > > +10163224: > > + leaq -8(%rdi,%rdx),%rdi > > + leaq -8(%rsi,%rdx),%rsi > > + jmp 80f > > + ALIGN_TEXT > > +10163216: > > + leaq -16(%rdi,%rdx),%rdi > > + leaq -16(%rsi,%rdx),%rsi > > + jmp 80f > > + ALIGN_TEXT > > +10163208: > > + leaq 8(%rdi),%rdi > > + leaq 8(%rsi),%rsi > > + jmp 80f > > + ALIGN_TEXT > > +10040804: > > + leaq -4(%rdi,%rdx),%rdi > > + leaq -4(%rsi,%rdx),%rsi > > + jmp 1f > > + > > + ALIGN_TEXT > > +80: > > + movl (%rdi),%r8d > > + movl (%rsi),%r9d > > + cmpl %r8d,%r9d > > + jne 1f > > + leaq 4(%rdi),%rdi > > + leaq 4(%rsi),%rsi > > + > > +/* > > + * We have up to 4 bytes to inspect. > > + */ > > +1: > > + movzbl (%rdi),%eax > > + movzbl (%rsi),%r8d > > + cmpb %r8b,%al > > + jne 2f > > + > > + movzbl 1(%rdi),%eax > > + movzbl 1(%rsi),%r8d > > + cmpb %r8b,%al > > + jne 2f > > + > > + movzbl 2(%rdi),%eax > > + movzbl 2(%rsi),%r8d > > + cmpb %r8b,%al > > + jne 2f > > + > > + movzbl 3(%rdi),%eax > > + movzbl 3(%rsi),%r8d > > +2: > > + subl %r8d,%eax > > + RETGUARD_CHECK(memcmp, r11) > > ret > > lfence > > END(memcmp) > > -- > > 2.44.0 > > -- Mateusz Guzik