[PATCH] amd64: import optimized memcmp from FreeBSD
On Sat, Nov 30, 2024 at 1:04 AM Mike Larkin <mlarkin@nested.page> wrote:
>
> On Fri, Nov 29, 2024 at 02:01:45AM +0100, Mateusz Guzik wrote:
> > The rep-prefixed cmps instruction is incredibly slow even on modern CPUs.
> >
> > The new implementation uses regular cmp instructions instead.
> >
> > The code got augmented to account for retguard; otherwise it matches FreeBSD.
> >
> > On Sapphire Rapids, open1 (open + close in a loop) from will-it-scale (ops/s):
> > before: 436177
> > after: 464670 (+6.5%)
> >
> > Profiling the stock kernel with btrace -e 'profile:hz:99 { @[kstack] = count(); }'
> > shows the routine at the top of the profile; with the patch it drops to the bottom.
> >
> > The new top is memmove, which also got optimized in FreeBSD.
> >
> > The other routines can receive similar treatment (including for libc), but I
> > have no interest in doing it. I guarantee there is quite a bit of extra
> > single-threaded perf up for grabs with this alone -- the old string ops are total crap.
> >
> > cheers
> > ---
> > sys/lib/libkern/arch/amd64/memcmp.S | 244 ++++++++++++++++++++++++----
> > 1 file changed, 214 insertions(+), 30 deletions(-)
> >
>
> can you please explain what the asm labels mean?
>
> e.g. what is the difference between 100816, 101632, 100408, etc?
>
They indicate which sizes are handled by a given chunk (e.g. 0408 handles
between 4 and 8 bytes).
I would use more descriptive names, but I kept them like this for
consistency with the other routines in FreeBSD.
memmove got implemented as a macro for easy reuse in copyin/copyout,
and that required numeric labels instead of ASCII names, so I picked
numbers which encode the sizes.
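
To make the naming concrete, here is a rough C rendition of what a chunk
like 100408 does (illustration only, not part of the patch; the function
name is made up). Sizes 4..8 are covered by two 4-byte loads, one at the
start and one ending at the last byte, which overlap when len < 8:

    #include <stdint.h>
    #include <string.h>

    /* Sketch of the 100408 chunk: compare buffers of length 4..8. */
    static int
    cmp_0408(const unsigned char *p, const unsigned char *q, size_t len)
    {
            uint32_t a, b;

            memcpy(&a, p, 4);               /* first four bytes */
            memcpy(&b, q, 4);
            if (a != b)
                    goto mismatch;
            memcpy(&a, p + len - 4, 4);     /* last four, may overlap */
            memcpy(&b, q + len - 4, 4);
            if (a == b)
                    return (0);
    mismatch:
            /* Narrow down to the first differing byte, as the asm does. */
            for (size_t i = 0; i < len; i++)
                    if (p[i] != q[i])
                            return ((int)p[i] - (int)q[i]);
            return (0);
    }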
> > diff --git a/sys/lib/libkern/arch/amd64/memcmp.S b/sys/lib/libkern/arch/amd64/memcmp.S
> > index b9944361869..f76552af1b4 100644
> > --- a/sys/lib/libkern/arch/amd64/memcmp.S
> > +++ b/sys/lib/libkern/arch/amd64/memcmp.S
> > @@ -1,39 +1,223 @@
> > -/*
> > - * Written by J.T. Conklin <jtc@netbsd.org>.
> > - * Public domain.
> > - * Adapted for NetBSD/x86_64 by Frank van der Linden <fvdl@wasabisystems.com>
> > +/*-
> > + * Copyright (c) 2018 The FreeBSD Foundation
> > + *
> > + * This software was developed by Mateusz Guzik <mjg@FreeBSD.org>
> > + * under sponsorship from the FreeBSD Foundation.
> > + *
> > + * Redistribution and use in source and binary forms, with or without
> > + * modification, are permitted provided that the following conditions
> > + * are met:
> > + * 1. Redistributions of source code must retain the above copyright
> > + * notice, this list of conditions and the following disclaimer.
> > + * 2. Redistributions in binary form must reproduce the above copyright
> > + * notice, this list of conditions and the following disclaimer in the
> > + * documentation and/or other materials provided with the distribution.
> > + *
> > + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
> > + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
> > + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
> > + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
> > + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
> > + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
> > + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
> > + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
> > + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
> > + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
> > + * SUCH DAMAGE.
> > */
> >
> > #include <machine/asm.h>
> >
> > +#define ALIGN_TEXT .p2align 4,0x90 /* 16-byte alignment, nop filled */
> > +
> > ENTRY(memcmp)
> > RETGUARD_SETUP(memcmp, r11)
> > - movq %rdx,%rcx /* compare by longs */
> > - shrq $3,%rcx
> > - repe
> > - cmpsq
> > - jne 5f /* do we match so far? */
> > -
> > - movq %rdx,%rcx /* compare remainder by bytes */
> > - andq $7,%rcx
> > - repe
> > - cmpsb
> > - jne 6f /* do we match? */
> > -
> > - xorl %eax,%eax /* we match, return zero */
> > - jmp 7f
> > -
> > -5: movl $8,%ecx /* We know that one of the next */
> > - subq %rcx,%rdi /* eight pairs of bytes do not */
> > - subq %rcx,%rsi /* match. */
> > - repe
> > - cmpsb
> > -6: xorl %eax,%eax /* Perform unsigned comparison */
> > - movb -1(%rdi),%al
> > - xorl %edx,%edx
> > - movb -1(%rsi),%dl
> > - subl %edx,%eax
> > -7: RETGUARD_CHECK(memcmp, r11)
> > + xorl %eax,%eax
> > +10:
> > + cmpq $16,%rdx
> > + ja 101632f
> > +
> > + cmpb $8,%dl
> > + jg 100816f
> > +
> > + cmpb $4,%dl
> > + jg 100408f
> > +
> > + cmpb $2,%dl
> > + jge 100204f
> > +
> > + cmpb $1,%dl
> > + jl 100000f
> > + movzbl (%rdi),%eax
> > + movzbl (%rsi),%r8d
> > + subl %r8d,%eax
> > +100000:
> > + RETGUARD_CHECK(memcmp, r11)
> > + ret
> > + lfence
> > +
> > + ALIGN_TEXT
> > +100816:
> > + movq (%rdi),%r8
> > + movq (%rsi),%r9
> > + cmpq %r8,%r9
> > + jne 80f
> > + movq -8(%rdi,%rdx),%r8
> > + movq -8(%rsi,%rdx),%r9
> > + cmpq %r8,%r9
> > + jne 10081608f
> > + RETGUARD_CHECK(memcmp, r11)
> > + ret
> > + lfence
> > + ALIGN_TEXT
> > +100408:
> > + movl (%rdi),%r8d
> > + movl (%rsi),%r9d
> > + cmpl %r8d,%r9d
> > + jne 80f
> > + movl -4(%rdi,%rdx),%r8d
> > + movl -4(%rsi,%rdx),%r9d
> > + cmpl %r8d,%r9d
> > + jne 10040804f
> > + RETGUARD_CHECK(memcmp, r11)
> > + ret
> > + lfence
> > + ALIGN_TEXT
> > +100204:
> > + movzwl (%rdi),%r8d
> > + movzwl (%rsi),%r9d
> > + cmpl %r8d,%r9d
> > + jne 1f
> > + movzwl -2(%rdi,%rdx),%r8d
> > + movzwl -2(%rsi,%rdx),%r9d
> > + cmpl %r8d,%r9d
> > + jne 1f
> > + RETGUARD_CHECK(memcmp, r11)
> > + ret
> > + lfence
> > + ALIGN_TEXT
> > +101632:
> > + cmpq $32,%rdx
> > + ja 103200f
> > + movq (%rdi),%r8
> > + movq (%rsi),%r9
> > + cmpq %r8,%r9
> > + jne 80f
> > + movq 8(%rdi),%r8
> > + movq 8(%rsi),%r9
> > + cmpq %r8,%r9
> > + jne 10163208f
> > + movq -16(%rdi,%rdx),%r8
> > + movq -16(%rsi,%rdx),%r9
> > + cmpq %r8,%r9
> > + jne 10163216f
> > + movq -8(%rdi,%rdx),%r8
> > + movq -8(%rsi,%rdx),%r9
> > + cmpq %r8,%r9
> > + jne 10163224f
> > + RETGUARD_CHECK(memcmp, r11)
> > + ret
> > + lfence
> > + ALIGN_TEXT
> > +103200:
> > + movq (%rdi),%r8
> > + movq 8(%rdi),%r9
> > + subq (%rsi),%r8
> > + subq 8(%rsi),%r9
> > + orq %r8,%r9
> > + jnz 10320000f
> > +
> > + movq 16(%rdi),%r8
> > + movq 24(%rdi),%r9
> > + subq 16(%rsi),%r8
> > + subq 24(%rsi),%r9
> > + orq %r8,%r9
> > + jnz 10320016f
> > +
> > + leaq 32(%rdi),%rdi
> > + leaq 32(%rsi),%rsi
> > + subq $32,%rdx
> > + cmpq $32,%rdx
> > + jae 103200b
> > + cmpb $0,%dl
> > + jne 10b
> > + RETGUARD_CHECK(memcmp, r11)
> > + ret
> > + lfence
> > +
> > +/*
> > + * Mismatch was found.
> > + *
> > + * We need to compute the difference between strings.
> > + * Start with narrowing the range down (16 -> 8 -> 4 bytes).
> > + */
> > + ALIGN_TEXT
> > +10320016:
> > + leaq 16(%rdi),%rdi
> > + leaq 16(%rsi),%rsi
> > +10320000:
> > + movq (%rdi),%r8
> > + movq (%rsi),%r9
> > + cmpq %r8,%r9
> > + jne 80f
> > + leaq 8(%rdi),%rdi
> > + leaq 8(%rsi),%rsi
> > + jmp 80f
> > + ALIGN_TEXT
> > +10081608:
> > +10163224:
> > + leaq -8(%rdi,%rdx),%rdi
> > + leaq -8(%rsi,%rdx),%rsi
> > + jmp 80f
> > + ALIGN_TEXT
> > +10163216:
> > + leaq -16(%rdi,%rdx),%rdi
> > + leaq -16(%rsi,%rdx),%rsi
> > + jmp 80f
> > + ALIGN_TEXT
> > +10163208:
> > + leaq 8(%rdi),%rdi
> > + leaq 8(%rsi),%rsi
> > + jmp 80f
> > + ALIGN_TEXT
> > +10040804:
> > + leaq -4(%rdi,%rdx),%rdi
> > + leaq -4(%rsi,%rdx),%rsi
> > + jmp 1f
> > +
> > + ALIGN_TEXT
> > +80:
> > + movl (%rdi),%r8d
> > + movl (%rsi),%r9d
> > + cmpl %r8d,%r9d
> > + jne 1f
> > + leaq 4(%rdi),%rdi
> > + leaq 4(%rsi),%rsi
> > +
> > +/*
> > + * We have up to 4 bytes to inspect.
> > + */
> > +1:
> > + movzbl (%rdi),%eax
> > + movzbl (%rsi),%r8d
> > + cmpb %r8b,%al
> > + jne 2f
> > +
> > + movzbl 1(%rdi),%eax
> > + movzbl 1(%rsi),%r8d
> > + cmpb %r8b,%al
> > + jne 2f
> > +
> > + movzbl 2(%rdi),%eax
> > + movzbl 2(%rsi),%r8d
> > + cmpb %r8b,%al
> > + jne 2f
> > +
> > + movzbl 3(%rdi),%eax
> > + movzbl 3(%rsi),%r8d
> > +2:
> > + subl %r8d,%eax
> > + RETGUARD_CHECK(memcmp, r11)
> > ret
> > lfence
> > END(memcmp)
> > --
> > 2.44.0
> >
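
As an aside on the main loop: the 103200 block checks 16 bytes per
iteration without branching on each word, by subtracting pairs of 8-byte
words and OR-ing the results; the combined value is zero iff both pairs
match. A minimal C sketch of that test (illustration only, not from the
patch; the function name is made up):

    #include <stdint.h>
    #include <string.h>

    /* Non-zero iff the 16 bytes at p and q differ anywhere. */
    static uint64_t
    chunk16_differs(const unsigned char *p, const unsigned char *q)
    {
            uint64_t a, b, c, d;

            memcpy(&a, p, 8);
            memcpy(&b, q, 8);
            memcpy(&c, p + 8, 8);
            memcpy(&d, q + 8, 8);
            /* (a - b) | (c - d) == 0 iff a == b and c == d */
            return ((a - b) | (c - d));
    }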
--
Mateusz Guzik <mjguzik gmail.com>