[PATCH] amd64: import optimized memcmp from FreeBSD
On 2024/11/29 02:01, Mateusz Guzik wrote:
> The rep-prefixed cmps is incredibly slow even on modern CPUs.
>
> The new implementation uses regular cmp instructions instead.
>
> The code was augmented to account for retguard; otherwise it matches FreeBSD.
Would that make sense in libc too?
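
For readers who don't want to parse the assembly, the shape of the new
routine in C is roughly the following. This is an illustrative sketch only
(the function name is mine, not from the patch); the committed code
dispatches on size and handles the short cases with overlapping loads
instead of a byte loop.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static int
memcmp_sketch(const void *s1, const void *s2, size_t n)
{
	const unsigned char *p1 = s1, *p2 = s2;

	/* Bulk of the work: plain 8-byte compares instead of rep cmpsq. */
	while (n >= 8) {
		uint64_t a, b;

		memcpy(&a, p1, 8);
		memcpy(&b, p2, 8);
		if (a != b)
			break;		/* the mismatch is inside this word */
		p1 += 8;
		p2 += 8;
		n -= 8;
	}

	/* Tail, or the mismatching word: at most 8 bytes left to inspect. */
	for (size_t i = 0, lim = (n < 8 ? n : 8); i < lim; i++) {
		if (p1[i] != p2[i])
			return ((int)p1[i] - (int)p2[i]);
	}
	return (0);
}
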
> On Sapphire Rapids open1 (open + close in a loop) from will-it-scale (ops/s):
> before: 436177
> after: 464670 (+6.5%)
>
> Profiling the stock kernel with btrace -e 'profile:hz:99 { @[kstack] = count(); }'
> shows the routine at the top of the profile; with the patch it drops to the bottom.
>
> The new top is memmove, which was also optimized in FreeBSD.
>
> The other routines can receive similar treatment (including for libc), but I
> have no interest in doing it. I guarantee there is quite a bit of extra
> single-threaded perf up for grabs with this alone -- the old string ops are total crap.
>
> cheers
> ---
> sys/lib/libkern/arch/amd64/memcmp.S | 244 ++++++++++++++++++++++++----
> 1 file changed, 214 insertions(+), 30 deletions(-)
>
> diff --git a/sys/lib/libkern/arch/amd64/memcmp.S b/sys/lib/libkern/arch/amd64/memcmp.S
> index b9944361869..f76552af1b4 100644
> --- a/sys/lib/libkern/arch/amd64/memcmp.S
> +++ b/sys/lib/libkern/arch/amd64/memcmp.S
> @@ -1,39 +1,223 @@
> -/*
> - * Written by J.T. Conklin <jtc@netbsd.org>.
> - * Public domain.
> - * Adapted for NetBSD/x86_64 by Frank van der Linden <fvdl@wasabisystems.com>
> +/*-
> + * Copyright (c) 2018 The FreeBSD Foundation
> + *
> + * This software was developed by Mateusz Guzik <mjg@FreeBSD.org>
> + * under sponsorship from the FreeBSD Foundation.
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + * 1. Redistributions of source code must retain the above copyright
> + * notice, this list of conditions and the following disclaimer.
> + * 2. Redistributions in binary form must reproduce the above copyright
> + * notice, this list of conditions and the following disclaimer in the
> + * documentation and/or other materials provided with the distribution.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
> + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
> + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
> + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
> + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
> + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
> + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
> + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
> + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
> + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
> + * SUCH DAMAGE.
> */
>
> #include <machine/asm.h>
>
> +#define ALIGN_TEXT .p2align 4,0x90 /* 16-byte alignment, nop filled */
> +
> ENTRY(memcmp)
> RETGUARD_SETUP(memcmp, r11)
> - movq %rdx,%rcx /* compare by longs */
> - shrq $3,%rcx
> - repe
> - cmpsq
> - jne 5f /* do we match so far? */
> -
> - movq %rdx,%rcx /* compare remainder by bytes */
> - andq $7,%rcx
> - repe
> - cmpsb
> - jne 6f /* do we match? */
> -
> - xorl %eax,%eax /* we match, return zero */
> - jmp 7f
> -
> -5: movl $8,%ecx /* We know that one of the next */
> - subq %rcx,%rdi /* eight pairs of bytes do not */
> - subq %rcx,%rsi /* match. */
> - repe
> - cmpsb
> -6: xorl %eax,%eax /* Perform unsigned comparison */
> - movb -1(%rdi),%al
> - xorl %edx,%edx
> - movb -1(%rsi),%dl
> - subl %edx,%eax
> -7: RETGUARD_CHECK(memcmp, r11)
> + xorl %eax,%eax
> +10:
> + cmpq $16,%rdx
> + ja 101632f
> +
> + cmpb $8,%dl
> + jg 100816f
> +
> + cmpb $4,%dl
> + jg 100408f
> +
> + cmpb $2,%dl
> + jge 100204f
> +
> + cmpb $1,%dl
> + jl 100000f
> + movzbl (%rdi),%eax
> + movzbl (%rsi),%r8d
> + subl %r8d,%eax
> +100000:
> + RETGUARD_CHECK(memcmp, r11)
> + ret
> + lfence
> +
> + ALIGN_TEXT
> +100816:
> + movq (%rdi),%r8
> + movq (%rsi),%r9
> + cmpq %r8,%r9
> + jne 80f
> + movq -8(%rdi,%rdx),%r8
> + movq -8(%rsi,%rdx),%r9
> + cmpq %r8,%r9
> + jne 10081608f
> + RETGUARD_CHECK(memcmp, r11)
> + ret
> + lfence
> + ALIGN_TEXT
> +100408:
> + movl (%rdi),%r8d
> + movl (%rsi),%r9d
> + cmpl %r8d,%r9d
> + jne 80f
> + movl -4(%rdi,%rdx),%r8d
> + movl -4(%rsi,%rdx),%r9d
> + cmpl %r8d,%r9d
> + jne 10040804f
> + RETGUARD_CHECK(memcmp, r11)
> + ret
> + lfence
> + ALIGN_TEXT
> +100204:
> + movzwl (%rdi),%r8d
> + movzwl (%rsi),%r9d
> + cmpl %r8d,%r9d
> + jne 1f
> + movzwl -2(%rdi,%rdx),%r8d
> + movzwl -2(%rsi,%rdx),%r9d
> + cmpl %r8d,%r9d
> + jne 1f
> + RETGUARD_CHECK(memcmp, r11)
> + ret
> + lfence
> + ALIGN_TEXT
> +101632:
> + cmpq $32,%rdx
> + ja 103200f
> + movq (%rdi),%r8
> + movq (%rsi),%r9
> + cmpq %r8,%r9
> + jne 80f
> + movq 8(%rdi),%r8
> + movq 8(%rsi),%r9
> + cmpq %r8,%r9
> + jne 10163208f
> + movq -16(%rdi,%rdx),%r8
> + movq -16(%rsi,%rdx),%r9
> + cmpq %r8,%r9
> + jne 10163216f
> + movq -8(%rdi,%rdx),%r8
> + movq -8(%rsi,%rdx),%r9
> + cmpq %r8,%r9
> + jne 10163224f
> + RETGUARD_CHECK(memcmp, r11)
> + ret
> + lfence
> + ALIGN_TEXT
> +103200:
> + movq (%rdi),%r8
> + movq 8(%rdi),%r9
> + subq (%rsi),%r8
> + subq 8(%rsi),%r9
> + orq %r8,%r9
> + jnz 10320000f
> +
> + movq 16(%rdi),%r8
> + movq 24(%rdi),%r9
> + subq 16(%rsi),%r8
> + subq 24(%rsi),%r9
> + orq %r8,%r9
> + jnz 10320016f
> +
> + leaq 32(%rdi),%rdi
> + leaq 32(%rsi),%rsi
> + subq $32,%rdx
> + cmpq $32,%rdx
> + jae 103200b
> + cmpb $0,%dl
> + jne 10b
> + RETGUARD_CHECK(memcmp, r11)
> + ret
> + lfence
> +
> +/*
> + * Mismatch was found.
> + *
> + * We need to compute the difference between strings.
> + * Start with narrowing the range down (16 -> 8 -> 4 bytes).
> + */
> + ALIGN_TEXT
> +10320016:
> + leaq 16(%rdi),%rdi
> + leaq 16(%rsi),%rsi
> +10320000:
> + movq (%rdi),%r8
> + movq (%rsi),%r9
> + cmpq %r8,%r9
> + jne 80f
> + leaq 8(%rdi),%rdi
> + leaq 8(%rsi),%rsi
> + jmp 80f
> + ALIGN_TEXT
> +10081608:
> +10163224:
> + leaq -8(%rdi,%rdx),%rdi
> + leaq -8(%rsi,%rdx),%rsi
> + jmp 80f
> + ALIGN_TEXT
> +10163216:
> + leaq -16(%rdi,%rdx),%rdi
> + leaq -16(%rsi,%rdx),%rsi
> + jmp 80f
> + ALIGN_TEXT
> +10163208:
> + leaq 8(%rdi),%rdi
> + leaq 8(%rsi),%rsi
> + jmp 80f
> + ALIGN_TEXT
> +10040804:
> + leaq -4(%rdi,%rdx),%rdi
> + leaq -4(%rsi,%rdx),%rsi
> + jmp 1f
> +
> + ALIGN_TEXT
> +80:
> + movl (%rdi),%r8d
> + movl (%rsi),%r9d
> + cmpl %r8d,%r9d
> + jne 1f
> + leaq 4(%rdi),%rdi
> + leaq 4(%rsi),%rsi
> +
> +/*
> + * We have up to 4 bytes to inspect.
> + */
> +1:
> + movzbl (%rdi),%eax
> + movzbl (%rsi),%r8d
> + cmpb %r8b,%al
> + jne 2f
> +
> + movzbl 1(%rdi),%eax
> + movzbl 1(%rsi),%r8d
> + cmpb %r8b,%al
> + jne 2f
> +
> + movzbl 2(%rdi),%eax
> + movzbl 2(%rsi),%r8d
> + cmpb %r8b,%al
> + jne 2f
> +
> + movzbl 3(%rdi),%eax
> + movzbl 3(%rsi),%r8d
> +2:
> + subl %r8d,%eax
> + RETGUARD_CHECK(memcmp, r11)
> ret
> lfence
> END(memcmp)
> --
> 2.44.0
>
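
A detail from the diff worth calling out: for the mid-sized cases (9-16
bytes, label 100816 above) the code compares the first and the last 8
bytes of the buffers and lets the two loads overlap, so no residual byte
loop is needed. A hedged C sketch of that trick (helper names are mine,
not from the patch):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Return the difference of the first mismatching bytes, as memcmp() does. */
static int
bytewise_diff(const unsigned char *p1, const unsigned char *p2, size_t n)
{
	for (size_t i = 0; i < n; i++) {
		if (p1[i] != p2[i])
			return ((int)p1[i] - (int)p2[i]);
	}
	return (0);
}

/* Overlapping-load comparison for 9 <= n <= 16. */
static int
cmp_9_to_16(const unsigned char *p1, const unsigned char *p2, size_t n)
{
	uint64_t a, b;

	/* First 8 bytes. */
	memcpy(&a, p1, 8);
	memcpy(&b, p2, 8);
	if (a != b)
		return (bytewise_diff(p1, p2, 8));

	/* Last 8 bytes; this load overlaps the first one whenever n < 16. */
	memcpy(&a, p1 + n - 8, 8);
	memcpy(&b, p2 + n - 8, 8);
	if (a != b)
		return (bytewise_diff(p1 + n - 8, p2 + n - 8, 8));
	return (0);
}

On a mismatch the assembly instead narrows the range down (16 -> 8 -> 4
bytes), as the comment in the diff says, before computing the unsigned
byte difference at the end.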