From: Mateusz Guzik <mjguzik@gmail.com>
Subject: Re: [PATCH] amd64: import optimized memcmp from FreeBSD
To: Mike Larkin <mlarkin@nested.page>
Cc: tech@openbsd.org
Date: Sat, 30 Nov 2024 03:49:33 +0100

On Sat, Nov 30, 2024 at 1:04 AM Mike Larkin <mlarkin@nested.page> wrote:
>
> On Fri, Nov 29, 2024 at 02:01:45AM +0100, Mateusz Guzik wrote:
> > The rep-prefixed cmps is incredibly slow even on modern CPUs.
> >
> > The new implementation uses regular cmp to do it.
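> >
> > To illustrate the strategy in C (only a sketch of the 8-16 byte case
> > with made-up names, not the actual code; the asm narrows a mismatching
> > quadword down 8 -> 4 -> 1 instead of using a byte loop):
> >
> > 	#include <stddef.h>
> > 	#include <stdint.h>
> > 	#include <string.h>
> >
> > 	/* Byte-wise fallback once a differing 8-byte word is known. */
> > 	static int
> > 	find_diff(const unsigned char *p, const unsigned char *q)
> > 	{
> > 		while (*p == *q) {
> > 			p++;
> > 			q++;
> > 		}
> > 		return (*p - *q);
> > 	}
> >
> > 	/* 8 <= len <= 16: two (possibly overlapping) 8-byte loads, one
> > 	 * from the head and one from the tail, cover the whole buffer
> > 	 * without looping over bytes. */
> > 	static int
> > 	memcmp_8_16(const void *s1, const void *s2, size_t len)
> > 	{
> > 		const unsigned char *p = s1, *q = s2;
> > 		uint64_t a, b;
> >
> > 		memcpy(&a, p, 8);
> > 		memcpy(&b, q, 8);
> > 		if (a != b)
> > 			return (find_diff(p, q));
> > 		memcpy(&a, p + len - 8, 8);
> > 		memcpy(&b, q + len - 8, 8);
> > 		if (a != b)
> > 			return (find_diff(p + len - 8, q + len - 8));
> > 		return (0);
> > 	}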
> >
> > The code got augmented to account for retguard, otherwise it matches FreeBSD.
> >
> > On Sapphire Rapids open1 (open + close in a loop) from will-it-scale (ops/s):
> > before:       436177
> > after:        464670 (+6.5%)
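> >
> > (Per iteration, open1 boils down to the loop below; the real harness
> > pins worker processes and samples a shared counter, and the path here
> > is only a stand-in:)
> >
> > 	#include <fcntl.h>
> > 	#include <unistd.h>
> >
> > 	int
> > 	main(void)
> > 	{
> > 		for (;;) {
> > 			int fd = open("/tmp/open1.testfile",
> > 			    O_RDWR | O_CREAT, 0600);
> > 			if (fd == -1)
> > 				return (1);
> > 			close(fd);
> > 		}
> > 	}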
> >
> > Profiling the stock kernel with btrace -e 'profile:hz:99 { @[kstack] = count(); }'
> > shows the routine at the top of the profile; with the patch it drops to the bottom.
> >
> > The new top is memmove which also got optimized in FreeBSD.
> >
> > The other routines can receive similar treatment (including for libc), but I
> > have no interest in doing it. I guarantee there is quite a bit of extra
> > single-threaded perf up for grabs with this alone -- the old string ops are total crap.
> >
> > cheers
> > ---
> >  sys/lib/libkern/arch/amd64/memcmp.S | 244 ++++++++++++++++++++++++----
> >  1 file changed, 214 insertions(+), 30 deletions(-)
> >
>
> can you please explain what the asm labels mean?
>
> eg what is the difference between 10080816, 10001632, 10000408, etc?
>

They indicate what sizes are handled by a given chunk (e.g. 0408 handles
between 4 and 8 bytes).

I would use more descriptive names, but I kept them like this for
consistency with the other routines in FreeBSD.

memmove got implemented as a macro for easy reuse in copyin/copyout,
and a macro needs numeric labels instead of ASCII names, so I picked
something which encodes the sizes.
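
Concretely, the dispatch in this patch works out to:

	100204		2-4 bytes
	100408		4-8 bytes
	100816		8-16 bytes
	101632		16-32 bytes
	103200		the main loop for 32+ bytes
	100000		shared return for len 0 (also reached for len 1)

The longer labels append the mismatch offset to the size class, e.g.
10163208 means "16-32 byte path, mismatch in the quadword at offset 8".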

> > diff --git a/sys/lib/libkern/arch/amd64/memcmp.S b/sys/lib/libkern/arch/amd64/memcmp.S
> > index b9944361869..f76552af1b4 100644
> > --- a/sys/lib/libkern/arch/amd64/memcmp.S
> > +++ b/sys/lib/libkern/arch/amd64/memcmp.S
> > @@ -1,39 +1,223 @@
> > -/*
> > - * Written by J.T. Conklin <jtc@netbsd.org>.
> > - * Public domain.
> > - * Adapted for NetBSD/x86_64 by Frank van der Linden <fvdl@wasabisystems.com>
> > +/*-
> > + * Copyright (c) 2018 The FreeBSD Foundation
> > + *
> > + * This software was developed by Mateusz Guzik <mjg@FreeBSD.org>
> > + * under sponsorship from the FreeBSD Foundation.
> > + *
> > + * Redistribution and use in source and binary forms, with or without
> > + * modification, are permitted provided that the following conditions
> > + * are met:
> > + * 1. Redistributions of source code must retain the above copyright
> > + *    notice, this list of conditions and the following disclaimer.
> > + * 2. Redistributions in binary form must reproduce the above copyright
> > + *    notice, this list of conditions and the following disclaimer in the
> > + *    documentation and/or other materials provided with the distribution.
> > + *
> > + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
> > + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
> > + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
> > + * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
> > + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
> > + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
> > + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
> > + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
> > + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
> > + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
> > + * SUCH DAMAGE.
> >   */
> >
> >  #include <machine/asm.h>
> >
> > +#define ALIGN_TEXT      .p2align 4,0x90 /* 16-byte alignment, nop filled */
> > +
> >  ENTRY(memcmp)
> >       RETGUARD_SETUP(memcmp, r11)
> > -     movq    %rdx,%rcx               /* compare by longs */
> > -     shrq    $3,%rcx
> > -     repe
> > -     cmpsq
> > -     jne     5f                      /* do we match so far? */
> > -
> > -     movq    %rdx,%rcx               /* compare remainder by bytes */
> > -     andq    $7,%rcx
> > -     repe
> > -     cmpsb
> > -     jne     6f                      /* do we match? */
> > -
> > -     xorl    %eax,%eax               /* we match, return zero        */
> > -     jmp     7f
> > -
> > -5:   movl    $8,%ecx                 /* We know that one of the next */
> > -     subq    %rcx,%rdi               /* eight pairs of bytes do not  */
> > -     subq    %rcx,%rsi               /* match.                       */
> > -     repe
> > -     cmpsb
> > -6:   xorl    %eax,%eax               /* Perform unsigned comparison  */
> > -     movb    -1(%rdi),%al
> > -     xorl    %edx,%edx
> > -     movb    -1(%rsi),%dl
> > -     subl    %edx,%eax
> > -7:   RETGUARD_CHECK(memcmp, r11)
> > +     xorl    %eax,%eax
> > +10:
> > +     cmpq    $16,%rdx
> > +     ja      101632f
> > +
> > +     cmpb    $8,%dl
> > +     jg      100816f
> > +
> > +     cmpb    $4,%dl
> > +     jg      100408f
> > +
> > +     cmpb    $2,%dl
> > +     jge     100204f
> > +
> > +     cmpb    $1,%dl
> > +     jl      100000f
> > +     movzbl  (%rdi),%eax
> > +     movzbl  (%rsi),%r8d
> > +     subl    %r8d,%eax
> > +100000:
> > +     RETGUARD_CHECK(memcmp, r11)
> > +     ret
> > +     lfence
> > +
> > +     ALIGN_TEXT
> > +100816:
> > +     movq    (%rdi),%r8
> > +     movq    (%rsi),%r9
> > +     cmpq    %r8,%r9
> > +     jne     80f
> > +     movq    -8(%rdi,%rdx),%r8
> > +     movq    -8(%rsi,%rdx),%r9
> > +     cmpq    %r8,%r9
> > +     jne     10081608f
> > +     RETGUARD_CHECK(memcmp, r11)
> > +     ret
> > +     lfence
> > +     ALIGN_TEXT
> > +100408:
> > +     movl    (%rdi),%r8d
> > +     movl    (%rsi),%r9d
> > +     cmpl    %r8d,%r9d
> > +     jne     80f
> > +     movl    -4(%rdi,%rdx),%r8d
> > +     movl    -4(%rsi,%rdx),%r9d
> > +     cmpl    %r8d,%r9d
> > +     jne     10040804f
> > +     RETGUARD_CHECK(memcmp, r11)
> > +     ret
> > +     lfence
> > +     ALIGN_TEXT
> > +100204:
> > +     movzwl  (%rdi),%r8d
> > +     movzwl  (%rsi),%r9d
> > +     cmpl    %r8d,%r9d
> > +     jne     1f
> > +     movzwl  -2(%rdi,%rdx),%r8d
> > +     movzwl  -2(%rsi,%rdx),%r9d
> > +     cmpl    %r8d,%r9d
> > +     jne     1f
> > +     RETGUARD_CHECK(memcmp, r11)
> > +     ret
> > +     lfence
> > +     ALIGN_TEXT
> > +101632:
> > +     cmpq    $32,%rdx
> > +     ja      103200f
> > +     movq    (%rdi),%r8
> > +     movq    (%rsi),%r9
> > +     cmpq    %r8,%r9
> > +     jne     80f
> > +     movq    8(%rdi),%r8
> > +     movq    8(%rsi),%r9
> > +     cmpq    %r8,%r9
> > +     jne     10163208f
> > +     movq    -16(%rdi,%rdx),%r8
> > +     movq    -16(%rsi,%rdx),%r9
> > +     cmpq    %r8,%r9
> > +     jne     10163216f
> > +     movq    -8(%rdi,%rdx),%r8
> > +     movq    -8(%rsi,%rdx),%r9
> > +     cmpq    %r8,%r9
> > +     jne     10163224f
> > +     RETGUARD_CHECK(memcmp, r11)
> > +     ret
> > +     lfence
> > +     ALIGN_TEXT
> > +103200:
> > +     movq    (%rdi),%r8
> > +     movq    8(%rdi),%r9
> > +     subq    (%rsi),%r8
> > +     subq    8(%rsi),%r9
> > +     orq     %r8,%r9
> > +     jnz     10320000f
> > +
> > +     movq    16(%rdi),%r8
> > +     movq    24(%rdi),%r9
> > +     subq    16(%rsi),%r8
> > +     subq    24(%rsi),%r9
> > +     orq     %r8,%r9
> > +     jnz     10320016f
> > +
> > +     leaq    32(%rdi),%rdi
> > +     leaq    32(%rsi),%rsi
> > +     subq    $32,%rdx
> > +     cmpq    $32,%rdx
> > +     jae     103200b
> > +     cmpb    $0,%dl
> > +     jne     10b
> > +     RETGUARD_CHECK(memcmp, r11)
> > +     ret
> > +     lfence
> > +
> > +/*
> > + * Mismatch was found.
> > + *
> > + * We need to compute the difference between strings.
> > + * Start with narrowing the range down (16 -> 8 -> 4 bytes).
> > + */
> > +     ALIGN_TEXT
> > +10320016:
> > +     leaq    16(%rdi),%rdi
> > +     leaq    16(%rsi),%rsi
> > +10320000:
> > +     movq    (%rdi),%r8
> > +     movq    (%rsi),%r9
> > +     cmpq    %r8,%r9
> > +     jne     80f
> > +     leaq    8(%rdi),%rdi
> > +     leaq    8(%rsi),%rsi
> > +     jmp     80f
> > +     ALIGN_TEXT
> > +10081608:
> > +10163224:
> > +     leaq    -8(%rdi,%rdx),%rdi
> > +     leaq    -8(%rsi,%rdx),%rsi
> > +     jmp     80f
> > +     ALIGN_TEXT
> > +10163216:
> > +     leaq    -16(%rdi,%rdx),%rdi
> > +     leaq    -16(%rsi,%rdx),%rsi
> > +     jmp     80f
> > +     ALIGN_TEXT
> > +10163208:
> > +     leaq    8(%rdi),%rdi
> > +     leaq    8(%rsi),%rsi
> > +     jmp     80f
> > +     ALIGN_TEXT
> > +10040804:
> > +     leaq    -4(%rdi,%rdx),%rdi
> > +     leaq    -4(%rsi,%rdx),%rsi
> > +     jmp     1f
> > +
> > +     ALIGN_TEXT
> > +80:
> > +     movl    (%rdi),%r8d
> > +     movl    (%rsi),%r9d
> > +     cmpl    %r8d,%r9d
> > +     jne     1f
> > +     leaq    4(%rdi),%rdi
> > +     leaq    4(%rsi),%rsi
> > +
> > +/*
> > + * We have up to 4 bytes to inspect.
> > + */
> > +1:
> > +     movzbl  (%rdi),%eax
> > +     movzbl  (%rsi),%r8d
> > +     cmpb    %r8b,%al
> > +     jne     2f
> > +
> > +     movzbl  1(%rdi),%eax
> > +     movzbl  1(%rsi),%r8d
> > +     cmpb    %r8b,%al
> > +     jne     2f
> > +
> > +     movzbl  2(%rdi),%eax
> > +     movzbl  2(%rsi),%r8d
> > +     cmpb    %r8b,%al
> > +     jne     2f
> > +
> > +     movzbl  3(%rdi),%eax
> > +     movzbl  3(%rsi),%r8d
> > +2:
> > +     subl    %r8d,%eax
> > +     RETGUARD_CHECK(memcmp, r11)
> >       ret
> >       lfence
> >  END(memcmp)
> > --
> > 2.44.0
> >



-- 
Mateusz Guzik <mjguzik gmail.com>