
From: Mark Kettenis <mark.kettenis@xs4all.nl>
Subject: Re: [PATCH] amd64: import optimized memcmp from FreeBSD
To: Mateusz Guzik <mjguzik@gmail.com>
Cc: tech@openbsd.org, mjguzik@gmail.com
Date: Fri, 29 Nov 2024 23:08:19 +0100

    > From: Mateusz Guzik <mjguzik@gmail.com>
    > Date: Fri, 29 Nov 2024 02:01:45 +0100
    > 
    > The rep-prefixed cmps instruction is incredibly slow even on modern CPUs.
    > 
    > The new implementation uses regular cmp instructions instead.
    > 
    > The code was augmented to account for retguard; otherwise it matches the FreeBSD version.
    > 
    > On Sapphire Rapids open1 (open + close in a loop) from will-it-scale (ops/s):
    > before:	436177
    > after:	464670 (+6.5%)
    
    That is a nice speedup, but you'd need to present more than a single
    microbenchmark on a single microarchitecture to convince me that this
    is worth the extra complication of the assembly code.
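
    For context on the numbers: the open1 workload is essentially open(2)
    followed by close(2) in a tight loop, scaled across processes or threads
    by will-it-scale and reported as operations per second. A minimal
    single-process sketch of that kind of microbenchmark -- not the actual
    will-it-scale harness, with /tmp/open1-scratch and the 5-second window as
    arbitrary placeholders -- would look roughly like this in C:

        #include <fcntl.h>
        #include <stdio.h>
        #include <time.h>
        #include <unistd.h>

        int
        main(void)
        {
            const char *path = "/tmp/open1-scratch";    /* placeholder file */
            unsigned long ops = 0;
            time_t end;

            /* create the file once so the timed loop only opens and closes it */
            close(open(path, O_CREAT | O_RDWR, 0600));

            end = time(NULL) + 5;
            while (time(NULL) < end) {
                int fd = open(path, O_RDONLY);
                if (fd == -1)
                    return 1;
                close(fd);
                ops++;
            }
            printf("%lu ops/s\n", ops / 5);
            return 0;
        }

    Every iteration goes through path lookup, which is presumably where the
    kernel memcmp calls in the profile mentioned below come from.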
    
    > Profiling the stock kernel with btrace -e 'profile:hz:99 { @[kstack] = count(); }'
    > shows the routine at the top of the profile; with the patch it drops to the bottom.
    > 
    > The new top is memmove which also got optimized in FreeBSD.
    > 
    > The other routines can receive similar treatment (including in libc), but I
    > have no interest in doing it. I guarantee there is quite a bit of extra
    > single-threaded performance up for grabs with this alone -- the old string ops
    > are total crap.
    > 
    > cheers
    > ---
    >  sys/lib/libkern/arch/amd64/memcmp.S | 244 ++++++++++++++++++++++++----
    >  1 file changed, 214 insertions(+), 30 deletions(-)
    > 
    > diff --git a/sys/lib/libkern/arch/amd64/memcmp.S b/sys/lib/libkern/arch/amd64/memcmp.S
    > index b9944361869..f76552af1b4 100644
    > --- a/sys/lib/libkern/arch/amd64/memcmp.S
    > +++ b/sys/lib/libkern/arch/amd64/memcmp.S
    > @@ -1,39 +1,223 @@
    > -/*
    > - * Written by J.T. Conklin <jtc@netbsd.org>.
    > - * Public domain.
    > - * Adapted for NetBSD/x86_64 by Frank van der Linden <fvdl@wasabisystems.com>
    > +/*-
    > + * Copyright (c) 2018 The FreeBSD Foundation
    > + *
    > + * This software was developed by Mateusz Guzik <mjg@FreeBSD.org>
    > + * under sponsorship from the FreeBSD Foundation.
    > + *
    > + * Redistribution and use in source and binary forms, with or without
    > + * modification, are permitted provided that the following conditions
    > + * are met:
    > + * 1. Redistributions of source code must retain the above copyright
    > + *    notice, this list of conditions and the following disclaimer.
    > + * 2. Redistributions in binary form must reproduce the above copyright
    > + *    notice, this list of conditions and the following disclaimer in the
    > + *    documentation and/or other materials provided with the distribution.
    > + *
    > + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
    > + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
    > + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
    > + * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
    > + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
    > + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
    > + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
    > + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
    > + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
    > + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
    > + * SUCH DAMAGE.
    >   */
    >  
    >  #include <machine/asm.h>
    >  
    > +#define ALIGN_TEXT      .p2align 4,0x90 /* 16-byte alignment, nop filled */
    > +
    >  ENTRY(memcmp)
    >  	RETGUARD_SETUP(memcmp, r11)
    > -	movq	%rdx,%rcx		/* compare by longs */
    > -	shrq	$3,%rcx
    > -	repe
    > -	cmpsq
    > -	jne	5f			/* do we match so far? */
    > -
    > -	movq	%rdx,%rcx		/* compare remainder by bytes */
    > -	andq	$7,%rcx
    > -	repe
    > -	cmpsb
    > -	jne	6f			/* do we match? */
    > -
    > -	xorl	%eax,%eax		/* we match, return zero	*/
    > -	jmp	7f
    > -
    > -5:	movl	$8,%ecx			/* We know that one of the next	*/
    > -	subq	%rcx,%rdi		/* eight pairs of bytes do not	*/
    > -	subq	%rcx,%rsi		/* match.			*/
    > -	repe
    > -	cmpsb
    > -6:	xorl	%eax,%eax		/* Perform unsigned comparison	*/
    > -	movb	-1(%rdi),%al
    > -	xorl	%edx,%edx
    > -	movb	-1(%rsi),%dl
    > -	subl    %edx,%eax
    > -7:	RETGUARD_CHECK(memcmp, r11)
    > +	xorl	%eax,%eax
    > +10:
    > +	cmpq	$16,%rdx
    > +	ja	101632f
    > +
    > +	cmpb	$8,%dl
    > +	jg	100816f
    > +
    > +	cmpb	$4,%dl
    > +	jg	100408f
    > +
    > +	cmpb	$2,%dl
    > +	jge	100204f
    > +
    > +	cmpb	$1,%dl
    > +	jl	100000f
    > +	movzbl	(%rdi),%eax
    > +	movzbl	(%rsi),%r8d
    > +	subl	%r8d,%eax
    > +100000:
    > +	RETGUARD_CHECK(memcmp, r11)
    > +	ret
    > +	lfence
    > +
    > +	ALIGN_TEXT
    > +100816:
    > +	movq	(%rdi),%r8
    > +	movq	(%rsi),%r9
    > +	cmpq	%r8,%r9
    > +	jne	80f
    > +	movq	-8(%rdi,%rdx),%r8
    > +	movq	-8(%rsi,%rdx),%r9
    > +	cmpq	%r8,%r9
    > +	jne	10081608f
    > +	RETGUARD_CHECK(memcmp, r11)
    > +	ret
    > +	lfence
    > +	ALIGN_TEXT
    > +100408:
    > +	movl	(%rdi),%r8d
    > +	movl	(%rsi),%r9d
    > +	cmpl	%r8d,%r9d
    > +	jne	80f
    > +	movl	-4(%rdi,%rdx),%r8d
    > +	movl	-4(%rsi,%rdx),%r9d
    > +	cmpl	%r8d,%r9d
    > +	jne	10040804f
    > +	RETGUARD_CHECK(memcmp, r11)
    > +	ret
    > +	lfence
    > +	ALIGN_TEXT
    > +100204:
    > +	movzwl	(%rdi),%r8d
    > +	movzwl	(%rsi),%r9d
    > +	cmpl	%r8d,%r9d
    > +	jne	1f
    > +	movzwl	-2(%rdi,%rdx),%r8d
    > +	movzwl	-2(%rsi,%rdx),%r9d
    > +	cmpl	%r8d,%r9d
    > +	jne	1f
    > +	RETGUARD_CHECK(memcmp, r11)
    > +	ret
    > +	lfence
    > +	ALIGN_TEXT
    > +101632:
    > +	cmpq	$32,%rdx
    > +	ja	103200f
    > +	movq	(%rdi),%r8
    > +	movq	(%rsi),%r9
    > +	cmpq	%r8,%r9
    > +	jne	80f
    > +	movq	8(%rdi),%r8
    > +	movq	8(%rsi),%r9
    > +	cmpq	%r8,%r9
    > +	jne	10163208f
    > +	movq	-16(%rdi,%rdx),%r8
    > +	movq	-16(%rsi,%rdx),%r9
    > +	cmpq	%r8,%r9
    > +	jne	10163216f
    > +	movq	-8(%rdi,%rdx),%r8
    > +	movq	-8(%rsi,%rdx),%r9
    > +	cmpq	%r8,%r9
    > +	jne	10163224f
    > +	RETGUARD_CHECK(memcmp, r11)
    > +	ret
    > +	lfence
    > +	ALIGN_TEXT
    > +103200:
    > +	movq	(%rdi),%r8
    > +	movq	8(%rdi),%r9
    > +	subq	(%rsi),%r8
    > +	subq	8(%rsi),%r9
    > +	orq	%r8,%r9
    > +	jnz	10320000f
    > +
    > +	movq    16(%rdi),%r8
    > +	movq    24(%rdi),%r9
    > +	subq    16(%rsi),%r8
    > +	subq    24(%rsi),%r9
    > +	orq	%r8,%r9
    > +	jnz     10320016f
    > +
    > +	leaq	32(%rdi),%rdi
    > +	leaq	32(%rsi),%rsi
    > +	subq	$32,%rdx
    > +	cmpq	$32,%rdx
    > +	jae	103200b
    > +	cmpb	$0,%dl
    > +	jne	10b
    > +	RETGUARD_CHECK(memcmp, r11)
    > +	ret
    > +	lfence
    > +
    > +/*
    > + * Mismatch was found.
    > + *
    > + * We need to compute the difference between strings.
    > + * Start with narrowing the range down (16 -> 8 -> 4 bytes).
    > + */
    > +	ALIGN_TEXT
    > +10320016:
    > +	leaq	16(%rdi),%rdi
    > +	leaq	16(%rsi),%rsi
    > +10320000:
    > +	movq	(%rdi),%r8
    > +	movq	(%rsi),%r9
    > +	cmpq	%r8,%r9
    > +	jne	80f
    > +	leaq	8(%rdi),%rdi
    > +	leaq	8(%rsi),%rsi
    > +	jmp	80f
    > +	ALIGN_TEXT
    > +10081608:
    > +10163224:
    > +	leaq	-8(%rdi,%rdx),%rdi
    > +	leaq	-8(%rsi,%rdx),%rsi
    > +	jmp	80f
    > +	ALIGN_TEXT
    > +10163216:
    > +	leaq	-16(%rdi,%rdx),%rdi
    > +	leaq	-16(%rsi,%rdx),%rsi
    > +	jmp	80f
    > +	ALIGN_TEXT
    > +10163208:
    > +	leaq	8(%rdi),%rdi
    > +	leaq	8(%rsi),%rsi
    > +	jmp	80f
    > +	ALIGN_TEXT
    > +10040804:
    > +	leaq	-4(%rdi,%rdx),%rdi
    > +	leaq	-4(%rsi,%rdx),%rsi
    > +	jmp	1f
    > +
    > +	ALIGN_TEXT
    > +80:
    > +	movl	(%rdi),%r8d
    > +	movl	(%rsi),%r9d
    > +	cmpl	%r8d,%r9d
    > +	jne	1f
    > +	leaq	4(%rdi),%rdi
    > +	leaq	4(%rsi),%rsi
    > +
    > +/*
    > + * We have up to 4 bytes to inspect.
    > + */
    > +1:
    > +	movzbl	(%rdi),%eax
    > +	movzbl	(%rsi),%r8d
    > +	cmpb	%r8b,%al
    > +	jne	2f
    > +
    > +	movzbl	1(%rdi),%eax
    > +	movzbl	1(%rsi),%r8d
    > +	cmpb	%r8b,%al
    > +	jne	2f
    > +
    > +	movzbl	2(%rdi),%eax
    > +	movzbl	2(%rsi),%r8d
    > +	cmpb	%r8b,%al
    > +	jne	2f
    > +
    > +	movzbl	3(%rdi),%eax
    > +	movzbl	3(%rsi),%r8d
    > +2:
    > +	subl	%r8d,%eax
    > +	RETGUARD_CHECK(memcmp, r11)
    >  	ret
    >  	lfence
    >  END(memcmp)
    > -- 
    > 2.44.0
    > 
    > 
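
    For readers who want the idea behind the assembly without stepping through
    all of it: the new routine dispatches on length and replaces the rep loops
    with a handful of wide loads, overlapping them so that, for example, any
    length from 9 to 16 bytes takes exactly two 8-byte comparisons. A rough C
    rendering of that 9-16 byte case -- a sketch for illustration only, with
    cmp_9_16() and bytediff() as made-up names rather than anything in the
    patch -- would be:

        #include <stdint.h>
        #include <string.h>

        /* memcmp-style difference of the first mismatching byte, if any */
        static int
        bytediff(const unsigned char *a, const unsigned char *b, size_t n)
        {
            for (size_t i = 0; i < n; i++)
                if (a[i] != b[i])
                    return (int)a[i] - (int)b[i];
            return 0;
        }

        /* lengths 9..16: compare the first and last 8 bytes; the loads overlap */
        static int
        cmp_9_16(const unsigned char *p, const unsigned char *q, size_t len)
        {
            uint64_t a, b;

            memcpy(&a, p, 8);
            memcpy(&b, q, 8);
            if (a != b)
                return bytediff(p, q, 8);
            memcpy(&a, p + len - 8, 8);
            memcpy(&b, q + len - 8, 8);
            if (a != b)
                return bytediff(p + len - 8, q + len - 8, 8);
            return 0;
        }

    The 17-32 byte case works the same way with four 8-byte comparisons, larger
    buffers go through the unrolled 32-byte loop at label 103200, and only once a
    mismatching word has been found does the code narrow down to byte granularity
    to compute the signed return value.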
    
    
    