From: Stuart Henderson <stu@spacehopper.org>
Subject: Re: [PATCH] amd64: import optimized memcmp from FreeBSD
To: Mateusz Guzik <mjguzik@gmail.com>
Cc: tech@openbsd.org
Date: Fri, 29 Nov 2024 15:49:08 +0000

On 2024/11/29 02:01, Mateusz Guzik wrote:
> The rep-prefixed cmps instruction is incredibly slow even on modern CPUs.
> 
> The new implementation uses regular cmp instructions instead.
> 
> The code was augmented to account for retguard; otherwise it matches FreeBSD.
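
For reference, the approach amounts to roughly this C sketch (a minimal
illustration only, not the committed code; the real asm also special-cases
short lengths and uses overlapping loads for the tails):

	#include <stddef.h>
	#include <stdint.h>
	#include <string.h>

	int
	sketch_memcmp(const void *a, const void *b, size_t n)
	{
		const unsigned char *p = a, *q = b;
		uint64_t x, y;

		/* Compare 8 bytes at a time with plain loads, no rep prefix. */
		while (n >= 8) {
			memcpy(&x, p, 8);
			memcpy(&y, q, 8);
			if (x != y)
				break;		/* narrow down only on a mismatch */
			p += 8;
			q += 8;
			n -= 8;
		}

		/* Byte-granular tail, also reached on a word mismatch. */
		while (n--) {
			if (*p != *q)
				return (*p - *q);
			p++;
			q++;
		}
		return (0);
	}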

Would that make sense in libc too?

> On Sapphire Rapids, open1 (open + close in a loop) from will-it-scale (ops/s):
> before:	436177
> after:	464670 (+6.5%)
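
For context, open1's hot loop is essentially the following (a rough sketch:
the real will-it-scale harness forks per-CPU workers and samples a shared
counter; the path and the 5-second window here are placeholders):

	#include <fcntl.h>
	#include <stdio.h>
	#include <time.h>
	#include <unistd.h>

	int
	main(void)
	{
		const char *path = "/tmp/open1.tmp";	/* placeholder path */
		unsigned long ops = 0;
		time_t end;
		int fd;

		/* Make sure the file exists before the timed loop. */
		if ((fd = open(path, O_RDWR | O_CREAT, 0600)) == -1)
			return (1);
		close(fd);

		/* Count open+close pairs over a 5-second window. */
		end = time(NULL) + 5;
		while (time(NULL) < end) {
			if ((fd = open(path, O_RDONLY)) == -1)
				return (1);
			close(fd);
			ops++;
		}
		printf("%lu ops/s\n", ops / 5);
		return (0);
	}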
> 
> Profiling the stock kernel with btrace -e 'profile:hz:99 { @[kstack] = count(); }'
> shows the routine at the top of the profile; it drops to the bottom with the patch.
> 
> The new top is memmove, which was also optimized in FreeBSD.
> 
> The other routines can receive similar treatment (including for libc), but I
> have no interest in doing it. I guarantee there is quite a bit of extra
> single-threaded perf up for grabs with this alone -- the old string ops are total crap.
> 
> cheers
> ---
>  sys/lib/libkern/arch/amd64/memcmp.S | 244 ++++++++++++++++++++++++----
>  1 file changed, 214 insertions(+), 30 deletions(-)
> 
> diff --git a/sys/lib/libkern/arch/amd64/memcmp.S b/sys/lib/libkern/arch/amd64/memcmp.S
> index b9944361869..f76552af1b4 100644
> --- a/sys/lib/libkern/arch/amd64/memcmp.S
> +++ b/sys/lib/libkern/arch/amd64/memcmp.S
> @@ -1,39 +1,223 @@
> -/*
> - * Written by J.T. Conklin <jtc@netbsd.org>.
> - * Public domain.
> - * Adapted for NetBSD/x86_64 by Frank van der Linden <fvdl@wasabisystems.com>
> +/*-
> + * Copyright (c) 2018 The FreeBSD Foundation
> + *
> + * This software was developed by Mateusz Guzik <mjg@FreeBSD.org>
> + * under sponsorship from the FreeBSD Foundation.
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + * 1. Redistributions of source code must retain the above copyright
> + *    notice, this list of conditions and the following disclaimer.
> + * 2. Redistributions in binary form must reproduce the above copyright
> + *    notice, this list of conditions and the following disclaimer in the
> + *    documentation and/or other materials provided with the distribution.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
> + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
> + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
> + * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
> + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
> + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
> + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
> + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
> + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
> + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
> + * SUCH DAMAGE.
>   */
>  
>  #include <machine/asm.h>
>  
> +#define ALIGN_TEXT      .p2align 4,0x90 /* 16-byte alignment, nop filled */
> +
>  ENTRY(memcmp)
>  	RETGUARD_SETUP(memcmp, r11)
> -	movq	%rdx,%rcx		/* compare by longs */
> -	shrq	$3,%rcx
> -	repe
> -	cmpsq
> -	jne	5f			/* do we match so far? */
> -
> -	movq	%rdx,%rcx		/* compare remainder by bytes */
> -	andq	$7,%rcx
> -	repe
> -	cmpsb
> -	jne	6f			/* do we match? */
> -
> -	xorl	%eax,%eax		/* we match, return zero	*/
> -	jmp	7f
> -
> -5:	movl	$8,%ecx			/* We know that one of the next	*/
> -	subq	%rcx,%rdi		/* eight pairs of bytes do not	*/
> -	subq	%rcx,%rsi		/* match.			*/
> -	repe
> -	cmpsb
> -6:	xorl	%eax,%eax		/* Perform unsigned comparison	*/
> -	movb	-1(%rdi),%al
> -	xorl	%edx,%edx
> -	movb	-1(%rsi),%dl
> -	subl    %edx,%eax
> -7:	RETGUARD_CHECK(memcmp, r11)
> +	xorl	%eax,%eax
> +10:
> +	cmpq	$16,%rdx
> +	ja	101632f
> +
> +	cmpb	$8,%dl
> +	jg	100816f
> +
> +	cmpb	$4,%dl
> +	jg	100408f
> +
> +	cmpb	$2,%dl
> +	jge	100204f
> +
> +	cmpb	$1,%dl
> +	jl	100000f
> +	movzbl	(%rdi),%eax
> +	movzbl	(%rsi),%r8d
> +	subl	%r8d,%eax
> +100000:
> +	RETGUARD_CHECK(memcmp, r11)
> +	ret
> +	lfence
> +
> +	ALIGN_TEXT
> +100816:
> +	movq	(%rdi),%r8
> +	movq	(%rsi),%r9
> +	cmpq	%r8,%r9
> +	jne	80f
> +	movq	-8(%rdi,%rdx),%r8
> +	movq	-8(%rsi,%rdx),%r9
> +	cmpq	%r8,%r9
> +	jne	10081608f
> +	RETGUARD_CHECK(memcmp, r11)
> +	ret
> +	lfence
> +	ALIGN_TEXT
> +100408:
> +	movl	(%rdi),%r8d
> +	movl	(%rsi),%r9d
> +	cmpl	%r8d,%r9d
> +	jne	80f
> +	movl	-4(%rdi,%rdx),%r8d
> +	movl	-4(%rsi,%rdx),%r9d
> +	cmpl	%r8d,%r9d
> +	jne	10040804f
> +	RETGUARD_CHECK(memcmp, r11)
> +	ret
> +	lfence
> +	ALIGN_TEXT
> +100204:
> +	movzwl	(%rdi),%r8d
> +	movzwl	(%rsi),%r9d
> +	cmpl	%r8d,%r9d
> +	jne	1f
> +	movzwl	-2(%rdi,%rdx),%r8d
> +	movzwl	-2(%rsi,%rdx),%r9d
> +	cmpl	%r8d,%r9d
> +	jne	1f
> +	RETGUARD_CHECK(memcmp, r11)
> +	ret
> +	lfence
> +	ALIGN_TEXT
> +101632:
> +	cmpq	$32,%rdx
> +	ja	103200f
> +	movq	(%rdi),%r8
> +	movq	(%rsi),%r9
> +	cmpq	%r8,%r9
> +	jne	80f
> +	movq	8(%rdi),%r8
> +	movq	8(%rsi),%r9
> +	cmpq	%r8,%r9
> +	jne	10163208f
> +	movq	-16(%rdi,%rdx),%r8
> +	movq	-16(%rsi,%rdx),%r9
> +	cmpq	%r8,%r9
> +	jne	10163216f
> +	movq	-8(%rdi,%rdx),%r8
> +	movq	-8(%rsi,%rdx),%r9
> +	cmpq	%r8,%r9
> +	jne	10163224f
> +	RETGUARD_CHECK(memcmp, r11)
> +	ret
> +	lfence
> +	ALIGN_TEXT
> +103200:
> +	movq	(%rdi),%r8
> +	movq	8(%rdi),%r9
> +	subq	(%rsi),%r8
> +	subq	8(%rsi),%r9
> +	orq	%r8,%r9
> +	jnz	10320000f
> +
> +	movq    16(%rdi),%r8
> +	movq    24(%rdi),%r9
> +	subq    16(%rsi),%r8
> +	subq    24(%rsi),%r9
> +	orq	%r8,%r9
> +	jnz     10320016f
> +
> +	leaq	32(%rdi),%rdi
> +	leaq	32(%rsi),%rsi
> +	subq	$32,%rdx
> +	cmpq	$32,%rdx
> +	jae	103200b
> +	cmpb	$0,%dl
> +	jne	10b
> +	RETGUARD_CHECK(memcmp, r11)
> +	ret
> +	lfence
> +
> +/*
> + * Mismatch was found.
> + *
> + * We need to compute the difference between strings.
> + * Start with narrowing the range down (16 -> 8 -> 4 bytes).
> + */
> +	ALIGN_TEXT
> +10320016:
> +	leaq	16(%rdi),%rdi
> +	leaq	16(%rsi),%rsi
> +10320000:
> +	movq	(%rdi),%r8
> +	movq	(%rsi),%r9
> +	cmpq	%r8,%r9
> +	jne	80f
> +	leaq	8(%rdi),%rdi
> +	leaq	8(%rsi),%rsi
> +	jmp	80f
> +	ALIGN_TEXT
> +10081608:
> +10163224:
> +	leaq	-8(%rdi,%rdx),%rdi
> +	leaq	-8(%rsi,%rdx),%rsi
> +	jmp	80f
> +	ALIGN_TEXT
> +10163216:
> +	leaq	-16(%rdi,%rdx),%rdi
> +	leaq	-16(%rsi,%rdx),%rsi
> +	jmp	80f
> +	ALIGN_TEXT
> +10163208:
> +	leaq	8(%rdi),%rdi
> +	leaq	8(%rsi),%rsi
> +	jmp	80f
> +	ALIGN_TEXT
> +10040804:
> +	leaq	-4(%rdi,%rdx),%rdi
> +	leaq	-4(%rsi,%rdx),%rsi
> +	jmp	1f
> +
> +	ALIGN_TEXT
> +80:
> +	movl	(%rdi),%r8d
> +	movl	(%rsi),%r9d
> +	cmpl	%r8d,%r9d
> +	jne	1f
> +	leaq	4(%rdi),%rdi
> +	leaq	4(%rsi),%rsi
> +
> +/*
> + * We have up to 4 bytes to inspect.
> + */
> +1:
> +	movzbl	(%rdi),%eax
> +	movzbl	(%rsi),%r8d
> +	cmpb	%r8b,%al
> +	jne	2f
> +
> +	movzbl	1(%rdi),%eax
> +	movzbl	1(%rsi),%r8d
> +	cmpb	%r8b,%al
> +	jne	2f
> +
> +	movzbl	2(%rdi),%eax
> +	movzbl	2(%rsi),%r8d
> +	cmpb	%r8b,%al
> +	jne	2f
> +
> +	movzbl	3(%rdi),%eax
> +	movzbl	3(%rsi),%r8d
> +2:
> +	subl	%r8d,%eax
> +	RETGUARD_CHECK(memcmp, r11)
>  	ret
>  	lfence
>  END(memcmp)
> -- 
> 2.44.0
>