From: Christian Schulte <cs@schulte.it>
Subject: Re: [PATCH] amd64: import optimized memcmp from FreeBSD
To: tech@openbsd.org
Date: Wed, 4 Dec 2024 02:36:03 +0100

    On 12/3/24 22:31, Christian Weisgerber wrote:
    > Christian Schulte:
    > 
    >> Noticed this myself. I am quite confused right now, as I also wrote a bcmp
    >> function in C and the compiler also generates faster code.
    > 
    > Be careful about running afoul of clang's optimizations.
    > 
    > Calls to memcmp(), memcpy(), memset() with known fixed sizes may
    > be replaced by optimized inline code.
    > 
    > Clang also recognizes some algorithms and replaces them with optimized
    > code.  I would not be surprised if it recognized memcmp()-like code
    > and replaced it with a call to memcmp().
    > 
    > Check the generated assembly to be sure.
    > 
    
    I already looked at the assembly it generates. Do you mean cc -S or
    objdump -d? It seems it really does call the functions, even when
    compiled with -O2.
    
    callq   memcmp@PLT
    callq   c_memcmp@PLT
    callq   bcmp@PLT
    callq   c_bcmp@PLT
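
    For reference, a minimal sketch of the kind of test file that
    produces calls like those above (hypothetical; c_memcmp/c_bcmp
    stand for the C reimplementations built alongside it):

    /*
     * check_calls.c -- hypothetical example.  Build with
     *   cc -O2 -S check_calls.c
     * or
     *   cc -O2 -c check_calls.c && objdump -d check_calls.o
     * The volatile length keeps the size unknown at compile time, so
     * clang cannot expand the comparison inline; with a constant
     * length it may emit inline code instead of a call.  -fno-builtin
     * disables such transformations entirely.
     */
    #include <string.h>
    #include <strings.h>

    int c_memcmp(const void *, const void *, size_t);
    int c_bcmp(const void *, const void *, size_t);

    volatile size_t n = 4096;
    unsigned char a[4096], b[4096];

    int
    check(void)
    {
            int r = 0;

            r |= memcmp(a, b, n);   /* expect callq memcmp@PLT */
            r |= c_memcmp(a, b, n);
            r |= bcmp(a, b, n);
            r |= c_bcmp(a, b, n);
            return r;
    }

    /*
     * clang may also recognize an open-coded loop like this one and
     * turn it back into a call to memcmp()/bcmp(), as described above.
     */
    int
    open_coded(const unsigned char *p, const unsigned char *q, size_t len)
    {
            while (len--)
                    if (*p++ != *q++)
                            return 1;
            return 0;
    }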
    
    Rebuilt libc and GENERIC.MP with those two versions. I cannot tell
    whether that makes any noticeable difference; at least it does not
    slow things down. Without a real-world test dataset, it makes no
    sense to do anything about it. For example, if things are a bit
    slower for small lengths but faster for larger lengths, the change
    is not worth it when the majority of calls use small lengths.
    memcmp and bcmp aren't really used that much anyway; there are
    other candidates worth looking at. I will run it this way for some
    time.
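
    Should a representative set of call lengths ever turn up, a rough
    sketch of how the length sensitivity could be measured
    (hypothetical harness; the length mix below is assumed, not real
    data):

    /*
     * bench_memcmp.c -- hypothetical microbenchmark sketch.  It times
     * memcmp() over a fixed set of lengths; the length distribution
     * has to come from a real workload for the numbers to mean much.
     */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <time.h>

    #define BUFSZ   (1 << 16)
    #define ITERS   1000000

    int
    main(void)
    {
            static unsigned char a[BUFSZ], b[BUFSZ];
            volatile size_t lengths[] = { 4, 8, 16, 64, 256, 4096 };
            struct timespec t0, t1;
            volatile int sink = 0;
            size_t i, j;

            arc4random_buf(a, BUFSZ);
            memcpy(b, a, BUFSZ);    /* equal buffers: full-length compare */

            for (i = 0; i < sizeof(lengths) / sizeof(lengths[0]); i++) {
                    clock_gettime(CLOCK_MONOTONIC, &t0);
                    for (j = 0; j < ITERS; j++)
                            sink |= memcmp(a, b, lengths[i]);
                    clock_gettime(CLOCK_MONOTONIC, &t1);
                    printf("len %6zu: %.3f ns/call\n", lengths[i],
                        ((t1.tv_sec - t0.tv_sec) * 1e9 +
                        (t1.tv_nsec - t0.tv_nsec)) / ITERS);
            }
            return 0;
    }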
    
    
    
    
    Index: lib/libc/arch/amd64/string/Makefile.inc
    ===================================================================
    RCS file: /cvs/src/lib/libc/arch/amd64/string/Makefile.inc,v
    retrieving revision 1.8
    diff -u -p -u -r1.8 Makefile.inc
    --- lib/libc/arch/amd64/string/Makefile.inc	9 Dec 2014 15:10:39 -0000	1.8
    +++ lib/libc/arch/amd64/string/Makefile.inc	4 Dec 2024 01:21:02 -0000
    @@ -2,8 +2,8 @@
     
     SRCS+=	memcpy.c memmove.S \
     	strchr.S strrchr.S \
    -	bcmp.S bzero.S ffs.S memchr.S \
    +	bcmp_64.c bzero.S ffs.S memchr.S \
     	memset.S strcat.c strcmp.S \
    -	memcmp.c strcpy.c strcspn.c strlen.S strlcat.c \
    +	memcmp_64.c strcpy.c strcspn.c strlen.S strlcat.c \
     	strlcpy.c strncat.c strncmp.c strncpy.c strpbrk.c \
     	strsep.c strspn.c strstr.c swab.c
    Index: lib/libc/arch/amd64/string/bcmp.S
    ===================================================================
    RCS file: lib/libc/arch/amd64/string/bcmp.S
    diff -N lib/libc/arch/amd64/string/bcmp.S
    --- lib/libc/arch/amd64/string/bcmp.S	1 Jan 2022 23:47:14 -0000	1.8
    +++ /dev/null	1 Jan 1970 00:00:00 -0000
    @@ -1,23 +0,0 @@
    -#include "DEFS.h"
    -
    -ENTRY_NB(bcmp)
    -	RETGUARD_SETUP(bcmp, r11)
    -	xorl	%eax,%eax		/* clear return value */
    -	cld				/* set compare direction forward */
    -
    -	movq	%rdx,%rcx		/* compare by words */
    -	shrq	$3,%rcx
    -	repe
    -	cmpsq
    -	jne	L1
    -
    -	movq	%rdx,%rcx		/* compare remainder by bytes */
    -	andq	$7,%rcx
    -	repe
    -	cmpsb
    -	je	L2
    -
    -L1:	incl	%eax
    -L2:	RETGUARD_CHECK(bcmp, r11)
    -	ret
    -END_WEAK(bcmp)
    Index: lib/libc/arch/amd64/string/bcmp_64.c
    ===================================================================
    RCS file: lib/libc/arch/amd64/string/bcmp_64.c
    diff -N lib/libc/arch/amd64/string/bcmp_64.c
    --- /dev/null	1 Jan 1970 00:00:00 -0000
    +++ lib/libc/arch/amd64/string/bcmp_64.c	4 Dec 2024 01:21:02 -0000
    @@ -0,0 +1,71 @@
    +/*
    + * Copyright (c) 2024 Christian Schulte <cs@schulte.it>
    + *
    + * Permission to use, copy, modify, and distribute this software for any
    + * purpose with or without fee is hereby granted, provided that the above
    + * copyright notice and this permission notice appear in all copies.
    + *
    + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
    + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
    + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
    + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
    + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
    + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
    + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
    + */
    +
    +#include <string.h>
    +#include <sys/types.h>
    +
    +int
    +bcmp(const void *b1, const void *b2, size_t len)
    +{
    +	size_t 		w;
    +	uint8_t        *p8_1 = (void *) b1, *p8_2 = (void *) b2;
    +	uint32_t       *p32_1, *p32_2;
    +	uint64_t       *p64_1, *p64_2;
    +
    +	if (len > 8) {
    +		w = len / 8;
    +
    +		p64_1 = (uint64_t *) p8_1;
    +		p64_2 = (uint64_t *) p8_2;
    +
    +		do {
    +			if (*p64_1++ != *p64_2++)
    +				return (1);
    +
    +			len -= 8;
    +		} while (--w != 0);
    +
    +		p8_1 = (uint8_t *) p64_1;
    +		p8_2 = (uint8_t *) p64_2;
    +	}
    +
    +	if (len > 4) {
    +		w = len / 4;
    +
    +		p32_1 = (uint32_t *) p8_1;
    +		p32_2 = (uint32_t *) p8_2;
    +
    +		do {
    +			if (*p32_1++ != *p32_2++)
    +				return (1);
    +
    +			len -= 4;
    +		} while (--w != 0);
    +
    +		p8_1 = (uint8_t *) p32_1;
    +		p8_2 = (uint8_t *) p32_2;
    +	}
    +
    +	if (len != 0) {
    +		do {
    +			if (*p8_1++ != *p8_2++)
    +				return (1);
    +
    +		} while (--len != 0);
    +	}
    +	return (0);
    +}
    +DEF_WEAK(bcmp);
    Index: lib/libc/arch/amd64/string/memcmp_64.c
    ===================================================================
    RCS file: lib/libc/arch/amd64/string/memcmp_64.c
    diff -N lib/libc/arch/amd64/string/memcmp_64.c
    --- /dev/null	1 Jan 1970 00:00:00 -0000
    +++ lib/libc/arch/amd64/string/memcmp_64.c	4 Dec 2024 01:21:02 -0000
    @@ -0,0 +1,75 @@
    +/*
    + * Copyright (c) 2024 Christian Schulte <cs@schulte.it>
    + *
    + * Permission to use, copy, modify, and distribute this software for any
    + * purpose with or without fee is hereby granted, provided that the above
    + * copyright notice and this permission notice appear in all copies.
    + *
    + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
    + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
    + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
    + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
    + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
    + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
    + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
    + */
    +
    +#include <string.h>
    +#include <sys/types.h>
    +
    +int
    +memcmp(const void *b1, const void *b2, size_t len)
    +{
    +	size_t 		w;
    +	unsigned char  *p8_1 = (void *) b1, *p8_2 = (void *) b2;
    +	uint32_t       *p32_1, *p32_2;
    +	uint64_t       *p64_1, *p64_2;
    +
    +	if (len > 8) {
    +		w = len / 8;
    +
    +		p64_1 = (uint64_t *) p8_1;
    +		p64_2 = (uint64_t *) p8_2;
    +
    +		do {
    +			if (*p64_1++ != *p64_2++) {
    +				p8_1 = (unsigned char *) --p64_1;
    +				p8_2 = (unsigned char *) --p64_2;
    +				goto bytes;
    +			}
    +			len -= 8;
    +		} while (--w != 0);
    +
    +		p8_1 = (unsigned char *) p64_1;
    +		p8_2 = (unsigned char *) p64_2;
    +	}
    +
    +	if (len > 4) {
    +		w = len / 4;
    +
    +		p32_1 = (uint32_t *) p8_1;
    +		p32_2 = (uint32_t *) p8_2;
    +
    +		do {
    +			if (*p32_1++ != *p32_2++) {
    +				p8_1 = (unsigned char *) --p32_1;
    +				p8_2 = (unsigned char *) --p32_2;
    +				goto bytes;
    +			}
    +			len -= 4;
    +		} while (--w != 0);
    +
    +		p8_1 = (unsigned char *) p32_1;
    +		p8_2 = (unsigned char *) p32_2;
    +	}
    +
    +	if (len != 0) {
    +bytes:
    +		do {
    +			if (*p8_1++ != *p8_2++)
    +				return *--p8_1 - *--p8_2;
    +		} while (--len != 0);
    +	}
    +	return (0);
    +}
    +DEF_STRONG(memcmp);
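
    A quick way to sanity-check either implementation after rebuilding
    libc would be something along these lines (a sketch, not a test
    that was actually run): compare memcmp()/bcmp() against a naive
    byte-wise reference for random lengths and mismatch positions.

    /*
     * memcmp_test.c -- hypothetical regression check for the libc
     * memcmp()/bcmp() against a trivial byte-wise reference.
     */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <strings.h>

    static int
    ref_memcmp(const void *b1, const void *b2, size_t len)
    {
            const unsigned char *p = b1, *q = b2;

            for (; len != 0; len--, p++, q++)
                    if (*p != *q)
                            return *p - *q;
            return 0;
    }

    static int
    sign(int v)
    {
            return (v > 0) - (v < 0);
    }

    int
    main(void)
    {
            unsigned char a[512], b[512];
            size_t len, off;
            int i;

            for (i = 0; i < 100000; i++) {
                    len = arc4random_uniform(sizeof(a)) + 1;
                    arc4random_buf(a, len);
                    memcpy(b, a, len);
                    if (arc4random() & 1) {
                            /* flip one byte to a different value */
                            off = arc4random_uniform(len);
                            b[off] ^= 1 + arc4random_uniform(255);
                    }
                    if (sign(memcmp(a, b, len)) !=
                        sign(ref_memcmp(a, b, len)) ||
                        (bcmp(a, b, len) != 0) !=
                        (ref_memcmp(a, b, len) != 0)) {
                            printf("mismatch at len %zu\n", len);
                            return 1;
                    }
            }
            printf("ok\n");
            return 0;
    }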
    
    
    