From: Christian Schulte Subject: Re: [PATCH] amd64: import optimized memcmp from FreeBSD To: tech@openbsd.org Date: Wed, 4 Dec 2024 02:36:03 +0100 On 12/3/24 22:31, Christian Weisgerber wrote: > Christian Schulte: > >> Noticed this myself. I am quite confused right now, as I also wrote a bcmp >> function in C and the compiler also generates faster code. > > Be careful about running afoul of clang's optimizations. > > Calls to memcmp(), memcpy(), memset() with known fixed sizes may > be replaced by optimized inline code. > > Clang also recognizes some algorithms and replaces them with optimized > code. I would not be surprised if it recognized memcmp()-like code > and replaced it with a call to memcmp(). > > Check the generated assembly to be sure. > Already looked at the assembly it generates. Do you mean cc -S or objdump -d? It seems it really calls the functions also when compiled with -O2. callq memcmp@PLT callq c_memcmp@PLT callq bcmp@PLT callq c_bcmp@PLT Rebuild libc and GENERIC.MP with those two versions. I cannot tell if that makes any huge difference. At least it does not slow things down. Without a real world test dataset, it makes no sense doing anything about it. For example, if things are a bit slower for small length values, but faster for greater length values, it makes no sense, if the majority of calls is done with small length values. memcmp and bcmp aren't really used that much. There are other candidates worth looking at. I will run it that way for some time. Index: lib/libc/arch/amd64/string/Makefile.inc =================================================================== RCS file: /cvs/src/lib/libc/arch/amd64/string/Makefile.inc,v retrieving revision 1.8 diff -u -p -u -r1.8 Makefile.inc --- lib/libc/arch/amd64/string/Makefile.inc 9 Dec 2014 15:10:39 -0000 1.8 +++ lib/libc/arch/amd64/string/Makefile.inc 4 Dec 2024 01:21:02 -0000 @@ -2,8 +2,8 @@ SRCS+= memcpy.c memmove.S \ strchr.S strrchr.S \ - bcmp.S bzero.S ffs.S memchr.S \ + bcmp_64.c bzero.S ffs.S memchr.S \ memset.S strcat.c strcmp.S \ - memcmp.c strcpy.c strcspn.c strlen.S strlcat.c \ + memcmp_64.c strcpy.c strcspn.c strlen.S strlcat.c \ strlcpy.c strncat.c strncmp.c strncpy.c strpbrk.c \ strsep.c strspn.c strstr.c swab.c Index: lib/libc/arch/amd64/string/bcmp.S =================================================================== RCS file: lib/libc/arch/amd64/string/bcmp.S diff -N lib/libc/arch/amd64/string/bcmp.S --- lib/libc/arch/amd64/string/bcmp.S 1 Jan 2022 23:47:14 -0000 1.8 +++ /dev/null 1 Jan 1970 00:00:00 -0000 @@ -1,23 +0,0 @@ -#include "DEFS.h" - -ENTRY_NB(bcmp) - RETGUARD_SETUP(bcmp, r11) - xorl %eax,%eax /* clear return value */ - cld /* set compare direction forward */ - - movq %rdx,%rcx /* compare by words */ - shrq $3,%rcx - repe - cmpsq - jne L1 - - movq %rdx,%rcx /* compare remainder by bytes */ - andq $7,%rcx - repe - cmpsb - je L2 - -L1: incl %eax -L2: RETGUARD_CHECK(bcmp, r11) - ret -END_WEAK(bcmp) Index: lib/libc/arch/amd64/string/bcmp_64.c =================================================================== RCS file: lib/libc/arch/amd64/string/bcmp_64.c diff -N lib/libc/arch/amd64/string/bcmp_64.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ lib/libc/arch/amd64/string/bcmp_64.c 4 Dec 2024 01:21:02 -0000 @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2024 Christian Schulte + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include + +int +bcmp(const void *b1, const void *b2, size_t len) +{ + size_t w; + uint8_t *p8_1 = (void *) b1, *p8_2 = (void *) b2; + uint32_t *p32_1, *p32_2; + uint64_t *p64_1, *p64_2; + + if (len > 8) { + w = len / 8; + + p64_1 = (uint64_t *) p8_1; + p64_2 = (uint64_t *) p8_2; + + do { + if (*p64_1++ != *p64_2++) + return (1); + + len -= 8; + } while (--w != 0); + + p8_1 = (uint8_t *) p64_1; + p8_2 = (uint8_t *) p64_2; + } + + if (len > 4) { + w = len / 4; + + p32_1 = (uint32_t *) p8_1; + p32_2 = (uint32_t *) p8_2; + + do { + if (*p32_1++ != *p32_2++) + return (1); + + len -= 4; + } while (--w != 0); + + p8_1 = (uint8_t *) p32_1; + p8_2 = (uint8_t *) p32_2; + } + + if (len != 0) { + do { + if (*p8_1++ != *p8_2++) + return (1); + + } while (--len != 0); + } + return (0); +} +DEF_WEAK(bcmp); Index: lib/libc/arch/amd64/string/memcmp_64.c =================================================================== RCS file: lib/libc/arch/amd64/string/memcmp_64.c diff -N lib/libc/arch/amd64/string/memcmp_64.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ lib/libc/arch/amd64/string/memcmp_64.c 4 Dec 2024 01:21:02 -0000 @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2024 Christian Schulte + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include + +int +memcmp(const void *b1, const void *b2, size_t len) +{ + size_t w; + unsigned char *p8_1 = (void *) b1, *p8_2 = (void *) b2; + uint32_t *p32_1, *p32_2; + uint64_t *p64_1, *p64_2; + + if (len > 8) { + w = len / 8; + + p64_1 = (uint64_t *) p8_1; + p64_2 = (uint64_t *) p8_2; + + do { + if (*p64_1++ != *p64_2++) { + p8_1 = (unsigned char *) --p64_1; + p8_2 = (unsigned char *) --p64_2; + goto bytes; + } + len -= 8; + } while (--w != 0); + + p8_1 = (unsigned char *) p64_1; + p8_2 = (unsigned char *) p64_2; + } + + if (len > 4) { + w = len / 4; + + p32_1 = (uint32_t *) p8_1; + p32_2 = (uint32_t *) p8_2; + + do { + if (*p32_1++ != *p32_2++) { + p8_1 = (unsigned char *) --p32_1; + p8_2 = (unsigned char *) --p32_2; + goto bytes; + } + len -= 4; + } while (--w != 0); + + p8_1 = (unsigned char *) p32_1; + p8_2 = (unsigned char *) p32_2; + } + + if (len != 0) { +bytes: + do { + if (*p8_1++ != *p8_2++) + return *--p8_1 - *--p8_2; + } while (--len != 0); + } + return (0); +} +DEF_STRONG(memcmp);