From: Christian Schulte
Subject: Re: [PATCH] amd64: import optimized memcmp from FreeBSD
To: Stuart Henderson
Cc: Mateusz Guzik, deraadt@openbsd.org, tech@openbsd.org
Date: Tue, 3 Dec 2024 01:38:03 +0100

On 12/2/24 11:03, Stuart Henderson wrote:
> On 2024/12/02 07:57, Mateusz Guzik wrote:
>> OpenBSD libc for the amd64 platform uses a C variant of the memcmp
>> routine (as opposed to the kernel which has the problematic rep variant)
>
> oh, interesting, so bcmp is asm and memcmp is C for amd64!

Noticed this myself. I am quite confused right now, as I also wrote a bcmp function in C and the compiler also generates faster code. This is what I am doing:

    cc -c -o c_memcmp.o -pg -O2 c_memcmp.c
    cc -c -o c_bcmp.o -pg -O2 c_bcmp.c
    cc -c -o test.o -pg test.c
    cc -o test -pg -static c_memcmp.o c_bcmp.o test.o

See the files below. Those C versions run a tiny bit slower with small length values due to the two added length checks (>8, >4). For all other values (>8) the C versions run a lot faster. I doubt it myself; maybe "gprof test gmon.out" just displays garbage. Just update the test(0..32) call in main() of test.c and see the profiling results yourself. I have not yet tested those functions extensively. Just profiled them. Not very experienced with C. So if this looks completely incorrect, just forget about it.
/* -- Christian */

/* ---- c_memcmp.c ---- */

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/*
 * Compare the first len bytes of b1 and b2.
 *
 * Returns 0 when all len bytes match (including len == 0); otherwise
 * returns the difference between the first pair of differing bytes,
 * each read as unsigned char (b1's byte minus b2's byte).
 *
 * The buffers are scanned in 8-byte words, then 4-byte words, then
 * single bytes.  A word mismatch only locates the region; the final
 * answer always comes from the byte loop so the result does not depend
 * on byte order.
 */
int
c_memcmp(const void *b1, const void *b2, size_t len)
{
	const unsigned char *lhs = b1, *rhs = b2;
	uint64_t lq, rq;
	uint32_t ld, rd;
	size_t words;

	if (len > 8) {
		for (words = len / 8; words != 0; words--) {
			/*
			 * Load through memcpy instead of dereferencing a
			 * casted uint64_t pointer: the buffers carry no
			 * alignment guarantee, and the cast would also
			 * violate the effective-type rules.  Compilers
			 * typically lower a fixed-size memcpy to one load.
			 */
			memcpy(&lq, lhs, sizeof(lq));
			memcpy(&rq, rhs, sizeof(rq));
			if (lq != rq)
				goto bytes;	/* pointers still at word start */
			lhs += sizeof(lq);
			rhs += sizeof(rq);
			len -= sizeof(lq);
		}
	}

	if (len > 4) {
		for (words = len / 4; words != 0; words--) {
			memcpy(&ld, lhs, sizeof(ld));
			memcpy(&rd, rhs, sizeof(rd));
			if (ld != rd)
				goto bytes;
			lhs += sizeof(ld);
			rhs += sizeof(rd);
			len -= sizeof(ld);
		}
	}

bytes:
	for (; len != 0; len--, lhs++, rhs++) {
		if (*lhs != *rhs)
			return (*lhs - *rhs);
	}

	return (0);
}

/* ---- c_bcmp.c ---- */

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/*
 * Test the first len bytes of b1 and b2 for equality.
 *
 * Returns 0 when all len bytes match (including len == 0) and -1
 * otherwise.  Only equality matters here, so a mismatching word is
 * reported immediately without locating the differing byte.
 */
int
c_bcmp(const void *b1, const void *b2, size_t len)
{
	const uint8_t *lhs = b1, *rhs = b2;
	uint64_t lq, rq;
	uint32_t ld, rd;
	size_t words;

	if (len > 8) {
		for (words = len / 8; words != 0; words--) {
			/* memcpy loads: see the alignment note in c_memcmp.c */
			memcpy(&lq, lhs, sizeof(lq));
			memcpy(&rq, rhs, sizeof(rq));
			if (lq != rq)
				return (-1);
			lhs += sizeof(lq);
			rhs += sizeof(rq);
			len -= sizeof(lq);
		}
	}

	if (len > 4) {
		for (words = len / 4; words != 0; words--) {
			memcpy(&ld, lhs, sizeof(ld));
			memcpy(&rd, rhs, sizeof(rd));
			if (ld != rd)
				return (-1);
			lhs += sizeof(ld);
			rhs += sizeof(rd);
			len -= sizeof(ld);
		}
	}

	for (; len != 0; len--) {
		if (*lhs++ != *rhs++)
			return (-1);
	}

	return (0);
}

/* ---- test.c ---- */

#include <err.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>

#define ITERATIONS 1000000000

int c_memcmp(const void *, const void *, size_t);
int c_bcmp(const void *, const void *, size_t);

/*
 * Run memcmp, c_memcmp, bcmp and c_bcmp ITERATIONS times each on two
 * len-byte buffers filled with 'A', where the last byte of the second
 * buffer is 'B' (so for len > 0 the first buffer compares less than the
 * second).  Exits with status 1 if an allocation fails or if, for
 * len > 0, any routine fails to report the difference.
 */
void
test(size_t len)
{
	char *b1, *b2;
	/* malloc(0) may return NULL; never treat that as an error. */
	size_t alloc_len = len > 0 ? len : 1;
	int i;

	b1 = malloc(alloc_len);
	if (b1 == NULL)
		err(1, NULL);
	b2 = malloc(alloc_len);
	if (b2 == NULL)
		err(1, NULL);

	memset(b1, 'A', len);
	memset(b2, 'A', len);
	if (len > 0)
		b2[len - 1] = 'B';

	/*
	 * memcmp only promises the sign of its result, so test for
	 * "negative" rather than for exactly -1.  errx is used because
	 * errno carries no information about a failed comparison.
	 */
	for (i = ITERATIONS; i > 0; i--)
		if (memcmp(b1, b2, len) >= 0 && len > 0)
			errx(1, "equal");

	for (i = ITERATIONS; i > 0; i--)
		if (c_memcmp(b1, b2, len) >= 0 && len > 0)
			errx(1, "equal");

	for (i = ITERATIONS; i > 0; i--)
		if (bcmp(b1, b2, len) == 0 && len > 0)
			errx(1, "equal");

	for (i = ITERATIONS; i > 0; i--)
		if (c_bcmp(b1, b2, len) == 0 && len > 0)
			errx(1, "equal");

	free(b1);
	free(b2);
}

int
main(void)
{
	test(9);
	return (0);
}