Download raw body.
[PATCH] amd64: import optimized memcmp from FreeBSD
On 12/2/24 11:03, Stuart Henderson wrote:
> On 2024/12/02 07:57, Mateusz Guzik wrote:
>> OpenBSD libc for the amd64 platform uses a C variant of the memcmp
>> routine (as opposed to the kernel which has the problematic rep variant)
>
> oh, interesting, so bcmp is asm and memcmp is C for amd64!
Noticed this myself. I am quite confused right now, as I also wrote a bcmp
function in C and the compiler also generates faster code. This is what I am
doing:
cc -c -o c_memcmp.o -pg -O2 c_memcmp.c
cc -c -o c_bcmp.o -pg -O2 c_bcmp.c
cc -c -o test.o -pg test.c
cc -o test -pg -static c_memcmp.o c_bcmp.o test.o
See the files below. Those C versions run a tiny bit slower with small length
values due to the two added length checks (>8,>4). For all other values (>8)
the C versions run a lot faster. I doubt it myself and maybe gprof test gmon.out
just displays garbage. Just update the test(0..32) call in main() of test.c and
see the profiling results yourself. I have not yet tested those functions
extensively. Just profiled them. Not very experienced with C. So if this looks
completely incorrect, just forget about it.
--
Christian
---- c_memcmp.c ----
#include <sys/types.h>
#include <stdint.h>
#include <string.h>

/*
 * Compare the first len bytes of b1 and b2 as unsigned chars.
 *
 * Returns 0 when the ranges are equal, otherwise the difference between
 * the first pair of differing bytes (b1's byte minus b2's byte).
 *
 * The buffers are scanned 8 bytes at a time while more than 8 bytes
 * remain, then 4 bytes at a time while more than 4 remain, then byte by
 * byte.  A mismatching word is never ordered numerically; the scan just
 * drops to the byte loop at the start of that word so the first
 * differing byte decides the result.
 *
 * Words are loaded with memcpy() instead of dereferencing casted
 * uint64_t/uint32_t pointers: the callers' buffers carry no alignment
 * guarantee and are not uint64_t objects, so the casted loads were
 * undefined behaviour (misaligned access, strict aliasing).
 */
int
c_memcmp(const void *b1, const void *b2, size_t len)
{
	const unsigned char *p1 = b1;
	const unsigned char *p2 = b2;
	size_t words;

	if (len > 8) {
		uint64_t w1, w2;

		words = len / 8;
		do {
			memcpy(&w1, p1, sizeof(w1));
			memcpy(&w2, p2, sizeof(w2));
			if (w1 != w2)
				goto bytes;
			p1 += sizeof(w1);
			p2 += sizeof(w2);
			len -= sizeof(w1);
		} while (--words != 0);
	}

	if (len > 4) {
		uint32_t h1, h2;

		words = len / 4;
		do {
			memcpy(&h1, p1, sizeof(h1));
			memcpy(&h2, p2, sizeof(h2));
			if (h1 != h2)
				goto bytes;
			p1 += sizeof(h1);
			p2 += sizeof(h2);
			len -= sizeof(h1);
		} while (--words != 0);
	}

bytes:
	/* p1/p2 point at the first byte not yet proven equal. */
	for (; len != 0; len--, p1++, p2++) {
		if (*p1 != *p2)
			return *p1 - *p2;
	}
	return (0);
}
---- c_bcmp.c ----
#include <sys/types.h>
#include <stdint.h>
#include <string.h>

/*
 * Test the first len bytes of b1 and b2 for equality.
 *
 * Returns 0 when the ranges are identical and -1 as soon as any
 * difference is found.  No ordering information is reported.
 *
 * The buffers are scanned 8 bytes at a time while more than 8 bytes
 * remain, then 4 bytes at a time while more than 4 remain, then byte by
 * byte.  Since only equality matters, a mismatching word ends the scan
 * immediately.
 *
 * Words are loaded with memcpy() instead of dereferencing casted
 * uint64_t/uint32_t pointers: the callers' buffers carry no alignment
 * guarantee and are not uint64_t objects, so the casted loads were
 * undefined behaviour (misaligned access, strict aliasing).
 */
int
c_bcmp(const void *b1, const void *b2, size_t len)
{
	const uint8_t *p1 = b1;
	const uint8_t *p2 = b2;
	size_t words;

	if (len > 8) {
		uint64_t w1, w2;

		words = len / 8;
		do {
			memcpy(&w1, p1, sizeof(w1));
			memcpy(&w2, p2, sizeof(w2));
			if (w1 != w2)
				return (-1);
			p1 += sizeof(w1);
			p2 += sizeof(w2);
			len -= sizeof(w1);
		} while (--words != 0);
	}

	if (len > 4) {
		uint32_t h1, h2;

		words = len / 4;
		do {
			memcpy(&h1, p1, sizeof(h1));
			memcpy(&h2, p2, sizeof(h2));
			if (h1 != h2)
				return (-1);
			p1 += sizeof(h1);
			p2 += sizeof(h2);
			len -= sizeof(h1);
		} while (--words != 0);
	}

	/* Whatever is left is shorter than a word (or len was <= 4). */
	for (; len != 0; len--, p1++, p2++) {
		if (*p1 != *p2)
			return (-1);
	}
	return (0);
}
---- test.c ----
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <err.h>
/* Number of calls made to each comparison routine per test() run. */
#define ITERATIONS 1000000000

/* Hand-written comparison routines from c_memcmp.c and c_bcmp.c. */
int c_memcmp(const void *b1, const void *b2, size_t len);
int c_bcmp(const void *b1, const void *b2, size_t len);
/*
 * Call memcmp(), c_memcmp(), bcmp() and c_bcmp() ITERATIONS times each
 * on two len-byte buffers that are filled with 'A' and differ only in
 * the last byte ('A' in b1, 'B' in b2), so every call has to scan the
 * whole range.  Each result is sanity-checked; a wrong result
 * terminates the program.  For len == 0 the calls are still made but
 * their results are not checked.
 */
void
test(size_t len)
{
	char *b1, *b2;
	/* malloc(0) may return NULL; never ask for zero bytes. */
	size_t alloc_len = len > 0 ? len : 1;
	int i;

	b1 = malloc(alloc_len);
	if (b1 == NULL)
		err(1, NULL);
	b2 = malloc(alloc_len);
	if (b2 == NULL)
		err(1, NULL);

	memset(b1, 'A', len);
	memset(b2, 'A', len);
	if (len > 0)
		b2[len - 1] = 'B';

	/*
	 * memcmp() only promises the sign of its result, so require a
	 * negative value rather than exactly -1.  errx() is used because
	 * errno carries no meaning for these failures.
	 */
	for (i = ITERATIONS; i > 0; i--)
		if (memcmp(b1, b2, len) >= 0 && len > 0)
			errx(1, "equal");

	for (i = ITERATIONS; i > 0; i--)
		if (c_memcmp(b1, b2, len) >= 0 && len > 0)
			errx(1, "equal");

	for (i = ITERATIONS; i > 0; i--)
		if (bcmp(b1, b2, len) == 0 && len > 0)
			errx(1, "equal");

	for (i = ITERATIONS; i > 0; i--)
		if (c_bcmp(b1, b2, len) == 0 && len > 0)
			errx(1, "equal");

	free(b1);
	free(b2);
}
/*
 * Run the benchmark for a single buffer length; edit the value to
 * profile other lengths.
 */
int
main(void)
{
	const size_t bench_len = 9;

	test(bench_len);
	return 0;
}
[PATCH] amd64: import optimized memcmp from FreeBSD