From: Crystal Kolipe
Subject: Re: [PATCH] amd64: import optimized memcmp from FreeBSD
To: tech, Mateusz Guzik
Cc: Stuart
Date: Fri, 06 Jun 2025 12:02:10 -0300

I've ported a cut-down version of this code to i386 in case anybody wants
to look for possible performance gains there.  It could probably be
improved further if there is interest.

/*-
 * Copyright (c) 2018 The FreeBSD Foundation
 *
 * This software was developed by Mateusz Guzik
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Ported from AMD64 to i386 by Exotic Silicon
 */

#include <machine/asm.h>

#define ALIGN_TEXT	.p2align 4,0x90		/* 16-byte alignment, nop filled */

ENTRY(memcmp)
	pushl	%edi
	pushl	%esi
	pushl	%ebx
	pushl	%edx
	movl	20(%esp),%edi
	movl	24(%esp),%esi
	movl	28(%esp),%ecx	/* ecx is the 'data left' counter */
	xorl	%eax,%eax	/* Zero the accumulator */
	cmpl	$4,%ecx
	jg	100408f		/* If ecx > 4 then goto 100408 */
	cmpb	$2,%cl
	jge	100204f		/* If ecx >= 2 then goto 100204 */
	cmpb	$1,%cl
	jl	out		/* If ecx < 1 then exit */
	movzbl	(%edi),%eax	/* Load the byte at (%edi) into the lower bits of the accumulator, zeroing the upper bits */
	movzbl	(%esi),%edx	/* Load the byte at (%esi) into the lower bits of edx, zeroing the upper bits */
	subl	%edx,%eax	/* Long subtract edx from eax */
	jmp	out		/* Exit */

	ALIGN_TEXT
100408:				/* Routine to handle ecx > 4 */
	movl	(%edi),%ebx	/* Load next 4 bytes into ebx */
	movl	(%esi),%edx	/* Load next 4 bytes into edx */
	cmpl	%ebx,%edx	/* Same? */
	jne	1f		/* If not, go to the byte-comparing code */
	leal	4(%edi),%edi	/* Advance edi by 4 bytes */
	leal	4(%esi),%esi	/* Advance esi by 4 bytes */
	subl	$4,%ecx		/* Reduce ecx by 4 bytes */
	cmpl	$4,%ecx
	jg	100408b		/* If ecx > 4 then loop back to 100408 */
	leal	-4(%edi,%ecx),%edi	/* Align with the tail bytes */
	leal	-4(%esi,%ecx),%esi
	movl	(%edi),%ebx	/* Load tail bytes into ebx */
	movl	(%esi),%edx	/* Load tail bytes into edx */
	cmpl	%ebx,%edx	/* Are they the same? */
	jne	1f		/* If not, jump to the byte-comparing code */
	jmp	out		/* Else exit */

	ALIGN_TEXT
100204:				/* Routine for 4 >= ecx >= 2 */
	movzwl	(%edi),%ebx	/* Load next 2 bytes into the lower 16 bits of ebx, zeroing the upper 16 bits */
	movzwl	(%esi),%edx	/* Load next 2 bytes into the lower 16 bits of edx, zeroing the upper 16 bits */
	cmpl	%ebx,%edx	/* Compare */
	jne	1f		/* If not equal, go to the byte-comparing code */
	movzwl	-2(%edi,%ecx),%ebx	/* Align with the tail - note that if ecx == 2 we end up checking the same bytes twice */
	movzwl	-2(%esi,%ecx),%edx
	cmpl	%ebx,%edx	/* Same? */
	jne	1f		/* If not, go to the byte-comparing code */
	jmp	out		/* Else exit */

	ALIGN_TEXT
1:				/* Byte comparison to find the first differing byte out of a maximum of four */
	movzbl	(%edi),%eax	/* Load a single byte into the lower part of the accumulator, zero-extending */
	movzbl	(%esi),%ebx	/* Load a single byte into the lower part of ebx, zero-extending */
	cmpb	%bl,%al		/* Byte compare */
	jne	2f		/* If it's different, jump forward */
	movzbl	1(%edi),%eax	/* Same for byte 2 of 4 */
	movzbl	1(%esi),%ebx
	cmpb	%bl,%al
	jne	2f
	movzbl	2(%edi),%eax	/* Same for byte 3 of 4 */
	movzbl	2(%esi),%ebx
	cmpb	%bl,%al
	jne	2f
	movzbl	3(%edi),%eax	/* If we reach here, it must be byte 4 that differs, so no need to compare */
	movzbl	3(%esi),%ebx
2:
	subl	%ebx,%eax	/* Store the difference in the accumulator */

	ALIGN_TEXT
out:
	popl	%edx
	popl	%ebx
	popl	%esi
	popl	%edi
	ret
END(memcmp)

On Thu, Jun 05, 2025 at 05:56:22AM -0300, Crystal Kolipe wrote:
> On Wed, Jun 04, 2025 at 10:25:36AM +0100, Stuart Henderson wrote:
> > I just noticed that I've still got this diff in my tree and wondered
> > if it's going to go anywhere.
> >
> > The thread from last year got derailed into discussion about libc
> > (partly my fault) and SIMD (not likely to be really useful in the
> > kernel) - https://marc.info/?t=173284225400001&r=1&w=2 - but I don't
> > think there were any particular objections, and there were good test
> > reports, for the diff as presented.
>
> Reading the code it appears to do what it claims to do without
> introducing any new bugs.
>
> But if the existing code is going to be replaced at all, presumably we
> want to ensure that the replacement is the most appropriate, (in terms
> of trade off of performance and complexity), for the foreseeable future
> to avoid churn.
>
> On this point, I'm dubious.  The lack of comments and numeric labels
> doesn't make the assessment any easier, because it isn't obvious
> whether any particular part of the code is intentionally done for
> performance reasons or whether it's an oversight.
>
> For example, there are two code paths that can deal with the case of
> having exactly 32 bytes left in the buffer.
>
> If you enter memcmp() with rdx==32, then the byte comparison is handled
> by the routine at 101632.
>
> If you enter memcmp() with rdx being a multiple of 32, then you end up
> spinning in 103200, because the cmpq $32,%rdx is followed by a jae,
> (jump above _or equal_), meaning it's testing rdx >= 32.
>
> So the last 32 bytes are compared by 103200 instead of 101632.
>
> If the 103200 routine is indeed preferred for performance, (and on what
> hardware?), then why is the initial cmpq in 101632 followed by a ja
> instead of a jae?
>
> I can see what the code _does_, but without comments who knows what
> it's _supposed_ to do?
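
For what it's worth, the overlapping-tail technique being questioned above, which the i386 port also uses (the `leal -4(%edi,%ecx),%edi` fixup), can be sketched in C.  This is a hypothetical illustration under my own naming (`memcmp_sketch`), not the submitted assembly or the FreeBSD original:

```c
#include <stdint.h>
#include <string.h>

/*
 * Hypothetical C sketch of the overlapping-tail technique: compare
 * word-sized chunks, then slide a final word-sized window back so it
 * ends exactly at the end of the buffers.  When the length is an exact
 * multiple of the word size, that final window re-reads bytes the main
 * loop already compared - which is how two code paths can end up
 * servicing the same remaining length.
 */
static int
memcmp_sketch(const void *b1, const void *b2, size_t n)
{
	const unsigned char *p1 = b1, *p2 = b2;
	uint32_t a, b;
	size_t i;

	if (n > 4) {
		/* Main loop: compare 4 bytes at a time. */
		while (n > 4) {
			memcpy(&a, p1, 4);
			memcpy(&b, p2, 4);
			if (a != b)
				goto bytes;	/* Difference is within this word */
			p1 += 4;
			p2 += 4;
			n -= 4;
		}
		/*
		 * Overlapping tail: step back so the last 4-byte window
		 * ends at the buffer end.  If n == 4 here, the window
		 * repeats bytes that were already compared.
		 */
		p1 += n - 4;
		p2 += n - 4;
		memcpy(&a, p1, 4);
		memcpy(&b, p2, 4);
		if (a == b)
			return 0;
		n = 4;
	}
bytes:
	/* Scan at most 4 bytes to find the first differing byte. */
	for (i = 0; i < n && i < 4; i++)
		if (p1[i] != p2[i])
			return p1[i] - p2[i];
	return 0;
}
```

The redundant re-read on exact multiples of the word size is presumably accepted because it is cheaper than adding a branch to skip it, which may be the intent behind the jae in the amd64 code - but that's exactly the kind of thing a comment should state.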