From: Martin Pieuchot <mpi@grenadille.net>
Subject: Re: [PATCH] amd64: import optimized memcmp from FreeBSD
To: tech <tech@openbsd.org>, Mateusz Guzik <mjguzik@gmail.com>, Stuart <stu@spacehopper.org>
Date: Sat, 7 Jun 2025 11:06:39 +0200

On 06/06/25(Fri) 12:02, Crystal Kolipe wrote:
> I've ported a cut-down version of this code to i386 in case anybody wants to
> look for possible performance gains there.
> 
> It could probably be improved further if there is interest.

There is interest.  We just need somebody who will lead the effort, do
the testing, report back, and integrate the comments from the list.

This might not be trivial.

Would you like to do it?  Thanks!

> /*-
>  * Copyright (c) 2018 The FreeBSD Foundation
>  *
>  * This software was developed by Mateusz Guzik <mjg@FreeBSD.org>
>  * under sponsorship from the FreeBSD Foundation.
>  *
>  * Redistribution and use in source and binary forms, with or without
>  * modification, are permitted provided that the following conditions
>  * are met:
>  * 1. Redistributions of source code must retain the above copyright
>  *    notice, this list of conditions and the following disclaimer.
>  * 2. Redistributions in binary form must reproduce the above copyright
>  *    notice, this list of conditions and the following disclaimer in the
>  *    documentation and/or other materials provided with the distribution.
>  *
>  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
>  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
>  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
>  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
>  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
>  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
>  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
>  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
>  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
>  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
>  * SUCH DAMAGE.
>  */
> 
> /*
>  * Ported from AMD64 to i386 by Exotic Silicon
>  */
> 
> #include <machine/asm.h>
> 
> #define ALIGN_TEXT      .p2align 4,0x90 /* 16-byte alignment, nop filled */
> 
> ENTRY(memcmp)
> 	pushl	%edi
> 	pushl	%esi
> 	pushl	%ebx
> 	pushl	%edx
> 
> 	movl	20(%esp),%edi
> 	movl	24(%esp),%esi
> 	movl	28(%esp),%ecx			/* ecx is the 'data left' counter */
> 
> 	xorl	%eax,%eax			/* Zero the accumulator */
> 
> 	cmpl	$4,%ecx				/* If ecx > 4 (unsigned, it's a size_t) then goto 100408 */
> 	ja	100408f
> 
> 	cmpb	$2,%cl				/* If ecx >= 2 then goto 100204 */
> 	jge	100204f
> 
> 	cmpb	$1,%cl				/* If ecx < 1 then exit */
> 	jl	out
> 
> 	movzbl	(%edi),%eax			/* Load byte at address pointed to by edi into the lower bits of the accumulator and zero the upper bits */
> 	movzbl	(%esi),%edx			/* Load byte at address pointed to by esi into the lower bits of edx and zero the upper bits */
> 	subl	%edx,%eax			/* Long subtract edx from eax */
> 	jmp	out				/* Exit */
> 
> 	ALIGN_TEXT
> 100408:						/* Code path handling ecx > 4 */
> 	movl	(%edi),%ebx			/* Load next 4 bytes into ebx */
> 	movl	(%esi),%edx			/* Load next 4 bytes into edx */
> 	cmpl	%ebx,%edx			/* Same? */
> 	jne	1f				/* If not, go to byte-comparing code */
> 	leal	4(%edi),%edi			/* Increase edi by 4 bytes */
> 	leal	4(%esi),%esi			/* Increase esi by 4 bytes */
> 	subl	$4,%ecx				/* Reduce ecx by 4 bytes */
> 	cmpl	$4,%ecx
> 	ja	100408b				/* If ecx > 4 (unsigned) then loop back to 100408 */
> 	leal	-4(%edi,%ecx),%edi		/* Align with tail bytes */
> 	leal	-4(%esi,%ecx),%esi
> 	movl	(%edi),%ebx			/* Load tail bytes into ebx */
> 	movl	(%esi),%edx			/* Load tail bytes into edx */
> 	cmpl	%ebx,%edx			/* Are they the same? */
> 	jne	1f				/* If no, then jump to byte-comparing code */
> 	jmp	out				/* Else exit */
> 
> 	ALIGN_TEXT
> 100204:						/* Code path for 4 >= ecx >= 2 */
> 	movzwl	(%edi),%ebx			/* Load next 2 bytes into the lower 16 bits of ebx, zeroing the upper 16 bits */
> 	movzwl	(%esi),%edx			/* Load next 2 bytes into the lower 16 bits of edx, zeroing the upper 16 bits */
> 	cmpl	%ebx,%edx			/* Compare */
> 	jne	1f				/* If not equal, go to byte-comparing code */
> 	movzwl	-2(%edi,%ecx),%ebx		/* Align with tail - note that if ecx==2 then we end up checking the same bytes twice */
> 	movzwl	-2(%esi,%ecx),%edx
> 	cmpl	%ebx,%edx			/* Same? */
> 	jne	1f				/* If not, go to byte-comparing code */
> 	jmp	out				/* Else exit */
> 
> 	ALIGN_TEXT
> 1:						/* Byte comparing to find the first differing byte out of a maximum of four bytes */
> 	movzbl	(%edi),%eax			/* Load single byte into the lower part of the accumulator and zero-extend */
> 	movzbl	(%esi),%ebx			/* Load single byte into the lower part of ebx and zero-extend */
> 	cmpb	%bl,%al				/* Byte compare */
> 	jne	2f				/* If it's different, jump forward */
> 
> 	movzbl	1(%edi),%eax			/* Same for byte 2 of 4 */
> 	movzbl	1(%esi),%ebx
> 	cmpb	%bl,%al
> 	jne	2f
> 
> 	movzbl	2(%edi),%eax			/* Same for byte 3 of 4 */
> 	movzbl	2(%esi),%ebx
> 	cmpb	%bl,%al
> 	jne	2f
> 
> 	movzbl	3(%edi),%eax			/* If we reach here, then it must be byte 4 that differs, so no need to compare */
> 	movzbl	3(%esi),%ebx
> 
> 2:
> 	subl	%ebx,%eax			/* Store the difference in the accumulator */
> 
> 	ALIGN_TEXT
> out:
> 	popl	%edx
> 	popl	%ebx
> 	popl	%esi
> 	popl	%edi
> 	ret
> END(memcmp)
> 
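For the testing step, a minimal userland harness along these lines may help.
It is only a sketch: it assumes the routine above is assembled into the test
program under the made-up name memcmp_i386(), to avoid colliding with libc's
memcmp (in the kernel it would simply be memcmp).  It checks the sign of
every result against a byte-by-byte reference, for every length up to 64 and
every position and direction of a single differing byte:

	#include <stdio.h>
	#include <string.h>

	int memcmp_i386(const void *, const void *, size_t);

	/* Byte-by-byte reference with memcmp semantics */
	static int
	ref(const unsigned char *a, const unsigned char *b, size_t n)
	{
		size_t i;

		for (i = 0; i < n; i++)
			if (a[i] != b[i])
				return (int)a[i] - (int)b[i];
		return 0;
	}

	static int
	sign(int v)
	{
		return (v > 0) - (v < 0);
	}

	int
	main(void)
	{
		unsigned char a[64], b[64];
		size_t n, i;
		int fail = 0;

		for (n = 0; n <= sizeof(a); n++) {
			memset(a, 0x5a, sizeof(a));
			memcpy(b, a, sizeof(b));
			if (sign(memcmp_i386(a, b, n)) != 0) {
				printf("FAIL: equal buffers, n=%zu\n", n);
				fail = 1;
			}
			for (i = 0; i < n; i++) {
				b[i] = 0xa5;	/* differing byte, b greater */
				if (sign(memcmp_i386(a, b, n)) !=
				    sign(ref(a, b, n))) {
					printf("FAIL: n=%zu i=%zu\n", n, i);
					fail = 1;
				}
				b[i] = 0x05;	/* differing byte, b smaller */
				if (sign(memcmp_i386(a, b, n)) !=
				    sign(ref(a, b, n))) {
					printf("FAIL: n=%zu i=%zu\n", n, i);
					fail = 1;
				}
				b[i] = 0x5a;	/* restore */
			}
		}
		if (!fail)
			printf("all ok\n");
		return fail;
	}
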
> On Thu, Jun 05, 2025 at 05:56:22AM -0300, Crystal Kolipe wrote:
> > On Wed, Jun 04, 2025 at 10:25:36AM +0100, Stuart Henderson wrote:
> > > I just noticed that I've still got this diff in my tree and wondered
> > > if it's going to go anywhere.
> > > 
> > > The thread from last year got derailed into discussion about libc
> > > (partly my fault) and SIMD (not likely to be really useful in the
> > > kernel) - https://marc.info/?t=173284225400001&r=1&w=2 - but I don't
> > > think there were any particular objections, and there were good test
> > > reports, for the diff as presented.
> > 
> > Reading the code, it appears to do what it claims to do without introducing
> > any new bugs.
> > 
> > But if the existing code is going to be replaced at all, presumably we want to
> > ensure that the replacement is the most appropriate (in terms of the trade-off
> > between performance and complexity) for the foreseeable future, to avoid churn.
> > 
> > On this point, I'm dubious.  The lack of comments, combined with the numeric
> > labels, doesn't make the assessment any easier: it isn't obvious whether any
> > particular part of the code is written the way it is intentionally, for
> > performance reasons, or whether it's an oversight.
> > 
> > For example, there are two code paths that can deal with the case of having
> > exactly 32 bytes left in the buffer.
> > 
> > If you enter memcmp() with rdx==32, then the byte comparison is handled by
> > the routine at 101632.
> > 
> > If you enter memcmp() with rdx being a larger multiple of 32, then you end up
> > spinning in 103200, because the cmpq $32,%rdx there is followed by a jae
> > (jump if above _or equal_), meaning it's testing rdx >= 32.
> > 
> > So the last 32 bytes are compared by 103200 instead of 101632.
> > 
> > If the 103200 routine is indeed preferred for performance (and on what
> > hardware?), then why is the initial cmpq in 101632 followed by a ja instead
> > of a jae?
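
The asymmetry is easier to see in C.  The following is only a sketch: the
label numbers are taken from the amd64 patch, but the surrounding structure
is inferred from the description above rather than from the actual code.

	#include <stdio.h>
	#include <stddef.h>

	/*
	 * Sketch: which path ends up comparing the final 32-byte chunk
	 * of an n-byte buffer, for n a non-zero multiple of 32?
	 */
	static int
	final_chunk_label(size_t n)
	{
		/* 101632 entry: cmpq $32,%rdx; ja -- 32 is not strictly
		 * above 32, so an initial n of exactly 32 stays here. */
		if (n <= 32)
			return 101632;
		/* 103200 loop: cmpq $32,%rdx; jae -- the loop keeps
		 * running while n >= 32, so when the remainder drops to
		 * exactly 32 it takes one more iteration instead of
		 * handing the tail back to 101632. */
		while (n >= 32)
			n -= 32;
		return 103200;
	}

	int
	main(void)
	{
		printf("n=32 -> %d\n", final_chunk_label(32));	/* 101632 */
		printf("n=64 -> %d\n", final_chunk_label(64));	/* 103200 */
		printf("n=96 -> %d\n", final_chunk_label(96));	/* 103200 */
		return 0;
	}
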
> > 
> > I can see what the code _does_, but without comments who knows what it's
> > _supposed_ to do?
> > 
>