From cae28869c106eb342dd5a1c8242f933efab6f772 Mon Sep 17 00:00:00 2001 From: Jeff Johnston Date: Mon, 26 May 2008 22:56:14 +0000 Subject: 2008-05-26 Eric Blake Optimize the generic and x86 strlen. * libc/string/strlen.c (strlen) [!__OPTIMIZE_SIZE__]: Pre-align data so unaligned searches aren't penalized. * libc/machine/i386/strlen.S (strlen) [!__OPTIMIZE_SIZE__]: Word operations are faster than repnz byte searches. --- newlib/libc/machine/i386/strlen.S | 65 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 64 insertions(+), 1 deletion(-) (limited to 'newlib/libc/machine') diff --git a/newlib/libc/machine/i386/strlen.S b/newlib/libc/machine/i386/strlen.S index 459b3a959..0e3cb640c 100644 --- a/newlib/libc/machine/i386/strlen.S +++ b/newlib/libc/machine/i386/strlen.S @@ -1,6 +1,6 @@ /* * ==================================================== - * Copyright (C) 1998, 2002 by Red Hat Inc. All rights reserved. + * Copyright (C) 1998, 2002, 2008 by Red Hat Inc. All rights reserved. * * Permission to use, copy, modify, and distribute this * software is freely granted, provided that this notice @@ -20,12 +20,75 @@ SYM (strlen): pushl edi movl 8(ebp),edx +#ifdef __OPTIMIZE_SIZE__ cld movl edx,edi movl $4294967295,ecx xor eax,eax repnz scasb +#else +/* Modern x86 hardware is much faster at double-word + manipulation than with bytewise repnz scasb. */ + +/* Do byte-wise checks until string is aligned. */ + movl edx,edi + test $3,edi + je L5 + movb (edi),cl + incl edi + testb cl,cl + je L15 + + test $3,edi + je L5 + movb (edi),cl + incl edi + testb cl,cl + je L15 + + test $3,edi + je L5 + movb (edi),cl + incl edi + testb cl,cl + je L15 + +L5: + subl $4,edi + +/* loop performing 4 byte mask checking for desired 0 byte */ + .p2align 4,,7 +L10: + addl $4,edi + movl (edi),ecx + leal -16843009(ecx),eax + notl ecx + andl ecx,eax + testl $-2139062144,eax + je L10 + +/* Find which of four bytes is 0. */ + notl ecx + incl edi + + testb cl,cl + je L15 + incl edi + shrl $8,ecx + + testb cl,cl + je L15 + incl edi + shrl $8,ecx + + testb cl,cl + je L15 + incl edi + +#endif + +L15: subl edx,edi leal -1(edi),eax -- cgit v1.2.3