diff options
| author | Liubov Dmitrieva <liubov.dmitrieva@gmail.com> | 2011-10-23 14:56:04 -0400 |
|---|---|---|
| committer | Ulrich Drepper <drepper@gmail.com> | 2011-10-23 14:56:04 -0400 |
| commit | ce7dd29f2863820ba858fe06c2ff61417df40d75 (patch) | |
| tree | 32b9c93de4777c2d8239d94d67b9257cfccf1745 | |
| parent | 6ddf460fa5f7f96e96134f962f4b2ba708b1095b (diff) | |
| download | glibc-ce7dd29f2863820ba858fe06c2ff61417df40d75.tar.xz glibc-ce7dd29f2863820ba858fe06c2ff61417df40d75.zip | |
Optimized strnlen and wcscmp for x86-64
| -rw-r--r-- | ChangeLog | 14 | ||||
| -rw-r--r-- | NEWS | 4 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/Makefile | 4 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/strcat-ssse3.S | 2 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/strlen-no-bsf.S | 313 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/strlen-sse2-no-bsf.S | 686 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/strlen.S | 2 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S | 3 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/strnlen.S | 55 | ||||
| -rw-r--r-- | sysdeps/x86_64/wcslen.S | 239 |
10 files changed, 1003 insertions, 319 deletions
@@ -1,3 +1,17 @@ +2011-10-20 Liubov Dmitrieva <liubov.dmitrieva@gmail.com> + + * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add + strnlen-sse2-no-bsf. + Rename strlen-no-bsf to strlen-sse2-no-bsf. + * sysdeps/x86_64/multiarch/strlen-no-bsf.S: Rename to + * sysdeps/x86_64/multiarch/strlen-sse2-no-bsf.S: + Add strnlen support. + (USE_AS_STRNLEN): New macro. + * sysdeps/x86_64/multiarch/strnlen-sse2-no-bsf.S: New file. + * sysdeps/x86_64/multiarch/strcat-ssse3.S: Update. + Rename strlen-no-bsf.S to strlen-sse2-no-bsf.S + * sysdeps/x86_64/wcslen.S: New file. + 2011-10-20 Michael Zolotukhin <michael.v.zolotukhin@gmail.com> * sysdeps/i386/i686/multiarch/memcpy-ssse3.S: Update. @@ -26,8 +26,8 @@ Version 2.15 * Improved strcpy, strncpy, stpcpy, stpncpy for SSE2 and SSSE3 on x86-64. Contributed by HJ Lu. -* Optimized strcat and strncat on x86-64 and optimized wcscmp on x86-32 and - x86-64. +* Optimized strcat, strncat, wcslen, strnlen on x86-64 and optimized + wcscmp on x86-32 and x86-64. Contributed by Liubov Dmitrieva. * Optimized strchr and strrchr for SSE on x86-32. diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index e0bb9847a8..4cf4cf4b28 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -9,13 +9,13 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \ strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \ memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \ memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \ - strncase_l-ssse3 strlen-sse4 strlen-no-bsf memset-x86-64 \ + strncase_l-ssse3 strlen-sse4 strlen-sse2-no-bsf memset-x86-64 \ strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \ strcpy-sse2-unaligned strncpy-sse2-unaligned \ stpcpy-sse2-unaligned stpncpy-sse2-unaligned \ strcat-sse2-unaligned strncat-sse2-unaligned \ strcat-ssse3 strncat-ssse3 strlen-sse2-pminub \ - strrchr-sse2-no-bsf strchr-sse2-no-bsf \ + strnlen-sse2-no-bsf strrchr-sse2-no-bsf strchr-sse2-no-bsf \ memcmp-ssse3 wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c ifeq (yes,$(config-cflags-sse4)) sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift diff --git a/sysdeps/x86_64/multiarch/strcat-ssse3.S b/sysdeps/x86_64/multiarch/strcat-ssse3.S index 34b61b8468..2ec3ba7c3e 100644 --- a/sysdeps/x86_64/multiarch/strcat-ssse3.S +++ b/sysdeps/x86_64/multiarch/strcat-ssse3.S @@ -35,7 +35,7 @@ ENTRY (STRCAT) # endif # define RETURN jmp L(StartStrcpyPart) -# include "strlen-no-bsf.S" +# include "strlen-sse2-no-bsf.S" # undef RETURN diff --git a/sysdeps/x86_64/multiarch/strlen-no-bsf.S b/sysdeps/x86_64/multiarch/strlen-no-bsf.S deleted file mode 100644 index a430e5fa0f..0000000000 --- a/sysdeps/x86_64/multiarch/strlen-no-bsf.S +++ /dev/null @@ -1,313 +0,0 @@ -/* strlen SSE2 without bsf - Copyright (C) 2010, 2011 Free Software Foundation, Inc. - Contributed by Intel Corporation. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA. */ - -#if (defined SHARED || defined USE_AS_STRCAT) && !defined NOT_IN_libc - -# ifndef USE_AS_STRCAT - -# include <sysdep.h> - -# define RETURN ret - - atom_text_section -ENTRY (__strlen_no_bsf) -# endif - xor %eax, %eax - cmpb $0, (%rdi) - jz L(exit_tail0) - cmpb $0, 1(%rdi) - jz L(exit_tail1) - cmpb $0, 2(%rdi) - jz L(exit_tail2) - cmpb $0, 3(%rdi) - jz L(exit_tail3) - cmpb $0, 4(%rdi) - jz L(exit_tail4) - cmpb $0, 5(%rdi) - jz L(exit_tail5) - cmpb $0, 6(%rdi) - jz L(exit_tail6) - cmpb $0, 7(%rdi) - jz L(exit_tail7) - cmpb $0, 8(%rdi) - jz L(exit_tail8) - cmpb $0, 9(%rdi) - jz L(exit_tail9) - cmpb $0, 10(%rdi) - jz L(exit_tail10) - cmpb $0, 11(%rdi) - jz L(exit_tail11) - cmpb $0, 12(%rdi) - jz L(exit_tail12) - cmpb $0, 13(%rdi) - jz L(exit_tail13) - cmpb $0, 14(%rdi) - jz L(exit_tail14) - cmpb $0, 15(%rdi) - jz L(exit_tail15) - pxor %xmm0, %xmm0 - mov %rdi, %rcx - mov %rdi, %rax - and $-16, %rax - add $16, %rax - add $16, %rcx - - pcmpeqb (%rax), %xmm0 - pmovmskb %xmm0, %edx - pxor %xmm1, %xmm1 - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm1 - pmovmskb %xmm1, %edx - pxor %xmm2, %xmm2 - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - - pcmpeqb (%rax), %xmm2 - pmovmskb %xmm2, %edx - pxor %xmm3, %xmm3 - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - and $-0x40, %rax -L(aligned_64): - pcmpeqb (%rax), %xmm0 - pcmpeqb 16(%rax), %xmm1 - pcmpeqb 32(%rax), %xmm2 - pcmpeqb 48(%rax), %xmm3 - pmovmskb %xmm0, %edx - pmovmskb %xmm1, %r11d - pmovmskb %xmm2, %r10d - pmovmskb %xmm3, %r9d - or %edx, %r9d - or %r11d, %r9d - or %r10d, %r9d - lea 64(%rax), %rax - jz L(aligned_64) - - test %edx, %edx - jnz L(aligned_64_exit_16) - test %r11d, %r11d - jnz L(aligned_64_exit_32) - test %r10d, %r10d - jnz L(aligned_64_exit_48) -L(aligned_64_exit_64): - pmovmskb %xmm3, %edx - jmp L(aligned_64_exit) -L(aligned_64_exit_48): - lea -16(%rax), %rax - mov %r10d, %edx - jmp L(aligned_64_exit) -L(aligned_64_exit_32): - lea -32(%rax), %rax - mov %r11d, %edx - jmp L(aligned_64_exit) -L(aligned_64_exit_16): - lea -48(%rax), %rax -L(aligned_64_exit): -L(exit): - sub %rcx, %rax - test %dl, %dl - jz L(exit_high) - test $0x01, %dl - jnz L(exit_tail0) - - test $0x02, %dl - jnz L(exit_tail1) - - test $0x04, %dl - jnz L(exit_tail2) - - test $0x08, %dl - jnz L(exit_tail3) - - test $0x10, %dl - jnz L(exit_tail4) - - test $0x20, %dl - jnz L(exit_tail5) - - test $0x40, %dl - jnz L(exit_tail6) - add $7, %eax -L(exit_tail0): - RETURN - -L(exit_high): - add $8, %eax - test $0x01, %dh - jnz L(exit_tail0) - - test $0x02, %dh - jnz L(exit_tail1) - - test $0x04, %dh - jnz L(exit_tail2) - - test $0x08, %dh - jnz L(exit_tail3) - - test $0x10, %dh - jnz L(exit_tail4) - - test $0x20, %dh - jnz L(exit_tail5) - - test $0x40, %dh - jnz L(exit_tail6) - add $7, %eax - RETURN - .p2align 4 -L(exit_tail1): - add $1, %eax - RETURN - -L(exit_tail2): - add $2, %eax - RETURN - -L(exit_tail3): - add $3, %eax - RETURN - -L(exit_tail4): - add $4, %eax - RETURN - -L(exit_tail5): - add $5, %eax - RETURN -L(exit_tail6): - add $6, %eax - RETURN -L(exit_tail7): - add $7, %eax - RETURN -L(exit_tail8): - add $8, %eax - RETURN -L(exit_tail9): - add $9, %eax - RETURN -L(exit_tail10): - add $10, %eax - RETURN -L(exit_tail11): - add $11, %eax - RETURN -L(exit_tail12): - add $12, %eax - RETURN -L(exit_tail13): - add $13, %eax - RETURN -L(exit_tail14): - add $14, %eax - RETURN -L(exit_tail15): - add $15, %eax -# ifndef USE_AS_STRCAT - RETURN -END (__strlen_no_bsf) -# endif -#endif diff --git a/sysdeps/x86_64/multiarch/strlen-sse2-no-bsf.S b/sysdeps/x86_64/multiarch/strlen-sse2-no-bsf.S new file mode 100644 index 0000000000..0b1c9731c3 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strlen-sse2-no-bsf.S @@ -0,0 +1,686 @@ +/* strlen SSE2 without bsf + Copyright (C) 2010, 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +/* only for strlen case we don't use optimized version for STATIC build just for SHARED */ + +#if (defined SHARED || defined USE_AS_STRCAT || defined USE_AS_STRNLEN) && !defined NOT_IN_libc + +# ifndef USE_AS_STRCAT + +# include <sysdep.h> + +# define RETURN ret + +# ifndef STRLEN +# define STRLEN __strlen_sse2_no_bsf +# endif + + atom_text_section +ENTRY (STRLEN) +# endif + xor %eax, %eax +# ifdef USE_AS_STRNLEN + mov %rsi, %r8 + sub $4, %rsi + jbe L(len_less4_prolog) +# endif + cmpb $0, (%rdi) + jz L(exit_tail0) + cmpb $0, 1(%rdi) + jz L(exit_tail1) + cmpb $0, 2(%rdi) + jz L(exit_tail2) + cmpb $0, 3(%rdi) + jz L(exit_tail3) + +# ifdef USE_AS_STRNLEN + sub $4, %rsi + jbe L(len_less8_prolog) +# endif + + cmpb $0, 4(%rdi) + jz L(exit_tail4) + cmpb $0, 5(%rdi) + jz L(exit_tail5) + cmpb $0, 6(%rdi) + jz L(exit_tail6) + cmpb $0, 7(%rdi) + jz L(exit_tail7) + +# ifdef USE_AS_STRNLEN + sub $4, %rsi + jbe L(len_less12_prolog) +# endif + + cmpb $0, 8(%rdi) + jz L(exit_tail8) + cmpb $0, 9(%rdi) + jz L(exit_tail9) + cmpb $0, 10(%rdi) + jz L(exit_tail10) + cmpb $0, 11(%rdi) + jz L(exit_tail11) + +# ifdef USE_AS_STRNLEN + sub $4, %rsi + jbe L(len_less16_prolog) +# endif + + cmpb $0, 12(%rdi) + jz L(exit_tail12) + cmpb $0, 13(%rdi) + jz L(exit_tail13) + cmpb $0, 14(%rdi) + jz L(exit_tail14) + cmpb $0, 15(%rdi) + jz L(exit_tail15) + pxor %xmm0, %xmm0 + lea 16(%rdi), %rcx + lea 16(%rdi), %rax + and $-16, %rax + +# ifdef USE_AS_STRNLEN + and $15, %rdi + add %rdi, %rsi + sub $64, %rsi + jbe L(len_less64) +# endif + + pcmpeqb (%rax), %xmm0 + pmovmskb %xmm0, %edx + pxor %xmm1, %xmm1 + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm1 + pmovmskb %xmm1, %edx + pxor %xmm2, %xmm2 + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm2 + pmovmskb %xmm2, %edx + pxor %xmm3, %xmm3 + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + +# ifdef USE_AS_STRNLEN + sub $64, %rsi + jbe L(len_less64) +# endif + + pcmpeqb (%rax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + +# ifdef USE_AS_STRNLEN + sub $64, %rsi + jbe L(len_less64) +# endif + + pcmpeqb (%rax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + +# ifdef USE_AS_STRNLEN + sub $64, %rsi + jbe L(len_less64) +# endif + + pcmpeqb (%rax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + + pcmpeqb (%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + lea 16(%rax), %rax + jnz L(exit) + +# ifdef USE_AS_STRNLEN + mov %rax, %rdx + and $63, %rdx + add %rdx, %rsi +# endif + + and $-0x40, %rax + + .p2align 4 +L(aligned_64): +# ifdef USE_AS_STRNLEN + sub $64, %rsi + jbe L(len_less64) +# endif + pcmpeqb (%rax), %xmm0 + pcmpeqb 16(%rax), %xmm1 + pcmpeqb 32(%rax), %xmm2 + pcmpeqb 48(%rax), %xmm3 + pmovmskb %xmm0, %edx + pmovmskb %xmm1, %r11d + pmovmskb %xmm2, %r10d + pmovmskb %xmm3, %r9d + or %edx, %r9d + or %r11d, %r9d + or %r10d, %r9d + lea 64(%rax), %rax + jz L(aligned_64) + + test %edx, %edx + jnz L(aligned_64_exit_16) + test %r11d, %r11d + jnz L(aligned_64_exit_32) + test %r10d, %r10d + jnz L(aligned_64_exit_48) +L(aligned_64_exit_64): + pmovmskb %xmm3, %edx + jmp L(aligned_64_exit) +L(aligned_64_exit_48): + lea -16(%rax), %rax + mov %r10d, %edx + jmp L(aligned_64_exit) +L(aligned_64_exit_32): + lea -32(%rax), %rax + mov %r11d, %edx + jmp L(aligned_64_exit) +L(aligned_64_exit_16): + lea -48(%rax), %rax +L(aligned_64_exit): +L(exit): + sub %rcx, %rax + test %dl, %dl + jz L(exit_high) + test $0x01, %dl + jnz L(exit_tail0) + + test $0x02, %dl + jnz L(exit_tail1) + + test $0x04, %dl + jnz L(exit_tail2) + + test $0x08, %dl + jnz L(exit_tail3) + + test $0x10, %dl + jnz L(exit_tail4) + + test $0x20, %dl + jnz L(exit_tail5) + + test $0x40, %dl + jnz L(exit_tail6) + add $7, %eax +L(exit_tail0): + RETURN + +L(exit_high): + add $8, %eax + test $0x01, %dh + jnz L(exit_tail0) + + test $0x02, %dh + jnz L(exit_tail1) + + test $0x04, %dh + jnz L(exit_tail2) + + test $0x08, %dh + jnz L(exit_tail3) + + test $0x10, %dh + jnz L(exit_tail4) + + test $0x20, %dh + jnz L(exit_tail5) + + test $0x40, %dh + jnz L(exit_tail6) + add $7, %eax + RETURN + +# ifdef USE_AS_STRNLEN + + .p2align 4 +L(len_less64): + pxor %xmm0, %xmm0 + add $64, %rsi + + pcmpeqb (%rax), %xmm0 + pmovmskb %xmm0, %edx + pxor %xmm1, %xmm1 + lea 16(%rax), %rax + test %edx, %edx + jnz L(strnlen_exit) + + sub $16, %rsi + jbe L(return_start_len) + + pcmpeqb (%rax), %xmm1 + pmovmskb %xmm1, %edx + lea 16(%rax), %rax + test %edx, %edx + jnz L(strnlen_exit) + + sub $16, %rsi + jbe L(return_start_len) + + pcmpeqb (%rax), %xmm0 + pmovmskb %xmm0, %edx + lea 16(%rax), %rax + test %edx, %edx + jnz L(strnlen_exit) + + sub $16, %rsi + jbe L(return_start_len) + + pcmpeqb (%rax), %xmm1 + pmovmskb %xmm1, %edx + lea 16(%rax), %rax + test %edx, %edx + jnz L(strnlen_exit) + + mov %r8, %rax + ret + + .p2align 4 +L(strnlen_exit): + sub %rcx, %rax + + test %dl, %dl + jz L(strnlen_exit_high) + mov %dl, %cl + and $15, %cl + jz L(strnlen_exit_8) + test $0x01, %dl + jnz L(exit_tail0) + test $0x02, %dl + jnz L(strnlen_exit_tail1) + test $0x04, %dl + jnz L(strnlen_exit_tail2) + sub $4, %rsi + jb L(return_start_len) + lea 3(%eax), %eax + ret + + .p2align 4 +L(strnlen_exit_8): + test $0x10, %dl + jnz L(strnlen_exit_tail4) + test $0x20, %dl + jnz L(strnlen_exit_tail5) + test $0x40, %dl + jnz L(strnlen_exit_tail6) + sub $8, %rsi + jb L(return_start_len) + lea 7(%eax), %eax + ret + + .p2align 4 +L(strnlen_exit_high): + mov %dh, %ch + and $15, %ch + jz L(strnlen_exit_high_8) + test $0x01, %dh + jnz L(strnlen_exit_tail8) + test $0x02, %dh + jnz L(strnlen_exit_tail9) + test $0x04, %dh + jnz L(strnlen_exit_tail10) + sub $12, %rsi + jb L(return_start_len) + lea 11(%eax), %eax + ret + + .p2align 4 +L(strnlen_exit_high_8): + test $0x10, %dh + jnz L(strnlen_exit_tail12) + test $0x20, %dh + jnz L(strnlen_exit_tail13) + test $0x40, %dh + jnz L(strnlen_exit_tail14) + sub $16, %rsi + jb L(return_start_len) + lea 15(%eax), %eax + ret + |
