diff options
| author | Ondřej Bílka <neleai@seznam.cz> | 2013-12-14 19:33:56 +0100 |
|---|---|---|
| committer | Ondřej Bílka <neleai@seznam.cz> | 2013-12-14 20:08:13 +0100 |
| commit | 584b18eb4df61ccd447db2dfe8c8a7901f8c8598 (patch) | |
| tree | 8240dbf408eadda74685f951e36f8885f77c2f77 | |
| parent | 8a5c7897dd1c52ca74b06aaf5a3bacf0919c97aa (diff) | |
| download | glibc-584b18eb4df61ccd447db2dfe8c8a7901f8c8598.tar.xz glibc-584b18eb4df61ccd447db2dfe8c8a7901f8c8598.zip | |
Add strstr with unaligned loads. Fixes bug 12100.
The SSE4.2 version of strstr used the pcmpistr instruction, which is quite
inefficient. A faster way is to look for pairs of characters: this needs
only SSE2, is faster than pcmpistr, and for real strings the pairs we look
for are relatively rare.
For linear time complexity we use a buy-or-rent technique, which switches
to the two-way algorithm when superlinear behaviour is detected.
| -rw-r--r-- | ChangeLog | 14 | ||||
| -rw-r--r-- | NEWS | 24 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/Makefile | 9 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/strcasestr-c.c | 19 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/strcasestr-nonascii.c | 50 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/strcasestr.c | 18 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/strstr-c.c | 47 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S | 374 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/strstr.c | 388 |
10 files changed, 441 insertions, 506 deletions
@@ -1,3 +1,17 @@ +2013-12-14 Ondřej Bílka <neleai@seznam.cz> + + [BZ #12100] + * sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S: New file + * sysdeps/x86_64/multiarch/strstr-c.c: Moved to ... + * sysdeps/x86_64/multiarch/strstr.c: ... here. + (strstr): Add __strstr_sse2_unaligned ifunc. + * sysdeps/x86_64/multiarch/strcasestr-c.c: Moved to ... + * sysdeps/x86_64/multiarch/strcasestr.c ... here. + (strcasestr): Remove __strcasestr_sse42 ifunc. + * sysdeps/x86_64/multiarch/strcasestr-nonascii.c: Remove. + * sysdeps/x86_64/multiarch/ifunc-impl-list.c: Update. + * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Update. + 2013-12-14 Kaz Kojima <kkojima@rr.iij4u.or.jp> * sysdeps/sh/sh4/fpu/bits/fenv.h: Move to ... @@ -10,18 +10,18 @@ Version 2.19 * The following bugs are resolved with this release: 156, 387, 431, 832, 926, 2801, 4772, 6786, 6787, 6807, 6810, 7003, 9954, - 10253, 10278, 11087, 11157, 11214, 12486, 13028, 13982, 13985, 14029, - 14032, 14143, 14155, 14547, 14699, 14752, 14876, 14910, 15004, 15048, - 15089, 15218, 15268, 15277, 15308, 15362, 15374, 15400, 15425, 15427, - 15483, 15522, 15531, 15532, 15593, 15601, 15608, 15609, 15610, 15632, - 15640, 15670, 15672, 15680, 15681, 15723, 15734, 15735, 15736, 15748, - 15749, 15754, 15760, 15763, 15764, 15797, 15799, 15825, 15843, 15844, - 15847, 15849, 15855, 15856, 15857, 15859, 15867, 15886, 15887, 15890, - 15892, 15893, 15895, 15897, 15901, 15905, 15909, 15915, 15917, 15919, - 15921, 15923, 15939, 15941, 15948, 15963, 15966, 15985, 15988, 15997, - 16032, 16034, 16036, 16037, 16038, 16041, 16055, 16071, 16072, 16074, - 16077, 16078, 16103, 16112, 16143, 16144, 16146, 16150, 16151, 16153, - 16167, 16172, 16195, 16214, 16245, 16271, 16274, 16283, 16289. 
+ 10253, 10278, 11087, 11157, 11214, 12100, 12486, 13028, 13982, 13985, + 14029, 14032, 14143, 14155, 14547, 14699, 14752, 14876, 14910, 15004, + 15048, 15089, 15218, 15268, 15277, 15308, 15362, 15374, 15400, 15425, + 15427, 15483, 15522, 15531, 15532, 15593, 15601, 15608, 15609, 15610, + 15632, 15640, 15670, 15672, 15680, 15681, 15723, 15734, 15735, 15736, + 15748, 15749, 15754, 15760, 15763, 15764, 15797, 15799, 15825, 15843, + 15844, 15847, 15849, 15855, 15856, 15857, 15859, 15867, 15886, 15887, + 15890, 15892, 15893, 15895, 15897, 15901, 15905, 15909, 15915, 15917, + 15919, 15921, 15923, 15939, 15941, 15948, 15963, 15966, 15985, 15988, + 15997, 16032, 16034, 16036, 16037, 16038, 16041, 16055, 16071, 16072, + 16074, 16077, 16078, 16103, 16112, 16143, 16144, 16146, 16150, 16151, + 16153, 16167, 16172, 16195, 16214, 16245, 16271, 16274, 16283, 16289. * The public headers no longer use __unused nor __block. This change is to support compiling programs that are derived from BSD sources and use diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 9fd0fd64c5..57a3c13e8a 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -11,22 +11,19 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \ memcmp-sse4 memcpy-ssse3 \ memcpy-sse2-unaligned mempcpy-ssse3 \ memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \ - memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \ + memmove-ssse3-back strcasecmp_l-ssse3 \ strncase_l-ssse3 strcat-ssse3 strncat-ssse3\ strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \ strcpy-sse2-unaligned strncpy-sse2-unaligned \ stpcpy-sse2-unaligned stpncpy-sse2-unaligned \ strcat-sse2-unaligned strncat-sse2-unaligned \ - strchr-sse2-no-bsf memcmp-ssse3 + strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned ifeq (yes,$(config-cflags-sse4)) -sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift +sysdep_routines += strcspn-c strpbrk-c 
strspn-c varshift CFLAGS-varshift.c += -msse4 CFLAGS-strcspn-c.c += -msse4 CFLAGS-strpbrk-c.c += -msse4 CFLAGS-strspn-c.c += -msse4 -CFLAGS-strstr.c += -msse4 -CFLAGS-strcasestr.c += -msse4 -CFLAGS-strcasestr-nonascii.c += -msse4 endif endif diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 71beab82e4..3e2cad56cb 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -98,8 +98,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/strcasestr.c. */ IFUNC_IMPL (i, name, strcasestr, - IFUNC_IMPL_ADD (array, i, strcasestr, HAS_SSE4_2, - __strcasestr_sse42) IFUNC_IMPL_ADD (array, i, strcasestr, 1, __strcasestr_sse2)) /* Support sysdeps/x86_64/multiarch/strcat.S. */ @@ -184,7 +182,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/strstr-c.c. */ IFUNC_IMPL (i, name, strstr, - IFUNC_IMPL_ADD (array, i, strstr, HAS_SSE4_2, __strstr_sse42) + IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2)) /* Support sysdeps/x86_64/multiarch/wcscpy.S. */ diff --git a/sysdeps/x86_64/multiarch/strcasestr-c.c b/sysdeps/x86_64/multiarch/strcasestr-c.c deleted file mode 100644 index c13a4c44f3..0000000000 --- a/sysdeps/x86_64/multiarch/strcasestr-c.c +++ /dev/null @@ -1,19 +0,0 @@ -/* Multiple versions of strcasestr - All versions must be listed in ifunc-impl-list.c. */ - -#include "init-arch.h" - -#define STRCASESTR __strcasestr_sse2 - -#include "string/strcasestr.c" - -extern char *__strcasestr_sse42 (const char *, const char *) attribute_hidden; -extern __typeof (__strcasestr_sse2) __strcasestr_sse2 attribute_hidden; - -#if 1 -libc_ifunc (__strcasestr, - HAS_SSE4_2 ? __strcasestr_sse42 : __strcasestr_sse2); -#else -libc_ifunc (__strcasestr, - 0 ? 
__strcasestr_sse42 : __strcasestr_sse2); -#endif diff --git a/sysdeps/x86_64/multiarch/strcasestr-nonascii.c b/sysdeps/x86_64/multiarch/strcasestr-nonascii.c deleted file mode 100644 index 032a6420d6..0000000000 --- a/sysdeps/x86_64/multiarch/strcasestr-nonascii.c +++ /dev/null @@ -1,50 +0,0 @@ -/* strstr with SSE4.2 intrinsics - Copyright (C) 2010-2013 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -#include <ctype.h> -#include <xmmintrin.h> - - -/* Similar to __m128i_strloadu. Convert to lower case for none-POSIX/C - locale. 
*/ -static __m128i -__m128i_strloadu_tolower (const unsigned char *p) -{ - union - { - char b[16]; - __m128i x; - } u; - - for (int i = 0; i < 16; ++i) - if (p[i] == 0) - { - u.b[i] = 0; - break; - } - else - u.b[i] = tolower (p[i]); - - return u.x; -} - - -#define STRCASESTR_NONASCII -#define USE_AS_STRCASESTR -#define STRSTR_SSE42 __strcasestr_sse42_nonascii -#include "strstr.c" diff --git a/sysdeps/x86_64/multiarch/strcasestr.c b/sysdeps/x86_64/multiarch/strcasestr.c index d1cfb3b264..834e656a2c 100644 --- a/sysdeps/x86_64/multiarch/strcasestr.c +++ b/sysdeps/x86_64/multiarch/strcasestr.c @@ -1,7 +1,13 @@ -extern char *__strcasestr_sse42_nonascii (const unsigned char *s1, - const unsigned char *s2) - attribute_hidden; +/* Multiple versions of strcasestr + All versions must be listed in ifunc-impl-list.c. */ -#define USE_AS_STRCASESTR -#define STRSTR_SSE42 __strcasestr_sse42 -#include "strstr.c" +#include "init-arch.h" + +#define STRCASESTR __strcasestr_sse2 + +#include "string/strcasestr.c" + +extern __typeof (__strcasestr_sse2) __strcasestr_sse2 attribute_hidden; + +libc_ifunc (__strcasestr, + __strcasestr_sse2); diff --git a/sysdeps/x86_64/multiarch/strstr-c.c b/sysdeps/x86_64/multiarch/strstr-c.c deleted file mode 100644 index 42bbe48172..0000000000 --- a/sysdeps/x86_64/multiarch/strstr-c.c +++ /dev/null @@ -1,47 +0,0 @@ -/* Multiple versions of strstr. - All versions must be listed in ifunc-impl-list.c. - Copyright (C) 2012-2013 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <http://www.gnu.org/licenses/>. */ - -/* Redefine strstr so that the compiler won't complain about the type - mismatch with the IFUNC selector in strong_alias, below. */ -#undef strstr -#define strstr __redirect_strstr -#include <string.h> -#undef strstr - -#define STRSTR __strstr_sse2 -#ifdef SHARED -# undef libc_hidden_builtin_def -# define libc_hidden_builtin_def(name) \ - __hidden_ver1 (__strstr_sse2, __GI_strstr, __strstr_sse2); -#endif - -#include "string/strstr.c" - -extern __typeof (__redirect_strstr) __strstr_sse42 attribute_hidden; -extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden; - -#include "init-arch.h" - -/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle - ifunc symbol properly. */ -extern __typeof (__redirect_strstr) __libc_strstr; -libc_ifunc (__libc_strstr, HAS_SSE4_2 ? __strstr_sse42 : __strstr_sse2) - -#undef strstr -strong_alias (__libc_strstr, strstr) diff --git a/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S new file mode 100644 index 0000000000..99bae2cc83 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S @@ -0,0 +1,374 @@ +/* strstr with unaligned loads + Copyright (C) 2009-2013 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +ENTRY(__strstr_sse2_unaligned) + movzbl (%rsi), %eax + testb %al, %al + je L(empty) + movzbl 1(%rsi), %edx + testb %dl, %dl + je L(strchr) + movd %eax, %xmm1 + movd %edx, %xmm2 + movq %rdi, %rax + andl $4095, %eax + punpcklbw %xmm1, %xmm1 + cmpq $4031, %rax + punpcklbw %xmm2, %xmm2 + punpcklwd %xmm1, %xmm1 + punpcklwd %xmm2, %xmm2 + pshufd $0, %xmm1, %xmm1 + pshufd $0, %xmm2, %xmm2 + ja L(cross_page) + movdqu (%rdi), %xmm3 + pxor %xmm5, %xmm5 + movdqu 1(%rdi), %xmm4 + movdqa %xmm3, %xmm6 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm2, %xmm4 + movdqu 16(%rdi), %xmm0 + pcmpeqb %xmm5, %xmm6 + pminub %xmm4, %xmm3 + movdqa %xmm3, %xmm4 + movdqu 17(%rdi), %xmm3 + pcmpeqb %xmm0, %xmm5 + pcmpeqb %xmm2, %xmm3 + por %xmm6, %xmm4 + pcmpeqb %xmm1, %xmm0 + pminub %xmm3, %xmm0 + por %xmm5, %xmm0 + pmovmskb %xmm4, %r8d + pmovmskb %xmm0, %eax + salq $16, %rax + orq %rax, %r8 + je L(next_32_bytes) +L(next_pair_index): + bsf %r8, %rax + addq %rdi, %rax + cmpb $0, (%rax) + je L(zero1) + movzbl 2(%rsi), %edx + testb %dl, %dl + je L(found1) + cmpb 2(%rax), %dl + jne L(next_pair) + xorl %edx, %edx + jmp L(pair_loop_start) + + .p2align 4 +L(strchr): + movzbl %al, %esi + jmp __strchr_sse2 + + .p2align 4 +L(pair_loop): + addq $1, %rdx + cmpb 2(%rax,%rdx), %cl + jne L(next_pair) +L(pair_loop_start): + movzbl 3(%rsi,%rdx), %ecx + testb %cl, %cl + jne L(pair_loop) +L(found1): + ret +L(zero1): + xorl %eax, %eax + ret + + .p2align 4 +L(next_pair): + leaq -1(%r8), %rax + andq %rax, %r8 + jne L(next_pair_index) + + .p2align 4 +L(next_32_bytes): + movdqu 32(%rdi), %xmm3 + pxor %xmm5, %xmm5 + movdqu 33(%rdi), %xmm4 + movdqa %xmm3, %xmm6 + pcmpeqb %xmm1, %xmm3 + pcmpeqb %xmm2, %xmm4 + movdqu 48(%rdi), %xmm0 + pcmpeqb %xmm5, %xmm6 + pminub %xmm4, %xmm3 + movdqa 
%xmm3, %xmm4 + movdqu 49(%rdi), %xmm3 + pcmpeqb %xmm0, %xmm5 + pcmpeqb %xmm2, %xmm3 + por %xmm6, %xmm4 + pcmpeqb %xmm1, %xmm0 + pminub %xmm3, %xmm0 + por %xmm5, %xmm0 + pmovmskb %xmm4, %eax + salq $32, %rax + pmovmskb %xmm0, %r8d + salq $48, %r8 + orq %rax, %r8 + je L(loop_header) +L(next_pair2_index): + bsfq %r8, %rax + addq %rdi, %rax + cmpb $0, (%rax) + je L(zero2) + movzbl 2(%rsi), %edx + testb %dl, %dl + je L(found2) + cmpb 2(%rax), %dl + jne L(next_pair2) + xorl %edx, %edx + jmp L(pair_loop2_start) + + .p2align 4 +L(pair_loop2): + addq $1, %rdx + cmpb 2(%rax,%rdx), %cl + jne L(next_pair2) +L(pair_loop2_start): + movzbl 3(%rsi,%rdx), %ecx + testb %cl, %cl + jne L(pair_loop2) +L(found2): + ret + L(zero2): + xorl %eax, %eax + ret +L(empty): + mov %rdi, %rax + ret + + .p2align 4 +L(next_pair2): + leaq -1(%r8), %rax + andq %rax, %r8 + jne L(next_pair2_index) +L(loop_header): + movq $-512, %r11 + movq %rdi, %r9 + + pxor %xmm7, %xmm7 + andq $-64, %rdi + + .p2align 4 +L(loop): + movdqa 64(%rdi), %xmm3 + movdqu 63(%rdi), %xmm6 + movdqa %xmm3, %xmm0 + pxor %xmm2, %xmm3 + pxor %xmm1, %xmm6 + movdqa 80(%rdi), %xmm10 + por %xmm3, %xmm6 + pminub %xmm10, %xmm0 + movdqu 79(%rdi), %xmm3 + pxor %xmm2, %xmm10 + pxor %xmm1, %xmm3 + movdqa 96(%rdi), %xmm9 + por %xmm10, %xmm3 + pminub %xmm9, %xmm0 + pxor %xmm2, %xmm9 + movdqa 112(%rdi), %xmm8 + addq $64, %rdi + pminub %xmm6, %xmm3 + movdqu 31(%rdi), %xmm4 + pminub %xmm8, %xmm0 + pxor %xmm2, %xmm8 + pxor %xmm1, %xmm4 + por %xmm9, %xmm4 + pminub %xmm4, %xmm3 + movdqu 47(%rdi), %xmm5 + pxor %xmm1, %xmm5 + por %xmm8, %xmm5 + pminub %xmm5, %xmm3 + pminub %xmm3, %xmm0 + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm0, %eax + testl %eax, %eax + je L(loop) + pminub (%rdi), %xmm6 + pminub 32(%rdi),%xmm4 + pminub 48(%rdi),%xmm5 + pcmpeqb %xmm7, %xmm6 + pcmpeqb %xmm7, %xmm5 + pmovmskb %xmm6, %edx + movdqa 16(%rdi), %xmm8 + pcmpeqb %xmm7, %xmm4 + movdqu 15(%rdi), %xmm0 + pmovmskb %xmm5, %r8d + movdqa %xmm8, %xmm3 + pmovmskb %xmm4, %ecx + pcmpeqb 
%xmm1,%xmm0 + pcmpeqb %xmm2,%xmm3 + salq $32, %rcx + pcmpeqb %xmm7,%xmm8 + salq $48, %r8 + pminub %xmm0,%xmm3 + orq %rcx, %rdx + por %xmm3,%xmm8 + orq %rdx, %r8 + pmovmskb %xmm8, %eax + salq $16, %rax + orq %rax, %r8 + je L(loop) +L(next_pair_index3): + bsfq %r8, %rcx + addq %rdi, %rcx + cmpb $0, (%rcx) + je L(zero) + xorl %eax, %eax + movzbl 2(%rsi), %edx + testb %dl, %dl + je L(success3) + cmpb 1(%rcx), %dl + jne L(next_pair3) + jmp L(pair_loop_start3) + + .p2align 4 +L(pair_loop3): + addq $1, %rax + cmpb 1(%rcx,%rax), %dl + jne L(next_pair3) +L(pair_loop_start3): + movzbl 3(%rsi,%rax), %edx + testb %dl, %dl + jne L(pair_loop3) +L(success3): + lea -1(%rcx), %rax + ret + + .p2align 4 +L(next_pair3): + addq %rax, %r11 + movq %rdi, %rax + subq %r9, %rax + cmpq %r11, %rax + jl L(switch_strstr) + leaq -1(%r8), %rax + andq %rax, %r8 + jne L(next_pair_index3) + jmp L(loop) + + .p2align 4 +L(switch_strstr): + movq %rdi, %rdi + jmp __strstr_sse2 + + .p2align 4 +L(cross_page): + + movq %rdi, %rax + pxor %xmm0, %xmm0 + andq $-64, %rax + movdqa (%rax), %xmm3 + movdqu -1(%rax), %xmm4 + movdqa %xmm3, %xmm8 + movdqa 16(%rax), %xmm5 + pcmpeqb %xmm1, %xmm4 + pcmpeqb %xmm0, %xmm8 + pcmpeqb %xmm2, %xmm3 + movdqa %xmm5, %xmm7 + pminub %xmm4, %xmm3 + movdqu 15(%rax), %xmm4 + pcmpeqb %xmm0, %xmm7 + por %xmm3, %xmm8 + movdqa %xmm5, %xmm3 + movdqa 32(%rax), %xmm5 + pcmpeqb %xmm1, %xmm4 + pcmpeqb %xmm2, %xmm3 + movdqa %xmm5, %xmm6 + pmovmskb %xmm8, %ecx + pminub %xmm4, %xmm3 + movdqu 31(%rax), %xmm4 + por %xmm3, %xmm7 + movdqa %xmm5, %xmm3 + pcmpeqb %xmm0, %xmm6 + movdqa 48(%rax), %xmm5 + pcmpeqb %xmm1, %xmm4 + pmovmskb %xmm7, %r8d + pcmpeqb %xmm2, %xmm3 + pcmpeqb %xmm5, %xmm0 + pminub %xmm4, %xmm3 + movdqu 47(%rax), %xmm4 + por %xmm3, %xmm6 + movdqa %xmm5, %xmm3 + salq $16, %r8 + pcmpeqb %xmm1, %xmm4 + pcmpeqb %xmm2, %xmm3 + pmovmskb %xmm6, %r10d + pminub %xmm4, %xmm3 + por %xmm3, %xmm0 + salq $32, %r10 + orq %r10, %r8 + orq %rcx, %r8 + movl %edi, %ecx + pmovmskb %xmm0, %edx + subl 
%eax, %ecx + salq $48, %rdx + orq %rdx, %r8 + shrq %cl, %r8 + je L(loop_header) +L(next_pair_index4): + bsfq %r8, %rax + addq %rdi, %rax + cmpb $0, (%rax) + je L(zero) + + cmpq %rax,%rdi + je L(next_pair4) + + movzbl 2(%rsi), %edx + testb %dl, %dl + je L(found3) + cmpb 1(%rax), %dl + jne L(next_pair4) + xorl %edx, %edx + jmp L(pair_loop_start4) + + .p2align 4 +L(pair_loop4): + addq $1, %rdx + cmpb 1(%rax,%rdx), %cl + jne L(next_pair4) +L(pair_loop_start4): + movzbl 3(%rsi,%rdx), %ecx + testb %cl, %cl + jne L(pair_loop4) +L(found3): + subq $1, %rax + ret + + .p2align 4 +L(next_pair4): + leaq -1(%r8), %rax + andq %rax, %r8 + jne L(next_pair_index4) + jmp L(loop_header) + + .p2align 4 +L(found): + rep + ret + + .p2align 4 +L(zero): + xorl %eax, %eax + ret + + +END(__strstr_sse2_unaligned) diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c index cd63b68c01..fbff3a8ec0 100644 --- a/sysdeps/x86_64/multiarch/strstr.c +++ b/sysdeps/x86_64/multiarch/strstr.c @@ -1,6 +1,6 @@ -/* strstr with SSE4.2 intrinsics - Copyright (C) 2009-2013 Free Software Foundation, Inc. - Contributed by Intel Corporation. +/* Multiple versions of strstr. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2012-2013 Free Software Foundation, Inc. This file is part of the GNU C Library. The GNU C Library is free software; you can redistribute it and/or @@ -17,369 +17,31 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#include <nmmintrin.h> -#include "varshift.h" - -#ifndef STRSTR_SSE42 -# define STRSTR_SSE42 __strstr_sse42 -#endif - -#ifdef USE_AS_STRCASESTR -# include <ctype.h> -# include <locale/localeinfo.h> - -# define LOADBYTE(C) tolower (C) -# define CMPBYTE(C1, C2) (tolower (C1) == tolower (C2)) -#else -# define |
