aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorOndřej Bílka <neleai@seznam.cz>2013-12-14 19:33:56 +0100
committerOndřej Bílka <neleai@seznam.cz>2013-12-14 20:08:13 +0100
commit584b18eb4df61ccd447db2dfe8c8a7901f8c8598 (patch)
tree8240dbf408eadda74685f951e36f8885f77c2f77
parent8a5c7897dd1c52ca74b06aaf5a3bacf0919c97aa (diff)
downloadglibc-584b18eb4df61ccd447db2dfe8c8a7901f8c8598.tar.xz
glibc-584b18eb4df61ccd447db2dfe8c8a7901f8c8598.zip
Add strstr with unaligned loads. Fixes bug 12100.
The SSE4.2 version of strstr used the pcmpistr instruction, which is quite inefficient. A faster way is to look for pairs of characters: this needs only SSE2, is faster than pcmpistr, and for real strings the pairs we look for are relatively rare. To keep linear time complexity we use a buy-or-rent technique, which switches to the two-way algorithm when superlinear behaviour is detected.
-rw-r--r--ChangeLog14
-rw-r--r--NEWS24
-rw-r--r--sysdeps/x86_64/multiarch/Makefile9
-rw-r--r--sysdeps/x86_64/multiarch/ifunc-impl-list.c4
-rw-r--r--sysdeps/x86_64/multiarch/strcasestr-c.c19
-rw-r--r--sysdeps/x86_64/multiarch/strcasestr-nonascii.c50
-rw-r--r--sysdeps/x86_64/multiarch/strcasestr.c18
-rw-r--r--sysdeps/x86_64/multiarch/strstr-c.c47
-rw-r--r--sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S374
-rw-r--r--sysdeps/x86_64/multiarch/strstr.c388
10 files changed, 441 insertions, 506 deletions
diff --git a/ChangeLog b/ChangeLog
index 3de39014c5..811f4c9f31 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,17 @@
+2013-12-14 Ondřej Bílka <neleai@seznam.cz>
+
+ [BZ #12100]
+ * sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S: New file.
+ * sysdeps/x86_64/multiarch/strstr-c.c: Moved to ...
+ * sysdeps/x86_64/multiarch/strstr.c: ... here.
+ (strstr): Add __strstr_sse2_unaligned ifunc.
+ * sysdeps/x86_64/multiarch/strcasestr-c.c: Moved to ...
+ * sysdeps/x86_64/multiarch/strcasestr.c: ... here.
+ (strcasestr): Remove __strcasestr_sse42 ifunc.
+ * sysdeps/x86_64/multiarch/strcasestr-nonascii.c: Remove.
+ * sysdeps/x86_64/multiarch/ifunc-impl-list.c: Update.
+ * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Update.
+
2013-12-14 Kaz Kojima <kkojima@rr.iij4u.or.jp>
* sysdeps/sh/sh4/fpu/bits/fenv.h: Move to ...
diff --git a/NEWS b/NEWS
index 5815d82238..ebe8197537 100644
--- a/NEWS
+++ b/NEWS
@@ -10,18 +10,18 @@ Version 2.19
* The following bugs are resolved with this release:
156, 387, 431, 832, 926, 2801, 4772, 6786, 6787, 6807, 6810, 7003, 9954,
- 10253, 10278, 11087, 11157, 11214, 12486, 13028, 13982, 13985, 14029,
- 14032, 14143, 14155, 14547, 14699, 14752, 14876, 14910, 15004, 15048,
- 15089, 15218, 15268, 15277, 15308, 15362, 15374, 15400, 15425, 15427,
- 15483, 15522, 15531, 15532, 15593, 15601, 15608, 15609, 15610, 15632,
- 15640, 15670, 15672, 15680, 15681, 15723, 15734, 15735, 15736, 15748,
- 15749, 15754, 15760, 15763, 15764, 15797, 15799, 15825, 15843, 15844,
- 15847, 15849, 15855, 15856, 15857, 15859, 15867, 15886, 15887, 15890,
- 15892, 15893, 15895, 15897, 15901, 15905, 15909, 15915, 15917, 15919,
- 15921, 15923, 15939, 15941, 15948, 15963, 15966, 15985, 15988, 15997,
- 16032, 16034, 16036, 16037, 16038, 16041, 16055, 16071, 16072, 16074,
- 16077, 16078, 16103, 16112, 16143, 16144, 16146, 16150, 16151, 16153,
- 16167, 16172, 16195, 16214, 16245, 16271, 16274, 16283, 16289.
+ 10253, 10278, 11087, 11157, 11214, 12100, 12486, 13028, 13982, 13985,
+ 14029, 14032, 14143, 14155, 14547, 14699, 14752, 14876, 14910, 15004,
+ 15048, 15089, 15218, 15268, 15277, 15308, 15362, 15374, 15400, 15425,
+ 15427, 15483, 15522, 15531, 15532, 15593, 15601, 15608, 15609, 15610,
+ 15632, 15640, 15670, 15672, 15680, 15681, 15723, 15734, 15735, 15736,
+ 15748, 15749, 15754, 15760, 15763, 15764, 15797, 15799, 15825, 15843,
+ 15844, 15847, 15849, 15855, 15856, 15857, 15859, 15867, 15886, 15887,
+ 15890, 15892, 15893, 15895, 15897, 15901, 15905, 15909, 15915, 15917,
+ 15919, 15921, 15923, 15939, 15941, 15948, 15963, 15966, 15985, 15988,
+ 15997, 16032, 16034, 16036, 16037, 16038, 16041, 16055, 16071, 16072,
+ 16074, 16077, 16078, 16103, 16112, 16143, 16144, 16146, 16150, 16151,
+ 16153, 16167, 16172, 16195, 16214, 16245, 16271, 16274, 16283, 16289.
* The public headers no longer use __unused nor __block. This change is to
support compiling programs that are derived from BSD sources and use
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 9fd0fd64c5..57a3c13e8a 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -11,22 +11,19 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
memcmp-sse4 memcpy-ssse3 \
memcpy-sse2-unaligned mempcpy-ssse3 \
memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
- memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \
+ memmove-ssse3-back strcasecmp_l-ssse3 \
strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
strcpy-sse2-unaligned strncpy-sse2-unaligned \
stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
strcat-sse2-unaligned strncat-sse2-unaligned \
- strchr-sse2-no-bsf memcmp-ssse3
+ strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned
ifeq (yes,$(config-cflags-sse4))
-sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift
+sysdep_routines += strcspn-c strpbrk-c strspn-c varshift
CFLAGS-varshift.c += -msse4
CFLAGS-strcspn-c.c += -msse4
CFLAGS-strpbrk-c.c += -msse4
CFLAGS-strspn-c.c += -msse4
-CFLAGS-strstr.c += -msse4
-CFLAGS-strcasestr.c += -msse4
-CFLAGS-strcasestr-nonascii.c += -msse4
endif
endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 71beab82e4..3e2cad56cb 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -98,8 +98,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strcasestr.c. */
IFUNC_IMPL (i, name, strcasestr,
- IFUNC_IMPL_ADD (array, i, strcasestr, HAS_SSE4_2,
- __strcasestr_sse42)
IFUNC_IMPL_ADD (array, i, strcasestr, 1, __strcasestr_sse2))
/* Support sysdeps/x86_64/multiarch/strcat.S. */
@@ -184,7 +182,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strstr-c.c. */
IFUNC_IMPL (i, name, strstr,
- IFUNC_IMPL_ADD (array, i, strstr, HAS_SSE4_2, __strstr_sse42)
+ IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))
/* Support sysdeps/x86_64/multiarch/wcscpy.S. */
diff --git a/sysdeps/x86_64/multiarch/strcasestr-c.c b/sysdeps/x86_64/multiarch/strcasestr-c.c
deleted file mode 100644
index c13a4c44f3..0000000000
--- a/sysdeps/x86_64/multiarch/strcasestr-c.c
+++ /dev/null
@@ -1,19 +0,0 @@
-/* Multiple versions of strcasestr
- All versions must be listed in ifunc-impl-list.c. */
-
-#include "init-arch.h"
-
-#define STRCASESTR __strcasestr_sse2
-
-#include "string/strcasestr.c"
-
-extern char *__strcasestr_sse42 (const char *, const char *) attribute_hidden;
-extern __typeof (__strcasestr_sse2) __strcasestr_sse2 attribute_hidden;
-
-#if 1
-libc_ifunc (__strcasestr,
- HAS_SSE4_2 ? __strcasestr_sse42 : __strcasestr_sse2);
-#else
-libc_ifunc (__strcasestr,
- 0 ? __strcasestr_sse42 : __strcasestr_sse2);
-#endif
diff --git a/sysdeps/x86_64/multiarch/strcasestr-nonascii.c b/sysdeps/x86_64/multiarch/strcasestr-nonascii.c
deleted file mode 100644
index 032a6420d6..0000000000
--- a/sysdeps/x86_64/multiarch/strcasestr-nonascii.c
+++ /dev/null
@@ -1,50 +0,0 @@
-/* strstr with SSE4.2 intrinsics
- Copyright (C) 2010-2013 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <ctype.h>
-#include <xmmintrin.h>
-
-
-/* Similar to __m128i_strloadu. Convert to lower case for none-POSIX/C
- locale. */
-static __m128i
-__m128i_strloadu_tolower (const unsigned char *p)
-{
- union
- {
- char b[16];
- __m128i x;
- } u;
-
- for (int i = 0; i < 16; ++i)
- if (p[i] == 0)
- {
- u.b[i] = 0;
- break;
- }
- else
- u.b[i] = tolower (p[i]);
-
- return u.x;
-}
-
-
-#define STRCASESTR_NONASCII
-#define USE_AS_STRCASESTR
-#define STRSTR_SSE42 __strcasestr_sse42_nonascii
-#include "strstr.c"
diff --git a/sysdeps/x86_64/multiarch/strcasestr.c b/sysdeps/x86_64/multiarch/strcasestr.c
index d1cfb3b264..834e656a2c 100644
--- a/sysdeps/x86_64/multiarch/strcasestr.c
+++ b/sysdeps/x86_64/multiarch/strcasestr.c
@@ -1,7 +1,13 @@
-extern char *__strcasestr_sse42_nonascii (const unsigned char *s1,
- const unsigned char *s2)
- attribute_hidden;
+/* Multiple versions of strcasestr
+ All versions must be listed in ifunc-impl-list.c. */
-#define USE_AS_STRCASESTR
-#define STRSTR_SSE42 __strcasestr_sse42
-#include "strstr.c"
+#include "init-arch.h"
+
+#define STRCASESTR __strcasestr_sse2
+
+#include "string/strcasestr.c"
+
+extern __typeof (__strcasestr_sse2) __strcasestr_sse2 attribute_hidden;
+
+libc_ifunc (__strcasestr,
+ __strcasestr_sse2);
diff --git a/sysdeps/x86_64/multiarch/strstr-c.c b/sysdeps/x86_64/multiarch/strstr-c.c
deleted file mode 100644
index 42bbe48172..0000000000
--- a/sysdeps/x86_64/multiarch/strstr-c.c
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Multiple versions of strstr.
- All versions must be listed in ifunc-impl-list.c.
- Copyright (C) 2012-2013 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-/* Redefine strstr so that the compiler won't complain about the type
- mismatch with the IFUNC selector in strong_alias, below. */
-#undef strstr
-#define strstr __redirect_strstr
-#include <string.h>
-#undef strstr
-
-#define STRSTR __strstr_sse2
-#ifdef SHARED
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(name) \
- __hidden_ver1 (__strstr_sse2, __GI_strstr, __strstr_sse2);
-#endif
-
-#include "string/strstr.c"
-
-extern __typeof (__redirect_strstr) __strstr_sse42 attribute_hidden;
-extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden;
-
-#include "init-arch.h"
-
-/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
- ifunc symbol properly. */
-extern __typeof (__redirect_strstr) __libc_strstr;
-libc_ifunc (__libc_strstr, HAS_SSE4_2 ? __strstr_sse42 : __strstr_sse2)
-
-#undef strstr
-strong_alias (__libc_strstr, strstr)
diff --git a/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S
new file mode 100644
index 0000000000..99bae2cc83
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strstr-sse2-unaligned.S
@@ -0,0 +1,374 @@
+/* strstr with unaligned loads
+ Copyright (C) 2009-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ENTRY(__strstr_sse2_unaligned) /* strstr: haystack in %rdi, needle in %rsi (x86-64 SysV argument registers); returns match address or 0 in %rax */
+ movzbl (%rsi), %eax /* %al = needle[0] */
+ testb %al, %al
+ je L(empty) /* empty needle: return the haystack itself */
+ movzbl 1(%rsi), %edx /* %dl = needle[1] */
+ testb %dl, %dl
+ je L(strchr) /* one-byte needle: defer to strchr */
+ movd %eax, %xmm1 /* xmm1 / xmm2 are expanded below into 16 copies of needle[0] / needle[1] */
+ movd %edx, %xmm2
+ movq %rdi, %rax
+ andl $4095, %eax /* haystack offset within a 4096-byte page */
+ punpcklbw %xmm1, %xmm1
+ cmpq $4031, %rax /* the fast path below reads bytes up to 64(%rdi) */
+ punpcklbw %xmm2, %xmm2
+ punpcklwd %xmm1, %xmm1
+ punpcklwd %xmm2, %xmm2
+ pshufd $0, %xmm1, %xmm1
+ pshufd $0, %xmm2, %xmm2
+ ja L(cross_page) /* those reads could run past the page end: use the aligned path */
+ movdqu (%rdi), %xmm3 /* haystack bytes 0..15 */
+ pxor %xmm5, %xmm5
+ movdqu 1(%rdi), %xmm4 /* haystack bytes 1..16 */
+ movdqa %xmm3, %xmm6
+ pcmpeqb %xmm1, %xmm3 /* byte i == needle[0] */
+ pcmpeqb %xmm2, %xmm4 /* byte i+1 == needle[1] */
+ movdqu 16(%rdi), %xmm0
+ pcmpeqb %xmm5, %xmm6 /* byte i == NUL */
+ pminub %xmm4, %xmm3 /* 0xff only where both compares hit: a pair starts at i */
+ movdqa %xmm3, %xmm4
+ movdqu 17(%rdi), %xmm3
+ pcmpeqb %xmm0, %xmm5
+ pcmpeqb %xmm2, %xmm3
+ por %xmm6, %xmm4 /* pair match or NUL, bytes 0..15 */
+ pcmpeqb %xmm1, %xmm0
+ pminub %xmm3, %xmm0
+ por %xmm5, %xmm0 /* pair match or NUL, bytes 16..31 */
+ pmovmskb %xmm4, %r8d
+ pmovmskb %xmm0, %eax
+ salq $16, %rax
+ orq %rax, %r8 /* bit i set: pair candidate or NUL at %rdi+i */
+ je L(next_32_bytes)
+L(next_pair_index): /* examine the lowest set bit of %r8 */
+ bsf %r8, %rax
+ addq %rdi, %rax /* %rax = address of the candidate (or of the NUL) */
+ cmpb $0, (%rax)
+ je L(zero1) /* haystack ended before any match: return 0 */
+ movzbl 2(%rsi), %edx /* needle[2] */
+ testb %dl, %dl
+ je L(found1) /* two-byte needle: the pair is the whole match */
+ cmpb 2(%rax), %dl
+ jne L(next_pair)
+ xorl %edx, %edx
+ jmp L(pair_loop_start)
+
+ .p2align 4
+L(strchr): /* needle is a single character */
+ movzbl %al, %esi /* second argument = needle[0] */
+ jmp __strchr_sse2 /* tail call; %rdi still holds the haystack */
+
+ .p2align 4
+L(pair_loop): /* compare the remaining needle bytes one at a time */
+ addq $1, %rdx
+ cmpb 2(%rax,%rdx), %cl
+ jne L(next_pair)
+L(pair_loop_start):
+ movzbl 3(%rsi,%rdx), %ecx /* next needle byte; NUL means the whole needle matched */
+ testb %cl, %cl
+ jne L(pair_loop)
+L(found1): /* %rax already points at the match */
+ ret
+L(zero1):
+ xorl %eax, %eax
+ ret
+
+ .p2align 4
+L(next_pair):
+ leaq -1(%r8), %rax
+ andq %rax, %r8 /* r8 &= r8 - 1: clear the lowest set bit */
+ jne L(next_pair_index)
+
+ .p2align 4
+L(next_32_bytes): /* same scan for haystack bytes 32..63 */
+ movdqu 32(%rdi), %xmm3
+ pxor %xmm5, %xmm5
+ movdqu 33(%rdi), %xmm4
+ movdqa %xmm3, %xmm6
+ pcmpeqb %xmm1, %xmm3
+ pcmpeqb %xmm2, %xmm4
+ movdqu 48(%rdi), %xmm0
+ pcmpeqb %xmm5, %xmm6
+ pminub %xmm4, %xmm3
+ movdqa %xmm3, %xmm4
+ movdqu 49(%rdi), %xmm3
+ pcmpeqb %xmm0, %xmm5
+ pcmpeqb %xmm2, %xmm3
+ por %xmm6, %xmm4
+ pcmpeqb %xmm1, %xmm0
+ pminub %xmm3, %xmm0
+ por %xmm5, %xmm0
+ pmovmskb %xmm4, %eax
+ salq $32, %rax /* masks are shifted so bit numbers stay relative to %rdi */
+ pmovmskb %xmm0, %r8d
+ salq $48, %r8
+ orq %rax, %r8
+ je L(loop_header)
+L(next_pair2_index):
+ bsfq %r8, %rax
+ addq %rdi, %rax
+ cmpb $0, (%rax)
+ je L(zero2)
+ movzbl 2(%rsi), %edx
+ testb %dl, %dl
+ je L(found2)
+ cmpb 2(%rax), %dl
+ jne L(next_pair2)
+ xorl %edx, %edx
+ jmp L(pair_loop2_start)
+
+ .p2align 4
+L(pair_loop2):
+ addq $1, %rdx
+ cmpb 2(%rax,%rdx), %cl
+ jne L(next_pair2)
+L(pair_loop2_start):
+ movzbl 3(%rsi,%rdx), %ecx
+ testb %cl, %cl
+ jne L(pair_loop2)
+L(found2):
+ ret
+ L(zero2):
+ xorl %eax, %eax
+ ret
+L(empty): /* empty needle matches at the start of the haystack */
+ mov %rdi, %rax
+ ret
+
+ .p2align 4
+L(next_pair2):
+ leaq -1(%r8), %rax
+ andq %rax, %r8
+ jne L(next_pair2_index)
+L(loop_header):
+ movq $-512, %r11 /* initial value of the work counter tested in L(next_pair3) */
+ movq %rdi, %r9 /* remember where the haystack started */
+
+ pxor %xmm7, %xmm7
+ andq $-64, %rdi /* round %rdi down to a 64-byte boundary */
+
+ .p2align 4
+L(loop): /* scan one aligned 64-byte block per iteration */
+ movdqa 64(%rdi), %xmm3 /* block bytes 0..15 */
+ movdqu 63(%rdi), %xmm6 /* the same window shifted back by one byte */
+ movdqa %xmm3, %xmm0
+ pxor %xmm2, %xmm3 /* zero where byte == needle[1] */
+ pxor %xmm1, %xmm6 /* zero where previous byte == needle[0] */
+ movdqa 80(%rdi), %xmm10
+ por %xmm3, %xmm6 /* zero only where both matched */
+ pminub %xmm10, %xmm0 /* xmm0 collects a zero byte if the raw block holds a NUL */
+ movdqu 79(%rdi), %xmm3
+ pxor %xmm2, %xmm10
+ pxor %xmm1, %xmm3
+ movdqa 96(%rdi), %xmm9
+ por %xmm10, %xmm3
+ pminub %xmm9, %xmm0
+ pxor %xmm2, %xmm9
+ movdqa 112(%rdi), %xmm8
+ addq $64, %rdi /* %rdi now points at the block being scanned */
+ pminub %xmm6, %xmm3
+ movdqu 31(%rdi), %xmm4
+ pminub %xmm8, %xmm0
+ pxor %xmm2, %xmm8
+ pxor %xmm1, %xmm4
+ por %xmm9, %xmm4
+ pminub %xmm4, %xmm3
+ movdqu 47(%rdi), %xmm5
+ pxor %xmm1, %xmm5
+ por %xmm8, %xmm5
+ pminub %xmm5, %xmm3
+ pminub %xmm3, %xmm0
+ pcmpeqb %xmm7, %xmm0 /* any zero byte: a pair candidate or a NUL somewhere in the block */
+ pmovmskb %xmm0, %eax
+ testl %eax, %eax
+ je L(loop) /* nothing of interest in this block */
+ pminub (%rdi), %xmm6 /* rebuild exact per-16-byte masks for the block */
+ pminub 32(%rdi),%xmm4
+ pminub 48(%rdi),%xmm5
+ pcmpeqb %xmm7, %xmm6
+ pcmpeqb %xmm7, %xmm5
+ pmovmskb %xmm6, %edx
+ movdqa 16(%rdi), %xmm8
+ pcmpeqb %xmm7, %xmm4
+ movdqu 15(%rdi), %xmm0
+ pmovmskb %xmm5, %r8d
+ movdqa %xmm8, %xmm3
+ pmovmskb %xmm4, %ecx
+ pcmpeqb %xmm1,%xmm0
+ pcmpeqb %xmm2,%xmm3
+ salq $32, %rcx
+ pcmpeqb %xmm7,%xmm8
+ salq $48, %r8
+ pminub %xmm0,%xmm3
+ orq %rcx, %rdx
+ por %xmm3,%xmm8
+ orq %rdx, %r8
+ pmovmskb %xmm8, %eax
+ salq $16, %rax
+ orq %rax, %r8 /* bit i set: %rdi+i is the second byte of a pair, or is NUL */
+ je L(loop)
+L(next_pair_index3):
+ bsfq %r8, %rcx
+ addq %rdi, %rcx /* %rcx = second byte of the candidate pair (or the NUL) */
+ cmpb $0, (%rcx)
+ je L(zero)
+ xorl %eax, %eax
+ movzbl 2(%rsi), %edx
+ testb %dl, %dl
+ je L(success3)
+ cmpb 1(%rcx), %dl /* haystack byte after the pair vs needle[2] */
+ jne L(next_pair3)
+ jmp L(pair_loop_start3)
+
+ .p2align 4
+L(pair_loop3):
+ addq $1, %rax
+ cmpb 1(%rcx,%rax), %dl
+ jne L(next_pair3)
+L(pair_loop_start3):
+ movzbl 3(%rsi,%rax), %edx
+ testb %dl, %dl
+ jne L(pair_loop3)
+L(success3):
+ lea -1(%rcx), %rax /* the match starts one byte before %rcx */
+ ret
+
+ .p2align 4
+L(next_pair3): /* candidate failed: decide whether to keep using this algorithm */
+ addq %rax, %r11 /* add the compare-loop count reached for this candidate */
+ movq %rdi, %rax
+ subq %r9, %rax /* distance scanned since the haystack start */
+ cmpq %r11, %rax
+ jl L(switch_strstr) /* signed distance < work counter: hand over to __strstr_sse2 */
+ leaq -1(%r8), %rax
+ andq %rax, %r8
+ jne L(next_pair_index3)
+ jmp L(loop)
+
+ .p2align 4
+L(switch_strstr):
+ movq %rdi, %rdi /* no-op as written. NOTE(review): the haystack handed over is the current block in %rdi, not the start saved in %r9 - confirm intended */
+ jmp __strstr_sse2 /* tail call */
+
+ .p2align 4
+L(cross_page): /* haystack starts within the last 64 bytes of its page */
+
+ movq %rdi, %rax
+ pxor %xmm0, %xmm0
+ andq $-64, %rax /* 64-byte aligned block containing the haystack start */
+ movdqa (%rax), %xmm3
+ movdqu -1(%rax), %xmm4 /* previous-byte window, as in L(loop) */
+ movdqa %xmm3, %xmm8
+ movdqa 16(%rax), %xmm5
+ pcmpeqb %xmm1, %xmm4
+ pcmpeqb %xmm0, %xmm8
+ pcmpeqb %xmm2, %xmm3
+ movdqa %xmm5, %xmm7
+ pminub %xmm4, %xmm3
+ movdqu 15(%rax), %xmm4
+ pcmpeqb %xmm0, %xmm7
+ por %xmm3, %xmm8
+ movdqa %xmm5, %xmm3
+ movdqa 32(%rax), %xmm5
+ pcmpeqb %xmm1, %xmm4
+ pcmpeqb %xmm2, %xmm3
+ movdqa %xmm5, %xmm6
+ pmovmskb %xmm8, %ecx
+ pminub %xmm4, %xmm3
+ movdqu 31(%rax), %xmm4
+ por %xmm3, %xmm7
+ movdqa %xmm5, %xmm3
+ pcmpeqb %xmm0, %xmm6
+ movdqa 48(%rax), %xmm5
+ pcmpeqb %xmm1, %xmm4
+ pmovmskb %xmm7, %r8d
+ pcmpeqb %xmm2, %xmm3
+ pcmpeqb %xmm5, %xmm0
+ pminub %xmm4, %xmm3
+ movdqu 47(%rax), %xmm4
+ por %xmm3, %xmm6
+ movdqa %xmm5, %xmm3
+ salq $16, %r8
+ pcmpeqb %xmm1, %xmm4
+ pcmpeqb %xmm2, %xmm3
+ pmovmskb %xmm6, %r10d
+ pminub %xmm4, %xmm3
+ por %xmm3, %xmm0
+ salq $32, %r10
+ orq %r10, %r8
+ orq %rcx, %r8
+ movl %edi, %ecx
+ pmovmskb %xmm0, %edx
+ subl %eax, %ecx /* offset of the haystack inside the aligned block */
+ salq $48, %rdx
+ orq %rdx, %r8
+ shrq %cl, %r8 /* drop bits before the haystack; bit i now refers to %rdi+i */
+ je L(loop_header)
+L(next_pair_index4):
+ bsfq %r8, %rax
+ addq %rdi, %rax
+ cmpb $0, (%rax)
+ je L(zero)
+
+ cmpq %rax,%rdi /* bit 0 would mean a pair starting at %rdi-1, before the haystack: skip it */
+ je L(next_pair4)
+
+ movzbl 2(%rsi), %edx
+ testb %dl, %dl
+ je L(found3)
+ cmpb 1(%rax), %dl
+ jne L(next_pair4)
+ xorl %edx, %edx
+ jmp L(pair_loop_start4)
+
+ .p2align 4
+L(pair_loop4):
+ addq $1, %rdx
+ cmpb 1(%rax,%rdx), %cl
+ jne L(next_pair4)
+L(pair_loop_start4):
+ movzbl 3(%rsi,%rdx), %ecx
+ testb %cl, %cl
+ jne L(pair_loop4)
+L(found3):
+ subq $1, %rax /* the match starts one byte before %rax */
+ ret
+
+ .p2align 4
+L(next_pair4):
+ leaq -1(%r8), %rax
+ andq %rax, %r8
+ jne L(next_pair_index4)
+ jmp L(loop_header)
+
+ .p2align 4
+L(found): /* not the target of any jump in this function */
+ rep
+ ret
+
+ .p2align 4
+L(zero): /* no match: return 0 */
+ xorl %eax, %eax
+ ret
+
+
+END(__strstr_sse2_unaligned)
diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c
index cd63b68c01..fbff3a8ec0 100644
--- a/sysdeps/x86_64/multiarch/strstr.c
+++ b/sysdeps/x86_64/multiarch/strstr.c
@@ -1,6 +1,6 @@
-/* strstr with SSE4.2 intrinsics
- Copyright (C) 2009-2013 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
+/* Multiple versions of strstr.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2012-2013 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -17,369 +17,31 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#include <nmmintrin.h>
-#include "varshift.h"
-
-#ifndef STRSTR_SSE42
-# define STRSTR_SSE42 __strstr_sse42
-#endif
-
-#ifdef USE_AS_STRCASESTR
-# include <ctype.h>
-# include <locale/localeinfo.h>
-
-# define LOADBYTE(C) tolower (C)
-# define CMPBYTE(C1, C2) (tolower (C1) == tolower (C2))
-#else
-# define