aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNoah Goldstein <goldstein.w.n@gmail.com>2022-07-12 12:28:06 -0700
committerNoah Goldstein <goldstein.w.n@gmail.com>2022-07-13 14:55:31 -0700
commitd561fbb041fe6aa205f652aecefe4bb84fd124a5 (patch)
tree4fa103748eb51664285b31d4186ee06b6bea35b2
parent30e57e0a21cc50eead14f729a29a5604a6b23e18 (diff)
downloadglibc-d561fbb041fe6aa205f652aecefe4bb84fd124a5.tar.xz
glibc-d561fbb041fe6aa205f652aecefe4bb84fd124a5.zip
x86: Move strcmp SSE2 implementation to multiarch/strcmp-sse2.S
This commit doesn't affect libc.so.6; it's just housekeeping to prepare for adding explicit ISA level support. Because strcmp-sse2.S implements so many functions (more from avx2/evex/sse42), add a new file 'strcmp-naming.h' to assist in getting the correct symbol name for all the functions across multiarch/non-multiarch builds. Tested build on x86_64 and x86_32 with/without multiarch.
-rw-r--r--sysdeps/x86_64/multiarch/rtld-strcmp.S18
-rw-r--r--sysdeps/x86_64/multiarch/rtld-strncmp.S18
-rw-r--r--sysdeps/x86_64/multiarch/strcasecmp_l-sse2.S5
-rw-r--r--sysdeps/x86_64/multiarch/strcmp-naming.h68
-rw-r--r--sysdeps/x86_64/multiarch/strcmp-sse2.S2140
-rw-r--r--sysdeps/x86_64/multiarch/strncase_l-sse2.S5
-rw-r--r--sysdeps/x86_64/multiarch/strncmp-sse2.S12
-rw-r--r--sysdeps/x86_64/strcasecmp_l.S11
-rw-r--r--sysdeps/x86_64/strcmp.S2147
-rw-r--r--sysdeps/x86_64/strncase_l.S11
-rw-r--r--sysdeps/x86_64/strncmp.S7
11 files changed, 2264 insertions, 2178 deletions
diff --git a/sysdeps/x86_64/multiarch/rtld-strcmp.S b/sysdeps/x86_64/multiarch/rtld-strcmp.S
new file mode 100644
index 0000000000..207078bdcc
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rtld-strcmp.S
@@ -0,0 +1,18 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "../strcmp.S"
diff --git a/sysdeps/x86_64/multiarch/rtld-strncmp.S b/sysdeps/x86_64/multiarch/rtld-strncmp.S
new file mode 100644
index 0000000000..ac32150406
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rtld-strncmp.S
@@ -0,0 +1,18 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "../strncmp.S"
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-sse2.S b/sysdeps/x86_64/multiarch/strcasecmp_l-sse2.S
index 2360d104dd..a2b5741399 100644
--- a/sysdeps/x86_64/multiarch/strcasecmp_l-sse2.S
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l-sse2.S
@@ -16,8 +16,5 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#define STRCMP __strcasecmp_l_sse2
#define USE_AS_STRCASECMP_L
-#define NO_NOLOCALE_ALIAS
-#define __strcasecmp __strcasecmp_sse2
-#include <sysdeps/x86_64/strcmp.S>
+#include "strcmp-sse2.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp-naming.h b/sysdeps/x86_64/multiarch/strcmp-naming.h
new file mode 100644
index 0000000000..6a7529b6a4
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcmp-naming.h
@@ -0,0 +1,68 @@
+#ifndef _STRCMP_NAMING_H_
+#define _STRCMP_NAMING_H_
+
+/* Utility macros. STRCMP_NAME goes through STRCMP_SUFFIX so that both arguments are macro-expanded before being pasted together. */
+#define STRCMP_SUFFIX(x, y) x##y
+#define STRCMP_NAME(x, y) STRCMP_SUFFIX (x, y)
+
+/* Set up the base names of all definitions. */
+#define STRNCASECMP_BASE __strncasecmp
+#define STRCASECMP_BASE __strcasecmp
+#define WCSCMP_BASE __wcscmp
+
+#if defined USE_MULTIARCH && IS_IN (libc)
+# define WCSNCMP_BASE __wcsncmp
+# define STRNCMP_BASE __strncmp
+# define STRCMP_BASE __strcmp
+
+#else
+/* Covers IS_IN (rtld) or non-multiarch build. STRCMP_ISA is redefined as empty below, so the names built from it carry no ISA suffix. */
+# define WCSNCMP_BASE wcsncmp
+# define STRNCMP_BASE strncmp
+# define STRCMP_BASE strcmp
+
+# undef STRCMP_ISA
+# define STRCMP_ISA
+#endif
+
+#if IS_IN (rtld) || defined USE_MULTIARCH
+# define ISA_HIDDEN_JUMPTARGET(...) __VA_ARGS__
+#else
+# define ISA_HIDDEN_JUMPTARGET(...) HIDDEN_JUMPTARGET (__VA_ARGS__)
+#endif
+
+/* Get the correct symbols for OVERFLOW_STRCMP, STRCMP, and
+ STRCASECMP. OVERFLOW_STRCMP is only defined for the length-limited variants. */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+
+# if defined USE_AS_WCSCMP || defined USE_AS_WCSNCMP
+# define OVERFLOW_STRCMP_SYM WCSCMP_BASE
+# define STRCMP_SYM WCSNCMP_BASE
+# elif defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# define OVERFLOW_STRCMP_SYM STRCMP_NAME (STRCASECMP_BASE, _l)
+# define STRCMP_SYM STRCMP_NAME (STRNCASECMP_BASE, _l)
+# else
+# define OVERFLOW_STRCMP_SYM STRCMP_BASE
+# define STRCMP_SYM STRNCMP_BASE
+# endif
+
+# define STRCASECMP_SYM STRNCASECMP_BASE
+# define OVERFLOW_STRCMP \
+ ISA_HIDDEN_JUMPTARGET (STRCMP_NAME (OVERFLOW_STRCMP_SYM, STRCMP_ISA))
+#else
+# ifdef USE_AS_WCSCMP
+# define STRCMP_SYM WCSCMP_BASE
+# elif defined USE_AS_STRCASECMP_L
+# define STRCMP_SYM STRCMP_NAME (STRCASECMP_BASE, _l)
+# else
+# define STRCMP_SYM STRCMP_BASE
+# endif
+
+# define STRCASECMP_SYM STRCASECMP_BASE
+#endif
+
+#define STRCASECMP_L_NONASCII STRCMP_NAME (STRCASECMP_SYM, _l_nonascii)
+#define STRCASECMP STRCMP_NAME (STRCASECMP_SYM, STRCMP_ISA)
+#define STRCMP STRCMP_NAME (STRCMP_SYM, STRCMP_ISA)
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse2.S b/sysdeps/x86_64/multiarch/strcmp-sse2.S
index b8f95e59cf..b1220231ab 100644
--- a/sysdeps/x86_64/multiarch/strcmp-sse2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse2.S
@@ -16,13 +16,2141 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#if IS_IN (libc)
+#if IS_IN (libc) || IS_IN (rtld)
+
+# define STRCMP_ISA _sse2
+# include "strcmp-naming.h"
+
# include <sysdep.h>
-# define STRCMP __strcmp_sse2
+# undef UPDATE_STRNCMP_COUNTER
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(strcmp)
-#endif
+# ifndef LABEL
+# define LABEL(l) L(l)
+# endif
+
+# ifdef USE_AS_STRNCMP
+/* The new counter is %r11 + %rcx - 16. Since the counter, %r11, is unsigned,
+ we branch to strcmp_exitz if the new counter > the old one (it wrapped around) or is 0. */
+# define UPDATE_STRNCMP_COUNTER \
+ /* calculate the remaining count to compare */ \
+ lea -16(%rcx, %r11), %r9; \
+ cmp %r9, %r11; \
+ jb LABEL(strcmp_exitz); \
+ test %r9, %r9; \
+ je LABEL(strcmp_exitz); \
+ mov %r9, %r11
+
+# elif defined USE_AS_STRCASECMP_L
+# include "locale-defines.h"
+
+# define UPDATE_STRNCMP_COUNTER
+# elif defined USE_AS_STRNCASECMP_L
+# include "locale-defines.h"
+
+# define UPDATE_STRNCMP_COUNTER \
+ /* calculate the remaining count to compare */ \
+ lea -16(%rcx, %r11), %r9; \
+ cmp %r9, %r11; \
+ jb LABEL(strcmp_exitz); \
+ test %r9, %r9; \
+ je LABEL(strcmp_exitz); \
+ mov %r9, %r11
+# else
+# define UPDATE_STRNCMP_COUNTER
+# endif
+
+ .text
+# ifdef USE_AS_STRCASECMP_L
+# ifndef ENTRY2
+# define ENTRY2(name) ENTRY (name)
+# define END2(name) END (name)
+# endif
+
+ENTRY2 (STRCASECMP)
+ movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
+ mov %fs:(%rax),%RDX_LP
+
+ /* Either 1 or 5 bytes (depending on whether CET is enabled). */
+ .p2align 4
+END2 (STRCASECMP)
+ /* FALLTHROUGH to strcasecmp_l, with the __libc_tsd_LOCALE value in %rdx. */
+# elif defined USE_AS_STRNCASECMP_L
+# ifndef ENTRY2
+# define ENTRY2(name) ENTRY (name)
+# define END2(name) END (name)
+# endif
+
+ENTRY2 (STRCASECMP)
+ movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
+ mov %fs:(%rax),%RCX_LP
+
+ /* Either 1 or 5 bytes (depending on whether CET is enabled). */
+ .p2align 4
+END2 (STRCASECMP)
+ /* FALLTHROUGH to strncasecmp_l, with the __libc_tsd_LOCALE value in %rcx. */
+# endif
+
+ENTRY (STRCMP)
+# ifdef USE_AS_STRCASECMP_L
+ /* We have to fall back on the C implementation for locales
+ with encodings not matching ASCII for single bytes. The locale pointer is in %rdx here. */
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+ mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP
+# else
+ mov (%rdx), %RAX_LP
+# endif
+ testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
+ jne __strcasecmp_l_nonascii
+# elif defined USE_AS_STRNCASECMP_L
+ /* We have to fall back on the C implementation for locales
+ with encodings not matching ASCII for single bytes. The locale pointer is in %rcx here. */
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+ mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP
+# else
+ mov (%rcx), %RAX_LP
+# endif
+ testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
+ jne __strncasecmp_l_nonascii
+# endif
+
+/*
+ * This implementation uses SSE to compare up to 16 bytes at a time.
+ */
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ test %RDX_LP, %RDX_LP /* zero length: exit through strcmp_exitz */
+ je LABEL(strcmp_exitz)
+ cmp $1, %RDX_LP /* length of one: handled by Byte0 */
+ je LABEL(Byte0)
+ mov %RDX_LP, %R11_LP /* %r11 is the remaining-length counter */
+# endif
+ mov %esi, %ecx
+ mov %edi, %eax
+/* Use 64bit AND here to avoid long NOP padding. */
+ and $0x3f, %rcx /* rsi alignment in cache line */
+ and $0x3f, %rax /* rdi alignment in cache line */
+# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ .section .rodata.cst16,"aM",@progbits,16
+ .align 16
+.Llcase_min:
+ .quad 0x3f3f3f3f3f3f3f3f
+ .quad 0x3f3f3f3f3f3f3f3f
+.Llcase_max:
+ .quad 0x9999999999999999
+ .quad 0x9999999999999999
+.Lcase_add:
+ .quad 0x2020202020202020
+ .quad 0x2020202020202020
+ .previous
+ movdqa .Llcase_min(%rip), %xmm5
+# define LCASE_MIN_reg %xmm5
+ movdqa .Llcase_max(%rip), %xmm6
+# define LCASE_MAX_reg %xmm6
+ movdqa .Lcase_add(%rip), %xmm7
+# define CASE_ADD_reg %xmm7
+# endif
+ cmp $0x30, %ecx
+ ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */
+ cmp $0x30, %eax
+ ja LABEL(crosscache) /* rdi: 16-byte load will cross cache line */
+ movlpd (%rdi), %xmm1
+ movlpd (%rsi), %xmm2
+ movhpd 8(%rdi), %xmm1
+ movhpd 8(%rsi), %xmm2
+# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# define TOLOWER(reg1, reg2) \
+ movdqa LCASE_MIN_reg, %xmm8; \
+ movdqa LCASE_MIN_reg, %xmm9; \
+ paddb reg1, %xmm8; \
+ paddb reg2, %xmm9; \
+ pcmpgtb LCASE_MAX_reg, %xmm8; \
+ pcmpgtb LCASE_MAX_reg, %xmm9; \
+ pandn CASE_ADD_reg, %xmm8; \
+ pandn CASE_ADD_reg, %xmm9; \
+ paddb %xmm8, reg1; \
+ paddb %xmm9, reg2
+ TOLOWER (%xmm1, %xmm2)
+# else
+# define TOLOWER(reg1, reg2)
+# endif
+ pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */
+ pcmpeqb %xmm1, %xmm0 /* Any null chars? */
+ pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results: a byte keeps its top bit set only if equal and not null */
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* edx becomes 0 iff the mask was 0xffff, i.e. first 16 bytes are the same with no null char */
+ jnz LABEL(less16bytes) /* If not, find different value or null char */
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz) /* finish comparison */
+# endif
+ add $16, %rsi /* prepare to search next 16 bytes */
+ add $16, %rdi /* prepare to search next 16 bytes */
+
+ /*
+ * Determine source and destination string offsets from 16-byte alignment.
+ * Use the relative offset difference between the two to determine which
+ * case below to use.
+ */
+ .p2align 4
+LABEL(crosscache):
+ and $0xfffffffffffffff0, %rsi /* force %rsi to be 16-byte aligned */
+ and $0xfffffffffffffff0, %rdi /* force %rdi to be 16-byte aligned */
+ mov $0xffff, %edx /* for equivalent offset */
+ xor %r8d, %r8d
+ and $0xf, %ecx /* offset of rsi */
+ and $0xf, %eax /* offset of rdi */
+ cmp %eax, %ecx
+ je LABEL(ashr_0) /* rsi and rdi relative offset same */
+ ja LABEL(bigger)
+ mov %edx, %r8d /* r8d is offset flag for exit tail: non-zero means the operands were swapped */
+ xchg %ecx, %eax
+ xchg %rsi, %rdi
+LABEL(bigger):
+ lea 15(%rax), %r9
+ sub %rcx, %r9 /* r9 = 15 + rax - rcx: index into unaligned_table */
+ lea LABEL(unaligned_table)(%rip), %r10
+ movslq (%r10, %r9,4), %r9 /* table entries are 32-bit offsets relative to the table */
+ lea (%r10, %r9), %r10
+ _CET_NOTRACK jmp *%r10 /* jump to corresponding case */
+
+/*
+ * The following cases will be handled by ashr_0
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(0~15) n(0~15) 15(15+ n-n) ashr_0
+ */
+ .p2align 4
+LABEL(ashr_0):
+
+ movdqa (%rsi), %xmm1
+ pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */
+ pcmpeqb %xmm1, %xmm0 /* Any null chars? */
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */
+# else
+ movdqa (%rdi), %xmm2
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm2, %xmm1 /* compare 16 bytes for equality */
+# endif
+ psubb %xmm0, %xmm1 /* packed sub of comparison results */
+ pmovmskb %xmm1, %r9d
+ shr %cl, %edx /* adjust 0xffff for offset */
+ shr %cl, %r9d /* adjust for 16-byte offset */
+ sub %r9d, %edx
+ /*
+ * edx equals r9d (so the result is zero) only if the remaining
+ * (16 - rcx) bytes of both strings are equal and no null char was seen.
+ */
+ jne LABEL(less32bytes) /* mismatch or null char */
+ UPDATE_STRNCMP_COUNTER
+ mov $16, %rcx
+ mov $16, %r9
+ pxor %xmm0, %xmm0 /* clear xmm0, may have changed above */
+
+ /*
+ * Now both strings are aligned at a 16-byte boundary. Loop over the
+ * strings checking 32 bytes per iteration.
+ */
+ .p2align 4
+LABEL(loop_ashr_0):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit) /* mismatch or null char seen */
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+ add $16, %rcx
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit) /* mismatch or null char seen */
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+ add $16, %rcx
+ jmp LABEL(loop_ashr_0)
+
+/*
+ * The following cases will be handled by ashr_1
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(15) n -15 0(15 +(n-15) - n) ashr_1
+ */
+ .p2align 4
+LABEL(ashr_1):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0 /* Any null chars? */
+ pslldq $15, %xmm2 /* shift first string to align with second */
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results */
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx /* adjust 0xffff for offset */
+ shr %cl, %r9d /* adjust for 16-byte offset */
+ sub %r9d, %edx
+ jnz LABEL(less32bytes) /* mismatch or null char seen */
+ movdqa (%rdi), %xmm3
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $1, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Set up %r10 so that we can detect crossing a page boundary: it starts
+ * as ((%rdi + 1) & 0xfff) - 0x1000 and grows by 16 per 16-byte step.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 1(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+LABEL(loop_ashr_1):
+ add $16, %r10
+ jg LABEL(nibble_ashr_1) /* cross page boundary */
+
+LABEL(gobble_ashr_1):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4 /* store for next cycle */
+
+ psrldq $1, %xmm3
+ pslldq $15, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit) /* mismatch or null char seen */
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg LABEL(nibble_ashr_1) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4 /* store for next cycle */
+
+ psrldq $1, %xmm3
+ pslldq $15, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit) /* mismatch or null char seen */
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp LABEL(loop_ashr_1)
+
+ /*
+ * Nibble avoids loads across a page boundary. This is to avoid a potential
+ * access into unmapped memory.
+ */
+ .p2align 4
+LABEL(nibble_ashr_1):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xfffe, %edx /* byte 0 is excluded: the merge step shifts it out with psrldq $1 */
+ jnz LABEL(ashr_1_exittail) /* found null char */
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $15, %r11
+ jbe LABEL(ashr_1_exittail)
+# endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10 /* subtract 4K from %r10 */
+ jmp LABEL(gobble_ashr_1)
+
+ /*
+ * Once a null char is found, determine if there is a string mismatch
+ * before the null char.
+ */
+ .p2align 4
+LABEL(ashr_1_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $1, %xmm0
+ psrldq $1, %xmm3
+ jmp LABEL(aftertail)
+
+/*
+ * The following cases will be handled by ashr_2
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(14~15) n -14 1(15 +(n-14) - n) ashr_2
+ */
+ .p2align 4
+LABEL(ashr_2):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0 /* Any null chars? */
+ pslldq $14, %xmm2 /* shift first string to align with second */
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results */
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx /* adjust 0xffff for offset */
+ shr %cl, %r9d /* adjust for 16-byte offset */
+ sub %r9d, %edx
+ jnz LABEL(less32bytes) /* mismatch or null char seen */
+ movdqa (%rdi), %xmm3
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $2, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Set up %r10 so that we can detect crossing a page boundary: it starts
+ * as ((%rdi + 2) & 0xfff) - 0x1000 and grows by 16 per 16-byte step.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 2(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+LABEL(loop_ashr_2):
+ add $16, %r10
+ jg LABEL(nibble_ashr_2) /* cross page boundary */
+
+LABEL(gobble_ashr_2):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4 /* store for next cycle */
+
+ psrldq $2, %xmm3
+ pslldq $14, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit) /* mismatch or null char seen */
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg LABEL(nibble_ashr_2) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4 /* store for next cycle */
+
+ psrldq $2, %xmm3
+ pslldq $14, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit) /* mismatch or null char seen */
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp LABEL(loop_ashr_2)
+
+ .p2align 4
+LABEL(nibble_ashr_2):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xfffc, %edx /* bytes 0-1 are excluded: the merge step shifts them out with psrldq $2 */
+ jnz LABEL(ashr_2_exittail) /* found null char */
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $14, %r11
+ jbe LABEL(ashr_2_exittail)
+# endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10 /* subtract 4K from %r10 */
+ jmp LABEL(gobble_ashr_2)
+
+ .p2align 4
+LABEL(ashr_2_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $2, %xmm0
+ psrldq $2, %xmm3
+ jmp LABEL(aftertail)
+
+/*
+ * The following cases will be handled by ashr_3
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(13~15) n -13 2(15 +(n-13) - n) ashr_3
+ */
+ .p2align 4
+LABEL(ashr_3):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %x