| author | Noah Goldstein <goldstein.w.n@gmail.com> | 2022-07-12 12:28:06 -0700 |
|---|---|---|
| committer | Noah Goldstein <goldstein.w.n@gmail.com> | 2022-07-13 14:55:31 -0700 |
| commit | d561fbb041fe6aa205f652aecefe4bb84fd124a5 | |
| tree | 4fa103748eb51664285b31d4186ee06b6bea35b2 | |
| parent | 30e57e0a21cc50eead14f729a29a5604a6b23e18 | |
| download | glibc-d561fbb041fe6aa205f652aecefe4bb84fd124a5.tar.xz, glibc-d561fbb041fe6aa205f652aecefe4bb84fd124a5.zip | |
x86: Move strcmp SSE2 implementation to multiarch/strcmp-sse2.S
This commit doesn't affect libc.so.6; it's just housekeeping to prepare
for adding explicit ISA level support.

Because strcmp-sse2.S implements so many functions (more so for the
avx2/evex/sse42 versions), add a new file 'strcmp-naming.h' to assist in
getting the correct symbol name for all the functions across
multiarch/non-multiarch builds.
Tested build on x86_64 and x86_32 with/without multiarch.
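
As context for the diff below, here is a minimal, hypothetical sketch (not part of the commit) of how the STRCMP_NAME/STRCMP_SUFFIX macros from the new strcmp-naming.h compose a symbol name for one configuration; the printf/stringize wrapper and the hard-coded defines are illustration only, standing in for what sysdep.h and the per-file USE_AS_* defines normally provide.

```c
#include <stdio.h>

/* Token-pasting helpers, as defined in strcmp-naming.h.  */
#define STRCMP_SUFFIX(x, y) x##y
#define STRCMP_NAME(x, y)   STRCMP_SUFFIX (x, y)

/* Pretend configuration: a multiarch libc build of strncase_l-sse2.S,
   which defines USE_AS_STRNCASECMP_L and sets STRCMP_ISA to _sse2.  */
#define STRCMP_ISA        _sse2
#define STRNCASECMP_BASE  __strncasecmp
#define STRCMP_SYM        STRCMP_NAME (STRNCASECMP_BASE, _l)
#define STRCMP            STRCMP_NAME (STRCMP_SYM, STRCMP_ISA)

/* Stringize only so the result can be printed.  */
#define XSTR(x) #x
#define STR(x)  XSTR (x)

int main (void)
{
  /* Prints "__strncasecmp_l_sse2".  In a non-multiarch or rtld build the
     header instead makes STRCMP_ISA empty and uses the unprefixed base
     names, so the same STRCMP macro resolves to the public symbol.  */
  puts (STR (STRCMP));
  return 0;
}
```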
| mode | path | lines changed |
|---|---|---|
| -rw-r--r-- | sysdeps/x86_64/multiarch/rtld-strcmp.S | 18 |
| -rw-r--r-- | sysdeps/x86_64/multiarch/rtld-strncmp.S | 18 |
| -rw-r--r-- | sysdeps/x86_64/multiarch/strcasecmp_l-sse2.S | 5 |
| -rw-r--r-- | sysdeps/x86_64/multiarch/strcmp-naming.h | 68 |
| -rw-r--r-- | sysdeps/x86_64/multiarch/strcmp-sse2.S | 2140 |
| -rw-r--r-- | sysdeps/x86_64/multiarch/strncase_l-sse2.S | 5 |
| -rw-r--r-- | sysdeps/x86_64/multiarch/strncmp-sse2.S | 12 |
| -rw-r--r-- | sysdeps/x86_64/strcasecmp_l.S | 11 |
| -rw-r--r-- | sysdeps/x86_64/strcmp.S | 2147 |
| -rw-r--r-- | sysdeps/x86_64/strncase_l.S | 11 |
| -rw-r--r-- | sysdeps/x86_64/strncmp.S | 7 |

11 files changed, 2264 insertions(+), 2178 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/rtld-strcmp.S b/sysdeps/x86_64/multiarch/rtld-strcmp.S
new file mode 100644
index 0000000000..207078bdcc
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rtld-strcmp.S
@@ -0,0 +1,18 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>. */
+
+#include "../strcmp.S"
diff --git a/sysdeps/x86_64/multiarch/rtld-strncmp.S b/sysdeps/x86_64/multiarch/rtld-strncmp.S
new file mode 100644
index 0000000000..ac32150406
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rtld-strncmp.S
@@ -0,0 +1,18 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>. */
+
+#include "../strncmp.S"
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-sse2.S b/sysdeps/x86_64/multiarch/strcasecmp_l-sse2.S
index 2360d104dd..a2b5741399 100644
--- a/sysdeps/x86_64/multiarch/strcasecmp_l-sse2.S
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l-sse2.S
@@ -16,8 +16,5 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>. */
 
-#define STRCMP __strcasecmp_l_sse2
 #define USE_AS_STRCASECMP_L
-#define NO_NOLOCALE_ALIAS
-#define __strcasecmp __strcasecmp_sse2
-#include <sysdeps/x86_64/strcmp.S>
+#include "strcmp-sse2.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp-naming.h b/sysdeps/x86_64/multiarch/strcmp-naming.h
new file mode 100644
index 0000000000..6a7529b6a4
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcmp-naming.h
@@ -0,0 +1,68 @@
+#ifndef _STRCMP_NAMING_H_
+#define _STRCMP_NAMING_H_
+
+/* Utility macros.  */
+#define STRCMP_SUFFIX(x, y) x##y
+#define STRCMP_NAME(x, y) STRCMP_SUFFIX (x, y)
+
+/* Setup base of all definitions.  */
+#define STRNCASECMP_BASE __strncasecmp
+#define STRCASECMP_BASE __strcasecmp
+#define WCSCMP_BASE __wcscmp
+
+#if defined USE_MULTIARCH && IS_IN (libc)
+# define WCSNCMP_BASE __wcsncmp
+# define STRNCMP_BASE __strncmp
+# define STRCMP_BASE __strcmp
+
+#else
+/* Covers IS_IN (rtld) or non-multiarch build. */
+# define WCSNCMP_BASE wcsncmp
+# define STRNCMP_BASE strncmp
+# define STRCMP_BASE strcmp
+
+# undef STRCMP_ISA
+# define STRCMP_ISA
+#endif
+
+#if IS_IN (rtld) || defined USE_MULTIARCH
+# define ISA_HIDDEN_JUMPTARGET(...)
__VA_ARGS__ +#else +# define ISA_HIDDEN_JUMPTARGET(...) HIDDEN_JUMPTARGET (__VA_ARGS__) +#endif + +/* Get correct symbol for OVERFLOW_STRCMP, STRCMP, and + STRCASECMP. */ +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + +# if defined USE_AS_WCSCMP || defined USE_AS_WCSNCMP +# define OVERFLOW_STRCMP_SYM WCSCMP_BASE +# define STRCMP_SYM WCSNCMP_BASE +# elif defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# define OVERFLOW_STRCMP_SYM STRCMP_NAME (STRCASECMP_BASE, _l) +# define STRCMP_SYM STRCMP_NAME (STRNCASECMP_BASE, _l) +# else +# define OVERFLOW_STRCMP_SYM STRCMP_BASE +# define STRCMP_SYM STRNCMP_BASE +# endif + +# define STRCASECMP_SYM STRNCASECMP_BASE +# define OVERFLOW_STRCMP \ + ISA_HIDDEN_JUMPTARGET (STRCMP_NAME (OVERFLOW_STRCMP_SYM, STRCMP_ISA)) +#else +# ifdef USE_AS_WCSCMP +# define STRCMP_SYM WCSCMP_BASE +# elif defined USE_AS_STRCASECMP_L +# define STRCMP_SYM STRCMP_NAME (STRCASECMP_BASE, _l) +# else +# define STRCMP_SYM STRCMP_BASE +# endif + +# define STRCASECMP_SYM STRCASECMP_BASE +#endif + +#define STRCASECMP_L_NONASCII STRCMP_NAME (STRCASECMP_SYM, _l_nonascii) +#define STRCASECMP STRCMP_NAME (STRCASECMP_SYM, STRCMP_ISA) +#define STRCMP STRCMP_NAME (STRCMP_SYM, STRCMP_ISA) + +#endif diff --git a/sysdeps/x86_64/multiarch/strcmp-sse2.S b/sysdeps/x86_64/multiarch/strcmp-sse2.S index b8f95e59cf..b1220231ab 100644 --- a/sysdeps/x86_64/multiarch/strcmp-sse2.S +++ b/sysdeps/x86_64/multiarch/strcmp-sse2.S @@ -16,13 +16,2141 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ -#if IS_IN (libc) +#if IS_IN (libc) || IS_IN (rtld) + +# define STRCMP_ISA _sse2 +# include "strcmp-naming.h" + # include <sysdep.h> -# define STRCMP __strcmp_sse2 +# undef UPDATE_STRNCMP_COUNTER -# undef libc_hidden_builtin_def -# define libc_hidden_builtin_def(strcmp) -#endif +# ifndef LABEL +# define LABEL(l) L(l) +# endif + +# ifdef USE_AS_STRNCMP +/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz + if the new counter > the old one or is 0. */ +# define UPDATE_STRNCMP_COUNTER \ + /* calculate left number to compare */ \ + lea -16(%rcx, %r11), %r9; \ + cmp %r9, %r11; \ + jb LABEL(strcmp_exitz); \ + test %r9, %r9; \ + je LABEL(strcmp_exitz); \ + mov %r9, %r11 + +# elif defined USE_AS_STRCASECMP_L +# include "locale-defines.h" + +# define UPDATE_STRNCMP_COUNTER +# elif defined USE_AS_STRNCASECMP_L +# include "locale-defines.h" + +# define UPDATE_STRNCMP_COUNTER \ + /* calculate left number to compare */ \ + lea -16(%rcx, %r11), %r9; \ + cmp %r9, %r11; \ + jb LABEL(strcmp_exitz); \ + test %r9, %r9; \ + je LABEL(strcmp_exitz); \ + mov %r9, %r11 +# else +# define UPDATE_STRNCMP_COUNTER +# endif + + .text +# ifdef USE_AS_STRCASECMP_L +# ifndef ENTRY2 +# define ENTRY2(name) ENTRY (name) +# define END2(name) END (name) +# endif + +ENTRY2 (STRCASECMP) + movq __libc_tsd_LOCALE@gottpoff(%rip),%rax + mov %fs:(%rax),%RDX_LP + + /* Either 1 or 5 bytes (dependeing if CET is enabled). */ + .p2align 4 +END2 (STRCASECMP) + /* FALLTHROUGH to strcasecmp_l. */ +# elif defined USE_AS_STRNCASECMP_L +# ifndef ENTRY2 +# define ENTRY2(name) ENTRY (name) +# define END2(name) END (name) +# endif + +ENTRY2 (STRCASECMP) + movq __libc_tsd_LOCALE@gottpoff(%rip),%rax + mov %fs:(%rax),%RCX_LP + + /* Either 1 or 5 bytes (dependeing if CET is enabled). */ + .p2align 4 +END2 (STRCASECMP) + /* FALLTHROUGH to strncasecmp_l. 
*/ +# endif + +ENTRY (STRCMP) +# ifdef USE_AS_STRCASECMP_L + /* We have to fall back on the C implementation for locales + with encodings not matching ASCII for single bytes. */ +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 + mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP +# else + mov (%rdx), %RAX_LP +# endif + testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) + jne __strcasecmp_l_nonascii +# elif defined USE_AS_STRNCASECMP_L + /* We have to fall back on the C implementation for locales + with encodings not matching ASCII for single bytes. */ +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 + mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP +# else + mov (%rcx), %RAX_LP +# endif + testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax) + jne __strncasecmp_l_nonascii +# endif + +/* + * This implementation uses SSE to compare up to 16 bytes at a time. + */ +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + test %RDX_LP, %RDX_LP + je LABEL(strcmp_exitz) + cmp $1, %RDX_LP + je LABEL(Byte0) + mov %RDX_LP, %R11_LP +# endif + mov %esi, %ecx + mov %edi, %eax +/* Use 64bit AND here to avoid long NOP padding. */ + and $0x3f, %rcx /* rsi alignment in cache line */ + and $0x3f, %rax /* rdi alignment in cache line */ +# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + .section .rodata.cst16,"aM",@progbits,16 + .align 16 +.Llcase_min: + .quad 0x3f3f3f3f3f3f3f3f + .quad 0x3f3f3f3f3f3f3f3f +.Llcase_max: + .quad 0x9999999999999999 + .quad 0x9999999999999999 +.Lcase_add: + .quad 0x2020202020202020 + .quad 0x2020202020202020 + .previous + movdqa .Llcase_min(%rip), %xmm5 +# define LCASE_MIN_reg %xmm5 + movdqa .Llcase_max(%rip), %xmm6 +# define LCASE_MAX_reg %xmm6 + movdqa .Lcase_add(%rip), %xmm7 +# define CASE_ADD_reg %xmm7 +# endif + cmp $0x30, %ecx + ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */ + cmp $0x30, %eax + ja LABEL(crosscache) /* rdi: 16-byte load will cross cache line */ + movlpd (%rdi), %xmm1 + movlpd (%rsi), %xmm2 + movhpd 8(%rdi), %xmm1 + movhpd 8(%rsi), %xmm2 +# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +# define TOLOWER(reg1, reg2) \ + movdqa LCASE_MIN_reg, %xmm8; \ + movdqa LCASE_MIN_reg, %xmm9; \ + paddb reg1, %xmm8; \ + paddb reg2, %xmm9; \ + pcmpgtb LCASE_MAX_reg, %xmm8; \ + pcmpgtb LCASE_MAX_reg, %xmm9; \ + pandn CASE_ADD_reg, %xmm8; \ + pandn CASE_ADD_reg, %xmm9; \ + paddb %xmm8, reg1; \ + paddb %xmm9, reg2 + TOLOWER (%xmm1, %xmm2) +# else +# define TOLOWER(reg1, reg2) +# endif + pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */ + pcmpeqb %xmm1, %xmm0 /* Any null chars? */ + pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */ + jnz LABEL(less16bytes) /* If not, find different value or null char */ +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) /* finish comparision */ +# endif + add $16, %rsi /* prepare to search next 16 bytes */ + add $16, %rdi /* prepare to search next 16 bytes */ + + /* + * Determine source and destination string offsets from 16-byte alignment. + * Use relative offset difference between the two to determine which case + * below to use. 
+ */ + .p2align 4 +LABEL(crosscache): + and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */ + and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */ + mov $0xffff, %edx /* for equivalent offset */ + xor %r8d, %r8d + and $0xf, %ecx /* offset of rsi */ + and $0xf, %eax /* offset of rdi */ + cmp %eax, %ecx + je LABEL(ashr_0) /* rsi and rdi relative offset same */ + ja LABEL(bigger) + mov %edx, %r8d /* r8d is offset flag for exit tail */ + xchg %ecx, %eax + xchg %rsi, %rdi +LABEL(bigger): + lea 15(%rax), %r9 + sub %rcx, %r9 + lea LABEL(unaligned_table)(%rip), %r10 + movslq (%r10, %r9,4), %r9 + lea (%r10, %r9), %r10 + _CET_NOTRACK jmp *%r10 /* jump to corresponding case */ + +/* + * The following cases will be handled by ashr_0 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(0~15) n(0~15) 15(15+ n-n) ashr_0 + */ + .p2align 4 +LABEL(ashr_0): + + movdqa (%rsi), %xmm1 + pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */ + pcmpeqb %xmm1, %xmm0 /* Any null chars? */ +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */ +# else + movdqa (%rdi), %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm2, %xmm1 /* compare 16 bytes for equality */ +# endif + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %r9d + shr %cl, %edx /* adjust 0xffff for offset */ + shr %cl, %r9d /* adjust for 16-byte offset */ + sub %r9d, %edx + /* + * edx must be the same with r9d if in left byte (16-rcx) is equal to + * the start from (16-rax) and no null char was seen. + */ + jne LABEL(less32bytes) /* mismatch or null char */ + UPDATE_STRNCMP_COUNTER + mov $16, %rcx + mov $16, %r9 + pxor %xmm0, %xmm0 /* clear xmm0, may have changed above */ + + /* + * Now both strings are aligned at 16-byte boundary. Loop over strings + * checking 32-bytes per iteration. + */ + .p2align 4 +LABEL(loop_ashr_0): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) /* mismatch or null char seen */ + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + add $16, %rcx + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + add $16, %rcx + jmp LABEL(loop_ashr_0) + +/* + * The following cases will be handled by ashr_1 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(15) n -15 0(15 +(n-15) - n) ashr_1 + */ + .p2align 4 +LABEL(ashr_1): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 /* Any null chars? 
*/ + pslldq $15, %xmm2 /* shift first string to align with second */ + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %r9d + shr %cl, %edx /* adjust 0xffff for offset */ + shr %cl, %r9d /* adjust for 16-byte offset */ + sub %r9d, %edx + jnz LABEL(less32bytes) /* mismatch or null char seen */ + movdqa (%rdi), %xmm3 + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads*/ + mov $1, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 1(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_1): + add $16, %r10 + jg LABEL(nibble_ashr_1) /* cross page boundary */ + +LABEL(gobble_ashr_1): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 /* store for next cycle */ + + psrldq $1, %xmm3 + pslldq $15, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ + + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_1) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 /* store for next cycle */ + + psrldq $1, %xmm3 + pslldq $15, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ + + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_1) + + /* + * Nibble avoids loads across page boundary. This is to avoid a potential + * access into unmapped memory. + */ + .p2align 4 +LABEL(nibble_ashr_1): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char*/ + pmovmskb %xmm0, %edx + test $0xfffe, %edx + jnz LABEL(ashr_1_exittail) /* find null char*/ + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $15, %r11 + jbe LABEL(ashr_1_exittail) +# endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 /* substract 4K from %r10 */ + jmp LABEL(gobble_ashr_1) + + /* + * Once find null char, determine if there is a string mismatch + * before the null char. 
+ */ + .p2align 4 +LABEL(ashr_1_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $1, %xmm0 + psrldq $1, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_2 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(14~15) n -14 1(15 +(n-14) - n) ashr_2 + */ + .p2align 4 +LABEL(ashr_2): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $14, %xmm2 + TOLOWER (%xmm1, %xmm2) + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz LABEL(less32bytes) + movdqa (%rdi), %xmm3 + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $2, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. + */ + lea 2(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +LABEL(loop_ashr_2): + add $16, %r10 + jg LABEL(nibble_ashr_2) + +LABEL(gobble_ashr_2): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $2, %xmm3 + pslldq $14, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ + + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg LABEL(nibble_ashr_2) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + psrldq $2, %xmm3 + pslldq $14, %xmm2 + por %xmm3, %xmm2 /* merge into one 16byte value */ + + TOLOWER (%xmm1, %xmm2) + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz LABEL(exit) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + sub $16, %r11 + jbe LABEL(strcmp_exitz) +# endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp LABEL(loop_ashr_2) + + .p2align 4 +LABEL(nibble_ashr_2): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xfffc, %edx + jnz LABEL(ashr_2_exittail) + +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp $14, %r11 + jbe LABEL(ashr_2_exittail) +# endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp LABEL(gobble_ashr_2) + + .p2align 4 +LABEL(ashr_2_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $2, %xmm0 + psrldq $2, %xmm3 + jmp LABEL(aftertail) + +/* + * The following cases will be handled by ashr_3 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(13~15) n -13 2(15 +(n-13) - n) ashr_3 + */ + .p2align 4 +LABEL(ashr_3): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %x |
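
The strcasecmp/strncasecmp paths above fold both inputs to lowercase with the TOLOWER macro before comparing. Below is a small, self-contained sketch (not taken from the commit) of that SSE2 trick using intrinsics; the constants mirror the .Llcase_min/.Llcase_max/.Lcase_add values in the diff, while the function and variable names are made up for illustration.

```c
#include <emmintrin.h>
#include <stdio.h>

/* Lowercase the ASCII uppercase bytes in a 16-byte vector, the same way
   TOLOWER in strcmp-sse2.S does with paddb/pcmpgtb/pandn/paddb.  */
static __m128i tolower16 (__m128i v)
{
  const __m128i lcase_min = _mm_set1_epi8 (0x3f);          /* .Llcase_min */
  const __m128i lcase_max = _mm_set1_epi8 ((char) 0x99);   /* .Llcase_max */
  const __m128i case_add  = _mm_set1_epi8 (0x20);          /* .Lcase_add  */

  /* Bias each byte so 'A'..'Z' land in the signed range 0x80..0x99,
     i.e. at or below lcase_max.  */
  __m128i biased = _mm_add_epi8 (v, lcase_min);
  /* Signed compare: 0xff for every byte that is NOT uppercase ASCII.  */
  __m128i not_upper = _mm_cmpgt_epi8 (biased, lcase_max);
  /* pandn keeps 0x20 only in the uppercase lanes...  */
  __m128i delta = _mm_andnot_si128 (not_upper, case_add);
  /* ...and adding it converts those bytes to lowercase.  */
  return _mm_add_epi8 (v, delta);
}

int main (void)
{
  char buf[17] = "Hello, WORLD! 42";
  __m128i v = _mm_loadu_si128 ((const __m128i *) buf);
  _mm_storeu_si128 ((__m128i *) buf, tolower16 (v));
  printf ("%s\n", buf);   /* hello, world! 42 */
  return 0;
}
```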

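The loop_ashr_* loops in the diff also track a page-offset counter in %r10 so that a misaligned 16-byte load is never issued past the end of a 4K page; when the counter turns positive they detour through the nibble_ashr_* tail. A rough standalone model of that bookkeeping, with made-up values and names purely for illustration, might look like this:

```c
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 0x1000
#define VEC_SIZE  16

int main (void)
{
  uintptr_t src = 0x7ffff7a20ff1;   /* hypothetical string address */
  int shift = 1;                    /* the ashr_1 case */

  /* lea shift(%rdi), %r10; and $0xfff, %r10; sub $0x1000, %r10 */
  long r10 = (long) ((src + shift) & (PAGE_SIZE - 1)) - PAGE_SIZE;

  for (int iter = 0; iter < 6; iter++)
    {
      r10 += VEC_SIZE;              /* add $16, %r10 at the top of the loop */
      if (r10 > 0)
        {
          /* jg nibble_ashr_N: the next 16-byte chunk would cross into the
             following page, so compare byte-wise first, then rebase the
             counter (sub $0x1000, %r10) and continue.  */
          printf ("iteration %d: page cross, take nibble path\n", iter);
          r10 -= PAGE_SIZE;
          continue;
        }
      printf ("iteration %d: safe to load 16 bytes\n", iter);
    }
  return 0;
}
```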