diff options
| author | Noah Goldstein <goldstein.w.n@gmail.com> | 2022-04-14 11:47:37 -0500 |
|---|---|---|
| committer | Noah Goldstein <goldstein.w.n@gmail.com> | 2022-04-14 23:21:42 -0500 |
| commit | 41bfe224e5e2e23c8d8a0d3d45e66591373d3fd4 (patch) | |
| tree | ca15c1fa72963705a68a309cb2b3ac8c2f54c3d1 | |
| parent | e084ccd37ef6374962fb10d5f6479f55e1130d33 (diff) | |
| download | glibc-41bfe224e5e2e23c8d8a0d3d45e66591373d3fd4.tar.xz glibc-41bfe224e5e2e23c8d8a0d3d45e66591373d3fd4.zip | |
x86: Remove str{p}{n}cpy-ssse3
With SSE2, SSE4.1, AVX2, and EVEX versions available, very few targets
prefer SSSE3. As a result, it is no longer worth keeping the SSSE3
versions given their code size cost.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
| -rw-r--r-- | sysdeps/x86_64/multiarch/Makefile | 4 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/ifunc-impl-list.c | 8 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/stpcpy-ssse3.S | 3 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/stpncpy-ssse3.S | 4 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/strcpy-ssse3.S | 3550 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/strncpy-ssse3.S | 3 |
6 files changed, 0 insertions, 3572 deletions
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 2b3c625ea2..5b02ec8de5 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -46,13 +46,11 @@ sysdep_routines += \ stpcpy-evex \ stpcpy-sse2 \ stpcpy-sse2-unaligned \ - stpcpy-ssse3 \ stpncpy-avx2 \ stpncpy-avx2-rtm \ stpncpy-c \ stpncpy-evex \ stpncpy-sse2-unaligned \ - stpncpy-ssse3 \ strcasecmp_l-avx2 \ strcasecmp_l-avx2-rtm \ strcasecmp_l-evex \ @@ -83,7 +81,6 @@ sysdep_routines += \ strcpy-evex \ strcpy-sse2 \ strcpy-sse2-unaligned \ - strcpy-ssse3 \ strcspn-c \ strcspn-sse2 \ strlen-avx2 \ @@ -110,7 +107,6 @@ sysdep_routines += \ strncpy-c \ strncpy-evex \ strncpy-sse2-unaligned \ - strncpy-ssse3 \ strnlen-avx2 \ strnlen-avx2-rtm \ strnlen-evex \ diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 41a04621ad..49ce6860d0 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -399,8 +399,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/stpncpy.c. */ IFUNC_IMPL (i, name, stpncpy, - IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (SSSE3), - __stpncpy_ssse3) IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (AVX2), __stpncpy_avx2) IFUNC_IMPL_ADD (array, i, stpncpy, @@ -417,8 +415,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/stpcpy.c. 
*/ IFUNC_IMPL (i, name, stpcpy, - IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (SSSE3), - __stpcpy_ssse3) IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (AVX2), __stpcpy_avx2) IFUNC_IMPL_ADD (array, i, stpcpy, @@ -567,8 +563,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW)), __strcpy_evex) - IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (SSSE3), - __strcpy_ssse3) IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2)) @@ -644,8 +638,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW)), __strncpy_evex) - IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (SSSE3), - __strncpy_ssse3) IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2)) diff --git a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S b/sysdeps/x86_64/multiarch/stpcpy-ssse3.S deleted file mode 100644 index d971c2da38..0000000000 --- a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S +++ /dev/null @@ -1,3 +0,0 @@ -#define USE_AS_STPCPY -#define STRCPY __stpcpy_ssse3 -#include "strcpy-ssse3.S" diff --git a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S b/sysdeps/x86_64/multiarch/stpncpy-ssse3.S deleted file mode 100644 index 14ed16f6b5..0000000000 --- a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S +++ /dev/null @@ -1,4 +0,0 @@ -#define USE_AS_STPCPY -#define USE_AS_STRNCPY -#define STRCPY __stpncpy_ssse3 -#include "strcpy-ssse3.S" diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S deleted file mode 100644 index f617a535cf..0000000000 --- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S +++ /dev/null @@ -1,3550 +0,0 @@ -/* strcpy with SSSE3 - Copyright (C) 2011-2022 Free Software Foundation, Inc. - This file is part of the GNU C Library. 
- - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. */ - -#if IS_IN (libc) - -# ifndef USE_AS_STRCAT -# include <sysdep.h> - -# ifndef STRCPY -# define STRCPY __strcpy_ssse3 -# endif - - .section .text.ssse3,"ax",@progbits -ENTRY (STRCPY) - - mov %rsi, %rcx -# ifdef USE_AS_STRNCPY - mov %RDX_LP, %R8_LP -# endif - mov %rdi, %rdx -# ifdef USE_AS_STRNCPY - test %R8_LP, %R8_LP - jz L(Exit0) - cmp $8, %R8_LP - jbe L(StrncpyExit8Bytes) -# endif - cmpb $0, (%rcx) - jz L(Exit1) - cmpb $0, 1(%rcx) - jz L(Exit2) - cmpb $0, 2(%rcx) - jz L(Exit3) - cmpb $0, 3(%rcx) - jz L(Exit4) - cmpb $0, 4(%rcx) - jz L(Exit5) - cmpb $0, 5(%rcx) - jz L(Exit6) - cmpb $0, 6(%rcx) - jz L(Exit7) - cmpb $0, 7(%rcx) - jz L(Exit8) -# ifdef USE_AS_STRNCPY - cmp $16, %r8 - jb L(StrncpyExit15Bytes) -# endif - cmpb $0, 8(%rcx) - jz L(Exit9) - cmpb $0, 9(%rcx) - jz L(Exit10) - cmpb $0, 10(%rcx) - jz L(Exit11) - cmpb $0, 11(%rcx) - jz L(Exit12) - cmpb $0, 12(%rcx) - jz L(Exit13) - cmpb $0, 13(%rcx) - jz L(Exit14) - cmpb $0, 14(%rcx) - jz L(Exit15) -# ifdef USE_AS_STRNCPY - cmp $16, %r8 - je L(Exit16) -# endif - cmpb $0, 15(%rcx) - jz L(Exit16) -# endif - -# ifdef USE_AS_STRNCPY - mov %rcx, %rsi - sub $16, %r8 - and $0xf, %rsi - -/* add 16 bytes rcx_offset to r8 */ - - add %rsi, %r8 -# endif - lea 16(%rcx), %rsi - and $-16, %rsi - pxor %xmm0, %xmm0 - mov (%rcx), %r9 - mov %r9, (%rdx) - pcmpeqb 
(%rsi), %xmm0 - mov 8(%rcx), %r9 - mov %r9, 8(%rdx) - -/* convert byte mask in xmm0 to bit mask */ - - pmovmskb %xmm0, %rax - sub %rcx, %rsi - -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - mov %rdx, %rax - lea 16(%rdx), %rdx - and $-16, %rdx - sub %rdx, %rax - -# ifdef USE_AS_STRNCPY - add %rax, %rsi - lea -1(%rsi), %rsi - and $1<<31, %esi - test %rsi, %rsi - jnz L(ContinueCopy) - lea 16(%r8), %r8 - -L(ContinueCopy): -# endif - sub %rax, %rcx - mov %rcx, %rax - and $0xf, %rax - mov $0, %rsi - -/* case: rcx_offset == rdx_offset */ - - jz L(Align16Both) - - cmp $8, %rax - jae L(ShlHigh8) - cmp $1, %rax - je L(Shl1) - cmp $2, %rax - je L(Shl2) - cmp $3, %rax - je L(Shl3) - cmp $4, %rax - je L(Shl4) - cmp $5, %rax - je L(Shl5) - cmp $6, %rax - je L(Shl6) - jmp L(Shl7) - -L(ShlHigh8): - je L(Shl8) - cmp $9, %rax - je L(Shl9) - cmp $10, %rax - je L(Shl10) - cmp $11, %rax - je L(Shl11) - cmp $12, %rax - je L(Shl12) - cmp $13, %rax - je L(Shl13) - cmp $14, %rax - je L(Shl14) - jmp L(Shl15) - -L(Align16Both): - movaps (%rcx), %xmm1 - movaps 16(%rcx), %xmm2 - movaps %xmm1, (%rdx) - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - movaps 16(%rcx, %rsi), %xmm3 - movaps %xmm2, (%rdx, %rsi) - pcmpeqb %xmm3, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - movaps 16(%rcx, %rsi), %xmm4 - movaps %xmm3, (%rdx, %rsi) - pcmpeqb %xmm4, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - movaps 16(%rcx, %rsi), %xmm1 - movaps %xmm4, (%rdx, %rsi) - pcmpeqb %xmm1, %xmm0 - 
pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - movaps 16(%rcx, %rsi), %xmm2 - movaps %xmm1, (%rdx, %rsi) - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - movaps 16(%rcx, %rsi), %xmm3 - movaps %xmm2, (%rdx, %rsi) - pcmpeqb %xmm3, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - movaps %xmm3, (%rdx, %rsi) - mov %rcx, %rax - lea 16(%rcx, %rsi), %rcx - and $-0x40, %rcx - sub %rcx, %rax - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - lea 112(%r8, %rax), %r8 -# endif - mov $-0x40, %rsi - - .p2align 4 -L(Aligned64Loop): - movaps (%rcx), %xmm2 - movaps %xmm2, %xmm4 - movaps 16(%rcx), %xmm5 - movaps 32(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 48(%rcx), %xmm7 - pminub %xmm5, %xmm2 - pminub %xmm7, %xmm3 - pminub %xmm2, %xmm3 - pcmpeqb %xmm0, %xmm3 - pmovmskb %xmm3, %rax - lea 64(%rdx), %rdx - lea 64(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeaveCase2OrCase3) -# endif - test %rax, %rax - jnz L(Aligned64Leave) - movaps %xmm4, -64(%rdx) - movaps %xmm5, -48(%rdx) - movaps %xmm6, -32(%rdx) - movaps %xmm7, -16(%rdx) - jmp L(Aligned64Loop) - -L(Aligned64Leave): -# ifdef USE_AS_STRNCPY - lea 48(%r8), %r8 -# endif - pcmpeqb %xmm4, %xmm0 - pmovmskb %xmm0, %rax - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - pcmpeqb %xmm5, %xmm0 -# ifdef USE_AS_STRNCPY - lea -16(%r8), %r8 -# endif - pmovmskb %xmm0, %rax - movaps %xmm4, -64(%rdx) - test %rax, %rax - lea 16(%rsi), %rsi - jnz L(CopyFrom1To16Bytes) - - pcmpeqb %xmm6, %xmm0 -# ifdef USE_AS_STRNCPY - lea -16(%r8), %r8 -# endif - pmovmskb %xmm0, %rax - movaps %xmm5, -48(%rdx) - test %rax, %rax - lea 
16(%rsi), %rsi - jnz L(CopyFrom1To16Bytes) - - movaps %xmm6, -32(%rdx) - pcmpeqb %xmm7, %xmm0 -# ifdef USE_AS_STRNCPY - lea -16(%r8), %r8 -# endif - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl1): - movaps -1(%rcx), %xmm1 - movaps 15(%rcx), %xmm2 -L(Shl1Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit1Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl1LoopExit) - - palignr $1, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 31(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit1Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl1LoopExit) - - palignr $1, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 31(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit1Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl1LoopExit) - - palignr $1, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 31(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit1Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl1LoopExit) - - palignr $1, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 31(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -15(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -1(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl1LoopStart): - movaps 15(%rcx), %xmm2 - movaps 31(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 47(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 63(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, 
%xmm7 - palignr $1, %xmm4, %xmm5 - test %rax, %rax - palignr $1, %xmm3, %xmm4 - jnz L(Shl1Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave1) -# endif - palignr $1, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $1, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl1LoopStart) - -L(Shl1LoopExit): - movdqu -1(%rcx), %xmm1 - mov $15, %rsi - movdqu %xmm1, -1(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl2): - movaps -2(%rcx), %xmm1 - movaps 14(%rcx), %xmm2 -L(Shl2Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit2Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl2LoopExit) - - palignr $2, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 30(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit2Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl2LoopExit) - - palignr $2, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 30(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit2Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl2LoopExit) - - palignr $2, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 30(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit2Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl2LoopExit) - - palignr $2, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 30(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -14(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -2(%rcx), %xmm1 - -/* 64 bytes loop */ - 
.p2align 4 -L(Shl2LoopStart): - movaps 14(%rcx), %xmm2 - movaps 30(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 46(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 62(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $2, %xmm4, %xmm5 - test %rax, %rax - palignr $2, %xmm3, %xmm4 - jnz L(Shl2Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave2) -# endif - palignr $2, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $2, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl2LoopStart) - -L(Shl2LoopExit): - movdqu -2(%rcx), %xmm1 - mov $14, %rsi - movdqu %xmm1, -2(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl3): - movaps -3(%rcx), %xmm1 - movaps 13(%rcx), %xmm2 -L(Shl3Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit3Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl3LoopExit) - - palignr $3, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 29(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit3Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl3LoopExit) - - palignr $3, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 29(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit3Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl3LoopExit) - - palignr $3, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 29(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit3Case2OrCase3) -# endif - test %rax, %rax - jnz 
L(Shl3LoopExit) - - palignr $3, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 29(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -13(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -3(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl3LoopStart): - movaps 13(%rcx), %xmm2 - movaps 29(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 45(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 61(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $3, %xmm4, %xmm5 - test %rax, %rax - palignr $3, %xmm3, %xmm4 - jnz L(Shl3Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave3) -# endif - palignr $3, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $3, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl3LoopStart) - -L(Shl3LoopExit): - movdqu -3(%rcx), %xmm1 - mov $13, %rsi - movdqu %xmm1, -3(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl4): - movaps -4(%rcx), %xmm1 - movaps 12(%rcx), %xmm2 -L(Shl4Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit4Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl4LoopExit) - - palignr $4, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 28(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit4Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl4LoopExit) - - palignr $4, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 28(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit4Case2OrCase3) -# endif - test 
%rax, %rax - jnz L(Shl4LoopExit) - - palignr $4, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 28(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit4Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl4LoopExit) - - palignr $4, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 28(%rcx), %rcx - lea 16(%rdx), %rdx - |
