about summary refs log tree commit diff
diff options
context:
space:
mode:
author Noah Goldstein <goldstein.w.n@gmail.com> 2022-04-14 11:47:37 -0500
committer Noah Goldstein <goldstein.w.n@gmail.com> 2022-04-14 23:21:42 -0500
commit 41bfe224e5e2e23c8d8a0d3d45e66591373d3fd4 (patch)
tree ca15c1fa72963705a68a309cb2b3ac8c2f54c3d1
parent e084ccd37ef6374962fb10d5f6479f55e1130d33 (diff)
download glibc-41bfe224e5e2e23c8d8a0d3d45e66591373d3fd4.tar.xz
glibc-41bfe224e5e2e23c8d8a0d3d45e66591373d3fd4.zip
x86: Remove str{p}{n}cpy-ssse3
With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer
SSSE3. As a result it is no longer worth it to keep the SSSE3
versions given the code size cost.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
-rw-r--r-- sysdeps/x86_64/multiarch/Makefile 4
-rw-r--r-- sysdeps/x86_64/multiarch/ifunc-impl-list.c 8
-rw-r--r-- sysdeps/x86_64/multiarch/stpcpy-ssse3.S 3
-rw-r--r-- sysdeps/x86_64/multiarch/stpncpy-ssse3.S 4
-rw-r--r-- sysdeps/x86_64/multiarch/strcpy-ssse3.S 3550
-rw-r--r-- sysdeps/x86_64/multiarch/strncpy-ssse3.S 3
6 files changed, 0 insertions, 3572 deletions
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 2b3c625ea2..5b02ec8de5 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -46,13 +46,11 @@ sysdep_routines += \
stpcpy-evex \
stpcpy-sse2 \
stpcpy-sse2-unaligned \
- stpcpy-ssse3 \
stpncpy-avx2 \
stpncpy-avx2-rtm \
stpncpy-c \
stpncpy-evex \
stpncpy-sse2-unaligned \
- stpncpy-ssse3 \
strcasecmp_l-avx2 \
strcasecmp_l-avx2-rtm \
strcasecmp_l-evex \
@@ -83,7 +81,6 @@ sysdep_routines += \
strcpy-evex \
strcpy-sse2 \
strcpy-sse2-unaligned \
- strcpy-ssse3 \
strcspn-c \
strcspn-sse2 \
strlen-avx2 \
@@ -110,7 +107,6 @@ sysdep_routines += \
strncpy-c \
strncpy-evex \
strncpy-sse2-unaligned \
- strncpy-ssse3 \
strnlen-avx2 \
strnlen-avx2-rtm \
strnlen-evex \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 41a04621ad..49ce6860d0 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -399,8 +399,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/stpncpy.c. */
IFUNC_IMPL (i, name, stpncpy,
- IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (SSSE3),
- __stpncpy_ssse3)
IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (AVX2),
__stpncpy_avx2)
IFUNC_IMPL_ADD (array, i, stpncpy,
@@ -417,8 +415,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/stpcpy.c. */
IFUNC_IMPL (i, name, stpcpy,
- IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (SSSE3),
- __stpcpy_ssse3)
IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (AVX2),
__stpcpy_avx2)
IFUNC_IMPL_ADD (array, i, stpcpy,
@@ -567,8 +563,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)),
__strcpy_evex)
- IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (SSSE3),
- __strcpy_ssse3)
IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2))
@@ -644,8 +638,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)),
__strncpy_evex)
- IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (SSSE3),
- __strncpy_ssse3)
IFUNC_IMPL_ADD (array, i, strncpy, 1,
__strncpy_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2))
diff --git a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S b/sysdeps/x86_64/multiarch/stpcpy-ssse3.S
deleted file mode 100644
index d971c2da38..0000000000
--- a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S
+++ /dev/null
@@ -1,3 +0,0 @@
-#define USE_AS_STPCPY
-#define STRCPY __stpcpy_ssse3
-#include "strcpy-ssse3.S"
diff --git a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S b/sysdeps/x86_64/multiarch/stpncpy-ssse3.S
deleted file mode 100644
index 14ed16f6b5..0000000000
--- a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_STPCPY
-#define USE_AS_STRNCPY
-#define STRCPY __stpncpy_ssse3
-#include "strcpy-ssse3.S"
diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
deleted file mode 100644
index f617a535cf..0000000000
--- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S
+++ /dev/null
@@ -1,3550 +0,0 @@
-/* strcpy with SSSE3
- Copyright (C) 2011-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc)
-
-# ifndef USE_AS_STRCAT
-# include <sysdep.h>
-
-# ifndef STRCPY
-# define STRCPY __strcpy_ssse3
-# endif
-
- .section .text.ssse3,"ax",@progbits
-ENTRY (STRCPY)
-
- mov %rsi, %rcx
-# ifdef USE_AS_STRNCPY
- mov %RDX_LP, %R8_LP
-# endif
- mov %rdi, %rdx
-# ifdef USE_AS_STRNCPY
- test %R8_LP, %R8_LP
- jz L(Exit0)
- cmp $8, %R8_LP
- jbe L(StrncpyExit8Bytes)
-# endif
- cmpb $0, (%rcx)
- jz L(Exit1)
- cmpb $0, 1(%rcx)
- jz L(Exit2)
- cmpb $0, 2(%rcx)
- jz L(Exit3)
- cmpb $0, 3(%rcx)
- jz L(Exit4)
- cmpb $0, 4(%rcx)
- jz L(Exit5)
- cmpb $0, 5(%rcx)
- jz L(Exit6)
- cmpb $0, 6(%rcx)
- jz L(Exit7)
- cmpb $0, 7(%rcx)
- jz L(Exit8)
-# ifdef USE_AS_STRNCPY
- cmp $16, %r8
- jb L(StrncpyExit15Bytes)
-# endif
- cmpb $0, 8(%rcx)
- jz L(Exit9)
- cmpb $0, 9(%rcx)
- jz L(Exit10)
- cmpb $0, 10(%rcx)
- jz L(Exit11)
- cmpb $0, 11(%rcx)
- jz L(Exit12)
- cmpb $0, 12(%rcx)
- jz L(Exit13)
- cmpb $0, 13(%rcx)
- jz L(Exit14)
- cmpb $0, 14(%rcx)
- jz L(Exit15)
-# ifdef USE_AS_STRNCPY
- cmp $16, %r8
- je L(Exit16)
-# endif
- cmpb $0, 15(%rcx)
- jz L(Exit16)
-# endif
-
-# ifdef USE_AS_STRNCPY
- mov %rcx, %rsi
- sub $16, %r8
- and $0xf, %rsi
-
-/* add 16 bytes rcx_offset to r8 */
-
- add %rsi, %r8
-# endif
- lea 16(%rcx), %rsi
- and $-16, %rsi
- pxor %xmm0, %xmm0
- mov (%rcx), %r9
- mov %r9, (%rdx)
- pcmpeqb (%rsi), %xmm0
- mov 8(%rcx), %r9
- mov %r9, 8(%rdx)
-
-/* convert byte mask in xmm0 to bit mask */
-
- pmovmskb %xmm0, %rax
- sub %rcx, %rsi
-
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- mov %rdx, %rax
- lea 16(%rdx), %rdx
- and $-16, %rdx
- sub %rdx, %rax
-
-# ifdef USE_AS_STRNCPY
- add %rax, %rsi
- lea -1(%rsi), %rsi
- and $1<<31, %esi
- test %rsi, %rsi
- jnz L(ContinueCopy)
- lea 16(%r8), %r8
-
-L(ContinueCopy):
-# endif
- sub %rax, %rcx
- mov %rcx, %rax
- and $0xf, %rax
- mov $0, %rsi
-
-/* case: rcx_offset == rdx_offset */
-
- jz L(Align16Both)
-
- cmp $8, %rax
- jae L(ShlHigh8)
- cmp $1, %rax
- je L(Shl1)
- cmp $2, %rax
- je L(Shl2)
- cmp $3, %rax
- je L(Shl3)
- cmp $4, %rax
- je L(Shl4)
- cmp $5, %rax
- je L(Shl5)
- cmp $6, %rax
- je L(Shl6)
- jmp L(Shl7)
-
-L(ShlHigh8):
- je L(Shl8)
- cmp $9, %rax
- je L(Shl9)
- cmp $10, %rax
- je L(Shl10)
- cmp $11, %rax
- je L(Shl11)
- cmp $12, %rax
- je L(Shl12)
- cmp $13, %rax
- je L(Shl13)
- cmp $14, %rax
- je L(Shl14)
- jmp L(Shl15)
-
-L(Align16Both):
- movaps (%rcx), %xmm1
- movaps 16(%rcx), %xmm2
- movaps %xmm1, (%rdx)
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm3
- movaps %xmm2, (%rdx, %rsi)
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm4
- movaps %xmm3, (%rdx, %rsi)
- pcmpeqb %xmm4, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm1
- movaps %xmm4, (%rdx, %rsi)
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm2
- movaps %xmm1, (%rdx, %rsi)
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm3
- movaps %xmm2, (%rdx, %rsi)
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps %xmm3, (%rdx, %rsi)
- mov %rcx, %rax
- lea 16(%rcx, %rsi), %rcx
- and $-0x40, %rcx
- sub %rcx, %rax
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- lea 112(%r8, %rax), %r8
-# endif
- mov $-0x40, %rsi
-
- .p2align 4
-L(Aligned64Loop):
- movaps (%rcx), %xmm2
- movaps %xmm2, %xmm4
- movaps 16(%rcx), %xmm5
- movaps 32(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 48(%rcx), %xmm7
- pminub %xmm5, %xmm2
- pminub %xmm7, %xmm3
- pminub %xmm2, %xmm3
- pcmpeqb %xmm0, %xmm3
- pmovmskb %xmm3, %rax
- lea 64(%rdx), %rdx
- lea 64(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeaveCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Aligned64Leave)
- movaps %xmm4, -64(%rdx)
- movaps %xmm5, -48(%rdx)
- movaps %xmm6, -32(%rdx)
- movaps %xmm7, -16(%rdx)
- jmp L(Aligned64Loop)
-
-L(Aligned64Leave):
-# ifdef USE_AS_STRNCPY
- lea 48(%r8), %r8
-# endif
- pcmpeqb %xmm4, %xmm0
- pmovmskb %xmm0, %rax
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- pcmpeqb %xmm5, %xmm0
-# ifdef USE_AS_STRNCPY
- lea -16(%r8), %r8
-# endif
- pmovmskb %xmm0, %rax
- movaps %xmm4, -64(%rdx)
- test %rax, %rax
- lea 16(%rsi), %rsi
- jnz L(CopyFrom1To16Bytes)
-
- pcmpeqb %xmm6, %xmm0
-# ifdef USE_AS_STRNCPY
- lea -16(%r8), %r8
-# endif
- pmovmskb %xmm0, %rax
- movaps %xmm5, -48(%rdx)
- test %rax, %rax
- lea 16(%rsi), %rsi
- jnz L(CopyFrom1To16Bytes)
-
- movaps %xmm6, -32(%rdx)
- pcmpeqb %xmm7, %xmm0
-# ifdef USE_AS_STRNCPY
- lea -16(%r8), %r8
-# endif
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl1):
- movaps -1(%rcx), %xmm1
- movaps 15(%rcx), %xmm2
-L(Shl1Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit1Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl1LoopExit)
-
- palignr $1, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 31(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit1Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl1LoopExit)
-
- palignr $1, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 31(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit1Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl1LoopExit)
-
- palignr $1, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 31(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit1Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl1LoopExit)
-
- palignr $1, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 31(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -15(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -1(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl1LoopStart):
- movaps 15(%rcx), %xmm2
- movaps 31(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 47(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 63(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $1, %xmm4, %xmm5
- test %rax, %rax
- palignr $1, %xmm3, %xmm4
- jnz L(Shl1Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave1)
-# endif
- palignr $1, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $1, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl1LoopStart)
-
-L(Shl1LoopExit):
- movdqu -1(%rcx), %xmm1
- mov $15, %rsi
- movdqu %xmm1, -1(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl2):
- movaps -2(%rcx), %xmm1
- movaps 14(%rcx), %xmm2
-L(Shl2Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit2Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl2LoopExit)
-
- palignr $2, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 30(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit2Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl2LoopExit)
-
- palignr $2, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 30(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit2Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl2LoopExit)
-
- palignr $2, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 30(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit2Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl2LoopExit)
-
- palignr $2, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 30(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -14(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -2(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl2LoopStart):
- movaps 14(%rcx), %xmm2
- movaps 30(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 46(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 62(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $2, %xmm4, %xmm5
- test %rax, %rax
- palignr $2, %xmm3, %xmm4
- jnz L(Shl2Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave2)
-# endif
- palignr $2, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $2, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl2LoopStart)
-
-L(Shl2LoopExit):
- movdqu -2(%rcx), %xmm1
- mov $14, %rsi
- movdqu %xmm1, -2(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl3):
- movaps -3(%rcx), %xmm1
- movaps 13(%rcx), %xmm2
-L(Shl3Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit3Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl3LoopExit)
-
- palignr $3, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 29(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit3Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl3LoopExit)
-
- palignr $3, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 29(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit3Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl3LoopExit)
-
- palignr $3, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 29(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit3Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl3LoopExit)
-
- palignr $3, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 29(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -13(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -3(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl3LoopStart):
- movaps 13(%rcx), %xmm2
- movaps 29(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 45(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 61(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $3, %xmm4, %xmm5
- test %rax, %rax
- palignr $3, %xmm3, %xmm4
- jnz L(Shl3Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave3)
-# endif
- palignr $3, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $3, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl3LoopStart)
-
-L(Shl3LoopExit):
- movdqu -3(%rcx), %xmm1
- mov $13, %rsi
- movdqu %xmm1, -3(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl4):
- movaps -4(%rcx), %xmm1
- movaps 12(%rcx), %xmm2
-L(Shl4Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit4Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl4LoopExit)
-
- palignr $4, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 28(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit4Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl4LoopExit)
-
- palignr $4, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 28(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit4Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl4LoopExit)
-
- palignr $4, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 28(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit4Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl4LoopExit)
-
- palignr $4, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 28(%rcx), %rcx
- lea 16(%rdx), %rdx
-