aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNoah Goldstein <goldstein.w.n@gmail.com>2022-04-14 11:47:38 -0500
committerNoah Goldstein <goldstein.w.n@gmail.com>2022-04-14 23:21:42 -0500
commitd85916e30a902ff4bce5b0b44ff245ef58b79236 (patch)
tree3e3bcb8cf609e49cbc3fd4de0acbffbd021f1c90
parent41bfe224e5e2e23c8d8a0d3d45e66591373d3fd4 (diff)
downloadglibc-d85916e30a902ff4bce5b0b44ff245ef58b79236.tar.xz
glibc-d85916e30a902ff4bce5b0b44ff245ef58b79236.zip
x86: Remove mem{move|cpy}-ssse3-back
With SSE2, SSE4.1, AVX2, and EVEX versions available, very few targets prefer SSSE3. As a result, it is no longer worth keeping the SSSE3 versions given their code size cost. Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
-rw-r--r--sysdeps/x86_64/multiarch/Makefile2
-rw-r--r--sysdeps/x86_64/multiarch/ifunc-impl-list.c15
-rw-r--r--sysdeps/x86_64/multiarch/ifunc-memmove.h16
-rw-r--r--sysdeps/x86_64/multiarch/memcpy-ssse3-back.S3181
-rw-r--r--sysdeps/x86_64/multiarch/memmove-ssse3-back.S4
5 files changed, 6 insertions, 3212 deletions
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 5b02ec8de5..303fb5d734 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -17,7 +17,6 @@ sysdep_routines += \
memcmpeq-evex \
memcmpeq-sse2 \
memcpy-ssse3 \
- memcpy-ssse3-back \
memmove-avx-unaligned-erms \
memmove-avx-unaligned-erms-rtm \
memmove-avx512-no-vzeroupper \
@@ -25,7 +24,6 @@ sysdep_routines += \
memmove-evex-unaligned-erms \
memmove-sse2-unaligned-erms \
memmove-ssse3 \
- memmove-ssse3-back \
memrchr-avx2 \
memrchr-avx2-rtm \
memrchr-evex \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 49ce6860d0..c6008a73ed 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -134,9 +134,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__memmove_chk_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memmove_chk,
CPU_FEATURE_USABLE (SSSE3),
- __memmove_chk_ssse3_back)
- IFUNC_IMPL_ADD (array, i, __memmove_chk,
- CPU_FEATURE_USABLE (SSSE3),
__memmove_chk_ssse3)
IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
__memmove_chk_sse2_unaligned)
@@ -178,8 +175,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
CPU_FEATURE_USABLE (AVX512VL),
__memmove_avx512_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3),
- __memmove_ssse3_back)
- IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3),
__memmove_ssse3)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms)
IFUNC_IMPL_ADD (array, i, memmove, 1,
@@ -874,9 +869,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__memcpy_chk_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
CPU_FEATURE_USABLE (SSSE3),
- __memcpy_chk_ssse3_back)
- IFUNC_IMPL_ADD (array, i, __memcpy_chk,
- CPU_FEATURE_USABLE (SSSE3),
__memcpy_chk_ssse3)
IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
__memcpy_chk_sse2_unaligned)
@@ -909,8 +901,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
CPU_FEATURE_USABLE (AVX512VL),
__memcpy_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
- __memcpy_ssse3_back)
- IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
__memcpy_ssse3)
IFUNC_IMPL_ADD (array, i, memcpy,
CPU_FEATURE_USABLE (AVX512F),
@@ -960,9 +950,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__mempcpy_chk_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
CPU_FEATURE_USABLE (SSSE3),
- __mempcpy_chk_ssse3_back)
- IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
- CPU_FEATURE_USABLE (SSSE3),
__mempcpy_chk_ssse3)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
__mempcpy_chk_sse2_unaligned)
@@ -1004,8 +991,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
CPU_FEATURE_USABLE (AVX512VL),
__mempcpy_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
- __mempcpy_ssse3_back)
- IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
__mempcpy_ssse3)
IFUNC_IMPL_ADD (array, i, mempcpy, 1,
__mempcpy_sse2_unaligned)
diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
index f8f958064c..fb01fbb301 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
@@ -25,7 +25,6 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms)
attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms)
attribute_hidden;
@@ -94,17 +93,14 @@ IFUNC_SELECTOR (void)
}
}
- if (!CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
- || CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy))
+ if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
+ && !CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy))
{
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
- return OPTIMIZE (sse2_unaligned_erms);
-
- return OPTIMIZE (sse2_unaligned);
+ return OPTIMIZE (ssse3);
}
- if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Copy_Backward))
- return OPTIMIZE (ssse3_back);
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ return OPTIMIZE (sse2_unaligned_erms);
- return OPTIMIZE (ssse3);
+ return OPTIMIZE (sse2_unaligned);
}
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
deleted file mode 100644
index 92cfbf7933..0000000000
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
+++ /dev/null
@@ -1,3181 +0,0 @@
-/* memcpy with SSSE3 and REP string
- Copyright (C) 2010-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#if IS_IN (libc)
-
-#include "asm-syntax.h"
-
-#ifndef MEMCPY
-# define MEMCPY __memcpy_ssse3_back
-# define MEMCPY_CHK __memcpy_chk_ssse3_back
-# define MEMPCPY __mempcpy_ssse3_back
-# define MEMPCPY_CHK __mempcpy_chk_ssse3_back
-#endif
-
-#define JMPTBL(I, B) I - B
-
-/* Branch to an entry in a jump table. TABLE is a jump table with
- relative offsets. INDEX is a register contains the index into the
- jump table. SCALE is the scale of INDEX. */
-#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
- lea TABLE(%rip), %r11; \
- movslq (%r11, INDEX, SCALE), INDEX; \
- lea (%r11, INDEX), INDEX; \
- _CET_NOTRACK jmp *INDEX; \
- ud2
-
- .section .text.ssse3,"ax",@progbits
-#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
-ENTRY (MEMPCPY_CHK)
- cmp %RDX_LP, %RCX_LP
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMPCPY_CHK)
-
-ENTRY (MEMPCPY)
- mov %RDI_LP, %RAX_LP
- add %RDX_LP, %RAX_LP
- jmp L(start)
-END (MEMPCPY)
-#endif
-
-#if !defined USE_AS_BCOPY
-ENTRY (MEMCPY_CHK)
- cmp %RDX_LP, %RCX_LP
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMCPY_CHK)
-#endif
-
-ENTRY (MEMCPY)
- mov %RDI_LP, %RAX_LP
-#ifdef USE_AS_MEMPCPY
- add %RDX_LP, %RAX_LP
-#endif
-
-#ifdef __ILP32__
- /* Clear the upper 32 bits. */
- mov %edx, %edx
-#endif
-
-#ifdef USE_AS_MEMMOVE
- cmp %rsi, %rdi
- jb L(copy_forward)
- je L(bwd_write_0bytes)
- cmp $144, %rdx
- jae L(copy_backward)
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-L(copy_forward):
-#endif
-L(start):
- cmp $144, %rdx
- jae L(144bytesormore)
-
-L(fwd_write_less32bytes):
-#ifndef USE_AS_MEMMOVE
- cmp %dil, %sil
- jbe L(bk_write)
-#endif
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-#ifndef USE_AS_MEMMOVE
-L(bk_write):
-
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-#endif
-
- .p2align 4
-L(144bytesormore):
-
-#ifndef USE_AS_MEMMOVE
- cmp %dil, %sil
- jle L(copy_backward)
-#endif
- movdqu (%rsi), %xmm0
- mov %rdi, %r8
- and $-16, %rdi
- add $16, %rdi
- mov %rdi, %r9
- sub %r8, %r9
- sub %r9, %rdx
- add %r9, %rsi
- mov %rsi, %r9
- and $0xf, %r9
- jz L(shl_0)
-#ifdef DATA_CACHE_SIZE
- mov $DATA_CACHE_SIZE, %RCX_LP
-#else
- mov __x86_data_cache_size(%rip), %RCX_LP
-#endif
- cmp %rcx, %rdx
- jae L(gobble_mem_fwd)
- lea L(shl_table_fwd)(%rip), %r11
- sub $0x80, %rdx
- movslq (%r11, %r9, 4), %r9
- add %r11, %r9
- _CET_NOTRACK jmp *%r9
- ud2
-
- .p2align 4
-L(copy_backward):
-#ifdef DATA_CACHE_SIZE
- mov $DATA_CACHE_SIZE, %RCX_LP
-#else
- mov __x86_data_cache_size(%rip), %RCX_LP
-#endif
- shl $1, %rcx
- cmp %rcx, %rdx
- ja L(gobble_mem_bwd)
-
- add %rdx, %rdi
- add %rdx, %rsi
- movdqu -16(%rsi), %xmm0
- lea -16(%rdi), %r8
- mov %rdi, %r9
- and $0xf, %r9
- xor %r9, %rdi
- sub %r9, %rsi
- sub %r9, %rdx
- mov %rsi, %r9
- and $0xf, %r9
- jz L(shl_0_bwd)
- lea L(shl_table_bwd)(%rip), %r11
- sub $0x80, %rdx
- movslq (%r11, %r9, 4), %r9
- add %r11, %r9
- _CET_NOTRACK jmp *%r9
- ud2
-
- .p2align 4
-L(shl_0):
-
- mov %rdx, %r9
- shr $8, %r9
- add %rdx, %r9
-#ifdef DATA_CACHE_SIZE
- cmp $DATA_CACHE_SIZE_HALF, %R9_LP
-#else
- cmp __x86_data_cache_size_half(%rip), %R9_LP
-#endif
- jae L(gobble_mem_fwd)
- sub $0x80, %rdx
- .p2align 4
-L(shl_0_loop):
- movdqa (%rsi), %xmm1
- movdqa %xmm1, (%rdi)
- movaps 0x10(%rsi), %xmm2
- movaps %xmm2, 0x10(%rdi)
- movaps 0x20(%rsi), %xmm3
- movaps %xmm3, 0x20(%rdi)
- movaps 0x30(%rsi), %xmm4
- movaps %xmm4, 0x30(%rdi)
- movaps 0x40(%rsi), %xmm1
- movaps %xmm1, 0x40(%rdi)
- movaps 0x50(%rsi), %xmm2
- movaps %xmm2, 0x50(%rdi)
- movaps 0x60(%rsi), %xmm3
- movaps %xmm3, 0x60(%rdi)
- movaps 0x70(%rsi), %xmm4
- movaps %xmm4, 0x70(%rdi)
- sub $0x80, %rdx
- lea 0x80(%rsi), %rsi
- lea 0x80(%rdi), %rdi
- jae L(shl_0_loop)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_0_bwd):
- sub $0x80, %rdx
-L(copy_backward_loop):
- movaps -0x10(%rsi), %xmm1
- movaps %xmm1, -0x10(%rdi)
- movaps -0x20(%rsi), %xmm2
- movaps %xmm2, -0x20(%rdi)
- movaps -0x30(%rsi), %xmm3
- movaps %xmm3, -0x30(%rdi)
- movaps -0x40(%rsi), %xmm4
- movaps %xmm4, -0x40(%rdi)
- movaps -0x50(%rsi), %xmm5
- movaps %xmm5, -0x50(%rdi)
- movaps -0x60(%rsi), %xmm5
- movaps %xmm5, -0x60(%rdi)
- movaps -0x70(%rsi), %xmm5
- movaps %xmm5, -0x70(%rdi)
- movaps -0x80(%rsi), %xmm5
- movaps %xmm5, -0x80(%rdi)
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(copy_backward_loop)
-
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_1):
- sub $0x80, %rdx
- movaps -0x01(%rsi), %xmm1
- movaps 0x0f(%rsi), %xmm2
- movaps 0x1f(%rsi), %xmm3
- movaps 0x2f(%rsi), %xmm4
- movaps 0x3f(%rsi), %xmm5
- movaps 0x4f(%rsi), %xmm6
- movaps 0x5f(%rsi), %xmm7
- movaps 0x6f(%rsi), %xmm8
- movaps 0x7f(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $1, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $1, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $1, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $1, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $1, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $1, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $1, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $1, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_1)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_1_bwd):
- movaps -0x01(%rsi), %xmm1
-
- movaps -0x11(%rsi), %xmm2
- palignr $1, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x21(%rsi), %xmm3
- palignr $1, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x31(%rsi), %xmm4
- palignr $1, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x41(%rsi), %xmm5
- palignr $1, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x51(%rsi), %xmm6
- palignr $1, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x61(%rsi), %xmm7
- palignr $1, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x71(%rsi), %xmm8
- palignr $1, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x81(%rsi), %xmm9
- palignr $1, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_1_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_2):
- sub $0x80, %rdx
- movaps -0x02(%rsi), %xmm1
- movaps 0x0e(%rsi), %xmm2
- movaps 0x1e(%rsi), %xmm3
- movaps 0x2e(%rsi), %xmm4
- movaps 0x3e(%rsi), %xmm5
- movaps 0x4e(%rsi), %xmm6
- movaps 0x5e(%rsi), %xmm7
- movaps 0x6e(%rsi), %xmm8
- movaps 0x7e(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $2, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $2, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $2, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $2, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $2, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $2, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $2, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $2, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_2)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_2_bwd):
- movaps -0x02(%rsi), %xmm1
-
- movaps -0x12(%rsi), %xmm2
- palignr $2, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x22(%rsi), %xmm3
- palignr $2, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x32(%rsi), %xmm4
- palignr $2, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x42(%rsi), %xmm5
- palignr $2, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x52(%rsi), %xmm6
- palignr $2, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x62(%rsi), %xmm7
- palignr $2, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x72(%rsi), %xmm8
- palignr $2, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x82(%rsi), %xmm9
- palignr $2, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_2_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_3):
- sub $0x80, %rdx
- movaps -0x03(%rsi), %xmm1
- movaps 0x0d(%rsi), %xmm2
- movaps 0x1d(%rsi), %xmm3
- movaps 0x2d(%rsi), %xmm4
- movaps 0x3d(%rsi), %xmm5
- movaps 0x4d(%rsi), %xmm6
- movaps 0x5d(%rsi), %xmm7
- movaps 0x6d(%rsi), %xmm8
- movaps 0x7d(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $3, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $3, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $3, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $3, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $3, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $3, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $3, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $3, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_3)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_3_bwd):
- movaps -0x03(%rsi), %xmm1
-
- movaps -0x13(%rsi), %xmm2
- palignr $3, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x23(%rsi), %xmm3
- palignr $3, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x33(%rsi), %xmm4
- palignr $3, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x43(%rsi), %xmm5
- palignr $3, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x53(%rsi), %xmm6
- palignr $3, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x63(%rsi), %xmm7
- palignr $3, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x73(%rsi), %xmm8
- palignr $3, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x83(%rsi), %xmm9
- palignr $3, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_3_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_4):
- sub $0x80, %rdx
- movaps -0x04(%rsi), %xmm1
- movaps 0x0c(%rsi), %xmm2
- movaps 0x1c(%rsi), %xmm3
- movaps 0x2c(%rsi), %xmm4
- movaps 0x3c(%rsi), %xmm5
- movaps 0x4c(%rsi), %xmm6
- movaps 0x5c(%rsi), %xmm7
- movaps 0x6c(%rsi), %xmm8
- movaps 0x7c(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $4, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $4, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $4, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $4, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $4, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $4, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $4, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $4, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_4)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_4_bwd):
- movaps -0x04(%rsi), %xmm1
-
- movaps -0x14(%rsi), %xmm2
- palignr $4, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x24(%rsi), %xmm3
- palignr $4, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x34(%rsi), %xmm4
- palignr $4, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x44(%rsi), %xmm5
- palignr $4, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x54(%rsi), %xmm6
- palignr $4, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x64(%rsi), %xmm7
- palignr $4, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x74(%rsi), %xmm8
- palignr $4, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x84(%rsi), %xmm9
- palignr $4, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_4_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_5):
- sub $0x80, %rdx
- movaps -0x05(%rsi), %xmm1
- movaps 0x0b(%rsi), %xmm2
- movaps 0x1b(%rsi), %xmm3
- movaps 0x2b(%rsi), %xmm4
- movaps 0x3b(%rsi), %xmm5
- movaps 0x4b(%rsi), %xmm6
- movaps 0x5b(%rsi), %xmm7
- movaps 0x6b(%rsi), %xmm8
- movaps 0x7b(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $5, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $5, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $5, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $5, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $5, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $5, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $5, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $5, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_5)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_5_bwd):
- movaps -0x05(%rsi), %xmm1
-
- movaps -0x15(%rsi), %xmm2
- palignr $5, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x25(%rsi), %xmm3
- palignr $5, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x35(%rsi), %xmm4
- palignr $5, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x45(%rsi), %xmm5
- palignr $5, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x55(%rsi), %xmm6
- palignr $5, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x65(%rsi), %xmm7
- palignr $5, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x75(%rsi), %xmm8
- palignr $5, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x85(%rsi), %xmm9
- palignr $5, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_5_bwd)
- movdqu %xmm0,