diff options
| author | Noah Goldstein <goldstein.w.n@gmail.com> | 2022-04-14 11:47:38 -0500 |
|---|---|---|
| committer | Noah Goldstein <goldstein.w.n@gmail.com> | 2022-04-14 23:21:42 -0500 |
| commit | d85916e30a902ff4bce5b0b44ff245ef58b79236 (patch) | |
| tree | 3e3bcb8cf609e49cbc3fd4de0acbffbd021f1c90 | |
| parent | 41bfe224e5e2e23c8d8a0d3d45e66591373d3fd4 (diff) | |
| download | glibc-d85916e30a902ff4bce5b0b44ff245ef58b79236.tar.xz glibc-d85916e30a902ff4bce5b0b44ff245ef58b79236.zip | |
x86: Remove mem{move|cpy}-ssse3-back
With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer
SSSE3. As a result it is no longer worth it to keep the SSSE3
versions given the code size cost.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
| -rw-r--r-- | sysdeps/x86_64/multiarch/Makefile | 2 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/ifunc-memmove.h | 16 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/memcpy-ssse3-back.S | 3181 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/memmove-ssse3-back.S | 4 |
5 files changed, 6 insertions, 3212 deletions
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 5b02ec8de5..303fb5d734 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -17,7 +17,6 @@ sysdep_routines += \ memcmpeq-evex \ memcmpeq-sse2 \ memcpy-ssse3 \ - memcpy-ssse3-back \ memmove-avx-unaligned-erms \ memmove-avx-unaligned-erms-rtm \ memmove-avx512-no-vzeroupper \ @@ -25,7 +24,6 @@ sysdep_routines += \ memmove-evex-unaligned-erms \ memmove-sse2-unaligned-erms \ memmove-ssse3 \ - memmove-ssse3-back \ memrchr-avx2 \ memrchr-avx2-rtm \ memrchr-evex \ diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 49ce6860d0..c6008a73ed 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -134,9 +134,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __memmove_chk_evex_unaligned_erms) IFUNC_IMPL_ADD (array, i, __memmove_chk, CPU_FEATURE_USABLE (SSSE3), - __memmove_chk_ssse3_back) - IFUNC_IMPL_ADD (array, i, __memmove_chk, - CPU_FEATURE_USABLE (SSSE3), __memmove_chk_ssse3) IFUNC_IMPL_ADD (array, i, __memmove_chk, 1, __memmove_chk_sse2_unaligned) @@ -178,8 +175,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, CPU_FEATURE_USABLE (AVX512VL), __memmove_avx512_unaligned_erms) IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3), - __memmove_ssse3_back) - IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3), __memmove_ssse3) IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms) IFUNC_IMPL_ADD (array, i, memmove, 1, @@ -874,9 +869,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __memcpy_chk_evex_unaligned_erms) IFUNC_IMPL_ADD (array, i, __memcpy_chk, CPU_FEATURE_USABLE (SSSE3), - __memcpy_chk_ssse3_back) - IFUNC_IMPL_ADD (array, i, __memcpy_chk, - CPU_FEATURE_USABLE (SSSE3), __memcpy_chk_ssse3) IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1, __memcpy_chk_sse2_unaligned) @@ -909,8 +901,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, CPU_FEATURE_USABLE (AVX512VL), __memcpy_evex_unaligned_erms) IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3), - __memcpy_ssse3_back) - IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3), __memcpy_ssse3) IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (AVX512F), @@ -960,9 +950,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __mempcpy_chk_evex_unaligned_erms) IFUNC_IMPL_ADD (array, i, __mempcpy_chk, CPU_FEATURE_USABLE (SSSE3), - __mempcpy_chk_ssse3_back) - IFUNC_IMPL_ADD (array, i, __mempcpy_chk, - CPU_FEATURE_USABLE (SSSE3), __mempcpy_chk_ssse3) IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1, __mempcpy_chk_sse2_unaligned) @@ -1004,8 +991,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, CPU_FEATURE_USABLE (AVX512VL), __mempcpy_evex_unaligned_erms) IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3), - __mempcpy_ssse3_back) - IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3), __mempcpy_ssse3) IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2_unaligned) diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h index f8f958064c..fb01fbb301 100644 --- a/sysdeps/x86_64/multiarch/ifunc-memmove.h +++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h @@ -25,7 +25,6 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms) attribute_hidden; @@ -94,17 +93,14 @@ IFUNC_SELECTOR (void) } } - if (!CPU_FEATURE_USABLE_P (cpu_features, SSSE3) - || CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy)) + if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3) + && !CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy)) { - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) - return OPTIMIZE (sse2_unaligned_erms); - - return OPTIMIZE (sse2_unaligned); + return OPTIMIZE (ssse3); } - if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Copy_Backward)) - return OPTIMIZE (ssse3_back); + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) + return OPTIMIZE (sse2_unaligned_erms); - return OPTIMIZE (ssse3); + return OPTIMIZE (sse2_unaligned); } diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S deleted file mode 100644 index 92cfbf7933..0000000000 --- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S +++ /dev/null @@ -1,3181 +0,0 @@ -/* memcpy with SSSE3 and REP string - Copyright (C) 2010-2022 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - -#if IS_IN (libc) - -#include "asm-syntax.h" - -#ifndef MEMCPY -# define MEMCPY __memcpy_ssse3_back -# define MEMCPY_CHK __memcpy_chk_ssse3_back -# define MEMPCPY __mempcpy_ssse3_back -# define MEMPCPY_CHK __mempcpy_chk_ssse3_back -#endif - -#define JMPTBL(I, B) I - B - -/* Branch to an entry in a jump table. TABLE is a jump table with - relative offsets. INDEX is a register contains the index into the - jump table. SCALE is the scale of INDEX. */ -#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ - lea TABLE(%rip), %r11; \ - movslq (%r11, INDEX, SCALE), INDEX; \ - lea (%r11, INDEX), INDEX; \ - _CET_NOTRACK jmp *INDEX; \ - ud2 - - .section .text.ssse3,"ax",@progbits -#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE -ENTRY (MEMPCPY_CHK) - cmp %RDX_LP, %RCX_LP - jb HIDDEN_JUMPTARGET (__chk_fail) -END (MEMPCPY_CHK) - -ENTRY (MEMPCPY) - mov %RDI_LP, %RAX_LP - add %RDX_LP, %RAX_LP - jmp L(start) -END (MEMPCPY) -#endif - -#if !defined USE_AS_BCOPY -ENTRY (MEMCPY_CHK) - cmp %RDX_LP, %RCX_LP - jb HIDDEN_JUMPTARGET (__chk_fail) -END (MEMCPY_CHK) -#endif - -ENTRY (MEMCPY) - mov %RDI_LP, %RAX_LP -#ifdef USE_AS_MEMPCPY - add %RDX_LP, %RAX_LP -#endif - -#ifdef __ILP32__ - /* Clear the upper 32 bits. */ - mov %edx, %edx -#endif - -#ifdef USE_AS_MEMMOVE - cmp %rsi, %rdi - jb L(copy_forward) - je L(bwd_write_0bytes) - cmp $144, %rdx - jae L(copy_backward) - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) -L(copy_forward): -#endif -L(start): - cmp $144, %rdx - jae L(144bytesormore) - -L(fwd_write_less32bytes): -#ifndef USE_AS_MEMMOVE - cmp %dil, %sil - jbe L(bk_write) -#endif - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) -#ifndef USE_AS_MEMMOVE -L(bk_write): - - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) -#endif - - .p2align 4 -L(144bytesormore): - -#ifndef USE_AS_MEMMOVE - cmp %dil, %sil - jle L(copy_backward) -#endif - movdqu (%rsi), %xmm0 - mov %rdi, %r8 - and $-16, %rdi - add $16, %rdi - mov %rdi, %r9 - sub %r8, %r9 - sub %r9, %rdx - add %r9, %rsi - mov %rsi, %r9 - and $0xf, %r9 - jz L(shl_0) -#ifdef DATA_CACHE_SIZE - mov $DATA_CACHE_SIZE, %RCX_LP -#else - mov __x86_data_cache_size(%rip), %RCX_LP -#endif - cmp %rcx, %rdx - jae L(gobble_mem_fwd) - lea L(shl_table_fwd)(%rip), %r11 - sub $0x80, %rdx - movslq (%r11, %r9, 4), %r9 - add %r11, %r9 - _CET_NOTRACK jmp *%r9 - ud2 - - .p2align 4 -L(copy_backward): -#ifdef DATA_CACHE_SIZE - mov $DATA_CACHE_SIZE, %RCX_LP -#else - mov __x86_data_cache_size(%rip), %RCX_LP -#endif - shl $1, %rcx - cmp %rcx, %rdx - ja L(gobble_mem_bwd) - - add %rdx, %rdi - add %rdx, %rsi - movdqu -16(%rsi), %xmm0 - lea -16(%rdi), %r8 - mov %rdi, %r9 - and $0xf, %r9 - xor %r9, %rdi - sub %r9, %rsi - sub %r9, %rdx - mov %rsi, %r9 - and $0xf, %r9 - jz L(shl_0_bwd) - lea L(shl_table_bwd)(%rip), %r11 - sub $0x80, %rdx - movslq (%r11, %r9, 4), %r9 - add %r11, %r9 - _CET_NOTRACK jmp *%r9 - ud2 - - .p2align 4 -L(shl_0): - - mov %rdx, %r9 - shr $8, %r9 - add %rdx, %r9 -#ifdef DATA_CACHE_SIZE - cmp $DATA_CACHE_SIZE_HALF, %R9_LP -#else - cmp __x86_data_cache_size_half(%rip), %R9_LP -#endif - jae L(gobble_mem_fwd) - sub $0x80, %rdx - .p2align 4 -L(shl_0_loop): - movdqa (%rsi), %xmm1 - movdqa %xmm1, (%rdi) - movaps 0x10(%rsi), %xmm2 - movaps %xmm2, 0x10(%rdi) - movaps 0x20(%rsi), %xmm3 - movaps %xmm3, 0x20(%rdi) - movaps 0x30(%rsi), %xmm4 - movaps %xmm4, 0x30(%rdi) - movaps 0x40(%rsi), %xmm1 - movaps %xmm1, 0x40(%rdi) - movaps 0x50(%rsi), %xmm2 - movaps %xmm2, 0x50(%rdi) - movaps 0x60(%rsi), %xmm3 - movaps %xmm3, 0x60(%rdi) - movaps 0x70(%rsi), %xmm4 - movaps %xmm4, 0x70(%rdi) - sub $0x80, %rdx - lea 0x80(%rsi), %rsi - lea 0x80(%rdi), %rdi - jae L(shl_0_loop) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_0_bwd): - sub $0x80, %rdx -L(copy_backward_loop): - movaps -0x10(%rsi), %xmm1 - movaps %xmm1, -0x10(%rdi) - movaps -0x20(%rsi), %xmm2 - movaps %xmm2, -0x20(%rdi) - movaps -0x30(%rsi), %xmm3 - movaps %xmm3, -0x30(%rdi) - movaps -0x40(%rsi), %xmm4 - movaps %xmm4, -0x40(%rdi) - movaps -0x50(%rsi), %xmm5 - movaps %xmm5, -0x50(%rdi) - movaps -0x60(%rsi), %xmm5 - movaps %xmm5, -0x60(%rdi) - movaps -0x70(%rsi), %xmm5 - movaps %xmm5, -0x70(%rdi) - movaps -0x80(%rsi), %xmm5 - movaps %xmm5, -0x80(%rdi) - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(copy_backward_loop) - - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_1): - sub $0x80, %rdx - movaps -0x01(%rsi), %xmm1 - movaps 0x0f(%rsi), %xmm2 - movaps 0x1f(%rsi), %xmm3 - movaps 0x2f(%rsi), %xmm4 - movaps 0x3f(%rsi), %xmm5 - movaps 0x4f(%rsi), %xmm6 - movaps 0x5f(%rsi), %xmm7 - movaps 0x6f(%rsi), %xmm8 - movaps 0x7f(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $1, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $1, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $1, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $1, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $1, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $1, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $1, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $1, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_1) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_1_bwd): - movaps -0x01(%rsi), %xmm1 - - movaps -0x11(%rsi), %xmm2 - palignr $1, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x21(%rsi), %xmm3 - palignr $1, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x31(%rsi), %xmm4 - palignr $1, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x41(%rsi), %xmm5 - palignr $1, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x51(%rsi), %xmm6 - palignr $1, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x61(%rsi), %xmm7 - palignr $1, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x71(%rsi), %xmm8 - palignr $1, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x81(%rsi), %xmm9 - palignr $1, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_1_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_2): - sub $0x80, %rdx - movaps -0x02(%rsi), %xmm1 - movaps 0x0e(%rsi), %xmm2 - movaps 0x1e(%rsi), %xmm3 - movaps 0x2e(%rsi), %xmm4 - movaps 0x3e(%rsi), %xmm5 - movaps 0x4e(%rsi), %xmm6 - movaps 0x5e(%rsi), %xmm7 - movaps 0x6e(%rsi), %xmm8 - movaps 0x7e(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $2, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $2, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $2, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $2, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $2, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $2, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $2, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $2, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_2) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_2_bwd): - movaps -0x02(%rsi), %xmm1 - - movaps -0x12(%rsi), %xmm2 - palignr $2, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x22(%rsi), %xmm3 - palignr $2, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x32(%rsi), %xmm4 - palignr $2, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x42(%rsi), %xmm5 - palignr $2, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x52(%rsi), %xmm6 - palignr $2, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x62(%rsi), %xmm7 - palignr $2, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x72(%rsi), %xmm8 - palignr $2, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x82(%rsi), %xmm9 - palignr $2, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_2_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_3): - sub $0x80, %rdx - movaps -0x03(%rsi), %xmm1 - movaps 0x0d(%rsi), %xmm2 - movaps 0x1d(%rsi), %xmm3 - movaps 0x2d(%rsi), %xmm4 - movaps 0x3d(%rsi), %xmm5 - movaps 0x4d(%rsi), %xmm6 - movaps 0x5d(%rsi), %xmm7 - movaps 0x6d(%rsi), %xmm8 - movaps 0x7d(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $3, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $3, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $3, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $3, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $3, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $3, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $3, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $3, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_3) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_3_bwd): - movaps -0x03(%rsi), %xmm1 - - movaps -0x13(%rsi), %xmm2 - palignr $3, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x23(%rsi), %xmm3 - palignr $3, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x33(%rsi), %xmm4 - palignr $3, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x43(%rsi), %xmm5 - palignr $3, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x53(%rsi), %xmm6 - palignr $3, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x63(%rsi), %xmm7 - palignr $3, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x73(%rsi), %xmm8 - palignr $3, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x83(%rsi), %xmm9 - palignr $3, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_3_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_4): - sub $0x80, %rdx - movaps -0x04(%rsi), %xmm1 - movaps 0x0c(%rsi), %xmm2 - movaps 0x1c(%rsi), %xmm3 - movaps 0x2c(%rsi), %xmm4 - movaps 0x3c(%rsi), %xmm5 - movaps 0x4c(%rsi), %xmm6 - movaps 0x5c(%rsi), %xmm7 - movaps 0x6c(%rsi), %xmm8 - movaps 0x7c(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $4, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $4, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $4, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $4, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $4, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $4, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $4, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $4, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_4) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_4_bwd): - movaps -0x04(%rsi), %xmm1 - - movaps -0x14(%rsi), %xmm2 - palignr $4, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x24(%rsi), %xmm3 - palignr $4, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x34(%rsi), %xmm4 - palignr $4, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x44(%rsi), %xmm5 - palignr $4, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x54(%rsi), %xmm6 - palignr $4, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x64(%rsi), %xmm7 - palignr $4, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x74(%rsi), %xmm8 - palignr $4, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x84(%rsi), %xmm9 - palignr $4, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_4_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_5): - sub $0x80, %rdx - movaps -0x05(%rsi), %xmm1 - movaps 0x0b(%rsi), %xmm2 - movaps 0x1b(%rsi), %xmm3 - movaps 0x2b(%rsi), %xmm4 - movaps 0x3b(%rsi), %xmm5 - movaps 0x4b(%rsi), %xmm6 - movaps 0x5b(%rsi), %xmm7 - movaps 0x6b(%rsi), %xmm8 - movaps 0x7b(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $5, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $5, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $5, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $5, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $5, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $5, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $5, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $5, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_5) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_5_bwd): - movaps -0x05(%rsi), %xmm1 - - movaps -0x15(%rsi), %xmm2 - palignr $5, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x25(%rsi), %xmm3 - palignr $5, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x35(%rsi), %xmm4 - palignr $5, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x45(%rsi), %xmm5 - palignr $5, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x55(%rsi), %xmm6 - palignr $5, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x65(%rsi), %xmm7 - palignr $5, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x75(%rsi), %xmm8 - palignr $5, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x85(%rsi), %xmm9 - palignr $5, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_5_bwd) - movdqu %xmm0, |
