aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Noah Goldstein <goldstein.w.n@gmail.com> 2022-04-14 11:47:40 -0500
committer Noah Goldstein <goldstein.w.n@gmail.com> 2022-04-14 23:21:42 -0500
commit 26b2478322db94edc9e0e8f577b2f71d291e5acb (patch)
tree 6087539637c25880c3f6e32d0ab19d0d23c9e0df
parent d85916e30a902ff4bce5b0b44ff245ef58b79236 (diff)
download glibc-26b2478322db94edc9e0e8f577b2f71d291e5acb.tar.xz
glibc-26b2478322db94edc9e0e8f577b2f71d291e5acb.zip
x86: Reduce code size of mem{move|pcpy|cpy}-ssse3
The goal is to remove most SSSE3 functions, as SSE4, AVX2, and EVEX are generally preferable. memcpy/memmove is one exception, where avoiding unaligned loads with `palignr` is important for some targets. This commit replaces memmove-ssse3 with a better-optimized, lower-code-footprint version. It also aliases memcpy to memmove. Aside from this function, all other SSSE3 functions should be safe to remove. Performance is not changed drastically, although it shows overall improvement without any major regressions or gains. bench-memcpy geometric_mean(N=50) New / Original: 0.957 bench-memcpy-random geometric_mean(N=50) New / Original: 0.912 bench-memcpy-large geometric_mean(N=50) New / Original: 0.892 Benchmarks were run on Zhaoxin KX-6840@2000MHz. See attached numbers for all results. More importantly, this saves 7246 bytes of code size in memmove and an additional 10741 bytes by reusing memmove code for memcpy (total 17987 bytes saved), as well as an additional 896 bytes of rodata for the jump table entries.
-rw-r--r-- sysdeps/x86_64/multiarch/Makefile 1
-rw-r--r-- sysdeps/x86_64/multiarch/memcpy-ssse3.S 3151
-rw-r--r-- sysdeps/x86_64/multiarch/memmove-ssse3.S 384
3 files changed, 380 insertions, 3156 deletions
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 303fb5d734..e7ea963fc0 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -16,7 +16,6 @@ sysdep_routines += \
memcmpeq-avx2-rtm \
memcmpeq-evex \
memcmpeq-sse2 \
- memcpy-ssse3 \
memmove-avx-unaligned-erms \
memmove-avx-unaligned-erms-rtm \
memmove-avx512-no-vzeroupper \
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
deleted file mode 100644
index 65644d3a09..0000000000
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S
+++ /dev/null
@@ -1,3151 +0,0 @@
-/* memcpy with SSSE3
- Copyright (C) 2010-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#if IS_IN (libc)
-
-#include "asm-syntax.h"
-
-#ifndef MEMCPY
-# define MEMCPY __memcpy_ssse3
-# define MEMCPY_CHK __memcpy_chk_ssse3
-# define MEMPCPY __mempcpy_ssse3
-# define MEMPCPY_CHK __mempcpy_chk_ssse3
-#endif
-
-#define JMPTBL(I, B) I - B
-
-/* Branch to an entry in a jump table. TABLE is a jump table with
- relative offsets. INDEX is a register contains the index into the
- jump table. SCALE is the scale of INDEX. */
-#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
- lea TABLE(%rip), %r11; \
- movslq (%r11, INDEX, SCALE), INDEX; \
- lea (%r11, INDEX), INDEX; \
- _CET_NOTRACK jmp *INDEX; \
- ud2
-
- .section .text.ssse3,"ax",@progbits
-#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
-ENTRY (MEMPCPY_CHK)
- cmp %RDX_LP, %RCX_LP
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMPCPY_CHK)
-
-ENTRY (MEMPCPY)
- mov %RDI_LP, %RAX_LP
- add %RDX_LP, %RAX_LP
- jmp L(start)
-END (MEMPCPY)
-#endif
-
-#if !defined USE_AS_BCOPY
-ENTRY (MEMCPY_CHK)
- cmp %RDX_LP, %RCX_LP
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMCPY_CHK)
-#endif
-
-ENTRY (MEMCPY)
- mov %RDI_LP, %RAX_LP
-#ifdef USE_AS_MEMPCPY
- add %RDX_LP, %RAX_LP
-#endif
-
-#ifdef __ILP32__
- /* Clear the upper 32 bits. */
- mov %edx, %edx
-#endif
-
-#ifdef USE_AS_MEMMOVE
- cmp %rsi, %rdi
- jb L(copy_forward)
- je L(write_0bytes)
- cmp $79, %rdx
- jbe L(copy_forward)
- jmp L(copy_backward)
-L(copy_forward):
-#endif
-L(start):
- cmp $79, %rdx
- lea L(table_less_80bytes)(%rip), %r11
- ja L(80bytesormore)
- movslq (%r11, %rdx, 4), %r9
- add %rdx, %rsi
- add %rdx, %rdi
- add %r11, %r9
- _CET_NOTRACK jmp *%r9
- ud2
-
- .p2align 4
-L(80bytesormore):
-#ifndef USE_AS_MEMMOVE
- cmp %dil, %sil
- jle L(copy_backward)
-#endif
-
- movdqu (%rsi), %xmm0
- mov %rdi, %rcx
- and $-16, %rdi
- add $16, %rdi
- mov %rcx, %r8
- sub %rdi, %rcx
- add %rcx, %rdx
- sub %rcx, %rsi
-
-#ifdef SHARED_CACHE_SIZE_HALF
- mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
-#else
- mov __x86_shared_cache_size_half(%rip), %RCX_LP
-#endif
- cmp %rcx, %rdx
- mov %rsi, %r9
- ja L(large_page_fwd)
- and $0xf, %r9
- jz L(shl_0)
-#ifdef DATA_CACHE_SIZE_HALF
- mov $DATA_CACHE_SIZE_HALF, %RCX_LP
-#else
- mov __x86_data_cache_size_half(%rip), %RCX_LP
-#endif
- BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4)
-
- .p2align 4
-L(copy_backward):
- movdqu -16(%rsi, %rdx), %xmm0
- add %rdx, %rsi
- lea -16(%rdi, %rdx), %r8
- add %rdx, %rdi
-
- mov %rdi, %rcx
- and $0xf, %rcx
- xor %rcx, %rdi
- sub %rcx, %rdx
- sub %rcx, %rsi
-
-#ifdef SHARED_CACHE_SIZE_HALF
- mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
-#else
- mov __x86_shared_cache_size_half(%rip), %RCX_LP
-#endif
-
- cmp %rcx, %rdx
- mov %rsi, %r9
- ja L(large_page_bwd)
- and $0xf, %r9
- jz L(shl_0_bwd)
-#ifdef DATA_CACHE_SIZE_HALF
- mov $DATA_CACHE_SIZE_HALF, %RCX_LP
-#else
- mov __x86_data_cache_size_half(%rip), %RCX_LP
-#endif
- BRANCH_TO_JMPTBL_ENTRY (L(shl_table_bwd), %r9, 4)
-
- .p2align 4
-L(shl_0):
- sub $16, %rdx
- movdqa (%rsi), %xmm1
- add $16, %rsi
- movdqa %xmm1, (%rdi)
- add $16, %rdi
- cmp $128, %rdx
- movdqu %xmm0, (%r8)
- ja L(shl_0_gobble)
- cmp $64, %rdx
- jb L(shl_0_less_64bytes)
- movaps (%rsi), %xmm4
- movaps 16(%rsi), %xmm1
- movaps 32(%rsi), %xmm2
- movaps 48(%rsi), %xmm3
- movaps %xmm4, (%rdi)
- movaps %xmm1, 16(%rdi)
- movaps %xmm2, 32(%rdi)
- movaps %xmm3, 48(%rdi)
- sub $64, %rdx
- add $64, %rsi
- add $64, %rdi
-L(shl_0_less_64bytes):
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_0_gobble):
-#ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
-#else
- cmp __x86_data_cache_size_half(%rip), %RDX_LP
-#endif
- lea -128(%rdx), %rdx
- jae L(shl_0_gobble_mem_loop)
-L(shl_0_gobble_cache_loop):
- movdqa (%rsi), %xmm4
- movaps 0x10(%rsi), %xmm1
- movaps 0x20(%rsi), %xmm2
- movaps 0x30(%rsi), %xmm3
-
- movdqa %xmm4, (%rdi)
- movaps %xmm1, 0x10(%rdi)
- movaps %xmm2, 0x20(%rdi)
- movaps %xmm3, 0x30(%rdi)
-
- sub $128, %rdx
- movaps 0x40(%rsi), %xmm4
- movaps 0x50(%rsi), %xmm5
- movaps 0x60(%rsi), %xmm6
- movaps 0x70(%rsi), %xmm7
- lea 0x80(%rsi), %rsi
- movaps %xmm4, 0x40(%rdi)
- movaps %xmm5, 0x50(%rdi)
- movaps %xmm6, 0x60(%rdi)
- movaps %xmm7, 0x70(%rdi)
- lea 0x80(%rdi), %rdi
-
- jae L(shl_0_gobble_cache_loop)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(shl_0_cache_less_64bytes)
-
- movdqa (%rsi), %xmm4
- sub $0x40, %rdx
- movdqa 0x10(%rsi), %xmm1
-
- movdqa %xmm4, (%rdi)
- movdqa %xmm1, 0x10(%rdi)
-
- movdqa 0x20(%rsi), %xmm4
- movdqa 0x30(%rsi), %xmm1
- add $0x40, %rsi
-
- movdqa %xmm4, 0x20(%rdi)
- movdqa %xmm1, 0x30(%rdi)
- add $0x40, %rdi
-L(shl_0_cache_less_64bytes):
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_0_gobble_mem_loop):
- prefetcht0 0x1c0(%rsi)
- prefetcht0 0x280(%rsi)
-
- movdqa (%rsi), %xmm0
- movdqa 0x10(%rsi), %xmm1
- movdqa 0x20(%rsi), %xmm2
- movdqa 0x30(%rsi), %xmm3
- movdqa 0x40(%rsi), %xmm4
- movdqa 0x50(%rsi), %xmm5
- movdqa 0x60(%rsi), %xmm6
- movdqa 0x70(%rsi), %xmm7
- lea 0x80(%rsi), %rsi
- sub $0x80, %rdx
- movdqa %xmm0, (%rdi)
- movdqa %xmm1, 0x10(%rdi)
- movdqa %xmm2, 0x20(%rdi)
- movdqa %xmm3, 0x30(%rdi)
- movdqa %xmm4, 0x40(%rdi)
- movdqa %xmm5, 0x50(%rdi)
- movdqa %xmm6, 0x60(%rdi)
- movdqa %xmm7, 0x70(%rdi)
- lea 0x80(%rdi), %rdi
-
- jae L(shl_0_gobble_mem_loop)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(shl_0_mem_less_64bytes)
-
- movdqa (%rsi), %xmm0
- sub $0x40, %rdx
- movdqa 0x10(%rsi), %xmm1
-
- movdqa %xmm0, (%rdi)
- movdqa %xmm1, 0x10(%rdi)
-
- movdqa 0x20(%rsi), %xmm0
- movdqa 0x30(%rsi), %xmm1
- add $0x40, %rsi
-
- movdqa %xmm0, 0x20(%rdi)
- movdqa %xmm1, 0x30(%rdi)
- add $0x40, %rdi
-L(shl_0_mem_less_64bytes):
- cmp $0x20, %rdx
- jb L(shl_0_mem_less_32bytes)
- movdqa (%rsi), %xmm0
- sub $0x20, %rdx
- movdqa 0x10(%rsi), %xmm1
- add $0x20, %rsi
- movdqa %xmm0, (%rdi)
- movdqa %xmm1, 0x10(%rdi)
- add $0x20, %rdi
-L(shl_0_mem_less_32bytes):
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_0_bwd):
- sub $16, %rdx
- movdqa -0x10(%rsi), %xmm1
- sub $16, %rsi
- movdqa %xmm1, -0x10(%rdi)
- sub $16, %rdi
- cmp $0x80, %rdx
- movdqu %xmm0, (%r8)
- ja L(shl_0_gobble_bwd)
- cmp $64, %rdx
- jb L(shl_0_less_64bytes_bwd)
- movaps -0x10(%rsi), %xmm0
- movaps -0x20(%rsi), %xmm1
- movaps -0x30(%rsi), %xmm2
- movaps -0x40(%rsi), %xmm3
- movaps %xmm0, -0x10(%rdi)
- movaps %xmm1, -0x20(%rdi)
- movaps %xmm2, -0x30(%rdi)
- movaps %xmm3, -0x40(%rdi)
- sub $64, %rdx
- sub $0x40, %rsi
- sub $0x40, %rdi
-L(shl_0_less_64bytes_bwd):
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_0_gobble_bwd):
-#ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
-#else
- cmp __x86_data_cache_size_half(%rip), %RDX_LP
-#endif
- lea -128(%rdx), %rdx
- jae L(shl_0_gobble_mem_bwd_loop)
-L(shl_0_gobble_bwd_loop):
- movdqa -0x10(%rsi), %xmm0
- movaps -0x20(%rsi), %xmm1
- movaps -0x30(%rsi), %xmm2
- movaps -0x40(%rsi), %xmm3
-
- movdqa %xmm0, -0x10(%rdi)
- movaps %xmm1, -0x20(%rdi)
- movaps %xmm2, -0x30(%rdi)
- movaps %xmm3, -0x40(%rdi)
-
- sub $0x80, %rdx
- movaps -0x50(%rsi), %xmm4
- movaps -0x60(%rsi), %xmm5
- movaps -0x70(%rsi), %xmm6
- movaps -0x80(%rsi), %xmm7
- lea -0x80(%rsi), %rsi
- movaps %xmm4, -0x50(%rdi)
- movaps %xmm5, -0x60(%rdi)
- movaps %xmm6, -0x70(%rdi)
- movaps %xmm7, -0x80(%rdi)
- lea -0x80(%rdi), %rdi
-
- jae L(shl_0_gobble_bwd_loop)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(shl_0_gobble_bwd_less_64bytes)
-
- movdqa -0x10(%rsi), %xmm0
- sub $0x40, %rdx
- movdqa -0x20(%rsi), %xmm1
-
- movdqa %xmm0, -0x10(%rdi)
- movdqa %xmm1, -0x20(%rdi)
-
- movdqa -0x30(%rsi), %xmm0
- movdqa -0x40(%rsi), %xmm1
- sub $0x40, %rsi
-
- movdqa %xmm0, -0x30(%rdi)
- movdqa %xmm1, -0x40(%rdi)
- sub $0x40, %rdi
-L(shl_0_gobble_bwd_less_64bytes):
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_0_gobble_mem_bwd_loop):
- prefetcht0 -0x1c0(%rsi)
- prefetcht0 -0x280(%rsi)
- movdqa -0x10(%rsi), %xmm0
- movdqa -0x20(%rsi), %xmm1
- movdqa -0x30(%rsi), %xmm2
- movdqa -0x40(%rsi), %xmm3
- movdqa -0x50(%rsi), %xmm4
- movdqa -0x60(%rsi), %xmm5
- movdqa -0x70(%rsi), %xmm6
- movdqa -0x80(%rsi), %xmm7
- lea -0x80(%rsi), %rsi
- sub $0x80, %rdx
- movdqa %xmm0, -0x10(%rdi)
- movdqa %xmm1, -0x20(%rdi)
- movdqa %xmm2, -0x30(%rdi)
- movdqa %xmm3, -0x40(%rdi)
- movdqa %xmm4, -0x50(%rdi)
- movdqa %xmm5, -0x60(%rdi)
- movdqa %xmm6, -0x70(%rdi)
- movdqa %xmm7, -0x80(%rdi)
- lea -0x80(%rdi), %rdi
-
- jae L(shl_0_gobble_mem_bwd_loop)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(shl_0_mem_bwd_less_64bytes)
-
- movdqa -0x10(%rsi), %xmm0
- sub $0x40, %rdx
- movdqa -0x20(%rsi), %xmm1
-
- movdqa %xmm0, -0x10(%rdi)
- movdqa %xmm1, -0x20(%rdi)
-
- movdqa -0x30(%rsi), %xmm0
- movdqa -0x40(%rsi), %xmm1
- sub $0x40, %rsi
-
- movdqa %xmm0, -0x30(%rdi)
- movdqa %xmm1, -0x40(%rdi)
- sub $0x40, %rdi
-L(shl_0_mem_bwd_less_64bytes):
- cmp $0x20, %rdx
- jb L(shl_0_mem_bwd_less_32bytes)
- movdqa -0x10(%rsi), %xmm0
- sub $0x20, %rdx
- movdqa -0x20(%rsi), %xmm1
- sub $0x20, %rsi
- movdqa %xmm0, -0x10(%rdi)
- movdqa %xmm1, -0x20(%rdi)
- sub $0x20, %rdi
-L(shl_0_mem_bwd_less_32bytes):
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_1):
- lea (L(shl_1_loop_L1)-L(shl_1))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x01(%rsi), %xmm1
- jb L(L1_fwd)
- lea (L(shl_1_loop_L2)-L(shl_1_loop_L1))(%r9), %r9
-L(L1_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_1_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_1_loop_L1):
- sub $64, %rdx
- movaps 0x0f(%rsi), %xmm2
- movaps 0x1f(%rsi), %xmm3
- movaps 0x2f(%rsi), %xmm4
- movaps 0x3f(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $1, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $1, %xmm3, %xmm4
- palignr $1, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $1, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_1_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_1_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_1_bwd):
- lea (L(shl_1_bwd_loop_L1)-L(shl_1_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x01(%rsi), %xmm1
- jb L(L1_bwd)
- lea (L(shl_1_bwd_loop_L2)-L(shl_1_bwd_loop_L1))(%r9), %r9
-L(L1_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_1_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_1_bwd_loop_L1):
- movaps -0x11(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x21(%rsi), %xmm3
- movaps -0x31(%rsi), %xmm4
- movaps -0x41(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $1, %xmm2, %xmm1
- palignr $1, %xmm3, %xmm2
- palignr $1, %xmm4, %xmm3
- palignr $1, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_1_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_1_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_2):
- lea (L(shl_2_loop_L1)-L(shl_2))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x02(%rsi), %xmm1
- jb L(L2_fwd)
- lea (L(shl_2_loop_L2)-L(shl_2_loop_L1))(%r9), %r9
-L(L2_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_2_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_2_loop_L1):
- sub $64, %rdx
- movaps 0x0e(%rsi), %xmm2
- movaps 0x1e(%rsi), %xmm3
- movaps 0x2e(%rsi), %xmm4
- movaps 0x3e(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $2, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $2, %xmm3, %xmm4
- palignr $2, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $2, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_2_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_2_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_2_bwd):
- lea (L(shl_2_bwd_loop_L1)-L(shl_2_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x02(%rsi), %xmm1
- jb L(L2_bwd)
- lea (L(shl_2_bwd_loop_L2)-L(shl_2_bwd_loop_L1))(%r9), %r9
-L(L2_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_2_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_2_bwd_loop_L1):
- movaps -0x12(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x22(%rsi), %xmm3
- movaps -0x32(%rsi), %xmm4
- movaps -0x42(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $2, %xmm2, %xmm1
- palignr $2, %xmm3, %xmm2
- palignr $2, %xmm4, %xmm3
- palignr $2, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_2_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_2_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_3):
- lea (L(shl_3_loop_L1)-L(shl_3))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x03(%rsi), %xmm1
- jb L(L3_fwd)
- lea (L(shl_3_loop_L2)-L(shl_3_loop_L1))(%r9), %r9
-L(L3_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_3_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_3_loop_L1):
- sub $64, %rdx
- movaps 0x0d(%rsi), %xmm2
- movaps 0x1d(%rsi), %xmm3
- movaps 0x2d(%rsi), %xmm4
- movaps 0x3d(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $3, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $3, %xmm3, %xmm4
- palignr $3, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $3, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_3_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_3_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_3_bwd):
- lea (L(shl_3_bwd_loop_L1)-L(shl_3_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x03(%rsi), %xmm1
- jb L(L3_bwd)
- lea (L(shl_3_bwd_loop_L2)-L(shl_3_bwd_loop_L1))(%r9), %r9
-L(L3_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_3_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_3_bwd_loop_L1):
- movaps -0x13(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x23(%rsi), %xmm3
- movaps -0x33(%rsi), %xmm4
- movaps -0x43(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $3, %xmm2, %xmm1
- palignr $3, %xmm3, %xmm2
- palignr $3, %xmm4, %xmm3
- palignr $3, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_3_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_3_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_4):
- lea (L(shl_4_loop_L1)-L(shl_4))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x04(%rsi), %xmm1
- jb L(L4_fwd)
- lea (L(shl_4_loop_L2)-L(shl_4_loop_L1))(%r9), %r9
-L(L4_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_4_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_4_loop_L1):
- sub $64, %rdx
- movaps 0x0c(%rsi), %xmm2
- movaps 0x1c(%rsi), %xmm3
- movaps 0x2c(%rsi), %xmm4
- movaps 0x3c(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $4, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $4, %xmm3, %xmm4
- palignr $4, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $4, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_4_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_4_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_4_bwd):
- lea (L(shl_4_bwd_loop_L1)-L(shl_4_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x04(%rsi), %xmm1
- jb L(L4_bwd)
- lea (L(shl_4_bwd_loop_L2)-L(shl_4_bwd_loop_L1))(%r9), %r9
-L(L4_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_4_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_4_bwd_loop_L1):
- movaps -0x14(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x24(%rsi), %xmm3
- movaps -0x34(%rsi), %xmm4
- movaps -0x44(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $4, %xmm2, %xmm1
- palignr $4, %xmm3, %xmm2
- palignr $4, %xmm4, %xmm3
- palignr $4, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_4_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_4_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_5):
- lea (L(shl_5_loop_L1)-L(shl_5))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x05(%rsi), %xmm1
- jb L(L5_fwd)
- lea (L(shl_5_loop_L2)-L(shl_5_loop_L1))(%r9), %r9
-L(L5_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_5_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_5_loop_L1):
- sub $64, %rdx
- movaps 0x0b(%rsi), %xmm2
- movaps 0x1b(%rsi), %xmm3
- movaps 0x2b(%rsi), %xmm4
- movaps 0x3b(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $5, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $5, %xmm3, %xmm4
- palignr $5, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $5, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_5_end)