aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNoah Goldstein <goldstein.w.n@gmail.com>2022-04-14 11:47:38 -0500
committerNoah Goldstein <goldstein.w.n@gmail.com>2022-04-14 23:21:42 -0500
commitd85916e30a902ff4bce5b0b44ff245ef58b79236 (patch)
tree3e3bcb8cf609e49cbc3fd4de0acbffbd021f1c90
parent41bfe224e5e2e23c8d8a0d3d45e66591373d3fd4 (diff)
downloadglibc-d85916e30a902ff4bce5b0b44ff245ef58b79236.tar.xz
glibc-d85916e30a902ff4bce5b0b44ff245ef58b79236.zip
x86: Remove mem{move|cpy}-ssse3-back
With SSE2, SSE4.1, AVX2, and EVEX versions available, very few targets prefer SSSE3. As a result, it is no longer worth keeping the SSSE3 versions given their code size cost. Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
-rw-r--r--sysdeps/x86_64/multiarch/Makefile2
-rw-r--r--sysdeps/x86_64/multiarch/ifunc-impl-list.c15
-rw-r--r--sysdeps/x86_64/multiarch/ifunc-memmove.h16
-rw-r--r--sysdeps/x86_64/multiarch/memcpy-ssse3-back.S3181
-rw-r--r--sysdeps/x86_64/multiarch/memmove-ssse3-back.S4
5 files changed, 6 insertions, 3212 deletions
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 5b02ec8de5..303fb5d734 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -17,7 +17,6 @@ sysdep_routines += \
memcmpeq-evex \
memcmpeq-sse2 \
memcpy-ssse3 \
- memcpy-ssse3-back \
memmove-avx-unaligned-erms \
memmove-avx-unaligned-erms-rtm \
memmove-avx512-no-vzeroupper \
@@ -25,7 +24,6 @@ sysdep_routines += \
memmove-evex-unaligned-erms \
memmove-sse2-unaligned-erms \
memmove-ssse3 \
- memmove-ssse3-back \
memrchr-avx2 \
memrchr-avx2-rtm \
memrchr-evex \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 49ce6860d0..c6008a73ed 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -134,9 +134,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__memmove_chk_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memmove_chk,
CPU_FEATURE_USABLE (SSSE3),
- __memmove_chk_ssse3_back)
- IFUNC_IMPL_ADD (array, i, __memmove_chk,
- CPU_FEATURE_USABLE (SSSE3),
__memmove_chk_ssse3)
IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
__memmove_chk_sse2_unaligned)
@@ -178,8 +175,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
CPU_FEATURE_USABLE (AVX512VL),
__memmove_avx512_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3),
- __memmove_ssse3_back)
- IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3),
__memmove_ssse3)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms)
IFUNC_IMPL_ADD (array, i, memmove, 1,
@@ -874,9 +869,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__memcpy_chk_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
CPU_FEATURE_USABLE (SSSE3),
- __memcpy_chk_ssse3_back)
- IFUNC_IMPL_ADD (array, i, __memcpy_chk,
- CPU_FEATURE_USABLE (SSSE3),
__memcpy_chk_ssse3)
IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
__memcpy_chk_sse2_unaligned)
@@ -909,8 +901,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
CPU_FEATURE_USABLE (AVX512VL),
__memcpy_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
- __memcpy_ssse3_back)
- IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
__memcpy_ssse3)
IFUNC_IMPL_ADD (array, i, memcpy,
CPU_FEATURE_USABLE (AVX512F),
@@ -960,9 +950,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__mempcpy_chk_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
CPU_FEATURE_USABLE (SSSE3),
- __mempcpy_chk_ssse3_back)
- IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
- CPU_FEATURE_USABLE (SSSE3),
__mempcpy_chk_ssse3)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
__mempcpy_chk_sse2_unaligned)
@@ -1004,8 +991,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
CPU_FEATURE_USABLE (AVX512VL),
__mempcpy_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
- __mempcpy_ssse3_back)
- IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
__mempcpy_ssse3)
IFUNC_IMPL_ADD (array, i, mempcpy, 1,
__mempcpy_sse2_unaligned)
diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
index f8f958064c..fb01fbb301 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
@@ -25,7 +25,6 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms)
attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms)
attribute_hidden;
@@ -94,17 +93,14 @@ IFUNC_SELECTOR (void)
}
}
- if (!CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
- || CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy))
+ if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
+ && !CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy))
{
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
- return OPTIMIZE (sse2_unaligned_erms);
-
- return OPTIMIZE (sse2_unaligned);
+ return OPTIMIZE (ssse3);
}
- if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Copy_Backward))
- return OPTIMIZE (ssse3_back);
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ return OPTIMIZE (sse2_unaligned_erms);
- return OPTIMIZE (ssse3);
+ return OPTIMIZE (sse2_unaligned);
}
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
deleted file mode 100644
index 92cfbf7933..0000000000
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
+++ /dev/null
@@ -1,3181 +0,0 @@
-/* memcpy with SSSE3 and REP string
- Copyright (C) 2010-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#if IS_IN (libc)
-
-#include "asm-syntax.h"
-
-#ifndef MEMCPY
-# define MEMCPY __memcpy_ssse3_back
-# define MEMCPY_CHK __memcpy_chk_ssse3_back
-# define MEMPCPY __mempcpy_ssse3_back
-# define MEMPCPY_CHK __mempcpy_chk_ssse3_back
-#endif
-
-#define JMPTBL(I, B) I - B
-
-/* Branch to an entry in a jump table. TABLE is a jump table with
- relative offsets. INDEX is a register contains the index into the
- jump table. SCALE is the scale of INDEX. */
-#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
- lea TABLE(%rip), %r11; \
- movslq (%r11, INDEX, SCALE), INDEX; \
- lea (%r11, INDEX), INDEX; \
- _CET_NOTRACK jmp *INDEX; \
- ud2
-
- .section .text.ssse3,"ax",@progbits
-#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
-ENTRY (MEMPCPY_CHK)
- cmp %RDX_LP, %RCX_LP
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMPCPY_CHK)
-
-ENTRY (MEMPCPY)
- mov %RDI_LP, %RAX_LP
- add %RDX_LP, %RAX_LP
- jmp L(start)
-END (MEMPCPY)
-#endif
-
-#if !defined USE_AS_BCOPY
-ENTRY (MEMCPY_CHK)
- cmp %RDX_LP, %RCX_LP
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMCPY_CHK)
-#endif
-
-ENTRY (MEMCPY)
- mov %RDI_LP, %RAX_LP
-#ifdef USE_AS_MEMPCPY
- add %RDX_LP, %RAX_LP
-#endif
-
-#ifdef __ILP32__
- /* Clear the upper 32 bits. */
- mov %edx, %edx
-#endif
-
-#ifdef USE_AS_MEMMOVE
- cmp %rsi, %rdi
- jb L(copy_forward)
- je L(bwd_write_0bytes)
- cmp $144, %rdx
- jae L(copy_backward)
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-L(copy_forward):
-#endif
-L(start):
- cmp $144, %rdx
- jae L(144bytesormore)
-
-L(fwd_write_less32bytes):
-#ifndef USE_AS_MEMMOVE
- cmp %dil, %sil
- jbe L(bk_write)
-#endif
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-#ifndef USE_AS_MEMMOVE
-L(bk_write):
-
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-#endif
-
- .p2align 4
-L(144bytesormore):
-
-#ifndef USE_AS_MEMMOVE
- cmp %dil, %sil
- jle L(copy_backward)
-#endif
- movdqu (%rsi), %xmm0
- mov %rdi, %r8
- and $-16, %rdi
- add $16, %rdi
- mov %rdi, %r9
- sub %r8, %r9
- sub %r9, %rdx
- add %r9, %rsi
- mov %rsi, %r9
- and $0xf, %r9
- jz L(shl_0)
-#ifdef DATA_CACHE_SIZE
- mov $DATA_CACHE_SIZE, %RCX_LP
-#else
- mov __x86_data_cache_size(%rip), %RCX_LP
-#endif
- cmp %rcx, %rdx
- jae L(gobble_mem_fwd)
- lea L(shl_table_fwd)(%rip), %r11
- sub $0x80, %rdx
- movslq (%r11, %r9, 4), %r9
- add %r11, %r9
- _CET_NOTRACK jmp *%r9
- ud2
-
- .p2align 4
-L(copy_backward):
-#ifdef DATA_CACHE_SIZE
- mov $DATA_CACHE_SIZE, %RCX_LP
-#else
- mov __x86_data_cache_size(%rip), %RCX_LP
-#endif
- shl $1, %rcx
- cmp %rcx, %rdx
- ja L(gobble_mem_bwd)
-
- add %rdx, %rdi
- add %rdx, %rsi
- movdqu -16(%rsi), %xmm0
- lea -16(%rdi), %r8
- mov %rdi, %r9
- and $0xf, %r9
- xor %r9, %rdi
- sub %r9, %rsi
- sub %r9, %rdx
- mov %rsi, %r9
- and $0xf, %r9
- jz L(shl_0_bwd)
- lea L(shl_table_bwd)(%rip), %r11
- sub $0x80, %rdx
- movslq (%r11, %r9, 4), %r9
- add %r11, %r9
- _CET_NOTRACK jmp *%r9
- ud2
-
- .p2align 4
-L(shl_0):
-
- mov %rdx, %r9
- shr $8, %r9
- add %rdx, %r9
-#ifdef DATA_CACHE_SIZE
- cmp $DATA_CACHE_SIZE_HALF, %R9_LP
-#else
- cmp __x86_data_cache_size_half(%rip), %R9_LP
-#endif
- jae L(gobble_mem_fwd)
- sub $0x80, %rdx
- .p2align 4
-L(shl_0_loop):
- movdqa (%rsi), %xmm1
- movdqa %xmm1, (%rdi)
- movaps 0x10(%rsi), %xmm2
- movaps %xmm2, 0x10(%rdi)
- movaps 0x20(%rsi), %xmm3
- movaps %xmm3, 0x20(%rdi)
- movaps 0x30(%rsi), %xmm4
- movaps %xmm4, 0x30(%rdi)
- movaps 0x40(%rsi), %xmm1
- movaps %xmm1, 0x40(%rdi)
- movaps 0x50(%rsi), %xmm2
- movaps %xmm2, 0x50(%rdi)
- movaps 0x60(%rsi), %xmm3
- movaps %xmm3, 0x60(%rdi)
- movaps 0x70(%rsi), %xmm4
- movaps %xmm4, 0x70(%rdi)
- sub $0x80, %rdx
- lea 0x80(%rsi), %rsi
- lea 0x80(%rdi), %rdi
- jae L(shl_0_loop)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_0_bwd):
- sub $0x80, %rdx
-L(copy_backward_loop):
- movaps -0x10(%rsi), %xmm1
- movaps %xmm1, -0x10(%rdi)
- movaps -0x20(%rsi), %xmm2
- movaps %xmm2, -0x20(%rdi)
- movaps -0x30(%rsi), %xmm3
- movaps %xmm3, -0x30(%rdi)
- movaps -0x40(%rsi), %xmm4
- movaps %xmm4, -0x40(%rdi)
- movaps -0x50(%rsi), %xmm5
- movaps %xmm5, -0x50(%rdi)
- movaps -0x60(%rsi), %xmm5
- movaps %xmm5, -0x60(%rdi)
- movaps -0x70(%rsi), %xmm5
- movaps %xmm5, -0x70(%rdi)
- movaps -0x80(%rsi), %xmm5
- movaps %xmm5, -0x80(%rdi)
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(copy_backward_loop)
-
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_1):
- sub $0x80, %rdx
- movaps -0x01(%rsi), %xmm1
- movaps 0x0f(%rsi), %xmm2
- movaps 0x1f(%rsi), %xmm3
- movaps 0x2f(%rsi), %xmm4
- movaps 0x3f(%rsi), %xmm5
- movaps 0x4f(%rsi), %xmm6
- movaps 0x5f(%rsi), %xmm7
- movaps 0x6f(%rsi), %xmm8
- movaps 0x7f(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $1, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $1, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $1, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $1, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $1, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $1, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $1, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $1, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_1)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_1_bwd):
- movaps -0x01(%rsi), %xmm1
-
- movaps -0x11(%rsi), %xmm2
- palignr $1, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x21(%rsi), %xmm3
- palignr $1, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x31(%rsi), %xmm4
- palignr $1, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x41(%rsi), %xmm5
- palignr $1, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x51(%rsi), %xmm6
- palignr $1, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x61(%rsi), %xmm7
- palignr $1, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x71(%rsi), %xmm8
- palignr $1, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x81(%rsi), %xmm9
- palignr $1, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_1_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_2):
- sub $0x80, %rdx
- movaps -0x02(%rsi), %xmm1
- movaps 0x0e(%rsi), %xmm2
- movaps 0x1e(%rsi), %xmm3
- movaps 0x2e(%rsi), %xmm4
- movaps 0x3e(%rsi), %xmm5
- movaps 0x4e(%rsi), %xmm6
- movaps 0x5e(%rsi), %xmm7
- movaps 0x6e(%rsi), %xmm8
- movaps 0x7e(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $2, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $2, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $2, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $2, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $2, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $2, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $2, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $2, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_2)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_2_bwd):
- movaps -0x02(%rsi), %xmm1
-
- movaps -0x12(%rsi), %xmm2
- palignr $2, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x22(%rsi), %xmm3
- palignr $2, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x32(%rsi), %xmm4
- palignr $2, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x42(%rsi), %xmm5
- palignr $2, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x52(%rsi), %xmm6
- palignr $2, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x62(%rsi), %xmm7
- palignr $2, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x72(%rsi), %xmm8
- palignr $2, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x82(%rsi), %xmm9
- palignr $2, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_2_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_3):
- sub $0x80, %rdx
- movaps -0x03(%rsi), %xmm1
- movaps 0x0d(%rsi), %xmm2
- movaps 0x1d(%rsi), %xmm3
- movaps 0x2d(%rsi), %xmm4
- movaps 0x3d(%rsi), %xmm5
- movaps 0x4d(%rsi), %xmm6
- movaps 0x5d(%rsi), %xmm7
- movaps 0x6d(%rsi), %xmm8
- movaps 0x7d(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $3, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $3, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $3, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $3, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $3, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $3, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $3, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $3, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_3)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_3_bwd):
- movaps -0x03(%rsi), %xmm1
-
- movaps -0x13(%rsi), %xmm2
- palignr $3, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x23(%rsi), %xmm3
- palignr $3, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x33(%rsi), %xmm4
- palignr $3, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x43(%rsi), %xmm5
- palignr $3, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x53(%rsi), %xmm6
- palignr $3, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x63(%rsi), %xmm7
- palignr $3, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x73(%rsi), %xmm8
- palignr $3, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x83(%rsi), %xmm9
- palignr $3, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_3_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_4):
- sub $0x80, %rdx
- movaps -0x04(%rsi), %xmm1
- movaps 0x0c(%rsi), %xmm2
- movaps 0x1c(%rsi), %xmm3
- movaps 0x2c(%rsi), %xmm4
- movaps 0x3c(%rsi), %xmm5
- movaps 0x4c(%rsi), %xmm6
- movaps 0x5c(%rsi), %xmm7
- movaps 0x6c(%rsi), %xmm8
- movaps 0x7c(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $4, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $4, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $4, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $4, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $4, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $4, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $4, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $4, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_4)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_4_bwd):
- movaps -0x04(%rsi), %xmm1
-
- movaps -0x14(%rsi), %xmm2
- palignr $4, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x24(%rsi), %xmm3
- palignr $4, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x34(%rsi), %xmm4
- palignr $4, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x44(%rsi), %xmm5
- palignr $4, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x54(%rsi), %xmm6
- palignr $4, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x64(%rsi), %xmm7
- palignr $4, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x74(%rsi), %xmm8
- palignr $4, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x84(%rsi), %xmm9
- palignr $4, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_4_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_5):
- sub $0x80, %rdx
- movaps -0x05(%rsi), %xmm1
- movaps 0x0b(%rsi), %xmm2
- movaps 0x1b(%rsi), %xmm3
- movaps 0x2b(%rsi), %xmm4
- movaps 0x3b(%rsi), %xmm5
- movaps 0x4b(%rsi), %xmm6
- movaps 0x5b(%rsi), %xmm7
- movaps 0x6b(%rsi), %xmm8
- movaps 0x7b(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $5, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $5, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $5, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $5, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $5, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $5, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $5, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $5, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_5)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_5_bwd):
- movaps -0x05(%rsi), %xmm1
-
- movaps -0x15(%rsi), %xmm2
- palignr $5, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x25(%rsi), %xmm3
- palignr $5, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x35(%rsi), %xmm4
- palignr $5, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x45(%rsi), %xmm5
- palignr $5, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x55(%rsi), %xmm6
- palignr $5, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x65(%rsi), %xmm7
- palignr $5, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x75(%rsi), %xmm8
- palignr $5, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x85(%rsi), %xmm9
- palignr $5, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_5_bwd)
- movdqu %xmm0,