about summary refs log tree commit diff
diff options
context:
space:
mode:
authorH.J. Lu <hongjiu.lu@intel.com>2010-06-30 08:26:11 -0700
committerUlrich Drepper <drepper@redhat.com>2010-06-30 08:26:11 -0700
commit6fb8cbcb58a29fff73eb2101b34caa19a7f88eba (patch)
tree3a39ddec3a6cf66f8541c6591dbe4017136580f0
parentd85f8ff66711fd3b1c5753330499c7403fa46d81 (diff)
downloadglibc-6fb8cbcb58a29fff73eb2101b34caa19a7f88eba.tar.xz
glibc-6fb8cbcb58a29fff73eb2101b34caa19a7f88eba.zip
Improve 64bit memcpy/memmove for Atom, Core 2 and Core i7
This patch includes optimized 64bit memcpy/memmove for Atom, Core 2 and Core i7. It improves memcpy by up to 3X on Atom, up to 4X on Core 2 and up to 1X on Core i7. It also improves memmove by up to 3X on Atom, up to 4X on Core 2 and up to 2X on Core i7.
-rw-r--r--ChangeLog32
-rw-r--r--debug/memmove_chk.c6
-rw-r--r--string/memmove.c5
-rw-r--r--sysdeps/x86_64/memcpy.S4
-rw-r--r--sysdeps/x86_64/multiarch/Makefile4
-rw-r--r--sysdeps/x86_64/multiarch/bcopy.S7
-rw-r--r--sysdeps/x86_64/multiarch/init-arch.c9
-rw-r--r--sysdeps/x86_64/multiarch/init-arch.h16
-rw-r--r--sysdeps/x86_64/multiarch/memcpy-ssse3-back.S3169
-rw-r--r--sysdeps/x86_64/multiarch/memcpy-ssse3.S3139
-rw-r--r--sysdeps/x86_64/multiarch/memcpy.S73
-rw-r--r--sysdeps/x86_64/multiarch/memcpy_chk.S47
-rw-r--r--sysdeps/x86_64/multiarch/memmove-ssse3-back.S4
-rw-r--r--sysdeps/x86_64/multiarch/memmove-ssse3.S4
-rw-r--r--sysdeps/x86_64/multiarch/memmove.c24
-rw-r--r--sysdeps/x86_64/multiarch/memmove_chk.c15
-rw-r--r--sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S4
-rw-r--r--sysdeps/x86_64/multiarch/mempcpy-ssse3.S4
-rw-r--r--sysdeps/x86_64/multiarch/mempcpy.S75
-rw-r--r--sysdeps/x86_64/multiarch/mempcpy_chk.S47
-rw-r--r--sysdeps/x86_64/sysdep.h3
21 files changed, 6681 insertions, 10 deletions
diff --git a/ChangeLog b/ChangeLog
index eaf57497c1..175c6ed53b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,35 @@
+2010-06-25 H.J. Lu <hongjiu.lu@intel.com>
+
+ * debug/memmove_chk.c (__memmove_chk): Renamed to ...
+ (MEMMOVE_CHK): ...this. Default to __memmove_chk.
+ * string/memmove.c (memmove): Renamed to ...
+ (MEMMOVE): ...this. Default to memmove.
+ * sysdeps/x86_64/memcpy.S: Use ENTRY_CHK and END_CHK.
+ * sysdeps/x86_64/sysdep.h (ENTRY_CHK): Define.
+ (END_CHK): Define.
+ * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
+ memcpy-ssse3 mempcpy-ssse3 memmove-ssse3 memcpy-ssse3-back
+ mempcpy-ssse3-back memmove-ssse3-back.
+	* sysdeps/x86_64/multiarch/bcopy.S: New file.
+ * sysdeps/x86_64/multiarch/memcpy-ssse3-back.S: New file.
+ * sysdeps/x86_64/multiarch/memcpy-ssse3.S: New file.
+ * sysdeps/x86_64/multiarch/memcpy.S: New file.
+ * sysdeps/x86_64/multiarch/memcpy_chk.S: New file.
+ * sysdeps/x86_64/multiarch/memmove-ssse3-back.S: New file.
+ * sysdeps/x86_64/multiarch/memmove-ssse3.S: New file.
+ * sysdeps/x86_64/multiarch/memmove.c: New file.
+ * sysdeps/x86_64/multiarch/memmove_chk.c: New file.
+ * sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S: New file.
+ * sysdeps/x86_64/multiarch/mempcpy-ssse3.S: New file.
+ * sysdeps/x86_64/multiarch/mempcpy.S: New file.
+ * sysdeps/x86_64/multiarch/mempcpy_chk.S: New file.
+ * sysdeps/x86_64/multiarch/init-arch.h (bit_Fast_Copy_Backward):
+ Define.
+ (index_Fast_Copy_Backward): Define.
+ (HAS_ARCH_FEATURE): Define.
+ (HAS_FAST_REP_STRING): Define.
+ (HAS_FAST_COPY_BACKWARD): Define.
+
2010-06-21 Andreas Schwab <schwab@redhat.com>
* sysdeps/unix/sysv/linux/getlogin_r.c (__getlogin_r_loginuid):
diff --git a/debug/memmove_chk.c b/debug/memmove_chk.c
index f3b74d23d9..6a3e157d8b 100644
--- a/debug/memmove_chk.c
+++ b/debug/memmove_chk.c
@@ -23,8 +23,12 @@
#include <memcopy.h>
#include <pagecopy.h>
+#ifndef MEMMOVE_CHK
+# define MEMMOVE_CHK __memmove_chk
+#endif
+
void *
-__memmove_chk (dest, src, len, destlen)
+MEMMOVE_CHK (dest, src, len, destlen)
void *dest;
const void *src;
size_t len;
diff --git a/string/memmove.c b/string/memmove.c
index 16671f7bb5..8e36e7c5a3 100644
--- a/string/memmove.c
+++ b/string/memmove.c
@@ -37,9 +37,12 @@
#define rettype void *
#endif
+#ifndef MEMMOVE
+#define MEMMOVE memmove
+#endif
rettype
-memmove (a1, a2, len)
+MEMMOVE (a1, a2, len)
a1const void *a1;
a2const void *a2;
size_t len;
diff --git a/sysdeps/x86_64/memcpy.S b/sysdeps/x86_64/memcpy.S
index b25646b8c5..b4545ac9f7 100644
--- a/sysdeps/x86_64/memcpy.S
+++ b/sysdeps/x86_64/memcpy.S
@@ -40,12 +40,12 @@
.text
#if defined PIC && !defined NOT_IN_libc
-ENTRY (__memcpy_chk)
+ENTRY_CHK (__memcpy_chk)
cmpq %rdx, %rcx
jb HIDDEN_JUMPTARGET (__chk_fail)
-END (__memcpy_chk)
+END_CHK (__memcpy_chk)
#endif
ENTRY(memcpy) /* (void *, const void*, size_t) */
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index c61cf70345..0ca914a377 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -5,7 +5,9 @@ endif
ifeq ($(subdir),string)
sysdep_routines += stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
- strend-sse4 memcmp-sse4
+ strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \
+ memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
+ memmove-ssse3-back
ifeq (yes,$(config-cflags-sse4))
sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
CFLAGS-strcspn-c.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/bcopy.S b/sysdeps/x86_64/multiarch/bcopy.S
new file mode 100644
index 0000000000..11e250f1cb
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/bcopy.S
@@ -0,0 +1,7 @@
+#include <sysdep.h>
+
+ .text
+ENTRY(bcopy)
+ xchg %rdi, %rsi
+ jmp HIDDEN_BUILTIN_JUMPTARGET(memmove)
+END(bcopy)
diff --git a/sysdeps/x86_64/multiarch/init-arch.c b/sysdeps/x86_64/multiarch/init-arch.c
index f13a9f4b79..55c9f54f96 100644
--- a/sysdeps/x86_64/multiarch/init-arch.c
+++ b/sysdeps/x86_64/multiarch/init-arch.c
@@ -78,10 +78,13 @@ __init_cpu_features (void)
case 0x25:
case 0x2e:
case 0x2f:
- /* Rep string instructions are fast on Intel Core i3, i5
- and i7. */
+ /* Rep string instructions and copy backward are fast on
+ Intel Core i3, i5 and i7. */
+#if index_Fast_Rep_String != index_Fast_Copy_Backward
+# error index_Fast_Rep_String != index_Fast_Copy_Backward
+#endif
__cpu_features.feature[index_Fast_Rep_String]
- |= bit_Fast_Rep_String;
+ |= bit_Fast_Rep_String | bit_Fast_Copy_Backward;
break;
}
}
diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h
index b2f2de3796..4a211c0864 100644
--- a/sysdeps/x86_64/multiarch/init-arch.h
+++ b/sysdeps/x86_64/multiarch/init-arch.h
@@ -17,6 +17,7 @@
02111-1307 USA. */
#define bit_Fast_Rep_String (1 << 0)
+#define bit_Fast_Copy_Backward (1 << 1)
#ifdef __ASSEMBLER__
@@ -32,7 +33,8 @@
# define index_SSE4_1 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
# define index_SSE4_2 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
-#define index_Fast_Rep_String FEATURE_INDEX_1*FEATURE_SIZE
+# define index_Fast_Rep_String FEATURE_INDEX_1*FEATURE_SIZE
+# define index_Fast_Copy_Backward FEATURE_INDEX_1*FEATURE_SIZE
#else /* __ASSEMBLER__ */
@@ -102,6 +104,16 @@ extern const struct cpu_features *__get_cpu_features (void)
# define HAS_SSE4_2 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 20)
# define HAS_FMA HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 12)
-# define index_Fast_Rep_String FEATURE_INDEX_1
+# define index_Fast_Rep_String FEATURE_INDEX_1
+# define index_Fast_Copy_Backward FEATURE_INDEX_1
+
+#define HAS_ARCH_FEATURE(idx, bit) \
+ ((__get_cpu_features ()->feature[idx] & (bit)) != 0)
+
+#define HAS_FAST_REP_STRING \
+ HAS_ARCH_FEATURE (index_Fast_Rep_String, bit_Fast_Rep_String)
+
+#define HAS_FAST_COPY_BACKWARD \
+ HAS_ARCH_FEATURE (index_Fast_Copy_Backward, bit_Fast_Copy_Backward)
#endif /* __ASSEMBLER__ */
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
new file mode 100644
index 0000000000..48c974e97f
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
@@ -0,0 +1,3169 @@
+/* memcpy with SSSE3 and REP string
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+
+#if !defined NOT_IN_libc \
+ && (defined SHARED \
+ || defined USE_AS_MEMMOVE \
+ || !defined USE_MULTIARCH)
+
+#include "asm-syntax.h"
+
+#ifndef MEMCPY
+# define MEMCPY __memcpy_ssse3_back
+# define MEMCPY_CHK __memcpy_chk_ssse3_back
+#endif
+
+#ifndef ALIGN
+# define ALIGN(n) .p2align n
+#endif
+
+#define JMPTBL(I, B) I - B
+
+/* Branch to an entry in a jump table. TABLE is a jump table with
+ relative offsets. INDEX is a register contains the index into the
+ jump table. SCALE is the scale of INDEX. */
+#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ lea TABLE(%rip), %r11; \
+ movslq (%r11, INDEX, SCALE), INDEX; \
+ lea (%r11, INDEX), INDEX; \
+ jmp *INDEX; \
+ ud2
+
+ .section .text.ssse3,"ax",@progbits
+#if defined SHARED && !defined NOT_IN_libc
+ENTRY (MEMCPY_CHK)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+#endif
+
+ENTRY (MEMCPY)
+ mov %rdi, %rax
+#ifdef USE_AS_MEMPCPY
+ add %rdx, %rax
+#endif
+
+#ifdef USE_AS_MEMMOVE
+ cmp %rsi, %rdi
+ jb L(copy_forward)
+ je L(bwd_write_0bytes)
+ cmp $144, %rdx
+ jae L(copy_backward)
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+L(copy_forward):
+#endif
+ cmp $144, %rdx
+ jae L(144bytesormore)
+
+L(fwd_write_less32bytes):
+#ifndef USE_AS_MEMMOVE
+ cmp %dil, %sil
+ jbe L(bk_write)
+#endif
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+#ifndef USE_AS_MEMMOVE
+L(bk_write):
+
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+#endif
+
+ ALIGN (4)
+L(144bytesormore):
+
+#ifndef USE_AS_MEMMOVE
+ cmp %dil, %sil
+ jle L(copy_backward)
+#endif
+ movdqu (%rsi), %xmm0
+ mov %rdi, %r8
+ and $-16, %rdi
+ add $16, %rdi
+ mov %rdi, %r9
+ sub %r8, %r9
+ sub %r9, %rdx
+ add %r9, %rsi
+ mov %rsi, %r9
+ and $0xf, %r9
+ jz L(shl_0)
+#ifdef DATA_CACHE_SIZE
+ mov $DATA_CACHE_SIZE, %rcx
+#else
+ mov __x86_64_data_cache_size(%rip), %rcx
+#endif
+ cmp %rcx, %rdx
+ jae L(gobble_mem_fwd)
+ lea L(shl_table_fwd)(%rip), %r11
+ sub $0x80, %rdx
+ movslq (%r11, %r9, 4), %r9
+ add %r11, %r9
+ jmp *%r9
+ ud2
+
+ ALIGN (4)
+L(copy_backward):
+#ifdef DATA_CACHE_SIZE
+ mov $DATA_CACHE_SIZE, %rcx
+#else
+ mov __x86_64_data_cache_size(%rip), %rcx
+#endif
+ shl $1, %rcx
+ cmp %rcx, %rdx
+ ja L(gobble_mem_bwd)
+
+ add %rdx, %rdi
+ add %rdx, %rsi
+ movdqu -16(%rsi), %xmm0
+ lea -16(%rdi), %r8
+ mov %rdi, %r9
+ and $0xf, %r9
+ xor %r9, %rdi
+ sub %r9, %rsi
+ sub %r9, %rdx
+ mov %rsi, %r9
+ and $0xf, %r9
+ jz L(shl_0_bwd)
+ lea L(shl_table_bwd)(%rip), %r11
+ sub $0x80, %rdx
+ movslq (%r11, %r9, 4), %r9
+ add %r11, %r9
+ jmp *%r9
+ ud2
+
+ ALIGN (4)
+L(shl_0):
+
+ mov %rdx, %r9
+ shr $8, %r9
+ add %rdx, %r9
+#ifdef DATA_CACHE_SIZE
+ cmp $DATA_CACHE_SIZE_HALF, %r9
+#else
+ cmp __x86_64_data_cache_size_half(%rip), %r9
+#endif
+ jae L(gobble_mem_fwd)
+ sub $0x80, %rdx
+ ALIGN (4)
+L(shl_0_loop):
+ movdqa (%rsi), %xmm1
+ movdqa %xmm1, (%rdi)
+ movaps 0x10(%rsi), %xmm2
+ movaps %xmm2, 0x10(%rdi)
+ movaps 0x20(%rsi), %xmm3
+ movaps %xmm3, 0x20(%rdi)
+ movaps 0x30(%rsi), %xmm4
+ movaps %xmm4, 0x30(%rdi)
+ movaps 0x40(%rsi), %xmm1
+ movaps %xmm1, 0x40(%rdi)
+ movaps 0x50(%rsi), %xmm2
+ movaps %xmm2, 0x50(%rdi)
+ movaps 0x60(%rsi), %xmm3
+ movaps %xmm3, 0x60(%rdi)
+ movaps 0x70(%rsi), %xmm4
+ movaps %xmm4, 0x70(%rdi)
+ sub $0x80, %rdx
+ lea 0x80(%rsi), %rsi
+ lea 0x80(%rdi), %rdi
+ jae L(shl_0_loop)
+ movdqu %xmm0, (%r8)
+ add $0x80, %rdx
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+ ALIGN (4)
+L(shl_0_bwd):
+ sub $0x80, %rdx
+L(copy_backward_loop):
+ movaps -0x10(%rsi), %xmm1
+ movaps %xmm1, -0x10(%rdi)
+ movaps -0x20(%rsi), %xmm2
+ movaps %xmm2, -0x20(%rdi)
+ movaps -0x30(%rsi), %xmm3
+ movaps %xmm3, -0x30(%rdi)
+ movaps -0x40(%rsi), %xmm4
+ movaps %xmm4, -0x40(%rdi)
+ movaps -0x50(%rsi), %xmm5
+ movaps %xmm5, -0x50(%rdi)
+ movaps -0x60(%rsi), %xmm5
+ movaps %xmm5, -0x60(%rdi)
+ movaps -0x70(%rsi), %xmm5
+ movaps %xmm5, -0x70(%rdi)
+ movaps -0x80(%rsi), %xmm5
+ movaps %xmm5, -0x80(%rdi)
+ sub $0x80, %rdx
+ lea -0x80(%rdi), %rdi
+ lea -0x80(%rsi), %rsi
+ jae L(copy_backward_loop)
+
+ movdqu %xmm0, (%r8)
+ add $0x80, %rdx
+ sub %rdx, %rdi
+ sub %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+ ALIGN (4)
+L(shl_1):
+ sub $0x80, %rdx
+ movaps -0x01(%rsi), %xmm1
+ movaps 0x0f(%rsi), %xmm2
+ movaps 0x1f(%rsi), %xmm3
+ movaps 0x2f(%rsi), %xmm4
+ movaps 0x3f(%rsi), %xmm5
+ movaps 0x4f(%rsi), %xmm6
+ movaps 0x5f(%rsi), %xmm7
+ movaps 0x6f(%rsi), %xmm8
+ movaps 0x7f(%rsi), %xmm9
+ lea 0x80(%rsi), %rsi
+ palignr $1, %xmm8, %xmm9
+ movaps %xmm9, 0x70(%rdi)
+ palignr $1, %xmm7, %xmm8
+ movaps %xmm8, 0x60(%rdi)
+ palignr $1, %xmm6, %xmm7
+ movaps %xmm7, 0x50(%rdi)
+ palignr $1, %xmm5, %xmm6
+ movaps %xmm6, 0x40(%rdi)
+ palignr $1, %xmm4, %xmm5
+ movaps %xmm5, 0x30(%rdi)
+ palignr $1, %xmm3, %xmm4
+ movaps %xmm4, 0x20(%rdi)
+ palignr $1, %xmm2, %xmm3
+ movaps %xmm3, 0x10(%rdi)
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm2, (%rdi)
+ lea 0x80(%rdi), %rdi
+ jae L(shl_1)
+ movdqu %xmm0, (%r8)
+ add $0x80, %rdx
+ add %rdx, %rdi
+ add %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+ ALIGN (4)
+L(shl_1_bwd):
+ movaps -0x01(%rsi), %xmm1
+
+ movaps -0x11(%rsi), %xmm2
+ palignr $1, %xmm2, %xmm1
+ movaps %xmm1, -0x10(%rdi)
+
+ movaps -0x21(%rsi), %xmm3
+ palignr $1, %xmm3, %xmm2
+ movaps %xmm2, -0x20(%rdi)
+
+ movaps -0x31(%rsi), %xmm4
+ palignr $1, %xmm4, %xmm3
+ movaps %xmm3, -0x30(%rdi)
+
+ movaps -0x41(%rsi), %xmm5
+ palignr $1, %xmm5, %xmm4
+ movaps %xmm4, -0x40(%rdi)
+
+ movaps -0x51(%rsi), %xmm6
+ palignr $1, %xmm6, %xmm5
+ movaps %xmm5, -0x50(%rdi)
+
+ movaps -0x61(%rsi), %xmm7
+ palignr $1, %xmm7, %xmm6
+ movaps %xmm6, -0x60(%rdi)
+
+ movaps -0x71(%rsi), %xmm8
+ palignr $1, %xmm8, %xmm7
+ movaps %xmm7, -0x70(%rdi)
+
+ movaps -0x81(%rsi), %xmm9
+ palignr $1, %xmm9, %xmm8
+ movaps %xmm8, -0x80(%rdi)
+
+ sub $0x80, %rdx
+ lea -0x80(%rdi), %rdi
+ lea -0x80(%rsi), %rsi
+ jae L(shl_1_bwd)
+ movdqu %xmm0, (%r8)
+ add $0x80, %rdx
+ sub %rdx, %rdi
+ sub %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+ ALIGN (4)
+L(shl_2):
+ sub $0x80, %rdx
+ movaps -0x02(%rsi), %xmm1
+ movaps 0x0e(%rsi), %xmm2
+ movaps 0x1e(%rsi), %xmm3
+ movaps 0x2e(%rsi), %xmm4
+ movaps 0x3e(%rsi), %xmm5
+ movaps 0x4e(%rsi), %xmm6
+ movaps 0x5e(%rsi), %xmm7
+ movaps 0x6e(%rsi), %xmm8
+ movaps 0x7e(%rsi), %xmm9
+ lea 0x80(%rsi), %rsi
+ palignr $2, %xmm8, %xmm9
+ movaps %xmm9, 0x70(%rdi)
+ palignr $2, %xmm7, %xmm8
+ movaps %xmm8, 0x60(%rdi)
+ palignr $2, %xmm6, %xmm7
+ movaps %xmm7, 0x50(%rdi)
+ palignr $2, %xmm5, %xmm6
+ movaps %xmm6, 0x40(%rdi)
+ palignr $2, %xmm4, %xmm5
+ movaps %xmm5, 0x30(%rdi)
+ palignr $2, %xmm3, %xmm4
+ movaps %xmm4, 0x20(%rdi)
+ palignr $2, %xmm2, %xmm3
+ movaps %xmm3, 0x10(%rdi)
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm2, (%rdi)
+ lea 0x80(%rdi), %rdi
+ jae L(shl_2)
+ movdqu %xmm0, (%r8)
+ add $0x80, %rdx
+ add %rdx, %rdi
+ add %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+ ALIGN (4)
+L(shl_2_bwd):
+ movaps -0x02(%rsi), %xmm1
+
+ movaps -0x12(%rsi), %xmm2
+ palignr $2, %xmm2, %xmm1
+ movaps %xmm1, -0x10(%rdi)
+
+ movaps -0x22(%rsi), %xmm3
+ palignr $2, %xmm3, %xmm2
+ movaps %xmm2, -0x20(%rdi)
+
+ movaps -0x32(%rsi), %xmm4
+ palignr $2, %xmm4, %xmm3
+ movaps %xmm3, -0x30(%rdi)
+
+ movaps -0x42(%rsi), %xmm5
+ palignr $2, %xmm5, %xmm4
+ movaps %xmm4, -0x40(%rdi)
+
+ movaps -0x52(%rsi), %xmm6
+ palignr $2, %xmm6, %xmm5
+ movaps %xmm5, -0x50(%rdi)
+
+ movaps -0x62(%rsi), %xmm7
+ palignr $2, %xmm7, %xmm6
+ movaps %xmm6, -0x60(%rdi)
+
+ movaps -0x72(%rsi), %xmm8
+ palignr $2, %xmm8, %xmm7
+ movaps %xmm7, -0x70(%rdi)
+
+ movaps -0x82(%rsi), %xmm9
+ palignr $2, %xmm9, %xmm8
+ movaps %xmm8, -0x80(%rdi)
+
+ sub $0x80, %rdx
+ lea -0x80(%rdi), %rdi
+ lea -0x80(%rsi), %rsi
+ jae L(shl_2_bwd)
+ movdqu %xmm0, (%r8)
+ add $0x80, %rdx
+ sub %rdx, %rdi
+ sub %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+ ALIGN (4)
+L(shl_3):
+ sub $0x80, %rdx
+ movaps -0x03(%rsi), %xmm1
+ movaps 0x0d(%rsi), %xmm2
+ movaps 0x1d(%rsi), %xmm3
+ movaps 0x2d(%rsi), %xmm4
+ movaps 0x3d(%rsi), %xmm5
+ movaps 0x4d(%rsi), %xmm6
+ movaps 0x5d(%rsi), %xmm7
+ movaps 0x6d(%rsi), %xmm8
+ movaps 0x7d(%rsi), %xmm9
+ lea 0x80(%rsi), %rsi
+ palignr $3, %xmm8, %xmm9
+ movaps %xmm9, 0x70(%rdi)
+ palignr $3, %xmm7, %xmm8
+ movaps %xmm8, 0x60(%rdi)
+ palignr $3, %xmm6, %xmm7
+ movaps %xmm7, 0x50(%rdi)
+ palignr $3, %xmm5, %xmm6
+ movaps %xmm6, 0x40(%rdi)
+ palignr $3, %xmm4, %xmm5
+ movaps %xmm5, 0x30(%rdi)
+ palignr $3, %xmm3, %xmm4
+ movaps %xmm4, 0x20(%rdi)
+ palignr $3, %xmm2, %xmm3
+ movaps %xmm3, 0x10(%rdi)
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm2, (%rdi)
+ lea 0x80(%rdi), %rdi
+ jae L(shl_3)