diff options
| author | H.J. Lu <hongjiu.lu@intel.com> | 2010-06-30 08:26:11 -0700 |
|---|---|---|
| committer | Ulrich Drepper <drepper@redhat.com> | 2010-06-30 08:26:11 -0700 |
| commit | 6fb8cbcb58a29fff73eb2101b34caa19a7f88eba (patch) | |
| tree | 3a39ddec3a6cf66f8541c6591dbe4017136580f0 | |
| parent | d85f8ff66711fd3b1c5753330499c7403fa46d81 (diff) | |
| download | glibc-6fb8cbcb58a29fff73eb2101b34caa19a7f88eba.tar.xz glibc-6fb8cbcb58a29fff73eb2101b34caa19a7f88eba.zip | |
Improve 64-bit memcpy/memmove for Atom, Core 2 and Core i7
This patch includes optimized 64-bit memcpy/memmove implementations for
Atom, Core 2 and Core i7. It improves memcpy by up to 3X on Atom, up to
4X on Core 2 and up to 1X on Core i7. It also improves memmove by up to
3X on Atom, up to 4X on Core 2 and up to 2X on Core i7.
| -rw-r--r-- | ChangeLog | 32 | ||||
| -rw-r--r-- | debug/memmove_chk.c | 6 | ||||
| -rw-r--r-- | string/memmove.c | 5 | ||||
| -rw-r--r-- | sysdeps/x86_64/memcpy.S | 4 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/Makefile | 4 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/bcopy.S | 7 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/init-arch.c | 9 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/init-arch.h | 16 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/memcpy-ssse3-back.S | 3169 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/memcpy-ssse3.S | 3139 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/memcpy.S | 73 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/memcpy_chk.S | 47 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/memmove-ssse3-back.S | 4 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/memmove-ssse3.S | 4 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/memmove.c | 24 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/memmove_chk.c | 15 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S | 4 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/mempcpy-ssse3.S | 4 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/mempcpy.S | 75 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/mempcpy_chk.S | 47 | ||||
| -rw-r--r-- | sysdeps/x86_64/sysdep.h | 3 |
21 files changed, 6681 insertions, 10 deletions
@@ -1,3 +1,35 @@ +2010-06-25 H.J. Lu <hongjiu.lu@intel.com> + + * debug/memmove_chk.c (__memmove_chk): Renamed to ... + (MEMMOVE_CHK): ...this. Default to __memmove_chk. + * string/memmove.c (memmove): Renamed to ... + (MEMMOVE): ...this. Default to memmove. + * sysdeps/x86_64/memcpy.S: Use ENTRY_CHK and END_CHK. + * sysdeps/x86_64/sysdep.h (ENTRY_CHK): Define. + (END_CHK): Define. + * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add + memcpy-ssse3 mempcpy-ssse3 memmove-ssse3 memcpy-ssse3-back + mempcpy-ssse3-back memmove-ssse3-back. + * sysdeps/x86_64/multiarch/bcopy.S: New file . + * sysdeps/x86_64/multiarch/memcpy-ssse3-back.S: New file. + * sysdeps/x86_64/multiarch/memcpy-ssse3.S: New file. + * sysdeps/x86_64/multiarch/memcpy.S: New file. + * sysdeps/x86_64/multiarch/memcpy_chk.S: New file. + * sysdeps/x86_64/multiarch/memmove-ssse3-back.S: New file. + * sysdeps/x86_64/multiarch/memmove-ssse3.S: New file. + * sysdeps/x86_64/multiarch/memmove.c: New file. + * sysdeps/x86_64/multiarch/memmove_chk.c: New file. + * sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S: New file. + * sysdeps/x86_64/multiarch/mempcpy-ssse3.S: New file. + * sysdeps/x86_64/multiarch/mempcpy.S: New file. + * sysdeps/x86_64/multiarch/mempcpy_chk.S: New file. + * sysdeps/x86_64/multiarch/init-arch.h (bit_Fast_Copy_Backward): + Define. + (index_Fast_Copy_Backward): Define. + (HAS_ARCH_FEATURE): Define. + (HAS_FAST_REP_STRING): Define. + (HAS_FAST_COPY_BACKWARD): Define. 
+ 2010-06-21 Andreas Schwab <schwab@redhat.com> * sysdeps/unix/sysv/linux/getlogin_r.c (__getlogin_r_loginuid): diff --git a/debug/memmove_chk.c b/debug/memmove_chk.c index f3b74d23d9..6a3e157d8b 100644 --- a/debug/memmove_chk.c +++ b/debug/memmove_chk.c @@ -23,8 +23,12 @@ #include <memcopy.h> #include <pagecopy.h> +#ifndef MEMMOVE_CHK +# define MEMMOVE_CHK __memmove_chk +#endif + void * -__memmove_chk (dest, src, len, destlen) +MEMMOVE_CHK (dest, src, len, destlen) void *dest; const void *src; size_t len; diff --git a/string/memmove.c b/string/memmove.c index 16671f7bb5..8e36e7c5a3 100644 --- a/string/memmove.c +++ b/string/memmove.c @@ -37,9 +37,12 @@ #define rettype void * #endif +#ifndef MEMMOVE +#define MEMMOVE memmove +#endif rettype -memmove (a1, a2, len) +MEMMOVE (a1, a2, len) a1const void *a1; a2const void *a2; size_t len; diff --git a/sysdeps/x86_64/memcpy.S b/sysdeps/x86_64/memcpy.S index b25646b8c5..b4545ac9f7 100644 --- a/sysdeps/x86_64/memcpy.S +++ b/sysdeps/x86_64/memcpy.S @@ -40,12 +40,12 @@ .text #if defined PIC && !defined NOT_IN_libc -ENTRY (__memcpy_chk) +ENTRY_CHK (__memcpy_chk) cmpq %rdx, %rcx jb HIDDEN_JUMPTARGET (__chk_fail) -END (__memcpy_chk) +END_CHK (__memcpy_chk) #endif ENTRY(memcpy) /* (void *, const void*, size_t) */ diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index c61cf70345..0ca914a377 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -5,7 +5,9 @@ endif ifeq ($(subdir),string) sysdep_routines += stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \ - strend-sse4 memcmp-sse4 + strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \ + memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \ + memmove-ssse3-back ifeq (yes,$(config-cflags-sse4)) sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c CFLAGS-strcspn-c.c += -msse4 diff --git a/sysdeps/x86_64/multiarch/bcopy.S b/sysdeps/x86_64/multiarch/bcopy.S new file mode 100644 index 
0000000000..11e250f1cb --- /dev/null +++ b/sysdeps/x86_64/multiarch/bcopy.S @@ -0,0 +1,7 @@ +#include <sysdep.h> + + .text +ENTRY(bcopy) + xchg %rdi, %rsi + jmp HIDDEN_BUILTIN_JUMPTARGET(memmove) +END(bcopy) diff --git a/sysdeps/x86_64/multiarch/init-arch.c b/sysdeps/x86_64/multiarch/init-arch.c index f13a9f4b79..55c9f54f96 100644 --- a/sysdeps/x86_64/multiarch/init-arch.c +++ b/sysdeps/x86_64/multiarch/init-arch.c @@ -78,10 +78,13 @@ __init_cpu_features (void) case 0x25: case 0x2e: case 0x2f: - /* Rep string instructions are fast on Intel Core i3, i5 - and i7. */ + /* Rep string instructions and copy backward are fast on + Intel Core i3, i5 and i7. */ +#if index_Fast_Rep_String != index_Fast_Copy_Backward +# error index_Fast_Rep_String != index_Fast_Copy_Backward +#endif __cpu_features.feature[index_Fast_Rep_String] - |= bit_Fast_Rep_String; + |= bit_Fast_Rep_String | bit_Fast_Copy_Backward; break; } } diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h index b2f2de3796..4a211c0864 100644 --- a/sysdeps/x86_64/multiarch/init-arch.h +++ b/sysdeps/x86_64/multiarch/init-arch.h @@ -17,6 +17,7 @@ 02111-1307 USA. 
*/ #define bit_Fast_Rep_String (1 << 0) +#define bit_Fast_Copy_Backward (1 << 1) #ifdef __ASSEMBLER__ @@ -32,7 +33,8 @@ # define index_SSE4_1 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET # define index_SSE4_2 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET -#define index_Fast_Rep_String FEATURE_INDEX_1*FEATURE_SIZE +# define index_Fast_Rep_String FEATURE_INDEX_1*FEATURE_SIZE +# define index_Fast_Copy_Backward FEATURE_INDEX_1*FEATURE_SIZE #else /* __ASSEMBLER__ */ @@ -102,6 +104,16 @@ extern const struct cpu_features *__get_cpu_features (void) # define HAS_SSE4_2 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 20) # define HAS_FMA HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 12) -# define index_Fast_Rep_String FEATURE_INDEX_1 +# define index_Fast_Rep_String FEATURE_INDEX_1 +# define index_Fast_Copy_Backward FEATURE_INDEX_1 + +#define HAS_ARCH_FEATURE(idx, bit) \ + ((__get_cpu_features ()->feature[idx] & (bit)) != 0) + +#define HAS_FAST_REP_STRING \ + HAS_ARCH_FEATURE (index_Fast_Rep_String, bit_Fast_Rep_String) + +#define HAS_FAST_COPY_BACKWARD \ + HAS_ARCH_FEATURE (index_Fast_Copy_Backward, bit_Fast_Copy_Backward) #endif /* __ASSEMBLER__ */ diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S new file mode 100644 index 0000000000..48c974e97f --- /dev/null +++ b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S @@ -0,0 +1,3169 @@ +/* memcpy with SSSE3 and REP string + Copyright (C) 2010 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> + +#if !defined NOT_IN_libc \ + && (defined SHARED \ + || defined USE_AS_MEMMOVE \ + || !defined USE_MULTIARCH) + +#include "asm-syntax.h" + +#ifndef MEMCPY +# define MEMCPY __memcpy_ssse3_back +# define MEMCPY_CHK __memcpy_chk_ssse3_back +#endif + +#ifndef ALIGN +# define ALIGN(n) .p2align n +#endif + +#define JMPTBL(I, B) I - B + +/* Branch to an entry in a jump table. TABLE is a jump table with + relative offsets. INDEX is a register contains the index into the + jump table. SCALE is the scale of INDEX. 
*/ +#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + lea TABLE(%rip), %r11; \ + movslq (%r11, INDEX, SCALE), INDEX; \ + lea (%r11, INDEX), INDEX; \ + jmp *INDEX; \ + ud2 + + .section .text.ssse3,"ax",@progbits +#if defined SHARED && !defined NOT_IN_libc +ENTRY (MEMCPY_CHK) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMCPY_CHK) +#endif + +ENTRY (MEMCPY) + mov %rdi, %rax +#ifdef USE_AS_MEMPCPY + add %rdx, %rax +#endif + +#ifdef USE_AS_MEMMOVE + cmp %rsi, %rdi + jb L(copy_forward) + je L(bwd_write_0bytes) + cmp $144, %rdx + jae L(copy_backward) + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) +L(copy_forward): +#endif + cmp $144, %rdx + jae L(144bytesormore) + +L(fwd_write_less32bytes): +#ifndef USE_AS_MEMMOVE + cmp %dil, %sil + jbe L(bk_write) +#endif + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) +#ifndef USE_AS_MEMMOVE +L(bk_write): + + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) +#endif + + ALIGN (4) +L(144bytesormore): + +#ifndef USE_AS_MEMMOVE + cmp %dil, %sil + jle L(copy_backward) +#endif + movdqu (%rsi), %xmm0 + mov %rdi, %r8 + and $-16, %rdi + add $16, %rdi + mov %rdi, %r9 + sub %r8, %r9 + sub %r9, %rdx + add %r9, %rsi + mov %rsi, %r9 + and $0xf, %r9 + jz L(shl_0) +#ifdef DATA_CACHE_SIZE + mov $DATA_CACHE_SIZE, %rcx +#else + mov __x86_64_data_cache_size(%rip), %rcx +#endif + cmp %rcx, %rdx + jae L(gobble_mem_fwd) + lea L(shl_table_fwd)(%rip), %r11 + sub $0x80, %rdx + movslq (%r11, %r9, 4), %r9 + add %r11, %r9 + jmp *%r9 + ud2 + + ALIGN (4) +L(copy_backward): +#ifdef DATA_CACHE_SIZE + mov $DATA_CACHE_SIZE, %rcx +#else + mov __x86_64_data_cache_size(%rip), %rcx +#endif + shl $1, %rcx + cmp %rcx, %rdx + ja L(gobble_mem_bwd) + + add %rdx, %rdi + add %rdx, %rsi + movdqu -16(%rsi), %xmm0 + lea -16(%rdi), %r8 + mov %rdi, %r9 + and $0xf, %r9 + xor %r9, %rdi + sub %r9, %rsi + sub %r9, %rdx + mov %rsi, %r9 + and $0xf, %r9 + jz L(shl_0_bwd) + lea L(shl_table_bwd)(%rip), %r11 + 
sub $0x80, %rdx + movslq (%r11, %r9, 4), %r9 + add %r11, %r9 + jmp *%r9 + ud2 + + ALIGN (4) +L(shl_0): + + mov %rdx, %r9 + shr $8, %r9 + add %rdx, %r9 +#ifdef DATA_CACHE_SIZE + cmp $DATA_CACHE_SIZE_HALF, %r9 +#else + cmp __x86_64_data_cache_size_half(%rip), %r9 +#endif + jae L(gobble_mem_fwd) + sub $0x80, %rdx + ALIGN (4) +L(shl_0_loop): + movdqa (%rsi), %xmm1 + movdqa %xmm1, (%rdi) + movaps 0x10(%rsi), %xmm2 + movaps %xmm2, 0x10(%rdi) + movaps 0x20(%rsi), %xmm3 + movaps %xmm3, 0x20(%rdi) + movaps 0x30(%rsi), %xmm4 + movaps %xmm4, 0x30(%rdi) + movaps 0x40(%rsi), %xmm1 + movaps %xmm1, 0x40(%rdi) + movaps 0x50(%rsi), %xmm2 + movaps %xmm2, 0x50(%rdi) + movaps 0x60(%rsi), %xmm3 + movaps %xmm3, 0x60(%rdi) + movaps 0x70(%rsi), %xmm4 + movaps %xmm4, 0x70(%rdi) + sub $0x80, %rdx + lea 0x80(%rsi), %rsi + lea 0x80(%rdi), %rdi + jae L(shl_0_loop) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + ALIGN (4) +L(shl_0_bwd): + sub $0x80, %rdx +L(copy_backward_loop): + movaps -0x10(%rsi), %xmm1 + movaps %xmm1, -0x10(%rdi) + movaps -0x20(%rsi), %xmm2 + movaps %xmm2, -0x20(%rdi) + movaps -0x30(%rsi), %xmm3 + movaps %xmm3, -0x30(%rdi) + movaps -0x40(%rsi), %xmm4 + movaps %xmm4, -0x40(%rdi) + movaps -0x50(%rsi), %xmm5 + movaps %xmm5, -0x50(%rdi) + movaps -0x60(%rsi), %xmm5 + movaps %xmm5, -0x60(%rdi) + movaps -0x70(%rsi), %xmm5 + movaps %xmm5, -0x70(%rdi) + movaps -0x80(%rsi), %xmm5 + movaps %xmm5, -0x80(%rdi) + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(copy_backward_loop) + + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + ALIGN (4) +L(shl_1): + sub $0x80, %rdx + movaps -0x01(%rsi), %xmm1 + movaps 0x0f(%rsi), %xmm2 + movaps 0x1f(%rsi), %xmm3 + movaps 0x2f(%rsi), %xmm4 + movaps 0x3f(%rsi), %xmm5 + movaps 0x4f(%rsi), %xmm6 + movaps 0x5f(%rsi), %xmm7 + movaps 0x6f(%rsi), %xmm8 + movaps 
0x7f(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $1, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $1, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $1, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $1, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $1, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $1, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $1, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $1, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_1) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + ALIGN (4) +L(shl_1_bwd): + movaps -0x01(%rsi), %xmm1 + + movaps -0x11(%rsi), %xmm2 + palignr $1, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x21(%rsi), %xmm3 + palignr $1, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x31(%rsi), %xmm4 + palignr $1, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x41(%rsi), %xmm5 + palignr $1, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x51(%rsi), %xmm6 + palignr $1, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x61(%rsi), %xmm7 + palignr $1, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x71(%rsi), %xmm8 + palignr $1, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x81(%rsi), %xmm9 + palignr $1, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_1_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + ALIGN (4) +L(shl_2): + sub $0x80, %rdx + movaps -0x02(%rsi), %xmm1 + movaps 0x0e(%rsi), %xmm2 + movaps 0x1e(%rsi), %xmm3 + movaps 0x2e(%rsi), %xmm4 + movaps 0x3e(%rsi), %xmm5 + movaps 0x4e(%rsi), %xmm6 + movaps 0x5e(%rsi), %xmm7 + movaps 0x6e(%rsi), %xmm8 + movaps 0x7e(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $2, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $2, %xmm7, %xmm8 + movaps 
%xmm8, 0x60(%rdi) + palignr $2, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $2, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + palignr $2, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $2, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $2, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $2, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_2) + movdqu %xmm0, (%r8) + add $0x80, %rdx + add %rdx, %rdi + add %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) + + ALIGN (4) +L(shl_2_bwd): + movaps -0x02(%rsi), %xmm1 + + movaps -0x12(%rsi), %xmm2 + palignr $2, %xmm2, %xmm1 + movaps %xmm1, -0x10(%rdi) + + movaps -0x22(%rsi), %xmm3 + palignr $2, %xmm3, %xmm2 + movaps %xmm2, -0x20(%rdi) + + movaps -0x32(%rsi), %xmm4 + palignr $2, %xmm4, %xmm3 + movaps %xmm3, -0x30(%rdi) + + movaps -0x42(%rsi), %xmm5 + palignr $2, %xmm5, %xmm4 + movaps %xmm4, -0x40(%rdi) + + movaps -0x52(%rsi), %xmm6 + palignr $2, %xmm6, %xmm5 + movaps %xmm5, -0x50(%rdi) + + movaps -0x62(%rsi), %xmm7 + palignr $2, %xmm7, %xmm6 + movaps %xmm6, -0x60(%rdi) + + movaps -0x72(%rsi), %xmm8 + palignr $2, %xmm8, %xmm7 + movaps %xmm7, -0x70(%rdi) + + movaps -0x82(%rsi), %xmm9 + palignr $2, %xmm9, %xmm8 + movaps %xmm8, -0x80(%rdi) + + sub $0x80, %rdx + lea -0x80(%rdi), %rdi + lea -0x80(%rsi), %rsi + jae L(shl_2_bwd) + movdqu %xmm0, (%r8) + add $0x80, %rdx + sub %rdx, %rdi + sub %rdx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) + + ALIGN (4) +L(shl_3): + sub $0x80, %rdx + movaps -0x03(%rsi), %xmm1 + movaps 0x0d(%rsi), %xmm2 + movaps 0x1d(%rsi), %xmm3 + movaps 0x2d(%rsi), %xmm4 + movaps 0x3d(%rsi), %xmm5 + movaps 0x4d(%rsi), %xmm6 + movaps 0x5d(%rsi), %xmm7 + movaps 0x6d(%rsi), %xmm8 + movaps 0x7d(%rsi), %xmm9 + lea 0x80(%rsi), %rsi + palignr $3, %xmm8, %xmm9 + movaps %xmm9, 0x70(%rdi) + palignr $3, %xmm7, %xmm8 + movaps %xmm8, 0x60(%rdi) + palignr $3, %xmm6, %xmm7 + movaps %xmm7, 0x50(%rdi) + palignr $3, %xmm5, %xmm6 + movaps %xmm6, 0x40(%rdi) + 
palignr $3, %xmm4, %xmm5 + movaps %xmm5, 0x30(%rdi) + palignr $3, %xmm3, %xmm4 + movaps %xmm4, 0x20(%rdi) + palignr $3, %xmm2, %xmm3 + movaps %xmm3, 0x10(%rdi) + palignr $3, %xmm1, %xmm2 + movaps %xmm2, (%rdi) + lea 0x80(%rdi), %rdi + jae L(shl_3) |
