about summary refs log tree commit diff
diff options
context:
space:
mode:
authorH.J. Lu <hongjiu.lu@intel.com>2010-06-30 08:26:11 -0700
committerUlrich Drepper <drepper@redhat.com>2010-06-30 08:26:11 -0700
commit6fb8cbcb58a29fff73eb2101b34caa19a7f88eba (patch)
tree3a39ddec3a6cf66f8541c6591dbe4017136580f0
parentd85f8ff66711fd3b1c5753330499c7403fa46d81 (diff)
downloadglibc-6fb8cbcb58a29fff73eb2101b34caa19a7f88eba.tar.xz
glibc-6fb8cbcb58a29fff73eb2101b34caa19a7f88eba.zip
Improve 64bit memcpy/memmove for Atom, Core 2 and Core i7
This patch includes optimized 64bit memcpy/memmove for Atom, Core 2 and Core i7. It improves memcpy by up to 3X on Atom, up to 4X on Core 2 and up to 1X on Core i7. It also improves memmove by up to 3X on Atom, up to 4X on Core 2 and up to 2X on Core i7.
-rw-r--r--ChangeLog32
-rw-r--r--debug/memmove_chk.c6
-rw-r--r--string/memmove.c5
-rw-r--r--sysdeps/x86_64/memcpy.S4
-rw-r--r--sysdeps/x86_64/multiarch/Makefile4
-rw-r--r--sysdeps/x86_64/multiarch/bcopy.S7
-rw-r--r--sysdeps/x86_64/multiarch/init-arch.c9
-rw-r--r--sysdeps/x86_64/multiarch/init-arch.h16
-rw-r--r--sysdeps/x86_64/multiarch/memcpy-ssse3-back.S3169
-rw-r--r--sysdeps/x86_64/multiarch/memcpy-ssse3.S3139
-rw-r--r--sysdeps/x86_64/multiarch/memcpy.S73
-rw-r--r--sysdeps/x86_64/multiarch/memcpy_chk.S47
-rw-r--r--sysdeps/x86_64/multiarch/memmove-ssse3-back.S4
-rw-r--r--sysdeps/x86_64/multiarch/memmove-ssse3.S4
-rw-r--r--sysdeps/x86_64/multiarch/memmove.c24
-rw-r--r--sysdeps/x86_64/multiarch/memmove_chk.c15
-rw-r--r--sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S4
-rw-r--r--sysdeps/x86_64/multiarch/mempcpy-ssse3.S4
-rw-r--r--sysdeps/x86_64/multiarch/mempcpy.S75
-rw-r--r--sysdeps/x86_64/multiarch/mempcpy_chk.S47
-rw-r--r--sysdeps/x86_64/sysdep.h3
21 files changed, 6681 insertions, 10 deletions
diff --git a/ChangeLog b/ChangeLog
index eaf57497c1..175c6ed53b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,35 @@
+2010-06-25 H.J. Lu <hongjiu.lu@intel.com>
+
+ * debug/memmove_chk.c (__memmove_chk): Renamed to ...
+ (MEMMOVE_CHK): ...this. Default to __memmove_chk.
+ * string/memmove.c (memmove): Renamed to ...
+ (MEMMOVE): ...this. Default to memmove.
+ * sysdeps/x86_64/memcpy.S: Use ENTRY_CHK and END_CHK.
+ * sysdeps/x86_64/sysdep.h (ENTRY_CHK): Define.
+ (END_CHK): Define.
+ * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
+ memcpy-ssse3 mempcpy-ssse3 memmove-ssse3 memcpy-ssse3-back
+ mempcpy-ssse3-back memmove-ssse3-back.
+	* sysdeps/x86_64/multiarch/bcopy.S: New file.
+ * sysdeps/x86_64/multiarch/memcpy-ssse3-back.S: New file.
+ * sysdeps/x86_64/multiarch/memcpy-ssse3.S: New file.
+ * sysdeps/x86_64/multiarch/memcpy.S: New file.
+ * sysdeps/x86_64/multiarch/memcpy_chk.S: New file.
+ * sysdeps/x86_64/multiarch/memmove-ssse3-back.S: New file.
+ * sysdeps/x86_64/multiarch/memmove-ssse3.S: New file.
+ * sysdeps/x86_64/multiarch/memmove.c: New file.
+ * sysdeps/x86_64/multiarch/memmove_chk.c: New file.
+ * sysdeps/x86_64/multiarch/mempcpy-ssse3-back.S: New file.
+ * sysdeps/x86_64/multiarch/mempcpy-ssse3.S: New file.
+ * sysdeps/x86_64/multiarch/mempcpy.S: New file.
+ * sysdeps/x86_64/multiarch/mempcpy_chk.S: New file.
+ * sysdeps/x86_64/multiarch/init-arch.h (bit_Fast_Copy_Backward):
+ Define.
+ (index_Fast_Copy_Backward): Define.
+ (HAS_ARCH_FEATURE): Define.
+ (HAS_FAST_REP_STRING): Define.
+ (HAS_FAST_COPY_BACKWARD): Define.
+
2010-06-21 Andreas Schwab <schwab@redhat.com>
* sysdeps/unix/sysv/linux/getlogin_r.c (__getlogin_r_loginuid):
diff --git a/debug/memmove_chk.c b/debug/memmove_chk.c
index f3b74d23d9..6a3e157d8b 100644
--- a/debug/memmove_chk.c
+++ b/debug/memmove_chk.c
@@ -23,8 +23,12 @@
#include <memcopy.h>
#include <pagecopy.h>
+#ifndef MEMMOVE_CHK
+# define MEMMOVE_CHK __memmove_chk
+#endif
+
void *
-__memmove_chk (dest, src, len, destlen)
+MEMMOVE_CHK (dest, src, len, destlen)
void *dest;
const void *src;
size_t len;
diff --git a/string/memmove.c b/string/memmove.c
index 16671f7bb5..8e36e7c5a3 100644
--- a/string/memmove.c
+++ b/string/memmove.c
@@ -37,9 +37,12 @@
#define rettype void *
#endif
+#ifndef MEMMOVE
+#define MEMMOVE memmove
+#endif
rettype
-memmove (a1, a2, len)
+MEMMOVE (a1, a2, len)
a1const void *a1;
a2const void *a2;
size_t len;
diff --git a/sysdeps/x86_64/memcpy.S b/sysdeps/x86_64/memcpy.S
index b25646b8c5..b4545ac9f7 100644
--- a/sysdeps/x86_64/memcpy.S
+++ b/sysdeps/x86_64/memcpy.S
@@ -40,12 +40,12 @@
.text
#if defined PIC && !defined NOT_IN_libc
-ENTRY (__memcpy_chk)
+ENTRY_CHK (__memcpy_chk)
cmpq %rdx, %rcx
jb HIDDEN_JUMPTARGET (__chk_fail)
-END (__memcpy_chk)
+END_CHK (__memcpy_chk)
#endif
ENTRY(memcpy) /* (void *, const void*, size_t) */
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index c61cf70345..0ca914a377 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -5,7 +5,9 @@ endif
ifeq ($(subdir),string)
sysdep_routines += stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \
- strend-sse4 memcmp-sse4
+ strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \
+ memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \
+ memmove-ssse3-back
ifeq (yes,$(config-cflags-sse4))
sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
CFLAGS-strcspn-c.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/bcopy.S b/sysdeps/x86_64/multiarch/bcopy.S
new file mode 100644
index 0000000000..11e250f1cb
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/bcopy.S
@@ -0,0 +1,7 @@
+#include <sysdep.h>
+
+ .text
+ENTRY(bcopy)
+ xchg %rdi, %rsi
+ jmp HIDDEN_BUILTIN_JUMPTARGET(memmove)
+END(bcopy)
diff --git a/sysdeps/x86_64/multiarch/init-arch.c b/sysdeps/x86_64/multiarch/init-arch.c
index f13a9f4b79..55c9f54f96 100644
--- a/sysdeps/x86_64/multiarch/init-arch.c
+++ b/sysdeps/x86_64/multiarch/init-arch.c
@@ -78,10 +78,13 @@ __init_cpu_features (void)
case 0x25:
case 0x2e:
case 0x2f:
- /* Rep string instructions are fast on Intel Core i3, i5
- and i7. */
+ /* Rep string instructions and copy backward are fast on
+ Intel Core i3, i5 and i7. */
+#if index_Fast_Rep_String != index_Fast_Copy_Backward
+# error index_Fast_Rep_String != index_Fast_Copy_Backward
+#endif
__cpu_features.feature[index_Fast_Rep_String]
- |= bit_Fast_Rep_String;
+ |= bit_Fast_Rep_String | bit_Fast_Copy_Backward;
break;
}
}
diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h
index b2f2de3796..4a211c0864 100644
--- a/sysdeps/x86_64/multiarch/init-arch.h
+++ b/sysdeps/x86_64/multiarch/init-arch.h
@@ -17,6 +17,7 @@
02111-1307 USA. */
#define bit_Fast_Rep_String (1 << 0)
+#define bit_Fast_Copy_Backward (1 << 1)
#ifdef __ASSEMBLER__
@@ -32,7 +33,8 @@
# define index_SSE4_1 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
# define index_SSE4_2 COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET
-#define index_Fast_Rep_String FEATURE_INDEX_1*FEATURE_SIZE
+# define index_Fast_Rep_String FEATURE_INDEX_1*FEATURE_SIZE
+# define index_Fast_Copy_Backward FEATURE_INDEX_1*FEATURE_SIZE
#else /* __ASSEMBLER__ */
@@ -102,6 +104,16 @@ extern const struct cpu_features *__get_cpu_features (void)
# define HAS_SSE4_2 HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 20)
# define HAS_FMA HAS_CPU_FEATURE (COMMON_CPUID_INDEX_1, ecx, 12)
-# define index_Fast_Rep_String FEATURE_INDEX_1
+# define index_Fast_Rep_String FEATURE_INDEX_1
+# define index_Fast_Copy_Backward FEATURE_INDEX_1
+
+#define HAS_ARCH_FEATURE(idx, bit) \
+ ((__get_cpu_features ()->feature[idx] & (bit)) != 0)
+
+#define HAS_FAST_REP_STRING \
+ HAS_ARCH_FEATURE (index_Fast_Rep_String, bit_Fast_Rep_String)
+
+#define HAS_FAST_COPY_BACKWARD \
+ HAS_ARCH_FEATURE (index_Fast_Copy_Backward, bit_Fast_Copy_Backward)
#endif /* __ASSEMBLER__ */
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
new file mode 100644
index 0000000000..48c974e97f
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
@@ -0,0 +1,3169 @@
+/* memcpy with SSSE3 and REP string
+ Copyright (C) 2010 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+#include <sysdep.h>
+
+#if !defined NOT_IN_libc \
+ && (defined SHARED \
+ || defined USE_AS_MEMMOVE \
+ || !defined USE_MULTIARCH)
+
+#include "asm-syntax.h"
+
+#ifndef MEMCPY
+# define MEMCPY __memcpy_ssse3_back
+# define MEMCPY_CHK __memcpy_chk_ssse3_back
+#endif
+
+#ifndef ALIGN
+# define ALIGN(n) .p2align n
+#endif
+
+#define JMPTBL(I, B) I - B
+
+/* Branch to an entry in a jump table. TABLE is a jump table with
+ relative offsets. INDEX is a register contains the index into the
+ jump table. SCALE is the scale of INDEX. */
+#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ lea TABLE(%rip), %r11; \
+ movslq (%r11, INDEX, SCALE), INDEX; \
+ lea (%r11, INDEX), INDEX; \
+ jmp *INDEX; \
+ ud2
+
+ .section .text.ssse3,"ax",@progbits
+#if defined SHARED && !defined NOT_IN_libc
+ENTRY (MEMCPY_CHK)
+ cmpq %rdx, %rcx
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (MEMCPY_CHK)
+#endif
+
+ENTRY (MEMCPY)
+ mov %rdi, %rax
+#ifdef USE_AS_MEMPCPY
+ add %rdx, %rax
+#endif
+
+#ifdef USE_AS_MEMMOVE
+ cmp %rsi, %rdi
+ jb L(copy_forward)
+ je L(bwd_write_0bytes)
+ cmp $144, %rdx
+ jae L(copy_backward)
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+L(copy_forward):
+#endif
+ cmp $144, %rdx
+ jae L(144bytesormore)
+
+L(fwd_write_less32bytes):
+#ifndef USE_AS_MEMMOVE
+ cmp %dil, %sil
+ jbe L(bk_write)
+#endif
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+#ifndef USE_AS_MEMMOVE
+L(bk_write):
+
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+#endif
+
+ ALIGN (4)
+L(144bytesormore):
+
+#ifndef USE_AS_MEMMOVE
+ cmp %dil, %sil
+ jle L(copy_backward)
+#endif
+ movdqu (%rsi), %xmm0
+ mov %rdi, %r8
+ and $-16, %rdi
+ add $16, %rdi
+ mov %rdi, %r9
+ sub %r8, %r9
+ sub %r9, %rdx
+ add %r9, %rsi
+ mov %rsi, %r9
+ and $0xf, %r9
+ jz L(shl_0)
+#ifdef DATA_CACHE_SIZE
+ mov $DATA_CACHE_SIZE, %rcx
+#else
+ mov __x86_64_data_cache_size(%rip), %rcx
+#endif
+ cmp %rcx, %rdx
+ jae L(gobble_mem_fwd)
+ lea L(shl_table_fwd)(%rip), %r11
+ sub $0x80, %rdx
+ movslq (%r11, %r9, 4), %r9
+ add %r11, %r9
+ jmp *%r9
+ ud2
+
+ ALIGN (4)
+L(copy_backward):
+#ifdef DATA_CACHE_SIZE
+ mov $DATA_CACHE_SIZE, %rcx
+#else
+ mov __x86_64_data_cache_size(%rip), %rcx
+#endif
+ shl $1, %rcx
+ cmp %rcx, %rdx
+ ja L(gobble_mem_bwd)
+
+ add %rdx, %rdi
+ add %rdx, %rsi
+ movdqu -16(%rsi), %xmm0
+ lea -16(%rdi), %r8
+ mov %rdi, %r9
+ and $0xf, %r9
+ xor %r9, %rdi
+ sub %r9, %rsi
+ sub %r9, %rdx
+ mov %rsi, %r9
+ and $0xf, %r9
+ jz L(shl_0_bwd)
+ lea L(shl_table_bwd)(%rip), %r11
+ sub $0x80, %rdx
+ movslq (%r11, %r9, 4), %r9
+ add %r11, %r9
+ jmp *%r9
+ ud2
+
+ ALIGN (4)
+L(shl_0):
+
+ mov %rdx, %r9
+ shr $8, %r9
+ add %rdx, %r9
+#ifdef DATA_CACHE_SIZE
+ cmp $DATA_CACHE_SIZE_HALF, %r9
+#else
+ cmp __x86_64_data_cache_size_half(%rip), %r9
+#endif
+ jae L(gobble_mem_fwd)
+ sub $0x80, %rdx
+ ALIGN (4)
+L(shl_0_loop):
+ movdqa (%rsi), %xmm1
+ movdqa %xmm1, (%rdi)
+ movaps 0x10(%rsi), %xmm2
+ movaps %xmm2, 0x10(%rdi)
+ movaps 0x20(%rsi), %xmm3
+ movaps %xmm3, 0x20(%rdi)
+ movaps 0x30(%rsi), %xmm4
+ movaps %xmm4, 0x30(%rdi)
+ movaps 0x40(%rsi), %xmm1
+ movaps %xmm1, 0x40(%rdi)
+ movaps 0x50(%rsi), %xmm2
+ movaps %xmm2, 0x50(%rdi)
+ movaps 0x60(%rsi), %xmm3
+ movaps %xmm3, 0x60(%rdi)
+ movaps 0x70(%rsi), %xmm4
+ movaps %xmm4, 0x70(%rdi)
+ sub $0x80, %rdx
+ lea 0x80(%rsi), %rsi
+ lea 0x80(%rdi), %rdi
+ jae L(shl_0_loop)
+ movdqu %xmm0, (%r8)
+ add $0x80, %rdx
+ add %rdx, %rsi
+ add %rdx, %rdi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+ ALIGN (4)
+L(shl_0_bwd):
+ sub $0x80, %rdx
+L(copy_backward_loop):
+ movaps -0x10(%rsi), %xmm1
+ movaps %xmm1, -0x10(%rdi)
+ movaps -0x20(%rsi), %xmm2
+ movaps %xmm2, -0x20(%rdi)
+ movaps -0x30(%rsi), %xmm3
+ movaps %xmm3, -0x30(%rdi)
+ movaps -0x40(%rsi), %xmm4
+ movaps %xmm4, -0x40(%rdi)
+ movaps -0x50(%rsi), %xmm5
+ movaps %xmm5, -0x50(%rdi)
+ movaps -0x60(%rsi), %xmm5
+ movaps %xmm5, -0x60(%rdi)
+ movaps -0x70(%rsi), %xmm5
+ movaps %xmm5, -0x70(%rdi)
+ movaps -0x80(%rsi), %xmm5
+ movaps %xmm5, -0x80(%rdi)
+ sub $0x80, %rdx
+ lea -0x80(%rdi), %rdi
+ lea -0x80(%rsi), %rsi
+ jae L(copy_backward_loop)
+
+ movdqu %xmm0, (%r8)
+ add $0x80, %rdx
+ sub %rdx, %rdi
+ sub %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+ ALIGN (4)
+L(shl_1):
+ sub $0x80, %rdx
+ movaps -0x01(%rsi), %xmm1
+ movaps 0x0f(%rsi), %xmm2
+ movaps 0x1f(%rsi), %xmm3
+ movaps 0x2f(%rsi), %xmm4
+ movaps 0x3f(%rsi), %xmm5
+ movaps 0x4f(%rsi), %xmm6
+ movaps 0x5f(%rsi), %xmm7
+ movaps 0x6f(%rsi), %xmm8
+ movaps 0x7f(%rsi), %xmm9
+ lea 0x80(%rsi), %rsi
+ palignr $1, %xmm8, %xmm9
+ movaps %xmm9, 0x70(%rdi)
+ palignr $1, %xmm7, %xmm8
+ movaps %xmm8, 0x60(%rdi)
+ palignr $1, %xmm6, %xmm7
+ movaps %xmm7, 0x50(%rdi)
+ palignr $1, %xmm5, %xmm6
+ movaps %xmm6, 0x40(%rdi)
+ palignr $1, %xmm4, %xmm5
+ movaps %xmm5, 0x30(%rdi)
+ palignr $1, %xmm3, %xmm4
+ movaps %xmm4, 0x20(%rdi)
+ palignr $1, %xmm2, %xmm3
+ movaps %xmm3, 0x10(%rdi)
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm2, (%rdi)
+ lea 0x80(%rdi), %rdi
+ jae L(shl_1)
+ movdqu %xmm0, (%r8)
+ add $0x80, %rdx
+ add %rdx, %rdi
+ add %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+ ALIGN (4)
+L(shl_1_bwd):
+ movaps -0x01(%rsi), %xmm1
+
+ movaps -0x11(%rsi), %xmm2
+ palignr $1, %xmm2, %xmm1
+ movaps %xmm1, -0x10(%rdi)
+
+ movaps -0x21(%rsi), %xmm3
+ palignr $1, %xmm3, %xmm2
+ movaps %xmm2, -0x20(%rdi)
+
+ movaps -0x31(%rsi), %xmm4
+ palignr $1, %xmm4, %xmm3
+ movaps %xmm3, -0x30(%rdi)
+
+ movaps -0x41(%rsi), %xmm5
+ palignr $1, %xmm5, %xmm4
+ movaps %xmm4, -0x40(%rdi)
+
+ movaps -0x51(%rsi), %xmm6
+ palignr $1, %xmm6, %xmm5
+ movaps %xmm5, -0x50(%rdi)
+
+ movaps -0x61(%rsi), %xmm7
+ palignr $1, %xmm7, %xmm6
+ movaps %xmm6, -0x60(%rdi)
+
+ movaps -0x71(%rsi), %xmm8
+ palignr $1, %xmm8, %xmm7
+ movaps %xmm7, -0x70(%rdi)
+
+ movaps -0x81(%rsi), %xmm9
+ palignr $1, %xmm9, %xmm8
+ movaps %xmm8, -0x80(%rdi)
+
+ sub $0x80, %rdx
+ lea -0x80(%rdi), %rdi
+ lea -0x80(%rsi), %rsi
+ jae L(shl_1_bwd)
+ movdqu %xmm0, (%r8)
+ add $0x80, %rdx
+ sub %rdx, %rdi
+ sub %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+ ALIGN (4)
+L(shl_2):
+ sub $0x80, %rdx
+ movaps -0x02(%rsi), %xmm1
+ movaps 0x0e(%rsi), %xmm2
+ movaps 0x1e(%rsi), %xmm3
+ movaps 0x2e(%rsi), %xmm4
+ movaps 0x3e(%rsi), %xmm5
+ movaps 0x4e(%rsi), %xmm6
+ movaps 0x5e(%rsi), %xmm7
+ movaps 0x6e(%rsi), %xmm8
+ movaps 0x7e(%rsi), %xmm9
+ lea 0x80(%rsi), %rsi
+ palignr $2, %xmm8, %xmm9
+ movaps %xmm9, 0x70(%rdi)
+ palignr $2, %xmm7, %xmm8
+ movaps %xmm8, 0x60(%rdi)
+ palignr $2, %xmm6, %xmm7
+ movaps %xmm7, 0x50(%rdi)
+ palignr $2, %xmm5, %xmm6
+ movaps %xmm6, 0x40(%rdi)
+ palignr $2, %xmm4, %xmm5
+ movaps %xmm5, 0x30(%rdi)
+ palignr $2, %xmm3, %xmm4
+ movaps %xmm4, 0x20(%rdi)
+ palignr $2, %xmm2, %xmm3
+ movaps %xmm3, 0x10(%rdi)
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm2, (%rdi)
+ lea 0x80(%rdi), %rdi
+ jae L(shl_2)
+ movdqu %xmm0, (%r8)
+ add $0x80, %rdx
+ add %rdx, %rdi
+ add %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
+
+ ALIGN (4)
+L(shl_2_bwd):
+ movaps -0x02(%rsi), %xmm1
+
+ movaps -0x12(%rsi), %xmm2
+ palignr $2, %xmm2, %xmm1
+ movaps %xmm1, -0x10(%rdi)
+
+ movaps -0x22(%rsi), %xmm3
+ palignr $2, %xmm3, %xmm2
+ movaps %xmm2, -0x20(%rdi)
+
+ movaps -0x32(%rsi), %xmm4
+ palignr $2, %xmm4, %xmm3
+ movaps %xmm3, -0x30(%rdi)
+
+ movaps -0x42(%rsi), %xmm5
+ palignr $2, %xmm5, %xmm4
+ movaps %xmm4, -0x40(%rdi)
+
+ movaps -0x52(%rsi), %xmm6
+ palignr $2, %xmm6, %xmm5
+ movaps %xmm5, -0x50(%rdi)
+
+ movaps -0x62(%rsi), %xmm7
+ palignr $2, %xmm7, %xmm6
+ movaps %xmm6, -0x60(%rdi)
+
+ movaps -0x72(%rsi), %xmm8
+ palignr $2, %xmm8, %xmm7
+ movaps %xmm7, -0x70(%rdi)
+
+ movaps -0x82(%rsi), %xmm9
+ palignr $2, %xmm9, %xmm8
+ movaps %xmm8, -0x80(%rdi)
+
+ sub $0x80, %rdx
+ lea -0x80(%rdi), %rdi
+ lea -0x80(%rsi), %rsi
+ jae L(shl_2_bwd)
+ movdqu %xmm0, (%r8)
+ add $0x80, %rdx
+ sub %rdx, %rdi
+ sub %rdx, %rsi
+ BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
+
+ ALIGN (4)
+L(shl_3):
+ sub $0x80, %rdx
+ movaps -0x03(%rsi), %xmm1
+ movaps 0x0d(%rsi), %xmm2
+ movaps 0x1d(%rsi), %xmm3
+ movaps 0x2d(%rsi), %xmm4
+ movaps 0x3d(%rsi), %xmm5
+ movaps 0x4d(%rsi), %xmm6
+ movaps 0x5d(%rsi), %xmm7
+ movaps 0x6d(%rsi), %xmm8
+ movaps 0x7d(%rsi), %xmm9
+ lea 0x80(%rsi), %rsi
+ palignr $3, %xmm8, %xmm9
+ movaps %xmm9, 0x70(%rdi)
+ palignr $3, %xmm7, %xmm8
+ movaps %xmm8, 0x60(%rdi)
+ palignr $3, %xmm6, %xmm7
+ movaps %xmm7, 0x50(%rdi)
+ palignr $3, %xmm5, %xmm6
+ movaps %xmm6, 0x40(%rdi)
+ palignr $3, %xmm4, %xmm5
+ movaps %xmm5, 0x30(%rdi)
+ palignr $3, %xmm3, %xmm4
+ movaps %xmm4, 0x20(%rdi)
+ palignr $3, %xmm2, %xmm3
+ movaps %xmm3, 0x10(%rdi)
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm2, (%rdi)
+ lea 0x80(%rdi), %rdi
+ jae L(shl_3)