optimize the following memcpy: sysdeps/i386/i686/multiarch/memcpy-ssse3.S

I've improved the following implementation of memcpy: "sysdeps/i386/i686/multiarch/memcpy-ssse3.S". The patch includes some minor style fixes, but the important part is the use of prefetch loops for the case where DATA_CACHE_SIZE_HALF <= len < SHARED_CACHE_SIZE_HALF and the src and dst pointers have unequal 16-byte alignments. This gives a performance boost of 6% to 50% on the Atom machine, about 24.73% in geometric mean.
author: Liubov Dmitrieva <liubov.dmitrieva@gmail.com> 2012-03-30 16:45:27 -0400
committer: Ulrich Drepper <drepper@gmail.com> 2012-03-30 16:45:27 -0400
commit: 4b43400f6a710fa3d931a57eaae4cb332fb60edc (patch)
tree: b6c7b892ce5c42a2ba042c8a3369476bac077260
parent: 48c41d04ee06efc6ec97325ed6697c121b40865f (diff)
download: glibc-4b43400f6a710fa3d931a57eaae4cb332fb60edc.tar.xz
glibc-4b43400f6a710fa3d931a57eaae4cb332fb60edc.zip
2 files changed, 1456 insertions, 536 deletions
diff --git a/ChangeLog b/ChangeLog
index 2e16f982fc..61ec1e16d0 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2012-03-22  Liubov Dmitrieva  <liubov.dmitrieva@gmail.com>
+
+	* sysdeps/i386/i686/multiarch/memcpy-ssse3.S: Update.
+	Optimize memcpy with prefetch if
+	DATA_CACHE_SIZE_HALF <= len <  SHARED_CACHE_SIZE_HALF and
+	src, dst pointers have unequal 16 byte alignments.
+
 2012-03-30  Siddhesh Poyarekar  <siddhesh@redhat.com>
 
 	[BZ #13928]
diff --git a/sysdeps/i386/i686/multiarch/memcpy-ssse3.S b/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
index 3a3ab792a3..30bdad6e88 100644
--- a/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
+++ b/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
@@ -17,109 +17,100 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
-
 #if !defined NOT_IN_libc \
     && (defined SHARED \
 	|| defined USE_AS_MEMMOVE \
 	|| !defined USE_MULTIARCH)
 
-#include "asm-syntax.h"
+# include <sysdep.h>
+# include "asm-syntax.h"
 
-#ifndef MEMCPY
-# define MEMCPY		__memcpy_ssse3
-# define MEMCPY_CHK	__memcpy_chk_ssse3
-#endif
+# ifndef MEMCPY
+#  define MEMCPY		__memcpy_ssse3
+#  define MEMCPY_CHK	__memcpy_chk_ssse3
+# endif
 
-#ifdef USE_AS_BCOPY
-# define SRC		PARMS
-# define DEST		SRC+4
-# define LEN		DEST+4
-#else
-# define DEST		PARMS
-# define SRC		DEST+4
-# define LEN		SRC+4
-#endif
+# ifdef USE_AS_BCOPY
+#  define SRC		PARMS
+#  define DEST		SRC+4
+#  define LEN		DEST+4
+# else
+#  define DEST		PARMS
+#  define SRC		DEST+4
+#  define LEN		SRC+4
+# endif
 
-#define CFI_PUSH(REG)						\
-  cfi_adjust_cfa_offset (4);					\
+# define CFI_PUSH(REG)		\
+  cfi_adjust_cfa_offset (4);		\
   cfi_rel_offset (REG, 0)
 
-#define CFI_POP(REG)						\
-  cfi_adjust_cfa_offset (-4);					\
+# define CFI_POP(REG)		\
+  cfi_adjust_cfa_offset (-4);		\
   cfi_restore (REG)
 
-#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
-#define POP(REG)	popl REG; CFI_POP (REG)
+# define PUSH(REG)	pushl REG; CFI_PUSH (REG)
+# define POP(REG)	popl REG; CFI_POP (REG)
 
-#ifdef SHARED
-# define PARMS		8		/* Preserve EBX.  */
-# define ENTRANCE	PUSH (%ebx);
-# define RETURN_END	POP (%ebx); ret
-# define RETURN		RETURN_END; CFI_PUSH (%ebx)
-# define JMPTBL(I, B)	I - B
+# ifdef SHARED
+#  define PARMS		8		/* Preserve EBX.  */
+#  define ENTRANCE	PUSH (%ebx);
+#  define RETURN_END	POP (%ebx); ret
+#  define RETURN		RETURN_END; CFI_PUSH (%ebx)
+#  define JMPTBL(I, B)	I - B
 
 /* Load an entry in a jump table into EBX and branch to it.  TABLE is a
-   jump table with relative offsets.  INDEX is a register contains the
-   index into the jump table.   SCALE is the scale of INDEX. */
-# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
-    /* We first load PC into EBX.  */				\
-    SETUP_PIC_REG(bx);						\
-    /* Get the address of the jump table.  */			\
-    addl	$(TABLE - .), %ebx;				\
-    /* Get the entry and convert the relative offset to the	\
-       absolute address.  */					\
-    addl	(%ebx,INDEX,SCALE), %ebx;			\
-    /* We loaded the jump table.  Go.  */			\
-    jmp		*%ebx
-
-# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)			\
-    addl	$(TABLE - .), %ebx
-
-# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE)	\
-    addl	(%ebx,INDEX,SCALE), %ebx;			\
-    /* We loaded the jump table.  Go.  */			\
-    jmp		*%ebx
-#else
-# define PARMS		4
-# define ENTRANCE
-# define RETURN_END	ret
-# define RETURN		RETURN_END
-# define JMPTBL(I, B)	I
+	jump table with relative offsets.  INDEX is a register contains the
+	index into the jump table.   SCALE is the scale of INDEX. */
+
+#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+    /* We first load PC into EBX.  */		\
+	SETUP_PIC_REG(bx);		\
+    /* Get the address of the jump table.  */		\
+	addl	$(TABLE - .), %ebx;		\
+    /* Get the entry and convert the relative offset to the		\
+	absolute	address.  */		\
+	addl	(%ebx, INDEX, SCALE), %ebx;		\
+    /* We loaded the jump table.  Go.  */		\
+	jmp	*%ebx
+# else
 
-/* Branch to an entry in a jump table.  TABLE is a jump table with
-   absolute offsets.  INDEX is a register contains the index into the
-   jump table.  SCALE is the scale of INDEX. */
-# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
-    jmp		*TABLE(,INDEX,SCALE)
+#  define PARMS		4
+#  define ENTRANCE
+#  define RETURN_END	ret
+#  define RETURN		RETURN_END
+#  define JMPTBL(I, B)	I
 
-# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)
+/* Branch to an entry in a jump table.  TABLE is a jump table with
+	absolute offsets.  INDEX is a register contains the index into the
+	jump table.  SCALE is the scale of INDEX. */
 
-# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE)		\
-    jmp		*TABLE(,INDEX,SCALE)
-#endif
+#  define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
+	jmp	*TABLE(, INDEX, SCALE)
+# endif
 
 	.section .text.ssse3,"ax",@progbits
-#if !defined USE_AS_BCOPY
+# if !defined USE_AS_BCOPY
 ENTRY (MEMCPY_CHK)
 	movl	12(%esp), %eax
 	cmpl	%eax, 16(%esp)
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
 END (MEMCPY_CHK)
-#endif
+# endif
 ENTRY (MEMCPY)
 	ENTRANCE
 	movl	LEN(%esp), %ecx
 	movl	SRC(%esp), %eax
 	movl	DEST(%esp), %edx
 
-#ifdef USE_AS_MEMMOVE
+# ifdef USE_AS_MEMMOVE
 	cmp	%eax, %edx
 	jb	L(copy_forward)
 	je	L(fwd_write_0bytes)
 	cmp	$32, %ecx
 	jae	L(memmove_bwd)
 	jmp	L(bk_write_less32bytes_2)
+
+	.p2align 4
 L(memmove_bwd):
 	add	%ecx, %eax
 	cmp	%eax, %edx
@@ -127,67 +118,72 @@ L(memmove_bwd):
 	jb	L(copy_backward)
 
 L(copy_forward):
-#endif
+# endif
 	cmp	$48, %ecx
 	jae	L(48bytesormore)
 
 L(fwd_write_less32bytes):
-#ifndef USE_AS_MEMMOVE
+# ifndef USE_AS_MEMMOVE
 	cmp	%dl, %al
 	jb	L(bk_write)
-#endif
+# endif
 	add	%ecx, %edx
 	add	%ecx, %eax
 	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
-#ifndef USE_AS_MEMMOVE
+# ifndef USE_AS_MEMMOVE
+	.p2align 4
 L(bk_write):
 	BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
-#endif
+# endif
 
-	ALIGN (4)
-/* ECX > 32 and EDX is 4 byte aligned.  */
+	.p2align 4
 L(48bytesormore):
+# ifndef USE_AS_MEMMOVE
+	movlpd	(%eax), %xmm0
+	movlpd	8(%eax), %xmm1
+	movlpd	%xmm0, (%edx)
+	movlpd	%xmm1, 8(%edx)
+# else
 	movdqu	(%eax), %xmm0
+# endif
 	PUSH (%edi)
 	movl	%edx, %edi
 	and	$-16, %edx
-	PUSH (%esi)
-	cfi_remember_state
 	add	$16, %edx
-	movl	%edi, %esi
 	sub	%edx, %edi
 	add	%edi, %ecx
 	sub	%edi, %eax
 
-#ifdef SHARED_CACHE_SIZE_HALF
+# ifdef SHARED_CACHE_SIZE_HALF
 	cmp	$SHARED_CACHE_SIZE_HALF, %ecx
-#else
-# ifdef SHARED
+# else
+#  ifdef SHARED
 	SETUP_PIC_REG(bx)
 	add	$_GLOBAL_OFFSET_TABLE_, %ebx
 	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
-# else
+#  else
 	cmp	__x86_shared_cache_size_half, %ecx
+#  endif
 # endif
-#endif
 
 	mov	%eax, %edi
 	jae	L(large_page)
 	and	$0xf, %edi
 	jz	L(shl_0)
-
 	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
 
-	cfi_restore_state
-	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shl_0):
-	movdqu	%xmm0, (%esi)
+# ifdef USE_AS_MEMMOVE
+	movl	DEST+4(%esp), %edi
+	movdqu	%xmm0, (%edi)
+# endif
 	xor	%edi, %edi
-	POP (%esi)
 	cmp	$127, %ecx
 	ja	L(shl_0_gobble)
 	lea	-32(%ecx), %ecx
+
+	.p2align 4
 L(shl_0_loop):
 	movdqa	(%eax, %edi), %xmm0
 	movdqa	16(%eax, %edi), %xmm1
@@ -219,6 +215,7 @@ L(shl_0_loop):
 	movdqa	%xmm0, (%edx, %edi)
 	movdqa	%xmm1, 16(%edx, %edi)
 	lea	32(%edi), %edi
+
 L(shl_0_end):
 	lea	32(%ecx), %ecx
 	add	%ecx, %edi
@@ -228,23 +225,25 @@ L(shl_0_end):
 	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
 
 	CFI_PUSH (%edi)
-L(shl_0_gobble):
 
-#ifdef DATA_CACHE_SIZE_HALF
+	.p2align 4
+L(shl_0_gobble):
+# ifdef DATA_CACHE_SIZE_HALF
 	cmp	$DATA_CACHE_SIZE_HALF, %ecx
-#else
-# ifdef SHARED
+# else
+#  ifdef SHARED
 	SETUP_PIC_REG(bx)
 	add	$_GLOBAL_OFFSET_TABLE_, %ebx
 	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
+#  else
 	cmp	__x86_data_cache_size_half, %ecx
+#  endif
 # endif
-#endif
-
-	POP (%edi)
+	POP	(%edi)
 	lea	-128(%ecx), %ecx
 	jae	L(shl_0_gobble_mem_loop)
+
+	.p2align 4
 L(shl_0_gobble_cache_loop):
 	movdqa	(%eax), %xmm0
 	movdqa	0x10(%eax), %xmm1
@@ -274,17 +273,15 @@ L(shl_0_gobble_cache_loop):
 	movdqa	(%eax), %xmm0
 	sub	$0x40, %ecx
 	movdqa	0x10(%eax), %xmm1
-
 	movdqa	%xmm0, (%edx)
 	movdqa	%xmm1, 0x10(%edx)
-
 	movdqa	0x20(%eax), %xmm0
 	movdqa	0x30(%eax), %xmm1
 	add	$0x40, %eax
-
 	movdqa	%xmm0, 0x20(%edx)
 	movdqa	%xmm1, 0x30(%edx)
 	add	$0x40, %edx
+
 L(shl_0_cache_less_64bytes):
 	cmp	$0x20, %ecx
 	jb	L(shl_0_cache_less_32bytes)
@@ -295,6 +292,7 @@ L(shl_0_cache_less_64bytes):
 	movdqa	%xmm0, (%edx)
 	movdqa	%xmm1, 0x10(%edx)
 	add	$0x20, %edx
+
 L(shl_0_cache_less_32bytes):
 	cmp	$0x10, %ecx
 	jb	L(shl_0_cache_less_16bytes)
@@ -303,13 +301,13 @@ L(shl_0_cache_less_32bytes):
 	add	$0x10, %eax
 	movdqa	%xmm0, (%edx)
 	add	$0x10, %edx
+
 L(shl_0_cache_less_16bytes):
 	add	%ecx, %edx
 	add	%ecx, %eax
 	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
 
-
-	ALIGN (4)
+	.p2align 4
 L(shl_0_gobble_mem_loop):
 	prefetcht0 0x1c0(%eax)
 	prefetcht0 0x280(%eax)
@@ -354,6 +352,7 @@ L(shl_0_gobble_mem_loop):
 	movdqa	%xmm0, 0x20(%edx)
 	movdqa	%xmm1, 0x30(%edx)
 	add	$0x40, %edx
+
 L(shl_0_mem_less_64bytes):
 	cmp	$0x20, %ecx
 	jb	L(shl_0_mem_less_32bytes)
@@ -364,6 +363,7 @@ L(shl_0_mem_less_64bytes):
 	movdqa	%xmm0, (%edx)
 	movdqa	%xmm1, 0x10(%edx)
 	add	$0x20, %edx
+
 L(shl_0_mem_less_32bytes):
 	cmp	$0x10, %ecx
 	jb	L(shl_0_mem_less_16bytes)
@@ -372,24 +372,84 @@ L(shl_0_mem_less_32bytes):
 	add	$0x10, %eax
 	movdqa	%xmm0, (%edx)
 	add	$0x10, %edx
+
 L(shl_0_mem_less_16bytes):
 	add	%ecx, %edx
 	add	%ecx, %eax
 	BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
 
-	cfi_restore_state
-	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shl_1):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+# ifndef USE_AS_MEMMOVE
+	movaps	-1(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-1(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_1_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl1LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	15(%eax), %xmm2
+	movaps	31(%eax), %xmm3
+	movaps	47(%eax), %xmm4
+	movaps	63(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$1, %xmm4, %xmm5
+	palignr	$1, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$1, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl1LoopStart)
+
+L(Shl1LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	15(%eax), %xmm2
+	movaps	31(%eax), %xmm3
+	palignr	$1, %xmm2, %xmm3
+	palignr	$1, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_1_no_prefetch):
+	lea	-32(%ecx), %ecx
 	lea	-1(%eax), %eax
-	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_1_loop):
 
+	.p2align 4
+L(sh_1_no_prefetch_loop):
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
 	movdqa	32(%eax, %edi), %xmm3
@@ -399,8 +459,7 @@ L(shl_1_loop):
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
-
-	jb	L(shl_1_end)
+	jb	L(sh_1_end_no_prefetch_loop)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -411,30 +470,90 @@ L(shl_1_loop):
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_1_no_prefetch_loop)
 
-	jae	L(shl_1_loop)
-
-L(shl_1_end):
+L(sh_1_end_no_prefetch_loop):
 	lea	32(%ecx), %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	1(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
 
-	cfi_restore_state
-	cfi_remember_state
-	ALIGN (4)
+	CFI_PUSH (%edi)
+
+	.p2align 4
 L(shl_2):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+# ifndef USE_AS_MEMMOVE
+	movaps	-2(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-2(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_2_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl2LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	14(%eax), %xmm2
+	movaps	30(%eax), %xmm3
+	movaps	46(%eax), %xmm4
+	movaps	62(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$2, %xmm4, %xmm5
+	palignr	$2, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$2, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl2LoopStart)
+
+L(Shl2LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	14(%eax), %xmm2
+	movaps	30(%eax), %xmm3
+	palignr	$2, %xmm2, %xmm3
+	palignr	$2, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_2_no_prefetch):
+	lea	-32(%ecx), %ecx
 	lea	-2(%eax), %eax
-	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_2_loop):
 
+	.p2align 4
+L(sh_2_no_prefetch_loop):
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
 	movdqa	32(%eax, %edi), %xmm3
@@ -444,8 +563,7 @@ L(shl_2_loop):
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
-
-	jb	L(shl_2_end)
+	jb	L(sh_2_end_no_prefetch_loop)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -456,30 +574,90 @@ L(shl_2_loop):
 	lea	32(%edi), %edi
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
+	jae	L(sh_2_no_prefetch_loop)
 
-	jae	L(shl_2_loop)
-
-L(shl_2_end):
+L(sh_2_end_no_prefetch_loop):
 	lea	32(%ecx), %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	2(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
 
-	cfi_restore_state
-	cfi_remember_state
-	ALIGN (4)
+	CFI_PUSH (%edi)
+
+	.p2align 4
 L(shl_3):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+# ifndef USE_AS_MEMMOVE
+	movaps	-3(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-3(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_3_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl3LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	13(%eax), %xmm2
+	movaps	29(%eax), %xmm3
+	movaps	45(%eax), %xmm4
+	movaps	61(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$3, %xmm4, %xmm5
+	palignr	$3, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$3, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl3LoopStart)
+
+L(Shl3LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	13(%eax), %xmm2
+	movaps	29(%eax), %xmm3
+	palignr	$3, %xmm2, %xmm3
+	palignr	$3, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_3_no_prefetch):
+	lea	-32(%ecx), %ecx
 	lea	-3(%eax), %eax
-	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_3_loop):
 
+	.p2align 4
+L(sh_3_no_prefetch_loop):
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
 	movdqa	32(%eax, %edi), %xmm3
@@ -490,7 +668,7 @@ L(shl_3_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jb	L(shl_3_end)
+	jb	L(sh_3_end_no_prefetch_loop)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -502,29 +680,90 @@ L(shl_3_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jae	L(shl_3_loop)
+	jae	L(sh_3_no_prefetch_loop)
 
-L(shl_3_end):
+L(sh_3_end_no_prefetch_loop):
 	lea	32(%ecx), %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	3(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
 
-	cfi_restore_state
-	cfi_remember_state
-	ALIGN (4)
+	.p2align 4
 L(shl_4):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+# ifndef USE_AS_MEMMOVE
+	movaps	-4(%eax), %xmm1
+# else
+	movl	DEST+4(%esp), %edi
+	movaps	-4(%eax), %xmm1
+	movdqu	%xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+	cmp	$DATA_CACHE_SIZE_HALF, %ecx
+# else
+#  ifdef SHARED
+	SETUP_PIC_REG(bx)
+	add	$_GLOBAL_OFFSET_TABLE_, %ebx
+	cmp	__x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+#  else
+	cmp	__x86_data_cache_size_half, %ecx
+#  endif
+# endif
+	jb L(sh_4_no_prefetch)
+
+	lea	-64(%ecx), %ecx
+
+	.p2align 4
+L(Shl4LoopStart):
+	prefetcht0 0x1c0(%eax)
+	prefetcht0 0x1c0(%edx)
+	movaps	12(%eax), %xmm2
+	movaps	28(%eax), %xmm3
+	movaps	44(%eax), %xmm4
+	movaps	60(%eax), %xmm5
+	movaps	%xmm5, %xmm7
+	palignr	$4, %xmm4, %xmm5
+	palignr	$4, %xmm3, %xmm4
+	movaps	%xmm5, 48(%edx)
+	palignr	$4, %xmm2, %xmm3
+	lea	64(%eax), %eax
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm4, 32(%edx)
+	movaps	%xmm3, 16(%edx)
+	movaps	%xmm7, %xmm1
+	movaps	%xmm2, (%edx)
+	lea	64(%edx), %edx
+	sub	$64, %ecx
+	ja	L(Shl4LoopStart)
+
+L(Shl4LoopLeave):
+	add	$32, %ecx
+	jle	L(shl_end_0)
+
+	movaps	12(%eax), %xmm2
+	movaps	28(%eax), %xmm3
+	palignr	$4, %xmm2, %xmm3
+	palignr	$4, %xmm1, %xmm2
+	movaps	%xmm2, (%edx)
+	movaps	%xmm3, 16(%edx)
+	lea	32(%edx, %ecx), %edx
+	lea	32(%eax, %ecx), %eax
+	POP (%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+	CFI_PUSH (%edi)
+
+	.p2align 4
+L(sh_4_no_prefetch):
+	lea	-32(%ecx), %ecx
 	lea	-4(%eax), %eax
-	movaps	(%eax), %xmm1
 	xor	%edi, %edi
-	lea	-32(%ecx), %ecx
-	movdqu	%xmm0, (%esi)
-	POP (%esi)
-L(shl_4_loop):
 
+	.p2align 4
+L(sh_4_no_prefetch_loop):
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
 	movdqa	32(%eax, %edi), %xmm3
@@ -535,7 +774,7 @@ L(shl_4_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jb	L(shl_4_end)
+	jb	L(sh_4_end_no_prefetch_loop)
 
 	movdqa	16(%eax, %edi), %xmm2
 	sub	$32, %ecx
@@ -547,29 +786,90 @@ L(shl_4_loop):
 	movdqa	%xmm2, -32(%edx, %edi)
 	movdqa	%xmm3, -16(%edx, %edi)
 
-	jae	L(shl_4_loop)
+	jae	L(sh_4_no_prefetch_loop)
 
-L(shl_4_end):
+L(sh_4_end_no_prefetch_loop):
 	lea	32(%ecx), %ecx
 	add	%ecx, %edi
 	add	%edi, %edx
 	lea	4(%edi, %eax), %eax
-	POP (%edi)
-	BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+	POP	(%edi)
+	BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
 
-	cfi_restore_state
-	cfi_remember_state
-	ALIGN (4)
+	CFI_PUSH (%edi)
+
+	.p2align 4
 L(shl_5):
-	BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+# ifndef USE_AS_MEMMOVE
+	movaps	-5(%eax), %xmm1
+# else
+
author	Liubov Dmitrieva <liubov.dmitrieva@gmail.com>	2012-03-30 16:45:27 -0400
committer	Ulrich Drepper <drepper@gmail.com>	2012-03-30 16:45:27 -0400
commit	4b43400f6a710fa3d931a57eaae4cb332fb60edc (patch)
tree	b6c7b892ce5c42a2ba042c8a3369476bac077260
parent	48c41d04ee06efc6ec97325ed6697c121b40865f (diff)
download	glibc-4b43400f6a710fa3d931a57eaae4cb332fb60edc.tar.xz glibc-4b43400f6a710fa3d931a57eaae4cb332fb60edc.zip