aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLiubov Dmitrieva <liubov.dmitrieva@gmail.com>2012-03-30 16:45:27 -0400
committerUlrich Drepper <drepper@gmail.com>2012-03-30 16:45:27 -0400
commit4b43400f6a710fa3d931a57eaae4cb332fb60edc (patch)
treeb6c7b892ce5c42a2ba042c8a3369476bac077260
parent48c41d04ee06efc6ec97325ed6697c121b40865f (diff)
downloadglibc-4b43400f6a710fa3d931a57eaae4cb332fb60edc.tar.xz
glibc-4b43400f6a710fa3d931a57eaae4cb332fb60edc.zip
optimize the following memcpy: sysdeps/i386/i686/multiarch/memcpy-ssse3.S
I've improved the following implementation of memcpy: "sysdeps/i386/i686/multiarch/memcpy-ssse3.S". The patch includes some minor style fixes, but the important part is just using prefetch loops for the case: DATA_CACHE_SIZE_HALF <= len < SHARED_CACHE_SIZE_HALF and src and dst pointers have unequal 16 byte alignments. This gives from 6% - 50% performance boost on the atom machine, about 24,73% in geometric mean.
-rw-r--r--ChangeLog7
-rw-r--r--sysdeps/i386/i686/multiarch/memcpy-ssse3.S1985
2 files changed, 1456 insertions, 536 deletions
diff --git a/ChangeLog b/ChangeLog
index 2e16f982fc..61ec1e16d0 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,10 @@
+2012-03-22 Liubov Dmitrieva <liubov.dmitrieva@gmail.com>
+
+ * sysdeps/i386/i686/multiarch/memcpy-ssse3.S: Update.
+ Optimize memcpy with prefetch if
+ DATA_CACHE_SIZE_HALF <= len < SHARED_CACHE_SIZE_HALF and
+ src, dst pointers have unequal 16 byte alignments.
+
2012-03-30 Siddhesh Poyarekar <siddhesh@redhat.com>
[BZ #13928]
diff --git a/sysdeps/i386/i686/multiarch/memcpy-ssse3.S b/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
index 3a3ab792a3..30bdad6e88 100644
--- a/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
+++ b/sysdeps/i386/i686/multiarch/memcpy-ssse3.S
@@ -17,109 +17,100 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#include <sysdep.h>
-
#if !defined NOT_IN_libc \
&& (defined SHARED \
|| defined USE_AS_MEMMOVE \
|| !defined USE_MULTIARCH)
-#include "asm-syntax.h"
+# include <sysdep.h>
+# include "asm-syntax.h"
-#ifndef MEMCPY
-# define MEMCPY __memcpy_ssse3
-# define MEMCPY_CHK __memcpy_chk_ssse3
-#endif
+# ifndef MEMCPY
+# define MEMCPY __memcpy_ssse3
+# define MEMCPY_CHK __memcpy_chk_ssse3
+# endif
-#ifdef USE_AS_BCOPY
-# define SRC PARMS
-# define DEST SRC+4
-# define LEN DEST+4
-#else
-# define DEST PARMS
-# define SRC DEST+4
-# define LEN SRC+4
-#endif
+# ifdef USE_AS_BCOPY
+# define SRC PARMS
+# define DEST SRC+4
+# define LEN DEST+4
+# else
+# define DEST PARMS
+# define SRC DEST+4
+# define LEN SRC+4
+# endif
-#define CFI_PUSH(REG) \
- cfi_adjust_cfa_offset (4); \
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
cfi_rel_offset (REG, 0)
-#define CFI_POP(REG) \
- cfi_adjust_cfa_offset (-4); \
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
cfi_restore (REG)
-#define PUSH(REG) pushl REG; CFI_PUSH (REG)
-#define POP(REG) popl REG; CFI_POP (REG)
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
-#ifdef SHARED
-# define PARMS 8 /* Preserve EBX. */
-# define ENTRANCE PUSH (%ebx);
-# define RETURN_END POP (%ebx); ret
-# define RETURN RETURN_END; CFI_PUSH (%ebx)
-# define JMPTBL(I, B) I - B
+# ifdef SHARED
+# define PARMS 8 /* Preserve EBX. */
+# define ENTRANCE PUSH (%ebx);
+# define RETURN_END POP (%ebx); ret
+# define RETURN RETURN_END; CFI_PUSH (%ebx)
+# define JMPTBL(I, B) I - B
/* Load an entry in a jump table into EBX and branch to it. TABLE is a
- jump table with relative offsets. INDEX is a register contains the
- index into the jump table. SCALE is the scale of INDEX. */
-# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
- /* We first load PC into EBX. */ \
- SETUP_PIC_REG(bx); \
- /* Get the address of the jump table. */ \
- addl $(TABLE - .), %ebx; \
- /* Get the entry and convert the relative offset to the \
- absolute address. */ \
- addl (%ebx,INDEX,SCALE), %ebx; \
- /* We loaded the jump table. Go. */ \
- jmp *%ebx
-
-# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE) \
- addl $(TABLE - .), %ebx
-
-# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \
- addl (%ebx,INDEX,SCALE), %ebx; \
- /* We loaded the jump table. Go. */ \
- jmp *%ebx
-#else
-# define PARMS 4
-# define ENTRANCE
-# define RETURN_END ret
-# define RETURN RETURN_END
-# define JMPTBL(I, B) I
+ jump table with relative offsets. INDEX is a register contains the
+ index into the jump table. SCALE is the scale of INDEX. */
+
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ /* We first load PC into EBX. */ \
+ SETUP_PIC_REG(bx); \
+ /* Get the address of the jump table. */ \
+ addl $(TABLE - .), %ebx; \
+ /* Get the entry and convert the relative offset to the \
+ absolute address. */ \
+ addl (%ebx, INDEX, SCALE), %ebx; \
+ /* We loaded the jump table. Go. */ \
+ jmp *%ebx
+# else
-/* Branch to an entry in a jump table. TABLE is a jump table with
- absolute offsets. INDEX is a register contains the index into the
- jump table. SCALE is the scale of INDEX. */
-# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
- jmp *TABLE(,INDEX,SCALE)
+# define PARMS 4
+# define ENTRANCE
+# define RETURN_END ret
+# define RETURN RETURN_END
+# define JMPTBL(I, B) I
-# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE)
+/* Branch to an entry in a jump table. TABLE is a jump table with
+ absolute offsets. INDEX is a register contains the index into the
+ jump table. SCALE is the scale of INDEX. */
-# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \
- jmp *TABLE(,INDEX,SCALE)
-#endif
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ jmp *TABLE(, INDEX, SCALE)
+# endif
.section .text.ssse3,"ax",@progbits
-#if !defined USE_AS_BCOPY
+# if !defined USE_AS_BCOPY
ENTRY (MEMCPY_CHK)
movl 12(%esp), %eax
cmpl %eax, 16(%esp)
jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMCPY_CHK)
-#endif
+# endif
ENTRY (MEMCPY)
ENTRANCE
movl LEN(%esp), %ecx
movl SRC(%esp), %eax
movl DEST(%esp), %edx
-#ifdef USE_AS_MEMMOVE
+# ifdef USE_AS_MEMMOVE
cmp %eax, %edx
jb L(copy_forward)
je L(fwd_write_0bytes)
cmp $32, %ecx
jae L(memmove_bwd)
jmp L(bk_write_less32bytes_2)
+
+ .p2align 4
L(memmove_bwd):
add %ecx, %eax
cmp %eax, %edx
@@ -127,67 +118,72 @@ L(memmove_bwd):
jb L(copy_backward)
L(copy_forward):
-#endif
+# endif
cmp $48, %ecx
jae L(48bytesormore)
L(fwd_write_less32bytes):
-#ifndef USE_AS_MEMMOVE
+# ifndef USE_AS_MEMMOVE
cmp %dl, %al
jb L(bk_write)
-#endif
+# endif
add %ecx, %edx
add %ecx, %eax
BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
-#ifndef USE_AS_MEMMOVE
+# ifndef USE_AS_MEMMOVE
+ .p2align 4
L(bk_write):
BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
-#endif
+# endif
- ALIGN (4)
-/* ECX > 32 and EDX is 4 byte aligned. */
+ .p2align 4
L(48bytesormore):
+# ifndef USE_AS_MEMMOVE
+ movlpd (%eax), %xmm0
+ movlpd 8(%eax), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 8(%edx)
+# else
movdqu (%eax), %xmm0
+# endif
PUSH (%edi)
movl %edx, %edi
and $-16, %edx
- PUSH (%esi)
- cfi_remember_state
add $16, %edx
- movl %edi, %esi
sub %edx, %edi
add %edi, %ecx
sub %edi, %eax
-#ifdef SHARED_CACHE_SIZE_HALF
+# ifdef SHARED_CACHE_SIZE_HALF
cmp $SHARED_CACHE_SIZE_HALF, %ecx
-#else
-# ifdef SHARED
+# else
+# ifdef SHARED
SETUP_PIC_REG(bx)
add $_GLOBAL_OFFSET_TABLE_, %ebx
cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
-# else
+# else
cmp __x86_shared_cache_size_half, %ecx
+# endif
# endif
-#endif
mov %eax, %edi
jae L(large_page)
and $0xf, %edi
jz L(shl_0)
-
BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
- cfi_restore_state
- cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shl_0):
- movdqu %xmm0, (%esi)
+# ifdef USE_AS_MEMMOVE
+ movl DEST+4(%esp), %edi
+ movdqu %xmm0, (%edi)
+# endif
xor %edi, %edi
- POP (%esi)
cmp $127, %ecx
ja L(shl_0_gobble)
lea -32(%ecx), %ecx
+
+ .p2align 4
L(shl_0_loop):
movdqa (%eax, %edi), %xmm0
movdqa 16(%eax, %edi), %xmm1
@@ -219,6 +215,7 @@ L(shl_0_loop):
movdqa %xmm0, (%edx, %edi)
movdqa %xmm1, 16(%edx, %edi)
lea 32(%edi), %edi
+
L(shl_0_end):
lea 32(%ecx), %ecx
add %ecx, %edi
@@ -228,23 +225,25 @@ L(shl_0_end):
BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
CFI_PUSH (%edi)
-L(shl_0_gobble):
-#ifdef DATA_CACHE_SIZE_HALF
+ .p2align 4
+L(shl_0_gobble):
+# ifdef DATA_CACHE_SIZE_HALF
cmp $DATA_CACHE_SIZE_HALF, %ecx
-#else
-# ifdef SHARED
+# else
+# ifdef SHARED
SETUP_PIC_REG(bx)
add $_GLOBAL_OFFSET_TABLE_, %ebx
cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
-# else
+# else
cmp __x86_data_cache_size_half, %ecx
+# endif
# endif
-#endif
-
- POP (%edi)
+ POP (%edi)
lea -128(%ecx), %ecx
jae L(shl_0_gobble_mem_loop)
+
+ .p2align 4
L(shl_0_gobble_cache_loop):
movdqa (%eax), %xmm0
movdqa 0x10(%eax), %xmm1
@@ -274,17 +273,15 @@ L(shl_0_gobble_cache_loop):
movdqa (%eax), %xmm0
sub $0x40, %ecx
movdqa 0x10(%eax), %xmm1
-
movdqa %xmm0, (%edx)
movdqa %xmm1, 0x10(%edx)
-
movdqa 0x20(%eax), %xmm0
movdqa 0x30(%eax), %xmm1
add $0x40, %eax
-
movdqa %xmm0, 0x20(%edx)
movdqa %xmm1, 0x30(%edx)
add $0x40, %edx
+
L(shl_0_cache_less_64bytes):
cmp $0x20, %ecx
jb L(shl_0_cache_less_32bytes)
@@ -295,6 +292,7 @@ L(shl_0_cache_less_64bytes):
movdqa %xmm0, (%edx)
movdqa %xmm1, 0x10(%edx)
add $0x20, %edx
+
L(shl_0_cache_less_32bytes):
cmp $0x10, %ecx
jb L(shl_0_cache_less_16bytes)
@@ -303,13 +301,13 @@ L(shl_0_cache_less_32bytes):
add $0x10, %eax
movdqa %xmm0, (%edx)
add $0x10, %edx
+
L(shl_0_cache_less_16bytes):
add %ecx, %edx
add %ecx, %eax
BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
-
- ALIGN (4)
+ .p2align 4
L(shl_0_gobble_mem_loop):
prefetcht0 0x1c0(%eax)
prefetcht0 0x280(%eax)
@@ -354,6 +352,7 @@ L(shl_0_gobble_mem_loop):
movdqa %xmm0, 0x20(%edx)
movdqa %xmm1, 0x30(%edx)
add $0x40, %edx
+
L(shl_0_mem_less_64bytes):
cmp $0x20, %ecx
jb L(shl_0_mem_less_32bytes)
@@ -364,6 +363,7 @@ L(shl_0_mem_less_64bytes):
movdqa %xmm0, (%edx)
movdqa %xmm1, 0x10(%edx)
add $0x20, %edx
+
L(shl_0_mem_less_32bytes):
cmp $0x10, %ecx
jb L(shl_0_mem_less_16bytes)
@@ -372,24 +372,84 @@ L(shl_0_mem_less_32bytes):
add $0x10, %eax
movdqa %xmm0, (%edx)
add $0x10, %edx
+
L(shl_0_mem_less_16bytes):
add %ecx, %edx
add %ecx, %eax
BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
- cfi_restore_state
- cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shl_1):
- BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+# ifndef USE_AS_MEMMOVE
+ movaps -1(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -1(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_1_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl1LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 15(%eax), %xmm2
+ movaps 31(%eax), %xmm3
+ movaps 47(%eax), %xmm4
+ movaps 63(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $1, %xmm4, %xmm5
+ palignr $1, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $1, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl1LoopStart)
+
+L(Shl1LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 15(%eax), %xmm2
+ movaps 31(%eax), %xmm3
+ palignr $1, %xmm2, %xmm3
+ palignr $1, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_1_no_prefetch):
+ lea -32(%ecx), %ecx
lea -1(%eax), %eax
- movaps (%eax), %xmm1
xor %edi, %edi
- lea -32(%ecx), %ecx
- movdqu %xmm0, (%esi)
- POP (%esi)
-L(shl_1_loop):
+ .p2align 4
+L(sh_1_no_prefetch_loop):
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa 32(%eax, %edi), %xmm3
@@ -399,8 +459,7 @@ L(shl_1_loop):
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
-
- jb L(shl_1_end)
+ jb L(sh_1_end_no_prefetch_loop)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
@@ -411,30 +470,90 @@ L(shl_1_loop):
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
+ jae L(sh_1_no_prefetch_loop)
- jae L(shl_1_loop)
-
-L(shl_1_end):
+L(sh_1_end_no_prefetch_loop):
lea 32(%ecx), %ecx
add %ecx, %edi
add %edi, %edx
lea 1(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
- cfi_restore_state
- cfi_remember_state
- ALIGN (4)
+ CFI_PUSH (%edi)
+
+ .p2align 4
L(shl_2):
- BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+# ifndef USE_AS_MEMMOVE
+ movaps -2(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -2(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_2_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl2LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 14(%eax), %xmm2
+ movaps 30(%eax), %xmm3
+ movaps 46(%eax), %xmm4
+ movaps 62(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $2, %xmm4, %xmm5
+ palignr $2, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $2, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl2LoopStart)
+
+L(Shl2LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 14(%eax), %xmm2
+ movaps 30(%eax), %xmm3
+ palignr $2, %xmm2, %xmm3
+ palignr $2, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_2_no_prefetch):
+ lea -32(%ecx), %ecx
lea -2(%eax), %eax
- movaps (%eax), %xmm1
xor %edi, %edi
- lea -32(%ecx), %ecx
- movdqu %xmm0, (%esi)
- POP (%esi)
-L(shl_2_loop):
+ .p2align 4
+L(sh_2_no_prefetch_loop):
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa 32(%eax, %edi), %xmm3
@@ -444,8 +563,7 @@ L(shl_2_loop):
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
-
- jb L(shl_2_end)
+ jb L(sh_2_end_no_prefetch_loop)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
@@ -456,30 +574,90 @@ L(shl_2_loop):
lea 32(%edi), %edi
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
+ jae L(sh_2_no_prefetch_loop)
- jae L(shl_2_loop)
-
-L(shl_2_end):
+L(sh_2_end_no_prefetch_loop):
lea 32(%ecx), %ecx
add %ecx, %edi
add %edi, %edx
lea 2(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
- cfi_restore_state
- cfi_remember_state
- ALIGN (4)
+ CFI_PUSH (%edi)
+
+ .p2align 4
L(shl_3):
- BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+# ifndef USE_AS_MEMMOVE
+ movaps -3(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -3(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_3_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl3LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 13(%eax), %xmm2
+ movaps 29(%eax), %xmm3
+ movaps 45(%eax), %xmm4
+ movaps 61(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $3, %xmm4, %xmm5
+ palignr $3, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $3, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl3LoopStart)
+
+L(Shl3LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 13(%eax), %xmm2
+ movaps 29(%eax), %xmm3
+ palignr $3, %xmm2, %xmm3
+ palignr $3, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_3_no_prefetch):
+ lea -32(%ecx), %ecx
lea -3(%eax), %eax
- movaps (%eax), %xmm1
xor %edi, %edi
- lea -32(%ecx), %ecx
- movdqu %xmm0, (%esi)
- POP (%esi)
-L(shl_3_loop):
+ .p2align 4
+L(sh_3_no_prefetch_loop):
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa 32(%eax, %edi), %xmm3
@@ -490,7 +668,7 @@ L(shl_3_loop):
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jb L(shl_3_end)
+ jb L(sh_3_end_no_prefetch_loop)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
@@ -502,29 +680,90 @@ L(shl_3_loop):
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jae L(shl_3_loop)
+ jae L(sh_3_no_prefetch_loop)
-L(shl_3_end):
+L(sh_3_end_no_prefetch_loop):
lea 32(%ecx), %ecx
add %ecx, %edi
add %edi, %edx
lea 3(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
- cfi_restore_state
- cfi_remember_state
- ALIGN (4)
+ .p2align 4
L(shl_4):
- BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+# ifndef USE_AS_MEMMOVE
+ movaps -4(%eax), %xmm1
+# else
+ movl DEST+4(%esp), %edi
+ movaps -4(%eax), %xmm1
+ movdqu %xmm0, (%edi)
+# endif
+# ifdef DATA_CACHE_SIZE_HALF
+ cmp $DATA_CACHE_SIZE_HALF, %ecx
+# else
+# ifdef SHARED
+ SETUP_PIC_REG(bx)
+ add $_GLOBAL_OFFSET_TABLE_, %ebx
+ cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
+# else
+ cmp __x86_data_cache_size_half, %ecx
+# endif
+# endif
+ jb L(sh_4_no_prefetch)
+
+ lea -64(%ecx), %ecx
+
+ .p2align 4
+L(Shl4LoopStart):
+ prefetcht0 0x1c0(%eax)
+ prefetcht0 0x1c0(%edx)
+ movaps 12(%eax), %xmm2
+ movaps 28(%eax), %xmm3
+ movaps 44(%eax), %xmm4
+ movaps 60(%eax), %xmm5
+ movaps %xmm5, %xmm7
+ palignr $4, %xmm4, %xmm5
+ palignr $4, %xmm3, %xmm4
+ movaps %xmm5, 48(%edx)
+ palignr $4, %xmm2, %xmm3
+ lea 64(%eax), %eax
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm4, 32(%edx)
+ movaps %xmm3, 16(%edx)
+ movaps %xmm7, %xmm1
+ movaps %xmm2, (%edx)
+ lea 64(%edx), %edx
+ sub $64, %ecx
+ ja L(Shl4LoopStart)
+
+L(Shl4LoopLeave):
+ add $32, %ecx
+ jle L(shl_end_0)
+
+ movaps 12(%eax), %xmm2
+ movaps 28(%eax), %xmm3
+ palignr $4, %xmm2, %xmm3
+ palignr $4, %xmm1, %xmm2
+ movaps %xmm2, (%edx)
+ movaps %xmm3, 16(%edx)
+ lea 32(%edx, %ecx), %edx
+ lea 32(%eax, %ecx), %eax
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
+
+ CFI_PUSH (%edi)
+
+ .p2align 4
+L(sh_4_no_prefetch):
+ lea -32(%ecx), %ecx
lea -4(%eax), %eax
- movaps (%eax), %xmm1
xor %edi, %edi
- lea -32(%ecx), %ecx
- movdqu %xmm0, (%esi)
- POP (%esi)
-L(shl_4_loop):
+ .p2align 4
+L(sh_4_no_prefetch_loop):
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
movdqa 32(%eax, %edi), %xmm3
@@ -535,7 +774,7 @@ L(shl_4_loop):
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jb L(shl_4_end)
+ jb L(sh_4_end_no_prefetch_loop)
movdqa 16(%eax, %edi), %xmm2
sub $32, %ecx
@@ -547,29 +786,90 @@ L(shl_4_loop):
movdqa %xmm2, -32(%edx, %edi)
movdqa %xmm3, -16(%edx, %edi)
- jae L(shl_4_loop)
+ jae L(sh_4_no_prefetch_loop)
-L(shl_4_end):
+L(sh_4_end_no_prefetch_loop):
lea 32(%ecx), %ecx
add %ecx, %edi
add %edi, %edx
lea 4(%edi, %eax), %eax
- POP (%edi)
- BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4)
+ POP (%edi)
+ BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
- cfi_restore_state
- cfi_remember_state
- ALIGN (4)
+ CFI_PUSH (%edi)
+
+ .p2align 4
L(shl_5):
- BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd))
+# ifndef USE_AS_MEMMOVE
+ movaps -5(%eax), %xmm1
+# else
+