diff options
| author | Liubov Dmitrieva <liubov.dmitrieva@gmail.com> | 2012-03-30 16:45:27 -0400 |
|---|---|---|
| committer | Ulrich Drepper <drepper@gmail.com> | 2012-03-30 16:45:27 -0400 |
| commit | 4b43400f6a710fa3d931a57eaae4cb332fb60edc (patch) | |
| tree | b6c7b892ce5c42a2ba042c8a3369476bac077260 | |
| parent | 48c41d04ee06efc6ec97325ed6697c121b40865f (diff) | |
| download | glibc-4b43400f6a710fa3d931a57eaae4cb332fb60edc.tar.xz glibc-4b43400f6a710fa3d931a57eaae4cb332fb60edc.zip | |
optimize the following memcpy: sysdeps/i386/i686/multiarch/memcpy-ssse3.S
I've improved the following implementation of memcpy:
"sysdeps/i386/i686/multiarch/memcpy-ssse3.S".
The patch includes some minor style fixes, but the important part is
just using prefetch loops for the case:
DATA_CACHE_SIZE_HALF <= len < SHARED_CACHE_SIZE_HALF and
src and dst pointers have unequal 16 byte alignments.
This gives from 6% - 50% performance boost on the atom machine, about
24,73% in geometric mean.
| -rw-r--r-- | ChangeLog | 7 | ||||
| -rw-r--r-- | sysdeps/i386/i686/multiarch/memcpy-ssse3.S | 1985 |
2 files changed, 1456 insertions, 536 deletions
@@ -1,3 +1,10 @@ +2012-03-22 Liubov Dmitrieva <liubov.dmitrieva@gmail.com> + + * sysdeps/i386/i686/multiarch/memcpy-ssse3.S: Update. + Optimize memcpy with prefetch if + DATA_CACHE_SIZE_HALF <= len < SHARED_CACHE_SIZE_HALF and + src, dst pointers have unequal 16 byte alignments. + 2012-03-30 Siddhesh Poyarekar <siddhesh@redhat.com> [BZ #13928] diff --git a/sysdeps/i386/i686/multiarch/memcpy-ssse3.S b/sysdeps/i386/i686/multiarch/memcpy-ssse3.S index 3a3ab792a3..30bdad6e88 100644 --- a/sysdeps/i386/i686/multiarch/memcpy-ssse3.S +++ b/sysdeps/i386/i686/multiarch/memcpy-ssse3.S @@ -17,109 +17,100 @@ License along with the GNU C Library; if not, see <http://www.gnu.org/licenses/>. */ -#include <sysdep.h> - #if !defined NOT_IN_libc \ && (defined SHARED \ || defined USE_AS_MEMMOVE \ || !defined USE_MULTIARCH) -#include "asm-syntax.h" +# include <sysdep.h> +# include "asm-syntax.h" -#ifndef MEMCPY -# define MEMCPY __memcpy_ssse3 -# define MEMCPY_CHK __memcpy_chk_ssse3 -#endif +# ifndef MEMCPY +# define MEMCPY __memcpy_ssse3 +# define MEMCPY_CHK __memcpy_chk_ssse3 +# endif -#ifdef USE_AS_BCOPY -# define SRC PARMS -# define DEST SRC+4 -# define LEN DEST+4 -#else -# define DEST PARMS -# define SRC DEST+4 -# define LEN SRC+4 -#endif +# ifdef USE_AS_BCOPY +# define SRC PARMS +# define DEST SRC+4 +# define LEN DEST+4 +# else +# define DEST PARMS +# define SRC DEST+4 +# define LEN SRC+4 +# endif -#define CFI_PUSH(REG) \ - cfi_adjust_cfa_offset (4); \ +# define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ cfi_rel_offset (REG, 0) -#define CFI_POP(REG) \ - cfi_adjust_cfa_offset (-4); \ +# define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ cfi_restore (REG) -#define PUSH(REG) pushl REG; CFI_PUSH (REG) -#define POP(REG) popl REG; CFI_POP (REG) +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) -#ifdef SHARED -# define PARMS 8 /* Preserve EBX. */ -# define ENTRANCE PUSH (%ebx); -# define RETURN_END POP (%ebx); ret -# define RETURN RETURN_END; CFI_PUSH (%ebx) -# define JMPTBL(I, B) I - B +# ifdef SHARED +# define PARMS 8 /* Preserve EBX. */ +# define ENTRANCE PUSH (%ebx); +# define RETURN_END POP (%ebx); ret +# define RETURN RETURN_END; CFI_PUSH (%ebx) +# define JMPTBL(I, B) I - B /* Load an entry in a jump table into EBX and branch to it. TABLE is a - jump table with relative offsets. INDEX is a register contains the - index into the jump table. SCALE is the scale of INDEX. */ -# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ - /* We first load PC into EBX. */ \ - SETUP_PIC_REG(bx); \ - /* Get the address of the jump table. */ \ - addl $(TABLE - .), %ebx; \ - /* Get the entry and convert the relative offset to the \ - absolute address. */ \ - addl (%ebx,INDEX,SCALE), %ebx; \ - /* We loaded the jump table. Go. */ \ - jmp *%ebx - -# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE) \ - addl $(TABLE - .), %ebx - -# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \ - addl (%ebx,INDEX,SCALE), %ebx; \ - /* We loaded the jump table. Go. */ \ - jmp *%ebx -#else -# define PARMS 4 -# define ENTRANCE -# define RETURN_END ret -# define RETURN RETURN_END -# define JMPTBL(I, B) I + jump table with relative offsets. INDEX is a register contains the + index into the jump table. SCALE is the scale of INDEX. */ + +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + /* We first load PC into EBX. */ \ + SETUP_PIC_REG(bx); \ + /* Get the address of the jump table. */ \ + addl $(TABLE - .), %ebx; \ + /* Get the entry and convert the relative offset to the \ + absolute address. */ \ + addl (%ebx, INDEX, SCALE), %ebx; \ + /* We loaded the jump table. Go. */ \ + jmp *%ebx +# else -/* Branch to an entry in a jump table. TABLE is a jump table with - absolute offsets. INDEX is a register contains the index into the - jump table. SCALE is the scale of INDEX. */ -# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ - jmp *TABLE(,INDEX,SCALE) +# define PARMS 4 +# define ENTRANCE +# define RETURN_END ret +# define RETURN RETURN_END +# define JMPTBL(I, B) I -# define BRANCH_TO_JMPTBL_ENTRY_VALUE(TABLE) +/* Branch to an entry in a jump table. TABLE is a jump table with + absolute offsets. INDEX is a register contains the index into the + jump table. SCALE is the scale of INDEX. */ -# define BRANCH_TO_JMPTBL_ENTRY_TAIL(TABLE, INDEX, SCALE) \ - jmp *TABLE(,INDEX,SCALE) -#endif +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + jmp *TABLE(, INDEX, SCALE) +# endif .section .text.ssse3,"ax",@progbits -#if !defined USE_AS_BCOPY +# if !defined USE_AS_BCOPY ENTRY (MEMCPY_CHK) movl 12(%esp), %eax cmpl %eax, 16(%esp) jb HIDDEN_JUMPTARGET (__chk_fail) END (MEMCPY_CHK) -#endif +# endif ENTRY (MEMCPY) ENTRANCE movl LEN(%esp), %ecx movl SRC(%esp), %eax movl DEST(%esp), %edx -#ifdef USE_AS_MEMMOVE +# ifdef USE_AS_MEMMOVE cmp %eax, %edx jb L(copy_forward) je L(fwd_write_0bytes) cmp $32, %ecx jae L(memmove_bwd) jmp L(bk_write_less32bytes_2) + + .p2align 4 L(memmove_bwd): add %ecx, %eax cmp %eax, %edx @@ -127,67 +118,72 @@ L(memmove_bwd): jb L(copy_backward) L(copy_forward): -#endif +# endif cmp $48, %ecx jae L(48bytesormore) L(fwd_write_less32bytes): -#ifndef USE_AS_MEMMOVE +# ifndef USE_AS_MEMMOVE cmp %dl, %al jb L(bk_write) -#endif +# endif add %ecx, %edx add %ecx, %eax BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) -#ifndef USE_AS_MEMMOVE +# ifndef USE_AS_MEMMOVE + .p2align 4 L(bk_write): BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) -#endif +# endif - ALIGN (4) -/* ECX > 32 and EDX is 4 byte aligned. */ + .p2align 4 L(48bytesormore): +# ifndef USE_AS_MEMMOVE + movlpd (%eax), %xmm0 + movlpd 8(%eax), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 8(%edx) +# else movdqu (%eax), %xmm0 +# endif PUSH (%edi) movl %edx, %edi and $-16, %edx - PUSH (%esi) - cfi_remember_state add $16, %edx - movl %edi, %esi sub %edx, %edi add %edi, %ecx sub %edi, %eax -#ifdef SHARED_CACHE_SIZE_HALF +# ifdef SHARED_CACHE_SIZE_HALF cmp $SHARED_CACHE_SIZE_HALF, %ecx -#else -# ifdef SHARED +# else +# ifdef SHARED SETUP_PIC_REG(bx) add $_GLOBAL_OFFSET_TABLE_, %ebx cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx -# else +# else cmp __x86_shared_cache_size_half, %ecx +# endif # endif -#endif mov %eax, %edi jae L(large_page) and $0xf, %edi jz L(shl_0) - BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4) - cfi_restore_state - cfi_remember_state - ALIGN (4) + .p2align 4 L(shl_0): - movdqu %xmm0, (%esi) +# ifdef USE_AS_MEMMOVE + movl DEST+4(%esp), %edi + movdqu %xmm0, (%edi) +# endif xor %edi, %edi - POP (%esi) cmp $127, %ecx ja L(shl_0_gobble) lea -32(%ecx), %ecx + + .p2align 4 L(shl_0_loop): movdqa (%eax, %edi), %xmm0 movdqa 16(%eax, %edi), %xmm1 @@ -219,6 +215,7 @@ L(shl_0_loop): movdqa %xmm0, (%edx, %edi) movdqa %xmm1, 16(%edx, %edi) lea 32(%edi), %edi + L(shl_0_end): lea 32(%ecx), %ecx add %ecx, %edi @@ -228,23 +225,25 @@ L(shl_0_end): BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4) CFI_PUSH (%edi) -L(shl_0_gobble): -#ifdef DATA_CACHE_SIZE_HALF + .p2align 4 +L(shl_0_gobble): +# ifdef DATA_CACHE_SIZE_HALF cmp $DATA_CACHE_SIZE_HALF, %ecx -#else -# ifdef SHARED +# else +# ifdef SHARED SETUP_PIC_REG(bx) add $_GLOBAL_OFFSET_TABLE_, %ebx cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx -# else +# else cmp __x86_data_cache_size_half, %ecx +# endif # endif -#endif - - POP (%edi) + POP (%edi) lea -128(%ecx), %ecx jae L(shl_0_gobble_mem_loop) + + .p2align 4 L(shl_0_gobble_cache_loop): movdqa (%eax), %xmm0 movdqa 0x10(%eax), %xmm1 @@ -274,17 +273,15 @@ L(shl_0_gobble_cache_loop): movdqa (%eax), %xmm0 sub $0x40, %ecx movdqa 0x10(%eax), %xmm1 - movdqa %xmm0, (%edx) movdqa %xmm1, 0x10(%edx) - movdqa 0x20(%eax), %xmm0 movdqa 0x30(%eax), %xmm1 add $0x40, %eax - movdqa %xmm0, 0x20(%edx) movdqa %xmm1, 0x30(%edx) add $0x40, %edx + L(shl_0_cache_less_64bytes): cmp $0x20, %ecx jb L(shl_0_cache_less_32bytes) @@ -295,6 +292,7 @@ L(shl_0_cache_less_64bytes): movdqa %xmm0, (%edx) movdqa %xmm1, 0x10(%edx) add $0x20, %edx + L(shl_0_cache_less_32bytes): cmp $0x10, %ecx jb L(shl_0_cache_less_16bytes) @@ -303,13 +301,13 @@ L(shl_0_cache_less_32bytes): add $0x10, %eax movdqa %xmm0, (%edx) add $0x10, %edx + L(shl_0_cache_less_16bytes): add %ecx, %edx add %ecx, %eax BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) - - ALIGN (4) + .p2align 4 L(shl_0_gobble_mem_loop): prefetcht0 0x1c0(%eax) prefetcht0 0x280(%eax) @@ -354,6 +352,7 @@ L(shl_0_gobble_mem_loop): movdqa %xmm0, 0x20(%edx) movdqa %xmm1, 0x30(%edx) add $0x40, %edx + L(shl_0_mem_less_64bytes): cmp $0x20, %ecx jb L(shl_0_mem_less_32bytes) @@ -364,6 +363,7 @@ L(shl_0_mem_less_64bytes): movdqa %xmm0, (%edx) movdqa %xmm1, 0x10(%edx) add $0x20, %edx + L(shl_0_mem_less_32bytes): cmp $0x10, %ecx jb L(shl_0_mem_less_16bytes) @@ -372,24 +372,84 @@ L(shl_0_mem_less_32bytes): add $0x10, %eax movdqa %xmm0, (%edx) add $0x10, %edx + L(shl_0_mem_less_16bytes): add %ecx, %edx add %ecx, %eax BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4) - cfi_restore_state - cfi_remember_state - ALIGN (4) + .p2align 4 L(shl_1): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) +# ifndef USE_AS_MEMMOVE + movaps -1(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -1(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_1_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl1LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 15(%eax), %xmm2 + movaps 31(%eax), %xmm3 + movaps 47(%eax), %xmm4 + movaps 63(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $1, %xmm4, %xmm5 + palignr $1, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $1, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $1, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl1LoopStart) + +L(Shl1LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 15(%eax), %xmm2 + movaps 31(%eax), %xmm3 + palignr $1, %xmm2, %xmm3 + palignr $1, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_1_no_prefetch): + lea -32(%ecx), %ecx lea -1(%eax), %eax - movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_1_loop): + .p2align 4 +L(sh_1_no_prefetch_loop): movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx movdqa 32(%eax, %edi), %xmm3 @@ -399,8 +459,7 @@ L(shl_1_loop): lea 32(%edi), %edi movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - - jb L(shl_1_end) + jb L(sh_1_end_no_prefetch_loop) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -411,30 +470,90 @@ L(shl_1_loop): lea 32(%edi), %edi movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) + jae L(sh_1_no_prefetch_loop) - jae L(shl_1_loop) - -L(shl_1_end): +L(sh_1_end_no_prefetch_loop): lea 32(%ecx), %ecx add %ecx, %edi add %edi, %edx lea 1(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - cfi_restore_state - cfi_remember_state - ALIGN (4) + CFI_PUSH (%edi) + + .p2align 4 L(shl_2): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) +# ifndef USE_AS_MEMMOVE + movaps -2(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -2(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_2_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl2LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 14(%eax), %xmm2 + movaps 30(%eax), %xmm3 + movaps 46(%eax), %xmm4 + movaps 62(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $2, %xmm4, %xmm5 + palignr $2, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $2, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $2, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl2LoopStart) + +L(Shl2LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 14(%eax), %xmm2 + movaps 30(%eax), %xmm3 + palignr $2, %xmm2, %xmm3 + palignr $2, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_2_no_prefetch): + lea -32(%ecx), %ecx lea -2(%eax), %eax - movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_2_loop): + .p2align 4 +L(sh_2_no_prefetch_loop): movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx movdqa 32(%eax, %edi), %xmm3 @@ -444,8 +563,7 @@ L(shl_2_loop): lea 32(%edi), %edi movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - - jb L(shl_2_end) + jb L(sh_2_end_no_prefetch_loop) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -456,30 +574,90 @@ L(shl_2_loop): lea 32(%edi), %edi movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) + jae L(sh_2_no_prefetch_loop) - jae L(shl_2_loop) - -L(shl_2_end): +L(sh_2_end_no_prefetch_loop): lea 32(%ecx), %ecx add %ecx, %edi add %edi, %edx lea 2(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - cfi_restore_state - cfi_remember_state - ALIGN (4) + CFI_PUSH (%edi) + + .p2align 4 L(shl_3): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) +# ifndef USE_AS_MEMMOVE + movaps -3(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -3(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_3_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl3LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 13(%eax), %xmm2 + movaps 29(%eax), %xmm3 + movaps 45(%eax), %xmm4 + movaps 61(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $3, %xmm4, %xmm5 + palignr $3, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $3, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $3, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl3LoopStart) + +L(Shl3LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 13(%eax), %xmm2 + movaps 29(%eax), %xmm3 + palignr $3, %xmm2, %xmm3 + palignr $3, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_3_no_prefetch): + lea -32(%ecx), %ecx lea -3(%eax), %eax - movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_3_loop): + .p2align 4 +L(sh_3_no_prefetch_loop): movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx movdqa 32(%eax, %edi), %xmm3 @@ -490,7 +668,7 @@ L(shl_3_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jb L(shl_3_end) + jb L(sh_3_end_no_prefetch_loop) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -502,29 +680,90 @@ L(shl_3_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jae L(shl_3_loop) + jae L(sh_3_no_prefetch_loop) -L(shl_3_end): +L(sh_3_end_no_prefetch_loop): lea 32(%ecx), %ecx add %ecx, %edi add %edi, %edx lea 3(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) - cfi_restore_state - cfi_remember_state - ALIGN (4) + .p2align 4 L(shl_4): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) +# ifndef USE_AS_MEMMOVE + movaps -4(%eax), %xmm1 +# else + movl DEST+4(%esp), %edi + movaps -4(%eax), %xmm1 + movdqu %xmm0, (%edi) +# endif +# ifdef DATA_CACHE_SIZE_HALF + cmp $DATA_CACHE_SIZE_HALF, %ecx +# else +# ifdef SHARED + SETUP_PIC_REG(bx) + add $_GLOBAL_OFFSET_TABLE_, %ebx + cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx +# else + cmp __x86_data_cache_size_half, %ecx +# endif +# endif + jb L(sh_4_no_prefetch) + + lea -64(%ecx), %ecx + + .p2align 4 +L(Shl4LoopStart): + prefetcht0 0x1c0(%eax) + prefetcht0 0x1c0(%edx) + movaps 12(%eax), %xmm2 + movaps 28(%eax), %xmm3 + movaps 44(%eax), %xmm4 + movaps 60(%eax), %xmm5 + movaps %xmm5, %xmm7 + palignr $4, %xmm4, %xmm5 + palignr $4, %xmm3, %xmm4 + movaps %xmm5, 48(%edx) + palignr $4, %xmm2, %xmm3 + lea 64(%eax), %eax + palignr $4, %xmm1, %xmm2 + movaps %xmm4, 32(%edx) + movaps %xmm3, 16(%edx) + movaps %xmm7, %xmm1 + movaps %xmm2, (%edx) + lea 64(%edx), %edx + sub $64, %ecx + ja L(Shl4LoopStart) + +L(Shl4LoopLeave): + add $32, %ecx + jle L(shl_end_0) + + movaps 12(%eax), %xmm2 + movaps 28(%eax), %xmm3 + palignr $4, %xmm2, %xmm3 + palignr $4, %xmm1, %xmm2 + movaps %xmm2, (%edx) + movaps %xmm3, 16(%edx) + lea 32(%edx, %ecx), %edx + lea 32(%eax, %ecx), %eax + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) + + CFI_PUSH (%edi) + + .p2align 4 +L(sh_4_no_prefetch): + lea -32(%ecx), %ecx lea -4(%eax), %eax - movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx - movdqu %xmm0, (%esi) - POP (%esi) -L(shl_4_loop): + .p2align 4 +L(sh_4_no_prefetch_loop): movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx movdqa 32(%eax, %edi), %xmm3 @@ -535,7 +774,7 @@ L(shl_4_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jb L(shl_4_end) + jb L(sh_4_end_no_prefetch_loop) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -547,29 +786,90 @@ L(shl_4_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jae L(shl_4_loop) + jae L(sh_4_no_prefetch_loop) -L(shl_4_end): +L(sh_4_end_no_prefetch_loop): lea 32(%ecx), %ecx add %ecx, %edi add %edi, %edx lea 4(%edi, %eax), %eax - POP (%edi) - BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + POP (%edi) + BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4) - cfi_restore_state - cfi_remember_state - ALIGN (4) + CFI_PUSH (%edi) + + .p2align 4 L(shl_5): - BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) +# ifndef USE_AS_MEMMOVE + movaps -5(%eax), %xmm1 +# else + |
