aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorH.J. Lu <hjl.tools@gmail.com>2016-06-08 13:57:50 -0700
committerH.J. Lu <hjl.tools@gmail.com>2016-06-08 13:58:08 -0700
commitc867597bff2562180a18da4b8dba89d24e8b65c4 (patch)
tree3770c51728e718a0fffe569aca738749982b535a
parent5e8c5bb1ac83aa2577d64d82467a653fa413f7ce (diff)
downloadglibc-c867597bff2562180a18da4b8dba89d24e8b65c4.tar.xz
glibc-c867597bff2562180a18da4b8dba89d24e8b65c4.zip
X86-64: Remove previous default/SSE2/AVX2 memcpy/memmove
Since the new SSE2/AVX2 memcpy/memmove are faster than the previous ones, we can remove the previous SSE2/AVX2 memcpy/memmove and replace them with the new ones. No change in IFUNC selection if SSE2 and AVX2 memcpy/memmove weren't used before. If SSE2 or AVX2 memcpy/memmove were used, the new SSE2 or AVX2 memcpy/memmove optimized with Enhanced REP MOVSB will be used for processors with ERMS. The new AVX512 memcpy/memmove will be used for processors with AVX512 which prefer vzeroupper. Since the new SSE2 memcpy/memmove are faster than the previous default memcpy/memmove used in libc.a and ld.so, we also remove the previous default memcpy/memmove and make them the default memcpy/memmove, except that non-temporal store isn't used in ld.so. Together, it reduces the size of libc.so by about 6 KB and the size of ld.so by about 2 KB. [BZ #19776] * sysdeps/x86_64/memcpy.S: Make it dummy. * sysdeps/x86_64/mempcpy.S: Likewise. * sysdeps/x86_64/memmove.S: New file. * sysdeps/x86_64/memmove_chk.S: Likewise. * sysdeps/x86_64/multiarch/memmove.S: Likewise. * sysdeps/x86_64/multiarch/memmove_chk.S: Likewise. * sysdeps/x86_64/memmove.c: Removed. * sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S: Likewise. * sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S: Likewise. * sysdeps/x86_64/multiarch/memmove-avx-unaligned.S: Likewise. * sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S: Likewise. * sysdeps/x86_64/multiarch/memmove.c: Likewise. * sysdeps/x86_64/multiarch/memmove_chk.c: Likewise. * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Remove memcpy-sse2-unaligned, memmove-avx-unaligned, memcpy-avx-unaligned and memmove-sse2-unaligned-erms. * sysdeps/x86_64/multiarch/ifunc-impl-list.c (__libc_ifunc_impl_list): Replace __memmove_chk_avx512_unaligned_2 with __memmove_chk_avx512_unaligned. Remove __memmove_chk_avx_unaligned_2. Replace __memmove_chk_sse2_unaligned_2 with __memmove_chk_sse2_unaligned. Remove __memmove_chk_sse2 and __memmove_avx_unaligned_2. Replace __memmove_avx512_unaligned_2 with __memmove_avx512_unaligned. Replace __memmove_sse2_unaligned_2 with __memmove_sse2_unaligned. Remove __memmove_sse2. Replace __memcpy_chk_avx512_unaligned_2 with __memcpy_chk_avx512_unaligned. Remove __memcpy_chk_avx_unaligned_2. Replace __memcpy_chk_sse2_unaligned_2 with __memcpy_chk_sse2_unaligned. Remove __memcpy_chk_sse2. Remove __memcpy_avx_unaligned_2. Replace __memcpy_avx512_unaligned_2 with __memcpy_avx512_unaligned. Remove __memcpy_sse2_unaligned_2 and __memcpy_sse2. Replace __mempcpy_chk_avx512_unaligned_2 with __mempcpy_chk_avx512_unaligned. Remove __mempcpy_chk_avx_unaligned_2. Replace __mempcpy_chk_sse2_unaligned_2 with __mempcpy_chk_sse2_unaligned. Remove __mempcpy_chk_sse2. Replace __mempcpy_avx512_unaligned_2 with __mempcpy_avx512_unaligned. Remove __mempcpy_avx_unaligned_2. Replace __mempcpy_sse2_unaligned_2 with __mempcpy_sse2_unaligned. Remove __mempcpy_sse2. * sysdeps/x86_64/multiarch/memcpy.S (__new_memcpy): Support __memcpy_avx512_unaligned_erms and __memcpy_avx512_unaligned. Use __memcpy_avx_unaligned_erms and __memcpy_sse2_unaligned_erms if processor has ERMS. Default to __memcpy_sse2_unaligned. (ENTRY): Removed. (END): Likewise. (ENTRY_CHK): Likewise. (libc_hidden_builtin_def): Likewise. Don't include ../memcpy.S. * sysdeps/x86_64/multiarch/memcpy_chk.S (__memcpy_chk): Support __memcpy_chk_avx512_unaligned_erms and __memcpy_chk_avx512_unaligned. Use __memcpy_chk_avx_unaligned_erms and __memcpy_chk_sse2_unaligned_erms if if processor has ERMS. Default to __memcpy_chk_sse2_unaligned. * sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S Change function suffix from unaligned_2 to unaligned. * sysdeps/x86_64/multiarch/mempcpy.S (__mempcpy): Support __mempcpy_avx512_unaligned_erms and __mempcpy_avx512_unaligned. Use __mempcpy_avx_unaligned_erms and __mempcpy_sse2_unaligned_erms if processor has ERMS. Default to __mempcpy_sse2_unaligned. (ENTRY): Removed. (END): Likewise. (ENTRY_CHK): Likewise. (libc_hidden_builtin_def): Likewise. Don't include ../mempcpy.S. (mempcpy): New. Add a weak alias. * sysdeps/x86_64/multiarch/mempcpy_chk.S (__mempcpy_chk): Support __mempcpy_chk_avx512_unaligned_erms and __mempcpy_chk_avx512_unaligned. Use __mempcpy_chk_avx_unaligned_erms and __mempcpy_chk_sse2_unaligned_erms if if processor has ERMS. Default to __mempcpy_chk_sse2_unaligned.
-rw-r--r--ChangeLog80
-rw-r--r--sysdeps/x86_64/memcpy.S585
-rw-r--r--sysdeps/x86_64/memmove.S71
-rw-r--r--sysdeps/x86_64/memmove_chk.S (renamed from sysdeps/x86_64/memmove.c)23
-rw-r--r--sysdeps/x86_64/mempcpy.S9
-rw-r--r--sysdeps/x86_64/multiarch/Makefile6
-rw-r--r--sysdeps/x86_64/multiarch/ifunc-impl-list.c63
-rw-r--r--sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S391
-rw-r--r--sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S175
-rw-r--r--sysdeps/x86_64/multiarch/memcpy.S62
-rw-r--r--sysdeps/x86_64/multiarch/memcpy_chk.S40
-rw-r--r--sysdeps/x86_64/multiarch/memmove-avx-unaligned.S22
-rw-r--r--sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S13
-rw-r--r--sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S24
-rw-r--r--sysdeps/x86_64/multiarch/memmove.S98
-rw-r--r--sysdeps/x86_64/multiarch/memmove.c73
-rw-r--r--sysdeps/x86_64/multiarch/memmove_chk.S71
-rw-r--r--sysdeps/x86_64/multiarch/memmove_chk.c46
-rw-r--r--sysdeps/x86_64/multiarch/mempcpy.S74
-rw-r--r--sysdeps/x86_64/multiarch/mempcpy_chk.S38
20 files changed, 474 insertions, 1490 deletions
diff --git a/ChangeLog b/ChangeLog
index b811b0e12d..20e21393d0 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,85 @@
2016-06-08 H.J. Lu <hongjiu.lu@intel.com>
+ [BZ #19776]
+ * sysdeps/x86_64/memcpy.S: Make it dummy.
+ * sysdeps/x86_64/mempcpy.S: Likewise.
+ * sysdeps/x86_64/memmove.S: New file.
+ * sysdeps/x86_64/memmove_chk.S: Likewise.
+ * sysdeps/x86_64/multiarch/memmove.S: Likewise.
+ * sysdeps/x86_64/multiarch/memmove_chk.S: Likewise.
+ * sysdeps/x86_64/memmove.c: Removed.
+ * sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S: Likewise.
+ * sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S: Likewise.
+ * sysdeps/x86_64/multiarch/memmove-avx-unaligned.S: Likewise.
+ * sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S:
+ Likewise.
+ * sysdeps/x86_64/multiarch/memmove.c: Likewise.
+ * sysdeps/x86_64/multiarch/memmove_chk.c: Likewise.
+ * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Remove
+ memcpy-sse2-unaligned, memmove-avx-unaligned,
+ memcpy-avx-unaligned and memmove-sse2-unaligned-erms.
+ * sysdeps/x86_64/multiarch/ifunc-impl-list.c
+ (__libc_ifunc_impl_list): Replace
+ __memmove_chk_avx512_unaligned_2 with
+ __memmove_chk_avx512_unaligned. Remove
+ __memmove_chk_avx_unaligned_2. Replace
+ __memmove_chk_sse2_unaligned_2 with
+ __memmove_chk_sse2_unaligned. Remove __memmove_chk_sse2 and
+ __memmove_avx_unaligned_2. Replace __memmove_avx512_unaligned_2
+ with __memmove_avx512_unaligned. Replace
+ __memmove_sse2_unaligned_2 with __memmove_sse2_unaligned.
+ Remove __memmove_sse2. Replace __memcpy_chk_avx512_unaligned_2
+ with __memcpy_chk_avx512_unaligned. Remove
+ __memcpy_chk_avx_unaligned_2. Replace
+ __memcpy_chk_sse2_unaligned_2 with __memcpy_chk_sse2_unaligned.
+ Remove __memcpy_chk_sse2. Remove __memcpy_avx_unaligned_2.
+ Replace __memcpy_avx512_unaligned_2 with
+ __memcpy_avx512_unaligned. Remove __memcpy_sse2_unaligned_2
+ and __memcpy_sse2. Replace __mempcpy_chk_avx512_unaligned_2
+ with __mempcpy_chk_avx512_unaligned. Remove
+ __mempcpy_chk_avx_unaligned_2. Replace
+ __mempcpy_chk_sse2_unaligned_2 with
+ __mempcpy_chk_sse2_unaligned. Remove __mempcpy_chk_sse2.
+ Replace __mempcpy_avx512_unaligned_2 with
+ __mempcpy_avx512_unaligned. Remove __mempcpy_avx_unaligned_2.
+ Replace __mempcpy_sse2_unaligned_2 with
+ __mempcpy_sse2_unaligned. Remove __mempcpy_sse2.
+ * sysdeps/x86_64/multiarch/memcpy.S (__new_memcpy): Support
+ __memcpy_avx512_unaligned_erms and __memcpy_avx512_unaligned.
+ Use __memcpy_avx_unaligned_erms and __memcpy_sse2_unaligned_erms
+ if processor has ERMS. Default to __memcpy_sse2_unaligned.
+ (ENTRY): Removed.
+ (END): Likewise.
+ (ENTRY_CHK): Likewise.
+ (libc_hidden_builtin_def): Likewise.
+ Don't include ../memcpy.S.
+ * sysdeps/x86_64/multiarch/memcpy_chk.S (__memcpy_chk): Support
+ __memcpy_chk_avx512_unaligned_erms and
+ __memcpy_chk_avx512_unaligned. Use
+ __memcpy_chk_avx_unaligned_erms and
+ __memcpy_chk_sse2_unaligned_erms if if processor has ERMS.
+ Default to __memcpy_chk_sse2_unaligned.
+ * sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+ Change function suffix from unaligned_2 to unaligned.
+ * sysdeps/x86_64/multiarch/mempcpy.S (__mempcpy): Support
+ __mempcpy_avx512_unaligned_erms and __mempcpy_avx512_unaligned.
+ Use __mempcpy_avx_unaligned_erms and __mempcpy_sse2_unaligned_erms
+ if processor has ERMS. Default to __mempcpy_sse2_unaligned.
+ (ENTRY): Removed.
+ (END): Likewise.
+ (ENTRY_CHK): Likewise.
+ (libc_hidden_builtin_def): Likewise.
+ Don't include ../mempcpy.S.
+ (mempcpy): New. Add a weak alias.
+ * sysdeps/x86_64/multiarch/mempcpy_chk.S (__mempcpy_chk): Support
+ __mempcpy_chk_avx512_unaligned_erms and
+ __mempcpy_chk_avx512_unaligned. Use
+ __mempcpy_chk_avx_unaligned_erms and
+ __mempcpy_chk_sse2_unaligned_erms if if processor has ERMS.
+ Default to __mempcpy_chk_sse2_unaligned.
+
+2016-06-08 H.J. Lu <hongjiu.lu@intel.com>
+
[BZ #19881]
* sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S: Folded
into ...
diff --git a/sysdeps/x86_64/memcpy.S b/sysdeps/x86_64/memcpy.S
index f6e3d9396c..d98500a78a 100644
--- a/sysdeps/x86_64/memcpy.S
+++ b/sysdeps/x86_64/memcpy.S
@@ -1,584 +1 @@
-/*
- Optimized memcpy for x86-64.
-
- Copyright (C) 2007-2016 Free Software Foundation, Inc.
- Contributed by Evandro Menezes <evandro.menezes@amd.com>, 2007.
-
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>.
-*/
-
-#include <sysdep.h>
-#include "asm-syntax.h"
-
-/* Stack slots in the red-zone. */
-
-#ifdef USE_AS_MEMPCPY
-# define RETVAL (0)
-#else
-# define RETVAL (-8)
-# if defined SHARED && !defined USE_MULTIARCH && IS_IN (libc)
-# define memcpy __memcpy
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(name) \
- .globl __GI_memcpy; __GI_memcpy = __memcpy
-# endif
-#endif
-#define SAVE0 (RETVAL - 8)
-#define SAVE1 (SAVE0 - 8)
-#define SAVE2 (SAVE1 - 8)
-#define SAVE3 (SAVE2 - 8)
-
- .text
-
-#if defined PIC && IS_IN (libc)
-ENTRY_CHK (__memcpy_chk)
-
- cmpq %rdx, %rcx
- jb HIDDEN_JUMPTARGET (__chk_fail)
-
-END_CHK (__memcpy_chk)
-#endif
-
-ENTRY(memcpy) /* (void *, const void*, size_t) */
-
-/* Handle tiny blocks. */
-
-L(1try): /* up to 32B */
- cmpq $32, %rdx
-#ifndef USE_AS_MEMPCPY
- movq %rdi, %rax /* save return value */
-#endif
- jae L(1after)
-
-L(1): /* 1-byte once */
- testb $1, %dl
- jz L(1a)
-
- movzbl (%rsi), %ecx
- movb %cl, (%rdi)
-
- incq %rsi
- incq %rdi
-
- .p2align 4,, 4
-
-L(1a): /* 2-byte once */
- testb $2, %dl
- jz L(1b)
-
- movzwl (%rsi), %ecx
- movw %cx, (%rdi)
-
- addq $2, %rsi
- addq $2, %rdi
-
- .p2align 4,, 4
-
-L(1b): /* 4-byte once */
- testb $4, %dl
- jz L(1c)
-
- movl (%rsi), %ecx
- movl %ecx, (%rdi)
-
- addq $4, %rsi
- addq $4, %rdi
-
- .p2align 4,, 4
-
-L(1c): /* 8-byte once */
- testb $8, %dl
- jz L(1d)
-
- movq (%rsi), %rcx
- movq %rcx, (%rdi)
-
- addq $8, %rsi
- addq $8, %rdi
-
- .p2align 4,, 4
-
-L(1d): /* 16-byte loop */
- andl $0xf0, %edx
- jz L(exit)
-
- .p2align 4
-
-L(1loop):
- movq (%rsi), %rcx
- movq 8(%rsi), %r8
- movq %rcx, (%rdi)
- movq %r8, 8(%rdi)
-
- subl $16, %edx
-
- leaq 16(%rsi), %rsi
- leaq 16(%rdi), %rdi
-
- jnz L(1loop)
-
- .p2align 4,, 4
-
-L(exit): /* exit */
-#ifdef USE_AS_MEMPCPY
- movq %rdi, %rax /* return value */
-#else
- rep
-#endif
- retq
-
- .p2align 4
-
-L(1after):
-#ifndef USE_AS_MEMPCPY
- movq %rax, RETVAL(%rsp) /* save return value */
-#endif
-
-/* Align to the natural word size. */
-
-L(aligntry):
- movl %esi, %ecx /* align by source */
-
- andl $7, %ecx
- jz L(alignafter) /* already aligned */
-
-L(align): /* align */
- leaq -8(%rcx, %rdx), %rdx /* calculate remaining bytes */
- subl $8, %ecx
-
- .p2align 4
-
-L(alignloop): /* 1-byte alignment loop */
- movzbl (%rsi), %eax
- movb %al, (%rdi)
-
- incl %ecx
-
- leaq 1(%rsi), %rsi
- leaq 1(%rdi), %rdi
-
- jnz L(alignloop)
-
- .p2align 4
-
-L(alignafter):
-
-/* Handle mid-sized blocks. */
-
-L(32try): /* up to 1KB */
- cmpq $1024, %rdx
- ja L(32after)
-
-L(32): /* 32-byte loop */
- movl %edx, %ecx
- shrl $5, %ecx
- jz L(32skip)
-
- .p2align 4
-
-L(32loop):
- decl %ecx
-
- movq (%rsi), %rax
- movq 8(%rsi), %r8
- movq 16(%rsi), %r9
- movq 24(%rsi), %r10
-
- movq %rax, (%rdi)
- movq %r8, 8(%rdi)
- movq %r9, 16(%rdi)
- movq %r10, 24(%rdi)
-
- leaq 32(%rsi), %rsi
- leaq 32(%rdi), %rdi
-
- jz L(32skip) /* help out smaller blocks */
-
- decl %ecx
-
- movq (%rsi), %rax
- movq 8(%rsi), %r8
- movq 16(%rsi), %r9
- movq 24(%rsi), %r10
-
- movq %rax, (%rdi)
- movq %r8, 8(%rdi)
- movq %r9, 16(%rdi)
- movq %r10, 24(%rdi)
-
- leaq 32(%rsi), %rsi
- leaq 32(%rdi), %rdi
-
- jnz L(32loop)
-
- .p2align 4
-
-L(32skip):
- andl $31, %edx /* check for left overs */
-#ifdef USE_AS_MEMPCPY
- jnz L(1)
-
- movq %rdi, %rax
-#else
- movq RETVAL(%rsp), %rax
- jnz L(1)
-
- rep
-#endif
- retq /* exit */
-
- .p2align 4
-
-L(32after):
-
-/*
- In order to minimize code-size in RTLD, algorithms specific for
- larger blocks are excluded when building for RTLD.
-*/
-
-/* Handle blocks smaller than 1/2 L1. */
-
-L(fasttry): /* first 1/2 L1 */
-#if IS_IN (libc) /* only up to this algorithm outside of libc.so */
- mov __x86_data_cache_size_half(%rip), %R11_LP
- cmpq %rdx, %r11 /* calculate the smaller of */
- cmovaq %rdx, %r11 /* remaining bytes and 1/2 L1 */
-#endif
-
-L(fast): /* good ol' MOVS */
-#if IS_IN (libc)
- movq %r11, %rcx
- andq $-8, %r11
-#else
- movq %rdx, %rcx
-#endif
- shrq $3, %rcx
- jz L(fastskip)
-
- rep
- movsq
-
- .p2align 4,, 4
-
-L(fastskip):
-#if IS_IN (libc)
- subq %r11, %rdx /* check for more */
- testq $-8, %rdx
- jnz L(fastafter)
-#endif
-
- andl $7, %edx /* check for left overs */
-#ifdef USE_AS_MEMPCPY
- jnz L(1)
-
- movq %rdi, %rax
-#else
- movq RETVAL(%rsp), %rax
- jnz L(1)
-
- rep
-#endif
- retq /* exit */
-
-#if IS_IN (libc) /* none of the algorithms below for RTLD */
-
- .p2align 4
-
-L(fastafter):
-
-/* Handle large blocks smaller than 1/2 L2. */
-
-L(pretry): /* first 1/2 L2 */
- mov __x86_shared_cache_size_half (%rip), %R8_LP
- cmpq %rdx, %r8 /* calculate the lesser of */
- cmovaq %rdx, %r8 /* remaining bytes and 1/2 L2 */
-
-L(pre): /* 64-byte with prefetching */
- movq %r8, %rcx
- andq $-64, %r8
- shrq $6, %rcx
- jz L(preskip)
-
- movq %r14, SAVE0(%rsp)
- cfi_rel_offset (%r14, SAVE0)
- movq %r13, SAVE1(%rsp)
- cfi_rel_offset (%r13, SAVE1)
- movq %r12, SAVE2(%rsp)
- cfi_rel_offset (%r12, SAVE2)
- movq %rbx, SAVE3(%rsp)
- cfi_rel_offset (%rbx, SAVE3)
-
- cmpl $0, __x86_prefetchw(%rip)
- jz L(preloop) /* check if PREFETCHW OK */
-
- .p2align 4
-
-/* ... when PREFETCHW is available (less cache-probe traffic in MP systems). */
-
-L(prewloop): /* cache-line in state M */
- decq %rcx
-
- movq (%rsi), %rax
- movq 8 (%rsi), %rbx
- movq 16 (%rsi), %r9
- movq 24 (%rsi), %r10
- movq 32 (%rsi), %r11
- movq 40 (%rsi), %r12
- movq 48 (%rsi), %r13
- movq 56 (%rsi), %r14
-
- prefetcht0 0 + 896 (%rsi)
- prefetcht0 64 + 896 (%rsi)
-
- movq %rax, (%rdi)
- movq %rbx, 8(%rdi)
- movq %r9, 16(%rdi)
- movq %r10, 24(%rdi)
- movq %r11, 32(%rdi)
- movq %r12, 40(%rdi)
- movq %r13, 48(%rdi)
- movq %r14, 56(%rdi)
-
- leaq 64(%rsi), %rsi
- leaq 64(%rdi), %rdi
-
- jz L(prebail)
-
- decq %rcx
-
- movq (%rsi), %rax
- movq 8(%rsi), %rbx
- movq 16(%rsi), %r9
- movq 24(%rsi), %r10
- movq 32(%rsi), %r11
- movq 40(%rsi), %r12
- movq 48(%rsi), %r13
- movq 56(%rsi), %r14
-
- movq %rax, (%rdi)
- movq %rbx, 8(%rdi)
- movq %r9, 16(%rdi)
- movq %r10, 24(%rdi)
- movq %r11, 32(%rdi)
- movq %r12, 40(%rdi)
- movq %r13, 48(%rdi)
- movq %r14, 56(%rdi)
-
- prefetchw 896 - 64(%rdi)
- prefetchw 896 - 0(%rdi)
-
- leaq 64(%rsi), %rsi
- leaq 64(%rdi), %rdi
-
- jnz L(prewloop)
- jmp L(prebail)
-
- .p2align 4
-
-/* ... when PREFETCHW is not available. */
-
-L(preloop): /* cache-line in state E */
- decq %rcx
-
- movq (%rsi), %rax
- movq 8(%rsi), %rbx
- movq 16(%rsi), %r9
- movq 24(%rsi), %r10
- movq 32(%rsi), %r11
- movq 40(%rsi), %r12
- movq 48(%rsi), %r13
- movq 56(%rsi), %r14
-
- prefetcht0 896 + 0(%rsi)
- prefetcht0 896 + 64(%rsi)
-
- movq %rax, (%rdi)
- movq %rbx, 8(%rdi)
- movq %r9, 16(%rdi)
- movq %r10, 24(%rdi)
- movq %r11, 32(%rdi)
- movq %r12, 40(%rdi)
- movq %r13, 48(%rdi)
- movq %r14, 56(%rdi)
-
- leaq 64 (%rsi), %rsi
- leaq 64 (%rdi), %rdi
-
- jz L(prebail)
-
- decq %rcx
-
- movq (%rsi), %rax
- movq 8(%rsi), %rbx
- movq 16(%rsi), %r9
- movq 24(%rsi), %r10
- movq 32(%rsi), %r11
- movq 40(%rsi), %r12
- movq 48(%rsi), %r13
- movq 56(%rsi), %r14
-
- prefetcht0 896 - 64(%rdi)
- prefetcht0 896 - 0(%rdi)
-
- movq %rax, (%rdi)
- movq %rbx, 8(%rdi)
- movq %r9, 16(%rdi)
- movq %r10, 24(%rdi)
- movq %r11, 32(%rdi)
- movq %r12, 40(%rdi)
- movq %r13, 48(%rdi)
- movq %r14, 56(%rdi)
-
- leaq 64(%rsi), %rsi
- leaq 64(%rdi), %rdi
-
- jnz L(preloop)
-
-L(prebail):
- movq SAVE3(%rsp), %rbx
- cfi_restore (%rbx)
- movq SAVE2(%rsp), %r12
- cfi_restore (%r12)
- movq SAVE1(%rsp), %r13
- cfi_restore (%r13)
- movq SAVE0(%rsp), %r14
- cfi_restore (%r14)
-
-/* .p2align 4 */
-
-L(preskip):
- subq %r8, %rdx /* check for more */
- testq $-64, %rdx
- jnz L(preafter)
-
- andl $63, %edx /* check for left overs */
-#ifdef USE_AS_MEMPCPY
- jnz L(1)
-
- movq %rdi, %rax
-#else
- movq RETVAL(%rsp), %rax
- jnz L(1)
-
- rep
-#endif
- retq /* exit */
-
- .p2align 4
-
-L(preafter):
-
-/* Handle huge blocks. */
-
-L(NTtry):
-
-L(NT): /* non-temporal 128-byte */
- movq %rdx, %rcx
- shrq $7, %rcx
- jz L(NTskip)
-
- movq %r14, SAVE0(%rsp)
- cfi_rel_offset (%r14, SAVE0)
- movq %r13, SAVE1(%rsp)
- cfi_rel_offset (%r13, SAVE1)
- movq %r12, SAVE2(%rsp)
- cfi_rel_offset (%r12, SAVE2)
-
- .p2align 4