diff options
| author | H.J. Lu <hjl.tools@gmail.com> | 2016-03-31 10:04:26 -0700 |
|---|---|---|
| committer | H.J. Lu <hjl.tools@gmail.com> | 2016-03-31 10:04:40 -0700 |
| commit | 88b57b8ed41d5ecf2e1bdfc19556f9246a665ebb (patch) | |
| tree | 798e0ca5f87b073921766fb4a41aa8f9095137d8 | |
| parent | 5cdd1989d1d2f135d02e66250f37ba8e767f9772 (diff) | |
| download | glibc-88b57b8ed41d5ecf2e1bdfc19556f9246a665ebb.tar.xz glibc-88b57b8ed41d5ecf2e1bdfc19556f9246a665ebb.zip | |
Add x86-64 memmove with unaligned load/store and rep movsb
Implement x86-64 memmove with unaligned load/store and rep movsb.
Support 16-byte, 32-byte and 64-byte vector register sizes. When
size <= 8 times of vector register size, there is no check for
address overlap between source and destination. Since overhead for
overlap check is small when size > 8 times of vector register size,
memcpy is an alias of memmove.
A single file provides 2 implementations of memmove, one with rep movsb
and the other without rep movsb. They share the same codes when size is
between 2 times of vector register size and REP_MOVSB_THRESHOLD which
is 2KB for 16-byte vector register size and scaled up by large vector
register size.
Key features:
1. Use overlapping load and store to avoid branch.
2. For size <= 8 times of vector register size, load all sources into
registers and store them together.
3. If there is no address overlap between source and destination, copy
from both ends with 4 times of vector register size at a time.
4. If address of destination > address of source, backward copy 8 times
of vector register size at a time.
5. Otherwise, forward copy 8 times of vector register size at a time.
6. Use rep movsb only for forward copy. Avoid slow backward rep movsb
by falling back to backward copy 8 times of vector register size at a
time.
7. Skip when address of destination == address of source.
[BZ #19776]
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
memmove-sse2-unaligned-erms, memmove-avx-unaligned-erms and
memmove-avx512-unaligned-erms.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Test
__memmove_chk_avx512_unaligned_2,
__memmove_chk_avx512_unaligned_erms,
__memmove_chk_avx_unaligned_2, __memmove_chk_avx_unaligned_erms,
__memmove_chk_sse2_unaligned_2,
__memmove_chk_sse2_unaligned_erms, __memmove_avx_unaligned_2,
__memmove_avx_unaligned_erms, __memmove_avx512_unaligned_2,
__memmove_avx512_unaligned_erms, __memmove_erms,
__memmove_sse2_unaligned_2, __memmove_sse2_unaligned_erms,
__memcpy_chk_avx512_unaligned_2,
__memcpy_chk_avx512_unaligned_erms,
__memcpy_chk_avx_unaligned_2, __memcpy_chk_avx_unaligned_erms,
__memcpy_chk_sse2_unaligned_2, __memcpy_chk_sse2_unaligned_erms,
__memcpy_avx_unaligned_2, __memcpy_avx_unaligned_erms,
__memcpy_avx512_unaligned_2, __memcpy_avx512_unaligned_erms,
__memcpy_sse2_unaligned_2, __memcpy_sse2_unaligned_erms,
__memcpy_erms, __mempcpy_chk_avx512_unaligned_2,
__mempcpy_chk_avx512_unaligned_erms,
__mempcpy_chk_avx_unaligned_2, __mempcpy_chk_avx_unaligned_erms,
__mempcpy_chk_sse2_unaligned_2, __mempcpy_chk_sse2_unaligned_erms,
__mempcpy_avx512_unaligned_2, __mempcpy_avx512_unaligned_erms,
__mempcpy_avx_unaligned_2, __mempcpy_avx_unaligned_erms,
__mempcpy_sse2_unaligned_2, __mempcpy_sse2_unaligned_erms and
__mempcpy_erms.
* sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S: New
file.
* sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S:
Likewise.
* sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S:
Likewise.
* sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S:
Likewise.
| -rw-r--r-- | ChangeLog | 40 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/Makefile | 5 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/ifunc-impl-list.c | 99 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S | 9 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S | 11 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S | 9 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S | 462 |
7 files changed, 634 insertions, 1 deletions
@@ -1,3 +1,43 @@ +2016-03-31 H.J. Lu <hongjiu.lu@intel.com> + + [BZ #19776] + * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add + memmove-sse2-unaligned-erms, memmove-avx-unaligned-erms and + memmove-avx512-unaligned-erms. + * sysdeps/x86_64/multiarch/ifunc-impl-list.c + (__libc_ifunc_impl_list): Test + __memmove_chk_avx512_unaligned_2, + __memmove_chk_avx512_unaligned_erms, + __memmove_chk_avx_unaligned_2, __memmove_chk_avx_unaligned_erms, + __memmove_chk_sse2_unaligned_2, + __memmove_chk_sse2_unaligned_erms, __memmove_avx_unaligned_2, + __memmove_avx_unaligned_erms, __memmove_avx512_unaligned_2, + __memmove_avx512_unaligned_erms, __memmove_erms, + __memmove_sse2_unaligned_2, __memmove_sse2_unaligned_erms, + __memcpy_chk_avx512_unaligned_2, + __memcpy_chk_avx512_unaligned_erms, + __memcpy_chk_avx_unaligned_2, __memcpy_chk_avx_unaligned_erms, + __memcpy_chk_sse2_unaligned_2, __memcpy_chk_sse2_unaligned_erms, + __memcpy_avx_unaligned_2, __memcpy_avx_unaligned_erms, + __memcpy_avx512_unaligned_2, __memcpy_avx512_unaligned_erms, + __memcpy_sse2_unaligned_2, __memcpy_sse2_unaligned_erms, + __memcpy_erms, __mempcpy_chk_avx512_unaligned_2, + __mempcpy_chk_avx512_unaligned_erms, + __mempcpy_chk_avx_unaligned_2, __mempcpy_chk_avx_unaligned_erms, + __mempcpy_chk_sse2_unaligned_2, __mempcpy_chk_sse2_unaligned_erms, + __mempcpy_avx512_unaligned_2, __mempcpy_avx512_unaligned_erms, + __mempcpy_avx_unaligned_2, __mempcpy_avx_unaligned_erms, + __mempcpy_sse2_unaligned_2, __mempcpy_sse2_unaligned_erms and + __mempcpy_erms. + * sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S: New + file. + * sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S: + Likwise. + * sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S: + Likwise. + * sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S: + Likwise. + 2016-03-31 Stefan Liebler <stli@linux.vnet.ibm.com> * sysdeps/s390/bits/link.h: (La_s390_vr) New typedef. 
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 7fc89c253f..ef4dbc0c6f 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -20,7 +20,10 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \ strcat-sse2-unaligned strncat-sse2-unaligned \ strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \ strcspn-c strpbrk-c strspn-c varshift memset-avx2 \ - memset-avx512-no-vzeroupper + memset-avx512-no-vzeroupper \ + memmove-sse2-unaligned-erms \ + memmove-avx-unaligned-erms \ + memmove-avx512-unaligned-erms CFLAGS-varshift.c += -msse4 CFLAGS-strcspn-c.c += -msse4 CFLAGS-strpbrk-c.c += -msse4 diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 188b6d36c6..9204da450a 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -52,17 +52,33 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_ARCH_FEATURE (AVX512F_Usable), __memmove_chk_avx512_no_vzeroupper) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memmove_chk_avx512_unaligned_2) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memmove_chk_avx512_unaligned_erms) #endif IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_ARCH_FEATURE (AVX_Usable), __memmove_chk_avx_unaligned) IFUNC_IMPL_ADD (array, i, __memmove_chk, + HAS_ARCH_FEATURE (AVX_Usable), + __memmove_chk_avx_unaligned_2) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + HAS_ARCH_FEATURE (AVX_Usable), + __memmove_chk_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_CPU_FEATURE (SSSE3), __memmove_chk_ssse3_back) IFUNC_IMPL_ADD (array, i, __memmove_chk, HAS_CPU_FEATURE (SSSE3), __memmove_chk_ssse3) IFUNC_IMPL_ADD (array, i, __memmove_chk, 1, + __memmove_chk_sse2_unaligned_2) + IFUNC_IMPL_ADD (array, i, __memmove_chk, 1, + 
__memmove_chk_sse2_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memmove_chk, 1, __memmove_chk_sse2)) /* Support sysdeps/x86_64/multiarch/memmove.S. */ @@ -70,15 +86,32 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memmove, HAS_ARCH_FEATURE (AVX_Usable), __memmove_avx_unaligned) + IFUNC_IMPL_ADD (array, i, memmove, + HAS_ARCH_FEATURE (AVX_Usable), + __memmove_avx_unaligned_2) + IFUNC_IMPL_ADD (array, i, memmove, + HAS_ARCH_FEATURE (AVX_Usable), + __memmove_avx_unaligned_erms) #ifdef HAVE_AVX512_ASM_SUPPORT IFUNC_IMPL_ADD (array, i, memmove, HAS_ARCH_FEATURE (AVX512F_Usable), __memmove_avx512_no_vzeroupper) + IFUNC_IMPL_ADD (array, i, memmove, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memmove_avx512_unaligned_2) + IFUNC_IMPL_ADD (array, i, memmove, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memmove_avx512_unaligned_erms) #endif IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3), __memmove_ssse3_back) IFUNC_IMPL_ADD (array, i, memmove, HAS_CPU_FEATURE (SSSE3), __memmove_ssse3) + IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms) + IFUNC_IMPL_ADD (array, i, memmove, 1, + __memmove_sse2_unaligned_2) + IFUNC_IMPL_ADD (array, i, memmove, 1, + __memmove_sse2_unaligned_erms) IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_sse2)) /* Support sysdeps/x86_64/multiarch/memset_chk.S. 
*/ @@ -267,17 +300,33 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_ARCH_FEATURE (AVX512F_Usable), __memcpy_chk_avx512_no_vzeroupper) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memcpy_chk_avx512_unaligned_2) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memcpy_chk_avx512_unaligned_erms) #endif IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_ARCH_FEATURE (AVX_Usable), __memcpy_chk_avx_unaligned) IFUNC_IMPL_ADD (array, i, __memcpy_chk, + HAS_ARCH_FEATURE (AVX_Usable), + __memcpy_chk_avx_unaligned_2) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + HAS_ARCH_FEATURE (AVX_Usable), + __memcpy_chk_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_CPU_FEATURE (SSSE3), __memcpy_chk_ssse3_back) IFUNC_IMPL_ADD (array, i, __memcpy_chk, HAS_CPU_FEATURE (SSSE3), __memcpy_chk_ssse3) IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1, + __memcpy_chk_sse2_unaligned_2) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1, + __memcpy_chk_sse2_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1, __memcpy_chk_sse2)) /* Support sysdeps/x86_64/multiarch/memcpy.S. 
*/ @@ -285,6 +334,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memcpy, HAS_ARCH_FEATURE (AVX_Usable), __memcpy_avx_unaligned) + IFUNC_IMPL_ADD (array, i, memcpy, + HAS_ARCH_FEATURE (AVX_Usable), + __memcpy_avx_unaligned_2) + IFUNC_IMPL_ADD (array, i, memcpy, + HAS_ARCH_FEATURE (AVX_Usable), + __memcpy_avx_unaligned_erms) IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3), __memcpy_ssse3_back) IFUNC_IMPL_ADD (array, i, memcpy, HAS_CPU_FEATURE (SSSE3), @@ -293,8 +348,19 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memcpy, HAS_ARCH_FEATURE (AVX512F_Usable), __memcpy_avx512_no_vzeroupper) + IFUNC_IMPL_ADD (array, i, memcpy, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memcpy_avx512_unaligned_2) + IFUNC_IMPL_ADD (array, i, memcpy, + HAS_ARCH_FEATURE (AVX512F_Usable), + __memcpy_avx512_unaligned_erms) #endif IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, memcpy, 1, + __memcpy_sse2_unaligned_2) + IFUNC_IMPL_ADD (array, i, memcpy, 1, + __memcpy_sse2_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_erms) IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2)) /* Support sysdeps/x86_64/multiarch/mempcpy_chk.S. 
*/ @@ -303,17 +369,33 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_ARCH_FEATURE (AVX512F_Usable), __mempcpy_chk_avx512_no_vzeroupper) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __mempcpy_chk_avx512_unaligned_2) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + HAS_ARCH_FEATURE (AVX512F_Usable), + __mempcpy_chk_avx512_unaligned_erms) #endif IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_ARCH_FEATURE (AVX_Usable), __mempcpy_chk_avx_unaligned) IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + HAS_ARCH_FEATURE (AVX_Usable), + __mempcpy_chk_avx_unaligned_2) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + HAS_ARCH_FEATURE (AVX_Usable), + __mempcpy_chk_avx_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_CPU_FEATURE (SSSE3), __mempcpy_chk_ssse3_back) IFUNC_IMPL_ADD (array, i, __mempcpy_chk, HAS_CPU_FEATURE (SSSE3), __mempcpy_chk_ssse3) IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1, + __mempcpy_chk_sse2_unaligned_2) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1, + __mempcpy_chk_sse2_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1, __mempcpy_chk_sse2)) /* Support sysdeps/x86_64/multiarch/mempcpy.S. 
*/ @@ -322,14 +404,31 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, mempcpy, HAS_ARCH_FEATURE (AVX512F_Usable), __mempcpy_avx512_no_vzeroupper) + IFUNC_IMPL_ADD (array, i, mempcpy, + HAS_ARCH_FEATURE (AVX512F_Usable), + __mempcpy_avx512_unaligned_2) + IFUNC_IMPL_ADD (array, i, mempcpy, + HAS_ARCH_FEATURE (AVX512F_Usable), + __mempcpy_avx512_unaligned_erms) #endif IFUNC_IMPL_ADD (array, i, mempcpy, HAS_ARCH_FEATURE (AVX_Usable), __mempcpy_avx_unaligned) + IFUNC_IMPL_ADD (array, i, mempcpy, + HAS_ARCH_FEATURE (AVX_Usable), + __mempcpy_avx_unaligned_2) + IFUNC_IMPL_ADD (array, i, mempcpy, + HAS_ARCH_FEATURE (AVX_Usable), + __mempcpy_avx_unaligned_erms) IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3), __mempcpy_ssse3_back) IFUNC_IMPL_ADD (array, i, mempcpy, HAS_CPU_FEATURE (SSSE3), __mempcpy_ssse3) + IFUNC_IMPL_ADD (array, i, mempcpy, 1, + __mempcpy_sse2_unaligned_2) + IFUNC_IMPL_ADD (array, i, mempcpy, 1, + __mempcpy_sse2_unaligned_erms) + IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_erms) IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2)) /* Support sysdeps/x86_64/multiarch/strncmp.S. 
*/ diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S new file mode 100644 index 0000000000..3a72c7eafd --- /dev/null +++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S @@ -0,0 +1,9 @@ +#define VEC_SIZE 32 +#define VEC(i) ymm##i +#define VMOVU vmovdqu +#define VMOVA vmovdqa + +#define SECTION(p) p##.avx +#define MEMMOVE_SYMBOL(p,s) p##_avx_##s + +#include "memmove-vec-unaligned-erms.S" diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S new file mode 100644 index 0000000000..38358fa37c --- /dev/null +++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S @@ -0,0 +1,11 @@ +#ifdef HAVE_AVX512_ASM_SUPPORT +# define VEC_SIZE 64 +# define VEC(i) zmm##i +# define VMOVU vmovdqu64 +# define VMOVA vmovdqa64 + +# define SECTION(p) p##.avx512 +# define MEMMOVE_SYMBOL(p,s) p##_avx512_##s + +# include "memmove-vec-unaligned-erms.S" +#endif diff --git a/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S new file mode 100644 index 0000000000..52b9ae08fc --- /dev/null +++ b/sysdeps/x86_64/multiarch/memmove-sse2-unaligned-erms.S @@ -0,0 +1,9 @@ +#define VEC_SIZE 16 +#define VEC(i) xmm##i +#define VMOVU movdqu +#define VMOVA movdqa + +#define SECTION(p) p +#define MEMMOVE_SYMBOL(p,s) p##_sse2_##s + +#include "memmove-vec-unaligned-erms.S" diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S new file mode 100644 index 0000000000..cf645dd7ff --- /dev/null +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S @@ -0,0 +1,462 @@ +/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb + Copyright (C) 2016 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +/* memmove/memcpy/mempcpy is implemented as: + 1. Use overlapping load and store to avoid branch. + 2. Use 8-bit or 32-bit displacements for branches and nop paddings + to avoid long nop between instructions. + 3. Load all sources into registers and store them together to avoid + possible address overflap between source and destination. + 4. If size is 2 * VEC_SIZE or less, load all sources into registers + and store them together. + 5. If there is no address overflap, copy from both ends with + 4 * VEC_SIZE at a time. + 6. If size is 8 * VEC_SIZE or less, load all sources into registers + and store them together. + 7. If address of destination > address of source, backward copy + 8 * VEC_SIZE at a time. + 8. Otherwise, forward copy 8 * VEC_SIZE at a time. */ + +#if IS_IN (libc) + +# include <sysdep.h> +# include "asm-syntax.h" + +# ifndef VZEROUPPER +# if VEC_SIZE > 16 +# define VZEROUPPER vzeroupper +# else +# define VZEROUPPER +# endif +# endif + +/* Threshold to use Enhanced REP MOVSB. Since there is overhead to set + up REP MOVSB operation, REP MOVSB isn't faster on short data. The + memcpy micro benchmark in glibc shows that 2KB is the approximate + value above which REP MOVSB becomes faster than SSE2 optimization + on processors with Enhanced REP MOVSB. 
Since larger register size + can move more data with a single load and store, the threshold is + higher with larger register size. */ +# ifndef REP_MOVSB_THRESHOLD +# define REP_MOVSB_THRESHOLD (2048 * (VEC_SIZE / 16)) +# endif + +# ifndef SECTION +# error SECTION is not defined! +# endif + .section SECTION(.text),"ax",@progbits + +# ifdef SHARED +ENTRY (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_2)) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_2)) + +ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_2)) + movq %rdi, %rax + addq %rdx, %rax + jmp L(start) +END (MEMMOVE_SYMBOL (__mempcpy, unaligned_2)) + +ENTRY (MEMMOVE_SYMBOL (__memmove_chk, unaligned_2)) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMMOVE_SYMBOL (__memmove_chk, unaligned_2)) +# endif + +ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_2)) + movq %rdi, %rax +L(start): + cmpq $VEC_SIZE, %rdx + jb L(less_vec) + cmpq $(VEC_SIZE * 2), %rdx + ja L(more_2x_vec) + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU (%rsi), %VEC(0) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) + VZEROUPPER + ret +END (MEMMOVE_SYMBOL (__memmove, unaligned_2)) + +# ifdef SHARED +ENTRY (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_erms)) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMMOVE_SYMBOL (__mempcpy_chk, unaligned_erms)) +# endif + +ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) + movq %rdi, %rax + addq %rdx, %rax + jmp L(start_erms) +END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) + +# ifdef SHARED +ENTRY (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms)) + cmpq %rdx, %rcx + jb HIDDEN_JUMPTARGET (__chk_fail) +END (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms)) +# endif + +# if VEC_SIZE == 16 +/* Only used to measure performance of REP MOVSB. 
*/ +# ifdef SHARED +ENTRY (__mempcpy_erms) + movq %rdi, %rax + addq %rdx, %rax + jmp L(movsb) +END (__mempcpy_erms) +# endif + +ENTRY (__memmove_erms) + movq %rdi, %rax + movq %rdx, %rcx + cmpq %rsi, %rdi + jbe 1f + leaq (%rsi,%rcx), %rdx + cmpq %rdx, %rdi + jb L(movsb_backward) +1: + rep movsb + ret +L(movsb_backward): + leaq -1(%rdi,%rcx), %rdi + leaq -1(%rsi,%rcx), %rsi + std + rep movsb + cld + ret +END (__memmove_erms) +strong_alias (__memmove_erms, __memcpy_erms) +# endif + +ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) + movq %rdi, %rax +L(start_erms): + cmpq $VEC_SIZE, %rdx + jb L(less_vec) + cmpq $(VEC_SIZE * 2), %rdx + ja L(movsb_more_2x_vec) +L(last_2x_vec): + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU (%rsi), %VEC(0) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) +L(return): + VZEROUPPER + ret + +L(movsb): + cmpq %rsi, %rdi + je L(nop) + jb 1f + leaq (%rsi,%rdx), %r9 + cmpq %r9, %rdi + /* Avoid slow backward REP MOVSB. */ +# if REP_MOVSB_THRESHOLD <= (VEC_SIZE * 8) +# error Unsupported REP_MOVSB_THRESHOLD and VEC_SIZE! +# endif + jb L(more_8x_vec_backward) +1: + movq %rdx, %rcx + rep movsb +L(nop): + ret + + .p2align 4 +L(movsb_more_2x_vec): + cmpq $REP_MOVSB_THRESHOLD, %rdx + /* Force 32-bit displacement to avoid long nop between + instructions. */ + ja.d32 L(movsb) + .p2align 4 +L(more_2x_vec): + /* More than 2 * VEC. 
*/ + cmpq %rsi, %rdi + je L(nop) + jb L(copy_forward) + leaq (%rsi,%rdx), %rcx + cmpq %rcx, %rdi + jb L(more_2x_vec_overlap) +L(copy_forward): + leaq (%rdi,%rdx), %rcx + cmpq %rcx, %rsi + jb L(more_2x_vec_overlap) + VMOVU (%rsi), %VEC(0) + VMOVU VEC_SIZE(%rsi), %VEC(1) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2) + VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3) + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(1), VEC_SIZE(%rdi) + VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx) + cmpq $(VEC_SIZE * 4), %rdx + /* Force 32-bit displacement to avoid long nop between + instructions. */ + jbe.d32 L(return) + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(0) + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(1) + VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(2) + VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(3) + VMOVU %VEC(0), (VEC_SIZE * 2)(%rdi) + VMOVU %VEC(1), (VEC_SIZE * 3)(%rdi) + VMOVU %VEC(2), -(VEC_SIZE * 3)(%rdi,%rdx) + VMOVU %VEC(3), -(VEC_SIZE * 4)(%rdi,%rdx) + cmpq $(VEC_SIZE * 8), %rdx +# if VEC_SIZE == 16 + jbe L(return) +# else + /* Use 8-bit displacement to avoid long nop between + instructions. */ + jbe L(return_disp8) +# endif + leaq (VEC_SIZE * 4)(%rdi), %rcx + addq %rdi, %rdx + andq $-(VEC_SIZE * 4), %rdx + andq $-(VEC_SIZE * 4), %rcx + movq %rcx, %r11 + subq %rdi, %r11 + addq %r11, %rsi + cmpq %rdx, %rcx + /* Use 8-bit displacement to avoid long nop between + instructions. */ + je L(return_disp8) + movq %rsi, %r10 + subq %rcx, %r10 + leaq VEC_SIZE(%r10), %r9 + leaq (VEC_SIZE * 2)(%r10), %r8 |
