aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNoah Goldstein <goldstein.w.n@gmail.com>2022-11-08 17:38:38 -0800
committerNoah Goldstein <goldstein.w.n@gmail.com>2022-11-08 19:22:33 -0800
commitf049f52dfeed8129c11ab1641a815705d09ff7e8 (patch)
treea6c13dc462411b308467b26a3a0f1062e0597bbd
parentd44e116428fefa0c2d01151af11f7a41fb525536 (diff)
downloadglibc-f049f52dfeed8129c11ab1641a815705d09ff7e8.tar.xz
glibc-f049f52dfeed8129c11ab1641a815705d09ff7e8.zip
x86: Optimize and shrink st{r|p}{n}{cat|cpy}-evex functions
Optimizations are: 1. Use more overlapping stores to avoid branches. 2. Reduce how unrolled the aligning copies are (this is more of a code-size save, it's a negative for some sizes in terms of perf). 3. Improve the loop a bit (similar to what we do in strlen with 2x vpminu + kortest instead of 3x vpminu + kmov + test). 4. For st{r|p}n{cat|cpy} re-order the branches to minimize the number that are taken. Performance Changes: Times are from N = 10 runs of the benchmark suite and are reported as geometric mean of all ratios of New Implementation / Old Implementation. stpcpy-evex -> 0.922 strcat-evex -> 0.985 strcpy-evex -> 0.880 strncpy-evex -> 0.831 stpncpy-evex -> 0.780 strncat-evex -> 0.958 Code Size Changes: function -> Bytes New / Bytes Old -> Ratio strcat-evex -> 819 / 1874 -> 0.437 strcpy-evex -> 700 / 1074 -> 0.652 stpcpy-evex -> 735 / 1094 -> 0.672 strncpy-evex -> 1397 / 2611 -> 0.535 stpncpy-evex -> 1489 / 2691 -> 0.553 strncat-evex -> 1184 / 2832 -> 0.418 Notes: 1. Because of the significant difference between the implementations they are split into three files. strcpy-evex.S -> strcpy, stpcpy, strcat strncpy-evex.S -> strncpy strncat-evex.S -> strncat I couldn't find a way to merge them without making the ifdefs incredibly difficult to follow. 2. All implementations can be made evex512 by including "x86-evex512-vecs.h" at the top. 3. All implementations have an optional define: `USE_EVEX_MASKED_STORE` Setting to one uses evex-masked stores for handling short strings. This saves code size and branches. It's disabled for all implementations at the moment as there are some serious drawbacks to masked stores in certain cases, but that may be fixed on future architectures. Full check passes on x86-64 and build succeeds for all ISA levels w/ and w/o multiarch.
-rw-r--r--sysdeps/x86_64/multiarch/stpncpy-evex.S5
-rw-r--r--sysdeps/x86_64/multiarch/strcat-evex.S291
-rw-r--r--sysdeps/x86_64/multiarch/strcat-strlen-evex.h.S110
-rw-r--r--sysdeps/x86_64/multiarch/strcpy-evex.S1282
-rw-r--r--sysdeps/x86_64/multiarch/strncat-evex.S525
-rw-r--r--sysdeps/x86_64/multiarch/strncpy-evex.S995
-rw-r--r--sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h80
7 files changed, 2115 insertions, 1173 deletions
diff --git a/sysdeps/x86_64/multiarch/stpncpy-evex.S b/sysdeps/x86_64/multiarch/stpncpy-evex.S
index 99ea76a372..3693491baa 100644
--- a/sysdeps/x86_64/multiarch/stpncpy-evex.S
+++ b/sysdeps/x86_64/multiarch/stpncpy-evex.S
@@ -3,6 +3,5 @@
#endif
#define USE_AS_STPCPY
-#define USE_AS_STRNCPY
-#define STRCPY STPNCPY
-#include "strcpy-evex.S"
+#define STRNCPY STPNCPY
+#include "strncpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strcat-evex.S b/sysdeps/x86_64/multiarch/strcat-evex.S
index 0e2df947e9..b4207b7889 100644
--- a/sysdeps/x86_64/multiarch/strcat-evex.S
+++ b/sysdeps/x86_64/multiarch/strcat-evex.S
@@ -1,286 +1,7 @@
-/* strcat with 256-bit EVEX instructions.
- Copyright (C) 2021-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#include <isa-level.h>
-
-#if ISA_SHOULD_BUILD (4)
-
-
-# include <sysdep.h>
-
-# ifndef STRCAT
-# define STRCAT __strcat_evex
-# endif
-
-# define VMOVU vmovdqu64
-# define VMOVA vmovdqa64
-
-/* zero register */
-# define XMMZERO xmm16
-# define YMMZERO ymm16
-# define YMM0 ymm17
-# define YMM1 ymm18
-
-# define USE_AS_STRCAT
-
-/* Number of bytes in a vector register */
-# define VEC_SIZE 32
-
- .section .text.evex,"ax",@progbits
-ENTRY (STRCAT)
- mov %rdi, %r9
-# ifdef USE_AS_STRNCAT
- mov %rdx, %r8
-# endif
-
- xor %eax, %eax
- mov %edi, %ecx
- and $((VEC_SIZE * 4) - 1), %ecx
- vpxorq %XMMZERO, %XMMZERO, %XMMZERO
- cmp $(VEC_SIZE * 3), %ecx
- ja L(fourth_vector_boundary)
- vpcmpb $0, (%rdi), %YMMZERO, %k0
- kmovd %k0, %edx
- test %edx, %edx
- jnz L(exit_null_on_first_vector)
- mov %rdi, %rax
- and $-VEC_SIZE, %rax
- jmp L(align_vec_size_start)
-L(fourth_vector_boundary):
- mov %rdi, %rax
- and $-VEC_SIZE, %rax
- vpcmpb $0, (%rax), %YMMZERO, %k0
- mov $-1, %r10d
- sub %rax, %rcx
- shl %cl, %r10d
- kmovd %k0, %edx
- and %r10d, %edx
- jnz L(exit)
-
-L(align_vec_size_start):
- vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0
- kmovd %k0, %edx
- test %edx, %edx
- jnz L(exit_null_on_second_vector)
-
- vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
- kmovd %k1, %edx
- test %edx, %edx
- jnz L(exit_null_on_third_vector)
-
- vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
- kmovd %k2, %edx
- test %edx, %edx
- jnz L(exit_null_on_fourth_vector)
-
- vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
- kmovd %k3, %edx
- test %edx, %edx
- jnz L(exit_null_on_fifth_vector)
-
- vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
- add $(VEC_SIZE * 4), %rax
- kmovd %k4, %edx
- test %edx, %edx
- jnz L(exit_null_on_second_vector)
-
- vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
- kmovd %k1, %edx
- test %edx, %edx
- jnz L(exit_null_on_third_vector)
-
- vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
- kmovd %k2, %edx
- test %edx, %edx
- jnz L(exit_null_on_fourth_vector)
-
- vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
- kmovd %k3, %edx
- test %edx, %edx
- jnz L(exit_null_on_fifth_vector)
-
- vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
- kmovd %k4, %edx
- add $(VEC_SIZE * 4), %rax
- test %edx, %edx
- jnz L(exit_null_on_second_vector)
-
- vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
- kmovd %k1, %edx
- test %edx, %edx
- jnz L(exit_null_on_third_vector)
-
- vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
- kmovd %k2, %edx
- test %edx, %edx
- jnz L(exit_null_on_fourth_vector)
-
- vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
- kmovd %k3, %edx
- test %edx, %edx
- jnz L(exit_null_on_fifth_vector)
-
- vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
- add $(VEC_SIZE * 4), %rax
- kmovd %k4, %edx
- test %edx, %edx
- jnz L(exit_null_on_second_vector)
-
- vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
- kmovd %k1, %edx
- test %edx, %edx
- jnz L(exit_null_on_third_vector)
-
- vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
- kmovd %k2, %edx
- test %edx, %edx
- jnz L(exit_null_on_fourth_vector)
-
- vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
- kmovd %k3, %edx
- test %edx, %edx
- jnz L(exit_null_on_fifth_vector)
-
- test $((VEC_SIZE * 4) - 1), %rax
- jz L(align_four_vec_loop)
-
- vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
- add $(VEC_SIZE * 5), %rax
- kmovd %k4, %edx
- test %edx, %edx
- jnz L(exit)
-
- test $((VEC_SIZE * 4) - 1), %rax
- jz L(align_four_vec_loop)
-
- vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0
- add $VEC_SIZE, %rax
- kmovd %k0, %edx
- test %edx, %edx
- jnz L(exit)
-
- test $((VEC_SIZE * 4) - 1), %rax
- jz L(align_four_vec_loop)
-
- vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0
- add $VEC_SIZE, %rax
- kmovd %k0, %edx
- test %edx, %edx
- jnz L(exit)
-
- test $((VEC_SIZE * 4) - 1), %rax
- jz L(align_four_vec_loop)
-
- vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k1
- add $VEC_SIZE, %rax
- kmovd %k1, %edx
- test %edx, %edx
- jnz L(exit)
-
- add $VEC_SIZE, %rax
-
- .p2align 4
-L(align_four_vec_loop):
- VMOVA (%rax), %YMM0
- VMOVA (VEC_SIZE * 2)(%rax), %YMM1
- vpminub VEC_SIZE(%rax), %YMM0, %YMM0
- vpminub (VEC_SIZE * 3)(%rax), %YMM1, %YMM1
- vpminub %YMM0, %YMM1, %YMM0
- /* If K0 != 0, there is a null byte. */
- vpcmpb $0, %YMM0, %YMMZERO, %k0
- add $(VEC_SIZE * 4), %rax
- ktestd %k0, %k0
- jz L(align_four_vec_loop)
-
- vpcmpb $0, -(VEC_SIZE * 4)(%rax), %YMMZERO, %k0
- sub $(VEC_SIZE * 5), %rax
- kmovd %k0, %edx
- test %edx, %edx
- jnz L(exit_null_on_second_vector)
-
- vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
- kmovd %k1, %edx
- test %edx, %edx
- jnz L(exit_null_on_third_vector)
-
- vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
- kmovd %k2, %edx
- test %edx, %edx
- jnz L(exit_null_on_fourth_vector)
-
- vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
- kmovd %k3, %edx
- sub %rdi, %rax
- bsf %rdx, %rdx
- add %rdx, %rax
- add $(VEC_SIZE * 4), %rax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit):
- sub %rdi, %rax
-L(exit_null_on_first_vector):
- bsf %rdx, %rdx
- add %rdx, %rax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_null_on_second_vector):
- sub %rdi, %rax
- bsf %rdx, %rdx
- add %rdx, %rax
- add $VEC_SIZE, %rax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_null_on_third_vector):
- sub %rdi, %rax
- bsf %rdx, %rdx
- add %rdx, %rax
- add $(VEC_SIZE * 2), %rax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_null_on_fourth_vector):
- sub %rdi, %rax
- bsf %rdx, %rdx
- add %rdx, %rax
- add $(VEC_SIZE * 3), %rax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_null_on_fifth_vector):
- sub %rdi, %rax
- bsf %rdx, %rdx
- add %rdx, %rax
- add $(VEC_SIZE * 4), %rax
-
- .p2align 4
-L(StartStrcpyPart):
- lea (%r9, %rax), %rdi
- mov %rsi, %rcx
- mov %r9, %rax /* save result */
-
-# ifdef USE_AS_STRNCAT
- test %r8, %r8
- jz L(ExitZero)
-# define USE_AS_STRNCPY
-# endif
-
-# include "strcpy-evex.S"
+#ifndef STRCAT
+# define STRCAT __strcat_evex
#endif
+
+#define USE_AS_STRCAT
+#define STRCPY STRCAT
+#include "strcpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strcat-strlen-evex.h.S b/sysdeps/x86_64/multiarch/strcat-strlen-evex.h.S
new file mode 100644
index 0000000000..9530d7b683
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcat-strlen-evex.h.S
@@ -0,0 +1,110 @@
+/* strlen used for begining of str{n}cat using EVEX 256/512.
+ Copyright (C) 2011-2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+
+/* NOTE: This file is meant to be included by strcat-evex or
+ strncat-evex and does not standalone. Before including %rdi
+ must be saved in %rax. */
+
+
+/* Simple strlen implementation that ends at
+ L(strcat_strlen_done). */
+ vpxorq %VZERO_128, %VZERO_128, %VZERO_128
+ movq %rdi, %r8
+ andq $(VEC_SIZE * -1), %r8
+ VPCMPEQ (%r8), %VZERO, %k0
+ KMOV %k0, %VRCX
+#ifdef USE_AS_WCSCPY
+ subl %r8d, %edi
+ shrl $2, %edi
+#endif
+ shrx %VRDI, %VRCX, %VRCX
+#ifdef USE_AS_WCSCPY
+ movq %rax, %rdi
+#endif
+ test %VRCX, %VRCX
+ jnz L(bsf_and_done_v0)
+
+
+ VPCMPEQ VEC_SIZE(%r8), %VZERO, %k0
+ KMOV %k0, %VRCX
+ leaq (VEC_SIZE)(%r8), %rdi
+ test %VRCX, %VRCX
+ jnz L(bsf_and_done_v0)
+
+ VPCMPEQ (VEC_SIZE * 2)(%r8), %VZERO, %k0
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ jnz L(bsf_and_done_v1)
+
+ VPCMPEQ (VEC_SIZE * 3)(%r8), %VZERO, %k0
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ jnz L(bsf_and_done_v2)
+
+ VPCMPEQ (VEC_SIZE * 4)(%r8), %VZERO, %k0
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ jnz L(bsf_and_done_v3)
+
+ andq $-(VEC_SIZE * 4), %rdi
+ .p2align 4,, 8
+L(loop_2x_vec):
+ VMOVA (VEC_SIZE * 4)(%rdi), %VMM(0)
+ VPMIN (VEC_SIZE * 5)(%rdi), %VMM(0), %VMM(1)
+ VMOVA (VEC_SIZE * 6)(%rdi), %VMM(2)
+ VPMIN (VEC_SIZE * 7)(%rdi), %VMM(2), %VMM(3)
+ VPTESTN %VMM(1), %VMM(1), %k1
+ VPTESTN %VMM(3), %VMM(3), %k3
+ subq $(VEC_SIZE * -4), %rdi
+ KORTEST %k1, %k3
+ jz L(loop_2x_vec)
+
+ VPTESTN %VMM(0), %VMM(0), %k0
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ jnz L(bsf_and_done_v0)
+
+ KMOV %k1, %VRCX
+ test %VRCX, %VRCX
+ jnz L(bsf_and_done_v1)
+
+ VPTESTN %VMM(2), %VMM(2), %k0
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ jnz L(bsf_and_done_v2)
+
+ KMOV %k3, %VRCX
+L(bsf_and_done_v3):
+ addq $VEC_SIZE, %rdi
+L(bsf_and_done_v2):
+ bsf %VRCX, %VRCX
+ leaq (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rdi
+ jmp L(strcat_strlen_done)
+
+ .p2align 4,, 4
+L(bsf_and_done_v1):
+ addq $VEC_SIZE, %rdi
+L(bsf_and_done_v0):
+ bsf %VRCX, %VRCX
+#ifdef USE_AS_WCSCPY
+ leaq (%rdi, %rcx, CHAR_SIZE), %rdi
+#else
+ addq %rcx, %rdi
+#endif
+L(strcat_strlen_done):
diff --git a/sysdeps/x86_64/multiarch/strcpy-evex.S b/sysdeps/x86_64/multiarch/strcpy-evex.S
index 82e45ac675..932129ab40 100644
--- a/sysdeps/x86_64/multiarch/strcpy-evex.S
+++ b/sysdeps/x86_64/multiarch/strcpy-evex.S
@@ -1,4 +1,4 @@
-/* strcpy with 256-bit EVEX instructions.
+/* {wcs|wcp|str|stp}cpy with 256/512-bit EVEX instructions.
Copyright (C) 2021-2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -17,990 +17,526 @@
<https://www.gnu.org/licenses/>. */
#include <isa-level.h>
-
#if ISA_SHOULD_BUILD (4)
-# ifndef USE_AS_STRCAT
-# include <sysdep.h>
+ /* Use evex-masked stores for small sizes. Turned off at the
+ moment. */
+# define USE_EVEX_MASKED_STORE 0
+ /* Use movsb in page cross case to save code size. */
+# define USE_MOVSB_IN_PAGE_CROSS 1
-# ifndef STRCPY
-# define STRCPY __strcpy_evex
-# endif
+# include <sysdep.h>
+# ifndef VEC_SIZE
+# include "x86-evex256-vecs.h"
# endif
-# define VMOVU vmovdqu64
-# define VMOVA vmovdqa64
-
-/* Number of bytes in a vector register */
-# ifndef VEC_SIZE
-# define VEC_SIZE 32
+# ifndef STRCPY
+# define STRCPY __strcpy_evex
# endif
-# define XMM2 xmm18
-# define XMM3 xmm19
-# define YMM2 ymm18
-# define YMM3 ymm19
-# define YMM4 ymm20
-# define YMM5 ymm21
-# define YMM6 ymm22
-# define YMM7 ymm23
+# ifdef USE_AS_WCSCPY
+# define VMOVU_MASK vmovdqu32
+# define VPMIN vpminud
+# define VPTESTN vptestnmd
+# define VPTEST vptestmd
+# define VPCMPEQ vpcmpeqd
+# define CHAR_SIZE 4
-# ifndef USE_AS_STRCAT
+# define REP_MOVS rep movsd
-/* zero register */
-# define XMMZERO xmm16
-# define YMMZERO ymm16
-# define YMM1 ymm17
-
- .section .text.evex,"ax",@progbits
-ENTRY (STRCPY)
-# ifdef USE_AS_STRNCPY
- mov %RDX_LP, %R8_LP
- test %R8_LP, %R8_LP
- jz L(ExitZero)
-# endif
- mov %rsi, %rcx
-# ifndef USE_AS_STPCPY
- mov %rdi, %rax /* save result */
-# endif
+# define USE_WIDE_CHAR
+# else
+# define VMOVU_MASK vmovdqu8
+# define VPMIN vpminub
+# define VPTESTN vptestnmb
+# define VPTEST vptestmb
+# define VPCMPEQ vpcmpeqb
+# define CHAR_SIZE 1
- vpxorq %XMMZERO, %XMMZERO, %XMMZERO
+# define REP_MOVS rep movsb
# endif
- and $((VEC_SIZE * 4) - 1), %ecx
- cmp $(VEC_SIZE * 2), %ecx
- jbe L(SourceStringAlignmentLessTwoVecSize)
-
- and $-VEC_SIZE, %rsi
- and $(VEC_SIZE - 1), %ecx
-
- vpcmpb $0, (%rsi), %YMMZERO, %k0
- kmovd %k0, %edx
- shr %cl, %rdx
+# include "reg-macros.h"
-# ifdef USE_AS_STRNCPY
-# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
- mov $VEC_SIZE, %r10
- sub %rcx, %r10
- cmp %r10, %r8
-# else
- mov $(VEC_SIZE + 1), %r10
- sub %rcx, %r10
- cmp %r10, %r8
-# endif
- jbe L(CopyVecSizeTailCase2OrCase3)
-# endif
- test %edx, %edx
- jnz L(CopyVecSizeTail)
-
- vpcmpb $0, VEC_SIZE(%rsi), %YMMZERO, %k1
- kmovd %k1, %edx
-# ifdef USE_AS_STRNCPY
- add $VEC_SIZE, %r10
- cmp %r10, %r8
- jbe L(CopyTwoVecSizeCase2OrCase3)
-# endif
- test %edx, %edx
- jnz L(CopyTwoVecSize)
-
- VMOVU (%rsi, %rcx), %YMM2 /* copy VEC_SIZE bytes */
- VMOVU %YMM2, (%rdi)
-
-/* If source address alignment != destination address alignment */
- .p2align 4
-L(UnalignVecSizeBoth):
- sub %rcx, %rdi
-# ifdef USE_AS_STRNCPY
- add %rcx, %r8
- sbb %rcx, %rcx
- or %rcx, %r8
-# endif
- mov $VEC_SIZE, %rcx
- VMOVA (%rsi, %rcx), %YMM2
- VMOVU %YMM2, (%rdi, %rcx)
- VMOVA VEC_SIZE(%rsi, %rcx), %YMM2
- vpcmpb $0, %YMM2, %YMMZERO, %k0
- kmovd %k0, %edx
- add $VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
- sub $(VEC_SIZE * 3), %r8
- jbe L(CopyVecSizeCase2OrCase3)
-# endif
- test %edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyVecSizeUnalignedVec2)
+# ifdef USE_AS_STPCPY
+# define END_REG rax
# else
- jnz L(CopyVecSize)
+# define END_REG rdi, %rdx, CHAR_SIZE
# endif
- VMOVU %YMM2, (%rdi, %rcx)
- VMOVA VEC_SIZE(%rsi, %rcx), %YMM3
- vpcmpb $0, %YMM3, %YMMZERO, %k0
- kmovd %k0, %edx
- add $VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
- sub $VEC_SIZE, %r8
- jbe L(CopyVecSizeCase2OrCase3)
-# endif
- test %edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyVecSizeUnalignedVec3)
+# ifdef USE_AS_STRCAT
+# define PAGE_ALIGN_REG edx
+# define PAGE_ALIGN_REG_64 rdx
# else
- jnz L(CopyVecSize)
+# define PAGE_ALIGN_REG eax
+# define PAGE_ALIGN_REG_64 rax
# endif
- VMOVU %YMM3, (%rdi, %rcx)
- VMOVA VEC_SIZE(%rsi, %rcx), %YMM4
- vpcmpb $0, %YMM4, %YMMZERO, %k0
- kmovd %k0, %edx
- add $VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
- sub $VEC_SIZE, %r8
- jbe L(CopyVecSizeCase2OrCase3)
-# endif
- test %edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyVecSizeUnalignedVec4)
-# else
- jnz L(CopyVecSize)
-# endif
+# define VZERO VMM(7)
+# define VZERO_128 VMM_128(7)
- VMOVU %YMM4, (%rdi, %rcx)
- VMOVA VEC_SIZE(%rsi, %rcx), %YMM2
- vpcmpb $0, %YMM2, %YMMZERO, %k0
- kmovd %k0, %edx
- add $VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
- sub $VEC_SIZE, %r8
- jbe L(CopyVecSizeCase2OrCase3)
-# endif
- test %edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyVecSizeUnalignedVec2)
-# else
- jnz L(CopyVecSize)
-# endif
- VMOVU %YMM2, (%rdi, %rcx)
- VMOVA VEC_SIZE(%rsi, %rcx), %YMM2
- vpcmpb $0, %YMM2, %YMMZERO, %k0
- kmovd %k0, %edx
- add $VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
- sub $VEC_SIZE, %r8
- jbe L(CopyVecSizeCase2OrCase3)
-# endif
- test %edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyVecSizeUnalignedVec2)
-# else
- jnz L(CopyVecSize)
-# endif
+# define PAGE_SIZE 4096
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
- VMOVA VEC_SIZE(%rsi, %rcx), %YMM3
- VMOVU %YMM2, (%rdi, %rcx)
- vpcmpb $0, %YMM3, %YMMZERO, %k0
- kmovd %k0, %edx
- add $VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
- sub $VEC_SIZE, %r8
- jbe L(CopyVecSizeCase2OrCase3)
-# endif
- test %edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyVecSizeUnalignedVec3)
-# else
- jnz L(CopyVecSize)
-# endif
- VMOVU %YMM3, (%rdi, %rcx)
- mov %rsi, %rdx
- lea VEC_SIZE(%rsi, %rcx), %rsi
- and $-(VEC_SIZE * 4), %rsi
- sub %rsi, %rdx
- sub %rdx, %rdi
-# ifdef USE_AS_STRNCPY
- lea (VEC_SIZE * 8)(%r8, %rdx), %r8
-# endif
-L(UnalignedFourVecSizeLoop):
- VMOVA (%rsi), %YMM4
- VMOVA VEC_SIZE(%rsi), %YMM5
- VMOVA (VEC_SIZE * 2)(%rsi), %YMM6
- VMOVA (VEC_SIZE * 3)(%rsi), %YMM7
- vpminub %YMM5, %YMM4, %YMM2
- vpminub %YMM7, %YMM6, %YMM3
- vpminub %YMM2, %YMM3, %YMM2
- /* If K7 != 0, there is a null byte. */
- vpcmpb $0, %YMM2, %YMMZERO, %k7
- kmovd %k7, %edx
-# ifdef USE_AS_STRNCPY
- sub $(VEC_SIZE * 4), %r8
- jbe L(UnalignedLeaveCase2OrCase3)
+ .section SECTION(.text), "ax", @progbits
+ENTRY(STRCPY)
+# ifdef USE_AS_STRCAT
+ movq %rdi, %rax
+# include "strcat-strlen-evex.h.S"
# endif
- test %edx, %edx
- jnz L(UnalignedFourVecSizeLeave)
-
-L(UnalignedFourVecSizeLoop_start):
- add $(VEC_SIZE * 4), %rdi
- add $(VEC_SIZE * 4), %rsi
- VMOVU %YMM4, -(VEC_SIZE * 4)(%rdi)
- VMOVA (%rsi), %YMM4
- VMOVU %YMM5, -(VEC_SIZE * 3)(%rdi)
- VMOVA VEC_SIZE(%rsi), %YMM5
- vpminub %YMM5, %YMM4, %YMM2
- VMOVU %YMM6, -(VEC_SIZE * 2)(%rdi)
- VMOVA (VEC_SIZE * 2)(%rsi), %YMM6
- VMOVU %YMM7, -VEC_SIZE(%rdi)
- VMOVA (VEC_SIZE * 3)(%rsi), %YMM7
- vpminub %YMM7, %YMM6, %YMM3
- vpminub %YMM2, %YMM3, %YMM2
- /* If K7 != 0, there is a null byte. */
- vpcmpb $0, %YMM2, %YMMZERO, %k7
- kmovd %k7, %edx
-# ifdef USE_AS_STRNCPY
- sub $(VEC_SIZE * 4), %r8