aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNoah Goldstein <goldstein.w.n@gmail.com>2022-11-08 17:38:39 -0800
committerNoah Goldstein <goldstein.w.n@gmail.com>2022-11-08 19:22:33 -0800
commit642933158e7cf072d873231b1a9bb03291f2b989 (patch)
tree352c3956cef706e683d0ac26ef85d165d1adcceb
parentf049f52dfeed8129c11ab1641a815705d09ff7e8 (diff)
downloadglibc-642933158e7cf072d873231b1a9bb03291f2b989.tar.xz
glibc-642933158e7cf072d873231b1a9bb03291f2b989.zip
x86: Optimize and shrink st{r|p}{n}{cat|cpy}-avx2 functions
Optimizations are: 1. Use more overlapping stores to avoid branches. 2. Reduce how unrolled the aligning copies are (this is more of a code-size save, it's a negative for some sizes in terms of perf). 3. For st{r|p}n{cat|cpy} re-order the branches to minimize the number that are taken. Performance Changes: Times are from N = 10 runs of the benchmark suite and are reported as geometric mean of all ratios of New Implementation / Old Implementation. strcat-avx2 -> 0.998 strcpy-avx2 -> 0.937 stpcpy-avx2 -> 0.971 strncpy-avx2 -> 0.793 stpncpy-avx2 -> 0.775 strncat-avx2 -> 0.962 Code Size Changes: function -> Bytes New / Bytes Old -> Ratio strcat-avx2 -> 685 / 1639 -> 0.418 strcpy-avx2 -> 560 / 903 -> 0.620 stpcpy-avx2 -> 592 / 939 -> 0.630 strncpy-avx2 -> 1176 / 2390 -> 0.492 stpncpy-avx2 -> 1268 / 2438 -> 0.520 strncat-avx2 -> 1042 / 2563 -> 0.407 Notes: 1. Because of the significant difference between the implementations they are split into three files. strcpy-avx2.S -> strcpy, stpcpy, strcat strncpy-avx2.S -> strncpy strncat-avx2.S -> strncat I couldn't find a way to merge them without making the ifdefs incredibly difficult to follow. Full check passes on x86-64 and build succeeds for all ISA levels w/ and w/o multiarch.
-rw-r--r--sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S6
-rw-r--r--sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S7
-rw-r--r--sysdeps/x86_64/multiarch/stpncpy-avx2.S5
-rw-r--r--sysdeps/x86_64/multiarch/strcat-avx2-rtm.S13
-rw-r--r--sysdeps/x86_64/multiarch/strcat-avx2.S268
-rw-r--r--sysdeps/x86_64/multiarch/strcat-strlen-avx2.h.S101
-rw-r--r--sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S13
-rw-r--r--sysdeps/x86_64/multiarch/strcpy-avx2.S1236
-rw-r--r--sysdeps/x86_64/multiarch/strncat-avx2-rtm.S6
-rw-r--r--sysdeps/x86_64/multiarch/strncat-avx2.S424
-rw-r--r--sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S6
-rw-r--r--sysdeps/x86_64/multiarch/strncpy-avx2.S740
-rw-r--r--sysdeps/x86_64/multiarch/x86-avx-vecs.h3
13 files changed, 1594 insertions, 1234 deletions
diff --git a/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
index 2b9c07a59f..90e532dbe8 100644
--- a/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S
@@ -1,3 +1,3 @@
-#define USE_AS_STPCPY
-#define STRCPY __stpcpy_avx2_rtm
-#include "strcpy-avx2-rtm.S"
+#define STPCPY __stpcpy_avx2_rtm
+#include "x86-avx-rtm-vecs.h"
+#include "stpcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
index 60a2ccfe53..46ee07be36 100644
--- a/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S
@@ -1,4 +1,3 @@
-#define USE_AS_STPCPY
-#define USE_AS_STRNCPY
-#define STRCPY __stpncpy_avx2_rtm
-#include "strcpy-avx2-rtm.S"
+#define STPNCPY __stpncpy_avx2_rtm
+#include "x86-avx-rtm-vecs.h"
+#include "stpncpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2.S b/sysdeps/x86_64/multiarch/stpncpy-avx2.S
index b2f8c19143..a46a8edbe2 100644
--- a/sysdeps/x86_64/multiarch/stpncpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/stpncpy-avx2.S
@@ -3,6 +3,5 @@
#endif
#define USE_AS_STPCPY
-#define USE_AS_STRNCPY
-#define STRCPY STPNCPY
-#include "strcpy-avx2.S"
+#define STRNCPY STPNCPY
+#include "strncpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
index 637fb557c4..e84f4f1fef 100644
--- a/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S
@@ -1,12 +1,3 @@
-#ifndef STRCAT
-# define STRCAT __strcat_avx2_rtm
-#endif
-
-#define ZERO_UPPER_VEC_REGISTERS_RETURN \
- ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
-
-#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
-
-#define SECTION(p) p##.avx.rtm
-
+#define STRCAT __strcat_avx2_rtm
+#include "x86-avx-rtm-vecs.h"
#include "strcat-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcat-avx2.S b/sysdeps/x86_64/multiarch/strcat-avx2.S
index d9b7fb2a43..3f914fa342 100644
--- a/sysdeps/x86_64/multiarch/strcat-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcat-avx2.S
@@ -16,266 +16,10 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#include <isa-level.h>
-
-#if ISA_SHOULD_BUILD (3)
-
-
-# include <sysdep.h>
-
-# ifndef STRCAT
-# define STRCAT __strcat_avx2
-# endif
-
-# define USE_AS_STRCAT
-
-/* Number of bytes in a vector register */
-# define VEC_SIZE 32
-
-# ifndef SECTION
-# define SECTION(p) p##.avx
-# endif
-
- .section SECTION(.text),"ax",@progbits
-ENTRY (STRCAT)
- mov %rdi, %r9
-# ifdef USE_AS_STRNCAT
- mov %rdx, %r8
-# endif
-
- xor %eax, %eax
- mov %edi, %ecx
- and $((VEC_SIZE * 4) - 1), %ecx
- vpxor %xmm6, %xmm6, %xmm6
- cmp $(VEC_SIZE * 3), %ecx
- ja L(fourth_vector_boundary)
- vpcmpeqb (%rdi), %ymm6, %ymm0
- vpmovmskb %ymm0, %edx
- test %edx, %edx
- jnz L(exit_null_on_first_vector)
- mov %rdi, %rax
- and $-VEC_SIZE, %rax
- jmp L(align_vec_size_start)
-L(fourth_vector_boundary):
- mov %rdi, %rax
- and $-VEC_SIZE, %rax
- vpcmpeqb (%rax), %ymm6, %ymm0
- mov $-1, %r10d
- sub %rax, %rcx
- shl %cl, %r10d
- vpmovmskb %ymm0, %edx
- and %r10d, %edx
- jnz L(exit)
-
-L(align_vec_size_start):
- vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm0
- vpmovmskb %ymm0, %edx
- test %edx, %edx
- jnz L(exit_null_on_second_vector)
-
- vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
- vpmovmskb %ymm1, %edx
- test %edx, %edx
- jnz L(exit_null_on_third_vector)
-
- vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
- vpmovmskb %ymm2, %edx
- test %edx, %edx
- jnz L(exit_null_on_fourth_vector)
-
- vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
- vpmovmskb %ymm3, %edx
- test %edx, %edx
- jnz L(exit_null_on_fifth_vector)
-
- vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
- add $(VEC_SIZE * 4), %rax
- vpmovmskb %ymm0, %edx
- test %edx, %edx
- jnz L(exit_null_on_second_vector)
-
- vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
- vpmovmskb %ymm1, %edx
- test %edx, %edx
- jnz L(exit_null_on_third_vector)
-
- vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
- vpmovmskb %ymm2, %edx
- test %edx, %edx
- jnz L(exit_null_on_fourth_vector)
-
- vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
- vpmovmskb %ymm3, %edx
- test %edx, %edx
- jnz L(exit_null_on_fifth_vector)
-
- vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
- add $(VEC_SIZE * 4), %rax
- vpmovmskb %ymm0, %edx
- test %edx, %edx
- jnz L(exit_null_on_second_vector)
-
- vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
- vpmovmskb %ymm1, %edx
- test %edx, %edx
- jnz L(exit_null_on_third_vector)
-
- vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
- vpmovmskb %ymm2, %edx
- test %edx, %edx
- jnz L(exit_null_on_fourth_vector)
-
- vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
- vpmovmskb %ymm3, %edx
- test %edx, %edx
- jnz L(exit_null_on_fifth_vector)
-
- vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
- add $(VEC_SIZE * 4), %rax
- vpmovmskb %ymm0, %edx
- test %edx, %edx
- jnz L(exit_null_on_second_vector)
-
- vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
- vpmovmskb %ymm1, %edx
- test %edx, %edx
- jnz L(exit_null_on_third_vector)
-
- vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
- vpmovmskb %ymm2, %edx
- test %edx, %edx
- jnz L(exit_null_on_fourth_vector)
-
- vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
- vpmovmskb %ymm3, %edx
- test %edx, %edx
- jnz L(exit_null_on_fifth_vector)
-
- test $((VEC_SIZE * 4) - 1), %rax
- jz L(align_four_vec_loop)
-
- vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
- add $(VEC_SIZE * 5), %rax
- vpmovmskb %ymm0, %edx
- test %edx, %edx
- jnz L(exit)
-
- test $((VEC_SIZE * 4) - 1), %rax
- jz L(align_four_vec_loop)
-
- vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm1
- add $VEC_SIZE, %rax
- vpmovmskb %ymm1, %edx
- test %edx, %edx
- jnz L(exit)
-
- test $((VEC_SIZE * 4) - 1), %rax
- jz L(align_four_vec_loop)
-
- vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm2
- add $VEC_SIZE, %rax
- vpmovmskb %ymm2, %edx
- test %edx, %edx
- jnz L(exit)
-
- test $((VEC_SIZE * 4) - 1), %rax
- jz L(align_four_vec_loop)
-
- vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm3
- add $VEC_SIZE, %rax
- vpmovmskb %ymm3, %edx
- test %edx, %edx
- jnz L(exit)
-
- add $VEC_SIZE, %rax
-
- .p2align 4
-L(align_four_vec_loop):
- vmovaps (%rax), %ymm4
- vpminub VEC_SIZE(%rax), %ymm4, %ymm4
- vmovaps (VEC_SIZE * 2)(%rax), %ymm5
- vpminub (VEC_SIZE * 3)(%rax), %ymm5, %ymm5
- add $(VEC_SIZE * 4), %rax
- vpminub %ymm4, %ymm5, %ymm5
- vpcmpeqb %ymm5, %ymm6, %ymm5
- vpmovmskb %ymm5, %edx
- test %edx, %edx
- jz L(align_four_vec_loop)
-
- vpcmpeqb -(VEC_SIZE * 4)(%rax), %ymm6, %ymm0
- sub $(VEC_SIZE * 5), %rax
- vpmovmskb %ymm0, %edx
- test %edx, %edx
- jnz L(exit_null_on_second_vector)
-
- vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
- vpmovmskb %ymm1, %edx
- test %edx, %edx
- jnz L(exit_null_on_third_vector)
-
- vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
- vpmovmskb %ymm2, %edx
- test %edx, %edx
- jnz L(exit_null_on_fourth_vector)
-
- vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
- vpmovmskb %ymm3, %edx
- sub %rdi, %rax
- bsf %rdx, %rdx
- add %rdx, %rax
- add $(VEC_SIZE * 4), %rax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit):
- sub %rdi, %rax
-L(exit_null_on_first_vector):
- bsf %rdx, %rdx
- add %rdx, %rax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_null_on_second_vector):
- sub %rdi, %rax
- bsf %rdx, %rdx
- add %rdx, %rax
- add $VEC_SIZE, %rax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_null_on_third_vector):
- sub %rdi, %rax
- bsf %rdx, %rdx
- add %rdx, %rax
- add $(VEC_SIZE * 2), %rax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_null_on_fourth_vector):
- sub %rdi, %rax
- bsf %rdx, %rdx
- add %rdx, %rax
- add $(VEC_SIZE * 3), %rax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_null_on_fifth_vector):
- sub %rdi, %rax
- bsf %rdx, %rdx
- add %rdx, %rax
- add $(VEC_SIZE * 4), %rax
-
- .p2align 4
-L(StartStrcpyPart):
- lea (%r9, %rax), %rdi
- mov %rsi, %rcx
- mov %r9, %rax /* save result */
-
-# ifdef USE_AS_STRNCAT
- test %r8, %r8
- jz L(ExitZero)
-# define USE_AS_STRNCPY
-# endif
-
-# include "strcpy-avx2.S"
+#ifndef STRCAT
+# define STRCAT __strcat_avx2
#endif
+
+#define USE_AS_STRCAT
+#define STRCPY STRCAT
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcat-strlen-avx2.h.S b/sysdeps/x86_64/multiarch/strcat-strlen-avx2.h.S
new file mode 100644
index 0000000000..f50514e07c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcat-strlen-avx2.h.S
@@ -0,0 +1,101 @@
+/* strlen used for beginning of str{n}cat using AVX2.
+ Copyright (C) 2011-2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+
+/* NOTE: This file is meant to be included by strcat-avx2 or
+ strncat-avx2 and does not standalone. Before including %rdi
+ must be saved in %rax. */
+
+
+/* Simple strlen implementation that ends at
+ L(strcat_strlen_done). */
+ movq %rdi, %r8
+ andq $(VEC_SIZE * -1), %r8
+ VPCMPEQ (%r8), %VZERO, %VMM(0)
+ vpmovmskb %VMM(0), %ecx
+ shrxl %edi, %ecx, %ecx
+ testl %ecx, %ecx
+ jnz L(bsf_and_done_v0)
+
+ VPCMPEQ VEC_SIZE(%r8), %VZERO, %VMM(0)
+ vpmovmskb %VMM(0), %ecx
+ leaq (VEC_SIZE)(%r8), %rdi
+ testl %ecx, %ecx
+ jnz L(bsf_and_done_v0)
+
+ VPCMPEQ (VEC_SIZE * 1)(%rdi), %VZERO, %VMM(0)
+ vpmovmskb %VMM(0), %ecx
+ testl %ecx, %ecx
+ jnz L(bsf_and_done_v1)
+
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %VMM(0)
+ vpmovmskb %VMM(0), %ecx
+ testl %ecx, %ecx
+ jnz L(bsf_and_done_v2)
+
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %VMM(0)
+ vpmovmskb %VMM(0), %ecx
+ testl %ecx, %ecx
+ jnz L(bsf_and_done_v3)
+
+ orq $(VEC_SIZE * 4 - 1), %rdi
+ .p2align 4,, 8
+L(loop_2x_vec):
+ VMOVA (VEC_SIZE * 0 + 1)(%rdi), %VMM(0)
+ VPMIN (VEC_SIZE * 1 + 1)(%rdi), %VMM(0), %VMM(1)
+ VMOVA (VEC_SIZE * 2 + 1)(%rdi), %VMM(2)
+ VPMIN (VEC_SIZE * 3 + 1)(%rdi), %VMM(2), %VMM(3)
+ VPMIN %VMM(1), %VMM(3), %VMM(3)
+ VPCMPEQ %VMM(3), %VZERO, %VMM(3)
+ vpmovmskb %VMM(3), %r8d
+ subq $(VEC_SIZE * -4), %rdi
+ testl %r8d, %r8d
+ jz L(loop_2x_vec)
+
+ addq $(VEC_SIZE * -4 + 1), %rdi
+
+ VPCMPEQ %VMM(0), %VZERO, %VMM(0)
+ vpmovmskb %VMM(0), %ecx
+ testl %ecx, %ecx
+ jnz L(bsf_and_done_v0)
+
+ VPCMPEQ %VMM(1), %VZERO, %VMM(1)
+ vpmovmskb %VMM(1), %ecx
+ testl %ecx, %ecx
+ jnz L(bsf_and_done_v1)
+
+ VPCMPEQ %VMM(2), %VZERO, %VMM(2)
+ vpmovmskb %VMM(2), %ecx
+ testl %ecx, %ecx
+ jnz L(bsf_and_done_v2)
+
+ movl %r8d, %ecx
+L(bsf_and_done_v3):
+ addq $VEC_SIZE, %rdi
+L(bsf_and_done_v2):
+ bsfl %ecx, %ecx
+ leaq (VEC_SIZE * 2)(%rdi, %rcx), %rdi
+ jmp L(strcat_strlen_done)
+
+ .p2align 4,, 4
+L(bsf_and_done_v1):
+ addq $VEC_SIZE, %rdi
+L(bsf_and_done_v0):
+ bsfl %ecx, %ecx
+ addq %rcx, %rdi
+L(strcat_strlen_done):
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
index c2c581ecf7..3ae2de8ea9 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S
@@ -1,12 +1,3 @@
-#ifndef STRCPY
-# define STRCPY __strcpy_avx2_rtm
-#endif
-
-#define ZERO_UPPER_VEC_REGISTERS_RETURN \
- ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
-
-#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
-
-#define SECTION(p) p##.avx.rtm
-
+#define STRCPY __strcpy_avx2_rtm
+#include "x86-avx-rtm-vecs.h"
#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
index c725834929..32f86baa4c 100644
--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -20,984 +20,378 @@
#if ISA_SHOULD_BUILD (3)
+# include <sysdep.h>
-# ifndef USE_AS_STRCAT
-# include <sysdep.h>
-
-# ifndef STRCPY
-# define STRCPY __strcpy_avx2
-# endif
-
-# endif
-
-/* Number of bytes in a vector register */
# ifndef VEC_SIZE
-# define VEC_SIZE 32
-# endif
-
-# ifndef VZEROUPPER
-# define VZEROUPPER vzeroupper
-# endif
-
-# ifndef SECTION
-# define SECTION(p) p##.avx
-# endif
-
-/* zero register */
-#define xmmZ xmm0
-#define ymmZ ymm0
-
-/* mask register */
-#define ymmM ymm1
-
-# ifndef USE_AS_STRCAT
-
- .section SECTION(.text),"ax",@progbits
-ENTRY (STRCPY)
-# ifdef USE_AS_STRNCPY
- mov %RDX_LP, %R8_LP
- test %R8_LP, %R8_LP
- jz L(ExitZero)
-# endif
- mov %rsi, %rcx
-# ifndef USE_AS_STPCPY
- mov %rdi, %rax /* save result */
-# endif
-
+# include "x86-avx-vecs.h"
# endif
- vpxor %xmmZ, %xmmZ, %xmmZ
-
- and $((VEC_SIZE * 4) - 1), %ecx
- cmp $(VEC_SIZE * 2), %ecx
- jbe L(SourceStringAlignmentLessTwoVecSize)
-
- and $-VEC_SIZE, %rsi
- and $(VEC_SIZE - 1), %ecx
-
- vpcmpeqb (%rsi), %ymmZ, %ymmM
- vpmovmskb %ymmM, %edx
- shr %cl, %rdx
-
-# ifdef USE_AS_STRNCPY
-# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
- mov $VEC_SIZE, %r10
- sub %rcx, %r10
- cmp %r10, %r8
-# else
- mov $(VEC_SIZE + 1), %r10
- sub %rcx, %r10
- cmp %r10, %r8
-# endif
- jbe L(CopyVecSizeTailCase2OrCase3)
+# ifndef STRCPY
+# define STRCPY __strcpy_avx2
# endif
- test %edx, %edx
- jnz L(CopyVecSizeTail)
- vpcmpeqb VEC_SIZE(%rsi), %ymmZ, %ymm2
- vpmovmskb %ymm2, %edx
+ /* Use movsb in page cross case to save code size. */
+# define USE_MOVSB_IN_PAGE_CROSS 1
-# ifdef USE_AS_STRNCPY
- add $VEC_SIZE, %r10
- cmp %r10, %r8
- jbe L(CopyTwoVecSizeCase2OrCase3)
-# endif
- test %edx, %edx
- jnz L(CopyTwoVecSize)
-
- vmovdqu (%rsi, %rcx), %ymm2 /* copy VEC_SIZE bytes */
- vmovdqu %ymm2, (%rdi)
-
-/* If source address alignment != destination address alignment */
- .p2align 4
-L(UnalignVecSizeBoth):
- sub %rcx, %rdi
-# ifdef USE_AS_STRNCPY
- add %rcx, %r8
- sbb %rcx, %rcx
- or %rcx, %r8
-# endif
- mov $VEC_SIZE, %rcx
- vmovdqa (%rsi, %rcx), %ymm2
- vmovdqu %ymm2, (%rdi, %rcx)
- vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
- vpcmpeqb %ymm2, %ymmZ, %ymmM
- vpmovmskb %ymmM, %edx
- add $VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
- sub $(VEC_SIZE * 3), %r8
- jbe L(CopyVecSizeCase2OrCase3)
-# endif
- test %edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyVecSizeUnalignedVec2)
+# ifdef USE_AS_WCSCPY
+# define VPCMPEQ vpcmpeqd
+# define VPMIN vpminud
+# define CHAR_SIZE 4
# else
- jnz L(CopyVecSize)
+# define VPCMPEQ vpcmpeqb
+# define VPMIN vpminub
+# define CHAR_SIZE 1
# endif
- vmovdqu %ymm2, (%rdi, %rcx)
- vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
- vpcmpeqb %ymm3, %ymmZ, %ymmM
- vpmovmskb %ymmM, %edx
- add $VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
- sub $VEC_SIZE, %r8
- jbe L(CopyVecSizeCase2OrCase3)
-# endif
- test %edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyVecSizeUnalignedVec3)
-# else
- jnz L(CopyVecSize)
-# endif
+# define PAGE_SIZE 4096
- vmovdqu %ymm3, (%rdi, %rcx)
- vmovdqa VEC_SIZE(%rsi, %rcx), %ymm4
- vpcmpeqb %ymm4, %ymmZ, %ymmM
- vpmovmskb %ymmM, %edx
- add $VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
- sub $VEC_SIZE, %r8
- jbe L(CopyVecSizeCase2OrCase3)
-# endif
- test %edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyVecSizeUnalignedVec4)
+# ifdef USE_AS_STPCPY
+# define END_REG rax
# else
- jnz L(CopyVecSize)
+# define END_REG rdi, %rdx
# endif
- vmovdqu %ymm4, (%rdi, %rcx)
- vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
- vpcmpeqb %ymm2, %ymmZ, %ymmM
- vpmovmskb %ymmM, %edx
- add $VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
- sub $VEC_SIZE, %r8
- jbe L(CopyVecSizeCase2OrCase3)
-# endif
- test %edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyVecSizeUnalignedVec2)
+# ifdef USE_AS_STRCAT
+# define PAGE_ALIGN_REG ecx
# else
- jnz L(CopyVecSize)
+# define PAGE_ALIGN_REG eax
# endif
- vmovdqu %ymm2, (%rdi, %rcx)
- vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
- vpcmpeqb %ymm2, %ymmZ, %ymmM
- vpmovmskb %ymmM, %edx
- add $VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
- sub $VEC_SIZE, %r8
- jbe L(CopyVecSizeCase2OrCase3)
-# endif
- test %edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyVecSizeUnalignedVec2)
-# else
- jnz L(CopyVecSize)
-# endif
+# define VZERO VMM(7)
+# define VZERO_128 VMM_128(7)
- vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
- vmovdqu %ymm2, (%rdi, %rcx)
- vpcmpeqb %ymm3, %ymmZ, %ymmM
- vpmovmskb %ym