x86-64: Add strcpy family functions with 256-bit EVEX

Update ifunc-strcpy.h to select the function optimized with 256-bit EVEX instructions using YMM16-YMM31 registers to avoid RTM abort with usable AVX512VL and AVX512BW since VZEROUPPER isn't needed at function exit.
author: H.J. Lu <hjl.tools@gmail.com> 2021-03-05 06:36:50 -0800
committer: H.J. Lu <hjl.tools@gmail.com> 2021-03-29 07:40:17 -0700
commit: 525bc2a32c9710df40371f951217c6ae7a923aee (patch)
tree: efd1cd7acf189386bf9223678ccea60de70bc93b
parent: 1fd8c163a83d96ace1ff78fa6bac7aee084f6f77 (diff)
download: glibc-525bc2a32c9710df40371f951217c6ae7a923aee.tar.xz
glibc-525bc2a32c9710df40371f951217c6ae7a923aee.zip
9 files changed, 1339 insertions, 3 deletions
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 5ce858823a..46783cd14b 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -43,11 +43,17 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
 		   memchr-evex \
 		   memrchr-evex \
 		   rawmemchr-evex \
+		   stpcpy-evex \
+		   stpncpy-evex \
+		   strcat-evex \
 		   strchr-evex \
 		   strchrnul-evex \
 		   strcmp-evex \
+		   strcpy-evex \
 		   strlen-evex \
+		   strncat-evex \
 		   strncmp-evex \
+		   strncpy-evex \
 		   strnlen-evex \
 		   strrchr-evex
 CFLAGS-varshift.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 3bf10d3714..74b20d8bd1 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -224,6 +224,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __stpncpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (AVX2),
 			      __stpncpy_avx2)
+	      IFUNC_IMPL_ADD (array, i, stpncpy,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __stpncpy_evex)
 	      IFUNC_IMPL_ADD (array, i, stpncpy, 1,
 			      __stpncpy_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_sse2))
@@ -234,6 +238,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __stpcpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (AVX2),
 			      __stpcpy_avx2)
+	      IFUNC_IMPL_ADD (array, i, stpcpy,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __stpcpy_evex)
 	      IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2))
 
@@ -268,6 +276,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, strcat,
 	      IFUNC_IMPL_ADD (array, i, strcat, CPU_FEATURE_USABLE (AVX2),
 			      __strcat_avx2)
+	      IFUNC_IMPL_ADD (array, i, strcat,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __strcat_evex)
 	      IFUNC_IMPL_ADD (array, i, strcat, CPU_FEATURE_USABLE (SSSE3),
 			      __strcat_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2_unaligned)
@@ -330,6 +342,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, strcpy,
 	      IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (AVX2),
 			      __strcpy_avx2)
+	      IFUNC_IMPL_ADD (array, i, strcpy,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __strcpy_evex)
 	      IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (SSSE3),
 			      __strcpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned)
@@ -373,6 +389,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, strncat,
 	      IFUNC_IMPL_ADD (array, i, strncat, CPU_FEATURE_USABLE (AVX2),
 			      __strncat_avx2)
+	      IFUNC_IMPL_ADD (array, i, strncat,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __strncat_evex)
 	      IFUNC_IMPL_ADD (array, i, strncat, CPU_FEATURE_USABLE (SSSE3),
 			      __strncat_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strncat, 1,
@@ -383,6 +403,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, strncpy,
 	      IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (AVX2),
 			      __strncpy_avx2)
+	      IFUNC_IMPL_ADD (array, i, strncpy,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __strncpy_evex)
 	      IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (SSSE3),
 			      __strncpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strncpy, 1,
diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
index 1100cd23c6..f31f436adf 100644
--- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
@@ -25,16 +25,23 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
   attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
 
 static inline void *
 IFUNC_SELECTOR (void)
 {
   const struct cpu_features* cpu_features = __get_cpu_features ();
 
-  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
-      && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
-    return OPTIMIZE (avx2);
+    {
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+	return OPTIMIZE (evex);
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+	return OPTIMIZE (avx2);
+    }
 
   if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
     return OPTIMIZE (sse2_unaligned);
diff --git a/sysdeps/x86_64/multiarch/stpcpy-evex.S b/sysdeps/x86_64/multiarch/stpcpy-evex.S
new file mode 100644
index 0000000000..7c6f26cd98
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpcpy-evex.S
@@ -0,0 +1,3 @@
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy_evex
+#include "strcpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/stpncpy-evex.S b/sysdeps/x86_64/multiarch/stpncpy-evex.S
new file mode 100644
index 0000000000..1570014d1c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpncpy-evex.S
@@ -0,0 +1,4 @@
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#define STRCPY __stpncpy_evex
+#include "strcpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strcat-evex.S b/sysdeps/x86_64/multiarch/strcat-evex.S
new file mode 100644
index 0000000000..97c3d85b6d
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcat-evex.S
@@ -0,0 +1,283 @@
+/* strcat with 256-bit EVEX instructions.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef STRCAT
+#  define STRCAT  __strcat_evex
+# endif
+
+# define VMOVU		vmovdqu64
+# define VMOVA		vmovdqa64
+
+/* zero register */
+# define XMMZERO	xmm16
+# define YMMZERO	ymm16
+# define YMM0		ymm17
+# define YMM1		ymm18
+
+# define USE_AS_STRCAT
+
+/* Number of bytes in a vector register */
+# define VEC_SIZE	32
+
+	.section .text.evex,"ax",@progbits
+ENTRY (STRCAT)
+	mov	%rdi, %r9
+# ifdef USE_AS_STRNCAT
+	mov	%rdx, %r8
+# endif
+
+	xor	%eax, %eax
+	mov	%edi, %ecx
+	and	$((VEC_SIZE * 4) - 1), %ecx
+	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
+	cmp	$(VEC_SIZE * 3), %ecx
+	ja	L(fourth_vector_boundary)
+	vpcmpb	$0, (%rdi), %YMMZERO, %k0
+	kmovd	%k0, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_first_vector)
+	mov	%rdi, %rax
+	and	$-VEC_SIZE, %rax
+	jmp	L(align_vec_size_start)
+L(fourth_vector_boundary):
+	mov	%rdi, %rax
+	and	$-VEC_SIZE, %rax
+	vpcmpb	$0, (%rax), %YMMZERO, %k0
+	mov	$-1, %r10d
+	sub	%rax, %rcx
+	shl	%cl, %r10d
+	kmovd	%k0, %edx
+	and	%r10d, %edx
+	jnz	L(exit)
+
+L(align_vec_size_start):
+	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
+	kmovd	%k0, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_second_vector)
+
+	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
+	kmovd	%k1, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_third_vector)
+
+	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
+	kmovd	%k2, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_fourth_vector)
+
+	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
+	kmovd	%k3, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_fifth_vector)
+
+	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
+	add	$(VEC_SIZE * 4), %rax
+	kmovd	%k4, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_second_vector)
+
+	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
+	kmovd	%k1, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_third_vector)
+
+	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
+	kmovd	%k2, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_fourth_vector)
+
+	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
+	kmovd	%k3, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_fifth_vector)
+
+	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
+	kmovd	%k4, %edx
+	add	$(VEC_SIZE * 4), %rax
+	test	%edx, %edx
+	jnz	L(exit_null_on_second_vector)
+
+	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
+	kmovd	%k1, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_third_vector)
+
+	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
+	kmovd	%k2, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_fourth_vector)
+
+	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
+	kmovd	%k3, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_fifth_vector)
+
+	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
+	add	$(VEC_SIZE * 4), %rax
+	kmovd	%k4, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_second_vector)
+
+	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
+	kmovd	%k1, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_third_vector)
+
+	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
+	kmovd	%k2, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_fourth_vector)
+
+	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
+	kmovd	%k3, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_fifth_vector)
+
+	test	$((VEC_SIZE * 4) - 1), %rax
+	jz	L(align_four_vec_loop)
+
+	vpcmpb	$0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
+	add	$(VEC_SIZE * 5), %rax
+	kmovd	%k4, %edx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	test	$((VEC_SIZE * 4) - 1), %rax
+	jz	L(align_four_vec_loop)
+
+	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
+	add	$VEC_SIZE, %rax
+	kmovd	%k0, %edx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	test	$((VEC_SIZE * 4) - 1), %rax
+	jz	L(align_four_vec_loop)
+
+	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k0
+	add	$VEC_SIZE, %rax
+	kmovd	%k0, %edx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	test	$((VEC_SIZE * 4) - 1), %rax
+	jz	L(align_four_vec_loop)
+
+	vpcmpb	$0, VEC_SIZE(%rax), %YMMZERO, %k1
+	add	$VEC_SIZE, %rax
+	kmovd	%k1, %edx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	add	$VEC_SIZE, %rax
+
+	.p2align 4
+L(align_four_vec_loop):
+	VMOVA	(%rax), %YMM0
+	VMOVA	(VEC_SIZE * 2)(%rax), %YMM1
+	vpminub	VEC_SIZE(%rax), %YMM0, %YMM0
+	vpminub	(VEC_SIZE * 3)(%rax), %YMM1, %YMM1
+	vpminub	%YMM0, %YMM1, %YMM0
+	/* If K0 != 0, there is a null byte.  */
+	vpcmpb	$0, %YMM0, %YMMZERO, %k0
+	add	$(VEC_SIZE * 4), %rax
+	ktestd	%k0, %k0
+	jz	L(align_four_vec_loop)
+
+	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %YMMZERO, %k0
+	sub	$(VEC_SIZE * 5), %rax
+	kmovd	%k0, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_second_vector)
+
+	vpcmpb	$0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
+	kmovd	%k1, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_third_vector)
+
+	vpcmpb	$0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
+	kmovd	%k2, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_fourth_vector)
+
+	vpcmpb	$0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
+	kmovd	%k3, %edx
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$(VEC_SIZE * 4), %rax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit):
+	sub	%rdi, %rax
+L(exit_null_on_first_vector):
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_null_on_second_vector):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$VEC_SIZE, %rax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_null_on_third_vector):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$(VEC_SIZE * 2), %rax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_null_on_fourth_vector):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$(VEC_SIZE * 3), %rax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_null_on_fifth_vector):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$(VEC_SIZE * 4), %rax
+
+	.p2align 4
+L(StartStrcpyPart):
+	lea	(%r9, %rax), %rdi
+	mov	%rsi, %rcx
+	mov	%r9, %rax      /* save result */
+
+# ifdef USE_AS_STRNCAT
+	test	%r8, %r8
+	jz	L(ExitZero)
+#  define USE_AS_STRNCPY
+# endif
+
+# include "strcpy-evex.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/strcpy-evex.S b/sysdeps/x86_64/multiarch/strcpy-evex.S
new file mode 100644
index 0000000000..a343a1a692
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcpy-evex.S
@@ -0,0 +1,1003 @@
+/* strcpy with 256-bit EVEX instructions.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# ifndef USE_AS_STRCAT
+#  include <sysdep.h>
+
+#  ifndef STRCPY
+#   define STRCPY  __strcpy_evex
+#  endif
+
+# endif
+
+# define VMOVU		vmovdqu64
+# define VMOVA		vmovdqa64
+
+/* Number of bytes in a vector register */
+# ifndef VEC_SIZE
+#  define VEC_SIZE	32
+# endif
+
+# define XMM2		xmm18
+# define XMM3		xmm19
+
+# define YMM2		ymm18
+# define YMM3		ymm19
+# define YMM4		ymm20
+# define YMM5		ymm21
+# define YMM6		ymm22
+# define YMM7		ymm23
+
+# ifndef USE_AS_STRCAT
+
+/* zero register */
+#  define XMMZERO	xmm16
+#  define YMMZERO	ymm16
+#  define YMM1		ymm17
+
+	.section .text.evex,"ax",@progbits
+ENTRY (STRCPY)
+#  ifdef USE_AS_STRNCPY
+	mov	%RDX_LP, %R8_LP
+	test	%R8_LP, %R8_LP
+	jz	L(ExitZero)
+#  endif
+	mov	%rsi, %rcx
+#  ifndef USE_AS_STPCPY
+	mov	%rdi, %rax      /* save result */
+#  endif
+
+	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
+# endif
+
+	and	$((VEC_SIZE * 4) - 1), %ecx
+	cmp	$(VEC_SIZE * 2), %ecx
+	jbe	L(SourceStringAlignmentLessTwoVecSize)
+
+	and	$-VEC_SIZE, %rsi
+	and	$(VEC_SIZE - 1), %ecx
+
+	vpcmpb	$0, (%rsi), %YMMZERO, %k0
+	kmovd	%k0, %edx
+	shr	%cl, %rdx
+
+# ifdef USE_AS_STRNCPY
+#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+	mov	$VEC_SIZE, %r10
+	sub	%rcx, %r10
+	cmp	%r10, %r8
+#  else
+	mov	$(VEC_SIZE + 1), %r10
+	sub	%rcx, %r10
+	cmp	%r10, %r8
+#  endif
+	jbe	L(CopyVecSizeTailCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyVecSizeTail)
+
+	vpcmpb	$0, VEC_SIZE(%rsi), %YMMZERO, %k1
+	kmovd	%k1, %edx
+
+# ifdef USE_AS_STRNCPY
+	add	$VEC_SIZE, %r10
+	cmp	%r10, %r8
+	jbe	L(CopyTwoVecSizeCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyTwoVecSize)
+
+	VMOVU	(%rsi, %rcx), %YMM2   /* copy VEC_SIZE bytes */
+	VMOVU	%YMM2, (%rdi)
+
+/* If source address alignment != destination address alignment */
+	.p2align 4
+L(UnalignVecSizeBoth):
+	sub	%rcx, %rdi
+# ifdef USE_AS_STRNCPY
+	add	%rcx, %r8
+	sbb	%rcx, %rcx
+	or	%rcx, %r8
+# endif
+	mov	$VEC_SIZE, %rcx
+	VMOVA	(%rsi, %rcx), %YMM2
+	VMOVU	%YMM2, (%rdi, %rcx)
+	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
+	vpcmpb	$0, %YMM2, %YMMZERO, %k0
+	kmovd	%k0, %edx
+	add	$VEC_SIZE, %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$(VEC_SIZE * 3), %r8
+	jbe	L(CopyVecSizeCase2OrCase3)
+# endif
+	test	%edx, %edx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	jnz	L(CopyVecSizeUnalignedVec2)
+# else
+	jnz	L(CopyVecSize)
+# endif
+
+	VMOVU	%YMM2, (%rdi, %rcx)
+	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM3
+	vpcmpb	$0, %YMM3, %YMMZERO, %k0
+	kmovd	%k0, %edx
+	add	$VEC_SIZE, %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$VEC_SIZE, %r8
+	jbe	L(CopyVecSizeCase2OrCase3)
+# endif
+	test	%edx, %edx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	jnz	L(CopyVecSizeUnalignedVec3)
+# else
+	jnz	L(CopyVecSize)
+# endif
+
+	VMOVU	%YMM3, (%rdi, %rcx)
+	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM4
+	vpcmpb	$0, %YMM4, %YMMZERO, %k0
+	kmovd	%k0, %edx
+	add	$VEC_SIZE, %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$VEC_SIZE, %r8
+	jbe	L(CopyVecSizeCase2OrCase3)
+# endif
+	test	%edx, %edx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	jnz	L(CopyVecSizeUnalignedVec4)
+# else
+	jnz	L(CopyVecSize)
+# endif
+
+	VMOVU	%YMM4, (%rdi, %rcx)
+	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
+	vpcmpb	$0, %YMM2, %YMMZERO, %k0
+	kmovd	%k0, %edx
+	add	$VEC_SIZE, %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$VEC_SIZE, %r8
+	jbe	L(CopyVecSizeCase2OrCase3)
+# endif
+	test	%edx, %edx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	jnz	L(CopyVecSizeUnalignedVec2)
+# else
+	jnz	L(CopyVecSize)
+# endif
+
+	VMOVU	%YMM2, (%rdi, %rcx)
+	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM2
+	vpcmpb	$0, %YMM2, %YMMZERO, %k0
+	kmovd	%k0, %edx
+	add	$VEC_SIZE, %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$VEC_SIZE, %r8
+	jbe	L(CopyVecSizeCase2OrCase3)
+# endif
+	test	%edx, %edx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	jnz	L(CopyVecSizeUnalignedVec2)
+# else
+	jnz	L(CopyVecSize)
+# endif
+
+	VMOVA	VEC_SIZE(%rsi, %rcx), %YMM3
+	VMOVU	%YMM2, (%rdi, %rcx)
+	vpcmpb	$0, %YMM3, %YMMZERO, %k0
+	kmovd	%k0, %edx
+	add	$VEC_SIZE, %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$VEC_SIZE, %r8
+	jbe	L(CopyVecSizeCase2OrCase3)
+# endif
+	test	%edx, %edx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	jnz	L(CopyVecSizeUnalignedVec3)
+# else
+	jnz	L(CopyVecSize)
+# endif
+
+	VMOVU	%YMM3, (%rdi, %rcx)
+	mov	%rsi, %rdx
+	lea	VEC_SIZE(%rsi, %rcx), %rsi
+	and	$-(VEC_SIZE * 4), %rsi
+	sub	%rsi, %rdx
+	sub	%rdx, %rdi
+# ifdef USE_AS_STRNCPY
+	lea	(VEC_SIZE * 8)(%r8, %rdx), %r8
+# endif
+L(UnalignedFourVecSizeLoop):
+	VMOVA	(%rsi), %YMM4
+	VMOVA	VEC_SIZE(%rsi), %YMM5
+	VMOVA	(VEC_SIZE * 2)(%rsi), %YMM6
+	VMOVA	(VEC_SIZE * 3)(%rsi), %YMM7
+	vpminub	%YMM5, %YMM4, %YMM2
+	vpminub	%YMM7, %YMM6, %YMM3
+	vpminub	%YMM2, %YMM3, %YMM2
+	/* If K7 != 0, there is a null byte.  */
+	vpcmpb	$0, %YMM2, %YMMZERO, %k7
+	kmovd	%k7, %edx
+# ifdef USE_AS_STRNCPY
+	sub	$(VEC_SIZE * 4), %r8
+	jbe	L(UnalignedLeaveCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(UnalignedFourVecSizeLeave)
+
+L(UnalignedFourVecSizeLoop_start):
+	add	$(VEC_SIZE * 4), %rdi
+	add	$(VEC_SIZE * 4), %rsi
+	VMOVU	%YMM4, -(VEC_SIZE * 4)(%rdi)
+	VMOVA	(%rsi), %YMM4
+	VMOVU	%YMM5, -(VEC_SIZE * 3)(%rdi)
+	VMOVA	VEC_SIZE(%rsi), %YMM5
+	vpminub	%YMM5, %YMM4, %YMM2
+	VMOVU	%YMM6, -(VEC_SIZE * 2)(%rdi)
+	VMOVA	(VEC_SIZE * 2)(%rsi), %YMM6
+	VMOVU	%YMM7, -VEC_SIZE(%rdi)
+	VMOVA	(VEC_SIZE * 3)(%rsi), %YMM7
+	vpminub	%YMM7, %YMM6, %YMM3
+	vpminub	%YMM2, %YMM3, %YMM2
+	/* If K7 != 0, there is a null byte.  */
+	vpcmpb	$0, %YMM2, %YMMZERO, %k7
+	kmovd	%k7, %edx
+# ifdef USE_AS_STRNCPY
+	sub	$(VEC_SIZE * 4), %r8
+	jbe	L(UnalignedLeaveCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jz	L(UnalignedFourVecSizeLoop_start)
+
+L(UnalignedFourVecSizeLeave):
+	vpcmpb	$0, %YMM4, %YMMZERO, %k1
author	H.J. Lu <hjl.tools@gmail.com>	2021-03-05 06:36:50 -0800
committer	H.J. Lu <hjl.tools@gmail.com>	2021-03-29 07:40:17 -0700
commit	525bc2a32c9710df40371f951217c6ae7a923aee (patch)
tree	efd1cd7acf189386bf9223678ccea60de70bc93b
parent	1fd8c163a83d96ace1ff78fa6bac7aee084f6f77 (diff)
download	glibc-525bc2a32c9710df40371f951217c6ae7a923aee.tar.xz glibc-525bc2a32c9710df40371f951217c6ae7a923aee.zip