16 files changed, 1358 insertions, 6 deletions
diff --git a/ChangeLog b/ChangeLog
index 101a267ebc..3a94b71fe7 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,24 @@
+2019-01-14  Leonardo Sandoval  <leonardo.sandoval.gonzalez@intel.com>
+
+	* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
+	strcat-avx2, strncat-avx2, strcpy-avx2, strncpy-avx2,
+	stpcpy-avx2 and stpncpy-avx2.
+	* sysdeps/x86_64/multiarch/ifunc-impl-list.c:
+	(__libc_ifunc_impl_list): Add tests for __strcat_avx2,
+	__strncat_avx2, __strcpy_avx2, __strncpy_avx2, __stpcpy_avx2
+	and __stpncpy_avx2.
+	* sysdeps/x86_64/multiarch/{ifunc-unaligned-ssse3.h =>
+	ifunc-strcpy.h}: rename header for a more generic name.
+	* sysdeps/x86_64/multiarch/ifunc-strcpy.h:
+	(IFUNC_SELECTOR): Return OPTIMIZE (avx2) on AVX 2 machines if
+	AVX unaligned load is fast and vzeroupper is preferred.
+	* sysdeps/x86_64/multiarch/stpcpy-avx2.S: New file.
+	* sysdeps/x86_64/multiarch/stpncpy-avx2.S: Likewise.
+	* sysdeps/x86_64/multiarch/strcat-avx2.S: Likewise.
+	* sysdeps/x86_64/multiarch/strcpy-avx2.S: Likewise.
+	* sysdeps/x86_64/multiarch/strncat-avx2.S: Likewise.
+	* sysdeps/x86_64/multiarch/strncpy-avx2.S: Likewise.
+
 2019-01-12  Dmitry V. Levin  <ldv@altlinux.org>
 
 	* argp/argp-help.c: Fix typo in comment.
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index bb5e970735..395e432c09 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -24,11 +24,14 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
 		   strchr-sse2 strchrnul-sse2 strchr-avx2 strchrnul-avx2 \
 		   strrchr-sse2 strrchr-avx2 \
 		   strlen-sse2 strnlen-sse2 strlen-avx2 strnlen-avx2 \
+		   strcat-avx2 strncat-avx2 \
 		   strcat-ssse3 strncat-ssse3\
+		   strcpy-avx2 strncpy-avx2 \
 		   strcpy-sse2 stpcpy-sse2 \
 		   strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
 		   strcpy-sse2-unaligned strncpy-sse2-unaligned \
 		   stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
+		   stpcpy-avx2 stpncpy-avx2 \
 		   strcat-sse2 \
 		   strcat-sse2-unaligned strncat-sse2-unaligned \
 		   strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index cee2aebaee..39a45d0742 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -199,6 +199,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, stpncpy,
 	      IFUNC_IMPL_ADD (array, i, stpncpy, HAS_CPU_FEATURE (SSSE3),
 			      __stpncpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, stpncpy, HAS_ARCH_FEATURE (AVX2_Usable),
+			      __stpncpy_avx2)
 	      IFUNC_IMPL_ADD (array, i, stpncpy, 1,
 			      __stpncpy_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_sse2))
@@ -207,6 +209,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, stpcpy,
 	      IFUNC_IMPL_ADD (array, i, stpcpy, HAS_CPU_FEATURE (SSSE3),
 			      __stpcpy_ssse3)
+	      IFUNC_IMPL_ADD (array, i, stpcpy, HAS_ARCH_FEATURE (AVX2_Usable),
+			      __stpcpy_avx2)
 	      IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2))
 
@@ -239,6 +243,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strcat.c.  */
   IFUNC_IMPL (i, name, strcat,
+	      IFUNC_IMPL_ADD (array, i, strcat, HAS_ARCH_FEATURE (AVX2_Usable),
+			      __strcat_avx2)
 	      IFUNC_IMPL_ADD (array, i, strcat, HAS_CPU_FEATURE (SSSE3),
 			      __strcat_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2_unaligned)
@@ -280,6 +286,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strcpy.c.  */
   IFUNC_IMPL (i, name, strcpy,
+	      IFUNC_IMPL_ADD (array, i, strcpy, HAS_ARCH_FEATURE (AVX2_Usable),
+			      __strcpy_avx2)
 	      IFUNC_IMPL_ADD (array, i, strcpy, HAS_CPU_FEATURE (SSSE3),
 			      __strcpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned)
@@ -321,6 +329,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strncat.c.  */
   IFUNC_IMPL (i, name, strncat,
+	      IFUNC_IMPL_ADD (array, i, strncat, HAS_ARCH_FEATURE (AVX2_Usable),
+			      __strncat_avx2)
 	      IFUNC_IMPL_ADD (array, i, strncat, HAS_CPU_FEATURE (SSSE3),
 			      __strncat_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strncat, 1,
@@ -329,6 +339,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strncpy.c.  */
   IFUNC_IMPL (i, name, strncpy,
+	      IFUNC_IMPL_ADD (array, i, strncpy, HAS_ARCH_FEATURE (AVX2_Usable),
+			      __strncpy_avx2)
 	      IFUNC_IMPL_ADD (array, i, strncpy, HAS_CPU_FEATURE (SSSE3),
 			      __strncpy_ssse3)
 	      IFUNC_IMPL_ADD (array, i, strncpy, 1,
diff --git a/sysdeps/x86_64/multiarch/ifunc-unaligned-ssse3.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
index 4ddca9f15f..bc6a70fc7c 100644
--- a/sysdeps/x86_64/multiarch/ifunc-unaligned-ssse3.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
@@ -24,12 +24,18 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
   attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
 
 static inline void *
 IFUNC_SELECTOR (void)
 {
   const struct cpu_features* cpu_features = __get_cpu_features ();
 
+  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
+      && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
+      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+    return OPTIMIZE (avx2);
+
   if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
     return OPTIMIZE (sse2_unaligned);
 
diff --git a/sysdeps/x86_64/multiarch/stpcpy-avx2.S b/sysdeps/x86_64/multiarch/stpcpy-avx2.S
new file mode 100644
index 0000000000..f0bd3029fe
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpcpy-avx2.S
@@ -0,0 +1,3 @@
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy_avx2
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/stpcpy.c b/sysdeps/x86_64/multiarch/stpcpy.c
index 0b12f5d7a1..fabe03204b 100644
--- a/sysdeps/x86_64/multiarch/stpcpy.c
+++ b/sysdeps/x86_64/multiarch/stpcpy.c
@@ -28,7 +28,7 @@
 # undef __stpcpy
 
 # define SYMBOL_NAME stpcpy
-# include "ifunc-unaligned-ssse3.h"
+# include "ifunc-strcpy.h"
 
 libc_ifunc_redirected (__redirect_stpcpy, __stpcpy, IFUNC_SELECTOR ());
 
diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2.S b/sysdeps/x86_64/multiarch/stpncpy-avx2.S
new file mode 100644
index 0000000000..032b0407d0
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/stpncpy-avx2.S
@@ -0,0 +1,4 @@
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#define STRCPY __stpncpy_avx2
+#include "strcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/stpncpy.c b/sysdeps/x86_64/multiarch/stpncpy.c
index 71edc291e4..3b618e7491 100644
--- a/sysdeps/x86_64/multiarch/stpncpy.c
+++ b/sysdeps/x86_64/multiarch/stpncpy.c
@@ -26,7 +26,7 @@
 # undef __stpncpy
 
 # define SYMBOL_NAME stpncpy
-# include "ifunc-unaligned-ssse3.h"
+# include "ifunc-strcpy.h"
 
 libc_ifunc_redirected (__redirect_stpncpy, __stpncpy, IFUNC_SELECTOR ());
 
diff --git a/sysdeps/x86_64/multiarch/strcat-avx2.S b/sysdeps/x86_64/multiarch/strcat-avx2.S
new file mode 100644
index 0000000000..b062356427
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcat-avx2.S
@@ -0,0 +1,275 @@
+/* strcat with AVX2
+   Copyright (C) 2011-2018 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef STRCAT
+#  define STRCAT  __strcat_avx2
+# endif
+
+# define USE_AS_STRCAT
+
+/* Number of bytes in a vector register */
+# define VEC_SIZE	32
+
+	.section .text.avx,"ax",@progbits
+ENTRY (STRCAT)
+	mov	%rdi, %r9
+# ifdef USE_AS_STRNCAT
+	mov	%rdx, %r8
+# endif
+
+	xor	%eax, %eax
+	mov	%edi, %ecx
+	and	$((VEC_SIZE * 4) - 1), %ecx
+	vpxor	%xmm6, %xmm6, %xmm6
+	cmp	$(VEC_SIZE * 3), %ecx
+	ja	L(fourth_vector_boundary)
+	vpcmpeqb (%rdi), %ymm6, %ymm0
+	vpmovmskb %ymm0, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_first_vector)
+	mov	%rdi, %rax
+	and	$-VEC_SIZE, %rax
+	jmp	L(align_vec_size_start)
+L(fourth_vector_boundary):
+	mov	%rdi, %rax
+	and	$-VEC_SIZE, %rax
+	vpcmpeqb	(%rax), %ymm6, %ymm0
+	mov	$-1, %r10d
+	sub	%rax, %rcx
+	shl	%cl, %r10d
+	vpmovmskb %ymm0, %edx
+	and	%r10d, %edx
+	jnz	L(exit)
+
+L(align_vec_size_start):
+	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm0
+	vpmovmskb %ymm0, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_second_vector)
+
+	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
+	vpmovmskb %ymm1, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_third_vector)
+
+	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
+	vpmovmskb %ymm2, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_fourth_vector)
+
+	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
+	vpmovmskb %ymm3, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_fifth_vector)
+
+	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
+	add	$(VEC_SIZE * 4), %rax
+	vpmovmskb %ymm0, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_second_vector)
+
+	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
+	vpmovmskb %ymm1, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_third_vector)
+
+	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
+	vpmovmskb %ymm2, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_fourth_vector)
+
+	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
+	vpmovmskb %ymm3, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_fifth_vector)
+
+	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
+	add	$(VEC_SIZE * 4), %rax
+	vpmovmskb %ymm0, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_second_vector)
+
+	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
+	vpmovmskb %ymm1, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_third_vector)
+
+	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
+	vpmovmskb %ymm2, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_fourth_vector)
+
+	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
+	vpmovmskb %ymm3, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_fifth_vector)
+
+	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
+	add	$(VEC_SIZE * 4), %rax
+	vpmovmskb %ymm0, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_second_vector)
+
+	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
+	vpmovmskb %ymm1, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_third_vector)
+
+	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
+	vpmovmskb %ymm2, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_fourth_vector)
+
+	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
+	vpmovmskb %ymm3, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_fifth_vector)
+
+	test	$((VEC_SIZE * 4) - 1), %rax
+	jz	L(align_four_vec_loop)
+
+	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
+	add	$(VEC_SIZE * 5), %rax
+	vpmovmskb %ymm0, %edx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	test	$((VEC_SIZE * 4) - 1), %rax
+	jz	L(align_four_vec_loop)
+
+	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm1
+	add	$VEC_SIZE, %rax
+	vpmovmskb %ymm1, %edx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	test	$((VEC_SIZE * 4) - 1), %rax
+	jz	L(align_four_vec_loop)
+
+	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm2
+	add	$VEC_SIZE, %rax
+	vpmovmskb %ymm2, %edx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	test	$((VEC_SIZE * 4) - 1), %rax
+	jz	L(align_four_vec_loop)
+
+	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm3
+	add	$VEC_SIZE, %rax
+	vpmovmskb %ymm3, %edx
+	test	%edx, %edx
+	jnz	L(exit)
+
+	add	$VEC_SIZE, %rax
+
+	.p2align 4
+L(align_four_vec_loop):
+	vmovaps	(%rax),	%ymm4
+	vpminub	VEC_SIZE(%rax),	%ymm4, %ymm4
+	vmovaps	(VEC_SIZE * 2)(%rax),	%ymm5
+	vpminub	(VEC_SIZE * 3)(%rax),	%ymm5, %ymm5
+	add	$(VEC_SIZE * 4),	%rax
+	vpminub	%ymm4,	%ymm5, %ymm5
+	vpcmpeqb %ymm5,	%ymm6, %ymm5
+	vpmovmskb %ymm5,	%edx
+	test	%edx,	%edx
+	jz	L(align_four_vec_loop)
+
+	vpcmpeqb -(VEC_SIZE * 4)(%rax), %ymm6, %ymm0
+	sub	$(VEC_SIZE * 5),	%rax
+	vpmovmskb %ymm0, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_second_vector)
+
+	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
+	vpmovmskb %ymm1, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_third_vector)
+
+	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
+	vpmovmskb %ymm2, %edx
+	test	%edx, %edx
+	jnz	L(exit_null_on_fourth_vector)
+
+	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
+	vpmovmskb %ymm3, %edx
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$(VEC_SIZE * 4), %rax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit):
+	sub	%rdi, %rax
+L(exit_null_on_first_vector):
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_null_on_second_vector):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$VEC_SIZE, %rax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_null_on_third_vector):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$(VEC_SIZE * 2), %rax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_null_on_fourth_vector):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$(VEC_SIZE * 3), %rax
+	jmp	L(StartStrcpyPart)
+
+	.p2align 4
+L(exit_null_on_fifth_vector):
+	sub	%rdi, %rax
+	bsf	%rdx, %rdx
+	add	%rdx, %rax
+	add	$(VEC_SIZE * 4), %rax
+
+	.p2align 4
+L(StartStrcpyPart):
+	lea	(%r9, %rax), %rdi
+	mov	%rsi, %rcx
+	mov	%r9, %rax      /* save result */
+
+# ifdef USE_AS_STRNCAT
+	test	%r8, %r8
+	jz	L(ExitZero)
+#  define USE_AS_STRNCPY
+# endif
+
+# include "strcpy-avx2.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/strcat.c b/sysdeps/x86_64/multiarch/strcat.c
index a0341b9514..92c360afc4 100644
--- a/sysdeps/x86_64/multiarch/strcat.c
+++ b/sysdeps/x86_64/multiarch/strcat.c
@@ -24,7 +24,7 @@
 # undef strcat
 
 # define SYMBOL_NAME strcat
-# include "ifunc-unaligned-ssse3.h"
+# include "ifunc-strcpy.h"
 
 libc_ifunc_redirected (__redirect_strcat, strcat, IFUNC_SELECTOR ());
 
diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S
new file mode 100644
index 0000000000..81677f9060
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S
@@ -0,0 +1,1022 @@
+/* strcpy with AVX2
+   Copyright (C) 2011-2018 Free Software Foundation, Inc.
+   Contributed by Intel Corporation.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# ifndef USE_AS_STRCAT
+#  include <sysdep.h>
+
+#  ifndef STRCPY
+#   define STRCPY  __strcpy_avx2
+#  endif
+
+# endif
+
+/* Number of bytes in a vector register */
+# ifndef VEC_SIZE
+#  define VEC_SIZE	32
+# endif
+
+# ifndef VZEROUPPER
+#  define VZEROUPPER	vzeroupper
+# endif
+
+/* zero register */
+#define xmmZ	xmm0
+#define ymmZ	ymm0
+
+/* mask register */
+#define ymmM	ymm1
+
+# ifndef USE_AS_STRCAT
+
+	.section .text.avx,"ax",@progbits
+ENTRY (STRCPY)
+#  ifdef USE_AS_STRNCPY
+	mov	%rdx, %r8
+	test	%r8, %r8
+	jz	L(ExitZero)
+#  endif
+	mov	%rsi, %rcx
+#  ifndef USE_AS_STPCPY
+	mov	%rdi, %rax      /* save result */
+#  endif
+
+# endif
+
+	vpxor	%xmmZ, %xmmZ, %xmmZ
+
+	and	$((VEC_SIZE * 4) - 1), %ecx
+	cmp	$(VEC_SIZE * 2), %ecx
+	jbe	L(SourceStringAlignmentLessTwoVecSize)
+
+	and	$-VEC_SIZE, %rsi
+	and	$(VEC_SIZE - 1), %ecx
+
+	vpcmpeqb (%rsi), %ymmZ, %ymmM
+	vpmovmskb %ymmM, %edx
+	shr	%cl, %rdx
+
+# ifdef USE_AS_STRNCPY
+#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
+	mov	$VEC_SIZE, %r10
+	sub	%rcx, %r10
+	cmp	%r10, %r8
+#  else
+	mov	$(VEC_SIZE + 1), %r10
+	sub	%rcx, %r10
+	cmp	%r10, %r8
+#  endif
+	jbe	L(CopyVecSizeTailCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyVecSizeTail)
+
+	vpcmpeqb VEC_SIZE(%rsi), %ymmZ, %ymm2
+	vpmovmskb %ymm2, %edx
+
+# ifdef USE_AS_STRNCPY
+	add	$VEC_SIZE, %r10
+	cmp	%r10, %r8
+	jbe	L(CopyTwoVecSizeCase2OrCase3)
+# endif
+	test	%edx, %edx
+	jnz	L(CopyTwoVecSize)
+
+	vmovdqu (%rsi, %rcx), %ymm2   /* copy VEC_SIZE bytes */
+	vmovdqu %ymm2, (%rdi)
+
+/* If source address alignment != destination address alignment */
+	.p2align 4
+L(UnalignVecSizeBoth):
+	sub	%rcx, %rdi
+# ifdef USE_AS_STRNCPY
+	add	%rcx, %r8
+	sbb	%rcx, %rcx
+	or	%rcx, %r8
+# endif
+	mov	$VEC_SIZE, %rcx
+	vmovdqa (%rsi, %rcx), %ymm2
+	vmovdqu %ymm2, (%rdi, %rcx)
+	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
+	vpcmpeqb %ymm2, %ymmZ, %ymmM
+	vpmovmskb %ymmM, %edx
+	add	$VEC_SIZE, %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$(VEC_SIZE * 3), %r8
+	jbe	L(CopyVecSizeCase2OrCase3)
+# endif
+	test	%edx, %edx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	jnz	L(CopyVecSizeUnalignedVec2)
+# else
+	jnz	L(CopyVecSize)
+# endif
+
+	vmovdqu %ymm2, (%rdi, %rcx)
+	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm3
+	vpcmpeqb %ymm3, %ymmZ, %ymmM
+	vpmovmskb %ymmM, %edx
+	add	$VEC_SIZE, %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$VEC_SIZE, %r8
+	jbe	L(CopyVecSizeCase2OrCase3)
+# endif
+	test	%edx, %edx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	jnz	L(CopyVecSizeUnalignedVec3)
+# else
+	jnz	L(CopyVecSize)
+# endif
+
+	vmovdqu %ymm3, (%rdi, %rcx)
+	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm4
+	vpcmpeqb %ymm4, %ymmZ, %ymmM
+	vpmovmskb %ymmM, %edx
+	add	$VEC_SIZE, %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$VEC_SIZE, %r8
+	jbe	L(CopyVecSizeCase2OrCase3)
+# endif
+	test	%edx, %edx
+# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
+	jnz	L(CopyVecSizeUnalignedVec4)
+# else
+	jnz	L(CopyVecSize)
+# endif
+
+	vmovdqu %ymm4, (%rdi, %rcx)
+	vmovdqa VEC_SIZE(%rsi, %rcx), %ymm2
+	vpcmpeqb %ymm2, %ymmZ, %ymmM
+	vpmovmskb %ymmM, %edx
+	add	$VEC_SIZE, %rcx
+# ifdef USE_AS_STRNCPY
+	sub	$VEC_SIZE, %r8
+	jbe	L(CopyVecSizeCase2OrCase3)
+# endif
+	test	%edx, %edx
+# if defined USE_AS_STRNCPY &