aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNoah Goldstein <goldstein.w.n@gmail.com>2022-11-08 17:38:38 -0800
committerNoah Goldstein <goldstein.w.n@gmail.com>2022-11-08 19:22:33 -0800
commitf049f52dfeed8129c11ab1641a815705d09ff7e8 (patch)
treea6c13dc462411b308467b26a3a0f1062e0597bbd
parentd44e116428fefa0c2d01151af11f7a41fb525536 (diff)
downloadglibc-f049f52dfeed8129c11ab1641a815705d09ff7e8.tar.xz
glibc-f049f52dfeed8129c11ab1641a815705d09ff7e8.zip
x86: Optimize and shrink st{r|p}{n}{cat|cpy}-evex functions
Optimizations are: 1. Use more overlapping stores to avoid branches. 2. Reduce how unrolled the aligning copies are (this is more of a code-size save, it's a negative for some sizes in terms of perf). 3. Improve the loop a bit (similar to what we do in strlen with 2x vpminu + kortest instead of 3x vpminu + kmov + test). 4. For st{r|p}n{cat|cpy} re-order the branches to minimize the number that are taken. Performance Changes: Times are from N = 10 runs of the benchmark suite and are reported as geometric mean of all ratios of New Implementation / Old Implementation. stpcpy-evex -> 0.922 strcat-evex -> 0.985 strcpy-evex -> 0.880 strncpy-evex -> 0.831 stpncpy-evex -> 0.780 strncat-evex -> 0.958 Code Size Changes: function -> Bytes New / Bytes Old -> Ratio strcat-evex -> 819 / 1874 -> 0.437 strcpy-evex -> 700 / 1074 -> 0.652 stpcpy-evex -> 735 / 1094 -> 0.672 strncpy-evex -> 1397 / 2611 -> 0.535 stpncpy-evex -> 1489 / 2691 -> 0.553 strncat-evex -> 1184 / 2832 -> 0.418 Notes: 1. Because of the significant difference between the implementations they are split into three files. strcpy-evex.S -> strcpy, stpcpy, strcat strncpy-evex.S -> strncpy strncat-evex.S -> strncat I couldn't find a way to merge them without making the ifdefs incredibly difficult to follow. 2. All implementations can be made evex512 by including "x86-evex512-vecs.h" at the top. 3. All implementations have an optional define: `USE_EVEX_MASKED_STORE` Setting to one uses evex-masked stores for handling short strings. This saves code size and branches. It's disabled for all implementations at the moment as there are some serious drawbacks to masked stores in certain cases, but that may be fixed on future architectures. Full check passes on x86-64 and build succeeds for all ISA levels w/ and w/o multiarch.
-rw-r--r--sysdeps/x86_64/multiarch/stpncpy-evex.S5
-rw-r--r--sysdeps/x86_64/multiarch/strcat-evex.S291
-rw-r--r--sysdeps/x86_64/multiarch/strcat-strlen-evex.h.S110
-rw-r--r--sysdeps/x86_64/multiarch/strcpy-evex.S1282
-rw-r--r--sysdeps/x86_64/multiarch/strncat-evex.S525
-rw-r--r--sysdeps/x86_64/multiarch/strncpy-evex.S995
-rw-r--r--sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h80
7 files changed, 2115 insertions, 1173 deletions
diff --git a/sysdeps/x86_64/multiarch/stpncpy-evex.S b/sysdeps/x86_64/multiarch/stpncpy-evex.S
index 99ea76a372..3693491baa 100644
--- a/sysdeps/x86_64/multiarch/stpncpy-evex.S
+++ b/sysdeps/x86_64/multiarch/stpncpy-evex.S
@@ -3,6 +3,5 @@
#endif
#define USE_AS_STPCPY
-#define USE_AS_STRNCPY
-#define STRCPY STPNCPY
-#include "strcpy-evex.S"
+#define STRNCPY STPNCPY
+#include "strncpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strcat-evex.S b/sysdeps/x86_64/multiarch/strcat-evex.S
index 0e2df947e9..b4207b7889 100644
--- a/sysdeps/x86_64/multiarch/strcat-evex.S
+++ b/sysdeps/x86_64/multiarch/strcat-evex.S
@@ -1,286 +1,7 @@
-/* strcat with 256-bit EVEX instructions.
- Copyright (C) 2021-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#include <isa-level.h>
-
-#if ISA_SHOULD_BUILD (4)
-
-
-# include <sysdep.h>
-
-# ifndef STRCAT
-# define STRCAT __strcat_evex
-# endif
-
-# define VMOVU vmovdqu64
-# define VMOVA vmovdqa64
-
-/* zero register */
-# define XMMZERO xmm16
-# define YMMZERO ymm16
-# define YMM0 ymm17
-# define YMM1 ymm18
-
-# define USE_AS_STRCAT
-
-/* Number of bytes in a vector register */
-# define VEC_SIZE 32
-
- .section .text.evex,"ax",@progbits
-ENTRY (STRCAT)
- mov %rdi, %r9
-# ifdef USE_AS_STRNCAT
- mov %rdx, %r8
-# endif
-
- xor %eax, %eax
- mov %edi, %ecx
- and $((VEC_SIZE * 4) - 1), %ecx
- vpxorq %XMMZERO, %XMMZERO, %XMMZERO
- cmp $(VEC_SIZE * 3), %ecx
- ja L(fourth_vector_boundary)
- vpcmpb $0, (%rdi), %YMMZERO, %k0
- kmovd %k0, %edx
- test %edx, %edx
- jnz L(exit_null_on_first_vector)
- mov %rdi, %rax
- and $-VEC_SIZE, %rax
- jmp L(align_vec_size_start)
-L(fourth_vector_boundary):
- mov %rdi, %rax
- and $-VEC_SIZE, %rax
- vpcmpb $0, (%rax), %YMMZERO, %k0
- mov $-1, %r10d
- sub %rax, %rcx
- shl %cl, %r10d
- kmovd %k0, %edx
- and %r10d, %edx
- jnz L(exit)
-
-L(align_vec_size_start):
- vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0
- kmovd %k0, %edx
- test %edx, %edx
- jnz L(exit_null_on_second_vector)
-
- vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
- kmovd %k1, %edx
- test %edx, %edx
- jnz L(exit_null_on_third_vector)
-
- vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
- kmovd %k2, %edx
- test %edx, %edx
- jnz L(exit_null_on_fourth_vector)
-
- vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
- kmovd %k3, %edx
- test %edx, %edx
- jnz L(exit_null_on_fifth_vector)
-
- vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
- add $(VEC_SIZE * 4), %rax
- kmovd %k4, %edx
- test %edx, %edx
- jnz L(exit_null_on_second_vector)
-
- vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
- kmovd %k1, %edx
- test %edx, %edx
- jnz L(exit_null_on_third_vector)
-
- vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
- kmovd %k2, %edx
- test %edx, %edx
- jnz L(exit_null_on_fourth_vector)
-
- vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
- kmovd %k3, %edx
- test %edx, %edx
- jnz L(exit_null_on_fifth_vector)
-
- vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
- kmovd %k4, %edx
- add $(VEC_SIZE * 4), %rax
- test %edx, %edx
- jnz L(exit_null_on_second_vector)
-
- vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
- kmovd %k1, %edx
- test %edx, %edx
- jnz L(exit_null_on_third_vector)
-
- vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
- kmovd %k2, %edx
- test %edx, %edx
- jnz L(exit_null_on_fourth_vector)
-
- vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
- kmovd %k3, %edx
- test %edx, %edx
- jnz L(exit_null_on_fifth_vector)
-
- vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
- add $(VEC_SIZE * 4), %rax
- kmovd %k4, %edx
- test %edx, %edx
- jnz L(exit_null_on_second_vector)
-
- vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
- kmovd %k1, %edx
- test %edx, %edx
- jnz L(exit_null_on_third_vector)
-
- vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
- kmovd %k2, %edx
- test %edx, %edx
- jnz L(exit_null_on_fourth_vector)
-
- vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
- kmovd %k3, %edx
- test %edx, %edx
- jnz L(exit_null_on_fifth_vector)
-
- test $((VEC_SIZE * 4) - 1), %rax
- jz L(align_four_vec_loop)
-
- vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4
- add $(VEC_SIZE * 5), %rax
- kmovd %k4, %edx
- test %edx, %edx
- jnz L(exit)
-
- test $((VEC_SIZE * 4) - 1), %rax
- jz L(align_four_vec_loop)
-
- vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0
- add $VEC_SIZE, %rax
- kmovd %k0, %edx
- test %edx, %edx
- jnz L(exit)
-
- test $((VEC_SIZE * 4) - 1), %rax
- jz L(align_four_vec_loop)
-
- vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0
- add $VEC_SIZE, %rax
- kmovd %k0, %edx
- test %edx, %edx
- jnz L(exit)
-
- test $((VEC_SIZE * 4) - 1), %rax
- jz L(align_four_vec_loop)
-
- vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k1
- add $VEC_SIZE, %rax
- kmovd %k1, %edx
- test %edx, %edx
- jnz L(exit)
-
- add $VEC_SIZE, %rax
-
- .p2align 4
-L(align_four_vec_loop):
- VMOVA (%rax), %YMM0
- VMOVA (VEC_SIZE * 2)(%rax), %YMM1
- vpminub VEC_SIZE(%rax), %YMM0, %YMM0
- vpminub (VEC_SIZE * 3)(%rax), %YMM1, %YMM1
- vpminub %YMM0, %YMM1, %YMM0
- /* If K0 != 0, there is a null byte. */
- vpcmpb $0, %YMM0, %YMMZERO, %k0
- add $(VEC_SIZE * 4), %rax
- ktestd %k0, %k0
- jz L(align_four_vec_loop)
-
- vpcmpb $0, -(VEC_SIZE * 4)(%rax), %YMMZERO, %k0
- sub $(VEC_SIZE * 5), %rax
- kmovd %k0, %edx
- test %edx, %edx
- jnz L(exit_null_on_second_vector)
-
- vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1
- kmovd %k1, %edx
- test %edx, %edx
- jnz L(exit_null_on_third_vector)
-
- vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2
- kmovd %k2, %edx
- test %edx, %edx
- jnz L(exit_null_on_fourth_vector)
-
- vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3
- kmovd %k3, %edx
- sub %rdi, %rax
- bsf %rdx, %rdx
- add %rdx, %rax
- add $(VEC_SIZE * 4), %rax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit):
- sub %rdi, %rax
-L(exit_null_on_first_vector):
- bsf %rdx, %rdx
- add %rdx, %rax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_null_on_second_vector):
- sub %rdi, %rax
- bsf %rdx, %rdx
- add %rdx, %rax
- add $VEC_SIZE, %rax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_null_on_third_vector):
- sub %rdi, %rax
- bsf %rdx, %rdx
- add %rdx, %rax
- add $(VEC_SIZE * 2), %rax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_null_on_fourth_vector):
- sub %rdi, %rax
- bsf %rdx, %rdx
- add %rdx, %rax
- add $(VEC_SIZE * 3), %rax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_null_on_fifth_vector):
- sub %rdi, %rax
- bsf %rdx, %rdx
- add %rdx, %rax
- add $(VEC_SIZE * 4), %rax
-
- .p2align 4
-L(StartStrcpyPart):
- lea (%r9, %rax), %rdi
- mov %rsi, %rcx
- mov %r9, %rax /* save result */
-
-# ifdef USE_AS_STRNCAT
- test %r8, %r8
- jz L(ExitZero)
-# define USE_AS_STRNCPY
-# endif
-
-# include "strcpy-evex.S"
+#ifndef STRCAT
+# define STRCAT __strcat_evex
#endif
+
+#define USE_AS_STRCAT
+#define STRCPY STRCAT
+#include "strcpy-evex.S"
diff --git a/sysdeps/x86_64/multiarch/strcat-strlen-evex.h.S b/sysdeps/x86_64/multiarch/strcat-strlen-evex.h.S
new file mode 100644
index 0000000000..9530d7b683
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcat-strlen-evex.h.S
@@ -0,0 +1,110 @@
+/* strlen used for begining of str{n}cat using EVEX 256/512.
+ Copyright (C) 2011-2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+
+/* NOTE: This file is meant to be included by strcat-evex or
+ strncat-evex and does not standalone. Before including %rdi
+ must be saved in %rax. */
+
+
+/* Simple strlen implementation that ends at
+ L(strcat_strlen_done). */
+ vpxorq %VZERO_128, %VZERO_128, %VZERO_128
+ movq %rdi, %r8
+ andq $(VEC_SIZE * -1), %r8
+ VPCMPEQ (%r8), %VZERO, %k0
+ KMOV %k0, %VRCX
+#ifdef USE_AS_WCSCPY
+ subl %r8d, %edi
+ shrl $2, %edi
+#endif
+ shrx %VRDI, %VRCX, %VRCX
+#ifdef USE_AS_WCSCPY
+ movq %rax, %rdi
+#endif
+ test %VRCX, %VRCX
+ jnz L(bsf_and_done_v0)
+
+
+ VPCMPEQ VEC_SIZE(%r8), %VZERO, %k0
+ KMOV %k0, %VRCX
+ leaq (VEC_SIZE)(%r8), %rdi
+ test %VRCX, %VRCX
+ jnz L(bsf_and_done_v0)
+
+ VPCMPEQ (VEC_SIZE * 2)(%r8), %VZERO, %k0
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ jnz L(bsf_and_done_v1)
+
+ VPCMPEQ (VEC_SIZE * 3)(%r8), %VZERO, %k0
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ jnz L(bsf_and_done_v2)
+
+ VPCMPEQ (VEC_SIZE * 4)(%r8), %VZERO, %k0
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ jnz L(bsf_and_done_v3)
+
+ andq $-(VEC_SIZE * 4), %rdi
+ .p2align 4,, 8
+L(loop_2x_vec):
+ VMOVA (VEC_SIZE * 4)(%rdi), %VMM(0)
+ VPMIN (VEC_SIZE * 5)(%rdi), %VMM(0), %VMM(1)
+ VMOVA (VEC_SIZE * 6)(%rdi), %VMM(2)
+ VPMIN (VEC_SIZE * 7)(%rdi), %VMM(2), %VMM(3)
+ VPTESTN %VMM(1), %VMM(1), %k1
+ VPTESTN %VMM(3), %VMM(3), %k3
+ subq $(VEC_SIZE * -4), %rdi
+ KORTEST %k1, %k3
+ jz L(loop_2x_vec)
+
+ VPTESTN %VMM(0), %VMM(0), %k0
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ jnz L(bsf_and_done_v0)
+
+ KMOV %k1, %VRCX
+ test %VRCX, %VRCX
+ jnz L(bsf_and_done_v1)
+
+ VPTESTN %VMM(2), %VMM(2), %k0
+ KMOV %k0, %VRCX
+ test %VRCX, %VRCX
+ jnz L(bsf_and_done_v2)
+
+ KMOV %k3, %VRCX
+L(bsf_and_done_v3):
+ addq $VEC_SIZE, %rdi
+L(bsf_and_done_v2):
+ bsf %VRCX, %VRCX
+ leaq (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rdi
+ jmp L(strcat_strlen_done)
+
+ .p2align 4,, 4
+L(bsf_and_done_v1):
+ addq $VEC_SIZE, %rdi
+L(bsf_and_done_v0):
+ bsf %VRCX, %VRCX
+#ifdef USE_AS_WCSCPY
+ leaq (%rdi, %rcx, CHAR_SIZE), %rdi
+#else
+ addq %rcx, %rdi
+#endif
+L(strcat_strlen_done):
diff --git a/sysdeps/x86_64/multiarch/strcpy-evex.S b/sysdeps/x86_64/multiarch/strcpy-evex.S
index 82e45ac675..932129ab40 100644
--- a/sysdeps/x86_64/multiarch/strcpy-evex.S
+++ b/sysdeps/x86_64/multiarch/strcpy-evex.S
@@ -1,4 +1,4 @@
-/* strcpy with 256-bit EVEX instructions.
+/* {wcs|wcp|str|stp}cpy with 256/512-bit EVEX instructions.
Copyright (C) 2021-2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -17,990 +17,526 @@
<https://www.gnu.org/licenses/>. */
#include <isa-level.h>
-
#if ISA_SHOULD_BUILD (4)
-# ifndef USE_AS_STRCAT
-# include <sysdep.h>
+ /* Use evex-masked stores for small sizes. Turned off at the
+ moment. */
+# define USE_EVEX_MASKED_STORE 0
+ /* Use movsb in page cross case to save code size. */
+# define USE_MOVSB_IN_PAGE_CROSS 1
-# ifndef STRCPY
-# define STRCPY __strcpy_evex
-# endif
+# include <sysdep.h>
+# ifndef VEC_SIZE
+# include "x86-evex256-vecs.h"
# endif
-# define VMOVU vmovdqu64
-# define VMOVA vmovdqa64
-
-/* Number of bytes in a vector register */
-# ifndef VEC_SIZE
-# define VEC_SIZE 32
+# ifndef STRCPY
+# define STRCPY __strcpy_evex
# endif
-# define XMM2 xmm18
-# define XMM3 xmm19
-# define YMM2 ymm18
-# define YMM3 ymm19
-# define YMM4 ymm20
-# define YMM5 ymm21
-# define YMM6 ymm22
-# define YMM7 ymm23
+# ifdef USE_AS_WCSCPY
+# define VMOVU_MASK vmovdqu32
+# define VPMIN vpminud
+# define VPTESTN vptestnmd
+# define VPTEST vptestmd
+# define VPCMPEQ vpcmpeqd
+# define CHAR_SIZE 4
-# ifndef USE_AS_STRCAT
+# define REP_MOVS rep movsd
-/* zero register */
-# define XMMZERO xmm16
-# define YMMZERO ymm16
-# define YMM1 ymm17
-
- .section .text.evex,"ax",@progbits
-ENTRY (STRCPY)
-# ifdef USE_AS_STRNCPY
- mov %RDX_LP, %R8_LP
- test %R8_LP, %R8_LP
- jz L(ExitZero)
-# endif
- mov %rsi, %rcx
-# ifndef USE_AS_STPCPY
- mov %rdi, %rax /* save result */
-# endif
+# define USE_WIDE_CHAR
+# else
+# define VMOVU_MASK vmovdqu8
+# define VPMIN vpminub
+# define VPTESTN vptestnmb
+# define VPTEST vptestmb
+# define VPCMPEQ vpcmpeqb
+# define CHAR_SIZE 1
- vpxorq %XMMZERO, %XMMZERO, %XMMZERO
+# define REP_MOVS rep movsb
# endif
- and $((VEC_SIZE * 4) - 1), %ecx
- cmp $(VEC_SIZE * 2), %ecx
- jbe L(SourceStringAlignmentLessTwoVecSize)
-
- and $-VEC_SIZE, %rsi
- and $(VEC_SIZE - 1), %ecx
-
- vpcmpb $0, (%rsi), %YMMZERO, %k0
- kmovd %k0, %edx
- shr %cl, %rdx
+# include "reg-macros.h"
-# ifdef USE_AS_STRNCPY
-# if defined USE_AS_STPCPY || defined USE_AS_STRCAT
- mov $VEC_SIZE, %r10
- sub %rcx, %r10
- cmp %r10, %r8
-# else
- mov $(VEC_SIZE + 1), %r10
- sub %rcx, %r10
- cmp %r10, %r8
-# endif
- jbe L(CopyVecSizeTailCase2OrCase3)
-# endif
- test %edx, %edx
- jnz L(CopyVecSizeTail)
-
- vpcmpb $0, VEC_SIZE(%rsi), %YMMZERO, %k1
- kmovd %k1, %edx
-# ifdef USE_AS_STRNCPY
- add $VEC_SIZE, %r10
- cmp %r10, %r8
- jbe L(CopyTwoVecSizeCase2OrCase3)
-# endif
- test %edx, %edx
- jnz L(CopyTwoVecSize)
-
- VMOVU (%rsi, %rcx), %YMM2 /* copy VEC_SIZE bytes */
- VMOVU %YMM2, (%rdi)
-
-/* If source address alignment != destination address alignment */
- .p2align 4
-L(UnalignVecSizeBoth):
- sub %rcx, %rdi
-# ifdef USE_AS_STRNCPY
- add %rcx, %r8
- sbb %rcx, %rcx
- or %rcx, %r8
-# endif
- mov $VEC_SIZE, %rcx
- VMOVA (%rsi, %rcx), %YMM2
- VMOVU %YMM2, (%rdi, %rcx)
- VMOVA VEC_SIZE(%rsi, %rcx), %YMM2
- vpcmpb $0, %YMM2, %YMMZERO, %k0
- kmovd %k0, %edx
- add $VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
- sub $(VEC_SIZE * 3), %r8
- jbe L(CopyVecSizeCase2OrCase3)
-# endif
- test %edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyVecSizeUnalignedVec2)
+# ifdef USE_AS_STPCPY
+# define END_REG rax
# else
- jnz L(CopyVecSize)
+# define END_REG rdi, %rdx, CHAR_SIZE
# endif
- VMOVU %YMM2, (%rdi, %rcx)
- VMOVA VEC_SIZE(%rsi, %rcx), %YMM3
- vpcmpb $0, %YMM3, %YMMZERO, %k0
- kmovd %k0, %edx
- add $VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
- sub $VEC_SIZE, %r8
- jbe L(CopyVecSizeCase2OrCase3)
-# endif
- test %edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyVecSizeUnalignedVec3)
+# ifdef USE_AS_STRCAT
+# define PAGE_ALIGN_REG edx
+# define PAGE_ALIGN_REG_64 rdx
# else
- jnz L(CopyVecSize)
+# define PAGE_ALIGN_REG eax
+# define PAGE_ALIGN_REG_64 rax
# endif
- VMOVU %YMM3, (%rdi, %rcx)
- VMOVA VEC_SIZE(%rsi, %rcx), %YMM4
- vpcmpb $0, %YMM4, %YMMZERO, %k0
- kmovd %k0, %edx
- add $VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
- sub $VEC_SIZE, %r8
- jbe L(CopyVecSizeCase2OrCase3)
-# endif
- test %edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyVecSizeUnalignedVec4)
-# else
- jnz L(CopyVecSize)
-# endif
+# define VZERO VMM(7)
+# define VZERO_128 VMM_128(7)
- VMOVU %YMM4, (%rdi, %rcx)
- VMOVA VEC_SIZE(%rsi, %rcx), %YMM2
- vpcmpb $0, %YMM2, %YMMZERO, %k0
- kmovd %k0, %edx
- add $VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
- sub $VEC_SIZE, %r8
- jbe L(CopyVecSizeCase2OrCase3)
-# endif
- test %edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyVecSizeUnalignedVec2)
-# else
- jnz L(CopyVecSize)
-# endif
- VMOVU %YMM2, (%rdi, %rcx)
- VMOVA VEC_SIZE(%rsi, %rcx), %YMM2
- vpcmpb $0, %YMM2, %YMMZERO, %k0
- kmovd %k0, %edx
- add $VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
- sub $VEC_SIZE, %r8
- jbe L(CopyVecSizeCase2OrCase3)
-# endif
- test %edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyVecSizeUnalignedVec2)
-# else
- jnz L(CopyVecSize)
-# endif
+# define PAGE_SIZE 4096
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
- VMOVA VEC_SIZE(%rsi, %rcx), %YMM3
- VMOVU %YMM2, (%rdi, %rcx)
- vpcmpb $0, %YMM3, %YMMZERO, %k0
- kmovd %k0, %edx
- add $VEC_SIZE, %rcx
-# ifdef USE_AS_STRNCPY
- sub $VEC_SIZE, %r8
- jbe L(CopyVecSizeCase2OrCase3)
-# endif
- test %edx, %edx
-# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
- jnz L(CopyVecSizeUnalignedVec3)
-# else
- jnz L(CopyVecSize)
-# endif
- VMOVU %YMM3, (%rdi, %rcx)
- mov %rsi, %rdx
- lea VEC_SIZE(%rsi, %rcx), %rsi
- and $-(VEC_SIZE * 4), %rsi
- sub %rsi, %rdx
- sub %rdx, %rdi
-# ifdef USE_AS_STRNCPY
- lea (VEC_SIZE * 8)(%r8, %rdx), %r8
-# endif
-L(UnalignedFourVecSizeLoop):
- VMOVA (%rsi), %YMM4
- VMOVA VEC_SIZE(%rsi), %YMM5
- VMOVA (VEC_SIZE * 2)(%rsi), %YMM6
- VMOVA (VEC_SIZE * 3)(%rsi), %YMM7
- vpminub %YMM5, %YMM4, %YMM2
- vpminub %YMM7, %YMM6, %YMM3
- vpminub %YMM2, %YMM3, %YMM2
- /* If K7 != 0, there is a null byte. */
- vpcmpb $0, %YMM2, %YMMZERO, %k7
- kmovd %k7, %edx
-# ifdef USE_AS_STRNCPY
- sub $(VEC_SIZE * 4), %r8
- jbe L(UnalignedLeaveCase2OrCase3)
+ .section SECTION(.text), "ax", @progbits
+ENTRY(STRCPY)
+# ifdef USE_AS_STRCAT
+ movq %rdi, %rax
+# include "strcat-strlen-evex.h.S"
# endif
- test %edx, %edx
- jnz L(UnalignedFourVecSizeLeave)
-
-L(UnalignedFourVecSizeLoop_start):
- add $(VEC_SIZE * 4), %rdi
- add $(VEC_SIZE * 4), %rsi
- VMOVU %YMM4, -(VEC_SIZE * 4)(%rdi)
- VMOVA (%rsi), %YMM4
- VMOVU %YMM5, -(VEC_SIZE * 3)(%rdi)
- VMOVA VEC_SIZE(%rsi), %YMM5
- vpminub %YMM5, %YMM4, %YMM2
- VMOVU %YMM6, -(VEC_SIZE * 2)(%rdi)
- VMOVA (VEC_SIZE * 2)(%rsi), %YMM6
- VMOVU %YMM7, -VEC_SIZE(%rdi)
- VMOVA (VEC_SIZE * 3)(%rsi), %YMM7
- vpminub %YMM7, %YMM6, %YMM3
- vpminub %YMM2, %YMM3, %YMM2
- /* If K7 != 0, there is a null byte. */
- vpcmpb $0, %YMM2, %YMMZERO, %k7
- kmovd %k7, %edx
-# ifdef USE_AS_STRNCPY
- sub $(VEC_SIZE * 4), %r8