diff options
| author | Noah Goldstein <goldstein.w.n@gmail.com> | 2022-11-08 17:38:38 -0800 |
|---|---|---|
| committer | Noah Goldstein <goldstein.w.n@gmail.com> | 2022-11-08 19:22:33 -0800 |
| commit | f049f52dfeed8129c11ab1641a815705d09ff7e8 (patch) | |
| tree | a6c13dc462411b308467b26a3a0f1062e0597bbd | |
| parent | d44e116428fefa0c2d01151af11f7a41fb525536 (diff) | |
| download | glibc-f049f52dfeed8129c11ab1641a815705d09ff7e8.tar.xz glibc-f049f52dfeed8129c11ab1641a815705d09ff7e8.zip | |
x86: Optimize and shrink st{r|p}{n}{cat|cpy}-evex functions
Optimizations are:
1. Use more overlapping stores to avoid branches.
2. Reduce how unrolled the aligning copies are (this is more of a
code-size save, it's a negative for some sizes in terms of
perf).
3. Improve the loop a bit (similar to what we do in strlen with
2x vpminu + kortest instead of 3x vpminu + kmov + test).
4. For st{r|p}n{cat|cpy} re-order the branches to minimize the
number that are taken.
Performance Changes:
Times are from N = 10 runs of the benchmark suite and are
reported as geometric mean of all ratios of
New Implementation / Old Implementation.
stpcpy-evex -> 0.922
strcat-evex -> 0.985
strcpy-evex -> 0.880
strncpy-evex -> 0.831
stpncpy-evex -> 0.780
strncat-evex -> 0.958
Code Size Changes:
function -> Bytes New / Bytes Old -> Ratio
strcat-evex -> 819 / 1874 -> 0.437
strcpy-evex -> 700 / 1074 -> 0.652
stpcpy-evex -> 735 / 1094 -> 0.672
strncpy-evex -> 1397 / 2611 -> 0.535
stpncpy-evex -> 1489 / 2691 -> 0.553
strncat-evex -> 1184 / 2832 -> 0.418
Notes:
1. Because of the significant difference between the
implementations they are split into three files.
strcpy-evex.S -> strcpy, stpcpy, strcat
strncpy-evex.S -> strncpy
strncat-evex.S -> strncat
I couldn't find a way to merge them without making the
ifdefs incredibly difficult to follow.
2. All implementations can be made evex512 by including
"x86-evex512-vecs.h" at the top.
3. All implementations have an optional define:
`USE_EVEX_MASKED_STORE`
Setting to one uses evex-masked stores for handling short
strings. This saves code size and branches. It's disabled
for all implementations at the moment as there are some
serious drawbacks to masked stores in certain cases, but
that may be fixed on future architectures.
Full check passes on x86-64 and build succeeds for all ISA levels w/
and w/o multiarch.
| -rw-r--r-- | sysdeps/x86_64/multiarch/stpncpy-evex.S | 5 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/strcat-evex.S | 291 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/strcat-strlen-evex.h.S | 110 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/strcpy-evex.S | 1282 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/strncat-evex.S | 525 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/strncpy-evex.S | 995 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/strncpy-or-cat-overflow-def.h | 80 |
7 files changed, 2115 insertions, 1173 deletions
diff --git a/sysdeps/x86_64/multiarch/stpncpy-evex.S b/sysdeps/x86_64/multiarch/stpncpy-evex.S index 99ea76a372..3693491baa 100644 --- a/sysdeps/x86_64/multiarch/stpncpy-evex.S +++ b/sysdeps/x86_64/multiarch/stpncpy-evex.S @@ -3,6 +3,5 @@ #endif #define USE_AS_STPCPY -#define USE_AS_STRNCPY -#define STRCPY STPNCPY -#include "strcpy-evex.S" +#define STRNCPY STPNCPY +#include "strncpy-evex.S" diff --git a/sysdeps/x86_64/multiarch/strcat-evex.S b/sysdeps/x86_64/multiarch/strcat-evex.S index 0e2df947e9..b4207b7889 100644 --- a/sysdeps/x86_64/multiarch/strcat-evex.S +++ b/sysdeps/x86_64/multiarch/strcat-evex.S @@ -1,286 +1,7 @@ -/* strcat with 256-bit EVEX instructions. - Copyright (C) 2021-2022 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. 
*/ - -#include <isa-level.h> - -#if ISA_SHOULD_BUILD (4) - - -# include <sysdep.h> - -# ifndef STRCAT -# define STRCAT __strcat_evex -# endif - -# define VMOVU vmovdqu64 -# define VMOVA vmovdqa64 - -/* zero register */ -# define XMMZERO xmm16 -# define YMMZERO ymm16 -# define YMM0 ymm17 -# define YMM1 ymm18 - -# define USE_AS_STRCAT - -/* Number of bytes in a vector register */ -# define VEC_SIZE 32 - - .section .text.evex,"ax",@progbits -ENTRY (STRCAT) - mov %rdi, %r9 -# ifdef USE_AS_STRNCAT - mov %rdx, %r8 -# endif - - xor %eax, %eax - mov %edi, %ecx - and $((VEC_SIZE * 4) - 1), %ecx - vpxorq %XMMZERO, %XMMZERO, %XMMZERO - cmp $(VEC_SIZE * 3), %ecx - ja L(fourth_vector_boundary) - vpcmpb $0, (%rdi), %YMMZERO, %k0 - kmovd %k0, %edx - test %edx, %edx - jnz L(exit_null_on_first_vector) - mov %rdi, %rax - and $-VEC_SIZE, %rax - jmp L(align_vec_size_start) -L(fourth_vector_boundary): - mov %rdi, %rax - and $-VEC_SIZE, %rax - vpcmpb $0, (%rax), %YMMZERO, %k0 - mov $-1, %r10d - sub %rax, %rcx - shl %cl, %r10d - kmovd %k0, %edx - and %r10d, %edx - jnz L(exit) - -L(align_vec_size_start): - vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0 - kmovd %k0, %edx - test %edx, %edx - jnz L(exit_null_on_second_vector) - - vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 - kmovd %k1, %edx - test %edx, %edx - jnz L(exit_null_on_third_vector) - - vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 - kmovd %k2, %edx - test %edx, %edx - jnz L(exit_null_on_fourth_vector) - - vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 - kmovd %k3, %edx - test %edx, %edx - jnz L(exit_null_on_fifth_vector) - - vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 - add $(VEC_SIZE * 4), %rax - kmovd %k4, %edx - test %edx, %edx - jnz L(exit_null_on_second_vector) - - vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 - kmovd %k1, %edx - test %edx, %edx - jnz L(exit_null_on_third_vector) - - vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 - kmovd %k2, %edx - test %edx, %edx - jnz L(exit_null_on_fourth_vector) - - vpcmpb $0, 
(VEC_SIZE * 4)(%rax), %YMMZERO, %k3 - kmovd %k3, %edx - test %edx, %edx - jnz L(exit_null_on_fifth_vector) - - vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 - kmovd %k4, %edx - add $(VEC_SIZE * 4), %rax - test %edx, %edx - jnz L(exit_null_on_second_vector) - - vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 - kmovd %k1, %edx - test %edx, %edx - jnz L(exit_null_on_third_vector) - - vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 - kmovd %k2, %edx - test %edx, %edx - jnz L(exit_null_on_fourth_vector) - - vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 - kmovd %k3, %edx - test %edx, %edx - jnz L(exit_null_on_fifth_vector) - - vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 - add $(VEC_SIZE * 4), %rax - kmovd %k4, %edx - test %edx, %edx - jnz L(exit_null_on_second_vector) - - vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 - kmovd %k1, %edx - test %edx, %edx - jnz L(exit_null_on_third_vector) - - vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 - kmovd %k2, %edx - test %edx, %edx - jnz L(exit_null_on_fourth_vector) - - vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 - kmovd %k3, %edx - test %edx, %edx - jnz L(exit_null_on_fifth_vector) - - test $((VEC_SIZE * 4) - 1), %rax - jz L(align_four_vec_loop) - - vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 - add $(VEC_SIZE * 5), %rax - kmovd %k4, %edx - test %edx, %edx - jnz L(exit) - - test $((VEC_SIZE * 4) - 1), %rax - jz L(align_four_vec_loop) - - vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0 - add $VEC_SIZE, %rax - kmovd %k0, %edx - test %edx, %edx - jnz L(exit) - - test $((VEC_SIZE * 4) - 1), %rax - jz L(align_four_vec_loop) - - vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0 - add $VEC_SIZE, %rax - kmovd %k0, %edx - test %edx, %edx - jnz L(exit) - - test $((VEC_SIZE * 4) - 1), %rax - jz L(align_four_vec_loop) - - vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k1 - add $VEC_SIZE, %rax - kmovd %k1, %edx - test %edx, %edx - jnz L(exit) - - add $VEC_SIZE, %rax - - .p2align 4 -L(align_four_vec_loop): - VMOVA (%rax), %YMM0 - VMOVA (VEC_SIZE * 2)(%rax), 
%YMM1 - vpminub VEC_SIZE(%rax), %YMM0, %YMM0 - vpminub (VEC_SIZE * 3)(%rax), %YMM1, %YMM1 - vpminub %YMM0, %YMM1, %YMM0 - /* If K0 != 0, there is a null byte. */ - vpcmpb $0, %YMM0, %YMMZERO, %k0 - add $(VEC_SIZE * 4), %rax - ktestd %k0, %k0 - jz L(align_four_vec_loop) - - vpcmpb $0, -(VEC_SIZE * 4)(%rax), %YMMZERO, %k0 - sub $(VEC_SIZE * 5), %rax - kmovd %k0, %edx - test %edx, %edx - jnz L(exit_null_on_second_vector) - - vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 - kmovd %k1, %edx - test %edx, %edx - jnz L(exit_null_on_third_vector) - - vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 - kmovd %k2, %edx - test %edx, %edx - jnz L(exit_null_on_fourth_vector) - - vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 - kmovd %k3, %edx - sub %rdi, %rax - bsf %rdx, %rdx - add %rdx, %rax - add $(VEC_SIZE * 4), %rax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit): - sub %rdi, %rax -L(exit_null_on_first_vector): - bsf %rdx, %rdx - add %rdx, %rax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_null_on_second_vector): - sub %rdi, %rax - bsf %rdx, %rdx - add %rdx, %rax - add $VEC_SIZE, %rax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_null_on_third_vector): - sub %rdi, %rax - bsf %rdx, %rdx - add %rdx, %rax - add $(VEC_SIZE * 2), %rax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_null_on_fourth_vector): - sub %rdi, %rax - bsf %rdx, %rdx - add %rdx, %rax - add $(VEC_SIZE * 3), %rax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_null_on_fifth_vector): - sub %rdi, %rax - bsf %rdx, %rdx - add %rdx, %rax - add $(VEC_SIZE * 4), %rax - - .p2align 4 -L(StartStrcpyPart): - lea (%r9, %rax), %rdi - mov %rsi, %rcx - mov %r9, %rax /* save result */ - -# ifdef USE_AS_STRNCAT - test %r8, %r8 - jz L(ExitZero) -# define USE_AS_STRNCPY -# endif - -# include "strcpy-evex.S" +#ifndef STRCAT +# define STRCAT __strcat_evex #endif + +#define USE_AS_STRCAT +#define STRCPY STRCAT +#include "strcpy-evex.S" diff --git a/sysdeps/x86_64/multiarch/strcat-strlen-evex.h.S 
b/sysdeps/x86_64/multiarch/strcat-strlen-evex.h.S new file mode 100644 index 0000000000..9530d7b683 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcat-strlen-evex.h.S @@ -0,0 +1,110 @@ +/* strlen used for begining of str{n}cat using EVEX 256/512. + Copyright (C) 2011-2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + + +/* NOTE: This file is meant to be included by strcat-evex or + strncat-evex and does not standalone. Before including %rdi + must be saved in %rax. */ + + +/* Simple strlen implementation that ends at + L(strcat_strlen_done). 
*/ + vpxorq %VZERO_128, %VZERO_128, %VZERO_128 + movq %rdi, %r8 + andq $(VEC_SIZE * -1), %r8 + VPCMPEQ (%r8), %VZERO, %k0 + KMOV %k0, %VRCX +#ifdef USE_AS_WCSCPY + subl %r8d, %edi + shrl $2, %edi +#endif + shrx %VRDI, %VRCX, %VRCX +#ifdef USE_AS_WCSCPY + movq %rax, %rdi +#endif + test %VRCX, %VRCX + jnz L(bsf_and_done_v0) + + + VPCMPEQ VEC_SIZE(%r8), %VZERO, %k0 + KMOV %k0, %VRCX + leaq (VEC_SIZE)(%r8), %rdi + test %VRCX, %VRCX + jnz L(bsf_and_done_v0) + + VPCMPEQ (VEC_SIZE * 2)(%r8), %VZERO, %k0 + KMOV %k0, %VRCX + test %VRCX, %VRCX + jnz L(bsf_and_done_v1) + + VPCMPEQ (VEC_SIZE * 3)(%r8), %VZERO, %k0 + KMOV %k0, %VRCX + test %VRCX, %VRCX + jnz L(bsf_and_done_v2) + + VPCMPEQ (VEC_SIZE * 4)(%r8), %VZERO, %k0 + KMOV %k0, %VRCX + test %VRCX, %VRCX + jnz L(bsf_and_done_v3) + + andq $-(VEC_SIZE * 4), %rdi + .p2align 4,, 8 +L(loop_2x_vec): + VMOVA (VEC_SIZE * 4)(%rdi), %VMM(0) + VPMIN (VEC_SIZE * 5)(%rdi), %VMM(0), %VMM(1) + VMOVA (VEC_SIZE * 6)(%rdi), %VMM(2) + VPMIN (VEC_SIZE * 7)(%rdi), %VMM(2), %VMM(3) + VPTESTN %VMM(1), %VMM(1), %k1 + VPTESTN %VMM(3), %VMM(3), %k3 + subq $(VEC_SIZE * -4), %rdi + KORTEST %k1, %k3 + jz L(loop_2x_vec) + + VPTESTN %VMM(0), %VMM(0), %k0 + KMOV %k0, %VRCX + test %VRCX, %VRCX + jnz L(bsf_and_done_v0) + + KMOV %k1, %VRCX + test %VRCX, %VRCX + jnz L(bsf_and_done_v1) + + VPTESTN %VMM(2), %VMM(2), %k0 + KMOV %k0, %VRCX + test %VRCX, %VRCX + jnz L(bsf_and_done_v2) + + KMOV %k3, %VRCX +L(bsf_and_done_v3): + addq $VEC_SIZE, %rdi +L(bsf_and_done_v2): + bsf %VRCX, %VRCX + leaq (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rdi + jmp L(strcat_strlen_done) + + .p2align 4,, 4 +L(bsf_and_done_v1): + addq $VEC_SIZE, %rdi +L(bsf_and_done_v0): + bsf %VRCX, %VRCX +#ifdef USE_AS_WCSCPY + leaq (%rdi, %rcx, CHAR_SIZE), %rdi +#else + addq %rcx, %rdi +#endif +L(strcat_strlen_done): diff --git a/sysdeps/x86_64/multiarch/strcpy-evex.S b/sysdeps/x86_64/multiarch/strcpy-evex.S index 82e45ac675..932129ab40 100644 --- a/sysdeps/x86_64/multiarch/strcpy-evex.S +++ 
b/sysdeps/x86_64/multiarch/strcpy-evex.S @@ -1,4 +1,4 @@ -/* strcpy with 256-bit EVEX instructions. +/* {wcs|wcp|str|stp}cpy with 256/512-bit EVEX instructions. Copyright (C) 2021-2022 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -17,990 +17,526 @@ <https://www.gnu.org/licenses/>. */ #include <isa-level.h> - #if ISA_SHOULD_BUILD (4) -# ifndef USE_AS_STRCAT -# include <sysdep.h> + /* Use evex-masked stores for small sizes. Turned off at the + moment. */ +# define USE_EVEX_MASKED_STORE 0 + /* Use movsb in page cross case to save code size. */ +# define USE_MOVSB_IN_PAGE_CROSS 1 -# ifndef STRCPY -# define STRCPY __strcpy_evex -# endif +# include <sysdep.h> +# ifndef VEC_SIZE +# include "x86-evex256-vecs.h" # endif -# define VMOVU vmovdqu64 -# define VMOVA vmovdqa64 - -/* Number of bytes in a vector register */ -# ifndef VEC_SIZE -# define VEC_SIZE 32 +# ifndef STRCPY +# define STRCPY __strcpy_evex # endif -# define XMM2 xmm18 -# define XMM3 xmm19 -# define YMM2 ymm18 -# define YMM3 ymm19 -# define YMM4 ymm20 -# define YMM5 ymm21 -# define YMM6 ymm22 -# define YMM7 ymm23 +# ifdef USE_AS_WCSCPY +# define VMOVU_MASK vmovdqu32 +# define VPMIN vpminud +# define VPTESTN vptestnmd +# define VPTEST vptestmd +# define VPCMPEQ vpcmpeqd +# define CHAR_SIZE 4 -# ifndef USE_AS_STRCAT +# define REP_MOVS rep movsd -/* zero register */ -# define XMMZERO xmm16 -# define YMMZERO ymm16 -# define YMM1 ymm17 - - .section .text.evex,"ax",@progbits -ENTRY (STRCPY) -# ifdef USE_AS_STRNCPY - mov %RDX_LP, %R8_LP - test %R8_LP, %R8_LP - jz L(ExitZero) -# endif - mov %rsi, %rcx -# ifndef USE_AS_STPCPY - mov %rdi, %rax /* save result */ -# endif +# define USE_WIDE_CHAR +# else +# define VMOVU_MASK vmovdqu8 +# define VPMIN vpminub +# define VPTESTN vptestnmb +# define VPTEST vptestmb +# define VPCMPEQ vpcmpeqb +# define CHAR_SIZE 1 - vpxorq %XMMZERO, %XMMZERO, %XMMZERO +# define REP_MOVS rep movsb # endif - and $((VEC_SIZE * 4) - 1), %ecx - cmp $(VEC_SIZE * 2), %ecx - 
jbe L(SourceStringAlignmentLessTwoVecSize) - - and $-VEC_SIZE, %rsi - and $(VEC_SIZE - 1), %ecx - - vpcmpb $0, (%rsi), %YMMZERO, %k0 - kmovd %k0, %edx - shr %cl, %rdx +# include "reg-macros.h" -# ifdef USE_AS_STRNCPY -# if defined USE_AS_STPCPY || defined USE_AS_STRCAT - mov $VEC_SIZE, %r10 - sub %rcx, %r10 - cmp %r10, %r8 -# else - mov $(VEC_SIZE + 1), %r10 - sub %rcx, %r10 - cmp %r10, %r8 -# endif - jbe L(CopyVecSizeTailCase2OrCase3) -# endif - test %edx, %edx - jnz L(CopyVecSizeTail) - - vpcmpb $0, VEC_SIZE(%rsi), %YMMZERO, %k1 - kmovd %k1, %edx -# ifdef USE_AS_STRNCPY - add $VEC_SIZE, %r10 - cmp %r10, %r8 - jbe L(CopyTwoVecSizeCase2OrCase3) -# endif - test %edx, %edx - jnz L(CopyTwoVecSize) - - VMOVU (%rsi, %rcx), %YMM2 /* copy VEC_SIZE bytes */ - VMOVU %YMM2, (%rdi) - -/* If source address alignment != destination address alignment */ - .p2align 4 -L(UnalignVecSizeBoth): - sub %rcx, %rdi -# ifdef USE_AS_STRNCPY - add %rcx, %r8 - sbb %rcx, %rcx - or %rcx, %r8 -# endif - mov $VEC_SIZE, %rcx - VMOVA (%rsi, %rcx), %YMM2 - VMOVU %YMM2, (%rdi, %rcx) - VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 - vpcmpb $0, %YMM2, %YMMZERO, %k0 - kmovd %k0, %edx - add $VEC_SIZE, %rcx -# ifdef USE_AS_STRNCPY - sub $(VEC_SIZE * 3), %r8 - jbe L(CopyVecSizeCase2OrCase3) -# endif - test %edx, %edx -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - jnz L(CopyVecSizeUnalignedVec2) +# ifdef USE_AS_STPCPY +# define END_REG rax # else - jnz L(CopyVecSize) +# define END_REG rdi, %rdx, CHAR_SIZE # endif - VMOVU %YMM2, (%rdi, %rcx) - VMOVA VEC_SIZE(%rsi, %rcx), %YMM3 - vpcmpb $0, %YMM3, %YMMZERO, %k0 - kmovd %k0, %edx - add $VEC_SIZE, %rcx -# ifdef USE_AS_STRNCPY - sub $VEC_SIZE, %r8 - jbe L(CopyVecSizeCase2OrCase3) -# endif - test %edx, %edx -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - jnz L(CopyVecSizeUnalignedVec3) +# ifdef USE_AS_STRCAT +# define PAGE_ALIGN_REG edx +# define PAGE_ALIGN_REG_64 rdx # else - jnz L(CopyVecSize) +# define PAGE_ALIGN_REG eax +# define PAGE_ALIGN_REG_64 rax 
# endif - VMOVU %YMM3, (%rdi, %rcx) - VMOVA VEC_SIZE(%rsi, %rcx), %YMM4 - vpcmpb $0, %YMM4, %YMMZERO, %k0 - kmovd %k0, %edx - add $VEC_SIZE, %rcx -# ifdef USE_AS_STRNCPY - sub $VEC_SIZE, %r8 - jbe L(CopyVecSizeCase2OrCase3) -# endif - test %edx, %edx -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - jnz L(CopyVecSizeUnalignedVec4) -# else - jnz L(CopyVecSize) -# endif +# define VZERO VMM(7) +# define VZERO_128 VMM_128(7) - VMOVU %YMM4, (%rdi, %rcx) - VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 - vpcmpb $0, %YMM2, %YMMZERO, %k0 - kmovd %k0, %edx - add $VEC_SIZE, %rcx -# ifdef USE_AS_STRNCPY - sub $VEC_SIZE, %r8 - jbe L(CopyVecSizeCase2OrCase3) -# endif - test %edx, %edx -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - jnz L(CopyVecSizeUnalignedVec2) -# else - jnz L(CopyVecSize) -# endif - VMOVU %YMM2, (%rdi, %rcx) - VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 - vpcmpb $0, %YMM2, %YMMZERO, %k0 - kmovd %k0, %edx - add $VEC_SIZE, %rcx -# ifdef USE_AS_STRNCPY - sub $VEC_SIZE, %r8 - jbe L(CopyVecSizeCase2OrCase3) -# endif - test %edx, %edx -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - jnz L(CopyVecSizeUnalignedVec2) -# else - jnz L(CopyVecSize) -# endif +# define PAGE_SIZE 4096 +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) - VMOVA VEC_SIZE(%rsi, %rcx), %YMM3 - VMOVU %YMM2, (%rdi, %rcx) - vpcmpb $0, %YMM3, %YMMZERO, %k0 - kmovd %k0, %edx - add $VEC_SIZE, %rcx -# ifdef USE_AS_STRNCPY - sub $VEC_SIZE, %r8 - jbe L(CopyVecSizeCase2OrCase3) -# endif - test %edx, %edx -# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT - jnz L(CopyVecSizeUnalignedVec3) -# else - jnz L(CopyVecSize) -# endif - VMOVU %YMM3, (%rdi, %rcx) - mov %rsi, %rdx - lea VEC_SIZE(%rsi, %rcx), %rsi - and $-(VEC_SIZE * 4), %rsi - sub %rsi, %rdx - sub %rdx, %rdi -# ifdef USE_AS_STRNCPY - lea (VEC_SIZE * 8)(%r8, %rdx), %r8 -# endif -L(UnalignedFourVecSizeLoop): - VMOVA (%rsi), %YMM4 - VMOVA VEC_SIZE(%rsi), %YMM5 - VMOVA (VEC_SIZE * 2)(%rsi), %YMM6 - VMOVA (VEC_SIZE * 3)(%rsi), %YMM7 - vpminub %YMM5, 
%YMM4, %YMM2 - vpminub %YMM7, %YMM6, %YMM3 - vpminub %YMM2, %YMM3, %YMM2 - /* If K7 != 0, there is a null byte. */ - vpcmpb $0, %YMM2, %YMMZERO, %k7 - kmovd %k7, %edx -# ifdef USE_AS_STRNCPY - sub $(VEC_SIZE * 4), %r8 - jbe L(UnalignedLeaveCase2OrCase3) + .section SECTION(.text), "ax", @progbits +ENTRY(STRCPY) +# ifdef USE_AS_STRCAT + movq %rdi, %rax +# include "strcat-strlen-evex.h.S" # endif - test %edx, %edx - jnz L(UnalignedFourVecSizeLeave) - -L(UnalignedFourVecSizeLoop_start): - add $(VEC_SIZE * 4), %rdi - add $(VEC_SIZE * 4), %rsi - VMOVU %YMM4, -(VEC_SIZE * 4)(%rdi) - VMOVA (%rsi), %YMM4 - VMOVU %YMM5, -(VEC_SIZE * 3)(%rdi) - VMOVA VEC_SIZE(%rsi), %YMM5 - vpminub %YMM5, %YMM4, %YMM2 - VMOVU %YMM6, -(VEC_SIZE * 2)(%rdi) - VMOVA (VEC_SIZE * 2)(%rsi), %YMM6 - VMOVU %YMM7, -VEC_SIZE(%rdi) - VMOVA (VEC_SIZE * 3)(%rsi), %YMM7 - vpminub %YMM7, %YMM6, %YMM3 - vpminub %YMM2, %YMM3, %YMM2 - /* If K7 != 0, there is a null byte. */ - vpcmpb $0, %YMM2, %YMMZERO, %k7 - kmovd %k7, %edx -# ifdef USE_AS_STRNCPY - sub $(VEC_SIZE * 4), %r8 |
