aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNoah Goldstein <goldstein.w.n@gmail.com>2022-01-10 15:35:39 -0600
committerNoah Goldstein <goldstein.w.n@gmail.com>2022-02-03 16:41:41 -0600
commit8418eb3ff4b781d31c4ed5dc6c0bd7356bc45db9 (patch)
tree9195daf9352320fb92b9e4f23be773e24c0a5cea
parentb77b06e0e296f1a2276c27a67e1d44f2cfa38d45 (diff)
downloadglibc-8418eb3ff4b781d31c4ed5dc6c0bd7356bc45db9.tar.xz
glibc-8418eb3ff4b781d31c4ed5dc6c0bd7356bc45db9.zip
x86: Optimize strcmp-evex.S
Optimizations are primarily to the loop logic and how the page cross logic interacts with the loop. The page cross logic is at times more expensive for short strings near the end of a page but not crossing the page. This is done to retest the page cross conditions with a non-faulty check and to improve the logic for entering the loop afterwards. This is only in particular cases, however, and is generally made up for by more than 10x improvements on the transition from the page cross -> loop case. The non-page cross cases as well are nearly universally improved. test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass. Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
-rw-r--r--sysdeps/x86_64/multiarch/strcmp-evex.S1712
1 files changed, 919 insertions, 793 deletions
diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
index 0cd939d5af..e5070f3d53 100644
--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
@@ -26,54 +26,69 @@
# define PAGE_SIZE 4096
-/* VEC_SIZE = Number of bytes in a ymm register */
+ /* VEC_SIZE = Number of bytes in a ymm register. */
# define VEC_SIZE 32
+# define CHAR_PER_VEC (VEC_SIZE / SIZE_OF_CHAR)
-/* Shift for dividing by (VEC_SIZE * 4). */
-# define DIVIDE_BY_VEC_4_SHIFT 7
-# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
-# error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
-# endif
-
-# define VMOVU vmovdqu64
-# define VMOVA vmovdqa64
+# define VMOVU vmovdqu64
+# define VMOVA vmovdqa64
# ifdef USE_AS_WCSCMP
-/* Compare packed dwords. */
-# define VPCMP vpcmpd
+# define TESTEQ subl $0xff,
+ /* Compare packed dwords. */
+# define VPCMP vpcmpd
# define VPMINU vpminud
# define VPTESTM vptestmd
-# define SHIFT_REG32 r8d
-# define SHIFT_REG64 r8
-/* 1 dword char == 4 bytes. */
+ /* 1 dword char == 4 bytes. */
# define SIZE_OF_CHAR 4
# else
-/* Compare packed bytes. */
-# define VPCMP vpcmpb
+# define TESTEQ incl
+ /* Compare packed bytes. */
+# define VPCMP vpcmpb
# define VPMINU vpminub
# define VPTESTM vptestmb
-# define SHIFT_REG32 ecx
-# define SHIFT_REG64 rcx
-/* 1 byte char == 1 byte. */
+ /* 1 byte char == 1 byte. */
# define SIZE_OF_CHAR 1
# endif
+# ifdef USE_AS_STRNCMP
+# define LOOP_REG r9d
+# define LOOP_REG64 r9
+
+# define OFFSET_REG8 r9b
+# define OFFSET_REG r9d
+# define OFFSET_REG64 r9
+# else
+# define LOOP_REG edx
+# define LOOP_REG64 rdx
+
+# define OFFSET_REG8 dl
+# define OFFSET_REG edx
+# define OFFSET_REG64 rdx
+# endif
+
+# if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP
+# define VEC_OFFSET 0
+# else
+# define VEC_OFFSET (-VEC_SIZE)
+# endif
+
# define XMMZERO xmm16
-# define XMM0 xmm17
-# define XMM1 xmm18
+# define XMM0 xmm17
+# define XMM1 xmm18
# define YMMZERO ymm16
-# define YMM0 ymm17
-# define YMM1 ymm18
-# define YMM2 ymm19
-# define YMM3 ymm20
-# define YMM4 ymm21
-# define YMM5 ymm22
-# define YMM6 ymm23
-# define YMM7 ymm24
-# define YMM8 ymm25
-# define YMM9 ymm26
-# define YMM10 ymm27
+# define YMM0 ymm17
+# define YMM1 ymm18
+# define YMM2 ymm19
+# define YMM3 ymm20
+# define YMM4 ymm21
+# define YMM5 ymm22
+# define YMM6 ymm23
+# define YMM7 ymm24
+# define YMM8 ymm25
+# define YMM9 ymm26
+# define YMM10 ymm27
/* Warning!
wcscmp/wcsncmp have to use SIGNED comparison for elements.
@@ -96,985 +111,1096 @@
the maximum offset is reached before a difference is found, zero is
returned. */
- .section .text.evex,"ax",@progbits
-ENTRY (STRCMP)
+ .section .text.evex, "ax", @progbits
+ENTRY(STRCMP)
# ifdef USE_AS_STRNCMP
- /* Check for simple cases (0 or 1) in offset. */
- cmp $1, %RDX_LP
- je L(char0)
- jb L(zero)
-# ifdef USE_AS_WCSCMP
-# ifndef __ILP32__
- movq %rdx, %rcx
- /* Check if length could overflow when multiplied by
- sizeof(wchar_t). Checking top 8 bits will cover all potential
- overflow cases as well as redirect cases where its impossible to
- length to bound a valid memory region. In these cases just use
- 'wcscmp'. */
- shrq $56, %rcx
- jnz __wcscmp_evex
-# endif
- /* Convert units: from wide to byte char. */
- shl $2, %RDX_LP
+# ifdef __ILP32__
+ /* Clear the upper 32 bits of the length (a 32-bit write zero-extends). */
+ movl %edx, %edx
# endif
- /* Register %r11 tracks the maximum offset. */
- mov %RDX_LP, %R11_LP
+ cmp $1, %RDX_LP
+ /* Signed comparison intentional. We use this branch to also
+ test cases where length >= 2^63. These very large sizes can be
+ handled with strcmp as there is no way for that length to
+ actually bound the buffer. */
+ jle L(one_or_less)
# endif
movl %edi, %eax
- xorl %edx, %edx
- /* Make %XMMZERO (%YMMZERO) all zeros in this function. */
- vpxorq %XMMZERO, %XMMZERO, %XMMZERO
orl %esi, %eax
- andl $(PAGE_SIZE - 1), %eax
- cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax
- jg L(cross_page)
- /* Start comparing 4 vectors. */
+ /* Shift out the bits irrelevant to page boundary ([63:12]). */
+ sall $20, %eax
+ /* Check if s1 or s2 may cross a page in next 4x VEC loads. */
+ cmpl $((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax
+ ja L(page_cross)
+
+L(no_page_cross):
+ /* Safe to compare 4x vectors. */
VMOVU (%rdi), %YMM0
-
- /* Each bit set in K2 represents a non-null CHAR in YMM0. */
VPTESTM %YMM0, %YMM0, %k2
-
/* Each bit cleared in K1 represents a mismatch or a null CHAR
in YMM0 and 32 bytes at (%rsi). */
VPCMP $0, (%rsi), %YMM0, %k1{%k2}
-
kmovd %k1, %ecx
-# ifdef USE_AS_WCSCMP
- subl $0xff, %ecx
-# else
- incl %ecx
-# endif
- je L(next_3_vectors)
- tzcntl %ecx, %edx
-# ifdef USE_AS_WCSCMP
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %edx
-# endif
# ifdef USE_AS_STRNCMP
- /* Return 0 if the mismatched index (%rdx) is after the maximum
- offset (%r11). */
- cmpq %r11, %rdx
- jae L(zero)
+ cmpq $CHAR_PER_VEC, %rdx
+ jbe L(vec_0_test_len)
# endif
+
+ /* TESTEQ is `incl` for strcmp/strncmp and `subl $0xff` for
+ wcscmp/wcsncmp. */
+
+ /* All 1s represents all equals. TESTEQ will overflow to zero in
+ all equals case. Otherwise 1s will carry until position of first
+ mismatch. */
+ TESTEQ %ecx
+ jz L(more_3x_vec)
+
+ .p2align 4,, 4
+L(return_vec_0):
+ tzcntl %ecx, %ecx
# ifdef USE_AS_WCSCMP
+ movl (%rdi, %rcx, SIZE_OF_CHAR), %edx
xorl %eax, %eax
- movl (%rdi, %rdx), %ecx
- cmpl (%rsi, %rdx), %ecx
- je L(return)
-L(wcscmp_return):
+ cmpl (%rsi, %rcx, SIZE_OF_CHAR), %edx
+ je L(ret0)
setl %al
negl %eax
orl $1, %eax
-L(return):
# else
- movzbl (%rdi, %rdx), %eax
- movzbl (%rsi, %rdx), %edx
- subl %edx, %eax
+ movzbl (%rdi, %rcx), %eax
+ movzbl (%rsi, %rcx), %ecx
+ subl %ecx, %eax
# endif
+L(ret0):
ret
-L(return_vec_size):
- tzcntl %ecx, %edx
-# ifdef USE_AS_WCSCMP
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %edx
-# endif
# ifdef USE_AS_STRNCMP
- /* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after
- the maximum offset (%r11). */
- addq $VEC_SIZE, %rdx
- cmpq %r11, %rdx
- jae L(zero)
-# ifdef USE_AS_WCSCMP
+ .p2align 4,, 4
+L(vec_0_test_len):
+ notl %ecx
+ bzhil %edx, %ecx, %eax
+ jnz L(return_vec_0)
+ /* Align if will cross fetch block. */
+ .p2align 4,, 2
+L(ret_zero):
xorl %eax, %eax
- movl (%rdi, %rdx), %ecx
- cmpl (%rsi, %rdx), %ecx
- jne L(wcscmp_return)
-# else
- movzbl (%rdi, %rdx), %eax
- movzbl (%rsi, %rdx), %edx
- subl %edx, %eax
-# endif
-# else
+ ret
+
+ .p2align 4,, 5
+L(one_or_less):
+ jb L(ret_zero)
# ifdef USE_AS_WCSCMP
+ /* 'nbe' covers the case where length is negative (large
+ unsigned). */
+ jnbe __wcscmp_evex
+ movl (%rdi), %edx
xorl %eax, %eax
- movl VEC_SIZE(%rdi, %rdx), %ecx
- cmpl VEC_SIZE(%rsi, %rdx), %ecx
- jne L(wcscmp_return)
+ cmpl (%rsi), %edx
+ je L(ret1)
+ setl %al
+ negl %eax
+ orl $1, %eax
# else
- movzbl VEC_SIZE(%rdi, %rdx), %eax
- movzbl VEC_SIZE(%rsi, %rdx), %edx
- subl %edx, %eax
+ /* 'nbe' covers the case where length is negative (large
+ unsigned). */
+ jnbe __strcmp_evex
+ movzbl (%rdi), %eax
+ movzbl (%rsi), %ecx
+ subl %ecx, %eax
# endif
-# endif
+L(ret1):
ret
+# endif
-L(return_2_vec_size):
- tzcntl %ecx, %edx
+ .p2align 4,, 10
+L(return_vec_1):
+ tzcntl %ecx, %ecx
+# ifdef USE_AS_STRNCMP
+ /* rdx must be > CHAR_PER_VEC so it's safe to subtract without
+ worrying about underflow. */
+ addq $-CHAR_PER_VEC, %rdx
+ cmpq %rcx, %rdx
+ jbe L(ret_zero)
+# endif
# ifdef USE_AS_WCSCMP
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %edx
+ movl VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx
+ xorl %eax, %eax
+ cmpl VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx
+ je L(ret2)
+ setl %al
+ negl %eax
+ orl $1, %eax
+# else
+ movzbl VEC_SIZE(%rdi, %rcx), %eax
+ movzbl VEC_SIZE(%rsi, %rcx), %ecx
+ subl %ecx, %eax
# endif
+L(ret2):
+ ret
+
+ .p2align 4,, 10
# ifdef USE_AS_STRNCMP
- /* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is
- after the maximum offset (%r11). */
- addq $(VEC_SIZE * 2), %rdx
- cmpq %r11, %rdx
- jae L(zero)
-# ifdef USE_AS_WCSCMP
- xorl %eax, %eax
- movl (%rdi, %rdx), %ecx
- cmpl (%rsi, %rdx), %ecx
- jne L(wcscmp_return)
+L(return_vec_3):
+# if CHAR_PER_VEC <= 16
+ sall $CHAR_PER_VEC, %ecx
# else
- movzbl (%rdi, %rdx), %eax
- movzbl (%rsi, %rdx), %edx
- subl %edx, %eax
+ salq $CHAR_PER_VEC, %rcx
# endif
+# endif
+L(return_vec_2):
+# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
+ tzcntl %ecx, %ecx
# else
-# ifdef USE_AS_WCSCMP
- xorl %eax, %eax
- movl (VEC_SIZE * 2)(%rdi, %rdx), %ecx
- cmpl (VEC_SIZE * 2)(%rsi, %rdx), %ecx
- jne L(wcscmp_return)
-# else
- movzbl (VEC_SIZE * 2)(%rdi, %rdx), %eax
- movzbl (VEC_SIZE * 2)(%rsi, %rdx), %edx
- subl %edx, %eax
-# endif
+ tzcntq %rcx, %rcx
# endif
- ret
-L(return_3_vec_size):
- tzcntl %ecx, %edx
-# ifdef USE_AS_WCSCMP
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %edx
-# endif
# ifdef USE_AS_STRNCMP
- /* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is
- after the maximum offset (%r11). */
- addq $(VEC_SIZE * 3), %rdx
- cmpq %r11, %rdx
- jae L(zero)
-# ifdef USE_AS_WCSCMP
+ cmpq %rcx, %rdx
+ jbe L(ret_zero)
+# endif
+
+# ifdef USE_AS_WCSCMP
+ movl (VEC_SIZE * 2)(%rdi, %rcx, SIZE_OF_CHAR), %edx
xorl %eax, %eax
- movl (%rdi, %rdx), %ecx
- cmpl (%rsi, %rdx), %ecx
- jne L(wcscmp_return)
-# else
- movzbl (%rdi, %rdx), %eax
- movzbl (%rsi, %rdx), %edx
- subl %edx, %eax
-# endif
+ cmpl (VEC_SIZE * 2)(%rsi, %rcx, SIZE_OF_CHAR), %edx
+ je L(ret3)
+ setl %al
+ negl %eax
+ orl $1, %eax
# else
+ movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
+ movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx
+ subl %ecx, %eax
+# endif
+L(ret3):
+ ret
+
+# ifndef USE_AS_STRNCMP
+ .p2align 4,, 10
+L(return_vec_3):
+ tzcntl %ecx, %ecx
# ifdef USE_AS_WCSCMP
+ movl (VEC_SIZE * 3)(%rdi, %rcx, SIZE_OF_CHAR), %edx
xorl %eax, %eax
- movl (VEC_SIZE * 3)(%rdi, %rdx), %ecx
- cmpl (VEC_SIZE * 3)(%rsi, %rdx), %ecx
- jne L(wcscmp_return)
+ cmpl (VEC_SIZE * 3)(%rsi, %rcx, SIZE_OF_CHAR), %edx
+ je L(ret4)
+ setl %al
+ negl %eax
+ orl $1, %eax
# else
- movzbl (VEC_SIZE * 3)(%rdi, %rdx), %eax
- movzbl (VEC_SIZE * 3)(%rsi, %rdx), %edx
- subl %edx, %eax
+ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
+ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
+ subl %ecx, %eax
# endif
-# endif
+L(ret4):
ret
+# endif
- .p2align 4
-L(next_3_vectors):
- VMOVU VEC_SIZE(%rdi), %YMM0
- /* Each bit set in K2 represents a non-null CHAR in YMM0. */
+ /* 32 byte align here ensures the main loop is ideally aligned
+ for DSB. */
+ .p2align 5
+L(more_3x_vec):
+ /* Safe to compare 4x vectors. */
+ VMOVU (VEC_SIZE)(%rdi), %YMM0
VPTESTM %YMM0, %YMM0, %k2
- /* Each bit cleared in K1 represents a mismatch or a null CHAR
- in YMM0 and 32 bytes at VEC_SIZE(%rsi). */
- VPCMP $0, VEC_SIZE(%rsi), %YMM0, %k1{%k2}
+ VPCMP $0, (VEC_SIZE)(%rsi), %YMM0, %k1{%k2}
kmovd %k1, %ecx
-# ifdef USE_AS_WCSCMP
- subl $0xff, %ecx
-# else
- incl %ecx
+ TESTEQ %ecx
+ jnz L(return_vec_1)
+
+# ifdef USE_AS_STRNCMP
+ subq $(CHAR_PER_VEC * 2), %rdx
+ jbe L(ret_zero)
# endif
- jne L(return_vec_size)
VMOVU (VEC_SIZE * 2)(%rdi), %YMM0
- /* Each bit set in K2 represents a non-null CHAR in YMM0. */
VPTESTM %YMM0, %YMM0, %k2
- /* Each bit cleared in K1 represents a mismatch or a null CHAR
- in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi). */
VPCMP $0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2}
kmovd %k1, %ecx
-# ifdef USE_AS_WCSCMP
- subl $0xff, %ecx
-# else
- incl %ecx
-# endif
- jne L(return_2_vec_size)
+ TESTEQ %ecx
+ jnz L(return_vec_2)
VMOVU (VEC_SIZE * 3)(%rdi), %YMM0
- /* Each bit set in K2 represents a non-null CHAR in YMM0. */
VPTESTM %YMM0, %YMM0, %k2
- /* Each bit cleared in K1 represents a mismatch or a null CHAR
- in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi). */
VPCMP $0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2}
kmovd %k1, %ecx
+ TESTEQ %ecx
+ jnz L(return_vec_3)
+
+# ifdef USE_AS_STRNCMP
+ cmpq $(CHAR_PER_VEC * 2), %rdx
+ jbe L(ret_zero)
+# endif
+
+
# ifdef USE_AS_WCSCMP
- subl $0xff, %ecx
+ /* Any non-zero positive value that doesn't interfere with 0x1.
+ */
+ movl $2, %r8d
+
# else
- incl %ecx
+ xorl %r8d, %r8d
# endif
- jne L(return_3_vec_size)
-L(main_loop_header):
- leaq (VEC_SIZE * 4)(%rdi), %rdx
- movl $PAGE_SIZE, %ecx
- /* Align load via RAX. */
- andq $-(VEC_SIZE * 4), %rdx
- subq %rdi, %rdx
- leaq (%rdi, %rdx), %rax
+
+ /* The prepare labels are various entry points from the page
+ cross logic. */
+L(prepare_loop):
+
# ifdef USE_AS_STRNCMP
- /* Starting from this point, the maximum offset, or simply the
- 'offset', DECREASES by the same amount when base pointers are
- moved forward. Return 0 when:
- 1) On match: offset <= the matched vector index.
- 2) On mistmach, offset is before the mistmatched index.
- */
- subq %rdx, %r11
- jbe L(zero)
+# ifdef USE_AS_WCSCMP
+L(prepare_loop_no_len):
+ movl %edi, %ecx
+ andl $(VEC_SIZE * 4 - 1), %ecx
+ shrl $2, %ecx
+ leaq (CHAR_PER_VEC * 2)(%rdx, %rcx), %rdx
+# else
+ /* Store N + (VEC_SIZE * 4) and place check at the beginning of
+ the loop. */
+ leaq (VEC_SIZE * 2)(%rdi, %rdx), %rdx
+L(prepare_loop_no_len):
+# endif
+# else
+L(prepare_loop_no_len):
# endif
- addq %rsi, %rdx
- movq %rdx, %rsi
- andl $(PAGE_SIZE - 1), %esi
- /* Number of bytes before page crossing. */
- subq %rsi, %rcx
- /* Number of VEC_SIZE * 4 blocks before page crossing. */
- shrq $DIVIDE_BY_VEC_4_SHIFT, %rcx
- /* ESI: Number of VEC_SIZE * 4 blocks before page crossing. */
- movl %ecx, %esi
- jmp L(loop_start)
+ /* Align s1 and adjust s2 accordingly. */
+ subq %rdi, %rsi
+ andq $-(VEC_SIZE * 4), %rdi
+L(prepare_loop_readj):
+ addq %rdi, %rsi
+# if (defined USE_AS_STRNCMP) && !(defined USE_AS_WCSCMP)
+ subq %rdi, %rdx
+# endif
+
+L(prepare_loop_aligned):
+ /* eax stores distance from rsi to next page cross. These cases
+ need to be handled specially as the 4x loop could potentially
+ read memory past the length of s1 or s2 and across a page
+ boundary. */
+ movl $-(VEC_SIZE * 4), %eax
+ subl %esi, %eax
+ andl $(PAGE_SIZE - 1), %eax
+
+ vpxorq %YMMZERO, %YMMZERO, %YMMZERO
+
+ /* Loop 4x comparisons at a time. */
.p2align 4
L(loop):
+
+ /* End condition for strncmp. */
# ifdef USE_AS_STRNCMP
- /* Base pointers are moved forward by 4 * VEC_SIZE. Decrease
- the maximum offset (%r11) by the same amount. */
- subq $(VEC_SIZE * 4), %r11
- jbe L(zero)
+ subq $(CHAR_PER_VEC * 4), %rdx
+ jbe L(ret_zero)
# endif
- addq $(VEC_SIZE * 4), %rax
- addq $(VEC_SIZE * 4), %rdx
-L(loop_start):
- testl %esi, %esi
- leal -1(%esi), %esi
- je L(loop_cross_page)
-L(back_to_loop):
- /* Main loop, comparing 4 vectors are a time. */
- VMOVA (%rax), %YMM0
- VMOVA VEC_SIZE(%rax), %YMM2
- VMOVA (VEC_SIZE * 2)(%rax), %YMM4
- VMOVA (VEC_SIZE * 3)(%rax), %YMM6
+
+ subq $-(VEC_SIZE * 4), %rdi
+ subq $-(VEC_SIZE * 4), %rsi
+
+ /* Check if rsi loads will cross a page boundary. */
+ addl $-(VEC_SIZE * 4), %eax
+ jnb L(page_cross_during_loop)
+
+ /* Loop entry after handling page cross during loop. */
+L(loop_skip_page_cross_check):
+ VMOVA (VEC_SIZE * 0)(%rdi), %YMM0
+ VMOVA (VEC_SIZE * 1)(%rdi), %YMM2
+ VMOVA (VEC_SIZE * 2)(%rdi), %YMM4
+ VMOVA (VEC_SIZE * 3)(%rdi), %YMM6
VPMINU %YMM0, %YMM2, %YMM8
VPMINU %YMM4, %YMM6, %YMM9
- /* A zero CHAR in YMM8 means that there is a null CHAR. */
- VPMINU %YMM8, %YMM9, %YMM8
+ /* A zero CHAR in YMM9 means that there is a null CHAR. */
+ VPMINU %YMM8, %YMM9, %YMM9
/* Each bit set in K1 represents a non-null CHAR in YMM8. */
- VPTESTM %YMM8, %YMM8, %k1
+ VPTESTM %YMM9, %YMM9, %k1
- /* (YMM ^ YMM): A non-zero CHAR represents a mismatch. */
- vpxorq (%rdx), %YMM0, %YMM1
- vpxorq VEC_SIZE(%rdx), %YMM2, %YMM3
- vpxorq (VEC_SIZE * 2)(%rdx), %YMM4, %YMM5
- vpxorq (VEC_SIZE * 3)(%rdx), %YMM6, %YMM7
+ vpxorq (VEC_SIZE * 0)(%rsi), %YMM0, %YMM1
+ vpxorq (VEC_SIZE * 1)(%rsi), %YMM2, %YMM3
+ vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
+ /* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while
+ oring with YMM1. Result is stored in YMM6. */
+ vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6
- vporq %YMM1, %YMM3, %YMM9
- vporq %YMM5, %YMM7, %YMM10
+ /* Or together YMM3, YMM5, and YMM6. */
+ vpternlogd $0xfe, %YMM3, %YMM5, %YMM6
- /* A non-zero CHAR in YMM9 represents a mismatch. */
- vporq %YMM9, %YMM10, %YMM9
- /* Each bit cleared in K0 represents a mismatch or a null CHAR. */
- VPCMP $0, %YMMZERO, %YMM9, %k0{%k1}
- kmovd %k0, %ecx
-# ifdef USE_AS_WCSCMP
- subl $0xff, %ecx
-# else
- incl %ecx
-# endif
- je L(loop)
+ /* A non-zero CHAR in YMM6 represents a mismatch. */
+ VPCMP $0, %YMMZERO, %YMM6, %k0{%k1}
+ kmovd %k0, %LOOP_REG
- /* Each bit set in K1 represents a non-null CHAR in YMM0. */
+ TESTEQ %LOOP_REG
+ jz L(loop)
+
+
+ /* Find which VEC has the mismatch or end of string. */
VPTESTM %YMM0, %YMM0, %k1
- /* Each bit cleared in K0 represents a mismatch or a null CHAR
- in YMM0 and (%rdx). */
VPCMP $0, %YMMZERO, %YMM1, %k0{%k1}
kmovd %k0, %ecx
-# ifdef USE_AS_WCSCMP
- subl $0xff, %ecx
-# else
- incl %ecx
-# endif
- je L(test_vec)
- tzcntl %ecx, %ecx
-# ifdef USE_AS_WCSCMP
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %ecx
-# endif
-# ifdef USE_AS_STRNCMP
- cmpq %rcx, %r11
- jbe L(zero)
-# ifdef USE_AS_WCSCMP
- movq %rax, %rsi
- xorl %eax, %eax
- movl (%rsi, %rcx), %edi
- cmpl (%rdx, %rcx), %edi
- jne L(wcscmp_return)
-# else
- movzbl (%rax, %rcx), %eax
- movzbl (%rdx, %rcx), %edx
- subl %edx, %eax
-# endif
-# else
-# ifdef USE_AS_WCSCMP
- movq %rax, %rsi
- xorl %eax, %eax
- movl (%rsi, %rcx), %edi
- cmpl (%rdx, %rcx), %edi
- jne L(wcscmp_return)
-# else
- movzbl (%rax, %rcx), %eax
- movzbl (%rdx, %rcx), %edx
- subl %edx, %eax
-# endif
-# endif
- ret
+ TESTEQ %ecx
+ jnz L(return_vec_0_end)
- .p2align 4
-L(test_vec):
-# ifdef USE_AS_STRNCMP
- /* The first vector matched. Return 0 if the maximum offset
- (%r11) <= VEC_SIZE. */
- cmpq $VEC_SIZE, %r11
- jbe L(zero)
-# endif
- /* Each bit set in K1 represents a non-null CHAR in YMM2. */
VPTESTM %YMM2, %YMM2, %k1
- /* Each bit cleared in K0 represents a mismatch or a null CHAR
- in YMM2 and VEC_SIZE(%rdx). */
VPCMP $0, %YMMZERO, %YMM3, %k0{%k1}
kmovd %k0, %ecx
-# ifdef USE_AS_WCSCMP
- subl $0xff, %ecx
-# else
- incl %ecx
-# endif
- je L(test_2_vec)
- tzcntl %ecx, %edi
-# ifdef USE_AS_WCSCMP
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %edi
-# endif
-# ifdef USE_AS_STRNCMP
- addq $VEC_SIZE, %rdi
- cmpq %rdi, %r11
- jbe L(zero)
-# ifdef USE_AS_WCSCMP
- movq %rax, %rsi
- xorl %eax, %eax
- movl (%rsi, %rdi), %ecx
- cmpl (%rdx, %rdi), %ecx
- jne L(wcscmp_return)
-# else
- movzbl (%rax, %rdi), %eax
- movzbl (%rdx, %rdi), %edx
- subl %edx, %eax
-# endif
-# else
-# ifdef USE_AS_WCSCMP
- movq %rax, %rsi
- xorl %eax, %eax
- movl VEC_SIZE(%rsi, %rdi), %ecx
- cmpl VEC_SIZE(%rdx, %rdi), %ecx
- jne L(wcscmp_return)
-# else
- movzbl VEC_SIZE(%rax, %rdi), %eax
- movzbl VEC_SIZE(%rdx, %rdi), %edx
- subl %edx, %eax
-# endif
-# endif
- ret
+ TESTEQ %ecx
+ jnz L(return_vec_1_end)
- .p2align 4
-L(test_2_vec):
+
+ /* Handle VEC 2 and 3 without branches. */
+L(return_vec_2_3_end):
# ifdef USE_AS_STRNCMP
- /* The first 2 vectors matched. Return 0 if the maximum offset
- (%r11) <= 2 * VEC_SIZE. */
- cmpq $(VEC_SIZE * 2), %r11
- jbe L(zero)
+ subq $(CHAR_PER_VEC * 2), %rdx
+ jbe L(ret_zero_end)
# endif
- /* Each bit set in K1 represents a non-null CHAR in YMM4. */
+
VPTESTM %YMM4, %YMM4, %k1
- /* Each bit cleared in K0 represents a mismatch or a null CHAR
- in YMM4 and (VEC_SIZE * 2)(%rdx). */
VPCMP $0, %YMMZERO, %YMM5, %k0{%k1}
kmovd %k0, %ecx
-# ifdef USE_AS_WCSCMP
- subl $0xff, %ecx
+ TESTEQ %ecx
+# if CHAR_PER_VEC <= 16
+ sall $CHAR_PER_VEC, %LOOP_REG
+ orl %ecx, %LOOP_REG
# else
- incl %ecx
+ salq $CHAR_PER_VEC, %LOOP_REG64
+ orq %rcx, %LOOP_REG64
+# endif
+L(return_vec_3_end):
+ /* LOOP_REG contains matches for null/mismatch from the loop. If
+ VEC 0,1,and 2 all have no null and no mismatches then mismatch
+ must entirely be from VEC 3 which is fully represented by
+ LOOP_REG. */
+# if CHAR_PER_VEC <= 16
+ tzcntl %LOOP_REG, %LOOP_REG
+# else
+ tzcntq %LOOP_REG64, %LOOP_REG64
+# endif
+# ifdef USE_AS_STRNCMP
+ cmpq %LOOP_REG64, %rdx
+ jbe L(ret_zero_end)
# endif
- je L(test_3_vec)
- tzcntl %ecx, %edi
+
# ifdef USE_AS_WCSCMP
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
<