diff options
| author | H.J. Lu <hongjiu.lu@intel.com> | 2009-07-02 03:39:03 -0700 |
|---|---|---|
| committer | Ulrich Drepper <drepper@redhat.com> | 2009-07-02 03:39:03 -0700 |
| commit | ab6a873fe07b8ded403bc5a5ca73be5d04820d61 (patch) | |
| tree | 82c30accc8655738ef1831579a1d3c1e93270cf2 | |
| parent | 6cbbaa50aac809ad6e0692247876c82d58e466bf (diff) | |
| download | glibc-ab6a873fe07b8ded403bc5a5ca73be5d04820d61.tar.xz glibc-ab6a873fe07b8ded403bc5a5ca73be5d04820d61.zip | |
SSSE3 strcpy/stpcpy for x86-64
This patch adds SSSE3 strcpy/stpcpy. I got up to 4X speed up on Core 2
and Core i7. I disabled it on Atom since SSSE3 version is slower for
shorter (<64byte) data.
| -rw-r--r-- | ChangeLog | 19 | ||||
| -rw-r--r-- | string/stpncpy.c | 15 | ||||
| -rw-r--r-- | string/strncpy.c | 9 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/Makefile | 2 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/stpcpy.S | 7 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/stpncpy-c.c | 8 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/stpncpy.S | 6 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/strcpy.S | 1917 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/strncpy-c.c | 8 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/strncpy.S | 3 |
10 files changed, 1982 insertions, 12 deletions
@@ -1,3 +1,22 @@ +2009-06-30 H.J. Lu <hongjiu.lu@intel.com> + + * string/stpncpy.c (STPNCPY): New. Defined if not defined. + (__stpncpy): Renamed to ... + (STPNCPY): This. + (stpncpy): Create alias only if STPNCPY is not defined. + * string/strncpy.c (STRNCPY): New. Defined to strncpy if not + defined. + (strncpy): Renamed to ... + (STRNCPY): This. + * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add + stpncpy-c strncpy-c for string. + * sysdeps/x86_64/multiarch/stpcpy.S: New file. + * sysdeps/x86_64/multiarch/stpncpy-c.c: New file. + * sysdeps/x86_64/multiarch/stpncpy.S: New file. + * sysdeps/x86_64/multiarch/strcpy.S: New file. + * sysdeps/x86_64/multiarch/strncpy-c.c: New file. + * sysdeps/x86_64/multiarch/strncpy.S: New file. + 2009-07-02 Ulrich Drepper <drepper@redhat.com> * malloc/malloc.c [ATOMIC_FASTBINS] (_int_free): Add full barrier when diff --git a/string/stpncpy.c b/string/stpncpy.c index 164d0f1747..2ebab33d8a 100644 --- a/string/stpncpy.c +++ b/string/stpncpy.c @@ -28,17 +28,19 @@ # include <sys/types.h> #endif -#ifndef weak_alias -# define __stpncpy stpncpy +#ifndef STPNCPY +# ifdef weak_alias +# define STPNCPY __stpncpy +weak_alias (__stpncpy, stpncpy) +# else +# define STPNCPY stpncpy +# endif #endif /* Copy no more than N characters of SRC to DEST, returning the address of the terminating '\0' in DEST, if any, or else DEST + N. */ char * -__stpncpy (dest, src, n) - char *dest; - const char *src; - size_t n; +STPNCPY (char *dest, const char *src, size_t n) { char c; char *s = dest; @@ -96,5 +98,4 @@ __stpncpy (dest, src, n) } #ifdef weak_alias libc_hidden_def (__stpncpy) -weak_alias (__stpncpy, stpncpy) #endif diff --git a/string/strncpy.c b/string/strncpy.c index f32612e1cf..2274d7d31e 100644 --- a/string/strncpy.c +++ b/string/strncpy.c @@ -21,11 +21,12 @@ #undef strncpy +#ifndef STRNCPY +#define STRNCPY strncpy +#endif + char * -strncpy (s1, s2, n) - char *s1; - const char *s2; - size_t n; +STRNCPY (char *s1, const char *s2, size_t n) { reg_char c; char *s = s1; diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 1c35e1ffb4..127592aa3a 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -4,5 +4,5 @@ gen-as-const-headers += ifunc-defines.sym endif ifeq ($(subdir),string) -sysdep_routines += strncmp-c +sysdep_routines += stpncpy-c strncpy-c strncmp-c endif diff --git a/sysdeps/x86_64/multiarch/stpcpy.S b/sysdeps/x86_64/multiarch/stpcpy.S new file mode 100644 index 0000000000..b63d308edc --- /dev/null +++ b/sysdeps/x86_64/multiarch/stpcpy.S @@ -0,0 +1,7 @@ +#define USE_AS_STPCPY +#define STRCPY __stpcpy +#include "strcpy.S" + +weak_alias (__stpcpy, stpcpy) +libc_hidden_def (__stpcpy) +libc_hidden_builtin_def (stpcpy) diff --git a/sysdeps/x86_64/multiarch/stpncpy-c.c b/sysdeps/x86_64/multiarch/stpncpy-c.c new file mode 100644 index 0000000000..2fde77dcab --- /dev/null +++ b/sysdeps/x86_64/multiarch/stpncpy-c.c @@ -0,0 +1,8 @@ +#define STPNCPY __stpncpy_sse2 +#ifdef SHARED +#undef libc_hidden_def +#define libc_hidden_def(name) \ + __hidden_ver1 (__stpncpy_sse2, __GI___stpncpy, __stpncpy_sse2); +#endif + +#include "stpncpy.c" diff --git a/sysdeps/x86_64/multiarch/stpncpy.S b/sysdeps/x86_64/multiarch/stpncpy.S new file mode 100644 index 0000000000..ff89a89491 --- /dev/null +++ b/sysdeps/x86_64/multiarch/stpncpy.S @@ -0,0 +1,6 @@ +#define STRCPY __stpncpy +#define USE_AS_STPCPY +#define USE_AS_STRNCPY +#include "strcpy.S" + +weak_alias (__stpncpy, stpncpy) diff --git a/sysdeps/x86_64/multiarch/strcpy.S b/sysdeps/x86_64/multiarch/strcpy.S new file mode 100644 index 0000000000..bbc9979e0c --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcpy.S @@ -0,0 +1,1917 @@ +/* strcpy with SSSE3 + Copyright (C) 2009 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#include <sysdep.h> +#include <ifunc-defines.h> + +#if !defined (USE_AS_STPCPY) && !defined (USE_AS_STRNCPY) +# ifndef STRCPY +# define STRCPY strcpy +# endif +#endif + +#ifdef USE_AS_STPCPY +# ifdef USE_AS_STRNCPY +# define STRCPY_SSSE3 __stpncpy_ssse3 +# define STRCPY_SSE2 __stpncpy_sse2 +# define __GI_STRCPY __GI_stpncpy +# else +# define STRCPY_SSSE3 __stpcpy_ssse3 +# define STRCPY_SSE2 __stpcpy_sse2 +# define __GI_STRCPY __GI_stpcpy +# define __GI___STRCPY __GI___stpcpy +# endif +#else +# ifdef USE_AS_STRNCPY +# define STRCPY_SSSE3 __strncpy_ssse3 +# define STRCPY_SSE2 __strncpy_sse2 +# define __GI_STRCPY __GI_strncpy +# else +# define STRCPY_SSSE3 __strcpy_ssse3 +# define STRCPY_SSE2 __strcpy_sse2 +# define __GI_STRCPY __GI_strcpy +# endif +#endif + +#ifndef LABEL +#define LABEL(l) L(l) +#endif + +/* Define multiple versions only for the definition in libc. */ +#ifndef NOT_IN_libc + .text +ENTRY(STRCPY) + .type STRCPY, @gnu_indirect_function + cmpl $0, __cpu_features+KIND_OFFSET(%rip) + jne 1f + call __init_cpu_features +1: leaq STRCPY_SSE2(%rip), %rax + testl $(1<<9), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip) + jz 3f +/* Avoid SSSE3 strcpy on Atom since it is slow. */ + cmpl $1, __cpu_features+KIND_OFFSET(%rip) + jne 2f + cmpl $6, __cpu_features+FAMILY_OFFSET(%rip) + jne 2f + cmpl $28, __cpu_features+MODEL_OFFSET(%rip) + jz 3f +2: leaq STRCPY_SSSE3(%rip), %rax +3: ret +END(STRCPY) + + .section .text.ssse3,"ax",@progbits +STRCPY_SSSE3: + cfi_startproc + CALL_MCOUNT + +/* + * This implementation uses SSE to copy up to 16 bytes at a time. + */ +#ifdef USE_AS_STRNCPY + test %rdx, %rdx + jz LABEL(strncpy_exitz) + mov %rdx, %r8 +#else + xor %edx, %edx +#endif + mov %esi, %ecx + and $0xfffffffffffffff0, %rsi /*force rsi 16 byte align*/ + and $15, %ecx + mov %rdi, %rax /*store return parameter*/ + + + pxor %xmm0, %xmm0 /* clear %xmm0 */ + pcmpeqb (%rsi), %xmm0 /* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char*/ + pmovmskb %xmm0, %edx /* move each byte mask of %xmm0 to edx*/ + shr %cl, %edx /* get real bits left in edx*/ + test %edx, %edx /* edx must be 0 if there is no null char from rsi+%rcx */ + jnz LABEL(less16bytes) + +#ifdef USE_AS_STRNCPY + lea -16(%r8,%rcx), %r11 + cmp $0, %r11 + jle LABEL(less16bytes) /* if r8 + rcx <= 16, branch to less16bytes. */ +#endif + + mov %rcx, %r9 + or %edi, %ecx + and $15, %ecx + lea -16(%r9), %r10 + jz LABEL(ashr_0) /* ecx must be 0 if offset of rsi and rdi is 16 byte align*/ + + neg %r10 /* store the rest in rsi aligned 16 bytes for unaligned_exit*/ + + pxor %xmm0, %xmm0 /* clear %xmm0, may be polluted by unaligned operation*/ + pcmpeqb 16(%rsi), %xmm0 /* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char*/ + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(less32bytes) + /* + * at least 16 byte available to fill destination rdi + */ +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(less32bytes_strncpy_truncation) +#endif + mov (%rsi, %r9), %rdx + mov %rdx, (%rdi) + mov 8(%rsi, %r9), %rdx + mov %rdx, 8(%rdi) + + /* + * so far destatination rdi may be aligned by 16, re-calculate rsi to jump + * crossponding case + * rcx is offset of rsi + * rax is offset of rdi + */ + + and $0xfffffffffffffff0, %rdi /* force rdi 16 byte align */ + mov %rax, %rdx /* rax store orignal rdi */ + xor %rdi, %rdx /* equal to and $15, %rdx */ +#ifdef USE_AS_STRNCPY + add %rdx, %r8 +#endif + + add $16, %rdi /* next 16 bytes for rdi */ + sub %rdx, %r9 + + lea 16(%r9, %rsi), %rsi /*re-calculate rsi by (16 - rdx)+ rcx */ + mov %esi, %ecx /*store offset of rsi */ + and $0xfffffffffffffff0, %rsi /* force rsi 16 byte align */ + + and $15, %ecx /* ecx must be 0 if rdx is equal to rcx*/ + jz LABEL(ashr_0) + + lea -16(%rcx), %r10 + mov %rcx, %r9 + neg %r10 + lea LABEL(unaligned_table)(%rip), %r11 + movslq (%r11, %rcx,4), %rcx + lea (%r11, %rcx), %rcx + jmp *%rcx + + /* + * The following cases will be handled by ashr_0 & ashr_0_start + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * 0 0 0 ashr_0 + * n(1~15) n(1~15) 0 ashr_0_start + * + */ + .p2align 5 +LABEL(ashr_0): +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_aligned) +#endif + movdqa (%rsi), %xmm1 /* fetch first 16 bytes from rsi */ + movdqa %xmm1, (%rdi) /* store first 16 bytes into rdi */ + add $16, %rsi + add $16, %rdi + pcmpeqb (%rsi), %xmm0 /* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char */ + pmovmskb %xmm0, %edx /* move each byte mask of %xmm0 to edx*/ + + test %edx, %edx /* edx must be 0 if there is no null char in rsi*/ + jnz LABEL(aligned_16bytes) + +LABEL(ashr_0_loop): +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_aligned) +#endif + movdqa (%rsi, %rcx), %xmm1 + movdqa %xmm1, (%rdi, %rcx) + add $16, %rcx + pcmpeqb (%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(aligned_exit) + +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_aligned) +#endif + movdqa (%rsi, %rcx), %xmm1 + movdqa %xmm1, (%rdi, %rcx) + add $16, %rcx + pcmpeqb (%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(aligned_exit) + +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_aligned) +#endif + movdqa (%rsi, %rcx), %xmm1 + movdqa %xmm1, (%rdi, %rcx) + add $16, %rcx + pcmpeqb (%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(aligned_exit) + +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_aligned) +#endif + movdqa (%rsi, %rcx), %xmm1 + movdqa %xmm1, (%rdi, %rcx) + add $16, %rcx + pcmpeqb (%rsi, %rcx), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jz LABEL(ashr_0_loop) + + jmp LABEL(aligned_exit) + .p2align 4 + +/* + * The following cases will be handled by ashr_15 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(15) n - 15 15((16 - (n -15) + n)%16 ashr_15 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_15): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_15_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $15, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $15, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_15_use_ssse3) + +/* + * The following cases will be handled by ashr_14 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(14~15) n - 14 14((16 - (n -14) + n)%16 ashr_14 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_14): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_14_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $14, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $14, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_14_use_ssse3) + +/* + * The following cases will be handled by ashr_13 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(13~15) n - 13 13((16 - (n -13) + n)%16 ashr_13 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_13): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_13_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $13, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $13, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_13_use_ssse3) + +/* + * The following cases will be handled by ashr_12 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(12~15) n - 12 12((16 - (n -12) + n)%16 ashr_12 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_12): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_12_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $12, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $12, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_12_use_ssse3) + +/* + * The following cases will be handled by ashr_11 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(11~15) n - 11 11((16 - (n -11) + n)%16 ashr_11 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_11): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_11_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $11, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $11, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + jmp LABEL(ashr_11_use_ssse3) + +/* + * The following cases will be handled by ashr_10 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(10~15) n - 10 10((16 - (n -10) + n)%16 ashr_10 + * + * Based on above operation , start from (%r9 + rsi) to the left of this cache bank, there is no null byte + */ + .p2align 4 +LABEL(ashr_10): + xor %ecx, %ecx /*clear ecx */ +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + .p2align 4 +LABEL(ashr_10_use_ssse3): + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $10, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) + add $16, %rcx + +#ifdef USE_AS_STRNCPY + cmp %r10, %r8 + jbe LABEL(unaligned_exit) +#endif + + movdqa 16(%rsi, %rcx), %xmm3 + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz LABEL(unaligned_exit) +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe LABEL(strncpy_truncation_unaligned) +#endif + + palignr $10, (%rsi, %rcx), %xmm3 + movdqa %xmm3, (%rdi, %rcx) |
