diff options
| field | value | date |
|---|---|---|
| author | Liubov Dmitrieva <liubov.dmitrieva@gmail.com> | 2011-07-19 17:11:54 -0400 |
| committer | Ulrich Drepper <drepper@gmail.com> | 2011-07-19 17:11:54 -0400 |
| commit | 99710781cc47002612e609c7dc5f34692b64e9b3 (patch) | |
| tree | ac3c980ce57d0420fff758faffbd59d111026219 | |
| parent | 7dc6bd90c569c49807462b0740b18e32fab4d8b7 (diff) | |
| download | glibc-99710781cc47002612e609c7dc5f34692b64e9b3.tar.xz glibc-99710781cc47002612e609c7dc5f34692b64e9b3.zip | |
Improve 64 bit strcat functions with SSE2/SSSE3

| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | ChangeLog | 29 |
| -rw-r--r-- | NEWS | 5 |
| -rw-r--r-- | string/strncat.c | 6 |
| -rw-r--r-- | sysdeps/x86_64/multiarch/Makefile | 6 |
| -rw-r--r-- | sysdeps/x86_64/multiarch/init-arch.c | 10 |
| -rw-r--r-- | sysdeps/x86_64/multiarch/init-arch.h | 2 |
| -rw-r--r-- | sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S | 55 |
| -rw-r--r-- | sysdeps/x86_64/multiarch/strcat-ssse3.S | 559 |
| -rw-r--r-- | sysdeps/x86_64/multiarch/strcat.S | 85 |
| -rw-r--r-- | sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S | 451 |
| -rw-r--r-- | sysdeps/x86_64/multiarch/strcpy-ssse3.S | 280 |
| -rw-r--r-- | sysdeps/x86_64/multiarch/strlen-no-bsf.S | 74 |
| -rw-r--r-- | sysdeps/x86_64/multiarch/strlen-sse2-pminub.S | 260 |
| -rw-r--r-- | sysdeps/x86_64/multiarch/strlen.S | 5 |
| -rw-r--r-- | sysdeps/x86_64/multiarch/strncat-c.c | 8 |
| -rw-r--r-- | sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S | 3 |
| -rw-r--r-- | sysdeps/x86_64/multiarch/strncat-ssse3.S | 3 |
| -rw-r--r-- | sysdeps/x86_64/multiarch/strncat.S | 3 |

18 files changed, 1523 insertions, 321 deletions
@@ -1,3 +1,32 @@
+2011-07-15 Liubov Dmitrieva <liubov.dmitrieva@intel.com>
+
+ * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
+ strcat-ssse3 strcat-sse2-unaligned strncat-ssse3
+ strncat-sse2-unaligned strncat-c strlen-sse2-pminub
+ * sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S: New file.
+ * sysdeps/x86_64/multiarch/strcat.S: New file.
+ * sysdeps/x86_64/multiarch/strncat.S: New file.
+ * sysdeps/x86_64/multiarch/strncat-c.c: New file.
+ * sysdeps/x86_64/multiarch/strcat-ssse3.S: New file.
+ * sysdeps/x86_64/multiarch/strncat-sse2-unaligned.S: New file.
+ * sysdeps/x86_64/multiarch/strncat-ssse3.S: New file.
+ * sysdeps/x86_64/multiarch/strcpy-ssse3.S
+ (USE_AS_STRCAT): Define.
+ Add strcat and strncat support.
+ * sysdeps/x86_64/multiarch/strlen-no-bsf.S: Likewise.
+ * sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S: Likewise.
+ * sysdeps/x86_64/multiarch/strlen-sse2-pminub.S: New file.
+ * string/strncat.c: Update.
+ (USE_AS_STRNCAT): Define.
+ * sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features):
+ Turn on bit_Prefer_PMINUB_for_stringop for Intel Core i3, i5
+ and i7.
+ * sysdeps/x86_64/multiarch/init-arch.h
+ (bit_Prefer_PMINUB_for_stringop): New.
+ (index_Prefer_PMINUB_for_stringop): Likewise.
+ * sysdeps/x86_64/multiarch/strlen.S (strlen): Check
+ bit_Prefer_PMINUB_for_stringop.
+
 2011-07-19 Ulrich Drepper <drepper@gmail.com>

 * crypt/sha512.h (struct sha512_ctx): Move buffer into union and add
@@ -1,4 +1,4 @@
-GNU C Library NEWS -- history of user-visible changes. 2011-7-6
+GNU C Library NEWS -- history of user-visible changes. 2011-7-19
 Copyright (C) 1992-2009, 2010, 2011 Free Software Foundation, Inc.
 See the end for copying conditions.

@@ -23,6 +23,9 @@ Version 2.15

 * Improved strcpy, strncpy, stpcpy, stpncpy for SSE2 and SSSE3 on x86-64.
 Contributed by HJ Lu.
+
+* Improved strcat and strncat on x86-64.
+ Contributed by Liubov Dmitrieva.
Version 2.14 diff --git a/string/strncat.c b/string/strncat.c index 2e2de11508..72d9d697ac 100644 --- a/string/strncat.c +++ b/string/strncat.c @@ -24,10 +24,12 @@ typedef char reg_char; #endif -#undef strncat +#ifndef STRNCAT +# define STRNCAT strncat +#endif char * -strncat (s1, s2, n) +STRNCAT (s1, s2, n) char *s1; const char *s2; size_t n; diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 88410b395a..c959dd195f 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -5,14 +5,16 @@ endif ifeq ($(subdir),string) -sysdep_routines += stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \ +sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 strncmp-ssse3 \ strend-sse4 memcmp-sse4 memcpy-ssse3 mempcpy-ssse3 \ memmove-ssse3 memcpy-ssse3-back mempcpy-ssse3-back \ memmove-ssse3-back strcasestr-nonascii strcasecmp_l-ssse3 \ strncase_l-ssse3 strlen-sse4 strlen-no-bsf memset-x86-64 \ strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \ strcpy-sse2-unaligned strncpy-sse2-unaligned \ - stpcpy-sse2-unaligned stpncpy-sse2-unaligned + stpcpy-sse2-unaligned stpncpy-sse2-unaligned \ + strcat-sse2-unaligned strncat-sse2-unaligned \ + strcat-ssse3 strncat-ssse3 strlen-sse2-pminub ifeq (yes,$(config-cflags-sse4)) sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c varshift CFLAGS-varshift.c += -msse4 diff --git a/sysdeps/x86_64/multiarch/init-arch.c b/sysdeps/x86_64/multiarch/init-arch.c index 81b2378467..0a145ca259 100644 --- a/sysdeps/x86_64/multiarch/init-arch.c +++ b/sysdeps/x86_64/multiarch/init-arch.c @@ -97,18 +97,22 @@ __init_cpu_features (void) case 0x2c: case 0x2e: case 0x2f: - /* Rep string instructions, copy backward and unaligned loads - are fast on Intel Core i3, i5 and i7. */ + /* Rep string instructions, copy backward, unaligned loads + and pminub are fast on Intel Core i3, i5 and i7. 
*/ #if index_Fast_Rep_String != index_Fast_Copy_Backward # error index_Fast_Rep_String != index_Fast_Copy_Backward #endif #if index_Fast_Rep_String != index_Fast_Unaligned_Load # error index_Fast_Rep_String != index_Fast_Unaligned_Load #endif +#if index_Fast_Rep_String != index_Prefer_PMINUB_for_stringop +# error index_Fast_Rep_String != index_Prefer_PMINUB_for_stringop +#endif __cpu_features.feature[index_Fast_Rep_String] |= (bit_Fast_Rep_String | bit_Fast_Copy_Backward - | bit_Fast_Unaligned_Load); + | bit_Fast_Unaligned_Load + | bit_Prefer_PMINUB_for_stringop); break; } } diff --git a/sysdeps/x86_64/multiarch/init-arch.h b/sysdeps/x86_64/multiarch/init-arch.h index addf5f3dde..6cfdbddc4e 100644 --- a/sysdeps/x86_64/multiarch/init-arch.h +++ b/sysdeps/x86_64/multiarch/init-arch.h @@ -21,6 +21,7 @@ #define bit_Slow_BSF (1 << 2) #define bit_Prefer_SSE_for_memop (1 << 3) #define bit_Fast_Unaligned_Load (1 << 4) +#define bit_Prefer_PMINUB_for_stringop (1 << 5) #ifdef __ASSEMBLER__ @@ -41,6 +42,7 @@ # define index_Slow_BSF FEATURE_INDEX_1*FEATURE_SIZE # define index_Prefer_SSE_for_memop FEATURE_INDEX_1*FEATURE_SIZE # define index_Fast_Unaligned_Load FEATURE_INDEX_1*FEATURE_SIZE +# define index_Prefer_PMINUB_for_stringop FEATURE_INDEX_1*FEATURE_SIZE #else /* __ASSEMBLER__ */ diff --git a/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S new file mode 100644 index 0000000000..1150281fe4 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S @@ -0,0 +1,55 @@ +/* strcat with SSE2 + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#ifndef NOT_IN_libc + +# include <sysdep.h> + +# ifndef STRCAT +# define STRCAT __strcat_sse2_unaligned +# endif + +# define USE_AS_STRCAT + +.text +ENTRY (STRCAT) + mov %rdi, %r9 +# ifdef USE_AS_STRNCAT + mov %rdx, %r8 +# endif + +# define RETURN jmp L(StartStrcpyPart) +# include "strlen-sse2-pminub.S" +# undef RETURN + +L(StartStrcpyPart): + lea (%r9, %rax), %rdi + mov %rsi, %rcx + mov %r9, %rax /* save result */ + +# ifdef USE_AS_STRNCAT + test %r8, %r8 + jz L(ExitZero) +# define USE_AS_STRNCPY +# endif + +# include "strcpy-sse2-unaligned.S" +#endif + diff --git a/sysdeps/x86_64/multiarch/strcat-ssse3.S b/sysdeps/x86_64/multiarch/strcat-ssse3.S new file mode 100644 index 0000000000..66736a7087 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strcat-ssse3.S @@ -0,0 +1,559 @@ +/* strcat with SSSE3 + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. */ + +#ifndef NOT_IN_libc + +# include <sysdep.h> + +# ifndef STRCAT +# define STRCAT __strcat_ssse3 +# endif + +# define USE_AS_STRCAT + +.text +ENTRY (STRCAT) +# ifdef USE_AS_STRNCAT + mov %rdx, %r8 +# endif + +# define RETURN jmp L(StartStrcpyPart) +# include "strlen-no-bsf.S" + +# undef RETURN + +L(StartStrcpyPart): + mov %rsi, %rcx + lea (%rdi, %rax), %rdx +# ifdef USE_AS_STRNCAT + test %r8, %r8 + jz L(StrncatExit0) + cmp $8, %r8 + jbe L(StrncatExit8Bytes) +# endif + cmpb $0, (%rcx) + jz L(Exit1) + cmpb $0, 1(%rcx) + jz L(Exit2) + cmpb $0, 2(%rcx) + jz L(Exit3) + cmpb $0, 3(%rcx) + jz L(Exit4) + cmpb $0, 4(%rcx) + jz L(Exit5) + cmpb $0, 5(%rcx) + jz L(Exit6) + cmpb $0, 6(%rcx) + jz L(Exit7) + cmpb $0, 7(%rcx) + jz L(Exit8) + cmpb $0, 8(%rcx) + jz L(Exit9) +# ifdef USE_AS_STRNCAT + cmp $16, %r8 + jb L(StrncatExit15Bytes) +# endif + cmpb $0, 9(%rcx) + jz L(Exit10) + cmpb $0, 10(%rcx) + jz L(Exit11) + cmpb $0, 11(%rcx) + jz L(Exit12) + cmpb $0, 12(%rcx) + jz L(Exit13) + cmpb $0, 13(%rcx) + jz L(Exit14) + cmpb $0, 14(%rcx) + jz L(Exit15) + cmpb $0, 15(%rcx) + jz L(Exit16) +# ifdef USE_AS_STRNCAT + cmp $16, %r8 + je L(StrncatExit16) +# define USE_AS_STRNCPY +# endif + +# include "strcpy-ssse3.S" + + .p2align 4 +L(CopyFrom1To16Bytes): + add %rsi, %rdx + add %rsi, %rcx + + test %al, %al + jz L(ExitHigh) + test $0x01, %al + jnz L(Exit1) + test $0x02, %al + jnz L(Exit2) + test $0x04, %al + jnz L(Exit3) + test $0x08, %al + jnz L(Exit4) + test $0x10, %al + jnz L(Exit5) + test $0x20, %al + jnz L(Exit6) + test $0x40, %al + jnz L(Exit7) + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(ExitHigh): + test $0x01, %ah + jnz L(Exit9) + test $0x02, %ah + jnz L(Exit10) + test $0x04, %ah + jnz L(Exit11) + test $0x08, %ah + jnz 
L(Exit12) + test $0x10, %ah + jnz L(Exit13) + test $0x20, %ah + jnz L(Exit14) + test $0x40, %ah + jnz L(Exit15) + movlpd (%rcx), %xmm0 + movlpd 8(%rcx), %xmm1 + movlpd %xmm0, (%rdx) + movlpd %xmm1, 8(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit1): + xor %ah, %ah + movb %ah, 1(%rdx) +L(Exit1): + movb (%rcx), %al + movb %al, (%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit2): + xor %ah, %ah + movb %ah, 2(%rdx) +L(Exit2): + movw (%rcx), %ax + movw %ax, (%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit3): + xor %ah, %ah + movb %ah, 3(%rdx) +L(Exit3): + movw (%rcx), %ax + movw %ax, (%rdx) + movb 2(%rcx), %al + movb %al, 2(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit4): + xor %ah, %ah + movb %ah, 4(%rdx) +L(Exit4): + mov (%rcx), %eax + mov %eax, (%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit5): + xor %ah, %ah + movb %ah, 5(%rdx) +L(Exit5): + mov (%rcx), %eax + mov %eax, (%rdx) + movb 4(%rcx), %al + movb %al, 4(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit6): + xor %ah, %ah + movb %ah, 6(%rdx) +L(Exit6): + mov (%rcx), %eax + mov %eax, (%rdx) + movw 4(%rcx), %ax + movw %ax, 4(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit7): + xor %ah, %ah + movb %ah, 7(%rdx) +L(Exit7): + mov (%rcx), %eax + mov %eax, (%rdx) + mov 3(%rcx), %eax + mov %eax, 3(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit8): + xor %ah, %ah + movb %ah, 8(%rdx) +L(Exit8): + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit9): + xor %ah, %ah + movb %ah, 9(%rdx) +L(Exit9): + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + movb 8(%rcx), %al + movb %al, 8(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit10): + xor %ah, %ah + movb %ah, 10(%rdx) +L(Exit10): + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + movw 8(%rcx), %ax + movw %ax, 8(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit11): + xor %ah, %ah + movb %ah, 11(%rdx) +L(Exit11): + movlpd (%rcx), 
%xmm0 + movlpd %xmm0, (%rdx) + mov 7(%rcx), %eax + mov %eax, 7(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit12): + xor %ah, %ah + movb %ah, 12(%rdx) +L(Exit12): + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + mov 8(%rcx), %eax + mov %eax, 8(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit13): + xor %ah, %ah + movb %ah, 13(%rdx) +L(Exit13): + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + movlpd 5(%rcx), %xmm1 + movlpd %xmm1, 5(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit14): + xor %ah, %ah + movb %ah, 14(%rdx) +L(Exit14): + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + movlpd 6(%rcx), %xmm1 + movlpd %xmm1, 6(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit15): + xor %ah, %ah + movb %ah, 15(%rdx) +L(Exit15): + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + movlpd 7(%rcx), %xmm1 + movlpd %xmm1, 7(%rdx) + mov %rdi, %rax + ret + + .p2align 4 +L(StrncatExit16): + xor %ah, %ah + movb %ah, 16(%rdx) +L(Exit16): + movlpd (%rcx), %xmm0 + movlpd 8(%rcx), %xmm1 + movlpd %xmm0, (%rdx) + movlpd %xmm1, 8(%rdx) + mov %rdi, %rax + ret + +# ifdef USE_AS_STRNCPY + + .p2align 4 +L(CopyFrom1To16BytesCase2): + add $16, %r8 + add %rsi, %rcx + lea (%rsi, %rdx), %rsi + lea -9(%r8), %rdx + and $1<<7, %dh + or %al, %dh + test %dh, %dh + lea (%rsi), %rdx + jz L(ExitHighCase2) + + test $0x01, %al + jnz L(Exit1) + cmp $1, %r8 + je L(StrncatExit1) + test $0x02, %al + jnz L(Exit2) + cmp $2, %r8 + je L(StrncatExit2) + test $0x04, %al + jnz L(Exit3) + cmp $3, %r8 + je L(StrncatExit3) + test $0x08, %al + jnz L(Exit4) + cmp $4, %r8 + je L(StrncatExit4) + test $0x10, %al + jnz L(Exit5) + cmp $5, %r8 + je L(StrncatExit5) + test $0x20, %al + jnz L(Exit6) + cmp $6, %r8 + je L(StrncatExit6) + test $0x40, %al + jnz L(Exit7) + cmp $7, %r8 + je L(StrncatExit7) + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + lea 7(%rdx), %rax + cmpb $1, (%rax) + sbb $-1, %rax + xor %cl, %cl + movb %cl, (%rax) + mov %rdi, %rax + ret + + .p2align 4 +L(ExitHighCase2): + test $0x01, 
%ah + jnz L(Exit9) + cmp $9, %r8 + je L(StrncatExit9) + test $0x02, %ah + jnz L(Exit10) + cmp $10, %r8 + je L(StrncatExit10) + test $0x04, %ah + jnz L(Exit11) + cmp $11, %r8 + je L(StrncatExit11) + test $0x8, %ah + jnz L(Exit12) + cmp $12, %r8 + je L(StrncatExit12) + test $0x10, %ah + jnz L(Exit13) + cmp $13, %r8 + je L(StrncatExit13) + test $0x20, %ah + jnz L(Exit14) + cmp $14, %r8 + je L(StrncatExit14) + test $0x40, %ah + jnz L(Exit15) + cmp $15, %r8 + je L(StrncatExit15) + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + movlpd 8(%rcx), %xmm1 + movlpd %xmm1, 8(%rdx) + mov %rdi, %rax + ret + +L(CopyFrom1To16BytesCase2OrCase3): + test %rax, %rax + jnz L(CopyFrom1To16BytesCase2) + + .p2align 4 +L(CopyFrom1To16BytesCase3): + add $16, %r8 + add %rsi, %rdx + add %rsi, %rcx + + cmp $8, %r8 + ja L(ExitHighCase3) + cmp $1, %r8 + je L(StrncatExit1) + cmp $2, %r8 + je L(StrncatExit2) + cmp $3, %r8 + je L(StrncatExit3) + cmp $4, %r8 + je L(StrncatExit4) + cmp $5, %r8 + je L(StrncatExit5) + cmp $6, %r8 + je L(StrncatExit6) + cmp $7, %r8 + je L(StrncatExit7) + movlpd (%rcx), %xmm0 + movlpd %xmm0, (%rdx) + xor %ah, %ah + movb %ah, 8(%rdx) + mov |
