diff options
| author | H.J. Lu <hongjiu.lu@intel.com> | 2011-06-24 14:15:32 -0400 |
|---|---|---|
| committer | Ulrich Drepper <drepper@gmail.com> | 2011-06-24 14:15:32 -0400 |
| commit | 0b1cbaaef5ccc21baf2c35d4698fb28e82eab385 (patch) | |
| tree | c1f6ad8a49ef79510355c765ad3e385067e7ade0 | |
| parent | 07f494a027b3adea1f3cd0cd4ca7c10949cdc476 (diff) | |
| download | glibc-0b1cbaaef5ccc21baf2c35d4698fb28e82eab385.tar.xz glibc-0b1cbaaef5ccc21baf2c35d4698fb28e82eab385.zip | |
Optimized st{r,p}{,n}cpy for SSE2/SSSE3 on x86-32
| -rw-r--r-- | ChangeLog | 25 | ||||
| -rw-r--r-- | NEWS | 5 | ||||
| -rw-r--r-- | sysdeps/i386/i686/multiarch/Makefile | 4 | ||||
| -rw-r--r-- | sysdeps/i386/i686/multiarch/stpcpy-sse2.S | 3 | ||||
| -rw-r--r-- | sysdeps/i386/i686/multiarch/stpcpy-ssse3.S | 3 | ||||
| -rw-r--r-- | sysdeps/i386/i686/multiarch/stpcpy.S | 7 | ||||
| -rw-r--r-- | sysdeps/i386/i686/multiarch/stpncpy-sse2.S | 4 | ||||
| -rw-r--r-- | sysdeps/i386/i686/multiarch/stpncpy-ssse3.S | 4 | ||||
| -rw-r--r-- | sysdeps/i386/i686/multiarch/stpncpy.S | 6 | ||||
| -rw-r--r-- | sysdeps/i386/i686/multiarch/strcpy-sse2.S | 2251 | ||||
| -rw-r--r-- | sysdeps/i386/i686/multiarch/strcpy-ssse3.S | 4090 | ||||
| -rw-r--r-- | sysdeps/i386/i686/multiarch/strcpy.S | 154 | ||||
| -rw-r--r-- | sysdeps/i386/i686/multiarch/strncpy-c.c | 8 | ||||
| -rw-r--r-- | sysdeps/i386/i686/multiarch/strncpy-sse2.S | 3 | ||||
| -rw-r--r-- | sysdeps/i386/i686/multiarch/strncpy-ssse3.S | 3 | ||||
| -rw-r--r-- | sysdeps/i386/i686/multiarch/strncpy.S | 3 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/init-arch.c | 11 | ||||
| -rw-r--r-- | sysdeps/x86_64/multiarch/init-arch.h | 6 |
18 files changed, 6585 insertions, 5 deletions
@@ -1,3 +1,28 @@ +2011-06-22 H.J. Lu <hongjiu.lu@intel.com> + + * sysdeps/i386/i686/multiarch/Makefile (sysdep_routines): Add + strncpy-c strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 + strcpy-sse2 strncpy-sse2 stpcpy-sse2 stpncpy-sse2. + * sysdeps/i386/i686/multiarch/stpcpy-sse2.S: New file. + * sysdeps/i386/i686/multiarch/stpcpy-ssse3.S: New file. + * sysdeps/i386/i686/multiarch/stpncpy-sse2.S: New file. + * sysdeps/i386/i686/multiarch/stpncpy-ssse3.S: New file. + * sysdeps/i386/i686/multiarch/stpncpy.S : New file. + * sysdeps/i386/i686/multiarch/strcpy-sse2.S : New file. + * sysdeps/i386/i686/multiarch/strcpy-ssse3.S: New file. + * sysdeps/i386/i686/multiarch/strcpy.S: New file. + * sysdeps/i386/i686/multiarch/strncpy-c.c: New file. + * sysdeps/i386/i686/multiarch/strncpy-sse2.S: New file. + * sysdeps/i386/i686/multiarch/strncpy-ssse3.S: New file. + * sysdeps/i386/i686/multiarch/strncpy.S: New file. + * sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features): + Enable unaligned load optimization for Intel Core i3, i5 and i7 + processors. + * sysdeps/x86_64/multiarch/init-arch.h (bit_Fast_Unaligned_Load): + Define. + (index_Fast_Unaligned_Load): Define. + (HAS_FAST_UNALIGNED_LOAD): Define. + 2011-06-23 Marek Polacek <mpolacek@redhat.com> * nss/nss_db/db-open.c: Include <unistd.h> for read declaration. @@ -1,4 +1,4 @@ -GNU C Library NEWS -- history of user-visible changes. 2011-6-22 +GNU C Library NEWS -- history of user-visible changes. 2011-6-24 Copyright (C) 1992-2009, 2010, 2011 Free Software Foundation, Inc. See the end for copying conditions. @@ -17,6 +17,9 @@ Version 2.15 * Add nss_db support back to glibc. No more dependency on Berkeley db and support for initgroups lookups. Implemented by Ulrich Drepper. + +* Optimized strcpy, strncpy, stpcpy, stpncpy for SSE2 and SSSE3 on x86-32. + Contributed by HJ Lu. 
Version 2.14 diff --git a/sysdeps/i386/i686/multiarch/Makefile b/sysdeps/i386/i686/multiarch/Makefile index 32286d8d38..4bae699caf 100644 --- a/sysdeps/i386/i686/multiarch/Makefile +++ b/sysdeps/i386/i686/multiarch/Makefile @@ -10,7 +10,9 @@ sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \ memset-sse2-rep bzero-sse2-rep strcmp-ssse3 \ strcmp-sse4 strncmp-c strncmp-ssse3 strncmp-sse4 \ memcmp-ssse3 memcmp-sse4 strcasestr-nonascii varshift \ - strlen-sse2 strlen-sse2-bsf + strlen-sse2 strlen-sse2-bsf strncpy-c strcpy-ssse3 \ + strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 strcpy-sse2 \ + strncpy-sse2 stpcpy-sse2 stpncpy-sse2 ifeq (yes,$(config-cflags-sse4)) sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c CFLAGS-varshift.c += -msse4 diff --git a/sysdeps/i386/i686/multiarch/stpcpy-sse2.S b/sysdeps/i386/i686/multiarch/stpcpy-sse2.S new file mode 100644 index 0000000000..46ca1b3074 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/stpcpy-sse2.S @@ -0,0 +1,3 @@ +#define USE_AS_STPCPY +#define STRCPY __stpcpy_sse2 +#include "strcpy-sse2.S" diff --git a/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S b/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S new file mode 100644 index 0000000000..d971c2da38 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S @@ -0,0 +1,3 @@ +#define USE_AS_STPCPY +#define STRCPY __stpcpy_ssse3 +#include "strcpy-ssse3.S" diff --git a/sysdeps/i386/i686/multiarch/stpcpy.S b/sysdeps/i386/i686/multiarch/stpcpy.S new file mode 100644 index 0000000000..b63d308edc --- /dev/null +++ b/sysdeps/i386/i686/multiarch/stpcpy.S @@ -0,0 +1,7 @@ +#define USE_AS_STPCPY +#define STRCPY __stpcpy +#include "strcpy.S" + +weak_alias (__stpcpy, stpcpy) +libc_hidden_def (__stpcpy) +libc_hidden_builtin_def (stpcpy) diff --git a/sysdeps/i386/i686/multiarch/stpncpy-sse2.S b/sysdeps/i386/i686/multiarch/stpncpy-sse2.S new file mode 100644 index 0000000000..37a703cb76 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/stpncpy-sse2.S @@ -0,0 +1,4 
@@ +#define USE_AS_STPCPY +#define USE_AS_STRNCPY +#define STRCPY __stpncpy_sse2 +#include "strcpy-sse2.S" diff --git a/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S b/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S new file mode 100644 index 0000000000..14ed16f6b5 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S @@ -0,0 +1,4 @@ +#define USE_AS_STPCPY +#define USE_AS_STRNCPY +#define STRCPY __stpncpy_ssse3 +#include "strcpy-ssse3.S" diff --git a/sysdeps/i386/i686/multiarch/stpncpy.S b/sysdeps/i386/i686/multiarch/stpncpy.S new file mode 100644 index 0000000000..ff89a89491 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/stpncpy.S @@ -0,0 +1,6 @@ +#define STRCPY __stpncpy +#define USE_AS_STPCPY +#define USE_AS_STRNCPY +#include "strcpy.S" + +weak_alias (__stpncpy, stpncpy) diff --git a/sysdeps/i386/i686/multiarch/strcpy-sse2.S b/sysdeps/i386/i686/multiarch/strcpy-sse2.S new file mode 100644 index 0000000000..fad1ae2b67 --- /dev/null +++ b/sysdeps/i386/i686/multiarch/strcpy-sse2.S @@ -0,0 +1,2251 @@ +/* strcpy with SSE2 and unaligned load + Copyright (C) 2011 Free Software Foundation, Inc. + Contributed by Intel Corporation. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. 
*/
+
+
+#ifndef NOT_IN_libc
+
+# include <sysdep.h>
+
+
+/* Unwind annotations for one 4-byte push of REG: the CFA moves 4 bytes
+   further from %esp and REG is saved at the new top of stack.  */
+# define CFI_PUSH(REG) \
+	cfi_adjust_cfa_offset (4); \
+	cfi_rel_offset (REG, 0)
+
+/* Unwind annotations for one 4-byte pop of REG (inverse of CFI_PUSH).  */
+# define CFI_POP(REG) \
+	cfi_adjust_cfa_offset (-4); \
+	cfi_restore (REG)
+
+/* Push/pop a register together with its unwind annotation.  */
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
+
+# ifndef STRCPY
+# define STRCPY __strcpy_sse2
+# endif
+
+/* Offsets of the three stack arguments from %esp, once ENTRANCE has run.  */
+# define STR1 PARMS
+# define STR2 STR1+4
+# define LEN STR2+4
+
+# ifdef USE_AS_STRNCPY
+/* 16 = 4 bytes of return address + the three 4-byte registers pushed by
+   ENTRANCE.  */
+# define PARMS 16
+# define ENTRANCE PUSH(%ebx); PUSH(%esi); PUSH(%edi)
+/* Pop the saved registers and return.  The CFI_PUSH sequence after `ret'
+   re-establishes the unwind state for the code that follows a RETURN,
+   which still runs with all three registers on the stack.  It must name
+   the registers in the same order as ENTRANCE pushes them (ebx, esi,
+   edi); otherwise the save slot of %esi is not described to the
+   unwinder.  */
+# define RETURN POP(%edi); POP(%esi); POP(%ebx); ret; \
+	CFI_PUSH(%ebx); CFI_PUSH(%esi); CFI_PUSH(%edi);
+
+# ifdef SHARED
+# define JMPTBL(I, B) I - B
+
+/* Load an entry in a jump table into ECX and branch to it. TABLE is a
+   jump table with relative offsets.
+   INDEX is a register that contains the index into the jump table.
+   SCALE is the scale of INDEX. */
+
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+	/* We first load PC into ECX. */ \
+	call __i686.get_pc_thunk.cx; \
+	/* Get the address of the jump table. */ \
+	addl $(TABLE - .), %ecx; \
+	/* Get the entry and convert the relative offset to the \
+	   absolute address. */ \
+	addl (%ecx,INDEX,SCALE), %ecx; \
+	/* We loaded the jump table and adjusted ECX. Go. */ \
+	jmp *%ecx
+# else
+# define JMPTBL(I, B) I
+
+/* Branch to an entry in a jump table. TABLE is a jump table with
+   absolute offsets. INDEX is a register that contains the index into the
+   jump table. SCALE is the scale of INDEX. 
*/ + +# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + jmp *TABLE(,INDEX,SCALE) +# endif + +.text +ENTRY (STRCPY) + ENTRANCE + mov STR1(%esp), %edi + mov STR2(%esp), %esi + movl LEN(%esp), %ebx + test %ebx, %ebx + jz L(ExitZero) + + mov %esi, %ecx +# ifndef USE_AS_STPCPY + mov %edi, %eax /* save result */ +# endif + and $15, %ecx + jz L(SourceStringAlignmentZero) + + and $-16, %esi + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + + pcmpeqb (%esi), %xmm1 + add %ecx, %ebx + pmovmskb %xmm1, %edx + shr %cl, %edx +# ifdef USE_AS_STPCPY + cmp $16, %ebx + jbe L(CopyFrom1To16BytesTailCase2OrCase3) +# else + cmp $17, %ebx + jbe L(CopyFrom1To16BytesTailCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16BytesTail) + + pcmpeqb 16(%esi), %xmm0 + pmovmskb %xmm0, %edx +# ifdef USE_AS_STPCPY + cmp $32, %ebx + jbe L(CopyFrom1To32BytesCase2OrCase3) +# else + cmp $33, %ebx + jbe L(CopyFrom1To32BytesCase2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To32Bytes) + + movdqu (%esi, %ecx), %xmm1 /* copy 16 bytes */ + movdqu %xmm1, (%edi) + + sub %ecx, %edi + +/* If source adress alignment != destination adress alignment */ + .p2align 4 +L(Unalign16Both): + mov $16, %ecx + movdqa (%esi, %ecx), %xmm1 + movaps 16(%esi, %ecx), %xmm2 + movdqu %xmm1, (%edi, %ecx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx + sub $48, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm2) + + movaps 16(%esi, %ecx), %xmm3 + movdqu %xmm2, (%edi, %ecx) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm3) + + movaps 16(%esi, %ecx), %xmm4 + movdqu %xmm3, (%edi, %ecx) + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm4) + + movaps 16(%esi, %ecx), %xmm1 + movdqu %xmm4, (%edi, %ecx) + pcmpeqb 
%xmm1, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm1) + + movaps 16(%esi, %ecx), %xmm2 + movdqu %xmm1, (%edi, %ecx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm2) + + movaps 16(%esi, %ecx), %xmm3 + movdqu %xmm2, (%edi, %ecx) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %edx + add $16, %ecx + sub $16, %ebx + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %edx, %edx + jnz L(CopyFrom1To16BytesUnalignedXmm3) + + movdqu %xmm3, (%edi, %ecx) + mov %esi, %edx + lea 16(%esi, %ecx), %esi + and $-0x40, %esi + sub %esi, %edx + sub %edx, %edi + lea 128(%ebx, %edx), %ebx + +L(Unaligned64Loop): + movaps (%esi), %xmm2 + movaps %xmm2, %xmm4 + movaps 16(%esi), %xmm5 + movaps 32(%esi), %xmm3 + movaps %xmm3, %xmm6 + movaps 48(%esi), %xmm7 + pminub %xmm5, %xmm2 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %edx + sub $64, %ebx + jbe L(UnalignedLeaveCase2OrCase3) + test %edx, %edx + jnz L(Unaligned64Leave) +L(Unaligned64Loop_start): + add $64, %edi + add $64, %esi + movdqu %xmm4, -64(%edi) + movaps (%esi), %xmm2 + movdqa %xmm2, %xmm4 + movdqu %xmm5, -48(%edi) + movaps 16(%esi), %xmm5 + pminub %xmm5, %xmm2 + movaps 32(%esi), %xmm3 + movdqu %xmm6, -32(%edi) + movaps %xmm3, %xmm6 + movdqu %xmm7, -16(%edi) + movaps 48(%esi), %xmm7 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %edx + sub $64, %ebx + jbe L(UnalignedLeaveCase2OrCase3) + test %edx, %edx + jz L(Unaligned64Loop_start) +L(Unaligned64Leave): + pxor %xmm1, %xmm1 + + pcmpeqb %xmm4, %xmm0 + pcmpeqb %xmm5, %xmm1 + pmovmskb %xmm0, %edx + pmovmskb %xmm1, %ecx + test %edx, %edx + jnz L(CopyFrom1To16BytesUnaligned_0) + test %ecx, %ecx + jnz L(CopyFrom1To16BytesUnaligned_16) + + pcmpeqb %xmm6, %xmm0 + pcmpeqb %xmm7, %xmm1 + 
pmovmskb %xmm0, %edx + pmovmskb %xmm1, %ecx + test %edx, %edx + jnz L(CopyFrom1To16BytesUnaligned_32) + + bsf %ecx, %edx + movdqu %xmm4, (%edi) + movdqu %xmm5, 16(%edi) + movdqu %xmm6, 32(%edi) +# ifdef USE_AS_STPCPY + lea 48(%edi, %edx), %eax +# endif + movdqu %xmm7, 48(%edi) + add $15, %ebx + sub %edx, %ebx + lea 49(%edi, %edx), %edi + jmp L(StrncpyFillTailWithZero) + +/* If source adress alignment == destination adress alignment */ + +L(SourceStringAlignmentZero): + pxor %xmm0, %xmm0 + movdqa (%esi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %edx +# ifdef USE_AS_STPCPY + cmp $16, %ebx + jbe L(CopyFrom1To16BytesTail1Case2OrCase3) +# else + cmp $17, %ebx + jbe L(CopyFrom1To16BytesTail1Case2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To16BytesTail1) + + pcmpeqb 16(%esi), %xmm0 + movdqu %xmm1, (%edi) + pmovmskb %xmm0, %edx +# ifdef USE_AS_STPCPY + cmp $32, %ebx + jbe L(CopyFrom1To32Bytes1Case2OrCase3) +# else + cmp $33, %ebx + jbe L(CopyFrom1To32Bytes1Case2OrCase3) +# endif + test %edx, %edx + jnz L(CopyFrom1To32Bytes1) + + jmp L(Unalign16Both) + +/*-----------------End of main part---------------------------*/ + +/* Case1 */ + .p2align 4 +L(CopyFrom1To16BytesTail): + sub %ecx, %ebx + add %ecx, %esi + bsf %edx, %edx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes1): + add $16, %esi + add $16, %edi + sub $16, %ebx +L(CopyFrom1To16BytesTail1): + bsf %edx, %edx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes): + sub %ecx, %ebx + bsf %edx, %edx + add %ecx, %esi + add $16, %edx + sub %ecx, %edx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_0): + bsf %edx, %edx +# ifdef USE_AS_STPCPY + lea (%edi, %edx), %eax +# endif + movdqu %xmm4, (%edi) + add $63, %ebx + sub %edx, %ebx + lea 1(%edi, %edx), %edi + jmp L(StrncpyFillTailWithZero) + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_16): + bsf %ecx, %edx + movdqu %xmm4, (%edi) +# ifdef 
USE_AS_STPCPY + lea 16(%edi, %edx), %eax +# endif + movdqu %xmm5, 16(%edi) + add $47, %ebx + sub %edx, %ebx + lea 17(%edi, %edx), %edi + jmp L(StrncpyFillTailWithZero) + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_32): + bsf %edx, %edx + movdqu %xmm4, (%edi) + movdqu %xmm5, 16(%edi) +# ifdef USE_AS_STPCPY + lea 32(%edi, %edx), %eax +# endif + movdqu %xmm6, 32(%edi) + add $31, %ebx + sub %edx, %ebx + lea 33(%edi, %edx), %edi + jmp L(StrncpyFillTailWithZero) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm6): + movdqu %xmm6, (%edi, %ecx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm5): + movdqu %xmm5, (%edi, %ecx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm4): + movdqu %xmm4, (%edi, %ecx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm3): + movdqu %xmm3, (%edi, %ecx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm1): + movdqu %xmm1, (%edi, %ecx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesExit): + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) + +/* Case2 */ + + .p2align 4 +L(CopyFrom1To16BytesCase2): + add $16, %ebx + add %ecx, %edi + add %ecx, %esi + bsf %edx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + + .p2align 4 +L(CopyFrom1To32BytesCase2): + sub %ecx, %ebx + add %ecx, %esi + bsf %edx, %edx + add $16, %edx + sub %ecx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + +L(CopyFrom1To16BytesTailCase2): + sub %ecx, %ebx + add %ecx, %esi + bsf %edx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + +L(CopyFrom1To16BytesTail1Case2): + bsf %edx, %edx + cmp %ebx, %edx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + +/* Case2 or Case3, Case3 */ + + .p2align 4 
+L(CopyFrom1To16BytesCase2OrCase3): + test %edx, %edx + jnz L(CopyFrom1To16BytesCase2) +L(CopyFrom1To16BytesCase3): + add $16, %ebx + add %ecx, %edi + add %ecx, %esi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + + .p2align 4 +L(CopyFrom1To32BytesCase2OrCase3): + test %edx, %edx + jnz L(CopyFrom1To32BytesCase2) + sub %ecx, %ebx + add %ecx, %esi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + + .p2align 4 +L(CopyFrom1To16BytesTailCase2OrCase3): + test %edx, %edx + jnz L(CopyFrom1To16BytesTailCase2) + sub %ecx, %ebx + add %ecx, %esi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes1Case2OrCase3): + add $16, %edi + add $16, %esi + sub $16, %ebx +L(CopyFrom1To16BytesTail1Case2OrCase3): + test %edx, %edx + jnz L(CopyFrom1To16BytesTail1Case2) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) + + .p2align 4 +L(Exit0): +# ifdef USE_AS_STPCPY + mov %edi, %eax +# endif + RETURN + + .p2align 4 +L(Exit1): + movb %dh, (%edi) +# ifdef USE_AS_STPCPY + lea (%edi), %eax +# endif + sub $1, %ebx + lea 1(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit2): + movw (%esi), %dx + movw %dx, (%edi) +# ifdef USE_AS_STPCPY + lea 1(%edi), %eax +# endif + sub $2, %ebx + lea 2(%edi), %edi + jnz L(StrncpyFillTailWithZero) + RETURN + + .p2align 4 +L(Exit3): + movw (%esi), %cx + movw %cx, (%edi) + movb %dh, 2(%edi) |
