aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author H.J. Lu <hongjiu.lu@intel.com> 2011-06-24 14:15:32 -0400
committer Ulrich Drepper <drepper@gmail.com> 2011-06-24 14:15:32 -0400
commit 0b1cbaaef5ccc21baf2c35d4698fb28e82eab385 (patch)
tree c1f6ad8a49ef79510355c765ad3e385067e7ade0
parent 07f494a027b3adea1f3cd0cd4ca7c10949cdc476 (diff)
download glibc-0b1cbaaef5ccc21baf2c35d4698fb28e82eab385.tar.xz
glibc-0b1cbaaef5ccc21baf2c35d4698fb28e82eab385.zip
Optimized st{r,p}{,n}cpy for SSE2/SSSE3 on x86-32
-rw-r--r-- ChangeLog 25
-rw-r--r-- NEWS 5
-rw-r--r-- sysdeps/i386/i686/multiarch/Makefile 4
-rw-r--r-- sysdeps/i386/i686/multiarch/stpcpy-sse2.S 3
-rw-r--r-- sysdeps/i386/i686/multiarch/stpcpy-ssse3.S 3
-rw-r--r-- sysdeps/i386/i686/multiarch/stpcpy.S 7
-rw-r--r-- sysdeps/i386/i686/multiarch/stpncpy-sse2.S 4
-rw-r--r-- sysdeps/i386/i686/multiarch/stpncpy-ssse3.S 4
-rw-r--r-- sysdeps/i386/i686/multiarch/stpncpy.S 6
-rw-r--r-- sysdeps/i386/i686/multiarch/strcpy-sse2.S 2251
-rw-r--r-- sysdeps/i386/i686/multiarch/strcpy-ssse3.S 4090
-rw-r--r-- sysdeps/i386/i686/multiarch/strcpy.S 154
-rw-r--r-- sysdeps/i386/i686/multiarch/strncpy-c.c 8
-rw-r--r-- sysdeps/i386/i686/multiarch/strncpy-sse2.S 3
-rw-r--r-- sysdeps/i386/i686/multiarch/strncpy-ssse3.S 3
-rw-r--r-- sysdeps/i386/i686/multiarch/strncpy.S 3
-rw-r--r-- sysdeps/x86_64/multiarch/init-arch.c 11
-rw-r--r-- sysdeps/x86_64/multiarch/init-arch.h 6
18 files changed, 6585 insertions, 5 deletions
diff --git a/ChangeLog b/ChangeLog
index b4d6496886..097ad2094c 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,28 @@
+2011-06-22 H.J. Lu <hongjiu.lu@intel.com>
+
+ * sysdeps/i386/i686/multiarch/Makefile (sysdep_routines): Add
+ strncpy-c strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3
+ strcpy-sse2 strncpy-sse2 stpcpy-sse2 stpncpy-sse2.
+ * sysdeps/i386/i686/multiarch/stpcpy-sse2.S: New file.
+ * sysdeps/i386/i686/multiarch/stpcpy-ssse3.S: New file.
+ * sysdeps/i386/i686/multiarch/stpncpy-sse2.S: New file.
+ * sysdeps/i386/i686/multiarch/stpncpy-ssse3.S: New file.
+ * sysdeps/i386/i686/multiarch/stpncpy.S: New file.
+ * sysdeps/i386/i686/multiarch/strcpy-sse2.S: New file.
+ * sysdeps/i386/i686/multiarch/strcpy-ssse3.S: New file.
+ * sysdeps/i386/i686/multiarch/strcpy.S: New file.
+ * sysdeps/i386/i686/multiarch/strncpy-c.c: New file.
+ * sysdeps/i386/i686/multiarch/strncpy-sse2.S: New file.
+ * sysdeps/i386/i686/multiarch/strncpy-ssse3.S: New file.
+ * sysdeps/i386/i686/multiarch/strncpy.S: New file.
+ * sysdeps/x86_64/multiarch/init-arch.c (__init_cpu_features):
+ Enable unaligned load optimization for Intel Core i3, i5 and i7
+ processors.
+ * sysdeps/x86_64/multiarch/init-arch.h (bit_Fast_Unaligned_Load):
+ Define.
+ (index_Fast_Unaligned_Load): Define.
+ (HAS_FAST_UNALIGNED_LOAD): Define.
+
2011-06-23 Marek Polacek <mpolacek@redhat.com>
* nss/nss_db/db-open.c: Include <unistd.h> for read declaration.
diff --git a/NEWS b/NEWS
index 5a7ffc2f3f..edb356d19f 100644
--- a/NEWS
+++ b/NEWS
@@ -1,4 +1,4 @@
-GNU C Library NEWS -- history of user-visible changes. 2011-6-22
+GNU C Library NEWS -- history of user-visible changes. 2011-6-24
Copyright (C) 1992-2009, 2010, 2011 Free Software Foundation, Inc.
See the end for copying conditions.
@@ -17,6 +17,9 @@ Version 2.15
* Add nss_db support back to glibc. No more dependency on Berkeley db
and support for initgroups lookups.
Implemented by Ulrich Drepper.
+
+* Optimized strcpy, strncpy, stpcpy, stpncpy for SSE2 and SSSE3 on x86-32.
+ Contributed by HJ Lu.
Version 2.14
diff --git a/sysdeps/i386/i686/multiarch/Makefile b/sysdeps/i386/i686/multiarch/Makefile
index 32286d8d38..4bae699caf 100644
--- a/sysdeps/i386/i686/multiarch/Makefile
+++ b/sysdeps/i386/i686/multiarch/Makefile
@@ -10,7 +10,9 @@ sysdep_routines += bzero-sse2 memset-sse2 memcpy-ssse3 mempcpy-ssse3 \
memset-sse2-rep bzero-sse2-rep strcmp-ssse3 \
strcmp-sse4 strncmp-c strncmp-ssse3 strncmp-sse4 \
memcmp-ssse3 memcmp-sse4 strcasestr-nonascii varshift \
- strlen-sse2 strlen-sse2-bsf
+ strlen-sse2 strlen-sse2-bsf strncpy-c strcpy-ssse3 \
+ strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 strcpy-sse2 \
+ strncpy-sse2 stpcpy-sse2 stpncpy-sse2
ifeq (yes,$(config-cflags-sse4))
sysdep_routines += strcspn-c strpbrk-c strspn-c strstr-c strcasestr-c
CFLAGS-varshift.c += -msse4
diff --git a/sysdeps/i386/i686/multiarch/stpcpy-sse2.S b/sysdeps/i386/i686/multiarch/stpcpy-sse2.S
new file mode 100644
index 0000000000..46ca1b3074
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/stpcpy-sse2.S
@@ -0,0 +1,3 @@
+#define USE_AS_STPCPY /* tested by strcpy-sse2.S to choose how the return value in eax is produced */
+#define STRCPY __stpcpy_sse2 /* name given to the entry point via ENTRY (STRCPY) in strcpy-sse2.S */
+#include "strcpy-sse2.S"
diff --git a/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S b/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S
new file mode 100644
index 0000000000..d971c2da38
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/stpcpy-ssse3.S
@@ -0,0 +1,3 @@
+#define USE_AS_STPCPY /* NOTE(review): presumably tested by strcpy-ssse3.S the same way strcpy-sse2.S tests it; that file is outside this chunk */
+#define STRCPY __stpcpy_ssse3 /* NOTE(review): presumably the entry-point name used by the included file; confirm in strcpy-ssse3.S */
+#include "strcpy-ssse3.S"
diff --git a/sysdeps/i386/i686/multiarch/stpcpy.S b/sysdeps/i386/i686/multiarch/stpcpy.S
new file mode 100644
index 0000000000..b63d308edc
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/stpcpy.S
@@ -0,0 +1,7 @@
+#define USE_AS_STPCPY
+#define STRCPY __stpcpy /* NOTE(review): presumably the symbol name strcpy.S defines; strcpy.S is added by this commit but is outside this chunk */
+#include "strcpy.S"
+
+weak_alias (__stpcpy, stpcpy) /* NOTE(review): presumably makes stpcpy a weak alias of __stpcpy; macro is defined elsewhere */
+libc_hidden_def (__stpcpy)
+libc_hidden_builtin_def (stpcpy)
diff --git a/sysdeps/i386/i686/multiarch/stpncpy-sse2.S b/sysdeps/i386/i686/multiarch/stpncpy-sse2.S
new file mode 100644
index 0000000000..37a703cb76
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/stpncpy-sse2.S
@@ -0,0 +1,4 @@
+#define USE_AS_STPCPY /* tested by strcpy-sse2.S to choose how the return value in eax is produced */
+#define USE_AS_STRNCPY /* selects the length-limited code in strcpy-sse2.S (three saved registers, PARMS 16, LEN argument) */
+#define STRCPY __stpncpy_sse2 /* name given to the entry point via ENTRY (STRCPY) in strcpy-sse2.S */
+#include "strcpy-sse2.S"
diff --git a/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S b/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S
new file mode 100644
index 0000000000..14ed16f6b5
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/stpncpy-ssse3.S
@@ -0,0 +1,4 @@
+#define USE_AS_STPCPY /* NOTE(review): presumably tested by strcpy-ssse3.S the same way strcpy-sse2.S tests it; that file is outside this chunk */
+#define USE_AS_STRNCPY /* NOTE(review): presumably selects the length-limited code as in strcpy-sse2.S; confirm in strcpy-ssse3.S */
+#define STRCPY __stpncpy_ssse3 /* NOTE(review): presumably the entry-point name used by the included file */
+#include "strcpy-ssse3.S"
diff --git a/sysdeps/i386/i686/multiarch/stpncpy.S b/sysdeps/i386/i686/multiarch/stpncpy.S
new file mode 100644
index 0000000000..ff89a89491
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/stpncpy.S
@@ -0,0 +1,6 @@
+#define STRCPY __stpncpy /* NOTE(review): presumably the symbol name strcpy.S defines; strcpy.S is added by this commit but is outside this chunk */
+#define USE_AS_STPCPY
+#define USE_AS_STRNCPY
+#include "strcpy.S"
+
+weak_alias (__stpncpy, stpncpy) /* NOTE(review): presumably makes stpncpy a weak alias of __stpncpy; macro is defined elsewhere */
diff --git a/sysdeps/i386/i686/multiarch/strcpy-sse2.S b/sysdeps/i386/i686/multiarch/strcpy-sse2.S
new file mode 100644
index 0000000000..fad1ae2b67
--- /dev/null
+++ b/sysdeps/i386/i686/multiarch/strcpy-sse2.S
@@ -0,0 +1,2251 @@
+/* strcpy with SSE2 and unaligned load
+ Copyright (C) 2011 Free Software Foundation, Inc.
+ Contributed by Intel Corporation.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
+ 02111-1307 USA. */
+
+
+#ifndef NOT_IN_libc
+
+# include <sysdep.h>
+
+
+# define CFI_PUSH(REG) \
+ cfi_adjust_cfa_offset (4); \
+ cfi_rel_offset (REG, 0) /* unwind annotation for one 4-byte push: CFA offset grows by 4 and REG is recorded at offset 0 */
+
+# define CFI_POP(REG) \
+ cfi_adjust_cfa_offset (-4); \
+ cfi_restore (REG) /* unwind annotation for one 4-byte pop: CFA offset shrinks by 4 and REG is marked restored */
+
+# define PUSH(REG) pushl REG; CFI_PUSH (REG) /* push REG and emit the matching unwind annotation */
+# define POP(REG) popl REG; CFI_POP (REG) /* pop REG and emit the matching unwind annotation */
+
+# ifndef STRCPY
+# define STRCPY __strcpy_sse2 /* default entry-point name when the including file did not define STRCPY */
+# endif
+
+# define STR1 PARMS /* esp-relative offset of the first argument; loaded into edi at entry */
+# define STR2 STR1+4 /* esp-relative offset of the second argument; loaded into esi at entry */
+# define LEN STR2+4 /* esp-relative offset of the third argument; loaded into ebx at entry of the USE_AS_STRNCPY code */
+
+# ifdef USE_AS_STRNCPY
+# define PARMS 16 /* first argument offset from esp after ENTRANCE: three 4-byte saves plus the return address */
+# define ENTRANCE PUSH(%ebx); PUSH(%esi); PUSH(%edi) /* save the three callee-saved registers this code uses */
+# define RETURN POP(%edi); POP(%esi); POP(%ebx); ret; \
+ CFI_PUSH(%ebx); CFI_PUSH(%esi); CFI_PUSH(%edi); /* code placed after ret still has all three saved: redeclare them in ENTRANCE order (was edi twice, esi missing) */
+
+# ifdef SHARED
+# define JMPTBL(I, B) I - B /* PIC build: a table entry is the difference I - B, which BRANCH_TO_JMPTBL_ENTRY adds to the table address */
+
+/* Load an entry in a jump table into ECX and branch to it. TABLE is a
+ jump table with relative offsets.
+ INDEX is a register that contains the index into the jump table.
+ SCALE is the scale of INDEX. ECX is clobbered. */
+
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ /* We first load PC into ECX. */ \
+ call __i686.get_pc_thunk.cx; \
+ /* Get the address of the jump table. */ \
+ addl $(TABLE - .), %ecx; \
+ /* Get the entry and convert the relative offset to the \
+ absolute address. */ \
+ addl (%ecx,INDEX,SCALE), %ecx; \
+ /* We loaded the jump table and adjusted ECX. Go. */ \
+ jmp *%ecx
+# else
+# define JMPTBL(I, B) I /* non-PIC build: a table entry is I itself and B is unused */
+
+/* Branch to an entry in a jump table. TABLE is a jump table with
+ absolute offsets. INDEX is a register that contains the index into the
+ jump table. SCALE is the scale of INDEX. */
+
+# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
+ jmp *TABLE(,INDEX,SCALE)
+# endif
+
+.text
+ENTRY (STRCPY)
+ ENTRANCE
+ mov STR1(%esp), %edi /* edi = first argument; every store below goes through edi */
+ mov STR2(%esp), %esi /* esi = second argument; every load below goes through esi */
+ movl LEN(%esp), %ebx /* ebx = third argument, the byte count */
+ test %ebx, %ebx
+ jz L(ExitZero) /* a count of zero is handled at ExitZero (outside this chunk) */
+
+ mov %esi, %ecx
+# ifndef USE_AS_STPCPY
+ mov %edi, %eax /* save result */
+# endif
+ and $15, %ecx /* ecx = offset of the source within its 16-byte block */
+ jz L(SourceStringAlignmentZero) /* source is already 16-byte aligned */
+
+ and $-16, %esi /* round the source pointer down to a 16-byte boundary */
+ pxor %xmm0, %xmm0 /* xmm0 = 0: the value compared against to find NUL bytes */
+ pxor %xmm1, %xmm1
+
+ pcmpeqb (%esi), %xmm1 /* flag the zero bytes of the aligned block that contains the string start */
+ add %ecx, %ebx /* count is now measured from the aligned base instead of the string start */
+ pmovmskb %xmm1, %edx /* edx = one bit per zero byte */
+ shr %cl, %edx /* discard the bits of bytes that lie before the string start */
+# ifdef USE_AS_STPCPY
+ cmp $16, %ebx
+ jbe L(CopyFrom1To16BytesTailCase2OrCase3) /* adjusted count is small enough to end here: count-limited handling */
+# else
+ cmp $17, %ebx
+ jbe L(CopyFrom1To16BytesTailCase2OrCase3) /* same check with a threshold of 17 for the non-STPCPY build */
+# endif
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesTail) /* a zero byte was found in the first block */
+
+ pcmpeqb 16(%esi), %xmm0 /* check the next aligned block; xmm0 stays all-zero if no zero byte is found */
+ pmovmskb %xmm0, %edx
+# ifdef USE_AS_STPCPY
+ cmp $32, %ebx
+ jbe L(CopyFrom1To32BytesCase2OrCase3)
+# else
+ cmp $33, %ebx
+ jbe L(CopyFrom1To32BytesCase2OrCase3)
+# endif
+ test %edx, %edx
+ jnz L(CopyFrom1To32Bytes) /* a zero byte was found in the second block */
+
+ movdqu (%esi, %ecx), %xmm1 /* copy 16 bytes */
+ movdqu %xmm1, (%edi)
+
+ sub %ecx, %edi /* bias edi like esi so that one offset addresses matching bytes in both */
+
+/* If source address alignment != destination address alignment */
+ .p2align 4
+L(Unalign16Both): /* six unrolled 16-byte steps: aligned loads from esi+ecx, unaligned stores to edi+ecx, then set up the 64-byte loop */
+ mov $16, %ecx /* ecx = running offset applied to both pointers */
+ movdqa (%esi, %ecx), %xmm1
+ movaps 16(%esi, %ecx), %xmm2 /* load one block ahead so it can be checked before it is stored */
+ movdqu %xmm1, (%edi, %ecx)
+ pcmpeqb %xmm2, %xmm0 /* xmm0 is all-zero here, so this flags the zero bytes of xmm2 */
+ pmovmskb %xmm0, %edx
+ add $16, %ecx
+ sub $48, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3) /* count reached zero or went below: count-limited handling */
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnalignedXmm2) /* zero byte in xmm2; the handler is outside this chunk */
+
+ movaps 16(%esi, %ecx), %xmm3
+ movdqu %xmm2, (%edi, %ecx) /* xmm2 held no zero byte: store it whole */
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %ecx
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnalignedXmm3)
+
+ movaps 16(%esi, %ecx), %xmm4
+ movdqu %xmm3, (%edi, %ecx)
+ pcmpeqb %xmm4, %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %ecx
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnalignedXmm4)
+
+ movaps 16(%esi, %ecx), %xmm1
+ movdqu %xmm4, (%edi, %ecx)
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %ecx
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnalignedXmm1)
+
+ movaps 16(%esi, %ecx), %xmm2
+ movdqu %xmm1, (%edi, %ecx)
+ pcmpeqb %xmm2, %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %ecx
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnalignedXmm2)
+
+ movaps 16(%esi, %ecx), %xmm3
+ movdqu %xmm2, (%edi, %ecx)
+ pcmpeqb %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ add $16, %ecx
+ sub $16, %ebx
+ jbe L(CopyFrom1To16BytesCase2OrCase3)
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnalignedXmm3)
+
+ movdqu %xmm3, (%edi, %ecx)
+ mov %esi, %edx /* remember the old source base */
+ lea 16(%esi, %ecx), %esi /* esi = address just past the last block stored */
+ and $-0x40, %esi /* round it down to a 64-byte boundary for the loop below */
+ sub %esi, %edx /* edx = old base minus new base */
+ sub %edx, %edi /* shift edi by the same distance esi moved */
+ lea 128(%ebx, %edx), %ebx /* rebase the remaining count to the new esi */
+
+L(Unaligned64Loop): /* first pass: load and check 64 source bytes (esi is 64-byte aligned) without storing yet */
+ movaps (%esi), %xmm2
+ movaps %xmm2, %xmm4 /* keep an intact copy; xmm2 is overwritten by pminub below */
+ movaps 16(%esi), %xmm5
+ movaps 32(%esi), %xmm3
+ movaps %xmm3, %xmm6 /* intact copy of the third block */
+ movaps 48(%esi), %xmm7
+ pminub %xmm5, %xmm2
+ pminub %xmm7, %xmm3
+ pminub %xmm2, %xmm3 /* xmm3 = bytewise unsigned minimum of all four blocks */
+ pcmpeqb %xmm0, %xmm3 /* a zero in the minimum means at least one block holds a zero byte */
+ pmovmskb %xmm3, %edx
+ sub $64, %ebx
+ jbe L(UnalignedLeaveCase2OrCase3) /* count reached zero or went below (handler outside this chunk) */
+ test %edx, %edx
+ jnz L(Unaligned64Leave)
+L(Unaligned64Loop_start): /* steady state: store the 64 bytes already checked while loading and checking the next 64 */
+ add $64, %edi
+ add $64, %esi
+ movdqu %xmm4, -64(%edi)
+ movaps (%esi), %xmm2
+ movdqa %xmm2, %xmm4
+ movdqu %xmm5, -48(%edi)
+ movaps 16(%esi), %xmm5
+ pminub %xmm5, %xmm2
+ movaps 32(%esi), %xmm3
+ movdqu %xmm6, -32(%edi)
+ movaps %xmm3, %xmm6
+ movdqu %xmm7, -16(%edi)
+ movaps 48(%esi), %xmm7
+ pminub %xmm7, %xmm3
+ pminub %xmm2, %xmm3
+ pcmpeqb %xmm0, %xmm3
+ pmovmskb %xmm3, %edx
+ sub $64, %ebx
+ jbe L(UnalignedLeaveCase2OrCase3)
+ test %edx, %edx
+ jz L(Unaligned64Loop_start) /* no zero byte in these 64 bytes: keep looping */
+L(Unaligned64Leave): /* a zero byte is in xmm4..xmm7 (not yet stored): find which block holds the first one */
+ pxor %xmm1, %xmm1
+
+ pcmpeqb %xmm4, %xmm0 /* xmm0 is still all-zero; the loop only used it as a source operand */
+ pcmpeqb %xmm5, %xmm1
+ pmovmskb %xmm0, %edx /* edx = zero-byte mask of block 0 */
+ pmovmskb %xmm1, %ecx /* ecx = zero-byte mask of block 1 */
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnaligned_0)
+ test %ecx, %ecx
+ jnz L(CopyFrom1To16BytesUnaligned_16)
+
+ pcmpeqb %xmm6, %xmm0 /* both masks were empty, so xmm0 and xmm1 are all-zero again */
+ pcmpeqb %xmm7, %xmm1
+ pmovmskb %xmm0, %edx /* edx = zero-byte mask of block 2 */
+ pmovmskb %xmm1, %ecx /* ecx = zero-byte mask of block 3 */
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesUnaligned_32)
+
+ bsf %ecx, %edx /* edx = index of the first zero byte inside block 3 */
+ movdqu %xmm4, (%edi)
+ movdqu %xmm5, 16(%edi)
+ movdqu %xmm6, 32(%edi)
+# ifdef USE_AS_STPCPY
+ lea 48(%edi, %edx), %eax /* eax = destination address of the zero byte */
+# endif
+ movdqu %xmm7, 48(%edi)
+ add $15, %ebx
+ sub %edx, %ebx /* ebx = ebx + 15 - edx, the count handed to the fill code */
+ lea 49(%edi, %edx), %edi /* edi = first destination byte after the zero byte */
+ jmp L(StrncpyFillTailWithZero) /* defined outside this chunk */
+
+/* If source address alignment == destination address alignment */
+
+L(SourceStringAlignmentZero): /* source is 16-byte aligned (ecx == 0): check the first two blocks, then join the common code */
+ pxor %xmm0, %xmm0 /* xmm0 = 0: the value compared against to find NUL bytes */
+ movdqa (%esi), %xmm1 /* aligned load of the first 16 source bytes */
+ pcmpeqb %xmm1, %xmm0
+ pmovmskb %xmm0, %edx /* edx = one bit per zero byte in the first block */
+# ifdef USE_AS_STPCPY
+ cmp $16, %ebx
+ jbe L(CopyFrom1To16BytesTail1Case2OrCase3) /* count is at most 16: count-limited handling */
+# else
+ cmp $17, %ebx
+ jbe L(CopyFrom1To16BytesTail1Case2OrCase3) /* same check with a threshold of 17 for the non-STPCPY build */
+# endif
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesTail1) /* zero byte in the first block */
+
+ pcmpeqb 16(%esi), %xmm0 /* xmm0 is all-zero again because the first mask was empty */
+ movdqu %xmm1, (%edi) /* first block held no zero byte: store it whole */
+ pmovmskb %xmm0, %edx
+# ifdef USE_AS_STPCPY
+ cmp $32, %ebx
+ jbe L(CopyFrom1To32Bytes1Case2OrCase3)
+# else
+ cmp $33, %ebx
+ jbe L(CopyFrom1To32Bytes1Case2OrCase3)
+# endif
+ test %edx, %edx
+ jnz L(CopyFrom1To32Bytes1) /* zero byte in the second block */
+
+ jmp L(Unalign16Both) /* continue with the block-by-block copy */
+
+/*-----------------End of main part---------------------------*/
+
+/* Case1 */
+ .p2align 4
+L(CopyFrom1To16BytesTail): /* zero byte found in the first block of an unaligned source */
+ sub %ecx, %ebx /* undo the alignment bias added to the count at entry */
+ add %ecx, %esi /* esi = original source pointer again */
+ bsf %edx, %edx /* edx = index of the first zero byte */
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4) /* dispatch on that index; ExitTable is outside this chunk */
+
+ .p2align 4
+L(CopyFrom1To32Bytes1): /* aligned source, zero byte in the second block; the first 16 bytes are already stored */
+ add $16, %esi
+ add $16, %edi
+ sub $16, %ebx /* step pointers and count past the stored block */
+L(CopyFrom1To16BytesTail1): /* aligned source, zero byte in the block at esi */
+ bsf %edx, %edx /* edx = index of the first zero byte */
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+ .p2align 4
+L(CopyFrom1To32Bytes): /* unaligned source, zero byte in the second aligned block; nothing stored yet */
+ sub %ecx, %ebx /* undo the alignment bias added to the count at entry */
+ bsf %edx, %edx /* index of the zero byte within the second aligned block */
+ add %ecx, %esi /* esi = original source pointer again */
+ add $16, %edx
+ sub %ecx, %edx /* edx = index of the zero byte measured from the string start */
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnaligned_0): /* leaving the 64-byte loop: first zero byte is in block 0 (xmm4) */
+ bsf %edx, %edx /* edx = index of the first zero byte within the block */
+# ifdef USE_AS_STPCPY
+ lea (%edi, %edx), %eax /* eax = destination address of the zero byte */
+# endif
+ movdqu %xmm4, (%edi) /* store the whole block, bytes after the zero byte included */
+ add $63, %ebx
+ sub %edx, %ebx /* ebx = ebx + 63 - edx, the count handed to the fill code */
+ lea 1(%edi, %edx), %edi /* edi = first destination byte after the zero byte */
+ jmp L(StrncpyFillTailWithZero) /* defined outside this chunk */
+
+ .p2align 4
+L(CopyFrom1To16BytesUnaligned_16): /* first zero byte is in block 1 (xmm5); its mask is in ecx */
+ bsf %ecx, %edx /* edx = index of the first zero byte within block 1 */
+ movdqu %xmm4, (%edi)
+# ifdef USE_AS_STPCPY
+ lea 16(%edi, %edx), %eax /* eax = destination address of the zero byte */
+# endif
+ movdqu %xmm5, 16(%edi)
+ add $47, %ebx
+ sub %edx, %ebx /* ebx = ebx + 47 - edx */
+ lea 17(%edi, %edx), %edi /* edi = first destination byte after the zero byte */
+ jmp L(StrncpyFillTailWithZero)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnaligned_32): /* first zero byte is in block 2 (xmm6); its mask is in edx */
+ bsf %edx, %edx /* edx = index of the first zero byte within block 2 */
+ movdqu %xmm4, (%edi)
+ movdqu %xmm5, 16(%edi)
+# ifdef USE_AS_STPCPY
+ lea 32(%edi, %edx), %eax /* eax = destination address of the zero byte */
+# endif
+ movdqu %xmm6, 32(%edi)
+ add $31, %ebx
+ sub %edx, %ebx /* ebx = ebx + 31 - edx */
+ lea 33(%edi, %edx), %edi /* edi = first destination byte after the zero byte */
+ jmp L(StrncpyFillTailWithZero)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm6): /* each XmmN stub stores its register at edi+ecx and joins the shared exit (outside this chunk) */
+ movdqu %xmm6, (%edi, %ecx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm5):
+ movdqu %xmm5, (%edi, %ecx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm4): /* reached from Unalign16Both when the zero byte is in xmm4 */
+ movdqu %xmm4, (%edi, %ecx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm3): /* reached from Unalign16Both when the zero byte is in xmm3 */
+ movdqu %xmm3, (%edi, %ecx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesUnalignedXmm1): /* reached from Unalign16Both when the zero byte is in xmm1 */
+ movdqu %xmm1, (%edi, %ecx)
+ jmp L(CopyFrom1To16BytesXmmExit)
+
+ .p2align 4
+L(CopyFrom1To16BytesExit): /* shared dispatch: edx already holds the index of the zero byte */
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %edx, 4)
+
+/* Case2 */
+
+ .p2align 4
+L(CopyFrom1To16BytesCase2): /* count ran out in a block that also holds a zero byte (from Unalign16Both) */
+ add $16, %ebx /* add 16 back so ebx can be compared with a byte index */
+ add %ecx, %edi
+ add %ecx, %esi /* fold the running offset into both pointers */
+ bsf %edx, %edx /* edx = index of the first zero byte */
+ cmp %ebx, %edx
+ jb L(CopyFrom1To16BytesExit) /* zero byte index is below ebx: dispatch on the index */
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) /* otherwise dispatch on ebx; table is outside this chunk */
+
+ .p2align 4
+L(CopyFrom1To32BytesCase2): /* unaligned source, zero byte in the second aligned block, small count */
+ sub %ecx, %ebx /* undo the alignment bias added to the count at entry */
+ add %ecx, %esi /* esi = original source pointer again */
+ bsf %edx, %edx
+ add $16, %edx
+ sub %ecx, %edx /* edx = index of the zero byte measured from the string start */
+ cmp %ebx, %edx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+L(CopyFrom1To16BytesTailCase2): /* unaligned source, zero byte in the first aligned block, small count */
+ sub %ecx, %ebx /* undo the alignment bias added to the count at entry */
+ add %ecx, %esi /* esi = original source pointer again */
+ bsf %edx, %edx /* edx = index of the first zero byte; the mask was already shifted at entry */
+ cmp %ebx, %edx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+L(CopyFrom1To16BytesTail1Case2): /* aligned source: no pointer or count adjustment needed */
+ bsf %edx, %edx
+ cmp %ebx, %edx
+ jb L(CopyFrom1To16BytesExit)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+/* Case2 or Case3, Case3 */
+
+ .p2align 4
+L(CopyFrom1To16BytesCase2OrCase3): /* count ran out in Unalign16Both: Case2 if the block holds a zero byte, else Case3 */
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesCase2)
+L(CopyFrom1To16BytesCase3): /* no zero byte seen: dispatch on the remaining count alone */
+ add $16, %ebx /* add 16 back so ebx can index the table */
+ add %ecx, %edi
+ add %ecx, %esi /* fold the running offset into both pointers */
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4) /* table is outside this chunk */
+
+ .p2align 4
+L(CopyFrom1To32BytesCase2OrCase3): /* unaligned source, small count, second aligned block checked */
+ test %edx, %edx
+ jnz L(CopyFrom1To32BytesCase2)
+ sub %ecx, %ebx /* undo the alignment bias added to the count at entry */
+ add %ecx, %esi /* esi = original source pointer again */
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+ .p2align 4
+L(CopyFrom1To16BytesTailCase2OrCase3): /* unaligned source, small count, first aligned block checked */
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesTailCase2)
+ sub %ecx, %ebx /* undo the alignment bias added to the count at entry */
+ add %ecx, %esi /* esi = original source pointer again */
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+ .p2align 4
+L(CopyFrom1To32Bytes1Case2OrCase3): /* aligned source, first 16 bytes already stored: step past them */
+ add $16, %edi
+ add $16, %esi
+ sub $16, %ebx
+L(CopyFrom1To16BytesTail1Case2OrCase3): /* aligned source, small count */
+ test %edx, %edx
+ jnz L(CopyFrom1To16BytesTail1Case2)
+ BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %ebx, 4)
+
+ .p2align 4
+L(Exit0): /* return without storing anything */
+# ifdef USE_AS_STPCPY
+ mov %edi, %eax /* STPCPY build returns the current destination pointer */
+# endif
+ RETURN
+
+ .p2align 4
+L(Exit1): /* store one byte, then zero-fill if count remains */
+ movb %dh, (%edi) /* NOTE(review): stores dh, so this relies on dh == 0 here, which holds when edx is a small index; confirm for every path into Exit1 */
+# ifdef USE_AS_STPCPY
+ lea (%edi), %eax /* eax = address of the byte just stored */
+# endif
+ sub $1, %ebx /* sets ZF for the jnz below */
+ lea 1(%edi), %edi /* lea leaves the flags from sub untouched */
+ jnz L(StrncpyFillTailWithZero) /* count not yet exhausted; handler is outside this chunk */
+ RETURN
+
+ .p2align 4
+L(Exit2): /* copy two bytes from the source, then zero-fill if count remains */
+ movw (%esi), %dx
+ movw %dx, (%edi)
+# ifdef USE_AS_STPCPY
+ lea 1(%edi), %eax /* eax = address of the second byte stored */
+# endif
+ sub $2, %ebx /* sets ZF for the jnz below */
+ lea 2(%edi), %edi /* lea leaves the flags from sub untouched */
+ jnz L(StrncpyFillTailWithZero)
+ RETURN
+
+ .p2align 4
+L(Exit3):
+ movw (%esi), %cx
+ movw %cx, (%edi)
+ movb %dh, 2(%edi)