about summary refs log tree commit diff
diff options
context:
space:
mode:
author Liubov Dmitrieva <liubov.dmitrieva@gmail.com> 2011-12-22 14:22:00 -0500
committer Ulrich Drepper <drepper@gmail.com> 2011-12-22 14:22:00 -0500
commit 2bd779ae3f3a86bce22fcb7665d740b14ac677ca (patch)
tree b6874177395668dca502b398d0e547c8c64902cc
parent 16c6f99208229d7222fd26499749e56137322a3c (diff)
download glibc-2bd779ae3f3a86bce22fcb7665d740b14ac677ca.tar.xz
glibc-2bd779ae3f3a86bce22fcb7665d740b14ac677ca.zip
Fix overrun in strcpy destination buffer in x86-32/SSSE3 version
-rw-r--r-- ChangeLog 5
-rw-r--r-- sysdeps/i386/i686/multiarch/strcpy-ssse3.S 1261
2 files changed, 521 insertions, 745 deletions
diff --git a/ChangeLog b/ChangeLog
index a9cdf76f56..8595c0396d 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2011-12-22 Liubov Dmitrieva <liubov.dmitrieva@gmail.com>
+
+ * sysdeps/i386/i686/multiarch/strcpy-ssse3.S: Fix wrong copying
+ processing for last bytes.
+
2011-12-22 Ulrich Drepper <drepper@gmail.com>
* locale/iso-639.def: Add brx entry.
diff --git a/sysdeps/i386/i686/multiarch/strcpy-ssse3.S b/sysdeps/i386/i686/multiarch/strcpy-ssse3.S
index 073856ff84..470ddbe279 100644
--- a/sysdeps/i386/i686/multiarch/strcpy-ssse3.S
+++ b/sysdeps/i386/i686/multiarch/strcpy-ssse3.S
@@ -20,6 +20,7 @@
#ifndef NOT_IN_libc
+
# ifndef USE_AS_STRCAT
# include <sysdep.h>
@@ -31,8 +32,8 @@
cfi_adjust_cfa_offset (-4); \
cfi_restore (REG)
-# define PUSH(REG) pushl REG; CFI_PUSH (REG)
-# define POP(REG) popl REG; CFI_POP (REG)
+# define PUSH(REG) pushl REG; CFI_PUSH (REG)
+# define POP(REG) popl REG; CFI_POP (REG)
# ifndef STRCPY
# define STRCPY __strcpy_ssse3
@@ -40,14 +41,22 @@
# ifdef USE_AS_STRNCPY
# define PARMS 8
-# define ENTRANCE PUSH(%ebx)
-# define RETURN POP(%ebx); ret; CFI_PUSH(%ebx);
-# define RETURN1 POP(%edi); POP(%ebx); ret; CFI_PUSH(%ebx); CFI_PUSH(%edi)
+# define ENTRANCE PUSH (%ebx)
+# define RETURN POP (%ebx); ret; CFI_PUSH (%ebx);
+# define RETURN1 POP (%edi); POP (%ebx); ret; CFI_PUSH (%ebx); CFI_PUSH (%edi)
# else
# define PARMS 4
# define ENTRANCE
# define RETURN ret
-# define RETURN1 POP(%edi); ret; CFI_PUSH(%edi)
+# define RETURN1 POP (%edi); ret; CFI_PUSH (%edi)
+# endif
+
+# ifdef USE_AS_STPCPY
+# define SAVE_RESULT(n) lea n(%edx), %eax
+# define SAVE_RESULT_TAIL(n) lea n(%edx), %eax
+# else
+# define SAVE_RESULT(n) movl %edi, %eax
+# define SAVE_RESULT_TAIL(n) movl %edx, %eax
# endif
# define STR1 PARMS
@@ -60,9 +69,7 @@
movl - 4 byte
movlpd - 8 byte
movaps - 16 byte - requires 16 byte alignment
- of sourse and destination adresses.
- 16 byte alignment: adress is 32bit value,
- right four bit of adress shall be 0.
+ of source and destination addresses.
*/
.text
@@ -72,8 +79,6 @@ ENTRY (STRCPY)
mov STR2(%esp), %ecx
# ifdef USE_AS_STRNCPY
movl LEN(%esp), %ebx
- test %ebx, %ebx
- jz L(ExitTail0)
cmp $8, %ebx
jbe L(StrncpyExit8Bytes)
# endif
@@ -127,39 +132,23 @@ ENTRY (STRCPY)
sub $16, %ebx
and $0xf, %esi
-/* add 16 bytes ecx_shift to ebx */
+/* add 16 bytes ecx_offset to ebx */
add %esi, %ebx
# endif
lea 16(%ecx), %esi
-/* Now:
- esi = alignment_16(ecx) + ecx_shift + 16;
- ecx_shift = ecx - alignment_16(ecx)
-*/
and $-16, %esi
-/* Now:
- esi = alignment_16(ecx) + 16
-*/
pxor %xmm0, %xmm0
movlpd (%ecx), %xmm1
movlpd %xmm1, (%edx)
-/*
- look if there is zero symbol in next 16 bytes of string
- from esi to esi + 15 and form mask in xmm0
-*/
+
pcmpeqb (%esi), %xmm0
movlpd 8(%ecx), %xmm1
movlpd %xmm1, 8(%edx)
-/* convert byte mask in xmm0 to bit mask */
-
pmovmskb %xmm0, %eax
sub %ecx, %esi
-/* esi = 16 - ecx_shift */
-
-/* eax = 0: there isn't end of string from position esi to esi+15 */
-
# ifdef USE_AS_STRNCPY
sub $16, %ebx
jbe L(CopyFrom1To16BytesCase2OrCase3)
@@ -169,17 +158,9 @@ ENTRY (STRCPY)
mov %edx, %eax
lea 16(%edx), %edx
-/* Now:
- edx = edx + 16 = alignment_16(edx) + edx_shift + 16
-*/
and $-16, %edx
-
-/* Now: edx = alignment_16(edx) + 16 */
-
sub %edx, %eax
-/* Now: eax = edx_shift - 16 */
-
# ifdef USE_AS_STRNCPY
add %eax, %esi
lea -1(%esi), %esi
@@ -191,22 +172,11 @@ ENTRY (STRCPY)
L(ContinueCopy):
# endif
sub %eax, %ecx
-/* Now:
- case ecx_shift >= edx_shift:
- ecx = alignment_16(ecx) + (ecx_shift - edx_shift) + 16
- case ecx_shift < edx_shift:
- ecx = alignment_16(ecx) + (16 + ecx_shift - edx_shift)
-*/
mov %ecx, %eax
and $0xf, %eax
-/* Now:
- case ecx_shift >= edx_shift: eax = ecx_shift - edx_shift
- case ecx_shift < edx_shift: eax = (16 + ecx_shift - edx_shift)
- eax can be 0, 1, ..., 15
-*/
mov $0, %esi
-/* case: ecx_shift == edx_shift */
+/* case: ecx_offset == edx_offset */
jz L(Align16Both)
@@ -323,7 +293,7 @@ L(Align16Both):
sub %ecx, %eax
sub %eax, %edx
# ifdef USE_AS_STRNCPY
- lea 48+64(%ebx, %eax), %ebx
+ lea 112(%ebx, %eax), %ebx
# endif
mov $-0x40, %esi
@@ -441,7 +411,6 @@ L(Shl1Start):
jnz L(Shl1LoopExit)
palignr $1, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%edx)
movaps 31(%ecx), %xmm2
@@ -449,7 +418,6 @@ L(Shl1Start):
lea 16(%edx), %edx
pmovmskb %xmm0, %eax
lea 16(%ecx), %ecx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %ebx
jbe L(StrncpyExit1Case2OrCase3)
@@ -457,8 +425,7 @@ L(Shl1Start):
test %eax, %eax
jnz L(Shl1LoopExit)
- palignr $1, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $1, %xmm3, %xmm2
movaps %xmm2, (%edx)
lea 31(%ecx), %ecx
lea 16(%edx), %edx
@@ -506,11 +473,11 @@ L(Shl1LoopStart):
jmp L(Shl1LoopStart)
L(Shl1LoopExit):
- movaps (%edx), %xmm6
- psrldq $15, %xmm6
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
+ movlpd 7(%ecx), %xmm0
+ movlpd %xmm0, 7(%edx)
mov $15, %esi
- palignr $1, %xmm1, %xmm6
- movaps %xmm6, (%edx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
@@ -563,7 +530,6 @@ L(Shl2Start):
jnz L(Shl2LoopExit)
palignr $2, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%edx)
movaps 30(%ecx), %xmm2
@@ -571,7 +537,6 @@ L(Shl2Start):
lea 16(%edx), %edx
pmovmskb %xmm0, %eax
lea 16(%ecx), %ecx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %ebx
jbe L(StrncpyExit2Case2OrCase3)
@@ -579,8 +544,7 @@ L(Shl2Start):
test %eax, %eax
jnz L(Shl2LoopExit)
- palignr $2, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $2, %xmm3, %xmm2
movaps %xmm2, (%edx)
lea 30(%ecx), %ecx
lea 16(%edx), %edx
@@ -628,11 +592,11 @@ L(Shl2LoopStart):
jmp L(Shl2LoopStart)
L(Shl2LoopExit):
- movaps (%edx), %xmm6
- psrldq $14, %xmm6
+ movlpd (%ecx), %xmm0
+ movlpd 6(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 6(%edx)
mov $14, %esi
- palignr $2, %xmm1, %xmm6
- movaps %xmm6, (%edx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
@@ -685,7 +649,6 @@ L(Shl3Start):
jnz L(Shl3LoopExit)
palignr $3, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%edx)
movaps 29(%ecx), %xmm2
@@ -693,7 +656,6 @@ L(Shl3Start):
lea 16(%edx), %edx
pmovmskb %xmm0, %eax
lea 16(%ecx), %ecx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %ebx
jbe L(StrncpyExit3Case2OrCase3)
@@ -701,8 +663,7 @@ L(Shl3Start):
test %eax, %eax
jnz L(Shl3LoopExit)
- palignr $3, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $3, %xmm3, %xmm2
movaps %xmm2, (%edx)
lea 29(%ecx), %ecx
lea 16(%edx), %edx
@@ -750,11 +711,11 @@ L(Shl3LoopStart):
jmp L(Shl3LoopStart)
L(Shl3LoopExit):
- movaps (%edx), %xmm6
- psrldq $13, %xmm6
+ movlpd (%ecx), %xmm0
+ movlpd 5(%ecx), %xmm1
+ movlpd %xmm0, (%edx)
+ movlpd %xmm1, 5(%edx)
mov $13, %esi
- palignr $3, %xmm1, %xmm6
- movaps %xmm6, (%edx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
@@ -807,7 +768,6 @@ L(Shl4Start):
jnz L(Shl4LoopExit)
palignr $4, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%edx)
movaps 28(%ecx), %xmm2
@@ -815,7 +775,6 @@ L(Shl4Start):
lea 16(%edx), %edx
pmovmskb %xmm0, %eax
lea 16(%ecx), %ecx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %ebx
jbe L(StrncpyExit4Case2OrCase3)
@@ -823,8 +782,7 @@ L(Shl4Start):
test %eax, %eax
jnz L(Shl4LoopExit)
- palignr $4, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $4, %xmm3, %xmm2
movaps %xmm2, (%edx)
lea 28(%ecx), %ecx
lea 16(%edx), %edx
@@ -872,11 +830,11 @@ L(Shl4LoopStart):
jmp L(Shl4LoopStart)
L(Shl4LoopExit):
- movaps (%edx), %xmm6
- psrldq $12, %xmm6
+ movlpd (%ecx), %xmm0
+ movl 8(%ecx), %esi
+ movlpd %xmm0, (%edx)
+ movl %esi, 8(%edx)
mov $12, %esi
- palignr $4, %xmm1, %xmm6
- movaps %xmm6, (%edx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
@@ -929,7 +887,6 @@ L(Shl5Start):
jnz L(Shl5LoopExit)
palignr $5, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%edx)
movaps 27(%ecx), %xmm2
@@ -937,7 +894,6 @@ L(Shl5Start):
lea 16(%edx), %edx
pmovmskb %xmm0, %eax
lea 16(%ecx), %ecx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %ebx
jbe L(StrncpyExit5Case2OrCase3)
@@ -945,8 +901,7 @@ L(Shl5Start):
test %eax, %eax
jnz L(Shl5LoopExit)
- palignr $5, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $5, %xmm3, %xmm2
movaps %xmm2, (%edx)
lea 27(%ecx), %ecx
lea 16(%edx), %edx
@@ -994,11 +949,11 @@ L(Shl5LoopStart):
jmp L(Shl5LoopStart)
L(Shl5LoopExit):
- movaps (%edx), %xmm6
- psrldq $11, %xmm6
+ movlpd (%ecx), %xmm0
+ movl 7(%ecx), %esi
+ movlpd %xmm0, (%edx)
+ movl %esi, 7(%edx)
mov $11, %esi
- palignr $5, %xmm1, %xmm6
- movaps %xmm6, (%edx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
@@ -1051,7 +1006,6 @@ L(Shl6Start):
jnz L(Shl6LoopExit)
palignr $6, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%edx)
movaps 26(%ecx), %xmm2
@@ -1059,7 +1013,6 @@ L(Shl6Start):
lea 16(%edx), %edx
pmovmskb %xmm0, %eax
lea 16(%ecx), %ecx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %ebx
jbe L(StrncpyExit6Case2OrCase3)
@@ -1067,8 +1020,7 @@ L(Shl6Start):
test %eax, %eax
jnz L(Shl6LoopExit)
- palignr $6, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $6, %xmm3, %xmm2
movaps %xmm2, (%edx)
lea 26(%ecx), %ecx
lea 16(%edx), %edx
@@ -1116,11 +1068,11 @@ L(Shl6LoopStart):
jmp L(Shl6LoopStart)
L(Shl6LoopExit):
- movaps (%edx), %xmm6
- psrldq $10, %xmm6
+ movlpd (%ecx), %xmm0
+ movl 6(%ecx), %esi
+ movlpd %xmm0, (%edx)
+ movl %esi, 6(%edx)
mov $10, %esi
- palignr $6, %xmm1, %xmm6
- movaps %xmm6, (%edx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
@@ -1173,7 +1125,6 @@ L(Shl7Start):
jnz L(Shl7LoopExit)
palignr $7, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%edx)
movaps 25(%ecx), %xmm2
@@ -1181,7 +1132,6 @@ L(Shl7Start):
lea 16(%edx), %edx
pmovmskb %xmm0, %eax
lea 16(%ecx), %ecx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %ebx
jbe L(StrncpyExit7Case2OrCase3)
@@ -1189,8 +1139,7 @@ L(Shl7Start):
test %eax, %eax
jnz L(Shl7LoopExit)
- palignr $7, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $7, %xmm3, %xmm2
movaps %xmm2, (%edx)
lea 25(%ecx), %ecx
lea 16(%edx), %edx
@@ -1238,11 +1187,11 @@ L(Shl7LoopStart):
jmp L(Shl7LoopStart)
L(Shl7LoopExit):
- movaps (%edx), %xmm6
- psrldq $9, %xmm6
+ movlpd (%ecx), %xmm0
+ movl 5(%ecx), %esi
+ movlpd %xmm0, (%edx)
+ movl %esi, 5(%edx)
mov $9, %esi
- palignr $7, %xmm1, %xmm6
- movaps %xmm6, (%edx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
@@ -1295,7 +1244,6 @@ L(Shl8Start):
jnz L(Shl8LoopExit)
palignr $8, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%edx)
movaps 24(%ecx), %xmm2
@@ -1303,7 +1251,6 @@ L(Shl8Start):
lea 16(%edx), %edx
pmovmskb %xmm0, %eax
lea 16(%ecx), %ecx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %ebx
jbe L(StrncpyExit8Case2OrCase3)
@@ -1311,8 +1258,7 @@ L(Shl8Start):
test %eax, %eax
jnz L(Shl8LoopExit)
- palignr $8, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $8, %xmm3, %xmm2
movaps %xmm2, (%edx)
lea 24(%ecx), %ecx
lea 16(%edx), %edx
@@ -1360,11 +1306,9 @@ L(Shl8LoopStart):
jmp L(Shl8LoopStart)
L(Shl8LoopExit):
- movaps (%edx), %xmm6
- psrldq $8, %xmm6
+ movlpd (%ecx), %xmm0
+ movlpd %xmm0, (%edx)
mov $8, %esi
- palignr $8, %xmm1, %xmm6
- movaps %xmm6, (%edx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
@@ -1417,7 +1361,6 @@ L(Shl9Start):
jnz L(Shl9LoopExit)
palignr $9, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%edx)
movaps 23(%ecx), %xmm2
@@ -1425,7 +1368,6 @@ L(Shl9Start):
lea 16(%edx), %edx
pmovmskb %xmm0, %eax
lea 16(%ecx), %ecx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %ebx
jbe L(StrncpyExit9Case2OrCase3)
@@ -1433,8 +1375,7 @@ L(Shl9Start):
test %eax, %eax
jnz L(Shl9LoopExit)
- palignr $9, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $9, %xmm3, %xmm2
movaps %xmm2, (%edx)
lea 23(%ecx), %ecx
lea 16(%edx), %edx
@@ -1482,11 +1423,9 @@ L(Shl9LoopStart):
jmp L(Shl9LoopStart)
L(Shl9LoopExit):
- movaps (%edx), %xmm6
- psrldq $7, %xmm6
+ movlpd -1(%ecx), %xmm0
+ movlpd %xmm0, -1(%edx)
mov $7, %esi
- palignr $9, %xmm1, %xmm6
- movaps %xmm6, (%edx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
@@ -1539,7 +1478,6 @@ L(Shl10Start):
jnz L(Shl10LoopExit)
palignr $10, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%edx)
movaps 22(%ecx), %xmm2
@@ -1547,7 +1485,6 @@ L(Shl10Start):
lea 16(%edx), %edx
pmovmskb %xmm0, %eax
lea 16(%ecx), %ecx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %ebx
jbe L(StrncpyExit10Case2OrCase3)
@@ -1555,8 +1492,7 @@ L(Shl10Start):
test %eax, %eax
jnz L(Shl10LoopExit)
- palignr $10, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $10, %xmm3, %xmm2
movaps %xmm2, (%edx)
lea 22(%ecx), %ecx
lea 16(%edx), %edx
@@ -1604,11 +1540,9 @@ L(Shl10LoopStart):
jmp L(Shl10LoopStart)
L(Shl10LoopExit):
- movaps (%edx), %xmm6
- psrldq $6, %xmm6
+ movlpd -2(%ecx), %xmm0
+ movlpd %xmm0, -2(%edx)
mov $6, %esi
- palignr $10, %xmm1, %xmm6
- movaps %xmm6, (%edx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
@@ -1661,7 +1595,6 @@ L(Shl11Start):
jnz L(Shl11LoopExit)
palignr $11, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%edx)
movaps 21(%ecx), %xmm2
@@ -1669,7 +1602,6 @@ L(Shl11Start):
lea 16(%edx), %edx
pmovmskb %xmm0, %eax
lea 16(%ecx), %ecx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %ebx
jbe L(StrncpyExit11Case2OrCase3)
@@ -1677,8 +1609,7 @@ L(Shl11Start):
test %eax, %eax
jnz L(Shl11LoopExit)
- palignr $11, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $11, %xmm3, %xmm2
movaps %xmm2, (%edx)
lea 21(%ecx), %ecx
lea 16(%edx), %edx
@@ -1726,11 +1657,9 @@ L(Shl11LoopStart):
jmp L(Shl11LoopStart)
L(Shl11LoopExit):
- movaps (%edx), %xmm6
- psrldq $5, %xmm6
+ movlpd -3(%ecx), %xmm0
+ movlpd %xmm0, -3(%edx)
mov $5, %esi
- palignr $11, %xmm1, %xmm6
- movaps %xmm6, (%edx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
@@ -1783,7 +1712,6 @@ L(Shl12Start):
jnz L(Shl12LoopExit)
palignr $12, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%edx)
movaps 20(%ecx), %xmm2
@@ -1791,7 +1719,6 @@ L(Shl12Start):
lea 16(%edx), %edx
pmovmskb %xmm0, %eax
lea 16(%ecx), %ecx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %ebx
jbe L(StrncpyExit12Case2OrCase3)
@@ -1799,8 +1726,7 @@ L(Shl12Start):
test %eax, %eax
jnz L(Shl12LoopExit)
- palignr $12, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $12, %xmm3, %xmm2
movaps %xmm2, (%edx)
lea 20(%ecx), %ecx
lea 16(%edx), %edx
@@ -1848,11 +1774,9 @@ L(Shl12LoopStart):
jmp L(Shl12LoopStart)
L(Shl12LoopExit):
- movaps (%edx), %xmm6
- psrldq $4, %xmm6
+ movl (%ecx), %esi
+ movl %esi, (%edx)
mov $4, %esi
- palignr $12, %xmm1, %xmm6
- movaps %xmm6, (%edx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
@@ -1905,7 +1829,6 @@ L(Shl13Start):
jnz L(Shl13LoopExit)
palignr $13, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%edx)
movaps 19(%ecx), %xmm2
@@ -1913,7 +1836,6 @@ L(Shl13Start):
lea 16(%edx), %edx
pmovmskb %xmm0, %eax
lea 16(%ecx), %ecx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %ebx
jbe L(StrncpyExit13Case2OrCase3)
@@ -1921,8 +1843,7 @@ L(Shl13Start):
test %eax, %eax
jnz L(Shl13LoopExit)
- palignr $13, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $13, %xmm3, %xmm2
movaps %xmm2, (%edx)
lea 19(%ecx), %ecx
lea 16(%edx), %edx
@@ -1970,11 +1891,9 @@ L(Shl13LoopStart):
jmp L(Shl13LoopStart)
L(Shl13LoopExit):
- movaps (%edx), %xmm6
- psrldq $3, %xmm6
+ movl -1(%ecx), %esi
+ movl %esi, -1(%edx)
mov $3, %esi
- palignr $13, %xmm1, %xmm6
- movaps %xmm6, (%edx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
@@ -2027,7 +1946,6 @@ L(Shl14Start):
jnz L(Shl14LoopExit)
palignr $14, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%edx)
movaps 18(%ecx), %xmm2
@@ -2035,7 +1953,6 @@ L(Shl14Start):
lea 16(%edx), %edx
pmovmskb %xmm0, %eax
lea 16(%ecx), %ecx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %ebx
jbe L(StrncpyExit14Case2OrCase3)
@@ -2043,8 +1960,7 @@ L(Shl14Start):
test %eax, %eax
jnz L(Shl14LoopExit)
- palignr $14, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $14, %xmm3, %xmm2
movaps %xmm2, (%edx)
lea 18(%ecx), %ecx
lea 16(%edx), %edx
@@ -2092,11 +2008,9 @@ L(Shl14LoopStart):
jmp L(Shl14LoopStart)
L(Shl14LoopExit):
- movaps (%edx), %xmm6
- psrldq $2, %xmm6
+ movl -2(%ecx), %esi
+ movl %esi, -2(%edx)
mov $2, %esi
- palignr $14, %xmm1, %xmm6
- movaps %xmm6, (%edx)
jmp L(CopyFrom1To16Bytes)
.p2align 4
@@ -2149,7 +2063,6 @@ L(Shl15Start):
jnz L(Shl15LoopExit)
palignr $15, %xmm1, %xmm2
- movaps %xmm3, %xmm1
movaps %xmm2, (%edx)
movaps 17(%ecx), %xmm2
@@ -2157,7 +2070,6 @@ L(Shl15Start):
lea 16(%edx), %edx
pmovmskb %xmm0, %eax
lea 16(%ecx), %ecx
- movaps %xmm2, %xmm3
# ifdef USE_AS_STRNCPY
sub $16, %ebx
jbe L(StrncpyExit15Case2OrCase3)
@@ -2165,8 +2077,7 @@ L(Shl15Start):
test %eax, %eax
jnz L(Shl15LoopExit)
- palignr $15, %xmm1, %xmm2
- movaps %xmm3, %xmm1
+ palignr $15, %xmm3, %xmm2
movaps %xmm2, (%edx)
lea 17(%ecx), %ecx
lea 16(%edx), %edx
@@ -2214,15 +2125,14 @@ L(Shl15LoopStart):
jmp L(Shl15LoopStart)
L(Shl15LoopExit):
- movaps (%edx), %xmm6
- psrldq $1, %xmm6
+ movl -3(%ecx), %esi
+ movl %esi, -3(%edx)
mov $1, %esi
- palignr $15, %xmm1, %xmm6
- movaps %xmm6, (%edx)
# ifdef USE_AS_STRCAT
jmp L(CopyFrom1To16Bytes)
# endif
+
# ifndef USE_AS_STRCAT
.p2align 4
@@ -2235,15 +2145,38 @@ L(CopyFrom1To16Bytes):
POP (%esi)
test %al, %al
- jz L(ExitHigh)
+ jz L(ExitHigh8)
+
+L(CopyFrom1To16BytesLess8):
+ mov %al, %ah
+ and $15, %ah
+ jz L(ExitHigh4)
+
test $0x01, %al
jnz L(Exit1)
test $0x02, %al
jnz L(Exit2)
test $0x04, %al
jnz L(Exit3)
- test $0x08, %al
- jnz L(Exit4)
+
+ .p2align 4
+L(Exit4):
+ movl (%ecx), %eax
+ movl %eax, (%edx)
+ SAVE_RESULT (3)
+# ifdef USE_AS_STRNCPY
+ sub $4, %ebx
+ lea 4(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitHigh4):
test $0x10, %al
jnz L(Exit5)
test $0x20, %al
@@ -2255,11 +2188,7 @@ L(CopyFrom1To16Bytes):
L(Exit8):
movlpd (%ecx), %xmm0
movlpd %xmm0, (%edx)
-# ifdef USE_AS_STPCPY
- lea 7(%edx), %eax
-# else
- movl %edi, %eax
-# endif
+ SAVE_RESULT (7)
# ifdef USE_AS_STRNCPY
sub $8, %ebx
lea 8(%edx), %ecx
@@ -2272,15 +2201,38 @@ L(Exit8):
RETURN1
.p2align 4
-L(ExitHigh):
+L(ExitHigh8):
+ mov %ah, %al
+ and $15, %al
+ jz L(ExitHigh12)
+
test $0x01, %ah
jnz L(Exit9)
test $0x02, %ah
jnz L(Exit10)
test $0x04, %ah
jnz L(Exit11)
- test $0x08, %ah
- jnz L(Exit12)
+
+ .p2align 4
+L(Exit12):
+ movlpd (%ecx), %xmm0
+ movl 8(%ecx), %eax
+ movlpd %xmm0, (%edx)
+ movl %eax, 8(%edx)
+ SAVE_RESULT (11)
+# ifdef USE_AS_STRNCPY
+ sub $12, %ebx
+ lea 12(%edx), %ecx
+ jnz L(StrncpyFillTailWithZero1)
+# ifdef USE_AS_STPCPY
+ cmpb $1, (%eax)
+ sbb $-1, %eax
+# endif
+# endif
+ RETURN1
+
+ .p2align 4
+L(ExitHigh12):
test $0x10, %ah
jnz L(Exit13)