diff options
| author | Liubov Dmitrieva <liubov.dmitrieva@gmail.com> | 2011-12-22 14:22:00 -0500 |
|---|---|---|
| committer | Ulrich Drepper <drepper@gmail.com> | 2011-12-22 14:22:00 -0500 |
| commit | 2bd779ae3f3a86bce22fcb7665d740b14ac677ca (patch) | |
| tree | b6874177395668dca502b398d0e547c8c64902cc | |
| parent | 16c6f99208229d7222fd26499749e56137322a3c (diff) | |
| download | glibc-2bd779ae3f3a86bce22fcb7665d740b14ac677ca.tar.xz glibc-2bd779ae3f3a86bce22fcb7665d740b14ac677ca.zip | |
Fix overrun in strcpy destination buffer in x86-32/SSSE3 version
| -rw-r--r-- | ChangeLog | 5 | ||||
| -rw-r--r-- | sysdeps/i386/i686/multiarch/strcpy-ssse3.S | 1261 |
2 files changed, 521 insertions, 745 deletions
@@ -1,3 +1,8 @@ +2011-12-22 Liubov Dmitrieva <liubov.dmitrieva@gmail.com> + + * sysdeps/i386/i686/multiarch/strcpy-ssse3.S: Fix wrong copying + processing for last bytes. + 2011-12-22 Ulrich Drepper <drepper@gmail.com> * locale/iso-639.def: Add brx entry. diff --git a/sysdeps/i386/i686/multiarch/strcpy-ssse3.S b/sysdeps/i386/i686/multiarch/strcpy-ssse3.S index 073856ff84..470ddbe279 100644 --- a/sysdeps/i386/i686/multiarch/strcpy-ssse3.S +++ b/sysdeps/i386/i686/multiarch/strcpy-ssse3.S @@ -20,6 +20,7 @@ #ifndef NOT_IN_libc + # ifndef USE_AS_STRCAT # include <sysdep.h> @@ -31,8 +32,8 @@ cfi_adjust_cfa_offset (-4); \ cfi_restore (REG) -# define PUSH(REG) pushl REG; CFI_PUSH (REG) -# define POP(REG) popl REG; CFI_POP (REG) +# define PUSH(REG) pushl REG; CFI_PUSH (REG) +# define POP(REG) popl REG; CFI_POP (REG) # ifndef STRCPY # define STRCPY __strcpy_ssse3 @@ -40,14 +41,22 @@ # ifdef USE_AS_STRNCPY # define PARMS 8 -# define ENTRANCE PUSH(%ebx) -# define RETURN POP(%ebx); ret; CFI_PUSH(%ebx); -# define RETURN1 POP(%edi); POP(%ebx); ret; CFI_PUSH(%ebx); CFI_PUSH(%edi) +# define ENTRANCE PUSH (%ebx) +# define RETURN POP (%ebx); ret; CFI_PUSH (%ebx); +# define RETURN1 POP (%edi); POP (%ebx); ret; CFI_PUSH (%ebx); CFI_PUSH (%edi) # else # define PARMS 4 # define ENTRANCE # define RETURN ret -# define RETURN1 POP(%edi); ret; CFI_PUSH(%edi) +# define RETURN1 POP (%edi); ret; CFI_PUSH (%edi) +# endif + +# ifdef USE_AS_STPCPY +# define SAVE_RESULT(n) lea n(%edx), %eax +# define SAVE_RESULT_TAIL(n) lea n(%edx), %eax +# else +# define SAVE_RESULT(n) movl %edi, %eax +# define SAVE_RESULT_TAIL(n) movl %edx, %eax # endif # define STR1 PARMS @@ -60,9 +69,7 @@ movl - 4 byte movlpd - 8 byte movaps - 16 byte - requires 16 byte alignment - of sourse and destination adresses. - 16 byte alignment: adress is 32bit value, - right four bit of adress shall be 0. + of sourse and destination adresses. */ .text @@ -72,8 +79,6 @@ ENTRY (STRCPY) mov STR2(%esp), %ecx # ifdef USE_AS_STRNCPY movl LEN(%esp), %ebx - test %ebx, %ebx - jz L(ExitTail0) cmp $8, %ebx jbe L(StrncpyExit8Bytes) # endif @@ -127,39 +132,23 @@ ENTRY (STRCPY) sub $16, %ebx and $0xf, %esi -/* add 16 bytes ecx_shift to ebx */ +/* add 16 bytes ecx_offset to ebx */ add %esi, %ebx # endif lea 16(%ecx), %esi -/* Now: - esi = alignment_16(ecx) + ecx_shift + 16; - ecx_shift = ecx - alignment_16(ecx) -*/ and $-16, %esi -/* Now: - esi = alignment_16(ecx) + 16 -*/ pxor %xmm0, %xmm0 movlpd (%ecx), %xmm1 movlpd %xmm1, (%edx) -/* - look if there is zero symbol in next 16 bytes of string - from esi to esi + 15 and form mask in xmm0 -*/ + pcmpeqb (%esi), %xmm0 movlpd 8(%ecx), %xmm1 movlpd %xmm1, 8(%edx) -/* convert byte mask in xmm0 to bit mask */ - pmovmskb %xmm0, %eax sub %ecx, %esi -/* esi = 16 - ecx_shift */ - -/* eax = 0: there isn't end of string from position esi to esi+15 */ - # ifdef USE_AS_STRNCPY sub $16, %ebx jbe L(CopyFrom1To16BytesCase2OrCase3) @@ -169,17 +158,9 @@ ENTRY (STRCPY) mov %edx, %eax lea 16(%edx), %edx -/* Now: - edx = edx + 16 = alignment_16(edx) + edx_shift + 16 -*/ and $-16, %edx - -/* Now: edx = alignment_16(edx) + 16 */ - sub %edx, %eax -/* Now: eax = edx_shift - 16 */ - # ifdef USE_AS_STRNCPY add %eax, %esi lea -1(%esi), %esi @@ -191,22 +172,11 @@ ENTRY (STRCPY) L(ContinueCopy): # endif sub %eax, %ecx -/* Now: - case ecx_shift >= edx_shift: - ecx = alignment_16(ecx) + (ecx_shift - edx_shift) + 16 - case ecx_shift < edx_shift: - ecx = alignment_16(ecx) + (16 + ecx_shift - edx_shift) -*/ mov %ecx, %eax and $0xf, %eax -/* Now: - case ecx_shift >= edx_shift: eax = ecx_shift - edx_shift - case ecx_shift < edx_shift: eax = (16 + ecx_shift - edx_shift) - eax can be 0, 1, ..., 15 -*/ mov $0, %esi -/* case: ecx_shift == edx_shift */ +/* case: ecx_offset == edx_offset */ jz L(Align16Both) @@ -323,7 +293,7 @@ L(Align16Both): sub %ecx, %eax sub %eax, %edx # ifdef USE_AS_STRNCPY - lea 48+64(%ebx, %eax), %ebx + lea 112(%ebx, %eax), %ebx # endif mov $-0x40, %esi @@ -441,7 +411,6 @@ L(Shl1Start): jnz L(Shl1LoopExit) palignr $1, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 31(%ecx), %xmm2 @@ -449,7 +418,6 @@ L(Shl1Start): lea 16(%edx), %edx pmovmskb %xmm0, %eax lea 16(%ecx), %ecx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %ebx jbe L(StrncpyExit1Case2OrCase3) @@ -457,8 +425,7 @@ L(Shl1Start): test %eax, %eax jnz L(Shl1LoopExit) - palignr $1, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $1, %xmm3, %xmm2 movaps %xmm2, (%edx) lea 31(%ecx), %ecx lea 16(%edx), %edx @@ -506,11 +473,11 @@ L(Shl1LoopStart): jmp L(Shl1LoopStart) L(Shl1LoopExit): - movaps (%edx), %xmm6 - psrldq $15, %xmm6 + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) + movlpd 7(%ecx), %xmm0 + movlpd %xmm0, 7(%edx) mov $15, %esi - palignr $1, %xmm1, %xmm6 - movaps %xmm6, (%edx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -563,7 +530,6 @@ L(Shl2Start): jnz L(Shl2LoopExit) palignr $2, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 30(%ecx), %xmm2 @@ -571,7 +537,6 @@ L(Shl2Start): lea 16(%edx), %edx pmovmskb %xmm0, %eax lea 16(%ecx), %ecx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %ebx jbe L(StrncpyExit2Case2OrCase3) @@ -579,8 +544,7 @@ L(Shl2Start): test %eax, %eax jnz L(Shl2LoopExit) - palignr $2, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $2, %xmm3, %xmm2 movaps %xmm2, (%edx) lea 30(%ecx), %ecx lea 16(%edx), %edx @@ -628,11 +592,11 @@ L(Shl2LoopStart): jmp L(Shl2LoopStart) L(Shl2LoopExit): - movaps (%edx), %xmm6 - psrldq $14, %xmm6 + movlpd (%ecx), %xmm0 + movlpd 6(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 6(%edx) mov $14, %esi - palignr $2, %xmm1, %xmm6 - movaps %xmm6, (%edx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -685,7 +649,6 @@ L(Shl3Start): jnz L(Shl3LoopExit) palignr $3, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 29(%ecx), %xmm2 @@ -693,7 +656,6 @@ L(Shl3Start): lea 16(%edx), %edx pmovmskb %xmm0, %eax lea 16(%ecx), %ecx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %ebx jbe L(StrncpyExit3Case2OrCase3) @@ -701,8 +663,7 @@ L(Shl3Start): test %eax, %eax jnz L(Shl3LoopExit) - palignr $3, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $3, %xmm3, %xmm2 movaps %xmm2, (%edx) lea 29(%ecx), %ecx lea 16(%edx), %edx @@ -750,11 +711,11 @@ L(Shl3LoopStart): jmp L(Shl3LoopStart) L(Shl3LoopExit): - movaps (%edx), %xmm6 - psrldq $13, %xmm6 + movlpd (%ecx), %xmm0 + movlpd 5(%ecx), %xmm1 + movlpd %xmm0, (%edx) + movlpd %xmm1, 5(%edx) mov $13, %esi - palignr $3, %xmm1, %xmm6 - movaps %xmm6, (%edx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -807,7 +768,6 @@ L(Shl4Start): jnz L(Shl4LoopExit) palignr $4, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 28(%ecx), %xmm2 @@ -815,7 +775,6 @@ L(Shl4Start): lea 16(%edx), %edx pmovmskb %xmm0, %eax lea 16(%ecx), %ecx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %ebx jbe L(StrncpyExit4Case2OrCase3) @@ -823,8 +782,7 @@ L(Shl4Start): test %eax, %eax jnz L(Shl4LoopExit) - palignr $4, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $4, %xmm3, %xmm2 movaps %xmm2, (%edx) lea 28(%ecx), %ecx lea 16(%edx), %edx @@ -872,11 +830,11 @@ L(Shl4LoopStart): jmp L(Shl4LoopStart) L(Shl4LoopExit): - movaps (%edx), %xmm6 - psrldq $12, %xmm6 + movlpd (%ecx), %xmm0 + movl 8(%ecx), %esi + movlpd %xmm0, (%edx) + movl %esi, 8(%edx) mov $12, %esi - palignr $4, %xmm1, %xmm6 - movaps %xmm6, (%edx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -929,7 +887,6 @@ L(Shl5Start): jnz L(Shl5LoopExit) palignr $5, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 27(%ecx), %xmm2 @@ -937,7 +894,6 @@ L(Shl5Start): lea 16(%edx), %edx pmovmskb %xmm0, %eax lea 16(%ecx), %ecx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %ebx jbe L(StrncpyExit5Case2OrCase3) @@ -945,8 +901,7 @@ L(Shl5Start): test %eax, %eax jnz L(Shl5LoopExit) - palignr $5, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $5, %xmm3, %xmm2 movaps %xmm2, (%edx) lea 27(%ecx), %ecx lea 16(%edx), %edx @@ -994,11 +949,11 @@ L(Shl5LoopStart): jmp L(Shl5LoopStart) L(Shl5LoopExit): - movaps (%edx), %xmm6 - psrldq $11, %xmm6 + movlpd (%ecx), %xmm0 + movl 7(%ecx), %esi + movlpd %xmm0, (%edx) + movl %esi, 7(%edx) mov $11, %esi - palignr $5, %xmm1, %xmm6 - movaps %xmm6, (%edx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -1051,7 +1006,6 @@ L(Shl6Start): jnz L(Shl6LoopExit) palignr $6, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 26(%ecx), %xmm2 @@ -1059,7 +1013,6 @@ L(Shl6Start): lea 16(%edx), %edx pmovmskb %xmm0, %eax lea 16(%ecx), %ecx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %ebx jbe L(StrncpyExit6Case2OrCase3) @@ -1067,8 +1020,7 @@ L(Shl6Start): test %eax, %eax jnz L(Shl6LoopExit) - palignr $6, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $6, %xmm3, %xmm2 movaps %xmm2, (%edx) lea 26(%ecx), %ecx lea 16(%edx), %edx @@ -1116,11 +1068,11 @@ L(Shl6LoopStart): jmp L(Shl6LoopStart) L(Shl6LoopExit): - movaps (%edx), %xmm6 - psrldq $10, %xmm6 + movlpd (%ecx), %xmm0 + movl 6(%ecx), %esi + movlpd %xmm0, (%edx) + movl %esi, 6(%edx) mov $10, %esi - palignr $6, %xmm1, %xmm6 - movaps %xmm6, (%edx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -1173,7 +1125,6 @@ L(Shl7Start): jnz L(Shl7LoopExit) palignr $7, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 25(%ecx), %xmm2 @@ -1181,7 +1132,6 @@ L(Shl7Start): lea 16(%edx), %edx pmovmskb %xmm0, %eax lea 16(%ecx), %ecx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %ebx jbe L(StrncpyExit7Case2OrCase3) @@ -1189,8 +1139,7 @@ L(Shl7Start): test %eax, %eax jnz L(Shl7LoopExit) - palignr $7, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $7, %xmm3, %xmm2 movaps %xmm2, (%edx) lea 25(%ecx), %ecx lea 16(%edx), %edx @@ -1238,11 +1187,11 @@ L(Shl7LoopStart): jmp L(Shl7LoopStart) L(Shl7LoopExit): - movaps (%edx), %xmm6 - psrldq $9, %xmm6 + movlpd (%ecx), %xmm0 + movl 5(%ecx), %esi + movlpd %xmm0, (%edx) + movl %esi, 5(%edx) mov $9, %esi - palignr $7, %xmm1, %xmm6 - movaps %xmm6, (%edx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -1295,7 +1244,6 @@ L(Shl8Start): jnz L(Shl8LoopExit) palignr $8, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 24(%ecx), %xmm2 @@ -1303,7 +1251,6 @@ L(Shl8Start): lea 16(%edx), %edx pmovmskb %xmm0, %eax lea 16(%ecx), %ecx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %ebx jbe L(StrncpyExit8Case2OrCase3) @@ -1311,8 +1258,7 @@ L(Shl8Start): test %eax, %eax jnz L(Shl8LoopExit) - palignr $8, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $8, %xmm3, %xmm2 movaps %xmm2, (%edx) lea 24(%ecx), %ecx lea 16(%edx), %edx @@ -1360,11 +1306,9 @@ L(Shl8LoopStart): jmp L(Shl8LoopStart) L(Shl8LoopExit): - movaps (%edx), %xmm6 - psrldq $8, %xmm6 + movlpd (%ecx), %xmm0 + movlpd %xmm0, (%edx) mov $8, %esi - palignr $8, %xmm1, %xmm6 - movaps %xmm6, (%edx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -1417,7 +1361,6 @@ L(Shl9Start): jnz L(Shl9LoopExit) palignr $9, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 23(%ecx), %xmm2 @@ -1425,7 +1368,6 @@ L(Shl9Start): lea 16(%edx), %edx pmovmskb %xmm0, %eax lea 16(%ecx), %ecx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %ebx jbe L(StrncpyExit9Case2OrCase3) @@ -1433,8 +1375,7 @@ L(Shl9Start): test %eax, %eax jnz L(Shl9LoopExit) - palignr $9, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $9, %xmm3, %xmm2 movaps %xmm2, (%edx) lea 23(%ecx), %ecx lea 16(%edx), %edx @@ -1482,11 +1423,9 @@ L(Shl9LoopStart): jmp L(Shl9LoopStart) L(Shl9LoopExit): - movaps (%edx), %xmm6 - psrldq $7, %xmm6 + movlpd -1(%ecx), %xmm0 + movlpd %xmm0, -1(%edx) mov $7, %esi - palignr $9, %xmm1, %xmm6 - movaps %xmm6, (%edx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -1539,7 +1478,6 @@ L(Shl10Start): jnz L(Shl10LoopExit) palignr $10, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 22(%ecx), %xmm2 @@ -1547,7 +1485,6 @@ L(Shl10Start): lea 16(%edx), %edx pmovmskb %xmm0, %eax lea 16(%ecx), %ecx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %ebx jbe L(StrncpyExit10Case2OrCase3) @@ -1555,8 +1492,7 @@ L(Shl10Start): test %eax, %eax jnz L(Shl10LoopExit) - palignr $10, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $10, %xmm3, %xmm2 movaps %xmm2, (%edx) lea 22(%ecx), %ecx lea 16(%edx), %edx @@ -1604,11 +1540,9 @@ L(Shl10LoopStart): jmp L(Shl10LoopStart) L(Shl10LoopExit): - movaps (%edx), %xmm6 - psrldq $6, %xmm6 + movlpd -2(%ecx), %xmm0 + movlpd %xmm0, -2(%edx) mov $6, %esi - palignr $10, %xmm1, %xmm6 - movaps %xmm6, (%edx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -1661,7 +1595,6 @@ L(Shl11Start): jnz L(Shl11LoopExit) palignr $11, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 21(%ecx), %xmm2 @@ -1669,7 +1602,6 @@ L(Shl11Start): lea 16(%edx), %edx pmovmskb %xmm0, %eax lea 16(%ecx), %ecx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %ebx jbe L(StrncpyExit11Case2OrCase3) @@ -1677,8 +1609,7 @@ L(Shl11Start): test %eax, %eax jnz L(Shl11LoopExit) - palignr $11, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $11, %xmm3, %xmm2 movaps %xmm2, (%edx) lea 21(%ecx), %ecx lea 16(%edx), %edx @@ -1726,11 +1657,9 @@ L(Shl11LoopStart): jmp L(Shl11LoopStart) L(Shl11LoopExit): - movaps (%edx), %xmm6 - psrldq $5, %xmm6 + movlpd -3(%ecx), %xmm0 + movlpd %xmm0, -3(%edx) mov $5, %esi - palignr $11, %xmm1, %xmm6 - movaps %xmm6, (%edx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -1783,7 +1712,6 @@ L(Shl12Start): jnz L(Shl12LoopExit) palignr $12, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 20(%ecx), %xmm2 @@ -1791,7 +1719,6 @@ L(Shl12Start): lea 16(%edx), %edx pmovmskb %xmm0, %eax lea 16(%ecx), %ecx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %ebx jbe L(StrncpyExit12Case2OrCase3) @@ -1799,8 +1726,7 @@ L(Shl12Start): test %eax, %eax jnz L(Shl12LoopExit) - palignr $12, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $12, %xmm3, %xmm2 movaps %xmm2, (%edx) lea 20(%ecx), %ecx lea 16(%edx), %edx @@ -1848,11 +1774,9 @@ L(Shl12LoopStart): jmp L(Shl12LoopStart) L(Shl12LoopExit): - movaps (%edx), %xmm6 - psrldq $4, %xmm6 + movl (%ecx), %esi + movl %esi, (%edx) mov $4, %esi - palignr $12, %xmm1, %xmm6 - movaps %xmm6, (%edx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -1905,7 +1829,6 @@ L(Shl13Start): jnz L(Shl13LoopExit) palignr $13, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 19(%ecx), %xmm2 @@ -1913,7 +1836,6 @@ L(Shl13Start): lea 16(%edx), %edx pmovmskb %xmm0, %eax lea 16(%ecx), %ecx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %ebx jbe L(StrncpyExit13Case2OrCase3) @@ -1921,8 +1843,7 @@ L(Shl13Start): test %eax, %eax jnz L(Shl13LoopExit) - palignr $13, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $13, %xmm3, %xmm2 movaps %xmm2, (%edx) lea 19(%ecx), %ecx lea 16(%edx), %edx @@ -1970,11 +1891,9 @@ L(Shl13LoopStart): jmp L(Shl13LoopStart) L(Shl13LoopExit): - movaps (%edx), %xmm6 - psrldq $3, %xmm6 + movl -1(%ecx), %esi + movl %esi, -1(%edx) mov $3, %esi - palignr $13, %xmm1, %xmm6 - movaps %xmm6, (%edx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -2027,7 +1946,6 @@ L(Shl14Start): jnz L(Shl14LoopExit) palignr $14, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 18(%ecx), %xmm2 @@ -2035,7 +1953,6 @@ L(Shl14Start): lea 16(%edx), %edx pmovmskb %xmm0, %eax lea 16(%ecx), %ecx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %ebx jbe L(StrncpyExit14Case2OrCase3) @@ -2043,8 +1960,7 @@ L(Shl14Start): test %eax, %eax jnz L(Shl14LoopExit) - palignr $14, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $14, %xmm3, %xmm2 movaps %xmm2, (%edx) lea 18(%ecx), %ecx lea 16(%edx), %edx @@ -2092,11 +2008,9 @@ L(Shl14LoopStart): jmp L(Shl14LoopStart) L(Shl14LoopExit): - movaps (%edx), %xmm6 - psrldq $2, %xmm6 + movl -2(%ecx), %esi + movl %esi, -2(%edx) mov $2, %esi - palignr $14, %xmm1, %xmm6 - movaps %xmm6, (%edx) jmp L(CopyFrom1To16Bytes) .p2align 4 @@ -2149,7 +2063,6 @@ L(Shl15Start): jnz L(Shl15LoopExit) palignr $15, %xmm1, %xmm2 - movaps %xmm3, %xmm1 movaps %xmm2, (%edx) movaps 17(%ecx), %xmm2 @@ -2157,7 +2070,6 @@ L(Shl15Start): lea 16(%edx), %edx pmovmskb %xmm0, %eax lea 16(%ecx), %ecx - movaps %xmm2, %xmm3 # ifdef USE_AS_STRNCPY sub $16, %ebx jbe L(StrncpyExit15Case2OrCase3) @@ -2165,8 +2077,7 @@ L(Shl15Start): test %eax, %eax jnz L(Shl15LoopExit) - palignr $15, %xmm1, %xmm2 - movaps %xmm3, %xmm1 + palignr $15, %xmm3, %xmm2 movaps %xmm2, (%edx) lea 17(%ecx), %ecx lea 16(%edx), %edx @@ -2214,15 +2125,14 @@ L(Shl15LoopStart): jmp L(Shl15LoopStart) L(Shl15LoopExit): - movaps (%edx), %xmm6 - psrldq $1, %xmm6 + movl -3(%ecx), %esi + movl %esi, -3(%edx) mov $1, %esi - palignr $15, %xmm1, %xmm6 - movaps %xmm6, (%edx) # ifdef USE_AS_STRCAT jmp L(CopyFrom1To16Bytes) # endif + # ifndef USE_AS_STRCAT .p2align 4 @@ -2235,15 +2145,38 @@ L(CopyFrom1To16Bytes): POP (%esi) test %al, %al - jz L(ExitHigh) + jz L(ExitHigh8) + +L(CopyFrom1To16BytesLess8): + mov %al, %ah + and $15, %ah + jz L(ExitHigh4) + test $0x01, %al jnz L(Exit1) test $0x02, %al jnz L(Exit2) test $0x04, %al jnz L(Exit3) - test $0x08, %al - jnz L(Exit4) + + .p2align 4 +L(Exit4): + movl (%ecx), %eax + movl %eax, (%edx) + SAVE_RESULT (3) +# ifdef USE_AS_STRNCPY + sub $4, %ebx + lea 4(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(ExitHigh4): test $0x10, %al jnz L(Exit5) test $0x20, %al @@ -2255,11 +2188,7 @@ L(CopyFrom1To16Bytes): L(Exit8): movlpd (%ecx), %xmm0 movlpd %xmm0, (%edx) -# ifdef USE_AS_STPCPY - lea 7(%edx), %eax -# else - movl %edi, %eax -# endif + SAVE_RESULT (7) # ifdef USE_AS_STRNCPY sub $8, %ebx lea 8(%edx), %ecx @@ -2272,15 +2201,38 @@ L(Exit8): RETURN1 .p2align 4 -L(ExitHigh): +L(ExitHigh8): + mov %ah, %al + and $15, %al + jz L(ExitHigh12) + test $0x01, %ah jnz L(Exit9) test $0x02, %ah jnz L(Exit10) test $0x04, %ah jnz L(Exit11) - test $0x08, %ah - jnz L(Exit12) + + .p2align 4 +L(Exit12): + movlpd (%ecx), %xmm0 + movl 8(%ecx), %eax + movlpd %xmm0, (%edx) + movl %eax, 8(%edx) + SAVE_RESULT (11) +# ifdef USE_AS_STRNCPY + sub $12, %ebx + lea 12(%edx), %ecx + jnz L(StrncpyFillTailWithZero1) +# ifdef USE_AS_STPCPY + cmpb $1, (%eax) + sbb $-1, %eax +# endif +# endif + RETURN1 + + .p2align 4 +L(ExitHigh12): test $0x10, %ah jnz L(Exit13) |
