author    Alan Modra <amodra@gmail.com>  2013-08-17 18:47:22 +0930
committer Alan Modra <amodra@gmail.com>  2013-10-04 10:41:24 +0930
commit    759cfef3ac4c07dba1ece0bbc1207e099348816d (patch)
tree      a0e8cadce4426afb90d39b330dd50688b8975484
parent    fe6e95d7171eba5f3e07848f081676fae4e86322 (diff)
PowerPC LE memcpy
http://sourceware.org/ml/libc-alpha/2013-08/msg00103.html

Little-endian support for memcpy.  I spent some time cleaning up the
64-bit power7 memcpy, in order to avoid the extra alignment traps
power7 takes for little-endian.  It probably would have been better
to copy the linux kernel version of memcpy.

	* sysdeps/powerpc/powerpc32/power4/memcpy.S: Add little endian support.
	* sysdeps/powerpc/powerpc32/power6/memcpy.S: Likewise.
	* sysdeps/powerpc/powerpc32/power7/memcpy.S: Likewise.
	* sysdeps/powerpc/powerpc32/power7/mempcpy.S: Likewise.
	* sysdeps/powerpc/powerpc64/memcpy.S: Likewise.
	* sysdeps/powerpc/powerpc64/power4/memcpy.S: Likewise.
	* sysdeps/powerpc/powerpc64/power6/memcpy.S: Likewise.
	* sysdeps/powerpc/powerpc64/power7/memcpy.S: Likewise.
	* sysdeps/powerpc/powerpc64/power7/mempcpy.S: Likewise.  Make better
	use of regs.  Use power7 mtocrf.  Tidy function tails.
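
A minimal C sketch of the idea behind the sld/srd (and slw/srw) swaps in
the hunks below, not part of the commit: the unaligned-copy loops merge two
adjacent aligned doubleword loads into one store.  On big-endian the earlier
word supplies the most-significant bytes and is shifted left; on little-endian
the byte order is reversed, so the shift directions swap.  The function name
and the standalone test of __LITTLE_ENDIAN__ are illustrative assumptions.

#include <stdint.h>

/* Illustrative only: merge two aligned 64-bit loads into one destination
   doubleword when the source is misaligned by 'off' bytes, 1 <= off <= 7
   (the aligned case takes a separate path in the assembly, so shifts by
   0 or 64 never occur here).  */
static uint64_t
merge_unaligned (uint64_t w0, uint64_t w1, unsigned int off)
{
  unsigned int sh = off * 8;           /* shift count, r10 in the assembly */
  unsigned int rsh = 64 - sh;          /* complement,  r9  in the assembly */
#ifdef __LITTLE_ENDIAN__
  return (w0 >> sh) | (w1 << rsh);     /* srd 0,6,10 ; sld 8,7,9 ; or 0,0,8 */
#else
  return (w0 << sh) | (w1 >> rsh);     /* sld 0,6,10 ; srd 8,7,9 ; or 0,0,8 */
#endif
}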
-rw-r--r--   ChangeLog                                      13
-rw-r--r--   sysdeps/powerpc/powerpc32/power4/memcpy.S      58
-rw-r--r--   sysdeps/powerpc/powerpc32/power6/memcpy.S      81
-rw-r--r--   sysdeps/powerpc/powerpc32/power7/memcpy.S      24
-rw-r--r--   sysdeps/powerpc/powerpc32/power7/mempcpy.S     28
-rw-r--r--   sysdeps/powerpc/powerpc64/memcpy.S             27
-rw-r--r--   sysdeps/powerpc/powerpc64/power4/memcpy.S      61
-rw-r--r--   sysdeps/powerpc/powerpc64/power6/memcpy.S     329
-rw-r--r--   sysdeps/powerpc/powerpc64/power7/memcpy.S     704
-rw-r--r--   sysdeps/powerpc/powerpc64/power7/mempcpy.S     26
10 files changed, 941 insertions, 410 deletions
diff --git a/ChangeLog b/ChangeLog
index 51311857a6..959d3a30ab 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,18 @@
2013-10-04 Alan Modra <amodra@gmail.com>
+ * sysdeps/powerpc/powerpc32/power4/memcpy.S: Add little endian support.
+ * sysdeps/powerpc/powerpc32/power6/memcpy.S: Likewise.
+ * sysdeps/powerpc/powerpc32/power7/memcpy.S: Likewise.
+ * sysdeps/powerpc/powerpc32/power7/mempcpy.S: Likewise.
+ * sysdeps/powerpc/powerpc64/memcpy.S: Likewise.
+ * sysdeps/powerpc/powerpc64/power4/memcpy.S: Likewise.
+ * sysdeps/powerpc/powerpc64/power6/memcpy.S: Likewise.
+ * sysdeps/powerpc/powerpc64/power7/memcpy.S: Likewise.
+ * sysdeps/powerpc/powerpc64/power7/mempcpy.S: Likewise. Make better
+ use of regs. Use power7 mtocrf. Tidy function tails.
+
+2013-10-04 Alan Modra <amodra@gmail.com>
+
* sysdeps/powerpc/powerpc64/power7/memcmp.S: Add little-endian support.
Formatting. Consistently use rXXX register defines or rN defines.
Use early exit labels that avoid restoring unused non-volatile regs.
diff --git a/sysdeps/powerpc/powerpc32/power4/memcpy.S b/sysdeps/powerpc/powerpc32/power4/memcpy.S
index d9146631e3..338d3cce30 100644
--- a/sysdeps/powerpc/powerpc32/power4/memcpy.S
+++ b/sysdeps/powerpc/powerpc32/power4/memcpy.S
@@ -203,15 +203,28 @@ EALIGN (memcpy, 5, 0)
blt cr6,5f
srwi 7,6,16
bgt cr6,3f
+#ifdef __LITTLE_ENDIAN__
+ sth 7,0(3)
+#else
sth 6,0(3)
+#endif
b 7f
.align 4
3:
+#ifdef __LITTLE_ENDIAN__
+ rotlwi 6,6,24
+ stb 6,0(3)
+ sth 7,1(3)
+#else
stb 7,0(3)
sth 6,1(3)
+#endif
b 7f
.align 4
5:
+#ifdef __LITTLE_ENDIAN__
+ rotlwi 6,6,8
+#endif
stb 6,0(3)
7:
cmplwi cr1,10,16
@@ -339,13 +352,23 @@ EALIGN (memcpy, 5, 0)
bf 30,1f
/* there are at least two words to copy, so copy them */
+#ifdef __LITTLE_ENDIAN__
+ srw 0,6,10
+ slw 8,7,9
+#else
slw 0,6,10 /* shift 1st src word to left align it in R0 */
srw 8,7,9 /* shift 2nd src word to right align it in R8 */
+#endif
or 0,0,8 /* or them to get word to store */
lwz 6,8(5) /* load the 3rd src word */
stw 0,0(4) /* store the 1st dst word */
+#ifdef __LITTLE_ENDIAN__
+ srw 0,7,10
+ slw 8,6,9
+#else
slw 0,7,10 /* now left align 2nd src word into R0 */
srw 8,6,9 /* shift 3rd src word to right align it in R8 */
+#endif
or 0,0,8 /* or them to get word to store */
lwz 7,12(5)
stw 0,4(4) /* store the 2nd dst word */
@@ -353,8 +376,13 @@ EALIGN (memcpy, 5, 0)
addi 5,5,16
bf 31,4f
/* there is a third word to copy, so copy it */
+#ifdef __LITTLE_ENDIAN__
+ srw 0,6,10
+ slw 8,7,9
+#else
slw 0,6,10 /* shift 3rd src word to left align it in R0 */
srw 8,7,9 /* shift 4th src word to right align it in R8 */
+#endif
or 0,0,8 /* or them to get word to store */
stw 0,0(4) /* store 3rd dst word */
mr 6,7
@@ -364,8 +392,13 @@ EALIGN (memcpy, 5, 0)
b 4f
.align 4
1:
+#ifdef __LITTLE_ENDIAN__
+ srw 0,6,10
+ slw 8,7,9
+#else
slw 0,6,10 /* shift 1st src word to left align it in R0 */
srw 8,7,9 /* shift 2nd src word to right align it in R8 */
+#endif
addi 5,5,8
or 0,0,8 /* or them to get word to store */
bf 31,4f
@@ -378,23 +411,43 @@ EALIGN (memcpy, 5, 0)
.align 4
4:
/* copy 16 bytes at a time */
+#ifdef __LITTLE_ENDIAN__
+ srw 0,6,10
+ slw 8,7,9
+#else
slw 0,6,10
srw 8,7,9
+#endif
or 0,0,8
lwz 6,0(5)
stw 0,0(4)
+#ifdef __LITTLE_ENDIAN__
+ srw 0,7,10
+ slw 8,6,9
+#else
slw 0,7,10
srw 8,6,9
+#endif
or 0,0,8
lwz 7,4(5)
stw 0,4(4)
+#ifdef __LITTLE_ENDIAN__
+ srw 0,6,10
+ slw 8,7,9
+#else
slw 0,6,10
srw 8,7,9
+#endif
or 0,0,8
lwz 6,8(5)
stw 0,8(4)
+#ifdef __LITTLE_ENDIAN__
+ srw 0,7,10
+ slw 8,6,9
+#else
slw 0,7,10
srw 8,6,9
+#endif
or 0,0,8
lwz 7,12(5)
stw 0,12(4)
@@ -403,8 +456,13 @@ EALIGN (memcpy, 5, 0)
bdnz+ 4b
8:
/* calculate and store the final word */
+#ifdef __LITTLE_ENDIAN__
+ srw 0,6,10
+ slw 8,7,9
+#else
slw 0,6,10
srw 8,7,9
+#endif
or 0,0,8
stw 0,0(4)
3:
diff --git a/sysdeps/powerpc/powerpc32/power6/memcpy.S b/sysdeps/powerpc/powerpc32/power6/memcpy.S
index a76f71e04f..f58114a0c5 100644
--- a/sysdeps/powerpc/powerpc32/power6/memcpy.S
+++ b/sysdeps/powerpc/powerpc32/power6/memcpy.S
@@ -219,15 +219,28 @@ L(word_unaligned_short):
blt cr6,5f
srwi 7,6,16
bgt cr6,3f
+#ifdef __LITTLE_ENDIAN__
+ sth 7,0(3)
+#else
sth 6,0(3)
+#endif
b 7f
.align 4
3:
+#ifdef __LITTLE_ENDIAN__
+ rotlwi 6,6,24
+ stb 6,0(3)
+ sth 7,1(3)
+#else
stb 7,0(3)
sth 6,1(3)
+#endif
b 7f
.align 4
5:
+#ifdef __LITTLE_ENDIAN__
+ rotlwi 6,6,8
+#endif
stb 6,0(3)
7:
cmplwi cr1,10,16
@@ -577,7 +590,11 @@ L(wdu1_32):
lwz 6,-1(4)
cmplwi cr6,31,4
srwi 8,31,5 /* calculate the 32 byte loop count */
+#ifdef __LITTLE_ENDIAN__
+ srwi 6,6,8
+#else
slwi 6,6,8
+#endif
clrlwi 31,31,27 /* The remaining bytes, < 32. */
blt cr5,L(wdu1_32tail)
mtctr 8
@@ -585,8 +602,12 @@ L(wdu1_32):
lwz 8,3(4)
lwz 7,4(4)
+#ifdef __LITTLE_ENDIAN__
+ rldimi 6,8,24,32
+#else
/* Equivalent to: srwi 8,8,32-8; or 6,6,8 */
rlwimi 6,8,8,(32-8),31
+#endif
b L(wdu1_loop32x)
.align 4
L(wdu1_loop32):
@@ -595,8 +616,12 @@ L(wdu1_loop32):
lwz 7,4(4)
stw 10,-8(3)
stw 11,-4(3)
+#ifdef __LITTLE_ENDIAN__
+ rldimi 6,8,24,32
+#else
/* Equivalent to srwi 8,8,32-8; or 6,6,8 */
rlwimi 6,8,8,(32-8),31
+#endif
L(wdu1_loop32x):
lwz 10,8(4)
lwz 11,12(4)
@@ -613,7 +638,11 @@ L(wdu1_loop32x):
stw 6,16(3)
stw 7,20(3)
addi 3,3,32
+#ifdef __LITTLE_ENDIAN__
+ srwi 6,8,8
+#else
slwi 6,8,8
+#endif
bdnz+ L(wdu1_loop32)
stw 10,-8(3)
stw 11,-4(3)
@@ -624,8 +653,12 @@ L(wdu1_32tail):
blt cr6,L(wdu_4tail)
/* calculate and store the final word */
lwz 8,3(4)
-/* Equivalent to: srwi 8,8,32-9; or 6,6,8 */
+#ifdef __LITTLE_ENDIAN__
+ rldimi 6,8,24,32
+#else
+/* Equivalent to: srwi 8,8,32-8; or 6,6,8 */
rlwimi 6,8,8,(32-8),31
+#endif
b L(wdu_32tailx)
L(wdu2_32):
@@ -633,7 +666,11 @@ L(wdu2_32):
lwz 6,-2(4)
cmplwi cr6,31,4
srwi 8,31,5 /* calculate the 32 byte loop count */
+#ifdef __LITTLE_ENDIAN__
+ srwi 6,6,16
+#else
slwi 6,6,16
+#endif
clrlwi 31,31,27 /* The remaining bytes, < 32. */
blt cr5,L(wdu2_32tail)
mtctr 8
@@ -641,8 +678,11 @@ L(wdu2_32):
lwz 8,2(4)
lwz 7,4(4)
-/* Equivalent to: srwi 8,8,32-8; or 6,6,8 */
+#ifdef __LITTLE_ENDIAN__
+ rldimi 6,8,16,32
+#else
rlwimi 6,8,16,(32-16),31
+#endif
b L(wdu2_loop32x)
.align 4
L(wdu2_loop32):
@@ -651,8 +691,11 @@ L(wdu2_loop32):
lwz 7,4(4)
stw 10,-8(3)
stw 11,-4(3)
-/* Equivalent to srwi 8,8,32-8; or 6,6,8 */
+#ifdef __LITTLE_ENDIAN__
+ rldimi 6,8,16,32
+#else
rlwimi 6,8,16,(32-16),31
+#endif
L(wdu2_loop32x):
lwz 10,8(4)
lwz 11,12(4)
@@ -670,7 +713,11 @@ L(wdu2_loop32x):
stw 6,16(3)
stw 7,20(3)
addi 3,3,32
+#ifdef __LITTLE_ENDIAN__
+ srwi 6,8,16
+#else
slwi 6,8,16
+#endif
bdnz+ L(wdu2_loop32)
stw 10,-8(3)
stw 11,-4(3)
@@ -681,8 +728,11 @@ L(wdu2_32tail):
blt cr6,L(wdu_4tail)
/* calculate and store the final word */
lwz 8,2(4)
-/* Equivalent to: srwi 8,8,32-9; or 6,6,8 */
+#ifdef __LITTLE_ENDIAN__
+ rldimi 6,8,16,32
+#else
rlwimi 6,8,16,(32-16),31
+#endif
b L(wdu_32tailx)
L(wdu3_32):
@@ -690,7 +740,11 @@ L(wdu3_32):
lwz 6,-3(4)
cmplwi cr6,31,4
srwi 8,31,5 /* calculate the 32 byte loop count */
+#ifdef __LITTLE_ENDIAN__
+ srwi 6,6,24
+#else
slwi 6,6,24
+#endif
clrlwi 31,31,27 /* The remaining bytes, < 32. */
blt cr5,L(wdu3_32tail)
mtctr 8
@@ -698,8 +752,11 @@ L(wdu3_32):
lwz 8,1(4)
lwz 7,4(4)
-/* Equivalent to: srwi 8,8,32-8; or 6,6,8 */
+#ifdef __LITTLE_ENDIAN__
+ rldimi 6,8,8,32
+#else
rlwimi 6,8,24,(32-24),31
+#endif
b L(wdu3_loop32x)
.align 4
L(wdu3_loop32):
@@ -708,8 +765,11 @@ L(wdu3_loop32):
lwz 7,4(4)
stw 10,-8(3)
stw 11,-4(3)
-/* Equivalent to srwi 8,8,32-8; or 6,6,8 */
+#ifdef __LITTLE_ENDIAN__
+ rldimi 6,8,8,32
+#else
rlwimi 6,8,24,(32-24),31
+#endif
L(wdu3_loop32x):
lwz 10,8(4)
lwz 11,12(4)
@@ -726,7 +786,11 @@ L(wdu3_loop32x):
stw 6,16(3)
stw 7,20(3)
addi 3,3,32
+#ifdef __LITTLE_ENDIAN__
+ srwi 6,8,24
+#else
slwi 6,8,24
+#endif
bdnz+ L(wdu3_loop32)
stw 10,-8(3)
stw 11,-4(3)
@@ -737,8 +801,11 @@ L(wdu3_32tail):
blt cr6,L(wdu_4tail)
/* calculate and store the final word */
lwz 8,1(4)
-/* Equivalent to: srwi 8,8,32-9; or 6,6,8 */
+#ifdef __LITTLE_ENDIAN__
+ rldimi 6,8,8,32
+#else
rlwimi 6,8,24,(32-24),31
+#endif
b L(wdu_32tailx)
.align 4
L(wdu_32tailx):
diff --git a/sysdeps/powerpc/powerpc32/power7/memcpy.S b/sysdeps/powerpc/powerpc32/power7/memcpy.S
index 7f00778236..acf3c10198 100644
--- a/sysdeps/powerpc/powerpc32/power7/memcpy.S
+++ b/sysdeps/powerpc/powerpc32/power7/memcpy.S
@@ -383,7 +383,7 @@ L(copy_GE_32_unaligned):
beq L(copy_GE_32_unaligned_cont)
- /* SRC is not quadword aligned, get it aligned. */
+ /* DST is not quadword aligned, get it aligned. */
mtcrf 0x01,0
subf 31,0,5
@@ -435,13 +435,21 @@ L(copy_GE_32_unaligned_cont):
mr 11,12
mtcrf 0x01,9
cmplwi cr6,9,1
+#ifdef __LITTLE_ENDIAN__
+ lvsr 5,0,12
+#else
lvsl 5,0,12
+#endif
lvx 3,0,12
bf 31,L(setup_unaligned_loop)
/* Copy another 16 bytes to align to 32-bytes due to the loop . */
lvx 4,12,6
+#ifdef __LITTLE_ENDIAN__
+ vperm 6,4,3,5
+#else
vperm 6,3,4,5
+#endif
addi 11,12,16
addi 10,3,16
stvx 6,0,3
@@ -461,11 +469,17 @@ L(unaligned_loop):
vector instructions though. */
lvx 4,11,6 /* vr4 = r11+16. */
- vperm 6,3,4,5 /* Merge the correctly-aligned portions
- of vr3/vr4 into vr6. */
+#ifdef __LITTLE_ENDIAN__
+ vperm 6,4,3,5
+#else
+ vperm 6,3,4,5
+#endif
lvx 3,11,7 /* vr3 = r11+32. */
- vperm 10,4,3,5 /* Merge the correctly-aligned portions
- of vr3/vr4 into vr10. */
+#ifdef __LITTLE_ENDIAN__
+ vperm 10,3,4,5
+#else
+ vperm 10,4,3,5
+#endif
addi 11,11,32
stvx 6,0,10
stvx 10,10,6
diff --git a/sysdeps/powerpc/powerpc32/power7/mempcpy.S b/sysdeps/powerpc/powerpc32/power7/mempcpy.S
index 5ad4edb580..4610ec5b56 100644
--- a/sysdeps/powerpc/powerpc32/power7/mempcpy.S
+++ b/sysdeps/powerpc/powerpc32/power7/mempcpy.S
@@ -325,7 +325,7 @@ L(copy_GE_32_unaligned):
beq L(copy_GE_32_unaligned_cont)
- /* SRC is not quadword aligned, get it aligned. */
+ /* DST is not quadword aligned, get it aligned. */
mtcrf 0x01,0
subf 31,0,5
@@ -377,13 +377,21 @@ L(copy_GE_32_unaligned_cont):
mr 11,12
mtcrf 0x01,9
cmplwi cr6,9,1
- lvsl 5,0,12
+#ifdef __LITTLE_ENDIAN__
+ lvsr 5,0,12
+#else
+ lvsl 5,0,12
+#endif
lvx 3,0,12
bf 31,L(setup_unaligned_loop)
/* Copy another 16 bytes to align to 32-bytes due to the loop . */
lvx 4,12,6
- vperm 6,3,4,5
+#ifdef __LITTLE_ENDIAN__
+ vperm 6,4,3,5
+#else
+ vperm 6,3,4,5
+#endif
addi 11,12,16
addi 10,3,16
stvx 6,0,3
@@ -403,11 +411,17 @@ L(unaligned_loop):
vector instructions though. */
lvx 4,11,6 /* vr4 = r11+16. */
- vperm 6,3,4,5 /* Merge the correctly-aligned portions
- of vr3/vr4 into vr6. */
+#ifdef __LITTLE_ENDIAN__
+ vperm 6,4,3,5
+#else
+ vperm 6,3,4,5
+#endif
lvx 3,11,7 /* vr3 = r11+32. */
- vperm 10,4,3,5 /* Merge the correctly-aligned portions
- of vr3/vr4 into vr10. */
+#ifdef __LITTLE_ENDIAN__
+ vperm 10,3,4,5
+#else
+ vperm 10,4,3,5
+#endif
addi 11,11,32
stvx 6,0,10
stvx 10,10,6
diff --git a/sysdeps/powerpc/powerpc64/memcpy.S b/sysdeps/powerpc/powerpc64/memcpy.S
index b8c4cc8b10..5fc7401c99 100644
--- a/sysdeps/powerpc/powerpc64/memcpy.S
+++ b/sysdeps/powerpc/powerpc64/memcpy.S
@@ -212,15 +212,28 @@ EALIGN (memcpy, 5, 0)
blt cr6,5f
srdi 7,6,16
bgt cr6,3f
+#ifdef __LITTLE_ENDIAN__
+ sth 7,0(3)
+#else
sth 6,0(3)
+#endif
b 7f
.align 4
3:
+#ifdef __LITTLE_ENDIAN__
+ rotlwi 6,6,24
+ stb 6,0(3)
+ sth 7,1(3)
+#else
stb 7,0(3)
sth 6,1(3)
+#endif
b 7f
.align 4
5:
+#ifdef __LITTLE_ENDIAN__
+ rotlwi 6,6,8
+#endif
stb 6,0(3)
7:
cmpldi cr1,10,16
@@ -328,7 +341,11 @@ EALIGN (memcpy, 5, 0)
ld 7,8(5)
subfic 9,10,64
beq 2f
+#ifdef __LITTLE_ENDIAN__
+ srd 0,6,10
+#else
sld 0,6,10
+#endif
cmpldi 11,1
mr 6,7
addi 4,4,-8
@@ -336,15 +353,25 @@ EALIGN (memcpy, 5, 0)
b 1f
2: addi 5,5,8
.align 4
+#ifdef __LITTLE_ENDIAN__
+0: srd 0,6,10
+ sld 8,7,9
+#else
0: sld 0,6,10
srd 8,7,9
+#endif
cmpldi 11,2
ld 6,8(5)
or 0,0,8
addi 11,11,-2
std 0,0(4)
+#ifdef __LITTLE_ENDIAN__
+ srd 0,7,10
+1: sld 8,6,9
+#else
sld 0,7,10
1: srd 8,6,9
+#endif
or 0,0,8
beq 8f
ld 7,16(5)
diff --git a/sysdeps/powerpc/powerpc64/power4/memcpy.S b/sysdeps/powerpc/powerpc64/power4/memcpy.S
index 4317c7e786..f9a7260dcb 100644
--- a/sysdeps/powerpc/powerpc64/power4/memcpy.S
+++ b/sysdeps/powerpc/powerpc64/power4/memcpy.S
@@ -214,15 +214,28 @@ EALIGN (memcpy, 5, 0)
blt cr6,5f
srdi 7,6,16
bgt cr6,3f
+#ifdef __LITTLE_ENDIAN__
+ sth 7,0(3)
+#else
sth 6,0(3)
+#endif
b 7f
.align 4
3:
+#ifdef __LITTLE_ENDIAN__
+ rotlwi 6,6,24
+ stb 6,0(3)
+ sth 7,1(3)
+#else
stb 7,0(3)
sth 6,1(3)
+#endif
b 7f
.align 4
5:
+#ifdef __LITTLE_ENDIAN__
+ rotlwi 6,6,8
+#endif
stb 6,0(3)
7:
cmpldi cr1,10,16
@@ -334,13 +347,23 @@ EALIGN (memcpy, 5, 0)
bf 30,1f
/* there are at least two DWs to copy */
+#ifdef __LITTLE_ENDIAN__
+ srd 0,6,10
+ sld 8,7,9
+#else
sld 0,6,10
srd 8,7,9
+#endif
or 0,0,8
ld 6,16(5)
std 0,0(4)
+#ifdef __LITTLE_ENDIAN__
+ srd 0,7,10
+ sld 8,6,9
+#else
sld 0,7,10
srd 8,6,9
+#endif
or 0,0,8
ld 7,24(5)
std 0,8(4)
@@ -349,8 +372,13 @@ EALIGN (memcpy, 5, 0)
blt cr6,8f /* if total DWs = 3, then bypass loop */
bf 31,4f
/* there is a third DW to copy */
+#ifdef __LITTLE_ENDIAN__
+ srd 0,6,10
+ sld 8,7,9
+#else
sld 0,6,10
srd 8,7,9
+#endif
or 0,0,8
std 0,0(4)
mr 6,7
@@ -361,8 +389,13 @@ EALIGN (memcpy, 5, 0)
b 4f
.align 4
1:
+#ifdef __LITTLE_ENDIAN__
+ srd 0,6,10
+ sld 8,7,9
+#else
sld 0,6,10
srd 8,7,9
+#endif
addi 5,5,16
or 0,0,8
bf 31,4f
@@ -373,23 +406,44 @@ EALIGN (memcpy, 5, 0)
addi 4,4,8
.align 4
/* copy 32 bytes at a time */
-4: sld 0,6,10
+4:
+#ifdef __LITTLE_ENDIAN__
+ srd 0,6,10
+ sld 8,7,9
+#else
+ sld 0,6,10
srd 8,7,9
+#endif
or 0,0,8
ld 6,0(5)
std 0,0(4)
+#ifdef __LITTLE_ENDIAN__
+ srd 0,7,10
+ sld 8,6,9
+#else
sld 0,7,10
srd 8,6,9
+#endif
or 0,0,8
ld 7,8(5)
std 0,8(4)
+#ifdef __LITTLE_ENDIAN__
+ srd 0,6,10
+ sld 8,7,9
+#else
sld 0,6,10
srd 8,7,9
+#endif
or 0,0,8
ld 6,16(5)
std 0,16(4)
+#ifdef __LITTLE_ENDIAN__
+ srd 0,7,10
+ sld 8,6,9
+#else
sld 0,7,10
srd 8,6,9
+#endif
or 0,0,8
ld 7,24(5)
std 0,24(4)
@@ -399,8 +453,13 @@ EALIGN (memcpy, 5, 0)
.align 4
8:
/* calculate and store the final DW */
+#ifdef __LITTLE_ENDIAN__
+ srd 0,6,10
+ sld 8,7,9
+#else
sld 0,6,10
srd 8,7,9
+#endif
or 0,0,8
std 0,0(4)
3:
diff --git a/sysdeps/powerpc/powerpc64/power6/memcpy.S b/sysdeps/powerpc/powerpc64/power6/memcpy.S
index d6d242d293..e3f3d8a303 100644
--- a/sysdeps/powerpc/powerpc64/power6/memcpy.S
+++ b/sysdeps/powerpc/powerpc64/power6/memcpy.S
@@ -400,15 +400,28 @@ L(das_tail2):
blt cr6,5f
srdi 7,6,16
bgt cr6,3f
+#ifdef __LITTLE_ENDIAN__
+ sth 7,0(3)
+#else
sth 6,0(3)
+#endif
b 7f
.align 4
3:
+#ifdef __LITTLE_ENDIAN__
+ rotlwi 6,6,24
+