about | summary | refs | log | tree | commit | diff
path: root/sysdeps/aarch64/memcpy.S
diff options
context:
space:
mode:
author: Wilco Dijkstra <wdijkstr@arm.com> 2022-10-26 14:16:50 +0100
committer: Wilco Dijkstra <wdijkstr@arm.com> 2022-10-26 14:16:50 +0100
commit: e6f3fe362f1aab78b1448d69ecdbd9e3872636d3 (patch)
tree: a4b65f09b750d95e3e593b4114f1212aab09fabe /sysdeps/aarch64/memcpy.S
parent: a8e72913fea0c6e2832c50523c60907ffa3b753b (diff)
download: glibc-e6f3fe362f1aab78b1448d69ecdbd9e3872636d3.tar.xz
glibc-e6f3fe362f1aab78b1448d69ecdbd9e3872636d3.zip
aarch64: Use memcpy_simd as the default memcpy
Since __memcpy_simd is the fastest memcpy on almost all cores, replace the generic memcpy with it. If SVE is available, an SVE memcpy will be used by default (including for Neoverse N2).
Diffstat (limited to 'sysdeps/aarch64/memcpy.S')
-rw-r--r-- sysdeps/aarch64/memcpy.S 192
1 files changed, 81 insertions, 111 deletions
diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
index 98d4e2c0e2..7b396b202f 100644
--- a/sysdeps/aarch64/memcpy.S
+++ b/sysdeps/aarch64/memcpy.S
@@ -1,4 +1,5 @@
-/* Copyright (C) 2012-2022 Free Software Foundation, Inc.
+/* Generic optimized memcpy using SIMD.
+ Copyright (C) 2012-2022 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -20,7 +21,7 @@
/* Assumptions:
*
- * ARMv8-a, AArch64, unaligned accesses.
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
*
*/
@@ -36,21 +37,18 @@
#define B_l x8
#define B_lw w8
#define B_h x9
-#define C_l x10
#define C_lw w10
-#define C_h x11
-#define D_l x12
-#define D_h x13
-#define E_l x14
-#define E_h x15
-#define F_l x16
-#define F_h x17
-#define G_l count
-#define G_h dst
-#define H_l src
-#define H_h srcend
#define tmp1 x14
+#define A_q q0
+#define B_q q1
+#define C_q q2
+#define D_q q3
+#define E_q q4
+#define F_q q5
+#define G_q q6
+#define H_q q7
+
#ifndef MEMMOVE
# define MEMMOVE memmove
#endif
@@ -69,10 +67,9 @@
Large copies use a software pipelined loop processing 64 bytes per
iteration. The source pointer is 16-byte aligned to minimize
unaligned accesses. The loop tail is handled by always copying 64 bytes
- from the end.
-*/
+ from the end. */
-ENTRY_ALIGN (MEMCPY, 6)
+ENTRY (MEMCPY)
PTR_ARG (0)
PTR_ARG (1)
SIZE_ARG (2)
@@ -87,10 +84,10 @@ ENTRY_ALIGN (MEMCPY, 6)
/* Small copies: 0..32 bytes. */
cmp count, 16
b.lo L(copy16)
- ldp A_l, A_h, [src]
- ldp D_l, D_h, [srcend, -16]
- stp A_l, A_h, [dstin]
- stp D_l, D_h, [dstend, -16]
+ ldr A_q, [src]
+ ldr B_q, [srcend, -16]
+ str A_q, [dstin]
+ str B_q, [dstend, -16]
ret
/* Copy 8-15 bytes. */
@@ -102,7 +99,6 @@ L(copy16):
str A_h, [dstend, -8]
ret
- .p2align 3
/* Copy 4-7 bytes. */
L(copy8):
tbz count, 2, L(copy4)
@@ -128,87 +124,69 @@ L(copy0):
.p2align 4
/* Medium copies: 33..128 bytes. */
L(copy32_128):
- ldp A_l, A_h, [src]
- ldp B_l, B_h, [src, 16]
- ldp C_l, C_h, [srcend, -32]
- ldp D_l, D_h, [srcend, -16]
+ ldp A_q, B_q, [src]
+ ldp C_q, D_q, [srcend, -32]
cmp count, 64
b.hi L(copy128)
- stp A_l, A_h, [dstin]
- stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstend, -32]
- stp D_l, D_h, [dstend, -16]
+ stp A_q, B_q, [dstin]
+ stp C_q, D_q, [dstend, -32]
ret
.p2align 4
/* Copy 65..128 bytes. */
L(copy128):
- ldp E_l, E_h, [src, 32]
- ldp F_l, F_h, [src, 48]
+ ldp E_q, F_q, [src, 32]
cmp count, 96
b.ls L(copy96)
- ldp G_l, G_h, [srcend, -64]
- ldp H_l, H_h, [srcend, -48]
- stp G_l, G_h, [dstend, -64]
- stp H_l, H_h, [dstend, -48]
+ ldp G_q, H_q, [srcend, -64]
+ stp G_q, H_q, [dstend, -64]
L(copy96):
- stp A_l, A_h, [dstin]
- stp B_l, B_h, [dstin, 16]
- stp E_l, E_h, [dstin, 32]
- stp F_l, F_h, [dstin, 48]
- stp C_l, C_h, [dstend, -32]
- stp D_l, D_h, [dstend, -16]
+ stp A_q, B_q, [dstin]
+ stp E_q, F_q, [dstin, 32]
+ stp C_q, D_q, [dstend, -32]
ret
- .p2align 4
+ /* Align loop64 below to 16 bytes. */
+ nop
+
/* Copy more than 128 bytes. */
L(copy_long):
- /* Copy 16 bytes and then align dst to 16-byte alignment. */
- ldp D_l, D_h, [src]
- and tmp1, dstin, 15
- bic dst, dstin, 15
- sub src, src, tmp1
+ /* Copy 16 bytes and then align src to 16-byte alignment. */
+ ldr D_q, [src]
+ and tmp1, src, 15
+ bic src, src, 15
+ sub dst, dstin, tmp1
add count, count, tmp1 /* Count is now 16 too large. */
- ldp A_l, A_h, [src, 16]
- stp D_l, D_h, [dstin]
- ldp B_l, B_h, [src, 32]
- ldp C_l, C_h, [src, 48]
- ldp D_l, D_h, [src, 64]!
+ ldp A_q, B_q, [src, 16]
+ str D_q, [dstin]
+ ldp C_q, D_q, [src, 48]
subs count, count, 128 + 16 /* Test and readjust count. */
b.ls L(copy64_from_end)
-
L(loop64):
- stp A_l, A_h, [dst, 16]
- ldp A_l, A_h, [src, 16]
- stp B_l, B_h, [dst, 32]
- ldp B_l, B_h, [src, 32]
- stp C_l, C_h, [dst, 48]
- ldp C_l, C_h, [src, 48]
- stp D_l, D_h, [dst, 64]!
- ldp D_l, D_h, [src, 64]!
+ stp A_q, B_q, [dst, 16]
+ ldp A_q, B_q, [src, 80]
+ stp C_q, D_q, [dst, 48]
+ ldp C_q, D_q, [src, 112]
+ add src, src, 64
+ add dst, dst, 64
subs count, count, 64
b.hi L(loop64)
/* Write the last iteration and copy 64 bytes from the end. */
L(copy64_from_end):
- ldp E_l, E_h, [srcend, -64]
- stp A_l, A_h, [dst, 16]
- ldp A_l, A_h, [srcend, -48]
- stp B_l, B_h, [dst, 32]
- ldp B_l, B_h, [srcend, -32]
- stp C_l, C_h, [dst, 48]
- ldp C_l, C_h, [srcend, -16]
- stp D_l, D_h, [dst, 64]
- stp E_l, E_h, [dstend, -64]
- stp A_l, A_h, [dstend, -48]
- stp B_l, B_h, [dstend, -32]
- stp C_l, C_h, [dstend, -16]
+ ldp E_q, F_q, [srcend, -64]
+ stp A_q, B_q, [dst, 16]
+ ldp A_q, B_q, [srcend, -32]
+ stp C_q, D_q, [dst, 48]
+ stp E_q, F_q, [dstend, -64]
+ stp A_q, B_q, [dstend, -32]
ret
END (MEMCPY)
libc_hidden_builtin_def (MEMCPY)
-ENTRY_ALIGN (MEMMOVE, 4)
+
+ENTRY (MEMMOVE)
PTR_ARG (0)
PTR_ARG (1)
SIZE_ARG (2)
@@ -220,64 +198,56 @@ ENTRY_ALIGN (MEMMOVE, 4)
cmp count, 32
b.hi L(copy32_128)
- /* Small copies: 0..32 bytes. */
+ /* Small moves: 0..32 bytes. */
cmp count, 16
b.lo L(copy16)
- ldp A_l, A_h, [src]
- ldp D_l, D_h, [srcend, -16]
- stp A_l, A_h, [dstin]
- stp D_l, D_h, [dstend, -16]
+ ldr A_q, [src]
+ ldr B_q, [srcend, -16]
+ str A_q, [dstin]
+ str B_q, [dstend, -16]
ret
- .p2align 4
L(move_long):
/* Only use backward copy if there is an overlap. */
sub tmp1, dstin, src
- cbz tmp1, L(copy0)
+ cbz tmp1, L(move0)
cmp tmp1, count
b.hs L(copy_long)
/* Large backwards copy for overlapping copies.
- Copy 16 bytes and then align dst to 16-byte alignment. */
- ldp D_l, D_h, [srcend, -16]
- and tmp1, dstend, 15
- sub srcend, srcend, tmp1
+ Copy 16 bytes and then align srcend to 16-byte alignment. */
+L(copy_long_backwards):
+ ldr D_q, [srcend, -16]
+ and tmp1, srcend, 15
+ bic srcend, srcend, 15
sub count, count, tmp1
- ldp A_l, A_h, [srcend, -16]
- stp D_l, D_h, [dstend, -16]
- ldp B_l, B_h, [srcend, -32]
- ldp C_l, C_h, [srcend, -48]
- ldp D_l, D_h, [srcend, -64]!
+ ldp A_q, B_q, [srcend, -32]
+ str D_q, [dstend, -16]
+ ldp C_q, D_q, [srcend, -64]
sub dstend, dstend, tmp1
subs count, count, 128
b.ls L(copy64_from_start)
L(loop64_backwards):
- stp A_l, A_h, [dstend, -16]
- ldp A_l, A_h, [srcend, -16]
- stp B_l, B_h, [dstend, -32]
- ldp B_l, B_h, [srcend, -32]
- stp C_l, C_h, [dstend, -48]
- ldp C_l, C_h, [srcend, -48]
- stp D_l, D_h, [dstend, -64]!
- ldp D_l, D_h, [srcend, -64]!
+ str B_q, [dstend, -16]
+ str A_q, [dstend, -32]
+ ldp A_q, B_q, [srcend, -96]
+ str D_q, [dstend, -48]
+ str C_q, [dstend, -64]!
+ ldp C_q, D_q, [srcend, -128]
+ sub srcend, srcend, 64
subs count, count, 64
b.hi L(loop64_backwards)
/* Write the last iteration and copy 64 bytes from the start. */
L(copy64_from_start):
- ldp G_l, G_h, [src, 48]
- stp A_l, A_h, [dstend, -16]
- ldp A_l, A_h, [src, 32]
- stp B_l, B_h, [dstend, -32]
- ldp B_l, B_h, [src, 16]
- stp C_l, C_h, [dstend, -48]
- ldp C_l, C_h, [src]
- stp D_l, D_h, [dstend, -64]
- stp G_l, G_h, [dstin, 48]
- stp A_l, A_h, [dstin, 32]
- stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstin]
+ ldp E_q, F_q, [src, 32]
+ stp A_q, B_q, [dstend, -32]
+ ldp A_q, B_q, [src]
+ stp C_q, D_q, [dstend, -64]
+ stp E_q, F_q, [dstin, 32]
+ stp A_q, B_q, [dstin]
+L(move0):
ret
END (MEMMOVE)