aarch64: Use memcpy_simd as the default memcpy

Since __memcpy_simd is the fastest memcpy on almost all cores, replace the generic memcpy with it. If SVE is available, a SVE memcpy will be used by default (including for Neoverse N2).
author: Wilco Dijkstra <wdijkstr@arm.com> 2022-10-26 14:16:50 +0100
committer: Wilco Dijkstra <wdijkstr@arm.com> 2022-10-26 14:16:50 +0100
commit: e6f3fe362f1aab78b1448d69ecdbd9e3872636d3 (patch)
tree: a4b65f09b750d95e3e593b4114f1212aab09fabe /sysdeps/aarch64/memcpy.S
parent: a8e72913fea0c6e2832c50523c60907ffa3b753b (diff)
download: glibc-e6f3fe362f1aab78b1448d69ecdbd9e3872636d3.tar.xz
glibc-e6f3fe362f1aab78b1448d69ecdbd9e3872636d3.zip
1 files changed, 81 insertions, 111 deletions
diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
index 98d4e2c0e2..7b396b202f 100644
--- a/sysdeps/aarch64/memcpy.S
+++ b/sysdeps/aarch64/memcpy.S
@@ -1,4 +1,5 @@
-/* Copyright (C) 2012-2022 Free Software Foundation, Inc.
+/* Generic optimized memcpy using SIMD.
+   Copyright (C) 2012-2022 Free Software Foundation, Inc.
 
    This file is part of the GNU C Library.
 
@@ -20,7 +21,7 @@
 
 /* Assumptions:
  *
- * ARMv8-a, AArch64, unaligned accesses.
+ * ARMv8-a, AArch64, Advanced SIMD, unaligned accesses.
  *
  */
 
@@ -36,21 +37,18 @@
 #define B_l	x8
 #define B_lw	w8
 #define B_h	x9
-#define C_l	x10
 #define C_lw	w10
-#define C_h	x11
-#define D_l	x12
-#define D_h	x13
-#define E_l	x14
-#define E_h	x15
-#define F_l	x16
-#define F_h	x17
-#define G_l	count
-#define G_h	dst
-#define H_l	src
-#define H_h	srcend
 #define tmp1	x14
 
+#define A_q	q0
+#define B_q	q1
+#define C_q	q2
+#define D_q	q3
+#define E_q	q4
+#define F_q	q5
+#define G_q	q6
+#define H_q	q7
+
 #ifndef MEMMOVE
 # define MEMMOVE memmove
 #endif
@@ -69,10 +67,9 @@
    Large copies use a software pipelined loop processing 64 bytes per
    iteration.  The destination pointer is 16-byte aligned to minimize
    unaligned accesses.  The loop tail is handled by always copying 64 bytes
-   from the end.
-*/
+   from the end.  */
 
-ENTRY_ALIGN (MEMCPY, 6)
+ENTRY (MEMCPY)
 	PTR_ARG (0)
 	PTR_ARG (1)
 	SIZE_ARG (2)
@@ -87,10 +84,10 @@ ENTRY_ALIGN (MEMCPY, 6)
 	/* Small copies: 0..32 bytes.  */
 	cmp	count, 16
 	b.lo	L(copy16)
-	ldp	A_l, A_h, [src]
-	ldp	D_l, D_h, [srcend, -16]
-	stp	A_l, A_h, [dstin]
-	stp	D_l, D_h, [dstend, -16]
+	ldr	A_q, [src]
+	ldr	B_q, [srcend, -16]
+	str	A_q, [dstin]
+	str	B_q, [dstend, -16]
 	ret
 
 	/* Copy 8-15 bytes.  */
@@ -102,7 +99,6 @@ L(copy16):
 	str	A_h, [dstend, -8]
 	ret
 
-	.p2align 3
 	/* Copy 4-7 bytes.  */
 L(copy8):
 	tbz	count, 2, L(copy4)
@@ -128,87 +124,69 @@ L(copy0):
 	.p2align 4
 	/* Medium copies: 33..128 bytes.  */
 L(copy32_128):
-	ldp	A_l, A_h, [src]
-	ldp	B_l, B_h, [src, 16]
-	ldp	C_l, C_h, [srcend, -32]
-	ldp	D_l, D_h, [srcend, -16]
+	ldp	A_q, B_q, [src]
+	ldp	C_q, D_q, [srcend, -32]
 	cmp	count, 64
 	b.hi	L(copy128)
-	stp	A_l, A_h, [dstin]
-	stp	B_l, B_h, [dstin, 16]
-	stp	C_l, C_h, [dstend, -32]
-	stp	D_l, D_h, [dstend, -16]
+	stp	A_q, B_q, [dstin]
+	stp	C_q, D_q, [dstend, -32]
 	ret
 
 	.p2align 4
 	/* Copy 65..128 bytes.  */
 L(copy128):
-	ldp	E_l, E_h, [src, 32]
-	ldp	F_l, F_h, [src, 48]
+	ldp	E_q, F_q, [src, 32]
 	cmp	count, 96
 	b.ls	L(copy96)
-	ldp	G_l, G_h, [srcend, -64]
-	ldp	H_l, H_h, [srcend, -48]
-	stp	G_l, G_h, [dstend, -64]
-	stp	H_l, H_h, [dstend, -48]
+	ldp	G_q, H_q, [srcend, -64]
+	stp	G_q, H_q, [dstend, -64]
 L(copy96):
-	stp	A_l, A_h, [dstin]
-	stp	B_l, B_h, [dstin, 16]
-	stp	E_l, E_h, [dstin, 32]
-	stp	F_l, F_h, [dstin, 48]
-	stp	C_l, C_h, [dstend, -32]
-	stp	D_l, D_h, [dstend, -16]
+	stp	A_q, B_q, [dstin]
+	stp	E_q, F_q, [dstin, 32]
+	stp	C_q, D_q, [dstend, -32]
 	ret
 
-	.p2align 4
+	/* Align loop64 below to 16 bytes.  */
+	nop
+
 	/* Copy more than 128 bytes.  */
 L(copy_long):
-	/* Copy 16 bytes and then align dst to 16-byte alignment.  */
-	ldp	D_l, D_h, [src]
-	and	tmp1, dstin, 15
-	bic	dst, dstin, 15
-	sub	src, src, tmp1
+	/* Copy 16 bytes and then align src to 16-byte alignment.  */
+	ldr	D_q, [src]
+	and	tmp1, src, 15
+	bic	src, src, 15
+	sub	dst, dstin, tmp1
 	add	count, count, tmp1	/* Count is now 16 too large.  */
-	ldp	A_l, A_h, [src, 16]
-	stp	D_l, D_h, [dstin]
-	ldp	B_l, B_h, [src, 32]
-	ldp	C_l, C_h, [src, 48]
-	ldp	D_l, D_h, [src, 64]!
+	ldp	A_q, B_q, [src, 16]
+	str	D_q, [dstin]
+	ldp	C_q, D_q, [src, 48]
 	subs	count, count, 128 + 16	/* Test and readjust count.  */
 	b.ls	L(copy64_from_end)
-
 L(loop64):
-	stp	A_l, A_h, [dst, 16]
-	ldp	A_l, A_h, [src, 16]
-	stp	B_l, B_h, [dst, 32]
-	ldp	B_l, B_h, [src, 32]
-	stp	C_l, C_h, [dst, 48]
-	ldp	C_l, C_h, [src, 48]
-	stp	D_l, D_h, [dst, 64]!
-	ldp	D_l, D_h, [src, 64]!
+	stp	A_q, B_q, [dst, 16]
+	ldp	A_q, B_q, [src, 80]
+	stp	C_q, D_q, [dst, 48]
+	ldp	C_q, D_q, [src, 112]
+	add	src, src, 64
+	add	dst, dst, 64
 	subs	count, count, 64
 	b.hi	L(loop64)
 
 	/* Write the last iteration and copy 64 bytes from the end.  */
 L(copy64_from_end):
-	ldp	E_l, E_h, [srcend, -64]
-	stp	A_l, A_h, [dst, 16]
-	ldp	A_l, A_h, [srcend, -48]
-	stp	B_l, B_h, [dst, 32]
-	ldp	B_l, B_h, [srcend, -32]
-	stp	C_l, C_h, [dst, 48]
-	ldp	C_l, C_h, [srcend, -16]
-	stp	D_l, D_h, [dst, 64]
-	stp	E_l, E_h, [dstend, -64]
-	stp	A_l, A_h, [dstend, -48]
-	stp	B_l, B_h, [dstend, -32]
-	stp	C_l, C_h, [dstend, -16]
+	ldp	E_q, F_q, [srcend, -64]
+	stp	A_q, B_q, [dst, 16]
+	ldp	A_q, B_q, [srcend, -32]
+	stp	C_q, D_q, [dst, 48]
+	stp	E_q, F_q, [dstend, -64]
+	stp	A_q, B_q, [dstend, -32]
 	ret
 
 END (MEMCPY)
 libc_hidden_builtin_def (MEMCPY)
 
-ENTRY_ALIGN (MEMMOVE, 4)
+
+ENTRY (MEMMOVE)
 	PTR_ARG (0)
 	PTR_ARG (1)
 	SIZE_ARG (2)
@@ -220,64 +198,56 @@ ENTRY_ALIGN (MEMMOVE, 4)
 	cmp	count, 32
 	b.hi	L(copy32_128)
 
-	/* Small copies: 0..32 bytes.  */
+	/* Small moves: 0..32 bytes.  */
 	cmp	count, 16
 	b.lo	L(copy16)
-	ldp	A_l, A_h, [src]
-	ldp	D_l, D_h, [srcend, -16]
-	stp	A_l, A_h, [dstin]
-	stp	D_l, D_h, [dstend, -16]
+	ldr	A_q, [src]
+	ldr	B_q, [srcend, -16]
+	str	A_q, [dstin]
+	str	B_q, [dstend, -16]
 	ret
 
-	.p2align 4
 L(move_long):
 	/* Only use backward copy if there is an overlap.  */
 	sub	tmp1, dstin, src
-	cbz	tmp1, L(copy0)
+	cbz	tmp1, L(move0)
 	cmp	tmp1, count
 	b.hs	L(copy_long)
 
 	/* Large backwards copy for overlapping copies.
-	   Copy 16 bytes and then align dst to 16-byte alignment.  */
-	ldp	D_l, D_h, [srcend, -16]
-	and	tmp1, dstend, 15
-	sub	srcend, srcend, tmp1
+	   Copy 16 bytes and then align srcend to 16-byte alignment.  */
+L(copy_long_backwards):
+	ldr	D_q, [srcend, -16]
+	and	tmp1, srcend, 15
+	bic	srcend, srcend, 15
 	sub	count, count, tmp1
-	ldp	A_l, A_h, [srcend, -16]
-	stp	D_l, D_h, [dstend, -16]
-	ldp	B_l, B_h, [srcend, -32]
-	ldp	C_l, C_h, [srcend, -48]
-	ldp	D_l, D_h, [srcend, -64]!
+	ldp	A_q, B_q, [srcend, -32]
+	str	D_q, [dstend, -16]
+	ldp	C_q, D_q, [srcend, -64]
 	sub	dstend, dstend, tmp1
 	subs	count, count, 128
 	b.ls	L(copy64_from_start)
 
 L(loop64_backwards):
-	stp	A_l, A_h, [dstend, -16]
-	ldp	A_l, A_h, [srcend, -16]
-	stp	B_l, B_h, [dstend, -32]
-	ldp	B_l, B_h, [srcend, -32]
-	stp	C_l, C_h, [dstend, -48]
-	ldp	C_l, C_h, [srcend, -48]
-	stp	D_l, D_h, [dstend, -64]!
-	ldp	D_l, D_h, [srcend, -64]!
+	str	B_q, [dstend, -16]
+	str	A_q, [dstend, -32]
+	ldp	A_q, B_q, [srcend, -96]
+	str	D_q, [dstend, -48]
+	str	C_q, [dstend, -64]!
+	ldp	C_q, D_q, [srcend, -128]
+	sub	srcend, srcend, 64
 	subs	count, count, 64
 	b.hi	L(loop64_backwards)
 
 	/* Write the last iteration and copy 64 bytes from the start.  */
 L(copy64_from_start):
-	ldp	G_l, G_h, [src, 48]
-	stp	A_l, A_h, [dstend, -16]
-	ldp	A_l, A_h, [src, 32]
-	stp	B_l, B_h, [dstend, -32]
-	ldp	B_l, B_h, [src, 16]
-	stp	C_l, C_h, [dstend, -48]
-	ldp	C_l, C_h, [src]
-	stp	D_l, D_h, [dstend, -64]
-	stp	G_l, G_h, [dstin, 48]
-	stp	A_l, A_h, [dstin, 32]
-	stp	B_l, B_h, [dstin, 16]
-	stp	C_l, C_h, [dstin]
+	ldp	E_q, F_q, [src, 32]
+	stp	A_q, B_q, [dstend, -32]
+	ldp	A_q, B_q, [src]
+	stp	C_q, D_q, [dstend, -64]
+	stp	E_q, F_q, [dstin, 32]
+	stp	A_q, B_q, [dstin]
+L(move0):
 	ret
 
 END (MEMMOVE)
author	Wilco Dijkstra <wdijkstr@arm.com>	2022-10-26 14:16:50 +0100
committer	Wilco Dijkstra <wdijkstr@arm.com>	2022-10-26 14:16:50 +0100
commit	e6f3fe362f1aab78b1448d69ecdbd9e3872636d3 (patch)
tree	a4b65f09b750d95e3e593b4114f1212aab09fabe /sysdeps/aarch64/memcpy.S
parent	a8e72913fea0c6e2832c50523c60907ffa3b753b (diff)
download	glibc-e6f3fe362f1aab78b1448d69ecdbd9e3872636d3.tar.xz glibc-e6f3fe362f1aab78b1448d69ecdbd9e3872636d3.zip