/* NEON/VFP/ARM version of memcpy optimized for Cortex-A15.
Copyright (C) 2013-2017 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>.
This memcpy routine is optimised for Cortex-A15 cores and takes advantage
of VFP or NEON when built with the appropriate flags.
Assumptions:
ARMv6 (ARMv7-a if using Neon)
ARM state
Unaligned accesses
*/
/* Thumb cannot encode negative immediate offsets in memory operations,
   so mark this file as ARM-only: define NO_THUMB (unless the build has
   already defined it) before the headers below are included.
   NOTE(review): how NO_THUMB is consumed is not visible in this file;
   presumably <sysdep.h> or <arm-features.h> tests it -- confirm there.  */
#ifndef NO_THUMB
#define NO_THUMB
#endif
#include <sysdep.h>
#include <arm-features.h>
/* Use the unified ARM/Thumb assembler syntax for the rest of the file.  */
.syntax unified
/* This implementation requires ARM state. */
.arm
/* Select the target architecture, the FPU and the feature macros for one
   of three build variants, chosen by the macro the build defines:
     MEMCPY_NEON  - ARMv7-A with NEON; defines USE_VFP and USE_NEON.
     MEMCPY_VFP   - ARMv6 with VFPv2; defines USE_VFP only.
     (neither)    - plain ARMv6; neither USE_* macro is defined here.
   FRAME_SIZE is 4 for the NEON variant and 32 for the other two.
   NOTE(review): FRAME_SIZE is presumably the stack frame size in bytes;
   its use is not visible in this section -- confirm.  */
#ifdef MEMCPY_NEON
/* NOTE(review): .fpu comes before .arch here but after it in the VFP
   variant; keep the order as written unless it has been checked against
   the assembler in use.  */
.fpu neon
.arch armv7-a
# define FRAME_SIZE 4
# define USE_VFP
# define USE_NEON
#elif defined (MEMCPY_VFP)
.arch armv6
.fpu vfpv2
# define FRAME_SIZE 32
# define USE_VFP
#else
/* Integer-only variant: no .fpu directive and no USE_* macro.  */
.arch armv6
# define FRAME_SIZE 32
#endif
/* ALIGN (addr, align) expands to "addr:align", i.e. the two arguments
   joined by a colon.
   NOTE(review): presumably used to attach an alignment qualifier to the
   base register of a NEON load/store operand; no use is visible in this
   section -- confirm.  */
#define ALIGN(addr, align) addr:align
/* Size in bytes of one instruction (the file is assembled in ARM state).
   Used in the offset arithmetic of the computed jumps in the dispatch
   macros.  */
#define INSN_SIZE 4
/* Call parameters. */
/* memcpy arguments as they arrive in the first three argument registers:
   destination pointer, source pointer and byte count.  */
#define dstin r0
#define src r1
#define count r2
/* Locals. */
/* tmp1 is the scratch register that the dispatch macros consume and
   clobber.  */
#define tmp1 r3
/* NOTE(review): dst is presumably the working copy of the destination
   pointer, leaving dstin (r0) intact; its use is not visible here --
   confirm.  */
#define dst ip
/* NOTE(review): r8 is callee-saved under the AAPCS, so the routine is
   expected to save and restore it around any use; that code is not
   visible in this section -- confirm.  */
#define tmp2 r8
/* These two macros both work by repeated invocation of the macro
dispatch_step (not defined here). That macro performs one "step",
doing one load instruction and one store instruction to copy one
"unit". On entry, TMP1 contains the number of bytes to be copied,
a multiple of the unit size. The macro clobbers TMP1 in the
process of doing a computed jump to the tail containing the
appropriate number of steps.
In dispatch_7_dword, dispatch_step is invoked seven times, with an
argument that is 7 for the first and 1 for the last. Units are
double-words (8 bytes). TMP1 is at most 56.
In dispatch_15_word, dispatch_step is invoked fifteen times,
with an argument that is 15 for the first and 1 for the last.
Units are words (4 bytes). TMP1 is at most 60. */
#ifndef ARM_ALWAYS_BX
# if ARM_BX_ALIGN_LOG2 != 2
# error case not handled
# endif
.macro dispatch_7_dword
rsb tmp1, tmp1, #((7 * 8) - PC_OFS + INSN_SIZE)
add pc, pc, tmp1
dispatch_step 7
dispatch_step 6
dispatch_step 5
dispatch_step