/* Optimized memset implementation for PowerPC64/POWER8.
Copyright (C) 2014-2025 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
#include <sysdep.h>
/* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]);
Returns 's'. */
/* Default the name handed to ENTRY_TOCLESS to memset, unless MEMSET has
   already been defined by the time this point is reached.  */
#ifndef MEMSET
# define MEMSET memset
#endif
.machine power8
ENTRY_TOCLESS (MEMSET, 5)
CALL_MCOUNT 3
/* Head of the routine: r3 = s, r4 = c, r5 = n.
   r10 is used as the running store pointer; nothing in this block
   writes r3.  The fill byte is replicated through r4, lengths of 31
   bytes or fewer are sent to L(write_LT_32), and otherwise DST is
   brought up to a 16-byte boundary before reaching L(big_aligned).  */
L(_memset):
/* cr7 = unsigned comparison of n with 31; consumed by the ble below.  */
cmpldi cr7,r5,31
/* r0 = -s.  Its low four bits are the number of bytes from s up to the
   next 16-byte boundary.  */
neg r0,r3
/* r10 = working copy of the destination pointer.  */
mr r10,r3
/* Copy the low byte of r4 into the byte above it, then copy the
   resulting low halfword into the halfword above it.  */
insrdi r4,r4,8,48
insrdi r4,r4,16,32 /* Replicate byte to word. */
/* n <= 31: take the short-length path.  */
ble cr7,L(write_LT_32)
andi. r11,r10,15 /* Check alignment of DST. */
insrdi r4,r4,32,0 /* Replicate word to double word. */
/* cr0 was set by the andi. above: equal means DST is already 16-byte
   aligned, so no alignment stores are needed.  */
beq L(big_aligned)
/* Move the low four bits of r0 into cr7 (CR bits 28-31) so that each bf
   below tests one bit of the alignment count, then reduce r0 to that
   count.  The count is 1..15 here, and n is at least 32, so the
   alignment stores never exceed the requested length.  */
mtocrf 0x01,r0
clrldi r0,r0,60
/* Get DST aligned to 16 bytes. */
/* CR bit 31 = count & 1: store one byte.  */
1: bf 31,2f
stb r4,0(r10)
addi r10,r10,1
/* CR bit 30 = count & 2: store a halfword.  */
2: bf 30,4f
sth r4,0(r10)
addi r10,r10,2
/* CR bit 29 = count & 4: store a word.  */
4: bf 29,8f
stw r4,0(r10)
addi r10,r10,4
/* CR bit 28 = count & 8: store a doubleword.  */
8: bf 28,16f
std r4,0(r10)
addi r10,r10,8
/* r5 = n minus the bytes consumed by the alignment stores.  */
16: subf r5,r0,r5
.align 4
L(big_aligned):
/* For sizes of 255 bytes or more there are two possible paths:
- if the constant is '0' and the size is larger than 255, zero full
cache lines with dcbz
- otherwise use vector instructions. */
cmpldi cr5,r5,255
dcbtst 0,r10
cmpldi cr6,r4,0
crand 27,26,21
bt 27,L(huge_dcbz)
bge cr5,L(huge_vector)
/* Size between 32 and 255 bytes with constant different than 0, use
doubleword store instruction to achieve best throughput. */
srdi r8,r5,5
clrldi r11,r5,59
cmpldi cr6,r11,0
cmpdi r8,0