about | summary | refs | log | tree | commit | diff
path: root/sysdeps/aarch64/fpu/erfc_advsimd.c
diff options
context:
space:
mode:
author: Joe Ramsay <Joe.Ramsay@arm.com> 2024-10-28 14:58:35 +0000
committer: Wilco Dijkstra <wilco.dijkstra@arm.com> 2024-10-28 15:01:37 +0000
commit: 1cf29fbc5be23db775d1dfa6b332ded6e6554252 (patch)
tree: c4e79fb806e2400be4574bb1f29034a02ec90bf8 /sysdeps/aarch64/fpu/erfc_advsimd.c
parent: 95129e6b8fabdaa8cd8a4a5cc20be0f4cb0ba59f (diff)
download: glibc-1cf29fbc5be23db775d1dfa6b332ded6e6554252.tar.xz
          glibc-1cf29fbc5be23db775d1dfa6b332ded6e6554252.zip
AArch64: Small optimisation in AdvSIMD erf and erfc
In both routines, reduce register pressure such that GCC 14 emits no spills for erf and fewer spills for erfc. Also use a more efficient comparison for the special case in erf. Benchtests show erf improves by 6.4%, erfc by 1.0%.
Diffstat (limited to 'sysdeps/aarch64/fpu/erfc_advsimd.c')
-rw-r--r-- sysdeps/aarch64/fpu/erfc_advsimd.c 13
1 file changed, 7 insertions, 6 deletions
diff --git a/sysdeps/aarch64/fpu/erfc_advsimd.c b/sysdeps/aarch64/fpu/erfc_advsimd.c
index f1b3bfe830..2f2f755c46 100644
--- a/sysdeps/aarch64/fpu/erfc_advsimd.c
+++ b/sysdeps/aarch64/fpu/erfc_advsimd.c
@@ -24,8 +24,8 @@ static const struct data
{
uint64x2_t offset, table_scale;
float64x2_t max, shift;
- float64x2_t p20, p40, p41, p42;
- float64x2_t p51, p52;
+ float64x2_t p20, p40, p41, p51;
+ double p42, p52;
double qr5[2], qr6[2], qr7[2], qr8[2], qr9[2];
#if WANT_SIMD_EXCEPT
float64x2_t uflow_bound;
@@ -41,9 +41,9 @@ static const struct data
.p20 = V2 (0x1.5555555555555p-2), /* 1/3, used to compute 2/3 and 1/6. */
.p40 = V2 (-0x1.999999999999ap-4), /* 1/10. */
.p41 = V2 (-0x1.999999999999ap-2), /* 2/5. */
- .p42 = V2 (0x1.1111111111111p-3), /* 2/15. */
+ .p42 = 0x1.1111111111111p-3, /* 2/15. */
.p51 = V2 (-0x1.c71c71c71c71cp-3), /* 2/9. */
- .p52 = V2 (0x1.6c16c16c16c17p-5), /* 2/45. */
+ .p52 = 0x1.6c16c16c16c17p-5, /* 2/45. */
/* Qi = (i+1) / i, Ri = -2 * i / ((i+1)*(i+2)), for i = 5, ..., 9. */
.qr5 = { 0x1.3333333333333p0, -0x1.e79e79e79e79ep-3 },
.qr6 = { 0x1.2aaaaaaaaaaabp0, -0x1.b6db6db6db6dbp-3 },
@@ -157,9 +157,10 @@ float64x2_t V_NAME_D1 (erfc) (float64x2_t x)
float64x2_t p1 = r;
float64x2_t p2 = vfmsq_f64 (dat->p20, r2, vaddq_f64 (dat->p20, dat->p20));
float64x2_t p3 = vmulq_f64 (r, vfmaq_f64 (v_f64 (-0.5), r2, dat->p20));
- float64x2_t p4 = vfmaq_f64 (dat->p41, r2, dat->p42);
+ float64x2_t p42_p52 = vld1q_f64 (&dat->p42);
+ float64x2_t p4 = vfmaq_laneq_f64 (dat->p41, r2, p42_p52, 0);
p4 = vfmsq_f64 (dat->p40, r2, p4);
- float64x2_t p5 = vfmaq_f64 (dat->p51, r2, dat->p52);
+ float64x2_t p5 = vfmaq_laneq_f64 (dat->p51, r2, p42_p52, 1);
p5 = vmulq_f64 (r, vfmaq_f64 (vmulq_f64 (v_f64 (0.5), dat->p20), r2, p5));
/* Compute p_i using recurrence relation:
p_{i+2} = (p_i + r * Q_{i+1} * p_{i+1}) * R_{i+1}. */