| author | Luna Lamb <luna.lamb@arm.com> | 2025-02-13 17:54:46 +0000 |
|---|---|---|
| committer | Wilco Dijkstra <wilco.dijkstra@arm.com> | 2025-02-13 18:16:54 +0000 |
| commit | c0ff447edf19bd4630fe79adf5e8b896405b059f (patch) | |
| tree | 0b8affa5c980cf84eb05e8d7266653207f14ee49 /sysdeps/aarch64/fpu/exp2_sve.c | |
| parent | 8f0e7fe61e0a2ad5ed777933703ce09053810ec4 (diff) | |
| download | glibc-c0ff447edf19bd4630fe79adf5e8b896405b059f.tar.xz glibc-c0ff447edf19bd4630fe79adf5e8b896405b059f.zip | |
AArch64: Improve codegen in SVE exp and users, and update expf_inline
Use unpredicated muls, and improve memory access.
7%, 3% and 1% improvement in the throughput microbenchmark on Neoverse V1
for exp, exp2 and cosh respectively.
Reviewed-by: Wilco Dijkstra <Wilco.Dijkstra@arm.com>
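The "unpredicated muls" are the `svmul_x` calls in the diff below, where the governing predicate `pg` is replaced with `svptrue_b64 ()`. Because the don't-care (`_x`) intrinsics leave inactive lanes undefined, an all-true predicate lets the compiler select the unpredicated `FMUL Zd.D, Zn.D, Zm.D` encoding, whereas a general `pg` typically lowers to the destructive predicated form, which can cost an extra `MOVPRFX` when the destination differs from the sources. A minimal sketch of the two variants (illustrative function names, assuming `arm_sve.h` and an SVE-enabled compiler):

```c
#include <arm_sve.h>

/* All-true predicate: the compiler can pick the unpredicated
   FMUL Zd.D, Zn.D, Zm.D encoding.  */
svfloat64_t
square_unpredicated (svfloat64_t s)
{
  return svmul_x (svptrue_b64 (), s, s);
}

/* General predicate: typically lowered to the predicated
   FMUL Zdn.D, Pg/M, Zdn.D, Zm.D, which may need a MOVPRFX
   to set up the destination register first.  */
svfloat64_t
square_predicated (svbool_t pg, svfloat64_t s)
{
  return svmul_x (pg, s, s);
}
```

The results agree on all active lanes either way; passing `svptrue_b64 ()` simply makes the cheaper encoding explicit.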
Diffstat (limited to 'sysdeps/aarch64/fpu/exp2_sve.c')
| -rw-r--r-- | sysdeps/aarch64/fpu/exp2_sve.c | 26 |
1 file changed, 14 insertions(+), 12 deletions(-)
```diff
diff --git a/sysdeps/aarch64/fpu/exp2_sve.c b/sysdeps/aarch64/fpu/exp2_sve.c
index a37c33092a..6db85266ca 100644
--- a/sysdeps/aarch64/fpu/exp2_sve.c
+++ b/sysdeps/aarch64/fpu/exp2_sve.c
@@ -18,7 +18,6 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include "sv_math.h"
-#include "poly_sve_f64.h"
 
 #define N (1 << V_EXP_TABLE_BITS)
 
@@ -27,15 +26,15 @@
 
 static const struct data
 {
-  double poly[4];
+  double c0, c2;
+  double c1, c3;
   double shift, big_bound, uoflow_bound;
 } data = {
   /* Coefficients are computed using Remez algorithm with
      minimisation of the absolute error.  */
-  .poly = { 0x1.62e42fefa3686p-1, 0x1.ebfbdff82c241p-3, 0x1.c6b09b16de99ap-5,
-            0x1.3b2abf5571ad8p-7 },
-  .shift = 0x1.8p52 / N,
-  .uoflow_bound = UOFlowBound,
+  .c0 = 0x1.62e42fefa3686p-1, .c1 = 0x1.ebfbdff82c241p-3,
+  .c2 = 0x1.c6b09b16de99ap-5, .c3 = 0x1.3b2abf5571ad8p-7,
+  .shift = 0x1.8p52 / N, .uoflow_bound = UOFlowBound,
   .big_bound = BigBound,
 };
 
@@ -67,9 +66,9 @@ special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n,
   /* |n| > 1280 => 2^(n) overflows.  */
   svbool_t p_cmp = svacgt (pg, n, d->uoflow_bound);
 
-  svfloat64_t r1 = svmul_x (pg, s1, s1);
+  svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1);
   svfloat64_t r2 = svmla_x (pg, s2, s2, y);
-  svfloat64_t r0 = svmul_x (pg, r2, s1);
+  svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1);
 
   return svsel (p_cmp, r1, r0);
 }
@@ -99,11 +98,14 @@
   svuint64_t top = svlsl_x (pg, ki, 52 - V_EXP_TABLE_BITS);
   svfloat64_t scale = svreinterpret_f64 (svadd_x (pg, sbits, top));
 
+  svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1);
   /* Approximate exp2(r) using polynomial.  */
-  svfloat64_t r2 = svmul_x (pg, r, r);
-  svfloat64_t p = sv_pairwise_poly_3_f64_x (pg, r, r2, d->poly);
-  svfloat64_t y = svmul_x (pg, r, p);
-
+  /* y = exp2(r) - 1 ~= C0 r + C1 r^2 + C2 r^3 + C3 r^4.  */
+  svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+  svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), r, c13, 0);
+  svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c13, 1);
+  svfloat64_t p = svmla_x (pg, p01, p23, r2);
+  svfloat64_t y = svmul_x (svptrue_b64 (), r, p);
   /* Assemble exp2(x) = exp2(r) * scale.  */
   if (__glibc_unlikely (svptest_any (pg, special)))
     return special_case (pg, scale, y, kd, d);
```
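The "improve memory access" part is the coefficient layout: splitting the old `poly[4]` array into `c0, c2` and `c1, c3` keeps the odd-index coefficients adjacent, so a single quadword load (`svld1rq` from `&d->c1`) broadcasts the pair `{ c1, c3 }` to every 128-bit segment of the vector, and the unpredicated `svmla_lane` then selects `c1` (lane 0) or `c3` (lane 1) with no further loads or duplicates. A standalone sketch of the pairwise Horner scheme from the diff (not the glibc source; `eval_exp2m1_poly` and `odd_coeffs` are illustrative names, and `svdup_f64` stands in for glibc's `sv_f64` helper):

```c
#include <arm_sve.h>

/* Odd-index coefficients kept adjacent so one 128-bit load
   fetches both.  Values are the exp2 coefficients from the diff.  */
static const double odd_coeffs[2] = { 0x1.ebfbdff82c241p-3,   /* c1 */
                                      0x1.3b2abf5571ad8p-7 }; /* c3 */

/* y = exp2(r) - 1 ~= c0 r + c1 r^2 + c2 r^3 + c3 r^4,
   evaluated pairwise as p = (c0 + c1 r) + (c2 + c3 r) r^2, y = r p.  */
static inline svfloat64_t
eval_exp2m1_poly (svbool_t pg, svfloat64_t r, double c0, double c2)
{
  /* Broadcast { c1, c3 } into every 128-bit segment.  */
  svfloat64_t c13 = svld1rq (svptrue_b64 (), odd_coeffs);
  svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
  svfloat64_t p01 = svmla_lane (svdup_f64 (c0), r, c13, 0); /* c0 + c1*r */
  svfloat64_t p23 = svmla_lane (svdup_f64 (c2), r, c13, 1); /* c2 + c3*r */
  svfloat64_t p = svmla_x (pg, p01, p23, r2);               /* p01 + p23*r^2 */
  return svmul_x (svptrue_b64 (), r, p);                    /* y = r * p */
}
```

`svmla_lane` takes no predicate operand, so the two inner multiply-adds are unpredicated by construction; only the combining `svmla_x` still carries `pg`, matching the patched code above.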
