diff options
| author | Joe Ramsay <Joe.Ramsay@arm.com> | 2023-10-05 17:10:48 +0100 |
|---|---|---|
| committer | Szabolcs Nagy <szabolcs.nagy@arm.com> | 2023-10-23 15:00:44 +0100 |
| commit | f554334c05a95c6b4df532ddc88cd3e72dc7d04c (patch) | |
| tree | 1ee426aaf6fbc68b5e7cb27286c8396738df9bc4 | |
| parent | 2aa0974d2573441bffd596b07bff8698b1f2f18c (diff) | |
| download | glibc-f554334c05a95c6b4df532ddc88cd3e72dc7d04c.tar.xz glibc-f554334c05a95c6b4df532ddc88cd3e72dc7d04c.zip | |
aarch64: Add vector implementations of tan routines
This includes some utility headers for evaluating polynomials using
various schemes.
21 files changed, 1274 insertions, 27 deletions
diff --git a/math/auto-libm-test-in b/math/auto-libm-test-in index 2672eb1f6a..70892503d6 100644 --- a/math/auto-libm-test-in +++ b/math/auto-libm-test-in @@ -7655,7 +7655,7 @@ sqrt min sqrt min_subnorm tan 0 -tan -0 +tan -0 no-mathvec tan pi/4 tan pi/2 tan -pi/2 diff --git a/math/auto-libm-test-out-tan b/math/auto-libm-test-out-tan index 7d00d03e1d..f46fdc7ec6 100644 --- a/math/auto-libm-test-out-tan +++ b/math/auto-libm-test-out-tan @@ -23,31 +23,31 @@ tan 0 = tan tonearest ibm128 0x0p+0 : 0x0p+0 : inexact-ok = tan towardzero ibm128 0x0p+0 : 0x0p+0 : inexact-ok = tan upward ibm128 0x0p+0 : 0x0p+0 : inexact-ok -tan -0 -= tan downward binary32 -0x0p+0 : -0x0p+0 : inexact-ok -= tan tonearest binary32 -0x0p+0 : -0x0p+0 : inexact-ok -= tan towardzero binary32 -0x0p+0 : -0x0p+0 : inexact-ok -= tan upward binary32 -0x0p+0 : -0x0p+0 : inexact-ok -= tan downward binary64 -0x0p+0 : -0x0p+0 : inexact-ok -= tan tonearest binary64 -0x0p+0 : -0x0p+0 : inexact-ok -= tan towardzero binary64 -0x0p+0 : -0x0p+0 : inexact-ok -= tan upward binary64 -0x0p+0 : -0x0p+0 : inexact-ok -= tan downward intel96 -0x0p+0 : -0x0p+0 : inexact-ok -= tan tonearest intel96 -0x0p+0 : -0x0p+0 : inexact-ok -= tan towardzero intel96 -0x0p+0 : -0x0p+0 : inexact-ok -= tan upward intel96 -0x0p+0 : -0x0p+0 : inexact-ok -= tan downward m68k96 -0x0p+0 : -0x0p+0 : inexact-ok -= tan tonearest m68k96 -0x0p+0 : -0x0p+0 : inexact-ok -= tan towardzero m68k96 -0x0p+0 : -0x0p+0 : inexact-ok -= tan upward m68k96 -0x0p+0 : -0x0p+0 : inexact-ok -= tan downward binary128 -0x0p+0 : -0x0p+0 : inexact-ok -= tan tonearest binary128 -0x0p+0 : -0x0p+0 : inexact-ok -= tan towardzero binary128 -0x0p+0 : -0x0p+0 : inexact-ok -= tan upward binary128 -0x0p+0 : -0x0p+0 : inexact-ok -= tan downward ibm128 -0x0p+0 : -0x0p+0 : inexact-ok -= tan tonearest ibm128 -0x0p+0 : -0x0p+0 : inexact-ok -= tan towardzero ibm128 -0x0p+0 : -0x0p+0 : inexact-ok -= tan upward ibm128 -0x0p+0 : -0x0p+0 : inexact-ok +tan -0 no-mathvec += tan downward binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok += tan tonearest binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok += tan towardzero binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok += tan upward binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok += tan downward binary64 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok += tan tonearest binary64 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok += tan towardzero binary64 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok += tan upward binary64 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok += tan downward intel96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok += tan tonearest intel96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok += tan towardzero intel96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok += tan upward intel96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok += tan downward m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok += tan tonearest m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok += tan towardzero m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok += tan upward m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok += tan downward binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok += tan tonearest binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok += tan towardzero binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok += tan upward binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok += tan downward ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok += tan tonearest ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok += tan towardzero ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok += tan upward ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok tan pi/4 = tan downward binary32 0xc.90fdbp-4 : 0x1p+0 : inexact-ok = tan tonearest binary32 0xc.90fdbp-4 : 0x1p+0 : inexact-ok diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile index 04aa2e37ca..a1bbc9bcaa 100644 --- a/sysdeps/aarch64/fpu/Makefile +++ b/sysdeps/aarch64/fpu/Makefile @@ -1,7 +1,8 @@ libmvec-supported-funcs = cos \ exp \ log \ - sin + sin \ + tan float-advsimd-funcs = $(libmvec-supported-funcs) double-advsimd-funcs = $(libmvec-supported-funcs) diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions index c85c0f3efb..f0ca0940a9 100644 --- a/sysdeps/aarch64/fpu/Versions +++ b/sysdeps/aarch64/fpu/Versions @@ -17,4 +17,10 @@ libmvec { _ZGVsMxv_sin; _ZGVsMxv_sinf; } + GLIBC_2.39 { + _ZGVnN4v_tanf; + _ZGVnN2v_tan; + _ZGVsMxv_tanf; + _ZGVsMxv_tan; + } } diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h index 7c200599c1..6193213147 100644 --- a/sysdeps/aarch64/fpu/bits/math-vector.h +++ b/sysdeps/aarch64/fpu/bits/math-vector.h @@ -53,11 +53,13 @@ __vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t); __vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t); +__vpcs __f32x4_t _ZGVnN4v_tanf (__f32x4_t); __vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t); __vpcs __f64x2_t _ZGVnN2v_sin (__f64x2_t); +__vpcs __f64x2_t _ZGVnN2v_tan (__f64x2_t); # undef __ADVSIMD_VEC_MATH_SUPPORTED #endif /* __ADVSIMD_VEC_MATH_SUPPORTED */ @@ -68,11 +70,13 @@ __sv_f32_t _ZGVsMxv_cosf (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_expf (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_logf (__sv_f32_t, __sv_bool_t); __sv_f32_t _ZGVsMxv_sinf (__sv_f32_t, __sv_bool_t); +__sv_f32_t _ZGVsMxv_tanf (__sv_f32_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_cos (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_exp (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_log (__sv_f64_t, __sv_bool_t); __sv_f64_t _ZGVsMxv_sin (__sv_f64_t, __sv_bool_t); +__sv_f64_t _ZGVsMxv_tan (__sv_f64_t, __sv_bool_t); # undef __SVE_VEC_MATH_SUPPORTED #endif /* __SVE_VEC_MATH_SUPPORTED */ diff --git a/sysdeps/aarch64/fpu/poly_advsimd_f32.h b/sysdeps/aarch64/fpu/poly_advsimd_f32.h new file mode 100644 index 0000000000..9e2ad9ad94 --- /dev/null +++ b/sysdeps/aarch64/fpu/poly_advsimd_f32.h @@ -0,0 +1,36 @@ +/* Helpers for evaluating polynomials on single-precision AdvSIMD input, using + various schemes. + + Copyright (C) 2023 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef AARCH64_FPU_POLY_ADVSIMD_F32_H +#define AARCH64_FPU_POLY_ADVSIMD_F32_H + +#include <arm_neon.h> + +/* Wrap AdvSIMD f32 helpers: evaluation of some scheme/order has form: + v_[scheme]_[order]_f32. */ +#define VTYPE float32x4_t +#define FMA(x, y, z) vfmaq_f32 (z, x, y) +#define VWRAP(f) v_##f##_f32 +#include "poly_generic.h" +#undef VWRAP +#undef FMA +#undef VTYPE + +#endif diff --git a/sysdeps/aarch64/fpu/poly_advsimd_f64.h b/sysdeps/aarch64/fpu/poly_advsimd_f64.h new file mode 100644 index 0000000000..955cfc08ce --- /dev/null +++ b/sysdeps/aarch64/fpu/poly_advsimd_f64.h @@ -0,0 +1,36 @@ +/* Helpers for evaluating polynomials on double-precision AdvSIMD input, using + various schemes. + + Copyright (C) 2023 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef AARCH64_FPU_POLY_ADVSIMD_F64_H +#define AARCH64_FPU_POLY_ADVSIMD_F64_H + +#include <arm_neon.h> + +/* Wrap AdvSIMD f64 helpers: evaluation of some scheme/order has form: + v_[scheme]_[order]_f64. */ +#define VTYPE float64x2_t +#define FMA(x, y, z) vfmaq_f64 (z, x, y) +#define VWRAP(f) v_##f##_f64 +#include "poly_generic.h" +#undef VWRAP +#undef FMA +#undef VTYPE + +#endif diff --git a/sysdeps/aarch64/fpu/poly_generic.h b/sysdeps/aarch64/fpu/poly_generic.h new file mode 100644 index 0000000000..84f042182b --- /dev/null +++ b/sysdeps/aarch64/fpu/poly_generic.h @@ -0,0 +1,285 @@ +/* Generic helpers for evaluating polynomials with various schemes. + + Copyright (C) 2023 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + + +#ifndef VTYPE +# error Cannot use poly_generic without defining VTYPE +#endif +#ifndef VWRAP +# error Cannot use poly_generic without defining VWRAP +#endif +#ifndef FMA +# error Cannot use poly_generic without defining FMA +#endif + +static inline VTYPE VWRAP (pairwise_poly_3) (VTYPE x, VTYPE x2, + const VTYPE *poly) +{ + /* At order 3, Estrin and Pairwise Horner are identical. */ + VTYPE p01 = FMA (poly[1], x, poly[0]); + VTYPE p23 = FMA (poly[3], x, poly[2]); + return FMA (p23, x2, p01); +} + +static inline VTYPE VWRAP (estrin_4) (VTYPE x, VTYPE x2, VTYPE x4, + const VTYPE *poly) +{ + VTYPE p03 = VWRAP (pairwise_poly_3) (x, x2, poly); + return FMA (poly[4], x4, p03); +} +static inline VTYPE VWRAP (estrin_5) (VTYPE x, VTYPE x2, VTYPE x4, + const VTYPE *poly) +{ + VTYPE p03 = VWRAP (pairwise_poly_3) (x, x2, poly); + VTYPE p45 = FMA (poly[5], x, poly[4]); + return FMA (p45, x4, p03); +} +static inline VTYPE VWRAP (estrin_6) (VTYPE x, VTYPE x2, VTYPE x4, + const VTYPE *poly) +{ + VTYPE p03 = VWRAP (pairwise_poly_3) (x, x2, poly); + VTYPE p45 = FMA (poly[5], x, poly[4]); + VTYPE p46 = FMA (poly[6], x2, p45); + return FMA (p46, x4, p03); +} +static inline VTYPE VWRAP (estrin_7) (VTYPE x, VTYPE x2, VTYPE x4, + const VTYPE *poly) +{ + VTYPE p03 = VWRAP (pairwise_poly_3) (x, x2, poly); + VTYPE p47 = VWRAP (pairwise_poly_3) (x, x2, poly + 4); + return FMA (p47, x4, p03); +} +static inline VTYPE VWRAP (estrin_8) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, + const VTYPE *poly) +{ + return FMA (poly[8], x8, VWRAP (estrin_7) (x, x2, x4, poly)); +} +static inline VTYPE VWRAP (estrin_9) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, + const VTYPE *poly) +{ + VTYPE p89 = FMA (poly[9], x, poly[8]); + return FMA (p89, x8, VWRAP (estrin_7) (x, x2, x4, poly)); +} +static inline VTYPE VWRAP (estrin_10) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, + const VTYPE *poly) +{ + VTYPE p89 = FMA (poly[9], x, poly[8]); + VTYPE p8_10 = FMA (poly[10], x2, p89); + return FMA (p8_10, x8, VWRAP (estrin_7) (x, x2, x4, poly)); +} +static inline VTYPE VWRAP (estrin_11) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, + const VTYPE *poly) +{ + VTYPE p8_11 = VWRAP (pairwise_poly_3) (x, x2, poly + 8); + return FMA (p8_11, x8, VWRAP (estrin_7) (x, x2, x4, poly)); +} +static inline VTYPE VWRAP (estrin_12) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, + const VTYPE *poly) +{ + return FMA (VWRAP (estrin_4) (x, x2, x4, poly + 8), x8, + VWRAP (estrin_7) (x, x2, x4, poly)); +} +static inline VTYPE VWRAP (estrin_13) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, + const VTYPE *poly) +{ + return FMA (VWRAP (estrin_5) (x, x2, x4, poly + 8), x8, + VWRAP (estrin_7) (x, x2, x4, poly)); +} +static inline VTYPE VWRAP (estrin_14) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, + const VTYPE *poly) +{ + return FMA (VWRAP (estrin_6) (x, x2, x4, poly + 8), x8, + VWRAP (estrin_7) (x, x2, x4, poly)); +} +static inline VTYPE VWRAP (estrin_15) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, + const VTYPE *poly) +{ + return FMA (VWRAP (estrin_7) (x, x2, x4, poly + 8), x8, + VWRAP (estrin_7) (x, x2, x4, poly)); +} +static inline VTYPE VWRAP (estrin_16) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, + VTYPE x16, const VTYPE *poly) +{ + return FMA (poly[16], x16, VWRAP (estrin_15) (x, x2, x4, x8, poly)); +} +static inline VTYPE VWRAP (estrin_17) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, + VTYPE x16, const VTYPE *poly) +{ + VTYPE p16_17 = FMA (poly[17], x, poly[16]); + return FMA (p16_17, x16, VWRAP (estrin_15) (x, x2, x4, x8, poly)); +} +static inline VTYPE VWRAP (estrin_18) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, + VTYPE x16, const VTYPE *poly) +{ + VTYPE p16_17 = FMA (poly[17], x, poly[16]); + VTYPE p16_18 = FMA (poly[18], x2, p16_17); + return FMA (p16_18, x16, VWRAP (estrin_15) (x, x2, x4, x8, poly)); +} +static inline VTYPE VWRAP (estrin_19) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8, + VTYPE x16, const VTYPE *poly) +{ + VTYPE p16_19 = VWRAP (pairwise_poly_3) (x, x2, poly + 16); + return FMA (p16_19, x16, VWRAP (estrin_15) (x, x2, x4, x8, poly)); +} + +static inline VTYPE VWRAP (horner_3) (VTYPE x, const VTYPE *poly) +{ + VTYPE p = FMA (poly[3], x, poly[2]); + p = FMA (x, p, poly[1]); + p = FMA (x, p, poly[0]); + return p; +} +static inline VTYPE VWRAP (horner_4) (VTYPE x, const VTYPE *poly) +{ + VTYPE p = FMA (poly[4], x, poly[3]); + p = FMA (x, p, poly[2]); + p = FMA (x, p, poly[1]); + p = FMA (x, p, poly[0]); + return p; +} +static inline VTYPE VWRAP (horner_5) (VTYPE x, const VTYPE *poly) +{ + return FMA (x, VWRAP (horner_4) (x, poly + 1), poly[0]); +} +static inline VTYPE VWRAP (horner_6) (VTYPE x, const VTYPE *poly) +{ + return FMA (x, VWRAP (horner_5) (x, poly + 1), poly[0]); +} +static inline VTYPE VWRAP (horner_7) (VTYPE x, const VTYPE *poly) +{ + return FMA (x, VWRAP (horner_6) (x, poly + 1), poly[0]); +} +static inline VTYPE VWRAP (horner_8) (VTYPE x, const VTYPE *poly) +{ + return FMA (x, VWRAP (horner_7) (x, poly + 1), poly[0]); +} +static inline VTYPE VWRAP (horner_9) (VTYPE x, const VTYPE *poly) +{ + return FMA (x, VWRAP (horner_8) (x, poly + 1), poly[0]); +} +static inline VTYPE VWRAP (horner_10) (VTYPE x, const VTYPE *poly) +{ + return FMA (x, VWRAP (horner_9) (x, poly + 1), poly[0]); +} +static inline VTYPE VWRAP (horner_11) (VTYPE x, const VTYPE *poly) +{ + return FMA (x, VWRAP (horner_10) (x, poly + 1), poly[0]); +} +static inline VTYPE VWRAP (horner_12) (VTYPE x, const VTYPE *poly) +{ + return FMA (x, VWRAP (horner_11) (x, poly + 1), poly[0]); +} + +static inline VTYPE VWRAP (pw_horner_4) (VTYPE x, VTYPE x2, const VTYPE *poly) +{ + VTYPE p01 = FMA (poly[1], x, poly[0]); + VTYPE p23 = FMA (poly[3], x, poly[2]); + VTYPE p; + p = FMA (x2, poly[4], p23); + p = FMA (x2, p, p01); + return p; +} +static inline VTYPE VWRAP (pw_horner_5) (VTYPE x, VTYPE x2, const VTYPE *poly) +{ + VTYPE p01 = FMA (poly[1], x, poly[0]); + VTYPE p23 = FMA (poly[3], x, poly[2]); + VTYPE p45 = FMA (poly[5], x, poly[4]); + VTYPE p; + p = FMA (x2, p45, p23); + p = FMA (x2, p, p01); + return p; +} +static inline VTYPE VWRAP (pw_horner_6) (VTYPE x, VTYPE x2, const VTYPE *poly) +{ + VTYPE p26 = VWRAP (pw_horner_4) (x, x2, poly + 2); + VTYPE p01 = FMA (poly[1], x, poly[0]); + return FMA (x2, p26, p01); +} +static inline VTYPE VWRAP (pw_horner_7) (VTYPE x, VTYPE x2, const VTYPE *poly) +{ + VTYPE p27 = VWRAP (pw_horner_5) (x, x2, poly + 2); + VTYPE p01 = FMA (poly[1], x, poly[0]); + return FMA (x2, p27, p01); +} +static inline VTYPE VWRAP (pw_horner_8) (VTYPE x, VTYPE x2, const VTYPE *poly) +{ + VTYPE p28 = VWRAP (pw_horner_6) (x, x2, poly + 2); + VTYPE p01 = FMA (poly[1], x, poly[0]); + return FMA (x2, p28, p01); +} +static inline VTYPE VWRAP (pw_horner_9) (VTYPE x, VTYPE x2, const VTYPE *poly) +{ + VTYPE p29 = VWRAP (pw_horner_7) (x, x2, poly + 2); + VTYPE p01 = FMA (poly[1], x, poly[0]); + return FMA (x2, p29, p01); +} +static inline VTYPE VWRAP (pw_horner_10) (VTYPE x, VTYPE x2, const VTYPE *poly) +{ + VTYPE p2_10 = VWRAP (pw_horner_8) (x, x2, poly + 2); + VTYPE p01 = FMA (poly[1], x, poly[0]); + return FMA (x2, p2_10, p01); +} +static inline VTYPE VWRAP (pw_horner_11) (VTYPE x, VTYPE x2, const VTYPE *poly) +{ + VTYPE p2_11 = VWRAP (pw_horner_9) (x, x2, poly + 2); + VTYPE p01 = FMA (poly[1], x, poly[0]); + return FMA (x2, p2_11, p01); +} +static inline VTYPE VWRAP (pw_horner_12) (VTYPE x, VTYPE x2, const VTYPE *poly) +{ + VTYPE p2_12 = VWRAP (pw_horner_10) (x, x2, poly + 2); + VTYPE p01 = FMA (poly[1], x, poly[0]); + return FMA (x2, p2_12, p01); +} +static inline VTYPE VWRAP (pw_horner_13) (VTYPE x, VTYPE x2, const VTYPE *poly) +{ + VTYPE p2_13 = VWRAP (pw_horner_11) (x, x2, poly + 2); + VTYPE p01 = FMA (poly[1], x, poly[0]); + return FMA (x2, p2_13, p01); +} +static inline VTYPE VWRAP (pw_horner_14) (VTYPE x, VTYPE x2, const VTYPE *poly) +{ + VTYPE p2_14 = VWRAP (pw_horner_12) (x, x2, poly + 2); + VTYPE p01 = FMA (poly[1], x, poly[0]); + return FMA (x2, p2_14, p01); +} +static inline VTYPE VWRAP (pw_horner_15) (VTYPE x, VTYPE x2, const VTYPE *poly) +{ + VTYPE p2_15 = VWRAP (pw_horner_13) (x, x2, poly + 2); + VTYPE p01 = FMA (poly[1], x, poly[0]); + return FMA (x2, p2_15, p01); +} +static inline VTYPE VWRAP (pw_horner_16) (VTYPE x, VTYPE x2, const VTYPE *poly) +{ + VTYPE p2_16 = VWRAP (pw_horner_14) (x, x2, poly + 2); + VTYPE p01 = FMA (poly[1], x, poly[0]); + return FMA (x2, p2_16, p01); +} +static inline VTYPE VWRAP (pw_horner_17) (VTYPE x, VTYPE x2, const VTYPE *poly) +{ + VTYPE p2_17 = VWRAP (pw_horner_15) (x, x2, poly + 2); + VT |
