about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Joe Ramsay <Joe.Ramsay@arm.com> 2023-10-05 17:10:48 +0100
committer: Szabolcs Nagy <szabolcs.nagy@arm.com> 2023-10-23 15:00:44 +0100
commit: f554334c05a95c6b4df532ddc88cd3e72dc7d04c (patch)
tree: 1ee426aaf6fbc68b5e7cb27286c8396738df9bc4
parent: 2aa0974d2573441bffd596b07bff8698b1f2f18c (diff)
download: glibc-f554334c05a95c6b4df532ddc88cd3e72dc7d04c.tar.xz
download: glibc-f554334c05a95c6b4df532ddc88cd3e72dc7d04c.zip
aarch64: Add vector implementations of tan routines
This includes some utility headers for evaluating polynomials using various schemes.
-rw-r--r-- math/auto-libm-test-in | 2
-rw-r--r-- math/auto-libm-test-out-tan | 50
-rw-r--r-- sysdeps/aarch64/fpu/Makefile | 3
-rw-r--r-- sysdeps/aarch64/fpu/Versions | 6
-rw-r--r-- sysdeps/aarch64/fpu/bits/math-vector.h | 4
-rw-r--r-- sysdeps/aarch64/fpu/poly_advsimd_f32.h | 36
-rw-r--r-- sysdeps/aarch64/fpu/poly_advsimd_f64.h | 36
-rw-r--r-- sysdeps/aarch64/fpu/poly_generic.h | 285
-rw-r--r-- sysdeps/aarch64/fpu/poly_sve_f32.h | 38
-rw-r--r-- sysdeps/aarch64/fpu/poly_sve_f64.h | 38
-rw-r--r-- sysdeps/aarch64/fpu/poly_sve_generic.h | 313
-rw-r--r-- sysdeps/aarch64/fpu/tan_advsimd.c | 123
-rw-r--r-- sysdeps/aarch64/fpu/tan_sve.c | 104
-rw-r--r-- sysdeps/aarch64/fpu/tanf_advsimd.c | 129
-rw-r--r-- sysdeps/aarch64/fpu/tanf_sve.c | 118
-rw-r--r-- sysdeps/aarch64/fpu/test-double-advsimd-wrappers.c | 1
-rw-r--r-- sysdeps/aarch64/fpu/test-double-sve-wrappers.c | 1
-rw-r--r-- sysdeps/aarch64/fpu/test-float-advsimd-wrappers.c | 1
-rw-r--r-- sysdeps/aarch64/fpu/test-float-sve-wrappers.c | 1
-rw-r--r-- sysdeps/aarch64/libm-test-ulps | 8
-rw-r--r-- sysdeps/unix/sysv/linux/aarch64/libmvec.abilist | 4
21 files changed, 1274 insertions, 27 deletions
diff --git a/math/auto-libm-test-in b/math/auto-libm-test-in
index 2672eb1f6a..70892503d6 100644
--- a/math/auto-libm-test-in
+++ b/math/auto-libm-test-in
@@ -7655,7 +7655,7 @@ sqrt min
sqrt min_subnorm
tan 0
-tan -0
+tan -0 no-mathvec
tan pi/4
tan pi/2
tan -pi/2
diff --git a/math/auto-libm-test-out-tan b/math/auto-libm-test-out-tan
index 7d00d03e1d..f46fdc7ec6 100644
--- a/math/auto-libm-test-out-tan
+++ b/math/auto-libm-test-out-tan
@@ -23,31 +23,31 @@ tan 0
= tan tonearest ibm128 0x0p+0 : 0x0p+0 : inexact-ok
= tan towardzero ibm128 0x0p+0 : 0x0p+0 : inexact-ok
= tan upward ibm128 0x0p+0 : 0x0p+0 : inexact-ok
-tan -0
-= tan downward binary32 -0x0p+0 : -0x0p+0 : inexact-ok
-= tan tonearest binary32 -0x0p+0 : -0x0p+0 : inexact-ok
-= tan towardzero binary32 -0x0p+0 : -0x0p+0 : inexact-ok
-= tan upward binary32 -0x0p+0 : -0x0p+0 : inexact-ok
-= tan downward binary64 -0x0p+0 : -0x0p+0 : inexact-ok
-= tan tonearest binary64 -0x0p+0 : -0x0p+0 : inexact-ok
-= tan towardzero binary64 -0x0p+0 : -0x0p+0 : inexact-ok
-= tan upward binary64 -0x0p+0 : -0x0p+0 : inexact-ok
-= tan downward intel96 -0x0p+0 : -0x0p+0 : inexact-ok
-= tan tonearest intel96 -0x0p+0 : -0x0p+0 : inexact-ok
-= tan towardzero intel96 -0x0p+0 : -0x0p+0 : inexact-ok
-= tan upward intel96 -0x0p+0 : -0x0p+0 : inexact-ok
-= tan downward m68k96 -0x0p+0 : -0x0p+0 : inexact-ok
-= tan tonearest m68k96 -0x0p+0 : -0x0p+0 : inexact-ok
-= tan towardzero m68k96 -0x0p+0 : -0x0p+0 : inexact-ok
-= tan upward m68k96 -0x0p+0 : -0x0p+0 : inexact-ok
-= tan downward binary128 -0x0p+0 : -0x0p+0 : inexact-ok
-= tan tonearest binary128 -0x0p+0 : -0x0p+0 : inexact-ok
-= tan towardzero binary128 -0x0p+0 : -0x0p+0 : inexact-ok
-= tan upward binary128 -0x0p+0 : -0x0p+0 : inexact-ok
-= tan downward ibm128 -0x0p+0 : -0x0p+0 : inexact-ok
-= tan tonearest ibm128 -0x0p+0 : -0x0p+0 : inexact-ok
-= tan towardzero ibm128 -0x0p+0 : -0x0p+0 : inexact-ok
-= tan upward ibm128 -0x0p+0 : -0x0p+0 : inexact-ok
+tan -0 no-mathvec
+= tan downward binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tan tonearest binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tan towardzero binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tan upward binary32 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tan downward binary64 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tan tonearest binary64 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tan towardzero binary64 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tan upward binary64 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tan downward intel96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tan tonearest intel96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tan towardzero intel96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tan upward intel96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tan downward m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tan tonearest m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tan towardzero m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tan upward m68k96 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tan downward binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tan tonearest binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tan towardzero binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tan upward binary128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tan downward ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tan tonearest ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tan towardzero ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
+= tan upward ibm128 -0x0p+0 : -0x0p+0 : no-mathvec inexact-ok
tan pi/4
= tan downward binary32 0xc.90fdbp-4 : 0x1p+0 : inexact-ok
= tan tonearest binary32 0xc.90fdbp-4 : 0x1p+0 : inexact-ok
diff --git a/sysdeps/aarch64/fpu/Makefile b/sysdeps/aarch64/fpu/Makefile
index 04aa2e37ca..a1bbc9bcaa 100644
--- a/sysdeps/aarch64/fpu/Makefile
+++ b/sysdeps/aarch64/fpu/Makefile
@@ -1,7 +1,8 @@
libmvec-supported-funcs = cos \
exp \
log \
- sin
+ sin \
+ tan
float-advsimd-funcs = $(libmvec-supported-funcs)
double-advsimd-funcs = $(libmvec-supported-funcs)
diff --git a/sysdeps/aarch64/fpu/Versions b/sysdeps/aarch64/fpu/Versions
index c85c0f3efb..f0ca0940a9 100644
--- a/sysdeps/aarch64/fpu/Versions
+++ b/sysdeps/aarch64/fpu/Versions
@@ -17,4 +17,10 @@ libmvec {
_ZGVsMxv_sin;
_ZGVsMxv_sinf;
}
+ GLIBC_2.39 {
+ _ZGVnN4v_tanf;
+ _ZGVnN2v_tan;
+ _ZGVsMxv_tanf;
+ _ZGVsMxv_tan;
+ }
}
diff --git a/sysdeps/aarch64/fpu/bits/math-vector.h b/sysdeps/aarch64/fpu/bits/math-vector.h
index 7c200599c1..6193213147 100644
--- a/sysdeps/aarch64/fpu/bits/math-vector.h
+++ b/sysdeps/aarch64/fpu/bits/math-vector.h
@@ -53,11 +53,13 @@ __vpcs __f32x4_t _ZGVnN4v_cosf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_expf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_logf (__f32x4_t);
__vpcs __f32x4_t _ZGVnN4v_sinf (__f32x4_t);
+__vpcs __f32x4_t _ZGVnN4v_tanf (__f32x4_t);
__vpcs __f64x2_t _ZGVnN2v_cos (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_exp (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_log (__f64x2_t);
__vpcs __f64x2_t _ZGVnN2v_sin (__f64x2_t);
+__vpcs __f64x2_t _ZGVnN2v_tan (__f64x2_t);
# undef __ADVSIMD_VEC_MATH_SUPPORTED
#endif /* __ADVSIMD_VEC_MATH_SUPPORTED */
@@ -68,11 +70,13 @@ __sv_f32_t _ZGVsMxv_cosf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_expf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_logf (__sv_f32_t, __sv_bool_t);
__sv_f32_t _ZGVsMxv_sinf (__sv_f32_t, __sv_bool_t);
+__sv_f32_t _ZGVsMxv_tanf (__sv_f32_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_cos (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_exp (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_log (__sv_f64_t, __sv_bool_t);
__sv_f64_t _ZGVsMxv_sin (__sv_f64_t, __sv_bool_t);
+__sv_f64_t _ZGVsMxv_tan (__sv_f64_t, __sv_bool_t);
# undef __SVE_VEC_MATH_SUPPORTED
#endif /* __SVE_VEC_MATH_SUPPORTED */
diff --git a/sysdeps/aarch64/fpu/poly_advsimd_f32.h b/sysdeps/aarch64/fpu/poly_advsimd_f32.h
new file mode 100644
index 0000000000..9e2ad9ad94
--- /dev/null
+++ b/sysdeps/aarch64/fpu/poly_advsimd_f32.h
@@ -0,0 +1,36 @@
+/* Helpers for evaluating polynomials on single-precision AdvSIMD input, using
+ various schemes.
+
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef AARCH64_FPU_POLY_ADVSIMD_F32_H
+#define AARCH64_FPU_POLY_ADVSIMD_F32_H
+
+#include <arm_neon.h>
+
+/* Wrap AdvSIMD f32 helpers: evaluation of some scheme/order has form:
+ v_[scheme]_[order]_f32. */
+#define VTYPE float32x4_t
+#define FMA(x, y, z) vfmaq_f32 (z, x, y)
+#define VWRAP(f) v_##f##_f32
+#include "poly_generic.h"
+#undef VWRAP
+#undef FMA
+#undef VTYPE
+
+#endif
diff --git a/sysdeps/aarch64/fpu/poly_advsimd_f64.h b/sysdeps/aarch64/fpu/poly_advsimd_f64.h
new file mode 100644
index 0000000000..955cfc08ce
--- /dev/null
+++ b/sysdeps/aarch64/fpu/poly_advsimd_f64.h
@@ -0,0 +1,36 @@
+/* Helpers for evaluating polynomials on double-precision AdvSIMD input, using
+ various schemes.
+
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef AARCH64_FPU_POLY_ADVSIMD_F64_H
+#define AARCH64_FPU_POLY_ADVSIMD_F64_H
+
+#include <arm_neon.h>
+
+/* Wrap AdvSIMD f64 helpers: evaluation of some scheme/order has form:
+ v_[scheme]_[order]_f64. */
+#define VTYPE float64x2_t
+#define FMA(x, y, z) vfmaq_f64 (z, x, y)
+#define VWRAP(f) v_##f##_f64
+#include "poly_generic.h"
+#undef VWRAP
+#undef FMA
+#undef VTYPE
+
+#endif
diff --git a/sysdeps/aarch64/fpu/poly_generic.h b/sysdeps/aarch64/fpu/poly_generic.h
new file mode 100644
index 0000000000..84f042182b
--- /dev/null
+++ b/sysdeps/aarch64/fpu/poly_generic.h
@@ -0,0 +1,285 @@
+/* Generic helpers for evaluating polynomials with various schemes.
+
+ Copyright (C) 2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+
+#ifndef VTYPE
+# error Cannot use poly_generic without defining VTYPE
+#endif
+#ifndef VWRAP
+# error Cannot use poly_generic without defining VWRAP
+#endif
+#ifndef FMA
+# error Cannot use poly_generic without defining FMA
+#endif
+
+static inline VTYPE VWRAP (pairwise_poly_3) (VTYPE x, VTYPE x2,
+ const VTYPE *poly)
+{
+ /* At order 3, Estrin and Pairwise Horner are identical. */
+ VTYPE p01 = FMA (poly[1], x, poly[0]);
+ VTYPE p23 = FMA (poly[3], x, poly[2]);
+ return FMA (p23, x2, p01);
+}
+
+static inline VTYPE VWRAP (estrin_4) (VTYPE x, VTYPE x2, VTYPE x4,
+ const VTYPE *poly)
+{
+ VTYPE p03 = VWRAP (pairwise_poly_3) (x, x2, poly);
+ return FMA (poly[4], x4, p03);
+}
+static inline VTYPE VWRAP (estrin_5) (VTYPE x, VTYPE x2, VTYPE x4,
+ const VTYPE *poly)
+{
+ VTYPE p03 = VWRAP (pairwise_poly_3) (x, x2, poly);
+ VTYPE p45 = FMA (poly[5], x, poly[4]);
+ return FMA (p45, x4, p03);
+}
+static inline VTYPE VWRAP (estrin_6) (VTYPE x, VTYPE x2, VTYPE x4,
+ const VTYPE *poly)
+{
+ VTYPE p03 = VWRAP (pairwise_poly_3) (x, x2, poly);
+ VTYPE p45 = FMA (poly[5], x, poly[4]);
+ VTYPE p46 = FMA (poly[6], x2, p45);
+ return FMA (p46, x4, p03);
+}
+static inline VTYPE VWRAP (estrin_7) (VTYPE x, VTYPE x2, VTYPE x4,
+ const VTYPE *poly)
+{
+ VTYPE p03 = VWRAP (pairwise_poly_3) (x, x2, poly);
+ VTYPE p47 = VWRAP (pairwise_poly_3) (x, x2, poly + 4);
+ return FMA (p47, x4, p03);
+}
+static inline VTYPE VWRAP (estrin_8) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
+ const VTYPE *poly)
+{
+ return FMA (poly[8], x8, VWRAP (estrin_7) (x, x2, x4, poly));
+}
+static inline VTYPE VWRAP (estrin_9) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
+ const VTYPE *poly)
+{
+ VTYPE p89 = FMA (poly[9], x, poly[8]);
+ return FMA (p89, x8, VWRAP (estrin_7) (x, x2, x4, poly));
+}
+static inline VTYPE VWRAP (estrin_10) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
+ const VTYPE *poly)
+{
+ VTYPE p89 = FMA (poly[9], x, poly[8]);
+ VTYPE p8_10 = FMA (poly[10], x2, p89);
+ return FMA (p8_10, x8, VWRAP (estrin_7) (x, x2, x4, poly));
+}
+static inline VTYPE VWRAP (estrin_11) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
+ const VTYPE *poly)
+{
+ VTYPE p8_11 = VWRAP (pairwise_poly_3) (x, x2, poly + 8);
+ return FMA (p8_11, x8, VWRAP (estrin_7) (x, x2, x4, poly));
+}
+static inline VTYPE VWRAP (estrin_12) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
+ const VTYPE *poly)
+{
+ return FMA (VWRAP (estrin_4) (x, x2, x4, poly + 8), x8,
+ VWRAP (estrin_7) (x, x2, x4, poly));
+}
+static inline VTYPE VWRAP (estrin_13) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
+ const VTYPE *poly)
+{
+ return FMA (VWRAP (estrin_5) (x, x2, x4, poly + 8), x8,
+ VWRAP (estrin_7) (x, x2, x4, poly));
+}
+static inline VTYPE VWRAP (estrin_14) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
+ const VTYPE *poly)
+{
+ return FMA (VWRAP (estrin_6) (x, x2, x4, poly + 8), x8,
+ VWRAP (estrin_7) (x, x2, x4, poly));
+}
+static inline VTYPE VWRAP (estrin_15) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
+ const VTYPE *poly)
+{
+ return FMA (VWRAP (estrin_7) (x, x2, x4, poly + 8), x8,
+ VWRAP (estrin_7) (x, x2, x4, poly));
+}
+static inline VTYPE VWRAP (estrin_16) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
+ VTYPE x16, const VTYPE *poly)
+{
+ return FMA (poly[16], x16, VWRAP (estrin_15) (x, x2, x4, x8, poly));
+}
+static inline VTYPE VWRAP (estrin_17) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
+ VTYPE x16, const VTYPE *poly)
+{
+ VTYPE p16_17 = FMA (poly[17], x, poly[16]);
+ return FMA (p16_17, x16, VWRAP (estrin_15) (x, x2, x4, x8, poly));
+}
+static inline VTYPE VWRAP (estrin_18) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
+ VTYPE x16, const VTYPE *poly)
+{
+ VTYPE p16_17 = FMA (poly[17], x, poly[16]);
+ VTYPE p16_18 = FMA (poly[18], x2, p16_17);
+ return FMA (p16_18, x16, VWRAP (estrin_15) (x, x2, x4, x8, poly));
+}
+static inline VTYPE VWRAP (estrin_19) (VTYPE x, VTYPE x2, VTYPE x4, VTYPE x8,
+ VTYPE x16, const VTYPE *poly)
+{
+ VTYPE p16_19 = VWRAP (pairwise_poly_3) (x, x2, poly + 16);
+ return FMA (p16_19, x16, VWRAP (estrin_15) (x, x2, x4, x8, poly));
+}
+
+static inline VTYPE VWRAP (horner_3) (VTYPE x, const VTYPE *poly)
+{
+ VTYPE p = FMA (poly[3], x, poly[2]);
+ p = FMA (x, p, poly[1]);
+ p = FMA (x, p, poly[0]);
+ return p;
+}
+static inline VTYPE VWRAP (horner_4) (VTYPE x, const VTYPE *poly)
+{
+ VTYPE p = FMA (poly[4], x, poly[3]);
+ p = FMA (x, p, poly[2]);
+ p = FMA (x, p, poly[1]);
+ p = FMA (x, p, poly[0]);
+ return p;
+}
+static inline VTYPE VWRAP (horner_5) (VTYPE x, const VTYPE *poly)
+{
+ return FMA (x, VWRAP (horner_4) (x, poly + 1), poly[0]);
+}
+static inline VTYPE VWRAP (horner_6) (VTYPE x, const VTYPE *poly)
+{
+ return FMA (x, VWRAP (horner_5) (x, poly + 1), poly[0]);
+}
+static inline VTYPE VWRAP (horner_7) (VTYPE x, const VTYPE *poly)
+{
+ return FMA (x, VWRAP (horner_6) (x, poly + 1), poly[0]);
+}
+static inline VTYPE VWRAP (horner_8) (VTYPE x, const VTYPE *poly)
+{
+ return FMA (x, VWRAP (horner_7) (x, poly + 1), poly[0]);
+}
+static inline VTYPE VWRAP (horner_9) (VTYPE x, const VTYPE *poly)
+{
+ return FMA (x, VWRAP (horner_8) (x, poly + 1), poly[0]);
+}
+static inline VTYPE VWRAP (horner_10) (VTYPE x, const VTYPE *poly)
+{
+ return FMA (x, VWRAP (horner_9) (x, poly + 1), poly[0]);
+}
+static inline VTYPE VWRAP (horner_11) (VTYPE x, const VTYPE *poly)
+{
+ return FMA (x, VWRAP (horner_10) (x, poly + 1), poly[0]);
+}
+static inline VTYPE VWRAP (horner_12) (VTYPE x, const VTYPE *poly)
+{
+ return FMA (x, VWRAP (horner_11) (x, poly + 1), poly[0]);
+}
+
+static inline VTYPE VWRAP (pw_horner_4) (VTYPE x, VTYPE x2, const VTYPE *poly)
+{
+ VTYPE p01 = FMA (poly[1], x, poly[0]);
+ VTYPE p23 = FMA (poly[3], x, poly[2]);
+ VTYPE p;
+ p = FMA (x2, poly[4], p23);
+ p = FMA (x2, p, p01);
+ return p;
+}
+static inline VTYPE VWRAP (pw_horner_5) (VTYPE x, VTYPE x2, const VTYPE *poly)
+{
+ VTYPE p01 = FMA (poly[1], x, poly[0]);
+ VTYPE p23 = FMA (poly[3], x, poly[2]);
+ VTYPE p45 = FMA (poly[5], x, poly[4]);
+ VTYPE p;
+ p = FMA (x2, p45, p23);
+ p = FMA (x2, p, p01);
+ return p;
+}
+static inline VTYPE VWRAP (pw_horner_6) (VTYPE x, VTYPE x2, const VTYPE *poly)
+{
+ VTYPE p26 = VWRAP (pw_horner_4) (x, x2, poly + 2);
+ VTYPE p01 = FMA (poly[1], x, poly[0]);
+ return FMA (x2, p26, p01);
+}
+static inline VTYPE VWRAP (pw_horner_7) (VTYPE x, VTYPE x2, const VTYPE *poly)
+{
+ VTYPE p27 = VWRAP (pw_horner_5) (x, x2, poly + 2);
+ VTYPE p01 = FMA (poly[1], x, poly[0]);
+ return FMA (x2, p27, p01);
+}
+static inline VTYPE VWRAP (pw_horner_8) (VTYPE x, VTYPE x2, const VTYPE *poly)
+{
+ VTYPE p28 = VWRAP (pw_horner_6) (x, x2, poly + 2);
+ VTYPE p01 = FMA (poly[1], x, poly[0]);
+ return FMA (x2, p28, p01);
+}
+static inline VTYPE VWRAP (pw_horner_9) (VTYPE x, VTYPE x2, const VTYPE *poly)
+{
+ VTYPE p29 = VWRAP (pw_horner_7) (x, x2, poly + 2);
+ VTYPE p01 = FMA (poly[1], x, poly[0]);
+ return FMA (x2, p29, p01);
+}
+static inline VTYPE VWRAP (pw_horner_10) (VTYPE x, VTYPE x2, const VTYPE *poly)
+{
+ VTYPE p2_10 = VWRAP (pw_horner_8) (x, x2, poly + 2);
+ VTYPE p01 = FMA (poly[1], x, poly[0]);
+ return FMA (x2, p2_10, p01);
+}
+static inline VTYPE VWRAP (pw_horner_11) (VTYPE x, VTYPE x2, const VTYPE *poly)
+{
+ VTYPE p2_11 = VWRAP (pw_horner_9) (x, x2, poly + 2);
+ VTYPE p01 = FMA (poly[1], x, poly[0]);
+ return FMA (x2, p2_11, p01);
+}
+static inline VTYPE VWRAP (pw_horner_12) (VTYPE x, VTYPE x2, const VTYPE *poly)
+{
+ VTYPE p2_12 = VWRAP (pw_horner_10) (x, x2, poly + 2);
+ VTYPE p01 = FMA (poly[1], x, poly[0]);
+ return FMA (x2, p2_12, p01);
+}
+static inline VTYPE VWRAP (pw_horner_13) (VTYPE x, VTYPE x2, const VTYPE *poly)
+{
+ VTYPE p2_13 = VWRAP (pw_horner_11) (x, x2, poly + 2);
+ VTYPE p01 = FMA (poly[1], x, poly[0]);
+ return FMA (x2, p2_13, p01);
+}
+static inline VTYPE VWRAP (pw_horner_14) (VTYPE x, VTYPE x2, const VTYPE *poly)
+{
+ VTYPE p2_14 = VWRAP (pw_horner_12) (x, x2, poly + 2);
+ VTYPE p01 = FMA (poly[1], x, poly[0]);
+ return FMA (x2, p2_14, p01);
+}
+static inline VTYPE VWRAP (pw_horner_15) (VTYPE x, VTYPE x2, const VTYPE *poly)
+{
+ VTYPE p2_15 = VWRAP (pw_horner_13) (x, x2, poly + 2);
+ VTYPE p01 = FMA (poly[1], x, poly[0]);
+ return FMA (x2, p2_15, p01);
+}
+static inline VTYPE VWRAP (pw_horner_16) (VTYPE x, VTYPE x2, const VTYPE *poly)
+{
+ VTYPE p2_16 = VWRAP (pw_horner_14) (x, x2, poly + 2);
+ VTYPE p01 = FMA (poly[1], x, poly[0]);
+ return FMA (x2, p2_16, p01);
+}
+static inline VTYPE VWRAP (pw_horner_17) (VTYPE x, VTYPE x2, const VTYPE *poly)
+{
+ VTYPE p2_17 = VWRAP (pw_horner_15) (x, x2, poly + 2);
+ VT