diff options
| author | Adhemerval Zanella <adhemerval.zanella@linaro.org> | 2024-11-08 13:24:28 -0300 |
|---|---|---|
| committer | Adhemerval Zanella <adhemerval.zanella@linaro.org> | 2024-11-22 10:52:27 -0300 |
| commit | bccb0648ea29f89a7b1b64f3e5674d2338e3798e (patch) | |
| tree | 31849761a6d493625977c973db58c50669935fc9 | |
| parent | d846f4c12d7636efd5b7cff173456e616a185e24 (diff) | |
| download | glibc-bccb0648ea29f89a7b1b64f3e5674d2338e3798e.tar.xz glibc-bccb0648ea29f89a7b1b64f3e5674d2338e3798e.zip | |
math: Use tanf from CORE-MATH
The CORE-MATH implementation is correctly rounded (for any rounding mode)
and shows better performance to the generic tanf.
The code was adapted to glibc style, to use the definition of
math_config.h, to remove errno handling, and to use a generic
128 bit routine for ABIs that do not support it natively.
Benchtest on x86_64 (Ryzen 9 5900X, gcc 14.2.1), aarch64 (neoverse1,
gcc 13.2.1), and powerpc (POWER10, gcc 13.2.1):
latency master patched improvement
x86_64 82.3961 54.8052 33.49%
x86_64v2 82.3415 54.8052 33.44%
x86_64v3 69.3661 50.4864 27.22%
i686 219.271 45.5396 79.23%
aarch64 29.2127 19.1951 34.29%
power10 19.5060 16.2760 16.56%
reciprocal-throughput master patched improvement
x86_64 28.3976 19.7334 30.51%
x86_64v2 28.4568 19.7334 30.65%
x86_64v3 21.1815 16.1811 23.61%
i686 105.016 15.1426 85.58%
aarch64 18.1573 10.7681 40.70%
power10 8.7207 8.7097 0.13%
Signed-off-by: Alexei Sibidanov <sibid@uvic.ca>
Signed-off-by: Paul Zimmermann <Paul.Zimmermann@inria.fr>
Signed-off-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Reviewed-by: DJ Delorie <dj@redhat.com>
29 files changed, 321 insertions, 270 deletions
diff --git a/SHARED-FILES b/SHARED-FILES index 033ce7f092..9f4de81b66 100644 --- a/SHARED-FILES +++ b/SHARED-FILES @@ -288,3 +288,9 @@ sysdeps/ieee754/flt-32/e_lgammaf_r.c: - remove the errno stuff (this is done by the wrapper) - replace 0x1p127f * 0x1p127f by math_narrow_eval (x * 0x1p127f) - add libm_alias_finite (__ieee754_lgammaf_r, __lgammaf_r) at the end +sysdeps/ieee754/flt-32/s_tanf.c: + (src/binary32/tan/tanf.c in CORE-MATH) + - The code was adapted to use glibc code style and internal + functions to handle errno, overflow, and underflow. It was changed + to use an internal wrapper for 128 bit unsigned integer operations + for ABIs that do not support the type natively. diff --git a/sysdeps/aarch64/libm-test-ulps b/sysdeps/aarch64/libm-test-ulps index 1d3d1f9b6a..89b166b71b 100644 --- a/sysdeps/aarch64/libm-test-ulps +++ b/sysdeps/aarch64/libm-test-ulps @@ -1561,7 +1561,6 @@ float: 3 ldouble: 4 Function: "tan": -float: 1 ldouble: 1 Function: "tan_advsimd": @@ -1570,7 +1569,6 @@ float: 2 Function: "tan_downward": double: 1 -float: 2 ldouble: 1 Function: "tan_sve": @@ -1579,12 +1577,10 @@ float: 2 Function: "tan_towardzero": double: 1 -float: 1 ldouble: 1 Function: "tan_upward": double: 1 -float: 1 ldouble: 1 Function: "tanh": diff --git a/sysdeps/alpha/fpu/libm-test-ulps b/sysdeps/alpha/fpu/libm-test-ulps index 7256e674bb..0f7628b75b 100644 --- a/sysdeps/alpha/fpu/libm-test-ulps +++ b/sysdeps/alpha/fpu/libm-test-ulps @@ -1342,22 +1342,18 @@ float: 3 ldouble: 4 Function: "tan": -float: 1 ldouble: 1 Function: "tan_downward": double: 1 -float: 2 ldouble: 1 Function: "tan_towardzero": double: 1 -float: 1 ldouble: 1 Function: "tan_upward": double: 1 -float: 1 ldouble: 1 Function: "tanh": diff --git a/sysdeps/arc/fpu/libm-test-ulps b/sysdeps/arc/fpu/libm-test-ulps index 66a2b541c6..4d4b22db47 100644 --- a/sysdeps/arc/fpu/libm-test-ulps +++ b/sysdeps/arc/fpu/libm-test-ulps @@ -1081,19 +1081,15 @@ float: 3 Function: "tan": double: 1 -float: 1 Function: 
"tan_downward": double: 1 -float: 2 Function: "tan_towardzero": double: 1 -float: 2 Function: "tan_upward": double: 1 -float: 2 Function: "tanh": double: 3 diff --git a/sysdeps/arc/nofpu/libm-test-ulps b/sysdeps/arc/nofpu/libm-test-ulps index 38836ddc38..4faf784aad 100644 --- a/sysdeps/arc/nofpu/libm-test-ulps +++ b/sysdeps/arc/nofpu/libm-test-ulps @@ -259,9 +259,6 @@ Function: "sinh": double: 2 float: 2 -Function: "tan": -float: 1 - Function: "tanh": double: 2 float: 2 diff --git a/sysdeps/arm/libm-test-ulps b/sysdeps/arm/libm-test-ulps index 2651046cfa..c80122de79 100644 --- a/sysdeps/arm/libm-test-ulps +++ b/sysdeps/arm/libm-test-ulps @@ -1078,20 +1078,14 @@ Function: "sinh_upward": double: 3 float: 3 -Function: "tan": -float: 1 - Function: "tan_downward": double: 1 -float: 2 Function: "tan_towardzero": double: 1 -float: 1 Function: "tan_upward": double: 1 -float: 1 Function: "tanh": double: 2 diff --git a/sysdeps/csky/fpu/libm-test-ulps b/sysdeps/csky/fpu/libm-test-ulps index 02b4cb4934..d67cfe1785 100644 --- a/sysdeps/csky/fpu/libm-test-ulps +++ b/sysdeps/csky/fpu/libm-test-ulps @@ -1000,20 +1000,14 @@ Function: "sinh_upward": double: 3 float: 3 -Function: "tan": -float: 1 - Function: "tan_downward": double: 1 -float: 2 Function: "tan_towardzero": double: 1 -float: 1 Function: "tan_upward": double: 1 -float: 1 Function: "tanh": double: 2 diff --git a/sysdeps/csky/nofpu/libm-test-ulps b/sysdeps/csky/nofpu/libm-test-ulps index 34312f5a06..6cdf9fd034 100644 --- a/sysdeps/csky/nofpu/libm-test-ulps +++ b/sysdeps/csky/nofpu/libm-test-ulps @@ -1031,20 +1031,14 @@ Function: "sinh_upward": double: 3 float: 3 -Function: "tan": -float: 1 - Function: "tan_downward": double: 1 -float: 2 Function: "tan_towardzero": double: 1 -float: 1 Function: "tan_upward": double: 1 -float: 1 Function: "tanh": double: 2 diff --git a/sysdeps/generic/math_uint128.h b/sysdeps/generic/math_uint128.h new file mode 100644 index 0000000000..1251f598f7 --- /dev/null +++ 
b/sysdeps/generic/math_uint128.h @@ -0,0 +1,150 @@ +/* Internal 128 bit int support. + Copyright (C) 2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#ifndef _MATH_INT128_H +#define _MATH_INT128_H + +/* Limited support for internal 128 bit integer, used on some math + implementations. It uses compiler builtin type if supported, otherwise + it is emulated. Only unsigned and some operations are currently supported: + + - u128_t: the 128 bit unsigned type. + - u128_high: return the high part of the number. + - u128_low: return the low part of the number. + - u128_from_u64: create a 128 bit number from a 64 bit one. + - u128_mul: multiply two 128 bit numbers. + - u128_add: add two 128 bit numbers. + - u128_lshift: left shift a number. + - u128_rshift: right shift a number. 
+ */ + +#if defined __BITINT_MAXWIDTH__ && __BITINT_MAXWIDTH__ >= 128 +typedef unsigned _BitInt(128) u128; +# define __MATH_INT128_BUILTIN_TYPE 1 +#elif defined __SIZEOF_INT128__ +typedef unsigned __int128 u128; +# define __MATH_INT128_BUILTIN_TYPE 1 +#else +# define __MATH_INT128_BUILTIN_TYPE 0 +#endif + +#if __MATH_INT128_BUILTIN_TYPE +# define u128_high(__x) (uint64_t)((__x) >> 64) +# define u128_low(__x) (uint64_t)(__x) +# define u128_from_u64(__x) (u128)(__x) +# define u128_mul(__x, __y) (__x) * (__y) +# define u128_add(__x, __y) (__x) + (__y) +# define u128_lshift(__x, __y) (__x) << (__y) +# define u128_rshift(__x, __y) (__x) >> (__y) +#else +typedef struct +{ + uint64_t low; + uint64_t high; +} u128; + +# define u128_high(__x) (__x).high +# define u128_low(__x) (__x).low +# define u128_from_u64(__x) (u128){.low = (__x), .high = 0} + +# define MASK32 (UINT64_C(0xffffffff)) + +static u128 u128_add (u128 x, u128 y) +{ + bool carry = x.low + y.low < x.low; + return (u128) { .high = x.high + y.high + carry, .low = x.low + y.low }; +} + +static u128 u128_lshift (u128 x, unsigned int n) +{ + switch (n) + { + case 0: return x; + case 1 ... 63: return (u128) { .high = (x.high << n) | (x.low >> (64 - n)), + .low = x.low << n }; + case 64 ...127: return (u128) { .high = x.low << (n - 64), .low = 0}; + default: return (u128) { .high = 0, .low = 0 }; + } +} + +static u128 u128_rshift (u128 x, unsigned int n) +{ + switch (n) + { + case 0: return x; + case 1 ... 
63: return (u128) { .high = x.high >> n, + .low = (x.high << (64 - n)) | (x.low >> n) }; + case 64 ...127: return (u128) { .high = 0, .low = x.high >> (n - 64) }; + default: return (u128) { .high = 0, .low = 0 }; + } +} + +static u128 u128_mul (u128 x, u128 y) +{ + if (x.high == 0 && y.high == 0) + { + uint64_t x0 = x.low & MASK32; + uint64_t x1 = x.low >> 32; + uint64_t y0 = y.low & MASK32; + uint64_t y1 = y.low >> 32; + u128 x0y0 = { .high = 0, .low = x0 * y0 }; + u128 x0y1 = { .high = 0, .low = x0 * y1 }; + u128 x1y0 = { .high = 0, .low = x1 * y0 }; + u128 x1y1 = { .high = x1 * y1, .low = 0 }; + /* x0y0 + ((x0y1 + x1y0) << 32) + x1y1 */ + return u128_add (u128_add (x0y0, + u128_lshift (u128_add (x0y1, x1y0), + 32)), + x1y1); + } + else + { + uint64_t x0 = x.low & MASK32; + uint64_t x1 = x.low >> 32; + uint64_t x2 = x.high & MASK32; + uint64_t x3 = x.high >> 32; + uint64_t y0 = y.low & MASK32; + uint64_t y1 = y.low >> 32; + uint64_t y2 = y.high & MASK32; + uint64_t y3 = y.high >> 32; + u128 x0y0 = { .high = 0, .low = x0 * y0 }; + u128 x0y1 = { .high = 0, .low = x0 * y1 }; + u128 x0y2 = { .high = 0, .low = x0 * y2 }; + u128 x0y3 = { .high = 0, .low = x0 * y3 }; + u128 x1y0 = { .high = 0, .low = x1 * y0 }; + u128 x1y1 = { .high = 0, .low = x1 * y1 }; + u128 x1y2 = { .high = 0, .low = x1 * y2 }; + u128 x2y0 = { .high = 0, .low = x2 * y0 }; + u128 x2y1 = { .high = 0, .low = x2 * y1 }; + u128 x3y0 = { .high = 0, .low = x3 * y0 }; + /* x0y0 + ((x0y1 + x1y0) << 32) + ((x0y2 + x1y1 + x2y0) << 64) + + ((x0y3 + x1y2 + x2y1 + x3y0) << 96) */ + u128 r0 = u128_add (x0y0, + u128_lshift (u128_add (x0y1, x1y0), + 32)); + u128 r1 = u128_add (u128_lshift (u128_add (u128_add (x0y2, x1y1), x2y0), + 64), + u128_lshift (u128_add (u128_add (x0y3, x1y2), + u128_add (x2y1, x3y0)), + 96)); + return u128_add (r0, r1); + } +} +#endif /* __SIZEOF_INT128__ */ + +#endif diff --git a/sysdeps/hppa/fpu/libm-test-ulps b/sysdeps/hppa/fpu/libm-test-ulps index 976661541b..9ed2204d38 100644 --- 
a/sysdeps/hppa/fpu/libm-test-ulps +++ b/sysdeps/hppa/fpu/libm-test-ulps @@ -1107,20 +1107,16 @@ float: 3 Function: "tan": double: 1 -float: 1 ldouble: 1 Function: "tan_downward": double: 1 -float: 2 Function: "tan_towardzero": double: 1 -float: 1 Function: "tan_upward": double: 1 -float: 1 Function: "tanh": double: 2 diff --git a/sysdeps/i386/fpu/libm-test-ulps b/sysdeps/i386/fpu/libm-test-ulps index 170e7cfc65..c06da68b45 100644 --- a/sysdeps/i386/fpu/libm-test-ulps +++ b/sysdeps/i386/fpu/libm-test-ulps @@ -1614,25 +1614,21 @@ float128: 4 ldouble: 5 Function: "tan": -float: 1 float128: 1 ldouble: 2 Function: "tan_downward": double: 1 -float: 2 float128: 1 ldouble: 3 Function: "tan_towardzero": double: 1 -float: 2 float128: 1 ldouble: 3 Function: "tan_upward": double: 1 -float: 2 float128: 1 ldouble: 2 diff --git a/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps b/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps index a9ce632e6a..43ffbd7978 100644 --- a/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps +++ b/sysdeps/i386/i686/fpu/multiarch/libm-test-ulps @@ -1619,25 +1619,21 @@ float128: 4 ldouble: 5 Function: "tan": -float: 1 float128: 1 ldouble: 2 Function: "tan_downward": double: 1 -float: 2 float128: 1 ldouble: 3 Function: "tan_towardzero": double: 1 -float: 2 float128: 1 ldouble: 3 Function: "tan_upward": double: 1 -float: 2 float128: 1 ldouble: 2 diff --git a/sysdeps/ieee754/flt-32/k_tanf.c b/sysdeps/ieee754/flt-32/k_tanf.c index e1c9d14104..1cc8931700 100644 --- a/sysdeps/ieee754/flt-32/k_tanf.c +++ b/sysdeps/ieee754/flt-32/k_tanf.c @@ -1,101 +1 @@ -/* k_tanf.c -- float version of k_tan.c - */ - -/* - * ==================================================== - * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. - * - * Developed at SunPro, a Sun Microsystems, Inc. business. - * Permission to use, copy, modify, and distribute this - * software is freely granted, provided that this notice - * is preserved. 
- * ==================================================== - */ - -#if defined(LIBM_SCCS) && !defined(lint) -static char rcsid[] = "$NetBSD: k_tanf.c,v 1.4 1995/05/10 20:46:39 jtc Exp $"; -#endif - -#include <float.h> -#include <math.h> -#include <math_private.h> -#include <math-underflow.h> -static const float -one = 1.0000000000e+00, /* 0x3f800000 */ -pio4 = 7.8539812565e-01, /* 0x3f490fda */ -pio4lo= 3.7748947079e-08, /* 0x33222168 */ -T[] = { - 3.3333334327e-01, /* 0x3eaaaaab */ - 1.3333334029e-01, /* 0x3e088889 */ - 5.3968254477e-02, /* 0x3d5d0dd1 */ - 2.1869488060e-02, /* 0x3cb327a4 */ - 8.8632395491e-03, /* 0x3c11371f */ - 3.5920790397e-03, /* 0x3b6b6916 */ - 1.4562094584e-03, /* 0x3abede48 */ - 5.8804126456e-04, /* 0x3a1a26c8 */ - 2.4646313977e-04, /* 0x398137b9 */ - 7.8179444245e-05, /* 0x38a3f445 */ - 7.1407252108e-05, /* 0x3895c07a */ - -1.8558637748e-05, /* 0xb79bae5f */ - 2.5907305826e-05, /* 0x37d95384 */ -}; - -float __kernel_tanf(float x, float y, int iy) -{ - float z,r,v,w,s; - int32_t ix,hx; - GET_FLOAT_WORD(hx,x); - ix = hx&0x7fffffff; /* high word of |x| */ - if(ix<0x39000000) /* x < 2**-13 */ - {if((int)x==0) { /* generate inexact */ - if((ix|(iy+1))==0) return one/fabsf(x); - else if (iy == 1) - { - math_check_force_underflow (x); - return x; - } - else - return -one / x; - } - } - if(ix>=0x3f2ca140) { /* |x|>=0.6744 */ - if(hx<0) {x |
