diff options
| author | Ulrich Drepper <drepper@gmail.com> | 2011-10-24 20:19:17 -0400 |
|---|---|---|
| committer | Ulrich Drepper <drepper@gmail.com> | 2011-10-24 20:19:17 -0400 |
| commit | af968f62f24c5c0ef4e7e5ab41acae946908c112 (patch) | |
| tree | e1e0570eeb00c434cc751cbadfbeae150eeea11a | |
| parent | 58985aa92f57ff46e96b32388ce65e7fdd8c8b9e (diff) | |
| download | glibc-af968f62f24c5c0ef4e7e5ab41acae946908c112.tar.xz glibc-af968f62f24c5c0ef4e7e5ab41acae946908c112.zip | |
Optimize accurate 64-bit routines for FMA4 on x86-64
48 files changed, 595 insertions, 177 deletions
@@ -1,5 +1,62 @@ 2011-10-24 Ulrich Drepper <drepper@gmail.com> + * config.make.in: Add have-mfma4 entry. + * configure.in: Substitute libc_cv_cc_fma4. + * math/Makefile (dbl-only-routines): Add sincostab. + * sysdeps/ieee754/dbl-64/dosincos.c: Don't include sincos.tbl. + Use __sincostab not sincos. + * sysdeps/ieee754/dbl-64/e_asin.c: Don't define aliases when function + name is a macro. + * sysdeps/ieee754/dbl-64/e_exp.c: Likewise. + * sysdeps/ieee754/dbl-64/e_log.c: Likewise. + * sysdeps/ieee754/dbl-64/e_pow.c: Likewise. + * sysdeps/ieee754/dbl-64/e_atan2.c: Likewise. Define singArctan2 + using __copysign. + * sysdeps/ieee754/dbl-64/mpa.c: Don't export __acr. Don't define + __cr and __cpymn. Define __cpy unless NO___CPY is defined. Define + norm, denorm, and __mp_dbl unless NO___MP_DBL is defined. + * sysdeps/ieee754/dbl-64/mpa.h: Don't declare __acr, __cr, __cpymn, + and __inv. + * sysdeps/ieee754/dbl-64/mpsqrt.c: Make fastiroot static. + * sysdeps/ieee754/dbl-64/s_atan.c: Define __signArctan using + __copysign. + * sysdeps/ieee754/dbl-64/s_sin.c: Use __sincostab not sincos. Don't + define aliases when function name is a macro. + * sysdeps/ieee754/dbl-64/sincostab.c: Renamed from + sysdeps/ieee754/dbl-64/sincos.tbl. + * sysdeps/x86_64/fpu/multiarch/Makefile: Add entries to build + fma4-enabled routines. + * sysdeps/x86_64/fpu/multiarch/brandred-fma4.c: New file. + * sysdeps/x86_64/fpu/multiarch/doasin-fma4.c: New file. + * sysdeps/x86_64/fpu/multiarch/dosincos-fma4.c: New file. + * sysdeps/x86_64/fpu/multiarch/e_asin-fma4.c: New file. + * sysdeps/x86_64/fpu/multiarch/e_asin.c: New file. + * sysdeps/x86_64/fpu/multiarch/e_atan2-fma4.c: New file. + * sysdeps/x86_64/fpu/multiarch/e_atan2.c: New file. + * sysdeps/x86_64/fpu/multiarch/e_exp-fma4.c: New file. + * sysdeps/x86_64/fpu/multiarch/e_exp.c: New file. + * sysdeps/x86_64/fpu/multiarch/e_log-fma4.c: New file. + * sysdeps/x86_64/fpu/multiarch/e_log.c: New file. + * sysdeps/x86_64/fpu/multiarch/e_pow-fma4.c: New file. + * sysdeps/x86_64/fpu/multiarch/e_pow.c: New file. + * sysdeps/x86_64/fpu/multiarch/halfulp-fma4.c: New file. + * sysdeps/x86_64/fpu/multiarch/mpa-fma4.c: New file. + * sysdeps/x86_64/fpu/multiarch/mpatan-fma4.c: New file. + * sysdeps/x86_64/fpu/multiarch/mpatan2-fma4.c: New file. + * sysdeps/x86_64/fpu/multiarch/mpexp-fma4.c: New file. + * sysdeps/x86_64/fpu/multiarch/mplog-fma4.c: New file. + * sysdeps/x86_64/fpu/multiarch/mpsqrt-fma4.c: New file. + * sysdeps/x86_64/fpu/multiarch/mptan-fma4.c: New file. + * sysdeps/x86_64/fpu/multiarch/s_atan-fma4.c: New file. + * sysdeps/x86_64/fpu/multiarch/s_atan.c: New file. + * sysdeps/x86_64/fpu/multiarch/s_sin-fma4.c: New file. + * sysdeps/x86_64/fpu/multiarch/s_sin.c: New file. + * sysdeps/x86_64/fpu/multiarch/s_tan-fma4.c: New file. + * sysdeps/x86_64/fpu/multiarch/s_tan.c: New file. + * sysdeps/x86_64/fpu/multiarch/sincos32-fma4.c: New file. + * sysdeps/x86_64/fpu/multiarch/slowexp-fma4.c: New file. + * sysdeps/x86_64/fpu/multiarch/slowpow-fma4.c: New file. + * sysdeps/ieee754/dbl-64/doasin.c: Adjust for DLA_FMA -> DLA_FMS rename. * sysdeps/ieee754/dbl-64/dosincos.c: Likewise. diff --git a/config.make.in b/config.make.in index 2181d05ce3..d2baf6d3a9 100644 --- a/config.make.in +++ b/config.make.in @@ -59,6 +59,7 @@ have-cpp-asm-debuginfo = @libc_cv_cpp_asm_debuginfo@ enable-check-abi = @enable_check_abi@ have-forced-unwind = @libc_cv_forced_unwind@ have-fpie = @libc_cv_fpie@ +have-mfma4 = @libc_cv_cc_fma4@ gnu89-inline-CFLAGS = @gnu89_inline@ have-ssp = @libc_cv_ssp@ have-selinux = @have_selinux@ @@ -623,6 +623,7 @@ elf ldd_rewrite_script use_ldconfig libc_cv_as_i686 +libc_cv_cc_fma4 libc_cv_cc_novzeroupper libc_cv_cc_avx libc_cv_cc_sse4 @@ -7944,6 +7945,7 @@ fi + if test $elf = yes; then $as_echo "#define HAVE_ELF 1" >>confdefs.h diff --git a/configure.in b/configure.in index 6977fe1015..9678cbe712 100644 --- a/configure.in +++ b/configure.in @@ -2339,6 +2339,7 @@ AC_SUBST(libc_cv_cpp_asm_debuginfo) AC_SUBST(libc_cv_cc_sse4) AC_SUBST(libc_cv_cc_avx) AC_SUBST(libc_cv_cc_novzeroupper) +AC_SUBST(libc_cv_cc_fma4) AC_SUBST(libc_cv_as_i686) AC_SUBST(use_ldconfig) diff --git a/math/Makefile b/math/Makefile index 431eb5aa4b..41340da1bd 100644 --- a/math/Makefile +++ b/math/Makefile @@ -66,7 +66,7 @@ include ../Makeconfig dbl-only-routines := branred doasin dosincos halfulp mpa mpatan2 \ mpatan mpexp mplog mpsqrt mptan sincos32 slowexp \ - slowpow + slowpow sincostab libm-routines = $(strip $(libm-support) $(libm-calls) \ $(patsubst %_rf,%f_r,$(libm-calls:=f)) \ $(long-m-$(long-double-fcts))) \ diff --git a/sysdeps/ieee754/dbl-64/dosincos.c b/sysdeps/ieee754/dbl-64/dosincos.c index d5c6a14053..712d585b9e 100644 --- a/sysdeps/ieee754/dbl-64/dosincos.c +++ b/sysdeps/ieee754/dbl-64/dosincos.c @@ -35,11 +35,16 @@ #include "endian.h" #include "mydefs.h" -#include "sincos.tbl" #include <dla.h> #include "dosincos.h" #include "math_private.h" +extern const union +{ + int4 i[880]; + double x[440]; +} __sincostab attribute_hidden; + /***********************************************************************/ /* Routine receive Double-Length number (x+dx) and computing sin(x+dx) */ /* as Double-Length number and store it at array v .It computes it by */ @@ -66,10 +71,10 @@ void __dubsin(double x, double dx, double v[]) { dd=(x-d)+dx; /* sin(x+dx)=sin(Xi+t)=sin(Xi)*cos(t) + cos(Xi)sin(t) where t ->0 */ MUL2(d,dd,d,dd,d2,dd2,p,hx,tx,hy,ty,q,c,cc); - sn=sincos.x[k]; /* */ - ssn=sincos.x[k+1]; /* sin(Xi) and cos(Xi) */ - cs=sincos.x[k+2]; /* */ - ccs=sincos.x[k+3]; /* */ + sn=__sincostab.x[k]; /* */ + ssn=__sincostab.x[k+1]; /* sin(Xi) and cos(Xi) */ + cs=__sincostab.x[k+2]; /* */ + ccs=__sincostab.x[k+3]; /* */ MUL2(d2,dd2,s7.x,ss7.x,ds,dss,p,hx,tx,hy,ty,q,c,cc); /* Taylor */ ADD2(ds,dss,s5.x,ss5.x,ds,dss,r,s); MUL2(d2,dd2,ds,dss,ds,dss,p,hx,tx,hy,ty,q,c,cc); /* series */ @@ -118,10 +123,10 @@ void __dubcos(double x, double dx, double v[]) { d=x+dx; dd=(x-d)+dx; /* cos(x+dx)=cos(Xi+t)=cos(Xi)cos(t) - sin(Xi)sin(t) */ MUL2(d,dd,d,dd,d2,dd2,p,hx,tx,hy,ty,q,c,cc); - sn=sincos.x[k]; /* */ - ssn=sincos.x[k+1]; /* sin(Xi) and cos(Xi) */ - cs=sincos.x[k+2]; /* */ - ccs=sincos.x[k+3]; /* */ + sn=__sincostab.x[k]; /* */ + ssn=__sincostab.x[k+1]; /* sin(Xi) and cos(Xi) */ + cs=__sincostab.x[k+2]; /* */ + ccs=__sincostab.x[k+3]; /* */ MUL2(d2,dd2,s7.x,ss7.x,ds,dss,p,hx,tx,hy,ty,q,c,cc); ADD2(ds,dss,s5.x,ss5.x,ds,dss,r,s); MUL2(d2,dd2,ds,dss,ds,dss,p,hx,tx,hy,ty,q,c,cc); diff --git a/sysdeps/ieee754/dbl-64/e_asin.c b/sysdeps/ieee754/dbl-64/e_asin.c index 02efb7ad2e..cd4cc2e2c2 100644 --- a/sysdeps/ieee754/dbl-64/e_asin.c +++ b/sysdeps/ieee754/dbl-64/e_asin.c @@ -324,7 +324,9 @@ double __ieee754_asin(double x){ return u.x/v.x; /* NaN */ } } +#ifndef __ieee754_asin strong_alias (__ieee754_asin, __asin_finite) +#endif /*******************************************************************/ /* */ @@ -636,4 +638,6 @@ double __ieee754_acos(double x) return u.x/v.x; } } +#ifndef __ieee754_acos strong_alias (__ieee754_acos, __acos_finite) +#endif diff --git a/sysdeps/ieee754/dbl-64/e_atan2.c b/sysdeps/ieee754/dbl-64/e_atan2.c index 264791e0f9..9caacccf4c 100644 --- a/sysdeps/ieee754/dbl-64/e_atan2.c +++ b/sysdeps/ieee754/dbl-64/e_atan2.c @@ -51,7 +51,11 @@ /* round to nearest mode of IEEE 754 standard. */ /************************************************************************/ static double atan2Mp(double ,double ,const int[]); -static double signArctan2(double ,double); + /* Fix the sign and return after stage 1 or stage 2 */ +static double signArctan2(double y,double z) +{ + return __copysign(z, y); +} static double normalized(double ,double,double ,double); void __mpatan2(mp_no *,mp_no *,mp_no *,int); @@ -375,7 +379,9 @@ double __ieee754_atan2(double y,double x) { } } } +#ifndef __ieee754_atan2 strong_alias (__ieee754_atan2, __atan2_finite) +#endif /* Treat the Denormalized case */ static double normalized(double ax,double ay,double y, double z) @@ -387,11 +393,6 @@ static double normalized(double ax,double ay,double y, double z) __sub(&mpz,&mperr,&mpz2,p); __mp_dbl(&mpz2,&z,p); return signArctan2(y,z); } - /* Fix the sign and return after stage 1 or stage 2 */ -static double signArctan2(double y,double z) -{ - return ((y<ZERO) ? -z : z); -} /* Stage 3: Perform a multi-Precision computation */ static double atan2Mp(double x,double y,const int pr[]) { diff --git a/sysdeps/ieee754/dbl-64/e_exp.c b/sysdeps/ieee754/dbl-64/e_exp.c index f4b34a6363..48bbb05ed8 100644 --- a/sysdeps/ieee754/dbl-64/e_exp.c +++ b/sysdeps/ieee754/dbl-64/e_exp.c @@ -145,7 +145,9 @@ double __ieee754_exp(double x) { else return __slowexp(x); } } +#ifndef __ieee754_exp strong_alias (__ieee754_exp, __exp_finite) +#endif /************************************************************************/ /* Compute e^(x+xx)(Double-Length number) .The routine also receive */ diff --git a/sysdeps/ieee754/dbl-64/e_log.c b/sysdeps/ieee754/dbl-64/e_log.c index b7df81b488..7a0a26f251 100644 --- a/sysdeps/ieee754/dbl-64/e_log.c +++ b/sysdeps/ieee754/dbl-64/e_log.c @@ -207,4 +207,6 @@ double __ieee754_log(double x) { } return y1; } +#ifndef __ieee754_log strong_alias (__ieee754_log, __log_finite) +#endif diff --git a/sysdeps/ieee754/dbl-64/e_pow.c b/sysdeps/ieee754/dbl-64/e_pow.c index 0c7abb6eeb..94b1ab8961 100644 --- a/sysdeps/ieee754/dbl-64/e_pow.c +++ b/sysdeps/ieee754/dbl-64/e_pow.c @@ -153,7 +153,9 @@ double __ieee754_pow(double x, double y) { if (y<0) return (x<1.0)?INF.x:0; return 0; /* unreachable, to make the compiler happy */ } +#ifndef __ieee754_pow strong_alias (__ieee754_pow, __pow_finite) +#endif /**************************************************************************/ /* Computing x^y using more accurate but more slow log routine */ diff --git a/sysdeps/ieee754/dbl-64/mpa.c b/sysdeps/ieee754/dbl-64/mpa.c index 68647ba335..ad5a639c4b 100644 --- a/sysdeps/ieee754/dbl-64/mpa.c +++ b/sysdeps/ieee754/dbl-64/mpa.c @@ -1,8 +1,7 @@ - /* * IBM Accurate Mathematical Library * written by International Business Machines Corp. - * Copyright (C) 2001 Free Software F |
