diff options
| author | Andrew Senkevich <andrew.senkevich@intel.com> | 2015-06-17 16:22:26 +0300 |
|---|---|---|
| committer | Andrew Senkevich <andrew.senkevich@intel.com> | 2015-06-17 16:22:26 +0300 |
| commit | c10b9b13f7471b08273effc8cd7e51b119df9348 (patch) | |
| tree | ca058c3446a247a5bccea211bd84a9c0130e1388 | |
| parent | 1663be053d50c06bb0f971c87d41a7b83f96fe15 (diff) | |
| download | glibc-c10b9b13f7471b08273effc8cd7e51b119df9348.tar.xz glibc-c10b9b13f7471b08273effc8cd7e51b119df9348.zip | |
Vector pow for x86_64 and tests.
Here is an implementation of vectorized pow containing SSE, AVX,
AVX2 and AVX512 versions according to the Vector ABI
<https://groups.google.com/forum/#!topic/x86-64-abi/LmppCfN1rZ4>.
* bits/libm-simd-decl-stubs.h: Added stubs for pow.
* math/bits/mathcalls.h: Added pow declaration with __MATHCALL_VEC.
* sysdeps/unix/sysv/linux/x86_64/libmvec.abilist: New versions added.
* sysdeps/x86/fpu/bits/math-vector.h: Added SIMD declaration and asm
redirections for pow.
* sysdeps/x86_64/fpu/Makefile (libmvec-support): Added new files.
* sysdeps/x86_64/fpu/Versions: New versions added.
* sysdeps/x86_64/fpu/libm-test-ulps: Regenerated.
* sysdeps/x86_64/fpu/multiarch/Makefile (libmvec-sysdep_routines): Added
build of SSE, AVX2 and AVX512 IFUNC versions.
* sysdeps/x86_64/fpu/svml_d_wrapper_impl.h: Added 2 argument wrappers.
* sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core.S: New file.
* sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core_sse4.S: New file.
* sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core.S: New file.
* sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core_avx2.S: New file.
* sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S: New file.
* sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core_avx512.S: New file.
* sysdeps/x86_64/fpu/svml_d_pow2_core.S: New file.
* sysdeps/x86_64/fpu/svml_d_pow4_core.S: New file.
* sysdeps/x86_64/fpu/svml_d_pow4_core_avx.S: New file.
* sysdeps/x86_64/fpu/svml_d_pow8_core.S: New file.
* sysdeps/x86_64/fpu/svml_d_pow_data.S: New file.
* sysdeps/x86_64/fpu/svml_d_pow_data.h: New file.
* sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c: Added vector pow test.
* sysdeps/x86_64/fpu/test-double-vlen2.c: Likewise.
* sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c: Likewise.
* sysdeps/x86_64/fpu/test-double-vlen4-avx2.c: Likewise.
* sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c: Likewise.
* sysdeps/x86_64/fpu/test-double-vlen4.c: Likewise.
* sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c: Likewise.
* sysdeps/x86_64/fpu/test-double-vlen8.c: Likewise.
* NEWS: Mention addition of x86_64 vector pow.
31 files changed, 6934 insertions, 4 deletions
@@ -1,5 +1,38 @@ 2015-06-17 Andrew Senkevich <andrew.senkevich@intel.com> + * bits/libm-simd-decl-stubs.h: Added stubs for pow. + * math/bits/mathcalls.h: Added pow declaration with __MATHCALL_VEC. + * sysdeps/unix/sysv/linux/x86_64/libmvec.abilist: New versions added. + * sysdeps/x86/fpu/bits/math-vector.h: Added SIMD declaration and asm + redirections for pow. + * sysdeps/x86_64/fpu/Makefile (libmvec-support): Added new files. + * sysdeps/x86_64/fpu/Versions: New versions added. + * sysdeps/x86_64/fpu/libm-test-ulps: Regenerated. + * sysdeps/x86_64/fpu/multiarch/Makefile (libmvec-sysdep_routines): Added + build of SSE, AVX2 and AVX512 IFUNC versions. + * sysdeps/x86_64/fpu/svml_d_wrapper_impl.h: Added 2 argument wrappers. + * sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core.S: New file. + * sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core_sse4.S: New file. + * sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core.S: New file. + * sysdeps/x86_64/fpu/multiarch/svml_d_pow4_core_avx2.S: New file. + * sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core.S: New file. + * sysdeps/x86_64/fpu/multiarch/svml_d_pow8_core_avx512.S: New file. + * sysdeps/x86_64/fpu/svml_d_pow2_core.S: New file. + * sysdeps/x86_64/fpu/svml_d_pow4_core.S: New file. + * sysdeps/x86_64/fpu/svml_d_pow4_core_avx.S: New file. + * sysdeps/x86_64/fpu/svml_d_pow8_core.S: New file. + * sysdeps/x86_64/fpu/svml_d_pow_data.S: New file. + * sysdeps/x86_64/fpu/svml_d_pow_data.h: New file. + * sysdeps/x86_64/fpu/test-double-vlen2-wrappers.c: Added vector pow test. + * sysdeps/x86_64/fpu/test-double-vlen2.c: Likewise. + * sysdeps/x86_64/fpu/test-double-vlen4-avx2-wrappers.c: Likewise. + * sysdeps/x86_64/fpu/test-double-vlen4-avx2.c: Likewise. + * sysdeps/x86_64/fpu/test-double-vlen4-wrappers.c: Likewise. + * sysdeps/x86_64/fpu/test-double-vlen4.c: Likewise. + * sysdeps/x86_64/fpu/test-double-vlen8-wrappers.c: Likewise. + * sysdeps/x86_64/fpu/test-double-vlen8.c: Likewise. + * NEWS: Mention addition of x86_64 vector pow. 
+ * sysdeps/unix/sysv/linux/x86_64/libmvec.abilist: New symbols added. * sysdeps/x86/fpu/bits/math-vector.h: Added SIMD declaration and asm redirections for expf. @@ -53,7 +53,7 @@ Version 2.22 condition in some applications. * Added vector math library named libmvec with the following vectorized x86_64 - implementations: cos, cosf, sin, sinf, log, logf, exp, expf. + implementations: cos, cosf, sin, sinf, log, logf, exp, expf, pow. The library can be disabled with --disable-mathvec. Use of the functions is enabled with -fopenmp -ffast-math starting from -O1 for GCC version >= 4.9.0. The library is linked in as needed when using -lm (no need to specify -lmvec diff --git a/bits/libm-simd-decl-stubs.h b/bits/libm-simd-decl-stubs.h index 1a8bf6f262..b9f909efc1 100644 --- a/bits/libm-simd-decl-stubs.h +++ b/bits/libm-simd-decl-stubs.h @@ -49,4 +49,7 @@ #define __DECL_SIMD_expf #define __DECL_SIMD_expl +#define __DECL_SIMD_pow +#define __DECL_SIMD_powf +#define __DECL_SIMD_powl #endif diff --git a/math/bits/mathcalls.h b/math/bits/mathcalls.h index 1d0c6bd347..f297aa7906 100644 --- a/math/bits/mathcalls.h +++ b/math/bits/mathcalls.h @@ -150,7 +150,7 @@ __END_NAMESPACE_C99 _Mdouble_BEGIN_NAMESPACE /* Return X to the Y power. */ -__MATHCALL (pow,, (_Mdouble_ __x, _Mdouble_ __y)); +__MATHCALL_VEC (pow,, (_Mdouble_ __x, _Mdouble_ __y)); /* Return the square root of X. 
*/ __MATHCALL (sqrt,, (_Mdouble_ __x)); diff --git a/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist b/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist index 9652215777..9312a6ed2d 100644 --- a/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist +++ b/sysdeps/unix/sysv/linux/x86_64/libmvec.abilist @@ -4,6 +4,7 @@ GLIBC_2.22 _ZGVbN2v_exp F _ZGVbN2v_log F _ZGVbN2v_sin F + _ZGVbN2vv_pow F _ZGVbN4v_cosf F _ZGVbN4v_expf F _ZGVbN4v_logf F @@ -12,6 +13,7 @@ GLIBC_2.22 _ZGVcN4v_exp F _ZGVcN4v_log F _ZGVcN4v_sin F + _ZGVcN4vv_pow F _ZGVcN8v_cosf F _ZGVcN8v_expf F _ZGVcN8v_logf F @@ -20,6 +22,7 @@ GLIBC_2.22 _ZGVdN4v_exp F _ZGVdN4v_log F _ZGVdN4v_sin F + _ZGVdN4vv_pow F _ZGVdN8v_cosf F _ZGVdN8v_expf F _ZGVdN8v_logf F @@ -32,3 +35,4 @@ GLIBC_2.22 _ZGVeN8v_exp F _ZGVeN8v_log F _ZGVeN8v_sin F + _ZGVeN8vv_pow F diff --git a/sysdeps/x86/fpu/bits/math-vector.h b/sysdeps/x86/fpu/bits/math-vector.h index 3b7158952c..a5317b9e32 100644 --- a/sysdeps/x86/fpu/bits/math-vector.h +++ b/sysdeps/x86/fpu/bits/math-vector.h @@ -44,6 +44,8 @@ # define __DECL_SIMD_exp __DECL_SIMD_x86_64 # undef __DECL_SIMD_expf # define __DECL_SIMD_expf __DECL_SIMD_x86_64 +# undef __DECL_SIMD_pow +# define __DECL_SIMD_pow __DECL_SIMD_x86_64 /* Workaround to exclude unnecessary symbol aliases in libmvec while GCC creates the vector names based on scalar asm name. 
@@ -65,6 +67,10 @@ __asm__ ("_ZGVbN4v___expf_finite = _ZGVbN4v_expf"); __asm__ ("_ZGVcN8v___expf_finite = _ZGVcN8v_expf"); __asm__ ("_ZGVdN8v___expf_finite = _ZGVdN8v_expf"); __asm__ ("_ZGVeN16v___expf_finite = _ZGVeN16v_expf"); +__asm__ ("_ZGVbN2vv___pow_finite = _ZGVbN2vv_pow"); +__asm__ ("_ZGVcN4vv___pow_finite = _ZGVcN4vv_pow"); +__asm__ ("_ZGVdN4vv___pow_finite = _ZGVdN4vv_pow"); +__asm__ ("_ZGVeN8vv___pow_finite = _ZGVeN8vv_pow"); # endif #endif diff --git a/sysdeps/x86_64/fpu/Makefile b/sysdeps/x86_64/fpu/Makefile index eab738fb4a..aa9bdea668 100644 --- a/sysdeps/x86_64/fpu/Makefile +++ b/sysdeps/x86_64/fpu/Makefile @@ -13,7 +13,9 @@ libmvec-support += svml_d_cos2_core svml_d_cos4_core_avx \ svml_s_logf_data svml_d_exp2_core svml_d_exp4_core_avx \ svml_d_exp4_core svml_d_exp8_core svml_d_exp_data \ svml_s_expf4_core svml_s_expf8_core_avx svml_s_expf8_core \ - svml_s_expf16_core svml_s_expf_data \ + svml_s_expf16_core svml_s_expf_data svml_d_pow2_core \ + svml_d_pow4_core_avx svml_d_pow4_core svml_d_pow8_core \ + svml_d_pow_data \ init-arch endif diff --git a/sysdeps/x86_64/fpu/Versions b/sysdeps/x86_64/fpu/Versions index 0eaa8e81db..e379c36125 100644 --- a/sysdeps/x86_64/fpu/Versions +++ b/sysdeps/x86_64/fpu/Versions @@ -4,6 +4,7 @@ libmvec { _ZGVbN2v_sin; _ZGVcN4v_sin; _ZGVdN4v_sin; _ZGVeN8v_sin; _ZGVbN2v_log; _ZGVcN4v_log; _ZGVdN4v_log; _ZGVeN8v_log; _ZGVbN2v_exp; _ZGVcN4v_exp; _ZGVdN4v_exp; _ZGVeN8v_exp; + _ZGVbN2vv_pow; _ZGVcN4vv_pow; _ZGVdN4vv_pow; _ZGVeN8vv_pow; _ZGVbN4v_cosf; _ZGVcN8v_cosf; _ZGVdN8v_cosf; _ZGVeN16v_cosf; _ZGVbN4v_sinf; _ZGVcN8v_sinf; _ZGVdN8v_sinf; _ZGVeN16v_sinf; _ZGVbN4v_logf; _ZGVcN8v_logf; _ZGVdN8v_logf; _ZGVeN16v_logf; diff --git a/sysdeps/x86_64/fpu/libm-test-ulps b/sysdeps/x86_64/fpu/libm-test-ulps index ba1367f425..718e84c3f9 100644 --- a/sysdeps/x86_64/fpu/libm-test-ulps +++ b/sysdeps/x86_64/fpu/libm-test-ulps @@ -1947,6 +1947,18 @@ ifloat: 4 ildouble: 2 ldouble: 2 +Function: "pow_vlen2": +double: 1 + +Function: "pow_vlen4": 
+double: 1 + +Function: "pow_vlen4_avx2": +double: 1 + +Function: "pow_vlen8": +double: 1 + Function: "sin": ildouble: 1 ldouble: 1 diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile index 9e1025136f..b03b1380c0 100644 --- a/sysdeps/x86_64/fpu/multiarch/Makefile +++ b/sysdeps/x86_64/fpu/multiarch/Makefile @@ -65,5 +65,6 @@ libmvec-sysdep_routines += svml_d_cos2_core_sse4 svml_d_cos4_core_avx2 \ svml_s_logf16_core_avx512 svml_d_exp2_core_sse4 \ svml_d_exp4_core_avx2 svml_d_exp8_core_avx512 \ svml_s_expf4_core_sse4 svml_s_expf8_core_avx2 \ - svml_s_expf16_core_avx512 + svml_s_expf16_core_avx512 svml_d_pow2_core_sse4 \ + svml_d_pow4_core_avx2 svml_d_pow8_core_avx512 endif diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core.S b/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core.S new file mode 100644 index 0000000000..f111388922 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core.S @@ -0,0 +1,38 @@ +/* Multiple versions of vectorized pow. + Copyright (C) 2014-2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#include <init-arch.h> + + .text +ENTRY (_ZGVbN2vv_pow) + .type _ZGVbN2vv_pow, @gnu_indirect_function + cmpl $0, KIND_OFFSET+__cpu_features(%rip) + jne 1f + call __init_cpu_features +1: leaq _ZGVbN2vv_pow_sse4(%rip), %rax + testl $bit_SSE4_1, __cpu_features+CPUID_OFFSET+index_SSE4_1(%rip) + jz 2f + ret +2: leaq _ZGVbN2vv_pow_sse2(%rip), %rax + ret +END (_ZGVbN2vv_pow) +libmvec_hidden_def (_ZGVbN2vv_pow) + +#define _ZGVbN2vv_pow _ZGVbN2vv_pow_sse2 +#include "../svml_d_pow2_core.S" diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core_sse4.S new file mode 100644 index 0000000000..9f6ec29ac5 --- /dev/null +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_pow2_core_sse4.S @@ -0,0 +1,432 @@ +/* Function pow vectorized with SSE4. + Copyright (C) 2014-2015 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <sysdep.h> +#include "svml_d_pow_data.h" + + .text +ENTRY (_ZGVbN2vv_pow_sse4) +/* + ALGORITHM DESCRIPTION: + + 1) Calculating log2|x| + Here we use the following formula. + Let |x|=2^k1*X1, where k1 is integer, 1<=X1<2. + Let C ~= 1/ln(2), + Rcp1 ~= 1/X1, X2=Rcp1*X1, + Rcp2 ~= 1/X2, X3=Rcp2*X2, + Rcp3 ~= 1/X3, Rcp3C ~= C/X3. 
+ Then + log2|x| = k1 + log2(1/Rcp1) + log2(1/Rcp2) + log2(C/Rcp3C) + + log2(X1*Rcp1*Rcp2*Rcp3C/C), + where X1*Rcp1*Rcp2*Rcp3C = C*(1+q), q is very small. + + The values of Rcp1, log2(1/Rcp1), Rcp2, log2(1/Rcp2), + Rcp3C, log2(C/Rcp3C) are taken from tables. + Values of Rcp1, Rcp2, Rcp3C are such that RcpC=Rcp1*Rcp2*Rcp3C + is exactly represented in target precision. + + log2(X1*Rcp1*Rcp2*Rcp3C/C) = log2(1+q) = ln(1+q)/ln2 = + = 1/(ln2)*q - 1/(2ln2)*q^2 + 1/(3ln2)*q^3 - ... = + = 1/(C*ln2)*cq - 1/(2*C^2*ln2)*cq^2 + 1/(3*C^3*ln2)*cq^3 - ... = + = (1 + a1)*cq + a2*cq^2 + a3*cq^3 + ..., + where cq = X1*Rcp1*Rcp2*Rcp3C-C, + a1=1/(C*ln(2))-1 is small, + a2=-1/(2*C^2*ln2), + a3=1/(3*C^3*ln2), + ... + We get 3 parts of log2 result: HH+HL+HLL ~= log2|x|. + + 2) Calculation of y*(HH+HL+HLL). + Split y into YHi+YLo. + Get high PH and medium PL parts of y*log2|x|. + Get low PLL part of y*log2|x|. + Now we have PH+PL+PLL ~= y*log2|x|. + + 3) Calculation of 2^(PH+PL+PLL). + Mathematical idea of computing 2^(PH+PL+PLL) is the following. + Let's represent PH+PL+PLL in the form N + j/2^expK + Z, + where expK=7 in this implementation, N and j are integers, + 0<=j<=2^expK-1, |Z|<2^(-expK-1). + Hence 2^(PH+PL+PLL) ~= 2^N * 2^(j/2^expK) * 2^Z, + where 2^(j/2^expK) is stored in a table, and + 2^Z ~= 1 + B1*Z + B2*Z^2 ... + B5*Z^5. + + We compute 2^(PH+PL+PLL) as follows. + Break PH into PHH + PHL, where PHH = N + j/2^expK. + Z = PHL + PL + PLL + Exp2Poly = B1*Z + B2*Z^2 ... + B5*Z^5 + Get 2^(j/2^expK) from table in the form THI+TLO. + Now we have 2^(PH+PL+PLL) ~= 2^N * (THI + TLO) * (1 + Exp2Poly). + + Get significand of 2^(PH+PL+PLL) in the form ResHi+ResLo: + ResHi := THI + ResLo := THI * Exp2Poly + TLO + + Get exponent ERes of the result: + Res := ResHi + ResLo: + Result := ex(Res) + N. 
*/ + + pushq %rbp + cfi_adjust_cfa_offset (8) + cfi_rel_offset (%rbp, 0) + movq %rsp, %rbp + cfi_def_cfa_register (%rbp) + andq $-64, %rsp + subq $256, %rsp + movq __svml_dpow_data@GOTPCREL(%rip), %rdx + movups %xmm14, 80(%rsp) + movups %xmm9, 176(%rsp) + movaps %xmm1, %xmm9 + pshufd $221, %xmm0, %xmm1 + movq _iIndexMask(%rdx), %xmm14 + movq _iIndexAdd(%rdx), %xmm6 + +/* i = (((Hi(x) & 0x000ffe00) + 0x00000200) >> 10); -> i = (b1..b11 + 1) / 2 */ + pand %xmm1, %xmm14 + paddd %xmm6, %xmm14 + psrld $10, %xmm14 + movups %xmm13, 96(%rsp) + +/* Index for reciprocal table */ + movdqa %xmm14, %xmm13 + pslld $3, %xmm13 + +/* Index for log2 table */ + pslld $4, %xmm14 + movd %xmm13, %eax + movups %xmm10, 160(%rsp) + movups _iMantissaMask(%rdx), %xmm10 + movslq %eax, %rax + +/* x1 = x; Hi(x1) = (Hi(x1)&0x000fffff)|0x3ff00000 */ + andps %xmm0, %xmm10 + pextrd $1, %xmm13, %ecx + movslq %ecx, %rcx + movups %xmm0, (%rsp) + movdqa %xmm1, %xmm0 + +/* k = Hi(x); k = k - 0x3fe7fe00; k = k >> 20 */ |
