| | | |
|---|---|---|
| author | Sunil K Pandey <skpgkp2@gmail.com> | 2022-03-07 10:47:09 -0800 |
| committer | Sunil K Pandey <skpgkp2@gmail.com> | 2022-03-07 21:14:09 -0800 |
| commit | b61bfd101e23489feac53c0dbe8ba3a5e5a44aa0 (patch) | |
| tree | 299a228ef0972f8edc51e729bebe6f3ca4401e41 | |
| parent | a7ab967662656e8e7db43d94a075c947444a981a (diff) | |
| download | glibc-b61bfd101e23489feac53c0dbe8ba3a5e5a44aa0.tar.xz glibc-b61bfd101e23489feac53c0dbe8ba3a5e5a44aa0.zip | |
x86_64: Fix svml_d_asinh4_core_avx2.S code formatting
This commit contains the following formatting changes, illustrated
in the sketch below:
1. Instructions are preceded by a tab.
2. Instructions less than 8 characters in length have a tab
between the mnemonic and the first operand.
3. Instructions greater than 7 characters in length have a
space between the mnemonic and the first operand.
4. Tabs after `#define`d names, before their values.
5. Eight spaces at the beginning of a line replaced by a tab.
6. Comments indented with the code.
7. The redundant .text section removed.
8. One space between line content and a line comment.
9. A space after every comma.
Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>
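For reference, here is a condensed sketch of those conventions, assembled from representative lines of the reformatted file; the trailing `/* rule N */` annotations are editorial and are not part of the source:

```asm
/* Rule 4: a tab between a #define'd name and its value. */
#define ExpMask	12480

	.section .text.avx2, "ax", @progbits

	/* Rule 6: comments are indented with the code. */
	vmovupd	One+__svml_dasinh_data_internal(%rip), %ymm12 /* rules 1, 2: tab before, and after, the 7-char mnemonic */
	vfmadd213pd %ymm6, %ymm5, %ymm4 /* rules 3, 9: space after the 11-char mnemonic and after each comma */
	vmovsd	(%r8, %rdx), %xmm0 /* rule 8: one space before this line comment */
```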
| -rw-r--r-- | sysdeps/x86_64/fpu/multiarch/svml_d_asinh4_core_avx2.S | 3077 |
1 file changed, 1538 insertions, 1539 deletions
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_asinh4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_asinh4_core_avx2.S index 636637b4b1..131b716c95 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_asinh4_core_avx2.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_asinh4_core_avx2.S @@ -31,1571 +31,1570 @@ /* Offsets for data table __svml_dasinh_data_internal */ -#define Log_HA_table 0 -#define Log_LA_table 8224 -#define poly_coeff 12352 -#define ExpMask 12480 -#define Two10 12512 -#define MinLog1p 12544 -#define MaxLog1p 12576 -#define One 12608 -#define SgnMask 12640 -#define XThreshold 12672 -#define XhMask 12704 -#define Threshold 12736 -#define Bias 12768 -#define Bias1 12800 -#define ExpMask0 12832 -#define ExpMask2 12864 -#define L2 12896 -#define dBigThreshold 12928 -#define dC2 12960 -#define dC3 12992 -#define dC4 13024 -#define dC5 13056 -#define dHalf 13088 -#define dLargestFinite 13120 -#define dLittleThreshold 13152 -#define dSign 13184 -#define dThirtyOne 13216 -#define dTopMask12 13248 -#define dTopMask29 13280 -#define XScale 13312 +#define Log_HA_table 0 +#define Log_LA_table 8224 +#define poly_coeff 12352 +#define ExpMask 12480 +#define Two10 12512 +#define MinLog1p 12544 +#define MaxLog1p 12576 +#define One 12608 +#define SgnMask 12640 +#define XThreshold 12672 +#define XhMask 12704 +#define Threshold 12736 +#define Bias 12768 +#define Bias1 12800 +#define ExpMask0 12832 +#define ExpMask2 12864 +#define L2 12896 +#define dBigThreshold 12928 +#define dC2 12960 +#define dC3 12992 +#define dC4 13024 +#define dC5 13056 +#define dHalf 13088 +#define dLargestFinite 13120 +#define dLittleThreshold 13152 +#define dSign 13184 +#define dThirtyOne 13216 +#define dTopMask12 13248 +#define dTopMask29 13280 +#define XScale 13312 /* Lookup bias for data table __svml_dasinh_data_internal. */ -#define Table_Lookup_Bias -0x405fe0 +#define Table_Lookup_Bias -0x405fe0 #include <sysdep.h> - .text - .section .text.avx2,"ax",@progbits + .section .text.avx2, "ax", @progbits ENTRY(_ZGVdN4v_asinh_avx2) - pushq %rbp - cfi_def_cfa_offset(16) - movq %rsp, %rbp - cfi_def_cfa(6, 16) - cfi_offset(6, -16) - andq $-32, %rsp - subq $96, %rsp - lea Table_Lookup_Bias+__svml_dasinh_data_internal(%rip), %r8 - vmovapd %ymm0, %ymm13 - vmovupd SgnMask+__svml_dasinh_data_internal(%rip), %ymm9 - -/* Load the constant 1 and a sign mask */ - vmovupd One+__svml_dasinh_data_internal(%rip), %ymm12 - -/* No need to split X when FMA is available in hardware. */ - vmulpd %ymm13, %ymm13, %ymm8 - -/* - * Get the absolute value of the input, since we will exploit antisymmetry - * and mostly assume X >= 0 in the core computation - */ - vandpd %ymm9, %ymm13, %ymm10 - -/* - * Check whether the input is finite, by checking |X| <= MaxFloat - * Otherwise set the rangemask so that the callout will get used. - * Note that this will also use the callout for NaNs since not(NaN <= MaxFloat) - */ - vcmpnle_uqpd dLargestFinite+__svml_dasinh_data_internal(%rip), %ymm10, %ymm14 - -/* - * Finally, express Y + W = X^2 + 1 accurately where Y has <= 29 bits. - * If |X| <= 1 then |XHi| <= 1 and so |X2Hi| <= 1, so we can treat 1 - * as the dominant component in the compensated summation. Otherwise, - * if |X| >= 1, then since X2Hi only has 52 significant bits, the basic - * addition will be exact anyway until we get to |X| >= 2^53. But by - * that time the log function is well-conditioned enough that the - * rounding error doesn't matter. Hence we can treat 1 as dominant even - * if it literally isn't. 
- */ - vaddpd %ymm8, %ymm12, %ymm5 - -/* - * The following computation can go wrong for very large X, basically - * because X^2 overflows. But for large X we have - * asinh(X) / log(2 X) - 1 =~= 1/(4 * X^2), so for X >= 2^30 - * we can just later stick X back into the log and tweak up the exponent. - * Actually we scale X by 2^-30 and tweak the exponent up by 31, - * to stay in the safe range for the later log computation. - * Compute a flag now telling us when do do this. - */ - vcmplt_oqpd dBigThreshold+__svml_dasinh_data_internal(%rip), %ymm10, %ymm11 - vsubpd %ymm5, %ymm12, %ymm15 - vmovmskpd %ymm14, %eax - vandpd dTopMask29+__svml_dasinh_data_internal(%rip), %ymm5, %ymm14 - -/* - * Compute R = 1/sqrt(Y + W) * (1 + d) - * Force R to <= 12 significant bits in case it isn't already - * This means that R * Y and R^2 * Y are exactly representable. - */ - vcvtpd2ps %ymm14, %xmm1 - vaddpd %ymm15, %ymm8, %ymm0 - vsubpd %ymm14, %ymm5, %ymm2 - vrsqrtps %xmm1, %xmm3 - vmovapd %ymm13, %ymm7 - vfmsub213pd %ymm8, %ymm13, %ymm7 - vcvtps2pd %xmm3, %ymm6 - vaddpd %ymm0, %ymm7, %ymm4 - -/* - * Unfortunately, we can still be in trouble if |X| <= 2^-10, since - * the absolute error 2^-(12+53)-ish in sqrt(1 + X^2) gets scaled up - * by 1/X and comes close to our threshold. Hence if |X| <= 2^-9, - * perform an alternative computation - * sqrt(1 + X^2) - 1 = X^2/2 - X^4/8 + X^6/16 - * X2 = X^2 - */ - vaddpd %ymm7, %ymm8, %ymm7 - vaddpd %ymm2, %ymm4, %ymm15 - -/* - * Now 1 / (1 + d) - * = 1 / (1 + (sqrt(1 - e) - 1)) - * = 1 / sqrt(1 - e) - * = 1 + 1/2 * e + 3/8 * e^2 + 5/16 * e^3 + 35/128 * e^4 + - * 63/256 * e^5 + 231/1024 * e^6 + .... - * So compute the first five nonconstant terms of that, so that - * we have a relative correction (1 + Corr) to apply to S etc. - * C1 = 1/2 - * C2 = 3/8 - * C3 = 5/16 - * C4 = 35/128 - * C5 = 63/256 - */ - vmovupd dC5+__svml_dasinh_data_internal(%rip), %ymm4 - vandpd dTopMask12+__svml_dasinh_data_internal(%rip), %ymm6, %ymm0 - -/* - * Compute S = (Y/sqrt(Y + W)) * (1 + d) - * and T = (W/sqrt(Y + W)) * (1 + d) - * so that S + T = sqrt(Y + W) * (1 + d) - * S is exact, and the rounding error in T is OK. - */ - vmulpd %ymm0, %ymm14, %ymm3 - vmulpd %ymm15, %ymm0, %ymm1 - vmovupd dHalf+__svml_dasinh_data_internal(%rip), %ymm6 - vsubpd %ymm12, %ymm3, %ymm14 - -/* - * Obtain sqrt(1 + X^2) - 1 in two pieces - * sqrt(1 + X^2) - 1 - * = sqrt(Y + W) - 1 - * = (S + T) * (1 + Corr) - 1 - * = [S - 1] + [T + (S + T) * Corr] - * We need a compensated summation for the last part. 
We treat S - 1 - * as the larger part; it certainly is until about X < 2^-4, and in that - * case, the error is affordable since X dominates over sqrt(1 + X^2) - 1 - * Final sum is dTmp5 (hi) + dTmp7 (lo) - */ - vaddpd %ymm1, %ymm3, %ymm2 - -/* - * Compute e = -(2 * d + d^2) - * The first FMR is exact, and the rounding error in the other is acceptable - * since d and e are ~ 2^-12 - */ - vmovapd %ymm12, %ymm5 - vfnmadd231pd %ymm3, %ymm0, %ymm5 - vfnmadd231pd %ymm1, %ymm0, %ymm5 - vfmadd213pd dC4+__svml_dasinh_data_internal(%rip), %ymm5, %ymm4 - vfmadd213pd dC3+__svml_dasinh_data_internal(%rip), %ymm5, %ymm4 - vfmadd213pd dC2+__svml_dasinh_data_internal(%rip), %ymm5, %ymm4 - vfmadd213pd %ymm6, %ymm5, %ymm4 - vmulpd %ymm4, %ymm5, %ymm0 - vfmadd213pd %ymm1, %ymm2, %ymm0 - -/* Now multiplex the two possible computations */ - vcmple_oqpd dLittleThreshold+__svml_dasinh_data_internal(%rip), %ymm10, %ymm2 - vaddpd %ymm14, %ymm0, %ymm15 - -/* dX2over2 = X^2/2 */ - vmulpd %ymm7, %ymm6, %ymm0 - -/* dX4over4 = X^4/4 */ - vmulpd %ymm0, %ymm0, %ymm8 - -/* dX46 = -X^4/4 + X^6/8 */ - vfmsub231pd %ymm0, %ymm8, %ymm8 - -/* dX46over2 = -X^4/8 + x^6/16 */ - vmulpd %ymm8, %ymm6, %ymm5 - -/* 2^ (-10-exp(X) ) */ - vmovupd ExpMask2+__svml_dasinh_data_internal(%rip), %ymm8 - vaddpd %ymm5, %ymm0, %ymm4 - vblendvpd %ymm2, %ymm4, %ymm15, %ymm1 - -/* - * Now do another compensated sum to add |X| + [sqrt(1 + X^2) - 1]. - * It's always safe to assume |X| is larger. - * This is the final 2-part argument to the log1p function - */ - vaddpd %ymm1, %ymm10, %ymm3 - -/* Now multiplex to the case X = 2^-30 * |input|, Xl = dL = 0 in the "big" case. */ - vmulpd XScale+__svml_dasinh_data_internal(%rip), %ymm10, %ymm10 - -/* - * Now we feed into the log1p code, using H in place of _VARG1 and - * also adding L into Xl. - * compute 1+x as high, low parts - */ - vmaxpd %ymm3, %ymm12, %ymm6 - vminpd %ymm3, %ymm12, %ymm7 - vandpd %ymm9, %ymm3, %ymm9 - vcmplt_oqpd XThreshold+__svml_dasinh_data_internal(%rip), %ymm9, %ymm0 - vaddpd %ymm7, %ymm6, %ymm5 - vorpd XhMask+__svml_dasinh_data_internal(%rip), %ymm0, %ymm4 - vandpd %ymm4, %ymm5, %ymm1 - vblendvpd %ymm11, %ymm1, %ymm10, %ymm5 - vsubpd %ymm1, %ymm6, %ymm2 - -/* exponent bits */ - vpsrlq $20, %ymm5, %ymm10 - vaddpd %ymm2, %ymm7, %ymm3 - -/* - * Now resume the main code. 
- * preserve mantissa, set input exponent to 2^(-10) - */ - vandpd ExpMask+__svml_dasinh_data_internal(%rip), %ymm5, %ymm0 - vorpd Two10+__svml_dasinh_data_internal(%rip), %ymm0, %ymm2 - -/* reciprocal approximation good to at least 11 bits */ - vcvtpd2ps %ymm2, %xmm6 - vrcpps %xmm6, %xmm7 - vcvtps2pd %xmm7, %ymm15 - -/* exponent of X needed to scale Xl */ - vandps ExpMask0+__svml_dasinh_data_internal(%rip), %ymm5, %ymm9 - vpsubq %ymm9, %ymm8, %ymm0 - vandpd %ymm11, %ymm3, %ymm4 - -/* round reciprocal to nearest integer, will have 1+9 mantissa bits */ - vroundpd $0, %ymm15, %ymm3 - -/* scale DblRcp */ - vmulpd %ymm0, %ymm3, %ymm2 - -/* argument reduction */ - vfmsub213pd %ymm12, %ymm2, %ymm5 - vmulpd %ymm2, %ymm4, %ymm12 - vmovupd poly_coeff+64+__svml_dasinh_data_internal(%rip), %ymm2 - vaddpd %ymm12, %ymm5, %ymm5 - vfmadd213pd poly_coeff+96+__svml_dasinh_data_internal(%rip), %ymm5, %ymm2 - vmulpd %ymm5, %ymm5, %ymm4 - vextractf128 $1, %ymm10, %xmm14 - vshufps $221, %xmm14, %xmm10, %xmm1 - -/* biased exponent in DP format */ - vcvtdq2pd %xmm1, %ymm7 - -/* exponent*log(2.0) */ - vmovupd Threshold+__svml_dasinh_data_internal(%rip), %ymm10 - -/* Add 31 to the exponent in the "large" case to get log(2 * input) */ - vaddpd dThirtyOne+__svml_dasinh_data_internal(%rip), %ymm7, %ymm6 - vblendvpd %ymm11, %ymm7, %ymm6, %ymm1 - -/* - * prepare table index - * table lookup - */ - vpsrlq $40, %ymm3, %ymm11 - vcmplt_oqpd %ymm3, %ymm10, %ymm3 - vandpd Bias+__svml_dasinh_data_internal(%rip), %ymm3, %ymm14 - vorpd Bias1+__svml_dasinh_data_internal(%rip), %ymm14, %ymm15 - vsubpd %ymm15, %ymm1, %ymm1 - vmulpd L2+__svml_dasinh_data_internal(%rip), %ymm1, %ymm3 - -/* polynomial */ - vmovupd poly_coeff+__svml_dasinh_data_internal(%rip), %ymm1 - vfmadd213pd poly_coeff+32+__svml_dasinh_data_internal(%rip), %ymm5, %ymm1 - vfmadd213pd %ymm2, %ymm4, %ymm1 - -/* reconstruction */ - vfmadd213pd %ymm5, %ymm4, %ymm1 - vextractf128 $1, %ymm11, %xmm7 - vmovd %xmm11, %edx - vmovd %xmm7, %esi - movslq %edx, %rdx - vpextrd $2, %xmm11, %ecx - movslq %esi, %rsi - vpextrd $2, %xmm7, %edi - movslq %ecx, %rcx - movslq %edi, %rdi - vmovsd (%r8,%rdx), %xmm0 - vmovsd (%r8,%rsi), %xmm8 - vmovhpd (%r8,%rcx), %xmm0, %xmm6 - vmovhpd (%r8,%rdi), %xmm8, %xmm9 - vinsertf128 $1, %xmm9, %ymm6, %ymm0 - vaddpd %ymm1, %ymm0, %ymm0 - vaddpd %ymm0, %ymm3, %ymm7 - -/* Finally, reincorporate the original sign. */ - vandpd dSign+__svml_dasinh_data_internal(%rip), %ymm13, %ymm6 - vxorpd %ymm7, %ymm6, %ymm0 - testl %eax, %eax - -/* Go to special inputs processing branch */ - jne L(SPECIAL_VALUES_BRANCH) - # LOE rbx r12 r13 r14 r15 eax ymm0 ymm13 - -/* Restore registers - * and exit the function - */ + pushq %rbp + cfi_def_cfa_offset(16) + movq %rsp, %rbp + cfi_def_cfa(6, 16) + cfi_offset(6, -16) + andq $-32, %rsp + subq $96, %rsp + lea Table_Lookup_Bias+__svml_dasinh_data_internal(%rip), %r8 + vmovapd %ymm0, %ymm13 + vmovupd SgnMask+__svml_dasinh_data_internal(%rip), %ymm9 + + /* Load the constant 1 and a sign mask */ + vmovupd One+__svml_dasinh_data_internal(%rip), %ymm12 + + /* No need to split X when FMA is available in hardware. */ + vmulpd %ymm13, %ymm13, %ymm8 + + /* + * Get the absolute value of the input, since we will exploit antisymmetry + * and mostly assume X >= 0 in the core computation + */ + vandpd %ymm9, %ymm13, %ymm10 + + /* + * Check whether the input is finite, by checking |X| <= MaxFloat + * Otherwise set the rangemask so that the callout will get used. 
+ * Note that this will also use the callout for NaNs since not(NaN <= MaxFloat) + */ + vcmpnle_uqpd dLargestFinite+__svml_dasinh_data_internal(%rip), %ymm10, %ymm14 + + /* + * Finally, express Y + W = X^2 + 1 accurately where Y has <= 29 bits. + * If |X| <= 1 then |XHi| <= 1 and so |X2Hi| <= 1, so we can treat 1 + * as the dominant component in the compensated summation. Otherwise, + * if |X| >= 1, then since X2Hi only has 52 significant bits, the basic + * addition will be exact anyway until we get to |X| >= 2^53. But by + * that time the log function is well-conditioned enough that the + * rounding error doesn't matter. Hence we can treat 1 as dominant even + * if it literally isn't. + */ + vaddpd %ymm8, %ymm12, %ymm5 + + /* + * The following computation can go wrong for very large X, basically + * because X^2 overflows. But for large X we have + * asinh(X) / log(2 X) - 1 =~= 1/(4 * X^2), so for X >= 2^30 + * we can just later stick X back into the log and tweak up the exponent. + * Actually we scale X by 2^-30 and tweak the exponent up by 31, + * to stay in the safe range for the later log computation. + * Compute a flag now telling us when do do this. + */ + vcmplt_oqpd dBigThreshold+__svml_dasinh_data_internal(%rip), %ymm10, %ymm11 + vsubpd %ymm5, %ymm12, %ymm15 + vmovmskpd %ymm14, %eax + vandpd dTopMask29+__svml_dasinh_data_internal(%rip), %ymm5, %ymm14 + + /* + * Compute R = 1/sqrt(Y + W) * (1 + d) + * Force R to <= 12 significant bits in case it isn't already + * This means that R * Y and R^2 * Y are exactly representable. + */ + vcvtpd2ps %ymm14, %xmm1 + vaddpd %ymm15, %ymm8, %ymm0 + vsubpd %ymm14, %ymm5, %ymm2 + vrsqrtps %xmm1, %xmm3 + vmovapd %ymm13, %ymm7 + vfmsub213pd %ymm8, %ymm13, %ymm7 + vcvtps2pd %xmm3, %ymm6 + vaddpd %ymm0, %ymm7, %ymm4 + + /* + * Unfortunately, we can still be in trouble if |X| <= 2^-10, since + * the absolute error 2^-(12+53)-ish in sqrt(1 + X^2) gets scaled up + * by 1/X and comes close to our threshold. Hence if |X| <= 2^-9, + * perform an alternative computation + * sqrt(1 + X^2) - 1 = X^2/2 - X^4/8 + X^6/16 + * X2 = X^2 + */ + vaddpd %ymm7, %ymm8, %ymm7 + vaddpd %ymm2, %ymm4, %ymm15 + + /* + * Now 1 / (1 + d) + * = 1 / (1 + (sqrt(1 - e) - 1)) + * = 1 / sqrt(1 - e) + * = 1 + 1/2 * e + 3/8 * e^2 + 5/16 * e^3 + 35/128 * e^4 + + * 63/256 * e^5 + 231/1024 * e^6 + .... + * So compute the first five nonconstant terms of that, so that + * we have a relative correction (1 + Corr) to apply to S etc. + * C1 = 1/2 + * C2 = 3/8 + * C3 = 5/16 + * C4 = 35/128 + * C5 = 63/256 + */ + vmovupd dC5+__svml_dasinh_data_internal(%rip), %ymm4 + vandpd dTopMask12+__svml_dasinh_data_internal(%rip), %ymm6, %ymm0 + + /* + * Compute S = (Y/sqrt(Y + W)) * (1 + d) + * and T = (W/sqrt(Y + W)) * (1 + d) + * so that S + T = sqrt(Y + W) * (1 + d) + * S is exact, and the rounding error in T is OK. + */ + vmulpd %ymm0, %ymm14, %ymm3 + vmulpd %ymm15, %ymm0, %ymm1 + vmovupd dHalf+__svml_dasinh_data_internal(%rip), %ymm6 + vsubpd %ymm12, %ymm3, %ymm14 + + /* + * Obtain sqrt(1 + X^2) - 1 in two pieces + * sqrt(1 + X^2) - 1 + * = sqrt(Y + W) - 1 + * = (S + T) * (1 + Corr) - 1 + * = [S - 1] + [T + (S + T) * Corr] + * We need a compensated summation for the last part. 
We treat S - 1 + * as the larger part; it certainly is until about X < 2^-4, and in that + * case, the error is affordable since X dominates over sqrt(1 + X^2) - 1 + * Final sum is dTmp5 (hi) + dTmp7 (lo) + */ + vaddpd %ymm1, %ymm3, %ymm2 + + /* + * Compute e = -(2 * d + d^2) + * The first FMR is exact, and the rounding error in the other is acceptable + * since d and e are ~ 2^-12 + */ + vmovapd %ymm12, %ymm5 + vfnmadd231pd %ymm3, %ymm0, %ymm5 + vfnmadd231pd %ymm1, %ymm0, %ymm5 + vfmadd213pd dC4+__svml_dasinh_data_internal(%rip), %ymm5, %ymm4 + vfmadd213pd dC3+__svml_dasinh_data_internal(%rip), %ymm5, %ymm4 + vfmadd213pd dC2+__svml_dasinh_data_internal(%rip), %ymm5, %ymm4 + vfmadd213pd %ymm6, %ymm5, %ymm4 + vmulpd %ymm4, %ymm5, %ymm0 + vfmadd213pd %ymm1, %ymm2, %ymm0 + + /* Now multiplex the two possible computations */ + vcmple_oqpd dLittleThreshold+__svml_dasinh_data_internal(%rip), %ymm10, %ymm2 + vaddpd %ymm14, %ymm0, %ymm15 + + /* dX2over2 = X^2/2 */ + vmulpd %ymm7, %ymm6, %ymm0 + + /* dX4over4 = X^4/4 */ + vmulpd %ymm0, %ymm0, %ymm8 + + /* dX46 = -X^4/4 + X^6/8 */ + vfmsub231pd %ymm0, %ymm8, %ymm8 + + /* dX46over2 = -X^4/8 + x^6/16 */ + vmulpd %ymm8, %ymm6, %ymm5 + + /* 2^ (-10-exp(X) ) */ + vmovupd ExpMask2+__svml_dasinh_data_internal(%rip), %ymm8 + vaddpd %ymm5, %ymm0, %ymm4 + vblendvpd %ymm2, %ymm4, %ymm15, %ymm1 + + /* + * Now do another compensated sum to add |X| + [sqrt(1 + X^2) - 1]. + * It's always safe to assume |X| is larger. + * This is the final 2-part argument to the log1p function + */ + vaddpd %ymm1, %ymm10, %ymm3 + + /* Now multiplex to the case X = 2^-30 * |input|, Xl = dL = 0 in the "big" case. */ + vmulpd XScale+__svml_dasinh_data_internal(%rip), %ymm10, %ymm10 + + /* + * Now we feed into the log1p code, using H in place of _VARG1 and + * also adding L into Xl. + * compute 1+x as high, low parts + */ + vmaxpd %ymm3, %ymm12, %ymm6 + vminpd %ymm3, %ymm12, %ymm7 + vandpd %ymm9, %ymm3, %ymm9 + vcmplt_oqpd XThreshold+__svml_dasinh_data_internal(%rip), %ymm9, %ymm0 + vaddpd %ymm7, %ymm6, %ymm5 + vorpd XhMask+__svml_dasinh_data_internal(%rip), %ymm0, %ymm4 + vandpd %ymm4, %ymm5, %ymm1 + vblendvpd %ymm11, %ymm1, %ymm10, %ymm5 + vsubpd %ymm1, %ymm6, %ymm2 + + /* exponent bits */ + vpsrlq $20, %ymm5, %ymm10 + vaddpd %ymm2, %ymm7, %ymm3 + + /* + * Now resume the main code. 
+ * preserve mantissa, set input exponent to 2^(-10) + */ + vandpd ExpMask+__svml_dasinh_data_internal(%rip), %ymm5, %ymm0 + vorpd Two10+__svml_dasinh_data_internal(%rip), %ymm0, %ymm2 + + /* reciprocal approximation good to at least 11 bits */ + vcvtpd2ps %ymm2, %xmm6 + vrcpps %xmm6, %xmm7 + vcvtps2pd %xmm7, %ymm15 + + /* exponent of X needed to scale Xl */ + vandps ExpMask0+__svml_dasinh_data_internal(%rip), %ymm5, %ymm9 + vpsubq %ymm9, %ymm8, %ymm0 + vandpd %ymm11, %ymm3, %ymm4 + + /* round reciprocal to nearest integer, will have 1+9 mantissa bits */ + vroundpd $0, %ymm15, %ymm3 + + /* scale DblRcp */ + vmulpd %ymm0, %ymm3, %ymm2 + + /* argument reduction */ + vfmsub213pd %ymm12, %ymm2, %ymm5 + vmulpd %ymm2, %ymm4, %ymm12 + vmovupd poly_coeff+64+__svml_dasinh_data_internal(%rip), %ymm2 + vaddpd %ymm12, %ymm5, %ymm5 + vfmadd213pd poly_coeff+96+__svml_dasinh_data_internal(%rip), %ymm5, %ymm2 + vmulpd %ymm5, %ymm5, %ymm4 + vextractf128 $1, %ymm10, %xmm14 + vshufps $221, %xmm14, %xmm10, %xmm1 + + /* biased exponent in DP format */ + vcvtdq2pd %xmm1, %ymm7 + + /* exponent*log(2.0) */ + vmovupd Threshold+__svml_dasinh_data_internal(%rip), %ymm10 + + /* Add 31 to the exponent in the "large" case to get log(2 * input) */ + vaddpd dThirtyOne+__svml_dasinh_data_internal(%rip), %ymm7, %ymm6 + vblendvpd %ymm11, %ymm7, %ymm6, %ymm1 + + /* + * prepare table index + * table lookup + */ + vpsrlq $40, %ymm3, %ymm11 + vcmplt_oqpd %ymm3, %ymm10, %ymm3 + vandpd Bias+__svml_dasinh_data_internal(%rip), %ymm3, %ymm14 + vorpd Bias1+__svml_dasinh_data_internal(%rip), %ymm14, %ymm15 + vsubpd %ymm15, %ymm1, %ymm1 + vmulpd L2+__svml_dasinh_data_internal(%rip), %ymm1, %ymm3 + + /* polynomial */ + vmovupd poly_coeff+__svml_dasinh_data_internal(%rip), %ymm1 + vfmadd213pd poly_coeff+32+__svml_dasinh_data_internal(%rip), %ymm5, %ymm1 + vfmadd213pd %ymm2, %ymm4, %ymm1 + + /* reconstruction */ + vfmadd213pd %ymm5, %ymm4, %ymm1 + vextractf128 $1, %ymm11, %xmm7 + vmovd %xmm11, %edx + vmovd %xmm7, %esi + movslq %edx, %rdx + vpextrd $2, %xmm11, %ecx + movslq %esi, %rsi + vpextrd $2, %xmm7, %edi + movslq %ecx, %rcx + movslq %edi, %rdi + vmovsd (%r8, %rdx), %xmm0 + vmovsd (%r8, %rsi), %xmm8 + vmovhpd (%r8, %rcx), %xmm0, %xmm6 + vmovhpd (%r8, %rdi), %xmm8, %xmm9 + vinsertf128 $1, %xmm9, %ymm6, %ymm0 + vaddpd %ymm1, %ymm0, %ymm0 + vaddpd %ymm0, %ymm3, %ymm7 + + /* Finally, reincorporate the original sign. 
*/ + vandpd dSign+__svml_dasinh_data_internal(%rip), %ymm13, %ymm6 + vxorpd %ymm7, %ymm6, %ymm0 + testl %eax, %eax + + /* Go to special inputs processing branch */ + jne L(SPECIAL_VALUES_BRANCH) + # LOE rbx r12 r13 r14 r15 eax ymm0 ymm13 + + /* Restore registers + * and exit the function + */ L(EXIT): - movq %rbp, %rsp - popq %rbp - cfi_def_cfa(7, 8) - cfi_restore(6) - ret - cfi_def_cfa(6, 16) - cfi_offset(6, -16) - -/* Branch to process - * special inputs - */ + movq %rbp, %rsp + popq %rbp + cfi_def_cfa(7, 8) + cfi_restore(6) + ret + cfi_def_cfa(6, 16) + cfi_offset(6, -16) + + /* Branch to process + * special inputs + */ L(SPECIAL_VALUES_BRANCH): - vmovupd %ymm13, 32(%rsp) - vmovupd %ymm0, 64(%rsp) - # LOE rbx r12 r13 r14 r15 eax ymm0 - - xorl %edx, %edx - # LOE rbx r12 r13 r14 r15 eax edx - - vzeroupper - movq %r12, 16(%rsp) - /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */ - .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22 - movl %edx, %r12d - movq %r13, 8(%rsp) - /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */ - .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22 - movl %eax, %r13d - movq %r14, (%rsp) - /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */ - .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22 |