| | | |
|---|---|---|
| author | Sunil K Pandey <skpgkp2@gmail.com> | 2022-03-07 10:47:09 -0800 |
| committer | Sunil K Pandey <skpgkp2@gmail.com> | 2022-03-07 21:14:09 -0800 |
| commit | b61bfd101e23489feac53c0dbe8ba3a5e5a44aa0 (patch) | |
| tree | 299a228ef0972f8edc51e729bebe6f3ca4401e41 | |
| parent | a7ab967662656e8e7db43d94a075c947444a981a (diff) | |
| download | glibc-b61bfd101e23489feac53c0dbe8ba3a5e5a44aa0.tar.xz glibc-b61bfd101e23489feac53c0dbe8ba3a5e5a44aa0.zip | |
x86_64: Fix svml_d_asinh4_core_avx2.S code formatting
This commit contains the following formatting changes, illustrated
in the sketch below:
1. Instructions are preceded by a tab.
2. Instructions less than 8 characters in length have a tab
between the mnemonic and the first operand.
3. Instructions greater than 7 characters in length have a
space between the mnemonic and the first operand.
4. Tabs after `#define`d names, before their values.
5. Eight spaces at the beginning of a line replaced by a tab.
6. Comments indented with the code.
7. The redundant .text section removed.
8. One space between line content and a line comment.
9. A space after every comma.
Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>
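For reference, here is a condensed sketch of those conventions, assembled from representative lines of the reformatted file; the trailing `/* rule N */` annotations are editorial and are not part of the source:

```asm
/* Rule 4: a tab between a #define'd name and its value. */
#define ExpMask	12480

	.section .text.avx2, "ax", @progbits

	/* Rule 6: comments are indented with the code. */
	vmovupd	One+__svml_dasinh_data_internal(%rip), %ymm12 /* rules 1, 2: tab before, and after, the 7-char mnemonic */
	vfmadd213pd %ymm6, %ymm5, %ymm4 /* rules 3, 9: space after the 11-char mnemonic and after each comma */
	vmovsd	(%r8, %rdx), %xmm0 /* rule 8: one space before this line comment */
```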
| -rw-r--r-- | sysdeps/x86_64/fpu/multiarch/svml_d_asinh4_core_avx2.S | 3077 |
1 file changed, 1538 insertions, 1539 deletions
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_asinh4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_asinh4_core_avx2.S index 636637b4b1..131b716c95 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_asinh4_core_avx2.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_asinh4_core_avx2.S @@ -31,1571 +31,1570 @@ /* Offsets for data table __svml_dasinh_data_internal */ -#define Log_HA_table 0 -#define Log_LA_table 8224 -#define poly_coeff 12352 -#define ExpMask 12480 -#define Two10 12512 -#define MinLog1p 12544 -#define MaxLog1p 12576 -#define One 12608 -#define SgnMask 12640 -#define XThreshold 12672 -#define XhMask 12704 -#define Threshold 12736 -#define Bias 12768 -#define Bias1 12800 -#define ExpMask0 12832 -#define ExpMask2 12864 -#define L2 12896 -#define dBigThreshold 12928 -#define dC2 12960 -#define dC3 12992 -#define dC4 13024 -#define dC5 13056 -#define dHalf 13088 -#define dLargestFinite 13120 -#define dLittleThreshold 13152 -#define dSign 13184 -#define dThirtyOne 13216 -#define dTopMask12 13248 -#define dTopMask29 13280 -#define XScale 13312 +#define Log_HA_table 0 +#define Log_LA_table 8224 +#define poly_coeff 12352 +#define ExpMask 12480 +#define Two10 12512 +#define MinLog1p 12544 +#define MaxLog1p 12576 +#define One 12608 +#define SgnMask 12640 +#define XThreshold 12672 +#define XhMask 12704 +#define Threshold 12736 +#define Bias 12768 +#define Bias1 12800 +#define ExpMask0 12832 +#define ExpMask2 12864 +#define L2 12896 +#define dBigThreshold 12928 +#define dC2 12960 +#define dC3 12992 +#define dC4 13024 +#define dC5 13056 +#define dHalf 13088 +#define dLargestFinite 13120 +#define dLittleThreshold 13152 +#define dSign 13184 +#define dThirtyOne 13216 +#define dTopMask12 13248 +#define dTopMask29 13280 +#define XScale 13312 /* Lookup bias for data table __svml_dasinh_data_internal. */ -#define Table_Lookup_Bias -0x405fe0 +#define Table_Lookup_Bias -0x405fe0 #include <sysdep.h> - .text - .section .text.avx2,"ax",@progbits + .section .text.avx2, "ax", @progbits ENTRY(_ZGVdN4v_asinh_avx2) - pushq %rbp - cfi_def_cfa_offset(16) - movq %rsp, %rbp - cfi_def_cfa(6, 16) - cfi_offset(6, -16) - andq $-32, %rsp - subq $96, %rsp - lea Table_Lookup_Bias+__svml_dasinh_data_internal(%rip), %r8 - vmovapd %ymm0, %ymm13 - vmovupd SgnMask+__svml_dasinh_data_internal(%rip), %ymm9 - -/* Load the constant 1 and a sign mask */ - vmovupd One+__svml_dasinh_data_internal(%rip), %ymm12 - -/* No need to split X when FMA is available in hardware. */ - vmulpd %ymm13, %ymm13, %ymm8 - -/* - * Get the absolute value of the input, since we will exploit antisymmetry - * and mostly assume X >= 0 in the core computation - */ - vandpd %ymm9, %ymm13, %ymm10 - -/* - * Check whether the input is finite, by checking |X| <= MaxFloat - * Otherwise set the rangemask so that the callout will get used. - * Note that this will also use the callout for NaNs since not(NaN <= MaxFloat) - */ - vcmpnle_uqpd dLargestFinite+__svml_dasinh_data_internal(%rip), %ymm10, %ymm14 - -/* - * Finally, express Y + W = X^2 + 1 accurately where Y has <= 29 bits. - * If |X| <= 1 then |XHi| <= 1 and so |X2Hi| <= 1, so we can treat 1 - * as the dominant component in the compensated summation. Otherwise, - * if |X| >= 1, then since X2Hi only has 52 significant bits, the basic - * addition will be exact anyway until we get to |X| >= 2^53. But by - * that time the log function is well-conditioned enough that the - * rounding error doesn't matter. Hence we can treat 1 as dominant even - * if it literally isn't. 
- */ - vaddpd %ymm8, %ymm12, %ymm5 - -/* - * The following computation can go wrong for very large X, basically - * because X^2 overflows. But for large X we have - * asinh(X) / log(2 X) - 1 =~= 1/(4 * X^2), so for X >= 2^30 - * we can just later stick X back into the log and tweak up the exponent. - * Actually we scale X by 2^-30 and tweak the exponent up by 31, - * to stay in the safe range for the later log computation. - * Compute a flag now telling us when do do this. - */ - vcmplt_oqpd dBigThreshold+__svml_dasinh_data_internal(%rip), %ymm10, %ymm11 - vsubpd %ymm5, %ymm12, %ymm15 - vmovmskpd %ymm14, %eax - vandpd dTopMask29+__svml_dasinh_data_internal(%rip), %ymm5, %ymm14 - -/* - * Compute R = 1/sqrt(Y + W) * (1 + d) - * Force R to <= 12 significant bits in case it isn't already - * This means that R * Y and R^2 * Y are exactly representable. - */ - vcvtpd2ps %ymm14, %xmm1 - vaddpd %ymm15, %ymm8, %ymm0 - vsubpd %ymm14, %ymm5, %ymm2 - vrsqrtps %xmm1, %xmm3 - vmovapd %ymm13, %ymm7 - vfmsub213pd %ymm8, %ymm13, %ymm7 - vcvtps2pd %xmm3, %ymm6 - vaddpd %ymm0, %ymm7, %ymm4 - -/* - * Unfortunately, we can still be in trouble if |X| <= 2^-10, since - * the absolute error 2^-(12+53)-ish in sqrt(1 + X^2) gets scaled up - * by 1/X and comes close to our threshold. Hence if |X| <= 2^-9, - * perform an alternative computation - * sqrt(1 + X^2) - 1 = X^2/2 - X^4/8 + X^6/16 - * X2 = X^2 - */ - vaddpd %ymm7, %ymm8, %ymm7 - vaddpd %ymm2, %ymm4, %ymm15 - -/* - * Now 1 / (1 + d) - * = 1 / (1 + (sqrt(1 - e) - 1)) - * = 1 / sqrt(1 - e) - * = 1 + 1/2 * e + 3/8 * e^2 + 5/16 * e^3 + 35/128 * e^4 + - * 63/256 * e^5 + 231/1024 * e^6 + .... - * So compute the first five nonconstant terms of that, so that - * we have a relative correction (1 + Corr) to apply to S etc. - * C1 = 1/2 - * C2 = 3/8 - * C3 = 5/16 - * C4 = 35/128 - * C5 = 63/256 - */ - vmovupd dC5+__svml_dasinh_data_internal(%rip), %ymm4 - vandpd dTopMask12+__svml_dasinh_data_internal(%rip), %ymm6, %ymm0 - -/* - * Compute S = (Y/sqrt(Y + W)) * (1 + d) - * and T = (W/sqrt(Y + W)) * (1 + d) - * so that S + T = sqrt(Y + W) * (1 + d) - * S is exact, and the rounding error in T is OK. - */ - vmulpd %ymm0, %ymm14, %ymm3 - vmulpd %ymm15, %ymm0, %ymm1 - vmovupd dHalf+__svml_dasinh_data_internal(%rip), %ymm6 - vsubpd %ymm12, %ymm3, %ymm14 - -/* - * Obtain sqrt(1 + X^2) - 1 in two pieces - * sqrt(1 + X^2) - 1 - * = sqrt(Y + W) - 1 - * = (S + T) * (1 + Corr) - 1 - * = [S - 1] + [T + (S + T) * Corr] - * We need a compensated summation for the last part. 
We treat S - 1 - * as the larger part; it certainly is until about X < 2^-4, and in that - * case, the error is affordable since X dominates over sqrt(1 + X^2) - 1 - * Final sum is dTmp5 (hi) + dTmp7 (lo) - */ - vaddpd %ymm1, %ymm3, %ymm2 - -/* - * Compute e = -(2 * d + d^2) - * The first FMR is exact, and the rounding error in the other is acceptable - * since d and e are ~ 2^-12 - */ - vmovapd %ymm12, %ymm5 - vfnmadd231pd %ymm3, %ymm0, %ymm5 - vfnmadd231pd %ymm1, %ymm0, %ymm5 - vfmadd213pd dC4+__svml_dasinh_data_internal(%rip), %ymm5, %ymm4 - vfmadd213pd dC3+__svml_dasinh_data_internal(%rip), %ymm5, %ymm4 - vfmadd213pd dC2+__svml_dasinh_data_internal(%rip), %ymm5, %ymm4 - vfmadd213pd %ymm6, %ymm5, %ymm4 - vmulpd %ymm4, %ymm5, %ymm0 - vfmadd213pd %ymm1, %ymm2, %ymm0 - -/* Now multiplex the two possible computations */ - vcmple_oqpd dLittleThreshold+__svml_dasinh_data_internal(%rip), %ymm10, %ymm2 - vaddpd %ymm14, %ymm0, %ymm15 - -/* dX2over2 = X^2/2 */ - vmulpd %ymm7, %ymm6, %ymm0 - -/* dX4over4 = X^4/4 */ - vmulpd %ymm0, %ymm0, %ymm8 - -/* dX46 = -X^4/4 + X^6/8 */ - vfmsub231pd %ymm0, %ymm8, %ymm8 - -/* dX46over2 = -X^4/8 + x^6/16 */ - vmulpd %ymm8, %ymm6, %ymm5 - -/* 2^ (-10-exp(X) ) */ - vmovupd ExpMask2+__svml_dasinh_data_internal(%rip), %ymm8 - vaddpd %ymm5, %ymm0, %ymm4 - vblendvpd %ymm2, %ymm4, %ymm15, %ymm1 - -/* - * Now do another compensated sum to add |X| + [sqrt(1 + X^2) - 1]. - * It's always safe to assume |X| is larger. - * This is the final 2-part argument to the log1p function - */ - vaddpd %ymm1, %ymm10, %ymm3 - -/* Now multiplex to the case X = 2^-30 * |input|, Xl = dL = 0 in the "big" case. */ - vmulpd XScale+__svml_dasinh_data_internal(%rip), %ymm10, %ymm10 - -/* - * Now we feed into the log1p code, using H in place of _VARG1 and - * also adding L into Xl. - * compute 1+x as high, low parts - */ - vmaxpd %ymm3, %ymm12, %ymm6 - vminpd %ymm3, %ymm12, %ymm7 - vandpd %ymm9, %ymm3, %ymm9 - vcmplt_oqpd XThreshold+__svml_dasinh_data_internal(%rip), %ymm9, %ymm0 - vaddpd %ymm7, %ymm6, %ymm5 - vorpd XhMask+__svml_dasinh_data_internal(%rip), %ymm0, %ymm4 - vandpd %ymm4, %ymm5, %ymm1 - vblendvpd %ymm11, %ymm1, %ymm10, %ymm5 - vsubpd %ymm1, %ymm6, %ymm2 - -/* exponent bits */ - vpsrlq $20, %ymm5, %ymm10 - vaddpd %ymm2, %ymm7, %ymm3 - -/* - * Now resume the main code. 
- * preserve mantissa, set input exponent to 2^(-10) - */ - vandpd ExpMask+__svml_dasinh_data_internal(%rip), %ymm5, %ymm0 - vorpd Two10+__svml_dasinh_data_internal(%rip), %ymm0, %ymm2 - -/* reciprocal approximation good to at least 11 bits */ - vcvtpd2ps %ymm2, %xmm6 - vrcpps %xmm6, %xmm7 - vcvtps2pd %xmm7, %ymm15 - -/* exponent of X needed to scale Xl */ - vandps ExpMask0+__svml_dasinh_data_internal(%rip), %ymm5, %ymm9 - vpsubq %ymm9, %ymm8, %ymm0 - vandpd %ymm11, %ymm3, %ymm4 - -/* round reciprocal to nearest integer, will have 1+9 mantissa bits */ - vroundpd $0, %ymm15, %ymm3 - -/* scale DblRcp */ - vmulpd %ymm0, %ymm3, %ymm2 - -/* argument reduction */ - vfmsub213pd %ymm12, %ymm2, %ymm5 - vmulpd %ymm2, %ymm4, %ymm12 - vmovupd poly_coeff+64+__svml_dasinh_data_internal(%rip), %ymm2 - vaddpd %ymm12, %ymm5, %ymm5 - vfmadd213pd poly_coeff+96+__svml_dasinh_data_internal(%rip), %ymm5, %ymm2 - vmulpd %ymm5, %ymm5, %ymm4 - vextractf128 $1, %ymm10, %xmm14 - vshufps $221, %xmm14, %xmm10, %xmm1 - -/* biased exponent in DP format */ - vcvtdq2pd %xmm1, %ymm7 - -/* exponent*log(2.0) */ - vmovupd Threshold+__svml_dasinh_data_internal(%rip), %ymm10 - -/* Add 31 to the exponent in the "large" case to get log(2 * input) */ - vaddpd dThirtyOne+__svml_dasinh_data_internal(%rip), %ymm7, %ymm6 - vblendvpd %ymm11, %ymm7, %ymm6, %ymm1 - -/* - * prepare table index - * table lookup - */ - vpsrlq $40, %ymm3, %ymm11 - vcmplt_oqpd %ymm3, %ymm10, %ymm3 - vandpd Bias+__svml_dasinh_data_internal(%rip), %ymm3, %ymm14 - vorpd Bias1+__svml_dasinh_data_internal(%rip), %ymm14, %ymm15 - vsubpd %ymm15, %ymm1, %ymm1 - vmulpd L2+__svml_dasinh_data_internal(%rip), %ymm1, %ymm3 - -/* polynomial */ - vmovupd poly_coeff+__svml_dasinh_data_internal(%rip), %ymm1 - vfmadd213pd poly_coeff+32+__svml_dasinh_data_internal(%rip), %ymm5, %ymm1 - vfmadd213pd %ymm2, %ymm4, %ymm1 - -/* reconstruction */ - vfmadd213pd %ymm5, %ymm4, %ymm1 - vextractf128 $1, %ymm11, %xmm7 - vmovd %xmm11, %edx - vmovd %xmm7, %esi - movslq %edx, %rdx - vpextrd $2, %xmm11, %ecx - movslq %esi, %rsi - vpextrd $2, %xmm7, %edi - movslq %ecx, %rcx - movslq %edi, %rdi - vmovsd (%r8,%rdx), %xmm0 - vmovsd (%r8,%rsi), %xmm8 - vmovhpd (%r8,%rcx), %xmm0, %xmm6 - vmovhpd (%r8,%rdi), %xmm8, %xmm9 - vinsertf128 $1, %xmm9, %ymm6, %ymm0 - vaddpd %ymm1, %ymm0, %ymm0 - vaddpd %ymm0, %ymm3, %ymm7 - -/* Finally, reincorporate the original sign. */ - vandpd dSign+__svml_dasinh_data_internal(%rip), %ymm13, %ymm6 - vxorpd %ymm7, %ymm6, %ymm0 - testl %eax, %eax - -/* Go to special inputs processing branch */ - jne L(SPECIAL_VALUES_BRANCH) - # LOE rbx r12 r13 r14 r15 eax ymm0 ymm13 - -/* Restore registers - * and exit the function - */ + pushq %rbp + cfi_def_cfa_offset(16) + movq %rsp, %rbp + cfi_def_cfa(6, 16) + cfi_offset(6, -16) + andq $-32, %rsp + subq $96, %rsp + lea Table_Lookup_Bias+__svml_dasinh_data_internal(%rip), %r8 + vmovapd %ymm0, %ymm13 + vmovupd SgnMask+__svml_dasinh_data_internal(%rip), %ymm9 + + /* Load the constant 1 and a sign mask */ + vmovupd One+__svml_dasinh_data_internal(%rip), %ymm12 + + /* No need to split X when FMA is available in hardware. */ + vmulpd %ymm13, %ymm13, %ymm8 + + /* + * Get the absolute value of the input, since we will exploit antisymmetry + * and mostly assume X >= 0 in the core computation + */ + vandpd %ymm9, %ymm13, %ymm10 + + /* + * Check whether the input is finite, by checking |X| <= MaxFloat + * Otherwise set the rangemask so that the callout will get used. 
+ * Note that this will also use the callout for NaNs since not(NaN <= MaxFloat) + */ + vcmpnle_uqpd dLargestFinite+__svml_dasinh_data_internal(%rip), %ymm10, %ymm14 + + /* + * Finally, express Y + W = X^2 + 1 accurately where Y has <= 29 bits. + * If |X| <= 1 then |XHi| <= 1 and so |X2Hi| <= 1, so we can treat 1 + * as the dominant component in the compensated summation. Otherwise, + * if |X| >= 1, then since X2Hi only has 52 significant bits, the basic + * addition will be exact anyway until we get to |X| >= 2^53. But by + * that time the log function is well-conditioned enough that the + * rounding error doesn't matter. Hence we can treat 1 as dominant even + * if it literally isn't. + */ + vaddpd %ymm8, %ymm12, %ymm5 + + /* + * The following computation can go wrong for very large X, basically + * because X^2 overflows. But for large X we have + * asinh(X) / log(2 X) - 1 =~= 1/(4 * X^2), so for X >= 2^30 + * we can just later stick X back into the log and tweak up the exponent. + * Actually we scale X by 2^-30 and tweak the exponent up by 31, + * to stay in the safe range for the later log computation. + * Compute a flag now telling us when do do this. + */ + vcmplt_oqpd dBigThreshold+__svml_dasinh_data_internal(%rip), %ymm10, %ymm11 + vsubpd %ymm5, %ymm12, %ymm15 + vmovmskpd %ymm14, %eax + vandpd dTopMask29+__svml_dasinh_data_internal(%rip), %ymm5, %ymm14 + + /* + * Compute R = 1/sqrt(Y + W) * (1 + d) + * Force R to <= 12 significant bits in case it isn't already + * This means that R * Y and R^2 * Y are exactly representable. + */ + vcvtpd2ps %ymm14, %xmm1 + vaddpd %ymm15, %ymm8, %ymm0 + vsubpd %ymm14, %ymm5, %ymm2 + vrsqrtps %xmm1, %xmm3 + vmovapd %ymm13, %ymm7 + vfmsub213pd %ymm8, %ymm13, %ymm7 + vcvtps2pd %xmm3, %ymm6 + vaddpd %ymm0, %ymm7, %ymm4 + + /* + * Unfortunately, we can still be in trouble if |X| <= 2^-10, since + * the absolute error 2^-(12+53)-ish in sqrt(1 + X^2) gets scaled up + * by 1/X and comes close to our threshold. Hence if |X| <= 2^-9, + * perform an alternative computation + * sqrt(1 + X^2) - 1 = X^2/2 - X^4/8 + X^6/16 + * X2 = X^2 + */ + vaddpd %ymm7, %ymm8, %ymm7 + vaddpd %ymm2, %ymm4, %ymm15 + + /* + * Now 1 / (1 + d) + * = 1 / (1 + (sqrt(1 - e) - 1)) + * = 1 / sqrt(1 - e) + * = 1 + 1/2 * e + 3/8 * e^2 + 5/16 * e^3 + 35/128 * e^4 + + * 63/256 * e^5 + 231/1024 * e^6 + .... + * So compute the first five nonconstant terms of that, so that + * we have a relative correction (1 + Corr) to apply to S etc. + * C1 = 1/2 + * C2 = 3/8 + * C3 = 5/16 + * C4 = 35/128 + * C5 = 63/256 + */ + vmovupd dC5+__svml_dasinh_data_internal(%rip), %ymm4 + vandpd dTopMask12+__svml_dasinh_data_internal(%rip), %ymm6, %ymm0 + + /* + * Compute S = (Y/sqrt(Y + W)) * (1 + d) + * and T = (W/sqrt(Y + W)) * (1 + d) + * so that S + T = sqrt(Y + W) * (1 + d) + * S is exact, and the rounding error in T is OK. + */ + vmulpd %ymm0, %ymm14, %ymm3 + vmulpd %ymm15, %ymm0, %ymm1 + vmovupd dHalf+__svml_dasinh_data_internal(%rip), %ymm6 + vsubpd %ymm12, %ymm3, %ymm14 + + /* + * Obtain sqrt(1 + X^2) - 1 in two pieces + * sqrt(1 + X^2) - 1 + * = sqrt(Y + W) - 1 + * = (S + T) * (1 + Corr) - 1 + * = [S - 1] + [T + (S + T) * Corr] + * We need a compensated summation for the last part. 
We treat S - 1 + * as the larger part; it certainly is until about X < 2^-4, and in that + * case, the error is affordable since X dominates over sqrt(1 + X^2) - 1 + * Final sum is dTmp5 (hi) + dTmp7 (lo) + */ + vaddpd %ymm1, %ymm3, %ymm2 + + /* + * Compute e = -(2 * d + d^2) + * The first FMR is exact, and the rounding error in the other is acceptable + * since d and e are ~ 2^-12 + */ + vmovapd %ymm12, %ymm5 + vfnmadd231pd %ymm3, %ymm0, %ymm5 + vfnmadd231pd %ymm1, %ymm0, %ymm5 + vfmadd213pd dC4+__svml_dasinh_data_internal(%rip), %ymm5, %ymm4 + vfmadd213pd dC3+__svml_dasinh_data_internal(%rip), %ymm5, %ymm4 + vfmadd213pd dC2+__svml_dasinh_data_internal(%rip), %ymm5, %ymm4 + vfmadd213pd %ymm6, %ymm5, %ymm4 + vmulpd %ymm4, %ymm5, %ymm0 + vfmadd213pd %ymm1, %ymm2, %ymm0 + + /* Now multiplex the two possible computations */ + vcmple_oqpd dLittleThreshold+__svml_dasinh_data_internal(%rip), %ymm10, %ymm2 + vaddpd %ymm14, %ymm0, %ymm15 + + /* dX2over2 = X^2/2 */ + vmulpd %ymm7, %ymm6, %ymm0 + + /* dX4over4 = X^4/4 */ + vmulpd %ymm0, %ymm0, %ymm8 + + /* dX46 = -X^4/4 + X^6/8 */ + vfmsub231pd %ymm0, %ymm8, %ymm8 + + /* dX46over2 = -X^4/8 + x^6/16 */ + vmulpd %ymm8, %ymm6, %ymm5 + + /* 2^ (-10-exp(X) ) */ + vmovupd ExpMask2+__svml_dasinh_data_internal(%rip), %ymm8 + vaddpd %ymm5, %ymm0, %ymm4 + vblendvpd %ymm2, %ymm4, %ymm15, %ymm1 + + /* + * Now do another compensated sum to add |X| + [sqrt(1 + X^2) - 1]. + * It's always safe to assume |X| is larger. + * This is the final 2-part argument to the log1p function + */ + vaddpd %ymm1, %ymm10, %ymm3 + + /* Now multiplex to the case X = 2^-30 * |input|, Xl = dL = 0 in the "big" case. */ + vmulpd XScale+__svml_dasinh_data_internal(%rip), %ymm10, %ymm10 + + /* + * Now we feed into the log1p code, using H in place of _VARG1 and + * also adding L into Xl. + * compute 1+x as high, low parts + */ + vmaxpd %ymm3, %ymm12, %ymm6 + vminpd %ymm3, %ymm12, %ymm7 + vandpd %ymm9, %ymm3, %ymm9 + vcmplt_oqpd XThreshold+__svml_dasinh_data_internal(%rip), %ymm9, %ymm0 + vaddpd %ymm7, %ymm6, %ymm5 + vorpd XhMask+__svml_dasinh_data_internal(%rip), %ymm0, %ymm4 + vandpd %ymm4, %ymm5, %ymm1 + vblendvpd %ymm11, %ymm1, %ymm10, %ymm5 + vsubpd %ymm1, %ymm6, %ymm2 + + /* exponent bits */ + vpsrlq $20, %ymm5, %ymm10 + vaddpd %ymm2, %ymm7, %ymm3 + + /* + * Now resume the main code. 
+ * preserve mantissa, set input exponent to 2^(-10) + */ + vandpd ExpMask+__svml_dasinh_data_internal(%rip), %ymm5, %ymm0 + vorpd Two10+__svml_dasinh_data_internal(%rip), %ymm0, %ymm2 + + /* reciprocal approximation good to at least 11 bits */ + vcvtpd2ps %ymm2, %xmm6 + vrcpps %xmm6, %xmm7 + vcvtps2pd %xmm7, %ymm15 + + /* exponent of X needed to scale Xl */ + vandps ExpMask0+__svml_dasinh_data_internal(%rip), %ymm5, %ymm9 + vpsubq %ymm9, %ymm8, %ymm0 + vandpd %ymm11, %ymm3, %ymm4 + + /* round reciprocal to nearest integer, will have 1+9 mantissa bits */ + vroundpd $0, %ymm15, %ymm3 + + /* scale DblRcp */ + vmulpd %ymm0, %ymm3, %ymm2 + + /* argument reduction */ + vfmsub213pd %ymm12, %ymm2, %ymm5 + vmulpd %ymm2, %ymm4, %ymm12 + vmovupd poly_coeff+64+__svml_dasinh_data_internal(%rip), %ymm2 + vaddpd %ymm12, %ymm5, %ymm5 + vfmadd213pd poly_coeff+96+__svml_dasinh_data_internal(%rip), %ymm5, %ymm2 + vmulpd %ymm5, %ymm5, %ymm4 + vextractf128 $1, %ymm10, %xmm14 + vshufps $221, %xmm14, %xmm10, %xmm1 + + /* biased exponent in DP format */ + vcvtdq2pd %xmm1, %ymm7 + + /* exponent*log(2.0) */ + vmovupd Threshold+__svml_dasinh_data_internal(%rip), %ymm10 + + /* Add 31 to the exponent in the "large" case to get log(2 * input) */ + vaddpd dThirtyOne+__svml_dasinh_data_internal(%rip), %ymm7, %ymm6 + vblendvpd %ymm11, %ymm7, %ymm6, %ymm1 + + /* + * prepare table index + * table lookup + */ + vpsrlq $40, %ymm3, %ymm11 + vcmplt_oqpd %ymm3, %ymm10, %ymm3 + vandpd Bias+__svml_dasinh_data_internal(%rip), %ymm3, %ymm14 + vorpd Bias1+__svml_dasinh_data_internal(%rip), %ymm14, %ymm15 + vsubpd %ymm15, %ymm1, %ymm1 + vmulpd L2+__svml_dasinh_data_internal(%rip), %ymm1, %ymm3 + + /* polynomial */ + vmovupd poly_coeff+__svml_dasinh_data_internal(%rip), %ymm1 + vfmadd213pd poly_coeff+32+__svml_dasinh_data_internal(%rip), %ymm5, %ymm1 + vfmadd213pd %ymm2, %ymm4, %ymm1 + + /* reconstruction */ + vfmadd213pd %ymm5, %ymm4, %ymm1 + vextractf128 $1, %ymm11, %xmm7 + vmovd %xmm11, %edx + vmovd %xmm7, %esi + movslq %edx, %rdx + vpextrd $2, %xmm11, %ecx + movslq %esi, %rsi + vpextrd $2, %xmm7, %edi + movslq %ecx, %rcx + movslq %edi, %rdi + vmovsd (%r8, %rdx), %xmm0 + vmovsd (%r8, %rsi), %xmm8 + vmovhpd (%r8, %rcx), %xmm0, %xmm6 + vmovhpd (%r8, %rdi), %xmm8, %xmm9 + vinsertf128 $1, %xmm9, %ymm6, %ymm0 + vaddpd %ymm1, %ymm0, %ymm0 + vaddpd %ymm0, %ymm3, %ymm7 + + /* Finally, reincorporate the original sign. 
*/ + vandpd dSign+__svml_dasinh_data_internal(%rip), %ymm13, %ymm6 + vxorpd %ymm7, %ymm6, %ymm0 + testl %eax, %eax + + /* Go to special inputs processing branch */ + jne L(SPECIAL_VALUES_BRANCH) + # LOE rbx r12 r13 r14 r15 eax ymm0 ymm13 + + /* Restore registers + * and exit the function + */ L(EXIT): - movq %rbp, %rsp - popq %rbp - cfi_def_cfa(7, 8) - cfi_restore(6) - ret - cfi_def_cfa(6, 16) - cfi_offset(6, -16) - -/* Branch to process - * special inputs - */ + movq %rbp, %rsp + popq %rbp + cfi_def_cfa(7, 8) + cfi_restore(6) + ret + cfi_def_cfa(6, 16) + cfi_offset(6, -16) + + /* Branch to process + * special inputs + */ L(SPECIAL_VALUES_BRANCH): - vmovupd %ymm13, 32(%rsp) - vmovupd %ymm0, 64(%rsp) - # LOE rbx r12 r13 r14 r15 eax ymm0 - - xorl %edx, %edx - # LOE rbx r12 r13 r14 r15 eax edx - - vzeroupper - movq %r12, 16(%rsp) - /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */ - .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22 - movl %edx, %r12d - movq %r13, 8(%rsp) - /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */ - .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22 - movl %eax, %r13d - movq %r14, (%rsp) - /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */ - .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22 |