aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSunil K Pandey <skpgkp2@gmail.com>2022-03-07 10:47:14 -0800
committerSunil K Pandey <skpgkp2@gmail.com>2022-03-07 21:44:09 -0800
commit38f0c40f28f6e90384a193318b1d6fdacdc6c2fd (patch)
tree1112bdb3a73d0652273a0bf8fb2a8adc25318ff1
parent9db25a9b138e96300fad11f65d1cd7f6d72bb52e (diff)
downloadglibc-38f0c40f28f6e90384a193318b1d6fdacdc6c2fd.tar.xz
glibc-38f0c40f28f6e90384a193318b1d6fdacdc6c2fd.zip
x86_64: Fix svml_d_sinh4_core_avx2.S code formatting
This commit contains the following formatting changes: 1. Instructions preceded by a tab. 2. Instructions less than 8 characters in length have a tab between them and the first operand. 3. Instructions greater than 7 characters in length have a space between them and the first operand. 4. Tabs after `#define`d names and their values. 5. 8 spaces at the beginning of a line replaced by a tab. 6. Indent comments with code. 7. Remove redundant .text section. 8. 1 space between line content and line comment. 9. Space after all commas. Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>
-rw-r--r--sysdeps/x86_64/fpu/multiarch/svml_d_sinh4_core_avx2.S814
1 file changed, 406 insertions, 408 deletions
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sinh4_core_avx2.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sinh4_core_avx2.S
index 53b8a32426..ae16600579 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sinh4_core_avx2.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sinh4_core_avx2.S
@@ -34,437 +34,435 @@
/* Offsets for data table __svml_dsinh_data_internal
*/
-#define _dbInvLn2 0
-#define _dbLn2hi 32
-#define _dbLn2lo 64
-#define _dSign 96
-#define _dbT 128
-#define _dbShifter 2176
-#define _iDomainRange 2208
-#define _dPC2 2240
-#define _dPC3 2272
-#define _dPC4 2304
-#define _dPC5 2336
-#define _lIndexMask 2368
+#define _dbInvLn2 0
+#define _dbLn2hi 32
+#define _dbLn2lo 64
+#define _dSign 96
+#define _dbT 128
+#define _dbShifter 2176
+#define _iDomainRange 2208
+#define _dPC2 2240
+#define _dPC3 2272
+#define _dPC4 2304
+#define _dPC5 2336
+#define _lIndexMask 2368
#include <sysdep.h>
- .text
- .section .text.avx2,"ax",@progbits
+ .section .text.avx2, "ax", @progbits
ENTRY(_ZGVdN4v_sinh_avx2)
- pushq %rbp
- cfi_def_cfa_offset(16)
- movq %rsp, %rbp
- cfi_def_cfa(6, 16)
- cfi_offset(6, -16)
- andq $-32, %rsp
- subq $96, %rsp
- lea _dbT+8+__svml_dsinh_data_internal(%rip), %r8
- vmovupd _dbShifter+__svml_dsinh_data_internal(%rip), %ymm12
-
-/*
- * Load argument
- * dM = x*2^K/log(2) + RShifter
- */
- vmovupd _dbInvLn2+__svml_dsinh_data_internal(%rip), %ymm5
- vmovupd _dbLn2hi+__svml_dsinh_data_internal(%rip), %ymm13
- vmovapd %ymm0, %ymm8
-
-/*
- * VLOAD_CONST( D, dPC[0], TAB._dPC1 );
- * Abs argument
- */
- vandpd _dSign+__svml_dsinh_data_internal(%rip), %ymm8, %ymm7
- vxorpd %ymm8, %ymm7, %ymm6
- vfmadd213pd %ymm12, %ymm6, %ymm5
-
-/*
- * R
- * dN = dM - RShifter
- */
- vsubpd %ymm12, %ymm5, %ymm3
-
-/*
- * Index and lookup
- * j
- */
- vandps _lIndexMask+__svml_dsinh_data_internal(%rip), %ymm5, %ymm4
-
-/*
- * Check for overflow\underflow
- *
- */
- vextractf128 $1, %ymm6, %xmm9
- vshufps $221, %xmm9, %xmm6, %xmm10
-
-/* dR = dX - dN*Log2_hi/2^K */
- vfnmadd231pd %ymm13, %ymm3, %ymm6
- vpcmpgtd _iDomainRange+__svml_dsinh_data_internal(%rip), %xmm10, %xmm11
- vmovmskps %xmm11, %eax
-
-/* dR = (dX - dN*Log2_hi/2^K) - dN*Log2_lo/2^K */
- vfnmadd231pd _dbLn2lo+__svml_dsinh_data_internal(%rip), %ymm3, %ymm6
- vextractf128 $1, %ymm4, %xmm0
- vmovd %xmm4, %edx
- vmovd %xmm0, %esi
- shll $4, %edx
- vpextrd $2, %xmm4, %ecx
-
-/* split j and N */
- vxorps %ymm4, %ymm5, %ymm3
- shll $4, %esi
- vpextrd $2, %xmm0, %edi
- shll $4, %ecx
-
-/*
- * G1,G2,G3: dTdif,dTn * 2^N,2^(-N)
- * lM now is an EXP(2^N)
- */
- vpsllq $45, %ymm3, %ymm4
- vmovq (%rdx,%r8), %xmm14
- vmovq (%rsi,%r8), %xmm1
- vmovhpd (%rcx,%r8), %xmm14, %xmm15
- shll $4, %edi
- vmovhpd (%rdi,%r8), %xmm1, %xmm2
-
-/* dR2 = dR^2 */
- vmulpd %ymm6, %ymm6, %ymm1
- vmovq -8(%rdx,%r8), %xmm9
- vmovq -8(%rsi,%r8), %xmm11
- vmovhpd -8(%rcx,%r8), %xmm9, %xmm10
- vmovhpd -8(%rdi,%r8), %xmm11, %xmm12
- vinsertf128 $1, %xmm2, %ymm15, %ymm2
-
-/* */
- vpaddq %ymm4, %ymm2, %ymm5
-
-/* */
- vpsubq %ymm4, %ymm2, %ymm14
-
-/* dG3 = dTn*2^N + dTn*2^-N */
- vaddpd %ymm14, %ymm5, %ymm2
-
-/* dG2 = dTn*2^N - dTn*2^-N */
- vsubpd %ymm14, %ymm5, %ymm14
-
-/*
- * sinh(r) = r*((a1=1)+r^2*(a3+r^2*a5)) = r + r*(r^2*(a3+r^2*a5)) ....
- * dSinh_r = (a3+r^2*a5)
- */
- vmovupd _dPC5+__svml_dsinh_data_internal(%rip), %ymm5
- vfmadd213pd _dPC3+__svml_dsinh_data_internal(%rip), %ymm1, %ymm5
- vinsertf128 $1, %xmm12, %ymm10, %ymm13
- vpaddq %ymm4, %ymm13, %ymm0
-
-/* dSinh_r = r^2*(a3+r^2*a5) */
- vmulpd %ymm5, %ymm1, %ymm4
-
-/* dG2 += dG1 */
- vaddpd %ymm14, %ymm0, %ymm3
-
-/* dG1 += dG3 */
- vaddpd %ymm2, %ymm0, %ymm0
-
-/* dSinh_r = r + r*(r^2*(a3+r^2*a5)) */
- vfmadd213pd %ymm6, %ymm6, %ymm4
-
-/*
- * poly(r) = (dG2+dG1)+dG3*sinh(dR)+dG1*sinh(dR)+(dG1+dG2)*dR2*(a2 +a4*dR2)
- * dOut = (a2 +a4*dR2)
- */
- vmovupd _dPC4+__svml_dsinh_data_internal(%rip), %ymm6
- vfmadd213pd _dPC2+__svml_dsinh_data_internal(%rip), %ymm1, %ymm6
-
-/* dOut = dR2*(a2 +a4*dR2) */
- vmulpd %ymm6, %ymm1, %ymm1
-
-/* dOut = dG2*dR2*(a2 +a4*dR2) */
- vmulpd %ymm3, %ymm1, %ymm6
-
-/* dOut = dG1*sinh(dR)+dG2*dR2*(a2 +a4*dR2) */
- vfmadd213pd %ymm6, %ymm0, %ymm4
-
-/* dOut = dG2 + dG1*sinh(dR)+dG2*dR2*(a2 +a4*dR2) */
- vaddpd %ymm4, %ymm3, %ymm5
-
-/* Ret H */
- vorpd %ymm5, %ymm7, %ymm0
- testl %eax, %eax
-
-/* Go to special inputs processing branch */
- jne L(SPECIAL_VALUES_BRANCH)
- # LOE rbx r12 r13 r14 r15 eax ymm0 ymm8
-
-/* Restore registers
- * and exit the function
- */
+ pushq %rbp
+ cfi_def_cfa_offset(16)
+ movq %rsp, %rbp
+ cfi_def_cfa(6, 16)
+ cfi_offset(6, -16)
+ andq $-32, %rsp
+ subq $96, %rsp
+ lea _dbT+8+__svml_dsinh_data_internal(%rip), %r8
+ vmovupd _dbShifter+__svml_dsinh_data_internal(%rip), %ymm12
+
+ /*
+ * Load argument
+ * dM = x*2^K/log(2) + RShifter
+ */
+ vmovupd _dbInvLn2+__svml_dsinh_data_internal(%rip), %ymm5
+ vmovupd _dbLn2hi+__svml_dsinh_data_internal(%rip), %ymm13
+ vmovapd %ymm0, %ymm8
+
+ /*
+ * VLOAD_CONST( D, dPC[0], TAB._dPC1 );
+ * Abs argument
+ */
+ vandpd _dSign+__svml_dsinh_data_internal(%rip), %ymm8, %ymm7
+ vxorpd %ymm8, %ymm7, %ymm6
+ vfmadd213pd %ymm12, %ymm6, %ymm5
+
+ /*
+ * R
+ * dN = dM - RShifter
+ */
+ vsubpd %ymm12, %ymm5, %ymm3
+
+ /*
+ * Index and lookup
+ * j
+ */
+ vandps _lIndexMask+__svml_dsinh_data_internal(%rip), %ymm5, %ymm4
+
+ /*
+ * Check for overflow\underflow
+ *
+ */
+ vextractf128 $1, %ymm6, %xmm9
+ vshufps $221, %xmm9, %xmm6, %xmm10
+
+ /* dR = dX - dN*Log2_hi/2^K */
+ vfnmadd231pd %ymm13, %ymm3, %ymm6
+ vpcmpgtd _iDomainRange+__svml_dsinh_data_internal(%rip), %xmm10, %xmm11
+ vmovmskps %xmm11, %eax
+
+ /* dR = (dX - dN*Log2_hi/2^K) - dN*Log2_lo/2^K */
+ vfnmadd231pd _dbLn2lo+__svml_dsinh_data_internal(%rip), %ymm3, %ymm6
+ vextractf128 $1, %ymm4, %xmm0
+ vmovd %xmm4, %edx
+ vmovd %xmm0, %esi
+ shll $4, %edx
+ vpextrd $2, %xmm4, %ecx
+
+ /* split j and N */
+ vxorps %ymm4, %ymm5, %ymm3
+ shll $4, %esi
+ vpextrd $2, %xmm0, %edi
+ shll $4, %ecx
+
+ /*
+ * G1, G2, G3: dTdif, dTn * 2^N, 2^(-N)
+ * lM now is an EXP(2^N)
+ */
+ vpsllq $45, %ymm3, %ymm4
+ vmovq (%rdx, %r8), %xmm14
+ vmovq (%rsi, %r8), %xmm1
+ vmovhpd (%rcx, %r8), %xmm14, %xmm15
+ shll $4, %edi
+ vmovhpd (%rdi, %r8), %xmm1, %xmm2
+
+ /* dR2 = dR^2 */
+ vmulpd %ymm6, %ymm6, %ymm1
+ vmovq -8(%rdx, %r8), %xmm9
+ vmovq -8(%rsi, %r8), %xmm11
+ vmovhpd -8(%rcx, %r8), %xmm9, %xmm10
+ vmovhpd -8(%rdi, %r8), %xmm11, %xmm12
+ vinsertf128 $1, %xmm2, %ymm15, %ymm2
+
+ /* */
+ vpaddq %ymm4, %ymm2, %ymm5
+
+ /* */
+ vpsubq %ymm4, %ymm2, %ymm14
+
+ /* dG3 = dTn*2^N + dTn*2^-N */
+ vaddpd %ymm14, %ymm5, %ymm2
+
+ /* dG2 = dTn*2^N - dTn*2^-N */
+ vsubpd %ymm14, %ymm5, %ymm14
+
+ /*
+ * sinh(r) = r*((a1=1)+r^2*(a3+r^2*a5)) = r + r*(r^2*(a3+r^2*a5)) ....
+ * dSinh_r = (a3+r^2*a5)
+ */
+ vmovupd _dPC5+__svml_dsinh_data_internal(%rip), %ymm5
+ vfmadd213pd _dPC3+__svml_dsinh_data_internal(%rip), %ymm1, %ymm5
+ vinsertf128 $1, %xmm12, %ymm10, %ymm13
+ vpaddq %ymm4, %ymm13, %ymm0
+
+ /* dSinh_r = r^2*(a3+r^2*a5) */
+ vmulpd %ymm5, %ymm1, %ymm4
+
+ /* dG2 += dG1 */
+ vaddpd %ymm14, %ymm0, %ymm3
+
+ /* dG1 += dG3 */
+ vaddpd %ymm2, %ymm0, %ymm0
+
+ /* dSinh_r = r + r*(r^2*(a3+r^2*a5)) */
+ vfmadd213pd %ymm6, %ymm6, %ymm4
+
+ /*
+ * poly(r) = (dG2+dG1)+dG3*sinh(dR)+dG1*sinh(dR)+(dG1+dG2)*dR2*(a2 +a4*dR2)
+ * dOut = (a2 +a4*dR2)
+ */
+ vmovupd _dPC4+__svml_dsinh_data_internal(%rip), %ymm6
+ vfmadd213pd _dPC2+__svml_dsinh_data_internal(%rip), %ymm1, %ymm6
+
+ /* dOut = dR2*(a2 +a4*dR2) */
+ vmulpd %ymm6, %ymm1, %ymm1
+
+ /* dOut = dG2*dR2*(a2 +a4*dR2) */
+ vmulpd %ymm3, %ymm1, %ymm6
+
+ /* dOut = dG1*sinh(dR)+dG2*dR2*(a2 +a4*dR2) */
+ vfmadd213pd %ymm6, %ymm0, %ymm4
+
+ /* dOut = dG2 + dG1*sinh(dR)+dG2*dR2*(a2 +a4*dR2) */
+ vaddpd %ymm4, %ymm3, %ymm5
+
+ /* Ret H */
+ vorpd %ymm5, %ymm7, %ymm0
+ testl %eax, %eax
+
+ /* Go to special inputs processing branch */
+ jne L(SPECIAL_VALUES_BRANCH)
+ # LOE rbx r12 r13 r14 r15 eax ymm0 ymm8
+
+ /* Restore registers
+ * and exit the function
+ */
L(EXIT):
- movq %rbp, %rsp
- popq %rbp
- cfi_def_cfa(7, 8)
- cfi_restore(6)
- ret
- cfi_def_cfa(6, 16)
- cfi_offset(6, -16)
-
-/* Branch to process
- * special inputs
- */
+ movq %rbp, %rsp
+ popq %rbp
+ cfi_def_cfa(7, 8)
+ cfi_restore(6)
+ ret
+ cfi_def_cfa(6, 16)
+ cfi_offset(6, -16)
+
+ /* Branch to process
+ * special inputs
+ */
L(SPECIAL_VALUES_BRANCH):
- vmovupd %ymm8, 32(%rsp)
- vmovupd %ymm0, 64(%rsp)
- # LOE rbx r12 r13 r14 r15 eax ymm0
-
- xorl %edx, %edx
- # LOE rbx r12 r13 r14 r15 eax edx
-
- vzeroupper
- movq %r12, 16(%rsp)
- /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
- .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
- movl %edx, %r12d
- movq %r13, 8(%rsp)
- /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
- .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
- movl %eax, %r13d
- movq %r14, (%rsp)
- /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
- .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
- # LOE rbx r15 r12d r13d
-
-/* Range mask
- * bits check
- */
+ vmovupd %ymm8, 32(%rsp)
+ vmovupd %ymm0, 64(%rsp)
+ # LOE rbx r12 r13 r14 r15 eax ymm0
+
+ xorl %edx, %edx
+ # LOE rbx r12 r13 r14 r15 eax edx
+
+ vzeroupper
+ movq %r12, 16(%rsp)
+ /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
+ .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
+ movl %edx, %r12d
+ movq %r13, 8(%rsp)
+ /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
+ .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
+ movl %eax, %r13d
+ movq %r14, (%rsp)
+ /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
+ .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
+ # LOE rbx r15 r12d r13d
+
+ /* Range mask
+ * bits check
+ */
L(RANGEMASK_CHECK):
- btl %r12d, %r13d
+ btl %r12d, %r13d
-/* Call scalar math function */
- jc L(SCALAR_MATH_CALL)
- # LOE rbx r15 r12d r13d
+ /* Call scalar math function */
+ jc L(SCALAR_MATH_CALL)
+ # LOE rbx r15 r12d r13d
-/* Special inputs
- * processing loop
- */
+ /* Special inputs
+ * processing loop
+ */
L(SPECIAL_VALUES_LOOP):
- incl %r12d
- cmpl $4, %r12d
-
-/* Check bits in range mask */
- jl L(RANGEMASK_CHECK)
- # LOE rbx r15 r12d r13d
-
- movq 16(%rsp), %r12
- cfi_restore(12)
- movq 8(%rsp), %r13
- cfi_restore(13)
- movq (%rsp), %r14
- cfi_restore(14)
- vmovupd 64(%rsp), %ymm0
-
-/* Go to exit */
- jmp L(EXIT)
- /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
- .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
- /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
- .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
- /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
- .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
- # LOE rbx r12 r13 r14 r15 ymm0
-
-/* Scalar math fucntion call
- * to process special input
- */
+ incl %r12d
+ cmpl $4, %r12d
+
+ /* Check bits in range mask */
+ jl L(RANGEMASK_CHECK)
+ # LOE rbx r15 r12d r13d
+
+ movq 16(%rsp), %r12
+ cfi_restore(12)
+ movq 8(%rsp), %r13
+ cfi_restore(13)
+ movq (%rsp), %r14
+ cfi_restore(14)
+ vmovupd 64(%rsp), %ymm0
+
+ /* Go to exit */
+ jmp L(EXIT)
+ /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -80; DW_OP_plus) */
+ .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xb0, 0xff, 0xff, 0xff, 0x22
+ /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -88; DW_OP_plus) */
+ .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa8, 0xff, 0xff, 0xff, 0x22
+ /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -32; DW_OP_and; DW_OP_const4s: -96; DW_OP_plus) */
+ .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xe0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0xa0, 0xff, 0xff, 0xff, 0x22
+ # LOE rbx r12 r13 r14 r15 ymm0
+
+ /* Scalar math fucntion call
+ * to process special input
+ */
L(SCALAR_MATH_CALL):
- movl %r12d, %r14d
- movsd 32(%rsp,%r14,8), %xmm0
- call sinh@PLT
- # LOE rbx r14 r15 r12d r13d xmm0
+ movl %r12d, %r14d
+ movsd 32(%rsp, %r14, 8), %xmm0
+ call sinh@PLT
+ # LOE rbx r14 r15 r12d r13d xmm0
- movsd %xmm0, 64(%rsp,%r14,8)
+ movsd %xmm0, 64(%rsp, %r14, 8)
-/* Process special inputs in loop */
- jmp L(SPECIAL_VALUES_LOOP)
- # LOE rbx r15 r12d r13d
+ /* Process special inputs in loop */
+ jmp L(SPECIAL_VALUES_LOOP)
+ # LOE rbx r15 r12d r13d
END(_ZGVdN4v_sinh_avx2)
- .section .rodata, "a"
- .align 32
+ .section .rodata, "a"
+ .align 32
#ifdef __svml_dsinh_data_internal_typedef
typedef unsigned int VUINT32;
-typedef struct
-{
- __declspec(align(32)) VUINT32 _dbInvLn2[4][2];
- __declspec(align(32)) VUINT32 _dbLn2hi[4][2];
- __declspec(align(32)) VUINT32 _dbLn2lo[4][2];
- __declspec(align(32)) VUINT32 _dSign[4][2]; //0x8000000000000000
- __declspec(align(32)) VUINT32 _dbT[(1<<7)][2][2]; //precalc poly coeff
- __declspec(align(32)) VUINT32 _dbShifter[4][2];
- __declspec(align(32)) VUINT32 _iDomainRange[8][1];
- __declspec(align(32)) VUINT32 _dPC2[4][2];
- __declspec(align(32)) VUINT32 _dPC3[4][2];
- __declspec(align(32)) VUINT32 _dPC4[4][2];
- __declspec(align(32)) VUINT32 _dPC5[4][2];
- __declspec(align(32)) VUINT32 _lIndexMask[4][2];
+typedef struct {
+ __declspec(align(32)) VUINT32 _dbInvLn2[4][2];
+ __declspec(align(32)) VUINT32 _dbLn2hi[4][2];
+ __declspec(align(32)) VUINT32 _dbLn2lo[4][2];
+ __declspec(align(32)) VUINT32 _dSign[4][2]; // 0x8000000000000000
+ __declspec(align(32)) VUINT32 _dbT[(1<<7)][2][2]; // precalc poly coeff
+ __declspec(align(32)) VUINT32 _dbShifter[4][2];
+ __declspec(align(32)) VUINT32 _iDomainRange[8][1];
+ __declspec(align(32)) VUINT32 _dPC2[4][2];
+ __declspec(align(32)) VUINT32 _dPC3[4][2];
+ __declspec(align(32)) VUINT32 _dPC4[4][2];
+ __declspec(align(32)) VUINT32 _dPC5[4][2];
+ __declspec(align(32)) VUINT32 _lIndexMask[4][2];
} __svml_dsinh_data_internal;
#endif
__svml_dsinh_data_internal:
- .quad 0x3FF71547652B82FE, 0x3FF71547652B82FE, 0x3FF71547652B82FE, 0x3FF71547652B82FE /* _dbInvLn2 = 1/log(2) */
- .align 32
- .quad 0x3FE62E42FEFA0000, 0x3FE62E42FEFA0000, 0x3FE62E42FEFA0000, 0x3FE62E42FEFA0000 /* _dbLn2hi = log(2) hi*/
- .align 32
- .quad 0x3D7CF79ABC9E3B3A, 0x3D7CF79ABC9E3B3A, 0x3D7CF79ABC9E3B3A, 0x3D7CF79ABC9E3B3A /* _dbLn2lo = log(2) lo*/
- .align 32
- .quad 0x8000000000000000, 0x8000000000000000, 0x8000000000000000, 0x8000000000000000 /* _dSign */
- //_dbT
- .align 32
- .quad 0x0000000000000000, 0x3FE0000000000000 //2^( 0 /128-1) - 2^(- 0 /128-1), 2^(- 0 /128-1)
- .quad 0x3F762E4A19BD1E74, 0x3FDFD3C22B8F71F1 //2^( 1 /128-1) - 2^(- 1 /128-1), 2^(- 1 /128-1)
- .quad 0x3F862E5F6A0DFD36, 0x3FDFA7C1819E90D8 //2^( 2 /128-1) - 2^(- 2 /128-1), 2^(- 2 /128-1)
- .quad 0x3F90A2E234040F5F, 0x3FDF7BFDAD9CBE14 //2^( 3 /128-1) - 2^(- 3 /128-1), 2^(- 3 /128-1)
- .quad 0x3F962EB4ABCC5A81, 0x3FDF50765B6E4540 //2^( 4 /128-1) - 2^(- 4 /128-1), 2^(- 4 /128-1)
- .quad 0x3F9BBAB1C5033244, 0x3FDF252B376BBA97 //2^( 5 /128-1) - 2^(- 5 /128-1), 2^(- 5 /128-1)
- .quad 0x3FA0A372144EEB45, 0x3FDEFA1BEE615A27 //2^( 6 /128-1) - 2^(- 6 /128-1), 2^(- 6 /128-1)
- .quad 0x3FA369AB3FFBF8B0, 0x3FDECF482D8E67F1 //2^( 7 /128-1) - 2^(- 7 /128-1), 2^(- 7 /128-1)
- .quad 0x3FA63009BA740A2A, 0x3FDEA4AFA2A490DA //2^( 8 /128-1) - 2^(- 8 /128-1), 2^(- 8 /128-1)
- .quad 0x3FA8F692D8EA1B5A, 0x3FDE7A51FBC74C83 //2^( 9 /128-1) - 2^(- 9 /128-1), 2^(- 9 /128-1)
- .quad 0x3FABBD4BF0E31A6F, 0x3FDE502EE78B3FF6 //2^( 10 /128-1) - 2^(- 10 /128-1), 2^(- 10 /128-1)
- .quad 0x3FAE843A5840286A, 0x3FDE264614F5A129 //2^( 11 /128-1) - 2^(- 11 /128-1), 2^(- 11 /128-1)
- .quad 0x3FB0A5B1B2A46D0A, 0x3FDDFC97337B9B5F //2^( 12 /128-1) - 2^(- 12 /128-1), 2^(- 12 /128-1)
- .quad 0x3FB20966375ABCDF, 0x3FDDD321F301B460 //2^( 13 /128-1) - 2^(- 13 /128-1), 2^(- 13 /128-1)
- .quad 0x3FB36D3D65DCA4E8, 0x3FDDA9E603DB3285 //2^( 14 /128-1) - 2^(- 14 /128-1), 2^(- 14 /128-1)
- .quad 0x3FB4D139EA06642A, 0x3FDD80E316C98398 //2^( 15 /128-1) - 2^(- 15 /128-1), 2^(- 15 /128-1)
- .quad 0x3FB6355E6FFBF9BA, 0x3FDD5818DCFBA487 //2^( 16 /128-1) - 2^(- 16 /128-1), 2^(- 16 /128-1)
- .quad 0x3FB799ADA42E4788, 0x3FDD2F87080D89F2 //2^( 17 /128-1) - 2^(- 17 /128-1), 2^(- 17 /128-1)
- .quad 0x3FB8FE2A336035BC, 0x3FDD072D4A07897C //2^( 18 /128-1) - 2^(- 18 /128-1), 2^(- 18 /128-1)
- .quad 0x3FBA62D6CAABD6B6, 0x3FDCDF0B555DC3FA //2^( 19 /128-1) - 2^(- 19 /128-1), 2^(- 19 /128-1)
- .quad 0x3FBBC7B617878BAF, 0x3FDCB720DCEF9069 //2^( 20 /128-1) - 2^(- 20 /128-1), 2^(- 20 /128-1)
- .quad 0x3FBD2CCAC7CB2A11, 0x3FDC8F6D9406E7B5 //2^( 21 /128-1) - 2^(- 21 /128-1), 2^(- 21 /128-1)
- .quad 0x3FBE921789B52185, 0x3FDC67F12E57D14B //2^( 22 /128-1) - 2^(- 22 /128-1), 2^(- 22 /128-1)
- .quad 0x3FBFF79F0BEFA2C7, 0x3FDC40AB5FFFD07A //2^( 23 /128-1) - 2^(- 23 /128-1), 2^(- 23 /128-1)
- .quad 0x3FC0AEB1FECAE3A9, 0x3FDC199BDD85529C //2^( 24 /128-1) - 2^(- 24 /128-1), 2^(- 24 /128-1)
- .quad 0x3FC161B4871C5CEC, 0x3FDBF2C25BD71E09 //2^( 25 /128-1) - 2^(- 25 /128-1), 2^(- 25 /128-1)
- .quad 0x3FC214D876F26FD0, 0x3FDBCC1E904BC1D2 //2^( 26 /128-1) - 2^(- 26 /128-1), 2^(- 26 /128-1)
- .quad 0x3FC2C81F2693816F, 0x3FDBA5B030A1064A //2^( 27 /128-1) - 2^(- 27 /128-1), 2^(- 27 /128-1)
- .quad 0x3FC37B89EE88BEF7, 0x3FDB7F76F2FB5E47 //2^( 28 /128-1) - 2^(- 28 /128-1), 2^(- 28 /128-1)
- .quad 0x3FC42F1A27A0B3CD, 0x3FDB59728DE5593A //2^( 29 /128-1) - 2^(- 29 /128-1), 2^(- 29 /128-1)
- .quad 0x3FC4E2D12AF1E037, 0x3FDB33A2B84F15FB //2^( 30 /128-1) - 2^(- 30 /128-1), 2^(- 30 /128-1)
- .quad 0x3FC596B051DD508D, 0x3FDB0E07298DB666 //2^( 31 /128-1) - 2^(- 31 /128-1), 2^(- 31 /128-1)
- .quad 0x3FC64AB8F61134FA, 0x3FDAE89F995AD3AD //2^( 32 /128-1) - 2^(- 32 /128-1), 2^(- 32 /128-1)
- .quad 0x3FC6FEEC718B79D1, 0x3FDAC36BBFD3F37A //2^( 33 /128-1) - 2^(- 33 /128-1), 2^(- 33 /128-1)
- .quad 0x3FC7B34C1E9C607F, 0x3FDA9E6B5579FDBF //2^( 34 /128-1) - 2^(- 34 /128-1), 2^(- 34 /128-1)
- .quad 0x3FC867D957E91912, 0x3FDA799E1330B358 //2^( 35 /128-1) - 2^(- 35 /128-1), 2^(- 35 /128-1)
- .quad 0x3FC91C95786E5C72, 0x3FDA5503B23E255D //2^( 36 /128-1) - 2^(- 36 /128-1), 2^(- 36 /128-1)
- .quad 0x3FC9D181DB83072F, 0x3FDA309BEC4A2D33 //2^( 37 /128-1) - 2^(- 37 /128-1), 2^(- 37 /128-1)
- .quad 0x3FCA869FDCDAB512, 0x3FDA0C667B5DE565 //2^( 38 /128-1) - 2^(- 38 /128-1), 2^(- 38 /128-1)
- .quad 0x3FCB3BF0D8885D4C, 0x3FD9E86319E32323 //2^( 39 /128-1) - 2^(- 39 /128-1), 2^(- 39 /128-1)
- .quad 0x3FCBF1762B00EF69, 0x3FD9C49182A3F090 //2^( 40 /128-1) - 2^(- 40 /128-1), 2^(- 40 /128-1)
- .quad 0x3FCCA731311DF0FB, 0x3FD9A0F170CA07BA //2^( 41 /128-1) - 2^(- 41 /128-1), 2^(- 41 /128-1)
- .quad 0x3FCD5D2348201C09, 0x3FD97D829FDE4E50 //2^( 42 /128-1) - 2^(- 42 /128-1), 2^(- 42 /128-1)
- .quad 0x3FCE134DCDB1FE3E, 0x3FD95A44CBC8520F //2^( 43 /128-1) - 2^(- 43 /128-1), 2^(- 43 /128-1)
- .quad 0x3FCEC9B21FEA98EA, 0x3FD93737B0CDC5E5 //2^( 44 /128-1) - 2^(- 44 /128-1), 2^(- 44 /128-1)
- .quad 0x3FCF80519D5001D3, 0x3FD9145B0B91FFC6 //2^( 45 /128-1) - 2^(- 45 /128-1), 2^(- 45 /128-1)
- .quad 0x3FD01B96D26D026A, 0x3FD8F1AE99157736 //2^( 46 /128-1) - 2^(- 46 /128-1), 2^(- 46 /128-1)
- .quad 0x3FD07723CAFA6331, 0x3FD8CF3216B5448C //2^( 47 /128-1) - 2^(- 47 /128-1), 2^(- 47 /128-1)
- .quad 0x3FD0D2D06841B373, 0x3FD8ACE5422AA0DB //2^( 48 /128-1) - 2^(- 48 /128-1), 2^(- 48 /128-1)
- .quad 0x3FD12E9D5A715381, 0x3FD88AC7D98A6699 //2^( 49 /128-1) - 2^(- 49 /128-1), 2^(- 49 /128-1)
- .quad 0x3FD18A8B51F5C661, 0x3FD868D99B4492ED //2^( 50 /128-1) - 2^(- 50 /128-1), 2^(- 50 /128-1)
- .quad 0x3FD1E69AFF7B04D7, 0x3FD8471A4623C7AD //2^( 51 /128-1) - 2^(- 51 /128-1), 2^(- 51 /128-1)
- .quad 0x3FD242CD13EDD0F1, 0x3FD82589994CCE13 //2^( 52 /128-1) - 2^(- 52 /128-1), 2^(- 52 /128-1)
- .quad 0x3FD29F22407D0A0C, 0x3FD80427543E1A12 //2^( 53 /128-1) - 2^(- 53 /128-1), 2^(- 53 /128-1)
- .quad 0x3FD2FB9B369B0153, 0x3FD7E2F336CF4E62 //2^( 54 /128-1) - 2^(- 54 /128-1), 2^(- 54 /128-1)
- .quad 0x3FD35838A7FECEC8, 0x3FD7C1ED0130C132 //2^( 55 /128-1) - 2^(- 55 /128-1), 2^(- 55 /128-1)
- .quad 0x3FD3B4FB46A5A6CC, 0x3FD7A11473EB0187 //2^( 56 /128-1) - 2^(- 56 /128-1), 2^(- 56 /128-1)
- .quad 0x3FD411E3C4D4302F, 0x3FD780694FDE5D3F //2^( 57 /128-1) - 2^(- 57 /128-1), 2^(- 57 /128-1)
- .quad 0x3FD46EF2D517DAC8, 0x3FD75FEB564267C9 //2^( 58 /128-1) - 2^(- 58 /128-1), 2^(- 58 /128-1)
- .quad 0x3FD4CC292A48369E, 0x3FD73F9A48A58174 //2^( 59 /128-1) - 2^(- 59 /128-1), 2^(- 59 /128-1)
- .quad 0x3FD5298777884B96, 0x3FD71F75E8EC5F74 //2^( 60 /128-1) - 2^(- 60 /128-1), 2^(- 60 /128-1)
- .quad 0x3FD5870E7047F1BC, 0x3FD6FF7DF9519484 //2^( 61 /128-1) - 2^(- 61 /128-1), 2^(- 61 /128-1)
- .quad 0x3FD5E4BEC8452A1A, 0x3FD6DFB23C651A2F //2^( 62 /128-1) - 2^(- 62 /128-1), 2^(- 62 /128-1)
- .quad 0x3FD64299338D7827, 0x3FD6C012750BDABF //2^( 63 /128-1) - 2^(- 63 /128-1), 2^(- 63 /128-1)
- .quad 0x3FD6A09E667F3BCD, 0x3FD6A09E667F3BCD //2^( 64 /128-1) - 2^(- 64 /128-1), 2^(- 64 /128-1)
- .quad 0x3FD6FECF15CB0C0B, 0x3FD68155D44CA973 //2^( 65 /128-1) - 2^(- 65 /128-1), 2^(- 65 /128-1)
- .quad 0x3FD75D2BF6751239, 0x3FD6623882552225 //2^( 66 /128-1) - 2^(- 66 /128-1), 2^(- 66 /128-1)
- .quad 0x3FD7BBB5BDD665E8, 0x3FD6434634CCC320 //2^( 67 /128-1) - 2^(- 67 /128-1), 2^(- 67 /128-1)
- .quad 0x3FD81A6D219E6963, 0x3FD6247EB03A5585 //2^( 68 /128-1) - 2^(- 68 /128-1), 2^(- 68 /128-1)
- .quad 0x3FD87952D7D426DF, 0x3FD605E1B976DC09 //2^( 69 /128-1) - 2^(- 69 /128-1), 2^(- 69 /128-1)
- .quad 0x3FD8D86796D7AE49, 0x3FD5E76F15AD2148 //2^( 70 /128-1) - 2^(- 70 /128-1), 2^(- 70 /128-1)
- .quad 0x3FD937AC156373C8, 0x3FD5C9268A5946B7 //2^( 71 /128-1) - 2^(- 71 /128-1