aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSunil K Pandey <skpgkp2@gmail.com>2022-03-07 10:47:15 -0800
committerSunil K Pandey <skpgkp2@gmail.com>2022-03-07 21:44:09 -0800
commit7f852d2592b50ef9c6daed656b8f33c65bfe594a (patch)
treea616daeb732d919c2f2b4aa389526f842b14b1d6
parent160e183a9a1d2bb54fb899f86ab67ce516a95304 (diff)
downloadglibc-7f852d2592b50ef9c6daed656b8f33c65bfe594a.tar.xz
glibc-7f852d2592b50ef9c6daed656b8f33c65bfe594a.zip
x86_64: Fix svml_d_tan2_core_sse4.S code formatting
This commit contains the following formatting changes: 1. Instructions preceded by a tab. 2. Instructions less than 8 characters in length have a tab between the mnemonic and the first operand. 3. Instructions greater than 7 characters in length have a space between the mnemonic and the first operand. 4. Tabs after `#define`d names and their values. 5. 8 spaces at the beginning of a line replaced by a tab. 6. Indent comments with code. 7. Remove redundant .text section. 8. 1 space between line content and line comment. 9. Space after all commas. Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>
-rw-r--r--sysdeps/x86_64/fpu/multiarch/svml_d_tan2_core_sse4.S12143
1 file changed, 6070 insertions, 6073 deletions
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_tan2_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_tan2_core_sse4.S
index 879cfd631d..d572b886be 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_tan2_core_sse4.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_tan2_core_sse4.S
@@ -47,6213 +47,6210 @@
/* Offsets for data table __svml_dtan_data_internal
*/
-#define _dAbsMask 0
-#define _dRangeVal 16
-#define _dRShift 32
-#define _dCoeffs 48
-#define _dReductionRangeVal 26672
-#define _dInvPi 26688
-#define _dPI1 26704
-#define _dPI2 26720
-#define _dPI3 26736
-#define _dP1 26752
-#define _dP2 26768
-#define _dP3 26784
-#define _dQ0 26800
-#define _dQ1 26816
-#define _dQ2 26832
-#define _dQ3 26848
+#define _dAbsMask 0
+#define _dRangeVal 16
+#define _dRShift 32
+#define _dCoeffs 48
+#define _dReductionRangeVal 26672
+#define _dInvPi 26688
+#define _dPI1 26704
+#define _dPI2 26720
+#define _dPI3 26736
+#define _dP1 26752
+#define _dP2 26768
+#define _dP3 26784
+#define _dQ0 26800
+#define _dQ1 26816
+#define _dQ2 26832
+#define _dQ3 26848
#include <sysdep.h>
- .text
- .section .text.sse4,"ax",@progbits
+ .section .text.sse4, "ax", @progbits
ENTRY(_ZGVbN2v_tan_sse4)
- subq $72, %rsp
- cfi_def_cfa_offset(80)
- movaps %xmm0, %xmm1
- movups _dAbsMask+__svml_dtan_data_internal(%rip), %xmm4
+ subq $72, %rsp
+ cfi_def_cfa_offset(80)
+ movaps %xmm0, %xmm1
+ movups _dAbsMask+__svml_dtan_data_internal(%rip), %xmm4
-/* Legacy Code */
- xorl %eax, %eax
+ /* Legacy Code */
+ xorl %eax, %eax
-/* b) Remove sign using AND 0x7fffffffffffffff operation */
- movaps %xmm4, %xmm5
+ /* b) Remove sign using AND 0x7fffffffffffffff operation */
+ movaps %xmm4, %xmm5
-/* 1) Range reduction to [-Pi/4; +Pi/4] interval */
- pxor %xmm11, %xmm11
+ /* 1) Range reduction to [-Pi/4; +Pi/4] interval */
+ pxor %xmm11, %xmm11
-/*
- * c) Getting octant Y by 2/Pi multiplication
- * d) Add "Right Shifter" (0x4330000000000000) value
- */
- movups _dInvPi+__svml_dtan_data_internal(%rip), %xmm3
- andps %xmm1, %xmm5
- mulpd %xmm5, %xmm3
- movups _dRShift+__svml_dtan_data_internal(%rip), %xmm6
+ /*
+ * c) Getting octant Y by 2/Pi multiplication
+ * d) Add "Right Shifter" (0x4330000000000000) value
+ */
+ movups _dInvPi+__svml_dtan_data_internal(%rip), %xmm3
+ andps %xmm1, %xmm5
+ mulpd %xmm5, %xmm3
+ movups _dRShift+__svml_dtan_data_internal(%rip), %xmm6
-/*
- * Range reduction
- * X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
- */
- movaps %xmm5, %xmm2
- addpd %xmm6, %xmm3
+ /*
+ * Range reduction
+ * X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
+ */
+ movaps %xmm5, %xmm2
+ addpd %xmm6, %xmm3
-/* g) Subtract "Right Shifter" (0x4330000000000000) value */
- movaps %xmm3, %xmm9
+ /* g) Subtract "Right Shifter" (0x4330000000000000) value */
+ movaps %xmm3, %xmm9
-/* a) Grab sign from source argument and save it. */
- andnps %xmm1, %xmm4
- subpd %xmm6, %xmm9
- movups _dPI1+__svml_dtan_data_internal(%rip), %xmm7
+ /* a) Grab sign from source argument and save it. */
+ andnps %xmm1, %xmm4
+ subpd %xmm6, %xmm9
+ movups _dPI1+__svml_dtan_data_internal(%rip), %xmm7
-/*
- * e) Treat obtained value as integer for destination sign setting.
- * Shift first bit of this value to the last (sign) position (S << 63)
- * f) Change destination sign if source sign is negative
- * using XOR operation.
- */
- movaps %xmm3, %xmm6
- mulpd %xmm9, %xmm7
- movups _dPI2+__svml_dtan_data_internal(%rip), %xmm8
- psllq $62, %xmm3
- mulpd %xmm9, %xmm8
- subpd %xmm7, %xmm2
- cmpneqpd %xmm11, %xmm3
- subpd %xmm8, %xmm2
- movups _dPI3+__svml_dtan_data_internal(%rip), %xmm10
+ /*
+ * e) Treat obtained value as integer for destination sign setting.
+ * Shift first bit of this value to the last (sign) position (S << 63)
+ * f) Change destination sign if source sign is negative
+ * using XOR operation.
+ */
+ movaps %xmm3, %xmm6
+ mulpd %xmm9, %xmm7
+ movups _dPI2+__svml_dtan_data_internal(%rip), %xmm8
+ psllq $62, %xmm3
+ mulpd %xmm9, %xmm8
+ subpd %xmm7, %xmm2
+ cmpneqpd %xmm11, %xmm3
+ subpd %xmm8, %xmm2
+ movups _dPI3+__svml_dtan_data_internal(%rip), %xmm10
-/*
- * c) Swap P and Q if first bit of obtained value after
- * Right Shifting is set to 1. Using And, Andnot & Or operations.
- */
- movaps %xmm3, %xmm0
- mulpd %xmm9, %xmm10
- subpd %xmm10, %xmm2
+ /*
+ * c) Swap P and Q if first bit of obtained value after
+ * Right Shifting is set to 1. Using And, Andnot & Or operations.
+ */
+ movaps %xmm3, %xmm0
+ mulpd %xmm9, %xmm10
+ subpd %xmm10, %xmm2
-/* a) Calculate X^2 = X * X */
- movaps %xmm2, %xmm15
- movaps %xmm3, %xmm14
- mulpd %xmm2, %xmm15
+ /* a) Calculate X^2 = X * X */
+ movaps %xmm2, %xmm15
+ movaps %xmm3, %xmm14
+ mulpd %xmm2, %xmm15
-/*
- * b) Calculate 2 polynomials:
- * P = X * (P0 + X^2 * (P1 + x^2 * (P2 + x^2 * (P3))));
- * Q = Q0 + X^2 * (Q1 + x^2 * (Q2 + x^2 * (Q3)));
- * Assume P0 = 1
- */
- movups _dP3+__svml_dtan_data_internal(%rip), %xmm13
- psllq $63, %xmm6
- mulpd %xmm15, %xmm13
- movups _dQ3+__svml_dtan_data_internal(%rip), %xmm12
- pxor %xmm4, %xmm6
- addpd _dP2+__svml_dtan_data_internal(%rip), %xmm13
- mulpd %xmm15, %xmm12
- mulpd %xmm15, %xmm13
- addpd _dQ2+__svml_dtan_data_internal(%rip), %xmm12
- addpd _dP1+__svml_dtan_data_internal(%rip), %xmm13
- mulpd %xmm15, %xmm12
- mulpd %xmm15, %xmm13
- addpd _dQ1+__svml_dtan_data_internal(%rip), %xmm12
- mulpd %xmm2, %xmm13
- mulpd %xmm12, %xmm15
- addpd %xmm13, %xmm2
- addpd _dQ0+__svml_dtan_data_internal(%rip), %xmm15
- andnps %xmm2, %xmm0
- andps %xmm15, %xmm14
- andps %xmm3, %xmm2
- andnps %xmm15, %xmm3
- orps %xmm14, %xmm0
- orps %xmm3, %xmm2
+ /*
+ * b) Calculate 2 polynomials:
+ * P = X * (P0 + X^2 * (P1 + x^2 * (P2 + x^2 * (P3))));
+ * Q = Q0 + X^2 * (Q1 + x^2 * (Q2 + x^2 * (Q3)));
+ * Assume P0 = 1
+ */
+ movups _dP3+__svml_dtan_data_internal(%rip), %xmm13
+ psllq $63, %xmm6
+ mulpd %xmm15, %xmm13
+ movups _dQ3+__svml_dtan_data_internal(%rip), %xmm12
+ pxor %xmm4, %xmm6
+ addpd _dP2+__svml_dtan_data_internal(%rip), %xmm13
+ mulpd %xmm15, %xmm12
+ mulpd %xmm15, %xmm13
+ addpd _dQ2+__svml_dtan_data_internal(%rip), %xmm12
+ addpd _dP1+__svml_dtan_data_internal(%rip), %xmm13
+ mulpd %xmm15, %xmm12
+ mulpd %xmm15, %xmm13
+ addpd _dQ1+__svml_dtan_data_internal(%rip), %xmm12
+ mulpd %xmm2, %xmm13
+ mulpd %xmm12, %xmm15
+ addpd %xmm13, %xmm2
+ addpd _dQ0+__svml_dtan_data_internal(%rip), %xmm15
+ andnps %xmm2, %xmm0
+ andps %xmm15, %xmm14
+ andps %xmm3, %xmm2
+ andnps %xmm15, %xmm3
+ orps %xmm14, %xmm0
+ orps %xmm3, %xmm2
-/* d) Divide R = P / Q; */
- divpd %xmm2, %xmm0
+ /* d) Divide R = P / Q; */
+ divpd %xmm2, %xmm0
-/* Large values check */
- movaps %xmm5, %xmm4
+ /* Large values check */
+ movaps %xmm5, %xmm4
-/*
- * 3) Destination sign setting
- * a) Set shifted destination sign using XOR operation:
- * R = XOR( R, S );
- */
- pxor %xmm6, %xmm0
- cmpnlepd _dReductionRangeVal+__svml_dtan_data_internal(%rip), %xmm4
- movmskpd %xmm4, %edx
- testl %edx, %edx
+ /*
+ * 3) Destination sign setting
+ * a) Set shifted destination sign using XOR operation:
+ * R = XOR( R, S );
+ */
+ pxor %xmm6, %xmm0
+ cmpnlepd _dReductionRangeVal+__svml_dtan_data_internal(%rip), %xmm4
+ movmskpd %xmm4, %edx
+ testl %edx, %edx
-/* Go to auxilary branch */
- jne L(AUX_BRANCH)
- # LOE rbx rbp r12 r13 r14 r15 eax xmm0 xmm1 xmm4 xmm5
+ /* Go to auxilary branch */
+ jne L(AUX_BRANCH)
+ # LOE rbx rbp r12 r13 r14 r15 eax xmm0 xmm1 xmm4 xmm5
-/* Return from auxilary branch
- * for out of main path inputs
- */
+ /* Return from auxilary branch
+ * for out of main path inputs
+ */
L(AUX_BRANCH_RETURN):
- testl %eax, %eax
+ testl %eax, %eax
-/* Go to special inputs processing branch */
- jne L(SPECIAL_VALUES_BRANCH)
- # LOE rbx rbp r12 r13 r14 r15 eax xmm0 xmm1
+ /* Go to special inputs processing branch */
+ jne L(SPECIAL_VALUES_BRANCH)
+ # LOE rbx rbp r12 r13 r14 r15 eax xmm0 xmm1
-/* Restore registers
- * and exit the function
- */
+ /* Restore registers
+ * and exit the function
+ */
L(EXIT):
- addq $72, %rsp
- cfi_def_cfa_offset(8)
- ret
- cfi_def_cfa_offset(80)
+ addq $72, %rsp
+ cfi_def_cfa_offset(8)
+ ret
+ cfi_def_cfa_offset(80)
-/* Branch to process
- * special inputs
- */
+ /* Branch to process
+ * special inputs
+ */
L(SPECIAL_VALUES_BRANCH):
- movups %xmm1, 32(%rsp)
- movups %xmm0, 48(%rsp)
- # LOE rbx rbp r12 r13 r14 r15 eax xmm0
+ movups %xmm1, 32(%rsp)
+ movups %xmm0, 48(%rsp)
+ # LOE rbx rbp r12 r13 r14 r15 eax xmm0
- xorl %edx, %edx
- movq %r12, 16(%rsp)
- cfi_offset(12, -64)
- movl %edx, %r12d
- movq %r13, 8(%rsp)
- cfi_offset(13, -72)
- movl %eax, %r13d
- movq %r14, (%rsp)
- cfi_offset(14, -80)
- # LOE rbx rbp r15 r12d r13d
+ xorl %edx, %edx
+ movq %r12, 16(%rsp)
+ cfi_offset(12, -64)
+ movl %edx, %r12d
+ movq %r13, 8(%rsp)
+ cfi_offset(13, -72)
+ movl %eax, %r13d
+ movq %r14, (%rsp)
+ cfi_offset(14, -80)
+ # LOE rbx rbp r15 r12d r13d
-/* Range mask
- * bits check
- */
+ /* Range mask
+ * bits check
+ */
L(RANGEMASK_CHECK):
- btl %r12d, %r13d
+ btl %r12d, %r13d
-/* Call scalar math function */
- jc L(SCALAR_MATH_CALL)
- # LOE rbx rbp r15 r12d r13d
+ /* Call scalar math function */
+ jc L(SCALAR_MATH_CALL)
+ # LOE rbx rbp r15 r12d r13d
-/* Special inputs
- * processing loop
- */
+ /* Special inputs
+ * processing loop
+ */
L(SPECIAL_VALUES_LOOP):
- incl %r12d
- cmpl $2, %r12d
+ incl %r12d
+ cmpl $2, %r12d
-/* Check bits in range mask */
- jl L(RANGEMASK_CHECK)
- # LOE rbx rbp r15 r12d r13d
+ /* Check bits in range mask */
+ jl L(RANGEMASK_CHECK)
+ # LOE rbx rbp r15 r12d r13d
- movq 16(%rsp), %r12
- cfi_restore(12)
- movq 8(%rsp), %r13
- cfi_restore(13)
- movq (%rsp), %r14
- cfi_restore(14)
- movups 48(%rsp), %xmm0
+ movq 16(%rsp), %r12
+ cfi_restore(12)
+ movq 8(%rsp), %r13
+ cfi_restore(13)
+ movq (%rsp), %r14
+ cfi_restore(14)
+ movups 48(%rsp), %xmm0
-/* Go to exit */
- jmp L(EXIT)
- cfi_offset(12, -64)
- cfi_offset(13, -72)
- cfi_offset(14, -80)
- # LOE rbx rbp r12 r13 r14 r15 xmm0
+ /* Go to exit */
+ jmp L(EXIT)
+ cfi_offset(12, -64)
+ cfi_offset(13, -72)
+ cfi_offset(14, -80)
+ # LOE rbx rbp r12 r13 r14 r15 xmm0
-/* Scalar math fucntion call
- * to process special input
- */
+ /* Scalar math fucntion call
+ * to process special input
+ */
L(SCALAR_MATH_CALL):
- movl %r12d, %r14d
- movsd 32(%rsp,%r14,8), %xmm0
- call tan@PLT
- # LOE rbx rbp r14 r15 r12d r13d xmm0
+ movl %r12d, %r14d
+ movsd 32(%rsp, %r14, 8), %xmm0
+ call tan@PLT
+ # LOE rbx rbp r14 r15 r12d r13d xmm0
- movsd %xmm0, 48(%rsp,%r14,8)
+ movsd %xmm0, 48(%rsp, %r14, 8)
-/* Process special inputs in loop */
- jmp L(SPECIAL_VALUES_LOOP)
- cfi_restore(12)
- cfi_restore(13)
- cfi_restore(14)
- # LOE rbx rbp r15 r12d r13d
+ /* Process special inputs in loop */
+ jmp L(SPECIAL_VALUES_LOOP)
+ cfi_restore(12)
+ cfi_restore(13)
+ cfi_restore(14)
+ # LOE rbx rbp r15 r12d r13d
-/* Auxilary branch
- * for out of main path inputs
- */
+ /* Auxilary branch
+ * for out of main path inputs
+ */
L(AUX_BRANCH):
- movdqu .FLT_17(%rip), %xmm3
+ movdqu .FLT_17(%rip), %xmm3
-/*
- * Get the (2^a / 2pi) mod 1 values from the table.
- * Because doesn't have L-type gather, we need a trivial cast
- */
- lea __svml_dtan_reduction_data_internal(%rip), %r8
- pand %xmm1, %xmm3
- psrlq $52, %xmm3
+ /*
+ * Get the (2^a / 2pi) mod 1 values from the table.
+ * Because doesn't have L-type gather, we need a trivial cast
+ */
+ lea __svml_dtan_reduction_data_internal(%rip), %r8
+ pand %xmm1, %xmm3
+ psrlq $52, %xmm3
-/*
- * Also get the significand as an integer
- * NB: adding in the integer bit is wrong for denorms!
- * To make this work for denorms we should do something slightly different
- */
- movdqu .FLT_18(%rip), %xmm2
- movd %xmm3, %edx
- pand %xmm1, %xmm2
- paddq .FLT_19(%rip), %xmm2
- pextrw $4, %xmm3, %esi
- movups _dRangeVal+__svml_dtan_data_internal(%rip), %xmm10
- lea (%rdx,%rdx,2), %ecx
- shll $3, %ecx
- lea (%rsi,%rsi,2), %edi
- shll $3, %edi
- movdqa %xmm2, %xmm6
- movq 16(%rcx,%r8), %xmm8
- andps %xmm10, %xmm5
- movhpd 16(%rdi,%r8), %xmm8
- psrlq $32, %xmm6
- movups %xmm0, 16(%rsp)
- movaps %xmm8, %xmm0
+ /*
+ * Also get the significand as an integer
+ * NB: adding in the integer bit is wrong for denorms!
+ * To make this work for denorms we should do something slightly different
+ */
+ movdqu .FLT_18(%rip), %xmm2
+ movd %xmm3, %edx
+ pand %xmm1, %xmm2
+ paddq .FLT_19(%rip), %xmm2
+ pextrw $4, %xmm3, %esi
+ movups _dRangeVal+__svml_dtan_data_internal(%rip), %xmm10
+ lea (%rdx, %rdx, 2), %ecx
+ shll $3, %ecx
+ lea (%rsi, %rsi, 2), %edi
+ shll $3, %edi
+ movdqa %xmm2, %xmm6
+ movq 16(%rcx, %r8), %xmm8
+ andps %xmm10, %xmm5
+ movhpd 16(%rdi, %r8), %xmm8
+ psrlq $32, %xmm6
+ movups %xmm0, 16(%rsp)
+ movaps %xmm8, %xmm0
-/*
- * Break the P_xxx and m into 32-bit chunks ready for
- * the long multiplication via 32x32->64 multiplications
- */
- movdqu .FLT_20(%rip), %xmm15
- psrlq $32, %xmm0
- movq 8(%rcx,%r8), %xmm13
- pand %xmm15, %xmm2
- cmpeqpd %xmm10, %xmm5
- movdqa %xmm6, %xmm10
- movdqa %xmm2, %xmm11
- movhpd 8(%rdi,%r8), %xmm13
- pand %xmm15, %xmm8
- pmuludq %xmm0, %xmm10
- movaps %xmm13, %xmm14
- pmuludq %xmm2, %xmm0
- pmuludq %xmm6, %xmm8
- movmskpd %xmm5, %eax
- pand %xmm15, %xmm13
- psrlq $32, %xmm0
- pmuludq %xmm13, %xmm11
- psrlq $32, %xmm14
- pmuludq %xmm6, %xmm13
- paddq %xmm0, %xmm10
- movdqa %xmm2, %xmm12
- movdqa %xmm15, %xmm3
- pmuludq %xmm14, %xmm12
- pand %xmm11, %xmm3
- pmuludq %xmm6, %xmm14
- paddq %xmm10, %xmm3
- movq (%rcx,%r8), %xmm7
- movdqa %xmm15, %xmm9
- movhpd (%rdi,%r8), %xmm7
- psrlq $32, %xmm8
- psrlq $32, %xmm11
- pand %xmm7, %xmm9
- movdqa %xmm2, %xmm5
- movdqa %xmm15, %xmm10
- paddq %xmm3, %xmm8
- paddq %xmm11, %xmm13
- pmuludq %xmm9, %xmm5
+ /*
+ * Break the P_xxx and m into 32-bit chunks ready for
+ * the long multiplication via 32x32->64 multiplications
+ */
+ movdqu .FLT_20(%rip), %xmm15
+ psrlq $32, %xmm0
+ movq 8(%rcx, %r8), %xmm13
+ pand %xmm15, %xmm2
+ cmpeqpd %xmm10, %xmm5
+ movdqa %xmm6, %xmm10
+ movdqa %xmm2, %xmm11
+ movhpd 8(%rdi, %r8), %xmm13
+ pand %xmm15, %xmm8
+ pmuludq %xmm0, %xmm10
+ movaps %xmm13, %xmm14
+ pmuludq %xmm2, %xmm0
+ pmuludq %xmm6, %xmm8
+ movmskpd %xmm5, %eax
+ pand %xmm15, %xmm13
+ psrlq $32, %xmm0
+ pmuludq %xmm13, %xmm11
+ psrlq $32, %xmm14
+ pmuludq %xmm6, %xmm13
+ paddq %xmm0, %xmm10
+ movdqa %xmm2, %xmm12
+ movdqa %xmm15, %xmm3
+ pmuludq %xmm14, %xmm12
+ pand %xmm11, %xmm3
+ pmuludq %xmm6, %xmm14
+ paddq %xmm10, %xmm3
+ movq (%rcx, %r8), %xmm7
+ movdqa %xmm15, %xmm9
+ movhpd (%rdi, %r8), %xmm7
+ psrlq $32, %xmm8
+ psrlq $32, %xmm11
+ pand %xmm7, %xmm9
+ movdqa %xmm2, %xmm5
+ movdqa %xmm15, %xmm10
+ paddq %xmm3, %xmm8
+ paddq %xmm11, %xmm13
+ pmuludq %xmm9, %xmm5
-/* Now do the big multiplication and carry propagation */
- pmuludq %xmm9, %xmm6
- pand %xmm12, %xmm10
- movaps %xmm8, %xmm0
- paddq %xmm13, %xmm10
- psrlq $32, %xmm0
- psrlq $32, %xmm12
- psrlq $32, %xmm7
- movdqa %xmm15, %xmm11
- paddq %xmm10, %xmm0
- paddq %xmm12, %xmm14
- pmuludq %xmm7, %xmm2
- pand %xmm5, %xmm11
- movdqa %xmm0, %xmm13
- paddq %xmm14, %xmm11
- psrlq $32, %xmm13
- psrlq $32, %xmm5
- paddq %xmm11, %xmm13
- paddq %xmm5, %xmm6
- pand %xmm15, %xmm2
- movdqa %xmm13, %xmm3
- paddq %xmm6, %xmm2
- psrlq $32, %xmm3
- pand %xmm15, %xmm13
- paddq %xmm2, %xmm3
- psllq $32, %xmm3
+ /* Now do the big multiplication and carry propagation */
+ pmuludq %xmm9, %xmm6
+ pand %xmm12, %xmm10
+ movaps %xmm8, %xmm0
+ paddq %xmm13, %xmm10
+ psrlq $32, %xmm0
+ psrlq $32, %xmm12
+ psrlq $32, %xmm7
+ movdqa %xmm15, %xmm11
+ paddq %xmm10, %xmm0
+ paddq %xmm12, %xmm14
+ pmuludq %xmm7, %xmm2
+ pand %xmm5, %xmm11
+ movdqa %xmm0, %xmm13
+ paddq %xmm14, %xmm11
+ psrlq $32, %xmm13
+ psrlq $32, %xmm5
+ paddq %xmm11, %xmm13
+ paddq %xmm5, %xmm6
+ pand %xmm15, %xmm2
+ movdqa %xmm13, %xmm3
+ paddq %xmm6, %xmm2
+ psrlq $32, %xmm3
+ pand %xmm15, %xmm13
+ paddq %xmm2, %xmm3
+ psllq $32, %xmm3
-/* Assemble reduced argument from the pieces */
- pand %xmm15, %xmm8
- paddq %xmm13, %xmm3
+ /* Assemble reduced argument from the pieces */
+ pand %xmm15, %xmm8
+ paddq %xmm13, %xmm3
-/*
- * We want to incorporate the original sign now too.
- * Do it here for convenience in getting the right N value,
- * though we could wait right to the end if we were prepared
- * to modify the sign of N later too.
- * So get the appropriate sign mask now (or sooner).
- */
- movdqu .FLT_21(%rip), %xmm9
- movdqa %xmm3, %xmm5
+ /*
+ * We want to incorporate the original sign now too.
+ * Do it here for convenience in getting the right N value,
+ * though we could wait right to the end if we were prepared
+ * to modify the sign of N later too.
+ * So get the appropriate sign mask now (or sooner).
+ */
+ movdqu .FLT_21(%rip), %xmm9
+ movdqa %xmm3, %xmm5
-/*
- * Create floating-point high part, implicitly adding integer bit 1
- * Incorporate overall sign at this stage too.
- */
- movdqu .FLT_22(%rip), %xmm15
- pand %xmm1, %xmm9
+ /*
+ * Create floating-point high part, implicitly adding integer bit 1
+ * Incorporate overall sign at this stage too.
+ */
+ movdqu .FLT_22(%rip), %xmm15
+ pand %xmm1, %xmm9
-/*
- * Now round at the 2^-9 bit position for reduction mod pi/2^8
- * instead of the original 2pi (but still with the same 2pi scaling).
- * Use a shifter of 2^43 + 2^42.
- * The N we get is our final version; it has an offset of
- * 2^9 because of the implicit integer bit, and anyway for negative
- * starting value it's a 2s complement thing. But we need to mask
- * off the exponent part anyway so it's fine.
- */
- movups .FLT_23(%rip), %xmm12
- psrlq $12, %xmm5
- pxor %xmm9, %xmm15
- movaps %xmm12, %xmm10
- por %xmm15, %xmm5
- psllq $32, %xmm0
- addpd %xmm5, %xmm10
- paddq %xmm8, %xmm0
- movaps %xmm10, %xmm14
+ /*
+ * Now round at the 2^-9 bit position for reduction mod pi/2^8
+ * instead of the original 2pi (but still with the same 2pi scaling).
+ * Use a shifter of 2^43 + 2^42.
+ * The N we get is our final version; it has an offset of
+ * 2^9 because of the implicit integer bit, and anyway for negative
+ * starting value it's a 2s complement thing. But we need to mask
+ * off the exponent part anyway so it's fine.
+ */
+ movups .FLT_23(%rip), %xmm12
+ psrlq $12, %xmm5
+ pxor %xmm9, %xmm15
+ movaps %xmm12, %xmm10
+ por %xmm15, %xmm5
+ psllq $32, %xmm0
+ addpd %xmm5, %xmm10
+ paddq %xmm8, %xmm0
+ movaps %xmm10, %xmm14
-/* Load constants (not all needed at once) */
- lea _dCoeffs+96+__svml_dtan_data_internal(%rip), %rdx
- movdqu .FLT_27(%rip), %xmm6
- movdqu .FLT_25(%rip), %xmm7
- pand %xmm3, %xmm6
+ /* Load constants (not all needed at once) */
+ lea _dCoeffs+96+__svml_dtan_data_internal(%rip), %rdx
+ movdqu .FLT_27(%rip), %xmm6
+ movdqu .FLT_25(%rip), %xmm7
+ pand %xmm3, %xmm6
-/*
- * Create floating-point low and medium parts, respectively
- * lo_23, ... lo_0, 0, ..., 0
- * hi_11, ... hi_0, lo_63, ..., lo_24
- * then subtract off the implicitly added integer bits,
- * 2^-104 and 2^-52, respectively.
- * Put the original sign into all of them at this stage.
- */
- movdqu .FLT_24(%rip), %xmm8
- pand %xmm0, %xmm7
- subpd %xmm12, %xmm14
- psllq $40, %xmm6
- psrlq $24, %xmm0
- pxor %xmm9, %xmm8
- por %xmm0, %xmm6
- pxor .FLT_26(%rip), %xmm9
- psllq $28, %xmm7
- subpd %xmm14, %xmm5
- por %xmm9, %xmm6