diff options
| author | Sunil K Pandey <skpgkp2@gmail.com> | 2022-03-07 10:47:15 -0800 |
|---|---|---|
| committer | Sunil K Pandey <skpgkp2@gmail.com> | 2022-03-07 21:44:09 -0800 |
| commit | 7f852d2592b50ef9c6daed656b8f33c65bfe594a (patch) | |
| tree | a616daeb732d919c2f2b4aa389526f842b14b1d6 | |
| parent | 160e183a9a1d2bb54fb899f86ab67ce516a95304 (diff) | |
| download | glibc-7f852d2592b50ef9c6daed656b8f33c65bfe594a.tar.xz glibc-7f852d2592b50ef9c6daed656b8f33c65bfe594a.zip | |
x86_64: Fix svml_d_tan2_core_sse4.S code formatting
This commit contains the following formatting changes:
1. Instructions preceded by a tab.
2. Instruction less than 8 characters in length have a tab
between it and the first operand.
3. Instruction greater than 7 characters in length have a
space between it and the first operand.
4. Tabs after `#define`d names and their value.
5. 8 spaces at the beginning of a line replaced by a tab.
6. Indent comments with code.
7. Remove redundant .text section.
8. 1 space between line content and line comment.
9. Space after all commas.
Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>
| -rw-r--r-- | sysdeps/x86_64/fpu/multiarch/svml_d_tan2_core_sse4.S | 12143 |
1 files changed, 6070 insertions, 6073 deletions
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_tan2_core_sse4.S b/sysdeps/x86_64/fpu/multiarch/svml_d_tan2_core_sse4.S index 879cfd631d..d572b886be 100644 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_tan2_core_sse4.S +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_tan2_core_sse4.S @@ -47,6213 +47,6210 @@ /* Offsets for data table __svml_dtan_data_internal */ -#define _dAbsMask 0 -#define _dRangeVal 16 -#define _dRShift 32 -#define _dCoeffs 48 -#define _dReductionRangeVal 26672 -#define _dInvPi 26688 -#define _dPI1 26704 -#define _dPI2 26720 -#define _dPI3 26736 -#define _dP1 26752 -#define _dP2 26768 -#define _dP3 26784 -#define _dQ0 26800 -#define _dQ1 26816 -#define _dQ2 26832 -#define _dQ3 26848 +#define _dAbsMask 0 +#define _dRangeVal 16 +#define _dRShift 32 +#define _dCoeffs 48 +#define _dReductionRangeVal 26672 +#define _dInvPi 26688 +#define _dPI1 26704 +#define _dPI2 26720 +#define _dPI3 26736 +#define _dP1 26752 +#define _dP2 26768 +#define _dP3 26784 +#define _dQ0 26800 +#define _dQ1 26816 +#define _dQ2 26832 +#define _dQ3 26848 #include <sysdep.h> - .text - .section .text.sse4,"ax",@progbits + .section .text.sse4, "ax", @progbits ENTRY(_ZGVbN2v_tan_sse4) - subq $72, %rsp - cfi_def_cfa_offset(80) - movaps %xmm0, %xmm1 - movups _dAbsMask+__svml_dtan_data_internal(%rip), %xmm4 + subq $72, %rsp + cfi_def_cfa_offset(80) + movaps %xmm0, %xmm1 + movups _dAbsMask+__svml_dtan_data_internal(%rip), %xmm4 -/* Legacy Code */ - xorl %eax, %eax + /* Legacy Code */ + xorl %eax, %eax -/* b) Remove sign using AND 0x7fffffffffffffff operation */ - movaps %xmm4, %xmm5 + /* b) Remove sign using AND 0x7fffffffffffffff operation */ + movaps %xmm4, %xmm5 -/* 1) Range reduction to [-Pi/4; +Pi/4] interval */ - pxor %xmm11, %xmm11 + /* 1) Range reduction to [-Pi/4; +Pi/4] interval */ + pxor %xmm11, %xmm11 -/* - * c) Getting octant Y by 2/Pi multiplication - * d) Add "Right Shifter" (0x4330000000000000) value - */ - movups _dInvPi+__svml_dtan_data_internal(%rip), %xmm3 - andps %xmm1, 
%xmm5 - mulpd %xmm5, %xmm3 - movups _dRShift+__svml_dtan_data_internal(%rip), %xmm6 + /* + * c) Getting octant Y by 2/Pi multiplication + * d) Add "Right Shifter" (0x4330000000000000) value + */ + movups _dInvPi+__svml_dtan_data_internal(%rip), %xmm3 + andps %xmm1, %xmm5 + mulpd %xmm5, %xmm3 + movups _dRShift+__svml_dtan_data_internal(%rip), %xmm6 -/* - * Range reduction - * X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; - */ - movaps %xmm5, %xmm2 - addpd %xmm6, %xmm3 + /* + * Range reduction + * X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4; + */ + movaps %xmm5, %xmm2 + addpd %xmm6, %xmm3 -/* g) Subtract "Right Shifter" (0x4330000000000000) value */ - movaps %xmm3, %xmm9 + /* g) Subtract "Right Shifter" (0x4330000000000000) value */ + movaps %xmm3, %xmm9 -/* a) Grab sign from source argument and save it. */ - andnps %xmm1, %xmm4 - subpd %xmm6, %xmm9 - movups _dPI1+__svml_dtan_data_internal(%rip), %xmm7 + /* a) Grab sign from source argument and save it. */ + andnps %xmm1, %xmm4 + subpd %xmm6, %xmm9 + movups _dPI1+__svml_dtan_data_internal(%rip), %xmm7 -/* - * e) Treat obtained value as integer for destination sign setting. - * Shift first bit of this value to the last (sign) position (S << 63) - * f) Change destination sign if source sign is negative - * using XOR operation. - */ - movaps %xmm3, %xmm6 - mulpd %xmm9, %xmm7 - movups _dPI2+__svml_dtan_data_internal(%rip), %xmm8 - psllq $62, %xmm3 - mulpd %xmm9, %xmm8 - subpd %xmm7, %xmm2 - cmpneqpd %xmm11, %xmm3 - subpd %xmm8, %xmm2 - movups _dPI3+__svml_dtan_data_internal(%rip), %xmm10 + /* + * e) Treat obtained value as integer for destination sign setting. + * Shift first bit of this value to the last (sign) position (S << 63) + * f) Change destination sign if source sign is negative + * using XOR operation. 
+ */ + movaps %xmm3, %xmm6 + mulpd %xmm9, %xmm7 + movups _dPI2+__svml_dtan_data_internal(%rip), %xmm8 + psllq $62, %xmm3 + mulpd %xmm9, %xmm8 + subpd %xmm7, %xmm2 + cmpneqpd %xmm11, %xmm3 + subpd %xmm8, %xmm2 + movups _dPI3+__svml_dtan_data_internal(%rip), %xmm10 -/* - * c) Swap P and Q if first bit of obtained value after - * Right Shifting is set to 1. Using And, Andnot & Or operations. - */ - movaps %xmm3, %xmm0 - mulpd %xmm9, %xmm10 - subpd %xmm10, %xmm2 + /* + * c) Swap P and Q if first bit of obtained value after + * Right Shifting is set to 1. Using And, Andnot & Or operations. + */ + movaps %xmm3, %xmm0 + mulpd %xmm9, %xmm10 + subpd %xmm10, %xmm2 -/* a) Calculate X^2 = X * X */ - movaps %xmm2, %xmm15 - movaps %xmm3, %xmm14 - mulpd %xmm2, %xmm15 + /* a) Calculate X^2 = X * X */ + movaps %xmm2, %xmm15 + movaps %xmm3, %xmm14 + mulpd %xmm2, %xmm15 -/* - * b) Calculate 2 polynomials: - * P = X * (P0 + X^2 * (P1 + x^2 * (P2 + x^2 * (P3)))); - * Q = Q0 + X^2 * (Q1 + x^2 * (Q2 + x^2 * (Q3))); - * Assume P0 = 1 - */ - movups _dP3+__svml_dtan_data_internal(%rip), %xmm13 - psllq $63, %xmm6 - mulpd %xmm15, %xmm13 - movups _dQ3+__svml_dtan_data_internal(%rip), %xmm12 - pxor %xmm4, %xmm6 - addpd _dP2+__svml_dtan_data_internal(%rip), %xmm13 - mulpd %xmm15, %xmm12 - mulpd %xmm15, %xmm13 - addpd _dQ2+__svml_dtan_data_internal(%rip), %xmm12 - addpd _dP1+__svml_dtan_data_internal(%rip), %xmm13 - mulpd %xmm15, %xmm12 - mulpd %xmm15, %xmm13 - addpd _dQ1+__svml_dtan_data_internal(%rip), %xmm12 - mulpd %xmm2, %xmm13 - mulpd %xmm12, %xmm15 - addpd %xmm13, %xmm2 - addpd _dQ0+__svml_dtan_data_internal(%rip), %xmm15 - andnps %xmm2, %xmm0 - andps %xmm15, %xmm14 - andps %xmm3, %xmm2 - andnps %xmm15, %xmm3 - orps %xmm14, %xmm0 - orps %xmm3, %xmm2 + /* + * b) Calculate 2 polynomials: + * P = X * (P0 + X^2 * (P1 + x^2 * (P2 + x^2 * (P3)))); + * Q = Q0 + X^2 * (Q1 + x^2 * (Q2 + x^2 * (Q3))); + * Assume P0 = 1 + */ + movups _dP3+__svml_dtan_data_internal(%rip), %xmm13 + psllq $63, %xmm6 + 
mulpd %xmm15, %xmm13 + movups _dQ3+__svml_dtan_data_internal(%rip), %xmm12 + pxor %xmm4, %xmm6 + addpd _dP2+__svml_dtan_data_internal(%rip), %xmm13 + mulpd %xmm15, %xmm12 + mulpd %xmm15, %xmm13 + addpd _dQ2+__svml_dtan_data_internal(%rip), %xmm12 + addpd _dP1+__svml_dtan_data_internal(%rip), %xmm13 + mulpd %xmm15, %xmm12 + mulpd %xmm15, %xmm13 + addpd _dQ1+__svml_dtan_data_internal(%rip), %xmm12 + mulpd %xmm2, %xmm13 + mulpd %xmm12, %xmm15 + addpd %xmm13, %xmm2 + addpd _dQ0+__svml_dtan_data_internal(%rip), %xmm15 + andnps %xmm2, %xmm0 + andps %xmm15, %xmm14 + andps %xmm3, %xmm2 + andnps %xmm15, %xmm3 + orps %xmm14, %xmm0 + orps %xmm3, %xmm2 -/* d) Divide R = P / Q; */ - divpd %xmm2, %xmm0 + /* d) Divide R = P / Q; */ + divpd %xmm2, %xmm0 -/* Large values check */ - movaps %xmm5, %xmm4 + /* Large values check */ + movaps %xmm5, %xmm4 -/* - * 3) Destination sign setting - * a) Set shifted destination sign using XOR operation: - * R = XOR( R, S ); - */ - pxor %xmm6, %xmm0 - cmpnlepd _dReductionRangeVal+__svml_dtan_data_internal(%rip), %xmm4 - movmskpd %xmm4, %edx - testl %edx, %edx + /* + * 3) Destination sign setting + * a) Set shifted destination sign using XOR operation: + * R = XOR( R, S ); + */ + pxor %xmm6, %xmm0 + cmpnlepd _dReductionRangeVal+__svml_dtan_data_internal(%rip), %xmm4 + movmskpd %xmm4, %edx + testl %edx, %edx -/* Go to auxilary branch */ - jne L(AUX_BRANCH) - # LOE rbx rbp r12 r13 r14 r15 eax xmm0 xmm1 xmm4 xmm5 + /* Go to auxilary branch */ + jne L(AUX_BRANCH) + # LOE rbx rbp r12 r13 r14 r15 eax xmm0 xmm1 xmm4 xmm5 -/* Return from auxilary branch - * for out of main path inputs - */ + /* Return from auxilary branch + * for out of main path inputs + */ L(AUX_BRANCH_RETURN): - testl %eax, %eax + testl %eax, %eax -/* Go to special inputs processing branch */ - jne L(SPECIAL_VALUES_BRANCH) - # LOE rbx rbp r12 r13 r14 r15 eax xmm0 xmm1 + /* Go to special inputs processing branch */ + jne L(SPECIAL_VALUES_BRANCH) + # LOE rbx rbp r12 r13 r14 r15 eax xmm0 
xmm1 -/* Restore registers - * and exit the function - */ + /* Restore registers + * and exit the function + */ L(EXIT): - addq $72, %rsp - cfi_def_cfa_offset(8) - ret - cfi_def_cfa_offset(80) + addq $72, %rsp + cfi_def_cfa_offset(8) + ret + cfi_def_cfa_offset(80) -/* Branch to process - * special inputs - */ + /* Branch to process + * special inputs + */ L(SPECIAL_VALUES_BRANCH): - movups %xmm1, 32(%rsp) - movups %xmm0, 48(%rsp) - # LOE rbx rbp r12 r13 r14 r15 eax xmm0 + movups %xmm1, 32(%rsp) + movups %xmm0, 48(%rsp) + # LOE rbx rbp r12 r13 r14 r15 eax xmm0 - xorl %edx, %edx - movq %r12, 16(%rsp) - cfi_offset(12, -64) - movl %edx, %r12d - movq %r13, 8(%rsp) - cfi_offset(13, -72) - movl %eax, %r13d - movq %r14, (%rsp) - cfi_offset(14, -80) - # LOE rbx rbp r15 r12d r13d + xorl %edx, %edx + movq %r12, 16(%rsp) + cfi_offset(12, -64) + movl %edx, %r12d + movq %r13, 8(%rsp) + cfi_offset(13, -72) + movl %eax, %r13d + movq %r14, (%rsp) + cfi_offset(14, -80) + # LOE rbx rbp r15 r12d r13d -/* Range mask - * bits check - */ + /* Range mask + * bits check + */ L(RANGEMASK_CHECK): - btl %r12d, %r13d + btl %r12d, %r13d -/* Call scalar math function */ - jc L(SCALAR_MATH_CALL) - # LOE rbx rbp r15 r12d r13d + /* Call scalar math function */ + jc L(SCALAR_MATH_CALL) + # LOE rbx rbp r15 r12d r13d -/* Special inputs - * processing loop - */ + /* Special inputs + * processing loop + */ L(SPECIAL_VALUES_LOOP): - incl %r12d - cmpl $2, %r12d + incl %r12d + cmpl $2, %r12d -/* Check bits in range mask */ - jl L(RANGEMASK_CHECK) - # LOE rbx rbp r15 r12d r13d + /* Check bits in range mask */ + jl L(RANGEMASK_CHECK) + # LOE rbx rbp r15 r12d r13d - movq 16(%rsp), %r12 - cfi_restore(12) - movq 8(%rsp), %r13 - cfi_restore(13) - movq (%rsp), %r14 - cfi_restore(14) - movups 48(%rsp), %xmm0 + movq 16(%rsp), %r12 + cfi_restore(12) + movq 8(%rsp), %r13 + cfi_restore(13) + movq (%rsp), %r14 + cfi_restore(14) + movups 48(%rsp), %xmm0 -/* Go to exit */ - jmp L(EXIT) - cfi_offset(12, -64) - 
cfi_offset(13, -72) - cfi_offset(14, -80) - # LOE rbx rbp r12 r13 r14 r15 xmm0 + /* Go to exit */ + jmp L(EXIT) + cfi_offset(12, -64) + cfi_offset(13, -72) + cfi_offset(14, -80) + # LOE rbx rbp r12 r13 r14 r15 xmm0 -/* Scalar math fucntion call - * to process special input - */ + /* Scalar math fucntion call + * to process special input + */ L(SCALAR_MATH_CALL): - movl %r12d, %r14d - movsd 32(%rsp,%r14,8), %xmm0 - call tan@PLT - # LOE rbx rbp r14 r15 r12d r13d xmm0 + movl %r12d, %r14d + movsd 32(%rsp, %r14, 8), %xmm0 + call tan@PLT + # LOE rbx rbp r14 r15 r12d r13d xmm0 - movsd %xmm0, 48(%rsp,%r14,8) + movsd %xmm0, 48(%rsp, %r14, 8) -/* Process special inputs in loop */ - jmp L(SPECIAL_VALUES_LOOP) - cfi_restore(12) - cfi_restore(13) - cfi_restore(14) - # LOE rbx rbp r15 r12d r13d + /* Process special inputs in loop */ + jmp L(SPECIAL_VALUES_LOOP) + cfi_restore(12) + cfi_restore(13) + cfi_restore(14) + # LOE rbx rbp r15 r12d r13d -/* Auxilary branch - * for out of main path inputs - */ + /* Auxilary branch + * for out of main path inputs + */ L(AUX_BRANCH): - movdqu .FLT_17(%rip), %xmm3 + movdqu .FLT_17(%rip), %xmm3 -/* - * Get the (2^a / 2pi) mod 1 values from the table. - * Because doesn't have L-type gather, we need a trivial cast - */ - lea __svml_dtan_reduction_data_internal(%rip), %r8 - pand %xmm1, %xmm3 - psrlq $52, %xmm3 + /* + * Get the (2^a / 2pi) mod 1 values from the table. + * Because doesn't have L-type gather, we need a trivial cast + */ + lea __svml_dtan_reduction_data_internal(%rip), %r8 + pand %xmm1, %xmm3 + psrlq $52, %xmm3 -/* - * Also get the significand as an integer - * NB: adding in the integer bit is wrong for denorms! 
- * To make this work for denorms we should do something slightly different - */ - movdqu .FLT_18(%rip), %xmm2 - movd %xmm3, %edx - pand %xmm1, %xmm2 - paddq .FLT_19(%rip), %xmm2 - pextrw $4, %xmm3, %esi - movups _dRangeVal+__svml_dtan_data_internal(%rip), %xmm10 - lea (%rdx,%rdx,2), %ecx - shll $3, %ecx - lea (%rsi,%rsi,2), %edi - shll $3, %edi - movdqa %xmm2, %xmm6 - movq 16(%rcx,%r8), %xmm8 - andps %xmm10, %xmm5 - movhpd 16(%rdi,%r8), %xmm8 - psrlq $32, %xmm6 - movups %xmm0, 16(%rsp) - movaps %xmm8, %xmm0 + /* + * Also get the significand as an integer + * NB: adding in the integer bit is wrong for denorms! + * To make this work for denorms we should do something slightly different + */ + movdqu .FLT_18(%rip), %xmm2 + movd %xmm3, %edx + pand %xmm1, %xmm2 + paddq .FLT_19(%rip), %xmm2 + pextrw $4, %xmm3, %esi + movups _dRangeVal+__svml_dtan_data_internal(%rip), %xmm10 + lea (%rdx, %rdx, 2), %ecx + shll $3, %ecx + lea (%rsi, %rsi, 2), %edi + shll $3, %edi + movdqa %xmm2, %xmm6 + movq 16(%rcx, %r8), %xmm8 + andps %xmm10, %xmm5 + movhpd 16(%rdi, %r8), %xmm8 + psrlq $32, %xmm6 + movups %xmm0, 16(%rsp) + movaps %xmm8, %xmm0 -/* - * Break the P_xxx and m into 32-bit chunks ready for - * the long multiplication via 32x32->64 multiplications - */ - movdqu .FLT_20(%rip), %xmm15 - psrlq $32, %xmm0 - movq 8(%rcx,%r8), %xmm13 - pand %xmm15, %xmm2 - cmpeqpd %xmm10, %xmm5 - movdqa %xmm6, %xmm10 - movdqa %xmm2, %xmm11 - movhpd 8(%rdi,%r8), %xmm13 - pand %xmm15, %xmm8 - pmuludq %xmm0, %xmm10 - movaps %xmm13, %xmm14 - pmuludq %xmm2, %xmm0 - pmuludq %xmm6, %xmm8 - movmskpd %xmm5, %eax - pand %xmm15, %xmm13 - psrlq $32, %xmm0 - pmuludq %xmm13, %xmm11 - psrlq $32, %xmm14 - pmuludq %xmm6, %xmm13 - paddq %xmm0, %xmm10 - movdqa %xmm2, %xmm12 - movdqa %xmm15, %xmm3 - pmuludq %xmm14, %xmm12 - pand %xmm11, %xmm3 - pmuludq %xmm6, %xmm14 - paddq %xmm10, %xmm3 - movq (%rcx,%r8), %xmm7 - movdqa %xmm15, %xmm9 - movhpd (%rdi,%r8), %xmm7 - psrlq $32, %xmm8 - psrlq $32, %xmm11 - pand %xmm7, %xmm9 
- movdqa %xmm2, %xmm5 - movdqa %xmm15, %xmm10 - paddq %xmm3, %xmm8 - paddq %xmm11, %xmm13 - pmuludq %xmm9, %xmm5 + /* + * Break the P_xxx and m into 32-bit chunks ready for + * the long multiplication via 32x32->64 multiplications + */ + movdqu .FLT_20(%rip), %xmm15 + psrlq $32, %xmm0 + movq 8(%rcx, %r8), %xmm13 + pand %xmm15, %xmm2 + cmpeqpd %xmm10, %xmm5 + movdqa %xmm6, %xmm10 + movdqa %xmm2, %xmm11 + movhpd 8(%rdi, %r8), %xmm13 + pand %xmm15, %xmm8 + pmuludq %xmm0, %xmm10 + movaps %xmm13, %xmm14 + pmuludq %xmm2, %xmm0 + pmuludq %xmm6, %xmm8 + movmskpd %xmm5, %eax + pand %xmm15, %xmm13 + psrlq $32, %xmm0 + pmuludq %xmm13, %xmm11 + psrlq $32, %xmm14 + pmuludq %xmm6, %xmm13 + paddq %xmm0, %xmm10 + movdqa %xmm2, %xmm12 + movdqa %xmm15, %xmm3 + pmuludq %xmm14, %xmm12 + pand %xmm11, %xmm3 + pmuludq %xmm6, %xmm14 + paddq %xmm10, %xmm3 + movq (%rcx, %r8), %xmm7 + movdqa %xmm15, %xmm9 + movhpd (%rdi, %r8), %xmm7 + psrlq $32, %xmm8 + psrlq $32, %xmm11 + pand %xmm7, %xmm9 + movdqa %xmm2, %xmm5 + movdqa %xmm15, %xmm10 + paddq %xmm3, %xmm8 + paddq %xmm11, %xmm13 + pmuludq %xmm9, %xmm5 -/* Now do the big multiplication and carry propagation */ - pmuludq %xmm9, %xmm6 - pand %xmm12, %xmm10 - movaps %xmm8, %xmm0 - paddq %xmm13, %xmm10 - psrlq $32, %xmm0 - psrlq $32, %xmm12 - psrlq $32, %xmm7 - movdqa %xmm15, %xmm11 - paddq %xmm10, %xmm0 - paddq %xmm12, %xmm14 - pmuludq %xmm7, %xmm2 - pand %xmm5, %xmm11 - movdqa %xmm0, %xmm13 - paddq %xmm14, %xmm11 - psrlq $32, %xmm13 - psrlq $32, %xmm5 - paddq %xmm11, %xmm13 - paddq %xmm5, %xmm6 - pand %xmm15, %xmm2 - movdqa %xmm13, %xmm3 - paddq %xmm6, %xmm2 - psrlq $32, %xmm3 - pand %xmm15, %xmm13 - paddq %xmm2, %xmm3 - psllq $32, %xmm3 + /* Now do the big multiplication and carry propagation */ + pmuludq %xmm9, %xmm6 + pand %xmm12, %xmm10 + movaps %xmm8, %xmm0 + paddq %xmm13, %xmm10 + psrlq $32, %xmm0 + psrlq $32, %xmm12 + psrlq $32, %xmm7 + movdqa %xmm15, %xmm11 + paddq %xmm10, %xmm0 + paddq %xmm12, %xmm14 + pmuludq %xmm7, %xmm2 + pand 
%xmm5, %xmm11 + movdqa %xmm0, %xmm13 + paddq %xmm14, %xmm11 + psrlq $32, %xmm13 + psrlq $32, %xmm5 + paddq %xmm11, %xmm13 + paddq %xmm5, %xmm6 + pand %xmm15, %xmm2 + movdqa %xmm13, %xmm3 + paddq %xmm6, %xmm2 + psrlq $32, %xmm3 + pand %xmm15, %xmm13 + paddq %xmm2, %xmm3 + psllq $32, %xmm3 -/* Assemble reduced argument from the pieces */ - pand %xmm15, %xmm8 - paddq %xmm13, %xmm3 + /* Assemble reduced argument from the pieces */ + pand %xmm15, %xmm8 + paddq %xmm13, %xmm3 -/* - * We want to incorporate the original sign now too. - * Do it here for convenience in getting the right N value, - * though we could wait right to the end if we were prepared - * to modify the sign of N later too. - * So get the appropriate sign mask now (or sooner). - */ - movdqu .FLT_21(%rip), %xmm9 - movdqa %xmm3, %xmm5 + /* + * We want to incorporate the original sign now too. + * Do it here for convenience in getting the right N value, + * though we could wait right to the end if we were prepared + * to modify the sign of N later too. + * So get the appropriate sign mask now (or sooner). + */ + movdqu .FLT_21(%rip), %xmm9 + movdqa %xmm3, %xmm5 -/* - * Create floating-point high part, implicitly adding integer bit 1 - * Incorporate overall sign at this stage too. - */ - movdqu .FLT_22(%rip), %xmm15 - pand %xmm1, %xmm9 + /* + * Create floating-point high part, implicitly adding integer bit 1 + * Incorporate overall sign at this stage too. + */ + movdqu .FLT_22(%rip), %xmm15 + pand %xmm1, %xmm9 -/* - * Now round at the 2^-9 bit position for reduction mod pi/2^8 - * instead of the original 2pi (but still with the same 2pi scaling). - * Use a shifter of 2^43 + 2^42. - * The N we get is our final version; it has an offset of - * 2^9 because of the implicit integer bit, and anyway for negative - * starting value it's a 2s complement thing. But we need to mask - * off the exponent part anyway so it's fine. 
- */ - movups .FLT_23(%rip), %xmm12 - psrlq $12, %xmm5 - pxor %xmm9, %xmm15 - movaps %xmm12, %xmm10 - por %xmm15, %xmm5 - psllq $32, %xmm0 - addpd %xmm5, %xmm10 - paddq %xmm8, %xmm0 - movaps %xmm10, %xmm14 + /* + * Now round at the 2^-9 bit position for reduction mod pi/2^8 + * instead of the original 2pi (but still with the same 2pi scaling). + * Use a shifter of 2^43 + 2^42. + * The N we get is our final version; it has an offset of + * 2^9 because of the implicit integer bit, and anyway for negative + * starting value it's a 2s complement thing. But we need to mask + * off the exponent part anyway so it's fine. + */ + movups .FLT_23(%rip), %xmm12 + psrlq $12, %xmm5 + pxor %xmm9, %xmm15 + movaps %xmm12, %xmm10 + por %xmm15, %xmm5 + psllq $32, %xmm0 + addpd %xmm5, %xmm10 + paddq %xmm8, %xmm0 + movaps %xmm10, %xmm14 -/* Load constants (not all needed at once) */ - lea _dCoeffs+96+__svml_dtan_data_internal(%rip), %rdx - movdqu .FLT_27(%rip), %xmm6 - movdqu .FLT_25(%rip), %xmm7 - pand %xmm3, %xmm6 + /* Load constants (not all needed at once) */ + lea _dCoeffs+96+__svml_dtan_data_internal(%rip), %rdx + movdqu .FLT_27(%rip), %xmm6 + movdqu .FLT_25(%rip), %xmm7 + pand %xmm3, %xmm6 -/* - * Create floating-point low and medium parts, respectively - * lo_23, ... lo_0, 0, ..., 0 - * hi_11, ... hi_0, lo_63, ..., lo_24 - * then subtract off the implicitly added integer bits, - * 2^-104 and 2^-52, respectively. - * Put the original sign into all of them at this stage. - */ - movdqu .FLT_24(%rip), %xmm8 - pand %xmm0, %xmm7 - subpd %xmm12, %xmm14 - psllq $40, %xmm6 - psrlq $24, %xmm0 - pxor %xmm9, %xmm8 - por %xmm0, %xmm6 - pxor .FLT_26(%rip), %xmm9 - psllq $28, %xmm7 - subpd %xmm14, %xmm5 - por %xmm9, %xmm6 |
