| field | value | date |
|---|---|---|
| author | Sunil K Pandey <skpgkp2@gmail.com> | 2022-03-07 10:47:14 -0800 |
| committer | Sunil K Pandey <skpgkp2@gmail.com> | 2022-03-07 21:44:09 -0800 |
| commit | fa2a051dd9e29ccd4ca87f898654f7d451a2833a (patch) | |
| tree | c6f890b6ba4543388f71ae7d3a738e0823a4d163 | |
| parent | a9f782823f211333409e3f39d3e29240693c9df1 (diff) | |
| download | glibc-fa2a051dd9e29ccd4ca87f898654f7d451a2833a.tar.xz, glibc-fa2a051dd9e29ccd4ca87f898654f7d451a2833a.zip | |
x86_64: Fix svml_s_tanf16_core_avx512.S code formatting
This commit contains the following formatting changes (illustrated in the
snippet below):
1. Instructions are preceded by a tab.
2. Instructions less than 8 characters in length have a tab
between the mnemonic and the first operand.
3. Instructions 8 or more characters in length have a
space between the mnemonic and the first operand.
4. A tab after `#define`d names, before their values.
5. 8 spaces at the beginning of a line replaced by a tab.
6. Comments indented with the code.
7. Redundant .text section removed.
8. 1 space between line content and a trailing line comment.
9. A space after all commas.
Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>
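As a quick illustration of rules 1-9, here is a small hypothetical snippet in the new style; the instructions and operands are made up for the example (the `vfnmadd213ps` operand pattern is borrowed from the diff below), not an excerpt from the patched file:

```
/* Rule 4: a tab separates a #defined name from its value.  */
#define _sExampleOffset	64

	/* Rules 1, 5 and 6: tab indentation (not 8 spaces), with comments
	   indented to match the code.  */
	movq	%rsp, %rbp	/* Rule 2: "movq" is shorter than 8
				   characters, so a tab separates it from
				   its first operand.  */
	vfnmadd213ps {rn-sae}, %zmm11, %zmm5, %zmm4 /* Rules 3 and 9: a long
				   mnemonic takes a single space, and every
				   comma is followed by a space.  */
	incl	%r12d /* Rule 8: one space before a trailing line comment.  */
```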
| -rw-r--r-- | sysdeps/x86_64/fpu/multiarch/svml_s_tanf16_core_avx512.S | 1633 |
1 file changed, 815 insertions(+), 818 deletions(-)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_tanf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_tanf16_core_avx512.S
index fc44c37d4a..f2a18f0b2c 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_tanf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_tanf16_core_avx512.S
@@ -34,894 +34,891 @@
 /* Offsets for data table __svml_stan_data_internal */
-#define _sInvPI_uisa 0
-#define _sPI1_uisa 64
-#define _sPI2_uisa 128
-#define _sPI3_uisa 192
-#define Th_tbl_uisa 256
-#define _sPC3_uisa 384
-#define _sPC5_uisa 448
-#define _sRangeReductionVal_uisa 512
-#define _sAbsMask 576
-#define _sRangeVal 640
-#define _sRShifter 704
-#define _sOne 768
-#define _sRangeReductionVal 832
-#define _sPI1 896
-#define _sPI2 960
-#define _sPI3 1024
+#define _sInvPI_uisa	0
+#define _sPI1_uisa	64
+#define _sPI2_uisa	128
+#define _sPI3_uisa	192
+#define Th_tbl_uisa	256
+#define _sPC3_uisa	384
+#define _sPC5_uisa	448
+#define _sRangeReductionVal_uisa	512
+#define _sAbsMask	576
+#define _sRangeVal	640
+#define _sRShifter	704
+#define _sOne	768
+#define _sRangeReductionVal	832
+#define _sPI1	896
+#define _sPI2	960
+#define _sPI3	1024

 #include <sysdep.h>

-        .text
-        .section .text.exex512,"ax",@progbits
+	.section .text.exex512, "ax", @progbits

 ENTRY(_ZGVeN16v_tanf_skx)
-        pushq %rbp
-        cfi_def_cfa_offset(16)
-        movq %rsp, %rbp
-        cfi_def_cfa(6, 16)
-        cfi_offset(6, -16)
-        andq $-64, %rsp
-        subq $192, %rsp
-        xorl %edx, %edx
-
-/* Large values check */
-        vmovups _sRangeReductionVal_uisa+__svml_stan_data_internal(%rip), %zmm10
-
-/*
- *
- * Main path
- *
- * start arg. reduction
- */
-        vmovups _sRShifter+__svml_stan_data_internal(%rip), %zmm1
-        vmovups _sPI1_uisa+__svml_stan_data_internal(%rip), %zmm4
-        vmovups _sPI2_uisa+__svml_stan_data_internal(%rip), %zmm2
-        vmovups _sPI3_uisa+__svml_stan_data_internal(%rip), %zmm3
-        vmovaps %zmm0, %zmm11
-        vandps _sAbsMask+__svml_stan_data_internal(%rip), %zmm11, %zmm0
-        vcmpps $22, {sae}, %zmm10, %zmm0, %k6
-        vmovups __svml_stan_data_internal(%rip), %zmm10
-
-/*
- *
- * End of main path
- */
-
-        kortestw %k6, %k6
-        vfmadd213ps {rn-sae}, %zmm1, %zmm11, %zmm10
-        vsubps {rn-sae}, %zmm1, %zmm10, %zmm5
-        vfnmadd213ps {rn-sae}, %zmm11, %zmm5, %zmm4
-        vfnmadd231ps {rn-sae}, %zmm5, %zmm2, %zmm4
-        vfnmadd213ps {rn-sae}, %zmm4, %zmm3, %zmm5
-
-/* Go to auxilary branch */
-        jne L(AUX_BRANCH)
-        # LOE rbx r12 r13 r14 r15 edx zmm0 zmm5 zmm10 zmm11 k6
-
-/* Return from auxilary branch
- * for out of main path inputs
- */
+	pushq	%rbp
+	cfi_def_cfa_offset(16)
+	movq	%rsp, %rbp
+	cfi_def_cfa(6, 16)
+	cfi_offset(6, -16)
+	andq	$-64, %rsp
+	subq	$192, %rsp
+	xorl	%edx, %edx
+
+	/* Large values check */
+	vmovups	_sRangeReductionVal_uisa+__svml_stan_data_internal(%rip), %zmm10
+
+	/*
+	 *
+	 * Main path
+	 *
+	 * start arg. reduction
+	 */
+	vmovups	_sRShifter+__svml_stan_data_internal(%rip), %zmm1
+	vmovups	_sPI1_uisa+__svml_stan_data_internal(%rip), %zmm4
+	vmovups	_sPI2_uisa+__svml_stan_data_internal(%rip), %zmm2
+	vmovups	_sPI3_uisa+__svml_stan_data_internal(%rip), %zmm3
+	vmovaps	%zmm0, %zmm11
+	vandps	_sAbsMask+__svml_stan_data_internal(%rip), %zmm11, %zmm0
+	vcmpps	$22, {sae}, %zmm10, %zmm0, %k6
+	vmovups	__svml_stan_data_internal(%rip), %zmm10
+
+	/*
+	 *
+	 * End of main path
+	 */
+
+	kortestw %k6, %k6
+	vfmadd213ps {rn-sae}, %zmm1, %zmm11, %zmm10
+	vsubps	{rn-sae}, %zmm1, %zmm10, %zmm5
+	vfnmadd213ps {rn-sae}, %zmm11, %zmm5, %zmm4
+	vfnmadd231ps {rn-sae}, %zmm5, %zmm2, %zmm4
+	vfnmadd213ps {rn-sae}, %zmm4, %zmm3, %zmm5
+
+	/* Go to auxilary branch */
+	jne	L(AUX_BRANCH)
+	# LOE rbx r12 r13 r14 r15 edx zmm0 zmm5 zmm10 zmm11 k6
+
+	/* Return from auxilary branch
+	 * for out of main path inputs
+	 */
 L(AUX_BRANCH_RETURN):
-/* Table lookup */
-        vmovups Th_tbl_uisa+__svml_stan_data_internal(%rip), %zmm3
-        vmovups _sPC3_uisa+__svml_stan_data_internal(%rip), %zmm0
-        vmulps {rn-sae}, %zmm5, %zmm5, %zmm1
-        vpermt2ps Th_tbl_uisa+64+__svml_stan_data_internal(%rip), %zmm10, %zmm3
-        vmovups _sPC5_uisa+__svml_stan_data_internal(%rip), %zmm10
-        vfmadd231ps {rn-sae}, %zmm1, %zmm10, %zmm0
-        vmulps {rn-sae}, %zmm5, %zmm0, %zmm4
-        vfmadd213ps {rn-sae}, %zmm5, %zmm1, %zmm4
-
-/*
- * Computer Denominator:
- * sDenominator - sDlow ~= 1-(sTh+sTl)*(sP+sPlow)
- */
-        vmovups _sOne+__svml_stan_data_internal(%rip), %zmm5
-        vmulps {rn-sae}, %zmm4, %zmm3, %zmm7
-
-/*
- * Compute Numerator:
- * sNumerator + sNlow ~= sTh+sTl+sP+sPlow
- */
-        vaddps {rn-sae}, %zmm3, %zmm4, %zmm8
-        vsubps {rn-sae}, %zmm7, %zmm5, %zmm9
-        vsubps {rn-sae}, %zmm3, %zmm8, %zmm2
-
-/*
- * Now computes (sNumerator + sNlow)/(sDenominator - sDlow)
- * Choose NR iteration instead of hardware division
- */
-        vrcp14ps %zmm9, %zmm14
-        vsubps {rn-sae}, %zmm5, %zmm9, %zmm6
-        vsubps {rn-sae}, %zmm2, %zmm4, %zmm13
-        vmulps {rn-sae}, %zmm8, %zmm14, %zmm15
-        vaddps {rn-sae}, %zmm7, %zmm6, %zmm12
-
-/* One NR iteration to refine sQuotient */
-        vfmsub213ps {rn-sae}, %zmm8, %zmm15, %zmm9
-        vfnmadd213ps {rn-sae}, %zmm9, %zmm15, %zmm12
-        vsubps {rn-sae}, %zmm13, %zmm12, %zmm0
-        vfnmadd213ps {rn-sae}, %zmm15, %zmm14, %zmm0
-        testl %edx, %edx
-
-/* Go to special inputs processing branch */
-        jne L(SPECIAL_VALUES_BRANCH)
-        # LOE rbx r12 r13 r14 r15 edx zmm0 zmm11
-
-/* Restore registers
- * and exit the function
- */
+	/* Table lookup */
+	vmovups	Th_tbl_uisa+__svml_stan_data_internal(%rip), %zmm3
+	vmovups	_sPC3_uisa+__svml_stan_data_internal(%rip), %zmm0
+	vmulps	{rn-sae}, %zmm5, %zmm5, %zmm1
+	vpermt2ps Th_tbl_uisa+64+__svml_stan_data_internal(%rip), %zmm10, %zmm3
+	vmovups	_sPC5_uisa+__svml_stan_data_internal(%rip), %zmm10
+	vfmadd231ps {rn-sae}, %zmm1, %zmm10, %zmm0
+	vmulps	{rn-sae}, %zmm5, %zmm0, %zmm4
+	vfmadd213ps {rn-sae}, %zmm5, %zmm1, %zmm4
+
+	/*
+	 * Computer Denominator:
+	 * sDenominator - sDlow ~= 1-(sTh+sTl)*(sP+sPlow)
+	 */
+	vmovups	_sOne+__svml_stan_data_internal(%rip), %zmm5
+	vmulps	{rn-sae}, %zmm4, %zmm3, %zmm7
+
+	/*
+	 * Compute Numerator:
+	 * sNumerator + sNlow ~= sTh+sTl+sP+sPlow
+	 */
+	vaddps	{rn-sae}, %zmm3, %zmm4, %zmm8
+	vsubps	{rn-sae}, %zmm7, %zmm5, %zmm9
+	vsubps	{rn-sae}, %zmm3, %zmm8, %zmm2
+
+	/*
+	 * Now computes (sNumerator + sNlow)/(sDenominator - sDlow)
+	 * Choose NR iteration instead of hardware division
+	 */
+	vrcp14ps %zmm9, %zmm14
+	vsubps	{rn-sae}, %zmm5, %zmm9, %zmm6
+	vsubps	{rn-sae}, %zmm2, %zmm4, %zmm13
+	vmulps	{rn-sae}, %zmm8, %zmm14, %zmm15
+	vaddps	{rn-sae}, %zmm7, %zmm6, %zmm12
+
+	/* One NR iteration to refine sQuotient */
+	vfmsub213ps {rn-sae}, %zmm8, %zmm15, %zmm9
+	vfnmadd213ps {rn-sae}, %zmm9, %zmm15, %zmm12
+	vsubps	{rn-sae}, %zmm13, %zmm12, %zmm0
+	vfnmadd213ps {rn-sae}, %zmm15, %zmm14, %zmm0
+	testl	%edx, %edx
+
+	/* Go to special inputs processing branch */
+	jne	L(SPECIAL_VALUES_BRANCH)
+	# LOE rbx r12 r13 r14 r15 edx zmm0 zmm11
+
+	/* Restore registers
+	 * and exit the function
+	 */
 L(EXIT):
-        movq %rbp, %rsp
-        popq %rbp
-        cfi_def_cfa(7, 8)
-        cfi_restore(6)
-        ret
-        cfi_def_cfa(6, 16)
-        cfi_offset(6, -16)
-
-/* Branch to process
- * special inputs
- */
+	movq	%rbp, %rsp
+	popq	%rbp
+	cfi_def_cfa(7, 8)
+	cfi_restore(6)
+	ret
+	cfi_def_cfa(6, 16)
+	cfi_offset(6, -16)
+
+	/* Branch to process
+	 * special inputs
+	 */
 L(SPECIAL_VALUES_BRANCH):
-        vmovups %zmm11, 64(%rsp)
-        vmovups %zmm0, 128(%rsp)
-        # LOE rbx r12 r13 r14 r15 edx zmm0
-
-        xorl %eax, %eax
-        # LOE rbx r12 r13 r14 r15 eax edx
-
-        vzeroupper
-        movq %r12, 16(%rsp)
-        /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
-        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
-        movl %eax, %r12d
-        movq %r13, 8(%rsp)
-        /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
-        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
-        movl %edx, %r13d
-        movq %r14, (%rsp)
-        /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
-        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
-        # LOE rbx r15 r12d r13d
-
-/* Range mask
- * bits check
- */
+	vmovups	%zmm11, 64(%rsp)
+	vmovups	%zmm0, 128(%rsp)
+	# LOE rbx r12 r13 r14 r15 edx zmm0
+
+	xorl	%eax, %eax
+	# LOE rbx r12 r13 r14 r15 eax edx
+
+	vzeroupper
+	movq	%r12, 16(%rsp)
+	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
+	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
+	movl	%eax, %r12d
+	movq	%r13, 8(%rsp)
+	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
+	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
+	movl	%edx, %r13d
+	movq	%r14, (%rsp)
+	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
+	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
+	# LOE rbx r15 r12d r13d
+
+	/* Range mask
+	 * bits check
+	 */
 L(RANGEMASK_CHECK):
-        btl %r12d, %r13d
-
-/* Call scalar math function */
-        jc L(SCALAR_MATH_CALL)
-        # LOE rbx r15 r12d r13d
-
-/* Special inputs
- * processing loop
- */
+	btl	%r12d, %r13d
+
+	/* Call scalar math function */
+	jc	L(SCALAR_MATH_CALL)
+	# LOE rbx r15 r12d r13d
+
+	/* Special inputs
+	 * processing loop
+	 */
 L(SPECIAL_VALUES_LOOP):
-        incl %r12d
-        cmpl $16, %r12d
-
-/* Check bits in range mask */
-        jl L(RANGEMASK_CHECK)
-        # LOE rbx r15 r12d r13d
-
-        movq 16(%rsp), %r12
-        cfi_restore(12)
-        movq 8(%rsp), %r13
-        cfi_restore(13)
-        movq (%rsp), %r14
-        cfi_restore(14)
-        vmovups 128(%rsp), %zmm0
-
-/* Go to exit */
-        jmp L(EXIT)
-        /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
-        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
-        /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
-        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
-        /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
-        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
-        # LOE rbx r12 r13 r14 r15 zmm0
-
-/* Scalar math fucntion call
- * to process special input
- */
+	incl	%r12d
+	cmpl	$16, %r12d
+
+	/* Check bits in range mask */
+	jl	L(RANGEMASK_CHECK)
+	# LOE rbx r15 r12d r13d
+
+	movq	16(%rsp), %r12
+	cfi_restore(12)
+	movq	8(%rsp), %r13
+	cfi_restore(13)
+	movq	(%rsp), %r14
+	cfi_restore(14)
+	vmovups	128(%rsp), %zmm0
+
+	/* Go to exit */
+	jmp	L(EXIT)
+	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
+	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
+	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
+	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
+	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
+	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
+	# LOE rbx r12 r13 r14 r15 zmm0
+
+	/* Scalar math fucntion call
+	 * to process special input
+	 */
 L(SCALAR_MATH_CALL):
-        movl %r12d, %r14d
-        movss 64(%rsp,%r14,4), %xmm0
-        call tanf@PLT
-        # LOE rbx r14 r15 r12d r13d xmm0
-
-        movss %xmm0, 128(%rsp,%r14,4)
-
-/* Process special inputs in loop */
-        jmp L(SPECIAL_VALUES_LOOP)
-        cfi_restore(12)
-        cfi_restore(13)
-        cfi_restore(14)
-        # LOE rbx r15 r12d r13d
-
-/* Auxilary branch
- * for out of main path inputs
- */
+	movl	%r12d, %r14d
+	movss	64(%rsp, %r14, 4), %xmm0
+	call	tanf@PLT
+	# LOE rbx r14 r15 r12d r13d xmm0
+
+	movss	%xmm0, 128(%rsp, %r14, 4)
+
+	/* Process special inputs in loop */
+	jmp	L(SPECIAL_VALUES_LOOP)
+	cfi_restore(12)
+	cfi_restore(13)
+	cfi_restore(14)
+	# LOE rbx r15 r12d r13d
+
+	/* Auxilary branch
+	 * for out of main path inputs
+	 */
-L(AUX_BRANCH):
-        vmovups _sRangeVal+__svml_stan_data_internal(%rip), %zmm6
-
-/*
- * Get the (2^a / 2pi) mod 1 values from the table.
- * Because doesn't have I-type gather, we need a trivial cast
- */
-        lea __svml_stan_reduction_data_internal(%rip), %rax
-        vmovups %zmm5, (%rsp)
-        vandps %zmm0, %zmm6, %zmm14
-        vcmpps $0, {sae}, %zmm6, %zmm14, %k0
-
-/*
- * Break the P_xxx and m into 16-bit chunks ready for
- * the long multiplication via 16x16->32 multiplications
- */
-        vmovups .FLT_15(%rip), %zmm6
-        kxnorw %k0, %k0, %k1
-        kxnorw %k0, %k0, %k2
-        kxnorw %k0, %k0, %k3
-        kmovw %k0, %edx
-        vpandd .FLT_12(%rip), %zmm11, %zmm5
-        vpsrld $23, %zmm5, %zmm7
-        vpslld $1, %zmm7, %zmm8
-        vpaddd %zmm7, %zmm8, %zmm9
-        vpslld $2, %zmm9, %zmm4
-        vpxord %zmm3, %zmm3, %zmm3
-        vpxord %zmm15, %zmm15, %zmm15
-        vpxord %zmm2, %zmm2, %zmm2
-        vgatherdps (%rax,%zmm4), %zmm3{%k1}
-        vgatherdps 4(%rax,%zmm4), %zmm15{%k2}
-        vgatherdps 8(%rax,%zmm4), %zmm2{%k3}
-        vpsrld $16, %zmm3, %zmm5
-        vpsrld $16, %zmm2, %zmm13
-
-/*
- * Also get the significand as an integer
- * NB: adding in the integer bit is wrong for denorms!
- * To make this work for denorms we should do something slightly different
- */
-        vpandd .FLT_13(%rip), %zmm11, %zmm0
-        vpaddd .FLT_14(%rip), %zmm0, %zmm1
-        vpsrld $16, %zmm15, %zmm0
-        vpsrld $16, %zmm1, %zmm8
-        vpandd %zmm6, %zmm3, %zmm9
-        vpandd %zmm6, %zmm15, %zmm12
-        vpandd %zmm6, %zmm2, %zmm7
-        vpandd %zmm6, %zmm1, %zmm14
-
-/* Now do the big multiplication and carry propagation */
-        vpmulld %zmm9, %zmm8, %zmm4
-        vpmulld %zmm0, %zmm8, %zmm3
-        vpmulld %zmm12, %zmm8, %zmm2
-        vpmulld %zmm13, %zmm8, %zmm1
-        vpmulld %zmm7, %zmm8, %zmm8
-        vpmulld %zmm5, %zmm14, %zmm7
-        vpmulld %zmm9, %zmm14, %zmm5
-        vpmulld %zmm0, %zmm14, %zmm9
-        vpmulld %zmm12, %zmm14, %zmm0
-        vpmulld %zmm13, %zmm14, %zmm12
-        vpsrld $16, %zmm12, %zmm14
-        vpsrld $16, %zmm0, %zmm13
-        vpsrld $16, %zmm9, %zmm15
-        vpsrld $16, %zmm5, %zmm12
-        vpsrld $16, %zmm8, %zmm8
-        vpaddd %zmm14, %zmm1, %zmm1
-        vpaddd %zmm13, %zmm2, %zmm2
-        vpaddd %zmm15, %zmm3, %zmm15
-        vpaddd %zmm12, %zmm4, %zmm3
-        vpandd %zmm6, %zmm0, %zmm13
-        vpaddd %zmm1, %zmm13, %zmm4
-        vpaddd %zmm4, %zmm8, %zmm14
-        vpsrld $16, %zmm14, %zmm0
-        vpandd %zmm6, %zmm9, %zmm9
-        vpaddd %zmm2, %zmm9, %zmm1
-        vpaddd %zmm1, %zmm0, %zmm8
-
-/*
- * Now round at the 2^-8 bit position for reduction mod pi/2^7
- * instead of the original 2pi (but still with the same 2pi scaling).
- * Use a shifter of 2^15 + 2^14.
- * The N we get is our final version; it has an offset of
- * 2^8 because of the implicit integer bit, and anyway for negative
- * starting value it's a 2s complement thing. But we need to mask
- * off the exponent part anyway so it's fine.
- */
-        vmovups .FLT_18(%rip), %zmm1
-        vpandd %zmm6, %zmm7, %zmm7
-        vpaddd %zmm3, %zmm7, %zmm13
-        vpsrld $16, %zmm8, %zmm3
-        vpandd %zmm6, %zmm5, %zmm5
-        vpaddd %zmm15, %zmm5, %zmm2
-        vpaddd %zmm2, %zmm3, %zmm15
-        vpsrld $16, %zmm15, %zmm12
-        vpaddd %zmm13, %zmm12, %zmm5
-
-/* Assemble reduced argument from the pieces */
-        vpandd %zmm6, %zmm14, %zmm9
-        vpandd %zmm6, %zmm15, %zmm7
-        vpslld $16, %zmm5, %zmm6
-        vpslld $16, %zmm8, %zmm5
-        vpaddd %zmm7, %zmm6, %zmm4
-        vpaddd %zmm9, %zmm5, %zmm9
-        vpsrld $9, %zmm4, %zmm6
-
-/*
- * We want to incorporate the original sign now too.
- * Do it here for convenience in getting the right N value,
- * though we could wait right to the end if we were prepared
- * to modify the sign of N later too.
- * So get the appropriate sign mask now (or sooner).
- */
-        vpandd .FLT_16(%rip), %zmm11, %zmm0
-        vpandd .FLT_21(%rip), %zmm9, %zmm13
-        vpslld $5, %zmm13, %zmm14
-
-/*
- * Create floating-point high part, implicitly adding integer bit 1
- * Incorporate overall sign at this stage too.
- */
-        vpxord .FLT_17(%rip), %zmm0, %zmm8
-        vpord %zmm8, %zmm6, %zmm2
-        vaddps {rn-sae}, %zmm2, %zmm1, %zmm12
-        vsubps {rn-sae}, %zmm1, %zmm12, %zmm3
-        vsubps {rn-sae}, %zmm3, %zmm2, %zmm7
-
-/*
- * Create floating-point low and medium parts, respectively
- * lo_17, ... lo_0, 0, ..., 0
- * hi_8, ... hi_0, lo_31, ..., lo_18
- * then subtract off the implicitly added integer bits,
- * 2^-46 and 2^-23, respectively.
- * Put the original sign into all of them at this stage.
- */
-        vpxord .FLT_20(%rip), %zmm0, %zmm6
-        vpord %zmm6, %zmm14, %zmm15
-        vpandd .FLT_23(%rip), %zmm4, %zmm4
-        vsubps {rn-sae}, %zmm6, %zmm15, %zmm8
-        vandps .FLT_26(%rip), %zmm11, %zmm15
-        vpsrld $18, %zmm9, %zmm6
-
-/*
- * If the magnitude of the input is <= 2^-20, then
- * just pass through the input, since no reduction will be needed and
- * the main path will only work accurately if the reduced argument is
- * about >= 2^-40 (which it is for all large pi multiples)
- */
-        vmovups .FLT_27(%rip), %zmm14
-        vcmpps $26, {sae}, %zmm14, %zmm15, %k4
-        vcmpps $22, {sae}, %zmm14, %zmm15, %k5
-        vpxord .FLT_22(%rip), %zmm0, %zmm1
-        vpslld $14, %zmm4, %zmm0
-        vpord %zmm6, %zmm0, %zmm0
-        vpord %zmm1, %zmm0, %zmm4
-        vsubps {rn-sae}, %zmm1, %zmm4, %zmm2
-        vpternlogd $255, %zmm6, %zmm6, %zmm6
-
-/* Now add them up into 2 reasonably aligned pieces */
-        vaddps {rn-sae}, %zmm2, %zmm7, %zmm13
-        vsubps {rn-sae}, %zmm13, %zmm7, %zmm7
-        vaddps {rn-sae}, %zmm7, %zmm2, %zmm3
-
-/*
- * The output is _VRES_R (high) + _VRES_E (low), and the integer part is _VRES_IND
- * Set sRp2 = _VRES_R^2 and then resume the original code.
- */
-        vmovups .FLT_28(%rip), %zmm2
-        vaddps {rn-sae}, %zmm8, %zmm3, %zmm1
-        vmovups .FLT_25(%rip), %zmm8
-
-/* Grab our final N value as an integer, appropriately masked mod 2^8 */
-        vpandd .FLT_19(%rip), %zmm12, %zmm5
-
-/*
- * Now multiply those numbers all by 2 pi, reasonably accurately.
- * (RHi + RLo) * (pi_lead + pi_trail) ~=
- * RHi * pi_lead + (RHi * pi_trail + RLo * pi_lead)
- */
-        vmovups .FLT_24(%rip), %zmm12
-        vmulps {rn-sae}, %zmm12, %zmm13, %zmm0
-        vmovaps %zmm12, %zmm9
-        vfmsub213ps {rn-sae}, %zmm0, %zmm13, %zmm9
-        vfmadd213ps {rn-sae}, %zmm9, %zmm8, %zmm13
-        vmovaps %zmm6, %zmm8
-        vfmadd213ps {rn-sae}, %zmm13, %zmm12, %zmm1
-        vpandnd %zmm15, %zmm15, %zmm8{%k4}
-        vpandnd %zmm15, %zmm15, %zmm6{%k5}
-        vandps %zmm11, %zmm6, %zmm14
-        vandps %zmm0, %zmm8, %zmm15
-        vandps %zmm1, %zmm8, %zmm12
-        vorps %zmm15, %zmm14, %zmm6
-        vpsrld $31, %zmm6, %zmm3
-        vpsubd %zmm3, %zmm2, %zmm4
-        vpaddd %zmm4, %zmm5, %zmm7
-        vpsrld $2, %zmm7, %zmm13
-        vpslld $2, %zmm13, %zmm9
-
-/*
- *
- * End of large arguments path
- *
- * Merge results from main and large paths:
- */
-        vblendmps %zmm13, %zmm10, %zmm10{%k6}
-        vpsubd %zmm9, %zmm5, %zmm5
-        vmovups .FLT_29(%rip), %zmm9
-        vcvtdq2ps {rn-sae}, %zmm5, %zmm0
-        vmovups .FLT_30(%rip), %zmm5
-        vfmadd231ps {rn-sae}, %zmm0, %zmm5, %zmm12
-        vmovups (%rsp), %zmm5
-        vaddps {rn-sae}, %zmm6, %zmm12, %zmm6
-        vfmadd213ps {rn-sae}, %zmm6, %zmm9, %zmm0
-        vblendmps %zmm0, %zmm5, %zmm5{%k6}
-
-/* Return to main vector processing path */
-        jmp L(AUX_BRANCH_RETURN)
-        # LOE rbx r12 r13 r14 r15 edx zmm5 zmm10 zmm11
+L(AUX_BRANCH):
+	vmovups	_sRangeVal+__svml_stan_data_internal(%rip), %zmm6
+
+	/*
+	 * Get the (2^a / 2pi) mod 1 values from the table.
+	 * Because doesn't have I-type gather, we need a trivial cast
+	 */
+	lea	__svml_stan_reduction_data_internal(%rip), %rax
+	vmovups	%zmm5, (%rsp)
+	vandps	%zmm0, %zmm6, %zmm14
+	vcmpps	$0, {sae}, %zmm6, %zmm14, %k0
+
+	/*
+	 * Break the P_xxx and m into 16-bit chunks ready for
+	 * the long multiplication via 16x16->32 multiplications
+	 */
+	vmovups	.FLT_15(%rip), %zmm6
+	kxnorw	%k0, %k0, %k1
+	kxnorw	%k0, %k0, %k2
+	kxnorw	%k0, %k0, %k3
+	kmovw	%k0, %edx
+	vpandd	.FLT_12(%rip), %zmm11, %zmm5
+	vpsrld	$23, %zmm5, %zmm7
+	vpslld	$1, %zmm7, %zmm8
+	vpaddd	%zmm7, %zmm8, %zmm9
+	vpslld	$2, %zmm9, %zmm4
+	vpxord	%zmm3, %zmm3, %zmm3
+	vpxord	%zmm15, %zmm15, %zmm15
+	vpxord	%zmm2, %zmm2, %zmm2
+	vgatherdps (%rax, %zmm4), %zmm3{%k1}
+	vgatherdps 4(%rax, %zmm4), %zmm15{%k2}
+	vgatherdps 8(%rax, %zmm4), %zmm2{%k3}
+	vpsrld	$16, %zmm3, %zmm5
+	vpsrld	$16, %zmm2, %zmm13
+
+	/*
+	 * Also get the significand as an integer
+	 * NB: adding in the integer bit is wrong for denorms!
+	 * To make this work for denorms we should do something slightly different
+	 */
+	vpandd	.FLT_13(%rip), %zmm11, %zmm0
+	vpaddd	.FLT_14(%rip), %zmm0, %zmm1
+	vpsrld	$1

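For readers following the math in the diff above: the "One NR iteration to refine sQuotient" block computes the tangent as a quotient without a hardware divide, using vrcp14ps to get a roughly 14-bit reciprocal estimate of the denominator and a single Newton-Raphson step, carried out with FMAs, to refine the quotient. Below is a minimal scalar sketch of the same technique as a standalone routine; the nr_divf name and the register allocation are hypothetical (assumptions for illustration, not part of the patch), and it requires AVX-512F for vrcp14ss:

```
	/* float nr_divf (float n, float d) -- System V x86_64 ABI:
	   n arrives in %xmm0, d in %xmm1, result is returned in %xmm0.  */
	.text
	.globl	nr_divf
	.type	nr_divf, @function
nr_divf:
	vrcp14ss %xmm1, %xmm1, %xmm2	/* r ~= 1/d, about 14 correct bits.  */
	vmulss	%xmm2, %xmm0, %xmm3	/* q = n * r, initial quotient.  */
	vmovaps	%xmm1, %xmm4
	vfmsub213ss %xmm0, %xmm3, %xmm4	/* e = q*d - n, the residual,
					   computed exactly via FMA.  */
	vfnmadd231ss %xmm4, %xmm2, %xmm3 /* q = q - r*e: one NR step roughly
					    doubles the accurate bits.  */
	vmovaps	%xmm3, %xmm0
	ret
	.size	nr_divf, .-nr_divf
```

The vector code in the diff does essentially this with vrcp14ps, vfmsub213ps, and vfnmadd213ps on %zmm registers, with additional add/subtract steps to carry the low parts of the numerator and denominator through the refinement.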