| author | Sunil K Pandey <skpgkp2@gmail.com> | 2022-03-07 10:47:12 -0800 |
|---|---|---|
| committer | Sunil K Pandey <skpgkp2@gmail.com> | 2022-03-07 21:14:10 -0800 |
| commit | c9102dec55e1d182ab617a0f0f2e1d39f3cd8b18 | |
| tree | e5d92002a17af8a5a8a8d4e6e1a247c2d8121ec3 | |
| parent | 27be95ecd73a9a65ae6c981fd2b00c28d0642819 | |
x86_64: Fix svml_d_erfc8_core_avx512.S code formatting
This commit contains the following formatting changes:
1. Instructions preceded by a tab.
2. Instructions less than 8 characters in length have a tab
between them and the first operand.
3. Instructions greater than 7 characters in length have a
space between them and the first operand.
4. Tabs between `#define`d names and their values.
5. 8 spaces at the beginning of a line replaced by a tab.
6. Indent comments with code.
7. Remove redundant .text section.
8. 1 space between line content and line comment.
9. Space after all commas.
Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>
| -rw-r--r-- | sysdeps/x86_64/fpu/multiarch/svml_d_erfc8_core_avx512.S | 7560 |
1 file changed, 3779 insertions, 3781 deletions
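To illustrate the rules above, here is a small before/after sketch on a hypothetical instruction sequence (composed for this page from the rules in the commit message, not excerpted from the patch; `<tab>` stands for a literal tab character):

Before (8-space indent, no comma spacing):

        vmovups   _One+__svml_derfc_data_internal(%rip), %zmm14
        vfmadd231pd  {rn-sae}, %zmm8, %zmm12, %zmm6
        cmpl      $8,%r12d

After (rules 1-5 and 9 applied):

<tab>vmovups<tab>_One+__svml_derfc_data_internal(%rip), %zmm14
<tab>vfmadd231pd {rn-sae}, %zmm8, %zmm12, %zmm6
<tab>cmpl<tab>$8, %r12d

`vmovups` (7 characters) gets a tab before its first operand (rule 2), `vfmadd231pd` (11 characters) gets a single space (rule 3), the 8-space indent becomes a tab (rules 1 and 5), and `$8,%r12d` gains a space after the comma (rule 9).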
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_erfc8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_erfc8_core_avx512.S
index bd2c3bef7d..77228814d3 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_erfc8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_erfc8_core_avx512.S
@@ -29,3832 +29,3830 @@
 /* Offsets for data table __svml_derfc_data_internal */
-#define _erfc_tbl 0
-#define _AbsMask 55808
-#define _MaxThreshold 55872
-#define _SgnMask 55936
-#define _One 56000
-#define _TwoM128 56064
-#define _SRound 56128
-#define _poly1_0 56192
-#define _poly1_1 56256
-#define _poly3_0 56320
-#define _poly3_1 56384
-#define _poly5_0 56448
-#define _poly5_1 56512
-#define _poly1_2 56576
-#define _poly3_2 56640
-#define _poly5_2 56704
-#define _poly1_3 56768
-#define _poly3_3 56832
-#define _poly5_3 56896
-#define _poly1_4 56960
-#define _poly3_4 57024
-#define _poly1_5 57088
-#define _poly3_5 57152
-#define _poly3_6 57216
-#define _poly1_6 57280
-#define _poly1_7 57344
-#define _UF_Threshold 57408
-#define _Mask32 57472
+#define _erfc_tbl	0
+#define _AbsMask	55808
+#define _MaxThreshold	55872
+#define _SgnMask	55936
+#define _One	56000
+#define _TwoM128	56064
+#define _SRound	56128
+#define _poly1_0	56192
+#define _poly1_1	56256
+#define _poly3_0	56320
+#define _poly3_1	56384
+#define _poly5_0	56448
+#define _poly5_1	56512
+#define _poly1_2	56576
+#define _poly3_2	56640
+#define _poly5_2	56704
+#define _poly1_3	56768
+#define _poly3_3	56832
+#define _poly5_3	56896
+#define _poly1_4	56960
+#define _poly3_4	57024
+#define _poly1_5	57088
+#define _poly3_5	57152
+#define _poly3_6	57216
+#define _poly1_6	57280
+#define _poly1_7	57344
+#define _UF_Threshold	57408
+#define _Mask32	57472
 #include <sysdep.h>
-        .text
-        .section .text.evex512,"ax",@progbits
+	.section .text.evex512, "ax", @progbits
 ENTRY(_ZGVeN8v_erfc_skx)
-        pushq %rbp
-        cfi_def_cfa_offset(16)
-        movq %rsp, %rbp
-        cfi_def_cfa(6, 16)
-        cfi_offset(6, -16)
-        andq $-64, %rsp
-        subq $192, %rsp
+	pushq	%rbp
+	cfi_def_cfa_offset(16)
+	movq	%rsp, %rbp
+	cfi_def_cfa(6, 16)
+	cfi_offset(6, -16)
+	andq	$-64, %rsp
+	subq	$192, %rsp
-/* vector gather: erfc_h(x0), (erfc_l(x0), 2/sqrt(pi)*exp(-x0^2)) */
-        lea __svml_derfc_data_internal(%rip), %rax
+	/* vector gather: erfc_h(x0), (erfc_l(x0), 2/sqrt(pi)*exp(-x0^2)) */
+	lea	__svml_derfc_data_internal(%rip), %rax
-/*
- * erfc(27.25) underflows to 0
- * can compute all results in the main path
- */
-        vmovups _MaxThreshold+__svml_derfc_data_internal(%rip), %zmm13
-        vmovups _SRound+__svml_derfc_data_internal(%rip), %zmm7
-        vmovups _One+__svml_derfc_data_internal(%rip), %zmm14
-        kxnorw %k0, %k0, %k2
-        kxnorw %k0, %k0, %k1
-        vmovaps %zmm0, %zmm11
-        vandpd _AbsMask+__svml_derfc_data_internal(%rip), %zmm11, %zmm12
-        vandpd _SgnMask+__svml_derfc_data_internal(%rip), %zmm11, %zmm3
-        vmovups _TwoM128+__svml_derfc_data_internal(%rip), %zmm0
-        vminpd {sae}, %zmm13, %zmm12, %zmm6
+	/*
+	 * erfc(27.25) underflows to 0
+	 * can compute all results in the main path
+	 */
+	vmovups	_MaxThreshold+__svml_derfc_data_internal(%rip), %zmm13
+	vmovups	_SRound+__svml_derfc_data_internal(%rip), %zmm7
+	vmovups	_One+__svml_derfc_data_internal(%rip), %zmm14
+	kxnorw	%k0, %k0, %k2
+	kxnorw	%k0, %k0, %k1
+	vmovaps	%zmm0, %zmm11
+	vandpd	_AbsMask+__svml_derfc_data_internal(%rip), %zmm11, %zmm12
+	vandpd	_SgnMask+__svml_derfc_data_internal(%rip), %zmm11, %zmm3
+	vmovups	_TwoM128+__svml_derfc_data_internal(%rip), %zmm0
+	vminpd	{sae}, %zmm13, %zmm12, %zmm6
-/* Start polynomial evaluation */
-        vmovups _poly1_0+__svml_derfc_data_internal(%rip), %zmm12
-        vmovups _poly3_0+__svml_derfc_data_internal(%rip), %zmm13
-        vaddpd {rn-sae}, %zmm7, %zmm6, %zmm1
-        vorpd %zmm3, %zmm14, %zmm15
-        vmaxpd {sae}, %zmm0, %zmm6, %zmm2
-        vmovups _poly1_1+__svml_derfc_data_internal(%rip), %zmm6
-        vpsllq $4, %zmm1, %zmm4
-        vsubpd {rn-sae}, %zmm7, %zmm1, %zmm5
+	/* Start polynomial evaluation */
+	vmovups	_poly1_0+__svml_derfc_data_internal(%rip), %zmm12
+	vmovups	_poly3_0+__svml_derfc_data_internal(%rip), %zmm13
+	vaddpd	{rn-sae}, %zmm7, %zmm6, %zmm1
+	vorpd	%zmm3, %zmm14, %zmm15
+	vmaxpd	{sae}, %zmm0, %zmm6, %zmm2
+	vmovups	_poly1_1+__svml_derfc_data_internal(%rip), %zmm6
+	vpsllq	$4, %zmm1, %zmm4
+	vsubpd	{rn-sae}, %zmm7, %zmm1, %zmm5
-/* 2.0 if x<0, 0.0 otherwise */
-        vsubpd {rn-sae}, %zmm15, %zmm14, %zmm10
+	/* 2.0 if x<0, 0.0 otherwise */
+	vsubpd	{rn-sae}, %zmm15, %zmm14, %zmm10
-/* 2^(-128) with sign of input */
-        vorpd %zmm3, %zmm0, %zmm7
-        vsubpd {rn-sae}, %zmm5, %zmm2, %zmm9
-        vmovups _poly1_2+__svml_derfc_data_internal(%rip), %zmm15
-        vmovups _poly5_1+__svml_derfc_data_internal(%rip), %zmm3
-        vmovups _poly5_0+__svml_derfc_data_internal(%rip), %zmm14
-        vmovups _poly1_3+__svml_derfc_data_internal(%rip), %zmm0
-        vmovups _poly5_2+__svml_derfc_data_internal(%rip), %zmm1
-        vmovups _poly3_2+__svml_derfc_data_internal(%rip), %zmm2
-        vmulpd {rn-sae}, %zmm9, %zmm5, %zmm8
-        vmovups _poly3_1+__svml_derfc_data_internal(%rip), %zmm5
-        vfmadd231pd {rn-sae}, %zmm8, %zmm12, %zmm6
-        vfmadd231pd {rn-sae}, %zmm8, %zmm14, %zmm3
-        vfmadd231pd {rn-sae}, %zmm8, %zmm13, %zmm5
-        vmovups _poly3_3+__svml_derfc_data_internal(%rip), %zmm12
-        vmovups _poly5_3+__svml_derfc_data_internal(%rip), %zmm13
-        vfmadd213pd {rn-sae}, %zmm15, %zmm8, %zmm6
-        vfmadd213pd {rn-sae}, %zmm1, %zmm8, %zmm3
-        vfmadd213pd {rn-sae}, %zmm2, %zmm8, %zmm5
-        vmovups _poly3_5+__svml_derfc_data_internal(%rip), %zmm14
+	/* 2^(-128) with sign of input */
+	vorpd	%zmm3, %zmm0, %zmm7
+	vsubpd	{rn-sae}, %zmm5, %zmm2, %zmm9
+	vmovups	_poly1_2+__svml_derfc_data_internal(%rip), %zmm15
+	vmovups	_poly5_1+__svml_derfc_data_internal(%rip), %zmm3
+	vmovups	_poly5_0+__svml_derfc_data_internal(%rip), %zmm14
+	vmovups	_poly1_3+__svml_derfc_data_internal(%rip), %zmm0
+	vmovups	_poly5_2+__svml_derfc_data_internal(%rip), %zmm1
+	vmovups	_poly3_2+__svml_derfc_data_internal(%rip), %zmm2
+	vmulpd	{rn-sae}, %zmm9, %zmm5, %zmm8
+	vmovups	_poly3_1+__svml_derfc_data_internal(%rip), %zmm5
+	vfmadd231pd {rn-sae}, %zmm8, %zmm12, %zmm6
+	vfmadd231pd {rn-sae}, %zmm8, %zmm14, %zmm3
+	vfmadd231pd {rn-sae}, %zmm8, %zmm13, %zmm5
+	vmovups	_poly3_3+__svml_derfc_data_internal(%rip), %zmm12
+	vmovups	_poly5_3+__svml_derfc_data_internal(%rip), %zmm13
+	vfmadd213pd {rn-sae}, %zmm15, %zmm8, %zmm6
+	vfmadd213pd {rn-sae}, %zmm1, %zmm8, %zmm3
+	vfmadd213pd {rn-sae}, %zmm2, %zmm8, %zmm5
+	vmovups	_poly3_5+__svml_derfc_data_internal(%rip), %zmm14
-/* P5 = P5 + D2*P07 */
-        vmovups _poly3_6+__svml_derfc_data_internal(%rip), %zmm15
-        vfmadd213pd {rn-sae}, %zmm0, %zmm8, %zmm6
-        vfmadd213pd {rn-sae}, %zmm13, %zmm8, %zmm3
-        vfmadd213pd {rn-sae}, %zmm12, %zmm8, %zmm5
-        vmovups _poly3_4+__svml_derfc_data_internal(%rip), %zmm12
-        vmovups _poly1_5+__svml_derfc_data_internal(%rip), %zmm13
-        vfmadd213pd {rn-sae}, %zmm12, %zmm8, %zmm5
-        vfmadd213pd {rn-sae}, %zmm14, %zmm8, %zmm5
-        vpandq _Mask32+__svml_derfc_data_internal(%rip), %zmm4, %zmm4
-        vpmovqd %zmm4, %ymm0
-        vmovups _poly1_4+__svml_derfc_data_internal(%rip), %zmm4
-        vfmadd213pd {rn-sae}, %zmm4, %zmm8, %zmm6
+	/* P5 = P5 + D2*P07 */
+	vmovups	_poly3_6+__svml_derfc_data_internal(%rip), %zmm15
+	vfmadd213pd {rn-sae}, %zmm0, %zmm8, %zmm6
+	vfmadd213pd {rn-sae}, %zmm13, %zmm8, %zmm3
+	vfmadd213pd {rn-sae}, %zmm12, %zmm8, %zmm5
+	vmovups	_poly3_4+__svml_derfc_data_internal(%rip), %zmm12
+	vmovups	_poly1_5+__svml_derfc_data_internal(%rip), %zmm13
+	vfmadd213pd {rn-sae}, %zmm12, %zmm8, %zmm5
+	vfmadd213pd {rn-sae}, %zmm14, %zmm8, %zmm5
+	vpandq	_Mask32+__svml_derfc_data_internal(%rip), %zmm4, %zmm4
+	vpmovqd	%zmm4, %ymm0
+	vmovups	_poly1_4+__svml_derfc_data_internal(%rip), %zmm4
+	vfmadd213pd {rn-sae}, %zmm4, %zmm8, %zmm6
-/* T^2 */
-        vmulpd {rn-sae}, %zmm8, %zmm8, %zmm4
-        vfmadd213pd {rn-sae}, %zmm13, %zmm8, %zmm6
-        vpxord %zmm1, %zmm1, %zmm1
-        vgatherdpd 8(%rax,%ymm0), %zmm1{%k2}
-        vpxord %zmm2, %zmm2, %zmm2
-        vgatherdpd (%rax,%ymm0), %zmm2{%k1}
+	/* T^2 */
+	vmulpd	{rn-sae}, %zmm8, %zmm8, %zmm4
+	vfmadd213pd {rn-sae}, %zmm13, %zmm8, %zmm6
+	vpxord	%zmm1, %zmm1, %zmm1
+	vgatherdpd 8(%rax, %ymm0), %zmm1{%k2}
+	vpxord	%zmm2, %zmm2, %zmm2
+	vgatherdpd (%rax, %ymm0), %zmm2{%k1}
-/* Diff^2 */
-        vmulpd {rn-sae}, %zmm9, %zmm9, %zmm0
-        vfmadd231pd {rn-sae}, %zmm0, %zmm3, %zmm15
-        vmovups _poly1_6+__svml_derfc_data_internal(%rip), %zmm3
-        vfmadd213pd {rn-sae}, %zmm15, %zmm8, %zmm5
-        vfmadd213pd {rn-sae}, %zmm3, %zmm8, %zmm6
-        vmovups _poly1_7+__svml_derfc_data_internal(%rip), %zmm3
-        vfmsub213pd {rn-sae}, %zmm8, %zmm0, %zmm5
-        vfmadd213pd {rn-sae}, %zmm3, %zmm8, %zmm6
+	/* Diff^2 */
+	vmulpd	{rn-sae}, %zmm9, %zmm9, %zmm0
+	vfmadd231pd {rn-sae}, %zmm0, %zmm3, %zmm15
+	vmovups	_poly1_6+__svml_derfc_data_internal(%rip), %zmm3
+	vfmadd213pd {rn-sae}, %zmm15, %zmm8, %zmm5
+	vfmadd213pd {rn-sae}, %zmm3, %zmm8, %zmm6
+	vmovups	_poly1_7+__svml_derfc_data_internal(%rip), %zmm3
+	vfmsub213pd {rn-sae}, %zmm8, %zmm0, %zmm5
+	vfmadd213pd {rn-sae}, %zmm3, %zmm8, %zmm6
-/* EXP_X0H *= Diff */
-        vmulpd {rn-sae}, %zmm9, %zmm1, %zmm8
+	/* EXP_X0H *= Diff */
+	vmulpd	{rn-sae}, %zmm9, %zmm1, %zmm8
-/* Special arguments (for flags only) */
-        vmovups _UF_Threshold+__svml_derfc_data_internal(%rip), %zmm9
-        vfmadd213pd {rn-sae}, %zmm5, %zmm4, %zmm6
-        vcmppd $21, {sae}, %zmm9, %zmm11, %k0
+	/* Special arguments (for flags only) */
+	vmovups	_UF_Threshold+__svml_derfc_data_internal(%rip), %zmm9
+	vfmadd213pd {rn-sae}, %zmm5, %zmm4, %zmm6
+	vcmppd	$21, {sae}, %zmm9, %zmm11, %k0
-/* EXP_x0H*Diff*(1+P1) */
-        vfmadd213pd {rn-sae}, %zmm8, %zmm8, %zmm6
-        kmovw %k0, %edx
+	/* EXP_x0H*Diff*(1+P1) */
+	vfmadd213pd {rn-sae}, %zmm8, %zmm8, %zmm6
+	kmovw	%k0, %edx
-/* erfc(|_VARG1|) = erfc_h(x0) - P1 */
-        vsubpd {rn-sae}, %zmm6, %zmm2, %zmm0
-        vfmadd213pd {rn-sae}, %zmm10, %zmm7, %zmm0
-        testl %edx, %edx
+	/* erfc(|_VARG1|) = erfc_h(x0) - P1 */
+	vsubpd	{rn-sae}, %zmm6, %zmm2, %zmm0
+	vfmadd213pd {rn-sae}, %zmm10, %zmm7, %zmm0
+	testl	%edx, %edx
-/* Go to special inputs processing branch */
-        jne L(SPECIAL_VALUES_BRANCH)
-        # LOE rbx r12 r13 r14 r15 edx zmm0 zmm11
+	/* Go to special inputs processing branch */
+	jne	L(SPECIAL_VALUES_BRANCH)
+	# LOE rbx r12 r13 r14 r15 edx zmm0 zmm11
-/* Restore registers
- * and exit the function
- */
+	/* Restore registers
+	 * and exit the function
+	 */
 L(EXIT):
-        movq %rbp, %rsp
-        popq %rbp
-        cfi_def_cfa(7, 8)
-        cfi_restore(6)
-        ret
-        cfi_def_cfa(6, 16)
-        cfi_offset(6, -16)
+	movq	%rbp, %rsp
+	popq	%rbp
+	cfi_def_cfa(7, 8)
+	cfi_restore(6)
+	ret
+	cfi_def_cfa(6, 16)
+	cfi_offset(6, -16)
-/* Branch to process
- * special inputs
- */
+	/* Branch to process
+	 * special inputs
+	 */
 L(SPECIAL_VALUES_BRANCH):
-        vmovups %zmm11, 64(%rsp)
-        vmovups %zmm0, 128(%rsp)
-        # LOE rbx r12 r13 r14 r15 edx zmm0
+	vmovups	%zmm11, 64(%rsp)
+	vmovups	%zmm0, 128(%rsp)
+	# LOE rbx r12 r13 r14 r15 edx zmm0
-        xorl %eax, %eax
-        # LOE rbx r12 r13 r14 r15 eax edx
+	xorl	%eax, %eax
+	# LOE rbx r12 r13 r14 r15 eax edx
-        vzeroupper
-        movq %r12, 16(%rsp)
-        /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
-        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
-        movl %eax, %r12d
-        movq %r13, 8(%rsp)
-        /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
-        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
-        movl %edx, %r13d
-        movq %r14, (%rsp)
-        /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
-        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
-        # LOE rbx r15 r12d r13d
+	vzeroupper
+	movq	%r12, 16(%rsp)
+	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
+	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
+	movl	%eax, %r12d
+	movq	%r13, 8(%rsp)
+	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
+	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
+	movl	%edx, %r13d
+	movq	%r14, (%rsp)
+	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
+	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
+	# LOE rbx r15 r12d r13d
-/* Range mask
- * bits check
- */
+	/* Range mask
+	 * bits check
+	 */
 L(RANGEMASK_CHECK):
-        btl %r12d, %r13d
+	btl	%r12d, %r13d
-/* Call scalar math function */
-        jc L(SCALAR_MATH_CALL)
-        # LOE rbx r15 r12d r13d
+	/* Call scalar math function */
+	jc	L(SCALAR_MATH_CALL)
+	# LOE rbx r15 r12d r13d
-/* Special inputs
- * processing loop
- */
+	/* Special inputs
+	 * processing loop
+	 */
 L(SPECIAL_VALUES_LOOP):
-        incl %r12d
-        cmpl $8, %r12d
+	incl	%r12d
+	cmpl	$8, %r12d
-/* Check bits in range mask */
-        jl L(RANGEMASK_CHECK)
-        # LOE rbx r15 r12d r13d
+	/* Check bits in range mask */
+	jl	L(RANGEMASK_CHECK)
+	# LOE rbx r15 r12d r13d
-        movq 16(%rsp), %r12
-        cfi_restore(12)
-        movq 8(%rsp), %r13
-        cfi_restore(13)
-        movq (%rsp), %r14
-        cfi_restore(14)
-        vmovups 128(%rsp), %zmm0
+	movq	16(%rsp), %r12
+	cfi_restore(12)
+	movq	8(%rsp), %r13
+	cfi_restore(13)
+	movq	(%rsp), %r14
+	cfi_restore(14)
+	vmovups	128(%rsp), %zmm0
-/* Go to exit */
-        jmp L(EXIT)
-        /* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
-        .cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
-        /* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
-        .cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
-        /* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
-        .cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
-        # LOE rbx r12 r13 r14 r15 zmm0
+	/* Go to exit */
+	jmp	L(EXIT)
+	/* DW_CFA_expression: r12 (r12) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -176; DW_OP_plus) */
+	.cfi_escape 0x10, 0x0c, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x50, 0xff, 0xff, 0xff, 0x22
+	/* DW_CFA_expression: r13 (r13) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -184; DW_OP_plus) */
+	.cfi_escape 0x10, 0x0d, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x48, 0xff, 0xff, 0xff, 0x22
+	/* DW_CFA_expression: r14 (r14) (DW_OP_lit8; DW_OP_minus; DW_OP_const4s: -64; DW_OP_and; DW_OP_const4s: -192; DW_OP_plus) */
+	.cfi_escape 0x10, 0x0e, 0x0e, 0x38, 0x1c, 0x0d, 0xc0, 0xff, 0xff, 0xff, 0x1a, 0x0d, 0x40, 0xff, 0xff, 0xff, 0x22
+	# LOE rbx r12 r13 r14 r15 zmm0
-/* Scalar math fucntion call
- * to process special input
- */
+	/* Scalar math fucntion call
+	 * to process special input
+	 */
 L(SCALAR_MATH_CALL):
-        movl %r12d, %r14d
-        movsd 64(%rsp,%r14,8), %xmm0
-        call erfc@PLT
-        # LOE rbx r14 r15 r12d r13d xmm0
+	movl	%r12d, %r14d
+	movsd	64(%rsp, %r14, 8), %xmm0
+	call	erfc@PLT
+	# LOE rbx r14 r15 r12d r13d xmm0
-        movsd %xmm0, 128(%rsp,%r14,8)
+	movsd	%xmm0, 128(%rsp, %r14, 8)
-/* Process special inputs in loop */
-        jmp L(SPECIAL_VALUES_LOOP)
-        # LOE rbx r15 r12d r13d
+	/* Process special inputs in loop */
+	jmp	L(SPECIAL_VALUES_LOOP)
+	# LOE rbx r15 r12d r13d
 END(_ZGVeN8v_erfc_skx)
-        .section .rodata, "a"
-        .align 64
+	.section .rodata, "a"
+	.align 64
 #ifdef __svml_derfc_data_internal_typedef
 typedef unsigned int VUINT32;
-typedef struct
-{
-        __declspec(align(64)) VUINT32 _erfc_tbl[3488*2][2];
-        __declspec(align(64)) VUINT32 _AbsMask[8][2];
-        __declspec(align(64)) VUINT32 _MaxThreshold[8][2];
-        __declspec(align(64)) VUINT32 _SgnMask[8][2];
-        __declspec(align(64)) VUINT32 _One[8][2];
-        __declspec(align(64)) VUINT32 _TwoM128[8][2];
-        __declspec(align(64)) VUINT32 _SRound[8][2];
-        __declspec(align(64)) VUINT32 _poly1_0[8][2];
-        __declspec(align(64)) VUINT32 _poly1_1[8][2];
-        __declspec(align(64)) VUINT32 _poly3_0[8][2];
-        __declspec(align(64)) VUINT32 _poly3_1[8][2];
-        __declspec(align(64)) VUINT32 _poly5_0[8][2];
-        __declspec(align(64)) VUINT32 _poly5_1[8][2];
-        __declspec(align(64)) VUINT32 _poly1_2[8][2];
-        __declspec(align(64)) VUINT32 _poly3_2[8][2];
-        __declspec(align(64)) VUINT32 _poly5_2[8][2];
-        __declspec(align(64)) VUINT32 _poly1_3[8][2];
-        __declspec(align(64)) VUINT32 _poly3_3[8][2];
-        __declspec(align(64)) VUINT32 _poly5_3[8][2];
-        __declspec(align(64)) VUINT32 _poly1_4[8][2];
-        __declspec(align(64)) VUINT32 _poly3_4[8][2];
-        __declspec(align(64)) VUINT32 _poly1_5[8][2];
-        __declspec(align(64)) VUINT32 _poly3_5[8][2];
-        __declspec(align(64)) VUINT32 _poly3_6[8][2];
-        __declspec(align(64)) VUINT32 _poly1_6[8][2];
-        __declspec(align(64)) VUINT32 _poly1_7[8][2];
-        __declspec(align(64)) VUINT32 _UF_Threshold[8][2];
-        __declspec(align(64)) VUINT32 _Mask32[8][2];
+typedef struct {
+	__declspec(align(64)) VUINT32 _erfc_tbl[3488*2][2];
+	__declspec(align(64)) VUINT32 _AbsMask[8][2];
+	__declspec(align(64)) VUINT32 _MaxThreshold[8][2];
+	__declspec(align(64)) VUINT32 _SgnMask[8][2];
+	__declspec(align(64)) VUINT32 _One[8][2];
+	__declspec(align(64)) VUINT32 _TwoM128[8][2];
+	__declspec(align(64)) VUINT32 _SRound[8][2];
+	__declspec(align(64)) VUINT32 _poly1_0[8][2];
+	__declspec(align(64)) VUINT32 _poly1_1[8][2];
+	__declspec(align(64)) VUINT32 _poly3_0[8][2];
+	__declspec(align(64)) VUINT32 _poly3_1[8][2];
+	__declspec(align(64)) VUINT32 _poly5_0[8][2];
+	__declspec(align(64)) VUINT32 _poly5_1[8][2];
+	__declspec(align(64)) VUINT32 _poly1_2[8][2];
+	__declspec(align(64)) VUINT32 _poly3_2[8][2];
+	__declspec(align(64)) VUINT32 _poly5_2[8][2];
+	__declspec(align(64)) VUINT32 _poly1_3[8][2];
+	__declspec(align(64)) VUINT32 _poly3_3[8][2];
+	__declspec(align(64)) VUINT32 _poly5_3[8][2];
+	__declspec(align(64)) VUINT32 _poly1_4[8][2];
+	__declspec(align(64)) VUINT32 _poly3_4[8][2];
+	__declspec(align(64)) VUINT32 _poly1_5[8][2];
+	__declspec(align(64)) VUINT32 _poly3_5[8][2];
+	__declspec(align(64)) VUINT32 _poly3_6[8][2];
+	__declspec(align(64)) VUINT32 _poly1_6[8][2];
+	__declspec(align(64)) VUINT32 _poly1_7[8][2];
+	__declspec(align(64)) VUINT32 _UF_Threshold[8][2];
+	__declspec(align(64)) VUINT32 _Mask32[8][2];
 } __svml_derfc_data_internal;
 #endif
 __svml_derfc_data_internal:
-        /*== _erfc_tbl ==*/
-        .quad 0x47f0000000000000, 0x47f20dd750429b6d
-        .quad 0x47efb7c9030853b3, 0x47f20d8f1975c85d
-        .quad 0x47ef6f9447be0743, 0x47f20cb67bd452c7
-        .quad 0x47ef27640f9853d9, 0x47f20b4d8bac36c1
-        .quad 0x47eedf3a9ba22dad, 0x47f209546ad13ccf
-        .quad 0x47ee971a2c4436ae, 0x47f206cb4897b148
-        .quad 0x47ee4f05010eca8c, 0x47f203b261cd0053
-        .quad 0x47ee06fd58842c7e, 0x47f2000a00ae3804
-        .quad 0x47edbf056fe2df35, 0x47f1fbd27cdc72d3
-        .quad 0x47ed771f82f02f4e, 0x47f1f70c3b4f2cc8
-        .quad 0x47ed2f4dcbc2f894, 0x47f1f1b7ae44867f
-        .quad 0x47ece792828eae5c, 0x47f1ebd5552f795b
-        .quad 0x47ec9fefdd6eaf19, 0x47f1e565bca400d4
-        .quad 0x47ec58681031eb6a, 0x47f1de697e413d29
-        .quad 0x47ec10fd4c26e896, 0x47f1d6e14099944a
-        .quad 0x47ebc9b1bfe82687, 0x47f1cecdb718d61c
-        .quad 0x47eb82879728f11e, 0x47f1c62fa1e869b6
-        .quad 0x47eb3b80fa82a4bb, 0x47f1bd07cdd189ac
-        .quad 0x47eaf4a00f426daa, 0x47f1b357141d95d5
-        .quad 0x47eaade6f7378a0e, 0x47f1a91e5a748165
-        .quad 0x47ea6757d08215d8, 0x47f19e5e92b964ab
-        .quad 0x47ea20f4b5626818, 0x47f19318bae53a04
-        .quad 0x47e9dabfbc090901, 0x47f1874ddcdfce24
-        .quad 0x47e994baf66747ad, 0x47f17aff0e56ec10
-        .quad 0x47e94ee8720076b6, 0x47f16e2d7093cd8c
-        .quad 0x47e9094a37bbd66e, 0x47f160da304ed92f
-        .quad 0x47e8c3e24bb73372, 0x47f153068581b781
-        .quad 0x47e87eb2ad1a4032, 0x47f144b3b337c90c
-        .quad 0x47e839bd55eaafc8, 0x47f135e3075d076b
-        .quad 0x47e7f5043ae11862, 0x47f12695da8b5bde
-        .quad 0x47e7b0894b3ea35c, 0x47f116cd8fd67618
-        .quad 0x47e76c4e70a390e7, 0x47f1068b94962e5e
-        .quad 0x47e728558ee694fc, 0x47f0f5d1602f7e41
-        .quad 0x47e6e4a083ed132f, 0x47f0e4a073dc1b91
-        .quad 0x47e6a13127843ec1, 0x47f0d2fa5a70c168
-        .quad 0x47e65e094b3b2413, 0x47f0c0e0a8223359
-        .quad 0x47e61b2aba3da093, 0x47f0ae54fa490723
-        .quad 0x47e5d89739304dcf, 0x47f09b58f724416b
-        .quad 0x47e59650860d6469, 0x47f087ee4d9ad247
-        .quad 0x47e5545858029b39, 0x47f07416b4fbfe7c
-        .quad 0x47e512b05f5006e1, 0x47f05fd3ecbec298
-        .quad 0x47e4d15a4527fdc7, 0x47f04b27bc403d30
-        .quad 0x47e49057ab900447, 0x47f03613f2812daf
-        .quad 0x47e44faa2d42c4a0, 0x47f0209a65e29545
-        .quad 0x47e40f535d93160e, 0x47f00abcf3e187a9
-        .quad 0x47e3cf54c8501620, 0x47efe8fb01a47307
-        .quad 0x47e38faff1aa574a, 0x47efbbbbef34b4b2
-        .quad 0x47e35066561a275d, 0x47ef8dc092d58ff8
-        .quad 0x47e311796a46f064, 0x47ef5f0cdaf15313
-        .quad 0x47e2d2ea9aefb636, 0x47ef2fa4c16c0019
-        .quad 0x47e294bb4cd4b2bd, 0x47eeff8c4b1375db
-        .quad 0x47e256ecdca212cc, 0x47eecec7870ebca8
-        .quad 0x47e219809edbd524, 0x47ee9d5a8e4c934e
-        .quad 0x47e1dc77dfcacd02, 0x47ee6b4982f158b9
-        .quad 0x47e19fd3e36ac96a, 0x47ee38988fc46e72
-        .quad 0x47e16395e559e218, 0x47ee054be79d3042
-        .quad 0x47e127bf18c8eadc, 0x47edd167c4cf9d2a
-        .quad 0x47e0ec50a86d0dd4, 0x47ed9cf06898cdaf
-        .quad 0x47e0b14bb6728cd8, 0x47ed67ea1a8b5368
-        .quad 0x47e076b15c70aa28, 0x47ed325927fb9d89
-        .quad 0x47e03c82ab5eb831, 0x47ecfc41e36c7df9
-        .quad 0x47e002c0ab8a5018, 0x47ecc5a8a3fbea40
-        .quad 0x47df92d8b91d5cc7, 0x47ec8e91c4d01368
-        .quad 0x47df210d6a9a6a31, 0x47ec5701a484ef9d
-        .quad 0x47deb02147ce245c, 0x47ec1efca49a5011
-        .quad 0x47de40161b701275, 0x47ebe68728e29d5e
-        .quad 0x47ddd0ed9ea4bdd6, 0x47ebada596f25436
-        .quad 0x47dd62a978f7c957, 0x47eb745c55905bf8
-        .quad 0x47dcf54b4058455f, 0x47eb3aafcc27502e
-        .quad 0x47dc88d479173cce, 0x47eb00a46237d5be
-        .quad 0x47dc1d4695e87644, 0x47eac63e7ecc1411
-        .quad 0x47dbb2a2f7e56520, 0x47ea8b8287ec6a09
-        .quad 0x47db48eaee924501, 0x47ea5074e2157620
-        .quad 0x47dae01fb7e55a66, 0x47ea1519efaf889e
-        .quad 0x47da78428050527e, 0x47e9d97610879642
-        .quad 0x47da115462cbbc17, 0x47e99d8da149c13f
-        .quad 0x47d9ab5668e4930a, 0x47e96164fafd8de3
-        .quad 0x47d946498acbd766, 0x47e925007283d7aa
-        .quad 0x47d8e22eaf68291e, 0x47e8e86458169af8
