sysdeps/aarch64/fpu/log10_advsimd.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132

/* Double-precision vector (AdvSIMD) log10 function

   Copyright (C) 2023-2025 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include "v_math.h"

static const struct data
{
  uint64x2_t off, sign_exp_mask, offset_lower_bound;
  uint32x4_t special_bound;
  double invln10, log10_2;
  double c1, c3;
  float64x2_t c0, c2, c4;
} data = {
  /* Computed from log coefficients divided by log(10) then rounded to double
     precision.  */
  .c0 = V2 (-0x1.bcb7b1526e506p-3),
  .c1 = 0x1.287a7636be1d1p-3,
  .c2 = V2 (-0x1.bcb7b158af938p-4),
  .c3 = 0x1.63c78734e6d07p-4,
  .c4 = V2 (-0x1.287461742fee4p-4),
  .invln10 = 0x1.bcb7b1526e50ep-2,
  .log10_2 = 0x1.34413509f79ffp-2,
  .off = V2 (0x3fe6900900000000),
  .sign_exp_mask = V2 (0xfff0000000000000),
  /* Lower bound is 0x0010000000000000. For
     optimised register use subnormals are detected after offset has been
     subtracted, so lower bound - offset (which wraps around).  */
  .offset_lower_bound = V2 (0x0010000000000000 - 0x3fe6900900000000),
  .special_bound = V4 (0x7fe00000), /* asuint64(inf) - 0x0010000000000000.  */
};

#define N (1 << V_LOG10_TABLE_BITS)
#define IndexMask (N - 1)

struct entry
{
  float64x2_t invc;
  float64x2_t log10c;
};

static inline struct entry
lookup (uint64x2_t i)
{
  struct entry e;
  uint64_t i0
      = (vgetq_lane_u64 (i, 0) >> (52 - V_LOG10_TABLE_BITS)) & IndexMask;
  uint64_t i1
      = (vgetq_lane_u64 (i, 1) >> (52 - V_LOG10_TABLE_BITS)) & IndexMask;
  float64x2_t e0 = vld1q_f64 (&__v_log10_data.table[i0].invc);
  float64x2_t e1 = vld1q_f64 (&__v_log10_data.table[i1].invc);
  e.invc = vuzp1q_f64 (e0, e1);
  e.log10c = vuzp2q_f64 (e0, e1);
  return e;
}

static float64x2_t VPCS_ATTR NOINLINE
special_case (float64x2_t hi, uint64x2_t u_off, float64x2_t y, float64x2_t r2,
	      uint32x2_t special, const struct data *d)
{
  float64x2_t x = vreinterpretq_f64_u64 (vaddq_u64 (u_off, d->off));
  return v_call_f64 (log10, x, vfmaq_f64 (hi, y, r2), vmovl_u32 (special));
}

/* Fast implementation of double-precision vector log10
   is a slight modification of double-precision vector log.
   Max ULP error: < 2.5 ulp (nearest rounding.)
   Maximum measured at 2.46 ulp for x in [0.96, 0.97]
   _ZGVnN2v_log10(0x1.13192407fcb46p+0) got 0x1.fff6be3cae4bbp-6
				       want 0x1.fff6be3cae4b9p-6.  */
float64x2_t VPCS_ATTR V_NAME_D1 (log10) (float64x2_t x)
{
  const struct data *d = ptr_barrier (&data);

  /* To avoid having to mov x out of the way, keep u after offset has been
     applied, and recover x by adding the offset back in the special-case
     handler.  */
  uint64x2_t u = vreinterpretq_u64_f64 (x);
  uint64x2_t u_off = vsubq_u64 (u, d->off);

  /* x = 2^k z; where z is in range [OFF,2*OFF) and exact.
     The range is split into N subintervals.
     The ith subinterval contains z and c is near its center.  */
  int64x2_t k = vshrq_n_s64 (vreinterpretq_s64_u64 (u_off), 52);
  uint64x2_t iz = vsubq_u64 (u, vandq_u64 (u_off, d->sign_exp_mask));
  float64x2_t z = vreinterpretq_f64_u64 (iz);

  struct entry e = lookup (u_off);

  uint32x2_t special = vcge_u32 (vsubhn_u64 (u_off, d->offset_lower_bound),
				 vget_low_u32 (d->special_bound));

  /* log10(x) = log1p(z/c-1)/log(10) + log10(c) + k*log10(2).  */
  float64x2_t r = vfmaq_f64 (v_f64 (-1.0), z, e.invc);
  float64x2_t kd = vcvtq_f64_s64 (k);

  /* hi = r / log(10) + log10(c) + k*log10(2).
     Constants in v_log10_data.c are computed (in extended precision) as
     e.log10c := e.logc * invln10.  */
  float64x2_t cte = vld1q_f64 (&d->invln10);
  float64x2_t hi = vfmaq_laneq_f64 (e.log10c, r, cte, 0);

  /* y = log10(1+r) + n * log10(2).  */
  hi = vfmaq_laneq_f64 (hi, kd, cte, 1);

  /* y = r2*(A0 + r*A1 + r2*(A2 + r*A3 + r2*A4)) + hi.  */
  float64x2_t r2 = vmulq_f64 (r, r);
  float64x2_t odd_coeffs = vld1q_f64 (&d->c1);
  float64x2_t y = vfmaq_laneq_f64 (d->c2, r, odd_coeffs, 1);
  float64x2_t p = vfmaq_laneq_f64 (d->c0, r, odd_coeffs, 0);
  y = vfmaq_f64 (y, d->c4, r2);
  y = vfmaq_f64 (p, y, r2);

  if (__glibc_unlikely (v_any_u32h (special)))
    return special_case (hi, u_off, y, r2, special, d);
  return vfmaq_f64 (hi, y, r2);
}