1 | /* Compute sine and cosine of argument optimized with vector. |
2 | Copyright (C) 2017 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <http://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <errno.h> |
20 | #include <math.h> |
21 | #include <math_private.h> |
22 | #include <x86intrin.h> |
23 | #include <libm-alias-float.h> |
24 | #include "s_sincosf.h" |
25 | |
26 | #define SINCOSF __sincosf_fma |
27 | |
28 | #ifndef SINCOSF |
29 | # define SINCOSF_FUNC __sincosf |
30 | #else |
31 | # define SINCOSF_FUNC SINCOSF |
32 | #endif |
33 | |
34 | /* Chebyshev constants for sin and cos, range -PI/4 - PI/4. */ |
35 | static const __v2df V0 = { -0x1.5555555551cd9p-3, -0x1.ffffffffe98aep-2}; |
36 | static const __v2df V1 = { 0x1.1111110c2688bp-7, 0x1.55555545c50c7p-5 }; |
37 | static const __v2df V2 = { -0x1.a019f8b4bd1f9p-13, -0x1.6c16b348b6874p-10 }; |
38 | static const __v2df V3 = { 0x1.71d7264e6b5b4p-19, 0x1.a00eb9ac43ccp-16 }; |
39 | static const __v2df V4 = { -0x1.a947e1674b58ap-26, -0x1.23c97dd8844d7p-22 }; |
40 | |
41 | /* Chebyshev constants for sin and cos, range 2^-27 - 2^-5. */ |
42 | static const __v2df VC0 = { -0x1.555555543d49dp-3, -0x1.fffffff5cc6fdp-2 }; |
43 | static const __v2df VC1 = { 0x1.110f475cec8c5p-7, 0x1.55514b178dac5p-5 }; |
44 | |
45 | static const __v2df v2ones = { 1.0, 1.0 }; |
46 | |
47 | /* Compute the sine and cosine values using Chebyshev polynomials where |
48 | THETA is the range reduced absolute value of the input |
49 | and it is less than Pi/4, |
50 | N is calculated as trunc(|x|/(Pi/4)) + 1 and it is used to decide |
51 | whether a sine or cosine approximation is more accurate and |
52 | SIGNBIT is used to add the correct sign after the Chebyshev |
53 | polynomial is computed. */ |
54 | static void |
55 | reduced_sincos (const double theta, const unsigned int n, |
56 | const unsigned int signbit, float *sinx, float *cosx) |
57 | { |
58 | __v2df v2x, v2sx, v2cx; |
59 | const __v2df v2theta = { theta, theta }; |
60 | const __v2df v2theta2 = v2theta * v2theta; |
61 | /* Here sinf() and cosf() are calculated using sin Chebyshev polynomial: |
62 | x+x^3*(S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4)))). */ |
63 | v2x = V3 + v2theta2 * V4; /* S3+x^2*S4. */ |
64 | v2x = V2 + v2theta2 * v2x; /* S2+x^2*(S3+x^2*S4). */ |
65 | v2x = V1 + v2theta2 * v2x; /* S1+x^2*(S2+x^2*(S3+x^2*S4)). */ |
66 | v2x = V0 + v2theta2 * v2x; /* S0+x^2*(S1+x^2*(S2+x^2*(S3+x^2*S4))). */ |
67 | v2x = v2theta2 * v2x; |
68 | v2cx = v2ones + v2x; |
69 | v2sx = v2theta + v2theta * v2x; |
70 | /* We are operating on |x|, so we need to add back the original |
71 | signbit for sinf. */ |
72 | /* Determine positive or negative primary interval. */ |
73 | /* Are we in the primary interval of sin or cos? */ |
74 | if ((n & 2) == 0) |
75 | { |
76 | const __v2df v2sign = |
77 | { |
78 | ones[((n >> 2) & 1) ^ signbit], |
79 | ones[((n + 2) >> 2) & 1] |
80 | }; |
81 | v2cx[0] = v2sx[0]; |
82 | v2cx *= v2sign; |
83 | __v4sf v4sx = _mm_cvtpd_ps (v2cx); |
84 | *sinx = v4sx[0]; |
85 | *cosx = v4sx[1]; |
86 | } |
87 | else |
88 | { |
89 | const __v2df v2sign = |
90 | { |
91 | ones[((n + 2) >> 2) & 1], |
92 | ones[((n >> 2) & 1) ^ signbit] |
93 | }; |
94 | v2cx[0] = v2sx[0]; |
95 | v2cx *= v2sign; |
96 | __v4sf v4sx = _mm_cvtpd_ps (v2cx); |
97 | *sinx = v4sx[1]; |
98 | *cosx = v4sx[0]; |
99 | } |
100 | } |
101 | |
102 | void |
103 | SINCOSF_FUNC (float x, float *sinx, float *cosx) |
104 | { |
105 | double theta = x; |
106 | double abstheta = fabs (theta); |
107 | uint32_t ix, xi; |
108 | GET_FLOAT_WORD (xi, x); |
109 | /* |x| */ |
110 | ix = xi & 0x7fffffff; |
111 | /* If |x|< Pi/4. */ |
112 | if (ix < 0x3f490fdb) |
113 | { |
114 | if (ix >= 0x3d000000) /* |x| >= 2^-5. */ |
115 | { |
116 | __v2df v2x, v2sx, v2cx; |
117 | const __v2df v2theta = { theta, theta }; |
118 | const __v2df v2theta2 = v2theta * v2theta; |
119 | /* Chebyshev polynomial of the form for sin and cos. */ |
120 | v2x = V3 + v2theta2 * V4; |
121 | v2x = V2 + v2theta2 * v2x; |
122 | v2x = V1 + v2theta2 * v2x; |
123 | v2x = V0 + v2theta2 * v2x; |
124 | v2x = v2theta2 * v2x; |
125 | v2cx = v2ones + v2x; |
126 | v2sx = v2theta + v2theta * v2x; |
127 | v2cx[0] = v2sx[0]; |
128 | __v4sf v4sx = _mm_cvtpd_ps (v2cx); |
129 | *sinx = v4sx[0]; |
130 | *cosx = v4sx[1]; |
131 | } |
132 | else if (ix >= 0x32000000) /* |x| >= 2^-27. */ |
133 | { |
134 | /* A simpler Chebyshev approximation is close enough for this range: |
135 | for sin: x+x^3*(SS0+x^2*SS1) |
136 | for cos: 1.0+x^2*(CC0+x^3*CC1). */ |
137 | __v2df v2x, v2sx, v2cx; |
138 | const __v2df v2theta = { theta, theta }; |
139 | const __v2df v2theta2 = v2theta * v2theta; |
140 | v2x = VC0 + v2theta * v2theta2 * VC1; |
141 | v2x = v2theta2 * v2x; |
142 | v2cx = v2ones + v2x; |
143 | v2sx = v2theta + v2theta * v2x; |
144 | v2cx[0] = v2sx[0]; |
145 | __v4sf v4sx = _mm_cvtpd_ps (v2cx); |
146 | *sinx = v4sx[0]; |
147 | *cosx = v4sx[1]; |
148 | } |
149 | else |
150 | { |
151 | /* Handle some special cases. */ |
152 | if (ix) |
153 | *sinx = theta - (theta * SMALL); |
154 | else |
155 | *sinx = theta; |
156 | *cosx = 1.0 - abstheta; |
157 | } |
158 | } |
159 | else /* |x| >= Pi/4. */ |
160 | { |
161 | unsigned int signbit = xi >> 31; |
162 | if (ix < 0x40e231d6) /* |x| < 9*Pi/4. */ |
163 | { |
164 | /* There are cases where FE_UPWARD rounding mode can |
165 | produce a result of abstheta * inv_PI_4 == 9, |
166 | where abstheta < 9pi/4, so the domain for |
167 | pio2_table must go to 5 (9 / 2 + 1). */ |
168 | unsigned int n = (abstheta * inv_PI_4) + 1; |
169 | theta = abstheta - pio2_table[n / 2]; |
170 | reduced_sincos (theta, n, signbit, sinx, cosx); |
171 | } |
172 | else if (ix < 0x7f800000) |
173 | { |
174 | if (ix < 0x4b000000) /* |x| < 2^23. */ |
175 | { |
176 | unsigned int n = ((unsigned int) (abstheta * inv_PI_4)) + 1; |
177 | double x = n / 2; |
178 | theta = (abstheta - x * PI_2_hi) - x * PI_2_lo; |
179 | /* Argument reduction needed. */ |
180 | reduced_sincos (theta, n, signbit, sinx, cosx); |
181 | } |
182 | else /* |x| >= 2^23. */ |
183 | { |
184 | x = fabsf (x); |
185 | int exponent |
186 | = (ix >> FLOAT_EXPONENT_SHIFT) - FLOAT_EXPONENT_BIAS; |
187 | exponent += 3; |
188 | exponent /= 28; |
189 | double a = invpio4_table[exponent] * x; |
190 | double b = invpio4_table[exponent + 1] * x; |
191 | double c = invpio4_table[exponent + 2] * x; |
192 | double d = invpio4_table[exponent + 3] * x; |
193 | uint64_t l = a; |
194 | l &= ~0x7; |
195 | a -= l; |
196 | double e = a + b; |
197 | l = e; |
198 | e = a - l; |
199 | if (l & 1) |
200 | { |
201 | e -= 1.0; |
202 | e += b; |
203 | e += c; |
204 | e += d; |
205 | e *= M_PI_4; |
206 | reduced_sincos (e, l + 1, signbit, sinx, cosx); |
207 | } |
208 | else |
209 | { |
210 | e += b; |
211 | e += c; |
212 | e += d; |
213 | if (e <= 1.0) |
214 | { |
215 | e *= M_PI_4; |
216 | reduced_sincos (e, l + 1, signbit, sinx, cosx); |
217 | } |
218 | else |
219 | { |
220 | l++; |
221 | e -= 2.0; |
222 | e *= M_PI_4; |
223 | reduced_sincos (e, l + 1, signbit, sinx, cosx); |
224 | } |
225 | } |
226 | } |
227 | } |
228 | else |
229 | { |
230 | if (ix == 0x7f800000) |
231 | __set_errno (EDOM); |
232 | /* sin/cos(Inf or NaN) is NaN. */ |
233 | *sinx = *cosx = x - x; |
234 | } |
235 | } |
236 | } |
237 | |
238 | #ifndef SINCOSF |
239 | libm_alias_float (__sincos, sincos) |
240 | #endif |
241 | |