1/* Optimized __ieee754_expf function.
2 Copyright (C) 2012-2016 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
19
20#include <sysdep.h>
21
22/* Short algorithm description:
23 *
24 * Let K = 64 (table size).
25 * e^x = 2^(x/log(2)) = 2^n * T[j] * (1 + P(y))
26 * where
27 * x = m*log(2)/K + y, y in [0.0..log(2)/K]
28 * m = n*K + j, m,n,j - signed integer, j in [0..K-1]
29 * values of 2^(j/K) are tabulated as T[j].
30 *
31 * P(y) is a minimax polynomial approximation of expf(x)-1
32 * on small interval [0.0..log(2)/K].
33 *
34 * P(y) = P3*y*y*y*y + P2*y*y*y + P1*y*y + P0*y, calculated as
35 * z = y*y; P(y) = (P3*z + P1)*z + (P2*z + P0)*y
36 *
37 * Special cases:
38 * expf(NaN) = NaN
39 * expf(+INF) = +INF
40 * expf(-INF) = 0
41 * expf(x) = 1 for subnormals
42 * for finite argument, only expf(0)=1 is exact
43 * expf(x) overflows if x>88.7228317260742190
44 * expf(x) underflows if x<-103.972076416015620
45 */
46
47 .text
48ENTRY(__ieee754_expf)
49 /* Input: single precision x in %xmm0 */
50 cvtss2sd %xmm0, %xmm1 /* Convert x to double precision */
51 movd %xmm0, %ecx /* Copy x */
52 movsd L(DP_KLN2)(%rip), %xmm2 /* DP K/log(2) */
53 movsd L(DP_P2)(%rip), %xmm3 /* DP P2 */
54 movl %ecx, %eax /* x */
55 mulsd %xmm1, %xmm2 /* DP x*K/log(2) */
56 andl $0x7fffffff, %ecx /* |x| */
57 lea L(DP_T)(%rip), %rsi /* address of table T[j] */
58 cmpl $0x42ad496b, %ecx /* |x|<125*log(2) ? */
59 movsd L(DP_P3)(%rip), %xmm4 /* DP P3 */
60 addsd L(DP_RS)(%rip), %xmm2 /* DP x*K/log(2)+RS */
61 jae L(special_paths)
62
63 /* Here if |x|<125*log(2) */
64 cmpl $0x31800000, %ecx /* |x|<2^(-28) ? */
65 jb L(small_arg)
66
67 /* Main path: here if 2^(-28)<=|x|<125*log(2) */
68 cvtsd2ss %xmm2, %xmm2 /* SP x*K/log(2)+RS */
69 movd %xmm2, %eax /* bits of n*K+j with trash */
70 subss L(SP_RS)(%rip), %xmm2 /* SP t=round(x*K/log(2)) */
71 movl %eax, %edx /* n*K+j with trash */
72 cvtss2sd %xmm2, %xmm2 /* DP t */
73 andl $0x3f, %eax /* bits of j */
74 mulsd L(DP_NLN2K)(%rip), %xmm2/* DP -t*log(2)/K */
75 andl $0xffffffc0, %edx /* bits of n */
76#ifdef __AVX__
77 vaddsd %xmm1, %xmm2, %xmm0 /* DP y=x-t*log(2)/K */
78 vmulsd %xmm0, %xmm0, %xmm2 /* DP z=y*y */
79#else
80 addsd %xmm1, %xmm2 /* DP y=x-t*log(2)/K */
81 movaps %xmm2, %xmm0 /* DP y */
82 mulsd %xmm2, %xmm2 /* DP z=y*y */
83#endif
84 mulsd %xmm2, %xmm4 /* DP P3*z */
85 addl $0x1fc0, %edx /* bits of n + SP exponent bias */
86 mulsd %xmm2, %xmm3 /* DP P2*z */
87 shll $17, %edx /* SP 2^n */
88 addsd L(DP_P1)(%rip), %xmm4 /* DP P3*z+P1 */
89 addsd L(DP_P0)(%rip), %xmm3 /* DP P2*z+P0 */
90 movd %edx, %xmm1 /* SP 2^n */
91 mulsd %xmm2, %xmm4 /* DP (P3*z+P1)*z */
92 mulsd %xmm3, %xmm0 /* DP (P2*z+P0)*y */
93 addsd %xmm4, %xmm0 /* DP P(y) */
94 mulsd (%rsi,%rax,8), %xmm0 /* DP P(y)*T[j] */
95 addsd (%rsi,%rax,8), %xmm0 /* DP T[j]*(P(y)+1) */
96 cvtsd2ss %xmm0, %xmm0 /* SP T[j]*(P(y)+1) */
97 mulss %xmm1, %xmm0 /* SP result=2^n*(T[j]*(P(y)+1)) */
98 ret
99
100 .p2align 4
101L(small_arg):
102 /* Here if 0<=|x|<2^(-28) */
103 addss L(SP_ONE)(%rip), %xmm0 /* 1.0 + x */
104 /* Return 1.0 with inexact raised, except for x==0 */
105 ret
106
107 .p2align 4
108L(special_paths):
109 /* Here if 125*log(2)<=|x| */
110 shrl $31, %eax /* Get sign bit of x, and depending on it: */
111 lea L(SP_RANGE)(%rip), %rdx /* load over/underflow bound */
112 cmpl (%rdx,%rax,4), %ecx /* |x|<under/overflow bound ? */
113 jbe L(near_under_or_overflow)
114
115 /* Here if |x|>under/overflow bound */
116 cmpl $0x7f800000, %ecx /* |x| is finite ? */
117 jae L(arg_inf_or_nan)
118
119 /* Here if |x|>under/overflow bound, and x is finite */
120 testq %rax, %rax /* sign of x nonzero ? */
121 je L(res_overflow)
122
123 /* Here if -inf<x<underflow bound (x<0) */
124 movss L(SP_SMALL)(%rip), %xmm0/* load small value 2^(-100) */
125 mulss %xmm0, %xmm0 /* Return underflowed result (zero or subnormal) */
126 ret
127
128 .p2align 4
129L(res_overflow):
130 /* Here if overflow bound<x<inf (x>0) */
131 movss L(SP_LARGE)(%rip), %xmm0/* load large value 2^100 */
132 mulss %xmm0, %xmm0 /* Return overflowed result (Inf or max normal) */
133 ret
134
135 .p2align 4
136L(arg_inf_or_nan):
137 /* Here if |x| is Inf or NAN */
138 jne L(arg_nan) /* |x| is Inf ? */
139
140 /* Here if |x| is Inf */
141 lea L(SP_INF_0)(%rip), %rdx /* depending on sign of x: */
142 movss (%rdx,%rax,4), %xmm0 /* return zero or Inf */
143 ret
144
145 .p2align 4
146L(arg_nan):
147 /* Here if |x| is NaN */
148 addss %xmm0, %xmm0 /* Return x+x (raise invalid) */
149 ret
150
151 .p2align 4
152L(near_under_or_overflow):
153 /* Here if 125*log(2)<=|x|<under/overflow bound */
154 cvtsd2ss %xmm2, %xmm2 /* SP x*K/log(2)+RS */
155 movd %xmm2, %eax /* bits of n*K+j with trash */
156 subss L(SP_RS)(%rip), %xmm2 /* SP t=round(x*K/log(2)) */
157 movl %eax, %edx /* n*K+j with trash */
158 cvtss2sd %xmm2, %xmm2 /* DP t */
159 andl $0x3f, %eax /* bits of j */
160 mulsd L(DP_NLN2K)(%rip), %xmm2/* DP -t*log(2)/K */
161 andl $0xffffffc0, %edx /* bits of n */
162#ifdef __AVX__
163 vaddsd %xmm1, %xmm2, %xmm0 /* DP y=x-t*log(2)/K */
164 vmulsd %xmm0, %xmm0, %xmm2 /* DP z=y*y */
165#else
166 addsd %xmm1, %xmm2 /* DP y=x-t*log(2)/K */
167 movaps %xmm2, %xmm0 /* DP y */
168 mulsd %xmm2, %xmm2 /* DP z=y*y */
169#endif
170 mulsd %xmm2, %xmm4 /* DP P3*z */
171 addl $0xffc0, %edx /* bits of n + DP exponent bias */
172 mulsd %xmm2, %xmm3 /* DP P2*z */
173 shlq $46, %rdx /* DP 2^n */
174 addsd L(DP_P1)(%rip), %xmm4 /* DP P3*z+P1 */
175 addsd L(DP_P0)(%rip), %xmm3 /* DP P2*z+P0 */
176 movd %rdx, %xmm1 /* DP 2^n */
177 mulsd %xmm2, %xmm4 /* DP (P3*z+P1)*z */
178 mulsd %xmm3, %xmm0 /* DP (P2*z+P0)*y */
179 addsd %xmm4, %xmm0 /* DP P(y) */
180 mulsd (%rsi,%rax,8), %xmm0 /* DP P(y)*T[j] */
181 addsd (%rsi,%rax,8), %xmm0 /* DP T[j]*(P(y)+1) */
182 mulsd %xmm1, %xmm0 /* DP result=2^n*(T[j]*(P(y)+1)) */
183 cvtsd2ss %xmm0, %xmm0 /* convert result to single precision */
184 ret
185END(__ieee754_expf)
186
187 .section .rodata, "a"
188 .p2align 3
189L(DP_T): /* table of double precision values 2^(j/K) for j=[0..K-1] */
190 .long 0x00000000, 0x3ff00000
191 .long 0x3e778061, 0x3ff02c9a
192 .long 0xd3158574, 0x3ff059b0
193 .long 0x18759bc8, 0x3ff08745
194 .long 0x6cf9890f, 0x3ff0b558
195 .long 0x32d3d1a2, 0x3ff0e3ec
196 .long 0xd0125b51, 0x3ff11301
197 .long 0xaea92de0, 0x3ff1429a
198 .long 0x3c7d517b, 0x3ff172b8
199 .long 0xeb6fcb75, 0x3ff1a35b
200 .long 0x3168b9aa, 0x3ff1d487
201 .long 0x88628cd6, 0x3ff2063b
202 .long 0x6e756238, 0x3ff2387a
203 .long 0x65e27cdd, 0x3ff26b45
204 .long 0xf51fdee1, 0x3ff29e9d
205 .long 0xa6e4030b, 0x3ff2d285
206 .long 0x0a31b715, 0x3ff306fe
207 .long 0xb26416ff, 0x3ff33c08
208 .long 0x373aa9cb, 0x3ff371a7
209 .long 0x34e59ff7, 0x3ff3a7db
210 .long 0x4c123422, 0x3ff3dea6
211 .long 0x21f72e2a, 0x3ff4160a
212 .long 0x6061892d, 0x3ff44e08
213 .long 0xb5c13cd0, 0x3ff486a2
214 .long 0xd5362a27, 0x3ff4bfda
215 .long 0x769d2ca7, 0x3ff4f9b2
216 .long 0x569d4f82, 0x3ff5342b
217 .long 0x36b527da, 0x3ff56f47
218 .long 0xdd485429, 0x3ff5ab07
219 .long 0x15ad2148, 0x3ff5e76f
220 .long 0xb03a5585, 0x3ff6247e
221 .long 0x82552225, 0x3ff66238
222 .long 0x667f3bcd, 0x3ff6a09e
223 .long 0x3c651a2f, 0x3ff6dfb2
224 .long 0xe8ec5f74, 0x3ff71f75
225 .long 0x564267c9, 0x3ff75feb
226 .long 0x73eb0187, 0x3ff7a114
227 .long 0x36cf4e62, 0x3ff7e2f3
228 .long 0x994cce13, 0x3ff82589
229 .long 0x9b4492ed, 0x3ff868d9
230 .long 0x422aa0db, 0x3ff8ace5
231 .long 0x99157736, 0x3ff8f1ae
232 .long 0xb0cdc5e5, 0x3ff93737
233 .long 0x9fde4e50, 0x3ff97d82
234 .long 0x82a3f090, 0x3ff9c491
235 .long 0x7b5de565, 0x3ffa0c66
236 .long 0xb23e255d, 0x3ffa5503
237 .long 0x5579fdbf, 0x3ffa9e6b
238 .long 0x995ad3ad, 0x3ffae89f
239 .long 0xb84f15fb, 0x3ffb33a2
240 .long 0xf2fb5e47, 0x3ffb7f76
241 .long 0x904bc1d2, 0x3ffbcc1e
242 .long 0xdd85529c, 0x3ffc199b
243 .long 0x2e57d14b, 0x3ffc67f1
244 .long 0xdcef9069, 0x3ffcb720
245 .long 0x4a07897c, 0x3ffd072d
246 .long 0xdcfba487, 0x3ffd5818
247 .long 0x03db3285, 0x3ffda9e6
248 .long 0x337b9b5f, 0x3ffdfc97
249 .long 0xe78b3ff6, 0x3ffe502e
250 .long 0xa2a490da, 0x3ffea4af
251 .long 0xee615a27, 0x3ffefa1b
252 .long 0x5b6e4540, 0x3fff5076
253 .long 0x819e90d8, 0x3fffa7c1
254 .type L(DP_T), @object
255 ASM_SIZE_DIRECTIVE(L(DP_T))
256
257 .section .rodata.cst8,"aM",@progbits,8
258 .p2align 3
259L(DP_KLN2): /* double precision K/log(2) */
260 .long 0x652b82fe, 0x40571547
261 .type L(DP_KLN2), @object
262 ASM_SIZE_DIRECTIVE(L(DP_KLN2))
263
264 .p2align 3
265L(DP_NLN2K): /* double precision -log(2)/K */
266 .long 0xfefa39ef, 0xbf862e42
267 .type L(DP_NLN2K), @object
268 ASM_SIZE_DIRECTIVE(L(DP_NLN2K))
269
270 .p2align 3
271L(DP_RS): /* double precision 2^23+2^22 */
272 .long 0x00000000, 0x41680000
273 .type L(DP_RS), @object
274 ASM_SIZE_DIRECTIVE(L(DP_RS))
275
276 .p2align 3
277L(DP_P3): /* double precision polynomial coefficient P3 */
278 .long 0xeb78fa85, 0x3fa56420
279 .type L(DP_P3), @object
280 ASM_SIZE_DIRECTIVE(L(DP_P3))
281
282 .p2align 3
283L(DP_P1): /* double precision polynomial coefficient P1 */
284 .long 0x008d6118, 0x3fe00000
285 .type L(DP_P1), @object
286 ASM_SIZE_DIRECTIVE(L(DP_P1))
287
288 .p2align 3
289L(DP_P2): /* double precision polynomial coefficient P2 */
290 .long 0xda752d4f, 0x3fc55550
291 .type L(DP_P2), @object
292 ASM_SIZE_DIRECTIVE(L(DP_P2))
293
294 .p2align 3
295L(DP_P0): /* double precision polynomial coefficient P0 */
296 .long 0xffffe7c6, 0x3fefffff
297 .type L(DP_P0), @object
298 ASM_SIZE_DIRECTIVE(L(DP_P0))
299
300 .p2align 2
301L(SP_RANGE): /* single precision overflow/underflow bounds */
302 .long 0x42b17217 /* if x>this bound, then result overflows */
303 .long 0x42cff1b4 /* if x<this bound, then result underflows */
304 .type L(SP_RANGE), @object
305 ASM_SIZE_DIRECTIVE(L(SP_RANGE))
306
307 .p2align 2
308L(SP_INF_0):
309 .long 0x7f800000 /* single precision Inf */
310 .long 0 /* single precision zero */
311 .type L(SP_INF_0), @object
312 ASM_SIZE_DIRECTIVE(L(SP_INF_0))
313
314 .section .rodata.cst4,"aM",@progbits,4
315 .p2align 2
316L(SP_RS): /* single precision 2^23+2^22 */
317 .long 0x4b400000
318 .type L(SP_RS), @object
319 ASM_SIZE_DIRECTIVE(L(SP_RS))
320
321 .p2align 2
322L(SP_SMALL): /* single precision small value 2^(-100) */
323 .long 0x0d800000
324 .type L(SP_SMALL), @object
325 ASM_SIZE_DIRECTIVE(L(SP_SMALL))
326
327 .p2align 2
328L(SP_LARGE): /* single precision large value 2^100 */
329 .long 0x71800000
330 .type L(SP_LARGE), @object
331 ASM_SIZE_DIRECTIVE(L(SP_LARGE))
332
333 .p2align 2
334L(SP_ONE): /* single precision 1.0 */
335 .long 0x3f800000
336 .type L(SP_ONE), @object
337 ASM_SIZE_DIRECTIVE(L(SP_ONE))
338
339strong_alias (__ieee754_expf, __expf_finite)
340