e_expf.S source code [glibc_src_2.24/sysdeps/x86_64/fpu/e_expf.S]

1	/ Optimized __ieee754_expf function.*
2	Copyright (C) 2012-2016 Free Software Foundation, Inc.
3	Contributed by Intel Corporation.
4	This file is part of the GNU C Library.
5
6	The GNU C Library is free software; you can redistribute it and/or
7	modify it under the terms of the GNU Lesser General Public
8	License as published by the Free Software Foundation; either
9	version 2.1 of the License, or (at your option) any later version.
10
11	The GNU C Library is distributed in the hope that it will be useful,
12	but WITHOUT ANY WARRANTY; without even the implied warranty of
13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14	Lesser General Public License for more details.
15
16	You should have received a copy of the GNU Lesser General Public
17	License along with the GNU C Library; if not, see
18	<http://www.gnu.org/licenses/>. /*
19
20	#include <sysdep.h>
21
22	/ Short algorithm description:*
23	*
24	* Let K = 64 (table size).
25	* e^x = 2^(x/log(2)) = 2^n * T[j] * (1 + P(y))
26	* where
27	* x = m*log(2)/K + y, y in [0.0..log(2)/K]
28	* m = n*K + j, m,n,j - signed integer, j in [0..K-1]
29	* values of 2^(j/K) are tabulated as T[j].
30	*
31	* P(y) is a minimax polynomial approximation of expf(x)-1
32	* on small interval [0.0..log(2)/K].
33	*
34	* P(y) = P3yyyy + P2yyy + P1yy + P0y, calculated as
35	* z = yy; P(y) = (P3z + P1)z + (P2z + P0)*y
36	*
37	* Special cases:
38	* expf(NaN) = NaN
39	* expf(+INF) = +INF
40	* expf(-INF) = 0
41	* expf(x) = 1 for subnormals
42	* for finite argument, only expf(0)=1 is exact
43	* expf(x) overflows if x>88.7228317260742190
44	* expf(x) underflows if x<-103.972076416015620
45	*/
46
47	.text
48	ENTRY(__ieee754_expf)
49	/ Input: single precision x in %xmm0 /
50	cvtss2sd %xmm0, %xmm1 / Convert x to double precision /
51	movd %xmm0, %ecx / Copy x /
52	movsd L(DP_KLN2)(%rip), %xmm2 / DP K/log(2) /
53	movsd L(DP_P2)(%rip), %xmm3 / DP P2 /
54	movl %ecx, %eax / x /
55	mulsd %xmm1, %xmm2 / DP xK/log(2) /*
56	andl $`0x7fffffff`, %ecx / \|x\| /
57	lea L(DP_T)(%rip), %rsi / address of table T[j] /
58	cmpl $`0x42ad496b`, %ecx / \|x\|<125log(2) ? /*
59	movsd L(DP_P3)(%rip), %xmm4 / DP P3 /
60	addsd L(DP_RS)(%rip), %xmm2 / DP xK/log(2)+RS /*
61	jae L(special_paths)
62
63	/ Here if \|x\|<125log(2) /*
64	cmpl $`0x31800000`, %ecx / \|x\|<2^(-28) ? /
65	jb L(small_arg)
66
67	/ Main path: here if 2^(-28)<=\|x\|<125log(2) /*
68	cvtsd2ss %xmm2, %xmm2 / SP xK/log(2)+RS /*
69	movd %xmm2, %eax / bits of nK+j with trash /*
70	subss L(SP_RS)(%rip), %xmm2 / SP t=round(xK/log(2)) /*
71	movl %eax, %edx / nK+j with trash /*
72	cvtss2sd %xmm2, %xmm2 / DP t /
73	andl $`0x3f`, %eax / bits of j /
74	mulsd L(DP_NLN2K)(%rip), %xmm2/* DP -tlog(2)/K /
75	andl $`0xffffffc0`, %edx / bits of n /
76	#ifdef __AVX__
77	vaddsd %xmm1, %xmm2, %xmm0 / DP y=x-tlog(2)/K /*
78	vmulsd %xmm0, %xmm0, %xmm2 / DP z=yy /*
79	#else
80	addsd %xmm1, %xmm2 / DP y=x-tlog(2)/K /*
81	movaps %xmm2, %xmm0 / DP y /
82	mulsd %xmm2, %xmm2 / DP z=yy /*
83	#endif
84	mulsd %xmm2, %xmm4 / DP P3z /*
85	addl $`0x1fc0`, %edx / bits of n + SP exponent bias /
86	mulsd %xmm2, %xmm3 / DP P2z /*
87	shll $`17`, %edx / SP 2^n /
88	addsd L(DP_P1)(%rip), %xmm4 / DP P3z+P1 /*
89	addsd L(DP_P0)(%rip), %xmm3 / DP P2z+P0 /*
90	movd %edx, %xmm1 / SP 2^n /
91	mulsd %xmm2, %xmm4 / DP (P3z+P1)z /
92	mulsd %xmm3, %xmm0 / DP (P2z+P0)y /
93	addsd %xmm4, %xmm0 / DP P(y) /
94	mulsd (%rsi,%rax,`8`), %xmm0 / DP P(y)T[j] /*
95	addsd (%rsi,%rax,`8`), %xmm0 / DP T[j](P(y)+1) /*
96	cvtsd2ss %xmm0, %xmm0 / SP T[j](P(y)+1) /*
97	mulss %xmm1, %xmm0 / SP result=2^n(T[j](P(y)+1)) /
98	ret
99
100	.p2align `4`
101	L(small_arg):
102	/ Here if 0<=\|x\|<2^(-28) /
103	addss L(SP_ONE)(%rip), %xmm0 / 1.0 + x /
104	/ Return 1.0 with inexact raised, except for x==0 /
105	ret
106
107	.p2align `4`
108	L(special_paths):
109	/ Here if 125log(2)<=\|x\| /*
110	shrl $`31`, %eax / Get sign bit of x, and depending on it: /
111	lea L(SP_RANGE)(%rip), %rdx / load over/underflow bound /
112	cmpl (%rdx,%rax,`4`), %ecx / \|x\|<under/overflow bound ? /
113	jbe L(near_under_or_overflow)
114
115	/ Here if \|x\|>under/overflow bound /
116	cmpl $`0x7f800000`, %ecx / \|x\| is finite ? /
117	jae L(arg_inf_or_nan)
118
119	/ Here if \|x\|>under/overflow bound, and x is finite /
120	testq %rax, %rax / sign of x nonzero ? /
121	je L(res_overflow)
122
123	/ Here if -inf<x<underflow bound (x<0) /
124	movss L(SP_SMALL)(%rip), %xmm0/* load small value 2^(-100) */
125	mulss %xmm0, %xmm0 / Return underflowed result (zero or subnormal) /
126	ret
127
128	.p2align `4`
129	L(res_overflow):
130	/ Here if overflow bound<x<inf (x>0) /
131	movss L(SP_LARGE)(%rip), %xmm0/* load large value 2^100 */
132	mulss %xmm0, %xmm0 / Return overflowed result (Inf or max normal) /
133	ret
134
135	.p2align `4`
136	L(arg_inf_or_nan):
137	/ Here if \|x\| is Inf or NAN /
138	jne L(arg_nan) / \|x\| is Inf ? /
139
140	/ Here if \|x\| is Inf /
141	lea L(SP_INF_0)(%rip), %rdx / depending on sign of x: /
142	movss (%rdx,%rax,`4`), %xmm0 / return zero or Inf /
143	ret
144
145	.p2align `4`
146	L(arg_nan):
147	/ Here if \|x\| is NaN /
148	addss %xmm0, %xmm0 / Return x+x (raise invalid) /
149	ret
150
151	.p2align `4`
152	L(near_under_or_overflow):
153	/ Here if 125log(2)<=\|x\|<under/overflow bound /*
154	cvtsd2ss %xmm2, %xmm2 / SP xK/log(2)+RS /*
155	movd %xmm2, %eax / bits of nK+j with trash /*
156	subss L(SP_RS)(%rip), %xmm2 / SP t=round(xK/log(2)) /*
157	movl %eax, %edx / nK+j with trash /*
158	cvtss2sd %xmm2, %xmm2 / DP t /
159	andl $`0x3f`, %eax / bits of j /
160	mulsd L(DP_NLN2K)(%rip), %xmm2/* DP -tlog(2)/K /
161	andl $`0xffffffc0`, %edx / bits of n /
162	#ifdef __AVX__
163	vaddsd %xmm1, %xmm2, %xmm0 / DP y=x-tlog(2)/K /*
164	vmulsd %xmm0, %xmm0, %xmm2 / DP z=yy /*
165	#else
166	addsd %xmm1, %xmm2 / DP y=x-tlog(2)/K /*
167	movaps %xmm2, %xmm0 / DP y /
168	mulsd %xmm2, %xmm2 / DP z=yy /*
169	#endif
170	mulsd %xmm2, %xmm4 / DP P3z /*
171	addl $`0xffc0`, %edx / bits of n + DP exponent bias /
172	mulsd %xmm2, %xmm3 / DP P2z /*
173	shlq $`46`, %rdx / DP 2^n /
174	addsd L(DP_P1)(%rip), %xmm4 / DP P3z+P1 /*
175	addsd L(DP_P0)(%rip), %xmm3 / DP P2z+P0 /*
176	movd %rdx, %xmm1 / DP 2^n /
177	mulsd %xmm2, %xmm4 / DP (P3z+P1)z /
178	mulsd %xmm3, %xmm0 / DP (P2z+P0)y /
179	addsd %xmm4, %xmm0 / DP P(y) /
180	mulsd (%rsi,%rax,`8`), %xmm0 / DP P(y)T[j] /*
181	addsd (%rsi,%rax,`8`), %xmm0 / DP T[j](P(y)+1) /*
182	mulsd %xmm1, %xmm0 / DP result=2^n(T[j](P(y)+1)) /
183	cvtsd2ss %xmm0, %xmm0 / convert result to single precision /
184	ret
185	END(__ieee754_expf)
186
187	.section .rodata, "a"
188	.p2align `3`
189	L(DP_T): / table of double precision values 2^(j/K) for j=[0..K-1] /
190	.long `0x00000000`, `0x3ff00000`
191	.long `0x3e778061`, `0x3ff02c9a`
192	.long `0xd3158574`, `0x3ff059b0`
193	.long `0x18759bc8`, `0x3ff08745`
194	.long `0x6cf9890f`, `0x3ff0b558`
195	.long `0x32d3d1a2`, `0x3ff0e3ec`
196	.long `0xd0125b51`, `0x3ff11301`
197	.long `0xaea92de0`, `0x3ff1429a`
198	.long `0x3c7d517b`, `0x3ff172b8`
199	.long `0xeb6fcb75`, `0x3ff1a35b`
200	.long `0x3168b9aa`, `0x3ff1d487`
201	.long `0x88628cd6`, `0x3ff2063b`
202	.long `0x6e756238`, `0x3ff2387a`
203	.long `0x65e27cdd`, `0x3ff26b45`
204	.long `0xf51fdee1`, `0x3ff29e9d`
205	.long `0xa6e4030b`, `0x3ff2d285`
206	.long `0x0a31b715`, `0x3ff306fe`
207	.long `0xb26416ff`, `0x3ff33c08`
208	.long `0x373aa9cb`, `0x3ff371a7`
209	.long `0x34e59ff7`, `0x3ff3a7db`
210	.long `0x4c123422`, `0x3ff3dea6`
211	.long `0x21f72e2a`, `0x3ff4160a`
212	.long `0x6061892d`, `0x3ff44e08`
213	.long `0xb5c13cd0`, `0x3ff486a2`
214	.long `0xd5362a27`, `0x3ff4bfda`
215	.long `0x769d2ca7`, `0x3ff4f9b2`
216	.long `0x569d4f82`, `0x3ff5342b`
217	.long `0x36b527da`, `0x3ff56f47`
218	.long `0xdd485429`, `0x3ff5ab07`
219	.long `0x15ad2148`, `0x3ff5e76f`
220	.long `0xb03a5585`, `0x3ff6247e`
221	.long `0x82552225`, `0x3ff66238`
222	.long `0x667f3bcd`, `0x3ff6a09e`
223	.long `0x3c651a2f`, `0x3ff6dfb2`
224	.long `0xe8ec5f74`, `0x3ff71f75`
225	.long `0x564267c9`, `0x3ff75feb`
226	.long `0x73eb0187`, `0x3ff7a114`
227	.long `0x36cf4e62`, `0x3ff7e2f3`
228	.long `0x994cce13`, `0x3ff82589`
229	.long `0x9b4492ed`, `0x3ff868d9`
230	.long `0x422aa0db`, `0x3ff8ace5`
231	.long `0x99157736`, `0x3ff8f1ae`
232	.long `0xb0cdc5e5`, `0x3ff93737`
233	.long `0x9fde4e50`, `0x3ff97d82`
234	.long `0x82a3f090`, `0x3ff9c491`
235	.long `0x7b5de565`, `0x3ffa0c66`
236	.long `0xb23e255d`, `0x3ffa5503`
237	.long `0x5579fdbf`, `0x3ffa9e6b`
238	.long `0x995ad3ad`, `0x3ffae89f`
239	.long `0xb84f15fb`, `0x3ffb33a2`
240	.long `0xf2fb5e47`, `0x3ffb7f76`
241	.long `0x904bc1d2`, `0x3ffbcc1e`
242	.long `0xdd85529c`, `0x3ffc199b`
243	.long `0x2e57d14b`, `0x3ffc67f1`
244	.long `0xdcef9069`, `0x3ffcb720`
245	.long `0x4a07897c`, `0x3ffd072d`
246	.long `0xdcfba487`, `0x3ffd5818`
247	.long `0x03db3285`, `0x3ffda9e6`
248	.long `0x337b9b5f`, `0x3ffdfc97`
249	.long `0xe78b3ff6`, `0x3ffe502e`
250	.long `0xa2a490da`, `0x3ffea4af`
251	.long `0xee615a27`, `0x3ffefa1b`
252	.long `0x5b6e4540`, `0x3fff5076`
253	.long `0x819e90d8`, `0x3fffa7c1`
254	.type L(DP_T), @object
255	ASM_SIZE_DIRECTIVE(L(DP_T))
256
257	.section .rodata.cst8,"aM",@progbits,`8`
258	.p2align `3`
259	L(DP_KLN2): / double precision K/log(2) /
260	.long `0x652b82fe`, `0x40571547`
261	.type L(DP_KLN2), @object
262	ASM_SIZE_DIRECTIVE(L(DP_KLN2))
263
264	.p2align `3`
265	L(DP_NLN2K): / double precision -log(2)/K /
266	.long `0xfefa39ef`, `0xbf862e42`
267	.type L(DP_NLN2K), @object
268	ASM_SIZE_DIRECTIVE(L(DP_NLN2K))
269
270	.p2align `3`
271	L(DP_RS): / double precision 2^23+2^22 /
272	.long `0x00000000`, `0x41680000`
273	.type L(DP_RS), @object
274	ASM_SIZE_DIRECTIVE(L(DP_RS))
275
276	.p2align `3`
277	L(DP_P3): / double precision polynomial coefficient P3 /
278	.long `0xeb78fa85`, `0x3fa56420`
279	.type L(DP_P3), @object
280	ASM_SIZE_DIRECTIVE(L(DP_P3))
281
282	.p2align `3`
283	L(DP_P1): / double precision polynomial coefficient P1 /
284	.long `0x008d6118`, `0x3fe00000`
285	.type L(DP_P1), @object
286	ASM_SIZE_DIRECTIVE(L(DP_P1))
287
288	.p2align `3`
289	L(DP_P2): / double precision polynomial coefficient P2 /
290	.long `0xda752d4f`, `0x3fc55550`
291	.type L(DP_P2), @object
292	ASM_SIZE_DIRECTIVE(L(DP_P2))
293
294	.p2align `3`
295	L(DP_P0): / double precision polynomial coefficient P0 /
296	.long `0xffffe7c6`, `0x3fefffff`
297	.type L(DP_P0), @object
298	ASM_SIZE_DIRECTIVE(L(DP_P0))
299
300	.p2align `2`
301	L(SP_RANGE): / single precision overflow/underflow bounds /
302	.long `0x42b17217` / if x>this bound, then result overflows /
303	.long `0x42cff1b4` / if x<this bound, then result underflows /
304	.type L(SP_RANGE), @object
305	ASM_SIZE_DIRECTIVE(L(SP_RANGE))
306
307	.p2align `2`
308	L(SP_INF_0):
309	.long `0x7f800000` / single precision Inf /
310	.long `0` / single precision zero /
311	.type L(SP_INF_0), @object
312	ASM_SIZE_DIRECTIVE(L(SP_INF_0))
313
314	.section .rodata.cst4,"aM",@progbits,`4`
315	.p2align `2`
316	L(SP_RS): / single precision 2^23+2^22 /
317	.long `0x4b400000`
318	.type L(SP_RS), @object
319	ASM_SIZE_DIRECTIVE(L(SP_RS))
320
321	.p2align `2`
322	L(SP_SMALL): / single precision small value 2^(-100) /
323	.long `0x0d800000`
324	.type L(SP_SMALL), @object
325	ASM_SIZE_DIRECTIVE(L(SP_SMALL))
326
327	.p2align `2`
328	L(SP_LARGE): / single precision large value 2^100 /
329	.long `0x71800000`
330	.type L(SP_LARGE), @object
331	ASM_SIZE_DIRECTIVE(L(SP_LARGE))
332
333	.p2align `2`
334	L(SP_ONE): / single precision 1.0 /
335	.long `0x3f800000`
336	.type L(SP_ONE), @object
337	ASM_SIZE_DIRECTIVE(L(SP_ONE))
338
339	strong_alias (__ieee754_expf, __expf_finite)
340

Browse the source code of glibc_src_2.24/sysdeps/x86_64/fpu/e_expf.S