/* Function sin vectorized with AVX-512, KNL and SKX versions.
   Copyright (C) 2014-2020 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include "svml_d_trig_data.h"
#include "svml_d_wrapper_impl.h"

        .text
ENTRY (_ZGVeN8v_sin_knl)
#ifndef HAVE_AVX512DQ_ASM_SUPPORT
WRAPPER_IMPL_AVX512 _ZGVdN4v_sin
#else
/*
   ALGORITHM DESCRIPTION:

   ( low accuracy ( < 4ulp ) or enhanced performance
     ( half of correct mantissa ) implementation )

   Argument representation:
   arg = N*Pi + R

   Result calculation:
   sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R)
   sin(R) is approximated by corresponding polynomial
 */
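
/* A rough scalar C sketch of the fast path below (illustrative only, not
   part of the build; the __d* names stand in for the corresponding fields
   of the __svml_d_trig_data table, and the helper name sin_lane_sketch is
   made up for this comment):

     #include <math.h>
     #include <stdint.h>
     #include <string.h>

     static double
     sin_lane_sketch (double x)
     {
       double ax = fabs (x);                      // X' = |X|
       double y = ax * __dInvPI + __dRShifter;    // Y = X'*InvPi + RS
       uint64_t ybits;
       memcpy (&ybits, &y, sizeof ybits);
       uint64_t sign_res = ybits << 63;           // SignRes = Y<<63
       double n = y - __dRShifter;                // N = Y - RS
       double r = ax - n * __dPI1_FMA;            // R = X' - N*Pi1
       r -= n * __dPI2_FMA;                       // R = R - N*Pi2
       r -= n * __dPI3_FMA;                       // R = R - N*Pi3
       double r2 = r * r;                         // R2 = R*R
       uint64_t rbits;                            // R = R^SignRes
       memcpy (&rbits, &r, sizeof rbits);
       rbits ^= sign_res;
       memcpy (&r, &rbits, sizeof rbits);
       double p = __dC7_sin;                      // Horner evaluation
       p = p * r2 + __dC6_sin;
       p = p * r2 + __dC5_sin;
       p = p * r2 + __dC4_sin;
       p = p * r2 + __dC3_sin;
       p = p * r2 + __dC2_sin;
       p = p * r2 + __dC1_sin;
       double res = (p * r2) * r + r;             // Poly = Poly*R + R
       return signbit (x) ? -res : res;           // Res = Poly^SignX
     }

   Lanes whose |x| falls outside __dRangeVal do not use this path; they are
   recomputed with the scalar sin () in the tail of the function.  */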
        pushq     %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq      %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq      $-64, %rsp
        subq      $1280, %rsp
        movq      __svml_d_trig_data@GOTPCREL(%rip), %rax
        movq      $-1, %rdx
        vmovups   __dAbsMask(%rax), %zmm6
        vmovups   __dInvPI(%rax), %zmm1

/*
   ARGUMENT RANGE REDUCTION:
   X' = |X|
 */
        vpandq    %zmm6, %zmm0, %zmm12
        vmovups   __dPI1_FMA(%rax), %zmm2
        vmovups   __dC7_sin(%rax), %zmm7

/* SignX - sign bit of X */
        vpandnq   %zmm0, %zmm6, %zmm11

/* R = X' - N*Pi1 */
        vmovaps   %zmm12, %zmm3

/* Y = X'*InvPi + RS : right shifter add */
        vfmadd213pd __dRShifter(%rax), %zmm12, %zmm1
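
/* Detect lanes that need the accurate fallback: predicate 22 (NLE_UQ)
   flags lanes where X' is not <= __dRangeVal, i.e. too large or NaN; the
   masked all-ones broadcast plus the vptestmq/kmovw below turn that mask
   into a bit pattern in %ecx.  */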
        vcmppd    $22, __dRangeVal(%rax), %zmm12, %k1
        vpbroadcastq %rdx, %zmm13{%k1}{z}

/* N = Y - RS : right shifter sub */
        vsubpd    __dRShifter(%rax), %zmm1, %zmm4

/* SignRes = Y<<63 : shift LSB to MSB place for result sign */
        vpsllq    $63, %zmm1, %zmm5
        vptestmq  %zmm13, %zmm13, %k0
        vfnmadd231pd %zmm4, %zmm2, %zmm3
        kmovw     %k0, %ecx
        movzbl    %cl, %ecx

/* R = R - N*Pi2 */
        vfnmadd231pd __dPI2_FMA(%rax), %zmm4, %zmm3

/* R = R - N*Pi3 */
        vfnmadd132pd __dPI3_FMA(%rax), %zmm3, %zmm4

/*
   POLYNOMIAL APPROXIMATION:
   R2 = R*R
 */
        vmulpd    %zmm4, %zmm4, %zmm8

/* R = R^SignRes : update sign of reduced argument */
        vpxorq    %zmm5, %zmm4, %zmm9
        vfmadd213pd __dC6_sin(%rax), %zmm8, %zmm7
        vfmadd213pd __dC5_sin(%rax), %zmm8, %zmm7
        vfmadd213pd __dC4_sin(%rax), %zmm8, %zmm7

/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */
        vfmadd213pd __dC3_sin(%rax), %zmm8, %zmm7

/* Poly = R2*(C1+R2*(C2+R2*Poly)) */
        vfmadd213pd __dC2_sin(%rax), %zmm8, %zmm7
        vfmadd213pd __dC1_sin(%rax), %zmm8, %zmm7
        vmulpd    %zmm8, %zmm7, %zmm10

/* Poly = Poly*R + R */
        vfmadd213pd %zmm9, %zmm9, %zmm10

/*
   RECONSTRUCTION:
   Final sign setting: Res = Poly^SignX
 */
        vpxorq    %zmm11, %zmm10, %zmm1
        testl     %ecx, %ecx
        jne       .LBL_1_3

.LBL_1_2:
        cfi_remember_state
        vmovaps   %zmm1, %zmm0
        movq      %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq      %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret

.LBL_1_3:
        cfi_restore_state
        vmovups   %zmm0, 1152(%rsp)
        vmovups   %zmm1, 1216(%rsp)
        je        .LBL_1_2

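/* Slow path: at least one lane was flagged above.  The input %zmm0 and the
   fast-path result %zmm1 were spilled to 1152(%rsp) and 1216(%rsp); now
   save %k4-%k7, %zmm16-%zmm31 and the GPRs used by the lane loop before
   calling the scalar sin for each flagged lane.  */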
        xorb      %dl, %dl
        kmovw     %k4, 1048(%rsp)
        xorl      %eax, %eax
        kmovw     %k5, 1040(%rsp)
        kmovw     %k6, 1032(%rsp)
        kmovw     %k7, 1024(%rsp)
        vmovups   %zmm16, 960(%rsp)
        vmovups   %zmm17, 896(%rsp)
        vmovups   %zmm18, 832(%rsp)
        vmovups   %zmm19, 768(%rsp)
        vmovups   %zmm20, 704(%rsp)
        vmovups   %zmm21, 640(%rsp)
        vmovups   %zmm22, 576(%rsp)
        vmovups   %zmm23, 512(%rsp)
        vmovups   %zmm24, 448(%rsp)
        vmovups   %zmm25, 384(%rsp)
        vmovups   %zmm26, 320(%rsp)
        vmovups   %zmm27, 256(%rsp)
        vmovups   %zmm28, 192(%rsp)
        vmovups   %zmm29, 128(%rsp)
        vmovups   %zmm30, 64(%rsp)
        vmovups   %zmm31, (%rsp)
        movq      %rsi, 1064(%rsp)
        movq      %rdi, 1056(%rsp)
        movq      %r12, 1096(%rsp)
        cfi_offset_rel_rsp (12, 1096)
        movb      %dl, %r12b
        movq      %r13, 1088(%rsp)
        cfi_offset_rel_rsp (13, 1088)
        movl      %ecx, %r13d
        movq      %r14, 1080(%rsp)
        cfi_offset_rel_rsp (14, 1080)
        movl      %eax, %r14d
        movq      %r15, 1072(%rsp)
        cfi_offset_rel_rsp (15, 1072)
        cfi_remember_state

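/* Walk the lanes two at a time: %r13d holds the flagged-lane bitmask,
   %r14d the current lane index and %r12b the pair counter (also the index
   into the 16-byte lane pairs of the spill area).  Each set bit reloads
   that lane's input from the 1152(%rsp) block and overwrites its slot in
   the saved result at 1216(%rsp) with the scalar sin () value.  */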
.LBL_1_6:
        btl       %r14d, %r13d
        jc        .LBL_1_12

.LBL_1_7:
        lea       1(%r14), %esi
        btl       %esi, %r13d
        jc        .LBL_1_10

.LBL_1_8:
        addb      $1, %r12b
        addl      $2, %r14d
        cmpb      $16, %r12b
        jb        .LBL_1_6

        kmovw     1048(%rsp), %k4
        movq      1064(%rsp), %rsi
        kmovw     1040(%rsp), %k5
        movq      1056(%rsp), %rdi
        kmovw     1032(%rsp), %k6
        movq      1096(%rsp), %r12
        cfi_restore (%r12)
        movq      1088(%rsp), %r13
        cfi_restore (%r13)
        kmovw     1024(%rsp), %k7
        vmovups   960(%rsp), %zmm16
        vmovups   896(%rsp), %zmm17
        vmovups   832(%rsp), %zmm18
        vmovups   768(%rsp), %zmm19
        vmovups   704(%rsp), %zmm20
        vmovups   640(%rsp), %zmm21
        vmovups   576(%rsp), %zmm22
        vmovups   512(%rsp), %zmm23
        vmovups   448(%rsp), %zmm24
        vmovups   384(%rsp), %zmm25
        vmovups   320(%rsp), %zmm26
        vmovups   256(%rsp), %zmm27
        vmovups   192(%rsp), %zmm28
        vmovups   128(%rsp), %zmm29
        vmovups   64(%rsp), %zmm30
        vmovups   (%rsp), %zmm31
        movq      1080(%rsp), %r14
        cfi_restore (%r14)
        movq      1072(%rsp), %r15
        cfi_restore (%r15)
        vmovups   1216(%rsp), %zmm1
        jmp       .LBL_1_2

.LBL_1_10:
        cfi_restore_state
        movzbl    %r12b, %r15d
        shlq      $4, %r15
        vmovsd    1160(%rsp,%r15), %xmm0
        call      JUMPTARGET(sin)
        vmovsd    %xmm0, 1224(%rsp,%r15)
        jmp       .LBL_1_8

.LBL_1_12:
        movzbl    %r12b, %r15d
        shlq      $4, %r15
        vmovsd    1152(%rsp,%r15), %xmm0
        call      JUMPTARGET(sin)
        vmovsd    %xmm0, 1216(%rsp,%r15)
        jmp       .LBL_1_7
#endif
END (_ZGVeN8v_sin_knl)

ENTRY (_ZGVeN8v_sin_skx)
#ifndef HAVE_AVX512DQ_ASM_SUPPORT
WRAPPER_IMPL_AVX512 _ZGVdN4v_sin
#else
/*
   ALGORITHM DESCRIPTION:

   ( low accuracy ( < 4ulp ) or enhanced performance
     ( half of correct mantissa ) implementation )

   Argument representation:
   arg = N*Pi + R

   Result calculation:
   sin(arg) = sin(N*Pi + R) = (-1)^N * sin(R)
   sin(R) is approximated by corresponding polynomial
 */
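
/* Same flow as the KNL variant above (see the scalar C sketch there).
   This version uses the AVX-512DQ floating-point forms vandpd/vandnpd/
   vxorpd for the sign manipulation and derives the special-lane bitmask
   from a masked clear of an all-ones pattern followed by an unordered
   compare (see the comment after the reconstruction step below).  */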
        pushq     %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq      %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq      $-64, %rsp
        subq      $1280, %rsp
        movq      __svml_d_trig_data@GOTPCREL(%rip), %rax
        vpbroadcastq .L_2il0floatpacket.14(%rip), %zmm14
        vmovups   __dAbsMask(%rax), %zmm7
        vmovups   __dInvPI(%rax), %zmm2
        vmovups   __dRShifter(%rax), %zmm1
        vmovups   __dPI1_FMA(%rax), %zmm3
        vmovups   __dC7_sin(%rax), %zmm8

/*
   ARGUMENT RANGE REDUCTION:
   X' = |X|
 */
        vandpd    %zmm7, %zmm0, %zmm13

/* SignX - sign bit of X */
        vandnpd   %zmm0, %zmm7, %zmm12

/* Y = X'*InvPi + RS : right shifter add */
        vfmadd213pd %zmm1, %zmm13, %zmm2
        vcmppd    $18, __dRangeVal(%rax), %zmm13, %k1

/* SignRes = Y<<63 : shift LSB to MSB place for result sign */
        vpsllq    $63, %zmm2, %zmm6

/* N = Y - RS : right shifter sub */
        vsubpd    %zmm1, %zmm2, %zmm5

/* R = X' - N*Pi1 */
        vmovaps   %zmm13, %zmm4
        vfnmadd231pd %zmm5, %zmm3, %zmm4

/* R = R - N*Pi2 */
        vfnmadd231pd __dPI2_FMA(%rax), %zmm5, %zmm4

/* R = R - N*Pi3 */
        vfnmadd132pd __dPI3_FMA(%rax), %zmm4, %zmm5

/*
   POLYNOMIAL APPROXIMATION:
   R2 = R*R
 */
        vmulpd    %zmm5, %zmm5, %zmm9

/* R = R^SignRes : update sign of reduced argument */
        vxorpd    %zmm6, %zmm5, %zmm10
        vfmadd213pd __dC6_sin(%rax), %zmm9, %zmm8
        vfmadd213pd __dC5_sin(%rax), %zmm9, %zmm8
        vfmadd213pd __dC4_sin(%rax), %zmm9, %zmm8

/* Poly = C3+R2*(C4+R2*(C5+R2*(C6+R2*C7))) */
        vfmadd213pd __dC3_sin(%rax), %zmm9, %zmm8

/* Poly = R2*(C1+R2*(C2+R2*Poly)) */
        vfmadd213pd __dC2_sin(%rax), %zmm9, %zmm8
        vfmadd213pd __dC1_sin(%rax), %zmm9, %zmm8
        vmulpd    %zmm9, %zmm8, %zmm11

/* Poly = Poly*R + R */
        vfmadd213pd %zmm10, %zmm10, %zmm11

/*
   RECONSTRUCTION:
   Final sign setting: Res = Poly^SignX
 */
        vxorpd    %zmm12, %zmm11, %zmm1
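
/* Special-lane detection: %zmm14 starts as all-ones (a NaN bit pattern,
   broadcast from .L_2il0floatpacket.14); the masked vpandnq clears it in
   lanes where X' <= __dRangeVal (k1 from the LE compare above), and the
   unordered self-compare then sets %k0 only for the remaining lanes,
   giving the fallback bitmask in %ecx.  */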
        vpandnq   %zmm13, %zmm13, %zmm14{%k1}
        vcmppd    $3, %zmm14, %zmm14, %k0
        kmovw     %k0, %ecx
        testl     %ecx, %ecx
        jne       .LBL_2_3

.LBL_2_2:
        cfi_remember_state
        vmovaps   %zmm1, %zmm0
        movq      %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq      %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret

.LBL_2_3:
        cfi_restore_state
        vmovups   %zmm0, 1152(%rsp)
        vmovups   %zmm1, 1216(%rsp)
        je        .LBL_2_2

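/* Slow path: structurally the same as the KNL version above; spill the
   registers used below, then walk the flagged-lane bitmask and recompute
   those lanes with the scalar sin.  */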
        xorb      %dl, %dl
        xorl      %eax, %eax
        kmovw     %k4, 1048(%rsp)
        kmovw     %k5, 1040(%rsp)
        kmovw     %k6, 1032(%rsp)
        kmovw     %k7, 1024(%rsp)
        vmovups   %zmm16, 960(%rsp)
        vmovups   %zmm17, 896(%rsp)
        vmovups   %zmm18, 832(%rsp)
        vmovups   %zmm19, 768(%rsp)
        vmovups   %zmm20, 704(%rsp)
        vmovups   %zmm21, 640(%rsp)
        vmovups   %zmm22, 576(%rsp)
        vmovups   %zmm23, 512(%rsp)
        vmovups   %zmm24, 448(%rsp)
        vmovups   %zmm25, 384(%rsp)
        vmovups   %zmm26, 320(%rsp)
        vmovups   %zmm27, 256(%rsp)
        vmovups   %zmm28, 192(%rsp)
        vmovups   %zmm29, 128(%rsp)
        vmovups   %zmm30, 64(%rsp)
        vmovups   %zmm31, (%rsp)
        movq      %rsi, 1064(%rsp)
        movq      %rdi, 1056(%rsp)
        movq      %r12, 1096(%rsp)
        cfi_offset_rel_rsp (12, 1096)
        movb      %dl, %r12b
        movq      %r13, 1088(%rsp)
        cfi_offset_rel_rsp (13, 1088)
        movl      %ecx, %r13d
        movq      %r14, 1080(%rsp)
        cfi_offset_rel_rsp (14, 1080)
        movl      %eax, %r14d
        movq      %r15, 1072(%rsp)
        cfi_offset_rel_rsp (15, 1072)
        cfi_remember_state

.LBL_2_6:
        btl       %r14d, %r13d
        jc        .LBL_2_12

.LBL_2_7:
        lea       1(%r14), %esi
        btl       %esi, %r13d
        jc        .LBL_2_10

.LBL_2_8:
        incb      %r12b
        addl      $2, %r14d
        cmpb      $16, %r12b
        jb        .LBL_2_6

        kmovw     1048(%rsp), %k4
        kmovw     1040(%rsp), %k5
        kmovw     1032(%rsp), %k6
        kmovw     1024(%rsp), %k7
        vmovups   960(%rsp), %zmm16
        vmovups   896(%rsp), %zmm17
        vmovups   832(%rsp), %zmm18
        vmovups   768(%rsp), %zmm19
        vmovups   704(%rsp), %zmm20
        vmovups   640(%rsp), %zmm21
        vmovups   576(%rsp), %zmm22
        vmovups   512(%rsp), %zmm23
        vmovups   448(%rsp), %zmm24
        vmovups   384(%rsp), %zmm25
        vmovups   320(%rsp), %zmm26
        vmovups   256(%rsp), %zmm27
        vmovups   192(%rsp), %zmm28
        vmovups   128(%rsp), %zmm29
        vmovups   64(%rsp), %zmm30
        vmovups   (%rsp), %zmm31
        vmovups   1216(%rsp), %zmm1
        movq      1064(%rsp), %rsi
        movq      1056(%rsp), %rdi
        movq      1096(%rsp), %r12
        cfi_restore (%r12)
        movq      1088(%rsp), %r13
        cfi_restore (%r13)
        movq      1080(%rsp), %r14
        cfi_restore (%r14)
        movq      1072(%rsp), %r15
        cfi_restore (%r15)
        jmp       .LBL_2_2

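/* Per-lane scalar calls: reload the lane's input (odd lane of the pair at
   1160(%rsp,%r15) here, even lane at 1152(%rsp,%r15) in .LBL_2_12 below),
   issue vzeroupper before calling the standard sin (presumably to avoid
   AVX/SSE transition penalties), and store the result back into the saved
   %zmm1 image.  */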
.LBL_2_10:
        cfi_restore_state
        movzbl    %r12b, %r15d
        shlq      $4, %r15
        vmovsd    1160(%rsp,%r15), %xmm0
        vzeroupper
        vmovsd    1160(%rsp,%r15), %xmm0

        call      JUMPTARGET(sin)

        vmovsd    %xmm0, 1224(%rsp,%r15)
        jmp       .LBL_2_8

.LBL_2_12:
        movzbl    %r12b, %r15d
        shlq      $4, %r15
        vmovsd    1152(%rsp,%r15), %xmm0
        vzeroupper
        vmovsd    1152(%rsp,%r15), %xmm0

        call      JUMPTARGET(sin)

        vmovsd    %xmm0, 1216(%rsp,%r15)
        jmp       .LBL_2_7
#endif
END (_ZGVeN8v_sin_skx)

        .section .rodata, "a"
.L_2il0floatpacket.14:
        .long   0xffffffff,0xffffffff
        .type   .L_2il0floatpacket.14,@object