1/* Function sinf vectorized with AVX-512. KNL and SKX versions.
2 Copyright (C) 2014-2020 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19#include <sysdep.h>
20#include "svml_s_trig_data.h"
21#include "svml_s_wrapper_impl.h"
22
23 .text
/* _ZGVeN16v_sinf_knl: sinf on 16 packed single-precision lanes (KNL flavour).

   In:   zmm0 = 16 float arguments.
   Out:  zmm0 = 16 float results.

   Fast path: every lane is evaluated with the range reduction and
   polynomial described below.  Lanes whose magnitude is not
   <= __sRangeReductionVal (this includes NaN, the compare is unordered)
   are flagged in a 16-bit mask and recomputed one at a time with the
   scalar sinf.

   Stack frame (RSP aligned to 64, 1280 bytes):
        0 .. 1023   zmm31 .. zmm16 (zmm31 at 0, zmm16 at 960)
     1024 .. 1048   k7, k6, k5, k4 (8-byte slots)
     1056, 1064     rdi, rsi
     1072 .. 1096   r15, r14, r13, r12
     1152 .. 1215   copy of the input vector
     1216 .. 1279   result vector (patched lane by lane on the slow path)

   Slow-path register roles:
     r12b = pair index i, r13d = lane mask, r14d = bit index 2*i,
     r15  = zero-extended copy of r12b used as memory index.  */
ENTRY(_ZGVeN16v_sinf_knl)
#ifndef HAVE_AVX512DQ_ASM_SUPPORT
/* No AVX512DQ assembler support: expand the wrapper macro around the
   8-lane variant instead (macro presumably comes from
   svml_s_wrapper_impl.h -- confirm).  */
WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf
#else
/*
   ALGORITHM DESCRIPTION:

   1) Range reduction to [-Pi/2; +Pi/2] interval
      a) Grab sign from source argument and save it.
      b) Remove sign using AND operation
      c) Getting octant Y by 1/Pi multiplication
      d) Add "Right Shifter" value
      e) Treat obtained value as integer for destination sign setting.
         Shift first bit of this value to the last (sign) position
      f) Change destination sign if source sign is negative
         using XOR operation.
      g) Subtract "Right Shifter" value
      h) Subtract Y*PI from X argument, where PI is divided into 3 parts
         (the *_FMA constants):
         X = X - Y*PI1 - Y*PI2 - Y*PI3;
   2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
      a) Calculate X^2 = X * X
      b) Calculate polynomial:
         R = X + X * X^2 * (A3 + x^2 * (A5 + ......
   3) Destination sign setting
      a) Set shifted destination sign using XOR operation:
         R = XOR( R, S );
 */
        pushq     %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq      %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq      $-64, %rsp
        subq      $1280, %rsp
        movq      __svml_s_trig_data@GOTPCREL(%rip), %rax

/* Check for large and special values */
        movl      $-1, %edx                     /* all-ones lane pattern */
        vmovups   __sAbsMask(%rax), %zmm4
        vmovups   __sInvPI(%rax), %zmm1

/* b) Remove sign using AND operation: zmm12 = |x| */
        vpandd    %zmm4, %zmm0, %zmm12
        vmovups   __sPI1_FMA(%rax), %zmm2
        vmovups   __sA9(%rax), %zmm7

/*
   f) Change destination sign if source sign is negative
      using XOR operation.  zmm11 = x & ~AbsMask (the source sign bits).
 */
        vpandnd   %zmm0, %zmm4, %zmm11

/*
   h) Subtract Y*PI from X argument, where PI is divided into 3 parts:
      X = X - Y*PI1 - Y*PI2 - Y*PI3;
      zmm3 starts as |x| and accumulates the reduction.
 */
        vmovaps   %zmm12, %zmm3

/*
   c) Getting octant Y by 1/Pi multiplication
   d) Add "Right Shifter" value: zmm1 = |x| * InvPI + RShifter
 */
        vfmadd213ps __sRShifter(%rax), %zmm12, %zmm1

/* Predicate 22 (NLE_UQ): k1 = lanes where |x| is not <= the range
   reduction limit, NaN included.  zmm13 gets all-ones in those lanes
   and zero elsewhere.  */
        vcmpps    $22, __sRangeReductionVal(%rax), %zmm12, %k1
        vpbroadcastd %edx, %zmm13{%k1}{z}

/* g) Subtract "Right Shifter" value: zmm5 = Y */
        vsubps    __sRShifter(%rax), %zmm1, %zmm5

/*
   e) Treat obtained value as integer for destination sign setting.
      Shift first bit of this value to the last (sign) position
 */
        vpslld    $31, %zmm1, %zmm6
        vptestmd  %zmm13, %zmm13, %k0
        vfnmadd231ps %zmm5, %zmm2, %zmm3        /* X -= Y*PI1 */
        kmovw     %k0, %ecx                     /* ecx = slow-lane mask, bits 16..31 are 0 */
        vfnmadd231ps __sPI2_FMA(%rax), %zmm5, %zmm3     /* X -= Y*PI2 */
        vfnmadd132ps __sPI3_FMA(%rax), %zmm3, %zmm5     /* zmm5 = X - Y*PI3 */

/*
   2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
      a) Calculate X^2 = X * X
      b) Calculate polynomial:
         R = X + X * X^2 * (A3 + x^2 * (A5 + ......
 */
        vmulps    %zmm5, %zmm5, %zmm8
        vpxord    %zmm6, %zmm5, %zmm9           /* reduced X with octant sign applied */
        vfmadd213ps __sA7(%rax), %zmm8, %zmm7
        vfmadd213ps __sA5(%rax), %zmm8, %zmm7
        vfmadd213ps __sA3(%rax), %zmm8, %zmm7
        vmulps    %zmm8, %zmm7, %zmm10
        vfmadd213ps %zmm9, %zmm9, %zmm10

/*
   3) Destination sign setting
      a) Set shifted destination sign using XOR operation:
         R = XOR( R, S );
 */
        vpxord    %zmm11, %zmm10, %zmm1
        testl     %ecx, %ecx
        jne       .LBL_1_3

/* Common exit: zmm1 holds the final result vector.  */
.LBL_1_2:
        cfi_remember_state
        vmovaps   %zmm1, %zmm0
        movq      %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq      %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret

/* Slow path.  Entered only through the jne above, so ECX is non-zero:
   at least one lane has to be recomputed with scalar sinf.  */
.LBL_1_3:
        cfi_restore_state
        vmovups   %zmm0, 1152(%rsp)             /* inputs, read per lane below */
        vmovups   %zmm1, 1216(%rsp)             /* fast-path results, patched below */

        /* Spill the registers this routine keeps intact across the
           scalar calls and set up the loop state.  */
        xorb      %dl, %dl
        kmovw     %k4, 1048(%rsp)
        xorl      %eax, %eax
        kmovw     %k5, 1040(%rsp)
        kmovw     %k6, 1032(%rsp)
        kmovw     %k7, 1024(%rsp)
        vmovups   %zmm16, 960(%rsp)
        vmovups   %zmm17, 896(%rsp)
        vmovups   %zmm18, 832(%rsp)
        vmovups   %zmm19, 768(%rsp)
        vmovups   %zmm20, 704(%rsp)
        vmovups   %zmm21, 640(%rsp)
        vmovups   %zmm22, 576(%rsp)
        vmovups   %zmm23, 512(%rsp)
        vmovups   %zmm24, 448(%rsp)
        vmovups   %zmm25, 384(%rsp)
        vmovups   %zmm26, 320(%rsp)
        vmovups   %zmm27, 256(%rsp)
        vmovups   %zmm28, 192(%rsp)
        vmovups   %zmm29, 128(%rsp)
        vmovups   %zmm30, 64(%rsp)
        vmovups   %zmm31, (%rsp)
        movq      %rsi, 1064(%rsp)
        movq      %rdi, 1056(%rsp)
        movq      %r12, 1096(%rsp)
        cfi_offset_rel_rsp (12, 1096)
        movb      %dl, %r12b                    /* r12b = pair index = 0 */
        movq      %r13, 1088(%rsp)
        cfi_offset_rel_rsp (13, 1088)
        movl      %ecx, %r13d                   /* r13d = slow-lane mask */
        movq      %r14, 1080(%rsp)
        cfi_offset_rel_rsp (14, 1080)
        movl      %eax, %r14d                   /* r14d = bit index = 0 */
        movq      %r15, 1072(%rsp)
        cfi_offset_rel_rsp (15, 1072)
        cfi_remember_state

/* Walk the mask two lanes per iteration: bit 2*i, then bit 2*i + 1.  */
.LBL_1_6:
        btl       %r14d, %r13d
        jc        .LBL_1_12

.LBL_1_7:
        lea       1(%r14), %esi
        btl       %esi, %r13d
        jc        .LBL_1_10

.LBL_1_8:
        addb      $1, %r12b
        addl      $2, %r14d
        /* The mask came from KMOVW, which zero-extends its 16 bits, so
           8 pairs cover every lane that can be flagged.  */
        cmpb      $8, %r12b
        jb        .LBL_1_6

        /* All flagged lanes patched: restore the spilled registers and
           leave through the common exit with the patched result.  */
        kmovw     1048(%rsp), %k4
        movq      1064(%rsp), %rsi
        kmovw     1040(%rsp), %k5
        movq      1056(%rsp), %rdi
        kmovw     1032(%rsp), %k6
        movq      1096(%rsp), %r12
        cfi_restore (%r12)
        movq      1088(%rsp), %r13
        cfi_restore (%r13)
        kmovw     1024(%rsp), %k7
        vmovups   960(%rsp), %zmm16
        vmovups   896(%rsp), %zmm17
        vmovups   832(%rsp), %zmm18
        vmovups   768(%rsp), %zmm19
        vmovups   704(%rsp), %zmm20
        vmovups   640(%rsp), %zmm21
        vmovups   576(%rsp), %zmm22
        vmovups   512(%rsp), %zmm23
        vmovups   448(%rsp), %zmm24
        vmovups   384(%rsp), %zmm25
        vmovups   320(%rsp), %zmm26
        vmovups   256(%rsp), %zmm27
        vmovups   192(%rsp), %zmm28
        vmovups   128(%rsp), %zmm29
        vmovups   64(%rsp), %zmm30
        vmovups   (%rsp), %zmm31
        movq      1080(%rsp), %r14
        cfi_restore (%r14)
        movq      1072(%rsp), %r15
        cfi_restore (%r15)
        vmovups   1216(%rsp), %zmm1
        jmp       .LBL_1_2

/* Odd lane of pair i: element 2*i + 1 lives at byte offset 8*i + 4.  */
.LBL_1_10:
        cfi_restore_state
        movzbl    %r12b, %r15d
        vmovss    1156(%rsp,%r15,8), %xmm0
        call      JUMPTARGET(sinf)
        vmovss    %xmm0, 1220(%rsp,%r15,8)
        jmp       .LBL_1_8

/* Even lane of pair i: element 2*i lives at byte offset 8*i.  */
.LBL_1_12:
        movzbl    %r12b, %r15d
        vmovss    1152(%rsp,%r15,8), %xmm0
        call      JUMPTARGET(sinf)
        vmovss    %xmm0, 1216(%rsp,%r15,8)
        jmp       .LBL_1_7
#endif
END(_ZGVeN16v_sinf_knl)
244
/* _ZGVeN16v_sinf_skx: sinf on 16 packed single-precision lanes (SKX flavour).

   In:   zmm0 = 16 float arguments.
   Out:  zmm0 = 16 float results.

   Fast path: every lane is evaluated with the range reduction and
   polynomial described below.  Lanes whose magnitude is not
   <= __sRangeReductionVal (this includes NaN, the compare is ordered
   and the result is inverted) are flagged in a 16-bit mask and
   recomputed one at a time with the scalar sinf.

   Stack frame (RSP aligned to 64, 1280 bytes):
        0 .. 1023   zmm31 .. zmm16 (zmm31 at 0, zmm16 at 960)
     1024 .. 1048   k7, k6, k5, k4 (8-byte slots)
     1056, 1064     rdi, rsi
     1072 .. 1096   r15, r14, r13, r12
     1152 .. 1215   copy of the input vector
     1216 .. 1279   result vector (patched lane by lane on the slow path)

   Slow-path register roles:
     r12b = pair index i, r13d = lane mask, r14d = bit index 2*i,
     r15  = zero-extended copy of r12b used as memory index.  */
ENTRY (_ZGVeN16v_sinf_skx)
#ifndef HAVE_AVX512DQ_ASM_SUPPORT
/* No AVX512DQ assembler support: expand the wrapper macro around the
   8-lane variant instead (macro presumably comes from
   svml_s_wrapper_impl.h -- confirm).  */
WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf
#else
/*
   ALGORITHM DESCRIPTION:

   1) Range reduction to [-Pi/2; +Pi/2] interval
      a) Grab sign from source argument and save it.
      b) Remove sign using AND operation
      c) Getting octant Y by 1/Pi multiplication
      d) Add "Right Shifter" value
      e) Treat obtained value as integer for destination sign setting.
         Shift first bit of this value to the last (sign) position
      f) Change destination sign if source sign is negative
         using XOR operation.
      g) Subtract "Right Shifter" value
      h) Subtract Y*PI from X argument, where PI is divided into 3 parts
         (the *_FMA constants):
         X = X - Y*PI1 - Y*PI2 - Y*PI3;
   2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
      a) Calculate X^2 = X * X
      b) Calculate polynomial:
         R = X + X * X^2 * (A3 + x^2 * (A5 + ......
   3) Destination sign setting
      a) Set shifted destination sign using XOR operation:
         R = XOR( R, S );
 */

        pushq     %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq      %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq      $-64, %rsp
        subq      $1280, %rsp
        movq      __svml_s_trig_data@GOTPCREL(%rip), %rax

/* Check for large and special values */
        vmovups   .L_2il0floatpacket.11(%rip), %zmm14   /* all-ones in every lane */
        vmovups   __sAbsMask(%rax), %zmm5
        vmovups   __sInvPI(%rax), %zmm1
        vmovups   __sRShifter(%rax), %zmm2
        vmovups   __sPI1_FMA(%rax), %zmm3
        vmovups   __sA9(%rax), %zmm8

/* b) Remove sign using AND operation: zmm13 = |x| */
        vandps    %zmm5, %zmm0, %zmm13

/*
   f) Change destination sign if source sign is negative
      using XOR operation.  zmm12 = x & ~AbsMask (the source sign bits).
 */
        vandnps   %zmm0, %zmm5, %zmm12

/*
   c) Getting octant Y by 1/Pi multiplication
   d) Add "Right Shifter" value: zmm1 = |x| * InvPI + RShifter
 */
        vfmadd213ps %zmm2, %zmm13, %zmm1

/* Predicate 18 (LE_OQ): k1 = lanes where |x| <= the range reduction
   limit, i.e. the lanes the fast path handles.  */
        vcmpps    $18, __sRangeReductionVal(%rax), %zmm13, %k1

/*
   e) Treat obtained value as integer for destination sign setting.
      Shift first bit of this value to the last (sign) position
 */
        vpslld    $31, %zmm1, %zmm7

/* g) Subtract "Right Shifter" value: zmm6 = Y */
        vsubps    %zmm2, %zmm1, %zmm6

/*
   h) Subtract Y*PI from X argument, where PI is divided into 3 parts:
      X = X - Y*PI1 - Y*PI2 - Y*PI3;
 */
        vmovaps   %zmm13, %zmm4
        vfnmadd231ps %zmm6, %zmm3, %zmm4                /* X -= Y*PI1 */
        vfnmadd231ps __sPI2_FMA(%rax), %zmm6, %zmm4     /* X -= Y*PI2 */
        vfnmadd132ps __sPI3_FMA(%rax), %zmm4, %zmm6     /* zmm6 = X - Y*PI3 */

/*
   2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
      a) Calculate X^2 = X * X
      b) Calculate polynomial:
         R = X + X * X^2 * (A3 + x^2 * (A5 + ......
 */
        vmulps    %zmm6, %zmm6, %zmm9
        vxorps    %zmm7, %zmm6, %zmm10          /* reduced X with octant sign applied */
        vfmadd213ps __sA7(%rax), %zmm9, %zmm8
        vfmadd213ps __sA5(%rax), %zmm9, %zmm8
        vfmadd213ps __sA3(%rax), %zmm9, %zmm8
        vmulps    %zmm9, %zmm8, %zmm11
        vfmadd213ps %zmm10, %zmm10, %zmm11

/*
   3) Destination sign setting
      a) Set shifted destination sign using XOR operation:
         R = XOR( R, S );
 */
        vxorps    %zmm12, %zmm11, %zmm1

/* Build the slow-lane mask: lanes in k1 are cleared in zmm14
   (~v & v == 0), the remaining lanes keep their all-ones value, and
   VPTESTMD turns the non-zero lanes into mask bits.  */
        vpandnd   %zmm13, %zmm13, %zmm14{%k1}
        vptestmd  %zmm14, %zmm14, %k0
        kmovw     %k0, %ecx                     /* bits 16..31 of ecx are 0 */
        testl     %ecx, %ecx
        jne       .LBL_2_3

/* Common exit: zmm1 holds the final result vector.  */
.LBL_2_2:
        cfi_remember_state
        vmovaps   %zmm1, %zmm0
        movq      %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq      %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret

/* Slow path.  Entered only through the jne above, so ECX is non-zero:
   at least one lane has to be recomputed with scalar sinf.  */
.LBL_2_3:
        cfi_restore_state
        vmovups   %zmm0, 1152(%rsp)             /* inputs, read per lane below */
        vmovups   %zmm1, 1216(%rsp)             /* fast-path results, patched below */

        /* Spill the registers this routine keeps intact across the
           scalar calls and set up the loop state.  */
        xorb      %dl, %dl
        xorl      %eax, %eax
        kmovw     %k4, 1048(%rsp)
        kmovw     %k5, 1040(%rsp)
        kmovw     %k6, 1032(%rsp)
        kmovw     %k7, 1024(%rsp)
        vmovups   %zmm16, 960(%rsp)
        vmovups   %zmm17, 896(%rsp)
        vmovups   %zmm18, 832(%rsp)
        vmovups   %zmm19, 768(%rsp)
        vmovups   %zmm20, 704(%rsp)
        vmovups   %zmm21, 640(%rsp)
        vmovups   %zmm22, 576(%rsp)
        vmovups   %zmm23, 512(%rsp)
        vmovups   %zmm24, 448(%rsp)
        vmovups   %zmm25, 384(%rsp)
        vmovups   %zmm26, 320(%rsp)
        vmovups   %zmm27, 256(%rsp)
        vmovups   %zmm28, 192(%rsp)
        vmovups   %zmm29, 128(%rsp)
        vmovups   %zmm30, 64(%rsp)
        vmovups   %zmm31, (%rsp)
        movq      %rsi, 1064(%rsp)
        movq      %rdi, 1056(%rsp)
        movq      %r12, 1096(%rsp)
        cfi_offset_rel_rsp (12, 1096)
        movb      %dl, %r12b                    /* r12b = pair index = 0 */
        movq      %r13, 1088(%rsp)
        cfi_offset_rel_rsp (13, 1088)
        movl      %ecx, %r13d                   /* r13d = slow-lane mask */
        movq      %r14, 1080(%rsp)
        cfi_offset_rel_rsp (14, 1080)
        movl      %eax, %r14d                   /* r14d = bit index = 0 */
        movq      %r15, 1072(%rsp)
        cfi_offset_rel_rsp (15, 1072)
        cfi_remember_state

/* Walk the mask two lanes per iteration: bit 2*i, then bit 2*i + 1.  */
.LBL_2_6:
        btl       %r14d, %r13d
        jc        .LBL_2_12

.LBL_2_7:
        lea       1(%r14), %esi
        btl       %esi, %r13d
        jc        .LBL_2_10

.LBL_2_8:
        incb      %r12b
        addl      $2, %r14d
        /* The mask came from KMOVW, which zero-extends its 16 bits, so
           8 pairs cover every lane that can be flagged.  */
        cmpb      $8, %r12b
        jb        .LBL_2_6

        /* All flagged lanes patched: restore the spilled registers and
           leave through the common exit with the patched result.  */
        kmovw     1048(%rsp), %k4
        kmovw     1040(%rsp), %k5
        kmovw     1032(%rsp), %k6
        kmovw     1024(%rsp), %k7
        vmovups   960(%rsp), %zmm16
        vmovups   896(%rsp), %zmm17
        vmovups   832(%rsp), %zmm18
        vmovups   768(%rsp), %zmm19
        vmovups   704(%rsp), %zmm20
        vmovups   640(%rsp), %zmm21
        vmovups   576(%rsp), %zmm22
        vmovups   512(%rsp), %zmm23
        vmovups   448(%rsp), %zmm24
        vmovups   384(%rsp), %zmm25
        vmovups   320(%rsp), %zmm26
        vmovups   256(%rsp), %zmm27
        vmovups   192(%rsp), %zmm28
        vmovups   128(%rsp), %zmm29
        vmovups   64(%rsp), %zmm30
        vmovups   (%rsp), %zmm31
        vmovups   1216(%rsp), %zmm1
        movq      1064(%rsp), %rsi
        movq      1056(%rsp), %rdi
        movq      1096(%rsp), %r12
        cfi_restore (%r12)
        movq      1088(%rsp), %r13
        cfi_restore (%r13)
        movq      1080(%rsp), %r14
        cfi_restore (%r14)
        movq      1072(%rsp), %r15
        cfi_restore (%r15)
        jmp       .LBL_2_2

/* Odd lane of pair i: element 2*i + 1 lives at byte offset 8*i + 4.
   The upper vector state is cleared before entering scalar code; the
   argument is loaded once, after VZEROUPPER.  */
.LBL_2_10:
        cfi_restore_state
        movzbl    %r12b, %r15d
        vzeroupper
        vmovss    1156(%rsp,%r15,8), %xmm0

        call      JUMPTARGET(sinf)

        vmovss    %xmm0, 1220(%rsp,%r15,8)
        jmp       .LBL_2_8

/* Even lane of pair i: element 2*i lives at byte offset 8*i.  */
.LBL_2_12:
        movzbl    %r12b, %r15d
        vzeroupper
        vmovss    1152(%rsp,%r15,8), %xmm0

        call      JUMPTARGET(sinf)

        vmovss    %xmm0, 1216(%rsp,%r15,8)
        jmp       .LBL_2_7
#endif
END (_ZGVeN16v_sinf_skx)
475
476 .section .rodata, "a"
477.L_2il0floatpacket.11:
478 .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
479 .type .L_2il0floatpacket.11,@object
480