1/* Function cosf vectorized with AVX-512. KNL and SKX versions.
2 Copyright (C) 2014-2020 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <https://www.gnu.org/licenses/>. */
18
19#include <sysdep.h>
20#include "svml_s_trig_data.h"
21#include "svml_s_wrapper_impl.h"
22
23 .text
24ENTRY (_ZGVeN16v_cosf_knl)
25#ifndef HAVE_AVX512DQ_ASM_SUPPORT
26WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf
27#else
28/*
29 ALGORITHM DESCRIPTION:
30
31 1) Range reduction to [-Pi/2; +Pi/2] interval
32 a) We remove sign using AND operation
33 b) Add Pi/2 value to argument X for Cos to Sin transformation
34 c) Getting octant Y by 1/Pi multiplication
35 d) Add "Right Shifter" value
36 e) Treat obtained value as integer for destination sign setting.
37 Shift first bit of this value to the last (sign) position
38 f) Subtract "Right Shifter" value
39 g) Subtract 0.5 from result for octant correction
40 h) Subtract Y*PI from X argument, where PI divided to 4 parts:
41 X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
42 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
43 a) Calculate X^2 = X * X
44 b) Calculate polynomial:
45 R = X + X * X^2 * (A3 + x^2 * (A5 + .....
46 3) Destination sign setting
47 a) Set shifted destination sign using XOR operation:
48 R = XOR( R, S );
49 */
50 pushq %rbp
51 cfi_adjust_cfa_offset (8)
52 cfi_rel_offset (%rbp, 0)
53 movq %rsp, %rbp
54 cfi_def_cfa_register (%rbp)
55 andq $-64, %rsp
56 subq $1280, %rsp
57 movq __svml_s_trig_data@GOTPCREL(%rip), %rdx
58
59/*
60 h) Subtract Y*PI from X argument, where PI divided to 4 parts:
61 X = X - Y*PI1 - Y*PI2 - Y*PI3
62 */
63 vmovaps %zmm0, %zmm6
64 movl $-1, %eax
65
66/* b) Add Pi/2 value to argument X for Cos to Sin transformation */
67 vaddps __sHalfPI(%rdx), %zmm0, %zmm2
68 vmovups __sRShifter(%rdx), %zmm3
69
70/*
71 1) Range reduction to [-Pi/2; +Pi/2] interval
72 c) Getting octant Y by 1/Pi multiplication
73 d) Add "Right Shifter" (0x4B000000) value
74 */
75 vfmadd132ps __sInvPI(%rdx), %zmm3, %zmm2
76 vmovups __sPI1_FMA(%rdx), %zmm5
77
78/* f) Subtract "Right Shifter" (0x4B000000) value */
79 vsubps %zmm3, %zmm2, %zmm4
80 vmovups __sA9_FMA(%rdx), %zmm9
81
82/* Check for large and special arguments */
83 vpandd __sAbsMask(%rdx), %zmm0, %zmm1
84
85/*
86 e) Treat obtained value as integer for destination sign setting.
87 Shift first bit of this value to the last (sign) position (S << 31)
88 */
89 vpslld $31, %zmm2, %zmm8
90 vcmpps $22, __sRangeReductionVal(%rdx), %zmm1, %k1
91 vpbroadcastd %eax, %zmm12{%k1}{z}
92
93/* g) Subtract 0.5 from result for octant correction */
94 vsubps __sOneHalf(%rdx), %zmm4, %zmm7
95 vptestmd %zmm12, %zmm12, %k0
96 vfnmadd231ps %zmm7, %zmm5, %zmm6
97 kmovw %k0, %ecx
98 vfnmadd231ps __sPI2_FMA(%rdx), %zmm7, %zmm6
99 vfnmadd132ps __sPI3_FMA(%rdx), %zmm6, %zmm7
100
101/* a) Calculate X^2 = X * X */
102 vmulps %zmm7, %zmm7, %zmm10
103
104/*
105 3) Destination sign setting
106 a) Set shifted destination sign using XOR operation:
107 R = XOR( R, S );
108 */
109 vpxord %zmm8, %zmm7, %zmm11
110
111/*
112 b) Calculate polynomial:
113 R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * (A9))));
114 */
115 vfmadd213ps __sA7_FMA(%rdx), %zmm10, %zmm9
116 vfmadd213ps __sA5_FMA(%rdx), %zmm10, %zmm9
117 vfmadd213ps __sA3(%rdx), %zmm10, %zmm9
118 vmulps %zmm10, %zmm9, %zmm1
119 vfmadd213ps %zmm11, %zmm11, %zmm1
120 testl %ecx, %ecx
121 jne .LBL_1_3
122
123.LBL_1_2:
124 cfi_remember_state
125 vmovaps %zmm1, %zmm0
126 movq %rbp, %rsp
127 cfi_def_cfa_register (%rsp)
128 popq %rbp
129 cfi_adjust_cfa_offset (-8)
130 cfi_restore (%rbp)
131 ret
132
133.LBL_1_3:
134 cfi_restore_state
135 vmovups %zmm0, 1152(%rsp)
136 vmovups %zmm1, 1216(%rsp)
137 je .LBL_1_2
138
139 xorb %dl, %dl
140 kmovw %k4, 1048(%rsp)
141 xorl %eax, %eax
142 kmovw %k5, 1040(%rsp)
143 kmovw %k6, 1032(%rsp)
144 kmovw %k7, 1024(%rsp)
145 vmovups %zmm16, 960(%rsp)
146 vmovups %zmm17, 896(%rsp)
147 vmovups %zmm18, 832(%rsp)
148 vmovups %zmm19, 768(%rsp)
149 vmovups %zmm20, 704(%rsp)
150 vmovups %zmm21, 640(%rsp)
151 vmovups %zmm22, 576(%rsp)
152 vmovups %zmm23, 512(%rsp)
153 vmovups %zmm24, 448(%rsp)
154 vmovups %zmm25, 384(%rsp)
155 vmovups %zmm26, 320(%rsp)
156 vmovups %zmm27, 256(%rsp)
157 vmovups %zmm28, 192(%rsp)
158 vmovups %zmm29, 128(%rsp)
159 vmovups %zmm30, 64(%rsp)
160 vmovups %zmm31, (%rsp)
161 movq %rsi, 1064(%rsp)
162 movq %rdi, 1056(%rsp)
163 movq %r12, 1096(%rsp)
164 cfi_offset_rel_rsp (12, 1096)
165 movb %dl, %r12b
166 movq %r13, 1088(%rsp)
167 cfi_offset_rel_rsp (13, 1088)
168 movl %ecx, %r13d
169 movq %r14, 1080(%rsp)
170 cfi_offset_rel_rsp (14, 1080)
171 movl %eax, %r14d
172 movq %r15, 1072(%rsp)
173 cfi_offset_rel_rsp (15, 1072)
174 cfi_remember_state
175
176.LBL_1_6:
177 btl %r14d, %r13d
178 jc .LBL_1_12
179
180.LBL_1_7:
181 lea 1(%r14), %esi
182 btl %esi, %r13d
183 jc .LBL_1_10
184
185.LBL_1_8:
186 addb $1, %r12b
187 addl $2, %r14d
188 cmpb $16, %r12b
189 jb .LBL_1_6
190
191 kmovw 1048(%rsp), %k4
192 movq 1064(%rsp), %rsi
193 kmovw 1040(%rsp), %k5
194 movq 1056(%rsp), %rdi
195 kmovw 1032(%rsp), %k6
196 movq 1096(%rsp), %r12
197 cfi_restore (%r12)
198 movq 1088(%rsp), %r13
199 cfi_restore (%r13)
200 kmovw 1024(%rsp), %k7
201 vmovups 960(%rsp), %zmm16
202 vmovups 896(%rsp), %zmm17
203 vmovups 832(%rsp), %zmm18
204 vmovups 768(%rsp), %zmm19
205 vmovups 704(%rsp), %zmm20
206 vmovups 640(%rsp), %zmm21
207 vmovups 576(%rsp), %zmm22
208 vmovups 512(%rsp), %zmm23
209 vmovups 448(%rsp), %zmm24
210 vmovups 384(%rsp), %zmm25
211 vmovups 320(%rsp), %zmm26
212 vmovups 256(%rsp), %zmm27
213 vmovups 192(%rsp), %zmm28
214 vmovups 128(%rsp), %zmm29
215 vmovups 64(%rsp), %zmm30
216 vmovups (%rsp), %zmm31
217 movq 1080(%rsp), %r14
218 cfi_restore (%r14)
219 movq 1072(%rsp), %r15
220 cfi_restore (%r15)
221 vmovups 1216(%rsp), %zmm1
222 jmp .LBL_1_2
223
224.LBL_1_10:
225 cfi_restore_state
226 movzbl %r12b, %r15d
227 vmovss 1156(%rsp,%r15,8), %xmm0
228 call JUMPTARGET(cosf)
229 vmovss %xmm0, 1220(%rsp,%r15,8)
230 jmp .LBL_1_8
231
232.LBL_1_12:
233 movzbl %r12b, %r15d
234 vmovss 1152(%rsp,%r15,8), %xmm0
235 call JUMPTARGET(cosf)
236 vmovss %xmm0, 1216(%rsp,%r15,8)
237 jmp .LBL_1_7
238#endif
239END (_ZGVeN16v_cosf_knl)
240
241ENTRY (_ZGVeN16v_cosf_skx)
242#ifndef HAVE_AVX512DQ_ASM_SUPPORT
243WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf
244#else
245/*
246 ALGORITHM DESCRIPTION:
247
248 1) Range reduction to [-Pi/2; +Pi/2] interval
249 a) We remove sign using AND operation
250 b) Add Pi/2 value to argument X for Cos to Sin transformation
251 c) Getting octant Y by 1/Pi multiplication
252 d) Add "Right Shifter" value
253 e) Treat obtained value as integer for destination sign setting.
254 Shift first bit of this value to the last (sign) position
255 f) Subtract "Right Shifter" value
256 g) Subtract 0.5 from result for octant correction
257 h) Subtract Y*PI from X argument, where PI divided to 4 parts:
258 X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
259 2) Polynomial (minimax for sin within [-Pi/2; +Pi/2] interval)
260 a) Calculate X^2 = X * X
261 b) Calculate polynomial:
262 R = X + X * X^2 * (A3 + x^2 * (A5 + .....
263 3) Destination sign setting
264 a) Set shifted destination sign using XOR operation:
265 R = XOR( R, S );
266 */
267 pushq %rbp
268 cfi_adjust_cfa_offset (8)
269 cfi_rel_offset (%rbp, 0)
270 movq %rsp, %rbp
271 cfi_def_cfa_register (%rbp)
272 andq $-64, %rsp
273 subq $1280, %rsp
274 movq __svml_s_trig_data@GOTPCREL(%rip), %rax
275
276/*
277 h) Subtract Y*PI from X argument, where PI divided to 4 parts:
278 X = X - Y*PI1 - Y*PI2 - Y*PI3
279 */
280 vmovaps %zmm0, %zmm6
281 vmovups .L_2il0floatpacket.13(%rip), %zmm12
282 vmovups __sRShifter(%rax), %zmm3
283 vmovups __sPI1_FMA(%rax), %zmm5
284 vmovups __sA9_FMA(%rax), %zmm9
285
286/* b) Add Pi/2 value to argument X for Cos to Sin transformation */
287 vaddps __sHalfPI(%rax), %zmm0, %zmm2
288
289/* Check for large and special arguments */
290 vandps __sAbsMask(%rax), %zmm0, %zmm1
291
292/*
293 1) Range reduction to [-Pi/2; +Pi/2] interval
294 c) Getting octant Y by 1/Pi multiplication
295 d) Add "Right Shifter" (0x4B000000) value
296 */
297 vfmadd132ps __sInvPI(%rax), %zmm3, %zmm2
298 vcmpps $18, __sRangeReductionVal(%rax), %zmm1, %k1
299
300/*
301 e) Treat obtained value as integer for destination sign setting.
302 Shift first bit of this value to the last (sign) position (S << 31)
303 */
304 vpslld $31, %zmm2, %zmm8
305
306/* f) Subtract "Right Shifter" (0x4B000000) value */
307 vsubps %zmm3, %zmm2, %zmm4
308
309/* g) Subtract 0.5 from result for octant correction */
310 vsubps __sOneHalf(%rax), %zmm4, %zmm7
311 vfnmadd231ps %zmm7, %zmm5, %zmm6
312 vfnmadd231ps __sPI2_FMA(%rax), %zmm7, %zmm6
313 vfnmadd132ps __sPI3_FMA(%rax), %zmm6, %zmm7
314
315/* a) Calculate X^2 = X * X */
316 vmulps %zmm7, %zmm7, %zmm10
317
318/*
319 3) Destination sign setting
320 a) Set shifted destination sign using XOR operation:
321 R = XOR( R, S );
322 */
323 vxorps %zmm8, %zmm7, %zmm11
324
325/*
326 b) Calculate polynomial:
327 R = X + X * X^2 * (A3 + x^2 * (A5 + x^2 * (A7 + x^2 * (A9))));
328 */
329 vfmadd213ps __sA7_FMA(%rax), %zmm10, %zmm9
330 vfmadd213ps __sA5_FMA(%rax), %zmm10, %zmm9
331 vfmadd213ps __sA3(%rax), %zmm10, %zmm9
332 vpandnd %zmm1, %zmm1, %zmm12{%k1}
333 vmulps %zmm10, %zmm9, %zmm1
334 vptestmd %zmm12, %zmm12, %k0
335 vfmadd213ps %zmm11, %zmm11, %zmm1
336 kmovw %k0, %ecx
337 testl %ecx, %ecx
338 jne .LBL_2_3
339.LBL_2_2:
340 cfi_remember_state
341 vmovaps %zmm1, %zmm0
342 movq %rbp, %rsp
343 cfi_def_cfa_register (%rsp)
344 popq %rbp
345 cfi_adjust_cfa_offset (-8)
346 cfi_restore (%rbp)
347 ret
348
349.LBL_2_3:
350 cfi_restore_state
351 vmovups %zmm0, 1152(%rsp)
352 vmovups %zmm1, 1216(%rsp)
353 je .LBL_2_2
354
355 xorb %dl, %dl
356 xorl %eax, %eax
357 kmovw %k4, 1048(%rsp)
358 kmovw %k5, 1040(%rsp)
359 kmovw %k6, 1032(%rsp)
360 kmovw %k7, 1024(%rsp)
361 vmovups %zmm16, 960(%rsp)
362 vmovups %zmm17, 896(%rsp)
363 vmovups %zmm18, 832(%rsp)
364 vmovups %zmm19, 768(%rsp)
365 vmovups %zmm20, 704(%rsp)
366 vmovups %zmm21, 640(%rsp)
367 vmovups %zmm22, 576(%rsp)
368 vmovups %zmm23, 512(%rsp)
369 vmovups %zmm24, 448(%rsp)
370 vmovups %zmm25, 384(%rsp)
371 vmovups %zmm26, 320(%rsp)
372 vmovups %zmm27, 256(%rsp)
373 vmovups %zmm28, 192(%rsp)
374 vmovups %zmm29, 128(%rsp)
375 vmovups %zmm30, 64(%rsp)
376 vmovups %zmm31, (%rsp)
377 movq %rsi, 1064(%rsp)
378 movq %rdi, 1056(%rsp)
379 movq %r12, 1096(%rsp)
380 cfi_offset_rel_rsp (12, 1096)
381 movb %dl, %r12b
382 movq %r13, 1088(%rsp)
383 cfi_offset_rel_rsp (13, 1088)
384 movl %ecx, %r13d
385 movq %r14, 1080(%rsp)
386 cfi_offset_rel_rsp (14, 1080)
387 movl %eax, %r14d
388 movq %r15, 1072(%rsp)
389 cfi_offset_rel_rsp (15, 1072)
390 cfi_remember_state
391
392.LBL_2_6:
393 btl %r14d, %r13d
394 jc .LBL_2_12
395.LBL_2_7:
396 lea 1(%r14), %esi
397 btl %esi, %r13d
398 jc .LBL_2_10
399.LBL_2_8:
400 incb %r12b
401 addl $2, %r14d
402 cmpb $16, %r12b
403 jb .LBL_2_6
404 kmovw 1048(%rsp), %k4
405 kmovw 1040(%rsp), %k5
406 kmovw 1032(%rsp), %k6
407 kmovw 1024(%rsp), %k7
408 vmovups 960(%rsp), %zmm16
409 vmovups 896(%rsp), %zmm17
410 vmovups 832(%rsp), %zmm18
411 vmovups 768(%rsp), %zmm19
412 vmovups 704(%rsp), %zmm20
413 vmovups 640(%rsp), %zmm21
414 vmovups 576(%rsp), %zmm22
415 vmovups 512(%rsp), %zmm23
416 vmovups 448(%rsp), %zmm24
417 vmovups 384(%rsp), %zmm25
418 vmovups 320(%rsp), %zmm26
419 vmovups 256(%rsp), %zmm27
420 vmovups 192(%rsp), %zmm28
421 vmovups 128(%rsp), %zmm29
422 vmovups 64(%rsp), %zmm30
423 vmovups (%rsp), %zmm31
424 vmovups 1216(%rsp), %zmm1
425 movq 1064(%rsp), %rsi
426 movq 1056(%rsp), %rdi
427 movq 1096(%rsp), %r12
428 cfi_restore (%r12)
429 movq 1088(%rsp), %r13
430 cfi_restore (%r13)
431 movq 1080(%rsp), %r14
432 cfi_restore (%r14)
433 movq 1072(%rsp), %r15
434 cfi_restore (%r15)
435 jmp .LBL_2_2
436
437.LBL_2_10:
438 cfi_restore_state
439 movzbl %r12b, %r15d
440 vmovss 1156(%rsp,%r15,8), %xmm0
441 vzeroupper
442 vmovss 1156(%rsp,%r15,8), %xmm0
443 call JUMPTARGET(cosf)
444 vmovss %xmm0, 1220(%rsp,%r15,8)
445 jmp .LBL_2_8
446.LBL_2_12:
447 movzbl %r12b, %r15d
448 vmovss 1152(%rsp,%r15,8), %xmm0
449 vzeroupper
450 vmovss 1152(%rsp,%r15,8), %xmm0
451 call JUMPTARGET(cosf)
452 vmovss %xmm0, 1216(%rsp,%r15,8)
453 jmp .LBL_2_7
454#endif
455END (_ZGVeN16v_cosf_skx)
456
457 .section .rodata, "a"
458.L_2il0floatpacket.13:
459 .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
460 .type .L_2il0floatpacket.13,@object
461