/* Function sincosf vectorized with AVX-512. KNL and SKX versions.
   Copyright (C) 2014-2020 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include "svml_s_trig_data.h"
#include "svml_s_wrapper_impl.h"
/*
   ALGORITHM DESCRIPTION:

     1) Range reduction to the [-Pi/4; +Pi/4] interval
        a) Grab the sign of the source argument and save it.
        b) Remove the sign with an AND operation.
        c) Obtain octant number Y by multiplying by 2/Pi.
        d) Add the "Right Shifter" value.
        e) Treat the resulting value as integer S for destination sign
           setting:
           SS = ((S - (S & 1)) & 2) << 30;  for the sin part
           SC = ((S + (S & 1)) & 2) << 30;  for the cos part
        f) Change the destination sign if the source sign is negative,
           using an XOR operation.
        g) Subtract the "Right Shifter" (0x4B000000) value.
        h) Subtract Y*(PI/2) from the X argument, where PI/2 is split
           into 4 parts:
           X = X - Y*PI1 - Y*PI2 - Y*PI3 - Y*PI4;
     2) Polynomial (minimax for sin within the [-Pi/4; +Pi/4] interval)
        a) Calculate X^2 = X * X.
        b) Calculate the two polynomials for sin and cos:
           RS = X * (A0 + X^2 * (A1 + X^2 * (A2 + X^2 * A3)));
           RC = B0 + X^2 * (B1 + X^2 * (B2 + X^2 * (B3 + X^2 * B4)));
        c) Swap RS and RC if the low bit of the value obtained in
           step e) is set, using AND, ANDNOT and OR operations.
     3) Destination sign setting:
        a) Set the shifted destination sign with an XOR operation:
           R1 = XOR( RS, SS );
           R2 = XOR( RC, SC ).  */
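
/* For reference, a scalar C sketch of the scheme above.  Illustrative
   only: INV_PI, RSHIFTER, PI1..PI3 and A0..A3, B0..B4 stand in for the
   __svml_s_trig_data table entries, and as_uint/as_float are assumed
   bit-cast helpers, not real APIs.

     float ax = fabsf (x);                      // a), b) strip the sign
     float y  = ax * INV_PI + RSHIFTER;         // c), d) octant + shifter
     unsigned s  = as_uint (y);                 // e) low bits: octant
     unsigned ss = ((s - (s & 1)) & 2) << 30;   //    sin sign bit
     unsigned sc = ((s + (s & 1)) & 2) << 30;   //    cos sign bit
     ss ^= as_uint (x) & 0x80000000u;           // f) fold in source sign
     float n  = y - RSHIFTER;                   // g) octant back as float
     float r  = ax - n * PI1 - n * PI2 - n * PI3;  // h) reduce
     float r2 = r * r;                          // 2a)
     float rs = r * (A0 + r2 * (A1 + r2 * (A2 + r2 * A3)));
     float rc = B0 + r2 * (B1 + r2 * (B2 + r2 * (B3 + r2 * B4)));
     if (s & 1)                                 // 2c) odd octant: swap
       { float t = rs; rs = rc; rc = t; }
     *psin = as_float (as_uint (rs) ^ ss);      // 3) final sign setting
     *pcos = as_float (as_uint (rc) ^ sc);
*/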

        .text
ENTRY (_ZGVeN16vl4l4_sincosf_knl)
#ifndef HAVE_AVX512DQ_ASM_SUPPORT
WRAPPER_IMPL_AVX512_fFF _ZGVdN8vl4l4_sincosf
#else
        pushq     %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq      %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq      $-64, %rsp
        subq      $1344, %rsp
        movq      __svml_s_trig_data@GOTPCREL(%rip), %rax
        vmovaps   %zmm0, %zmm2
        movl      $-1, %edx
        vmovups   __sAbsMask(%rax), %zmm0
        vmovups   __sInvPI(%rax), %zmm3

/* Absolute argument computation */
        vpandd    %zmm0, %zmm2, %zmm1
        vmovups   __sPI1_FMA(%rax), %zmm5
        vmovups   __sSignMask(%rax), %zmm9
        vpandnd   %zmm2, %zmm0, %zmm0

/* h) Subtract Y*(PI/2) from the X argument, where PI/2 is split into
   3 parts: X = X - Y*PI1 - Y*PI2 - Y*PI3.  */
        vmovaps   %zmm1, %zmm6
        vmovaps   %zmm1, %zmm8

/* c) Obtain octant number Y by multiplying by 2/Pi.
   d) Add the "Right Shifter" value.  */
        vfmadd213ps __sRShifter(%rax), %zmm1, %zmm3
        vmovups   __sPI3_FMA(%rax), %zmm7

/* g) Subtract the "Right Shifter" (0x4B000000) value.  */
        vsubps    __sRShifter(%rax), %zmm3, %zmm12

/* e) Treat the resulting value as integer S for destination sign
   setting.  */
        vpslld    $31, %zmm3, %zmm13
        vmovups   __sA7_FMA(%rax), %zmm14
        vfnmadd231ps %zmm12, %zmm5, %zmm6

/* 2) Polynomial (minimax for sin within the [-Pi/4; +Pi/4] interval)
   a) Calculate X^2 = X * X.
   b) Calculate the two polynomials for sin and cos:
      RS = X * (A0 + X^2 * (A1 + X^2 * (A2 + X^2 * A3)));
      RC = B0 + X^2 * (B1 + X^2 * (B2 + X^2 * (B3 + X^2 * B4))).  */
        vmovaps   %zmm14, %zmm15
        vmovups   __sA9_FMA(%rax), %zmm3
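
/* Flag lanes with |x| above __sRangeReductionVal, or NaN (predicate
   22 = NLE_UQ); %ecx receives a bitmask of the lanes that need the
   scalar fallback path.  */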
        vcmpps    $22, __sRangeReductionVal(%rax), %zmm1, %k1
        vpbroadcastd %edx, %zmm1{%k1}{z}
        vfnmadd231ps __sPI2_FMA(%rax), %zmm12, %zmm6
        vptestmd  %zmm1, %zmm1, %k0
        vpandd    %zmm6, %zmm9, %zmm11
        kmovw     %k0, %ecx
        vpxord    __sOneHalf(%rax), %zmm11, %zmm4

/* Result sign calculations: vpternlogd with immediate 0x96 computes
   the bitwise XOR of its three operands.  */
        vpternlogd $150, %zmm13, %zmm9, %zmm11

/* Add correction term 0.5 for the cos() part.  */
        vaddps    %zmm4, %zmm12, %zmm10
        vfnmadd213ps %zmm6, %zmm7, %zmm12
        vfnmadd231ps %zmm10, %zmm5, %zmm8
        vpxord    %zmm13, %zmm12, %zmm13
        vmulps    %zmm13, %zmm13, %zmm12
        vfnmadd231ps __sPI2_FMA(%rax), %zmm10, %zmm8
        vfmadd231ps __sA9_FMA(%rax), %zmm12, %zmm15
        vfnmadd213ps %zmm8, %zmm7, %zmm10
        vfmadd213ps __sA5_FMA(%rax), %zmm12, %zmm15
        vpxord    %zmm11, %zmm10, %zmm5
        vmulps    %zmm5, %zmm5, %zmm4
        vfmadd213ps __sA3(%rax), %zmm12, %zmm15
        vfmadd213ps %zmm14, %zmm4, %zmm3
        vmulps    %zmm12, %zmm15, %zmm14
        vfmadd213ps __sA5_FMA(%rax), %zmm4, %zmm3
        vfmadd213ps %zmm13, %zmm13, %zmm14
        vfmadd213ps __sA3(%rax), %zmm4, %zmm3
        vpxord    %zmm0, %zmm14, %zmm0
        vmulps    %zmm4, %zmm3, %zmm3
        vfmadd213ps %zmm5, %zmm5, %zmm3
        testl     %ecx, %ecx
        jne       .LBL_1_3

.LBL_1_2:
        cfi_remember_state
        vmovups   %zmm0, (%rdi)
        vmovups   %zmm3, (%rsi)
        movq      %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq      %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret

.LBL_1_3:
        cfi_restore_state
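
/* Special path: spill the input vector and both vector results; if
   any lane was flagged, also save the remaining caller-visible
   register state and fix up the flagged lanes with scalar sinf and
   cosf.  */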
        vmovups   %zmm2, 1152(%rsp)
        vmovups   %zmm0, 1216(%rsp)
        vmovups   %zmm3, 1280(%rsp)
        je        .LBL_1_2

        xorb      %dl, %dl
        kmovw     %k4, 1048(%rsp)
        xorl      %eax, %eax
        kmovw     %k5, 1040(%rsp)
        kmovw     %k6, 1032(%rsp)
        kmovw     %k7, 1024(%rsp)
        vmovups   %zmm16, 960(%rsp)
        vmovups   %zmm17, 896(%rsp)
        vmovups   %zmm18, 832(%rsp)
        vmovups   %zmm19, 768(%rsp)
        vmovups   %zmm20, 704(%rsp)
        vmovups   %zmm21, 640(%rsp)
        vmovups   %zmm22, 576(%rsp)
        vmovups   %zmm23, 512(%rsp)
        vmovups   %zmm24, 448(%rsp)
        vmovups   %zmm25, 384(%rsp)
        vmovups   %zmm26, 320(%rsp)
        vmovups   %zmm27, 256(%rsp)
        vmovups   %zmm28, 192(%rsp)
        vmovups   %zmm29, 128(%rsp)
        vmovups   %zmm30, 64(%rsp)
        vmovups   %zmm31, (%rsp)
        movq      %rsi, 1056(%rsp)
        movq      %r12, 1096(%rsp)
        cfi_offset_rel_rsp (12, 1096)
        movb      %dl, %r12b
        movq      %r13, 1088(%rsp)
        cfi_offset_rel_rsp (13, 1088)
        movl      %eax, %r13d
        movq      %r14, 1080(%rsp)
        cfi_offset_rel_rsp (14, 1080)
        movl      %ecx, %r14d
        movq      %r15, 1072(%rsp)
        cfi_offset_rel_rsp (15, 1072)
        movq      %rbx, 1064(%rsp)
        movq      %rdi, %rbx
        cfi_remember_state

.LBL_1_6:
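/* Per-lane fixup loop: %r14d holds the special-lane bitmask and
   %r13d the bit index of the current lane pair.  Flagged lanes are
   recomputed with scalar sinf and cosf.  */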
        btl       %r13d, %r14d
        jc        .LBL_1_13

.LBL_1_7:
        lea       1(%r13), %esi
        btl       %esi, %r14d
        jc        .LBL_1_10

.LBL_1_8:
        addb      $1, %r12b
        addl      $2, %r13d
        cmpb      $16, %r12b
        jb        .LBL_1_6

        movq      %rbx, %rdi
        kmovw     1048(%rsp), %k4
        movq      1056(%rsp), %rsi
        kmovw     1040(%rsp), %k5
        movq      1096(%rsp), %r12
        cfi_restore (%r12)
        kmovw     1032(%rsp), %k6
        movq      1088(%rsp), %r13
        cfi_restore (%r13)
        kmovw     1024(%rsp), %k7
        vmovups   960(%rsp), %zmm16
        vmovups   896(%rsp), %zmm17
        vmovups   832(%rsp), %zmm18
        vmovups   768(%rsp), %zmm19
        vmovups   704(%rsp), %zmm20
        vmovups   640(%rsp), %zmm21
        vmovups   576(%rsp), %zmm22
        vmovups   512(%rsp), %zmm23
        vmovups   448(%rsp), %zmm24
        vmovups   384(%rsp), %zmm25
        vmovups   320(%rsp), %zmm26
        vmovups   256(%rsp), %zmm27
        vmovups   192(%rsp), %zmm28
        vmovups   128(%rsp), %zmm29
        vmovups   64(%rsp), %zmm30
        vmovups   (%rsp), %zmm31
        movq      1080(%rsp), %r14
        cfi_restore (%r14)
        movq      1072(%rsp), %r15
        cfi_restore (%r15)
        movq      1064(%rsp), %rbx
        vmovups   1216(%rsp), %zmm0
        vmovups   1280(%rsp), %zmm3
        jmp       .LBL_1_2

.LBL_1_10:
        cfi_restore_state
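/* Odd lane of the pair: reload the original argument (offset 4 into
   the 8-byte spill slot) and recompute it with the scalar routines.  */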
        movzbl    %r12b, %r15d
        vmovss    1156(%rsp,%r15,8), %xmm0

        call      JUMPTARGET(sinf)

        vmovss    %xmm0, 1220(%rsp,%r15,8)
        vmovss    1156(%rsp,%r15,8), %xmm0

        call      JUMPTARGET(cosf)

        vmovss    %xmm0, 1284(%rsp,%r15,8)
        jmp       .LBL_1_8

.LBL_1_13:
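/* Even lane of the pair: same fixup from the base of the spill
   slot.  */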
        movzbl    %r12b, %r15d
        vmovss    1152(%rsp,%r15,8), %xmm0

        call      JUMPTARGET(sinf)

        vmovss    %xmm0, 1216(%rsp,%r15,8)
        vmovss    1152(%rsp,%r15,8), %xmm0

        call      JUMPTARGET(cosf)

        vmovss    %xmm0, 1280(%rsp,%r15,8)
        jmp       .LBL_1_7
#endif
END (_ZGVeN16vl4l4_sincosf_knl)
libmvec_hidden_def(_ZGVeN16vl4l4_sincosf_knl)

ENTRY (_ZGVeN16vl4l4_sincosf_skx)
#ifndef HAVE_AVX512DQ_ASM_SUPPORT
WRAPPER_IMPL_AVX512_fFF _ZGVdN8vl4l4_sincosf
#else
        pushq     %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq      %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq      $-64, %rsp
        subq      $1344, %rsp
        movq      __svml_s_trig_data@GOTPCREL(%rip), %rax
        vmovaps   %zmm0, %zmm4
        vmovups   __sAbsMask(%rax), %zmm3
        vmovups   __sInvPI(%rax), %zmm5
        vmovups   __sRShifter(%rax), %zmm6
        vmovups   __sPI1_FMA(%rax), %zmm9
        vmovups   __sPI2_FMA(%rax), %zmm10
        vmovups   __sSignMask(%rax), %zmm14
        vmovups   __sOneHalf(%rax), %zmm7
        vmovups   __sPI3_FMA(%rax), %zmm12

/* Absolute argument computation */
        vandps    %zmm3, %zmm4, %zmm2

/* c) Obtain octant number Y by multiplying by 2/Pi.
   d) Add the "Right Shifter" value.  */
        vfmadd213ps %zmm6, %zmm2, %zmm5
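
/* k1 := lanes with |x| <= __sRangeReductionVal (predicate 18 = LE_OS),
   i.e. the lanes the main path can handle.  */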
        vcmpps    $18, __sRangeReductionVal(%rax), %zmm2, %k1

/* e) Treat the resulting value as integer S for destination sign
   setting.  */
        vpslld    $31, %zmm5, %zmm0

/* g) Subtract the "Right Shifter" (0x4B000000) value.  */
        vsubps    %zmm6, %zmm5, %zmm5
        vmovups   __sA3(%rax), %zmm6

/* h) Subtract Y*(PI/2) from the X argument, where PI/2 is split into
   3 parts: X = X - Y*PI1 - Y*PI2 - Y*PI3.  */
        vmovaps   %zmm2, %zmm11
        vfnmadd231ps %zmm5, %zmm9, %zmm11
        vfnmadd231ps %zmm5, %zmm10, %zmm11
        vandps    %zmm11, %zmm14, %zmm1
        vxorps    %zmm1, %zmm7, %zmm8

/* Result sign calculations: vpternlogd with immediate 0x96 computes
   the bitwise XOR of its three operands.  */
        vpternlogd $150, %zmm0, %zmm14, %zmm1
        vmovups   .L_2il0floatpacket.13(%rip), %zmm14

/* Add correction term 0.5 for the cos() part.  */
        vaddps    %zmm8, %zmm5, %zmm15
        vfnmadd213ps %zmm11, %zmm12, %zmm5
        vandnps   %zmm4, %zmm3, %zmm11
        vmovups   __sA7_FMA(%rax), %zmm3
        vmovaps   %zmm2, %zmm13
        vfnmadd231ps %zmm15, %zmm9, %zmm13
        vxorps    %zmm0, %zmm5, %zmm9
        vmovups   __sA5_FMA(%rax), %zmm0
        vfnmadd231ps %zmm15, %zmm10, %zmm13
        vmulps    %zmm9, %zmm9, %zmm8
        vfnmadd213ps %zmm13, %zmm12, %zmm15
        vmovups   __sA9_FMA(%rax), %zmm12
        vxorps    %zmm1, %zmm15, %zmm1
        vmulps    %zmm1, %zmm1, %zmm13

/* 2) Polynomial (minimax for sin within the [-Pi/4; +Pi/4] interval)
   a) Calculate X^2 = X * X.
   b) Calculate the two polynomials for sin and cos:
      RS = X * (A0 + X^2 * (A1 + X^2 * (A2 + X^2 * A3)));
      RC = B0 + X^2 * (B1 + X^2 * (B2 + X^2 * (B3 + X^2 * B4))).  */
        vmovaps   %zmm12, %zmm7
        vfmadd213ps %zmm3, %zmm8, %zmm7
        vfmadd213ps %zmm3, %zmm13, %zmm12
        vfmadd213ps %zmm0, %zmm8, %zmm7
        vfmadd213ps %zmm0, %zmm13, %zmm12
        vfmadd213ps %zmm6, %zmm8, %zmm7
        vfmadd213ps %zmm6, %zmm13, %zmm12
        vmulps    %zmm8, %zmm7, %zmm10
        vmulps    %zmm13, %zmm12, %zmm3
        vfmadd213ps %zmm9, %zmm9, %zmm10
        vfmadd213ps %zmm1, %zmm1, %zmm3
        vxorps    %zmm11, %zmm10, %zmm0
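
/* %zmm14 was preloaded with all-ones (.L_2il0floatpacket.13); clear
   the lanes k1 marked as in-range, leaving all-ones only in lanes
   needing the scalar fallback, then collect them as a bitmask in
   %ecx.  */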
        vpandnd   %zmm2, %zmm2, %zmm14{%k1}
        vptestmd  %zmm14, %zmm14, %k0
        kmovw     %k0, %ecx
        testl     %ecx, %ecx
        jne       .LBL_2_3

.LBL_2_2:
        cfi_remember_state
        vmovups   %zmm0, (%rdi)
        vmovups   %zmm3, (%rsi)
        movq      %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq      %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret

.LBL_2_3:
        cfi_restore_state
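
/* Special path: same spill-and-fixup scheme as the KNL variant
   above.  */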
        vmovups   %zmm4, 1152(%rsp)
        vmovups   %zmm0, 1216(%rsp)
        vmovups   %zmm3, 1280(%rsp)
        je        .LBL_2_2

        xorb      %dl, %dl
        xorl      %eax, %eax
        kmovw     %k4, 1048(%rsp)
        kmovw     %k5, 1040(%rsp)
        kmovw     %k6, 1032(%rsp)
        kmovw     %k7, 1024(%rsp)
        vmovups   %zmm16, 960(%rsp)
        vmovups   %zmm17, 896(%rsp)
        vmovups   %zmm18, 832(%rsp)
        vmovups   %zmm19, 768(%rsp)
        vmovups   %zmm20, 704(%rsp)
        vmovups   %zmm21, 640(%rsp)
        vmovups   %zmm22, 576(%rsp)
        vmovups   %zmm23, 512(%rsp)
        vmovups   %zmm24, 448(%rsp)
        vmovups   %zmm25, 384(%rsp)
        vmovups   %zmm26, 320(%rsp)
        vmovups   %zmm27, 256(%rsp)
        vmovups   %zmm28, 192(%rsp)
        vmovups   %zmm29, 128(%rsp)
        vmovups   %zmm30, 64(%rsp)
        vmovups   %zmm31, (%rsp)
        movq      %rsi, 1056(%rsp)
        movq      %r12, 1096(%rsp)
        cfi_offset_rel_rsp (12, 1096)
        movb      %dl, %r12b
        movq      %r13, 1088(%rsp)
        cfi_offset_rel_rsp (13, 1088)
        movl      %eax, %r13d
        movq      %r14, 1080(%rsp)
        cfi_offset_rel_rsp (14, 1080)
        movl      %ecx, %r14d
        movq      %r15, 1072(%rsp)
        cfi_offset_rel_rsp (15, 1072)
        movq      %rbx, 1064(%rsp)
        movq      %rdi, %rbx
        cfi_remember_state

.LBL_2_6:
        btl       %r13d, %r14d
        jc        .LBL_2_13

.LBL_2_7:
        lea       1(%r13), %esi
        btl       %esi, %r14d
        jc        .LBL_2_10

.LBL_2_8:
        incb      %r12b
        addl      $2, %r13d
        cmpb      $16, %r12b
        jb        .LBL_2_6

        kmovw     1048(%rsp), %k4
        movq      %rbx, %rdi
        kmovw     1040(%rsp), %k5
        kmovw     1032(%rsp), %k6
        kmovw     1024(%rsp), %k7
        vmovups   960(%rsp), %zmm16
        vmovups   896(%rsp), %zmm17
        vmovups   832(%rsp), %zmm18
        vmovups   768(%rsp), %zmm19
        vmovups   704(%rsp), %zmm20
        vmovups   640(%rsp), %zmm21
        vmovups   576(%rsp), %zmm22
        vmovups   512(%rsp), %zmm23
        vmovups   448(%rsp), %zmm24
        vmovups   384(%rsp), %zmm25
        vmovups   320(%rsp), %zmm26
        vmovups   256(%rsp), %zmm27
        vmovups   192(%rsp), %zmm28
        vmovups   128(%rsp), %zmm29
        vmovups   64(%rsp), %zmm30
        vmovups   (%rsp), %zmm31
        vmovups   1216(%rsp), %zmm0
        vmovups   1280(%rsp), %zmm3
        movq      1056(%rsp), %rsi
        movq      1096(%rsp), %r12
        cfi_restore (%r12)
        movq      1088(%rsp), %r13
        cfi_restore (%r13)
        movq      1080(%rsp), %r14
        cfi_restore (%r14)
        movq      1072(%rsp), %r15
        cfi_restore (%r15)
        movq      1064(%rsp), %rbx
        jmp       .LBL_2_2

.LBL_2_10:
        cfi_restore_state
        movzbl    %r12b, %r15d
        vzeroupper
        vmovss    1156(%rsp,%r15,8), %xmm0

        call      JUMPTARGET(sinf)

        vmovss    %xmm0, 1220(%rsp,%r15,8)
        vmovss    1156(%rsp,%r15,8), %xmm0

        call      JUMPTARGET(cosf)

        vmovss    %xmm0, 1284(%rsp,%r15,8)
        jmp       .LBL_2_8

.LBL_2_13:
        movzbl    %r12b, %r15d
        vzeroupper
        vmovss    1152(%rsp,%r15,8), %xmm0

        call      JUMPTARGET(sinf)

        vmovss    %xmm0, 1216(%rsp,%r15,8)
        vmovss    1152(%rsp,%r15,8), %xmm0

        call      JUMPTARGET(cosf)

        vmovss    %xmm0, 1280(%rsp,%r15,8)
        jmp       .LBL_2_7
#endif
END (_ZGVeN16vl4l4_sincosf_skx)
libmvec_hidden_def(_ZGVeN16vl4l4_sincosf_skx)

/* Wrapper between the vvv and vl4l4 vector variants.  */
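
/* A rough C sketch of what this macro does (illustrative only:
   "v16sf" is an assumed 16-float vector type, "vl4l4_callee" stands
   for the wrapped kernel, and the real ABI passes the 32 destination
   pointers packed in %zmm1..%zmm4 rather than as arrays):

     void
     vvv_wrapper (v16sf x, float *psin[16], float *pcos[16])
     {
       float s[16] __attribute__ ((aligned (64)));
       float c[16] __attribute__ ((aligned (64)));
       vl4l4_callee (x, s, c);        // results into linear buffers
       for (int i = 0; i < 16; i++)
         {
           *psin[i] = s[i];           // scatter through the pointers
           *pcos[i] = c[i];
         }
     }
*/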
.macro WRAPPER_AVX512_vvv_vl4l4 callee
#ifndef __ILP32__
        pushq     %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq      %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq      $-64, %rsp
        subq      $384, %rsp
        vmovups   %zmm1, 128(%rsp)
        lea       (%rsp), %rdi
        vmovups   %zmm2, 192(%rdi)
        vmovups   %zmm3, 256(%rdi)
        vmovups   %zmm4, 320(%rdi)
        lea       64(%rsp), %rsi
        call      HIDDEN_JUMPTARGET(\callee)
        movq      128(%rsp), %rdx
        movq      136(%rsp), %rsi
        movq      144(%rsp), %r8
        movq      152(%rsp), %r10
        movl      (%rsp), %eax
        movl      4(%rsp), %ecx
        movl      8(%rsp), %edi
        movl      12(%rsp), %r9d
        movl      %eax, (%rdx)
        movl      %ecx, (%rsi)
        movq      160(%rsp), %rax
        movq      168(%rsp), %rcx
        movl      %edi, (%r8)
        movl      %r9d, (%r10)
        movq      176(%rsp), %rdi
        movq      184(%rsp), %r9
        movl      16(%rsp), %r11d
        movl      20(%rsp), %edx
        movl      24(%rsp), %esi
        movl      28(%rsp), %r8d
        movl      %r11d, (%rax)
        movl      %edx, (%rcx)
        movq      192(%rsp), %r11
        movq      200(%rsp), %rdx
        movl      %esi, (%rdi)
        movl      %r8d, (%r9)
        movq      208(%rsp), %rsi
        movq      216(%rsp), %r8
        movl      32(%rsp), %r10d
        movl      36(%rsp), %eax
        movl      40(%rsp), %ecx
        movl      44(%rsp), %edi
        movl      %r10d, (%r11)
        movl      %eax, (%rdx)
        movq      224(%rsp), %r10
        movq      232(%rsp), %rax
        movl      %ecx, (%rsi)
        movl      %edi, (%r8)
        movq      240(%rsp), %rcx
        movq      248(%rsp), %rdi
        movl      48(%rsp), %r9d
        movl      52(%rsp), %r11d
        movl      56(%rsp), %edx
        movl      60(%rsp), %esi
        movl      %r9d, (%r10)
        movl      %r11d, (%rax)
        movq      256(%rsp), %r9
        movq      264(%rsp), %r11
        movl      %edx, (%rcx)
        movl      %esi, (%rdi)
        movq      272(%rsp), %rdx
        movq      280(%rsp), %rsi
        movl      64(%rsp), %r8d
        movl      68(%rsp), %r10d
        movl      72(%rsp), %eax
        movl      76(%rsp), %ecx
        movl      %r8d, (%r9)
        movl      %r10d, (%r11)
        movq      288(%rsp), %r8
        movq      296(%rsp), %r10
        movl      %eax, (%rdx)
        movl      %ecx, (%rsi)
        movq      304(%rsp), %rax
        movq      312(%rsp), %rcx
        movl      80(%rsp), %edi
        movl      84(%rsp), %r9d
        movl      88(%rsp), %r11d
        movl      92(%rsp), %edx
        movl      %edi, (%r8)
        movl      %r9d, (%r10)
        movq      320(%rsp), %rdi
        movq      328(%rsp), %r9
        movl      %r11d, (%rax)
        movl      %edx, (%rcx)
        movq      336(%rsp), %r11
        movq      344(%rsp), %rdx
        movl      96(%rsp), %esi
        movl      100(%rsp), %r8d
        movl      104(%rsp), %r10d
        movl      108(%rsp), %eax
        movl      %esi, (%rdi)
        movl      %r8d, (%r9)
        movq      352(%rsp), %rsi
        movq      360(%rsp), %r8
        movl      %r10d, (%r11)
        movl      %eax, (%rdx)
        movq      368(%rsp), %r10
        movq      376(%rsp), %rax
        movl      112(%rsp), %ecx
        movl      116(%rsp), %edi
        movl      120(%rsp), %r9d
        movl      124(%rsp), %r11d
        movl      %ecx, (%rsi)
        movl      %edi, (%r8)
        movl      %r9d, (%r10)
        movl      %r11d, (%rax)
        movq      %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq      %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret
#else
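/* x32: pointers are 32 bits, so all 32 destinations fit in %zmm1 and
   %zmm2; spill both and scatter the results one element at a time.  */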
        leal      8(%rsp), %r10d
        .cfi_def_cfa 10, 0
        andl      $-64, %esp
        pushq     -8(%r10d)
        pushq     %rbp
        .cfi_escape 0x10,0x6,0x2,0x76,0
        movl      %esp, %ebp
        pushq     %r10
        .cfi_escape 0xf,0x3,0x76,0x78,0x6
        leal      -112(%rbp), %esi
        leal      -176(%rbp), %edi
        subl      $296, %esp
        vmovdqa64 %zmm1, -240(%ebp)
        vmovdqa64 %zmm2, -304(%ebp)
        call      HIDDEN_JUMPTARGET(\callee)
        movl      -240(%ebp), %eax
        vmovss    -176(%ebp), %xmm0
        vmovss    %xmm0, (%eax)
        movl      -236(%ebp), %eax
        vmovss    -172(%ebp), %xmm0
        vmovss    %xmm0, (%eax)
        movl      -232(%ebp), %eax
        vmovss    -168(%ebp), %xmm0
        vmovss    %xmm0, (%eax)
        movl      -228(%ebp), %eax
        vmovss    -164(%ebp), %xmm0
        vmovss    %xmm0, (%eax)
        movl      -224(%ebp), %eax
        vmovss    -160(%ebp), %xmm0
        vmovss    %xmm0, (%eax)
        movl      -220(%ebp), %eax
        vmovss    -156(%ebp), %xmm0
        vmovss    %xmm0, (%eax)
        movl      -216(%ebp), %eax
        vmovss    -152(%ebp), %xmm0
        vmovss    %xmm0, (%eax)
        movl      -212(%ebp), %eax
        vmovss    -148(%ebp), %xmm0
        vmovss    %xmm0, (%eax)
        movl      -208(%ebp), %eax
        vmovss    -144(%ebp), %xmm0
        vmovss    %xmm0, (%eax)
        movl      -204(%ebp), %eax
        vmovss    -140(%ebp), %xmm0
        vmovss    %xmm0, (%eax)
        movl      -200(%ebp), %eax
        vmovss    -136(%ebp), %xmm0
        vmovss    %xmm0, (%eax)
        movl      -196(%ebp), %eax
        vmovss    -132(%ebp), %xmm0
        vmovss    %xmm0, (%eax)
        movl      -192(%ebp), %eax
        vmovss    -128(%ebp), %xmm0
        vmovss    %xmm0, (%eax)
        movl      -188(%ebp), %eax
        vmovss    -124(%ebp), %xmm0
        vmovss    %xmm0, (%eax)
        movl      -184(%ebp), %eax
        vmovss    -120(%ebp), %xmm0
        vmovss    %xmm0, (%eax)
        movl      -180(%ebp), %eax
        vmovss    -116(%ebp), %xmm0
        vmovss    %xmm0, (%eax)
        movl      -304(%ebp), %eax
        vmovss    -112(%ebp), %xmm0
        vmovss    %xmm0, (%eax)
        movl      -300(%ebp), %eax
        vmovss    -108(%ebp), %xmm0
        vmovss    %xmm0, (%eax)
        movl      -296(%ebp), %eax
        vmovss    -104(%ebp), %xmm0
        vmovss    %xmm0, (%eax)
        movl      -292(%ebp), %eax
        vmovss    -100(%ebp), %xmm0
        vmovss    %xmm0, (%eax)
        movl      -288(%ebp), %eax
        vmovss    -96(%ebp), %xmm0
        vmovss    %xmm0, (%eax)
        movl      -284(%ebp), %eax
        vmovss    -92(%ebp), %xmm0
        vmovss    %xmm0, (%eax)
        movl      -280(%ebp), %eax
        vmovss    -88(%ebp), %xmm0
        vmovss    %xmm0, (%eax)
        movl      -276(%ebp), %eax
        vmovss    -84(%ebp), %xmm0
        vmovss    %xmm0, (%eax)
        movl      -272(%ebp), %eax
        vmovss    -80(%ebp), %xmm0
        vmovss    %xmm0, (%eax)
        movl      -268(%ebp), %eax
        vmovss    -76(%ebp), %xmm0
        vmovss    %xmm0, (%eax)
        movl      -264(%ebp), %eax
        vmovss    -72(%ebp), %xmm0
        vmovss    %xmm0, (%eax)
        movl      -260(%ebp), %eax
        vmovss    -68(%ebp), %xmm0
        vmovss    %xmm0, (%eax)
        movl      -256(%ebp), %eax
        vmovss    -64(%ebp), %xmm0
        vmovss    %xmm0, (%eax)
        movl      -252(%ebp), %eax
        vmovss    -60(%ebp), %xmm0
        vmovss    %xmm0, (%eax)
        movl      -248(%ebp), %eax
        vmovss    -56(%ebp), %xmm0
        vmovss    %xmm0, (%eax)
        movl      -244(%ebp), %eax
        vmovss    -52(%ebp), %xmm0
        vmovss    %xmm0, (%eax)
        addl      $296, %esp
        popq      %r10
        .cfi_def_cfa 10, 0
        popq      %rbp
        leal      -8(%r10), %esp
        .cfi_def_cfa 7, 8
        ret
#endif
.endm

ENTRY (_ZGVeN16vvv_sincosf_knl)
WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_knl
END (_ZGVeN16vvv_sincosf_knl)

ENTRY (_ZGVeN16vvv_sincosf_skx)
WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx
END (_ZGVeN16vvv_sincosf_skx)

        .section .rodata, "a"
.L_2il0floatpacket.13:
        .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
        .type .L_2il0floatpacket.13,@object