1 | /* Function sincosf vectorized with AVX-512. Wrapper to AVX2 version. |
2 | Copyright (C) 2014-2016 Free Software Foundation, Inc. |
3 | This file is part of the GNU C Library. |
4 | |
5 | The GNU C Library is free software; you can redistribute it and/or |
6 | modify it under the terms of the GNU Lesser General Public |
7 | License as published by the Free Software Foundation; either |
8 | version 2.1 of the License, or (at your option) any later version. |
9 | |
10 | The GNU C Library is distributed in the hope that it will be useful, |
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | Lesser General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU Lesser General Public |
16 | License along with the GNU C Library; if not, see |
17 | <http://www.gnu.org/licenses/>. */ |
18 | |
19 | #include <sysdep.h> |
20 | #include "svml_s_wrapper_impl.h" |
21 | |
        .text
/* _ZGVeN16vl4l4_sincosf (__m512 x, float *sin_out, float *cos_out):
   16-lane AVX-512 sincosf with array-pointer outputs ("vl4l4" vector
   ABI variant).  Implemented via the generic WRAPPER_IMPL_AVX512_fFF
   macro from svml_s_wrapper_impl.h, which (presumably -- the macro body
   is in the header, not here) splits the 512-bit input and forwards to
   the 8-lane AVX2 implementation.  */
ENTRY (_ZGVeN16vl4l4_sincosf)
WRAPPER_IMPL_AVX512_fFF _ZGVdN8vl4l4_sincosf
END (_ZGVeN16vl4l4_sincosf)
26 | |
/* AVX512 ISA version as wrapper to AVX2 ISA version (for vector
   function declared with #pragma omp declare simd notinbranch).

   In the "vvv" variant each output argument is itself a vector of
   per-lane result pointers, so after computing sin/cos into local
   stack arrays the results must be scattered element-by-element
   through the caller-supplied pointers.  */
.macro WRAPPER_IMPL_AVX512_fFF_vvv callee
#ifndef __ILP32__
        /* LP64: pointers are 64-bit, so the 16 sin result addresses
           arrive in zmm1/zmm2 and the 16 cos result addresses in
           zmm3/zmm4.  Local frame layout, offsets from the 64-byte
           aligned %rsp:
             0   .. 63   sin results (32 bytes per 8-lane callee call)
             64  .. 127  cos results
             128 .. 255  sin result pointers (spilled zmm1, zmm2)
             256 .. 383  cos result pointers (spilled zmm3, zmm4)
             384 .. 447  input vector x (spilled zmm0)  */
        pushq %rbp
        cfi_adjust_cfa_offset (8)
        cfi_rel_offset (%rbp, 0)
        movq %rsp, %rbp
        cfi_def_cfa_register (%rbp)
        andq $-64, %rsp
        subq $448, %rsp
        /* The AVX-512 spills below are hand-assembled .byte sequences,
           presumably so the file still builds with assemblers that lack
           AVX-512 support -- NOTE(review): confirm minimum binutils.
           In these EVEX encodings the last byte is a disp8 scaled by
           the 64-byte vector width (e.g. 0x06 -> 384, 0x02 -> 128).  */
        /* Encoding for vmovups %zmm0, 384(%rsp).  */
        .byte 0x62
        .byte 0xf1
        .byte 0x7c
        .byte 0x48
        .byte 0x11
        .byte 0x44
        .byte 0x24
        .byte 0x06
        lea (%rsp), %rdi
        /* Encoding for vmovups %zmm1, 128(%rdi).  */
        .byte 0x62
        .byte 0xf1
        .byte 0x7c
        .byte 0x48
        .byte 0x11
        .byte 0x4f
        .byte 0x02
        /* Encoding for vmovups %zmm2, 192(%rdi).  */
        .byte 0x62
        .byte 0xf1
        .byte 0x7c
        .byte 0x48
        .byte 0x11
        .byte 0x57
        .byte 0x03
        /* Encoding for vmovups %zmm3, 256(%rdi).  */
        .byte 0x62
        .byte 0xf1
        .byte 0x7c
        .byte 0x48
        .byte 0x11
        .byte 0x5f
        .byte 0x04
        /* Encoding for vmovups %zmm4, 320(%rdi).  */
        .byte 0x62
        .byte 0xf1
        .byte 0x7c
        .byte 0x48
        .byte 0x11
        .byte 0x67
        .byte 0x05
        /* First 8-lane call: low half of x is already in ymm0 (low half
           of zmm0); sin -> 0(%rsp), cos -> 64(%rsp).  */
        lea 64(%rsp), %rsi
        call HIDDEN_JUMPTARGET(\callee)
        /* Second 8-lane call: reload the high half of x from the spill
           (384 + 32 = 416); sin -> 32(%rsp), cos -> 96(%rsp).  */
        vmovdqu 416(%rsp), %ymm0
        lea 32(%rsp), %rdi
        lea 96(%rsp), %rsi
        call HIDDEN_JUMPTARGET(\callee)
        /* Scatter the 32 results through the caller's pointers:
           for lane i, *sinptr[i] = value at 4*i(%rsp) where sinptr[i]
           is the 8-byte pointer at 128+8*i(%rsp), and *cosptr[i] =
           value at 64+4*i(%rsp) with cosptr[i] at 256+8*i(%rsp).
           Pointer loads and value stores are interleaved across the
           scratch registers to overlap latencies; the register reuse
           pattern is order-sensitive -- do not reorder.  */
        movq 128(%rsp), %rdx
        movq 136(%rsp), %rsi
        movq 144(%rsp), %r8
        movq 152(%rsp), %r10
        movl (%rsp), %eax
        movl 4(%rsp), %ecx
        movl 8(%rsp), %edi
        movl 12(%rsp), %r9d
        movl %eax, (%rdx)
        movl %ecx, (%rsi)
        movq 160(%rsp), %rax
        movq 168(%rsp), %rcx
        movl %edi, (%r8)
        movl %r9d, (%r10)
        movq 176(%rsp), %rdi
        movq 184(%rsp), %r9
        movl 16(%rsp), %r11d
        movl 20(%rsp), %edx
        movl 24(%rsp), %esi
        movl 28(%rsp), %r8d
        movl %r11d, (%rax)
        movl %edx, (%rcx)
        movq 192(%rsp), %r11
        movq 200(%rsp), %rdx
        movl %esi, (%rdi)
        movl %r8d, (%r9)
        movq 208(%rsp), %rsi
        movq 216(%rsp), %r8
        movl 32(%rsp), %r10d
        movl 36(%rsp), %eax
        movl 40(%rsp), %ecx
        movl 44(%rsp), %edi
        movl %r10d, (%r11)
        movl %eax, (%rdx)
        movq 224(%rsp), %r10
        movq 232(%rsp), %rax
        movl %ecx, (%rsi)
        movl %edi, (%r8)
        movq 240(%rsp), %rcx
        movq 248(%rsp), %rdi
        movl 48(%rsp), %r9d
        movl 52(%rsp), %r11d
        movl 56(%rsp), %edx
        movl 60(%rsp), %esi
        movl %r9d, (%r10)
        movl %r11d, (%rax)
        /* Sin lanes done; from here on the pointers come from the cos
           pointer area (256..376) and values from 64..124(%rsp).  */
        movq 256(%rsp), %r9
        movq 264(%rsp), %r11
        movl %edx, (%rcx)
        movl %esi, (%rdi)
        movq 272(%rsp), %rdx
        movq 280(%rsp), %rsi
        movl 64(%rsp), %r8d
        movl 68(%rsp), %r10d
        movl 72(%rsp), %eax
        movl 76(%rsp), %ecx
        movl %r8d, (%r9)
        movl %r10d, (%r11)
        movq 288(%rsp), %r8
        movq 296(%rsp), %r10
        movl %eax, (%rdx)
        movl %ecx, (%rsi)
        movq 304(%rsp), %rax
        movq 312(%rsp), %rcx
        movl 80(%rsp), %edi
        movl 84(%rsp), %r9d
        movl 88(%rsp), %r11d
        movl 92(%rsp), %edx
        movl %edi, (%r8)
        movl %r9d, (%r10)
        movq 320(%rsp), %rdi
        movq 328(%rsp), %r9
        movl %r11d, (%rax)
        movl %edx, (%rcx)
        movq 336(%rsp), %r11
        movq 344(%rsp), %rdx
        movl 96(%rsp), %esi
        movl 100(%rsp), %r8d
        movl 104(%rsp), %r10d
        movl 108(%rsp), %eax
        movl %esi, (%rdi)
        movl %r8d, (%r9)
        movq 352(%rsp), %rsi
        movq 360(%rsp), %r8
        movl %r10d, (%r11)
        movl %eax, (%rdx)
        movq 368(%rsp), %r10
        movq 376(%rsp), %rax
        movl 112(%rsp), %ecx
        movl 116(%rsp), %edi
        movl 120(%rsp), %r9d
        movl 124(%rsp), %r11d
        movl %ecx, (%rsi)
        movl %edi, (%r8)
        movl %r9d, (%r10)
        movl %r11d, (%rax)
        /* Restore the caller's stack pointer and frame.  */
        movq %rbp, %rsp
        cfi_def_cfa_register (%rsp)
        popq %rbp
        cfi_adjust_cfa_offset (-8)
        cfi_restore (%rbp)
        ret
#else
        /* x32 (ILP32 on x86-64): pointers are 32-bit, so all 16 sin
           result addresses fit in zmm1 and all 16 cos addresses in
           zmm2.  Address arithmetic uses 32-bit registers (esp/ebp/
           r10d/esi/edi) throughout.  Frame layout, offsets from %ebp:
             -368 .. -305  input vector x (spilled zmm0)
             -304 .. -241  cos result pointers (spilled zmm2)
             -240 .. -177  sin result pointers (spilled zmm1)
             -176 .. -113  sin results
             -112 .. -49   cos results  */
        leal 8(%rsp), %r10d
        .cfi_def_cfa 10, 0
        andl $-64, %esp
        pushq -8(%r10d)
        pushq %rbp
        .cfi_escape 0x10,0x6,0x2,0x76,0
        movl %esp, %ebp
        pushq %r12
        leal -112(%rbp), %esi
        pushq %r10
        .cfi_escape 0xf,0x3,0x76,0x70,0x6
        .cfi_escape 0x10,0xc,0x2,0x76,0x78
        leal -176(%rbp), %edi
        /* rbx = sin result array base, r12 = cos result array base;
           both are callee-saved so they survive the two calls.  */
        movq %rsi, %r12
        pushq %rbx
        .cfi_escape 0x10,0x3,0x2,0x76,0x68
        movq %rdi, %rbx
        subl $344, %esp
        /* Hand-assembled EVEX stores (see LP64 branch note); the 0x67
           prefix selects 32-bit addressing.  */
        /* Encoding for vmovdqa64 %zmm1, -240(%ebp).  */
        .byte 0x67
        .byte 0x62
        .byte 0xf1
        .byte 0xfd
        .byte 0x48
        .byte 0x7f
        .byte 0x8d
        .byte 0x10
        .byte 0xff
        .byte 0xff
        .byte 0xff
        /* Encoding for vmovdqa64 %zmm2, -304(%ebp).  */
        .byte 0x67
        .byte 0x62
        .byte 0xf1
        .byte 0xfd
        .byte 0x48
        .byte 0x7f
        .byte 0x95
        .byte 0xd0
        .byte 0xfe
        .byte 0xff
        .byte 0xff
        /* Encoding for vmovaps %zmm0, -368(%ebp).  */
        .byte 0x67
        .byte 0x62
        .byte 0xf1
        .byte 0x7c
        .byte 0x48
        .byte 0x29
        .byte 0x85
        .byte 0x90
        .byte 0xfe
        .byte 0xff
        .byte 0xff
        /* First 8-lane call: low half of x already in ymm0;
           sin -> -176(%ebp), cos -> -112(%ebp).  */
        call HIDDEN_JUMPTARGET(\callee)
        /* Second 8-lane call: reload high half of x (-368 + 32 = -336);
           sin -> -144(%ebp), cos -> -80(%ebp).  */
        leal 32(%r12), %esi
        vmovups -336(%ebp), %ymm0
        leal 32(%rbx), %edi
        call HIDDEN_JUMPTARGET(\callee)
        /* Scatter sin results: for lane i, load the 4-byte pointer at
           -240+4*i(%ebp) and store the float at -176+4*i(%ebp) to it.  */
        movl -240(%ebp), %eax
        vmovss -176(%ebp), %xmm0
        vmovss %xmm0, (%eax)
        movl -236(%ebp), %eax
        vmovss -172(%ebp), %xmm0
        vmovss %xmm0, (%eax)
        movl -232(%ebp), %eax
        vmovss -168(%ebp), %xmm0
        vmovss %xmm0, (%eax)
        movl -228(%ebp), %eax
        vmovss -164(%ebp), %xmm0
        vmovss %xmm0, (%eax)
        movl -224(%ebp), %eax
        vmovss -160(%ebp), %xmm0
        vmovss %xmm0, (%eax)
        movl -220(%ebp), %eax
        vmovss -156(%ebp), %xmm0
        vmovss %xmm0, (%eax)
        movl -216(%ebp), %eax
        vmovss -152(%ebp), %xmm0
        vmovss %xmm0, (%eax)
        movl -212(%ebp), %eax
        vmovss -148(%ebp), %xmm0
        vmovss %xmm0, (%eax)
        movl -208(%ebp), %eax
        vmovss -144(%ebp), %xmm0
        vmovss %xmm0, (%eax)
        movl -204(%ebp), %eax
        vmovss -140(%ebp), %xmm0
        vmovss %xmm0, (%eax)
        movl -200(%ebp), %eax
        vmovss -136(%ebp), %xmm0
        vmovss %xmm0, (%eax)
        movl -196(%ebp), %eax
        vmovss -132(%ebp), %xmm0
        vmovss %xmm0, (%eax)
        movl -192(%ebp), %eax
        vmovss -128(%ebp), %xmm0
        vmovss %xmm0, (%eax)
        movl -188(%ebp), %eax
        vmovss -124(%ebp), %xmm0
        vmovss %xmm0, (%eax)
        movl -184(%ebp), %eax
        vmovss -120(%ebp), %xmm0
        vmovss %xmm0, (%eax)
        movl -180(%ebp), %eax
        vmovss -116(%ebp), %xmm0
        vmovss %xmm0, (%eax)
        /* Scatter cos results: pointer at -304+4*i(%ebp), value at
           -112+4*i(%ebp).  */
        movl -304(%ebp), %eax
        vmovss -112(%ebp), %xmm0
        vmovss %xmm0, (%eax)
        movl -300(%ebp), %eax
        vmovss -108(%ebp), %xmm0
        vmovss %xmm0, (%eax)
        movl -296(%ebp), %eax
        vmovss -104(%ebp), %xmm0
        vmovss %xmm0, (%eax)
        movl -292(%ebp), %eax
        vmovss -100(%ebp), %xmm0
        vmovss %xmm0, (%eax)
        movl -288(%ebp), %eax
        vmovss -96(%ebp), %xmm0
        vmovss %xmm0, (%eax)
        movl -284(%ebp), %eax
        vmovss -92(%ebp), %xmm0
        vmovss %xmm0, (%eax)
        movl -280(%ebp), %eax
        vmovss -88(%ebp), %xmm0
        vmovss %xmm0, (%eax)
        movl -276(%ebp), %eax
        vmovss -84(%ebp), %xmm0
        vmovss %xmm0, (%eax)
        movl -272(%ebp), %eax
        vmovss -80(%ebp), %xmm0
        vmovss %xmm0, (%eax)
        movl -268(%ebp), %eax
        vmovss -76(%ebp), %xmm0
        vmovss %xmm0, (%eax)
        movl -264(%ebp), %eax
        vmovss -72(%ebp), %xmm0
        vmovss %xmm0, (%eax)
        movl -260(%ebp), %eax
        vmovss -68(%ebp), %xmm0
        vmovss %xmm0, (%eax)
        movl -256(%ebp), %eax
        vmovss -64(%ebp), %xmm0
        vmovss %xmm0, (%eax)
        movl -252(%ebp), %eax
        vmovss -60(%ebp), %xmm0
        vmovss %xmm0, (%eax)
        movl -248(%ebp), %eax
        vmovss -56(%ebp), %xmm0
        vmovss %xmm0, (%eax)
        movl -244(%ebp), %eax
        vmovss -52(%ebp), %xmm0
        vmovss %xmm0, (%eax)
        /* Unwind the frame: pop callee-saved regs, then recover the
           pre-alignment stack pointer from r10.  */
        addl $344, %esp
        popq %rbx
        popq %r10
        .cfi_def_cfa 10, 0
        popq %r12
        popq %rbp
        leal -8(%r10), %esp
        .cfi_def_cfa 7, 8
        ret
#endif
.endm
355 | |
/* _ZGVeN16vvv_sincosf (x, sinptrs, cosptrs): 16-lane AVX-512 sincosf
   variant that receives a vector of per-lane result pointers for sin
   and for cos (the "vvv" form generated for #pragma omp declare simd
   notinbranch).  Expands WRAPPER_IMPL_AVX512_fFF_vvv, which performs
   two 8-lane AVX2 calls and scatters the results through the
   caller-supplied pointers.  */
ENTRY (_ZGVeN16vvv_sincosf)
WRAPPER_IMPL_AVX512_fFF_vvv _ZGVdN8vl4l4_sincosf
END (_ZGVeN16vvv_sincosf)
359 | |