/* Function sincosf vectorized with AVX-512. Wrapper to AVX2 version.
   Copyright (C) 2014-2016 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>
#include "svml_s_wrapper_impl.h"

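/* The vl4l4 variant takes the 16 single-precision inputs in %zmm0
   plus two array pointers (linear step 4) for the sin and cos
   results; the generic WRAPPER_IMPL_AVX512_fFF macro from
   svml_s_wrapper_impl.h handles splitting the work into two AVX2
   calls.  */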
	.text
ENTRY (_ZGVeN16vl4l4_sincosf)
WRAPPER_IMPL_AVX512_fFF _ZGVdN8vl4l4_sincosf
END (_ZGVeN16vl4l4_sincosf)

/* AVX512 ISA version as wrapper to AVX2 ISA version (for vector
   function declared with #pragma omp declare simd notinbranch).  */
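/* Convention for the vvv variant: %zmm0 carries the 16 inputs and
   %zmm1..%zmm4 carry one destination pointer per lane (16 sin
   pointers, then 16 cos pointers; two registers per set, since
   16 64-bit pointers need 128 bytes).  Roughly, as a hypothetical
   C illustration of the incoming arguments:
     void _ZGVeN16vvv_sincosf (__m512 x,
			       __m512i sin_ptrs_lo, __m512i sin_ptrs_hi,
			       __m512i cos_ptrs_lo, __m512i cos_ptrs_hi);
   Each result is therefore scattered through its own pointer after
   the two AVX2 calls below.  */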
.macro WRAPPER_IMPL_AVX512_fFF_vvv callee
#ifndef __ILP32__
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	andq	$-64, %rsp
	subq	$448, %rsp
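	/* Scratch layout (offsets from %rsp): 0..63 sin results,
	   64..127 cos results, 128..255 sin pointers from %zmm1/%zmm2,
	   256..383 cos pointers from %zmm3/%zmm4, 384..447 the input
	   vector from %zmm0.  The AVX-512 stores below are emitted as
	   raw .byte sequences, presumably so the file still assembles
	   with assemblers lacking AVX-512 support.  */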
	/* Encoding for vmovups %zmm0, 384(%rsp).  */
	.byte	0x62
	.byte	0xf1
	.byte	0x7c
	.byte	0x48
	.byte	0x11
	.byte	0x44
	.byte	0x24
	.byte	0x06
	lea	(%rsp), %rdi
	/* Encoding for vmovups %zmm1, 128(%rdi).  */
	.byte	0x62
	.byte	0xf1
	.byte	0x7c
	.byte	0x48
	.byte	0x11
	.byte	0x4f
	.byte	0x02
	/* Encoding for vmovups %zmm2, 192(%rdi).  */
	.byte	0x62
	.byte	0xf1
	.byte	0x7c
	.byte	0x48
	.byte	0x11
	.byte	0x57
	.byte	0x03
	/* Encoding for vmovups %zmm3, 256(%rdi).  */
	.byte	0x62
	.byte	0xf1
	.byte	0x7c
	.byte	0x48
	.byte	0x11
	.byte	0x5f
	.byte	0x04
	/* Encoding for vmovups %zmm4, 320(%rdi).  */
	.byte	0x62
	.byte	0xf1
	.byte	0x7c
	.byte	0x48
	.byte	0x11
	.byte	0x67
	.byte	0x05
	lea	64(%rsp), %rsi
	call	HIDDEN_JUMPTARGET(\callee)
	vmovdqu	416(%rsp), %ymm0
	lea	32(%rsp), %rdi
	lea	96(%rsp), %rsi
	call	HIDDEN_JUMPTARGET(\callee)
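	/* Both halves are done; scatter the 16 sin values at
	   0(%rsp)..60(%rsp) and the 16 cos values at 64(%rsp)..124(%rsp)
	   through the per-lane pointers reloaded from 128(%rsp) up.  */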
	movq	128(%rsp), %rdx
	movq	136(%rsp), %rsi
	movq	144(%rsp), %r8
	movq	152(%rsp), %r10
	movl	(%rsp), %eax
	movl	4(%rsp), %ecx
	movl	8(%rsp), %edi
	movl	12(%rsp), %r9d
	movl	%eax, (%rdx)
	movl	%ecx, (%rsi)
	movq	160(%rsp), %rax
	movq	168(%rsp), %rcx
	movl	%edi, (%r8)
	movl	%r9d, (%r10)
	movq	176(%rsp), %rdi
	movq	184(%rsp), %r9
	movl	16(%rsp), %r11d
	movl	20(%rsp), %edx
	movl	24(%rsp), %esi
	movl	28(%rsp), %r8d
	movl	%r11d, (%rax)
	movl	%edx, (%rcx)
	movq	192(%rsp), %r11
	movq	200(%rsp), %rdx
	movl	%esi, (%rdi)
	movl	%r8d, (%r9)
	movq	208(%rsp), %rsi
	movq	216(%rsp), %r8
	movl	32(%rsp), %r10d
	movl	36(%rsp), %eax
	movl	40(%rsp), %ecx
	movl	44(%rsp), %edi
	movl	%r10d, (%r11)
	movl	%eax, (%rdx)
	movq	224(%rsp), %r10
	movq	232(%rsp), %rax
	movl	%ecx, (%rsi)
	movl	%edi, (%r8)
	movq	240(%rsp), %rcx
	movq	248(%rsp), %rdi
	movl	48(%rsp), %r9d
	movl	52(%rsp), %r11d
	movl	56(%rsp), %edx
	movl	60(%rsp), %esi
	movl	%r9d, (%r10)
	movl	%r11d, (%rax)
	movq	256(%rsp), %r9
	movq	264(%rsp), %r11
	movl	%edx, (%rcx)
	movl	%esi, (%rdi)
	movq	272(%rsp), %rdx
	movq	280(%rsp), %rsi
	movl	64(%rsp), %r8d
	movl	68(%rsp), %r10d
	movl	72(%rsp), %eax
	movl	76(%rsp), %ecx
	movl	%r8d, (%r9)
	movl	%r10d, (%r11)
	movq	288(%rsp), %r8
	movq	296(%rsp), %r10
	movl	%eax, (%rdx)
	movl	%ecx, (%rsi)
	movq	304(%rsp), %rax
	movq	312(%rsp), %rcx
	movl	80(%rsp), %edi
	movl	84(%rsp), %r9d
	movl	88(%rsp), %r11d
	movl	92(%rsp), %edx
	movl	%edi, (%r8)
	movl	%r9d, (%r10)
	movq	320(%rsp), %rdi
	movq	328(%rsp), %r9
	movl	%r11d, (%rax)
	movl	%edx, (%rcx)
	movq	336(%rsp), %r11
	movq	344(%rsp), %rdx
	movl	96(%rsp), %esi
	movl	100(%rsp), %r8d
	movl	104(%rsp), %r10d
	movl	108(%rsp), %eax
	movl	%esi, (%rdi)
	movl	%r8d, (%r9)
	movq	352(%rsp), %rsi
	movq	360(%rsp), %r8
	movl	%r10d, (%r11)
	movl	%eax, (%rdx)
	movq	368(%rsp), %r10
	movq	376(%rsp), %rax
	movl	112(%rsp), %ecx
	movl	116(%rsp), %edi
	movl	120(%rsp), %r9d
	movl	124(%rsp), %r11d
	movl	%ecx, (%rsi)
	movl	%edi, (%r8)
	movl	%r9d, (%r10)
	movl	%r11d, (%rax)
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
#else
	leal	8(%rsp), %r10d
	.cfi_def_cfa 10, 0
	andl	$-64, %esp
	pushq	-8(%r10d)
	pushq	%rbp
	.cfi_escape 0x10,0x6,0x2,0x76,0
	movl	%esp, %ebp
	pushq	%r12
	leal	-112(%rbp), %esi
	pushq	%r10
	.cfi_escape 0xf,0x3,0x76,0x70,0x6
	.cfi_escape 0x10,0xc,0x2,0x76,0x78
	leal	-176(%rbp), %edi
	movq	%rsi, %r12
	pushq	%rbx
	.cfi_escape 0x10,0x3,0x2,0x76,0x68
	movq	%rdi, %rbx
	subl	$344, %esp
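	/* x32 layout (offsets from %ebp): -368 the input vector from
	   %zmm0, -304 the 16 32-bit cos pointers from %zmm2, -240 the
	   16 sin pointers from %zmm1, -176..-113 sin results,
	   -112..-49 cos results.  The leading 0x67 byte in each
	   encoding below is the address-size prefix selecting 32-bit
	   addressing under ILP32.  */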
	/* Encoding for vmovdqa64 %zmm1, -240(%ebp).  */
	.byte	0x67
	.byte	0x62
	.byte	0xf1
	.byte	0xfd
	.byte	0x48
	.byte	0x7f
	.byte	0x8d
	.byte	0x10
	.byte	0xff
	.byte	0xff
	.byte	0xff
	/* Encoding for vmovdqa64 %zmm2, -304(%ebp).  */
	.byte	0x67
	.byte	0x62
	.byte	0xf1
	.byte	0xfd
	.byte	0x48
	.byte	0x7f
	.byte	0x95
	.byte	0xd0
	.byte	0xfe
	.byte	0xff
	.byte	0xff
	/* Encoding for vmovaps %zmm0, -368(%ebp).  */
	.byte	0x67
	.byte	0x62
	.byte	0xf1
	.byte	0x7c
	.byte	0x48
	.byte	0x29
	.byte	0x85
	.byte	0x90
	.byte	0xfe
	.byte	0xff
	.byte	0xff
	call	HIDDEN_JUMPTARGET(\callee)
	leal	32(%r12), %esi
	vmovups	-336(%ebp), %ymm0
	leal	32(%rbx), %edi
	call	HIDDEN_JUMPTARGET(\callee)
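	/* Scatter the results: for each of the 32 lanes, load the
	   32-bit destination pointer and store the matching float
	   through it (16 sin lanes, then 16 cos lanes).  */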
	movl	-240(%ebp), %eax
	vmovss	-176(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-236(%ebp), %eax
	vmovss	-172(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-232(%ebp), %eax
	vmovss	-168(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-228(%ebp), %eax
	vmovss	-164(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-224(%ebp), %eax
	vmovss	-160(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-220(%ebp), %eax
	vmovss	-156(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-216(%ebp), %eax
	vmovss	-152(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-212(%ebp), %eax
	vmovss	-148(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-208(%ebp), %eax
	vmovss	-144(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-204(%ebp), %eax
	vmovss	-140(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-200(%ebp), %eax
	vmovss	-136(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-196(%ebp), %eax
	vmovss	-132(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-192(%ebp), %eax
	vmovss	-128(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-188(%ebp), %eax
	vmovss	-124(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-184(%ebp), %eax
	vmovss	-120(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-180(%ebp), %eax
	vmovss	-116(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-304(%ebp), %eax
	vmovss	-112(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-300(%ebp), %eax
	vmovss	-108(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-296(%ebp), %eax
	vmovss	-104(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-292(%ebp), %eax
	vmovss	-100(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-288(%ebp), %eax
	vmovss	-96(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-284(%ebp), %eax
	vmovss	-92(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-280(%ebp), %eax
	vmovss	-88(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-276(%ebp), %eax
	vmovss	-84(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-272(%ebp), %eax
	vmovss	-80(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-268(%ebp), %eax
	vmovss	-76(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-264(%ebp), %eax
	vmovss	-72(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-260(%ebp), %eax
	vmovss	-68(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-256(%ebp), %eax
	vmovss	-64(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-252(%ebp), %eax
	vmovss	-60(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-248(%ebp), %eax
	vmovss	-56(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	movl	-244(%ebp), %eax
	vmovss	-52(%ebp), %xmm0
	vmovss	%xmm0, (%eax)
	addl	$344, %esp
	popq	%rbx
	popq	%r10
	.cfi_def_cfa 10, 0
	popq	%r12
	popq	%rbp
	leal	-8(%r10), %esp
	.cfi_def_cfa 7, 8
	ret
#endif
.endm

ENTRY (_ZGVeN16vvv_sincosf)
WRAPPER_IMPL_AVX512_fFF_vvv _ZGVdN8vl4l4_sincosf
END (_ZGVeN16vvv_sincosf)