1/* Wrapper implementations of vector math functions.
2 Copyright (C) 2014-2016 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
18
/* SSE2 ISA version as wrapper to scalar.

   In:  %xmm0 = two packed doubles.
   Out: %xmm0 = two packed results, lane for lane.

   \callee is called once per lane with its argument in the low double
   of %xmm0; its result is taken from the low double of %xmm0.  */
.macro WRAPPER_IMPL_SSE2 callee
	/* Frame layout: 0..15 input vector, 16..23 result for lane 0.
	   24 bytes plus the pushed return address keep %rsp 16-byte
	   aligned at each call, which the movaps below also relies on.  */
	subq	$24, %rsp
	cfi_adjust_cfa_offset(24)
	movaps	%xmm0, (%rsp)
	/* Lane 0 is already in the low double of %xmm0.  */
	call	JUMPTARGET(\callee)
	movsd	%xmm0, 16(%rsp)
	/* Lane 1.  */
	movsd	8(%rsp), %xmm0
	call	JUMPTARGET(\callee)
	/* The lane 1 result is still live in %xmm0, so it is combined
	   directly instead of being spilled first.  */
	movsd	16(%rsp), %xmm1
	unpcklpd %xmm0, %xmm1
	movaps	%xmm1, %xmm0
	addq	$24, %rsp
	cfi_adjust_cfa_offset(-24)
	ret
.endm
36
/* 2 argument SSE2 ISA version as wrapper to scalar.

   In:  %xmm0 = two packed doubles (first argument),
        %xmm1 = two packed doubles (second argument).
   Out: %xmm0 = two packed results, lane for lane.

   \callee is called once per lane with its arguments in the low doubles
   of %xmm0 and %xmm1; its result is taken from the low double of
   %xmm0.  */
.macro WRAPPER_IMPL_SSE2_ff callee
	/* Frame layout: 0..15 first argument vector, 16..31 second argument
	   vector, 32..39 result for lane 0.  40 bytes plus the pushed
	   return address keep %rsp 16-byte aligned at each call, which the
	   movaps stores below also rely on.  */
	subq	$40, %rsp
	cfi_adjust_cfa_offset(40)
	movaps	%xmm0, (%rsp)
	movaps	%xmm1, 16(%rsp)
	/* Lane 0 is already in the low doubles of %xmm0/%xmm1.  */
	call	JUMPTARGET(\callee)
	movsd	%xmm0, 32(%rsp)
	/* Lane 1.  */
	movsd	8(%rsp), %xmm0
	movsd	24(%rsp), %xmm1
	call	JUMPTARGET(\callee)
	/* The lane 1 result is still live in %xmm0, so it is combined
	   directly instead of being spilled first.  */
	movsd	32(%rsp), %xmm1
	unpcklpd %xmm0, %xmm1
	movaps	%xmm1, %xmm0
	addq	$40, %rsp
	cfi_adjust_cfa_offset(-40)
	ret
.endm
56
/* 3 argument SSE2 ISA version as wrapper to scalar.

   In:  %xmm0 = two packed doubles,
        %rdi  = first output array (two doubles are stored),
        %rsi  = second output array (two doubles are stored).

   \callee is called once per lane with its input in the low double of
   %xmm0 and with %rdi/%rsi pointing at 8-byte stack slots; the slots
   are then copied to the caller's arrays.  */
.macro WRAPPER_IMPL_SSE2_fFF callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	pushq	%rbx
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbx, 0)
	/* Frame layout: 0..15 input vector, 16..23 slot passed in %rsi,
	   24..31 slot passed in %rdi.  */
	subq	$40, %rsp
	cfi_adjust_cfa_offset(40)
	/* The output pointers must survive the calls, so keep them in
	   callee-saved registers.  */
	movq	%rsi, %rbx
	movq	%rdi, %rbp
	movaps	%xmm0, (%rsp)
	/* Lane 0.  */
	leaq	24(%rsp), %rdi
	leaq	16(%rsp), %rsi
	call	JUMPTARGET(\callee)
	movsd	24(%rsp), %xmm0
	movsd	%xmm0, 0(%rbp)
	movsd	16(%rsp), %xmm0
	movsd	%xmm0, (%rbx)
	/* Lane 1: bring the high double of the saved input down into the
	   low double, and rebuild the slot pointers since %rdi/%rsi are
	   not preserved across the call.  */
	movapd	(%rsp), %xmm0
	unpckhpd %xmm0, %xmm0
	leaq	24(%rsp), %rdi
	leaq	16(%rsp), %rsi
	call	JUMPTARGET(\callee)
	movsd	24(%rsp), %xmm0
	movsd	%xmm0, 8(%rbp)
	movsd	16(%rsp), %xmm0
	movsd	%xmm0, 8(%rbx)
	addq	$40, %rsp
	cfi_adjust_cfa_offset(-40)
	popq	%rbx
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbx)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm
97
/* AVX/AVX2 ISA version as wrapper to SSE ISA version.

   In:  %ymm0 = 256-bit input vector.
   Out: %ymm0 = 256-bit result vector.

   \callee is called twice, once per 128-bit half, with its input and
   result in %xmm0.  */
.macro WRAPPER_IMPL_AVX callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	/* 32-byte aligned scratch area: 0..15 upper input half,
	   16..31 result for the lower half.  */
	andq	$-32, %rsp
	subq	$32, %rsp
	vextractf128 $1, %ymm0, (%rsp)
	vzeroupper
	/* Lower half: already in %xmm0.  */
	call	HIDDEN_JUMPTARGET(\callee)
	vmovapd	%xmm0, 16(%rsp)
	/* Upper half.  */
	vmovaps	(%rsp), %xmm0
	call	HIDDEN_JUMPTARGET(\callee)
	/* %ymm0 = { saved lower result, upper result in %xmm0 }.  */
	vmovapd	16(%rsp), %xmm1
	vinsertf128 $1, %xmm0, %ymm1, %ymm0
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm
123
/* 2 argument AVX/AVX2 ISA version as wrapper to SSE ISA version.

   In:  %ymm0, %ymm1 = 256-bit input vectors.
   Out: %ymm0 = 256-bit result vector.

   \callee is called twice, once per 128-bit half, with its inputs in
   %xmm0/%xmm1 and its result in %xmm0.  */
.macro WRAPPER_IMPL_AVX_ff callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	/* 32-byte aligned scratch area: 0..15 upper half of %ymm1,
	   16..31 upper half of %ymm0, 32..47 result for the lower halves.  */
	andq	$-32, %rsp
	subq	$64, %rsp
	vextractf128 $1, %ymm1, (%rsp)
	vextractf128 $1, %ymm0, 16(%rsp)
	vzeroupper
	/* Lower halves: already in %xmm0/%xmm1.  */
	call	HIDDEN_JUMPTARGET(\callee)
	vmovaps	%xmm0, 32(%rsp)
	/* Upper halves.  */
	vmovaps	(%rsp), %xmm1
	vmovaps	16(%rsp), %xmm0
	call	HIDDEN_JUMPTARGET(\callee)
	/* %ymm0 = { saved lower result, upper result in %xmm0 }.  */
	vmovaps	32(%rsp), %xmm1
	vinsertf128 $1, %xmm0, %ymm1, %ymm0
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm
151
/* 3 argument AVX/AVX2 ISA version as wrapper to SSE ISA version.

   In:  %ymm0 = 256-bit input vector,
        %rdi  = first output array,
        %rsi  = second output array.

   \callee is called twice, once per 128-bit half of %ymm0.  The first
   call receives the caller's pointers unchanged; the second call writes
   into stack scratch space, and those 16 bytes per output are then
   copied to offset 16 of the caller's arrays.  */
.macro WRAPPER_IMPL_AVX_fFF callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	/* Save the callee-saved registers before realigning the stack so
	   their slots sit at fixed offsets from %rbp.  The CFA is tracked
	   through %rbp from here on, so its offset must not be adjusted
	   for these pushes.  */
	pushq	%r13
	cfi_rel_offset (%r13, -8)
	pushq	%r14
	cfi_rel_offset (%r14, -16)
	/* 32-byte aligned scratch area: 0..15 first output of the second
	   call, 16..31 second output of the second call, 32..47 upper
	   input half, 48..63 padding.  */
	andq	$-32, %rsp
	subq	$64, %rsp
	movq	%rsi, %r14
	movq	%rdi, %r13
	vextractf128 $1, %ymm0, 32(%rsp)
	vzeroupper
	/* Lower half: %xmm0, %rdi and %rsi are still as passed in.  */
	call	HIDDEN_JUMPTARGET(\callee)
	/* Upper half, with outputs redirected to the scratch area.  */
	vmovaps	32(%rsp), %xmm0
	lea	(%rsp), %rdi
	lea	16(%rsp), %rsi
	call	HIDDEN_JUMPTARGET(\callee)
	vmovapd	(%rsp), %xmm0
	vmovapd	16(%rsp), %xmm1
	/* Unaligned stores: nothing here guarantees that the caller's
	   arrays are 16-byte aligned.  */
	vmovupd	%xmm0, 16(%r13)
	vmovupd	%xmm1, 16(%r14)
	/* Drop the scratch area and point %rsp at the saved %r14.  */
	leaq	-16(%rbp), %rsp
	popq	%r14
	cfi_restore (%r14)
	popq	%r13
	cfi_restore (%r13)
	/* %rsp equals %rbp again, so the CFA can switch back to %rsp.  */
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm
194
/* AVX512 ISA version as wrapper to AVX2 ISA version.

   In:  %zmm0 = 512-bit input vector.
   Out: %zmm0 = 512-bit result vector.

   \callee is called twice, once per 256-bit half, with its input and
   result in %ymm0.  The 512-bit moves are emitted as raw bytes
   (presumably so that assemblers without AVX-512 support can still
   build this file).  */
.macro WRAPPER_IMPL_AVX512 callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	/* 64-byte aligned scratch area: 0..63 input vector,
	   64..127 result vector.  */
	andq	$-64, %rsp
	subq	$128, %rsp
	/* vmovups %zmm0, (%rsp)  */
	.byte	0x62, 0xf1, 0x7c, 0x48, 0x11, 0x04, 0x24
	/* Lower half.  */
	vmovupd	(%rsp), %ymm0
	call	HIDDEN_JUMPTARGET(\callee)
	vmovupd	%ymm0, 64(%rsp)
	/* Upper half.  */
	vmovupd	32(%rsp), %ymm0
	call	HIDDEN_JUMPTARGET(\callee)
	vmovupd	%ymm0, 96(%rsp)
	/* vmovups 64(%rsp), %zmm0  */
	.byte	0x62, 0xf1, 0x7c, 0x48, 0x10, 0x44, 0x24, 0x01
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm
234
/* 2 argument AVX512 ISA version as wrapper to AVX2 ISA version.

   In:  %zmm0, %zmm1 = 512-bit input vectors.
   Out: %zmm0 = 512-bit result vector.

   \callee is called twice, once per 256-bit half, with its inputs in
   %ymm0/%ymm1 and its result in %ymm0.  The 512-bit moves are emitted
   as raw bytes (presumably so that assemblers without AVX-512 support
   can still build this file).  */
.macro WRAPPER_IMPL_AVX512_ff callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	/* 64-byte aligned scratch area: 0..63 first input vector,
	   64..127 second input vector, 128..191 result vector.  */
	andq	$-64, %rsp
	subq	$192, %rsp
	/* vmovups %zmm0, (%rsp)  */
	.byte	0x62, 0xf1, 0x7c, 0x48, 0x11, 0x04, 0x24
	/* vmovups %zmm1, 64(%rsp)  */
	.byte	0x62, 0xf1, 0x7c, 0x48, 0x11, 0x4c, 0x24, 0x01
	/* Lower halves.  */
	vmovupd	(%rsp), %ymm0
	vmovupd	64(%rsp), %ymm1
	call	HIDDEN_JUMPTARGET(\callee)
	vmovupd	%ymm0, 128(%rsp)
	/* Upper halves.  */
	vmovupd	32(%rsp), %ymm0
	vmovupd	96(%rsp), %ymm1
	call	HIDDEN_JUMPTARGET(\callee)
	vmovupd	%ymm0, 160(%rsp)
	/* vmovups 128(%rsp), %zmm0  */
	.byte	0x62, 0xf1, 0x7c, 0x48, 0x10, 0x44, 0x24, 0x02
	movq	%rbp, %rsp
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm
285
/* 3 argument AVX512 ISA version as wrapper to AVX2 ISA version.

   In:  %zmm0 = 512-bit input vector,
        %rdi  = first output array,
        %rsi  = second output array.

   \callee is called twice, once per 256-bit half of %zmm0.  The first
   call receives the caller's pointers unchanged; the second call writes
   into stack scratch space, and those 32 bytes per output are then
   copied to offset 32 of the caller's arrays.  The 512-bit store is
   emitted as raw bytes (presumably so that assemblers without AVX-512
   support can still build this file).  */
.macro WRAPPER_IMPL_AVX512_fFF callee
	pushq	%rbp
	cfi_adjust_cfa_offset (8)
	cfi_rel_offset (%rbp, 0)
	movq	%rsp, %rbp
	cfi_def_cfa_register (%rbp)
	/* Save the callee-saved registers before realigning the stack so
	   their slots sit at fixed offsets from %rbp.  The CFA is tracked
	   through %rbp from here on, so its offset must not be adjusted
	   for these pushes.  */
	pushq	%r12
	cfi_rel_offset (%r12, -8)
	pushq	%r13
	cfi_rel_offset (%r13, -16)
	/* 64-byte aligned scratch area: 0..63 input vector, 64..95 first
	   output of the second call, 96..127 second output of the second
	   call.  */
	andq	$-64, %rsp
	subq	$128, %rsp
	movq	%rsi, %r13
	movq	%rdi, %r12
	/* vmovups %zmm0, (%rsp)  */
	.byte	0x62, 0xf1, 0x7c, 0x48, 0x11, 0x04, 0x24
	/* Lower half: %rdi and %rsi are still as passed in.  */
	vmovupd	(%rsp), %ymm0
	call	HIDDEN_JUMPTARGET(\callee)
	/* Upper half, with outputs redirected to the scratch area.  */
	vmovupd	32(%rsp), %ymm0
	lea	64(%rsp), %rdi
	lea	96(%rsp), %rsi
	call	HIDDEN_JUMPTARGET(\callee)
	vmovupd	64(%rsp), %ymm0
	vmovupd	96(%rsp), %ymm1
	vmovupd	%ymm0, 32(%r12)
	vmovupd	%ymm1, 32(%r13)
	vzeroupper
	/* Drop the scratch area and point %rsp at the saved %r13.  */
	leaq	-16(%rbp), %rsp
	popq	%r13
	cfi_restore (%r13)
	popq	%r12
	cfi_restore (%r12)
	/* %rsp equals %rbp again, so the CFA can switch back to %rsp.  */
	cfi_def_cfa_register (%rsp)
	popq	%rbp
	cfi_adjust_cfa_offset (-8)
	cfi_restore (%rbp)
	ret
.endm
336