/* memmove/memcpy/mempcpy optimized with AVX512 for KNL hardware.
   Copyright (C) 2016-2017 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

#if IS_IN (libc)

# include "asm-syntax.h"

	.section .text.avx512,"ax",@progbits
# if defined SHARED && !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
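/* The _chk entries implement the fortified (_FORTIFY_SOURCE) variants:
   %rcx carries the known size of the destination object; if it is
   smaller than the copy length in %rdx, abort via __chk_fail.  */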
ENTRY (__mempcpy_chk_avx512_no_vzeroupper)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk_avx512_no_vzeroupper)

ENTRY (__mempcpy_avx512_no_vzeroupper)
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start)
END (__mempcpy_avx512_no_vzeroupper)
# endif

# ifdef SHARED
ENTRY (__memmove_chk_avx512_no_vzeroupper)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk_avx512_no_vzeroupper)
# endif

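/* Register conventions throughout: %rdi = destination, %rsi = source,
   %rdx = length.  %rax holds the return value (the original destination,
   or destination + length for mempcpy); %rcx and %r9 are set below to
   one past the end of the source and destination respectively.  */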
ENTRY (__memmove_avx512_no_vzeroupper)
	mov	%RDI_LP, %RAX_LP
# ifdef USE_AS_MEMPCPY
	add	%RDX_LP, %RAX_LP
# endif
L(start):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	mov	%edx, %edx
# endif
	lea	(%rsi, %rdx), %rcx
	lea	(%rdi, %rdx), %r9
	cmp	$512, %rdx
	ja	L(512bytesormore)

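/* Copies of at most 512 bytes: load up to 256 bytes from the head of
   the source and up to 256 bytes from its tail before storing anything.
   The two halves may overlap in the middle but together cover the whole
   region, so the copy is correct even for overlapping buffers.  */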
L(check):
	cmp	$16, %rdx
	jbe	L(less_16bytes)
	cmp	$256, %rdx
	jb	L(less_256bytes)
	vmovups	(%rsi), %zmm0
	vmovups	0x40(%rsi), %zmm1
	vmovups	0x80(%rsi), %zmm2
	vmovups	0xC0(%rsi), %zmm3
	vmovups	-0x100(%rcx), %zmm4
	vmovups	-0xC0(%rcx), %zmm5
	vmovups	-0x80(%rcx), %zmm6
	vmovups	-0x40(%rcx), %zmm7
	vmovups	%zmm0, (%rdi)
	vmovups	%zmm1, 0x40(%rdi)
	vmovups	%zmm2, 0x80(%rdi)
	vmovups	%zmm3, 0xC0(%rdi)
	vmovups	%zmm4, -0x100(%r9)
	vmovups	%zmm5, -0xC0(%r9)
	vmovups	%zmm6, -0x80(%r9)
	vmovups	%zmm7, -0x40(%r9)
	ret

L(less_256bytes):
	cmp	$128, %dl
	jb	L(less_128bytes)
	vmovups	(%rsi), %zmm0
	vmovups	0x40(%rsi), %zmm1
	vmovups	-0x80(%rcx), %zmm2
	vmovups	-0x40(%rcx), %zmm3
	vmovups	%zmm0, (%rdi)
	vmovups	%zmm1, 0x40(%rdi)
	vmovups	%zmm2, -0x80(%r9)
	vmovups	%zmm3, -0x40(%r9)
	ret

L(less_128bytes):
	cmp	$64, %dl
	jb	L(less_64bytes)
	vmovdqu	(%rsi), %ymm0
	vmovdqu	0x20(%rsi), %ymm1
	vmovdqu	-0x40(%rcx), %ymm2
	vmovdqu	-0x20(%rcx), %ymm3
	vmovdqu	%ymm0, (%rdi)
	vmovdqu	%ymm1, 0x20(%rdi)
	vmovdqu	%ymm2, -0x40(%r9)
	vmovdqu	%ymm3, -0x20(%r9)
	ret

L(less_64bytes):
	cmp	$32, %dl
	jb	L(less_32bytes)
	vmovdqu	(%rsi), %ymm0
	vmovdqu	-0x20(%rcx), %ymm1
	vmovdqu	%ymm0, (%rdi)
	vmovdqu	%ymm1, -0x20(%r9)
	ret

L(less_32bytes):
	vmovdqu	(%rsi), %xmm0
	vmovdqu	-0x10(%rcx), %xmm1
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, -0x10(%r9)
	ret

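/* At most 16 bytes: the same head/tail scheme with general-purpose
   registers, halving the operand width at each step down to one byte.
   All loads are done before any store, so overlap stays safe.  */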
L(less_16bytes):
	cmp	$8, %dl
	jb	L(less_8bytes)
	movq	(%rsi), %rsi
	movq	-0x8(%rcx), %rcx
	movq	%rsi, (%rdi)
	movq	%rcx, -0x8(%r9)
	ret

L(less_8bytes):
	cmp	$4, %dl
	jb	L(less_4bytes)
	mov	(%rsi), %esi
	mov	-0x4(%rcx), %ecx
	mov	%esi, (%rdi)
	mov	%ecx, -0x4(%r9)
	ret

L(less_4bytes):
	cmp	$2, %dl
	jb	L(less_2bytes)
	mov	(%rsi), %si
	mov	-0x2(%rcx), %cx
	mov	%si, (%rdi)
	mov	%cx, -0x2(%r9)
	ret

L(less_2bytes):
	cmp	$1, %dl
	jb	L(less_1bytes)
	mov	(%rsi), %cl
	mov	%cl, (%rdi)
L(less_1bytes):
	ret

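/* More than 512 bytes.  Copies of at least half the shared cache size
   take the non-temporal path at L(preloop_large); 513 to 1024 bytes are
   handled right here with one prefetched 512-byte head/tail pass.  */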
L(512bytesormore):
# ifdef SHARED_CACHE_SIZE_HALF
	mov	$SHARED_CACHE_SIZE_HALF, %r8
# else
	mov	__x86_shared_cache_size_half(%rip), %r8
# endif
	cmp	%r8, %rdx
	jae	L(preloop_large)
	cmp	$1024, %rdx
	ja	L(1024bytesormore)
	prefetcht1 (%rsi)
	prefetcht1 0x40(%rsi)
	prefetcht1 0x80(%rsi)
	prefetcht1 0xC0(%rsi)
	prefetcht1 0x100(%rsi)
	prefetcht1 0x140(%rsi)
	prefetcht1 0x180(%rsi)
	prefetcht1 0x1C0(%rsi)
	prefetcht1 -0x200(%rcx)
	prefetcht1 -0x1C0(%rcx)
	prefetcht1 -0x180(%rcx)
	prefetcht1 -0x140(%rcx)
	prefetcht1 -0x100(%rcx)
	prefetcht1 -0xC0(%rcx)
	prefetcht1 -0x80(%rcx)
	prefetcht1 -0x40(%rcx)
	vmovups	(%rsi), %zmm0
	vmovups	0x40(%rsi), %zmm1
	vmovups	0x80(%rsi), %zmm2
	vmovups	0xC0(%rsi), %zmm3
	vmovups	0x100(%rsi), %zmm4
	vmovups	0x140(%rsi), %zmm5
	vmovups	0x180(%rsi), %zmm6
	vmovups	0x1C0(%rsi), %zmm7
	vmovups	-0x200(%rcx), %zmm8
	vmovups	-0x1C0(%rcx), %zmm9
	vmovups	-0x180(%rcx), %zmm10
	vmovups	-0x140(%rcx), %zmm11
	vmovups	-0x100(%rcx), %zmm12
	vmovups	-0xC0(%rcx), %zmm13
	vmovups	-0x80(%rcx), %zmm14
	vmovups	-0x40(%rcx), %zmm15
	vmovups	%zmm0, (%rdi)
	vmovups	%zmm1, 0x40(%rdi)
	vmovups	%zmm2, 0x80(%rdi)
	vmovups	%zmm3, 0xC0(%rdi)
	vmovups	%zmm4, 0x100(%rdi)
	vmovups	%zmm5, 0x140(%rdi)
	vmovups	%zmm6, 0x180(%rdi)
	vmovups	%zmm7, 0x1C0(%rdi)
	vmovups	%zmm8, -0x200(%r9)
	vmovups	%zmm9, -0x1C0(%r9)
	vmovups	%zmm10, -0x180(%r9)
	vmovups	%zmm11, -0x140(%r9)
	vmovups	%zmm12, -0x100(%r9)
	vmovups	%zmm13, -0xC0(%r9)
	vmovups	%zmm14, -0x80(%r9)
	vmovups	%zmm15, -0x40(%r9)
	ret

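/* 1025 bytes or more, still below the cache threshold.  The forward
   loop preloads the last 512 source bytes into %zmm8-%zmm15 up front,
   so the tail can be stored after the loop finishes even if the source
   tail has by then been overwritten by an overlapping copy.  */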
L(1024bytesormore):
	cmp	%rsi, %rdi
	ja	L(1024bytesormore_bkw)
	sub	$512, %r9
	vmovups	-0x200(%rcx), %zmm8
	vmovups	-0x1C0(%rcx), %zmm9
	vmovups	-0x180(%rcx), %zmm10
	vmovups	-0x140(%rcx), %zmm11
	vmovups	-0x100(%rcx), %zmm12
	vmovups	-0xC0(%rcx), %zmm13
	vmovups	-0x80(%rcx), %zmm14
	vmovups	-0x40(%rcx), %zmm15
	prefetcht1 (%rsi)
	prefetcht1 0x40(%rsi)
	prefetcht1 0x80(%rsi)
	prefetcht1 0xC0(%rsi)
	prefetcht1 0x100(%rsi)
	prefetcht1 0x140(%rsi)
	prefetcht1 0x180(%rsi)
	prefetcht1 0x1C0(%rsi)

/* Loop with unaligned memory access.  */
L(gobble_512bytes_loop):
	vmovups	(%rsi), %zmm0
	vmovups	0x40(%rsi), %zmm1
	vmovups	0x80(%rsi), %zmm2
	vmovups	0xC0(%rsi), %zmm3
	vmovups	0x100(%rsi), %zmm4
	vmovups	0x140(%rsi), %zmm5
	vmovups	0x180(%rsi), %zmm6
	vmovups	0x1C0(%rsi), %zmm7
	add	$512, %rsi
	prefetcht1 (%rsi)
	prefetcht1 0x40(%rsi)
	prefetcht1 0x80(%rsi)
	prefetcht1 0xC0(%rsi)
	prefetcht1 0x100(%rsi)
	prefetcht1 0x140(%rsi)
	prefetcht1 0x180(%rsi)
	prefetcht1 0x1C0(%rsi)
	vmovups	%zmm0, (%rdi)
	vmovups	%zmm1, 0x40(%rdi)
	vmovups	%zmm2, 0x80(%rdi)
	vmovups	%zmm3, 0xC0(%rdi)
	vmovups	%zmm4, 0x100(%rdi)
	vmovups	%zmm5, 0x140(%rdi)
	vmovups	%zmm6, 0x180(%rdi)
	vmovups	%zmm7, 0x1C0(%rdi)
	add	$512, %rdi
	cmp	%r9, %rdi
	jb	L(gobble_512bytes_loop)
	vmovups	%zmm8, (%r9)
	vmovups	%zmm9, 0x40(%r9)
	vmovups	%zmm10, 0x80(%r9)
	vmovups	%zmm11, 0xC0(%r9)
	vmovups	%zmm12, 0x100(%r9)
	vmovups	%zmm13, 0x140(%r9)
	vmovups	%zmm14, 0x180(%r9)
	vmovups	%zmm15, 0x1C0(%r9)
	ret

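/* Destination above the source with possible overlap: run the same
   scheme backward from the end, preloading the first 512 source bytes
   into %zmm8-%zmm15 and storing them last.  */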
L(1024bytesormore_bkw):
	add	$512, %rdi
	vmovups	0x1C0(%rsi), %zmm8
	vmovups	0x180(%rsi), %zmm9
	vmovups	0x140(%rsi), %zmm10
	vmovups	0x100(%rsi), %zmm11
	vmovups	0xC0(%rsi), %zmm12
	vmovups	0x80(%rsi), %zmm13
	vmovups	0x40(%rsi), %zmm14
	vmovups	(%rsi), %zmm15
	prefetcht1 -0x40(%rcx)
	prefetcht1 -0x80(%rcx)
	prefetcht1 -0xC0(%rcx)
	prefetcht1 -0x100(%rcx)
	prefetcht1 -0x140(%rcx)
	prefetcht1 -0x180(%rcx)
	prefetcht1 -0x1C0(%rcx)
	prefetcht1 -0x200(%rcx)

/* Backward loop with unaligned memory access.  */
L(gobble_512bytes_loop_bkw):
	vmovups	-0x40(%rcx), %zmm0
	vmovups	-0x80(%rcx), %zmm1
	vmovups	-0xC0(%rcx), %zmm2
	vmovups	-0x100(%rcx), %zmm3
	vmovups	-0x140(%rcx), %zmm4
	vmovups	-0x180(%rcx), %zmm5
	vmovups	-0x1C0(%rcx), %zmm6
	vmovups	-0x200(%rcx), %zmm7
	sub	$512, %rcx
	prefetcht1 -0x40(%rcx)
	prefetcht1 -0x80(%rcx)
	prefetcht1 -0xC0(%rcx)
	prefetcht1 -0x100(%rcx)
	prefetcht1 -0x140(%rcx)
	prefetcht1 -0x180(%rcx)
	prefetcht1 -0x1C0(%rcx)
	prefetcht1 -0x200(%rcx)
	vmovups	%zmm0, -0x40(%r9)
	vmovups	%zmm1, -0x80(%r9)
	vmovups	%zmm2, -0xC0(%r9)
	vmovups	%zmm3, -0x100(%r9)
	vmovups	%zmm4, -0x140(%r9)
	vmovups	%zmm5, -0x180(%r9)
	vmovups	%zmm6, -0x1C0(%r9)
	vmovups	%zmm7, -0x200(%r9)
	sub	$512, %r9
	cmp	%rdi, %r9
	ja	L(gobble_512bytes_loop_bkw)
	vmovups	%zmm8, -0x40(%rdi)
	vmovups	%zmm9, -0x80(%rdi)
	vmovups	%zmm10, -0xC0(%rdi)
	vmovups	%zmm11, -0x100(%rdi)
	vmovups	%zmm12, -0x140(%rdi)
	vmovups	%zmm13, -0x180(%rdi)
	vmovups	%zmm14, -0x1C0(%rdi)
	vmovups	%zmm15, -0x200(%rdi)
	ret

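/* Very large copies (at least half the shared cache): stream the bulk
   of the data with non-temporal stores to avoid polluting the cache.
   The first 128 source bytes are saved in %zmm4/%zmm5 before %rdi is
   rounded up to a 128-byte boundary, and they are stored with regular
   stores at the end to cover the unaligned head.  */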
L(preloop_large):
	cmp	%rsi, %rdi
	ja	L(preloop_large_bkw)
	vmovups	(%rsi), %zmm4
	vmovups	0x40(%rsi), %zmm5

	/* For mempcpy %rax is destination + length rather than the
	   destination, so keep the original destination in %r11 for the
	   final head store below.  */
	mov	%rdi, %r11

	/* Align destination for access with non-temporal stores in the loop.  */
	mov	%rdi, %r8
	and	$-0x80, %rdi
	add	$0x80, %rdi
	sub	%rdi, %r8
	sub	%r8, %rsi
	add	%r8, %rdx
L(gobble_256bytes_nt_loop):
	prefetcht1 0x200(%rsi)
	prefetcht1 0x240(%rsi)
	prefetcht1 0x280(%rsi)
	prefetcht1 0x2C0(%rsi)
	prefetcht1 0x300(%rsi)
	prefetcht1 0x340(%rsi)
	prefetcht1 0x380(%rsi)
	prefetcht1 0x3C0(%rsi)
	vmovdqu64 (%rsi), %zmm0
	vmovdqu64 0x40(%rsi), %zmm1
	vmovdqu64 0x80(%rsi), %zmm2
	vmovdqu64 0xC0(%rsi), %zmm3
	vmovntdq %zmm0, (%rdi)
	vmovntdq %zmm1, 0x40(%rdi)
	vmovntdq %zmm2, 0x80(%rdi)
	vmovntdq %zmm3, 0xC0(%rdi)
	sub	$256, %rdx
	add	$256, %rsi
	add	$256, %rdi
	cmp	$256, %rdx
	ja	L(gobble_256bytes_nt_loop)
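	/* Non-temporal stores are weakly ordered; make them globally
	   visible before the ordinary stores and the return below.  */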
	sfence
	vmovups	%zmm4, (%r11)
	vmovups	%zmm5, 0x40(%r11)
	jmp	L(check)

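/* Backward variant of the non-temporal path: align the end of the
   destination instead, saving the last 128 source bytes in %zmm4/%zmm5
   for a final unaligned tail store.  */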
L(preloop_large_bkw):
	vmovups	-0x80(%rcx), %zmm4
	vmovups	-0x40(%rcx), %zmm5

	/* Align end of destination for access with non-temporal stores.  */
	mov	%r9, %r8
	and	$-0x80, %r9
	sub	%r9, %r8
	sub	%r8, %rcx
	sub	%r8, %rdx
	add	%r9, %r8
L(gobble_256bytes_nt_loop_bkw):
	prefetcht1 -0x400(%rcx)
	prefetcht1 -0x3C0(%rcx)
	prefetcht1 -0x380(%rcx)
	prefetcht1 -0x340(%rcx)
	prefetcht1 -0x300(%rcx)
	prefetcht1 -0x2C0(%rcx)
	prefetcht1 -0x280(%rcx)
	prefetcht1 -0x240(%rcx)
	vmovdqu64 -0x100(%rcx), %zmm0
	vmovdqu64 -0xC0(%rcx), %zmm1
	vmovdqu64 -0x80(%rcx), %zmm2
	vmovdqu64 -0x40(%rcx), %zmm3
	vmovntdq %zmm0, -0x100(%r9)
	vmovntdq %zmm1, -0xC0(%r9)
	vmovntdq %zmm2, -0x80(%r9)
	vmovntdq %zmm3, -0x40(%r9)
	sub	$256, %rdx
	sub	$256, %rcx
	sub	$256, %r9
	cmp	$256, %rdx
	ja	L(gobble_256bytes_nt_loop_bkw)
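	/* Order the non-temporal stores above before the regular tail
	   stores below.  */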
	sfence
	vmovups	%zmm4, -0x80(%r8)
	vmovups	%zmm5, -0x40(%r8)
	jmp	L(check)
END (__memmove_avx512_no_vzeroupper)

# ifdef SHARED
strong_alias (__memmove_avx512_no_vzeroupper, __memcpy_avx512_no_vzeroupper)
strong_alias (__memmove_chk_avx512_no_vzeroupper, __memcpy_chk_avx512_no_vzeroupper)
# endif
#endif