memmove-avx512-no-vzeroupper.S source code [glibc_src_2.25/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S]

1	/* memmove/memcpy/mempcpy optimized with AVX512 for KNL hardware.
2	Copyright (C) 2016-2017 Free Software Foundation, Inc.
3	This file is part of the GNU C Library.
4
5	The GNU C Library is free software; you can redistribute it and/or
6	modify it under the terms of the GNU Lesser General Public
7	License as published by the Free Software Foundation; either
8	version 2.1 of the License, or (at your option) any later version.
9
10	The GNU C Library is distributed in the hope that it will be useful,
11	but WITHOUT ANY WARRANTY; without even the implied warranty of
12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13	Lesser General Public License for more details.
14
15	You should have received a copy of the GNU Lesser General Public
16	License along with the GNU C Library; if not, see
17	<http://www.gnu.org/licenses/>.  */
18
19	#include <sysdep.h>
20
21	#if IS_IN (libc)
22
23	# include "asm-syntax.h"
24
25	.section .text.avx512,"ax",@progbits
26	# if defined SHARED && !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
27	ENTRY (__mempcpy_chk_avx512_no_vzeroupper)
28	cmp %RDX_LP, %RCX_LP
29	jb HIDDEN_JUMPTARGET (__chk_fail)
30	END (__mempcpy_chk_avx512_no_vzeroupper)
31
32	ENTRY (__mempcpy_avx512_no_vzeroupper)
33	mov %RDI_LP, %RAX_LP
34	add %RDX_LP, %RAX_LP
35	jmp L(start)
36	END (__mempcpy_avx512_no_vzeroupper)
37	# endif
38
39	# ifdef SHARED
40	ENTRY (__memmove_chk_avx512_no_vzeroupper)
41	cmp %RDX_LP, %RCX_LP
42	jb HIDDEN_JUMPTARGET (__chk_fail)
43	END (__memmove_chk_avx512_no_vzeroupper)
44	# endif
45
46	ENTRY (__memmove_avx512_no_vzeroupper)
47	mov %RDI_LP, %RAX_LP
48	# ifdef USE_AS_MEMPCPY
49	add %RDX_LP, %RAX_LP
50	# endif
51	L(start):
52	# ifdef __ILP32__
53	/ Clear the upper 32 bits. /
54	mov %edx, %edx
55	# endif
56	lea (%rsi, %rdx), %rcx
57	lea (%rdi, %rdx), %r9
58	cmp $`512`, %rdx
59	ja L(`512bytesormore`)
60
61	L(check):
62	cmp $`16`, %rdx
63	jbe L(less_16bytes)
64	cmp $`256`, %rdx
65	jb L(less_256bytes)
66	vmovups (%rsi), %zmm0
67	vmovups `0x40`(%rsi), %zmm1
68	vmovups `0x80`(%rsi), %zmm2
69	vmovups `0xC0`(%rsi), %zmm3
70	vmovups -`0x100`(%rcx), %zmm4
71	vmovups -`0xC0`(%rcx), %zmm5
72	vmovups -`0x80`(%rcx), %zmm6
73	vmovups -`0x40`(%rcx), %zmm7
74	vmovups %zmm0, (%rdi)
75	vmovups %zmm1, `0x40`(%rdi)
76	vmovups %zmm2, `0x80`(%rdi)
77	vmovups %zmm3, `0xC0`(%rdi)
78	vmovups %zmm4, -`0x100`(%r9)
79	vmovups %zmm5, -`0xC0`(%r9)
80	vmovups %zmm6, -`0x80`(%r9)
81	vmovups %zmm7, -`0x40`(%r9)
82	ret
83
84	L(less_256bytes):
85	cmp $`128`, %dl
86	jb L(less_128bytes)
87	vmovups (%rsi), %zmm0
88	vmovups `0x40`(%rsi), %zmm1
89	vmovups -`0x80`(%rcx), %zmm2
90	vmovups -`0x40`(%rcx), %zmm3
91	vmovups %zmm0, (%rdi)
92	vmovups %zmm1, `0x40`(%rdi)
93	vmovups %zmm2, -`0x80`(%r9)
94	vmovups %zmm3, -`0x40`(%r9)
95	ret
96
97	L(less_128bytes):
98	cmp $`64`, %dl
99	jb L(less_64bytes)
100	vmovdqu (%rsi), %ymm0
101	vmovdqu `0x20`(%rsi), %ymm1
102	vmovdqu -`0x40`(%rcx), %ymm2
103	vmovdqu -`0x20`(%rcx), %ymm3
104	vmovdqu %ymm0, (%rdi)
105	vmovdqu %ymm1, `0x20`(%rdi)
106	vmovdqu %ymm2, -`0x40`(%r9)
107	vmovdqu %ymm3, -`0x20`(%r9)
108	ret
109
110	L(less_64bytes):
111	cmp $`32`, %dl
112	jb L(less_32bytes)
113	vmovdqu (%rsi), %ymm0
114	vmovdqu -`0x20`(%rcx), %ymm1
115	vmovdqu %ymm0, (%rdi)
116	vmovdqu %ymm1, -`0x20`(%r9)
117	ret
118
119	L(less_32bytes):
120	vmovdqu (%rsi), %xmm0
121	vmovdqu -`0x10`(%rcx), %xmm1
122	vmovdqu %xmm0, (%rdi)
123	vmovdqu %xmm1, -`0x10`(%r9)
124	ret
125
126	L(less_16bytes):
127	cmp $`8`, %dl
128	jb L(less_8bytes)
129	movq (%rsi), %rsi
130	movq -`0x8`(%rcx), %rcx
131	movq %rsi, (%rdi)
132	movq %rcx, -`0x8`(%r9)
133	ret
134
135	L(less_8bytes):
136	cmp $`4`, %dl
137	jb L(less_4bytes)
138	mov (%rsi), %esi
139	mov -`0x4`(%rcx), %ecx
140	mov %esi, (%rdi)
141	mov %ecx, -`0x4`(%r9)
142	ret
143
144	L(less_4bytes):
145	cmp $`2`, %dl
146	jb L(less_2bytes)
147	mov (%rsi), %si
148	mov -`0x2`(%rcx), %cx
149	mov %si, (%rdi)
150	mov %cx, -`0x2`(%r9)
151	ret
152
153	L(less_2bytes):
154	cmp $`1`, %dl
155	jb L(less_1bytes)
156	mov (%rsi), %cl
157	mov %cl, (%rdi)
158	L(less_1bytes):
159	ret
160
161	L(`512bytesormore`):
162	# ifdef SHARED_CACHE_SIZE_HALF
163	mov $SHARED_CACHE_SIZE_HALF, %r8
164	# else
165	mov __x86_shared_cache_size_half(%rip), %r8
166	# endif
167	cmp %r8, %rdx
168	jae L(preloop_large)
169	cmp $`1024`, %rdx
170	ja L(`1024bytesormore`)
171	prefetcht1 (%rsi)
172	prefetcht1 `0x40`(%rsi)
173	prefetcht1 `0x80`(%rsi)
174	prefetcht1 `0xC0`(%rsi)
175	prefetcht1 `0x100`(%rsi)
176	prefetcht1 `0x140`(%rsi)
177	prefetcht1 `0x180`(%rsi)
178	prefetcht1 `0x1C0`(%rsi)
179	prefetcht1 -`0x200`(%rcx)
180	prefetcht1 -`0x1C0`(%rcx)
181	prefetcht1 -`0x180`(%rcx)
182	prefetcht1 -`0x140`(%rcx)
183	prefetcht1 -`0x100`(%rcx)
184	prefetcht1 -`0xC0`(%rcx)
185	prefetcht1 -`0x80`(%rcx)
186	prefetcht1 -`0x40`(%rcx)
187	vmovups (%rsi), %zmm0
188	vmovups `0x40`(%rsi), %zmm1
189	vmovups `0x80`(%rsi), %zmm2
190	vmovups `0xC0`(%rsi), %zmm3
191	vmovups `0x100`(%rsi), %zmm4
192	vmovups `0x140`(%rsi), %zmm5
193	vmovups `0x180`(%rsi), %zmm6
194	vmovups `0x1C0`(%rsi), %zmm7
195	vmovups -`0x200`(%rcx), %zmm8
196	vmovups -`0x1C0`(%rcx), %zmm9
197	vmovups -`0x180`(%rcx), %zmm10
198	vmovups -`0x140`(%rcx), %zmm11
199	vmovups -`0x100`(%rcx), %zmm12
200	vmovups -`0xC0`(%rcx), %zmm13
201	vmovups -`0x80`(%rcx), %zmm14
202	vmovups -`0x40`(%rcx), %zmm15
203	vmovups %zmm0, (%rdi)
204	vmovups %zmm1, `0x40`(%rdi)
205	vmovups %zmm2, `0x80`(%rdi)
206	vmovups %zmm3, `0xC0`(%rdi)
207	vmovups %zmm4, `0x100`(%rdi)
208	vmovups %zmm5, `0x140`(%rdi)
209	vmovups %zmm6, `0x180`(%rdi)
210	vmovups %zmm7, `0x1C0`(%rdi)
211	vmovups %zmm8, -`0x200`(%r9)
212	vmovups %zmm9, -`0x1C0`(%r9)
213	vmovups %zmm10, -`0x180`(%r9)
214	vmovups %zmm11, -`0x140`(%r9)
215	vmovups %zmm12, -`0x100`(%r9)
216	vmovups %zmm13, -`0xC0`(%r9)
217	vmovups %zmm14, -`0x80`(%r9)
218	vmovups %zmm15, -`0x40`(%r9)
219	ret
220
221	L(`1024bytesormore`):
222	cmp %rsi, %rdi
223	ja L(`1024bytesormore_bkw`)
224	sub $`512`, %r9
225	vmovups -`0x200`(%rcx), %zmm8
226	vmovups -`0x1C0`(%rcx), %zmm9
227	vmovups -`0x180`(%rcx), %zmm10
228	vmovups -`0x140`(%rcx), %zmm11
229	vmovups -`0x100`(%rcx), %zmm12
230	vmovups -`0xC0`(%rcx), %zmm13
231	vmovups -`0x80`(%rcx), %zmm14
232	vmovups -`0x40`(%rcx), %zmm15
233	prefetcht1 (%rsi)
234	prefetcht1 `0x40`(%rsi)
235	prefetcht1 `0x80`(%rsi)
236	prefetcht1 `0xC0`(%rsi)
237	prefetcht1 `0x100`(%rsi)
238	prefetcht1 `0x140`(%rsi)
239	prefetcht1 `0x180`(%rsi)
240	prefetcht1 `0x1C0`(%rsi)
241
242	/ Loop with unaligned memory access. /
243	L(gobble_512bytes_loop):
244	vmovups (%rsi), %zmm0
245	vmovups `0x40`(%rsi), %zmm1
246	vmovups `0x80`(%rsi), %zmm2
247	vmovups `0xC0`(%rsi), %zmm3
248	vmovups `0x100`(%rsi), %zmm4
249	vmovups `0x140`(%rsi), %zmm5
250	vmovups `0x180`(%rsi), %zmm6
251	vmovups `0x1C0`(%rsi), %zmm7
252	add $`512`, %rsi
253	prefetcht1 (%rsi)
254	prefetcht1 `0x40`(%rsi)
255	prefetcht1 `0x80`(%rsi)
256	prefetcht1 `0xC0`(%rsi)
257	prefetcht1 `0x100`(%rsi)
258	prefetcht1 `0x140`(%rsi)
259	prefetcht1 `0x180`(%rsi)
260	prefetcht1 `0x1C0`(%rsi)
261	vmovups %zmm0, (%rdi)
262	vmovups %zmm1, `0x40`(%rdi)
263	vmovups %zmm2, `0x80`(%rdi)
264	vmovups %zmm3, `0xC0`(%rdi)
265	vmovups %zmm4, `0x100`(%rdi)
266	vmovups %zmm5, `0x140`(%rdi)
267	vmovups %zmm6, `0x180`(%rdi)
268	vmovups %zmm7, `0x1C0`(%rdi)
269	add $`512`, %rdi
270	cmp %r9, %rdi
271	jb L(gobble_512bytes_loop)
272	vmovups %zmm8, (%r9)
273	vmovups %zmm9, `0x40`(%r9)
274	vmovups %zmm10, `0x80`(%r9)
275	vmovups %zmm11, `0xC0`(%r9)
276	vmovups %zmm12, `0x100`(%r9)
277	vmovups %zmm13, `0x140`(%r9)
278	vmovups %zmm14, `0x180`(%r9)
279	vmovups %zmm15, `0x1C0`(%r9)
280	ret
281
282	L(`1024bytesormore_bkw`):
283	add $`512`, %rdi
284	vmovups `0x1C0`(%rsi), %zmm8
285	vmovups `0x180`(%rsi), %zmm9
286	vmovups `0x140`(%rsi), %zmm10
287	vmovups `0x100`(%rsi), %zmm11
288	vmovups `0xC0`(%rsi), %zmm12
289	vmovups `0x80`(%rsi), %zmm13
290	vmovups `0x40`(%rsi), %zmm14
291	vmovups (%rsi), %zmm15
292	prefetcht1 -`0x40`(%rcx)
293	prefetcht1 -`0x80`(%rcx)
294	prefetcht1 -`0xC0`(%rcx)
295	prefetcht1 -`0x100`(%rcx)
296	prefetcht1 -`0x140`(%rcx)
297	prefetcht1 -`0x180`(%rcx)
298	prefetcht1 -`0x1C0`(%rcx)
299	prefetcht1 -`0x200`(%rcx)
300
301	/ Backward loop with unaligned memory access. /
302	L(gobble_512bytes_loop_bkw):
303	vmovups -`0x40`(%rcx), %zmm0
304	vmovups -`0x80`(%rcx), %zmm1
305	vmovups -`0xC0`(%rcx), %zmm2
306	vmovups -`0x100`(%rcx), %zmm3
307	vmovups -`0x140`(%rcx), %zmm4
308	vmovups -`0x180`(%rcx), %zmm5
309	vmovups -`0x1C0`(%rcx), %zmm6
310	vmovups -`0x200`(%rcx), %zmm7
311	sub $`512`, %rcx
312	prefetcht1 -`0x40`(%rcx)
313	prefetcht1 -`0x80`(%rcx)
314	prefetcht1 -`0xC0`(%rcx)
315	prefetcht1 -`0x100`(%rcx)
316	prefetcht1 -`0x140`(%rcx)
317	prefetcht1 -`0x180`(%rcx)
318	prefetcht1 -`0x1C0`(%rcx)
319	prefetcht1 -`0x200`(%rcx)
320	vmovups %zmm0, -`0x40`(%r9)
321	vmovups %zmm1, -`0x80`(%r9)
322	vmovups %zmm2, -`0xC0`(%r9)
323	vmovups %zmm3, -`0x100`(%r9)
324	vmovups %zmm4, -`0x140`(%r9)
325	vmovups %zmm5, -`0x180`(%r9)
326	vmovups %zmm6, -`0x1C0`(%r9)
327	vmovups %zmm7, -`0x200`(%r9)
328	sub $`512`, %r9
329	cmp %rdi, %r9
330	ja L(gobble_512bytes_loop_bkw)
331	vmovups %zmm8, -`0x40`(%rdi)
332	vmovups %zmm9, -`0x80`(%rdi)
333	vmovups %zmm10, -`0xC0`(%rdi)
334	vmovups %zmm11, -`0x100`(%rdi)
335	vmovups %zmm12, -`0x140`(%rdi)
336	vmovups %zmm13, -`0x180`(%rdi)
337	vmovups %zmm14, -`0x1C0`(%rdi)
338	vmovups %zmm15, -`0x200`(%rdi)
339	ret
340
341	L(preloop_large):
342	cmp %rsi, %rdi
343	ja L(preloop_large_bkw)
344	vmovups (%rsi), %zmm4
345	vmovups `0x40`(%rsi), %zmm5
346
347	/ Align destination for access with non-temporal stores in the loop. /
348	mov %rdi, %r8
349	and $-`0x80`, %rdi
350	add $`0x80`, %rdi
351	sub %rdi, %r8
352	sub %r8, %rsi
353	add %r8, %rdx
354	L(gobble_256bytes_nt_loop):
355	prefetcht1 `0x200`(%rsi)
356	prefetcht1 `0x240`(%rsi)
357	prefetcht1 `0x280`(%rsi)
358	prefetcht1 `0x2C0`(%rsi)
359	prefetcht1 `0x300`(%rsi)
360	prefetcht1 `0x340`(%rsi)
361	prefetcht1 `0x380`(%rsi)
362	prefetcht1 `0x3C0`(%rsi)
363	vmovdqu64 (%rsi), %zmm0
364	vmovdqu64 `0x40`(%rsi), %zmm1
365	vmovdqu64 `0x80`(%rsi), %zmm2
366	vmovdqu64 `0xC0`(%rsi), %zmm3
367	vmovntdq %zmm0, (%rdi)
368	vmovntdq %zmm1, `0x40`(%rdi)
369	vmovntdq %zmm2, `0x80`(%rdi)
370	vmovntdq %zmm3, `0xC0`(%rdi)
371	sub $`256`, %rdx
372	add $`256`, %rsi
373	add $`256`, %rdi
374	cmp $`256`, %rdx
375	ja L(gobble_256bytes_nt_loop)
376	sfence
377	vmovups %zmm4, (%rax)
378	vmovups %zmm5, `0x40`(%rax)
379	jmp L(check)
380
381	L(preloop_large_bkw):
382	vmovups -`0x80`(%rcx), %zmm4
383	vmovups -`0x40`(%rcx), %zmm5
384
385	/ Align end of destination for access with non-temporal stores. /
386	mov %r9, %r8
387	and $-`0x80`, %r9
388	sub %r9, %r8
389	sub %r8, %rcx
390	sub %r8, %rdx
391	add %r9, %r8
392	L(gobble_256bytes_nt_loop_bkw):
393	prefetcht1 -`0x400`(%rcx)
394	prefetcht1 -`0x3C0`(%rcx)
395	prefetcht1 -`0x380`(%rcx)
396	prefetcht1 -`0x340`(%rcx)
397	prefetcht1 -`0x300`(%rcx)
398	prefetcht1 -`0x2C0`(%rcx)
399	prefetcht1 -`0x280`(%rcx)
400	prefetcht1 -`0x240`(%rcx)
401	vmovdqu64 -`0x100`(%rcx), %zmm0
402	vmovdqu64 -`0xC0`(%rcx), %zmm1
403	vmovdqu64 -`0x80`(%rcx), %zmm2
404	vmovdqu64 -`0x40`(%rcx), %zmm3
405	vmovntdq %zmm0, -`0x100`(%r9)
406	vmovntdq %zmm1, -`0xC0`(%r9)
407	vmovntdq %zmm2, -`0x80`(%r9)
408	vmovntdq %zmm3, -`0x40`(%r9)
409	sub $`256`, %rdx
410	sub $`256`, %rcx
411	sub $`256`, %r9
412	cmp $`256`, %rdx
413	ja L(gobble_256bytes_nt_loop_bkw)
414	sfence
415	vmovups %zmm4, -`0x80`(%r8)
416	vmovups %zmm5, -`0x40`(%r8)
417	jmp L(check)
418	END (__memmove_avx512_no_vzeroupper)
419
420	# ifdef SHARED
421	strong_alias (__memmove_avx512_no_vzeroupper, __memcpy_avx512_no_vzeroupper)
422	strong_alias (__memmove_chk_avx512_no_vzeroupper, __memcpy_chk_avx512_no_vzeroupper)
423	# endif
424	#endif
425

Browse the source code of glibc_src_2.25/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S