/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
   Copyright (C) 2016 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

/* memmove/memcpy/mempcpy is implemented as:
   1. Use overlapping loads and stores to avoid branches.
   2. Load all sources into registers and store them together to avoid
      possible address overlap between source and destination.
   3. If size is 8 * VEC_SIZE or less, load all sources into registers
      and store them together.
   4. If address of destination > address of source, backward copy
      4 * VEC_SIZE at a time with unaligned load and aligned store.
      Load the first 4 * VEC and last VEC before the loop and store
      them after the loop to support overlapping addresses.
   5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
      load and aligned store.  Load the last 4 * VEC and first VEC
      before the loop and store them after the loop to support
      overlapping addresses.
   6. If size >= __x86_shared_non_temporal_threshold and there is no
      overlap between destination and source, use non-temporal stores
      instead of aligned stores.  */
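
/* A minimal C sketch of the overlapping trick in steps 1 and 2,
   assuming a VEC type of VEC_SIZE bytes and hypothetical
   load_unaligned/store_unaligned helpers (illustrative only):

       VEC first = load_unaligned (src);
       VEC last  = load_unaligned (src + size - VEC_SIZE);
       store_unaligned (dst, first);
       store_unaligned (dst + size - VEC_SIZE, last);

   For any size in [VEC_SIZE, 2 * VEC_SIZE] the two stores cover the
   whole buffer, overlapping in the middle, so no length branch is
   needed; loading both vectors before either store keeps overlapping
   buffers safe.  */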

#include <sysdep.h>

#ifndef MEMCPY_SYMBOL
# define MEMCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMPCPY_SYMBOL
# define MEMPCPY_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMMOVE_CHK_SYMBOL
# define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif

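/* VZEROUPPER clears the upper halves of the YMM/ZMM registers to avoid
   the AVX-to-SSE transition penalty after 256-bit or wider operations.
   It expands to nothing when VEC_SIZE == 16 since SSE2 code does not
   need it.  */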
#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER vzeroupper
# else
#  define VZEROUPPER
# endif
#endif

/* Threshold to use Enhanced REP MOVSB.  Since there is overhead to set
   up a REP MOVSB operation, REP MOVSB isn't faster for short copies.
   The memcpy micro benchmark in glibc shows that 2KB is the approximate
   value above which REP MOVSB becomes faster than the SSE2 optimization
   on processors with Enhanced REP MOVSB.  Since a larger register size
   can move more data with a single load and store, the threshold is
   higher with a larger register size.  */
#ifndef REP_MOVSB_THRESHOLD
# define REP_MOVSB_THRESHOLD (2048 * (VEC_SIZE / 16))
#endif
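
/* With the default definition, the threshold works out to 2048 bytes
   for VEC_SIZE == 16 (SSE2), 4096 bytes for VEC_SIZE == 32 (AVX) and
   8192 bytes for VEC_SIZE == 64 (AVX-512).  */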

#ifndef PREFETCH
# define PREFETCH(addr) prefetcht0 addr
#endif

/* Assume 64-byte prefetch size.  */
#ifndef PREFETCH_SIZE
# define PREFETCH_SIZE 64
#endif

#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)

#if PREFETCH_SIZE == 64
# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base)
# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base)
# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
# else
#  error Unsupported PREFETCHED_LOAD_SIZE!
# endif
#else
# error Unsupported PREFETCH_SIZE!
#endif
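
/* For example, with VEC_SIZE == 32, PREFETCHED_LOAD_SIZE is 128 and
   PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2) expands,
   after evaluating the constants, to

	prefetcht0 256(%rsi); prefetcht0 320(%rsi)

   prefetching the cache lines the non-temporal loop will load two
   iterations later.  */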

#ifndef SECTION
# error SECTION is not defined!
#endif

	.section SECTION(.text),"ax",@progbits
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
#endif

#if VEC_SIZE == 16 || defined SHARED
ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start)
END (MEMPCPY_SYMBOL (__mempcpy, unaligned))
#endif

#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
#endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
	movq	%rdi, %rax
L(start):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(more_2x_vec)
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(last_2x_vec):
#endif
	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
	VZEROUPPER
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(nop):
#endif
	ret
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))

# if VEC_SIZE == 16
#  if defined SHARED
/* Only used to measure performance of REP MOVSB.  */
ENTRY (__mempcpy_erms)
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start_movsb)
END (__mempcpy_erms)
#  endif

ENTRY (__memmove_erms)
	movq	%rdi, %rax
L(start_movsb):
	mov	%RDX_LP, %RCX_LP
	cmp	%RSI_LP, %RDI_LP
	jb	1f
	/* Source == destination is less common.  */
	je	2f
	lea	(%rsi,%rcx), %RDX_LP
	cmp	%RDX_LP, %RDI_LP
	jb	L(movsb_backward)
1:
	rep movsb
2:
	ret
L(movsb_backward):
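	/* Point %rdi and %rsi at the last byte of each buffer and copy
	   backward with the direction flag set.  The ABI requires DF to
	   be clear on return, hence the cld after the copy.  */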
	leaq	-1(%rdi,%rcx), %rdi
	leaq	-1(%rsi,%rcx), %rsi
	std
	rep movsb
	cld
	ret
END (__memmove_erms)
#  if defined SHARED
strong_alias (__memmove_erms, __memcpy_erms)
#  endif
# endif

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))

ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start_erms)
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))

ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
	movq	%rdi, %rax
L(start_erms):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(movsb_more_2x_vec)
L(last_2x_vec):
	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
L(return):
	VZEROUPPER
	ret

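/* A dispatch sketch for L(movsb) below in C, with a hypothetical
   rep_movsb helper (illustrative only):

       if (size >= __x86_shared_non_temporal_threshold)
	 goto more_8x_vec;
       if (dst > src && dst < src + size)
	 goto more_8x_vec_backward;
       if (dst != src)
	 rep_movsb (dst, src, size);

   The second test rejects overlapping buffers that would require a
   slow backward REP MOVSB; the third skips the copy entirely when
   source == destination.  */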
L(movsb):
	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
	jae	L(more_8x_vec)
	cmpq	%rsi, %rdi
	jb	1f
	/* Source == destination is less common.  */
	je	L(nop)
	leaq	(%rsi,%rdx), %r9
	cmpq	%r9, %rdi
	/* Avoid slow backward REP MOVSB.  */
# if REP_MOVSB_THRESHOLD <= (VEC_SIZE * 8)
#  error Unsupported REP_MOVSB_THRESHOLD and VEC_SIZE!
# endif
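	/* Jumping to L(more_8x_vec_backward) is valid only when size is
	   above 8 * VEC_SIZE; that holds here because size is above
	   REP_MOVSB_THRESHOLD, which the #error above guarantees to be
	   larger.  */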
	jb	L(more_8x_vec_backward)
1:
	mov	%RDX_LP, %RCX_LP
	rep movsb
L(nop):
	ret
#endif

L(less_vec):
	/* Less than 1 VEC.  */
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
#endif
#if VEC_SIZE > 32
	cmpb	$32, %dl
	jae	L(between_32_63)
#endif
#if VEC_SIZE > 16
	cmpb	$16, %dl
	jae	L(between_16_31)
#endif
	cmpb	$8, %dl
	jae	L(between_8_15)
	cmpb	$4, %dl
	jae	L(between_4_7)
	cmpb	$1, %dl
	ja	L(between_2_3)
	jb	1f
	movzbl	(%rsi), %ecx
	movb	%cl, (%rdi)
1:
	ret
#if VEC_SIZE > 32
L(between_32_63):
	/* From 32 to 63.  No branch when size == 32.  */
	vmovdqu	(%rsi), %ymm0
	vmovdqu	-32(%rsi,%rdx), %ymm1
	vmovdqu	%ymm0, (%rdi)
	vmovdqu	%ymm1, -32(%rdi,%rdx)
	VZEROUPPER
	ret
#endif
#if VEC_SIZE > 16
	/* From 16 to 31.  No branch when size == 16.  */
L(between_16_31):
	vmovdqu	(%rsi), %xmm0
	vmovdqu	-16(%rsi,%rdx), %xmm1
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, -16(%rdi,%rdx)
	ret
#endif
L(between_8_15):
	/* From 8 to 15.  No branch when size == 8.  */
	movq	-8(%rsi,%rdx), %rcx
	movq	(%rsi), %rsi
	movq	%rcx, -8(%rdi,%rdx)
	movq	%rsi, (%rdi)
	ret
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
	movl	-4(%rsi,%rdx), %ecx
	movl	(%rsi), %esi
	movl	%ecx, -4(%rdi,%rdx)
	movl	%esi, (%rdi)
	ret
L(between_2_3):
	/* From 2 to 3.  No branch when size == 2.  */
	movzwl	-2(%rsi,%rdx), %ecx
	movzwl	(%rsi), %esi
	movw	%cx, -2(%rdi,%rdx)
	movw	%si, (%rdi)
	ret
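
/* Every case above uses the same two-ended pattern: load both ends
   before storing either, so overlapping buffers are handled.  For the
   8-to-15-byte case a rough C equivalent is (illustrative only):

       uint64_t head, tail;
       memcpy (&head, src, 8);
       memcpy (&tail, src + size - 8, 8);
       memcpy (dst + size - 8, &tail, 8);
       memcpy (dst, &head, 8);
*/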

#if defined USE_MULTIARCH && IS_IN (libc)
L(movsb_more_2x_vec):
	cmpq	$REP_MOVSB_THRESHOLD, %rdx
	ja	L(movsb)
#endif
L(more_2x_vec):
	/* More than 2 * VEC and there may be overlap between
	   destination and source.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)
	cmpq	$(VEC_SIZE * 4), %rdx
	jb	L(last_4x_vec)
	/* Copy from 4 * VEC to 8 * VEC, inclusively.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(4)
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
	VMOVU	%VEC(4), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
	VZEROUPPER
	ret
L(last_4x_vec):
	/* Copy from 2 * VEC to 4 * VEC.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
	VZEROUPPER
	ret

L(more_8x_vec):
	cmpq	%rsi, %rdi
	ja	L(more_8x_vec_backward)
	/* Source == destination is less common.  */
	je	L(nop)
	/* Load the first VEC and last 4 * VEC to support overlapping
	   addresses.  */
	VMOVU	(%rsi), %VEC(4)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
	/* Save start and stop of the destination buffer.  */
	movq	%rdi, %r11
	leaq	-VEC_SIZE(%rdi, %rdx), %rcx
	/* Align destination for aligned stores in the loop.  Compute
	   how much destination is misaligned.  */
	movq	%rdi, %r8
	andq	$(VEC_SIZE - 1), %r8
	/* Get the negative of the offset for alignment.  */
	subq	$VEC_SIZE, %r8
	/* Adjust source.  */
	subq	%r8, %rsi
	/* Adjust destination, which should be aligned now.  */
	subq	%r8, %rdi
	/* Adjust length.  */
	addq	%r8, %rdx
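	/* A rough C equivalent of the adjustment above (illustrative):

	       size_t misalign = (uintptr_t) dst & (VEC_SIZE - 1);
	       long delta = (long) misalign - VEC_SIZE;
	       src -= delta;
	       dst -= delta;
	       len += delta;

	   delta is in [-VEC_SIZE, -1], so src and dst advance by
	   VEC_SIZE - misalign, leaving dst VEC_SIZE-aligned, and the
	   length shrinks by the bytes skipped.  The skipped head bytes
	   are covered by the first VEC saved in %VEC(4) and stored
	   after the loop.  */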
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
	ja	L(large_forward)
#endif
L(loop_4x_vec_forward):
	/* Copy 4 * VEC at a time forward.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	addq	$(VEC_SIZE * 4), %rsi
	subq	$(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%rdi)
	VMOVA	%VEC(1), VEC_SIZE(%rdi)
	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
	addq	$(VEC_SIZE * 4), %rdi
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_forward)
	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (%rcx)
	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
	/* Store the first VEC.  */
	VMOVU	%VEC(4), (%r11)
	VZEROUPPER
	ret

L(more_8x_vec_backward):
	/* Load the first 4 * VEC and last VEC to support overlapping
	   addresses.  */
	VMOVU	(%rsi), %VEC(4)
	VMOVU	VEC_SIZE(%rsi), %VEC(5)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(8)
	/* Save stop of the destination buffer.  */
	leaq	-VEC_SIZE(%rdi, %rdx), %r11
	/* Align destination end for aligned stores in the loop.  Compute
	   how much destination end is misaligned.  */
	leaq	-VEC_SIZE(%rsi, %rdx), %rcx
	movq	%r11, %r9
	movq	%r11, %r8
	andq	$(VEC_SIZE - 1), %r8
	/* Adjust source.  */
	subq	%r8, %rcx
	/* Adjust the end of destination, which should be aligned now.  */
	subq	%r8, %r9
	/* Adjust length.  */
	subq	%r8, %rdx
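	/* Here misalign = dst_end & (VEC_SIZE - 1); the source and
	   destination end pointers and the length all move back by
	   misalign bytes, leaving %r9, the store pointer, aligned to
	   VEC_SIZE.  The tail bytes skipped are covered by the last
	   VEC saved in %VEC(8) and stored after the loop.  */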
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
	ja	L(large_backward)
#endif
L(loop_4x_vec_backward):
	/* Copy 4 * VEC at a time backward.  */
	VMOVU	(%rcx), %VEC(0)
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
	subq	$(VEC_SIZE * 4), %rcx
	subq	$(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%r9)
	VMOVA	%VEC(1), -VEC_SIZE(%r9)
	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
	subq	$(VEC_SIZE * 4), %r9
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_backward)
	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(4), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), (%r11)
	VZEROUPPER
	ret

#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
L(large_forward):
	/* Don't use non-temporal stores if there is overlap between
	   destination and source since destination may be in cache
	   when source is loaded.  */
	leaq	(%rdi, %rdx), %r10
	cmpq	%r10, %rsi
	jb	L(loop_4x_vec_forward)
L(loop_large_forward):
	/* Copy 4 * VEC at a time forward with non-temporal stores.  */
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	addq	$PREFETCHED_LOAD_SIZE, %rsi
	subq	$PREFETCHED_LOAD_SIZE, %rdx
	VMOVNT	%VEC(0), (%rdi)
	VMOVNT	%VEC(1), VEC_SIZE(%rdi)
	VMOVNT	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVNT	%VEC(3), (VEC_SIZE * 3)(%rdi)
	addq	$PREFETCHED_LOAD_SIZE, %rdi
	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
	ja	L(loop_large_forward)
	sfence
	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (%rcx)
	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
	/* Store the first VEC.  */
	VMOVU	%VEC(4), (%r11)
	VZEROUPPER
	ret

L(large_backward):
	/* Don't use non-temporal stores if there is overlap between
	   destination and source since destination may be in cache
	   when source is loaded.  */
	leaq	(%rcx, %rdx), %r10
	cmpq	%r10, %r9
	jb	L(loop_4x_vec_backward)
L(loop_large_backward):
	/* Copy 4 * VEC at a time backward with non-temporal stores.  */
	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
	VMOVU	(%rcx), %VEC(0)
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
	subq	$PREFETCHED_LOAD_SIZE, %rcx
	subq	$PREFETCHED_LOAD_SIZE, %rdx
	VMOVNT	%VEC(0), (%r9)
	VMOVNT	%VEC(1), -VEC_SIZE(%r9)
	VMOVNT	%VEC(2), -(VEC_SIZE * 2)(%r9)
	VMOVNT	%VEC(3), -(VEC_SIZE * 3)(%r9)
	subq	$PREFETCHED_LOAD_SIZE, %r9
	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
	ja	L(loop_large_backward)
	sfence
	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(4), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), (%r11)
	VZEROUPPER
	ret
#endif
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))

#ifdef SHARED
# if IS_IN (libc)
#  ifdef USE_MULTIARCH
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
#  endif
strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
	      MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
# endif
#endif
#if VEC_SIZE == 16 || defined SHARED
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
	      MEMCPY_SYMBOL (__memcpy, unaligned))
#endif