/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
   Copyright (C) 2016-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

/* memmove/memcpy/mempcpy is implemented as:
   1. Use overlapping load and store to avoid branch (see the
      illustrative sketch following this comment).
   2. Load all sources into registers and store them together to avoid
      possible address overlap between source and destination.
   3. If size is 8 * VEC_SIZE or less, load all sources into registers
      and store them together.
   4. If address of destination > address of source, backward copy
      4 * VEC_SIZE at a time with unaligned load and aligned store.
      Load the first 4 * VEC and last VEC before the loop and store
      them after the loop to support overlapping addresses.
   5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
      load and aligned store.  Load the last 4 * VEC and first VEC
      before the loop and store them after the loop to support
      overlapping addresses.
   6. If size >= __x86_shared_non_temporal_threshold and there is no
      overlap between destination and source, use non-temporal store
      instead of aligned store.  */
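
/* Illustrative C sketch of techniques 1 and 2 above (not part of the
   build; VEC, load_unaligned and store_unaligned are hypothetical
   stand-ins for the VEC_SIZE-byte vector type and the VMOVU loads and
   stores used below):

     void
     copy_vec_to_2vec (char *dst, const char *src, size_t n)
     {
       // n is in [VEC_SIZE, 2 * VEC_SIZE].  Both the head and the tail
       // are loaded before anything is stored, so the copy is correct
       // even if source and destination overlap; the two stores may
       // overlap each other, but they write identical bytes in the
       // overlapped region, so no branch on the exact size is needed.
       VEC head = load_unaligned (src);
       VEC tail = load_unaligned (src + n - VEC_SIZE);
       store_unaligned (dst, head);
       store_unaligned (dst + n - VEC_SIZE, tail);
     }  */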

#include <sysdep.h>

#ifndef MEMCPY_SYMBOL
# define MEMCPY_SYMBOL(p,s)		MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMPCPY_SYMBOL
# define MEMPCPY_SYMBOL(p,s)		MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMMOVE_CHK_SYMBOL
# define MEMMOVE_CHK_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER vzeroupper
# else
#  define VZEROUPPER
# endif
#endif

/* Threshold to use Enhanced REP MOVSB.  Since there is overhead to set
   up REP MOVSB operation, REP MOVSB isn't faster on short data.  The
   memcpy micro benchmark in glibc shows that 2KB is the approximate
   value above which REP MOVSB becomes faster than SSE2 optimization
   on processors with Enhanced REP MOVSB.  Since larger register size
   can move more data with a single load and store, the threshold is
   higher with larger register size.  */
#ifndef REP_MOVSB_THRESHOLD
# define REP_MOVSB_THRESHOLD	(2048 * (VEC_SIZE / 16))
#endif
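
/* With the formula above, the default threshold works out to 2048
   bytes for VEC_SIZE == 16, 4096 bytes for VEC_SIZE == 32 and 8192
   bytes for VEC_SIZE == 64 (typically the SSE2, AVX and AVX-512
   variants of this file, respectively), unless REP_MOVSB_THRESHOLD
   is defined before this file is included.  */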

#ifndef PREFETCH
# define PREFETCH(addr) prefetcht0 addr
#endif

/* Assume 64-byte prefetch size.  */
#ifndef PREFETCH_SIZE
# define PREFETCH_SIZE 64
#endif

#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)

#if PREFETCH_SIZE == 64
# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
  PREFETCH ((offset)base)
# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
  PREFETCH ((offset)base); \
  PREFETCH ((offset + dir * PREFETCH_SIZE)base)
# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
  PREFETCH ((offset)base); \
  PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
  PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
  PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
# else
#  error Unsupported PREFETCHED_LOAD_SIZE!
# endif
#else
# error Unsupported PREFETCH_SIZE!
#endif
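
/* For example, with VEC_SIZE == 32, PREFETCHED_LOAD_SIZE is 128,
   i.e. 2 * PREFETCH_SIZE, so

     PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)

   expands to the equivalent of

     prefetcht0 256(%rsi)
     prefetcht0 320(%rsi)

   touching both 64-byte cache lines of the 4 * VEC_SIZE block two
   iterations ahead of the copy loop; dir is +1 in the forward loop
   and -1 in the backward loop.  */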

#ifndef SECTION
# error SECTION is not defined!
#endif

	.section SECTION(.text),"ax",@progbits
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
#endif

ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start)
END (MEMPCPY_SYMBOL (__mempcpy, unaligned))

#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
#endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
	movq	%rdi, %rax
L(start):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(more_2x_vec)
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(last_2x_vec):
#endif
	/* From VEC_SIZE to 2 * VEC_SIZE bytes.  No branch when size == VEC_SIZE.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
	VZEROUPPER
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(nop):
#endif
	ret
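/* With VEC_SIZE == 32 and, say, size == 40, the two stores above write
   [0, 32) and [8, 40); the 24 overlapping bytes are written twice with
   identical data, which is what makes this path branch-free for every
   size in [VEC_SIZE, 2 * VEC_SIZE].  Both loads are issued before
   either store, so overlapping source/destination buffers are also
   handled correctly.  */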
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))

# if VEC_SIZE == 16
ENTRY (__mempcpy_chk_erms)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk_erms)

/* Only used to measure performance of REP MOVSB.  */
ENTRY (__mempcpy_erms)
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start_movsb)
END (__mempcpy_erms)

ENTRY (__memmove_chk_erms)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk_erms)

ENTRY (__memmove_erms)
	movq	%rdi, %rax
L(start_movsb):
	mov	%RDX_LP, %RCX_LP
	cmp	%RSI_LP, %RDI_LP
	jb	1f
	/* Source == destination is less common.  */
	je	2f
	lea	(%rsi,%rcx), %RDX_LP
	cmp	%RDX_LP, %RDI_LP
	jb	L(movsb_backward)
1:
	rep movsb
2:
	ret
L(movsb_backward):
	leaq	-1(%rdi,%rcx), %rdi
	leaq	-1(%rsi,%rcx), %rsi
	std
	rep movsb
	cld
	ret
END (__memmove_erms)
strong_alias (__memmove_erms, __memcpy_erms)
strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
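
/* Note on L(movsb_backward) above: with the direction flag set (std),
   rep movsb copies descending, so %rdi/%rsi are first pointed at the
   last byte of each buffer (-1(%reg,%rcx)).  This path is only taken
   when the destination starts strictly inside the source
   (%rsi < %rdi < %rsi + count), where a forward byte copy would
   overwrite source bytes before reading them.  cld restores the
   ABI-mandated clear direction flag before returning.  */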
# endif

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start_erms)
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
	movq	%rdi, %rax
L(start_erms):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(movsb_more_2x_vec)
L(last_2x_vec):
	/* From VEC_SIZE to 2 * VEC_SIZE bytes.  No branch when size == VEC_SIZE.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
L(return):
	VZEROUPPER
	ret

L(movsb):
	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
	jae	L(more_8x_vec)
	cmpq	%rsi, %rdi
	jb	1f
	/* Source == destination is less common.  */
	je	L(nop)
	leaq	(%rsi,%rdx), %r9
	cmpq	%r9, %rdi
	/* Avoid slow backward REP MOVSB.  */
# if REP_MOVSB_THRESHOLD <= (VEC_SIZE * 8)
#  error Unsupported REP_MOVSB_THRESHOLD and VEC_SIZE!
# endif
	jb	L(more_8x_vec_backward)
1:
	mov	%RDX_LP, %RCX_LP
	rep movsb
L(nop):
	ret
#endif
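
/* The L(movsb) dispatch above, roughly, in C (illustrative only;
   %rdi = dst, %rsi = src, %rdx = size, and rep_movsb stands for the
   rep movsb instruction):

     if (size >= __x86_shared_non_temporal_threshold)
       goto more_8x_vec;              // vector loop, maybe NT stores
     else if (dst > src && dst < src + size)
       goto more_8x_vec_backward;     // backward REP MOVSB is slow
     else if (dst != src)
       rep_movsb (dst, src, size);    // ERMS fast path
     // dst == src: nothing to copy.  */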

L(less_vec):
	/* Less than 1 VEC.  */
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
#endif
#if VEC_SIZE > 32
	cmpb	$32, %dl
	jae	L(between_32_63)
#endif
#if VEC_SIZE > 16
	cmpb	$16, %dl
	jae	L(between_16_31)
#endif
	cmpb	$8, %dl
	jae	L(between_8_15)
	cmpb	$4, %dl
	jae	L(between_4_7)
	cmpb	$1, %dl
	ja	L(between_2_3)
	jb	1f
	movzbl	(%rsi), %ecx
	movb	%cl, (%rdi)
1:
	ret
#if VEC_SIZE > 32
L(between_32_63):
	/* From 32 to 63.  No branch when size == 32.  */
	vmovdqu	(%rsi), %ymm0
	vmovdqu	-32(%rsi,%rdx), %ymm1
	vmovdqu	%ymm0, (%rdi)
	vmovdqu	%ymm1, -32(%rdi,%rdx)
	VZEROUPPER
	ret
#endif
#if VEC_SIZE > 16
	/* From 16 to 31.  No branch when size == 16.  */
L(between_16_31):
	vmovdqu	(%rsi), %xmm0
	vmovdqu	-16(%rsi,%rdx), %xmm1
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, -16(%rdi,%rdx)
	ret
#endif
L(between_8_15):
	/* From 8 to 15.  No branch when size == 8.  */
	movq	-8(%rsi,%rdx), %rcx
	movq	(%rsi), %rsi
	movq	%rcx, -8(%rdi,%rdx)
	movq	%rsi, (%rdi)
	ret
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
	movl	-4(%rsi,%rdx), %ecx
	movl	(%rsi), %esi
	movl	%ecx, -4(%rdi,%rdx)
	movl	%esi, (%rdi)
	ret
L(between_2_3):
	/* From 2 to 3.  No branch when size == 2.  */
	movzwl	-2(%rsi,%rdx), %ecx
	movzwl	(%rsi), %esi
	movw	%cx, -2(%rdi,%rdx)
	movw	%si, (%rdi)
	ret
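/* Each small size class above is the same head+tail trick scaled down;
   e.g. the 8-15 byte case in C (illustrative sketch, uint64_t/memcpy
   from <stdint.h>/<string.h>):

     // n in [8, 16): load the first and last 8 bytes, then store the
     // tail and the head; for n < 16 the two ranges overlap and the
     // overlapped bytes are written twice with identical data.
     uint64_t head, tail;
     memcpy (&head, src, 8);
     memcpy (&tail, src + n - 8, 8);
     memcpy (dst + n - 8, &tail, 8);
     memcpy (dst, &head, 8);

   Sizes 0 and 1 are handled by the branch around the single
   movzbl/movb pair above.  */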

#if defined USE_MULTIARCH && IS_IN (libc)
L(movsb_more_2x_vec):
	cmpq	$REP_MOVSB_THRESHOLD, %rdx
	ja	L(movsb)
#endif
L(more_2x_vec):
	/* More than 2 * VEC and there may be overlap between destination
	   and source.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)
	cmpq	$(VEC_SIZE * 4), %rdx
	jb	L(last_4x_vec)
	/* Copy from 4 * VEC to 8 * VEC, inclusive.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(4)
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
	VMOVU	%VEC(4), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
	VZEROUPPER
	ret
L(last_4x_vec):
	/* Copy from 2 * VEC to 4 * VEC.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
	VZEROUPPER
	ret
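/* Both blocks above generalize the head+tail trick: the first 2 (or 4)
   vectors are taken from the start of the buffer and the last 2 (or 4)
   from its end, and every load is issued before any store, so overlap
   between source and destination in either direction is handled
   without branching on the exact size.  */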

L(more_8x_vec):
	cmpq	%rsi, %rdi
	ja	L(more_8x_vec_backward)
	/* Source == destination is less common.  */
	je	L(nop)
	/* Load the first VEC and last 4 * VEC to support overlapping
	   addresses.  */
	VMOVU	(%rsi), %VEC(4)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
	/* Save start and stop of the destination buffer.  */
	movq	%rdi, %r11
	leaq	-VEC_SIZE(%rdi, %rdx), %rcx
	/* Align destination for aligned stores in the loop.  Compute
	   how much destination is misaligned.  */
	movq	%rdi, %r8
	andq	$(VEC_SIZE - 1), %r8
	/* Get the negative of offset for alignment.  */
	subq	$VEC_SIZE, %r8
	/* Adjust source.  */
	subq	%r8, %rsi
	/* Adjust destination which should be aligned now.  */
	subq	%r8, %rdi
	/* Adjust length.  */
	addq	%r8, %rdx
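	/* Worked example of the adjustment above with VEC_SIZE == 32:
	   if %rdi ends in ...0x05, then %r8 = 5 - 32 = -27, so the two
	   subtractions advance %rsi and %rdi by 27 bytes (%rdi is now
	   32-byte aligned) and the addq shrinks %rdx by 27.  The 27
	   skipped head bytes are not lost: they were loaded into
	   %VEC(4) above and are stored to the original destination
	   start (%r11) after the loop.  */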
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
	ja	L(large_forward)
#endif
L(loop_4x_vec_forward):
	/* Copy 4 * VEC at a time forward.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	addq	$(VEC_SIZE * 4), %rsi
	subq	$(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%rdi)
	VMOVA	%VEC(1), VEC_SIZE(%rdi)
	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
	addq	$(VEC_SIZE * 4), %rdi
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_forward)
	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (%rcx)
	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
	/* Store the first VEC.  */
	VMOVU	%VEC(4), (%r11)
	VZEROUPPER
	ret

L(more_8x_vec_backward):
	/* Load the first 4 * VEC and last VEC to support overlapping
	   addresses.  */
	VMOVU	(%rsi), %VEC(4)
	VMOVU	VEC_SIZE(%rsi), %VEC(5)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(8)
	/* Save stop of the destination buffer.  */
	leaq	-VEC_SIZE(%rdi, %rdx), %r11
	/* Align destination end for aligned stores in the loop.  Compute
	   how much destination end is misaligned.  */
	leaq	-VEC_SIZE(%rsi, %rdx), %rcx
	movq	%r11, %r9
	movq	%r11, %r8
	andq	$(VEC_SIZE - 1), %r8
	/* Adjust source.  */
	subq	%r8, %rcx
	/* Adjust the end of destination which should be aligned now.  */
	subq	%r8, %r9
	/* Adjust length.  */
	subq	%r8, %rdx
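	/* Here the adjustment is by the misalignment of the *last*
	   destination vector: %r9 (the aligned store position) and
	   %rcx (the matching source position) move down by %r8 bytes
	   and the length shrinks by the same amount.  The tail bytes
	   skipped this way are covered by %VEC(8), stored to the saved
	   end pointer %r11 after the loop, and the head by
	   %VEC(4)-%VEC(7), stored to %rdi.  */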
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
	ja	L(large_backward)
#endif
L(loop_4x_vec_backward):
	/* Copy 4 * VEC at a time backward.  */
	VMOVU	(%rcx), %VEC(0)
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
	subq	$(VEC_SIZE * 4), %rcx
	subq	$(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%r9)
	VMOVA	%VEC(1), -VEC_SIZE(%r9)
	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
	subq	$(VEC_SIZE * 4), %r9
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_backward)
	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(4), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), (%r11)
	VZEROUPPER
	ret

#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
L(large_forward):
	/* Don't use non-temporal store if there is overlap between
	   destination and source since destination may be in cache
	   when source is loaded.  */
	leaq	(%rdi, %rdx), %r10
	cmpq	%r10, %rsi
	jb	L(loop_4x_vec_forward)
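	/* In C terms (illustrative): if (src < dst + len) the regions
	   may still overlap, so fall back to the regular aligned-store
	   loop above; the cache-bypassing stores are used only when
	   the source lies entirely above the destination range.  */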
L(loop_large_forward):
	/* Copy 4 * VEC at a time forward with non-temporal stores.  */
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	addq	$PREFETCHED_LOAD_SIZE, %rsi
	subq	$PREFETCHED_LOAD_SIZE, %rdx
	VMOVNT	%VEC(0), (%rdi)
	VMOVNT	%VEC(1), VEC_SIZE(%rdi)
	VMOVNT	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVNT	%VEC(3), (VEC_SIZE * 3)(%rdi)
	addq	$PREFETCHED_LOAD_SIZE, %rdi
	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
	ja	L(loop_large_forward)
	sfence
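	/* The VMOVNT stores above are weakly ordered, so the sfence is
	   needed to make them globally visible before the ordinary
	   stores of the saved head/tail vectors below and before the
	   function returns.  */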
	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (%rcx)
	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
	/* Store the first VEC.  */
	VMOVU	%VEC(4), (%r11)
	VZEROUPPER
	ret

L(large_backward):
	/* Don't use non-temporal store if there is overlap between
	   destination and source since destination may be in cache
	   when source is loaded.  */
	leaq	(%rcx, %rdx), %r10
	cmpq	%r10, %r9
	jb	L(loop_4x_vec_backward)
L(loop_large_backward):
	/* Copy 4 * VEC at a time backward with non-temporal stores.  */
	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
	VMOVU	(%rcx), %VEC(0)
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
	subq	$PREFETCHED_LOAD_SIZE, %rcx
	subq	$PREFETCHED_LOAD_SIZE, %rdx
	VMOVNT	%VEC(0), (%r9)
	VMOVNT	%VEC(1), -VEC_SIZE(%r9)
	VMOVNT	%VEC(2), -(VEC_SIZE * 2)(%r9)
	VMOVNT	%VEC(3), -(VEC_SIZE * 3)(%r9)
	subq	$PREFETCHED_LOAD_SIZE, %r9
	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
	ja	L(loop_large_backward)
	sfence
	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(4), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), (%r11)
	VZEROUPPER
	ret
#endif
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))

#if IS_IN (libc)
# ifdef USE_MULTIARCH
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
# ifdef SHARED
strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
# endif
# endif
# ifdef SHARED
strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
	      MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
# endif
#endif
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
	      MEMCPY_SYMBOL (__memcpy, unaligned))
569 | |