/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
   Copyright (C) 2016-2020 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* memmove/memcpy/mempcpy is implemented as:
   1. Use overlapping loads and stores to avoid a branch.
   2. Load all source data into registers and store it together to
      avoid possible address overlap between source and destination.
   3. If size is 8 * VEC_SIZE or less, load all source data into
      registers and store it together.
   4. If address of destination > address of source, backward copy
      4 * VEC_SIZE at a time with unaligned load and aligned store.
      Load the first 4 * VEC and last VEC before the loop and store
      them after the loop to support overlapping addresses.
   5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
      load and aligned store.  Load the last 4 * VEC and first VEC
      before the loop and store them after the loop to support
      overlapping addresses.
   6. If size >= __x86_shared_non_temporal_threshold and there is no
      overlap between destination and source, use non-temporal store
      instead of aligned store.  */
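
/* As a rough C-level sketch of the size dispatch below (illustrative
   only, not part of the build; the helper names are hypothetical):

     if (n < VEC_SIZE)
       copy_less_vec (dst, src, n);      // byte/word/dword/qword/xmm/ymm
     else if (n <= 2 * VEC_SIZE)
       copy_head_tail (dst, src, n, 1);  // 2 overlapping VECs
     else if (n < 4 * VEC_SIZE)
       copy_head_tail (dst, src, n, 2);  // 4 overlapping VECs
     else if (n <= 8 * VEC_SIZE)
       copy_head_tail (dst, src, n, 4);  // 8 overlapping VECs
     else if (dst > src)                 // may overlap the source tail
       loop_backward (dst, src, n);      // 4 * VEC_SIZE per iteration
     else if (dst < src)
       loop_forward (dst, src, n);       // 4 * VEC_SIZE per iteration
     // else dst == src: nothing to copy.

   Both loops switch to non-temporal stores for sizes at or above
   __x86_shared_non_temporal_threshold when source and destination do
   not overlap, and the _erms variants use REP MOVSB instead of the
   loops for sizes between REP_MOVSB_THRESHOLD (defined below) and that
   threshold, except for overlapping backward copies.  */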

#include <sysdep.h>

#ifndef MEMCPY_SYMBOL
# define MEMCPY_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMPCPY_SYMBOL
# define MEMPCPY_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMMOVE_CHK_SYMBOL
# define MEMMOVE_CHK_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER vzeroupper
# else
#  define VZEROUPPER
# endif
#endif

/* Threshold to use Enhanced REP MOVSB.  Since there is overhead to set
   up REP MOVSB operation, REP MOVSB isn't faster on short data.  The
   memcpy micro benchmark in glibc shows that 2KB is the approximate
   value above which REP MOVSB becomes faster than SSE2 optimization
   on processors with Enhanced REP MOVSB.  Since a larger register size
   can move more data with a single load and store, the threshold is
   higher with a larger register size.  */
#ifndef REP_MOVSB_THRESHOLD
# define REP_MOVSB_THRESHOLD	(2048 * (VEC_SIZE / 16))
#endif
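/* With the default definition this works out to 2048 bytes for
   VEC_SIZE == 16 (SSE2), 4096 bytes for VEC_SIZE == 32 (AVX) and
   8192 bytes for VEC_SIZE == 64 (AVX-512).  */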

#ifndef PREFETCH
# define PREFETCH(addr) prefetcht0 addr
#endif

/* Assume 64-byte prefetch size.  */
#ifndef PREFETCH_SIZE
# define PREFETCH_SIZE 64
#endif

#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)

#if PREFETCH_SIZE == 64
# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base)
# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base)
# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
# else
#  error Unsupported PREFETCHED_LOAD_SIZE!
# endif
#else
# error Unsupported PREFETCH_SIZE!
#endif
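/* For example, with VEC_SIZE == 32, PREFETCHED_LOAD_SIZE is 128 and
   PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2) effectively
   expands to

	prefetcht0 256(%rsi); prefetcht0 320(%rsi)

   i.e. two prefetches covering one 128-byte load set starting 256
   bytes ahead of the current source position; dir == -1 prefetches
   the mirrored set behind it for the backward loop.  */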

#ifndef SECTION
# error SECTION is not defined!
#endif

	.section SECTION(.text),"ax",@progbits
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
#endif

ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start)
END (MEMPCPY_SYMBOL (__mempcpy, unaligned))

#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
#endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
	movq	%rdi, %rax
L(start):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(more_2x_vec)
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(last_2x_vec):
#endif
	/* Copy from VEC_SIZE up to 2 * VEC_SIZE bytes.  No branch when
	   size == VEC_SIZE.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
	VZEROUPPER
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(nop):
#endif
	ret
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))

# if VEC_SIZE == 16
ENTRY (__mempcpy_chk_erms)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk_erms)

/* Only used to measure performance of REP MOVSB.  */
ENTRY (__mempcpy_erms)
	mov	%RDI_LP, %RAX_LP
	/* Skip zero length.  */
	test	%RDX_LP, %RDX_LP
	jz	2f
	add	%RDX_LP, %RAX_LP
	jmp	L(start_movsb)
END (__mempcpy_erms)

ENTRY (__memmove_chk_erms)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk_erms)

ENTRY (__memmove_erms)
	movq	%rdi, %rax
	/* Skip zero length.  */
	test	%RDX_LP, %RDX_LP
	jz	2f
L(start_movsb):
	mov	%RDX_LP, %RCX_LP
	cmp	%RSI_LP, %RDI_LP
	jb	1f
	/* Source == destination is less common.  */
	je	2f
	lea	(%rsi,%rcx), %RDX_LP
	cmp	%RDX_LP, %RDI_LP
	jb	L(movsb_backward)
1:
	rep movsb
2:
	ret
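	/* The destination overlaps the tail of the source, so copy byte
	   by byte from the last byte downward with the direction flag
	   set.  DF must be cleared again before returning, since the
	   ABI requires DF == 0 at function entry and exit.  */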
L(movsb_backward):
	leaq	-1(%rdi,%rcx), %rdi
	leaq	-1(%rsi,%rcx), %rsi
	std
	rep movsb
	cld
	ret
END (__memmove_erms)
strong_alias (__memmove_erms, __memcpy_erms)
strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
# endif

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start_erms)
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
	movq	%rdi, %rax
L(start_erms):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(movsb_more_2x_vec)
L(last_2x_vec):
	/* Copy from VEC_SIZE up to 2 * VEC_SIZE bytes.  No branch when
	   size == VEC_SIZE.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
L(return):
	VZEROUPPER
	ret

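/* Reached from L(movsb_more_2x_vec) when size is above
   REP_MOVSB_THRESHOLD.  Fall back to the 8x VEC code when the copy is
   large enough for non-temporal stores or would require a slow
   backward REP MOVSB because the destination overlaps the end of the
   source.  */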
L(movsb):
	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
	jae	L(more_8x_vec)
	cmpq	%rsi, %rdi
	jb	1f
	/* Source == destination is less common.  */
	je	L(nop)
	leaq	(%rsi,%rdx), %r9
	cmpq	%r9, %rdi
	/* Avoid slow backward REP MOVSB.  */
# if REP_MOVSB_THRESHOLD <= (VEC_SIZE * 8)
#  error Unsupported REP_MOVSB_THRESHOLD and VEC_SIZE!
# endif
	jb	L(more_8x_vec_backward)
1:
	mov	%RDX_LP, %RCX_LP
	rep movsb
L(nop):
	ret
#endif

L(less_vec):
	/* Less than 1 VEC.  */
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
#endif
#if VEC_SIZE > 32
	cmpb	$32, %dl
	jae	L(between_32_63)
#endif
#if VEC_SIZE > 16
	cmpb	$16, %dl
	jae	L(between_16_31)
#endif
	cmpb	$8, %dl
	jae	L(between_8_15)
	cmpb	$4, %dl
	jae	L(between_4_7)
	cmpb	$1, %dl
	ja	L(between_2_3)
	jb	1f
	movzbl	(%rsi), %ecx
	movb	%cl, (%rdi)
1:
	ret
#if VEC_SIZE > 32
L(between_32_63):
	/* From 32 to 63.  No branch when size == 32.  */
	vmovdqu	(%rsi), %ymm0
	vmovdqu	-32(%rsi,%rdx), %ymm1
	vmovdqu	%ymm0, (%rdi)
	vmovdqu	%ymm1, -32(%rdi,%rdx)
	VZEROUPPER
	ret
#endif
#if VEC_SIZE > 16
	/* From 16 to 31.  No branch when size == 16.  */
L(between_16_31):
	vmovdqu	(%rsi), %xmm0
	vmovdqu	-16(%rsi,%rdx), %xmm1
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, -16(%rdi,%rdx)
	ret
#endif
L(between_8_15):
	/* From 8 to 15.  No branch when size == 8.  */
	movq	-8(%rsi,%rdx), %rcx
	movq	(%rsi), %rsi
	movq	%rcx, -8(%rdi,%rdx)
	movq	%rsi, (%rdi)
	ret
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
	movl	-4(%rsi,%rdx), %ecx
	movl	(%rsi), %esi
	movl	%ecx, -4(%rdi,%rdx)
	movl	%esi, (%rdi)
	ret
L(between_2_3):
	/* From 2 to 3.  No branch when size == 2.  */
	movzwl	-2(%rsi,%rdx), %ecx
	movzwl	(%rsi), %esi
	movw	%cx, -2(%rdi,%rdx)
	movw	%si, (%rdi)
	ret

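/* A rough C equivalent of L(between_8_15) above (an illustrative
   sketch only, not part of the build): when 8 <= n <= 15 the two
   8-byte accesses overlap, so every byte is covered without a branch,
   and loading both values before storing either keeps the copy safe
   for overlapping buffers.

     #include <stdint.h>
     #include <string.h>

     static void
     copy_8_15 (void *dst, const void *src, size_t n)
     {
       uint64_t head, tail;
       memcpy (&head, src, 8);                         // first 8 bytes
       memcpy (&tail, (const char *) src + n - 8, 8);  // last 8 bytes
       memcpy ((char *) dst + n - 8, &tail, 8);
       memcpy (dst, &head, 8);
     }

   The other L(between_*) branches use the same pattern with wider
   registers.  */
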
#if defined USE_MULTIARCH && IS_IN (libc)
L(movsb_more_2x_vec):
	cmpq	$REP_MOVSB_THRESHOLD, %rdx
	ja	L(movsb)
#endif
L(more_2x_vec):
	/* More than 2 * VEC and there may be overlap between destination
	   and source.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)
	cmpq	$(VEC_SIZE * 4), %rdx
	jb	L(last_4x_vec)
	/* Copy from 4 * VEC to 8 * VEC, inclusively.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(4)
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
	VMOVU	%VEC(4), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
	VZEROUPPER
	ret
L(last_4x_vec):
	/* Copy from 2 * VEC to 4 * VEC.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
	VZEROUPPER
	ret

L(more_8x_vec):
	cmpq	%rsi, %rdi
	ja	L(more_8x_vec_backward)
	/* Source == destination is less common.  */
	je	L(nop)
	/* Load the first VEC and last 4 * VEC to support overlapping
	   addresses.  */
	VMOVU	(%rsi), %VEC(4)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
	/* Save start and stop of the destination buffer.  */
	movq	%rdi, %r11
	leaq	-VEC_SIZE(%rdi, %rdx), %rcx
	/* Align destination for aligned stores in the loop.  Compute
	   how much destination is misaligned.  */
	movq	%rdi, %r8
	andq	$(VEC_SIZE - 1), %r8
	/* Get the negative of offset for alignment.  */
	subq	$VEC_SIZE, %r8
	/* Adjust source.  */
	subq	%r8, %rsi
	/* Adjust destination which should be aligned now.  */
	subq	%r8, %rdi
	/* Adjust length.  */
	addq	%r8, %rdx
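	/* %r8 now holds (original dst % VEC_SIZE) - VEC_SIZE, so the
	   adjustments above advance src and dst by 1 to VEC_SIZE bytes
	   and shrink the length by the same amount; the skipped leading
	   bytes are covered by the unaligned store of %VEC(4) to %r11
	   after the loop.  */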
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
	ja	L(large_forward)
#endif
L(loop_4x_vec_forward):
	/* Copy 4 * VEC at a time forward.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	addq	$(VEC_SIZE * 4), %rsi
	subq	$(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%rdi)
	VMOVA	%VEC(1), VEC_SIZE(%rdi)
	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
	addq	$(VEC_SIZE * 4), %rdi
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_forward)
	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (%rcx)
	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
	/* Store the first VEC.  */
	VMOVU	%VEC(4), (%r11)
	VZEROUPPER
	ret
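
/* A rough C sketch of the forward path above using AVX2 intrinsics
   (illustrative only; assumes VEC_SIZE == 32 and dst < src, the names
   are hypothetical and this is not part of the build):

     #include <immintrin.h>
     #include <stdint.h>
     #include <stddef.h>

     static void
     move_forward_8x_vec (char *dst, const char *src, size_t n)
     {
       // Head VEC and last 4 VEC, loaded up front so they can be
       // stored after the loop even if the buffers overlap.
       __m256i head = _mm256_loadu_si256 ((const __m256i *) src);
       __m256i t0 = _mm256_loadu_si256 ((const __m256i *) (src + n - 32));
       __m256i t1 = _mm256_loadu_si256 ((const __m256i *) (src + n - 64));
       __m256i t2 = _mm256_loadu_si256 ((const __m256i *) (src + n - 96));
       __m256i t3 = _mm256_loadu_si256 ((const __m256i *) (src + n - 128));
       char *dst_start = dst, *dst_last = dst + n - 32;
       // Align dst; the skipped head bytes are rewritten by 'head'.
       size_t skip = 32 - ((uintptr_t) dst & 31);
       src += skip; dst += skip; n -= skip;
       do
         {
           __m256i v0 = _mm256_loadu_si256 ((const __m256i *) src);
           __m256i v1 = _mm256_loadu_si256 ((const __m256i *) (src + 32));
           __m256i v2 = _mm256_loadu_si256 ((const __m256i *) (src + 64));
           __m256i v3 = _mm256_loadu_si256 ((const __m256i *) (src + 96));
           src += 128; n -= 128;
           _mm256_store_si256 ((__m256i *) dst, v0);
           _mm256_store_si256 ((__m256i *) (dst + 32), v1);
           _mm256_store_si256 ((__m256i *) (dst + 64), v2);
           _mm256_store_si256 ((__m256i *) (dst + 96), v3);
           dst += 128;
         }
       while (n > 128);
       _mm256_storeu_si256 ((__m256i *) dst_last, t0);
       _mm256_storeu_si256 ((__m256i *) (dst_last - 32), t1);
       _mm256_storeu_si256 ((__m256i *) (dst_last - 64), t2);
       _mm256_storeu_si256 ((__m256i *) (dst_last - 96), t3);
       _mm256_storeu_si256 ((__m256i *) dst_start, head);
     }

   The backward path below mirrors this with the roles of the head and
   tail vectors swapped, and L(large_forward)/L(large_backward) replace
   the aligned stores with non-temporal ones.  */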

L(more_8x_vec_backward):
	/* Load the first 4 * VEC and last VEC to support overlapping
	   addresses.  */
	VMOVU	(%rsi), %VEC(4)
	VMOVU	VEC_SIZE(%rsi), %VEC(5)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(8)
	/* Save stop of the destination buffer.  */
	leaq	-VEC_SIZE(%rdi, %rdx), %r11
	/* Align destination end for aligned stores in the loop.  Compute
	   how much destination end is misaligned.  */
	leaq	-VEC_SIZE(%rsi, %rdx), %rcx
	movq	%r11, %r9
	movq	%r11, %r8
	andq	$(VEC_SIZE - 1), %r8
	/* Adjust source.  */
	subq	%r8, %rcx
	/* Adjust the end of destination which should be aligned now.  */
	subq	%r8, %r9
	/* Adjust length.  */
	subq	%r8, %rdx
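	/* %r8 is the misalignment of the last destination VEC; the final
	   %r8 bytes skipped by the aligned loop are covered by the
	   unaligned store of %VEC(8) to %r11 after the loop.  */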
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
	ja	L(large_backward)
#endif
L(loop_4x_vec_backward):
	/* Copy 4 * VEC at a time backward.  */
	VMOVU	(%rcx), %VEC(0)
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
	subq	$(VEC_SIZE * 4), %rcx
	subq	$(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%r9)
	VMOVA	%VEC(1), -VEC_SIZE(%r9)
	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
	subq	$(VEC_SIZE * 4), %r9
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_backward)
	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(4), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), (%r11)
	VZEROUPPER
	ret

#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
L(large_forward):
	/* Don't use non-temporal store if there is overlap between
	   destination and source since destination may be in cache
	   when source is loaded.  */
	leaq	(%rdi, %rdx), %r10
	cmpq	%r10, %rsi
	jb	L(loop_4x_vec_forward)
L(loop_large_forward):
	/* Copy 4 * VEC at a time forward with non-temporal stores.  */
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	addq	$PREFETCHED_LOAD_SIZE, %rsi
	subq	$PREFETCHED_LOAD_SIZE, %rdx
	VMOVNT	%VEC(0), (%rdi)
	VMOVNT	%VEC(1), VEC_SIZE(%rdi)
	VMOVNT	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVNT	%VEC(3), (VEC_SIZE * 3)(%rdi)
	addq	$PREFETCHED_LOAD_SIZE, %rdi
	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
	ja	L(loop_large_forward)
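	/* The weakly ordered non-temporal stores above must become
	   globally visible before the ordinary stores below, which may
	   touch the same cache lines.  */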
	sfence
	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (%rcx)
	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
	/* Store the first VEC.  */
	VMOVU	%VEC(4), (%r11)
	VZEROUPPER
	ret

L(large_backward):
	/* Don't use non-temporal store if there is overlap between
	   destination and source since destination may be in cache
	   when source is loaded.  */
	leaq	(%rcx, %rdx), %r10
	cmpq	%r10, %r9
	jb	L(loop_4x_vec_backward)
L(loop_large_backward):
	/* Copy 4 * VEC at a time backward with non-temporal stores.  */
	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
	VMOVU	(%rcx), %VEC(0)
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
	subq	$PREFETCHED_LOAD_SIZE, %rcx
	subq	$PREFETCHED_LOAD_SIZE, %rdx
	VMOVNT	%VEC(0), (%r9)
	VMOVNT	%VEC(1), -VEC_SIZE(%r9)
	VMOVNT	%VEC(2), -(VEC_SIZE * 2)(%r9)
	VMOVNT	%VEC(3), -(VEC_SIZE * 3)(%r9)
	subq	$PREFETCHED_LOAD_SIZE, %r9
	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
	ja	L(loop_large_backward)
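	/* Likewise, order the non-temporal stores before the ordinary
	   stores below.  */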
	sfence
	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(4), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), (%r11)
	VZEROUPPER
	ret
#endif
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))

#if IS_IN (libc)
# ifdef USE_MULTIARCH
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
#  ifdef SHARED
strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
#  endif
# endif
# ifdef SHARED
strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
	      MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
# endif
#endif
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
	      MEMCPY_SYMBOL (__memcpy, unaligned))