/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
   Copyright (C) 2016 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

/* memmove/memcpy/mempcpy is implemented as:
   1. Use overlapping load and store to avoid branch.
   2. Load all sources into registers and store them together to avoid
      possible address overlap between source and destination.
   3. If size is 8 * VEC_SIZE or less, load all sources into registers
      and store them together.
   4. If address of destination > address of source, backward copy
      4 * VEC_SIZE at a time with unaligned load and aligned store.
      Load the first 4 * VEC and last VEC before the loop and store
      them after the loop to support overlapping addresses.
   5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
      load and aligned store.  Load the last 4 * VEC and first VEC
      before the loop and store them after the loop to support
      overlapping addresses.
   6. If size >= __x86_shared_non_temporal_threshold and there is no
      overlap between destination and source, use non-temporal store
      instead of aligned store.  */
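
/* Illustration only (not used by this file): the overlapping load/store
   technique of steps 1-3 corresponds roughly to the following C sketch
   for sizes between VEC_SIZE and 2 * VEC_SIZE.  "vec_t" and the function
   name are hypothetical; memmove semantics hold because both loads
   happen before either store:

     #include <string.h>

     typedef struct { unsigned char b[VEC_SIZE]; } vec_t;

     static void
     copy_vec_to_2x_vec (unsigned char *dst, const unsigned char *src,
                         size_t size)
     {
       vec_t head, tail;
       memcpy (&head, src, sizeof head);                      // first VEC
       memcpy (&tail, src + size - sizeof tail, sizeof tail); // last VEC
       memcpy (dst, &head, sizeof head);
       memcpy (dst + size - sizeof tail, &tail, sizeof tail);
     }
*/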

#include <sysdep.h>

#ifndef MEMCPY_SYMBOL
# define MEMCPY_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMPCPY_SYMBOL
# define MEMPCPY_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMMOVE_CHK_SYMBOL
# define MEMMOVE_CHK_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER vzeroupper
# else
#  define VZEROUPPER
# endif
#endif

/* Threshold to use Enhanced REP MOVSB.  Since there is overhead to set
   up REP MOVSB operation, REP MOVSB isn't faster on short data.  The
   memcpy micro benchmark in glibc shows that 2KB is the approximate
   value above which REP MOVSB becomes faster than SSE2 optimization
   on processors with Enhanced REP MOVSB.  Since larger register size
   can move more data with a single load and store, the threshold is
   higher with larger register size.  */
#ifndef REP_MOVSB_THRESHOLD
# define REP_MOVSB_THRESHOLD	(2048 * (VEC_SIZE / 16))
#endif
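
/* Under the default definition above (REP_MOVSB_THRESHOLD may be
   overridden when this file is included), the threshold works out to
   2048 bytes for VEC_SIZE == 16 (SSE2), 4096 bytes for VEC_SIZE == 32
   (AVX) and 8192 bytes for VEC_SIZE == 64 (AVX-512), e.g.
   2048 * (32 / 16) == 4096.  */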

#ifndef PREFETCH
# define PREFETCH(addr) prefetcht0 addr
#endif

/* Assume 64-byte prefetch size.  */
#ifndef PREFETCH_SIZE
# define PREFETCH_SIZE 64
#endif

#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)

#if PREFETCH_SIZE == 64
# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
  PREFETCH ((offset)base)
# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
  PREFETCH ((offset)base); \
  PREFETCH ((offset + dir * PREFETCH_SIZE)base)
# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
  PREFETCH ((offset)base); \
  PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
  PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
  PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
# else
#  error Unsupported PREFETCHED_LOAD_SIZE!
# endif
#else
# error Unsupported PREFETCH_SIZE!
#endif
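
/* For example, with the defaults above and VEC_SIZE == 32 the
   prefetched load size is 128 bytes, so
     PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
   expands to two prefetcht0 instructions covering the two 64-byte
   cache lines at 256(%rsi) and 320(%rsi), i.e. the block the forward
   loop will read two iterations later.  */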

#ifndef SECTION
# error SECTION is not defined!
#endif

	.section SECTION(.text),"ax",@progbits
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
#endif

#if VEC_SIZE == 16 || defined SHARED
ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start)
END (MEMPCPY_SYMBOL (__mempcpy, unaligned))
#endif

#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
#endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
	movq	%rdi, %rax
L(start):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(more_2x_vec)
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(last_2x_vec):
#endif
	/* From VEC up to 2 * VEC.  No branch when size == VEC_SIZE.  */
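	/* Worked example (assuming VEC_SIZE == 16): for size == 24 the
	   two loads read source bytes [0,16) and [8,24), so the two
	   stores cover all 24 destination bytes; the 8-byte overlap is
	   simply written twice, removing any branch on size.  */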
	VMOVU	(%rsi), %VEC(0)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
	VZEROUPPER
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(nop):
#endif
	ret
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))

# if VEC_SIZE == 16
#  if defined SHARED
/* Only used to measure performance of REP MOVSB.  */
ENTRY (__mempcpy_erms)
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start_movsb)
END (__mempcpy_erms)
#  endif

ENTRY (__memmove_erms)
	movq	%rdi, %rax
L(start_movsb):
	mov	%RDX_LP, %RCX_LP
	cmp	%RSI_LP, %RDI_LP
	jb	1f
	/* Source == destination is less common.  */
	je	2f
	lea	(%rsi,%rcx), %RDX_LP
	cmp	%RDX_LP, %RDI_LP
	jb	L(movsb_backward)
1:
	rep movsb
2:
	ret
L(movsb_backward):
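	/* Backward REP MOVSB: point RSI/RDI at the last byte of each
	   buffer (hence the -1 displacements), set the direction flag
	   so that MOVSB walks downward, then clear it again as the
	   ABI requires.  */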
	leaq	-1(%rdi,%rcx), %rdi
	leaq	-1(%rsi,%rcx), %rsi
	std
	rep movsb
	cld
	ret
END (__memmove_erms)
#  if defined SHARED
strong_alias (__memmove_erms, __memcpy_erms)
#  endif
# endif

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))

ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start_erms)
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))

ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
	movq	%rdi, %rax
L(start_erms):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(movsb_more_2x_vec)
L(last_2x_vec):
	/* From VEC up to 2 * VEC.  No branch when size == VEC_SIZE.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
L(return):
	VZEROUPPER
	ret

L(movsb):
	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
	jae	L(more_8x_vec)
	cmpq	%rsi, %rdi
	jb	1f
	/* Source == destination is less common.  */
	je	L(nop)
	leaq	(%rsi,%rdx), %r9
	cmpq	%r9, %rdi
	/* Avoid slow backward REP MOVSB.  */
# if REP_MOVSB_THRESHOLD <= (VEC_SIZE * 8)
#  error Unsupported REP_MOVSB_THRESHOLD and VEC_SIZE!
# endif
	jb	L(more_8x_vec_backward)
1:
	mov	%RDX_LP, %RCX_LP
	rep movsb
L(nop):
	ret
#endif

L(less_vec):
	/* Less than 1 VEC.  */
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
#endif
#if VEC_SIZE > 32
	cmpb	$32, %dl
	jae	L(between_32_63)
#endif
#if VEC_SIZE > 16
	cmpb	$16, %dl
	jae	L(between_16_31)
#endif
	cmpb	$8, %dl
	jae	L(between_8_15)
	cmpb	$4, %dl
	jae	L(between_4_7)
	cmpb	$1, %dl
	ja	L(between_2_3)
	jb	1f
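	/* Here size is exactly 1 (size 0 branched to the ret below);
	   copy the single byte.  */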
	movzbl	(%rsi), %ecx
	movb	%cl, (%rdi)
1:
	ret
#if VEC_SIZE > 32
L(between_32_63):
	/* From 32 to 63.  No branch when size == 32.  */
	vmovdqu	(%rsi), %ymm0
	vmovdqu	-32(%rsi,%rdx), %ymm1
	vmovdqu	%ymm0, (%rdi)
	vmovdqu	%ymm1, -32(%rdi,%rdx)
	VZEROUPPER
	ret
#endif
#if VEC_SIZE > 16
	/* From 16 to 31.  No branch when size == 16.  */
L(between_16_31):
	vmovdqu	(%rsi), %xmm0
	vmovdqu	-16(%rsi,%rdx), %xmm1
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, -16(%rdi,%rdx)
	ret
#endif
L(between_8_15):
	/* From 8 to 15.  No branch when size == 8.  */
	movq	-8(%rsi,%rdx), %rcx
	movq	(%rsi), %rsi
	movq	%rcx, -8(%rdi,%rdx)
	movq	%rsi, (%rdi)
	ret
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
	movl	-4(%rsi,%rdx), %ecx
	movl	(%rsi), %esi
	movl	%ecx, -4(%rdi,%rdx)
	movl	%esi, (%rdi)
	ret
L(between_2_3):
	/* From 2 to 3.  No branch when size == 2.  */
	movzwl	-2(%rsi,%rdx), %ecx
	movzwl	(%rsi), %esi
	movw	%cx, -2(%rdi,%rdx)
	movw	%si, (%rdi)
	ret

#if defined USE_MULTIARCH && IS_IN (libc)
L(movsb_more_2x_vec):
	cmpq	$REP_MOVSB_THRESHOLD, %rdx
	ja	L(movsb)
#endif
L(more_2x_vec):
	/* More than 2 * VEC and there may be overlap between destination
	   and source.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)
	cmpq	$(VEC_SIZE * 4), %rdx
	jb	L(last_4x_vec)
	/* Copy from 4 * VEC to 8 * VEC, inclusively.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(4)
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
	VMOVU	%VEC(4), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
	VZEROUPPER
	ret
L(last_4x_vec):
	/* Copy from 2 * VEC to 4 * VEC.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
	VZEROUPPER
	ret

L(more_8x_vec):
	cmpq	%rsi, %rdi
	ja	L(more_8x_vec_backward)
	/* Source == destination is less common.  */
	je	L(nop)
	/* Load the first VEC and last 4 * VEC to support overlapping
	   addresses.  */
	VMOVU	(%rsi), %VEC(4)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
	/* Save start and stop of the destination buffer.  */
	movq	%rdi, %r11
	leaq	-VEC_SIZE(%rdi, %rdx), %rcx
	/* Align destination for aligned stores in the loop.  Compute
	   how much destination is misaligned.  */
	movq	%rdi, %r8
	andq	$(VEC_SIZE - 1), %r8
	/* Get the negative of offset for alignment.  */
	subq	$VEC_SIZE, %r8
	/* Adjust source.  */
	subq	%r8, %rsi
	/* Adjust destination which should be aligned now.  */
	subq	%r8, %rdi
	/* Adjust length.  */
	addq	%r8, %rdx
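	/* Worked example (assuming VEC_SIZE == 32): if %rdi is misaligned
	   by 5, %r8 becomes 5 - 32 = -27, so the subq/addq above advance
	   %rsi and %rdi by 27 bytes and shrink %rdx by 27.  The skipped
	   head bytes are still covered by the unaligned store of %VEC(4)
	   to the saved start in %r11 after the loop.  */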
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
	ja	L(large_forward)
#endif
L(loop_4x_vec_forward):
	/* Copy 4 * VEC at a time forward.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	addq	$(VEC_SIZE * 4), %rsi
	subq	$(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%rdi)
	VMOVA	%VEC(1), VEC_SIZE(%rdi)
	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
	addq	$(VEC_SIZE * 4), %rdi
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_forward)
	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (%rcx)
	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
	/* Store the first VEC.  */
	VMOVU	%VEC(4), (%r11)
	VZEROUPPER
	ret

L(more_8x_vec_backward):
	/* Load the first 4 * VEC and last VEC to support overlapping
	   addresses.  */
	VMOVU	(%rsi), %VEC(4)
	VMOVU	VEC_SIZE(%rsi), %VEC(5)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(8)
	/* Save stop of the destination buffer.  */
	leaq	-VEC_SIZE(%rdi, %rdx), %r11
	/* Align destination end for aligned stores in the loop.  Compute
	   how much destination end is misaligned.  */
	leaq	-VEC_SIZE(%rsi, %rdx), %rcx
	movq	%r11, %r9
	movq	%r11, %r8
	andq	$(VEC_SIZE - 1), %r8
	/* Adjust source.  */
	subq	%r8, %rcx
	/* Adjust the end of destination which should be aligned now.  */
	subq	%r8, %r9
	/* Adjust length.  */
	subq	%r8, %rdx
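	/* Worked example (assuming VEC_SIZE == 32): if the last store
	   address in %r11 is misaligned by 5, %r8 becomes 5, so %rcx,
	   %r9 and %rdx all drop by 5 and the loop stores become 32-byte
	   aligned.  The tail bytes the loop no longer writes are covered
	   by the unaligned store of %VEC(8) to %r11 after the loop.  */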
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
	ja	L(large_backward)
#endif
L(loop_4x_vec_backward):
	/* Copy 4 * VEC at a time backward.  */
	VMOVU	(%rcx), %VEC(0)
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
	subq	$(VEC_SIZE * 4), %rcx
	subq	$(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%r9)
	VMOVA	%VEC(1), -VEC_SIZE(%r9)
	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
	subq	$(VEC_SIZE * 4), %r9
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_backward)
	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(4), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), (%r11)
	VZEROUPPER
	ret

#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
L(large_forward):
	/* Don't use non-temporal store if there is overlap between
	   destination and source since destination may be in cache
	   when source is loaded.  */
	leaq	(%rdi, %rdx), %r10
	cmpq	%r10, %rsi
	jb	L(loop_4x_vec_forward)
L(loop_large_forward):
	/* Copy 4 * VEC at a time forward with non-temporal stores.  */
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	addq	$PREFETCHED_LOAD_SIZE, %rsi
	subq	$PREFETCHED_LOAD_SIZE, %rdx
	VMOVNT	%VEC(0), (%rdi)
	VMOVNT	%VEC(1), VEC_SIZE(%rdi)
	VMOVNT	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVNT	%VEC(3), (VEC_SIZE * 3)(%rdi)
	addq	$PREFETCHED_LOAD_SIZE, %rdi
	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
	ja	L(loop_large_forward)
	sfence
	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (%rcx)
	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
	/* Store the first VEC.  */
	VMOVU	%VEC(4), (%r11)
	VZEROUPPER
	ret

L(large_backward):
	/* Don't use non-temporal store if there is overlap between
	   destination and source since destination may be in cache
	   when source is loaded.  */
	leaq	(%rcx, %rdx), %r10
	cmpq	%r10, %r9
	jb	L(loop_4x_vec_backward)
L(loop_large_backward):
	/* Copy 4 * VEC at a time backward with non-temporal stores.  */
	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
	VMOVU	(%rcx), %VEC(0)
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
	subq	$PREFETCHED_LOAD_SIZE, %rcx
	subq	$PREFETCHED_LOAD_SIZE, %rdx
	VMOVNT	%VEC(0), (%r9)
	VMOVNT	%VEC(1), -VEC_SIZE(%r9)
	VMOVNT	%VEC(2), -(VEC_SIZE * 2)(%r9)
	VMOVNT	%VEC(3), -(VEC_SIZE * 3)(%r9)
	subq	$PREFETCHED_LOAD_SIZE, %r9
	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
	ja	L(loop_large_backward)
	sfence
	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(4), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), (%r11)
	VZEROUPPER
	ret
#endif
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))

#ifdef SHARED
# if IS_IN (libc)
#  ifdef USE_MULTIARCH
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
              MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
              MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
#  endif
strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
              MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
# endif
#endif
#if VEC_SIZE == 16 || defined SHARED
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
              MEMCPY_SYMBOL (__memcpy, unaligned))
#endif