/* memmove/memcpy/mempcpy with unaligned load/store and rep movsb
   Copyright (C) 2016-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

/* memmove/memcpy/mempcpy is implemented as:
   1. Use overlapping load and store to avoid branch.
   2. Load all sources into registers and store them together to avoid
      possible address overlap between source and destination.
   3. If size is 8 * VEC_SIZE or less, load all sources into registers
      and store them together.
   4. If address of destination > address of source, backward copy
      4 * VEC_SIZE at a time with unaligned load and aligned store.
      Load the first 4 * VEC and last VEC before the loop and store
      them after the loop to support overlapping addresses.
   5. Otherwise, forward copy 4 * VEC_SIZE at a time with unaligned
      load and aligned store.  Load the last 4 * VEC and first VEC
      before the loop and store them after the loop to support
      overlapping addresses.
   6. If size >= __x86_shared_non_temporal_threshold and there is no
      overlap between destination and source, use non-temporal store
      instead of aligned store.  */
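
/* A rough C-level sketch of the steps above (illustrative only; the
   helper names are hypothetical and the REP MOVSB path of the _erms
   variants is omitted):

     void *memmove_sketch (void *dst, const void *src, size_t n)
     {
       if (n <= 2 * VEC_SIZE)
         return copy_le_2x_vec (dst, src, n);            // steps 1-2
       if (n <= 8 * VEC_SIZE)
         return copy_le_8x_vec (dst, src, n);            // step 3
       bool nt = n >= __x86_shared_non_temporal_threshold
                 && !ranges_overlap (dst, src, n);       // step 6
       if ((uintptr_t) dst > (uintptr_t) src)
         return copy_4x_vec_backward (dst, src, n, nt);  // step 4
       return copy_4x_vec_forward (dst, src, n, nt);     // step 5
     }  */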

#include <sysdep.h>

#ifndef MEMCPY_SYMBOL
# define MEMCPY_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMPCPY_SYMBOL
# define MEMPCPY_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef MEMMOVE_CHK_SYMBOL
# define MEMMOVE_CHK_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
#endif

#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER vzeroupper
# else
#  define VZEROUPPER
# endif
#endif

/* Threshold to use Enhanced REP MOVSB.  Since there is overhead to set
   up REP MOVSB operation, REP MOVSB isn't faster on short data.  The
   memcpy micro benchmark in glibc shows that 2KB is the approximate
   value above which REP MOVSB becomes faster than SSE2 optimization
   on processors with Enhanced REP MOVSB.  Since larger register size
   can move more data with a single load and store, the threshold is
   higher with larger register size.  */
#ifndef REP_MOVSB_THRESHOLD
# define REP_MOVSB_THRESHOLD	(2048 * (VEC_SIZE / 16))
#endif
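
/* With the default definition above: VEC_SIZE == 16 gives a threshold
   of 2048 bytes, VEC_SIZE == 32 gives 4096 and VEC_SIZE == 64 gives
   8192.  */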

#ifndef PREFETCH
# define PREFETCH(addr) prefetcht0 addr
#endif

/* Assume 64-byte prefetch size.  */
#ifndef PREFETCH_SIZE
# define PREFETCH_SIZE 64
#endif

#define PREFETCHED_LOAD_SIZE (VEC_SIZE * 4)

#if PREFETCH_SIZE == 64
# if PREFETCHED_LOAD_SIZE == PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base)
# elif PREFETCHED_LOAD_SIZE == 2 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base)
# elif PREFETCHED_LOAD_SIZE == 4 * PREFETCH_SIZE
#  define PREFETCH_ONE_SET(dir, base, offset) \
	PREFETCH ((offset)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 2)base); \
	PREFETCH ((offset + dir * PREFETCH_SIZE * 3)base)
# else
#  error Unsupported PREFETCHED_LOAD_SIZE!
# endif
#else
# error Unsupported PREFETCH_SIZE!
#endif

#ifndef SECTION
# error SECTION is not defined!
#endif

	.section SECTION(.text),"ax",@progbits
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
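	/* The fourth argument (%rcx) is the size of the destination
	   object; fail if it is smaller than the number of bytes to
	   copy in %rdx.  */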
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
#endif

ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
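	/* mempcpy returns a pointer to the end of the destination, so
	   compute dest + length in %rax before joining the common
	   memmove code.  */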
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start)
END (MEMPCPY_SYMBOL (__mempcpy, unaligned))

#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
#endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
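	/* memmove and memcpy return the destination pointer unchanged.  */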
	movq	%rdi, %rax
L(start):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(more_2x_vec)
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(last_2x_vec):
#endif
	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
	VZEROUPPER
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(nop):
#endif
	ret
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))

# if VEC_SIZE == 16
ENTRY (__mempcpy_chk_erms)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk_erms)

/* Only used to measure performance of REP MOVSB.  */
ENTRY (__mempcpy_erms)
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start_movsb)
END (__mempcpy_erms)

ENTRY (__memmove_chk_erms)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk_erms)

ENTRY (__memmove_erms)
	movq	%rdi, %rax
L(start_movsb):
	mov	%RDX_LP, %RCX_LP
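	/* If the destination starts below the source, or at or beyond
	   the end of the source, a forward REP MOVSB cannot clobber
	   unread source bytes; only when the destination starts inside
	   the source range do we have to copy backward.  */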
	cmp	%RSI_LP, %RDI_LP
	jb	1f
	/* Source == destination is less common.  */
	je	2f
	lea	(%rsi,%rcx), %RDX_LP
	cmp	%RDX_LP, %RDI_LP
	jb	L(movsb_backward)
1:
	rep movsb
2:
	ret
L(movsb_backward):
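	/* Copy backward: point %rdi and %rsi at the last byte of each
	   buffer, set the direction flag for a descending REP MOVSB,
	   then restore the flag afterwards.  */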
	leaq	-1(%rdi,%rcx), %rdi
	leaq	-1(%rsi,%rcx), %rsi
	std
	rep movsb
	cld
	ret
END (__memmove_erms)
strong_alias (__memmove_erms, __memcpy_erms)
strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
# endif

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
	mov	%RDI_LP, %RAX_LP
	add	%RDX_LP, %RAX_LP
	jmp	L(start_erms)
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
# endif

ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
	movq	%rdi, %rax
L(start_erms):
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(movsb_more_2x_vec)
L(last_2x_vec):
	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(1)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), -VEC_SIZE(%rdi,%rdx)
L(return):
	VZEROUPPER
	ret

L(movsb):
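	/* Reached when size is above REP_MOVSB_THRESHOLD.  Fall back to
	   the 8x VEC paths both for very large copies, where
	   non-temporal stores may be used, and for backward overlap,
	   where REP MOVSB would be slow.  */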
	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
	jae	L(more_8x_vec)
	cmpq	%rsi, %rdi
	jb	1f
	/* Source == destination is less common.  */
	je	L(nop)
	leaq	(%rsi,%rdx), %r9
	cmpq	%r9, %rdi
	/* Avoid slow backward REP MOVSB.  */
# if REP_MOVSB_THRESHOLD <= (VEC_SIZE * 8)
#  error Unsupported REP_MOVSB_THRESHOLD and VEC_SIZE!
# endif
	jb	L(more_8x_vec_backward)
1:
	mov	%RDX_LP, %RCX_LP
	rep movsb
L(nop):
	ret
#endif

L(less_vec):
	/* Less than 1 VEC.  */
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
#endif
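	/* Size is below VEC_SIZE (at most 63 bytes), so the low byte of
	   %rdx is enough for the comparisons below.  */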
#if VEC_SIZE > 32
	cmpb	$32, %dl
	jae	L(between_32_63)
#endif
#if VEC_SIZE > 16
	cmpb	$16, %dl
	jae	L(between_16_31)
#endif
	cmpb	$8, %dl
	jae	L(between_8_15)
	cmpb	$4, %dl
	jae	L(between_4_7)
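	/* 0 to 3 bytes left: sizes 2 and 3 branch away, size 0 skips the
	   store, and size 1 falls through to copy a single byte.  */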
	cmpb	$1, %dl
	ja	L(between_2_3)
	jb	1f
	movzbl	(%rsi), %ecx
	movb	%cl, (%rdi)
1:
	ret
#if VEC_SIZE > 32
L(between_32_63):
	/* From 32 to 63.  No branch when size == 32.  */
	vmovdqu	(%rsi), %ymm0
	vmovdqu	-32(%rsi,%rdx), %ymm1
	vmovdqu	%ymm0, (%rdi)
	vmovdqu	%ymm1, -32(%rdi,%rdx)
	VZEROUPPER
	ret
#endif
#if VEC_SIZE > 16
	/* From 16 to 31.  No branch when size == 16.  */
L(between_16_31):
	vmovdqu	(%rsi), %xmm0
	vmovdqu	-16(%rsi,%rdx), %xmm1
	vmovdqu	%xmm0, (%rdi)
	vmovdqu	%xmm1, -16(%rdi,%rdx)
	ret
#endif
L(between_8_15):
	/* From 8 to 15.  No branch when size == 8.  */
	movq	-8(%rsi,%rdx), %rcx
	movq	(%rsi), %rsi
	movq	%rcx, -8(%rdi,%rdx)
	movq	%rsi, (%rdi)
	ret
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
	movl	-4(%rsi,%rdx), %ecx
	movl	(%rsi), %esi
	movl	%ecx, -4(%rdi,%rdx)
	movl	%esi, (%rdi)
	ret
L(between_2_3):
	/* From 2 to 3.  No branch when size == 2.  */
	movzwl	-2(%rsi,%rdx), %ecx
	movzwl	(%rsi), %esi
	movw	%cx, -2(%rdi,%rdx)
	movw	%si, (%rdi)
	ret

#if defined USE_MULTIARCH && IS_IN (libc)
L(movsb_more_2x_vec):
	cmpq	$REP_MOVSB_THRESHOLD, %rdx
	ja	L(movsb)
#endif
L(more_2x_vec):
	/* More than 2 * VEC and there may be overlap between destination
	   and source.  */
	cmpq	$(VEC_SIZE * 8), %rdx
	ja	L(more_8x_vec)
	cmpq	$(VEC_SIZE * 4), %rdx
	jb	L(last_4x_vec)
	/* Copy from 4 * VEC to 8 * VEC, inclusively.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(4)
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(3), (VEC_SIZE * 3)(%rdi)
	VMOVU	%VEC(4), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
	VMOVU	%VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
	VZEROUPPER
	ret
L(last_4x_vec):
	/* Copy from 2 * VEC to 4 * VEC.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
	VMOVU	-(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(1), VEC_SIZE(%rdi)
	VMOVU	%VEC(2), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
	VZEROUPPER
	ret

L(more_8x_vec):
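	/* More than 8 * VEC: pick the copy direction from the relative
	   position of source and destination so that overlapping
	   regions are handled correctly.  */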
	cmpq	%rsi, %rdi
	ja	L(more_8x_vec_backward)
	/* Source == destination is less common.  */
	je	L(nop)
	/* Load the first VEC and last 4 * VEC to support overlapping
	   addresses.  */
	VMOVU	(%rsi), %VEC(4)
	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(5)
	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
	/* Save start and stop of the destination buffer.  */
	movq	%rdi, %r11
	leaq	-VEC_SIZE(%rdi, %rdx), %rcx
	/* Align destination for aligned stores in the loop.  Compute
	   how much destination is misaligned.  */
	movq	%rdi, %r8
	andq	$(VEC_SIZE - 1), %r8
	/* Get the negative of offset for alignment.  */
	subq	$VEC_SIZE, %r8
	/* Adjust source.  */
	subq	%r8, %rsi
	/* Adjust destination which should be aligned now.  */
	subq	%r8, %rdi
	/* Adjust length.  */
	addq	%r8, %rdx
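	/* For example, assuming VEC_SIZE == 32: if %rdi ends in 0x07,
	   then %r8 == 7 - 32 == -25, so source and destination advance
	   by 25 bytes to the next 32-byte boundary and the length
	   shrinks by 25.  When %rdi is already aligned, they advance by
	   a full VEC_SIZE; the bytes skipped here are covered by the
	   first VEC saved in %VEC(4) and stored at %r11 after the
	   loop.  */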
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
	ja	L(large_forward)
#endif
L(loop_4x_vec_forward):
	/* Copy 4 * VEC a time forward.  */
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	addq	$(VEC_SIZE * 4), %rsi
	subq	$(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%rdi)
	VMOVA	%VEC(1), VEC_SIZE(%rdi)
	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
	addq	$(VEC_SIZE * 4), %rdi
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_forward)
	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (%rcx)
	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
	/* Store the first VEC.  */
	VMOVU	%VEC(4), (%r11)
	VZEROUPPER
	ret

L(more_8x_vec_backward):
	/* Load the first 4 * VEC and last VEC to support overlapping
	   addresses.  */
	VMOVU	(%rsi), %VEC(4)
	VMOVU	VEC_SIZE(%rsi), %VEC(5)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(6)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(7)
	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(8)
	/* Save stop of the destination buffer.  */
	leaq	-VEC_SIZE(%rdi, %rdx), %r11
	/* Align destination end for aligned stores in the loop.  Compute
	   how much destination end is misaligned.  */
	leaq	-VEC_SIZE(%rsi, %rdx), %rcx
	movq	%r11, %r9
	movq	%r11, %r8
	andq	$(VEC_SIZE - 1), %r8
	/* Adjust source.  */
	subq	%r8, %rcx
	/* Adjust the end of destination which should be aligned now.  */
	subq	%r8, %r9
	/* Adjust length.  */
	subq	%r8, %rdx
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
	/* Check non-temporal store threshold.  */
	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
	ja	L(large_backward)
#endif
L(loop_4x_vec_backward):
	/* Copy 4 * VEC a time backward.  */
	VMOVU	(%rcx), %VEC(0)
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
	subq	$(VEC_SIZE * 4), %rcx
	subq	$(VEC_SIZE * 4), %rdx
	VMOVA	%VEC(0), (%r9)
	VMOVA	%VEC(1), -VEC_SIZE(%r9)
	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
	subq	$(VEC_SIZE * 4), %r9
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec_backward)
	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(4), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), (%r11)
	VZEROUPPER
	ret

#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
L(large_forward):
	/* Don't use non-temporal store if there is overlap between
	   destination and source since destination may be in cache
	   when source is loaded.  */
	leaq	(%rdi, %rdx), %r10
	cmpq	%r10, %rsi
	jb	L(loop_4x_vec_forward)
L(loop_large_forward):
	/* Copy 4 * VEC a time forward with non-temporal stores.  */
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
	VMOVU	(%rsi), %VEC(0)
	VMOVU	VEC_SIZE(%rsi), %VEC(1)
	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
	addq	$PREFETCHED_LOAD_SIZE, %rsi
	subq	$PREFETCHED_LOAD_SIZE, %rdx
	VMOVNT	%VEC(0), (%rdi)
	VMOVNT	%VEC(1), VEC_SIZE(%rdi)
	VMOVNT	%VEC(2), (VEC_SIZE * 2)(%rdi)
	VMOVNT	%VEC(3), (VEC_SIZE * 3)(%rdi)
	addq	$PREFETCHED_LOAD_SIZE, %rdi
	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
	ja	L(loop_large_forward)
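	/* Non-temporal stores are weakly ordered; fence before the
	   ordinary stores of the head and tail below.  */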
	sfence
	/* Store the last 4 * VEC.  */
	VMOVU	%VEC(5), (%rcx)
	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
	/* Store the first VEC.  */
	VMOVU	%VEC(4), (%r11)
	VZEROUPPER
	ret

L(large_backward):
	/* Don't use non-temporal store if there is overlap between
	   destination and source since destination may be in cache
	   when source is loaded.  */
	leaq	(%rcx, %rdx), %r10
	cmpq	%r10, %r9
	jb	L(loop_4x_vec_backward)
L(loop_large_backward):
	/* Copy 4 * VEC a time backward with non-temporal stores.  */
	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
	VMOVU	(%rcx), %VEC(0)
	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
	subq	$PREFETCHED_LOAD_SIZE, %rcx
	subq	$PREFETCHED_LOAD_SIZE, %rdx
	VMOVNT	%VEC(0), (%r9)
	VMOVNT	%VEC(1), -VEC_SIZE(%r9)
	VMOVNT	%VEC(2), -(VEC_SIZE * 2)(%r9)
	VMOVNT	%VEC(3), -(VEC_SIZE * 3)(%r9)
	subq	$PREFETCHED_LOAD_SIZE, %r9
	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
	ja	L(loop_large_backward)
	sfence
	/* Store the first 4 * VEC.  */
	VMOVU	%VEC(4), (%rdi)
	VMOVU	%VEC(5), VEC_SIZE(%rdi)
	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
	/* Store the last VEC.  */
	VMOVU	%VEC(8), (%r11)
	VZEROUPPER
	ret
#endif
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))

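/* memcpy and __memcpy_chk are aliases of the corresponding memmove
   entry points: the implementation above already handles overlapping
   buffers, so a separate memcpy body is unnecessary.  */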
#if IS_IN (libc)
# ifdef USE_MULTIARCH
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy, unaligned_erms))
#  ifdef SHARED
strong_alias (MEMMOVE_SYMBOL (__memmove_chk, unaligned_erms),
	      MEMMOVE_SYMBOL (__memcpy_chk, unaligned_erms))
#  endif
# endif
# ifdef SHARED
strong_alias (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned),
	      MEMMOVE_CHK_SYMBOL (__memcpy_chk, unaligned))
# endif
#endif
strong_alias (MEMMOVE_SYMBOL (__memmove, unaligned),
	      MEMCPY_SYMBOL (__memcpy, unaligned))