/* strcpy with AVX2
   Copyright (C) 2011-2020 Free Software Foundation, Inc.
   Contributed by Intel Corporation.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

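/* Overview: the string is scanned and copied VEC_SIZE bytes at a time.
   vpcmpeqb against an all-zero vector followed by vpmovmskb locates the
   NUL terminator; the main loop handles four vectors per iteration,
   using vpminub to fold them into a single NUL test.  The same body
   also builds the stpcpy, strncpy and strcat family of variants via
   the USE_AS_* macros.  */
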
#if IS_IN (libc)

# ifndef USE_AS_STRCAT
#  include <sysdep.h>

#  ifndef STRCPY
#   define STRCPY	__strcpy_avx2
#  endif

# endif

/* Number of bytes in a vector register.  */
# ifndef VEC_SIZE
#  define VEC_SIZE	32
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

/* Zero register.  */
#define xmmZ	xmm0
#define ymmZ	ymm0

/* Mask register.  */
#define ymmM	ymm1
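
/* Register conventions throughout this file:
     %rdi  destination (advanced as the copy proceeds)
     %rsi  source
     %rcx  source offset within a vector
     %rdx  vpmovmskb compare mask, later the NUL byte index
     %r8   remaining byte count (strncpy/strncat only)
     %rax  return value: the original destination for strcpy/strncpy,
	   a pointer past the copied bytes for stpcpy
     ymmZ  all-zero vector used to locate the NUL terminator
     ymmM  byte mask produced by vpcmpeqb.  */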

# ifndef USE_AS_STRCAT

	.section .text.avx,"ax",@progbits
ENTRY (STRCPY)
#  ifdef USE_AS_STRNCPY
	mov	%RDX_LP, %R8_LP
	test	%R8_LP, %R8_LP
	jz	L(ExitZero)
#  endif
	mov	%rsi, %rcx
#  ifndef USE_AS_STPCPY
	mov	%rdi, %rax	/* save result */
#  endif

# endif

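/* ymmZ is zeroed once and compared against throughout.  The low bits
   of the source address pick the entry path: if the offset within a
   4 * VEC_SIZE block is at most 2 * VEC_SIZE, two full vectors can be
   loaded unaligned without crossing the block (and therefore the page)
   boundary; otherwise round %rsi down to a VEC_SIZE boundary and shift
   the stray low bytes out of the compare mask.  */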
	vpxor	%xmmZ, %xmmZ, %xmmZ

	and	$((VEC_SIZE * 4) - 1), %ecx
	cmp	$(VEC_SIZE * 2), %ecx
	jbe	L(SourceStringAlignmentLessTwoVecSize)

	and	$-VEC_SIZE, %rsi
	and	$(VEC_SIZE - 1), %ecx

	vpcmpeqb	(%rsi), %ymmZ, %ymmM
	vpmovmskb	%ymmM, %edx
	shr	%cl, %rdx
# ifdef USE_AS_STRNCPY
#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	mov	$VEC_SIZE, %r10
	sub	%rcx, %r10
	cmp	%r10, %r8
#  else
	mov	$(VEC_SIZE + 1), %r10
	sub	%rcx, %r10
	cmp	%r10, %r8
#  endif
	jbe	L(CopyVecSizeTailCase2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(CopyVecSizeTail)

	vpcmpeqb	VEC_SIZE(%rsi), %ymmZ, %ymm2
	vpmovmskb	%ymm2, %edx

# ifdef USE_AS_STRNCPY
	add	$VEC_SIZE, %r10
	cmp	%r10, %r8
	jbe	L(CopyTwoVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(CopyTwoVecSize)

	vmovdqu	(%rsi, %rcx), %ymm2	/* copy VEC_SIZE bytes */
	vmovdqu	%ymm2, (%rdi)
/* Neither of the first two vectors contains a NUL, so enter the block
   copy.  Rebase %rdi by the source offset so that aligned loads from
   (%rsi, %rcx) pair with unaligned stores to (%rdi, %rcx); for strncpy
   the offset is added back to the count, the add/sbb/or sequence below
   pinning %r8 to SIZE_MAX if the addition carries.  */
	.p2align 4
L(UnalignVecSizeBoth):
	sub	%rcx, %rdi
# ifdef USE_AS_STRNCPY
	add	%rcx, %r8
	sbb	%rcx, %rcx
	or	%rcx, %r8
# endif
	mov	$VEC_SIZE, %rcx
	vmovdqa	(%rsi, %rcx), %ymm2
	vmovdqu	%ymm2, (%rdi, %rcx)
	vmovdqa	VEC_SIZE(%rsi, %rcx), %ymm2
	vpcmpeqb	%ymm2, %ymmZ, %ymmM
	vpmovmskb	%ymmM, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$(VEC_SIZE * 3), %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec2)
# else
	jnz	L(CopyVecSize)
# endif

	vmovdqu	%ymm2, (%rdi, %rcx)
	vmovdqa	VEC_SIZE(%rsi, %rcx), %ymm3
	vpcmpeqb	%ymm3, %ymmZ, %ymmM
	vpmovmskb	%ymmM, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec3)
# else
	jnz	L(CopyVecSize)
# endif

	vmovdqu	%ymm3, (%rdi, %rcx)
	vmovdqa	VEC_SIZE(%rsi, %rcx), %ymm4
	vpcmpeqb	%ymm4, %ymmZ, %ymmM
	vpmovmskb	%ymmM, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec4)
# else
	jnz	L(CopyVecSize)
# endif

	vmovdqu	%ymm4, (%rdi, %rcx)
	vmovdqa	VEC_SIZE(%rsi, %rcx), %ymm2
	vpcmpeqb	%ymm2, %ymmZ, %ymmM
	vpmovmskb	%ymmM, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec2)
# else
	jnz	L(CopyVecSize)
# endif

	vmovdqu	%ymm2, (%rdi, %rcx)
	vmovdqa	VEC_SIZE(%rsi, %rcx), %ymm2
	vpcmpeqb	%ymm2, %ymmZ, %ymmM
	vpmovmskb	%ymmM, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec2)
# else
	jnz	L(CopyVecSize)
# endif

	vmovdqa	VEC_SIZE(%rsi, %rcx), %ymm3
	vmovdqu	%ymm2, (%rdi, %rcx)
	vpcmpeqb	%ymm3, %ymmZ, %ymmM
	vpmovmskb	%ymmM, %edx
	add	$VEC_SIZE, %rcx
# ifdef USE_AS_STRNCPY
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
# endif
	test	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec3)
# else
	jnz	L(CopyVecSize)
# endif

	vmovdqu	%ymm3, (%rdi, %rcx)
	mov	%rsi, %rdx
	lea	VEC_SIZE(%rsi, %rcx), %rsi
	and	$-(VEC_SIZE * 4), %rsi
	sub	%rsi, %rdx
	sub	%rdx, %rdi
# ifdef USE_AS_STRNCPY
	lea	(VEC_SIZE * 8)(%r8, %rdx), %r8
# endif
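
/* %rsi is now rounded down to a 4 * VEC_SIZE boundary and %rdi shifted
   by the same amount.  ymmM is known to be all-zero here: the last
   compare found no NUL, so it doubles as the zero vector for the
   vpcmpeqb inside the loop.  vpminub folds the four loaded vectors
   into one; a zero byte in the result means some vector contains the
   terminator.  */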
L(UnalignedFourVecSizeLoop):
	vmovdqa	(%rsi), %ymm4
	vmovdqa	VEC_SIZE(%rsi), %ymm5
	vmovdqa	(VEC_SIZE * 2)(%rsi), %ymm6
	vmovdqa	(VEC_SIZE * 3)(%rsi), %ymm7
	vpminub	%ymm5, %ymm4, %ymm2
	vpminub	%ymm7, %ymm6, %ymm3
	vpminub	%ymm2, %ymm3, %ymm3
	vpcmpeqb	%ymmM, %ymm3, %ymm3
	vpmovmskb	%ymm3, %edx
# ifdef USE_AS_STRNCPY
	sub	$(VEC_SIZE * 4), %r8
	jbe	L(UnalignedLeaveCase2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(UnalignedFourVecSizeLeave)

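/* Steady state: store the four vectors of the previous iteration while
   loading and testing the next four.  */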
L(UnalignedFourVecSizeLoop_start):
	add	$(VEC_SIZE * 4), %rdi
	add	$(VEC_SIZE * 4), %rsi
	vmovdqu	%ymm4, -(VEC_SIZE * 4)(%rdi)
	vmovdqa	(%rsi), %ymm4
	vmovdqu	%ymm5, -(VEC_SIZE * 3)(%rdi)
	vmovdqa	VEC_SIZE(%rsi), %ymm5
	vpminub	%ymm5, %ymm4, %ymm2
	vmovdqu	%ymm6, -(VEC_SIZE * 2)(%rdi)
	vmovdqa	(VEC_SIZE * 2)(%rsi), %ymm6
	vmovdqu	%ymm7, -VEC_SIZE(%rdi)
	vmovdqa	(VEC_SIZE * 3)(%rsi), %ymm7
	vpminub	%ymm7, %ymm6, %ymm3
	vpminub	%ymm2, %ymm3, %ymm3
	vpcmpeqb	%ymmM, %ymm3, %ymm3
	vpmovmskb	%ymm3, %edx
# ifdef USE_AS_STRNCPY
	sub	$(VEC_SIZE * 4), %r8
	jbe	L(UnalignedLeaveCase2OrCase3)
# endif
	test	%edx, %edx
	jz	L(UnalignedFourVecSizeLoop_start)

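/* A NUL lies somewhere in ymm4..ymm7.  Re-test each vector to find
   which one, store the preceding full vectors, then finish the partial
   copy.  */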
L(UnalignedFourVecSizeLeave):
	vpcmpeqb	%ymm4, %ymmZ, %ymmM
	vpmovmskb	%ymmM, %edx
	test	%edx, %edx
	jnz	L(CopyVecSizeUnaligned_0)

	vpcmpeqb	%ymm5, %ymmZ, %ymmM
	vpmovmskb	%ymmM, %ecx
	test	%ecx, %ecx
	jnz	L(CopyVecSizeUnaligned_16)

	vpcmpeqb	%ymm6, %ymmZ, %ymmM
	vpmovmskb	%ymmM, %edx
	test	%edx, %edx
	jnz	L(CopyVecSizeUnaligned_32)

	vpcmpeqb	%ymm7, %ymmZ, %ymmM
	vpmovmskb	%ymmM, %ecx
	bsf	%ecx, %edx
	vmovdqu	%ymm4, (%rdi)
	vmovdqu	%ymm5, VEC_SIZE(%rdi)
	vmovdqu	%ymm6, (VEC_SIZE * 2)(%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
#  ifdef USE_AS_STPCPY
	lea	(VEC_SIZE * 3)(%rdi, %rdx), %rax
#  endif
	vmovdqu	%ymm7, (VEC_SIZE * 3)(%rdi)
	add	$(VEC_SIZE - 1), %r8
	sub	%rdx, %r8
	lea	((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi
	jmp	L(StrncpyFillTailWithZero)
# else
	add	$(VEC_SIZE * 3), %rsi
	add	$(VEC_SIZE * 3), %rdi
	jmp	L(CopyVecSizeExit)
# endif

/* The source offset within its 4 * VEC_SIZE block is at most
   2 * VEC_SIZE, so the first two vectors can be loaded unaligned
   without crossing the block (or page) boundary.  */

L(SourceStringAlignmentLessTwoVecSize):
	vmovdqu	(%rsi), %ymm3
	vmovdqu	VEC_SIZE(%rsi), %ymm2
	vpcmpeqb	%ymm3, %ymmZ, %ymmM
	vpmovmskb	%ymmM, %edx

# ifdef USE_AS_STRNCPY
#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	cmp	$VEC_SIZE, %r8
#  else
	cmp	$(VEC_SIZE + 1), %r8
#  endif
	jbe	L(CopyVecSizeTail1Case2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(CopyVecSizeTail1)

	vmovdqu	%ymm3, (%rdi)
	vpcmpeqb	%ymm2, %ymmZ, %ymmM
	vpmovmskb	%ymmM, %edx

# ifdef USE_AS_STRNCPY
#  if defined USE_AS_STPCPY || defined USE_AS_STRCAT
	cmp	$(VEC_SIZE * 2), %r8
#  else
	cmp	$((VEC_SIZE * 2) + 1), %r8
#  endif
	jbe	L(CopyTwoVecSize1Case2OrCase3)
# endif
	test	%edx, %edx
	jnz	L(CopyTwoVecSize1)

	and	$-VEC_SIZE, %rsi
	and	$(VEC_SIZE - 1), %ecx
	jmp	L(UnalignVecSizeBoth)

/*----------------- End of the main copy loops -----------------*/

/* Case 1: a NUL was found and, for strncpy, lies within the count.  */

# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT)
	.p2align 4
L(CopyVecSize):
	add	%rcx, %rdi
# endif
L(CopyVecSizeTail):
	add	%rcx, %rsi
L(CopyVecSizeTail1):
	bsf	%edx, %edx
L(CopyVecSizeExit):
	cmp	$32, %edx
	jae	L(Exit32_63)
	cmp	$16, %edx
	jae	L(Exit16_31)
	cmp	$8, %edx
	jae	L(Exit8_15)
	cmp	$4, %edx
	jae	L(Exit4_7)
	cmp	$3, %edx
	je	L(Exit3)
	cmp	$1, %edx
	ja	L(Exit2)
	je	L(Exit1)
	movb	$0, (%rdi)
# ifdef USE_AS_STPCPY
	lea	(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$1, %r8
	lea	1(%rdi), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER
	ret

	.p2align 4
L(CopyTwoVecSize1):
	add	$VEC_SIZE, %rsi
	add	$VEC_SIZE, %rdi
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$VEC_SIZE, %r8
# endif
	jmp	L(CopyVecSizeTail1)

	.p2align 4
L(CopyTwoVecSize):
	bsf	%edx, %edx
	add	%rcx, %rsi
	add	$VEC_SIZE, %edx
	sub	%ecx, %edx
	jmp	L(CopyVecSizeExit)

	.p2align 4
L(CopyVecSizeUnaligned_0):
	bsf	%edx, %edx
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
#  endif
	vmovdqu	%ymm4, (%rdi)
	add	$((VEC_SIZE * 4) - 1), %r8
	sub	%rdx, %r8
	lea	1(%rdi, %rdx), %rdi
	jmp	L(StrncpyFillTailWithZero)
# else
	jmp	L(CopyVecSizeExit)
# endif

	.p2align 4
L(CopyVecSizeUnaligned_16):
	bsf	%ecx, %edx
	vmovdqu	%ymm4, (%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
#  ifdef USE_AS_STPCPY
	lea	VEC_SIZE(%rdi, %rdx), %rax
#  endif
	vmovdqu	%ymm5, VEC_SIZE(%rdi)
	add	$((VEC_SIZE * 3) - 1), %r8
	sub	%rdx, %r8
	lea	(VEC_SIZE + 1)(%rdi, %rdx), %rdi
	jmp	L(StrncpyFillTailWithZero)
# else
	add	$VEC_SIZE, %rsi
	add	$VEC_SIZE, %rdi
	jmp	L(CopyVecSizeExit)
# endif

	.p2align 4
L(CopyVecSizeUnaligned_32):
	bsf	%edx, %edx
	vmovdqu	%ymm4, (%rdi)
	vmovdqu	%ymm5, VEC_SIZE(%rdi)
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
#  ifdef USE_AS_STPCPY
	lea	(VEC_SIZE * 2)(%rdi, %rdx), %rax
#  endif
	vmovdqu	%ymm6, (VEC_SIZE * 2)(%rdi)
	add	$((VEC_SIZE * 2) - 1), %r8
	sub	%rdx, %r8
	lea	((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi
	jmp	L(StrncpyFillTailWithZero)
# else
	add	$(VEC_SIZE * 2), %rsi
	add	$(VEC_SIZE * 2), %rdi
	jmp	L(CopyVecSizeExit)
# endif

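/* The L(CopyVecSizeUnalignedVecN) stubs store the vector that had been
   loaded but not yet written when the NUL turned up, then share the
   common exit at L(CopyVecSizeVecExit).  */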
# ifdef USE_AS_STRNCPY
#  ifndef USE_AS_STRCAT
	.p2align 4
L(CopyVecSizeUnalignedVec6):
	vmovdqu	%ymm6, (%rdi, %rcx)
	jmp	L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec5):
	vmovdqu	%ymm5, (%rdi, %rcx)
	jmp	L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec4):
	vmovdqu	%ymm4, (%rdi, %rcx)
	jmp	L(CopyVecSizeVecExit)

	.p2align 4
L(CopyVecSizeUnalignedVec3):
	vmovdqu	%ymm3, (%rdi, %rcx)
	jmp	L(CopyVecSizeVecExit)
#  endif

/* Case 2: a NUL and the count limit fall in the same vector; copy up
   to whichever comes first.  */

	.p2align 4
L(CopyVecSizeCase2):
	add	$VEC_SIZE, %r8
	add	%rcx, %rdi
	add	%rcx, %rsi
	bsf	%edx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
	jmp	L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSizeCase2):
	add	%rcx, %rsi
	bsf	%edx, %edx
	add	$VEC_SIZE, %edx
	sub	%ecx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
	jmp	L(StrncpyExit)

L(CopyVecSizeTailCase2):
	add	%rcx, %rsi
	bsf	%edx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
	jmp	L(StrncpyExit)

L(CopyVecSizeTail1Case2):
	bsf	%edx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
	jmp	L(StrncpyExit)

/* Distinguish Case 2 from Case 3: a non-zero mask in %rdx means a NUL
   was seen (Case 2); otherwise only the count expired (Case 3).  */

	.p2align 4
L(CopyVecSizeCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyVecSizeCase2)
L(CopyVecSizeCase3):
	add	$VEC_SIZE, %r8
	add	%rcx, %rdi
	add	%rcx, %rsi
	jmp	L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSizeCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyTwoVecSizeCase2)
	add	%rcx, %rsi
	jmp	L(StrncpyExit)

	.p2align 4
L(CopyVecSizeTailCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyVecSizeTailCase2)
	add	%rcx, %rsi
	jmp	L(StrncpyExit)

	.p2align 4
L(CopyTwoVecSize1Case2OrCase3):
	add	$VEC_SIZE, %rdi
	add	$VEC_SIZE, %rsi
	sub	$VEC_SIZE, %r8
L(CopyVecSizeTail1Case2OrCase3):
	test	%rdx, %rdx
	jnz	L(CopyVecSizeTail1Case2)
	jmp	L(StrncpyExit)
# endif

/*----- Exits for copies of 1 to VEC_SIZE * 2 bytes, NUL included -----*/

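/* The ranged exits (L(Exit4_7) and up) copy with two overlapping loads
   and stores: one anchored at the start of the string and one ending
   exactly at the NUL (index in %rdx), avoiding a byte loop.  The tiny
   exits use single scalar moves.  For strncpy the remaining count is
   recomputed and the rest of the buffer is zero-filled via
   L(StrncpyFillTailWithZero).  */
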
	.p2align 4
L(Exit1):
	movzwl	(%rsi), %edx
	mov	%dx, (%rdi)
# ifdef USE_AS_STPCPY
	lea	1(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$2, %r8
	lea	2(%rdi), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER
	ret

	.p2align 4
L(Exit2):
	movzwl	(%rsi), %ecx
	mov	%cx, (%rdi)
	movb	$0, 2(%rdi)
# ifdef USE_AS_STPCPY
	lea	2(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$3, %r8
	lea	3(%rdi), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER
	ret

	.p2align 4
L(Exit3):
	mov	(%rsi), %edx
	mov	%edx, (%rdi)
# ifdef USE_AS_STPCPY
	lea	3(%rdi), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	$4, %r8
	lea	4(%rdi), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER
	ret

	.p2align 4
L(Exit4_7):
	mov	(%rsi), %ecx
	mov	%ecx, (%rdi)
	mov	-3(%rsi, %rdx), %ecx
	mov	%ecx, -3(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	%rdx, %r8
	sub	$1, %r8
	lea	1(%rdi, %rdx), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER
	ret

	.p2align 4
L(Exit8_15):
	mov	(%rsi), %rcx
	mov	-7(%rsi, %rdx), %r9
	mov	%rcx, (%rdi)
	mov	%r9, -7(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	%rdx, %r8
	sub	$1, %r8
	lea	1(%rdi, %rdx), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER
	ret

	.p2align 4
L(Exit16_31):
	vmovdqu	(%rsi), %xmm2
	vmovdqu	-15(%rsi, %rdx), %xmm3
	vmovdqu	%xmm2, (%rdi)
	vmovdqu	%xmm3, -15(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	%rdx, %r8
	sub	$1, %r8
	lea	1(%rdi, %rdx), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER
	ret

	.p2align 4
L(Exit32_63):
	vmovdqu	(%rsi), %ymm2
	vmovdqu	-31(%rsi, %rdx), %ymm3
	vmovdqu	%ymm2, (%rdi)
	vmovdqu	%ymm3, -31(%rdi, %rdx)
# ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
# endif
# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT
	sub	%rdx, %r8
	sub	$1, %r8
	lea	1(%rdi, %rdx), %rdi
	jnz	L(StrncpyFillTailWithZero)
# endif
	VZEROUPPER
	ret


# ifdef USE_AS_STRNCPY

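/* The L(StrncpyExitN) labels copy exactly %r8 bytes: the count ran out
   before a NUL was found.  Plain strncpy leaves the result
   unterminated in this case, as specified; only the strcat variant
   appends a NUL, and the stpcpy variant returns %rdi + %r8.  */
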
	.p2align 4
L(StrncpyExit1):
	movzbl	(%rsi), %edx
	mov	%dl, (%rdi)
#  ifdef USE_AS_STPCPY
	lea	1(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, 1(%rdi)
#  endif
	VZEROUPPER
	ret

	.p2align 4
L(StrncpyExit2):
	movzwl	(%rsi), %edx
	mov	%dx, (%rdi)
#  ifdef USE_AS_STPCPY
	lea	2(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, 2(%rdi)
#  endif
	VZEROUPPER
	ret

	.p2align 4
L(StrncpyExit3_4):
	movzwl	(%rsi), %ecx
	movzwl	-2(%rsi, %r8), %edx
	mov	%cx, (%rdi)
	mov	%dx, -2(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	VZEROUPPER
	ret

	.p2align 4
L(StrncpyExit5_8):
	mov	(%rsi), %ecx
	mov	-4(%rsi, %r8), %edx
	mov	%ecx, (%rdi)
	mov	%edx, -4(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	VZEROUPPER
	ret

	.p2align 4
L(StrncpyExit9_16):
	mov	(%rsi), %rcx
	mov	-8(%rsi, %r8), %rdx
	mov	%rcx, (%rdi)
	mov	%rdx, -8(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	VZEROUPPER
	ret

	.p2align 4
L(StrncpyExit17_32):
	vmovdqu	(%rsi), %xmm2
	vmovdqu	-16(%rsi, %r8), %xmm3
	vmovdqu	%xmm2, (%rdi)
	vmovdqu	%xmm3, -16(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	VZEROUPPER
	ret

	.p2align 4
L(StrncpyExit33_64):
	/* Copy 32 bytes from the start and 32 bytes ending at %r8;
	   the two stores overlap.  */
	vmovdqu	(%rsi), %ymm2
	vmovdqu	-VEC_SIZE(%rsi, %r8), %ymm3
	vmovdqu	%ymm2, (%rdi)
	vmovdqu	%ymm3, -VEC_SIZE(%rdi, %r8)
#  ifdef USE_AS_STPCPY
	lea	(%rdi, %r8), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi, %r8)
#  endif
	VZEROUPPER
	ret

	.p2align 4
L(StrncpyExit65):
	/* Exactly 65 bytes: 32 at offset 0, 32 at offset 32, 1 at 64.  */
	vmovdqu	(%rsi), %ymm2
	vmovdqu	32(%rsi), %ymm3
	mov	64(%rsi), %cl
	vmovdqu	%ymm2, (%rdi)
	vmovdqu	%ymm3, 32(%rdi)
	mov	%cl, 64(%rdi)
#  ifdef USE_AS_STPCPY
	lea	65(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, 65(%rdi)
#  endif
	VZEROUPPER
	ret

#  ifndef USE_AS_STRCAT

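/* The L(FillN) labels write %r8 zero bytes at %rdi.  %rdx is zero on
   entry (cleared in L(StrncpyFillTailWithZero)) and supplies the zero
   source for the narrow stores.  */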
	.p2align 4
L(Fill1):
	mov	%dl, (%rdi)
	VZEROUPPER
	ret

	.p2align 4
L(Fill2):
	mov	%dx, (%rdi)
	VZEROUPPER
	ret

	.p2align 4
L(Fill3_4):
	mov	%dx, (%rdi)
	mov	%dx, -2(%rdi, %r8)
	VZEROUPPER
	ret

	.p2align 4
L(Fill5_8):
	mov	%edx, (%rdi)
	mov	%edx, -4(%rdi, %r8)
	VZEROUPPER
	ret

	.p2align 4
L(Fill9_16):
	mov	%rdx, (%rdi)
	mov	%rdx, -8(%rdi, %r8)
	VZEROUPPER
	ret

	.p2align 4
L(Fill17_32):
	vmovdqu	%xmmZ, (%rdi)
	vmovdqu	%xmmZ, -16(%rdi, %r8)
	VZEROUPPER
	ret

	.p2align 4
L(CopyVecSizeUnalignedVec2):
	vmovdqu	%ymm2, (%rdi, %rcx)

	.p2align 4
L(CopyVecSizeVecExit):
	bsf	%edx, %edx
	add	$(VEC_SIZE - 1), %r8
	add	%rcx, %rdi
#   ifdef USE_AS_STPCPY
	lea	(%rdi, %rdx), %rax
#   endif
	sub	%rdx, %r8
	lea	1(%rdi, %rdx), %rdi

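/* Zero-fill the remaining %r8 bytes of the destination buffer.  Small
   residues go through the L(Fill) ladder; larger ones align %rdi to a
   vector boundary and stream aligned stores, four vectors at a time.  */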
	.p2align 4
L(StrncpyFillTailWithZero):
	xor	%edx, %edx
	sub	$VEC_SIZE, %r8
	jbe	L(StrncpyFillExit)

	vmovdqu	%ymmZ, (%rdi)
	add	$VEC_SIZE, %rdi

	mov	%rdi, %rsi
	and	$(VEC_SIZE - 1), %esi
	sub	%rsi, %rdi
	add	%rsi, %r8
	sub	$(VEC_SIZE * 4), %r8
	jb	L(StrncpyFillLessFourVecSize)

L(StrncpyFillLoopVmovdqa):
	vmovdqa	%ymmZ, (%rdi)
	vmovdqa	%ymmZ, VEC_SIZE(%rdi)
	vmovdqa	%ymmZ, (VEC_SIZE * 2)(%rdi)
	vmovdqa	%ymmZ, (VEC_SIZE * 3)(%rdi)
	add	$(VEC_SIZE * 4), %rdi
	sub	$(VEC_SIZE * 4), %r8
	jae	L(StrncpyFillLoopVmovdqa)

L(StrncpyFillLessFourVecSize):
	add	$(VEC_SIZE * 2), %r8
	jl	L(StrncpyFillLessTwoVecSize)
	vmovdqa	%ymmZ, (%rdi)
	vmovdqa	%ymmZ, VEC_SIZE(%rdi)
	add	$(VEC_SIZE * 2), %rdi
	sub	$VEC_SIZE, %r8
	jl	L(StrncpyFillExit)
	vmovdqa	%ymmZ, (%rdi)
	add	$VEC_SIZE, %rdi
	jmp	L(Fill)

	.p2align 4
L(StrncpyFillLessTwoVecSize):
	add	$VEC_SIZE, %r8
	jl	L(StrncpyFillExit)
	vmovdqa	%ymmZ, (%rdi)
	add	$VEC_SIZE, %rdi
	jmp	L(Fill)

	.p2align 4
L(StrncpyFillExit):
	add	$VEC_SIZE, %r8
L(Fill):
	cmp	$17, %r8d
	jae	L(Fill17_32)
	cmp	$9, %r8d
	jae	L(Fill9_16)
	cmp	$5, %r8d
	jae	L(Fill5_8)
	cmp	$3, %r8d
	jae	L(Fill3_4)
	cmp	$1, %r8d
	ja	L(Fill2)
	je	L(Fill1)
	VZEROUPPER
	ret

/* End of the ifndef USE_AS_STRCAT block.  */
#  endif

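/* The count expired inside the four-vector loop.  A non-zero mask in
   %rdx means a NUL was also seen in the last four vectors (Case 2);
   otherwise the copy is a pure truncation (Case 3).  */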
	.p2align 4
L(UnalignedLeaveCase2OrCase3):
	test	%rdx, %rdx
	jnz	L(UnalignedFourVecSizeLeaveCase2)
L(UnalignedFourVecSizeLeaveCase3):
	lea	(VEC_SIZE * 4)(%r8), %rcx
	and	$-VEC_SIZE, %rcx
	add	$(VEC_SIZE * 3), %r8
	jl	L(CopyVecSizeCase3)
	vmovdqu	%ymm4, (%rdi)
	sub	$VEC_SIZE, %r8
	jb	L(CopyVecSizeCase3)
	vmovdqu	%ymm5, VEC_SIZE(%rdi)
	sub	$VEC_SIZE, %r8
	jb	L(CopyVecSizeCase3)
	vmovdqu	%ymm6, (VEC_SIZE * 2)(%rdi)
	sub	$VEC_SIZE, %r8
	jb	L(CopyVecSizeCase3)
	vmovdqu	%ymm7, (VEC_SIZE * 3)(%rdi)
#  ifdef USE_AS_STPCPY
	lea	(VEC_SIZE * 4)(%rdi), %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (VEC_SIZE * 4)(%rdi)
#  endif
	VZEROUPPER
	ret

	.p2align 4
L(UnalignedFourVecSizeLeaveCase2):
	xor	%ecx, %ecx
	vpcmpeqb	%ymm4, %ymmZ, %ymmM
	vpmovmskb	%ymmM, %edx
	add	$(VEC_SIZE * 3), %r8
	jle	L(CopyVecSizeCase2OrCase3)
	test	%edx, %edx
#  ifndef USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec4)
#  else
	jnz	L(CopyVecSize)
#  endif
	vpcmpeqb	%ymm5, %ymmZ, %ymmM
	vpmovmskb	%ymmM, %edx
	vmovdqu	%ymm4, (%rdi)
	add	$VEC_SIZE, %rcx
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
	test	%edx, %edx
#  ifndef USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec5)
#  else
	jnz	L(CopyVecSize)
#  endif

	vpcmpeqb	%ymm6, %ymmZ, %ymmM
	vpmovmskb	%ymmM, %edx
	vmovdqu	%ymm5, VEC_SIZE(%rdi)
	add	$VEC_SIZE, %rcx
	sub	$VEC_SIZE, %r8
	jbe	L(CopyVecSizeCase2OrCase3)
	test	%edx, %edx
#  ifndef USE_AS_STRCAT
	jnz	L(CopyVecSizeUnalignedVec6)
#  else
	jnz	L(CopyVecSize)
#  endif

	vpcmpeqb	%ymm7, %ymmZ, %ymmM
	vpmovmskb	%ymmM, %edx
	vmovdqu	%ymm6, (VEC_SIZE * 2)(%rdi)
	lea	VEC_SIZE(%rdi, %rcx), %rdi
	lea	VEC_SIZE(%rsi, %rcx), %rsi
	bsf	%edx, %edx
	cmp	%r8d, %edx
	jb	L(CopyVecSizeExit)
L(StrncpyExit):
	cmp	$65, %r8d
	je	L(StrncpyExit65)
	cmp	$33, %r8d
	jae	L(StrncpyExit33_64)
	cmp	$17, %r8d
	jae	L(StrncpyExit17_32)
	cmp	$9, %r8d
	jae	L(StrncpyExit9_16)
	cmp	$5, %r8d
	jae	L(StrncpyExit5_8)
	cmp	$3, %r8d
	jae	L(StrncpyExit3_4)
	cmp	$1, %r8d
	ja	L(StrncpyExit2)
	je	L(StrncpyExit1)
#  ifdef USE_AS_STPCPY
	mov	%rdi, %rax
#  endif
#  ifdef USE_AS_STRCAT
	movb	$0, (%rdi)
#  endif
	VZEROUPPER
	ret

	.p2align 4
L(ExitZero):
#  ifndef USE_AS_STRCAT
	mov	%rdi, %rax
#  endif
	VZEROUPPER
	ret

# endif

# ifndef USE_AS_STRCAT
END (STRCPY)
# else
END (STRCAT)
# endif
#endif