1/* wcscpy with SSSE3
2 Copyright (C) 2011-2020 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
19
20#if IS_IN (libc)
21# include <sysdep.h>
22
23 .section .text.ssse3,"ax",@progbits
/* __wcscpy_ssse3: copy the zero-terminated string of 32-bit elements
   at %rsi to the buffer at %rdi (elements are tested with cmpl/pcmpeqd,
   i.e. as 32-bit values).

   In:   %rdi = destination, %rsi = source.
   Out:  %rax = destination (every exit path ends in "mov %rdi, %rax";
	 %rdi itself is never written).
   Uses: %rax, %rcx, %rdx, %rsi, %r9, %xmm0-%xmm7, flags.  No stack use.

   Register roles set up here and relied on by the rest of the function:
     %rcx   running source pointer
     %rdx   running destination pointer
     %rsi   byte offset that L(CopyFrom1To16Bytes) adds to both pointers
     %rax   pmovmskb result: 4 consecutive bits per 32-bit element, set
	    where the element compared equal to zero
     %xmm0  zero comparand; a pcmpeqd that finds no zero element leaves it
	    all-zero again, so it can be reused without re-clearing.

   NOTE(review): the dispatch at the end only tells relative offsets
   0/4/8/12 apart (anything else falls into L(Shl12)), so it presumably
   assumes both pointers are 4-byte aligned -- confirm against callers.  */
24ENTRY (__wcscpy_ssse3)
25
26 mov %rsi, %rcx
27 mov %rdi, %rdx
28
/* Test the first four elements one at a time.  If the terminator is among
   them, L(ExitN) copies exactly N bytes (terminator included) and
   returns.  */
29 cmpl $0, (%rcx)
30 jz L(Exit4)
31 cmpl $0, 4(%rcx)
32 jz L(Exit8)
33 cmpl $0, 8(%rcx)
34 jz L(Exit12)
35 cmpl $0, 12(%rcx)
36 jz L(Exit16)
37
/* %rsi = (src + 16) rounded down to 16: the first 16-byte aligned address
   strictly above src.  It lies inside or directly after the 16 bytes
   tested above.  */
38 lea 16(%rcx), %rsi
39 and $-16, %rsi
40
/* Copy the 16 bytes already known to hold no terminator (two 8-byte moves
   through %r9), interleaved with a zero scan of the aligned block at
   (%rsi).  */
41 pxor %xmm0, %xmm0
42 mov (%rcx), %r9
43 mov %r9, (%rdx)
44
45 pcmpeqd (%rsi), %xmm0
46 mov 8(%rcx), %r9
47 mov %r9, 8(%rdx)
48
/* %rsi becomes the distance from src to that aligned block (1..16), the
   form L(CopyFrom1To16Bytes) expects.  */
49 pmovmskb %xmm0, %rax
50 sub %rcx, %rsi
51
52 test %rax, %rax
53 jnz L(CopyFrom1To16Bytes)
54
/* No terminator yet.  Round the destination up to the next 16-byte
   boundary and advance the source by the same number of bytes:
     %rax = old dst - aligned dst  (negative)
     %rcx = src - %rax.
   Bytes between the new %rdx and old dst + 16 get copied a second time.  */
55 mov %rdx, %rax
56 lea 16(%rdx), %rdx
57 and $-16, %rdx
58 sub %rdx, %rax
59 sub %rax, %rcx
/* %rax = misalignment of the source now that the destination is aligned.
   "mov $0" (not xor) clears %rsi because mov leaves the flags alone: the
   jz below still tests the ZF produced by the and.  */
60 mov %rcx, %rax
61 and $0xf, %rax
62 mov $0, %rsi
63
64/* case: rcx_offset == rdx_offset */
65
66 jz L(Align16Both)
67
/* Otherwise choose the palignr variant that matches the source offset.  */
68 cmp $4, %rax
69 je L(Shl4)
70 cmp $8, %rax
71 je L(Shl8)
72 jmp L(Shl12)
73
/* L(Align16Both): source (%rcx) and destination (%rdx) are both 16-byte
   aligned and %rsi is 0 (set by the dispatch code that jumps here).

   Six unrolled steps share one pattern: load the next aligned block, store
   the previous one, compare the new block against zero, bump %rsi by 16,
   and leave through L(CopyFrom1To16Bytes) if a zero element was seen.  A
   block is stored only after the block that follows it has been loaded,
   and the block holding the terminator is left to the tail code.

   The first block at (%rcx) is stored without a compare: on this path it
   is the same aligned block the function prologue already scanned with
   "pcmpeqd (%rsi), %xmm0".

   %xmm0 is all-zero before every pcmpeqd here, because each earlier compare
   that fell through produced an all-zero mask.  */
74L(Align16Both):
75 movaps (%rcx), %xmm1
76 movaps 16(%rcx), %xmm2
77 movaps %xmm1, (%rdx)
78 pcmpeqd %xmm2, %xmm0
79 pmovmskb %xmm0, %rax
80 lea 16(%rsi), %rsi
81
82 test %rax, %rax
83 jnz L(CopyFrom1To16Bytes)
84
/* From here on %rsi is the offset of the block just checked, so
   16(%rcx, %rsi) is the next block to load and (%rdx, %rsi) is where the
   checked block goes.  */
85 movaps 16(%rcx, %rsi), %xmm3
86 movaps %xmm2, (%rdx, %rsi)
87 pcmpeqd %xmm3, %xmm0
88 pmovmskb %xmm0, %rax
89 lea 16(%rsi), %rsi
90
91 test %rax, %rax
92 jnz L(CopyFrom1To16Bytes)
93
94 movaps 16(%rcx, %rsi), %xmm4
95 movaps %xmm3, (%rdx, %rsi)
96 pcmpeqd %xmm4, %xmm0
97 pmovmskb %xmm0, %rax
98 lea 16(%rsi), %rsi
99
100 test %rax, %rax
101 jnz L(CopyFrom1To16Bytes)
102
103 movaps 16(%rcx, %rsi), %xmm1
104 movaps %xmm4, (%rdx, %rsi)
105 pcmpeqd %xmm1, %xmm0
106 pmovmskb %xmm0, %rax
107 lea 16(%rsi), %rsi
108
109 test %rax, %rax
110 jnz L(CopyFrom1To16Bytes)
111
112 movaps 16(%rcx, %rsi), %xmm2
113 movaps %xmm1, (%rdx, %rsi)
114 pcmpeqd %xmm2, %xmm0
115 pmovmskb %xmm0, %rax
116 lea 16(%rsi), %rsi
117
118 test %rax, %rax
119 jnz L(CopyFrom1To16Bytes)
120
121 movaps 16(%rcx, %rsi), %xmm3
122 movaps %xmm2, (%rdx, %rsi)
123 pcmpeqd %xmm3, %xmm0
124 pmovmskb %xmm0, %rax
125 lea 16(%rsi), %rsi
126
127 test %rax, %rax
128 jnz L(CopyFrom1To16Bytes)
129
/* Store the last checked block, then re-bias both pointers for the 64-byte
   loop: %rcx = (address just past the last stored block) rounded down to
   64, and %rdx moves by the same signed distance (%rax = old %rcx - new
   %rcx).  Rounding down may step back over blocks that were already
   copied; the loop then copies them again.  */
130 movaps %xmm3, (%rdx, %rsi)
131 mov %rcx, %rax
132 lea 16(%rcx, %rsi), %rcx
133 and $-0x40, %rcx
134 sub %rcx, %rax
135 sub %rax, %rdx
136
/* L(Aligned64Loop) advances both pointers by 64 before it can branch to
   the exit code, so the first block of a group sits at offset -64.  */
137 mov $-0x40, %rsi
138
/* L(Aligned64Loop): main loop when source and destination share the same
   16-byte alignment.  On entry %rcx is 64-byte aligned, %rdx is 16-byte
   aligned, %rsi = -0x40 and %xmm0 is all-zero.

   Each pass loads four aligned blocks (kept intact in %xmm4, %xmm5, %xmm6,
   %xmm7) and folds them with pminub, a byte-wise unsigned minimum.  A zero
   32-bit element in any block gives a zero element in the minimum, so one
   pcmpeqd screens all 64 bytes.  The screen can also fire when zero bytes
   from different blocks happen to line up; L(Aligned64Leave) handles that
   by checking the blocks one at a time and resuming the loop when none of
   them holds a terminator.  */
139 .p2align 4
140L(Aligned64Loop):
141 movaps (%rcx), %xmm2
142 movaps %xmm2, %xmm4
143 movaps 16(%rcx), %xmm5
144 movaps 32(%rcx), %xmm3
145 movaps %xmm3, %xmm6
146 movaps 48(%rcx), %xmm7
147 pminub %xmm5, %xmm2
148 pminub %xmm7, %xmm3
149 pminub %xmm2, %xmm3
/* The compare result goes to %xmm3, so %xmm0 stays zero.  */
150 pcmpeqd %xmm0, %xmm3
151 pmovmskb %xmm3, %rax
/* Pointers are advanced before the branch (lea does not touch the flags
   that matter here, since test comes after it); hence the negative
   displacements in the stores below and in L(Aligned64Leave).  */
152 lea 64(%rdx), %rdx
153 lea 64(%rcx), %rcx
154 test %rax, %rax
155 jnz L(Aligned64Leave)
156 movaps %xmm4, -64(%rdx)
157 movaps %xmm5, -48(%rdx)
158 movaps %xmm6, -32(%rdx)
159 movaps %xmm7, -16(%rdx)
160 jmp L(Aligned64Loop)
161
/* L(Aligned64Leave): the 64-byte screen fired.  Check the four saved blocks
   in order.  %rsi starts at -64 and grows by 16 per block, so when a block
   with a zero element is found, L(CopyFrom1To16Bytes) lands on it after
   adding %rsi to the already-advanced pointers.  A block is stored only
   once it is known to be terminator-free.  */
162L(Aligned64Leave):
163 pcmpeqd %xmm4, %xmm0
164 pmovmskb %xmm0, %rax
165 test %rax, %rax
166 jnz L(CopyFrom1To16Bytes)
167
168 pcmpeqd %xmm5, %xmm0
169
170 pmovmskb %xmm0, %rax
171 movaps %xmm4, -64(%rdx)
/* lea sits between test and jnz on purpose: it does not modify flags.  */
172 test %rax, %rax
173 lea 16(%rsi), %rsi
174 jnz L(CopyFrom1To16Bytes)
175
176 pcmpeqd %xmm6, %xmm0
177
178 pmovmskb %xmm0, %rax
179 movaps %xmm5, -48(%rdx)
180 test %rax, %rax
181 lea 16(%rsi), %rsi
182 jnz L(CopyFrom1To16Bytes)
183
184 movaps %xmm6, -32(%rdx)
185 pcmpeqd %xmm7, %xmm0
186
187 pmovmskb %xmm0, %rax
188 lea 16(%rsi), %rsi
189 test %rax, %rax
190 jnz L(CopyFrom1To16Bytes)
191
/* False alarm from the pminub screen: none of the four blocks has a zero
   element.  Restore %rsi, store the last block and keep looping.  */
192 mov $-0x40, %rsi
193 movaps %xmm7, -16(%rdx)
194 jmp L(Aligned64Loop)
195
/* L(Shl4): reached from the dispatch at the top of the function when
   (%rcx & 0xf) == 4 while %rdx is 16-byte aligned.

   All loads are aligned 16-byte blocks at -4(%rcx), 12(%rcx), 28(%rcx)...
   "palignr $4, prev, next" shifts the pair next:prev right by 4 bytes,
   which yields the 16 bytes starting at (%rcx); those are written with one
   aligned store to (%rdx).

   %xmm2 holds the newest block; %xmm1 and %xmm3 take turns holding the
   previous one.  %xmm0 is all-zero before each pcmpeqd (a compare that
   finds nothing leaves it zero).  Each block is checked for a zero element
   before it is merged and stored.  */
196 .p2align 4
197L(Shl4):
198 movaps -4(%rcx), %xmm1
199 movaps 12(%rcx), %xmm2
/* L(Shl4Start) is also the re-entry point from L(Shl4LoopStart) when the
   64-byte screen fires: %xmm1 and %xmm2 then already hold the previous and
   the first block of the group.  */
200L(Shl4Start):
201 pcmpeqd %xmm2, %xmm0
202 pmovmskb %xmm0, %rax
203 movaps %xmm2, %xmm3
204
205 test %rax, %rax
206 jnz L(Shl4LoopExit)
207
208 palignr $4, %xmm1, %xmm2
209 movaps %xmm2, (%rdx)
210 movaps 28(%rcx), %xmm2
211
/* The pointers advance before the test, so at L(Shl4LoopExit) the block
   that holds the terminator is always the one at 12(%rcx).  */
212 pcmpeqd %xmm2, %xmm0
213 lea 16(%rdx), %rdx
214 pmovmskb %xmm0, %rax
215 lea 16(%rcx), %rcx
216 movaps %xmm2, %xmm1
217
218 test %rax, %rax
219 jnz L(Shl4LoopExit)
220
221 palignr $4, %xmm3, %xmm2
222 movaps %xmm2, (%rdx)
223 movaps 28(%rcx), %xmm2
224
225 pcmpeqd %xmm2, %xmm0
226 lea 16(%rdx), %rdx
227 pmovmskb %xmm0, %rax
228 lea 16(%rcx), %rcx
229 movaps %xmm2, %xmm3
230
231 test %rax, %rax
232 jnz L(Shl4LoopExit)
233
234 palignr $4, %xmm1, %xmm2
235 movaps %xmm2, (%rdx)
236 movaps 28(%rcx), %xmm2
237
238 pcmpeqd %xmm2, %xmm0
239 lea 16(%rdx), %rdx
240 pmovmskb %xmm0, %rax
241 lea 16(%rcx), %rcx
242
243 test %rax, %rax
244 jnz L(Shl4LoopExit)
245
246 palignr $4, %xmm3, %xmm2
247 movaps %xmm2, (%rdx)
/* %rcx + 28 is the next aligned block not yet loaded; %rdx + 16 is the
   next destination slot.  */
248 lea 28(%rcx), %rcx
249 lea 16(%rdx), %rdx
250
/* Re-bias for the 64-byte loop: round that block address down to 64, pull
   %rdx back by the same number of bytes (%rax), then subtract 12 so that
   12(%rcx) is the 64-byte aligned block and %rcx/%rdx again name matching
   source/destination bytes.  Blocks stepped back over are copied again.  */
251 mov %rcx, %rax
252 and $-0x40, %rcx
253 sub %rcx, %rax
254 lea -12(%rcx), %rcx
255 sub %rax, %rdx
256
/* Previous block needed by the first palignr of the loop.  */
257 movaps -4(%rcx), %xmm1
258
/* L(Shl4LoopStart): 64 bytes per pass.  Four aligned blocks go to
   %xmm2..%xmm5; their byte-wise minimum (pminub) is compared against zero
   in one go.  A zero 32-bit element in any block gives a zero element in
   the minimum; the screen may also fire without a real terminator, which
   the block-by-block path at L(Shl4Start) sorts out.  */
259 .p2align 4
260L(Shl4LoopStart):
261 movaps 12(%rcx), %xmm2
262 movaps 28(%rcx), %xmm3
263 movaps %xmm3, %xmm6
264 movaps 44(%rcx), %xmm4
265 movaps %xmm4, %xmm7
266 movaps 60(%rcx), %xmm5
267 pminub %xmm2, %xmm6
268 pminub %xmm5, %xmm7
269 pminub %xmm6, %xmm7
/* Result lands in %xmm7, so %xmm0 stays zero.  */
270 pcmpeqd %xmm0, %xmm7
271 pmovmskb %xmm7, %rax
/* Keep an unshifted copy of the last block; it becomes the "previous
   block" (%xmm1) of the next pass.  The palignr placed between test and
   jnz does not change the flags.  */
272 movaps %xmm5, %xmm7
273 palignr $4, %xmm4, %xmm5
274 test %rax, %rax
275 palignr $4, %xmm3, %xmm4
276 jnz L(Shl4Start)
277
278 palignr $4, %xmm2, %xmm3
279 lea 64(%rcx), %rcx
280 palignr $4, %xmm1, %xmm2
281 movaps %xmm7, %xmm1
282 movaps %xmm5, 48(%rdx)
283 movaps %xmm4, 32(%rdx)
284 movaps %xmm3, 16(%rdx)
285 movaps %xmm2, (%rdx)
286 lea 64(%rdx), %rdx
287 jmp L(Shl4LoopStart)
288
/* L(Shl4LoopExit): the block at 12(%rcx) contains the terminator and %rax
   holds its mask.  The 12 bytes in front of it, (%rcx)..11(%rcx), have not
   been stored yet: copy them with a 16-byte unaligned move that starts 4
   bytes earlier (those 4 bytes are rewritten with the same data), then let
   L(CopyFrom1To16Bytes) finish from offset 12.  */
289L(Shl4LoopExit):
290 movdqu -4(%rcx), %xmm1
291 mov $12, %rsi
292 movdqu %xmm1, -4(%rdx)
293 jmp L(CopyFrom1To16Bytes)
294
/* L(Shl8): reached from the dispatch at the top of the function when
   (%rcx & 0xf) == 8 while %rdx is 16-byte aligned.

   All loads are aligned 16-byte blocks at -8(%rcx), 8(%rcx), 24(%rcx)...
   "palignr $8, prev, next" shifts the pair next:prev right by 8 bytes,
   which yields the 16 bytes starting at (%rcx); those are written with one
   aligned store to (%rdx).

   %xmm2 holds the newest block; %xmm1 and %xmm3 take turns holding the
   previous one.  %xmm0 is all-zero before each pcmpeqd (a compare that
   finds nothing leaves it zero).  Each block is checked for a zero element
   before it is merged and stored.  */
295 .p2align 4
296L(Shl8):
297 movaps -8(%rcx), %xmm1
298 movaps 8(%rcx), %xmm2
/* L(Shl8Start) is also the re-entry point from L(Shl8LoopStart) when the
   64-byte screen fires: %xmm1 and %xmm2 then already hold the previous and
   the first block of the group.  */
299L(Shl8Start):
300 pcmpeqd %xmm2, %xmm0
301 pmovmskb %xmm0, %rax
302 movaps %xmm2, %xmm3
303
304 test %rax, %rax
305 jnz L(Shl8LoopExit)
306
307 palignr $8, %xmm1, %xmm2
308 movaps %xmm2, (%rdx)
309 movaps 24(%rcx), %xmm2
310
/* The pointers advance before the test, so at L(Shl8LoopExit) the block
   that holds the terminator is always the one at 8(%rcx).  */
311 pcmpeqd %xmm2, %xmm0
312 lea 16(%rdx), %rdx
313 pmovmskb %xmm0, %rax
314 lea 16(%rcx), %rcx
315 movaps %xmm2, %xmm1
316
317 test %rax, %rax
318 jnz L(Shl8LoopExit)
319
320 palignr $8, %xmm3, %xmm2
321 movaps %xmm2, (%rdx)
322 movaps 24(%rcx), %xmm2
323
324 pcmpeqd %xmm2, %xmm0
325 lea 16(%rdx), %rdx
326 pmovmskb %xmm0, %rax
327 lea 16(%rcx), %rcx
328 movaps %xmm2, %xmm3
329
330 test %rax, %rax
331 jnz L(Shl8LoopExit)
332
333 palignr $8, %xmm1, %xmm2
334 movaps %xmm2, (%rdx)
335 movaps 24(%rcx), %xmm2
336
337 pcmpeqd %xmm2, %xmm0
338 lea 16(%rdx), %rdx
339 pmovmskb %xmm0, %rax
340 lea 16(%rcx), %rcx
341
342 test %rax, %rax
343 jnz L(Shl8LoopExit)
344
345 palignr $8, %xmm3, %xmm2
346 movaps %xmm2, (%rdx)
/* %rcx + 24 is the next aligned block not yet loaded; %rdx + 16 is the
   next destination slot.  */
347 lea 24(%rcx), %rcx
348 lea 16(%rdx), %rdx
349
/* Re-bias for the 64-byte loop: round that block address down to 64, pull
   %rdx back by the same number of bytes (%rax), then subtract 8 so that
   8(%rcx) is the 64-byte aligned block and %rcx/%rdx again name matching
   source/destination bytes.  Blocks stepped back over are copied again.  */
350 mov %rcx, %rax
351 and $-0x40, %rcx
352 sub %rcx, %rax
353 lea -8(%rcx), %rcx
354 sub %rax, %rdx
355
/* Previous block needed by the first palignr of the loop.  */
356 movaps -8(%rcx), %xmm1
357
/* L(Shl8LoopStart): 64 bytes per pass.  Four aligned blocks go to
   %xmm2..%xmm5; their byte-wise minimum (pminub) is compared against zero
   in one go.  A zero 32-bit element in any block gives a zero element in
   the minimum; the screen may also fire without a real terminator, which
   the block-by-block path at L(Shl8Start) sorts out.  */
358 .p2align 4
359L(Shl8LoopStart):
360 movaps 8(%rcx), %xmm2
361 movaps 24(%rcx), %xmm3
362 movaps %xmm3, %xmm6
363 movaps 40(%rcx), %xmm4
364 movaps %xmm4, %xmm7
365 movaps 56(%rcx), %xmm5
366 pminub %xmm2, %xmm6
367 pminub %xmm5, %xmm7
368 pminub %xmm6, %xmm7
/* Result lands in %xmm7, so %xmm0 stays zero.  */
369 pcmpeqd %xmm0, %xmm7
370 pmovmskb %xmm7, %rax
/* Keep an unshifted copy of the last block; it becomes the "previous
   block" (%xmm1) of the next pass.  The palignr placed between test and
   jnz does not change the flags.  */
371 movaps %xmm5, %xmm7
372 palignr $8, %xmm4, %xmm5
373 test %rax, %rax
374 palignr $8, %xmm3, %xmm4
375 jnz L(Shl8Start)
376
377 palignr $8, %xmm2, %xmm3
378 lea 64(%rcx), %rcx
379 palignr $8, %xmm1, %xmm2
380 movaps %xmm7, %xmm1
381 movaps %xmm5, 48(%rdx)
382 movaps %xmm4, 32(%rdx)
383 movaps %xmm3, 16(%rdx)
384 movaps %xmm2, (%rdx)
385 lea 64(%rdx), %rdx
386 jmp L(Shl8LoopStart)
387
/* L(Shl8LoopExit): the block at 8(%rcx) contains the terminator and %rax
   holds its mask.  Copy the 8 not-yet-stored bytes in front of it through
   %r9, then let L(CopyFrom1To16Bytes) finish from offset 8.  */
388L(Shl8LoopExit):
389 mov (%rcx), %r9
390 mov $8, %rsi
391 mov %r9, (%rdx)
392 jmp L(CopyFrom1To16Bytes)
393
/* L(Shl12): reached from the dispatch at the top of the function as the
   fall-back when (%rcx & 0xf) is neither 0, 4 nor 8, while %rdx is 16-byte
   aligned.  The movaps loads below require (%rcx & 0xf) == 12.

   All loads are aligned 16-byte blocks at -12(%rcx), 4(%rcx), 20(%rcx)...
   "palignr $12, prev, next" shifts the pair next:prev right by 12 bytes,
   which yields the 16 bytes starting at (%rcx); those are written with one
   aligned store to (%rdx).

   %xmm2 holds the newest block; %xmm1 and %xmm3 take turns holding the
   previous one.  %xmm0 is all-zero before each pcmpeqd (a compare that
   finds nothing leaves it zero).  Each block is checked for a zero element
   before it is merged and stored.  */
394 .p2align 4
395L(Shl12):
396 movaps -12(%rcx), %xmm1
397 movaps 4(%rcx), %xmm2
/* L(Shl12Start) is also the re-entry point from L(Shl12LoopStart) when
   the 64-byte screen fires: %xmm1 and %xmm2 then already hold the previous
   and the first block of the group.  */
398L(Shl12Start):
399 pcmpeqd %xmm2, %xmm0
400 pmovmskb %xmm0, %rax
401 movaps %xmm2, %xmm3
402
403 test %rax, %rax
404 jnz L(Shl12LoopExit)
405
406 palignr $12, %xmm1, %xmm2
407 movaps %xmm2, (%rdx)
408 movaps 20(%rcx), %xmm2
409
/* The pointers advance before the test, so at L(Shl12LoopExit) the block
   that holds the terminator is always the one at 4(%rcx).  */
410 pcmpeqd %xmm2, %xmm0
411 lea 16(%rdx), %rdx
412 pmovmskb %xmm0, %rax
413 lea 16(%rcx), %rcx
414 movaps %xmm2, %xmm1
415
416 test %rax, %rax
417 jnz L(Shl12LoopExit)
418
419 palignr $12, %xmm3, %xmm2
420 movaps %xmm2, (%rdx)
421 movaps 20(%rcx), %xmm2
422
423 pcmpeqd %xmm2, %xmm0
424 lea 16(%rdx), %rdx
425 pmovmskb %xmm0, %rax
426 lea 16(%rcx), %rcx
427 movaps %xmm2, %xmm3
428
429 test %rax, %rax
430 jnz L(Shl12LoopExit)
431
432 palignr $12, %xmm1, %xmm2
433 movaps %xmm2, (%rdx)
434 movaps 20(%rcx), %xmm2
435
436 pcmpeqd %xmm2, %xmm0
437 lea 16(%rdx), %rdx
438 pmovmskb %xmm0, %rax
439 lea 16(%rcx), %rcx
440
441 test %rax, %rax
442 jnz L(Shl12LoopExit)
443
444 palignr $12, %xmm3, %xmm2
445 movaps %xmm2, (%rdx)
/* %rcx + 20 is the next aligned block not yet loaded; %rdx + 16 is the
   next destination slot.  */
446 lea 20(%rcx), %rcx
447 lea 16(%rdx), %rdx
448
/* Re-bias for the 64-byte loop: round that block address down to 64, pull
   %rdx back by the same number of bytes (%rax), then subtract 4 so that
   4(%rcx) is the 64-byte aligned block and %rcx/%rdx again name matching
   source/destination bytes.  Blocks stepped back over are copied again.  */
449 mov %rcx, %rax
450 and $-0x40, %rcx
451 sub %rcx, %rax
452 lea -4(%rcx), %rcx
453 sub %rax, %rdx
454
/* Previous block needed by the first palignr of the loop.  */
455 movaps -12(%rcx), %xmm1
456
/* L(Shl12LoopStart): 64 bytes per pass.  Four aligned blocks go to
   %xmm2..%xmm5; their byte-wise minimum (pminub) is compared against zero
   in one go.  A zero 32-bit element in any block gives a zero element in
   the minimum; the screen may also fire without a real terminator, which
   the block-by-block path at L(Shl12Start) sorts out.  */
457 .p2align 4
458L(Shl12LoopStart):
459 movaps 4(%rcx), %xmm2
460 movaps 20(%rcx), %xmm3
461 movaps %xmm3, %xmm6
462 movaps 36(%rcx), %xmm4
463 movaps %xmm4, %xmm7
464 movaps 52(%rcx), %xmm5
465 pminub %xmm2, %xmm6
466 pminub %xmm5, %xmm7
467 pminub %xmm6, %xmm7
/* Result lands in %xmm7, so %xmm0 stays zero.  */
468 pcmpeqd %xmm0, %xmm7
469 pmovmskb %xmm7, %rax
/* Keep an unshifted copy of the last block; it becomes the "previous
   block" (%xmm1) of the next pass.  The palignr placed between test and
   jnz does not change the flags.  */
470 movaps %xmm5, %xmm7
471 palignr $12, %xmm4, %xmm5
472 test %rax, %rax
473 palignr $12, %xmm3, %xmm4
474 jnz L(Shl12Start)
475 palignr $12, %xmm2, %xmm3
476 lea 64(%rcx), %rcx
477 palignr $12, %xmm1, %xmm2
478 movaps %xmm7, %xmm1
479 movaps %xmm5, 48(%rdx)
480 movaps %xmm4, 32(%rdx)
481 movaps %xmm3, 16(%rdx)
482 movaps %xmm2, (%rdx)
483 lea 64(%rdx), %rdx
484 jmp L(Shl12LoopStart)
485
/* L(Shl12LoopExit): the block at 4(%rcx) contains the terminator and %rax
   holds its mask.  Copy the single not-yet-stored element in front of it
   through %r9d, then let L(CopyFrom1To16Bytes) finish from offset 4.  */
486L(Shl12LoopExit):
487 mov (%rcx), %r9d
488 mov $4, %rsi
489 mov %r9d, (%rdx)
490 jmp L(CopyFrom1To16Bytes)
491
/* L(CopyFrom1To16Bytes): shared tail that copies the final 4, 8, 12 or 16
   bytes (terminator included) and returns.

   In:  %rsi = byte offset (may be negative) that, added to %rcx and %rdx,
	       makes them point at the 16-byte block holding the terminator
	       and at its destination;
	%rax = pmovmskb mask of that block compared against zero.  Each
	       32-bit element owns 4 consecutive bits: bits 0-3 for element
	       0, bits 4-7 for element 1 (both in %al), bits 8-11 for
	       element 2 and bits 12-15 for element 3 (both in %ah).
	       Testing bit 0 of %al or %ah is therefore enough to tell
	       whether element 0 or element 2 is zero.
   Out: %rax = %rdi, the destination pointer passed to the function.  */
492 .p2align 4
493L(CopyFrom1To16Bytes):
494 add %rsi, %rdx
495 add %rsi, %rcx
496
/* %al == 0: neither element 0 nor 1 is zero, so the terminator is element
   2 or 3.  */
497 test %al, %al
498 jz L(ExitHigh)
/* Element 0 is the terminator: copy 4 bytes.  */
499 test $0x01, %al
500 jnz L(Exit4)
501
/* Element 1 is the terminator: copy 8 bytes.  %rax doubles as the copy
   temporary because the mask is no longer needed.  */
502 mov (%rcx), %rax
503 mov %rax, (%rdx)
504 mov %rdi, %rax
505 ret
506
/* L(ExitHigh): terminator is element 2 (bit 0 of %ah set, copy 12 bytes
   via L(Exit12)) or element 3 (copy all 16 bytes here).  */
507 .p2align 4
508L(ExitHigh):
509 test $0x01, %ah
510 jnz L(Exit12)
511
512 mov (%rcx), %rax
513 mov %rax, (%rdx)
514 mov 8(%rcx), %rax
515 mov %rax, 8(%rdx)
516 mov %rdi, %rax
517 ret
518
/* Terminal copies.  L(ExitN) copies exactly N bytes from (%rcx) to (%rdx)
   using %rax/%eax as the temporary, then returns the destination pointer
   that has stayed untouched in %rdi since entry.  They are reached both
   straight from the per-element tests at the top of the function and from
   L(CopyFrom1To16Bytes)/L(ExitHigh).  */

/* Copy one 32-bit element (the terminator itself).  */
519 .p2align 4
520L(Exit4):
521 movl (%rcx), %eax
522 movl %eax, (%rdx)
523 mov %rdi, %rax
524 ret
525
/* Copy 8 bytes with a single 64-bit move.  */
526 .p2align 4
527L(Exit8):
528 mov (%rcx), %rax
529 mov %rax, (%rdx)
530 mov %rdi, %rax
531 ret
532
/* Copy 12 bytes: one 64-bit move followed by one 32-bit move.  */
533 .p2align 4
534L(Exit12):
535 mov (%rcx), %rax
536 mov %rax, (%rdx)
537 mov 8(%rcx), %eax
538 mov %eax, 8(%rdx)
539 mov %rdi, %rax
540 ret
541
/* Copy 16 bytes with two 64-bit moves.  */
542 .p2align 4
543L(Exit16):
544 mov (%rcx), %rax
545 mov %rax, (%rdx)
546 mov 8(%rcx), %rax
547 mov %rax, 8(%rdx)
548 mov %rdi, %rax
549 ret
550
551END(__wcscpy_ssse3)
552#endif
553