/* strcmp/wcscmp/strncmp/wcsncmp optimized with AVX2.
   Copyright (C) 2018-2020 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRCMP
# define STRCMP __strcmp_avx2
# endif

# define PAGE_SIZE 4096

/* VEC_SIZE = Number of bytes in a ymm register. */
# define VEC_SIZE 32

/* Shift for dividing by (VEC_SIZE * 4). */
# define DIVIDE_BY_VEC_4_SHIFT 7
# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
# error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
# endif
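/* For example, with VEC_SIZE == 32 the block size is VEC_SIZE * 4 == 128
   == 1 << 7, so `n >> DIVIDE_BY_VEC_4_SHIFT' computes n / (VEC_SIZE * 4)
   without a division.  */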

# ifdef USE_AS_WCSCMP
/* Compare packed dwords. */
# define VPCMPEQ vpcmpeqd
/* Compare packed dwords and store minimum. */
# define VPMINU vpminud
/* 1 dword char == 4 bytes. */
# define SIZE_OF_CHAR 4
# else
/* Compare packed bytes. */
# define VPCMPEQ vpcmpeqb
/* Compare packed bytes and store minimum. */
# define VPMINU vpminub
/* 1 byte char == 1 byte. */
# define SIZE_OF_CHAR 1
# endif

# ifndef VZEROUPPER
# define VZEROUPPER vzeroupper
# endif

/* Warning!
   wcscmp/wcsncmp have to use SIGNED comparison for elements.
   strcmp/strncmp have to use UNSIGNED comparison for elements.
*/
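/* A minimal C illustration of the distinction, assuming a 32-bit signed
   wchar_t as on this target (values chosen only to show the sign
   behaviour):

     #include <wchar.h>

     // wcscmp: elements compare as signed, so (wchar_t) 0x80000000 < 1
     // and the first string compares LESS than the second.
     wchar_t a[] = { (wchar_t) 0x80000000, 0 }, b[] = { 1, 0 };

     // strcmp: elements compare as unsigned char, so 0x80 > 0x01 and
     // the first string compares GREATER than the second.
     char c[] = { (char) 0x80, 0 }, d[] = { 1, 0 };  */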

/* The main idea of the string comparison (byte or dword) using AVX2
   consists of comparing (VPCMPEQ) two ymm vectors.  The vectors hold
   either packed bytes or dwords depending on USE_AS_WCSCMP.  In order
   to check the null char, the algorithm keeps the matched bytes/dwords,
   requiring two more AVX2 instructions (VPMINU and VPCMPEQ).  In
   general, the cost of comparing VEC_SIZE bytes (32 bytes) is two
   VPCMPEQ and one VPMINU instruction, together with vmovdqu and testl
   instructions.  The main loop (away from a page boundary) compares 4
   vectors at a time, effectively comparing 4 x VEC_SIZE bytes (128
   bytes) on each iteration.

   The strncmp/wcsncmp logic (enabled by defining USE_AS_STRNCMP) is the
   same as strcmp, except that a maximum offset is tracked.  If the
   maximum offset is reached before a difference is found, zero is
   returned. */
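/* A rough C sketch of the per-vector check described above, using AVX2
   intrinsics (a sketch only; the helper below and its name are
   illustrative and not part of glibc):

     #include <immintrin.h>

     // Compare one 32-byte block of s1/s2.  Returns the byte index of
     // the first mismatch or null terminator, or -1 if the whole block
     // matches and contains no null byte.
     static int
     check_one_vec (const char *s1, const char *s2)
     {
       __m256i v1 = _mm256_loadu_si256 ((const __m256i *) s1);
       __m256i v2 = _mm256_loadu_si256 ((const __m256i *) s2);
       // 0xff where the bytes are equal (VPCMPEQ).
       __m256i eq = _mm256_cmpeq_epi8 (v1, v2);
       // 0x00 where the bytes differ or s1 has a null byte (VPMINU).
       __m256i min = _mm256_min_epu8 (v1, eq);
       // 0xff exactly at those positions (VPCMPEQ against the zero
       // vector kept in %ymm7 below).
       __m256i hit = _mm256_cmpeq_epi8 (min, _mm256_setzero_si256 ());
       unsigned int mask = (unsigned int) _mm256_movemask_epi8 (hit);
       return mask == 0 ? -1 : (int) _tzcnt_u32 (mask);
     }

   The caller then forms the return value from the bytes (or dwords) at
   the reported index, as the code below does with movzbl/subl.  */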

        .section .text.avx,"ax",@progbits
ENTRY (STRCMP)
# ifdef USE_AS_STRNCMP
        /* Check for simple cases (0 or 1) in offset. */
        cmp $1, %RDX_LP
        je L(char0)
        jb L(zero)
# ifdef USE_AS_WCSCMP
        /* Convert units: from wide to byte char. */
        shl $2, %RDX_LP
# endif
        /* Register %r11 tracks the maximum offset. */
        mov %RDX_LP, %R11_LP
# endif
        movl %edi, %eax
        xorl %edx, %edx
        /* Make %ymm7 all zeros in this function. */
        vpxor %ymm7, %ymm7, %ymm7
        orl %esi, %eax
        andl $(PAGE_SIZE - 1), %eax
        cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax
        jg L(cross_page)
        /* Start comparing 4 vectors. */
        vmovdqu (%rdi), %ymm1
        VPCMPEQ (%rsi), %ymm1, %ymm0
        VPMINU %ymm1, %ymm0, %ymm0
        VPCMPEQ %ymm7, %ymm0, %ymm0
        vpmovmskb %ymm0, %ecx
        testl %ecx, %ecx
        je L(next_3_vectors)
        tzcntl %ecx, %edx
# ifdef USE_AS_STRNCMP
        /* Return 0 if the mismatched index (%rdx) is after the maximum
           offset (%r11). */
        cmpq %r11, %rdx
        jae L(zero)
# endif
# ifdef USE_AS_WCSCMP
        xorl %eax, %eax
        movl (%rdi, %rdx), %ecx
        cmpl (%rsi, %rdx), %ecx
        je L(return)
L(wcscmp_return):
        setl %al
        negl %eax
        orl $1, %eax
L(return):
# else
        movzbl (%rdi, %rdx), %eax
        movzbl (%rsi, %rdx), %edx
        subl %edx, %eax
# endif
        VZEROUPPER
        ret

        .p2align 4
L(return_vec_size):
        tzcntl %ecx, %edx
# ifdef USE_AS_STRNCMP
        /* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after
           the maximum offset (%r11). */
        addq $VEC_SIZE, %rdx
        cmpq %r11, %rdx
        jae L(zero)
# ifdef USE_AS_WCSCMP
        xorl %eax, %eax
        movl (%rdi, %rdx), %ecx
        cmpl (%rsi, %rdx), %ecx
        jne L(wcscmp_return)
# else
        movzbl (%rdi, %rdx), %eax
        movzbl (%rsi, %rdx), %edx
        subl %edx, %eax
# endif
# else
# ifdef USE_AS_WCSCMP
        xorl %eax, %eax
        movl VEC_SIZE(%rdi, %rdx), %ecx
        cmpl VEC_SIZE(%rsi, %rdx), %ecx
        jne L(wcscmp_return)
# else
        movzbl VEC_SIZE(%rdi, %rdx), %eax
        movzbl VEC_SIZE(%rsi, %rdx), %edx
        subl %edx, %eax
# endif
# endif
        VZEROUPPER
        ret

        .p2align 4
L(return_2_vec_size):
        tzcntl %ecx, %edx
# ifdef USE_AS_STRNCMP
        /* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is
           after the maximum offset (%r11). */
        addq $(VEC_SIZE * 2), %rdx
        cmpq %r11, %rdx
        jae L(zero)
# ifdef USE_AS_WCSCMP
        xorl %eax, %eax
        movl (%rdi, %rdx), %ecx
        cmpl (%rsi, %rdx), %ecx
        jne L(wcscmp_return)
# else
        movzbl (%rdi, %rdx), %eax
        movzbl (%rsi, %rdx), %edx
        subl %edx, %eax
# endif
# else
# ifdef USE_AS_WCSCMP
        xorl %eax, %eax
        movl (VEC_SIZE * 2)(%rdi, %rdx), %ecx
        cmpl (VEC_SIZE * 2)(%rsi, %rdx), %ecx
        jne L(wcscmp_return)
# else
        movzbl (VEC_SIZE * 2)(%rdi, %rdx), %eax
        movzbl (VEC_SIZE * 2)(%rsi, %rdx), %edx
        subl %edx, %eax
# endif
# endif
        VZEROUPPER
        ret

        .p2align 4
L(return_3_vec_size):
        tzcntl %ecx, %edx
# ifdef USE_AS_STRNCMP
        /* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is
           after the maximum offset (%r11). */
        addq $(VEC_SIZE * 3), %rdx
        cmpq %r11, %rdx
        jae L(zero)
# ifdef USE_AS_WCSCMP
        xorl %eax, %eax
        movl (%rdi, %rdx), %ecx
        cmpl (%rsi, %rdx), %ecx
        jne L(wcscmp_return)
# else
        movzbl (%rdi, %rdx), %eax
        movzbl (%rsi, %rdx), %edx
        subl %edx, %eax
# endif
# else
# ifdef USE_AS_WCSCMP
        xorl %eax, %eax
        movl (VEC_SIZE * 3)(%rdi, %rdx), %ecx
        cmpl (VEC_SIZE * 3)(%rsi, %rdx), %ecx
        jne L(wcscmp_return)
# else
        movzbl (VEC_SIZE * 3)(%rdi, %rdx), %eax
        movzbl (VEC_SIZE * 3)(%rsi, %rdx), %edx
        subl %edx, %eax
# endif
# endif
        VZEROUPPER
        ret

        .p2align 4
L(next_3_vectors):
        vmovdqu VEC_SIZE(%rdi), %ymm6
        VPCMPEQ VEC_SIZE(%rsi), %ymm6, %ymm3
        VPMINU %ymm6, %ymm3, %ymm3
        VPCMPEQ %ymm7, %ymm3, %ymm3
        vpmovmskb %ymm3, %ecx
        testl %ecx, %ecx
        jne L(return_vec_size)
        vmovdqu (VEC_SIZE * 2)(%rdi), %ymm5
        vmovdqu (VEC_SIZE * 3)(%rdi), %ymm4
        vmovdqu (VEC_SIZE * 3)(%rsi), %ymm0
        VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm5, %ymm2
        VPMINU %ymm5, %ymm2, %ymm2
        VPCMPEQ %ymm4, %ymm0, %ymm0
        VPCMPEQ %ymm7, %ymm2, %ymm2
        vpmovmskb %ymm2, %ecx
        testl %ecx, %ecx
        jne L(return_2_vec_size)
        VPMINU %ymm4, %ymm0, %ymm0
        VPCMPEQ %ymm7, %ymm0, %ymm0
        vpmovmskb %ymm0, %ecx
        testl %ecx, %ecx
        jne L(return_3_vec_size)
L(main_loop_header):
        leaq (VEC_SIZE * 4)(%rdi), %rdx
        movl $PAGE_SIZE, %ecx
        /* Align load via RAX. */
        andq $-(VEC_SIZE * 4), %rdx
        subq %rdi, %rdx
        leaq (%rdi, %rdx), %rax
# ifdef USE_AS_STRNCMP
        /* Starting from this point, the maximum offset, or simply the
           'offset', DECREASES by the same amount when base pointers are
           moved forward.  Return 0 when:
             1) On match: offset <= the matched vector index.
             2) On mismatch: offset is before the mismatched index.
         */
        subq %rdx, %r11
        jbe L(zero)
# endif
        addq %rsi, %rdx
        movq %rdx, %rsi
        andl $(PAGE_SIZE - 1), %esi
        /* Number of bytes before page crossing. */
        subq %rsi, %rcx
        /* Number of VEC_SIZE * 4 blocks before page crossing. */
        shrq $DIVIDE_BY_VEC_4_SHIFT, %rcx
        /* ESI: Number of VEC_SIZE * 4 blocks before page crossing. */
        movl %ecx, %esi
        jmp L(loop_start)
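        /* A rough C sketch of the setup above (illustrative only; the
           variable names are not part of this file):

             // Advance both pointers by the amount that makes s1 (%rdi)
             // aligned to VEC_SIZE * 4.
             uintptr_t adj = (((uintptr_t) s1 + VEC_SIZE * 4)
                              & -(uintptr_t) (VEC_SIZE * 4)) - (uintptr_t) s1;
             const char *a = s1 + adj;   // %rax
             const char *d = s2 + adj;   // %rdx
             // For strncmp the remaining maximum offset (%r11) also
             // shrinks by adj.
             // Full VEC_SIZE * 4 blocks left before d reaches its page end.
             unsigned long blocks = (PAGE_SIZE
                                     - ((uintptr_t) d & (PAGE_SIZE - 1)))
                                    >> DIVIDE_BY_VEC_4_SHIFT;  */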

        .p2align 4
L(loop):
# ifdef USE_AS_STRNCMP
        /* Base pointers are moved forward by 4 * VEC_SIZE.  Decrease
           the maximum offset (%r11) by the same amount. */
        subq $(VEC_SIZE * 4), %r11
        jbe L(zero)
# endif
        addq $(VEC_SIZE * 4), %rax
        addq $(VEC_SIZE * 4), %rdx
L(loop_start):
        testl %esi, %esi
        leal -1(%esi), %esi
        je L(loop_cross_page)
L(back_to_loop):
        /* Main loop, comparing 4 vectors at a time. */
        vmovdqa (%rax), %ymm0
        vmovdqa VEC_SIZE(%rax), %ymm3
        VPCMPEQ (%rdx), %ymm0, %ymm4
        VPCMPEQ VEC_SIZE(%rdx), %ymm3, %ymm1
        VPMINU %ymm0, %ymm4, %ymm4
        VPMINU %ymm3, %ymm1, %ymm1
        vmovdqa (VEC_SIZE * 2)(%rax), %ymm2
        VPMINU %ymm1, %ymm4, %ymm0
        vmovdqa (VEC_SIZE * 3)(%rax), %ymm3
        VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm2, %ymm5
        VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm3, %ymm6
        VPMINU %ymm2, %ymm5, %ymm5
        VPMINU %ymm3, %ymm6, %ymm6
        VPMINU %ymm5, %ymm0, %ymm0
        VPMINU %ymm6, %ymm0, %ymm0
        VPCMPEQ %ymm7, %ymm0, %ymm0

        /* Test each mask (32 bits) individually because for VEC_SIZE
           == 32 it is not possible to OR the four masks and keep all
           bits in a 64-bit integer register, unlike SSE2 strcmp where
           ORing is possible. */
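        /* (With VEC_SIZE == 32 each vpmovmskb mask is 32 bits, so the
           four masks together would need 4 * 32 == 128 bits; the SSE2
           version produces 16-bit masks, and 4 * 16 == 64 bits fits in
           one 64-bit register.)  */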
        vpmovmskb %ymm0, %ecx
        testl %ecx, %ecx
        je L(loop)
        VPCMPEQ %ymm7, %ymm4, %ymm0
        vpmovmskb %ymm0, %edi
        testl %edi, %edi
        je L(test_vec)
        tzcntl %edi, %ecx
# ifdef USE_AS_STRNCMP
        cmpq %rcx, %r11
        jbe L(zero)
# ifdef USE_AS_WCSCMP
        movq %rax, %rsi
        xorl %eax, %eax
        movl (%rsi, %rcx), %edi
        cmpl (%rdx, %rcx), %edi
        jne L(wcscmp_return)
# else
        movzbl (%rax, %rcx), %eax
        movzbl (%rdx, %rcx), %edx
        subl %edx, %eax
# endif
# else
# ifdef USE_AS_WCSCMP
        movq %rax, %rsi
        xorl %eax, %eax
        movl (%rsi, %rcx), %edi
        cmpl (%rdx, %rcx), %edi
        jne L(wcscmp_return)
# else
        movzbl (%rax, %rcx), %eax
        movzbl (%rdx, %rcx), %edx
        subl %edx, %eax
# endif
# endif
        VZEROUPPER
        ret

        .p2align 4
L(test_vec):
# ifdef USE_AS_STRNCMP
        /* The first vector matched.  Return 0 if the maximum offset
           (%r11) <= VEC_SIZE. */
        cmpq $VEC_SIZE, %r11
        jbe L(zero)
# endif
        VPCMPEQ %ymm7, %ymm1, %ymm1
        vpmovmskb %ymm1, %ecx
        testl %ecx, %ecx
        je L(test_2_vec)
        tzcntl %ecx, %edi
# ifdef USE_AS_STRNCMP
        addq $VEC_SIZE, %rdi
        cmpq %rdi, %r11
        jbe L(zero)
# ifdef USE_AS_WCSCMP
        movq %rax, %rsi
        xorl %eax, %eax
        movl (%rsi, %rdi), %ecx
        cmpl (%rdx, %rdi), %ecx
        jne L(wcscmp_return)
# else
        movzbl (%rax, %rdi), %eax
        movzbl (%rdx, %rdi), %edx
        subl %edx, %eax
# endif
# else
# ifdef USE_AS_WCSCMP
        movq %rax, %rsi
        xorl %eax, %eax
        movl VEC_SIZE(%rsi, %rdi), %ecx
        cmpl VEC_SIZE(%rdx, %rdi), %ecx
        jne L(wcscmp_return)
# else
        movzbl VEC_SIZE(%rax, %rdi), %eax
        movzbl VEC_SIZE(%rdx, %rdi), %edx
        subl %edx, %eax
# endif
# endif
        VZEROUPPER
        ret

        .p2align 4
L(test_2_vec):
# ifdef USE_AS_STRNCMP
        /* The first 2 vectors matched.  Return 0 if the maximum offset
           (%r11) <= 2 * VEC_SIZE. */
        cmpq $(VEC_SIZE * 2), %r11
        jbe L(zero)
# endif
        VPCMPEQ %ymm7, %ymm5, %ymm5
        vpmovmskb %ymm5, %ecx
        testl %ecx, %ecx
        je L(test_3_vec)
        tzcntl %ecx, %edi
# ifdef USE_AS_STRNCMP
        addq $(VEC_SIZE * 2), %rdi
        cmpq %rdi, %r11
        jbe L(zero)
# ifdef USE_AS_WCSCMP
        movq %rax, %rsi
        xorl %eax, %eax
        movl (%rsi, %rdi), %ecx
        cmpl (%rdx, %rdi), %ecx
        jne L(wcscmp_return)
# else
        movzbl (%rax, %rdi), %eax
        movzbl (%rdx, %rdi), %edx
        subl %edx, %eax
# endif
# else
# ifdef USE_AS_WCSCMP
        movq %rax, %rsi
        xorl %eax, %eax
        movl (VEC_SIZE * 2)(%rsi, %rdi), %ecx
        cmpl (VEC_SIZE * 2)(%rdx, %rdi), %ecx
        jne L(wcscmp_return)
# else
        movzbl (VEC_SIZE * 2)(%rax, %rdi), %eax
        movzbl (VEC_SIZE * 2)(%rdx, %rdi), %edx
        subl %edx, %eax
# endif
# endif
        VZEROUPPER
        ret

        .p2align 4
L(test_3_vec):
# ifdef USE_AS_STRNCMP
        /* The first 3 vectors matched.  Return 0 if the maximum offset
           (%r11) <= 3 * VEC_SIZE. */
        cmpq $(VEC_SIZE * 3), %r11
        jbe L(zero)
# endif
        VPCMPEQ %ymm7, %ymm6, %ymm6
        vpmovmskb %ymm6, %esi
        tzcntl %esi, %ecx
# ifdef USE_AS_STRNCMP
        addq $(VEC_SIZE * 3), %rcx
        cmpq %rcx, %r11
        jbe L(zero)
# ifdef USE_AS_WCSCMP
        movq %rax, %rsi
        xorl %eax, %eax
        movl (%rsi, %rcx), %esi
        cmpl (%rdx, %rcx), %esi
        jne L(wcscmp_return)
# else
        movzbl (%rax, %rcx), %eax
        movzbl (%rdx, %rcx), %edx
        subl %edx, %eax
# endif
# else
# ifdef USE_AS_WCSCMP
        movq %rax, %rsi
        xorl %eax, %eax
        movl (VEC_SIZE * 3)(%rsi, %rcx), %esi
        cmpl (VEC_SIZE * 3)(%rdx, %rcx), %esi
        jne L(wcscmp_return)
# else
        movzbl (VEC_SIZE * 3)(%rax, %rcx), %eax
        movzbl (VEC_SIZE * 3)(%rdx, %rcx), %edx
        subl %edx, %eax
# endif
# endif
        VZEROUPPER
        ret

        .p2align 4
L(loop_cross_page):
        xorl %r10d, %r10d
        movq %rdx, %rcx
        /* Align load via RDX.  We load the extra ECX bytes which should
           be ignored. */
        andl $((VEC_SIZE * 4) - 1), %ecx
        /* R10 is -RCX. */
        subq %rcx, %r10

        /* This works only if VEC_SIZE * 2 == 64. */
# if (VEC_SIZE * 2) != 64
# error (VEC_SIZE * 2) != 64
# endif

        /* Check if the first VEC_SIZE * 2 bytes should be ignored. */
        cmpl $(VEC_SIZE * 2), %ecx
        jge L(loop_cross_page_2_vec)

        vmovdqu (%rax, %r10), %ymm2
        vmovdqu VEC_SIZE(%rax, %r10), %ymm3
        VPCMPEQ (%rdx, %r10), %ymm2, %ymm0
        VPCMPEQ VEC_SIZE(%rdx, %r10), %ymm3, %ymm1
        VPMINU %ymm2, %ymm0, %ymm0
        VPMINU %ymm3, %ymm1, %ymm1
        VPCMPEQ %ymm7, %ymm0, %ymm0
        VPCMPEQ %ymm7, %ymm1, %ymm1

        vpmovmskb %ymm0, %edi
        vpmovmskb %ymm1, %esi

        salq $32, %rsi
        xorq %rsi, %rdi

        /* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */
        shrq %cl, %rdi

        testq %rdi, %rdi
        je L(loop_cross_page_2_vec)
        tzcntq %rdi, %rcx
# ifdef USE_AS_STRNCMP
        cmpq %rcx, %r11
        jbe L(zero)
# ifdef USE_AS_WCSCMP
        movq %rax, %rsi
        xorl %eax, %eax
        movl (%rsi, %rcx), %edi
        cmpl (%rdx, %rcx), %edi
        jne L(wcscmp_return)
# else
        movzbl (%rax, %rcx), %eax
        movzbl (%rdx, %rcx), %edx
        subl %edx, %eax
# endif
# else
# ifdef USE_AS_WCSCMP
        movq %rax, %rsi
        xorl %eax, %eax
        movl (%rsi, %rcx), %edi
        cmpl (%rdx, %rcx), %edi
        jne L(wcscmp_return)
# else
        movzbl (%rax, %rcx), %eax
        movzbl (%rdx, %rcx), %edx
        subl %edx, %eax
# endif
# endif
        VZEROUPPER
        ret

        .p2align 4
L(loop_cross_page_2_vec):
        /* The first VEC_SIZE * 2 bytes match or are ignored. */
        vmovdqu (VEC_SIZE * 2)(%rax, %r10), %ymm2
        vmovdqu (VEC_SIZE * 3)(%rax, %r10), %ymm3
        VPCMPEQ (VEC_SIZE * 2)(%rdx, %r10), %ymm2, %ymm5
        VPMINU %ymm2, %ymm5, %ymm5
        VPCMPEQ (VEC_SIZE * 3)(%rdx, %r10), %ymm3, %ymm6
        VPCMPEQ %ymm7, %ymm5, %ymm5
        VPMINU %ymm3, %ymm6, %ymm6
        VPCMPEQ %ymm7, %ymm6, %ymm6

        vpmovmskb %ymm5, %edi
        vpmovmskb %ymm6, %esi

        salq $32, %rsi
        xorq %rsi, %rdi

        xorl %r8d, %r8d
        /* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */
        subl $(VEC_SIZE * 2), %ecx
        jle 1f
        /* Skip ECX bytes. */
        shrq %cl, %rdi
        /* R8 has number of bytes skipped. */
        movl %ecx, %r8d
1:
        /* Before jumping back to the loop, set ESI to the number of
           VEC_SIZE * 4 blocks before page crossing. */
        movl $(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi

        testq %rdi, %rdi
        je L(back_to_loop)
        tzcntq %rdi, %rcx
        addq %r10, %rcx
        /* Adjust for number of bytes skipped. */
        addq %r8, %rcx
# ifdef USE_AS_STRNCMP
        addq $(VEC_SIZE * 2), %rcx
        subq %rcx, %r11
        jbe L(zero)
# ifdef USE_AS_WCSCMP
        movq %rax, %rsi
        xorl %eax, %eax
        movl (%rsi, %rcx), %edi
        cmpl (%rdx, %rcx), %edi
        jne L(wcscmp_return)
# else
        movzbl (%rax, %rcx), %eax
        movzbl (%rdx, %rcx), %edx
        subl %edx, %eax
# endif
# else
# ifdef USE_AS_WCSCMP
        movq %rax, %rsi
        xorl %eax, %eax
        movl (VEC_SIZE * 2)(%rsi, %rcx), %edi
        cmpl (VEC_SIZE * 2)(%rdx, %rcx), %edi
        jne L(wcscmp_return)
# else
        movzbl (VEC_SIZE * 2)(%rax, %rcx), %eax
        movzbl (VEC_SIZE * 2)(%rdx, %rcx), %edx
        subl %edx, %eax
# endif
# endif
        VZEROUPPER
        ret

        .p2align 4
L(cross_page_loop):
        /* Check one byte/dword at a time. */
# ifdef USE_AS_WCSCMP
        cmpl %ecx, %eax
# else
        subl %ecx, %eax
# endif
        jne L(different)
        addl $SIZE_OF_CHAR, %edx
        cmpl $(VEC_SIZE * 4), %edx
        je L(main_loop_header)
# ifdef USE_AS_STRNCMP
        cmpq %r11, %rdx
        jae L(zero)
# endif
# ifdef USE_AS_WCSCMP
        movl (%rdi, %rdx), %eax
        movl (%rsi, %rdx), %ecx
# else
        movzbl (%rdi, %rdx), %eax
        movzbl (%rsi, %rdx), %ecx
# endif
        /* Check null char. */
        testl %eax, %eax
        jne L(cross_page_loop)
        /* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED
           comparisons. */
        subl %ecx, %eax
# ifndef USE_AS_WCSCMP
L(different):
# endif
        VZEROUPPER
        ret

# ifdef USE_AS_WCSCMP
        .p2align 4
L(different):
        /* Use movl to avoid modifying EFLAGS. */
        movl $0, %eax
        setl %al
        negl %eax
        orl $1, %eax
        VZEROUPPER
        ret
# endif

# ifdef USE_AS_STRNCMP
        .p2align 4
L(zero):
        xorl %eax, %eax
        VZEROUPPER
        ret

        .p2align 4
L(char0):
# ifdef USE_AS_WCSCMP
        xorl %eax, %eax
        movl (%rdi), %ecx
        cmpl (%rsi), %ecx
        jne L(wcscmp_return)
# else
        movzbl (%rsi), %ecx
        movzbl (%rdi), %eax
        subl %ecx, %eax
# endif
        VZEROUPPER
        ret
# endif

        .p2align 4
L(last_vector):
        addq %rdx, %rdi
        addq %rdx, %rsi
# ifdef USE_AS_STRNCMP
        subq %rdx, %r11
# endif
        tzcntl %ecx, %edx
# ifdef USE_AS_STRNCMP
        cmpq %r11, %rdx
        jae L(zero)
# endif
# ifdef USE_AS_WCSCMP
        xorl %eax, %eax
        movl (%rdi, %rdx), %ecx
        cmpl (%rsi, %rdx), %ecx
        jne L(wcscmp_return)
# else
        movzbl (%rdi, %rdx), %eax
        movzbl (%rsi, %rdx), %edx
        subl %edx, %eax
# endif
        VZEROUPPER
        ret

        /* Comparing on a page boundary region requires special
           treatment: it must be done one vector at a time, starting
           with the wider ymm vector if possible and, if not, with xmm.
           If fetching 16 bytes (xmm) still crosses the boundary, byte
           comparison must be done.  */
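        /* A rough pseudo-C sketch of this fallback ladder (illustrative
           only; the code below keeps the running offset in %rdx and the
           position within the page in %eax):

             unsigned int pos = ((uintptr_t) s1 | (uintptr_t) s2)
                                & (PAGE_SIZE - 1);
             size_t off = 0;
             while (pos + VEC_SIZE <= PAGE_SIZE)   // a full ymm load fits
               { compare VEC_SIZE bytes at off; off += VEC_SIZE; pos += VEC_SIZE; }
             if (pos + 16 <= PAGE_SIZE)            // an xmm load fits
               { compare 16 bytes at off; off += 16; pos += 16; }
             // ... likewise for 8-byte and 4-byte loads (strcmp/strncmp
             // only), then one byte (or one dword for wcscmp) at a time.  */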
        .p2align 4
L(cross_page):
        /* Try one ymm vector at a time. */
        cmpl $(PAGE_SIZE - VEC_SIZE), %eax
        jg L(cross_page_1_vector)
L(loop_1_vector):
        vmovdqu (%rdi, %rdx), %ymm1
        VPCMPEQ (%rsi, %rdx), %ymm1, %ymm0
        VPMINU %ymm1, %ymm0, %ymm0
        VPCMPEQ %ymm7, %ymm0, %ymm0
        vpmovmskb %ymm0, %ecx
        testl %ecx, %ecx
        jne L(last_vector)

        addl $VEC_SIZE, %edx

        addl $VEC_SIZE, %eax
# ifdef USE_AS_STRNCMP
        /* Return 0 if the current offset (%rdx) >= the maximum offset
           (%r11). */
        cmpq %r11, %rdx
        jae L(zero)
# endif
        cmpl $(PAGE_SIZE - VEC_SIZE), %eax
        jle L(loop_1_vector)
L(cross_page_1_vector):
        /* Less than 32 bytes to check, try one xmm vector. */
        cmpl $(PAGE_SIZE - 16), %eax
        jg L(cross_page_1_xmm)
        vmovdqu (%rdi, %rdx), %xmm1
        VPCMPEQ (%rsi, %rdx), %xmm1, %xmm0
        VPMINU %xmm1, %xmm0, %xmm0
        VPCMPEQ %xmm7, %xmm0, %xmm0
        vpmovmskb %xmm0, %ecx
        testl %ecx, %ecx
        jne L(last_vector)

        addl $16, %edx
# ifndef USE_AS_WCSCMP
        addl $16, %eax
# endif
# ifdef USE_AS_STRNCMP
        /* Return 0 if the current offset (%rdx) >= the maximum offset
           (%r11). */
        cmpq %r11, %rdx
        jae L(zero)
# endif

L(cross_page_1_xmm):
# ifndef USE_AS_WCSCMP
        /* Less than 16 bytes to check, try 8 byte vector.  NB: No need
           for wcscmp nor wcsncmp since wide char is 4 bytes. */
        cmpl $(PAGE_SIZE - 8), %eax
        jg L(cross_page_8bytes)
        vmovq (%rdi, %rdx), %xmm1
        vmovq (%rsi, %rdx), %xmm0
        VPCMPEQ %xmm0, %xmm1, %xmm0
        VPMINU %xmm1, %xmm0, %xmm0
        VPCMPEQ %xmm7, %xmm0, %xmm0
        vpmovmskb %xmm0, %ecx
        /* Only last 8 bits are valid. */
        andl $0xff, %ecx
        testl %ecx, %ecx
        jne L(last_vector)

        addl $8, %edx
        addl $8, %eax
# ifdef USE_AS_STRNCMP
        /* Return 0 if the current offset (%rdx) >= the maximum offset
           (%r11). */
        cmpq %r11, %rdx
        jae L(zero)
# endif

L(cross_page_8bytes):
        /* Less than 8 bytes to check, try 4 byte vector. */
        cmpl $(PAGE_SIZE - 4), %eax
        jg L(cross_page_4bytes)
        vmovd (%rdi, %rdx), %xmm1
        vmovd (%rsi, %rdx), %xmm0
        VPCMPEQ %xmm0, %xmm1, %xmm0
        VPMINU %xmm1, %xmm0, %xmm0
        VPCMPEQ %xmm7, %xmm0, %xmm0
        vpmovmskb %xmm0, %ecx
        /* Only last 4 bits are valid. */
        andl $0xf, %ecx
        testl %ecx, %ecx
        jne L(last_vector)

        addl $4, %edx
# ifdef USE_AS_STRNCMP
        /* Return 0 if the current offset (%rdx) >= the maximum offset
           (%r11). */
        cmpq %r11, %rdx
        jae L(zero)
# endif

L(cross_page_4bytes):
# endif
        /* Less than 4 bytes to check, try one byte/dword at a time. */
# ifdef USE_AS_STRNCMP
        cmpq %r11, %rdx
        jae L(zero)
# endif
# ifdef USE_AS_WCSCMP
        movl (%rdi, %rdx), %eax
        movl (%rsi, %rdx), %ecx
# else
        movzbl (%rdi, %rdx), %eax
        movzbl (%rsi, %rdx), %ecx
# endif
        testl %eax, %eax
        jne L(cross_page_loop)
        subl %ecx, %eax
        VZEROUPPER
        ret
END (STRCMP)
#endif