/* strcmp with SSE4.2
   Copyright (C) 2009-2020 Free Software Foundation, Inc.
   Contributed by Intel Corporation.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
19
#include <sysdep.h>

/* Name of the symbol defined by this file.  The #ifndef lets whoever
   includes or builds this file choose a different name beforehand.  */
#ifndef STRCMP_SSE42
# define STRCMP_SSE42   __strcmp_sse42
#endif

/* Only the case-insensitive variants look into the locale object.
   NOTE(review): the LOCALE_T___LOCALES / LC_CTYPE / LOCALE_DATA_VALUES
   offsets used further down presumably come from this header -- confirm.  */
#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
# include "locale-defines.h"
#endif
29
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
   if the new counter > the old one or is 0.

   In:       %rcx, %r11 (remaining-length counter).
   Out:      %r11 = %rcx + %r11 - 16, provided that value is non-zero
             and not above the old %r11; otherwise control leaves
             through LABEL(strcmp_exitz).
   Clobbers: %r9 and the flags.  */
# define UPDATE_STRNCMP_COUNTER                         \
        /* calculate left number to compare */          \
        lea     -16(%rcx, %r11), %r9;                   \
        cmp     %r9, %r11;                              \
        jb      LABEL(strcmp_exitz);                    \
        test    %r9, %r9;                               \
        je      LABEL(strcmp_exitz);                    \
        mov     %r9, %r11
#else
/* No length limit in this build, so there is no counter to maintain.  */
# define UPDATE_STRNCMP_COUNTER
#endif
44
/* USE_AVX picks both the suffix of the .text section this code is
   placed in and the suffix GLABEL appends to a symbol name.  */
#ifdef USE_AVX
# define SECTION        avx
# define GLABEL(l)      l##_avx
#else
# define SECTION        sse4.2
# define GLABEL(l)      l##_sse42
#endif

/* Every branch target in this file is written LABEL(name) and becomes
   the ".L"-prefixed (non-exported) label .Lname.  */
#define LABEL(l) .L##l
54
/* We use 0x1a:
        _SIDD_SBYTE_OPS
        | _SIDD_CMP_EQUAL_EACH
        | _SIDD_NEGATIVE_POLARITY
        | _SIDD_LEAST_SIGNIFICANT
   on pcmpistri to find out if two 16byte data elements are the same
   and the offset of the first different byte.  There are 4 cases:

   1. Both 16byte data elements are valid and identical.
   2. Both 16byte data elements have EOS and identical.
   3. Both 16byte data elements are valid and they differ at offset X.
   4. At least one 16byte data element has EOS at offset X.  Two 16byte
      data elements must differ at or before offset X.

   Here is the table of ECX, CFlag, ZFlag and SFlag for 4 cases:

   case     ECX      CFlag   ZFlag   SFlag
    1       16         0       0       0
    2       16         0       1       1
    3        X         1       0       0
    4     0 <= X       1      0/1     0/1

   We exit from the loop for cases 2, 3 and 4 with jbe which branches
   when either CFlag or ZFlag is 1.  If CFlag == 0, we return 0 for
   case 2.  */
80
/* Put all SSE 4.2 functions together.  */
        .section .text.SECTION,"ax",@progbits
        .align  16
        .type   STRCMP_SSE42, @function
        .globl  STRCMP_SSE42
        .hidden STRCMP_SSE42
#ifdef USE_AS_STRCASECMP_L
/* Two-argument entry point: fetch the thread-local __libc_tsd_LOCALE
   value through %fs into %RDX_LP, then run off the end of the stub
   into the code that follows.  */
ENTRY (GLABEL(__strcasecmp))
        movq    __libc_tsd_LOCALE@gottpoff(%rip),%rax
        mov     %fs:(%rax),%RDX_LP

        /* XXX 5 byte should be before the function  */
        /* 5-byte NOP.  */
        .byte   0x0f,0x1f,0x44,0x00,0x00
END (GLABEL(__strcasecmp))
        /* FALLTHROUGH to strcasecmp_l.  */
#endif
#ifdef USE_AS_STRNCASECMP_L
/* Three-argument entry point: same as above, but the locale value goes
   into %RCX_LP because %rdx already carries the length.  */
ENTRY (GLABEL(__strncasecmp))
        movq    __libc_tsd_LOCALE@gottpoff(%rip),%rax
        mov     %fs:(%rax),%RCX_LP

        /* XXX 5 byte should be before the function  */
        /* 5-byte NOP.  */
        .byte   0x0f,0x1f,0x44,0x00,0x00
END (GLABEL(__strncasecmp))
        /* FALLTHROUGH to strncasecmp_l.  */
#endif
109
110
/* The code below is written once with the legacy SSE mnemonics.  With
   USE_AVX each of them is renamed to its v-prefixed form, and D(reg)
   expands to "reg, reg" so that a two-operand line such as
   "pxor %xmm0, D(%xmm0)" becomes the three-operand
   "vpxor %xmm0, %xmm0, %xmm0".  Without USE_AVX, D() changes nothing.  */
#ifdef USE_AVX
# define movdqa vmovdqa
# define movdqu vmovdqu
# define pmovmskb vpmovmskb
# define pcmpistri vpcmpistri
# define psubb vpsubb
# define pcmpeqb vpcmpeqb
# define psrldq vpsrldq
# define pslldq vpslldq
# define palignr vpalignr
# define pxor vpxor
# define D(arg) arg, arg
#else
# define D(arg) arg
#endif
126
STRCMP_SSE42:
        cfi_startproc
        _CET_ENDBR
        CALL_MCOUNT

/*
 * This implementation uses SSE to compare up to 16 bytes at a time.
 *
 * Register use from here on:
 *   %rdi, %rsi    the two strings being compared
 *   %rdx          length limit (strncmp / strncasecmp_l builds), or the
 *                 locale pointer (strcasecmp_l build)
 *   %rcx          locale pointer (strncasecmp_l build) until it is reused
 *                 as the offset of %rsi
 *   %r11          remaining byte count (length-limited builds only)
 *   %ecx, %eax    offset of %rsi / %rdi inside its 64-byte cache line
 *   %xmm4-%xmm6   constants for TOLOWER (case-insensitive builds only)
 */
#ifdef USE_AS_STRCASECMP_L
        /* We have to fall back on the C implementation for locales
           with encodings not matching ASCII for single bytes.  */
# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
        mov     LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP
# else
        mov     (%rdx), %RAX_LP
# endif
        testl   $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
        jne     __strcasecmp_l_nonascii
#endif
#ifdef USE_AS_STRNCASECMP_L
        /* We have to fall back on the C implementation for locales
           with encodings not matching ASCII for single bytes.  */
# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
        mov     LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP
# else
        mov     (%rcx), %RAX_LP
# endif
        testl   $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
        jne     __strncasecmp_l_nonascii
#endif

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
        /* A limit of 0 leaves through strcmp_exitz, a limit of 1 through
           Byte0; any other limit is kept in %r11.  */
        test    %RDX_LP, %RDX_LP
        je      LABEL(strcmp_exitz)
        cmp     $1, %RDX_LP
        je      LABEL(Byte0)
        mov     %RDX_LP, %R11_LP
#endif
        mov     %esi, %ecx
        mov     %edi, %eax
/* Use 64bit AND here to avoid long NOP padding.  */
        and     $0x3f, %rcx             /* rsi alignment in cache line */
        and     $0x3f, %rax             /* rdi alignment in cache line */
#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
        /* Constants for TOLOWER live in .rodata; .previous switches back
           to the code section.  */
        .section .rodata.cst16,"aM",@progbits,16
        .align 16
LABEL(belowupper):
        .quad   0x4040404040404040
        .quad   0x4040404040404040
LABEL(topupper):
# ifdef USE_AVX
        .quad   0x5a5a5a5a5a5a5a5a
        .quad   0x5a5a5a5a5a5a5a5a
# else
        .quad   0x5b5b5b5b5b5b5b5b
        .quad   0x5b5b5b5b5b5b5b5b
# endif
LABEL(touppermask):
        .quad   0x2020202020202020
        .quad   0x2020202020202020
        .previous
        movdqa  LABEL(belowupper)(%rip), %xmm4
# define UCLOW_reg %xmm4
        movdqa  LABEL(topupper)(%rip), %xmm5
# define UCHIGH_reg %xmm5
        movdqa  LABEL(touppermask)(%rip), %xmm6
# define LCQWORD_reg %xmm6
#endif
        cmp     $0x30, %ecx
        ja      LABEL(crosscache)/* rsi: 16-byte load will cross cache line */
        cmp     $0x30, %eax
        ja      LABEL(crosscache)/* rdi: 16-byte load will cross cache line */
        movdqu  (%rdi), %xmm1
        movdqu  (%rsi), %xmm2
#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
/* TOLOWER (reg1, reg2): OR 0x20 into every byte of reg1 and reg2 that
   is greater than 0x40 and not greater than 0x5a, leaving all other
   bytes unchanged.  Clobbers %xmm7-%xmm10.  The AVX form tests
   "not > 0x5a" with vpandn; the SSE form tests "0x5b > byte".  */
# ifdef USE_AVX
#  define TOLOWER(reg1, reg2) \
        vpcmpgtb UCLOW_reg, reg1, %xmm7;                        \
        vpcmpgtb UCHIGH_reg, reg1, %xmm8;                       \
        vpcmpgtb UCLOW_reg, reg2, %xmm9;                        \
        vpcmpgtb UCHIGH_reg, reg2, %xmm10;                      \
        vpandn  %xmm7, %xmm8, %xmm8;                            \
        vpandn  %xmm9, %xmm10, %xmm10;                          \
        vpand   LCQWORD_reg, %xmm8, %xmm8;                      \
        vpand   LCQWORD_reg, %xmm10, %xmm10;                    \
        vpor    reg1, %xmm8, reg1;                              \
        vpor    reg2, %xmm10, reg2
# else
#  define TOLOWER(reg1, reg2) \
        movdqa  reg1, %xmm7;                                    \
        movdqa  UCHIGH_reg, %xmm8;                              \
        movdqa  reg2, %xmm9;                                    \
        movdqa  UCHIGH_reg, %xmm10;                             \
        pcmpgtb UCLOW_reg, %xmm7;                               \
        pcmpgtb reg1, %xmm8;                                    \
        pcmpgtb UCLOW_reg, %xmm9;                               \
        pcmpgtb reg2, %xmm10;                                   \
        pand    %xmm8, %xmm7;                                   \
        pand    %xmm10, %xmm9;                                  \
        pand    LCQWORD_reg, %xmm7;                             \
        pand    LCQWORD_reg, %xmm9;                             \
        por     %xmm7, reg1;                                    \
        por     %xmm9, reg2
# endif
        TOLOWER (%xmm1, %xmm2)
#else
# define TOLOWER(reg1, reg2)
#endif
        /* After the psubb a byte of %xmm1 has its top bit set only where
           the two inputs are equal and not null, so the mask is 0xffff
           exactly when all 16 bytes match and none terminates.  */
        pxor    %xmm0, D(%xmm0)         /* clear %xmm0 for null char checks */
        pcmpeqb %xmm1, D(%xmm0)         /* Any null chars? */
        pcmpeqb %xmm2, D(%xmm1)         /* compare first 16 bytes for equality */
        psubb   %xmm0, D(%xmm1)         /* packed sub of comparison results*/
        pmovmskb %xmm1, %edx
        sub     $0xffff, %edx           /* if first 16 bytes are same, edx == 0xffff */
        jnz     LABEL(less16bytes)/* If not, find different value or null char */
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)/* finish comparison */
#endif
        add     $16, %rsi               /* prepare to search next 16 bytes */
        add     $16, %rdi               /* prepare to search next 16 bytes */
248
/*
 * Determine source and destination string offsets from 16-byte
 * alignment.  Use relative offset difference between the two to
 * determine which case below to use.
 *
 * Both pointers are rounded down to a 16-byte boundary.  Equal offsets
 * go to ashr_0.  Otherwise the pointers (and offsets) are swapped when
 * needed so that %ecx holds the larger offset, %r8d records the swap
 * (0xffff if swapped, 0 if not), and the 32-bit entry at index
 * 15 + %rax - %rcx of LABEL(unaligned_table) is added to the table
 * address to form the jump target.
 */
        .p2align 4
LABEL(crosscache):
        and     $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */
        and     $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */
        mov     $0xffff, %edx           /* for equivalent offset */
        xor     %r8d, %r8d
        and     $0xf, %ecx              /* offset of rsi */
        and     $0xf, %eax              /* offset of rdi */
        pxor    %xmm0, D(%xmm0)         /* clear %xmm0 for null char check */
        cmp     %eax, %ecx
        je      LABEL(ashr_0)           /* rsi and rdi relative offset same */
        ja      LABEL(bigger)
        mov     %edx, %r8d              /* r8d is offset flag for exit tail */
        xchg    %ecx, %eax
        xchg    %rsi, %rdi
LABEL(bigger):
        movdqa  (%rdi), %xmm2
        movdqa  (%rsi), %xmm1
        lea     15(%rax), %r9
        sub     %rcx, %r9               /* r9 = 15 + rax - rcx, the table index */
        lea     LABEL(unaligned_table)(%rip), %r10
        movslq  (%r10, %r9,4), %r9      /* table-relative offset of the handler */
        pcmpeqb %xmm1, D(%xmm0)         /* Any null chars? */
        lea     (%r10, %r9), %r10
        _CET_NOTRACK jmp *%r10          /* jump to corresponding case */
279
/*
 * The following cases will be handled by ashr_0
 *  rcx(offset of rsi)  rax(offset of rdi)  relative offset  corresponding case
 *        n(0~15)            n(0~15)           15(15+ n-n)         ashr_0
 *
 * Both strings start at the same offset inside their 16-byte block, so
 * no byte shifting is needed: the first (partial) block is checked with
 * the mask technique, then aligned blocks are compared with pcmpistri.
 */
        .p2align 4
LABEL(ashr_0):

        movdqa  (%rsi), %xmm1
        pcmpeqb %xmm1, D(%xmm0)         /* Any null chars? */
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
        pcmpeqb (%rdi), D(%xmm1)        /* compare 16 bytes for equality */
#else
        movdqa  (%rdi), %xmm2
        TOLOWER (%xmm1, %xmm2)
        pcmpeqb %xmm2, D(%xmm1)         /* compare 16 bytes for equality */
#endif
        psubb   %xmm0, D(%xmm1)         /* packed sub of comparison results*/
        pmovmskb %xmm1, %r9d
        shr     %cl, %edx               /* adjust 0xffff for offset */
        shr     %cl, %r9d               /* adjust for 16-byte offset */
        sub     %r9d, %edx
        /*
         * edx must be the same with r9d if in left byte (16-rcx) is equal to
         * the start from (16-rax) and no null char was seen.
         */
        jne     LABEL(less32bytes)      /* mismatch or null char */
        UPDATE_STRNCMP_COUNTER
        mov     $16, %rcx
        mov     $16, %r9

        /*
         * Now both strings are aligned at 16-byte boundary. Loop over strings
         * checking 32-bytes per iteration.
         */
        mov     %rcx, %rdx      /* only for offset of sse4 instruction loop*/
        .p2align 4
LABEL(ashr_0_use):
        movdqa  (%rdi,%rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
        pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
        movdqa  (%rsi,%rdx), %xmm1
        TOLOWER (%xmm0, %xmm1)
        pcmpistri $0x1a, %xmm1, %xmm0
#endif
        lea     16(%rdx), %rdx          /* advance the index; lea leaves the flags intact */
        jbe     LABEL(ashr_0_exit_use)  /* CF or ZF set: difference or end of string */
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif

        movdqa  (%rdi,%rdx), %xmm0
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
        pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
        movdqa  (%rsi,%rdx), %xmm1
        TOLOWER (%xmm0, %xmm1)
        pcmpistri $0x1a, %xmm1, %xmm0
#endif
        lea     16(%rdx), %rdx
        jbe     LABEL(ashr_0_exit_use)
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif
        jmp     LABEL(ashr_0_use)


        /*
         * Loop exit.  CF clear means no differing byte was reported, which
         * is handed to strcmp_exitz.  Otherwise %rcx is the index of the
         * difference inside the block; %rdx has already been advanced by
         * 16, hence the -16 when forming the byte index.  The result is
         * the difference of the two bytes (mapped through the tolower
         * table first in the case-insensitive builds).
         */
        .p2align 4
LABEL(ashr_0_exit_use):
        jnc     LABEL(strcmp_exitz)
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
        sub     %rcx, %r11
        jbe     LABEL(strcmp_exitz)     /* difference lies at or past the length limit */
#endif
        lea     -16(%rdx, %rcx), %rcx
        movzbl  (%rdi, %rcx), %eax
        movzbl  (%rsi, %rcx), %edx
#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
        leaq    _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
        movl    (%rcx,%rax,4), %eax
        movl    (%rcx,%rdx,4), %edx
#endif
        sub     %edx, %eax
        ret
367
368
369
/*
 * The following cases will be handled by ashr_1
 *  rcx(offset of rsi)  rax(offset of rdi)   relative offset      corresponding case
 *        n(15)            n -15            0(15 +(n-15) - n)         ashr_1
 *
 * State on entry, as set up at LABEL(bigger): %xmm1 / %xmm2 hold the
 * aligned 16-byte blocks at %rsi / %rdi, %xmm0 flags the null bytes of
 * %xmm1, %edx is 0xffff and %cl is the offset used to discard the mask
 * bits of the bytes in front of the string start.
 */
        .p2align 4
LABEL(ashr_1):
        pslldq  $15, D(%xmm2)           /* shift first string to align with second */
        TOLOWER (%xmm1, %xmm2)
        pcmpeqb %xmm1, D(%xmm2)         /* compare 16 bytes for equality */
        psubb   %xmm0, D(%xmm2)         /* packed sub of comparison results*/
        pmovmskb %xmm2, %r9d
        shr     %cl, %edx               /* adjust 0xffff for offset */
        shr     %cl, %r9d               /* adjust for 16-byte offset */
        sub     %r9d, %edx
        jnz     LABEL(less32bytes)      /* mismatch or null char seen */
        /* NOTE(review): %xmm3 is not read anywhere in the visible part of
           this file -- confirm before treating this load as dead.  */
        movdqa  (%rdi), %xmm3
        UPDATE_STRNCMP_COUNTER

        mov     $16, %rcx               /* index for loads*/
        mov     $1, %r9d                /* byte position left over from less32bytes case */
        /*
         * Setup %r10 value allows us to detect crossing a page boundary.
         * When %r10 goes positive we have crossed a page boundary and
         * need to do a nibble.
         */
        lea     1(%rdi), %r10
        and     $0xfff, %r10            /* offset into 4K page */
        sub     $0x1000, %r10           /* subtract 4K pagesize */
        mov     %rcx, %rdx              /* only for offset of sse4 instruction loop*/

        /* Main loop, unrolled twice; %rdx is the running byte index.  */
        .p2align 4
LABEL(loop_ashr_1_use):
        add     $16, %r10
        jg      LABEL(nibble_ashr_1_use)

LABEL(nibble_ashr_1_restart_use):
        movdqa  (%rdi, %rdx), %xmm0
        /* 16 bytes starting 1 byte into the previous aligned block.  */
        palignr $1, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
        pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
        movdqa  (%rsi,%rdx), %xmm1
        TOLOWER (%xmm0, %xmm1)
        pcmpistri $0x1a, %xmm1, %xmm0
#endif
        jbe     LABEL(exit_use)         /* CF or ZF set: difference or end of string */
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif

        add     $16, %rdx
        add     $16, %r10
        jg      LABEL(nibble_ashr_1_use)

        movdqa  (%rdi, %rdx), %xmm0
        palignr $1, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
        pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
        movdqa  (%rsi,%rdx), %xmm1
        TOLOWER (%xmm0, %xmm1)
        pcmpistri $0x1a, %xmm1, %xmm0
#endif
        jbe     LABEL(exit_use)
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif
        add     $16, %rdx
        jmp     LABEL(loop_ashr_1_use)

        /*
         * Page-boundary check.  Only the block already loaded is examined:
         * the byte consumed earlier is shifted out and pcmpistri $0x3a on
         * the register against itself is used to get in %ecx the index of
         * the first null byte.  If none of the 15 remaining bytes is null
         * (%ecx > 14) the loop resumes and loads the next block; otherwise
         * control goes to nibble_ashr_exit_use.
         */
        .p2align 4
LABEL(nibble_ashr_1_use):
        sub     $0x1000, %r10
        movdqa  -16(%rdi, %rdx), %xmm0
        psrldq  $1, D(%xmm0)
        pcmpistri $0x3a,%xmm0, %xmm0
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
        cmp     %r11, %rcx
        jae     LABEL(nibble_ashr_exit_use)
#endif
        cmp     $14, %ecx
        ja      LABEL(nibble_ashr_1_restart_use)

        jmp     LABEL(nibble_ashr_exit_use)
457
/*
 * The following cases will be handled by ashr_2
 *  rcx(offset of rsi)  rax(offset of rdi)   relative offset      corresponding case
 *        n(14~15)            n -14         1(15 +(n-14) - n)         ashr_2
 *
 * State on entry, as set up at LABEL(bigger): %xmm1 / %xmm2 hold the
 * aligned 16-byte blocks at %rsi / %rdi, %xmm0 flags the null bytes of
 * %xmm1, %edx is 0xffff and %cl is the offset used to discard the mask
 * bits of the bytes in front of the string start.
 */
        .p2align 4
LABEL(ashr_2):
        pslldq  $14, D(%xmm2)           /* line the %rdi block up with the %rsi block */
        TOLOWER (%xmm1, %xmm2)
        pcmpeqb %xmm1, D(%xmm2)
        psubb   %xmm0, D(%xmm2)
        pmovmskb %xmm2, %r9d
        shr     %cl, %edx
        shr     %cl, %r9d
        sub     %r9d, %edx
        jnz     LABEL(less32bytes)
        /* NOTE(review): %xmm3 is not read anywhere in the visible part of
           this file -- confirm before treating this load as dead.  */
        movdqa  (%rdi), %xmm3
        UPDATE_STRNCMP_COUNTER

        mov     $16, %rcx               /* index for loads */
        mov     $2, %r9d                /* byte position left over from less32bytes case */
        /*
         * Setup %r10 value allows us to detect crossing a page boundary.
         * When %r10 goes positive we have crossed a page boundary and
         * need to do a nibble.
         */
        lea     2(%rdi), %r10
        and     $0xfff, %r10            /* offset into 4K page */
        sub     $0x1000, %r10           /* subtract 4K pagesize */
        mov     %rcx, %rdx              /* only for offset of sse4 instruction loop*/

        /* Main loop, unrolled twice; %rdx is the running byte index.  */
        .p2align 4
LABEL(loop_ashr_2_use):
        add     $16, %r10
        jg      LABEL(nibble_ashr_2_use)

LABEL(nibble_ashr_2_restart_use):
        movdqa  (%rdi, %rdx), %xmm0
        /* 16 bytes starting 2 bytes into the previous aligned block.  */
        palignr $2, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
        pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
        movdqa  (%rsi,%rdx), %xmm1
        TOLOWER (%xmm0, %xmm1)
        pcmpistri $0x1a, %xmm1, %xmm0
#endif
        jbe     LABEL(exit_use)         /* CF or ZF set: difference or end of string */
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif

        add     $16, %rdx
        add     $16, %r10
        jg      LABEL(nibble_ashr_2_use)

        movdqa  (%rdi, %rdx), %xmm0
        palignr $2, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
        pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
        movdqa  (%rsi,%rdx), %xmm1
        TOLOWER (%xmm0, %xmm1)
        pcmpistri $0x1a, %xmm1, %xmm0
#endif
        jbe     LABEL(exit_use)
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif
        add     $16, %rdx
        jmp     LABEL(loop_ashr_2_use)

        /*
         * Page-boundary check.  Only the block already loaded is examined:
         * the bytes consumed earlier are shifted out and pcmpistri $0x3a on
         * the register against itself is used to get in %ecx the index of
         * the first null byte.  If none of the 14 remaining bytes is null
         * (%ecx > 13) the loop resumes and loads the next block; otherwise
         * control goes to nibble_ashr_exit_use.
         */
        .p2align 4
LABEL(nibble_ashr_2_use):
        sub     $0x1000, %r10
        movdqa  -16(%rdi, %rdx), %xmm0
        psrldq  $2, D(%xmm0)
        pcmpistri $0x3a,%xmm0, %xmm0
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
        cmp     %r11, %rcx
        jae     LABEL(nibble_ashr_exit_use)
#endif
        cmp     $13, %ecx
        ja      LABEL(nibble_ashr_2_restart_use)

        jmp     LABEL(nibble_ashr_exit_use)
545
/*
 * The following cases will be handled by ashr_3
 *  rcx(offset of rsi)  rax(offset of rdi)   relative offset      corresponding case
 *        n(13~15)            n -13         2(15 +(n-13) - n)         ashr_3
 *
 * State on entry, as set up at LABEL(bigger): %xmm1 / %xmm2 hold the
 * aligned 16-byte blocks at %rsi / %rdi, %xmm0 flags the null bytes of
 * %xmm1, %edx is 0xffff and %cl is the offset used to discard the mask
 * bits of the bytes in front of the string start.
 */
        .p2align 4
LABEL(ashr_3):
        pslldq  $13, D(%xmm2)           /* line the %rdi block up with the %rsi block */
        TOLOWER (%xmm1, %xmm2)
        pcmpeqb %xmm1, D(%xmm2)
        psubb   %xmm0, D(%xmm2)
        pmovmskb %xmm2, %r9d
        shr     %cl, %edx
        shr     %cl, %r9d
        sub     %r9d, %edx
        jnz     LABEL(less32bytes)
        /* NOTE(review): %xmm3 is not read anywhere in the visible part of
           this file -- confirm before treating this load as dead.  */
        movdqa  (%rdi), %xmm3

        UPDATE_STRNCMP_COUNTER

        mov     $16, %rcx               /* index for loads */
        mov     $3, %r9d                /* byte position left over from less32bytes case */
        /*
         * Setup %r10 value allows us to detect crossing a page boundary.
         * When %r10 goes positive we have crossed a page boundary and
         * need to do a nibble.
         */
        lea     3(%rdi), %r10
        and     $0xfff, %r10            /* offset into 4K page */
        sub     $0x1000, %r10           /* subtract 4K pagesize */
        mov     %rcx, %rdx              /* only for offset of sse4 instruction loop*/

        /* Main loop, unrolled twice; %rdx is the running byte index.  */
LABEL(loop_ashr_3_use):
        add     $16, %r10
        jg      LABEL(nibble_ashr_3_use)

LABEL(nibble_ashr_3_restart_use):
        movdqa  (%rdi, %rdx), %xmm0
        /* 16 bytes starting 3 bytes into the previous aligned block.  */
        palignr $3, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
        pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
        movdqa  (%rsi,%rdx), %xmm1
        TOLOWER (%xmm0, %xmm1)
        pcmpistri $0x1a, %xmm1, %xmm0
#endif
        jbe     LABEL(exit_use)         /* CF or ZF set: difference or end of string */
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif

        add     $16, %rdx
        add     $16, %r10
        jg      LABEL(nibble_ashr_3_use)

        movdqa  (%rdi, %rdx), %xmm0
        palignr $3, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
        pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
        movdqa  (%rsi,%rdx), %xmm1
        TOLOWER (%xmm0, %xmm1)
        pcmpistri $0x1a, %xmm1, %xmm0
#endif
        jbe     LABEL(exit_use)
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif
        add     $16, %rdx
        jmp     LABEL(loop_ashr_3_use)

        /*
         * Page-boundary check.  Only the block already loaded is examined:
         * the bytes consumed earlier are shifted out and pcmpistri $0x3a on
         * the register against itself is used to get in %ecx the index of
         * the first null byte.  If none of the 13 remaining bytes is null
         * (%ecx > 12) the loop resumes and loads the next block; otherwise
         * control goes to nibble_ashr_exit_use.
         */
        .p2align 4
LABEL(nibble_ashr_3_use):
        sub     $0x1000, %r10
        movdqa  -16(%rdi, %rdx), %xmm0
        psrldq  $3, D(%xmm0)
        pcmpistri $0x3a,%xmm0, %xmm0
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
        cmp     %r11, %rcx
        jae     LABEL(nibble_ashr_exit_use)
#endif
        cmp     $12, %ecx
        ja      LABEL(nibble_ashr_3_restart_use)

        jmp     LABEL(nibble_ashr_exit_use)
633
/*
 * The following cases will be handled by ashr_4
 *  rcx(offset of rsi)  rax(offset of rdi)   relative offset      corresponding case
 *        n(12~15)            n -12         3(15 +(n-12) - n)         ashr_4
 *
 * State on entry, as set up at LABEL(bigger): %xmm1 / %xmm2 hold the
 * aligned 16-byte blocks at %rsi / %rdi, %xmm0 flags the null bytes of
 * %xmm1, %edx is 0xffff and %cl is the offset used to discard the mask
 * bits of the bytes in front of the string start.
 */
        .p2align 4
LABEL(ashr_4):
        pslldq  $12, D(%xmm2)           /* line the %rdi block up with the %rsi block */
        TOLOWER (%xmm1, %xmm2)
        pcmpeqb %xmm1, D(%xmm2)
        psubb   %xmm0, D(%xmm2)
        pmovmskb %xmm2, %r9d
        shr     %cl, %edx
        shr     %cl, %r9d
        sub     %r9d, %edx
        jnz     LABEL(less32bytes)
        /* NOTE(review): %xmm3 is not read anywhere in the visible part of
           this file -- confirm before treating this load as dead.  */
        movdqa  (%rdi), %xmm3

        UPDATE_STRNCMP_COUNTER

        mov     $16, %rcx               /* index for loads */
        mov     $4, %r9d                /* byte position left over from less32bytes case */
        /*
         * Setup %r10 value allows us to detect crossing a page boundary.
         * When %r10 goes positive we have crossed a page boundary and
         * need to do a nibble.
         */
        lea     4(%rdi), %r10
        and     $0xfff, %r10            /* offset into 4K page */
        sub     $0x1000, %r10           /* subtract 4K pagesize */
        mov     %rcx, %rdx              /* only for offset of sse4 instruction loop*/

        /* Main loop, unrolled twice; %rdx is the running byte index.  */
        .p2align 4
LABEL(loop_ashr_4_use):
        add     $16, %r10
        jg      LABEL(nibble_ashr_4_use)

LABEL(nibble_ashr_4_restart_use):
        movdqa  (%rdi, %rdx), %xmm0
        /* 16 bytes starting 4 bytes into the previous aligned block.  */
        palignr $4, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
        pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
        movdqa  (%rsi,%rdx), %xmm1
        TOLOWER (%xmm0, %xmm1)
        pcmpistri $0x1a, %xmm1, %xmm0
#endif
        jbe     LABEL(exit_use)         /* CF or ZF set: difference or end of string */
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif

        add     $16, %rdx
        add     $16, %r10
        jg      LABEL(nibble_ashr_4_use)

        movdqa  (%rdi, %rdx), %xmm0
        palignr $4, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
        pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
        movdqa  (%rsi,%rdx), %xmm1
        TOLOWER (%xmm0, %xmm1)
        pcmpistri $0x1a, %xmm1, %xmm0
#endif
        jbe     LABEL(exit_use)
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif
        add     $16, %rdx
        jmp     LABEL(loop_ashr_4_use)

        /*
         * Page-boundary check.  Only the block already loaded is examined:
         * the bytes consumed earlier are shifted out and pcmpistri $0x3a on
         * the register against itself is used to get in %ecx the index of
         * the first null byte.  If none of the 12 remaining bytes is null
         * (%ecx > 11) the loop resumes and loads the next block; otherwise
         * control goes to nibble_ashr_exit_use.
         */
        .p2align 4
LABEL(nibble_ashr_4_use):
        sub     $0x1000, %r10
        movdqa  -16(%rdi, %rdx), %xmm0
        psrldq  $4, D(%xmm0)
        pcmpistri $0x3a,%xmm0, %xmm0
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
        cmp     %r11, %rcx
        jae     LABEL(nibble_ashr_exit_use)
#endif
        cmp     $11, %ecx
        ja      LABEL(nibble_ashr_4_restart_use)

        jmp     LABEL(nibble_ashr_exit_use)
722
/*
 * The following cases will be handled by ashr_5
 *  rcx(offset of rsi)  rax(offset of rdi)   relative offset      corresponding case
 *        n(11~15)          n - 11          4(15 +(n-11) - n)         ashr_5
 *
 * State on entry, as set up at LABEL(bigger): %xmm1 / %xmm2 hold the
 * aligned 16-byte blocks at %rsi / %rdi, %xmm0 flags the null bytes of
 * %xmm1, %edx is 0xffff and %cl is the offset used to discard the mask
 * bits of the bytes in front of the string start.
 */
        .p2align 4
LABEL(ashr_5):
        pslldq  $11, D(%xmm2)           /* line the %rdi block up with the %rsi block */
        TOLOWER (%xmm1, %xmm2)
        pcmpeqb %xmm1, D(%xmm2)
        psubb   %xmm0, D(%xmm2)
        pmovmskb %xmm2, %r9d
        shr     %cl, %edx
        shr     %cl, %r9d
        sub     %r9d, %edx
        jnz     LABEL(less32bytes)
        /* NOTE(review): %xmm3 is not read anywhere in the visible part of
           this file -- confirm before treating this load as dead.  */
        movdqa  (%rdi), %xmm3

        UPDATE_STRNCMP_COUNTER

        mov     $16, %rcx               /* index for loads */
        mov     $5, %r9d                /* byte position left over from less32bytes case */
        /*
         * Setup %r10 value allows us to detect crossing a page boundary.
         * When %r10 goes positive we have crossed a page boundary and
         * need to do a nibble.
         */
        lea     5(%rdi), %r10
        and     $0xfff, %r10            /* offset into 4K page */
        sub     $0x1000, %r10           /* subtract 4K pagesize */
        mov     %rcx, %rdx              /* only for offset of sse4 instruction loop*/

        /* Main loop, unrolled twice; %rdx is the running byte index.  */
        .p2align 4
LABEL(loop_ashr_5_use):
        add     $16, %r10
        jg      LABEL(nibble_ashr_5_use)

LABEL(nibble_ashr_5_restart_use):
        movdqa  (%rdi, %rdx), %xmm0
        /* 16 bytes starting 5 bytes into the previous aligned block.  */
        palignr $5, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
        pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
        movdqa  (%rsi,%rdx), %xmm1
        TOLOWER (%xmm0, %xmm1)
        pcmpistri $0x1a, %xmm1, %xmm0
#endif
        jbe     LABEL(exit_use)         /* CF or ZF set: difference or end of string */
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif

        add     $16, %rdx
        add     $16, %r10
        jg      LABEL(nibble_ashr_5_use)

        movdqa  (%rdi, %rdx), %xmm0

        palignr $5, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
        pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
        movdqa  (%rsi,%rdx), %xmm1
        TOLOWER (%xmm0, %xmm1)
        pcmpistri $0x1a, %xmm1, %xmm0
#endif
        jbe     LABEL(exit_use)
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif
        add     $16, %rdx
        jmp     LABEL(loop_ashr_5_use)

        /*
         * Page-boundary check.  Only the block already loaded is examined:
         * the bytes consumed earlier are shifted out and pcmpistri $0x3a on
         * the register against itself is used to get in %ecx the index of
         * the first null byte.  If none of the 11 remaining bytes is null
         * (%ecx > 10) the loop resumes and loads the next block; otherwise
         * control goes to nibble_ashr_exit_use.
         */
        .p2align 4
LABEL(nibble_ashr_5_use):
        sub     $0x1000, %r10
        movdqa  -16(%rdi, %rdx), %xmm0
        psrldq  $5, D(%xmm0)
        pcmpistri $0x3a,%xmm0, %xmm0
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
        cmp     %r11, %rcx
        jae     LABEL(nibble_ashr_exit_use)
#endif
        cmp     $10, %ecx
        ja      LABEL(nibble_ashr_5_restart_use)

        jmp     LABEL(nibble_ashr_exit_use)
812
/*
 * The following cases will be handled by ashr_6
 *  rcx(offset of rsi)  rax(offset of rdi)   relative offset      corresponding case
 *        n(10~15)          n - 10          5(15 +(n-10) - n)         ashr_6
 *
 * State on entry, as set up at LABEL(bigger): %xmm1 / %xmm2 hold the
 * aligned 16-byte blocks at %rsi / %rdi, %xmm0 flags the null bytes of
 * %xmm1, %edx is 0xffff and %cl is the offset used to discard the mask
 * bits of the bytes in front of the string start.
 */
        .p2align 4
LABEL(ashr_6):
        pslldq  $10, D(%xmm2)           /* line the %rdi block up with the %rsi block */
        TOLOWER (%xmm1, %xmm2)
        pcmpeqb %xmm1, D(%xmm2)
        psubb   %xmm0, D(%xmm2)
        pmovmskb %xmm2, %r9d
        shr     %cl, %edx
        shr     %cl, %r9d
        sub     %r9d, %edx
        jnz     LABEL(less32bytes)
        /* NOTE(review): %xmm3 is not read anywhere in the visible part of
           this file -- confirm before treating this load as dead.  */
        movdqa  (%rdi), %xmm3

        UPDATE_STRNCMP_COUNTER

        mov     $16, %rcx               /* index for loads */
        mov     $6, %r9d                /* byte position left over from less32bytes case */
        /*
         * Setup %r10 value allows us to detect crossing a page boundary.
         * When %r10 goes positive we have crossed a page boundary and
         * need to do a nibble.
         */
        lea     6(%rdi), %r10
        and     $0xfff, %r10            /* offset into 4K page */
        sub     $0x1000, %r10           /* subtract 4K pagesize */
        mov     %rcx, %rdx              /* only for offset of sse4 instruction loop*/

        /* Main loop, unrolled twice; %rdx is the running byte index.  */
        .p2align 4
LABEL(loop_ashr_6_use):
        add     $16, %r10
        jg      LABEL(nibble_ashr_6_use)

LABEL(nibble_ashr_6_restart_use):
        movdqa  (%rdi, %rdx), %xmm0
        /* 16 bytes starting 6 bytes into the previous aligned block.  */
        palignr $6, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
        pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
        movdqa  (%rsi,%rdx), %xmm1
        TOLOWER (%xmm0, %xmm1)
        pcmpistri $0x1a, %xmm1, %xmm0
#endif
        jbe     LABEL(exit_use)         /* CF or ZF set: difference or end of string */
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif

        add     $16, %rdx
        add     $16, %r10
        jg      LABEL(nibble_ashr_6_use)

        movdqa  (%rdi, %rdx), %xmm0
        palignr $6, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
        pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
        movdqa  (%rsi,%rdx), %xmm1
        TOLOWER (%xmm0, %xmm1)
        pcmpistri $0x1a, %xmm1, %xmm0
#endif
        jbe     LABEL(exit_use)
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif
        add     $16, %rdx
        jmp     LABEL(loop_ashr_6_use)

        /*
         * Page-boundary check.  Only the block already loaded is examined:
         * the bytes consumed earlier are shifted out and pcmpistri $0x3a on
         * the register against itself is used to get in %ecx the index of
         * the first null byte.  If none of the 10 remaining bytes is null
         * (%ecx > 9) the loop resumes and loads the next block; otherwise
         * control goes to nibble_ashr_exit_use.
         */
        .p2align 4
LABEL(nibble_ashr_6_use):
        sub     $0x1000, %r10
        movdqa  -16(%rdi, %rdx), %xmm0
        psrldq  $6, D(%xmm0)
        pcmpistri $0x3a,%xmm0, %xmm0
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
        cmp     %r11, %rcx
        jae     LABEL(nibble_ashr_exit_use)
#endif
        cmp     $9, %ecx
        ja      LABEL(nibble_ashr_6_restart_use)

        jmp     LABEL(nibble_ashr_exit_use)
901
/*
 * The following cases will be handled by ashr_7
 *  rcx(offset of rsi)  rax(offset of rdi)   relative offset      corresponding case
 *        n(9~15)          n - 9            6(15 +(n - 9) - n)        ashr_7
 *
 * State on entry, as set up at LABEL(bigger): %xmm1 / %xmm2 hold the
 * aligned 16-byte blocks at %rsi / %rdi, %xmm0 flags the null bytes of
 * %xmm1, %edx is 0xffff and %cl is the offset used to discard the mask
 * bits of the bytes in front of the string start.
 */
        .p2align 4
LABEL(ashr_7):
        pslldq  $9, D(%xmm2)            /* line the %rdi block up with the %rsi block */
        TOLOWER (%xmm1, %xmm2)
        pcmpeqb %xmm1, D(%xmm2)
        psubb   %xmm0, D(%xmm2)
        pmovmskb %xmm2, %r9d
        shr     %cl, %edx
        shr     %cl, %r9d
        sub     %r9d, %edx
        jnz     LABEL(less32bytes)
        /* NOTE(review): %xmm3 is not read anywhere in the visible part of
           this file -- confirm before treating this load as dead.  */
        movdqa  (%rdi), %xmm3

        UPDATE_STRNCMP_COUNTER

        mov     $16, %rcx               /* index for loads */
        mov     $7, %r9d                /* byte position left over from less32bytes case */
        /*
         * Setup %r10 value allows us to detect crossing a page boundary.
         * When %r10 goes positive we have crossed a page boundary and
         * need to do a nibble.
         */
        lea     7(%rdi), %r10
        and     $0xfff, %r10            /* offset into 4K page */
        sub     $0x1000, %r10           /* subtract 4K pagesize */
        mov     %rcx, %rdx              /* only for offset of sse4 instruction loop*/

        /* Main loop, unrolled twice; %rdx is the running byte index.  */
        .p2align 4
LABEL(loop_ashr_7_use):
        add     $16, %r10
        jg      LABEL(nibble_ashr_7_use)

LABEL(nibble_ashr_7_restart_use):
        movdqa  (%rdi, %rdx), %xmm0
        /* 16 bytes starting 7 bytes into the previous aligned block.  */
        palignr $7, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
        pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
        movdqa  (%rsi,%rdx), %xmm1
        TOLOWER (%xmm0, %xmm1)
        pcmpistri $0x1a, %xmm1, %xmm0
#endif
        jbe     LABEL(exit_use)         /* CF or ZF set: difference or end of string */
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif

        add     $16, %rdx
        add     $16, %r10
        jg      LABEL(nibble_ashr_7_use)

        movdqa  (%rdi, %rdx), %xmm0
        palignr $7, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
        pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
        movdqa  (%rsi,%rdx), %xmm1
        TOLOWER (%xmm0, %xmm1)
        pcmpistri $0x1a, %xmm1, %xmm0
#endif
        jbe     LABEL(exit_use)
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif
        add     $16, %rdx
        jmp     LABEL(loop_ashr_7_use)

        /*
         * Page-boundary check.  Only the block already loaded is examined:
         * the bytes consumed earlier are shifted out and pcmpistri $0x3a on
         * the register against itself is used to get in %ecx the index of
         * the first null byte.  If none of the 9 remaining bytes is null
         * (%ecx > 8) the loop resumes and loads the next block; otherwise
         * control goes to nibble_ashr_exit_use.
         */
        .p2align 4
LABEL(nibble_ashr_7_use):
        sub     $0x1000, %r10
        movdqa  -16(%rdi, %rdx), %xmm0
        psrldq  $7, D(%xmm0)
        pcmpistri $0x3a,%xmm0, %xmm0
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
        cmp     %r11, %rcx
        jae     LABEL(nibble_ashr_exit_use)
#endif
        cmp     $8, %ecx
        ja      LABEL(nibble_ashr_7_restart_use)

        jmp     LABEL(nibble_ashr_exit_use)
990
/*
 * The following cases will be handled by ashr_8
 *  rcx(offset of rsi)  rax(offset of rdi)   relative offset      corresponding case
 *        n(8~15)          n - 8            7(15 +(n - 8) - n)        ashr_8
 *
 * State on entry, as set up at LABEL(bigger): %xmm1 / %xmm2 hold the
 * aligned 16-byte blocks at %rsi / %rdi, %xmm0 flags the null bytes of
 * %xmm1, %edx is 0xffff and %cl is the offset used to discard the mask
 * bits of the bytes in front of the string start.
 */
        .p2align 4
LABEL(ashr_8):
        pslldq  $8, D(%xmm2)            /* line the %rdi block up with the %rsi block */
        TOLOWER (%xmm1, %xmm2)
        pcmpeqb %xmm1, D(%xmm2)
        psubb   %xmm0, D(%xmm2)
        pmovmskb %xmm2, %r9d
        shr     %cl, %edx
        shr     %cl, %r9d
        sub     %r9d, %edx
        jnz     LABEL(less32bytes)
        /* NOTE(review): %xmm3 is not read anywhere in the visible part of
           this file -- confirm before treating this load as dead.  */
        movdqa  (%rdi), %xmm3

        UPDATE_STRNCMP_COUNTER

        mov     $16, %rcx               /* index for loads */
        mov     $8, %r9d                /* byte position left over from less32bytes case */
        /*
         * Setup %r10 value allows us to detect crossing a page boundary.
         * When %r10 goes positive we have crossed a page boundary and
         * need to do a nibble.
         */
        lea     8(%rdi), %r10
        and     $0xfff, %r10            /* offset into 4K page */
        sub     $0x1000, %r10           /* subtract 4K pagesize */
        mov     %rcx, %rdx              /* only for offset of sse4 instruction loop*/

        /* Main loop, unrolled twice; %rdx is the running byte index.  */
        .p2align 4
LABEL(loop_ashr_8_use):
        add     $16, %r10
        jg      LABEL(nibble_ashr_8_use)

LABEL(nibble_ashr_8_restart_use):
        movdqa  (%rdi, %rdx), %xmm0
        /* 16 bytes starting 8 bytes into the previous aligned block.  */
        palignr $8, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
        pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
        movdqa  (%rsi,%rdx), %xmm1
        TOLOWER (%xmm0, %xmm1)
        pcmpistri $0x1a, %xmm1, %xmm0
#endif
        jbe     LABEL(exit_use)         /* CF or ZF set: difference or end of string */
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif

        add     $16, %rdx
        add     $16, %r10
        jg      LABEL(nibble_ashr_8_use)

        movdqa  (%rdi, %rdx), %xmm0
        palignr $8, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
        pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
        movdqa  (%rsi,%rdx), %xmm1
        TOLOWER (%xmm0, %xmm1)
        pcmpistri $0x1a, %xmm1, %xmm0
#endif
        jbe     LABEL(exit_use)
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif
        add     $16, %rdx
        jmp     LABEL(loop_ashr_8_use)

        /*
         * Page-boundary check.  Only the block already loaded is examined:
         * the bytes consumed earlier are shifted out and pcmpistri $0x3a on
         * the register against itself is used to get in %ecx the index of
         * the first null byte.  If none of the 8 remaining bytes is null
         * (%ecx > 7) the loop resumes and loads the next block; otherwise
         * control goes to nibble_ashr_exit_use.
         */
        .p2align 4
LABEL(nibble_ashr_8_use):
        sub     $0x1000, %r10
        movdqa  -16(%rdi, %rdx), %xmm0
        psrldq  $8, D(%xmm0)
        pcmpistri $0x3a,%xmm0, %xmm0
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
        cmp     %r11, %rcx
        jae     LABEL(nibble_ashr_exit_use)
#endif
        cmp     $7, %ecx
        ja      LABEL(nibble_ashr_8_restart_use)

        jmp     LABEL(nibble_ashr_exit_use)
1079
/*
 * The following cases will be handled by ashr_9
 *  rcx(offset of rsi)  rax(offset of rdi)   relative offset      corresponding case
 *        n(7~15)          n - 7            8(15 +(n - 7) - n)        ashr_9
 *
 * State on entry, as set up at LABEL(bigger): %xmm1 / %xmm2 hold the
 * aligned 16-byte blocks at %rsi / %rdi, %xmm0 flags the null bytes of
 * %xmm1, %edx is 0xffff and %cl is the offset used to discard the mask
 * bits of the bytes in front of the string start.
 */
        .p2align 4
LABEL(ashr_9):
        pslldq  $7, D(%xmm2)            /* line the %rdi block up with the %rsi block */
        TOLOWER (%xmm1, %xmm2)
        pcmpeqb %xmm1, D(%xmm2)
        psubb   %xmm0, D(%xmm2)
        pmovmskb %xmm2, %r9d
        shr     %cl, %edx
        shr     %cl, %r9d
        sub     %r9d, %edx
        jnz     LABEL(less32bytes)
        /* NOTE(review): %xmm3 is not read anywhere in the visible part of
           this file -- confirm before treating this load as dead.  */
        movdqa  (%rdi), %xmm3

        UPDATE_STRNCMP_COUNTER

        mov     $16, %rcx               /* index for loads */
        mov     $9, %r9d                /* byte position left over from less32bytes case */
        /*
         * Setup %r10 value allows us to detect crossing a page boundary.
         * When %r10 goes positive we have crossed a page boundary and
         * need to do a nibble.
         */
        lea     9(%rdi), %r10
        and     $0xfff, %r10            /* offset into 4K page */
        sub     $0x1000, %r10           /* subtract 4K pagesize */
        mov     %rcx, %rdx              /* only for offset of sse4 instruction loop*/

        /* Main loop, unrolled twice; %rdx is the running byte index.  */
        .p2align 4
LABEL(loop_ashr_9_use):
        add     $16, %r10
        jg      LABEL(nibble_ashr_9_use)

LABEL(nibble_ashr_9_restart_use):
        movdqa  (%rdi, %rdx), %xmm0

        /* 16 bytes starting 9 bytes into the previous aligned block.  */
        palignr $9, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
        pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
        movdqa  (%rsi,%rdx), %xmm1
        TOLOWER (%xmm0, %xmm1)
        pcmpistri $0x1a, %xmm1, %xmm0
#endif
        jbe     LABEL(exit_use)         /* CF or ZF set: difference or end of string */
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif

        add     $16, %rdx
        add     $16, %r10
        jg      LABEL(nibble_ashr_9_use)

        movdqa  (%rdi, %rdx), %xmm0
        palignr $9, -16(%rdi, %rdx), D(%xmm0)
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
        pcmpistri $0x1a, (%rsi,%rdx), %xmm0
#else
        movdqa  (%rsi,%rdx), %xmm1
        TOLOWER (%xmm0, %xmm1)
        pcmpistri $0x1a, %xmm1, %xmm0
#endif
        jbe     LABEL(exit_use)
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
        sub     $16, %r11
        jbe     LABEL(strcmp_exitz)
#endif
        add     $16, %rdx
        jmp     LABEL(loop_ashr_9_use)
1154
1155 .p2align 4
1156LABEL(nibble_ashr_9_use):
1157 sub $0x1000, %r10
1158 movdqa -16(%rdi, %rdx), %xmm0
1159 psrldq $9, D(%xmm0)
1160 pcmpistri $0x3a,%xmm0, %xmm0
1161#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1162 cmp %r11, %rcx
1163 jae LABEL(nibble_ashr_exit_use)
1164#endif
1165 cmp $6, %ecx
1166 ja LABEL(nibble_ashr_9_restart_use)
1167
1168 jmp LABEL(nibble_ashr_exit_use)
1169
1170/*
1171 * The following cases will be handled by ashr_10
1172 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1173 * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10
1174 */
1175 .p2align 4
1176LABEL(ashr_10):
1177 pslldq $6, D(%xmm2)
1178 TOLOWER (%xmm1, %xmm2)
1179 pcmpeqb %xmm1, D(%xmm2)
1180 psubb %xmm0, D(%xmm2)
1181 pmovmskb %xmm2, %r9d
1182 shr %cl, %edx
1183 shr %cl, %r9d
1184 sub %r9d, %edx
1185 jnz LABEL(less32bytes)
1186 movdqa (%rdi), %xmm3
1187
1188 UPDATE_STRNCMP_COUNTER
1189
1190 mov $16, %rcx /* index for loads */
1191 mov $10, %r9d /* byte position left over from less32bytes case */
1192 /*
1193 * Setup %r10 value allows us to detect crossing a page boundary.
1194 * When %r10 goes positive we have crossed a page boundary and
1195 * need to do a nibble.
1196 */
1197 lea 10(%rdi), %r10
1198 and $0xfff, %r10 /* offset into 4K page */
1199 sub $0x1000, %r10 /* subtract 4K pagesize */
1200 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1201
1202 .p2align 4
1203LABEL(loop_ashr_10_use):
1204 add $16, %r10
1205 jg LABEL(nibble_ashr_10_use)
1206
1207LABEL(nibble_ashr_10_restart_use):
1208 movdqa (%rdi, %rdx), %xmm0
1209 palignr $10, -16(%rdi, %rdx), D(%xmm0)
1210#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1211 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1212#else
1213 movdqa (%rsi,%rdx), %xmm1
1214 TOLOWER (%xmm0, %xmm1)
1215 pcmpistri $0x1a, %xmm1, %xmm0
1216#endif
1217 jbe LABEL(exit_use)
1218#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1219 sub $16, %r11
1220 jbe LABEL(strcmp_exitz)
1221#endif
1222
1223 add $16, %rdx
1224 add $16, %r10
1225 jg LABEL(nibble_ashr_10_use)
1226
1227 movdqa (%rdi, %rdx), %xmm0
1228 palignr $10, -16(%rdi, %rdx), D(%xmm0)
1229#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1230 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1231#else
1232 movdqa (%rsi,%rdx), %xmm1
1233 TOLOWER (%xmm0, %xmm1)
1234 pcmpistri $0x1a, %xmm1, %xmm0
1235#endif
1236 jbe LABEL(exit_use)
1237#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1238 sub $16, %r11
1239 jbe LABEL(strcmp_exitz)
1240#endif
1241 add $16, %rdx
1242 jmp LABEL(loop_ashr_10_use)
1243
1244 .p2align 4
1245LABEL(nibble_ashr_10_use):
1246 sub $0x1000, %r10
1247 movdqa -16(%rdi, %rdx), %xmm0
1248 psrldq $10, D(%xmm0)
1249 pcmpistri $0x3a,%xmm0, %xmm0
1250#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1251 cmp %r11, %rcx
1252 jae LABEL(nibble_ashr_exit_use)
1253#endif
1254 cmp $5, %ecx
1255 ja LABEL(nibble_ashr_10_restart_use)
1256
1257 jmp LABEL(nibble_ashr_exit_use)
1258
1259/*
1260 * The following cases will be handled by ashr_11
1261 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1262 * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11
1263 */
1264 .p2align 4
1265LABEL(ashr_11):
1266 pslldq $5, D(%xmm2)
1267 TOLOWER (%xmm1, %xmm2)
1268 pcmpeqb %xmm1, D(%xmm2)
1269 psubb %xmm0, D(%xmm2)
1270 pmovmskb %xmm2, %r9d
1271 shr %cl, %edx
1272 shr %cl, %r9d
1273 sub %r9d, %edx
1274 jnz LABEL(less32bytes)
1275 movdqa (%rdi), %xmm3
1276
1277 UPDATE_STRNCMP_COUNTER
1278
1279 mov $16, %rcx /* index for loads */
1280 mov $11, %r9d /* byte position left over from less32bytes case */
1281 /*
1282 * Setup %r10 value allows us to detect crossing a page boundary.
1283 * When %r10 goes positive we have crossed a page boundary and
1284 * need to do a nibble.
1285 */
1286 lea 11(%rdi), %r10
1287 and $0xfff, %r10 /* offset into 4K page */
1288 sub $0x1000, %r10 /* subtract 4K pagesize */
1289 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1290
1291 .p2align 4
1292LABEL(loop_ashr_11_use):
1293 add $16, %r10
1294 jg LABEL(nibble_ashr_11_use)
1295
1296LABEL(nibble_ashr_11_restart_use):
1297 movdqa (%rdi, %rdx), %xmm0
1298 palignr $11, -16(%rdi, %rdx), D(%xmm0)
1299#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1300 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1301#else
1302 movdqa (%rsi,%rdx), %xmm1
1303 TOLOWER (%xmm0, %xmm1)
1304 pcmpistri $0x1a, %xmm1, %xmm0
1305#endif
1306 jbe LABEL(exit_use)
1307#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1308 sub $16, %r11
1309 jbe LABEL(strcmp_exitz)
1310#endif
1311
1312 add $16, %rdx
1313 add $16, %r10
1314 jg LABEL(nibble_ashr_11_use)
1315
1316 movdqa (%rdi, %rdx), %xmm0
1317 palignr $11, -16(%rdi, %rdx), D(%xmm0)
1318#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1319 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1320#else
1321 movdqa (%rsi,%rdx), %xmm1
1322 TOLOWER (%xmm0, %xmm1)
1323 pcmpistri $0x1a, %xmm1, %xmm0
1324#endif
1325 jbe LABEL(exit_use)
1326#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1327 sub $16, %r11
1328 jbe LABEL(strcmp_exitz)
1329#endif
1330 add $16, %rdx
1331 jmp LABEL(loop_ashr_11_use)
1332
1333 .p2align 4
1334LABEL(nibble_ashr_11_use):
1335 sub $0x1000, %r10
1336 movdqa -16(%rdi, %rdx), %xmm0
1337 psrldq $11, D(%xmm0)
1338 pcmpistri $0x3a,%xmm0, %xmm0
1339#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1340 cmp %r11, %rcx
1341 jae LABEL(nibble_ashr_exit_use)
1342#endif
1343 cmp $4, %ecx
1344 ja LABEL(nibble_ashr_11_restart_use)
1345
1346 jmp LABEL(nibble_ashr_exit_use)
1347
1348/*
1349 * The following cases will be handled by ashr_12
1350 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1351 * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12
1352 */
1353 .p2align 4
1354LABEL(ashr_12):
1355 pslldq $4, D(%xmm2)
1356 TOLOWER (%xmm1, %xmm2)
1357 pcmpeqb %xmm1, D(%xmm2)
1358 psubb %xmm0, D(%xmm2)
1359 pmovmskb %xmm2, %r9d
1360 shr %cl, %edx
1361 shr %cl, %r9d
1362 sub %r9d, %edx
1363 jnz LABEL(less32bytes)
1364 movdqa (%rdi), %xmm3
1365
1366 UPDATE_STRNCMP_COUNTER
1367
1368 mov $16, %rcx /* index for loads */
1369 mov $12, %r9d /* byte position left over from less32bytes case */
1370 /*
1371 * Setup %r10 value allows us to detect crossing a page boundary.
1372 * When %r10 goes positive we have crossed a page boundary and
1373 * need to do a nibble.
1374 */
1375 lea 12(%rdi), %r10
1376 and $0xfff, %r10 /* offset into 4K page */
1377 sub $0x1000, %r10 /* subtract 4K pagesize */
1378 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1379
1380 .p2align 4
1381LABEL(loop_ashr_12_use):
1382 add $16, %r10
1383 jg LABEL(nibble_ashr_12_use)
1384
1385LABEL(nibble_ashr_12_restart_use):
1386 movdqa (%rdi, %rdx), %xmm0
1387 palignr $12, -16(%rdi, %rdx), D(%xmm0)
1388#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1389 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1390#else
1391 movdqa (%rsi,%rdx), %xmm1
1392 TOLOWER (%xmm0, %xmm1)
1393 pcmpistri $0x1a, %xmm1, %xmm0
1394#endif
1395 jbe LABEL(exit_use)
1396#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1397 sub $16, %r11
1398 jbe LABEL(strcmp_exitz)
1399#endif
1400
1401 add $16, %rdx
1402 add $16, %r10
1403 jg LABEL(nibble_ashr_12_use)
1404
1405 movdqa (%rdi, %rdx), %xmm0
1406 palignr $12, -16(%rdi, %rdx), D(%xmm0)
1407#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1408 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1409#else
1410 movdqa (%rsi,%rdx), %xmm1
1411 TOLOWER (%xmm0, %xmm1)
1412 pcmpistri $0x1a, %xmm1, %xmm0
1413#endif
1414 jbe LABEL(exit_use)
1415#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1416 sub $16, %r11
1417 jbe LABEL(strcmp_exitz)
1418#endif
1419 add $16, %rdx
1420 jmp LABEL(loop_ashr_12_use)
1421
1422 .p2align 4
1423LABEL(nibble_ashr_12_use):
1424 sub $0x1000, %r10
1425 movdqa -16(%rdi, %rdx), %xmm0
1426 psrldq $12, D(%xmm0)
1427 pcmpistri $0x3a,%xmm0, %xmm0
1428#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1429 cmp %r11, %rcx
1430 jae LABEL(nibble_ashr_exit_use)
1431#endif
1432 cmp $3, %ecx
1433 ja LABEL(nibble_ashr_12_restart_use)
1434
1435 jmp LABEL(nibble_ashr_exit_use)
1436
1437/*
1438 * The following cases will be handled by ashr_13
1439 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1440 * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13
1441 */
1442 .p2align 4
1443LABEL(ashr_13):
1444 pslldq $3, D(%xmm2)
1445 TOLOWER (%xmm1, %xmm2)
1446 pcmpeqb %xmm1, D(%xmm2)
1447 psubb %xmm0, D(%xmm2)
1448 pmovmskb %xmm2, %r9d
1449 shr %cl, %edx
1450 shr %cl, %r9d
1451 sub %r9d, %edx
1452 jnz LABEL(less32bytes)
1453 movdqa (%rdi), %xmm3
1454
1455 UPDATE_STRNCMP_COUNTER
1456
1457 mov $16, %rcx /* index for loads */
1458 mov $13, %r9d /* byte position left over from less32bytes case */
1459 /*
1460 * Setup %r10 value allows us to detect crossing a page boundary.
1461 * When %r10 goes positive we have crossed a page boundary and
1462 * need to do a nibble.
1463 */
1464 lea 13(%rdi), %r10
1465 and $0xfff, %r10 /* offset into 4K page */
1466 sub $0x1000, %r10 /* subtract 4K pagesize */
1467
1468 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1469
1470 .p2align 4
1471LABEL(loop_ashr_13_use):
1472 add $16, %r10
1473 jg LABEL(nibble_ashr_13_use)
1474
1475LABEL(nibble_ashr_13_restart_use):
1476 movdqa (%rdi, %rdx), %xmm0
1477 palignr $13, -16(%rdi, %rdx), D(%xmm0)
1478#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1479 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1480#else
1481 movdqa (%rsi,%rdx), %xmm1
1482 TOLOWER (%xmm0, %xmm1)
1483 pcmpistri $0x1a, %xmm1, %xmm0
1484#endif
1485 jbe LABEL(exit_use)
1486#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1487 sub $16, %r11
1488 jbe LABEL(strcmp_exitz)
1489#endif
1490
1491 add $16, %rdx
1492 add $16, %r10
1493 jg LABEL(nibble_ashr_13_use)
1494
1495 movdqa (%rdi, %rdx), %xmm0
1496 palignr $13, -16(%rdi, %rdx), D(%xmm0)
1497#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1498 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1499#else
1500 movdqa (%rsi,%rdx), %xmm1
1501 TOLOWER (%xmm0, %xmm1)
1502 pcmpistri $0x1a, %xmm1, %xmm0
1503#endif
1504 jbe LABEL(exit_use)
1505#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1506 sub $16, %r11
1507 jbe LABEL(strcmp_exitz)
1508#endif
1509 add $16, %rdx
1510 jmp LABEL(loop_ashr_13_use)
1511
1512 .p2align 4
1513LABEL(nibble_ashr_13_use):
1514 sub $0x1000, %r10
1515 movdqa -16(%rdi, %rdx), %xmm0
1516 psrldq $13, D(%xmm0)
1517 pcmpistri $0x3a,%xmm0, %xmm0
1518#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1519 cmp %r11, %rcx
1520 jae LABEL(nibble_ashr_exit_use)
1521#endif
1522 cmp $2, %ecx
1523 ja LABEL(nibble_ashr_13_restart_use)
1524
1525 jmp LABEL(nibble_ashr_exit_use)
1526
1527/*
1528 * The following cases will be handled by ashr_14
1529 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1530 * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14
1531 */
1532 .p2align 4
1533LABEL(ashr_14):
1534 pslldq $2, D(%xmm2)
1535 TOLOWER (%xmm1, %xmm2)
1536 pcmpeqb %xmm1, D(%xmm2)
1537 psubb %xmm0, D(%xmm2)
1538 pmovmskb %xmm2, %r9d
1539 shr %cl, %edx
1540 shr %cl, %r9d
1541 sub %r9d, %edx
1542 jnz LABEL(less32bytes)
1543 movdqa (%rdi), %xmm3
1544
1545 UPDATE_STRNCMP_COUNTER
1546
1547 mov $16, %rcx /* index for loads */
1548 mov $14, %r9d /* byte position left over from less32bytes case */
1549 /*
1550 * Setup %r10 value allows us to detect crossing a page boundary.
1551 * When %r10 goes positive we have crossed a page boundary and
1552 * need to do a nibble.
1553 */
1554 lea 14(%rdi), %r10
1555 and $0xfff, %r10 /* offset into 4K page */
1556 sub $0x1000, %r10 /* subtract 4K pagesize */
1557
1558 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1559
1560 .p2align 4
1561LABEL(loop_ashr_14_use):
1562 add $16, %r10
1563 jg LABEL(nibble_ashr_14_use)
1564
1565LABEL(nibble_ashr_14_restart_use):
1566 movdqa (%rdi, %rdx), %xmm0
1567 palignr $14, -16(%rdi, %rdx), D(%xmm0)
1568#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1569 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1570#else
1571 movdqa (%rsi,%rdx), %xmm1
1572 TOLOWER (%xmm0, %xmm1)
1573 pcmpistri $0x1a, %xmm1, %xmm0
1574#endif
1575 jbe LABEL(exit_use)
1576#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1577 sub $16, %r11
1578 jbe LABEL(strcmp_exitz)
1579#endif
1580
1581 add $16, %rdx
1582 add $16, %r10
1583 jg LABEL(nibble_ashr_14_use)
1584
1585 movdqa (%rdi, %rdx), %xmm0
1586 palignr $14, -16(%rdi, %rdx), D(%xmm0)
1587#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1588 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1589#else
1590 movdqa (%rsi,%rdx), %xmm1
1591 TOLOWER (%xmm0, %xmm1)
1592 pcmpistri $0x1a, %xmm1, %xmm0
1593#endif
1594 jbe LABEL(exit_use)
1595#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1596 sub $16, %r11
1597 jbe LABEL(strcmp_exitz)
1598#endif
1599 add $16, %rdx
1600 jmp LABEL(loop_ashr_14_use)
1601
1602 .p2align 4
1603LABEL(nibble_ashr_14_use):
1604 sub $0x1000, %r10
1605 movdqa -16(%rdi, %rdx), %xmm0
1606 psrldq $14, D(%xmm0)
1607 pcmpistri $0x3a,%xmm0, %xmm0
1608#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1609 cmp %r11, %rcx
1610 jae LABEL(nibble_ashr_exit_use)
1611#endif
1612 cmp $1, %ecx
1613 ja LABEL(nibble_ashr_14_restart_use)
1614
1615 jmp LABEL(nibble_ashr_exit_use)
1616
1617/*
1618 * The following cases will be handled by ashr_15
1619 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1620 * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15
1621 */
1622 .p2align 4
1623LABEL(ashr_15):
1624 pslldq $1, D(%xmm2)
1625 TOLOWER (%xmm1, %xmm2)
1626 pcmpeqb %xmm1, D(%xmm2)
1627 psubb %xmm0, D(%xmm2)
1628 pmovmskb %xmm2, %r9d
1629 shr %cl, %edx
1630 shr %cl, %r9d
1631 sub %r9d, %edx
1632 jnz LABEL(less32bytes)
1633
1634 movdqa (%rdi), %xmm3
1635
1636 UPDATE_STRNCMP_COUNTER
1637
1638 mov $16, %rcx /* index for loads */
1639 mov $15, %r9d /* byte position left over from less32bytes case */
1640 /*
1641 * Setup %r10 value allows us to detect crossing a page boundary.
1642 * When %r10 goes positive we have crossed a page boundary and
1643 * need to do a nibble.
1644 */
1645 lea 15(%rdi), %r10
1646 and $0xfff, %r10 /* offset into 4K page */
1647
1648 sub $0x1000, %r10 /* subtract 4K pagesize */
1649
1650 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1651
1652 .p2align 4
1653LABEL(loop_ashr_15_use):
1654 add $16, %r10
1655 jg LABEL(nibble_ashr_15_use)
1656
1657LABEL(nibble_ashr_15_restart_use):
1658 movdqa (%rdi, %rdx), %xmm0
1659 palignr $15, -16(%rdi, %rdx), D(%xmm0)
1660#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1661 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1662#else
1663 movdqa (%rsi,%rdx), %xmm1
1664 TOLOWER (%xmm0, %xmm1)
1665 pcmpistri $0x1a, %xmm1, %xmm0
1666#endif
1667 jbe LABEL(exit_use)
1668#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1669 sub $16, %r11
1670 jbe LABEL(strcmp_exitz)
1671#endif
1672
1673 add $16, %rdx
1674 add $16, %r10
1675 jg LABEL(nibble_ashr_15_use)
1676
1677 movdqa (%rdi, %rdx), %xmm0
1678 palignr $15, -16(%rdi, %rdx), D(%xmm0)
1679#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1680 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1681#else
1682 movdqa (%rsi,%rdx), %xmm1
1683 TOLOWER (%xmm0, %xmm1)
1684 pcmpistri $0x1a, %xmm1, %xmm0
1685#endif
1686 jbe LABEL(exit_use)
1687#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1688 sub $16, %r11
1689 jbe LABEL(strcmp_exitz)
1690#endif
1691 add $16, %rdx
1692 jmp LABEL(loop_ashr_15_use)
1693
1694 .p2align 4
1695LABEL(nibble_ashr_15_use):
1696 sub $0x1000, %r10
1697 movdqa -16(%rdi, %rdx), %xmm0
1698 psrldq $15, D(%xmm0)
1699 pcmpistri $0x3a,%xmm0, %xmm0
1700#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1701 cmp %r11, %rcx
1702 jae LABEL(nibble_ashr_exit_use)
1703#endif
1704 cmp $0, %ecx
1705 ja LABEL(nibble_ashr_15_restart_use)
1706
/*
 * Common exit path of the *_use loops.
 *
 * nibble_ashr_exit_use: reached from the page-boundary ("nibble") code with
 * the zero-padded leftover bytes in %xmm0; performs the final compare
 * against the block at (%rsi, %rdx) and continues into exit_use.
 *
 * exit_use: expects the flags and %rcx from a pcmpistri $0x1a, %rdx = block
 * offset, %r9 = byte shift of the active ashr case, %r8d = swap flag.
 * Returns the byte difference in %eax, or 0 via strcmp_exitz.
 */
LABEL(nibble_ashr_exit_use):
#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
#else
 movdqa (%rsi,%rdx), %xmm1
 TOLOWER (%xmm0, %xmm1)
 pcmpistri $0x1a, %xmm1, %xmm0
#endif
 .p2align 4
LABEL(exit_use):
 jnc LABEL(strcmp_exitz) /* CF clear: no differing byte was found */
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 sub %rcx, %r11
 jbe LABEL(strcmp_exitz) /* difference is at or past the count limit */
#endif
 add %rcx, %rdx /* %rdx = offset of the differing byte */
 lea -16(%rdi, %r9), %rdi /* rebase %rdi by the case's byte shift */
 movzbl (%rdi, %rdx), %eax
 movzbl (%rsi, %rdx), %edx
 test %r8d, %r8d
 jz LABEL(ret_use)
 xchg %eax, %edx /* %r8d set: put the bytes back in original order */
LABEL(ret_use):
#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 /* Map both bytes through the _nl_C_LC_CTYPE_tolower table (4-byte
    entries, base biased by 128 entries) before subtracting. */
 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
 movl (%rcx,%rdx,4), %edx
 movl (%rcx,%rax,4), %eax
#endif

 sub %edx, %eax /* return value = difference of the two bytes */
 ret
1738
/*
 * less32bytes: reached from the ashr_N entry code when the two bit masks
 * differ (nonzero %edx).  Adds the offsets %rax/%rcx back onto the %rdi/%rsi
 * bases, undoes the operand swap when %r8d is set, and then shares the tail
 * below.
 *
 * ret / less16bytes: %rdx holds a bit mask; its lowest set bit is used as
 * the index of the byte pair to compare.  Returns the byte difference in
 * %eax, or 0 via strcmp_exitz when the index is at or past the remaining
 * count.
 * NOTE(review): less16bytes is also a jump target for code outside this
 * section - confirm the mask it passes in %rdx.
 */
LABEL(less32bytes):
 lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */
 lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */
 test %r8d, %r8d
 jz LABEL(ret)
 xchg %rsi, %rdi /* recover original order according to flag(%r8d) */

 .p2align 4
LABEL(ret):
LABEL(less16bytes):
 bsf %rdx, %rdx /* find and store bit index in %rdx */

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 sub %rdx, %r11
 jbe LABEL(strcmp_exitz) /* index is at or past the count limit */
#endif
 movzbl (%rsi, %rdx), %ecx
 movzbl (%rdi, %rdx), %eax

#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 /* Map both bytes through the _nl_C_LC_CTYPE_tolower table (4-byte
    entries, base biased by 128 entries) before subtracting. */
 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
 movl (%rdx,%rcx,4), %ecx
 movl (%rdx,%rax,4), %eax
#endif

 sub %ecx, %eax /* return value = byte at (%rdi) minus byte at (%rsi) */
 ret
1766
/* strcmp_exitz: shared "no difference" exit; returns 0 in %eax.  */
LABEL(strcmp_exitz):
 xor %eax, %eax
 ret
1770
1771 .p2align 4
1772 // XXX Same as code above
1773LABEL(Byte0):
1774 movzx (%rsi), %ecx
1775 movzx (%rdi), %eax
1776
1777#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
1778 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
1779 movl (%rdx,%rcx,4), %ecx
1780 movl (%rdx,%rax,4), %eax
1781#endif
1782
1783 sub %ecx, %eax
1784 ret
1785 cfi_endproc
1786 .size STRCMP_SSE42, .-STRCMP_SSE42
1787
/* Drop the case-conversion helper macros (TOLOWER is used throughout the
   code above) so that they do not leak past this file.
   NOTE(review): UCLOW_reg, UCHIGH_reg and LCQWORD_reg are presumably
   operands of TOLOWER defined earlier in the file - confirm there.  */
#undef UCLOW_reg
#undef UCHIGH_reg
#undef LCQWORD_reg
#undef TOLOWER
1792
 /* Put all SSE 4.2 functions together. */
 /*
  * unaligned_table: 16 entries of 32 bits, each the distance from the
  * start of the table to one of the ashr_N labels.  Going by the header
  * comments of the ashr cases, entry i (0-based) is the handler for
  * relative offset i = 15 + (offset of rdi) - (offset of rsi): entries
  * 0..14 are ashr_1..ashr_15 and entry 15 is ashr_0.
  * NOTE(review): the code that indexes this table is outside this
  * section.
  */
 .section .rodata.SECTION,"a",@progbits
 .p2align 3
LABEL(unaligned_table):
 .int LABEL(ashr_1) - LABEL(unaligned_table)
 .int LABEL(ashr_2) - LABEL(unaligned_table)
 .int LABEL(ashr_3) - LABEL(unaligned_table)
 .int LABEL(ashr_4) - LABEL(unaligned_table)
 .int LABEL(ashr_5) - LABEL(unaligned_table)
 .int LABEL(ashr_6) - LABEL(unaligned_table)
 .int LABEL(ashr_7) - LABEL(unaligned_table)
 .int LABEL(ashr_8) - LABEL(unaligned_table)
 .int LABEL(ashr_9) - LABEL(unaligned_table)
 .int LABEL(ashr_10) - LABEL(unaligned_table)
 .int LABEL(ashr_11) - LABEL(unaligned_table)
 .int LABEL(ashr_12) - LABEL(unaligned_table)
 .int LABEL(ashr_13) - LABEL(unaligned_table)
 .int LABEL(ashr_14) - LABEL(unaligned_table)
 .int LABEL(ashr_15) - LABEL(unaligned_table)
 .int LABEL(ashr_0) - LABEL(unaligned_table)
1813
/* Remove every macro this file may have defined, so that the file can be
   included again with different settings.  LABEL, GLABEL and SECTION are
   defined at the top of the file.
   NOTE(review): the instruction-named macros and D are presumably defined
   earlier in the file (e.g. for the USE_AVX build) - confirm there.  */
#undef LABEL
#undef GLABEL
#undef SECTION
#undef movdqa
#undef movdqu
#undef pmovmskb
#undef pcmpistri
#undef psubb
#undef pcmpeqb
#undef psrldq
#undef pslldq
#undef palignr
#undef pxor
#undef D
1828