1/* strcmp with SSE4.2
2 Copyright (C) 2009-2017 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
19
20
21/* We use 0x1a:
22 _SIDD_SBYTE_OPS
23 | _SIDD_CMP_EQUAL_EACH
24 | _SIDD_NEGATIVE_POLARITY
25 | _SIDD_LEAST_SIGNIFICANT
26 on pcmpistri to find out if two 16byte data elements are the same
27 and the offset of the first different byte. There are 4 cases:
28
29 1. Both 16byte data elements are valid and identical.
30 2. Both 16byte data elements have EOS and are identical.
31 3. Both 16byte data elements are valid and they differ at offset X.
32 4. At least one 16byte data element has EOS at offset X. Two 16byte
33 data elements must differ at or before offset X.
34
35 Here is the table of ECX, CFlag, ZFlag and SFlag for 4 cases:
36
37 case ECX CFlag ZFlag SFlag
38 1 16 0 0 0
39 2 16 0 1 1
40 3 X 1 0 0
41 4 0 <= X 1 0/1 0/1
42
43 We exit from the loop for cases 2, 3 and 4 with jbe which branches
44 when either CFlag or ZFlag is 1. If CFlag == 0, we return 0 for
45 case 2. */
46
47 /* Put all SSE 4.2 functions together. */
48 .section .text.SECTION,"ax",@progbits
49 .align 16
50 .type STRCMP_SSE42, @function
51 .globl STRCMP_SSE42
52 .hidden STRCMP_SSE42
53#ifdef USE_AS_STRCASECMP_L
54ENTRY (GLABEL(__strcasecmp))
55 movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
56 mov %fs:(%rax),%RDX_LP
57
58 // XXX 5 byte should be before the function
59 /* 5-byte NOP. */
60 .byte 0x0f,0x1f,0x44,0x00,0x00
61END (GLABEL(__strcasecmp))
62 /* FALLTHROUGH to strcasecmp_l. */
63#endif
64#ifdef USE_AS_STRNCASECMP_L
65ENTRY (GLABEL(__strncasecmp))
66 movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
67 mov %fs:(%rax),%RCX_LP
68
69 // XXX 5 byte should be before the function
70 /* 5-byte NOP. */
71 .byte 0x0f,0x1f,0x44,0x00,0x00
72END (GLABEL(__strncasecmp))
73 /* FALLTHROUGH to strncasecmp_l. */
74#endif
75
76
77#ifdef USE_AVX
78# define movdqa vmovdqa
79# define movdqu vmovdqu
80# define pmovmskb vpmovmskb
81# define pcmpistri vpcmpistri
82# define psubb vpsubb
83# define pcmpeqb vpcmpeqb
84# define psrldq vpsrldq
85# define pslldq vpslldq
86# define palignr vpalignr
87# define pxor vpxor
88# define D(arg) arg, arg
89#else
90# define D(arg) arg
91#endif
92
93STRCMP_SSE42:
94 cfi_startproc
95 CALL_MCOUNT
96
97/*
98 * This implementation uses SSE to compare up to 16 bytes at a time.
99 */
100#ifdef USE_AS_STRCASECMP_L
101 /* We have to fall back on the C implementation for locales
102 with encodings not matching ASCII for single bytes. */
103# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
104 mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP
105# else
106 mov (%rdx), %RAX_LP
107# endif
108 testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
109 jne __strcasecmp_l_nonascii
110#endif
111#ifdef USE_AS_STRNCASECMP_L
112 /* We have to fall back on the C implementation for locales
113 with encodings not matching ASCII for single bytes. */
114# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
115 mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP
116# else
117 mov (%rcx), %RAX_LP
118# endif
119 testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
120 jne __strncasecmp_l_nonascii
121#endif
122
123#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
124 test %RDX_LP, %RDX_LP
125 je LABEL(strcmp_exitz)
126 cmp $1, %RDX_LP
127 je LABEL(Byte0)
128 mov %RDX_LP, %R11_LP
129#endif
130 mov %esi, %ecx
131 mov %edi, %eax
132/* Use 64bit AND here to avoid long NOP padding. */
133 and $0x3f, %rcx /* rsi alignment in cache line */
134 and $0x3f, %rax /* rdi alignment in cache line */
135#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
136 .section .rodata.cst16,"aM",@progbits,16
137 .align 16
138LABEL(belowupper):
139 .quad 0x4040404040404040
140 .quad 0x4040404040404040
141LABEL(topupper):
142# ifdef USE_AVX
143 .quad 0x5a5a5a5a5a5a5a5a
144 .quad 0x5a5a5a5a5a5a5a5a
145# else
146 .quad 0x5b5b5b5b5b5b5b5b
147 .quad 0x5b5b5b5b5b5b5b5b
148# endif
149LABEL(touppermask):
150 .quad 0x2020202020202020
151 .quad 0x2020202020202020
152 .previous
153 movdqa LABEL(belowupper)(%rip), %xmm4
154# define UCLOW_reg %xmm4
155 movdqa LABEL(topupper)(%rip), %xmm5
156# define UCHIGH_reg %xmm5
157 movdqa LABEL(touppermask)(%rip), %xmm6
158# define LCQWORD_reg %xmm6
159#endif
160 cmp $0x30, %ecx
161 ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */
162 cmp $0x30, %eax
163 ja LABEL(crosscache)/* rdi: 16-byte load will cross cache line */
164 movdqu (%rdi), %xmm1
165 movdqu (%rsi), %xmm2
166#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
167# ifdef USE_AVX
168# define TOLOWER(reg1, reg2) \
169 vpcmpgtb UCLOW_reg, reg1, %xmm7; \
170 vpcmpgtb UCHIGH_reg, reg1, %xmm8; \
171 vpcmpgtb UCLOW_reg, reg2, %xmm9; \
172 vpcmpgtb UCHIGH_reg, reg2, %xmm10; \
173 vpandn %xmm7, %xmm8, %xmm8; \
174 vpandn %xmm9, %xmm10, %xmm10; \
175 vpand LCQWORD_reg, %xmm8, %xmm8; \
176 vpand LCQWORD_reg, %xmm10, %xmm10; \
177 vpor reg1, %xmm8, reg1; \
178 vpor reg2, %xmm10, reg2
179# else
180# define TOLOWER(reg1, reg2) \
181 movdqa reg1, %xmm7; \
182 movdqa UCHIGH_reg, %xmm8; \
183 movdqa reg2, %xmm9; \
184 movdqa UCHIGH_reg, %xmm10; \
185 pcmpgtb UCLOW_reg, %xmm7; \
186 pcmpgtb reg1, %xmm8; \
187 pcmpgtb UCLOW_reg, %xmm9; \
188 pcmpgtb reg2, %xmm10; \
189 pand %xmm8, %xmm7; \
190 pand %xmm10, %xmm9; \
191 pand LCQWORD_reg, %xmm7; \
192 pand LCQWORD_reg, %xmm9; \
193 por %xmm7, reg1; \
194 por %xmm9, reg2
195# endif
196 TOLOWER (%xmm1, %xmm2)
197#else
198# define TOLOWER(reg1, reg2)
199#endif
200 pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char checks */
201 pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
202 pcmpeqb %xmm2, D(%xmm1) /* compare first 16 bytes for equality */
203 psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/
204 pmovmskb %xmm1, %edx
205 sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */
206 jnz LABEL(less16bytes)/* If not, find different value or null char */
207#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
208 sub $16, %r11
209 jbe LABEL(strcmp_exitz)/* finish comparison */
210#endif
211 add $16, %rsi /* prepare to search next 16 bytes */
212 add $16, %rdi /* prepare to search next 16 bytes */
213
214 /*
215 * Determine source and destination string offsets from 16-byte
216 * alignment. Use relative offset difference between the two to
217 * determine which case below to use.
218 */
219 .p2align 4
220LABEL(crosscache):
221 and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */
222 and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */
223 mov $0xffff, %edx /* for equivalent offset */
224 xor %r8d, %r8d
225 and $0xf, %ecx /* offset of rsi */
226 and $0xf, %eax /* offset of rdi */
227 pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char check */
228 cmp %eax, %ecx
229 je LABEL(ashr_0) /* rsi and rdi relative offset same */
230 ja LABEL(bigger)
231 mov %edx, %r8d /* r8d is offset flag for exit tail */
232 xchg %ecx, %eax
233 xchg %rsi, %rdi
234LABEL(bigger):
235 movdqa (%rdi), %xmm2
236 movdqa (%rsi), %xmm1
237 lea 15(%rax), %r9
238 sub %rcx, %r9
239 lea LABEL(unaligned_table)(%rip), %r10
240 movslq (%r10, %r9,4), %r9
241 pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
242 lea (%r10, %r9), %r10
243 jmp *%r10 /* jump to corresponding case */
244
245/*
246 * The following cases will be handled by ashr_0
247 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
248 * n(0~15) n(0~15) 15(15+ n-n) ashr_0
249 */
250 .p2align 4
251LABEL(ashr_0):
252
253 movdqa (%rsi), %xmm1
254 pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
255#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
256 pcmpeqb (%rdi), D(%xmm1) /* compare 16 bytes for equality */
257#else
258 movdqa (%rdi), %xmm2
259 TOLOWER (%xmm1, %xmm2)
260 pcmpeqb %xmm2, D(%xmm1) /* compare 16 bytes for equality */
261#endif
262 psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/
263 pmovmskb %xmm1, %r9d
264 shr %cl, %edx /* adjust 0xffff for offset */
265 shr %cl, %r9d /* adjust for 16-byte offset */
266 sub %r9d, %edx
267 /*
268 * edx must be the same with r9d if in left byte (16-rcx) is equal to
269 * the start from (16-rax) and no null char was seen.
270 */
271 jne LABEL(less32bytes) /* mismatch or null char */
272 UPDATE_STRNCMP_COUNTER
273 mov $16, %rcx
274 mov $16, %r9
275
276 /*
277 * Now both strings are aligned at 16-byte boundary. Loop over strings
278 * checking 32-bytes per iteration.
279 */
280 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
281 .p2align 4
282LABEL(ashr_0_use):
283 movdqa (%rdi,%rdx), %xmm0
284#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
285 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
286#else
287 movdqa (%rsi,%rdx), %xmm1
288 TOLOWER (%xmm0, %xmm1)
289 pcmpistri $0x1a, %xmm1, %xmm0
290#endif
291 lea 16(%rdx), %rdx
292 jbe LABEL(ashr_0_exit_use)
293#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
294 sub $16, %r11
295 jbe LABEL(strcmp_exitz)
296#endif
297
298 movdqa (%rdi,%rdx), %xmm0
299#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
300 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
301#else
302 movdqa (%rsi,%rdx), %xmm1
303 TOLOWER (%xmm0, %xmm1)
304 pcmpistri $0x1a, %xmm1, %xmm0
305#endif
306 lea 16(%rdx), %rdx
307 jbe LABEL(ashr_0_exit_use)
308#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
309 sub $16, %r11
310 jbe LABEL(strcmp_exitz)
311#endif
312 jmp LABEL(ashr_0_use)
313
314
315 .p2align 4
316LABEL(ashr_0_exit_use):
317 jnc LABEL(strcmp_exitz)
318#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
319 sub %rcx, %r11
320 jbe LABEL(strcmp_exitz)
321#endif
322 lea -16(%rdx, %rcx), %rcx
323 movzbl (%rdi, %rcx), %eax
324 movzbl (%rsi, %rcx), %edx
325#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
326 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
327 movl (%rcx,%rax,4), %eax
328 movl (%rcx,%rdx,4), %edx
329#endif
330 sub %edx, %eax
331 ret
332
333
334
335/*
336 * The following cases will be handled by ashr_1
337 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
338 * n(15) n -15 0(15 +(n-15) - n) ashr_1
339 */
340 .p2align 4
341LABEL(ashr_1):
342 pslldq $15, D(%xmm2) /* shift first string to align with second */
343 TOLOWER (%xmm1, %xmm2)
344 pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */
345 psubb %xmm0, D(%xmm2) /* packed sub of comparison results*/
346 pmovmskb %xmm2, %r9d
347 shr %cl, %edx /* adjust 0xffff for offset */
348 shr %cl, %r9d /* adjust for 16-byte offset */
349 sub %r9d, %edx
350 jnz LABEL(less32bytes) /* mismatch or null char seen */
351 movdqa (%rdi), %xmm3
352 UPDATE_STRNCMP_COUNTER
353
354 mov $16, %rcx /* index for loads*/
355 mov $1, %r9d /* byte position left over from less32bytes case */
356 /*
357 * Setup %r10 value allows us to detect crossing a page boundary.
358 * When %r10 goes positive we have crossed a page boundary and
359 * need to do a nibble.
360 */
361 lea 1(%rdi), %r10
362 and $0xfff, %r10 /* offset into 4K page */
363 sub $0x1000, %r10 /* subtract 4K pagesize */
364 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
365
366 .p2align 4
367LABEL(loop_ashr_1_use):
368 add $16, %r10
369 jg LABEL(nibble_ashr_1_use)
370
371LABEL(nibble_ashr_1_restart_use):
372 movdqa (%rdi, %rdx), %xmm0
373 palignr $1, -16(%rdi, %rdx), D(%xmm0)
374#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
375 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
376#else
377 movdqa (%rsi,%rdx), %xmm1
378 TOLOWER (%xmm0, %xmm1)
379 pcmpistri $0x1a, %xmm1, %xmm0
380#endif
381 jbe LABEL(exit_use)
382#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
383 sub $16, %r11
384 jbe LABEL(strcmp_exitz)
385#endif
386
387 add $16, %rdx
388 add $16, %r10
389 jg LABEL(nibble_ashr_1_use)
390
391 movdqa (%rdi, %rdx), %xmm0
392 palignr $1, -16(%rdi, %rdx), D(%xmm0)
393#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
394 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
395#else
396 movdqa (%rsi,%rdx), %xmm1
397 TOLOWER (%xmm0, %xmm1)
398 pcmpistri $0x1a, %xmm1, %xmm0
399#endif
400 jbe LABEL(exit_use)
401#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
402 sub $16, %r11
403 jbe LABEL(strcmp_exitz)
404#endif
405 add $16, %rdx
406 jmp LABEL(loop_ashr_1_use)
407
408 .p2align 4
409LABEL(nibble_ashr_1_use):
410 sub $0x1000, %r10
411 movdqa -16(%rdi, %rdx), %xmm0
412 psrldq $1, D(%xmm0)
413 pcmpistri $0x3a,%xmm0, %xmm0
414#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
415 cmp %r11, %rcx
416 jae LABEL(nibble_ashr_exit_use)
417#endif
418 cmp $14, %ecx
419 ja LABEL(nibble_ashr_1_restart_use)
420
421 jmp LABEL(nibble_ashr_exit_use)
422
423/*
424 * The following cases will be handled by ashr_2
425 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
426 * n(14~15) n -14 1(15 +(n-14) - n) ashr_2
427 */
428 .p2align 4
429LABEL(ashr_2):
430 pslldq $14, D(%xmm2)
431 TOLOWER (%xmm1, %xmm2)
432 pcmpeqb %xmm1, D(%xmm2)
433 psubb %xmm0, D(%xmm2)
434 pmovmskb %xmm2, %r9d
435 shr %cl, %edx
436 shr %cl, %r9d
437 sub %r9d, %edx
438 jnz LABEL(less32bytes)
439 movdqa (%rdi), %xmm3
440 UPDATE_STRNCMP_COUNTER
441
442 mov $16, %rcx /* index for loads */
443 mov $2, %r9d /* byte position left over from less32bytes case */
444 /*
445 * Setup %r10 value allows us to detect crossing a page boundary.
446 * When %r10 goes positive we have crossed a page boundary and
447 * need to do a nibble.
448 */
449 lea 2(%rdi), %r10
450 and $0xfff, %r10 /* offset into 4K page */
451 sub $0x1000, %r10 /* subtract 4K pagesize */
452 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
453
454 .p2align 4
455LABEL(loop_ashr_2_use):
456 add $16, %r10
457 jg LABEL(nibble_ashr_2_use)
458
459LABEL(nibble_ashr_2_restart_use):
460 movdqa (%rdi, %rdx), %xmm0
461 palignr $2, -16(%rdi, %rdx), D(%xmm0)
462#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
463 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
464#else
465 movdqa (%rsi,%rdx), %xmm1
466 TOLOWER (%xmm0, %xmm1)
467 pcmpistri $0x1a, %xmm1, %xmm0
468#endif
469 jbe LABEL(exit_use)
470#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
471 sub $16, %r11
472 jbe LABEL(strcmp_exitz)
473#endif
474
475 add $16, %rdx
476 add $16, %r10
477 jg LABEL(nibble_ashr_2_use)
478
479 movdqa (%rdi, %rdx), %xmm0
480 palignr $2, -16(%rdi, %rdx), D(%xmm0)
481#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
482 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
483#else
484 movdqa (%rsi,%rdx), %xmm1
485 TOLOWER (%xmm0, %xmm1)
486 pcmpistri $0x1a, %xmm1, %xmm0
487#endif
488 jbe LABEL(exit_use)
489#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
490 sub $16, %r11
491 jbe LABEL(strcmp_exitz)
492#endif
493 add $16, %rdx
494 jmp LABEL(loop_ashr_2_use)
495
496 .p2align 4
497LABEL(nibble_ashr_2_use):
498 sub $0x1000, %r10
499 movdqa -16(%rdi, %rdx), %xmm0
500 psrldq $2, D(%xmm0)
501 pcmpistri $0x3a,%xmm0, %xmm0
502#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
503 cmp %r11, %rcx
504 jae LABEL(nibble_ashr_exit_use)
505#endif
506 cmp $13, %ecx
507 ja LABEL(nibble_ashr_2_restart_use)
508
509 jmp LABEL(nibble_ashr_exit_use)
510
511/*
512 * The following cases will be handled by ashr_3
513 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
514 * n(13~15) n -13 2(15 +(n-13) - n) ashr_3
515 */
516 .p2align 4
517LABEL(ashr_3):
518 pslldq $13, D(%xmm2)
519 TOLOWER (%xmm1, %xmm2)
520 pcmpeqb %xmm1, D(%xmm2)
521 psubb %xmm0, D(%xmm2)
522 pmovmskb %xmm2, %r9d
523 shr %cl, %edx
524 shr %cl, %r9d
525 sub %r9d, %edx
526 jnz LABEL(less32bytes)
527 movdqa (%rdi), %xmm3
528
529 UPDATE_STRNCMP_COUNTER
530
531 mov $16, %rcx /* index for loads */
532 mov $3, %r9d /* byte position left over from less32bytes case */
533 /*
534 * Setup %r10 value allows us to detect crossing a page boundary.
535 * When %r10 goes positive we have crossed a page boundary and
536 * need to do a nibble.
537 */
538 lea 3(%rdi), %r10
539 and $0xfff, %r10 /* offset into 4K page */
540 sub $0x1000, %r10 /* subtract 4K pagesize */
541 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
542
543LABEL(loop_ashr_3_use):
544 add $16, %r10
545 jg LABEL(nibble_ashr_3_use)
546
547LABEL(nibble_ashr_3_restart_use):
548 movdqa (%rdi, %rdx), %xmm0
549 palignr $3, -16(%rdi, %rdx), D(%xmm0)
550#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
551 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
552#else
553 movdqa (%rsi,%rdx), %xmm1
554 TOLOWER (%xmm0, %xmm1)
555 pcmpistri $0x1a, %xmm1, %xmm0
556#endif
557 jbe LABEL(exit_use)
558#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
559 sub $16, %r11
560 jbe LABEL(strcmp_exitz)
561#endif
562
563 add $16, %rdx
564 add $16, %r10
565 jg LABEL(nibble_ashr_3_use)
566
567 movdqa (%rdi, %rdx), %xmm0
568 palignr $3, -16(%rdi, %rdx), D(%xmm0)
569#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
570 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
571#else
572 movdqa (%rsi,%rdx), %xmm1
573 TOLOWER (%xmm0, %xmm1)
574 pcmpistri $0x1a, %xmm1, %xmm0
575#endif
576 jbe LABEL(exit_use)
577#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
578 sub $16, %r11
579 jbe LABEL(strcmp_exitz)
580#endif
581 add $16, %rdx
582 jmp LABEL(loop_ashr_3_use)
583
584 .p2align 4
585LABEL(nibble_ashr_3_use):
586 sub $0x1000, %r10
587 movdqa -16(%rdi, %rdx), %xmm0
588 psrldq $3, D(%xmm0)
589 pcmpistri $0x3a,%xmm0, %xmm0
590#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
591 cmp %r11, %rcx
592 jae LABEL(nibble_ashr_exit_use)
593#endif
594 cmp $12, %ecx
595 ja LABEL(nibble_ashr_3_restart_use)
596
597 jmp LABEL(nibble_ashr_exit_use)
598
599/*
600 * The following cases will be handled by ashr_4
601 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
602 * n(12~15) n -12 3(15 +(n-12) - n) ashr_4
603 */
604 .p2align 4
605LABEL(ashr_4):
606 pslldq $12, D(%xmm2)
607 TOLOWER (%xmm1, %xmm2)
608 pcmpeqb %xmm1, D(%xmm2)
609 psubb %xmm0, D(%xmm2)
610 pmovmskb %xmm2, %r9d
611 shr %cl, %edx
612 shr %cl, %r9d
613 sub %r9d, %edx
614 jnz LABEL(less32bytes)
615 movdqa (%rdi), %xmm3
616
617 UPDATE_STRNCMP_COUNTER
618
619 mov $16, %rcx /* index for loads */
620 mov $4, %r9d /* byte position left over from less32bytes case */
621 /*
622 * Setup %r10 value allows us to detect crossing a page boundary.
623 * When %r10 goes positive we have crossed a page boundary and
624 * need to do a nibble.
625 */
626 lea 4(%rdi), %r10
627 and $0xfff, %r10 /* offset into 4K page */
628 sub $0x1000, %r10 /* subtract 4K pagesize */
629 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
630
631 .p2align 4
632LABEL(loop_ashr_4_use):
633 add $16, %r10
634 jg LABEL(nibble_ashr_4_use)
635
636LABEL(nibble_ashr_4_restart_use):
637 movdqa (%rdi, %rdx), %xmm0
638 palignr $4, -16(%rdi, %rdx), D(%xmm0)
639#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
640 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
641#else
642 movdqa (%rsi,%rdx), %xmm1
643 TOLOWER (%xmm0, %xmm1)
644 pcmpistri $0x1a, %xmm1, %xmm0
645#endif
646 jbe LABEL(exit_use)
647#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
648 sub $16, %r11
649 jbe LABEL(strcmp_exitz)
650#endif
651
652 add $16, %rdx
653 add $16, %r10
654 jg LABEL(nibble_ashr_4_use)
655
656 movdqa (%rdi, %rdx), %xmm0
657 palignr $4, -16(%rdi, %rdx), D(%xmm0)
658#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
659 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
660#else
661 movdqa (%rsi,%rdx), %xmm1
662 TOLOWER (%xmm0, %xmm1)
663 pcmpistri $0x1a, %xmm1, %xmm0
664#endif
665 jbe LABEL(exit_use)
666#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
667 sub $16, %r11
668 jbe LABEL(strcmp_exitz)
669#endif
670 add $16, %rdx
671 jmp LABEL(loop_ashr_4_use)
672
673 .p2align 4
674LABEL(nibble_ashr_4_use):
675 sub $0x1000, %r10
676 movdqa -16(%rdi, %rdx), %xmm0
677 psrldq $4, D(%xmm0)
678 pcmpistri $0x3a,%xmm0, %xmm0
679#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
680 cmp %r11, %rcx
681 jae LABEL(nibble_ashr_exit_use)
682#endif
683 cmp $11, %ecx
684 ja LABEL(nibble_ashr_4_restart_use)
685
686 jmp LABEL(nibble_ashr_exit_use)
687
688/*
689 * The following cases will be handled by ashr_5
690 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
691 * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5
692 */
693 .p2align 4
694LABEL(ashr_5):
695 pslldq $11, D(%xmm2)
696 TOLOWER (%xmm1, %xmm2)
697 pcmpeqb %xmm1, D(%xmm2)
698 psubb %xmm0, D(%xmm2)
699 pmovmskb %xmm2, %r9d
700 shr %cl, %edx
701 shr %cl, %r9d
702 sub %r9d, %edx
703 jnz LABEL(less32bytes)
704 movdqa (%rdi), %xmm3
705
706 UPDATE_STRNCMP_COUNTER
707
708 mov $16, %rcx /* index for loads */
709 mov $5, %r9d /* byte position left over from less32bytes case */
710 /*
711 * Setup %r10 value allows us to detect crossing a page boundary.
712 * When %r10 goes positive we have crossed a page boundary and
713 * need to do a nibble.
714 */
715 lea 5(%rdi), %r10
716 and $0xfff, %r10 /* offset into 4K page */
717 sub $0x1000, %r10 /* subtract 4K pagesize */
718 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
719
720 .p2align 4
721LABEL(loop_ashr_5_use):
722 add $16, %r10
723 jg LABEL(nibble_ashr_5_use)
724
725LABEL(nibble_ashr_5_restart_use):
726 movdqa (%rdi, %rdx), %xmm0
727 palignr $5, -16(%rdi, %rdx), D(%xmm0)
728#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
729 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
730#else
731 movdqa (%rsi,%rdx), %xmm1
732 TOLOWER (%xmm0, %xmm1)
733 pcmpistri $0x1a, %xmm1, %xmm0
734#endif
735 jbe LABEL(exit_use)
736#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
737 sub $16, %r11
738 jbe LABEL(strcmp_exitz)
739#endif
740
741 add $16, %rdx
742 add $16, %r10
743 jg LABEL(nibble_ashr_5_use)
744
745 movdqa (%rdi, %rdx), %xmm0
746
747 palignr $5, -16(%rdi, %rdx), D(%xmm0)
748#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
749 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
750#else
751 movdqa (%rsi,%rdx), %xmm1
752 TOLOWER (%xmm0, %xmm1)
753 pcmpistri $0x1a, %xmm1, %xmm0
754#endif
755 jbe LABEL(exit_use)
756#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
757 sub $16, %r11
758 jbe LABEL(strcmp_exitz)
759#endif
760 add $16, %rdx
761 jmp LABEL(loop_ashr_5_use)
762
763 .p2align 4
764LABEL(nibble_ashr_5_use):
765 sub $0x1000, %r10
766 movdqa -16(%rdi, %rdx), %xmm0
767 psrldq $5, D(%xmm0)
768 pcmpistri $0x3a,%xmm0, %xmm0
769#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
770 cmp %r11, %rcx
771 jae LABEL(nibble_ashr_exit_use)
772#endif
773 cmp $10, %ecx
774 ja LABEL(nibble_ashr_5_restart_use)
775
776 jmp LABEL(nibble_ashr_exit_use)
777
778/*
779 * The following cases will be handled by ashr_6
780 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
781 * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6
782 */
783 .p2align 4
784LABEL(ashr_6):
785 pslldq $10, D(%xmm2)
786 TOLOWER (%xmm1, %xmm2)
787 pcmpeqb %xmm1, D(%xmm2)
788 psubb %xmm0, D(%xmm2)
789 pmovmskb %xmm2, %r9d
790 shr %cl, %edx
791 shr %cl, %r9d
792 sub %r9d, %edx
793 jnz LABEL(less32bytes)
794 movdqa (%rdi), %xmm3
795
796 UPDATE_STRNCMP_COUNTER
797
798 mov $16, %rcx /* index for loads */
799 mov $6, %r9d /* byte position left over from less32bytes case */
800 /*
801 * Setup %r10 value allows us to detect crossing a page boundary.
802 * When %r10 goes positive we have crossed a page boundary and
803 * need to do a nibble.
804 */
805 lea 6(%rdi), %r10
806 and $0xfff, %r10 /* offset into 4K page */
807 sub $0x1000, %r10 /* subtract 4K pagesize */
808 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
809
810 .p2align 4
811LABEL(loop_ashr_6_use):
812 add $16, %r10
813 jg LABEL(nibble_ashr_6_use)
814
815LABEL(nibble_ashr_6_restart_use):
816 movdqa (%rdi, %rdx), %xmm0
817 palignr $6, -16(%rdi, %rdx), D(%xmm0)
818#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
819 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
820#else
821 movdqa (%rsi,%rdx), %xmm1
822 TOLOWER (%xmm0, %xmm1)
823 pcmpistri $0x1a, %xmm1, %xmm0
824#endif
825 jbe LABEL(exit_use)
826#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
827 sub $16, %r11
828 jbe LABEL(strcmp_exitz)
829#endif
830
831 add $16, %rdx
832 add $16, %r10
833 jg LABEL(nibble_ashr_6_use)
834
835 movdqa (%rdi, %rdx), %xmm0
836 palignr $6, -16(%rdi, %rdx), D(%xmm0)
837#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
838 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
839#else
840 movdqa (%rsi,%rdx), %xmm1
841 TOLOWER (%xmm0, %xmm1)
842 pcmpistri $0x1a, %xmm1, %xmm0
843#endif
844 jbe LABEL(exit_use)
845#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
846 sub $16, %r11
847 jbe LABEL(strcmp_exitz)
848#endif
849 add $16, %rdx
850 jmp LABEL(loop_ashr_6_use)
851
852 .p2align 4
853LABEL(nibble_ashr_6_use):
854 sub $0x1000, %r10
855 movdqa -16(%rdi, %rdx), %xmm0
856 psrldq $6, D(%xmm0)
857 pcmpistri $0x3a,%xmm0, %xmm0
858#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
859 cmp %r11, %rcx
860 jae LABEL(nibble_ashr_exit_use)
861#endif
862 cmp $9, %ecx
863 ja LABEL(nibble_ashr_6_restart_use)
864
865 jmp LABEL(nibble_ashr_exit_use)
866
867/*
868 * The following cases will be handled by ashr_7
869 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
870 * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7
871 */
872 .p2align 4
873LABEL(ashr_7):
874 pslldq $9, D(%xmm2)
875 TOLOWER (%xmm1, %xmm2)
876 pcmpeqb %xmm1, D(%xmm2)
877 psubb %xmm0, D(%xmm2)
878 pmovmskb %xmm2, %r9d
879 shr %cl, %edx
880 shr %cl, %r9d
881 sub %r9d, %edx
882 jnz LABEL(less32bytes)
883 movdqa (%rdi), %xmm3
884
885 UPDATE_STRNCMP_COUNTER
886
887 mov $16, %rcx /* index for loads */
888 mov $7, %r9d /* byte position left over from less32bytes case */
889 /*
890 * Setup %r10 value allows us to detect crossing a page boundary.
891 * When %r10 goes positive we have crossed a page boundary and
892 * need to do a nibble.
893 */
894 lea 7(%rdi), %r10
895 and $0xfff, %r10 /* offset into 4K page */
896 sub $0x1000, %r10 /* subtract 4K pagesize */
897 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
898
899 .p2align 4
900LABEL(loop_ashr_7_use):
901 add $16, %r10
902 jg LABEL(nibble_ashr_7_use)
903
904LABEL(nibble_ashr_7_restart_use):
905 movdqa (%rdi, %rdx), %xmm0
906 palignr $7, -16(%rdi, %rdx), D(%xmm0)
907#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
908 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
909#else
910 movdqa (%rsi,%rdx), %xmm1
911 TOLOWER (%xmm0, %xmm1)
912 pcmpistri $0x1a, %xmm1, %xmm0
913#endif
914 jbe LABEL(exit_use)
915#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
916 sub $16, %r11
917 jbe LABEL(strcmp_exitz)
918#endif
919
920 add $16, %rdx
921 add $16, %r10
922 jg LABEL(nibble_ashr_7_use)
923
924 movdqa (%rdi, %rdx), %xmm0
925 palignr $7, -16(%rdi, %rdx), D(%xmm0)
926#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
927 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
928#else
929 movdqa (%rsi,%rdx), %xmm1
930 TOLOWER (%xmm0, %xmm1)
931 pcmpistri $0x1a, %xmm1, %xmm0
932#endif
933 jbe LABEL(exit_use)
934#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
935 sub $16, %r11
936 jbe LABEL(strcmp_exitz)
937#endif
938 add $16, %rdx
939 jmp LABEL(loop_ashr_7_use)
940
941 .p2align 4
942LABEL(nibble_ashr_7_use):
943 sub $0x1000, %r10
944 movdqa -16(%rdi, %rdx), %xmm0
945 psrldq $7, D(%xmm0)
946 pcmpistri $0x3a,%xmm0, %xmm0
947#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
948 cmp %r11, %rcx
949 jae LABEL(nibble_ashr_exit_use)
950#endif
951 cmp $8, %ecx
952 ja LABEL(nibble_ashr_7_restart_use)
953
954 jmp LABEL(nibble_ashr_exit_use)
955
956/*
957 * The following cases will be handled by ashr_8
958 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
959 * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8
960 */
961 .p2align 4
962LABEL(ashr_8):
963 pslldq $8, D(%xmm2)
964 TOLOWER (%xmm1, %xmm2)
965 pcmpeqb %xmm1, D(%xmm2)
966 psubb %xmm0, D(%xmm2)
967 pmovmskb %xmm2, %r9d
968 shr %cl, %edx
969 shr %cl, %r9d
970 sub %r9d, %edx
971 jnz LABEL(less32bytes)
972 movdqa (%rdi), %xmm3
973
974 UPDATE_STRNCMP_COUNTER
975
976 mov $16, %rcx /* index for loads */
977 mov $8, %r9d /* byte position left over from less32bytes case */
978 /*
979 * Setup %r10 value allows us to detect crossing a page boundary.
980 * When %r10 goes positive we have crossed a page boundary and
981 * need to do a nibble.
982 */
983 lea 8(%rdi), %r10
984 and $0xfff, %r10 /* offset into 4K page */
985 sub $0x1000, %r10 /* subtract 4K pagesize */
986 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
987
988 .p2align 4
989LABEL(loop_ashr_8_use):
990 add $16, %r10
991 jg LABEL(nibble_ashr_8_use)
992
993LABEL(nibble_ashr_8_restart_use):
994 movdqa (%rdi, %rdx), %xmm0
995 palignr $8, -16(%rdi, %rdx), D(%xmm0)
996#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
997 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
998#else
999 movdqa (%rsi,%rdx), %xmm1
1000 TOLOWER (%xmm0, %xmm1)
1001 pcmpistri $0x1a, %xmm1, %xmm0
1002#endif
1003 jbe LABEL(exit_use)
1004#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1005 sub $16, %r11
1006 jbe LABEL(strcmp_exitz)
1007#endif
1008
1009 add $16, %rdx
1010 add $16, %r10
1011 jg LABEL(nibble_ashr_8_use)
1012
1013 movdqa (%rdi, %rdx), %xmm0
1014 palignr $8, -16(%rdi, %rdx), D(%xmm0)
1015#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1016 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1017#else
1018 movdqa (%rsi,%rdx), %xmm1
1019 TOLOWER (%xmm0, %xmm1)
1020 pcmpistri $0x1a, %xmm1, %xmm0
1021#endif
1022 jbe LABEL(exit_use)
1023#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1024 sub $16, %r11
1025 jbe LABEL(strcmp_exitz)
1026#endif
1027 add $16, %rdx
1028 jmp LABEL(loop_ashr_8_use)
1029
1030 .p2align 4
1031LABEL(nibble_ashr_8_use):
1032 sub $0x1000, %r10
1033 movdqa -16(%rdi, %rdx), %xmm0
1034 psrldq $8, D(%xmm0)
1035 pcmpistri $0x3a,%xmm0, %xmm0
1036#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1037 cmp %r11, %rcx
1038 jae LABEL(nibble_ashr_exit_use)
1039#endif
1040 cmp $7, %ecx
1041 ja LABEL(nibble_ashr_8_restart_use)
1042
1043 jmp LABEL(nibble_ashr_exit_use)
1044
1045/*
1046 * The following cases will be handled by ashr_9
1047 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1048 * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9
1049 */
1050 .p2align 4
1051LABEL(ashr_9):
1052 pslldq $7, D(%xmm2)
1053 TOLOWER (%xmm1, %xmm2)
1054 pcmpeqb %xmm1, D(%xmm2)
1055 psubb %xmm0, D(%xmm2)
1056 pmovmskb %xmm2, %r9d
1057 shr %cl, %edx
1058 shr %cl, %r9d
1059 sub %r9d, %edx
1060 jnz LABEL(less32bytes)
1061 movdqa (%rdi), %xmm3
1062
1063 UPDATE_STRNCMP_COUNTER
1064
1065 mov $16, %rcx /* index for loads */
1066 mov $9, %r9d /* byte position left over from less32bytes case */
1067 /*
1068 * Setup %r10 value allows us to detect crossing a page boundary.
1069 * When %r10 goes positive we have crossed a page boundary and
1070 * need to do a nibble.
1071 */
1072 lea 9(%rdi), %r10
1073 and $0xfff, %r10 /* offset into 4K page */
1074 sub $0x1000, %r10 /* subtract 4K pagesize */
1075 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1076
1077 .p2align 4
/*
 * Main compare loop for the 9-byte realignment case, unrolled twice.
 * Each step joins the block at -16(rdi,rdx) with the block at (rdi,rdx)
 * using palignr by 9, giving bytes 9..15 of the former followed by bytes
 * 0..8 of the latter, and compares that with the 16 bytes at (rsi,rdx).
 * rdx and r10 both advance by 16 per step; when r10 turns positive the
 * page-boundary path nibble_ashr_9_use is taken. In the length-limited
 * builds r11 drops by 16 per step and strcmp_exitz returns 0 once it is
 * used up.
 */
1078LABEL(loop_ashr_9_use):
1079 add $16, %r10
1080 jg LABEL(nibble_ashr_9_use)
1081
/* Re-entry point used by nibble_ashr_9_use when the loop may continue. */
1082LABEL(nibble_ashr_9_restart_use):
1083 movdqa (%rdi, %rdx), %xmm0
1084
1085 palignr $9, -16(%rdi, %rdx), D(%xmm0)
1086#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1087 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1088#else
1089 movdqa (%rsi,%rdx), %xmm1
1090 TOLOWER (%xmm0, %xmm1)
1091 pcmpistri $0x1a, %xmm1, %xmm0
1092#endif
/* jbe is taken when CF or ZF is set: a difference or a terminator. */
1093 jbe LABEL(exit_use)
1094#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1095 sub $16, %r11
1096 jbe LABEL(strcmp_exitz)
1097#endif
1098
/* Second unrolled step: same sequence on the next 16-byte block. */
1099 add $16, %rdx
1100 add $16, %r10
1101 jg LABEL(nibble_ashr_9_use)
1102
1103 movdqa (%rdi, %rdx), %xmm0
1104 palignr $9, -16(%rdi, %rdx), D(%xmm0)
1105#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1106 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1107#else
1108 movdqa (%rsi,%rdx), %xmm1
1109 TOLOWER (%xmm0, %xmm1)
1110 pcmpistri $0x1a, %xmm1, %xmm0
1111#endif
1112 jbe LABEL(exit_use)
1113#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1114 sub $16, %r11
1115 jbe LABEL(strcmp_exitz)
1116#endif
1117 add $16, %rdx
1118 jmp LABEL(loop_ashr_9_use)
1119
1120 .p2align 4
/*
 * Page-boundary slow path for the 9-byte case, entered when r10 went
 * positive. Inspects the 7 tail bytes (16 - 9) of the block at
 * -16(rdi,rdx), which would start the next comparison window, for a
 * terminating zero byte before the loop may load the block that follows.
 */
1121LABEL(nibble_ashr_9_use):
/* Take one 4K page back off the page counter r10. */
1122 sub $0x1000, %r10
1123 movdqa -16(%rdi, %rdx), %xmm0
/* Keep only the upper 7 bytes, moved down to byte 0; the rest is zero. */
1124 psrldq $9, D(%xmm0)
/* Mode 0x3a on xmm0 against itself: ecx = index of its first zero byte. */
1125 pcmpistri $0x3a,%xmm0, %xmm0
1126#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited builds: index not below the budget r11 -> common exit. */
1127 cmp %r11, %rcx
1128 jae LABEL(nibble_ashr_exit_use)
1129#endif
/* ecx above 6 means all 7 tail bytes are nonzero: resume the loop. */
1130 cmp $6, %ecx
1131 ja LABEL(nibble_ashr_9_restart_use)
1132
/* Otherwise a zero byte is in the tail: finish in the common exit. */
1133 jmp LABEL(nibble_ashr_exit_use)
1134
1135/*
1136 * The following cases will be handled by ashr_10
1137 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1138 * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10
1139 */
/*
 * Entry for the 10-byte realignment case: checks the first partial block,
 * then initialises rcx, r9d, r10 and rdx and falls through into
 * loop_ashr_10_use.
 * NOTE(review): the contents of xmm0, xmm1, xmm2, edx, cl and rax on entry
 * come from dispatch code above this excerpt - confirm there.
 */
1140 .p2align 4
1141LABEL(ashr_10):
/* Shift xmm2 left by 6 bytes (6 = 16 - 10). */
1142 pslldq $6, D(%xmm2)
/* TOLOWER is a macro defined earlier in the file; presumably it folds case
   only in the case-insensitive builds - confirm at its definition. */
1143 TOLOWER (%xmm1, %xmm2)
/* xmm2 = byte-wise equality mask of xmm1 and xmm2, then minus xmm0. */
1144 pcmpeqb %xmm1, D(%xmm2)
1145 psubb %xmm0, D(%xmm2)
/* r9d = top bit of each byte of xmm2. */
1146 pmovmskb %xmm2, %r9d
/* Shift both bit masks right by cl and subtract; a nonzero result goes to
   less32bytes, which takes the lowest set bit of rdx as the byte index. */
1147 shr %cl, %edx
1148 shr %cl, %r9d
1149 sub %r9d, %edx
1150 jnz LABEL(less32bytes)
1151 movdqa (%rdi), %xmm3
1152
/* Macro defined outside this excerpt (presumably adjusts the length budget
   r11 in the length-limited builds - confirm). */
1153 UPDATE_STRNCMP_COUNTER
1154
1155 mov $16, %rcx /* index for loads */
1156 mov $10, %r9d /* byte position left over from less32bytes case */
1157 /*
1158 * Setup %r10 value allows us to detect crossing a page boundary.
1159 * When %r10 goes positive we have crossed a page boundary and
1160 * need to do a nibble.
1161 */
/* r10 = ((rdi + 10) & 0xfff) - 0x1000, a negative page-offset counter. */
1162 lea 10(%rdi), %r10
1163 and $0xfff, %r10 /* offset into 4K page */
1164 sub $0x1000, %r10 /* subtract 4K pagesize */
1165 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1166
1167 .p2align 4
/*
 * Main compare loop for the 10-byte realignment case, unrolled twice.
 * Each step joins the block at -16(rdi,rdx) with the block at (rdi,rdx)
 * using palignr by 10, giving bytes 10..15 of the former followed by bytes
 * 0..9 of the latter, and compares that with the 16 bytes at (rsi,rdx).
 * rdx and r10 both advance by 16 per step; when r10 turns positive the
 * page-boundary path nibble_ashr_10_use is taken. In the length-limited
 * builds r11 drops by 16 per step and strcmp_exitz returns 0 once it is
 * used up.
 */
1168LABEL(loop_ashr_10_use):
1169 add $16, %r10
1170 jg LABEL(nibble_ashr_10_use)
1171
/* Re-entry point used by nibble_ashr_10_use when the loop may continue. */
1172LABEL(nibble_ashr_10_restart_use):
1173 movdqa (%rdi, %rdx), %xmm0
1174 palignr $10, -16(%rdi, %rdx), D(%xmm0)
1175#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1176 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1177#else
1178 movdqa (%rsi,%rdx), %xmm1
1179 TOLOWER (%xmm0, %xmm1)
1180 pcmpistri $0x1a, %xmm1, %xmm0
1181#endif
/* jbe is taken when CF or ZF is set: a difference or a terminator. */
1182 jbe LABEL(exit_use)
1183#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1184 sub $16, %r11
1185 jbe LABEL(strcmp_exitz)
1186#endif
1187
/* Second unrolled step: same sequence on the next 16-byte block. */
1188 add $16, %rdx
1189 add $16, %r10
1190 jg LABEL(nibble_ashr_10_use)
1191
1192 movdqa (%rdi, %rdx), %xmm0
1193 palignr $10, -16(%rdi, %rdx), D(%xmm0)
1194#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1195 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1196#else
1197 movdqa (%rsi,%rdx), %xmm1
1198 TOLOWER (%xmm0, %xmm1)
1199 pcmpistri $0x1a, %xmm1, %xmm0
1200#endif
1201 jbe LABEL(exit_use)
1202#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1203 sub $16, %r11
1204 jbe LABEL(strcmp_exitz)
1205#endif
1206 add $16, %rdx
1207 jmp LABEL(loop_ashr_10_use)
1208
1209 .p2align 4
/*
 * Page-boundary slow path for the 10-byte case, entered when r10 went
 * positive. Inspects the 6 tail bytes (16 - 10) of the block at
 * -16(rdi,rdx), which would start the next comparison window, for a
 * terminating zero byte before the loop may load the block that follows.
 */
1210LABEL(nibble_ashr_10_use):
/* Take one 4K page back off the page counter r10. */
1211 sub $0x1000, %r10
1212 movdqa -16(%rdi, %rdx), %xmm0
/* Keep only the upper 6 bytes, moved down to byte 0; the rest is zero. */
1213 psrldq $10, D(%xmm0)
/* Mode 0x3a on xmm0 against itself: ecx = index of its first zero byte. */
1214 pcmpistri $0x3a,%xmm0, %xmm0
1215#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited builds: index not below the budget r11 -> common exit. */
1216 cmp %r11, %rcx
1217 jae LABEL(nibble_ashr_exit_use)
1218#endif
/* ecx above 5 means all 6 tail bytes are nonzero: resume the loop. */
1219 cmp $5, %ecx
1220 ja LABEL(nibble_ashr_10_restart_use)
1221
/* Otherwise a zero byte is in the tail: finish in the common exit. */
1222 jmp LABEL(nibble_ashr_exit_use)
1223
1224/*
1225 * The following cases will be handled by ashr_11
1226 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1227 * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11
1228 */
/*
 * Entry for the 11-byte realignment case: checks the first partial block,
 * then initialises rcx, r9d, r10 and rdx and falls through into
 * loop_ashr_11_use.
 * NOTE(review): the contents of xmm0, xmm1, xmm2, edx, cl and rax on entry
 * come from dispatch code above this excerpt - confirm there.
 */
1229 .p2align 4
1230LABEL(ashr_11):
/* Shift xmm2 left by 5 bytes (5 = 16 - 11). */
1231 pslldq $5, D(%xmm2)
/* TOLOWER is a macro defined earlier in the file; presumably it folds case
   only in the case-insensitive builds - confirm at its definition. */
1232 TOLOWER (%xmm1, %xmm2)
/* xmm2 = byte-wise equality mask of xmm1 and xmm2, then minus xmm0. */
1233 pcmpeqb %xmm1, D(%xmm2)
1234 psubb %xmm0, D(%xmm2)
/* r9d = top bit of each byte of xmm2. */
1235 pmovmskb %xmm2, %r9d
/* Shift both bit masks right by cl and subtract; a nonzero result goes to
   less32bytes, which takes the lowest set bit of rdx as the byte index. */
1236 shr %cl, %edx
1237 shr %cl, %r9d
1238 sub %r9d, %edx
1239 jnz LABEL(less32bytes)
1240 movdqa (%rdi), %xmm3
1241
/* Macro defined outside this excerpt (presumably adjusts the length budget
   r11 in the length-limited builds - confirm). */
1242 UPDATE_STRNCMP_COUNTER
1243
1244 mov $16, %rcx /* index for loads */
1245 mov $11, %r9d /* byte position left over from less32bytes case */
1246 /*
1247 * Setup %r10 value allows us to detect crossing a page boundary.
1248 * When %r10 goes positive we have crossed a page boundary and
1249 * need to do a nibble.
1250 */
/* r10 = ((rdi + 11) & 0xfff) - 0x1000, a negative page-offset counter. */
1251 lea 11(%rdi), %r10
1252 and $0xfff, %r10 /* offset into 4K page */
1253 sub $0x1000, %r10 /* subtract 4K pagesize */
1254 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1255
1256 .p2align 4
/*
 * Main compare loop for the 11-byte realignment case, unrolled twice.
 * Each step joins the block at -16(rdi,rdx) with the block at (rdi,rdx)
 * using palignr by 11, giving bytes 11..15 of the former followed by bytes
 * 0..10 of the latter, and compares that with the 16 bytes at (rsi,rdx).
 * rdx and r10 both advance by 16 per step; when r10 turns positive the
 * page-boundary path nibble_ashr_11_use is taken. In the length-limited
 * builds r11 drops by 16 per step and strcmp_exitz returns 0 once it is
 * used up.
 */
1257LABEL(loop_ashr_11_use):
1258 add $16, %r10
1259 jg LABEL(nibble_ashr_11_use)
1260
/* Re-entry point used by nibble_ashr_11_use when the loop may continue. */
1261LABEL(nibble_ashr_11_restart_use):
1262 movdqa (%rdi, %rdx), %xmm0
1263 palignr $11, -16(%rdi, %rdx), D(%xmm0)
1264#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1265 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1266#else
1267 movdqa (%rsi,%rdx), %xmm1
1268 TOLOWER (%xmm0, %xmm1)
1269 pcmpistri $0x1a, %xmm1, %xmm0
1270#endif
/* jbe is taken when CF or ZF is set: a difference or a terminator. */
1271 jbe LABEL(exit_use)
1272#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1273 sub $16, %r11
1274 jbe LABEL(strcmp_exitz)
1275#endif
1276
/* Second unrolled step: same sequence on the next 16-byte block. */
1277 add $16, %rdx
1278 add $16, %r10
1279 jg LABEL(nibble_ashr_11_use)
1280
1281 movdqa (%rdi, %rdx), %xmm0
1282 palignr $11, -16(%rdi, %rdx), D(%xmm0)
1283#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1284 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1285#else
1286 movdqa (%rsi,%rdx), %xmm1
1287 TOLOWER (%xmm0, %xmm1)
1288 pcmpistri $0x1a, %xmm1, %xmm0
1289#endif
1290 jbe LABEL(exit_use)
1291#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1292 sub $16, %r11
1293 jbe LABEL(strcmp_exitz)
1294#endif
1295 add $16, %rdx
1296 jmp LABEL(loop_ashr_11_use)
1297
1298 .p2align 4
/*
 * Page-boundary slow path for the 11-byte case, entered when r10 went
 * positive. Inspects the 5 tail bytes (16 - 11) of the block at
 * -16(rdi,rdx), which would start the next comparison window, for a
 * terminating zero byte before the loop may load the block that follows.
 */
1299LABEL(nibble_ashr_11_use):
/* Take one 4K page back off the page counter r10. */
1300 sub $0x1000, %r10
1301 movdqa -16(%rdi, %rdx), %xmm0
/* Keep only the upper 5 bytes, moved down to byte 0; the rest is zero. */
1302 psrldq $11, D(%xmm0)
/* Mode 0x3a on xmm0 against itself: ecx = index of its first zero byte. */
1303 pcmpistri $0x3a,%xmm0, %xmm0
1304#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited builds: index not below the budget r11 -> common exit. */
1305 cmp %r11, %rcx
1306 jae LABEL(nibble_ashr_exit_use)
1307#endif
/* ecx above 4 means all 5 tail bytes are nonzero: resume the loop. */
1308 cmp $4, %ecx
1309 ja LABEL(nibble_ashr_11_restart_use)
1310
/* Otherwise a zero byte is in the tail: finish in the common exit. */
1311 jmp LABEL(nibble_ashr_exit_use)
1312
1313/*
1314 * The following cases will be handled by ashr_12
1315 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1316 * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12
1317 */
/*
 * Entry for the 12-byte realignment case: checks the first partial block,
 * then initialises rcx, r9d, r10 and rdx and falls through into
 * loop_ashr_12_use.
 * NOTE(review): the contents of xmm0, xmm1, xmm2, edx, cl and rax on entry
 * come from dispatch code above this excerpt - confirm there.
 */
1318 .p2align 4
1319LABEL(ashr_12):
/* Shift xmm2 left by 4 bytes (4 = 16 - 12). */
1320 pslldq $4, D(%xmm2)
/* TOLOWER is a macro defined earlier in the file; presumably it folds case
   only in the case-insensitive builds - confirm at its definition. */
1321 TOLOWER (%xmm1, %xmm2)
/* xmm2 = byte-wise equality mask of xmm1 and xmm2, then minus xmm0. */
1322 pcmpeqb %xmm1, D(%xmm2)
1323 psubb %xmm0, D(%xmm2)
/* r9d = top bit of each byte of xmm2. */
1324 pmovmskb %xmm2, %r9d
/* Shift both bit masks right by cl and subtract; a nonzero result goes to
   less32bytes, which takes the lowest set bit of rdx as the byte index. */
1325 shr %cl, %edx
1326 shr %cl, %r9d
1327 sub %r9d, %edx
1328 jnz LABEL(less32bytes)
1329 movdqa (%rdi), %xmm3
1330
/* Macro defined outside this excerpt (presumably adjusts the length budget
   r11 in the length-limited builds - confirm). */
1331 UPDATE_STRNCMP_COUNTER
1332
1333 mov $16, %rcx /* index for loads */
1334 mov $12, %r9d /* byte position left over from less32bytes case */
1335 /*
1336 * Setup %r10 value allows us to detect crossing a page boundary.
1337 * When %r10 goes positive we have crossed a page boundary and
1338 * need to do a nibble.
1339 */
/* r10 = ((rdi + 12) & 0xfff) - 0x1000, a negative page-offset counter. */
1340 lea 12(%rdi), %r10
1341 and $0xfff, %r10 /* offset into 4K page */
1342 sub $0x1000, %r10 /* subtract 4K pagesize */
1343 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1344
1345 .p2align 4
/*
 * Main compare loop for the 12-byte realignment case, unrolled twice.
 * Each step joins the block at -16(rdi,rdx) with the block at (rdi,rdx)
 * using palignr by 12, giving bytes 12..15 of the former followed by bytes
 * 0..11 of the latter, and compares that with the 16 bytes at (rsi,rdx).
 * rdx and r10 both advance by 16 per step; when r10 turns positive the
 * page-boundary path nibble_ashr_12_use is taken. In the length-limited
 * builds r11 drops by 16 per step and strcmp_exitz returns 0 once it is
 * used up.
 */
1346LABEL(loop_ashr_12_use):
1347 add $16, %r10
1348 jg LABEL(nibble_ashr_12_use)
1349
/* Re-entry point used by nibble_ashr_12_use when the loop may continue. */
1350LABEL(nibble_ashr_12_restart_use):
1351 movdqa (%rdi, %rdx), %xmm0
1352 palignr $12, -16(%rdi, %rdx), D(%xmm0)
1353#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1354 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1355#else
1356 movdqa (%rsi,%rdx), %xmm1
1357 TOLOWER (%xmm0, %xmm1)
1358 pcmpistri $0x1a, %xmm1, %xmm0
1359#endif
/* jbe is taken when CF or ZF is set: a difference or a terminator. */
1360 jbe LABEL(exit_use)
1361#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1362 sub $16, %r11
1363 jbe LABEL(strcmp_exitz)
1364#endif
1365
/* Second unrolled step: same sequence on the next 16-byte block. */
1366 add $16, %rdx
1367 add $16, %r10
1368 jg LABEL(nibble_ashr_12_use)
1369
1370 movdqa (%rdi, %rdx), %xmm0
1371 palignr $12, -16(%rdi, %rdx), D(%xmm0)
1372#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1373 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1374#else
1375 movdqa (%rsi,%rdx), %xmm1
1376 TOLOWER (%xmm0, %xmm1)
1377 pcmpistri $0x1a, %xmm1, %xmm0
1378#endif
1379 jbe LABEL(exit_use)
1380#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1381 sub $16, %r11
1382 jbe LABEL(strcmp_exitz)
1383#endif
1384 add $16, %rdx
1385 jmp LABEL(loop_ashr_12_use)
1386
1387 .p2align 4
/*
 * Page-boundary slow path for the 12-byte case, entered when r10 went
 * positive. Inspects the 4 tail bytes (16 - 12) of the block at
 * -16(rdi,rdx), which would start the next comparison window, for a
 * terminating zero byte before the loop may load the block that follows.
 */
1388LABEL(nibble_ashr_12_use):
/* Take one 4K page back off the page counter r10. */
1389 sub $0x1000, %r10
1390 movdqa -16(%rdi, %rdx), %xmm0
/* Keep only the upper 4 bytes, moved down to byte 0; the rest is zero. */
1391 psrldq $12, D(%xmm0)
/* Mode 0x3a on xmm0 against itself: ecx = index of its first zero byte. */
1392 pcmpistri $0x3a,%xmm0, %xmm0
1393#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited builds: index not below the budget r11 -> common exit. */
1394 cmp %r11, %rcx
1395 jae LABEL(nibble_ashr_exit_use)
1396#endif
/* ecx above 3 means all 4 tail bytes are nonzero: resume the loop. */
1397 cmp $3, %ecx
1398 ja LABEL(nibble_ashr_12_restart_use)
1399
/* Otherwise a zero byte is in the tail: finish in the common exit. */
1400 jmp LABEL(nibble_ashr_exit_use)
1401
1402/*
1403 * The following cases will be handled by ashr_13
1404 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1405 * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13
1406 */
/*
 * Entry for the 13-byte realignment case: checks the first partial block,
 * then initialises rcx, r9d, r10 and rdx and falls through into
 * loop_ashr_13_use.
 * NOTE(review): the contents of xmm0, xmm1, xmm2, edx, cl and rax on entry
 * come from dispatch code above this excerpt - confirm there.
 */
1407 .p2align 4
1408LABEL(ashr_13):
/* Shift xmm2 left by 3 bytes (3 = 16 - 13). */
1409 pslldq $3, D(%xmm2)
/* TOLOWER is a macro defined earlier in the file; presumably it folds case
   only in the case-insensitive builds - confirm at its definition. */
1410 TOLOWER (%xmm1, %xmm2)
/* xmm2 = byte-wise equality mask of xmm1 and xmm2, then minus xmm0. */
1411 pcmpeqb %xmm1, D(%xmm2)
1412 psubb %xmm0, D(%xmm2)
/* r9d = top bit of each byte of xmm2. */
1413 pmovmskb %xmm2, %r9d
/* Shift both bit masks right by cl and subtract; a nonzero result goes to
   less32bytes, which takes the lowest set bit of rdx as the byte index. */
1414 shr %cl, %edx
1415 shr %cl, %r9d
1416 sub %r9d, %edx
1417 jnz LABEL(less32bytes)
1418 movdqa (%rdi), %xmm3
1419
/* Macro defined outside this excerpt (presumably adjusts the length budget
   r11 in the length-limited builds - confirm). */
1420 UPDATE_STRNCMP_COUNTER
1421
1422 mov $16, %rcx /* index for loads */
1423 mov $13, %r9d /* byte position left over from less32bytes case */
1424 /*
1425 * Setup %r10 value allows us to detect crossing a page boundary.
1426 * When %r10 goes positive we have crossed a page boundary and
1427 * need to do a nibble.
1428 */
/* r10 = ((rdi + 13) & 0xfff) - 0x1000, a negative page-offset counter. */
1429 lea 13(%rdi), %r10
1430 and $0xfff, %r10 /* offset into 4K page */
1431 sub $0x1000, %r10 /* subtract 4K pagesize */
1432
1433 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1434
1435 .p2align 4
/*
 * Main compare loop for the 13-byte realignment case, unrolled twice.
 * Each step joins the block at -16(rdi,rdx) with the block at (rdi,rdx)
 * using palignr by 13, giving bytes 13..15 of the former followed by bytes
 * 0..12 of the latter, and compares that with the 16 bytes at (rsi,rdx).
 * rdx and r10 both advance by 16 per step; when r10 turns positive the
 * page-boundary path nibble_ashr_13_use is taken. In the length-limited
 * builds r11 drops by 16 per step and strcmp_exitz returns 0 once it is
 * used up.
 */
1436LABEL(loop_ashr_13_use):
1437 add $16, %r10
1438 jg LABEL(nibble_ashr_13_use)
1439
/* Re-entry point used by nibble_ashr_13_use when the loop may continue. */
1440LABEL(nibble_ashr_13_restart_use):
1441 movdqa (%rdi, %rdx), %xmm0
1442 palignr $13, -16(%rdi, %rdx), D(%xmm0)
1443#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1444 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1445#else
1446 movdqa (%rsi,%rdx), %xmm1
1447 TOLOWER (%xmm0, %xmm1)
1448 pcmpistri $0x1a, %xmm1, %xmm0
1449#endif
/* jbe is taken when CF or ZF is set: a difference or a terminator. */
1450 jbe LABEL(exit_use)
1451#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1452 sub $16, %r11
1453 jbe LABEL(strcmp_exitz)
1454#endif
1455
/* Second unrolled step: same sequence on the next 16-byte block. */
1456 add $16, %rdx
1457 add $16, %r10
1458 jg LABEL(nibble_ashr_13_use)
1459
1460 movdqa (%rdi, %rdx), %xmm0
1461 palignr $13, -16(%rdi, %rdx), D(%xmm0)
1462#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1463 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1464#else
1465 movdqa (%rsi,%rdx), %xmm1
1466 TOLOWER (%xmm0, %xmm1)
1467 pcmpistri $0x1a, %xmm1, %xmm0
1468#endif
1469 jbe LABEL(exit_use)
1470#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1471 sub $16, %r11
1472 jbe LABEL(strcmp_exitz)
1473#endif
1474 add $16, %rdx
1475 jmp LABEL(loop_ashr_13_use)
1476
1477 .p2align 4
/*
 * Page-boundary slow path for the 13-byte case, entered when r10 went
 * positive. Inspects the 3 tail bytes (16 - 13) of the block at
 * -16(rdi,rdx), which would start the next comparison window, for a
 * terminating zero byte before the loop may load the block that follows.
 */
1478LABEL(nibble_ashr_13_use):
/* Take one 4K page back off the page counter r10. */
1479 sub $0x1000, %r10
1480 movdqa -16(%rdi, %rdx), %xmm0
/* Keep only the upper 3 bytes, moved down to byte 0; the rest is zero. */
1481 psrldq $13, D(%xmm0)
/* Mode 0x3a on xmm0 against itself: ecx = index of its first zero byte. */
1482 pcmpistri $0x3a,%xmm0, %xmm0
1483#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited builds: index not below the budget r11 -> common exit. */
1484 cmp %r11, %rcx
1485 jae LABEL(nibble_ashr_exit_use)
1486#endif
/* ecx above 2 means all 3 tail bytes are nonzero: resume the loop. */
1487 cmp $2, %ecx
1488 ja LABEL(nibble_ashr_13_restart_use)
1489
/* Otherwise a zero byte is in the tail: finish in the common exit. */
1490 jmp LABEL(nibble_ashr_exit_use)
1491
1492/*
1493 * The following cases will be handled by ashr_14
1494 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1495 * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14
1496 */
/*
 * Entry for the 14-byte realignment case: checks the first partial block,
 * then initialises rcx, r9d, r10 and rdx and falls through into
 * loop_ashr_14_use.
 * NOTE(review): the contents of xmm0, xmm1, xmm2, edx, cl and rax on entry
 * come from dispatch code above this excerpt - confirm there.
 */
1497 .p2align 4
1498LABEL(ashr_14):
/* Shift xmm2 left by 2 bytes (2 = 16 - 14). */
1499 pslldq $2, D(%xmm2)
/* TOLOWER is a macro defined earlier in the file; presumably it folds case
   only in the case-insensitive builds - confirm at its definition. */
1500 TOLOWER (%xmm1, %xmm2)
/* xmm2 = byte-wise equality mask of xmm1 and xmm2, then minus xmm0. */
1501 pcmpeqb %xmm1, D(%xmm2)
1502 psubb %xmm0, D(%xmm2)
/* r9d = top bit of each byte of xmm2. */
1503 pmovmskb %xmm2, %r9d
/* Shift both bit masks right by cl and subtract; a nonzero result goes to
   less32bytes, which takes the lowest set bit of rdx as the byte index. */
1504 shr %cl, %edx
1505 shr %cl, %r9d
1506 sub %r9d, %edx
1507 jnz LABEL(less32bytes)
1508 movdqa (%rdi), %xmm3
1509
/* Macro defined outside this excerpt (presumably adjusts the length budget
   r11 in the length-limited builds - confirm). */
1510 UPDATE_STRNCMP_COUNTER
1511
1512 mov $16, %rcx /* index for loads */
1513 mov $14, %r9d /* byte position left over from less32bytes case */
1514 /*
1515 * Setup %r10 value allows us to detect crossing a page boundary.
1516 * When %r10 goes positive we have crossed a page boundary and
1517 * need to do a nibble.
1518 */
/* r10 = ((rdi + 14) & 0xfff) - 0x1000, a negative page-offset counter. */
1519 lea 14(%rdi), %r10
1520 and $0xfff, %r10 /* offset into 4K page */
1521 sub $0x1000, %r10 /* subtract 4K pagesize */
1522
1523 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1524
1525 .p2align 4
/*
 * Main compare loop for the 14-byte realignment case, unrolled twice.
 * Each step joins the block at -16(rdi,rdx) with the block at (rdi,rdx)
 * using palignr by 14, giving bytes 14..15 of the former followed by bytes
 * 0..13 of the latter, and compares that with the 16 bytes at (rsi,rdx).
 * rdx and r10 both advance by 16 per step; when r10 turns positive the
 * page-boundary path nibble_ashr_14_use is taken. In the length-limited
 * builds r11 drops by 16 per step and strcmp_exitz returns 0 once it is
 * used up.
 */
1526LABEL(loop_ashr_14_use):
1527 add $16, %r10
1528 jg LABEL(nibble_ashr_14_use)
1529
/* Re-entry point used by nibble_ashr_14_use when the loop may continue. */
1530LABEL(nibble_ashr_14_restart_use):
1531 movdqa (%rdi, %rdx), %xmm0
1532 palignr $14, -16(%rdi, %rdx), D(%xmm0)
1533#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1534 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1535#else
1536 movdqa (%rsi,%rdx), %xmm1
1537 TOLOWER (%xmm0, %xmm1)
1538 pcmpistri $0x1a, %xmm1, %xmm0
1539#endif
/* jbe is taken when CF or ZF is set: a difference or a terminator. */
1540 jbe LABEL(exit_use)
1541#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1542 sub $16, %r11
1543 jbe LABEL(strcmp_exitz)
1544#endif
1545
/* Second unrolled step: same sequence on the next 16-byte block. */
1546 add $16, %rdx
1547 add $16, %r10
1548 jg LABEL(nibble_ashr_14_use)
1549
1550 movdqa (%rdi, %rdx), %xmm0
1551 palignr $14, -16(%rdi, %rdx), D(%xmm0)
1552#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1553 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1554#else
1555 movdqa (%rsi,%rdx), %xmm1
1556 TOLOWER (%xmm0, %xmm1)
1557 pcmpistri $0x1a, %xmm1, %xmm0
1558#endif
1559 jbe LABEL(exit_use)
1560#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1561 sub $16, %r11
1562 jbe LABEL(strcmp_exitz)
1563#endif
1564 add $16, %rdx
1565 jmp LABEL(loop_ashr_14_use)
1566
1567 .p2align 4
/*
 * Page-boundary slow path for the 14-byte case, entered when r10 went
 * positive. Inspects the 2 tail bytes (16 - 14) of the block at
 * -16(rdi,rdx), which would start the next comparison window, for a
 * terminating zero byte before the loop may load the block that follows.
 */
1568LABEL(nibble_ashr_14_use):
/* Take one 4K page back off the page counter r10. */
1569 sub $0x1000, %r10
1570 movdqa -16(%rdi, %rdx), %xmm0
/* Keep only the upper 2 bytes, moved down to byte 0; the rest is zero. */
1571 psrldq $14, D(%xmm0)
/* Mode 0x3a on xmm0 against itself: ecx = index of its first zero byte. */
1572 pcmpistri $0x3a,%xmm0, %xmm0
1573#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited builds: index not below the budget r11 -> common exit. */
1574 cmp %r11, %rcx
1575 jae LABEL(nibble_ashr_exit_use)
1576#endif
/* ecx above 1 means both tail bytes are nonzero: resume the loop. */
1577 cmp $1, %ecx
1578 ja LABEL(nibble_ashr_14_restart_use)
1579
/* Otherwise a zero byte is in the tail: finish in the common exit. */
1580 jmp LABEL(nibble_ashr_exit_use)
1581
1582/*
1583 * The following cases will be handled by ashr_15
1584 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1585 * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15
1586 */
/*
 * Entry for the 15-byte realignment case: checks the first partial block,
 * then initialises rcx, r9d, r10 and rdx and falls through into
 * loop_ashr_15_use.
 * NOTE(review): the contents of xmm0, xmm1, xmm2, edx, cl and rax on entry
 * come from dispatch code above this excerpt - confirm there.
 */
1587 .p2align 4
1588LABEL(ashr_15):
/* Shift xmm2 left by 1 byte (1 = 16 - 15). */
1589 pslldq $1, D(%xmm2)
/* TOLOWER is a macro defined earlier in the file; presumably it folds case
   only in the case-insensitive builds - confirm at its definition. */
1590 TOLOWER (%xmm1, %xmm2)
/* xmm2 = byte-wise equality mask of xmm1 and xmm2, then minus xmm0. */
1591 pcmpeqb %xmm1, D(%xmm2)
1592 psubb %xmm0, D(%xmm2)
/* r9d = top bit of each byte of xmm2. */
1593 pmovmskb %xmm2, %r9d
/* Shift both bit masks right by cl and subtract; a nonzero result goes to
   less32bytes, which takes the lowest set bit of rdx as the byte index. */
1594 shr %cl, %edx
1595 shr %cl, %r9d
1596 sub %r9d, %edx
1597 jnz LABEL(less32bytes)
1598
1599 movdqa (%rdi), %xmm3
1600
/* Macro defined outside this excerpt (presumably adjusts the length budget
   r11 in the length-limited builds - confirm). */
1601 UPDATE_STRNCMP_COUNTER
1602
1603 mov $16, %rcx /* index for loads */
1604 mov $15, %r9d /* byte position left over from less32bytes case */
1605 /*
1606 * Setup %r10 value allows us to detect crossing a page boundary.
1607 * When %r10 goes positive we have crossed a page boundary and
1608 * need to do a nibble.
1609 */
/* r10 = ((rdi + 15) & 0xfff) - 0x1000, a negative page-offset counter. */
1610 lea 15(%rdi), %r10
1611 and $0xfff, %r10 /* offset into 4K page */
1612
1613 sub $0x1000, %r10 /* subtract 4K pagesize */
1614
1615 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1616
1617 .p2align 4
/*
 * Main compare loop for the 15-byte realignment case, unrolled twice.
 * Each step joins the block at -16(rdi,rdx) with the block at (rdi,rdx)
 * using palignr by 15, giving byte 15 of the former followed by bytes
 * 0..14 of the latter, and compares that with the 16 bytes at (rsi,rdx).
 * rdx and r10 both advance by 16 per step; when r10 turns positive the
 * page-boundary path nibble_ashr_15_use is taken. In the length-limited
 * builds r11 drops by 16 per step and strcmp_exitz returns 0 once it is
 * used up.
 */
1618LABEL(loop_ashr_15_use):
1619 add $16, %r10
1620 jg LABEL(nibble_ashr_15_use)
1621
/* Re-entry point used by nibble_ashr_15_use when the loop may continue. */
1622LABEL(nibble_ashr_15_restart_use):
1623 movdqa (%rdi, %rdx), %xmm0
1624 palignr $15, -16(%rdi, %rdx), D(%xmm0)
1625#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1626 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1627#else
1628 movdqa (%rsi,%rdx), %xmm1
1629 TOLOWER (%xmm0, %xmm1)
1630 pcmpistri $0x1a, %xmm1, %xmm0
1631#endif
/* jbe is taken when CF or ZF is set: a difference or a terminator. */
1632 jbe LABEL(exit_use)
1633#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1634 sub $16, %r11
1635 jbe LABEL(strcmp_exitz)
1636#endif
1637
/* Second unrolled step: same sequence on the next 16-byte block. */
1638 add $16, %rdx
1639 add $16, %r10
1640 jg LABEL(nibble_ashr_15_use)
1641
1642 movdqa (%rdi, %rdx), %xmm0
1643 palignr $15, -16(%rdi, %rdx), D(%xmm0)
1644#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1645 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1646#else
1647 movdqa (%rsi,%rdx), %xmm1
1648 TOLOWER (%xmm0, %xmm1)
1649 pcmpistri $0x1a, %xmm1, %xmm0
1650#endif
1651 jbe LABEL(exit_use)
1652#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1653 sub $16, %r11
1654 jbe LABEL(strcmp_exitz)
1655#endif
1656 add $16, %rdx
1657 jmp LABEL(loop_ashr_15_use)
1658
1659 .p2align 4
/*
 * Page-boundary slow path for the 15-byte case, entered when r10 went
 * positive. Inspects the single tail byte (16 - 15) of the block at
 * -16(rdi,rdx), which would start the next comparison window, for a
 * terminating zero byte before the loop may load the block that follows.
 * Unlike the other cases there is no jump at the end: execution falls
 * through into nibble_ashr_exit_use, which directly follows.
 */
1660LABEL(nibble_ashr_15_use):
/* Take one 4K page back off the page counter r10. */
1661 sub $0x1000, %r10
1662 movdqa -16(%rdi, %rdx), %xmm0
/* Keep only the top byte, moved down to byte 0; the rest is zero. */
1663 psrldq $15, D(%xmm0)
/* Mode 0x3a on xmm0 against itself: ecx = index of its first zero byte. */
1664 pcmpistri $0x3a,%xmm0, %xmm0
1665#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited builds: index not below the budget r11 -> common exit. */
1666 cmp %r11, %rcx
1667 jae LABEL(nibble_ashr_exit_use)
1668#endif
/* ecx above 0 means the tail byte is nonzero: resume the loop. */
1669 cmp $0, %ecx
1670 ja LABEL(nibble_ashr_15_restart_use)
1671
1672LABEL(nibble_ashr_exit_use):
/*
 * Shared tail of the nibble_ashr_N_use paths: compare the zero-padded
 * tail bytes held in xmm0 with the 16 bytes at (rsi,rdx), then fall
 * through into exit_use with the resulting flags and ecx.
 */
1673#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1674 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
1675#else
1676 movdqa (%rsi,%rdx), %xmm1
1677 TOLOWER (%xmm0, %xmm1)
1678 pcmpistri $0x1a, %xmm1, %xmm0
1679#endif
1680 .p2align 4
/*
 * exit_use: common exit of the ashr_N loops. Entered with the flags and
 * ecx of the last pcmpistri in mode 0x1a (ecx = offset of the first
 * differing byte, see the table at the top of the file). Returns in eax
 * either 0 or the difference of the two bytes at that offset.
 */
1681LABEL(exit_use):
/* CF clear: no differing byte was reported, so the result is 0. */
1682 jnc LABEL(strcmp_exitz)
1683#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
/* Length-limited builds: a difference at or past the budget r11 gives 0. */
1684 sub %rcx, %r11
1685 jbe LABEL(strcmp_exitz)
1686#endif
/* rdx = offset of the differing byte relative to rsi. */
1687 add %rcx, %rdx
/* rdi = rdi + r9 - 16, where r9 is the shift amount set by the ashr_N
   entry, so that (rdi,rdx) addresses the matching byte on the rdi side. */
1688 lea -16(%rdi, %r9), %rdi
1689 movzbl (%rdi, %rdx), %eax
1690 movzbl (%rsi, %rdx), %edx
/* Nonzero r8d is the operand-swap flag (compare less32bytes): swap the
   two bytes back so the sign of the result is right. */
1691 test %r8d, %r8d
1692 jz LABEL(ret_use)
1693 xchg %eax, %edx
1694LABEL(ret_use):
1695#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
/* Case-insensitive builds: map both bytes through the table of 4-byte
   entries at _nl_C_LC_CTYPE_tolower plus 128 entries before subtracting. */
1696 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
1697 movl (%rcx,%rdx,4), %edx
1698 movl (%rcx,%rax,4), %eax
1699#endif
1700
/* eax = rdi-side byte minus rsi-side byte. */
1701 sub %edx, %eax
1702 ret
1703
1704LABEL(less32bytes):
/*
 * Exit used when the first-block check of an ashr_N entry found a
 * difference or terminator. Adds the saved offsets rax and rcx back to
 * rdi and rsi, undoes the operand swap when r8d is nonzero, then falls
 * through into ret/less16bytes with the bit mask still in rdx.
 */
1705 lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */
1706 lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */
1707 test %r8d, %r8d
1708 jz LABEL(ret)
1709 xchg %rsi, %rdi /* recover original order according to flag(%r8d) */
1710
1711 .p2align 4
/*
 * ret / less16bytes: rdx holds a bit mask; the index of its lowest set
 * bit is used as the byte offset at which the two strings are compared.
 * Returns in eax the byte at rdi minus the byte at rsi at that offset,
 * or 0 in the length-limited builds when the offset is not below r11.
 */
1712LABEL(ret):
1713LABEL(less16bytes):
1714 bsf %rdx, %rdx /* find and store bit index in %rdx */
1715
1716#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1717 sub %rdx, %r11
1718 jbe LABEL(strcmp_exitz)
1719#endif
1720 movzbl (%rsi, %rdx), %ecx
1721 movzbl (%rdi, %rdx), %eax
1722
1723#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
/* Case-insensitive builds: map both bytes through the table of 4-byte
   entries at _nl_C_LC_CTYPE_tolower plus 128 entries before subtracting. */
1724 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
1725 movl (%rdx,%rcx,4), %ecx
1726 movl (%rdx,%rax,4), %eax
1727#endif
1728
1729 sub %ecx, %eax
1730 ret
1731
1732LABEL(strcmp_exitz):
/* Return 0 in eax: the strings compared equal, or (in the length-limited
   builds) the remaining length budget was used up before a difference. */
1733 xor %eax, %eax
1734 ret
1735
1736 .p2align 4
1737 // XXX Same as code above
/*
 * Byte0: return in eax the byte at rdi minus the byte at rsi, i.e. the
 * same tail as less16bytes with a zero offset. The jumps to this label
 * are outside this excerpt.
 */
1738LABEL(Byte0):
1739 movzx (%rsi), %ecx
1740 movzx (%rdi), %eax
1741
1742#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
/* Case-insensitive builds: map both bytes through the table of 4-byte
   entries at _nl_C_LC_CTYPE_tolower plus 128 entries before subtracting. */
1743 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
1744 movl (%rdx,%rcx,4), %ecx
1745 movl (%rdx,%rax,4), %eax
1746#endif
1747
1748 sub %ecx, %eax
1749 ret
/* End of STRCMP_SSE42: close the CFI region and record the symbol size. */
1750 cfi_endproc
1751 .size STRCMP_SSE42, .-STRCMP_SSE42
1752
/* Remove the TOLOWER macro and its three register helper macros now that
   the function body is complete. NOTE(review): their definitions are
   outside this excerpt; presumably they belong to the case-insensitive
   builds - confirm. */
1753#undef UCLOW_reg
1754#undef UCHIGH_reg
1755#undef LCQWORD_reg
1756#undef TOLOWER
1757
1758 /* Put all SSE 4.2 functions together. */
/*
 * unaligned_table: sixteen 32-bit entries in a read-only section, aligned
 * to 8 bytes. Each entry is the distance from the table itself to one
 * ashr_N handler, stored in the order ashr_1 .. ashr_15 with ashr_0 last.
 * The dispatch code that indexes this table is outside this excerpt.
 */
1759 .section .rodata.SECTION,"a",@progbits
1760 .p2align 3
1761LABEL(unaligned_table):
1762 .int LABEL(ashr_1) - LABEL(unaligned_table)
1763 .int LABEL(ashr_2) - LABEL(unaligned_table)
1764 .int LABEL(ashr_3) - LABEL(unaligned_table)
1765 .int LABEL(ashr_4) - LABEL(unaligned_table)
1766 .int LABEL(ashr_5) - LABEL(unaligned_table)
1767 .int LABEL(ashr_6) - LABEL(unaligned_table)
1768 .int LABEL(ashr_7) - LABEL(unaligned_table)
1769 .int LABEL(ashr_8) - LABEL(unaligned_table)
1770 .int LABEL(ashr_9) - LABEL(unaligned_table)
1771 .int LABEL(ashr_10) - LABEL(unaligned_table)
1772 .int LABEL(ashr_11) - LABEL(unaligned_table)
1773 .int LABEL(ashr_12) - LABEL(unaligned_table)
1774 .int LABEL(ashr_13) - LABEL(unaligned_table)
1775 .int LABEL(ashr_14) - LABEL(unaligned_table)
1776 .int LABEL(ashr_15) - LABEL(unaligned_table)
/* ashr_0 is the sixteenth entry (index 15), after ashr_15. */
1777 .int LABEL(ashr_0) - LABEL(unaligned_table)
1778
/*
 * Undefine the label and section helper macros and the macros that carry
 * instruction names. NOTE(review): names such as movdqa and pcmpistri
 * being undefined here implies they were defined as macros earlier in the
 * file (presumably to select an alternative encoding) - confirm there.
 */
1779#undef LABEL
1780#undef GLABEL
1781#undef SECTION
1782#undef movdqa
1783#undef movdqu
1784#undef pmovmskb
1785#undef pcmpistri
1786#undef psubb
1787#undef pcmpeqb
1788#undef psrldq
1789#undef pslldq
1790#undef palignr
1791#undef pxor
1792#undef D
1793