1/* strcmp with SSE4.2
2 Copyright (C) 2009-2018 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <http://www.gnu.org/licenses/>. */
19
20#include <sysdep.h>
21
22#ifndef STRCMP_SSE42
/* Default symbol name for the function defined in this file; it is only
   used when STRCMP_SSE42 has not already been defined.  */
23# define STRCMP_SSE42 __strcmp_sse42
24#endif
25
26#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
27# include "locale-defines.h"
28#endif
29
30#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
31/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
32 if the new counter > the old one or is 0.

   The macro computes %r9 = %r11 + %rcx - 16.  If that value is above
   the old %r11 (unsigned compare, i.e. the subtraction wrapped) or is
   zero, control goes to strcmp_exitz; otherwise %r9 becomes the new
   %r11.  Clobbers %r9 and the flags.  */
33# define UPDATE_STRNCMP_COUNTER \
34 /* calculate left number to compare */ \
35 lea -16(%rcx, %r11), %r9; \
36 cmp %r9, %r11; \
37 jb LABEL(strcmp_exitz); \
38 test %r9, %r9; \
39 je LABEL(strcmp_exitz); \
40 mov %r9, %r11
41#else
/* Builds without a length argument have no counter to maintain, so the
   macro expands to nothing.  */
42# define UPDATE_STRNCMP_COUNTER
43#endif
44
45#ifdef USE_AVX
/* SECTION is used as the suffix of the .text.SECTION section name;
   GLABEL(l) appends the matching variant suffix to a global symbol.  */
46# define SECTION avx
47# define GLABEL(l) l##_avx
48#else
49# define SECTION sse4.2
50# define GLABEL(l) l##_sse42
51#endif
52
/* LABEL(l) pastes a ".L" prefix onto l; all branch targets inside this
   file are written through it.  */
53#define LABEL(l) .L##l
54
55/* We use 0x1a:
56 _SIDD_SBYTE_OPS
57 | _SIDD_CMP_EQUAL_EACH
58 | _SIDD_NEGATIVE_POLARITY
59 | _SIDD_LEAST_SIGNIFICANT
60 on pcmpistri to find out if two 16byte data elements are the same
61 and the offset of the first different byte. There are 4 cases:
62
63 1. Both 16byte data elements are valid and identical.
64 2. Both 16byte data elements have EOS and are identical.
65 3. Both 16byte data elements are valid and they differ at offset X.
66 4. At least one 16byte data element has EOS at offset X. Two 16byte
67 data elements must differ at or before offset X.
68
69 Here is the table of ECX, CFlag, ZFlag and SFlag for 4 cases:
70
71 case ECX CFlag ZFlag SFlag
72 1 16 0 0 0
73 2 16 0 1 1
74 3 X 1 0 0
75 4 0 <= X 1 0/1 0/1
76
77 We exit from the loop for cases 2, 3 and 4 with jbe which branches
78 when either CFlag or ZFlag is 1. If CFlag == 0, we return 0 for
79 case 2. */
80
81 /* Put all SSE 4.2 functions together. */
82 .section .text.SECTION,"ax",@progbits
83 .align 16
84 .type STRCMP_SSE42, @function
85 .globl STRCMP_SSE42
86 .hidden STRCMP_SSE42
87#ifdef USE_AS_STRCASECMP_L
88ENTRY (GLABEL(__strcasecmp))
 /* Entry without a locale argument: read the pointer stored in the
    thread-local __libc_tsd_LOCALE slot (offset fetched via @gottpoff,
    slot addressed through %fs) into %rdx, which is where the code
    that follows expects its locale pointer.  */
89 movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
90 mov %fs:(%rax),%RDX_LP
91
92 // XXX 5 byte should be before the function
93 /* 5-byte NOP. */
94 .byte 0x0f,0x1f,0x44,0x00,0x00
95END (GLABEL(__strcasecmp))
96 /* FALLTHROUGH to strcasecmp_l. */
97#endif
98#ifdef USE_AS_STRNCASECMP_L
99ENTRY (GLABEL(__strncasecmp))
 /* Same as above for the length-limited variant, except that the
    locale pointer is placed in %rcx (%rdx holds the length there).  */
100 movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
101 mov %fs:(%rax),%RCX_LP
102
103 // XXX 5 byte should be before the function
104 /* 5-byte NOP. */
105 .byte 0x0f,0x1f,0x44,0x00,0x00
106END (GLABEL(__strncasecmp))
107 /* FALLTHROUGH to strncasecmp_l. */
108#endif
109
110
111#ifdef USE_AVX
/* In the AVX build every legacy SSE mnemonic used below is redirected
   to its v-prefixed counterpart, so the same instruction stream serves
   both builds.  */
112# define movdqa vmovdqa
113# define movdqu vmovdqu
114# define pmovmskb vpmovmskb
115# define pcmpistri vpcmpistri
116# define psubb vpsubb
117# define pcmpeqb vpcmpeqb
118# define psrldq vpsrldq
119# define pslldq vpslldq
120# define palignr vpalignr
121# define pxor vpxor
/* D(arg) writes the operand twice, turning the two-operand SSE form
   "op src, dst" into the three-operand form "op src, dst, dst".  */
122# define D(arg) arg, arg
123#else
/* SSE build: D(arg) is just the operand itself.  */
124# define D(arg) arg
125#endif
126
127STRCMP_SSE42:
128 cfi_startproc
129 CALL_MCOUNT
130
131/*
132 * This implementation uses SSE to compare up to 16 bytes at a time.
 *
 * Registers as used by this entry sequence:
 *   %rdi, %rsi   the two strings being compared
 *   %rdx         locale pointer (strcasecmp_l build) or maximum
 *                length (strncmp and strncasecmp_l builds)
 *   %rcx         locale pointer (strncasecmp_l build)
 *   %r11         copy of the length; the bytes still to be compared
 *   %ecx, %eax   offset of %rsi / %rdi inside a 64-byte cache line
 *   %xmm4-%xmm6  case-folding constants (case-insensitive builds)
133 */
134#ifdef USE_AS_STRCASECMP_L
135 /* We have to fall back on the C implementation for locales
136 with encodings not matching ASCII for single bytes. */
137# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
138 mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP
139# else
140 mov (%rdx), %RAX_LP
141# endif
142 testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
143 jne __strcasecmp_l_nonascii
144#endif
145#ifdef USE_AS_STRNCASECMP_L
146 /* We have to fall back on the C implementation for locales
147 with encodings not matching ASCII for single bytes. */
148# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
149 mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP
150# else
151 mov (%rcx), %RAX_LP
152# endif
153 testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
154 jne __strncasecmp_l_nonascii
155#endif
156
157#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* Length 0 goes straight to strcmp_exitz and length 1 to Byte0
    (both defined outside this excerpt; presumably "return 0" and
    "compare the first byte only" -- confirm there).  Any other
    length is kept in %r11 as the remaining byte count.  */
158 test %RDX_LP, %RDX_LP
159 je LABEL(strcmp_exitz)
160 cmp $1, %RDX_LP
161 je LABEL(Byte0)
162 mov %RDX_LP, %R11_LP
163#endif
164 mov %esi, %ecx
165 mov %edi, %eax
166/* Use 64bit AND here to avoid long NOP padding. */
167 and $0x3f, %rcx /* rsi alignment in cache line */
168 and $0x3f, %rax /* rdi alignment in cache line */
169#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 /* 16-byte constants consumed by TOLOWER, emitted into .rodata.cst16;
    .previous then switches back to the code section.  */
170 .section .rodata.cst16,"aM",@progbits,16
171 .align 16
 /* 0x40 in every byte: one below 'A' (0x41).  */
172LABEL(belowupper):
173 .quad 0x4040404040404040
174 .quad 0x4040404040404040
 /* Upper bound for 'A'..'Z'.  The AVX TOLOWER tests "byte > bound" and
    so uses 'Z' (0x5a); the SSE TOLOWER tests "bound > byte" and so uses
    'Z' + 1 (0x5b).  */
175LABEL(topupper):
176# ifdef USE_AVX
177 .quad 0x5a5a5a5a5a5a5a5a
178 .quad 0x5a5a5a5a5a5a5a5a
179# else
180 .quad 0x5b5b5b5b5b5b5b5b
181 .quad 0x5b5b5b5b5b5b5b5b
182# endif
 /* 0x20 in every byte: the bit TOLOWER ORs into upper-case letters.  */
183LABEL(touppermask):
184 .quad 0x2020202020202020
185 .quad 0x2020202020202020
186 .previous
 /* Keep the three constants in %xmm4-%xmm6 for the whole function.  */
187 movdqa LABEL(belowupper)(%rip), %xmm4
188# define UCLOW_reg %xmm4
189 movdqa LABEL(topupper)(%rip), %xmm5
190# define UCHIGH_reg %xmm5
191 movdqa LABEL(touppermask)(%rip), %xmm6
192# define LCQWORD_reg %xmm6
193#endif
194 cmp $0x30, %ecx
195 ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */
196 cmp $0x30, %eax
197 ja LABEL(crosscache)/* rdi: 16-byte load will cross cache line */
198 movdqu (%rdi), %xmm1
199 movdqu (%rsi), %xmm2
200#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
201# ifdef USE_AVX
/* TOLOWER(reg1, reg2): OR 0x20 into every byte of reg1 and reg2 that is
   greater than UCLOW_reg and not greater than UCHIGH_reg.  Clobbers
   %xmm7-%xmm10.  */
202# define TOLOWER(reg1, reg2) \
203 vpcmpgtb UCLOW_reg, reg1, %xmm7; \
204 vpcmpgtb UCHIGH_reg, reg1, %xmm8; \
205 vpcmpgtb UCLOW_reg, reg2, %xmm9; \
206 vpcmpgtb UCHIGH_reg, reg2, %xmm10; \
207 vpandn %xmm7, %xmm8, %xmm8; \
208 vpandn %xmm9, %xmm10, %xmm10; \
209 vpand LCQWORD_reg, %xmm8, %xmm8; \
210 vpand LCQWORD_reg, %xmm10, %xmm10; \
211 vpor reg1, %xmm8, reg1; \
212 vpor reg2, %xmm10, reg2
213# else
/* TOLOWER(reg1, reg2): OR 0x20 into every byte of reg1 and reg2 that is
   greater than UCLOW_reg and less than UCHIGH_reg.  Clobbers
   %xmm7-%xmm10.  */
214# define TOLOWER(reg1, reg2) \
215 movdqa reg1, %xmm7; \
216 movdqa UCHIGH_reg, %xmm8; \
217 movdqa reg2, %xmm9; \
218 movdqa UCHIGH_reg, %xmm10; \
219 pcmpgtb UCLOW_reg, %xmm7; \
220 pcmpgtb reg1, %xmm8; \
221 pcmpgtb UCLOW_reg, %xmm9; \
222 pcmpgtb reg2, %xmm10; \
223 pand %xmm8, %xmm7; \
224 pand %xmm10, %xmm9; \
225 pand LCQWORD_reg, %xmm7; \
226 pand LCQWORD_reg, %xmm9; \
227 por %xmm7, reg1; \
228 por %xmm9, reg2
229# endif
230 TOLOWER (%xmm1, %xmm2)
231#else
/* Case-sensitive builds: no folding, the macro expands to nothing.  */
232# define TOLOWER(reg1, reg2)
233#endif
234 pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char checks */
235 pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
236 pcmpeqb %xmm2, D(%xmm1) /* compare first 16 bytes for equality */
237 psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/
238 pmovmskb %xmm1, %edx
239 sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */
240 jnz LABEL(less16bytes)/* If not, find different value or null char */
241#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
242 sub $16, %r11
243 jbe LABEL(strcmp_exitz)/* finish comparison */
244#endif
245 add $16, %rsi /* prepare to search next 16 bytes */
246 add $16, %rdi /* prepare to search next 16 bytes */
247
248 /*
249 * Determine source and destination string offsets from 16-byte
250 * alignment. Use relative offset difference between the two to
251 * determine which case below to use.
 *
 * Both pointers are rounded down to a 16-byte boundary and their
 * former low four bits are kept in %ecx (for %rsi) and %eax (for
 * %rdi).  Equal offsets go to ashr_0.  Otherwise the pointers and
 * offsets are exchanged when needed so that %ecx is the larger
 * offset, %r8d records whether that exchange happened (0xffff if
 * so, 0 if not), and the handler is chosen through unaligned_table
 * using the index 15 + %rax - %rcx.
252 */
253 .p2align 4
254LABEL(crosscache):
255 and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */
256 and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */
257 mov $0xffff, %edx /* for equivalent offset */
258 xor %r8d, %r8d
259 and $0xf, %ecx /* offset of rsi */
260 and $0xf, %eax /* offset of rdi */
261 pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char check */
262 cmp %eax, %ecx
263 je LABEL(ashr_0) /* rsi and rdi relative offset same */
264 ja LABEL(bigger)
265 mov %edx, %r8d /* r8d is offset flag for exit tail */
266 xchg %ecx, %eax
267 xchg %rsi, %rdi
268LABEL(bigger):
 /* First aligned 16-byte block of each string.  */
269 movdqa (%rdi), %xmm2
270 movdqa (%rsi), %xmm1
271 lea 15(%rax), %r9
272 sub %rcx, %r9
 /* The table holds 32-bit entries that are sign-extended and added
    to the table's own address to form the jump target.  */
273 lea LABEL(unaligned_table)(%rip), %r10
274 movslq (%r10, %r9,4), %r9
275 pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
276 lea (%r10, %r9), %r10
277 jmp *%r10 /* jump to corresponding case */
278
279/*
280 * The following cases will be handled by ashr_0
281 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
282 * n(0~15) n(0~15) 15(15+ n-n) ashr_0
283 */
284 .p2align 4
285LABEL(ashr_0):
 /*
  * Both strings start at the same offset inside their 16-byte
  * blocks, so every block of one string lines up with a block of
  * the other and all loads below are aligned; no byte shifting is
  * required.  On entry %edx = 0xffff, %cl = that common offset and
  * %xmm0 = 0 (set at LABEL(crosscache)).
  */
286
287 movdqa (%rsi), %xmm1
288 pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */
289#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
290 pcmpeqb (%rdi), D(%xmm1) /* compare 16 bytes for equality */
291#else
292 movdqa (%rdi), %xmm2
293 TOLOWER (%xmm1, %xmm2)
294 pcmpeqb %xmm2, D(%xmm1) /* compare 16 bytes for equality */
295#endif
296 psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/
297 pmovmskb %xmm1, %r9d
298 shr %cl, %edx /* adjust 0xffff for offset */
299 shr %cl, %r9d /* adjust for 16-byte offset */
300 sub %r9d, %edx
301 /*
302 * edx must be the same with r9d if in left byte (16-rcx) is equal to
303 * the start from (16-rax) and no null char was seen.
304 */
305 jne LABEL(less32bytes) /* mismatch or null char */
306 UPDATE_STRNCMP_COUNTER
307 mov $16, %rcx
308 mov $16, %r9
309
310 /*
311 * Now both strings are aligned at 16-byte boundary. Loop over strings
312 * checking 32-bytes per iteration.
313 */
314 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
315 .p2align 4
316LABEL(ashr_0_use):
 /* First 16-byte step of the iteration; %rdx is the running byte
    index into both strings.  */
317 movdqa (%rdi,%rdx), %xmm0
318#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
319 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
320#else
321 movdqa (%rsi,%rdx), %xmm1
322 TOLOWER (%xmm0, %xmm1)
323 pcmpistri $0x1a, %xmm1, %xmm0
324#endif
325 lea 16(%rdx), %rdx /* lea: advance the index without touching the flags */
326 jbe LABEL(ashr_0_exit_use)
327#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
328 sub $16, %r11
329 jbe LABEL(strcmp_exitz)
330#endif
331
 /* Second 16-byte step, identical to the first.  */
332 movdqa (%rdi,%rdx), %xmm0
333#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
334 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
335#else
336 movdqa (%rsi,%rdx), %xmm1
337 TOLOWER (%xmm0, %xmm1)
338 pcmpistri $0x1a, %xmm1, %xmm0
339#endif
340 lea 16(%rdx), %rdx /* lea: advance the index without touching the flags */
341 jbe LABEL(ashr_0_exit_use)
342#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
343 sub $16, %r11
344 jbe LABEL(strcmp_exitz)
345#endif
346 jmp LABEL(ashr_0_use)
347
348
349 .p2align 4
350LABEL(ashr_0_exit_use):
 /*
  * Exit of the ashr_0 loop, entered with the flags and %ecx left by
  * pcmpistri and with %rdx already advanced past the block that was
  * just compared.  CFlag == 0 means no differing byte was found, so
  * the strcmp_exitz path is taken.  Otherwise %rcx is the offset of
  * the first difference inside that block; for the length-limited
  * builds strcmp_exitz is also taken when the remaining count %r11
  * does not reach beyond that offset.
  */
351 jnc LABEL(strcmp_exitz)
352#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
353 sub %rcx, %r11
354 jbe LABEL(strcmp_exitz)
355#endif
 /* -16 undoes the index advance done in the loop.  */
356 lea -16(%rdx, %rcx), %rcx
357 movzbl (%rdi, %rcx), %eax
358 movzbl (%rsi, %rcx), %edx
359#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 /* Map both bytes through _nl_C_LC_CTYPE_tolower (4-byte entries,
    indexed from an origin 128 entries into the table).  */
360 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
361 movl (%rcx,%rax,4), %eax
362 movl (%rcx,%rdx,4), %edx
363#endif
 /* Result: difference of the two bytes, returned in %eax.  */
364 sub %edx, %eax
365 ret
366
367
368
369/*
370 * The following cases will be handled by ashr_1
371 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
372 * n(15) n -15 0(15 +(n-15) - n) ashr_1
373 */
374 .p2align 4
375LABEL(ashr_1):
 /*
  * State set up at LABEL(crosscache)/LABEL(bigger): %xmm1 = first
  * aligned block of %rsi, %xmm2 = first aligned block of %rdi,
  * %xmm0 = null-byte mask of %xmm1, %edx = 0xffff, %cl = offset of
  * %rsi inside its block.
  */
376 pslldq $15, D(%xmm2) /* shift first string to align with second */
377 TOLOWER (%xmm1, %xmm2)
378 pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */
379 psubb %xmm0, D(%xmm2) /* packed sub of comparison results*/
380 pmovmskb %xmm2, %r9d
381 shr %cl, %edx /* adjust 0xffff for offset */
382 shr %cl, %r9d /* adjust for 16-byte offset */
383 sub %r9d, %edx
384 jnz LABEL(less32bytes) /* mismatch or null char seen */
385 movdqa (%rdi), %xmm3
386 UPDATE_STRNCMP_COUNTER
387
388 mov $16, %rcx /* index for loads*/
389 mov $1, %r9d /* byte position left over from less32bytes case */
390 /*
391 * Setup %r10 value allows us to detect crossing a page boundary.
392 * When %r10 goes positive we have crossed a page boundary and
393 * need to do a nibble.
394 */
395 lea 1(%rdi), %r10
396 and $0xfff, %r10 /* offset into 4K page */
397 sub $0x1000, %r10 /* subtract 4K pagesize */
398 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
399
400 .p2align 4
401LABEL(loop_ashr_1_use):
 /* Two 16-byte steps per iteration.  Each step builds its %rdi
    operand with palignr from the current and the previous aligned
    block, skipping the first 1 byte(s) of the previous one.  */
402 add $16, %r10
403 jg LABEL(nibble_ashr_1_use)
404
405LABEL(nibble_ashr_1_restart_use):
406 movdqa (%rdi, %rdx), %xmm0
407 palignr $1, -16(%rdi, %rdx), D(%xmm0)
408#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
409 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
410#else
411 movdqa (%rsi,%rdx), %xmm1
412 TOLOWER (%xmm0, %xmm1)
413 pcmpistri $0x1a, %xmm1, %xmm0
414#endif
415 jbe LABEL(exit_use)
416#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
417 sub $16, %r11
418 jbe LABEL(strcmp_exitz)
419#endif
420
421 add $16, %rdx
422 add $16, %r10
423 jg LABEL(nibble_ashr_1_use)
424
425 movdqa (%rdi, %rdx), %xmm0
426 palignr $1, -16(%rdi, %rdx), D(%xmm0)
427#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
428 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
429#else
430 movdqa (%rsi,%rdx), %xmm1
431 TOLOWER (%xmm0, %xmm1)
432 pcmpistri $0x1a, %xmm1, %xmm0
433#endif
434 jbe LABEL(exit_use)
435#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
436 sub $16, %r11
437 jbe LABEL(strcmp_exitz)
438#endif
439 add $16, %rdx
440 jmp LABEL(loop_ashr_1_use)
441
442 .p2align 4
443LABEL(nibble_ashr_1_use):
 /*
  * %r10 went positive, i.e. a page boundary is reached on the %rdi
  * side.  Re-arm %r10 for the next page, then inspect the bytes of
  * the previous %rdi block that are still pending (psrldq drops the
  * 1 low byte(s)).  The loop is resumed only when %ecx is above
  * 14; otherwise, or in the length-limited builds when %rcx >= %r11,
  * control goes to nibble_ashr_exit_use.
  * NOTE(review): pcmpistri $0x3a of a register against itself
  * presumably leaves the index of the first null byte in %ecx --
  * confirm against the Intel SDM.
  */
444 sub $0x1000, %r10
445 movdqa -16(%rdi, %rdx), %xmm0
446 psrldq $1, D(%xmm0)
447 pcmpistri $0x3a,%xmm0, %xmm0
448#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
449 cmp %r11, %rcx
450 jae LABEL(nibble_ashr_exit_use)
451#endif
452 cmp $14, %ecx
453 ja LABEL(nibble_ashr_1_restart_use)
454
455 jmp LABEL(nibble_ashr_exit_use)
456
457/*
458 * The following cases will be handled by ashr_2
459 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
460 * n(14~15) n -14 1(15 +(n-14) - n) ashr_2
461 */
462 .p2align 4
463LABEL(ashr_2):
 /*
  * State set up at LABEL(crosscache)/LABEL(bigger): %xmm1 = first
  * aligned block of %rsi, %xmm2 = first aligned block of %rdi,
  * %xmm0 = null-byte mask of %xmm1, %edx = 0xffff, %cl = offset of
  * %rsi inside its block.  %xmm2 is shifted left by 14 bytes to
  * line up with %xmm1 and only result bits from offset %cl upwards
  * are kept.
  */
464 pslldq $14, D(%xmm2)
465 TOLOWER (%xmm1, %xmm2)
466 pcmpeqb %xmm1, D(%xmm2)
467 psubb %xmm0, D(%xmm2)
468 pmovmskb %xmm2, %r9d
469 shr %cl, %edx
470 shr %cl, %r9d
471 sub %r9d, %edx
472 jnz LABEL(less32bytes)
473 movdqa (%rdi), %xmm3
474 UPDATE_STRNCMP_COUNTER
475
476 mov $16, %rcx /* index for loads */
477 mov $2, %r9d /* byte position left over from less32bytes case */
478 /*
479 * Setup %r10 value allows us to detect crossing a page boundary.
480 * When %r10 goes positive we have crossed a page boundary and
481 * need to do a nibble.
482 */
483 lea 2(%rdi), %r10
484 and $0xfff, %r10 /* offset into 4K page */
485 sub $0x1000, %r10 /* subtract 4K pagesize */
486 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
487
488 .p2align 4
489LABEL(loop_ashr_2_use):
 /* Two 16-byte steps per iteration.  Each step builds its %rdi
    operand with palignr from the current and the previous aligned
    block, skipping the first 2 bytes of the previous one.  */
490 add $16, %r10
491 jg LABEL(nibble_ashr_2_use)
492
493LABEL(nibble_ashr_2_restart_use):
494 movdqa (%rdi, %rdx), %xmm0
495 palignr $2, -16(%rdi, %rdx), D(%xmm0)
496#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
497 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
498#else
499 movdqa (%rsi,%rdx), %xmm1
500 TOLOWER (%xmm0, %xmm1)
501 pcmpistri $0x1a, %xmm1, %xmm0
502#endif
503 jbe LABEL(exit_use)
504#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
505 sub $16, %r11
506 jbe LABEL(strcmp_exitz)
507#endif
508
509 add $16, %rdx
510 add $16, %r10
511 jg LABEL(nibble_ashr_2_use)
512
513 movdqa (%rdi, %rdx), %xmm0
514 palignr $2, -16(%rdi, %rdx), D(%xmm0)
515#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
516 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
517#else
518 movdqa (%rsi,%rdx), %xmm1
519 TOLOWER (%xmm0, %xmm1)
520 pcmpistri $0x1a, %xmm1, %xmm0
521#endif
522 jbe LABEL(exit_use)
523#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
524 sub $16, %r11
525 jbe LABEL(strcmp_exitz)
526#endif
527 add $16, %rdx
528 jmp LABEL(loop_ashr_2_use)
529
530 .p2align 4
531LABEL(nibble_ashr_2_use):
 /*
  * %r10 went positive, i.e. a page boundary is reached on the %rdi
  * side.  Re-arm %r10 for the next page, then inspect the bytes of
  * the previous %rdi block that are still pending (psrldq drops the
  * 2 low bytes).  The loop is resumed only when %ecx is above 13;
  * otherwise, or in the length-limited builds when %rcx >= %r11,
  * control goes to nibble_ashr_exit_use.
  * NOTE(review): pcmpistri $0x3a of a register against itself
  * presumably leaves the index of the first null byte in %ecx --
  * confirm against the Intel SDM.
  */
532 sub $0x1000, %r10
533 movdqa -16(%rdi, %rdx), %xmm0
534 psrldq $2, D(%xmm0)
535 pcmpistri $0x3a,%xmm0, %xmm0
536#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
537 cmp %r11, %rcx
538 jae LABEL(nibble_ashr_exit_use)
539#endif
540 cmp $13, %ecx
541 ja LABEL(nibble_ashr_2_restart_use)
542
543 jmp LABEL(nibble_ashr_exit_use)
544
545/*
546 * The following cases will be handled by ashr_3
547 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
548 * n(13~15) n -13 2(15 +(n-13) - n) ashr_3
549 */
550 .p2align 4
551LABEL(ashr_3):
 /*
  * State set up at LABEL(crosscache)/LABEL(bigger): %xmm1 = first
  * aligned block of %rsi, %xmm2 = first aligned block of %rdi,
  * %xmm0 = null-byte mask of %xmm1, %edx = 0xffff, %cl = offset of
  * %rsi inside its block.  %xmm2 is shifted left by 13 bytes to
  * line up with %xmm1 and only result bits from offset %cl upwards
  * are kept.
  */
552 pslldq $13, D(%xmm2)
553 TOLOWER (%xmm1, %xmm2)
554 pcmpeqb %xmm1, D(%xmm2)
555 psubb %xmm0, D(%xmm2)
556 pmovmskb %xmm2, %r9d
557 shr %cl, %edx
558 shr %cl, %r9d
559 sub %r9d, %edx
560 jnz LABEL(less32bytes)
561 movdqa (%rdi), %xmm3
562
563 UPDATE_STRNCMP_COUNTER
564
565 mov $16, %rcx /* index for loads */
566 mov $3, %r9d /* byte position left over from less32bytes case */
567 /*
568 * Setup %r10 value allows us to detect crossing a page boundary.
569 * When %r10 goes positive we have crossed a page boundary and
570 * need to do a nibble.
571 */
572 lea 3(%rdi), %r10
573 and $0xfff, %r10 /* offset into 4K page */
574 sub $0x1000, %r10 /* subtract 4K pagesize */
575 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
576
577LABEL(loop_ashr_3_use):
 /* Two 16-byte steps per iteration.  Each step builds its %rdi
    operand with palignr from the current and the previous aligned
    block, skipping the first 3 bytes of the previous one.  */
578 add $16, %r10
579 jg LABEL(nibble_ashr_3_use)
580
581LABEL(nibble_ashr_3_restart_use):
582 movdqa (%rdi, %rdx), %xmm0
583 palignr $3, -16(%rdi, %rdx), D(%xmm0)
584#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
585 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
586#else
587 movdqa (%rsi,%rdx), %xmm1
588 TOLOWER (%xmm0, %xmm1)
589 pcmpistri $0x1a, %xmm1, %xmm0
590#endif
591 jbe LABEL(exit_use)
592#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
593 sub $16, %r11
594 jbe LABEL(strcmp_exitz)
595#endif
596
597 add $16, %rdx
598 add $16, %r10
599 jg LABEL(nibble_ashr_3_use)
600
601 movdqa (%rdi, %rdx), %xmm0
602 palignr $3, -16(%rdi, %rdx), D(%xmm0)
603#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
604 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
605#else
606 movdqa (%rsi,%rdx), %xmm1
607 TOLOWER (%xmm0, %xmm1)
608 pcmpistri $0x1a, %xmm1, %xmm0
609#endif
610 jbe LABEL(exit_use)
611#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
612 sub $16, %r11
613 jbe LABEL(strcmp_exitz)
614#endif
615 add $16, %rdx
616 jmp LABEL(loop_ashr_3_use)
617
618 .p2align 4
619LABEL(nibble_ashr_3_use):
 /*
  * %r10 went positive, i.e. a page boundary is reached on the %rdi
  * side.  Re-arm %r10 for the next page, then inspect the bytes of
  * the previous %rdi block that are still pending (psrldq drops the
  * 3 low bytes).  The loop is resumed only when %ecx is above 12;
  * otherwise, or in the length-limited builds when %rcx >= %r11,
  * control goes to nibble_ashr_exit_use.
  * NOTE(review): pcmpistri $0x3a of a register against itself
  * presumably leaves the index of the first null byte in %ecx --
  * confirm against the Intel SDM.
  */
620 sub $0x1000, %r10
621 movdqa -16(%rdi, %rdx), %xmm0
622 psrldq $3, D(%xmm0)
623 pcmpistri $0x3a,%xmm0, %xmm0
624#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
625 cmp %r11, %rcx
626 jae LABEL(nibble_ashr_exit_use)
627#endif
628 cmp $12, %ecx
629 ja LABEL(nibble_ashr_3_restart_use)
630
631 jmp LABEL(nibble_ashr_exit_use)
632
633/*
634 * The following cases will be handled by ashr_4
635 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
636 * n(12~15) n -12 3(15 +(n-12) - n) ashr_4
637 */
638 .p2align 4
639LABEL(ashr_4):
 /*
  * State set up at LABEL(crosscache)/LABEL(bigger): %xmm1 = first
  * aligned block of %rsi, %xmm2 = first aligned block of %rdi,
  * %xmm0 = null-byte mask of %xmm1, %edx = 0xffff, %cl = offset of
  * %rsi inside its block.  %xmm2 is shifted left by 12 bytes to
  * line up with %xmm1 and only result bits from offset %cl upwards
  * are kept.
  */
640 pslldq $12, D(%xmm2)
641 TOLOWER (%xmm1, %xmm2)
642 pcmpeqb %xmm1, D(%xmm2)
643 psubb %xmm0, D(%xmm2)
644 pmovmskb %xmm2, %r9d
645 shr %cl, %edx
646 shr %cl, %r9d
647 sub %r9d, %edx
648 jnz LABEL(less32bytes)
649 movdqa (%rdi), %xmm3
650
651 UPDATE_STRNCMP_COUNTER
652
653 mov $16, %rcx /* index for loads */
654 mov $4, %r9d /* byte position left over from less32bytes case */
655 /*
656 * Setup %r10 value allows us to detect crossing a page boundary.
657 * When %r10 goes positive we have crossed a page boundary and
658 * need to do a nibble.
659 */
660 lea 4(%rdi), %r10
661 and $0xfff, %r10 /* offset into 4K page */
662 sub $0x1000, %r10 /* subtract 4K pagesize */
663 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
664
665 .p2align 4
666LABEL(loop_ashr_4_use):
 /* Two 16-byte steps per iteration.  Each step builds its %rdi
    operand with palignr from the current and the previous aligned
    block, skipping the first 4 bytes of the previous one.  */
667 add $16, %r10
668 jg LABEL(nibble_ashr_4_use)
669
670LABEL(nibble_ashr_4_restart_use):
671 movdqa (%rdi, %rdx), %xmm0
672 palignr $4, -16(%rdi, %rdx), D(%xmm0)
673#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
674 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
675#else
676 movdqa (%rsi,%rdx), %xmm1
677 TOLOWER (%xmm0, %xmm1)
678 pcmpistri $0x1a, %xmm1, %xmm0
679#endif
680 jbe LABEL(exit_use)
681#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
682 sub $16, %r11
683 jbe LABEL(strcmp_exitz)
684#endif
685
686 add $16, %rdx
687 add $16, %r10
688 jg LABEL(nibble_ashr_4_use)
689
690 movdqa (%rdi, %rdx), %xmm0
691 palignr $4, -16(%rdi, %rdx), D(%xmm0)
692#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
693 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
694#else
695 movdqa (%rsi,%rdx), %xmm1
696 TOLOWER (%xmm0, %xmm1)
697 pcmpistri $0x1a, %xmm1, %xmm0
698#endif
699 jbe LABEL(exit_use)
700#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
701 sub $16, %r11
702 jbe LABEL(strcmp_exitz)
703#endif
704 add $16, %rdx
705 jmp LABEL(loop_ashr_4_use)
706
707 .p2align 4
708LABEL(nibble_ashr_4_use):
 /*
  * %r10 went positive, i.e. a page boundary is reached on the %rdi
  * side.  Re-arm %r10 for the next page, then inspect the bytes of
  * the previous %rdi block that are still pending (psrldq drops the
  * 4 low bytes).  The loop is resumed only when %ecx is above 11;
  * otherwise, or in the length-limited builds when %rcx >= %r11,
  * control goes to nibble_ashr_exit_use.
  * NOTE(review): pcmpistri $0x3a of a register against itself
  * presumably leaves the index of the first null byte in %ecx --
  * confirm against the Intel SDM.
  */
709 sub $0x1000, %r10
710 movdqa -16(%rdi, %rdx), %xmm0
711 psrldq $4, D(%xmm0)
712 pcmpistri $0x3a,%xmm0, %xmm0
713#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
714 cmp %r11, %rcx
715 jae LABEL(nibble_ashr_exit_use)
716#endif
717 cmp $11, %ecx
718 ja LABEL(nibble_ashr_4_restart_use)
719
720 jmp LABEL(nibble_ashr_exit_use)
721
722/*
723 * The following cases will be handled by ashr_5
724 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
725 * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5
726 */
727 .p2align 4
728LABEL(ashr_5):
 /*
  * State set up at LABEL(crosscache)/LABEL(bigger): %xmm1 = first
  * aligned block of %rsi, %xmm2 = first aligned block of %rdi,
  * %xmm0 = null-byte mask of %xmm1, %edx = 0xffff, %cl = offset of
  * %rsi inside its block.  %xmm2 is shifted left by 11 bytes to
  * line up with %xmm1 and only result bits from offset %cl upwards
  * are kept.
  */
729 pslldq $11, D(%xmm2)
730 TOLOWER (%xmm1, %xmm2)
731 pcmpeqb %xmm1, D(%xmm2)
732 psubb %xmm0, D(%xmm2)
733 pmovmskb %xmm2, %r9d
734 shr %cl, %edx
735 shr %cl, %r9d
736 sub %r9d, %edx
737 jnz LABEL(less32bytes)
738 movdqa (%rdi), %xmm3
739
740 UPDATE_STRNCMP_COUNTER
741
742 mov $16, %rcx /* index for loads */
743 mov $5, %r9d /* byte position left over from less32bytes case */
744 /*
745 * Setup %r10 value allows us to detect crossing a page boundary.
746 * When %r10 goes positive we have crossed a page boundary and
747 * need to do a nibble.
748 */
749 lea 5(%rdi), %r10
750 and $0xfff, %r10 /* offset into 4K page */
751 sub $0x1000, %r10 /* subtract 4K pagesize */
752 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
753
754 .p2align 4
755LABEL(loop_ashr_5_use):
 /* Two 16-byte steps per iteration.  Each step builds its %rdi
    operand with palignr from the current and the previous aligned
    block, skipping the first 5 bytes of the previous one.  */
756 add $16, %r10
757 jg LABEL(nibble_ashr_5_use)
758
759LABEL(nibble_ashr_5_restart_use):
760 movdqa (%rdi, %rdx), %xmm0
761 palignr $5, -16(%rdi, %rdx), D(%xmm0)
762#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
763 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
764#else
765 movdqa (%rsi,%rdx), %xmm1
766 TOLOWER (%xmm0, %xmm1)
767 pcmpistri $0x1a, %xmm1, %xmm0
768#endif
769 jbe LABEL(exit_use)
770#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
771 sub $16, %r11
772 jbe LABEL(strcmp_exitz)
773#endif
774
775 add $16, %rdx
776 add $16, %r10
777 jg LABEL(nibble_ashr_5_use)
778
779 movdqa (%rdi, %rdx), %xmm0
780
781 palignr $5, -16(%rdi, %rdx), D(%xmm0)
782#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
783 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
784#else
785 movdqa (%rsi,%rdx), %xmm1
786 TOLOWER (%xmm0, %xmm1)
787 pcmpistri $0x1a, %xmm1, %xmm0
788#endif
789 jbe LABEL(exit_use)
790#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
791 sub $16, %r11
792 jbe LABEL(strcmp_exitz)
793#endif
794 add $16, %rdx
795 jmp LABEL(loop_ashr_5_use)
796
797 .p2align 4
798LABEL(nibble_ashr_5_use):
 /*
  * %r10 went positive, i.e. a page boundary is reached on the %rdi
  * side.  Re-arm %r10 for the next page, then inspect the bytes of
  * the previous %rdi block that are still pending (psrldq drops the
  * 5 low bytes).  The loop is resumed only when %ecx is above 10;
  * otherwise, or in the length-limited builds when %rcx >= %r11,
  * control goes to nibble_ashr_exit_use.
  * NOTE(review): pcmpistri $0x3a of a register against itself
  * presumably leaves the index of the first null byte in %ecx --
  * confirm against the Intel SDM.
  */
799 sub $0x1000, %r10
800 movdqa -16(%rdi, %rdx), %xmm0
801 psrldq $5, D(%xmm0)
802 pcmpistri $0x3a,%xmm0, %xmm0
803#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
804 cmp %r11, %rcx
805 jae LABEL(nibble_ashr_exit_use)
806#endif
807 cmp $10, %ecx
808 ja LABEL(nibble_ashr_5_restart_use)
809
810 jmp LABEL(nibble_ashr_exit_use)
811
812/*
813 * The following cases will be handled by ashr_6
814 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
815 * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6
816 */
817 .p2align 4
818LABEL(ashr_6):
 /*
  * State set up at LABEL(crosscache)/LABEL(bigger): %xmm1 = first
  * aligned block of %rsi, %xmm2 = first aligned block of %rdi,
  * %xmm0 = null-byte mask of %xmm1, %edx = 0xffff, %cl = offset of
  * %rsi inside its block.  %xmm2 is shifted left by 10 bytes to
  * line up with %xmm1 and only result bits from offset %cl upwards
  * are kept.
  */
819 pslldq $10, D(%xmm2)
820 TOLOWER (%xmm1, %xmm2)
821 pcmpeqb %xmm1, D(%xmm2)
822 psubb %xmm0, D(%xmm2)
823 pmovmskb %xmm2, %r9d
824 shr %cl, %edx
825 shr %cl, %r9d
826 sub %r9d, %edx
827 jnz LABEL(less32bytes)
828 movdqa (%rdi), %xmm3
829
830 UPDATE_STRNCMP_COUNTER
831
832 mov $16, %rcx /* index for loads */
833 mov $6, %r9d /* byte position left over from less32bytes case */
834 /*
835 * Setup %r10 value allows us to detect crossing a page boundary.
836 * When %r10 goes positive we have crossed a page boundary and
837 * need to do a nibble.
838 */
839 lea 6(%rdi), %r10
840 and $0xfff, %r10 /* offset into 4K page */
841 sub $0x1000, %r10 /* subtract 4K pagesize */
842 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
843
844 .p2align 4
845LABEL(loop_ashr_6_use):
 /* Two 16-byte steps per iteration.  Each step builds its %rdi
    operand with palignr from the current and the previous aligned
    block, skipping the first 6 bytes of the previous one.  */
846 add $16, %r10
847 jg LABEL(nibble_ashr_6_use)
848
849LABEL(nibble_ashr_6_restart_use):
850 movdqa (%rdi, %rdx), %xmm0
851 palignr $6, -16(%rdi, %rdx), D(%xmm0)
852#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
853 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
854#else
855 movdqa (%rsi,%rdx), %xmm1
856 TOLOWER (%xmm0, %xmm1)
857 pcmpistri $0x1a, %xmm1, %xmm0
858#endif
859 jbe LABEL(exit_use)
860#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
861 sub $16, %r11
862 jbe LABEL(strcmp_exitz)
863#endif
864
865 add $16, %rdx
866 add $16, %r10
867 jg LABEL(nibble_ashr_6_use)
868
869 movdqa (%rdi, %rdx), %xmm0
870 palignr $6, -16(%rdi, %rdx), D(%xmm0)
871#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
872 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
873#else
874 movdqa (%rsi,%rdx), %xmm1
875 TOLOWER (%xmm0, %xmm1)
876 pcmpistri $0x1a, %xmm1, %xmm0
877#endif
878 jbe LABEL(exit_use)
879#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
880 sub $16, %r11
881 jbe LABEL(strcmp_exitz)
882#endif
883 add $16, %rdx
884 jmp LABEL(loop_ashr_6_use)
885
886 .p2align 4
887LABEL(nibble_ashr_6_use):
 /*
  * %r10 went positive, i.e. a page boundary is reached on the %rdi
  * side.  Re-arm %r10 for the next page, then inspect the bytes of
  * the previous %rdi block that are still pending (psrldq drops the
  * 6 low bytes).  The loop is resumed only when %ecx is above 9;
  * otherwise, or in the length-limited builds when %rcx >= %r11,
  * control goes to nibble_ashr_exit_use.
  * NOTE(review): pcmpistri $0x3a of a register against itself
  * presumably leaves the index of the first null byte in %ecx --
  * confirm against the Intel SDM.
  */
888 sub $0x1000, %r10
889 movdqa -16(%rdi, %rdx), %xmm0
890 psrldq $6, D(%xmm0)
891 pcmpistri $0x3a,%xmm0, %xmm0
892#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
893 cmp %r11, %rcx
894 jae LABEL(nibble_ashr_exit_use)
895#endif
896 cmp $9, %ecx
897 ja LABEL(nibble_ashr_6_restart_use)
898
899 jmp LABEL(nibble_ashr_exit_use)
900
901/*
902 * The following cases will be handled by ashr_7
903 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
904 * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7
905 */
906 .p2align 4
907LABEL(ashr_7):
 /*
  * State set up at LABEL(crosscache)/LABEL(bigger): %xmm1 = first
  * aligned block of %rsi, %xmm2 = first aligned block of %rdi,
  * %xmm0 = null-byte mask of %xmm1, %edx = 0xffff, %cl = offset of
  * %rsi inside its block.  %xmm2 is shifted left by 9 bytes to
  * line up with %xmm1 and only result bits from offset %cl upwards
  * are kept.
  */
908 pslldq $9, D(%xmm2)
909 TOLOWER (%xmm1, %xmm2)
910 pcmpeqb %xmm1, D(%xmm2)
911 psubb %xmm0, D(%xmm2)
912 pmovmskb %xmm2, %r9d
913 shr %cl, %edx
914 shr %cl, %r9d
915 sub %r9d, %edx
916 jnz LABEL(less32bytes)
917 movdqa (%rdi), %xmm3
918
919 UPDATE_STRNCMP_COUNTER
920
921 mov $16, %rcx /* index for loads */
922 mov $7, %r9d /* byte position left over from less32bytes case */
923 /*
924 * Setup %r10 value allows us to detect crossing a page boundary.
925 * When %r10 goes positive we have crossed a page boundary and
926 * need to do a nibble.
927 */
928 lea 7(%rdi), %r10
929 and $0xfff, %r10 /* offset into 4K page */
930 sub $0x1000, %r10 /* subtract 4K pagesize */
931 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
932
933 .p2align 4
934LABEL(loop_ashr_7_use):
 /* Two 16-byte steps per iteration.  Each step builds its %rdi
    operand with palignr from the current and the previous aligned
    block, skipping the first 7 bytes of the previous one.  */
935 add $16, %r10
936 jg LABEL(nibble_ashr_7_use)
937
938LABEL(nibble_ashr_7_restart_use):
939 movdqa (%rdi, %rdx), %xmm0
940 palignr $7, -16(%rdi, %rdx), D(%xmm0)
941#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
942 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
943#else
944 movdqa (%rsi,%rdx), %xmm1
945 TOLOWER (%xmm0, %xmm1)
946 pcmpistri $0x1a, %xmm1, %xmm0
947#endif
948 jbe LABEL(exit_use)
949#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
950 sub $16, %r11
951 jbe LABEL(strcmp_exitz)
952#endif
953
954 add $16, %rdx
955 add $16, %r10
956 jg LABEL(nibble_ashr_7_use)
957
958 movdqa (%rdi, %rdx), %xmm0
959 palignr $7, -16(%rdi, %rdx), D(%xmm0)
960#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
961 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
962#else
963 movdqa (%rsi,%rdx), %xmm1
964 TOLOWER (%xmm0, %xmm1)
965 pcmpistri $0x1a, %xmm1, %xmm0
966#endif
967 jbe LABEL(exit_use)
968#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
969 sub $16, %r11
970 jbe LABEL(strcmp_exitz)
971#endif
972 add $16, %rdx
973 jmp LABEL(loop_ashr_7_use)
974
975 .p2align 4
976LABEL(nibble_ashr_7_use):
 /*
  * %r10 went positive, i.e. a page boundary is reached on the %rdi
  * side.  Re-arm %r10 for the next page, then inspect the bytes of
  * the previous %rdi block that are still pending (psrldq drops the
  * 7 low bytes).  The loop is resumed only when %ecx is above 8;
  * otherwise, or in the length-limited builds when %rcx >= %r11,
  * control goes to nibble_ashr_exit_use.
  * NOTE(review): pcmpistri $0x3a of a register against itself
  * presumably leaves the index of the first null byte in %ecx --
  * confirm against the Intel SDM.
  */
977 sub $0x1000, %r10
978 movdqa -16(%rdi, %rdx), %xmm0
979 psrldq $7, D(%xmm0)
980 pcmpistri $0x3a,%xmm0, %xmm0
981#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
982 cmp %r11, %rcx
983 jae LABEL(nibble_ashr_exit_use)
984#endif
985 cmp $8, %ecx
986 ja LABEL(nibble_ashr_7_restart_use)
987
988 jmp LABEL(nibble_ashr_exit_use)
989
990/*
991 * The following cases will be handled by ashr_8
992 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
993 * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8
994 */
995 .p2align 4
996LABEL(ashr_8):
 /*
  * State set up at LABEL(crosscache)/LABEL(bigger): %xmm1 = first
  * aligned block of %rsi, %xmm2 = first aligned block of %rdi,
  * %xmm0 = null-byte mask of %xmm1, %edx = 0xffff, %cl = offset of
  * %rsi inside its block.  %xmm2 is shifted left by 8 bytes to
  * line up with %xmm1 and only result bits from offset %cl upwards
  * are kept.
  */
997 pslldq $8, D(%xmm2)
998 TOLOWER (%xmm1, %xmm2)
999 pcmpeqb %xmm1, D(%xmm2)
1000 psubb %xmm0, D(%xmm2)
1001 pmovmskb %xmm2, %r9d
1002 shr %cl, %edx
1003 shr %cl, %r9d
1004 sub %r9d, %edx
1005 jnz LABEL(less32bytes)
1006 movdqa (%rdi), %xmm3
1007
1008 UPDATE_STRNCMP_COUNTER
1009
1010 mov $16, %rcx /* index for loads */
1011 mov $8, %r9d /* byte position left over from less32bytes case */
1012 /*
1013 * Setup %r10 value allows us to detect crossing a page boundary.
1014 * When %r10 goes positive we have crossed a page boundary and
1015 * need to do a nibble.
1016 */
1017 lea 8(%rdi), %r10
1018 and $0xfff, %r10 /* offset into 4K page */
1019 sub $0x1000, %r10 /* subtract 4K pagesize */
1020 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1021
1022 .p2align 4
1023LABEL(loop_ashr_8_use):
 /* Two 16-byte steps per iteration.  Each step builds its %rdi
    operand with palignr from the current and the previous aligned
    block, skipping the first 8 bytes of the previous one.  */
1024 add $16, %r10
1025 jg LABEL(nibble_ashr_8_use)
1026
1027LABEL(nibble_ashr_8_restart_use):
1028 movdqa (%rdi, %rdx), %xmm0
1029 palignr $8, -16(%rdi, %rdx), D(%xmm0)
1030#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1031 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1032#else
1033 movdqa (%rsi,%rdx), %xmm1
1034 TOLOWER (%xmm0, %xmm1)
1035 pcmpistri $0x1a, %xmm1, %xmm0
1036#endif
1037 jbe LABEL(exit_use)
1038#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1039 sub $16, %r11
1040 jbe LABEL(strcmp_exitz)
1041#endif
1042
1043 add $16, %rdx
1044 add $16, %r10
1045 jg LABEL(nibble_ashr_8_use)
1046
1047 movdqa (%rdi, %rdx), %xmm0
1048 palignr $8, -16(%rdi, %rdx), D(%xmm0)
1049#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1050 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1051#else
1052 movdqa (%rsi,%rdx), %xmm1
1053 TOLOWER (%xmm0, %xmm1)
1054 pcmpistri $0x1a, %xmm1, %xmm0
1055#endif
1056 jbe LABEL(exit_use)
1057#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1058 sub $16, %r11
1059 jbe LABEL(strcmp_exitz)
1060#endif
1061 add $16, %rdx
1062 jmp LABEL(loop_ashr_8_use)
1063
1064 .p2align 4
1065LABEL(nibble_ashr_8_use):
 /*
  * %r10 went positive, i.e. a page boundary is reached on the %rdi
  * side.  Re-arm %r10 for the next page, then inspect the bytes of
  * the previous %rdi block that are still pending (psrldq drops the
  * 8 low bytes).  The loop is resumed only when %ecx is above 7;
  * otherwise, or in the length-limited builds when %rcx >= %r11,
  * control goes to nibble_ashr_exit_use.
  * NOTE(review): pcmpistri $0x3a of a register against itself
  * presumably leaves the index of the first null byte in %ecx --
  * confirm against the Intel SDM.
  */
1066 sub $0x1000, %r10
1067 movdqa -16(%rdi, %rdx), %xmm0
1068 psrldq $8, D(%xmm0)
1069 pcmpistri $0x3a,%xmm0, %xmm0
1070#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1071 cmp %r11, %rcx
1072 jae LABEL(nibble_ashr_exit_use)
1073#endif
1074 cmp $7, %ecx
1075 ja LABEL(nibble_ashr_8_restart_use)
1076
1077 jmp LABEL(nibble_ashr_exit_use)
1078
1079/*
1080 * The following cases will be handled by ashr_9
1081 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1082 * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9
1083 */
1084 .p2align 4
1085LABEL(ashr_9):
 /*
  * State set up at LABEL(crosscache)/LABEL(bigger): %xmm1 = first
  * aligned block of %rsi, %xmm2 = first aligned block of %rdi,
  * %xmm0 = null-byte mask of %xmm1, %edx = 0xffff, %cl = offset of
  * %rsi inside its block.  %xmm2 is shifted left by 7 bytes to
  * line up with %xmm1 and only result bits from offset %cl upwards
  * are kept.
  */
1086 pslldq $7, D(%xmm2)
1087 TOLOWER (%xmm1, %xmm2)
1088 pcmpeqb %xmm1, D(%xmm2)
1089 psubb %xmm0, D(%xmm2)
1090 pmovmskb %xmm2, %r9d
1091 shr %cl, %edx
1092 shr %cl, %r9d
1093 sub %r9d, %edx
1094 jnz LABEL(less32bytes)
1095 movdqa (%rdi), %xmm3
1096
1097 UPDATE_STRNCMP_COUNTER
1098
1099 mov $16, %rcx /* index for loads */
1100 mov $9, %r9d /* byte position left over from less32bytes case */
1101 /*
1102 * Setup %r10 value allows us to detect crossing a page boundary.
1103 * When %r10 goes positive we have crossed a page boundary and
1104 * need to do a nibble.
1105 */
1106 lea 9(%rdi), %r10
1107 and $0xfff, %r10 /* offset into 4K page */
1108 sub $0x1000, %r10 /* subtract 4K pagesize */
1109 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1110
1111 .p2align 4
1112LABEL(loop_ashr_9_use):
 /* Two 16-byte steps per iteration.  Each step builds its %rdi
    operand with palignr from the current and the previous aligned
    block, skipping the first 9 bytes of the previous one.  */
1113 add $16, %r10
1114 jg LABEL(nibble_ashr_9_use)
1115
1116LABEL(nibble_ashr_9_restart_use):
1117 movdqa (%rdi, %rdx), %xmm0
1118
1119 palignr $9, -16(%rdi, %rdx), D(%xmm0)
1120#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1121 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1122#else
1123 movdqa (%rsi,%rdx), %xmm1
1124 TOLOWER (%xmm0, %xmm1)
1125 pcmpistri $0x1a, %xmm1, %xmm0
1126#endif
1127 jbe LABEL(exit_use)
1128#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1129 sub $16, %r11
1130 jbe LABEL(strcmp_exitz)
1131#endif
1132
1133 add $16, %rdx
1134 add $16, %r10
1135 jg LABEL(nibble_ashr_9_use)
1136
1137 movdqa (%rdi, %rdx), %xmm0
1138 palignr $9, -16(%rdi, %rdx), D(%xmm0)
1139#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1140 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1141#else
1142 movdqa (%rsi,%rdx), %xmm1
1143 TOLOWER (%xmm0, %xmm1)
1144 pcmpistri $0x1a, %xmm1, %xmm0
1145#endif
1146 jbe LABEL(exit_use)
1147#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1148 sub $16, %r11
1149 jbe LABEL(strcmp_exitz)
1150#endif
1151 add $16, %rdx
1152 jmp LABEL(loop_ashr_9_use)
1153
1154 .p2align 4
1155LABEL(nibble_ashr_9_use):
1156 sub $0x1000, %r10
1157 movdqa -16(%rdi, %rdx), %xmm0
1158 psrldq $9, D(%xmm0)
1159 pcmpistri $0x3a,%xmm0, %xmm0
1160#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1161 cmp %r11, %rcx
1162 jae LABEL(nibble_ashr_exit_use)
1163#endif
1164 cmp $6, %ecx
1165 ja LABEL(nibble_ashr_9_restart_use)
1166
1167 jmp LABEL(nibble_ashr_exit_use)
1168
1169/*
1170 * The following cases will be handled by ashr_10
1171 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1172 * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10
1173 */
1174 .p2align 4
1175LABEL(ashr_10):
 /*
  * ashr_10 -- the 10th entry of LABEL(unaligned_table).  The %rdi data
  * is realigned with "palignr $10" so that it can be compared against
  * the 16-byte blocks at %rsi.  Three parts:
  *   - the code up to loop_ashr_10_use checks the data already held in
  *     %xmm1/%xmm2 and sets up the loop registers;
  *   - loop_ashr_10_use compares 16 bytes per step (unrolled twice);
  *   - nibble_ashr_10_use handles the page-boundary case flagged by %r10.
  * NOTE(review): %xmm0, %xmm1, %xmm2, %edx, %cl, %rdi, %rsi (and %r11
  * in the strncmp-style builds) are set up before this label, outside
  * this block; what they hold is not visible here -- confirm against
  * the dispatch code.
  */
 /* Shift %xmm2 left by 6 (= 16 - 10) bytes, zero-filling the low bytes. */
1176 pslldq $6, D(%xmm2)
 /* NOTE(review): TOLOWER is defined earlier in the file; presumably it
    case-folds both registers in the case-insensitive builds and expands
    to nothing otherwise -- confirm. */
1177 TOLOWER (%xmm1, %xmm2)
 /* %xmm2 = 0xff in every byte position where %xmm1 and %xmm2 agree. */
1178 pcmpeqb %xmm1, D(%xmm2)
 /* Byte-wise subtract %xmm0 from that equality mask. */
1179 psubb %xmm0, D(%xmm2)
 /* %r9d = one bit per byte of %xmm2 (the top bit of each byte). */
1180 pmovmskb %xmm2, %r9d
 /* Drop the low %cl bits of both masks, then compare them. */
1181 shr %cl, %edx
1182 shr %cl, %r9d
1183 sub %r9d, %edx
 /* Masks differ: less32bytes works out the result from %edx. */
1184 jnz LABEL(less32bytes)
 /* NOTE(review): %xmm3 is not referenced again by any instruction
    visible in this block. */
1185 movdqa (%rdi), %xmm3
1186
 /* strncmp-style builds only (empty otherwise): %r11 = %r11 + %rcx - 16,
    or return 0 through strcmp_exitz if that wraps or yields 0; see the
    macro definition near the top of the file. */
1187 UPDATE_STRNCMP_COUNTER
1188
1189 mov $16, %rcx /* index for loads */
1190 mov $10, %r9d /* byte position left over from less32bytes case */
1191 /*
1192 * Setup %r10 value allows us to detect crossing a page boundary.
1193 * When %r10 goes positive we have crossed a page boundary and
1194 * need to do a nibble.
1195 */
1196 lea 10(%rdi), %r10
1197 and $0xfff, %r10 /* offset into 4K page */
1198 sub $0x1000, %r10 /* subtract 4K pagesize */
1199 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1200
1201 .p2align 4
1202LABEL(loop_ashr_10_use):
 /* Count the next 16 bytes; %r10 > 0 (signed) means a page boundary is
    being crossed and nibble_ashr_10_use has to run first. */
1203 add $16, %r10
1204 jg LABEL(nibble_ashr_10_use)
1205
1206LABEL(nibble_ashr_10_restart_use):
 /* %xmm0 = top 6 bytes of the previous aligned %rdi block followed by
    the low 10 bytes of the current one. */
1207 movdqa (%rdi, %rdx), %xmm0
1208 palignr $10, -16(%rdi, %rdx), D(%xmm0)
1209#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 /* 0x1a (described at the top of the file): %ecx = index of the first
    differing byte. */
1210 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1211#else
 /* Case-insensitive builds run both operands through TOLOWER first. */
1212 movdqa (%rsi,%rdx), %xmm1
1213 TOLOWER (%xmm0, %xmm1)
1214 pcmpistri $0x1a, %xmm1, %xmm0
1215#endif
 /* CF set (bytes differ) or ZF set (zero byte in the %rsi operand). */
1216 jbe LABEL(exit_use)
1217#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* 16 more bytes matched: return 0 once the count in %r11 is used up. */
1218 sub $16, %r11
1219 jbe LABEL(strcmp_exitz)
1220#endif
1221
1222 add $16, %rdx
1223 add $16, %r10
1224 jg LABEL(nibble_ashr_10_use)
1225
 /* Second, identical copy of the compare step (loop unrolled twice). */
1226 movdqa (%rdi, %rdx), %xmm0
1227 palignr $10, -16(%rdi, %rdx), D(%xmm0)
1228#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1229 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1230#else
1231 movdqa (%rsi,%rdx), %xmm1
1232 TOLOWER (%xmm0, %xmm1)
1233 pcmpistri $0x1a, %xmm1, %xmm0
1234#endif
1235 jbe LABEL(exit_use)
1236#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1237 sub $16, %r11
1238 jbe LABEL(strcmp_exitz)
1239#endif
1240 add $16, %rdx
1241 jmp LABEL(loop_ashr_10_use)
1242
1243 .p2align 4
1244LABEL(nibble_ashr_10_use):
 /*
  * Page-boundary path.  Before the next %rdi block is loaded, the 6
  * bytes of the previous block that have not been compared yet are
  * scanned for a zero byte.
  */
 /* Rewind the counter by one 4K page for the following pass. */
1245 sub $0x1000, %r10
1246 movdqa -16(%rdi, %rdx), %xmm0
 /* Keep only the 6 pending bytes, moved down to positions 0..5. */
1247 psrldq $10, D(%xmm0)
 /* Imm 0x3a with identical operands: %ecx = index of the first zero
    byte of %xmm0 (at most 6 here, since psrldq zero-filled the top). */
1248 pcmpistri $0x3a,%xmm0, %xmm0
1249#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* Also finish when that index is not below the remaining count %r11. */
1250 cmp %r11, %rcx
1251 jae LABEL(nibble_ashr_exit_use)
1252#endif
 /* %ecx > 5: all 6 pending bytes are non-zero, so resume the loop and
    load the next block. */
1253 cmp $5, %ecx
1254 ja LABEL(nibble_ashr_10_restart_use)
1255
 /* Otherwise a zero byte lies within the pending bytes. */
1256 jmp LABEL(nibble_ashr_exit_use)
1257
1258/*
1259 * The following cases will be handled by ashr_11
1260 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1261 * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11
1262 */
1263 .p2align 4
1264LABEL(ashr_11):
 /*
  * ashr_11 -- the 11th entry of LABEL(unaligned_table).  The %rdi data
  * is realigned with "palignr $11" so that it can be compared against
  * the 16-byte blocks at %rsi.  Three parts:
  *   - the code up to loop_ashr_11_use checks the data already held in
  *     %xmm1/%xmm2 and sets up the loop registers;
  *   - loop_ashr_11_use compares 16 bytes per step (unrolled twice);
  *   - nibble_ashr_11_use handles the page-boundary case flagged by %r10.
  * NOTE(review): %xmm0, %xmm1, %xmm2, %edx, %cl, %rdi, %rsi (and %r11
  * in the strncmp-style builds) are set up before this label, outside
  * this block; what they hold is not visible here -- confirm against
  * the dispatch code.
  */
 /* Shift %xmm2 left by 5 (= 16 - 11) bytes, zero-filling the low bytes. */
1265 pslldq $5, D(%xmm2)
 /* NOTE(review): TOLOWER is defined earlier in the file; presumably it
    case-folds both registers in the case-insensitive builds and expands
    to nothing otherwise -- confirm. */
1266 TOLOWER (%xmm1, %xmm2)
 /* %xmm2 = 0xff in every byte position where %xmm1 and %xmm2 agree. */
1267 pcmpeqb %xmm1, D(%xmm2)
 /* Byte-wise subtract %xmm0 from that equality mask. */
1268 psubb %xmm0, D(%xmm2)
 /* %r9d = one bit per byte of %xmm2 (the top bit of each byte). */
1269 pmovmskb %xmm2, %r9d
 /* Drop the low %cl bits of both masks, then compare them. */
1270 shr %cl, %edx
1271 shr %cl, %r9d
1272 sub %r9d, %edx
 /* Masks differ: less32bytes works out the result from %edx. */
1273 jnz LABEL(less32bytes)
 /* NOTE(review): %xmm3 is not referenced again by any instruction
    visible in this block. */
1274 movdqa (%rdi), %xmm3
1275
 /* strncmp-style builds only (empty otherwise): %r11 = %r11 + %rcx - 16,
    or return 0 through strcmp_exitz if that wraps or yields 0; see the
    macro definition near the top of the file. */
1276 UPDATE_STRNCMP_COUNTER
1277
1278 mov $16, %rcx /* index for loads */
1279 mov $11, %r9d /* byte position left over from less32bytes case */
1280 /*
1281 * Setup %r10 value allows us to detect crossing a page boundary.
1282 * When %r10 goes positive we have crossed a page boundary and
1283 * need to do a nibble.
1284 */
1285 lea 11(%rdi), %r10
1286 and $0xfff, %r10 /* offset into 4K page */
1287 sub $0x1000, %r10 /* subtract 4K pagesize */
1288 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1289
1290 .p2align 4
1291LABEL(loop_ashr_11_use):
 /* Count the next 16 bytes; %r10 > 0 (signed) means a page boundary is
    being crossed and nibble_ashr_11_use has to run first. */
1292 add $16, %r10
1293 jg LABEL(nibble_ashr_11_use)
1294
1295LABEL(nibble_ashr_11_restart_use):
 /* %xmm0 = top 5 bytes of the previous aligned %rdi block followed by
    the low 11 bytes of the current one. */
1296 movdqa (%rdi, %rdx), %xmm0
1297 palignr $11, -16(%rdi, %rdx), D(%xmm0)
1298#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 /* 0x1a (described at the top of the file): %ecx = index of the first
    differing byte. */
1299 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1300#else
 /* Case-insensitive builds run both operands through TOLOWER first. */
1301 movdqa (%rsi,%rdx), %xmm1
1302 TOLOWER (%xmm0, %xmm1)
1303 pcmpistri $0x1a, %xmm1, %xmm0
1304#endif
 /* CF set (bytes differ) or ZF set (zero byte in the %rsi operand). */
1305 jbe LABEL(exit_use)
1306#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* 16 more bytes matched: return 0 once the count in %r11 is used up. */
1307 sub $16, %r11
1308 jbe LABEL(strcmp_exitz)
1309#endif
1310
1311 add $16, %rdx
1312 add $16, %r10
1313 jg LABEL(nibble_ashr_11_use)
1314
 /* Second, identical copy of the compare step (loop unrolled twice). */
1315 movdqa (%rdi, %rdx), %xmm0
1316 palignr $11, -16(%rdi, %rdx), D(%xmm0)
1317#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1318 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1319#else
1320 movdqa (%rsi,%rdx), %xmm1
1321 TOLOWER (%xmm0, %xmm1)
1322 pcmpistri $0x1a, %xmm1, %xmm0
1323#endif
1324 jbe LABEL(exit_use)
1325#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1326 sub $16, %r11
1327 jbe LABEL(strcmp_exitz)
1328#endif
1329 add $16, %rdx
1330 jmp LABEL(loop_ashr_11_use)
1331
1332 .p2align 4
1333LABEL(nibble_ashr_11_use):
 /*
  * Page-boundary path.  Before the next %rdi block is loaded, the 5
  * bytes of the previous block that have not been compared yet are
  * scanned for a zero byte.
  */
 /* Rewind the counter by one 4K page for the following pass. */
1334 sub $0x1000, %r10
1335 movdqa -16(%rdi, %rdx), %xmm0
 /* Keep only the 5 pending bytes, moved down to positions 0..4. */
1336 psrldq $11, D(%xmm0)
 /* Imm 0x3a with identical operands: %ecx = index of the first zero
    byte of %xmm0 (at most 5 here, since psrldq zero-filled the top). */
1337 pcmpistri $0x3a,%xmm0, %xmm0
1338#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* Also finish when that index is not below the remaining count %r11. */
1339 cmp %r11, %rcx
1340 jae LABEL(nibble_ashr_exit_use)
1341#endif
 /* %ecx > 4: all 5 pending bytes are non-zero, so resume the loop and
    load the next block. */
1342 cmp $4, %ecx
1343 ja LABEL(nibble_ashr_11_restart_use)
1344
 /* Otherwise a zero byte lies within the pending bytes. */
1345 jmp LABEL(nibble_ashr_exit_use)
1346
1347/*
1348 * The following cases will be handled by ashr_12
1349 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1350 * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12
1351 */
1352 .p2align 4
1353LABEL(ashr_12):
 /*
  * ashr_12 -- the 12th entry of LABEL(unaligned_table).  The %rdi data
  * is realigned with "palignr $12" so that it can be compared against
  * the 16-byte blocks at %rsi.  Three parts:
  *   - the code up to loop_ashr_12_use checks the data already held in
  *     %xmm1/%xmm2 and sets up the loop registers;
  *   - loop_ashr_12_use compares 16 bytes per step (unrolled twice);
  *   - nibble_ashr_12_use handles the page-boundary case flagged by %r10.
  * NOTE(review): %xmm0, %xmm1, %xmm2, %edx, %cl, %rdi, %rsi (and %r11
  * in the strncmp-style builds) are set up before this label, outside
  * this block; what they hold is not visible here -- confirm against
  * the dispatch code.
  */
 /* Shift %xmm2 left by 4 (= 16 - 12) bytes, zero-filling the low bytes. */
1354 pslldq $4, D(%xmm2)
 /* NOTE(review): TOLOWER is defined earlier in the file; presumably it
    case-folds both registers in the case-insensitive builds and expands
    to nothing otherwise -- confirm. */
1355 TOLOWER (%xmm1, %xmm2)
 /* %xmm2 = 0xff in every byte position where %xmm1 and %xmm2 agree. */
1356 pcmpeqb %xmm1, D(%xmm2)
 /* Byte-wise subtract %xmm0 from that equality mask. */
1357 psubb %xmm0, D(%xmm2)
 /* %r9d = one bit per byte of %xmm2 (the top bit of each byte). */
1358 pmovmskb %xmm2, %r9d
 /* Drop the low %cl bits of both masks, then compare them. */
1359 shr %cl, %edx
1360 shr %cl, %r9d
1361 sub %r9d, %edx
 /* Masks differ: less32bytes works out the result from %edx. */
1362 jnz LABEL(less32bytes)
 /* NOTE(review): %xmm3 is not referenced again by any instruction
    visible in this block. */
1363 movdqa (%rdi), %xmm3
1364
 /* strncmp-style builds only (empty otherwise): %r11 = %r11 + %rcx - 16,
    or return 0 through strcmp_exitz if that wraps or yields 0; see the
    macro definition near the top of the file. */
1365 UPDATE_STRNCMP_COUNTER
1366
1367 mov $16, %rcx /* index for loads */
1368 mov $12, %r9d /* byte position left over from less32bytes case */
1369 /*
1370 * Setup %r10 value allows us to detect crossing a page boundary.
1371 * When %r10 goes positive we have crossed a page boundary and
1372 * need to do a nibble.
1373 */
1374 lea 12(%rdi), %r10
1375 and $0xfff, %r10 /* offset into 4K page */
1376 sub $0x1000, %r10 /* subtract 4K pagesize */
1377 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1378
1379 .p2align 4
1380LABEL(loop_ashr_12_use):
 /* Count the next 16 bytes; %r10 > 0 (signed) means a page boundary is
    being crossed and nibble_ashr_12_use has to run first. */
1381 add $16, %r10
1382 jg LABEL(nibble_ashr_12_use)
1383
1384LABEL(nibble_ashr_12_restart_use):
 /* %xmm0 = top 4 bytes of the previous aligned %rdi block followed by
    the low 12 bytes of the current one. */
1385 movdqa (%rdi, %rdx), %xmm0
1386 palignr $12, -16(%rdi, %rdx), D(%xmm0)
1387#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 /* 0x1a (described at the top of the file): %ecx = index of the first
    differing byte. */
1388 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1389#else
 /* Case-insensitive builds run both operands through TOLOWER first. */
1390 movdqa (%rsi,%rdx), %xmm1
1391 TOLOWER (%xmm0, %xmm1)
1392 pcmpistri $0x1a, %xmm1, %xmm0
1393#endif
 /* CF set (bytes differ) or ZF set (zero byte in the %rsi operand). */
1394 jbe LABEL(exit_use)
1395#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* 16 more bytes matched: return 0 once the count in %r11 is used up. */
1396 sub $16, %r11
1397 jbe LABEL(strcmp_exitz)
1398#endif
1399
1400 add $16, %rdx
1401 add $16, %r10
1402 jg LABEL(nibble_ashr_12_use)
1403
 /* Second, identical copy of the compare step (loop unrolled twice). */
1404 movdqa (%rdi, %rdx), %xmm0
1405 palignr $12, -16(%rdi, %rdx), D(%xmm0)
1406#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1407 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1408#else
1409 movdqa (%rsi,%rdx), %xmm1
1410 TOLOWER (%xmm0, %xmm1)
1411 pcmpistri $0x1a, %xmm1, %xmm0
1412#endif
1413 jbe LABEL(exit_use)
1414#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1415 sub $16, %r11
1416 jbe LABEL(strcmp_exitz)
1417#endif
1418 add $16, %rdx
1419 jmp LABEL(loop_ashr_12_use)
1420
1421 .p2align 4
1422LABEL(nibble_ashr_12_use):
 /*
  * Page-boundary path.  Before the next %rdi block is loaded, the 4
  * bytes of the previous block that have not been compared yet are
  * scanned for a zero byte.
  */
 /* Rewind the counter by one 4K page for the following pass. */
1423 sub $0x1000, %r10
1424 movdqa -16(%rdi, %rdx), %xmm0
 /* Keep only the 4 pending bytes, moved down to positions 0..3. */
1425 psrldq $12, D(%xmm0)
 /* Imm 0x3a with identical operands: %ecx = index of the first zero
    byte of %xmm0 (at most 4 here, since psrldq zero-filled the top). */
1426 pcmpistri $0x3a,%xmm0, %xmm0
1427#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* Also finish when that index is not below the remaining count %r11. */
1428 cmp %r11, %rcx
1429 jae LABEL(nibble_ashr_exit_use)
1430#endif
 /* %ecx > 3: all 4 pending bytes are non-zero, so resume the loop and
    load the next block. */
1431 cmp $3, %ecx
1432 ja LABEL(nibble_ashr_12_restart_use)
1433
 /* Otherwise a zero byte lies within the pending bytes. */
1434 jmp LABEL(nibble_ashr_exit_use)
1435
1436/*
1437 * The following cases will be handled by ashr_13
1438 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1439 * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13
1440 */
1441 .p2align 4
1442LABEL(ashr_13):
 /*
  * ashr_13 -- the 13th entry of LABEL(unaligned_table).  The %rdi data
  * is realigned with "palignr $13" so that it can be compared against
  * the 16-byte blocks at %rsi.  Three parts:
  *   - the code up to loop_ashr_13_use checks the data already held in
  *     %xmm1/%xmm2 and sets up the loop registers;
  *   - loop_ashr_13_use compares 16 bytes per step (unrolled twice);
  *   - nibble_ashr_13_use handles the page-boundary case flagged by %r10.
  * NOTE(review): %xmm0, %xmm1, %xmm2, %edx, %cl, %rdi, %rsi (and %r11
  * in the strncmp-style builds) are set up before this label, outside
  * this block; what they hold is not visible here -- confirm against
  * the dispatch code.
  */
 /* Shift %xmm2 left by 3 (= 16 - 13) bytes, zero-filling the low bytes. */
1443 pslldq $3, D(%xmm2)
 /* NOTE(review): TOLOWER is defined earlier in the file; presumably it
    case-folds both registers in the case-insensitive builds and expands
    to nothing otherwise -- confirm. */
1444 TOLOWER (%xmm1, %xmm2)
 /* %xmm2 = 0xff in every byte position where %xmm1 and %xmm2 agree. */
1445 pcmpeqb %xmm1, D(%xmm2)
 /* Byte-wise subtract %xmm0 from that equality mask. */
1446 psubb %xmm0, D(%xmm2)
 /* %r9d = one bit per byte of %xmm2 (the top bit of each byte). */
1447 pmovmskb %xmm2, %r9d
 /* Drop the low %cl bits of both masks, then compare them. */
1448 shr %cl, %edx
1449 shr %cl, %r9d
1450 sub %r9d, %edx
 /* Masks differ: less32bytes works out the result from %edx. */
1451 jnz LABEL(less32bytes)
 /* NOTE(review): %xmm3 is not referenced again by any instruction
    visible in this block. */
1452 movdqa (%rdi), %xmm3
1453
 /* strncmp-style builds only (empty otherwise): %r11 = %r11 + %rcx - 16,
    or return 0 through strcmp_exitz if that wraps or yields 0; see the
    macro definition near the top of the file. */
1454 UPDATE_STRNCMP_COUNTER
1455
1456 mov $16, %rcx /* index for loads */
1457 mov $13, %r9d /* byte position left over from less32bytes case */
1458 /*
1459 * Setup %r10 value allows us to detect crossing a page boundary.
1460 * When %r10 goes positive we have crossed a page boundary and
1461 * need to do a nibble.
1462 */
1463 lea 13(%rdi), %r10
1464 and $0xfff, %r10 /* offset into 4K page */
1465 sub $0x1000, %r10 /* subtract 4K pagesize */
1466
1467 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1468
1469 .p2align 4
1470LABEL(loop_ashr_13_use):
 /* Count the next 16 bytes; %r10 > 0 (signed) means a page boundary is
    being crossed and nibble_ashr_13_use has to run first. */
1471 add $16, %r10
1472 jg LABEL(nibble_ashr_13_use)
1473
1474LABEL(nibble_ashr_13_restart_use):
 /* %xmm0 = top 3 bytes of the previous aligned %rdi block followed by
    the low 13 bytes of the current one. */
1475 movdqa (%rdi, %rdx), %xmm0
1476 palignr $13, -16(%rdi, %rdx), D(%xmm0)
1477#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 /* 0x1a (described at the top of the file): %ecx = index of the first
    differing byte. */
1478 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1479#else
 /* Case-insensitive builds run both operands through TOLOWER first. */
1480 movdqa (%rsi,%rdx), %xmm1
1481 TOLOWER (%xmm0, %xmm1)
1482 pcmpistri $0x1a, %xmm1, %xmm0
1483#endif
 /* CF set (bytes differ) or ZF set (zero byte in the %rsi operand). */
1484 jbe LABEL(exit_use)
1485#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* 16 more bytes matched: return 0 once the count in %r11 is used up. */
1486 sub $16, %r11
1487 jbe LABEL(strcmp_exitz)
1488#endif
1489
1490 add $16, %rdx
1491 add $16, %r10
1492 jg LABEL(nibble_ashr_13_use)
1493
 /* Second, identical copy of the compare step (loop unrolled twice). */
1494 movdqa (%rdi, %rdx), %xmm0
1495 palignr $13, -16(%rdi, %rdx), D(%xmm0)
1496#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1497 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1498#else
1499 movdqa (%rsi,%rdx), %xmm1
1500 TOLOWER (%xmm0, %xmm1)
1501 pcmpistri $0x1a, %xmm1, %xmm0
1502#endif
1503 jbe LABEL(exit_use)
1504#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1505 sub $16, %r11
1506 jbe LABEL(strcmp_exitz)
1507#endif
1508 add $16, %rdx
1509 jmp LABEL(loop_ashr_13_use)
1510
1511 .p2align 4
1512LABEL(nibble_ashr_13_use):
 /*
  * Page-boundary path.  Before the next %rdi block is loaded, the 3
  * bytes of the previous block that have not been compared yet are
  * scanned for a zero byte.
  */
 /* Rewind the counter by one 4K page for the following pass. */
1513 sub $0x1000, %r10
1514 movdqa -16(%rdi, %rdx), %xmm0
 /* Keep only the 3 pending bytes, moved down to positions 0..2. */
1515 psrldq $13, D(%xmm0)
 /* Imm 0x3a with identical operands: %ecx = index of the first zero
    byte of %xmm0 (at most 3 here, since psrldq zero-filled the top). */
1516 pcmpistri $0x3a,%xmm0, %xmm0
1517#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* Also finish when that index is not below the remaining count %r11. */
1518 cmp %r11, %rcx
1519 jae LABEL(nibble_ashr_exit_use)
1520#endif
 /* %ecx > 2: all 3 pending bytes are non-zero, so resume the loop and
    load the next block. */
1521 cmp $2, %ecx
1522 ja LABEL(nibble_ashr_13_restart_use)
1523
 /* Otherwise a zero byte lies within the pending bytes. */
1524 jmp LABEL(nibble_ashr_exit_use)
1525
1526/*
1527 * The following cases will be handled by ashr_14
1528 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1529 * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14
1530 */
1531 .p2align 4
1532LABEL(ashr_14):
 /*
  * ashr_14 -- the 14th entry of LABEL(unaligned_table).  The %rdi data
  * is realigned with "palignr $14" so that it can be compared against
  * the 16-byte blocks at %rsi.  Three parts:
  *   - the code up to loop_ashr_14_use checks the data already held in
  *     %xmm1/%xmm2 and sets up the loop registers;
  *   - loop_ashr_14_use compares 16 bytes per step (unrolled twice);
  *   - nibble_ashr_14_use handles the page-boundary case flagged by %r10.
  * NOTE(review): %xmm0, %xmm1, %xmm2, %edx, %cl, %rdi, %rsi (and %r11
  * in the strncmp-style builds) are set up before this label, outside
  * this block; what they hold is not visible here -- confirm against
  * the dispatch code.
  */
 /* Shift %xmm2 left by 2 (= 16 - 14) bytes, zero-filling the low bytes. */
1533 pslldq $2, D(%xmm2)
 /* NOTE(review): TOLOWER is defined earlier in the file; presumably it
    case-folds both registers in the case-insensitive builds and expands
    to nothing otherwise -- confirm. */
1534 TOLOWER (%xmm1, %xmm2)
 /* %xmm2 = 0xff in every byte position where %xmm1 and %xmm2 agree. */
1535 pcmpeqb %xmm1, D(%xmm2)
 /* Byte-wise subtract %xmm0 from that equality mask. */
1536 psubb %xmm0, D(%xmm2)
 /* %r9d = one bit per byte of %xmm2 (the top bit of each byte). */
1537 pmovmskb %xmm2, %r9d
 /* Drop the low %cl bits of both masks, then compare them. */
1538 shr %cl, %edx
1539 shr %cl, %r9d
1540 sub %r9d, %edx
 /* Masks differ: less32bytes works out the result from %edx. */
1541 jnz LABEL(less32bytes)
 /* NOTE(review): %xmm3 is not referenced again by any instruction
    visible in this block. */
1542 movdqa (%rdi), %xmm3
1543
 /* strncmp-style builds only (empty otherwise): %r11 = %r11 + %rcx - 16,
    or return 0 through strcmp_exitz if that wraps or yields 0; see the
    macro definition near the top of the file. */
1544 UPDATE_STRNCMP_COUNTER
1545
1546 mov $16, %rcx /* index for loads */
1547 mov $14, %r9d /* byte position left over from less32bytes case */
1548 /*
1549 * Setup %r10 value allows us to detect crossing a page boundary.
1550 * When %r10 goes positive we have crossed a page boundary and
1551 * need to do a nibble.
1552 */
1553 lea 14(%rdi), %r10
1554 and $0xfff, %r10 /* offset into 4K page */
1555 sub $0x1000, %r10 /* subtract 4K pagesize */
1556
1557 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1558
1559 .p2align 4
1560LABEL(loop_ashr_14_use):
 /* Count the next 16 bytes; %r10 > 0 (signed) means a page boundary is
    being crossed and nibble_ashr_14_use has to run first. */
1561 add $16, %r10
1562 jg LABEL(nibble_ashr_14_use)
1563
1564LABEL(nibble_ashr_14_restart_use):
 /* %xmm0 = top 2 bytes of the previous aligned %rdi block followed by
    the low 14 bytes of the current one. */
1565 movdqa (%rdi, %rdx), %xmm0
1566 palignr $14, -16(%rdi, %rdx), D(%xmm0)
1567#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 /* 0x1a (described at the top of the file): %ecx = index of the first
    differing byte. */
1568 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1569#else
 /* Case-insensitive builds run both operands through TOLOWER first. */
1570 movdqa (%rsi,%rdx), %xmm1
1571 TOLOWER (%xmm0, %xmm1)
1572 pcmpistri $0x1a, %xmm1, %xmm0
1573#endif
 /* CF set (bytes differ) or ZF set (zero byte in the %rsi operand). */
1574 jbe LABEL(exit_use)
1575#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* 16 more bytes matched: return 0 once the count in %r11 is used up. */
1576 sub $16, %r11
1577 jbe LABEL(strcmp_exitz)
1578#endif
1579
1580 add $16, %rdx
1581 add $16, %r10
1582 jg LABEL(nibble_ashr_14_use)
1583
 /* Second, identical copy of the compare step (loop unrolled twice). */
1584 movdqa (%rdi, %rdx), %xmm0
1585 palignr $14, -16(%rdi, %rdx), D(%xmm0)
1586#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1587 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1588#else
1589 movdqa (%rsi,%rdx), %xmm1
1590 TOLOWER (%xmm0, %xmm1)
1591 pcmpistri $0x1a, %xmm1, %xmm0
1592#endif
1593 jbe LABEL(exit_use)
1594#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1595 sub $16, %r11
1596 jbe LABEL(strcmp_exitz)
1597#endif
1598 add $16, %rdx
1599 jmp LABEL(loop_ashr_14_use)
1600
1601 .p2align 4
1602LABEL(nibble_ashr_14_use):
 /*
  * Page-boundary path.  Before the next %rdi block is loaded, the 2
  * bytes of the previous block that have not been compared yet are
  * scanned for a zero byte.
  */
 /* Rewind the counter by one 4K page for the following pass. */
1603 sub $0x1000, %r10
1604 movdqa -16(%rdi, %rdx), %xmm0
 /* Keep only the 2 pending bytes, moved down to positions 0..1. */
1605 psrldq $14, D(%xmm0)
 /* Imm 0x3a with identical operands: %ecx = index of the first zero
    byte of %xmm0 (at most 2 here, since psrldq zero-filled the top). */
1606 pcmpistri $0x3a,%xmm0, %xmm0
1607#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* Also finish when that index is not below the remaining count %r11. */
1608 cmp %r11, %rcx
1609 jae LABEL(nibble_ashr_exit_use)
1610#endif
 /* %ecx > 1: both pending bytes are non-zero, so resume the loop and
    load the next block. */
1611 cmp $1, %ecx
1612 ja LABEL(nibble_ashr_14_restart_use)
1613
 /* Otherwise a zero byte lies within the pending bytes. */
1614 jmp LABEL(nibble_ashr_exit_use)
1615
1616/*
1617 * The following cases will be handled by ashr_15
1618 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1619 * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15
1620 */
1621 .p2align 4
1622LABEL(ashr_15):
 /*
  * ashr_15 -- the 15th entry of LABEL(unaligned_table).  The %rdi data
  * is realigned with "palignr $15" so that it can be compared against
  * the 16-byte blocks at %rsi.  Three parts:
  *   - the code up to loop_ashr_15_use checks the data already held in
  *     %xmm1/%xmm2 and sets up the loop registers;
  *   - loop_ashr_15_use compares 16 bytes per step (unrolled twice);
  *   - nibble_ashr_15_use handles the page-boundary case flagged by %r10
  *     and, unlike the other cases, ends by falling through into the
  *     label that follows this block instead of jumping to it.
  * NOTE(review): %xmm0, %xmm1, %xmm2, %edx, %cl, %rdi, %rsi (and %r11
  * in the strncmp-style builds) are set up before this label, outside
  * this block; what they hold is not visible here -- confirm against
  * the dispatch code.
  */
 /* Shift %xmm2 left by 1 (= 16 - 15) byte, zero-filling the low byte. */
1623 pslldq $1, D(%xmm2)
 /* NOTE(review): TOLOWER is defined earlier in the file; presumably it
    case-folds both registers in the case-insensitive builds and expands
    to nothing otherwise -- confirm. */
1624 TOLOWER (%xmm1, %xmm2)
 /* %xmm2 = 0xff in every byte position where %xmm1 and %xmm2 agree. */
1625 pcmpeqb %xmm1, D(%xmm2)
 /* Byte-wise subtract %xmm0 from that equality mask. */
1626 psubb %xmm0, D(%xmm2)
 /* %r9d = one bit per byte of %xmm2 (the top bit of each byte). */
1627 pmovmskb %xmm2, %r9d
 /* Drop the low %cl bits of both masks, then compare them. */
1628 shr %cl, %edx
1629 shr %cl, %r9d
1630 sub %r9d, %edx
 /* Masks differ: less32bytes works out the result from %edx. */
1631 jnz LABEL(less32bytes)
1632
 /* NOTE(review): %xmm3 is not referenced again by any instruction
    visible in this block. */
1633 movdqa (%rdi), %xmm3
1634
 /* strncmp-style builds only (empty otherwise): %r11 = %r11 + %rcx - 16,
    or return 0 through strcmp_exitz if that wraps or yields 0; see the
    macro definition near the top of the file. */
1635 UPDATE_STRNCMP_COUNTER
1636
1637 mov $16, %rcx /* index for loads */
1638 mov $15, %r9d /* byte position left over from less32bytes case */
1639 /*
1640 * Setup %r10 value allows us to detect crossing a page boundary.
1641 * When %r10 goes positive we have crossed a page boundary and
1642 * need to do a nibble.
1643 */
1644 lea 15(%rdi), %r10
1645 and $0xfff, %r10 /* offset into 4K page */
1646
1647 sub $0x1000, %r10 /* subtract 4K pagesize */
1648
1649 mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
1650
1651 .p2align 4
1652LABEL(loop_ashr_15_use):
 /* Count the next 16 bytes; %r10 > 0 (signed) means a page boundary is
    being crossed and nibble_ashr_15_use has to run first. */
1653 add $16, %r10
1654 jg LABEL(nibble_ashr_15_use)
1655
1656LABEL(nibble_ashr_15_restart_use):
 /* %xmm0 = top byte of the previous aligned %rdi block followed by the
    low 15 bytes of the current one. */
1657 movdqa (%rdi, %rdx), %xmm0
1658 palignr $15, -16(%rdi, %rdx), D(%xmm0)
1659#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
 /* 0x1a (described at the top of the file): %ecx = index of the first
    differing byte. */
1660 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1661#else
 /* Case-insensitive builds run both operands through TOLOWER first. */
1662 movdqa (%rsi,%rdx), %xmm1
1663 TOLOWER (%xmm0, %xmm1)
1664 pcmpistri $0x1a, %xmm1, %xmm0
1665#endif
 /* CF set (bytes differ) or ZF set (zero byte in the %rsi operand). */
1666 jbe LABEL(exit_use)
1667#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* 16 more bytes matched: return 0 once the count in %r11 is used up. */
1668 sub $16, %r11
1669 jbe LABEL(strcmp_exitz)
1670#endif
1671
1672 add $16, %rdx
1673 add $16, %r10
1674 jg LABEL(nibble_ashr_15_use)
1675
 /* Second, identical copy of the compare step (loop unrolled twice). */
1676 movdqa (%rdi, %rdx), %xmm0
1677 palignr $15, -16(%rdi, %rdx), D(%xmm0)
1678#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1679 pcmpistri $0x1a, (%rsi,%rdx), %xmm0
1680#else
1681 movdqa (%rsi,%rdx), %xmm1
1682 TOLOWER (%xmm0, %xmm1)
1683 pcmpistri $0x1a, %xmm1, %xmm0
1684#endif
1685 jbe LABEL(exit_use)
1686#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1687 sub $16, %r11
1688 jbe LABEL(strcmp_exitz)
1689#endif
1690 add $16, %rdx
1691 jmp LABEL(loop_ashr_15_use)
1692
1693 .p2align 4
1694LABEL(nibble_ashr_15_use):
 /*
  * Page-boundary path.  Before the next %rdi block is loaded, the one
  * byte of the previous block that has not been compared yet is tested
  * for zero.
  */
 /* Rewind the counter by one 4K page for the following pass. */
1695 sub $0x1000, %r10
1696 movdqa -16(%rdi, %rdx), %xmm0
 /* Keep only the single pending byte, moved down to position 0. */
1697 psrldq $15, D(%xmm0)
 /* Imm 0x3a with identical operands: %ecx = index of the first zero
    byte of %xmm0 (0 or 1 here, since psrldq zero-filled bytes 1..15). */
1698 pcmpistri $0x3a,%xmm0, %xmm0
1699#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* Also finish when that index is not below the remaining count %r11. */
1700 cmp %r11, %rcx
1701 jae LABEL(nibble_ashr_exit_use)
1702#endif
 /* %ecx > 0: the pending byte is non-zero, so resume the loop and load
    the next block; otherwise execution continues with the code placed
    directly after this block. */
1703 cmp $0, %ecx
1704 ja LABEL(nibble_ashr_15_restart_use)
1705
1706LABEL(nibble_ashr_exit_use):
 /*
  * Common tail of the *_use loops.
  *   nibble_ashr_exit_use: entered from the nibble paths with %xmm0
  *     already loaded; does one last 0x1a compare against the block at
  *     (%rsi,%rdx) and continues into exit_use.
  *   exit_use: entered with the flags and %ecx of a 0x1a pcmpistri;
  *     returns 0 or the difference of the two deciding bytes in %eax.
  *   ret_use: final (optionally case-folded) subtraction.
  */
1707#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
1708 pcmpistri $0x1a,(%rsi,%rdx), %xmm0
1709#else
1710 movdqa (%rsi,%rdx), %xmm1
1711 TOLOWER (%xmm0, %xmm1)
1712 pcmpistri $0x1a, %xmm1, %xmm0
1713#endif
1714 .p2align 4
1715LABEL(exit_use):
 /* CF clear: pcmpistri reported no differing byte, so the result is 0. */
1716 jnc LABEL(strcmp_exitz)
1717#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* %rcx = index of the differing byte; if it is not below the remaining
    count in %r11 the strings are equal within the limit. */
1718 sub %rcx, %r11
1719 jbe LABEL(strcmp_exitz)
1720#endif
 /* %rdx = byte index of the difference relative to %rsi. */
1721 add %rcx, %rdx
 /* %rdi += %r9 - 16, so that (%rdi,%rdx) addresses the byte that was
    compared with (%rsi,%rdx); %r9 is the shift amount each ashr_N case
    stores before entering its loop. */
1722 lea -16(%rdi, %r9), %rdi
1723 movzbl (%rdi, %rdx), %eax
1724 movzbl (%rsi, %rdx), %edx
 /* Non-zero %r8d: exchange the two bytes before subtracting.
    NOTE(review): %r8d is set outside the lines shown; less32bytes
    describes it as the flag for recovering the original operand order. */
1725 test %r8d, %r8d
1726 jz LABEL(ret_use)
1727 xchg %eax, %edx
1728LABEL(ret_use):
1729#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 /* Replace both bytes by their entries in the table of 4-byte values
    at _nl_C_LC_CTYPE_tolower + 128*4 (presumably the C-locale tolower
    map -- confirm). */
1730 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
1731 movl (%rcx,%rdx,4), %edx
1732 movl (%rcx,%rax,4), %eax
1733#endif
1734
 /* Return value: difference of the two bytes. */
1735 sub %edx, %eax
1736 ret
1737
1738LABEL(less32bytes):
 /*
  * Exit path taken by the ashr_N prologues when their two byte masks
  * differ.  Rebuilds the exact string pointers from the block
  * addresses plus the offsets in %rax/%rcx, undoes the operand swap if
  * %r8d is set, and then shares the less16bytes tail, which turns the
  * lowest set bit of %rdx into a byte index and returns the difference
  * of the two bytes at that index.
  */
1739 lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */
1740 lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */
1741 test %r8d, %r8d
1742 jz LABEL(ret)
1743 xchg %rsi, %rdi /* recover original order according to flag(%r8d) */
1744
1745 .p2align 4
1746LABEL(ret):
1747LABEL(less16bytes):
 /* NOTE(review): bsf leaves its destination undefined for a zero
    source; the jumps visible in this part of the file arrive here via
    jnz, i.e. with %edx non-zero -- confirm for callers further up. */
1748 bsf %rdx, %rdx /* find and store bit index in %rdx */
1749
1750#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* Index not below the remaining count in %r11: equal within the limit. */
1751 sub %rdx, %r11
1752 jbe LABEL(strcmp_exitz)
1753#endif
1754 movzbl (%rsi, %rdx), %ecx
1755 movzbl (%rdi, %rdx), %eax
1756
1757#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 /* Replace both bytes by their entries in the table of 4-byte values
    at _nl_C_LC_CTYPE_tolower + 128*4 (presumably the C-locale tolower
    map -- confirm). */
1758 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
1759 movl (%rdx,%rcx,4), %ecx
1760 movl (%rdx,%rax,4), %eax
1761#endif
1762
 /* Return value: byte at (%rdi) minus byte at (%rsi). */
1763 sub %ecx, %eax
1764 ret
1765
1766LABEL(strcmp_exitz):
 /* Shared "strings compare equal" exit: return 0 in %eax. */
1767 xor %eax, %eax
1768 ret
1769
1770 .p2align 4
1771 // XXX Same as code above
1772LABEL(Byte0):
 /*
  * Returns the difference of the bytes at (%rdi) and (%rsi), after
  * mapping both through the tolower table in the case-insensitive
  * builds.  The jumps to this label are outside the lines shown here.
  */
1773 movzx (%rsi), %ecx
1774 movzx (%rdi), %eax
1775
1776#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 /* Look both bytes up in the table of 4-byte entries at
    _nl_C_LC_CTYPE_tolower + 128*4. */
1777 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
1778 movl (%rdx,%rcx,4), %ecx
1779 movl (%rdx,%rax,4), %eax
1780#endif
1781
 /* Return value: byte at (%rdi) minus byte at (%rsi). */
1782 sub %ecx, %eax
1783 ret
 /* End of the STRCMP_SSE42 function: close its CFI region and record
    its size as the distance from the symbol to this point. */
1784 cfi_endproc
1785 .size STRCMP_SSE42, .-STRCMP_SSE42
1786
 /* Drop the helper macros defined earlier in this file for the code
    above (TOLOWER is the one used throughout the ashr_N cases). */
1787#undef UCLOW_reg
1788#undef UCHIGH_reg
1789#undef LCQWORD_reg
1790#undef TOLOWER
1791
1792 /* Put all SSE 4.2 functions together. */
 /*
  * Dispatch table for the unaligned cases, placed in a read-only data
  * section whose name ends in SECTION (defined near the top of the file
  * as avx or sse4.2).  It holds sixteen 32-bit entries, each the
  * distance from the table itself to one handler, so it contains no
  * absolute addresses.  Entries 0..14 select ashr_1..ashr_15 and the
  * last entry selects ashr_0.  The code that indexes the table is
  * outside the lines shown here.
  */
1793 .section .rodata.SECTION,"a",@progbits
1794 .p2align 3
1795LABEL(unaligned_table):
1796 .int LABEL(ashr_1) - LABEL(unaligned_table)
1797 .int LABEL(ashr_2) - LABEL(unaligned_table)
1798 .int LABEL(ashr_3) - LABEL(unaligned_table)
1799 .int LABEL(ashr_4) - LABEL(unaligned_table)
1800 .int LABEL(ashr_5) - LABEL(unaligned_table)
1801 .int LABEL(ashr_6) - LABEL(unaligned_table)
1802 .int LABEL(ashr_7) - LABEL(unaligned_table)
1803 .int LABEL(ashr_8) - LABEL(unaligned_table)
1804 .int LABEL(ashr_9) - LABEL(unaligned_table)
1805 .int LABEL(ashr_10) - LABEL(unaligned_table)
1806 .int LABEL(ashr_11) - LABEL(unaligned_table)
1807 .int LABEL(ashr_12) - LABEL(unaligned_table)
1808 .int LABEL(ashr_13) - LABEL(unaligned_table)
1809 .int LABEL(ashr_14) - LABEL(unaligned_table)
1810 .int LABEL(ashr_15) - LABEL(unaligned_table)
1811 .int LABEL(ashr_0) - LABEL(unaligned_table)
1812
 /*
  * Remove every preprocessor name this file may have defined: the
  * label/section helpers from the top of the file and the instruction
  * names and D() wrapper used in the code above.
  * NOTE(review): presumably this lets the file be included again with a
  * different configuration (e.g. USE_AVX) -- confirm against the files
  * that include it.
  */
1813#undef LABEL
1814#undef GLABEL
1815#undef SECTION
1816#undef movdqa
1817#undef movdqu
1818#undef pmovmskb
1819#undef pcmpistri
1820#undef psubb
1821#undef pcmpeqb
1822#undef psrldq
1823#undef pslldq
1824#undef palignr
1825#undef pxor
1826#undef D
1827