1/* Highly optimized version for x86-64.
2 Copyright (C) 1999-2018 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Based on i686 version contributed by Ulrich Drepper
5 <drepper@cygnus.com>, 1999.
6 Updated with SSE2 support contributed by Intel Corporation.
7
8 The GNU C Library is free software; you can redistribute it and/or
9 modify it under the terms of the GNU Lesser General Public
10 License as published by the Free Software Foundation; either
11 version 2.1 of the License, or (at your option) any later version.
12
13 The GNU C Library is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public
19 License along with the GNU C Library; if not, see
20 <http://www.gnu.org/licenses/>. */
21
22#include <sysdep.h>
23#include "asm-syntax.h"
24
#undef UPDATE_STRNCMP_COUNTER

#if !defined LABEL
# define LABEL(l) L(l)
#endif

/* Build variants, in order of precedence: USE_AS_STRNCMP,
   USE_AS_STRCASECMP_L, USE_AS_STRNCASECMP_L.  With none of them defined
   the file builds the plain function, whose name is STRCMP (strcmp
   unless the includer already chose another name).  */

/* Only the two locale-aware variants pull in locale-defines.h.  */
#if !defined USE_AS_STRNCMP \
    && (defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L)
# include "locale-defines.h"
#endif

/* UPDATE_STRNCMP_COUNTER is expanded by each alignment case right after
   its first, partial 16-byte block compared equal.  In the
   length-limited variants it replaces the remaining count in %r11 by
   %r11 + %rcx - 16.  Since the counter is unsigned, it branches to
   strcmp_exitz if the new counter > the old one or is 0.  It clobbers
   %r9 and the flags.  In the other variants it expands to nothing.  */
#if defined USE_AS_STRNCMP \
    || (!defined USE_AS_STRCASECMP_L && defined USE_AS_STRNCASECMP_L)
# define UPDATE_STRNCMP_COUNTER				\
	/* %r9 = count left after this block */		\
	lea	-16(%rcx, %r11), %r9;			\
	cmp	%r9, %r11;				\
	jb	LABEL(strcmp_exitz);			\
	test	%r9, %r9;				\
	jz	LABEL(strcmp_exitz);			\
	mov	%r9, %r11
#else
# define UPDATE_STRNCMP_COUNTER
#endif

/* Default name of the plain variant.  */
#if !defined USE_AS_STRNCMP && !defined USE_AS_STRCASECMP_L \
    && !defined USE_AS_STRNCASECMP_L && !defined STRCMP
# define STRCMP strcmp
#endif
64
/* The SSSE3 build is placed in its own section; every other build goes
   into plain .text.  */
#ifdef USE_SSSE3
	.section .text.ssse3,"ax",@progbits
#else
	.text
#endif
70
#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
/* The includer may supply its own ENTRY2/END2; otherwise the stubs
   below use the ordinary ENTRY/END.  */
# ifndef ENTRY2
#  define ENTRY2(name) ENTRY (name)
#  define END2(name) END (name)
# endif
#endif

/* Non-_l entry points.  Each one loads the thread's locale pointer
   (__libc_tsd_LOCALE, read relative to %fs) into the register from
   which the STRCMP entry that follows reads its locale argument, and
   then falls through into that entry instead of calling it.  */
#ifdef USE_AS_STRCASECMP_L
ENTRY2 (__strcasecmp)
	movq	__libc_tsd_LOCALE@gottpoff(%rip), %rax
	mov	%fs:(%rax), %RDX_LP	/* locale is read from %rdx below */

	/* XXX 5 byte should be before the function */
	/* 5-byte NOP.  */
	.byte	0x0f, 0x1f, 0x44, 0x00, 0x00
END2 (__strcasecmp)
# ifndef NO_NOLOCALE_ALIAS
weak_alias (__strcasecmp, strcasecmp)
libc_hidden_def (__strcasecmp)
# endif
	/* FALLTHROUGH to strcasecmp_l.  */
#elif defined USE_AS_STRNCASECMP_L
ENTRY2 (__strncasecmp)
	movq	__libc_tsd_LOCALE@gottpoff(%rip), %rax
	mov	%fs:(%rax), %RCX_LP	/* locale is read from %rcx below */

	/* XXX 5 byte should be before the function */
	/* 5-byte NOP.  */
	.byte	0x0f, 0x1f, 0x44, 0x00, 0x00
END2 (__strncasecmp)
# ifndef NO_NOLOCALE_ALIAS
weak_alias (__strncasecmp, strncasecmp)
libc_hidden_def (__strncasecmp)
# endif
	/* FALLTHROUGH to strncasecmp_l.  */
#endif
110
ENTRY (STRCMP)
#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
/* The two case-insensitive variants differ only in the register that
   carries the locale argument and in the fallback they jump to.  */
# ifdef USE_AS_STRCASECMP_L
#  define STRCMP_LOCALE_REG	%rdx
#  define STRCMP_NONASCII	__strcasecmp_l_nonascii
# else
#  define STRCMP_LOCALE_REG	%rcx
#  define STRCMP_NONASCII	__strncasecmp_l_nonascii
# endif
	/* We have to fall back on the C implementation for locales
	   with encodings not matching ASCII for single bytes.  */
# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
	mov	LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(STRCMP_LOCALE_REG), %RAX_LP
# else
	mov	(STRCMP_LOCALE_REG), %RAX_LP
# endif
	/* Bit 0 of the _NL_CTYPE_NONASCII_CASE value selects the fallback.  */
	testl	$1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
	jnz	STRCMP_NONASCII
# undef STRCMP_LOCALE_REG
# undef STRCMP_NONASCII
#endif
133
/*
 * Common body.  SSE instructions compare up to 16 bytes per step.
 */
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	/* Length-limited variants: a count of zero branches to
	   strcmp_exitz and a count of one to Byte0 (both labels live
	   outside this section).  Any other count is kept in %r11.  */
	test	%RDX_LP, %RDX_LP
	jz	LABEL(strcmp_exitz)
	cmp	$1, %RDX_LP
	jz	LABEL(Byte0)
	mov	%RDX_LP, %R11_LP
#endif
	mov	%esi, %ecx
	mov	%edi, %eax
	/* Use 64bit AND here to avoid long NOP padding.  */
	and	$0x3f, %rcx		/* offset of %rsi in its cache line */
	and	$0x3f, %rax		/* offset of %rdi in its cache line */

#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
	/* Per-byte constants for case folding: a byte that is greater
	   than 0x40 and less than 0x5b gets 0x20 ORed into it.  */
	.section .rodata.cst16,"aM",@progbits,16
	.align 16
.Lbelowupper:
	.quad	0x4040404040404040, 0x4040404040404040
.Ltopupper:
	.quad	0x5b5b5b5b5b5b5b5b, 0x5b5b5b5b5b5b5b5b
.Ltouppermask:
	.quad	0x2020202020202020, 0x2020202020202020
	.previous
	/* The constants stay in %xmm5-%xmm7 for every later TOLOWER.  */
# define UCLOW_reg	%xmm5
# define UCHIGH_reg	%xmm6
# define LCQWORD_reg	%xmm7
	movdqa	.Lbelowupper(%rip), UCLOW_reg
	movdqa	.Ltopupper(%rip), UCHIGH_reg
	movdqa	.Ltouppermask(%rip), LCQWORD_reg
#endif

	cmp	$0x30, %ecx
	ja	LABEL(crosscache)	/* rsi: 16-byte load will cross cache line */
	cmp	$0x30, %eax
	ja	LABEL(crosscache)	/* rdi: 16-byte load will cross cache line */
	/* Load the first 16 bytes of each string, eight bytes at a time.  */
	movlpd	(%rdi), %xmm1
	movlpd	(%rsi), %xmm2
	movhpd	8(%rdi), %xmm1
	movhpd	8(%rsi), %xmm2

/* TOLOWER (va, vb) folds the bytes of both vector registers in place:
   every byte that is signed-greater than the UCLOW_reg byte and
   signed-less than the UCHIGH_reg byte has the LCQWORD_reg bit ORed in.
   Clobbers %xmm8-%xmm11.  It expands to nothing in the case-sensitive
   variants.  */
#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
# define TOLOWER(va, vb)			\
	movdqa	va, %xmm8;			\
	movdqa	UCHIGH_reg, %xmm9;		\
	movdqa	vb, %xmm10;			\
	movdqa	UCHIGH_reg, %xmm11;		\
	pcmpgtb	UCLOW_reg, %xmm8;		\
	pcmpgtb	va, %xmm9;			\
	pcmpgtb	UCLOW_reg, %xmm10;		\
	pcmpgtb	vb, %xmm11;			\
	pand	%xmm9, %xmm8;			\
	pand	%xmm11, %xmm10;			\
	pand	LCQWORD_reg, %xmm8;		\
	pand	LCQWORD_reg, %xmm10;		\
	por	%xmm8, va;			\
	por	%xmm10, vb
#else
# define TOLOWER(va, vb)
#endif
	TOLOWER (%xmm1, %xmm2)

	pxor	%xmm0, %xmm0		/* clear %xmm0 for null char checks */
	pcmpeqb	%xmm1, %xmm0		/* 0xff in every byte that is a null char */
	pcmpeqb	%xmm2, %xmm1		/* 0xff in every byte where the strings agree */
	psubb	%xmm0, %xmm1		/* top bit survives only for equal, non-null bytes */
	pmovmskb %xmm1, %edx
	sub	$0xffff, %edx		/* zero iff all 16 bytes are equal and non-null */
	jne	LABEL(less16bytes)	/* otherwise locate the difference or null char */
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	sub	$16, %r11
	jbe	LABEL(strcmp_exitz)	/* count used up: comparison finished */
#endif
	add	$16, %rsi		/* prepare to search next 16 bytes */
	add	$16, %rdi		/* prepare to search next 16 bytes */
210
/*
 * Round both pointers down to a 16-byte boundary and pick the alignment
 * case from the difference between the two offsets inside their 16-byte
 * blocks.  On leaving this section:
 *	%ecx	offset of %rsi, %eax offset of %rdi (after the swap below)
 *	%edx	0xffff
 *	%r8d	0, or 0xffff when %rsi and %rdi were swapped so that the
 *		pointer with the larger offset is in %rsi
 */
	.p2align 4
LABEL(crosscache):
	and	$-16, %rsi		/* force %rsi to be 16-byte aligned */
	and	$-16, %rdi		/* force %rdi to be 16-byte aligned */
	mov	$0xffff, %edx		/* for equivalent offset */
	xor	%r8d, %r8d
	and	$0xf, %ecx		/* offset of rsi */
	and	$0xf, %eax		/* offset of rdi */
	cmp	%eax, %ecx
	jz	LABEL(ashr_0)		/* rsi and rdi relative offset same */
	ja	LABEL(bigger)
	mov	%edx, %r8d		/* r8d is offset flag for exit tail */
	xchg	%ecx, %eax
	xchg	%rsi, %rdi
LABEL(bigger):
	/* Table index = 15 + (offset of rdi) - (offset of rsi).  Each
	   entry is a 32-bit offset relative to the start of the table.  */
	lea	15(%rax), %r9
	sub	%rcx, %r9
	lea	LABEL(unaligned_table)(%rip), %r10
	movslq	(%r10, %r9, 4), %r9
	lea	(%r10, %r9), %r10
	jmp	*%r10			/* jump to corresponding case */
237
/*
 * ashr_0: both strings have the same offset inside their 16-byte blocks.
 * rcx(offset of rsi)  rax(offset of rdi)  relative offset  case
 *	n(0~15)		n(0~15)		15(15+ n-n)	 ashr_0
 */
	.p2align 4
LABEL(ashr_0):

	movdqa	(%rsi), %xmm1
	pxor	%xmm0, %xmm0		/* clear %xmm0 for null char check */
	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
	movdqa	(%rdi), %xmm2
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm2, %xmm1		/* compare 16 bytes for equality */
#else
	pcmpeqb	(%rdi), %xmm1		/* compare 16 bytes for equality */
#endif
	psubb	%xmm0, %xmm1		/* top bit survives only for equal, non-null bytes */
	pmovmskb %xmm1, %r9d
	/* Drop the mask bits of the bytes in front of the strings.  The
	   shifted 0xffff in %edx and the shifted result in %r9d are
	   equal exactly when every remaining byte of the block matches
	   and none of them is a null char.  */
	shr	%cl, %edx		/* adjust 0xffff for offset */
	shr	%cl, %r9d		/* adjust for 16-byte offset */
	sub	%r9d, %edx
	jnz	LABEL(less32bytes)	/* mismatch or null char */
	UPDATE_STRNCMP_COUNTER
	mov	$16, %rcx		/* index of the next block */
	mov	$16, %r9
	pxor	%xmm0, %xmm0		/* clear xmm0, may have changed above */

/* One aligned 16-byte step of the loop below.  %xmm0 must be zero on
   entry and is still zero when the step falls through.  A mismatch or a
   null char leaves through `exit'; in the length-limited variants a
   count that is used up leaves through strcmp_exitz.  Otherwise %rcx
   advances by 16.  */
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
# define ASHR_0_COUNT16			\
	sub	$16, %r11;		\
	jbe	LABEL(strcmp_exitz)
#else
# define ASHR_0_COUNT16
#endif
#define ASHR_0_STEP			\
	movdqa	(%rsi, %rcx), %xmm1;	\
	movdqa	(%rdi, %rcx), %xmm2;	\
	TOLOWER (%xmm1, %xmm2);		\
	pcmpeqb	%xmm1, %xmm0;		\
	pcmpeqb	%xmm2, %xmm1;		\
	psubb	%xmm0, %xmm1;		\
	pmovmskb %xmm1, %edx;		\
	sub	$0xffff, %edx;		\
	jne	LABEL(exit);		\
	ASHR_0_COUNT16;			\
	add	$16, %rcx

	/*
	 * Now both strings are aligned at 16-byte boundary.  Loop over
	 * strings checking 32 bytes per iteration.
	 */
	.p2align 4
LABEL(loop_ashr_0):
	ASHR_0_STEP
	ASHR_0_STEP
	jmp	LABEL(loop_ashr_0)

#undef ASHR_0_STEP
#undef ASHR_0_COUNT16
309
/*
 * ashr_1: the %rsi offset is 15 bytes larger than the %rdi offset.
 * rcx(offset of rsi)  rax(offset of rdi)  relative offset  case
 *	n(15)		n -15		0(15 +(n-15) - n)  ashr_1
 */
	.p2align 4
LABEL(ashr_1):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
	pslldq	$15, %xmm2		/* line the %rdi bytes up with the %rsi bytes */
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, %xmm2		/* compare 16 bytes for equality */
	psubb	%xmm0, %xmm2		/* top bit survives only for equal, non-null bytes */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx		/* adjust 0xffff for offset */
	shr	%cl, %r9d		/* adjust for 16-byte offset */
	sub	%r9d, %edx
	jne	LABEL(less32bytes)	/* mismatch or null char seen */
	movdqa	(%rdi), %xmm3		/* first %rdi block becomes the carried block */
	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$1, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	1(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */

/* One 16-byte step of the loop below.  The last 15 bytes of the carried
   %rdi block in %xmm3 and the first byte of the block just loaded are
   merged into %xmm2 and compared with the %rsi block.  %xmm0 must be
   zero on entry and is still zero when the step falls through.  A
   mismatch or a null char leaves through `exit'; in the length-limited
   variants a count that is used up leaves through strcmp_exitz.
   Otherwise %rcx advances by 16 and the block just loaded becomes the
   carried block.  */
#ifdef USE_SSSE3
# define ASHR_1_MERGE			\
	palignr	$1, %xmm3, %xmm2
#else
# define ASHR_1_MERGE			\
	psrldq	$1, %xmm3;		\
	pslldq	$15, %xmm2;		\
	por	%xmm3, %xmm2
#endif
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
# define ASHR_1_COUNT16			\
	sub	$16, %r11;		\
	jbe	LABEL(strcmp_exitz)
#else
# define ASHR_1_COUNT16
#endif
#define ASHR_1_STEP			\
	movdqa	(%rsi, %rcx), %xmm1;	\
	movdqa	(%rdi, %rcx), %xmm2;	\
	movdqa	%xmm2, %xmm4;		\
	ASHR_1_MERGE;			\
	TOLOWER (%xmm1, %xmm2);		\
	pcmpeqb	%xmm1, %xmm0;		\
	pcmpeqb	%xmm2, %xmm1;		\
	psubb	%xmm0, %xmm1;		\
	pmovmskb %xmm1, %edx;		\
	sub	$0xffff, %edx;		\
	jne	LABEL(exit);		\
	ASHR_1_COUNT16;			\
	add	$16, %rcx;		\
	movdqa	%xmm4, %xmm3

	.p2align 4
LABEL(loop_ashr_1):
	add	$16, %r10
	jg	LABEL(nibble_ashr_1)	/* cross page boundary */

LABEL(gobble_ashr_1):
	ASHR_1_STEP

	add	$16, %r10
	jg	LABEL(nibble_ashr_1)	/* cross page boundary */

	ASHR_1_STEP
	jmp	LABEL(loop_ashr_1)

	/*
	 * Nibble avoids loads across page boundary.  This is to avoid a
	 * potential access into unmapped memory.  Only the bytes of the
	 * carried block that have not been compared yet (mask 0xfffe)
	 * are checked for a null char.
	 */
	.p2align 4
LABEL(nibble_ashr_1):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xfffe, %edx
	jne	LABEL(ashr_1_exittail)	/* found a null char */

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	$15, %r11
	jbe	LABEL(ashr_1_exittail)	/* count ends inside the carried block */
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* subtract 4K from %r10 */
	jmp	LABEL(gobble_ashr_1)

	/*
	 * Once find null char, determine if there is a string mismatch
	 * before the null char.
	 */
	.p2align 4
LABEL(ashr_1_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$1, %xmm0
	psrldq	$1, %xmm3
	jmp	LABEL(aftertail)

#undef ASHR_1_STEP
#undef ASHR_1_COUNT16
#undef ASHR_1_MERGE
439
/*
 * ashr_2: the %rsi offset is 14 bytes larger than the %rdi offset.
 * rcx(offset of rsi)  rax(offset of rdi)  relative offset  case
 *	n(14~15)	n -14		1(15 +(n-14) - n)  ashr_2
 */
	.p2align 4
LABEL(ashr_2):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
	pslldq	$14, %xmm2		/* line the %rdi bytes up with the %rsi bytes */
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, %xmm2		/* compare 16 bytes for equality */
	psubb	%xmm0, %xmm2		/* top bit survives only for equal, non-null bytes */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx		/* adjust 0xffff for offset */
	shr	%cl, %r9d		/* adjust for 16-byte offset */
	sub	%r9d, %edx
	jne	LABEL(less32bytes)	/* mismatch or null char seen */
	movdqa	(%rdi), %xmm3		/* first %rdi block becomes the carried block */
	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$2, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	2(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */

/* One 16-byte step of the loop below.  The last 14 bytes of the carried
   %rdi block in %xmm3 and the first 2 bytes of the block just loaded
   are merged into %xmm2 and compared with the %rsi block.  %xmm0 must
   be zero on entry and is still zero when the step falls through.  A
   mismatch or a null char leaves through `exit'; in the length-limited
   variants a count that is used up leaves through strcmp_exitz.
   Otherwise %rcx advances by 16 and the block just loaded becomes the
   carried block.  */
#ifdef USE_SSSE3
# define ASHR_2_MERGE			\
	palignr	$2, %xmm3, %xmm2
#else
# define ASHR_2_MERGE			\
	psrldq	$2, %xmm3;		\
	pslldq	$14, %xmm2;		\
	por	%xmm3, %xmm2
#endif
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
# define ASHR_2_COUNT16			\
	sub	$16, %r11;		\
	jbe	LABEL(strcmp_exitz)
#else
# define ASHR_2_COUNT16
#endif
#define ASHR_2_STEP			\
	movdqa	(%rsi, %rcx), %xmm1;	\
	movdqa	(%rdi, %rcx), %xmm2;	\
	movdqa	%xmm2, %xmm4;		\
	ASHR_2_MERGE;			\
	TOLOWER (%xmm1, %xmm2);		\
	pcmpeqb	%xmm1, %xmm0;		\
	pcmpeqb	%xmm2, %xmm1;		\
	psubb	%xmm0, %xmm1;		\
	pmovmskb %xmm1, %edx;		\
	sub	$0xffff, %edx;		\
	jne	LABEL(exit);		\
	ASHR_2_COUNT16;			\
	add	$16, %rcx;		\
	movdqa	%xmm4, %xmm3

	.p2align 4
LABEL(loop_ashr_2):
	add	$16, %r10
	jg	LABEL(nibble_ashr_2)	/* cross page boundary */

LABEL(gobble_ashr_2):
	ASHR_2_STEP

	add	$16, %r10
	jg	LABEL(nibble_ashr_2)	/* cross page boundary */

	ASHR_2_STEP
	jmp	LABEL(loop_ashr_2)

	/*
	 * Before loading past a page boundary, check the bytes of the
	 * carried block that have not been compared yet (mask 0xfffc)
	 * for a null char.
	 */
	.p2align 4
LABEL(nibble_ashr_2):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xfffc, %edx
	jne	LABEL(ashr_2_exittail)	/* found a null char */

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	$14, %r11
	jbe	LABEL(ashr_2_exittail)	/* count ends inside the carried block */
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* subtract 4K from %r10 */
	jmp	LABEL(gobble_ashr_2)

	/* Hand the %rsi block, the shifted null-char mask and the shifted
	   carried block to aftertail.  */
	.p2align 4
LABEL(ashr_2_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$2, %xmm0
	psrldq	$2, %xmm3
	jmp	LABEL(aftertail)

#undef ASHR_2_STEP
#undef ASHR_2_COUNT16
#undef ASHR_2_MERGE
563
/*
 * ashr_3: the %rsi offset is 13 bytes larger than the %rdi offset.
 * rcx(offset of rsi)  rax(offset of rdi)  relative offset  case
 *	n(13~15)	n -13		2(15 +(n-13) - n)  ashr_3
 */
	.p2align 4
LABEL(ashr_3):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
	pslldq	$13, %xmm2		/* line the %rdi bytes up with the %rsi bytes */
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, %xmm2		/* compare 16 bytes for equality */
	psubb	%xmm0, %xmm2		/* top bit survives only for equal, non-null bytes */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx		/* adjust 0xffff for offset */
	shr	%cl, %r9d		/* adjust for 16-byte offset */
	sub	%r9d, %edx
	jne	LABEL(less32bytes)	/* mismatch or null char seen */
	movdqa	(%rdi), %xmm3		/* first %rdi block becomes the carried block */

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$3, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	3(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */

/* One 16-byte step of the loop below.  The last 13 bytes of the carried
   %rdi block in %xmm3 and the first 3 bytes of the block just loaded
   are merged into %xmm2 and compared with the %rsi block.  %xmm0 must
   be zero on entry and is still zero when the step falls through.  A
   mismatch or a null char leaves through `exit'; in the length-limited
   variants a count that is used up leaves through strcmp_exitz.
   Otherwise %rcx advances by 16 and the block just loaded becomes the
   carried block.  */
#ifdef USE_SSSE3
# define ASHR_3_MERGE			\
	palignr	$3, %xmm3, %xmm2
#else
# define ASHR_3_MERGE			\
	psrldq	$3, %xmm3;		\
	pslldq	$13, %xmm2;		\
	por	%xmm3, %xmm2
#endif
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
# define ASHR_3_COUNT16			\
	sub	$16, %r11;		\
	jbe	LABEL(strcmp_exitz)
#else
# define ASHR_3_COUNT16
#endif
#define ASHR_3_STEP			\
	movdqa	(%rsi, %rcx), %xmm1;	\
	movdqa	(%rdi, %rcx), %xmm2;	\
	movdqa	%xmm2, %xmm4;		\
	ASHR_3_MERGE;			\
	TOLOWER (%xmm1, %xmm2);		\
	pcmpeqb	%xmm1, %xmm0;		\
	pcmpeqb	%xmm2, %xmm1;		\
	psubb	%xmm0, %xmm1;		\
	pmovmskb %xmm1, %edx;		\
	sub	$0xffff, %edx;		\
	jne	LABEL(exit);		\
	ASHR_3_COUNT16;			\
	add	$16, %rcx;		\
	movdqa	%xmm4, %xmm3

	.p2align 4
LABEL(loop_ashr_3):
	add	$16, %r10
	jg	LABEL(nibble_ashr_3)	/* cross page boundary */

LABEL(gobble_ashr_3):
	ASHR_3_STEP

	add	$16, %r10
	jg	LABEL(nibble_ashr_3)	/* cross page boundary */

	ASHR_3_STEP
	jmp	LABEL(loop_ashr_3)

	/*
	 * Before loading past a page boundary, check the bytes of the
	 * carried block that have not been compared yet (mask 0xfff8)
	 * for a null char.
	 */
	.p2align 4
LABEL(nibble_ashr_3):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xfff8, %edx
	jne	LABEL(ashr_3_exittail)	/* found a null char */

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	$13, %r11
	jbe	LABEL(ashr_3_exittail)	/* count ends inside the carried block */
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* subtract 4K from %r10 */
	jmp	LABEL(gobble_ashr_3)

	/* Hand the %rsi block, the shifted null-char mask and the shifted
	   carried block to aftertail.  */
	.p2align 4
LABEL(ashr_3_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$3, %xmm0
	psrldq	$3, %xmm3
	jmp	LABEL(aftertail)

#undef ASHR_3_STEP
#undef ASHR_3_COUNT16
#undef ASHR_3_MERGE
688
/*
 * ashr_4: the %rsi offset is 12 bytes larger than the %rdi offset.
 * rcx(offset of rsi)  rax(offset of rdi)  relative offset  case
 *	n(12~15)	n -12		3(15 +(n-12) - n)  ashr_4
 */
	.p2align 4
LABEL(ashr_4):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
	pslldq	$12, %xmm2		/* line the %rdi bytes up with the %rsi bytes */
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, %xmm2		/* compare 16 bytes for equality */
	psubb	%xmm0, %xmm2		/* top bit survives only for equal, non-null bytes */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx		/* adjust 0xffff for offset */
	shr	%cl, %r9d		/* adjust for 16-byte offset */
	sub	%r9d, %edx
	jne	LABEL(less32bytes)	/* mismatch or null char seen */
	movdqa	(%rdi), %xmm3		/* first %rdi block becomes the carried block */

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$4, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	4(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */

/* One 16-byte step of the loop below.  The last 12 bytes of the carried
   %rdi block in %xmm3 and the first 4 bytes of the block just loaded
   are merged into %xmm2 and compared with the %rsi block.  %xmm0 must
   be zero on entry and is still zero when the step falls through.  A
   mismatch or a null char leaves through `exit'; in the length-limited
   variants a count that is used up leaves through strcmp_exitz.
   Otherwise %rcx advances by 16 and the block just loaded becomes the
   carried block.  */
#ifdef USE_SSSE3
# define ASHR_4_MERGE			\
	palignr	$4, %xmm3, %xmm2
#else
# define ASHR_4_MERGE			\
	psrldq	$4, %xmm3;		\
	pslldq	$12, %xmm2;		\
	por	%xmm3, %xmm2
#endif
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
# define ASHR_4_COUNT16			\
	sub	$16, %r11;		\
	jbe	LABEL(strcmp_exitz)
#else
# define ASHR_4_COUNT16
#endif
#define ASHR_4_STEP			\
	movdqa	(%rsi, %rcx), %xmm1;	\
	movdqa	(%rdi, %rcx), %xmm2;	\
	movdqa	%xmm2, %xmm4;		\
	ASHR_4_MERGE;			\
	TOLOWER (%xmm1, %xmm2);		\
	pcmpeqb	%xmm1, %xmm0;		\
	pcmpeqb	%xmm2, %xmm1;		\
	psubb	%xmm0, %xmm1;		\
	pmovmskb %xmm1, %edx;		\
	sub	$0xffff, %edx;		\
	jne	LABEL(exit);		\
	ASHR_4_COUNT16;			\
	add	$16, %rcx;		\
	movdqa	%xmm4, %xmm3

	.p2align 4
LABEL(loop_ashr_4):
	add	$16, %r10
	jg	LABEL(nibble_ashr_4)	/* cross page boundary */

LABEL(gobble_ashr_4):
	ASHR_4_STEP

	add	$16, %r10
	jg	LABEL(nibble_ashr_4)	/* cross page boundary */

	ASHR_4_STEP
	jmp	LABEL(loop_ashr_4)

	/*
	 * Before loading past a page boundary, check the bytes of the
	 * carried block that have not been compared yet (mask 0xfff0)
	 * for a null char.
	 */
	.p2align 4
LABEL(nibble_ashr_4):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xfff0, %edx
	jne	LABEL(ashr_4_exittail)	/* found a null char */

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	$12, %r11
	jbe	LABEL(ashr_4_exittail)	/* count ends inside the carried block */
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* subtract 4K from %r10 */
	jmp	LABEL(gobble_ashr_4)

	/* Hand the %rsi block, the shifted null-char mask and the shifted
	   carried block to aftertail.  */
	.p2align 4
LABEL(ashr_4_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$4, %xmm0
	psrldq	$4, %xmm3
	jmp	LABEL(aftertail)

#undef ASHR_4_STEP
#undef ASHR_4_COUNT16
#undef ASHR_4_MERGE
813
/*
 * ashr_5: the %rsi offset is 11 bytes larger than the %rdi offset.
 * rcx(offset of rsi)  rax(offset of rdi)  relative offset  case
 *	n(11~15)	n - 11		4(15 +(n-11) - n)  ashr_5
 */
	.p2align 4
LABEL(ashr_5):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
	pslldq	$11, %xmm2		/* line the %rdi bytes up with the %rsi bytes */
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, %xmm2		/* compare 16 bytes for equality */
	psubb	%xmm0, %xmm2		/* top bit survives only for equal, non-null bytes */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx		/* adjust 0xffff for offset */
	shr	%cl, %r9d		/* adjust for 16-byte offset */
	sub	%r9d, %edx
	jne	LABEL(less32bytes)	/* mismatch or null char seen */
	movdqa	(%rdi), %xmm3		/* first %rdi block becomes the carried block */

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$5, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	5(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */

/* One 16-byte step of the loop below.  The last 11 bytes of the carried
   %rdi block in %xmm3 and the first 5 bytes of the block just loaded
   are merged into %xmm2 and compared with the %rsi block.  %xmm0 must
   be zero on entry and is still zero when the step falls through.  A
   mismatch or a null char leaves through `exit'; in the length-limited
   variants a count that is used up leaves through strcmp_exitz.
   Otherwise %rcx advances by 16 and the block just loaded becomes the
   carried block.  */
#ifdef USE_SSSE3
# define ASHR_5_MERGE			\
	palignr	$5, %xmm3, %xmm2
#else
# define ASHR_5_MERGE			\
	psrldq	$5, %xmm3;		\
	pslldq	$11, %xmm2;		\
	por	%xmm3, %xmm2
#endif
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
# define ASHR_5_COUNT16			\
	sub	$16, %r11;		\
	jbe	LABEL(strcmp_exitz)
#else
# define ASHR_5_COUNT16
#endif
#define ASHR_5_STEP			\
	movdqa	(%rsi, %rcx), %xmm1;	\
	movdqa	(%rdi, %rcx), %xmm2;	\
	movdqa	%xmm2, %xmm4;		\
	ASHR_5_MERGE;			\
	TOLOWER (%xmm1, %xmm2);		\
	pcmpeqb	%xmm1, %xmm0;		\
	pcmpeqb	%xmm2, %xmm1;		\
	psubb	%xmm0, %xmm1;		\
	pmovmskb %xmm1, %edx;		\
	sub	$0xffff, %edx;		\
	jne	LABEL(exit);		\
	ASHR_5_COUNT16;			\
	add	$16, %rcx;		\
	movdqa	%xmm4, %xmm3

	.p2align 4
LABEL(loop_ashr_5):
	add	$16, %r10
	jg	LABEL(nibble_ashr_5)	/* cross page boundary */

LABEL(gobble_ashr_5):
	ASHR_5_STEP

	add	$16, %r10
	jg	LABEL(nibble_ashr_5)	/* cross page boundary */

	ASHR_5_STEP
	jmp	LABEL(loop_ashr_5)

	/*
	 * Before loading past a page boundary, check the bytes of the
	 * carried block that have not been compared yet (mask 0xffe0)
	 * for a null char.
	 */
	.p2align 4
LABEL(nibble_ashr_5):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xffe0, %edx
	jne	LABEL(ashr_5_exittail)	/* found a null char */

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	$11, %r11
	jbe	LABEL(ashr_5_exittail)	/* count ends inside the carried block */
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* subtract 4K from %r10 */
	jmp	LABEL(gobble_ashr_5)

	/* Hand the %rsi block, the shifted null-char mask and the shifted
	   carried block to aftertail.  */
	.p2align 4
LABEL(ashr_5_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$5, %xmm0
	psrldq	$5, %xmm3
	jmp	LABEL(aftertail)

#undef ASHR_5_STEP
#undef ASHR_5_COUNT16
#undef ASHR_5_MERGE
938
/*
 * ashr_6: the %rsi offset is 10 bytes larger than the %rdi offset.
 * rcx(offset of rsi)  rax(offset of rdi)  relative offset  case
 *	n(10~15)	n - 10		5(15 +(n-10) - n)  ashr_6
 */
	.p2align 4
LABEL(ashr_6):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
	pslldq	$10, %xmm2		/* line the %rdi bytes up with the %rsi bytes */
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, %xmm2		/* compare 16 bytes for equality */
	psubb	%xmm0, %xmm2		/* top bit survives only for equal, non-null bytes */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx		/* adjust 0xffff for offset */
	shr	%cl, %r9d		/* adjust for 16-byte offset */
	sub	%r9d, %edx
	jne	LABEL(less32bytes)	/* mismatch or null char seen */
	movdqa	(%rdi), %xmm3		/* first %rdi block becomes the carried block */

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$6, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	6(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */

/* One 16-byte step of the loop below.  The last 10 bytes of the carried
   %rdi block in %xmm3 and the first 6 bytes of the block just loaded
   are merged into %xmm2 and compared with the %rsi block.  %xmm0 must
   be zero on entry and is still zero when the step falls through.  A
   mismatch or a null char leaves through `exit'; in the length-limited
   variants a count that is used up leaves through strcmp_exitz.
   Otherwise %rcx advances by 16 and the block just loaded becomes the
   carried block.  */
#ifdef USE_SSSE3
# define ASHR_6_MERGE			\
	palignr	$6, %xmm3, %xmm2
#else
# define ASHR_6_MERGE			\
	psrldq	$6, %xmm3;		\
	pslldq	$10, %xmm2;		\
	por	%xmm3, %xmm2
#endif
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
# define ASHR_6_COUNT16			\
	sub	$16, %r11;		\
	jbe	LABEL(strcmp_exitz)
#else
# define ASHR_6_COUNT16
#endif
#define ASHR_6_STEP			\
	movdqa	(%rsi, %rcx), %xmm1;	\
	movdqa	(%rdi, %rcx), %xmm2;	\
	movdqa	%xmm2, %xmm4;		\
	ASHR_6_MERGE;			\
	TOLOWER (%xmm1, %xmm2);		\
	pcmpeqb	%xmm1, %xmm0;		\
	pcmpeqb	%xmm2, %xmm1;		\
	psubb	%xmm0, %xmm1;		\
	pmovmskb %xmm1, %edx;		\
	sub	$0xffff, %edx;		\
	jne	LABEL(exit);		\
	ASHR_6_COUNT16;			\
	add	$16, %rcx;		\
	movdqa	%xmm4, %xmm3

	.p2align 4
LABEL(loop_ashr_6):
	add	$16, %r10
	jg	LABEL(nibble_ashr_6)	/* cross page boundary */

LABEL(gobble_ashr_6):
	ASHR_6_STEP

	add	$16, %r10
	jg	LABEL(nibble_ashr_6)	/* cross page boundary */

	ASHR_6_STEP
	jmp	LABEL(loop_ashr_6)

	/*
	 * Before loading past a page boundary, check the bytes of the
	 * carried block that have not been compared yet (mask 0xffc0)
	 * for a null char.
	 */
	.p2align 4
LABEL(nibble_ashr_6):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xffc0, %edx
	jne	LABEL(ashr_6_exittail)	/* found a null char */

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	$10, %r11
	jbe	LABEL(ashr_6_exittail)	/* count ends inside the carried block */
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* subtract 4K from %r10 */
	jmp	LABEL(gobble_ashr_6)

	/* Hand the %rsi block, the shifted null-char mask and the shifted
	   carried block to aftertail.  */
	.p2align 4
LABEL(ashr_6_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$6, %xmm0
	psrldq	$6, %xmm3
	jmp	LABEL(aftertail)

#undef ASHR_6_STEP
#undef ASHR_6_COUNT16
#undef ASHR_6_MERGE
1063
/*
 * ashr_7: the %rsi offset is 9 bytes larger than the %rdi offset.
 * rcx(offset of rsi)  rax(offset of rdi)  relative offset  case
 *	n(9~15)		n - 9		6(15 +(n - 9) - n)  ashr_7
 */
	.p2align 4
LABEL(ashr_7):
	pxor	%xmm0, %xmm0
	movdqa	(%rdi), %xmm2
	movdqa	(%rsi), %xmm1
	pcmpeqb	%xmm1, %xmm0		/* Any null chars? */
	pslldq	$9, %xmm2		/* line the %rdi bytes up with the %rsi bytes */
	TOLOWER (%xmm1, %xmm2)
	pcmpeqb	%xmm1, %xmm2		/* compare 16 bytes for equality */
	psubb	%xmm0, %xmm2		/* top bit survives only for equal, non-null bytes */
	pmovmskb %xmm2, %r9d
	shr	%cl, %edx		/* adjust 0xffff for offset */
	shr	%cl, %r9d		/* adjust for 16-byte offset */
	sub	%r9d, %edx
	jne	LABEL(less32bytes)	/* mismatch or null char seen */
	movdqa	(%rdi), %xmm3		/* first %rdi block becomes the carried block */

	UPDATE_STRNCMP_COUNTER

	pxor	%xmm0, %xmm0
	mov	$16, %rcx		/* index for loads */
	mov	$7, %r9d		/* byte position left over from less32bytes case */
	/*
	 * Setup %r10 value allows us to detect crossing a page boundary.
	 * When %r10 goes positive we have crossed a page boundary and
	 * need to do a nibble.
	 */
	lea	7(%rdi), %r10
	and	$0xfff, %r10		/* offset into 4K page */
	sub	$0x1000, %r10		/* subtract 4K pagesize */

/* One 16-byte step of the loop below.  The last 9 bytes of the carried
   %rdi block in %xmm3 and the first 7 bytes of the block just loaded
   are merged into %xmm2 and compared with the %rsi block.  %xmm0 must
   be zero on entry and is still zero when the step falls through.  A
   mismatch or a null char leaves through `exit'; in the length-limited
   variants a count that is used up leaves through strcmp_exitz.
   Otherwise %rcx advances by 16 and the block just loaded becomes the
   carried block.  */
#ifdef USE_SSSE3
# define ASHR_7_MERGE			\
	palignr	$7, %xmm3, %xmm2
#else
# define ASHR_7_MERGE			\
	psrldq	$7, %xmm3;		\
	pslldq	$9, %xmm2;		\
	por	%xmm3, %xmm2
#endif
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
# define ASHR_7_COUNT16			\
	sub	$16, %r11;		\
	jbe	LABEL(strcmp_exitz)
#else
# define ASHR_7_COUNT16
#endif
#define ASHR_7_STEP			\
	movdqa	(%rsi, %rcx), %xmm1;	\
	movdqa	(%rdi, %rcx), %xmm2;	\
	movdqa	%xmm2, %xmm4;		\
	ASHR_7_MERGE;			\
	TOLOWER (%xmm1, %xmm2);		\
	pcmpeqb	%xmm1, %xmm0;		\
	pcmpeqb	%xmm2, %xmm1;		\
	psubb	%xmm0, %xmm1;		\
	pmovmskb %xmm1, %edx;		\
	sub	$0xffff, %edx;		\
	jne	LABEL(exit);		\
	ASHR_7_COUNT16;			\
	add	$16, %rcx;		\
	movdqa	%xmm4, %xmm3

	.p2align 4
LABEL(loop_ashr_7):
	add	$16, %r10
	jg	LABEL(nibble_ashr_7)	/* cross page boundary */

LABEL(gobble_ashr_7):
	ASHR_7_STEP

	add	$16, %r10
	jg	LABEL(nibble_ashr_7)	/* cross page boundary */

	ASHR_7_STEP
	jmp	LABEL(loop_ashr_7)

	/*
	 * Before loading past a page boundary, check the bytes of the
	 * carried block that have not been compared yet (mask 0xff80)
	 * for a null char.
	 */
	.p2align 4
LABEL(nibble_ashr_7):
	pcmpeqb	%xmm3, %xmm0		/* check nibble for null char */
	pmovmskb %xmm0, %edx
	test	$0xff80, %edx
	jne	LABEL(ashr_7_exittail)	/* found a null char */

#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
	cmp	$9, %r11
	jbe	LABEL(ashr_7_exittail)	/* count ends inside the carried block */
#endif

	pxor	%xmm0, %xmm0
	sub	$0x1000, %r10		/* subtract 4K from %r10 */
	jmp	LABEL(gobble_ashr_7)

	/* Hand the %rsi block, the shifted null-char mask and the shifted
	   carried block to aftertail.  */
	.p2align 4
LABEL(ashr_7_exittail):
	movdqa	(%rsi, %rcx), %xmm1
	psrldq	$7, %xmm0
	psrldq	$7, %xmm3
	jmp	LABEL(aftertail)

#undef ASHR_7_STEP
#undef ASHR_7_COUNT16
#undef ASHR_7_MERGE
1188
1189/*
1190 * The following cases will be handled by ashr_8
1191 * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
1192 * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8
1193 */
/*
 * ashr_8 path.  Every 16-byte block read through %rsi is compared with 16
 * bytes built from two consecutive %rdi blocks: the upper 8 bytes of the
 * previous block followed by the lower 8 bytes of the current one.  All
 * loads use movdqa, so both addresses must be 16-byte aligned here.  %edx
 * and %cl are consumed below but are set before this label, outside the
 * code shown in this section.
 */
1194 .p2align 4
1195LABEL(ashr_8):
 /* First block: %xmm0 becomes the NUL mask of the %rsi block. */
1196 pxor %xmm0, %xmm0
1197 movdqa (%rdi), %xmm2
1198 movdqa (%rsi), %xmm1
1199 pcmpeqb %xmm1, %xmm0
 /* Move the low 8 bytes of the %rdi block into the top 8 lanes. */
1200 pslldq $8, %xmm2
1201 TOLOWER (%xmm1, %xmm2)
 /* A lane keeps its sign bit only if the bytes match and the %rsi byte is
    not NUL. */
1202 pcmpeqb %xmm1, %xmm2
1203 psubb %xmm0, %xmm2
1204 pmovmskb %xmm2, %r9d
 /* Shift both masks right by %cl; any difference is resolved in
    less32bytes. */
1205 shr %cl, %edx
1206 shr %cl, %r9d
1207 sub %r9d, %edx
1208 jnz LABEL(less32bytes)
 /* The first %rdi block becomes the previous block for the loop. */
1209 movdqa (%rdi), %xmm3
1210
 /* Length-limited builds: %r11 = %r11 + %rcx - 16, returning 0 through
    strcmp_exitz when that is zero or exceeds the old count.  The macro is
    empty in the other builds. */
1211 UPDATE_STRNCMP_COUNTER
1212
1213 pxor %xmm0, %xmm0
1214 mov $16, %rcx /* index for loads */
1215 mov $8, %r9d /* byte position left over from less32bytes case */
1216 /*
1217 * Setup %r10 value allows us to detect crossing a page boundary.
1218 * When %r10 goes positive we have crossed a page boundary and
1219 * need to do a nibble.
1220 */
1221 lea 8(%rdi), %r10
1222 and $0xfff, %r10 /* offset into 4K page */
1223 sub $0x1000, %r10 /* subtract 4K pagesize */
1224
 /* The loop is unrolled twice; the page check precedes each half. */
1225 .p2align 4
1226LABEL(loop_ashr_8):
1227 add $16, %r10
1228 jg LABEL(nibble_ashr_8)
1229
1230LABEL(gobble_ashr_8):
1231 movdqa (%rsi, %rcx), %xmm1
1232 movdqa (%rdi, %rcx), %xmm2
 /* Keep an unshifted copy of the %rdi block for the next merge. */
1233 movdqa %xmm2, %xmm4
1234
1235#ifndef USE_SSSE3
1236 psrldq $8, %xmm3
1237 pslldq $8, %xmm2
1238 por %xmm3, %xmm2 /* merge into one 16byte value */
1239#else
1240 palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */
1241#endif
1242 TOLOWER (%xmm1, %xmm2)
1243
 /* %edx = lanes that match and are not NUL in the %rsi block; the
    subtraction leaves zero only when all 16 lanes pass. */
1244 pcmpeqb %xmm1, %xmm0
1245 pcmpeqb %xmm2, %xmm1
1246 psubb %xmm0, %xmm1
1247 pmovmskb %xmm1, %edx
1248 sub $0xffff, %edx
1249 jnz LABEL(exit)
1250
1251#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* 16 more bytes done; return 0 once the count in %r11 is used up. */
1252 sub $16, %r11
1253 jbe LABEL(strcmp_exitz)
1254#endif
1255
1256 add $16, %rcx
1257 movdqa %xmm4, %xmm3
1258
 /* Second half of the unrolled iteration: same steps as above. */
1259 add $16, %r10
1260 jg LABEL(nibble_ashr_8) /* cross page boundary */
1261
1262 movdqa (%rsi, %rcx), %xmm1
1263 movdqa (%rdi, %rcx), %xmm2
1264 movdqa %xmm2, %xmm4
1265
1266#ifndef USE_SSSE3
1267 psrldq $8, %xmm3
1268 pslldq $8, %xmm2
1269 por %xmm3, %xmm2 /* merge into one 16byte value */
1270#else
1271 palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */
1272#endif
1273 TOLOWER (%xmm1, %xmm2)
1274
1275 pcmpeqb %xmm1, %xmm0
1276 pcmpeqb %xmm2, %xmm1
1277 psubb %xmm0, %xmm1
1278 pmovmskb %xmm1, %edx
1279 sub $0xffff, %edx
1280 jnz LABEL(exit)
1281
1282#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1283 sub $16, %r11
1284 jbe LABEL(strcmp_exitz)
1285#endif
1286
1287 add $16, %rcx
1288 movdqa %xmm4, %xmm3
1289 jmp LABEL(loop_ashr_8)
1290
 /* Page-boundary check: decide whether one more %rdi block may be read. */
1291 .p2align 4
1292LABEL(nibble_ashr_8):
1293 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
1294 pmovmskb %xmm0, %edx
 /* 0xff00 selects bytes 8..15, the part of %xmm3 the next merge would use. */
1295 test $0xff00, %edx
1296 jnz LABEL(ashr_8_exittail)
1297
1298#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* Counter in %r11 is at most 8: finish in the tail as well. */
1299 cmp $8, %r11
1300 jbe LABEL(ashr_8_exittail)
1301#endif
1302
 /* Otherwise clear the mask, move %r10 back by one page and resume. */
1303 pxor %xmm0, %xmm0
1304 sub $0x1000, %r10
1305 jmp LABEL(gobble_ashr_8)
1306
1307 .p2align 4
1308LABEL(ashr_8_exittail):
 /* Tail: only the %rsi block is loaded.  The NUL mask and the previous
    %rdi block are shifted down by 8 bytes before aftertail does the last
    compare. */
1309 movdqa (%rsi, %rcx), %xmm1
1310 psrldq $8, %xmm0
1311 psrldq $8, %xmm3
1312 jmp LABEL(aftertail)
1313
1314/*
1315 * The following cases will be handled by ashr_9:
1316 *   rcx (offset of rsi) | rax (offset of rdi) | relative offset      | corresponding case
1317 *   n (7~15)            | n - 7               | 8 (15 + (n - 7) - n) | ashr_9
1318 */
/*
 * ashr_9 path.  Every 16-byte block read through %rsi is compared with 16
 * bytes built from two consecutive %rdi blocks: the upper 7 bytes of the
 * previous block followed by the lower 9 bytes of the current one.  All
 * loads use movdqa, so both addresses must be 16-byte aligned here.  %edx
 * and %cl are consumed below but are set before this label, outside the
 * code shown in this section.
 */
1319 .p2align 4
1320LABEL(ashr_9):
 /* First block: %xmm0 becomes the NUL mask of the %rsi block. */
1321 pxor %xmm0, %xmm0
1322 movdqa (%rdi), %xmm2
1323 movdqa (%rsi), %xmm1
1324 pcmpeqb %xmm1, %xmm0
 /* Move the low 9 bytes of the %rdi block into the top 9 lanes. */
1325 pslldq $7, %xmm2
1326 TOLOWER (%xmm1, %xmm2)
 /* A lane keeps its sign bit only if the bytes match and the %rsi byte is
    not NUL. */
1327 pcmpeqb %xmm1, %xmm2
1328 psubb %xmm0, %xmm2
1329 pmovmskb %xmm2, %r9d
 /* Shift both masks right by %cl; any difference is resolved in
    less32bytes. */
1330 shr %cl, %edx
1331 shr %cl, %r9d
1332 sub %r9d, %edx
1333 jnz LABEL(less32bytes)
 /* The first %rdi block becomes the previous block for the loop. */
1334 movdqa (%rdi), %xmm3
1335
 /* Length-limited builds: %r11 = %r11 + %rcx - 16, returning 0 through
    strcmp_exitz when that is zero or exceeds the old count.  The macro is
    empty in the other builds. */
1336 UPDATE_STRNCMP_COUNTER
1337
1338 pxor %xmm0, %xmm0
1339 mov $16, %rcx /* index for loads */
1340 mov $9, %r9d /* byte position left over from less32bytes case */
1341 /*
1342 * Setup %r10 value allows us to detect crossing a page boundary.
1343 * When %r10 goes positive we have crossed a page boundary and
1344 * need to do a nibble.
1345 */
1346 lea 9(%rdi), %r10
1347 and $0xfff, %r10 /* offset into 4K page */
1348 sub $0x1000, %r10 /* subtract 4K pagesize */
1349
 /* The loop is unrolled twice; the page check precedes each half. */
1350 .p2align 4
1351LABEL(loop_ashr_9):
1352 add $16, %r10
1353 jg LABEL(nibble_ashr_9)
1354
1355LABEL(gobble_ashr_9):
1356 movdqa (%rsi, %rcx), %xmm1
1357 movdqa (%rdi, %rcx), %xmm2
 /* Keep an unshifted copy of the %rdi block for the next merge. */
1358 movdqa %xmm2, %xmm4
1359
1360#ifndef USE_SSSE3
1361 psrldq $9, %xmm3
1362 pslldq $7, %xmm2
1363 por %xmm3, %xmm2 /* merge into one 16byte value */
1364#else
1365 palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */
1366#endif
1367 TOLOWER (%xmm1, %xmm2)
1368
 /* %edx = lanes that match and are not NUL in the %rsi block; the
    subtraction leaves zero only when all 16 lanes pass. */
1369 pcmpeqb %xmm1, %xmm0
1370 pcmpeqb %xmm2, %xmm1
1371 psubb %xmm0, %xmm1
1372 pmovmskb %xmm1, %edx
1373 sub $0xffff, %edx
1374 jnz LABEL(exit)
1375
1376#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* 16 more bytes done; return 0 once the count in %r11 is used up. */
1377 sub $16, %r11
1378 jbe LABEL(strcmp_exitz)
1379#endif
1380
1381 add $16, %rcx
1382 movdqa %xmm4, %xmm3
1383
 /* Second half of the unrolled iteration: same steps as above. */
1384 add $16, %r10
1385 jg LABEL(nibble_ashr_9) /* cross page boundary */
1386
1387 movdqa (%rsi, %rcx), %xmm1
1388 movdqa (%rdi, %rcx), %xmm2
1389 movdqa %xmm2, %xmm4
1390
1391#ifndef USE_SSSE3
1392 psrldq $9, %xmm3
1393 pslldq $7, %xmm2
1394 por %xmm3, %xmm2 /* merge into one 16byte value */
1395#else
1396 palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */
1397#endif
1398 TOLOWER (%xmm1, %xmm2)
1399
1400 pcmpeqb %xmm1, %xmm0
1401 pcmpeqb %xmm2, %xmm1
1402 psubb %xmm0, %xmm1
1403 pmovmskb %xmm1, %edx
1404 sub $0xffff, %edx
1405 jnz LABEL(exit)
1406
1407#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1408 sub $16, %r11
1409 jbe LABEL(strcmp_exitz)
1410#endif
1411
1412 add $16, %rcx
1413 movdqa %xmm4, %xmm3 /* store for next cycle */
1414 jmp LABEL(loop_ashr_9)
1415
 /* Page-boundary check: decide whether one more %rdi block may be read. */
1416 .p2align 4
1417LABEL(nibble_ashr_9):
1418 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
1419 pmovmskb %xmm0, %edx
 /* 0xfe00 selects bytes 9..15, the part of %xmm3 the next merge would use. */
1420 test $0xfe00, %edx
1421 jnz LABEL(ashr_9_exittail)
1422
1423#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* Counter in %r11 is at most 7: finish in the tail as well. */
1424 cmp $7, %r11
1425 jbe LABEL(ashr_9_exittail)
1426#endif
1427
 /* Otherwise clear the mask, move %r10 back by one page and resume. */
1428 pxor %xmm0, %xmm0
1429 sub $0x1000, %r10
1430 jmp LABEL(gobble_ashr_9)
1431
1432 .p2align 4
1433LABEL(ashr_9_exittail):
 /* Tail: only the %rsi block is loaded.  The NUL mask and the previous
    %rdi block are shifted down by 9 bytes before aftertail does the last
    compare. */
1434 movdqa (%rsi, %rcx), %xmm1
1435 psrldq $9, %xmm0
1436 psrldq $9, %xmm3
1437 jmp LABEL(aftertail)
1438
1439/*
1440 * The following cases will be handled by ashr_10:
1441 *   rcx (offset of rsi) | rax (offset of rdi) | relative offset      | corresponding case
1442 *   n (6~15)            | n - 6               | 9 (15 + (n - 6) - n) | ashr_10
1443 */
/*
 * ashr_10 path.  Every 16-byte block read through %rsi is compared with 16
 * bytes built from two consecutive %rdi blocks: the upper 6 bytes of the
 * previous block followed by the lower 10 bytes of the current one.  All
 * loads use movdqa, so both addresses must be 16-byte aligned here.  %edx
 * and %cl are consumed below but are set before this label, outside the
 * code shown in this section.
 */
1444 .p2align 4
1445LABEL(ashr_10):
 /* First block: %xmm0 becomes the NUL mask of the %rsi block. */
1446 pxor %xmm0, %xmm0
1447 movdqa (%rdi), %xmm2
1448 movdqa (%rsi), %xmm1
1449 pcmpeqb %xmm1, %xmm0
 /* Move the low 10 bytes of the %rdi block into the top 10 lanes. */
1450 pslldq $6, %xmm2
1451 TOLOWER (%xmm1, %xmm2)
 /* A lane keeps its sign bit only if the bytes match and the %rsi byte is
    not NUL. */
1452 pcmpeqb %xmm1, %xmm2
1453 psubb %xmm0, %xmm2
1454 pmovmskb %xmm2, %r9d
 /* Shift both masks right by %cl; any difference is resolved in
    less32bytes. */
1455 shr %cl, %edx
1456 shr %cl, %r9d
1457 sub %r9d, %edx
1458 jnz LABEL(less32bytes)
 /* The first %rdi block becomes the previous block for the loop. */
1459 movdqa (%rdi), %xmm3
1460
 /* Length-limited builds: %r11 = %r11 + %rcx - 16, returning 0 through
    strcmp_exitz when that is zero or exceeds the old count.  The macro is
    empty in the other builds. */
1461 UPDATE_STRNCMP_COUNTER
1462
1463 pxor %xmm0, %xmm0
1464 mov $16, %rcx /* index for loads */
1465 mov $10, %r9d /* byte position left over from less32bytes case */
1466 /*
1467 * Setup %r10 value allows us to detect crossing a page boundary.
1468 * When %r10 goes positive we have crossed a page boundary and
1469 * need to do a nibble.
1470 */
1471 lea 10(%rdi), %r10
1472 and $0xfff, %r10 /* offset into 4K page */
1473 sub $0x1000, %r10 /* subtract 4K pagesize */
1474
 /* The loop is unrolled twice; the page check precedes each half. */
1475 .p2align 4
1476LABEL(loop_ashr_10):
1477 add $16, %r10
1478 jg LABEL(nibble_ashr_10)
1479
1480LABEL(gobble_ashr_10):
1481 movdqa (%rsi, %rcx), %xmm1
1482 movdqa (%rdi, %rcx), %xmm2
 /* Keep an unshifted copy of the %rdi block for the next merge. */
1483 movdqa %xmm2, %xmm4
1484
1485#ifndef USE_SSSE3
1486 psrldq $10, %xmm3
1487 pslldq $6, %xmm2
1488 por %xmm3, %xmm2 /* merge into one 16byte value */
1489#else
1490 palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */
1491#endif
1492 TOLOWER (%xmm1, %xmm2)
1493
 /* %edx = lanes that match and are not NUL in the %rsi block; the
    subtraction leaves zero only when all 16 lanes pass. */
1494 pcmpeqb %xmm1, %xmm0
1495 pcmpeqb %xmm2, %xmm1
1496 psubb %xmm0, %xmm1
1497 pmovmskb %xmm1, %edx
1498 sub $0xffff, %edx
1499 jnz LABEL(exit)
1500
1501#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* 16 more bytes done; return 0 once the count in %r11 is used up. */
1502 sub $16, %r11
1503 jbe LABEL(strcmp_exitz)
1504#endif
1505
1506 add $16, %rcx
1507 movdqa %xmm4, %xmm3
1508
 /* Second half of the unrolled iteration: same steps as above. */
1509 add $16, %r10
1510 jg LABEL(nibble_ashr_10) /* cross page boundary */
1511
1512 movdqa (%rsi, %rcx), %xmm1
1513 movdqa (%rdi, %rcx), %xmm2
1514 movdqa %xmm2, %xmm4
1515
1516#ifndef USE_SSSE3
1517 psrldq $10, %xmm3
1518 pslldq $6, %xmm2
1519 por %xmm3, %xmm2 /* merge into one 16byte value */
1520#else
1521 palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */
1522#endif
1523 TOLOWER (%xmm1, %xmm2)
1524
1525 pcmpeqb %xmm1, %xmm0
1526 pcmpeqb %xmm2, %xmm1
1527 psubb %xmm0, %xmm1
1528 pmovmskb %xmm1, %edx
1529 sub $0xffff, %edx
1530 jnz LABEL(exit)
1531
1532#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1533 sub $16, %r11
1534 jbe LABEL(strcmp_exitz)
1535#endif
1536
1537 add $16, %rcx
1538 movdqa %xmm4, %xmm3
1539 jmp LABEL(loop_ashr_10)
1540
 /* Page-boundary check: decide whether one more %rdi block may be read. */
1541 .p2align 4
1542LABEL(nibble_ashr_10):
1543 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
1544 pmovmskb %xmm0, %edx
 /* 0xfc00 selects bytes 10..15, the part of %xmm3 the next merge would use. */
1545 test $0xfc00, %edx
1546 jnz LABEL(ashr_10_exittail)
1547
1548#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* Counter in %r11 is at most 6: finish in the tail as well. */
1549 cmp $6, %r11
1550 jbe LABEL(ashr_10_exittail)
1551#endif
1552
 /* Otherwise clear the mask, move %r10 back by one page and resume. */
1553 pxor %xmm0, %xmm0
1554 sub $0x1000, %r10
1555 jmp LABEL(gobble_ashr_10)
1556
1557 .p2align 4
1558LABEL(ashr_10_exittail):
 /* Tail: only the %rsi block is loaded.  The NUL mask and the previous
    %rdi block are shifted down by 10 bytes before aftertail does the last
    compare. */
1559 movdqa (%rsi, %rcx), %xmm1
1560 psrldq $10, %xmm0
1561 psrldq $10, %xmm3
1562 jmp LABEL(aftertail)
1563
1564/*
1565 * The following cases will be handled by ashr_11:
1566 *   rcx (offset of rsi) | rax (offset of rdi) | relative offset       | corresponding case
1567 *   n (5~15)            | n - 5               | 10 (15 + (n - 5) - n) | ashr_11
1568 */
/*
 * ashr_11 path.  Every 16-byte block read through %rsi is compared with 16
 * bytes built from two consecutive %rdi blocks: the upper 5 bytes of the
 * previous block followed by the lower 11 bytes of the current one.  All
 * loads use movdqa, so both addresses must be 16-byte aligned here.  %edx
 * and %cl are consumed below but are set before this label, outside the
 * code shown in this section.
 */
1569 .p2align 4
1570LABEL(ashr_11):
 /* First block: %xmm0 becomes the NUL mask of the %rsi block. */
1571 pxor %xmm0, %xmm0
1572 movdqa (%rdi), %xmm2
1573 movdqa (%rsi), %xmm1
1574 pcmpeqb %xmm1, %xmm0
 /* Move the low 11 bytes of the %rdi block into the top 11 lanes. */
1575 pslldq $5, %xmm2
1576 TOLOWER (%xmm1, %xmm2)
 /* A lane keeps its sign bit only if the bytes match and the %rsi byte is
    not NUL. */
1577 pcmpeqb %xmm1, %xmm2
1578 psubb %xmm0, %xmm2
1579 pmovmskb %xmm2, %r9d
 /* Shift both masks right by %cl; any difference is resolved in
    less32bytes. */
1580 shr %cl, %edx
1581 shr %cl, %r9d
1582 sub %r9d, %edx
1583 jnz LABEL(less32bytes)
 /* The first %rdi block becomes the previous block for the loop. */
1584 movdqa (%rdi), %xmm3
1585
 /* Length-limited builds: %r11 = %r11 + %rcx - 16, returning 0 through
    strcmp_exitz when that is zero or exceeds the old count.  The macro is
    empty in the other builds. */
1586 UPDATE_STRNCMP_COUNTER
1587
1588 pxor %xmm0, %xmm0
1589 mov $16, %rcx /* index for loads */
1590 mov $11, %r9d /* byte position left over from less32bytes case */
1591 /*
1592 * Setup %r10 value allows us to detect crossing a page boundary.
1593 * When %r10 goes positive we have crossed a page boundary and
1594 * need to do a nibble.
1595 */
1596 lea 11(%rdi), %r10
1597 and $0xfff, %r10 /* offset into 4K page */
1598 sub $0x1000, %r10 /* subtract 4K pagesize */
1599
 /* The loop is unrolled twice; the page check precedes each half. */
1600 .p2align 4
1601LABEL(loop_ashr_11):
1602 add $16, %r10
1603 jg LABEL(nibble_ashr_11)
1604
1605LABEL(gobble_ashr_11):
1606 movdqa (%rsi, %rcx), %xmm1
1607 movdqa (%rdi, %rcx), %xmm2
 /* Keep an unshifted copy of the %rdi block for the next merge. */
1608 movdqa %xmm2, %xmm4
1609
1610#ifndef USE_SSSE3
1611 psrldq $11, %xmm3
1612 pslldq $5, %xmm2
1613 por %xmm3, %xmm2 /* merge into one 16byte value */
1614#else
1615 palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */
1616#endif
1617 TOLOWER (%xmm1, %xmm2)
1618
 /* %edx = lanes that match and are not NUL in the %rsi block; the
    subtraction leaves zero only when all 16 lanes pass. */
1619 pcmpeqb %xmm1, %xmm0
1620 pcmpeqb %xmm2, %xmm1
1621 psubb %xmm0, %xmm1
1622 pmovmskb %xmm1, %edx
1623 sub $0xffff, %edx
1624 jnz LABEL(exit)
1625
1626#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* 16 more bytes done; return 0 once the count in %r11 is used up. */
1627 sub $16, %r11
1628 jbe LABEL(strcmp_exitz)
1629#endif
1630
1631 add $16, %rcx
1632 movdqa %xmm4, %xmm3
1633
 /* Second half of the unrolled iteration: same steps as above. */
1634 add $16, %r10
1635 jg LABEL(nibble_ashr_11) /* cross page boundary */
1636
1637 movdqa (%rsi, %rcx), %xmm1
1638 movdqa (%rdi, %rcx), %xmm2
1639 movdqa %xmm2, %xmm4
1640
1641#ifndef USE_SSSE3
1642 psrldq $11, %xmm3
1643 pslldq $5, %xmm2
1644 por %xmm3, %xmm2 /* merge into one 16byte value */
1645#else
1646 palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */
1647#endif
1648 TOLOWER (%xmm1, %xmm2)
1649
1650 pcmpeqb %xmm1, %xmm0
1651 pcmpeqb %xmm2, %xmm1
1652 psubb %xmm0, %xmm1
1653 pmovmskb %xmm1, %edx
1654 sub $0xffff, %edx
1655 jnz LABEL(exit)
1656
1657#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1658 sub $16, %r11
1659 jbe LABEL(strcmp_exitz)
1660#endif
1661
1662 add $16, %rcx
1663 movdqa %xmm4, %xmm3
1664 jmp LABEL(loop_ashr_11)
1665
 /* Page-boundary check: decide whether one more %rdi block may be read. */
1666 .p2align 4
1667LABEL(nibble_ashr_11):
1668 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
1669 pmovmskb %xmm0, %edx
 /* 0xf800 selects bytes 11..15, the part of %xmm3 the next merge would use. */
1670 test $0xf800, %edx
1671 jnz LABEL(ashr_11_exittail)
1672
1673#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* Counter in %r11 is at most 5: finish in the tail as well. */
1674 cmp $5, %r11
1675 jbe LABEL(ashr_11_exittail)
1676#endif
1677
 /* Otherwise clear the mask, move %r10 back by one page and resume. */
1678 pxor %xmm0, %xmm0
1679 sub $0x1000, %r10
1680 jmp LABEL(gobble_ashr_11)
1681
1682 .p2align 4
1683LABEL(ashr_11_exittail):
 /* Tail: only the %rsi block is loaded.  The NUL mask and the previous
    %rdi block are shifted down by 11 bytes before aftertail does the last
    compare. */
1684 movdqa (%rsi, %rcx), %xmm1
1685 psrldq $11, %xmm0
1686 psrldq $11, %xmm3
1687 jmp LABEL(aftertail)
1688
1689/*
1690 * The following cases will be handled by ashr_12:
1691 *   rcx (offset of rsi) | rax (offset of rdi) | relative offset       | corresponding case
1692 *   n (4~15)            | n - 4               | 11 (15 + (n - 4) - n) | ashr_12
1693 */
/*
 * ashr_12 path.  Every 16-byte block read through %rsi is compared with 16
 * bytes built from two consecutive %rdi blocks: the upper 4 bytes of the
 * previous block followed by the lower 12 bytes of the current one.  All
 * loads use movdqa, so both addresses must be 16-byte aligned here.  %edx
 * and %cl are consumed below but are set before this label, outside the
 * code shown in this section.
 */
1694 .p2align 4
1695LABEL(ashr_12):
 /* First block: %xmm0 becomes the NUL mask of the %rsi block. */
1696 pxor %xmm0, %xmm0
1697 movdqa (%rdi), %xmm2
1698 movdqa (%rsi), %xmm1
1699 pcmpeqb %xmm1, %xmm0
 /* Move the low 12 bytes of the %rdi block into the top 12 lanes. */
1700 pslldq $4, %xmm2
1701 TOLOWER (%xmm1, %xmm2)
 /* A lane keeps its sign bit only if the bytes match and the %rsi byte is
    not NUL. */
1702 pcmpeqb %xmm1, %xmm2
1703 psubb %xmm0, %xmm2
1704 pmovmskb %xmm2, %r9d
 /* Shift both masks right by %cl; any difference is resolved in
    less32bytes. */
1705 shr %cl, %edx
1706 shr %cl, %r9d
1707 sub %r9d, %edx
1708 jnz LABEL(less32bytes)
 /* The first %rdi block becomes the previous block for the loop. */
1709 movdqa (%rdi), %xmm3
1710
 /* Length-limited builds: %r11 = %r11 + %rcx - 16, returning 0 through
    strcmp_exitz when that is zero or exceeds the old count.  The macro is
    empty in the other builds. */
1711 UPDATE_STRNCMP_COUNTER
1712
1713 pxor %xmm0, %xmm0
1714 mov $16, %rcx /* index for loads */
1715 mov $12, %r9d /* byte position left over from less32bytes case */
1716 /*
1717 * Setup %r10 value allows us to detect crossing a page boundary.
1718 * When %r10 goes positive we have crossed a page boundary and
1719 * need to do a nibble.
1720 */
1721 lea 12(%rdi), %r10
1722 and $0xfff, %r10 /* offset into 4K page */
1723 sub $0x1000, %r10 /* subtract 4K pagesize */
1724
 /* The loop is unrolled twice; the page check precedes each half. */
1725 .p2align 4
1726LABEL(loop_ashr_12):
1727 add $16, %r10
1728 jg LABEL(nibble_ashr_12)
1729
1730LABEL(gobble_ashr_12):
1731 movdqa (%rsi, %rcx), %xmm1
1732 movdqa (%rdi, %rcx), %xmm2
 /* Keep an unshifted copy of the %rdi block for the next merge. */
1733 movdqa %xmm2, %xmm4
1734
1735#ifndef USE_SSSE3
1736 psrldq $12, %xmm3
1737 pslldq $4, %xmm2
1738 por %xmm3, %xmm2 /* merge into one 16byte value */
1739#else
1740 palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */
1741#endif
1742 TOLOWER (%xmm1, %xmm2)
1743
 /* %edx = lanes that match and are not NUL in the %rsi block; the
    subtraction leaves zero only when all 16 lanes pass. */
1744 pcmpeqb %xmm1, %xmm0
1745 pcmpeqb %xmm2, %xmm1
1746 psubb %xmm0, %xmm1
1747 pmovmskb %xmm1, %edx
1748 sub $0xffff, %edx
1749 jnz LABEL(exit)
1750
1751#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* 16 more bytes done; return 0 once the count in %r11 is used up. */
1752 sub $16, %r11
1753 jbe LABEL(strcmp_exitz)
1754#endif
1755
1756 add $16, %rcx
1757 movdqa %xmm4, %xmm3
1758
 /* Second half of the unrolled iteration: same steps as above. */
1759 add $16, %r10
1760 jg LABEL(nibble_ashr_12) /* cross page boundary */
1761
1762 movdqa (%rsi, %rcx), %xmm1
1763 movdqa (%rdi, %rcx), %xmm2
1764 movdqa %xmm2, %xmm4
1765
1766#ifndef USE_SSSE3
1767 psrldq $12, %xmm3
1768 pslldq $4, %xmm2
1769 por %xmm3, %xmm2 /* merge into one 16byte value */
1770#else
1771 palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */
1772#endif
1773 TOLOWER (%xmm1, %xmm2)
1774
1775 pcmpeqb %xmm1, %xmm0
1776 pcmpeqb %xmm2, %xmm1
1777 psubb %xmm0, %xmm1
1778 pmovmskb %xmm1, %edx
1779 sub $0xffff, %edx
1780 jnz LABEL(exit)
1781
1782#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1783 sub $16, %r11
1784 jbe LABEL(strcmp_exitz)
1785#endif
1786
1787 add $16, %rcx
1788 movdqa %xmm4, %xmm3
1789 jmp LABEL(loop_ashr_12)
1790
 /* Page-boundary check: decide whether one more %rdi block may be read. */
1791 .p2align 4
1792LABEL(nibble_ashr_12):
1793 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
1794 pmovmskb %xmm0, %edx
 /* 0xf000 selects bytes 12..15, the part of %xmm3 the next merge would use. */
1795 test $0xf000, %edx
1796 jnz LABEL(ashr_12_exittail)
1797
1798#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* Counter in %r11 is at most 4: finish in the tail as well. */
1799 cmp $4, %r11
1800 jbe LABEL(ashr_12_exittail)
1801#endif
1802
 /* Otherwise clear the mask, move %r10 back by one page and resume. */
1803 pxor %xmm0, %xmm0
1804 sub $0x1000, %r10
1805 jmp LABEL(gobble_ashr_12)
1806
1807 .p2align 4
1808LABEL(ashr_12_exittail):
 /* Tail: only the %rsi block is loaded.  The NUL mask and the previous
    %rdi block are shifted down by 12 bytes before aftertail does the last
    compare. */
1809 movdqa (%rsi, %rcx), %xmm1
1810 psrldq $12, %xmm0
1811 psrldq $12, %xmm3
1812 jmp LABEL(aftertail)
1813
1814/*
1815 * The following cases will be handled by ashr_13:
1816 *   rcx (offset of rsi) | rax (offset of rdi) | relative offset       | corresponding case
1817 *   n (3~15)            | n - 3               | 12 (15 + (n - 3) - n) | ashr_13
1818 */
/*
 * ashr_13 path.  Every 16-byte block read through %rsi is compared with 16
 * bytes built from two consecutive %rdi blocks: the upper 3 bytes of the
 * previous block followed by the lower 13 bytes of the current one.  All
 * loads use movdqa, so both addresses must be 16-byte aligned here.  %edx
 * and %cl are consumed below but are set before this label, outside the
 * code shown in this section.
 */
1819 .p2align 4
1820LABEL(ashr_13):
 /* First block: %xmm0 becomes the NUL mask of the %rsi block. */
1821 pxor %xmm0, %xmm0
1822 movdqa (%rdi), %xmm2
1823 movdqa (%rsi), %xmm1
1824 pcmpeqb %xmm1, %xmm0
 /* Move the low 13 bytes of the %rdi block into the top 13 lanes. */
1825 pslldq $3, %xmm2
1826 TOLOWER (%xmm1, %xmm2)
 /* A lane keeps its sign bit only if the bytes match and the %rsi byte is
    not NUL. */
1827 pcmpeqb %xmm1, %xmm2
1828 psubb %xmm0, %xmm2
1829 pmovmskb %xmm2, %r9d
 /* Shift both masks right by %cl; any difference is resolved in
    less32bytes. */
1830 shr %cl, %edx
1831 shr %cl, %r9d
1832 sub %r9d, %edx
1833 jnz LABEL(less32bytes)
 /* The first %rdi block becomes the previous block for the loop. */
1834 movdqa (%rdi), %xmm3
1835
 /* Length-limited builds: %r11 = %r11 + %rcx - 16, returning 0 through
    strcmp_exitz when that is zero or exceeds the old count.  The macro is
    empty in the other builds. */
1836 UPDATE_STRNCMP_COUNTER
1837
1838 pxor %xmm0, %xmm0
1839 mov $16, %rcx /* index for loads */
1840 mov $13, %r9d /* byte position left over from less32bytes case */
1841 /*
1842 * Setup %r10 value allows us to detect crossing a page boundary.
1843 * When %r10 goes positive we have crossed a page boundary and
1844 * need to do a nibble.
1845 */
1846 lea 13(%rdi), %r10
1847 and $0xfff, %r10 /* offset into 4K page */
1848 sub $0x1000, %r10 /* subtract 4K pagesize */
1849
 /* The loop is unrolled twice; the page check precedes each half. */
1850 .p2align 4
1851LABEL(loop_ashr_13):
1852 add $16, %r10
1853 jg LABEL(nibble_ashr_13)
1854
1855LABEL(gobble_ashr_13):
1856 movdqa (%rsi, %rcx), %xmm1
1857 movdqa (%rdi, %rcx), %xmm2
 /* Keep an unshifted copy of the %rdi block for the next merge. */
1858 movdqa %xmm2, %xmm4
1859
1860#ifndef USE_SSSE3
1861 psrldq $13, %xmm3
1862 pslldq $3, %xmm2
1863 por %xmm3, %xmm2 /* merge into one 16byte value */
1864#else
1865 palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */
1866#endif
1867 TOLOWER (%xmm1, %xmm2)
1868
 /* %edx = lanes that match and are not NUL in the %rsi block; the
    subtraction leaves zero only when all 16 lanes pass. */
1869 pcmpeqb %xmm1, %xmm0
1870 pcmpeqb %xmm2, %xmm1
1871 psubb %xmm0, %xmm1
1872 pmovmskb %xmm1, %edx
1873 sub $0xffff, %edx
1874 jnz LABEL(exit)
1875
1876#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* 16 more bytes done; return 0 once the count in %r11 is used up. */
1877 sub $16, %r11
1878 jbe LABEL(strcmp_exitz)
1879#endif
1880
1881 add $16, %rcx
1882 movdqa %xmm4, %xmm3
1883
 /* Second half of the unrolled iteration: same steps as above. */
1884 add $16, %r10
1885 jg LABEL(nibble_ashr_13) /* cross page boundary */
1886
1887 movdqa (%rsi, %rcx), %xmm1
1888 movdqa (%rdi, %rcx), %xmm2
1889 movdqa %xmm2, %xmm4
1890
1891#ifndef USE_SSSE3
1892 psrldq $13, %xmm3
1893 pslldq $3, %xmm2
1894 por %xmm3, %xmm2 /* merge into one 16byte value */
1895#else
1896 palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */
1897#endif
1898 TOLOWER (%xmm1, %xmm2)
1899
1900 pcmpeqb %xmm1, %xmm0
1901 pcmpeqb %xmm2, %xmm1
1902 psubb %xmm0, %xmm1
1903 pmovmskb %xmm1, %edx
1904 sub $0xffff, %edx
1905 jnz LABEL(exit)
1906
1907#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
1908 sub $16, %r11
1909 jbe LABEL(strcmp_exitz)
1910#endif
1911
1912 add $16, %rcx
1913 movdqa %xmm4, %xmm3
1914 jmp LABEL(loop_ashr_13)
1915
 /* Page-boundary check: decide whether one more %rdi block may be read. */
1916 .p2align 4
1917LABEL(nibble_ashr_13):
1918 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
1919 pmovmskb %xmm0, %edx
 /* 0xe000 selects bytes 13..15, the part of %xmm3 the next merge would use. */
1920 test $0xe000, %edx
1921 jnz LABEL(ashr_13_exittail)
1922
1923#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* Counter in %r11 is at most 3: finish in the tail as well. */
1924 cmp $3, %r11
1925 jbe LABEL(ashr_13_exittail)
1926#endif
1927
 /* Otherwise clear the mask, move %r10 back by one page and resume. */
1928 pxor %xmm0, %xmm0
1929 sub $0x1000, %r10
1930 jmp LABEL(gobble_ashr_13)
1931
1932 .p2align 4
1933LABEL(ashr_13_exittail):
 /* Tail: only the %rsi block is loaded.  The NUL mask and the previous
    %rdi block are shifted down by 13 bytes before aftertail does the last
    compare. */
1934 movdqa (%rsi, %rcx), %xmm1
1935 psrldq $13, %xmm0
1936 psrldq $13, %xmm3
1937 jmp LABEL(aftertail)
1938
1939/*
1940 * The following cases will be handled by ashr_14:
1941 *   rcx (offset of rsi) | rax (offset of rdi) | relative offset       | corresponding case
1942 *   n (2~15)            | n - 2               | 13 (15 + (n - 2) - n) | ashr_14
1943 */
/*
 * ashr_14 path.  Every 16-byte block read through %rsi is compared with 16
 * bytes built from two consecutive %rdi blocks: the upper 2 bytes of the
 * previous block followed by the lower 14 bytes of the current one.  All
 * loads use movdqa, so both addresses must be 16-byte aligned here.  %edx
 * and %cl are consumed below but are set before this label, outside the
 * code shown in this section.
 */
1944 .p2align 4
1945LABEL(ashr_14):
 /* First block: %xmm0 becomes the NUL mask of the %rsi block. */
1946 pxor %xmm0, %xmm0
1947 movdqa (%rdi), %xmm2
1948 movdqa (%rsi), %xmm1
1949 pcmpeqb %xmm1, %xmm0
 /* Move the low 14 bytes of the %rdi block into the top 14 lanes. */
1950 pslldq $2, %xmm2
1951 TOLOWER (%xmm1, %xmm2)
 /* A lane keeps its sign bit only if the bytes match and the %rsi byte is
    not NUL. */
1952 pcmpeqb %xmm1, %xmm2
1953 psubb %xmm0, %xmm2
1954 pmovmskb %xmm2, %r9d
 /* Shift both masks right by %cl; any difference is resolved in
    less32bytes. */
1955 shr %cl, %edx
1956 shr %cl, %r9d
1957 sub %r9d, %edx
1958 jnz LABEL(less32bytes)
 /* The first %rdi block becomes the previous block for the loop. */
1959 movdqa (%rdi), %xmm3
1960
 /* Length-limited builds: %r11 = %r11 + %rcx - 16, returning 0 through
    strcmp_exitz when that is zero or exceeds the old count.  The macro is
    empty in the other builds. */
1961 UPDATE_STRNCMP_COUNTER
1962
1963 pxor %xmm0, %xmm0
1964 mov $16, %rcx /* index for loads */
1965 mov $14, %r9d /* byte position left over from less32bytes case */
1966 /*
1967 * Setup %r10 value allows us to detect crossing a page boundary.
1968 * When %r10 goes positive we have crossed a page boundary and
1969 * need to do a nibble.
1970 */
1971 lea 14(%rdi), %r10
1972 and $0xfff, %r10 /* offset into 4K page */
1973 sub $0x1000, %r10 /* subtract 4K pagesize */
1974
 /* The loop is unrolled twice; the page check precedes each half. */
1975 .p2align 4
1976LABEL(loop_ashr_14):
1977 add $16, %r10
1978 jg LABEL(nibble_ashr_14)
1979
1980LABEL(gobble_ashr_14):
1981 movdqa (%rsi, %rcx), %xmm1
1982 movdqa (%rdi, %rcx), %xmm2
 /* Keep an unshifted copy of the %rdi block for the next merge. */
1983 movdqa %xmm2, %xmm4
1984
1985#ifndef USE_SSSE3
1986 psrldq $14, %xmm3
1987 pslldq $2, %xmm2
1988 por %xmm3, %xmm2 /* merge into one 16byte value */
1989#else
1990 palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */
1991#endif
1992 TOLOWER (%xmm1, %xmm2)
1993
 /* %edx = lanes that match and are not NUL in the %rsi block; the
    subtraction leaves zero only when all 16 lanes pass. */
1994 pcmpeqb %xmm1, %xmm0
1995 pcmpeqb %xmm2, %xmm1
1996 psubb %xmm0, %xmm1
1997 pmovmskb %xmm1, %edx
1998 sub $0xffff, %edx
1999 jnz LABEL(exit)
2000
2001#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* 16 more bytes done; return 0 once the count in %r11 is used up. */
2002 sub $16, %r11
2003 jbe LABEL(strcmp_exitz)
2004#endif
2005
2006 add $16, %rcx
2007 movdqa %xmm4, %xmm3
2008
 /* Second half of the unrolled iteration: same steps as above. */
2009 add $16, %r10
2010 jg LABEL(nibble_ashr_14) /* cross page boundary */
2011
2012 movdqa (%rsi, %rcx), %xmm1
2013 movdqa (%rdi, %rcx), %xmm2
2014 movdqa %xmm2, %xmm4
2015
2016#ifndef USE_SSSE3
2017 psrldq $14, %xmm3
2018 pslldq $2, %xmm2
2019 por %xmm3, %xmm2 /* merge into one 16byte value */
2020#else
2021 palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */
2022#endif
2023 TOLOWER (%xmm1, %xmm2)
2024
2025 pcmpeqb %xmm1, %xmm0
2026 pcmpeqb %xmm2, %xmm1
2027 psubb %xmm0, %xmm1
2028 pmovmskb %xmm1, %edx
2029 sub $0xffff, %edx
2030 jnz LABEL(exit)
2031
 /* Logical OR, as in every other counter guard of this file. */
2032#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
2033 sub $16, %r11
2034 jbe LABEL(strcmp_exitz)
2035#endif
2036
2037 add $16, %rcx
2038 movdqa %xmm4, %xmm3
2039 jmp LABEL(loop_ashr_14)
2040
 /* Page-boundary check: decide whether one more %rdi block may be read. */
2041 .p2align 4
2042LABEL(nibble_ashr_14):
2043 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
2044 pmovmskb %xmm0, %edx
 /* 0xc000 selects bytes 14..15, the part of %xmm3 the next merge would use. */
2045 test $0xc000, %edx
2046 jnz LABEL(ashr_14_exittail)
2047
2048#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* Counter in %r11 is at most 2: finish in the tail as well. */
2049 cmp $2, %r11
2050 jbe LABEL(ashr_14_exittail)
2051#endif
2052
 /* Otherwise clear the mask, move %r10 back by one page and resume. */
2053 pxor %xmm0, %xmm0
2054 sub $0x1000, %r10
2055 jmp LABEL(gobble_ashr_14)
2056
2057 .p2align 4
2058LABEL(ashr_14_exittail):
 /* Tail: only the %rsi block is loaded.  The NUL mask and the previous
    %rdi block are shifted down by 14 bytes before aftertail does the last
    compare. */
2059 movdqa (%rsi, %rcx), %xmm1
2060 psrldq $14, %xmm0
2061 psrldq $14, %xmm3
2062 jmp LABEL(aftertail)
2063
2064/*
2065 * The following cases will be handled by ashr_15:
2066 *   rcx (offset of rsi) | rax (offset of rdi) | relative offset       | corresponding case
2067 *   n (1~15)            | n - 1               | 14 (15 + (n - 1) - n) | ashr_15
2068 */
/*
 * ashr_15 path.  Every 16-byte block read through %rsi is compared with 16
 * bytes built from two consecutive %rdi blocks: the upper 1 byte of the
 * previous block followed by the lower 15 bytes of the current one.  All
 * loads use movdqa, so both addresses must be 16-byte aligned here.  %edx
 * and %cl are consumed below but are set before this label, outside the
 * code shown in this section.
 */
2069 .p2align 4
2070LABEL(ashr_15):
 /* First block: %xmm0 becomes the NUL mask of the %rsi block. */
2071 pxor %xmm0, %xmm0
2072 movdqa (%rdi), %xmm2
2073 movdqa (%rsi), %xmm1
2074 pcmpeqb %xmm1, %xmm0
 /* Move the low 15 bytes of the %rdi block into the top 15 lanes. */
2075 pslldq $1, %xmm2
2076 TOLOWER (%xmm1, %xmm2)
 /* A lane keeps its sign bit only if the bytes match and the %rsi byte is
    not NUL. */
2077 pcmpeqb %xmm1, %xmm2
2078 psubb %xmm0, %xmm2
2079 pmovmskb %xmm2, %r9d
 /* Shift both masks right by %cl; any difference is resolved in
    less32bytes. */
2080 shr %cl, %edx
2081 shr %cl, %r9d
2082 sub %r9d, %edx
2083 jnz LABEL(less32bytes)
2084
 /* The first %rdi block becomes the previous block for the loop. */
2085 movdqa (%rdi), %xmm3
2086
 /* Length-limited builds: %r11 = %r11 + %rcx - 16, returning 0 through
    strcmp_exitz when that is zero or exceeds the old count.  The macro is
    empty in the other builds. */
2087 UPDATE_STRNCMP_COUNTER
2088
2089 pxor %xmm0, %xmm0
2090 mov $16, %rcx /* index for loads */
2091 mov $15, %r9d /* byte position left over from less32bytes case */
2092 /*
2093 * Setup %r10 value allows us to detect crossing a page boundary.
2094 * When %r10 goes positive we have crossed a page boundary and
2095 * need to do a nibble.
2096 */
2097 lea 15(%rdi), %r10
2098 and $0xfff, %r10 /* offset into 4K page */
2099
2100 sub $0x1000, %r10 /* subtract 4K pagesize */
2101
 /* The loop is unrolled twice; the page check precedes each half. */
2102 .p2align 4
2103LABEL(loop_ashr_15):
2104 add $16, %r10
2105 jg LABEL(nibble_ashr_15)
2106
2107LABEL(gobble_ashr_15):
2108 movdqa (%rsi, %rcx), %xmm1
2109 movdqa (%rdi, %rcx), %xmm2
 /* Keep an unshifted copy of the %rdi block for the next merge. */
2110 movdqa %xmm2, %xmm4
2111
2112#ifndef USE_SSSE3
2113 psrldq $15, %xmm3
2114 pslldq $1, %xmm2
2115 por %xmm3, %xmm2 /* merge into one 16byte value */
2116#else
2117 palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */
2118#endif
2119 TOLOWER (%xmm1, %xmm2)
2120
 /* %edx = lanes that match and are not NUL in the %rsi block; the
    subtraction leaves zero only when all 16 lanes pass. */
2121 pcmpeqb %xmm1, %xmm0
2122 pcmpeqb %xmm2, %xmm1
2123 psubb %xmm0, %xmm1
2124 pmovmskb %xmm1, %edx
2125 sub $0xffff, %edx
2126 jnz LABEL(exit)
2127
2128#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* 16 more bytes done; return 0 once the count in %r11 is used up. */
2129 sub $16, %r11
2130 jbe LABEL(strcmp_exitz)
2131#endif
2132
2133 add $16, %rcx
2134 movdqa %xmm4, %xmm3
2135
 /* Second half of the unrolled iteration: same steps as above. */
2136 add $16, %r10
2137 jg LABEL(nibble_ashr_15) /* cross page boundary */
2138
2139 movdqa (%rsi, %rcx), %xmm1
2140 movdqa (%rdi, %rcx), %xmm2
2141 movdqa %xmm2, %xmm4
2142
2143#ifndef USE_SSSE3
2144 psrldq $15, %xmm3
2145 pslldq $1, %xmm2
2146 por %xmm3, %xmm2 /* merge into one 16byte value */
2147#else
2148 palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */
2149#endif
2150 TOLOWER (%xmm1, %xmm2)
2151
2152 pcmpeqb %xmm1, %xmm0
2153 pcmpeqb %xmm2, %xmm1
2154 psubb %xmm0, %xmm1
2155 pmovmskb %xmm1, %edx
2156 sub $0xffff, %edx
2157 jnz LABEL(exit)
2158
2159#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
2160 sub $16, %r11
2161 jbe LABEL(strcmp_exitz)
2162#endif
2163
2164 add $16, %rcx
2165 movdqa %xmm4, %xmm3
2166 jmp LABEL(loop_ashr_15)
2167
 /* Page-boundary check: decide whether one more %rdi block may be read. */
2168 .p2align 4
2169LABEL(nibble_ashr_15):
2170 pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
2171 pmovmskb %xmm0, %edx
 /* 0x8000 selects byte 15, the part of %xmm3 the next merge would use. */
2172 test $0x8000, %edx
2173 jnz LABEL(ashr_15_exittail)
2174
2175#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* Counter in %r11 is at most 1: finish in the tail as well. */
2176 cmpq $1, %r11
2177 jbe LABEL(ashr_15_exittail)
2178#endif
2179
 /* Otherwise clear the mask, move %r10 back by one page and resume. */
2180 pxor %xmm0, %xmm0
2181 sub $0x1000, %r10
2182 jmp LABEL(gobble_ashr_15)
2183
2184 .p2align 4
2185LABEL(ashr_15_exittail):
 /* Tail: only the %rsi block is loaded.  The previous %rdi block and the
    NUL mask are shifted down by 15 bytes; execution then continues into
    the aftertail label that follows, with no jump. */
2186 movdqa (%rsi, %rcx), %xmm1
2187 psrldq $15, %xmm3
2188 psrldq $15, %xmm0
2189
/*
 * aftertail: shared last compare for the ashr_N_exittail paths.  On entry
 * %xmm1 holds the %rsi block, %xmm3 the shifted previous %rdi block and
 * %xmm0 the shifted NUL mask.  Continues into the exit label below.
 */
2190 .p2align 4
2191LABEL(aftertail):
2192 TOLOWER (%xmm1, %xmm3)
 /* Mask bit set = lane matches and is not flagged in %xmm0. */
2193 pcmpeqb %xmm3, %xmm1
2194 psubb %xmm0, %xmm1
2195 pmovmskb %xmm1, %edx
 /* Invert so that set bits mark lanes that differ or are flagged. */
2196 not %edx
2197
/*
 * exit / less32bytes: advance %rdi and %rsi to the 16-byte window that
 * %edx describes.  exit derives the %rdi offset from %r9 (the byte position
 * loaded at each ashr_N) and %rcx; less32bytes is also entered directly
 * with %rax already prepared by code outside this section.
 */
2198 .p2align 4
2199LABEL(exit):
 /* %rax = %r9 + %rcx - 16 */
2200 lea -16(%r9, %rcx), %rax /* locate the exact offset for rdi */
2201LABEL(less32bytes):
2202 lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */
2203 lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */
 /* %r8d is set before this section; non-zero requests the swap below. */
2204 test %r8d, %r8d
2205 jz LABEL(ret)
2206 xchg %rsi, %rdi /* recover original order according to flag(%r8d) */
2207
/*
 * ret / less16bytes: produce the return value.  The index of the lowest
 * set bit of %rdx is used as a byte offset into both strings; the result in
 * %eax is the byte at (%rdi + index) minus the byte at (%rsi + index).
 */
2208 .p2align 4
2209LABEL(ret):
2210LABEL(less16bytes):
2211 bsf %rdx, %rdx /* find and store bit index in %rdx */
2212
2213#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 /* Return 0 when the index is not below the count left in %r11. */
2214 sub %rdx, %r11
2215 jbe LABEL(strcmp_exitz)
2216#endif
 /* Zero-extended byte loads from both strings. */
2217 movzbl (%rsi, %rdx), %ecx
2218 movzbl (%rdi, %rdx), %eax
2219
2220#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 /* Case-insensitive builds: replace each byte by its 4-byte entry in the
    table at _nl_C_LC_CTYPE_tolower + 128*4. */
2221 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
2222 movl (%rdx,%rcx,4), %ecx
2223 movl (%rdx,%rax,4), %eax
2224#endif
2225
2226 sub %ecx, %eax
2227 ret
2228
/* strcmp_exitz: return 0.  Target of the counter checks in the
   length-limited builds. */
2229LABEL(strcmp_exitz):
2230 xor %eax, %eax
2231 ret
2232
/*
 * Byte0: return in %eax the byte at (%rdi) minus the byte at (%rsi), both
 * zero-extended.  In the case-insensitive builds each byte is first
 * replaced by its entry in the tolower table.
 */
2233 .p2align 4
2234LABEL(Byte0):
 /* movzbl states the byte-to-long zero extension explicitly.  The
    suffix-less movzx form with a memory source leaves the operand size
    implicit, and less16bytes already uses movzbl for the same loads. */
2235 movzbl (%rsi), %ecx
2236 movzbl (%rdi), %eax
2237
2238#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 /* Index the table of 4-byte entries at _nl_C_LC_CTYPE_tolower + 128*4. */
2239 leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
2240 movl (%rdx,%rcx,4), %ecx
2241 movl (%rdx,%rax,4), %eax
2242#endif
2243
2244 sub %ecx, %eax
2245 ret
2246END (STRCMP)
2247
/*
 * unaligned_table: read-only table of 32-bit offsets, each the distance
 * from the start of the table to one ashr_N entry point.  Entries run
 * ashr_1 .. ashr_15, with ashr_0 last.  The code that indexes the table is
 * outside this section.
 */
2248 .section .rodata,"a",@progbits
2249 .p2align 3
2250LABEL(unaligned_table):
2251 .int LABEL(ashr_1) - LABEL(unaligned_table)
2252 .int LABEL(ashr_2) - LABEL(unaligned_table)
2253 .int LABEL(ashr_3) - LABEL(unaligned_table)
2254 .int LABEL(ashr_4) - LABEL(unaligned_table)
2255 .int LABEL(ashr_5) - LABEL(unaligned_table)
2256 .int LABEL(ashr_6) - LABEL(unaligned_table)
2257 .int LABEL(ashr_7) - LABEL(unaligned_table)
2258 .int LABEL(ashr_8) - LABEL(unaligned_table)
2259 .int LABEL(ashr_9) - LABEL(unaligned_table)
2260 .int LABEL(ashr_10) - LABEL(unaligned_table)
2261 .int LABEL(ashr_11) - LABEL(unaligned_table)
2262 .int LABEL(ashr_12) - LABEL(unaligned_table)
2263 .int LABEL(ashr_13) - LABEL(unaligned_table)
2264 .int LABEL(ashr_14) - LABEL(unaligned_table)
2265 .int LABEL(ashr_15) - LABEL(unaligned_table)
 /* ashr_0 comes last, after ashr_15. */
2266 .int LABEL(ashr_0) - LABEL(unaligned_table)
2267libc_hidden_builtin_def (STRCMP)
2268