1/* memcmp with SSSE3, wmemcmp with SSSE3
2 Copyright (C) 2011-2020 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
19
20#if IS_IN (libc)
21
22# include <sysdep.h>
23
24# ifndef MEMCMP
25# define MEMCMP __memcmp_ssse3
26# endif
27
28/* Warning!
29 wmemcmp has to use SIGNED comparison for elements.
30 memcmp has to use UNSIGNED comparison for elements.
31*/
32
33 atom_text_section
/* Entry point.  %rdi and %rsi are used as the two buffer pointers and
   %RDX_LP as the count.  Counts below 48 bytes go directly to the
   L(less48bytes) tail dispatcher; everything else goes to
   L(48bytesormore).  */
34ENTRY (MEMCMP)
35# ifdef USE_AS_WMEMCMP
/* The count is multiplied by 4 (shl $2) to turn it into a byte count; a
   zero count branches to L(equal), which is defined outside this
   excerpt.  */
36 shl $2, %RDX_LP
37 test %RDX_LP, %RDX_LP
38 jz L(equal)
39# elif defined __ILP32__
40 /* Clear the upper 32 bits. */
41 mov %edx, %edx
42# endif
/* From here on %rcx holds the byte count.  */
43 mov %rdx, %rcx
/* NOTE(review): this copy of %rdi in %rdx is not read anywhere in the
   visible part of the file before %edx is overwritten.  */
44 mov %rdi, %rdx
45 cmp $48, %rcx;
46 jae L(48bytesormore) /* LEN >= 48 */
47
/* Short input: advance both pointers by the count.  The tail handlers
   visible in this file (e.g. L(44bytes)) address the data with negative
   offsets from these pointers.  */
48 add %rcx, %rsi
49 add %rcx, %rdi
50 jmp L(less48bytes)
51
/* L(48bytesormore): compare the first 16 bytes with unaligned loads, then
   round %rdi down to a 16-byte boundary and work out how far the matching
   %rsi is from a 16-byte boundary.  */
52 .p2align 4
53/* RCX >= 48. */
54L(48bytesormore):
55 movdqu (%rdi), %xmm3
56 movdqu (%rsi), %xmm0
57 pcmpeqb %xmm0, %xmm3
/* %edx = 16-bit mask with one bit set per equal byte.  */
58 pmovmskb %xmm3, %edx
59 lea 16(%rdi), %rdi
60 lea 16(%rsi), %rsi
/* Zero only if all 16 bytes matched; otherwise L(less16bytes) picks the
   first differing byte using the pointers already advanced by 16.  */
61 sub $0xffff, %edx
62 jnz L(less16bytes)
/* %edx = low four bits of %rdi.  The xor clears exactly those bits, so
   %rdi becomes 16-byte aligned; %rsi is moved back by the same amount and
   %rcx is increased by it.  */
63 mov %edi, %edx
64 and $0xf, %edx
65 xor %rdx, %rdi
66 sub %rdx, %rsi
67 add %rdx, %rcx
/* %edx = low four bits of the adjusted %rsi.  Zero means both pointers are
   aligned (L(shr_0)); otherwise %rsi is rounded down as well and %edx is
   kept as the byte shift used by the L(shr_N) code.  */
68 mov %esi, %edx
69 and $0xf, %edx
70 jz L(shr_0)
71 xor %rdx, %rsi
72
# ifndef USE_AS_WMEMCMP
	/* Dispatch on %edx, the low four bits of the adjusted %rsi (the
	   number of bytes %rsi was rounded down by).  %edx is never 0
	   here: the code just before this table has already branched to
	   L(shr_0) for that case and has not modified %rdx since, so only
	   the values 1..15 are tested.  */
	cmp	$8, %edx
	jae	L(next_unaligned_table)
	cmp	$1, %edx
	je	L(shr_1)
	cmp	$2, %edx
	je	L(shr_2)
	cmp	$3, %edx
	je	L(shr_3)
	cmp	$4, %edx
	je	L(shr_4)
	cmp	$5, %edx
	je	L(shr_5)
	cmp	$6, %edx
	je	L(shr_6)
	jmp	L(shr_7)

	.p2align 2
L(next_unaligned_table):
	/* %edx is in 8..15 here.  */
	cmp	$8, %edx
	je	L(shr_8)
	cmp	$9, %edx
	je	L(shr_9)
	cmp	$10, %edx
	je	L(shr_10)
	cmp	$11, %edx
	je	L(shr_11)
	cmp	$12, %edx
	je	L(shr_12)
	cmp	$13, %edx
	je	L(shr_13)
	cmp	$14, %edx
	je	L(shr_14)
	jmp	L(shr_15)
# else
	/* wmemcmp build: only the shifts 4, 8 and 12 have handlers; any
	   value other than 4 or 8 ends up in L(shr_12).  As above, %edx is
	   never 0 at this point.  */
	cmp	$4, %edx
	je	L(shr_4)
	cmp	$8, %edx
	je	L(shr_8)
	jmp	L(shr_12)
# endif
118
/* L(shr_0): both pointers are 16-byte aligned.  Handles the case where
   %rcx is below 80 on entry by comparing a single 32-byte block; larger
   counts branch to L(shr_0_gobble).  */
119 .p2align 4
120L(shr_0):
121 cmp $80, %rcx
/* lea does not touch the flags, so the jae below still tests the cmp.  */
122 lea -48(%rcx), %rcx
123 jae L(shr_0_gobble)
/* %eax = 0: L(exit) adds %rax to %rsi, and nothing was rounded off here.  */
124 xor %eax, %eax
/* xmm1 = compare result for bytes 0..15, xmm2 = for bytes 16..31.  */
125 movdqa (%rsi), %xmm1
126 pcmpeqb (%rdi), %xmm1
127 movdqa 16(%rsi), %xmm2
128 pcmpeqb 16(%rdi), %xmm2
/* %edx ends up 0 only when all 32 bytes matched.  */
129 pand %xmm1, %xmm2
130 pmovmskb %xmm2, %edx
131 lea 32(%rdi), %rdi
132 lea 32(%rsi), %rsi
133 sub $0xffff, %edx
134 jnz L(exit)
/* Equal: advance both pointers by the count left in %rcx before handing
   over to L(less48bytes).  */
135 add %rcx, %rsi
136 add %rcx, %rdi
137 jmp L(less48bytes)
138
/* L(shr_0_gobble): looping form of L(shr_0), entered from there when %rcx
   was at least 80.  Both pointers are 16-byte aligned, so aligned loads
   are compared directly.  */
139 .p2align 4
140L(shr_0_gobble):
/* Prime xmm0/xmm2 with the compare results for the first 32 bytes;
   %eax = 0 because L(exit) adds %rax to %rsi.  */
141 movdqa (%rsi), %xmm0
142 xor %eax, %eax
143 pcmpeqb (%rdi), %xmm0
144 sub $32, %rcx
145 movdqa 16(%rsi), %xmm2
146 pcmpeqb 16(%rdi), %xmm2
/* Each pass tests the 32-byte result computed on the previous pass while
   already comparing the next 32 bytes.  sub sets CF on borrow and sbb
   folds it into %edx, so the jz below loops only if all 32 bytes matched
   and %rcx did not borrow; the SSE instructions and lea in between leave
   the flags alone.  */
147L(shr_0_gobble_loop):
148 pand %xmm0, %xmm2
149 sub $32, %rcx
150 pmovmskb %xmm2, %edx
151 movdqa %xmm0, %xmm1
152 movdqa 32(%rsi), %xmm0
153 movdqa 48(%rsi), %xmm2
154 sbb $0xffff, %edx
155 pcmpeqb 32(%rdi), %xmm0
156 pcmpeqb 48(%rdi), %xmm2
157 lea 32(%rdi), %rdi
158 lea 32(%rsi), %rsi
159 jz L(shr_0_gobble_loop)
160
161 pand %xmm0, %xmm2
/* If %rcx went negative, take back the extra 1 that sbb subtracted and
   the last 32 removed from %rcx.  */
162 cmp $0, %rcx
163 jge L(next)
164 inc %edx
165 add $32, %rcx
166L(next):
/* Non-zero %edx: the block just tested has a mismatch (xmm1 holds the
   result for its first 16 bytes).  */
167 test %edx, %edx
168 jnz L(exit)
169
/* Otherwise test the 32 bytes that were already compared ahead.  */
170 pmovmskb %xmm2, %edx
171 movdqa %xmm0, %xmm1
172 lea 32(%rdi), %rdi
173 lea 32(%rsi), %rsi
174 sub $0xffff, %edx
175 jnz L(exit)
/* All equal: advance both pointers by the count left in %rcx for
   L(less48bytes).  */
176 add %rcx, %rsi
177 add %rcx, %rdi
178 jmp L(less48bytes)
179
180# ifndef USE_AS_WMEMCMP
181
/* L(shr_1): %rsi was rounded down by 1 byte to a 16-byte boundary (the 1
   arrives in %edx).  Handles the case where %rcx is below 80 on entry by
   comparing a single 32-byte block; larger counts branch to
   L(shr_1_gobble).  */
182 .p2align 4
183L(shr_1):
184 cmp $80, %rcx
/* lea does not touch the flags, so the jae below still tests the cmp.  */
185 lea -48(%rcx), %rcx
/* Keep the shift in %eax: L(exit) adds %rax back to %rsi.  */
186 mov %edx, %eax
187 jae L(shr_1_gobble)
188
/* xmm1 = bytes 1..16 of the aligned 32 bytes at (%rsi), compared with the
   16 bytes at (%rdi); xmm2 keeps 16(%rsi) for the next step.  */
189 movdqa 16(%rsi), %xmm1
190 movdqa %xmm1, %xmm2
191 palignr $1, (%rsi), %xmm1
192 pcmpeqb (%rdi), %xmm1
193
/* xmm3 = the next 16 bytes of the second buffer, compared with 16(%rdi).  */
194 movdqa 32(%rsi), %xmm3
195 palignr $1, %xmm2, %xmm3
196 pcmpeqb 16(%rdi), %xmm3
197
/* %edx ends up 0 only when all 32 bytes matched.  */
198 pand %xmm1, %xmm3
199 pmovmskb %xmm3, %edx
200 lea 32(%rdi), %rdi
201 lea 32(%rsi), %rsi
202 sub $0xffff, %edx
203 jnz L(exit)
/* Equal: undo the rounding of %rsi and advance both pointers by the count
   left in %rcx before handing over to L(less48bytes).  */
204 add $1, %rsi
205 add %rcx, %rsi
206 add %rcx, %rdi
207 jmp L(less48bytes)
208
/* L(shr_1_gobble): looping form of L(shr_1), entered from there when %rcx
   was at least 80.  %rsi is still rounded down by 1 byte, so every 16-byte
   piece of the second buffer is rebuilt with palignr $1 from two aligned
   loads.  %eax already holds the shift for L(exit).  */
209 .p2align 4
210L(shr_1_gobble):
211 sub $32, %rcx
/* Prime xmm0/xmm3 with the compare results for the first 32 bytes.  */
212 movdqa 16(%rsi), %xmm0
213 palignr $1, (%rsi), %xmm0
214 pcmpeqb (%rdi), %xmm0
215
216 movdqa 32(%rsi), %xmm3
217 palignr $1, 16(%rsi), %xmm3
218 pcmpeqb 16(%rdi), %xmm3
219
/* Each pass tests the 32-byte result computed on the previous pass while
   already comparing the next 32 bytes.  sub sets CF on borrow and sbb
   folds it into %edx, so the jz below loops only if all 32 bytes matched
   and %rcx did not borrow; the SSE instructions and lea in between leave
   the flags alone.  */
220L(shr_1_gobble_loop):
221 pand %xmm0, %xmm3
222 sub $32, %rcx
223 pmovmskb %xmm3, %edx
224 movdqa %xmm0, %xmm1
225
226 movdqa 64(%rsi), %xmm3
227 palignr $1, 48(%rsi), %xmm3
228 sbb $0xffff, %edx
229 movdqa 48(%rsi), %xmm0
230 palignr $1, 32(%rsi), %xmm0
231 pcmpeqb 32(%rdi), %xmm0
232 lea 32(%rsi), %rsi
233 pcmpeqb 48(%rdi), %xmm3
234
235 lea 32(%rdi), %rdi
236 jz L(shr_1_gobble_loop)
237 pand %xmm0, %xmm3
238
/* If %rcx went negative, take back the extra 1 that sbb subtracted and
   the last 32 removed from %rcx.  */
239 cmp $0, %rcx
240 jge L(shr_1_gobble_next)
241 inc %edx
242 add $32, %rcx
243L(shr_1_gobble_next):
/* Non-zero %edx: the block just tested has a mismatch (xmm1 holds the
   result for its first 16 bytes).  */
244 test %edx, %edx
245 jnz L(exit)
246
/* Otherwise test the 32 bytes that were already compared ahead.  */
247 pmovmskb %xmm3, %edx
248 movdqa %xmm0, %xmm1
249 lea 32(%rdi), %rdi
250 lea 32(%rsi), %rsi
251 sub $0xffff, %edx
252 jnz L(exit)
253
/* All equal: undo the rounding of %rsi and advance both pointers by the
   count left in %rcx for L(less48bytes).  */
254 lea 1(%rsi), %rsi
255 add %rcx, %rsi
256 add %rcx, %rdi
257 jmp L(less48bytes)
258
259
/* L(shr_2): %rsi was rounded down by 2 bytes to a 16-byte boundary (the 2
   arrives in %edx).  Handles the case where %rcx is below 80 on entry by
   comparing a single 32-byte block; larger counts branch to
   L(shr_2_gobble).  */
260 .p2align 4
261L(shr_2):
262 cmp $80, %rcx
/* lea does not touch the flags, so the jae below still tests the cmp.  */
263 lea -48(%rcx), %rcx
/* Keep the shift in %eax: L(exit) adds %rax back to %rsi.  */
264 mov %edx, %eax
265 jae L(shr_2_gobble)
266
/* xmm1 = bytes 2..17 of the aligned 32 bytes at (%rsi), compared with the
   16 bytes at (%rdi); xmm2 keeps 16(%rsi) for the next step.  */
267 movdqa 16(%rsi), %xmm1
268 movdqa %xmm1, %xmm2
269 palignr $2, (%rsi), %xmm1
270 pcmpeqb (%rdi), %xmm1
271
/* xmm3 = the next 16 bytes of the second buffer, compared with 16(%rdi).  */
272 movdqa 32(%rsi), %xmm3
273 palignr $2, %xmm2, %xmm3
274 pcmpeqb 16(%rdi), %xmm3
275
/* %edx ends up 0 only when all 32 bytes matched.  */
276 pand %xmm1, %xmm3
277 pmovmskb %xmm3, %edx
278 lea 32(%rdi), %rdi
279 lea 32(%rsi), %rsi
280 sub $0xffff, %edx
281 jnz L(exit)
/* Equal: undo the rounding of %rsi and advance both pointers by the count
   left in %rcx before handing over to L(less48bytes).  */
282 add $2, %rsi
283 add %rcx, %rsi
284 add %rcx, %rdi
285 jmp L(less48bytes)
286
/* L(shr_2_gobble): looping form of L(shr_2), entered from there when %rcx
   was at least 80.  %rsi is still rounded down by 2 bytes, so every
   16-byte piece of the second buffer is rebuilt with palignr $2 from two
   aligned loads.  %eax already holds the shift for L(exit).  */
287 .p2align 4
288L(shr_2_gobble):
289 sub $32, %rcx
/* Prime xmm0/xmm3 with the compare results for the first 32 bytes.  */
290 movdqa 16(%rsi), %xmm0
291 palignr $2, (%rsi), %xmm0
292 pcmpeqb (%rdi), %xmm0
293
294 movdqa 32(%rsi), %xmm3
295 palignr $2, 16(%rsi), %xmm3
296 pcmpeqb 16(%rdi), %xmm3
297
/* Each pass tests the 32-byte result computed on the previous pass while
   already comparing the next 32 bytes.  sub sets CF on borrow and sbb
   folds it into %edx, so the jz below loops only if all 32 bytes matched
   and %rcx did not borrow; the SSE instructions and lea in between leave
   the flags alone.  */
298L(shr_2_gobble_loop):
299 pand %xmm0, %xmm3
300 sub $32, %rcx
301 pmovmskb %xmm3, %edx
302 movdqa %xmm0, %xmm1
303
304 movdqa 64(%rsi), %xmm3
305 palignr $2, 48(%rsi), %xmm3
306 sbb $0xffff, %edx
307 movdqa 48(%rsi), %xmm0
308 palignr $2, 32(%rsi), %xmm0
309 pcmpeqb 32(%rdi), %xmm0
310 lea 32(%rsi), %rsi
311 pcmpeqb 48(%rdi), %xmm3
312
313 lea 32(%rdi), %rdi
314 jz L(shr_2_gobble_loop)
315 pand %xmm0, %xmm3
316
/* If %rcx went negative, take back the extra 1 that sbb subtracted and
   the last 32 removed from %rcx.  */
317 cmp $0, %rcx
318 jge L(shr_2_gobble_next)
319 inc %edx
320 add $32, %rcx
321L(shr_2_gobble_next):
/* Non-zero %edx: the block just tested has a mismatch (xmm1 holds the
   result for its first 16 bytes).  */
322 test %edx, %edx
323 jnz L(exit)
324
/* Otherwise test the 32 bytes that were already compared ahead.  */
325 pmovmskb %xmm3, %edx
326 movdqa %xmm0, %xmm1
327 lea 32(%rdi), %rdi
328 lea 32(%rsi), %rsi
329 sub $0xffff, %edx
330 jnz L(exit)
331
/* All equal: undo the rounding of %rsi and advance both pointers by the
   count left in %rcx for L(less48bytes).  */
332 lea 2(%rsi), %rsi
333 add %rcx, %rsi
334 add %rcx, %rdi
335 jmp L(less48bytes)
336
/* L(shr_3): %rsi was rounded down by 3 bytes to a 16-byte boundary (the 3
   arrives in %edx).  Handles the case where %rcx is below 80 on entry by
   comparing a single 32-byte block; larger counts branch to
   L(shr_3_gobble).  */
337 .p2align 4
338L(shr_3):
339 cmp $80, %rcx
/* lea does not touch the flags, so the jae below still tests the cmp.  */
340 lea -48(%rcx), %rcx
/* Keep the shift in %eax: L(exit) adds %rax back to %rsi.  */
341 mov %edx, %eax
342 jae L(shr_3_gobble)
343
/* xmm1 = bytes 3..18 of the aligned 32 bytes at (%rsi), compared with the
   16 bytes at (%rdi); xmm2 keeps 16(%rsi) for the next step.  */
344 movdqa 16(%rsi), %xmm1
345 movdqa %xmm1, %xmm2
346 palignr $3, (%rsi), %xmm1
347 pcmpeqb (%rdi), %xmm1
348
/* xmm3 = the next 16 bytes of the second buffer, compared with 16(%rdi).  */
349 movdqa 32(%rsi), %xmm3
350 palignr $3, %xmm2, %xmm3
351 pcmpeqb 16(%rdi), %xmm3
352
/* %edx ends up 0 only when all 32 bytes matched.  */
353 pand %xmm1, %xmm3
354 pmovmskb %xmm3, %edx
355 lea 32(%rdi), %rdi
356 lea 32(%rsi), %rsi
357 sub $0xffff, %edx
358 jnz L(exit)
/* Equal: undo the rounding of %rsi and advance both pointers by the count
   left in %rcx before handing over to L(less48bytes).  */
359 add $3, %rsi
360 add %rcx, %rsi
361 add %rcx, %rdi
362 jmp L(less48bytes)
363
/* L(shr_3_gobble): looping form of L(shr_3), entered from there when %rcx
   was at least 80.  %rsi is still rounded down by 3 bytes, so every
   16-byte piece of the second buffer is rebuilt with palignr $3 from two
   aligned loads.  %eax already holds the shift for L(exit).  */
364 .p2align 4
365L(shr_3_gobble):
366 sub $32, %rcx
/* Prime xmm0/xmm3 with the compare results for the first 32 bytes.  */
367 movdqa 16(%rsi), %xmm0
368 palignr $3, (%rsi), %xmm0
369 pcmpeqb (%rdi), %xmm0
370
371 movdqa 32(%rsi), %xmm3
372 palignr $3, 16(%rsi), %xmm3
373 pcmpeqb 16(%rdi), %xmm3
374
/* Each pass tests the 32-byte result computed on the previous pass while
   already comparing the next 32 bytes.  sub sets CF on borrow and sbb
   folds it into %edx, so the jz below loops only if all 32 bytes matched
   and %rcx did not borrow; the SSE instructions and lea in between leave
   the flags alone.  */
375L(shr_3_gobble_loop):
376 pand %xmm0, %xmm3
377 sub $32, %rcx
378 pmovmskb %xmm3, %edx
379 movdqa %xmm0, %xmm1
380
381 movdqa 64(%rsi), %xmm3
382 palignr $3, 48(%rsi), %xmm3
383 sbb $0xffff, %edx
384 movdqa 48(%rsi), %xmm0
385 palignr $3, 32(%rsi), %xmm0
386 pcmpeqb 32(%rdi), %xmm0
387 lea 32(%rsi), %rsi
388 pcmpeqb 48(%rdi), %xmm3
389
390 lea 32(%rdi), %rdi
391 jz L(shr_3_gobble_loop)
392 pand %xmm0, %xmm3
393
/* If %rcx went negative, take back the extra 1 that sbb subtracted and
   the last 32 removed from %rcx.  */
394 cmp $0, %rcx
395 jge L(shr_3_gobble_next)
396 inc %edx
397 add $32, %rcx
398L(shr_3_gobble_next):
/* Non-zero %edx: the block just tested has a mismatch (xmm1 holds the
   result for its first 16 bytes).  */
399 test %edx, %edx
400 jnz L(exit)
401
/* Otherwise test the 32 bytes that were already compared ahead.  */
402 pmovmskb %xmm3, %edx
403 movdqa %xmm0, %xmm1
404 lea 32(%rdi), %rdi
405 lea 32(%rsi), %rsi
406 sub $0xffff, %edx
407 jnz L(exit)
408
/* All equal: undo the rounding of %rsi and advance both pointers by the
   count left in %rcx for L(less48bytes).  */
409 lea 3(%rsi), %rsi
410 add %rcx, %rsi
411 add %rcx, %rdi
412 jmp L(less48bytes)
413
414# endif
415
/* L(shr_4): %rsi was rounded down by 4 bytes to a 16-byte boundary (the 4
   arrives in %edx).  Handles the case where %rcx is below 80 on entry by
   comparing a single 32-byte block; larger counts branch to
   L(shr_4_gobble).  Built for both memcmp and wmemcmp.  */
416 .p2align 4
417L(shr_4):
418 cmp $80, %rcx
/* lea does not touch the flags, so the jae below still tests the cmp.  */
419 lea -48(%rcx), %rcx
/* Keep the shift in %eax: L(exit) adds %rax back to %rsi.  */
420 mov %edx, %eax
421 jae L(shr_4_gobble)
422
/* xmm1 = bytes 4..19 of the aligned 32 bytes at (%rsi), compared with the
   16 bytes at (%rdi); xmm2 keeps 16(%rsi) for the next step.  */
423 movdqa 16(%rsi), %xmm1
424 movdqa %xmm1, %xmm2
425 palignr $4, (%rsi), %xmm1
426 pcmpeqb (%rdi), %xmm1
427
/* xmm3 = the next 16 bytes of the second buffer, compared with 16(%rdi).  */
428 movdqa 32(%rsi), %xmm3
429 palignr $4, %xmm2, %xmm3
430 pcmpeqb 16(%rdi), %xmm3
431
/* %edx ends up 0 only when all 32 bytes matched.  */
432 pand %xmm1, %xmm3
433 pmovmskb %xmm3, %edx
434 lea 32(%rdi), %rdi
435 lea 32(%rsi), %rsi
436 sub $0xffff, %edx
437 jnz L(exit)
/* Equal: undo the rounding of %rsi and advance both pointers by the count
   left in %rcx before handing over to L(less48bytes).  */
438 add $4, %rsi
439 add %rcx, %rsi
440 add %rcx, %rdi
441 jmp L(less48bytes)
442
/* L(shr_4_gobble): looping form of L(shr_4), entered from there when %rcx
   was at least 80.  %rsi is still rounded down by 4 bytes, so every
   16-byte piece of the second buffer is rebuilt with palignr $4 from two
   aligned loads.  %eax already holds the shift for L(exit).  */
443 .p2align 4
444L(shr_4_gobble):
445 sub $32, %rcx
/* Prime xmm0/xmm3 with the compare results for the first 32 bytes.  */
446 movdqa 16(%rsi), %xmm0
447 palignr $4, (%rsi), %xmm0
448 pcmpeqb (%rdi), %xmm0
449
450 movdqa 32(%rsi), %xmm3
451 palignr $4, 16(%rsi), %xmm3
452 pcmpeqb 16(%rdi), %xmm3
453
/* Each pass tests the 32-byte result computed on the previous pass while
   already comparing the next 32 bytes.  sub sets CF on borrow and sbb
   folds it into %edx, so the jz below loops only if all 32 bytes matched
   and %rcx did not borrow; the SSE instructions and lea in between leave
   the flags alone.  */
454L(shr_4_gobble_loop):
455 pand %xmm0, %xmm3
456 sub $32, %rcx
457 pmovmskb %xmm3, %edx
458 movdqa %xmm0, %xmm1
459
460 movdqa 64(%rsi), %xmm3
461 palignr $4, 48(%rsi), %xmm3
462 sbb $0xffff, %edx
463 movdqa 48(%rsi), %xmm0
464 palignr $4, 32(%rsi), %xmm0
465 pcmpeqb 32(%rdi), %xmm0
466 lea 32(%rsi), %rsi
467 pcmpeqb 48(%rdi), %xmm3
468
469 lea 32(%rdi), %rdi
470 jz L(shr_4_gobble_loop)
471 pand %xmm0, %xmm3
472
/* If %rcx went negative, take back the extra 1 that sbb subtracted and
   the last 32 removed from %rcx.  */
473 cmp $0, %rcx
474 jge L(shr_4_gobble_next)
475 inc %edx
476 add $32, %rcx
477L(shr_4_gobble_next):
/* Non-zero %edx: the block just tested has a mismatch (xmm1 holds the
   result for its first 16 bytes).  */
478 test %edx, %edx
479 jnz L(exit)
480
/* Otherwise test the 32 bytes that were already compared ahead.  */
481 pmovmskb %xmm3, %edx
482 movdqa %xmm0, %xmm1
483 lea 32(%rdi), %rdi
484 lea 32(%rsi), %rsi
485 sub $0xffff, %edx
486 jnz L(exit)
487
/* All equal: undo the rounding of %rsi and advance both pointers by the
   count left in %rcx for L(less48bytes).  */
488 lea 4(%rsi), %rsi
489 add %rcx, %rsi
490 add %rcx, %rdi
491 jmp L(less48bytes)
492
493# ifndef USE_AS_WMEMCMP
494
/* L(shr_5): %rsi was rounded down by 5 bytes to a 16-byte boundary (the 5
   arrives in %edx).  Handles the case where %rcx is below 80 on entry by
   comparing a single 32-byte block; larger counts branch to
   L(shr_5_gobble).  */
495 .p2align 4
496L(shr_5):
497 cmp $80, %rcx
/* lea does not touch the flags, so the jae below still tests the cmp.  */
498 lea -48(%rcx), %rcx
/* Keep the shift in %eax: L(exit) adds %rax back to %rsi.  */
499 mov %edx, %eax
500 jae L(shr_5_gobble)
501
/* xmm1 = bytes 5..20 of the aligned 32 bytes at (%rsi), compared with the
   16 bytes at (%rdi); xmm2 keeps 16(%rsi) for the next step.  */
502 movdqa 16(%rsi), %xmm1
503 movdqa %xmm1, %xmm2
504 palignr $5, (%rsi), %xmm1
505 pcmpeqb (%rdi), %xmm1
506
/* xmm3 = the next 16 bytes of the second buffer, compared with 16(%rdi).  */
507 movdqa 32(%rsi), %xmm3
508 palignr $5, %xmm2, %xmm3
509 pcmpeqb 16(%rdi), %xmm3
510
/* %edx ends up 0 only when all 32 bytes matched.  */
511 pand %xmm1, %xmm3
512 pmovmskb %xmm3, %edx
513 lea 32(%rdi), %rdi
514 lea 32(%rsi), %rsi
515 sub $0xffff, %edx
516 jnz L(exit)
/* Equal: undo the rounding of %rsi and advance both pointers by the count
   left in %rcx before handing over to L(less48bytes).  */
517 add $5, %rsi
518 add %rcx, %rsi
519 add %rcx, %rdi
520 jmp L(less48bytes)
521
/* L(shr_5_gobble): looping form of L(shr_5), entered from there when %rcx
   was at least 80.  %rsi is still rounded down by 5 bytes, so every
   16-byte piece of the second buffer is rebuilt with palignr $5 from two
   aligned loads.  %eax already holds the shift for L(exit).  */
522 .p2align 4
523L(shr_5_gobble):
524 sub $32, %rcx
/* Prime xmm0/xmm3 with the compare results for the first 32 bytes.  */
525 movdqa 16(%rsi), %xmm0
526 palignr $5, (%rsi), %xmm0
527 pcmpeqb (%rdi), %xmm0
528
529 movdqa 32(%rsi), %xmm3
530 palignr $5, 16(%rsi), %xmm3
531 pcmpeqb 16(%rdi), %xmm3
532
/* Each pass tests the 32-byte result computed on the previous pass while
   already comparing the next 32 bytes.  sub sets CF on borrow and sbb
   folds it into %edx, so the jz below loops only if all 32 bytes matched
   and %rcx did not borrow; the SSE instructions and lea in between leave
   the flags alone.  */
533L(shr_5_gobble_loop):
534 pand %xmm0, %xmm3
535 sub $32, %rcx
536 pmovmskb %xmm3, %edx
537 movdqa %xmm0, %xmm1
538
539 movdqa 64(%rsi), %xmm3
540 palignr $5, 48(%rsi), %xmm3
541 sbb $0xffff, %edx
542 movdqa 48(%rsi), %xmm0
543 palignr $5, 32(%rsi), %xmm0
544 pcmpeqb 32(%rdi), %xmm0
545 lea 32(%rsi), %rsi
546 pcmpeqb 48(%rdi), %xmm3
547
548 lea 32(%rdi), %rdi
549 jz L(shr_5_gobble_loop)
550 pand %xmm0, %xmm3
551
/* If %rcx went negative, take back the extra 1 that sbb subtracted and
   the last 32 removed from %rcx.  */
552 cmp $0, %rcx
553 jge L(shr_5_gobble_next)
554 inc %edx
555 add $32, %rcx
556L(shr_5_gobble_next):
/* Non-zero %edx: the block just tested has a mismatch (xmm1 holds the
   result for its first 16 bytes).  */
557 test %edx, %edx
558 jnz L(exit)
559
/* Otherwise test the 32 bytes that were already compared ahead.  */
560 pmovmskb %xmm3, %edx
561 movdqa %xmm0, %xmm1
562 lea 32(%rdi), %rdi
563 lea 32(%rsi), %rsi
564 sub $0xffff, %edx
565 jnz L(exit)
566
/* All equal: undo the rounding of %rsi and advance both pointers by the
   count left in %rcx for L(less48bytes).  */
567 lea 5(%rsi), %rsi
568 add %rcx, %rsi
569 add %rcx, %rdi
570 jmp L(less48bytes)
571
/* L(shr_6): %rsi was rounded down by 6 bytes to a 16-byte boundary (the 6
   arrives in %edx).  Handles the case where %rcx is below 80 on entry by
   comparing a single 32-byte block; larger counts branch to
   L(shr_6_gobble).  */
572 .p2align 4
573L(shr_6):
574 cmp $80, %rcx
/* lea does not touch the flags, so the jae below still tests the cmp.  */
575 lea -48(%rcx), %rcx
/* Keep the shift in %eax: L(exit) adds %rax back to %rsi.  */
576 mov %edx, %eax
577 jae L(shr_6_gobble)
578
/* xmm1 = bytes 6..21 of the aligned 32 bytes at (%rsi), compared with the
   16 bytes at (%rdi); xmm2 keeps 16(%rsi) for the next step.  */
579 movdqa 16(%rsi), %xmm1
580 movdqa %xmm1, %xmm2
581 palignr $6, (%rsi), %xmm1
582 pcmpeqb (%rdi), %xmm1
583
/* xmm3 = the next 16 bytes of the second buffer, compared with 16(%rdi).  */
584 movdqa 32(%rsi), %xmm3
585 palignr $6, %xmm2, %xmm3
586 pcmpeqb 16(%rdi), %xmm3
587
/* %edx ends up 0 only when all 32 bytes matched.  */
588 pand %xmm1, %xmm3
589 pmovmskb %xmm3, %edx
590 lea 32(%rdi), %rdi
591 lea 32(%rsi), %rsi
592 sub $0xffff, %edx
593 jnz L(exit)
/* Equal: undo the rounding of %rsi and advance both pointers by the count
   left in %rcx before handing over to L(less48bytes).  */
594 add $6, %rsi
595 add %rcx, %rsi
596 add %rcx, %rdi
597 jmp L(less48bytes)
598
/* L(shr_6_gobble): looping form of L(shr_6), entered from there when %rcx
   was at least 80.  %rsi is still rounded down by 6 bytes, so every
   16-byte piece of the second buffer is rebuilt with palignr $6 from two
   aligned loads.  %eax already holds the shift for L(exit).  */
599 .p2align 4
600L(shr_6_gobble):
601 sub $32, %rcx
/* Prime xmm0/xmm3 with the compare results for the first 32 bytes.  */
602 movdqa 16(%rsi), %xmm0
603 palignr $6, (%rsi), %xmm0
604 pcmpeqb (%rdi), %xmm0
605
606 movdqa 32(%rsi), %xmm3
607 palignr $6, 16(%rsi), %xmm3
608 pcmpeqb 16(%rdi), %xmm3
609
/* Each pass tests the 32-byte result computed on the previous pass while
   already comparing the next 32 bytes.  sub sets CF on borrow and sbb
   folds it into %edx, so the jz below loops only if all 32 bytes matched
   and %rcx did not borrow; the SSE instructions and lea in between leave
   the flags alone.  */
610L(shr_6_gobble_loop):
611 pand %xmm0, %xmm3
612 sub $32, %rcx
613 pmovmskb %xmm3, %edx
614 movdqa %xmm0, %xmm1
615
616 movdqa 64(%rsi), %xmm3
617 palignr $6, 48(%rsi), %xmm3
618 sbb $0xffff, %edx
619 movdqa 48(%rsi), %xmm0
620 palignr $6, 32(%rsi), %xmm0
621 pcmpeqb 32(%rdi), %xmm0
622 lea 32(%rsi), %rsi
623 pcmpeqb 48(%rdi), %xmm3
624
625 lea 32(%rdi), %rdi
626 jz L(shr_6_gobble_loop)
627 pand %xmm0, %xmm3
628
/* If %rcx went negative, take back the extra 1 that sbb subtracted and
   the last 32 removed from %rcx.  */
629 cmp $0, %rcx
630 jge L(shr_6_gobble_next)
631 inc %edx
632 add $32, %rcx
633L(shr_6_gobble_next):
/* Non-zero %edx: the block just tested has a mismatch (xmm1 holds the
   result for its first 16 bytes).  */
634 test %edx, %edx
635 jnz L(exit)
636
/* Otherwise test the 32 bytes that were already compared ahead.  */
637 pmovmskb %xmm3, %edx
638 movdqa %xmm0, %xmm1
639 lea 32(%rdi), %rdi
640 lea 32(%rsi), %rsi
641 sub $0xffff, %edx
642 jnz L(exit)
643
/* All equal: undo the rounding of %rsi and advance both pointers by the
   count left in %rcx for L(less48bytes).  */
644 lea 6(%rsi), %rsi
645 add %rcx, %rsi
646 add %rcx, %rdi
647 jmp L(less48bytes)
648
/* L(shr_7): %rsi was rounded down by 7 bytes to a 16-byte boundary (the 7
   arrives in %edx).  Handles the case where %rcx is below 80 on entry by
   comparing a single 32-byte block; larger counts branch to
   L(shr_7_gobble).  */
649 .p2align 4
650L(shr_7):
651 cmp $80, %rcx
/* lea does not touch the flags, so the jae below still tests the cmp.  */
652 lea -48(%rcx), %rcx
/* Keep the shift in %eax: L(exit) adds %rax back to %rsi.  */
653 mov %edx, %eax
654 jae L(shr_7_gobble)
655
/* xmm1 = bytes 7..22 of the aligned 32 bytes at (%rsi), compared with the
   16 bytes at (%rdi); xmm2 keeps 16(%rsi) for the next step.  */
656 movdqa 16(%rsi), %xmm1
657 movdqa %xmm1, %xmm2
658 palignr $7, (%rsi), %xmm1
659 pcmpeqb (%rdi), %xmm1
660
/* xmm3 = the next 16 bytes of the second buffer, compared with 16(%rdi).  */
661 movdqa 32(%rsi), %xmm3
662 palignr $7, %xmm2, %xmm3
663 pcmpeqb 16(%rdi), %xmm3
664
/* %edx ends up 0 only when all 32 bytes matched.  */
665 pand %xmm1, %xmm3
666 pmovmskb %xmm3, %edx
667 lea 32(%rdi), %rdi
668 lea 32(%rsi), %rsi
669 sub $0xffff, %edx
670 jnz L(exit)
/* Equal: undo the rounding of %rsi and advance both pointers by the count
   left in %rcx before handing over to L(less48bytes).  */
671 add $7, %rsi
672 add %rcx, %rsi
673 add %rcx, %rdi
674 jmp L(less48bytes)
675
/* L(shr_7_gobble): looping form of L(shr_7), entered from there when %rcx
   was at least 80.  %rsi is still rounded down by 7 bytes, so every
   16-byte piece of the second buffer is rebuilt with palignr $7 from two
   aligned loads.  %eax already holds the shift for L(exit).  */
676 .p2align 4
677L(shr_7_gobble):
678 sub $32, %rcx
/* Prime xmm0/xmm3 with the compare results for the first 32 bytes.  */
679 movdqa 16(%rsi), %xmm0
680 palignr $7, (%rsi), %xmm0
681 pcmpeqb (%rdi), %xmm0
682
683 movdqa 32(%rsi), %xmm3
684 palignr $7, 16(%rsi), %xmm3
685 pcmpeqb 16(%rdi), %xmm3
686
/* Each pass tests the 32-byte result computed on the previous pass while
   already comparing the next 32 bytes.  sub sets CF on borrow and sbb
   folds it into %edx, so the jz below loops only if all 32 bytes matched
   and %rcx did not borrow; the SSE instructions and lea in between leave
   the flags alone.  */
687L(shr_7_gobble_loop):
688 pand %xmm0, %xmm3
689 sub $32, %rcx
690 pmovmskb %xmm3, %edx
691 movdqa %xmm0, %xmm1
692
693 movdqa 64(%rsi), %xmm3
694 palignr $7, 48(%rsi), %xmm3
695 sbb $0xffff, %edx
696 movdqa 48(%rsi), %xmm0
697 palignr $7, 32(%rsi), %xmm0
698 pcmpeqb 32(%rdi), %xmm0
699 lea 32(%rsi), %rsi
700 pcmpeqb 48(%rdi), %xmm3
701
702 lea 32(%rdi), %rdi
703 jz L(shr_7_gobble_loop)
704 pand %xmm0, %xmm3
705
/* If %rcx went negative, take back the extra 1 that sbb subtracted and
   the last 32 removed from %rcx.  */
706 cmp $0, %rcx
707 jge L(shr_7_gobble_next)
708 inc %edx
709 add $32, %rcx
710L(shr_7_gobble_next):
/* Non-zero %edx: the block just tested has a mismatch (xmm1 holds the
   result for its first 16 bytes).  */
711 test %edx, %edx
712 jnz L(exit)
713
/* Otherwise test the 32 bytes that were already compared ahead.  */
714 pmovmskb %xmm3, %edx
715 movdqa %xmm0, %xmm1
716 lea 32(%rdi), %rdi
717 lea 32(%rsi), %rsi
718 sub $0xffff, %edx
719 jnz L(exit)
720
/* All equal: undo the rounding of %rsi and advance both pointers by the
   count left in %rcx for L(less48bytes).  */
721 lea 7(%rsi), %rsi
722 add %rcx, %rsi
723 add %rcx, %rdi
724 jmp L(less48bytes)
725
726# endif
727
/* L(shr_8): %rsi was rounded down by 8 bytes to a 16-byte boundary (the 8
   arrives in %edx).  Handles the case where %rcx is below 80 on entry by
   comparing a single 32-byte block; larger counts branch to
   L(shr_8_gobble).  Built for both memcmp and wmemcmp.  */
728 .p2align 4
729L(shr_8):
730 cmp $80, %rcx
/* lea does not touch the flags, so the jae below still tests the cmp.  */
731 lea -48(%rcx), %rcx
/* Keep the shift in %eax: L(exit) adds %rax back to %rsi.  */
732 mov %edx, %eax
733 jae L(shr_8_gobble)
734
/* xmm1 = bytes 8..23 of the aligned 32 bytes at (%rsi), compared with the
   16 bytes at (%rdi); xmm2 keeps 16(%rsi) for the next step.  */
735 movdqa 16(%rsi), %xmm1
736 movdqa %xmm1, %xmm2
737 palignr $8, (%rsi), %xmm1
738 pcmpeqb (%rdi), %xmm1
739
/* xmm3 = the next 16 bytes of the second buffer, compared with 16(%rdi).  */
740 movdqa 32(%rsi), %xmm3
741 palignr $8, %xmm2, %xmm3
742 pcmpeqb 16(%rdi), %xmm3
743
/* %edx ends up 0 only when all 32 bytes matched.  */
744 pand %xmm1, %xmm3
745 pmovmskb %xmm3, %edx
746 lea 32(%rdi), %rdi
747 lea 32(%rsi), %rsi
748 sub $0xffff, %edx
749 jnz L(exit)
/* Equal: undo the rounding of %rsi and advance both pointers by the count
   left in %rcx before handing over to L(less48bytes).  */
750 add $8, %rsi
751 add %rcx, %rsi
752 add %rcx, %rdi
753 jmp L(less48bytes)
754
/* L(shr_8_gobble): looping form of L(shr_8), entered from there when %rcx
   was at least 80.  %rsi is still rounded down by 8 bytes, so every
   16-byte piece of the second buffer is rebuilt with palignr $8 from two
   aligned loads.  %eax already holds the shift for L(exit).  */
755 .p2align 4
756L(shr_8_gobble):
757 sub $32, %rcx
/* Prime xmm0/xmm3 with the compare results for the first 32 bytes.  */
758 movdqa 16(%rsi), %xmm0
759 palignr $8, (%rsi), %xmm0
760 pcmpeqb (%rdi), %xmm0
761
762 movdqa 32(%rsi), %xmm3
763 palignr $8, 16(%rsi), %xmm3
764 pcmpeqb 16(%rdi), %xmm3
765
/* Each pass tests the 32-byte result computed on the previous pass while
   already comparing the next 32 bytes.  sub sets CF on borrow and sbb
   folds it into %edx, so the jz below loops only if all 32 bytes matched
   and %rcx did not borrow; the SSE instructions and lea in between leave
   the flags alone.  */
766L(shr_8_gobble_loop):
767 pand %xmm0, %xmm3
768 sub $32, %rcx
769 pmovmskb %xmm3, %edx
770 movdqa %xmm0, %xmm1
771
772 movdqa 64(%rsi), %xmm3
773 palignr $8, 48(%rsi), %xmm3
774 sbb $0xffff, %edx
775 movdqa 48(%rsi), %xmm0
776 palignr $8, 32(%rsi), %xmm0
777 pcmpeqb 32(%rdi), %xmm0
778 lea 32(%rsi), %rsi
779 pcmpeqb 48(%rdi), %xmm3
780
781 lea 32(%rdi), %rdi
782 jz L(shr_8_gobble_loop)
783 pand %xmm0, %xmm3
784
/* If %rcx went negative, take back the extra 1 that sbb subtracted and
   the last 32 removed from %rcx.  */
785 cmp $0, %rcx
786 jge L(shr_8_gobble_next)
787 inc %edx
788 add $32, %rcx
789L(shr_8_gobble_next):
/* Non-zero %edx: the block just tested has a mismatch (xmm1 holds the
   result for its first 16 bytes).  */
790 test %edx, %edx
791 jnz L(exit)
792
/* Otherwise test the 32 bytes that were already compared ahead.  */
793 pmovmskb %xmm3, %edx
794 movdqa %xmm0, %xmm1
795 lea 32(%rdi), %rdi
796 lea 32(%rsi), %rsi
797 sub $0xffff, %edx
798 jnz L(exit)
799
/* All equal: undo the rounding of %rsi and advance both pointers by the
   count left in %rcx for L(less48bytes).  */
800 lea 8(%rsi), %rsi
801 add %rcx, %rsi
802 add %rcx, %rdi
803 jmp L(less48bytes)
804
805# ifndef USE_AS_WMEMCMP
806
/* L(shr_9): %rsi was rounded down by 9 bytes to a 16-byte boundary (the 9
   arrives in %edx).  Handles the case where %rcx is below 80 on entry by
   comparing a single 32-byte block; larger counts branch to
   L(shr_9_gobble).  */
807 .p2align 4
808L(shr_9):
809 cmp $80, %rcx
/* lea does not touch the flags, so the jae below still tests the cmp.  */
810 lea -48(%rcx), %rcx
/* Keep the shift in %eax: L(exit) adds %rax back to %rsi.  */
811 mov %edx, %eax
812 jae L(shr_9_gobble)
813
/* xmm1 = bytes 9..24 of the aligned 32 bytes at (%rsi), compared with the
   16 bytes at (%rdi); xmm2 keeps 16(%rsi) for the next step.  */
814 movdqa 16(%rsi), %xmm1
815 movdqa %xmm1, %xmm2
816 palignr $9, (%rsi), %xmm1
817 pcmpeqb (%rdi), %xmm1
818
/* xmm3 = the next 16 bytes of the second buffer, compared with 16(%rdi).  */
819 movdqa 32(%rsi), %xmm3
820 palignr $9, %xmm2, %xmm3
821 pcmpeqb 16(%rdi), %xmm3
822
/* %edx ends up 0 only when all 32 bytes matched.  */
823 pand %xmm1, %xmm3
824 pmovmskb %xmm3, %edx
825 lea 32(%rdi), %rdi
826 lea 32(%rsi), %rsi
827 sub $0xffff, %edx
828 jnz L(exit)
/* Equal: undo the rounding of %rsi and advance both pointers by the count
   left in %rcx before handing over to L(less48bytes).  */
829 add $9, %rsi
830 add %rcx, %rsi
831 add %rcx, %rdi
832 jmp L(less48bytes)
833
/* L(shr_9_gobble): looping form of L(shr_9), entered from there when %rcx
   was at least 80.  %rsi is still rounded down by 9 bytes, so every
   16-byte piece of the second buffer is rebuilt with palignr $9 from two
   aligned loads.  %eax already holds the shift for L(exit).  */
834 .p2align 4
835L(shr_9_gobble):
836 sub $32, %rcx
/* Prime xmm0/xmm3 with the compare results for the first 32 bytes.  */
837 movdqa 16(%rsi), %xmm0
838 palignr $9, (%rsi), %xmm0
839 pcmpeqb (%rdi), %xmm0
840
841 movdqa 32(%rsi), %xmm3
842 palignr $9, 16(%rsi), %xmm3
843 pcmpeqb 16(%rdi), %xmm3
844
/* Each pass tests the 32-byte result computed on the previous pass while
   already comparing the next 32 bytes.  sub sets CF on borrow and sbb
   folds it into %edx, so the jz below loops only if all 32 bytes matched
   and %rcx did not borrow; the SSE instructions and lea in between leave
   the flags alone.  */
845L(shr_9_gobble_loop):
846 pand %xmm0, %xmm3
847 sub $32, %rcx
848 pmovmskb %xmm3, %edx
849 movdqa %xmm0, %xmm1
850
851 movdqa 64(%rsi), %xmm3
852 palignr $9, 48(%rsi), %xmm3
853 sbb $0xffff, %edx
854 movdqa 48(%rsi), %xmm0
855 palignr $9, 32(%rsi), %xmm0
856 pcmpeqb 32(%rdi), %xmm0
857 lea 32(%rsi), %rsi
858 pcmpeqb 48(%rdi), %xmm3
859
860 lea 32(%rdi), %rdi
861 jz L(shr_9_gobble_loop)
862 pand %xmm0, %xmm3
863
/* If %rcx went negative, take back the extra 1 that sbb subtracted and
   the last 32 removed from %rcx.  */
864 cmp $0, %rcx
865 jge L(shr_9_gobble_next)
866 inc %edx
867 add $32, %rcx
868L(shr_9_gobble_next):
/* Non-zero %edx: the block just tested has a mismatch (xmm1 holds the
   result for its first 16 bytes).  */
869 test %edx, %edx
870 jnz L(exit)
871
/* Otherwise test the 32 bytes that were already compared ahead.  */
872 pmovmskb %xmm3, %edx
873 movdqa %xmm0, %xmm1
874 lea 32(%rdi), %rdi
875 lea 32(%rsi), %rsi
876 sub $0xffff, %edx
877 jnz L(exit)
878
/* All equal: undo the rounding of %rsi and advance both pointers by the
   count left in %rcx for L(less48bytes).  */
879 lea 9(%rsi), %rsi
880 add %rcx, %rsi
881 add %rcx, %rdi
882 jmp L(less48bytes)
883
/* L(shr_10): %rsi was rounded down by 10 bytes to a 16-byte boundary (the
   10 arrives in %edx).  Handles the case where %rcx is below 80 on entry
   by comparing a single 32-byte block; larger counts branch to
   L(shr_10_gobble).  */
884 .p2align 4
885L(shr_10):
886 cmp $80, %rcx
/* lea does not touch the flags, so the jae below still tests the cmp.  */
887 lea -48(%rcx), %rcx
/* Keep the shift in %eax: L(exit) adds %rax back to %rsi.  */
888 mov %edx, %eax
889 jae L(shr_10_gobble)
890
/* xmm1 = bytes 10..25 of the aligned 32 bytes at (%rsi), compared with the
   16 bytes at (%rdi); xmm2 keeps 16(%rsi) for the next step.  */
891 movdqa 16(%rsi), %xmm1
892 movdqa %xmm1, %xmm2
893 palignr $10, (%rsi), %xmm1
894 pcmpeqb (%rdi), %xmm1
895
/* xmm3 = the next 16 bytes of the second buffer, compared with 16(%rdi).  */
896 movdqa 32(%rsi), %xmm3
897 palignr $10, %xmm2, %xmm3
898 pcmpeqb 16(%rdi), %xmm3
899
/* %edx ends up 0 only when all 32 bytes matched.  */
900 pand %xmm1, %xmm3
901 pmovmskb %xmm3, %edx
902 lea 32(%rdi), %rdi
903 lea 32(%rsi), %rsi
904 sub $0xffff, %edx
905 jnz L(exit)
/* Equal: undo the rounding of %rsi and advance both pointers by the count
   left in %rcx before handing over to L(less48bytes).  */
906 add $10, %rsi
907 add %rcx, %rsi
908 add %rcx, %rdi
909 jmp L(less48bytes)
910
/* L(shr_10_gobble): looping form of L(shr_10), entered from there when
   %rcx was at least 80.  %rsi is still rounded down by 10 bytes, so every
   16-byte piece of the second buffer is rebuilt with palignr $10 from two
   aligned loads.  %eax already holds the shift for L(exit).  */
911 .p2align 4
912L(shr_10_gobble):
913 sub $32, %rcx
/* Prime xmm0/xmm3 with the compare results for the first 32 bytes.  */
914 movdqa 16(%rsi), %xmm0
915 palignr $10, (%rsi), %xmm0
916 pcmpeqb (%rdi), %xmm0
917
918 movdqa 32(%rsi), %xmm3
919 palignr $10, 16(%rsi), %xmm3
920 pcmpeqb 16(%rdi), %xmm3
921
/* Each pass tests the 32-byte result computed on the previous pass while
   already comparing the next 32 bytes.  sub sets CF on borrow and sbb
   folds it into %edx, so the jz below loops only if all 32 bytes matched
   and %rcx did not borrow; the SSE instructions and lea in between leave
   the flags alone.  */
922L(shr_10_gobble_loop):
923 pand %xmm0, %xmm3
924 sub $32, %rcx
925 pmovmskb %xmm3, %edx
926 movdqa %xmm0, %xmm1
927
928 movdqa 64(%rsi), %xmm3
929 palignr $10, 48(%rsi), %xmm3
930 sbb $0xffff, %edx
931 movdqa 48(%rsi), %xmm0
932 palignr $10, 32(%rsi), %xmm0
933 pcmpeqb 32(%rdi), %xmm0
934 lea 32(%rsi), %rsi
935 pcmpeqb 48(%rdi), %xmm3
936
937 lea 32(%rdi), %rdi
938 jz L(shr_10_gobble_loop)
939 pand %xmm0, %xmm3
940
/* If %rcx went negative, take back the extra 1 that sbb subtracted and
   the last 32 removed from %rcx.  */
941 cmp $0, %rcx
942 jge L(shr_10_gobble_next)
943 inc %edx
944 add $32, %rcx
945L(shr_10_gobble_next):
/* Non-zero %edx: the block just tested has a mismatch (xmm1 holds the
   result for its first 16 bytes).  */
946 test %edx, %edx
947 jnz L(exit)
948
/* Otherwise test the 32 bytes that were already compared ahead.  */
949 pmovmskb %xmm3, %edx
950 movdqa %xmm0, %xmm1
951 lea 32(%rdi), %rdi
952 lea 32(%rsi), %rsi
953 sub $0xffff, %edx
954 jnz L(exit)
955
/* All equal: undo the rounding of %rsi and advance both pointers by the
   count left in %rcx for L(less48bytes).  */
956 lea 10(%rsi), %rsi
957 add %rcx, %rsi
958 add %rcx, %rdi
959 jmp L(less48bytes)
960
/* L(shr_11): %rsi was rounded down by 11 bytes to a 16-byte boundary (the
   11 arrives in %edx).  Handles the case where %rcx is below 80 on entry
   by comparing a single 32-byte block; larger counts branch to
   L(shr_11_gobble).  */
961 .p2align 4
962L(shr_11):
963 cmp $80, %rcx
/* lea does not touch the flags, so the jae below still tests the cmp.  */
964 lea -48(%rcx), %rcx
/* Keep the shift in %eax: L(exit) adds %rax back to %rsi.  */
965 mov %edx, %eax
966 jae L(shr_11_gobble)
967
/* xmm1 = bytes 11..26 of the aligned 32 bytes at (%rsi), compared with the
   16 bytes at (%rdi); xmm2 keeps 16(%rsi) for the next step.  */
968 movdqa 16(%rsi), %xmm1
969 movdqa %xmm1, %xmm2
970 palignr $11, (%rsi), %xmm1
971 pcmpeqb (%rdi), %xmm1
972
/* xmm3 = the next 16 bytes of the second buffer, compared with 16(%rdi).  */
973 movdqa 32(%rsi), %xmm3
974 palignr $11, %xmm2, %xmm3
975 pcmpeqb 16(%rdi), %xmm3
976
/* %edx ends up 0 only when all 32 bytes matched.  */
977 pand %xmm1, %xmm3
978 pmovmskb %xmm3, %edx
979 lea 32(%rdi), %rdi
980 lea 32(%rsi), %rsi
981 sub $0xffff, %edx
982 jnz L(exit)
/* Equal: undo the rounding of %rsi and advance both pointers by the count
   left in %rcx before handing over to L(less48bytes).  */
983 add $11, %rsi
984 add %rcx, %rsi
985 add %rcx, %rdi
986 jmp L(less48bytes)
987
/* L(shr_11_gobble): looping form of L(shr_11), entered from there when
   %rcx was at least 80.  %rsi is still rounded down by 11 bytes, so every
   16-byte piece of the second buffer is rebuilt with palignr $11 from two
   aligned loads.  %eax already holds the shift for L(exit).  */
988 .p2align 4
989L(shr_11_gobble):
990 sub $32, %rcx
/* Prime xmm0/xmm3 with the compare results for the first 32 bytes.  */
991 movdqa 16(%rsi), %xmm0
992 palignr $11, (%rsi), %xmm0
993 pcmpeqb (%rdi), %xmm0
994
995 movdqa 32(%rsi), %xmm3
996 palignr $11, 16(%rsi), %xmm3
997 pcmpeqb 16(%rdi), %xmm3
998
/* Each pass tests the 32-byte result computed on the previous pass while
   already comparing the next 32 bytes.  sub sets CF on borrow and sbb
   folds it into %edx, so the jz below loops only if all 32 bytes matched
   and %rcx did not borrow; the SSE instructions and lea in between leave
   the flags alone.  */
999L(shr_11_gobble_loop):
1000 pand %xmm0, %xmm3
1001 sub $32, %rcx
1002 pmovmskb %xmm3, %edx
1003 movdqa %xmm0, %xmm1
1004
1005 movdqa 64(%rsi), %xmm3
1006 palignr $11, 48(%rsi), %xmm3
1007 sbb $0xffff, %edx
1008 movdqa 48(%rsi), %xmm0
1009 palignr $11, 32(%rsi), %xmm0
1010 pcmpeqb 32(%rdi), %xmm0
1011 lea 32(%rsi), %rsi
1012 pcmpeqb 48(%rdi), %xmm3
1013
1014 lea 32(%rdi), %rdi
1015 jz L(shr_11_gobble_loop)
1016 pand %xmm0, %xmm3
1017
/* If %rcx went negative, take back the extra 1 that sbb subtracted and
   the last 32 removed from %rcx.  */
1018 cmp $0, %rcx
1019 jge L(shr_11_gobble_next)
1020 inc %edx
1021 add $32, %rcx
1022L(shr_11_gobble_next):
/* Non-zero %edx: the block just tested has a mismatch (xmm1 holds the
   result for its first 16 bytes).  */
1023 test %edx, %edx
1024 jnz L(exit)
1025
/* Otherwise test the 32 bytes that were already compared ahead.  */
1026 pmovmskb %xmm3, %edx
1027 movdqa %xmm0, %xmm1
1028 lea 32(%rdi), %rdi
1029 lea 32(%rsi), %rsi
1030 sub $0xffff, %edx
1031 jnz L(exit)
1032
/* All equal: undo the rounding of %rsi and advance both pointers by the
   count left in %rcx for L(less48bytes).  */
1033 lea 11(%rsi), %rsi
1034 add %rcx, %rsi
1035 add %rcx, %rdi
1036 jmp L(less48bytes)
1037
1038# endif
1039
/* L(shr_12): %rsi was rounded down by 12 bytes to a 16-byte boundary.
   Handles the case where %rcx is below 80 on entry by comparing a single
   32-byte block; larger counts branch to L(shr_12_gobble).  Built for both
   memcmp and wmemcmp; in the wmemcmp build this is also the fall-through
   target of the alignment dispatch.  */
1040 .p2align 4
1041L(shr_12):
1042 cmp $80, %rcx
/* lea does not touch the flags, so the jae below still tests the cmp.  */
1043 lea -48(%rcx), %rcx
/* Keep the value of %edx in %eax: L(exit) adds %rax back to %rsi.  */
1044 mov %edx, %eax
1045 jae L(shr_12_gobble)
1046
/* xmm1 = bytes 12..27 of the aligned 32 bytes at (%rsi), compared with the
   16 bytes at (%rdi); xmm2 keeps 16(%rsi) for the next step.  */
1047 movdqa 16(%rsi), %xmm1
1048 movdqa %xmm1, %xmm2
1049 palignr $12, (%rsi), %xmm1
1050 pcmpeqb (%rdi), %xmm1
1051
/* xmm3 = the next 16 bytes of the second buffer, compared with 16(%rdi).  */
1052 movdqa 32(%rsi), %xmm3
1053 palignr $12, %xmm2, %xmm3
1054 pcmpeqb 16(%rdi), %xmm3
1055
/* %edx ends up 0 only when all 32 bytes matched.  */
1056 pand %xmm1, %xmm3
1057 pmovmskb %xmm3, %edx
1058 lea 32(%rdi), %rdi
1059 lea 32(%rsi), %rsi
1060 sub $0xffff, %edx
1061 jnz L(exit)
/* Equal: undo the rounding of %rsi and advance both pointers by the count
   left in %rcx before handing over to L(less48bytes).  */
1062 add $12, %rsi
1063 add %rcx, %rsi
1064 add %rcx, %rdi
1065 jmp L(less48bytes)
1066
/* L(shr_12_gobble): looping form of L(shr_12), entered from there when
   %rcx was at least 80.  %rsi is still rounded down by 12 bytes, so every
   16-byte piece of the second buffer is rebuilt with palignr $12 from two
   aligned loads.  %eax was already set by L(shr_12) for L(exit).  */
1067 .p2align 4
1068L(shr_12_gobble):
1069 sub $32, %rcx
/* Prime xmm0/xmm3 with the compare results for the first 32 bytes.  */
1070 movdqa 16(%rsi), %xmm0
1071 palignr $12, (%rsi), %xmm0
1072 pcmpeqb (%rdi), %xmm0
1073
1074 movdqa 32(%rsi), %xmm3
1075 palignr $12, 16(%rsi), %xmm3
1076 pcmpeqb 16(%rdi), %xmm3
1077
/* Each pass tests the 32-byte result computed on the previous pass while
   already comparing the next 32 bytes.  sub sets CF on borrow and sbb
   folds it into %edx, so the jz below loops only if all 32 bytes matched
   and %rcx did not borrow; the SSE instructions and lea in between leave
   the flags alone.  */
1078L(shr_12_gobble_loop):
1079 pand %xmm0, %xmm3
1080 sub $32, %rcx
1081 pmovmskb %xmm3, %edx
1082 movdqa %xmm0, %xmm1
1083
1084 movdqa 64(%rsi), %xmm3
1085 palignr $12, 48(%rsi), %xmm3
1086 sbb $0xffff, %edx
1087 movdqa 48(%rsi), %xmm0
1088 palignr $12, 32(%rsi), %xmm0
1089 pcmpeqb 32(%rdi), %xmm0
1090 lea 32(%rsi), %rsi
1091 pcmpeqb 48(%rdi), %xmm3
1092
1093 lea 32(%rdi), %rdi
1094 jz L(shr_12_gobble_loop)
1095 pand %xmm0, %xmm3
1096
/* If %rcx went negative, take back the extra 1 that sbb subtracted and
   the last 32 removed from %rcx.  */
1097 cmp $0, %rcx
1098 jge L(shr_12_gobble_next)
1099 inc %edx
1100 add $32, %rcx
1101L(shr_12_gobble_next):
/* Non-zero %edx: the block just tested has a mismatch (xmm1 holds the
   result for its first 16 bytes).  */
1102 test %edx, %edx
1103 jnz L(exit)
1104
/* Otherwise test the 32 bytes that were already compared ahead.  */
1105 pmovmskb %xmm3, %edx
1106 movdqa %xmm0, %xmm1
1107 lea 32(%rdi), %rdi
1108 lea 32(%rsi), %rsi
1109 sub $0xffff, %edx
1110 jnz L(exit)
1111
/* All equal: undo the rounding of %rsi and advance both pointers by the
   count left in %rcx for L(less48bytes).  */
1112 lea 12(%rsi), %rsi
1113 add %rcx, %rsi
1114 add %rcx, %rdi
1115 jmp L(less48bytes)
1116
1117# ifndef USE_AS_WMEMCMP
1118
/* L(shr_13): %rsi was rounded down by 13 bytes to a 16-byte boundary (the
   13 arrives in %edx).  Handles the case where %rcx is below 80 on entry
   by comparing a single 32-byte block; larger counts branch to
   L(shr_13_gobble).  */
1119 .p2align 4
1120L(shr_13):
1121 cmp $80, %rcx
/* lea does not touch the flags, so the jae below still tests the cmp.  */
1122 lea -48(%rcx), %rcx
/* Keep the shift in %eax: L(exit) adds %rax back to %rsi.  */
1123 mov %edx, %eax
1124 jae L(shr_13_gobble)
1125
/* xmm1 = bytes 13..28 of the aligned 32 bytes at (%rsi), compared with the
   16 bytes at (%rdi); xmm2 keeps 16(%rsi) for the next step.  */
1126 movdqa 16(%rsi), %xmm1
1127 movdqa %xmm1, %xmm2
1128 palignr $13, (%rsi), %xmm1
1129 pcmpeqb (%rdi), %xmm1
1130
/* xmm3 = the next 16 bytes of the second buffer, compared with 16(%rdi).  */
1131 movdqa 32(%rsi), %xmm3
1132 palignr $13, %xmm2, %xmm3
1133 pcmpeqb 16(%rdi), %xmm3
1134
/* %edx ends up 0 only when all 32 bytes matched.  */
1135 pand %xmm1, %xmm3
1136 pmovmskb %xmm3, %edx
1137 lea 32(%rdi), %rdi
1138 lea 32(%rsi), %rsi
1139 sub $0xffff, %edx
1140 jnz L(exit)
/* Equal: undo the rounding of %rsi and advance both pointers by the count
   left in %rcx before handing over to L(less48bytes).  */
1141 add $13, %rsi
1142 add %rcx, %rsi
1143 add %rcx, %rdi
1144 jmp L(less48bytes)
1145
/* L(shr_13_gobble): looping form of L(shr_13), entered from there when
   %rcx was at least 80.  %rsi is still rounded down by 13 bytes, so every
   16-byte piece of the second buffer is rebuilt with palignr $13 from two
   aligned loads.  %eax already holds the shift for L(exit).  */
1146 .p2align 4
1147L(shr_13_gobble):
1148 sub $32, %rcx
/* Prime xmm0/xmm3 with the compare results for the first 32 bytes.  */
1149 movdqa 16(%rsi), %xmm0
1150 palignr $13, (%rsi), %xmm0
1151 pcmpeqb (%rdi), %xmm0
1152
1153 movdqa 32(%rsi), %xmm3
1154 palignr $13, 16(%rsi), %xmm3
1155 pcmpeqb 16(%rdi), %xmm3
1156
/* Each pass tests the 32-byte result computed on the previous pass while
   already comparing the next 32 bytes.  sub sets CF on borrow and sbb
   folds it into %edx, so the jz below loops only if all 32 bytes matched
   and %rcx did not borrow; the SSE instructions and lea in between leave
   the flags alone.  */
1157L(shr_13_gobble_loop):
1158 pand %xmm0, %xmm3
1159 sub $32, %rcx
1160 pmovmskb %xmm3, %edx
1161 movdqa %xmm0, %xmm1
1162
1163 movdqa 64(%rsi), %xmm3
1164 palignr $13, 48(%rsi), %xmm3
1165 sbb $0xffff, %edx
1166 movdqa 48(%rsi), %xmm0
1167 palignr $13, 32(%rsi), %xmm0
1168 pcmpeqb 32(%rdi), %xmm0
1169 lea 32(%rsi), %rsi
1170 pcmpeqb 48(%rdi), %xmm3
1171
1172 lea 32(%rdi), %rdi
1173 jz L(shr_13_gobble_loop)
1174 pand %xmm0, %xmm3
1175
/* If %rcx went negative, take back the extra 1 that sbb subtracted and
   the last 32 removed from %rcx.  */
1176 cmp $0, %rcx
1177 jge L(shr_13_gobble_next)
1178 inc %edx
1179 add $32, %rcx
1180L(shr_13_gobble_next):
/* Non-zero %edx: the block just tested has a mismatch (xmm1 holds the
   result for its first 16 bytes).  */
1181 test %edx, %edx
1182 jnz L(exit)
1183
/* Otherwise test the 32 bytes that were already compared ahead.  */
1184 pmovmskb %xmm3, %edx
1185 movdqa %xmm0, %xmm1
1186 lea 32(%rdi), %rdi
1187 lea 32(%rsi), %rsi
1188 sub $0xffff, %edx
1189 jnz L(exit)
1190
/* All equal: undo the rounding of %rsi and advance both pointers by the
   count left in %rcx for L(less48bytes).  */
1191 lea 13(%rsi), %rsi
1192 add %rcx, %rsi
1193 add %rcx, %rdi
1194 jmp L(less48bytes)
1195
/* L(shr_14): %rsi was rounded down by 14 bytes to a 16-byte boundary (the
   14 arrives in %edx).  Handles the case where %rcx is below 80 on entry
   by comparing a single 32-byte block; larger counts branch to
   L(shr_14_gobble).  */
1196 .p2align 4
1197L(shr_14):
1198 cmp $80, %rcx
/* lea does not touch the flags, so the jae below still tests the cmp.  */
1199 lea -48(%rcx), %rcx
/* Keep the shift in %eax: L(exit) adds %rax back to %rsi.  */
1200 mov %edx, %eax
1201 jae L(shr_14_gobble)
1202
/* xmm1 = bytes 14..29 of the aligned 32 bytes at (%rsi), compared with the
   16 bytes at (%rdi); xmm2 keeps 16(%rsi) for the next step.  */
1203 movdqa 16(%rsi), %xmm1
1204 movdqa %xmm1, %xmm2
1205 palignr $14, (%rsi), %xmm1
1206 pcmpeqb (%rdi), %xmm1
1207
/* xmm3 = the next 16 bytes of the second buffer, compared with 16(%rdi).  */
1208 movdqa 32(%rsi), %xmm3
1209 palignr $14, %xmm2, %xmm3
1210 pcmpeqb 16(%rdi), %xmm3
1211
/* %edx ends up 0 only when all 32 bytes matched.  */
1212 pand %xmm1, %xmm3
1213 pmovmskb %xmm3, %edx
1214 lea 32(%rdi), %rdi
1215 lea 32(%rsi), %rsi
1216 sub $0xffff, %edx
1217 jnz L(exit)
/* Equal: undo the rounding of %rsi and advance both pointers by the count
   left in %rcx before handing over to L(less48bytes).  */
1218 add $14, %rsi
1219 add %rcx, %rsi
1220 add %rcx, %rdi
1221 jmp L(less48bytes)
1222
/* L(shr_14_gobble): looping form of L(shr_14), entered from there when
   %rcx was at least 80.  %rsi is still rounded down by 14 bytes, so every
   16-byte piece of the second buffer is rebuilt with palignr $14 from two
   aligned loads.  %eax already holds the shift for L(exit).  */
1223 .p2align 4
1224L(shr_14_gobble):
1225 sub $32, %rcx
/* Prime xmm0/xmm3 with the compare results for the first 32 bytes.  */
1226 movdqa 16(%rsi), %xmm0
1227 palignr $14, (%rsi), %xmm0
1228 pcmpeqb (%rdi), %xmm0
1229
1230 movdqa 32(%rsi), %xmm3
1231 palignr $14, 16(%rsi), %xmm3
1232 pcmpeqb 16(%rdi), %xmm3
1233
/* Each pass tests the 32-byte result computed on the previous pass while
   already comparing the next 32 bytes.  sub sets CF on borrow and sbb
   folds it into %edx, so the jz below loops only if all 32 bytes matched
   and %rcx did not borrow; the SSE instructions and lea in between leave
   the flags alone.  */
1234L(shr_14_gobble_loop):
1235 pand %xmm0, %xmm3
1236 sub $32, %rcx
1237 pmovmskb %xmm3, %edx
1238 movdqa %xmm0, %xmm1
1239
1240 movdqa 64(%rsi), %xmm3
1241 palignr $14, 48(%rsi), %xmm3
1242 sbb $0xffff, %edx
1243 movdqa 48(%rsi), %xmm0
1244 palignr $14, 32(%rsi), %xmm0
1245 pcmpeqb 32(%rdi), %xmm0
1246 lea 32(%rsi), %rsi
1247 pcmpeqb 48(%rdi), %xmm3
1248
1249 lea 32(%rdi), %rdi
1250 jz L(shr_14_gobble_loop)
1251 pand %xmm0, %xmm3
1252
/* If %rcx went negative, take back the extra 1 that sbb subtracted and
   the last 32 removed from %rcx.  */
1253 cmp $0, %rcx
1254 jge L(shr_14_gobble_next)
1255 inc %edx
1256 add $32, %rcx
1257L(shr_14_gobble_next):
/* Non-zero %edx: the block just tested has a mismatch (xmm1 holds the
   result for its first 16 bytes).  */
1258 test %edx, %edx
1259 jnz L(exit)
1260
/* Otherwise test the 32 bytes that were already compared ahead.  */
1261 pmovmskb %xmm3, %edx
1262 movdqa %xmm0, %xmm1
1263 lea 32(%rdi), %rdi
1264 lea 32(%rsi), %rsi
1265 sub $0xffff, %edx
1266 jnz L(exit)
1267
/* All equal: undo the rounding of %rsi and advance both pointers by the
   count left in %rcx for L(less48bytes).  */
1268 lea 14(%rsi), %rsi
1269 add %rcx, %rsi
1270 add %rcx, %rdi
1271 jmp L(less48bytes)
1272
/* L(shr_15): %rsi was rounded down by 15 bytes to a 16-byte boundary (the
   15 arrives in %edx).  Handles the case where %rcx is below 80 on entry
   by comparing a single 32-byte block; larger counts branch to
   L(shr_15_gobble).  */
1273 .p2align 4
1274L(shr_15):
1275 cmp $80, %rcx
/* lea does not touch the flags, so the jae below still tests the cmp.  */
1276 lea -48(%rcx), %rcx
/* Keep the shift in %eax: L(exit) adds %rax back to %rsi.  */
1277 mov %edx, %eax
1278 jae L(shr_15_gobble)
1279
/* xmm1 = bytes 15..30 of the aligned 32 bytes at (%rsi), compared with the
   16 bytes at (%rdi); xmm2 keeps 16(%rsi) for the next step.  */
1280 movdqa 16(%rsi), %xmm1
1281 movdqa %xmm1, %xmm2
1282 palignr $15, (%rsi), %xmm1
1283 pcmpeqb (%rdi), %xmm1
1284
/* xmm3 = the next 16 bytes of the second buffer, compared with 16(%rdi).  */
1285 movdqa 32(%rsi), %xmm3
1286 palignr $15, %xmm2, %xmm3
1287 pcmpeqb 16(%rdi), %xmm3
1288
/* %edx ends up 0 only when all 32 bytes matched.  */
1289 pand %xmm1, %xmm3
1290 pmovmskb %xmm3, %edx
1291 lea 32(%rdi), %rdi
1292 lea 32(%rsi), %rsi
1293 sub $0xffff, %edx
1294 jnz L(exit)
/* Equal: undo the rounding of %rsi and advance both pointers by the count
   left in %rcx before handing over to L(less48bytes).  */
1295 add $15, %rsi
1296 add %rcx, %rsi
1297 add %rcx, %rdi
1298 jmp L(less48bytes)
1299
/* L(shr_15_gobble): looping form of L(shr_15), entered from there when
   %rcx was at least 80.  %rsi is still rounded down by 15 bytes, so every
   16-byte piece of the second buffer is rebuilt with palignr $15 from two
   aligned loads.  %eax already holds the shift for L(exit).  */
1300 .p2align 4
1301L(shr_15_gobble):
1302 sub $32, %rcx
/* Prime xmm0/xmm3 with the compare results for the first 32 bytes.  */
1303 movdqa 16(%rsi), %xmm0
1304 palignr $15, (%rsi), %xmm0
1305 pcmpeqb (%rdi), %xmm0
1306
1307 movdqa 32(%rsi), %xmm3
1308 palignr $15, 16(%rsi), %xmm3
1309 pcmpeqb 16(%rdi), %xmm3
1310
/* Each pass tests the 32-byte result computed on the previous pass while
   already comparing the next 32 bytes.  sub sets CF on borrow and sbb
   folds it into %edx, so the jz below loops only if all 32 bytes matched
   and %rcx did not borrow; the SSE instructions and lea in between leave
   the flags alone.  */
1311L(shr_15_gobble_loop):
1312 pand %xmm0, %xmm3
1313 sub $32, %rcx
1314 pmovmskb %xmm3, %edx
1315 movdqa %xmm0, %xmm1
1316
1317 movdqa 64(%rsi), %xmm3
1318 palignr $15, 48(%rsi), %xmm3
1319 sbb $0xffff, %edx
1320 movdqa 48(%rsi), %xmm0
1321 palignr $15, 32(%rsi), %xmm0
1322 pcmpeqb 32(%rdi), %xmm0
1323 lea 32(%rsi), %rsi
1324 pcmpeqb 48(%rdi), %xmm3
1325
1326 lea 32(%rdi), %rdi
1327 jz L(shr_15_gobble_loop)
1328 pand %xmm0, %xmm3
1329
/* If %rcx went negative, take back the extra 1 that sbb subtracted and
   the last 32 removed from %rcx.  */
1330 cmp $0, %rcx
1331 jge L(shr_15_gobble_next)
1332 inc %edx
1333 add $32, %rcx
1334L(shr_15_gobble_next):
/* Non-zero %edx: the block just tested has a mismatch (xmm1 holds the
   result for its first 16 bytes).  */
1335 test %edx, %edx
1336 jnz L(exit)
1337
/* Otherwise test the 32 bytes that were already compared ahead.  */
1338 pmovmskb %xmm3, %edx
1339 movdqa %xmm0, %xmm1
1340 lea 32(%rdi), %rdi
1341 lea 32(%rsi), %rsi
1342 sub $0xffff, %edx
1343 jnz L(exit)
1344
/* All equal: undo the rounding of %rsi and advance both pointers by the
   count left in %rcx for L(less48bytes).  */
1345 lea 15(%rsi), %rsi
1346 add %rcx, %rsi
1347 add %rcx, %rdi
1348 jmp L(less48bytes)
1349# endif
/* L(exit): a 32-byte block compared unequal.  On entry xmm1 holds the
   pcmpeqb result for the first 16 bytes of that block, %edx the combined
   32-byte mask minus 0xffff, %rax the amount %rsi was rounded down by, and
   %rdi/%rsi point 32 bytes past the start of the block.  Falls through
   into L(less16bytes) with the pointers 16 bytes past the 16-byte half
   that holds the first difference.  */
1350 .p2align 4
1351L(exit):
1352 pmovmskb %xmm1, %r8d
1353 sub $0xffff, %r8d
/* Zero: the first half matched, so the difference is in the second half
   and %edx already describes it.  */
1354 jz L(first16bytes)
/* Otherwise step back to the first half and use its mask instead.  */
1355 lea -16(%rsi), %rsi
1356 lea -16(%rdi), %rdi
1357 mov %r8d, %edx
1358L(first16bytes):
/* Undo the rounding of %rsi so it addresses the real second buffer.  */
1359 add %rax, %rsi
/* L(less16bytes): return the difference of the first differing byte of a
   16-byte chunk.  %edx is a pmovmskb equality mask minus 0xffff (non-zero
   on the paths visible here), so its lowest set bit is at the index of the
   first differing byte; %rdi/%rsi point 16 bytes past the start of the
   chunk.  This is the memcmp version; bytes are loaded zero-extended, so
   the result is an unsigned byte difference.  */
1360L(less16bytes):
1361# ifndef USE_AS_WMEMCMP
/* Low byte zero: the difference is in bytes 8..15.  */
1362 test %dl, %dl
1363 jz L(next_24_bytes)
1364
1365 test $0x01, %dl
1366 jnz L(Byte16)
1367
1368 test $0x02, %dl
1369 jnz L(Byte17)
1370
1371 test $0x04, %dl
1372 jnz L(Byte18)
1373
1374 test $0x08, %dl
1375 jnz L(Byte19)
1376
1377 test $0x10, %dl
1378 jnz L(Byte20)
1379
1380 test $0x20, %dl
1381 jnz L(Byte21)
1382
1383 test $0x40, %dl
1384 jnz L(Byte22)
1385
/* Only bit 7 is left: byte index 7 of the chunk.  */
1386 movzbl -9(%rdi), %eax
1387 movzbl -9(%rsi), %edx
1388 sub %edx, %eax
1389 ret
1390
/* L(Byte16) .. L(Byte22): return the difference of the zero-extended
   bytes at offsets -16 .. -10 from %rdi and %rsi.  The number in each
   label is the distance back from the pointers, not a byte index.  */
1391 .p2align 4
1392L(Byte16):
1393 movzbl -16(%rdi), %eax
1394 movzbl -16(%rsi), %edx
1395 sub %edx, %eax
1396 ret
1397
1398 .p2align 4
1399L(Byte17):
1400 movzbl -15(%rdi), %eax
1401 movzbl -15(%rsi), %edx
1402 sub %edx, %eax
1403 ret
1404
1405 .p2align 4
1406L(Byte18):
1407 movzbl -14(%rdi), %eax
1408 movzbl -14(%rsi), %edx
1409 sub %edx, %eax
1410 ret
1411
1412 .p2align 4
1413L(Byte19):
1414 movzbl -13(%rdi), %eax
1415 movzbl -13(%rsi), %edx
1416 sub %edx, %eax
1417 ret
1418
1419 .p2align 4
1420L(Byte20):
1421 movzbl -12(%rdi), %eax
1422 movzbl -12(%rsi), %edx
1423 sub %edx, %eax
1424 ret
1425
1426 .p2align 4
1427L(Byte21):
1428 movzbl -11(%rdi), %eax
1429 movzbl -11(%rsi), %edx
1430 sub %edx, %eax
1431 ret
1432
1433 .p2align 4
1434L(Byte22):
1435 movzbl -10(%rdi), %eax
1436 movzbl -10(%rsi), %edx
1437 sub %edx, %eax
1438 ret
1439
/* L(next_24_bytes): the low byte of the mask in %edx was zero, so the
   first difference is in bytes 8..15 of the chunk.  Both pointers are
   moved forward by 8 so the L(Byte16)..L(Byte22) handlers can be reused
   with bits 8..15 (%dh) selecting the byte.  */
1440 .p2align 4
1441L(next_24_bytes):
1442 lea 8(%rdi), %rdi
1443 lea 8(%rsi), %rsi
1444 test $0x01, %dh
1445 jnz L(Byte16)
1446
1447 test $0x02, %dh
1448 jnz L(Byte17)
1449
1450 test $0x04, %dh
1451 jnz L(Byte18)
1452
1453 test $0x08, %dh
1454 jnz L(Byte19)
1455
1456 test $0x10, %dh
1457 jnz L(Byte20)
1458
1459 test $0x20, %dh
1460 jnz L(Byte21)
1461
1462 test $0x40, %dh
1463 jnz L(Byte22)
1464
/* None of bits 8..14 set: take the last byte of the chunk.  */
1465 movzbl -9(%rdi), %eax
1466 movzbl -9(%rsi), %edx
1467 sub %edx, %eax
1468 ret
1469# else
1470/* special for wmemcmp */
/* wmemcmp version of L(less16bytes): pick the 4-byte element of the
   16-byte chunk that holds the first difference, using the mask in %edx
   (%dl covers bytes 0..7, %dh bytes 8..15), and hand it to L(find_diff)
   with the element from %rdi in %eax and the flags from the cmp.
   L(find_diff) is defined outside this excerpt.
   NOTE(review): each ret after a jne is only reached if the two elements
   compare equal, which is not expected when the mask reported a
   difference there.  */
1471 xor %eax, %eax
1472 test %dl, %dl
1473 jz L(next_two_double_words)
/* Low nibble non-zero: the difference is in element 0, else element 1.  */
1474 and $15, %dl
1475 jz L(second_double_word)
1476 mov -16(%rdi), %eax
1477 cmp -16(%rsi), %eax
1478 jne L(find_diff)
1479 ret
1480
1481 .p2align 4
1482L(second_double_word):
1483 mov -12(%rdi), %eax
1484 cmp -12(%rsi), %eax
1485 jne L(find_diff)
1486 ret
1487
1488 .p2align 4
1489L(next_two_double_words):
/* Same test on the high byte: element 2, else element 3.  */
1490 and $15, %dh
1491 jz L(fourth_double_word)
1492 mov -8(%rdi), %eax
1493 cmp -8(%rsi), %eax
1494 jne L(find_diff)
1495 ret
1496
1497 .p2align 4
1498L(fourth_double_word):
1499 mov -4(%rdi), %eax
1500 cmp -4(%rsi), %eax
1501 jne L(find_diff)
1502 ret
1503# endif
1504
/* L(less48bytes): tail dispatcher.  %ecx is the number of bytes still to
   compare and %rdi/%rsi have already been advanced by that count.  This
   level handles counts 0..7 and passes larger counts to L(more8bytes).
   The L(Nbytes) targets not shown in this excerpt are defined elsewhere in
   the file.  */
1505 .p2align 4
1506L(less48bytes):
1507 cmp $8, %ecx
1508 jae L(more8bytes)
1509 cmp $0, %ecx
1510 je L(0bytes)
1511# ifndef USE_AS_WMEMCMP
1512 cmp $1, %ecx
1513 je L(1bytes)
1514 cmp $2, %ecx
1515 je L(2bytes)
1516 cmp $3, %ecx
1517 je L(3bytes)
1518 cmp $4, %ecx
1519 je L(4bytes)
1520 cmp $5, %ecx
1521 je L(5bytes)
1522 cmp $6, %ecx
1523 je L(6bytes)
1524 jmp L(7bytes)
1525# else
/* wmemcmp: every non-zero count below 8 is treated as 4.  */
1526 jmp L(4bytes)
1527# endif
1528
/* L(more8bytes): tail dispatch for %ecx >= 8.  Counts 8..15 are handled
   here; larger counts go on to L(more16bytes).  */
1529 .p2align 4
1530L(more8bytes):
1531 cmp $16, %ecx
1532 jae L(more16bytes)
1533 cmp $8, %ecx
1534 je L(8bytes)
1535# ifndef USE_AS_WMEMCMP
1536 cmp $9, %ecx
1537 je L(9bytes)
1538 cmp $10, %ecx
1539 je L(10bytes)
1540 cmp $11, %ecx
1541 je L(11bytes)
1542 cmp $12, %ecx
1543 je L(12bytes)
1544 cmp $13, %ecx
1545 je L(13bytes)
1546 cmp $14, %ecx
1547 je L(14bytes)
1548 jmp L(15bytes)
1549# else
/* wmemcmp: every count in this range other than 8 is treated as 12.  */
1550 jmp L(12bytes)
1551# endif
1552
/* L(more16bytes): tail dispatch for %ecx >= 16.  Counts 16..23 are
   handled here; larger counts go on to L(more24bytes).  */
1553 .p2align 4
1554L(more16bytes):
1555 cmp $24, %ecx
1556 jae L(more24bytes)
1557 cmp $16, %ecx
1558 je L(16bytes)
1559# ifndef USE_AS_WMEMCMP
1560 cmp $17, %ecx
1561 je L(17bytes)
1562 cmp $18, %ecx
1563 je L(18bytes)
1564 cmp $19, %ecx
1565 je L(19bytes)
1566 cmp $20, %ecx
1567 je L(20bytes)
1568 cmp $21, %ecx
1569 je L(21bytes)
1570 cmp $22, %ecx
1571 je L(22bytes)
1572 jmp L(23bytes)
1573# else
/* wmemcmp: every count in this range other than 16 is treated as 20.  */
1574 jmp L(20bytes)
1575# endif
1576
/* L(more24bytes): tail dispatch for %ecx >= 24.  Counts 24..31 are
   handled here; larger counts go on to L(more32bytes).  */
1577 .p2align 4
1578L(more24bytes):
1579 cmp $32, %ecx
1580 jae L(more32bytes)
1581 cmp $24, %ecx
1582 je L(24bytes)
1583# ifndef USE_AS_WMEMCMP
1584 cmp $25, %ecx
1585 je L(25bytes)
1586 cmp $26, %ecx
1587 je L(26bytes)
1588 cmp $27, %ecx
1589 je L(27bytes)
1590 cmp $28, %ecx
1591 je L(28bytes)
1592 cmp $29, %ecx
1593 je L(29bytes)
1594 cmp $30, %ecx
1595 je L(30bytes)
1596 jmp L(31bytes)
1597# else
/* wmemcmp: every count in this range other than 24 is treated as 28.  */
1598 jmp L(28bytes)
1599# endif
1600
/* Tail dispatcher, fifth stage: reached from L(more24bytes) with
   %ecx >= 32.  Resolves lengths 32..39 and forwards 40 and above to
   L(more40bytes).  */
1601 .p2align 4
1602L(more32bytes):
1603 cmp $40, %ecx
1604 jae L(more40bytes)
1605 cmp $32, %ecx
1606 je L(32bytes)
1607# ifndef USE_AS_WMEMCMP
 /* memcmp: 33..38 are tested explicitly, 39 is what remains.  */
1608 cmp $33, %ecx
1609 je L(33bytes)
1610 cmp $34, %ecx
1611 je L(34bytes)
1612 cmp $35, %ecx
1613 je L(35bytes)
1614 cmp $36, %ecx
1615 je L(36bytes)
1616 cmp $37, %ecx
1617 je L(37bytes)
1618 cmp $38, %ecx
1619 je L(38bytes)
1620 jmp L(39bytes)
1621# else
 /* wmemcmp: with a length that is a multiple of 4 and not 32, only 36
    is left in this range.  */
1622 jmp L(36bytes)
1623# endif
1624
/* Tail dispatcher, last stage: reached from L(more32bytes) with
   %ecx >= 40.  No upper bound is checked here: in the memcmp build
   every value other than 40..46 ends up at L(47bytes), so the length
   must be below 48.  The entry path at the top of the function
   guarantees that; other paths into L(less48bytes) are outside this
   part of the file.
   The `# ifndef' opened below is not closed in this block; it also
   guards the memcmp variant of the L(44bytes) chain that follows.  In
   the wmemcmp build a length other than 40 therefore falls straight
   through into L(44bytes) of the `# else' branch.  */
1625 .p2align 4
1626L(more40bytes):
1627 cmp $40, %ecx
1628 je L(40bytes)
1629# ifndef USE_AS_WMEMCMP
 /* memcmp: 41..46 are tested explicitly, 47 is what remains.  */
1630 cmp $41, %ecx
1631 je L(41bytes)
1632 cmp $42, %ecx
1633 je L(42bytes)
1634 cmp $43, %ecx
1635 je L(43bytes)
1636 cmp $44, %ecx
1637 je L(44bytes)
1638 cmp $45, %ecx
1639 je L(45bytes)
1640 cmp $46, %ecx
1641 je L(46bytes)
1642 jmp L(47bytes)
1643
/* memcmp build: compare the last 44, 40, ..., 4 bytes of the two
   buffers, four bytes per step.  Each L(<n>bytes) label compares the
   4 bytes located n bytes before %rdi and %rsi and, when they are
   equal, falls through to L(<n-4>bytes); L(0bytes) returns 0.
   Both words are loaded into registers (%eax from the %rdi side,
   %ecx from the %rsi side, which overwrites the length that the
   dispatcher tested in %ecx) because L(find_diff) in this build
   re-compares %eax and %ecx byte by byte.  */
1644 .p2align 4
1645L(44bytes):
1646 movl -44(%rdi), %eax
1647 movl -44(%rsi), %ecx
1648 cmp %ecx, %eax
1649 jne L(find_diff)
1650L(40bytes):
1651 movl -40(%rdi), %eax
1652 movl -40(%rsi), %ecx
1653 cmp %ecx, %eax
1654 jne L(find_diff)
1655L(36bytes):
1656 movl -36(%rdi), %eax
1657 movl -36(%rsi), %ecx
1658 cmp %ecx, %eax
1659 jne L(find_diff)
1660L(32bytes):
1661 movl -32(%rdi), %eax
1662 movl -32(%rsi), %ecx
1663 cmp %ecx, %eax
1664 jne L(find_diff)
1665L(28bytes):
1666 movl -28(%rdi), %eax
1667 movl -28(%rsi), %ecx
1668 cmp %ecx, %eax
1669 jne L(find_diff)
1670L(24bytes):
1671 movl -24(%rdi), %eax
1672 movl -24(%rsi), %ecx
1673 cmp %ecx, %eax
1674 jne L(find_diff)
1675L(20bytes):
1676 movl -20(%rdi), %eax
1677 movl -20(%rsi), %ecx
1678 cmp %ecx, %eax
1679 jne L(find_diff)
1680L(16bytes):
1681 movl -16(%rdi), %eax
1682 movl -16(%rsi), %ecx
1683 cmp %ecx, %eax
1684 jne L(find_diff)
1685L(12bytes):
1686 movl -12(%rdi), %eax
1687 movl -12(%rsi), %ecx
1688 cmp %ecx, %eax
1689 jne L(find_diff)
1690L(8bytes):
1691 movl -8(%rdi), %eax
1692 movl -8(%rsi), %ecx
1693 cmp %ecx, %eax
1694 jne L(find_diff)
1695L(4bytes):
1696 movl -4(%rdi), %eax
1697 movl -4(%rsi), %ecx
1698 cmp %ecx, %eax
1699 jne L(find_diff)
 /* Everything compared equal (or the length was 0): return 0.  */
1700L(0bytes):
1701 xor %eax, %eax
1702 ret
1703# else
/* wmemcmp build: compare the last 11, 10, ..., 1 four-byte elements
   of the two buffers.  Each L(<n>bytes) label compares the element
   located n bytes before %rdi and %rsi and, when they are equal,
   falls through to L(<n-4>bytes); L(0bytes) returns 0.
   The second operand is compared straight from memory: L(find_diff)
   in this build only looks at the flags left by the cmp, so nothing
   but %eax has to be loaded and the jump must directly follow the
   cmp.
   L(more40bytes) also falls through into L(44bytes) in this build.  */
1704 .p2align 4
1705L(44bytes):
1706 movl -44(%rdi), %eax
1707 cmp -44(%rsi), %eax
1708 jne L(find_diff)
1709L(40bytes):
1710 movl -40(%rdi), %eax
1711 cmp -40(%rsi), %eax
1712 jne L(find_diff)
1713L(36bytes):
1714 movl -36(%rdi), %eax
1715 cmp -36(%rsi), %eax
1716 jne L(find_diff)
1717L(32bytes):
1718 movl -32(%rdi), %eax
1719 cmp -32(%rsi), %eax
1720 jne L(find_diff)
1721L(28bytes):
1722 movl -28(%rdi), %eax
1723 cmp -28(%rsi), %eax
1724 jne L(find_diff)
1725L(24bytes):
1726 movl -24(%rdi), %eax
1727 cmp -24(%rsi), %eax
1728 jne L(find_diff)
1729L(20bytes):
1730 movl -20(%rdi), %eax
1731 cmp -20(%rsi), %eax
1732 jne L(find_diff)
1733L(16bytes):
1734 movl -16(%rdi), %eax
1735 cmp -16(%rsi), %eax
1736 jne L(find_diff)
1737L(12bytes):
1738 movl -12(%rdi), %eax
1739 cmp -12(%rsi), %eax
1740 jne L(find_diff)
1741L(8bytes):
1742 movl -8(%rdi), %eax
1743 cmp -8(%rsi), %eax
1744 jne L(find_diff)
1745L(4bytes):
1746 movl -4(%rdi), %eax
1747 cmp -4(%rsi), %eax
1748 jne L(find_diff)
 /* Everything compared equal (or the length was 0): return 0.  */
1749L(0bytes):
1750 xor %eax, %eax
1751 ret
1752# endif
1753
1754# ifndef USE_AS_WMEMCMP
/* memcmp only: chain for lengths of the form 4k + 1 (45, 41, ..., 5,
   1).  Each L(<n>bytes) label compares the 4 bytes located n bytes
   before %rdi and %rsi and falls through to L(<n-4>bytes) when they
   are equal.  The words are loaded into %eax (first buffer) and %ecx
   (second buffer) because L(find_diff) re-compares those two
   registers.  The one byte left over is handled by L(1bytes).  */
1755 .p2align 4
1756L(45bytes):
1757 movl -45(%rdi), %eax
1758 movl -45(%rsi), %ecx
1759 cmp %ecx, %eax
1760 jne L(find_diff)
1761L(41bytes):
1762 movl -41(%rdi), %eax
1763 movl -41(%rsi), %ecx
1764 cmp %ecx, %eax
1765 jne L(find_diff)
1766L(37bytes):
1767 movl -37(%rdi), %eax
1768 movl -37(%rsi), %ecx
1769 cmp %ecx, %eax
1770 jne L(find_diff)
1771L(33bytes):
1772 movl -33(%rdi), %eax
1773 movl -33(%rsi), %ecx
1774 cmp %ecx, %eax
1775 jne L(find_diff)
1776L(29bytes):
1777 movl -29(%rdi), %eax
1778 movl -29(%rsi), %ecx
1779 cmp %ecx, %eax
1780 jne L(find_diff)
1781L(25bytes):
1782 movl -25(%rdi), %eax
1783 movl -25(%rsi), %ecx
1784 cmp %ecx, %eax
1785 jne L(find_diff)
1786L(21bytes):
1787 movl -21(%rdi), %eax
1788 movl -21(%rsi), %ecx
1789 cmp %ecx, %eax
1790 jne L(find_diff)
1791L(17bytes):
1792 movl -17(%rdi), %eax
1793 movl -17(%rsi), %ecx
1794 cmp %ecx, %eax
1795 jne L(find_diff)
1796L(13bytes):
1797 movl -13(%rdi), %eax
1798 movl -13(%rsi), %ecx
1799 cmp %ecx, %eax
1800 jne L(find_diff)
1801L(9bytes):
1802 movl -9(%rdi), %eax
1803 movl -9(%rsi), %ecx
1804 cmp %ecx, %eax
1805 jne L(find_diff)
1806L(5bytes):
1807 movl -5(%rdi), %eax
1808 movl -5(%rsi), %ecx
1809 cmp %ecx, %eax
1810 jne L(find_diff)
 /* Last byte: compare it directly against memory.  On a mismatch
    L(set) turns the carry flag of this cmpb into -1 or 1.  */
1811L(1bytes):
1812 movzbl -1(%rdi), %eax
1813 cmpb -1(%rsi), %al
1814 jne L(set)
1815 xor %eax, %eax
1816 ret
1817
/* memcmp only: chain for lengths of the form 4k + 2 (46, 42, ..., 6,
   2).  Each L(<n>bytes) label compares the 4 bytes located n bytes
   before %rdi and %rsi and falls through to L(<n-4>bytes) when they
   are equal.  The words are loaded into %eax (first buffer) and %ecx
   (second buffer) because L(find_diff) re-compares those two
   registers.  The two bytes left over are handled by L(2bytes).  */
1818 .p2align 4
1819L(46bytes):
1820 movl -46(%rdi), %eax
1821 movl -46(%rsi), %ecx
1822 cmp %ecx, %eax
1823 jne L(find_diff)
1824L(42bytes):
1825 movl -42(%rdi), %eax
1826 movl -42(%rsi), %ecx
1827 cmp %ecx, %eax
1828 jne L(find_diff)
1829L(38bytes):
1830 movl -38(%rdi), %eax
1831 movl -38(%rsi), %ecx
1832 cmp %ecx, %eax
1833 jne L(find_diff)
1834L(34bytes):
1835 movl -34(%rdi), %eax
1836 movl -34(%rsi), %ecx
1837 cmp %ecx, %eax
1838 jne L(find_diff)
1839L(30bytes):
1840 movl -30(%rdi), %eax
1841 movl -30(%rsi), %ecx
1842 cmp %ecx, %eax
1843 jne L(find_diff)
1844L(26bytes):
1845 movl -26(%rdi), %eax
1846 movl -26(%rsi), %ecx
1847 cmp %ecx, %eax
1848 jne L(find_diff)
1849L(22bytes):
1850 movl -22(%rdi), %eax
1851 movl -22(%rsi), %ecx
1852 cmp %ecx, %eax
1853 jne L(find_diff)
1854L(18bytes):
1855 movl -18(%rdi), %eax
1856 movl -18(%rsi), %ecx
1857 cmp %ecx, %eax
1858 jne L(find_diff)
1859L(14bytes):
1860 movl -14(%rdi), %eax
1861 movl -14(%rsi), %ecx
1862 cmp %ecx, %eax
1863 jne L(find_diff)
1864L(10bytes):
1865 movl -10(%rdi), %eax
1866 movl -10(%rsi), %ecx
1867 cmp %ecx, %eax
1868 jne L(find_diff)
1869L(6bytes):
1870 movl -6(%rdi), %eax
1871 movl -6(%rsi), %ecx
1872 cmp %ecx, %eax
1873 jne L(find_diff)
 /* Last two bytes, loaded zero-extended as one 16-bit word each.  The
    low register byte is the lower-addressed one, so it is compared
    first; if it matches, the full compare is decided by the second
    byte.  L(set) turns the carry flag of the deciding compare into
    -1 or 1.  */
1874L(2bytes):
1875 movzwl -2(%rdi), %eax
1876 movzwl -2(%rsi), %ecx
1877 cmpb %cl, %al
1878 jne L(set)
1879 cmp %ecx, %eax
1880 jne L(set)
1881 xor %eax, %eax
1882 ret
1883
/* memcmp only: chain for lengths of the form 4k + 3 (47, 43, ..., 7,
   3).  Each L(<n>bytes) label compares the 4 bytes located n bytes
   before %rdi and %rsi and falls through to L(<n-4>bytes) when they
   are equal.  The words are loaded into %eax (first buffer) and %ecx
   (second buffer) because L(find_diff) re-compares those two
   registers.  The three bytes left over are handled by L(3bytes).  */
1884 .p2align 4
1885L(47bytes):
1886 movl -47(%rdi), %eax
1887 movl -47(%rsi), %ecx
1888 cmp %ecx, %eax
1889 jne L(find_diff)
1890L(43bytes):
1891 movl -43(%rdi), %eax
1892 movl -43(%rsi), %ecx
1893 cmp %ecx, %eax
1894 jne L(find_diff)
1895L(39bytes):
1896 movl -39(%rdi), %eax
1897 movl -39(%rsi), %ecx
1898 cmp %ecx, %eax
1899 jne L(find_diff)
1900L(35bytes):
1901 movl -35(%rdi), %eax
1902 movl -35(%rsi), %ecx
1903 cmp %ecx, %eax
1904 jne L(find_diff)
1905L(31bytes):
1906 movl -31(%rdi), %eax
1907 movl -31(%rsi), %ecx
1908 cmp %ecx, %eax
1909 jne L(find_diff)
1910L(27bytes):
1911 movl -27(%rdi), %eax
1912 movl -27(%rsi), %ecx
1913 cmp %ecx, %eax
1914 jne L(find_diff)
1915L(23bytes):
1916 movl -23(%rdi), %eax
1917 movl -23(%rsi), %ecx
1918 cmp %ecx, %eax
1919 jne L(find_diff)
1920L(19bytes):
1921 movl -19(%rdi), %eax
1922 movl -19(%rsi), %ecx
1923 cmp %ecx, %eax
1924 jne L(find_diff)
1925L(15bytes):
1926 movl -15(%rdi), %eax
1927 movl -15(%rsi), %ecx
1928 cmp %ecx, %eax
1929 jne L(find_diff)
1930L(11bytes):
1931 movl -11(%rdi), %eax
1932 movl -11(%rsi), %ecx
1933 cmp %ecx, %eax
1934 jne L(find_diff)
1935L(7bytes):
1936 movl -7(%rdi), %eax
1937 movl -7(%rsi), %ecx
1938 cmp %ecx, %eax
1939 jne L(find_diff)
 /* Last three bytes: the first two are loaded zero-extended as one
    16-bit word each and compared low (lower-addressed) byte first,
    then as a whole; the final byte is compared directly against
    memory.  L(set) turns the carry flag of the deciding compare into
    -1 or 1.  */
1940L(3bytes):
1941 movzwl -3(%rdi), %eax
1942 movzwl -3(%rsi), %ecx
1943 cmpb %cl, %al
1944 jne L(set)
1945 cmp %ecx, %eax
1946 jne L(set)
1947 movzbl -1(%rdi), %eax
1948 cmpb -1(%rsi), %al
1949 jne L(set)
1950 xor %eax, %eax
1951 ret
1952
/* memcmp build.
   L(find_diff): entered with two 4-byte words that differ, %eax from
   the first buffer and %ecx from the second.  Locates the
   lowest-addressed differing byte, which is the lowest-order differing
   register byte because the words were loaded little-endian, and
   leaves the flags of that comparison for L(set).  The flags on entry
   are not used; the compare is redone here.
   L(set): entered with the flags of a compare that subtracted the
   second buffer's value from the first buffer's.  Returns -1 in %eax
   if the carry flag is set (first below second, unsigned) and 1
   otherwise.  */
1953 .p2align 4
1954L(find_diff):
 /* Byte 0.  */
1955 cmpb %cl, %al
1956 jne L(set)
 /* Byte 0 is equal here, so this 16-bit compare is decided by
    byte 1.  */
1957 cmpw %cx, %ax
1958 jne L(set)
 /* Bring bytes 2 and 3 down into the low half and test byte 2.  */
1959 shr $16, %eax
1960 shr $16, %ecx
1961 cmpb %cl, %al
1962 jne L(set)
1963
1964/* We get here only if we already know there is a difference, so it
1965must be in byte 3; this compare just sets the carry for L(set). */
1966
1967 cmp %ecx, %eax
1968L(set):
 /* The first sbb yields -CF and leaves CF as it was; the second then
    computes %eax + 1 - CF: -1 if CF was set, 1 if it was clear.  */
1969 sbb %eax, %eax
1970 sbb $-1, %eax
1971 ret
1972# else
1973
1974/* for wmemcmp */
/* L(find_diff): entered directly after a `cmp' of two differing
   4-byte elements (second buffer's element subtracted from the first
   buffer's), with the flags of that cmp still live.  The mov below
   does not change the flags, so the jg tests that cmp as a SIGNED
   comparison.  Returns 1 in %eax if the first element is greater,
   -1 otherwise.  */
1975 .p2align 4
1976L(find_diff):
1977 mov $1, %eax
1978 jg L(find_diff_bigger)
 /* Not greater, and the caller only jumps here on inequality, so the
    first element is smaller: turn the 1 into -1.  */
1979 neg %eax
1980 ret
1981
1982 .p2align 4
 /* First element is greater: %eax already holds 1.  */
1983L(find_diff_bigger):
1984 ret
1985# endif
1986
/* Common "buffers are equal" exit: return 0 in %eax.  The wmemcmp
   entry code jumps here when the scaled length is zero; any other
   users are outside this part of the file.  */
1987 .p2align 4
1988L(equal):
1989 xor %eax, %eax
1990 ret
1991
1992END (MEMCMP)
1993#endif
1994