/* memcmp with SSE2
   Copyright (C) 2009-2016 Free Software Foundation, Inc.
   Contributed by Intel Corporation.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

	.text
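/* int memcmp (const void *s1, const void *s2, size_t n)

   On entry (System V AMD64 ABI): %rdi = s1, %rsi = s2, %rdx = n.
   For n >= 2 the code below rewrites %rsi to hold s2 - s1, so that
   (%rdi, %rsi) always addresses the s2 byte corresponding to (%rdi)
   and only %rdi has to be advanced.  */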
ENTRY (memcmp)
#ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
#endif
	test	%RDX_LP, %RDX_LP
	jz	L(finz)
	cmpq	$1, %rdx
	jbe	L(finr1b)
	subq	%rdi, %rsi
	movq	%rdx, %r10
	cmpq	$32, %r10
	jae	L(gt32)
	/* Handle small chunks and last block of less than 32 bytes.  */
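	/* %r10 holds the remaining count, which is below 32.  Bits 0-3 of
	   it select 1-, 2-, 4- and 8-byte compares in turn; if bytes still
	   remain after that, exactly 16 are left and L(s16b) finishes them
	   with a pair of unaligned 16-byte loads.  */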
L(small):
	testq	$1, %r10
	jz	L(s2b)
	movzbl	(%rdi), %eax
	movzbl	(%rdi, %rsi), %edx
	subq	$1, %r10
	je	L(finz1)
	addq	$1, %rdi
	subl	%edx, %eax
	jnz	L(exit)
L(s2b):
	testq	$2, %r10
	jz	L(s4b)
	movzwl	(%rdi), %eax
	movzwl	(%rdi, %rsi), %edx
	subq	$2, %r10
	je	L(fin2_7)
	addq	$2, %rdi
	cmpl	%edx, %eax
	jnz	L(fin2_7)
L(s4b):
	testq	$4, %r10
	jz	L(s8b)
	movl	(%rdi), %eax
	movl	(%rdi, %rsi), %edx
	subq	$4, %r10
	je	L(fin2_7)
	addq	$4, %rdi
	cmpl	%edx, %eax
	jnz	L(fin2_7)
L(s8b):
	testq	$8, %r10
	jz	L(s16b)
	movq	(%rdi), %rax
	movq	(%rdi, %rsi), %rdx
	subq	$8, %r10
	je	L(fin2_7)
	addq	$8, %rdi
	cmpq	%rdx, %rax
	jnz	L(fin2_7)
L(s16b):
	movdqu	(%rdi), %xmm1
	movdqu	(%rdi, %rsi), %xmm0
	pcmpeqb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	xorl	%eax, %eax
	subl	$0xffff, %edx
	jz	L(finz)
	bsfl	%edx, %ecx
	leaq	(%rdi, %rcx), %rcx
	movzbl	(%rcx), %eax
	movzbl	(%rsi, %rcx), %edx
	jmp	L(finz1)

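	/* n == 1 branches straight here from the entry code, before %rsi
	   has been turned into an offset, so %rsi still points at s2.
	   L(finz1) and L(exit) are the shared return paths: the result is
	   the difference of the two bytes in %eax and %edx, or the nonzero
	   value already in %eax.  */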
	.p2align 4,, 4
L(finr1b):
	movzbl	(%rdi), %eax
	movzbl	(%rsi), %edx
L(finz1):
	subl	%edx, %eax
L(exit):
	ret

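	/* %rax and %rdx hold the last 2-, 4- or 8-byte chunks compared.
	   If they are equal, everything before them was too, and the
	   result is 0.  Otherwise %rax - %rdx keeps its lowest set bit at
	   the first differing bit; bsf finds that bit, the sar/sal pair
	   rounds it down to a byte boundary, and shifting both chunks
	   right by that amount exposes the first differing byte (little
	   endian) in the low byte of each register.  */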
	.p2align 4,, 4
L(fin2_7):
	cmpq	%rdx, %rax
	jz	L(finz)
	movq	%rax, %r11
	subq	%rdx, %r11
	bsfq	%r11, %rcx
	sarq	$3, %rcx
	salq	$3, %rcx
	sarq	%cl, %rax
	movzbl	%al, %eax
	sarq	%cl, %rdx
	movzbl	%dl, %edx
	subl	%edx, %eax
	ret

	.p2align 4,, 4
L(finz):
	xorl	%eax, %eax
	ret

	/* For blocks bigger than 32 bytes:
	   1. Advance one of the pointers to be 16-byte aligned.
	   2. Treat the case of both pointers aligned to 16 bytes
	      separately to avoid movdqu.
	   3. Handle blocks of more than 64 consecutive bytes with
	      unrolling to reduce branches.
	   4. At least one pointer is 16-byte aligned, so the memory
	      form of pcmpeqb can be used.
	 */
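	/* Register use below, with a rough C-level sketch of the flow
	   (the variable names are only for illustration):

	       unsigned char *p   = s1;       p is %rdi, the only pointer
	                                      that is ever advanced
	       size_t         d   = s2 - s1;  d is %rsi, a fixed offset
	       unsigned char *end = s1 + n;   end is %r11

	   If s1 is misaligned, one unaligned 16-byte compare is done and
	   p is rounded up to a 16-byte boundary.  When (d & 15) == 0 both
	   inputs are then 16-byte aligned and the L(ATR) loops use movdqa;
	   otherwise the L(A32)/L(A64) loops load the s2 side with movdqu
	   and use p as an aligned memory operand of pcmpeqb.  The 64- and
	   32-byte loops run while p is below end & -64 and end & -32
	   (kept in %r10); the final tail of less than 32 bytes goes back
	   through L(small).  */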
	.p2align 4,, 4
L(gt32):
	movq	%rdx, %r11
	addq	%rdi, %r11
	movq	%rdi, %r8

	andq	$15, %r8
	jz	L(16am)
	/* Both pointers may be misaligned.  */
	movdqu	(%rdi), %xmm1
	movdqu	(%rdi, %rsi), %xmm0
	pcmpeqb	%xmm0, %xmm1
	pmovmskb %xmm1, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	neg	%r8
	leaq	16(%rdi, %r8), %rdi
L(16am):
	/* Handle two 16B aligned pointers separately.  */
	testq	$15, %rsi
	jz	L(ATR)
	testq	$16, %rdi
	jz	L(A32)
	movdqu	(%rdi, %rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi
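	/* %rdi is now 32-byte aligned.  Round the end down to a 32-byte
	   boundary into %r10; once %rdi has reached it, fewer than 32
	   bytes are left and the small-block code finishes the job.  */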
L(A32):
	movq	%r11, %r10
	andq	$-32, %r10
	cmpq	%r10, %rdi
	jae	L(mt16)
	/* Pre-unroll to be ready for unrolled 64B loop.  */
	testq	$32, %rdi
	jz	L(A64)
	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

L(A64):
	movq	%r11, %r10
	andq	$-64, %r10
	cmpq	%r10, %rdi
	jae	L(mt32)

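	/* Main loop: %rdi is 64-byte aligned and %r10 = end & -64, so each
	   iteration compares a full 64 bytes in four 16-byte steps.  */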
L(A64main):
	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	cmpq	%rdi, %r10
	jne	L(A64main)

L(mt32):
	movq	%r11, %r10
	andq	$-32, %r10
	cmpq	%r10, %rdi
	jae	L(mt16)

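	/* Compare 32 bytes per iteration until %rdi reaches the 32-byte
	   aligned end kept in %r10.  */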
L(A32main):
	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqu	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	cmpq	%rdi, %r10
	jne	L(A32main)
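	/* Fewer than 32 bytes remain; %r11 - %rdi is the leftover count.
	   Hand it to the small-block code at the top.  */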
L(mt16):
	subq	%rdi, %r11
	je	L(finz)
	movq	%r11, %r10
	jmp	L(small)

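	/* A 16-byte compare found a mismatch.  %edx holds the pcmpeqb byte
	   mask minus 0xffff; negation preserves the lowest set bit, so bsf
	   still yields the index of the first differing byte.  Adding %rdi
	   to %rsi rebuilds the current s2 position so both bytes can be
	   fetched and subtracted.  */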
	.p2align 4,, 4
L(neq):
	bsfl	%edx, %ecx
	movzbl	(%rdi, %rcx), %eax
	addq	%rdi, %rsi
	movzbl	(%rsi,%rcx), %edx
	jmp	L(finz1)

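	/* Both inputs are 16-byte aligned from here on: the structure
	   mirrors the loops above, but the s2 side can be loaded with
	   movdqa instead of movdqu.  */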
	.p2align 4,, 4
L(ATR):
	movq	%r11, %r10
	andq	$-32, %r10
	cmpq	%r10, %rdi
	jae	L(mt16)
	testq	$16, %rdi
	jz	L(ATR32)

	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi
	cmpq	%rdi, %r10
	je	L(mt16)

L(ATR32):
	movq	%r11, %r10
	andq	$-64, %r10
	testq	$32, %rdi
	jz	L(ATR64)

	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

L(ATR64):
	cmpq	%rdi, %r10
	je	L(mt32)

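	/* 64 bytes per iteration, aligned loads on both sides.  */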
L(ATR64main):
	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi
	cmpq	%rdi, %r10
	jne	L(ATR64main)

	movq	%r11, %r10
	andq	$-32, %r10
	cmpq	%r10, %rdi
	jae	L(mt16)

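	/* Compare the remaining 32-byte blocks below the 32-byte aligned
	   end in %r10, still with aligned loads.  */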
L(ATR32res):
	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	movdqa	(%rdi,%rsi), %xmm0
	pcmpeqb	(%rdi), %xmm0
	pmovmskb %xmm0, %edx
	subl	$0xffff, %edx
	jnz	L(neq)
	addq	$16, %rdi

	cmpq	%r10, %rdi
	jne	L(ATR32res)

	subq	%rdi, %r11
	je	L(finz)
	movq	%r11, %r10
	jmp	L(small)
	/* Align to 16 bytes to improve instruction fetch.  */
	.p2align 4,, 4
END(memcmp)

#undef bcmp
weak_alias (memcmp, bcmp)
libc_hidden_builtin_def (memcmp)