/* memcmp/wmemcmp optimized with AVX2.
   Copyright (C) 2017-2020 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

/* memcmp/wmemcmp is implemented as:
   1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
      to avoid branches.
   2. Use overlapping compare to avoid branch.
   3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
      bytes for wmemcmp.
   4. If size is 8 * VEC_SIZE or less, unroll the loop.
   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
      area.
   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
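
/* For reference only (never assembled): a rough C sketch of steps 1-2
   above for the 4-to-7-byte memcmp case, assuming a little-endian
   target, <stdint.h>, <string.h> and GCC's __builtin_bswap32.  The
   helper name memcmp_4_to_7 is purely illustrative.

     static int
     memcmp_4_to_7 (const unsigned char *s1, const unsigned char *s2,
                    size_t n)
     {
       uint32_t a, b, c, d;
       memcpy (&a, s1, 4);          // First 4 bytes.
       memcpy (&b, s1 + n - 4, 4);  // Last 4 bytes; overlaps when n < 8.
       memcpy (&c, s2, 4);
       memcpy (&d, s2 + n - 4, 4);
       // Byte-swap to big endian so one unsigned 64-bit compare gives
       // memcmp ordering, with the first 4 bytes in the high half.
       uint64_t x = ((uint64_t) __builtin_bswap32 (a) << 32)
                    | __builtin_bswap32 (b);
       uint64_t y = ((uint64_t) __builtin_bswap32 (c) << 32)
                    | __builtin_bswap32 (d);
       return x == y ? 0 : (x < y ? -1 : 1);
     }  */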

# include <sysdep.h>

# ifndef MEMCMP
#  define MEMCMP __memcmp_avx2_movbe
# endif

# ifdef USE_AS_WMEMCMP
#  define VPCMPEQ vpcmpeqd
# else
#  define VPCMPEQ vpcmpeqb
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER vzeroupper
# endif

# define VEC_SIZE 32
# define VEC_MASK ((1 << VEC_SIZE) - 1)
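/* VEC_MASK is 0xffffffff: vpmovmskb on a 32-byte vector yields one mask
   bit per byte, so an all-equal compare produces exactly VEC_MASK.  */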

/* Warning!
   wmemcmp has to use SIGNED comparison for elements.
   memcmp has to use UNSIGNED comparison for elements.
*/
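/* For example, comparing buffers whose first elements are 0xffffffff
   and 0x00000001: memcmp looks at unsigned bytes, sees 0xff > 0x01 and
   returns a positive value, while wmemcmp compares them as wchar_t
   (a signed 32-bit int on this target), sees -1 < 1 and returns a
   negative value.  */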

	.section .text.avx,"ax",@progbits
ENTRY (MEMCMP)
# ifdef USE_AS_WMEMCMP
	shl $2, %RDX_LP
# elif defined __ILP32__
	/* Clear the upper 32 bits.  */
	movl %edx, %edx
# endif
	cmp $VEC_SIZE, %RDX_LP
	jb L(less_vec)

	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
	vmovdqu (%rsi), %ymm2
	VPCMPEQ (%rdi), %ymm2, %ymm2
	vpmovmskb %ymm2, %eax
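	/* eax now has one bit set for every byte that compared equal;
	   subtracting VEC_MASK leaves zero only if all VEC_SIZE bytes
	   matched.  */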
	subl $VEC_MASK, %eax
	jnz L(first_vec)

	cmpq $(VEC_SIZE * 2), %rdx
	jbe L(last_vec)

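	/* Set ymm0 to all ones; it is the mask operand of the vptest
	   checks below (CF is set only when the tested compare result is
	   also all ones).  */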
	VPCMPEQ %ymm0, %ymm0, %ymm0
	/* More than 2 * VEC.  */
	cmpq $(VEC_SIZE * 8), %rdx
	ja L(more_8x_vec)
	cmpq $(VEC_SIZE * 4), %rdx
	jb L(last_4x_vec)

	/* From 4 * VEC to 8 * VEC, inclusively.  */
	vmovdqu (%rsi), %ymm1
	VPCMPEQ (%rdi), %ymm1, %ymm1

	vmovdqu VEC_SIZE(%rsi), %ymm2
	VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2

	vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3

	vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4

	vpand %ymm1, %ymm2, %ymm5
	vpand %ymm3, %ymm4, %ymm6
	vpand %ymm5, %ymm6, %ymm5

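	/* CF is set by vptest only when ymm5 (the AND of all four compare
	   results) is all ones, so jnc branches as soon as any element
	   differs.  */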
	vptest %ymm0, %ymm5
	jnc L(4x_vec_end)

	leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi
	leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi
	vmovdqu (%rsi), %ymm1
	VPCMPEQ (%rdi), %ymm1, %ymm1

	vmovdqu VEC_SIZE(%rsi), %ymm2
	VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
	vpand %ymm2, %ymm1, %ymm5

	vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
	vpand %ymm3, %ymm5, %ymm5

	vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
	vpand %ymm4, %ymm5, %ymm5

	vptest %ymm0, %ymm5
	jnc L(4x_vec_end)
	xorl %eax, %eax
	VZEROUPPER
	ret

	.p2align 4
L(last_2x_vec):
	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
	vmovdqu (%rsi), %ymm2
	VPCMPEQ (%rdi), %ymm2, %ymm2
	vpmovmskb %ymm2, %eax
	subl $VEC_MASK, %eax
	jnz L(first_vec)

L(last_vec):
	/* Use overlapping loads to avoid branches.  */
	leaq -VEC_SIZE(%rdi, %rdx), %rdi
	leaq -VEC_SIZE(%rsi, %rdx), %rsi
	vmovdqu (%rsi), %ymm2
	VPCMPEQ (%rdi), %ymm2, %ymm2
	vpmovmskb %ymm2, %eax
	subl $VEC_MASK, %eax
	jnz L(first_vec)
	VZEROUPPER
	ret

	.p2align 4
L(first_vec):
	/* A byte or int32 is different within 16 or 32 bytes.  */
	tzcntl %eax, %ecx
# ifdef USE_AS_WMEMCMP
	xorl %eax, %eax
	movl (%rdi, %rcx), %edx
	cmpl (%rsi, %rcx), %edx
L(wmemcmp_return):
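	/* The flags come from the signed 32-bit compare above: setl
	   yields 1 when the first buffer's element is smaller, negl turns
	   that into -1 or 0, and orl $1 maps 0 to 1, giving -1 or 1.  */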
	setl %al
	negl %eax
	orl $1, %eax
# else
	movzbl (%rdi, %rcx), %eax
	movzbl (%rsi, %rcx), %edx
	sub %edx, %eax
# endif
	VZEROUPPER
	ret

# ifdef USE_AS_WMEMCMP
	.p2align 4
L(4):
	xorl %eax, %eax
	movl (%rdi), %edx
	cmpl (%rsi), %edx
	jne L(wmemcmp_return)
	ret
# else
	.p2align 4
L(between_4_7):
	/* Load as big endian with overlapping movbe to avoid branches.  */
	movbe (%rdi), %eax
	movbe (%rsi), %ecx
	shlq $32, %rax
	shlq $32, %rcx
	movbe -4(%rdi, %rdx), %edi
	movbe -4(%rsi, %rdx), %esi
	orq %rdi, %rax
	orq %rsi, %rcx
	subq %rcx, %rax
	je L(exit)
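	/* The subq above set CF iff the big-endian value from the first
	   buffer is below that of the second; sbbl then yields -1 or 0,
	   and orl $1 maps 0 to 1.  */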
	sbbl %eax, %eax
	orl $1, %eax
	ret

	.p2align 4
L(exit):
	ret

	.p2align 4
L(between_2_3):
	/* Load as big endian to avoid branches.  */
	movzwl (%rdi), %eax
	movzwl (%rsi), %ecx
	shll $8, %eax
	shll $8, %ecx
	bswap %eax
	bswap %ecx
	movb -1(%rdi, %rdx), %al
	movb -1(%rsi, %rdx), %cl
	/* Subtraction is okay because the upper 8 bits are zero.  */
	subl %ecx, %eax
	ret

	.p2align 4
L(1):
	movzbl (%rdi), %eax
	movzbl (%rsi), %ecx
	subl %ecx, %eax
	ret
# endif

	.p2align 4
L(zero):
	xorl %eax, %eax
	ret

	.p2align 4
L(less_vec):
# ifdef USE_AS_WMEMCMP
	/* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes.  */
	cmpb $4, %dl
	je L(4)
	jb L(zero)
# else
	cmpb $1, %dl
	je L(1)
	jb L(zero)
	cmpb $4, %dl
	jb L(between_2_3)
	cmpb $8, %dl
	jb L(between_4_7)
# endif
	cmpb $16, %dl
	jae L(between_16_31)
	/* It is between 8 and 15 bytes.  */
	vmovq (%rdi), %xmm1
	vmovq (%rsi), %xmm2
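	/* vmovq zeroes bits 64-127 of both registers, so the upper halves
	   always compare equal and the 16-bit mask test below works even
	   though only 8 bytes were loaded.  */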
	VPCMPEQ %xmm1, %xmm2, %xmm2
	vpmovmskb %xmm2, %eax
	subl $0xffff, %eax
	jnz L(first_vec)
	/* Use overlapping loads to avoid branches.  */
	leaq -8(%rdi, %rdx), %rdi
	leaq -8(%rsi, %rdx), %rsi
	vmovq (%rdi), %xmm1
	vmovq (%rsi), %xmm2
	VPCMPEQ %xmm1, %xmm2, %xmm2
	vpmovmskb %xmm2, %eax
	subl $0xffff, %eax
	jnz L(first_vec)
	ret

	.p2align 4
L(between_16_31):
	/* From 16 to 31 bytes.  No branch when size == 16.  */
	vmovdqu (%rsi), %xmm2
	VPCMPEQ (%rdi), %xmm2, %xmm2
	vpmovmskb %xmm2, %eax
	subl $0xffff, %eax
	jnz L(first_vec)

	/* Use overlapping loads to avoid branches.  */
	leaq -16(%rdi, %rdx), %rdi
	leaq -16(%rsi, %rdx), %rsi
	vmovdqu (%rsi), %xmm2
	VPCMPEQ (%rdi), %xmm2, %xmm2
	vpmovmskb %xmm2, %eax
	subl $0xffff, %eax
	jnz L(first_vec)
	ret

	.p2align 4
L(more_8x_vec):
	/* More than 8 * VEC.  Check the first VEC.  */
	vmovdqu (%rsi), %ymm2
	VPCMPEQ (%rdi), %ymm2, %ymm2
	vpmovmskb %ymm2, %eax
	subl $VEC_MASK, %eax
	jnz L(first_vec)

	/* Align the first memory area for aligned loads in the loop.
	   Compute how much the first memory area is misaligned.  */
	movq %rdi, %rcx
	andl $(VEC_SIZE - 1), %ecx
	/* Get the negative of offset for alignment.  */
	subq $VEC_SIZE, %rcx
	/* Adjust the second memory area.  */
	subq %rcx, %rsi
	/* Adjust the first memory area which should be aligned now.  */
	subq %rcx, %rdi
	/* Adjust length.  */
	addq %rcx, %rdx
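	/* Skipping up to VEC_SIZE bytes here is safe: they were already
	   compared by the first-VEC check at L(more_8x_vec).  */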

L(loop_4x_vec):
	/* Compare 4 * VEC at a time forward.  */
	vmovdqu (%rsi), %ymm1
	VPCMPEQ (%rdi), %ymm1, %ymm1

	vmovdqu VEC_SIZE(%rsi), %ymm2
	VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
	vpand %ymm2, %ymm1, %ymm5

	vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
	vpand %ymm3, %ymm5, %ymm5

	vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
	vpand %ymm4, %ymm5, %ymm5

	vptest %ymm0, %ymm5
	jnc L(4x_vec_end)

	addq $(VEC_SIZE * 4), %rdi
	addq $(VEC_SIZE * 4), %rsi

	subq $(VEC_SIZE * 4), %rdx
	cmpq $(VEC_SIZE * 4), %rdx
	jae L(loop_4x_vec)

	/* Less than 4 * VEC.  */
	cmpq $VEC_SIZE, %rdx
	jbe L(last_vec)
	cmpq $(VEC_SIZE * 2), %rdx
	jbe L(last_2x_vec)

L(last_4x_vec):
	/* From 2 * VEC to 4 * VEC.  */
	vmovdqu (%rsi), %ymm2
	VPCMPEQ (%rdi), %ymm2, %ymm2
	vpmovmskb %ymm2, %eax
	subl $VEC_MASK, %eax
	jnz L(first_vec)

	addq $VEC_SIZE, %rdi
	addq $VEC_SIZE, %rsi
	vmovdqu (%rsi), %ymm2
	VPCMPEQ (%rdi), %ymm2, %ymm2
	vpmovmskb %ymm2, %eax
	subl $VEC_MASK, %eax
	jnz L(first_vec)

	/* Use overlapping loads to avoid branches.  */
	leaq -(3 * VEC_SIZE)(%rdi, %rdx), %rdi
	leaq -(3 * VEC_SIZE)(%rsi, %rdx), %rsi
	vmovdqu (%rsi), %ymm2
	VPCMPEQ (%rdi), %ymm2, %ymm2
	vpmovmskb %ymm2, %eax
	subl $VEC_MASK, %eax
	jnz L(first_vec)

	addq $VEC_SIZE, %rdi
	addq $VEC_SIZE, %rsi
	vmovdqu (%rsi), %ymm2
	VPCMPEQ (%rdi), %ymm2, %ymm2
	vpmovmskb %ymm2, %eax
	subl $VEC_MASK, %eax
	jnz L(first_vec)
	VZEROUPPER
	ret

	.p2align 4
L(4x_vec_end):
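	/* ymm1-ymm4 still hold the four compare results; find the first
	   vector with a mismatch and compute the return value from its
	   differing element.  */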
	vpmovmskb %ymm1, %eax
	subl $VEC_MASK, %eax
	jnz L(first_vec)
	vpmovmskb %ymm2, %eax
	subl $VEC_MASK, %eax
	jnz L(first_vec_x1)
	vpmovmskb %ymm3, %eax
	subl $VEC_MASK, %eax
	jnz L(first_vec_x2)
	vpmovmskb %ymm4, %eax
	subl $VEC_MASK, %eax
	tzcntl %eax, %ecx
# ifdef USE_AS_WMEMCMP
	xorl %eax, %eax
	movl (VEC_SIZE * 3)(%rdi, %rcx), %edx
	cmpl (VEC_SIZE * 3)(%rsi, %rcx), %edx
	jmp L(wmemcmp_return)
# else
	movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
	movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx
	sub %edx, %eax
# endif
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x1):
	tzcntl %eax, %ecx
# ifdef USE_AS_WMEMCMP
	xorl %eax, %eax
	movl VEC_SIZE(%rdi, %rcx), %edx
	cmpl VEC_SIZE(%rsi, %rcx), %edx
	jmp L(wmemcmp_return)
# else
	movzbl VEC_SIZE(%rdi, %rcx), %eax
	movzbl VEC_SIZE(%rsi, %rcx), %edx
	sub %edx, %eax
# endif
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x2):
	tzcntl %eax, %ecx
# ifdef USE_AS_WMEMCMP
	xorl %eax, %eax
	movl (VEC_SIZE * 2)(%rdi, %rcx), %edx
	cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx
	jmp L(wmemcmp_return)
# else
	movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
	movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx
	sub %edx, %eax
# endif
	VZEROUPPER
	ret
END (MEMCMP)
#endif