/* memchr/wmemchr optimized with AVX2.
   Copyright (C) 2017-2020 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef MEMCHR
#  define MEMCHR	__memchr_avx2
# endif

# ifdef USE_AS_WMEMCHR
#  define VPCMPEQ	vpcmpeqd
# else
#  define VPCMPEQ	vpcmpeqb
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# define VEC_SIZE 32

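/* Strategy: broadcast CHAR into a YMM register and compare VEC_SIZE
   bytes per VPCMPEQ, handling a possible page cross on the first
   (unaligned) load, then scanning 4 * VEC_SIZE bytes per loop
   iteration.  A rough scalar C sketch of what the byte variant
   computes (illustrative only; it ignores the vectorization and the
   WMEMCHR/RAWMEMCHR configurations):

     #include <stddef.h>

     static void *
     memchr_sketch (const void *s, int c, size_t n)
     {
       const unsigned char *p = s;
       for (size_t i = 0; i < n; i++)
         if (p[i] == (unsigned char) c)
           return (void *) (p + i);
       return NULL;
     }

   For WMEMCHR the comparison is done on 4-byte wide characters and the
   length is scaled to bytes up front; RAWMEMCHR omits all length
   checks.  */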
	.section .text.avx,"ax",@progbits
ENTRY (MEMCHR)
# ifndef USE_AS_RAWMEMCHR
	/* Check for zero length.  */
	test	%RDX_LP, %RDX_LP
	jz	L(null)
# endif
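	/* Keep the low bits of the source pointer in ECX; they feed the
	   page-cross check and the length adjustments below.  */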
	movl	%edi, %ecx
	/* Broadcast CHAR to YMM0.  */
	vmovd	%esi, %xmm0
# ifdef USE_AS_WMEMCHR
	shl	$2, %RDX_LP
	vpbroadcastd %xmm0, %ymm0
# else
#  ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
#  endif
	vpbroadcastb %xmm0, %ymm0
# endif
	/* Check if we may cross a page boundary with one vector load.  */
	andl	$(2 * VEC_SIZE - 1), %ecx
	cmpl	$VEC_SIZE, %ecx
	ja	L(cross_page_boundary)

	/* Check the first VEC_SIZE bytes.  */
	VPCMPEQ (%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax

# ifndef USE_AS_RAWMEMCHR
	jnz	L(first_vec_x0_check)
	/* Adjust length and check the end of data.  */
	subq	$VEC_SIZE, %rdx
	jbe	L(zero)
# else
	jnz	L(first_vec_x0)
# endif

	/* Align data for aligned loads in the loop.  */
	addq	$VEC_SIZE, %rdi
	andl	$(VEC_SIZE - 1), %ecx
	andq	$-VEC_SIZE, %rdi

# ifndef USE_AS_RAWMEMCHR
	/* Adjust length: aligning RDI down re-examines ECX bytes that
	   were already checked, so add them back.  */
	addq	%rcx, %rdx

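	/* If at most 4 * VEC_SIZE bytes remain, handle them in the tail
	   code below.  */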
	subq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec_or_less)
# endif
	jmp	L(more_4x_vec)

	.p2align 4
L(cross_page_boundary):
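	/* The first unaligned VEC_SIZE load could cross a page boundary,
	   so load from the preceding VEC_SIZE-aligned address and shift
	   the match bits for bytes before the original start out of the
	   mask.  */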
	andl	$(VEC_SIZE - 1), %ecx
	andq	$-VEC_SIZE, %rdi
	VPCMPEQ (%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	/* Remove the leading bytes.  */
	sarl	%cl, %eax
	testl	%eax, %eax
	jz	L(aligned_more)
	tzcntl	%eax, %eax
# ifndef USE_AS_RAWMEMCHR
	/* Check the end of data.  */
	cmpq	%rax, %rdx
	jbe	L(zero)
# endif
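	/* Return the aligned base plus the shifted-out offset in RCX plus
	   the match index.  */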
	addq	%rdi, %rax
	addq	%rcx, %rax
	VZEROUPPER
	ret

	.p2align 4
L(aligned_more):
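	/* Reached from the page-cross path with no match in the first
	   vector: RDI is aligned down to VEC_SIZE and RCX holds the
	   original pointer's misalignment.  */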
# ifndef USE_AS_RAWMEMCHR
	/* Calculate "rdx + rcx - VEC_SIZE" as "rdx - (VEC_SIZE - rcx)"
	   instead of "(rdx + rcx) - VEC_SIZE" to avoid possible addition
	   overflow.  */
	negq	%rcx
	addq	$VEC_SIZE, %rcx

	/* Check the end of data.  */
	subq	%rcx, %rdx
	jbe	L(zero)
# endif

	addq	$VEC_SIZE, %rdi

# ifndef USE_AS_RAWMEMCHR
	subq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec_or_less)
# endif

L(more_4x_vec):
	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  */
	VPCMPEQ (%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x0)

	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x2)

	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x3)

	addq	$(VEC_SIZE * 4), %rdi

# ifndef USE_AS_RAWMEMCHR
	subq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec_or_less)
# endif

	/* Align data to 4 * VEC_SIZE.  */
	movq	%rdi, %rcx
	andl	$(4 * VEC_SIZE - 1), %ecx
	andq	$-(4 * VEC_SIZE), %rdi

# ifndef USE_AS_RAWMEMCHR
	/* Adjust length: aligning RDI down to 4 * VEC_SIZE re-examines
	   ECX bytes, so add them back.  */
	addq	%rcx, %rdx
# endif

	.p2align 4
L(loop_4x_vec):
	/* Compare 4 * VEC at a time forward.  */
	VPCMPEQ (%rdi), %ymm0, %ymm1
	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4

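	/* OR the four compare results so a single branch detects a match
	   anywhere in the 4 * VEC_SIZE block.  */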
	vpor	%ymm1, %ymm2, %ymm5
	vpor	%ymm3, %ymm4, %ymm6
	vpor	%ymm5, %ymm6, %ymm5

	vpmovmskb %ymm5, %eax
	testl	%eax, %eax
	jnz	L(4x_vec_end)

	addq	$(VEC_SIZE * 4), %rdi

# ifdef USE_AS_RAWMEMCHR
	jmp	L(loop_4x_vec)
# else
	subq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_4x_vec)

L(last_4x_vec_or_less):
	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
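	/* EDX holds the remaining length minus 4 * VEC_SIZE; after adding
	   2 * VEC_SIZE back, a non-positive result means at most two
	   vectors are left.  */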
	addl	$(VEC_SIZE * 2), %edx
	jle	L(last_2x_vec)

	VPCMPEQ (%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x0)

	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax

	jnz	L(first_vec_x2_check)
	subl	$VEC_SIZE, %edx
	jle	L(zero)

	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax

	jnz	L(first_vec_x3_check)
	xorl	%eax, %eax
	VZEROUPPER
	ret

	.p2align 4
L(last_2x_vec):
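	/* At most 2 * VEC_SIZE bytes remain; restore EDX to the actual
	   remaining length before checking the last vectors.  */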
	addl	$(VEC_SIZE * 2), %edx
	VPCMPEQ (%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax

	jnz	L(first_vec_x0_check)
	subl	$VEC_SIZE, %edx
	jle	L(zero)

	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1_check)
	xorl	%eax, %eax
	VZEROUPPER
	ret

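	/* The *_check return paths are used when the match might lie past
	   the end of the buffer: compare the match offset against the
	   remaining length before returning it.  */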
	.p2align 4
L(first_vec_x0_check):
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpq	%rax, %rdx
	jbe	L(zero)
	addq	%rdi, %rax
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x1_check):
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpq	%rax, %rdx
	jbe	L(zero)
	addq	$VEC_SIZE, %rax
	addq	%rdi, %rax
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x2_check):
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpq	%rax, %rdx
	jbe	L(zero)
	addq	$(VEC_SIZE * 2), %rax
	addq	%rdi, %rax
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x3_check):
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpq	%rax, %rdx
	jbe	L(zero)
	addq	$(VEC_SIZE * 3), %rax
	addq	%rdi, %rax
	VZEROUPPER
	ret

	.p2align 4
L(zero):
	VZEROUPPER
L(null):
	xorl	%eax, %eax
	ret
# endif

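	/* The first_vec_xN labels return a match found in vector N of the
	   current block; no end-of-data check is needed on these paths.  */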
	.p2align 4
L(first_vec_x0):
	tzcntl	%eax, %eax
	addq	%rdi, %rax
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x1):
	tzcntl	%eax, %eax
	addq	$VEC_SIZE, %rax
	addq	%rdi, %rax
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x2):
	tzcntl	%eax, %eax
	addq	$(VEC_SIZE * 2), %rax
	addq	%rdi, %rax
	VZEROUPPER
	ret

	.p2align 4
L(4x_vec_end):
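	/* A match was found somewhere in the current 4 * VEC_SIZE block;
	   test each of the four compare results in order to locate it.  */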
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x0)
	vpmovmskb %ymm2, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)
	vpmovmskb %ymm3, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x2)
	vpmovmskb %ymm4, %eax
	testl	%eax, %eax
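	/* Fall through: the match is in the fourth vector.  */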
L(first_vec_x3):
	tzcntl	%eax, %eax
	addq	$(VEC_SIZE * 3), %rax
	addq	%rdi, %rax
	VZEROUPPER
	ret

END (MEMCHR)
#endif