/* memrchr optimized with AVX2.
   Copyright (C) 2017-2020 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# define VEC_SIZE 32

	.section .text.avx,"ax",@progbits
ENTRY (__memrchr_avx2)
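	/* void *memrchr (const void *s, int c, size_t n).
	   Arguments arrive in RDI (s), ESI (c) and RDX (n); the *_LP
	   register macros select the 32-bit registers on x32.  Returns a
	   pointer to the last occurrence of the byte c in the first n
	   bytes of s, or NULL if the byte does not occur.  */
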
	/* Broadcast CHAR to YMM0.  */
	vmovd	%esi, %xmm0
	vpbroadcastb %xmm0, %ymm0

	sub	$VEC_SIZE, %RDX_LP
	jbe	L(last_vec_or_less)

	add	%RDX_LP, %RDI_LP
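	/* RDI now points VEC_SIZE bytes before the end of the buffer, so
	   the highest-addressed vector is checked first.  */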

	/* Check the last VEC_SIZE bytes.  */
	vpcmpeqb (%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x0)

	subq	$(VEC_SIZE * 4), %rdi
	movl	%edi, %ecx
	andl	$(VEC_SIZE - 1), %ecx
	jz	L(aligned_more)

	/* Align data for aligned loads in the loop.  */
	addq	$VEC_SIZE, %rdi
	addq	$VEC_SIZE, %rdx
	andq	$-VEC_SIZE, %rdi
	subq	%rcx, %rdx
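	/* RDI is now rounded down to a VEC_SIZE boundary; RDX was adjusted
	   by the same amount, so it still tracks how many bytes remain to
	   be scanned.  */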

	.p2align 4
L(aligned_more):
	subq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec_or_less)

	/* Check the last 4 * VEC_SIZE bytes, one VEC_SIZE at a time, since
	   the data is only known to be aligned to VEC_SIZE.  */
	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3)

	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
	vpmovmskb %ymm2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
	vpmovmskb %ymm3, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x1)

	vpcmpeqb (%rdi), %ymm0, %ymm4
	vpmovmskb %ymm4, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x0)

	/* Align data to 4 * VEC_SIZE so the main loop needs fewer branches.
	   The loop below may then overlap with some of the bytes already
	   checked above if the data wasn't aligned to 4 * VEC_SIZE.  */
	movl	%edi, %ecx
	andl	$(VEC_SIZE * 4 - 1), %ecx
	jz	L(loop_4x_vec)

	addq	$(VEC_SIZE * 4), %rdi
	addq	$(VEC_SIZE * 4), %rdx
	andq	$-(VEC_SIZE * 4), %rdi
	subq	%rcx, %rdx
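	/* RDI is now aligned to 4 * VEC_SIZE and RDX was adjusted by the
	   same amount, so RDX still equals the number of unscanned bytes
	   below RDI.  */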

	.p2align 4
L(loop_4x_vec):
	/* Compare 4 * VEC_SIZE bytes at a time, moving backward through
	   the buffer.  */
	subq	$(VEC_SIZE * 4), %rdi
	subq	$(VEC_SIZE * 4), %rdx
	jbe	L(last_4x_vec_or_less)

	vmovdqa	(%rdi), %ymm1
	vmovdqa	VEC_SIZE(%rdi), %ymm2
	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm3
	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm4

	vpcmpeqb %ymm1, %ymm0, %ymm1
	vpcmpeqb %ymm2, %ymm0, %ymm2
	vpcmpeqb %ymm3, %ymm0, %ymm3
	vpcmpeqb %ymm4, %ymm0, %ymm4

	vpor	%ymm1, %ymm2, %ymm5
	vpor	%ymm3, %ymm4, %ymm6
	vpor	%ymm5, %ymm6, %ymm5
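	/* YMM5 is nonzero iff any of the four vectors contained CHAR.  */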

	vpmovmskb %ymm5, %eax
	testl	%eax, %eax
	jz	L(loop_4x_vec)

	/* There is a match: find the vector at the highest address that
	   contains it.  */
	vpmovmskb %ymm4, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3)

	vpmovmskb %ymm3, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	vpmovmskb %ymm2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x1)

	vpmovmskb %ymm1, %eax
	bsrl	%eax, %eax
	addq	%rdi, %rax
	VZEROUPPER
	ret

	.p2align 4
L(last_4x_vec_or_less):
	addl	$(VEC_SIZE * 4), %edx
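	/* EDX is now the number of bytes left to scan, counted from the
	   start of the buffer up to RDI + 4 * VEC_SIZE; it is at most
	   4 * VEC_SIZE.  */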
	cmpl	$(VEC_SIZE * 2), %edx
	jbe	L(last_2x_vec)

	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3)

	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
	vpmovmskb %ymm2, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x2)

	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
	vpmovmskb %ymm3, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x1_check)
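	/* If at most 3 * VEC_SIZE bytes remain, the vector at (%rdi) lies
	   entirely before the start of the buffer.  */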
	cmpl	$(VEC_SIZE * 3), %edx
	jbe	L(zero)

	vpcmpeqb (%rdi), %ymm0, %ymm4
	vpmovmskb %ymm4, %eax
	testl	%eax, %eax
	jz	L(zero)
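	/* The match is only valid if RDI + RAX does not point below the
	   start of the buffer, i.e. if RDX - 4 * VEC_SIZE + RAX is not
	   negative.  */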
	bsrl	%eax, %eax
	subq	$(VEC_SIZE * 4), %rdx
	addq	%rax, %rdx
	jl	L(zero)
	addq	%rdi, %rax
	VZEROUPPER
	ret

	.p2align 4
L(last_2x_vec):
	vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(last_vec_x3_check)
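	/* If at most VEC_SIZE bytes remain, the vector at
	   (VEC_SIZE * 2)(%rdi) lies entirely before the start of the
	   buffer.  */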
	cmpl	$VEC_SIZE, %edx
	jbe	L(zero)

	vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jz	L(zero)
	bsrl	%eax, %eax
	subq	$(VEC_SIZE * 2), %rdx
	addq	%rax, %rdx
	jl	L(zero)
	addl	$(VEC_SIZE * 2), %eax
	addq	%rdi, %rax
	VZEROUPPER
	ret

	.p2align 4
L(last_vec_x0):
	bsrl	%eax, %eax
	addq	%rdi, %rax
	VZEROUPPER
	ret

	.p2align 4
L(last_vec_x1):
	bsrl	%eax, %eax
	addl	$VEC_SIZE, %eax
	addq	%rdi, %rax
	VZEROUPPER
	ret

	.p2align 4
L(last_vec_x2):
	bsrl	%eax, %eax
	addl	$(VEC_SIZE * 2), %eax
	addq	%rdi, %rax
	VZEROUPPER
	ret

	.p2align 4
L(last_vec_x3):
	bsrl	%eax, %eax
	addl	$(VEC_SIZE * 3), %eax
	addq	%rdi, %rax
	VZEROUPPER
	ret

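	/* The _check variants below are reached when the matching vector
	   may extend below the start of the buffer; they reject a match
	   whose address would fall before the first byte.  */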
	.p2align 4
L(last_vec_x1_check):
	bsrl	%eax, %eax
	subq	$(VEC_SIZE * 3), %rdx
	addq	%rax, %rdx
	jl	L(zero)
	addl	$VEC_SIZE, %eax
	addq	%rdi, %rax
	VZEROUPPER
	ret

	.p2align 4
L(last_vec_x3_check):
	bsrl	%eax, %eax
	subq	$VEC_SIZE, %rdx
	addq	%rax, %rdx
	jl	L(zero)
	addl	$(VEC_SIZE * 3), %eax
	addq	%rdi, %rax
	VZEROUPPER
	ret

	.p2align 4
L(zero):
	VZEROUPPER
L(null):
	xorl	%eax, %eax
	ret

	.p2align 4
L(last_vec_or_less_aligned):
	movl	%edx, %ecx

	vpcmpeqb (%rdi), %ymm0, %ymm1

	movl	$1, %edx
	/* The shift count in CL can be as large as VEC_SIZE (32), so use a
	   64-bit shift; a 32-bit shift by 32 would leave the register
	   unchanged.  */
	salq	%cl, %rdx
	subq	$1, %rdx
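	/* RDX is now a mask of the low ECX (= length) bits; bytes beyond
	   the end of the buffer are cleared from the compare result
	   below.  */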

	vpmovmskb %ymm1, %eax

	/* Remove the trailing bytes.  */
	andl	%edx, %eax
	testl	%eax, %eax
	jz	L(zero)

	bsrl	%eax, %eax
	addq	%rdi, %rax
	VZEROUPPER
	ret

	.p2align 4
L(last_vec_or_less):
	addl	$VEC_SIZE, %edx
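	/* RDI still points to the start of the buffer and EDX has been
	   restored to the original length n, which is at most VEC_SIZE.  */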

	/* Check for zero length.  */
	testl	%edx, %edx
	jz	L(null)

	movl	%edi, %ecx
	andl	$(VEC_SIZE - 1), %ecx
	jz	L(last_vec_or_less_aligned)

	movl	%ecx, %esi
	movl	%ecx, %r8d
	addl	%edx, %esi
	andq	$-VEC_SIZE, %rdi

	subl	$VEC_SIZE, %esi
	ja	L(last_vec_2x_aligned)
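	/* The n bytes fit inside the single aligned vector at RDI: ECX (and
	   R8D) hold the misalignment, i.e. the number of bytes between RDI
	   and the start of the buffer, and ECX + n is at most VEC_SIZE.  */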

	/* Check the last VEC.  */
	vpcmpeqb (%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax

	/* Remove the leading and trailing bytes.  */
	sarl	%cl, %eax
	movl	%edx, %ecx

	movl	$1, %edx
	sall	%cl, %edx
	subl	$1, %edx
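	/* EAX now has the compare bits shifted down by the misalignment, so
	   bit 0 corresponds to the first byte of the buffer; EDX masks off
	   everything past the n valid bytes.  */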

	andl	%edx, %eax
	testl	%eax, %eax
	jz	L(zero)

	bsrl	%eax, %eax
	addq	%rdi, %rax
	addq	%r8, %rax
	VZEROUPPER
	ret

	.p2align 4
L(last_vec_2x_aligned):
	movl	%esi, %ecx
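	/* ESI is the number of buffer bytes that spill into the second
	   aligned vector at VEC_SIZE(%rdi): misalignment + n - VEC_SIZE.  */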

	/* Check the last VEC.  */
	vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1

	movl	$1, %edx
	sall	%cl, %edx
	subl	$1, %edx

	vpmovmskb %ymm1, %eax

	/* Remove the trailing bytes.  */
	andl	%edx, %eax

	testl	%eax, %eax
	jnz	L(last_vec_x1)

	/* Check the second-to-last VEC.  */
	vpcmpeqb (%rdi), %ymm0, %ymm1

	movl	%r8d, %ecx

	vpmovmskb %ymm1, %eax

	/* Remove the leading bytes.  Must use unsigned right shift for
	   bsrl below.  */
	shrl	%cl, %eax
	testl	%eax, %eax
	jz	L(zero)

	bsrl	%eax, %eax
	addq	%rdi, %rax
	addq	%r8, %rax
	VZEROUPPER
	ret
END (__memrchr_avx2)
#endif