/* strlen/strnlen/wcslen/wcsnlen optimized with AVX2.
   Copyright (C) 2017-2020 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRLEN
#  define STRLEN	__strlen_avx2
# endif

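/* For wcslen/wcsnlen the string elements are 4-byte wchar_t values, so
   the dword forms of the compare and unsigned-minimum instructions are
   used; strlen/strnlen use the byte forms.  */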
# ifdef USE_AS_WCSLEN
#  define VPCMPEQ	vpcmpeqd
#  define VPMINU	vpminud
# else
#  define VPCMPEQ	vpcmpeqb
#  define VPMINU	vpminub
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# define VEC_SIZE 32

	.section .text.avx,"ax",@progbits
ENTRY (STRLEN)
# ifdef USE_AS_STRNLEN
	/* Check for zero length.  */
	test	%RSI_LP, %RSI_LP
	jz	L(zero)
#  ifdef USE_AS_WCSLEN
	shl	$2, %RSI_LP
#  elif defined __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%esi, %esi
#  endif
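	/* Save the maximum length (already scaled to bytes for wcsnlen) in
	   %r8 so the L(max) paths can return it.  */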
	mov	%RSI_LP, %R8_LP
# endif
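	/* %ecx gets the low bits of the start address for the page-cross
	   check below, %rdx keeps the original pointer so the final length
	   can be computed, and %ymm0 is the all-zero vector used to find
	   the terminating null.  */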
	movl	%edi, %ecx
	movq	%rdi, %rdx
	vpxor	%xmm0, %xmm0, %xmm0

	/* Check if we may cross a page boundary with one vector load.  */
	andl	$(2 * VEC_SIZE - 1), %ecx
	cmpl	$VEC_SIZE, %ecx
	ja	L(cross_page_boundary)

	/* Check the first VEC_SIZE bytes.  */
	VPCMPEQ	(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax

# ifdef USE_AS_STRNLEN
	jnz	L(first_vec_x0_check)
	/* Adjust length and check the end of data.  */
	subq	$VEC_SIZE, %rsi
	jbe	L(max)
# else
	jnz	L(first_vec_x0)
# endif

	/* Align data for aligned loads in the loop.  */
	addq	$VEC_SIZE, %rdi
	andl	$(VEC_SIZE - 1), %ecx
	andq	$-VEC_SIZE, %rdi

# ifdef USE_AS_STRNLEN
	/* Adjust length.  */
	addq	%rcx, %rsi

	subq	$(VEC_SIZE * 4), %rsi
	jbe	L(last_4x_vec_or_less)
# endif
	jmp	L(more_4x_vec)

	.p2align 4
L(cross_page_boundary):
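	/* The first, unaligned VEC_SIZE load could touch the next page, so
	   load from the enclosing VEC_SIZE-aligned address instead (always
	   within the same page) and shift the match mask right by the
	   misalignment in %cl to drop the bytes before the start of the
	   string.  */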
	andl	$(VEC_SIZE - 1), %ecx
	andq	$-VEC_SIZE, %rdi
	VPCMPEQ	(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	/* Remove the leading bytes.  */
	sarl	%cl, %eax
	testl	%eax, %eax
	jz	L(aligned_more)
	tzcntl	%eax, %eax
# ifdef USE_AS_STRNLEN
	/* Check the end of data.  */
	cmpq	%rax, %rsi
	jbe	L(max)
# endif
	addq	%rdi, %rax
	addq	%rcx, %rax
	subq	%rdx, %rax
# ifdef USE_AS_WCSLEN
	shrq	$2, %rax
# endif
	VZEROUPPER
	ret

	.p2align 4
L(aligned_more):
# ifdef USE_AS_STRNLEN
	/* "rcx" is less than VEC_SIZE.  Calculate "rsi + rcx - VEC_SIZE"
	   as "rsi - (VEC_SIZE - rcx)" instead of "(rsi + rcx) - VEC_SIZE"
	   to avoid possible addition overflow.  */
	negq	%rcx
	addq	$VEC_SIZE, %rcx

	/* Check the end of data.  */
	subq	%rcx, %rsi
	jbe	L(max)
# endif

	addq	$VEC_SIZE, %rdi

# ifdef USE_AS_STRNLEN
	subq	$(VEC_SIZE * 4), %rsi
	jbe	L(last_4x_vec_or_less)
# endif

L(more_4x_vec):
	/* Check the first 4 * VEC_SIZE bytes.  Only one VEC_SIZE at a time
	   since data is only aligned to VEC_SIZE.  */
	VPCMPEQ	(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x0)

	VPCMPEQ	VEC_SIZE(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x2)

	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x3)

	addq	$(VEC_SIZE * 4), %rdi

# ifdef USE_AS_STRNLEN
	subq	$(VEC_SIZE * 4), %rsi
	jbe	L(last_4x_vec_or_less)
# endif

	/* Align data to 4 * VEC_SIZE.  */
	movq	%rdi, %rcx
	andl	$(4 * VEC_SIZE - 1), %ecx
	andq	$-(4 * VEC_SIZE), %rdi

# ifdef USE_AS_STRNLEN
	/* Adjust length.  */
	addq	%rcx, %rsi
# endif

	.p2align 4
L(loop_4x_vec):
	/* Compare 4 * VEC at a time forward.  */
	vmovdqa	(%rdi), %ymm1
	vmovdqa	VEC_SIZE(%rdi), %ymm2
	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm3
	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm4
	VPMINU	%ymm1, %ymm2, %ymm5
	VPMINU	%ymm3, %ymm4, %ymm6
	VPMINU	%ymm5, %ymm6, %ymm5
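	/* The unsigned minimum of the four vectors is zero in any element
	   where at least one of them has a zero, so a single compare of
	   %ymm5 against the zero vector detects a null anywhere in this
	   4 * VEC_SIZE block.  */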

	VPCMPEQ	%ymm5, %ymm0, %ymm5
	vpmovmskb %ymm5, %eax
	testl	%eax, %eax
	jnz	L(4x_vec_end)

	addq	$(VEC_SIZE * 4), %rdi

# ifndef USE_AS_STRNLEN
	jmp	L(loop_4x_vec)
# else
	subq	$(VEC_SIZE * 4), %rsi
	ja	L(loop_4x_vec)

L(last_4x_vec_or_less):
	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
	addl	$(VEC_SIZE * 2), %esi
	jle	L(last_2x_vec)

	VPCMPEQ	(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x0)

	VPCMPEQ	VEC_SIZE(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax

	jnz	L(first_vec_x2_check)
	subl	$VEC_SIZE, %esi
	jle	L(max)

	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax

	jnz	L(first_vec_x3_check)
	movq	%r8, %rax
#  ifdef USE_AS_WCSLEN
	shrq	$2, %rax
#  endif
	VZEROUPPER
	ret

	.p2align 4
L(last_2x_vec):
	addl	$(VEC_SIZE * 2), %esi
	VPCMPEQ	(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax

	jnz	L(first_vec_x0_check)
	subl	$VEC_SIZE, %esi
	jle	L(max)

	VPCMPEQ	VEC_SIZE(%rdi), %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1_check)
	movq	%r8, %rax
#  ifdef USE_AS_WCSLEN
	shrq	$2, %rax
#  endif
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x0_check):
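	/* The *_check labels are reached only in the strnlen/wcsnlen build;
	   they verify that the null found is within the remaining length in
	   %rsi before computing the result, and otherwise return the
	   maximum length via L(max).  */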
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpq	%rax, %rsi
	jbe	L(max)
	addq	%rdi, %rax
	subq	%rdx, %rax
#  ifdef USE_AS_WCSLEN
	shrq	$2, %rax
#  endif
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x1_check):
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpq	%rax, %rsi
	jbe	L(max)
	addq	$VEC_SIZE, %rax
	addq	%rdi, %rax
	subq	%rdx, %rax
#  ifdef USE_AS_WCSLEN
	shrq	$2, %rax
#  endif
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x2_check):
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpq	%rax, %rsi
	jbe	L(max)
	addq	$(VEC_SIZE * 2), %rax
	addq	%rdi, %rax
	subq	%rdx, %rax
#  ifdef USE_AS_WCSLEN
	shrq	$2, %rax
#  endif
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x3_check):
	tzcntl	%eax, %eax
	/* Check the end of data.  */
	cmpq	%rax, %rsi
	jbe	L(max)
	addq	$(VEC_SIZE * 3), %rax
	addq	%rdi, %rax
	subq	%rdx, %rax
#  ifdef USE_AS_WCSLEN
	shrq	$2, %rax
#  endif
	VZEROUPPER
	ret

	.p2align 4
L(max):
	movq	%r8, %rax
#  ifdef USE_AS_WCSLEN
	shrq	$2, %rax
#  endif
	VZEROUPPER
	ret

	.p2align 4
L(zero):
	xorl	%eax, %eax
	ret
# endif

	.p2align 4
L(first_vec_x0):
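	/* tzcnt gives the byte offset of the first null within the vector;
	   adding the current vector address in %rdi and subtracting the
	   saved start pointer in %rdx yields the length in bytes, which is
	   converted to wide characters for wcslen.  */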
	tzcntl	%eax, %eax
	addq	%rdi, %rax
	subq	%rdx, %rax
# ifdef USE_AS_WCSLEN
	shrq	$2, %rax
# endif
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x1):
	tzcntl	%eax, %eax
	addq	$VEC_SIZE, %rax
	addq	%rdi, %rax
	subq	%rdx, %rax
# ifdef USE_AS_WCSLEN
	shrq	$2, %rax
# endif
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x2):
	tzcntl	%eax, %eax
	addq	$(VEC_SIZE * 2), %rax
	addq	%rdi, %rax
	subq	%rdx, %rax
# ifdef USE_AS_WCSLEN
	shrq	$2, %rax
# endif
	VZEROUPPER
	ret

	.p2align 4
L(4x_vec_end):
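	/* The main loop saw a null somewhere in the last four vectors;
	   recheck each of %ymm1-%ymm4 in order to find the first vector
	   containing it.  The final compare falls through into
	   L(first_vec_x3).  */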
	VPCMPEQ	%ymm1, %ymm0, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x0)
	VPCMPEQ	%ymm2, %ymm0, %ymm2
	vpmovmskb %ymm2, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)
	VPCMPEQ	%ymm3, %ymm0, %ymm3
	vpmovmskb %ymm3, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x2)
	VPCMPEQ	%ymm4, %ymm0, %ymm4
	vpmovmskb %ymm4, %eax
L(first_vec_x3):
	tzcntl	%eax, %eax
	addq	$(VEC_SIZE * 3), %rax
	addq	%rdi, %rax
	subq	%rdx, %rax
# ifdef USE_AS_WCSLEN
	shrq	$2, %rax
# endif
	VZEROUPPER
	ret

END (STRLEN)
#endif