/* strchr/strchrnul optimized with AVX2.
   Copyright (C) 2017-2020 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRCHR
#  define STRCHR	__strchr_avx2
# endif

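/* For wcschr, CHAR is a 32-bit wchar_t, so broadcast and compare
   dwords; for strchr, operate on bytes.  */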
# ifdef USE_AS_WCSCHR
#  define VPBROADCAST	vpbroadcastd
#  define VPCMPEQ	vpcmpeqd
#  define CHAR_REG	esi
# else
#  define VPBROADCAST	vpbroadcastb
#  define VPCMPEQ	vpcmpeqb
#  define CHAR_REG	sil
# endif

# ifndef VZEROUPPER
#  define VZEROUPPER	vzeroupper
# endif

# define VEC_SIZE 32
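
/* The core idea, shown as a rough scalar sketch for orientation (an
   illustrative model only, not code that is assembled here; `s', `c'
   and `i' are hypothetical names):

       for (size_t i = 0; ; i++)
	 if (s[i] == c || s[i] == '\0')
	   return s[i] == c ? (char *) s + i : NULL;

   The vectorized version below performs both comparisons on VEC_SIZE
   bytes at once: VPCMPEQ against YMM0 (CHAR broadcast to every lane)
   and against YMM9 (all zeroes), ORs the two results, and collapses
   them into a bit mask with vpmovmskb; tzcnt on that mask recovers i.
   strchrnul differs only in returning s + i for the null byte too.  */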

	.section .text.avx,"ax",@progbits
ENTRY (STRCHR)
	movl	%edi, %ecx
	/* Broadcast CHAR to YMM0.  */
	vmovd	%esi, %xmm0
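	/* YMM9 is all zeroes; comparing against it finds the null
	   terminator.  */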
	vpxor	%xmm9, %xmm9, %xmm9
	VPBROADCAST %xmm0, %ymm0
	/* Check if we may cross page boundary with one vector load.  */
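	/* %ecx holds the position within a 2 * VEC_SIZE block.  If it
	   is at most VEC_SIZE, the unaligned 32-byte load below stays
	   inside one 64-byte-aligned block, which can never straddle a
	   page; otherwise take the conservative page-cross path.  */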
	andl	$(2 * VEC_SIZE - 1), %ecx
	cmpl	$VEC_SIZE, %ecx
	ja	L(cross_page_boundary)

	/* Check the first VEC_SIZE bytes.  Search for both CHAR and the
	   null byte.  */
	vmovdqu	(%rdi), %ymm8
	VPCMPEQ	%ymm8, %ymm0, %ymm1
	VPCMPEQ	%ymm8, %ymm9, %ymm2
	vpor	%ymm1, %ymm2, %ymm1
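	/* Bit i of %eax is set iff byte i of the vector belongs to a
	   match (CHAR or the terminator).  */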
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x0)

	/* Align data for aligned loads in the loop.  */
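	/* Round %rdi up to the next VEC_SIZE boundary.  Everything
	   before it has already been checked; a few bytes may be
	   re-scanned, which is harmless.  */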
	addq	$VEC_SIZE, %rdi
	andl	$(VEC_SIZE - 1), %ecx
	andq	$-VEC_SIZE, %rdi

	jmp	L(more_4x_vec)

	.p2align 4
L(cross_page_boundary):
	andl	$(VEC_SIZE - 1), %ecx
	andq	$-VEC_SIZE, %rdi
	vmovdqu	(%rdi), %ymm8
	VPCMPEQ	%ymm8, %ymm0, %ymm1
	VPCMPEQ	%ymm8, %ymm9, %ymm2
	vpor	%ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %eax
	/* Remove the leading bytes.  */
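	/* %rdi was rounded down to VEC_SIZE alignment, so the low %cl
	   mask bits describe bytes before the real string start; shift
	   them out.  The arithmetic shift may smear bit 31 upwards,
	   but tzcnt only looks at the lowest set bit, and bit 31, when
	   set, is always a genuine match.  */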
	sarl	%cl, %eax
	testl	%eax, %eax
	jz	L(aligned_more)
	/* Found CHAR or the null byte.  */
	tzcntl	%eax, %eax
	addq	%rcx, %rax
# ifdef USE_AS_STRCHRNUL
	addq	%rdi, %rax
# else
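	/* For strchr, the hit may be the null terminator rather than
	   CHAR; in that case return NULL instead of the position.  */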
	xorl	%edx, %edx
	leaq	(%rdi, %rax), %rax
	cmp	(%rax), %CHAR_REG
	cmovne	%rdx, %rax
# endif
	VZEROUPPER
	ret

	.p2align 4
L(aligned_more):
	addq	$VEC_SIZE, %rdi

L(more_4x_vec):
	/* Check the next 4 * VEC_SIZE bytes, one VEC_SIZE at a time,
	   since data is only aligned to VEC_SIZE.  */
	vmovdqa	(%rdi), %ymm8
	VPCMPEQ	%ymm8, %ymm0, %ymm1
	VPCMPEQ	%ymm8, %ymm9, %ymm2
	vpor	%ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x0)

	vmovdqa	VEC_SIZE(%rdi), %ymm8
	VPCMPEQ	%ymm8, %ymm0, %ymm1
	VPCMPEQ	%ymm8, %ymm9, %ymm2
	vpor	%ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)

	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm8
	VPCMPEQ	%ymm8, %ymm0, %ymm1
	VPCMPEQ	%ymm8, %ymm9, %ymm2
	vpor	%ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x2)

	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
	VPCMPEQ	%ymm8, %ymm0, %ymm1
	VPCMPEQ	%ymm8, %ymm9, %ymm2
	vpor	%ymm1, %ymm2, %ymm1
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x3)

	addq	$(VEC_SIZE * 4), %rdi

	/* Align data to 4 * VEC_SIZE.  */
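	/* Rounding %rdi down may step back over bytes that were just
	   checked; they contain no CHAR or NUL, so re-scanning them in
	   the loop is harmless.  */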
	movq	%rdi, %rcx
	andl	$(4 * VEC_SIZE - 1), %ecx
	andq	$-(4 * VEC_SIZE), %rdi

	.p2align 4
L(loop_4x_vec):
	/* Compare 4 * VEC at a time forward.  */
	vmovdqa	(%rdi), %ymm5
	vmovdqa	VEC_SIZE(%rdi), %ymm6
	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm7
	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8

	VPCMPEQ	%ymm5, %ymm0, %ymm1
	VPCMPEQ	%ymm6, %ymm0, %ymm2
	VPCMPEQ	%ymm7, %ymm0, %ymm3
	VPCMPEQ	%ymm8, %ymm0, %ymm4

	VPCMPEQ	%ymm5, %ymm9, %ymm5
	VPCMPEQ	%ymm6, %ymm9, %ymm6
	VPCMPEQ	%ymm7, %ymm9, %ymm7
	VPCMPEQ	%ymm8, %ymm9, %ymm8

	vpor	%ymm1, %ymm5, %ymm1
	vpor	%ymm2, %ymm6, %ymm2
	vpor	%ymm3, %ymm7, %ymm3
	vpor	%ymm4, %ymm8, %ymm4

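	/* Fold the four per-vector results into one so a single branch
	   tests all 4 * VEC_SIZE bytes.  */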
	vpor	%ymm1, %ymm2, %ymm5
	vpor	%ymm3, %ymm4, %ymm6

	vpor	%ymm5, %ymm6, %ymm5

	vpmovmskb %ymm5, %eax
	testl	%eax, %eax
	jnz	L(4x_vec_end)

	addq	$(VEC_SIZE * 4), %rdi

	jmp	L(loop_4x_vec)

	.p2align 4
L(first_vec_x0):
	/* Found CHAR or the null byte.  */
	tzcntl	%eax, %eax
# ifdef USE_AS_STRCHRNUL
	addq	%rdi, %rax
# else
	xorl	%edx, %edx
	leaq	(%rdi, %rax), %rax
	cmp	(%rax), %CHAR_REG
	cmovne	%rdx, %rax
# endif
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x1):
	tzcntl	%eax, %eax
# ifdef USE_AS_STRCHRNUL
	addq	$VEC_SIZE, %rax
	addq	%rdi, %rax
# else
	xorl	%edx, %edx
	leaq	VEC_SIZE(%rdi, %rax), %rax
	cmp	(%rax), %CHAR_REG
	cmovne	%rdx, %rax
# endif
	VZEROUPPER
	ret

	.p2align 4
L(first_vec_x2):
	tzcntl	%eax, %eax
# ifdef USE_AS_STRCHRNUL
	addq	$(VEC_SIZE * 2), %rax
	addq	%rdi, %rax
# else
	xorl	%edx, %edx
	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
	cmp	(%rax), %CHAR_REG
	cmovne	%rdx, %rax
# endif
	VZEROUPPER
	ret

	.p2align 4
L(4x_vec_end):
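	/* At least one of the four vectors holds CHAR or NUL; re-test
	   the per-vector masks to find the first one that hit.  */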
	vpmovmskb %ymm1, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x0)
	vpmovmskb %ymm2, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x1)
	vpmovmskb %ymm3, %eax
	testl	%eax, %eax
	jnz	L(first_vec_x2)
	vpmovmskb %ymm4, %eax
	testl	%eax, %eax
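	/* Fall through: if the first three vectors were clean, the
	   match must be in the fourth.  */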
L(first_vec_x3):
	tzcntl	%eax, %eax
# ifdef USE_AS_STRCHRNUL
	addq	$(VEC_SIZE * 3), %rax
	addq	%rdi, %rax
# else
	xorl	%edx, %edx
	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
	cmp	(%rax), %CHAR_REG
	cmovne	%rdx, %rax
# endif
	VZEROUPPER
	ret

END (STRCHR)
#endif