1/* Copyright (C) 2011-2018 Free Software Foundation, Inc.
2 Contributed by Intel Corporation.
3 This file is part of the GNU C Library.
4
5 The GNU C Library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Lesser General Public
7 License as published by the Free Software Foundation; either
8 version 2.1 of the License, or (at your option) any later version.
9
10 The GNU C Library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Lesser General Public License for more details.
14
15 You should have received a copy of the GNU Lesser General Public
16 License along with the GNU C Library; if not, see
17 <http://www.gnu.org/licenses/>. */
18
19#include <sysdep.h>
20
#ifdef USE_AS_WMEMCHR
/* Build as wmemchr: compare whole 4-byte wchar_t lanes.  */
# define MEMCHR wmemchr
# define PCMPEQ pcmpeqd
#else
/* Build as memchr: compare individual byte lanes.  */
# define MEMCHR memchr
# define PCMPEQ pcmpeqb
#endif
28
/* Fast SSE2 version using pmaxub and a 64-byte loop.  */
30
	.text

/* void *memchr (const void *s, int c, size_t n)
   (with USE_AS_WMEMCHR: wchar_t *wmemchr (const wchar_t *s, wchar_t c,
   size_t n)).

   ABI (SysV AMD64): rdi = S, esi = C, rdx = N (bytes for memchr,
   wide characters for wmemchr).  Returns in rax a pointer to the
   first occurrence of C within the first N elements of S, or NULL.

   Strategy: broadcast C into every lane of xmm1, then scan S with
   16-byte PCMPEQ/pmovmskb probes.  Once rdi is 64-byte aligned, a
   64-byte unrolled loop merges four compare results with pmaxub so a
   single pmovmskb/test per iteration detects a match anywhere in the
   64 bytes.  Throughout, rdx tracks the remaining length (offset by
   -64 inside the main scan) so tail matches past N return NULL.  */
ENTRY(MEMCHR)
	movd	%esi, %xmm1		/* Low lane of xmm1 = C.  */
	mov	%edi, %ecx		/* ecx = low bits of S, for alignment checks.  */

#ifdef USE_AS_WMEMCHR
	test	%RDX_LP, %RDX_LP
	jz	L(return_null)		/* N == 0: nothing to search.  */
	shl	$2, %RDX_LP		/* Convert wchar_t count to byte count.  */
#else
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	movl	%edx, %edx
# endif
	punpcklbw %xmm1, %xmm1		/* Duplicate C byte into 16-bit lanes.  */
	test	%RDX_LP, %RDX_LP
	jz	L(return_null)		/* N == 0: nothing to search.  */
	punpcklbw %xmm1, %xmm1		/* 16-bit lanes -> 32-bit lanes.  */
#endif

	and	$63, %ecx		/* ecx = S mod 64 (cache-line offset).  */
	pshufd	$0, %xmm1, %xmm1	/* Broadcast C to all 16 bytes of xmm1.  */

	/* If S is within the last 15 bytes of a 64-byte line, an
	   unaligned 16-byte load could cross into an unmapped page;
	   take the aligned-load path instead.  */
	cmp	$48, %ecx
	ja	L(crosscache)

	movdqu	(%rdi), %xmm0		/* Safe unaligned probe of bytes 0..15.  */
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax		/* eax = per-byte match bitmask.  */
	test	%eax, %eax

	jnz	L(matches_1)		/* Match found; still must bounds-check it.  */
	sub	$16, %rdx
	jbe	L(return_null)		/* Entire N bytes already checked.  */
	add	$16, %rdi
	/* Round rdi down to a 16-byte boundary; the re-covered bytes
	   (S mod 16) were already checked, so add them back to rdx to
	   keep the remaining-length accounting exact.  */
	and	$15, %ecx
	and	$-16, %rdi
	add	%rcx, %rdx
	sub	$64, %rdx		/* rdx now holds remaining - 64.  */
	jbe	L(exit_loop)
	jmp	L(loop_prolog)

	.p2align 4
L(crosscache):
	/* Aligned-load path: round rdi down to 16, load the full
	   aligned chunk, and discard mask bits for the cl bytes that
	   precede S.  */
	and	$15, %ecx
	and	$-16, %rdi
	movdqa	(%rdi), %xmm0

	PCMPEQ	%xmm1, %xmm0
/* Check if there is a match.  */
	pmovmskb %xmm0, %eax
/* Remove the leading bytes.  */
	sar	%cl, %eax
	test	%eax, %eax
	je	L(unaligned_no_match)
/* Check which byte is a match.  */
	bsf	%eax, %eax		/* rax = offset of match from S.  */

	sub	%rax, %rdx
	jbe	L(return_null)		/* Match lies beyond the first N bytes.  */
	add	%rdi, %rax
	add	%rcx, %rax		/* Re-apply the alignment offset.  */
	ret

	.p2align 4
L(unaligned_no_match):
	/* "rcx" is less than 16.  Calculate "rdx + rcx - 16" by using
	   "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to avoid
	   possible addition overflow.  */
	neg	%rcx
	add	$16, %rcx
	sub	%rcx, %rdx
	jbe	L(return_null)
	add	$16, %rdi
	sub	$64, %rdx		/* rdx now holds remaining - 64.  */
	jbe	L(exit_loop)

	.p2align 4
L(loop_prolog):
	/* Probe the next 64 bytes one aligned 16-byte chunk at a time.
	   Precondition: rdi is 16-byte aligned and rdx (remaining - 64)
	   is positive, so all four loads are within bounds.  */
	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	PCMPEQ	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	48(%rdi), %xmm4
	PCMPEQ	%xmm1, %xmm4
	add	$64, %rdi		/* Advance past this 64-byte block.  */
	pmovmskb %xmm4, %eax
	test	%eax, %eax
	jnz	L(matches0)		/* Match is in the block just behind rdi.  */

	test	$0x3f, %rdi
	jz	L(align64_loop)		/* rdi already 64-byte aligned.  */

	sub	$64, %rdx
	jbe	L(exit_loop)

	/* Second 64-byte probe; after it rdi can be rounded down to a
	   64-byte boundary without skipping any unchecked bytes.  */
	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	PCMPEQ	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	48(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax

	add	$64, %rdi
	test	%eax, %eax
	jnz	L(matches0)

	/* Round rdi down to a 64-byte boundary; add the re-covered
	   (already checked) byte count back into rdx.  */
	mov	%rdi, %rcx
	and	$-64, %rdi
	and	$63, %ecx
	add	%rcx, %rdx

	.p2align 4
L(align64_loop):
	sub	$64, %rdx
	jbe	L(exit_loop)
	movdqa	(%rdi), %xmm0
	movdqa	16(%rdi), %xmm2
	movdqa	32(%rdi), %xmm3
	movdqa	48(%rdi), %xmm4

	PCMPEQ	%xmm1, %xmm0
	PCMPEQ	%xmm1, %xmm2
	PCMPEQ	%xmm1, %xmm3
	PCMPEQ	%xmm1, %xmm4

	/* PCMPEQ yields 0x00/0xff per byte, so pmaxub acts as a byte-wise
	   OR: fold the four results into xmm4 so one pmovmskb/test covers
	   all 64 bytes per iteration.  */
	pmaxub	%xmm0, %xmm3
	pmaxub	%xmm2, %xmm4
	pmaxub	%xmm3, %xmm4
	pmovmskb %xmm4, %eax

	add	$64, %rdi

	test	%eax, %eax
	jz	L(align64_loop)

	sub	$64, %rdi		/* Back up to the block containing the match.  */

	/* xmm0/xmm2 still hold the compare results for chunks 0 and 1;
	   xmm3/xmm4 were clobbered by pmaxub, so chunks 2 and 3 are
	   re-compared from memory below.  */
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3

	PCMPEQ	48(%rdi), %xmm1		/* xmm1 (C) is dead after this point.  */
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	/* By elimination the match is in the last 16 bytes.  */
	pmovmskb %xmm1, %eax
	bsf	%eax, %eax
	lea	48(%rdi, %rax), %rax
	ret

	.p2align 4
L(exit_loop):
	/* Tail: edx = remaining - 64, i.e. at most 64 unchecked bytes
	   remain.  Probe chunk by chunk, bounds-checking each hit.  */
	add	$32, %edx
	jle	L(exit_loop_32)		/* 32 or fewer bytes left.  */

	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches)

	movdqa	16(%rdi), %xmm2
	PCMPEQ	%xmm1, %xmm2
	pmovmskb %xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	32(%rdi), %xmm3
	PCMPEQ	%xmm1, %xmm3
	pmovmskb %xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32_1)		/* Hit in a partially-valid chunk: bounds-check.  */
	sub	$16, %edx
	jle	L(return_null)

	PCMPEQ	48(%rdi), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches48_1)		/* Hit in the final chunk: bounds-check.  */
	xor	%eax, %eax		/* No match: return NULL.  */
	ret

	.p2align 4
L(exit_loop_32):
	add	$32, %edx		/* edx = remaining bytes (1..32).  */
	movdqa	(%rdi), %xmm0
	PCMPEQ	%xmm1, %xmm0
	pmovmskb %xmm0, %eax
	test	%eax, %eax
	jnz	L(matches_1)		/* Bounds-check the hit.  */
	sub	$16, %edx
	jbe	L(return_null)

	PCMPEQ	16(%rdi), %xmm1
	pmovmskb %xmm1, %eax
	test	%eax, %eax
	jnz	L(matches16_1)		/* Bounds-check the hit.  */
	xor	%eax, %eax		/* No match: return NULL.  */
	ret

	.p2align 4
L(matches0):
	/* Match in the 16 bytes ending at rdi (rdi was pre-advanced).  */
	bsf	%eax, %eax		/* Index of first set mask bit.  */
	lea	-16(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches):
	/* Match at offset eax within the chunk at rdi; known in-bounds.  */
	bsf	%eax, %eax
	add	%rdi, %rax
	ret

	.p2align 4
L(matches16):
	/* Match within the chunk at rdi+16; known in-bounds.  */
	bsf	%eax, %eax
	lea	16(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches32):
	/* Match within the chunk at rdi+32; known in-bounds.  */
	bsf	%eax, %eax
	lea	32(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches_1):
	/* Match at rdi+eax, but the chunk may extend past S+N:
	   return NULL if the match offset is not within rdx bytes.  */
	bsf	%eax, %eax
	sub	%rax, %rdx
	jbe	L(return_null)
	add	%rdi, %rax
	ret

	.p2align 4
L(matches16_1):
	/* Bounds-checked match within the chunk at rdi+16.  */
	bsf	%eax, %eax
	sub	%rax, %rdx
	jbe	L(return_null)
	lea	16(%rdi, %rax), %rax
	ret

	.p2align 4
L(matches32_1):
	/* Bounds-checked match within the chunk at rdi+32.  */
	bsf	%eax, %eax
	sub	%rax, %rdx
	jbe	L(return_null)
	lea	32(%rdi, %rax), %rax
	ret

	.p2align 4
L(matches48_1):
	/* Bounds-checked match within the chunk at rdi+48.  */
	bsf	%eax, %eax
	sub	%rax, %rdx
	jbe	L(return_null)
	lea	48(%rdi, %rax), %rax
	ret

	.p2align 4
L(return_null):
	xor	%eax, %eax		/* Return NULL.  */
	ret
END(MEMCHR)
330
#ifndef USE_AS_WMEMCHR
/* Export __memchr as an alias and declare the hidden builtin
   definition — presumably for namespace-clean intra-libc calls;
   both macros come from glibc's internal headers.  */
strong_alias (memchr, __memchr)
libc_hidden_builtin_def(memchr)
#endif
335