/* Fast SSE2 memrchr using a 64-byte loop and the pmaxub instruction.

   Copyright (C) 2011-2016 Free Software Foundation, Inc.
   Contributed by Intel Corporation.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <sysdep.h>

	.text

/* void *memrchr (const void *s, int c, size_t n)

   Return the address of the last byte equal to the low byte of C in
   the N bytes starting at S, or 0 if no such byte exists.

   On entry:  %rdi = S, %esi = C, %rdx = N.
   On exit:   %rax = address of the match, or 0.
   Modifies:  %rax, %rcx, %rdx, %rdi, %r8, %xmm0-%xmm4, flags.

   Outline:
     - N <= 16 is handled by L(length_less16) using aligned 16-byte
       loads only, with the out-of-range bits masked off.
     - Otherwise the last 16 bytes are checked with one unaligned
       load, then the buffer is scanned backwards in 64-byte blocks
       (two unrolled blocks on a 16-byte boundary, then a loop on a
       64-byte boundary that merges the four compare results with
       pmaxub so that one pmovmskb/test covers all 64 bytes).
     - L(exit_loop) handles the final block, which may start below S;
       the L(matchesNN_1) exits reject matches that lie before S.

   Throughout the backward scan %xmm1 holds the search byte replicated
   16 times and %rdi points at the 64-byte block being examined.  */
ENTRY (__memrchr)
	/* Only the low byte of C survives the broadcast below, so a
	   32-bit move is sufficient.  */
	movd	%esi, %xmm1

	sub	$16, %RDX_LP
	jbe	L(length_less16)	/* N <= 16.  */

	/* Replicate the search byte into the low dword of %xmm1 ...  */
	punpcklbw	%xmm1, %xmm1
	punpcklbw	%xmm1, %xmm1

	add	%RDX_LP, %RDI_LP	/* %rdi = S + N - 16.  */
	/* ... and then into all four dwords.  */
	pshufd	$0, %xmm1, %xmm1

	/* Examine the last 16 bytes of the buffer (unaligned load).  */
	movdqu	(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0

/* Check if there is a match.  */
	pmovmskb	%xmm0, %eax
	test	%eax, %eax
	jnz	L(matches0)

	/* Move down to the preceding 64-byte block.  From here on
	   %rdx = (%rdi + 64) - S, i.e. the number of buffer bytes below
	   the end of the block at %rdi.  */
	sub	$64, %rdi
	mov	%rdi, %rcx
	and	$15, %rcx
	jz	L(loop_prolog)

	/* %rdi is not 16-byte aligned: round it up to the next
	   16-byte boundary (re-covering part of the bytes just checked)
	   and grow %rdx by the same 16 - %rcx bytes.  */
	add	$16, %rdi
	add	$16, %rdx
	and	$-16, %rdi
	sub	%rcx, %rdx

	.p2align 4
L(loop_prolog):
	/* First unrolled block.  After the subtraction %rdx = %rdi - S;
	   if that is not positive the block reaches S and needs the
	   bounds-checked tail code.  */
	sub	$64, %rdx
	jbe	L(exit_loop)

	movdqa	48(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb	%xmm0, %eax
	test	%eax, %eax
	jnz	L(matches48)

	movdqa	32(%rdi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb	%xmm2, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	16(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb	%xmm3, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	(%rdi), %xmm4
	pcmpeqb	%xmm1, %xmm4
	pmovmskb	%xmm4, %eax
	test	%eax, %eax
	jnz	L(matches0)

	/* Second unrolled block, same pattern one block lower.  */
	sub	$64, %rdi
	sub	$64, %rdx
	jbe	L(exit_loop)

	movdqa	48(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb	%xmm0, %eax
	test	%eax, %eax
	jnz	L(matches48)

	movdqa	32(%rdi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb	%xmm2, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	16(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb	%xmm3, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb	%xmm3, %eax
	test	%eax, %eax
	jnz	L(matches0)

	/* Put %rdi on a 64-byte boundary for the main loop.  If it is
	   not already there, round it up (re-covering bytes that were
	   just checked) and grow %rdx by the same 64 - %rcx bytes so
	   that %rdx = %rdi - S still holds.  */
	mov	%rdi, %rcx
	and	$63, %rcx
	jz	L(align64_loop)

	add	$64, %rdi
	add	$64, %rdx
	and	$-64, %rdi
	sub	%rcx, %rdx

	.p2align 4
L(align64_loop):
	sub	$64, %rdi
	sub	$64, %rdx
	jbe	L(exit_loop)		/* Block reaches S: do the tail.  */

	movdqa	(%rdi), %xmm0
	movdqa	16(%rdi), %xmm2
	movdqa	32(%rdi), %xmm3
	movdqa	48(%rdi), %xmm4

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm1, %xmm2
	pcmpeqb	%xmm1, %xmm3
	pcmpeqb	%xmm1, %xmm4

	/* Every compare byte is 0x00 or 0xff, so the unsigned maximum
	   behaves as a bitwise OR of the four results.  %xmm3 and
	   %xmm4 are left intact for the per-chunk checks below.  */
	pmaxub	%xmm3, %xmm0
	pmaxub	%xmm4, %xmm2
	pmaxub	%xmm0, %xmm2
	pmovmskb	%xmm2, %eax

	test	%eax, %eax
	jz	L(align64_loop)

	/* A match exists somewhere in this block; find the highest
	   16-byte chunk that contains one.  */
	pmovmskb	%xmm4, %eax
	test	%eax, %eax
	jnz	L(matches48)

	pmovmskb	%xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	/* %xmm0 and %xmm2 were overwritten by pmaxub, so the two low
	   chunks are compared again.  %xmm1 may be destroyed here
	   because every path from this point returns.  */
	movdqa	16(%rdi), %xmm2

	pcmpeqb	%xmm1, %xmm2
	pcmpeqb	(%rdi), %xmm1

	pmovmskb	%xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	/* The match must be in the lowest chunk, so the mask is
	   non-zero and bsr is well defined.  */
	pmovmskb	%xmm1, %eax
	bsr	%eax, %eax

	add	%rdi, %rax
	ret

	/* Final block.  After the addition %rdx is the number of bytes
	   at the top of the 64-byte block at %rdi that belong to the
	   buffer; bytes below %rdi + 64 - %rdx lie before S.  */
	.p2align 4
L(exit_loop):
	add	$64, %rdx
	cmp	$32, %rdx
	jbe	L(exit_loop_32)

	/* More than 32 bytes remain, so the two upper chunks are
	   entirely inside the buffer.  */
	movdqa	48(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb	%xmm0, %eax
	test	%eax, %eax
	jnz	L(matches48)

	movdqa	32(%rdi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb	%xmm2, %eax
	test	%eax, %eax
	jnz	L(matches32)

	/* The chunk at offset 16 may straddle S: bounds-checked exit.  */
	movdqa	16(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb	%xmm3, %eax
	test	%eax, %eax
	jnz	L(matches16_1)
	cmp	$48, %rdx
	jbe	L(return_null)		/* Nothing of the buffer below.  */

	pcmpeqb	(%rdi), %xmm1
	pmovmskb	%xmm1, %eax
	test	%eax, %eax
	jnz	L(matches0_1)
	xor	%eax, %eax
	ret

	/* At most 32 bytes remain: only the two upper chunks can hold
	   buffer bytes, and either may straddle S.  */
	.p2align 4
L(exit_loop_32):
	movdqa	48(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb	%xmm0, %eax
	test	%eax, %eax
	jnz	L(matches48_1)
	cmp	$16, %rdx
	jbe	L(return_null)

	pcmpeqb	32(%rdi), %xmm1
	pmovmskb	%xmm1, %eax
	test	%eax, %eax
	jnz	L(matches32_1)
	xor	%eax, %eax
	ret

	/* L(matchesNN): %eax is a non-zero match mask for the chunk at
	   %rdi + NN, which lies wholly inside the buffer.  The highest
	   set bit is the last match; return its address.  */
	.p2align 4
L(matches0):
	bsr	%eax, %eax
	add	%rdi, %rax
	ret

	.p2align 4
L(matches16):
	bsr	%eax, %eax
	lea	16(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches32):
	bsr	%eax, %eax
	lea	32(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches48):
	bsr	%eax, %eax
	lea	48(%rax, %rdi), %rax
	ret

	/* L(matchesNN_1): as above, but the chunk at %rdi + NN may begin
	   before S.  With %rdx = buffer bytes in the block, the match at
	   block offset NN + index is inside the buffer only when
	   %rdx - (64 - NN) + index >= 0; otherwise every match in the
	   chunk precedes S and the result is NULL.  */
	.p2align 4
L(matches0_1):
	bsr	%eax, %eax
	sub	$64, %rdx
	add	%rax, %rdx
	jl	L(return_null)
	add	%rdi, %rax
	ret

	.p2align 4
L(matches16_1):
	bsr	%eax, %eax
	sub	$48, %rdx
	add	%rax, %rdx
	jl	L(return_null)
	lea	16(%rdi, %rax), %rax
	ret

	.p2align 4
L(matches32_1):
	bsr	%eax, %eax
	sub	$32, %rdx
	add	%rax, %rdx
	jl	L(return_null)
	lea	32(%rdi, %rax), %rax
	ret

	.p2align 4
L(matches48_1):
	bsr	%eax, %eax
	sub	$16, %rdx
	add	%rax, %rdx
	jl	L(return_null)
	lea	48(%rdi, %rax), %rax
	ret

	.p2align 4
L(return_null):
	xor	%eax, %eax		/* Also clears the upper half of %rax.  */
	ret

	/* N <= 16 and S is 16-byte aligned.  %edx = N, %xmm1 = search
	   byte replicated 16 times.  */
	.p2align 4
L(length_less16_offset0):
	test	%edx, %edx
	jz	L(return_null)		/* Empty buffer.  */

	mov	%dl, %cl
	pcmpeqb	(%rdi), %xmm1

	/* %edx = (1 << N) - 1: one bit per byte that is in the buffer.  */
	mov	$1, %edx
	sal	%cl, %edx
	sub	$1, %edx

	pmovmskb	%xmm1, %eax

	/* Drop matches past the end of the buffer; `and' sets ZF.  */
	and	%edx, %eax
	jz	L(return_null)

	bsr	%eax, %eax
	add	%rdi, %rax
	ret

	/* N <= 16.  %rdx still holds N - 16 from the entry code.  */
	.p2align 4
L(length_less16):
	/* Replicate the search byte across %xmm1 (see the entry code).  */
	punpcklbw	%xmm1, %xmm1
	punpcklbw	%xmm1, %xmm1

	add	$16, %rdx		/* Low bits of %rdx = N again.  */

	pshufd	$0, %xmm1, %xmm1

	mov	%rdi, %rcx
	and	$15, %rcx		/* %rcx = offset of S in its chunk.  */
	jz	L(length_less16_offset0)

	/* S is unaligned.  Work on the aligned 16-byte chunk(s) that
	   contain the buffer so that no load touches memory outside
	   them.  %r8 keeps the offset because %cl is reused as a shift
	   count below.  */
	mov	%cl, %dh
	mov	%rcx, %r8
	add	%dl, %dh		/* %dh = offset + N.  */
	and	$-16, %rdi

	sub	$16, %dh
	ja	L(length_less16_part2)	/* Buffer spills into next chunk.  */

	/* The whole buffer is inside the chunk at %rdi.  */
	pcmpeqb	(%rdi), %xmm1
	pmovmskb	%xmm1, %eax

	sar	%cl, %eax		/* Bit 0 now corresponds to S[0].  */
	mov	%dl, %cl

	/* %edx = (1 << N) - 1.  */
	mov	$1, %edx
	sal	%cl, %edx
	sub	$1, %edx

	/* Drop matches past the end of the buffer; `and' sets ZF.  */
	and	%edx, %eax
	jz	L(return_null)

	bsr	%eax, %eax
	add	%rdi, %rax
	add	%r8, %rax		/* Undo the shift by the offset.  */
	ret

	/* The buffer covers the top 16 - offset bytes of the chunk at
	   %rdi and the first %dh bytes of the chunk at %rdi + 16.  */
	.p2align 4
L(length_less16_part2):
	movdqa	16(%rdi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb	%xmm2, %eax

	/* %edx = (1 << %dh) - 1: buffer bytes in the upper chunk.  */
	mov	%dh, %cl
	mov	$1, %edx
	sal	%cl, %edx
	sub	$1, %edx

	/* Keep only in-buffer matches; `and' sets ZF.  */
	and	%edx, %eax
	jnz	L(length_less16_part2_return)

	/* No match in the upper chunk: try the lower one, where every
	   byte at or above the offset belongs to the buffer.  */
	pcmpeqb	(%rdi), %xmm1
	pmovmskb	%xmm1, %eax

	mov	%r8, %rcx
	sar	%cl, %eax		/* Discard bytes before S.  */
	test	%eax, %eax
	jz	L(return_null)

	bsr	%eax, %eax
	add	%rdi, %rax
	add	%r8, %rax
	ret

	.p2align 4
L(length_less16_part2_return):
	bsr	%eax, %eax
	lea	16(%rax, %rdi), %rax
	ret

END (__memrchr)
weak_alias (__memrchr, memrchr)