/* fast SSE2 memrchr with 64 byte loop, using the pmaxub instruction.

   Copyright (C) 2011-2018 Free Software Foundation, Inc.
   Contributed by Intel Corporation.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
20
#include <sysdep.h>

	.text

/*-----------------------------------------------------------------------
   void *__memrchr (const void *s, int c, size_t n)

   ABI:   SysV AMD64.  RDX_LP/RDI_LP are sysdep.h macros selecting the
          64-bit registers (or 32-bit ones on x32) for size_t/pointers.
   In:    rdi = s, esi = c (only the low byte is used), rdx = n
   Out:   rax = pointer to the LAST byte equal to (unsigned char) c in
          s[0..n-1], or NULL if no such byte exists.
   Clob:  rax, rcx, rdx, rsi, rdi, xmm0-xmm4, flags (all caller-saved).

   Strategy: scan backwards from s + n.  xmm1 holds c replicated into
   all 16 byte lanes; each 16-byte chunk is compared with pcmpeqb and
   collapsed to a 16-bit mask with pmovmskb, so `bsr' on the mask gives
   the highest-addressed (i.e. last) matching byte in the chunk.  */
ENTRY (__memrchr)
	movd	%esi, %xmm1		/* xmm1[0] = c.  */

	sub	$16, %RDX_LP		/* n -= 16.  */
	jbe	L(length_less16)	/* n <= 16: short-buffer path.  */

	/* Broadcast c into all 16 bytes of xmm1 (with pshufd below).  */
	punpcklbw	%xmm1, %xmm1
	punpcklbw	%xmm1, %xmm1

	add	%RDX_LP, %RDI_LP	/* rdi = s + n - 16: last 16 bytes.  */
	pshufd	$0, %xmm1, %xmm1

	/* Check the (possibly unaligned) final 16 bytes first.  */
	movdqu	(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0

/* Check if there is a match.  */
	pmovmskb	%xmm0, %eax
	test	%eax, %eax
	jnz	L(matches0)

	/* Step back one 64-byte block, then round rdi down to a 16-byte
	   boundary, folding the misalignment into the remaining count
	   in rdx so the in-bounds checks stay exact.  */
	sub	$64, %rdi
	mov	%edi, %ecx
	and	$15, %ecx
	jz	L(loop_prolog)

	add	$16, %rdi
	add	$16, %rdx
	and	$-16, %rdi
	sub	%rcx, %rdx

	.p2align 4
L(loop_prolog):
	/* Prolog: up to two 64-byte iterations checked one aligned
	   16-byte chunk at a time, highest address first, before the
	   64-byte-aligned main loop takes over.  */
	sub	$64, %rdx
	jbe	L(exit_loop)

	movdqa	48(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb	%xmm0, %eax
	test	%eax, %eax
	jnz	L(matches48)

	movdqa	32(%rdi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb	%xmm2, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	16(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb	%xmm3, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	(%rdi), %xmm4
	pcmpeqb	%xmm1, %xmm4
	pmovmskb	%xmm4, %eax
	test	%eax, %eax
	jnz	L(matches0)

	/* Second prolog iteration, same pattern, 64 bytes lower.  */
	sub	$64, %rdi
	sub	$64, %rdx
	jbe	L(exit_loop)

	movdqa	48(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb	%xmm0, %eax
	test	%eax, %eax
	jnz	L(matches48)

	movdqa	32(%rdi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb	%xmm2, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	16(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb	%xmm3, %eax
	test	%eax, %eax
	jnz	L(matches16)

	movdqa	(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb	%xmm3, %eax
	test	%eax, %eax
	jnz	L(matches0)

	/* Round rdi down to a 64-byte boundary for the main loop,
	   again folding the adjustment into rdx.  */
	mov	%edi, %ecx
	and	$63, %ecx
	jz	L(align64_loop)

	add	$64, %rdi
	add	$64, %rdx
	and	$-64, %rdi
	sub	%rcx, %rdx

	.p2align 4
L(align64_loop):
	/* Main loop: 64 aligned bytes per iteration.  pcmpeqb leaves
	   0xff in matching lanes, so pmaxub-reducing the four compare
	   results is nonzero iff ANY of the 64 bytes matched — one
	   pmovmskb+test per iteration instead of four.  */
	sub	$64, %rdi
	sub	$64, %rdx
	jbe	L(exit_loop)

	movdqa	(%rdi), %xmm0
	movdqa	16(%rdi), %xmm2
	movdqa	32(%rdi), %xmm3
	movdqa	48(%rdi), %xmm4

	pcmpeqb	%xmm1, %xmm0
	pcmpeqb	%xmm1, %xmm2
	pcmpeqb	%xmm1, %xmm3
	pcmpeqb	%xmm1, %xmm4

	pmaxub	%xmm3, %xmm0
	pmaxub	%xmm4, %xmm2
	pmaxub	%xmm0, %xmm2
	pmovmskb	%xmm2, %eax

	test	%eax, %eax
	jz	L(align64_loop)

	/* Some chunk matched: locate it, highest address first.  */
	pmovmskb	%xmm4, %eax
	test	%eax, %eax
	jnz	L(matches48)

	pmovmskb	%xmm3, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	16(%rdi), %xmm2		/* xmm2 was clobbered by pmaxub
					   above; reload and recompare.  */

	pcmpeqb	%xmm1, %xmm2
	pcmpeqb	(%rdi), %xmm1

	pmovmskb	%xmm2, %eax
	test	%eax, %eax
	jnz	L(matches16)

	/* Match must be in the lowest chunk (rdi+0..15).  */
	pmovmskb	%xmm1, %eax
	bsr	%eax, %eax		/* Highest set bit = last match.  */

	add	%rdi, %rax
	ret

	.p2align 4
L(exit_loop):
	/* Fewer than 64 in-bounds bytes remain; edx = remaining count
	   after undoing the last subtraction.  Chunks that may straddle
	   the start of the buffer go to the bounds-checked *_1 exits.  */
	add	$64, %edx
	cmp	$32, %edx
	jbe	L(exit_loop_32)

	movdqa	48(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb	%xmm0, %eax
	test	%eax, %eax
	jnz	L(matches48)

	movdqa	32(%rdi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb	%xmm2, %eax
	test	%eax, %eax
	jnz	L(matches32)

	movdqa	16(%rdi), %xmm3
	pcmpeqb	%xmm1, %xmm3
	pmovmskb	%xmm3, %eax
	test	%eax, %eax
	jnz	L(matches16_1)
	cmp	$48, %edx
	jbe	L(return_null)

	pcmpeqb	(%rdi), %xmm1
	pmovmskb	%xmm1, %eax
	test	%eax, %eax
	jnz	L(matches0_1)
	xor	%eax, %eax
	ret

	.p2align 4
L(exit_loop_32):
	/* At most 32 bytes remain: same idea as above for the two
	   highest chunks only.  */
	movdqa	48(%rdi), %xmm0
	pcmpeqb	%xmm1, %xmm0
	pmovmskb	%xmm0, %eax
	test	%eax, %eax
	jnz	L(matches48_1)
	cmp	$16, %edx
	jbe	L(return_null)

	pcmpeqb	32(%rdi), %xmm1
	pmovmskb	%xmm1, %eax
	test	%eax, %eax
	jnz	L(matches32_1)
	xor	%eax, %eax
	ret

	/* matches0/16/32/48: a guaranteed-in-bounds match in the chunk
	   at rdi+0/16/32/48; return the highest matching address.  */
	.p2align 4
L(matches0):
	bsr	%eax, %eax
	add	%rdi, %rax
	ret

	.p2align 4
L(matches16):
	bsr	%eax, %eax
	lea	16(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches32):
	bsr	%eax, %eax
	lea	32(%rax, %rdi), %rax
	ret

	.p2align 4
L(matches48):
	bsr	%eax, %eax
	lea	48(%rax, %rdi), %rax
	ret

	/* matches0_1/16_1/32_1/48_1: match found in a chunk that may
	   extend below s; reject it if the matching byte lies before
	   the start of the buffer (rdx tracks the in-bounds slack).  */
	.p2align 4
L(matches0_1):
	bsr	%eax, %eax
	sub	$64, %rdx
	add	%rax, %rdx
	jl	L(return_null)		/* Match falls before s.  */
	add	%rdi, %rax
	ret

	.p2align 4
L(matches16_1):
	bsr	%eax, %eax
	sub	$48, %rdx
	add	%rax, %rdx
	jl	L(return_null)
	lea	16(%rdi, %rax), %rax
	ret

	.p2align 4
L(matches32_1):
	bsr	%eax, %eax
	sub	$32, %rdx
	add	%rax, %rdx
	jl	L(return_null)
	lea	32(%rdi, %rax), %rax
	ret

	.p2align 4
L(matches48_1):
	bsr	%eax, %eax
	sub	$16, %rdx
	add	%rax, %rdx
	jl	L(return_null)
	lea	48(%rdi, %rax), %rax
	ret

	.p2align 4
L(return_null):
	xor	%eax, %eax		/* No occurrence: return NULL.  */
	ret

	.p2align 4
L(length_less16_offset0):
	/* n <= 16 and s is 16-byte aligned: one compare, then keep
	   only the low n mask bits.  */
	test	%edx, %edx
	jz	L(return_null)		/* n == 0.  */

	mov	%dl, %cl
	pcmpeqb	(%rdi), %xmm1

	mov	$1, %edx
	sal	%cl, %edx
	sub	$1, %edx		/* edx = (1 << n) - 1.  */

	pmovmskb	%xmm1, %eax

	and	%edx, %eax
	test	%eax, %eax
	jz	L(return_null)

	bsr	%eax, %eax
	add	%rdi, %rax
	ret

	.p2align 4
L(length_less16):
	/* n <= 16.  Broadcast c, then read whole aligned chunks and
	   mask away bytes outside [s, s+n).  */
	punpcklbw	%xmm1, %xmm1
	punpcklbw	%xmm1, %xmm1

	add	$16, %edx		/* edx = n (undo the entry sub).  */

	pshufd	$0, %xmm1, %xmm1

	mov	%edi, %ecx
	and	$15, %ecx		/* ecx = misalignment of s.  */
	jz	L(length_less16_offset0)

	mov	%cl, %dh		/* dh = offset, dl = n ...  */
	mov	%ecx, %esi		/* ... offset also saved in esi.  */
	add	%dl, %dh		/* dh = offset + n.  */
	and	$-16, %rdi		/* Align s down to 16.  */

	sub	$16, %dh
	ja	L(length_less16_part2)	/* Range spans two chunks.  */

	/* Whole range within one aligned chunk.  */
	pcmpeqb	(%rdi), %xmm1
	pmovmskb	%xmm1, %eax

	sar	%cl, %eax		/* Drop mask bits before s.  */
	mov	%dl, %cl

	mov	$1, %edx
	sal	%cl, %edx
	sub	$1, %edx		/* Keep only the low n bits.  */

	and	%edx, %eax
	test	%eax, %eax
	jz	L(return_null)

	bsr	%eax, %eax
	add	%rdi, %rax
	add	%rsi, %rax		/* Undo the align-down shift.  */
	ret

	.p2align 4
L(length_less16_part2):
	/* Range spans two aligned chunks: check the high one first,
	   masking off bytes at or beyond s + n.  */
	movdqa	16(%rdi), %xmm2
	pcmpeqb	%xmm1, %xmm2
	pmovmskb	%xmm2, %eax

	mov	%dh, %cl		/* dh = bytes used in high chunk.  */
	mov	$1, %edx
	sal	%cl, %edx
	sub	$1, %edx

	and	%edx, %eax

	test	%eax, %eax
	jnz	L(length_less16_part2_return)

	/* Nothing above: check the low chunk, skipping bytes below s.  */
	pcmpeqb	(%rdi), %xmm1
	pmovmskb	%xmm1, %eax

	mov	%esi, %ecx
	sar	%cl, %eax
	test	%eax, %eax
	jz	L(return_null)

	bsr	%eax, %eax
	add	%rdi, %rax
	add	%rsi, %rax
	ret

	.p2align 4
L(length_less16_part2_return):
	bsr	%eax, %eax
	lea	16(%rax, %rdi), %rax
	ret

END (__memrchr)
weak_alias (__memrchr, memrchr)
381