1/* Fast SSE2 rawmemchr with a 64-byte loop using the pmaxub instruction.
2
3 Copyright (C) 2011-2018 Free Software Foundation, Inc.
4 Contributed by Intel Corporation.
5 This file is part of the GNU C Library.
6
7 The GNU C Library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU Lesser General Public
9 License as published by the Free Software Foundation; either
10 version 2.1 of the License, or (at your option) any later version.
11
12 The GNU C Library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
16
17 You should have received a copy of the GNU Lesser General Public
18 License along with the GNU C Library; if not, see
19 <http://www.gnu.org/licenses/>. */
20
21#include <sysdep.h>
22
23 .text
/* __rawmemchr: return the address of the first byte equal to a given value,
   scanning forward from a start address with no length limit.

   Inputs (first two System V AMD64 integer argument registers):
     %rdi = address at which the scan starts
     %rsi = value to look for; only its low 8 bits take part in the compare
   Output:
     %rax = address of the first matching byte at or after the start address
   Clobbers: %rax, %rcx, %rdi, %xmm0-%xmm4, flags.  No stack is used.

   This code has no count argument and no terminator check: it only stops
   when a matching byte is found.

   Outline:
     1. Broadcast the search byte to all 16 lanes of %xmm1.
     2. Check the first (possibly unaligned) 16 bytes, taking care that the
        load does not run past the current 64-byte block.
     3. Check one or two groups of 4 x 16 aligned bytes until %rdi can be
        rounded to a 64-byte boundary.
     4. Main loop: 64 aligned bytes per iteration, four compare results
        merged with pmaxub so a single pmovmskb/test decides the branch.
     5. Once the 64-byte block holding the match is known, find the chunk
        and the byte index inside it with bsf.

   NOTE(review): ENTRY, END and L() come from <sysdep.h>, which is not
   visible here; presumably they emit the function symbol/size directives
   and form file-local label names -- confirm against that header.  */
24ENTRY (__rawmemchr)
/* %xmm1 = search value from %rsi; %rcx = copy of the start pointer used
   below for the alignment checks.  */
25 movd %rsi, %xmm1
26 mov %rdi, %rcx
27
/* Two byte-interleaves of %xmm1 with itself replicate its lowest byte into
   the low four bytes (the low dword).  */
28 punpcklbw %xmm1, %xmm1
29 punpcklbw %xmm1, %xmm1
30
/* %rcx = offset of the start pointer inside its 64-byte block.
   pshufd $0 copies the low dword to all four dwords, so every one of the
   16 bytes of %xmm1 now holds the search byte.  */
31 and $63, %rcx
32 pshufd $0, %xmm1, %xmm1
33
/* With an offset above 48, a 16-byte load at %rdi would extend past the end
   of the 64-byte block (offset + 16 > 64); use the aligned path instead.  */
34 cmp $48, %rcx
35 ja L(crosscache)
36
/* Unaligned head: compare the 16 bytes starting exactly at %rdi.  */
37 movdqu (%rdi), %xmm0
38 pcmpeqb %xmm1, %xmm0
39/* Check if there is a match. */
40 pmovmskb %xmm0, %eax
41 test %eax, %eax
42
43 jnz L(matches)
/* No match: move %rdi to the next 16-byte boundary above the original
   pointer.  Every byte below that boundary was covered by the load above.  */
44 add $16, %rdi
45 and $-16, %rdi
46 jmp L(loop_prolog)
47
48 .p2align 4
49L(crosscache):
/* Aligned head: %rcx = misalignment of the start pointer within 16 bytes,
   %rdi = start pointer rounded down to a 16-byte boundary.  The aligned
   load therefore also reads %rcx bytes that precede the start pointer.  */
50 and $15, %rcx
51 and $-16, %rdi
52 movdqa (%rdi), %xmm0
53
54 pcmpeqb %xmm1, %xmm0
55/* Check if there is a match. */
56 pmovmskb %xmm0, %eax
57/* Remove the leading bytes. */
/* Shifting the mask right by %cl drops the bits of the bytes that lie
   before the start pointer; bit 0 now corresponds to the start pointer.  */
58 sar %cl, %eax
59 test %eax, %eax
60 je L(unaligned_no_match)
61/* Check which byte is a match. */
62 bsf %eax, %eax
63
/* Result = aligned base + misalignment + index relative to the start
   pointer.  */
64 add %rdi, %rax
65 add %rcx, %rax
66 ret
67
68 .p2align 4
69L(unaligned_no_match):
/* Nothing in the first aligned chunk at or after the start pointer:
   continue with the next 16-byte chunk.  */
70 add $16, %rdi
71
72 .p2align 4
73L(loop_prolog):
/* %rdi is 16-byte aligned here.  Check four consecutive 16-byte chunks
   (offsets 0, 16, 32, 48) one at a time; each branch target adds the
   matching chunk offset to the bsf index.  */
74 movdqa (%rdi), %xmm0
75 pcmpeqb %xmm1, %xmm0
76 pmovmskb %xmm0, %eax
77 test %eax, %eax
78 jnz L(matches)
79
80 movdqa 16(%rdi), %xmm2
81 pcmpeqb %xmm1, %xmm2
82 pmovmskb %xmm2, %eax
83 test %eax, %eax
84 jnz L(matches16)
85
86 movdqa 32(%rdi), %xmm3
87 pcmpeqb %xmm1, %xmm3
88 pmovmskb %xmm3, %eax
89 test %eax, %eax
90 jnz L(matches32)
91
/* Fourth chunk: %rdi is advanced by 64 before the branch, so the
   matches0 target compensates with a -16 displacement (-64 + 48).  */
92 movdqa 48(%rdi), %xmm4
93 pcmpeqb %xmm1, %xmm4
94 add $64, %rdi
95 pmovmskb %xmm4, %eax
96 test %eax, %eax
97 jnz L(matches0)
98
/* If %rdi already sits on a 64-byte boundary, enter the main loop now.  */
99 test $0x3f, %rdi
100 jz L(align64_loop)
101
/* Otherwise check another four 16-byte chunks the same way before
   rounding %rdi down to a 64-byte boundary.  */
102 movdqa (%rdi), %xmm0
103 pcmpeqb %xmm1, %xmm0
104 pmovmskb %xmm0, %eax
105 test %eax, %eax
106 jnz L(matches)
107
108 movdqa 16(%rdi), %xmm2
109 pcmpeqb %xmm1, %xmm2
110 pmovmskb %xmm2, %eax
111 test %eax, %eax
112 jnz L(matches16)
113
114 movdqa 32(%rdi), %xmm3
115 pcmpeqb %xmm1, %xmm3
116 pmovmskb %xmm3, %eax
117 test %eax, %eax
118 jnz L(matches32)
119
120 movdqa 48(%rdi), %xmm3
121 pcmpeqb %xmm1, %xmm3
122 pmovmskb %xmm3, %eax
123
/* As above, %rdi moves past the group before the branch to matches0.  */
124 add $64, %rdi
125 test %eax, %eax
126 jnz L(matches0)
127
/* Round %rdi down to a 64-byte boundary.  %rdi is 16-aligned but not
   64-aligned at this point, so this steps back 16, 32 or 48 bytes; those
   bytes were already checked above and are simply scanned again.  */
128 and $-64, %rdi
129
130 .p2align 4
131L(align64_loop):
/* Main loop: %rdi is 64-byte aligned.  Load and compare four 16-byte
   chunks per iteration.  */
132 movdqa (%rdi), %xmm0
133 movdqa 16(%rdi), %xmm2
134 movdqa 32(%rdi), %xmm3
135 movdqa 48(%rdi), %xmm4
136
137 pcmpeqb %xmm1, %xmm0
138 pcmpeqb %xmm1, %xmm2
139 pcmpeqb %xmm1, %xmm3
140 pcmpeqb %xmm1, %xmm4
141
/* Every compare result byte is 0x00 or 0xFF, so the unsigned byte maximum
   behaves as a bitwise OR.  %xmm4 ends up non-zero iff any of the 64 bytes
   matched.  %xmm3 and %xmm4 are overwritten by the merge; %xmm0 and %xmm2
   keep the individual results of chunks 0 and 1.  */
142 pmaxub %xmm0, %xmm3
143 pmaxub %xmm2, %xmm4
144 pmaxub %xmm3, %xmm4
145 pmovmskb %xmm4, %eax
146
147 add $64, %rdi
148
149 test %eax, %eax
150 jz L(align64_loop)
151
/* A match lies somewhere in the block just processed: point %rdi back at
   its start and locate the chunk.  */
152 sub $64, %rdi
153
/* Chunks 0 and 1: their compare results are still intact in %xmm0/%xmm2.  */
154 pmovmskb %xmm0, %eax
155 test %eax, %eax
156 jnz L(matches)
157
158 pmovmskb %xmm2, %eax
159 test %eax, %eax
160 jnz L(matches16)
161
/* Chunk 2 must be reloaded and compared again because pmaxub overwrote
   %xmm3 in the loop.  */
162 movdqa 32(%rdi), %xmm3
163 pcmpeqb %xmm1, %xmm3
164
/* Chunk 3 is compared straight from memory into %xmm1.  This destroys the
   broadcast search pattern, which is not used again: both remaining paths
   end in ret.  */
165 pcmpeqb 48(%rdi), %xmm1
166 pmovmskb %xmm3, %eax
167 test %eax, %eax
168 jnz L(matches32)
169
/* Chunks 0-2 had no match, so the match is in chunk 3; the mask is
   non-zero and needs no test before bsf.  */
170 pmovmskb %xmm1, %eax
171 bsf %eax, %eax
172 lea 48(%rdi, %rax), %rax
173 ret
174
175 .p2align 4
176L(matches0):
/* Match in the fourth chunk of a prolog group, reached after %rdi was
   already advanced by 64: result = %rdi - 16 + index of lowest set bit.  */
177 bsf %eax, %eax
178 lea -16(%rax, %rdi), %rax
179 ret
180
181 .p2align 4
182L(matches):
/* Match in the chunk at %rdi: result = %rdi + index of lowest set bit.  */
183 bsf %eax, %eax
184 add %rdi, %rax
185 ret
186
187 .p2align 4
188L(matches16):
/* Match in the chunk at %rdi + 16.  */
189 bsf %eax, %eax
190 lea 16(%rax, %rdi), %rax
191 ret
192
193 .p2align 4
194L(matches32):
/* Match in the chunk at %rdi + 32.  */
195 bsf %eax, %eax
196 lea 32(%rax, %rdi), %rax
197 ret
198
199END (__rawmemchr)
200
/* NOTE(review): weak_alias is a macro defined outside this file; going by
   its name and arguments it makes rawmemchr a weak alias of __rawmemchr,
   so the routine is reachable under the public name -- confirm.  */
201weak_alias (__rawmemchr, rawmemchr)
/* NOTE(review): libc_hidden_builtin_def is also defined elsewhere;
   presumably it provides the hidden, library-internal definition of
   __rawmemchr used by callers inside libc -- confirm against the macro.  */
202libc_hidden_builtin_def (__rawmemchr)
203