/* wcsrchr with SSE2
   Copyright (C) 2011-2020 Free Software Foundation, Inc.
   Contributed by Intel Corporation.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
19
20#include <sysdep.h>
21
/* wchar_t *wcsrchr (const wchar_t *s, wchar_t c)

   Return a pointer to the last 4-byte element of the null-terminated
   wide string S that equals C, or NULL (0) if there is none.  The
   terminating null element is part of the search, so C == 0 yields a
   pointer to the terminator.

   In:	 %rdi = s, %esi = c
   Out:	 %rax = pointer to the last match, or 0
   Uses: %rax, %rcx, %rdx, %rsi, %rdi, %r8, %xmm0-%xmm5, flags.
	 The stack is not touched.

   Register roles while scanning:
     %xmm1  c replicated into all four dword lanes
     %xmm2  all-zero until a block containing a null is compared; then
	    it holds that block's null-lane mask
     %rdi   address just past the 16-byte block examined last
     %rax   byte mask (4 bits per element) of lanes equal to c
     %rcx   byte mask of null lanes (or the OR of both masks)
     %r8    match mask of the most recent block that held a match
	    (0 = no match remembered)
     %rsi   the "just past" address that belongs to %r8

   A match mask is decoded from the top down so that the highest
   (i.e. last) matching element wins:
     bits 12-15 -> element 3 at -4(%rdi)
     bits  8-11 -> element 2 at -8(%rdi)
     bits  4-7  -> element 1 at -12(%rdi)
     bits  0-3  -> element 0 at -16(%rdi)  */

	.text
ENTRY (wcsrchr)

	movd	%esi, %xmm1		/* c is 32 bits wide; only %esi is meaningful.  */
	mov	%rdi, %rcx
	pxor	%xmm2, %xmm2		/* xmm2 = 0, the null detector.  */
	pshufd	$0, %xmm1, %xmm1	/* xmm1 = { c, c, c, c }.  */
	and	$63, %rcx		/* Offset of s inside its 64-byte line.  */
	cmp	$48, %rcx
	ja	L(crosscache)		/* s + 16 would cross the 64-byte boundary.  */

	/* The first 16 bytes at s stay inside one 64-byte line, so an
	   unaligned load is used for them.  */
	movdqu	(%rdi), %xmm0
	pcmpeqd	%xmm0, %xmm2		/* Null lanes.  */
	pcmpeqd	%xmm1, %xmm0		/* Lanes equal to c.  */
	pmovmskb %xmm2, %rcx
	pmovmskb %xmm0, %rax
	add	$16, %rdi

	test	%rax, %rax
	jnz	L(unaligned_match1)

	test	%rcx, %rcx
	jnz	L(return_null)		/* String ended without any match.  */

	/* Round down to a 16-byte boundary for the aligned loop; bytes
	   already examined may be scanned a second time.  */
	and	$-16, %rdi
	xor	%r8d, %r8d		/* No match remembered yet.  */
	jmp	L(loop)

	.p2align 4
L(unaligned_match1):
	/* Match in the first (unaligned) 16 bytes.  */
	test	%rcx, %rcx
	jnz	L(prolog_find_zero_1)	/* ...and the null is there as well.  */

	mov	%rax, %r8		/* Remember the mask...  */
	mov	%rdi, %rsi		/* ...and its base, s + 16.  */
	and	$-16, %rdi
	jmp	L(loop)

	.p2align 4
L(crosscache):
	/* Load the aligned 16-byte block that contains s and shift the
	   masks right so that bit 0 corresponds to the byte at s.  */
	and	$15, %rcx		/* rcx = s & 15.  */
	and	$-16, %rdi
	pxor	%xmm3, %xmm3
	movdqa	(%rdi), %xmm0
	pcmpeqd	%xmm0, %xmm3		/* Null lanes (xmm2 is left all-zero).  */
	pcmpeqd	%xmm1, %xmm0		/* Lanes equal to c.  */
	pmovmskb %xmm3, %rdx
	pmovmskb %xmm0, %rax
	shr	%cl, %rdx		/* Discard bytes in front of s.  */
	shr	%cl, %rax
	add	$16, %rdi

	test	%rax, %rax
	jnz	L(unaligned_match)

	test	%rdx, %rdx
	jnz	L(return_null)		/* String ended without any match.  */

	xor	%r8d, %r8d		/* No match remembered yet.  */
	jmp	L(loop)

	.p2align 4
L(unaligned_match):
	test	%rdx, %rdx
	jnz	L(prolog_find_zero)

	mov	%rax, %r8		/* Remember the shifted mask...  */
	lea	(%rdi, %rcx), %rsi	/* ...with base s + 16.  */

/* Loop start on aligned string.  Four 16-byte blocks are examined per
   iteration; the loop is left as soon as a block contains a match or a
   null.  */
	.p2align 4
L(loop):
	movdqa	(%rdi), %xmm0
	pcmpeqd	%xmm0, %xmm2
	add	$16, %rdi
	pcmpeqd	%xmm1, %xmm0
	pmovmskb %xmm2, %rcx
	pmovmskb %xmm0, %rax
	or	%rax, %rcx
	jnz	L(matches)

	movdqa	(%rdi), %xmm3
	pcmpeqd	%xmm3, %xmm2
	add	$16, %rdi
	pcmpeqd	%xmm1, %xmm3
	pmovmskb %xmm2, %rcx
	pmovmskb %xmm3, %rax
	or	%rax, %rcx
	jnz	L(matches)

	movdqa	(%rdi), %xmm4
	pcmpeqd	%xmm4, %xmm2
	add	$16, %rdi
	pcmpeqd	%xmm1, %xmm4
	pmovmskb %xmm2, %rcx
	pmovmskb %xmm4, %rax
	or	%rax, %rcx
	jnz	L(matches)

	movdqa	(%rdi), %xmm5
	pcmpeqd	%xmm5, %xmm2
	add	$16, %rdi
	pcmpeqd	%xmm1, %xmm5
	pmovmskb %xmm2, %rcx
	pmovmskb %xmm5, %rax
	or	%rax, %rcx
	jz	L(loop)

	.p2align 4
L(matches):
	/* The block ending at %rdi holds a match, a null, or both.  */
	test	%rax, %rax
	jnz	L(match)
L(return_value):
	/* Nothing usable in the final block: fall back to the remembered
	   match in %r8/%rsi, if any.  */
	test	%r8, %r8
	jz	L(return_null)
	mov	%r8, %rax
	mov	%rsi, %rdi

	test	$15 << 4, %ah
	jnz	L(match_fourth_wchar)
	test	%ah, %ah
	jnz	L(match_third_wchar)
	test	$15 << 4, %al
	jnz	L(match_second_wchar)
	lea	-16(%rdi), %rax
	ret

	.p2align 4
L(match):
	/* %rcx was OR-ed with the match mask; fetch the null mask again.  */
	pmovmskb %xmm2, %rcx
	test	%rcx, %rcx
	jnz	L(find_zero)
	mov	%rax, %r8		/* No null yet: remember this block...  */
	mov	%rdi, %rsi
	jmp	L(loop)			/* ...and keep scanning.  */

	/* Final block of the aligned loop: locate the first null element,
	   drop every match bit beyond the first byte of that element, and
	   return the highest match that is left.  Keeping the first byte
	   of the null element lets the terminator itself match when
	   c == 0.  If nothing is left, use the remembered match.

	   In the immediates below the assembler evaluates `<<' before
	   `-', so `1 << N - 1' is (1 << N) - 1.  */
	.p2align 4
L(find_zero):
	test	$15, %cl
	jnz	L(find_zero_in_first_wchar)
	test	%cl, %cl
	jnz	L(find_zero_in_second_wchar)
	test	$15, %ch
	jnz	L(find_zero_in_third_wchar)

	/* Null is element 3: keep bits 0-12.  */
	and	$1 << 13 - 1, %rax
	jz	L(return_value)

	test	$15 << 4, %ah
	jnz	L(match_fourth_wchar)
	test	%ah, %ah
	jnz	L(match_third_wchar)
	test	$15 << 4, %al
	jnz	L(match_second_wchar)
	lea	-16(%rdi), %rax
	ret

	.p2align 4
L(find_zero_in_first_wchar):
	/* Null is element 0: only bit 0 can still be a valid match.  */
	test	$1, %rax
	jz	L(return_value)
	lea	-16(%rdi), %rax
	ret

	.p2align 4
L(find_zero_in_second_wchar):
	/* Null is element 1: keep bits 0-4.  */
	and	$1 << 5 - 1, %rax
	jz	L(return_value)

	test	$15 << 4, %al
	jnz	L(match_second_wchar)
	lea	-16(%rdi), %rax
	ret

	.p2align 4
L(find_zero_in_third_wchar):
	/* Null is element 2: keep bits 0-8.  */
	and	$1 << 9 - 1, %rax
	jz	L(return_value)

	test	%ah, %ah
	jnz	L(match_third_wchar)
	test	$15 << 4, %al
	jnz	L(match_second_wchar)
	lea	-16(%rdi), %rax
	ret

	/* Same decoding for the very first block, where no earlier match
	   can exist: an empty mask means NULL.  */
	.p2align 4
L(prolog_find_zero):
	add	%rcx, %rdi		/* rdi = s + 16, matching the shifted masks.  */
	mov	%rdx, %rcx		/* rcx = null mask.  */
L(prolog_find_zero_1):
	test	$15, %cl
	jnz	L(prolog_find_zero_in_first_wchar)
	test	%cl, %cl
	jnz	L(prolog_find_zero_in_second_wchar)
	test	$15, %ch
	jnz	L(prolog_find_zero_in_third_wchar)

	/* Null is element 3: keep bits 0-12.  */
	and	$1 << 13 - 1, %rax
	jz	L(return_null)

	test	$15 << 4, %ah
	jnz	L(match_fourth_wchar)
	test	%ah, %ah
	jnz	L(match_third_wchar)
	test	$15 << 4, %al
	jnz	L(match_second_wchar)
	lea	-16(%rdi), %rax
	ret

	.p2align 4
L(prolog_find_zero_in_first_wchar):
	/* Null is element 0: only bit 0 can still be a valid match.  */
	test	$1, %rax
	jz	L(return_null)
	lea	-16(%rdi), %rax
	ret

	.p2align 4
L(prolog_find_zero_in_second_wchar):
	/* Null is element 1: keep bits 0-4.  */
	and	$1 << 5 - 1, %rax
	jz	L(return_null)

	test	$15 << 4, %al
	jnz	L(match_second_wchar)
	lea	-16(%rdi), %rax
	ret

	.p2align 4
L(prolog_find_zero_in_third_wchar):
	/* Null is element 2: keep bits 0-8.  */
	and	$1 << 9 - 1, %rax
	jz	L(return_null)

	test	%ah, %ah
	jnz	L(match_third_wchar)
	test	$15 << 4, %al
	jnz	L(match_second_wchar)
	lea	-16(%rdi), %rax
	ret

	/* Common exits; %rdi is the "just past" address of the block the
	   decoded mask belongs to.  */
	.p2align 4
L(match_second_wchar):
	lea	-12(%rdi), %rax
	ret

	.p2align 4
L(match_third_wchar):
	lea	-8(%rdi), %rax
	ret

	.p2align 4
L(match_fourth_wchar):
	lea	-4(%rdi), %rax
	ret

	.p2align 4
L(return_null):
	xor	%eax, %eax		/* Writing %eax clears all of %rax.  */
	ret

END (wcsrchr)
283