/* strrchr/wcsrchr optimized with AVX2.
   Copyright (C) 2017-2020 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18
19#if IS_IN (libc)
20
21# include <sysdep.h>
22
23# ifndef STRRCHR
24# define STRRCHR __strrchr_avx2
25# endif
26
27# ifdef USE_AS_WCSRCHR
28# define VPBROADCAST vpbroadcastd
29# define VPCMPEQ vpcmpeqd
30# else
31# define VPBROADCAST vpbroadcastb
32# define VPCMPEQ vpcmpeqb
33# endif
34
35# ifndef VZEROUPPER
36# define VZEROUPPER vzeroupper
37# endif
38
39# define VEC_SIZE 32
40
41 .section .text.avx,"ax",@progbits
/* char *strrchr (const char *s, int c)
   Return a pointer to the last occurrence of C in S, or NULL if C
   does not occur.  When C is the nul terminator the pointer to the
   terminator itself is returned (the mask trick at L(find_nul)
   handles this case naturally).  Under USE_AS_WCSRCHR each CHAR is
   4 bytes and VPCMPEQ/VPBROADCAST operate on dwords.

   ABI: SysV AMD64.  In: rdi = s, esi = c.  Out: rax.
   Register roles throughout:
     ymm0      zero vector, for nul detection
     ymm4      C broadcast to every CHAR lane
     rdi       current read position (pre-advanced by VEC_SIZE after
	       each load, hence the -VEC_SIZE displacements below)
     rsi/edx   position / CHAR-match mask of the most recent vector
	       that contained a match (edx = 0: no match seen yet).  */
ENTRY (STRRCHR)
	movd	%esi, %xmm4
	movl	%edi, %ecx
	/* Broadcast CHAR to YMM4.  */
	VPBROADCAST %xmm4, %ymm4
	vpxor	%ymm0, %ymm0, %ymm0

	/* Check if we may cross page boundary with one vector load.
	   An offset of at most VEC_SIZE within a 2*VEC_SIZE window
	   cannot cross a page with one VEC_SIZE-byte load.  */
	andl	$(2 * VEC_SIZE - 1), %ecx
	cmpl	$VEC_SIZE, %ecx
	ja	L(cros_page_boundary)

	vmovdqu	(%rdi), %ymm1
	/* ymm2 = per-byte nul matches, ymm3 = per-byte CHAR matches.  */
	VPCMPEQ	%ymm1, %ymm0, %ymm2
	VPCMPEQ	%ymm1, %ymm4, %ymm3
	vpmovmskb %ymm2, %ecx
	vpmovmskb %ymm3, %eax
	addq	$VEC_SIZE, %rdi

	testl	%eax, %eax
	jnz	L(first_vec)

	/* No CHAR in the first vector; if it contains a nul the string
	   ends without C ever occurring.  */
	testl	%ecx, %ecx
	jnz	L(return_null)

	/* Align down for the main loop.  The few tail bytes re-scanned
	   by the first aligned load are known to hold neither CHAR nor
	   nul.  edx = 0: no remembered match.  */
	andq	$-VEC_SIZE, %rdi
	xorl	%edx, %edx
	jmp	L(aligned_loop)

	.p2align 4
L(first_vec):
	/* Check if there is a nul CHAR.  */
	testl	%ecx, %ecx
	jnz	L(char_and_nul_in_first_vec)

	/* Remember the match and keep searching.  */
	movl	%eax, %edx
	movq	%rdi, %rsi
	andq	$-VEC_SIZE, %rdi
	jmp	L(aligned_loop)

	.p2align 4
L(cros_page_boundary):
	/* cl = misalignment within the vector; load the aligned vector
	   containing the string start (never crosses a page).  */
	andl	$(VEC_SIZE - 1), %ecx
	andq	$-VEC_SIZE, %rdi
	vmovdqa	(%rdi), %ymm1
	VPCMPEQ	%ymm1, %ymm0, %ymm2
	VPCMPEQ	%ymm1, %ymm4, %ymm3
	vpmovmskb %ymm2, %edx
	vpmovmskb %ymm3, %eax
	/* Discard mask bits belonging to bytes before the start of the
	   string; bit indices are now relative to the unaligned start.  */
	shrl	%cl, %edx
	shrl	%cl, %eax
	addq	$VEC_SIZE, %rdi

	/* Check if there is a CHAR.  */
	testl	%eax, %eax
	jnz	L(found_char)

	testl	%edx, %edx
	jnz	L(return_null)

	jmp	L(aligned_loop)

	.p2align 4
L(found_char):
	testl	%edx, %edx
	jnz	L(char_and_nul)

	/* Remember the match and keep searching.  rsi = rdi + rcx so
	   the shifted mask's bit indices (relative to the unaligned
	   start) resolve to the right address at L(return_value).  */
	movl	%eax, %edx
	leaq	(%rdi, %rcx), %rsi

	/* Main loop, unrolled 4x: scan one aligned vector per
	   iteration, leaving the combined CHAR|nul mask in ecx and the
	   CHAR mask in eax for L(char_nor_null).  */
	.p2align 4
L(aligned_loop):
	vmovdqa	(%rdi), %ymm1
	VPCMPEQ	%ymm1, %ymm0, %ymm2
	addq	$VEC_SIZE, %rdi
	VPCMPEQ	%ymm1, %ymm4, %ymm3
	vpmovmskb %ymm2, %ecx
	vpmovmskb %ymm3, %eax
	orl	%eax, %ecx
	jnz	L(char_nor_null)

	vmovdqa	(%rdi), %ymm1
	VPCMPEQ	%ymm1, %ymm0, %ymm2
	add	$VEC_SIZE, %rdi
	VPCMPEQ	%ymm1, %ymm4, %ymm3
	vpmovmskb %ymm2, %ecx
	vpmovmskb %ymm3, %eax
	orl	%eax, %ecx
	jnz	L(char_nor_null)

	vmovdqa	(%rdi), %ymm1
	VPCMPEQ	%ymm1, %ymm0, %ymm2
	addq	$VEC_SIZE, %rdi
	VPCMPEQ	%ymm1, %ymm4, %ymm3
	vpmovmskb %ymm2, %ecx
	vpmovmskb %ymm3, %eax
	orl	%eax, %ecx
	jnz	L(char_nor_null)

	vmovdqa	(%rdi), %ymm1
	VPCMPEQ	%ymm1, %ymm0, %ymm2
	addq	$VEC_SIZE, %rdi
	VPCMPEQ	%ymm1, %ymm4, %ymm3
	vpmovmskb %ymm2, %ecx
	vpmovmskb %ymm3, %eax
	orl	%eax, %ecx
	jz	L(aligned_loop)

	.p2align 4
L(char_nor_null):
	/* Find a CHAR or a nul CHAR in a loop.  */
	testl	%eax, %eax
	jnz	L(match)
L(return_value):
	/* Nul reached with no CHAR in this vector: return the
	   remembered match, or NULL if none was ever seen.  */
	testl	%edx, %edx
	jz	L(return_null)
	movl	%edx, %eax
	movq	%rsi, %rdi

# ifdef USE_AS_WCSRCHR
	/* Keep the first bit for each matching CHAR for bsr.  */
	andl	$0x11111111, %eax
# endif
	/* bsr finds the highest (last) match; rdi was pre-advanced one
	   vector past the matching load, hence -VEC_SIZE.  */
	bsrl	%eax, %eax
	leaq	-VEC_SIZE(%rdi, %rax), %rax
	VZEROUPPER
	ret

	.p2align 4
L(match):
	/* Find a CHAR.  Check if there is a nul CHAR.  */
	vpmovmskb %ymm2, %ecx
	testl	%ecx, %ecx
	jnz	L(find_nul)

	/* Remember the match and keep searching.  */
	movl	%eax, %edx
	movq	%rdi, %rsi
	jmp	L(aligned_loop)

	.p2align 4
L(find_nul):
# ifdef USE_AS_WCSRCHR
	/* Keep the first bit for each matching CHAR for bsr.  */
	andl	$0x11111111, %ecx
	andl	$0x11111111, %eax
# endif
	/* Mask out any matching bits after the nul CHAR:
	   (x - 1) ^ x = all bits at or below the lowest set bit of x,
	   so eax keeps only CHAR matches up to (and including, when
	   C == nul) the first terminator.  */
	movl	%ecx, %r8d
	subl	$1, %r8d
	xorl	%ecx, %r8d
	andl	%r8d, %eax
	testl	%eax, %eax
	/* If there is no CHAR here, return the remembered one.  */
	jz	L(return_value)
	bsrl	%eax, %eax
	leaq	-VEC_SIZE(%rdi, %rax), %rax
	VZEROUPPER
	ret

	.p2align 4
L(char_and_nul):
	/* Find both a CHAR and a nul CHAR in the (page-crossing) first
	   vector.  Add back the misalignment so the shifted bit
	   indices resolve against the true string start, and put the
	   nul mask where the shared tail expects it.  */
	addq	%rcx, %rdi
	movl	%edx, %ecx
L(char_and_nul_in_first_vec):
# ifdef USE_AS_WCSRCHR
	/* Keep the first bit for each matching CHAR for bsr.  */
	andl	$0x11111111, %ecx
	andl	$0x11111111, %eax
# endif
	/* Mask out any matching bits after the nul CHAR (same
	   (x - 1) ^ x trick as at L(find_nul)).  */
	movl	%ecx, %r8d
	subl	$1, %r8d
	xorl	%ecx, %r8d
	andl	%r8d, %eax
	testl	%eax, %eax
	/* Return null pointer if the nul CHAR comes first.  */
	jz	L(return_null)
	bsrl	%eax, %eax
	leaq	-VEC_SIZE(%rdi, %rax), %rax
	VZEROUPPER
	ret

	.p2align 4
L(return_null):
	xorl	%eax, %eax
	VZEROUPPER
	ret

END (STRRCHR)
235#endif
236