/* SSE2 version of strlen/wcslen.
   Copyright (C) 2012-2020 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */
18
#include <sysdep.h>

#ifdef AS_WCSLEN
/* Wide-character build: elements are 4-byte wchar_t, so the compare and
   unsigned-minimum operate on dwords, and the byte count accumulated in
   %rax is converted to a character count (>> 2) before returning.  */
# define PMINU pminud
# define PCMPEQ pcmpeqd
# define SHIFT_RETURN shrq $2, %rax
#else
/* Byte-string build: byte-granular compares; the result needs no
   adjustment, so SHIFT_RETURN expands to nothing.  */
# define PMINU pminub
# define PCMPEQ pcmpeqb
# define SHIFT_RETURN
#endif
30
/* Long lived registers in strlen (s) and strnlen (s, n) are:

	%xmm3 - zero
	%rdi  - s
	%r10  - (s + n) & ~(64 - 1)
	%r11  - s + n
*/
38
39
/* size_t strlen (const char *s)                 -- %rdi = s, result in %rax.
   With AS_STRNLEN: size_t strnlen (const char *s, size_t n), n in %rsi.
   With AS_WCSLEN the wide-character variant is built (4-byte elements).
   Strategy: scan 64 bytes per iteration with four SSE2 compares against
   zero, fold the results into a 64-bit mask and bsf for the first zero.  */
.text
ENTRY(strlen)

/* Test 64 bytes from %rax for zero.  Save result as bitmask in %rdx.
   Four 16-byte PCMPEQs against the zeros held in %xmm0-%xmm3 yield four
   16-bit pmovmskb masks, merged so that bit i of %rdx is set iff byte i
   of the 64-byte window at %rax is zero.  Clobbers %rsi, %rcx, %r8 and
   leaves compare results (not zeros) in %xmm0-%xmm3.  %rax must be
   16-byte aligned (movdqa-style memory operands of PCMPEQ).  */
#define FIND_ZERO	\
	PCMPEQ (%rax), %xmm0;	\
	PCMPEQ 16(%rax), %xmm1;	\
	PCMPEQ 32(%rax), %xmm2;	\
	PCMPEQ 48(%rax), %xmm3;	\
	pmovmskb	%xmm0, %esi;	\
	pmovmskb	%xmm1, %edx;	\
	pmovmskb	%xmm2, %r8d;	\
	pmovmskb	%xmm3, %ecx;	\
	salq	$16, %rdx;	\
	salq	$16, %rcx;	\
	orq	%rsi, %rdx;	\
	orq	%r8, %rcx;	\
	salq	$32, %rcx;	\
	orq	%rcx, %rdx;

#ifdef AS_STRNLEN
/* Do not read anything when n==0.  */
	test	%RSI_LP, %RSI_LP
	jne	L(n_nonzero)
	xor	%rax, %rax
	ret
L(n_nonzero):
# ifdef AS_WCSLEN
/* Scale the wchar_t count to a byte count.  */
	shl	$2, %RSI_LP
# endif

/* Initialize long lived registers.  */

	add	%RDI_LP, %RSI_LP	/* %rsi = s + n, one past the end.  */
	mov	%RSI_LP, %R10_LP
	and	$-64, %R10_LP		/* %r10 = (s + n) & ~63.  */
	mov	%RSI_LP, %R11_LP	/* %r11 = s + n.  */
#endif

	pxor	%xmm0, %xmm0		/* %xmm0-%xmm3 = 0, as FIND_ZERO needs.  */
	pxor	%xmm1, %xmm1
	pxor	%xmm2, %xmm2
	pxor	%xmm3, %xmm3
	movq	%rdi, %rax
	movq	%rdi, %rcx
	andq	$4095, %rcx		/* %rcx = page offset of s.  */
/* Offsets 4032-4047 will be aligned into 4032 thus fit into page.  */
	cmpq	$4047, %rcx
/* We cannot unify this branching as it would be ~6 cycles slower.  */
	ja	L(cross_page)

#ifdef AS_STRNLEN
/* Test if end is among first 64 bytes.  On exit %rsi = bytes remaining
   (s + n - %rax); if fewer than 64 remain (testq $-64 sets ZF) jump to
   the short path that must also respect the n limit.  */
# define STRNLEN_PROLOG	\
	mov	%r11, %rsi;	\
	subq	%rax, %rsi;	\
	andq	$-64, %rax;	\
	testq	$-64, %rsi;	\
	je	L(strnlen_ret)
#else
# define STRNLEN_PROLOG  andq $-64, %rax;
#endif

/* Ignore bits in mask that come before start of string.
   %rdi ^ %rax is the misalignment of s within the aligned block (< 64),
   so shifting %rdx right by %cl discards exactly the mask bits that
   precede s; the following bsf then yields the length directly.  */
#define PROLOG(lab)	\
	movq	%rdi, %rcx;	\
	xorq	%rax, %rcx;	\
	STRNLEN_PROLOG;	\
	sarq	%cl, %rdx;	\
	test	%rdx, %rdx;	\
	je	L(lab);	\
	bsfq	%rdx, %rax;	\
	SHIFT_RETURN;		\
	ret

#ifdef AS_STRNLEN
	andq	$-16, %rax
	FIND_ZERO
#else
	/* Test first 16 bytes unaligned.  */
	movdqu	(%rax), %xmm4
	PCMPEQ	%xmm0, %xmm4
	pmovmskb	%xmm4, %edx
	test	%edx, %edx
	je	L(next48_bytes)
	bsf	%edx, %eax	/* If eax is zeroed 16bit bsf can be used.  */
	SHIFT_RETURN
	ret

L(next48_bytes):
/* Same as FIND_ZERO except we do not check first 16 bytes.  */
	andq	$-16, %rax
	PCMPEQ 16(%rax), %xmm1
	PCMPEQ 32(%rax), %xmm2
	PCMPEQ 48(%rax), %xmm3
	pmovmskb	%xmm1, %edx
	pmovmskb	%xmm2, %r8d
	pmovmskb	%xmm3, %ecx
	salq	$16, %rdx
	salq	$16, %rcx
	orq	%r8, %rcx
	salq	$32, %rcx
	orq	%rcx, %rdx
#endif

	/* When no zero byte is found xmm1-3 are zero so we do not have to
	   zero them.  */
	PROLOG(loop)

	.p2align 4
L(cross_page):
	andq	$-64, %rax	/* Align down so the 64-byte FIND_ZERO read
				   cannot cross the page boundary.  */
	FIND_ZERO
	PROLOG(loop_init)

#ifdef AS_STRNLEN
/* We must do this check to correctly handle strnlen (s, -1).  */
L(strnlen_ret):
	bts	%rsi, %rdx	/* Plant a sentinel "zero" at s + n
				   (0 <= %rsi < 64 here) so bsf never
				   reports a length greater than n.  */
	sarq	%cl, %rdx
	test	%rdx, %rdx
	je	L(loop_init)
	bsfq	%rdx, %rax
	SHIFT_RETURN
	ret
#endif
	.p2align 4
L(loop_init):
	pxor	%xmm1, %xmm1
	pxor	%xmm2, %xmm2
	pxor	%xmm3, %xmm3	/* Main loop relies on %xmm3 == 0 for PCMPEQ.  */
#ifdef AS_STRNLEN
	.p2align 4
L(loop):

	addq	$64, %rax
	cmpq	%rax, %r10
	je	L(exit_end)	/* Reached last full 64-byte block before s+n.  */

	movdqa	(%rax), %xmm0
	PMINU	16(%rax), %xmm0
	PMINU	32(%rax), %xmm0
	PMINU	48(%rax), %xmm0	/* Unsigned min of the four chunks: a lane is
				   zero iff some chunk had a zero there.  */
	PCMPEQ	%xmm3, %xmm0
	pmovmskb	%xmm0, %edx
	testl	%edx, %edx
	jne	L(exit)
	jmp	L(loop)

	.p2align 4
L(exit_end):
	cmp	%rax, %r11
	je	L(first)	/* Do not read when end is at page boundary.  */
	pxor	%xmm0, %xmm0
	FIND_ZERO

L(first):
	bts	%r11, %rdx	/* Sentinel bit at (s + n) mod 64 — bts takes
				   the register bit offset modulo 64.  */
	bsfq	%rdx, %rdx
	addq	%rdx, %rax
	subq	%rdi, %rax	/* Length = position of first zero - s.  */
	SHIFT_RETURN
	ret

	.p2align 4
L(exit):
	pxor	%xmm0, %xmm0
	FIND_ZERO	/* PMINU lost the exact position; redo the compares
			   to recover the per-byte mask.  */

	bsfq	%rdx, %rdx
	addq	%rdx, %rax
	subq	%rdi, %rax
	SHIFT_RETURN
	ret

#else

	/* Main loop.  Unrolled twice to improve L2 cache performance on core2.  */
	.p2align 4
L(loop):

	movdqa	64(%rax), %xmm0
	PMINU	80(%rax), %xmm0
	PMINU	96(%rax), %xmm0
	PMINU	112(%rax), %xmm0
	PCMPEQ	%xmm3, %xmm0
	pmovmskb	%xmm0, %edx
	testl	%edx, %edx
	jne	L(exit64)

	subq	$-128, %rax	/* %rax += 128; -128 encodes as a sign-extended
				   8-bit immediate while +128 would not.  */

	movdqa	(%rax), %xmm0
	PMINU	16(%rax), %xmm0
	PMINU	32(%rax), %xmm0
	PMINU	48(%rax), %xmm0
	PCMPEQ	%xmm3, %xmm0
	pmovmskb	%xmm0, %edx
	testl	%edx, %edx
	jne	L(exit0)
	jmp	L(loop)

	.p2align 4
L(exit64):
	addq	$64, %rax	/* Zero was in the upper half of the unroll.  */
L(exit0):
	pxor	%xmm0, %xmm0
	FIND_ZERO	/* Recompute per-byte masks for the exact position.  */

	bsfq	%rdx, %rdx	/* %rdx = offset of first zero in the block.  */
	addq	%rdx, %rax
	subq	%rdi, %rax	/* Length = &first zero - s.  */
	SHIFT_RETURN
	ret

#endif

END(strlen)
libc_hidden_builtin_def (strlen)
259