1/* strcat with SSE2
2 Copyright (C) 2011-2020 Free Software Foundation, Inc.
3 Contributed by Intel Corporation.
4 This file is part of the GNU C Library.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, see
18 <https://www.gnu.org/licenses/>. */
19
20#if IS_IN (libc)
21
22# include <sysdep.h>
23
24# ifndef STRCAT
25# define STRCAT __strcat_sse2_unaligned
26# endif
27
28# define USE_AS_STRCAT
29
30.text
/* STRCAT entry.  The destination pointer arrives in %rdi and the source
   pointer in %rsi; in the strncat build the third argument arrives in
   %rdx.  The code from here up to L(StartStrcpyPart) is a string-length
   search on the destination: it leaves in %rax the offset of the
   terminating NUL relative to %rdi.  %rdi is not modified by the
   search.  */
31ENTRY (STRCAT)
 /* Keep the original destination pointer; it is needed again once the
    length is known.  */
32 mov %rdi, %r9
33# ifdef USE_AS_STRNCAT
 /* Keep the third argument; %rdx is reused below as the NUL bit mask.  */
34 mov %rdx, %r8
35# endif
36
37/* Inline corresponding strlen file, temporary until new strcpy
38 implementation gets merged. */
39
 /* %rax = 0 so that L(exit_less16) can add the NUL index to it
    directly.  */
40 xor %rax, %rax
 /* %ecx = offset of the destination inside its 64-byte-aligned block.  */
41 mov %edi, %ecx
42 and $0x3f, %ecx
 /* %xmm0 = 0, the value every byte is compared against.  */
43 pxor %xmm0, %xmm0
 /* With an offset above 0x30 a 16-byte load starting at %rdi would run
    past the end of the 64-byte block, so that case takes the aligned
    path at L(next) instead (presumably to rule out touching the
    following page -- confirm).  */
44 cmp $0x30, %ecx
45 ja L(next)
 /* Unaligned check of the first 16 bytes: bit i of %edx is set iff
    byte i of the destination is NUL.  */
46 movdqu (%rdi), %xmm1
47 pcmpeqb %xmm1, %xmm0
48 pmovmskb %xmm0, %edx
49 test %edx, %edx
50 jnz L(exit_less16)
 /* No NUL yet: round %rdi down to a multiple of 16 and continue with
    aligned chunks starting at 16(%rax).  Some bytes already examined
    above may be examined again.  */
51 mov %rdi, %rax
52 and $-16, %rax
53 jmp L(align16_start)
54L(next):
 /* Aligned check of the 16-byte chunk that contains the first byte of
    the destination.  */
55 mov %rdi, %rax
56 and $-16, %rax
57 pcmpeqb (%rax), %xmm0
 /* Build %r10d = -1 shifted left by (%rdi & 15).  On this path the low
    five bits of %rcx - %rax equal %rdi & 15, and a 32-bit shl only uses
    those five bits of %cl.  The mask clears the bits belonging to bytes
    that lie in front of the destination.  */
58 mov $-1, %r10d
59 sub %rax, %rcx
60 shl %cl, %r10d
61 pmovmskb %xmm0, %edx
62 and %r10d, %edx
 /* A NUL at or after %rdi inside this chunk: L(exit) converts the
    chunk-relative bit index into an offset from %rdi.  Otherwise fall
    through to the aligned search.  */
63 jnz L(exit)
64
/* Aligned search, 16 bytes per step, fully unrolled over 16 chunks.
   On entry %rax is %rdi rounded down to a multiple of 16 and the
   destination bytes inside the chunk at (%rax) have already been
   checked, so the search starts at 16(%rax).  Each step compares one
   chunk against a zeroed register, moves the per-byte result into %edx
   and leaves through L(exitN), where N is the offset of that chunk
   from %rax.  A compare that finds no NUL leaves its register all
   zero, which is why %xmm0-%xmm3 can be reused without clearing them
   again.  */
65L(align16_start):
66 pxor %xmm0, %xmm0
67 pxor %xmm1, %xmm1
68 pxor %xmm2, %xmm2
69 pxor %xmm3, %xmm3
 /* Chunks 1-4: offsets 16, 32, 48 and 64 from %rax.  */
70 pcmpeqb 16(%rax), %xmm0
71 pmovmskb %xmm0, %edx
72 test %edx, %edx
73 jnz L(exit16)
74
75 pcmpeqb 32(%rax), %xmm1
76 pmovmskb %xmm1, %edx
77 test %edx, %edx
78 jnz L(exit32)
79
80 pcmpeqb 48(%rax), %xmm2
81 pmovmskb %xmm2, %edx
82 test %edx, %edx
83 jnz L(exit48)
84
85 pcmpeqb 64(%rax), %xmm3
86 pmovmskb %xmm3, %edx
87 test %edx, %edx
88 jnz L(exit64)
89
 /* Chunks 5-8.  The chunk at 80(%rax) is compared first and %rax is then
    advanced by 64, so the chunk just compared sits at 16(%rax) and a hit
    can use L(exit16).  */
90 pcmpeqb 80(%rax), %xmm0
91 add $64, %rax
92 pmovmskb %xmm0, %edx
93 test %edx, %edx
94 jnz L(exit16)
95
96 pcmpeqb 32(%rax), %xmm1
97 pmovmskb %xmm1, %edx
98 test %edx, %edx
99 jnz L(exit32)
100
101 pcmpeqb 48(%rax), %xmm2
102 pmovmskb %xmm2, %edx
103 test %edx, %edx
104 jnz L(exit48)
105
106 pcmpeqb 64(%rax), %xmm3
107 pmovmskb %xmm3, %edx
108 test %edx, %edx
109 jnz L(exit64)
110
 /* Chunks 9-12, same pattern: compare at 80(%rax), then advance %rax
    by 64.  */
111 pcmpeqb 80(%rax), %xmm0
112 add $64, %rax
113 pmovmskb %xmm0, %edx
114 test %edx, %edx
115 jnz L(exit16)
116
117 pcmpeqb 32(%rax), %xmm1
118 pmovmskb %xmm1, %edx
119 test %edx, %edx
120 jnz L(exit32)
121
122 pcmpeqb 48(%rax), %xmm2
123 pmovmskb %xmm2, %edx
124 test %edx, %edx
125 jnz L(exit48)
126
127 pcmpeqb 64(%rax), %xmm3
128 pmovmskb %xmm3, %edx
129 test %edx, %edx
130 jnz L(exit64)
131
 /* Chunks 13-16, same pattern.  When this group finds nothing, the last
    chunk examined is the one at 64(%rax).  */
132 pcmpeqb 80(%rax), %xmm0
133 add $64, %rax
134 pmovmskb %xmm0, %edx
135 test %edx, %edx
136 jnz L(exit16)
137
138 pcmpeqb 32(%rax), %xmm1
139 pmovmskb %xmm1, %edx
140 test %edx, %edx
141 jnz L(exit32)
142
143 pcmpeqb 48(%rax), %xmm2
144 pmovmskb %xmm2, %edx
145 test %edx, %edx
146 jnz L(exit48)
147
148 pcmpeqb 64(%rax), %xmm3
149 pmovmskb %xmm3, %edx
150 test %edx, %edx
151 jnz L(exit64)
152
 /* The unrolled 16-byte search found no NUL up to and including the
    chunk at 64(%rax).  Before switching to the loop that handles 64
    bytes per iteration, step %rax forward in 16-byte chunks until it is
    a multiple of 64.  The loop starts reading at (%rax), so when %rax is
    already a multiple of 64 it simply re-reads bytes known to be
    NUL-free.  */
153 test $0x3f, %rax
154 jz L(align64_loop)
155
 /* Check the first unchecked chunk, 80(%rax), and make %rax point at
    it, so that a hit can leave through L(exit), which adds no extra
    chunk offset.  */
156 pcmpeqb 80(%rax), %xmm0
157 add $80, %rax
158 pmovmskb %xmm0, %edx
159 test %edx, %edx
160 jnz L(exit)
161
162 test $0x3f, %rax
163 jz L(align64_loop)
164
 /* Same step for the following chunk: compare 16(%rax), then advance
    %rax onto it.  */
165 pcmpeqb 16(%rax), %xmm1
166 add $16, %rax
167 pmovmskb %xmm1, %edx
168 test %edx, %edx
169 jnz L(exit)
170
171 test $0x3f, %rax
172 jz L(align64_loop)
173
174 pcmpeqb 16(%rax), %xmm2
175 add $16, %rax
176 pmovmskb %xmm2, %edx
177 test %edx, %edx
178 jnz L(exit)
179
180 test $0x3f, %rax
181 jz L(align64_loop)
182
 /* NOTE(review): %rax is always a multiple of 16 here, so after the
    three 16-byte steps above the preceding jz looks as if it is always
    taken, which would make this fourth step and the add that follows it
    unreachable -- confirm before relying on or removing it.  */
183 pcmpeqb 16(%rax), %xmm3
184 add $16, %rax
185 pmovmskb %xmm3, %edx
186 test %edx, %edx
187 jnz L(exit)
188
 /* Skip the chunk just checked and enter the loop.  */
189 add $16, %rax
 /* Align the loop head to a 16-byte boundary.  */
190 .p2align 4
 /* Main loop: 64 bytes per iteration.  Every jz into this label follows
    a test $0x3f on %rax, so %rax is a multiple of 64 on those entries,
    as the aligned movaps loads and memory operands require.  The
    unsigned byte-wise minimum of the four chunks has a zero byte
    exactly when one of the 64 bytes is NUL.  %xmm0 is all zero here and
    is not written inside the loop.  */
191 L(align64_loop):
192 movaps (%rax), %xmm4
193 pminub 16(%rax), %xmm4
194 movaps 32(%rax), %xmm5
195 pminub 48(%rax), %xmm5
 /* %rax is advanced before the test, so on loop exit it points just
    past the 64-byte block that contains the NUL.  */
196 add $64, %rax
197 pminub %xmm4, %xmm5
198 pcmpeqb %xmm0, %xmm5
199 pmovmskb %xmm5, %edx
200 test %edx, %edx
201 jz L(align64_loop)
202
 /* Locate the NUL inside the block that ended at %rax.  The first chunk
    of that block is compared at -64(%rax); %rax is then moved back by
    80 so that the four chunks sit at 16, 32, 48 and 64(%rax), matching
    the offsets used by L(exit16), L(exit32) and L(exit48).  This relies
    on %xmm0-%xmm3 still being all zero: the loop does not write them,
    and the earlier compares only fall through when their result was
    all zero.  */
203 pcmpeqb -64(%rax), %xmm0
204 sub $80, %rax
205 pmovmskb %xmm0, %edx
206 test %edx, %edx
207 jnz L(exit16)
208
209 pcmpeqb 32(%rax), %xmm1
210 pmovmskb %xmm1, %edx
211 test %edx, %edx
212 jnz L(exit32)
213
214 pcmpeqb 48(%rax), %xmm2
215 pmovmskb %xmm2, %edx
216 test %edx, %edx
217 jnz L(exit48)
218
 /* The first three chunks were NUL-free, so the NUL is in the fourth
    one and no test is needed.  The arithmetic is the same as in
    L(exit64): %rax = (%rax - %rdi) + index of the lowest set mask bit
    + 64.  */
219 pcmpeqb 64(%rax), %xmm3
220 pmovmskb %xmm3, %edx
221 sub %rdi, %rax
222 bsf %rdx, %rdx
223 add %rdx, %rax
224 add $64, %rax
225 jmp L(StartStrcpyPart)
226
 /* Exit paths of the length search.  Each one turns the chunk address in
    %rax and the NUL bit mask in %edx into the offset of the terminating
    NUL relative to %rdi, left in %rax, and continues at
    L(StartStrcpyPart).  The jumps into these labels are only taken with
    a non-zero mask, so bsf always finds a set bit.  */
227 .p2align 4
 /* NUL found in the chunk at (%rax): offset = (%rax - %rdi) + index of
    the lowest set mask bit.  */
228L(exit):
229 sub %rdi, %rax
 /* Entered directly from the unaligned first check with %rax = 0, so
    the bit index is already the offset from %rdi.  */
230L(exit_less16):
231 bsf %rdx, %rdx
232 add %rdx, %rax
233 jmp L(StartStrcpyPart)
234
235 .p2align 4
 /* NUL found in the chunk at 16(%rax).  */
236L(exit16):
237 sub %rdi, %rax
238 bsf %rdx, %rdx
239 add %rdx, %rax
240 add $16, %rax
241 jmp L(StartStrcpyPart)
242
243 .p2align 4
 /* NUL found in the chunk at 32(%rax).  */
244L(exit32):
245 sub %rdi, %rax
246 bsf %rdx, %rdx
247 add %rdx, %rax
248 add $32, %rax
249 jmp L(StartStrcpyPart)
250
251 .p2align 4
 /* NUL found in the chunk at 48(%rax).  */
252L(exit48):
253 sub %rdi, %rax
254 bsf %rdx, %rdx
255 add %rdx, %rax
256 add $48, %rax
257 jmp L(StartStrcpyPart)
258
259 .p2align 4
 /* NUL found in the chunk at 64(%rax).  There is no jump at the end:
    execution falls through to L(StartStrcpyPart).  */
260L(exit64):
261 sub %rdi, %rax
262 bsf %rdx, %rdx
263 add %rdx, %rax
264 add $64, %rax
265
 /* Hand-off from the length search to the copy code.  On entry %rax is
    the offset of the destination's terminating NUL, %r9 is the original
    destination pointer and %rsi is the source pointer.  */
266 .p2align 4
267L(StartStrcpyPart):
 /* %rdi = address of the destination's terminating NUL, i.e. where the
    appended bytes start.  */
268 lea (%r9, %rax), %rdi
 /* %rcx = source pointer.  */
269 mov %rsi, %rcx
270 mov %r9, %rax /* save result */
271
272# ifdef USE_AS_STRNCAT
 /* strncat build: when the saved third argument is zero, branch to
    L(ExitZero).  That label is not defined in this file; presumably it
    comes from the file included below -- confirm.  USE_AS_STRNCPY is
    defined before the include, presumably to select the bounded copy
    variant there -- confirm.  */
273 test %r8, %r8
274 jz L(ExitZero)
275# define USE_AS_STRNCPY
276# endif
277
 /* The rest of the function is taken from the strcpy source.  This file
    defines USE_AS_STRCAT near its top; the included file presumably
    keys on it to continue from the register state set up above and to
    close the function -- not visible from here, confirm.  */
278# include "strcpy-sse2-unaligned.S"
279#endif
280