/* strcat with AVX2
   Copyright (C) 2011-2020 Free Software Foundation, Inc.
   Contributed by Intel Corporation.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#if IS_IN (libc)

# include <sysdep.h>

# ifndef STRCAT
#  define STRCAT	__strcat_avx2
# endif

# define USE_AS_STRCAT

/* Number of bytes in a vector register.  */
# define VEC_SIZE	32

	.section .text.avx,"ax",@progbits
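
/* strcat runs in two phases: an inline strlen that locates the
   terminating NUL of the destination, then the strcpy code from
   strcpy-avx2.S, which copies the source string to that position.
   Throughout the strlen phase %ymm6 is kept all-zero so that vpcmpeqb
   against it yields a byte mask of NUL positions.  */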
ENTRY (STRCAT)
	mov	%rdi, %r9	/* Save destination; it is the return value.  */
# ifdef USE_AS_STRNCAT
	mov	%rdx, %r8	/* Save the length limit.  */
# endif

	xor	%eax, %eax
	mov	%edi, %ecx
	and	$((VEC_SIZE * 4) - 1), %ecx
	vpxor	%xmm6, %xmm6, %xmm6
	cmp	$(VEC_SIZE * 3), %ecx
	ja	L(fourth_vector_boundary)

	/* The offset of %rdi within its (VEC_SIZE * 4)-byte block is at
	   most VEC_SIZE * 3, so an unaligned vector load from %rdi stays
	   inside the block and cannot cross into an unmapped page.  */
	vpcmpeqb (%rdi), %ymm6, %ymm0
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_first_vector)
	mov	%rdi, %rax
	and	$-VEC_SIZE, %rax
	jmp	L(align_vec_size_start)
L(fourth_vector_boundary):
	/* %rdi is within VEC_SIZE bytes of a (VEC_SIZE * 4) boundary.
	   Load from the preceding VEC_SIZE-aligned address and mask off
	   the match bits for bytes that precede the string.  */
	mov	%rdi, %rax
	and	$-VEC_SIZE, %rax
	vpcmpeqb (%rax), %ymm6, %ymm0
	mov	$-1, %r10d
	sub	%rax, %rcx
	shl	%cl, %r10d	/* Shift count mod 32 is %rdi's misalignment.  */
	vpmovmskb %ymm0, %edx
	and	%r10d, %edx
	jnz	L(exit)

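	/* Unrolled scan from the VEC_SIZE-aligned %rax.  Each round tests
	   the vectors at offsets VEC_SIZE .. VEC_SIZE * 4; the first test
	   of the following round (offset VEC_SIZE * 5) is issued before
	   %rax advances by VEC_SIZE * 4, so its hit maps onto the
	   exit_null_on_second_vector path.  */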
L(align_vec_size_start):
	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm0
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
	vpmovmskb %ymm1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
	vpmovmskb %ymm2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
	vpmovmskb %ymm3, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fifth_vector)

	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
	add	$(VEC_SIZE * 4), %rax
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
	vpmovmskb %ymm1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
	vpmovmskb %ymm2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
	vpmovmskb %ymm3, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fifth_vector)

	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
	add	$(VEC_SIZE * 4), %rax
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
	vpmovmskb %ymm1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
	vpmovmskb %ymm2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
	vpmovmskb %ymm3, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fifth_vector)

	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
	add	$(VEC_SIZE * 4), %rax
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
	vpmovmskb %ymm1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
	vpmovmskb %ymm2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
	vpmovmskb %ymm3, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fifth_vector)

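	/* Step one vector at a time until %rax is (VEC_SIZE * 4)-aligned,
	   then enter the aligned four-vector loop.  */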
	test	$((VEC_SIZE * 4) - 1), %rax
	jz	L(align_four_vec_loop)

	vpcmpeqb (VEC_SIZE * 5)(%rax), %ymm6, %ymm0
	add	$(VEC_SIZE * 5), %rax
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit)

	test	$((VEC_SIZE * 4) - 1), %rax
	jz	L(align_four_vec_loop)

	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm1
	add	$VEC_SIZE, %rax
	vpmovmskb %ymm1, %edx
	test	%edx, %edx
	jnz	L(exit)

	test	$((VEC_SIZE * 4) - 1), %rax
	jz	L(align_four_vec_loop)

	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm2
	add	$VEC_SIZE, %rax
	vpmovmskb %ymm2, %edx
	test	%edx, %edx
	jnz	L(exit)

	test	$((VEC_SIZE * 4) - 1), %rax
	jz	L(align_four_vec_loop)

	vpcmpeqb VEC_SIZE(%rax), %ymm6, %ymm3
	add	$VEC_SIZE, %rax
	vpmovmskb %ymm3, %edx
	test	%edx, %edx
	jnz	L(exit)

	add	$VEC_SIZE, %rax

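	/* Main loop over (VEC_SIZE * 4)-byte aligned blocks.  vpminub
	   folds the four vectors into one; the result contains a zero
	   byte iff any of the four does, so a single compare and
	   movemask per block suffices to detect the NUL.  */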
	.p2align 4
L(align_four_vec_loop):
	vmovaps	(%rax), %ymm4
	vpminub	VEC_SIZE(%rax), %ymm4, %ymm4
	vmovaps	(VEC_SIZE * 2)(%rax), %ymm5
	vpminub	(VEC_SIZE * 3)(%rax), %ymm5, %ymm5
	add	$(VEC_SIZE * 4), %rax
	vpminub	%ymm4, %ymm5, %ymm5
	vpcmpeqb %ymm5, %ymm6, %ymm5
	vpmovmskb %ymm5, %edx
	test	%edx, %edx
	jz	L(align_four_vec_loop)

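	/* A NUL was detected somewhere in the last four vectors; back
	   %rax up and retest each vector individually to locate it.  */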
	vpcmpeqb -(VEC_SIZE * 4)(%rax), %ymm6, %ymm0
	sub	$(VEC_SIZE * 5), %rax
	vpmovmskb %ymm0, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_second_vector)

	vpcmpeqb (VEC_SIZE * 2)(%rax), %ymm6, %ymm1
	vpmovmskb %ymm1, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_third_vector)

	vpcmpeqb (VEC_SIZE * 3)(%rax), %ymm6, %ymm2
	vpmovmskb %ymm2, %edx
	test	%edx, %edx
	jnz	L(exit_null_on_fourth_vector)

	vpcmpeqb (VEC_SIZE * 4)(%rax), %ymm6, %ymm3
	vpmovmskb %ymm3, %edx
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$(VEC_SIZE * 4), %rax
	jmp	L(StartStrcpyPart)

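	/* Each exit path below leaves the offset of the terminating NUL
	   from %rdi, i.e. the destination's length, in %rax.  */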
	.p2align 4
L(exit):
	sub	%rdi, %rax
L(exit_null_on_first_vector):
	bsf	%rdx, %rdx
	add	%rdx, %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit_null_on_second_vector):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$VEC_SIZE, %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit_null_on_third_vector):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$(VEC_SIZE * 2), %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit_null_on_fourth_vector):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$(VEC_SIZE * 3), %rax
	jmp	L(StartStrcpyPart)

	.p2align 4
L(exit_null_on_fifth_vector):
	sub	%rdi, %rax
	bsf	%rdx, %rdx
	add	%rdx, %rax
	add	$(VEC_SIZE * 4), %rax

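	/* Hand off to the strcpy code: %rdi is advanced to the NUL so the
	   copy appends, and the saved destination in %r9 is restored to
	   %rax as the return value.  */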
	.p2align 4
L(StartStrcpyPart):
	lea	(%r9, %rax), %rdi
	mov	%rsi, %rcx
	mov	%r9, %rax	/* save result */

# ifdef USE_AS_STRNCAT
	test	%r8, %r8
	jz	L(ExitZero)	/* Nothing to append when the limit is 0.  */
#  define USE_AS_STRNCPY
# endif

# include "strcpy-avx2.S"
#endif