/* memset/bzero with unaligned store and rep stosb
   Copyright (C) 2016-2020 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

/* memset is implemented as:
   1. Use overlapping store to avoid branch.
   2. If size is less than VEC, use integer register stores.
   3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
   4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
   5. If size is more than 4 * VEC_SIZE, align to 4 * VEC_SIZE with
      4 VEC stores and store 4 * VEC at a time until done.  */
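
/* Illustration of the overlapping-store trick in step 1, not part of
   the build: with VEC_SIZE == 16 and a 24-byte memset, the pair of
   unaligned stores used below,

	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)

   writes bytes [0, 16) and [8, 24); the middle bytes are simply
   written twice, so no branch on the exact length is needed.  */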

#include <sysdep.h>

#ifndef MEMSET_CHK_SYMBOL
# define MEMSET_CHK_SYMBOL(p,s) MEMSET_SYMBOL(p, s)
#endif

#ifndef WMEMSET_CHK_SYMBOL
# define WMEMSET_CHK_SYMBOL(p,s) WMEMSET_SYMBOL(p, s)
#endif

#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER vzeroupper
# else
#  define VZEROUPPER
# endif
#endif

#ifndef VZEROUPPER_SHORT_RETURN
# if VEC_SIZE > 16
#  define VZEROUPPER_SHORT_RETURN vzeroupper
# else
#  define VZEROUPPER_SHORT_RETURN rep
# endif
#endif
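
/* Note: when VEC_SIZE <= 16 no vzeroupper is needed, so
   VZEROUPPER_SHORT_RETURN expands to a bare "rep" prefix and the
   following "ret" becomes the two-byte "rep ret" idiom, historically
   recommended to avoid a branch-prediction penalty on some AMD
   processors when a ret is reached directly from a branch.  */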

#ifndef MOVQ
# if VEC_SIZE > 16
#  define MOVQ vmovq
# else
#  define MOVQ movq
# endif
#endif
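
/* The VEX-encoded vmovq is used when VEC_SIZE > 16 so that the 64-bit
   extract from %xmm0 below does not mix legacy SSE instructions into
   AVX code, which could otherwise incur SSE/AVX transition penalties
   on some processors.  */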

/* Threshold to use Enhanced REP STOSB.  Since there is overhead to set
   up REP STOSB operation, REP STOSB isn't faster on short data.  The
   memset micro benchmark in glibc shows that 2KB is the approximate
   value above which REP STOSB becomes faster on processors with
   Enhanced REP STOSB.  Since the stored value is fixed, larger register
   size has minimal impact on threshold.  */
#ifndef REP_STOSB_THRESHOLD
# define REP_STOSB_THRESHOLD 2048
#endif
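
/* Sizes above REP_STOSB_THRESHOLD take the L(stosb) path in the
   unaligned_erms variants below; smaller sizes stay on the vector
   store paths.  */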

#ifndef SECTION
# error SECTION is not defined!
#endif

	.section SECTION(.text),"ax",@progbits
#if VEC_SIZE == 16 && IS_IN (libc)
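/* __bzero (dst, n): put the return value and the byte count where the
   shared memset code expects them, zero the fill pattern (%xmm0 is
   VEC(0) in this VEC_SIZE == 16 build), and branch to the common
   path.  */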
ENTRY (__bzero)
	mov	%RDI_LP, %RAX_LP /* Set return value.  */
	mov	%RSI_LP, %RDX_LP /* Set n.  */
	pxor	%xmm0, %xmm0
	jmp	L(entry_from_bzero)
END (__bzero)
weak_alias (__bzero, bzero)
#endif

#if IS_IN (libc)
# if defined SHARED
ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
# endif

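/* __wmemset (dst, wc, n): convert the wchar_t count in %rdx into a
   byte count, broadcast the 32-bit value in %esi into VEC(0), set the
   return value, and share the rest of the code with memset.  */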
ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
	shl	$2, %RDX_LP
	WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
	jmp	L(entry_from_bzero)
END (WMEMSET_SYMBOL (__wmemset, unaligned))
#endif

#if defined SHARED && IS_IN (libc)
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
#endif

ENTRY (MEMSET_SYMBOL (__memset, unaligned))
	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	mov	%edx, %edx
# endif
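/* Common entry: VEC(0) holds the replicated fill pattern, %rax the
   return value (dst), %rdi the destination and %rdx the length in
   bytes.  Sizes in [VEC_SIZE, 2 * VEC_SIZE] are handled here with a
   pair of potentially overlapping unaligned stores.  */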
L(entry_from_bzero):
	cmpq	$VEC_SIZE, %rdx
	jb	L(less_vec)
	cmpq	$(VEC_SIZE * 2), %rdx
	ja	L(more_2x_vec)
	/* From VEC_SIZE to 2 * VEC_SIZE.  No branch when size == VEC_SIZE.  */
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), (%rdi)
	VZEROUPPER
	ret
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMSET_SYMBOL (__memset, unaligned))

# if VEC_SIZE == 16
ENTRY (__memset_chk_erms)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__memset_chk_erms)

/* Only used to measure performance of REP STOSB.  */
ENTRY (__memset_erms)
	/* Skip zero length.  */
	test	%RDX_LP, %RDX_LP
	jnz	L(stosb)
	movq	%rdi, %rax
	ret
# else
/* Provide a hidden symbol to the debugger.  */
	.hidden	MEMSET_SYMBOL (__memset, erms)
ENTRY (MEMSET_SYMBOL (__memset, erms))
# endif
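/* REP STOSB path: %rcx gets the byte count, %al the fill byte, and
   %rdx preserves dst across the string operation (rep stosb advances
   %rdi) so it can be returned in %rax.  */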
L(stosb):
	/* Issue vzeroupper before rep stosb.  */
	VZEROUPPER
	mov	%RDX_LP, %RCX_LP
	movzbl	%sil, %eax
	mov	%RDI_LP, %RDX_LP
	rep stosb
	mov	%RDX_LP, %RAX_LP
	ret
# if VEC_SIZE == 16
END (__memset_erms)
# else
END (MEMSET_SYMBOL (__memset, erms))
# endif

# if defined SHARED && IS_IN (libc)
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
# endif

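/* Same as the unaligned variant above, except that sizes larger than
   2 * VEC_SIZE also compare against REP_STOSB_THRESHOLD and use
   REP STOSB when they exceed it.  */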
ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	mov	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(stosb_more_2x_vec)
	/* From VEC_SIZE to 2 * VEC_SIZE.  No branch when size == VEC_SIZE.  */
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), (%rdi)
	VZEROUPPER
	ret

L(stosb_more_2x_vec):
	cmpq	$REP_STOSB_THRESHOLD, %rdx
	ja	L(stosb)
#endif
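/* Sizes in (2 * VEC_SIZE, 4 * VEC_SIZE]: two stores from the start and
   two from the end; the stores may overlap in the middle.  */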
L(more_2x_vec):
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_start)
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(0), VEC_SIZE(%rdi)
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
L(return):
	VZEROUPPER
	ret

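/* Sizes above 4 * VEC_SIZE: store 4 VEC at the start and 4 VEC at the
   end with unaligned stores, then set %rcx to dst + 4 * VEC_SIZE
   rounded down to a 4 * VEC_SIZE boundary and %rdx to the end address
   rounded down likewise, and fill everything in between with aligned
   4 * VEC stores per iteration.  If the rounded start and end
   coincide, the head and tail stores already covered the buffer.  */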
L(loop_start):
	leaq	(VEC_SIZE * 4)(%rdi), %rcx
	VMOVU	%VEC(0), (%rdi)
	andq	$-(VEC_SIZE * 4), %rcx
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), VEC_SIZE(%rdi)
	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rdi)
	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx)
	addq	%rdi, %rdx
	andq	$-(VEC_SIZE * 4), %rdx
	cmpq	%rdx, %rcx
	je	L(return)
L(loop):
	VMOVA	%VEC(0), (%rcx)
	VMOVA	%VEC(0), VEC_SIZE(%rcx)
	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rcx)
	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rcx)
	addq	$(VEC_SIZE * 4), %rcx
	cmpq	%rcx, %rdx
	jne	L(loop)
	VZEROUPPER_SHORT_RETURN
	ret
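/* Sizes below VEC_SIZE: branch on the size class; most classes use a
   pair of potentially overlapping first/last stores of the right
   width.  MOVQ extracts the low 8 bytes of the pattern into %rcx for
   the integer register stores.  */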
L(less_vec):
	/* Less than 1 VEC.  */
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
#  error Unsupported VEC_SIZE!
# endif
# if VEC_SIZE > 32
	cmpb	$32, %dl
	jae	L(between_32_63)
# endif
# if VEC_SIZE > 16
	cmpb	$16, %dl
	jae	L(between_16_31)
# endif
	MOVQ	%xmm0, %rcx
	cmpb	$8, %dl
	jae	L(between_8_15)
	cmpb	$4, %dl
	jae	L(between_4_7)
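	/* Size 0 to 3: ja (size > 1) goes to the 2-to-3 byte case, jb
	   (size == 0) skips the store, and the fall-through stores the
	   single byte.  */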
	cmpb	$1, %dl
	ja	L(between_2_3)
	jb	1f
	movb	%cl, (%rdi)
1:
	VZEROUPPER
	ret
# if VEC_SIZE > 32
	/* From 32 to 63.  No branch when size == 32.  */
L(between_32_63):
	vmovdqu	%ymm0, -32(%rdi,%rdx)
	vmovdqu	%ymm0, (%rdi)
	VZEROUPPER
	ret
# endif
# if VEC_SIZE > 16
	/* From 16 to 31.  No branch when size == 16.  */
L(between_16_31):
	vmovdqu	%xmm0, -16(%rdi,%rdx)
	vmovdqu	%xmm0, (%rdi)
	VZEROUPPER
	ret
# endif
	/* From 8 to 15.  No branch when size == 8.  */
L(between_8_15):
	movq	%rcx, -8(%rdi,%rdx)
	movq	%rcx, (%rdi)
	VZEROUPPER
	ret
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
	movl	%ecx, -4(%rdi,%rdx)
	movl	%ecx, (%rdi)
	VZEROUPPER
	ret
L(between_2_3):
	/* From 2 to 3.  No branch when size == 2.  */
	movw	%cx, -2(%rdi,%rdx)
	movw	%cx, (%rdi)
	VZEROUPPER
	ret
END (MEMSET_SYMBOL (__memset, unaligned_erms))