/* memset/bzero with unaligned store and rep stosb
   Copyright (C) 2016-2018 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

/* memset is implemented as:
   1. Use overlapping stores to avoid branches.
   2. If size is less than VEC_SIZE, use integer register stores.
   3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
   4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
   5. If size is more than 4 * VEC_SIZE, align to 4 * VEC_SIZE with
      4 VEC stores and store 4 * VEC at a time until done.  */

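/* For illustration only, the dispatch above corresponds roughly to the
   following C sketch.  The helper names (int_store_overlap, vec_store,
   store_4_overlapping_vecs, aligned_4x_vec_loop) are hypothetical and do
   not exist in glibc; the real code below is hand-written,
   branch-minimized assembly.

     if (n < VEC_SIZE)
       int_store_overlap (dst, byte, n);           // step 2, L(less_vec)
     else if (n <= 2 * VEC_SIZE)
       {                                           // step 3
         vec_store (dst, vec0);                    // head
         vec_store (dst + n - VEC_SIZE, vec0);     // tail, may overlap head
       }
     else if (n <= 4 * VEC_SIZE)
       store_4_overlapping_vecs (dst, n);          // step 4, L(more_2x_vec)
     else
       aligned_4x_vec_loop (dst, n);               // step 5, L(loop_start)
*/
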
#include <sysdep.h>

#ifndef MEMSET_CHK_SYMBOL
# define MEMSET_CHK_SYMBOL(p,s)		MEMSET_SYMBOL(p, s)
#endif

#ifndef WMEMSET_CHK_SYMBOL
# define WMEMSET_CHK_SYMBOL(p,s)	WMEMSET_SYMBOL(p, s)
#endif

#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER			vzeroupper
# else
#  define VZEROUPPER
# endif
#endif

#ifndef VZEROUPPER_SHORT_RETURN
# if VEC_SIZE > 16
#  define VZEROUPPER_SHORT_RETURN	vzeroupper
# else
/* Expanding to "rep" turns the following "ret" into a 2-byte "rep ret",
   which avoids a branch-prediction penalty on some AMD processors when
   the return follows a conditional branch.  */
#  define VZEROUPPER_SHORT_RETURN	rep
# endif
#endif

#ifndef MOVQ
# if VEC_SIZE > 16
#  define MOVQ				vmovq
# else
#  define MOVQ				movq
# endif
#endif

/* Threshold to use Enhanced REP STOSB.  Since there is overhead to set
   up the REP STOSB operation, REP STOSB isn't faster on short data.  The
   memset micro benchmark in glibc shows that 2KB is the approximate
   value above which REP STOSB becomes faster than the vector loop on
   processors with Enhanced REP STOSB.  Since the stored value is fixed,
   a larger register size has minimal impact on the threshold.  */
#ifndef REP_STOSB_THRESHOLD
# define REP_STOSB_THRESHOLD		2048
#endif
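
/* Roughly, the threshold gates the ERMS path like this (illustrative
   sketch of L(stosb_more_2x_vec) below, not real code; rep_stosb and
   vector_store_paths are hypothetical names):

     if (n > REP_STOSB_THRESHOLD)
       rep_stosb (dst, byte, n);        // the "rep stosb" path
     else
       vector_store_paths (dst, n);     // 2x/4x VEC and aligned loop
*/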

#ifndef SECTION
# error SECTION is not defined!
#endif

	.section SECTION(.text),"ax",@progbits
#if VEC_SIZE == 16 && IS_IN (libc)
ENTRY (__bzero)
	mov	%RDI_LP, %RAX_LP /* Set return value.  */
	mov	%RSI_LP, %RDX_LP /* Set n.  */
	pxor	%xmm0, %xmm0
	jmp	L(entry_from_bzero)
END (__bzero)
weak_alias (__bzero, bzero)
#endif

#if IS_IN (libc)
# if defined SHARED
ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
# endif

ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
	shl	$2, %RDX_LP
	WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
	jmp	L(entry_from_bzero)
END (WMEMSET_SYMBOL (__wmemset, unaligned))
#endif

#if defined SHARED && IS_IN (libc)
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
#endif

ENTRY (MEMSET_SYMBOL (__memset, unaligned))
	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	mov	%edx, %edx
# endif
L(entry_from_bzero):
	cmpq	$VEC_SIZE, %rdx
	jb	L(less_vec)
	cmpq	$(VEC_SIZE * 2), %rdx
	ja	L(more_2x_vec)
	/* From VEC_SIZE to 2 * VEC_SIZE.  The two stores overlap unless
	   size == 2 * VEC_SIZE; no branch is needed, even when
	   size == VEC_SIZE.  */
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), (%rdi)
	VZEROUPPER
	ret
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMSET_SYMBOL (__memset, unaligned))

# if VEC_SIZE == 16
ENTRY (__memset_chk_erms)
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END (__memset_chk_erms)

/* Only used to measure performance of REP STOSB.  */
ENTRY (__memset_erms)
# else
/* Provide a symbol to the debugger.  */
ENTRY (MEMSET_SYMBOL (__memset, erms))
# endif
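/* "rep stosb" stores %al at (%rdi), %rcx times, advancing %rdi as it
   goes, so the original destination is saved in %rdx first and moved
   back into %rax (the return value) once the fill is done.  */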
L(stosb):
	/* Issue vzeroupper before rep stosb.  */
	VZEROUPPER
	mov	%RDX_LP, %RCX_LP
	movzbl	%sil, %eax
	mov	%RDI_LP, %RDX_LP
	rep stosb
	mov	%RDX_LP, %RAX_LP
	ret
# if VEC_SIZE == 16
END (__memset_erms)
# else
END (MEMSET_SYMBOL (__memset, erms))
# endif

# if defined SHARED && IS_IN (libc)
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
# endif

ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	mov	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(stosb_more_2x_vec)
	/* From VEC_SIZE to 2 * VEC_SIZE.  The two stores overlap unless
	   size == 2 * VEC_SIZE; no branch is needed, even when
	   size == VEC_SIZE.  */
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), (%rdi)
	VZEROUPPER
	ret

L(stosb_more_2x_vec):
	cmpq	$REP_STOSB_THRESHOLD, %rdx
	ja	L(stosb)
#endif
L(more_2x_vec):
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_start)
	/* More than 2 * VEC_SIZE and at most 4 * VEC_SIZE: two head and
	   two tail VEC stores, overlapping in the middle.  */
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(0), VEC_SIZE(%rdi)
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
L(return):
	VZEROUPPER
	ret

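/* More than 4 * VEC_SIZE bytes: store the first 4 * VEC_SIZE and the
   last 4 * VEC_SIZE bytes with unaligned VEC stores, then fill the
   4 * VEC_SIZE-aligned region in between (%rcx rounded up from %rdi,
   %rdx rounded down from the end) with aligned stores in L(loop).  */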
L(loop_start):
	leaq	(VEC_SIZE * 4)(%rdi), %rcx
	VMOVU	%VEC(0), (%rdi)
	andq	$-(VEC_SIZE * 4), %rcx
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), VEC_SIZE(%rdi)
	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rdi)
	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx)
	addq	%rdi, %rdx
	andq	$-(VEC_SIZE * 4), %rdx
	cmpq	%rdx, %rcx
	je	L(return)
L(loop):
	VMOVA	%VEC(0), (%rcx)
	VMOVA	%VEC(0), VEC_SIZE(%rcx)
	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rcx)
	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rcx)
	addq	$(VEC_SIZE * 4), %rcx
	cmpq	%rcx, %rdx
	jne	L(loop)
	VZEROUPPER_SHORT_RETURN
	ret
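/* The sub-VEC_SIZE cases below all use the same overlapping-store idea;
   roughly, in C terms (illustrative sketch only, store8/store4/store2
   are hypothetical helpers standing in for movq/movl/movw):

     if (n >= 8)      { store8 (dst, pat); store8 (dst + n - 8, pat); }
     else if (n >= 4) { store4 (dst, pat); store4 (dst + n - 4, pat); }
     else if (n >= 2) { store2 (dst, pat); store2 (dst + n - 2, pat); }
     else if (n == 1) *dst = pat;

   with additional 16- and 32-byte cases when VEC_SIZE is larger.  */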
L(less_vec):
	/* Less than 1 VEC.  */
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
#  error Unsupported VEC_SIZE!
# endif
# if VEC_SIZE > 32
	cmpb	$32, %dl
	jae	L(between_32_63)
# endif
# if VEC_SIZE > 16
	cmpb	$16, %dl
	jae	L(between_16_31)
# endif
	/* Copy the low 8 bytes of VEC(0) to %rcx for the scalar stores
	   below.  */
	MOVQ	%xmm0, %rcx
	cmpb	$8, %dl
	jae	L(between_8_15)
	cmpb	$4, %dl
	jae	L(between_4_7)
	cmpb	$1, %dl
	ja	L(between_2_3)
	jb	1f
	/* Exactly 1 byte.  */
	movb	%cl, (%rdi)
1:
	VZEROUPPER
	ret
# if VEC_SIZE > 32
	/* From 32 to 63.  No branch when size == 32.  */
L(between_32_63):
	vmovdqu	%ymm0, -32(%rdi,%rdx)
	vmovdqu	%ymm0, (%rdi)
	VZEROUPPER
	ret
# endif
# if VEC_SIZE > 16
	/* From 16 to 31.  No branch when size == 16.  */
L(between_16_31):
	vmovdqu	%xmm0, -16(%rdi,%rdx)
	vmovdqu	%xmm0, (%rdi)
	VZEROUPPER
	ret
# endif
	/* From 8 to 15.  No branch when size == 8.  */
L(between_8_15):
	movq	%rcx, -8(%rdi,%rdx)
	movq	%rcx, (%rdi)
	VZEROUPPER
	ret
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
	movl	%ecx, -4(%rdi,%rdx)
	movl	%ecx, (%rdi)
	VZEROUPPER
	ret
L(between_2_3):
	/* From 2 to 3.  No branch when size == 2.  */
	movw	%cx, -2(%rdi,%rdx)
	movw	%cx, (%rdi)
	VZEROUPPER
	ret
END (MEMSET_SYMBOL (__memset, unaligned_erms))