/* memset/bzero with unaligned store and rep stosb
   Copyright (C) 2016 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

/* memset is implemented as:
   1. Use overlapping stores to avoid branches.
   2. If size is less than VEC_SIZE, use integer register stores.
   3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
   4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
   5. If size is more than 4 * VEC_SIZE, align to 4 * VEC_SIZE with
      4 VEC stores and store 4 * VEC_SIZE at a time until done.  */

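/* A rough C-level sketch of the dispatch above, for illustration only.
   vec_store and small_memset are hypothetical helpers, not part of this
   file: vec_store stands for one unaligned VEC_SIZE-byte vector store of
   the replicated byte, small_memset for the integer-register path.

     void *
     memset_sketch (void *s, int c, size_t n)
     {
       char *dst = s;
       if (n < VEC_SIZE)
         return small_memset (s, c, n);      // integer register stores
       if (n <= 2 * VEC_SIZE)
         {
           vec_store (dst, c);                      // head
           vec_store (dst + n - VEC_SIZE, c);       // tail, may overlap head
           return s;
         }
       if (n <= 4 * VEC_SIZE)
         {
           vec_store (dst, c);
           vec_store (dst + VEC_SIZE, c);
           vec_store (dst + n - 2 * VEC_SIZE, c);
           vec_store (dst + n - VEC_SIZE, c);
           return s;
         }
       // n > 4 * VEC_SIZE: unaligned stores cover the first and last
       // 4 * VEC_SIZE bytes; the loop fills the aligned middle.
       for (int i = 0; i < 4; i++)
         {
           vec_store (dst + i * VEC_SIZE, c);
           vec_store (dst + n - (i + 1) * VEC_SIZE, c);
         }
       char *cur = (char *) (((uintptr_t) dst + 4 * VEC_SIZE)
                             & -(uintptr_t) (4 * VEC_SIZE));
       char *end = (char *) (((uintptr_t) dst + n)
                             & -(uintptr_t) (4 * VEC_SIZE));
       for (; cur != end; cur += 4 * VEC_SIZE)
         for (int i = 0; i < 4; i++)
           vec_store (cur + i * VEC_SIZE, c);       // aligned stores
       return s;
     }  */
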
#include <sysdep.h>

#ifndef MEMSET_CHK_SYMBOL
# define MEMSET_CHK_SYMBOL(p,s)	MEMSET_SYMBOL(p, s)
#endif

#ifndef VZEROUPPER
# if VEC_SIZE > 16
#  define VZEROUPPER	vzeroupper
# else
#  define VZEROUPPER
# endif
#endif

#ifndef VZEROUPPER_SHORT_RETURN
# if VEC_SIZE > 16
#  define VZEROUPPER_SHORT_RETURN	vzeroupper
# else
#  define VZEROUPPER_SHORT_RETURN	rep
# endif
#endif
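/* Note that when VEC_SIZE <= 16, VZEROUPPER_SHORT_RETURN expands to a
   bare "rep" prefix so that the "ret" that follows it assembles as
   "rep ret", a form commonly used to avoid a branch-prediction penalty
   on some AMD processors when a return instruction is a branch target.  */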

#ifndef MOVQ
# if VEC_SIZE > 16
#  define MOVQ	vmovq
# else
#  define MOVQ	movq
# endif
#endif

/* Threshold to use Enhanced REP STOSB.  Since there is overhead to set
   up the REP STOSB operation, REP STOSB isn't faster on short data.  The
   memset micro benchmark in glibc shows that 2KB is the approximate
   value above which REP STOSB becomes faster than vector stores on
   processors with Enhanced REP STOSB.  Since the stored value is fixed,
   larger register size has minimal impact on the threshold.  */
#ifndef REP_STOSB_THRESHOLD
# define REP_STOSB_THRESHOLD	2048
#endif
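/* Because of the #ifndef guard above, a file that includes this one can
   define a processor-specific threshold before the #include, e.g. (the
   value here is purely illustrative):

     #define REP_STOSB_THRESHOLD	(4 * 1024)
*/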

#ifndef SECTION
# error SECTION is not defined!
#endif

	.section SECTION(.text),"ax",@progbits
#if VEC_SIZE == 16 && IS_IN (libc)
ENTRY (__bzero)
	mov	%RDI_LP, %RAX_LP /* Set return value.  */
	mov	%RSI_LP, %RDX_LP /* Set n.  */
	pxor	%xmm0, %xmm0
	jmp	L(entry_from_bzero)
END (__bzero)
weak_alias (__bzero, bzero)
#endif

#if defined SHARED && IS_IN (libc)
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
#endif

ENTRY (MEMSET_SYMBOL (__memset, unaligned))
L(memset_entry):
	VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
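	/* VDUP_TO_VEC0_AND_SET_RETURN is supplied by the including file;
	   as the name suggests, it broadcasts the memset byte in %esi
	   into %VEC(0) and sets up %rax from %rdi as the return value.  */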
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	mov	%edx, %edx
# endif
L(entry_from_bzero):
	cmpq	$VEC_SIZE, %rdx
	jb	L(less_vec)
	cmpq	$(VEC_SIZE * 2), %rdx
	ja	L(more_2x_vec)
	/* From VEC_SIZE to 2 * VEC_SIZE.  No branch when size == VEC_SIZE.  */
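	/* For example, with VEC_SIZE == 32 and size == 40 the first store
	   covers bytes [8, 40) and the second covers [0, 32); together
	   they fill all 40 bytes, with the overlap written twice, so no
	   extra branch on the exact size is needed.  */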
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), (%rdi)
	VZEROUPPER
	ret
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMSET_SYMBOL (__memset, unaligned))

# if VEC_SIZE == 16
/* Only used to measure performance of REP STOSB.  */
ENTRY (__memset_erms)
# else
/* Provide a symbol to the debugger.  */
ENTRY (MEMSET_SYMBOL (__memset, erms))
# endif
L(stosb):
	/* Issue vzeroupper before rep stosb.  */
	VZEROUPPER
	mov	%RDX_LP, %RCX_LP	/* Set count for rep stosb.  */
	movzbl	%sil, %eax	/* Byte to store in %al.  */
	mov	%RDI_LP, %RDX_LP	/* Save destination for return value.  */
	rep stosb
	mov	%RDX_LP, %RAX_LP	/* Return the original destination.  */
	ret
# if VEC_SIZE == 16
END (__memset_erms)
# else
END (MEMSET_SYMBOL (__memset, erms))
# endif

# if defined SHARED && IS_IN (libc)
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
	jb	HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
# endif

ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
	VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
# ifdef __ILP32__
	/* Clear the upper 32 bits.  */
	mov	%edx, %edx
# endif
	cmp	$VEC_SIZE, %RDX_LP
	jb	L(less_vec)
	cmp	$(VEC_SIZE * 2), %RDX_LP
	ja	L(stosb_more_2x_vec)
	/* From VEC_SIZE to 2 * VEC_SIZE.  No branch when size == VEC_SIZE.  */
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), (%rdi)
	VZEROUPPER
	ret

L(stosb_more_2x_vec):
	cmpq	$REP_STOSB_THRESHOLD, %rdx
	ja	L(stosb)
#endif
L(more_2x_vec):
	cmpq	$(VEC_SIZE * 4), %rdx
	ja	L(loop_start)
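	/* 2 * VEC_SIZE < size <= 4 * VEC_SIZE: two stores from the start
	   and two from the end cover the whole buffer, overlapping in the
	   middle when size < 4 * VEC_SIZE.  */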
	VMOVU	%VEC(0), (%rdi)
	VMOVU	%VEC(0), VEC_SIZE(%rdi)
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
L(return):
	VZEROUPPER
	ret

L(loop_start):
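	/* More than 4 * VEC_SIZE bytes: store the first and last
	   4 * VEC_SIZE with unaligned stores, then loop over the
	   4 * VEC_SIZE-aligned middle.  %rcx is rounded up to the first
	   4 * VEC_SIZE boundary above %rdi and %rdx is rounded down to
	   the last boundary at or below the end, so the edges the loop
	   skips are already covered by the unaligned stores.  */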
	leaq	(VEC_SIZE * 4)(%rdi), %rcx
	VMOVU	%VEC(0), (%rdi)
	andq	$-(VEC_SIZE * 4), %rcx
	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
	VMOVU	%VEC(0), VEC_SIZE(%rdi)
	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rdi)
	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx)
	addq	%rdi, %rdx
	andq	$-(VEC_SIZE * 4), %rdx
	cmpq	%rdx, %rcx
	je	L(return)
L(loop):
	VMOVA	%VEC(0), (%rcx)
	VMOVA	%VEC(0), VEC_SIZE(%rcx)
	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rcx)
	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rcx)
	addq	$(VEC_SIZE * 4), %rcx
	cmpq	%rcx, %rdx
	jne	L(loop)
	VZEROUPPER_SHORT_RETURN
	ret
L(less_vec):
	/* Less than 1 VEC.  */
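	/* Dispatch on the size byte in %dl and handle each range with at
	   most two possibly-overlapping stores of the largest width that
	   fits; MOVQ copies the replicated byte pattern from %xmm0 into
	   %rcx for the 8/4/2/1-byte cases.  */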
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
#  error Unsupported VEC_SIZE!
# endif
# if VEC_SIZE > 32
	cmpb	$32, %dl
	jae	L(between_32_63)
# endif
# if VEC_SIZE > 16
	cmpb	$16, %dl
	jae	L(between_16_31)
# endif
	MOVQ	%xmm0, %rcx
	cmpb	$8, %dl
	jae	L(between_8_15)
	cmpb	$4, %dl
	jae	L(between_4_7)
	cmpb	$1, %dl
	ja	L(between_2_3)
	jb	1f			/* Size 0: nothing to store.  */
	movb	%cl, (%rdi)		/* Size 1: store a single byte.  */
1:
	VZEROUPPER
	ret
# if VEC_SIZE > 32
	/* From 32 to 63.  No branch when size == 32.  */
L(between_32_63):
	vmovdqu	%ymm0, -32(%rdi,%rdx)
	vmovdqu	%ymm0, (%rdi)
	VZEROUPPER
	ret
# endif
# if VEC_SIZE > 16
	/* From 16 to 31.  No branch when size == 16.  */
L(between_16_31):
	vmovdqu	%xmm0, -16(%rdi,%rdx)
	vmovdqu	%xmm0, (%rdi)
	VZEROUPPER
	ret
# endif
	/* From 8 to 15.  No branch when size == 8.  */
L(between_8_15):
	movq	%rcx, -8(%rdi,%rdx)
	movq	%rcx, (%rdi)
	VZEROUPPER
	ret
L(between_4_7):
	/* From 4 to 7.  No branch when size == 4.  */
	movl	%ecx, -4(%rdi,%rdx)
	movl	%ecx, (%rdi)
	VZEROUPPER
	ret
L(between_2_3):
	/* From 2 to 3.  No branch when size == 2.  */
	movw	%cx, -2(%rdi,%rdx)
	movw	%cx, (%rdi)
	VZEROUPPER
	ret
END (MEMSET_SYMBOL (__memset, unaligned_erms))