/* Initialize CPU feature data.
   This file is part of the GNU C Library.
   Copyright (C) 2008-2016 Free Software Foundation, Inc.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <cpuid.h>
#include <cpu-features.h>
#include <libc-internal.h>

static inline void
get_common_indeces (struct cpu_features *cpu_features,
                    unsigned int *family, unsigned int *model,
                    unsigned int *extended_model)
{
  unsigned int eax;
  __cpuid (1, eax, cpu_features->cpuid[COMMON_CPUID_INDEX_1].ebx,
           cpu_features->cpuid[COMMON_CPUID_INDEX_1].ecx,
           cpu_features->cpuid[COMMON_CPUID_INDEX_1].edx);
  GLRO(dl_x86_cpu_features).cpuid[COMMON_CPUID_INDEX_1].eax = eax;
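  /* CPUID leaf 1 returns the processor signature in EAX: bits 4-7 hold
     the model, bits 8-11 the family, bits 16-19 the extended model and
     bits 20-27 the extended family.  The extended model is kept
     pre-shifted (already multiplied by 0x10) so that it can simply be
     added to the base model.  */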
  *family = (eax >> 8) & 0x0f;
  *model = (eax >> 4) & 0x0f;
  *extended_model = (eax >> 12) & 0xf0;
  if (*family == 0x0f)
    {
      *family += (eax >> 20) & 0xff;
      *model += *extended_model;
    }
}

static inline void
init_cpu_features (struct cpu_features *cpu_features)
{
  unsigned int ebx, ecx, edx;
  unsigned int family = 0;
  unsigned int model = 0;
  enum cpu_features_kind kind;

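  /* If the target cannot assume that the CPUID instruction is present
     (HAS_CPUID is 0), probe for it at run time and fall back to generic
     settings when it is unavailable.  */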
#if !HAS_CPUID
  if (__get_cpuid_max (0, 0) == 0)
    {
      kind = arch_kind_other;
      goto no_cpuid;
    }
#endif

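  /* Leaf 0 returns the highest supported standard CPUID leaf in EAX and
     the vendor identification string in EBX, EDX and ECX.  */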
  __cpuid (0, cpu_features->max_cpuid, ebx, ecx, edx);

  /* This spells out "GenuineIntel".  */
  if (ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69)
    {
      unsigned int extended_model;

      kind = arch_kind_intel;

      get_common_indeces (cpu_features, &family, &model, &extended_model);

      if (family == 0x06)
        {
          ecx = cpu_features->cpuid[COMMON_CPUID_INDEX_1].ecx;
          model += extended_model;
          switch (model)
            {
            case 0x1c:
            case 0x26:
              /* BSF is slow on Atom.  */
              cpu_features->feature[index_Slow_BSF] |= bit_Slow_BSF;
              break;

            case 0x57:
              /* Knights Landing.  Enable Silvermont optimizations.  */
              cpu_features->feature[index_Prefer_No_VZEROUPPER]
                |= bit_Prefer_No_VZEROUPPER;
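              /* Fall through: Knights Landing also uses the Silvermont
                 settings below.  */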

            case 0x37:
            case 0x4a:
            case 0x4d:
            case 0x5a:
            case 0x5d:
              /* Unaligned load versions are faster than SSSE3
                 on Silvermont.  */
#if index_Fast_Unaligned_Load != index_Prefer_PMINUB_for_stringop
# error index_Fast_Unaligned_Load != index_Prefer_PMINUB_for_stringop
#endif
#if index_Fast_Unaligned_Load != index_Slow_SSE4_2
# error index_Fast_Unaligned_Load != index_Slow_SSE4_2
#endif
              cpu_features->feature[index_Fast_Unaligned_Load]
                |= (bit_Fast_Unaligned_Load
                    | bit_Prefer_PMINUB_for_stringop
                    | bit_Slow_SSE4_2);
              break;

            default:
              /* Unknown family 0x06 processors.  Assume this is one of
                 the Core i3/i5/i7 processors if AVX is available.  */
              if ((ecx & bit_AVX) == 0)
                break;
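              /* Fall through: AVX-capable unknown models get the
                 Core i3/i5/i7 settings below.  */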

            case 0x1a:
            case 0x1e:
            case 0x1f:
            case 0x25:
            case 0x2c:
            case 0x2e:
            case 0x2f:
              /* Rep string instructions, copy backward, unaligned loads
                 and pminub are fast on Intel Core i3, i5 and i7.  */
#if index_Fast_Rep_String != index_Fast_Copy_Backward
# error index_Fast_Rep_String != index_Fast_Copy_Backward
#endif
#if index_Fast_Rep_String != index_Fast_Unaligned_Load
# error index_Fast_Rep_String != index_Fast_Unaligned_Load
#endif
#if index_Fast_Rep_String != index_Prefer_PMINUB_for_stringop
# error index_Fast_Rep_String != index_Prefer_PMINUB_for_stringop
#endif
              cpu_features->feature[index_Fast_Rep_String]
                |= (bit_Fast_Rep_String
                    | bit_Fast_Copy_Backward
                    | bit_Fast_Unaligned_Load
                    | bit_Prefer_PMINUB_for_stringop);
              break;
            }
        }
    }
  /* This spells out "AuthenticAMD".  */
  else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
    {
      unsigned int extended_model;

      kind = arch_kind_amd;

      get_common_indeces (cpu_features, &family, &model, &extended_model);

      ecx = cpu_features->cpuid[COMMON_CPUID_INDEX_1].ecx;

      unsigned int eax;
      __cpuid (0x80000000, eax, ebx, ecx, edx);
      if (eax >= 0x80000001)
        __cpuid (0x80000001,
                 cpu_features->cpuid[COMMON_CPUID_INDEX_80000001].eax,
                 cpu_features->cpuid[COMMON_CPUID_INDEX_80000001].ebx,
                 cpu_features->cpuid[COMMON_CPUID_INDEX_80000001].ecx,
                 cpu_features->cpuid[COMMON_CPUID_INDEX_80000001].edx);

      if (family == 0x15)
        {
          /* "Excavator"  */
          if (model >= 0x60 && model <= 0x7f)
            cpu_features->feature[index_Fast_Unaligned_Load]
              |= bit_Fast_Unaligned_Load;
        }
    }
  else
    kind = arch_kind_other;

  /* Support i586 if CX8 is available.  */
  if (HAS_CPU_FEATURE (CX8))
    cpu_features->feature[index_I586] |= bit_I586;

  /* Support i686 if CMOV is available.  */
  if (HAS_CPU_FEATURE (CMOV))
    cpu_features->feature[index_I686] |= bit_I686;

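  /* CPUID leaf 7 sub-leaf 0 reports the structured extended feature
     flags (AVX2, AVX512F, AVX512DQ, ...) used by the checks below.  */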
  if (cpu_features->max_cpuid >= 7)
    __cpuid_count (7, 0,
                   cpu_features->cpuid[COMMON_CPUID_INDEX_7].eax,
                   cpu_features->cpuid[COMMON_CPUID_INDEX_7].ebx,
                   cpu_features->cpuid[COMMON_CPUID_INDEX_7].ecx,
                   cpu_features->cpuid[COMMON_CPUID_INDEX_7].edx);

  /* Can we call xgetbv?  */
  if (HAS_CPU_FEATURE (OSXSAVE))
    {
      unsigned int xcrlow;
      unsigned int xcrhigh;
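      /* XGETBV with ECX = 0 reads the XCR0 extended control register,
         which reports the state components the OS has enabled.  */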
      asm ("xgetbv" : "=a" (xcrlow), "=d" (xcrhigh) : "c" (0));
      /* Are the YMM and XMM states usable?  */
      if ((xcrlow & (bit_YMM_state | bit_XMM_state)) ==
          (bit_YMM_state | bit_XMM_state))
        {
          /* Determine if AVX is usable.  */
          if (HAS_CPU_FEATURE (AVX))
            cpu_features->feature[index_AVX_Usable] |= bit_AVX_Usable;
#if index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
# error index_AVX2_Usable != index_AVX_Fast_Unaligned_Load
#endif
          /* Determine if AVX2 is usable.  Unaligned loads with 256-bit
             AVX registers are faster on processors with AVX2.  */
          if (HAS_CPU_FEATURE (AVX2))
            cpu_features->feature[index_AVX2_Usable]
              |= bit_AVX2_Usable | bit_AVX_Fast_Unaligned_Load;
          /* Check if the OPMASK state, the upper 256 bits of ZMM0-ZMM15
             and the ZMM16-ZMM31 state are enabled.  */
          if ((xcrlow & (bit_Opmask_state | bit_ZMM0_15_state
                         | bit_ZMM16_31_state)) ==
              (bit_Opmask_state | bit_ZMM0_15_state | bit_ZMM16_31_state))
            {
              /* Determine if AVX512F is usable.  */
              if (HAS_CPU_FEATURE (AVX512F))
                {
                  cpu_features->feature[index_AVX512F_Usable]
                    |= bit_AVX512F_Usable;
                  /* Determine if AVX512DQ is usable.  */
                  if (HAS_CPU_FEATURE (AVX512DQ))
                    cpu_features->feature[index_AVX512DQ_Usable]
                      |= bit_AVX512DQ_Usable;
                }
            }
          /* Determine if FMA is usable.  */
          if (HAS_CPU_FEATURE (FMA))
            cpu_features->feature[index_FMA_Usable] |= bit_FMA_Usable;
          /* Determine if FMA4 is usable.  */
          if (HAS_CPU_FEATURE (FMA4))
            cpu_features->feature[index_FMA4_Usable] |= bit_FMA4_Usable;
        }

      /* For _dl_runtime_resolve, set xsave_state_size to the xsave area
         size + integer register save size and align it to 64 bytes.  */
      if (cpu_features->max_cpuid >= 0xd)
        {
          unsigned int eax, ebx, ecx, edx;

          __cpuid_count (0xd, 0, eax, ebx, ecx, edx);
          if (ebx != 0)
            {
              cpu_features->xsave_state_size
                = ALIGN_UP (ebx + STATE_SAVE_OFFSET, 64);

              __cpuid_count (0xd, 1, eax, ebx, ecx, edx);

              /* Check if XSAVEC is available.  */
              if ((eax & (1 << 1)) != 0)
                {
                  unsigned int xstate_comp_offsets[32];
                  unsigned int xstate_comp_sizes[32];
                  unsigned int i;

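                  /* The 512-byte legacy region (x87 state at offset 0,
                     SSE state at offset 160) is followed by the 64-byte
                     XSAVE header, so compacted extended components start
                     at offset 576.  */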
                  xstate_comp_offsets[0] = 0;
                  xstate_comp_offsets[1] = 160;
                  xstate_comp_offsets[2] = 576;
                  xstate_comp_sizes[0] = 160;
                  xstate_comp_sizes[1] = 256;

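                  /* In the compacted format each enabled component is
                     stored directly after the previous one, aligned to
                     64 bytes when ECX bit 1 of its CPUID sub-leaf is
                     set.  */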
                  for (i = 2; i < 32; i++)
                    {
                      if ((STATE_SAVE_MASK & (1 << i)) != 0)
                        {
                          __cpuid_count (0xd, i, eax, ebx, ecx, edx);
                          xstate_comp_sizes[i] = eax;
                        }
                      else
                        {
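                          /* Clear ECX so that the alignment check below
                             does not use a stale value for a component
                             that is not saved.  */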
                          ecx = 0;
                          xstate_comp_sizes[i] = 0;
                        }

                      if (i > 2)
                        {
                          xstate_comp_offsets[i]
                            = (xstate_comp_offsets[i - 1]
                               + xstate_comp_sizes[i - 1]);
                          if ((ecx & (1 << 1)) != 0)
                            xstate_comp_offsets[i]
                              = ALIGN_UP (xstate_comp_offsets[i], 64);
                        }
                    }

                  /* Use XSAVEC.  */
                  unsigned int size
                    = xstate_comp_offsets[31] + xstate_comp_sizes[31];
                  if (size)
                    {
                      cpu_features->xsave_state_size
                        = ALIGN_UP (size + STATE_SAVE_OFFSET, 64);
                      cpu_features->feature[index_XSAVEC_Usable]
                        |= bit_XSAVEC_Usable;
                    }
                }
            }
        }
    }

#if !HAS_CPUID
no_cpuid:
#endif

  cpu_features->family = family;
  cpu_features->model = model;
  cpu_features->kind = kind;
}