/* Initialize CPU feature data.
   This file is part of the GNU C Library.
   Copyright (C) 2008-2018 Free Software Foundation, Inc.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <cpuid.h>
#include <cpu-features.h>
#include <dl-hwcap.h>
#include <libc-pointer-arith.h>

#if HAVE_TUNABLES
# define TUNABLE_NAMESPACE tune
# include <unistd.h>  /* Get STDOUT_FILENO for _dl_printf.  */
# include <elf/dl-tunables.h>

extern void TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *)
  attribute_hidden;
#endif

static void
get_extended_indices (struct cpu_features *cpu_features)
{
  unsigned int eax, ebx, ecx, edx;
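  /* CPUID leaf 0x80000000 returns the highest supported extended leaf
     in EAX.  */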
  __cpuid (0x80000000, eax, ebx, ecx, edx);
  if (eax >= 0x80000001)
    __cpuid (0x80000001,
             cpu_features->cpuid[COMMON_CPUID_INDEX_80000001].eax,
             cpu_features->cpuid[COMMON_CPUID_INDEX_80000001].ebx,
             cpu_features->cpuid[COMMON_CPUID_INDEX_80000001].ecx,
             cpu_features->cpuid[COMMON_CPUID_INDEX_80000001].edx);
}

static void
get_common_indices (struct cpu_features *cpu_features,
                    unsigned int *family, unsigned int *model,
                    unsigned int *extended_model, unsigned int *stepping)
{
  if (family)
    {
      unsigned int eax;
      __cpuid (1, eax, cpu_features->cpuid[COMMON_CPUID_INDEX_1].ebx,
               cpu_features->cpuid[COMMON_CPUID_INDEX_1].ecx,
               cpu_features->cpuid[COMMON_CPUID_INDEX_1].edx);
      cpu_features->cpuid[COMMON_CPUID_INDEX_1].eax = eax;
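      /* CPUID leaf 1 EAX encodes the stepping in bits 3:0, the model in
         bits 7:4, the family in bits 11:8, the extended model in bits
         19:16 and the extended family in bits 27:20.  The extended model
         is extracted already shifted left by 4 so it can be added
         directly to the 4-bit model below and in init_cpu_features.  */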
      *family = (eax >> 8) & 0x0f;
      *model = (eax >> 4) & 0x0f;
      *extended_model = (eax >> 12) & 0xf0;
      *stepping = eax & 0x0f;
      if (*family == 0x0f)
        {
          *family += (eax >> 20) & 0xff;
          *model += *extended_model;
        }
    }

  if (cpu_features->max_cpuid >= 7)
    __cpuid_count (7, 0,
                   cpu_features->cpuid[COMMON_CPUID_INDEX_7].eax,
                   cpu_features->cpuid[COMMON_CPUID_INDEX_7].ebx,
                   cpu_features->cpuid[COMMON_CPUID_INDEX_7].ecx,
                   cpu_features->cpuid[COMMON_CPUID_INDEX_7].edx);

  /* Can we call xgetbv?  */
  if (CPU_FEATURES_CPU_P (cpu_features, OSXSAVE))
    {
      unsigned int xcrlow;
      unsigned int xcrhigh;
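      /* XGETBV with ECX == 0 reads XCR0, which reports the register
         state components the OS has enabled for XSAVE.  */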
      asm ("xgetbv" : "=a" (xcrlow), "=d" (xcrhigh) : "c" (0));
      /* Are the YMM and XMM states usable?  */
      if ((xcrlow & (bit_YMM_state | bit_XMM_state)) ==
          (bit_YMM_state | bit_XMM_state))
        {
          /* Determine if AVX is usable.  */
          if (CPU_FEATURES_CPU_P (cpu_features, AVX))
            {
              cpu_features->feature[index_arch_AVX_Usable]
                |= bit_arch_AVX_Usable;
              /* The following features depend on AVX being usable.  */
              /* Determine if AVX2 is usable.  */
              if (CPU_FEATURES_CPU_P (cpu_features, AVX2))
                cpu_features->feature[index_arch_AVX2_Usable]
                  |= bit_arch_AVX2_Usable;
              /* Determine if FMA is usable.  */
              if (CPU_FEATURES_CPU_P (cpu_features, FMA))
                cpu_features->feature[index_arch_FMA_Usable]
                  |= bit_arch_FMA_Usable;
            }

          /* Check if the OPMASK state, the upper 256 bits of ZMM0-ZMM15
             and the ZMM16-ZMM31 state are enabled.  */
          if ((xcrlow & (bit_Opmask_state | bit_ZMM0_15_state
                         | bit_ZMM16_31_state)) ==
              (bit_Opmask_state | bit_ZMM0_15_state | bit_ZMM16_31_state))
            {
              /* Determine if AVX512F is usable.  */
              if (CPU_FEATURES_CPU_P (cpu_features, AVX512F))
                {
                  cpu_features->feature[index_arch_AVX512F_Usable]
                    |= bit_arch_AVX512F_Usable;
                  /* Determine if AVX512DQ is usable.  */
                  if (CPU_FEATURES_CPU_P (cpu_features, AVX512DQ))
                    cpu_features->feature[index_arch_AVX512DQ_Usable]
                      |= bit_arch_AVX512DQ_Usable;
                }
            }
        }

      /* For _dl_runtime_resolve, set xsave_state_size to the xsave area
         size plus the integer register save size and align it to 64
         bytes.  */
      if (cpu_features->max_cpuid >= 0xd)
        {
          unsigned int eax, ebx, ecx, edx;

          __cpuid_count (0xd, 0, eax, ebx, ecx, edx);
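          /* CPUID leaf 0xd sub-leaf 0 reports in EBX the size in bytes
             of the XSAVE area needed for the state components currently
             enabled in XCR0.  */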
          if (ebx != 0)
            {
              unsigned int xsave_state_full_size
                = ALIGN_UP (ebx + STATE_SAVE_OFFSET, 64);

              cpu_features->xsave_state_size
                = xsave_state_full_size;
              cpu_features->xsave_state_full_size
                = xsave_state_full_size;

              __cpuid_count (0xd, 1, eax, ebx, ecx, edx);

              /* Check if XSAVEC is available.  */
              if ((eax & (1 << 1)) != 0)
                {
                  unsigned int xstate_comp_offsets[32];
                  unsigned int xstate_comp_sizes[32];
                  unsigned int i;

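                  /* Components 0 (x87) and 1 (SSE) occupy the fixed
                     512-byte legacy area; the 64-byte XSAVE header
                     follows it, so the compacted extended region starts
                     at byte offset 576.  The offsets and sizes below
                     describe that fixed layout.  */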
                  xstate_comp_offsets[0] = 0;
                  xstate_comp_offsets[1] = 160;
                  xstate_comp_offsets[2] = 576;
                  xstate_comp_sizes[0] = 160;
                  xstate_comp_sizes[1] = 256;

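                  /* For each component selected in STATE_SAVE_MASK,
                     CPUID leaf 0xd sub-leaf i reports the component size
                     in EAX and, in ECX bit 1, whether the component must
                     be 64-byte aligned in the compacted format.  */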
                  for (i = 2; i < 32; i++)
                    {
                      if ((STATE_SAVE_MASK & (1 << i)) != 0)
                        {
                          __cpuid_count (0xd, i, eax, ebx, ecx, edx);
                          xstate_comp_sizes[i] = eax;
                        }
                      else
                        {
                          ecx = 0;
                          xstate_comp_sizes[i] = 0;
                        }

                      if (i > 2)
                        {
                          xstate_comp_offsets[i]
                            = (xstate_comp_offsets[i - 1]
                               + xstate_comp_sizes[i - 1]);
                          if ((ecx & (1 << 1)) != 0)
                            xstate_comp_offsets[i]
                              = ALIGN_UP (xstate_comp_offsets[i], 64);
                        }
                    }

                  /* Use XSAVEC.  */
                  unsigned int size
                    = xstate_comp_offsets[31] + xstate_comp_sizes[31];
                  if (size)
                    {
                      cpu_features->xsave_state_size
                        = ALIGN_UP (size + STATE_SAVE_OFFSET, 64);
                      cpu_features->feature[index_arch_XSAVEC_Usable]
                        |= bit_arch_XSAVEC_Usable;
                    }
                }
            }
        }
    }
}

static inline void
init_cpu_features (struct cpu_features *cpu_features)
{
  unsigned int ebx, ecx, edx;
  unsigned int family = 0;
  unsigned int model = 0;
  enum cpu_features_kind kind;

#if !HAS_CPUID
  if (__get_cpuid_max (0, 0) == 0)
    {
      kind = arch_kind_other;
      goto no_cpuid;
    }
#endif

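  /* CPUID leaf 0 returns the highest supported basic leaf in EAX and
     the vendor identification string in EBX, EDX and ECX.  */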
  __cpuid (0, cpu_features->max_cpuid, ebx, ecx, edx);

  /* This spells out "GenuineIntel".  */
  if (ebx == 0x756e6547 && ecx == 0x6c65746e && edx == 0x49656e69)
    {
      unsigned int extended_model, stepping;

      kind = arch_kind_intel;

      get_common_indices (cpu_features, &family, &model, &extended_model,
                          &stepping);

      get_extended_indices (cpu_features);

      if (family == 0x06)
        {
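          /* On Intel the extended model bits are significant for family
             0x06 as well as for family 0x0f.  */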
          model += extended_model;
          switch (model)
            {
            case 0x1c:
            case 0x26:
              /* BSF is slow on Atom.  */
              cpu_features->feature[index_arch_Slow_BSF]
                |= bit_arch_Slow_BSF;
              break;

            case 0x57:
              /* Knights Landing.  Enable Silvermont optimizations.  */
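              /* Fall through.  */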

            case 0x5c:
            case 0x5f:
              /* Unaligned load versions are faster than SSSE3
                 on Goldmont.  */
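              /* Fall through.  */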

            case 0x4c:
              /* Airmont is a die shrink of Silvermont.  */
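              /* Fall through.  */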

            case 0x37:
            case 0x4a:
            case 0x4d:
            case 0x5a:
            case 0x5d:
              /* Unaligned load versions are faster than SSSE3
                 on Silvermont.  */
#if index_arch_Fast_Unaligned_Load != index_arch_Prefer_PMINUB_for_stringop
# error index_arch_Fast_Unaligned_Load != index_arch_Prefer_PMINUB_for_stringop
#endif
#if index_arch_Fast_Unaligned_Load != index_arch_Slow_SSE4_2
# error index_arch_Fast_Unaligned_Load != index_arch_Slow_SSE4_2
#endif
#if index_arch_Fast_Unaligned_Load != index_arch_Fast_Unaligned_Copy
# error index_arch_Fast_Unaligned_Load != index_arch_Fast_Unaligned_Copy
#endif
              cpu_features->feature[index_arch_Fast_Unaligned_Load]
                |= (bit_arch_Fast_Unaligned_Load
                    | bit_arch_Fast_Unaligned_Copy
                    | bit_arch_Prefer_PMINUB_for_stringop
                    | bit_arch_Slow_SSE4_2);
              break;

            default:
              /* Unknown family 0x06 processors.  Assume this is one of
                 the Core i3/i5/i7 processors if AVX is available.  */
              if (!CPU_FEATURES_CPU_P (cpu_features, AVX))
                break;
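              /* Fall through.  */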

            case 0x1a:
            case 0x1e:
            case 0x1f:
            case 0x25:
            case 0x2c:
            case 0x2e:
            case 0x2f:
              /* Rep string instructions, unaligned load, unaligned copy,
                 and pminub are fast on Intel Core i3, i5 and i7.  */
#if index_arch_Fast_Rep_String != index_arch_Fast_Unaligned_Load
# error index_arch_Fast_Rep_String != index_arch_Fast_Unaligned_Load
#endif
#if index_arch_Fast_Rep_String != index_arch_Prefer_PMINUB_for_stringop
# error index_arch_Fast_Rep_String != index_arch_Prefer_PMINUB_for_stringop
#endif
#if index_arch_Fast_Rep_String != index_arch_Fast_Unaligned_Copy
# error index_arch_Fast_Rep_String != index_arch_Fast_Unaligned_Copy
#endif
              cpu_features->feature[index_arch_Fast_Rep_String]
                |= (bit_arch_Fast_Rep_String
                    | bit_arch_Fast_Unaligned_Load
                    | bit_arch_Fast_Unaligned_Copy
                    | bit_arch_Prefer_PMINUB_for_stringop);
              break;
            }

          /* Disable TSX on some Haswell processors to avoid TSX on kernels
             that weren't updated with the latest microcode package (which
             disables the broken feature by default).  */
          switch (model)
            {
            case 0x3f:
              /* Xeon E7 v3 with stepping >= 4 has working TSX.  */
              if (stepping >= 4)
                break;
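              /* Fall through.  */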
            case 0x3c:
            case 0x45:
            case 0x46:
              /* Disable Intel TSX on Haswell processors (except Xeon E7 v3
                 with stepping >= 4) to avoid TSX on kernels that weren't
                 updated with the latest microcode package (which disables
                 the broken feature by default).  */
              cpu_features->cpuid[index_cpu_RTM].reg_RTM &= ~bit_cpu_RTM;
              break;
            }
        }

      /* Unaligned loads with 256-bit AVX registers are faster on
         Intel processors with AVX2.  */
      if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable))
        cpu_features->feature[index_arch_AVX_Fast_Unaligned_Load]
          |= bit_arch_AVX_Fast_Unaligned_Load;

      /* Since AVX512ER is unique to Xeon Phi, set Prefer_No_VZEROUPPER
         if AVX512ER is available.  Don't use AVX512 to avoid lower CPU
         frequency if AVX512ER isn't available.  */
      if (CPU_FEATURES_CPU_P (cpu_features, AVX512ER))
        cpu_features->feature[index_arch_Prefer_No_VZEROUPPER]
          |= bit_arch_Prefer_No_VZEROUPPER;
      else
        cpu_features->feature[index_arch_Prefer_No_AVX512]
          |= bit_arch_Prefer_No_AVX512;
    }
  /* This spells out "AuthenticAMD".  */
  else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
    {
      unsigned int extended_model, stepping;

      kind = arch_kind_amd;

      get_common_indices (cpu_features, &family, &model, &extended_model,
                          &stepping);

      get_extended_indices (cpu_features);

      ecx = cpu_features->cpuid[COMMON_CPUID_INDEX_1].ecx;

      if (HAS_ARCH_FEATURE (AVX_Usable))
        {
          /* Since the FMA4 bit is in COMMON_CPUID_INDEX_80000001 and
             FMA4 requires AVX, determine if FMA4 is usable here.  */
          if (CPU_FEATURES_CPU_P (cpu_features, FMA4))
            cpu_features->feature[index_arch_FMA4_Usable]
              |= bit_arch_FMA4_Usable;
        }

      if (family == 0x15)
        {
#if index_arch_Fast_Unaligned_Load != index_arch_Fast_Copy_Backward
# error index_arch_Fast_Unaligned_Load != index_arch_Fast_Copy_Backward
#endif
          /* "Excavator"   */
          if (model >= 0x60 && model <= 0x7f)
            cpu_features->feature[index_arch_Fast_Unaligned_Load]
              |= (bit_arch_Fast_Unaligned_Load
                  | bit_arch_Fast_Copy_Backward);
        }
    }
  else
    {
      kind = arch_kind_other;
      get_common_indices (cpu_features, NULL, NULL, NULL, NULL);
    }

  /* Support i586 if CX8 is available.  */
  if (CPU_FEATURES_CPU_P (cpu_features, CX8))
    cpu_features->feature[index_arch_I586] |= bit_arch_I586;

  /* Support i686 if CMOV is available.  */
  if (CPU_FEATURES_CPU_P (cpu_features, CMOV))
    cpu_features->feature[index_arch_I686] |= bit_arch_I686;

#if !HAS_CPUID
no_cpuid:
#endif

  cpu_features->family = family;
  cpu_features->model = model;
  cpu_features->kind = kind;

#if HAVE_TUNABLES
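  /* The glibc.tune.hwcaps tunable is applied through the set_hwcaps
     callback.  */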
  TUNABLE_GET (hwcaps, tunable_val_t *, TUNABLE_CALLBACK (set_hwcaps));
  cpu_features->non_temporal_threshold
    = TUNABLE_GET (x86_non_temporal_threshold, long int, NULL);
  cpu_features->data_cache_size
    = TUNABLE_GET (x86_data_cache_size, long int, NULL);
  cpu_features->shared_cache_size
    = TUNABLE_GET (x86_shared_cache_size, long int, NULL);
#endif

  /* Reuse dl_platform, dl_hwcap and dl_hwcap_mask for x86.  */
#if !HAVE_TUNABLES && defined SHARED
  /* When tunables are enabled, the glibc.tune.hwcap_mask tunable has
     already initialized dl_hwcap_mask, so this assignment is only
     needed when tunables are disabled.  */
  GLRO(dl_hwcap_mask) = HWCAP_IMPORTANT;
#endif

#ifdef __x86_64__
  GLRO(dl_hwcap) = HWCAP_X86_64;
  if (cpu_features->kind == arch_kind_intel)
    {
      const char *platform = NULL;

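      /* AVX512ER and AVX512PF are only implemented by Xeon Phi (Knights
         Landing); other AVX-512 processors with AVX512BW, AVX512DQ and
         AVX512VL get the HWCAP_X86_AVX512_1 hwcap instead.  */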
      if (CPU_FEATURES_ARCH_P (cpu_features, AVX512F_Usable)
          && CPU_FEATURES_CPU_P (cpu_features, AVX512CD))
        {
          if (CPU_FEATURES_CPU_P (cpu_features, AVX512ER))
            {
              if (CPU_FEATURES_CPU_P (cpu_features, AVX512PF))
                platform = "xeon_phi";
            }
          else
            {
              if (CPU_FEATURES_CPU_P (cpu_features, AVX512BW)
                  && CPU_FEATURES_CPU_P (cpu_features, AVX512DQ)
                  && CPU_FEATURES_CPU_P (cpu_features, AVX512VL))
                GLRO(dl_hwcap) |= HWCAP_X86_AVX512_1;
            }
        }

      if (platform == NULL
          && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
          && CPU_FEATURES_ARCH_P (cpu_features, FMA_Usable)
          && CPU_FEATURES_CPU_P (cpu_features, BMI1)
          && CPU_FEATURES_CPU_P (cpu_features, BMI2)
          && CPU_FEATURES_CPU_P (cpu_features, LZCNT)
          && CPU_FEATURES_CPU_P (cpu_features, MOVBE)
          && CPU_FEATURES_CPU_P (cpu_features, POPCNT))
        platform = "haswell";

      if (platform != NULL)
        GLRO(dl_platform) = platform;
    }
#else
  GLRO(dl_hwcap) = 0;
  if (CPU_FEATURES_CPU_P (cpu_features, SSE2))
    GLRO(dl_hwcap) |= HWCAP_X86_SSE2;

  if (CPU_FEATURES_ARCH_P (cpu_features, I686))
    GLRO(dl_platform) = "i686";
  else if (CPU_FEATURES_ARCH_P (cpu_features, I586))
    GLRO(dl_platform) = "i586";
#endif
}