/* x86_64 cache info.
   Copyright (C) 2003-2017 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <assert.h>
#include <stdbool.h>
#include <stdlib.h>
#include <unistd.h>
#include <cpuid.h>
#include <init-arch.h>

#define is_intel GLRO(dl_x86_cpu_features).kind == arch_kind_intel
#define is_amd GLRO(dl_x86_cpu_features).kind == arch_kind_amd
#define max_cpuid GLRO(dl_x86_cpu_features).max_cpuid
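
/* One-byte cache descriptors reported by CPUID leaf 2, mapped to the
   cache parameters they encode (values taken from Intel's published
   descriptor tables).  The table must stay sorted by IDX since it is
   searched with bsearch below.  REL_NAME stores the _SC_* constant
   relative to _SC_LEVEL1_ICACHE_SIZE so that it fits in an unsigned
   char.  */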

static const struct intel_02_cache_info
{
  unsigned char idx;
  unsigned char assoc;
  unsigned char linesize;
  unsigned char rel_name;
  unsigned int size;
} intel_02_known [] =
  {
#define M(sc) ((sc) - _SC_LEVEL1_ICACHE_SIZE)
    { 0x06,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),     8192 },
    { 0x08,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),    16384 },
    { 0x09,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),    32768 },
    { 0x0a,  2, 32, M(_SC_LEVEL1_DCACHE_SIZE),     8192 },
    { 0x0c,  4, 32, M(_SC_LEVEL1_DCACHE_SIZE),    16384 },
    { 0x0d,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),    16384 },
    { 0x0e,  6, 64, M(_SC_LEVEL1_DCACHE_SIZE),    24576 },
    { 0x21,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),    262144 },
    { 0x22,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),    524288 },
    { 0x23,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   1048576 },
    { 0x25,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   2097152 },
    { 0x29,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   4194304 },
    { 0x2c,  8, 64, M(_SC_LEVEL1_DCACHE_SIZE),    32768 },
    { 0x30,  8, 64, M(_SC_LEVEL1_ICACHE_SIZE),    32768 },
    { 0x39,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),    131072 },
    { 0x3a,  6, 64, M(_SC_LEVEL2_CACHE_SIZE),    196608 },
    { 0x3b,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),    131072 },
    { 0x3c,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),    262144 },
    { 0x3d,  6, 64, M(_SC_LEVEL2_CACHE_SIZE),    393216 },
    { 0x3e,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x3f,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),    262144 },
    { 0x41,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),    131072 },
    { 0x42,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),    262144 },
    { 0x43,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x44,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),   1048576 },
    { 0x45,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),   2097152 },
    { 0x46,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),   4194304 },
    { 0x47,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   8388608 },
    { 0x48, 12, 64, M(_SC_LEVEL2_CACHE_SIZE),   3145728 },
    { 0x49, 16, 64, M(_SC_LEVEL2_CACHE_SIZE),   4194304 },
    { 0x4a, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),   6291456 },
    { 0x4b, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),   8388608 },
    { 0x4c, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  12582912 },
    { 0x4d, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  16777216 },
    { 0x4e, 24, 64, M(_SC_LEVEL2_CACHE_SIZE),   6291456 },
    { 0x60,  8, 64, M(_SC_LEVEL1_DCACHE_SIZE),    16384 },
    { 0x66,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),     8192 },
    { 0x67,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),    16384 },
    { 0x68,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),    32768 },
    { 0x78,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   1048576 },
    { 0x79,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),    131072 },
    { 0x7a,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),    262144 },
    { 0x7b,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x7c,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   1048576 },
    { 0x7d,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   2097152 },
    { 0x7f,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x80,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x82,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),    262144 },
    { 0x83,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x84,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),   1048576 },
    { 0x85,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),   2097152 },
    { 0x86,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x87,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   1048576 },
    { 0xd0,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),    524288 },
    { 0xd1,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),   1048576 },
    { 0xd2,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),   2097152 },
    { 0xd6,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   1048576 },
    { 0xd7,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   2097152 },
    { 0xd8,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   4194304 },
    { 0xdc, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),   2097152 },
    { 0xdd, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),   4194304 },
    { 0xde, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),   8388608 },
    { 0xe2, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),   2097152 },
    { 0xe3, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),   4194304 },
    { 0xe4, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),   8388608 },
    { 0xea, 24, 64, M(_SC_LEVEL3_CACHE_SIZE),  12582912 },
    { 0xeb, 24, 64, M(_SC_LEVEL3_CACHE_SIZE),  18874368 },
    { 0xec, 24, 64, M(_SC_LEVEL3_CACHE_SIZE),  25165824 },
  };

#define nintel_02_known (sizeof (intel_02_known) / sizeof (intel_02_known [0]))

static int
intel_02_known_compare (const void *p1, const void *p2)
{
  const struct intel_02_cache_info *i1;
  const struct intel_02_cache_info *i2;

  i1 = (const struct intel_02_cache_info *) p1;
  i2 = (const struct intel_02_cache_info *) p2;

  if (i1->idx == i2->idx)
    return 0;

  return i1->idx < i2->idx ? -1 : 1;
}


static long int
__attribute__ ((noinline))
intel_check_word (int name, unsigned int value, bool *has_level_2,
                  bool *no_level_2_or_3)
{
  if ((value & 0x80000000) != 0)
    /* The register value is reserved.  */
    return 0;

  /* Fold the name.  The _SC_ constants are always in the order SIZE,
     ASSOC, LINESIZE.  */
  int folded_rel_name = (M(name) / 3) * 3;
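  /* For example, given the ordering just described,
     M(_SC_LEVEL1_DCACHE_LINESIZE) is 5 and the division rounds it down
     to 3, i.e. M(_SC_LEVEL1_DCACHE_SIZE): FOLDED_REL_NAME always names
     the SIZE member of the triple being queried.  */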

  while (value != 0)
    {
      unsigned int byte = value & 0xff;

      if (byte == 0x40)
        {
          *no_level_2_or_3 = true;

          if (folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
            /* No need to look further.  */
            break;
        }
      else if (byte == 0xff)
        {
          /* CPUID leaf 0x4 contains all the information.  We need to
             iterate over it.  */
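          /* For each subleaf, EAX[4:0] is the cache type, EAX[7:5] the
             level, EBX[31:22] the number of ways minus 1, EBX[21:12]
             the physical line partitions minus 1, EBX[11:0] the line
             size minus 1, and ECX the number of sets minus 1; the size
             computation below multiplies those four fields (a summary
             of the CPUID documentation, kept here for reference).  */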
          unsigned int eax;
          unsigned int ebx;
          unsigned int ecx;
          unsigned int edx;

          unsigned int round = 0;
          while (1)
            {
              __cpuid_count (4, round, eax, ebx, ecx, edx);

              enum { null = 0, data = 1, inst = 2, uni = 3 } type = eax & 0x1f;
              if (type == null)
                /* That was the end.  */
                break;

              unsigned int level = (eax >> 5) & 0x7;

              if ((level == 1 && type == data
                   && folded_rel_name == M(_SC_LEVEL1_DCACHE_SIZE))
                  || (level == 1 && type == inst
                      && folded_rel_name == M(_SC_LEVEL1_ICACHE_SIZE))
                  || (level == 2 && folded_rel_name == M(_SC_LEVEL2_CACHE_SIZE))
                  || (level == 3 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
                  || (level == 4 && folded_rel_name == M(_SC_LEVEL4_CACHE_SIZE)))
                {
                  unsigned int offset = M(name) - folded_rel_name;

                  if (offset == 0)
                    /* Cache size.  */
                    return (((ebx >> 22) + 1)
                            * (((ebx >> 12) & 0x3ff) + 1)
                            * ((ebx & 0xfff) + 1)
                            * (ecx + 1));
                  if (offset == 1)
                    return (ebx >> 22) + 1;

                  assert (offset == 2);
                  return (ebx & 0xfff) + 1;
                }

              ++round;
            }
          /* There is no cache information anywhere else.  */
          break;
        }
      else
        {
          if (byte == 0x49 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
            {
              /* Intel reused this value.  For family 15, model 6 it
                 specifies the 3rd level cache.  Otherwise the 2nd
                 level cache.  */
              unsigned int family = GLRO(dl_x86_cpu_features).family;
              unsigned int model = GLRO(dl_x86_cpu_features).model;

              if (family == 15 && model == 6)
                {
                  /* The level 3 cache is encoded for this model like
                     the level 2 cache is for other models.  Pretend
                     the caller asked for the level 2 cache.  */
                  name = (_SC_LEVEL2_CACHE_SIZE
                          + (name - _SC_LEVEL3_CACHE_SIZE));
                  folded_rel_name = M(_SC_LEVEL2_CACHE_SIZE);
                }
            }

          struct intel_02_cache_info *found;
          struct intel_02_cache_info search;

          search.idx = byte;
          found = bsearch (&search, intel_02_known, nintel_02_known,
                           sizeof (intel_02_known[0]), intel_02_known_compare);
          if (found != NULL)
            {
              if (found->rel_name == folded_rel_name)
                {
                  unsigned int offset = M(name) - folded_rel_name;

                  if (offset == 0)
                    /* Cache size.  */
                    return found->size;
                  if (offset == 1)
                    return found->assoc;

                  assert (offset == 2);
                  return found->linesize;
                }

              if (found->rel_name == M(_SC_LEVEL2_CACHE_SIZE))
                *has_level_2 = true;
            }
        }

      /* Next byte for the next round.  */
      value >>= 8;
    }

  /* Nothing found.  */
  return 0;
}


static long int __attribute__ ((noinline))
handle_intel (int name, unsigned int maxidx)
{
  /* Return -1 for older CPUs.  */
  if (maxidx < 2)
    return -1;

  /* OK, we can use the CPUID instruction to get all info about the
     caches.  */
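  /* CPUID leaf 2 packs one-byte cache and TLB descriptors into EAX,
     EBX, ECX and EDX; a register whose bit 31 is set carries no valid
     descriptors, which intel_check_word tests before decoding.  */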
  unsigned int cnt = 0;
  unsigned int max = 1;
  long int result = 0;
  bool no_level_2_or_3 = false;
  bool has_level_2 = false;

  while (cnt++ < max)
    {
      unsigned int eax;
      unsigned int ebx;
      unsigned int ecx;
      unsigned int edx;
      __cpuid (2, eax, ebx, ecx, edx);

      /* The low byte of EAX in the first round contains the number of
         rounds we have to make.  At least one, the one we are already
         doing.  */
      if (cnt == 1)
        {
          max = eax & 0xff;
          eax &= 0xffffff00;
        }

      /* Process the individual registers' values.  */
      result = intel_check_word (name, eax, &has_level_2, &no_level_2_or_3);
      if (result != 0)
        return result;

      result = intel_check_word (name, ebx, &has_level_2, &no_level_2_or_3);
      if (result != 0)
        return result;

      result = intel_check_word (name, ecx, &has_level_2, &no_level_2_or_3);
      if (result != 0)
        return result;

      result = intel_check_word (name, edx, &has_level_2, &no_level_2_or_3);
      if (result != 0)
        return result;
    }

  if (name >= _SC_LEVEL2_CACHE_SIZE && name <= _SC_LEVEL3_CACHE_LINESIZE
      && no_level_2_or_3)
    return -1;

  return 0;
}


static long int __attribute__ ((noinline))
handle_amd (int name)
{
  unsigned int eax;
  unsigned int ebx;
  unsigned int ecx;
  unsigned int edx;
  __cpuid (0x80000000, eax, ebx, ecx, edx);

  /* No level 4 cache (yet).  */
  if (name > _SC_LEVEL3_CACHE_LINESIZE)
    return 0;

  unsigned int fn = 0x80000005 + (name >= _SC_LEVEL2_CACHE_SIZE);
  if (eax < fn)
    return 0;

  __cpuid (fn, eax, ebx, ecx, edx);

  if (name < _SC_LEVEL1_DCACHE_SIZE)
    {
      name += _SC_LEVEL1_DCACHE_SIZE - _SC_LEVEL1_ICACHE_SIZE;
      ecx = edx;
    }
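
  /* Per AMD's CPUID documentation: for leaf 0x80000005, ECX describes
     the L1 data cache and EDX the L1 instruction cache, with the size
     in KiB in bits 31:24, the associativity in bits 23:16 and the line
     size in bits 7:0.  For leaf 0x80000006, ECX bits 31:16 hold the L2
     size in KiB, bits 15:12 an encoded associativity and bits 7:0 the
     line size, while EDX bits 31:18 hold the L3 size in 512 KiB units.
     The masked shifts below extract the size field and convert it to
     bytes in one step, e.g. (ecx >> 14) & 0x3fc00 equals
     ((ecx >> 24) << 10).  The 4-bit associativity codes 6, 8 and 10
     through 14 stand for larger way counts, which the inner switches
     translate; 15 means fully associative.  */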

  switch (name)
    {
    case _SC_LEVEL1_DCACHE_SIZE:
      return (ecx >> 14) & 0x3fc00;

    case _SC_LEVEL1_DCACHE_ASSOC:
      ecx >>= 16;
      if ((ecx & 0xff) == 0xff)
        /* Fully associative.  */
        return (ecx << 2) & 0x3fc00;
      return ecx & 0xff;

    case _SC_LEVEL1_DCACHE_LINESIZE:
      return ecx & 0xff;

    case _SC_LEVEL2_CACHE_SIZE:
      return (ecx & 0xf000) == 0 ? 0 : (ecx >> 6) & 0x3fffc00;

    case _SC_LEVEL2_CACHE_ASSOC:
      switch ((ecx >> 12) & 0xf)
        {
        case 0:
        case 1:
        case 2:
        case 4:
          return (ecx >> 12) & 0xf;
        case 6:
          return 8;
        case 8:
          return 16;
        case 10:
          return 32;
        case 11:
          return 48;
        case 12:
          return 64;
        case 13:
          return 96;
        case 14:
          return 128;
        case 15:
          return ((ecx >> 6) & 0x3fffc00) / (ecx & 0xff);
        default:
          return 0;
        }
      /* NOTREACHED */

    case _SC_LEVEL2_CACHE_LINESIZE:
      return (ecx & 0xf000) == 0 ? 0 : ecx & 0xff;

    case _SC_LEVEL3_CACHE_SIZE:
      return (edx & 0xf000) == 0 ? 0 : (edx & 0x3ffc0000) << 1;

    case _SC_LEVEL3_CACHE_ASSOC:
      switch ((edx >> 12) & 0xf)
        {
        case 0:
        case 1:
        case 2:
        case 4:
          return (edx >> 12) & 0xf;
        case 6:
          return 8;
        case 8:
          return 16;
        case 10:
          return 32;
        case 11:
          return 48;
        case 12:
          return 64;
        case 13:
          return 96;
        case 14:
          return 128;
        case 15:
          return ((edx & 0x3ffc0000) << 1) / (edx & 0xff);
        default:
          return 0;
        }
      /* NOTREACHED */

    case _SC_LEVEL3_CACHE_LINESIZE:
      return (edx & 0xf000) == 0 ? 0 : edx & 0xff;

    default:
      assert (! "cannot happen");
    }
  return -1;
}


/* Get the value of the system variable NAME.  */
long int
attribute_hidden
__cache_sysconf (int name)
{
  if (is_intel)
    return handle_intel (name, max_cpuid);

  if (is_amd)
    return handle_amd (name);

  // XXX Fill in more vendors.

  /* CPU not known, we have no information.  */
  return 0;
}
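
/* This helper backs sysconf for the cache names, so callers reach it
   through the public interface, for example:

     long int l1d_size = sysconf (_SC_LEVEL1_DCACHE_SIZE);

   where 0 means no information is available and -1 marks older CPUs
   or absent cache levels (a sketch of the contract implemented
   above).  */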


/* Data cache size for use in memory and string routines, typically
   L1 size, rounded to a multiple of 256 bytes.  */
long int __x86_data_cache_size_half attribute_hidden = 32 * 1024 / 2;
long int __x86_data_cache_size attribute_hidden = 32 * 1024;
/* Similar to __x86_data_cache_size_half, but not rounded.  */
long int __x86_raw_data_cache_size_half attribute_hidden = 32 * 1024 / 2;
/* Similar to __x86_data_cache_size, but not rounded.  */
long int __x86_raw_data_cache_size attribute_hidden = 32 * 1024;
/* Shared cache size for use in memory and string routines, typically
   L2 or L3 size, rounded to a multiple of 256 bytes.  */
long int __x86_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
long int __x86_shared_cache_size attribute_hidden = 1024 * 1024;
/* Similar to __x86_shared_cache_size_half, but not rounded.  */
long int __x86_raw_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
/* Similar to __x86_shared_cache_size, but not rounded.  */
long int __x86_raw_shared_cache_size attribute_hidden = 1024 * 1024;
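/* The initializers above are conservative defaults (32 KiB data cache,
   1 MiB shared cache) that remain in effect only until the
   init_cacheinfo constructor below has run.  */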

/* Threshold to use non-temporal stores.  */
long int __x86_shared_non_temporal_threshold attribute_hidden;

#ifndef DISABLE_PREFETCHW
/* PREFETCHW support flag for use in memory and string routines.  */
int __x86_prefetchw attribute_hidden;
#endif


static void
__attribute__((constructor))
init_cacheinfo (void)
{
  /* Find out what brand of processor we have.  */
  unsigned int eax;
  unsigned int ebx;
  unsigned int ecx;
  unsigned int edx;
  int max_cpuid_ex;
  long int data = -1;
  long int shared = -1;
  unsigned int level;
  unsigned int threads = 0;

  if (is_intel)
    {
      data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, max_cpuid);

      long int core = handle_intel (_SC_LEVEL2_CACHE_SIZE, max_cpuid);
      bool inclusive_cache = true;

      /* Try L3 first.  */
      level = 3;
      shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, max_cpuid);

      /* Number of logical processors sharing L2 cache.  */
      int threads_l2;

      /* Number of logical processors sharing L3 cache.  */
      int threads_l3;

      if (shared <= 0)
        {
          /* Try L2 otherwise.  */
          level = 2;
          shared = core;
          threads_l2 = 0;
          threads_l3 = -1;
        }
      else
        {
          threads_l2 = 0;
          threads_l3 = 0;
        }

      /* A value of 0 for the HTT bit indicates there is only a single
         logical processor.  */
      if (HAS_CPU_FEATURE (HTT))
        {
          /* Figure out the number of logical threads that share the
             highest cache level.  */
          if (max_cpuid >= 4)
            {
              unsigned int family = GLRO(dl_x86_cpu_features).family;
              unsigned int model = GLRO(dl_x86_cpu_features).model;

              int i = 0;

              /* Query until cache level 2 and 3 are enumerated.  */
              int check = 0x1 | (threads_l3 == 0) << 1;
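              /* Bit 0 of CHECK is set while the L2 share count is
                 still needed and bit 1 while the L3 share count is;
                 the latter is requested only when an L3 cache was
                 found above.  */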
              do
                {
                  __cpuid_count (4, i++, eax, ebx, ecx, edx);

                  /* There seems to be a bug in at least some Pentium Ds
                     which sometimes fail to iterate all cache parameters.
                     Do not loop indefinitely here; stop in this case and
                     assume there is no such information.  */
                  if ((eax & 0x1f) == 0)
                    goto intel_bug_no_cache_info;

                  switch ((eax >> 5) & 0x7)
                    {
                    default:
                      break;
                    case 2:
                      if ((check & 0x1))
                        {
                          /* Get maximum number of logical processors
                             sharing L2 cache.  */
                          threads_l2 = (eax >> 14) & 0x3ff;
                          check &= ~0x1;
                        }
                      break;
                    case 3:
                      if ((check & (0x1 << 1)))
                        {
                          /* Get maximum number of logical processors
                             sharing L3 cache.  */
                          threads_l3 = (eax >> 14) & 0x3ff;

                          /* Check if L2 and L3 caches are inclusive.  */
                          inclusive_cache = (edx & 0x2) != 0;
                          check &= ~(0x1 << 1);
                        }
                      break;
                    }
                }
              while (check);

              /* If max_cpuid >= 11, THREADS_L2/THREADS_L3 are the maximum
                 numbers of addressable IDs for logical processors sharing
                 the cache, instead of the maximum number of threads
                 sharing the cache.  */
              if (max_cpuid >= 11)
                {
                  /* Find the number of logical processors shipped in
                     one core and apply count mask.  */
                  i = 0;

                  /* Count SMT only if there is L3 cache.  Always count
                     core if there is no L3 cache.  */
                  int count = ((threads_l2 > 0 && level == 3)
                               | ((threads_l3 > 0
                                   || (threads_l2 > 0 && level == 2)) << 1));
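
                  /* CPUID leaf 11 enumerates the processor topology:
                     for each sublevel, ECX bits 15:8 give the level
                     type (1 = SMT, 2 = core) and EBX the number of
                     logical processors at that level; only the low
                     byte of EBX is used here, which suffices in
                     practice.  */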

                  while (count)
                    {
                      __cpuid_count (11, i++, eax, ebx, ecx, edx);

                      int shipped = ebx & 0xff;
                      int type = ecx & 0xff00;
                      if (shipped == 0 || type == 0)
                        break;
                      else if (type == 0x100)
                        {
                          /* Count SMT.  */
                          if ((count & 0x1))
                            {
                              int count_mask;

                              /* Compute count mask.  */
                              asm ("bsr %1, %0"
                                   : "=r" (count_mask) : "g" (threads_l2));
                              count_mask = ~(-1 << (count_mask + 1));
                              threads_l2 = (shipped - 1) & count_mask;
                              count &= ~0x1;
                            }
                        }
                      else if (type == 0x200)
                        {
                          /* Count core.  */
                          if ((count & (0x1 << 1)))
                            {
                              int count_mask;
                              int threads_core
                                = (level == 2 ? threads_l2 : threads_l3);

                              /* Compute count mask.  */
                              asm ("bsr %1, %0"
                                   : "=r" (count_mask) : "g" (threads_core));
                              count_mask = ~(-1 << (count_mask + 1));
                              threads_core = (shipped - 1) & count_mask;
                              if (level == 2)
                                threads_l2 = threads_core;
                              else
                                threads_l3 = threads_core;
                              count &= ~(0x1 << 1);
                            }
                        }
                    }
                }
              if (threads_l2 > 0)
                threads_l2 += 1;
              if (threads_l3 > 0)
                threads_l3 += 1;
              if (level == 2)
                {
                  if (threads_l2)
                    {
                      threads = threads_l2;
                      if (threads > 2 && family == 6)
                        switch (model)
                          {
                          case 0x37:
                          case 0x4a:
                          case 0x4d:
                          case 0x5a:
                          case 0x5d:
                            /* Silvermont has L2 cache shared by 2 cores.  */
                            threads = 2;
                            break;
                          default:
                            break;
                          }
                    }
                }
              else if (threads_l3)
                threads = threads_l3;
            }
          else
            {
intel_bug_no_cache_info:
              /* Assume that all logical threads share the highest cache
                 level.  */

              threads
                = ((GLRO(dl_x86_cpu_features).cpuid[COMMON_CPUID_INDEX_1].ebx
                    >> 16) & 0xff);
            }

          /* Cap usage of highest cache level to the number of supported
             threads.  */
          if (shared > 0 && threads > 0)
            shared /= threads;
        }

      /* Account for non-inclusive L2 and L3 caches.  */
      if (!inclusive_cache)
        {
          if (threads_l2 > 0)
            core /= threads_l2;
          shared += core;
        }
    }
  /* arch_kind_amd is set for CPUs whose vendor string spells out
     "AuthenticAMD".  */
  else if (is_amd)
    {
      data = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
      long int core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
      shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);

      /* Get maximum extended function.  */
      __cpuid (0x80000000, max_cpuid_ex, ebx, ecx, edx);

      if (shared <= 0)
        /* No shared L3 cache.  All we have is the L2 cache.  */
        shared = core;
      else
        {
          /* Figure out the number of logical threads that share L3.  */
          if (max_cpuid_ex >= 0x80000008)
            {
              /* Get width of APIC ID.  */
              __cpuid (0x80000008, max_cpuid_ex, ebx, ecx, edx);
              threads = 1 << ((ecx >> 12) & 0x0f);
            }
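
          /* ECX bits 15:12 of leaf 0x80000008 give the number of APIC
             ID bits reserved for cores, so 1 << width bounds the core
             count, used here as the number of threads sharing L3 (a
             reading of AMD's CPUID documentation).  Note that when the
             field is zero, THREADS becomes 1 and the fallback below is
             not taken.  */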

          if (threads == 0)
            {
              /* If APIC ID width is not available, use logical
                 processor count.  */
              __cpuid (0x00000001, max_cpuid_ex, ebx, ecx, edx);

              if ((edx & (1 << 28)) != 0)
                threads = (ebx >> 16) & 0xff;
            }

          /* Cap usage of highest cache level to the number of
             supported threads.  */
          if (threads > 0)
            shared /= threads;

          /* Account for exclusive L2 and L3 caches.  */
          shared += core;
        }

#ifndef DISABLE_PREFETCHW
      if (max_cpuid_ex >= 0x80000001)
        {
          __cpuid (0x80000001, eax, ebx, ecx, edx);
          /* PREFETCHW || 3DNow!  */
          if ((ecx & 0x100) || (edx & 0x80000000))
            __x86_prefetchw = -1;
        }
#endif
    }

  if (data > 0)
    {
      __x86_raw_data_cache_size_half = data / 2;
      __x86_raw_data_cache_size = data;
      /* Round data cache size to a multiple of 256 bytes.  */
      data = data & ~255L;
      __x86_data_cache_size_half = data / 2;
      __x86_data_cache_size = data;
    }

  if (shared > 0)
    {
      __x86_raw_shared_cache_size_half = shared / 2;
      __x86_raw_shared_cache_size = shared;
      /* Round shared cache size to a multiple of 256 bytes.  */
      shared = shared & ~255L;
      __x86_shared_cache_size_half = shared / 2;
      __x86_shared_cache_size = shared;
    }

  /* The large memcpy micro benchmark in glibc shows that six times the
     shared cache size is the approximate value above which non-temporal
     stores become faster.  */
  __x86_shared_non_temporal_threshold = __x86_shared_cache_size * 6;
}