/* x86_64 cache info.
   Copyright (C) 2003-2016 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */

#include <assert.h>
#include <stdbool.h>
#include <stdlib.h>
#include <unistd.h>
#include <cpuid.h>
#include <init-arch.h>

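/* Shorthands for the vendor kind and the maximum supported CPUID leaf
   recorded in the cpu_features block at startup.  */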
#define is_intel GLRO(dl_x86_cpu_features).kind == arch_kind_intel
#define is_amd GLRO(dl_x86_cpu_features).kind == arch_kind_amd
#define max_cpuid GLRO(dl_x86_cpu_features).max_cpuid

static const struct intel_02_cache_info
{
  unsigned char idx;
  unsigned char assoc;
  unsigned char linesize;
  unsigned char rel_name;
  unsigned int size;
} intel_02_known [] =
  {
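/* M(sc) stores an _SC_LEVEL*_CACHE_* constant as its offset from
   _SC_LEVEL1_ICACHE_SIZE so that rel_name fits in an unsigned char.  */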
#define M(sc) ((sc) - _SC_LEVEL1_ICACHE_SIZE)
    { 0x06,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),     8192 },
    { 0x08,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),    16384 },
    { 0x09,  4, 32, M(_SC_LEVEL1_ICACHE_SIZE),    32768 },
    { 0x0a,  2, 32, M(_SC_LEVEL1_DCACHE_SIZE),     8192 },
    { 0x0c,  4, 32, M(_SC_LEVEL1_DCACHE_SIZE),    16384 },
    { 0x0d,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),    16384 },
    { 0x0e,  6, 64, M(_SC_LEVEL1_DCACHE_SIZE),    24576 },
    { 0x21,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),    262144 },
    { 0x22,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),    524288 },
    { 0x23,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   1048576 },
    { 0x25,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   2097152 },
    { 0x29,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   4194304 },
    { 0x2c,  8, 64, M(_SC_LEVEL1_DCACHE_SIZE),    32768 },
    { 0x30,  8, 64, M(_SC_LEVEL1_ICACHE_SIZE),    32768 },
    { 0x39,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),    131072 },
    { 0x3a,  6, 64, M(_SC_LEVEL2_CACHE_SIZE),    196608 },
    { 0x3b,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),    131072 },
    { 0x3c,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),    262144 },
    { 0x3d,  6, 64, M(_SC_LEVEL2_CACHE_SIZE),    393216 },
    { 0x3e,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x3f,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),    262144 },
    { 0x41,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),    131072 },
    { 0x42,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),    262144 },
    { 0x43,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x44,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),   1048576 },
    { 0x45,  4, 32, M(_SC_LEVEL2_CACHE_SIZE),   2097152 },
    { 0x46,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),   4194304 },
    { 0x47,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   8388608 },
    { 0x48, 12, 64, M(_SC_LEVEL2_CACHE_SIZE),   3145728 },
    { 0x49, 16, 64, M(_SC_LEVEL2_CACHE_SIZE),   4194304 },
    { 0x4a, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),   6291456 },
    { 0x4b, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),   8388608 },
    { 0x4c, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),  12582912 },
    { 0x4d, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),  16777216 },
    { 0x4e, 24, 64, M(_SC_LEVEL2_CACHE_SIZE),   6291456 },
    { 0x60,  8, 64, M(_SC_LEVEL1_DCACHE_SIZE),    16384 },
    { 0x66,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),     8192 },
    { 0x67,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),    16384 },
    { 0x68,  4, 64, M(_SC_LEVEL1_DCACHE_SIZE),    32768 },
    { 0x78,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   1048576 },
    { 0x79,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),    131072 },
    { 0x7a,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),    262144 },
    { 0x7b,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x7c,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   1048576 },
    { 0x7d,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   2097152 },
    { 0x7f,  2, 64, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x80,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x82,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),    262144 },
    { 0x83,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x84,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),   1048576 },
    { 0x85,  8, 32, M(_SC_LEVEL2_CACHE_SIZE),   2097152 },
    { 0x86,  4, 64, M(_SC_LEVEL2_CACHE_SIZE),    524288 },
    { 0x87,  8, 64, M(_SC_LEVEL2_CACHE_SIZE),   1048576 },
    { 0xd0,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),    524288 },
    { 0xd1,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),   1048576 },
    { 0xd2,  4, 64, M(_SC_LEVEL3_CACHE_SIZE),   2097152 },
    { 0xd6,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   1048576 },
    { 0xd7,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   2097152 },
    { 0xd8,  8, 64, M(_SC_LEVEL3_CACHE_SIZE),   4194304 },
    { 0xdc, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),   2097152 },
    { 0xdd, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),   4194304 },
    { 0xde, 12, 64, M(_SC_LEVEL3_CACHE_SIZE),   8388608 },
    { 0xe2, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),   2097152 },
    { 0xe3, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),   4194304 },
    { 0xe4, 16, 64, M(_SC_LEVEL3_CACHE_SIZE),   8388608 },
    { 0xea, 24, 64, M(_SC_LEVEL3_CACHE_SIZE),  12582912 },
    { 0xeb, 24, 64, M(_SC_LEVEL3_CACHE_SIZE),  18874368 },
    { 0xec, 24, 64, M(_SC_LEVEL3_CACHE_SIZE),  25165824 },
  };

#define nintel_02_known (sizeof (intel_02_known) / sizeof (intel_02_known [0]))

static int
intel_02_known_compare (const void *p1, const void *p2)
{
  const struct intel_02_cache_info *i1;
  const struct intel_02_cache_info *i2;

  i1 = (const struct intel_02_cache_info *) p1;
  i2 = (const struct intel_02_cache_info *) p2;

  if (i1->idx == i2->idx)
    return 0;

  return i1->idx < i2->idx ? -1 : 1;
}


static long int
__attribute__ ((noinline))
intel_check_word (int name, unsigned int value, bool *has_level_2,
                  bool *no_level_2_or_3)
{
  if ((value & 0x80000000) != 0)
    /* The register value is reserved.  */
    return 0;

  /* Fold the name.  The _SC_ constants are always in the order SIZE,
     ASSOC, LINESIZE.  */
  int folded_rel_name = (M(name) / 3) * 3;

  while (value != 0)
    {
      unsigned int byte = value & 0xff;

      if (byte == 0x40)
        {
          *no_level_2_or_3 = true;

          if (folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
            /* No need to look further.  */
            break;
        }
      else if (byte == 0xff)
        {
          /* CPUID leaf 0x4 contains all the information.  We need to
             iterate over it.  */
          unsigned int eax;
          unsigned int ebx;
          unsigned int ecx;
          unsigned int edx;

          unsigned int round = 0;
          while (1)
            {
              __cpuid_count (4, round, eax, ebx, ecx, edx);

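              /* EAX[4:0] is the cache type and EAX[7:5] the cache
                 level described by this leaf 4 subleaf.  */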
              enum { null = 0, data = 1, inst = 2, uni = 3 } type = eax & 0x1f;
              if (type == null)
                /* That was the end.  */
                break;

              unsigned int level = (eax >> 5) & 0x7;

              if ((level == 1 && type == data
                   && folded_rel_name == M(_SC_LEVEL1_DCACHE_SIZE))
                  || (level == 1 && type == inst
                      && folded_rel_name == M(_SC_LEVEL1_ICACHE_SIZE))
                  || (level == 2 && folded_rel_name == M(_SC_LEVEL2_CACHE_SIZE))
                  || (level == 3 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
                  || (level == 4 && folded_rel_name == M(_SC_LEVEL4_CACHE_SIZE)))
                {
                  unsigned int offset = M(name) - folded_rel_name;

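                  /* EBX[31:22] is ways - 1, EBX[21:12] partitions - 1,
                     EBX[11:0] line size - 1, and ECX sets - 1; the
                     total size is the product of all four.  */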
                  if (offset == 0)
                    /* Cache size.  */
                    return (((ebx >> 22) + 1)
                            * (((ebx >> 12) & 0x3ff) + 1)
                            * ((ebx & 0xfff) + 1)
                            * (ecx + 1));
                  if (offset == 1)
                    return (ebx >> 22) + 1;

                  assert (offset == 2);
                  return (ebx & 0xfff) + 1;
                }

              ++round;
            }
          /* There is no cache information anywhere else.  */
          break;
        }
      else
        {
          if (byte == 0x49 && folded_rel_name == M(_SC_LEVEL3_CACHE_SIZE))
            {
              /* Intel reused this value.  For family 15, model 6 it
                 specifies the 3rd level cache.  Otherwise the 2nd
                 level cache.  */
              unsigned int family = GLRO(dl_x86_cpu_features).family;
              unsigned int model = GLRO(dl_x86_cpu_features).model;

              if (family == 15 && model == 6)
                {
                  /* The level 3 cache is encoded for this model like
                     the level 2 cache is for other models.  Pretend
                     the caller asked for the level 2 cache.  */
                  name = (_SC_LEVEL2_CACHE_SIZE
                          + (name - _SC_LEVEL3_CACHE_SIZE));
                  folded_rel_name = M(_SC_LEVEL2_CACHE_SIZE);
                }
            }

          struct intel_02_cache_info *found;
          struct intel_02_cache_info search;

          search.idx = byte;
          found = bsearch (&search, intel_02_known, nintel_02_known,
                           sizeof (intel_02_known[0]), intel_02_known_compare);
          if (found != NULL)
            {
              if (found->rel_name == folded_rel_name)
                {
                  unsigned int offset = M(name) - folded_rel_name;

                  if (offset == 0)
                    /* Cache size.  */
                    return found->size;
                  if (offset == 1)
                    return found->assoc;

                  assert (offset == 2);
                  return found->linesize;
                }

              if (found->rel_name == M(_SC_LEVEL2_CACHE_SIZE))
                *has_level_2 = true;
            }
        }

      /* Next byte for the next round.  */
      value >>= 8;
    }

  /* Nothing found.  */
  return 0;
}


static long int __attribute__ ((noinline))
handle_intel (int name, unsigned int maxidx)
{
  assert (maxidx >= 2);

  /* OK, we can use the CPUID instruction to get all info about the
     caches.  */
  unsigned int cnt = 0;
  unsigned int max = 1;
  long int result = 0;
  bool no_level_2_or_3 = false;
  bool has_level_2 = false;

  while (cnt++ < max)
    {
      unsigned int eax;
      unsigned int ebx;
      unsigned int ecx;
      unsigned int edx;
      __cpuid (2, eax, ebx, ecx, edx);

      /* The low byte of EAX in the first round contains the number of
         rounds we have to make.  At least one, the one we are already
         doing.  */
      if (cnt == 1)
        {
          max = eax & 0xff;
          eax &= 0xffffff00;
        }

      /* Process the individual registers' value.  */
      result = intel_check_word (name, eax, &has_level_2, &no_level_2_or_3);
      if (result != 0)
        return result;

      result = intel_check_word (name, ebx, &has_level_2, &no_level_2_or_3);
      if (result != 0)
        return result;

      result = intel_check_word (name, ecx, &has_level_2, &no_level_2_or_3);
      if (result != 0)
        return result;

      result = intel_check_word (name, edx, &has_level_2, &no_level_2_or_3);
      if (result != 0)
        return result;
    }

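  /* Descriptor 0x40 seen above means there is no L2 (or no L3 when an
     L2 exists), so report the cache as known to be absent rather than
     merely unknown.  */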
  if (name >= _SC_LEVEL2_CACHE_SIZE && name <= _SC_LEVEL3_CACHE_LINESIZE
      && no_level_2_or_3)
    return -1;

  return 0;
}


static long int __attribute__ ((noinline))
handle_amd (int name)
{
  unsigned int eax;
  unsigned int ebx;
  unsigned int ecx;
  unsigned int edx;
  __cpuid (0x80000000, eax, ebx, ecx, edx);

  /* No level 4 cache (yet).  */
  if (name > _SC_LEVEL3_CACHE_LINESIZE)
    return 0;

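  /* CPUID leaf 0x80000005 describes the L1 caches, leaf 0x80000006
     the L2 and L3 caches; EAX of leaf 0x80000000 holds the highest
     supported extended leaf.  */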
  unsigned int fn = 0x80000005 + (name >= _SC_LEVEL2_CACHE_SIZE);
  if (eax < fn)
    return 0;

  __cpuid (fn, eax, ebx, ecx, edx);

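  /* For the L1 instruction cache, EDX of leaf 0x80000005 has the same
     layout as ECX has for the L1 data cache, so remap the request and
     fall through to the data cache decoding.  */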
  if (name < _SC_LEVEL1_DCACHE_SIZE)
    {
      name += _SC_LEVEL1_DCACHE_SIZE - _SC_LEVEL1_ICACHE_SIZE;
      ecx = edx;
    }

  switch (name)
    {
    case _SC_LEVEL1_DCACHE_SIZE:
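      /* ECX[31:24] is the L1 data cache size in KB; the shift and mask
         convert it to bytes ((ecx >> 24) << 10).  */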
      return (ecx >> 14) & 0x3fc00;

    case _SC_LEVEL1_DCACHE_ASSOC:
      ecx >>= 16;
      if ((ecx & 0xff) == 0xff)
        /* Fully associative.  */
        return (ecx << 2) & 0x3fc00;
      return ecx & 0xff;

    case _SC_LEVEL1_DCACHE_LINESIZE:
      return ecx & 0xff;

    case _SC_LEVEL2_CACHE_SIZE:
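      /* ECX[15:12] is the L2 associativity field, 0 meaning the L2
         cache is disabled.  ECX[31:16] is the size in KB, converted to
         bytes by the shift and mask.  */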
      return (ecx & 0xf000) == 0 ? 0 : (ecx >> 6) & 0x3fffc00;

    case _SC_LEVEL2_CACHE_ASSOC:
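      /* Decode the 4-bit AMD associativity encoding; 15 means fully
         associative, i.e. as many ways as there are lines.  */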
      switch ((ecx >> 12) & 0xf)
        {
        case 0:
        case 1:
        case 2:
        case 4:
          return (ecx >> 12) & 0xf;
        case 6:
          return 8;
        case 8:
          return 16;
        case 10:
          return 32;
        case 11:
          return 48;
        case 12:
          return 64;
        case 13:
          return 96;
        case 14:
          return 128;
        case 15:
          return ((ecx >> 6) & 0x3fffc00) / (ecx & 0xff);
        default:
          return 0;
        }
      /* NOTREACHED */

    case _SC_LEVEL2_CACHE_LINESIZE:
      return (ecx & 0xf000) == 0 ? 0 : ecx & 0xff;

    case _SC_LEVEL3_CACHE_SIZE:
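      /* EDX[31:18] is the L3 cache size in 512 KB units; the mask and
         shift convert it to bytes.  EDX[15:12] is the associativity
         field, 0 meaning there is no L3 cache.  */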
      return (edx & 0xf000) == 0 ? 0 : (edx & 0x3ffc0000) << 1;

    case _SC_LEVEL3_CACHE_ASSOC:
      switch ((edx >> 12) & 0xf)
        {
        case 0:
        case 1:
        case 2:
        case 4:
          return (edx >> 12) & 0xf;
        case 6:
          return 8;
        case 8:
          return 16;
        case 10:
          return 32;
        case 11:
          return 48;
        case 12:
          return 64;
        case 13:
          return 96;
        case 14:
          return 128;
        case 15:
          return ((edx & 0x3ffc0000) << 1) / (edx & 0xff);
        default:
          return 0;
        }
      /* NOTREACHED */

    case _SC_LEVEL3_CACHE_LINESIZE:
      return (edx & 0xf000) == 0 ? 0 : edx & 0xff;

    default:
      assert (! "cannot happen");
    }
  return -1;
}


/* Get the value of the system variable NAME.  */
long int
attribute_hidden
__cache_sysconf (int name)
{
  if (is_intel)
    return handle_intel (name, max_cpuid);

  if (is_amd)
    return handle_amd (name);

  // XXX Fill in more vendors.

  /* CPU not known, we have no information.  */
  return 0;
}


/* Data cache size for use in memory and string routines, typically
   L1 size, rounded to a multiple of 256 bytes.  */
long int __x86_data_cache_size_half attribute_hidden = 32 * 1024 / 2;
long int __x86_data_cache_size attribute_hidden = 32 * 1024;
/* Similar to __x86_data_cache_size_half, but not rounded.  */
long int __x86_raw_data_cache_size_half attribute_hidden = 32 * 1024 / 2;
/* Similar to __x86_data_cache_size, but not rounded.  */
long int __x86_raw_data_cache_size attribute_hidden = 32 * 1024;
/* Shared cache size for use in memory and string routines, typically
   L2 or L3 size, rounded to a multiple of 256 bytes.  */
long int __x86_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
long int __x86_shared_cache_size attribute_hidden = 1024 * 1024;
/* Similar to __x86_shared_cache_size_half, but not rounded.  */
long int __x86_raw_shared_cache_size_half attribute_hidden = 1024 * 1024 / 2;
/* Similar to __x86_shared_cache_size, but not rounded.  */
long int __x86_raw_shared_cache_size attribute_hidden = 1024 * 1024;

/* Threshold above which non-temporal stores are used.  */
long int __x86_shared_non_temporal_threshold attribute_hidden;

#ifndef DISABLE_PREFETCHW
/* PREFETCHW support flag for use in memory and string routines.  */
int __x86_prefetchw attribute_hidden;
#endif


static void
__attribute__((constructor))
init_cacheinfo (void)
{
  /* Find out what brand of processor this is.  */
  unsigned int eax;
  unsigned int ebx;
  unsigned int ecx;
  unsigned int edx;
  int max_cpuid_ex;
  long int data = -1;
  long int shared = -1;
  unsigned int level;
  unsigned int threads = 0;

  if (is_intel)
    {
      data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, max_cpuid);

      long int core = handle_intel (_SC_LEVEL2_CACHE_SIZE, max_cpuid);
      bool inclusive_cache = true;

      /* Try L3 first.  */
      level = 3;
      shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, max_cpuid);

      /* Number of logical processors sharing L2 cache.  */
      int threads_l2;

      /* Number of logical processors sharing L3 cache.  */
      int threads_l3;

      if (shared <= 0)
        {
          /* Try L2 otherwise.  */
          level = 2;
          shared = core;
          threads_l2 = 0;
          threads_l3 = -1;
        }
      else
        {
          threads_l2 = 0;
          threads_l3 = 0;
        }

      /* A value of 0 for the HTT bit indicates there is only a single
         logical processor.  */
      if (HAS_CPU_FEATURE (HTT))
        {
          /* Figure out the number of logical threads that share the
             highest cache level.  */
          if (max_cpuid >= 4)
            {
              unsigned int family = GLRO(dl_x86_cpu_features).family;
              unsigned int model = GLRO(dl_x86_cpu_features).model;

              int i = 0;

              /* Query until cache level 2 and 3 are enumerated.  */
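              /* Bit 0 of CHECK requests the L2 data and bit 1 the L3
                 data; the L3 bit is set only if an L3 cache was found
                 above (threads_l3 == 0 rather than -1).  */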
              int check = 0x1 | (threads_l3 == 0) << 1;
              do
                {
                  __cpuid_count (4, i++, eax, ebx, ecx, edx);

                  /* There seems to be a bug in at least some Pentium Ds
                     which sometimes fail to iterate all cache parameters.
                     Do not loop indefinitely here, stop in this case and
                     assume there is no such information.  */
                  if ((eax & 0x1f) == 0)
                    goto intel_bug_no_cache_info;

                  switch ((eax >> 5) & 0x7)
                    {
                    default:
                      break;
                    case 2:
                      if ((check & 0x1))
                        {
                          /* Get maximum number of logical processors
                             sharing L2 cache.  */
                          threads_l2 = (eax >> 14) & 0x3ff;
                          check &= ~0x1;
                        }
                      break;
                    case 3:
                      if ((check & (0x1 << 1)))
                        {
                          /* Get maximum number of logical processors
                             sharing L3 cache.  */
                          threads_l3 = (eax >> 14) & 0x3ff;

                          /* Check if L2 and L3 caches are inclusive.  */
                          inclusive_cache = (edx & 0x2) != 0;
                          check &= ~(0x1 << 1);
                        }
                      break;
                    }
                }
              while (check);

              /* If max_cpuid >= 11, THREADS_L2/THREADS_L3 are the maximum
                 numbers of addressable IDs for logical processors sharing
                 the cache, instead of the maximum number of threads
                 sharing the cache.  */
              if (max_cpuid >= 11)
                {
                  /* Find the number of logical processors shipped in
                     one core and apply count mask.  */
                  i = 0;

                  /* Count SMT only if there is L3 cache.  Always count
                     core if there is no L3 cache.  */
                  int count = ((threads_l2 > 0 && level == 3)
                               | ((threads_l3 > 0
                                   || (threads_l2 > 0 && level == 2)) << 1));

                  while (count)
                    {
                      __cpuid_count (11, i++, eax, ebx, ecx, edx);

                      int shipped = ebx & 0xff;
                      int type = ecx & 0xff00;
                      if (shipped == 0 || type == 0)
                        break;
                      else if (type == 0x100)
                        {
                          /* Count SMT.  */
                          if ((count & 0x1))
                            {
                              int count_mask;

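                              /* THREADS_L2 currently holds a maximum
                                 addressable ID, so its highest set bit
                                 gives the width of the SMT ID field.  */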
                              /* Compute count mask.  */
                              asm ("bsr %1, %0"
                                   : "=r" (count_mask) : "g" (threads_l2));
                              count_mask = ~(-1 << (count_mask + 1));
                              threads_l2 = (shipped - 1) & count_mask;
                              count &= ~0x1;
                            }
                        }
                      else if (type == 0x200)
                        {
                          /* Count core.  */
                          if ((count & (0x1 << 1)))
                            {
                              int count_mask;
                              int threads_core
                                = (level == 2 ? threads_l2 : threads_l3);

                              /* Compute count mask.  */
                              asm ("bsr %1, %0"
                                   : "=r" (count_mask) : "g" (threads_core));
                              count_mask = ~(-1 << (count_mask + 1));
                              threads_core = (shipped - 1) & count_mask;
                              if (level == 2)
                                threads_l2 = threads_core;
                              else
                                threads_l3 = threads_core;
                              count &= ~(0x1 << 1);
                            }
                        }
                    }
                }
              if (threads_l2 > 0)
                threads_l2 += 1;
              if (threads_l3 > 0)
                threads_l3 += 1;
              if (level == 2)
                {
                  if (threads_l2)
                    {
                      threads = threads_l2;
                      if (threads > 2 && family == 6)
                        switch (model)
                          {
                          case 0x37:
                          case 0x4a:
                          case 0x4d:
                          case 0x5a:
                          case 0x5d:
                            /* Silvermont has L2 cache shared by 2 cores.  */
                            threads = 2;
                            break;
                          default:
                            break;
                          }
                    }
                }
              else if (threads_l3)
                threads = threads_l3;
            }
          else
            {
intel_bug_no_cache_info:
              /* Assume that all logical threads share the highest cache
                 level.  */

              threads
                = ((GLRO(dl_x86_cpu_features).cpuid[COMMON_CPUID_INDEX_1].ebx
                    >> 16) & 0xff);
            }

          /* Cap usage of highest cache level to the number of supported
             threads.  */
          if (shared > 0 && threads > 0)
            shared /= threads;
        }

      /* Account for non-inclusive L2 and L3 caches.  */
      if (!inclusive_cache)
        {
          if (threads_l2 > 0)
            core /= threads_l2;
          shared += core;
        }
    }
692 /* This spells out "AuthenticAMD". */
693 else if (is_amd)
694 {
695 data = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
696 long int core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
697 shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
698
699 /* Get maximum extended function. */
700 __cpuid (0x80000000, max_cpuid_ex, ebx, ecx, edx);
701
702 if (shared <= 0)
703 /* No shared L3 cache. All we have is the L2 cache. */
704 shared = core;
705 else
706 {
707 /* Figure out the number of logical threads that share L3. */
708 if (max_cpuid_ex >= 0x80000008)
709 {
710 /* Get width of APIC ID. */
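              /* ECX[15:12] of leaf 0x80000008 gives the APIC ID core
                 width; two to that power bounds the number of cores
                 per package.  */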
              __cpuid (0x80000008, max_cpuid_ex, ebx, ecx, edx);
              threads = 1 << ((ecx >> 12) & 0x0f);
            }

          if (threads == 0)
            {
              /* If APIC ID width is not available, use logical
                 processor count.  */
              __cpuid (0x00000001, max_cpuid_ex, ebx, ecx, edx);

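              /* EDX bit 28 is the HTT flag; when it is set, EBX[23:16]
                 holds the logical processor count.  */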
              if ((edx & (1 << 28)) != 0)
                threads = (ebx >> 16) & 0xff;
            }

          /* Cap usage of highest cache level to the number of
             supported threads.  */
          if (threads > 0)
            shared /= threads;

          /* Account for exclusive L2 and L3 caches.  */
          shared += core;
        }

#ifndef DISABLE_PREFETCHW
      if (max_cpuid_ex >= 0x80000001)
        {
          __cpuid (0x80000001, eax, ebx, ecx, edx);
          /* PREFETCHW || 3DNow!  */
          if ((ecx & 0x100) || (edx & 0x80000000))
            __x86_prefetchw = -1;
        }
#endif
    }

  if (data > 0)
    {
      __x86_raw_data_cache_size_half = data / 2;
      __x86_raw_data_cache_size = data;
      /* Round data cache size to a multiple of 256 bytes.  */
      data = data & ~255L;
      __x86_data_cache_size_half = data / 2;
      __x86_data_cache_size = data;
    }

  if (shared > 0)
    {
      __x86_raw_shared_cache_size_half = shared / 2;
      __x86_raw_shared_cache_size = shared;
      /* Round shared cache size to a multiple of 256 bytes.  */
      shared = shared & ~255L;
      __x86_shared_cache_size_half = shared / 2;
      __x86_shared_cache_size = shared;
    }

  /* The large memcpy micro benchmark in glibc shows that about six
     times the shared cache size is the threshold above which
     non-temporal stores become faster.  */
  __x86_shared_non_temporal_threshold = __x86_shared_cache_size * 6;
}