/*--------------------------------------------------------------------*/
/*--- x86- and AMD64-specific definitions.          cg-x86-amd64.c ---*/
/*--------------------------------------------------------------------*/

/*
   This file is part of Cachegrind, a Valgrind tool for cache
   profiling programs.

   Copyright (C) 2002-2010 Nicholas Nethercote
      njn@valgrind.org

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307, USA.

   The GNU General Public License is contained in the file COPYING.
*/

#if defined(VGA_x86) || defined(VGA_amd64)

#include "pub_tool_basics.h"
#include "pub_tool_cpuid.h"
#include "pub_tool_libcbase.h"
#include "pub_tool_libcassert.h"
#include "pub_tool_libcprint.h"

#include "cg_arch.h"

// All CPUID info taken from sandpile.org/ia32/cpuid.htm.
// Probably only works for Intel and AMD chips, and probably only for some of
// them.

static void micro_ops_warn(Int actual_size, Int used_size, Int line_size)
{
   VG_(dmsg)("warning: Pentium 4 with %d KB micro-op instruction trace cache\n",
             actual_size);
   VG_(dmsg)("         Simulating a %d KB I-cache with %d B lines\n",
             used_size, line_size);
}

/* The Intel method is truly wretched.  We have to do insane indexing into
 * an array of pre-defined configurations for various parts of the memory
 * hierarchy, as described in "Intel Processor Identification and the CPUID
 * Instruction" (Application Note 485).
 *
 * If an L3 cache is found, data for it rather than the L2 is returned
 * via *LLc.
 */
static
Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* LLc)
{
   Int cpuid1_eax;
   Int cpuid1_ignore;
   Int family;
   Int model;
   UChar info[16];
   Int   i, trials;
   Bool  L2_found = False;
   /* If we see L3 cache info, copy it into L3c.  Then, at the end,
      copy it into *LLc.  Hence if an L3 cache is specified, *LLc will
      eventually contain a description of it rather than the L2 cache.
      The use of the L3c intermediary makes this process independent
      of the order in which the cache specifications appear in
      info[]. */
   Bool    L3_found = False;
   cache_t L3c = { 0, 0, 0 };

   if (level < 2) {
      VG_(dmsg)("warning: CPUID level < 2 for Intel processor (%d)\n", level);
      return -1;
   }

   /* family/model are needed to disambiguate reused descriptor values
      (currently just 0x49) */
   VG_(cpuid)(1, &cpuid1_eax, &cpuid1_ignore,
              &cpuid1_ignore, &cpuid1_ignore);
   family = (((cpuid1_eax >> 20) & 0xff) << 4) + ((cpuid1_eax >> 8) & 0xf);
   model  = (((cpuid1_eax >> 16) & 0xf) << 4) + ((cpuid1_eax >> 4) & 0xf);
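   /* For reference, the leaf-1 EAX fields used above, per the standard
      CPUID layout: bits 7..4 base model, 11..8 base family, 19..16
      extended model, 27..20 extended family.  A purely illustrative
      example: a hypothetical Xeon MP reporting EAX = 0x00000F65 would
      give family = (0x00 << 4) + 0xF = 15 and model = (0x0 << 4) + 0x6
      = 6, which is exactly the pair tested for in the 0x49 case below. */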

   VG_(cpuid)(2, (Int*)&info[0], (Int*)&info[4],
              (Int*)&info[8], (Int*)&info[12]);
   trials  = info[0] - 1;   /* AL register - bits 0..7 of %eax */
   info[0] = 0x0;           /* reset AL */

   if (0 != trials) {
      VG_(dmsg)("warning: non-zero CPUID trials for Intel processor (%d)\n",
                trials);
      return -1;
   }
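   /* Per the standard CPUID definition, leaf 2 packs one-byte cache/TLB
      descriptors into EAX..EDX -- the 16 bytes gathered in info[] above.
      The low byte of EAX (AL) is not a descriptor but the number of times
      leaf 2 must be queried for a complete description, which is why it
      is read into 'trials' and then zeroed; only the common single-trial
      case is handled here.  Note that the cache_t initialisers below are
      { size-in-KB, associativity, line-size-in-bytes }; sizes are
      converted from KB to bytes later, in get_caches_from_CPUID(). */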
   for (i = 0; i < 16; i++) {

      switch (info[i]) {

      case 0x0:       /* ignore zeros */
         break;

      /* TLB info, ignore */
      case 0x01: case 0x02: case 0x03: case 0x04: case 0x05:
      case 0x4f: case 0x50: case 0x51: case 0x52: case 0x55:
      case 0x56: case 0x57: case 0x59:
      case 0x5a: case 0x5b: case 0x5c: case 0x5d:
      case 0xb0: case 0xb1: case 0xb2:
      case 0xb3: case 0xb4: case 0xba: case 0xc0:
      case 0xca:
         break;

      case 0x06: *I1c = (cache_t) {  8, 4, 32 }; break;
      case 0x08: *I1c = (cache_t) { 16, 4, 32 }; break;
      case 0x09: *I1c = (cache_t) { 32, 4, 64 }; break;
      case 0x30: *I1c = (cache_t) { 32, 8, 64 }; break;

      case 0x0a: *D1c = (cache_t) {  8, 2, 32 }; break;
      case 0x0c: *D1c = (cache_t) { 16, 4, 32 }; break;
      case 0x0e: *D1c = (cache_t) { 24, 6, 64 }; break;
      case 0x2c: *D1c = (cache_t) { 32, 8, 64 }; break;

      /* IA-64 info -- panic! */
      case 0x10: case 0x15: case 0x1a:
      case 0x88: case 0x89: case 0x8a: case 0x8d:
      case 0x90: case 0x96: case 0x9b:
         VG_(tool_panic)("IA-64 cache detected?!");

      /* L3 cache info. */
      case 0x22: L3c = (cache_t) {   512,  4, 64 }; L3_found = True; break;
      case 0x23: L3c = (cache_t) {  1024,  8, 64 }; L3_found = True; break;
      case 0x25: L3c = (cache_t) {  2048,  8, 64 }; L3_found = True; break;
      case 0x29: L3c = (cache_t) {  4096,  8, 64 }; L3_found = True; break;
      case 0x46: L3c = (cache_t) {  4096,  4, 64 }; L3_found = True; break;
      case 0x47: L3c = (cache_t) {  8192,  8, 64 }; L3_found = True; break;
      case 0x4a: L3c = (cache_t) {  6144, 12, 64 }; L3_found = True; break;
      case 0x4b: L3c = (cache_t) {  8192, 16, 64 }; L3_found = True; break;
      case 0x4c: L3c = (cache_t) { 12288, 12, 64 }; L3_found = True; break;
      case 0x4d: L3c = (cache_t) { 16384, 16, 64 }; L3_found = True; break;
      case 0xd0: L3c = (cache_t) {   512,  4, 64 }; L3_found = True; break;
      case 0xd1: L3c = (cache_t) {  1024,  4, 64 }; L3_found = True; break;
      case 0xd2: L3c = (cache_t) {  2048,  4, 64 }; L3_found = True; break;
      case 0xd6: L3c = (cache_t) {  1024,  8, 64 }; L3_found = True; break;
      case 0xd7: L3c = (cache_t) {  2048,  8, 64 }; L3_found = True; break;
      case 0xd8: L3c = (cache_t) {  4096,  8, 64 }; L3_found = True; break;
      case 0xdc: L3c = (cache_t) {  1536, 12, 64 }; L3_found = True; break;
      case 0xdd: L3c = (cache_t) {  3072, 12, 64 }; L3_found = True; break;
      case 0xde: L3c = (cache_t) {  6144, 12, 64 }; L3_found = True; break;
      case 0xe2: L3c = (cache_t) {  2048, 16, 64 }; L3_found = True; break;
      case 0xe3: L3c = (cache_t) {  4096, 16, 64 }; L3_found = True; break;
      case 0xe4: L3c = (cache_t) {  8192, 16, 64 }; L3_found = True; break;
      case 0xea: L3c = (cache_t) { 12288, 24, 64 }; L3_found = True; break;
      case 0xeb: L3c = (cache_t) { 18432, 24, 64 }; L3_found = True; break;
      case 0xec: L3c = (cache_t) { 24576, 24, 64 }; L3_found = True; break;

      /* Described as "MLC" in Intel documentation */
      case 0x21: *LLc = (cache_t) { 256, 8, 64 }; L2_found = True; break;

      /* These are sectored, whatever that means */
      case 0x39: *LLc = (cache_t) { 128, 4, 64 }; L2_found = True; break;
      case 0x3c: *LLc = (cache_t) { 256, 4, 64 }; L2_found = True; break;

      /* If a P6 core, this means "no L2 cache".
         If a P4 core, this means "no L3 cache".
         We don't know what core it is, so don't issue a warning.  To
         detect a missing L2 cache, we use 'L2_found'. */
      case 0x40:
         break;

      case 0x41: *LLc = (cache_t) {  128,  4, 32 }; L2_found = True; break;
      case 0x42: *LLc = (cache_t) {  256,  4, 32 }; L2_found = True; break;
      case 0x43: *LLc = (cache_t) {  512,  4, 32 }; L2_found = True; break;
      case 0x44: *LLc = (cache_t) { 1024,  4, 32 }; L2_found = True; break;
      case 0x45: *LLc = (cache_t) { 2048,  4, 32 }; L2_found = True; break;
      case 0x48: *LLc = (cache_t) { 3072, 12, 64 }; L2_found = True; break;
      case 0x4e: *LLc = (cache_t) { 6144, 24, 64 }; L2_found = True; break;
      case 0x49:
         if (family == 15 && model == 6) {
            /* On Xeon MP (family F, model 6), this is for L3 */
            L3c = (cache_t) { 4096, 16, 64 }; L3_found = True;
         } else {
            *LLc = (cache_t) { 4096, 16, 64 }; L2_found = True;
         }
         break;

      /* These are sectored, whatever that means */
      case 0x60: *D1c = (cache_t) { 16, 8, 64 }; break;      /* sectored */
      case 0x66: *D1c = (cache_t) {  8, 4, 64 }; break;      /* sectored */
      case 0x67: *D1c = (cache_t) { 16, 4, 64 }; break;      /* sectored */
      case 0x68: *D1c = (cache_t) { 32, 4, 64 }; break;      /* sectored */

      /* HACK ALERT: Instruction trace cache -- capacity is micro-op based.
       * The conversion to a byte size is a total guess; we treat the 12K
       * and 16K cases the same, since the cache byte size must be a power
       * of two for everything to work.  The 32-byte line size is also a
       * guess.
       */
      case 0x70:    /* 12K micro-ops, 8-way */
         *I1c = (cache_t) { 16, 8, 32 };
         micro_ops_warn(12, 16, 32);
         break;
      case 0x71:    /* 16K micro-ops, 8-way */
         *I1c = (cache_t) { 16, 8, 32 };
         micro_ops_warn(16, 16, 32);
         break;
      case 0x72:    /* 32K micro-ops, 8-way */
         *I1c = (cache_t) { 32, 8, 32 };
         micro_ops_warn(32, 32, 32);
         break;

      /* Not sectored, whatever that might mean */
      case 0x78: *LLc = (cache_t) { 1024, 4, 64 }; L2_found = True; break;

      /* These are sectored, whatever that means */
      case 0x79: *LLc = (cache_t) {  128, 8,  64 }; L2_found = True; break;
      case 0x7a: *LLc = (cache_t) {  256, 8,  64 }; L2_found = True; break;
      case 0x7b: *LLc = (cache_t) {  512, 8,  64 }; L2_found = True; break;
      case 0x7c: *LLc = (cache_t) { 1024, 8,  64 }; L2_found = True; break;
      case 0x7d: *LLc = (cache_t) { 2048, 8,  64 }; L2_found = True; break;
      case 0x7e: *LLc = (cache_t) {  256, 8, 128 }; L2_found = True; break;
      case 0x7f: *LLc = (cache_t) {  512, 2,  64 }; L2_found = True; break;
      case 0x80: *LLc = (cache_t) {  512, 8,  64 }; L2_found = True; break;
      case 0x81: *LLc = (cache_t) {  128, 8,  32 }; L2_found = True; break;
      case 0x82: *LLc = (cache_t) {  256, 8,  32 }; L2_found = True; break;
      case 0x83: *LLc = (cache_t) {  512, 8,  32 }; L2_found = True; break;
      case 0x84: *LLc = (cache_t) { 1024, 8,  32 }; L2_found = True; break;
      case 0x85: *LLc = (cache_t) { 2048, 8,  32 }; L2_found = True; break;
      case 0x86: *LLc = (cache_t) {  512, 4,  64 }; L2_found = True; break;
      case 0x87: *LLc = (cache_t) { 1024, 8,  64 }; L2_found = True; break;

      /* Ignore prefetch information */
      case 0xf0: case 0xf1:
         break;

      default:
         VG_(dmsg)("warning: Unknown Intel cache config value (0x%x), "
                   "ignoring\n", info[i]);
         break;
      }
   }

   /* If we found an L3 cache, throw away the L2 data and use the L3's
      instead. */
   if (L3_found) {
      VG_(dmsg)("warning: L3 cache found, using its data for the LL "
                "simulation.\n");
      *LLc = L3c;
      L2_found = True;
   }

   if (!L2_found)
      VG_(dmsg)("warning: L2 cache not installed, ignore LL results.\n");

   return 0;
}
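/* An illustrative (hypothetical) end-to-end decode: a CPU whose leaf-2
   descriptor bytes include 0x2c, 0x30 and 0x49 would come out of the loop
   above with D1 = 32KB/8-way/64B lines, I1 = 32KB/8-way/64B lines and, on
   anything other than a family-15/model-6 Xeon MP, LL = 4096KB/16-way/64B
   lines with L2_found set. */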
/* The AMD method is straightforward: just extract the appropriate bits
 * from the result registers.
 *
 * Bits, for D1 and I1:
 *  31..24  L1 cache size in KBs
 *  23..16  L1 cache associativity (FFh=full)
 *  15.. 8  L1 cache lines per tag
 *   7.. 0  L1 cache line size in bytes
 *
 * Bits, for L2:
 *  31..16  unified L2 cache size in KBs
 *  15..12  unified L2 cache associativity (0=off, Fh=full)
 *  11.. 8  unified L2 cache lines per tag
 *   7.. 0  unified L2 cache line size in bytes
 *
 * #3 The AMD K7 processor's L2 cache must be configured prior to relying
 *    upon this information.  (Whatever that means -- njn)
 *
 * Also, according to Cyrille Chepelov, Duron stepping A0 processors
 * (model 0x630) have a bug and misreport their L2 size as 1KB (it's
 * really 64KB), so we detect that.
 *
 * Returns 0 on success, non-zero on failure.  As with the Intel code
 * above, if an L3 cache is found, data for it rather than the L2 is
 * returned via *LLc.
 */

/* A small helper */
static Int decode_AMD_cache_L2_L3_assoc ( Int bits_15_12 )
{
   /* Decode an L2/L3 associativity indication.  It is encoded
      differently from the I1/D1 associativity.  Returns 1
      (direct-mapped) as a safe but suboptimal result for unknown
      encodings. */
   switch (bits_15_12 & 0xF) {
      case 1: return 1;    case 2: return 2;
      case 4: return 4;    case 6: return 8;
      case 8: return 16;   case 0xA: return 32;
      case 0xB: return 48; case 0xC: return 64;
      case 0xD: return 96; case 0xE: return 128;
      case 0xF: /* fully associative */
      case 0:   /* L2/L3 cache or TLB is disabled */
      default:
         return 1;
   }
}
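/* An illustrative (hypothetical) decode of the fields described above: a
   D1i value of 0x40020140 from leaf 0x80000005 would mean a 64KB, 2-way
   D1 cache with 64-byte lines (the lines-per-tag byte, here 0x01, is not
   used by this code), and bits 15..12 == 0x6 in the leaf 0x80000006
   result would decode to 8-way via decode_AMD_cache_L2_L3_assoc().  Note
   also that the L3 size field in that leaf counts 512KB units, which is
   why AMD_cache_info() below multiplies by 512 to stay in KBs. */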
static
Int AMD_cache_info(cache_t* I1c, cache_t* D1c, cache_t* LLc)
{
   UInt ext_level;
   UInt dummy, model;
   UInt I1i, D1i, L2i, L3i;

   VG_(cpuid)(0x80000000, &ext_level, &dummy, &dummy, &dummy);

   if (0 == (ext_level & 0x80000000) || ext_level < 0x80000006) {
      VG_(dmsg)("warning: ext_level < 0x80000006 for AMD processor (0x%x)\n",
                ext_level);
      return -1;
   }

   VG_(cpuid)(0x80000005, &dummy, &dummy, &D1i, &I1i);
   VG_(cpuid)(0x80000006, &dummy, &dummy, &L2i, &L3i);

   VG_(cpuid)(0x1, &model, &dummy, &dummy, &dummy);

   /* Check for the Duron bug */
   if (model == 0x630) {
      VG_(dmsg)("warning: Buggy Duron stepping A0. "
                "Assuming L2 size=65536 bytes\n");
      L2i = (64 << 16) | (L2i & 0xffff);
   }

   D1c->size      = (D1i >> 24) & 0xff;
   D1c->assoc     = (D1i >> 16) & 0xff;
   D1c->line_size = (D1i >>  0) & 0xff;

   I1c->size      = (I1i >> 24) & 0xff;
   I1c->assoc     = (I1i >> 16) & 0xff;
   I1c->line_size = (I1i >>  0) & 0xff;

   LLc->size      = (L2i >> 16) & 0xffff; /* Nb: different bits used for L2 */
   LLc->assoc     = decode_AMD_cache_L2_L3_assoc((L2i >> 12) & 0xf);
   LLc->line_size = (L2i >>  0) & 0xff;

   if (((L3i >> 18) & 0x3fff) > 0) {
      /* There's an L3 cache.  Replace *LLc contents with this info. */
      /* NB: the test in the 'if' is "L3 size > 0".  I don't know if this
         is the right way to test presence-vs-absence of an L3.  I can't
         see any guidance on this in the AMD documentation. */
      LLc->size      = ((L3i >> 18) & 0x3fff) * 512;
      LLc->assoc     = decode_AMD_cache_L2_L3_assoc((L3i >> 12) & 0xf);
      LLc->line_size = (L3i >>  0) & 0xff;
      VG_(dmsg)("warning: L3 cache found, using its data for the LL "
                "simulation.\n");
   }

   return 0;
}
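/* Note on the vendor string below: leaf 0 returns the 12-character vendor
   ID split across EBX, EDX and ECX, in that order ("Genu", "ineI", "ntel"
   for Intel).  That is why the cpuid output pointers are &vendor_id[0],
   &vendor_id[8] and &vendor_id[4] rather than being in register order. */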
static
Int get_caches_from_CPUID(cache_t* I1c, cache_t* D1c, cache_t* LLc)
{
   Int  level, ret;
   Char vendor_id[13];

   if (!VG_(has_cpuid)()) {
      VG_(dmsg)("CPUID instruction not supported\n");
      return -1;
   }

   VG_(cpuid)(0, &level, (Int*)&vendor_id[0],
              (Int*)&vendor_id[8], (Int*)&vendor_id[4]);
   vendor_id[12] = '\0';

   if (0 == level) {
      VG_(dmsg)("CPUID level is 0, early Pentium?\n");
      return -1;
   }

   /* Only handling Intel and AMD chips... no Cyrix, Transmeta, etc */
   if (0 == VG_(strcmp)(vendor_id, "GenuineIntel")) {
      ret = Intel_cache_info(level, I1c, D1c, LLc);

   } else if (0 == VG_(strcmp)(vendor_id, "AuthenticAMD")) {
      ret = AMD_cache_info(I1c, D1c, LLc);

   } else if (0 == VG_(strcmp)(vendor_id, "CentaurHauls")) {
      /* Total kludge.  Pretend to be a VIA Nehemiah. */
      D1c->size      = 64;
      D1c->assoc     = 16;
      D1c->line_size = 16;
      I1c->size      = 64;
      I1c->assoc     = 4;
      I1c->line_size = 16;
      LLc->size      = 64;
      LLc->assoc     = 16;
      LLc->line_size = 16;
      ret = 0;

   } else {
      VG_(dmsg)("CPU vendor ID not recognised (%s)\n", vendor_id);
      return -1;
   }

   /* Success!  Convert sizes from KB to bytes. */
   I1c->size *= 1024;
   D1c->size *= 1024;
   LLc->size *= 1024;

   return ret;
}


void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* LLc,
                           Bool all_caches_clo_defined)
{
   Int res;

   // Set caches to default.
   *I1c = (cache_t) {  65536, 2, 64 };
   *D1c = (cache_t) {  65536, 2, 64 };
   *LLc = (cache_t) { 262144, 8, 64 };

   // Then replace with any info we can get from CPUID.
   res = get_caches_from_CPUID(I1c, D1c, LLc);

   // Warn if CPUID failed and the config wasn't completely specified on
   // the command line.
   if (res != 0 && !all_caches_clo_defined) {
      VG_(dmsg)("Warning: Couldn't auto-detect cache config, using one "
                "or more defaults\n");
   }
}

#endif // defined(VGA_x86) || defined(VGA_amd64)

/*--------------------------------------------------------------------*/
/*--- end                                                          ---*/
/*--------------------------------------------------------------------*/