/* Home | History | Annotate | Download | only in cachegrind */
      1 
      2 /*--------------------------------------------------------------------*/
      3 /*--- x86- and AMD64-specific definitions.          cg-x86-amd64.c ---*/
      4 /*--------------------------------------------------------------------*/
      5 
      6 /*
      7    This file is part of Cachegrind, a Valgrind tool for cache
      8    profiling programs.
      9 
     10    Copyright (C) 2002-2010 Nicholas Nethercote
     11       njn (at) valgrind.org
     12 
     13    This program is free software; you can redistribute it and/or
     14    modify it under the terms of the GNU General Public License as
     15    published by the Free Software Foundation; either version 2 of the
     16    License, or (at your option) any later version.
     17 
     18    This program is distributed in the hope that it will be useful, but
     19    WITHOUT ANY WARRANTY; without even the implied warranty of
     20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     21    General Public License for more details.
     22 
     23    You should have received a copy of the GNU General Public License
     24    along with this program; if not, write to the Free Software
     25    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
     26    02111-1307, USA.
     27 
     28    The GNU General Public License is contained in the file COPYING.
     29 */
     30 
     31 #if defined(VGA_x86) || defined(VGA_amd64)
     32 
     33 #include "pub_tool_basics.h"
     34 #include "pub_tool_cpuid.h"
     35 #include "pub_tool_libcbase.h"
     36 #include "pub_tool_libcassert.h"
     37 #include "pub_tool_libcprint.h"
     38 
     39 #include "cg_arch.h"
     40 
// All CPUID info taken from sandpile.org/ia32/cpuid.htm
     42 // Probably only works for Intel and AMD chips, and probably only for some of
     43 // them.
     44 
     45 static void micro_ops_warn(Int actual_size, Int used_size, Int line_size)
     46 {
     47    VG_(dmsg)("warning: Pentium 4 with %d KB micro-op instruction trace cache\n",
     48              actual_size);
     49    VG_(dmsg)("         Simulating a %d KB I-cache with %d B lines\n",
     50              used_size, line_size);
     51 }
     52 
/* Intel method is truly wretched.  We have to do an insane indexing into an
 * array of pre-defined configurations for various parts of the memory
 * hierarchy: CPUID leaf 2 returns 16 descriptor bytes (in eax..edx), each
 * of which indexes a fixed table of cache/TLB configurations.
 * According to Intel Processor Identification, App Note 485.
 *
 * 'level' is the maximum basic CPUID leaf the CPU supports.  Cache sizes
 * are filled in as KB; the caller (get_caches_from_CPUID) converts them
 * to bytes.  Returns 0 on success, -1 if the info can't be decoded.
 *
 * If a L3 cache is found, then data for it rather than the L2
 * is returned via *LLc, so the simulated last-level cache matches the
 * machine's real last-level cache.
 */
static
Int Intel_cache_info(Int level, cache_t* I1c, cache_t* D1c, cache_t* LLc)
{
   Int cpuid1_eax;
   Int cpuid1_ignore;
   Int family;       /* (extended family << 4) + family, from leaf 1 eax */
   Int model;        /* (extended model << 4) + model, from leaf 1 eax */
   UChar info[16];   /* the 16 descriptor bytes from CPUID leaf 2 */
   Int   i, trials;
   Bool  L2_found = False;
   /* If we see L3 cache info, copy it into L3c.  Then, at the end,
      copy it into *LLc.  Hence if a L3 cache is specified, *LLc will
      eventually contain a description of it rather than the L2 cache.
      The use of the L3c intermediary makes this process independent
      of the order in which the cache specifications appear in
      info[]. */
   Bool  L3_found = False;
   cache_t L3c = { 0, 0, 0 };

   if (level < 2) {
      /* Leaf 2 is unavailable, so the descriptor scheme can't be used. */
      VG_(dmsg)("warning: CPUID level < 2 for Intel processor (%d)\n", level);
      return -1;
   }

   /* family/model needed to distinguish code reuse (currently 0x49) */
   VG_(cpuid)(1, &cpuid1_eax, &cpuid1_ignore,
              &cpuid1_ignore, &cpuid1_ignore);
   family = (((cpuid1_eax >> 20) & 0xff) << 4) + ((cpuid1_eax >> 8) & 0xf);
   model =  (((cpuid1_eax >> 16) & 0xf) << 4) + ((cpuid1_eax >> 4) & 0xf);

   VG_(cpuid)(2, (Int*)&info[0], (Int*)&info[4],
                 (Int*)&info[8], (Int*)&info[12]);
   trials  = info[0] - 1;   /* AL register - bits 0..7 of %eax */
   info[0] = 0x0;           /* reset AL */

   /* AL gives the number of times leaf 2 must be queried to collect all
      descriptors; only the common single-query case (AL == 1) is
      handled here. */
   if (0 != trials) {
      VG_(dmsg)("warning: non-zero CPUID trials for Intel processor (%d)\n",
                trials);
      return -1;
   }

   /* Decode each descriptor byte.  Cache geometries below are
      { size-in-KB, associativity, line-size-in-bytes }. */
   for (i = 0; i < 16; i++) {

      switch (info[i]) {

      case 0x0:       /* ignore zeros */
          break;

      /* TLB info, ignore */
      case 0x01: case 0x02: case 0x03: case 0x04: case 0x05:
      case 0x4f: case 0x50: case 0x51: case 0x52: case 0x55:
      case 0x56: case 0x57: case 0x59:
      case 0x5a: case 0x5b: case 0x5c: case 0x5d:
      case 0xb0: case 0xb1: case 0xb2:
      case 0xb3: case 0xb4: case 0xba: case 0xc0:
      case 0xca:
          break;

      /* L1 instruction cache descriptors. */
      case 0x06: *I1c = (cache_t) {  8, 4, 32 }; break;
      case 0x08: *I1c = (cache_t) { 16, 4, 32 }; break;
      case 0x09: *I1c = (cache_t) { 32, 4, 64 }; break;
      case 0x30: *I1c = (cache_t) { 32, 8, 64 }; break;

      /* L1 data cache descriptors. */
      case 0x0a: *D1c = (cache_t) {  8, 2, 32 }; break;
      case 0x0c: *D1c = (cache_t) { 16, 4, 32 }; break;
      case 0x0e: *D1c = (cache_t) { 24, 6, 64 }; break;
      case 0x2c: *D1c = (cache_t) { 32, 8, 64 }; break;

      /* IA-64 info -- panic! */
      case 0x10: case 0x15: case 0x1a:
      case 0x88: case 0x89: case 0x8a: case 0x8d:
      case 0x90: case 0x96: case 0x9b:
         VG_(tool_panic)("IA-64 cache detected?!");

      /* L3 cache info. */
      case 0x22: L3c = (cache_t) { 512,    4, 64 }; L3_found = True; break;
      case 0x23: L3c = (cache_t) { 1024,   8, 64 }; L3_found = True; break;
      case 0x25: L3c = (cache_t) { 2048,   8, 64 }; L3_found = True; break;
      case 0x29: L3c = (cache_t) { 4096,   8, 64 }; L3_found = True; break;
      case 0x46: L3c = (cache_t) { 4096,   4, 64 }; L3_found = True; break;
      case 0x47: L3c = (cache_t) { 8192,   8, 64 }; L3_found = True; break;
      case 0x4a: L3c = (cache_t) { 6144,  12, 64 }; L3_found = True; break;
      case 0x4b: L3c = (cache_t) { 8192,  16, 64 }; L3_found = True; break;
      case 0x4c: L3c = (cache_t) { 12288, 12, 64 }; L3_found = True; break;
      case 0x4d: L3c = (cache_t) { 16384, 16, 64 }; L3_found = True; break;
      case 0xd0: L3c = (cache_t) { 512,    4, 64 }; L3_found = True; break;
      case 0xd1: L3c = (cache_t) { 1024,   4, 64 }; L3_found = True; break;
      case 0xd2: L3c = (cache_t) { 2048,   4, 64 }; L3_found = True; break;
      case 0xd6: L3c = (cache_t) { 1024,   8, 64 }; L3_found = True; break;
      case 0xd7: L3c = (cache_t) { 2048,   8, 64 }; L3_found = True; break;
      case 0xd8: L3c = (cache_t) { 4096,   8, 64 }; L3_found = True; break;
      case 0xdc: L3c = (cache_t) { 1536,  12, 64 }; L3_found = True; break;
      case 0xdd: L3c = (cache_t) { 3072,  12, 64 }; L3_found = True; break;
      case 0xde: L3c = (cache_t) { 6144,  12, 64 }; L3_found = True; break;
      case 0xe2: L3c = (cache_t) { 2048,  16, 64 }; L3_found = True; break;
      case 0xe3: L3c = (cache_t) { 4096,  16, 64 }; L3_found = True; break;
      case 0xe4: L3c = (cache_t) { 8192,  16, 64 }; L3_found = True; break;
      case 0xea: L3c = (cache_t) { 12288, 24, 64 }; L3_found = True; break;
      case 0xeb: L3c = (cache_t) { 18432, 24, 64 }; L3_found = True; break;
      case 0xec: L3c = (cache_t) { 24576, 24, 64 }; L3_found = True; break;

      /* Described as "MLC" in Intel documentation */
      case 0x21: *LLc = (cache_t) {  256, 8, 64 }; L2_found = True; break;

      /* These are sectored, whatever that means */
      case 0x39: *LLc = (cache_t) {  128, 4, 64 }; L2_found = True; break;
      case 0x3c: *LLc = (cache_t) {  256, 4, 64 }; L2_found = True; break;

      /* If a P6 core, this means "no L2 cache".
         If a P4 core, this means "no L3 cache".
         We don't know what core it is, so don't issue a warning.  To detect
         a missing L2 cache, we use 'L2_found'. */
      case 0x40:
          break;

      case 0x41: *LLc = (cache_t) {  128,  4, 32 }; L2_found = True; break;
      case 0x42: *LLc = (cache_t) {  256,  4, 32 }; L2_found = True; break;
      case 0x43: *LLc = (cache_t) {  512,  4, 32 }; L2_found = True; break;
      case 0x44: *LLc = (cache_t) { 1024,  4, 32 }; L2_found = True; break;
      case 0x45: *LLc = (cache_t) { 2048,  4, 32 }; L2_found = True; break;
      case 0x48: *LLc = (cache_t) { 3072, 12, 64 }; L2_found = True; break;
      case 0x4e: *LLc = (cache_t) { 6144, 24, 64 }; L2_found = True; break;
      /* Descriptor 0x49 is reused by Intel: it means L3 on Xeon MP and
         L2 everywhere else, hence the family/model check. */
      case 0x49:
         if (family == 15 && model == 6) {
            /* On Xeon MP (family F, model 6), this is for L3 */
            L3c = (cache_t) { 4096, 16, 64 }; L3_found = True;
         } else {
            *LLc = (cache_t) { 4096, 16, 64 }; L2_found = True;
         }
         break;

      /* These are sectored, whatever that means */
      case 0x60: *D1c = (cache_t) { 16, 8, 64 };  break;      /* sectored */
      case 0x66: *D1c = (cache_t) {  8, 4, 64 };  break;      /* sectored */
      case 0x67: *D1c = (cache_t) { 16, 4, 64 };  break;      /* sectored */
      case 0x68: *D1c = (cache_t) { 32, 4, 64 };  break;      /* sectored */

      /* HACK ALERT: Instruction trace cache -- capacity is micro-ops based.
       * conversion to byte size is a total guess;  treat the 12K and 16K
       * cases the same since the cache byte size must be a power of two for
       * everything to work!.  Also guessing 32 bytes for the line size...
       */
      case 0x70:    /* 12K micro-ops, 8-way */
         *I1c = (cache_t) { 16, 8, 32 };
         micro_ops_warn(12, 16, 32);
         break;
      case 0x71:    /* 16K micro-ops, 8-way */
         *I1c = (cache_t) { 16, 8, 32 };
         micro_ops_warn(16, 16, 32);
         break;
      case 0x72:    /* 32K micro-ops, 8-way */
         *I1c = (cache_t) { 32, 8, 32 };
         micro_ops_warn(32, 32, 32);
         break;

      /* not sectored, whatever that might mean */
      case 0x78: *LLc = (cache_t) { 1024, 4,  64 }; L2_found = True;  break;

      /* These are sectored, whatever that means */
      case 0x79: *LLc = (cache_t) {  128, 8,  64 }; L2_found = True;  break;
      case 0x7a: *LLc = (cache_t) {  256, 8,  64 }; L2_found = True;  break;
      case 0x7b: *LLc = (cache_t) {  512, 8,  64 }; L2_found = True;  break;
      case 0x7c: *LLc = (cache_t) { 1024, 8,  64 }; L2_found = True;  break;
      case 0x7d: *LLc = (cache_t) { 2048, 8,  64 }; L2_found = True;  break;
      case 0x7e: *LLc = (cache_t) {  256, 8, 128 }; L2_found = True;  break;
      case 0x7f: *LLc = (cache_t) {  512, 2,  64 }; L2_found = True;  break;
      case 0x80: *LLc = (cache_t) {  512, 8,  64 }; L2_found = True;  break;
      case 0x81: *LLc = (cache_t) {  128, 8,  32 }; L2_found = True;  break;
      case 0x82: *LLc = (cache_t) {  256, 8,  32 }; L2_found = True;  break;
      case 0x83: *LLc = (cache_t) {  512, 8,  32 }; L2_found = True;  break;
      case 0x84: *LLc = (cache_t) { 1024, 8,  32 }; L2_found = True;  break;
      case 0x85: *LLc = (cache_t) { 2048, 8,  32 }; L2_found = True;  break;
      case 0x86: *LLc = (cache_t) {  512, 4,  64 }; L2_found = True;  break;
      case 0x87: *LLc = (cache_t) { 1024, 8,  64 }; L2_found = True;  break;

      /* Ignore prefetch information */
      case 0xf0: case 0xf1:
         break;

      default:
         VG_(dmsg)("warning: Unknown Intel cache config value (0x%x), ignoring\n",
                   info[i]);
         break;
      }
   }

   /* If we found a L3 cache, throw away the L2 data and use the L3's instead. */
   if (L3_found) {
      VG_(dmsg)("warning: L3 cache found, using its data for the LL simulation.\n");
      *LLc = L3c;
      L2_found = True;
   }

   if (!L2_found)
      VG_(dmsg)("warning: L2 cache not installed, ignore LL results.\n");

   return 0;
}
    259 
    260 /* AMD method is straightforward, just extract appropriate bits from the
    261  * result registers.
    262  *
    263  * Bits, for D1 and I1:
    264  *  31..24  data L1 cache size in KBs
    265  *  23..16  data L1 cache associativity (FFh=full)
    266  *  15.. 8  data L1 cache lines per tag
    267  *   7.. 0  data L1 cache line size in bytes
    268  *
    269  * Bits, for L2:
    270  *  31..16  unified L2 cache size in KBs
    271  *  15..12  unified L2 cache associativity (0=off, FFh=full)
    272  *  11.. 8  unified L2 cache lines per tag
    273  *   7.. 0  unified L2 cache line size in bytes
    274  *
    275  * #3  The AMD K7 processor's L2 cache must be configured prior to relying
    276  *     upon this information. (Whatever that means -- njn)
    277  *
    278  * Also, according to Cyrille Chepelov, Duron stepping A0 processors (model
    279  * 0x630) have a bug and misreport their L2 size as 1KB (it's really 64KB),
    280  * so we detect that.
    281  *
    282  * Returns 0 on success, non-zero on failure.  As with the Intel code
    283  * above, if a L3 cache is found, then data for it rather than the L2
    284  * is returned via *LLc.
    285  */
    286 
    287 /* A small helper */
    288 static Int decode_AMD_cache_L2_L3_assoc ( Int bits_15_12 )
    289 {
    290    /* Decode a L2/L3 associativity indication.  It is encoded
    291       differently from the I1/D1 associativity.  Returns 1
    292       (direct-map) as a safe but suboptimal result for unknown
    293       encodings. */
    294    switch (bits_15_12 & 0xF) {
    295       case 1: return 1;    case 2: return 2;
    296       case 4: return 4;    case 6: return 8;
    297       case 8: return 16;   case 0xA: return 32;
    298       case 0xB: return 48; case 0xC: return 64;
    299       case 0xD: return 96; case 0xE: return 128;
    300       case 0xF: /* fully associative */
    301       case 0: /* L2/L3 cache or TLB is disabled */
    302       default:
    303         return 1;
    304    }
    305 }
    306 
/* Decode AMD cache geometry from extended CPUID leaves 0x80000005 (L1
   caches) and 0x80000006 (L2/L3 caches).  Fills *I1c, *D1c and *LLc with
   sizes in KB; the caller converts to bytes.  If an L3 cache is present,
   *LLc describes it instead of the L2 (see the block comment above).
   Returns 0 on success, -1 if the extended leaves are unavailable. */
static
Int AMD_cache_info(cache_t* I1c, cache_t* D1c, cache_t* LLc)
{
   UInt ext_level;
   UInt dummy, model;
   UInt I1i, D1i, L2i, L3i;  /* raw CPUID words describing each cache */

   /* Leaf 0x80000000: eax = highest supported extended leaf.  Its top
      bit is set iff extended leaves exist at all. */
   VG_(cpuid)(0x80000000, &ext_level, &dummy, &dummy, &dummy);

   if (0 == (ext_level & 0x80000000) || ext_level < 0x80000006) {
      VG_(dmsg)("warning: ext_level < 0x80000006 for AMD processor (0x%x)\n",
                ext_level);
      return -1;
   }

   VG_(cpuid)(0x80000005, &dummy, &dummy, &D1i, &I1i);
   VG_(cpuid)(0x80000006, &dummy, &dummy, &L2i, &L3i);

   /* Leaf 1 eax holds the family/model/stepping signature. */
   VG_(cpuid)(0x1, &model, &dummy, &dummy, &dummy);

   /* Check for Duron bug: stepping A0 (signature 0x630) misreports its
      L2 size as 1KB, so force the real 64KB into bits 31..16. */
   if (model == 0x630) {
      VG_(dmsg)("warning: Buggy Duron stepping A0. Assuming L2 size=65536 bytes\n");
      L2i = (64 << 16) | (L2i & 0xffff);
   }

   /* L1 layout (both D and I): size KB in 31..24, assoc in 23..16,
      lines-per-tag in 15..8 (unused here), line size in 7..0. */
   D1c->size      = (D1i >> 24) & 0xff;
   D1c->assoc     = (D1i >> 16) & 0xff;
   D1c->line_size = (D1i >>  0) & 0xff;

   I1c->size      = (I1i >> 24) & 0xff;
   I1c->assoc     = (I1i >> 16) & 0xff;
   I1c->line_size = (I1i >>  0) & 0xff;

   LLc->size      = (L2i >> 16) & 0xffff; /* Nb: different bits used for L2 */
   LLc->assoc     = decode_AMD_cache_L2_L3_assoc((L2i >> 12) & 0xf);
   LLc->line_size = (L2i >>  0) & 0xff;

   /* L3 size is reported in bits 31..18 in units of 512KB. */
   if (((L3i >> 18) & 0x3fff) > 0) {
      /* There's an L3 cache.  Replace *LLc contents with this info. */
      /* NB: the test in the if is "if L3 size > 0 ".  I don't know if
         this is the right way to test presence-vs-absence of L3.  I
         can't see any guidance on this in the AMD documentation. */
      LLc->size      = ((L3i >> 18) & 0x3fff) * 512;
      LLc->assoc     = decode_AMD_cache_L2_L3_assoc((L3i >> 12) & 0xf);
      LLc->line_size = (L3i >>  0) & 0xff;
      VG_(dmsg)("warning: L3 cache found, using its data for the L2 simulation.\n");
   }

   return 0;
}
    358 
    359 static
    360 Int get_caches_from_CPUID(cache_t* I1c, cache_t* D1c, cache_t* LLc)
    361 {
    362    Int  level, ret;
    363    Char vendor_id[13];
    364 
    365    if (!VG_(has_cpuid)()) {
    366       VG_(dmsg)("CPUID instruction not supported\n");
    367       return -1;
    368    }
    369 
    370    VG_(cpuid)(0, &level, (int*)&vendor_id[0],
    371 	      (int*)&vendor_id[8], (int*)&vendor_id[4]);
    372    vendor_id[12] = '\0';
    373 
    374    if (0 == level) {
    375       VG_(dmsg)("CPUID level is 0, early Pentium?\n");
    376       return -1;
    377    }
    378 
    379    /* Only handling Intel and AMD chips... no Cyrix, Transmeta, etc */
    380    if (0 == VG_(strcmp)(vendor_id, "GenuineIntel")) {
    381       ret = Intel_cache_info(level, I1c, D1c, LLc);
    382 
    383    } else if (0 == VG_(strcmp)(vendor_id, "AuthenticAMD")) {
    384       ret = AMD_cache_info(I1c, D1c, LLc);
    385 
    386    } else if (0 == VG_(strcmp)(vendor_id, "CentaurHauls")) {
    387       /* Total kludge.  Pretend to be a VIA Nehemiah. */
    388       D1c->size      = 64;
    389       D1c->assoc     = 16;
    390       D1c->line_size = 16;
    391       I1c->size      = 64;
    392       I1c->assoc     = 4;
    393       I1c->line_size = 16;
    394       LLc->size      = 64;
    395       LLc->assoc     = 16;
    396       LLc->line_size = 16;
    397       ret = 0;
    398 
    399    } else {
    400       VG_(dmsg)("CPU vendor ID not recognised (%s)\n", vendor_id);
    401       return -1;
    402    }
    403 
    404    /* Successful!  Convert sizes from KB to bytes */
    405    I1c->size *= 1024;
    406    D1c->size *= 1024;
    407    LLc->size *= 1024;
    408 
    409    return ret;
    410 }
    411 
    412 
    413 void VG_(configure_caches)(cache_t* I1c, cache_t* D1c, cache_t* LLc,
    414                            Bool all_caches_clo_defined)
    415 {
    416    Int res;
    417 
    418    // Set caches to default.
    419    *I1c = (cache_t) {  65536, 2, 64 };
    420    *D1c = (cache_t) {  65536, 2, 64 };
    421    *LLc = (cache_t) { 262144, 8, 64 };
    422 
    423    // Then replace with any info we can get from CPUID.
    424    res = get_caches_from_CPUID(I1c, D1c, LLc);
    425 
    426    // Warn if CPUID failed and config not completely specified from cmd line.
    427    if (res != 0 && !all_caches_clo_defined) {
    428       VG_(dmsg)("Warning: Couldn't auto-detect cache config, using one "
    429                 "or more defaults \n");
    430    }
    431 }
    432 
    433 #endif // defined(VGA_x86) || defined(VGA_amd64)
    434 
    435 /*--------------------------------------------------------------------*/
    436 /*--- end                                                          ---*/
    437 /*--------------------------------------------------------------------*/
    438