Home | History | Annotate | Download | only in cachegrind
      1 /*--------------------------------------------------------------------*/
      2 /*--- Cachegrind: cache configuration.                   cg-arch.c ---*/
      3 /*--------------------------------------------------------------------*/
      4 
      5 /*
      6    This file is part of Cachegrind, a Valgrind tool for cache
      7    profiling programs.
      8 
      9    Copyright (C) 2011-2015 Nicholas Nethercote
     10       njn (at) valgrind.org
     11 
     12    This program is free software; you can redistribute it and/or
     13    modify it under the terms of the GNU General Public License as
     14    published by the Free Software Foundation; either version 2 of the
     15    License, or (at your option) any later version.
     16 
     17    This program is distributed in the hope that it will be useful, but
     18    WITHOUT ANY WARRANTY; without even the implied warranty of
     19    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     20    General Public License for more details.
     21 
     22    You should have received a copy of the GNU General Public License
     23    along with this program; if not, write to the Free Software
     24    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
     25    02111-1307, USA.
     26 
     27    The GNU General Public License is contained in the file COPYING.
     28 */
     29 
     30 #include "pub_tool_basics.h"
     31 #include "pub_tool_libcassert.h"
     32 #include "pub_tool_libcbase.h"
     33 #include "pub_tool_libcprint.h"
     34 #include "pub_tool_options.h"
     35 #include "pub_tool_machine.h"
     36 
     37 #include "cg_arch.h"
     38 
     39 static void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* LLc,
     40                              Bool all_caches_clo_defined);
     41 
     42 // Checks cache config is ok.  Returns NULL if ok, or a pointer to an error
     43 // string otherwise.
     44 static const HChar* check_cache(cache_t* cache)
     45 {
     46    // Simulator requires set count to be a power of two.
     47    if ((cache->size % (cache->line_size * cache->assoc) != 0) ||
     48        (-1 == VG_(log2)(cache->size/cache->line_size/cache->assoc)))
     49    {
     50       return "Cache set count is not a power of two.\n";
     51    }
     52 
     53    // Simulator requires line size to be a power of two.
     54    if (-1 == VG_(log2)(cache->line_size)) {
     55       return "Cache line size is not a power of two.\n";
     56    }
     57 
     58    // Then check line size >= 16 -- any smaller and a single instruction could
     59    // straddle three cache lines, which breaks a simulation assertion and is
     60    // stupid anyway.
     61    if (cache->line_size < MIN_LINE_SIZE) {
     62       return "Cache line size is too small.\n";
     63    }
     64 
     65    /* Then check cache size > line size (causes seg faults if not). */
     66    if (cache->size <= cache->line_size) {
     67       return "Cache size <= line size.\n";
     68    }
     69 
     70    /* Then check assoc <= (size / line size) (seg faults otherwise). */
     71    if (cache->assoc > (cache->size / cache->line_size)) {
     72       return "Cache associativity > (size / line size).\n";
     73    }
     74 
     75    return NULL;
     76 }
     77 
     78 
     79 static void parse_cache_opt ( cache_t* cache, const HChar* opt,
     80                               const HChar* optval )
     81 {
     82    Long i1, i2, i3;
     83    HChar* endptr;
     84    const HChar* checkRes;
     85 
     86    // Option argument looks like "65536,2,64".  Extract them.
     87    i1 = VG_(strtoll10)(optval,   &endptr); if (*endptr != ',')  goto bad;
     88    i2 = VG_(strtoll10)(endptr+1, &endptr); if (*endptr != ',')  goto bad;
     89    i3 = VG_(strtoll10)(endptr+1, &endptr); if (*endptr != '\0') goto bad;
     90 
     91    // Check for overflow.
     92    cache->size      = (Int)i1;
     93    cache->assoc     = (Int)i2;
     94    cache->line_size = (Int)i3;
     95    if (cache->size      != i1) goto overflow;
     96    if (cache->assoc     != i2) goto overflow;
     97    if (cache->line_size != i3) goto overflow;
     98 
     99    checkRes = check_cache(cache);
    100    if (checkRes) {
    101       VG_(fmsg)("%s", checkRes);
    102       goto bad;
    103    }
    104 
    105    return;
    106 
    107   bad:
    108    VG_(fmsg_bad_option)(opt, "Bad argument '%s'\n", optval);
    109 
    110   overflow:
    111    VG_(fmsg_bad_option)(opt,
    112       "One of the cache parameters was too large and overflowed.\n");
    113 }
    114 
    115 
    116 Bool VG_(str_clo_cache_opt)(const HChar *arg,
    117                             cache_t* clo_I1c,
    118                             cache_t* clo_D1c,
    119                             cache_t* clo_LLc)
    120 {
    121    const HChar* tmp_str;
    122 
    123    if      VG_STR_CLO(arg, "--I1", tmp_str) {
    124       parse_cache_opt(clo_I1c, arg, tmp_str);
    125       return True;
    126    } else if VG_STR_CLO(arg, "--D1", tmp_str) {
    127       parse_cache_opt(clo_D1c, arg, tmp_str);
    128       return True;
    129    } else if (VG_STR_CLO(arg, "--L2", tmp_str) || // for backwards compatibility
    130               VG_STR_CLO(arg, "--LL", tmp_str)) {
    131       parse_cache_opt(clo_LLc, arg, tmp_str);
    132       return True;
    133    } else
    134       return False;
    135 }
    136 
    137 static void umsg_cache_img(const HChar* desc, cache_t* c)
    138 {
    139    VG_(umsg)("  %s: %'d B, %d-way, %d B lines\n", desc,
    140              c->size, c->assoc, c->line_size);
    141 }
    142 
    143 // Verifies if c is a valid cache.
    144 // An invalid value causes an assert, unless clo_redefined is True.
    145 static void check_cache_or_override(const HChar* desc, cache_t* c, Bool clo_redefined)
    146 {
    147    const HChar* checkRes;
    148 
    149    checkRes = check_cache(c);
    150    if (checkRes) {
    151       VG_(umsg)("Auto-detected %s cache configuration not supported: %s",
    152                 desc, checkRes);
    153       umsg_cache_img(desc, c);
    154       if (!clo_redefined) {
    155          VG_(umsg)("As it probably should be supported, please report a bug!\n");
    156          VG_(umsg)("Bypass this message by using option --%s=...\n", desc);
    157          tl_assert(0);
    158       }
    159    }
    160 }
    161 
    162 
    163 /* If the LL cache config isn't something the simulation functions
    164    can handle, try to adjust it so it is.  Caches are characterised
    165    by (total size T, line size L, associativity A), and then we
    166    have
    167 
    168      number of sets S = T / (L * A)
    169 
    170    The required constraints are:
    171 
    172    * L must be a power of 2, but it always is in practice, so
    173      no problem there
    174 
    175    * A can be any value >= 1
    176 
    177    * T can be any value, but ..
    178 
    179    * S must be a power of 2.
    180 
    181    That sometimes gives a problem.  For example, some Core iX based
    182    Intel CPUs have T = 12MB, A = 16, L = 64, which gives 12288
    183    sets.  Some AMD cpus have T = 5MB, A = 48, L = 64, which gives
    184    1706.667 sets (!).
    185 
    186    The "fix" is to force S down to the nearest power of two below its
    187    original value, and increase A proportionately, so as to keep the
    188    total cache size the same.  In fact to be safe we recalculate the
    189    cache size afterwards anyway, to guarantee that it divides exactly
    190    between the new number of sets.
    191 
    192    The "fix" is "justified" (cough, cough) by alleging that
    193    increases of associativity above about 4 have very little effect
    194    on the actual miss rate.  It would be far more inaccurate to
    195    fudge this by changing the size of the simulated cache --
    196    changing the associativity is a much better option.
    197 */
    198 
    199 /* (Helper function) Returns the largest power of 2 that is <= |x|.
    200    Even works when |x| == 0. */
    201 static UInt floor_power_of_2 ( UInt x )
    202 {
    203    x = x | (x >> 1);
    204    x = x | (x >> 2);
    205    x = x | (x >> 4);
    206    x = x | (x >> 8);
    207    x = x | (x >> 16);
    208    return x - (x >> 1);
    209 }
    210 
    211 static void
    212 maybe_tweak_LLc(cache_t *LLc)
    213 {
    214   if (LLc->size == 0 || LLc->assoc == 0 || LLc->line_size == 0)
    215      return;
    216 
    217   tl_assert(LLc->size > 0 && LLc->assoc > 0 && LLc->line_size > 0);
    218 
    219   UInt old_size      = (UInt)LLc->size;
    220   UInt old_assoc     = (UInt)LLc->assoc;
    221   UInt old_line_size = (UInt)LLc->line_size;
    222 
    223   UInt new_size      = old_size;
    224   UInt new_assoc     = old_assoc;
    225   UInt new_line_size = old_line_size;
    226 
    227   UInt old_nSets = old_size / (old_assoc * old_line_size);
    228   if (old_nSets == 0) {
    229      /* This surely can't happen; but would cause chaos with the maths
    230       * below if it did.  Just give up if it does. */
    231      return;
    232   }
    233 
    234   if (-1 != VG_(log2_64)(old_nSets)) {
    235      /* The number of sets is already a power of 2.  Make sure that
    236         the size divides exactly between the sets.  Almost all of the
    237         time this will have no effect. */
    238      new_size = old_line_size * old_assoc * old_nSets;
    239   } else {
    240      /* The number of sets isn't a power of two.  Calculate some
    241         scale-down factor which causes the number of sets to become a
    242         power of two.  Then, increase the associativity by that
    243         factor.  Finally, re-calculate the total size so as to make
    244         sure it divides exactly between the sets. */
    245      tl_assert(old_nSets >= 0);
    246      UInt new_nSets = floor_power_of_2 ( old_nSets );
    247      tl_assert(new_nSets > 0 && new_nSets < old_nSets);
    248      Double factor = (Double)old_nSets / (Double)new_nSets;
    249      tl_assert(factor >= 1.0);
    250 
    251      new_assoc = (UInt)(0.5 + factor * (Double)old_assoc);
    252      tl_assert(new_assoc >= old_assoc);
    253 
    254      new_size = old_line_size * new_assoc * new_nSets;
    255   }
    256 
    257   tl_assert(new_line_size == old_line_size); /* we never change this */
    258   if (new_size == old_size && new_assoc == old_assoc)
    259      return;
    260 
    261   VG_(dmsg)("warning: "
    262             "specified LL cache: line_size %u  assoc %u  total_size %'u\n",
    263             old_line_size, old_assoc, old_size);
    264   VG_(dmsg)("warning: "
    265             "simulated LL cache: line_size %u  assoc %u  total_size %'u\n",\
    266             new_line_size, new_assoc, new_size);
    267 
    268   LLc->size      = new_size;
    269   LLc->assoc     = new_assoc;
    270   LLc->line_size = new_line_size;
    271 }
    272 
    273 void VG_(post_clo_init_configure_caches)(cache_t* I1c,
    274                                          cache_t* D1c,
    275                                          cache_t* LLc,
    276                                          cache_t* clo_I1c,
    277                                          cache_t* clo_D1c,
    278                                          cache_t* clo_LLc)
    279 {
    280 #define DEFINED(L)   (-1 != L->size  || -1 != L->assoc || -1 != L->line_size)
    281 
    282    // Count how many were defined on the command line.
    283    Bool all_caches_clo_defined =
    284       (DEFINED(clo_I1c) &&
    285        DEFINED(clo_D1c) &&
    286        DEFINED(clo_LLc));
    287 
    288    // Set the cache config (using auto-detection, if supported by the
    289    // architecture).
    290    configure_caches( I1c, D1c, LLc, all_caches_clo_defined );
    291 
    292    maybe_tweak_LLc( LLc );
    293 
    294    // Check the default/auto-detected values.
    295    // Allow the user to override invalid auto-detected caches
    296    // with command line.
    297    check_cache_or_override ("I1", I1c, DEFINED(clo_I1c));
    298    check_cache_or_override ("D1", D1c, DEFINED(clo_D1c));
    299    check_cache_or_override ("LL", LLc, DEFINED(clo_LLc));
    300 
    301    // Then replace with any defined on the command line.  (Already checked in
    302    // VG(parse_clo_cache_opt)().)
    303    if (DEFINED(clo_I1c)) { *I1c = *clo_I1c; }
    304    if (DEFINED(clo_D1c)) { *D1c = *clo_D1c; }
    305    if (DEFINED(clo_LLc)) { *LLc = *clo_LLc; }
    306 
    307    if (VG_(clo_verbosity) >= 2) {
    308       VG_(umsg)("Cache configuration used:\n");
    309       umsg_cache_img ("I1", I1c);
    310       umsg_cache_img ("D1", D1c);
    311       umsg_cache_img ("LL", LLc);
    312    }
    313 #undef DEFINED
    314 }
    315 
    316 void VG_(print_cache_clo_opts)()
    317 {
    318    VG_(printf)(
    319 "    --I1=<size>,<assoc>,<line_size>  set I1 cache manually\n"
    320 "    --D1=<size>,<assoc>,<line_size>  set D1 cache manually\n"
    321 "    --LL=<size>,<assoc>,<line_size>  set LL cache manually\n"
    322                );
    323 }
    324 
    325 
    326 // Traverse the cache info and return a cache of the given kind and level.
    327 // Return NULL if no such cache exists.
    328 static const VexCache *
    329 locate_cache(const VexCacheInfo *ci, VexCacheKind kind, UInt level)
    330 {
    331    const VexCache *c;
    332 
    333    for (c = ci->caches; c != ci->caches + ci->num_caches; ++c) {
    334       if (c->level == level && c->kind == kind) {
    335          return c;
    336       }
    337    }
    338    return NULL;  // not found
    339 }
    340 
    341 
    342 // Gives the auto-detected configuration of I1, D1 and LL caches.  They get
    343 // overridden by any cache configurations specified on the command line.
    344 static void
    345 configure_caches(cache_t *I1c, cache_t *D1c, cache_t *LLc,
    346                  Bool all_caches_clo_defined)
    347 {
    348    VexArchInfo vai;
    349    const VexCacheInfo *ci;
    350    const VexCache *i1, *d1, *ll;
    351 
    352    VG_(machine_get_VexArchInfo)(NULL, &vai);
    353    ci = &vai.hwcache_info;
    354 
    355    // Extract what we need
    356    i1 = locate_cache(ci, INSN_CACHE, 1);
    357    d1 = locate_cache(ci, DATA_CACHE, 1);
    358    ll = locate_cache(ci, UNIFIED_CACHE, ci->num_levels);
    359 
    360    if (ci->num_caches > 0 && ll == NULL) {
    361       VG_(dmsg)("warning: L2 cache not installed, ignore LL results.\n");
    362    }
    363 
    364    if (ll && ci->num_levels > 2) {
    365       VG_(dmsg)("warning: L%u cache found, using its data for the "
    366                 "LL simulation.\n", ci->num_levels);
    367    }
    368 
    369    if (i1 && d1 && ll) {
    370       if (i1->is_trace_cache) {
    371          /* HACK ALERT: Instruction trace cache -- capacity is micro-ops based.
    372           * conversion to byte size is a total guess;  treat the 12K and 16K
    373           * cases the same since the cache byte size must be a power of two for
    374           * everything to work!.  Also guessing 32 bytes for the line size...
    375           */
    376          UInt adjusted_size, guessed_line_size = 32;
    377 
    378          if (i1->sizeB == 12 * 1024 || i1->sizeB == 16 * 1024) {
    379             adjusted_size = 16 * 1024;
    380          } else {
    381             adjusted_size = 32 * 1024;
    382          }
    383          VG_(dmsg)("warning: Pentium 4 with %u KB micro-op instruction trace cache\n",
    384                    i1->sizeB / 1024);
    385          VG_(dmsg)("         Simulating a %u KB I-cache with %u B lines\n",
    386                    adjusted_size / 1024, guessed_line_size);
    387 
    388          *I1c = (cache_t) { adjusted_size, i1->assoc, guessed_line_size };
    389       } else {
    390          *I1c = (cache_t) { i1->sizeB, i1->assoc, i1->line_sizeB };
    391       }
    392       *D1c = (cache_t) { d1->sizeB, d1->assoc, d1->line_sizeB };
    393       *LLc = (cache_t) { ll->sizeB, ll->assoc, ll->line_sizeB };
    394 
    395       return;
    396    }
    397 
    398    // Cache information could not be queried; choose some default
    399    // architecture specific default setting.
    400 
    401 #if defined(VGA_ppc32)
    402 
    403    // Default cache configuration
    404    *I1c = (cache_t) {  65536, 2, 64 };
    405    *D1c = (cache_t) {  65536, 2, 64 };
    406    *LLc = (cache_t) { 262144, 8, 64 };
    407 
    408 #elif defined(VGA_ppc64be) || defined(VGA_ppc64le)
    409 
    410    // Default cache configuration
    411    *I1c = (cache_t) {  65536, 2, 64 };
    412    *D1c = (cache_t) {  65536, 2, 64 };
    413    *LLc = (cache_t) { 262144, 8, 64 };
    414 
    415 #elif defined(VGA_arm)
    416 
    417    // Set caches to default (for Cortex-A8 ?)
    418    *I1c = (cache_t) {  16384, 4, 64 };
    419    *D1c = (cache_t) {  16384, 4, 64 };
    420    *LLc = (cache_t) { 262144, 8, 64 };
    421 
    422 #elif defined(VGA_arm64)
    423 
    424    // Copy the 32-bit ARM version until such time as we have
    425    // some real hardware to run on
    426    *I1c = (cache_t) {  16384, 4, 64 };
    427    *D1c = (cache_t) {  16384, 4, 64 };
    428    *LLc = (cache_t) { 262144, 8, 64 };
    429 
    430 #elif defined(VGA_s390x)
    431    //
    432    // Here is the cache data from older machine models:
    433    //
    434    //           I1            D1      I/D L2
    435    // z900  256k/256/4    256k/256/4   16MB
    436    // z800  256k/256/4    256k/256/4    8MB
    437    // z990  256k/256/4    256k/256/4   32MB
    438    // z890  256k/256/4    256k/256/4   32MB
    439    // z9    256k/256/4    256k/256/4   40MB
    440    //
    441    // Sources:
    442    // (1) IBM System z9 109 Technical Introduction
    443    //     www.redbooks.ibm.com/redbooks/pdfs/sg246669.pdf
    444    // (2) The microarchitecture of the IBM eServer z900 processor
    445    //     IBM Journal of Research and Development
    446    //     Volume 46, Number 4/5, pp 381-395, July/September 2002
    447    // (3) The IBM eServer z990 microprocessor
    448    //     IBM Journal of Research and Development
    449    //     Volume 48, Number 3/4, pp 295-309, May/July 2004
    450    // (4) Charles Webb, IBM
    451    //
    452    // L2 data is unfortunately incomplete. Otherwise, we could support
    453    // machines without the ECAG insn by looking at VEX_S390X_MODEL(hwcaps).
    454 
    455    // Default cache configuration is z10-EC  (Source: ECAG insn)
    456    *I1c = (cache_t) {    65536,  4, 256 };
    457    *D1c = (cache_t) {   131072,  8, 256 };
    458    *LLc = (cache_t) { 50331648, 24, 256 };
    459 
    460 #elif defined(VGA_mips32)
    461 
    462    // Set caches to default (for MIPS32-r2(mips 74kc))
    463    *I1c = (cache_t) {  32768, 4, 32 };
    464    *D1c = (cache_t) {  32768, 4, 32 };
    465    *LLc = (cache_t) { 524288, 8, 32 };
    466 
    467 #elif defined(VGA_mips64)
    468 
    469    // Set caches to default (for MIPS64 - 5kc)
    470    *I1c = (cache_t) {  32768, 4, 32 };
    471    *D1c = (cache_t) {  32768, 4, 32 };
    472    *LLc = (cache_t) { 524288, 8, 32 };
    473 
    474 #elif defined(VGA_x86) || defined(VGA_amd64)
    475 
    476    *I1c = (cache_t) {  65536, 2, 64 };
    477    *D1c = (cache_t) {  65536, 2, 64 };
    478    *LLc = (cache_t) { 262144, 8, 64 };
    479 
    480 #elif defined(VGA_tilegx)
    481 
    482    // Set caches to default for Tilegx.
    483    *I1c = (cache_t) { 0x8000,  2, 64 };
    484    *D1c = (cache_t) { 0x8000,  2, 64 };
    485    *LLc = (cache_t) { 0x40000, 8, 64 };
    486 
    487 #else
    488 
    489 #error "Unknown arch"
    490 
    491 #endif
    492 
    493    if (!all_caches_clo_defined) {
    494       const HChar warning[] =
    495         "Warning: Cannot auto-detect cache config, using defaults.\n"
    496         "         Run with -v to see.\n";
    497       VG_(dmsg)("%s", warning);
    498    }
    499 }
    500 
    501 /*--------------------------------------------------------------------*/
    502 /*--- end                                                          ---*/
    503 /*--------------------------------------------------------------------*/
    504