      1 /*--------------------------------------------------------------------*/
      2 /*--- Cache simulation.                                            ---*/
      3 /*---                                                        sim.c ---*/
      4 /*--------------------------------------------------------------------*/
      5 
      6 /*
      7    This file is part of Callgrind, a Valgrind tool for call graph
      8    profiling programs.
      9 
     10    Copyright (C) 2003-2017, Josef Weidendorfer (Josef.Weidendorfer (at) gmx.de)
     11 
     12    This tool is derived from and contains code from Cachegrind
     13    Copyright (C) 2002-2017 Nicholas Nethercote (njn (at) valgrind.org)
     14 
     15    This program is free software; you can redistribute it and/or
     16    modify it under the terms of the GNU General Public License as
     17    published by the Free Software Foundation; either version 2 of the
     18    License, or (at your option) any later version.
     19 
     20    This program is distributed in the hope that it will be useful, but
     21    WITHOUT ANY WARRANTY; without even the implied warranty of
     22    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     23    General Public License for more details.
     24 
     25    You should have received a copy of the GNU General Public License
     26    along with this program; if not, write to the Free Software
     27    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
     28    02111-1307, USA.
     29 
     30    The GNU General Public License is contained in the file COPYING.
     31 */
     32 
     33 #include "global.h"
     34 
     35 
     36 /* Notes:
     37   - simulates a write-allocate cache
     38   - (block --> set) hash function uses simple bit selection
     39   - handling of references straddling two cache blocks:
     40       - counts as only one cache access (not two)
     41       - both blocks hit                  --> one hit
     42       - one block hits, the other misses --> one miss
     43       - both blocks miss                 --> one miss (not two)
     44 */
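         /* Worked example (illustrative values): with a 64-byte line size,
          * a 4-byte access at address 0x103e touches bytes 0x103e..0x1041
          * and thus straddles the lines starting at 0x1000 and 0x1040.
          * Both lines are looked up, but the access is counted as one
          * cache access and at most one miss (see cachesim_ref below).
          */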
     45 
     46 /* Cache configuration */
     47 #include "cg_arch.c"
     48 
     49 /* additional structures for cache use info, separated
      50  * according to usage frequency:
     51  * - line_loaded : pointer to cost center of instruction
     52  *                 which loaded the line into cache.
     53  *                 Needed to increment counters when line is evicted.
     54  * - line_use    : updated on every access
     55  */
     56 typedef struct {
     57   UInt count;
      58   UInt mask; /* e.g. for a 64-byte line size: 1 bit per 2 bytes */
     59 } line_use;
     60 
     61 typedef struct {
     62   Addr memline, iaddr;
      63   line_use* dep_use; /* points to the higher-level cache block for this memline */
     64   ULong* use_base;
     65 } line_loaded;
     66 
     67 /* Cache state */
     68 typedef struct {
     69    const HChar* name;
     70    int          size;                   /* bytes */
     71    int          assoc;
     72    int          line_size;              /* bytes */
      73    Bool         sectored;  /* prefetch near-side cache line on read */
     74    int          sets;
     75    int          sets_min_1;
     76    int          line_size_bits;
     77    int          tag_shift;
     78    UWord        tag_mask;
     79    HChar        desc_line[128];    // large enough
     80    UWord*       tags;
     81 
     82   /* for cache use */
     83    int          line_size_mask;
     84    int*         line_start_mask;
     85    int*         line_end_mask;
     86    line_loaded* loaded;
     87    line_use*    use;
     88 } cache_t2;
     89 
     90 /*
     91  * States of flat caches in our model.
      92  * We use a 2-level hierarchy: split I1/D1 caches and a unified LL.
     93  */
     94 static cache_t2 I1, D1, LL;
     95 
     96 /* Lower bits of cache tags are used as flags for a cache line */
     97 #define CACHELINE_FLAGMASK (MIN_LINE_SIZE-1)
     98 #define CACHELINE_DIRTY    1
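         /* Example: for MIN_LINE_SIZE = 16, CACHELINE_FLAGMASK is 0xf, so
          * the low four bits of a line-aligned tag are always zero and
          * bit 0 is free to hold the dirty flag. */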
     99 
    100 
    101 /* Cache simulator Options */
    102 static Bool clo_simulate_writeback = False;
    103 static Bool clo_simulate_hwpref = False;
    104 static Bool clo_simulate_sectors = False;
    105 static Bool clo_collect_cacheuse = False;
    106 
     107 /* The following global variables are set up beforehand by setup_bbcc():
    108  *
    109  * - Addr   CLG_(bb_base)     (instruction start address of original BB)
    110  * - ULong* CLG_(cost_base)   (start of cost array for BB)
    111  */
    112 
    113 Addr   CLG_(bb_base);
    114 ULong* CLG_(cost_base);
    115 
    116 static InstrInfo* current_ii;
    117 
    118 /* Cache use offsets */
    119 /* The offsets are only correct because all per-instruction event sets get
    120  * the "Use" set added first !
    121  */
    122 static Int off_I1_AcCost  = 0;
    123 static Int off_I1_SpLoss  = 1;
    124 static Int off_D1_AcCost  = 0;
    125 static Int off_D1_SpLoss  = 1;
    126 static Int off_LL_AcCost  = 2;
    127 static Int off_LL_SpLoss  = 3;
    128 
    129 /* Cache access types */
    130 typedef enum { Read = 0, Write = CACHELINE_DIRTY } RefType;
    131 
    132 /* Result of a reference into a flat cache */
    133 typedef enum { Hit  = 0, Miss, MissDirty } CacheResult;
    134 
    135 /* Result of a reference into a hierarchical cache model */
    136 typedef enum {
    137     L1_Hit,
    138     LL_Hit,
    139     MemAccess,
    140     WriteBackMemAccess } CacheModelResult;
    141 
    142 typedef CacheModelResult (*simcall_type)(Addr, UChar);
    143 
    144 static struct {
    145     simcall_type I1_Read;
    146     simcall_type D1_Read;
    147     simcall_type D1_Write;
    148 } simulator;
    149 
    150 /*------------------------------------------------------------*/
    151 /*--- Cache Simulator Initialization                       ---*/
    152 /*------------------------------------------------------------*/
    153 
    154 static void cachesim_clearcache(cache_t2* c)
    155 {
    156   Int i;
    157 
    158   for (i = 0; i < c->sets * c->assoc; i++)
    159     c->tags[i] = 0;
    160   if (c->use) {
    161     for (i = 0; i < c->sets * c->assoc; i++) {
    162       c->loaded[i].memline  = 0;
    163       c->loaded[i].use_base = 0;
    164       c->loaded[i].dep_use = 0;
    165       c->loaded[i].iaddr = 0;
    166       c->use[i].mask    = 0;
    167       c->use[i].count   = 0;
    168       c->tags[i] = i % c->assoc; /* init lower bits as pointer */
    169     }
    170   }
    171 }
    172 
    173 static void cacheuse_initcache(cache_t2* c);
    174 
     175 /* By this point, the size/assoc/line_size have been checked. */
    176 static void cachesim_initcache(cache_t config, cache_t2* c)
    177 {
    178    c->size      = config.size;
    179    c->assoc     = config.assoc;
    180    c->line_size = config.line_size;
    181    c->sectored  = False; // FIXME
    182 
    183    c->sets           = (c->size / c->line_size) / c->assoc;
    184    c->sets_min_1     = c->sets - 1;
    185    c->line_size_bits = VG_(log2)(c->line_size);
     186    c->tag_shift      = c->line_size_bits + VG_(log2)(c->sets);
    187    c->tag_mask       = ~((1u<<c->tag_shift)-1);
    188 
    189    /* Can bits in tag entries be used for flags?
     190     * Should always be true, as MIN_LINE_SIZE >= 16 */
    191    CLG_ASSERT( (c->tag_mask & CACHELINE_FLAGMASK) == 0);
    192 
    193    if (c->assoc == 1) {
    194       VG_(sprintf)(c->desc_line, "%d B, %d B, direct-mapped%s",
    195 		   c->size, c->line_size,
    196 		   c->sectored ? ", sectored":"");
    197    } else {
    198       VG_(sprintf)(c->desc_line, "%d B, %d B, %d-way associative%s",
    199 		   c->size, c->line_size, c->assoc,
    200 		   c->sectored ? ", sectored":"");
    201    }
    202 
    203    c->tags = (UWord*) CLG_MALLOC("cl.sim.cs_ic.1",
    204                                  sizeof(UWord) * c->sets * c->assoc);
    205    if (clo_collect_cacheuse)
    206        cacheuse_initcache(c);
    207    else
    208      c->use = 0;
    209    cachesim_clearcache(c);
    210 }
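         /* Worked example (an assumed configuration, not a requirement):
          * a 65536 B, 2-way cache with 64 B lines gives
          *   sets           = (65536 / 64) / 2 = 512
          *   line_size_bits = 6
          *   tag_shift      = 6 + log2(512) = 15
          *   tag_mask       = ~0x7fff
          * i.e. an address splits into | tag | 9 set bits | 6 offset bits |.
          */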
    211 
    212 
    213 #if 0
    214 static void print_cache(cache_t2* c)
    215 {
    216    UInt set, way, i;
    217 
    218    /* Note initialisation and update of 'i'. */
    219    for (i = 0, set = 0; set < c->sets; set++) {
    220       for (way = 0; way < c->assoc; way++, i++) {
    221          VG_(printf)("%8x ", c->tags[i]);
    222       }
    223       VG_(printf)("\n");
    224    }
    225 }
    226 #endif
    227 
    228 
    229 /*------------------------------------------------------------*/
    230 /*--- Simple Cache Simulation                              ---*/
    231 /*------------------------------------------------------------*/
    232 
    233 /*
    234  * Model: single inclusive, 2-level cache hierarchy (L1/LL)
    235  *        with write-allocate
    236  *
    237  * For simple cache hit/miss counts, we do not have to
    238  * maintain the dirty state of lines (no need to distinguish
    239  * read/write references), and the resulting counts are the
    240  * same for write-through and write-back caches.
    241  *
    242  * Simulator functions:
    243  *  CacheModelResult cachesim_I1_ref(Addr a, UChar size)
    244  *  CacheModelResult cachesim_D1_ref(Addr a, UChar size)
    245  */
    246 __attribute__((always_inline))
    247 static __inline__
    248 CacheResult cachesim_setref(cache_t2* c, UInt set_no, UWord tag)
    249 {
    250     int i, j;
    251     UWord *set;
    252 
    253     set = &(c->tags[set_no * c->assoc]);
    254 
    255     /* This loop is unrolled for just the first case, which is the most */
    256     /* common.  We can't unroll any further because it would screw up   */
    257     /* if we have a direct-mapped (1-way) cache.                        */
    258     if (tag == set[0])
    259         return Hit;
    260 
    261     /* If the tag is one other than the MRU, move it into the MRU spot  */
    262     /* and shuffle the rest down.                                       */
    263     for (i = 1; i < c->assoc; i++) {
    264         if (tag == set[i]) {
    265             for (j = i; j > 0; j--) {
    266                 set[j] = set[j - 1];
    267             }
    268             set[0] = tag;
    269             return Hit;
    270         }
    271     }
    272 
    273     /* A miss;  install this tag as MRU, shuffle rest down. */
    274     for (j = c->assoc - 1; j > 0; j--) {
    275         set[j] = set[j - 1];
    276     }
    277     set[0] = tag;
    278 
    279     return Miss;
    280 }
    281 
    282 __attribute__((always_inline))
    283 static __inline__
    284 CacheResult cachesim_ref(cache_t2* c, Addr a, UChar size)
    285 {
    286     UWord block1 =  a         >> c->line_size_bits;
    287     UWord block2 = (a+size-1) >> c->line_size_bits;
    288     UInt  set1   = block1 & c->sets_min_1;
    289     /* the tag does not need to include bits specifying the set,
    290      * but it can, and this saves instructions */
    291     UWord tag1   = block1;
    292 
    293     /* Access entirely within line. */
    294     if (block1 == block2)
    295 	return cachesim_setref(c, set1, tag1);
    296 
    297     /* Access straddles two lines. */
    298     else if (block1 + 1 == block2) {
    299         UInt  set2 = block2 & c->sets_min_1;
    300         UWord tag2 = block2;
    301 
     302 	/* the call updates cache structures as a side effect */
    303 	CacheResult res1 =  cachesim_setref(c, set1, tag1);
    304 	CacheResult res2 =  cachesim_setref(c, set2, tag2);
    305 	return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;
    306 
    307    } else {
    308        VG_(printf)("addr: %lx  size: %u  blocks: %lu %lu",
    309 		   a, size, block1, block2);
    310        VG_(tool_panic)("item straddles more than two cache sets");
    311    }
    312    return Hit;
    313 }
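         #if 0
         /* Minimal usage sketch (illustrative only; assumes the caches were
          * initialised via cachesim_initcache with 64 B lines and that the
          * addresses map as described above): */
         static void cachesim_ref_demo(void)
         {
            CacheResult r1 = cachesim_ref(&D1, 0x1000, 4); /* Miss: cold cache     */
            CacheResult r2 = cachesim_ref(&D1, 0x1000, 4); /* Hit: line is now MRU */
            CacheResult r3 = cachesim_ref(&D1, 0x103e, 4); /* straddles two lines, */
            (void)r1; (void)r2; (void)r3;                  /* one combined result  */
         }
         #endif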
    314 
    315 static
    316 CacheModelResult cachesim_I1_ref(Addr a, UChar size)
    317 {
    318     if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
    319     if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
    320     return MemAccess;
    321 }
    322 
    323 static
    324 CacheModelResult cachesim_D1_ref(Addr a, UChar size)
    325 {
    326     if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
    327     if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
    328     return MemAccess;
    329 }
    330 
    331 
    332 /*------------------------------------------------------------*/
    333 /*--- Write Back Cache Simulation                          ---*/
    334 /*------------------------------------------------------------*/
    335 
    336 /*
    337  * More complex model: L1 Write-through, LL Write-back
     338  * This needs to distinguish between read and write references.
    339  *
    340  * Simulator functions:
    341  *  CacheModelResult cachesim_I1_Read(Addr a, UChar size)
    342  *  CacheModelResult cachesim_D1_Read(Addr a, UChar size)
    343  *  CacheModelResult cachesim_D1_Write(Addr a, UChar size)
    344  */
    345 
    346 /*
     347  * With write-back, a result can be a miss that evicts a dirty line.
    348  * The dirty state of a cache line is stored in Bit0 of the tag for
    349  * this cache line (CACHELINE_DIRTY = 1). By OR'ing the reference
    350  * type (Read/Write), the line gets dirty on a write.
    351  */
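         /* Example: a clean line with tag 0x8000 is stored as 0x8000; after
          * a write hit it is stored as 0x8001 (tag | CACHELINE_DIRTY).
          * Lookups mask the flag out via (set[i] & ~CACHELINE_DIRTY), so the
          * dirty bit never disturbs tag comparison. */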
    352 __attribute__((always_inline))
    353 static __inline__
    354 CacheResult cachesim_setref_wb(cache_t2* c, RefType ref, UInt set_no, UWord tag)
    355 {
    356     int i, j;
    357     UWord *set, tmp_tag;
    358 
    359     set = &(c->tags[set_no * c->assoc]);
    360 
    361     /* This loop is unrolled for just the first case, which is the most */
    362     /* common.  We can't unroll any further because it would screw up   */
    363     /* if we have a direct-mapped (1-way) cache.                        */
    364     if (tag == (set[0] & ~CACHELINE_DIRTY)) {
    365 	set[0] |= ref;
    366         return Hit;
    367     }
    368     /* If the tag is one other than the MRU, move it into the MRU spot  */
    369     /* and shuffle the rest down.                                       */
    370     for (i = 1; i < c->assoc; i++) {
    371 	if (tag == (set[i] & ~CACHELINE_DIRTY)) {
    372 	    tmp_tag = set[i] | ref; // update dirty flag
    373             for (j = i; j > 0; j--) {
    374                 set[j] = set[j - 1];
    375             }
    376             set[0] = tmp_tag;
    377             return Hit;
    378         }
    379     }
    380 
    381     /* A miss;  install this tag as MRU, shuffle rest down. */
    382     tmp_tag = set[c->assoc - 1];
    383     for (j = c->assoc - 1; j > 0; j--) {
    384         set[j] = set[j - 1];
    385     }
    386     set[0] = tag | ref;
    387 
    388     return (tmp_tag & CACHELINE_DIRTY) ? MissDirty : Miss;
    389 }
    390 
    391 __attribute__((always_inline))
    392 static __inline__
    393 CacheResult cachesim_ref_wb(cache_t2* c, RefType ref, Addr a, UChar size)
    394 {
    395     UInt set1 = ( a         >> c->line_size_bits) & (c->sets_min_1);
    396     UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
    397     UWord tag = a & c->tag_mask;
    398 
    399     /* Access entirely within line. */
    400     if (set1 == set2)
    401 	return cachesim_setref_wb(c, ref, set1, tag);
    402 
    403     /* Access straddles two lines. */
    404     /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
    405     else if (((set1 + 1) & (c->sets_min_1)) == set2) {
    406 	UWord tag2  = (a+size-1) & c->tag_mask;
    407 
     408 	/* the call updates cache structures as a side effect */
    409 	CacheResult res1 =  cachesim_setref_wb(c, ref, set1, tag);
    410 	CacheResult res2 =  cachesim_setref_wb(c, ref, set2, tag2);
    411 
    412 	if ((res1 == MissDirty) || (res2 == MissDirty)) return MissDirty;
    413 	return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;
    414 
    415    } else {
    416        VG_(printf)("addr: %lx  size: %u  sets: %u %u", a, size, set1, set2);
    417        VG_(tool_panic)("item straddles more than two cache sets");
    418    }
    419    return Hit;
    420 }
    421 
    422 
    423 static
    424 CacheModelResult cachesim_I1_Read(Addr a, UChar size)
    425 {
    426     if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
    427     switch( cachesim_ref_wb( &LL, Read, a, size) ) {
    428 	case Hit: return LL_Hit;
    429 	case Miss: return MemAccess;
    430 	default: break;
    431     }
    432     return WriteBackMemAccess;
    433 }
    434 
    435 static
    436 CacheModelResult cachesim_D1_Read(Addr a, UChar size)
    437 {
    438     if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
    439     switch( cachesim_ref_wb( &LL, Read, a, size) ) {
    440 	case Hit: return LL_Hit;
    441 	case Miss: return MemAccess;
    442 	default: break;
    443     }
    444     return WriteBackMemAccess;
    445 }
    446 
    447 static
    448 CacheModelResult cachesim_D1_Write(Addr a, UChar size)
    449 {
    450     if ( cachesim_ref( &D1, a, size) == Hit ) {
     451 	/* Even for an L1 hit, the write-through L1 passes
    452 	 * the write to the LL to make the LL line dirty.
    453 	 * But this causes no latency, so return the hit.
    454 	 */
    455 	cachesim_ref_wb( &LL, Write, a, size);
    456 	return L1_Hit;
    457     }
    458     switch( cachesim_ref_wb( &LL, Write, a, size) ) {
    459 	case Hit: return LL_Hit;
    460 	case Miss: return MemAccess;
    461 	default: break;
    462     }
    463     return WriteBackMemAccess;
    464 }
    465 
    466 
    467 /*------------------------------------------------------------*/
    468 /*--- Hardware Prefetch Simulation                         ---*/
    469 /*------------------------------------------------------------*/
    470 
    471 static ULong prefetch_up = 0;
    472 static ULong prefetch_down = 0;
    473 
    474 #define PF_STREAMS  8
    475 #define PF_PAGEBITS 12
    476 
    477 static UInt pf_lastblock[PF_STREAMS];
    478 static Int  pf_seqblocks[PF_STREAMS];
    479 
    480 static
    481 void prefetch_clear(void)
    482 {
    483   int i;
    484   for(i=0;i<PF_STREAMS;i++)
    485     pf_lastblock[i] = pf_seqblocks[i] = 0;
    486 }
    487 
    488 /*
    489  * HW Prefetch emulation
     490  * Start prefetching when sequential access to 3 memory blocks is detected.
     491  * One stream can be detected per 4 KB page.
    492  */
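         /* Example: three accesses to consecutive blocks B, B+1, B+2 inside
          * one 4 KB page raise pf_seqblocks for that page's stream to 2, so
          * the access to B+2 already triggers a prefetch of the line five
          * blocks ahead (a + 5 * LL.line_size); a descending pattern works
          * symmetrically via prefetch_down. */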
    493 static __inline__
    494 void prefetch_LL_doref(Addr a)
    495 {
    496   UInt stream = (a >> PF_PAGEBITS) % PF_STREAMS;
    497   UInt block = ( a >> LL.line_size_bits);
    498 
    499   if (block != pf_lastblock[stream]) {
    500     if (pf_seqblocks[stream] == 0) {
    501       if (pf_lastblock[stream] +1 == block) pf_seqblocks[stream]++;
    502       else if (pf_lastblock[stream] -1 == block) pf_seqblocks[stream]--;
    503     }
    504     else if (pf_seqblocks[stream] >0) {
    505       if (pf_lastblock[stream] +1 == block) {
    506 	pf_seqblocks[stream]++;
    507 	if (pf_seqblocks[stream] >= 2) {
    508 	  prefetch_up++;
    509 	  cachesim_ref(&LL, a + 5 * LL.line_size,1);
    510 	}
    511       }
    512       else pf_seqblocks[stream] = 0;
    513     }
    514     else if (pf_seqblocks[stream] <0) {
    515       if (pf_lastblock[stream] -1 == block) {
    516 	pf_seqblocks[stream]--;
    517 	if (pf_seqblocks[stream] <= -2) {
    518 	  prefetch_down++;
    519 	  cachesim_ref(&LL, a - 5 * LL.line_size,1);
    520 	}
    521       }
    522       else pf_seqblocks[stream] = 0;
    523     }
    524     pf_lastblock[stream] = block;
    525   }
    526 }
    527 
    528 /* simple model with hardware prefetch */
    529 
    530 static
    531 CacheModelResult prefetch_I1_ref(Addr a, UChar size)
    532 {
    533     if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
    534     prefetch_LL_doref(a);
    535     if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
    536     return MemAccess;
    537 }
    538 
    539 static
    540 CacheModelResult prefetch_D1_ref(Addr a, UChar size)
    541 {
    542     if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
    543     prefetch_LL_doref(a);
    544     if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
    545     return MemAccess;
    546 }
    547 
    548 
    549 /* complex model with hardware prefetch */
    550 
    551 static
    552 CacheModelResult prefetch_I1_Read(Addr a, UChar size)
    553 {
    554     if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
    555     prefetch_LL_doref(a);
    556     switch( cachesim_ref_wb( &LL, Read, a, size) ) {
    557 	case Hit: return LL_Hit;
    558 	case Miss: return MemAccess;
    559 	default: break;
    560     }
    561     return WriteBackMemAccess;
    562 }
    563 
    564 static
    565 CacheModelResult prefetch_D1_Read(Addr a, UChar size)
    566 {
    567     if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
    568     prefetch_LL_doref(a);
    569     switch( cachesim_ref_wb( &LL, Read, a, size) ) {
    570 	case Hit: return LL_Hit;
    571 	case Miss: return MemAccess;
    572 	default: break;
    573     }
    574     return WriteBackMemAccess;
    575 }
    576 
    577 static
    578 CacheModelResult prefetch_D1_Write(Addr a, UChar size)
    579 {
    580     prefetch_LL_doref(a);
    581     if ( cachesim_ref( &D1, a, size) == Hit ) {
     582 	/* Even for an L1 hit, the write-through L1 passes
    583 	 * the write to the LL to make the LL line dirty.
    584 	 * But this causes no latency, so return the hit.
    585 	 */
    586 	cachesim_ref_wb( &LL, Write, a, size);
    587 	return L1_Hit;
    588     }
    589     switch( cachesim_ref_wb( &LL, Write, a, size) ) {
    590 	case Hit: return LL_Hit;
    591 	case Miss: return MemAccess;
    592 	default: break;
    593     }
    594     return WriteBackMemAccess;
    595 }
    596 
    597 
    598 /*------------------------------------------------------------*/
    599 /*--- Cache Simulation with use metric collection          ---*/
    600 /*------------------------------------------------------------*/
    601 
     602 /* cannot be combined with write-back or prefetch */
    603 
    604 static
    605 void cacheuse_initcache(cache_t2* c)
    606 {
    607     int i;
    608     unsigned int start_mask, start_val;
    609     unsigned int end_mask, end_val;
    610 
    611     c->use    = CLG_MALLOC("cl.sim.cu_ic.1",
    612                            sizeof(line_use) * c->sets * c->assoc);
    613     c->loaded = CLG_MALLOC("cl.sim.cu_ic.2",
    614                            sizeof(line_loaded) * c->sets * c->assoc);
    615     c->line_start_mask = CLG_MALLOC("cl.sim.cu_ic.3",
    616                                     sizeof(int) * c->line_size);
    617     c->line_end_mask = CLG_MALLOC("cl.sim.cu_ic.4",
    618                                   sizeof(int) * c->line_size);
    619 
    620     c->line_size_mask = c->line_size-1;
    621 
    622     /* Meaning of line_start_mask/line_end_mask
    623      * Example: for a given cache line, you get an access starting at
     624      * byte offset 5 with length 4, i.e. bytes 5 - 8 are touched. For a cache
    625      * line size of 32, you have 1 bit per byte in the mask:
    626      *
    627      *   bit31   bit8 bit5  bit 0
    628      *       |      |  |    |
    629      *       11..111111100000   line_start_mask[5]
    630      *       00..000111111111   line_end_mask[(5+4)-1]
    631      *
     632      *  use_mask |= line_start_mask[5] & line_end_mask[8]
    633      *
    634      */
    635     start_val = end_val = ~0;
    636     if (c->line_size < 32) {
    637 	int bits_per_byte = 32/c->line_size;
    638 	start_mask = (1<<bits_per_byte)-1;
    639 	end_mask   = start_mask << (32-bits_per_byte);
    640 	for(i=0;i<c->line_size;i++) {
    641 	    c->line_start_mask[i] = start_val;
    642 	    start_val  = start_val & ~start_mask;
    643 	    start_mask = start_mask << bits_per_byte;
    644 
    645 	    c->line_end_mask[c->line_size-i-1] = end_val;
    646 	    end_val  = end_val & ~end_mask;
    647 	    end_mask = end_mask >> bits_per_byte;
    648 	}
    649     }
    650     else {
    651 	int bytes_per_bit = c->line_size/32;
    652 	start_mask = 1;
    653 	end_mask   = 1u << 31;
    654 	for(i=0;i<c->line_size;i++) {
    655 	    c->line_start_mask[i] = start_val;
    656 	    c->line_end_mask[c->line_size-i-1] = end_val;
    657 	    if ( ((i+1)%bytes_per_bit) == 0) {
    658 		start_val   &= ~start_mask;
    659 		end_val     &= ~end_mask;
    660 		start_mask <<= 1;
    661 		end_mask   >>= 1;
    662 	    }
    663 	}
    664     }
    665 
    666     CLG_DEBUG(6, "Config %s:\n", c->desc_line);
    667     for(i=0;i<c->line_size;i++) {
    668 	CLG_DEBUG(6, " [%2d]: start mask %8x, end mask %8x\n",
    669 		  i, (UInt)c->line_start_mask[i], (UInt)c->line_end_mask[i]);
    670     }
    671 
    672     /* We use lower tag bits as offset pointers to cache use info.
     673      * I.e. some cache configurations are not supported.
    674      */
    675     if ( (1<<c->tag_shift) < c->assoc) {
    676 	VG_(message)(Vg_DebugMsg,
    677 		     "error: Use associativity < %d for cache use statistics!\n",
    678 		     (1<<c->tag_shift) );
    679 	VG_(tool_panic)("Unsupported cache configuration");
    680     }
    681 }
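         /* Worked example for the else branch above (64 B lines):
          * bytes_per_bit == 2, i.e. the 32-bit use mask tracks usage at a
          * granularity of one bit per two bytes. line_start_mask[5] then has
          * bits 2..31 set and line_end_mask[8] has bits 0..4 set. */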
    682 
    683 
    684 /* for I1/D1 caches */
    685 #define CACHEUSE(L)                                                         \
    686                                                                             \
    687 static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size)         \
    688 {                                                                           \
    689    UInt set1 = ( a         >> L.line_size_bits) & (L.sets_min_1);           \
    690    UInt set2 = ((a+size-1) >> L.line_size_bits) & (L.sets_min_1);           \
    691    UWord tag  = a & L.tag_mask;                                             \
    692    UWord tag2;                                                              \
    693    int i, j, idx;                                                           \
    694    UWord *set, tmp_tag; 						    \
    695    UInt use_mask;							    \
    696                                                                             \
    697    CLG_DEBUG(6,"%s.Acc(Addr %#lx, size %d): Sets [%u/%u]\n",                \
    698 	    L.name, a, size, set1, set2);				    \
    699                                                                             \
    700    /* First case: word entirely within line. */                             \
    701    if (set1 == set2) {                                                      \
    702                                                                             \
    703       set = &(L.tags[set1 * L.assoc]);                                      \
    704       use_mask = L.line_start_mask[a & L.line_size_mask] &                  \
    705 	         L.line_end_mask[(a+size-1) & L.line_size_mask];	    \
    706                                                                             \
    707       /* This loop is unrolled for just the first case, which is the most */\
    708       /* common.  We can't unroll any further because it would screw up   */\
    709       /* if we have a direct-mapped (1-way) cache.                        */\
    710       if (tag == (set[0] & L.tag_mask)) {                                   \
    711         idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask);                    \
    712         L.use[idx].count ++;                                                \
    713         L.use[idx].mask |= use_mask;                                        \
    714 	CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n",\
    715 		 idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,          \
    716 		 use_mask, L.use[idx].mask, L.use[idx].count);              \
    717 	return L1_Hit;							    \
    718       }                                                                     \
    719       /* If the tag is one other than the MRU, move it into the MRU spot  */\
    720       /* and shuffle the rest down.                                       */\
    721       for (i = 1; i < L.assoc; i++) {                                       \
    722 	 if (tag == (set[i] & L.tag_mask)) {			            \
    723   	    tmp_tag = set[i];                                               \
    724             for (j = i; j > 0; j--) {                                       \
    725                set[j] = set[j - 1];                                         \
    726             }                                                               \
    727             set[0] = tmp_tag;			                            \
    728             idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask);               \
    729             L.use[idx].count ++;                                            \
    730             L.use[idx].mask |= use_mask;                                    \
    731 	CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n",\
    732 		 i, idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,       \
    733 		 use_mask, L.use[idx].mask, L.use[idx].count);              \
    734             return L1_Hit;                                                  \
    735          }                                                                  \
    736       }                                                                     \
    737                                                                             \
    738       /* A miss;  install this tag as MRU, shuffle rest down. */            \
    739       tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                             \
    740       for (j = L.assoc - 1; j > 0; j--) {                                   \
    741          set[j] = set[j - 1];                                               \
    742       }                                                                     \
    743       set[0] = tag | tmp_tag;                                               \
    744       idx = (set1 * L.assoc) + tmp_tag;                                     \
    745       return update_##L##_use(&L, idx,         			            \
    746 		       use_mask, a &~ L.line_size_mask);		    \
    747                                                                             \
    748    /* Second case: word straddles two lines. */                             \
    749    /* Nb: this is a fast way of doing ((set1+1) % L.sets) */                \
    750    } else if (((set1 + 1) & (L.sets_min_1)) == set2) {                      \
    751       Int miss1=0, miss2=0; /* 0: L1 hit, 1:L1 miss, 2:LL miss */           \
    752       set = &(L.tags[set1 * L.assoc]);                                      \
    753       use_mask = L.line_start_mask[a & L.line_size_mask];		    \
    754       if (tag == (set[0] & L.tag_mask)) {                                   \
    755          idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask);                   \
    756          L.use[idx].count ++;                                               \
    757          L.use[idx].mask |= use_mask;                                       \
    758 	CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n",\
    759 		 idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,          \
    760 		 use_mask, L.use[idx].mask, L.use[idx].count);              \
    761          goto block2;                                                       \
    762       }                                                                     \
    763       for (i = 1; i < L.assoc; i++) {                                       \
    764 	 if (tag == (set[i] & L.tag_mask)) {			            \
    765   	    tmp_tag = set[i];                                               \
    766             for (j = i; j > 0; j--) {                                       \
    767                set[j] = set[j - 1];                                         \
    768             }                                                               \
    769             set[0] = tmp_tag;                                               \
    770             idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask);               \
    771             L.use[idx].count ++;                                            \
    772             L.use[idx].mask |= use_mask;                                    \
    773 	CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n",\
    774 		 i, idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,       \
    775 		 use_mask, L.use[idx].mask, L.use[idx].count);              \
    776             goto block2;                                                    \
    777          }                                                                  \
    778       }                                                                     \
    779       tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                             \
    780       for (j = L.assoc - 1; j > 0; j--) {                                   \
    781          set[j] = set[j - 1];                                               \
    782       }                                                                     \
    783       set[0] = tag | tmp_tag;                                               \
    784       idx = (set1 * L.assoc) + tmp_tag;                                     \
    785       miss1 = update_##L##_use(&L, idx,        			            \
    786 		       use_mask, a &~ L.line_size_mask);		    \
    787 block2:                                                                     \
    788       set = &(L.tags[set2 * L.assoc]);                                      \
    789       use_mask = L.line_end_mask[(a+size-1) & L.line_size_mask];  	    \
    790       tag2  = (a+size-1) & L.tag_mask;                                      \
    791       if (tag2 == (set[0] & L.tag_mask)) {                                  \
    792          idx = (set2 * L.assoc) + (set[0] & ~L.tag_mask);                   \
    793          L.use[idx].count ++;                                               \
    794          L.use[idx].mask |= use_mask;                                       \
    795 	CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n",\
    796 		 idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,          \
    797 		 use_mask, L.use[idx].mask, L.use[idx].count);              \
    798          return miss1;                                                      \
    799       }                                                                     \
    800       for (i = 1; i < L.assoc; i++) {                                       \
    801 	 if (tag2 == (set[i] & L.tag_mask)) {			            \
    802   	    tmp_tag = set[i];                                               \
    803             for (j = i; j > 0; j--) {                                       \
    804                set[j] = set[j - 1];                                         \
    805             }                                                               \
    806             set[0] = tmp_tag;                                               \
    807             idx = (set2 * L.assoc) + (tmp_tag & ~L.tag_mask);               \
    808             L.use[idx].count ++;                                            \
    809             L.use[idx].mask |= use_mask;                                    \
    810 	CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %u\n",\
    811 		 i, idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,       \
    812 		 use_mask, L.use[idx].mask, L.use[idx].count);              \
    813             return miss1;                                                   \
    814          }                                                                  \
    815       }                                                                     \
    816       tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                             \
    817       for (j = L.assoc - 1; j > 0; j--) {                                   \
    818          set[j] = set[j - 1];                                               \
    819       }                                                                     \
    820       set[0] = tag2 | tmp_tag;                                              \
    821       idx = (set2 * L.assoc) + tmp_tag;                                     \
    822       miss2 = update_##L##_use(&L, idx,			                    \
    823 		       use_mask, (a+size-1) &~ L.line_size_mask);	    \
    824       return (miss1==MemAccess || miss2==MemAccess) ? MemAccess:LL_Hit;     \
    825                                                                             \
    826    } else {                                                                 \
    827        VG_(printf)("addr: %#lx  size: %u  sets: %u %u", a, size, set1, set2); \
    828        VG_(tool_panic)("item straddles more than two cache sets");          \
    829    }                                                                        \
    830    return 0;                                                                \
    831 }
    832 
    833 
    834 /* logarithmic bitcounting algorithm, see
    835  * http://graphics.stanford.edu/~seander/bithacks.html
    836  */
    837 static __inline__ unsigned int countBits(unsigned int bits)
    838 {
    839   unsigned int c; // store the total here
    840   const int S[] = {1, 2, 4, 8, 16}; // Magic Binary Numbers
    841   const int B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF, 0x0000FFFF};
    842 
    843   c = bits;
    844   c = ((c >> S[0]) & B[0]) + (c & B[0]);
    845   c = ((c >> S[1]) & B[1]) + (c & B[1]);
    846   c = ((c >> S[2]) & B[2]) + (c & B[2]);
    847   c = ((c >> S[3]) & B[3]) + (c & B[3]);
    848   c = ((c >> S[4]) & B[4]) + (c & B[4]);
    849   return c;
    850 }
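         /* Example: countBits(0x000000f0) == 4 and countBits(~0u) == 32.
          * Below, (32 - countBits(use->mask)) * line_size / 32 converts the
          * unset fraction of the use mask into a number of untouched bytes. */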
    851 
    852 static void update_LL_use(int idx, Addr memline)
    853 {
    854   line_loaded* loaded = &(LL.loaded[idx]);
    855   line_use* use = &(LL.use[idx]);
    856   int i = ((32 - countBits(use->mask)) * LL.line_size)>>5;
    857 
    858   CLG_DEBUG(2, " LL.miss [%d]: at %#lx accessing memline %#lx\n",
    859            idx, CLG_(bb_base) + current_ii->instr_offset, memline);
    860   if (use->count>0) {
    861     CLG_DEBUG(2, "   old: used %u, loss bits %d (%08x) [line %#lx from %#lx]\n",
    862 	     use->count, i, use->mask, loaded->memline, loaded->iaddr);
    863     CLG_DEBUG(2, "   collect: %d, use_base %p\n",
    864 	     CLG_(current_state).collect, loaded->use_base);
    865 
    866     if (CLG_(current_state).collect && loaded->use_base) {
    867       (loaded->use_base)[off_LL_AcCost] += 1000 / use->count;
    868       (loaded->use_base)[off_LL_SpLoss] += i;
    869     }
    870    }
    871 
    872    use->count = 0;
    873    use->mask  = 0;
    874 
    875   loaded->memline = memline;
    876   loaded->iaddr   = CLG_(bb_base) + current_ii->instr_offset;
    877   loaded->use_base = (CLG_(current_state).nonskipped) ?
    878     CLG_(current_state).nonskipped->skipped :
    879     CLG_(cost_base) + current_ii->cost_offset;
    880 }
    881 
    882 static
    883 CacheModelResult cacheuse_LL_access(Addr memline, line_loaded* l1_loaded)
    884 {
    885    UInt setNo = (memline >> LL.line_size_bits) & (LL.sets_min_1);
    886    UWord* set = &(LL.tags[setNo * LL.assoc]);
    887    UWord tag  = memline & LL.tag_mask;
    888 
    889    int i, j, idx;
    890    UWord tmp_tag;
    891 
    892    CLG_DEBUG(6,"LL.Acc(Memline %#lx): Set %u\n", memline, setNo);
    893 
    894    if (tag == (set[0] & LL.tag_mask)) {
    895      idx = (setNo * LL.assoc) + (set[0] & ~LL.tag_mask);
    896      l1_loaded->dep_use = &(LL.use[idx]);
    897 
    898      CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): => %08x, count %u\n",
    899 		 idx, LL.loaded[idx].memline,  LL.loaded[idx].iaddr,
    900 		 LL.use[idx].mask, LL.use[idx].count);
    901      return LL_Hit;
    902    }
    903    for (i = 1; i < LL.assoc; i++) {
    904      if (tag == (set[i] & LL.tag_mask)) {
    905        tmp_tag = set[i];
    906        for (j = i; j > 0; j--) {
    907 	 set[j] = set[j - 1];
    908        }
    909        set[0] = tmp_tag;
    910        idx = (setNo * LL.assoc) + (tmp_tag & ~LL.tag_mask);
    911        l1_loaded->dep_use = &(LL.use[idx]);
    912 
    913 	CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): => %08x, count %u\n",
    914 		 i, idx, LL.loaded[idx].memline,  LL.loaded[idx].iaddr,
    915 		 LL.use[idx].mask, LL.use[idx].count);
    916 	return LL_Hit;
    917      }
    918    }
    919 
    920    /* A miss;  install this tag as MRU, shuffle rest down. */
    921    tmp_tag = set[LL.assoc - 1] & ~LL.tag_mask;
    922    for (j = LL.assoc - 1; j > 0; j--) {
    923      set[j] = set[j - 1];
    924    }
    925    set[0] = tag | tmp_tag;
    926    idx = (setNo * LL.assoc) + tmp_tag;
    927    l1_loaded->dep_use = &(LL.use[idx]);
    928 
    929    update_LL_use(idx, memline);
    930 
    931    return MemAccess;
    932 }
    933 
    934 
    935 
    936 
    937 #define UPDATE_USE(L)					             \
    938                                                                      \
    939 static CacheModelResult update##_##L##_use(cache_t2* cache, int idx, \
    940 			       UInt mask, Addr memline)		     \
    941 {                                                                    \
    942   line_loaded* loaded = &(cache->loaded[idx]);			     \
    943   line_use* use = &(cache->use[idx]);				     \
    944   int c = ((32 - countBits(use->mask)) * cache->line_size)>>5;       \
    945                                                                      \
    946   CLG_DEBUG(2, " %s.miss [%d]: at %#lx accessing memline %#lx (mask %08x)\n", \
    947            cache->name, idx, CLG_(bb_base) + current_ii->instr_offset, memline, mask); \
    948   if (use->count>0) {                                                \
    949     CLG_DEBUG(2, "   old: used %u, loss bits %d (%08x) [line %#lx from %#lx]\n",\
    950 	     use->count, c, use->mask, loaded->memline, loaded->iaddr);	\
    951     CLG_DEBUG(2, "   collect: %d, use_base %p\n", \
    952 	     CLG_(current_state).collect, loaded->use_base);	     \
    953                                                                      \
    954     if (CLG_(current_state).collect && loaded->use_base) {           \
    955       (loaded->use_base)[off_##L##_AcCost] += 1000 / use->count;     \
    956       (loaded->use_base)[off_##L##_SpLoss] += c;                     \
    957                                                                      \
     958       /* FIXME (?): L1/LL line sizes must be equal! */               \
    959       loaded->dep_use->mask |= use->mask;                            \
    960       loaded->dep_use->count += use->count;                          \
    961     }                                                                \
    962   }                                                                  \
    963                                                                      \
    964   use->count = 1;                                                    \
    965   use->mask  = mask;                                                 \
    966   loaded->memline = memline;                                         \
    967   loaded->iaddr   = CLG_(bb_base) + current_ii->instr_offset;        \
    968   loaded->use_base = (CLG_(current_state).nonskipped) ?              \
    969     CLG_(current_state).nonskipped->skipped :                        \
    970     CLG_(cost_base) + current_ii->cost_offset;                       \
    971                                                                      \
    972   if (memline == 0) return LL_Hit;                                   \
    973   return cacheuse_LL_access(memline, loaded);                        \
    974 }
    975 
    976 UPDATE_USE(I1);
    977 UPDATE_USE(D1);
    978 
    979 CACHEUSE(I1);
    980 CACHEUSE(D1);
    981 
    982 
    983 static
    984 void cacheuse_finish(void)
    985 {
    986   int i;
    987   InstrInfo ii = { 0,0,0,0 };
    988 
    989   if (!CLG_(current_state).collect) return;
    990 
    991   CLG_(bb_base) = 0;
    992   current_ii = &ii; /* needs to be set for update_XX_use */
    993   CLG_(cost_base) = 0;
    994 
    995   /* update usage counters */
    996   if (I1.use)
    997     for (i = 0; i < I1.sets * I1.assoc; i++)
    998       if (I1.loaded[i].use_base)
    999 	update_I1_use( &I1, i, 0,0);
   1000 
   1001   if (D1.use)
   1002     for (i = 0; i < D1.sets * D1.assoc; i++)
   1003       if (D1.loaded[i].use_base)
   1004 	update_D1_use( &D1, i, 0,0);
   1005 
   1006   if (LL.use)
   1007     for (i = 0; i < LL.sets * LL.assoc; i++)
   1008       if (LL.loaded[i].use_base)
   1009 	update_LL_use(i, 0);
   1010 
   1011   current_ii = 0;
   1012 }
   1013 
   1014 
   1015 
   1016 /*------------------------------------------------------------*/
   1017 /*--- Helper functions called by instrumented code         ---*/
   1018 /*------------------------------------------------------------*/
   1019 
   1020 
   1021 static __inline__
   1022 void inc_costs(CacheModelResult r, ULong* c1, ULong* c2)
   1023 {
   1024     switch(r) {
   1025 	case WriteBackMemAccess:
   1026 	    if (clo_simulate_writeback) {
   1027 		c1[3]++;
   1028 		c2[3]++;
   1029 	    }
   1030 	    // fall through
   1031 
   1032 	case MemAccess:
   1033 	    c1[2]++;
   1034 	    c2[2]++;
   1035 	    // fall through
   1036 
   1037 	case LL_Hit:
   1038 	    c1[1]++;
   1039 	    c2[1]++;
   1040 	    // fall through
   1041 
   1042 	default:
   1043 	    c1[0]++;
   1044 	    c2[0]++;
   1045     }
   1046 }
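         /* Example: r == MemAccess falls through the LL_Hit and default
          * cases, so slots [2], [1] and [0] of both cost arrays are each
          * incremented once: one memory access implies an LL access and an
          * L1 access. Slot [3] is only touched for misses evicting a dirty
          * line, and only when write-back simulation is enabled. */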
   1047 
   1048 static
   1049 const HChar* cacheRes(CacheModelResult r)
   1050 {
   1051     switch(r) {
   1052     case L1_Hit:    return "L1 Hit ";
   1053     case LL_Hit:    return "LL Hit ";
   1054     case MemAccess: return "LL Miss";
   1055     case WriteBackMemAccess: return "LL Miss (dirty)";
   1056     default:
   1057 	tl_assert(0);
   1058     }
   1059     return "??";
   1060 }
   1061 
   1062 VG_REGPARM(1)
   1063 static void log_1I0D(InstrInfo* ii)
   1064 {
   1065     CacheModelResult IrRes;
   1066 
   1067     current_ii = ii;
   1068     IrRes = (*simulator.I1_Read)(CLG_(bb_base) + ii->instr_offset, ii->instr_size);
   1069 
   1070     CLG_DEBUG(6, "log_1I0D:  Ir  %#lx/%u => %s\n",
   1071               CLG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes));
   1072 
   1073     if (CLG_(current_state).collect) {
   1074 	ULong* cost_Ir;
   1075 
   1076 	if (CLG_(current_state).nonskipped)
   1077 	    cost_Ir = CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
   1078 	else
   1079             cost_Ir = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR];
   1080 
   1081 	inc_costs(IrRes, cost_Ir,
   1082 		  CLG_(current_state).cost + fullOffset(EG_IR) );
   1083     }
   1084 }
   1085 
   1086 VG_REGPARM(2)
   1087 static void log_2I0D(InstrInfo* ii1, InstrInfo* ii2)
   1088 {
   1089     CacheModelResult Ir1Res, Ir2Res;
   1090     ULong *global_cost_Ir;
   1091 
   1092     current_ii = ii1;
   1093     Ir1Res = (*simulator.I1_Read)(CLG_(bb_base) + ii1->instr_offset, ii1->instr_size);
   1094     current_ii = ii2;
   1095     Ir2Res = (*simulator.I1_Read)(CLG_(bb_base) + ii2->instr_offset, ii2->instr_size);
   1096 
   1097     CLG_DEBUG(6, "log_2I0D:  Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s\n",
   1098               CLG_(bb_base) + ii1->instr_offset, ii1->instr_size, cacheRes(Ir1Res),
   1099               CLG_(bb_base) + ii2->instr_offset, ii2->instr_size, cacheRes(Ir2Res) );
   1100 
   1101     if (!CLG_(current_state).collect) return;
   1102 
   1103     global_cost_Ir = CLG_(current_state).cost + fullOffset(EG_IR);
   1104     if (CLG_(current_state).nonskipped) {
   1105 	ULong* skipped_cost_Ir =
   1106 	    CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
   1107 
   1108 	inc_costs(Ir1Res, global_cost_Ir, skipped_cost_Ir);
   1109 	inc_costs(Ir2Res, global_cost_Ir, skipped_cost_Ir);
   1110 	return;
   1111     }
   1112 
   1113     inc_costs(Ir1Res, global_cost_Ir,
   1114               CLG_(cost_base) + ii1->cost_offset + ii1->eventset->offset[EG_IR]);
   1115     inc_costs(Ir2Res, global_cost_Ir,
   1116               CLG_(cost_base) + ii2->cost_offset + ii2->eventset->offset[EG_IR]);
   1117 }
   1118 
   1119 VG_REGPARM(3)
   1120 static void log_3I0D(InstrInfo* ii1, InstrInfo* ii2, InstrInfo* ii3)
   1121 {
   1122     CacheModelResult Ir1Res, Ir2Res, Ir3Res;
   1123     ULong *global_cost_Ir;
   1124 
   1125     current_ii = ii1;
   1126     Ir1Res = (*simulator.I1_Read)(CLG_(bb_base) + ii1->instr_offset, ii1->instr_size);
   1127     current_ii = ii2;
   1128     Ir2Res = (*simulator.I1_Read)(CLG_(bb_base) + ii2->instr_offset, ii2->instr_size);
   1129     current_ii = ii3;
   1130     Ir3Res = (*simulator.I1_Read)(CLG_(bb_base) + ii3->instr_offset, ii3->instr_size);
   1131 
   1132     CLG_DEBUG(6, "log_3I0D:  Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s, Ir3 %#lx/%u => %s\n",
   1133               CLG_(bb_base) + ii1->instr_offset, ii1->instr_size, cacheRes(Ir1Res),
   1134               CLG_(bb_base) + ii2->instr_offset, ii2->instr_size, cacheRes(Ir2Res),
   1135               CLG_(bb_base) + ii3->instr_offset, ii3->instr_size, cacheRes(Ir3Res) );
   1136 
   1137     if (!CLG_(current_state).collect) return;
   1138 
   1139     global_cost_Ir = CLG_(current_state).cost + fullOffset(EG_IR);
   1140     if (CLG_(current_state).nonskipped) {
   1141 	ULong* skipped_cost_Ir =
   1142 	    CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
   1143 	inc_costs(Ir1Res, global_cost_Ir, skipped_cost_Ir);
   1144 	inc_costs(Ir2Res, global_cost_Ir, skipped_cost_Ir);
   1145 	inc_costs(Ir3Res, global_cost_Ir, skipped_cost_Ir);
   1146 	return;
   1147     }
   1148 
   1149     inc_costs(Ir1Res, global_cost_Ir,
   1150               CLG_(cost_base) + ii1->cost_offset + ii1->eventset->offset[EG_IR]);
   1151     inc_costs(Ir2Res, global_cost_Ir,
   1152               CLG_(cost_base) + ii2->cost_offset + ii2->eventset->offset[EG_IR]);
   1153     inc_costs(Ir3Res, global_cost_Ir,
   1154               CLG_(cost_base) + ii3->cost_offset + ii3->eventset->offset[EG_IR]);
   1155 }
   1156 
   1157 /* Instruction doing a read access */
   1158 
   1159 VG_REGPARM(3)
   1160 static void log_1I1Dr(InstrInfo* ii, Addr data_addr, Word data_size)
   1161 {
   1162     CacheModelResult IrRes, DrRes;
   1163 
   1164     current_ii = ii;
   1165     IrRes = (*simulator.I1_Read)(CLG_(bb_base) + ii->instr_offset, ii->instr_size);
   1166     DrRes = (*simulator.D1_Read)(data_addr, data_size);
   1167 
   1168     CLG_DEBUG(6, "log_1I1Dr: Ir  %#lx/%u => %s, Dr  %#lx/%ld => %s\n",
   1169               CLG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes),
   1170 	      data_addr, data_size, cacheRes(DrRes));
   1171 
   1172     if (CLG_(current_state).collect) {
   1173 	ULong *cost_Ir, *cost_Dr;
   1174 
   1175 	if (CLG_(current_state).nonskipped) {
   1176 	    cost_Ir = CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
   1177 	    cost_Dr = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DR);
   1178 	}
   1179 	else {
   1180             cost_Ir = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR];
   1181             cost_Dr = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DR];
   1182 	}
   1183 
   1184 	inc_costs(IrRes, cost_Ir,
   1185 		  CLG_(current_state).cost + fullOffset(EG_IR) );
   1186 	inc_costs(DrRes, cost_Dr,
   1187 		  CLG_(current_state).cost + fullOffset(EG_DR) );
   1188     }
   1189 }
   1190 
   1191 
   1192 /* Note that addEvent_D_guarded assumes that log_0I1Dr and log_0I1Dw
   1193    have exactly the same prototype.  If you change them, you must
   1194    change addEvent_D_guarded too. */
   1195 VG_REGPARM(3)
   1196 static void log_0I1Dr(InstrInfo* ii, Addr data_addr, Word data_size)
   1197 {
   1198     CacheModelResult DrRes;
   1199 
   1200     current_ii = ii;
   1201     DrRes = (*simulator.D1_Read)(data_addr, data_size);
   1202 
   1203     CLG_DEBUG(6, "log_0I1Dr: Dr  %#lx/%ld => %s\n",
   1204 	      data_addr, data_size, cacheRes(DrRes));
   1205 
   1206     if (CLG_(current_state).collect) {
   1207 	ULong *cost_Dr;
   1208 
   1209 	if (CLG_(current_state).nonskipped)
   1210 	    cost_Dr = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DR);
   1211 	else
   1212             cost_Dr = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DR];
   1213 
   1214 	inc_costs(DrRes, cost_Dr,
   1215 		  CLG_(current_state).cost + fullOffset(EG_DR) );
   1216     }
   1217 }
   1218 
   1219 
   1220 /* Instruction doing a write access */
   1221 
   1222 VG_REGPARM(3)
   1223 static void log_1I1Dw(InstrInfo* ii, Addr data_addr, Word data_size)
   1224 {
   1225     CacheModelResult IrRes, DwRes;
   1226 
   1227     current_ii = ii;
   1228     IrRes = (*simulator.I1_Read)(CLG_(bb_base) + ii->instr_offset, ii->instr_size);
   1229     DwRes = (*simulator.D1_Write)(data_addr, data_size);
   1230 
   1231     CLG_DEBUG(6, "log_1I1Dw: Ir  %#lx/%u => %s, Dw  %#lx/%ld => %s\n",
   1232               CLG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes),
   1233 	      data_addr, data_size, cacheRes(DwRes));
   1234 
   1235     if (CLG_(current_state).collect) {
   1236 	ULong *cost_Ir, *cost_Dw;
   1237 
   1238 	if (CLG_(current_state).nonskipped) {
   1239 	    cost_Ir = CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
   1240 	    cost_Dw = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DW);
   1241 	}
   1242 	else {
   1243             cost_Ir = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR];
   1244             cost_Dw = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DW];
   1245 	}
   1246 
   1247 	inc_costs(IrRes, cost_Ir,
   1248 		  CLG_(current_state).cost + fullOffset(EG_IR) );
   1249 	inc_costs(DwRes, cost_Dw,
   1250 		  CLG_(current_state).cost + fullOffset(EG_DW) );
   1251     }
   1252 }
   1253 
   1254 /* See comment on log_0I1Dr. */
   1255 VG_REGPARM(3)
   1256 static void log_0I1Dw(InstrInfo* ii, Addr data_addr, Word data_size)
   1257 {
   1258     CacheModelResult DwRes;
   1259 
   1260     current_ii = ii;
   1261     DwRes = (*simulator.D1_Write)(data_addr, data_size);
   1262 
   1263     CLG_DEBUG(6, "log_0I1Dw: Dw  %#lx/%ld => %s\n",
   1264 	      data_addr, data_size, cacheRes(DwRes));
   1265 
   1266     if (CLG_(current_state).collect) {
   1267 	ULong *cost_Dw;
   1268 
   1269 	if (CLG_(current_state).nonskipped)
   1270 	    cost_Dw = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DW);
   1271 	else
   1272             cost_Dw = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DW];
   1273 
   1274 	inc_costs(DwRes, cost_Dw,
   1275 		  CLG_(current_state).cost + fullOffset(EG_DW) );
   1276     }
   1277 }


/*------------------------------------------------------------*/
/*--- Cache configuration                                  ---*/
/*------------------------------------------------------------*/

static cache_t clo_I1_cache = UNDEFINED_CACHE;
static cache_t clo_D1_cache = UNDEFINED_CACHE;
static cache_t clo_LL_cache = UNDEFINED_CACHE;

/* Initialize and clear simulator state */
static void cachesim_post_clo_init(void)
{
  /* Cache configurations. */
  cache_t  I1c, D1c, LLc;

  /* Initialize access handlers */
  if (!CLG_(clo).simulate_cache) {
    CLG_(cachesim).log_1I0D  = 0;
    CLG_(cachesim).log_1I0D_name = "(no function)";
    CLG_(cachesim).log_2I0D  = 0;
    CLG_(cachesim).log_2I0D_name = "(no function)";
    CLG_(cachesim).log_3I0D  = 0;
    CLG_(cachesim).log_3I0D_name = "(no function)";

    CLG_(cachesim).log_1I1Dr = 0;
    CLG_(cachesim).log_1I1Dr_name = "(no function)";
    CLG_(cachesim).log_1I1Dw = 0;
    CLG_(cachesim).log_1I1Dw_name = "(no function)";

    CLG_(cachesim).log_0I1Dr = 0;
    CLG_(cachesim).log_0I1Dr_name = "(no function)";
    CLG_(cachesim).log_0I1Dw = 0;
    CLG_(cachesim).log_0I1Dw_name = "(no function)";
    return;
  }

  /* Cache configuration is only needed with real cache simulation */
  VG_(post_clo_init_configure_caches)(&I1c, &D1c, &LLc,
                                      &clo_I1_cache,
                                      &clo_D1_cache,
                                      &clo_LL_cache);

  I1.name = "I1";
  D1.name = "D1";
  LL.name = "LL";

  // min_line_size is used to make sure that we never feed the
  // simulator an access straddling more than two cache lines,
  // at any cache level
  CLG_(min_line_size) = (I1c.line_size < D1c.line_size)
                           ? I1c.line_size : D1c.line_size;
  CLG_(min_line_size) = (LLc.line_size < CLG_(min_line_size))
                           ? LLc.line_size : CLG_(min_line_size);

  Int largest_load_or_store_size
     = VG_(machine_get_size_of_largest_guest_register)();
  if (CLG_(min_line_size) < largest_load_or_store_size) {
     /* We cannot continue: the cache simulation might see accesses
        straddling more than 2 lines and would assert. So stop
        before we start. */
     VG_(umsg)("Callgrind: cannot continue: the minimum line size (%d)\n",
               (Int)CLG_(min_line_size));
     VG_(umsg)("  must be equal to or larger than the maximum register size (%d)\n",
               largest_load_or_store_size );
     VG_(umsg)("  but it is not.  Exiting now.\n");
     VG_(exit)(1);
  }
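
  /* Worked example (editorial): for a power-of-two line size L, an
   * access at address a of the given size straddles a line boundary
   * iff its first and last byte map to different lines:
   *
   *     Addr first_line = a & ~((Addr)L - 1);
   *     Addr last_line  = (a + size - 1) & ~((Addr)L - 1);
   *     Bool straddles  = (first_line != last_line);
   *
   * With L = 64, an 8-byte access at 0x103c touches 0x103c..0x1043,
   * i.e. the two lines at 0x1000 and 0x1040. Since every access is at
   * most min_line_size bytes, it can never touch three lines, which
   * is exactly what the check above guarantees.
   */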

  cachesim_initcache(I1c, &I1);
  cachesim_initcache(D1c, &D1);
  cachesim_initcache(LLc, &LL);

  /* All simulator variants use the standard logging helpers;
   * variant selection happens via the simulator struct. */

  CLG_(cachesim).log_1I0D  = log_1I0D;
  CLG_(cachesim).log_1I0D_name  = "log_1I0D";
  CLG_(cachesim).log_2I0D  = log_2I0D;
  CLG_(cachesim).log_2I0D_name  = "log_2I0D";
  CLG_(cachesim).log_3I0D  = log_3I0D;
  CLG_(cachesim).log_3I0D_name  = "log_3I0D";

  CLG_(cachesim).log_1I1Dr = log_1I1Dr;
  CLG_(cachesim).log_1I1Dw = log_1I1Dw;
  CLG_(cachesim).log_1I1Dr_name = "log_1I1Dr";
  CLG_(cachesim).log_1I1Dw_name = "log_1I1Dw";

  CLG_(cachesim).log_0I1Dr = log_0I1Dr;
  CLG_(cachesim).log_0I1Dw = log_0I1Dw;
  CLG_(cachesim).log_0I1Dr_name = "log_0I1Dr";
  CLG_(cachesim).log_0I1Dw_name = "log_0I1Dw";

  if (clo_collect_cacheuse) {

      /* Warn about unsupported option combinations */
      if (clo_simulate_hwpref) {
	  VG_(message)(Vg_DebugMsg,
		       "warning: prefetch simulation cannot be "
                       "used with cache usage\n");
	  clo_simulate_hwpref = False;
      }

      if (clo_simulate_writeback) {
	  VG_(message)(Vg_DebugMsg,
		       "warning: write-back simulation cannot be "
                       "used with cache usage\n");
	  clo_simulate_writeback = False;
      }

      simulator.I1_Read  = cacheuse_I1_doRead;
      simulator.D1_Read  = cacheuse_D1_doRead;
      simulator.D1_Write = cacheuse_D1_doRead;
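      /* note: cache-use tracking does not model a read/write
       * distinction, so the D1 read handler also serves writes */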
      return;
  }

  if (clo_simulate_hwpref) {
    prefetch_clear();

    if (clo_simulate_writeback) {
      simulator.I1_Read  = prefetch_I1_Read;
      simulator.D1_Read  = prefetch_D1_Read;
      simulator.D1_Write = prefetch_D1_Write;
    }
    else {
      simulator.I1_Read  = prefetch_I1_ref;
      simulator.D1_Read  = prefetch_D1_ref;
      simulator.D1_Write = prefetch_D1_ref;
    }

    return;
  }

  if (clo_simulate_writeback) {
      simulator.I1_Read  = cachesim_I1_Read;
      simulator.D1_Read  = cachesim_D1_Read;
      simulator.D1_Write = cachesim_D1_Write;
  }
  else {
      simulator.I1_Read  = cachesim_I1_ref;
      simulator.D1_Read  = cachesim_D1_ref;
      simulator.D1_Write = cachesim_D1_ref;
  }
}
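
/* Summary of the handler selection above (editorial, derived from the
 * code):
 *
 *   --cacheuse=yes                          -> cacheuse_{I1,D1}_doRead
 *                                              (hwpref/write-back forced off)
 *   --simulate-hwpref=yes --simulate-wb=yes -> prefetch_{I1,D1}_{Read,Write}
 *   --simulate-hwpref=yes                   -> prefetch_{I1,D1}_ref
 *   --simulate-wb=yes                       -> cachesim_{I1,D1}_{Read,Write}
 *   (default)                               -> cachesim_{I1,D1}_ref
 */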


/* Clear simulator state. The simulator must already have been
 * initialized. */
static
void cachesim_clear(void)
{
  cachesim_clearcache(&I1);
  cachesim_clearcache(&D1);
  cachesim_clearcache(&LL);

  prefetch_clear();
}


static void cachesim_dump_desc(VgFile *fp)
{
  VG_(fprintf)(fp, "\ndesc: I1 cache: %s\n", I1.desc_line);
  VG_(fprintf)(fp, "desc: D1 cache: %s\n", D1.desc_line);
  VG_(fprintf)(fp, "desc: LL cache: %s\n", LL.desc_line);
}

static
void cachesim_print_opts(void)
{
  VG_(printf)(
"\n   cache simulator options (using any of them enables cache simulation):\n"
"    --simulate-wb=no|yes      Count write-back events [no]\n"
"    --simulate-hwpref=no|yes  Simulate hardware prefetch [no]\n"
#if CLG_EXPERIMENTAL
"    --simulate-sectors=no|yes Simulate sectored behaviour [no]\n"
#endif
"    --cacheuse=no|yes         Collect cache block use [no]\n");
  VG_(print_cache_clo_opts)();
}

/* Check for a command line option for cache configuration.
 * Return False if unknown and not handled.
 *
 * Called from CLG_(process_cmd_line_option)() in clo.c
 */
static Bool cachesim_parse_opt(const HChar* arg)
{
   if      VG_BOOL_CLO(arg, "--simulate-wb",      clo_simulate_writeback) {}
   else if VG_BOOL_CLO(arg, "--simulate-hwpref",  clo_simulate_hwpref)    {}
   else if VG_BOOL_CLO(arg, "--simulate-sectors", clo_simulate_sectors)   {}

   else if VG_BOOL_CLO(arg, "--cacheuse", clo_collect_cacheuse) {
      if (clo_collect_cacheuse) {
         /* Use counters only make sense with fine-grained,
          * per-instruction dumping */
         CLG_(clo).dump_instr = True;
      }
   }

   else if (VG_(str_clo_cache_opt)(arg,
                                   &clo_I1_cache,
                                   &clo_D1_cache,
                                   &clo_LL_cache)) {}

   else
      return False;

   return True;
}
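
/* Example invocations (editorial; assuming the usual Callgrind option
 * spellings, with cache simulation itself enabled via --cache-sim=yes):
 *
 *   valgrind --tool=callgrind --cache-sim=yes --simulate-wb=yes  prog
 *   valgrind --tool=callgrind --cache-sim=yes --cacheuse=yes     prog
 *   valgrind --tool=callgrind --cache-sim=yes --D1=65536,8,64    prog
 *
 * The last form is parsed by VG_(str_clo_cache_opt), which accepts
 * --I1/--D1/--LL as <size>,<associativity>,<line size>; --cacheuse=yes
 * additionally switches on per-instruction dumping (dump_instr).
 */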

static
void cachesim_printstat(Int l1, Int l2, Int l3)
{
  FullCost total = CLG_(total_cost), D_total = 0;
  ULong LL_total_m, LL_total_mr, LL_total_mw,
    LL_total, LL_total_r, LL_total_w;

  if ((VG_(clo_verbosity) > 1) && clo_simulate_hwpref) {
    VG_(message)(Vg_DebugMsg, "Prefetch Up:       %llu\n",
		 prefetch_up);
    VG_(message)(Vg_DebugMsg, "Prefetch Down:     %llu\n",
		 prefetch_down);
    VG_(message)(Vg_DebugMsg, "\n");
  }

  VG_(message)(Vg_UserMsg, "I1  misses:    %'*llu\n", l1,
               total[fullOffset(EG_IR) +1]);

  VG_(message)(Vg_UserMsg, "LLi misses:    %'*llu\n", l1,
               total[fullOffset(EG_IR) +2]);

  if (0 == total[fullOffset(EG_IR)])
    total[fullOffset(EG_IR)] = 1;

  VG_(message)(Vg_UserMsg, "I1  miss rate: %*.2f%%\n", l1,
               total[fullOffset(EG_IR)+1] * 100.0 / total[fullOffset(EG_IR)]);

  VG_(message)(Vg_UserMsg, "LLi miss rate: %*.2f%%\n", l1,
               total[fullOffset(EG_IR)+2] * 100.0 / total[fullOffset(EG_IR)]);

  VG_(message)(Vg_UserMsg, "\n");

  /* D cache results.
   * Use the D_refs.rd and D_refs.wr values to determine the
   * width of columns 2 & 3. */

  D_total = CLG_(get_eventset_cost)( CLG_(sets).full );
  CLG_(init_cost)( CLG_(sets).full, D_total);
  // we only use the first 3 values of D_total, adding up Dr and Dw costs
  CLG_(copy_cost)( CLG_(get_event_set)(EG_DR), D_total, total + fullOffset(EG_DR) );
  CLG_(add_cost) ( CLG_(get_event_set)(EG_DW), D_total, total + fullOffset(EG_DW) );

  VG_(message)(Vg_UserMsg, "D   refs:      %'*llu  (%'*llu rd + %'*llu wr)\n",
               l1, D_total[0],
               l2, total[fullOffset(EG_DR)],
               l3, total[fullOffset(EG_DW)]);

  VG_(message)(Vg_UserMsg, "D1  misses:    %'*llu  (%'*llu rd + %'*llu wr)\n",
               l1, D_total[1],
               l2, total[fullOffset(EG_DR)+1],
               l3, total[fullOffset(EG_DW)+1]);

  VG_(message)(Vg_UserMsg, "LLd misses:    %'*llu  (%'*llu rd + %'*llu wr)\n",
               l1, D_total[2],
               l2, total[fullOffset(EG_DR)+2],
               l3, total[fullOffset(EG_DW)+2]);

  if (0 == D_total[0])   D_total[0] = 1;
  if (0 == total[fullOffset(EG_DR)]) total[fullOffset(EG_DR)] = 1;
  if (0 == total[fullOffset(EG_DW)]) total[fullOffset(EG_DW)] = 1;

  VG_(message)(Vg_UserMsg, "D1  miss rate: %*.1f%% (%*.1f%%   + %*.1f%%  )\n",
           l1, D_total[1] * 100.0 / D_total[0],
           l2, total[fullOffset(EG_DR)+1] * 100.0 / total[fullOffset(EG_DR)],
           l3, total[fullOffset(EG_DW)+1] * 100.0 / total[fullOffset(EG_DW)]);

  VG_(message)(Vg_UserMsg, "LLd miss rate: %*.1f%% (%*.1f%%   + %*.1f%%  )\n",
           l1, D_total[2] * 100.0 / D_total[0],
           l2, total[fullOffset(EG_DR)+2] * 100.0 / total[fullOffset(EG_DR)],
           l3, total[fullOffset(EG_DW)+2] * 100.0 / total[fullOffset(EG_DW)]);
  VG_(message)(Vg_UserMsg, "\n");


  /* LL overall results */

  LL_total   =
    total[fullOffset(EG_DR) +1] +
    total[fullOffset(EG_DW) +1] +
    total[fullOffset(EG_IR) +1];
  LL_total_r =
    total[fullOffset(EG_DR) +1] +
    total[fullOffset(EG_IR) +1];
  LL_total_w = total[fullOffset(EG_DW) +1];
  VG_(message)(Vg_UserMsg, "LL refs:       %'*llu  (%'*llu rd + %'*llu wr)\n",
               l1, LL_total, l2, LL_total_r, l3, LL_total_w);

  LL_total_m  =
    total[fullOffset(EG_DR) +2] +
    total[fullOffset(EG_DW) +2] +
    total[fullOffset(EG_IR) +2];
  LL_total_mr =
    total[fullOffset(EG_DR) +2] +
    total[fullOffset(EG_IR) +2];
  LL_total_mw = total[fullOffset(EG_DW) +2];
  VG_(message)(Vg_UserMsg, "LL misses:     %'*llu  (%'*llu rd + %'*llu wr)\n",
               l1, LL_total_m, l2, LL_total_mr, l3, LL_total_mw);

  VG_(message)(Vg_UserMsg, "LL miss rate:  %*.1f%% (%*.1f%%   + %*.1f%%  )\n",
          l1, LL_total_m  * 100.0 / (total[fullOffset(EG_IR)] + D_total[0]),
          l2, LL_total_mr * 100.0 / (total[fullOffset(EG_IR)] + total[fullOffset(EG_DR)]),
          l3, LL_total_mw * 100.0 / total[fullOffset(EG_DW)]);
}
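
/* Worked example (editorial, hypothetical numbers): with Ir = 10,000,
 * I1mr = 200 and ILmr = 50, the code above prints
 *
 *     I1  miss rate:  200 * 100.0 / 10000 = 2.00%
 *     LLi miss rate:   50 * 100.0 / 10000 = 0.50%
 *
 * The combined LL miss rate divides by all memory references,
 * Ir + Dr + Dw; the "0 == ..." checks merely avoid division by zero
 * for empty runs.
 */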


/*------------------------------------------------------------*/
/*--- Setup for Event set.                                 ---*/
/*------------------------------------------------------------*/

struct event_sets CLG_(sets);

void CLG_(init_eventsets)()
{
    // Event groups from which the event sets are composed.
    // The "Use" group is only used with "cacheuse" simulation.
    if (clo_collect_cacheuse)
	CLG_(register_event_group4)(EG_USE,
				    "AcCost1", "SpLoss1", "AcCost2", "SpLoss2");

    if (!CLG_(clo).simulate_cache)
	CLG_(register_event_group)(EG_IR, "Ir");
    else if (!clo_simulate_writeback) {
	CLG_(register_event_group3)(EG_IR, "Ir", "I1mr", "ILmr");
	CLG_(register_event_group3)(EG_DR, "Dr", "D1mr", "DLmr");
	CLG_(register_event_group3)(EG_DW, "Dw", "D1mw", "DLmw");
    }
    else { // clo_simulate_writeback
	CLG_(register_event_group4)(EG_IR, "Ir", "I1mr", "ILmr", "ILdmr");
        CLG_(register_event_group4)(EG_DR, "Dr", "D1mr", "DLmr", "DLdmr");
        CLG_(register_event_group4)(EG_DW, "Dw", "D1mw", "DLmw", "DLdmw");
    }

    if (CLG_(clo).simulate_branch) {
        CLG_(register_event_group2)(EG_BC, "Bc", "Bcm");
        CLG_(register_event_group2)(EG_BI, "Bi", "Bim");
    }

    if (CLG_(clo).collect_bus)
	CLG_(register_event_group)(EG_BUS, "Ge");

    if (CLG_(clo).collect_alloc)
	CLG_(register_event_group2)(EG_ALLOC, "allocCount", "allocSize");

    if (CLG_(clo).collect_systime)
	CLG_(register_event_group2)(EG_SYS, "sysCount", "sysTime");

    // event set used as base for instruction self cost
    CLG_(sets).base = CLG_(get_event_set2)(EG_USE, EG_IR);

    // event set comprising all event groups, used for inclusive cost
    CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).base, EG_DR, EG_DW);
    CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).full, EG_BC, EG_BI);
    CLG_(sets).full = CLG_(add_event_group) (CLG_(sets).full, EG_BUS);
    CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).full, EG_ALLOC, EG_SYS);

    CLG_DEBUGIF(1) {
	CLG_DEBUG(1, "EventSets:\n");
	CLG_(print_eventset)(-2, CLG_(sets).base);
	CLG_(print_eventset)(-2, CLG_(sets).full);
    }

    /* Non-existing events are silently ignored */
    CLG_(dumpmap) = CLG_(get_eventmapping)(CLG_(sets).full);
    CLG_(append_event)(CLG_(dumpmap), "Ir");
    CLG_(append_event)(CLG_(dumpmap), "Dr");
    CLG_(append_event)(CLG_(dumpmap), "Dw");
    CLG_(append_event)(CLG_(dumpmap), "I1mr");
    CLG_(append_event)(CLG_(dumpmap), "D1mr");
    CLG_(append_event)(CLG_(dumpmap), "D1mw");
    CLG_(append_event)(CLG_(dumpmap), "ILmr");
    CLG_(append_event)(CLG_(dumpmap), "DLmr");
    CLG_(append_event)(CLG_(dumpmap), "DLmw");
    CLG_(append_event)(CLG_(dumpmap), "ILdmr");
    CLG_(append_event)(CLG_(dumpmap), "DLdmr");
    CLG_(append_event)(CLG_(dumpmap), "DLdmw");
    CLG_(append_event)(CLG_(dumpmap), "Bc");
    CLG_(append_event)(CLG_(dumpmap), "Bcm");
    CLG_(append_event)(CLG_(dumpmap), "Bi");
    CLG_(append_event)(CLG_(dumpmap), "Bim");
    CLG_(append_event)(CLG_(dumpmap), "AcCost1");
    CLG_(append_event)(CLG_(dumpmap), "SpLoss1");
    CLG_(append_event)(CLG_(dumpmap), "AcCost2");
    CLG_(append_event)(CLG_(dumpmap), "SpLoss2");
    CLG_(append_event)(CLG_(dumpmap), "Ge");
    CLG_(append_event)(CLG_(dumpmap), "allocCount");
    CLG_(append_event)(CLG_(dumpmap), "allocSize");
    CLG_(append_event)(CLG_(dumpmap), "sysCount");
    CLG_(append_event)(CLG_(dumpmap), "sysTime");
}
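
/* Editorial note: assuming unregistered groups contribute nothing to
 * get_event_set2()/add_event_group*(), a plain cache-simulation run
 * (no write-back, branch, bus, alloc or systime collection) yields a
 * full event set of
 *
 *     Ir I1mr ILmr  Dr D1mr DLmr  Dw D1mw DLmw
 *
 * and only the matching subset of the append_event() names above ends
 * up in the dump map; the rest are silently ignored, as stated.
 */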


/* This is called at dump time for every executed instruction */
static void cachesim_add_icost(SimCost cost, BBCC* bbcc,
			       InstrInfo* ii, ULong exe_count)
{
    if (!CLG_(clo).simulate_cache)
	cost[ fullOffset(EG_IR) ] += exe_count;

    if (ii->eventset)
	CLG_(add_and_zero_cost2)( CLG_(sets).full, cost,
				  ii->eventset, bbcc->cost + ii->cost_offset);
}

static
void cachesim_finish(void)
{
  if (clo_collect_cacheuse)
    cacheuse_finish();
}

/*------------------------------------------------------------*/
/*--- The simulator defined in this file                   ---*/
/*------------------------------------------------------------*/

struct cachesim_if CLG_(cachesim) = {
  .print_opts    = cachesim_print_opts,
  .parse_opt     = cachesim_parse_opt,
  .post_clo_init = cachesim_post_clo_init,
  .clear         = cachesim_clear,
  .dump_desc     = cachesim_dump_desc,
  .printstat     = cachesim_printstat,
  .add_icost     = cachesim_add_icost,
  .finish        = cachesim_finish,

  /* these will be set by cachesim_post_clo_init */
  .log_1I0D        = 0,
  .log_2I0D        = 0,
  .log_3I0D        = 0,

  .log_1I1Dr       = 0,
  .log_1I1Dw       = 0,

  .log_0I1Dr       = 0,
  .log_0I1Dw       = 0,

  .log_1I0D_name = "(no function)",
  .log_2I0D_name = "(no function)",
  .log_3I0D_name = "(no function)",

  .log_1I1Dr_name = "(no function)",
  .log_1I1Dw_name = "(no function)",

  .log_0I1Dr_name = "(no function)",
  .log_0I1Dw_name = "(no function)",
};
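
/* Editorial sketch of how the Callgrind core uses this interface:
 * option handling calls parse_opt() per argument and post_clo_init()
 * once afterwards; instrumentation then emits calls to the non-NULL
 * log handlers, conceptually
 *
 *     if (CLG_(cachesim).log_1I1Dr)
 *         (*CLG_(cachesim).log_1I1Dr)(ii, data_addr, data_size);
 *
 * so with cache simulation disabled (handlers left at 0) no event
 * logging code is generated for memory accesses.
 */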


/*--------------------------------------------------------------------*/
/*--- end                                                    sim.c ---*/
/*--------------------------------------------------------------------*/