/*--------------------------------------------------------------------*/
/*--- Cache simulation.                                            ---*/
/*---                                                        sim.c ---*/
/*--------------------------------------------------------------------*/

/*
   This file is part of Callgrind, a Valgrind tool for call graph
   profiling programs.

   Copyright (C) 2003-2010, Josef Weidendorfer (Josef.Weidendorfer@gmx.de)

   This tool is derived from and contains code from Cachegrind
   Copyright (C) 2002-2010 Nicholas Nethercote (njn@valgrind.org)

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307, USA.

   The GNU General Public License is contained in the file COPYING.
*/

#include "global.h"


/* Notes:
  - simulates a write-allocate cache
  - (block --> set) hash function uses simple bit selection
  - handling of references straddling two cache blocks:
      - counts as only one cache access (not two)
      - both blocks hit                  --> one hit
      - one block hits, the other misses --> one miss
      - both blocks miss                 --> one miss (not two)
*/
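
/* Worked example of the straddling policy above (illustrative, assuming
 * 64-byte lines): a 4-byte access at address 0x103e touches bytes
 * 0x103e..0x1041, i.e. the two lines starting at 0x1000 and 0x1040.
 * It is counted as a single access, and as a single miss if either
 * (or both) of the two lines miss. */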

/* Cache configuration */
#include "cg_arch.h"

/* Additional structures for cache use info, separated
 * according to usage frequency:
 * - line_loaded : pointer to the cost center of the instruction
 *                 which loaded the line into cache.
 *                 Needed to increment counters when line is evicted.
 * - line_use    : updated on every access
 */
typedef struct {
  UInt count;
  UInt mask; /* e.g. for 64-byte line size: 1 bit per 2 bytes */
} line_use;

typedef struct {
  Addr memline, iaddr;
  line_use* dep_use; /* points to the higher-level cache block for this memline */
  ULong* use_base;
} line_loaded;

/* Cache state */
typedef struct {
   char*        name;
   int          size;                   /* bytes */
   int          assoc;
   int          line_size;              /* bytes */
   Bool         sectored;  /* prefetch nearside cacheline on read */
   int          sets;
   int          sets_min_1;
   int          line_size_bits;
   int          tag_shift;
   UWord        tag_mask;
   char         desc_line[128];
   UWord*       tags;

  /* for cache use */
   int          line_size_mask;
   int*         line_start_mask;
   int*         line_end_mask;
   line_loaded* loaded;
   line_use*    use;
} cache_t2;

/*
 * States of the flat caches in our model.
 * We use a 2-level hierarchy.
 */
static cache_t2 I1, D1, LL;

/* Lower bits of cache tags are used as flags for a cache line */
#define CACHELINE_FLAGMASK (MIN_LINE_SIZE-1)
#define CACHELINE_DIRTY    1
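
/* Worked example (derived from the code, illustrative): with
 * MIN_LINE_SIZE 16, CACHELINE_FLAGMASK is 0xf. Tags stored by the
 * write-back model below are line-aligned addresses (a & tag_mask),
 * so bits 0..3 are zero and free to carry flags; bit 0 holds the
 * dirty state (CACHELINE_DIRTY). */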


/* Cache simulator options */
static Bool clo_simulate_writeback = False;
static Bool clo_simulate_hwpref = False;
static Bool clo_simulate_sectors = False;
static Bool clo_collect_cacheuse = False;

/* The following global vars are set up beforehand by setup_bbcc():
 *
 * - Addr   CLG_(bb_base)     (instruction start address of original BB)
 * - ULong* CLG_(cost_base)   (start of cost array for BB)
 */

Addr   CLG_(bb_base);
ULong* CLG_(cost_base);

static InstrInfo* current_ii;

/* Cache use offsets */
/* The offsets are only correct because all per-instruction event sets
 * get the "Use" set added first!
 */
static Int off_I1_AcCost  = 0;
static Int off_I1_SpLoss  = 1;
static Int off_D1_AcCost  = 0;
static Int off_D1_SpLoss  = 1;
static Int off_LL_AcCost  = 2;
static Int off_LL_SpLoss  = 3;

/* Cache access types */
typedef enum { Read = 0, Write = CACHELINE_DIRTY } RefType;

/* Result of a reference into a flat cache */
typedef enum { Hit  = 0, Miss, MissDirty } CacheResult;

/* Result of a reference into a hierarchical cache model */
typedef enum {
    L1_Hit,
    LL_Hit,
    MemAccess,
    WriteBackMemAccess } CacheModelResult;

typedef CacheModelResult (*simcall_type)(Addr, UChar);

static struct {
    simcall_type I1_Read;
    simcall_type D1_Read;
    simcall_type D1_Write;
} simulator;
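
/* Sketch of how this struct is used (an assumption for illustration;
 * the authoritative setup code follows later in this file): the
 * pointers are filled during initialization with one of the function
 * families defined below, roughly
 *
 *   simulator.I1_Read = clo_simulate_writeback ? cachesim_I1_Read :
 *                       clo_simulate_hwpref    ? prefetch_I1_ref  :
 *                                                cachesim_I1_ref;
 */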

/*------------------------------------------------------------*/
/*--- Cache Simulator Initialization                       ---*/
/*------------------------------------------------------------*/

static void cachesim_clearcache(cache_t2* c)
{
  Int i;

  for (i = 0; i < c->sets * c->assoc; i++)
    c->tags[i] = 0;
  if (c->use) {
    for (i = 0; i < c->sets * c->assoc; i++) {
      c->loaded[i].memline  = 0;
      c->loaded[i].use_base = 0;
      c->loaded[i].dep_use = 0;
      c->loaded[i].iaddr = 0;
      c->use[i].mask    = 0;
      c->use[i].count   = 0;
      c->tags[i] = i % c->assoc; /* init lower bits as pointer */
    }
  }
}

static void cacheuse_initcache(cache_t2* c);

/* By this point, the size/assoc/line_size has been checked. */
static void cachesim_initcache(cache_t config, cache_t2* c)
{
   c->size      = config.size;
   c->assoc     = config.assoc;
   c->line_size = config.line_size;
   c->sectored  = False; // FIXME

   c->sets           = (c->size / c->line_size) / c->assoc;
   c->sets_min_1     = c->sets - 1;
   c->line_size_bits = VG_(log2)(c->line_size);
   c->tag_shift      = c->line_size_bits + VG_(log2)(c->sets);
   c->tag_mask       = ~((1<<c->tag_shift)-1);
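
   /* Worked example (illustrative): a 65536 B, 2-way cache with 64 B
    * lines gives sets = 65536/64/2 = 512, line_size_bits = 6,
    * tag_shift = 6 + 9 = 15 and tag_mask = ~0x7fff. */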

   /* Can bits in tag entries be used for flags?
    * Should always be true, as MIN_LINE_SIZE >= 16 */
   CLG_ASSERT( (c->tag_mask & CACHELINE_FLAGMASK) == 0);

   if (c->assoc == 1) {
      VG_(sprintf)(c->desc_line, "%d B, %d B, direct-mapped%s",
		   c->size, c->line_size,
		   c->sectored ? ", sectored":"");
   } else {
      VG_(sprintf)(c->desc_line, "%d B, %d B, %d-way associative%s",
		   c->size, c->line_size, c->assoc,
		   c->sectored ? ", sectored":"");
   }

   c->tags = (UWord*) CLG_MALLOC("cl.sim.cs_ic.1",
                                 sizeof(UWord) * c->sets * c->assoc);
   if (clo_collect_cacheuse)
       cacheuse_initcache(c);
   else
     c->use = 0;
   cachesim_clearcache(c);
}


#if 0
static void print_cache(cache_t2* c)
{
   UInt set, way, i;

   /* Note initialisation and update of 'i'. */
   for (i = 0, set = 0; set < c->sets; set++) {
      for (way = 0; way < c->assoc; way++, i++) {
         VG_(printf)("%8x ", c->tags[i]);
      }
      VG_(printf)("\n");
   }
}
#endif


/*------------------------------------------------------------*/
/*--- Write Through Cache Simulation                       ---*/
/*------------------------------------------------------------*/

/*
 * Simple model: L1 & LL Write Through
 * Does not distinguish between read and write references.
 *
 * Simulator functions:
 *  CacheModelResult cachesim_I1_ref(Addr a, UChar size)
 *  CacheModelResult cachesim_D1_ref(Addr a, UChar size)
 */

static __inline__
CacheResult cachesim_setref(cache_t2* c, UInt set_no, UWord tag)
{
    int i, j;
    UWord *set;

    set = &(c->tags[set_no * c->assoc]);

    /* This loop is unrolled for just the first case, which is the most */
    /* common.  We can't unroll any further because it would screw up   */
    /* if we have a direct-mapped (1-way) cache.                        */
    if (tag == set[0])
        return Hit;

    /* If the tag is one other than the MRU, move it into the MRU spot  */
    /* and shuffle the rest down.                                       */
    for (i = 1; i < c->assoc; i++) {
        if (tag == set[i]) {
            for (j = i; j > 0; j--) {
                set[j] = set[j - 1];
            }
            set[0] = tag;
            return Hit;
        }
    }

    /* A miss;  install this tag as MRU, shuffle rest down. */
    for (j = c->assoc - 1; j > 0; j--) {
        set[j] = set[j - 1];
    }
    set[0] = tag;

    return Miss;
}
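
/* Note: each set is thereby kept in LRU order as a side effect of every
 * reference: set[0] is always the most recently used way, and
 * set[assoc-1] is the victim replaced on a miss. */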

static CacheResult cachesim_ref(cache_t2* c, Addr a, UChar size)
{
    UInt  set1 = ( a         >> c->line_size_bits) & (c->sets_min_1);
    UInt  set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
    UWord tag  = a >> c->tag_shift;

    /* Access entirely within line. */
    if (set1 == set2)
	return cachesim_setref(c, set1, tag);

    /* Access straddles two lines. */
    /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
    else if (((set1 + 1) & (c->sets-1)) == set2) {
	UWord tag2  = (a+size-1) >> c->tag_shift;

	/* the call updates cache structures as side effect */
	CacheResult res1 =  cachesim_setref(c, set1, tag);
	CacheResult res2 =  cachesim_setref(c, set2, tag2);
	return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;

   } else {
       VG_(printf)("addr: %lx  size: %u  sets: %d %d", a, size, set1, set2);
       VG_(tool_panic)("item straddles more than two cache sets");
   }
   return Hit;
}
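
/* Worked example (illustrative), reusing the 512-set/64 B-line
 * configuration from above: for a = 0x1234 and size = 8,
 * set1 = (0x1234 >> 6) & 511 = 72 and set2 = (0x123b >> 6) & 511 = 72,
 * so the access falls entirely within one line. */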

static
CacheModelResult cachesim_I1_ref(Addr a, UChar size)
{
    if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
    if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
    return MemAccess;
}

static
CacheModelResult cachesim_D1_ref(Addr a, UChar size)
{
    if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
    if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
    return MemAccess;
}


/*------------------------------------------------------------*/
/*--- Write Back Cache Simulation                          ---*/
/*------------------------------------------------------------*/

/*
 * More complex model: L1 Write-through, LL Write-back
 * This needs to distinguish between read and write references.
 *
 * Simulator functions:
 *  CacheModelResult cachesim_I1_Read(Addr a, UChar size)
 *  CacheModelResult cachesim_D1_Read(Addr a, UChar size)
 *  CacheModelResult cachesim_D1_Write(Addr a, UChar size)
 */

/*
 * With write-back, the result can be a miss evicting a dirty line.
 * The dirty state of a cache line is stored in bit 0 of the tag for
 * this cache line (CACHELINE_DIRTY = 1). By OR'ing the reference
 * type (Read/Write), the line gets dirty on a write.
 */
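
/* E.g. a write hit on a clean line: set[i] has bit 0 clear, so
 * "set[i] | ref" with ref == Write (== CACHELINE_DIRTY) sets bit 0 and
 * marks the line dirty; evicting it later yields MissDirty. */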
static __inline__
CacheResult cachesim_setref_wb(cache_t2* c, RefType ref, UInt set_no, UWord tag)
{
    int i, j;
    UWord *set, tmp_tag;

    set = &(c->tags[set_no * c->assoc]);

    /* This loop is unrolled for just the first case, which is the most */
    /* common.  We can't unroll any further because it would screw up   */
    /* if we have a direct-mapped (1-way) cache.                        */
    if (tag == (set[0] & ~CACHELINE_DIRTY)) {
	set[0] |= ref;
        return Hit;
    }
    /* If the tag is one other than the MRU, move it into the MRU spot  */
    /* and shuffle the rest down.                                       */
    for (i = 1; i < c->assoc; i++) {
	if (tag == (set[i] & ~CACHELINE_DIRTY)) {
	    tmp_tag = set[i] | ref; // update dirty flag
            for (j = i; j > 0; j--) {
                set[j] = set[j - 1];
            }
            set[0] = tmp_tag;
            return Hit;
        }
    }

    /* A miss;  install this tag as MRU, shuffle rest down. */
    tmp_tag = set[c->assoc - 1];
    for (j = c->assoc - 1; j > 0; j--) {
        set[j] = set[j - 1];
    }
    set[0] = tag | ref;

    return (tmp_tag & CACHELINE_DIRTY) ? MissDirty : Miss;
}


static __inline__
CacheResult cachesim_ref_wb(cache_t2* c, RefType ref, Addr a, UChar size)
{
    UInt set1 = ( a         >> c->line_size_bits) & (c->sets_min_1);
    UInt set2 = ((a+size-1) >> c->line_size_bits) & (c->sets_min_1);
    UWord tag = a & c->tag_mask;

    /* Access entirely within line. */
    if (set1 == set2)
	return cachesim_setref_wb(c, ref, set1, tag);

    /* Access straddles two lines. */
    /* Nb: this is a fast way of doing ((set1+1) % c->sets) */
    else if (((set1 + 1) & (c->sets-1)) == set2) {
	UWord tag2  = (a+size-1) & c->tag_mask;

	/* the call updates cache structures as side effect */
	CacheResult res1 =  cachesim_setref_wb(c, ref, set1, tag);
	CacheResult res2 =  cachesim_setref_wb(c, ref, set2, tag2);

	if ((res1 == MissDirty) || (res2 == MissDirty)) return MissDirty;
	return ((res1 == Miss) || (res2 == Miss)) ? Miss : Hit;

   } else {
       VG_(printf)("addr: %lx  size: %u  sets: %d %d", a, size, set1, set2);
       VG_(tool_panic)("item straddles more than two cache sets");
   }
   return Hit;
}


static
CacheModelResult cachesim_I1_Read(Addr a, UChar size)
{
    if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
    switch( cachesim_ref_wb( &LL, Read, a, size) ) {
	case Hit: return LL_Hit;
	case Miss: return MemAccess;
	default: break;
    }
    return WriteBackMemAccess;
}

static
CacheModelResult cachesim_D1_Read(Addr a, UChar size)
{
    if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
    switch( cachesim_ref_wb( &LL, Read, a, size) ) {
	case Hit: return LL_Hit;
	case Miss: return MemAccess;
	default: break;
    }
    return WriteBackMemAccess;
}

static
CacheModelResult cachesim_D1_Write(Addr a, UChar size)
{
    if ( cachesim_ref( &D1, a, size) == Hit ) {
	/* Even for an L1 hit, the write-through L1 passes
	 * the write on to the LL to make the LL line dirty.
	 * But this causes no latency, so return the hit.
	 */
	cachesim_ref_wb( &LL, Write, a, size);
	return L1_Hit;
    }
    switch( cachesim_ref_wb( &LL, Write, a, size) ) {
	case Hit: return LL_Hit;
	case Miss: return MemAccess;
	default: break;
    }
    return WriteBackMemAccess;
}


/*------------------------------------------------------------*/
/*--- Hardware Prefetch Simulation                         ---*/
/*------------------------------------------------------------*/

static ULong prefetch_up = 0;
static ULong prefetch_down = 0;

#define PF_STREAMS  8
#define PF_PAGEBITS 12

static UInt pf_lastblock[PF_STREAMS];
static Int  pf_seqblocks[PF_STREAMS];

static
void prefetch_clear(void)
{
  int i;
  for(i=0;i<PF_STREAMS;i++)
    pf_lastblock[i] = pf_seqblocks[i] = 0;
}

/*
 * HW prefetch emulation:
 * start prefetching when sequential access to 3 memory blocks is detected.
 * One stream can be detected per 4k page.
 */
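
/* Illustrative trace for one page/stream (derived from the code below):
 * accesses to blocks b, b+1, b+2 drive pf_seqblocks from 0 to 2; from
 * the third sequential block on, each new block also touches the line
 * 5 lines ahead (a + 5 * LL.line_size) in the LL cache. Downward
 * streams behave symmetrically. */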
static __inline__
void prefetch_LL_doref(Addr a)
{
  UInt stream = (a >> PF_PAGEBITS) % PF_STREAMS;
  UInt block = ( a >> LL.line_size_bits);

  if (block != pf_lastblock[stream]) {
    if (pf_seqblocks[stream] == 0) {
      if (pf_lastblock[stream] +1 == block) pf_seqblocks[stream]++;
      else if (pf_lastblock[stream] -1 == block) pf_seqblocks[stream]--;
    }
    else if (pf_seqblocks[stream] >0) {
      if (pf_lastblock[stream] +1 == block) {
	pf_seqblocks[stream]++;
	if (pf_seqblocks[stream] >= 2) {
	  prefetch_up++;
	  cachesim_ref(&LL, a + 5 * LL.line_size,1);
	}
      }
      else pf_seqblocks[stream] = 0;
    }
    else if (pf_seqblocks[stream] <0) {
      if (pf_lastblock[stream] -1 == block) {
	pf_seqblocks[stream]--;
	if (pf_seqblocks[stream] <= -2) {
	  prefetch_down++;
	  cachesim_ref(&LL, a - 5 * LL.line_size,1);
	}
      }
      else pf_seqblocks[stream] = 0;
    }
    pf_lastblock[stream] = block;
  }
}

/* simple model with hardware prefetch */

static
CacheModelResult prefetch_I1_ref(Addr a, UChar size)
{
    if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
    prefetch_LL_doref(a);
    if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
    return MemAccess;
}

static
CacheModelResult prefetch_D1_ref(Addr a, UChar size)
{
    if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
    prefetch_LL_doref(a);
    if ( cachesim_ref( &LL, a, size) == Hit ) return LL_Hit;
    return MemAccess;
}


/* complex model with hardware prefetch */

static
CacheModelResult prefetch_I1_Read(Addr a, UChar size)
{
    if ( cachesim_ref( &I1, a, size) == Hit ) return L1_Hit;
    prefetch_LL_doref(a);
    switch( cachesim_ref_wb( &LL, Read, a, size) ) {
	case Hit: return LL_Hit;
	case Miss: return MemAccess;
	default: break;
    }
    return WriteBackMemAccess;
}

static
CacheModelResult prefetch_D1_Read(Addr a, UChar size)
{
    if ( cachesim_ref( &D1, a, size) == Hit ) return L1_Hit;
    prefetch_LL_doref(a);
    switch( cachesim_ref_wb( &LL, Read, a, size) ) {
	case Hit: return LL_Hit;
	case Miss: return MemAccess;
	default: break;
    }
    return WriteBackMemAccess;
}

static
CacheModelResult prefetch_D1_Write(Addr a, UChar size)
{
    prefetch_LL_doref(a);
    if ( cachesim_ref( &D1, a, size) == Hit ) {
	/* Even for an L1 hit, the write-through L1 passes
	 * the write on to the LL to make the LL line dirty.
	 * But this causes no latency, so return the hit.
	 */
	cachesim_ref_wb( &LL, Write, a, size);
	return L1_Hit;
    }
    switch( cachesim_ref_wb( &LL, Write, a, size) ) {
	case Hit: return LL_Hit;
	case Miss: return MemAccess;
	default: break;
    }
    return WriteBackMemAccess;
}


/*------------------------------------------------------------*/
/*--- Cache Simulation with use metric collection          ---*/
/*------------------------------------------------------------*/

/* cannot be combined with write-back or prefetch */

static
void cacheuse_initcache(cache_t2* c)
{
    int i;
    unsigned int start_mask, start_val;
    unsigned int end_mask, end_val;

    c->use    = CLG_MALLOC("cl.sim.cu_ic.1",
                           sizeof(line_use) * c->sets * c->assoc);
    c->loaded = CLG_MALLOC("cl.sim.cu_ic.2",
                           sizeof(line_loaded) * c->sets * c->assoc);
    c->line_start_mask = CLG_MALLOC("cl.sim.cu_ic.3",
                                    sizeof(int) * c->line_size);
    c->line_end_mask = CLG_MALLOC("cl.sim.cu_ic.4",
                                  sizeof(int) * c->line_size);

    c->line_size_mask = c->line_size-1;

    /* Meaning of line_start_mask/line_end_mask:
     * Example: for a given cache line, you get an access starting at
     * byte offset 5 with length 4, i.e. bytes 5..8 are touched. For a
     * cache line size of 32, you have 1 bit per byte in the mask:
     *
     *   bit31   bit8 bit5  bit 0
     *       |      |  |    |
     *       11..111111100000   line_start_mask[5]
     *       00..000111111111   line_end_mask[(5+4)-1]
     *
     *  use_mask |= line_start_mask[5] & line_end_mask[8]
     *
     */
    start_val = end_val = ~0;
    if (c->line_size < 32) {
	int bits_per_byte = 32/c->line_size;
	start_mask = (1<<bits_per_byte)-1;
	end_mask   = start_mask << (32-bits_per_byte);
	for(i=0;i<c->line_size;i++) {
	    c->line_start_mask[i] = start_val;
	    start_val  = start_val & ~start_mask;
	    start_mask = start_mask << bits_per_byte;

	    c->line_end_mask[c->line_size-i-1] = end_val;
	    end_val  = end_val & ~end_mask;
	    end_mask = end_mask >> bits_per_byte;
	}
    }
    else {
	int bytes_per_bit = c->line_size/32;
	start_mask = 1;
	end_mask   = 1 << 31;
	for(i=0;i<c->line_size;i++) {
	    c->line_start_mask[i] = start_val;
	    c->line_end_mask[c->line_size-i-1] = end_val;
	    if ( ((i+1)%bytes_per_bit) == 0) {
		start_val   &= ~start_mask;
		end_val     &= ~end_mask;
		start_mask <<= 1;
		end_mask   >>= 1;
	    }
	}
    }

    CLG_DEBUG(6, "Config %s:\n", c->desc_line);
    for(i=0;i<c->line_size;i++) {
	CLG_DEBUG(6, " [%2d]: start mask %8x, end mask %8x\n",
		  i, c->line_start_mask[i], c->line_end_mask[i]);
    }

    /* We use lower tag bits as offset pointers into the cache use info,
     * i.e. some cache configurations cannot be supported.
     */
    if ( (1<<c->tag_shift) < c->assoc) {
	VG_(message)(Vg_DebugMsg,
		     "error: Use associativity < %d for cache use statistics!\n",
		     (1<<c->tag_shift) );
	VG_(tool_panic)("Unsupported cache configuration");
    }
}


/* for I1/D1 caches */
#define CACHEUSE(L)                                                         \
                                                                            \
static CacheModelResult cacheuse##_##L##_doRead(Addr a, UChar size)         \
{                                                                           \
   UInt set1 = ( a         >> L.line_size_bits) & (L.sets_min_1);           \
   UInt set2 = ((a+size-1) >> L.line_size_bits) & (L.sets_min_1);           \
   UWord tag  = a & L.tag_mask;                                             \
   UWord tag2;                                                              \
   int i, j, idx;                                                           \
   UWord *set, tmp_tag;                                                     \
   UInt use_mask;                                                           \
                                                                            \
   CLG_DEBUG(6,"%s.Acc(Addr %#lx, size %d): Sets [%d/%d]\n",                \
	    L.name, a, size, set1, set2);                                   \
                                                                            \
   /* First case: word entirely within line. */                             \
   if (set1 == set2) {                                                      \
                                                                            \
      set = &(L.tags[set1 * L.assoc]);                                      \
      use_mask = L.line_start_mask[a & L.line_size_mask] &                  \
	         L.line_end_mask[(a+size-1) & L.line_size_mask];            \
                                                                            \
      /* This loop is unrolled for just the first case, which is the most */\
      /* common.  We can't unroll any further because it would screw up   */\
      /* if we have a direct-mapped (1-way) cache.                        */\
      if (tag == (set[0] & L.tag_mask)) {                                   \
        idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask);                    \
        L.use[idx].count ++;                                                \
        L.use[idx].mask |= use_mask;                                        \
	CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
		 idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,          \
		 use_mask, L.use[idx].mask, L.use[idx].count);              \
	return L1_Hit;                                                      \
      }                                                                     \
      /* If the tag is one other than the MRU, move it into the MRU spot  */\
      /* and shuffle the rest down.                                       */\
      for (i = 1; i < L.assoc; i++) {                                       \
	 if (tag == (set[i] & L.tag_mask)) {                                \
  	    tmp_tag = set[i];                                               \
            for (j = i; j > 0; j--) {                                       \
               set[j] = set[j - 1];                                         \
            }                                                               \
            set[0] = tmp_tag;                                               \
            idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask);               \
            L.use[idx].count ++;                                            \
            L.use[idx].mask |= use_mask;                                    \
	CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
		 i, idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,       \
		 use_mask, L.use[idx].mask, L.use[idx].count);              \
            return L1_Hit;                                                  \
         }                                                                  \
      }                                                                     \
                                                                            \
      /* A miss;  install this tag as MRU, shuffle rest down. */            \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                             \
      for (j = L.assoc - 1; j > 0; j--) {                                   \
         set[j] = set[j - 1];                                               \
      }                                                                     \
      set[0] = tag | tmp_tag;                                               \
      idx = (set1 * L.assoc) + tmp_tag;                                     \
      return update_##L##_use(&L, idx,                                      \
		       use_mask, a &~ L.line_size_mask);                    \
                                                                            \
   /* Second case: word straddles two lines. */                             \
   /* Nb: this is a fast way of doing ((set1+1) % L.sets) */                \
   } else if (((set1 + 1) & (L.sets-1)) == set2) {                          \
      Int miss1=0, miss2=0; /* 0: L1 hit, 1:L1 miss, 2:LL miss */           \
      set = &(L.tags[set1 * L.assoc]);                                      \
      use_mask = L.line_start_mask[a & L.line_size_mask];                   \
      if (tag == (set[0] & L.tag_mask)) {                                   \
         idx = (set1 * L.assoc) + (set[0] & ~L.tag_mask);                   \
         L.use[idx].count ++;                                               \
         L.use[idx].mask |= use_mask;                                       \
	CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
		 idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,          \
		 use_mask, L.use[idx].mask, L.use[idx].count);              \
         goto block2;                                                       \
      }                                                                     \
      for (i = 1; i < L.assoc; i++) {                                       \
	 if (tag == (set[i] & L.tag_mask)) {                                \
  	    tmp_tag = set[i];                                               \
            for (j = i; j > 0; j--) {                                       \
               set[j] = set[j - 1];                                         \
            }                                                               \
            set[0] = tmp_tag;                                               \
            idx = (set1 * L.assoc) + (tmp_tag & ~L.tag_mask);               \
            L.use[idx].count ++;                                            \
            L.use[idx].mask |= use_mask;                                    \
	CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
		 i, idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,       \
		 use_mask, L.use[idx].mask, L.use[idx].count);              \
            goto block2;                                                    \
         }                                                                  \
      }                                                                     \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                             \
      for (j = L.assoc - 1; j > 0; j--) {                                   \
         set[j] = set[j - 1];                                               \
      }                                                                     \
      set[0] = tag | tmp_tag;                                               \
      idx = (set1 * L.assoc) + tmp_tag;                                     \
      miss1 = update_##L##_use(&L, idx,                                     \
		       use_mask, a &~ L.line_size_mask);                    \
block2:                                                                     \
      set = &(L.tags[set2 * L.assoc]);                                      \
      use_mask = L.line_end_mask[(a+size-1) & L.line_size_mask];            \
      tag2  = (a+size-1) & L.tag_mask;                                      \
      if (tag2 == (set[0] & L.tag_mask)) {                                  \
         idx = (set2 * L.assoc) + (set[0] & ~L.tag_mask);                   \
         L.use[idx].count ++;                                               \
         L.use[idx].mask |= use_mask;                                       \
	CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
		 idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,          \
		 use_mask, L.use[idx].mask, L.use[idx].count);              \
         return miss1;                                                      \
      }                                                                     \
      for (i = 1; i < L.assoc; i++) {                                       \
	 if (tag2 == (set[i] & L.tag_mask)) {                               \
  	    tmp_tag = set[i];                                               \
            for (j = i; j > 0; j--) {                                       \
               set[j] = set[j - 1];                                         \
            }                                                               \
            set[0] = tmp_tag;                                               \
            idx = (set2 * L.assoc) + (tmp_tag & ~L.tag_mask);               \
            L.use[idx].count ++;                                            \
            L.use[idx].mask |= use_mask;                                    \
	CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): %x => %08x, count %d\n",\
		 i, idx, L.loaded[idx].memline,  L.loaded[idx].iaddr,       \
		 use_mask, L.use[idx].mask, L.use[idx].count);              \
            return miss1;                                                   \
         }                                                                  \
      }                                                                     \
      tmp_tag = set[L.assoc - 1] & ~L.tag_mask;                             \
      for (j = L.assoc - 1; j > 0; j--) {                                   \
         set[j] = set[j - 1];                                               \
      }                                                                     \
      set[0] = tag2 | tmp_tag;                                              \
      idx = (set2 * L.assoc) + tmp_tag;                                     \
      miss2 = update_##L##_use(&L, idx,                                     \
		       use_mask, (a+size-1) &~ L.line_size_mask);           \
      return (miss1==MemAccess || miss2==MemAccess) ? MemAccess:LL_Hit;     \
                                                                            \
   } else {                                                                 \
       VG_(printf)("addr: %#lx  size: %u  sets: %d %d", a, size, set1, set2); \
       VG_(tool_panic)("item straddles more than two cache sets");          \
   }                                                                        \
   return 0;                                                                \
}


/* logarithmic bitcounting algorithm, see
 * http://graphics.stanford.edu/~seander/bithacks.html
 */
static __inline__ unsigned int countBits(unsigned int bits)
{
  unsigned int c; // store the total here
  const int S[] = {1, 2, 4, 8, 16}; // Magic Binary Numbers
  const int B[] = {0x55555555, 0x33333333, 0x0F0F0F0F, 0x00FF00FF, 0x0000FFFF};

  c = bits;
  c = ((c >> S[0]) & B[0]) + (c & B[0]);
  c = ((c >> S[1]) & B[1]) + (c & B[1]);
  c = ((c >> S[2]) & B[2]) + (c & B[2]);
  c = ((c >> S[3]) & B[3]) + (c & B[3]);
  c = ((c >> S[4]) & B[4]) + (c & B[4]);
  return c;
}
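
/* E.g. countBits(0xf0f0f0f0) == 16. Below, this is used to derive the
 * number of untouched bytes in a line from its use mask. */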

static void update_LL_use(int idx, Addr memline)
{
  line_loaded* loaded = &(LL.loaded[idx]);
  line_use* use = &(LL.use[idx]);
  int i = ((32 - countBits(use->mask)) * LL.line_size)>>5;

  CLG_DEBUG(2, " LL.miss [%d]: at %#lx accessing memline %#lx\n",
           idx, CLG_(bb_base) + current_ii->instr_offset, memline);
  if (use->count>0) {
    CLG_DEBUG(2, "   old: used %d, loss bits %d (%08x) [line %#lx from %#lx]\n",
	     use->count, i, use->mask, loaded->memline, loaded->iaddr);
    CLG_DEBUG(2, "   collect: %d, use_base %p\n",
	     CLG_(current_state).collect, loaded->use_base);

    if (CLG_(current_state).collect && loaded->use_base) {
      (loaded->use_base)[off_LL_AcCost] += 1000 / use->count;
      (loaded->use_base)[off_LL_SpLoss] += i;
    }
  }

  use->count = 0;
  use->mask  = 0;

  loaded->memline = memline;
  loaded->iaddr   = CLG_(bb_base) + current_ii->instr_offset;
  loaded->use_base = (CLG_(current_state).nonskipped) ?
    CLG_(current_state).nonskipped->skipped :
    CLG_(cost_base) + current_ii->cost_offset;
}

static
CacheModelResult cacheuse_LL_access(Addr memline, line_loaded* l1_loaded)
{
   UInt setNo = (memline >> LL.line_size_bits) & (LL.sets_min_1);
   UWord* set = &(LL.tags[setNo * LL.assoc]);
   UWord tag  = memline & LL.tag_mask;

   int i, j, idx;
   UWord tmp_tag;

   CLG_DEBUG(6,"LL.Acc(Memline %#lx): Set %d\n", memline, setNo);

   if (tag == (set[0] & LL.tag_mask)) {
     idx = (setNo * LL.assoc) + (set[0] & ~LL.tag_mask);
     l1_loaded->dep_use = &(LL.use[idx]);

     CLG_DEBUG(6," Hit0 [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
		 idx, LL.loaded[idx].memline,  LL.loaded[idx].iaddr,
		 LL.use[idx].mask, LL.use[idx].count);
     return LL_Hit;
   }
   for (i = 1; i < LL.assoc; i++) {
     if (tag == (set[i] & LL.tag_mask)) {
       tmp_tag = set[i];
       for (j = i; j > 0; j--) {
	 set[j] = set[j - 1];
       }
       set[0] = tmp_tag;
       idx = (setNo * LL.assoc) + (tmp_tag & ~LL.tag_mask);
       l1_loaded->dep_use = &(LL.use[idx]);

       CLG_DEBUG(6," Hit%d [idx %d] (line %#lx from %#lx): => %08x, count %d\n",
		 i, idx, LL.loaded[idx].memline,  LL.loaded[idx].iaddr,
		 LL.use[idx].mask, LL.use[idx].count);
       return LL_Hit;
     }
   }

   /* A miss;  install this tag as MRU, shuffle rest down. */
   tmp_tag = set[LL.assoc - 1] & ~LL.tag_mask;
   for (j = LL.assoc - 1; j > 0; j--) {
     set[j] = set[j - 1];
   }
   set[0] = tag | tmp_tag;
   idx = (setNo * LL.assoc) + tmp_tag;
   l1_loaded->dep_use = &(LL.use[idx]);

   update_LL_use(idx, memline);

   return MemAccess;
}




#define UPDATE_USE(L)                                                \
                                                                     \
static CacheModelResult update##_##L##_use(cache_t2* cache, int idx, \
			       UInt mask, Addr memline)              \
{                                                                    \
  line_loaded* loaded = &(cache->loaded[idx]);                       \
  line_use* use = &(cache->use[idx]);                                \
  int c = ((32 - countBits(use->mask)) * cache->line_size)>>5;       \
                                                                     \
  CLG_DEBUG(2, " %s.miss [%d]: at %#lx accessing memline %#lx (mask %08x)\n", \
           cache->name, idx, CLG_(bb_base) + current_ii->instr_offset, memline, mask); \
  if (use->count>0) {                                                \
    CLG_DEBUG(2, "   old: used %d, loss bits %d (%08x) [line %#lx from %#lx]\n",\
	     use->count, c, use->mask, loaded->memline, loaded->iaddr); \
    CLG_DEBUG(2, "   collect: %d, use_base %p\n",                    \
	     CLG_(current_state).collect, loaded->use_base);         \
                                                                     \
    if (CLG_(current_state).collect && loaded->use_base) {           \
      (loaded->use_base)[off_##L##_AcCost] += 1000 / use->count;     \
      (loaded->use_base)[off_##L##_SpLoss] += c;                     \
                                                                     \
      /* FIXME (?): L1/LL line sizes must be equal ! */              \
      loaded->dep_use->mask |= use->mask;                            \
      loaded->dep_use->count += use->count;                          \
    }                                                                \
  }                                                                  \
                                                                     \
  use->count = 1;                                                    \
  use->mask  = mask;                                                 \
  loaded->memline = memline;                                         \
  loaded->iaddr   = CLG_(bb_base) + current_ii->instr_offset;        \
  loaded->use_base = (CLG_(current_state).nonskipped) ?              \
    CLG_(current_state).nonskipped->skipped :                        \
    CLG_(cost_base) + current_ii->cost_offset;                       \
                                                                     \
  if (memline == 0) return LL_Hit;                                   \
  return cacheuse_LL_access(memline, loaded);                        \
}

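/* Sketch of the use metrics updated above (a reading of the code, not
 * an authoritative specification): when a line that was accessed
 * use->count times gets evicted, AcCost accumulates 1000/count (small
 * if the load was amortized over many accesses), and SpLoss accumulates
 * the number of line bytes that were never touched,
 * ((32 - countBits(mask)) * line_size) / 32. */
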
UPDATE_USE(I1);
UPDATE_USE(D1);

CACHEUSE(I1);
CACHEUSE(D1);


static
void cacheuse_finish(void)
{
  int i;
  InstrInfo ii = { 0,0,0,0 };

  if (!CLG_(current_state).collect) return;

  CLG_(bb_base) = 0;
  current_ii = &ii;
  CLG_(cost_base) = 0;

  /* update usage counters */
  if (I1.use)
    for (i = 0; i < I1.sets * I1.assoc; i++)
      if (I1.loaded[i].use_base)
	update_I1_use( &I1, i, 0,0);

  if (D1.use)
    for (i = 0; i < D1.sets * D1.assoc; i++)
      if (D1.loaded[i].use_base)
	update_D1_use( &D1, i, 0,0);

  if (LL.use)
    for (i = 0; i < LL.sets * LL.assoc; i++)
      if (LL.loaded[i].use_base)
	update_LL_use(i, 0);
}



/*------------------------------------------------------------*/
/*--- Helper functions called by instrumented code         ---*/
/*------------------------------------------------------------*/


static __inline__
void inc_costs(CacheModelResult r, ULong* c1, ULong* c2)
{
    switch(r) {
	case WriteBackMemAccess:
	    if (clo_simulate_writeback) {
		c1[3]++;
		c2[3]++;
	    }
	    // fall through

	case MemAccess:
	    c1[2]++;
	    c2[2]++;
	    // fall through

	case LL_Hit:
	    c1[1]++;
	    c2[1]++;
	    // fall through

	default:
	    c1[0]++;
	    c2[0]++;
    }
}
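
/* The fall-through cascade above relies on the per-event cost arrays
 * being laid out as [0] accesses, [1] L1 misses, [2] LL misses,
 * [3] LL misses writing back dirty lines: e.g. a WriteBackMemAccess
 * increments all four counters (given clo_simulate_writeback). */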

static
Char* cacheRes(CacheModelResult r)
{
    switch(r) {
    case L1_Hit:    return "L1 Hit ";
    case LL_Hit:    return "LL Hit ";
    case MemAccess: return "LL Miss";
    case WriteBackMemAccess: return "LL Miss (dirty)";
    default:
	tl_assert(0);
    }
    return "??";
}

VG_REGPARM(1)
static void log_1I0D(InstrInfo* ii)
{
    CacheModelResult IrRes;

    current_ii = ii;
    IrRes = (*simulator.I1_Read)(CLG_(bb_base) + ii->instr_offset, ii->instr_size);

    CLG_DEBUG(6, "log_1I0D:  Ir  %#lx/%u => %s\n",
              CLG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes));

    if (CLG_(current_state).collect) {
	ULong* cost_Ir;

	if (CLG_(current_state).nonskipped)
	    cost_Ir = CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
	else
            cost_Ir = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR];

	inc_costs(IrRes, cost_Ir,
		  CLG_(current_state).cost + fullOffset(EG_IR) );
    }
}

VG_REGPARM(2)
static void log_2I0D(InstrInfo* ii1, InstrInfo* ii2)
{
    CacheModelResult Ir1Res, Ir2Res;
    ULong *global_cost_Ir;

    current_ii = ii1;
    Ir1Res = (*simulator.I1_Read)(CLG_(bb_base) + ii1->instr_offset, ii1->instr_size);
    current_ii = ii2;
    Ir2Res = (*simulator.I1_Read)(CLG_(bb_base) + ii2->instr_offset, ii2->instr_size);

    CLG_DEBUG(6, "log_2I0D:  Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s\n",
              CLG_(bb_base) + ii1->instr_offset, ii1->instr_size, cacheRes(Ir1Res),
              CLG_(bb_base) + ii2->instr_offset, ii2->instr_size, cacheRes(Ir2Res) );

    if (!CLG_(current_state).collect) return;

    global_cost_Ir = CLG_(current_state).cost + fullOffset(EG_IR);
    if (CLG_(current_state).nonskipped) {
	ULong* skipped_cost_Ir =
	    CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);

	inc_costs(Ir1Res, global_cost_Ir, skipped_cost_Ir);
	inc_costs(Ir2Res, global_cost_Ir, skipped_cost_Ir);
	return;
    }

    inc_costs(Ir1Res, global_cost_Ir,
              CLG_(cost_base) + ii1->cost_offset + ii1->eventset->offset[EG_IR]);
    inc_costs(Ir2Res, global_cost_Ir,
              CLG_(cost_base) + ii2->cost_offset + ii2->eventset->offset[EG_IR]);
}

VG_REGPARM(3)
static void log_3I0D(InstrInfo* ii1, InstrInfo* ii2, InstrInfo* ii3)
{
    CacheModelResult Ir1Res, Ir2Res, Ir3Res;
    ULong *global_cost_Ir;

    current_ii = ii1;
    Ir1Res = (*simulator.I1_Read)(CLG_(bb_base) + ii1->instr_offset, ii1->instr_size);
    current_ii = ii2;
    Ir2Res = (*simulator.I1_Read)(CLG_(bb_base) + ii2->instr_offset, ii2->instr_size);
    current_ii = ii3;
    Ir3Res = (*simulator.I1_Read)(CLG_(bb_base) + ii3->instr_offset, ii3->instr_size);

    CLG_DEBUG(6, "log_3I0D:  Ir1 %#lx/%u => %s, Ir2 %#lx/%u => %s, Ir3 %#lx/%u => %s\n",
              CLG_(bb_base) + ii1->instr_offset, ii1->instr_size, cacheRes(Ir1Res),
              CLG_(bb_base) + ii2->instr_offset, ii2->instr_size, cacheRes(Ir2Res),
              CLG_(bb_base) + ii3->instr_offset, ii3->instr_size, cacheRes(Ir3Res) );

    if (!CLG_(current_state).collect) return;

    global_cost_Ir = CLG_(current_state).cost + fullOffset(EG_IR);
    if (CLG_(current_state).nonskipped) {
	ULong* skipped_cost_Ir =
	    CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
	inc_costs(Ir1Res, global_cost_Ir, skipped_cost_Ir);
	inc_costs(Ir2Res, global_cost_Ir, skipped_cost_Ir);
	inc_costs(Ir3Res, global_cost_Ir, skipped_cost_Ir);
	return;
    }

    inc_costs(Ir1Res, global_cost_Ir,
              CLG_(cost_base) + ii1->cost_offset + ii1->eventset->offset[EG_IR]);
    inc_costs(Ir2Res, global_cost_Ir,
              CLG_(cost_base) + ii2->cost_offset + ii2->eventset->offset[EG_IR]);
    inc_costs(Ir3Res, global_cost_Ir,
              CLG_(cost_base) + ii3->cost_offset + ii3->eventset->offset[EG_IR]);
}

/* Instruction doing a read access */

VG_REGPARM(3)
static void log_1I1Dr(InstrInfo* ii, Addr data_addr, Word data_size)
{
    CacheModelResult IrRes, DrRes;

    current_ii = ii;
    IrRes = (*simulator.I1_Read)(CLG_(bb_base) + ii->instr_offset, ii->instr_size);
    DrRes = (*simulator.D1_Read)(data_addr, data_size);

    CLG_DEBUG(6, "log_1I1Dr: Ir  %#lx/%u => %s, Dr  %#lx/%lu => %s\n",
              CLG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes),
	      data_addr, data_size, cacheRes(DrRes));

    if (CLG_(current_state).collect) {
	ULong *cost_Ir, *cost_Dr;

	if (CLG_(current_state).nonskipped) {
	    cost_Ir = CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
	    cost_Dr = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DR);
	}
	else {
            cost_Ir = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR];
            cost_Dr = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DR];
	}

	inc_costs(IrRes, cost_Ir,
		  CLG_(current_state).cost + fullOffset(EG_IR) );
	inc_costs(DrRes, cost_Dr,
		  CLG_(current_state).cost + fullOffset(EG_DR) );
    }
}


VG_REGPARM(3)
static void log_0I1Dr(InstrInfo* ii, Addr data_addr, Word data_size)
{
    CacheModelResult DrRes;

    current_ii = ii;
    DrRes = (*simulator.D1_Read)(data_addr, data_size);

    CLG_DEBUG(6, "log_0I1Dr: Dr  %#lx/%lu => %s\n",
	      data_addr, data_size, cacheRes(DrRes));

    if (CLG_(current_state).collect) {
	ULong *cost_Dr;

	if (CLG_(current_state).nonskipped)
	    cost_Dr = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DR);
	else
            cost_Dr = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DR];

	inc_costs(DrRes, cost_Dr,
		  CLG_(current_state).cost + fullOffset(EG_DR) );
    }
}


/* Instruction doing a write access */

VG_REGPARM(3)
static void log_1I1Dw(InstrInfo* ii, Addr data_addr, Word data_size)
{
    CacheModelResult IrRes, DwRes;

    current_ii = ii;
    IrRes = (*simulator.I1_Read)(CLG_(bb_base) + ii->instr_offset, ii->instr_size);
    DwRes = (*simulator.D1_Write)(data_addr, data_size);

    CLG_DEBUG(6, "log_1I1Dw: Ir  %#lx/%u => %s, Dw  %#lx/%lu => %s\n",
              CLG_(bb_base) + ii->instr_offset, ii->instr_size, cacheRes(IrRes),
	      data_addr, data_size, cacheRes(DwRes));

    if (CLG_(current_state).collect) {
	ULong *cost_Ir, *cost_Dw;

	if (CLG_(current_state).nonskipped) {
	    cost_Ir = CLG_(current_state).nonskipped->skipped + fullOffset(EG_IR);
	    cost_Dw = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DW);
	}
	else {
            cost_Ir = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_IR];
            cost_Dw = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DW];
	}

	inc_costs(IrRes, cost_Ir,
		  CLG_(current_state).cost + fullOffset(EG_IR) );
	inc_costs(DwRes, cost_Dw,
		  CLG_(current_state).cost + fullOffset(EG_DW) );
    }
}

VG_REGPARM(3)
static void log_0I1Dw(InstrInfo* ii, Addr data_addr, Word data_size)
{
    CacheModelResult DwRes;

    current_ii = ii;
    DwRes = (*simulator.D1_Write)(data_addr, data_size);

    CLG_DEBUG(6, "log_0I1Dw: Dw  %#lx/%lu => %s\n",
	      data_addr, data_size, cacheRes(DwRes));

    if (CLG_(current_state).collect) {
	ULong *cost_Dw;

	if (CLG_(current_state).nonskipped)
	    cost_Dw = CLG_(current_state).nonskipped->skipped + fullOffset(EG_DW);
	else
            cost_Dw = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_DW];

	inc_costs(DwRes, cost_Dw,
		  CLG_(current_state).cost + fullOffset(EG_DW) );
    }
}
   1260 
   1261 
   1262 
   1263 /*------------------------------------------------------------*/
   1264 /*--- Cache configuration                                  ---*/
   1265 /*------------------------------------------------------------*/
   1266 
   1267 #define UNDEFINED_CACHE     ((cache_t) { -1, -1, -1 })
   1268 
   1269 static cache_t clo_I1_cache = UNDEFINED_CACHE;
   1270 static cache_t clo_D1_cache = UNDEFINED_CACHE;
   1271 static cache_t clo_LL_cache = UNDEFINED_CACHE;
   1272 
   1273 
   1274 // Checks cache config is ok.  Returns NULL if ok, or a pointer to an error
   1275 // string otherwise.
   1276 static Char* check_cache(cache_t* cache)
   1277 {
   1278    // Simulator requires line size and set count to be powers of two.
   1279    if (( cache->size % (cache->line_size * cache->assoc) != 0) ||
   1280        (-1 == VG_(log2)(cache->size/cache->line_size/cache->assoc)))
   1281    {
   1282       return "Cache set count is not a power of two.\n";
   1283    }
   1284 
   1285    // Simulator requires line size to be a power of two.
   1286    if (-1 == VG_(log2)(cache->line_size)) {
   1287       return "Cache line size is not a power of two.\n";
   1288    }
   1289 
   1290    // Then check line size >= 16 -- any smaller and a single instruction could
   1291    // straddle three cache lines, which breaks a simulation assertion and is
   1292    // stupid anyway.
   1293    if (cache->line_size < MIN_LINE_SIZE) {
   1294       return "Cache line size is too small.\n";
   1295    }
   1296 
   1297    /* Then check cache size > line size (causes seg faults if not). */
   1298    if (cache->size <= cache->line_size) {
   1299       return "Cache size <= line size.\n";
   1300    }
   1301 
   1302    /* Then check assoc <= (size / line size) (seg faults otherwise). */
   1303    if (cache->assoc > (cache->size / cache->line_size)) {
   1304       return "Cache associativity > (size / line size).\n";
   1305    }
   1306 
   1307    return NULL;
   1308 }
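
         /* Worked examples for check_cache() (illustrative): "65536,2,64"
          * gives 65536/64/2 = 512 sets, a power of two, and is accepted;
          * "49152,2,64" gives 384 sets and is rejected with "Cache set
          * count is not a power of two." */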
   1309 
   1310 static
   1311 void configure_caches(cache_t* I1c, cache_t* D1c, cache_t* LLc)
   1312 {
   1313 #define DEFINED(L)   (-1 != L.size  || -1 != L.assoc || -1 != L.line_size)
   1314 
   1315    Char* checkRes;
   1316 
   1317    Bool all_caches_clo_defined =
   1318       (DEFINED(clo_I1_cache) &&
   1319        DEFINED(clo_D1_cache) &&
   1320        DEFINED(clo_LL_cache));
   1321 
   1322    // Set the cache config (using auto-detection, if supported by the
   1323    // architecture).
   1324    VG_(configure_caches)( I1c, D1c, LLc, all_caches_clo_defined );
   1325 
   1326    // Check the default/auto-detected values.
   1327    checkRes = check_cache(I1c);  tl_assert(!checkRes);
   1328    checkRes = check_cache(D1c);  tl_assert(!checkRes);
   1329    checkRes = check_cache(LLc);  tl_assert(!checkRes);
   1330 
   1331    // Then replace with any defined on the command line.
   1332    if (DEFINED(clo_I1_cache)) { *I1c = clo_I1_cache; }
   1333    if (DEFINED(clo_D1_cache)) { *D1c = clo_D1_cache; }
   1334    if (DEFINED(clo_LL_cache)) { *LLc = clo_LL_cache; }
   1335 
   1336    if (VG_(clo_verbosity) > 1) {
   1337       VG_(umsg)("Cache configuration used:\n");
   1338       VG_(umsg)("  I1: %dB, %d-way, %dB lines\n",
   1339                 I1c->size, I1c->assoc, I1c->line_size);
   1340       VG_(umsg)("  D1: %dB, %d-way, %dB lines\n",
   1341                 D1c->size, D1c->assoc, D1c->line_size);
   1342       VG_(umsg)("  LL: %dB, %d-way, %dB lines\n",
   1343                 LLc->size, LLc->assoc, LLc->line_size);
   1344    }
    1345 #undef DEFINED
   1346 }
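
         /* Precedence sketch (illustrative): auto-detection fills all three
          * geometries first, then each --I1/--D1/--LL option replaces just
          * its own triple; e.g. running with only --I1=32768,8,64 keeps the
          * detected D1 and LL geometry. */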
   1347 
   1348 
   1349 /* Initialize and clear simulator state */
   1350 static void cachesim_post_clo_init(void)
   1351 {
   1352   /* Cache configurations. */
   1353   cache_t  I1c, D1c, LLc;
   1354 
   1355   /* Initialize access handlers */
   1356   if (!CLG_(clo).simulate_cache) {
   1357     CLG_(cachesim).log_1I0D  = 0;
   1358     CLG_(cachesim).log_1I0D_name = "(no function)";
   1359     CLG_(cachesim).log_2I0D  = 0;
   1360     CLG_(cachesim).log_2I0D_name = "(no function)";
   1361     CLG_(cachesim).log_3I0D  = 0;
   1362     CLG_(cachesim).log_3I0D_name = "(no function)";
   1363 
   1364     CLG_(cachesim).log_1I1Dr = 0;
   1365     CLG_(cachesim).log_1I1Dr_name = "(no function)";
   1366     CLG_(cachesim).log_1I1Dw = 0;
   1367     CLG_(cachesim).log_1I1Dw_name = "(no function)";
   1368 
   1369     CLG_(cachesim).log_0I1Dr = 0;
   1370     CLG_(cachesim).log_0I1Dr_name = "(no function)";
   1371     CLG_(cachesim).log_0I1Dw = 0;
   1372     CLG_(cachesim).log_0I1Dw_name = "(no function)";
   1373     return;
   1374   }
   1375 
    1376   /* Cache configuration is only needed for real cache simulation */
   1377   configure_caches(&I1c, &D1c, &LLc);
   1378 
   1379   I1.name = "I1";
   1380   D1.name = "D1";
   1381   LL.name = "LL";
   1382 
   1383   cachesim_initcache(I1c, &I1);
   1384   cachesim_initcache(D1c, &D1);
   1385   cachesim_initcache(LLc, &LL);
   1386 
    1387   /* All simulator variants share the standard log_* helpers below,
    1388    * dispatching through the simulator struct */
   1389 
   1390   CLG_(cachesim).log_1I0D  = log_1I0D;
   1391   CLG_(cachesim).log_1I0D_name  = "log_1I0D";
   1392   CLG_(cachesim).log_2I0D  = log_2I0D;
   1393   CLG_(cachesim).log_2I0D_name  = "log_2I0D";
   1394   CLG_(cachesim).log_3I0D  = log_3I0D;
   1395   CLG_(cachesim).log_3I0D_name  = "log_3I0D";
   1396 
   1397   CLG_(cachesim).log_1I1Dr = log_1I1Dr;
   1398   CLG_(cachesim).log_1I1Dw = log_1I1Dw;
   1399   CLG_(cachesim).log_1I1Dr_name = "log_1I1Dr";
   1400   CLG_(cachesim).log_1I1Dw_name = "log_1I1Dw";
   1401 
   1402   CLG_(cachesim).log_0I1Dr = log_0I1Dr;
   1403   CLG_(cachesim).log_0I1Dw = log_0I1Dw;
   1404   CLG_(cachesim).log_0I1Dr_name = "log_0I1Dr";
   1405   CLG_(cachesim).log_0I1Dw_name = "log_0I1Dw";
   1406 
   1407   if (clo_collect_cacheuse) {
   1408 
    1409       /* Warn about unsupported option combinations */
   1410       if (clo_simulate_hwpref) {
   1411 	  VG_(message)(Vg_DebugMsg,
    1412 		       "warning: prefetch simulation cannot be "
   1413                        "used with cache usage\n");
   1414 	  clo_simulate_hwpref = False;
   1415       }
   1416 
   1417       if (clo_simulate_writeback) {
   1418 	  VG_(message)(Vg_DebugMsg,
    1419 		       "warning: write-back simulation cannot be "
   1420                        "used with cache usage\n");
   1421 	  clo_simulate_writeback = False;
   1422       }
   1423 
   1424       simulator.I1_Read  = cacheuse_I1_doRead;
   1425       simulator.D1_Read  = cacheuse_D1_doRead;
   1426       simulator.D1_Write = cacheuse_D1_doRead;
   1427       return;
   1428   }
   1429 
   1430   if (clo_simulate_hwpref) {
   1431     prefetch_clear();
   1432 
   1433     if (clo_simulate_writeback) {
   1434       simulator.I1_Read  = prefetch_I1_Read;
   1435       simulator.D1_Read  = prefetch_D1_Read;
   1436       simulator.D1_Write = prefetch_D1_Write;
   1437     }
   1438     else {
   1439       simulator.I1_Read  = prefetch_I1_ref;
   1440       simulator.D1_Read  = prefetch_D1_ref;
   1441       simulator.D1_Write = prefetch_D1_ref;
   1442     }
   1443 
   1444     return;
   1445   }
   1446 
   1447   if (clo_simulate_writeback) {
   1448       simulator.I1_Read  = cachesim_I1_Read;
   1449       simulator.D1_Read  = cachesim_D1_Read;
   1450       simulator.D1_Write = cachesim_D1_Write;
   1451   }
   1452   else {
   1453       simulator.I1_Read  = cachesim_I1_ref;
   1454       simulator.D1_Read  = cachesim_D1_ref;
   1455       simulator.D1_Write = cachesim_D1_ref;
   1456   }
   1457 }
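
         /* The handler selection above, summarized (illustrative):
          *   --cacheuse=yes                -> cacheuse_*_doRead
          *                                    (wb and hwpref forced off)
          *   --simulate-hwpref=yes, wb=yes -> prefetch_*_Read / _Write
          *   --simulate-hwpref=yes, wb=no  -> prefetch_*_ref
          *   --simulate-wb=yes             -> cachesim_*_Read / _Write
          *   default                       -> cachesim_*_ref            */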
   1458 
   1459 
    1460 /* Clear simulator state. The simulator must have been initialized first */
   1461 static
   1462 void cachesim_clear(void)
   1463 {
   1464   cachesim_clearcache(&I1);
   1465   cachesim_clearcache(&D1);
   1466   cachesim_clearcache(&LL);
   1467 
   1468   prefetch_clear();
   1469 }
   1470 
   1471 
   1472 static void cachesim_getdesc(Char* buf)
   1473 {
   1474   Int p;
   1475   p = VG_(sprintf)(buf, "\ndesc: I1 cache: %s\n", I1.desc_line);
   1476   p += VG_(sprintf)(buf+p, "desc: D1 cache: %s\n", D1.desc_line);
   1477   VG_(sprintf)(buf+p, "desc: LL cache: %s\n", LL.desc_line);
   1478 }
   1479 
   1480 static
   1481 void cachesim_print_opts(void)
   1482 {
   1483   VG_(printf)(
    1484 "\n   cache simulator options (using any enables cache simulation):\n"
   1485 "    --simulate-wb=no|yes      Count write-back events [no]\n"
   1486 "    --simulate-hwpref=no|yes  Simulate hardware prefetch [no]\n"
   1487 #if CLG_EXPERIMENTAL
   1488 "    --simulate-sectors=no|yes Simulate sectored behaviour [no]\n"
   1489 #endif
   1490 "    --cacheuse=no|yes         Collect cache block use [no]\n"
   1491 "    --I1=<size>,<assoc>,<line_size>  set I1 cache manually\n"
   1492 "    --D1=<size>,<assoc>,<line_size>  set D1 cache manually\n"
   1493 "    --LL=<size>,<assoc>,<line_size>  set LL cache manually\n"
   1494 	      );
   1495 }
   1496 
   1497 static void parse_opt ( cache_t* cache,
   1498                         char* opt, Char* optval, UChar kind )
   1499 {
   1500    Long i1, i2, i3;
   1501    Char* endptr;
   1502    Char* checkRes;
   1503 
    1504    // Option argument looks like "65536,2,64".  Extract the three values.
   1505    i1 = VG_(strtoll10)(optval,   &endptr); if (*endptr != ',')  goto bad;
   1506    i2 = VG_(strtoll10)(endptr+1, &endptr); if (*endptr != ',')  goto bad;
   1507    i3 = VG_(strtoll10)(endptr+1, &endptr); if (*endptr != '\0') goto bad;
   1508 
   1509    // Check for overflow.
   1510    cache->size      = (Int)i1;
   1511    cache->assoc     = (Int)i2;
   1512    cache->line_size = (Int)i3;
   1513    if (cache->size      != i1) goto overflow;
   1514    if (cache->assoc     != i2) goto overflow;
   1515    if (cache->line_size != i3) goto overflow;
   1516 
   1517    checkRes = check_cache(cache);
   1518    if (checkRes) {
   1519       VG_(fmsg)("%s", checkRes);
   1520       goto bad;
   1521    }
   1522 
   1523    return;
   1524 
   1525   bad:
   1526    VG_(fmsg_bad_option)(opt, "");
   1527 
   1528   overflow:
   1529    VG_(fmsg_bad_option)(opt,
   1530       "One of the cache parameters was too large and overflowed.\n");
   1531 }
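
         /* Example (illustrative): "--D1=65536,2,64" reaches parse_opt() with
          * optval == "65536,2,64"; the three VG_(strtoll10) calls yield
          * i1 = 65536, i2 = 2, i3 = 64, and check_cache() validates the
          * triple before the caller stores it in clo_D1_cache. */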
   1532 
    1533 /* Check for a command-line option for cache configuration.
    1534  * Returns False if the option is not recognized here.
   1535  *
   1536  * Called from CLG_(process_cmd_line_option)() in clo.c
   1537  */
   1538 static Bool cachesim_parse_opt(Char* arg)
   1539 {
   1540    Char* tmp_str;
   1541 
   1542    if      VG_BOOL_CLO(arg, "--simulate-wb",      clo_simulate_writeback) {}
   1543    else if VG_BOOL_CLO(arg, "--simulate-hwpref",  clo_simulate_hwpref)    {}
   1544    else if VG_BOOL_CLO(arg, "--simulate-sectors", clo_simulate_sectors)   {}
   1545 
   1546    else if VG_BOOL_CLO(arg, "--cacheuse", clo_collect_cacheuse) {
   1547       if (clo_collect_cacheuse) {
    1548          /* Use counters only make sense with instruction-level dumping */
   1549          CLG_(clo).dump_instr = True;
   1550       }
   1551    }
   1552 
   1553    else if VG_STR_CLO(arg, "--I1", tmp_str)
   1554       parse_opt(&clo_I1_cache, arg, tmp_str, 'i');
   1555    else if VG_STR_CLO(arg, "--D1", tmp_str)
   1556       parse_opt(&clo_D1_cache, arg, tmp_str, '1');
   1557    else if (VG_STR_CLO(arg, "--L2", tmp_str) || // for backwards compatibility
   1558             VG_STR_CLO(arg, "--LL", tmp_str))
   1559       parse_opt(&clo_LL_cache, arg, tmp_str, '2');
    1560    else
    1561       return False;
    1562 
    1563    return True;
   1564 }
   1565 
    1566 /* Adds commas to a ULong, right-justified in a field field_width wide;
    1567  * writes the result into buf and returns the commified length. */
   1568 static
   1569 Int commify(ULong n, int field_width, char* buf)
   1570 {
   1571    int len, n_commas, i, j, new_len, space;
   1572 
   1573    VG_(sprintf)(buf, "%llu", n);
   1574    len = VG_(strlen)(buf);
   1575    n_commas = (len - 1) / 3;
   1576    new_len = len + n_commas;
   1577    space = field_width - new_len;
   1578 
   1579    /* Allow for printing a number in a field_width smaller than it's size */
   1580    if (space < 0) space = 0;
   1581 
    1582    /* Start with j = -1 so the terminating '\0' is copied before the
    1583     * digits are processed in groups of three. */
   1584    for (j = -1, i = len ; i >= 0; i--) {
   1585       buf[i + n_commas + space] = buf[i];
   1586 
   1587       if ((i>0) && (3 == ++j)) {
   1588          j = 0;
   1589          n_commas--;
   1590          buf[i + n_commas + space] = ',';
   1591       }
   1592    }
   1593    /* Right justify in field. */
   1594    for (i = 0; i < space; i++)  buf[i] = ' ';
   1595    return new_len;
   1596 }
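
         /* Worked example (illustrative): commify(1234567, 12, buf) writes
          * "   1,234,567" into buf -- two commas inserted, right-justified
          * in a 12-character field -- and returns 9, the width of the
          * commified number without the padding. */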
   1597 
   1598 static
   1599 void percentify(Int n, Int ex, Int field_width, char buf[])
   1600 {
   1601    int i, len, space;
   1602 
    1603    /* Zero-pad the fractional digits so that e.g. n == 105 with
          *  ex == 100 prints as "1.05%", not "1.5%" (ex is 10 or 100 here). */
          if (ex == 100)
             VG_(sprintf)(buf, "%d.%02d%%", n / ex, n % ex);
          else
             VG_(sprintf)(buf, "%d.%d%%", n / ex, n % ex);
   1604    len = VG_(strlen)(buf);
   1605    space = field_width - len;
   1606    if (space < 0) space = 0;     /* Allow for v. small field_width */
   1607    i = len;
   1608 
   1609    /* Right justify in field */
   1610    for (     ; i >= 0;    i--)  buf[i + space] = buf[i];
   1611    for (i = 0; i < space; i++)  buf[i] = ' ';
   1612 }
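
         /* Worked example (illustrative): percentify(2345, 100, 7, buf)
          * yields " 23.45%" -- 2345/100 = 23 whole percent, 2345%100 = 45
          * hundredths, right-justified in a 7-character field. */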
   1613 
   1614 static
   1615 void cachesim_printstat(Int l1, Int l2, Int l3)
   1616 {
   1617   FullCost total = CLG_(total_cost), D_total = 0;
   1618   ULong LL_total_m, LL_total_mr, LL_total_mw,
   1619     LL_total, LL_total_r, LL_total_w;
   1620   char buf1[RESULTS_BUF_LEN],
   1621     buf2[RESULTS_BUF_LEN],
   1622     buf3[RESULTS_BUF_LEN];
   1623   Int p;
   1624 
    1625   if ((VG_(clo_verbosity) > 1) && clo_simulate_hwpref) {
   1626     VG_(message)(Vg_DebugMsg, "Prefetch Up:       %llu\n",
   1627 		 prefetch_up);
   1628     VG_(message)(Vg_DebugMsg, "Prefetch Down:     %llu\n",
   1629 		 prefetch_down);
   1630     VG_(message)(Vg_DebugMsg, "\n");
   1631   }
   1632 
   1633   commify(total[fullOffset(EG_IR) +1], l1, buf1);
   1634   VG_(message)(Vg_UserMsg, "I1  misses:    %s\n", buf1);
   1635 
   1636   commify(total[fullOffset(EG_IR) +2], l1, buf1);
   1637   VG_(message)(Vg_UserMsg, "LLi misses:    %s\n", buf1);
   1638 
   1639   p = 100;
   1640 
   1641   if (0 == total[fullOffset(EG_IR)])
   1642     total[fullOffset(EG_IR)] = 1;
   1643 
   1644   percentify(total[fullOffset(EG_IR)+1] * 100 * p /
   1645 	     total[fullOffset(EG_IR)], p, l1+1, buf1);
   1646   VG_(message)(Vg_UserMsg, "I1  miss rate: %s\n", buf1);
   1647 
   1648   percentify(total[fullOffset(EG_IR)+2] * 100 * p /
   1649 	     total[fullOffset(EG_IR)], p, l1+1, buf1);
   1650   VG_(message)(Vg_UserMsg, "LLi miss rate: %s\n", buf1);
   1651   VG_(message)(Vg_UserMsg, "\n");
   1652 
    1653   /* D cache results.
    1654    * Use the D_refs.rd and D_refs.wr values to determine the
    1655    * width of columns 2 & 3. */
   1656 
   1657   D_total = CLG_(get_eventset_cost)( CLG_(sets).full );
   1658   CLG_(init_cost)( CLG_(sets).full, D_total);
   1659   // we only use the first 3 values of D_total, adding up Dr and Dw costs
   1660   CLG_(copy_cost)( CLG_(get_event_set)(EG_DR), D_total, total + fullOffset(EG_DR) );
   1661   CLG_(add_cost) ( CLG_(get_event_set)(EG_DW), D_total, total + fullOffset(EG_DW) );
   1662 
   1663   commify( D_total[0], l1, buf1);
   1664   commify(total[fullOffset(EG_DR)], l2,  buf2);
   1665   commify(total[fullOffset(EG_DW)], l3,  buf3);
   1666   VG_(message)(Vg_UserMsg, "D   refs:      %s  (%s rd + %s wr)\n",
   1667 	       buf1,  buf2,  buf3);
   1668 
   1669   commify( D_total[1], l1, buf1);
   1670   commify(total[fullOffset(EG_DR)+1], l2, buf2);
   1671   commify(total[fullOffset(EG_DW)+1], l3, buf3);
   1672   VG_(message)(Vg_UserMsg, "D1  misses:    %s  (%s rd + %s wr)\n",
   1673 	       buf1, buf2, buf3);
   1674 
   1675   commify( D_total[2], l1, buf1);
   1676   commify(total[fullOffset(EG_DR)+2], l2, buf2);
   1677   commify(total[fullOffset(EG_DW)+2], l3, buf3);
   1678   VG_(message)(Vg_UserMsg, "LLd misses:    %s  (%s rd + %s wr)\n",
   1679 	       buf1, buf2, buf3);
   1680 
   1681   p = 10;
   1682 
   1683   if (0 == D_total[0])   D_total[0] = 1;
   1684   if (0 == total[fullOffset(EG_DR)]) total[fullOffset(EG_DR)] = 1;
   1685   if (0 == total[fullOffset(EG_DW)]) total[fullOffset(EG_DW)] = 1;
   1686 
   1687   percentify( D_total[1] * 100 * p / D_total[0],  p, l1+1, buf1);
   1688   percentify(total[fullOffset(EG_DR)+1] * 100 * p /
   1689 	     total[fullOffset(EG_DR)], p, l2+1, buf2);
   1690   percentify(total[fullOffset(EG_DW)+1] * 100 * p /
   1691 	     total[fullOffset(EG_DW)], p, l3+1, buf3);
   1692   VG_(message)(Vg_UserMsg, "D1  miss rate: %s (%s   + %s  )\n",
   1693                buf1, buf2,buf3);
   1694 
   1695   percentify( D_total[2] * 100 * p / D_total[0],  p, l1+1, buf1);
   1696   percentify(total[fullOffset(EG_DR)+2] * 100 * p /
   1697 	     total[fullOffset(EG_DR)], p, l2+1, buf2);
   1698   percentify(total[fullOffset(EG_DW)+2] * 100 * p /
   1699 	     total[fullOffset(EG_DW)], p, l3+1, buf3);
   1700   VG_(message)(Vg_UserMsg, "LLd miss rate: %s (%s   + %s  )\n",
   1701                buf1, buf2,buf3);
   1702   VG_(message)(Vg_UserMsg, "\n");
   1703 
   1704 
   1705 
   1706   /* LL overall results */
   1707 
   1708   LL_total   =
   1709     total[fullOffset(EG_DR) +1] +
   1710     total[fullOffset(EG_DW) +1] +
   1711     total[fullOffset(EG_IR) +1];
   1712   LL_total_r =
   1713     total[fullOffset(EG_DR) +1] +
   1714     total[fullOffset(EG_IR) +1];
   1715   LL_total_w = total[fullOffset(EG_DW) +1];
   1716   commify(LL_total,   l1, buf1);
   1717   commify(LL_total_r, l2, buf2);
   1718   commify(LL_total_w, l3, buf3);
   1719   VG_(message)(Vg_UserMsg, "LL refs:       %s  (%s rd + %s wr)\n",
   1720 	       buf1, buf2, buf3);
   1721 
   1722   LL_total_m  =
   1723     total[fullOffset(EG_DR) +2] +
   1724     total[fullOffset(EG_DW) +2] +
   1725     total[fullOffset(EG_IR) +2];
   1726   LL_total_mr =
   1727     total[fullOffset(EG_DR) +2] +
   1728     total[fullOffset(EG_IR) +2];
   1729   LL_total_mw = total[fullOffset(EG_DW) +2];
   1730   commify(LL_total_m,  l1, buf1);
   1731   commify(LL_total_mr, l2, buf2);
   1732   commify(LL_total_mw, l3, buf3);
   1733   VG_(message)(Vg_UserMsg, "LL misses:     %s  (%s rd + %s wr)\n",
   1734 	       buf1, buf2, buf3);
   1735 
   1736   percentify(LL_total_m  * 100 * p /
   1737 	     (total[fullOffset(EG_IR)] + D_total[0]),  p, l1+1, buf1);
   1738   percentify(LL_total_mr * 100 * p /
   1739 	     (total[fullOffset(EG_IR)] + total[fullOffset(EG_DR)]),
   1740 	     p, l2+1, buf2);
   1741   percentify(LL_total_mw * 100 * p /
   1742 	     total[fullOffset(EG_DW)], p, l3+1, buf3);
   1743   VG_(message)(Vg_UserMsg, "LL miss rate:  %s (%s   + %s  )\n",
   1744 	       buf1, buf2,buf3);
   1745 }
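
         /* Note on the scaled arithmetic above (illustrative): a rate is
          * computed as misses * 100 * p / accesses and printed with ex == p.
          * For the I1 rate, p == 100, so 1234 misses in 567890 fetches give
          * 1234 * 100 * 100 / 567890 == 21, rendered as "0.21%"; the D and
          * LL rates use p == 10, i.e. one fractional digit. */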
   1746 
   1747 
   1748 /*------------------------------------------------------------*/
   1749 /*--- Setup for Event set.                                 ---*/
   1750 /*------------------------------------------------------------*/
   1751 
   1752 struct event_sets CLG_(sets);
   1753 
   1754 void CLG_(init_eventsets)()
   1755 {
    1756     // Event groups from which the event sets are composed.
    1757     // The "Use" group is only used with "cacheuse" simulation.
   1758     if (clo_collect_cacheuse)
   1759 	CLG_(register_event_group4)(EG_USE,
   1760 				    "AcCost1", "SpLoss1", "AcCost2", "SpLoss2");
   1761 
   1762     if (!CLG_(clo).simulate_cache)
   1763 	CLG_(register_event_group)(EG_IR, "Ir");
   1764     else if (!clo_simulate_writeback) {
   1765 	CLG_(register_event_group3)(EG_IR, "Ir", "I1mr", "ILmr");
   1766 	CLG_(register_event_group3)(EG_DR, "Dr", "D1mr", "DLmr");
   1767 	CLG_(register_event_group3)(EG_DW, "Dw", "D1mw", "DLmw");
   1768     }
   1769     else { // clo_simulate_writeback
   1770 	CLG_(register_event_group4)(EG_IR, "Ir", "I1mr", "ILmr", "ILdmr");
   1771         CLG_(register_event_group4)(EG_DR, "Dr", "D1mr", "DLmr", "DLdmr");
   1772         CLG_(register_event_group4)(EG_DW, "Dw", "D1mw", "DLmw", "DLdmw");
   1773     }
   1774 
   1775     if (CLG_(clo).simulate_branch) {
   1776         CLG_(register_event_group2)(EG_BC, "Bc", "Bcm");
   1777         CLG_(register_event_group2)(EG_BI, "Bi", "Bim");
   1778     }
   1779 
   1780     if (CLG_(clo).collect_bus)
   1781 	CLG_(register_event_group)(EG_BUS, "Ge");
   1782 
   1783     if (CLG_(clo).collect_alloc)
   1784 	CLG_(register_event_group2)(EG_ALLOC, "allocCount", "allocSize");
   1785 
   1786     if (CLG_(clo).collect_systime)
   1787 	CLG_(register_event_group2)(EG_SYS, "sysCount", "sysTime");
   1788 
   1789     // event set used as base for instruction self cost
   1790     CLG_(sets).base = CLG_(get_event_set2)(EG_USE, EG_IR);
   1791 
   1792     // event set comprising all event groups, used for inclusive cost
   1793     CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).base, EG_DR, EG_DW);
   1794     CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).full, EG_BC, EG_BI);
   1795     CLG_(sets).full = CLG_(add_event_group) (CLG_(sets).full, EG_BUS);
   1796     CLG_(sets).full = CLG_(add_event_group2)(CLG_(sets).full, EG_ALLOC, EG_SYS);
   1797 
   1798     CLG_DEBUGIF(1) {
   1799 	CLG_DEBUG(1, "EventSets:\n");
   1800 	CLG_(print_eventset)(-2, CLG_(sets).base);
   1801 	CLG_(print_eventset)(-2, CLG_(sets).full);
   1802     }
   1803 
    1804     /* Events missing from the event set are silently ignored */
   1805     CLG_(dumpmap) = CLG_(get_eventmapping)(CLG_(sets).full);
   1806     CLG_(append_event)(CLG_(dumpmap), "Ir");
   1807     CLG_(append_event)(CLG_(dumpmap), "Dr");
   1808     CLG_(append_event)(CLG_(dumpmap), "Dw");
   1809     CLG_(append_event)(CLG_(dumpmap), "I1mr");
   1810     CLG_(append_event)(CLG_(dumpmap), "D1mr");
   1811     CLG_(append_event)(CLG_(dumpmap), "D1mw");
   1812     CLG_(append_event)(CLG_(dumpmap), "ILmr");
   1813     CLG_(append_event)(CLG_(dumpmap), "DLmr");
   1814     CLG_(append_event)(CLG_(dumpmap), "DLmw");
   1815     CLG_(append_event)(CLG_(dumpmap), "ILdmr");
   1816     CLG_(append_event)(CLG_(dumpmap), "DLdmr");
   1817     CLG_(append_event)(CLG_(dumpmap), "DLdmw");
   1818     CLG_(append_event)(CLG_(dumpmap), "Bc");
   1819     CLG_(append_event)(CLG_(dumpmap), "Bcm");
   1820     CLG_(append_event)(CLG_(dumpmap), "Bi");
   1821     CLG_(append_event)(CLG_(dumpmap), "Bim");
   1822     CLG_(append_event)(CLG_(dumpmap), "AcCost1");
   1823     CLG_(append_event)(CLG_(dumpmap), "SpLoss1");
   1824     CLG_(append_event)(CLG_(dumpmap), "AcCost2");
   1825     CLG_(append_event)(CLG_(dumpmap), "SpLoss2");
   1826     CLG_(append_event)(CLG_(dumpmap), "Ge");
   1827     CLG_(append_event)(CLG_(dumpmap), "allocCount");
   1828     CLG_(append_event)(CLG_(dumpmap), "allocSize");
   1829     CLG_(append_event)(CLG_(dumpmap), "sysCount");
   1830     CLG_(append_event)(CLG_(dumpmap), "sysTime");
   1831 }
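
         /* Illustrative example: with plain cache simulation (no write-back,
          * no cacheuse), EG_IR holds ("Ir","I1mr","ILmr"), so CLG_(sets).base
          * contains just these three events, and CLG_(sets).full extends
          * them with the EG_DR and EG_DW triples plus whatever optional
          * groups were registered above. */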
   1832 
   1833 
   1834 /* this is called at dump time for every instruction executed */
   1835 static void cachesim_add_icost(SimCost cost, BBCC* bbcc,
   1836 			       InstrInfo* ii, ULong exe_count)
   1837 {
   1838     if (!CLG_(clo).simulate_cache)
   1839 	cost[ fullOffset(EG_IR) ] += exe_count;
   1840 
   1841     if (ii->eventset)
   1842 	CLG_(add_and_zero_cost2)( CLG_(sets).full, cost,
   1843 				  ii->eventset, bbcc->cost + ii->cost_offset);
   1844 }
   1845 
   1846 static
   1847 void cachesim_finish(void)
   1848 {
   1849   if (clo_collect_cacheuse)
   1850     cacheuse_finish();
   1851 }
   1852 
   1853 /*------------------------------------------------------------*/
   1854 /*--- The simulator defined in this file                   ---*/
   1855 /*------------------------------------------------------------*/
   1856 
   1857 struct cachesim_if CLG_(cachesim) = {
   1858   .print_opts    = cachesim_print_opts,
   1859   .parse_opt     = cachesim_parse_opt,
   1860   .post_clo_init = cachesim_post_clo_init,
   1861   .clear         = cachesim_clear,
   1862   .getdesc       = cachesim_getdesc,
   1863   .printstat     = cachesim_printstat,
   1864   .add_icost     = cachesim_add_icost,
   1865   .finish        = cachesim_finish,
   1866 
   1867   /* these will be set by cachesim_post_clo_init */
   1868   .log_1I0D        = 0,
   1869   .log_2I0D        = 0,
   1870   .log_3I0D        = 0,
   1871 
   1872   .log_1I1Dr       = 0,
   1873   .log_1I1Dw       = 0,
   1874 
   1875   .log_0I1Dr       = 0,
   1876   .log_0I1Dw       = 0,
   1877 
   1878   .log_1I0D_name = "(no function)",
   1879   .log_2I0D_name = "(no function)",
   1880   .log_3I0D_name = "(no function)",
   1881 
   1882   .log_1I1Dr_name = "(no function)",
   1883   .log_1I1Dw_name = "(no function)",
   1884 
   1885   .log_0I1Dr_name = "(no function)",
   1886   .log_0I1Dw_name = "(no function)",
   1887 };
   1888 
   1889 
   1890 /*--------------------------------------------------------------------*/
    1891 /*--- end                                                    sim.c ---*/
   1892 /*--------------------------------------------------------------------*/
   1893 
   1894