Home | History | Annotate | Download | only in cachegrind
      1 
      2 /*--------------------------------------------------------------------*/
      3 /*--- Cachegrind: everything but the simulation itself.            ---*/
      4 /*---                                                    cg_main.c ---*/
      5 /*--------------------------------------------------------------------*/
      6 
      7 /*
      8    This file is part of Cachegrind, a Valgrind tool for cache
      9    profiling programs.
     10 
     11    Copyright (C) 2002-2013 Nicholas Nethercote
     12       njn (at) valgrind.org
     13 
     14    This program is free software; you can redistribute it and/or
     15    modify it under the terms of the GNU General Public License as
     16    published by the Free Software Foundation; either version 2 of the
     17    License, or (at your option) any later version.
     18 
     19    This program is distributed in the hope that it will be useful, but
     20    WITHOUT ANY WARRANTY; without even the implied warranty of
     21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     22    General Public License for more details.
     23 
     24    You should have received a copy of the GNU General Public License
     25    along with this program; if not, write to the Free Software
     26    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
     27    02111-1307, USA.
     28 
     29    The GNU General Public License is contained in the file COPYING.
     30 */
     31 
     32 #include "pub_tool_basics.h"
     33 #include "pub_tool_vki.h"
     34 #include "pub_tool_debuginfo.h"
     35 #include "pub_tool_libcbase.h"
     36 #include "pub_tool_libcassert.h"
     37 #include "pub_tool_libcfile.h"
     38 #include "pub_tool_libcprint.h"
     39 #include "pub_tool_libcproc.h"
     40 #include "pub_tool_machine.h"
     41 #include "pub_tool_mallocfree.h"
     42 #include "pub_tool_options.h"
     43 #include "pub_tool_oset.h"
     44 #include "pub_tool_tooliface.h"
     45 #include "pub_tool_xarray.h"
     46 #include "pub_tool_clientstate.h"
     47 #include "pub_tool_machine.h"      // VG_(fnptr_to_fnentry)
     48 
     49 #include "cg_arch.h"
     50 #include "cg_sim.c"
     51 #include "cg_branchpred.c"
     52 
     53 /*------------------------------------------------------------*/
     54 /*--- Constants                                            ---*/
     55 /*------------------------------------------------------------*/
     56 
     57 /* Set to 1 for very verbose debugging */
     58 #define DEBUG_CG 0
     59 
     60 #define MIN_LINE_SIZE         16
     61 #define FILE_LEN              VKI_PATH_MAX
     62 #define FN_LEN                256
     63 
     64 /*------------------------------------------------------------*/
     65 /*--- Options                                              ---*/
     66 /*------------------------------------------------------------*/
     67 
     68 static Bool  clo_cache_sim  = True;  /* do cache simulation? */
     69 static Bool  clo_branch_sim = False; /* do branch simulation? */
     70 static const HChar* clo_cachegrind_out_file = "cachegrind.out.%p";
     71 
     72 /*------------------------------------------------------------*/
     73 /*--- Cachesim configuration                               ---*/
     74 /*------------------------------------------------------------*/
     75 
     76 static Int min_line_size = 0; /* min of L1 and LL cache line sizes */
     77 
     78 /*------------------------------------------------------------*/
     79 /*--- Types and Data Structures                            ---*/
     80 /*------------------------------------------------------------*/
     81 
     82 typedef
     83    struct {
     84       ULong a;  /* total # memory accesses of this kind */
     85       ULong m1; /* misses in the first level cache */
     86       ULong mL; /* misses in the second level cache */
     87    }
     88    CacheCC;
     89 
     90 typedef
     91    struct {
     92       ULong b;  /* total # branches of this kind */
     93       ULong mp; /* number of branches mispredicted */
     94    }
     95    BranchCC;
     96 
     97 //------------------------------------------------------------
     98 // Primary data structure #1: CC table
     99 // - Holds the per-source-line hit/miss stats, grouped by file/function/line.
    100 // - an ordered set of CCs.  CC indexing done by file/function/line (as
    101 //   determined from the instrAddr).
    102 // - Traversed for dumping stats at end in file/func/line hierarchy.
    103 
    104 typedef struct {
    105    HChar* file;
    106    HChar* fn;
    107    Int    line;
    108 }
    109 CodeLoc;
    110 
    111 typedef struct {
    112    CodeLoc  loc; /* Source location that these counts pertain to */
    113    CacheCC  Ir;  /* Insn read counts */
    114    CacheCC  Dr;  /* Data read counts */
    115    CacheCC  Dw;  /* Data write/modify counts */
    116    BranchCC Bc;  /* Conditional branch counts */
    117    BranchCC Bi;  /* Indirect branch counts */
    118 } LineCC;
    119 
    120 // First compare file, then fn, then line.
    121 static Word cmp_CodeLoc_LineCC(const void *vloc, const void *vcc)
    122 {
    123    Word res;
    124    const CodeLoc* a = (const CodeLoc*)vloc;
    125    const CodeLoc* b = &(((const LineCC*)vcc)->loc);
    126 
    127    res = VG_(strcmp)(a->file, b->file);
    128    if (0 != res)
    129       return res;
    130 
    131    res = VG_(strcmp)(a->fn, b->fn);
    132    if (0 != res)
    133       return res;
    134 
    135    return a->line - b->line;
    136 }
    137 
    138 static OSet* CC_table;
    139 
    140 //------------------------------------------------------------
    141 // Primary data structure #2: InstrInfo table
    142 // - Holds the cached info about each instr that is used for simulation.
    143 // - table(SB_start_addr, list(InstrInfo))
    144 // - For each SB, each InstrInfo in the list holds info about the
    145 //   instruction (instrLen, instrAddr, etc), plus a pointer to its line
    146 //   CC.  This node is what's passed to the simulation function.
    147 // - When SBs are discarded the relevant list(instr_details) is freed.
    148 
    149 typedef struct _InstrInfo InstrInfo;
    150 struct _InstrInfo {
    151    Addr    instr_addr;
    152    UChar   instr_len;
    153    LineCC* parent;         // parent line-CC
    154 };
    155 
    156 typedef struct _SB_info SB_info;
    157 struct _SB_info {
    158    Addr      SB_addr;      // key;  MUST BE FIRST
    159    Int       n_instrs;
    160    InstrInfo instrs[0];
    161 };
    162 
    163 static OSet* instrInfoTable;
    164 
    165 //------------------------------------------------------------
    166 // Secondary data structure: string table
    167 // - holds strings, avoiding dups
    168 // - used for filenames and function names, each of which will be
    169 //   pointed to by one or more CCs.
    170 // - it also allows equality checks just by pointer comparison, which
    171 //   is good when printing the output file at the end.
    172 
    173 static OSet* stringTable;
    174 
    175 //------------------------------------------------------------
    176 // Stats
    177 static Int  distinct_files      = 0;
    178 static Int  distinct_fns        = 0;
    179 static Int  distinct_lines      = 0;
    180 static Int  distinct_instrsGen  = 0;
    181 static Int  distinct_instrsNoX  = 0;
    182 
    183 static Int  full_debugs         = 0;
    184 static Int  file_line_debugs    = 0;
    185 static Int  fn_debugs           = 0;
    186 static Int  no_debugs           = 0;
    187 
    188 /*------------------------------------------------------------*/
    189 /*--- String table operations                              ---*/
    190 /*------------------------------------------------------------*/
    191 
    192 static Word stringCmp( const void* key, const void* elem )
    193 {
    194    return VG_(strcmp)(*(const HChar *const *)key, *(const HChar *const *)elem);
    195 }
    196 
    197 // Get a permanent string;  either pull it out of the string table if it's
    198 // been encountered before, or dup it and put it into the string table.
    199 static HChar* get_perm_string(HChar* s)
    200 {
    201    HChar** s_ptr = VG_(OSetGen_Lookup)(stringTable, &s);
    202    if (s_ptr) {
    203       return *s_ptr;
    204    } else {
    205       HChar** s_node = VG_(OSetGen_AllocNode)(stringTable, sizeof(HChar*));
    206       *s_node = VG_(strdup)("cg.main.gps.1", s);
    207       VG_(OSetGen_Insert)(stringTable, s_node);
    208       return *s_node;
    209    }
    210 }
    211 
    212 /*------------------------------------------------------------*/
    213 /*--- CC table operations                                  ---*/
    214 /*------------------------------------------------------------*/
    215 
    216 static void get_debug_info(Addr instr_addr, HChar file[FILE_LEN],
    217                            HChar fn[FN_LEN], UInt* line)
    218 {
    219    HChar dir[FILE_LEN];
    220    Bool found_dirname;
    221    Bool found_file_line = VG_(get_filename_linenum)(
    222                              instr_addr,
    223                              file, FILE_LEN,
    224                              dir,  FILE_LEN, &found_dirname,
    225                              line
    226                           );
    227    Bool found_fn        = VG_(get_fnname)(instr_addr, fn, FN_LEN);
    228 
    229    if (!found_file_line) {
    230       VG_(strcpy)(file, "???");
    231       *line = 0;
    232    }
    233    if (!found_fn) {
    234       VG_(strcpy)(fn,  "???");
    235    }
    236 
    237    if (found_dirname) {
    238       // +1 for the '/'.
    239       tl_assert(VG_(strlen)(dir) + VG_(strlen)(file) + 1 < FILE_LEN);
    240       VG_(strcat)(dir, "/");     // Append '/'
    241       VG_(strcat)(dir, file);    // Append file to dir
    242       VG_(strcpy)(file, dir);    // Move dir+file to file
    243    }
    244 
    245    if (found_file_line) {
    246       if (found_fn) full_debugs++;
    247       else          file_line_debugs++;
    248    } else {
    249       if (found_fn) fn_debugs++;
    250       else          no_debugs++;
    251    }
    252 }
    253 
    254 // Do a three step traversal: by file, then fn, then line.
    255 // Returns a pointer to the line CC, creates a new one if necessary.
    256 static LineCC* get_lineCC(Addr origAddr)
    257 {
    258    HChar   file[FILE_LEN], fn[FN_LEN];
    259    UInt    line;
    260    CodeLoc loc;
    261    LineCC* lineCC;
    262 
    263    get_debug_info(origAddr, file, fn, &line);
    264 
    265    loc.file = file;
    266    loc.fn   = fn;
    267    loc.line = line;
    268 
    269    lineCC = VG_(OSetGen_Lookup)(CC_table, &loc);
    270    if (!lineCC) {
    271       // Allocate and zero a new node.
    272       lineCC           = VG_(OSetGen_AllocNode)(CC_table, sizeof(LineCC));
    273       lineCC->loc.file = get_perm_string(loc.file);
    274       lineCC->loc.fn   = get_perm_string(loc.fn);
    275       lineCC->loc.line = loc.line;
    276       lineCC->Ir.a     = 0;
    277       lineCC->Ir.m1    = 0;
    278       lineCC->Ir.mL    = 0;
    279       lineCC->Dr.a     = 0;
    280       lineCC->Dr.m1    = 0;
    281       lineCC->Dr.mL    = 0;
    282       lineCC->Dw.a     = 0;
    283       lineCC->Dw.m1    = 0;
    284       lineCC->Dw.mL    = 0;
    285       lineCC->Bc.b     = 0;
    286       lineCC->Bc.mp    = 0;
    287       lineCC->Bi.b     = 0;
    288       lineCC->Bi.mp    = 0;
    289       VG_(OSetGen_Insert)(CC_table, lineCC);
    290    }
    291 
    292    return lineCC;
    293 }
    294 
    295 /*------------------------------------------------------------*/
    296 /*--- Cache simulation functions                           ---*/
    297 /*------------------------------------------------------------*/
    298 
/* A common case for an instruction read event is that the
 * bytes read belong to the same cache line in both L1I and LL
 * (if cache line sizes of L1 and LL are the same).
 * As this can be detected at instrumentation time, and results
 * in faster simulation, special-casing is beneficial.
 *
 * Abbreviations used in var/function names:
 *  IrNoX - instruction read does not cross cache lines
 *  IrGen - generic instruction read; not detected as IrNoX
 *  Ir    - not known / not important whether it is an IrNoX
 */
    310 
    311 // Only used with --cache-sim=no.
    312 static VG_REGPARM(1)
    313 void log_1Ir(InstrInfo* n)
    314 {
    315    n->parent->Ir.a++;
    316 }
    317 
    318 // Only used with --cache-sim=no.
    319 static VG_REGPARM(2)
    320 void log_2Ir(InstrInfo* n, InstrInfo* n2)
    321 {
    322    n->parent->Ir.a++;
    323    n2->parent->Ir.a++;
    324 }
    325 
    326 // Only used with --cache-sim=no.
    327 static VG_REGPARM(3)
    328 void log_3Ir(InstrInfo* n, InstrInfo* n2, InstrInfo* n3)
    329 {
    330    n->parent->Ir.a++;
    331    n2->parent->Ir.a++;
    332    n3->parent->Ir.a++;
    333 }
    334 
    335 // Generic case for instruction reads: may cross cache lines.
    336 // All other Ir handlers expect IrNoX instruction reads.
    337 static VG_REGPARM(1)
    338 void log_1IrGen_0D_cache_access(InstrInfo* n)
    339 {
    340    //VG_(printf)("1IrGen_0D :  CCaddr=0x%010lx,  iaddr=0x%010lx,  isize=%lu\n",
    341    //             n, n->instr_addr, n->instr_len);
    342    cachesim_I1_doref_Gen(n->instr_addr, n->instr_len,
    343 			 &n->parent->Ir.m1, &n->parent->Ir.mL);
    344    n->parent->Ir.a++;
    345 }
    346 
    347 static VG_REGPARM(1)
    348 void log_1IrNoX_0D_cache_access(InstrInfo* n)
    349 {
    350    //VG_(printf)("1IrNoX_0D :  CCaddr=0x%010lx,  iaddr=0x%010lx,  isize=%lu\n",
    351    //             n, n->instr_addr, n->instr_len);
    352    cachesim_I1_doref_NoX(n->instr_addr, n->instr_len,
    353 			 &n->parent->Ir.m1, &n->parent->Ir.mL);
    354    n->parent->Ir.a++;
    355 }
    356 
    357 static VG_REGPARM(2)
    358 void log_2IrNoX_0D_cache_access(InstrInfo* n, InstrInfo* n2)
    359 {
    360    //VG_(printf)("2IrNoX_0D : CC1addr=0x%010lx, i1addr=0x%010lx, i1size=%lu\n"
    361    //            "            CC2addr=0x%010lx, i2addr=0x%010lx, i2size=%lu\n",
    362    //            n,  n->instr_addr,  n->instr_len,
    363    //            n2, n2->instr_addr, n2->instr_len);
    364    cachesim_I1_doref_NoX(n->instr_addr, n->instr_len,
    365 			 &n->parent->Ir.m1, &n->parent->Ir.mL);
    366    n->parent->Ir.a++;
    367    cachesim_I1_doref_NoX(n2->instr_addr, n2->instr_len,
    368 			 &n2->parent->Ir.m1, &n2->parent->Ir.mL);
    369    n2->parent->Ir.a++;
    370 }
    371 
    372 static VG_REGPARM(3)
    373 void log_3IrNoX_0D_cache_access(InstrInfo* n, InstrInfo* n2, InstrInfo* n3)
    374 {
    375    //VG_(printf)("3IrNoX_0D : CC1addr=0x%010lx, i1addr=0x%010lx, i1size=%lu\n"
    376    //            "            CC2addr=0x%010lx, i2addr=0x%010lx, i2size=%lu\n"
    377    //            "            CC3addr=0x%010lx, i3addr=0x%010lx, i3size=%lu\n",
    378    //            n,  n->instr_addr,  n->instr_len,
    379    //            n2, n2->instr_addr, n2->instr_len,
    380    //            n3, n3->instr_addr, n3->instr_len);
    381    cachesim_I1_doref_NoX(n->instr_addr, n->instr_len,
    382 			 &n->parent->Ir.m1, &n->parent->Ir.mL);
    383    n->parent->Ir.a++;
    384    cachesim_I1_doref_NoX(n2->instr_addr, n2->instr_len,
    385 			 &n2->parent->Ir.m1, &n2->parent->Ir.mL);
    386    n2->parent->Ir.a++;
    387    cachesim_I1_doref_NoX(n3->instr_addr, n3->instr_len,
    388 			 &n3->parent->Ir.m1, &n3->parent->Ir.mL);
    389    n3->parent->Ir.a++;
    390 }
    391 
    392 static VG_REGPARM(3)
    393 void log_1IrNoX_1Dr_cache_access(InstrInfo* n, Addr data_addr, Word data_size)
    394 {
    395    //VG_(printf)("1IrNoX_1Dr:  CCaddr=0x%010lx,  iaddr=0x%010lx,  isize=%lu\n"
    396    //            "                               daddr=0x%010lx,  dsize=%lu\n",
    397    //            n, n->instr_addr, n->instr_len, data_addr, data_size);
    398    cachesim_I1_doref_NoX(n->instr_addr, n->instr_len,
    399 			 &n->parent->Ir.m1, &n->parent->Ir.mL);
    400    n->parent->Ir.a++;
    401 
    402    cachesim_D1_doref(data_addr, data_size,
    403                      &n->parent->Dr.m1, &n->parent->Dr.mL);
    404    n->parent->Dr.a++;
    405 }
    406 
    407 static VG_REGPARM(3)
    408 void log_1IrNoX_1Dw_cache_access(InstrInfo* n, Addr data_addr, Word data_size)
    409 {
    410    //VG_(printf)("1IrNoX_1Dw:  CCaddr=0x%010lx,  iaddr=0x%010lx,  isize=%lu\n"
    411    //            "                               daddr=0x%010lx,  dsize=%lu\n",
    412    //            n, n->instr_addr, n->instr_len, data_addr, data_size);
    413    cachesim_I1_doref_NoX(n->instr_addr, n->instr_len,
    414 			 &n->parent->Ir.m1, &n->parent->Ir.mL);
    415    n->parent->Ir.a++;
    416 
    417    cachesim_D1_doref(data_addr, data_size,
    418                      &n->parent->Dw.m1, &n->parent->Dw.mL);
    419    n->parent->Dw.a++;
    420 }
    421 
    422 /* Note that addEvent_D_guarded assumes that log_0Ir_1Dr_cache_access
    423    and log_0Ir_1Dw_cache_access have exactly the same prototype.  If
    424    you change them, you must change addEvent_D_guarded too. */
    425 static VG_REGPARM(3)
    426 void log_0Ir_1Dr_cache_access(InstrInfo* n, Addr data_addr, Word data_size)
    427 {
    428    //VG_(printf)("0Ir_1Dr:  CCaddr=0x%010lx,  daddr=0x%010lx,  dsize=%lu\n",
    429    //            n, data_addr, data_size);
    430    cachesim_D1_doref(data_addr, data_size,
    431                      &n->parent->Dr.m1, &n->parent->Dr.mL);
    432    n->parent->Dr.a++;
    433 }
    434 
    435 /* See comment on log_0Ir_1Dr_cache_access. */
    436 static VG_REGPARM(3)
    437 void log_0Ir_1Dw_cache_access(InstrInfo* n, Addr data_addr, Word data_size)
    438 {
    439    //VG_(printf)("0Ir_1Dw:  CCaddr=0x%010lx,  daddr=0x%010lx,  dsize=%lu\n",
    440    //            n, data_addr, data_size);
    441    cachesim_D1_doref(data_addr, data_size,
    442                      &n->parent->Dw.m1, &n->parent->Dw.mL);
    443    n->parent->Dw.a++;
    444 }
    445 
    446 /* For branches, we consult two different predictors, one which
    447    predicts taken/untaken for conditional branches, and the other
    448    which predicts the branch target address for indirect branches
    449    (jump-to-register style ones). */
    450 
    451 static VG_REGPARM(2)
    452 void log_cond_branch(InstrInfo* n, Word taken)
    453 {
    454    //VG_(printf)("cbrnch:  CCaddr=0x%010lx,  taken=0x%010lx\n",
    455    //             n, taken);
    456    n->parent->Bc.b++;
    457    n->parent->Bc.mp
    458       += (1 & do_cond_branch_predict(n->instr_addr, taken));
    459 }
    460 
    461 static VG_REGPARM(2)
    462 void log_ind_branch(InstrInfo* n, UWord actual_dst)
    463 {
    464    //VG_(printf)("ibrnch:  CCaddr=0x%010lx,    dst=0x%010lx\n",
    465    //             n, actual_dst);
    466    n->parent->Bi.b++;
    467    n->parent->Bi.mp
    468       += (1 & do_ind_branch_predict(n->instr_addr, actual_dst));
    469 }
    470 
    471 
    472 /*------------------------------------------------------------*/
    473 /*--- Instrumentation types and structures                 ---*/
    474 /*------------------------------------------------------------*/
    475 
    476 /* Maintain an ordered list of memory events which are outstanding, in
    477    the sense that no IR has yet been generated to do the relevant
    478    helper calls.  The BB is scanned top to bottom and memory events
    479    are added to the end of the list, merging with the most recent
    480    notified event where possible (Dw immediately following Dr and
    481    having the same size and EA can be merged).
    482 
    483    This merging is done so that for architectures which have
    484    load-op-store instructions (x86, amd64), the insn is treated as if
    485    it makes just one memory reference (a modify), rather than two (a
    486    read followed by a write at the same address).
    487 
    488    At various points the list will need to be flushed, that is, IR
    489    generated from it.  That must happen before any possible exit from
    490    the block (the end, or an IRStmt_Exit).  Flushing also takes place
    491    when there is no space to add a new event.
    492 
    493    If we require the simulation statistics to be up to date with
    494    respect to possible memory exceptions, then the list would have to
    495    be flushed before each memory reference.  That would however lose
    496    performance by inhibiting event-merging during flushing.
    497 
    498    Flushing the list consists of walking it start to end and emitting
    499    instrumentation IR for each event, in the order in which they
    500    appear.  It may be possible to emit a single call for two adjacent
    501    events in order to reduce the number of helper function calls made.
    502    For example, it could well be profitable to handle two adjacent Ir
    503    events with a single helper call.  */
    504 
    505 typedef
    506    IRExpr
    507    IRAtom;
    508 
    509 typedef
    510    enum {
    511       Ev_IrNoX,  // Instruction read not crossing cache lines
    512       Ev_IrGen,  // Generic Ir, not being detected as IrNoX
    513       Ev_Dr,     // Data read
    514       Ev_Dw,     // Data write
    515       Ev_Dm,     // Data modify (read then write)
    516       Ev_Bc,     // branch conditional
    517       Ev_Bi      // branch indirect (to unknown destination)
    518    }
    519    EventTag;
    520 
    521 typedef
    522    struct {
    523       EventTag   tag;
    524       InstrInfo* inode;
    525       union {
    526          struct {
    527          } IrGen;
    528          struct {
    529          } IrNoX;
    530          struct {
    531             IRAtom* ea;
    532             Int     szB;
    533          } Dr;
    534          struct {
    535             IRAtom* ea;
    536             Int     szB;
    537          } Dw;
    538          struct {
    539             IRAtom* ea;
    540             Int     szB;
    541          } Dm;
    542          struct {
    543             IRAtom* taken; /* :: Ity_I1 */
    544          } Bc;
    545          struct {
    546             IRAtom* dst;
    547          } Bi;
    548       } Ev;
    549    }
    550    Event;
    551 
    552 static void init_Event ( Event* ev ) {
    553    VG_(memset)(ev, 0, sizeof(Event));
    554 }
    555 
    556 static IRAtom* get_Event_dea ( Event* ev ) {
    557    switch (ev->tag) {
    558       case Ev_Dr: return ev->Ev.Dr.ea;
    559       case Ev_Dw: return ev->Ev.Dw.ea;
    560       case Ev_Dm: return ev->Ev.Dm.ea;
    561       default:    tl_assert(0);
    562    }
    563 }
    564 
    565 static Int get_Event_dszB ( Event* ev ) {
    566    switch (ev->tag) {
    567       case Ev_Dr: return ev->Ev.Dr.szB;
    568       case Ev_Dw: return ev->Ev.Dw.szB;
    569       case Ev_Dm: return ev->Ev.Dm.szB;
    570       default:    tl_assert(0);
    571    }
    572 }
    573 
    574 
    575 /* Up to this many unnotified events are allowed.  Number is
    576    arbitrary.  Larger numbers allow more event merging to occur, but
    577    potentially induce more spilling due to extending live ranges of
    578    address temporaries. */
    579 #define N_EVENTS 16
    580 
    581 
    582 /* A struct which holds all the running state during instrumentation.
    583    Mostly to avoid passing loads of parameters everywhere. */
    584 typedef
    585    struct {
    586       /* The current outstanding-memory-event list. */
    587       Event events[N_EVENTS];
    588       Int   events_used;
    589 
    590       /* The array of InstrInfo bins for the BB. */
    591       SB_info* sbInfo;
    592 
    593       /* Number InstrInfo bins 'used' so far. */
    594       Int sbInfo_i;
    595 
    596       /* The output SB being constructed. */
    597       IRSB* sbOut;
    598    }
    599    CgState;
    600 
    601 
    602 /*------------------------------------------------------------*/
    603 /*--- Instrumentation main                                 ---*/
    604 /*------------------------------------------------------------*/
    605 
    606 // Note that origAddr is the real origAddr, not the address of the first
    607 // instruction in the block (they can be different due to redirection).
    608 static
    609 SB_info* get_SB_info(IRSB* sbIn, Addr origAddr)
    610 {
    611    Int      i, n_instrs;
    612    IRStmt*  st;
    613    SB_info* sbInfo;
    614 
    615    // Count number of original instrs in SB
    616    n_instrs = 0;
    617    for (i = 0; i < sbIn->stmts_used; i++) {
    618       st = sbIn->stmts[i];
    619       if (Ist_IMark == st->tag) n_instrs++;
    620    }
    621 
    622    // Check that we don't have an entry for this BB in the instr-info table.
    623    // If this assertion fails, there has been some screwup:  some
    624    // translations must have been discarded but Cachegrind hasn't discarded
    625    // the corresponding entries in the instr-info table.
    626    sbInfo = VG_(OSetGen_Lookup)(instrInfoTable, &origAddr);
    627    tl_assert(NULL == sbInfo);
    628 
    629    // BB never translated before (at this address, at least;  could have
    630    // been unloaded and then reloaded elsewhere in memory)
    631    sbInfo = VG_(OSetGen_AllocNode)(instrInfoTable,
    632                                 sizeof(SB_info) + n_instrs*sizeof(InstrInfo));
    633    sbInfo->SB_addr  = origAddr;
    634    sbInfo->n_instrs = n_instrs;
    635    VG_(OSetGen_Insert)( instrInfoTable, sbInfo );
    636 
    637    return sbInfo;
    638 }
    639 
    640 
    641 static void showEvent ( Event* ev )
    642 {
    643    switch (ev->tag) {
    644       case Ev_IrGen:
    645          VG_(printf)("IrGen %p\n", ev->inode);
    646          break;
    647       case Ev_IrNoX:
    648          VG_(printf)("IrNoX %p\n", ev->inode);
    649          break;
    650       case Ev_Dr:
    651          VG_(printf)("Dr %p %d EA=", ev->inode, ev->Ev.Dr.szB);
    652          ppIRExpr(ev->Ev.Dr.ea);
    653          VG_(printf)("\n");
    654          break;
    655       case Ev_Dw:
    656          VG_(printf)("Dw %p %d EA=", ev->inode, ev->Ev.Dw.szB);
    657          ppIRExpr(ev->Ev.Dw.ea);
    658          VG_(printf)("\n");
    659          break;
    660       case Ev_Dm:
    661          VG_(printf)("Dm %p %d EA=", ev->inode, ev->Ev.Dm.szB);
    662          ppIRExpr(ev->Ev.Dm.ea);
    663          VG_(printf)("\n");
    664          break;
    665       case Ev_Bc:
    666          VG_(printf)("Bc %p   GA=", ev->inode);
    667          ppIRExpr(ev->Ev.Bc.taken);
    668          VG_(printf)("\n");
    669          break;
    670       case Ev_Bi:
    671          VG_(printf)("Bi %p  DST=", ev->inode);
    672          ppIRExpr(ev->Ev.Bi.dst);
    673          VG_(printf)("\n");
    674          break;
    675       default:
    676          tl_assert(0);
    677          break;
    678    }
    679 }
    680 
    681 // Reserve and initialise an InstrInfo for the first mention of a new insn.
    682 static
    683 InstrInfo* setup_InstrInfo ( CgState* cgs, Addr instr_addr, UInt instr_len )
    684 {
    685    InstrInfo* i_node;
    686    tl_assert(cgs->sbInfo_i >= 0);
    687    tl_assert(cgs->sbInfo_i < cgs->sbInfo->n_instrs);
    688    i_node = &cgs->sbInfo->instrs[ cgs->sbInfo_i ];
    689    i_node->instr_addr = instr_addr;
    690    i_node->instr_len  = instr_len;
    691    i_node->parent     = get_lineCC(instr_addr);
    692    cgs->sbInfo_i++;
    693    return i_node;
    694 }
    695 
    696 
/* Generate code for all outstanding memory events, and mark the queue
   empty.  Code is generated into cgs->sbOut, and this activity
   'consumes' slots in cgs->sbInfo. */
    700 
    701 static void flushEvents ( CgState* cgs )
    702 {
    703    Int        i, regparms;
    704    const HChar* helperName;
    705    void*      helperAddr;
    706    IRExpr**   argv;
    707    IRExpr*    i_node_expr;
    708    IRDirty*   di;
    709    Event*     ev;
    710    Event*     ev2;
    711    Event*     ev3;
    712 
    713    i = 0;
    714    while (i < cgs->events_used) {
    715 
    716       helperName = NULL;
    717       helperAddr = NULL;
    718       argv       = NULL;
    719       regparms   = 0;
    720 
    721       /* generate IR to notify event i and possibly the ones
    722          immediately following it. */
    723       tl_assert(i >= 0 && i < cgs->events_used);
    724 
    725       ev  = &cgs->events[i];
    726       ev2 = ( i < cgs->events_used-1 ? &cgs->events[i+1] : NULL );
    727       ev3 = ( i < cgs->events_used-2 ? &cgs->events[i+2] : NULL );
    728 
    729       if (DEBUG_CG) {
    730          VG_(printf)("   flush ");
    731          showEvent( ev );
    732       }
    733 
    734       i_node_expr = mkIRExpr_HWord( (HWord)ev->inode );
    735 
    736       /* Decide on helper fn to call and args to pass it, and advance
    737          i appropriately. */
    738       switch (ev->tag) {
    739          case Ev_IrNoX:
    740             /* Merge an IrNoX with a following Dr/Dm. */
    741             if (ev2 && (ev2->tag == Ev_Dr || ev2->tag == Ev_Dm)) {
    742                /* Why is this true?  It's because we're merging an Ir
    743                   with a following Dr or Dm.  The Ir derives from the
    744                   instruction's IMark and the Dr/Dm from data
    745                   references which follow it.  In short it holds
    746                   because each insn starts with an IMark, hence an
    747                   Ev_Ir, and so these Dr/Dm must pertain to the
    748                   immediately preceding Ir.  Same applies to analogous
    749                   assertions in the subsequent cases. */
    750                tl_assert(ev2->inode == ev->inode);
    751                helperName = "log_1IrNoX_1Dr_cache_access";
    752                helperAddr = &log_1IrNoX_1Dr_cache_access;
    753                argv = mkIRExprVec_3( i_node_expr,
    754                                      get_Event_dea(ev2),
    755                                      mkIRExpr_HWord( get_Event_dszB(ev2) ) );
    756                regparms = 3;
    757                i += 2;
    758             }
    759             /* Merge an IrNoX with a following Dw. */
    760             else
    761             if (ev2 && ev2->tag == Ev_Dw) {
    762                tl_assert(ev2->inode == ev->inode);
    763                helperName = "log_1IrNoX_1Dw_cache_access";
    764                helperAddr = &log_1IrNoX_1Dw_cache_access;
    765                argv = mkIRExprVec_3( i_node_expr,
    766                                      get_Event_dea(ev2),
    767                                      mkIRExpr_HWord( get_Event_dszB(ev2) ) );
    768                regparms = 3;
    769                i += 2;
    770             }
    771             /* Merge an IrNoX with two following IrNoX's. */
    772             else
    773             if (ev2 && ev3 && ev2->tag == Ev_IrNoX && ev3->tag == Ev_IrNoX)
    774             {
    775                if (clo_cache_sim) {
    776                   helperName = "log_3IrNoX_0D_cache_access";
    777                   helperAddr = &log_3IrNoX_0D_cache_access;
    778                } else {
    779                   helperName = "log_3Ir";
    780                   helperAddr = &log_3Ir;
    781                }
    782                argv = mkIRExprVec_3( i_node_expr,
    783                                      mkIRExpr_HWord( (HWord)ev2->inode ),
    784                                      mkIRExpr_HWord( (HWord)ev3->inode ) );
    785                regparms = 3;
    786                i += 3;
    787             }
    788             /* Merge an IrNoX with one following IrNoX. */
    789             else
    790             if (ev2 && ev2->tag == Ev_IrNoX) {
    791                if (clo_cache_sim) {
    792                   helperName = "log_2IrNoX_0D_cache_access";
    793                   helperAddr = &log_2IrNoX_0D_cache_access;
    794                } else {
    795                   helperName = "log_2Ir";
    796                   helperAddr = &log_2Ir;
    797                }
    798                argv = mkIRExprVec_2( i_node_expr,
    799                                      mkIRExpr_HWord( (HWord)ev2->inode ) );
    800                regparms = 2;
    801                i += 2;
    802             }
    803             /* No merging possible; emit as-is. */
    804             else {
    805                if (clo_cache_sim) {
    806                   helperName = "log_1IrNoX_0D_cache_access";
    807                   helperAddr = &log_1IrNoX_0D_cache_access;
    808                } else {
    809                   helperName = "log_1Ir";
    810                   helperAddr = &log_1Ir;
    811                }
    812                argv = mkIRExprVec_1( i_node_expr );
    813                regparms = 1;
    814                i++;
    815             }
    816             break;
    817          case Ev_IrGen:
    818             if (clo_cache_sim) {
    819 	       helperName = "log_1IrGen_0D_cache_access";
    820 	       helperAddr = &log_1IrGen_0D_cache_access;
    821 	    } else {
    822 	       helperName = "log_1Ir";
    823 	       helperAddr = &log_1Ir;
    824 	    }
    825 	    argv = mkIRExprVec_1( i_node_expr );
    826 	    regparms = 1;
    827 	    i++;
    828             break;
    829          case Ev_Dr:
    830          case Ev_Dm:
    831             /* Data read or modify */
    832             helperName = "log_0Ir_1Dr_cache_access";
    833             helperAddr = &log_0Ir_1Dr_cache_access;
    834             argv = mkIRExprVec_3( i_node_expr,
    835                                   get_Event_dea(ev),
    836                                   mkIRExpr_HWord( get_Event_dszB(ev) ) );
    837             regparms = 3;
    838             i++;
    839             break;
    840          case Ev_Dw:
    841             /* Data write */
    842             helperName = "log_0Ir_1Dw_cache_access";
    843             helperAddr = &log_0Ir_1Dw_cache_access;
    844             argv = mkIRExprVec_3( i_node_expr,
    845                                   get_Event_dea(ev),
    846                                   mkIRExpr_HWord( get_Event_dszB(ev) ) );
    847             regparms = 3;
    848             i++;
    849             break;
    850          case Ev_Bc:
    851             /* Conditional branch */
    852             helperName = "log_cond_branch";
    853             helperAddr = &log_cond_branch;
    854             argv = mkIRExprVec_2( i_node_expr, ev->Ev.Bc.taken );
    855             regparms = 2;
    856             i++;
    857             break;
    858          case Ev_Bi:
    859             /* Branch to an unknown destination */
    860             helperName = "log_ind_branch";
    861             helperAddr = &log_ind_branch;
    862             argv = mkIRExprVec_2( i_node_expr, ev->Ev.Bi.dst );
    863             regparms = 2;
    864             i++;
    865             break;
    866          default:
    867             tl_assert(0);
    868       }
    869 
    870       /* Add the helper. */
    871       tl_assert(helperName);
    872       tl_assert(helperAddr);
    873       tl_assert(argv);
    874       di = unsafeIRDirty_0_N( regparms,
    875                               helperName, VG_(fnptr_to_fnentry)( helperAddr ),
    876                               argv );
    877       addStmtToIRSB( cgs->sbOut, IRStmt_Dirty(di) );
    878    }
    879 
    880    cgs->events_used = 0;
    881 }
    882 
    883 static void addEvent_Ir ( CgState* cgs, InstrInfo* inode )
    884 {
    885    Event* evt;
    886    if (cgs->events_used == N_EVENTS)
    887       flushEvents(cgs);
    888    tl_assert(cgs->events_used >= 0 && cgs->events_used < N_EVENTS);
    889    evt = &cgs->events[cgs->events_used];
    890    init_Event(evt);
    891    evt->inode    = inode;
    892    if (cachesim_is_IrNoX(inode->instr_addr, inode->instr_len)) {
    893       evt->tag = Ev_IrNoX;
    894       distinct_instrsNoX++;
    895    } else {
    896       evt->tag = Ev_IrGen;
    897       distinct_instrsGen++;
    898    }
    899    cgs->events_used++;
    900 }
    901 
    902 static
    903 void addEvent_Dr ( CgState* cgs, InstrInfo* inode, Int datasize, IRAtom* ea )
    904 {
    905    Event* evt;
    906    tl_assert(isIRAtom(ea));
    907    tl_assert(datasize >= 1 && datasize <= min_line_size);
    908    if (!clo_cache_sim)
    909       return;
    910    if (cgs->events_used == N_EVENTS)
    911       flushEvents(cgs);
    912    tl_assert(cgs->events_used >= 0 && cgs->events_used < N_EVENTS);
    913    evt = &cgs->events[cgs->events_used];
    914    init_Event(evt);
    915    evt->tag       = Ev_Dr;
    916    evt->inode     = inode;
    917    evt->Ev.Dr.szB = datasize;
    918    evt->Ev.Dr.ea  = ea;
    919    cgs->events_used++;
    920 }
    921 
    922 static
    923 void addEvent_Dw ( CgState* cgs, InstrInfo* inode, Int datasize, IRAtom* ea )
    924 {
    925    Event* lastEvt;
    926    Event* evt;
    927 
    928    tl_assert(isIRAtom(ea));
    929    tl_assert(datasize >= 1 && datasize <= min_line_size);
    930 
    931    if (!clo_cache_sim)
    932       return;
    933 
    934    /* Is it possible to merge this write with the preceding read? */
    935    lastEvt = &cgs->events[cgs->events_used-1];
    936    if (cgs->events_used > 0
    937        && lastEvt->tag       == Ev_Dr
    938        && lastEvt->Ev.Dr.szB == datasize
    939        && lastEvt->inode     == inode
    940        && eqIRAtom(lastEvt->Ev.Dr.ea, ea))
    941    {
    942       lastEvt->tag   = Ev_Dm;
    943       return;
    944    }
    945 
    946    /* No.  Add as normal. */
    947    if (cgs->events_used == N_EVENTS)
    948       flushEvents(cgs);
    949    tl_assert(cgs->events_used >= 0 && cgs->events_used < N_EVENTS);
    950    evt = &cgs->events[cgs->events_used];
    951    init_Event(evt);
    952    evt->tag       = Ev_Dw;
    953    evt->inode     = inode;
    954    evt->Ev.Dw.szB = datasize;
    955    evt->Ev.Dw.ea  = ea;
    956    cgs->events_used++;
    957 }
    958 
    959 static
    960 void addEvent_D_guarded ( CgState* cgs, InstrInfo* inode,
    961                           Int datasize, IRAtom* ea, IRAtom* guard,
    962                           Bool isWrite )
    963 {
    964    tl_assert(isIRAtom(ea));
    965    tl_assert(guard);
    966    tl_assert(isIRAtom(guard));
    967    tl_assert(datasize >= 1 && datasize <= min_line_size);
    968 
    969    if (!clo_cache_sim)
    970       return;
    971 
    972    /* Adding guarded memory actions and merging them with the existing
    973       queue is too complex.  Simply flush the queue and add this
    974       action immediately.  Since guarded loads and stores are pretty
    975       rare, this is not thought likely to cause any noticeable
    976       performance loss as a result of the loss of event-merging
    977       opportunities. */
    978    tl_assert(cgs->events_used >= 0);
    979    flushEvents(cgs);
    980    tl_assert(cgs->events_used == 0);
    981    /* Same as case Ev_Dw / case Ev_Dr in flushEvents, except with guard */
    982    IRExpr*      i_node_expr;
    983    const HChar* helperName;
    984    void*        helperAddr;
    985    IRExpr**     argv;
    986    Int          regparms;
    987    IRDirty*     di;
    988    i_node_expr = mkIRExpr_HWord( (HWord)inode );
    989    helperName  = isWrite ? "log_0Ir_1Dw_cache_access"
    990                          : "log_0Ir_1Dr_cache_access";
    991    helperAddr  = isWrite ? &log_0Ir_1Dw_cache_access
    992                          : &log_0Ir_1Dr_cache_access;
    993    argv        = mkIRExprVec_3( i_node_expr,
    994                                 ea, mkIRExpr_HWord( datasize ) );
    995    regparms    = 3;
    996    di          = unsafeIRDirty_0_N(
    997                     regparms,
    998                     helperName, VG_(fnptr_to_fnentry)( helperAddr ),
    999                     argv );
   1000    di->guard = guard;
   1001    addStmtToIRSB( cgs->sbOut, IRStmt_Dirty(di) );
   1002 }
   1003 
   1004 
   1005 static
   1006 void addEvent_Bc ( CgState* cgs, InstrInfo* inode, IRAtom* guard )
   1007 {
   1008    Event* evt;
   1009    tl_assert(isIRAtom(guard));
   1010    tl_assert(typeOfIRExpr(cgs->sbOut->tyenv, guard)
   1011              == (sizeof(HWord)==4 ? Ity_I32 : Ity_I64));
   1012    if (!clo_branch_sim)
   1013       return;
   1014    if (cgs->events_used == N_EVENTS)
   1015       flushEvents(cgs);
   1016    tl_assert(cgs->events_used >= 0 && cgs->events_used < N_EVENTS);
   1017    evt = &cgs->events[cgs->events_used];
   1018    init_Event(evt);
   1019    evt->tag         = Ev_Bc;
   1020    evt->inode       = inode;
   1021    evt->Ev.Bc.taken = guard;
   1022    cgs->events_used++;
   1023 }
   1024 
   1025 static
   1026 void addEvent_Bi ( CgState* cgs, InstrInfo* inode, IRAtom* whereTo )
   1027 {
   1028    Event* evt;
   1029    tl_assert(isIRAtom(whereTo));
   1030    tl_assert(typeOfIRExpr(cgs->sbOut->tyenv, whereTo)
   1031              == (sizeof(HWord)==4 ? Ity_I32 : Ity_I64));
   1032    if (!clo_branch_sim)
   1033       return;
   1034    if (cgs->events_used == N_EVENTS)
   1035       flushEvents(cgs);
   1036    tl_assert(cgs->events_used >= 0 && cgs->events_used < N_EVENTS);
   1037    evt = &cgs->events[cgs->events_used];
   1038    init_Event(evt);
   1039    evt->tag       = Ev_Bi;
   1040    evt->inode     = inode;
   1041    evt->Ev.Bi.dst = whereTo;
   1042    cgs->events_used++;
   1043 }
   1044 
   1045 ////////////////////////////////////////////////////////////
   1046 
   1047 
   1048 static
   1049 IRSB* cg_instrument ( VgCallbackClosure* closure,
   1050                       IRSB* sbIn,
   1051                       VexGuestLayout* layout,
   1052                       VexGuestExtents* vge,
   1053                       VexArchInfo* archinfo_host,
   1054                       IRType gWordTy, IRType hWordTy )
   1055 {
   1056    Int        i, isize;
   1057    IRStmt*    st;
   1058    Addr64     cia; /* address of current insn */
   1059    CgState    cgs;
   1060    IRTypeEnv* tyenv = sbIn->tyenv;
   1061    InstrInfo* curr_inode = NULL;
   1062 
   1063    if (gWordTy != hWordTy) {
   1064       /* We don't currently support this case. */
   1065       VG_(tool_panic)("host/guest word size mismatch");
   1066    }
   1067 
   1068    // Set up new SB
   1069    cgs.sbOut = deepCopyIRSBExceptStmts(sbIn);
   1070 
   1071    // Copy verbatim any IR preamble preceding the first IMark
   1072    i = 0;
   1073    while (i < sbIn->stmts_used && sbIn->stmts[i]->tag != Ist_IMark) {
   1074       addStmtToIRSB( cgs.sbOut, sbIn->stmts[i] );
   1075       i++;
   1076    }
   1077 
   1078    // Get the first statement, and initial cia from it
   1079    tl_assert(sbIn->stmts_used > 0);
   1080    tl_assert(i < sbIn->stmts_used);
   1081    st = sbIn->stmts[i];
   1082    tl_assert(Ist_IMark == st->tag);
   1083 
   1084    cia   = st->Ist.IMark.addr;
   1085    isize = st->Ist.IMark.len;
   1086    // If Vex fails to decode an instruction, the size will be zero.
   1087    // Pretend otherwise.
   1088    if (isize == 0) isize = VG_MIN_INSTR_SZB;
   1089 
   1090    // Set up running state and get block info
   1091    tl_assert(closure->readdr == vge->base[0]);
   1092    cgs.events_used = 0;
   1093    cgs.sbInfo      = get_SB_info(sbIn, (Addr)closure->readdr);
   1094    cgs.sbInfo_i    = 0;
   1095 
   1096    if (DEBUG_CG)
   1097       VG_(printf)("\n\n---------- cg_instrument ----------\n");
   1098 
   1099    // Traverse the block, initialising inodes, adding events and flushing as
   1100    // necessary.
   1101    for (/*use current i*/; i < sbIn->stmts_used; i++) {
   1102 
   1103       st = sbIn->stmts[i];
   1104       tl_assert(isFlatIRStmt(st));
   1105 
   1106       switch (st->tag) {
   1107          case Ist_NoOp:
   1108          case Ist_AbiHint:
   1109          case Ist_Put:
   1110          case Ist_PutI:
   1111          case Ist_MBE:
   1112             break;
   1113 
   1114          case Ist_IMark:
   1115             cia   = st->Ist.IMark.addr;
   1116             isize = st->Ist.IMark.len;
   1117 
   1118             // If Vex fails to decode an instruction, the size will be zero.
   1119             // Pretend otherwise.
   1120             if (isize == 0) isize = VG_MIN_INSTR_SZB;
   1121 
   1122             // Sanity-check size.
   1123             tl_assert( (VG_MIN_INSTR_SZB <= isize && isize <= VG_MAX_INSTR_SZB)
   1124                      || VG_CLREQ_SZB == isize );
   1125 
   1126             // Get space for and init the inode, record it as the current one.
   1127             // Subsequent Dr/Dw/Dm events from the same instruction will
   1128             // also use it.
   1129             curr_inode = setup_InstrInfo(&cgs, cia, isize);
   1130 
   1131             addEvent_Ir( &cgs, curr_inode );
   1132             break;
   1133 
   1134          case Ist_WrTmp: {
   1135             IRExpr* data = st->Ist.WrTmp.data;
   1136             if (data->tag == Iex_Load) {
   1137                IRExpr* aexpr = data->Iex.Load.addr;
   1138                // Note also, endianness info is ignored.  I guess
   1139                // that's not interesting.
   1140                addEvent_Dr( &cgs, curr_inode, sizeofIRType(data->Iex.Load.ty),
   1141                                   aexpr );
   1142             }
   1143             break;
   1144          }
   1145 
   1146          case Ist_Store: {
   1147             IRExpr* data  = st->Ist.Store.data;
   1148             IRExpr* aexpr = st->Ist.Store.addr;
   1149             addEvent_Dw( &cgs, curr_inode,
   1150                          sizeofIRType(typeOfIRExpr(tyenv, data)), aexpr );
   1151             break;
   1152          }
   1153 
   1154          case Ist_StoreG: {
   1155             IRStoreG* sg   = st->Ist.StoreG.details;
   1156             IRExpr*   data = sg->data;
   1157             IRExpr*   addr = sg->addr;
   1158             IRType    type = typeOfIRExpr(tyenv, data);
   1159             tl_assert(type != Ity_INVALID);
   1160             addEvent_D_guarded( &cgs, curr_inode,
   1161                                 sizeofIRType(type), addr, sg->guard,
   1162                                 True/*isWrite*/ );
   1163             break;
   1164          }
   1165 
   1166          case Ist_LoadG: {
   1167             IRLoadG* lg       = st->Ist.LoadG.details;
   1168             IRType   type     = Ity_INVALID; /* loaded type */
   1169             IRType   typeWide = Ity_INVALID; /* after implicit widening */
   1170             IRExpr*  addr     = lg->addr;
   1171             typeOfIRLoadGOp(lg->cvt, &typeWide, &type);
   1172             tl_assert(type != Ity_INVALID);
   1173             addEvent_D_guarded( &cgs, curr_inode,
   1174                                 sizeofIRType(type), addr, lg->guard,
   1175                                 False/*!isWrite*/ );
   1176             break;
   1177          }
   1178 
   1179          case Ist_Dirty: {
   1180             Int      dataSize;
   1181             IRDirty* d = st->Ist.Dirty.details;
   1182             if (d->mFx != Ifx_None) {
   1183                /* This dirty helper accesses memory.  Collect the details. */
   1184                tl_assert(d->mAddr != NULL);
   1185                tl_assert(d->mSize != 0);
   1186                dataSize = d->mSize;
   1187                // Large (eg. 28B, 108B, 512B on x86) data-sized
   1188                // instructions will be done inaccurately, but they're
   1189                // very rare and this avoids errors from hitting more
   1190                // than two cache lines in the simulation.
   1191                if (dataSize > min_line_size)
   1192                   dataSize = min_line_size;
   1193                if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify)
   1194                   addEvent_Dr( &cgs, curr_inode, dataSize, d->mAddr );
   1195                if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify)
   1196                   addEvent_Dw( &cgs, curr_inode, dataSize, d->mAddr );
   1197             } else {
   1198                tl_assert(d->mAddr == NULL);
   1199                tl_assert(d->mSize == 0);
   1200             }
   1201             break;
   1202          }
   1203 
   1204          case Ist_CAS: {
   1205             /* We treat it as a read and a write of the location.  I
   1206                think that is the same behaviour as it was before IRCAS
   1207                was introduced, since prior to that point, the Vex
   1208                front ends would translate a lock-prefixed instruction
   1209                into a (normal) read followed by a (normal) write. */
   1210             Int    dataSize;
   1211             IRCAS* cas = st->Ist.CAS.details;
   1212             tl_assert(cas->addr != NULL);
   1213             tl_assert(cas->dataLo != NULL);
   1214             dataSize = sizeofIRType(typeOfIRExpr(tyenv, cas->dataLo));
   1215             if (cas->dataHi != NULL)
   1216                dataSize *= 2; /* since it's a doubleword-CAS */
   1217             /* I don't think this can ever happen, but play safe. */
   1218             if (dataSize > min_line_size)
   1219                dataSize = min_line_size;
   1220             addEvent_Dr( &cgs, curr_inode, dataSize, cas->addr );
   1221             addEvent_Dw( &cgs, curr_inode, dataSize, cas->addr );
   1222             break;
   1223          }
   1224 
   1225          case Ist_LLSC: {
   1226             IRType dataTy;
   1227             if (st->Ist.LLSC.storedata == NULL) {
   1228                /* LL */
   1229                dataTy = typeOfIRTemp(tyenv, st->Ist.LLSC.result);
   1230                addEvent_Dr( &cgs, curr_inode,
   1231                             sizeofIRType(dataTy), st->Ist.LLSC.addr );
   1232                /* flush events before LL, should help SC to succeed */
   1233                flushEvents( &cgs );
   1234             } else {
   1235                /* SC */
   1236                dataTy = typeOfIRExpr(tyenv, st->Ist.LLSC.storedata);
   1237                addEvent_Dw( &cgs, curr_inode,
   1238                             sizeofIRType(dataTy), st->Ist.LLSC.addr );
   1239             }
   1240             break;
   1241          }
   1242 
   1243          case Ist_Exit: {
   1244             // call branch predictor only if this is a branch in guest code
   1245             if ( (st->Ist.Exit.jk == Ijk_Boring) ||
   1246                  (st->Ist.Exit.jk == Ijk_Call) ||
   1247                  (st->Ist.Exit.jk == Ijk_Ret) )
   1248             {
   1249                /* Stuff to widen the guard expression to a host word, so
   1250                   we can pass it to the branch predictor simulation
   1251                   functions easily. */
   1252                Bool     inverted;
   1253                Addr64   nia, sea;
   1254                IRConst* dst;
   1255                IRType   tyW    = hWordTy;
   1256                IROp     widen  = tyW==Ity_I32  ? Iop_1Uto32  : Iop_1Uto64;
   1257                IROp     opXOR  = tyW==Ity_I32  ? Iop_Xor32   : Iop_Xor64;
   1258                IRTemp   guard1 = newIRTemp(cgs.sbOut->tyenv, Ity_I1);
   1259                IRTemp   guardW = newIRTemp(cgs.sbOut->tyenv, tyW);
   1260                IRTemp   guard  = newIRTemp(cgs.sbOut->tyenv, tyW);
   1261                IRExpr*  one    = tyW==Ity_I32 ? IRExpr_Const(IRConst_U32(1))
   1262                                               : IRExpr_Const(IRConst_U64(1));
   1263 
   1264                /* First we need to figure out whether the side exit got
   1265                   inverted by the ir optimiser.  To do that, figure out
   1266                   the next (fallthrough) instruction's address and the
   1267                   side exit address and see if they are the same. */
   1268                nia = cia + (Addr64)isize;
   1269                if (tyW == Ity_I32)
   1270                   nia &= 0xFFFFFFFFULL;
   1271 
   1272                /* Side exit address */
   1273                dst = st->Ist.Exit.dst;
   1274                if (tyW == Ity_I32) {
   1275                   tl_assert(dst->tag == Ico_U32);
   1276                   sea = (Addr64)(UInt)dst->Ico.U32;
   1277                } else {
   1278                   tl_assert(tyW == Ity_I64);
   1279                   tl_assert(dst->tag == Ico_U64);
   1280                   sea = dst->Ico.U64;
   1281                }
   1282 
   1283                inverted = nia == sea;
   1284 
   1285                /* Widen the guard expression. */
   1286                addStmtToIRSB( cgs.sbOut,
   1287                               IRStmt_WrTmp( guard1, st->Ist.Exit.guard ));
   1288                addStmtToIRSB( cgs.sbOut,
   1289                               IRStmt_WrTmp( guardW,
   1290                                             IRExpr_Unop(widen,
   1291                                                         IRExpr_RdTmp(guard1))) );
   1292                /* If the exit is inverted, invert the sense of the guard. */
   1293                addStmtToIRSB(
   1294                      cgs.sbOut,
   1295                      IRStmt_WrTmp(
   1296                            guard,
   1297                            inverted ? IRExpr_Binop(opXOR, IRExpr_RdTmp(guardW), one)
   1298                                     : IRExpr_RdTmp(guardW)
   1299                               ));
   1300                /* And post the event. */
   1301                addEvent_Bc( &cgs, curr_inode, IRExpr_RdTmp(guard) );
   1302             }
   1303 
   1304             /* We may never reach the next statement, so need to flush
   1305                all outstanding transactions now. */
   1306             flushEvents( &cgs );
   1307             break;
   1308          }
   1309 
   1310          default:
   1311             ppIRStmt(st);
   1312             tl_assert(0);
   1313             break;
   1314       }
   1315 
   1316       /* Copy the original statement */
   1317       addStmtToIRSB( cgs.sbOut, st );
   1318 
   1319       if (DEBUG_CG) {
   1320          ppIRStmt(st);
   1321          VG_(printf)("\n");
   1322       }
   1323    }
   1324 
   1325    /* Deal with branches to unknown destinations.  Except ignore ones
   1326       which are function returns as we assume the return stack
   1327       predictor never mispredicts. */
   1328    if ((sbIn->jumpkind == Ijk_Boring) || (sbIn->jumpkind == Ijk_Call)) {
   1329       if (0) { ppIRExpr( sbIn->next ); VG_(printf)("\n"); }
   1330       switch (sbIn->next->tag) {
   1331          case Iex_Const:
   1332             break; /* boring - branch to known address */
   1333          case Iex_RdTmp:
   1334             /* looks like an indirect branch (branch to unknown) */
   1335             addEvent_Bi( &cgs, curr_inode, sbIn->next );
   1336             break;
   1337          default:
   1338             /* shouldn't happen - if the incoming IR is properly
   1339                flattened, should only have tmp and const cases to
   1340                consider. */
   1341             tl_assert(0);
   1342       }
   1343    }
   1344 
   1345    /* At the end of the bb.  Flush outstandings. */
   1346    flushEvents( &cgs );
   1347 
   1348    /* done.  stay sane ... */
   1349    tl_assert(cgs.sbInfo_i == cgs.sbInfo->n_instrs);
   1350 
   1351    if (DEBUG_CG) {
   1352       VG_(printf)( "goto {");
   1353       ppIRJumpKind(sbIn->jumpkind);
   1354       VG_(printf)( "} ");
   1355       ppIRExpr( sbIn->next );
   1356       VG_(printf)( "}\n");
   1357    }
   1358 
   1359    return cgs.sbOut;
   1360 }
   1361 
   1362 /*------------------------------------------------------------*/
   1363 /*--- Cache configuration                                  ---*/
   1364 /*------------------------------------------------------------*/
   1365 
/* Initialiser with all three cache_t fields set to -1.
   NOTE(review): presumably marks a cache whose parameters have not been
   specified -- confirm against the code that reads these variables. */
#define UNDEFINED_CACHE     { -1, -1, -1 }

/* One configuration each for the I1, D1 and LL caches, all starting out
   as UNDEFINED_CACHE. */
static cache_t clo_I1_cache = UNDEFINED_CACHE;
static cache_t clo_D1_cache = UNDEFINED_CACHE;
static cache_t clo_LL_cache = UNDEFINED_CACHE;
   1371 
   1372 /*------------------------------------------------------------*/
   1373 /*--- cg_fini() and related function                       ---*/
   1374 /*------------------------------------------------------------*/
   1375 
// Total reads/writes/misses.  Calculated during CC traversal at the end.
// All auto-zeroed.
// fprint_CC_table_and_calc_totals() adds every LineCC's counters into
// these while it writes the table, and then uses them for the
// "summary:" line.
static CacheCC  Ir_total;
static CacheCC  Dr_total;
static CacheCC  Dw_total;
static BranchCC Bc_total;
static BranchCC Bi_total;
   1383 
   1384 static void fprint_CC_table_and_calc_totals(void)
   1385 {
   1386    Int     i, fd;
   1387    SysRes  sres;
   1388    HChar    buf[512];
   1389    HChar   *currFile = NULL, *currFn = NULL;
   1390    LineCC* lineCC;
   1391 
   1392    // Setup output filename.  Nb: it's important to do this now, ie. as late
   1393    // as possible.  If we do it at start-up and the program forks and the
   1394    // output file format string contains a %p (pid) specifier, both the
   1395    // parent and child will incorrectly write to the same file;  this
   1396    // happened in 3.3.0.
   1397    HChar* cachegrind_out_file =
   1398       VG_(expand_file_name)("--cachegrind-out-file", clo_cachegrind_out_file);
   1399 
   1400    sres = VG_(open)(cachegrind_out_file, VKI_O_CREAT|VKI_O_TRUNC|VKI_O_WRONLY,
   1401                                          VKI_S_IRUSR|VKI_S_IWUSR);
   1402    if (sr_isError(sres)) {
   1403       // If the file can't be opened for whatever reason (conflict
   1404       // between multiple cachegrinded processes?), give up now.
   1405       VG_(umsg)("error: can't open cache simulation output file '%s'\n",
   1406                 cachegrind_out_file );
   1407       VG_(umsg)("       ... so simulation results will be missing.\n");
   1408       VG_(free)(cachegrind_out_file);
   1409       return;
   1410    } else {
   1411       fd = sr_Res(sres);
   1412       VG_(free)(cachegrind_out_file);
   1413    }
   1414 
   1415    // "desc:" lines (giving I1/D1/LL cache configuration).  The spaces after
   1416    // the 2nd colon makes cg_annotate's output look nicer.
   1417    VG_(sprintf)(buf, "desc: I1 cache:         %s\n"
   1418                      "desc: D1 cache:         %s\n"
   1419                      "desc: LL cache:         %s\n",
   1420                      I1.desc_line, D1.desc_line, LL.desc_line);
   1421    VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
   1422 
   1423    // "cmd:" line
   1424    VG_(strcpy)(buf, "cmd:");
   1425    VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
   1426    if (VG_(args_the_exename)) {
   1427       VG_(write)(fd, " ", 1);
   1428       VG_(write)(fd, VG_(args_the_exename),
   1429                      VG_(strlen)( VG_(args_the_exename) ));
   1430    }
   1431    for (i = 0; i < VG_(sizeXA)( VG_(args_for_client) ); i++) {
   1432       HChar* arg = * (HChar**) VG_(indexXA)( VG_(args_for_client), i );
   1433       if (arg) {
   1434          VG_(write)(fd, " ", 1);
   1435          VG_(write)(fd, arg, VG_(strlen)( arg ));
   1436       }
   1437    }
   1438    // "events:" line
   1439    if (clo_cache_sim && clo_branch_sim) {
   1440       VG_(sprintf)(buf, "\nevents: Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw "
   1441                                   "Bc Bcm Bi Bim\n");
   1442    }
   1443    else if (clo_cache_sim && !clo_branch_sim) {
   1444       VG_(sprintf)(buf, "\nevents: Ir I1mr ILmr Dr D1mr DLmr Dw D1mw DLmw "
   1445                                   "\n");
   1446    }
   1447    else if (!clo_cache_sim && clo_branch_sim) {
   1448       VG_(sprintf)(buf, "\nevents: Ir "
   1449                                   "Bc Bcm Bi Bim\n");
   1450    }
   1451    else {
   1452       VG_(sprintf)(buf, "\nevents: Ir\n");
   1453    }
   1454 
   1455    VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
   1456 
   1457    // Traverse every lineCC
   1458    VG_(OSetGen_ResetIter)(CC_table);
   1459    while ( (lineCC = VG_(OSetGen_Next)(CC_table)) ) {
   1460       Bool just_hit_a_new_file = False;
   1461       // If we've hit a new file, print a "fl=" line.  Note that because
   1462       // each string is stored exactly once in the string table, we can use
   1463       // pointer comparison rather than strcmp() to test for equality, which
   1464       // is good because most of the time the comparisons are equal and so
   1465       // the whole strings would have to be checked.
   1466       if ( lineCC->loc.file != currFile ) {
   1467          currFile = lineCC->loc.file;
   1468          VG_(sprintf)(buf, "fl=%s\n", currFile);
   1469          VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
   1470          distinct_files++;
   1471          just_hit_a_new_file = True;
   1472       }
   1473       // If we've hit a new function, print a "fn=" line.  We know to do
   1474       // this when the function name changes, and also every time we hit a
   1475       // new file (in which case the new function name might be the same as
   1476       // in the old file, hence the just_hit_a_new_file test).
   1477       if ( just_hit_a_new_file || lineCC->loc.fn != currFn ) {
   1478          currFn = lineCC->loc.fn;
   1479          VG_(sprintf)(buf, "fn=%s\n", currFn);
   1480          VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
   1481          distinct_fns++;
   1482       }
   1483 
   1484       // Print the LineCC
   1485       if (clo_cache_sim && clo_branch_sim) {
   1486          VG_(sprintf)(buf, "%u %llu %llu %llu"
   1487                              " %llu %llu %llu"
   1488                              " %llu %llu %llu"
   1489                              " %llu %llu %llu %llu\n",
   1490                             lineCC->loc.line,
   1491                             lineCC->Ir.a, lineCC->Ir.m1, lineCC->Ir.mL,
   1492                             lineCC->Dr.a, lineCC->Dr.m1, lineCC->Dr.mL,
   1493                             lineCC->Dw.a, lineCC->Dw.m1, lineCC->Dw.mL,
   1494                             lineCC->Bc.b, lineCC->Bc.mp,
   1495                             lineCC->Bi.b, lineCC->Bi.mp);
   1496       }
   1497       else if (clo_cache_sim && !clo_branch_sim) {
   1498          VG_(sprintf)(buf, "%u %llu %llu %llu"
   1499                              " %llu %llu %llu"
   1500                              " %llu %llu %llu\n",
   1501                             lineCC->loc.line,
   1502                             lineCC->Ir.a, lineCC->Ir.m1, lineCC->Ir.mL,
   1503                             lineCC->Dr.a, lineCC->Dr.m1, lineCC->Dr.mL,
   1504                             lineCC->Dw.a, lineCC->Dw.m1, lineCC->Dw.mL);
   1505       }
   1506       else if (!clo_cache_sim && clo_branch_sim) {
   1507          VG_(sprintf)(buf, "%u %llu"
   1508                              " %llu %llu %llu %llu\n",
   1509                             lineCC->loc.line,
   1510                             lineCC->Ir.a,
   1511                             lineCC->Bc.b, lineCC->Bc.mp,
   1512                             lineCC->Bi.b, lineCC->Bi.mp);
   1513       }
   1514       else {
   1515          VG_(sprintf)(buf, "%u %llu\n",
   1516                             lineCC->loc.line,
   1517                             lineCC->Ir.a);
   1518       }
   1519 
   1520       VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
   1521 
   1522       // Update summary stats
   1523       Ir_total.a  += lineCC->Ir.a;
   1524       Ir_total.m1 += lineCC->Ir.m1;
   1525       Ir_total.mL += lineCC->Ir.mL;
   1526       Dr_total.a  += lineCC->Dr.a;
   1527       Dr_total.m1 += lineCC->Dr.m1;
   1528       Dr_total.mL += lineCC->Dr.mL;
   1529       Dw_total.a  += lineCC->Dw.a;
   1530       Dw_total.m1 += lineCC->Dw.m1;
   1531       Dw_total.mL += lineCC->Dw.mL;
   1532       Bc_total.b  += lineCC->Bc.b;
   1533       Bc_total.mp += lineCC->Bc.mp;
   1534       Bi_total.b  += lineCC->Bi.b;
   1535       Bi_total.mp += lineCC->Bi.mp;
   1536 
   1537       distinct_lines++;
   1538    }
   1539 
   1540    // Summary stats must come after rest of table, since we calculate them
   1541    // during traversal.  */
   1542    if (clo_cache_sim && clo_branch_sim) {
   1543       VG_(sprintf)(buf, "summary:"
   1544                         " %llu %llu %llu"
   1545                         " %llu %llu %llu"
   1546                         " %llu %llu %llu"
   1547                         " %llu %llu %llu %llu\n",
   1548                         Ir_total.a, Ir_total.m1, Ir_total.mL,
   1549                         Dr_total.a, Dr_total.m1, Dr_total.mL,
   1550                         Dw_total.a, Dw_total.m1, Dw_total.mL,
   1551                         Bc_total.b, Bc_total.mp,
   1552                         Bi_total.b, Bi_total.mp);
   1553    }
   1554    else if (clo_cache_sim && !clo_branch_sim) {
   1555       VG_(sprintf)(buf, "summary:"
   1556                         " %llu %llu %llu"
   1557                         " %llu %llu %llu"
   1558                         " %llu %llu %llu\n",
   1559                         Ir_total.a, Ir_total.m1, Ir_total.mL,
   1560                         Dr_total.a, Dr_total.m1, Dr_total.mL,
   1561                         Dw_total.a, Dw_total.m1, Dw_total.mL);
   1562    }
   1563    else if (!clo_cache_sim && clo_branch_sim) {
   1564       VG_(sprintf)(buf, "summary:"
   1565                         " %llu"
   1566                         " %llu %llu %llu %llu\n",
   1567                         Ir_total.a,
   1568                         Bc_total.b, Bc_total.mp,
   1569                         Bi_total.b, Bi_total.mp);
   1570    }
   1571    else {
   1572       VG_(sprintf)(buf, "summary:"
   1573                         " %llu\n",
   1574                         Ir_total.a);
   1575    }
   1576 
   1577    VG_(write)(fd, (void*)buf, VG_(strlen)(buf));
   1578    VG_(close)(fd);
   1579 }
   1580 
   1581 static UInt ULong_width(ULong n)
   1582 {
   1583    UInt w = 0;
   1584    while (n > 0) {
   1585       n = n / 10;
   1586       w++;
   1587    }
   1588    if (w == 0) w = 1;
   1589    return w + (w-1)/3;   // add space for commas
   1590 }
   1591 
   1592 static void cg_fini(Int exitcode)
   1593 {
   1594    static HChar buf1[128], buf2[128], buf3[128], buf4[123];
   1595    static HChar fmt[128];
   1596 
   1597    CacheCC  D_total;
   1598    BranchCC B_total;
   1599    ULong LL_total_m, LL_total_mr, LL_total_mw,
   1600          LL_total, LL_total_r, LL_total_w;
   1601    Int l1, l2, l3;
   1602 
   1603    fprint_CC_table_and_calc_totals();
   1604 
   1605    if (VG_(clo_verbosity) == 0)
   1606       return;
   1607 
   1608    // Nb: this isn't called "MAX" because that overshadows a global on Darwin.
   1609    #define CG_MAX(a, b)  ((a) >= (b) ? (a) : (b))
   1610 
   1611    /* I cache results.  Use the I_refs value to determine the first column
   1612     * width. */
   1613    l1 = ULong_width(Ir_total.a);
   1614    l2 = ULong_width(CG_MAX(Dr_total.a, Bc_total.b));
   1615    l3 = ULong_width(CG_MAX(Dw_total.a, Bi_total.b));
   1616 
   1617    /* Make format string, getting width right for numbers */
   1618    VG_(sprintf)(fmt, "%%s %%,%dllu\n", l1);
   1619 
   1620    /* Always print this */
   1621    VG_(umsg)(fmt, "I   refs:     ", Ir_total.a);
   1622 
   1623    /* If cache profiling is enabled, show D access numbers and all
   1624       miss numbers */
   1625    if (clo_cache_sim) {
   1626       VG_(umsg)(fmt, "I1  misses:   ", Ir_total.m1);
   1627       VG_(umsg)(fmt, "LLi misses:   ", Ir_total.mL);
   1628 
   1629       if (0 == Ir_total.a) Ir_total.a = 1;
   1630       VG_(percentify)(Ir_total.m1, Ir_total.a, 2, l1+1, buf1);
   1631       VG_(umsg)("I1  miss rate: %s\n", buf1);
   1632 
   1633       VG_(percentify)(Ir_total.mL, Ir_total.a, 2, l1+1, buf1);
   1634       VG_(umsg)("LLi miss rate: %s\n", buf1);
   1635       VG_(umsg)("\n");
   1636 
   1637       /* D cache results.  Use the D_refs.rd and D_refs.wr values to
   1638        * determine the width of columns 2 & 3. */
   1639       D_total.a  = Dr_total.a  + Dw_total.a;
   1640       D_total.m1 = Dr_total.m1 + Dw_total.m1;
   1641       D_total.mL = Dr_total.mL + Dw_total.mL;
   1642 
   1643       /* Make format string, getting width right for numbers */
   1644       VG_(sprintf)(fmt, "%%s %%,%dllu  (%%,%dllu rd   + %%,%dllu wr)\n",
   1645                         l1, l2, l3);
   1646 
   1647       VG_(umsg)(fmt, "D   refs:     ",
   1648                      D_total.a, Dr_total.a, Dw_total.a);
   1649       VG_(umsg)(fmt, "D1  misses:   ",
   1650                      D_total.m1, Dr_total.m1, Dw_total.m1);
   1651       VG_(umsg)(fmt, "LLd misses:   ",
   1652                      D_total.mL, Dr_total.mL, Dw_total.mL);
   1653 
   1654       if (0 == D_total.a)  D_total.a = 1;
   1655       if (0 == Dr_total.a) Dr_total.a = 1;
   1656       if (0 == Dw_total.a) Dw_total.a = 1;
   1657       VG_(percentify)( D_total.m1,  D_total.a, 1, l1+1, buf1);
   1658       VG_(percentify)(Dr_total.m1, Dr_total.a, 1, l2+1, buf2);
   1659       VG_(percentify)(Dw_total.m1, Dw_total.a, 1, l3+1, buf3);
   1660       VG_(umsg)("D1  miss rate: %s (%s     + %s  )\n", buf1, buf2,buf3);
   1661 
   1662       VG_(percentify)( D_total.mL,  D_total.a, 1, l1+1, buf1);
   1663       VG_(percentify)(Dr_total.mL, Dr_total.a, 1, l2+1, buf2);
   1664       VG_(percentify)(Dw_total.mL, Dw_total.a, 1, l3+1, buf3);
   1665       VG_(umsg)("LLd miss rate: %s (%s     + %s  )\n", buf1, buf2,buf3);
   1666       VG_(umsg)("\n");
   1667 
   1668       /* LL overall results */
   1669 
   1670       LL_total   = Dr_total.m1 + Dw_total.m1 + Ir_total.m1;
   1671       LL_total_r = Dr_total.m1 + Ir_total.m1;
   1672       LL_total_w = Dw_total.m1;
   1673       VG_(umsg)(fmt, "LL refs:      ",
   1674                      LL_total, LL_total_r, LL_total_w);
   1675 
   1676       LL_total_m  = Dr_total.mL + Dw_total.mL + Ir_total.mL;
   1677       LL_total_mr = Dr_total.mL + Ir_total.mL;
   1678       LL_total_mw = Dw_total.mL;
   1679       VG_(umsg)(fmt, "LL misses:    ",
   1680                      LL_total_m, LL_total_mr, LL_total_mw);
   1681 
   1682       VG_(percentify)(LL_total_m,  (Ir_total.a + D_total.a),  1, l1+1, buf1);
   1683       VG_(percentify)(LL_total_mr, (Ir_total.a + Dr_total.a), 1, l2+1, buf2);
   1684       VG_(percentify)(LL_total_mw, Dw_total.a,                1, l3+1, buf3);
   1685       VG_(umsg)("LL miss rate:  %s (%s     + %s  )\n", buf1, buf2,buf3);
   1686    }
   1687 
   1688    /* If branch profiling is enabled, show branch overall results. */
   1689    if (clo_branch_sim) {
   1690       /* Make format string, getting width right for numbers */
   1691       VG_(sprintf)(fmt, "%%s %%,%dllu  (%%,%dllu cond + %%,%dllu ind)\n",
   1692                         l1, l2, l3);
   1693 
   1694       if (0 == Bc_total.b)  Bc_total.b = 1;
   1695       if (0 == Bi_total.b)  Bi_total.b = 1;
   1696       B_total.b  = Bc_total.b  + Bi_total.b;
   1697       B_total.mp = Bc_total.mp + Bi_total.mp;
   1698 
   1699       VG_(umsg)("\n");
   1700       VG_(umsg)(fmt, "Branches:     ",
   1701                      B_total.b, Bc_total.b, Bi_total.b);
   1702 
   1703       VG_(umsg)(fmt, "Mispredicts:  ",
   1704                      B_total.mp, Bc_total.mp, Bi_total.mp);
   1705 
   1706       VG_(percentify)(B_total.mp,  B_total.b,  1, l1+1, buf1);
   1707       VG_(percentify)(Bc_total.mp, Bc_total.b, 1, l2+1, buf2);
   1708       VG_(percentify)(Bi_total.mp, Bi_total.b, 1, l3+1, buf3);
   1709 
   1710       VG_(umsg)("Mispred rate:  %s (%s     + %s   )\n", buf1, buf2,buf3);
   1711    }
   1712 
   1713    // Various stats
   1714    if (VG_(clo_stats)) {
   1715       Int debug_lookups = full_debugs      + fn_debugs +
   1716                           file_line_debugs + no_debugs;
   1717 
   1718       VG_(dmsg)("\n");
   1719       VG_(dmsg)("cachegrind: distinct files     : %d\n", distinct_files);
   1720       VG_(dmsg)("cachegrind: distinct functions : %d\n", distinct_fns);
   1721       VG_(dmsg)("cachegrind: distinct lines     : %d\n", distinct_lines);
   1722       VG_(dmsg)("cachegrind: distinct instrs NoX: %d\n", distinct_instrsNoX);
   1723       VG_(dmsg)("cachegrind: distinct instrs Gen: %d\n", distinct_instrsGen);
   1724       VG_(dmsg)("cachegrind: debug lookups      : %d\n", debug_lookups);
   1725 
   1726       VG_(percentify)(full_debugs,      debug_lookups, 1, 6, buf1);
   1727       VG_(percentify)(file_line_debugs, debug_lookups, 1, 6, buf2);
   1728       VG_(percentify)(fn_debugs,        debug_lookups, 1, 6, buf3);
   1729       VG_(percentify)(no_debugs,        debug_lookups, 1, 6, buf4);
   1730       VG_(dmsg)("cachegrind: with full      info:%s (%d)\n",
   1731                 buf1, full_debugs);
   1732       VG_(dmsg)("cachegrind: with file/line info:%s (%d)\n",
   1733                 buf2, file_line_debugs);
   1734       VG_(dmsg)("cachegrind: with fn name   info:%s (%d)\n",
   1735                 buf3, fn_debugs);
   1736       VG_(dmsg)("cachegrind: with zero      info:%s (%d)\n",
   1737                 buf4, no_debugs);
   1738 
   1739       VG_(dmsg)("cachegrind: string table size: %lu\n",
   1740                 VG_(OSetGen_Size)(stringTable));
   1741       VG_(dmsg)("cachegrind: CC table size: %lu\n",
   1742                 VG_(OSetGen_Size)(CC_table));
   1743       VG_(dmsg)("cachegrind: InstrInfo table size: %lu\n",
   1744                 VG_(OSetGen_Size)(instrInfoTable));
   1745    }
   1746 }
   1747 
   1748 /*--------------------------------------------------------------------*/
   1749 /*--- Discarding BB info                                           ---*/
   1750 /*--------------------------------------------------------------------*/
   1751 
   1752 // Called when a translation is removed from the translation cache for
   1753 // any reason at all: to free up space, because the guest code was
   1754 // unmapped or modified, or for any arbitrary reason.
   1755 static
   1756 void cg_discard_superblock_info ( Addr64 orig_addr64, VexGuestExtents vge )
   1757 {
   1758    SB_info* sbInfo;
   1759    Addr     orig_addr = (Addr)vge.base[0];
   1760 
   1761    tl_assert(vge.n_used > 0);
   1762 
   1763    if (DEBUG_CG)
   1764       VG_(printf)( "discard_basic_block_info: %p, %p, %llu\n",
   1765                    (void*)(Addr)orig_addr,
   1766                    (void*)(Addr)vge.base[0], (ULong)vge.len[0]);
   1767 
   1768    // Get BB info, remove from table, free BB info.  Simple!  Note that we
   1769    // use orig_addr, not the first instruction address in vge.
   1770    sbInfo = VG_(OSetGen_Remove)(instrInfoTable, &orig_addr);
   1771    tl_assert(NULL != sbInfo);
   1772    VG_(OSetGen_FreeNode)(instrInfoTable, sbInfo);
   1773 }
   1774 
   1775 /*--------------------------------------------------------------------*/
   1776 /*--- Command line processing                                      ---*/
   1777 /*--------------------------------------------------------------------*/
   1778 
   1779 static Bool cg_process_cmd_line_option(const HChar* arg)
   1780 {
   1781    if (VG_(str_clo_cache_opt)(arg,
   1782                               &clo_I1_cache,
   1783                               &clo_D1_cache,
   1784                               &clo_LL_cache)) {}
   1785 
   1786    else if VG_STR_CLO( arg, "--cachegrind-out-file", clo_cachegrind_out_file) {}
   1787    else if VG_BOOL_CLO(arg, "--cache-sim",  clo_cache_sim)  {}
   1788    else if VG_BOOL_CLO(arg, "--branch-sim", clo_branch_sim) {}
   1789    else
   1790       return False;
   1791 
   1792    return True;
   1793 }
   1794 
   1795 static void cg_print_usage(void)
   1796 {
   1797    VG_(print_cache_clo_opts)();
   1798    VG_(printf)(
   1799 "    --cache-sim=yes|no  [yes]        collect cache stats?\n"
   1800 "    --branch-sim=yes|no [no]         collect branch prediction stats?\n"
   1801 "    --cachegrind-out-file=<file>     output file name [cachegrind.out.%%p]\n"
   1802    );
   1803 }
   1804 
   1805 static void cg_print_debug_usage(void)
   1806 {
   1807    VG_(printf)(
   1808 "    (none)\n"
   1809    );
   1810 }
   1811 
   1812 /*--------------------------------------------------------------------*/
   1813 /*--- Setup                                                        ---*/
   1814 /*--------------------------------------------------------------------*/
   1815 
   1816 static void cg_post_clo_init(void); /* just below */
   1817 
   1818 static void cg_pre_clo_init(void)
   1819 {
   1820    VG_(details_name)            ("Cachegrind");
   1821    VG_(details_version)         (NULL);
   1822    VG_(details_description)     ("a cache and branch-prediction profiler");
   1823    VG_(details_copyright_author)(
   1824       "Copyright (C) 2002-2013, and GNU GPL'd, by Nicholas Nethercote et al.");
   1825    VG_(details_bug_reports_to)  (VG_BUGS_TO);
   1826    VG_(details_avg_translation_sizeB) ( 500 );
   1827 
   1828    VG_(clo_vex_control).iropt_register_updates
   1829       = VexRegUpdSpAtMemAccess; // overridable by the user.
   1830    VG_(basic_tool_funcs)          (cg_post_clo_init,
   1831                                    cg_instrument,
   1832                                    cg_fini);
   1833 
   1834    VG_(needs_superblock_discards)(cg_discard_superblock_info);
   1835    VG_(needs_command_line_options)(cg_process_cmd_line_option,
   1836                                    cg_print_usage,
   1837                                    cg_print_debug_usage);
   1838 }
   1839 
   1840 static void cg_post_clo_init(void)
   1841 {
   1842    cache_t I1c, D1c, LLc;
   1843 
   1844    CC_table =
   1845       VG_(OSetGen_Create)(offsetof(LineCC, loc),
   1846                           cmp_CodeLoc_LineCC,
   1847                           VG_(malloc), "cg.main.cpci.1",
   1848                           VG_(free));
   1849    instrInfoTable =
   1850       VG_(OSetGen_Create)(/*keyOff*/0,
   1851                           NULL,
   1852                           VG_(malloc), "cg.main.cpci.2",
   1853                           VG_(free));
   1854    stringTable =
   1855       VG_(OSetGen_Create)(/*keyOff*/0,
   1856                           stringCmp,
   1857                           VG_(malloc), "cg.main.cpci.3",
   1858                           VG_(free));
   1859 
   1860    VG_(post_clo_init_configure_caches)(&I1c, &D1c, &LLc,
   1861                                        &clo_I1_cache,
   1862                                        &clo_D1_cache,
   1863                                        &clo_LL_cache);
   1864 
   1865    // min_line_size is used to make sure that we never feed
   1866    // accesses to the simulator straddling more than two
   1867    // cache lines at any cache level
   1868    min_line_size = (I1c.line_size < D1c.line_size) ? I1c.line_size : D1c.line_size;
   1869    min_line_size = (LLc.line_size < min_line_size) ? LLc.line_size : min_line_size;
   1870 
   1871    Int largest_load_or_store_size
   1872       = VG_(machine_get_size_of_largest_guest_register)();
   1873    if (min_line_size < largest_load_or_store_size) {
   1874       /* We can't continue, because the cache simulation might
   1875          straddle more than 2 lines, and it will assert.  So let's
   1876          just stop before we start. */
   1877       VG_(umsg)("Cachegrind: cannot continue: the minimum line size (%d)\n",
   1878                 (Int)min_line_size);
   1879       VG_(umsg)("  must be equal to or larger than the maximum register size (%d)\n",
   1880                 largest_load_or_store_size );
   1881       VG_(umsg)("  but it is not.  Exiting now.\n");
   1882       VG_(exit)(1);
   1883    }
   1884 
   1885    cachesim_initcaches(I1c, D1c, LLc);
   1886 }
   1887 
   1888 VG_DETERMINE_INTERFACE_VERSION(cg_pre_clo_init)
   1889 
   1890 /*--------------------------------------------------------------------*/
   1891 /*--- end                                                          ---*/
   1892 /*--------------------------------------------------------------------*/
   1893 
   1894