Home | History | Annotate | Download | only in callgrind
      1 
      2 /*--------------------------------------------------------------------*/
      3 /*--- Callgrind                                                    ---*/
      4 /*---                                                       main.c ---*/
      5 /*--------------------------------------------------------------------*/
      6 
      7 /*
      8    This file is part of Callgrind, a Valgrind tool for call graph
      9    profiling programs.
     10 
     11    Copyright (C) 2002-2011, Josef Weidendorfer (Josef.Weidendorfer (at) gmx.de)
     12 
     13    This tool is derived from and contains code from Cachegrind
     14    Copyright (C) 2002-2011 Nicholas Nethercote (njn (at) valgrind.org)
     15 
     16    This program is free software; you can redistribute it and/or
     17    modify it under the terms of the GNU General Public License as
     18    published by the Free Software Foundation; either version 2 of the
     19    License, or (at your option) any later version.
     20 
     21    This program is distributed in the hope that it will be useful, but
     22    WITHOUT ANY WARRANTY; without even the implied warranty of
     23    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     24    General Public License for more details.
     25 
     26    You should have received a copy of the GNU General Public License
     27    along with this program; if not, write to the Free Software
     28    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
     29    02111-1307, USA.
     30 
     31    The GNU General Public License is contained in the file COPYING.
     32 */
     33 
     34 #include "config.h"
     35 #include "callgrind.h"
     36 #include "global.h"
     37 
     38 #include "pub_tool_threadstate.h"
     39 #include "pub_tool_gdbserver.h"
     40 
     41 #include "cg_branchpred.c"
     42 
     43 /*------------------------------------------------------------*/
     44 /*--- Global variables                                     ---*/
     45 /*------------------------------------------------------------*/
     46 
/* for all threads */
CommandLineOptions CLG_(clo);       /* parsed command line options */
Statistics CLG_(stat);              /* tool-wide counters, see init_statistics() */
Bool CLG_(instrument_state) = True; /* Instrumentation on ? */

/* thread and signal handler specific */
exec_state CLG_(current_state);     /* cost/collection state of running thread */
     54 
     55 
     56 /*------------------------------------------------------------*/
     57 /*--- Statistics                                           ---*/
     58 /*------------------------------------------------------------*/
     59 
     60 static void CLG_(init_statistics)(Statistics* s)
     61 {
     62   s->call_counter        = 0;
     63   s->jcnd_counter        = 0;
     64   s->jump_counter        = 0;
     65   s->rec_call_counter    = 0;
     66   s->ret_counter         = 0;
     67   s->bb_executions       = 0;
     68 
     69   s->context_counter     = 0;
     70   s->bb_retranslations   = 0;
     71 
     72   s->distinct_objs       = 0;
     73   s->distinct_files      = 0;
     74   s->distinct_fns        = 0;
     75   s->distinct_contexts   = 0;
     76   s->distinct_bbs        = 0;
     77   s->distinct_bbccs      = 0;
     78   s->distinct_instrs     = 0;
     79   s->distinct_skips      = 0;
     80 
     81   s->bb_hash_resizes     = 0;
     82   s->bbcc_hash_resizes   = 0;
     83   s->jcc_hash_resizes    = 0;
     84   s->cxt_hash_resizes    = 0;
     85   s->fn_array_resizes    = 0;
     86   s->call_stack_resizes  = 0;
     87   s->fn_stack_resizes    = 0;
     88 
     89   s->full_debug_BBs      = 0;
     90   s->file_line_debug_BBs = 0;
     91   s->fn_name_debug_BBs   = 0;
     92   s->no_debug_BBs        = 0;
     93   s->bbcc_lru_misses     = 0;
     94   s->jcc_lru_misses      = 0;
     95   s->cxt_lru_misses      = 0;
     96   s->bbcc_clones         = 0;
     97 }
     98 
     99 
    100 /*------------------------------------------------------------*/
    101 /*--- Simple callbacks (not cache similator)               ---*/
    102 /*------------------------------------------------------------*/
    103 
    104 VG_REGPARM(1)
    105 static void log_global_event(InstrInfo* ii)
    106 {
    107     ULong* cost_Bus;
    108 
    109     CLG_DEBUG(6, "log_global_event:  Ir  %#lx/%u\n",
    110               CLG_(bb_base) + ii->instr_offset, ii->instr_size);
    111 
    112     if (!CLG_(current_state).collect) return;
    113 
    114     CLG_ASSERT( (ii->eventset->mask & (1u<<EG_BUS))>0 );
    115 
    116     CLG_(current_state).cost[ fullOffset(EG_BUS) ]++;
    117 
    118     if (CLG_(current_state).nonskipped)
    119         cost_Bus = CLG_(current_state).nonskipped->skipped + fullOffset(EG_BUS);
    120     else
    121         cost_Bus = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_BUS];
    122     cost_Bus[0]++;
    123 }
    124 
    125 
    126 /* For branches, we consult two different predictors, one which
    127    predicts taken/untaken for conditional branches, and the other
    128    which predicts the branch target address for indirect branches
    129    (jump-to-register style ones). */
    130 
    131 static VG_REGPARM(2)
    132 void log_cond_branch(InstrInfo* ii, Word taken)
    133 {
    134     Bool miss;
    135     Int fullOffset_Bc;
    136     ULong* cost_Bc;
    137 
    138     CLG_DEBUG(6, "log_cond_branch:  Ir %#lx, taken %lu\n",
    139               CLG_(bb_base) + ii->instr_offset, taken);
    140 
    141     miss = 1 & do_cond_branch_predict(CLG_(bb_base) + ii->instr_offset, taken);
    142 
    143     if (!CLG_(current_state).collect) return;
    144 
    145     CLG_ASSERT( (ii->eventset->mask & (1u<<EG_BC))>0 );
    146 
    147     if (CLG_(current_state).nonskipped)
    148         cost_Bc = CLG_(current_state).nonskipped->skipped + fullOffset(EG_BC);
    149     else
    150         cost_Bc = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_BC];
    151 
    152     fullOffset_Bc = fullOffset(EG_BC);
    153     CLG_(current_state).cost[ fullOffset_Bc ]++;
    154     cost_Bc[0]++;
    155     if (miss) {
    156         CLG_(current_state).cost[ fullOffset_Bc+1 ]++;
    157         cost_Bc[1]++;
    158     }
    159 }
    160 
    161 static VG_REGPARM(2)
    162 void log_ind_branch(InstrInfo* ii, UWord actual_dst)
    163 {
    164     Bool miss;
    165     Int fullOffset_Bi;
    166     ULong* cost_Bi;
    167 
    168     CLG_DEBUG(6, "log_ind_branch:  Ir  %#lx, dst %#lx\n",
    169               CLG_(bb_base) + ii->instr_offset, actual_dst);
    170 
    171     miss = 1 & do_ind_branch_predict(CLG_(bb_base) + ii->instr_offset, actual_dst);
    172 
    173     if (!CLG_(current_state).collect) return;
    174 
    175     CLG_ASSERT( (ii->eventset->mask & (1u<<EG_BI))>0 );
    176 
    177     if (CLG_(current_state).nonskipped)
    178         cost_Bi = CLG_(current_state).nonskipped->skipped + fullOffset(EG_BI);
    179     else
    180         cost_Bi = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_BI];
    181 
    182     fullOffset_Bi = fullOffset(EG_BI);
    183     CLG_(current_state).cost[ fullOffset_Bi ]++;
    184     cost_Bi[0]++;
    185     if (miss) {
    186         CLG_(current_state).cost[ fullOffset_Bi+1 ]++;
    187         cost_Bi[1]++;
    188     }
    189 }
    190 
    191 /*------------------------------------------------------------*/
    192 /*--- Instrumentation structures and event queue handling  ---*/
    193 /*------------------------------------------------------------*/
    194 
    195 /* Maintain an ordered list of memory events which are outstanding, in
    196    the sense that no IR has yet been generated to do the relevant
    197    helper calls.  The BB is scanned top to bottom and memory events
    198    are added to the end of the list, merging with the most recent
    199    notified event where possible (Dw immediately following Dr and
    200    having the same size and EA can be merged).
    201 
    202    This merging is done so that for architectures which have
    203    load-op-store instructions (x86, amd64), the insn is treated as if
    204    it makes just one memory reference (a modify), rather than two (a
    205    read followed by a write at the same address).
    206 
    207    At various points the list will need to be flushed, that is, IR
    208    generated from it.  That must happen before any possible exit from
    209    the block (the end, or an IRStmt_Exit).  Flushing also takes place
    210    when there is no space to add a new event.
    211 
    212    If we require the simulation statistics to be up to date with
    213    respect to possible memory exceptions, then the list would have to
    214    be flushed before each memory reference.  That would however lose
    215    performance by inhibiting event-merging during flushing.
    216 
    217    Flushing the list consists of walking it start to end and emitting
    218    instrumentation IR for each event, in the order in which they
    219    appear.  It may be possible to emit a single call for two adjacent
    220    events in order to reduce the number of helper function calls made.
    221    For example, it could well be profitable to handle two adjacent Ir
    222    events with a single helper call.  */
    223 
/* IR expressions queued here are always atoms (constants or temporaries);
   the alias documents that invariant. */
typedef
   IRExpr
   IRAtom;

/* Kind of an outstanding (not yet instrumented) event. */
typedef
   enum {
      Ev_Ir,  // Instruction read
      Ev_Dr,  // Data read
      Ev_Dw,  // Data write
      Ev_Dm,  // Data modify (read then write)
      Ev_Bc,  // branch conditional
      Ev_Bi,  // branch indirect (to unknown destination)
      Ev_G    // Global bus event
   }
   EventTag;

/* One entry of the outstanding-event queue.  Only the union member
   selected by <tag> is valid. */
typedef
   struct {
      EventTag   tag;
      InstrInfo* inode;   /* instruction this event belongs to */
      union {
	 struct {
	 } Ir;
	 struct {
	    IRAtom* ea;    /* effective address */
	    Int     szB;   /* access size in bytes */
	 } Dr;
	 struct {
	    IRAtom* ea;
	    Int     szB;
	 } Dw;
	 struct {
	    IRAtom* ea;
	    Int     szB;
	 } Dm;
         struct {
            IRAtom* taken; /* :: Ity_I1 */
         } Bc;
         struct {
            IRAtom* dst;   /* branch target address */
         } Bi;
	 struct {
	 } G;
      } Ev;
   }
   Event;
    270 
/* Zero-fill an event-queue slot before (re)use. */
static void init_Event ( Event* ev ) {
   VG_(memset)(ev, 0, sizeof(Event));
}
    274 
    275 static IRAtom* get_Event_dea ( Event* ev ) {
    276    switch (ev->tag) {
    277       case Ev_Dr: return ev->Ev.Dr.ea;
    278       case Ev_Dw: return ev->Ev.Dw.ea;
    279       case Ev_Dm: return ev->Ev.Dm.ea;
    280       default:    tl_assert(0);
    281    }
    282 }
    283 
    284 static Int get_Event_dszB ( Event* ev ) {
    285    switch (ev->tag) {
    286       case Ev_Dr: return ev->Ev.Dr.szB;
    287       case Ev_Dw: return ev->Ev.Dw.szB;
    288       case Ev_Dm: return ev->Ev.Dm.szB;
    289       default:    tl_assert(0);
    290    }
    291 }
    292 
    293 
    294 /* Up to this many unnotified events are allowed.  Number is
    295    arbitrary.  Larger numbers allow more event merging to occur, but
    296    potentially induce more spilling due to extending live ranges of
    297    address temporaries. */
    298 #define N_EVENTS 16
    299 
    300 
/* A struct which holds all the running state during instrumentation.
   Mostly to avoid passing loads of parameters everywhere. */
typedef struct {
    /* The current outstanding-memory-event list. */
    Event events[N_EVENTS];
    Int   events_used;   /* number of valid entries in events[] */

    /* The array of InstrInfo's is part of BB struct. */
    BB* bb;

    /* BB seen before (ie. re-instrumentation) */
    Bool seen_before;

    /* Number InstrInfo bins 'used' so far. */
    UInt ii_index;

    // current offset of guest instructions from BB start
    UInt instr_offset;

    /* The output SB being constructed. */
    IRSB* sbOut;
} ClgState;
    323 
    324 
/* Debug helper: print a one-line human-readable description of one
   queued event (used by flushEvents at verbosity >= 5). */
static void showEvent ( Event* ev )
{
   switch (ev->tag) {
      case Ev_Ir:
	 VG_(printf)("Ir (InstrInfo %p) at +%d\n",
		     ev->inode, ev->inode->instr_offset);
	 break;
      case Ev_Dr:
	 VG_(printf)("Dr (InstrInfo %p) at +%d %d EA=",
		     ev->inode, ev->inode->instr_offset, ev->Ev.Dr.szB);
	 ppIRExpr(ev->Ev.Dr.ea);
	 VG_(printf)("\n");
	 break;
      case Ev_Dw:
	 VG_(printf)("Dw (InstrInfo %p) at +%d %d EA=",
		     ev->inode, ev->inode->instr_offset, ev->Ev.Dw.szB);
	 ppIRExpr(ev->Ev.Dw.ea);
	 VG_(printf)("\n");
	 break;
      case Ev_Dm:
	 VG_(printf)("Dm (InstrInfo %p) at +%d %d EA=",
		     ev->inode, ev->inode->instr_offset, ev->Ev.Dm.szB);
	 ppIRExpr(ev->Ev.Dm.ea);
	 VG_(printf)("\n");
	 break;
      case Ev_Bc:
         VG_(printf)("Bc %p   GA=", ev->inode);
         ppIRExpr(ev->Ev.Bc.taken);
         VG_(printf)("\n");
         break;
      case Ev_Bi:
         VG_(printf)("Bi %p  DST=", ev->inode);
         ppIRExpr(ev->Ev.Bi.dst);
         VG_(printf)("\n");
         break;
      case Ev_G:
         VG_(printf)("G  %p\n", ev->inode);
         break;
      default:
	 tl_assert(0);
	 break;
   }
}
    368 
    369 /* Generate code for all outstanding memory events, and mark the queue
    370    empty.  Code is generated into cgs->sbOut, and this activity
    371    'consumes' slots in cgs->bb. */
    372 
/* Emit instrumentation IR into clgs->sbOut for all outstanding events
   and mark the queue empty.
   On the first instrumentation of a BB this also extends each
   instruction's event set with the event groups its queued events need
   (cost offsets are laid out later by update_cost_offsets).
   Adjacent events are merged into combined helper calls where the
   simulator provides one (e.g. 1I+1Dr, 3I). */
static void flushEvents ( ClgState* clgs )
{
   Int        i, regparms, inew;
   Char*      helperName;
   void*      helperAddr;
   IRExpr**   argv;
   IRExpr*    i_node_expr;
   IRDirty*   di;
   Event*     ev;
   Event*     ev2;
   Event*     ev3;

   if (!clgs->seen_before) {
       // extend event sets as needed
       // available sets: D0 Dr
       for(i=0; i<clgs->events_used; i++) {
	   ev  = &clgs->events[i];
	   switch(ev->tag) {
	   case Ev_Ir:
	       // Ir event always is first for a guest instruction
	       CLG_ASSERT(ev->inode->eventset == 0);
	       ev->inode->eventset = CLG_(sets).base;
	       break;
	   case Ev_Dr:
               // extend event set by Dr counters
	       ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
							   EG_DR);
	       break;
	   case Ev_Dw:
	   case Ev_Dm:
               // extend event set by Dw counters
	       ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
							   EG_DW);
	       break;
           case Ev_Bc:
               // extend event set by Bc counters
               ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
                                                           EG_BC);
               break;
           case Ev_Bi:
               // extend event set by Bi counters
               ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
                                                           EG_BI);
               break;
	   case Ev_G:
               // extend event set by Bus counter
	       ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
							   EG_BUS);
	       break;
	   default:
	       tl_assert(0);
	   }
       }
   }

   /* Walk the queue; each iteration consumes one event or a merged
      group of events (i advances by inew - i). */
   for(i = 0; i < clgs->events_used; i = inew) {

      helperName = NULL;
      helperAddr = NULL;
      argv       = NULL;
      regparms   = 0;

      /* generate IR to notify event i and possibly the ones
	 immediately following it. */
      tl_assert(i >= 0 && i < clgs->events_used);

      /* lookahead of up to two events for merging */
      ev  = &clgs->events[i];
      ev2 = ( i < clgs->events_used-1 ? &clgs->events[i+1] : NULL );
      ev3 = ( i < clgs->events_used-2 ? &clgs->events[i+2] : NULL );

      CLG_DEBUGIF(5) {
	 VG_(printf)("   flush ");
	 showEvent( ev );
      }

      i_node_expr = mkIRExpr_HWord( (HWord)ev->inode );

      /* Decide on helper fn to call and args to pass it, and advance
	 i appropriately.
	 Dm events have same effect as Dw events */
      switch (ev->tag) {
	 case Ev_Ir:
	    /* Merge an Ir with a following Dr. */
	    if (ev2 && ev2->tag == Ev_Dr) {
	       /* Why is this true?  It's because we're merging an Ir
		  with a following Dr.  The Ir derives from the
		  instruction's IMark and the Dr from data
		  references which follow it.  In short it holds
		  because each insn starts with an IMark, hence an
		  Ev_Ir, and so these Dr must pertain to the
		  immediately preceding Ir.  Same applies to analogous
		  assertions in the subsequent cases. */
	       tl_assert(ev2->inode == ev->inode);
	       helperName = CLG_(cachesim).log_1I1Dr_name;
	       helperAddr = CLG_(cachesim).log_1I1Dr;
	       argv = mkIRExprVec_3( i_node_expr,
				     get_Event_dea(ev2),
				     mkIRExpr_HWord( get_Event_dszB(ev2) ) );
	       regparms = 3;
	       inew = i+2;
	    }
	    /* Merge an Ir with a following Dw/Dm. */
	    else
	    if (ev2 && (ev2->tag == Ev_Dw || ev2->tag == Ev_Dm)) {
	       tl_assert(ev2->inode == ev->inode);
	       helperName = CLG_(cachesim).log_1I1Dw_name;
	       helperAddr = CLG_(cachesim).log_1I1Dw;
	       argv = mkIRExprVec_3( i_node_expr,
				     get_Event_dea(ev2),
				     mkIRExpr_HWord( get_Event_dszB(ev2) ) );
	       regparms = 3;
	       inew = i+2;
	    }
	    /* Merge an Ir with two following Irs. */
	    else
	    if (ev2 && ev3 && ev2->tag == Ev_Ir && ev3->tag == Ev_Ir) {
	       helperName = CLG_(cachesim).log_3I0D_name;
	       helperAddr = CLG_(cachesim).log_3I0D;
	       argv = mkIRExprVec_3( i_node_expr,
				     mkIRExpr_HWord( (HWord)ev2->inode ),
				     mkIRExpr_HWord( (HWord)ev3->inode ) );
	       regparms = 3;
	       inew = i+3;
	    }
	    /* Merge an Ir with one following Ir. */
	    else
	    if (ev2 && ev2->tag == Ev_Ir) {
	       helperName = CLG_(cachesim).log_2I0D_name;
	       helperAddr = CLG_(cachesim).log_2I0D;
	       argv = mkIRExprVec_2( i_node_expr,
				     mkIRExpr_HWord( (HWord)ev2->inode ) );
	       regparms = 2;
	       inew = i+2;
	    }
	    /* No merging possible; emit as-is. */
	    else {
	       helperName = CLG_(cachesim).log_1I0D_name;
	       helperAddr = CLG_(cachesim).log_1I0D;
	       argv = mkIRExprVec_1( i_node_expr );
	       regparms = 1;
	       inew = i+1;
	    }
	    break;
	 case Ev_Dr:
	    /* Data read or modify */
	    helperName = CLG_(cachesim).log_0I1Dr_name;
	    helperAddr = CLG_(cachesim).log_0I1Dr;
	    argv = mkIRExprVec_3( i_node_expr,
				  get_Event_dea(ev),
				  mkIRExpr_HWord( get_Event_dszB(ev) ) );
	    regparms = 3;
	    inew = i+1;
	    break;
	 case Ev_Dw:
	 case Ev_Dm:
	    /* Data write */
	    helperName = CLG_(cachesim).log_0I1Dw_name;
	    helperAddr = CLG_(cachesim).log_0I1Dw;
	    argv = mkIRExprVec_3( i_node_expr,
				  get_Event_dea(ev),
				  mkIRExpr_HWord( get_Event_dszB(ev) ) );
	    regparms = 3;
	    inew = i+1;
	    break;
         case Ev_Bc:
            /* Conditional branch */
            helperName = "log_cond_branch";
            helperAddr = &log_cond_branch;
            argv = mkIRExprVec_2( i_node_expr, ev->Ev.Bc.taken );
            regparms = 2;
            inew = i+1;
            break;
         case Ev_Bi:
            /* Branch to an unknown destination */
            helperName = "log_ind_branch";
            helperAddr = &log_ind_branch;
            argv = mkIRExprVec_2( i_node_expr, ev->Ev.Bi.dst );
            regparms = 2;
            inew = i+1;
            break;
         case Ev_G:
            /* Global bus event (CAS, LOCK-prefix, LL-SC, etc) */
            helperName = "log_global_event";
            helperAddr = &log_global_event;
            argv = mkIRExprVec_1( i_node_expr );
            regparms = 1;
            inew = i+1;
            break;
	 default:
	    tl_assert(0);
      }

      CLG_DEBUGIF(5) {
	  if (inew > i+1) {
	      VG_(printf)("   merge ");
	      showEvent( ev2 );
	  }
	  if (inew > i+2) {
	      VG_(printf)("   merge ");
	      showEvent( ev3 );
	  }
	  if (helperAddr)
	      VG_(printf)("   call  %s (%p)\n",
			  helperName, helperAddr);
      }

      /* helper could be unset depending on the simulator used */
      if (helperAddr == 0) continue;

      /* Add the helper. */
      tl_assert(helperName);
      tl_assert(helperAddr);
      tl_assert(argv);
      di = unsafeIRDirty_0_N( regparms,
			      helperName, VG_(fnptr_to_fnentry)( helperAddr ),
			      argv );
      addStmtToIRSB( clgs->sbOut, IRStmt_Dirty(di) );
   }

   clgs->events_used = 0;
}
    594 
    595 static void addEvent_Ir ( ClgState* clgs, InstrInfo* inode )
    596 {
    597    Event* evt;
    598    tl_assert(clgs->seen_before || (inode->eventset == 0));
    599    if (!CLG_(clo).simulate_cache) return;
    600 
    601    if (clgs->events_used == N_EVENTS)
    602       flushEvents(clgs);
    603    tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
    604    evt = &clgs->events[clgs->events_used];
    605    init_Event(evt);
    606    evt->tag      = Ev_Ir;
    607    evt->inode    = inode;
    608    clgs->events_used++;
    609 }
    610 
    611 static
    612 void addEvent_Dr ( ClgState* clgs, InstrInfo* inode, Int datasize, IRAtom* ea )
    613 {
    614    Event* evt;
    615    tl_assert(isIRAtom(ea));
    616    tl_assert(datasize >= 1 && datasize <= MIN_LINE_SIZE);
    617    if (!CLG_(clo).simulate_cache) return;
    618 
    619    if (clgs->events_used == N_EVENTS)
    620       flushEvents(clgs);
    621    tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
    622    evt = &clgs->events[clgs->events_used];
    623    init_Event(evt);
    624    evt->tag       = Ev_Dr;
    625    evt->inode     = inode;
    626    evt->Ev.Dr.szB = datasize;
    627    evt->Ev.Dr.ea  = ea;
    628    clgs->events_used++;
    629 }
    630 
    631 static
    632 void addEvent_Dw ( ClgState* clgs, InstrInfo* inode, Int datasize, IRAtom* ea )
    633 {
    634    Event* lastEvt;
    635    Event* evt;
    636    tl_assert(isIRAtom(ea));
    637    tl_assert(datasize >= 1 && datasize <= MIN_LINE_SIZE);
    638    if (!CLG_(clo).simulate_cache) return;
    639 
    640    /* Is it possible to merge this write with the preceding read? */
    641    lastEvt = &clgs->events[clgs->events_used-1];
    642    if (clgs->events_used > 0
    643        && lastEvt->tag       == Ev_Dr
    644        && lastEvt->Ev.Dr.szB == datasize
    645        && lastEvt->inode     == inode
    646        && eqIRAtom(lastEvt->Ev.Dr.ea, ea))
    647    {
    648       lastEvt->tag   = Ev_Dm;
    649       return;
    650    }
    651 
    652    /* No.  Add as normal. */
    653    if (clgs->events_used == N_EVENTS)
    654       flushEvents(clgs);
    655    tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
    656    evt = &clgs->events[clgs->events_used];
    657    init_Event(evt);
    658    evt->tag       = Ev_Dw;
    659    evt->inode     = inode;
    660    evt->Ev.Dw.szB = datasize;
    661    evt->Ev.Dw.ea  = ea;
    662    clgs->events_used++;
    663 }
    664 
    665 static
    666 void addEvent_Bc ( ClgState* clgs, InstrInfo* inode, IRAtom* guard )
    667 {
    668    Event* evt;
    669    tl_assert(isIRAtom(guard));
    670    tl_assert(typeOfIRExpr(clgs->sbOut->tyenv, guard)
    671              == (sizeof(HWord)==4 ? Ity_I32 : Ity_I64));
    672    if (!CLG_(clo).simulate_branch) return;
    673 
    674    if (clgs->events_used == N_EVENTS)
    675       flushEvents(clgs);
    676    tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
    677    evt = &clgs->events[clgs->events_used];
    678    init_Event(evt);
    679    evt->tag         = Ev_Bc;
    680    evt->inode       = inode;
    681    evt->Ev.Bc.taken = guard;
    682    clgs->events_used++;
    683 }
    684 
    685 static
    686 void addEvent_Bi ( ClgState* clgs, InstrInfo* inode, IRAtom* whereTo )
    687 {
    688    Event* evt;
    689    tl_assert(isIRAtom(whereTo));
    690    tl_assert(typeOfIRExpr(clgs->sbOut->tyenv, whereTo)
    691              == (sizeof(HWord)==4 ? Ity_I32 : Ity_I64));
    692    if (!CLG_(clo).simulate_branch) return;
    693 
    694    if (clgs->events_used == N_EVENTS)
    695       flushEvents(clgs);
    696    tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
    697    evt = &clgs->events[clgs->events_used];
    698    init_Event(evt);
    699    evt->tag       = Ev_Bi;
    700    evt->inode     = inode;
    701    evt->Ev.Bi.dst = whereTo;
    702    clgs->events_used++;
    703 }
    704 
    705 static
    706 void addEvent_G ( ClgState* clgs, InstrInfo* inode )
    707 {
    708    Event* evt;
    709    if (!CLG_(clo).collect_bus) return;
    710 
    711    if (clgs->events_used == N_EVENTS)
    712       flushEvents(clgs);
    713    tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
    714    evt = &clgs->events[clgs->events_used];
    715    init_Event(evt);
    716    evt->tag       = Ev_G;
    717    evt->inode     = inode;
    718    clgs->events_used++;
    719 }
    720 
    721 /* Initialise or check (if already seen before) an InstrInfo for next insn.
    722    We only can set instr_offset/instr_size here. The required event set and
    723    resulting cost offset depend on events (Ir/Dr/Dw/Dm) in guest
    724    instructions. The event set is extended as required on flush of the event
    725    queue (when Dm events were determined), cost offsets are determined at
    726    end of BB instrumentation. */
    727 static
    728 InstrInfo* next_InstrInfo ( ClgState* clgs, UInt instr_size )
    729 {
    730    InstrInfo* ii;
    731    tl_assert(clgs->ii_index >= 0);
    732    tl_assert(clgs->ii_index < clgs->bb->instr_count);
    733    ii = &clgs->bb->instr[ clgs->ii_index ];
    734 
    735    if (clgs->seen_before) {
    736        CLG_ASSERT(ii->instr_offset == clgs->instr_offset);
    737        CLG_ASSERT(ii->instr_size == instr_size);
    738    }
    739    else {
    740        ii->instr_offset = clgs->instr_offset;
    741        ii->instr_size = instr_size;
    742        ii->cost_offset = 0;
    743        ii->eventset = 0;
    744    }
    745 
    746    clgs->ii_index++;
    747    clgs->instr_offset += instr_size;
    748    CLG_(stat).distinct_instrs++;
    749 
    750    return ii;
    751 }
    752 
    753 // return total number of cost values needed for this BB
    754 static
    755 UInt update_cost_offsets( ClgState* clgs )
    756 {
    757     Int i;
    758     InstrInfo* ii;
    759     UInt cost_offset = 0;
    760 
    761     CLG_ASSERT(clgs->bb->instr_count == clgs->ii_index);
    762     for(i=0; i<clgs->ii_index; i++) {
    763 	ii = &clgs->bb->instr[i];
    764 	if (clgs->seen_before) {
    765 	    CLG_ASSERT(ii->cost_offset == cost_offset);
    766 	} else
    767 	    ii->cost_offset = cost_offset;
    768 	cost_offset += ii->eventset ? ii->eventset->size : 0;
    769     }
    770 
    771     return cost_offset;
    772 }
    773 
    774 /*------------------------------------------------------------*/
    775 /*--- Instrumentation                                      ---*/
    776 /*------------------------------------------------------------*/
    777 
    778 #if defined(VG_BIGENDIAN)
    779 # define CLGEndness Iend_BE
    780 #elif defined(VG_LITTLEENDIAN)
    781 # define CLGEndness Iend_LE
    782 #else
    783 # error "Unknown endianness"
    784 #endif
    785 
    786 static
    787 Addr IRConst2Addr(IRConst* con)
    788 {
    789     Addr addr;
    790 
    791     if (sizeof(Addr) == 4) {
    792 	CLG_ASSERT( con->tag == Ico_U32 );
    793 	addr = con->Ico.U32;
    794     }
    795     else if (sizeof(Addr) == 8) {
    796 	CLG_ASSERT( con->tag == Ico_U64 );
    797 	addr = con->Ico.U64;
    798     }
    799     else
    800 	VG_(tool_panic)("Callgrind: invalid Addr type");
    801 
    802     return addr;
    803 }
    804 
/* First pass over a BB to instrument, counting instructions and
 * conditional jumps.  This is needed to know the size of the BB struct
 * to allocate.
 *
 * Called from CLG_(get_bb)
 */
void CLG_(collectBlockInfo)(IRSB* sbIn,
			    /*INOUT*/ UInt* instrs,
			    /*INOUT*/ UInt* cjmps,
			    /*INOUT*/ Bool* cjmp_inverted)
{
    Int i;
    IRStmt* st;
    Addr instrAddr =0, jumpDst;
    UInt instrLen = 0;
    Bool toNextInstr = False;

    // Ist_Exit has to be ignored in preamble code, before first IMark:
    // preamble code is added by VEX for self modifying code, and has
    // nothing to do with client code
    Bool inPreamble = True;

    if (!sbIn) return;

    for (i = 0; i < sbIn->stmts_used; i++) {
	  st = sbIn->stmts[i];
	  if (Ist_IMark == st->tag) {
	      inPreamble = False;

	      instrAddr = (Addr)ULong_to_Ptr(st->Ist.IMark.addr);
	      instrLen  = st->Ist.IMark.len;

	      (*instrs)++;
	      toNextInstr = False;
	  }
	  if (inPreamble) continue;
	  if (Ist_Exit == st->tag) {
	      jumpDst = IRConst2Addr(st->Ist.Exit.dst);
	      // does this conditional jump target the fall-through address?
	      toNextInstr =  (jumpDst == instrAddr + instrLen);

	      (*cjmps)++;
	  }
    }

    /* If the last instruction of the BB conditionally jumps to the next
     * instruction (= first instruction of the next BB in memory), the
     * jump condition was inverted by VEX.
     */
    *cjmp_inverted = toNextInstr;
}
    853 
    854 static
    855 void addConstMemStoreStmt( IRSB* bbOut, UWord addr, UInt val, IRType hWordTy)
    856 {
    857     addStmtToIRSB( bbOut,
    858 		   IRStmt_Store(CLGEndness,
    859 				IRExpr_Const(hWordTy == Ity_I32 ?
    860 					     IRConst_U32( addr ) :
    861 					     IRConst_U64( addr )),
    862 				IRExpr_Const(IRConst_U32(val)) ));
    863 }
    864 
    865 
    866 /* add helper call to setup_bbcc, with pointer to BB struct as argument
    867  *
    868  * precondition for setup_bbcc:
    869  * - jmps_passed has number of cond.jumps passed in last executed BB
    870  * - current_bbcc has a pointer to the BBCC of the last executed BB
    871  *   Thus, if bbcc_jmpkind is != -1 (JmpNone),
    872  *     current_bbcc->bb->jmp_addr
    873  *   gives the address of the jump source.
    874  *
    875  * the setup does 2 things:
    876  * - trace call:
    877  *   * Unwind own call stack, i.e sync our ESP with real ESP
    878  *     This is for ESP manipulation (longjmps, C++ exec handling) and RET
    879  *   * For CALLs or JMPs crossing objects, record call arg +
    880  *     push are on own call stack
    881  *
    882  * - prepare for cache log functions:
    883  *   set current_bbcc to BBCC that gets the costs for this BB execution
    884  *   attached
    885  */
    886 static
    887 void addBBSetupCall(ClgState* clgs)
    888 {
    889    IRDirty* di;
    890    IRExpr  *arg1, **argv;
    891 
    892    arg1 = mkIRExpr_HWord( (HWord)clgs->bb );
    893    argv = mkIRExprVec_1(arg1);
    894    di = unsafeIRDirty_0_N( 1, "setup_bbcc",
    895 			      VG_(fnptr_to_fnentry)( & CLG_(setup_bbcc) ),
    896 			      argv);
    897    addStmtToIRSB( clgs->sbOut, IRStmt_Dirty(di) );
    898 }
    899 
    900 
    901 static
    902 IRSB* CLG_(instrument)( VgCallbackClosure* closure,
    903 			IRSB* sbIn,
    904 			VexGuestLayout* layout,
    905 			VexGuestExtents* vge,
    906 			IRType gWordTy, IRType hWordTy )
    907 {
    908    Int      i, isize;
    909    IRStmt*  st;
    910    Addr     origAddr;
    911    Addr64   cia; /* address of current insn */
    912    InstrInfo* curr_inode = NULL;
    913    ClgState clgs;
    914    UInt     cJumps = 0;
    915 
    916 
    917    if (gWordTy != hWordTy) {
    918       /* We don't currently support this case. */
    919       VG_(tool_panic)("host/guest word size mismatch");
    920    }
    921 
    922    // No instrumentation if it is switched off
    923    if (! CLG_(instrument_state)) {
    924        CLG_DEBUG(5, "instrument(BB %#lx) [Instrumentation OFF]\n",
    925 		 (Addr)closure->readdr);
    926        return sbIn;
    927    }
    928 
    929    CLG_DEBUG(3, "+ instrument(BB %#lx)\n", (Addr)closure->readdr);
    930 
    931    /* Set up SB for instrumented IR */
    932    clgs.sbOut = deepCopyIRSBExceptStmts(sbIn);
    933 
    934    // Copy verbatim any IR preamble preceding the first IMark
    935    i = 0;
    936    while (i < sbIn->stmts_used && sbIn->stmts[i]->tag != Ist_IMark) {
    937       addStmtToIRSB( clgs.sbOut, sbIn->stmts[i] );
    938       i++;
    939    }
    940 
    941    // Get the first statement, and origAddr from it
    942    CLG_ASSERT(sbIn->stmts_used >0);
    943    CLG_ASSERT(i < sbIn->stmts_used);
    944    st = sbIn->stmts[i];
    945    CLG_ASSERT(Ist_IMark == st->tag);
    946 
    947    origAddr = (Addr)st->Ist.IMark.addr;
    948    cia   = st->Ist.IMark.addr;
    949    isize = st->Ist.IMark.len;
    950    CLG_ASSERT(origAddr == st->Ist.IMark.addr);  // XXX: check no overflow
    951 
    952    /* Get BB struct (creating if necessary).
    953     * JS: The hash table is keyed with orig_addr_noredir -- important!
    954     * JW: Why? If it is because of different chasing of the redirection,
    955     *     this is not needed, as chasing is switched off in callgrind
    956     */
    957    clgs.bb = CLG_(get_bb)(origAddr, sbIn, &(clgs.seen_before));
    958 
    959    addBBSetupCall(&clgs);
    960 
    961    // Set up running state
    962    clgs.events_used = 0;
    963    clgs.ii_index = 0;
    964    clgs.instr_offset = 0;
    965 
    966    for (/*use current i*/; i < sbIn->stmts_used; i++) {
    967 
    968       st = sbIn->stmts[i];
    969       CLG_ASSERT(isFlatIRStmt(st));
    970 
    971       switch (st->tag) {
    972 	 case Ist_NoOp:
    973 	 case Ist_AbiHint:
    974 	 case Ist_Put:
    975 	 case Ist_PutI:
    976 	 case Ist_MBE:
    977 	    break;
    978 
    979 	 case Ist_IMark: {
    980             cia   = st->Ist.IMark.addr;
    981             isize = st->Ist.IMark.len;
    982             CLG_ASSERT(clgs.instr_offset == (Addr)cia - origAddr);
    983 	    // If Vex fails to decode an instruction, the size will be zero.
    984 	    // Pretend otherwise.
    985 	    if (isize == 0) isize = VG_MIN_INSTR_SZB;
    986 
    987 	    // Sanity-check size.
    988 	    tl_assert( (VG_MIN_INSTR_SZB <= isize && isize <= VG_MAX_INSTR_SZB)
    989 		     || VG_CLREQ_SZB == isize );
    990 
    991 	    // Init the inode, record it as the current one.
    992 	    // Subsequent Dr/Dw/Dm events from the same instruction will
    993 	    // also use it.
    994 	    curr_inode = next_InstrInfo (&clgs, isize);
    995 
    996 	    addEvent_Ir( &clgs, curr_inode );
    997 	    break;
    998 	 }
    999 
   1000 	 case Ist_WrTmp: {
   1001 	    IRExpr* data = st->Ist.WrTmp.data;
   1002 	    if (data->tag == Iex_Load) {
   1003 	       IRExpr* aexpr = data->Iex.Load.addr;
   1004 	       // Note also, endianness info is ignored.  I guess
   1005 	       // that's not interesting.
   1006 	       addEvent_Dr( &clgs, curr_inode,
   1007 			    sizeofIRType(data->Iex.Load.ty), aexpr );
   1008 	    }
   1009 	    break;
   1010 	 }
   1011 
   1012 	 case Ist_Store: {
   1013 	    IRExpr* data  = st->Ist.Store.data;
   1014 	    IRExpr* aexpr = st->Ist.Store.addr;
   1015 	    addEvent_Dw( &clgs, curr_inode,
   1016 			 sizeofIRType(typeOfIRExpr(sbIn->tyenv, data)), aexpr );
   1017 	    break;
   1018 	 }
   1019 
   1020 	 case Ist_Dirty: {
   1021 	    Int      dataSize;
   1022 	    IRDirty* d = st->Ist.Dirty.details;
   1023 	    if (d->mFx != Ifx_None) {
   1024 	       /* This dirty helper accesses memory.  Collect the details. */
   1025 	       tl_assert(d->mAddr != NULL);
   1026 	       tl_assert(d->mSize != 0);
   1027 	       dataSize = d->mSize;
   1028 	       // Large (eg. 28B, 108B, 512B on x86) data-sized
   1029 	       // instructions will be done inaccurately, but they're
   1030 	       // very rare and this avoids errors from hitting more
   1031 	       // than two cache lines in the simulation.
   1032 	       if (dataSize > MIN_LINE_SIZE)
   1033 		  dataSize = MIN_LINE_SIZE;
   1034 	       if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify)
   1035 		  addEvent_Dr( &clgs, curr_inode, dataSize, d->mAddr );
   1036 	       if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify)
   1037 		  addEvent_Dw( &clgs, curr_inode, dataSize, d->mAddr );
   1038 	    } else {
   1039 	       tl_assert(d->mAddr == NULL);
   1040 	       tl_assert(d->mSize == 0);
   1041 	    }
   1042 	    break;
   1043 	 }
   1044 
   1045          case Ist_CAS: {
   1046             /* We treat it as a read and a write of the location.  I
   1047                think that is the same behaviour as it was before IRCAS
   1048                was introduced, since prior to that point, the Vex
   1049                front ends would translate a lock-prefixed instruction
   1050                into a (normal) read followed by a (normal) write. */
   1051             Int    dataSize;
   1052             IRCAS* cas = st->Ist.CAS.details;
   1053             CLG_ASSERT(cas->addr && isIRAtom(cas->addr));
   1054             CLG_ASSERT(cas->dataLo);
   1055             dataSize = sizeofIRType(typeOfIRExpr(sbIn->tyenv, cas->dataLo));
   1056             if (cas->dataHi != NULL)
   1057                dataSize *= 2; /* since this is a doubleword-cas */
   1058             addEvent_Dr( &clgs, curr_inode, dataSize, cas->addr );
   1059             addEvent_Dw( &clgs, curr_inode, dataSize, cas->addr );
   1060             addEvent_G(  &clgs, curr_inode );
   1061             break;
   1062          }
   1063 
   1064          case Ist_LLSC: {
   1065             IRType dataTy;
   1066             if (st->Ist.LLSC.storedata == NULL) {
   1067                /* LL */
   1068                dataTy = typeOfIRTemp(sbIn->tyenv, st->Ist.LLSC.result);
   1069                addEvent_Dr( &clgs, curr_inode,
   1070                             sizeofIRType(dataTy), st->Ist.LLSC.addr );
   1071             } else {
   1072                /* SC */
   1073                dataTy = typeOfIRExpr(sbIn->tyenv, st->Ist.LLSC.storedata);
   1074                addEvent_Dw( &clgs, curr_inode,
   1075                             sizeofIRType(dataTy), st->Ist.LLSC.addr );
   1076                /* I don't know whether the global-bus-lock cost should
   1077                   be attributed to the LL or the SC, but it doesn't
   1078                   really matter since they always have to be used in
   1079                   pairs anyway.  Hence put it (quite arbitrarily) on
   1080                   the SC. */
   1081                addEvent_G(  &clgs, curr_inode );
   1082             }
   1083             break;
   1084          }
   1085 
   1086  	 case Ist_Exit: {
   1087             Bool guest_exit, inverted;
   1088 
   1089             /* VEX code generation sometimes inverts conditional branches.
   1090              * As Callgrind counts (conditional) jumps, it has to correct
   1091              * inversions. The heuristic is the following:
   1092              * (1) Callgrind switches off SB chasing and unrolling, and
   1093              *     therefore it assumes that a candidate for inversion only is
   1094              *     the last conditional branch in an SB.
   1095              * (2) inversion is assumed if the branch jumps to the address of
   1096              *     the next guest instruction in memory.
   1097              * This heuristic is precalculated in CLG_(collectBlockInfo)().
   1098              *
   1099              * Branching behavior is also used for branch prediction. Note that
   1100              * above heuristic is different from what Cachegrind does.
   1101              * Cachegrind uses (2) for all branches.
   1102              */
   1103             if (cJumps+1 == clgs.bb->cjmp_count)
   1104                 inverted = clgs.bb->cjmp_inverted;
   1105             else
   1106                 inverted = False;
   1107 
   1108             // call branch predictor only if this is a branch in guest code
   1109             guest_exit = (st->Ist.Exit.jk == Ijk_Boring) ||
   1110                          (st->Ist.Exit.jk == Ijk_Call) ||
   1111                          (st->Ist.Exit.jk == Ijk_Ret);
   1112 
   1113             if (guest_exit) {
   1114                 /* Stuff to widen the guard expression to a host word, so
   1115                    we can pass it to the branch predictor simulation
   1116                    functions easily. */
   1117                 IRType   tyW    = hWordTy;
   1118                 IROp     widen  = tyW==Ity_I32  ? Iop_1Uto32  : Iop_1Uto64;
   1119                 IROp     opXOR  = tyW==Ity_I32  ? Iop_Xor32   : Iop_Xor64;
   1120                 IRTemp   guard1 = newIRTemp(clgs.sbOut->tyenv, Ity_I1);
   1121                 IRTemp   guardW = newIRTemp(clgs.sbOut->tyenv, tyW);
   1122                 IRTemp   guard  = newIRTemp(clgs.sbOut->tyenv, tyW);
   1123                 IRExpr*  one    = tyW==Ity_I32 ? IRExpr_Const(IRConst_U32(1))
   1124                                                : IRExpr_Const(IRConst_U64(1));
   1125 
   1126                 /* Widen the guard expression. */
   1127                 addStmtToIRSB( clgs.sbOut,
   1128                                IRStmt_WrTmp( guard1, st->Ist.Exit.guard ));
   1129                 addStmtToIRSB( clgs.sbOut,
   1130                                IRStmt_WrTmp( guardW,
   1131                                              IRExpr_Unop(widen,
   1132                                                          IRExpr_RdTmp(guard1))) );
   1133                 /* If the exit is inverted, invert the sense of the guard. */
   1134                 addStmtToIRSB(
   1135                         clgs.sbOut,
   1136                         IRStmt_WrTmp(
   1137                                 guard,
   1138                                 inverted ? IRExpr_Binop(opXOR, IRExpr_RdTmp(guardW), one)
   1139                                     : IRExpr_RdTmp(guardW)
   1140                                     ));
   1141                 /* And post the event. */
   1142                 addEvent_Bc( &clgs, curr_inode, IRExpr_RdTmp(guard) );
   1143             }
   1144 
   1145 	    /* We may never reach the next statement, so need to flush
   1146 	       all outstanding transactions now. */
   1147 	    flushEvents( &clgs );
   1148 
   1149 	    CLG_ASSERT(clgs.ii_index>0);
   1150 	    if (!clgs.seen_before) {
   1151 		clgs.bb->jmp[cJumps].instr = clgs.ii_index-1;
   1152 		clgs.bb->jmp[cJumps].skip = False;
   1153 	    }
   1154 
   1155 	    /* Update global variable jmps_passed before the jump
   1156 	     * A correction is needed if VEX inverted the last jump condition
   1157 	    */
   1158 	    addConstMemStoreStmt( clgs.sbOut,
   1159 				  (UWord) &CLG_(current_state).jmps_passed,
   1160                                   inverted ? cJumps+1 : cJumps, hWordTy);
   1161 	    cJumps++;
   1162 
   1163 	    break;
   1164 	 }
   1165 
   1166 	 default:
   1167 	    tl_assert(0);
   1168 	    break;
   1169       }
   1170 
   1171       /* Copy the original statement */
   1172       addStmtToIRSB( clgs.sbOut, st );
   1173 
   1174       CLG_DEBUGIF(5) {
   1175 	 VG_(printf)("   pass  ");
   1176 	 ppIRStmt(st);
   1177 	 VG_(printf)("\n");
   1178       }
   1179    }
   1180 
   1181    /* Deal with branches to unknown destinations.  Except ignore ones
   1182       which are function returns as we assume the return stack
   1183       predictor never mispredicts. */
   1184    if ((sbIn->jumpkind == Ijk_Boring) || (sbIn->jumpkind == Ijk_Call)) {
   1185       if (0) { ppIRExpr( sbIn->next ); VG_(printf)("\n"); }
   1186       switch (sbIn->next->tag) {
   1187          case Iex_Const:
   1188             break; /* boring - branch to known address */
   1189          case Iex_RdTmp:
   1190             /* looks like an indirect branch (branch to unknown) */
   1191             addEvent_Bi( &clgs, curr_inode, sbIn->next );
   1192             break;
   1193          default:
   1194             /* shouldn't happen - if the incoming IR is properly
   1195                flattened, should only have tmp and const cases to
   1196                consider. */
   1197             tl_assert(0);
   1198       }
   1199    }
   1200 
   1201    /* At the end of the bb.  Flush outstandings. */
   1202    flushEvents( &clgs );
   1203 
   1204    /* Always update global variable jmps_passed at end of bb.
   1205     * A correction is needed if VEX inverted the last jump condition
   1206     */
   1207    {
   1208       UInt jmps_passed = cJumps;
   1209       if (clgs.bb->cjmp_inverted) jmps_passed--;
   1210       addConstMemStoreStmt( clgs.sbOut,
   1211 			    (UWord) &CLG_(current_state).jmps_passed,
   1212 			    jmps_passed, hWordTy);
   1213    }
   1214    CLG_ASSERT(clgs.bb->cjmp_count == cJumps);
   1215    CLG_ASSERT(clgs.bb->instr_count = clgs.ii_index);
   1216 
   1217    /* This stores the instr of the call/ret at BB end */
   1218    clgs.bb->jmp[cJumps].instr = clgs.ii_index-1;
   1219 
   1220    if (clgs.seen_before) {
   1221        CLG_ASSERT(clgs.bb->cost_count == update_cost_offsets(&clgs));
   1222        CLG_ASSERT(clgs.bb->instr_len = clgs.instr_offset);
   1223        CLG_ASSERT(clgs.bb->jmpkind == sbIn->jumpkind);
   1224    }
   1225    else {
   1226        clgs.bb->cost_count = update_cost_offsets(&clgs);
   1227        clgs.bb->instr_len = clgs.instr_offset;
   1228        clgs.bb->jmpkind = sbIn->jumpkind;
   1229    }
   1230 
   1231    CLG_DEBUG(3, "- instrument(BB %#lx): byteLen %u, CJumps %u, CostLen %u\n",
   1232 	     origAddr, clgs.bb->instr_len,
   1233 	     clgs.bb->cjmp_count, clgs.bb->cost_count);
   1234    if (cJumps>0) {
   1235        CLG_DEBUG(3, "                     [ ");
   1236        for (i=0;i<cJumps;i++)
   1237 	   CLG_DEBUG(3, "%d ", clgs.bb->jmp[i].instr);
   1238        CLG_DEBUG(3, "], last inverted: %s \n",
   1239 		 clgs.bb->cjmp_inverted ? "yes":"no");
   1240    }
   1241 
   1242   return clgs.sbOut;
   1243 }
   1244 
   1245 /*--------------------------------------------------------------------*/
   1246 /*--- Discarding BB info                                           ---*/
   1247 /*--------------------------------------------------------------------*/
   1248 
   1249 // Called when a translation is removed from the translation cache for
   1250 // any reason at all: to free up space, because the guest code was
   1251 // unmapped or modified, or for any arbitrary reason.
   1252 static
   1253 void clg_discard_superblock_info ( Addr64 orig_addr64, VexGuestExtents vge )
   1254 {
   1255     Addr orig_addr = (Addr)orig_addr64;
   1256 
   1257     tl_assert(vge.n_used > 0);
   1258 
   1259    if (0)
   1260       VG_(printf)( "discard_superblock_info: %p, %p, %llu\n",
   1261                    (void*)(Addr)orig_addr,
   1262                    (void*)(Addr)vge.base[0], (ULong)vge.len[0]);
   1263 
   1264    // Get BB info, remove from table, free BB info.  Simple!  Note that we
   1265    // use orig_addr, not the first instruction address in vge.
   1266    CLG_(delete_bb)(orig_addr);
   1267 }
   1268 
   1269 
   1270 /*------------------------------------------------------------*/
   1271 /*--- CLG_(fini)() and related function                     ---*/
   1272 /*------------------------------------------------------------*/
   1273 
   1274 
   1275 
   1276 static void zero_thread_cost(thread_info* t)
   1277 {
   1278   Int i;
   1279 
   1280   for(i = 0; i < CLG_(current_call_stack).sp; i++) {
   1281     if (!CLG_(current_call_stack).entry[i].jcc) continue;
   1282 
   1283     /* reset call counters to current for active calls */
   1284     CLG_(copy_cost)( CLG_(sets).full,
   1285 		    CLG_(current_call_stack).entry[i].enter_cost,
   1286 		    CLG_(current_state).cost );
   1287     CLG_(current_call_stack).entry[i].jcc->call_counter = 0;
   1288   }
   1289 
   1290   CLG_(forall_bbccs)(CLG_(zero_bbcc));
   1291 
   1292   /* set counter for last dump */
   1293   CLG_(copy_cost)( CLG_(sets).full,
   1294 		  t->lastdump_cost, CLG_(current_state).cost );
   1295 }
   1296 
   1297 void CLG_(zero_all_cost)(Bool only_current_thread)
   1298 {
   1299   if (VG_(clo_verbosity) > 1)
   1300     VG_(message)(Vg_DebugMsg, "  Zeroing costs...\n");
   1301 
   1302   if (only_current_thread)
   1303     zero_thread_cost(CLG_(get_current_thread)());
   1304   else
   1305     CLG_(forall_threads)(zero_thread_cost);
   1306 
   1307   if (VG_(clo_verbosity) > 1)
   1308     VG_(message)(Vg_DebugMsg, "  ...done\n");
   1309 }
   1310 
   1311 static
   1312 void unwind_thread(thread_info* t)
   1313 {
   1314   /* unwind signal handlers */
   1315   while(CLG_(current_state).sig !=0)
   1316     CLG_(post_signal)(CLG_(current_tid),CLG_(current_state).sig);
   1317 
   1318   /* unwind regular call stack */
   1319   while(CLG_(current_call_stack).sp>0)
   1320     CLG_(pop_call_stack)();
   1321 
   1322   /* reset context and function stack for context generation */
   1323   CLG_(init_exec_state)( &CLG_(current_state) );
   1324   CLG_(current_fn_stack).top = CLG_(current_fn_stack).bottom;
   1325 }
   1326 
/* Callback for CLG_(forall_threads): zero the event counters of the
 * current execution state. The thread argument is unused; the callback
 * runs with the thread already switched in. */
static
void zero_state_cost(thread_info* t)
{
    CLG_(zero_cost)( CLG_(sets).full, CLG_(current_state).cost );
}
   1332 
   1333 /* Ups, this can go very wrong... */
   1334 extern void VG_(discard_translations) ( Addr64 start, ULong range, HChar* who );
   1335 
   1336 void CLG_(set_instrument_state)(Char* reason, Bool state)
   1337 {
   1338   if (CLG_(instrument_state) == state) {
   1339     CLG_DEBUG(2, "%s: instrumentation already %s\n",
   1340 	     reason, state ? "ON" : "OFF");
   1341     return;
   1342   }
   1343   CLG_(instrument_state) = state;
   1344   CLG_DEBUG(2, "%s: Switching instrumentation %s ...\n",
   1345 	   reason, state ? "ON" : "OFF");
   1346 
   1347   VG_(discard_translations)( (Addr64)0x1000, (ULong) ~0xfffl, "callgrind");
   1348 
   1349   /* reset internal state: call stacks, simulator */
   1350   CLG_(forall_threads)(unwind_thread);
   1351   CLG_(forall_threads)(zero_state_cost);
   1352   (*CLG_(cachesim).clear)();
   1353 
   1354   if (VG_(clo_verbosity) > 1)
   1355     VG_(message)(Vg_DebugMsg, "%s: instrumentation switched %s\n",
   1356 		 reason, state ? "ON" : "OFF");
   1357 }
   1358 
   1359 /* helper for dump_state_togdb */
/* helper for dump_state_togdb: print the state of one thread (event
 * totals since the last dump, and one function/calls/events triple per
 * active call stack frame) in the key-value format understood by
 * callgrind_control. */
static void dump_state_of_thread_togdb(thread_info* ti)
{
    static Char buf[512];
    static FullCost sum = 0, tmp = 0;  /* lazily allocated scratch costs */
    Int t, p, i;
    BBCC *from, *to;
    call_entry* ce;

    t = CLG_(current_tid);
    CLG_(init_cost_lz)( CLG_(sets).full, &sum );
    /* compute cost since last dump into "sum", and update
     * lastdump_cost via the tmp copy: tmp saves the old baseline while
     * add_diff_cost advances lastdump_cost to the current total, then
     * the saved baseline is written back so the dump stays read-only. */
    CLG_(copy_cost_lz)( CLG_(sets).full, &tmp, ti->lastdump_cost );
    CLG_(add_diff_cost)( CLG_(sets).full, sum, ti->lastdump_cost,
			 ti->states.entry[0]->cost);
    CLG_(copy_cost)( CLG_(sets).full, ti->lastdump_cost, tmp );
    CLG_(sprint_mappingcost)(buf, CLG_(dumpmap), sum);
    VG_(gdb_printf)("events-%d: %s\n", t, buf);
    VG_(gdb_printf)("frames-%d: %d\n", t, CLG_(current_call_stack).sp);

    ce = 0;
    for(i = 0; i < CLG_(current_call_stack).sp; i++) {
      ce = CLG_(get_call_entry)(i);
      /* if this frame is skipped, we don't have counters */
      if (!ce->jcc) continue;

      from = ce->jcc->from;
      VG_(gdb_printf)("function-%d-%d: %s\n",t, i, from->cxt->fn[0]->name);
      VG_(gdb_printf)("calls-%d-%d: %llu\n",t, i, ce->jcc->call_counter);

      /* FIXME: EventSets! */
      /* same save/advance/restore pattern as above, here for the cost
       * accumulated inside this call (enter_cost vs current cost) */
      CLG_(copy_cost)( CLG_(sets).full, sum, ce->jcc->cost );
      CLG_(copy_cost)( CLG_(sets).full, tmp, ce->enter_cost );
      CLG_(add_diff_cost)( CLG_(sets).full, sum,
			  ce->enter_cost, CLG_(current_state).cost );
      CLG_(copy_cost)( CLG_(sets).full, ce->enter_cost, tmp );

      p = VG_(sprintf)(buf, "events-%d-%d: ",t, i);
      CLG_(sprint_mappingcost)(buf + p, CLG_(dumpmap), sum );
      VG_(gdb_printf)("%s\n", buf);
    }
    /* after the loop, ce is the innermost non-skipped frame (if any)
     * and i == sp: print the function currently executing */
    if (ce && ce->jcc) {
      to = ce->jcc->to;
      VG_(gdb_printf)("function-%d-%d: %s\n",t, i, to->cxt->fn[0]->name );
    }
}
   1404 
   1405 /* Dump current state */
   1406 static void dump_state_togdb(void)
   1407 {
   1408     static Char buf[512];
   1409     thread_info** th;
   1410     int t, p;
   1411     Int orig_tid = CLG_(current_tid);
   1412 
   1413     VG_(gdb_printf)("instrumentation: %s\n",
   1414 		    CLG_(instrument_state) ? "on":"off");
   1415     if (!CLG_(instrument_state)) return;
   1416 
   1417     VG_(gdb_printf)("executed-bbs: %llu\n", CLG_(stat).bb_executions);
   1418     VG_(gdb_printf)("executed-calls: %llu\n", CLG_(stat).call_counter);
   1419     VG_(gdb_printf)("distinct-bbs: %d\n", CLG_(stat).distinct_bbs);
   1420     VG_(gdb_printf)("distinct-calls: %d\n", CLG_(stat).distinct_jccs);
   1421     VG_(gdb_printf)("distinct-functions: %d\n", CLG_(stat).distinct_fns);
   1422     VG_(gdb_printf)("distinct-contexts: %d\n", CLG_(stat).distinct_contexts);
   1423 
   1424     /* "events:" line. Given here because it will be dynamic in the future */
   1425     p = VG_(sprintf)(buf, "events: ");
   1426     CLG_(sprint_eventmapping)(buf+p, CLG_(dumpmap));
   1427     VG_(gdb_printf)("%s\n", buf);
   1428     /* "part:" line (number of last part. Is 0 at start */
   1429     VG_(gdb_printf)("part: %d\n", CLG_(get_dump_counter)());
   1430 
   1431     /* threads */
   1432     th = CLG_(get_threads)();
   1433     p = VG_(sprintf)(buf, "threads:");
   1434     for(t=1;t<VG_N_THREADS;t++) {
   1435 	if (!th[t]) continue;
   1436 	p += VG_(sprintf)(buf+p, " %d", t);
   1437     }
   1438     VG_(gdb_printf)("%s\n", buf);
   1439     VG_(gdb_printf)("current-tid: %d\n", orig_tid);
   1440     CLG_(forall_threads)(dump_state_of_thread_togdb);
   1441 }
   1442 
   1443 
/* Print the list of supported GDB monitor commands; called for the
 * "help" monitor command (and keyword list in
 * handle_gdb_monitor_command must stay in sync with this text). */
static void print_monitor_help ( void )
{
   VG_(gdb_printf) ("\n");
   VG_(gdb_printf) ("callgrind monitor commands:\n");
   VG_(gdb_printf) ("  dump [<dump_hint>]\n");
   VG_(gdb_printf) ("        dump counters\n");
   VG_(gdb_printf) ("  zero\n");
   VG_(gdb_printf) ("        zero counters\n");
   VG_(gdb_printf) ("  status\n");
   VG_(gdb_printf) ("        print status\n");
   VG_(gdb_printf) ("  instrumentation [on|off]\n");
   VG_(gdb_printf) ("        get/set (if on/off given) instrumentation state\n");
   VG_(gdb_printf) ("\n");
}
   1458 
   1459 /* return True if request recognised, False otherwise */
   1460 static Bool handle_gdb_monitor_command (ThreadId tid, Char *req)
   1461 {
   1462    Char* wcmd;
   1463    Char s[VG_(strlen(req))]; /* copy for strtok_r */
   1464    Char *ssaveptr;
   1465 
   1466    VG_(strcpy) (s, req);
   1467 
   1468    wcmd = VG_(strtok_r) (s, " ", &ssaveptr);
   1469    switch (VG_(keyword_id) ("help dump zero status instrumentation",
   1470                             wcmd, kwd_report_duplicated_matches)) {
   1471    case -2: /* multiple matches */
   1472       return True;
   1473    case -1: /* not found */
   1474       return False;
   1475    case  0: /* help */
   1476       print_monitor_help();
   1477       return True;
   1478    case  1: { /* dump */
   1479       CLG_(dump_profile)(req, False);
   1480       return True;
   1481    }
   1482    case  2: { /* zero */
   1483       CLG_(zero_all_cost)(False);
   1484       return True;
   1485    }
   1486 
   1487    case 3: { /* status */
   1488      Char* arg = VG_(strtok_r) (0, " ", &ssaveptr);
   1489      if (arg && (VG_(strcmp)(arg, "internal") == 0)) {
   1490        /* internal interface to callgrind_control */
   1491        dump_state_togdb();
   1492        return True;
   1493      }
   1494 
   1495      if (!CLG_(instrument_state)) {
   1496        VG_(gdb_printf)("No status available as instrumentation is switched off\n");
   1497      } else {
   1498        // Status information to be improved ...
   1499        thread_info** th = CLG_(get_threads)();
   1500        Int t, tcount = 0;
   1501        for(t=1;t<VG_N_THREADS;t++)
   1502 	 if (th[t]) tcount++;
   1503        VG_(gdb_printf)("%d thread(s) running.\n", tcount);
   1504      }
   1505      return True;
   1506    }
   1507 
   1508    case 4: { /* instrumentation */
   1509      Char* arg = VG_(strtok_r) (0, " ", &ssaveptr);
   1510      if (!arg) {
   1511        VG_(gdb_printf)("instrumentation: %s\n",
   1512 		       CLG_(instrument_state) ? "on":"off");
   1513      }
   1514      else
   1515        CLG_(set_instrument_state)("Command", VG_(strcmp)(arg,"off")!=0);
   1516      return True;
   1517    }
   1518 
   1519    default:
   1520       tl_assert(0);
   1521       return False;
   1522    }
   1523 }
   1524 
   1525 static
   1526 Bool CLG_(handle_client_request)(ThreadId tid, UWord *args, UWord *ret)
   1527 {
   1528    if (!VG_IS_TOOL_USERREQ('C','T',args[0])
   1529        && VG_USERREQ__GDB_MONITOR_COMMAND   != args[0])
   1530       return False;
   1531 
   1532    switch(args[0]) {
   1533    case VG_USERREQ__DUMP_STATS:
   1534       CLG_(dump_profile)("Client Request", True);
   1535       *ret = 0;                 /* meaningless */
   1536       break;
   1537 
   1538    case VG_USERREQ__DUMP_STATS_AT:
   1539      {
   1540        Char buf[512];
   1541        VG_(sprintf)(buf,"Client Request: %s", (Char*)args[1]);
   1542        CLG_(dump_profile)(buf, True);
   1543        *ret = 0;                 /* meaningless */
   1544      }
   1545      break;
   1546 
   1547    case VG_USERREQ__ZERO_STATS:
   1548      CLG_(zero_all_cost)(True);
   1549       *ret = 0;                 /* meaningless */
   1550       break;
   1551 
   1552    case VG_USERREQ__TOGGLE_COLLECT:
   1553      CLG_(current_state).collect = !CLG_(current_state).collect;
   1554      CLG_DEBUG(2, "Client Request: toggled collection state to %s\n",
   1555 	      CLG_(current_state).collect ? "ON" : "OFF");
   1556      *ret = 0;                 /* meaningless */
   1557      break;
   1558 
   1559    case VG_USERREQ__START_INSTRUMENTATION:
   1560      CLG_(set_instrument_state)("Client Request", True);
   1561      *ret = 0;                 /* meaningless */
   1562      break;
   1563 
   1564    case VG_USERREQ__STOP_INSTRUMENTATION:
   1565      CLG_(set_instrument_state)("Client Request", False);
   1566      *ret = 0;                 /* meaningless */
   1567      break;
   1568 
   1569    case VG_USERREQ__GDB_MONITOR_COMMAND: {
   1570       Bool handled = handle_gdb_monitor_command (tid, (Char*)args[1]);
   1571       if (handled)
   1572          *ret = 1;
   1573       else
   1574          *ret = 0;
   1575       return handled;
   1576    }
   1577    default:
   1578       return False;
   1579    }
   1580 
   1581    return True;
   1582 }
   1583 
   1584 
/* Syscall Timing */

/* struct timeval syscalltime[VG_N_THREADS]; */
/* Per-thread timestamp taken at syscall entry by CLG_(pre_syscalltime)
 * and consumed at syscall exit by CLG_(post_syscalltime).
 * With CLG_MICROSYSTIME the unit is microseconds (ULong, from raw
 * gettimeofday); otherwise milliseconds (UInt, from Valgrind's timer). */
#if CLG_MICROSYSTIME
#include <sys/time.h>
#include <sys/syscall.h>
extern Int VG_(do_syscall) ( UInt, ... );

ULong syscalltime[VG_N_THREADS];
#else
UInt syscalltime[VG_N_THREADS];
#endif
   1597 
/* Syscall-entry hook: record the current time for this thread so that
 * CLG_(post_syscalltime) can compute the syscall duration. Only active
 * when --collect-systime is given. */
static
void CLG_(pre_syscalltime)(ThreadId tid, UInt syscallno,
                           UWord* args, UInt nArgs)
{
  if (CLG_(clo).collect_systime) {
#if CLG_MICROSYSTIME
    /* microsecond resolution via a raw gettimeofday syscall */
    struct vki_timeval tv_now;
    VG_(do_syscall)(__NR_gettimeofday, (UInt)&tv_now, (UInt)NULL);
    syscalltime[tid] = tv_now.tv_sec * 1000000ULL + tv_now.tv_usec;
#else
    /* millisecond resolution via Valgrind's internal timer */
    syscalltime[tid] = VG_(read_millisecond_timer)();
#endif
  }
}
   1612 
   1613 static
   1614 void CLG_(post_syscalltime)(ThreadId tid, UInt syscallno,
   1615                             UWord* args, UInt nArgs, SysRes res)
   1616 {
   1617   if (CLG_(clo).collect_systime &&
   1618       CLG_(current_state).bbcc) {
   1619       Int o;
   1620 #if CLG_MICROSYSTIME
   1621     struct vki_timeval tv_now;
   1622     ULong diff;
   1623 
   1624     VG_(do_syscall)(__NR_gettimeofday, (UInt)&tv_now, (UInt)NULL);
   1625     diff = (tv_now.tv_sec * 1000000ULL + tv_now.tv_usec) - syscalltime[tid];
   1626 #else
   1627     UInt diff = VG_(read_millisecond_timer)() - syscalltime[tid];
   1628 #endif
   1629 
   1630     /* offset o is for "SysCount", o+1 for "SysTime" */
   1631     o = fullOffset(EG_SYS);
   1632     CLG_ASSERT(o>=0);
   1633     CLG_DEBUG(0,"   Time (Off %d) for Syscall %d: %ull\n", o, syscallno, diff);
   1634 
   1635     CLG_(current_state).cost[o] ++;
   1636     CLG_(current_state).cost[o+1] += diff;
   1637     if (!CLG_(current_state).bbcc->skipped)
   1638       CLG_(init_cost_lz)(CLG_(sets).full,
   1639 			&(CLG_(current_state).bbcc->skipped));
   1640     CLG_(current_state).bbcc->skipped[o] ++;
   1641     CLG_(current_state).bbcc->skipped[o+1] += diff;
   1642   }
   1643 }
   1644 
   1645 static UInt ULong_width(ULong n)
   1646 {
   1647    UInt w = 0;
   1648    while (n > 0) {
   1649       n = n / 10;
   1650       w++;
   1651    }
   1652    if (w == 0) w = 1;
   1653    return w + (w-1)/3;   // add space for commas
   1654 }
   1655 
   1656 static
   1657 void branchsim_printstat(int l1, int l2, int l3)
   1658 {
   1659     static Char buf1[128], buf2[128], buf3[128], fmt[128];
   1660     FullCost total;
   1661     ULong Bc_total_b, Bc_total_mp, Bi_total_b, Bi_total_mp;
   1662     ULong B_total_b, B_total_mp;
   1663 
   1664     total = CLG_(total_cost);
   1665     Bc_total_b  = total[ fullOffset(EG_BC)   ];
   1666     Bc_total_mp = total[ fullOffset(EG_BC)+1 ];
   1667     Bi_total_b  = total[ fullOffset(EG_BI)   ];
   1668     Bi_total_mp = total[ fullOffset(EG_BI)+1 ];
   1669 
   1670     /* Make format string, getting width right for numbers */
   1671     VG_(sprintf)(fmt, "%%s %%,%dllu  (%%,%dllu cond + %%,%dllu ind)\n",
   1672                  l1, l2, l3);
   1673 
   1674     if (0 == Bc_total_b)  Bc_total_b = 1;
   1675     if (0 == Bi_total_b)  Bi_total_b = 1;
   1676     B_total_b  = Bc_total_b  + Bi_total_b;
   1677     B_total_mp = Bc_total_mp + Bi_total_mp;
   1678 
   1679     VG_(umsg)("\n");
   1680     VG_(umsg)(fmt, "Branches:     ",
   1681               B_total_b, Bc_total_b, Bi_total_b);
   1682 
   1683     VG_(umsg)(fmt, "Mispredicts:  ",
   1684               B_total_mp, Bc_total_mp, Bi_total_mp);
   1685 
   1686     VG_(percentify)(B_total_mp,  B_total_b,  1, l1+1, buf1);
   1687     VG_(percentify)(Bc_total_mp, Bc_total_b, 1, l2+1, buf2);
   1688     VG_(percentify)(Bi_total_mp, Bi_total_b, 1, l3+1, buf3);
   1689 
   1690     VG_(umsg)("Mispred rate:  %s (%s     + %s   )\n", buf1, buf2,buf3);
   1691 }
   1692 
   1693 
   1694 static
   1695 void finish(void)
   1696 {
   1697   Char buf[32+COSTS_LEN], fmt[128];
   1698   Int l1, l2, l3;
   1699   FullCost total;
   1700 
   1701   CLG_DEBUG(0, "finish()\n");
   1702 
   1703   (*CLG_(cachesim).finish)();
   1704 
   1705   /* pop all remaining items from CallStack for correct sum
   1706    */
   1707   CLG_(forall_threads)(unwind_thread);
   1708 
   1709   CLG_(dump_profile)(0, False);
   1710 
   1711   CLG_(finish_command)();
   1712 
   1713   if (VG_(clo_verbosity) == 0) return;
   1714 
   1715   /* Hash table stats */
   1716   if (VG_(clo_stats)) {
   1717     int BB_lookups =
   1718       CLG_(stat).full_debug_BBs +
   1719       CLG_(stat).fn_name_debug_BBs +
   1720       CLG_(stat).file_line_debug_BBs +
   1721       CLG_(stat).no_debug_BBs;
   1722 
   1723     VG_(message)(Vg_DebugMsg, "\n");
   1724     VG_(message)(Vg_DebugMsg, "Distinct objects: %d\n",
   1725 		 CLG_(stat).distinct_objs);
   1726     VG_(message)(Vg_DebugMsg, "Distinct files:   %d\n",
   1727 		 CLG_(stat).distinct_files);
   1728     VG_(message)(Vg_DebugMsg, "Distinct fns:     %d\n",
   1729 		 CLG_(stat).distinct_fns);
   1730     VG_(message)(Vg_DebugMsg, "Distinct contexts:%d\n",
   1731 		 CLG_(stat).distinct_contexts);
   1732     VG_(message)(Vg_DebugMsg, "Distinct BBs:     %d\n",
   1733 		 CLG_(stat).distinct_bbs);
   1734     VG_(message)(Vg_DebugMsg, "Cost entries:     %d (Chunks %d)\n",
   1735 		 CLG_(costarray_entries), CLG_(costarray_chunks));
   1736     VG_(message)(Vg_DebugMsg, "Distinct BBCCs:   %d\n",
   1737 		 CLG_(stat).distinct_bbccs);
   1738     VG_(message)(Vg_DebugMsg, "Distinct JCCs:    %d\n",
   1739 		 CLG_(stat).distinct_jccs);
   1740     VG_(message)(Vg_DebugMsg, "Distinct skips:   %d\n",
   1741 		 CLG_(stat).distinct_skips);
   1742     VG_(message)(Vg_DebugMsg, "BB lookups:       %d\n",
   1743 		 BB_lookups);
   1744     if (BB_lookups>0) {
   1745       VG_(message)(Vg_DebugMsg, "With full      debug info:%3d%% (%d)\n",
   1746 		   CLG_(stat).full_debug_BBs    * 100 / BB_lookups,
   1747 		   CLG_(stat).full_debug_BBs);
   1748       VG_(message)(Vg_DebugMsg, "With file/line debug info:%3d%% (%d)\n",
   1749 		   CLG_(stat).file_line_debug_BBs * 100 / BB_lookups,
   1750 		   CLG_(stat).file_line_debug_BBs);
   1751       VG_(message)(Vg_DebugMsg, "With fn name   debug info:%3d%% (%d)\n",
   1752 		   CLG_(stat).fn_name_debug_BBs * 100 / BB_lookups,
   1753 		   CLG_(stat).fn_name_debug_BBs);
   1754       VG_(message)(Vg_DebugMsg, "With no        debug info:%3d%% (%d)\n",
   1755 		   CLG_(stat).no_debug_BBs      * 100 / BB_lookups,
   1756 		   CLG_(stat).no_debug_BBs);
   1757     }
   1758     VG_(message)(Vg_DebugMsg, "BBCC Clones:       %d\n",
   1759 		 CLG_(stat).bbcc_clones);
   1760     VG_(message)(Vg_DebugMsg, "BBs Retranslated:  %d\n",
   1761 		 CLG_(stat).bb_retranslations);
   1762     VG_(message)(Vg_DebugMsg, "Distinct instrs:   %d\n",
   1763 		 CLG_(stat).distinct_instrs);
   1764     VG_(message)(Vg_DebugMsg, "");
   1765 
   1766     VG_(message)(Vg_DebugMsg, "LRU Contxt Misses: %d\n",
   1767 		 CLG_(stat).cxt_lru_misses);
   1768     VG_(message)(Vg_DebugMsg, "LRU BBCC Misses:   %d\n",
   1769 		 CLG_(stat).bbcc_lru_misses);
   1770     VG_(message)(Vg_DebugMsg, "LRU JCC Misses:    %d\n",
   1771 		 CLG_(stat).jcc_lru_misses);
   1772     VG_(message)(Vg_DebugMsg, "BBs Executed:      %llu\n",
   1773 		 CLG_(stat).bb_executions);
   1774     VG_(message)(Vg_DebugMsg, "Calls:             %llu\n",
   1775 		 CLG_(stat).call_counter);
   1776     VG_(message)(Vg_DebugMsg, "CondJMP followed:  %llu\n",
   1777 		 CLG_(stat).jcnd_counter);
   1778     VG_(message)(Vg_DebugMsg, "Boring JMPs:       %llu\n",
   1779 		 CLG_(stat).jump_counter);
   1780     VG_(message)(Vg_DebugMsg, "Recursive calls:   %llu\n",
   1781 		 CLG_(stat).rec_call_counter);
   1782     VG_(message)(Vg_DebugMsg, "Returns:           %llu\n",
   1783 		 CLG_(stat).ret_counter);
   1784 
   1785     VG_(message)(Vg_DebugMsg, "");
   1786   }
   1787 
   1788   CLG_(sprint_eventmapping)(buf, CLG_(dumpmap));
   1789   VG_(message)(Vg_UserMsg, "Events    : %s\n", buf);
   1790   CLG_(sprint_mappingcost)(buf, CLG_(dumpmap), CLG_(total_cost));
   1791   VG_(message)(Vg_UserMsg, "Collected : %s\n", buf);
   1792   VG_(message)(Vg_UserMsg, "\n");
   1793 
   1794   /* determine value widths for statistics */
   1795   total = CLG_(total_cost);
   1796   l1 = ULong_width( total[fullOffset(EG_IR)] );
   1797   l2 = l3 = 0;
   1798   if (CLG_(clo).simulate_cache) {
   1799       l2 = ULong_width( total[fullOffset(EG_DR)] );
   1800       l3 = ULong_width( total[fullOffset(EG_DW)] );
   1801   }
   1802   if (CLG_(clo).simulate_branch) {
   1803       int l2b = ULong_width( total[fullOffset(EG_BC)] );
   1804       int l3b = ULong_width( total[fullOffset(EG_BI)] );
   1805       if (l2b > l2) l2 = l2b;
   1806       if (l3b > l3) l3 = l3b;
   1807   }
   1808 
   1809   /* Make format string, getting width right for numbers */
   1810   VG_(sprintf)(fmt, "%%s %%,%dllu\n", l1);
   1811 
   1812   /* Always print this */
   1813   VG_(umsg)(fmt, "I   refs:     ", total[fullOffset(EG_IR)] );
   1814 
   1815   if (CLG_(clo).simulate_cache)
   1816       (*CLG_(cachesim).printstat)(l1, l2, l3);
   1817 
   1818   if (CLG_(clo).simulate_branch)
   1819       branchsim_printstat(l1, l2, l3);
   1820 
   1821 }
   1822 
   1823 
/* Valgrind tool-interface finalizer, registered via
   VG_(basic_tool_funcs).  Called once when the client exits;
   exitcode is the client's exit status and is unused here. */
void CLG_(fini)(Int exitcode)
{
  finish();
}
   1828 
   1829 
   1830 /*--------------------------------------------------------------------*/
   1831 /*--- Setup                                                        ---*/
   1832 /*--------------------------------------------------------------------*/
   1833 
   1834 static void clg_start_client_code_callback ( ThreadId tid, ULong blocks_done )
   1835 {
   1836    static ULong last_blocks_done = 0;
   1837 
   1838    if (0)
   1839       VG_(printf)("%d R %llu\n", (Int)tid, blocks_done);
   1840 
   1841    /* throttle calls to CLG_(run_thread) by number of BBs executed */
   1842    if (blocks_done - last_blocks_done < 5000) return;
   1843    last_blocks_done = blocks_done;
   1844 
   1845    CLG_(run_thread)( tid );
   1846 }
   1847 
   1848 static
   1849 void CLG_(post_clo_init)(void)
   1850 {
   1851    VG_(clo_vex_control).iropt_unroll_thresh = 0;
   1852    VG_(clo_vex_control).guest_chase_thresh = 0;
   1853 
   1854    CLG_DEBUG(1, "  dump threads: %s\n", CLG_(clo).separate_threads ? "Yes":"No");
   1855    CLG_DEBUG(1, "  call sep. : %d\n", CLG_(clo).separate_callers);
   1856    CLG_DEBUG(1, "  rec. sep. : %d\n", CLG_(clo).separate_recursions);
   1857 
   1858    if (!CLG_(clo).dump_line && !CLG_(clo).dump_instr && !CLG_(clo).dump_bb) {
   1859        VG_(message)(Vg_UserMsg, "Using source line as position.\n");
   1860        CLG_(clo).dump_line = True;
   1861    }
   1862 
   1863    CLG_(init_dumps)();
   1864    CLG_(init_command)();
   1865 
   1866    (*CLG_(cachesim).post_clo_init)();
   1867 
   1868    CLG_(init_eventsets)();
   1869    CLG_(init_statistics)(& CLG_(stat));
   1870    CLG_(init_cost_lz)( CLG_(sets).full, &CLG_(total_cost) );
   1871 
   1872    /* initialize hash tables */
   1873    CLG_(init_obj_table)();
   1874    CLG_(init_cxt_table)();
   1875    CLG_(init_bb_hash)();
   1876 
   1877    CLG_(init_threads)();
   1878    CLG_(run_thread)(1);
   1879 
   1880    CLG_(instrument_state) = CLG_(clo).instrument_atstart;
   1881 
   1882    if (VG_(clo_verbosity > 0)) {
   1883       VG_(message)(Vg_UserMsg,
   1884                    "For interactive control, run 'callgrind_control -h'.\n");
   1885    }
   1886 }
   1887 
/* Tool registration entry point, invoked by the core via
   VG_DETERMINE_INTERFACE_VERSION before options are processed.
   Registers tool details, core callbacks, and option defaults.
   NOTE(review): registration order with the core is kept exactly
   as-is; only comments were added here. */
static
void CLG_(pre_clo_init)(void)
{
    VG_(details_name)            ("Callgrind");
    VG_(details_version)         (NULL);
    VG_(details_description)     ("a call-graph generating cache profiler");
    VG_(details_copyright_author)("Copyright (C) 2002-2011, and GNU GPL'd, "
				  "by Josef Weidendorfer et al.");
    VG_(details_bug_reports_to)  (VG_BUGS_TO);
    VG_(details_avg_translation_sizeB) ( 500 );

    /* Core hooks: post-option init, instrumentation, finalization. */
    VG_(basic_tool_funcs)        (CLG_(post_clo_init),
                                  CLG_(instrument),
                                  CLG_(fini));

    /* Invalidate cached BB info when the core discards translations. */
    VG_(needs_superblock_discards)(clg_discard_superblock_info);


    VG_(needs_command_line_options)(CLG_(process_cmd_line_option),
				    CLG_(print_usage),
				    CLG_(print_debug_usage));

    /* CALLGRIND_* client requests (e.g. dump/zero/toggle). */
    VG_(needs_client_requests)(CLG_(handle_client_request));
    /* Timestamps around syscalls for --collect-systime=yes. */
    VG_(needs_syscall_wrapper)(CLG_(pre_syscalltime),
			       CLG_(post_syscalltime));

    VG_(track_start_client_code)  ( & clg_start_client_code_callback );
    /* Signal delivery changes the active call stack; track both edges. */
    VG_(track_pre_deliver_signal) ( & CLG_(pre_signal) );
    VG_(track_post_deliver_signal)( & CLG_(post_signal) );

    CLG_(set_clo_defaults)();
}
   1920 
   1921 VG_DETERMINE_INTERFACE_VERSION(CLG_(pre_clo_init))
   1922 
   1923 /*--------------------------------------------------------------------*/
   1924 /*--- end                                                   main.c ---*/
   1925 /*--------------------------------------------------------------------*/
   1926