Home | History | Annotate | Download | only in callgrind
      1 
      2 /*--------------------------------------------------------------------*/
      3 /*--- Callgrind                                                    ---*/
      4 /*---                                                       main.c ---*/
      5 /*--------------------------------------------------------------------*/
      6 
      7 /*
      8    This file is part of Callgrind, a Valgrind tool for call graph
      9    profiling programs.
     10 
     11    Copyright (C) 2002-2012, Josef Weidendorfer (Josef.Weidendorfer (at) gmx.de)
     12 
     13    This tool is derived from and contains code from Cachegrind
     14    Copyright (C) 2002-2012 Nicholas Nethercote (njn (at) valgrind.org)
     15 
     16    This program is free software; you can redistribute it and/or
     17    modify it under the terms of the GNU General Public License as
     18    published by the Free Software Foundation; either version 2 of the
     19    License, or (at your option) any later version.
     20 
     21    This program is distributed in the hope that it will be useful, but
     22    WITHOUT ANY WARRANTY; without even the implied warranty of
     23    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     24    General Public License for more details.
     25 
     26    You should have received a copy of the GNU General Public License
     27    along with this program; if not, write to the Free Software
     28    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
     29    02111-1307, USA.
     30 
     31    The GNU General Public License is contained in the file COPYING.
     32 */
     33 
     34 #include "config.h"
     35 #include "callgrind.h"
     36 #include "global.h"
     37 
     38 #include "pub_tool_threadstate.h"
     39 #include "pub_tool_gdbserver.h"
     40 
     41 #include "cg_branchpred.c"
     42 
     43 /*------------------------------------------------------------*/
     44 /*--- Global variables                                     ---*/
     45 /*------------------------------------------------------------*/
     46 
/* for all threads: option settings, statistics counters and the
   instrumentation on/off switch */
CommandLineOptions CLG_(clo);
Statistics CLG_(stat);
Bool CLG_(instrument_state) = True; /* Instrumentation on ? */

/* thread and signal handler specific */
exec_state CLG_(current_state);

/* min of L1 and LL cache line sizes.  This only gets set to a
   non-zero value if we are doing cache simulation.  addEvent_Dr and
   addEvent_Dw assert that a data access is never larger than this. */
Int CLG_(min_line_size) = 0;
     58 
     59 
     60 /*------------------------------------------------------------*/
     61 /*--- Statistics                                           ---*/
     62 /*------------------------------------------------------------*/
     63 
     64 static void CLG_(init_statistics)(Statistics* s)
     65 {
     66   s->call_counter        = 0;
     67   s->jcnd_counter        = 0;
     68   s->jump_counter        = 0;
     69   s->rec_call_counter    = 0;
     70   s->ret_counter         = 0;
     71   s->bb_executions       = 0;
     72 
     73   s->context_counter     = 0;
     74   s->bb_retranslations   = 0;
     75 
     76   s->distinct_objs       = 0;
     77   s->distinct_files      = 0;
     78   s->distinct_fns        = 0;
     79   s->distinct_contexts   = 0;
     80   s->distinct_bbs        = 0;
     81   s->distinct_bbccs      = 0;
     82   s->distinct_instrs     = 0;
     83   s->distinct_skips      = 0;
     84 
     85   s->bb_hash_resizes     = 0;
     86   s->bbcc_hash_resizes   = 0;
     87   s->jcc_hash_resizes    = 0;
     88   s->cxt_hash_resizes    = 0;
     89   s->fn_array_resizes    = 0;
     90   s->call_stack_resizes  = 0;
     91   s->fn_stack_resizes    = 0;
     92 
     93   s->full_debug_BBs      = 0;
     94   s->file_line_debug_BBs = 0;
     95   s->fn_name_debug_BBs   = 0;
     96   s->no_debug_BBs        = 0;
     97   s->bbcc_lru_misses     = 0;
     98   s->jcc_lru_misses      = 0;
     99   s->cxt_lru_misses      = 0;
    100   s->bbcc_clones         = 0;
    101 }
    102 
    103 
    104 /*------------------------------------------------------------*/
    105 /*--- Simple callbacks (not cache simulator)               ---*/
    106 /*------------------------------------------------------------*/
    107 
    108 VG_REGPARM(1)
    109 static void log_global_event(InstrInfo* ii)
    110 {
    111     ULong* cost_Bus;
    112 
    113     CLG_DEBUG(6, "log_global_event:  Ir  %#lx/%u\n",
    114               CLG_(bb_base) + ii->instr_offset, ii->instr_size);
    115 
    116     if (!CLG_(current_state).collect) return;
    117 
    118     CLG_ASSERT( (ii->eventset->mask & (1u<<EG_BUS))>0 );
    119 
    120     CLG_(current_state).cost[ fullOffset(EG_BUS) ]++;
    121 
    122     if (CLG_(current_state).nonskipped)
    123         cost_Bus = CLG_(current_state).nonskipped->skipped + fullOffset(EG_BUS);
    124     else
    125         cost_Bus = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_BUS];
    126     cost_Bus[0]++;
    127 }
    128 
    129 
    130 /* For branches, we consult two different predictors, one which
    131    predicts taken/untaken for conditional branches, and the other
    132    which predicts the branch target address for indirect branches
    133    (jump-to-register style ones). */
    134 
    135 static VG_REGPARM(2)
    136 void log_cond_branch(InstrInfo* ii, Word taken)
    137 {
    138     Bool miss;
    139     Int fullOffset_Bc;
    140     ULong* cost_Bc;
    141 
    142     CLG_DEBUG(6, "log_cond_branch:  Ir %#lx, taken %lu\n",
    143               CLG_(bb_base) + ii->instr_offset, taken);
    144 
    145     miss = 1 & do_cond_branch_predict(CLG_(bb_base) + ii->instr_offset, taken);
    146 
    147     if (!CLG_(current_state).collect) return;
    148 
    149     CLG_ASSERT( (ii->eventset->mask & (1u<<EG_BC))>0 );
    150 
    151     if (CLG_(current_state).nonskipped)
    152         cost_Bc = CLG_(current_state).nonskipped->skipped + fullOffset(EG_BC);
    153     else
    154         cost_Bc = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_BC];
    155 
    156     fullOffset_Bc = fullOffset(EG_BC);
    157     CLG_(current_state).cost[ fullOffset_Bc ]++;
    158     cost_Bc[0]++;
    159     if (miss) {
    160         CLG_(current_state).cost[ fullOffset_Bc+1 ]++;
    161         cost_Bc[1]++;
    162     }
    163 }
    164 
    165 static VG_REGPARM(2)
    166 void log_ind_branch(InstrInfo* ii, UWord actual_dst)
    167 {
    168     Bool miss;
    169     Int fullOffset_Bi;
    170     ULong* cost_Bi;
    171 
    172     CLG_DEBUG(6, "log_ind_branch:  Ir  %#lx, dst %#lx\n",
    173               CLG_(bb_base) + ii->instr_offset, actual_dst);
    174 
    175     miss = 1 & do_ind_branch_predict(CLG_(bb_base) + ii->instr_offset, actual_dst);
    176 
    177     if (!CLG_(current_state).collect) return;
    178 
    179     CLG_ASSERT( (ii->eventset->mask & (1u<<EG_BI))>0 );
    180 
    181     if (CLG_(current_state).nonskipped)
    182         cost_Bi = CLG_(current_state).nonskipped->skipped + fullOffset(EG_BI);
    183     else
    184         cost_Bi = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_BI];
    185 
    186     fullOffset_Bi = fullOffset(EG_BI);
    187     CLG_(current_state).cost[ fullOffset_Bi ]++;
    188     cost_Bi[0]++;
    189     if (miss) {
    190         CLG_(current_state).cost[ fullOffset_Bi+1 ]++;
    191         cost_Bi[1]++;
    192     }
    193 }
    194 
    195 /*------------------------------------------------------------*/
    196 /*--- Instrumentation structures and event queue handling  ---*/
    197 /*------------------------------------------------------------*/
    198 
    199 /* Maintain an ordered list of memory events which are outstanding, in
    200    the sense that no IR has yet been generated to do the relevant
    201    helper calls.  The BB is scanned top to bottom and memory events
    202    are added to the end of the list, merging with the most recent
    203    notified event where possible (Dw immediately following Dr and
    204    having the same size and EA can be merged).
    205 
    206    This merging is done so that for architectures which have
    207    load-op-store instructions (x86, amd64), the insn is treated as if
    208    it makes just one memory reference (a modify), rather than two (a
    209    read followed by a write at the same address).
    210 
    211    At various points the list will need to be flushed, that is, IR
    212    generated from it.  That must happen before any possible exit from
    213    the block (the end, or an IRStmt_Exit).  Flushing also takes place
    214    when there is no space to add a new event.
    215 
    216    If we require the simulation statistics to be up to date with
    217    respect to possible memory exceptions, then the list would have to
    218    be flushed before each memory reference.  That would however lose
    219    performance by inhibiting event-merging during flushing.
    220 
    221    Flushing the list consists of walking it start to end and emitting
    222    instrumentation IR for each event, in the order in which they
    223    appear.  It may be possible to emit a single call for two adjacent
    224    events in order to reduce the number of helper function calls made.
    225    For example, it could well be profitable to handle two adjacent Ir
    226    events with a single helper call.  */
    227 
/* An IRAtom is an IRExpr; the name is used for expressions which the
   addEvent_* functions check with isIRAtom() before queueing them. */
typedef
   IRExpr
   IRAtom;
    231 
/* Kind of an outstanding event in the event queue.  Ev_Dm is not
   queued directly: addEvent_Dw turns a queued Ev_Dr into Ev_Dm when
   a write by the same instruction has the same size and address. */
typedef
   enum {
      Ev_Ir,  // Instruction read
      Ev_Dr,  // Data read
      Ev_Dw,  // Data write
      Ev_Dm,  // Data modify (read then write)
      Ev_Bc,  // branch conditional
      Ev_Bi,  // branch indirect (to unknown destination)
      Ev_G    // Global bus event
   }
   EventTag;
    243 
/* One outstanding event: its kind, the InstrInfo of the guest
   instruction it belongs to, and kind specific arguments in the
   union.  The Dr, Dw and Dm members have the same layout (effective
   address atom plus access size in bytes); addEvent_Dw relies on
   this when it retags a queued Dr event as Dm. */
typedef
   struct {
      EventTag   tag;
      InstrInfo* inode;
      union {
	 struct {
	 } Ir;       /* no arguments */
	 struct {
	    IRAtom* ea;
	    Int     szB;
	 } Dr;
	 struct {
	    IRAtom* ea;
	    Int     szB;
	 } Dw;
	 struct {
	    IRAtom* ea;
	    Int     szB;
	 } Dm;
         struct {
            IRAtom* taken; /* host word sized (Ity_I32 or Ity_I64),
                              as asserted in addEvent_Bc */
         } Bc;
         struct {
            IRAtom* dst;   /* host word sized, as asserted in addEvent_Bi */
         } Bi;
	 struct {
	 } G;        /* no arguments */
      } Ev;
   }
   Event;
    274 
    275 static void init_Event ( Event* ev ) {
    276    VG_(memset)(ev, 0, sizeof(Event));
    277 }
    278 
    279 static IRAtom* get_Event_dea ( Event* ev ) {
    280    switch (ev->tag) {
    281       case Ev_Dr: return ev->Ev.Dr.ea;
    282       case Ev_Dw: return ev->Ev.Dw.ea;
    283       case Ev_Dm: return ev->Ev.Dm.ea;
    284       default:    tl_assert(0);
    285    }
    286 }
    287 
    288 static Int get_Event_dszB ( Event* ev ) {
    289    switch (ev->tag) {
    290       case Ev_Dr: return ev->Ev.Dr.szB;
    291       case Ev_Dw: return ev->Ev.Dw.szB;
    292       case Ev_Dm: return ev->Ev.Dm.szB;
    293       default:    tl_assert(0);
    294    }
    295 }
    296 
    297 
/* Up to this many unnotified events are allowed.  Number is
   arbitrary.  Larger numbers allow more event merging to occur, but
   potentially induce more spilling due to extending live ranges of
   address temporaries.  The addEvent_* functions flush the queue
   when it already holds this many events. */
#define N_EVENTS 16
    303 
    304 
/* A struct which holds all the running state during instrumentation.
   Mostly to avoid passing loads of parameters everywhere. */
typedef struct {
    /* The current outstanding-memory-event list.  Slots
       [0, events_used) are valid; flushEvents resets events_used
       to 0. */
    Event events[N_EVENTS];
    Int   events_used;

    /* The array of InstrInfo's is part of BB struct. */
    BB* bb;

    /* BB seen before (ie. re-instrumentation).  If set, InstrInfo
       contents are checked instead of written. */
    Bool seen_before;

    /* Number InstrInfo bins 'used' so far; advanced by
       next_InstrInfo. */
    UInt ii_index;

    // current offset of guest instructions from BB start;
    // next_InstrInfo adds the size of each instruction
    UInt instr_offset;

    /* The output SB being constructed. */
    IRSB* sbOut;
} ClgState;
    327 
    328 
    329 static void showEvent ( Event* ev )
    330 {
    331    switch (ev->tag) {
    332       case Ev_Ir:
    333 	 VG_(printf)("Ir (InstrInfo %p) at +%d\n",
    334 		     ev->inode, ev->inode->instr_offset);
    335 	 break;
    336       case Ev_Dr:
    337 	 VG_(printf)("Dr (InstrInfo %p) at +%d %d EA=",
    338 		     ev->inode, ev->inode->instr_offset, ev->Ev.Dr.szB);
    339 	 ppIRExpr(ev->Ev.Dr.ea);
    340 	 VG_(printf)("\n");
    341 	 break;
    342       case Ev_Dw:
    343 	 VG_(printf)("Dw (InstrInfo %p) at +%d %d EA=",
    344 		     ev->inode, ev->inode->instr_offset, ev->Ev.Dw.szB);
    345 	 ppIRExpr(ev->Ev.Dw.ea);
    346 	 VG_(printf)("\n");
    347 	 break;
    348       case Ev_Dm:
    349 	 VG_(printf)("Dm (InstrInfo %p) at +%d %d EA=",
    350 		     ev->inode, ev->inode->instr_offset, ev->Ev.Dm.szB);
    351 	 ppIRExpr(ev->Ev.Dm.ea);
    352 	 VG_(printf)("\n");
    353 	 break;
    354       case Ev_Bc:
    355          VG_(printf)("Bc %p   GA=", ev->inode);
    356          ppIRExpr(ev->Ev.Bc.taken);
    357          VG_(printf)("\n");
    358          break;
    359       case Ev_Bi:
    360          VG_(printf)("Bi %p  DST=", ev->inode);
    361          ppIRExpr(ev->Ev.Bi.dst);
    362          VG_(printf)("\n");
    363          break;
    364       case Ev_G:
    365          VG_(printf)("G  %p\n", ev->inode);
    366          break;
    367       default:
    368 	 tl_assert(0);
    369 	 break;
    370    }
    371 }
    372 
/* Generate code for all outstanding memory events, and mark the queue
   empty.  Code is generated into clgs->sbOut.

   This works in two passes over clgs->events[0 .. events_used-1]:

   1. Only if the BB was not seen before: the event set of the
      InstrInfo of each event is extended by the event group needed
      for the kind of event.

   2. For each event, a helper function and its arguments are chosen
      and a dirty helper call is appended to clgs->sbOut.  An Ir event
      may be merged with the following event(s) into one helper call:
      Ir+Dr, Ir+Dw/Dm, Ir+Ir+Ir or Ir+Ir.  'inew' is the index of the
      first event not handled by the current call.  If the chosen
      helper address is NULL, no call is emitted for the event(s). */

static void flushEvents ( ClgState* clgs )
{
   Int        i, regparms, inew;
   Char*      helperName;
   void*      helperAddr;
   IRExpr**   argv;
   IRExpr*    i_node_expr;
   IRDirty*   di;
   Event*     ev;
   Event*     ev2;
   Event*     ev3;

   /* Pass 1: on re-instrumentation the event sets are already set up */
   if (!clgs->seen_before) {
       // extend event sets as needed
       // available sets: D0 Dr
       for(i=0; i<clgs->events_used; i++) {
	   ev  = &clgs->events[i];
	   switch(ev->tag) {
	   case Ev_Ir:
	       // Ir event always is first for a guest instruction
	       CLG_ASSERT(ev->inode->eventset == 0);
	       ev->inode->eventset = CLG_(sets).base;
	       break;
	   case Ev_Dr:
               // extend event set by Dr counters
	       ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
							   EG_DR);
	       break;
	   case Ev_Dw:
	   case Ev_Dm:
               // extend event set by Dw counters
	       ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
							   EG_DW);
	       break;
           case Ev_Bc:
               // extend event set by Bc counters
               ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
                                                           EG_BC);
               break;
           case Ev_Bi:
               // extend event set by Bi counters
               ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
                                                           EG_BI);
               break;
	   case Ev_G:
               // extend event set by Bus counter
	       ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
							   EG_BUS);
	       break;
	   default:
	       tl_assert(0);
	   }
       }
   }

   /* Pass 2: emit helper calls.  Every case of the switch below sets
      inew, which becomes the next value of i (also on 'continue'). */
   for(i = 0; i < clgs->events_used; i = inew) {

      helperName = NULL;
      helperAddr = NULL;
      argv       = NULL;
      regparms   = 0;

      /* generate IR to notify event i and possibly the ones
	 immediately following it. */
      tl_assert(i >= 0 && i < clgs->events_used);

      /* ev2/ev3 are the next two events, or NULL past the queue end */
      ev  = &clgs->events[i];
      ev2 = ( i < clgs->events_used-1 ? &clgs->events[i+1] : NULL );
      ev3 = ( i < clgs->events_used-2 ? &clgs->events[i+2] : NULL );

      CLG_DEBUGIF(5) {
	 VG_(printf)("   flush ");
	 showEvent( ev );
      }

      /* first argument of every helper: the InstrInfo pointer */
      i_node_expr = mkIRExpr_HWord( (HWord)ev->inode );

      /* Decide on helper fn to call and args to pass it, and advance
	 i appropriately.
	 Dm events have same effect as Dw events */
      switch (ev->tag) {
	 case Ev_Ir:
	    /* Merge an Ir with a following Dr. */
	    if (ev2 && ev2->tag == Ev_Dr) {
	       /* Why is this true?  It's because we're merging an Ir
		  with a following Dr.  The Ir derives from the
		  instruction's IMark and the Dr from data
		  references which follow it.  In short it holds
		  because each insn starts with an IMark, hence an
		  Ev_Ir, and so these Dr must pertain to the
		  immediately preceding Ir.  Same applies to analogous
		  assertions in the subsequent cases. */
	       tl_assert(ev2->inode == ev->inode);
	       helperName = CLG_(cachesim).log_1I1Dr_name;
	       helperAddr = CLG_(cachesim).log_1I1Dr;
	       argv = mkIRExprVec_3( i_node_expr,
				     get_Event_dea(ev2),
				     mkIRExpr_HWord( get_Event_dszB(ev2) ) );
	       regparms = 3;
	       inew = i+2;
	    }
	    /* Merge an Ir with a following Dw/Dm. */
	    else
	    if (ev2 && (ev2->tag == Ev_Dw || ev2->tag == Ev_Dm)) {
	       tl_assert(ev2->inode == ev->inode);
	       helperName = CLG_(cachesim).log_1I1Dw_name;
	       helperAddr = CLG_(cachesim).log_1I1Dw;
	       argv = mkIRExprVec_3( i_node_expr,
				     get_Event_dea(ev2),
				     mkIRExpr_HWord( get_Event_dszB(ev2) ) );
	       regparms = 3;
	       inew = i+2;
	    }
	    /* Merge an Ir with two following Irs. */
	    else
	    if (ev2 && ev3 && ev2->tag == Ev_Ir && ev3->tag == Ev_Ir) {
	       helperName = CLG_(cachesim).log_3I0D_name;
	       helperAddr = CLG_(cachesim).log_3I0D;
	       argv = mkIRExprVec_3( i_node_expr,
				     mkIRExpr_HWord( (HWord)ev2->inode ),
				     mkIRExpr_HWord( (HWord)ev3->inode ) );
	       regparms = 3;
	       inew = i+3;
	    }
	    /* Merge an Ir with one following Ir. */
	    else
	    if (ev2 && ev2->tag == Ev_Ir) {
	       helperName = CLG_(cachesim).log_2I0D_name;
	       helperAddr = CLG_(cachesim).log_2I0D;
	       argv = mkIRExprVec_2( i_node_expr,
				     mkIRExpr_HWord( (HWord)ev2->inode ) );
	       regparms = 2;
	       inew = i+2;
	    }
	    /* No merging possible; emit as-is. */
	    else {
	       helperName = CLG_(cachesim).log_1I0D_name;
	       helperAddr = CLG_(cachesim).log_1I0D;
	       argv = mkIRExprVec_1( i_node_expr );
	       regparms = 1;
	       inew = i+1;
	    }
	    break;
	 case Ev_Dr:
	    /* Data read */
	    helperName = CLG_(cachesim).log_0I1Dr_name;
	    helperAddr = CLG_(cachesim).log_0I1Dr;
	    argv = mkIRExprVec_3( i_node_expr,
				  get_Event_dea(ev),
				  mkIRExpr_HWord( get_Event_dszB(ev) ) );
	    regparms = 3;
	    inew = i+1;
	    break;
	 case Ev_Dw:
	 case Ev_Dm:
	    /* Data write or modify */
	    helperName = CLG_(cachesim).log_0I1Dw_name;
	    helperAddr = CLG_(cachesim).log_0I1Dw;
	    argv = mkIRExprVec_3( i_node_expr,
				  get_Event_dea(ev),
				  mkIRExpr_HWord( get_Event_dszB(ev) ) );
	    regparms = 3;
	    inew = i+1;
	    break;
         case Ev_Bc:
            /* Conditional branch */
            helperName = "log_cond_branch";
            helperAddr = &log_cond_branch;
            argv = mkIRExprVec_2( i_node_expr, ev->Ev.Bc.taken );
            regparms = 2;
            inew = i+1;
            break;
         case Ev_Bi:
            /* Branch to an unknown destination */
            helperName = "log_ind_branch";
            helperAddr = &log_ind_branch;
            argv = mkIRExprVec_2( i_node_expr, ev->Ev.Bi.dst );
            regparms = 2;
            inew = i+1;
            break;
         case Ev_G:
            /* Global bus event (CAS, LOCK-prefix, LL-SC, etc) */
            helperName = "log_global_event";
            helperAddr = &log_global_event;
            argv = mkIRExprVec_1( i_node_expr );
            regparms = 1;
            inew = i+1;
            break;
	 default:
	    tl_assert(0);
      }

      CLG_DEBUGIF(5) {
	  if (inew > i+1) {
	      VG_(printf)("   merge ");
	      showEvent( ev2 );
	  }
	  if (inew > i+2) {
	      VG_(printf)("   merge ");
	      showEvent( ev3 );
	  }
	  if (helperAddr)
	      VG_(printf)("   call  %s (%p)\n",
			  helperName, helperAddr);
      }

      /* helper could be unset depending on the simulator used */
      if (helperAddr == 0) continue;

      /* Add the helper. */
      tl_assert(helperName);
      tl_assert(helperAddr);
      tl_assert(argv);
      di = unsafeIRDirty_0_N( regparms,
			      helperName, VG_(fnptr_to_fnentry)( helperAddr ),
			      argv );
      addStmtToIRSB( clgs->sbOut, IRStmt_Dirty(di) );
   }

   /* the queue is empty again */
   clgs->events_used = 0;
}
    598 
    599 static void addEvent_Ir ( ClgState* clgs, InstrInfo* inode )
    600 {
    601    Event* evt;
    602    tl_assert(clgs->seen_before || (inode->eventset == 0));
    603    if (!CLG_(clo).simulate_cache) return;
    604 
    605    if (clgs->events_used == N_EVENTS)
    606       flushEvents(clgs);
    607    tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
    608    evt = &clgs->events[clgs->events_used];
    609    init_Event(evt);
    610    evt->tag      = Ev_Ir;
    611    evt->inode    = inode;
    612    clgs->events_used++;
    613 }
    614 
    615 static
    616 void addEvent_Dr ( ClgState* clgs, InstrInfo* inode, Int datasize, IRAtom* ea )
    617 {
    618    Event* evt;
    619    tl_assert(isIRAtom(ea));
    620    tl_assert(datasize >= 1);
    621    if (!CLG_(clo).simulate_cache) return;
    622    tl_assert(datasize <= CLG_(min_line_size));
    623 
    624    if (clgs->events_used == N_EVENTS)
    625       flushEvents(clgs);
    626    tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
    627    evt = &clgs->events[clgs->events_used];
    628    init_Event(evt);
    629    evt->tag       = Ev_Dr;
    630    evt->inode     = inode;
    631    evt->Ev.Dr.szB = datasize;
    632    evt->Ev.Dr.ea  = ea;
    633    clgs->events_used++;
    634 }
    635 
    636 static
    637 void addEvent_Dw ( ClgState* clgs, InstrInfo* inode, Int datasize, IRAtom* ea )
    638 {
    639    Event* lastEvt;
    640    Event* evt;
    641    tl_assert(isIRAtom(ea));
    642    tl_assert(datasize >= 1);
    643    if (!CLG_(clo).simulate_cache) return;
    644    tl_assert(datasize <= CLG_(min_line_size));
    645 
    646    /* Is it possible to merge this write with the preceding read? */
    647    lastEvt = &clgs->events[clgs->events_used-1];
    648    if (clgs->events_used > 0
    649        && lastEvt->tag       == Ev_Dr
    650        && lastEvt->Ev.Dr.szB == datasize
    651        && lastEvt->inode     == inode
    652        && eqIRAtom(lastEvt->Ev.Dr.ea, ea))
    653    {
    654       lastEvt->tag   = Ev_Dm;
    655       return;
    656    }
    657 
    658    /* No.  Add as normal. */
    659    if (clgs->events_used == N_EVENTS)
    660       flushEvents(clgs);
    661    tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
    662    evt = &clgs->events[clgs->events_used];
    663    init_Event(evt);
    664    evt->tag       = Ev_Dw;
    665    evt->inode     = inode;
    666    evt->Ev.Dw.szB = datasize;
    667    evt->Ev.Dw.ea  = ea;
    668    clgs->events_used++;
    669 }
    670 
    671 static
    672 void addEvent_Bc ( ClgState* clgs, InstrInfo* inode, IRAtom* guard )
    673 {
    674    Event* evt;
    675    tl_assert(isIRAtom(guard));
    676    tl_assert(typeOfIRExpr(clgs->sbOut->tyenv, guard)
    677              == (sizeof(HWord)==4 ? Ity_I32 : Ity_I64));
    678    if (!CLG_(clo).simulate_branch) return;
    679 
    680    if (clgs->events_used == N_EVENTS)
    681       flushEvents(clgs);
    682    tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
    683    evt = &clgs->events[clgs->events_used];
    684    init_Event(evt);
    685    evt->tag         = Ev_Bc;
    686    evt->inode       = inode;
    687    evt->Ev.Bc.taken = guard;
    688    clgs->events_used++;
    689 }
    690 
    691 static
    692 void addEvent_Bi ( ClgState* clgs, InstrInfo* inode, IRAtom* whereTo )
    693 {
    694    Event* evt;
    695    tl_assert(isIRAtom(whereTo));
    696    tl_assert(typeOfIRExpr(clgs->sbOut->tyenv, whereTo)
    697              == (sizeof(HWord)==4 ? Ity_I32 : Ity_I64));
    698    if (!CLG_(clo).simulate_branch) return;
    699 
    700    if (clgs->events_used == N_EVENTS)
    701       flushEvents(clgs);
    702    tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
    703    evt = &clgs->events[clgs->events_used];
    704    init_Event(evt);
    705    evt->tag       = Ev_Bi;
    706    evt->inode     = inode;
    707    evt->Ev.Bi.dst = whereTo;
    708    clgs->events_used++;
    709 }
    710 
    711 static
    712 void addEvent_G ( ClgState* clgs, InstrInfo* inode )
    713 {
    714    Event* evt;
    715    if (!CLG_(clo).collect_bus) return;
    716 
    717    if (clgs->events_used == N_EVENTS)
    718       flushEvents(clgs);
    719    tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
    720    evt = &clgs->events[clgs->events_used];
    721    init_Event(evt);
    722    evt->tag       = Ev_G;
    723    evt->inode     = inode;
    724    clgs->events_used++;
    725 }
    726 
    727 /* Initialise or check (if already seen before) an InstrInfo for next insn.
    728    We only can set instr_offset/instr_size here. The required event set and
    729    resulting cost offset depend on events (Ir/Dr/Dw/Dm) in guest
    730    instructions. The event set is extended as required on flush of the event
    731    queue (when Dm events were determined), cost offsets are determined at
    732    end of BB instrumentation. */
    733 static
    734 InstrInfo* next_InstrInfo ( ClgState* clgs, UInt instr_size )
    735 {
    736    InstrInfo* ii;
    737    tl_assert(clgs->ii_index >= 0);
    738    tl_assert(clgs->ii_index < clgs->bb->instr_count);
    739    ii = &clgs->bb->instr[ clgs->ii_index ];
    740 
    741    if (clgs->seen_before) {
    742        CLG_ASSERT(ii->instr_offset == clgs->instr_offset);
    743        CLG_ASSERT(ii->instr_size == instr_size);
    744    }
    745    else {
    746        ii->instr_offset = clgs->instr_offset;
    747        ii->instr_size = instr_size;
    748        ii->cost_offset = 0;
    749        ii->eventset = 0;
    750    }
    751 
    752    clgs->ii_index++;
    753    clgs->instr_offset += instr_size;
    754    CLG_(stat).distinct_instrs++;
    755 
    756    return ii;
    757 }
    758 
    759 // return total number of cost values needed for this BB
    760 static
    761 UInt update_cost_offsets( ClgState* clgs )
    762 {
    763     Int i;
    764     InstrInfo* ii;
    765     UInt cost_offset = 0;
    766 
    767     CLG_ASSERT(clgs->bb->instr_count == clgs->ii_index);
    768     for(i=0; i<clgs->ii_index; i++) {
    769 	ii = &clgs->bb->instr[i];
    770 	if (clgs->seen_before) {
    771 	    CLG_ASSERT(ii->cost_offset == cost_offset);
    772 	} else
    773 	    ii->cost_offset = cost_offset;
    774 	cost_offset += ii->eventset ? ii->eventset->size : 0;
    775     }
    776 
    777     return cost_offset;
    778 }
    779 
    780 /*------------------------------------------------------------*/
    781 /*--- Instrumentation                                      ---*/
    782 /*------------------------------------------------------------*/
    783 
/* Endianness tag matching the VG_BIGENDIAN / VG_LITTLEENDIAN setting;
   passed to IRStmt_Store in addConstMemStoreStmt. */
#if defined(VG_BIGENDIAN)
# define CLGEndness Iend_BE
#elif defined(VG_LITTLEENDIAN)
# define CLGEndness Iend_LE
#else
# error "Unknown endianness"
#endif
    791 
    792 static
    793 Addr IRConst2Addr(IRConst* con)
    794 {
    795     Addr addr;
    796 
    797     if (sizeof(Addr) == 4) {
    798 	CLG_ASSERT( con->tag == Ico_U32 );
    799 	addr = con->Ico.U32;
    800     }
    801     else if (sizeof(Addr) == 8) {
    802 	CLG_ASSERT( con->tag == Ico_U64 );
    803 	addr = con->Ico.U64;
    804     }
    805     else
    806 	VG_(tool_panic)("Callgrind: invalid Addr type");
    807 
    808     return addr;
    809 }
    810 
    811 /* First pass over a BB to instrument, counting instructions and jumps
    812  * This is needed for the size of the BB struct to allocate
    813  *
    814  * Called from CLG_(get_bb)
    815  */
    816 void CLG_(collectBlockInfo)(IRSB* sbIn,
    817 			    /*INOUT*/ UInt* instrs,
    818 			    /*INOUT*/ UInt* cjmps,
    819 			    /*INOUT*/ Bool* cjmp_inverted)
    820 {
    821     Int i;
    822     IRStmt* st;
    823     Addr instrAddr =0, jumpDst;
    824     UInt instrLen = 0;
    825     Bool toNextInstr = False;
    826 
    827     // Ist_Exit has to be ignored in preamble code, before first IMark:
    828     // preamble code is added by VEX for self modifying code, and has
    829     // nothing to do with client code
    830     Bool inPreamble = True;
    831 
    832     if (!sbIn) return;
    833 
    834     for (i = 0; i < sbIn->stmts_used; i++) {
    835 	  st = sbIn->stmts[i];
    836 	  if (Ist_IMark == st->tag) {
    837 	      inPreamble = False;
    838 
    839 	      instrAddr = (Addr)ULong_to_Ptr(st->Ist.IMark.addr);
    840 	      instrLen  = st->Ist.IMark.len;
    841 
    842 	      (*instrs)++;
    843 	      toNextInstr = False;
    844 	  }
    845 	  if (inPreamble) continue;
    846 	  if (Ist_Exit == st->tag) {
    847 	      jumpDst = IRConst2Addr(st->Ist.Exit.dst);
    848 	      toNextInstr =  (jumpDst == instrAddr + instrLen);
    849 
    850 	      (*cjmps)++;
    851 	  }
    852     }
    853 
    854     /* if the last instructions of BB conditionally jumps to next instruction
    855      * (= first instruction of next BB in memory), this is a inverted by VEX.
    856      */
    857     *cjmp_inverted = toNextInstr;
    858 }
    859 
    860 static
    861 void addConstMemStoreStmt( IRSB* bbOut, UWord addr, UInt val, IRType hWordTy)
    862 {
    863     addStmtToIRSB( bbOut,
    864 		   IRStmt_Store(CLGEndness,
    865 				IRExpr_Const(hWordTy == Ity_I32 ?
    866 					     IRConst_U32( addr ) :
    867 					     IRConst_U64( addr )),
    868 				IRExpr_Const(IRConst_U32(val)) ));
    869 }
    870 
    871 
    872 /* add helper call to setup_bbcc, with pointer to BB struct as argument
    873  *
    874  * precondition for setup_bbcc:
    875  * - jmps_passed has number of cond.jumps passed in last executed BB
    876  * - current_bbcc has a pointer to the BBCC of the last executed BB
    877  *   Thus, if bbcc_jmpkind is != -1 (JmpNone),
    878  *     current_bbcc->bb->jmp_addr
    879  *   gives the address of the jump source.
    880  *
    881  * the setup does 2 things:
    882  * - trace call:
    883  *   * Unwind own call stack, i.e sync our ESP with real ESP
    884  *     This is for ESP manipulation (longjmps, C++ exec handling) and RET
    885  *   * For CALLs or JMPs crossing objects, record call arg +
    886  *     push are on own call stack
    887  *
    888  * - prepare for cache log functions:
    889  *   set current_bbcc to BBCC that gets the costs for this BB execution
    890  *   attached
    891  */
    892 static
    893 void addBBSetupCall(ClgState* clgs)
    894 {
    895    IRDirty* di;
    896    IRExpr  *arg1, **argv;
    897 
    898    arg1 = mkIRExpr_HWord( (HWord)clgs->bb );
    899    argv = mkIRExprVec_1(arg1);
    900    di = unsafeIRDirty_0_N( 1, "setup_bbcc",
    901 			      VG_(fnptr_to_fnentry)( & CLG_(setup_bbcc) ),
    902 			      argv);
    903    addStmtToIRSB( clgs->sbOut, IRStmt_Dirty(di) );
    904 }
    905 
    906 
    907 static
    908 IRSB* CLG_(instrument)( VgCallbackClosure* closure,
    909 			IRSB* sbIn,
    910 			VexGuestLayout* layout,
    911 			VexGuestExtents* vge,
    912 			IRType gWordTy, IRType hWordTy )
    913 {
    914    Int      i;
    915    IRStmt*  st;
    916    Addr     origAddr;
    917    InstrInfo* curr_inode = NULL;
    918    ClgState clgs;
    919    UInt     cJumps = 0;
    920 
    921 
    922    if (gWordTy != hWordTy) {
    923       /* We don't currently support this case. */
    924       VG_(tool_panic)("host/guest word size mismatch");
    925    }
    926 
    927    // No instrumentation if it is switched off
    928    if (! CLG_(instrument_state)) {
    929        CLG_DEBUG(5, "instrument(BB %#lx) [Instrumentation OFF]\n",
    930 		 (Addr)closure->readdr);
    931        return sbIn;
    932    }
    933 
    934    CLG_DEBUG(3, "+ instrument(BB %#lx)\n", (Addr)closure->readdr);
    935 
    936    /* Set up SB for instrumented IR */
    937    clgs.sbOut = deepCopyIRSBExceptStmts(sbIn);
    938 
    939    // Copy verbatim any IR preamble preceding the first IMark
    940    i = 0;
    941    while (i < sbIn->stmts_used && sbIn->stmts[i]->tag != Ist_IMark) {
    942       addStmtToIRSB( clgs.sbOut, sbIn->stmts[i] );
    943       i++;
    944    }
    945 
    946    // Get the first statement, and origAddr from it
    947    CLG_ASSERT(sbIn->stmts_used >0);
    948    CLG_ASSERT(i < sbIn->stmts_used);
    949    st = sbIn->stmts[i];
    950    CLG_ASSERT(Ist_IMark == st->tag);
    951 
    952    origAddr = (Addr)st->Ist.IMark.addr + (Addr)st->Ist.IMark.delta;
    953    CLG_ASSERT(origAddr == st->Ist.IMark.addr
    954                           + st->Ist.IMark.delta);  // XXX: check no overflow
    955 
    956    /* Get BB struct (creating if necessary).
    957     * JS: The hash table is keyed with orig_addr_noredir -- important!
    958     * JW: Why? If it is because of different chasing of the redirection,
    959     *     this is not needed, as chasing is switched off in callgrind
    960     */
    961    clgs.bb = CLG_(get_bb)(origAddr, sbIn, &(clgs.seen_before));
    962 
    963    addBBSetupCall(&clgs);
    964 
    965    // Set up running state
    966    clgs.events_used = 0;
    967    clgs.ii_index = 0;
    968    clgs.instr_offset = 0;
    969 
    970    for (/*use current i*/; i < sbIn->stmts_used; i++) {
    971 
    972       st = sbIn->stmts[i];
    973       CLG_ASSERT(isFlatIRStmt(st));
    974 
    975       switch (st->tag) {
    976 	 case Ist_NoOp:
    977 	 case Ist_AbiHint:
    978 	 case Ist_Put:
    979 	 case Ist_PutI:
    980 	 case Ist_MBE:
    981 	    break;
    982 
    983 	 case Ist_IMark: {
    984             Addr64 cia   = st->Ist.IMark.addr + st->Ist.IMark.delta;
    985             Int    isize = st->Ist.IMark.len;
    986             CLG_ASSERT(clgs.instr_offset == (Addr)cia - origAddr);
    987 	    // If Vex fails to decode an instruction, the size will be zero.
    988 	    // Pretend otherwise.
    989 	    if (isize == 0) isize = VG_MIN_INSTR_SZB;
    990 
    991 	    // Sanity-check size.
    992 	    tl_assert( (VG_MIN_INSTR_SZB <= isize && isize <= VG_MAX_INSTR_SZB)
    993 		     || VG_CLREQ_SZB == isize );
    994 
    995 	    // Init the inode, record it as the current one.
    996 	    // Subsequent Dr/Dw/Dm events from the same instruction will
    997 	    // also use it.
    998 	    curr_inode = next_InstrInfo (&clgs, isize);
    999 
   1000 	    addEvent_Ir( &clgs, curr_inode );
   1001 	    break;
   1002 	 }
   1003 
   1004 	 case Ist_WrTmp: {
   1005 	    IRExpr* data = st->Ist.WrTmp.data;
   1006 	    if (data->tag == Iex_Load) {
   1007 	       IRExpr* aexpr = data->Iex.Load.addr;
   1008 	       // Note also, endianness info is ignored.  I guess
   1009 	       // that's not interesting.
   1010 	       addEvent_Dr( &clgs, curr_inode,
   1011 			    sizeofIRType(data->Iex.Load.ty), aexpr );
   1012 	    }
   1013 	    break;
   1014 	 }
   1015 
   1016 	 case Ist_Store: {
   1017 	    IRExpr* data  = st->Ist.Store.data;
   1018 	    IRExpr* aexpr = st->Ist.Store.addr;
   1019 	    addEvent_Dw( &clgs, curr_inode,
   1020 			 sizeofIRType(typeOfIRExpr(sbIn->tyenv, data)), aexpr );
   1021 	    break;
   1022 	 }
   1023 
   1024 	 case Ist_Dirty: {
   1025 	    Int      dataSize;
   1026 	    IRDirty* d = st->Ist.Dirty.details;
   1027 	    if (d->mFx != Ifx_None) {
   1028 	       /* This dirty helper accesses memory.  Collect the details. */
   1029 	       tl_assert(d->mAddr != NULL);
   1030 	       tl_assert(d->mSize != 0);
   1031 	       dataSize = d->mSize;
   1032 	       // Large (eg. 28B, 108B, 512B on x86) data-sized
   1033 	       // instructions will be done inaccurately, but they're
   1034 	       // very rare and this avoids errors from hitting more
   1035 	       // than two cache lines in the simulation.
   1036 	       if (CLG_(clo).simulate_cache && dataSize > CLG_(min_line_size))
   1037 		  dataSize = CLG_(min_line_size);
   1038 	       if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify)
   1039 		  addEvent_Dr( &clgs, curr_inode, dataSize, d->mAddr );
   1040 	       if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify)
   1041 		  addEvent_Dw( &clgs, curr_inode, dataSize, d->mAddr );
   1042 	    } else {
   1043 	       tl_assert(d->mAddr == NULL);
   1044 	       tl_assert(d->mSize == 0);
   1045 	    }
   1046 	    break;
   1047 	 }
   1048 
   1049          case Ist_CAS: {
   1050             /* We treat it as a read and a write of the location.  I
   1051                think that is the same behaviour as it was before IRCAS
   1052                was introduced, since prior to that point, the Vex
   1053                front ends would translate a lock-prefixed instruction
   1054                into a (normal) read followed by a (normal) write. */
   1055             Int    dataSize;
   1056             IRCAS* cas = st->Ist.CAS.details;
   1057             CLG_ASSERT(cas->addr && isIRAtom(cas->addr));
   1058             CLG_ASSERT(cas->dataLo);
   1059             dataSize = sizeofIRType(typeOfIRExpr(sbIn->tyenv, cas->dataLo));
   1060             if (cas->dataHi != NULL)
   1061                dataSize *= 2; /* since this is a doubleword-cas */
   1062             addEvent_Dr( &clgs, curr_inode, dataSize, cas->addr );
   1063             addEvent_Dw( &clgs, curr_inode, dataSize, cas->addr );
   1064             addEvent_G(  &clgs, curr_inode );
   1065             break;
   1066          }
   1067 
   1068          case Ist_LLSC: {
   1069             IRType dataTy;
   1070             if (st->Ist.LLSC.storedata == NULL) {
   1071                /* LL */
   1072                dataTy = typeOfIRTemp(sbIn->tyenv, st->Ist.LLSC.result);
   1073                addEvent_Dr( &clgs, curr_inode,
   1074                             sizeofIRType(dataTy), st->Ist.LLSC.addr );
   1075             } else {
   1076                /* SC */
   1077                dataTy = typeOfIRExpr(sbIn->tyenv, st->Ist.LLSC.storedata);
   1078                addEvent_Dw( &clgs, curr_inode,
   1079                             sizeofIRType(dataTy), st->Ist.LLSC.addr );
   1080                /* I don't know whether the global-bus-lock cost should
   1081                   be attributed to the LL or the SC, but it doesn't
   1082                   really matter since they always have to be used in
   1083                   pairs anyway.  Hence put it (quite arbitrarily) on
   1084                   the SC. */
   1085                addEvent_G(  &clgs, curr_inode );
   1086             }
   1087             break;
   1088          }
   1089 
   1090  	 case Ist_Exit: {
   1091             Bool guest_exit, inverted;
   1092 
   1093             /* VEX code generation sometimes inverts conditional branches.
   1094              * As Callgrind counts (conditional) jumps, it has to correct
   1095              * inversions. The heuristic is the following:
   1096              * (1) Callgrind switches off SB chasing and unrolling, and
   1097              *     therefore it assumes that a candidate for inversion only is
   1098              *     the last conditional branch in an SB.
   1099              * (2) inversion is assumed if the branch jumps to the address of
   1100              *     the next guest instruction in memory.
   1101              * This heuristic is precalculated in CLG_(collectBlockInfo)().
   1102              *
   1103              * Branching behavior is also used for branch prediction. Note that
   1104              * above heuristic is different from what Cachegrind does.
   1105              * Cachegrind uses (2) for all branches.
   1106              */
   1107             if (cJumps+1 == clgs.bb->cjmp_count)
   1108                 inverted = clgs.bb->cjmp_inverted;
   1109             else
   1110                 inverted = False;
   1111 
   1112             // call branch predictor only if this is a branch in guest code
   1113             guest_exit = (st->Ist.Exit.jk == Ijk_Boring) ||
   1114                          (st->Ist.Exit.jk == Ijk_Call) ||
   1115                          (st->Ist.Exit.jk == Ijk_Ret);
   1116 
   1117             if (guest_exit) {
   1118                 /* Stuff to widen the guard expression to a host word, so
   1119                    we can pass it to the branch predictor simulation
   1120                    functions easily. */
   1121                 IRType   tyW    = hWordTy;
   1122                 IROp     widen  = tyW==Ity_I32  ? Iop_1Uto32  : Iop_1Uto64;
   1123                 IROp     opXOR  = tyW==Ity_I32  ? Iop_Xor32   : Iop_Xor64;
   1124                 IRTemp   guard1 = newIRTemp(clgs.sbOut->tyenv, Ity_I1);
   1125                 IRTemp   guardW = newIRTemp(clgs.sbOut->tyenv, tyW);
   1126                 IRTemp   guard  = newIRTemp(clgs.sbOut->tyenv, tyW);
   1127                 IRExpr*  one    = tyW==Ity_I32 ? IRExpr_Const(IRConst_U32(1))
   1128                                                : IRExpr_Const(IRConst_U64(1));
   1129 
   1130                 /* Widen the guard expression. */
   1131                 addStmtToIRSB( clgs.sbOut,
   1132                                IRStmt_WrTmp( guard1, st->Ist.Exit.guard ));
   1133                 addStmtToIRSB( clgs.sbOut,
   1134                                IRStmt_WrTmp( guardW,
   1135                                              IRExpr_Unop(widen,
   1136                                                          IRExpr_RdTmp(guard1))) );
   1137                 /* If the exit is inverted, invert the sense of the guard. */
   1138                 addStmtToIRSB(
   1139                         clgs.sbOut,
   1140                         IRStmt_WrTmp(
   1141                                 guard,
   1142                                 inverted ? IRExpr_Binop(opXOR, IRExpr_RdTmp(guardW), one)
   1143                                     : IRExpr_RdTmp(guardW)
   1144                                     ));
   1145                 /* And post the event. */
   1146                 addEvent_Bc( &clgs, curr_inode, IRExpr_RdTmp(guard) );
   1147             }
   1148 
   1149 	    /* We may never reach the next statement, so need to flush
   1150 	       all outstanding transactions now. */
   1151 	    flushEvents( &clgs );
   1152 
   1153 	    CLG_ASSERT(clgs.ii_index>0);
   1154 	    if (!clgs.seen_before) {
   1155 	      ClgJumpKind jk;
   1156 
   1157 	      if      (st->Ist.Exit.jk == Ijk_Call) jk = jk_Call;
   1158 	      else if (st->Ist.Exit.jk == Ijk_Ret)  jk = jk_Return;
   1159 	      else {
   1160 		if (IRConst2Addr(st->Ist.Exit.dst) ==
   1161 		    origAddr + curr_inode->instr_offset + curr_inode->instr_size)
   1162 		  jk = jk_None;
   1163 		else
   1164 		  jk = jk_Jump;
   1165 	      }
   1166 
   1167 	      clgs.bb->jmp[cJumps].instr = clgs.ii_index-1;
   1168 	      clgs.bb->jmp[cJumps].jmpkind = jk;
   1169 	    }
   1170 
   1171 	    /* Update global variable jmps_passed before the jump
   1172 	     * A correction is needed if VEX inverted the last jump condition
   1173 	    */
   1174 	    addConstMemStoreStmt( clgs.sbOut,
   1175 				  (UWord) &CLG_(current_state).jmps_passed,
   1176                                   inverted ? cJumps+1 : cJumps, hWordTy);
   1177 	    cJumps++;
   1178 
   1179 	    break;
   1180 	 }
   1181 
   1182 	 default:
   1183 	    tl_assert(0);
   1184 	    break;
   1185       }
   1186 
   1187       /* Copy the original statement */
   1188       addStmtToIRSB( clgs.sbOut, st );
   1189 
   1190       CLG_DEBUGIF(5) {
   1191 	 VG_(printf)("   pass  ");
   1192 	 ppIRStmt(st);
   1193 	 VG_(printf)("\n");
   1194       }
   1195    }
   1196 
   1197    /* Deal with branches to unknown destinations.  Except ignore ones
   1198       which are function returns as we assume the return stack
   1199       predictor never mispredicts. */
   1200    if ((sbIn->jumpkind == Ijk_Boring) || (sbIn->jumpkind == Ijk_Call)) {
   1201       if (0) { ppIRExpr( sbIn->next ); VG_(printf)("\n"); }
   1202       switch (sbIn->next->tag) {
   1203          case Iex_Const:
   1204             break; /* boring - branch to known address */
   1205          case Iex_RdTmp:
   1206             /* looks like an indirect branch (branch to unknown) */
   1207             addEvent_Bi( &clgs, curr_inode, sbIn->next );
   1208             break;
   1209          default:
   1210             /* shouldn't happen - if the incoming IR is properly
   1211                flattened, should only have tmp and const cases to
   1212                consider. */
   1213             tl_assert(0);
   1214       }
   1215    }
   1216 
   1217    /* At the end of the bb.  Flush outstandings. */
   1218    flushEvents( &clgs );
   1219 
   1220    /* Always update global variable jmps_passed at end of bb.
   1221     * A correction is needed if VEX inverted the last jump condition
   1222     */
   1223    {
   1224       UInt jmps_passed = cJumps;
   1225       if (clgs.bb->cjmp_inverted) jmps_passed--;
   1226       addConstMemStoreStmt( clgs.sbOut,
   1227 			    (UWord) &CLG_(current_state).jmps_passed,
   1228 			    jmps_passed, hWordTy);
   1229    }
   1230    CLG_ASSERT(clgs.bb->cjmp_count == cJumps);
   1231    CLG_ASSERT(clgs.bb->instr_count = clgs.ii_index);
   1232 
   1233    /* Info for final exit from BB */
   1234    {
   1235      ClgJumpKind jk;
   1236 
   1237      if      (sbIn->jumpkind == Ijk_Call) jk = jk_Call;
   1238      else if (sbIn->jumpkind == Ijk_Ret)  jk = jk_Return;
   1239      else {
   1240        jk = jk_Jump;
   1241        if ((sbIn->next->tag == Iex_Const) &&
   1242 	   (IRConst2Addr(sbIn->next->Iex.Const.con) ==
   1243 	    origAddr + clgs.instr_offset))
   1244 	 jk = jk_None;
   1245      }
   1246      clgs.bb->jmp[cJumps].jmpkind = jk;
   1247      /* Instruction index of the call/ret at BB end
   1248       * (it is wrong for fall-through, but does not matter) */
   1249      clgs.bb->jmp[cJumps].instr = clgs.ii_index-1;
   1250    }
   1251 
   1252    /* swap information of last exit with final exit if inverted */
   1253    if (clgs.bb->cjmp_inverted) {
   1254      ClgJumpKind jk;
   1255      UInt instr;
   1256 
   1257      jk = clgs.bb->jmp[cJumps].jmpkind;
   1258      clgs.bb->jmp[cJumps].jmpkind = clgs.bb->jmp[cJumps-1].jmpkind;
   1259      clgs.bb->jmp[cJumps-1].jmpkind = jk;
   1260      instr = clgs.bb->jmp[cJumps].instr;
   1261      clgs.bb->jmp[cJumps].instr = clgs.bb->jmp[cJumps-1].instr;
   1262      clgs.bb->jmp[cJumps-1].instr = instr;
   1263    }
   1264 
   1265    if (clgs.seen_before) {
   1266        CLG_ASSERT(clgs.bb->cost_count == update_cost_offsets(&clgs));
   1267        CLG_ASSERT(clgs.bb->instr_len = clgs.instr_offset);
   1268    }
   1269    else {
   1270        clgs.bb->cost_count = update_cost_offsets(&clgs);
   1271        clgs.bb->instr_len = clgs.instr_offset;
   1272    }
   1273 
   1274    CLG_DEBUG(3, "- instrument(BB %#lx): byteLen %u, CJumps %u, CostLen %u\n",
   1275 	     origAddr, clgs.bb->instr_len,
   1276 	     clgs.bb->cjmp_count, clgs.bb->cost_count);
   1277    if (cJumps>0) {
   1278        CLG_DEBUG(3, "                     [ ");
   1279        for (i=0;i<cJumps;i++)
   1280 	   CLG_DEBUG(3, "%d ", clgs.bb->jmp[i].instr);
   1281        CLG_DEBUG(3, "], last inverted: %s \n",
   1282 		 clgs.bb->cjmp_inverted ? "yes":"no");
   1283    }
   1284 
   1285   return clgs.sbOut;
   1286 }
   1287 
   1288 /*--------------------------------------------------------------------*/
   1289 /*--- Discarding BB info                                           ---*/
   1290 /*--------------------------------------------------------------------*/
   1291 
   1292 // Called when a translation is removed from the translation cache for
   1293 // any reason at all: to free up space, because the guest code was
   1294 // unmapped or modified, or for any arbitrary reason.
   1295 static
   1296 void clg_discard_superblock_info ( Addr64 orig_addr64, VexGuestExtents vge )
   1297 {
   1298     Addr orig_addr = (Addr)orig_addr64;
   1299 
   1300     tl_assert(vge.n_used > 0);
   1301 
   1302    if (0)
   1303       VG_(printf)( "discard_superblock_info: %p, %p, %llu\n",
   1304                    (void*)(Addr)orig_addr,
   1305                    (void*)(Addr)vge.base[0], (ULong)vge.len[0]);
   1306 
   1307    // Get BB info, remove from table, free BB info.  Simple!  Note that we
   1308    // use orig_addr, not the first instruction address in vge.
   1309    CLG_(delete_bb)(orig_addr);
   1310 }
   1311 
   1312 
   1313 /*------------------------------------------------------------*/
   1314 /*--- CLG_(fini)() and related function                     ---*/
   1315 /*------------------------------------------------------------*/
   1316 
   1317 
   1318 
   1319 static void zero_thread_cost(thread_info* t)
   1320 {
   1321   Int i;
   1322 
   1323   for(i = 0; i < CLG_(current_call_stack).sp; i++) {
   1324     if (!CLG_(current_call_stack).entry[i].jcc) continue;
   1325 
   1326     /* reset call counters to current for active calls */
   1327     CLG_(copy_cost)( CLG_(sets).full,
   1328 		    CLG_(current_call_stack).entry[i].enter_cost,
   1329 		    CLG_(current_state).cost );
   1330     CLG_(current_call_stack).entry[i].jcc->call_counter = 0;
   1331   }
   1332 
   1333   CLG_(forall_bbccs)(CLG_(zero_bbcc));
   1334 
   1335   /* set counter for last dump */
   1336   CLG_(copy_cost)( CLG_(sets).full,
   1337 		  t->lastdump_cost, CLG_(current_state).cost );
   1338 }
   1339 
   1340 void CLG_(zero_all_cost)(Bool only_current_thread)
   1341 {
   1342   if (VG_(clo_verbosity) > 1)
   1343     VG_(message)(Vg_DebugMsg, "  Zeroing costs...\n");
   1344 
   1345   if (only_current_thread)
   1346     zero_thread_cost(CLG_(get_current_thread)());
   1347   else
   1348     CLG_(forall_threads)(zero_thread_cost);
   1349 
   1350   if (VG_(clo_verbosity) > 1)
   1351     VG_(message)(Vg_DebugMsg, "  ...done\n");
   1352 }
   1353 
   1354 static
   1355 void unwind_thread(thread_info* t)
   1356 {
   1357   /* unwind signal handlers */
   1358   while(CLG_(current_state).sig !=0)
   1359     CLG_(post_signal)(CLG_(current_tid),CLG_(current_state).sig);
   1360 
   1361   /* unwind regular call stack */
   1362   while(CLG_(current_call_stack).sp>0)
   1363     CLG_(pop_call_stack)();
   1364 
   1365   /* reset context and function stack for context generation */
   1366   CLG_(init_exec_state)( &CLG_(current_state) );
   1367   CLG_(current_fn_stack).top = CLG_(current_fn_stack).bottom;
   1368 }
   1369 
   1370 static
   1371 void zero_state_cost(thread_info* t)
   1372 {
   1373     CLG_(zero_cost)( CLG_(sets).full, CLG_(current_state).cost );
   1374 }
   1375 
   1376 /* Ups, this can go very wrong... */
   1377 extern void VG_(discard_translations) ( Addr64 start, ULong range, HChar* who );
   1378 
   1379 void CLG_(set_instrument_state)(Char* reason, Bool state)
   1380 {
   1381   if (CLG_(instrument_state) == state) {
   1382     CLG_DEBUG(2, "%s: instrumentation already %s\n",
   1383 	     reason, state ? "ON" : "OFF");
   1384     return;
   1385   }
   1386   CLG_(instrument_state) = state;
   1387   CLG_DEBUG(2, "%s: Switching instrumentation %s ...\n",
   1388 	   reason, state ? "ON" : "OFF");
   1389 
   1390   VG_(discard_translations)( (Addr64)0x1000, (ULong) ~0xfffl, "callgrind");
   1391 
   1392   /* reset internal state: call stacks, simulator */
   1393   CLG_(forall_threads)(unwind_thread);
   1394   CLG_(forall_threads)(zero_state_cost);
   1395   (*CLG_(cachesim).clear)();
   1396 
   1397   if (VG_(clo_verbosity) > 1)
   1398     VG_(message)(Vg_DebugMsg, "%s: instrumentation switched %s\n",
   1399 		 reason, state ? "ON" : "OFF");
   1400 }
   1401 
   1402 /* helper for dump_state_togdb */
   1403 static void dump_state_of_thread_togdb(thread_info* ti)
   1404 {
   1405     static Char buf[512];
   1406     static FullCost sum = 0, tmp = 0;
   1407     Int t, p, i;
   1408     BBCC *from, *to;
   1409     call_entry* ce;
   1410 
   1411     t = CLG_(current_tid);
   1412     CLG_(init_cost_lz)( CLG_(sets).full, &sum );
   1413     CLG_(copy_cost_lz)( CLG_(sets).full, &tmp, ti->lastdump_cost );
   1414     CLG_(add_diff_cost)( CLG_(sets).full, sum, ti->lastdump_cost,
   1415 			 ti->states.entry[0]->cost);
   1416     CLG_(copy_cost)( CLG_(sets).full, ti->lastdump_cost, tmp );
   1417     CLG_(sprint_mappingcost)(buf, CLG_(dumpmap), sum);
   1418     VG_(gdb_printf)("events-%d: %s\n", t, buf);
   1419     VG_(gdb_printf)("frames-%d: %d\n", t, CLG_(current_call_stack).sp);
   1420 
   1421     ce = 0;
   1422     for(i = 0; i < CLG_(current_call_stack).sp; i++) {
   1423       ce = CLG_(get_call_entry)(i);
   1424       /* if this frame is skipped, we don't have counters */
   1425       if (!ce->jcc) continue;
   1426 
   1427       from = ce->jcc->from;
   1428       VG_(gdb_printf)("function-%d-%d: %s\n",t, i, from->cxt->fn[0]->name);
   1429       VG_(gdb_printf)("calls-%d-%d: %llu\n",t, i, ce->jcc->call_counter);
   1430 
   1431       /* FIXME: EventSets! */
   1432       CLG_(copy_cost)( CLG_(sets).full, sum, ce->jcc->cost );
   1433       CLG_(copy_cost)( CLG_(sets).full, tmp, ce->enter_cost );
   1434       CLG_(add_diff_cost)( CLG_(sets).full, sum,
   1435 			  ce->enter_cost, CLG_(current_state).cost );
   1436       CLG_(copy_cost)( CLG_(sets).full, ce->enter_cost, tmp );
   1437 
   1438       p = VG_(sprintf)(buf, "events-%d-%d: ",t, i);
   1439       CLG_(sprint_mappingcost)(buf + p, CLG_(dumpmap), sum );
   1440       VG_(gdb_printf)("%s\n", buf);
   1441     }
   1442     if (ce && ce->jcc) {
   1443       to = ce->jcc->to;
   1444       VG_(gdb_printf)("function-%d-%d: %s\n",t, i, to->cxt->fn[0]->name );
   1445     }
   1446 }
   1447 
   1448 /* Dump current state */
   1449 static void dump_state_togdb(void)
   1450 {
   1451     static Char buf[512];
   1452     thread_info** th;
   1453     int t, p;
   1454     Int orig_tid = CLG_(current_tid);
   1455 
   1456     VG_(gdb_printf)("instrumentation: %s\n",
   1457 		    CLG_(instrument_state) ? "on":"off");
   1458     if (!CLG_(instrument_state)) return;
   1459 
   1460     VG_(gdb_printf)("executed-bbs: %llu\n", CLG_(stat).bb_executions);
   1461     VG_(gdb_printf)("executed-calls: %llu\n", CLG_(stat).call_counter);
   1462     VG_(gdb_printf)("distinct-bbs: %d\n", CLG_(stat).distinct_bbs);
   1463     VG_(gdb_printf)("distinct-calls: %d\n", CLG_(stat).distinct_jccs);
   1464     VG_(gdb_printf)("distinct-functions: %d\n", CLG_(stat).distinct_fns);
   1465     VG_(gdb_printf)("distinct-contexts: %d\n", CLG_(stat).distinct_contexts);
   1466 
   1467     /* "events:" line. Given here because it will be dynamic in the future */
   1468     p = VG_(sprintf)(buf, "events: ");
   1469     CLG_(sprint_eventmapping)(buf+p, CLG_(dumpmap));
   1470     VG_(gdb_printf)("%s\n", buf);
   1471     /* "part:" line (number of last part. Is 0 at start */
   1472     VG_(gdb_printf)("part: %d\n", CLG_(get_dump_counter)());
   1473 
   1474     /* threads */
   1475     th = CLG_(get_threads)();
   1476     p = VG_(sprintf)(buf, "threads:");
   1477     for(t=1;t<VG_N_THREADS;t++) {
   1478 	if (!th[t]) continue;
   1479 	p += VG_(sprintf)(buf+p, " %d", t);
   1480     }
   1481     VG_(gdb_printf)("%s\n", buf);
   1482     VG_(gdb_printf)("current-tid: %d\n", orig_tid);
   1483     CLG_(forall_threads)(dump_state_of_thread_togdb);
   1484 }
   1485 
   1486 
   1487 static void print_monitor_help ( void )
   1488 {
   1489    VG_(gdb_printf) ("\n");
   1490    VG_(gdb_printf) ("callgrind monitor commands:\n");
   1491    VG_(gdb_printf) ("  dump [<dump_hint>]\n");
   1492    VG_(gdb_printf) ("        dump counters\n");
   1493    VG_(gdb_printf) ("  zero\n");
   1494    VG_(gdb_printf) ("        zero counters\n");
   1495    VG_(gdb_printf) ("  status\n");
   1496    VG_(gdb_printf) ("        print status\n");
   1497    VG_(gdb_printf) ("  instrumentation [on|off]\n");
   1498    VG_(gdb_printf) ("        get/set (if on/off given) instrumentation state\n");
   1499    VG_(gdb_printf) ("\n");
   1500 }
   1501 
   1502 /* return True if request recognised, False otherwise */
   1503 static Bool handle_gdb_monitor_command (ThreadId tid, Char *req)
   1504 {
   1505    Char* wcmd;
   1506    Char s[VG_(strlen(req))]; /* copy for strtok_r */
   1507    Char *ssaveptr;
   1508 
   1509    VG_(strcpy) (s, req);
   1510 
   1511    wcmd = VG_(strtok_r) (s, " ", &ssaveptr);
   1512    switch (VG_(keyword_id) ("help dump zero status instrumentation",
   1513                             wcmd, kwd_report_duplicated_matches)) {
   1514    case -2: /* multiple matches */
   1515       return True;
   1516    case -1: /* not found */
   1517       return False;
   1518    case  0: /* help */
   1519       print_monitor_help();
   1520       return True;
   1521    case  1: { /* dump */
   1522       CLG_(dump_profile)(req, False);
   1523       return True;
   1524    }
   1525    case  2: { /* zero */
   1526       CLG_(zero_all_cost)(False);
   1527       return True;
   1528    }
   1529 
   1530    case 3: { /* status */
   1531      Char* arg = VG_(strtok_r) (0, " ", &ssaveptr);
   1532      if (arg && (VG_(strcmp)(arg, "internal") == 0)) {
   1533        /* internal interface to callgrind_control */
   1534        dump_state_togdb();
   1535        return True;
   1536      }
   1537 
   1538      if (!CLG_(instrument_state)) {
   1539        VG_(gdb_printf)("No status available as instrumentation is switched off\n");
   1540      } else {
   1541        // Status information to be improved ...
   1542        thread_info** th = CLG_(get_threads)();
   1543        Int t, tcount = 0;
   1544        for(t=1;t<VG_N_THREADS;t++)
   1545 	 if (th[t]) tcount++;
   1546        VG_(gdb_printf)("%d thread(s) running.\n", tcount);
   1547      }
   1548      return True;
   1549    }
   1550 
   1551    case 4: { /* instrumentation */
   1552      Char* arg = VG_(strtok_r) (0, " ", &ssaveptr);
   1553      if (!arg) {
   1554        VG_(gdb_printf)("instrumentation: %s\n",
   1555 		       CLG_(instrument_state) ? "on":"off");
   1556      }
   1557      else
   1558        CLG_(set_instrument_state)("Command", VG_(strcmp)(arg,"off")!=0);
   1559      return True;
   1560    }
   1561 
   1562    default:
   1563       tl_assert(0);
   1564       return False;
   1565    }
   1566 }
   1567 
   1568 static
   1569 Bool CLG_(handle_client_request)(ThreadId tid, UWord *args, UWord *ret)
   1570 {
   1571    if (!VG_IS_TOOL_USERREQ('C','T',args[0])
   1572        && VG_USERREQ__GDB_MONITOR_COMMAND   != args[0])
   1573       return False;
   1574 
   1575    switch(args[0]) {
   1576    case VG_USERREQ__DUMP_STATS:
   1577       CLG_(dump_profile)("Client Request", True);
   1578       *ret = 0;                 /* meaningless */
   1579       break;
   1580 
   1581    case VG_USERREQ__DUMP_STATS_AT:
   1582      {
   1583        Char buf[512];
   1584        VG_(sprintf)(buf,"Client Request: %s", (Char*)args[1]);
   1585        CLG_(dump_profile)(buf, True);
   1586        *ret = 0;                 /* meaningless */
   1587      }
   1588      break;
   1589 
   1590    case VG_USERREQ__ZERO_STATS:
   1591      CLG_(zero_all_cost)(True);
   1592       *ret = 0;                 /* meaningless */
   1593       break;
   1594 
   1595    case VG_USERREQ__TOGGLE_COLLECT:
   1596      CLG_(current_state).collect = !CLG_(current_state).collect;
   1597      CLG_DEBUG(2, "Client Request: toggled collection state to %s\n",
   1598 	      CLG_(current_state).collect ? "ON" : "OFF");
   1599      *ret = 0;                 /* meaningless */
   1600      break;
   1601 
   1602    case VG_USERREQ__START_INSTRUMENTATION:
   1603      CLG_(set_instrument_state)("Client Request", True);
   1604      *ret = 0;                 /* meaningless */
   1605      break;
   1606 
   1607    case VG_USERREQ__STOP_INSTRUMENTATION:
   1608      CLG_(set_instrument_state)("Client Request", False);
   1609      *ret = 0;                 /* meaningless */
   1610      break;
   1611 
   1612    case VG_USERREQ__GDB_MONITOR_COMMAND: {
   1613       Bool handled = handle_gdb_monitor_command (tid, (Char*)args[1]);
   1614       if (handled)
   1615          *ret = 1;
   1616       else
   1617          *ret = 0;
   1618       return handled;
   1619    }
   1620    default:
   1621       return False;
   1622    }
   1623 
   1624    return True;
   1625 }
   1626 
   1627 
/* Syscall Timing */

/* syscalltime[] holds, per thread id, the timestamp stored by
 * CLG_(pre_syscalltime) on syscall entry; CLG_(post_syscalltime)
 * subtracts it from the time at syscall exit.
 * With CLG_MICROSYSTIME the value is in microseconds (taken from
 * gettimeofday), otherwise it is whatever
 * VG_(read_millisecond_timer) returns. */
/* struct timeval syscalltime[VG_N_THREADS]; */
#if CLG_MICROSYSTIME
#include <sys/time.h>
#include <sys/syscall.h>
/* local declaration; not provided through the headers included here */
extern Int VG_(do_syscall) ( UInt, ... );

ULong syscalltime[VG_N_THREADS];
#else
UInt syscalltime[VG_N_THREADS];
#endif
   1640 
   1641 static
   1642 void CLG_(pre_syscalltime)(ThreadId tid, UInt syscallno,
   1643                            UWord* args, UInt nArgs)
   1644 {
   1645   if (CLG_(clo).collect_systime) {
   1646 #if CLG_MICROSYSTIME
   1647     struct vki_timeval tv_now;
   1648     VG_(do_syscall)(__NR_gettimeofday, (UInt)&tv_now, (UInt)NULL);
   1649     syscalltime[tid] = tv_now.tv_sec * 1000000ULL + tv_now.tv_usec;
   1650 #else
   1651     syscalltime[tid] = VG_(read_millisecond_timer)();
   1652 #endif
   1653   }
   1654 }
   1655 
   1656 static
   1657 void CLG_(post_syscalltime)(ThreadId tid, UInt syscallno,
   1658                             UWord* args, UInt nArgs, SysRes res)
   1659 {
   1660   if (CLG_(clo).collect_systime &&
   1661       CLG_(current_state).bbcc) {
   1662       Int o;
   1663 #if CLG_MICROSYSTIME
   1664     struct vki_timeval tv_now;
   1665     ULong diff;
   1666 
   1667     VG_(do_syscall)(__NR_gettimeofday, (UInt)&tv_now, (UInt)NULL);
   1668     diff = (tv_now.tv_sec * 1000000ULL + tv_now.tv_usec) - syscalltime[tid];
   1669 #else
   1670     UInt diff = VG_(read_millisecond_timer)() - syscalltime[tid];
   1671 #endif
   1672 
   1673     /* offset o is for "SysCount", o+1 for "SysTime" */
   1674     o = fullOffset(EG_SYS);
   1675     CLG_ASSERT(o>=0);
   1676     CLG_DEBUG(0,"   Time (Off %d) for Syscall %d: %ull\n", o, syscallno, diff);
   1677 
   1678     CLG_(current_state).cost[o] ++;
   1679     CLG_(current_state).cost[o+1] += diff;
   1680     if (!CLG_(current_state).bbcc->skipped)
   1681       CLG_(init_cost_lz)(CLG_(sets).full,
   1682 			&(CLG_(current_state).bbcc->skipped));
   1683     CLG_(current_state).bbcc->skipped[o] ++;
   1684     CLG_(current_state).bbcc->skipped[o+1] += diff;
   1685   }
   1686 }
   1687 
   1688 static UInt ULong_width(ULong n)
   1689 {
   1690    UInt w = 0;
   1691    while (n > 0) {
   1692       n = n / 10;
   1693       w++;
   1694    }
   1695    if (w == 0) w = 1;
   1696    return w + (w-1)/3;   // add space for commas
   1697 }
   1698 
   1699 static
   1700 void branchsim_printstat(int l1, int l2, int l3)
   1701 {
   1702     static Char buf1[128], buf2[128], buf3[128], fmt[128];
   1703     FullCost total;
   1704     ULong Bc_total_b, Bc_total_mp, Bi_total_b, Bi_total_mp;
   1705     ULong B_total_b, B_total_mp;
   1706 
   1707     total = CLG_(total_cost);
   1708     Bc_total_b  = total[ fullOffset(EG_BC)   ];
   1709     Bc_total_mp = total[ fullOffset(EG_BC)+1 ];
   1710     Bi_total_b  = total[ fullOffset(EG_BI)   ];
   1711     Bi_total_mp = total[ fullOffset(EG_BI)+1 ];
   1712 
   1713     /* Make format string, getting width right for numbers */
   1714     VG_(sprintf)(fmt, "%%s %%,%dllu  (%%,%dllu cond + %%,%dllu ind)\n",
   1715                  l1, l2, l3);
   1716 
   1717     if (0 == Bc_total_b)  Bc_total_b = 1;
   1718     if (0 == Bi_total_b)  Bi_total_b = 1;
   1719     B_total_b  = Bc_total_b  + Bi_total_b;
   1720     B_total_mp = Bc_total_mp + Bi_total_mp;
   1721 
   1722     VG_(umsg)("\n");
   1723     VG_(umsg)(fmt, "Branches:     ",
   1724               B_total_b, Bc_total_b, Bi_total_b);
   1725 
   1726     VG_(umsg)(fmt, "Mispredicts:  ",
   1727               B_total_mp, Bc_total_mp, Bi_total_mp);
   1728 
   1729     VG_(percentify)(B_total_mp,  B_total_b,  1, l1+1, buf1);
   1730     VG_(percentify)(Bc_total_mp, Bc_total_b, 1, l2+1, buf2);
   1731     VG_(percentify)(Bi_total_mp, Bi_total_b, 1, l3+1, buf3);
   1732 
   1733     VG_(umsg)("Mispred rate:  %s (%s     + %s   )\n", buf1, buf2,buf3);
   1734 }
   1735 
   1736 
   1737 static
   1738 void finish(void)
   1739 {
   1740   Char buf[32+COSTS_LEN], fmt[128];
   1741   Int l1, l2, l3;
   1742   FullCost total;
   1743 
   1744   CLG_DEBUG(0, "finish()\n");
   1745 
   1746   (*CLG_(cachesim).finish)();
   1747 
   1748   /* pop all remaining items from CallStack for correct sum
   1749    */
   1750   CLG_(forall_threads)(unwind_thread);
   1751 
   1752   CLG_(dump_profile)(0, False);
   1753 
   1754   if (VG_(clo_verbosity) == 0) return;
   1755 
   1756   /* Hash table stats */
   1757   if (VG_(clo_stats)) {
   1758     int BB_lookups =
   1759       CLG_(stat).full_debug_BBs +
   1760       CLG_(stat).fn_name_debug_BBs +
   1761       CLG_(stat).file_line_debug_BBs +
   1762       CLG_(stat).no_debug_BBs;
   1763 
   1764     VG_(message)(Vg_DebugMsg, "\n");
   1765     VG_(message)(Vg_DebugMsg, "Distinct objects: %d\n",
   1766 		 CLG_(stat).distinct_objs);
   1767     VG_(message)(Vg_DebugMsg, "Distinct files:   %d\n",
   1768 		 CLG_(stat).distinct_files);
   1769     VG_(message)(Vg_DebugMsg, "Distinct fns:     %d\n",
   1770 		 CLG_(stat).distinct_fns);
   1771     VG_(message)(Vg_DebugMsg, "Distinct contexts:%d\n",
   1772 		 CLG_(stat).distinct_contexts);
   1773     VG_(message)(Vg_DebugMsg, "Distinct BBs:     %d\n",
   1774 		 CLG_(stat).distinct_bbs);
   1775     VG_(message)(Vg_DebugMsg, "Cost entries:     %d (Chunks %d)\n",
   1776 		 CLG_(costarray_entries), CLG_(costarray_chunks));
   1777     VG_(message)(Vg_DebugMsg, "Distinct BBCCs:   %d\n",
   1778 		 CLG_(stat).distinct_bbccs);
   1779     VG_(message)(Vg_DebugMsg, "Distinct JCCs:    %d\n",
   1780 		 CLG_(stat).distinct_jccs);
   1781     VG_(message)(Vg_DebugMsg, "Distinct skips:   %d\n",
   1782 		 CLG_(stat).distinct_skips);
   1783     VG_(message)(Vg_DebugMsg, "BB lookups:       %d\n",
   1784 		 BB_lookups);
   1785     if (BB_lookups>0) {
   1786       VG_(message)(Vg_DebugMsg, "With full      debug info:%3d%% (%d)\n",
   1787 		   CLG_(stat).full_debug_BBs    * 100 / BB_lookups,
   1788 		   CLG_(stat).full_debug_BBs);
   1789       VG_(message)(Vg_DebugMsg, "With file/line debug info:%3d%% (%d)\n",
   1790 		   CLG_(stat).file_line_debug_BBs * 100 / BB_lookups,
   1791 		   CLG_(stat).file_line_debug_BBs);
   1792       VG_(message)(Vg_DebugMsg, "With fn name   debug info:%3d%% (%d)\n",
   1793 		   CLG_(stat).fn_name_debug_BBs * 100 / BB_lookups,
   1794 		   CLG_(stat).fn_name_debug_BBs);
   1795       VG_(message)(Vg_DebugMsg, "With no        debug info:%3d%% (%d)\n",
   1796 		   CLG_(stat).no_debug_BBs      * 100 / BB_lookups,
   1797 		   CLG_(stat).no_debug_BBs);
   1798     }
   1799     VG_(message)(Vg_DebugMsg, "BBCC Clones:       %d\n",
   1800 		 CLG_(stat).bbcc_clones);
   1801     VG_(message)(Vg_DebugMsg, "BBs Retranslated:  %d\n",
   1802 		 CLG_(stat).bb_retranslations);
   1803     VG_(message)(Vg_DebugMsg, "Distinct instrs:   %d\n",
   1804 		 CLG_(stat).distinct_instrs);
   1805     VG_(message)(Vg_DebugMsg, "");
   1806 
   1807     VG_(message)(Vg_DebugMsg, "LRU Contxt Misses: %d\n",
   1808 		 CLG_(stat).cxt_lru_misses);
   1809     VG_(message)(Vg_DebugMsg, "LRU BBCC Misses:   %d\n",
   1810 		 CLG_(stat).bbcc_lru_misses);
   1811     VG_(message)(Vg_DebugMsg, "LRU JCC Misses:    %d\n",
   1812 		 CLG_(stat).jcc_lru_misses);
   1813     VG_(message)(Vg_DebugMsg, "BBs Executed:      %llu\n",
   1814 		 CLG_(stat).bb_executions);
   1815     VG_(message)(Vg_DebugMsg, "Calls:             %llu\n",
   1816 		 CLG_(stat).call_counter);
   1817     VG_(message)(Vg_DebugMsg, "CondJMP followed:  %llu\n",
   1818 		 CLG_(stat).jcnd_counter);
   1819     VG_(message)(Vg_DebugMsg, "Boring JMPs:       %llu\n",
   1820 		 CLG_(stat).jump_counter);
   1821     VG_(message)(Vg_DebugMsg, "Recursive calls:   %llu\n",
   1822 		 CLG_(stat).rec_call_counter);
   1823     VG_(message)(Vg_DebugMsg, "Returns:           %llu\n",
   1824 		 CLG_(stat).ret_counter);
   1825 
   1826     VG_(message)(Vg_DebugMsg, "");
   1827   }
   1828 
   1829   CLG_(sprint_eventmapping)(buf, CLG_(dumpmap));
   1830   VG_(message)(Vg_UserMsg, "Events    : %s\n", buf);
   1831   CLG_(sprint_mappingcost)(buf, CLG_(dumpmap), CLG_(total_cost));
   1832   VG_(message)(Vg_UserMsg, "Collected : %s\n", buf);
   1833   VG_(message)(Vg_UserMsg, "\n");
   1834 
   1835   /* determine value widths for statistics */
   1836   total = CLG_(total_cost);
   1837   l1 = ULong_width( total[fullOffset(EG_IR)] );
   1838   l2 = l3 = 0;
   1839   if (CLG_(clo).simulate_cache) {
   1840       l2 = ULong_width( total[fullOffset(EG_DR)] );
   1841       l3 = ULong_width( total[fullOffset(EG_DW)] );
   1842   }
   1843   if (CLG_(clo).simulate_branch) {
   1844       int l2b = ULong_width( total[fullOffset(EG_BC)] );
   1845       int l3b = ULong_width( total[fullOffset(EG_BI)] );
   1846       if (l2b > l2) l2 = l2b;
   1847       if (l3b > l3) l3 = l3b;
   1848   }
   1849 
   1850   /* Make format string, getting width right for numbers */
   1851   VG_(sprintf)(fmt, "%%s %%,%dllu\n", l1);
   1852 
   1853   /* Always print this */
   1854   VG_(umsg)(fmt, "I   refs:     ", total[fullOffset(EG_IR)] );
   1855 
   1856   if (CLG_(clo).simulate_cache)
   1857       (*CLG_(cachesim).printstat)(l1, l2, l3);
   1858 
   1859   if (CLG_(clo).simulate_branch)
   1860       branchsim_printstat(l1, l2, l3);
   1861 
   1862 }
   1863 
   1864 
   1865 void CLG_(fini)(Int exitcode)
   1866 {
   1867   finish();
   1868 }
   1869 
   1870 
   1871 /*--------------------------------------------------------------------*/
   1872 /*--- Setup                                                        ---*/
   1873 /*--------------------------------------------------------------------*/
   1874 
   1875 static void clg_start_client_code_callback ( ThreadId tid, ULong blocks_done )
   1876 {
   1877    static ULong last_blocks_done = 0;
   1878 
   1879    if (0)
   1880       VG_(printf)("%d R %llu\n", (Int)tid, blocks_done);
   1881 
   1882    /* throttle calls to CLG_(run_thread) by number of BBs executed */
   1883    if (blocks_done - last_blocks_done < 5000) return;
   1884    last_blocks_done = blocks_done;
   1885 
   1886    CLG_(run_thread)( tid );
   1887 }
   1888 
   1889 static
   1890 void CLG_(post_clo_init)(void)
   1891 {
   1892    VG_(clo_vex_control).iropt_unroll_thresh = 0;
   1893    VG_(clo_vex_control).guest_chase_thresh = 0;
   1894 
   1895    CLG_DEBUG(1, "  dump threads: %s\n", CLG_(clo).separate_threads ? "Yes":"No");
   1896    CLG_DEBUG(1, "  call sep. : %d\n", CLG_(clo).separate_callers);
   1897    CLG_DEBUG(1, "  rec. sep. : %d\n", CLG_(clo).separate_recursions);
   1898 
   1899    if (!CLG_(clo).dump_line && !CLG_(clo).dump_instr && !CLG_(clo).dump_bb) {
   1900        VG_(message)(Vg_UserMsg, "Using source line as position.\n");
   1901        CLG_(clo).dump_line = True;
   1902    }
   1903 
   1904    CLG_(init_dumps)();
   1905 
   1906    (*CLG_(cachesim).post_clo_init)();
   1907 
   1908    CLG_(init_eventsets)();
   1909    CLG_(init_statistics)(& CLG_(stat));
   1910    CLG_(init_cost_lz)( CLG_(sets).full, &CLG_(total_cost) );
   1911 
   1912    /* initialize hash tables */
   1913    CLG_(init_obj_table)();
   1914    CLG_(init_cxt_table)();
   1915    CLG_(init_bb_hash)();
   1916 
   1917    CLG_(init_threads)();
   1918    CLG_(run_thread)(1);
   1919 
   1920    CLG_(instrument_state) = CLG_(clo).instrument_atstart;
   1921 
   1922    if (VG_(clo_verbosity > 0)) {
   1923       VG_(message)(Vg_UserMsg,
   1924                    "For interactive control, run 'callgrind_control -h'.\n");
   1925    }
   1926 }
   1927 
   1928 static
   1929 void CLG_(pre_clo_init)(void)
   1930 {
   1931     VG_(details_name)            ("Callgrind");
   1932     VG_(details_version)         (NULL);
   1933     VG_(details_description)     ("a call-graph generating cache profiler");
   1934     VG_(details_copyright_author)("Copyright (C) 2002-2012, and GNU GPL'd, "
   1935 				  "by Josef Weidendorfer et al.");
   1936     VG_(details_bug_reports_to)  (VG_BUGS_TO);
   1937     VG_(details_avg_translation_sizeB) ( 500 );
   1938 
   1939     VG_(basic_tool_funcs)        (CLG_(post_clo_init),
   1940                                   CLG_(instrument),
   1941                                   CLG_(fini));
   1942 
   1943     VG_(needs_superblock_discards)(clg_discard_superblock_info);
   1944 
   1945 
   1946     VG_(needs_command_line_options)(CLG_(process_cmd_line_option),
   1947 				    CLG_(print_usage),
   1948 				    CLG_(print_debug_usage));
   1949 
   1950     VG_(needs_client_requests)(CLG_(handle_client_request));
   1951     VG_(needs_syscall_wrapper)(CLG_(pre_syscalltime),
   1952 			       CLG_(post_syscalltime));
   1953 
   1954     VG_(track_start_client_code)  ( & clg_start_client_code_callback );
   1955     VG_(track_pre_deliver_signal) ( & CLG_(pre_signal) );
   1956     VG_(track_post_deliver_signal)( & CLG_(post_signal) );
   1957 
   1958     CLG_(set_clo_defaults)();
   1959 }
   1960 
   1961 VG_DETERMINE_INTERFACE_VERSION(CLG_(pre_clo_init))
   1962 
   1963 /*--------------------------------------------------------------------*/
   1964 /*--- end                                                   main.c ---*/
   1965 /*--------------------------------------------------------------------*/
   1966