
/*--------------------------------------------------------------------*/
/*--- Callgrind                                                    ---*/
/*---                                                       main.c ---*/
/*--------------------------------------------------------------------*/

/*
   This file is part of Callgrind, a Valgrind tool for call-graph
   profiling of programs.

   Copyright (C) 2002-2013, Josef Weidendorfer (Josef.Weidendorfer (at) gmx.de)

   This tool is derived from and contains code from Cachegrind
   Copyright (C) 2002-2013 Nicholas Nethercote (njn (at) valgrind.org)

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307, USA.

   The GNU General Public License is contained in the file COPYING.
*/

#include "config.h"
#include "callgrind.h"
#include "global.h"

#include "pub_tool_threadstate.h"
#include "pub_tool_gdbserver.h"

#include "cg_branchpred.c"

/*------------------------------------------------------------*/
/*--- Global variables                                     ---*/
/*------------------------------------------------------------*/

/* for all threads */
CommandLineOptions CLG_(clo);
Statistics CLG_(stat);
Bool CLG_(instrument_state) = True; /* Instrumentation on? */

/* thread and signal handler specific */
exec_state CLG_(current_state);

/* min of L1 and LL cache line sizes.  This only gets set to a
   non-zero value if we are doing cache simulation. */
Int CLG_(min_line_size) = 0;


/*------------------------------------------------------------*/
/*--- Statistics                                           ---*/
/*------------------------------------------------------------*/

static void CLG_(init_statistics)(Statistics* s)
{
  s->call_counter        = 0;
  s->jcnd_counter        = 0;
  s->jump_counter        = 0;
  s->rec_call_counter    = 0;
  s->ret_counter         = 0;
  s->bb_executions       = 0;

  s->context_counter     = 0;
  s->bb_retranslations   = 0;

  s->distinct_objs       = 0;
  s->distinct_files      = 0;
  s->distinct_fns        = 0;
  s->distinct_contexts   = 0;
  s->distinct_bbs        = 0;
  s->distinct_bbccs      = 0;
  s->distinct_instrs     = 0;
  s->distinct_skips      = 0;

  s->bb_hash_resizes     = 0;
  s->bbcc_hash_resizes   = 0;
  s->jcc_hash_resizes    = 0;
  s->cxt_hash_resizes    = 0;
  s->fn_array_resizes    = 0;
  s->call_stack_resizes  = 0;
  s->fn_stack_resizes    = 0;

  s->full_debug_BBs      = 0;
  s->file_line_debug_BBs = 0;
  s->fn_name_debug_BBs   = 0;
  s->no_debug_BBs        = 0;
  s->bbcc_lru_misses     = 0;
  s->jcc_lru_misses      = 0;
  s->cxt_lru_misses      = 0;
  s->bbcc_clones         = 0;
}


/*------------------------------------------------------------*/
/*--- Simple callbacks (not cache simulator)               ---*/
/*------------------------------------------------------------*/

VG_REGPARM(1)
static void log_global_event(InstrInfo* ii)
{
    ULong* cost_Bus;

    CLG_DEBUG(6, "log_global_event:  Ir  %#lx/%u\n",
              CLG_(bb_base) + ii->instr_offset, ii->instr_size);

    if (!CLG_(current_state).collect) return;

    CLG_ASSERT( (ii->eventset->mask & (1u<<EG_BUS))>0 );

    CLG_(current_state).cost[ fullOffset(EG_BUS) ]++;

    if (CLG_(current_state).nonskipped)
        cost_Bus = CLG_(current_state).nonskipped->skipped + fullOffset(EG_BUS);
    else
        cost_Bus = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_BUS];
    cost_Bus[0]++;
}


/* For branches, we consult two different predictors, one which
   predicts taken/untaken for conditional branches, and the other
   which predicts the branch target address for indirect branches
   (jump-to-register style ones). */

static VG_REGPARM(2)
void log_cond_branch(InstrInfo* ii, Word taken)
{
    Bool miss;
    Int fullOffset_Bc;
    ULong* cost_Bc;

    CLG_DEBUG(6, "log_cond_branch:  Ir %#lx, taken %lu\n",
              CLG_(bb_base) + ii->instr_offset, taken);

    miss = 1 & do_cond_branch_predict(CLG_(bb_base) + ii->instr_offset, taken);

    if (!CLG_(current_state).collect) return;

    CLG_ASSERT( (ii->eventset->mask & (1u<<EG_BC))>0 );

    if (CLG_(current_state).nonskipped)
        cost_Bc = CLG_(current_state).nonskipped->skipped + fullOffset(EG_BC);
    else
        cost_Bc = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_BC];

    fullOffset_Bc = fullOffset(EG_BC);
    CLG_(current_state).cost[ fullOffset_Bc ]++;
    cost_Bc[0]++;
    if (miss) {
        CLG_(current_state).cost[ fullOffset_Bc+1 ]++;
        cost_Bc[1]++;
    }
}

static VG_REGPARM(2)
void log_ind_branch(InstrInfo* ii, UWord actual_dst)
{
    Bool miss;
    Int fullOffset_Bi;
    ULong* cost_Bi;

    CLG_DEBUG(6, "log_ind_branch:  Ir  %#lx, dst %#lx\n",
              CLG_(bb_base) + ii->instr_offset, actual_dst);

    miss = 1 & do_ind_branch_predict(CLG_(bb_base) + ii->instr_offset, actual_dst);

    if (!CLG_(current_state).collect) return;

    CLG_ASSERT( (ii->eventset->mask & (1u<<EG_BI))>0 );

    if (CLG_(current_state).nonskipped)
        cost_Bi = CLG_(current_state).nonskipped->skipped + fullOffset(EG_BI);
    else
        cost_Bi = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_BI];

    fullOffset_Bi = fullOffset(EG_BI);
    CLG_(current_state).cost[ fullOffset_Bi ]++;
    cost_Bi[0]++;
    if (miss) {
        CLG_(current_state).cost[ fullOffset_Bi+1 ]++;
        cost_Bi[1]++;
    }
}
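
/* Note on the cost layout used by the two helpers above: the Bc and Bi
   event groups each hold two counters, "executed" at offset 0 and
   "mispredicted" at offset 1 (hence the cost_B?[1]++ on a miss).  With
   --branch-sim=yes these surface in dumps as Bc/Bcm and Bi/Bim. */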

/*------------------------------------------------------------*/
/*--- Instrumentation structures and event queue handling  ---*/
/*------------------------------------------------------------*/

/* Maintain an ordered list of memory events which are outstanding, in
   the sense that no IR has yet been generated to do the relevant
   helper calls.  The BB is scanned top to bottom and memory events
   are added to the end of the list, merging with the most recently
   notified event where possible (a Dw immediately following a Dr with
   the same size and EA can be merged).

   This merging is done so that for architectures which have
   load-op-store instructions (x86, amd64), the insn is treated as if
   it makes just one memory reference (a modify), rather than two (a
   read followed by a write at the same address).

   At various points the list will need to be flushed, that is, IR
   generated from it.  That must happen before any possible exit from
   the block (the end, or an IRStmt_Exit).  Flushing also takes place
   when there is no space to add a new event.

   If we required the simulation statistics to be up to date with
   respect to possible memory exceptions, then the list would have to
   be flushed before each memory reference.  That would, however, lose
   performance by inhibiting event-merging during flushing.

   Flushing the list consists of walking it start to end and emitting
   instrumentation IR for each event, in the order in which they
   appear.  It may be possible to emit a single call for two adjacent
   events in order to reduce the number of helper function calls made.
   For example, it could well be profitable to handle two adjacent Ir
   events with a single helper call.  */
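
/* Worked example (illustrative): for an x86 load-op-store insn such as
       addl $1, (%ecx)
   the scan first queues Ev_Ir (from the IMark) and Ev_Dr for the load;
   when the store is seen, addEvent_Dw() finds a pending Ev_Dr with the
   same size and EA and rewrites it in place to Ev_Dm, so the flush
   emits a single "modify" helper call instead of a read plus a write. */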

typedef
   IRExpr
   IRAtom;

typedef
   enum {
      Ev_Ir,  // Instruction read
      Ev_Dr,  // Data read
      Ev_Dw,  // Data write
      Ev_Dm,  // Data modify (read then write)
      Ev_Bc,  // branch conditional
      Ev_Bi,  // branch indirect (to unknown destination)
      Ev_G    // Global bus event
   }
   EventTag;

typedef
   struct {
      EventTag   tag;
      InstrInfo* inode;
      union {
         struct {
         } Ir;
         struct {
            IRAtom* ea;
            Int     szB;
         } Dr;
         struct {
            IRAtom* ea;
            Int     szB;
         } Dw;
         struct {
            IRAtom* ea;
            Int     szB;
         } Dm;
         struct {
            IRAtom* taken; /* :: Ity_I1 */
         } Bc;
         struct {
            IRAtom* dst;
         } Bi;
         struct {
         } G;
      } Ev;
   }
   Event;

static void init_Event ( Event* ev ) {
   VG_(memset)(ev, 0, sizeof(Event));
}

static IRAtom* get_Event_dea ( Event* ev ) {
   switch (ev->tag) {
      case Ev_Dr: return ev->Ev.Dr.ea;
      case Ev_Dw: return ev->Ev.Dw.ea;
      case Ev_Dm: return ev->Ev.Dm.ea;
      default:    tl_assert(0);
   }
}

static Int get_Event_dszB ( Event* ev ) {
   switch (ev->tag) {
      case Ev_Dr: return ev->Ev.Dr.szB;
      case Ev_Dw: return ev->Ev.Dw.szB;
      case Ev_Dm: return ev->Ev.Dm.szB;
      default:    tl_assert(0);
   }
}


/* Up to this many unnotified events are allowed.  Number is
   arbitrary.  Larger numbers allow more event merging to occur, but
   potentially induce more spilling due to extending live ranges of
   address temporaries. */
#define N_EVENTS 16


/* A struct which holds all the running state during instrumentation.
   Mostly to avoid passing loads of parameters everywhere. */
typedef struct {
    /* The current outstanding-memory-event list. */
    Event events[N_EVENTS];
    Int   events_used;

    /* The array of InstrInfo's is part of the BB struct. */
    BB* bb;

    /* BB seen before (i.e. re-instrumentation) */
    Bool seen_before;

    /* Number of InstrInfo bins 'used' so far. */
    UInt ii_index;

    // current offset of guest instructions from BB start
    UInt instr_offset;

    /* The output SB being constructed. */
    IRSB* sbOut;
} ClgState;


static void showEvent ( Event* ev )
{
   switch (ev->tag) {
      case Ev_Ir:
         VG_(printf)("Ir (InstrInfo %p) at +%d\n",
                     ev->inode, ev->inode->instr_offset);
         break;
      case Ev_Dr:
         VG_(printf)("Dr (InstrInfo %p) at +%d %d EA=",
                     ev->inode, ev->inode->instr_offset, ev->Ev.Dr.szB);
         ppIRExpr(ev->Ev.Dr.ea);
         VG_(printf)("\n");
         break;
      case Ev_Dw:
         VG_(printf)("Dw (InstrInfo %p) at +%d %d EA=",
                     ev->inode, ev->inode->instr_offset, ev->Ev.Dw.szB);
         ppIRExpr(ev->Ev.Dw.ea);
         VG_(printf)("\n");
         break;
      case Ev_Dm:
         VG_(printf)("Dm (InstrInfo %p) at +%d %d EA=",
                     ev->inode, ev->inode->instr_offset, ev->Ev.Dm.szB);
         ppIRExpr(ev->Ev.Dm.ea);
         VG_(printf)("\n");
         break;
      case Ev_Bc:
         VG_(printf)("Bc %p   GA=", ev->inode);
         ppIRExpr(ev->Ev.Bc.taken);
         VG_(printf)("\n");
         break;
      case Ev_Bi:
         VG_(printf)("Bi %p  DST=", ev->inode);
         ppIRExpr(ev->Ev.Bi.dst);
         VG_(printf)("\n");
         break;
      case Ev_G:
         VG_(printf)("G  %p\n", ev->inode);
         break;
      default:
         tl_assert(0);
         break;
   }
}

/* Generate code for all outstanding memory events, and mark the queue
   empty.  Code is generated into cgs->sbOut, and this activity
   'consumes' slots in cgs->bb. */

static void flushEvents ( ClgState* clgs )
{
   Int        i, regparms, inew;
   const HChar* helperName;
   void*      helperAddr;
   IRExpr**   argv;
   IRExpr*    i_node_expr;
   IRDirty*   di;
   Event*     ev;
   Event*     ev2;
   Event*     ev3;

   if (!clgs->seen_before) {
       // extend event sets as needed
       // available sets: D0 Dr
       for(i=0; i<clgs->events_used; i++) {
           ev  = &clgs->events[i];
           switch(ev->tag) {
           case Ev_Ir:
               // an Ir event always comes first for a guest instruction
               CLG_ASSERT(ev->inode->eventset == 0);
               ev->inode->eventset = CLG_(sets).base;
               break;
           case Ev_Dr:
               // extend event set by Dr counters
               ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
                                                           EG_DR);
               break;
           case Ev_Dw:
           case Ev_Dm:
               // extend event set by Dw counters
               ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
                                                           EG_DW);
               break;
           case Ev_Bc:
               // extend event set by Bc counters
               ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
                                                           EG_BC);
               break;
           case Ev_Bi:
               // extend event set by Bi counters
               ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
                                                           EG_BI);
               break;
           case Ev_G:
               // extend event set by Bus counter
               ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
                                                           EG_BUS);
               break;
           default:
               tl_assert(0);
           }
       }
   }

   for(i = 0; i < clgs->events_used; i = inew) {

      helperName = NULL;
      helperAddr = NULL;
      argv       = NULL;
      regparms   = 0;

      /* generate IR to notify event i and possibly the ones
         immediately following it. */
      tl_assert(i >= 0 && i < clgs->events_used);

      ev  = &clgs->events[i];
      ev2 = ( i < clgs->events_used-1 ? &clgs->events[i+1] : NULL );
      ev3 = ( i < clgs->events_used-2 ? &clgs->events[i+2] : NULL );

      CLG_DEBUGIF(5) {
         VG_(printf)("   flush ");
         showEvent( ev );
      }

      i_node_expr = mkIRExpr_HWord( (HWord)ev->inode );

      /* Decide on the helper fn to call and the args to pass it, and
         advance i appropriately.
         Dm events have the same effect as Dw events. */
      switch (ev->tag) {
         case Ev_Ir:
            /* Merge an Ir with a following Dr. */
            if (ev2 && ev2->tag == Ev_Dr) {
               /* Why is this true?  It's because we're merging an Ir
                  with a following Dr.  The Ir derives from the
                  instruction's IMark and the Dr from data
                  references which follow it.  In short it holds
                  because each insn starts with an IMark, hence an
                  Ev_Ir, and so these Dr must pertain to the
                  immediately preceding Ir.  The same applies to the
                  analogous assertions in the subsequent cases. */
               tl_assert(ev2->inode == ev->inode);
               helperName = CLG_(cachesim).log_1I1Dr_name;
               helperAddr = CLG_(cachesim).log_1I1Dr;
               argv = mkIRExprVec_3( i_node_expr,
                                     get_Event_dea(ev2),
                                     mkIRExpr_HWord( get_Event_dszB(ev2) ) );
               regparms = 3;
               inew = i+2;
            }
            /* Merge an Ir with a following Dw/Dm. */
            else
            if (ev2 && (ev2->tag == Ev_Dw || ev2->tag == Ev_Dm)) {
               tl_assert(ev2->inode == ev->inode);
               helperName = CLG_(cachesim).log_1I1Dw_name;
               helperAddr = CLG_(cachesim).log_1I1Dw;
               argv = mkIRExprVec_3( i_node_expr,
                                     get_Event_dea(ev2),
                                     mkIRExpr_HWord( get_Event_dszB(ev2) ) );
               regparms = 3;
               inew = i+2;
            }
            /* Merge an Ir with two following Irs. */
            else
            if (ev2 && ev3 && ev2->tag == Ev_Ir && ev3->tag == Ev_Ir) {
               helperName = CLG_(cachesim).log_3I0D_name;
               helperAddr = CLG_(cachesim).log_3I0D;
               argv = mkIRExprVec_3( i_node_expr,
                                     mkIRExpr_HWord( (HWord)ev2->inode ),
                                     mkIRExpr_HWord( (HWord)ev3->inode ) );
               regparms = 3;
               inew = i+3;
            }
            /* Merge an Ir with one following Ir. */
            else
            if (ev2 && ev2->tag == Ev_Ir) {
               helperName = CLG_(cachesim).log_2I0D_name;
               helperAddr = CLG_(cachesim).log_2I0D;
               argv = mkIRExprVec_2( i_node_expr,
                                     mkIRExpr_HWord( (HWord)ev2->inode ) );
               regparms = 2;
               inew = i+2;
            }
            /* No merging possible; emit as-is. */
            else {
               helperName = CLG_(cachesim).log_1I0D_name;
               helperAddr = CLG_(cachesim).log_1I0D;
               argv = mkIRExprVec_1( i_node_expr );
               regparms = 1;
               inew = i+1;
            }
            break;
         case Ev_Dr:
            /* Data read or modify */
            helperName = CLG_(cachesim).log_0I1Dr_name;
            helperAddr = CLG_(cachesim).log_0I1Dr;
            argv = mkIRExprVec_3( i_node_expr,
                                  get_Event_dea(ev),
                                  mkIRExpr_HWord( get_Event_dszB(ev) ) );
            regparms = 3;
            inew = i+1;
            break;
         case Ev_Dw:
         case Ev_Dm:
            /* Data write */
            helperName = CLG_(cachesim).log_0I1Dw_name;
            helperAddr = CLG_(cachesim).log_0I1Dw;
            argv = mkIRExprVec_3( i_node_expr,
                                  get_Event_dea(ev),
                                  mkIRExpr_HWord( get_Event_dszB(ev) ) );
            regparms = 3;
            inew = i+1;
            break;
         case Ev_Bc:
            /* Conditional branch */
            helperName = "log_cond_branch";
            helperAddr = &log_cond_branch;
            argv = mkIRExprVec_2( i_node_expr, ev->Ev.Bc.taken );
            regparms = 2;
            inew = i+1;
            break;
         case Ev_Bi:
            /* Branch to an unknown destination */
            helperName = "log_ind_branch";
            helperAddr = &log_ind_branch;
            argv = mkIRExprVec_2( i_node_expr, ev->Ev.Bi.dst );
            regparms = 2;
            inew = i+1;
            break;
         case Ev_G:
            /* Global bus event (CAS, LOCK-prefix, LL-SC, etc.) */
            helperName = "log_global_event";
            helperAddr = &log_global_event;
            argv = mkIRExprVec_1( i_node_expr );
            regparms = 1;
            inew = i+1;
            break;
         default:
            tl_assert(0);
      }

      CLG_DEBUGIF(5) {
          if (inew > i+1) {
              VG_(printf)("   merge ");
              showEvent( ev2 );
          }
          if (inew > i+2) {
              VG_(printf)("   merge ");
              showEvent( ev3 );
          }
          if (helperAddr)
              VG_(printf)("   call  %s (%p)\n",
                          helperName, helperAddr);
      }

      /* The helper can be unset, depending on the simulator used. */
      if (helperAddr == 0) continue;

      /* Add the helper. */
      tl_assert(helperName);
      tl_assert(helperAddr);
      tl_assert(argv);
      di = unsafeIRDirty_0_N( regparms,
                              helperName, VG_(fnptr_to_fnentry)( helperAddr ),
                              argv );
      addStmtToIRSB( clgs->sbOut, IRStmt_Dirty(di) );
   }

   clgs->events_used = 0;
}
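
/* Example flush (illustrative): with cache simulation active, an event
   queue [ Ir(i1), Dr(i1,ea,4), Ir(i2) ] is emitted as two dirty calls

      DIRTY 3:<log_1I1Dr>(i1, ea, 4)   -- Ir and Dr merged, inew = i+2
      DIRTY 1:<log_1I0D>(i2)           -- lone Ir

   where <...> stands for whichever helper the active simulator exports
   through CLG_(cachesim). */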

static void addEvent_Ir ( ClgState* clgs, InstrInfo* inode )
{
   Event* evt;
   tl_assert(clgs->seen_before || (inode->eventset == 0));
   if (!CLG_(clo).simulate_cache) return;

   if (clgs->events_used == N_EVENTS)
      flushEvents(clgs);
   tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
   evt = &clgs->events[clgs->events_used];
   init_Event(evt);
   evt->tag      = Ev_Ir;
   evt->inode    = inode;
   clgs->events_used++;
}

static
void addEvent_Dr ( ClgState* clgs, InstrInfo* inode, Int datasize, IRAtom* ea )
{
   Event* evt;
   tl_assert(isIRAtom(ea));
   tl_assert(datasize >= 1);
   if (!CLG_(clo).simulate_cache) return;
   tl_assert(datasize <= CLG_(min_line_size));

   if (clgs->events_used == N_EVENTS)
      flushEvents(clgs);
   tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
   evt = &clgs->events[clgs->events_used];
   init_Event(evt);
   evt->tag       = Ev_Dr;
   evt->inode     = inode;
   evt->Ev.Dr.szB = datasize;
   evt->Ev.Dr.ea  = ea;
   clgs->events_used++;
}

static
void addEvent_Dw ( ClgState* clgs, InstrInfo* inode, Int datasize, IRAtom* ea )
{
   Event* lastEvt;
   Event* evt;
   tl_assert(isIRAtom(ea));
   tl_assert(datasize >= 1);
   if (!CLG_(clo).simulate_cache) return;
   tl_assert(datasize <= CLG_(min_line_size));

   /* Is it possible to merge this write with the preceding read?
      (Only inspect the last event if the queue is non-empty.) */
   if (clgs->events_used > 0) {
      lastEvt = &clgs->events[clgs->events_used-1];
      if (lastEvt->tag       == Ev_Dr
          && lastEvt->Ev.Dr.szB == datasize
          && lastEvt->inode     == inode
          && eqIRAtom(lastEvt->Ev.Dr.ea, ea))
      {
         lastEvt->tag = Ev_Dm;
         return;
      }
   }

   /* No.  Add as normal. */
   if (clgs->events_used == N_EVENTS)
      flushEvents(clgs);
   tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
   evt = &clgs->events[clgs->events_used];
   init_Event(evt);
   evt->tag       = Ev_Dw;
   evt->inode     = inode;
   evt->Ev.Dw.szB = datasize;
   evt->Ev.Dw.ea  = ea;
   clgs->events_used++;
}

static
void addEvent_D_guarded ( ClgState* clgs, InstrInfo* inode,
                          Int datasize, IRAtom* ea, IRAtom* guard,
                          Bool isWrite )
{
   tl_assert(isIRAtom(ea));
   tl_assert(guard);
   tl_assert(isIRAtom(guard));
   tl_assert(datasize >= 1);
   if (!CLG_(clo).simulate_cache) return;
   tl_assert(datasize <= CLG_(min_line_size));

   /* Adding guarded memory actions and merging them with the existing
      queue is too complex.  Simply flush the queue and add this
      action immediately.  Since guarded loads and stores are pretty
      rare, this is not thought likely to cause any noticeable
      performance loss as a result of the loss of event-merging
      opportunities. */
   tl_assert(clgs->events_used >= 0);
   flushEvents(clgs);
   tl_assert(clgs->events_used == 0);
   /* Same as case Ev_Dw / case Ev_Dr in flushEvents, except with guard */
   IRExpr*      i_node_expr;
   const HChar* helperName;
   void*        helperAddr;
   IRExpr**     argv;
   Int          regparms;
   IRDirty*     di;
   i_node_expr = mkIRExpr_HWord( (HWord)inode );
   helperName  = isWrite ? CLG_(cachesim).log_0I1Dw_name
                         : CLG_(cachesim).log_0I1Dr_name;
   helperAddr  = isWrite ? CLG_(cachesim).log_0I1Dw
                         : CLG_(cachesim).log_0I1Dr;
   argv        = mkIRExprVec_3( i_node_expr,
                                ea, mkIRExpr_HWord( datasize ) );
   regparms    = 3;
   di          = unsafeIRDirty_0_N(
                    regparms,
                    helperName, VG_(fnptr_to_fnentry)( helperAddr ),
                    argv );
   di->guard = guard;
   addStmtToIRSB( clgs->sbOut, IRStmt_Dirty(di) );
}

static
void addEvent_Bc ( ClgState* clgs, InstrInfo* inode, IRAtom* guard )
{
   Event* evt;
   tl_assert(isIRAtom(guard));
   tl_assert(typeOfIRExpr(clgs->sbOut->tyenv, guard)
             == (sizeof(HWord)==4 ? Ity_I32 : Ity_I64));
   if (!CLG_(clo).simulate_branch) return;

   if (clgs->events_used == N_EVENTS)
      flushEvents(clgs);
   tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
   evt = &clgs->events[clgs->events_used];
   init_Event(evt);
   evt->tag         = Ev_Bc;
   evt->inode       = inode;
   evt->Ev.Bc.taken = guard;
   clgs->events_used++;
}

static
void addEvent_Bi ( ClgState* clgs, InstrInfo* inode, IRAtom* whereTo )
{
   Event* evt;
   tl_assert(isIRAtom(whereTo));
   tl_assert(typeOfIRExpr(clgs->sbOut->tyenv, whereTo)
             == (sizeof(HWord)==4 ? Ity_I32 : Ity_I64));
   if (!CLG_(clo).simulate_branch) return;

   if (clgs->events_used == N_EVENTS)
      flushEvents(clgs);
   tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
   evt = &clgs->events[clgs->events_used];
   init_Event(evt);
   evt->tag       = Ev_Bi;
   evt->inode     = inode;
   evt->Ev.Bi.dst = whereTo;
   clgs->events_used++;
}

static
void addEvent_G ( ClgState* clgs, InstrInfo* inode )
{
   Event* evt;
   if (!CLG_(clo).collect_bus) return;

   if (clgs->events_used == N_EVENTS)
      flushEvents(clgs);
   tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
   evt = &clgs->events[clgs->events_used];
   init_Event(evt);
   evt->tag       = Ev_G;
   evt->inode     = inode;
   clgs->events_used++;
}

/* Initialise or check (if already seen before) an InstrInfo for the next
   insn.  We can only set instr_offset/instr_size here.  The required
   event set and the resulting cost offset depend on the events
   (Ir/Dr/Dw/Dm) of the guest instruction.  The event set is extended as
   required when the event queue is flushed (once Dm events have been
   determined); cost offsets are determined at the end of BB
   instrumentation. */
static
InstrInfo* next_InstrInfo ( ClgState* clgs, UInt instr_size )
{
   InstrInfo* ii;
   tl_assert(clgs->ii_index >= 0);
   tl_assert(clgs->ii_index < clgs->bb->instr_count);
   ii = &clgs->bb->instr[ clgs->ii_index ];

   if (clgs->seen_before) {
       CLG_ASSERT(ii->instr_offset == clgs->instr_offset);
       CLG_ASSERT(ii->instr_size == instr_size);
   }
   else {
       ii->instr_offset = clgs->instr_offset;
       ii->instr_size = instr_size;
       ii->cost_offset = 0;
       ii->eventset = 0;
   }

   clgs->ii_index++;
   clgs->instr_offset += instr_size;
   CLG_(stat).distinct_instrs++;

   return ii;
}

// return the total number of cost values needed for this BB
static
UInt update_cost_offsets( ClgState* clgs )
{
    Int i;
    InstrInfo* ii;
    UInt cost_offset = 0;

    CLG_ASSERT(clgs->bb->instr_count == clgs->ii_index);
    for(i=0; i<clgs->ii_index; i++) {
        ii = &clgs->bb->instr[i];
        if (clgs->seen_before) {
            CLG_ASSERT(ii->cost_offset == cost_offset);
        } else
            ii->cost_offset = cost_offset;
        cost_offset += ii->eventset ? ii->eventset->size : 0;
    }

    return cost_offset;
}

/*------------------------------------------------------------*/
/*--- Instrumentation                                      ---*/
/*------------------------------------------------------------*/

#if defined(VG_BIGENDIAN)
# define CLGEndness Iend_BE
#elif defined(VG_LITTLEENDIAN)
# define CLGEndness Iend_LE
#else
# error "Unknown endianness"
#endif

static
Addr IRConst2Addr(IRConst* con)
{
    Addr addr;

    if (sizeof(Addr) == 4) {
        CLG_ASSERT( con->tag == Ico_U32 );
        addr = con->Ico.U32;
    }
    else if (sizeof(Addr) == 8) {
        CLG_ASSERT( con->tag == Ico_U64 );
        addr = con->Ico.U64;
    }
    else
        VG_(tool_panic)("Callgrind: invalid Addr type");

    return addr;
}

/* First pass over a BB to instrument, counting instructions and jumps.
 * This is needed for the size of the BB struct to allocate.
 *
 * Called from CLG_(get_bb)
 */
void CLG_(collectBlockInfo)(IRSB* sbIn,
                            /*INOUT*/ UInt* instrs,
                            /*INOUT*/ UInt* cjmps,
                            /*INOUT*/ Bool* cjmp_inverted)
{
    Int i;
    IRStmt* st;
    Addr instrAddr = 0, jumpDst;
    UInt instrLen = 0;
    Bool toNextInstr = False;

    // Ist_Exit has to be ignored in preamble code, before the first IMark:
    // preamble code is added by VEX for self-modifying code and has
    // nothing to do with client code
    Bool inPreamble = True;

    if (!sbIn) return;

    for (i = 0; i < sbIn->stmts_used; i++) {
          st = sbIn->stmts[i];
          if (Ist_IMark == st->tag) {
              inPreamble = False;

              instrAddr = (Addr)ULong_to_Ptr(st->Ist.IMark.addr);
              instrLen  = st->Ist.IMark.len;

              (*instrs)++;
              toNextInstr = False;
          }
          if (inPreamble) continue;
          if (Ist_Exit == st->tag) {
              jumpDst = IRConst2Addr(st->Ist.Exit.dst);
              toNextInstr = (jumpDst == instrAddr + instrLen);

              (*cjmps)++;
          }
    }

    /* If the last instruction of the BB conditionally jumps to the next
     * instruction (= the first instruction of the next BB in memory),
     * this jump was inverted by VEX.
     */
    *cjmp_inverted = toNextInstr;
}
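
/* Worked example for the inversion heuristic (illustrative):

      0x1000:  cmp  %eax, %ebx
      0x1002:  jne  0x2000
      0x1004:  ...             ; next instruction in memory

   VEX may invert the branch and emit an Ist_Exit guarded on "equal"
   with dst == 0x1004 (== instrAddr + instrLen), followed by an
   unconditional jump to 0x2000.  The loop above detects exactly this
   pattern for the last conditional jump and reports it via
   *cjmp_inverted. */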

static
void addConstMemStoreStmt( IRSB* bbOut, UWord addr, UInt val, IRType hWordTy)
{
    addStmtToIRSB( bbOut,
                   IRStmt_Store(CLGEndness,
                                IRExpr_Const(hWordTy == Ity_I32 ?
                                             IRConst_U32( addr ) :
                                             IRConst_U64( addr )),
                                IRExpr_Const(IRConst_U32(val)) ));
}
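
/* E.g. (illustrative) on a 64-bit host this emits an IR statement along
   the lines of

      STle(0x........:I64) = 0x2:I32

   i.e. a store of a 32-bit constant to the fixed address of a global;
   it is used below to update CLG_(current_state).jmps_passed. */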


/* add helper call to setup_bbcc, with a pointer to the BB struct as argument
 *
 * precondition for setup_bbcc:
 * - jmps_passed has the number of cond. jumps passed in the last executed BB
 * - current_bbcc has a pointer to the BBCC of the last executed BB
 *   Thus, if bbcc_jmpkind is != -1 (JmpNone),
 *     current_bbcc->bb->jmp_addr
 *   gives the address of the jump source.
 *
 * the setup does 2 things:
 * - trace call:
 *   * unwind own call stack, i.e. sync our ESP with the real ESP
 *     This is for ESP manipulation (longjmps, C++ exception handling) and RET
 *   * for CALLs or JMPs crossing objects, record the call and
 *     push an entry onto our own call stack
 *
 * - prepare for cache log functions:
 *   set current_bbcc to the BBCC that gets the costs of this BB execution
 *   attached
 */
static
void addBBSetupCall(ClgState* clgs)
{
   IRDirty* di;
   IRExpr  *arg1, **argv;

   arg1 = mkIRExpr_HWord( (HWord)clgs->bb );
   argv = mkIRExprVec_1(arg1);
   di = unsafeIRDirty_0_N( 1, "setup_bbcc",
                              VG_(fnptr_to_fnentry)( & CLG_(setup_bbcc) ),
                              argv);
   addStmtToIRSB( clgs->sbOut, IRStmt_Dirty(di) );
}


static
IRSB* CLG_(instrument)( VgCallbackClosure* closure,
                        IRSB* sbIn,
                        VexGuestLayout* layout,
                        VexGuestExtents* vge,
                        VexArchInfo* archinfo_host,
                        IRType gWordTy, IRType hWordTy )
{
   Int        i;
   IRStmt*    st;
   Addr       origAddr;
   InstrInfo* curr_inode = NULL;
   ClgState   clgs;
   UInt       cJumps = 0;
   IRTypeEnv* tyenv = sbIn->tyenv;

   if (gWordTy != hWordTy) {
      /* We don't currently support this case. */
      VG_(tool_panic)("host/guest word size mismatch");
   }

   // No instrumentation if it is switched off
   if (! CLG_(instrument_state)) {
       CLG_DEBUG(5, "instrument(BB %#lx) [Instrumentation OFF]\n",
                 (Addr)closure->readdr);
       return sbIn;
   }

   CLG_DEBUG(3, "+ instrument(BB %#lx)\n", (Addr)closure->readdr);

   /* Set up SB for instrumented IR */
   clgs.sbOut = deepCopyIRSBExceptStmts(sbIn);

   // Copy verbatim any IR preamble preceding the first IMark
   i = 0;
   while (i < sbIn->stmts_used && sbIn->stmts[i]->tag != Ist_IMark) {
      addStmtToIRSB( clgs.sbOut, sbIn->stmts[i] );
      i++;
   }

   // Get the first statement, and origAddr from it
   CLG_ASSERT(sbIn->stmts_used > 0);
   CLG_ASSERT(i < sbIn->stmts_used);
   st = sbIn->stmts[i];
   CLG_ASSERT(Ist_IMark == st->tag);

   origAddr = (Addr)st->Ist.IMark.addr + (Addr)st->Ist.IMark.delta;
   CLG_ASSERT(origAddr == st->Ist.IMark.addr
                          + st->Ist.IMark.delta);  // XXX: check no overflow

   /* Get BB struct (creating if necessary).
    * JS: The hash table is keyed with orig_addr_noredir -- important!
    * JW: Why? If it is because of different chasing of the redirection,
    *     this is not needed, as chasing is switched off in callgrind
    */
   clgs.bb = CLG_(get_bb)(origAddr, sbIn, &(clgs.seen_before));

   addBBSetupCall(&clgs);

   // Set up running state
   clgs.events_used = 0;
   clgs.ii_index = 0;
   clgs.instr_offset = 0;

   for (/*use current i*/; i < sbIn->stmts_used; i++) {

      st = sbIn->stmts[i];
      CLG_ASSERT(isFlatIRStmt(st));

      switch (st->tag) {
         case Ist_NoOp:
         case Ist_AbiHint:
         case Ist_Put:
         case Ist_PutI:
         case Ist_MBE:
            break;

         case Ist_IMark: {
            Addr64 cia   = st->Ist.IMark.addr + st->Ist.IMark.delta;
            Int    isize = st->Ist.IMark.len;
            CLG_ASSERT(clgs.instr_offset == (Addr)cia - origAddr);
            // If VEX fails to decode an instruction, the size will be zero.
            // Pretend otherwise.
            if (isize == 0) isize = VG_MIN_INSTR_SZB;

            // Sanity-check size.
            tl_assert( (VG_MIN_INSTR_SZB <= isize && isize <= VG_MAX_INSTR_SZB)
                     || VG_CLREQ_SZB == isize );

            // Init the inode, record it as the current one.
            // Subsequent Dr/Dw/Dm events from the same instruction will
            // also use it.
            curr_inode = next_InstrInfo (&clgs, isize);

            addEvent_Ir( &clgs, curr_inode );
            break;
         }

         case Ist_WrTmp: {
            IRExpr* data = st->Ist.WrTmp.data;
            if (data->tag == Iex_Load) {
               IRExpr* aexpr = data->Iex.Load.addr;
               // Note also, endianness info is ignored.  I guess
               // that's not interesting.
               addEvent_Dr( &clgs, curr_inode,
                            sizeofIRType(data->Iex.Load.ty), aexpr );
            }
            break;
         }

         case Ist_Store: {
            IRExpr* data  = st->Ist.Store.data;
            IRExpr* aexpr = st->Ist.Store.addr;
            addEvent_Dw( &clgs, curr_inode,
                         sizeofIRType(typeOfIRExpr(sbIn->tyenv, data)), aexpr );
            break;
         }

         case Ist_StoreG: {
            IRStoreG* sg   = st->Ist.StoreG.details;
            IRExpr*   data = sg->data;
            IRExpr*   addr = sg->addr;
            IRType    type = typeOfIRExpr(tyenv, data);
            tl_assert(type != Ity_INVALID);
            addEvent_D_guarded( &clgs, curr_inode,
                                sizeofIRType(type), addr, sg->guard,
                                True/*isWrite*/ );
            break;
         }

         case Ist_LoadG: {
            IRLoadG* lg       = st->Ist.LoadG.details;
            IRType   type     = Ity_INVALID; /* loaded type */
            IRType   typeWide = Ity_INVALID; /* after implicit widening */
            IRExpr*  addr     = lg->addr;
            typeOfIRLoadGOp(lg->cvt, &typeWide, &type);
            tl_assert(type != Ity_INVALID);
            addEvent_D_guarded( &clgs, curr_inode,
                                sizeofIRType(type), addr, lg->guard,
                                False/*!isWrite*/ );
            break;
         }

         case Ist_Dirty: {
            Int      dataSize;
            IRDirty* d = st->Ist.Dirty.details;
            if (d->mFx != Ifx_None) {
               /* This dirty helper accesses memory.  Collect the details. */
               tl_assert(d->mAddr != NULL);
               tl_assert(d->mSize != 0);
               dataSize = d->mSize;
               // Large (eg. 28B, 108B, 512B on x86) data-sized
               // instructions will be done inaccurately, but they're
               // very rare and this avoids errors from hitting more
               // than two cache lines in the simulation.
               if (CLG_(clo).simulate_cache && dataSize > CLG_(min_line_size))
                  dataSize = CLG_(min_line_size);
               if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify)
                  addEvent_Dr( &clgs, curr_inode, dataSize, d->mAddr );
               if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify)
                  addEvent_Dw( &clgs, curr_inode, dataSize, d->mAddr );
            } else {
               tl_assert(d->mAddr == NULL);
               tl_assert(d->mSize == 0);
            }
            break;
         }

         case Ist_CAS: {
            /* We treat it as a read and a write of the location.  I
               think that is the same behaviour as it was before IRCAS
               was introduced, since prior to that point, the VEX
               front ends would translate a lock-prefixed instruction
               into a (normal) read followed by a (normal) write. */
            Int    dataSize;
            IRCAS* cas = st->Ist.CAS.details;
            CLG_ASSERT(cas->addr && isIRAtom(cas->addr));
            CLG_ASSERT(cas->dataLo);
            dataSize = sizeofIRType(typeOfIRExpr(sbIn->tyenv, cas->dataLo));
            if (cas->dataHi != NULL)
               dataSize *= 2; /* since this is a doubleword-cas */
            addEvent_Dr( &clgs, curr_inode, dataSize, cas->addr );
            addEvent_Dw( &clgs, curr_inode, dataSize, cas->addr );
            addEvent_G(  &clgs, curr_inode );
            break;
         }

         case Ist_LLSC: {
            IRType dataTy;
            if (st->Ist.LLSC.storedata == NULL) {
               /* LL */
               dataTy = typeOfIRTemp(sbIn->tyenv, st->Ist.LLSC.result);
               addEvent_Dr( &clgs, curr_inode,
                            sizeofIRType(dataTy), st->Ist.LLSC.addr );
               /* flush events before LL, should help SC to succeed */
               flushEvents( &clgs );
            } else {
               /* SC */
               dataTy = typeOfIRExpr(sbIn->tyenv, st->Ist.LLSC.storedata);
               addEvent_Dw( &clgs, curr_inode,
                            sizeofIRType(dataTy), st->Ist.LLSC.addr );
               /* I don't know whether the global-bus-lock cost should
                  be attributed to the LL or the SC, but it doesn't
                  really matter since they always have to be used in
                  pairs anyway.  Hence put it (quite arbitrarily) on
                  the SC. */
               addEvent_G(  &clgs, curr_inode );
            }
            break;
         }

         case Ist_Exit: {
            Bool guest_exit, inverted;

            /* VEX code generation sometimes inverts conditional branches.
             * As Callgrind counts (conditional) jumps, it has to correct
             * inversions. The heuristic is the following:
             * (1) Callgrind switches off SB chasing and unrolling, and
             *     therefore it assumes that only the last conditional
             *     branch in an SB is a candidate for inversion.
             * (2) Inversion is assumed if the branch jumps to the address of
             *     the next guest instruction in memory.
             * This heuristic is precalculated in CLG_(collectBlockInfo)().
             *
             * Branching behavior is also used for branch prediction. Note that
             * the above heuristic is different from what Cachegrind does.
             * Cachegrind uses (2) for all branches.
             */
            if (cJumps+1 == clgs.bb->cjmp_count)
                inverted = clgs.bb->cjmp_inverted;
            else
                inverted = False;

            // call the branch predictor only if this is a branch in guest code
            guest_exit = (st->Ist.Exit.jk == Ijk_Boring) ||
                         (st->Ist.Exit.jk == Ijk_Call) ||
                         (st->Ist.Exit.jk == Ijk_Ret);

            if (guest_exit) {
                /* Stuff to widen the guard expression to a host word, so
                   we can pass it to the branch predictor simulation
                   functions easily. */
                IRType   tyW    = hWordTy;
                IROp     widen  = tyW==Ity_I32  ? Iop_1Uto32  : Iop_1Uto64;
                IROp     opXOR  = tyW==Ity_I32  ? Iop_Xor32   : Iop_Xor64;
                IRTemp   guard1 = newIRTemp(clgs.sbOut->tyenv, Ity_I1);
                IRTemp   guardW = newIRTemp(clgs.sbOut->tyenv, tyW);
                IRTemp   guard  = newIRTemp(clgs.sbOut->tyenv, tyW);
                IRExpr*  one    = tyW==Ity_I32 ? IRExpr_Const(IRConst_U32(1))
                                               : IRExpr_Const(IRConst_U64(1));

                /* Widen the guard expression. */
                addStmtToIRSB( clgs.sbOut,
                               IRStmt_WrTmp( guard1, st->Ist.Exit.guard ));
                addStmtToIRSB( clgs.sbOut,
                               IRStmt_WrTmp( guardW,
                                             IRExpr_Unop(widen,
                                                         IRExpr_RdTmp(guard1))) );
                /* If the exit is inverted, invert the sense of the guard. */
                addStmtToIRSB(
                        clgs.sbOut,
                        IRStmt_WrTmp(
                                guard,
                                inverted ? IRExpr_Binop(opXOR, IRExpr_RdTmp(guardW), one)
                                    : IRExpr_RdTmp(guardW)
                                    ));
                /* And post the event. */
                addEvent_Bc( &clgs, curr_inode, IRExpr_RdTmp(guard) );
            }

            /* We may never reach the next statement, so we need to flush
               all outstanding transactions now. */
            flushEvents( &clgs );

            CLG_ASSERT(clgs.ii_index > 0);
            if (!clgs.seen_before) {
              ClgJumpKind jk;

              if      (st->Ist.Exit.jk == Ijk_Call) jk = jk_Call;
              else if (st->Ist.Exit.jk == Ijk_Ret)  jk = jk_Return;
              else {
                if (IRConst2Addr(st->Ist.Exit.dst) ==
                    origAddr + curr_inode->instr_offset + curr_inode->instr_size)
                  jk = jk_None;
                else
                  jk = jk_Jump;
              }

              clgs.bb->jmp[cJumps].instr = clgs.ii_index-1;
              clgs.bb->jmp[cJumps].jmpkind = jk;
            }

            /* Update the global variable jmps_passed before the jump.
             * A correction is needed if VEX inverted the last jump condition.
             */
            UInt val = inverted ? cJumps+1 : cJumps;
            addConstMemStoreStmt( clgs.sbOut,
                                  (UWord) &CLG_(current_state).jmps_passed,
                                  val, hWordTy);
            cJumps++;

            break;
         }

         default:
            tl_assert(0);
            break;
      }

      /* Copy the original statement */
      addStmtToIRSB( clgs.sbOut, st );

      CLG_DEBUGIF(5) {
         VG_(printf)("   pass  ");
         ppIRStmt(st);
         VG_(printf)("\n");
      }
   }

   /* Deal with branches to unknown destinations.  Except ignore ones
      which are function returns as we assume the return stack
      predictor never mispredicts. */
   if ((sbIn->jumpkind == Ijk_Boring) || (sbIn->jumpkind == Ijk_Call)) {
      if (0) { ppIRExpr( sbIn->next ); VG_(printf)("\n"); }
      switch (sbIn->next->tag) {
         case Iex_Const:
            break; /* boring - branch to known address */
         case Iex_RdTmp:
            /* looks like an indirect branch (branch to unknown) */
            addEvent_Bi( &clgs, curr_inode, sbIn->next );
            break;
         default:
            /* shouldn't happen - if the incoming IR is properly
               flattened, should only have tmp and const cases to
               consider. */
            tl_assert(0);
      }
   }

   /* At the end of the bb.  Flush outstanding events. */
   flushEvents( &clgs );

   /* Update the global variable jmps_passed at the end of the SB.
    * As CLG_(current_state).jmps_passed is reset to 0 in setup_bbcc,
    * this can be omitted if there is no conditional jump in this SB.
    * A correction is needed if VEX inverted the last jump condition.
    */
   if (cJumps > 0) {
      UInt jmps_passed = cJumps;
      if (clgs.bb->cjmp_inverted) jmps_passed--;
      addConstMemStoreStmt( clgs.sbOut,
                            (UWord) &CLG_(current_state).jmps_passed,
                            jmps_passed, hWordTy);
   }
   CLG_ASSERT(clgs.bb->cjmp_count == cJumps);
   CLG_ASSERT(clgs.bb->instr_count == clgs.ii_index);

   /* Info for final exit from BB */
   {
     ClgJumpKind jk;

     if      (sbIn->jumpkind == Ijk_Call) jk = jk_Call;
     else if (sbIn->jumpkind == Ijk_Ret)  jk = jk_Return;
     else {
       jk = jk_Jump;
       if ((sbIn->next->tag == Iex_Const) &&
           (IRConst2Addr(sbIn->next->Iex.Const.con) ==
            origAddr + clgs.instr_offset))
         jk = jk_None;
     }
     clgs.bb->jmp[cJumps].jmpkind = jk;
     /* Instruction index of the call/ret at BB end
      * (it is wrong for fall-through, but that does not matter) */
     clgs.bb->jmp[cJumps].instr = clgs.ii_index-1;
   }

   /* swap the information of the last exit with the final exit if inverted */
   if (clgs.bb->cjmp_inverted) {
     ClgJumpKind jk;
     UInt instr;

     jk = clgs.bb->jmp[cJumps].jmpkind;
     clgs.bb->jmp[cJumps].jmpkind = clgs.bb->jmp[cJumps-1].jmpkind;
     clgs.bb->jmp[cJumps-1].jmpkind = jk;
     instr = clgs.bb->jmp[cJumps].instr;
     clgs.bb->jmp[cJumps].instr = clgs.bb->jmp[cJumps-1].instr;
     clgs.bb->jmp[cJumps-1].instr = instr;
   }

   if (clgs.seen_before) {
       CLG_ASSERT(clgs.bb->cost_count == update_cost_offsets(&clgs));
       CLG_ASSERT(clgs.bb->instr_len == clgs.instr_offset);
   }
   else {
       clgs.bb->cost_count = update_cost_offsets(&clgs);
       clgs.bb->instr_len = clgs.instr_offset;
   }

   CLG_DEBUG(3, "- instrument(BB %#lx): byteLen %u, CJumps %u, CostLen %u\n",
             origAddr, clgs.bb->instr_len,
             clgs.bb->cjmp_count, clgs.bb->cost_count);
   if (cJumps > 0) {
       CLG_DEBUG(3, "                     [ ");
       for (i=0; i<cJumps; i++)
           CLG_DEBUG(3, "%d ", clgs.bb->jmp[i].instr);
       CLG_DEBUG(3, "], last inverted: %s \n",
                 clgs.bb->cjmp_inverted ? "yes":"no");
   }

   return clgs.sbOut;
}

/*--------------------------------------------------------------------*/
/*--- Discarding BB info                                           ---*/
/*--------------------------------------------------------------------*/

// Called when a translation is removed from the translation cache for
// any reason at all: to free up space, because the guest code was
// unmapped or modified, or for any arbitrary reason.
static
void clg_discard_superblock_info ( Addr64 orig_addr64, VexGuestExtents vge )
{
   Addr orig_addr = (Addr)orig_addr64;

   tl_assert(vge.n_used > 0);

   if (0)
      VG_(printf)( "discard_superblock_info: %p, %p, %llu\n",
                   (void*)(Addr)orig_addr,
                   (void*)(Addr)vge.base[0], (ULong)vge.len[0]);

   // Get BB info, remove from table, free BB info.  Simple!  Note that we
   // use orig_addr, not the first instruction address in vge.
   CLG_(delete_bb)(orig_addr);
}


/*------------------------------------------------------------*/
/*--- CLG_(fini)() and related functions                   ---*/
/*------------------------------------------------------------*/


static void zero_thread_cost(thread_info* t)
{
  Int i;

  for(i = 0; i < CLG_(current_call_stack).sp; i++) {
    if (!CLG_(current_call_stack).entry[i].jcc) continue;

    /* reset call counters to current for active calls */
    CLG_(copy_cost)( CLG_(sets).full,
                    CLG_(current_call_stack).entry[i].enter_cost,
                    CLG_(current_state).cost );
    CLG_(current_call_stack).entry[i].jcc->call_counter = 0;
  }

  CLG_(forall_bbccs)(CLG_(zero_bbcc));

  /* set counter for last dump */
  CLG_(copy_cost)( CLG_(sets).full,
                  t->lastdump_cost, CLG_(current_state).cost );
}

void CLG_(zero_all_cost)(Bool only_current_thread)
{
  if (VG_(clo_verbosity) > 1)
    VG_(message)(Vg_DebugMsg, "  Zeroing costs...\n");

  if (only_current_thread)
    zero_thread_cost(CLG_(get_current_thread)());
  else
    CLG_(forall_threads)(zero_thread_cost);

  if (VG_(clo_verbosity) > 1)
    VG_(message)(Vg_DebugMsg, "  ...done\n");
}

static
void unwind_thread(thread_info* t)
{
  /* unwind signal handlers */
  while(CLG_(current_state).sig != 0)
    CLG_(post_signal)(CLG_(current_tid), CLG_(current_state).sig);

  /* unwind regular call stack */
  while(CLG_(current_call_stack).sp > 0)
    CLG_(pop_call_stack)();

  /* reset context and function stack for context generation */
  CLG_(init_exec_state)( &CLG_(current_state) );
  CLG_(current_fn_stack).top = CLG_(current_fn_stack).bottom;
}

static
void zero_state_cost(thread_info* t)
{
    CLG_(zero_cost)( CLG_(sets).full, CLG_(current_state).cost );
}
   1450 
/* Oops, this can go very wrong...
   FIXME: We should export this function or provide other means to get a handle */
extern void VG_(discard_translations) ( Addr64 start, ULong range, const HChar* who );

void CLG_(set_instrument_state)(const HChar* reason, Bool state)
{
  if (CLG_(instrument_state) == state) {
    CLG_DEBUG(2, "%s: instrumentation already %s\n",
              reason, state ? "ON" : "OFF");
    return;
  }
  CLG_(instrument_state) = state;
  CLG_DEBUG(2, "%s: Switching instrumentation %s ...\n",
            reason, state ? "ON" : "OFF");

  VG_(discard_translations)( (Addr64)0x1000, (ULong) ~0xfffl, "callgrind");

  /* reset internal state: call stacks, simulator */
  CLG_(forall_threads)(unwind_thread);
  CLG_(forall_threads)(zero_state_cost);
  (*CLG_(cachesim).clear)();

  if (VG_(clo_verbosity) > 1)
    VG_(message)(Vg_DebugMsg, "%s: instrumentation switched %s\n",
                 reason, state ? "ON" : "OFF");
}

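/* Illustrative client-side usage (a sketch, not part of this tool's code):
   a profiled program can drive the state switch above through the client
   request macros from callgrind.h, restricting profiling to a region of
   interest.  The helper names below are made up for the example:

     #include <valgrind/callgrind.h>

     void profile_region_of_interest(void)
     {
        CALLGRIND_STOP_INSTRUMENTATION;   // ends up in the "OFF" path above
        expensive_setup();                // hypothetical helper, not measured
        CALLGRIND_START_INSTRUMENTATION;  // ends up in the "ON" path above
        hot_loop();                       // hypothetical helper, measured
     }

   Combined with --instr-atstart=no this keeps startup code out of the dump. */
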
/* helper for dump_state_togdb */
static void dump_state_of_thread_togdb(thread_info* ti)
{
    static HChar buf[512];
    static FullCost sum = 0, tmp = 0;
    Int t, p, i;
    BBCC *from, *to;
    call_entry* ce;

    t = CLG_(current_tid);
    CLG_(init_cost_lz)( CLG_(sets).full, &sum );
    CLG_(copy_cost_lz)( CLG_(sets).full, &tmp, ti->lastdump_cost );
    CLG_(add_diff_cost)( CLG_(sets).full, sum, ti->lastdump_cost,
                         ti->states.entry[0]->cost);
    CLG_(copy_cost)( CLG_(sets).full, ti->lastdump_cost, tmp );
    CLG_(sprint_mappingcost)(buf, CLG_(dumpmap), sum);
    VG_(gdb_printf)("events-%d: %s\n", t, buf);
    VG_(gdb_printf)("frames-%d: %d\n", t, CLG_(current_call_stack).sp);

    ce = 0;
    for(i = 0; i < CLG_(current_call_stack).sp; i++) {
      ce = CLG_(get_call_entry)(i);
      /* if this frame is skipped, we don't have counters */
      if (!ce->jcc) continue;

      from = ce->jcc->from;
      VG_(gdb_printf)("function-%d-%d: %s\n", t, i, from->cxt->fn[0]->name);
      VG_(gdb_printf)("calls-%d-%d: %llu\n", t, i, ce->jcc->call_counter);

      /* FIXME: EventSets! */
      CLG_(copy_cost)( CLG_(sets).full, sum, ce->jcc->cost );
      CLG_(copy_cost)( CLG_(sets).full, tmp, ce->enter_cost );
      CLG_(add_diff_cost)( CLG_(sets).full, sum,
                           ce->enter_cost, CLG_(current_state).cost );
      CLG_(copy_cost)( CLG_(sets).full, ce->enter_cost, tmp );

      p = VG_(sprintf)(buf, "events-%d-%d: ", t, i);
      CLG_(sprint_mappingcost)(buf + p, CLG_(dumpmap), sum );
      VG_(gdb_printf)("%s\n", buf);
    }
    if (ce && ce->jcc) {
      to = ce->jcc->to;
      VG_(gdb_printf)("function-%d-%d: %s\n", t, i, to->cxt->fn[0]->name );
    }
}

/* Dump current state */
static void dump_state_togdb(void)
{
    static HChar buf[512];
    thread_info** th;
    int t, p;
    Int orig_tid = CLG_(current_tid);

    VG_(gdb_printf)("instrumentation: %s\n",
                    CLG_(instrument_state) ? "on":"off");
    if (!CLG_(instrument_state)) return;

    VG_(gdb_printf)("executed-bbs: %llu\n", CLG_(stat).bb_executions);
    VG_(gdb_printf)("executed-calls: %llu\n", CLG_(stat).call_counter);
    VG_(gdb_printf)("distinct-bbs: %d\n", CLG_(stat).distinct_bbs);
    VG_(gdb_printf)("distinct-calls: %d\n", CLG_(stat).distinct_jccs);
    VG_(gdb_printf)("distinct-functions: %d\n", CLG_(stat).distinct_fns);
    VG_(gdb_printf)("distinct-contexts: %d\n", CLG_(stat).distinct_contexts);

    /* "events:" line. Given here because it will be dynamic in the future */
    p = VG_(sprintf)(buf, "events: ");
    CLG_(sprint_eventmapping)(buf+p, CLG_(dumpmap));
    VG_(gdb_printf)("%s\n", buf);
    /* "part:" line (number of the last dump part; 0 at start) */
    VG_(gdb_printf)("part: %d\n", CLG_(get_dump_counter)());

    /* threads */
    th = CLG_(get_threads)();
    p = VG_(sprintf)(buf, "threads:");
    for(t=1; t<VG_N_THREADS; t++) {
        if (!th[t]) continue;
        p += VG_(sprintf)(buf+p, " %d", t);
    }
    VG_(gdb_printf)("%s\n", buf);
    VG_(gdb_printf)("current-tid: %d\n", orig_tid);
    CLG_(forall_threads)(dump_state_of_thread_togdb);
}

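/* The key/value lines produced above form the internal interface consumed
   by the callgrind_control script.  A sketch of typical invocations (the
   option letters are the ones documented for callgrind_control; treat the
   exact spellings as an assumption here):

     callgrind_control -s        # print status, including the thread list
     callgrind_control -b        # print current backtraces
     callgrind_control -i off    # switch instrumentation off

   These map onto the monitor commands handled further below. */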

static void print_monitor_help ( void )
{
   VG_(gdb_printf) ("\n");
   VG_(gdb_printf) ("callgrind monitor commands:\n");
   VG_(gdb_printf) ("  dump [<dump_hint>]\n");
   VG_(gdb_printf) ("        dump counters\n");
   VG_(gdb_printf) ("  zero\n");
   VG_(gdb_printf) ("        zero counters\n");
   VG_(gdb_printf) ("  status\n");
   VG_(gdb_printf) ("        print status\n");
   VG_(gdb_printf) ("  instrumentation [on|off]\n");
   VG_(gdb_printf) ("        get/set (if on/off given) instrumentation state\n");
   VG_(gdb_printf) ("\n");
}

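/* Sketch of a session driving these commands from GDB through vgdb; the
   command names are exactly the ones listed above, the program name is
   made up:

     $ valgrind --tool=callgrind --vgdb=yes ./myprog
     $ gdb ./myprog
     (gdb) target remote | vgdb
     (gdb) monitor status
     (gdb) monitor instrumentation off
     (gdb) monitor zero
     (gdb) monitor dump reached-checkpoint

   For "dump", any trailing text serves as the dump hint: the whole request
   string is passed on as the trigger reason in the handler below. */
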
/* return True if request recognised, False otherwise */
static Bool handle_gdb_monitor_command (ThreadId tid, const HChar *req)
{
   HChar* wcmd;
   HChar s[VG_(strlen)(req) + 1]; /* copy for strtok_r */
   HChar *ssaveptr;

   VG_(strcpy) (s, req);

   wcmd = VG_(strtok_r) (s, " ", &ssaveptr);
   switch (VG_(keyword_id) ("help dump zero status instrumentation",
                            wcmd, kwd_report_duplicated_matches)) {
   case -2: /* multiple matches */
      return True;
   case -1: /* not found */
      return False;
   case  0: /* help */
      print_monitor_help();
      return True;
   case  1: { /* dump */
      CLG_(dump_profile)(req, False);
      return True;
   }
   case  2: { /* zero */
      CLG_(zero_all_cost)(False);
      return True;
   }

   case 3: { /* status */
     HChar* arg = VG_(strtok_r) (0, " ", &ssaveptr);
     if (arg && (VG_(strcmp)(arg, "internal") == 0)) {
       /* internal interface to callgrind_control */
       dump_state_togdb();
       return True;
     }

     if (!CLG_(instrument_state)) {
       VG_(gdb_printf)("No status available as instrumentation is switched off\n");
     } else {
       // Status information to be improved ...
       thread_info** th = CLG_(get_threads)();
       Int t, tcount = 0;
       for(t=1; t<VG_N_THREADS; t++)
         if (th[t]) tcount++;
       VG_(gdb_printf)("%d thread(s) running.\n", tcount);
     }
     return True;
   }

   case 4: { /* instrumentation */
     HChar* arg = VG_(strtok_r) (0, " ", &ssaveptr);
     if (!arg) {
       VG_(gdb_printf)("instrumentation: %s\n",
                       CLG_(instrument_state) ? "on":"off");
     }
     else
       CLG_(set_instrument_state)("Command", VG_(strcmp)(arg,"off") != 0);
     return True;
   }

   default:
      tl_assert(0);
      return False;
   }
}
   1643 
   1644 static
   1645 Bool CLG_(handle_client_request)(ThreadId tid, UWord *args, UWord *ret)
   1646 {
   1647    if (!VG_IS_TOOL_USERREQ('C','T',args[0])
   1648        && VG_USERREQ__GDB_MONITOR_COMMAND   != args[0])
   1649       return False;
   1650 
   1651    switch(args[0]) {
   1652    case VG_USERREQ__DUMP_STATS:
   1653       CLG_(dump_profile)("Client Request", True);
   1654       *ret = 0;                 /* meaningless */
   1655       break;
   1656 
   1657    case VG_USERREQ__DUMP_STATS_AT:
   1658      {
   1659        HChar buf[512];
   1660        VG_(sprintf)(buf,"Client Request: %s", (HChar*)args[1]);
   1661        CLG_(dump_profile)(buf, True);
   1662        *ret = 0;                 /* meaningless */
   1663      }
   1664      break;
   1665 
   1666    case VG_USERREQ__ZERO_STATS:
   1667      CLG_(zero_all_cost)(True);
   1668       *ret = 0;                 /* meaningless */
   1669       break;
   1670 
   1671    case VG_USERREQ__TOGGLE_COLLECT:
   1672      CLG_(current_state).collect = !CLG_(current_state).collect;
   1673      CLG_DEBUG(2, "Client Request: toggled collection state to %s\n",
   1674 	      CLG_(current_state).collect ? "ON" : "OFF");
   1675      *ret = 0;                 /* meaningless */
   1676      break;
   1677 
   1678    case VG_USERREQ__START_INSTRUMENTATION:
   1679      CLG_(set_instrument_state)("Client Request", True);
   1680      *ret = 0;                 /* meaningless */
   1681      break;
   1682 
   1683    case VG_USERREQ__STOP_INSTRUMENTATION:
   1684      CLG_(set_instrument_state)("Client Request", False);
   1685      *ret = 0;                 /* meaningless */
   1686      break;
   1687 
   1688    case VG_USERREQ__GDB_MONITOR_COMMAND: {
   1689       Bool handled = handle_gdb_monitor_command (tid, (HChar*)args[1]);
   1690       if (handled)
   1691          *ret = 1;
   1692       else
   1693          *ret = 0;
   1694       return handled;
   1695    }
   1696    default:
   1697       return False;
   1698    }
   1699 
   1700    return True;
   1701 }
   1702 
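/* Sketch of how a client triggers the requests handled above, using the
   macros from callgrind.h; the workload function is made up for
   illustration:

     #include <valgrind/callgrind.h>

     void worker(void)
     {
        CALLGRIND_ZERO_STATS;                  // VG_USERREQ__ZERO_STATS
        run_iteration();                       // hypothetical workload
        CALLGRIND_DUMP_STATS_AT("iteration");  // VG_USERREQ__DUMP_STATS_AT
        CALLGRIND_TOGGLE_COLLECT;              // VG_USERREQ__TOGGLE_COLLECT
     }

   Each macro expands to a Valgrind client request carrying the matching
   VG_USERREQ__* code and is a cheap no-op when the program runs natively. */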

/* Syscall Timing */

/* struct timeval syscalltime[VG_N_THREADS]; */
#if CLG_MICROSYSTIME
#include <sys/time.h>
#include <sys/syscall.h>
extern Int VG_(do_syscall) ( UInt, ... );

ULong syscalltime[VG_N_THREADS];
#else
UInt syscalltime[VG_N_THREADS];
#endif

static
void CLG_(pre_syscalltime)(ThreadId tid, UInt syscallno,
                           UWord* args, UInt nArgs)
{
  if (CLG_(clo).collect_systime) {
#if CLG_MICROSYSTIME
    struct vki_timeval tv_now;
    VG_(do_syscall)(__NR_gettimeofday, (UInt)&tv_now, (UInt)NULL);
    syscalltime[tid] = tv_now.tv_sec * 1000000ULL + tv_now.tv_usec;
#else
    syscalltime[tid] = VG_(read_millisecond_timer)();
#endif
  }
}

static
void CLG_(post_syscalltime)(ThreadId tid, UInt syscallno,
                            UWord* args, UInt nArgs, SysRes res)
{
  if (CLG_(clo).collect_systime &&
      CLG_(current_state).bbcc) {
    Int o;
#if CLG_MICROSYSTIME
    struct vki_timeval tv_now;
    ULong diff;

    VG_(do_syscall)(__NR_gettimeofday, (UInt)&tv_now, (UInt)NULL);
    diff = (tv_now.tv_sec * 1000000ULL + tv_now.tv_usec) - syscalltime[tid];
#else
    UInt diff = VG_(read_millisecond_timer)() - syscalltime[tid];
#endif

    /* offset o is for "SysCount", o+1 for "SysTime" */
    o = fullOffset(EG_SYS);
    CLG_ASSERT(o >= 0);
    CLG_DEBUG(0, "   Time (Off %d) for Syscall %u: %llu\n", o, syscallno,
              (ULong)diff);

    CLG_(current_state).cost[o] ++;
    CLG_(current_state).cost[o+1] += diff;
    if (!CLG_(current_state).bbcc->skipped)
      CLG_(init_cost_lz)(CLG_(sets).full,
                         &(CLG_(current_state).bbcc->skipped));
    CLG_(current_state).bbcc->skipped[o] ++;
    CLG_(current_state).bbcc->skipped[o+1] += diff;
  }
}

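/* Syscall timing is only active under --collect-systime=yes; the two extra
   event counters then appear in the dump under the "SysCount" and "SysTime"
   offsets noted above.  Example invocation (sketch, with a made-up program
   name):

     valgrind --tool=callgrind --collect-systime=yes ./myprog
*/
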
static UInt ULong_width(ULong n)
{
   UInt w = 0;
   while (n > 0) {
      n = n / 10;
      w++;
   }
   if (w == 0) w = 1;
   return w + (w-1)/3;   // add space for commas
}
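
/* Worked example: n = 1234567 gives w = 7 digits, plus (7-1)/3 = 2 comma
   positions, so the returned width is 9, matching the rendered string
   "1,234,567". */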

static
void branchsim_printstat(int l1, int l2, int l3)
{
    static HChar buf1[128], buf2[128], buf3[128];
    static HChar fmt[128];
    FullCost total;
    ULong Bc_total_b, Bc_total_mp, Bi_total_b, Bi_total_mp;
    ULong B_total_b, B_total_mp;

    total = CLG_(total_cost);
    Bc_total_b  = total[ fullOffset(EG_BC)   ];
    Bc_total_mp = total[ fullOffset(EG_BC)+1 ];
    Bi_total_b  = total[ fullOffset(EG_BI)   ];
    Bi_total_mp = total[ fullOffset(EG_BI)+1 ];

    /* Make format string, getting width right for numbers */
    VG_(sprintf)(fmt, "%%s %%,%dllu  (%%,%dllu cond + %%,%dllu ind)\n",
                 l1, l2, l3);

    if (0 == Bc_total_b)  Bc_total_b = 1;
    if (0 == Bi_total_b)  Bi_total_b = 1;
    B_total_b  = Bc_total_b  + Bi_total_b;
    B_total_mp = Bc_total_mp + Bi_total_mp;

    VG_(umsg)("\n");
    VG_(umsg)(fmt, "Branches:     ",
              B_total_b, Bc_total_b, Bi_total_b);

    VG_(umsg)(fmt, "Mispredicts:  ",
              B_total_mp, Bc_total_mp, Bi_total_mp);

    VG_(percentify)(B_total_mp,  B_total_b,  1, l1+1, buf1);
    VG_(percentify)(Bc_total_mp, Bc_total_b, 1, l2+1, buf2);
    VG_(percentify)(Bi_total_mp, Bi_total_b, 1, l3+1, buf3);

    VG_(umsg)("Mispred rate:  %s (%s     + %s   )\n", buf1, buf2, buf3);
}

static
void clg_print_stats(void)
{
   int BB_lookups =
     CLG_(stat).full_debug_BBs +
     CLG_(stat).fn_name_debug_BBs +
     CLG_(stat).file_line_debug_BBs +
     CLG_(stat).no_debug_BBs;

   /* Hash table stats */
   VG_(message)(Vg_DebugMsg, "Distinct objects: %d\n",
                CLG_(stat).distinct_objs);
   VG_(message)(Vg_DebugMsg, "Distinct files:   %d\n",
                CLG_(stat).distinct_files);
   VG_(message)(Vg_DebugMsg, "Distinct fns:     %d\n",
                CLG_(stat).distinct_fns);
   VG_(message)(Vg_DebugMsg, "Distinct contexts:%d\n",
                CLG_(stat).distinct_contexts);
   VG_(message)(Vg_DebugMsg, "Distinct BBs:     %d\n",
                CLG_(stat).distinct_bbs);
   VG_(message)(Vg_DebugMsg, "Cost entries:     %d (Chunks %d)\n",
                CLG_(costarray_entries), CLG_(costarray_chunks));
   VG_(message)(Vg_DebugMsg, "Distinct BBCCs:   %d\n",
                CLG_(stat).distinct_bbccs);
   VG_(message)(Vg_DebugMsg, "Distinct JCCs:    %d\n",
                CLG_(stat).distinct_jccs);
   VG_(message)(Vg_DebugMsg, "Distinct skips:   %d\n",
                CLG_(stat).distinct_skips);
   VG_(message)(Vg_DebugMsg, "BB lookups:       %d\n",
                BB_lookups);
   if (BB_lookups>0) {
      VG_(message)(Vg_DebugMsg, "With full      debug info:%3d%% (%d)\n",
                   CLG_(stat).full_debug_BBs    * 100 / BB_lookups,
                   CLG_(stat).full_debug_BBs);
      VG_(message)(Vg_DebugMsg, "With file/line debug info:%3d%% (%d)\n",
                   CLG_(stat).file_line_debug_BBs * 100 / BB_lookups,
                   CLG_(stat).file_line_debug_BBs);
      VG_(message)(Vg_DebugMsg, "With fn name   debug info:%3d%% (%d)\n",
                   CLG_(stat).fn_name_debug_BBs * 100 / BB_lookups,
                   CLG_(stat).fn_name_debug_BBs);
      VG_(message)(Vg_DebugMsg, "With no        debug info:%3d%% (%d)\n",
                   CLG_(stat).no_debug_BBs      * 100 / BB_lookups,
                   CLG_(stat).no_debug_BBs);
   }
   VG_(message)(Vg_DebugMsg, "BBCC Clones:       %d\n",
                CLG_(stat).bbcc_clones);
   VG_(message)(Vg_DebugMsg, "BBs Retranslated:  %d\n",
                CLG_(stat).bb_retranslations);
   VG_(message)(Vg_DebugMsg, "Distinct instrs:   %d\n",
                CLG_(stat).distinct_instrs);
   VG_(message)(Vg_DebugMsg, "\n");

   VG_(message)(Vg_DebugMsg, "LRU Context Misses: %d\n",
                CLG_(stat).cxt_lru_misses);
   VG_(message)(Vg_DebugMsg, "LRU BBCC Misses:    %d\n",
                CLG_(stat).bbcc_lru_misses);
   VG_(message)(Vg_DebugMsg, "LRU JCC Misses:     %d\n",
                CLG_(stat).jcc_lru_misses);
   VG_(message)(Vg_DebugMsg, "BBs Executed:       %llu\n",
                CLG_(stat).bb_executions);
   VG_(message)(Vg_DebugMsg, "Calls:              %llu\n",
                CLG_(stat).call_counter);
   VG_(message)(Vg_DebugMsg, "CondJMP followed:   %llu\n",
                CLG_(stat).jcnd_counter);
   VG_(message)(Vg_DebugMsg, "Boring JMPs:        %llu\n",
                CLG_(stat).jump_counter);
   VG_(message)(Vg_DebugMsg, "Recursive calls:    %llu\n",
                CLG_(stat).rec_call_counter);
   VG_(message)(Vg_DebugMsg, "Returns:            %llu\n",
                CLG_(stat).ret_counter);
}


static
void finish(void)
{
  HChar buf[32+COSTS_LEN];
  HChar fmt[128];
  Int l1, l2, l3;
  FullCost total;

  CLG_DEBUG(0, "finish()\n");

  (*CLG_(cachesim).finish)();

  /* pop all remaining items from CallStack for correct sum
   */
  CLG_(forall_threads)(unwind_thread);

  CLG_(dump_profile)(0, False);

  if (VG_(clo_verbosity) == 0) return;

  if (VG_(clo_stats)) {
    VG_(message)(Vg_DebugMsg, "\n");
    clg_print_stats();
    VG_(message)(Vg_DebugMsg, "\n");
  }

  CLG_(sprint_eventmapping)(buf, CLG_(dumpmap));
  VG_(message)(Vg_UserMsg, "Events    : %s\n", buf);
  CLG_(sprint_mappingcost)(buf, CLG_(dumpmap), CLG_(total_cost));
  VG_(message)(Vg_UserMsg, "Collected : %s\n", buf);
  VG_(message)(Vg_UserMsg, "\n");

  /* determine value widths for statistics */
  total = CLG_(total_cost);
  l1 = ULong_width( total[fullOffset(EG_IR)] );
  l2 = l3 = 0;
  if (CLG_(clo).simulate_cache) {
      l2 = ULong_width( total[fullOffset(EG_DR)] );
      l3 = ULong_width( total[fullOffset(EG_DW)] );
  }
  if (CLG_(clo).simulate_branch) {
      int l2b = ULong_width( total[fullOffset(EG_BC)] );
      int l3b = ULong_width( total[fullOffset(EG_BI)] );
      if (l2b > l2) l2 = l2b;
      if (l3b > l3) l3 = l3b;
  }

  /* Make format string, getting width right for numbers */
  VG_(sprintf)(fmt, "%%s %%,%dllu\n", l1);

  /* Always print this */
  VG_(umsg)(fmt, "I   refs:     ", total[fullOffset(EG_IR)] );

  if (CLG_(clo).simulate_cache)
      (*CLG_(cachesim).printstat)(l1, l2, l3);

  if (CLG_(clo).simulate_branch)
      branchsim_printstat(l1, l2, l3);
}


void CLG_(fini)(Int exitcode)
{
  finish();
}


/*--------------------------------------------------------------------*/
/*--- Setup                                                        ---*/
/*--------------------------------------------------------------------*/

static void clg_start_client_code_callback ( ThreadId tid, ULong blocks_done )
{
   static ULong last_blocks_done = 0;

   if (0)
      VG_(printf)("%d R %llu\n", (Int)tid, blocks_done);

   /* throttle calls to CLG_(run_thread) by number of BBs executed */
   if (blocks_done - last_blocks_done < 5000) return;
   last_blocks_done = blocks_done;

   CLG_(run_thread)( tid );
}

static
void CLG_(post_clo_init)(void)
{
   if (VG_(clo_vex_control).iropt_register_updates
       != VexRegUpdSpAtMemAccess) {
      CLG_DEBUG(1, " Using user specified value for "
                "--vex-iropt-register-updates\n");
   } else {
      CLG_DEBUG(1,
                " Using default --vex-iropt-register-updates="
                "sp-at-mem-access\n");
   }

   if (VG_(clo_vex_control).iropt_unroll_thresh != 0) {
      VG_(message)(Vg_UserMsg,
                   "callgrind only works with --vex-iropt-unroll-thresh=0\n"
                   "=> resetting it back to 0\n");
      VG_(clo_vex_control).iropt_unroll_thresh = 0;   // cannot be overridden.
   }
   if (VG_(clo_vex_control).guest_chase_thresh != 0) {
      VG_(message)(Vg_UserMsg,
                   "callgrind only works with --vex-guest-chase-thresh=0\n"
                   "=> resetting it back to 0\n");
      VG_(clo_vex_control).guest_chase_thresh = 0; // cannot be overridden.
   }

   CLG_DEBUG(1, "  dump threads: %s\n", CLG_(clo).separate_threads ? "Yes":"No");
   CLG_DEBUG(1, "  call sep. : %d\n", CLG_(clo).separate_callers);
   CLG_DEBUG(1, "  rec. sep. : %d\n", CLG_(clo).separate_recursions);

   if (!CLG_(clo).dump_line && !CLG_(clo).dump_instr && !CLG_(clo).dump_bb) {
       VG_(message)(Vg_UserMsg, "Using source line as position.\n");
       CLG_(clo).dump_line = True;
   }

   CLG_(init_dumps)();

   (*CLG_(cachesim).post_clo_init)();

   CLG_(init_eventsets)();
   CLG_(init_statistics)(& CLG_(stat));
   CLG_(init_cost_lz)( CLG_(sets).full, &CLG_(total_cost) );

   /* initialize hash tables */
   CLG_(init_obj_table)();
   CLG_(init_cxt_table)();
   CLG_(init_bb_hash)();

   CLG_(init_threads)();
   CLG_(run_thread)(1);

   CLG_(instrument_state) = CLG_(clo).instrument_atstart;

   if (VG_(clo_verbosity) > 0) {
      VG_(message)(Vg_UserMsg,
                   "For interactive control, run 'callgrind_control%s%s -h'.\n",
                   (VG_(arg_vgdb_prefix) ? " " : ""),
                   (VG_(arg_vgdb_prefix) ? VG_(arg_vgdb_prefix) : ""));
   }
}

static
void CLG_(pre_clo_init)(void)
{
    VG_(details_name)            ("Callgrind");
    VG_(details_version)         (NULL);
    VG_(details_description)     ("a call-graph generating cache profiler");
    VG_(details_copyright_author)("Copyright (C) 2002-2013, and GNU GPL'd, "
                                  "by Josef Weidendorfer et al.");
    VG_(details_bug_reports_to)  (VG_BUGS_TO);
    VG_(details_avg_translation_sizeB) ( 500 );

    VG_(clo_vex_control).iropt_register_updates
       = VexRegUpdSpAtMemAccess; // overridable by the user.
    VG_(clo_vex_control).iropt_unroll_thresh = 0;   // cannot be overridden.
    VG_(clo_vex_control).guest_chase_thresh = 0;    // cannot be overridden.

    VG_(basic_tool_funcs)        (CLG_(post_clo_init),
                                  CLG_(instrument),
                                  CLG_(fini));

    VG_(needs_superblock_discards)(clg_discard_superblock_info);


    VG_(needs_command_line_options)(CLG_(process_cmd_line_option),
                                    CLG_(print_usage),
                                    CLG_(print_debug_usage));

    VG_(needs_client_requests)(CLG_(handle_client_request));
    VG_(needs_print_stats)    (clg_print_stats);
    VG_(needs_syscall_wrapper)(CLG_(pre_syscalltime),
                               CLG_(post_syscalltime));

    VG_(track_start_client_code)  ( & clg_start_client_code_callback );
    VG_(track_pre_deliver_signal) ( & CLG_(pre_signal) );
    VG_(track_post_deliver_signal)( & CLG_(post_signal) );

    CLG_(set_clo_defaults)();
}

VG_DETERMINE_INTERFACE_VERSION(CLG_(pre_clo_init))

/*--------------------------------------------------------------------*/
/*--- end                                                   main.c ---*/
/*--------------------------------------------------------------------*/