
/*--------------------------------------------------------------------*/
/*--- Callgrind                                                    ---*/
/*---                                                       main.c ---*/
/*--------------------------------------------------------------------*/

/*
   This file is part of Callgrind, a Valgrind tool for call-graph
   profiling of programs.

   Copyright (C) 2002-2015, Josef Weidendorfer (Josef.Weidendorfer@gmx.de)

   This tool is derived from and contains code from Cachegrind
   Copyright (C) 2002-2015 Nicholas Nethercote (njn@valgrind.org)

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
   02111-1307, USA.

   The GNU General Public License is contained in the file COPYING.
*/

#include "config.h"
#include "callgrind.h"
#include "global.h"

#include "pub_tool_threadstate.h"
#include "pub_tool_gdbserver.h"
#include "pub_tool_transtab.h"       // VG_(discard_translations_safely)

#include "cg_branchpred.c"

/*------------------------------------------------------------*/
/*--- Global variables                                     ---*/
/*------------------------------------------------------------*/

/* for all threads */
CommandLineOptions CLG_(clo);
Statistics CLG_(stat);
Bool CLG_(instrument_state) = True; /* Instrumentation on? */

/* thread and signal handler specific */
exec_state CLG_(current_state);

/* min of L1 and LL cache line sizes.  This only gets set to a
   non-zero value if we are doing cache simulation. */
Int CLG_(min_line_size) = 0;


/*------------------------------------------------------------*/
/*--- Statistics                                           ---*/
/*------------------------------------------------------------*/

static void CLG_(init_statistics)(Statistics* s)
{
  s->call_counter        = 0;
  s->jcnd_counter        = 0;
  s->jump_counter        = 0;
  s->rec_call_counter    = 0;
  s->ret_counter         = 0;
  s->bb_executions       = 0;

  s->context_counter     = 0;
  s->bb_retranslations   = 0;

  s->distinct_objs       = 0;
  s->distinct_files      = 0;
  s->distinct_fns        = 0;
  s->distinct_contexts   = 0;
  s->distinct_bbs        = 0;
  s->distinct_bbccs      = 0;
  s->distinct_instrs     = 0;
  s->distinct_skips      = 0;

  s->bb_hash_resizes     = 0;
  s->bbcc_hash_resizes   = 0;
  s->jcc_hash_resizes    = 0;
  s->cxt_hash_resizes    = 0;
  s->fn_array_resizes    = 0;
  s->call_stack_resizes  = 0;
  s->fn_stack_resizes    = 0;

  s->full_debug_BBs      = 0;
  s->file_line_debug_BBs = 0;
  s->fn_name_debug_BBs   = 0;
  s->no_debug_BBs        = 0;
  s->bbcc_lru_misses     = 0;
  s->jcc_lru_misses      = 0;
  s->cxt_lru_misses      = 0;
  s->bbcc_clones         = 0;
}


/*------------------------------------------------------------*/
/*--- Simple callbacks (not cache simulator)               ---*/
/*------------------------------------------------------------*/

VG_REGPARM(1)
static void log_global_event(InstrInfo* ii)
{
    ULong* cost_Bus;

    CLG_DEBUG(6, "log_global_event:  Ir  %#lx/%u\n",
              CLG_(bb_base) + ii->instr_offset, ii->instr_size);

    if (!CLG_(current_state).collect) return;

    CLG_ASSERT( (ii->eventset->mask & (1u<<EG_BUS))>0 );

    CLG_(current_state).cost[ fullOffset(EG_BUS) ]++;

    if (CLG_(current_state).nonskipped)
        cost_Bus = CLG_(current_state).nonskipped->skipped + fullOffset(EG_BUS);
    else
        cost_Bus = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_BUS];
    cost_Bus[0]++;
}


/* For branches, we consult two different predictors, one which
   predicts taken/untaken for conditional branches, and the other
   which predicts the branch target address for indirect branches
   (jump-to-register style ones). */

static VG_REGPARM(2)
void log_cond_branch(InstrInfo* ii, Word taken)
{
    Bool miss;
    Int fullOffset_Bc;
    ULong* cost_Bc;

    CLG_DEBUG(6, "log_cond_branch:  Ir %#lx, taken %ld\n",
              CLG_(bb_base) + ii->instr_offset, taken);

    miss = 1 & do_cond_branch_predict(CLG_(bb_base) + ii->instr_offset, taken);

    if (!CLG_(current_state).collect) return;

    CLG_ASSERT( (ii->eventset->mask & (1u<<EG_BC))>0 );

    if (CLG_(current_state).nonskipped)
        cost_Bc = CLG_(current_state).nonskipped->skipped + fullOffset(EG_BC);
    else
        cost_Bc = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_BC];

    fullOffset_Bc = fullOffset(EG_BC);
    CLG_(current_state).cost[ fullOffset_Bc ]++;
    cost_Bc[0]++;
    if (miss) {
        CLG_(current_state).cost[ fullOffset_Bc+1 ]++;
        cost_Bc[1]++;
    }
}

static VG_REGPARM(2)
void log_ind_branch(InstrInfo* ii, UWord actual_dst)
{
    Bool miss;
    Int fullOffset_Bi;
    ULong* cost_Bi;

    CLG_DEBUG(6, "log_ind_branch:  Ir  %#lx, dst %#lx\n",
              CLG_(bb_base) + ii->instr_offset, actual_dst);

    miss = 1 & do_ind_branch_predict(CLG_(bb_base) + ii->instr_offset, actual_dst);

    if (!CLG_(current_state).collect) return;

    CLG_ASSERT( (ii->eventset->mask & (1u<<EG_BI))>0 );

    if (CLG_(current_state).nonskipped)
        cost_Bi = CLG_(current_state).nonskipped->skipped + fullOffset(EG_BI);
    else
        cost_Bi = CLG_(cost_base) + ii->cost_offset + ii->eventset->offset[EG_BI];

    fullOffset_Bi = fullOffset(EG_BI);
    CLG_(current_state).cost[ fullOffset_Bi ]++;
    cost_Bi[0]++;
    if (miss) {
        CLG_(current_state).cost[ fullOffset_Bi+1 ]++;
        cost_Bi[1]++;
    }
}
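
/* A minimal sketch (illustrative only, not part of the build) of the
   two-slot cost layout the two helpers above rely on: slot 0 counts
   executed branches, slot 1 counts mispredicted ones.  The helper name
   is hypothetical. */
#if 0
static void sketch_branch_cost ( ULong cost[2], Bool miss )
{
   cost[0]++;              /* every executed branch              */
   if (miss)
      cost[1]++;           /* only branches the predictor missed */
}
#endif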

/*------------------------------------------------------------*/
/*--- Instrumentation structures and event queue handling  ---*/
/*------------------------------------------------------------*/

/* Maintain an ordered list of memory events which are outstanding, in
   the sense that no IR has yet been generated to do the relevant
   helper calls.  The BB is scanned top to bottom and memory events
   are added to the end of the list, merging with the most recent
   notified event where possible (Dw immediately following Dr and
   having the same size and EA can be merged).

   This merging is done so that for architectures which have
   load-op-store instructions (x86, amd64), the insn is treated as if
   it makes just one memory reference (a modify), rather than two (a
   read followed by a write at the same address).

   At various points the list will need to be flushed, that is, IR
   generated from it.  That must happen before any possible exit from
   the block (the end, or an IRStmt_Exit).  Flushing also takes place
   when there is no space to add a new event.

   If we require the simulation statistics to be up to date with
   respect to possible memory exceptions, then the list would have to
   be flushed before each memory reference.  That would however lose
   performance by inhibiting event-merging during flushing.

   Flushing the list consists of walking it start to end and emitting
   instrumentation IR for each event, in the order in which they
   appear.  It may be possible to emit a single call for two adjacent
   events in order to reduce the number of helper function calls made.
   For example, it could well be profitable to handle two adjacent Ir
   events with a single helper call.  */

typedef
   IRExpr
   IRAtom;

typedef
   enum {
      Ev_Ir,  // Instruction read
      Ev_Dr,  // Data read
      Ev_Dw,  // Data write
      Ev_Dm,  // Data modify (read then write)
      Ev_Bc,  // branch conditional
      Ev_Bi,  // branch indirect (to unknown destination)
      Ev_G    // Global bus event
   }
   EventTag;

typedef
   struct {
      EventTag   tag;
      InstrInfo* inode;
      union {
	 struct {
	 } Ir;
	 struct {
	    IRAtom* ea;
	    Int     szB;
	 } Dr;
	 struct {
	    IRAtom* ea;
	    Int     szB;
	 } Dw;
	 struct {
	    IRAtom* ea;
	    Int     szB;
	 } Dm;
         struct {
            IRAtom* taken; /* :: Ity_I1 */
         } Bc;
         struct {
            IRAtom* dst;
         } Bi;
	 struct {
	 } G;
      } Ev;
   }
   Event;

static void init_Event ( Event* ev ) {
   VG_(memset)(ev, 0, sizeof(Event));
}

static IRAtom* get_Event_dea ( Event* ev ) {
   switch (ev->tag) {
      case Ev_Dr: return ev->Ev.Dr.ea;
      case Ev_Dw: return ev->Ev.Dw.ea;
      case Ev_Dm: return ev->Ev.Dm.ea;
      default:    tl_assert(0);
   }
}

static Int get_Event_dszB ( Event* ev ) {
   switch (ev->tag) {
      case Ev_Dr: return ev->Ev.Dr.szB;
      case Ev_Dw: return ev->Ev.Dw.szB;
      case Ev_Dm: return ev->Ev.Dm.szB;
      default:    tl_assert(0);
   }
}
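
/* Illustrative sketch (hypothetical helper, not part of the build) of
   the Dr+Dw -> Dm merge rule described above: a write immediately
   following a read of the same size and effective address, from the
   same instruction, collapses into a single 'modify' event. */
#if 0
static Bool sketch_merges_to_Dm ( const Event* prev, const InstrInfo* inode,
                                  IRAtom* ea, Int szB )
{
   return prev->tag       == Ev_Dr
       && prev->inode     == inode
       && prev->Ev.Dr.szB == szB
       && eqIRAtom(prev->Ev.Dr.ea, ea);
}
#endif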


/* Up to this many unnotified events are allowed.  Number is
   arbitrary.  Larger numbers allow more event merging to occur, but
   potentially induce more spilling due to extending live ranges of
   address temporaries. */
#define N_EVENTS 16


/* A struct which holds all the running state during instrumentation.
   Mostly to avoid passing loads of parameters everywhere. */
typedef struct {
    /* The current outstanding-memory-event list. */
    Event events[N_EVENTS];
    Int   events_used;

    /* The array of InstrInfo's is part of BB struct. */
    BB* bb;

    /* BB seen before (i.e. re-instrumentation) */
    Bool seen_before;

    /* Number of InstrInfo bins 'used' so far. */
    UInt ii_index;

    // current offset of guest instructions from BB start
    UInt instr_offset;

    /* The output SB being constructed. */
    IRSB* sbOut;
} ClgState;


static void showEvent ( Event* ev )
{
   switch (ev->tag) {
      case Ev_Ir:
	 VG_(printf)("Ir (InstrInfo %p) at +%u\n",
		     ev->inode, ev->inode->instr_offset);
	 break;
      case Ev_Dr:
	 VG_(printf)("Dr (InstrInfo %p) at +%u %d EA=",
		     ev->inode, ev->inode->instr_offset, ev->Ev.Dr.szB);
	 ppIRExpr(ev->Ev.Dr.ea);
	 VG_(printf)("\n");
	 break;
      case Ev_Dw:
	 VG_(printf)("Dw (InstrInfo %p) at +%u %d EA=",
		     ev->inode, ev->inode->instr_offset, ev->Ev.Dw.szB);
	 ppIRExpr(ev->Ev.Dw.ea);
	 VG_(printf)("\n");
	 break;
      case Ev_Dm:
	 VG_(printf)("Dm (InstrInfo %p) at +%u %d EA=",
		     ev->inode, ev->inode->instr_offset, ev->Ev.Dm.szB);
	 ppIRExpr(ev->Ev.Dm.ea);
	 VG_(printf)("\n");
	 break;
      case Ev_Bc:
         VG_(printf)("Bc %p   GA=", ev->inode);
         ppIRExpr(ev->Ev.Bc.taken);
         VG_(printf)("\n");
         break;
      case Ev_Bi:
         VG_(printf)("Bi %p  DST=", ev->inode);
         ppIRExpr(ev->Ev.Bi.dst);
         VG_(printf)("\n");
         break;
      case Ev_G:
         VG_(printf)("G  %p\n", ev->inode);
         break;
      default:
	 tl_assert(0);
	 break;
   }
}

/* Generate code for all outstanding memory events, and mark the queue
   empty.  Code is generated into cgs->sbOut, and this activity
   'consumes' slots in cgs->bb. */

static void flushEvents ( ClgState* clgs )
{
   Int        i, regparms, inew;
   const HChar* helperName;
   void*      helperAddr;
   IRExpr**   argv;
   IRExpr*    i_node_expr;
   IRDirty*   di;
   Event*     ev;
   Event*     ev2;
   Event*     ev3;

   if (!clgs->seen_before) {
       // extend event sets as needed
       // available sets: D0 Dr
       for(i=0; i<clgs->events_used; i++) {
	   ev  = &clgs->events[i];
	   switch(ev->tag) {
	   case Ev_Ir:
	       // an Ir event always comes first for a guest instruction
	       CLG_ASSERT(ev->inode->eventset == 0);
	       ev->inode->eventset = CLG_(sets).base;
	       break;
	   case Ev_Dr:
               // extend event set by Dr counters
	       ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
							   EG_DR);
	       break;
	   case Ev_Dw:
	   case Ev_Dm:
               // extend event set by Dw counters
	       ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
							   EG_DW);
	       break;
           case Ev_Bc:
               // extend event set by Bc counters
               ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
                                                           EG_BC);
               break;
           case Ev_Bi:
               // extend event set by Bi counters
               ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
                                                           EG_BI);
               break;
	   case Ev_G:
               // extend event set by Bus counter
	       ev->inode->eventset = CLG_(add_event_group)(ev->inode->eventset,
							   EG_BUS);
	       break;
	   default:
	       tl_assert(0);
	   }
       }
   }

   for(i = 0; i < clgs->events_used; i = inew) {

      helperName = NULL;
      helperAddr = NULL;
      argv       = NULL;
      regparms   = 0;

      /* generate IR to notify event i and possibly the ones
	 immediately following it. */
      tl_assert(i >= 0 && i < clgs->events_used);

      ev  = &clgs->events[i];
      ev2 = ( i < clgs->events_used-1 ? &clgs->events[i+1] : NULL );
      ev3 = ( i < clgs->events_used-2 ? &clgs->events[i+2] : NULL );

      CLG_DEBUGIF(5) {
	 VG_(printf)("   flush ");
	 showEvent( ev );
      }

      i_node_expr = mkIRExpr_HWord( (HWord)ev->inode );

      /* Decide on the helper fn to call and the args to pass it, and
	 advance i appropriately.
	 Dm events have the same effect as Dw events. */
      switch (ev->tag) {
	 case Ev_Ir:
	    /* Merge an Ir with a following Dr. */
	    if (ev2 && ev2->tag == Ev_Dr) {
	       /* Why is this true?  It's because we're merging an Ir
		  with a following Dr.  The Ir derives from the
		  instruction's IMark and the Dr from data
		  references which follow it.  In short it holds
		  because each insn starts with an IMark, hence an
		  Ev_Ir, and so these Dr must pertain to the
		  immediately preceding Ir.  Same applies to analogous
		  assertions in the subsequent cases. */
	       tl_assert(ev2->inode == ev->inode);
	       helperName = CLG_(cachesim).log_1I1Dr_name;
	       helperAddr = CLG_(cachesim).log_1I1Dr;
	       argv = mkIRExprVec_3( i_node_expr,
				     get_Event_dea(ev2),
				     mkIRExpr_HWord( get_Event_dszB(ev2) ) );
	       regparms = 3;
	       inew = i+2;
	    }
	    /* Merge an Ir with a following Dw/Dm. */
	    else
	    if (ev2 && (ev2->tag == Ev_Dw || ev2->tag == Ev_Dm)) {
	       tl_assert(ev2->inode == ev->inode);
	       helperName = CLG_(cachesim).log_1I1Dw_name;
	       helperAddr = CLG_(cachesim).log_1I1Dw;
	       argv = mkIRExprVec_3( i_node_expr,
				     get_Event_dea(ev2),
				     mkIRExpr_HWord( get_Event_dszB(ev2) ) );
	       regparms = 3;
	       inew = i+2;
	    }
	    /* Merge an Ir with two following Irs. */
	    else
	    if (ev2 && ev3 && ev2->tag == Ev_Ir && ev3->tag == Ev_Ir) {
	       helperName = CLG_(cachesim).log_3I0D_name;
	       helperAddr = CLG_(cachesim).log_3I0D;
	       argv = mkIRExprVec_3( i_node_expr,
				     mkIRExpr_HWord( (HWord)ev2->inode ),
				     mkIRExpr_HWord( (HWord)ev3->inode ) );
	       regparms = 3;
	       inew = i+3;
	    }
	    /* Merge an Ir with one following Ir. */
	    else
	    if (ev2 && ev2->tag == Ev_Ir) {
	       helperName = CLG_(cachesim).log_2I0D_name;
	       helperAddr = CLG_(cachesim).log_2I0D;
	       argv = mkIRExprVec_2( i_node_expr,
				     mkIRExpr_HWord( (HWord)ev2->inode ) );
	       regparms = 2;
	       inew = i+2;
	    }
	    /* No merging possible; emit as-is. */
	    else {
	       helperName = CLG_(cachesim).log_1I0D_name;
	       helperAddr = CLG_(cachesim).log_1I0D;
	       argv = mkIRExprVec_1( i_node_expr );
	       regparms = 1;
	       inew = i+1;
	    }
	    break;
	 case Ev_Dr:
	    /* Data read */
	    helperName = CLG_(cachesim).log_0I1Dr_name;
	    helperAddr = CLG_(cachesim).log_0I1Dr;
	    argv = mkIRExprVec_3( i_node_expr,
				  get_Event_dea(ev),
				  mkIRExpr_HWord( get_Event_dszB(ev) ) );
	    regparms = 3;
	    inew = i+1;
	    break;
	 case Ev_Dw:
	 case Ev_Dm:
	    /* Data write or modify */
	    helperName = CLG_(cachesim).log_0I1Dw_name;
	    helperAddr = CLG_(cachesim).log_0I1Dw;
	    argv = mkIRExprVec_3( i_node_expr,
				  get_Event_dea(ev),
				  mkIRExpr_HWord( get_Event_dszB(ev) ) );
	    regparms = 3;
	    inew = i+1;
	    break;
         case Ev_Bc:
            /* Conditional branch */
            helperName = "log_cond_branch";
            helperAddr = &log_cond_branch;
            argv = mkIRExprVec_2( i_node_expr, ev->Ev.Bc.taken );
            regparms = 2;
            inew = i+1;
            break;
         case Ev_Bi:
            /* Branch to an unknown destination */
            helperName = "log_ind_branch";
            helperAddr = &log_ind_branch;
            argv = mkIRExprVec_2( i_node_expr, ev->Ev.Bi.dst );
            regparms = 2;
            inew = i+1;
            break;
         case Ev_G:
            /* Global bus event (CAS, LOCK-prefix, LL-SC, etc) */
            helperName = "log_global_event";
            helperAddr = &log_global_event;
            argv = mkIRExprVec_1( i_node_expr );
            regparms = 1;
            inew = i+1;
            break;
	 default:
	    tl_assert(0);
      }

      CLG_DEBUGIF(5) {
	  if (inew > i+1) {
	      VG_(printf)("   merge ");
	      showEvent( ev2 );
	  }
	  if (inew > i+2) {
	      VG_(printf)("   merge ");
	      showEvent( ev3 );
	  }
	  if (helperAddr)
	      VG_(printf)("   call  %s (%p)\n",
			  helperName, helperAddr);
      }

      /* helper could be unset depending on the simulator used */
      if (helperAddr == 0) continue;

      /* Add the helper. */
      tl_assert(helperName);
      tl_assert(helperAddr);
      tl_assert(argv);
      di = unsafeIRDirty_0_N( regparms,
			      helperName, VG_(fnptr_to_fnentry)( helperAddr ),
			      argv );
      addStmtToIRSB( clgs->sbOut, IRStmt_Dirty(di) );
   }

   clgs->events_used = 0;
}
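
/* Sketch of the simulator-side signature that the merged Ir+Dr case
   above emits a call to (the real log_1I1Dr implementations live in
   the selected cache simulator; this stub is illustrative only).  The
   three argv entries built in flushEvents map onto these three
   parameters. */
#if 0
static VG_REGPARM(3)
void sketch_log_1I1Dr(InstrInfo* ii, Addr data_ea, Word data_size)
{
   /* account one instruction fetch for ii, plus one data read of
      data_size bytes at data_ea */
}
#endif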

static void addEvent_Ir ( ClgState* clgs, InstrInfo* inode )
{
   Event* evt;
   tl_assert(clgs->seen_before || (inode->eventset == 0));
   if (!CLG_(clo).simulate_cache) return;

   if (clgs->events_used == N_EVENTS)
      flushEvents(clgs);
   tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
   evt = &clgs->events[clgs->events_used];
   init_Event(evt);
   evt->tag      = Ev_Ir;
   evt->inode    = inode;
   clgs->events_used++;
}

static
void addEvent_Dr ( ClgState* clgs, InstrInfo* inode, Int datasize, IRAtom* ea )
{
   Event* evt;
   tl_assert(isIRAtom(ea));
   tl_assert(datasize >= 1);
   if (!CLG_(clo).simulate_cache) return;
   tl_assert(datasize <= CLG_(min_line_size));

   if (clgs->events_used == N_EVENTS)
      flushEvents(clgs);
   tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
   evt = &clgs->events[clgs->events_used];
   init_Event(evt);
   evt->tag       = Ev_Dr;
   evt->inode     = inode;
   evt->Ev.Dr.szB = datasize;
   evt->Ev.Dr.ea  = ea;
   clgs->events_used++;
}

static
void addEvent_Dw ( ClgState* clgs, InstrInfo* inode, Int datasize, IRAtom* ea )
{
   Event* lastEvt;
   Event* evt;
   tl_assert(isIRAtom(ea));
   tl_assert(datasize >= 1);
   if (!CLG_(clo).simulate_cache) return;
   tl_assert(datasize <= CLG_(min_line_size));

   /* Is it possible to merge this write with the preceding read?
      (Check events_used first: forming &events[-1] on an empty queue
      would be undefined behaviour.) */
   if (clgs->events_used > 0) {
      lastEvt = &clgs->events[clgs->events_used-1];
      if (lastEvt->tag       == Ev_Dr
          && lastEvt->Ev.Dr.szB == datasize
          && lastEvt->inode     == inode
          && eqIRAtom(lastEvt->Ev.Dr.ea, ea))
      {
         lastEvt->tag = Ev_Dm;
         return;
      }
   }

   /* No.  Add as normal. */
   if (clgs->events_used == N_EVENTS)
      flushEvents(clgs);
   tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
   evt = &clgs->events[clgs->events_used];
   init_Event(evt);
   evt->tag       = Ev_Dw;
   evt->inode     = inode;
   evt->Ev.Dw.szB = datasize;
   evt->Ev.Dw.ea  = ea;
   clgs->events_used++;
}

static
void addEvent_D_guarded ( ClgState* clgs, InstrInfo* inode,
                          Int datasize, IRAtom* ea, IRAtom* guard,
                          Bool isWrite )
{
   tl_assert(isIRAtom(ea));
   tl_assert(guard);
   tl_assert(isIRAtom(guard));
   tl_assert(datasize >= 1);
   if (!CLG_(clo).simulate_cache) return;
   tl_assert(datasize <= CLG_(min_line_size));

   /* Adding guarded memory actions and merging them with the existing
      queue is too complex.  Simply flush the queue and add this
      action immediately.  Since guarded loads and stores are pretty
      rare, this is not thought likely to cause any noticeable
      performance loss as a result of the loss of event-merging
      opportunities. */
   tl_assert(clgs->events_used >= 0);
   flushEvents(clgs);
   tl_assert(clgs->events_used == 0);
   /* Same as case Ev_Dw / case Ev_Dr in flushEvents, except with guard */
   IRExpr*      i_node_expr;
   const HChar* helperName;
   void*        helperAddr;
   IRExpr**     argv;
   Int          regparms;
   IRDirty*     di;
   i_node_expr = mkIRExpr_HWord( (HWord)inode );
   helperName  = isWrite ? CLG_(cachesim).log_0I1Dw_name
                         : CLG_(cachesim).log_0I1Dr_name;
   helperAddr  = isWrite ? CLG_(cachesim).log_0I1Dw
                         : CLG_(cachesim).log_0I1Dr;
   argv        = mkIRExprVec_3( i_node_expr,
                                ea, mkIRExpr_HWord( datasize ) );
   regparms    = 3;
   di          = unsafeIRDirty_0_N(
                    regparms,
                    helperName, VG_(fnptr_to_fnentry)( helperAddr ),
                    argv );
   di->guard = guard;
   addStmtToIRSB( clgs->sbOut, IRStmt_Dirty(di) );
}
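
/* Note: the di->guard assignment above is what makes the emitted dirty
   call conditional: VEX invokes the helper only when the guard
   evaluates to true at run time, which matches the conditional nature
   of StoreG/LoadG.  An unconditional call would over-count the
   access. */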

static
void addEvent_Bc ( ClgState* clgs, InstrInfo* inode, IRAtom* guard )
{
   Event* evt;
   tl_assert(isIRAtom(guard));
   tl_assert(typeOfIRExpr(clgs->sbOut->tyenv, guard)
             == (sizeof(HWord)==4 ? Ity_I32 : Ity_I64));
   if (!CLG_(clo).simulate_branch) return;

   if (clgs->events_used == N_EVENTS)
      flushEvents(clgs);
   tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
   evt = &clgs->events[clgs->events_used];
   init_Event(evt);
   evt->tag         = Ev_Bc;
   evt->inode       = inode;
   evt->Ev.Bc.taken = guard;
   clgs->events_used++;
}

static
void addEvent_Bi ( ClgState* clgs, InstrInfo* inode, IRAtom* whereTo )
{
   Event* evt;
   tl_assert(isIRAtom(whereTo));
   tl_assert(typeOfIRExpr(clgs->sbOut->tyenv, whereTo)
             == (sizeof(HWord)==4 ? Ity_I32 : Ity_I64));
   if (!CLG_(clo).simulate_branch) return;

   if (clgs->events_used == N_EVENTS)
      flushEvents(clgs);
   tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
   evt = &clgs->events[clgs->events_used];
   init_Event(evt);
   evt->tag       = Ev_Bi;
   evt->inode     = inode;
   evt->Ev.Bi.dst = whereTo;
   clgs->events_used++;
}

static
void addEvent_G ( ClgState* clgs, InstrInfo* inode )
{
   Event* evt;
   if (!CLG_(clo).collect_bus) return;

   if (clgs->events_used == N_EVENTS)
      flushEvents(clgs);
   tl_assert(clgs->events_used >= 0 && clgs->events_used < N_EVENTS);
   evt = &clgs->events[clgs->events_used];
   init_Event(evt);
   evt->tag       = Ev_G;
   evt->inode     = inode;
   clgs->events_used++;
}

/* Initialise or check (if already seen before) an InstrInfo for the next
   insn.  We can only set instr_offset/instr_size here.  The required
   event set and resulting cost offset depend on the events (Ir/Dr/Dw/Dm)
   of the guest instruction.  The event set is extended as required on
   flush of the event queue (once Dm events have been determined); cost
   offsets are determined at the end of BB instrumentation. */
static
InstrInfo* next_InstrInfo ( ClgState* clgs, UInt instr_size )
{
   InstrInfo* ii;
   tl_assert(clgs->ii_index >= 0);
   tl_assert(clgs->ii_index < clgs->bb->instr_count);
   ii = &clgs->bb->instr[ clgs->ii_index ];

   if (clgs->seen_before) {
       CLG_ASSERT(ii->instr_offset == clgs->instr_offset);
       CLG_ASSERT(ii->instr_size == instr_size);
   }
   else {
       ii->instr_offset = clgs->instr_offset;
       ii->instr_size = instr_size;
       ii->cost_offset = 0;
       ii->eventset = 0;
   }

   clgs->ii_index++;
   clgs->instr_offset += instr_size;
   CLG_(stat).distinct_instrs++;

   return ii;
}

// return the total number of cost values needed for this BB
static
UInt update_cost_offsets( ClgState* clgs )
{
    Int i;
    InstrInfo* ii;
    UInt cost_offset = 0;

    CLG_ASSERT(clgs->bb->instr_count == clgs->ii_index);
    for(i=0; i<clgs->ii_index; i++) {
	ii = &clgs->bb->instr[i];
	if (clgs->seen_before) {
	    CLG_ASSERT(ii->cost_offset == cost_offset);
	} else
	    ii->cost_offset = cost_offset;
	cost_offset += ii->eventset ? ii->eventset->size : 0;
    }

    return cost_offset;
}
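
/* Worked sketch (illustrative, not part of the build) of the offset
   scheme update_cost_offsets() implements, on plain arrays: each
   instruction's cost slots start where the previous instruction's
   end, and instructions with an empty event set occupy no slots. */
#if 0
static UInt sketch_cost_layout(const UInt* evset_size, UInt n,
                               UInt* offset)
{
   UInt i, off = 0;
   for (i = 0; i < n; i++) {
      offset[i] = off;         /* first cost slot of instruction i  */
      off += evset_size[i];    /* 0 if instruction i has no events  */
   }
   return off;                 /* total cost slots needed for the BB */
}
#endif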

/*------------------------------------------------------------*/
/*--- Instrumentation                                      ---*/
/*------------------------------------------------------------*/

#if defined(VG_BIGENDIAN)
# define CLGEndness Iend_BE
#elif defined(VG_LITTLEENDIAN)
# define CLGEndness Iend_LE
#else
# error "Unknown endianness"
#endif

static
Addr IRConst2Addr(IRConst* con)
{
    Addr addr;

    if (sizeof(Addr) == 4) {
	CLG_ASSERT( con->tag == Ico_U32 );
	addr = con->Ico.U32;
    }
    else if (sizeof(Addr) == 8) {
	CLG_ASSERT( con->tag == Ico_U64 );
	addr = con->Ico.U64;
    }
    else
	VG_(tool_panic)("Callgrind: invalid Addr type");

    return addr;
}

/* First pass over a BB to instrument, counting instructions and jumps.
 * This is needed to determine the size of the BB struct to allocate.
 *
 * Called from CLG_(get_bb)
 */
void CLG_(collectBlockInfo)(IRSB* sbIn,
			    /*INOUT*/ UInt* instrs,
			    /*INOUT*/ UInt* cjmps,
			    /*INOUT*/ Bool* cjmp_inverted)
{
    Int i;
    IRStmt* st;
    Addr instrAddr = 0, jumpDst;
    UInt instrLen = 0;
    Bool toNextInstr = False;

    // Ist_Exit has to be ignored in preamble code, before the first IMark:
    // preamble code is added by VEX for self-modifying code and has
    // nothing to do with client code
    Bool inPreamble = True;

    if (!sbIn) return;

    for (i = 0; i < sbIn->stmts_used; i++) {
	  st = sbIn->stmts[i];
	  if (Ist_IMark == st->tag) {
	      inPreamble = False;

	      instrAddr = st->Ist.IMark.addr;
	      instrLen  = st->Ist.IMark.len;

	      (*instrs)++;
	      toNextInstr = False;
	  }
	  if (inPreamble) continue;
	  if (Ist_Exit == st->tag) {
	      jumpDst = IRConst2Addr(st->Ist.Exit.dst);
	      toNextInstr = (jumpDst == instrAddr + instrLen);

	      (*cjmps)++;
	  }
    }

    /* If the last instruction of the BB conditionally jumps to the next
     * instruction (= first instruction of the next BB in memory), the
     * condition has been inverted by VEX.
     */
    *cjmp_inverted = toNextInstr;
}
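
/* Worked example (illustrative): a BB whose last guest instruction is
   a conditional branch to the directly following address, e.g.
      cmp ...
      je  <addr of the next insn in memory>
   produces an Ist_Exit whose target equals instrAddr + instrLen, so
   toNextInstr and hence *cjmp_inverted end up True: VEX emits such an
   exit with the branch condition negated. */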

static
void addConstMemStoreStmt( IRSB* bbOut, UWord addr, UInt val, IRType hWordTy)
{
    addStmtToIRSB( bbOut,
		   IRStmt_Store(CLGEndness,
				IRExpr_Const(hWordTy == Ity_I32 ?
					     IRConst_U32( addr ) :
					     IRConst_U64( addr )),
				IRExpr_Const(IRConst_U32(val)) ));
}


/* add a helper call to setup_bbcc, with a pointer to the BB struct as
 * argument
 *
 * precondition for setup_bbcc:
 * - jmps_passed has the number of cond. jumps passed in the last executed BB
 * - current_bbcc has a pointer to the BBCC of the last executed BB
 *   Thus, if bbcc_jmpkind is != -1 (JmpNone),
 *     current_bbcc->bb->jmp_addr
 *   gives the address of the jump source.
 *
 * setup_bbcc does 2 things:
 * - trace call:
 *   * unwind our own call stack, i.e. sync our ESP with the real ESP
 *     This handles ESP manipulation (longjmps, C++ exception handling)
 *     and RET
 *   * for CALLs or JMPs crossing objects, record the call arguments and
 *     push a frame on our own call stack
 *
 * - prepare for the cache log functions:
 *   set current_bbcc to the BBCC that the costs for this BB execution
 *   get attached to
 */
static
void addBBSetupCall(ClgState* clgs)
{
   IRDirty* di;
   IRExpr  *arg1, **argv;

   arg1 = mkIRExpr_HWord( (HWord)clgs->bb );
   argv = mkIRExprVec_1(arg1);
   di = unsafeIRDirty_0_N( 1, "setup_bbcc",
			      VG_(fnptr_to_fnentry)( & CLG_(setup_bbcc) ),
			      argv);
   addStmtToIRSB( clgs->sbOut, IRStmt_Dirty(di) );
}


static
IRSB* CLG_(instrument)( VgCallbackClosure* closure,
                        IRSB* sbIn,
			const VexGuestLayout* layout,
			const VexGuestExtents* vge,
                        const VexArchInfo* archinfo_host,
			IRType gWordTy, IRType hWordTy )
{
   Int        i;
   IRStmt*    st;
   Addr       origAddr;
   InstrInfo* curr_inode = NULL;
   ClgState   clgs;
   UInt       cJumps = 0;
   IRTypeEnv* tyenv = sbIn->tyenv;

   if (gWordTy != hWordTy) {
      /* We don't currently support this case. */
      VG_(tool_panic)("host/guest word size mismatch");
   }

   // No instrumentation if it is switched off
   if (! CLG_(instrument_state)) {
       CLG_DEBUG(5, "instrument(BB %#lx) [Instrumentation OFF]\n",
		 (Addr)closure->readdr);
       return sbIn;
   }

   CLG_DEBUG(3, "+ instrument(BB %#lx)\n", (Addr)closure->readdr);

   /* Set up SB for instrumented IR */
   clgs.sbOut = deepCopyIRSBExceptStmts(sbIn);

   // Copy verbatim any IR preamble preceding the first IMark
   i = 0;
   while (i < sbIn->stmts_used && sbIn->stmts[i]->tag != Ist_IMark) {
      addStmtToIRSB( clgs.sbOut, sbIn->stmts[i] );
      i++;
   }

   // Get the first statement, and origAddr from it
   CLG_ASSERT(sbIn->stmts_used > 0);
   CLG_ASSERT(i < sbIn->stmts_used);
   st = sbIn->stmts[i];
   CLG_ASSERT(Ist_IMark == st->tag);

   origAddr = st->Ist.IMark.addr + st->Ist.IMark.delta;
   CLG_ASSERT(origAddr == st->Ist.IMark.addr
                          + st->Ist.IMark.delta);  // XXX: check no overflow

   /* Get BB struct (creating if necessary).
    * JS: The hash table is keyed with orig_addr_noredir -- important!
    * JW: Why? If it is because of different chasing of the redirection,
    *     this is not needed, as chasing is switched off in callgrind
    */
   clgs.bb = CLG_(get_bb)(origAddr, sbIn, &(clgs.seen_before));

   addBBSetupCall(&clgs);

   // Set up running state
   clgs.events_used = 0;
   clgs.ii_index = 0;
   clgs.instr_offset = 0;

   for (/*use current i*/; i < sbIn->stmts_used; i++) {

      st = sbIn->stmts[i];
      CLG_ASSERT(isFlatIRStmt(st));

      switch (st->tag) {
	 case Ist_NoOp:
	 case Ist_AbiHint:
	 case Ist_Put:
	 case Ist_PutI:
	 case Ist_MBE:
	    break;

	 case Ist_IMark: {
            Addr   cia   = st->Ist.IMark.addr + st->Ist.IMark.delta;
            UInt   isize = st->Ist.IMark.len;
            CLG_ASSERT(clgs.instr_offset == cia - origAddr);
	    // If Vex fails to decode an instruction, the size will be zero.
	    // Pretend otherwise.
	    if (isize == 0) isize = VG_MIN_INSTR_SZB;

	    // Sanity-check size.
	    tl_assert( (VG_MIN_INSTR_SZB <= isize && isize <= VG_MAX_INSTR_SZB)
		     || VG_CLREQ_SZB == isize );

	    // Init the inode, record it as the current one.
	    // Subsequent Dr/Dw/Dm events from the same instruction will
	    // also use it.
	    curr_inode = next_InstrInfo (&clgs, isize);

	    addEvent_Ir( &clgs, curr_inode );
	    break;
	 }

	 case Ist_WrTmp: {
	    IRExpr* data = st->Ist.WrTmp.data;
	    if (data->tag == Iex_Load) {
	       IRExpr* aexpr = data->Iex.Load.addr;
	       // Note also, endianness info is ignored.  I guess
	       // that's not interesting.
	       addEvent_Dr( &clgs, curr_inode,
			    sizeofIRType(data->Iex.Load.ty), aexpr );
	    }
	    break;
	 }

	 case Ist_Store: {
	    IRExpr* data  = st->Ist.Store.data;
	    IRExpr* aexpr = st->Ist.Store.addr;
	    addEvent_Dw( &clgs, curr_inode,
			 sizeofIRType(typeOfIRExpr(sbIn->tyenv, data)), aexpr );
	    break;
	 }

         case Ist_StoreG: {
            IRStoreG* sg   = st->Ist.StoreG.details;
            IRExpr*   data = sg->data;
            IRExpr*   addr = sg->addr;
            IRType    type = typeOfIRExpr(tyenv, data);
            tl_assert(type != Ity_INVALID);
            addEvent_D_guarded( &clgs, curr_inode,
                                sizeofIRType(type), addr, sg->guard,
                                True/*isWrite*/ );
            break;
         }

         case Ist_LoadG: {
            IRLoadG* lg       = st->Ist.LoadG.details;
            IRType   type     = Ity_INVALID; /* loaded type */
            IRType   typeWide = Ity_INVALID; /* after implicit widening */
            IRExpr*  addr     = lg->addr;
            typeOfIRLoadGOp(lg->cvt, &typeWide, &type);
            tl_assert(type != Ity_INVALID);
            addEvent_D_guarded( &clgs, curr_inode,
                                sizeofIRType(type), addr, lg->guard,
                                False/*!isWrite*/ );
            break;
         }

	 case Ist_Dirty: {
	    Int      dataSize;
	    IRDirty* d = st->Ist.Dirty.details;
	    if (d->mFx != Ifx_None) {
	       /* This dirty helper accesses memory.  Collect the details. */
	       tl_assert(d->mAddr != NULL);
	       tl_assert(d->mSize != 0);
	       dataSize = d->mSize;
	       // Large (eg. 28B, 108B, 512B on x86) data-sized
	       // instructions will be done inaccurately, but they're
	       // very rare and this avoids errors from hitting more
	       // than two cache lines in the simulation.
	       if (CLG_(clo).simulate_cache && dataSize > CLG_(min_line_size))
		  dataSize = CLG_(min_line_size);
	       if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify)
		  addEvent_Dr( &clgs, curr_inode, dataSize, d->mAddr );
	       if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify)
		  addEvent_Dw( &clgs, curr_inode, dataSize, d->mAddr );
	    } else {
	       tl_assert(d->mAddr == NULL);
	       tl_assert(d->mSize == 0);
	    }
	    break;
	 }

         case Ist_CAS: {
            /* We treat it as a read and a write of the location.  I
               think that is the same behaviour as it was before IRCAS
               was introduced, since prior to that point, the Vex
               front ends would translate a lock-prefixed instruction
               into a (normal) read followed by a (normal) write. */
            Int    dataSize;
            IRCAS* cas = st->Ist.CAS.details;
            CLG_ASSERT(cas->addr && isIRAtom(cas->addr));
            CLG_ASSERT(cas->dataLo);
            dataSize = sizeofIRType(typeOfIRExpr(sbIn->tyenv, cas->dataLo));
            if (cas->dataHi != NULL)
               dataSize *= 2; /* since this is a doubleword-cas */
            addEvent_Dr( &clgs, curr_inode, dataSize, cas->addr );
            addEvent_Dw( &clgs, curr_inode, dataSize, cas->addr );
            addEvent_G(  &clgs, curr_inode );
            break;
         }

         case Ist_LLSC: {
            IRType dataTy;
            if (st->Ist.LLSC.storedata == NULL) {
               /* LL */
               dataTy = typeOfIRTemp(sbIn->tyenv, st->Ist.LLSC.result);
               addEvent_Dr( &clgs, curr_inode,
                            sizeofIRType(dataTy), st->Ist.LLSC.addr );
               /* flush events before LL, should help SC to succeed */
               flushEvents( &clgs );
            } else {
               /* SC */
               dataTy = typeOfIRExpr(sbIn->tyenv, st->Ist.LLSC.storedata);
               addEvent_Dw( &clgs, curr_inode,
                            sizeofIRType(dataTy), st->Ist.LLSC.addr );
               /* I don't know whether the global-bus-lock cost should
                  be attributed to the LL or the SC, but it doesn't
                  really matter since they always have to be used in
                  pairs anyway.  Hence put it (quite arbitrarily) on
                  the SC. */
               addEvent_G(  &clgs, curr_inode );
            }
            break;
         }

	 case Ist_Exit: {
            Bool guest_exit, inverted;

            /* VEX code generation sometimes inverts conditional branches.
             * As Callgrind counts (conditional) jumps, it has to correct
             * inversions. The heuristic is the following:
             * (1) Callgrind switches off SB chasing and unrolling, and
             *     therefore assumes that only the last conditional branch
             *     in an SB is a candidate for inversion.
             * (2) Inversion is assumed if the branch jumps to the address of
             *     the next guest instruction in memory.
             * This heuristic is precalculated in CLG_(collectBlockInfo)().
             *
             * Branching behavior is also used for branch prediction. Note
             * that the above heuristic is different from what Cachegrind
             * does. Cachegrind uses (2) for all branches.
             */
            if (cJumps+1 == clgs.bb->cjmp_count)
                inverted = clgs.bb->cjmp_inverted;
            else
                inverted = False;

            // call the branch predictor only if this is a branch in guest code
            guest_exit = (st->Ist.Exit.jk == Ijk_Boring) ||
                         (st->Ist.Exit.jk == Ijk_Call) ||
                         (st->Ist.Exit.jk == Ijk_Ret);

            if (guest_exit) {
                /* Stuff to widen the guard expression to a host word, so
                   we can pass it to the branch predictor simulation
                   functions easily. */
                IRType   tyW    = hWordTy;
                IROp     widen  = tyW==Ity_I32  ? Iop_1Uto32  : Iop_1Uto64;
                IROp     opXOR  = tyW==Ity_I32  ? Iop_Xor32   : Iop_Xor64;
                IRTemp   guard1 = newIRTemp(clgs.sbOut->tyenv, Ity_I1);
                IRTemp   guardW = newIRTemp(clgs.sbOut->tyenv, tyW);
                IRTemp   guard  = newIRTemp(clgs.sbOut->tyenv, tyW);
                IRExpr*  one    = tyW==Ity_I32 ? IRExpr_Const(IRConst_U32(1))
                                               : IRExpr_Const(IRConst_U64(1));

                /* Widen the guard expression. */
                addStmtToIRSB( clgs.sbOut,
                               IRStmt_WrTmp( guard1, st->Ist.Exit.guard ));
                addStmtToIRSB( clgs.sbOut,
                               IRStmt_WrTmp( guardW,
                                             IRExpr_Unop(widen,
                                                         IRExpr_RdTmp(guard1))) );
                /* If the exit is inverted, invert the sense of the guard. */
                addStmtToIRSB(
                        clgs.sbOut,
                        IRStmt_WrTmp(
                                guard,
                                inverted ? IRExpr_Binop(opXOR, IRExpr_RdTmp(guardW), one)
                                    : IRExpr_RdTmp(guardW)
                                    ));
                /* And post the event. */
                addEvent_Bc( &clgs, curr_inode, IRExpr_RdTmp(guard) );
            }

	    /* We may never reach the next statement, so need to flush
	       all outstanding transactions now. */
	    flushEvents( &clgs );

	    CLG_ASSERT(clgs.ii_index>0);
	    if (!clgs.seen_before) {
	      ClgJumpKind jk;

	      if      (st->Ist.Exit.jk == Ijk_Call) jk = jk_Call;
	      else if (st->Ist.Exit.jk == Ijk_Ret)  jk = jk_Return;
	      else {
		if (IRConst2Addr(st->Ist.Exit.dst) ==
		    origAddr + curr_inode->instr_offset + curr_inode->instr_size)
		  jk = jk_None;
		else
		  jk = jk_Jump;
	      }

	      clgs.bb->jmp[cJumps].instr = clgs.ii_index-1;
	      clgs.bb->jmp[cJumps].jmpkind = jk;
	    }

	    /* Update the global variable jmps_passed before the jump.
	     * A correction is needed if VEX inverted the last jump condition.
	     */
	    UInt val = inverted ? cJumps+1 : cJumps;
	    addConstMemStoreStmt( clgs.sbOut,
				  (UWord) &CLG_(current_state).jmps_passed,
				  val, hWordTy);
	    cJumps++;

	    break;
	 }

	 default:
	    tl_assert(0);
	    break;
      }

      /* Copy the original statement */
      addStmtToIRSB( clgs.sbOut, st );

      CLG_DEBUGIF(5) {
	 VG_(printf)("   pass  ");
	 ppIRStmt(st);
	 VG_(printf)("\n");
      }
   }

   /* Deal with branches to unknown destinations.  Except ignore ones
      which are function returns as we assume the return stack
      predictor never mispredicts. */
   if ((sbIn->jumpkind == Ijk_Boring) || (sbIn->jumpkind == Ijk_Call)) {
      if (0) { ppIRExpr( sbIn->next ); VG_(printf)("\n"); }
      switch (sbIn->next->tag) {
         case Iex_Const:
            break; /* boring - branch to known address */
         case Iex_RdTmp:
            /* looks like an indirect branch (branch to unknown) */
            addEvent_Bi( &clgs, curr_inode, sbIn->next );
            break;
         default:
            /* shouldn't happen - if the incoming IR is properly
               flattened, should only have tmp and const cases to
               consider. */
            tl_assert(0);
      }
   }

   /* At the end of the bb.  Flush outstandings. */
   flushEvents( &clgs );

   /* Update global variable jmps_passed at end of SB.
    * As CLG_(current_state).jmps_passed is reset to 0 in setup_bbcc,
    * this can be omitted if there is no conditional jump in this SB.
    * A correction is needed if VEX inverted the last jump condition.
    */
   if (cJumps>0) {
      UInt jmps_passed = cJumps;
      if (clgs.bb->cjmp_inverted) jmps_passed--;
      addConstMemStoreStmt( clgs.sbOut,
			    (UWord) &CLG_(current_state).jmps_passed,
			    jmps_passed, hWordTy);
   }
   CLG_ASSERT(clgs.bb->cjmp_count == cJumps);
   CLG_ASSERT(clgs.bb->instr_count == clgs.ii_index);

   /* Info for final exit from BB */
   {
     ClgJumpKind jk;

     if      (sbIn->jumpkind == Ijk_Call) jk = jk_Call;
     else if (sbIn->jumpkind == Ijk_Ret)  jk = jk_Return;
     else {
       jk = jk_Jump;
       if ((sbIn->next->tag == Iex_Const) &&
	   (IRConst2Addr(sbIn->next->Iex.Const.con) ==
	    origAddr + clgs.instr_offset))
	 jk = jk_None;
     }
     clgs.bb->jmp[cJumps].jmpkind = jk;
     /* Instruction index of the call/ret at BB end
      * (it is wrong for fall-through, but does not matter) */
     clgs.bb->jmp[cJumps].instr = clgs.ii_index-1;
   }

   /* swap information of last exit with final exit if inverted */
   if (clgs.bb->cjmp_inverted) {
     ClgJumpKind jk;
     UInt instr;

     jk = clgs.bb->jmp[cJumps].jmpkind;
     clgs.bb->jmp[cJumps].jmpkind = clgs.bb->jmp[cJumps-1].jmpkind;
     clgs.bb->jmp[cJumps-1].jmpkind = jk;
     instr = clgs.bb->jmp[cJumps].instr;
     clgs.bb->jmp[cJumps].instr = clgs.bb->jmp[cJumps-1].instr;
     clgs.bb->jmp[cJumps-1].instr = instr;
   }

   if (clgs.seen_before) {
       CLG_ASSERT(clgs.bb->cost_count == update_cost_offsets(&clgs));
       CLG_ASSERT(clgs.bb->instr_len == clgs.instr_offset);
   }
   else {
       clgs.bb->cost_count = update_cost_offsets(&clgs);
       clgs.bb->instr_len = clgs.instr_offset;
   }

   CLG_DEBUG(3, "- instrument(BB %#lx): byteLen %u, CJumps %u, CostLen %u\n",
	     origAddr, clgs.bb->instr_len,
	     clgs.bb->cjmp_count, clgs.bb->cost_count);
   if (cJumps>0) {
       CLG_DEBUG(3, "                     [ ");
       for (i=0;i<cJumps;i++)
	   CLG_DEBUG(3, "%u ", clgs.bb->jmp[i].instr);
       CLG_DEBUG(3, "], last inverted: %s \n",
		 clgs.bb->cjmp_inverted ? "yes":"no");
   }

   return clgs.sbOut;
}

/*--------------------------------------------------------------------*/
/*--- Discarding BB info                                           ---*/
/*--------------------------------------------------------------------*/

// Called when a translation is removed from the translation cache for
// any reason at all: to free up space, because the guest code was
// unmapped or modified, or for any arbitrary reason.
static
void clg_discard_superblock_info ( Addr orig_addr, VexGuestExtents vge )
{
   tl_assert(vge.n_used > 0);

   if (0)
      VG_(printf)( "discard_superblock_info: %p, %p, %llu\n",
                   (void*)orig_addr,
                   (void*)vge.base[0], (ULong)vge.len[0]);

   // Get BB info, remove from table, free BB info.  Simple!
   // When created, the BB is keyed by the first instruction address
   // (not orig_addr, but the possibly redirected address). Thus, we
   // use the first instruction address in vge.
   CLG_(delete_bb)(vge.base[0]);
}
   1387 
   1388 
   1389 /*------------------------------------------------------------*/
   1390 /*--- CLG_(fini)() and related function                     ---*/
   1391 /*------------------------------------------------------------*/
   1392 
   1393 
   1394 
   1395 static void zero_thread_cost(thread_info* t)
   1396 {
   1397   Int i;
   1398 
   1399   for(i = 0; i < CLG_(current_call_stack).sp; i++) {
   1400     if (!CLG_(current_call_stack).entry[i].jcc) continue;
   1401 
   1402     /* reset call counters to current for active calls */
   1403     CLG_(copy_cost)( CLG_(sets).full,
   1404 		    CLG_(current_call_stack).entry[i].enter_cost,
   1405 		    CLG_(current_state).cost );
   1406     CLG_(current_call_stack).entry[i].jcc->call_counter = 0;
   1407   }
   1408 
   1409   CLG_(forall_bbccs)(CLG_(zero_bbcc));
   1410 
   1411   /* set counter for last dump */
   1412   CLG_(copy_cost)( CLG_(sets).full,
   1413 		  t->lastdump_cost, CLG_(current_state).cost );
   1414 }
   1415 
   1416 void CLG_(zero_all_cost)(Bool only_current_thread)
   1417 {
   1418   if (VG_(clo_verbosity) > 1)
   1419     VG_(message)(Vg_DebugMsg, "  Zeroing costs...\n");
   1420 
   1421   if (only_current_thread)
   1422     zero_thread_cost(CLG_(get_current_thread)());
   1423   else
   1424     CLG_(forall_threads)(zero_thread_cost);
   1425 
   1426   if (VG_(clo_verbosity) > 1)
   1427     VG_(message)(Vg_DebugMsg, "  ...done\n");
   1428 }
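
        /* Client-side sketch (hypothetical example program): the macros from
           callgrind.h emit client requests that are handled further below in
           CLG_(handle_client_request), ending up in CLG_(zero_all_cost)
           resp. CLG_(dump_profile):

               #include <valgrind/callgrind.h>

               void profile_region(void)
               {
                  CALLGRIND_ZERO_STATS;    // drop warm-up costs collected so far
                  interesting_work();      // hypothetical workload
                  CALLGRIND_DUMP_STATS;    // write a dump covering just this region
               }
        */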
   1429 
   1430 static
   1431 void unwind_thread(thread_info* t)
   1432 {
   1433   /* unwind signal handlers */
   1434   while(CLG_(current_state).sig !=0)
   1435     CLG_(post_signal)(CLG_(current_tid),CLG_(current_state).sig);
   1436 
   1437   /* unwind regular call stack */
   1438   while(CLG_(current_call_stack).sp>0)
   1439     CLG_(pop_call_stack)();
   1440 
   1441   /* reset context and function stack for context generation */
   1442   CLG_(init_exec_state)( &CLG_(current_state) );
   1443   CLG_(current_fn_stack).top = CLG_(current_fn_stack).bottom;
   1444 }
   1445 
   1446 static
   1447 void zero_state_cost(thread_info* t)
   1448 {
   1449     CLG_(zero_cost)( CLG_(sets).full, CLG_(current_state).cost );
   1450 }
   1451 
   1452 void CLG_(set_instrument_state)(const HChar* reason, Bool state)
   1453 {
   1454   if (CLG_(instrument_state) == state) {
   1455     CLG_DEBUG(2, "%s: instrumentation already %s\n",
   1456 	     reason, state ? "ON" : "OFF");
   1457     return;
   1458   }
   1459   CLG_(instrument_state) = state;
   1460   CLG_DEBUG(2, "%s: Switching instrumentation %s ...\n",
   1461 	   reason, state ? "ON" : "OFF");
   1462 
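          /* Discard all translations so that everything gets retranslated,
             and thus re-instrumented or de-instrumented, according to the
             new state (range covers the whole address space above page 0). */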
   1463   VG_(discard_translations_safely)( (Addr)0x1000, ~(SizeT)0xfff, "callgrind");
   1464 
   1465   /* reset internal state: call stacks, simulator */
   1466   CLG_(forall_threads)(unwind_thread);
   1467   CLG_(forall_threads)(zero_state_cost);
   1468   (*CLG_(cachesim).clear)();
   1469 
   1470   if (VG_(clo_verbosity) > 1)
   1471     VG_(message)(Vg_DebugMsg, "%s: instrumentation switched %s\n",
   1472 		 reason, state ? "ON" : "OFF");
   1473 }
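
        /* Sketch: combined with --instr-atstart=no, a client can restrict
           instrumentation (and its overhead) to a region of interest; the
           macros below reach this function via CLG_(handle_client_request):

               #include <valgrind/callgrind.h>

               void measured_phase(void)
               {
                  CALLGRIND_START_INSTRUMENTATION;  // discards translations, see above
                  workload();                       // hypothetical workload
                  CALLGRIND_STOP_INSTRUMENTATION;
               }
        */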
   1474 
   1475 /* helper for dump_state_togdb */
   1476 static void dump_state_of_thread_togdb(thread_info* ti)
   1477 {
   1478     static FullCost sum = 0, tmp = 0;
   1479     Int t, i;
   1480     BBCC *from, *to;
   1481     call_entry* ce;
   1482     HChar *mcost;
   1483 
   1484     t = CLG_(current_tid);
   1485     CLG_(init_cost_lz)( CLG_(sets).full, &sum );
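            /* CLG_(add_diff_cost) overwrites its "old cost" argument
               (lastdump_cost) as a side effect; tmp keeps a copy which is
               restored below, so this query does not count as a real dump. */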
   1486     CLG_(copy_cost_lz)( CLG_(sets).full, &tmp, ti->lastdump_cost );
   1487     CLG_(add_diff_cost)( CLG_(sets).full, sum, ti->lastdump_cost,
   1488 			 ti->states.entry[0]->cost);
   1489     CLG_(copy_cost)( CLG_(sets).full, ti->lastdump_cost, tmp );
   1490     mcost = CLG_(mappingcost_as_string)(CLG_(dumpmap), sum);
   1491     VG_(gdb_printf)("events-%d: %s\n", t, mcost);
   1492     VG_(free)(mcost);
   1493     VG_(gdb_printf)("frames-%d: %d\n", t, CLG_(current_call_stack).sp);
   1494 
   1495     ce = 0;
   1496     for(i = 0; i < CLG_(current_call_stack).sp; i++) {
   1497       ce = CLG_(get_call_entry)(i);
   1498       /* if this frame is skipped, we don't have counters */
   1499       if (!ce->jcc) continue;
   1500 
   1501       from = ce->jcc->from;
   1502       VG_(gdb_printf)("function-%d-%d: %s\n",t, i, from->cxt->fn[0]->name);
   1503       VG_(gdb_printf)("calls-%d-%d: %llu\n",t, i, ce->jcc->call_counter);
   1504 
   1505       /* FIXME: EventSets! */
   1506       CLG_(copy_cost)( CLG_(sets).full, sum, ce->jcc->cost );
   1507       CLG_(copy_cost)( CLG_(sets).full, tmp, ce->enter_cost );
   1508       CLG_(add_diff_cost)( CLG_(sets).full, sum,
   1509 			  ce->enter_cost, CLG_(current_state).cost );
   1510       CLG_(copy_cost)( CLG_(sets).full, ce->enter_cost, tmp );
   1511 
   1512       mcost = CLG_(mappingcost_as_string)(CLG_(dumpmap), sum);
   1513       VG_(gdb_printf)("events-%d-%d: %s\n",t, i, mcost);
   1514       VG_(free)(mcost);
   1515     }
   1516     if (ce && ce->jcc) {
   1517       to = ce->jcc->to;
   1518       VG_(gdb_printf)("function-%d-%d: %s\n",t, i, to->cxt->fn[0]->name );
   1519     }
   1520 }
   1521 
   1522 /* Dump current state */
   1523 static void dump_state_togdb(void)
   1524 {
   1525     thread_info** th;
   1526     int t;
   1527     Int orig_tid = CLG_(current_tid);
   1528 
   1529     VG_(gdb_printf)("instrumentation: %s\n",
   1530 		    CLG_(instrument_state) ? "on":"off");
   1531     if (!CLG_(instrument_state)) return;
   1532 
   1533     VG_(gdb_printf)("executed-bbs: %llu\n", CLG_(stat).bb_executions);
   1534     VG_(gdb_printf)("executed-calls: %llu\n", CLG_(stat).call_counter);
   1535     VG_(gdb_printf)("distinct-bbs: %d\n", CLG_(stat).distinct_bbs);
   1536     VG_(gdb_printf)("distinct-calls: %d\n", CLG_(stat).distinct_jccs);
   1537     VG_(gdb_printf)("distinct-functions: %d\n", CLG_(stat).distinct_fns);
   1538     VG_(gdb_printf)("distinct-contexts: %d\n", CLG_(stat).distinct_contexts);
   1539 
   1540     /* "events:" line; given here because the event set may become dynamic in the future */
   1541     HChar *evmap = CLG_(eventmapping_as_string)(CLG_(dumpmap));
   1542     VG_(gdb_printf)("events: %s\n", evmap);
   1543     VG_(free)(evmap);
   1544     /* "part:" line (number of the last dump part; 0 at start) */
   1545     VG_(gdb_printf)("part: %d\n", CLG_(get_dump_counter)());
   1546 
   1547     /* threads */
   1548     th = CLG_(get_threads)();
   1549     VG_(gdb_printf)("threads:");
   1550     for(t=1;t<VG_N_THREADS;t++) {
   1551 	if (!th[t]) continue;
   1552 	VG_(gdb_printf)(" %d", t);
   1553     }
   1554     VG_(gdb_printf)("\n");
   1555     VG_(gdb_printf)("current-tid: %d\n", orig_tid);
   1556     CLG_(forall_threads)(dump_state_of_thread_togdb);
   1557 }
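
        /* Illustrative output for "status internal" (invented values,
           single thread, plain Ir event set):

               instrumentation: on
               executed-bbs: 12345
               executed-calls: 678
               distinct-bbs: 90
               distinct-calls: 12
               distinct-functions: 8
               distinct-contexts: 8
               events: Ir
               part: 0
               threads: 1
               current-tid: 1
               events-1: 12345
               frames-1: 0
        */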
   1558 
   1559 
   1560 static void print_monitor_help ( void )
   1561 {
   1562    VG_(gdb_printf) ("\n");
   1563    VG_(gdb_printf) ("callgrind monitor commands:\n");
   1564    VG_(gdb_printf) ("  dump [<dump_hint>]\n");
   1565    VG_(gdb_printf) ("        dump counters\n");
   1566    VG_(gdb_printf) ("  zero\n");
   1567    VG_(gdb_printf) ("        zero counters\n");
   1568    VG_(gdb_printf) ("  status\n");
   1569    VG_(gdb_printf) ("        print status\n");
   1570    VG_(gdb_printf) ("  instrumentation [on|off]\n");
   1571    VG_(gdb_printf) ("        get/set (if on/off given) instrumentation state\n");
   1572    VG_(gdb_printf) ("\n");
   1573 }
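
        /* Sketch of how these commands are issued: from gdb attached to the
           Valgrind gdbserver, or standalone via vgdb (both assume the client
           was started with gdbserver support, e.g. --vgdb=yes):

               (gdb) monitor status
               (gdb) monitor instrumentation off

               $ vgdb instrumentation on
               $ vgdb dump manually requested
        */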
   1574 
   1575 /* return True if request recognised, False otherwise */
   1576 static Bool handle_gdb_monitor_command (ThreadId tid, const HChar *req)
   1577 {
   1578    HChar* wcmd;
   1579    HChar s[VG_(strlen)(req) + 1]; /* copy for strtok_r */
   1580    HChar *ssaveptr;
   1581 
   1582    VG_(strcpy) (s, req);
   1583 
   1584    wcmd = VG_(strtok_r) (s, " ", &ssaveptr);
   1585    switch (VG_(keyword_id) ("help dump zero status instrumentation",
   1586                             wcmd, kwd_report_duplicated_matches)) {
   1587    case -2: /* multiple matches */
   1588       return True;
   1589    case -1: /* not found */
   1590       return False;
   1591    case  0: /* help */
   1592       print_monitor_help();
   1593       return True;
   1594    case  1: { /* dump */
   1595       CLG_(dump_profile)(req, False);
   1596       return True;
   1597    }
   1598    case  2: { /* zero */
   1599       CLG_(zero_all_cost)(False);
   1600       return True;
   1601    }
   1602 
   1603    case 3: { /* status */
   1604      HChar* arg = VG_(strtok_r) (0, " ", &ssaveptr);
   1605      if (arg && (VG_(strcmp)(arg, "internal") == 0)) {
   1606        /* internal interface to callgrind_control */
   1607        dump_state_togdb();
   1608        return True;
   1609      }
   1610 
   1611      if (!CLG_(instrument_state)) {
   1612        VG_(gdb_printf)("No status available as instrumentation is switched off\n");
   1613      } else {
   1614        // Status information to be improved ...
   1615        thread_info** th = CLG_(get_threads)();
   1616        Int t, tcount = 0;
   1617        for(t=1;t<VG_N_THREADS;t++)
   1618 	 if (th[t]) tcount++;
   1619        VG_(gdb_printf)("%d thread(s) running.\n", tcount);
   1620      }
   1621      return True;
   1622    }
   1623 
   1624    case 4: { /* instrumentation */
   1625      HChar* arg = VG_(strtok_r) (0, " ", &ssaveptr);
   1626      if (!arg) {
   1627        VG_(gdb_printf)("instrumentation: %s\n",
   1628 		       CLG_(instrument_state) ? "on":"off");
   1629      }
   1630      else
   1631        CLG_(set_instrument_state)("Command", VG_(strcmp)(arg,"off")!=0);
   1632      return True;
   1633    }
   1634 
   1635    default:
   1636       tl_assert(0);
   1637       return False;
   1638    }
   1639 }
   1640 
   1641 static
   1642 Bool CLG_(handle_client_request)(ThreadId tid, UWord *args, UWord *ret)
   1643 {
   1644    if (!VG_IS_TOOL_USERREQ('C','T',args[0])
   1645        && VG_USERREQ__GDB_MONITOR_COMMAND   != args[0])
   1646       return False;
   1647 
   1648    switch(args[0]) {
   1649    case VG_USERREQ__DUMP_STATS:
   1650       CLG_(dump_profile)("Client Request", True);
   1651       *ret = 0;                 /* meaningless */
   1652       break;
   1653 
   1654    case VG_USERREQ__DUMP_STATS_AT:
   1655      {
   1656        const HChar *arg = (HChar*)args[1];
   1657        HChar buf[30 + VG_(strlen)(arg)];    // large enough
   1658        VG_(sprintf)(buf,"Client Request: %s", arg);
   1659        CLG_(dump_profile)(buf, True);
   1660        *ret = 0;                 /* meaningless */
   1661      }
   1662      break;
   1663 
   1664    case VG_USERREQ__ZERO_STATS:
   1665       CLG_(zero_all_cost)(True);
   1666       *ret = 0;                 /* meaningless */
   1667       break;
   1668 
   1669    case VG_USERREQ__TOGGLE_COLLECT:
   1670      CLG_(current_state).collect = !CLG_(current_state).collect;
   1671      CLG_DEBUG(2, "Client Request: toggled collection state to %s\n",
   1672 	      CLG_(current_state).collect ? "ON" : "OFF");
   1673      *ret = 0;                 /* meaningless */
   1674      break;
   1675 
   1676    case VG_USERREQ__START_INSTRUMENTATION:
   1677      CLG_(set_instrument_state)("Client Request", True);
   1678      *ret = 0;                 /* meaningless */
   1679      break;
   1680 
   1681    case VG_USERREQ__STOP_INSTRUMENTATION:
   1682      CLG_(set_instrument_state)("Client Request", False);
   1683      *ret = 0;                 /* meaningless */
   1684      break;
   1685 
   1686    case VG_USERREQ__GDB_MONITOR_COMMAND: {
   1687       Bool handled = handle_gdb_monitor_command (tid, (HChar*)args[1]);
   1688       if (handled)
   1689          *ret = 1;
   1690       else
   1691          *ret = 0;
   1692       return handled;
   1693    }
   1694    default:
   1695       return False;
   1696    }
   1697 
   1698    return True;
   1699 }
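
        /* Client-side sketch (hypothetical example program): together with
           --collect-atstart=no, VG_USERREQ__TOGGLE_COLLECT lets a client
           attribute costs only to the code between the two toggles, while
           instrumentation itself stays enabled:

               #include <valgrind/callgrind.h>

               void hot_path(void)
               {
                  CALLGRIND_TOGGLE_COLLECT;   // start attributing costs
                  work();                     // hypothetical workload
                  CALLGRIND_TOGGLE_COLLECT;   // stop again
               }
        */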
   1700 
   1701 
   1702 /* Syscall Timing */
   1703 
   1704 /* struct timeval syscalltime[VG_N_THREADS]; */
   1705 #if CLG_MICROSYSTIME
   1706 ULong *syscalltime;
   1707 #else
   1708 UInt *syscalltime;
   1709 #endif
   1710 
   1711 static
   1712 void CLG_(pre_syscalltime)(ThreadId tid, UInt syscallno,
   1713                            UWord* args, UInt nArgs)
   1714 {
   1715   if (CLG_(clo).collect_systime) {
   1716 #if CLG_MICROSYSTIME
   1717     struct vki_timeval tv_now;
   1718     VG_(gettimeofday)(&tv_now, NULL);
   1719     syscalltime[tid] = tv_now.tv_sec * 1000000ULL + tv_now.tv_usec;
   1720 #else
   1721     syscalltime[tid] = VG_(read_millisecond_timer)();
   1722 #endif
   1723   }
   1724 }
   1725 
   1726 static
   1727 void CLG_(post_syscalltime)(ThreadId tid, UInt syscallno,
   1728                             UWord* args, UInt nArgs, SysRes res)
   1729 {
   1730   if (CLG_(clo).collect_systime &&
   1731       CLG_(current_state).bbcc) {
   1732     Int o;
   1733 #if CLG_MICROSYSTIME
   1734     struct vki_timeval tv_now;
   1735     ULong diff;
   1736 
   1737     VG_(gettimeofday)(&tv_now, NULL);
   1738     diff = (tv_now.tv_sec * 1000000ULL + tv_now.tv_usec) - syscalltime[tid];
   1739 #else
   1740     UInt diff = VG_(read_millisecond_timer)() - syscalltime[tid];
   1741 #endif
   1742 
   1743     /* offset o is for "SysCount", o+1 for "SysTime" */
   1744     o = fullOffset(EG_SYS);
   1745     CLG_ASSERT(o>=0);
   1746     CLG_DEBUG(0,"   Time (Off %d) for Syscall %u: %llu\n", o, syscallno,
   1747               (ULong)diff);
   1748 
   1749     CLG_(current_state).cost[o] ++;
   1750     CLG_(current_state).cost[o+1] += diff;
   1751     if (!CLG_(current_state).bbcc->skipped)
   1752       CLG_(init_cost_lz)(CLG_(sets).full,
   1753 			&(CLG_(current_state).bbcc->skipped));
   1754     CLG_(current_state).bbcc->skipped[o] ++;
   1755     CLG_(current_state).bbcc->skipped[o+1] += diff;
   1756   }
   1757 }
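
        /* Sketch of the cost layout assumed above: with --collect-systime=yes,
           the event group EG_SYS contributes two adjacent events, so for a
           full cost array "cost" and o = fullOffset(EG_SYS):

               cost[o]     "SysCount" - number of syscalls executed
               cost[o+1]   "SysTime"  - time spent in them (ms, or us when
                                        built with CLG_MICROSYSTIME)
        */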
   1758 
   1759 static UInt ULong_width(ULong n)
   1760 {
   1761    UInt w = 0;
   1762    while (n > 0) {
   1763       n = n / 10;
   1764       w++;
   1765    }
   1766    if (w == 0) w = 1;
   1767    return w + (w-1)/3;   // add space for commas
   1768 }
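
        /* Example: n = 1234567 has w = 7 digits; printed with thousands
           separators as "1,234,567" it occupies 7 + (7-1)/3 = 9 characters. */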
   1769 
   1770 static
   1771 void branchsim_printstat(int l1, int l2, int l3)
   1772 {
   1773     static HChar fmt[128];    // large enough
   1774     FullCost total;
   1775     ULong Bc_total_b, Bc_total_mp, Bi_total_b, Bi_total_mp;
   1776     ULong B_total_b, B_total_mp;
   1777 
   1778     total = CLG_(total_cost);
   1779     Bc_total_b  = total[ fullOffset(EG_BC)   ];
   1780     Bc_total_mp = total[ fullOffset(EG_BC)+1 ];
   1781     Bi_total_b  = total[ fullOffset(EG_BI)   ];
   1782     Bi_total_mp = total[ fullOffset(EG_BI)+1 ];
   1783 
   1784     /* Make format string, getting width right for numbers */
   1785     VG_(sprintf)(fmt, "%%s %%,%dllu  (%%,%dllu cond + %%,%dllu ind)\n",
   1786                  l1, l2, l3);
   1787 
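            /* avoid division by zero in the rate computation below; with no
               executed branches of a kind, the mispredict count is 0 anyway */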
   1788     if (0 == Bc_total_b)  Bc_total_b = 1;
   1789     if (0 == Bi_total_b)  Bi_total_b = 1;
   1790     B_total_b  = Bc_total_b  + Bi_total_b;
   1791     B_total_mp = Bc_total_mp + Bi_total_mp;
   1792 
   1793     VG_(umsg)("\n");
   1794     VG_(umsg)(fmt, "Branches:     ",
   1795               B_total_b, Bc_total_b, Bi_total_b);
   1796 
   1797     VG_(umsg)(fmt, "Mispredicts:  ",
   1798               B_total_mp, Bc_total_mp, Bi_total_mp);
   1799 
   1800     VG_(umsg)("Mispred rate:  %*.1f%% (%*.1f%%     + %*.1f%%   )\n",
   1801               l1, B_total_mp  * 100.0 / B_total_b,
   1802               l2, Bc_total_mp * 100.0 / Bc_total_b,
   1803               l3, Bi_total_mp * 100.0 / Bi_total_b);
   1804 }
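
        /* Illustrative output (invented numbers, ~1% mispredict rate):

               Branches:      1,234,567  (1,000,000 cond + 234,567 ind)
               Mispredicts:      12,345  (   10,000 cond +   2,345 ind)
               Mispred rate:        1.0% (      1.0%     +     1.0%   )
        */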
   1805 
   1806 static
   1807 void clg_print_stats(void)
   1808 {
   1809    int BB_lookups =
   1810      CLG_(stat).full_debug_BBs +
   1811      CLG_(stat).fn_name_debug_BBs +
   1812      CLG_(stat).file_line_debug_BBs +
   1813      CLG_(stat).no_debug_BBs;
   1814 
   1815    /* Hash table stats */
   1816    VG_(message)(Vg_DebugMsg, "Distinct objects: %d\n",
   1817 		CLG_(stat).distinct_objs);
   1818    VG_(message)(Vg_DebugMsg, "Distinct files:   %d\n",
   1819 		CLG_(stat).distinct_files);
   1820    VG_(message)(Vg_DebugMsg, "Distinct fns:     %d\n",
   1821 		CLG_(stat).distinct_fns);
   1822    VG_(message)(Vg_DebugMsg, "Distinct contexts:%d\n",
   1823 		CLG_(stat).distinct_contexts);
   1824    VG_(message)(Vg_DebugMsg, "Distinct BBs:     %d\n",
   1825 		CLG_(stat).distinct_bbs);
   1826    VG_(message)(Vg_DebugMsg, "Cost entries:     %u (Chunks %u)\n",
   1827 		CLG_(costarray_entries), CLG_(costarray_chunks));
   1828    VG_(message)(Vg_DebugMsg, "Distinct BBCCs:   %d\n",
   1829 		CLG_(stat).distinct_bbccs);
   1830    VG_(message)(Vg_DebugMsg, "Distinct JCCs:    %d\n",
   1831 		CLG_(stat).distinct_jccs);
   1832    VG_(message)(Vg_DebugMsg, "Distinct skips:   %d\n",
   1833 		CLG_(stat).distinct_skips);
   1834    VG_(message)(Vg_DebugMsg, "BB lookups:       %d\n",
   1835 		BB_lookups);
   1836    if (BB_lookups>0) {
   1837       VG_(message)(Vg_DebugMsg, "With full      debug info:%3d%% (%d)\n",
   1838 		   CLG_(stat).full_debug_BBs    * 100 / BB_lookups,
   1839 		   CLG_(stat).full_debug_BBs);
   1840       VG_(message)(Vg_DebugMsg, "With file/line debug info:%3d%% (%d)\n",
   1841 		   CLG_(stat).file_line_debug_BBs * 100 / BB_lookups,
   1842 		   CLG_(stat).file_line_debug_BBs);
   1843       VG_(message)(Vg_DebugMsg, "With fn name   debug info:%3d%% (%d)\n",
   1844 		   CLG_(stat).fn_name_debug_BBs * 100 / BB_lookups,
   1845 		   CLG_(stat).fn_name_debug_BBs);
   1846       VG_(message)(Vg_DebugMsg, "With no        debug info:%3d%% (%d)\n",
   1847 		   CLG_(stat).no_debug_BBs      * 100 / BB_lookups,
   1848 		   CLG_(stat).no_debug_BBs);
   1849    }
   1850    VG_(message)(Vg_DebugMsg, "BBCC Clones:       %d\n",
   1851 		CLG_(stat).bbcc_clones);
   1852    VG_(message)(Vg_DebugMsg, "BBs Retranslated:  %d\n",
   1853 		CLG_(stat).bb_retranslations);
   1854    VG_(message)(Vg_DebugMsg, "Distinct instrs:   %d\n",
   1855 		CLG_(stat).distinct_instrs);
   1856 
   1857    VG_(message)(Vg_DebugMsg, "LRU Context Misses: %d\n",
   1858 		CLG_(stat).cxt_lru_misses);
   1859    VG_(message)(Vg_DebugMsg, "LRU BBCC Misses:    %d\n",
   1860 		CLG_(stat).bbcc_lru_misses);
   1861    VG_(message)(Vg_DebugMsg, "LRU JCC Misses:     %d\n",
   1862 		CLG_(stat).jcc_lru_misses);
   1863    VG_(message)(Vg_DebugMsg, "BBs Executed:       %llu\n",
   1864 		CLG_(stat).bb_executions);
   1865    VG_(message)(Vg_DebugMsg, "Calls:              %llu\n",
   1866 		CLG_(stat).call_counter);
   1867    VG_(message)(Vg_DebugMsg, "CondJMP followed:   %llu\n",
   1868 		CLG_(stat).jcnd_counter);
   1869    VG_(message)(Vg_DebugMsg, "Boring JMPs:        %llu\n",
   1870 		CLG_(stat).jump_counter);
   1871    VG_(message)(Vg_DebugMsg, "Recursive calls:    %llu\n",
   1872 		CLG_(stat).rec_call_counter);
   1873    VG_(message)(Vg_DebugMsg, "Returns:            %llu\n",
   1874 		CLG_(stat).ret_counter);
   1875 }
   1876 
   1877 
   1878 static
   1879 void finish(void)
   1880 {
   1881   HChar fmt[128];    // large enough
   1882   Int l1, l2, l3;
   1883   FullCost total;
   1884 
   1885   CLG_DEBUG(0, "finish()\n");
   1886 
   1887   (*CLG_(cachesim).finish)();
   1888 
   1889   /* pop all remaining items from CallStack for correct sum
   1890    */
   1891   CLG_(forall_threads)(unwind_thread);
   1892 
   1893   CLG_(dump_profile)(0, False);
   1894 
   1895   if (VG_(clo_verbosity) == 0) return;
   1896 
   1897   if (VG_(clo_stats)) {
   1898     VG_(message)(Vg_DebugMsg, "\n");
   1899     clg_print_stats();
   1900     VG_(message)(Vg_DebugMsg, "\n");
   1901   }
   1902 
   1903   HChar *evmap = CLG_(eventmapping_as_string)(CLG_(dumpmap));
   1904   VG_(message)(Vg_UserMsg, "Events    : %s\n", evmap);
   1905   VG_(free)(evmap);
   1906   HChar *mcost = CLG_(mappingcost_as_string)(CLG_(dumpmap), CLG_(total_cost));
   1907   VG_(message)(Vg_UserMsg, "Collected : %s\n", mcost);
   1908   VG_(free)(mcost);
   1909   VG_(message)(Vg_UserMsg, "\n");
   1910 
   1911   /* determine value widths for statistics */
   1912   total = CLG_(total_cost);
   1913   l1 = ULong_width( total[fullOffset(EG_IR)] );
   1914   l2 = l3 = 0;
   1915   if (CLG_(clo).simulate_cache) {
   1916       l2 = ULong_width( total[fullOffset(EG_DR)] );
   1917       l3 = ULong_width( total[fullOffset(EG_DW)] );
   1918   }
   1919   if (CLG_(clo).simulate_branch) {
   1920       int l2b = ULong_width( total[fullOffset(EG_BC)] );
   1921       int l3b = ULong_width( total[fullOffset(EG_BI)] );
   1922       if (l2b > l2) l2 = l2b;
   1923       if (l3b > l3) l3 = l3b;
   1924   }
   1925 
   1926   /* Make format string, getting width right for numbers */
   1927   VG_(sprintf)(fmt, "%%s %%,%dllu\n", l1);
   1928 
   1929   /* Always print this */
   1930   VG_(umsg)(fmt, "I   refs:     ", total[fullOffset(EG_IR)] );
   1931 
   1932   if (CLG_(clo).simulate_cache)
   1933       (*CLG_(cachesim).printstat)(l1, l2, l3);
   1934 
   1935   if (CLG_(clo).simulate_branch)
   1936       branchsim_printstat(l1, l2, l3);
   1937 
   1938 }
   1939 
   1940 
   1941 void CLG_(fini)(Int exitcode)
   1942 {
   1943   finish();
   1944 }
   1945 
   1946 
   1947 /*--------------------------------------------------------------------*/
   1948 /*--- Setup                                                        ---*/
   1949 /*--------------------------------------------------------------------*/
   1950 
   1951 static void clg_start_client_code_callback ( ThreadId tid, ULong blocks_done )
   1952 {
   1953    static ULong last_blocks_done = 0;
   1954 
   1955    if (0)
   1956       VG_(printf)("%d R %llu\n", (Int)tid, blocks_done);
   1957 
   1958    /* throttle calls to CLG_(run_thread) by number of BBs executed */
   1959    if (blocks_done - last_blocks_done < 5000) return;
   1960    last_blocks_done = blocks_done;
   1961 
   1962    CLG_(run_thread)( tid );
   1963 }
   1964 
   1965 static
   1966 void CLG_(post_clo_init)(void)
   1967 {
   1968    if (VG_(clo_vex_control).iropt_register_updates_default
   1969        != VexRegUpdSpAtMemAccess) {
   1970       CLG_DEBUG(1, " Using user specified value for "
   1971                 "--vex-iropt-register-updates\n");
   1972    } else {
   1973       CLG_DEBUG(1,
   1974                 " Using default --vex-iropt-register-updates="
   1975                 "sp-at-mem-access\n");
   1976    }
   1977 
   1978    if (VG_(clo_px_file_backed) != VexRegUpdSpAtMemAccess) {
   1979       CLG_DEBUG(1, " Using user specified value for "
   1980                 "--px-file-backed\n");
   1981    } else {
   1982       CLG_DEBUG(1,
   1983                 " Using default --px-file-backed="
   1984                 "sp-at-mem-access\n");
   1985    }
   1986 
   1987    if (VG_(clo_vex_control).iropt_unroll_thresh != 0) {
   1988       VG_(message)(Vg_UserMsg,
   1989                    "callgrind only works with --vex-iropt-unroll-thresh=0\n"
   1990                    "=> resetting it back to 0\n");
   1991       VG_(clo_vex_control).iropt_unroll_thresh = 0;   // cannot be overridden.
   1992    }
   1993    if (VG_(clo_vex_control).guest_chase_thresh != 0) {
   1994       VG_(message)(Vg_UserMsg,
   1995                    "callgrind only works with --vex-guest-chase-thresh=0\n"
   1996                    "=> resetting it back to 0\n");
   1997       VG_(clo_vex_control).guest_chase_thresh = 0; // cannot be overridden.
   1998    }
   1999 
   2000    CLG_DEBUG(1, "  dump threads: %s\n", CLG_(clo).separate_threads ? "Yes":"No");
   2001    CLG_DEBUG(1, "  call sep. : %d\n", CLG_(clo).separate_callers);
   2002    CLG_DEBUG(1, "  rec. sep. : %d\n", CLG_(clo).separate_recursions);
   2003 
   2004    if (!CLG_(clo).dump_line && !CLG_(clo).dump_instr && !CLG_(clo).dump_bb) {
   2005        VG_(message)(Vg_UserMsg, "Using source line as position.\n");
   2006        CLG_(clo).dump_line = True;
   2007    }
   2008 
   2009    CLG_(init_dumps)();
   2010 
   2011    (*CLG_(cachesim).post_clo_init)();
   2012 
   2013    CLG_(init_eventsets)();
   2014    CLG_(init_statistics)(& CLG_(stat));
   2015    CLG_(init_cost_lz)( CLG_(sets).full, &CLG_(total_cost) );
   2016 
   2017    /* initialize hash tables */
   2018    CLG_(init_obj_table)();
   2019    CLG_(init_cxt_table)();
   2020    CLG_(init_bb_hash)();
   2021 
   2022    CLG_(init_threads)();
   2023    CLG_(run_thread)(1);
   2024 
   2025    CLG_(instrument_state) = CLG_(clo).instrument_atstart;
   2026 
   2027    if (VG_(clo_verbosity) > 0) {
   2028       VG_(message)(Vg_UserMsg,
   2029                    "For interactive control, run 'callgrind_control%s%s -h'.\n",
   2030                    (VG_(arg_vgdb_prefix) ? " " : ""),
   2031                    (VG_(arg_vgdb_prefix) ? VG_(arg_vgdb_prefix) : ""));
   2032    }
   2033 }
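
        /* Typical invocations exercising the setup above (sketch):

               valgrind --tool=callgrind ./prog                    # line positions (default)
               valgrind --tool=callgrind --dump-instr=yes ./prog   # instruction positions
               valgrind --tool=callgrind --instr-atstart=no ./prog # wait for client request
               valgrind --tool=callgrind --cache-sim=yes --branch-sim=yes ./prog
        */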
   2034 
   2035 static
   2036 void CLG_(pre_clo_init)(void)
   2037 {
   2038     VG_(details_name)            ("Callgrind");
   2039     VG_(details_version)         (NULL);
   2040     VG_(details_description)     ("a call-graph generating cache profiler");
   2041     VG_(details_copyright_author)("Copyright (C) 2002-2015, and GNU GPL'd, "
   2042 				  "by Josef Weidendorfer et al.");
   2043     VG_(details_bug_reports_to)  (VG_BUGS_TO);
   2044     VG_(details_avg_translation_sizeB) ( 500 );
   2045 
   2046     VG_(clo_vex_control).iropt_register_updates_default
   2047        = VG_(clo_px_file_backed)
   2048        = VexRegUpdSpAtMemAccess; // overridable by the user.
   2049 
   2050     VG_(clo_vex_control).iropt_unroll_thresh = 0;   // cannot be overridden.
   2051     VG_(clo_vex_control).guest_chase_thresh = 0;    // cannot be overridden.
   2052 
   2053     VG_(basic_tool_funcs)        (CLG_(post_clo_init),
   2054                                   CLG_(instrument),
   2055                                   CLG_(fini));
   2056 
   2057     VG_(needs_superblock_discards)(clg_discard_superblock_info);
   2058 
   2059 
   2060     VG_(needs_command_line_options)(CLG_(process_cmd_line_option),
   2061 				    CLG_(print_usage),
   2062 				    CLG_(print_debug_usage));
   2063 
   2064     VG_(needs_client_requests)(CLG_(handle_client_request));
   2065     VG_(needs_print_stats)    (clg_print_stats);
   2066     VG_(needs_syscall_wrapper)(CLG_(pre_syscalltime),
   2067 			       CLG_(post_syscalltime));
   2068 
   2069     VG_(track_start_client_code)  ( & clg_start_client_code_callback );
   2070     VG_(track_pre_deliver_signal) ( & CLG_(pre_signal) );
   2071     VG_(track_post_deliver_signal)( & CLG_(post_signal) );
   2072 
   2073     CLG_(set_clo_defaults)();
   2074 
   2075     syscalltime = CLG_MALLOC("cl.main.pci.1",
   2076                              VG_N_THREADS * sizeof syscalltime[0]);
   2077     for (UInt i = 0; i < VG_N_THREADS; ++i) {
   2078        syscalltime[i] = 0;
   2079     }
   2080 }
   2081 
   2082 VG_DETERMINE_INTERFACE_VERSION(CLG_(pre_clo_init))
   2083 
   2084 /*--------------------------------------------------------------------*/
   2085 /*--- end                                                   main.c ---*/
   2086 /*--------------------------------------------------------------------*/
   2087