Home | History | Annotate | Download | only in cygprofile
      1 #!/usr/bin/python
      2 # Copyright 2013 The Chromium Authors. All rights reserved.
      3 # Use of this source code is governed by a BSD-style license that can be
      4 # found in the LICENSE file.
      5 
"""Symbolize log file produced by cygprofile instrumentation.
      7 
      8 Given a log file and the binary being profiled (e.g. executable, shared
      9 library), the script can produce three different outputs: 1) symbols for the
     10 addresses, 2) function and line numbers for the addresses, or 3) an order file.
     11 """
     12 
     13 import optparse
     14 import os
     15 import string
     16 import subprocess
     17 import sys
     18 
     19 
     20 def ParseLogLines(log_file_lines):
     21   """Parse a log file produced by the profiled run of clank.
     22 
     23   Args:
     24     log_file_lines: array of lines in log file produced by profiled run
     25     lib_name: library or executable containing symbols
     26 
     27     Below is an example of a small log file:
     28     5086e000-52e92000 r-xp 00000000 b3:02 51276      libchromeview.so
     29     secs       usecs      pid:threadid    func
     30     START
     31     1314897086 795828     3587:1074648168 0x509e105c
     32     1314897086 795874     3587:1074648168 0x509e0eb4
     33     1314897086 796326     3587:1074648168 0x509e0e3c
     34     1314897086 796552     3587:1074648168 0x509e07bc
     35     END
     36 
     37   Returns:
     38     call_info list with list of tuples of the format (sec, usec, call id,
     39     function address called)
     40   """
     41   call_lines = []
     42   has_started = False
     43   vm_start = 0
     44   line = log_file_lines[0]
     45   assert("r-xp" in line)
     46   end_index = line.find('-')
     47   vm_start = int(line[:end_index], 16)
     48   for line in log_file_lines[2:]:
     49   # print hex(vm_start)
     50     fields = line.split()
     51     if len(fields) == 4:
     52       call_lines.append(fields)
     53 
     54   # Convert strings to int in fields.
     55   call_info = []
     56   for call_line in call_lines:
     57     (sec_timestamp, usec_timestamp) = map(int, call_line[0:2])
     58     callee_id = call_line[2]
     59     addr = int(call_line[3], 16)
     60     if vm_start < addr:
     61       addr -= vm_start
     62       call_info.append((sec_timestamp, usec_timestamp, callee_id, addr))
     63 
     64   return call_info
     65 
     66 
     67 def ParseLibSymbols(lib_file):
     68   """Get output from running nm and greping for text symbols.
     69 
     70   Args:
     71     lib_file: the library or executable that contains the profiled code
     72 
     73   Returns:
     74     list of sorted unique addresses and corresponding size of function symbols
     75     in lib_file and map of addresses to all symbols at a particular address
     76   """
     77   cmd = ['nm', '-S', '-n', lib_file]
     78   nm_p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
     79   output = nm_p.communicate()[0]
     80   nm_lines = output.split('\n')
     81 
     82   nm_symbols = []
     83   for nm_line in nm_lines:
     84     if any(str in nm_line for str in (' t ', ' W ', ' T ')):
     85       nm_symbols.append(nm_line)
     86 
     87   nm_index = 0
     88   unique_addrs = []
     89   address_map = {}
     90   while nm_index < len(nm_symbols):
     91 
     92     # If the length of the split line is not 4, then it does not contain all the
     93     # information needed to symbolize (i.e. address, size and symbol name).
     94     if len(nm_symbols[nm_index].split()) == 4:
     95       (addr, size) = [int(x, 16) for x in nm_symbols[nm_index].split()[0:2]]
     96 
     97       # Multiple symbols may be at the same address.  This is do to aliasing
     98       # done by the compiler.  Since there is no way to be sure which one was
     99       # called in profiled run, we will symbolize to include all symbol names at
    100       # a particular address.
    101       fnames = []
    102       while (nm_index < len(nm_symbols) and
    103              addr == int(nm_symbols[nm_index].split()[0], 16)):
    104         if len(nm_symbols[nm_index].split()) == 4:
    105           fnames.append(nm_symbols[nm_index].split()[3])
    106         nm_index += 1
    107       address_map[addr] = fnames
    108       unique_addrs.append((addr, size))
    109     else:
    110       nm_index += 1
    111 
    112   return (unique_addrs, address_map)
    113 
    114 class SymbolNotFoundException(Exception):
    115   def __init__(self,value):
    116     self.value = value
    117   def __str__(self):
    118     return repr(self.value)
    119 
    120 def BinarySearchAddresses(addr, start, end, arr):
    121   """Find starting address of a symbol at a particular address.
    122 
    123   The reason we can not directly use the address provided by the log file is
    124   that the log file may give an address after the start of the symbol.  The
    125   logged address is often one byte after the start.  By using this search
    126   function rather than just subtracting one from the logged address allows
    127   the logging instrumentation to log any address in a function.
    128 
    129   Args:
    130     addr: the address being searched for
    131     start: the starting index for the binary search
    132     end: the ending index for the binary search
    133     arr: the list being searched containing tuple of address and size
    134 
    135   Returns:
    136     the starting address of the symbol at address addr
    137 
    138   Raises:
    139     Exception: if address not found.  Functions expects all logged addresses
    140     to be found
    141   """
    142   # print "addr: " + str(addr) + " start: " + str(start) + " end: " + str(end)
    143   if start >= end or start == end - 1:
    144     # arr[i] is a tuple of address and size.  Check if addr inside range
    145     if addr >= arr[start][0] and addr < arr[start][0] + arr[start][1]:
    146       return arr[start][0]
    147     elif addr >= arr[end][0] and addr < arr[end][0] + arr[end][1]:
    148       return arr[end][0]
    149     else:
    150       raise SymbolNotFoundException(addr)
    151   else:
    152     halfway = (start + end) / 2
    153     (nm_addr, size) = arr[halfway]
    154     # print "nm_addr: " + str(nm_addr) + " halfway: " + str(halfway)
    155     if addr >= nm_addr and addr < nm_addr + size:
    156       return nm_addr
    157     elif addr < nm_addr:
    158       return BinarySearchAddresses(addr, start, halfway-1, arr)
    159     else:
    160       # Condition (addr >= nm_addr + size) must be true.
    161       return BinarySearchAddresses(addr, halfway+1, end, arr)
    162 
    163 
    164 def FindFunctions(addr, unique_addrs, address_map):
    165   """Find function symbol names at address addr."""
    166   return address_map[BinarySearchAddresses(addr, 0, len(unique_addrs) - 1,
    167                                            unique_addrs)]
    168 
    169 
    170 def AddrToLine(addr, lib_file):
    171   """Use addr2line to determine line info of a particular address."""
    172   cmd = ['addr2line', '-f', '-e', lib_file, hex(addr)]
    173   p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
    174   output = (p.communicate()[0]).split('\n')
    175   line = output[0]
    176   index = 1
    177   while index < len(output):
    178     line = line + ':' + output[index]
    179     index += 1
    180   return line
    181 
    182 
    183 def main():
    184   """Write output for profiled run to standard out.
    185 
    186   The format of the output depends on the output type specified as the third
    187   command line argument.  The default output type is to symbolize the addresses
    188   of the functions called.
    189   """
    190   parser = optparse.OptionParser('usage: %prog [options] log_file lib_file')
    191   parser.add_option('-t', '--outputType', dest='output_type',
    192                     default='symbolize', type='string',
    193                     help='lineize or symbolize or orderfile')
    194 
    195   # Option for output type.  The log file and lib file arguments are required
    196   # by the script and therefore are not options.
    197   (options, args) = parser.parse_args()
    198   if len(args) != 2:
    199     parser.error('expected 2 args: log_file lib_file')
    200 
    201   (log_file, lib_file) = args
    202   output_type = options.output_type
    203 
    204   lib_name = lib_file.split('/')[-1].strip()
    205   log_file_lines = map(string.rstrip, open(log_file).readlines())
    206   call_info = ParseLogLines(log_file_lines)
    207   (unique_addrs, address_map) = ParseLibSymbols(lib_file)
    208 
    209   # Check for duplicate addresses in the log file, and print a warning if
    210   # duplicates are found. The instrumentation that produces the log file
    211   # should only print the first time a function is entered.
    212   addr_list = []
    213   for call in call_info:
    214     addr = call[3]
    215     if addr not in addr_list:
    216       addr_list.append(addr)
    217     else:
    218       print('WARNING: Address ' + hex(addr) + ' (line= ' +
    219             AddrToLine(addr, lib_file) + ') already profiled.')
    220 
    221   for call in call_info:
    222     if output_type == 'lineize':
    223       symbol = AddrToLine(call[3], lib_file)
    224       print(str(call[0]) + ' ' + str(call[1]) + '\t' + str(call[2]) + '\t'
    225             + symbol)
    226     elif output_type == 'orderfile':
    227       try:
    228         symbols = FindFunctions(call[3], unique_addrs, address_map)
    229         for symbol in symbols:
    230           print '.text.' + symbol
    231         print ''
    232       except SymbolNotFoundException as e:
    233         sys.stderr.write('WARNING: Did not find function in binary. addr: '
    234                       + hex(addr) + '\n')
    235     else:
    236       try:
    237         symbols = FindFunctions(call[3], unique_addrs, address_map)
    238         print(str(call[0]) + ' ' + str(call[1]) + '\t' + str(call[2]) + '\t'
    239               + symbols[0])
    240         first_symbol = True
    241         for symbol in symbols:
    242           if not first_symbol:
    243             print '\t\t\t\t\t' + symbol
    244           else:
    245             first_symbol = False
    246       except SymbolNotFoundException as e:
    247         sys.stderr.write('WARNING: Did not find function in binary. addr: '
    248                       + hex(addr) + '\n')
    249 
# Run main() only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == '__main__':
  main()
    252