Home | History | Annotate | Download | only in binary_size
      1 #!/usr/bin/env python
      2 # Copyright 2014 The Chromium Authors. All rights reserved.
      3 # Use of this source code is governed by a BSD-style license that can be
      4 # found in the LICENSE file.
      5 
      6 """Generate a spatial analysis against an arbitrary library.
      7 
      8 To use, build the 'binary_size_tool' target. Then run this tool, passing
      9 in the location of the library to be analyzed along with any other options
     10 you desire.
     11 """
     12 
     13 import collections
     14 import json
     15 import logging
     16 import multiprocessing
     17 import optparse
     18 import os
     19 import re
     20 import shutil
     21 import struct
     22 import subprocess
     23 import sys
     24 import tempfile
     25 import time
     26 
     27 import binary_size_utils
     28 
# This path change is not beautiful. Temporary (I hope) measure until
# the chromium project has figured out a proper way to organize the
# library of python tools. http://crbug.com/375725
# Reaches from tools/binary_size up to build/android/pylib so that
# symbols.elf_symbolizer can be imported below.
elf_symbolizer_path = os.path.abspath(os.path.join(
    os.path.dirname(__file__),
    '..',
    '..',
    'build',
    'android',
    'pylib'))
sys.path.append(elf_symbolizer_path)
import symbols.elf_symbolizer as elf_symbolizer  # pylint: disable=F0401
     41 
     42 
# Node dictionary keys. These are output in json read by the webapp so
# keep them short to save file size.
# Note: If these change, the webapp must also change.
NODE_TYPE_KEY = 'k'  # Node kind: 'p' (path), 'b' (bucket) or 's' (symbol).
NODE_NAME_KEY = 'n'  # Display name of the node.
NODE_CHILDREN_KEY = 'children'  # Dict (later list) of child nodes.
NODE_SYMBOL_TYPE_KEY = 't'  # One-letter nm symbol type.
NODE_SYMBOL_SIZE_KEY = 'value'  # Symbol size in bytes.
NODE_MAX_DEPTH_KEY = 'maxDepth'  # Depth of the deepest leaf (root only).
NODE_LAST_PATH_ELEMENT_KEY = 'lastPathElement'  # Marks file (not dir) nodes.

# The display name of the bucket where we put symbols without path.
NAME_NO_PATH_BUCKET = '(No Path)'

# Try to keep data buckets smaller than this to avoid killing the
# graphing lib.
BIG_BUCKET_LIMIT = 3000
     60 
     61 
     62 # TODO(andrewhayden): Only used for legacy reports. Delete.
     63 def FormatBytes(byte_count):
     64   """Pretty-print a number of bytes."""
     65   if byte_count > 1e6:
     66     byte_count = byte_count / 1.0e6
     67     return '%.1fm' % byte_count
     68   if byte_count > 1e3:
     69     byte_count = byte_count / 1.0e3
     70     return '%.1fk' % byte_count
     71   return str(byte_count)
     72 
     73 
     74 # TODO(andrewhayden): Only used for legacy reports. Delete.
     75 def SymbolTypeToHuman(symbol_type):
     76   """Convert a symbol type as printed by nm into a human-readable name."""
     77   return {'b': 'bss',
     78           'd': 'data',
     79           'r': 'read-only data',
     80           't': 'code',
     81           'w': 'weak symbol',
     82           'v': 'weak symbol'}[symbol_type]
     83 
     84 
     85 def _MkChild(node, name):
     86   child = node[NODE_CHILDREN_KEY].get(name)
     87   if child is None:
     88     child = {NODE_NAME_KEY: name,
     89              NODE_CHILDREN_KEY: {}}
     90     node[NODE_CHILDREN_KEY][name] = child
     91   return child
     92 
     93 
     94 
def SplitNoPathBucket(node):
  """NAME_NO_PATH_BUCKET can be too large for the graphing lib to
  handle. Split it into sub-buckets in that case."""
  root_children = node[NODE_CHILDREN_KEY]
  if NAME_NO_PATH_BUCKET in root_children:
    no_path_bucket = root_children[NAME_NO_PATH_BUCKET]
    old_children = no_path_bucket[NODE_CHILDREN_KEY]
    # Count all symbols across the bucket's per-symbol-type children.
    count = 0
    for symbol_type, symbol_bucket in old_children.iteritems():
      count += len(symbol_bucket[NODE_CHILDREN_KEY])
    if count > BIG_BUCKET_LIMIT:
      # Too big: rebuild the bucket's children as numbered subgroups of
      # at most BIG_BUCKET_LIMIT symbols each.
      new_children = {}
      no_path_bucket[NODE_CHILDREN_KEY] = new_children
      current_bucket = None
      index = 0
      for symbol_type, symbol_bucket in old_children.iteritems():
        for symbol_name, value in symbol_bucket[NODE_CHILDREN_KEY].iteritems():
          # Start a fresh subgroup every BIG_BUCKET_LIMIT symbols.
          if index % BIG_BUCKET_LIMIT == 0:
            group_no = (index / BIG_BUCKET_LIMIT) + 1
            current_bucket = _MkChild(no_path_bucket,
                                      '%s subgroup %d' % (NAME_NO_PATH_BUCKET,
                                                          group_no))
            assert not NODE_TYPE_KEY in node or node[NODE_TYPE_KEY] == 'p'
            node[NODE_TYPE_KEY] = 'p'  # p for path
          index += 1
          symbol_size = value[NODE_SYMBOL_SIZE_KEY]
          # Re-insert the symbol under the subgroup, which recreates the
          # per-symbol-type bucket level inside it.
          AddSymbolIntoFileNode(current_bucket, symbol_type,
                                symbol_name, symbol_size)
    123 
    124 
    125 def MakeChildrenDictsIntoLists(node):
    126   largest_list_len = 0
    127   if NODE_CHILDREN_KEY in node:
    128     largest_list_len = len(node[NODE_CHILDREN_KEY])
    129     child_list = []
    130     for child in node[NODE_CHILDREN_KEY].itervalues():
    131       child_largest_list_len = MakeChildrenDictsIntoLists(child)
    132       if child_largest_list_len > largest_list_len:
    133         largest_list_len = child_largest_list_len
    134       child_list.append(child)
    135     node[NODE_CHILDREN_KEY] = child_list
    136 
    137   return largest_list_len
    138 
    139 
    140 def AddSymbolIntoFileNode(node, symbol_type, symbol_name, symbol_size):
    141   """Puts symbol into the file path node |node|.
    142   Returns the number of added levels in tree. I.e. returns 2."""
    143 
    144   # 'node' is the file node and first step is to find its symbol-type bucket.
    145   node[NODE_LAST_PATH_ELEMENT_KEY] = True
    146   node = _MkChild(node, symbol_type)
    147   assert not NODE_TYPE_KEY in node or node[NODE_TYPE_KEY] == 'b'
    148   node[NODE_SYMBOL_TYPE_KEY] = symbol_type
    149   node[NODE_TYPE_KEY] = 'b'  # b for bucket
    150 
    151   # 'node' is now the symbol-type bucket. Make the child entry.
    152   node = _MkChild(node, symbol_name)
    153   if NODE_CHILDREN_KEY in node:
    154     if node[NODE_CHILDREN_KEY]:
    155       logging.warning('A container node used as symbol for %s.' % symbol_name)
    156     # This is going to be used as a leaf so no use for child list.
    157     del node[NODE_CHILDREN_KEY]
    158   node[NODE_SYMBOL_SIZE_KEY] = symbol_size
    159   node[NODE_SYMBOL_TYPE_KEY] = symbol_type
    160   node[NODE_TYPE_KEY] = 's'  # s for symbol
    161 
    162   return 2  # Depth of the added subtree.
    163 
    164 
def MakeCompactTree(symbols, symbol_path_origin_dir):
  """Build the nested path/bucket/symbol dict consumed by the webapp.

  Args:
    symbols: iterable of (name, type, size, path) tuples.
    symbol_path_origin_dir: directory that relative symbol paths are
        resolved against.

  Returns:
    The root node of the tree; NODE_MAX_DEPTH_KEY on it holds the depth
    of the deepest leaf.
  """
  result = {NODE_NAME_KEY: '/',
            NODE_CHILDREN_KEY: {},
            NODE_TYPE_KEY: 'p',
            NODE_MAX_DEPTH_KEY: 0}
  seen_symbol_with_path = False
  cwd = os.path.abspath(os.getcwd())
  for symbol_name, symbol_type, symbol_size, file_path in symbols:

    if 'vtable for ' in symbol_name:
      symbol_type = '@'  # hack to categorize these separately
    # Take path like '/foo/bar/baz', convert to ['foo', 'bar', 'baz']
    if file_path and file_path != "??":
      file_path = os.path.abspath(os.path.join(symbol_path_origin_dir,
                                               file_path))
      # Let the output structure be relative to $CWD if inside $CWD,
      # otherwise relative to the disk root. This is to avoid
      # unnecessary click-through levels in the output.
      if file_path.startswith(cwd + os.sep):
        file_path = file_path[len(cwd):]
      if file_path.startswith('/'):
        file_path = file_path[1:]
      seen_symbol_with_path = True
    else:
      file_path = NAME_NO_PATH_BUCKET

    path_parts = file_path.split('/')

    # Find pre-existing node in tree, or update if it already exists
    node = result
    depth = 0
    while len(path_parts) > 0:
      path_part = path_parts.pop(0)
      if len(path_part) == 0:
        continue
      depth += 1
      node = _MkChild(node, path_part)
      assert not NODE_TYPE_KEY in node or node[NODE_TYPE_KEY] == 'p'
      node[NODE_TYPE_KEY] = 'p'  # p for path

    # |node| is now the file node; add the bucket+symbol levels below it.
    depth += AddSymbolIntoFileNode(node, symbol_type, symbol_name, symbol_size)
    result[NODE_MAX_DEPTH_KEY] = max(result[NODE_MAX_DEPTH_KEY], depth)

  if not seen_symbol_with_path:
    logging.warning('Symbols lack paths. Data will not be structured.')

  # The (no path) bucket can be extremely large if we failed to get
  # path information. Split it into subgroups if needed.
  SplitNoPathBucket(result)

  largest_list_len = MakeChildrenDictsIntoLists(result)

  if largest_list_len > BIG_BUCKET_LIMIT:
    logging.warning('There are sections with %d nodes. '
                    'Results might be unusable.' % largest_list_len)
  return result
    221 
    222 
    223 # TODO(andrewhayden): Only used for legacy reports. Delete.
def TreeifySymbols(symbols):
  """Convert symbols into a path-based tree, calculating size information
  along the way.

  The result is a dictionary that contains two kinds of nodes:
  1. Leaf nodes, representing source code locations (e.g., c++ files)
     These nodes have the following dictionary entries:
       sizes: a dictionary whose keys are categories (such as code, data,
              vtable, etceteras) and whose values are the size, in bytes, of
              those categories;
       size:  the total size, in bytes, of all the entries in the sizes dict
  2. Non-leaf nodes, representing directories
     These nodes have the following dictionary entries:
       children: a dictionary whose keys are names (path entries; either
                 directory or file names) and whose values are other nodes;
       size:     the total size, in bytes, of all the leaf nodes that are
                 contained within the children dict (recursively expanded)

  The result object is itself a dictionary that represents the common ancestor
  of all child nodes, e.g. a path to which all other nodes beneath it are
  relative. The 'size' attribute of this dict yields the sum of the size of all
  leaf nodes within the data structure.
  """
  # Root node: the common ancestor of everything below.
  dirs = {'children': {}, 'size': 0}
  for sym, symbol_type, size, path in symbols:
    dirs['size'] += size
    if path:
      path = os.path.normpath(path)
      # Strip a leading '/' so splitting yields no empty first component.
      if path.startswith('/'):
        path = path[1:]

    parts = None
    if path:
      parts = path.split('/')

    if parts:
      assert path
      file_key = parts.pop()
      tree = dirs
      try:
        # Traverse the tree to the parent of the file node, creating as needed
        for part in parts:
          assert part != ''
          if part not in tree['children']:
            tree['children'][part] = {'children': {}, 'size': 0}
          tree = tree['children'][part]
          tree['size'] += size

        # Get (creating if necessary) the node for the file
        # This node doesn't have a 'children' attribute
        if file_key not in tree['children']:
          tree['children'][file_key] = {'sizes': collections.defaultdict(int),
                                        'size': 0}
        tree = tree['children'][file_key]
        tree['size'] += size

        # Accumulate size into a bucket within the file
        symbol_type = symbol_type.lower()
        if 'vtable for ' in sym:
          tree['sizes']['[vtable]'] += size
        elif 'r' == symbol_type:
          tree['sizes']['[rodata]'] += size
        elif 'd' == symbol_type:
          tree['sizes']['[data]'] += size
        elif 'b' == symbol_type:
          tree['sizes']['[bss]'] += size
        elif 't' == symbol_type:
          # 'text' in binary parlance means 'code'.
          tree['sizes']['[code]'] += size
        elif 'w' == symbol_type:
          tree['sizes']['[weak]'] += size
        else:
          tree['sizes']['[other]'] += size
      except:
        # Deliberate bare except: print debugging context for the symbol
        # at fault, then re-raise the original error unchanged.
        print >> sys.stderr, sym, parts, file_key
        raise
    else:
      # No path: group these symbols into coarse name-pattern buckets
      # under a single 'symbols without paths' node.
      key = 'symbols without paths'
      if key not in dirs['children']:
        dirs['children'][key] = {'sizes': collections.defaultdict(int),
                                 'size': 0}
      tree = dirs['children'][key]
      subkey = 'misc'
      if (sym.endswith('::__FUNCTION__') or
        sym.endswith('::__PRETTY_FUNCTION__')):
        subkey = '__FUNCTION__'
      elif sym.startswith('CSWTCH.'):
        subkey = 'CSWTCH'
      elif '::' in sym:
        # Bucket by the leading 'Namespace::' prefix.
        subkey = sym[0:sym.find('::') + 2]
      tree['sizes'][subkey] = tree['sizes'].get(subkey, 0) + size
      tree['size'] += size
  return dirs
    317 
    318 
    319 # TODO(andrewhayden): Only used for legacy reports. Delete.
    320 def JsonifyTree(tree, name):
    321   """Convert TreeifySymbols output to a JSON treemap.
    322 
    323   The format is very similar, with the notable exceptions being
    324   lists of children instead of maps and some different attribute names."""
    325   children = []
    326   css_class_map = {
    327                   '[vtable]': 'vtable',
    328                   '[rodata]': 'read-only_data',
    329                   '[data]': 'data',
    330                   '[bss]': 'bss',
    331                   '[code]': 'code',
    332                   '[weak]': 'weak_symbol'
    333   }
    334   if 'children' in tree:
    335     # Non-leaf node. Recurse.
    336     for child_name, child in tree['children'].iteritems():
    337       children.append(JsonifyTree(child, child_name))
    338   else:
    339     # Leaf node; dump per-file stats as entries in the treemap
    340     for kind, size in tree['sizes'].iteritems():
    341       child_json = {'name': kind + ' (' + FormatBytes(size) + ')',
    342                    'data': { '$area': size }}
    343       css_class = css_class_map.get(kind)
    344       if css_class is not None:
    345         child_json['data']['$symbol'] = css_class
    346       children.append(child_json)
    347   # Sort children by size, largest to smallest.
    348   children.sort(key=lambda child: -child['data']['$area'])
    349 
    350   # For leaf nodes, the 'size' attribute is the size of the leaf;
    351   # Non-leaf nodes don't really have a size, but their 'size' attribute is
    352   # the sum of the sizes of all their children.
    353   return {'name': name + ' (' + FormatBytes(tree['size']) + ')',
    354           'data': { '$area': tree['size'] },
    355           'children': children }
    356 
    357 def DumpCompactTree(symbols, symbol_path_origin_dir, outfile):
    358   tree_root = MakeCompactTree(symbols, symbol_path_origin_dir)
    359   with open(outfile, 'w') as out:
    360     out.write('var tree_data=')
    361     # Use separators without whitespace to get a smaller file.
    362     json.dump(tree_root, out, separators=(',', ':'))
    363   print('Writing %d bytes json' % os.path.getsize(outfile))
    364 
    365 
    366 # TODO(andrewhayden): Only used for legacy reports. Delete.
    367 def DumpTreemap(symbols, outfile):
    368   dirs = TreeifySymbols(symbols)
    369   out = open(outfile, 'w')
    370   try:
    371     out.write('var kTree = ' + json.dumps(JsonifyTree(dirs, '/')))
    372   finally:
    373     out.flush()
    374     out.close()
    375 
    376 
    377 # TODO(andrewhayden): Only used for legacy reports. Delete.
    378 def DumpLargestSymbols(symbols, outfile, n):
    379   # a list of (sym, symbol_type, size, path); sort by size.
    380   symbols = sorted(symbols, key=lambda x: -x[2])
    381   dumped = 0
    382   out = open(outfile, 'w')
    383   try:
    384     out.write('var largestSymbols = [\n')
    385     for sym, symbol_type, size, path in symbols:
    386       if symbol_type in ('b', 'w'):
    387         continue  # skip bss and weak symbols
    388       if path is None:
    389         path = ''
    390       entry = {'size': FormatBytes(size),
    391                'symbol': sym,
    392                'type': SymbolTypeToHuman(symbol_type),
    393                'location': path }
    394       out.write(json.dumps(entry))
    395       out.write(',\n')
    396       dumped += 1
    397       if dumped >= n:
    398         return
    399   finally:
    400     out.write('];\n')
    401     out.flush()
    402     out.close()
    403 
    404 
    405 def MakeSourceMap(symbols):
    406   sources = {}
    407   for _sym, _symbol_type, size, path in symbols:
    408     key = None
    409     if path:
    410       key = os.path.normpath(path)
    411     else:
    412       key = '[no path]'
    413     if key not in sources:
    414       sources[key] = {'path': path, 'symbol_count': 0, 'size': 0}
    415     record = sources[key]
    416     record['size'] += size
    417     record['symbol_count'] += 1
    418   return sources
    419 
    420 
    421 # TODO(andrewhayden): Only used for legacy reports. Delete.
    422 def DumpLargestSources(symbols, outfile, n):
    423   source_map = MakeSourceMap(symbols)
    424   sources = sorted(source_map.values(), key=lambda x: -x['size'])
    425   dumped = 0
    426   out = open(outfile, 'w')
    427   try:
    428     out.write('var largestSources = [\n')
    429     for record in sources:
    430       entry = {'size': FormatBytes(record['size']),
    431                'symbol_count': str(record['symbol_count']),
    432                'location': record['path']}
    433       out.write(json.dumps(entry))
    434       out.write(',\n')
    435       dumped += 1
    436       if dumped >= n:
    437         return
    438   finally:
    439     out.write('];\n')
    440     out.flush()
    441     out.close()
    442 
    443 
    444 # TODO(andrewhayden): Only used for legacy reports. Delete.
    445 def DumpLargestVTables(symbols, outfile, n):
    446   vtables = []
    447   for symbol, _type, size, path in symbols:
    448     if 'vtable for ' in symbol:
    449       vtables.append({'symbol': symbol, 'path': path, 'size': size})
    450   vtables = sorted(vtables, key=lambda x: -x['size'])
    451   dumped = 0
    452   out = open(outfile, 'w')
    453   try:
    454     out.write('var largestVTables = [\n')
    455     for record in vtables:
    456       entry = {'size': FormatBytes(record['size']),
    457                'symbol': record['symbol'],
    458                'location': record['path']}
    459       out.write(json.dumps(entry))
    460       out.write(',\n')
    461       dumped += 1
    462       if dumped >= n:
    463         return
    464   finally:
    465     out.write('];\n')
    466     out.flush()
    467     out.close()
    468 
    469 
# Regex for parsing "nm" output. A sample line looks like this:
# 0167b39c 00000018 t ACCESS_DESCRIPTION_free /path/file.c:95
#
# The fields are: address, size, type, name, source location
# Regular expression explained ( see also: https://xkcd.com/208 ):
# ([0-9a-f]{8,})    The address
# [\s]+             Whitespace separator
# ([0-9a-f]{8,})    The size. From here on out it's all optional.
# [\s]*             Whitespace separator (may be empty)
# (\S?)             The symbol type, which is any non-whitespace char
# [\s*]             One whitespace char or a literal '*' (character class)
# ([^\t]*)          Symbol name, any non-tab character (spaces ok!)
# [\t]?             Tab separator
# (.*)              The location (filename[:linenum|?][ (discriminator n)])
sNmPattern = re.compile(
  r'([0-9a-f]{8,})[\s]+([0-9a-f]{8,})[\s]*(\S?)[\s*]([^\t]*)[\t]?(.*)')
    486 
    487 class Progress():
    488   def __init__(self):
    489     self.count = 0
    490     self.skip_count = 0
    491     self.collisions = 0
    492     self.time_last_output = time.time()
    493     self.count_last_output = 0
    494     self.disambiguations = 0
    495     self.was_ambiguous = 0
    496 
    497 
def RunElfSymbolizer(outfile, library, addr2line_binary, nm_binary, jobs,
                     disambiguate, src_path):
  """Symbolize |library| and write annotated nm output to |outfile|.

  Runs nm over the library, asynchronously resolves source locations for
  symbols that lack one, then re-emits every nm line with a 'path:line'
  suffix where a location was found.
  """
  nm_output = RunNm(library, nm_binary)
  nm_output_lines = nm_output.splitlines()
  nm_output_lines_len = len(nm_output_lines)
  address_symbol = {}
  progress = Progress()
  # Callback invoked by the symbolizer for every resolved symbol.
  def map_address_symbol(symbol, addr):
    progress.count += 1
    if addr in address_symbol:
      # 'Collision between %s and %s.' % (str(symbol.name),
      #                                   str(address_symbol[addr].name))
      progress.collisions += 1
    else:
      if symbol.disambiguated:
        progress.disambiguations += 1
      if symbol.was_ambiguous:
        progress.was_ambiguous += 1

      address_symbol[addr] = symbol

    progress_output()

  # Throttled progress reporting to stdout.
  def progress_output():
    progress_chunk = 100
    if progress.count % progress_chunk == 0:
      time_now = time.time()
      time_spent = time_now - progress.time_last_output
      if time_spent > 1.0:
        # Only output at most once per second.
        progress.time_last_output = time_now
        chunk_size = progress.count - progress.count_last_output
        progress.count_last_output = progress.count
        if time_spent > 0:
          speed = chunk_size / time_spent
        else:
          speed = 0
        progress_percent = (100.0 * (progress.count + progress.skip_count) /
                            nm_output_lines_len)
        disambiguation_percent = 0
        if progress.disambiguations != 0:
          disambiguation_percent = (100.0 * progress.disambiguations /
                                    progress.was_ambiguous)

        sys.stdout.write('\r%.1f%%: Looked up %d symbols (%d collisions, '
              '%d disambiguations where %.1f%% succeeded)'
              ' - %.1f lookups/s.' %
              (progress_percent, progress.count, progress.collisions,
               progress.disambiguations, disambiguation_percent, speed))

  # In case disambiguation was disabled, we remove the source path (which upon
  # being set signals the symbolizer to enable disambiguation)
  if not disambiguate:
    src_path = None
  symbolizer = elf_symbolizer.ELFSymbolizer(library, addr2line_binary,
                                            map_address_symbol,
                                            max_concurrent_jobs=jobs,
                                            source_root_path=src_path)
  user_interrupted = False
  try:
    # First pass: queue async lookups for symbols that have no location.
    for line in nm_output_lines:
      match = sNmPattern.match(line)
      if match:
        location = match.group(5)
        if not location:
          addr = int(match.group(1), 16)
          size = int(match.group(2), 16)
          if addr in address_symbol:  # Already looked up, shortcut
                                      # ELFSymbolizer.
            map_address_symbol(address_symbol[addr], addr)
            continue
          elif size == 0:
            # Save time by not looking up empty symbols (do they even exist?)
            print('Empty symbol: ' + line)
          else:
            symbolizer.SymbolizeAsync(addr, addr)
            continue

      progress.skip_count += 1
  except KeyboardInterrupt:
    user_interrupted = True
    print('Interrupting - killing subprocesses. Please wait.')

  try:
    symbolizer.Join()
  except KeyboardInterrupt:
    # Don't want to abort here since we will be finished in a few seconds.
    user_interrupted = True
    print('Patience you must have my young padawan.')

  print ''

  if user_interrupted:
    print('Skipping the rest of the file mapping. '
          'Output will not be fully classified.')

  symbol_path_origin_dir = os.path.dirname(os.path.abspath(library))

  # Second pass: re-emit every nm line, annotated with the resolved
  # 'path:line' where a symbol was found for its address.
  with open(outfile, 'w') as out:
    for line in nm_output_lines:
      match = sNmPattern.match(line)
      if match:
        location = match.group(5)
        if not location:
          addr = int(match.group(1), 16)
          symbol = address_symbol.get(addr)
          if symbol is not None:
            path = '??'
            if symbol.source_path is not None:
              path = os.path.abspath(os.path.join(symbol_path_origin_dir,
                                                  symbol.source_path))
            line_number = 0
            if symbol.source_line is not None:
              line_number = symbol.source_line
            out.write('%s\t%s:%d\n' % (line, path, line_number))
            continue

      out.write('%s\n' % line)

  print('%d symbols in the results.' % len(address_symbol))
    618 
    619 
    620 def RunNm(binary, nm_binary):
    621   cmd = [nm_binary, '-C', '--print-size', '--size-sort', '--reverse-sort',
    622          binary]
    623   nm_process = subprocess.Popen(cmd,
    624                                 stdout=subprocess.PIPE,
    625                                 stderr=subprocess.PIPE)
    626   (process_output, err_output) = nm_process.communicate()
    627 
    628   if nm_process.returncode != 0:
    629     if err_output:
    630       raise Exception, err_output
    631     else:
    632       raise Exception, process_output
    633 
    634   return process_output
    635 
    636 
    637 def GetNmSymbols(nm_infile, outfile, library, jobs, verbose,
    638                  addr2line_binary, nm_binary, disambiguate, src_path):
    639   if nm_infile is None:
    640     if outfile is None:
    641       outfile = tempfile.NamedTemporaryFile(delete=False).name
    642 
    643     if verbose:
    644       print 'Running parallel addr2line, dumping symbols to ' + outfile
    645     RunElfSymbolizer(outfile, library, addr2line_binary, nm_binary, jobs,
    646                      disambiguate, src_path)
    647 
    648     nm_infile = outfile
    649 
    650   elif verbose:
    651     print 'Using nm input from ' + nm_infile
    652   with file(nm_infile, 'r') as infile:
    653     return list(binary_size_utils.ParseNm(infile))
    654 
    655 
    656 PAK_RESOURCE_ID_TO_STRING = { "inited": False }
    657 
    658 def LoadPakIdsFromResourceFile(filename):
    659   """Given a file name, it loads everything that looks like a resource id
    660   into PAK_RESOURCE_ID_TO_STRING."""
    661   with open(filename) as resource_header:
    662     for line in resource_header:
    663       if line.startswith("#define "):
    664         line_data = line.split()
    665         if len(line_data) == 3:
    666           try:
    667             resource_number = int(line_data[2])
    668             resource_name = line_data[1]
    669             PAK_RESOURCE_ID_TO_STRING[resource_number] = resource_name
    670           except ValueError:
    671             pass
    672 
    673 def GetReadablePakResourceName(pak_file, resource_id):
    674   """Pak resources have a numeric identifier. It is not helpful when
    675   trying to locate where footprint is generated. This does its best to
    676   map the number to a usable string."""
    677   if not PAK_RESOURCE_ID_TO_STRING['inited']:
    678     # Try to find resource header files generated by grit when
    679     # building the pak file. We'll look for files named *resources.h"
    680     # and lines of the type:
    681     #    #define MY_RESOURCE_JS 1234
    682     PAK_RESOURCE_ID_TO_STRING['inited'] = True
    683     gen_dir = os.path.join(os.path.dirname(pak_file), 'gen')
    684     if os.path.isdir(gen_dir):
    685       for dirname, _dirs, files in os.walk(gen_dir):
    686         for filename in files:
    687           if filename.endswith('resources.h'):
    688             LoadPakIdsFromResourceFile(os.path.join(dirname, filename))
    689   return PAK_RESOURCE_ID_TO_STRING.get(resource_id,
    690                                        'Pak Resource %d' % resource_id)
    691 
    692 def AddPakData(symbols, pak_file):
    693   """Adds pseudo-symbols from a pak file."""
    694   pak_file = os.path.abspath(pak_file)
    695   with open(pak_file, 'rb') as pak:
    696     data = pak.read()
    697 
    698   PAK_FILE_VERSION = 4
    699   HEADER_LENGTH = 2 * 4 + 1  # Two uint32s. (file version, number of entries)
    700                              # and one uint8 (encoding of text resources)
    701   INDEX_ENTRY_SIZE = 2 + 4  # Each entry is a uint16 and a uint32.
    702   version, num_entries, _encoding = struct.unpack('<IIB', data[:HEADER_LENGTH])
    703   assert version == PAK_FILE_VERSION, ('Unsupported pak file '
    704                                        'version (%d) in %s. Only '
    705                                        'support version %d' %
    706                                        (version, pak_file, PAK_FILE_VERSION))
    707   if num_entries > 0:
    708     # Read the index and data.
    709     data = data[HEADER_LENGTH:]
    710     for _ in range(num_entries):
    711       resource_id, offset = struct.unpack('<HI', data[:INDEX_ENTRY_SIZE])
    712       data = data[INDEX_ENTRY_SIZE:]
    713       _next_id, next_offset = struct.unpack('<HI', data[:INDEX_ENTRY_SIZE])
    714       resource_size = next_offset - offset
    715 
    716       symbol_name = GetReadablePakResourceName(pak_file, resource_id)
    717       symbol_path = pak_file
    718       symbol_type = 'd' # Data. Approximation.
    719       symbol_size = resource_size
    720       symbols.append((symbol_name, symbol_type, symbol_size, symbol_path))
    721 
    722 def _find_in_system_path(binary):
    723   """Locate the full path to binary in the system path or return None
    724   if not found."""
    725   system_path = os.environ["PATH"].split(os.pathsep)
    726   for path in system_path:
    727     binary_path = os.path.join(path, binary)
    728     if os.path.isfile(binary_path):
    729       return binary_path
    730   return None
    731 
    732 def CheckDebugFormatSupport(library, addr2line_binary):
    733   """Kills the program if debug data is in an unsupported format.
    734 
    735   There are two common versions of the DWARF debug formats and
    736   since we are right now transitioning from DWARF2 to newer formats,
    737   it's possible to have a mix of tools that are not compatible. Detect
    738   that and abort rather than produce meaningless output."""
    739   tool_output = subprocess.check_output([addr2line_binary, '--version'])
    740   version_re = re.compile(r'^GNU [^ ]+ .* (\d+).(\d+).*?$', re.M)
    741   parsed_output = version_re.match(tool_output)
    742   major = int(parsed_output.group(1))
    743   minor = int(parsed_output.group(2))
    744   supports_dwarf4 = major > 2 or major == 2 and minor > 22
    745 
    746   if supports_dwarf4:
    747     return
    748 
    749   print('Checking version of debug information in %s.' % library)
    750   debug_info = subprocess.check_output(['readelf', '--debug-dump=info',
    751                                        '--dwarf-depth=1', library])
    752   dwarf_version_re = re.compile(r'^\s+Version:\s+(\d+)$', re.M)
    753   parsed_dwarf_format_output = dwarf_version_re.search(debug_info)
    754   version = int(parsed_dwarf_format_output.group(1))
    755   if version > 2:
    756     print('The supplied tools only support DWARF2 debug data but the binary\n' +
    757           'uses DWARF%d. Update the tools or compile the binary\n' % version +
    758           'with -gdwarf-2.')
    759     sys.exit(1)
    760 
    761 
    762 def main():
    763   usage = """%prog [options]
    764 
    765   Runs a spatial analysis on a given library, looking up the source locations
    766   of its symbols and calculating how much space each directory, source file,
    767   and so on is taking. The result is a report that can be used to pinpoint
    768   sources of large portions of the binary, etceteras.
    769 
    770   Under normal circumstances, you only need to pass two arguments, thusly:
    771 
    772       %prog --library /path/to/library --destdir /path/to/output
    773 
    774   In this mode, the program will dump the symbols from the specified library
    775   and map those symbols back to source locations, producing a web-based
    776   report in the specified output directory.
    777 
    778   Other options are available via '--help'.
    779   """
    780   parser = optparse.OptionParser(usage=usage)
    781   parser.add_option('--nm-in', metavar='PATH',
    782                     help='if specified, use nm input from <path> instead of '
    783                     'generating it. Note that source locations should be '
    784                     'present in the file; i.e., no addr2line symbol lookups '
    785                     'will be performed when this option is specified. '
    786                     'Mutually exclusive with --library.')
    787   parser.add_option('--destdir', metavar='PATH',
    788                     help='write output to the specified directory. An HTML '
    789                     'report is generated here along with supporting files; '
    790                     'any existing report will be overwritten.')
    791   parser.add_option('--library', metavar='PATH',
    792                     help='if specified, process symbols in the library at '
    793                     'the specified path. Mutually exclusive with --nm-in.')
    794   parser.add_option('--pak', metavar='PATH',
    795                     help='if specified, includes the contents of the '
    796                     'specified *.pak file in the output.')
    797   parser.add_option('--nm-binary',
    798                     help='use the specified nm binary to analyze library. '
    799                     'This is to be used when the nm in the path is not for '
    800                     'the right architecture or of the right version.')
    801   parser.add_option('--addr2line-binary',
    802                     help='use the specified addr2line binary to analyze '
    803                     'library. This is to be used when the addr2line in '
    804                     'the path is not for the right architecture or '
    805                     'of the right version.')
    806   parser.add_option('--jobs', type='int',
    807                     help='number of jobs to use for the parallel '
    808                     'addr2line processing pool; defaults to 1. More '
    809                     'jobs greatly improve throughput but eat RAM like '
    810                     'popcorn, and take several gigabytes each. Start low '
    811                     'and ramp this number up until your machine begins to '
    812                     'struggle with RAM. '
    813                     'This argument is only valid when using --library.')
    814   parser.add_option('-v', dest='verbose', action='store_true',
    815                     help='be verbose, printing lots of status information.')
    816   parser.add_option('--nm-out', metavar='PATH',
    817                     help='keep the nm output file, and store it at the '
    818                     'specified path. This is useful if you want to see the '
    819                     'fully processed nm output after the symbols have been '
    820                     'mapped to source locations. By default, a tempfile is '
    821                     'used and is deleted when the program terminates.'
    822                     'This argument is only valid when using --library.')
    823   parser.add_option('--legacy', action='store_true',
    824                     help='emit legacy binary size report instead of modern')
    825   parser.add_option('--disable-disambiguation', action='store_true',
    826                     help='disables the disambiguation process altogether,'
    827                     ' NOTE: this may, depending on your toolchain, produce'
    828                     ' output with some symbols at the top layer if addr2line'
    829                     ' could not get the entire source path.')
    830   parser.add_option('--source-path', default='./',
    831                     help='the path to the source code of the output binary, '
    832                     'default set to current directory. Used in the'
    833                     ' disambiguation process.')
    834   opts, _args = parser.parse_args()
    835 
    836   if ((not opts.library) and (not opts.nm_in)) or (opts.library and opts.nm_in):
    837     parser.error('exactly one of --library or --nm-in is required')
    838   if (opts.nm_in):
    839     if opts.jobs:
    840       print >> sys.stderr, ('WARNING: --jobs has no effect '
    841                             'when used with --nm-in')
    842   if not opts.destdir:
    843     parser.error('--destdir is required argument')
    844   if not opts.jobs:
    845     # Use the number of processors but cap between 2 and 4 since raw
    846     # CPU power isn't the limiting factor. It's I/O limited, memory
    847     # bus limited and available-memory-limited. Too many processes and
    848     # the computer will run out of memory and it will be slow.
    849     opts.jobs = max(2, min(4, str(multiprocessing.cpu_count())))
    850 
    851   if opts.addr2line_binary:
    852     assert os.path.isfile(opts.addr2line_binary)
    853     addr2line_binary = opts.addr2line_binary
    854   else:
    855     addr2line_binary = _find_in_system_path('addr2line')
    856     assert addr2line_binary, 'Unable to find addr2line in the path. '\
    857         'Use --addr2line-binary to specify location.'
    858 
    859   if opts.nm_binary:
    860     assert os.path.isfile(opts.nm_binary)
    861     nm_binary = opts.nm_binary
    862   else:
    863     nm_binary = _find_in_system_path('nm')
    864     assert nm_binary, 'Unable to find nm in the path. Use --nm-binary '\
    865         'to specify location.'
    866 
    867   if opts.pak:
    868     assert os.path.isfile(opts.pak), 'Could not find ' % opts.pak
    869 
    870   print('addr2line: %s' % addr2line_binary)
    871   print('nm: %s' % nm_binary)
    872 
    873   if opts.library:
    874     CheckDebugFormatSupport(opts.library, addr2line_binary)
    875 
    876   symbols = GetNmSymbols(opts.nm_in, opts.nm_out, opts.library,
    877                          opts.jobs, opts.verbose is True,
    878                          addr2line_binary, nm_binary,
    879                          opts.disable_disambiguation is None,
    880                          opts.source_path)
    881 
    882   if opts.pak:
    883     AddPakData(symbols, opts.pak)
    884 
    885   if not os.path.exists(opts.destdir):
    886     os.makedirs(opts.destdir, 0755)
    887 
    888 
    889   if opts.legacy: # legacy report
    890     DumpTreemap(symbols, os.path.join(opts.destdir, 'treemap-dump.js'))
    891     DumpLargestSymbols(symbols,
    892                          os.path.join(opts.destdir, 'largest-symbols.js'), 100)
    893     DumpLargestSources(symbols,
    894                          os.path.join(opts.destdir, 'largest-sources.js'), 100)
    895     DumpLargestVTables(symbols,
    896                          os.path.join(opts.destdir, 'largest-vtables.js'), 100)
    897     treemap_out = os.path.join(opts.destdir, 'webtreemap')
    898     if not os.path.exists(treemap_out):
    899       os.makedirs(treemap_out, 0755)
    900     treemap_src = os.path.join('third_party', 'webtreemap', 'src')
    901     shutil.copy(os.path.join(treemap_src, 'COPYING'), treemap_out)
    902     shutil.copy(os.path.join(treemap_src, 'webtreemap.js'), treemap_out)
    903     shutil.copy(os.path.join(treemap_src, 'webtreemap.css'), treemap_out)
    904     shutil.copy(os.path.join('tools', 'binary_size', 'legacy_template',
    905                              'index.html'), opts.destdir)
    906   else: # modern report
    907     if opts.library:
    908       symbol_path_origin_dir = os.path.dirname(os.path.abspath(opts.library))
    909     else:
    910       # Just a guess. Hopefully all paths in the input file are absolute.
    911       symbol_path_origin_dir = os.path.abspath(os.getcwd())
    912     data_js_file_name = os.path.join(opts.destdir, 'data.js')
    913     DumpCompactTree(symbols, symbol_path_origin_dir, data_js_file_name)
    914     d3_out = os.path.join(opts.destdir, 'd3')
    915     if not os.path.exists(d3_out):
    916       os.makedirs(d3_out, 0755)
    917     d3_src = os.path.join(os.path.dirname(__file__),
    918                           '..',
    919                           '..',
    920                           'third_party', 'd3', 'src')
    921     template_src = os.path.join(os.path.dirname(__file__),
    922                                 'template')
    923     shutil.copy(os.path.join(d3_src, 'LICENSE'), d3_out)
    924     shutil.copy(os.path.join(d3_src, 'd3.js'), d3_out)
    925     shutil.copy(os.path.join(template_src, 'index.html'), opts.destdir)
    926     shutil.copy(os.path.join(template_src, 'D3SymbolTreeMap.js'), opts.destdir)
    927 
    928   print 'Report saved to ' + opts.destdir + '/index.html'
    929 
    930 
if __name__ == '__main__':
  # main() returns None on success, so sys.exit(None) exits with status 0;
  # error paths inside main() exit earlier via parser.error()/sys.exit(1).
  sys.exit(main())
    933