Home | History | Annotate | Download | only in bloat
      1 #!/usr/bin/python
      2 #
      3 # Copyright 2013 Google Inc. All Rights Reserved.
      4 #
      5 # Licensed under the Apache License, Version 2.0 (the "License");
      6 # you may not use this file except in compliance with the License.
      7 # You may obtain a copy of the License at
      8 #
      9 #     http://www.apache.org/licenses/LICENSE-2.0
     10 #
     11 # Unless required by applicable law or agreed to in writing, software
     12 # distributed under the License is distributed on an "AS IS" BASIS,
     13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     14 # See the License for the specific language governing permissions and
     15 # limitations under the License.
     16 
     17 import fileinput
     18 import operator
     19 import optparse
     20 import os
     21 import pprint
     22 import re
     23 import subprocess
     24 import sys
     25 import json
     26 
     27 def format_bytes(bytes):
     28     """Pretty-print a number of bytes."""
     29     if bytes > 1e6:
     30         bytes = bytes / 1.0e6
     31         return '%.1fm' % bytes
     32     if bytes > 1e3:
     33         bytes = bytes / 1.0e3
     34         return '%.1fk' % bytes
     35     return str(bytes)
     36 
     37 
     38 def symbol_type_to_human(type):
     39     """Convert a symbol type as printed by nm into a human-readable name."""
     40     return {
     41         'b': 'bss',
     42         'd': 'data',
     43         'r': 'read-only data',
     44         't': 'code',
     45         'u': 'weak symbol', # Unique global.
     46         'w': 'weak symbol',
     47         'v': 'weak symbol'
     48         }[type]
     49 
     50 
     51 def parse_nm(input):
     52     """Parse nm output.
     53 
     54     Argument: an iterable over lines of nm output.
     55 
     56     Yields: (symbol name, symbol type, symbol size, source file path).
     57     Path may be None if nm couldn't figure out the source file.
     58     """
     59 
     60     # Match lines with size + symbol + optional filename.
     61     sym_re = re.compile(r'^[0-9a-f]+ ([0-9a-f]+) (.) ([^\t]+)(?:\t(.*):\d+)?$')
     62 
     63     # Match lines with addr but no size.
     64     addr_re = re.compile(r'^[0-9a-f]+ (.) ([^\t]+)(?:\t.*)?$')
     65     # Match lines that don't have an address at all -- typically external symbols.
     66     noaddr_re = re.compile(r'^ + (.) (.*)$')
     67 
     68     for line in input:
     69         line = line.rstrip()
     70         match = sym_re.match(line)
     71         if match:
     72             size, type, sym = match.groups()[0:3]
     73             size = int(size, 16)
     74             type = type.lower()
     75             if type in ['u', 'v']:
     76                 type = 'w'  # just call them all weak
     77             if type == 'b':
     78                 continue  # skip all BSS for now
     79             path = match.group(4)
     80             yield sym, type, size, path
     81             continue
     82         match = addr_re.match(line)
     83         if match:
     84             type, sym = match.groups()[0:2]
     85             # No size == we don't care.
     86             continue
     87         match = noaddr_re.match(line)
     88         if match:
     89             type, sym = match.groups()
     90             if type in ('U', 'w'):
     91                 # external or weak symbol
     92                 continue
     93 
     94         print >>sys.stderr, 'unparsed:', repr(line)
     95 
     96 def demangle(ident, cppfilt):
     97     if cppfilt and ident.startswith('_Z'):
     98         # Demangle names when possible. Mangled names all start with _Z.
     99         ident = subprocess.check_output([cppfilt, ident]).strip()
    100     return ident
    101 
    102 
    103 class Suffix:
    104     def __init__(self, suffix, replacement):
    105         self.pattern = '^(.*)' + suffix + '(.*)$'
    106         self.re = re.compile(self.pattern)
    107         self.replacement = replacement
    108 
    109 class SuffixCleanup:
    110     """Pre-compile suffix regular expressions."""
    111     def __init__(self):
    112         self.suffixes = [
    113             Suffix('\.part\.([0-9]+)',      'part'),
    114             Suffix('\.constprop\.([0-9]+)', 'constprop'),
    115             Suffix('\.isra\.([0-9]+)',      'isra'),
    116         ]
    117     def cleanup(self, ident, cppfilt):
    118         """Cleanup identifiers that have suffixes preventing demangling,
    119            and demangle if possible."""
    120         to_append = []
    121         for s in self.suffixes:
    122             found = s.re.match(ident)
    123             if not found:
    124                 continue
    125             to_append += [' [' + s.replacement + '.' + found.group(2) + ']']
    126             ident = found.group(1) + found.group(3)
    127         if len(to_append) > 0:
    128             # Only try to demangle if there were suffixes.
    129             ident = demangle(ident, cppfilt)
    130         for s in to_append:
    131             ident += s
    132         return ident
    133 
    134 suffix_cleanup = SuffixCleanup()
    135 
    136 def parse_cpp_name(name, cppfilt):
    137     name = suffix_cleanup.cleanup(name, cppfilt)
    138 
    139     # Turn prefixes into suffixes so namespacing works.
    140     prefixes = [
    141         ['bool ',                         ''],
    142         ['construction vtable for ',      ' [construction vtable]'],
    143         ['global constructors keyed to ', ' [global constructors]'],
    144         ['guard variable for ',           ' [guard variable]'],
    145         ['int ',                          ''],
    146         ['non-virtual thunk to ',         ' [non-virtual thunk]'],
    147         ['typeinfo for ',                 ' [typeinfo]'],
    148         ['typeinfo name for ',            ' [typeinfo name]'],
    149         ['virtual thunk to ',             ' [virtual thunk]'],
    150         ['void ',                         ''],
    151         ['vtable for ',                   ' [vtable]'],
    152         ['VTT for ',                      ' [VTT]'],
    153     ]
    154     for prefix, replacement in prefixes:
    155         if name.startswith(prefix):
    156             name = name[len(prefix):] + replacement
    157     # Simplify parenthesis parsing.
    158     replacements = [
    159         ['(anonymous namespace)', '[anonymous namespace]'],
    160     ]
    161     for value, replacement in replacements:
    162         name = name.replace(value, replacement)
    163 
    164     def parse_one(val):
    165         """Returns (leftmost-part, remaining)."""
    166         if (val.startswith('operator') and
    167             not (val[8].isalnum() or val[8] == '_')):
    168             # Operator overload function, terminate.
    169             return (val, '')
    170         co = val.find('::')
    171         lt = val.find('<')
    172         pa = val.find('(')
    173         co = len(val) if co == -1 else co
    174         lt = len(val) if lt == -1 else lt
    175         pa = len(val) if pa == -1 else pa
    176         if co < lt and co < pa:
    177             # Namespace or type name.
    178             return (val[:co], val[co+2:])
    179         if lt < pa:
    180             # Template. Make sure we capture nested templates too.
    181             open_tmpl = 1
    182             gt = lt
    183             while val[gt] != '>' or open_tmpl != 0:
    184                 gt = gt + 1
    185                 if val[gt] == '<':
    186                     open_tmpl = open_tmpl + 1
    187                 if val[gt] == '>':
    188                     open_tmpl = open_tmpl - 1
    189             ret = val[gt+1:]
    190             if ret.startswith('::'):
    191                 ret = ret[2:]
    192             if ret.startswith('('):
    193                 # Template function, terminate.
    194                 return (val, '')
    195             return (val[:gt+1], ret)
    196         # Terminate with any function name, identifier, or unmangled name.
    197         return (val, '')
    198 
    199     parts = []
    200     while len(name) > 0:
    201         (part, name) = parse_one(name)
    202         assert len(part) > 0
    203         parts.append(part)
    204     return parts
    205 
    206 
    207 def treeify_syms(symbols, strip_prefix=None, cppfilt=None):
    208     dirs = {}
    209     for sym, type, size, path in symbols:
    210         if path:
    211             path = os.path.normpath(path)
    212             if strip_prefix and path.startswith(strip_prefix):
    213                 path = path[len(strip_prefix):]
    214             elif path.startswith('/'):
    215                 path = path[1:]
    216             path = ['[path]'] + path.split('/')
    217 
    218         parts = parse_cpp_name(sym, cppfilt)
    219         if len(parts) == 1:
    220           if path:
    221             # No namespaces, group with path.
    222             parts = path + parts
    223           else:
    224             new_prefix = ['[ungrouped]']
    225             regroups = [
    226                 ['.L.str',                 '[str]'],
    227                 ['.L__PRETTY_FUNCTION__.', '[__PRETTY_FUNCTION__]'],
    228                 ['.L__func__.',            '[__func__]'],
    229                 ['.Lswitch.table',         '[switch table]'],
    230             ]
    231             for prefix, group in regroups:
    232                 if parts[0].startswith(prefix):
    233                     parts[0] = parts[0][len(prefix):]
    234                     parts[0] = demangle(parts[0], cppfilt)
    235                     new_prefix += [group]
    236                     break
    237             parts = new_prefix + parts
    238 
    239         key = parts.pop()
    240         tree = dirs
    241         try:
    242             depth = 0
    243             for part in parts:
    244                 depth = depth + 1
    245                 assert part != '', path
    246                 if part not in tree:
    247                     tree[part] = {'$bloat_symbols':{}}
    248                 if type not in tree[part]['$bloat_symbols']:
    249                     tree[part]['$bloat_symbols'][type] = 0
    250                 tree[part]['$bloat_symbols'][type] += 1
    251                 tree = tree[part]
    252             old_size, old_symbols = tree.get(key, (0, {}))
    253             if type not in old_symbols:
    254                 old_symbols[type] = 0
    255             old_symbols[type] += 1
    256             tree[key] = (old_size + size, old_symbols)
    257         except:
    258             print >>sys.stderr, 'sym `%s`\tparts `%s`\tkey `%s`' % (sym, parts, key)
    259             raise
    260     return dirs
    261 
    262 
    263 def jsonify_tree(tree, name):
    264     children = []
    265     total = 0
    266     files = 0
    267 
    268     for key, val in tree.iteritems():
    269         if key == '$bloat_symbols':
    270             continue
    271         if isinstance(val, dict):
    272             subtree = jsonify_tree(val, key)
    273             total += subtree['data']['$area']
    274             children.append(subtree)
    275         else:
    276             (size, symbols) = val
    277             total += size
    278             assert len(symbols) == 1, symbols.values()[0] == 1
    279             symbol = symbol_type_to_human(symbols.keys()[0])
    280             children.append({
    281                     'name': key + ' ' + format_bytes(size),
    282                     'data': {
    283                         '$area': size,
    284                         '$symbol': symbol,
    285                     }
    286             })
    287 
    288     children.sort(key=lambda child: -child['data']['$area'])
    289     dominant_symbol = ''
    290     if '$bloat_symbols' in tree:
    291         dominant_symbol = symbol_type_to_human(
    292             max(tree['$bloat_symbols'].iteritems(),
    293                 key=operator.itemgetter(1))[0])
    294     return {
    295         'name': name + ' ' + format_bytes(total),
    296         'data': {
    297             '$area': total,
    298             '$dominant_symbol': dominant_symbol,
    299             },
    300         'children': children,
    301         }
    302 
    303 
    304 def dump_nm(nmfile, strip_prefix, cppfilt):
    305     dirs = treeify_syms(parse_nm(nmfile), strip_prefix, cppfilt)
    306     print ('var kTree = ' +
    307            json.dumps(jsonify_tree(dirs, '[everything]'), indent=2))
    308 
    309 
    310 def parse_objdump(input):
    311     """Parse objdump -h output."""
    312     sec_re = re.compile('^\d+ (\S+) +([0-9a-z]+)')
    313     sections = []
    314     debug_sections = []
    315 
    316     for line in input:
    317         line = line.strip()
    318         match = sec_re.match(line)
    319         if match:
    320             name, size = match.groups()
    321             if name.startswith('.'):
    322                 name = name[1:]
    323             if name.startswith('debug_'):
    324                 name = name[len('debug_'):]
    325                 debug_sections.append((name, int(size, 16)))
    326             else:
    327                 sections.append((name, int(size, 16)))
    328             continue
    329     return sections, debug_sections
    330 
    331 
    332 def jsonify_sections(name, sections):
    333     children = []
    334     total = 0
    335     for section, size in sections:
    336         children.append({
    337                 'name': section + ' ' + format_bytes(size),
    338                 'data': { '$area': size }
    339                 })
    340         total += size
    341 
    342     children.sort(key=lambda child: -child['data']['$area'])
    343 
    344     return {
    345         'name': name + ' ' + format_bytes(total),
    346         'data': { '$area': total },
    347         'children': children
    348         }
    349 
    350 
    351 def dump_sections(objdump):
    352     sections, debug_sections = parse_objdump(objdump)
    353     sections = jsonify_sections('sections', sections)
    354     debug_sections = jsonify_sections('debug', debug_sections)
    355     size = sections['data']['$area'] + debug_sections['data']['$area']
    356     print 'var kTree = ' + json.dumps({
    357             'name': 'top ' + format_bytes(size),
    358             'data': { '$area': size },
    359             'children': [ debug_sections, sections ]})
    360 
    361 
    362 usage="""%prog [options] MODE
    363 
    364 Modes are:
    365   syms: output symbols json suitable for a treemap
    366   dump: print symbols sorted by size (pipe to head for best output)
    367   sections: output binary sections json suitable for a treemap
    368 
    369 nm output passed to --nm-output should from running a command
    370 like the following (note, can take a long time -- 30 minutes):
    371   nm -C -S -l /path/to/binary > nm.out
    372 
    373 objdump output passed to --objdump-output should be from a command
    374 like:
    375   objdump -h /path/to/binary > objdump.out"""
    376 parser = optparse.OptionParser(usage=usage)
    377 parser.add_option('--nm-output', action='store', dest='nmpath',
    378                   metavar='PATH', default='nm.out',
    379                   help='path to nm output [default=nm.out]')
    380 parser.add_option('--objdump-output', action='store', dest='objdumppath',
    381                   metavar='PATH', default='objdump.out',
    382                   help='path to objdump output [default=objdump.out]')
    383 parser.add_option('--strip-prefix', metavar='PATH', action='store',
    384                   help='strip PATH prefix from paths; e.g. /path/to/src/root')
    385 parser.add_option('--filter', action='store',
    386                   help='include only symbols/files matching FILTER')
    387 parser.add_option('--c++filt', action='store', metavar='PATH', dest='cppfilt',
    388                   default='c++filt', help="Path to c++filt, used to demangle "
    389                   "symbols that weren't handled by nm. Set to an invalid path "
    390                   "to disable.")
    391 opts, args = parser.parse_args()
    392 
    393 if len(args) != 1:
    394     parser.print_usage()
    395     sys.exit(1)
    396 
    397 mode = args[0]
    398 if mode == 'syms':
    399     nmfile = open(opts.nmpath, 'r')
    400     try:
    401         res = subprocess.check_output([opts.cppfilt, 'main'])
    402         if res.strip() != 'main':
    403             print >>sys.stderr, ("%s failed demangling, "
    404                                  "output won't be demangled." % opt.cppfilt)
    405             opts.cppfilt = None
    406     except:
    407         print >>sys.stderr, ("Could not find c++filt at %s, "
    408                              "output won't be demangled." % opt.cppfilt)
    409         opts.cppfilt = None
    410     dump_nm(nmfile, strip_prefix=opts.strip_prefix, cppfilt=opts.cppfilt)
    411 elif mode == 'sections':
    412     objdumpfile = open(opts.objdumppath, 'r')
    413     dump_sections(objdumpfile)
    414 elif mode == 'dump':
    415     nmfile = open(opts.nmpath, 'r')
    416     syms = list(parse_nm(nmfile))
    417     # a list of (sym, type, size, path); sort by size.
    418     syms.sort(key=lambda x: -x[2])
    419     total = 0
    420     for sym, type, size, path in syms:
    421         if type in ('b', 'w'):
    422             continue  # skip bss and weak symbols
    423         if path is None:
    424             path = ''
    425         if opts.filter and not (opts.filter in sym or opts.filter in path):
    426             continue
    427         print '%6s %s (%s) %s' % (format_bytes(size), sym,
    428                                   symbol_type_to_human(type), path)
    429         total += size
    430     print '%6s %s' % (format_bytes(total), 'total'),
    431 else:
    432     print 'unknown mode'
    433     parser.print_usage()
    434