Home | History | Annotate | Download | only in utils
      1 #!/usr/bin/env python
      2 
      3 """A tool for extracting a list of symbols to export
      4 
      5 When exporting symbols from a dll or exe we either need to mark the symbols in
      6 the source code as __declspec(dllexport) or supply a list of symbols to the
      7 linker. This program automates the latter by inspecting the symbol tables of a
      8 list of link inputs and deciding which of those symbols need to be exported.
      9 
     10 We can't just export all the defined symbols, as there's a limit of 65535
     11 exported symbols and in clang we go way over that, particularly in a debug
     12 build. Therefore a large part of the work is pruning symbols which either can't
     13 be imported, or which we think have definitions in public header files (i.e.
     14 template instantiations) and so would get defined in the thing importing these
     15 symbols anyway.
     16 """
     17 
     18 from __future__ import print_function
     19 import sys
     20 import re
     21 import os
     22 import subprocess
     23 import multiprocessing
     24 import argparse
     25 
     26 # Define functions which extract a list of symbols from a library using several
     27 # different tools. We use subprocess.Popen and yield a symbol at a time instead
     28 # of using subprocess.check_output and returning a list as, especially on
     29 # Windows, waiting for the entire output to be ready can take a significant
     30 # amount of time.
     31 
     32 def dumpbin_get_symbols(lib):
     33     process = subprocess.Popen(['dumpbin','/symbols',lib], bufsize=1,
     34                                stdout=subprocess.PIPE, stdin=subprocess.PIPE,
     35                                universal_newlines=True)
     36     process.stdin.close()
     37     for line in process.stdout:
     38         # Look for external symbols that are defined in some section
     39         match = re.match("^.+SECT.+External\s+\|\s+(\S+).*$", line)
     40         if match:
     41             yield match.group(1)
     42     process.wait()
     43 
     44 def nm_get_symbols(lib):
     45     process = subprocess.Popen(['nm',lib], bufsize=1,
     46                                stdout=subprocess.PIPE, stdin=subprocess.PIPE,
     47                                universal_newlines=True)
     48     process.stdin.close()
     49     for line in process.stdout:
     50         # Look for external symbols that are defined in some section
     51         match = re.match("^\S+\s+[BDGRSTVW]\s+(\S+)$", line)
     52         if match:
     53             yield match.group(1)
     54     process.wait()
     55 
     56 def readobj_get_symbols(lib):
     57     process = subprocess.Popen(['llvm-readobj','-symbols',lib], bufsize=1,
     58                                stdout=subprocess.PIPE, stdin=subprocess.PIPE,
     59                                universal_newlines=True)
     60     process.stdin.close()
     61     for line in process.stdout:
     62         # When looking through the output of llvm-readobj we expect to see Name,
     63         # Section, then StorageClass, so record Name and Section when we see
     64         # them and decide if this is a defined external symbol when we see
     65         # StorageClass.
     66         match = re.search('Name: (\S+)', line)
     67         if match:
     68             name = match.group(1)
     69         match = re.search('Section: (\S+)', line)
     70         if match:
     71             section = match.group(1)
     72         match = re.search('StorageClass: (\S+)', line)
     73         if match:
     74             storageclass = match.group(1)
     75             if section != 'IMAGE_SYM_ABSOLUTE' and \
     76                section != 'IMAGE_SYM_UNDEFINED' and \
     77                storageclass == 'External':
     78                 yield name
     79     process.wait()
     80 
     81 # Define functions which determine if the target is 32-bit Windows (as that's
     82 # where calling convention name decoration happens).
     83 
     84 def dumpbin_is_32bit_windows(lib):
     85     # dumpbin /headers can output a huge amount of data (>100MB in a debug
     86     # build) so we read only up to the 'machine' line then close the output.
     87     process = subprocess.Popen(['dumpbin','/headers',lib], bufsize=1,
     88                                stdout=subprocess.PIPE, stdin=subprocess.PIPE,
     89                                universal_newlines=True)
     90     process.stdin.close()
     91     retval = False
     92     for line in process.stdout:
     93         match = re.match('.+machine \((\S+)\)', line)
     94         if match:
     95             retval = (match.group(1) == 'x86')
     96             break
     97     process.stdout.close()
     98     process.wait()
     99     return retval
    100 
    101 def objdump_is_32bit_windows(lib):
    102     output = subprocess.check_output(['objdump','-f',lib],
    103                                      universal_newlines=True)
    104     for line in output:
    105         match = re.match('.+file format (\S+)', line)
    106         if match:
    107             return (match.group(1) == 'pe-i386')
    108     return False
    109 
    110 def readobj_is_32bit_windows(lib):
    111     output = subprocess.check_output(['llvm-readobj','-file-headers',lib],
    112                                      universal_newlines=True)
    113     for line in output:
    114         match = re.match('Format: (\S+)', line)
    115         if match:
    116             return (match.group(1) == 'COFF-i386')
    117     return False
    118 
    119 # MSVC mangles names to ?<identifier_mangling>@<type_mangling>. By examining the
    120 # identifier/type mangling we can decide which symbols could possibly be
    121 # required and which we can discard.
    122 def should_keep_microsoft_symbol(symbol, calling_convention_decoration):
    123     # Keep unmangled (i.e. extern "C") names
    124     if not '?' in symbol:
    125         if calling_convention_decoration:
    126             # Remove calling convention decoration from names
    127             match = re.match('[_@]([^@]+)', symbol)
    128             if match:
    129                 return match.group(1)
    130         return symbol
    131     # Function template instantiations start with ?$, discard them as it's
    132     # assumed that the definition is public
    133     elif symbol.startswith('??$'):
    134         return None
    135     # Deleting destructors start with ?_G or ?_E and can be discarded because
    136     # link.exe gives you a warning telling you they can't be exported if you
    137     # don't
    138     elif symbol.startswith('??_G') or symbol.startswith('??_E'):
    139         return None
    140     # Constructors (?0) and destructors (?1) of templates (?$) are assumed to be
    141     # defined in headers and not required to be kept
    142     elif symbol.startswith('??0?$') or symbol.startswith('??1?$'):
    143         return None
    144     # An anonymous namespace is mangled as ?A(maybe hex number)@. Any symbol
    145     # that mentions an anonymous namespace can be discarded, as the anonymous
    146     # namespace doesn't exist outside of that translation unit.
    147     elif re.search('\?A(0x\w+)?@', symbol):
    148         return None
    149     # Keep mangled llvm:: and clang:: function symbols. How we detect these is a
    150     # bit of a mess and imprecise, but that avoids having to completely demangle
    151     # the symbol name. The outermost namespace is at the end of the identifier
    152     # mangling, and the identifier mangling is followed by the type mangling, so
    153     # we look for (llvm|clang)@@ followed by something that looks like a
    154     # function type mangling. To spot a function type we use (this is derived
    155     # from clang/lib/AST/MicrosoftMangle.cpp):
    156     # <function-type> ::= <function-class> <this-cvr-qualifiers>
    157     #                     <calling-convention> <return-type>
    158     #                     <argument-list> <throw-spec>
    159     # <function-class> ::= [A-Z]
    160     # <this-cvr-qualifiers> ::= [A-Z0-9_]*
    161     # <calling-convention> ::= [A-JQ]
    162     # <return-type> ::= .+
    163     # <argument-list> ::= X   (void)
    164     #                 ::= .+@ (list of types)
    165     #                 ::= .*Z (list of types, varargs)
    166     # <throw-spec> ::= exceptions are not allowed
    167     elif re.search('(llvm|clang)@@[A-Z][A-Z0-9_]*[A-JQ].+(X|.+@|.*Z)$', symbol):
    168         return symbol
    169     return None
    170 
    171 # Itanium manglings are of the form _Z<identifier_mangling><type_mangling>. We
    172 # demangle the identifier mangling to identify symbols that can be safely
    173 # discarded.
    174 def should_keep_itanium_symbol(symbol, calling_convention_decoration):
    175     # Start by removing any calling convention decoration (which we expect to
    176     # see on all symbols, even mangled C++ symbols)
    177     if calling_convention_decoration and symbol.startswith('_'):
    178         symbol = symbol[1:]
    179     # Keep unmangled names
    180     if not symbol.startswith('_') and not symbol.startswith('.'):
    181         return symbol
    182     # Discard manglings that aren't nested names
    183     match = re.match('_Z(T[VTIS])?(N.+)', symbol)
    184     if not match:
    185         return None
    186     # Demangle the name. If the name is too complex then we don't need to keep
    187     # it, but it the demangling fails then keep the symbol just in case.
    188     try:
    189         names, _ = parse_itanium_nested_name(match.group(2))
    190     except TooComplexName:
    191         return None
    192     if not names:
    193         return symbol
    194     # Constructors and destructors of templates classes are assumed to be
    195     # defined in headers and not required to be kept
    196     if re.match('[CD][123]', names[-1][0]) and names[-2][1]:
    197         return None
    198     # Discard function template instantiations as it's assumed that the
    199     # definition is public
    200     elif names[-1][1]:
    201         return None
    202     # Keep llvm:: and clang:: names
    203     elif names[0][0] == '4llvm' or names[0][0] == '5clang':
    204         return symbol
    205     # Discard everything else
    206     else:
    207         return None
    208 
    209 # Certain kinds of complex manglings we assume cannot be part of a public
    210 # interface, and we handle them by raising an exception.
    211 class TooComplexName(Exception):
    212     pass
    213 
    214 # Parse an itanium mangled name from the start of a string and return a
    215 # (name, rest of string) pair.
    216 def parse_itanium_name(arg):
    217     # Check for a normal name
    218     match = re.match('(\d+)(.+)', arg)
    219     if match:
    220         n = int(match.group(1))
    221         name = match.group(1)+match.group(2)[:n]
    222         rest = match.group(2)[n:]
    223         return name, rest
    224     # Check for constructor/destructor names
    225     match = re.match('([CD][123])(.+)', arg)
    226     if match:
    227         return match.group(1), match.group(2)
    228     # Assume that a sequence of characters that doesn't end a nesting is an
    229     # operator (this is very imprecise, but appears to be good enough)
    230     match = re.match('([^E]+)(.+)', arg)
    231     if match:
    232         return match.group(1), match.group(2)
    233     # Anything else: we can't handle it
    234     return None, arg
    235 
    236 # Parse an itanium mangled template argument list from the start of a string
    237 # and throw it away, returning the rest of the string.
    238 def skip_itanium_template(arg):
    239     # A template argument list starts with I
    240     assert arg.startswith('I'), arg
    241     tmp = arg[1:]
    242     while tmp:
    243         # Check for names
    244         match = re.match('(\d+)(.+)', tmp)
    245         if match:
    246             n = int(match.group(1))
    247             tmp =  match.group(2)[n:]
    248             continue
    249         # Check for substitutions
    250         match = re.match('S[A-Z0-9]*_(.+)', tmp)
    251         if match:
    252             tmp = match.group(1)
    253         # Start of a template
    254         elif tmp.startswith('I'):
    255             tmp = skip_itanium_template(tmp)
    256         # Start of a nested name
    257         elif tmp.startswith('N'):
    258             _, tmp = parse_itanium_nested_name(tmp)
    259         # Start of an expression: assume that it's too complicated
    260         elif tmp.startswith('L') or tmp.startswith('X'):
    261             raise TooComplexName
    262         # End of the template
    263         elif tmp.startswith('E'):
    264             return tmp[1:]
    265         # Something else: probably a type, skip it
    266         else:
    267             tmp = tmp[1:]
    268     return None
    269 
    270 # Parse an itanium mangled nested name and transform it into a list of pairs of
    271 # (name, is_template), returning (list, rest of string).
    272 def parse_itanium_nested_name(arg):
    273     # A nested name starts with N
    274     assert arg.startswith('N'), arg
    275     ret = []
    276 
    277     # Skip past the N, and possibly a substitution
    278     match = re.match('NS[A-Z0-9]*_(.+)', arg)
    279     if match:
    280         tmp = match.group(1)
    281     else:
    282         tmp = arg[1:]
    283 
    284     # Skip past CV-qualifiers and ref qualifiers
    285     match = re.match('[rVKRO]*(.+)', tmp);
    286     if match:
    287         tmp = match.group(1)
    288 
    289     # Repeatedly parse names from the string until we reach the end of the
    290     # nested name
    291     while tmp:
    292         # An E ends the nested name
    293         if tmp.startswith('E'):
    294             return ret, tmp[1:]
    295         # Parse a name
    296         name_part, tmp = parse_itanium_name(tmp)
    297         if not name_part:
    298             # If we failed then we don't know how to demangle this
    299             return None, None
    300         is_template = False
    301         # If this name is a template record that, then skip the template
    302         # arguments
    303         if tmp.startswith('I'):
    304             tmp = skip_itanium_template(tmp)
    305             is_template = True
    306         # Add the name to the list
    307         ret.append((name_part, is_template))
    308 
    309     # If we get here then something went wrong
    310     return None, None
    311 
    312 def extract_symbols(arg):
    313     get_symbols, should_keep_symbol, calling_convention_decoration, lib = arg
    314     symbols = dict()
    315     for symbol in get_symbols(lib):
    316         symbol = should_keep_symbol(symbol, calling_convention_decoration)
    317         if symbol:
    318             symbols[symbol] = 1 + symbols.setdefault(symbol,0)
    319     return symbols
    320 
    321 if __name__ == '__main__':
    322     tool_exes = ['dumpbin','nm','objdump','llvm-readobj']
    323     parser = argparse.ArgumentParser(
    324         description='Extract symbols to export from libraries')
    325     parser.add_argument('--mangling', choices=['itanium','microsoft'],
    326                         required=True, help='expected symbol mangling scheme')
    327     parser.add_argument('--tools', choices=tool_exes, nargs='*',
    328                         help='tools to use to extract symbols and determine the'
    329                         ' target')
    330     parser.add_argument('libs', metavar='lib', type=str, nargs='+',
    331                         help='libraries to extract symbols from')
    332     parser.add_argument('-o', metavar='file', type=str, help='output to file')
    333     args = parser.parse_args()
    334 
    335     # Determine the function to use to get the list of symbols from the inputs,
    336     # and the function to use to determine if the target is 32-bit windows.
    337     tools = { 'dumpbin' : (dumpbin_get_symbols, dumpbin_is_32bit_windows),
    338               'nm' : (nm_get_symbols, None),
    339               'objdump' : (None, objdump_is_32bit_windows),
    340               'llvm-readobj' : (readobj_get_symbols, readobj_is_32bit_windows) }
    341     get_symbols = None
    342     is_32bit_windows = None
    343     # If we have a tools argument then use that for the list of tools to check
    344     if args.tools:
    345         tool_exes = args.tools
    346     # Find a tool to use by trying each in turn until we find one that exists
    347     # (subprocess.call will throw OSError when the program does not exist)
    348     get_symbols = None
    349     for exe in tool_exes:
    350         try:
    351             # Close std streams as we don't want any output and we don't
    352             # want the process to wait for something on stdin.
    353             p = subprocess.Popen([exe], stdout=subprocess.PIPE,
    354                                  stderr=subprocess.PIPE,
    355                                  stdin=subprocess.PIPE,
    356                                  universal_newlines=True)
    357             p.stdout.close()
    358             p.stderr.close()
    359             p.stdin.close()
    360             p.wait()
    361             # Keep going until we have a tool to use for both get_symbols and
    362             # is_32bit_windows
    363             if not get_symbols:
    364                 get_symbols = tools[exe][0]
    365             if not is_32bit_windows:
    366                 is_32bit_windows = tools[exe][1]
    367             if get_symbols and is_32bit_windows:
    368                 break
    369         except OSError:
    370             continue
    371     if not get_symbols:
    372         print("Couldn't find a program to read symbols with", file=sys.stderr)
    373         exit(1)
    374     if not is_32bit_windows:
    375         print("Couldn't find a program to determing the target", file=sys.stderr)
    376         exit(1)
    377 
    378     # How we determine which symbols to keep and which to discard depends on
    379     # the mangling scheme
    380     if args.mangling == 'microsoft':
    381         should_keep_symbol = should_keep_microsoft_symbol
    382     else:
    383         should_keep_symbol = should_keep_itanium_symbol
    384 
    385     # Get the list of libraries to extract symbols from
    386     libs = list()
    387     for lib in args.libs:
    388         # When invoked by cmake the arguments are the cmake target names of the
    389         # libraries, so we need to add .lib/.a to the end and maybe lib to the
    390         # start to get the filename. Also allow objects.
    391         suffixes = ['.lib','.a','.obj','.o']
    392         if not any([lib.endswith(s) for s in suffixes]):
    393             for s in suffixes:
    394                 if os.path.exists(lib+s):
    395                     lib = lib+s
    396                     break
    397                 if os.path.exists('lib'+lib+s):
    398                     lib = 'lib'+lib+s
    399                     break
    400         if not any([lib.endswith(s) for s in suffixes]):
    401             print("Don't know what to do with argument "+lib, file=sys.stderr)
    402             exit(1)
    403         libs.append(lib)
    404 
    405     # Check if calling convention decoration is used by inspecting the first
    406     # library in the list
    407     calling_convention_decoration = is_32bit_windows(libs[0])
    408 
    409     # Extract symbols from libraries in parallel. This is a huge time saver when
    410     # doing a debug build, as there are hundreds of thousands of symbols in each
    411     # library.
    412     pool = multiprocessing.Pool()
    413     try:
    414         # Only one argument can be passed to the mapping function, and we can't
    415         # use a lambda or local function definition as that doesn't work on
    416         # windows, so create a list of tuples which duplicates the arguments
    417         # that are the same in all calls.
    418         vals = [(get_symbols, should_keep_symbol, calling_convention_decoration, x) for x in libs]
    419         # Do an async map then wait for the result to make sure that
    420         # KeyboardInterrupt gets caught correctly (see
    421         # http://bugs.python.org/issue8296)
    422         result = pool.map_async(extract_symbols, vals)
    423         pool.close()
    424         libs_symbols = result.get(3600)
    425     except KeyboardInterrupt:
    426         # On Ctrl-C terminate everything and exit
    427         pool.terminate()
    428         pool.join()
    429         exit(1)
    430 
    431     # Merge everything into a single dict
    432     symbols = dict()
    433     for this_lib_symbols in libs_symbols:
    434         for k,v in list(this_lib_symbols.items()):
    435             symbols[k] = v + symbols.setdefault(k,0)
    436 
    437     # Count instances of member functions of template classes, and map the
    438     # symbol name to the function+class. We do this under the assumption that if
    439     # a member function of a template class is instantiated many times it's
    440     # probably declared in a public header file.
    441     template_function_count = dict()
    442     template_function_mapping = dict()
    443     template_function_count[""] = 0
    444     for k in symbols:
    445         name = None
    446         if args.mangling == 'microsoft':
    447             # Member functions of templates start with
    448             # ?<fn_name>@?$<class_name>@, so we map to <fn_name>@?$<class_name>.
    449             # As manglings go from the innermost scope to the outermost scope
    450             # this means:
    451             #  * When we have a function member of a subclass of a template
    452             #    class then <fn_name> will actually contain the mangling of
    453             #    both the subclass and the function member. This is fine.
    454             #  * When we have a function member of a template subclass of a
    455             #    (possibly template) class then it's the innermost template
    456             #    subclass that becomes <class_name>. This should be OK so long
    457             #    as we don't have multiple classes with a template subclass of
    458             #    the same name.
    459             match = re.search("^\?(\??\w+\@\?\$\w+)\@", k)
    460             if match:
    461                 name = match.group(1)
    462         else:
    463             # Find member functions of templates by demangling the name and
    464             # checking if the second-to-last name in the list is a template.
    465             match = re.match('_Z(T[VTIS])?(N.+)', k)
    466             if match:
    467                 try:
    468                     names, _ = parse_itanium_nested_name(match.group(2))
    469                     if names and names[-2][1]:
    470                         name = ''.join([x for x,_ in names])
    471                 except TooComplexName:
    472                     # Manglings that are too complex should already have been
    473                     # filtered out, but if we happen to somehow see one here
    474                     # just leave it as-is.
    475                     pass
    476         if name:
    477             old_count = template_function_count.setdefault(name,0)
    478             template_function_count[name] = old_count + 1
    479             template_function_mapping[k] = name
    480         else:
    481             template_function_mapping[k] = ""
    482 
    483     # Print symbols which both:
    484     #  * Appear in exactly one input, as symbols defined in multiple
    485     #    objects/libraries are assumed to have public definitions.
    486     #  * Aren't instances of member functions of templates which have been
    487     #    instantiated 100 times or more, which are assumed to have public
    488     #    definitions. (100 is an arbitrary guess here.)
    489     if args.o:
    490         outfile = open(args.o,'w')
    491     else:
    492         outfile = sys.stdout
    493     for k,v in list(symbols.items()):
    494         template_count = template_function_count[template_function_mapping[k]]
    495         if v == 1 and template_count < 100:
    496             print(k, file=outfile)
    497