Home | History | Annotate | Download | only in utils
      1 #!/usr/bin/env python
      2 
      3 """A tool for extracting a list of symbols to export
      4 
      5 When exporting symbols from a dll or exe we either need to mark the symbols in
      6 the source code as __declspec(dllexport) or supply a list of symbols to the
      7 linker. This program automates the latter by inspecting the symbol tables of a
      8 list of link inputs and deciding which of those symbols need to be exported.
      9 
     10 We can't just export all the defined symbols, as there's a limit of 65535
     11 exported symbols and in clang we go way over that, particularly in a debug
     12 build. Therefore a large part of the work is pruning symbols either which can't
     13 be imported, or which we think are things that have definitions in public header
     14 files (i.e. template instantiations) and we would get defined in the thing
     15 importing these symbols anyway.
     16 """
     17 
     18 from __future__ import print_function
     19 import sys
     20 import re
     21 import os
     22 import subprocess
     23 import multiprocessing
     24 import argparse
     25 
     26 # Define functions which extract a list of symbols from a library using several
     27 # different tools. We use subprocess.Popen and yield a symbol at a time instead
     28 # of using subprocess.check_output and returning a list as, especially on
     29 # Windows, waiting for the entire output to be ready can take a significant
     30 # amount of time.
     31 
     32 def dumpbin_get_symbols(lib):
     33     process = subprocess.Popen(['dumpbin','/symbols',lib], bufsize=1,
     34                                stdout=subprocess.PIPE, stdin=subprocess.PIPE,
     35                                universal_newlines=True)
     36     process.stdin.close()
     37     for line in process.stdout:
     38         # Look for external symbols that are defined in some section
     39         match = re.match("^.+SECT.+External\s+\|\s+(\S+).*$", line)
     40         if match:
     41             yield match.group(1)
     42     process.wait()
     43 
     44 def nm_get_symbols(lib):
     45     process = subprocess.Popen(['nm',lib], bufsize=1,
     46                                stdout=subprocess.PIPE, stdin=subprocess.PIPE,
     47                                universal_newlines=True)
     48     process.stdin.close()
     49     for line in process.stdout:
     50         # Look for external symbols that are defined in some section
     51         match = re.match("^\S+\s+[BDGRSTVW]\s+(\S+)$", line)
     52         if match:
     53             yield match.group(1)
     54     process.wait()
     55 
     56 def readobj_get_symbols(lib):
     57     process = subprocess.Popen(['llvm-readobj','-symbols',lib], bufsize=1,
     58                                stdout=subprocess.PIPE, stdin=subprocess.PIPE,
     59                                universal_newlines=True)
     60     process.stdin.close()
     61     for line in process.stdout:
     62         # When looking through the output of llvm-readobj we expect to see Name,
     63         # Section, then StorageClass, so record Name and Section when we see
     64         # them and decide if this is a defined external symbol when we see
     65         # StorageClass.
     66         match = re.search('Name: (\S+)', line)
     67         if match:
     68             name = match.group(1)
     69         match = re.search('Section: (\S+)', line)
     70         if match:
     71             section = match.group(1)
     72         match = re.search('StorageClass: (\S+)', line)
     73         if match:
     74             storageclass = match.group(1)
     75             if section != 'IMAGE_SYM_ABSOLUTE' and \
     76                section != 'IMAGE_SYM_UNDEFINED' and \
     77                storageclass == 'External':
     78                 yield name
     79     process.wait()
     80 
     81 # Define functions which determine if the target is 32-bit Windows (as that's
     82 # where calling convention name decoration happens).
     83 
     84 def dumpbin_is_32bit_windows(lib):
     85     # dumpbin /headers can output a huge amount of data (>100MB in a debug
     86     # build) so we read only up to the 'machine' line then close the output.
     87     process = subprocess.Popen(['dumpbin','/headers',lib], bufsize=1,
     88                                stdout=subprocess.PIPE, stdin=subprocess.PIPE,
     89                                universal_newlines=True)
     90     process.stdin.close()
     91     retval = False
     92     for line in process.stdout:
     93         match = re.match('.+machine \((\S+)\)', line)
     94         if match:
     95             retval = (match.group(1) == 'x86')
     96             break
     97     process.stdout.close()
     98     process.wait()
     99     return retval
    100 
    101 def objdump_is_32bit_windows(lib):
    102     output = subprocess.check_output(['objdump','-f',lib],
    103                                      universal_newlines=True)
    104     for line in output:
    105         match = re.match('.+file format (\S+)', line)
    106         if match:
    107             return (match.group(1) == 'pe-i386')
    108     return False
    109 
    110 def readobj_is_32bit_windows(lib):
    111     output = subprocess.check_output(['llvm-readobj','-file-headers',lib],
    112                                      universal_newlines=True)
    113     for line in output:
    114         match = re.match('Format: (\S+)', line)
    115         if match:
    116             return (match.group(1) == 'COFF-i386')
    117     return False
    118 
    119 # MSVC mangles names to ?<identifier_mangling>@<type_mangling>. By examining the
    120 # identifier/type mangling we can decide which symbols could possibly be
    121 # required and which we can discard.
    122 def should_keep_microsoft_symbol(symbol, calling_convention_decoration):
    123     # Keep unmangled (i.e. extern "C") names
    124     if not '?' in symbol:
    125         if calling_convention_decoration:
    126             # Remove calling convention decoration from names
    127             match = re.match('[_@]([^@]+)', symbol)
    128             if match:
    129                 return match.group(1)
    130         return symbol
    131     # Function template instantiations start with ?$; keep the instantiations of
    132     # clang::Type::getAs, as some of them are explipict specializations that are
    133     # defined in clang's lib/AST/Type.cpp; discard the rest as it's assumed that
    134     # the definition is public
    135     elif re.match('\?\?\$getAs@.+@Type@clang@@', symbol):
    136         return symbol
    137     elif symbol.startswith('??$'):
    138         return None
    139     # Deleting destructors start with ?_G or ?_E and can be discarded because
    140     # link.exe gives you a warning telling you they can't be exported if you
    141     # don't
    142     elif symbol.startswith('??_G') or symbol.startswith('??_E'):
    143         return None
    144     # Constructors (?0) and destructors (?1) of templates (?$) are assumed to be
    145     # defined in headers and not required to be kept
    146     elif symbol.startswith('??0?$') or symbol.startswith('??1?$'):
    147         return None
    148     # An anonymous namespace is mangled as ?A(maybe hex number)@. Any symbol
    149     # that mentions an anonymous namespace can be discarded, as the anonymous
    150     # namespace doesn't exist outside of that translation unit.
    151     elif re.search('\?A(0x\w+)?@', symbol):
    152         return None
    153     # Keep mangled llvm:: and clang:: function symbols. How we detect these is a
    154     # bit of a mess and imprecise, but that avoids having to completely demangle
    155     # the symbol name. The outermost namespace is at the end of the identifier
    156     # mangling, and the identifier mangling is followed by the type mangling, so
    157     # we look for (llvm|clang)@@ followed by something that looks like a
    158     # function type mangling. To spot a function type we use (this is derived
    159     # from clang/lib/AST/MicrosoftMangle.cpp):
    160     # <function-type> ::= <function-class> <this-cvr-qualifiers>
    161     #                     <calling-convention> <return-type>
    162     #                     <argument-list> <throw-spec>
    163     # <function-class> ::= [A-Z]
    164     # <this-cvr-qualifiers> ::= [A-Z0-9_]*
    165     # <calling-convention> ::= [A-JQ]
    166     # <return-type> ::= .+
    167     # <argument-list> ::= X   (void)
    168     #                 ::= .+@ (list of types)
    169     #                 ::= .*Z (list of types, varargs)
    170     # <throw-spec> ::= exceptions are not allowed
    171     elif re.search('(llvm|clang)@@[A-Z][A-Z0-9_]*[A-JQ].+(X|.+@|.*Z)$', symbol):
    172         return symbol
    173     return None
    174 
    175 # Itanium manglings are of the form _Z<identifier_mangling><type_mangling>. We
    176 # demangle the identifier mangling to identify symbols that can be safely
    177 # discarded.
    178 def should_keep_itanium_symbol(symbol, calling_convention_decoration):
    179     # Start by removing any calling convention decoration (which we expect to
    180     # see on all symbols, even mangled C++ symbols)
    181     if calling_convention_decoration and symbol.startswith('_'):
    182         symbol = symbol[1:]
    183     # Keep unmangled names
    184     if not symbol.startswith('_') and not symbol.startswith('.'):
    185         return symbol
    186     # Discard manglings that aren't nested names
    187     match = re.match('_Z(T[VTIS])?(N.+)', symbol)
    188     if not match:
    189         return None
    190     # Demangle the name. If the name is too complex then we don't need to keep
    191     # it, but it the demangling fails then keep the symbol just in case.
    192     try:
    193         names, _ = parse_itanium_nested_name(match.group(2))
    194     except TooComplexName:
    195         return None
    196     if not names:
    197         return symbol
    198     # Constructors and destructors of templates classes are assumed to be
    199     # defined in headers and not required to be kept
    200     if re.match('[CD][123]', names[-1][0]) and names[-2][1]:
    201         return None
    202     # Keep the instantiations of clang::Type::getAs, as some of them are
    203     # explipict specializations that are defined in clang's lib/AST/Type.cpp;
    204     # discard any other function template instantiations as it's assumed that
    205     # the definition is public
    206     elif symbol.startswith('_ZNK5clang4Type5getAs'):
    207         return symbol
    208     elif names[-1][1]:
    209         return None
    210     # Keep llvm:: and clang:: names
    211     elif names[0][0] == '4llvm' or names[0][0] == '5clang':
    212         return symbol
    213     # Discard everything else
    214     else:
    215         return None
    216 
    217 # Certain kinds of complex manglings we assume cannot be part of a public
    218 # interface, and we handle them by raising an exception.
    219 class TooComplexName(Exception):
    220     pass
    221 
    222 # Parse an itanium mangled name from the start of a string and return a
    223 # (name, rest of string) pair.
    224 def parse_itanium_name(arg):
    225     # Check for a normal name
    226     match = re.match('(\d+)(.+)', arg)
    227     if match:
    228         n = int(match.group(1))
    229         name = match.group(1)+match.group(2)[:n]
    230         rest = match.group(2)[n:]
    231         return name, rest
    232     # Check for constructor/destructor names
    233     match = re.match('([CD][123])(.+)', arg)
    234     if match:
    235         return match.group(1), match.group(2)
    236     # Assume that a sequence of characters that doesn't end a nesting is an
    237     # operator (this is very imprecise, but appears to be good enough)
    238     match = re.match('([^E]+)(.+)', arg)
    239     if match:
    240         return match.group(1), match.group(2)
    241     # Anything else: we can't handle it
    242     return None, arg
    243 
    244 # Parse an itanium mangled template argument list from the start of a string
    245 # and throw it away, returning the rest of the string.
    246 def skip_itanium_template(arg):
    247     # A template argument list starts with I
    248     assert arg.startswith('I'), arg
    249     tmp = arg[1:]
    250     while tmp:
    251         # Check for names
    252         match = re.match('(\d+)(.+)', tmp)
    253         if match:
    254             n = int(match.group(1))
    255             tmp =  match.group(2)[n:]
    256             continue
    257         # Check for substitutions
    258         match = re.match('S[A-Z0-9]*_(.+)', tmp)
    259         if match:
    260             tmp = match.group(1)
    261         # Start of a template
    262         elif tmp.startswith('I'):
    263             tmp = skip_itanium_template(tmp)
    264         # Start of a nested name
    265         elif tmp.startswith('N'):
    266             _, tmp = parse_itanium_nested_name(tmp)
    267         # Start of an expression: assume that it's too complicated
    268         elif tmp.startswith('L') or tmp.startswith('X'):
    269             raise TooComplexName
    270         # End of the template
    271         elif tmp.startswith('E'):
    272             return tmp[1:]
    273         # Something else: probably a type, skip it
    274         else:
    275             tmp = tmp[1:]
    276     return None
    277 
    278 # Parse an itanium mangled nested name and transform it into a list of pairs of
    279 # (name, is_template), returning (list, rest of string).
    280 def parse_itanium_nested_name(arg):
    281     # A nested name starts with N
    282     assert arg.startswith('N'), arg
    283     ret = []
    284 
    285     # Skip past the N, and possibly a substitution
    286     match = re.match('NS[A-Z0-9]*_(.+)', arg)
    287     if match:
    288         tmp = match.group(1)
    289     else:
    290         tmp = arg[1:]
    291 
    292     # Skip past CV-qualifiers and ref qualifiers
    293     match = re.match('[rVKRO]*(.+)', tmp);
    294     if match:
    295         tmp = match.group(1)
    296 
    297     # Repeatedly parse names from the string until we reach the end of the
    298     # nested name
    299     while tmp:
    300         # An E ends the nested name
    301         if tmp.startswith('E'):
    302             return ret, tmp[1:]
    303         # Parse a name
    304         name_part, tmp = parse_itanium_name(tmp)
    305         if not name_part:
    306             # If we failed then we don't know how to demangle this
    307             return None, None
    308         is_template = False
    309         # If this name is a template record that, then skip the template
    310         # arguments
    311         if tmp.startswith('I'):
    312             tmp = skip_itanium_template(tmp)
    313             is_template = True
    314         # Add the name to the list
    315         ret.append((name_part, is_template))
    316 
    317     # If we get here then something went wrong
    318     return None, None
    319 
    320 def extract_symbols(arg):
    321     get_symbols, should_keep_symbol, calling_convention_decoration, lib = arg
    322     symbols = dict()
    323     for symbol in get_symbols(lib):
    324         symbol = should_keep_symbol(symbol, calling_convention_decoration)
    325         if symbol:
    326             symbols[symbol] = 1 + symbols.setdefault(symbol,0)
    327     return symbols
    328 
    329 if __name__ == '__main__':
    330     tool_exes = ['dumpbin','nm','objdump','llvm-readobj']
    331     parser = argparse.ArgumentParser(
    332         description='Extract symbols to export from libraries')
    333     parser.add_argument('--mangling', choices=['itanium','microsoft'],
    334                         required=True, help='expected symbol mangling scheme')
    335     parser.add_argument('--tools', choices=tool_exes, nargs='*',
    336                         help='tools to use to extract symbols and determine the'
    337                         ' target')
    338     parser.add_argument('libs', metavar='lib', type=str, nargs='+',
    339                         help='libraries to extract symbols from')
    340     parser.add_argument('-o', metavar='file', type=str, help='output to file')
    341     args = parser.parse_args()
    342 
    343     # Determine the function to use to get the list of symbols from the inputs,
    344     # and the function to use to determine if the target is 32-bit windows.
    345     tools = { 'dumpbin' : (dumpbin_get_symbols, dumpbin_is_32bit_windows),
    346               'nm' : (nm_get_symbols, None),
    347               'objdump' : (None, objdump_is_32bit_windows),
    348               'llvm-readobj' : (readobj_get_symbols, readobj_is_32bit_windows) }
    349     get_symbols = None
    350     is_32bit_windows = None
    351     # If we have a tools argument then use that for the list of tools to check
    352     if args.tools:
    353         tool_exes = args.tools
    354     # Find a tool to use by trying each in turn until we find one that exists
    355     # (subprocess.call will throw OSError when the program does not exist)
    356     get_symbols = None
    357     for exe in tool_exes:
    358         try:
    359             # Close std streams as we don't want any output and we don't
    360             # want the process to wait for something on stdin.
    361             p = subprocess.Popen([exe], stdout=subprocess.PIPE,
    362                                  stderr=subprocess.PIPE,
    363                                  stdin=subprocess.PIPE,
    364                                  universal_newlines=True)
    365             p.stdout.close()
    366             p.stderr.close()
    367             p.stdin.close()
    368             p.wait()
    369             # Keep going until we have a tool to use for both get_symbols and
    370             # is_32bit_windows
    371             if not get_symbols:
    372                 get_symbols = tools[exe][0]
    373             if not is_32bit_windows:
    374                 is_32bit_windows = tools[exe][1]
    375             if get_symbols and is_32bit_windows:
    376                 break
    377         except OSError:
    378             continue
    379     if not get_symbols:
    380         print("Couldn't find a program to read symbols with", file=sys.stderr)
    381         exit(1)
    382     if not is_32bit_windows:
    383         print("Couldn't find a program to determining the target", file=sys.stderr)
    384         exit(1)
    385 
    386     # How we determine which symbols to keep and which to discard depends on
    387     # the mangling scheme
    388     if args.mangling == 'microsoft':
    389         should_keep_symbol = should_keep_microsoft_symbol
    390     else:
    391         should_keep_symbol = should_keep_itanium_symbol
    392 
    393     # Get the list of libraries to extract symbols from
    394     libs = list()
    395     for lib in args.libs:
    396         # When invoked by cmake the arguments are the cmake target names of the
    397         # libraries, so we need to add .lib/.a to the end and maybe lib to the
    398         # start to get the filename. Also allow objects.
    399         suffixes = ['.lib','.a','.obj','.o']
    400         if not any([lib.endswith(s) for s in suffixes]):
    401             for s in suffixes:
    402                 if os.path.exists(lib+s):
    403                     lib = lib+s
    404                     break
    405                 if os.path.exists('lib'+lib+s):
    406                     lib = 'lib'+lib+s
    407                     break
    408         if not any([lib.endswith(s) for s in suffixes]):
    409             print("Don't know what to do with argument "+lib, file=sys.stderr)
    410             exit(1)
    411         libs.append(lib)
    412 
    413     # Check if calling convention decoration is used by inspecting the first
    414     # library in the list
    415     calling_convention_decoration = is_32bit_windows(libs[0])
    416 
    417     # Extract symbols from libraries in parallel. This is a huge time saver when
    418     # doing a debug build, as there are hundreds of thousands of symbols in each
    419     # library.
    420     pool = multiprocessing.Pool()
    421     try:
    422         # Only one argument can be passed to the mapping function, and we can't
    423         # use a lambda or local function definition as that doesn't work on
    424         # windows, so create a list of tuples which duplicates the arguments
    425         # that are the same in all calls.
    426         vals = [(get_symbols, should_keep_symbol, calling_convention_decoration, x) for x in libs]
    427         # Do an async map then wait for the result to make sure that
    428         # KeyboardInterrupt gets caught correctly (see
    429         # http://bugs.python.org/issue8296)
    430         result = pool.map_async(extract_symbols, vals)
    431         pool.close()
    432         libs_symbols = result.get(3600)
    433     except KeyboardInterrupt:
    434         # On Ctrl-C terminate everything and exit
    435         pool.terminate()
    436         pool.join()
    437         exit(1)
    438 
    439     # Merge everything into a single dict
    440     symbols = dict()
    441     for this_lib_symbols in libs_symbols:
    442         for k,v in list(this_lib_symbols.items()):
    443             symbols[k] = v + symbols.setdefault(k,0)
    444 
    445     # Count instances of member functions of template classes, and map the
    446     # symbol name to the function+class. We do this under the assumption that if
    447     # a member function of a template class is instantiated many times it's
    448     # probably declared in a public header file.
    449     template_function_count = dict()
    450     template_function_mapping = dict()
    451     template_function_count[""] = 0
    452     for k in symbols:
    453         name = None
    454         if args.mangling == 'microsoft':
    455             # Member functions of templates start with
    456             # ?<fn_name>@?$<class_name>@, so we map to <fn_name>@?$<class_name>.
    457             # As manglings go from the innermost scope to the outermost scope
    458             # this means:
    459             #  * When we have a function member of a subclass of a template
    460             #    class then <fn_name> will actually contain the mangling of
    461             #    both the subclass and the function member. This is fine.
    462             #  * When we have a function member of a template subclass of a
    463             #    (possibly template) class then it's the innermost template
    464             #    subclass that becomes <class_name>. This should be OK so long
    465             #    as we don't have multiple classes with a template subclass of
    466             #    the same name.
    467             match = re.search("^\?(\??\w+\@\?\$\w+)\@", k)
    468             if match:
    469                 name = match.group(1)
    470         else:
    471             # Find member functions of templates by demangling the name and
    472             # checking if the second-to-last name in the list is a template.
    473             match = re.match('_Z(T[VTIS])?(N.+)', k)
    474             if match:
    475                 try:
    476                     names, _ = parse_itanium_nested_name(match.group(2))
    477                     if names and names[-2][1]:
    478                         name = ''.join([x for x,_ in names])
    479                 except TooComplexName:
    480                     # Manglings that are too complex should already have been
    481                     # filtered out, but if we happen to somehow see one here
    482                     # just leave it as-is.
    483                     pass
    484         if name:
    485             old_count = template_function_count.setdefault(name,0)
    486             template_function_count[name] = old_count + 1
    487             template_function_mapping[k] = name
    488         else:
    489             template_function_mapping[k] = ""
    490 
    491     # Print symbols which both:
    492     #  * Appear in exactly one input, as symbols defined in multiple
    493     #    objects/libraries are assumed to have public definitions.
    494     #  * Aren't instances of member functions of templates which have been
    495     #    instantiated 100 times or more, which are assumed to have public
    496     #    definitions. (100 is an arbitrary guess here.)
    497     if args.o:
    498         outfile = open(args.o,'w')
    499     else:
    500         outfile = sys.stdout
    501     for k,v in list(symbols.items()):
    502         template_count = template_function_count[template_function_mapping[k]]
    503         if v == 1 and template_count < 100:
    504             print(k, file=outfile)
    505