1 #!/usr/bin/env python 2 3 """A tool for extracting a list of symbols to export 4 5 When exporting symbols from a dll or exe we either need to mark the symbols in 6 the source code as __declspec(dllexport) or supply a list of symbols to the 7 linker. This program automates the latter by inspecting the symbol tables of a 8 list of link inputs and deciding which of those symbols need to be exported. 9 10 We can't just export all the defined symbols, as there's a limit of 65535 11 exported symbols and in clang we go way over that, particularly in a debug 12 build. Therefore a large part of the work is pruning symbols either which can't 13 be imported, or which we think are things that have definitions in public header 14 files (i.e. template instantiations) and we would get defined in the thing 15 importing these symbols anyway. 16 """ 17 18 from __future__ import print_function 19 import sys 20 import re 21 import os 22 import subprocess 23 import multiprocessing 24 import argparse 25 26 # Define functions which extract a list of symbols from a library using several 27 # different tools. We use subprocess.Popen and yield a symbol at a time instead 28 # of using subprocess.check_output and returning a list as, especially on 29 # Windows, waiting for the entire output to be ready can take a significant 30 # amount of time. 
def _start_tool(command):
    """Launch *command* with a line-buffered, text-mode stdout pipe.

    stdin is closed straight away so the tool can never block waiting for
    input. Returns the subprocess.Popen object; the caller is responsible for
    passing it to _finish_tool() when done.
    """
    process = subprocess.Popen(command, bufsize=1,
                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                               universal_newlines=True)
    process.stdin.close()
    return process


def _finish_tool(process):
    """Close the stdout pipe of *process* and wait for it to exit."""
    process.stdout.close()
    process.wait()


def dumpbin_get_symbols(lib):
    """Yield the defined external symbols of *lib* as listed by
    'dumpbin /symbols'.

    This is a generator: symbols are produced as the tool prints them rather
    than after it has finished.
    """
    process = _start_tool(['dumpbin', '/symbols', lib])
    try:
        for line in process.stdout:
            # Look for external symbols that are defined in some section
            match = re.match(r"^.+SECT.+External\s+\|\s+(\S+).*$", line)
            if match:
                yield match.group(1)
    finally:
        # Runs on normal exhaustion and also when the consumer abandons the
        # generator early, so the child process is always reaped.
        _finish_tool(process)


def nm_get_symbols(lib):
    """Yield the defined external symbols of *lib* as listed by 'nm'.

    This is a generator: symbols are produced as the tool prints them rather
    than after it has finished.
    """
    process = _start_tool(['nm', lib])
    try:
        for line in process.stdout:
            # Look for external symbols that are defined in some section
            match = re.match(r"^\S+\s+[BDGRSTVW]\s+(\S+)$", line)
            if match:
                yield match.group(1)
    finally:
        _finish_tool(process)


def readobj_get_symbols(lib):
    """Yield the defined external symbols of *lib* as listed by
    'llvm-readobj --symbols'.

    This is a generator: symbols are produced as the tool prints them rather
    than after it has finished.
    """
    # The double-dash spelling is used because it is accepted by both old and
    # new llvm-readobj releases.
    process = _start_tool(['llvm-readobj', '--symbols', lib])
    # Most recently seen Name and Section values; None until first seen.
    name = None
    section = None
    try:
        for line in process.stdout:
            # When looking through the output of llvm-readobj we expect to see
            # Name, Section, then StorageClass, so record Name and Section
            # when we see them and decide if this is a defined external symbol
            # when we see StorageClass.
            match = re.search(r'Name: (\S+)', line)
            if match:
                name = match.group(1)
            match = re.search(r'Section: (\S+)', line)
            if match:
                section = match.group(1)
            match = re.search(r'StorageClass: (\S+)', line)
            if match:
                storageclass = match.group(1)
                # Ignore a StorageClass that is not preceded by both a Name
                # and a Section instead of failing on an unbound variable.
                if name is not None and section is not None and \
                   section != 'IMAGE_SYM_ABSOLUTE' and \
                   section != 'IMAGE_SYM_UNDEFINED' and \
                   storageclass == 'External':
                    yield name
    finally:
        _finish_tool(process)


# Define functions which determine if the target is 32-bit Windows (as that's
# where calling convention name decoration happens).
def dumpbin_is_32bit_windows(lib):
    """Return True if 'dumpbin /headers' reports the machine of *lib* as x86.

    Returns False if no 'machine' line is found.
    """
    # dumpbin /headers can output a huge amount of data (>100MB in a debug
    # build) so we read only up to the 'machine' line then close the output.
    process = subprocess.Popen(['dumpbin', '/headers', lib], bufsize=1,
                               stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                               universal_newlines=True)
    process.stdin.close()
    retval = False
    try:
        for line in process.stdout:
            match = re.match(r'.+machine \((\S+)\)', line)
            if match:
                retval = (match.group(1) == 'x86')
                break
    finally:
        process.stdout.close()
        process.wait()
    return retval


def objdump_is_32bit_windows(lib):
    """Return True if 'objdump -f' reports the file format of *lib* as pe-i386.

    The decision is made on the first 'file format' line; returns False if
    there is none.
    """
    output = subprocess.check_output(['objdump', '-f', lib],
                                     universal_newlines=True)
    # check_output returns one string; it must be split into lines, as
    # iterating over the string itself would yield single characters.
    for line in output.splitlines():
        match = re.match(r'.+file format (\S+)', line)
        if match:
            return match.group(1) == 'pe-i386'
    return False


def readobj_is_32bit_windows(lib):
    """Return True if 'llvm-readobj --file-headers' reports the format of
    *lib* as COFF-i386.

    The decision is made on the first 'Format:' line; returns False if there
    is none.
    """
    # The double-dash spelling is used because it is accepted by both old and
    # new llvm-readobj releases.
    output = subprocess.check_output(['llvm-readobj', '--file-headers', lib],
                                     universal_newlines=True)
    # check_output returns one string; it must be split into lines, as
    # iterating over the string itself would yield single characters.
    for line in output.splitlines():
        match = re.match(r'Format: (\S+)', line)
        if match:
            return match.group(1) == 'COFF-i386'
    return False


# MSVC mangles names to ?<identifier_mangling>@<type_mangling>. By examining the
# identifier/type mangling we can decide which symbols could possibly be
# required and which we can discard.
def should_keep_microsoft_symbol(symbol, calling_convention_decoration):
    """Decide whether a Microsoft-mangled symbol should be exported.

    Returns the name to export (which has the calling convention decoration
    stripped when *calling_convention_decoration* is true and the symbol is
    unmangled), or None if the symbol should be discarded.
    """
    # Keep unmangled (i.e. extern "C") names
    if '?' not in symbol:
        if calling_convention_decoration:
            # Remove calling convention decoration from names
            match = re.match(r'[_@]([^@]+)', symbol)
            if match:
                return match.group(1)
        return symbol
    # Function template instantiations start with ?$; keep the instantiations of
    # clang::Type::getAs, as some of them are explicit specializations that are
    # defined in clang's lib/AST/Type.cpp; discard the rest as it's assumed that
    # the definition is public
    elif re.match(r'\?\?\$getAs@.+@Type@clang@@', symbol):
        return symbol
    elif symbol.startswith('??$'):
        return None
    # Deleting destructors start with ?_G or ?_E and can be discarded because
    # link.exe gives you a warning telling you they can't be exported if you
    # don't
    elif symbol.startswith('??_G') or symbol.startswith('??_E'):
        return None
    # Constructors (?0) and destructors (?1) of templates (?$) are assumed to be
    # defined in headers and not required to be kept
    elif symbol.startswith('??0?$') or symbol.startswith('??1?$'):
        return None
    # An anonymous namespace is mangled as ?A(maybe hex number)@. Any symbol
    # that mentions an anonymous namespace can be discarded, as the anonymous
    # namespace doesn't exist outside of that translation unit.
    elif re.search(r'\?A(0x\w+)?@', symbol):
        return None
    # Keep mangled llvm:: and clang:: function symbols. How we detect these is a
    # bit of a mess and imprecise, but that avoids having to completely demangle
    # the symbol name. The outermost namespace is at the end of the identifier
    # mangling, and the identifier mangling is followed by the type mangling, so
    # we look for (llvm|clang)@@ followed by something that looks like a
    # function type mangling. To spot a function type we use (this is derived
    # from clang/lib/AST/MicrosoftMangle.cpp):
    # <function-type> ::= <function-class> <this-cvr-qualifiers>
    #                     <calling-convention> <return-type>
    #                     <argument-list> <throw-spec>
    # <function-class> ::= [A-Z]
    # <this-cvr-qualifiers> ::= [A-Z0-9_]*
    # <calling-convention> ::= [A-JQ]
    # <return-type> ::= .+
    # <argument-list> ::= X   (void)
    #                 ::= .+@ (list of types)
    #                 ::= .*Z (list of types, varargs)
    # <throw-spec> ::= exceptions are not allowed
    elif re.search(r'(llvm|clang)@@[A-Z][A-Z0-9_]*[A-JQ].+(X|.+@|.*Z)$',
                   symbol):
        return symbol
    return None


# Itanium manglings are of the form _Z<identifier_mangling><type_mangling>. We
# demangle the identifier mangling to identify symbols that can be safely
# discarded.
def should_keep_itanium_symbol(symbol, calling_convention_decoration):
    """Decide whether an Itanium-mangled symbol should be exported.

    Returns the name to export (with the leading underscore removed when
    *calling_convention_decoration* is true), or None if the symbol should be
    discarded.
    """
    # Start by removing any calling convention decoration (which we expect to
    # see on all symbols, even mangled C++ symbols)
    if calling_convention_decoration and symbol.startswith('_'):
        symbol = symbol[1:]
    # Keep unmangled names
    if not symbol.startswith('_') and not symbol.startswith('.'):
        return symbol
    # Discard manglings that aren't nested names
    match = re.match(r'_Z(T[VTIS])?(N.+)', symbol)
    if not match:
        return None
    # Demangle the name. If the name is too complex then we don't need to keep
    # it, but if the demangling fails then keep the symbol just in case.
    try:
        names, _ = parse_itanium_nested_name(match.group(2))
    except TooComplexName:
        return None
    if not names:
        return symbol
    # Constructors and destructors of template classes are assumed to be
    # defined in headers and not required to be kept. The length check guards
    # names[-2] against a nested name that has only a single component.
    if re.match(r'[CD][123]', names[-1][0]) and \
       len(names) >= 2 and names[-2][1]:
        return None
    # Keep the instantiations of clang::Type::getAs, as some of them are
    # explicit specializations that are defined in clang's lib/AST/Type.cpp;
    # discard any other function template instantiations as it's assumed that
    # the definition is public
    elif symbol.startswith('_ZNK5clang4Type5getAs'):
        return symbol
    elif names[-1][1]:
        return None
    # Keep llvm:: and clang:: names
    elif names[0][0] == '4llvm' or names[0][0] == '5clang':
        return symbol
    # Discard everything else
    else:
        return None


# Certain kinds of complex manglings we assume cannot be part of a public
# interface, and we handle them by raising an exception.
class TooComplexName(Exception):
    """Raised when a mangled name is too complex to be worth demangling."""
    pass


# Parse an itanium mangled name from the start of a string and return a
# (name, rest of string) pair.
def parse_itanium_name(arg):
    """Parse one Itanium-mangled name component from the start of *arg*.

    Returns a (name, rest of string) pair. For a length-prefixed source name
    the returned name includes the length prefix (e.g. '4llvm'). If nothing
    could be parsed the result is (None, arg).
    """
    # Check for a normal name: a decimal length followed by that many
    # characters
    match = re.match(r'(\d+)(.+)', arg)
    if match:
        length_text, tail = match.groups()
        length = int(length_text)
        return length_text + tail[:length], tail[length:]
    # Check for constructor/destructor names
    match = re.match(r'([CD][123])(.+)', arg)
    if match:
        return match.group(1), match.group(2)
    # Assume that a sequence of characters that doesn't end a nesting is an
    # operator (this is very imprecise, but appears to be good enough)
    match = re.match(r'([^E]+)(.+)', arg)
    if match:
        return match.group(1), match.group(2)
    # Anything else: we can't handle it
    return None, arg


# Parse an itanium mangled template argument list from the start of a string
# and throw it away, returning the rest of the string.
def skip_itanium_template(arg):
    """Skip the template argument list at the start of *arg*.

    *arg* must start with 'I'. Returns the text following the 'E' that
    closes the list, or None if the input runs out before the list is closed.
    Raises TooComplexName if an argument starts with 'L' or 'X', as these are
    assumed to be expressions that are too complicated to handle.
    """
    # A template argument list starts with I
    assert arg.startswith('I'), arg
    tmp = arg[1:]
    while tmp:
        # Check for names
        match = re.match(r'(\d+)(.+)', tmp)
        if match:
            length = int(match.group(1))
            tmp = match.group(2)[length:]
            continue
        # Check for substitutions
        match = re.match(r'S[A-Z0-9]*_(.+)', tmp)
        if match:
            tmp = match.group(1)
            continue
        lead = tmp[0]
        if lead == 'I':
            # Start of a template
            tmp = skip_itanium_template(tmp)
        elif lead == 'N':
            # Start of a nested name
            _, tmp = parse_itanium_nested_name(tmp)
        elif lead in ('L', 'X'):
            # Start of an expression: assume that it's too complicated
            raise TooComplexName
        elif lead == 'E':
            # End of the template
            return tmp[1:]
        else:
            # Something else: probably a type, skip it
            tmp = tmp[1:]
    # Ran out of input before finding the closing E
    return None


# Parse an itanium mangled nested name and transform it into a list of pairs of
# (name, is_template), returning (list, rest of string).
def parse_itanium_nested_name(arg):
    """Parse the Itanium-mangled nested name at the start of *arg*.

    *arg* must start with 'N'. Returns (names, rest of string) where names is
    a list of (name, is_template) pairs, or (None, None) if the name could
    not be parsed.
    """
    # A nested name starts with N
    assert arg.startswith('N'), arg
    ret = []

    # Skip past the N, and possibly a substitution
    match = re.match(r'NS[A-Z0-9]*_(.+)', arg)
    tmp = match.group(1) if match else arg[1:]

    # Skip past CV-qualifiers and ref qualifiers
    match = re.match(r'[rVKRO]*(.+)', tmp)
    if match:
        tmp = match.group(1)

    # Repeatedly parse names from the string until we reach the end of the
    # nested name
    while tmp:
        # An E ends the nested name
        if tmp.startswith('E'):
            return ret, tmp[1:]
        # Parse a name
        name_part, tmp = parse_itanium_name(tmp)
        if not name_part:
            # If we failed then we don't know how to demangle this
            return None, None
        is_template = False
        # If this name is a template record that, then skip the template
        # arguments
        if tmp.startswith('I'):
            tmp = skip_itanium_template(tmp)
            is_template = True
        # Add the name to the list
        ret.append((name_part, is_template))

    # If we get here then something went wrong
    return None, None


def extract_symbols(arg):
    """Collect the symbols to keep from a single library.

    *arg* is a (get_symbols, should_keep_symbol,
    calling_convention_decoration, lib) tuple; everything is packed into one
    argument so that this function can be used with Pool.map_async. Returns a
    dict mapping each kept symbol to the number of times it was seen in *lib*.
    """
    get_symbols, should_keep_symbol, calling_convention_decoration, lib = arg
    symbols = dict()
    for symbol in get_symbols(lib):
        symbol = should_keep_symbol(symbol, calling_convention_decoration)
        if symbol:
            symbols[symbol] = symbols.get(symbol, 0) + 1
    return symbols


# Filename suffixes that identify a library or object file.
_LIB_SUFFIXES = ('.lib', '.a', '.obj', '.o')


def _find_tools(tool_exes, tools):
    """Return the (get_symbols, is_32bit_windows) functions of the first
    usable tools in *tool_exes*; either may be None if nothing was found."""
    get_symbols = None
    is_32bit_windows = None
    # Find a tool to use by trying each in turn until we find one that exists
    # (subprocess.Popen will throw OSError when the program does not exist)
    for exe in tool_exes:
        try:
            # Close std streams as we don't want any output and we don't
            # want the process to wait for something on stdin.
            p = subprocess.Popen([exe], stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 stdin=subprocess.PIPE,
                                 universal_newlines=True)
            p.stdout.close()
            p.stderr.close()
            p.stdin.close()
            p.wait()
        except OSError:
            continue
        # Keep going until we have a tool to use for both get_symbols and
        # is_32bit_windows
        if not get_symbols:
            get_symbols = tools[exe][0]
        if not is_32bit_windows:
            is_32bit_windows = tools[exe][1]
        if get_symbols and is_32bit_windows:
            break
    return get_symbols, is_32bit_windows


def _resolve_lib(lib):
    """Map a command line argument to a library filename, or return None if
    no matching file can be found."""
    # When invoked by cmake the arguments are the cmake target names of the
    # libraries, so we need to add .lib/.a to the end and maybe lib to the
    # start to get the filename. Also allow objects.
    if lib.endswith(_LIB_SUFFIXES):
        return lib
    for suffix in _LIB_SUFFIXES:
        for candidate in (lib + suffix, 'lib' + lib + suffix):
            if os.path.exists(candidate):
                return candidate
    return None


def _template_member_key(symbol, mangling):
    """Return the function+class key of *symbol* if it looks like a member
    function of a template class, otherwise None."""
    if mangling == 'microsoft':
        # Member functions of templates start with
        # ?<fn_name>@?$<class_name>@, so we map to <fn_name>@?$<class_name>.
        # As manglings go from the innermost scope to the outermost scope
        # this means:
        #  * When we have a function member of a subclass of a template
        #    class then <fn_name> will actually contain the mangling of
        #    both the subclass and the function member. This is fine.
        #  * When we have a function member of a template subclass of a
        #    (possibly template) class then it's the innermost template
        #    subclass that becomes <class_name>. This should be OK so long
        #    as we don't have multiple classes with a template subclass of
        #    the same name.
        match = re.search(r"^\?(\??\w+\@\?\$\w+)\@", symbol)
        if match:
            return match.group(1)
        return None
    # Find member functions of templates by demangling the name and
    # checking if the second-to-last name in the list is a template.
    match = re.match(r'_Z(T[VTIS])?(N.+)', symbol)
    if not match:
        return None
    try:
        names, _ = parse_itanium_nested_name(match.group(2))
    except TooComplexName:
        # Manglings that are too complex should already have been
        # filtered out, but if we happen to somehow see one here
        # just leave it as-is.
        return None
    # The length check guards names[-2] against a nested name that has only
    # a single component.
    if names and len(names) >= 2 and names[-2][1]:
        return ''.join([x for x, _ in names])
    return None


def main():
    """Command line entry point: print the symbols that should be exported
    from the given libraries, one per line."""
    tool_exes = ['dumpbin', 'nm', 'objdump', 'llvm-readobj']
    parser = argparse.ArgumentParser(
        description='Extract symbols to export from libraries')
    parser.add_argument('--mangling', choices=['itanium', 'microsoft'],
                        required=True, help='expected symbol mangling scheme')
    parser.add_argument('--tools', choices=tool_exes, nargs='*',
                        help='tools to use to extract symbols and determine the'
                        ' target')
    parser.add_argument('libs', metavar='lib', type=str, nargs='+',
                        help='libraries to extract symbols from')
    parser.add_argument('-o', metavar='file', type=str, help='output to file')
    args = parser.parse_args()

    # Determine the function to use to get the list of symbols from the inputs,
    # and the function to use to determine if the target is 32-bit windows.
    tools = {'dumpbin': (dumpbin_get_symbols, dumpbin_is_32bit_windows),
             'nm': (nm_get_symbols, None),
             'objdump': (None, objdump_is_32bit_windows),
             'llvm-readobj': (readobj_get_symbols, readobj_is_32bit_windows)}
    # If we have a tools argument then use that for the list of tools to check
    if args.tools:
        tool_exes = args.tools
    get_symbols, is_32bit_windows = _find_tools(tool_exes, tools)
    if not get_symbols:
        print("Couldn't find a program to read symbols with", file=sys.stderr)
        sys.exit(1)
    if not is_32bit_windows:
        print("Couldn't find a program to determining the target", file=sys.stderr)
        sys.exit(1)

    # How we determine which symbols to keep and which to discard depends on
    # the mangling scheme
    if args.mangling == 'microsoft':
        should_keep_symbol = should_keep_microsoft_symbol
    else:
        should_keep_symbol = should_keep_itanium_symbol

    # Get the list of libraries to extract symbols from
    libs = list()
    for lib in args.libs:
        filename = _resolve_lib(lib)
        if filename is None:
            print("Don't know what to do with argument "+lib, file=sys.stderr)
            sys.exit(1)
        libs.append(filename)

    # Check if calling convention decoration is used by inspecting the first
    # library in the list
    calling_convention_decoration = is_32bit_windows(libs[0])

    # Extract symbols from libraries in parallel. This is a huge time saver when
    # doing a debug build, as there are hundreds of thousands of symbols in each
    # library.
    pool = multiprocessing.Pool()
    try:
        # Only one argument can be passed to the mapping function, and we can't
        # use a lambda or local function definition as that doesn't work on
        # windows, so create a list of tuples which duplicates the arguments
        # that are the same in all calls.
        vals = [(get_symbols, should_keep_symbol,
                 calling_convention_decoration, x) for x in libs]
        # Do an async map then wait for the result to make sure that
        # KeyboardInterrupt gets caught correctly (see
        # http://bugs.python.org/issue8296)
        result = pool.map_async(extract_symbols, vals)
        pool.close()
        libs_symbols = result.get(3600)
    except KeyboardInterrupt:
        # On Ctrl-C terminate everything and exit
        pool.terminate()
        pool.join()
        sys.exit(1)

    # Merge everything into a single dict
    symbols = dict()
    for this_lib_symbols in libs_symbols:
        for k, v in this_lib_symbols.items():
            symbols[k] = v + symbols.get(k, 0)

    # Count instances of member functions of template classes, and map the
    # symbol name to the function+class. We do this under the assumption that if
    # a member function of a template class is instantiated many times it's
    # probably declared in a public header file.
    template_function_count = {"": 0}
    template_function_mapping = dict()
    for k in symbols:
        name = _template_member_key(k, args.mangling)
        if name:
            template_function_count[name] = \
                template_function_count.get(name, 0) + 1
            template_function_mapping[k] = name
        else:
            template_function_mapping[k] = ""

    # Print symbols which both:
    #  * Appear in exactly one input, as symbols defined in multiple
    #    objects/libraries are assumed to have public definitions.
    #  * Aren't instances of member functions of templates which have been
    #    instantiated 100 times or more, which are assumed to have public
    #    definitions. (100 is an arbitrary guess here.)
    outfile = open(args.o, 'w') if args.o else sys.stdout
    try:
        for k, v in symbols.items():
            template_count = \
                template_function_count[template_function_mapping[k]]
            if v == 1 and template_count < 100:
                print(k, file=outfile)
    finally:
        # Make sure the output file is flushed and closed, but leave stdout
        # alone.
        if outfile is not sys.stdout:
            outfile.close()


if __name__ == '__main__':
    main()