1 #! /usr/bin/env python 2 # -*- coding: iso-8859-1 -*- 3 # Originally written by Barry Warsaw <barry (at] zope.com> 4 # 5 # Minimally patched to make it even more xgettext compatible 6 # by Peter Funk <pf (at] artcom-gmbh.de> 7 # 8 # 2002-11-22 Jrgen Hermann <jh (at] web.de> 9 # Added checks that _() only contains string literals, and 10 # command line args are resolved to module lists, i.e. you 11 # can now pass a filename, a module or package name, or a 12 # directory (including globbing chars, important for Win32). 13 # Made docstring fit in 80 chars wide displays using pydoc. 14 # 15 16 # for selftesting 17 try: 18 import fintl 19 _ = fintl.gettext 20 except ImportError: 21 _ = lambda s: s 22 23 __doc__ = _("""pygettext -- Python equivalent of xgettext(1) 24 25 Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the 26 internationalization of C programs. Most of these tools are independent of 27 the programming language and can be used from within Python programs. 28 Martin von Loewis' work[1] helps considerably in this regard. 29 30 There's one problem though; xgettext is the program that scans source code 31 looking for message strings, but it groks only C (or C++). Python 32 introduces a few wrinkles, such as dual quoting characters, triple quoted 33 strings, and raw strings. xgettext understands none of this. 34 35 Enter pygettext, which uses Python's standard tokenize module to scan 36 Python source code, generating .pot files identical to what GNU xgettext[2] 37 generates for C and C++ code. From there, the standard GNU tools can be 38 used. 39 40 A word about marking Python strings as candidates for translation. GNU 41 xgettext recognizes the following keywords: gettext, dgettext, dcgettext, 42 and gettext_noop. But those can be a lot of text to include all over your 43 code. C and C++ have a trick: they use the C preprocessor. Most 44 internationalized C source includes a #define for gettext() to _() so that 45 what has to be written in the source is much less. Thus these are both 46 translatable strings: 47 48 gettext("Translatable String") 49 _("Translatable String") 50 51 Python of course has no preprocessor so this doesn't work so well. Thus, 52 pygettext searches only for _() by default, but see the -k/--keyword flag 53 below for how to augment this. 54 55 [1] http://www.python.org/workshops/1997-10/proceedings/loewis.html 56 [2] http://www.gnu.org/software/gettext/gettext.html 57 58 NOTE: pygettext attempts to be option and feature compatible with GNU 59 xgettext where ever possible. However some options are still missing or are 60 not fully implemented. Also, xgettext's use of command line switches with 61 option arguments is broken, and in these cases, pygettext just defines 62 additional switches. 63 64 Usage: pygettext [options] inputfile ... 65 66 Options: 67 68 -a 69 --extract-all 70 Extract all strings. 71 72 -d name 73 --default-domain=name 74 Rename the default output file from messages.pot to name.pot. 75 76 -E 77 --escape 78 Replace non-ASCII characters with octal escape sequences. 79 80 -D 81 --docstrings 82 Extract module, class, method, and function docstrings. These do 83 not need to be wrapped in _() markers, and in fact cannot be for 84 Python to consider them docstrings. (See also the -X option). 85 86 -h 87 --help 88 Print this help message and exit. 89 90 -k word 91 --keyword=word 92 Keywords to look for in addition to the default set, which are: 93 %(DEFAULTKEYWORDS)s 94 95 You can have multiple -k flags on the command line. 96 97 -K 98 --no-default-keywords 99 Disable the default set of keywords (see above). Any keywords 100 explicitly added with the -k/--keyword option are still recognized. 101 102 --no-location 103 Do not write filename/lineno location comments. 104 105 -n 106 --add-location 107 Write filename/lineno location comments indicating where each 108 extracted string is found in the source. These lines appear before 109 each msgid. The style of comments is controlled by the -S/--style 110 option. This is the default. 111 112 -o filename 113 --output=filename 114 Rename the default output file from messages.pot to filename. If 115 filename is `-' then the output is sent to standard out. 116 117 -p dir 118 --output-dir=dir 119 Output files will be placed in directory dir. 120 121 -S stylename 122 --style stylename 123 Specify which style to use for location comments. Two styles are 124 supported: 125 126 Solaris # File: filename, line: line-number 127 GNU #: filename:line 128 129 The style name is case insensitive. GNU style is the default. 130 131 -v 132 --verbose 133 Print the names of the files being processed. 134 135 -V 136 --version 137 Print the version of pygettext and exit. 138 139 -w columns 140 --width=columns 141 Set width of output to columns. 142 143 -x filename 144 --exclude-file=filename 145 Specify a file that contains a list of strings that are not be 146 extracted from the input files. Each string to be excluded must 147 appear on a line by itself in the file. 148 149 -X filename 150 --no-docstrings=filename 151 Specify a file that contains a list of files (one per line) that 152 should not have their docstrings extracted. This is only useful in 153 conjunction with the -D option above. 154 155 If `inputfile' is -, standard input is read. 156 """) 157 158 import os 159 import imp 160 import sys 161 import glob 162 import time 163 import getopt 164 import token 165 import tokenize 166 import operator 167 168 __version__ = '1.5' 169 170 default_keywords = ['_'] 171 DEFAULTKEYWORDS = ', '.join(default_keywords) 172 173 EMPTYSTRING = '' 174 175 176 178 # The normal pot-file header. msgmerge and Emacs's po-mode work better if it's 179 # there. 180 pot_header = _('''\ 181 # SOME DESCRIPTIVE TITLE. 182 # Copyright (C) YEAR ORGANIZATION 183 # FIRST AUTHOR <EMAIL@ADDRESS>, YEAR. 184 # 185 msgid "" 186 msgstr "" 187 "Project-Id-Version: PACKAGE VERSION\\n" 188 "POT-Creation-Date: %(time)s\\n" 189 "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n" 190 "Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n" 191 "Language-Team: LANGUAGE <LL@li.org>\\n" 192 "MIME-Version: 1.0\\n" 193 "Content-Type: text/plain; charset=CHARSET\\n" 194 "Content-Transfer-Encoding: ENCODING\\n" 195 "Generated-By: pygettext.py %(version)s\\n" 196 197 ''') 198 199 201 def usage(code, msg=''): 202 print >> sys.stderr, __doc__ % globals() 203 if msg: 204 print >> sys.stderr, msg 205 sys.exit(code) 206 207 208 210 escapes = [] 211 212 def make_escapes(pass_iso8859): 213 global escapes 214 if pass_iso8859: 215 # Allow iso-8859 characters to pass through so that e.g. 'msgid 216 # "Hhe"' would result not result in 'msgid "H\366he"'. Otherwise we 217 # escape any character outside the 32..126 range. 218 mod = 128 219 else: 220 mod = 256 221 for i in range(256): 222 if 32 <= (i % mod) <= 126: 223 escapes.append(chr(i)) 224 else: 225 escapes.append("\\%03o" % i) 226 escapes[ord('\\')] = '\\\\' 227 escapes[ord('\t')] = '\\t' 228 escapes[ord('\r')] = '\\r' 229 escapes[ord('\n')] = '\\n' 230 escapes[ord('\"')] = '\\"' 231 232 233 def escape(s): 234 global escapes 235 s = list(s) 236 for i in range(len(s)): 237 s[i] = escapes[ord(s[i])] 238 return EMPTYSTRING.join(s) 239 240 241 def safe_eval(s): 242 # unwrap quotes, safely 243 return eval(s, {'__builtins__':{}}, {}) 244 245 246 def normalize(s): 247 # This converts the various Python string types into a format that is 248 # appropriate for .po files, namely much closer to C style. 249 lines = s.split('\n') 250 if len(lines) == 1: 251 s = '"' + escape(s) + '"' 252 else: 253 if not lines[-1]: 254 del lines[-1] 255 lines[-1] = lines[-1] + '\n' 256 for i in range(len(lines)): 257 lines[i] = escape(lines[i]) 258 lineterm = '\\n"\n"' 259 s = '""\n"' + lineterm.join(lines) + '"' 260 return s 261 262 264 def containsAny(str, set): 265 """Check whether 'str' contains ANY of the chars in 'set'""" 266 return 1 in [c in str for c in set] 267 268 269 def _visit_pyfiles(list, dirname, names): 270 """Helper for getFilesForName().""" 271 # get extension for python source files 272 if not globals().has_key('_py_ext'): 273 global _py_ext 274 _py_ext = [triple[0] for triple in imp.get_suffixes() 275 if triple[2] == imp.PY_SOURCE][0] 276 277 # don't recurse into CVS directories 278 if 'CVS' in names: 279 names.remove('CVS') 280 281 # add all *.py files to list 282 list.extend( 283 [os.path.join(dirname, file) for file in names 284 if os.path.splitext(file)[1] == _py_ext] 285 ) 286 287 288 def _get_modpkg_path(dotted_name, pathlist=None): 289 """Get the filesystem path for a module or a package. 290 291 Return the file system path to a file for a module, and to a directory for 292 a package. Return None if the name is not found, or is a builtin or 293 extension module. 294 """ 295 # split off top-most name 296 parts = dotted_name.split('.', 1) 297 298 if len(parts) > 1: 299 # we have a dotted path, import top-level package 300 try: 301 file, pathname, description = imp.find_module(parts[0], pathlist) 302 if file: file.close() 303 except ImportError: 304 return None 305 306 # check if it's indeed a package 307 if description[2] == imp.PKG_DIRECTORY: 308 # recursively handle the remaining name parts 309 pathname = _get_modpkg_path(parts[1], [pathname]) 310 else: 311 pathname = None 312 else: 313 # plain name 314 try: 315 file, pathname, description = imp.find_module( 316 dotted_name, pathlist) 317 if file: 318 file.close() 319 if description[2] not in [imp.PY_SOURCE, imp.PKG_DIRECTORY]: 320 pathname = None 321 except ImportError: 322 pathname = None 323 324 return pathname 325 326 327 def getFilesForName(name): 328 """Get a list of module files for a filename, a module or package name, 329 or a directory. 330 """ 331 if not os.path.exists(name): 332 # check for glob chars 333 if containsAny(name, "*?[]"): 334 files = glob.glob(name) 335 list = [] 336 for file in files: 337 list.extend(getFilesForName(file)) 338 return list 339 340 # try to find module or package 341 name = _get_modpkg_path(name) 342 if not name: 343 return [] 344 345 if os.path.isdir(name): 346 # find all python files in directory 347 list = [] 348 os.path.walk(name, _visit_pyfiles, list) 349 return list 350 elif os.path.exists(name): 351 # a single file 352 return [name] 353 354 return [] 355 356 358 class TokenEater: 359 def __init__(self, options): 360 self.__options = options 361 self.__messages = {} 362 self.__state = self.__waiting 363 self.__data = [] 364 self.__lineno = -1 365 self.__freshmodule = 1 366 self.__curfile = None 367 368 def __call__(self, ttype, tstring, stup, etup, line): 369 # dispatch 370 ## import token 371 ## print >> sys.stderr, 'ttype:', token.tok_name[ttype], \ 372 ## 'tstring:', tstring 373 self.__state(ttype, tstring, stup[0]) 374 375 def __waiting(self, ttype, tstring, lineno): 376 opts = self.__options 377 # Do docstring extractions, if enabled 378 if opts.docstrings and not opts.nodocstrings.get(self.__curfile): 379 # module docstring? 380 if self.__freshmodule: 381 if ttype == tokenize.STRING: 382 self.__addentry(safe_eval(tstring), lineno, isdocstring=1) 383 self.__freshmodule = 0 384 elif ttype not in (tokenize.COMMENT, tokenize.NL): 385 self.__freshmodule = 0 386 return 387 # class docstring? 388 if ttype == tokenize.NAME and tstring in ('class', 'def'): 389 self.__state = self.__suiteseen 390 return 391 if ttype == tokenize.NAME and tstring in opts.keywords: 392 self.__state = self.__keywordseen 393 394 def __suiteseen(self, ttype, tstring, lineno): 395 # ignore anything until we see the colon 396 if ttype == tokenize.OP and tstring == ':': 397 self.__state = self.__suitedocstring 398 399 def __suitedocstring(self, ttype, tstring, lineno): 400 # ignore any intervening noise 401 if ttype == tokenize.STRING: 402 self.__addentry(safe_eval(tstring), lineno, isdocstring=1) 403 self.__state = self.__waiting 404 elif ttype not in (tokenize.NEWLINE, tokenize.INDENT, 405 tokenize.COMMENT): 406 # there was no class docstring 407 self.__state = self.__waiting 408 409 def __keywordseen(self, ttype, tstring, lineno): 410 if ttype == tokenize.OP and tstring == '(': 411 self.__data = [] 412 self.__lineno = lineno 413 self.__state = self.__openseen 414 else: 415 self.__state = self.__waiting 416 417 def __openseen(self, ttype, tstring, lineno): 418 if ttype == tokenize.OP and tstring == ')': 419 # We've seen the last of the translatable strings. Record the 420 # line number of the first line of the strings and update the list 421 # of messages seen. Reset state for the next batch. If there 422 # were no strings inside _(), then just ignore this entry. 423 if self.__data: 424 self.__addentry(EMPTYSTRING.join(self.__data)) 425 self.__state = self.__waiting 426 elif ttype == tokenize.STRING: 427 self.__data.append(safe_eval(tstring)) 428 elif ttype not in [tokenize.COMMENT, token.INDENT, token.DEDENT, 429 token.NEWLINE, tokenize.NL]: 430 # warn if we see anything else than STRING or whitespace 431 print >> sys.stderr, _( 432 '*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"' 433 ) % { 434 'token': tstring, 435 'file': self.__curfile, 436 'lineno': self.__lineno 437 } 438 self.__state = self.__waiting 439 440 def __addentry(self, msg, lineno=None, isdocstring=0): 441 if lineno is None: 442 lineno = self.__lineno 443 if not msg in self.__options.toexclude: 444 entry = (self.__curfile, lineno) 445 self.__messages.setdefault(msg, {})[entry] = isdocstring 446 447 def set_filename(self, filename): 448 self.__curfile = filename 449 self.__freshmodule = 1 450 451 def write(self, fp): 452 options = self.__options 453 timestamp = time.strftime('%Y-%m-%d %H:%M+%Z') 454 # The time stamp in the header doesn't have the same format as that 455 # generated by xgettext... 456 print >> fp, pot_header % {'time': timestamp, 'version': __version__} 457 # Sort the entries. First sort each particular entry's keys, then 458 # sort all the entries by their first item. 459 reverse = {} 460 for k, v in self.__messages.items(): 461 keys = v.keys() 462 keys.sort() 463 reverse.setdefault(tuple(keys), []).append((k, v)) 464 rkeys = reverse.keys() 465 rkeys.sort() 466 for rkey in rkeys: 467 rentries = reverse[rkey] 468 rentries.sort() 469 for k, v in rentries: 470 isdocstring = 0 471 # If the entry was gleaned out of a docstring, then add a 472 # comment stating so. This is to aid translators who may wish 473 # to skip translating some unimportant docstrings. 474 if reduce(operator.__add__, v.values()): 475 isdocstring = 1 476 # k is the message string, v is a dictionary-set of (filename, 477 # lineno) tuples. We want to sort the entries in v first by 478 # file name and then by line number. 479 v = v.keys() 480 v.sort() 481 if not options.writelocations: 482 pass 483 # location comments are different b/w Solaris and GNU: 484 elif options.locationstyle == options.SOLARIS: 485 for filename, lineno in v: 486 d = {'filename': filename, 'lineno': lineno} 487 print >>fp, _( 488 '# File: %(filename)s, line: %(lineno)d') % d 489 elif options.locationstyle == options.GNU: 490 # fit as many locations on one line, as long as the 491 # resulting line length doesn't exceeds 'options.width' 492 locline = '#:' 493 for filename, lineno in v: 494 d = {'filename': filename, 'lineno': lineno} 495 s = _(' %(filename)s:%(lineno)d') % d 496 if len(locline) + len(s) <= options.width: 497 locline = locline + s 498 else: 499 print >> fp, locline 500 locline = "#:" + s 501 if len(locline) > 2: 502 print >> fp, locline 503 if isdocstring: 504 print >> fp, '#, docstring' 505 print >> fp, 'msgid', normalize(k) 506 print >> fp, 'msgstr ""\n' 507 508 509 511 def main(): 512 global default_keywords 513 try: 514 opts, args = getopt.getopt( 515 sys.argv[1:], 516 'ad:DEhk:Kno:p:S:Vvw:x:X:', 517 ['extract-all', 'default-domain=', 'escape', 'help', 518 'keyword=', 'no-default-keywords', 519 'add-location', 'no-location', 'output=', 'output-dir=', 520 'style=', 'verbose', 'version', 'width=', 'exclude-file=', 521 'docstrings', 'no-docstrings', 522 ]) 523 except getopt.error, msg: 524 usage(1, msg) 525 526 # for holding option values 527 class Options: 528 # constants 529 GNU = 1 530 SOLARIS = 2 531 # defaults 532 extractall = 0 # FIXME: currently this option has no effect at all. 533 escape = 0 534 keywords = [] 535 outpath = '' 536 outfile = 'messages.pot' 537 writelocations = 1 538 locationstyle = GNU 539 verbose = 0 540 width = 78 541 excludefilename = '' 542 docstrings = 0 543 nodocstrings = {} 544 545 options = Options() 546 locations = {'gnu' : options.GNU, 547 'solaris' : options.SOLARIS, 548 } 549 550 # parse options 551 for opt, arg in opts: 552 if opt in ('-h', '--help'): 553 usage(0) 554 elif opt in ('-a', '--extract-all'): 555 options.extractall = 1 556 elif opt in ('-d', '--default-domain'): 557 options.outfile = arg + '.pot' 558 elif opt in ('-E', '--escape'): 559 options.escape = 1 560 elif opt in ('-D', '--docstrings'): 561 options.docstrings = 1 562 elif opt in ('-k', '--keyword'): 563 options.keywords.append(arg) 564 elif opt in ('-K', '--no-default-keywords'): 565 default_keywords = [] 566 elif opt in ('-n', '--add-location'): 567 options.writelocations = 1 568 elif opt in ('--no-location',): 569 options.writelocations = 0 570 elif opt in ('-S', '--style'): 571 options.locationstyle = locations.get(arg.lower()) 572 if options.locationstyle is None: 573 usage(1, _('Invalid value for --style: %s') % arg) 574 elif opt in ('-o', '--output'): 575 options.outfile = arg 576 elif opt in ('-p', '--output-dir'): 577 options.outpath = arg 578 elif opt in ('-v', '--verbose'): 579 options.verbose = 1 580 elif opt in ('-V', '--version'): 581 print _('pygettext.py (xgettext for Python) %s') % __version__ 582 sys.exit(0) 583 elif opt in ('-w', '--width'): 584 try: 585 options.width = int(arg) 586 except ValueError: 587 usage(1, _('--width argument must be an integer: %s') % arg) 588 elif opt in ('-x', '--exclude-file'): 589 options.excludefilename = arg 590 elif opt in ('-X', '--no-docstrings'): 591 fp = open(arg) 592 try: 593 while 1: 594 line = fp.readline() 595 if not line: 596 break 597 options.nodocstrings[line[:-1]] = 1 598 finally: 599 fp.close() 600 601 # calculate escapes 602 make_escapes(options.escape) 603 604 # calculate all keywords 605 options.keywords.extend(default_keywords) 606 607 # initialize list of strings to exclude 608 if options.excludefilename: 609 try: 610 fp = open(options.excludefilename) 611 options.toexclude = fp.readlines() 612 fp.close() 613 except IOError: 614 print >> sys.stderr, _( 615 "Can't read --exclude-file: %s") % options.excludefilename 616 sys.exit(1) 617 else: 618 options.toexclude = [] 619 620 # resolve args to module lists 621 expanded = [] 622 for arg in args: 623 if arg == '-': 624 expanded.append(arg) 625 else: 626 expanded.extend(getFilesForName(arg)) 627 args = expanded 628 629 # slurp through all the files 630 eater = TokenEater(options) 631 for filename in args: 632 if filename == '-': 633 if options.verbose: 634 print _('Reading standard input') 635 fp = sys.stdin 636 closep = 0 637 else: 638 if options.verbose: 639 print _('Working on %s') % filename 640 fp = open(filename) 641 closep = 1 642 try: 643 eater.set_filename(filename) 644 try: 645 tokenize.tokenize(fp.readline, eater) 646 except tokenize.TokenError, e: 647 print >> sys.stderr, '%s: %s, line %d, column %d' % ( 648 e[0], filename, e[1][0], e[1][1]) 649 finally: 650 if closep: 651 fp.close() 652 653 # write the output 654 if options.outfile == '-': 655 fp = sys.stdout 656 closep = 0 657 else: 658 if options.outpath: 659 options.outfile = os.path.join(options.outpath, options.outfile) 660 fp = open(options.outfile, 'w') 661 closep = 1 662 try: 663 eater.write(fp) 664 finally: 665 if closep: 666 fp.close() 667 668 670 if __name__ == '__main__': 671 main() 672 # some more test strings 673 _(u'a unicode string') 674 # this one creates a warning 675 _('*** Seen unexpected token "%(token)s"') % {'token': 'test'} 676 _('more' 'than' 'one' 'string') 677