#! /usr/bin/env python
# -*- coding: iso-8859-1 -*-
# Originally written by Barry Warsaw <barry@python.org>
#
# Minimally patched to make it even more xgettext compatible
# by Peter Funk <pf@artcom-gmbh.de>
#
# 2002-11-22 Juergen Hermann <jh@web.de>
# Added checks that _() only contains string literals, and
# command line args are resolved to module lists, i.e. you
# can now pass a filename, a module or package name, or a
# directory (including globbing chars, important for Win32).
# Made docstring fit in 80 chars wide displays using pydoc.
#

# for selftesting
try:
    import fintl
    _ = fintl.gettext
except ImportError:
    _ = lambda s: s

__doc__ = _("""pygettext -- Python equivalent of xgettext(1)

Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
internationalization of C programs. Most of these tools are independent of
the programming language and can be used from within Python programs.
Martin von Loewis' work[1] helps considerably in this regard.

There's one problem though; xgettext is the program that scans source code
looking for message strings, but it groks only C (or C++). Python
introduces a few wrinkles, such as dual quoting characters, triple quoted
strings, and raw strings. xgettext understands none of this.

Enter pygettext, which uses Python's standard tokenize module to scan
Python source code, generating .pot files identical to what GNU xgettext[2]
generates for C and C++ code. From there, the standard GNU tools can be
used.

A word about marking Python strings as candidates for translation. GNU
xgettext recognizes the following keywords: gettext, dgettext, dcgettext,
and gettext_noop. But those can be a lot of text to include all over your
code. C and C++ have a trick: they use the C preprocessor. Most
internationalized C source includes a #define for gettext() to _() so that
what has to be written in the source is much less. Thus these are both
translatable strings:

    gettext("Translatable String")
    _("Translatable String")

Python of course has no preprocessor so this doesn't work so well. Thus,
pygettext searches only for _() by default, but see the -k/--keyword flag
below for how to augment this.

 [1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
 [2] http://www.gnu.org/software/gettext/gettext.html

NOTE: pygettext attempts to be option and feature compatible with GNU
xgettext where ever possible. However some options are still missing or are
not fully implemented. Also, xgettext's use of command line switches with
option arguments is broken, and in these cases, pygettext just defines
additional switches.

Usage: pygettext [options] inputfile ...

Options:

    -a
    --extract-all
        Extract all strings.

    -d name
    --default-domain=name
        Rename the default output file from messages.pot to name.pot.

    -E
    --escape
        Replace non-ASCII characters with octal escape sequences.

    -D
    --docstrings
        Extract module, class, method, and function docstrings. These do
        not need to be wrapped in _() markers, and in fact cannot be for
        Python to consider them docstrings. (See also the -X option).

    -h
    --help
        Print this help message and exit.

    -k word
    --keyword=word
        Keywords to look for in addition to the default set, which are:
        %(DEFAULTKEYWORDS)s

        You can have multiple -k flags on the command line.

    -K
    --no-default-keywords
        Disable the default set of keywords (see above). Any keywords
        explicitly added with the -k/--keyword option are still recognized.

    --no-location
        Do not write filename/lineno location comments.

    -n
    --add-location
        Write filename/lineno location comments indicating where each
        extracted string is found in the source. These lines appear before
        each msgid. The style of comments is controlled by the -S/--style
        option. This is the default.

    -o filename
    --output=filename
        Rename the default output file from messages.pot to filename. If
        filename is `-' then the output is sent to standard out.

    -p dir
    --output-dir=dir
        Output files will be placed in directory dir.

    -S stylename
    --style stylename
        Specify which style to use for location comments. Two styles are
        supported:

        Solaris # File: filename, line: line-number
        GNU #: filename:line

        The style name is case insensitive. GNU style is the default.

    -v
    --verbose
        Print the names of the files being processed.

    -V
    --version
        Print the version of pygettext and exit.

    -w columns
    --width=columns
        Set width of output to columns.

    -x filename
    --exclude-file=filename
        Specify a file that contains a list of strings that are not be
        extracted from the input files. Each string to be excluded must
        appear on a line by itself in the file.

    -X filename
    --no-docstrings=filename
        Specify a file that contains a list of files (one per line) that
        should not have their docstrings extracted. This is only useful in
        conjunction with the -D option above.

If `inputfile' is -, standard input is read.
""")

import os
import sys
import glob
import time
import getopt
import token
import tokenize
import operator

try:
    # Python 3.4+: locate modules through importlib ('imp' is deprecated
    # there and no longer exists in Python 3.12).
    import importlib.machinery as _machinery
    imp = None
except ImportError:
    # Python 2: 'imp' is the only way to locate a module without importing.
    import imp
    _machinery = None

__version__ = '1.5'

default_keywords = ['_']
DEFAULTKEYWORDS = ', '.join(default_keywords)

EMPTYSTRING = ''



# The normal pot-file header. msgmerge and Emacs's po-mode work better if it's
# there.
pot_header = _('''\
# SOME DESCRIPTIVE TITLE.
# Copyright (C) YEAR ORGANIZATION
# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
#
msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"POT-Creation-Date: %(time)s\\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=CHARSET\\n"
"Content-Transfer-Encoding: ENCODING\\n"
"Generated-By: pygettext.py %(version)s\\n"

''')



def usage(code, msg=''):
    """Write the help text (and optionally 'msg') to stderr, then exit.

    'code' is passed to sys.exit(); 'msg' may be a string or an exception
    object and is only written when it is true.
    """
    sys.stderr.write(__doc__ % globals() + '\n')
    if msg:
        sys.stderr.write('%s\n' % (msg,))
    sys.exit(code)



# Translation table used by escape(); filled in by make_escapes().
escapes = []

def make_escapes(pass_iso8859):
    """(Re)build the global 'escapes' table, indexed by character code.

    When 'pass_iso8859' is true only codes 0..127 outside the printable
    range 32..126 are replaced by octal escapes; codes 128..255 map to
    themselves. When it is false, every code outside 32..126 is escaped.
    """
    global escapes
    escapes = [chr(i) for i in range(256)]
    if pass_iso8859:
        # Allow iso-8859 characters to pass through so that e.g. a msgid
        # containing an o-umlaut ("Hoehe" spelled with one) would not result
        # in 'msgid "H\366he"'. Otherwise we escape any character outside
        # the 32..126 range.
        mod = 128
    else:
        mod = 256
    for i in range(mod):
        if not (32 <= i <= 126):
            escapes[i] = "\\%03o" % i
    # These must come last: they override entries inside the printable range
    # as well as the generic octal forms written above.
    escapes[ord('\\')] = '\\\\'
    escapes[ord('\t')] = '\\t'
    escapes[ord('\r')] = '\\r'
    escapes[ord('\n')] = '\\n'
    escapes[ord('\"')] = '\\"'


def escape(s):
    """Return 's' with every character replaced via the 'escapes' table.

    Characters whose code is not covered by the table (code points >= 256,
    or any character if make_escapes() has not been called yet) are copied
    unchanged instead of raising IndexError.
    """
    table = escapes
    limit = len(table)
    pieces = []
    for ch in s:
        code = ord(ch)
        if code < limit:
            pieces.append(table[code])
        else:
            pieces.append(ch)
    return EMPTYSTRING.join(pieces)


def safe_eval(s):
    """Evaluate the source text of a string literal and return its value.

    NOTE(review): this is eval() with an empty builtins mapping. Callers in
    this file only pass tokens the tokenizer classified as STRING; do not
    hand it arbitrary text.
    """
    # unwrap quotes, safely
    return eval(s, {'__builtins__': {}}, {})


def normalize(s):
    """Format 's' as the quoted (possibly multi-line) value of a .po entry."""
    # This converts the various Python string types into a format that is
    # appropriate for .po files, namely much closer to C style.
    lines = s.split('\n')
    if len(lines) == 1:
        return '"' + escape(s) + '"'
    if not lines[-1]:
        # 's' ended with a newline: drop the empty tail and put the newline
        # back on the real last line so that it gets escaped with it.
        del lines[-1]
        lines[-1] = lines[-1] + '\n'
    lineterm = '\\n"\n"'
    return '""\n"' + lineterm.join([escape(line) for line in lines]) + '"'



def containsAny(str, set):
    """Check whether 'str' contains ANY of the chars in 'set'"""
    for c in set:
        if c in str:
            return True
    return False


def _find_module(name, pathlist):
    """Locate the top-level 'name' on 'pathlist' without importing it.

    'pathlist' is a list of directories, or None to search sys.path.
    Return (pathname, kind) with kind one of 'package', 'source' or 'other'.
    Raise ImportError when nothing is found.
    """
    if _machinery is None:
        fp, pathname, description = imp.find_module(name, pathlist)
        if fp:
            fp.close()
        if description[2] == imp.PKG_DIRECTORY:
            return pathname, 'package'
        if description[2] == imp.PY_SOURCE:
            return pathname, 'source'
        return pathname, 'other'

    spec = _machinery.PathFinder.find_spec(name, pathlist)
    if spec is None:
        raise ImportError(name)
    if spec.submodule_search_locations is not None:
        locations = list(spec.submodule_search_locations)
        if not locations:
            raise ImportError(name)
        return locations[0], 'package'
    if isinstance(spec.loader, _machinery.SourceFileLoader):
        return spec.origin, 'source'
    return spec.origin, 'other'


def _source_suffix():
    """Return the filename extension of Python source files (e.g. '.py')."""
    if _machinery is not None:
        return _machinery.SOURCE_SUFFIXES[0]
    return [triple[0] for triple in imp.get_suffixes()
            if triple[2] == imp.PY_SOURCE][0]


def _get_modpkg_path(dotted_name, pathlist=None):
    """Get the filesystem path for a module or a package.

    Return the file system path to a file for a module, and to a directory for
    a package. Return None if the name is not found, or is a builtin or
    extension module.
    """
    # split off top-most name
    parts = dotted_name.split('.', 1)
    if not parts[0]:
        return None

    try:
        pathname, kind = _find_module(parts[0], pathlist)
    except ImportError:
        return None

    if len(parts) > 1:
        # we have a dotted path: the top-level name must be a package, then
        # recursively handle the remaining name parts inside it
        if kind == 'package':
            return _get_modpkg_path(parts[1], [pathname])
        return None

    # plain name: only source modules and packages have files to scan
    if kind in ('source', 'package'):
        return pathname
    return None


def getFilesForName(name):
    """Get a list of module files for a filename, a module or package name,
    or a directory.
    """
    if not os.path.exists(name):
        # check for glob chars
        if containsAny(name, "*?[]"):
            found = []
            for match in glob.glob(name):
                found.extend(getFilesForName(match))
            return found

        # try to find module or package
        name = _get_modpkg_path(name)
        if not name:
            return []

    if os.path.isdir(name):
        # find all python files in directory
        found = []
        py_ext = _source_suffix()
        for root, dirs, files in os.walk(name):
            # don't recurse into CVS directories
            if 'CVS' in dirs:
                dirs.remove('CVS')
            # add all *.py files to list
            found.extend(
                [os.path.join(root, fname) for fname in files
                 if os.path.splitext(fname)[1] == py_ext]
                )
        return found
    elif os.path.exists(name):
        # a single file
        return [name]

    return []



class TokenEater:
    """Token-driven state machine that collects translatable strings.

    An instance is called once per token with the 5 values produced by the
    tokenize module. The current state is held in self.__state as a bound
    method; every state method takes (ttype, tstring, lineno). Collected
    messages are kept in self.__messages, a dict mapping each message to a
    dict of {(filename, lineno): isdocstring}.

    'options' must provide: docstrings, nodocstrings, keywords, toexclude,
    and, for write(), writelocations, locationstyle, GNU, SOLARIS and width.
    """

    def __init__(self, options):
        self.__options = options
        self.__messages = {}
        self.__state = self.__waiting
        self.__data = []
        self.__lineno = -1
        self.__freshmodule = 1
        self.__curfile = None

    def __call__(self, ttype, tstring, stup, etup, line):
        # dispatch to the current state; only the start row is needed
        self.__state(ttype, tstring, stup[0])

    def __waiting(self, ttype, tstring, lineno):
        """Idle state: look for docstring positions and keyword names."""
        opts = self.__options
        # Do docstring extractions, if enabled
        if opts.docstrings and not opts.nodocstrings.get(self.__curfile):
            # module docstring?
            if self.__freshmodule:
                if ttype == tokenize.STRING:
                    self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
                    self.__freshmodule = 0
                elif ttype not in (tokenize.COMMENT, tokenize.NL):
                    self.__freshmodule = 0
                return
            # class docstring?
            if ttype == tokenize.NAME and tstring in ('class', 'def'):
                self.__state = self.__suiteseen
                return
        if ttype == tokenize.NAME and tstring in opts.keywords:
            self.__state = self.__keywordseen

    def __suiteseen(self, ttype, tstring, lineno):
        """After 'class'/'def': skip the header up to a colon."""
        # ignore anything until we see the colon
        if ttype == tokenize.OP and tstring == ':':
            self.__state = self.__suitedocstring

    def __suitedocstring(self, ttype, tstring, lineno):
        """After the suite's colon: record a docstring if one comes first."""
        # ignore any intervening noise
        if ttype == tokenize.STRING:
            self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
            self.__state = self.__waiting
        elif ttype not in (tokenize.NEWLINE, tokenize.INDENT,
                           tokenize.COMMENT):
            # there was no class docstring
            self.__state = self.__waiting

    def __keywordseen(self, ttype, tstring, lineno):
        """After a keyword name: it only counts when '(' follows directly."""
        if ttype == tokenize.OP and tstring == '(':
            self.__data = []
            self.__lineno = lineno
            self.__state = self.__openseen
        else:
            self.__state = self.__waiting

    def __openseen(self, ttype, tstring, lineno):
        """Inside keyword(...): gather adjacent string literals until ')'."""
        if ttype == tokenize.OP and tstring == ')':
            # We've seen the last of the translatable strings. Record the
            # line number of the first line of the strings and update the list
            # of messages seen. Reset state for the next batch. If there
            # were no strings inside _(), then just ignore this entry.
            if self.__data:
                self.__addentry(EMPTYSTRING.join(self.__data))
            self.__state = self.__waiting
        elif ttype == tokenize.STRING:
            self.__data.append(safe_eval(tstring))
        elif ttype not in [tokenize.COMMENT, token.INDENT, token.DEDENT,
                           token.NEWLINE, tokenize.NL]:
            # warn if we see anything else than STRING or whitespace
            sys.stderr.write(_(
                '*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"'
                ) % {
                'token': tstring,
                'file': self.__curfile,
                'lineno': self.__lineno
                } + '\n')
            self.__state = self.__waiting

    def __addentry(self, msg, lineno=None, isdocstring=0):
        """Record 'msg' at (current file, lineno) unless it is excluded."""
        if lineno is None:
            lineno = self.__lineno
        if msg not in self.__options.toexclude:
            entry = (self.__curfile, lineno)
            self.__messages.setdefault(msg, {})[entry] = isdocstring

    def set_filename(self, filename):
        """Start a new input file; its first token may be a module docstring."""
        self.__curfile = filename
        self.__freshmodule = 1

    def write(self, fp):
        """Write the header and all collected messages to 'fp' in .pot form."""
        options = self.__options
        timestamp = time.strftime('%Y-%m-%d %H:%M+%Z')
        # The time stamp in the header doesn't have the same format as that
        # generated by xgettext...
        fp.write(pot_header % {'time': timestamp, 'version': __version__})
        fp.write('\n')
        # Sort the entries. First sort each particular entry's keys, then
        # sort all the entries by their first item.
        reverse = {}
        for k, v in self.__messages.items():
            reverse.setdefault(tuple(sorted(v.keys())), []).append((k, v))
        for rkey in sorted(reverse.keys()):
            rentries = reverse[rkey]
            # Messages are unique, so ordering by the message alone gives the
            # same result as ordering the (message, locations) pairs.
            rentries.sort(key=operator.itemgetter(0))
            for k, v in rentries:
                # If the entry was gleaned out of a docstring, then add a
                # comment stating so. This is to aid translators who may wish
                # to skip translating some unimportant docstrings.
                isdocstring = any(v.values())
                # k is the message string, v is a dictionary-set of (filename,
                # lineno) tuples. We want to sort the entries in v first by
                # file name and then by line number.
                places = sorted(v.keys())
                if not options.writelocations:
                    pass
                # location comments are different b/w Solaris and GNU:
                elif options.locationstyle == options.SOLARIS:
                    for filename, lineno in places:
                        d = {'filename': filename, 'lineno': lineno}
                        fp.write(_(
                            '# File: %(filename)s, line: %(lineno)d') % d
                            + '\n')
                elif options.locationstyle == options.GNU:
                    # fit as many locations on one line, as long as the
                    # resulting line length doesn't exceed 'options.width'
                    locline = '#:'
                    for filename, lineno in places:
                        d = {'filename': filename, 'lineno': lineno}
                        s = _(' %(filename)s:%(lineno)d') % d
                        if len(locline) + len(s) <= options.width:
                            locline = locline + s
                        else:
                            fp.write(locline + '\n')
                            locline = "#:" + s
                    if len(locline) > 2:
                        fp.write(locline + '\n')
                if isdocstring:
                    fp.write('#, docstring\n')
                fp.write('msgid %s\n' % normalize(k))
                fp.write('msgstr ""\n\n')



def main():
    """Command line entry point: parse sys.argv, scan inputs, write the .pot."""
    global default_keywords
    try:
        opts, args = getopt.getopt(
            sys.argv[1:],
            'ad:DEhk:Kno:p:S:Vvw:x:X:',
            ['extract-all', 'default-domain=', 'escape', 'help',
             'keyword=', 'no-default-keywords',
             'add-location', 'no-location', 'output=', 'output-dir=',
             'style=', 'verbose', 'version', 'width=', 'exclude-file=',
             # --no-docstrings takes a filename, hence the trailing '='
             'docstrings', 'no-docstrings=',
             ])
    except getopt.error as msg:
        usage(1, msg)

    # for holding option values
    class Options:
        # constants
        GNU = 1
        SOLARIS = 2
        # defaults
        extractall = 0 # FIXME: currently this option has no effect at all.
        escape = 0
        keywords = []
        outpath = ''
        outfile = 'messages.pot'
        writelocations = 1
        locationstyle = GNU
        verbose = 0
        width = 78
        excludefilename = ''
        docstrings = 0
        nodocstrings = {}

    options = Options()
    locations = {'gnu' : options.GNU,
                 'solaris' : options.SOLARIS,
                 }

    # parse options
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-a', '--extract-all'):
            options.extractall = 1
        elif opt in ('-d', '--default-domain'):
            options.outfile = arg + '.pot'
        elif opt in ('-E', '--escape'):
            options.escape = 1
        elif opt in ('-D', '--docstrings'):
            options.docstrings = 1
        elif opt in ('-k', '--keyword'):
            options.keywords.append(arg)
        elif opt in ('-K', '--no-default-keywords'):
            default_keywords = []
        elif opt in ('-n', '--add-location'):
            options.writelocations = 1
        elif opt in ('--no-location',):
            options.writelocations = 0
        elif opt in ('-S', '--style'):
            options.locationstyle = locations.get(arg.lower())
            if options.locationstyle is None:
                usage(1, _('Invalid value for --style: %s') % arg)
        elif opt in ('-o', '--output'):
            options.outfile = arg
        elif opt in ('-p', '--output-dir'):
            options.outpath = arg
        elif opt in ('-v', '--verbose'):
            options.verbose = 1
        elif opt in ('-V', '--version'):
            sys.stdout.write(
                _('pygettext.py (xgettext for Python) %s') % __version__
                + '\n')
            sys.exit(0)
        elif opt in ('-w', '--width'):
            try:
                options.width = int(arg)
            except ValueError:
                usage(1, _('--width argument must be an integer: %s') % arg)
        elif opt in ('-x', '--exclude-file'):
            options.excludefilename = arg
        elif opt in ('-X', '--no-docstrings'):
            fp = open(arg)
            try:
                for line in fp:
                    # Strip only the line terminator, so a last line
                    # without a newline keeps its final character.
                    options.nodocstrings[line.rstrip('\r\n')] = 1
            finally:
                fp.close()

    # calculate escapes
    make_escapes(not options.escape)

    # calculate all keywords
    options.keywords.extend(default_keywords)

    # initialize list of strings to exclude
    if options.excludefilename:
        try:
            fp = open(options.excludefilename)
            try:
                # One string per line; the line terminator is not part of
                # the string, otherwise it could never equal a msgid.
                options.toexclude = [line.rstrip('\r\n')
                                     for line in fp.readlines()]
            finally:
                fp.close()
        except IOError:
            sys.stderr.write(_(
                "Can't read --exclude-file: %s") % options.excludefilename
                + '\n')
            sys.exit(1)
    else:
        options.toexclude = []

    # resolve args to module lists
    expanded = []
    for arg in args:
        if arg == '-':
            expanded.append(arg)
        else:
            expanded.extend(getFilesForName(arg))
    args = expanded

    # slurp through all the files
    eater = TokenEater(options)
    for filename in args:
        if filename == '-':
            if options.verbose:
                sys.stdout.write(_('Reading standard input') + '\n')
            fp = sys.stdin
            closep = 0
        else:
            if options.verbose:
                sys.stdout.write(_('Working on %s') % filename + '\n')
            fp = open(filename)
            closep = 1
        try:
            eater.set_filename(filename)
            try:
                # generate_tokens() yields the same 5-tuples that the old
                # callback interface passed to the token eater.
                for tok in tokenize.generate_tokens(fp.readline):
                    eater(*tok)
            except tokenize.TokenError as e:
                sys.stderr.write('%s: %s, line %d, column %d\n' % (
                    e.args[0], filename, e.args[1][0], e.args[1][1]))
        finally:
            if closep:
                fp.close()

    # write the output
    if options.outfile == '-':
        fp = sys.stdout
        closep = 0
    else:
        if options.outpath:
            options.outfile = os.path.join(options.outpath, options.outfile)
        fp = open(options.outfile, 'w')
        closep = 1
    try:
        eater.write(fp)
    finally:
        if closep:
            fp.close()



if __name__ == '__main__':
    main()
    # some more test strings
    _(u'a unicode string')
    # this one creates a warning
    _('*** Seen unexpected token "%(token)s"') % {'token': 'test'}
    _('more' 'than' 'one' 'string')