#! /usr/bin/env python3
# -*- coding: iso-8859-1 -*-
# Originally written by Barry Warsaw <barry (at] python.org>
#
# Minimally patched to make it even more xgettext compatible
# by Peter Funk <pf (at] artcom-gmbh.de>
#
# 2002-11-22 Juergen Hermann <jh (at] web.de>
# Added checks that _() only contains string literals, and
# command line args are resolved to module lists, i.e. you
# can now pass a filename, a module or package name, or a
# directory (including globbing chars, important for Win32).
# Made docstring fit in 80 chars wide displays using pydoc.
#

# for selftesting: use fintl's gettext when it is importable, otherwise fall
# back to an identity function so that _() marks strings without translating.
try:
    import fintl
    _ = fintl.gettext
except ImportError:
    def _(s):
        return s

# NOTE: this text is printed by usage() after being interpolated with the
# module globals, so the only %-directive it may contain is
# %(DEFAULTKEYWORDS)s.
__doc__ = _("""pygettext -- Python equivalent of xgettext(1)

Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
internationalization of C programs. Most of these tools are independent of
the programming language and can be used from within Python programs.
Martin von Loewis' work[1] helps considerably in this regard.

There's one problem though; xgettext is the program that scans source code
looking for message strings, but it groks only C (or C++). Python
introduces a few wrinkles, such as dual quoting characters, triple quoted
strings, and raw strings. xgettext understands none of this.

Enter pygettext, which uses Python's standard tokenize module to scan
Python source code, generating .pot files identical to what GNU xgettext[2]
generates for C and C++ code. From there, the standard GNU tools can be
used.

A word about marking Python strings as candidates for translation. GNU
xgettext recognizes the following keywords: gettext, dgettext, dcgettext,
and gettext_noop. But those can be a lot of text to include all over your
code. C and C++ have a trick: they use the C preprocessor. Most
internationalized C source includes a #define for gettext() to _() so that
what has to be written in the source is much less. Thus these are both
translatable strings:

    gettext("Translatable String")
    _("Translatable String")

Python of course has no preprocessor so this doesn't work so well. Thus,
pygettext searches only for _() by default, but see the -k/--keyword flag
below for how to augment this.

 [1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
 [2] http://www.gnu.org/software/gettext/gettext.html

NOTE: pygettext attempts to be option and feature compatible with GNU
xgettext wherever possible. However some options are still missing or are
not fully implemented. Also, xgettext's use of command line switches with
option arguments is broken, and in these cases, pygettext just defines
additional switches.

Usage: pygettext [options] inputfile ...

Options:

    -a
    --extract-all
        Extract all strings.

    -d name
    --default-domain=name
        Rename the default output file from messages.pot to name.pot.

    -E
    --escape
        Replace non-ASCII characters with octal escape sequences.

    -D
    --docstrings
        Extract module, class, method, and function docstrings. These do
        not need to be wrapped in _() markers, and in fact cannot be for
        Python to consider them docstrings. (See also the -X option).

    -h
    --help
        Print this help message and exit.

    -k word
    --keyword=word
        Keywords to look for in addition to the default set, which are:
        %(DEFAULTKEYWORDS)s

        You can have multiple -k flags on the command line.

    -K
    --no-default-keywords
        Disable the default set of keywords (see above). Any keywords
        explicitly added with the -k/--keyword option are still recognized.

    --no-location
        Do not write filename/lineno location comments.

    -n
    --add-location
        Write filename/lineno location comments indicating where each
        extracted string is found in the source. These lines appear before
        each msgid. The style of comments is controlled by the -S/--style
        option. This is the default.

    -o filename
    --output=filename
        Rename the default output file from messages.pot to filename. If
        filename is `-' then the output is sent to standard out.

    -p dir
    --output-dir=dir
        Output files will be placed in directory dir.

    -S stylename
    --style=stylename
        Specify which style to use for location comments. Two styles are
        supported:

        Solaris  # File: filename, line: line-number
        GNU      #: filename:line

        The style name is case insensitive. GNU style is the default.

    -v
    --verbose
        Print the names of the files being processed.

    -V
    --version
        Print the version of pygettext and exit.

    -w columns
    --width=columns
        Set width of output to columns.

    -x filename
    --exclude-file=filename
        Specify a file that contains a list of strings that are not to be
        extracted from the input files. Each string to be excluded must
        appear on a line by itself in the file.

    -X filename
    --no-docstrings=filename
        Specify a file that contains a list of files (one per line) that
        should not have their docstrings extracted. This is only useful in
        conjunction with the -D option above.

If `inputfile' is -, standard input is read.
""")

import os
import importlib.machinery
import importlib.util
import sys
import glob
import time
import getopt
import token
import tokenize

__version__ = '1.5'

# Keywords that mark a call as containing a translatable string.  main()
# rebinds this to [] when -K/--no-default-keywords is given.
default_keywords = ['_']
# Human-readable form of the default keywords, interpolated into __doc__.
DEFAULTKEYWORDS = ', '.join(default_keywords)

EMPTYSTRING = ''


# The normal pot-file header. msgmerge and Emacs's po-mode work better if it's
# there.
pot_header = _('''\
# SOME DESCRIPTIVE TITLE.
# Copyright (C) YEAR ORGANIZATION
# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
#
msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"POT-Creation-Date: %(time)s\\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=%(charset)s\\n"
"Content-Transfer-Encoding: %(encoding)s\\n"
"Generated-By: pygettext.py %(version)s\\n"

''')


def usage(code, msg=''):
    """Print the help text to stderr and exit with status *code*.

    The module docstring is interpolated with the module globals (this is
    what fills in ``%(DEFAULTKEYWORDS)s``).  When *msg* is non-empty it is
    printed on stderr after the help text.
    """
    print(__doc__ % globals(), file=sys.stderr)
    if msg:
        print(msg, file=sys.stderr)
    sys.exit(code)


def make_escapes(pass_nonascii):
    """Build the global ``escapes`` table and select the global ``escape``.

    ``escapes`` maps a code point / byte value to the text written to the
    .po file: printable ASCII (32..126) is kept as is, backslash, tab, CR,
    LF and the double quote get their C-style escapes, and everything else
    becomes a three-digit octal escape.

    pass_nonascii -- when true, only the first 128 values are tabulated and
        ``escape`` is escape_ascii(), which leaves non-ASCII characters
        untouched; otherwise all 256 byte values are tabulated and
        ``escape`` is escape_nonascii(), which escapes the encoded bytes.
    """
    global escapes, escape
    if pass_nonascii:
        # Allow non-ascii characters to pass through so that e.g. a msgid
        # containing an o-umlaut ("H<o-umlaut>he") does not end up as
        # 'msgid "H\366he"'.  Otherwise we escape any character outside the
        # 32..126 range.
        mod = 128
        escape = escape_ascii
    else:
        mod = 256
        escape = escape_nonascii
    escapes = [r"\%03o" % i for i in range(mod)]
    for i in range(32, 127):
        escapes[i] = chr(i)
    escapes[ord('\\')] = r'\\'
    escapes[ord('\t')] = r'\t'
    escapes[ord('\r')] = r'\r'
    escapes[ord('\n')] = r'\n'
    escapes[ord('\"')] = r'\"'


def escape_ascii(s, encoding):
    """Escape only the ASCII characters of *s*; *encoding* is unused."""
    return ''.join(escapes[ord(c)] if ord(c) < 128 else c for c in s)


def escape_nonascii(s, encoding):
    """Encode *s* with *encoding* and escape every resulting byte."""
    return ''.join(escapes[b] for b in s.encode(encoding))


def safe_eval(s):
    """Turn the source text of a string token into its value."""
    # unwrap quotes, safely
    # NOTE(review): this is eval() with an empty __builtins__, not a real
    # sandbox.  It is only meant to be fed STRING tokens produced by
    # tokenize; do not call it on arbitrary untrusted text.
    return eval(s, {'__builtins__': {}}, {})


def normalize(s, encoding):
    """Return *s* as a quoted .po-file string (single- or multi-line)."""
    # This converts the various Python string types into a format that is
    # appropriate for .po files, namely much closer to C style.
    lines = s.split('\n')
    if len(lines) == 1:
        return '"' + escape(s, encoding) + '"'
    if not lines[-1]:
        # The text ended with a newline: fold it back into the last real
        # line so that no empty trailing "" line is written.
        del lines[-1]
        lines[-1] = lines[-1] + '\n'
    escaped = [escape(line, encoding) for line in lines]
    lineterm = '\\n"\n"'
    return '""\n"' + lineterm.join(escaped) + '"'


def containsAny(str, set):
    """Check whether 'str' contains ANY of the chars in 'set'"""
    return any(c in str for c in set)


def _visit_pyfiles(list, dirname, names):
    """Helper for getFilesForName().

    Append to *list* the path (joined with *dirname*) of every entry of
    *names* that has the Python source suffix.  A 'CVS' entry is removed
    from *names* in place.
    """
    # get extension for python source files
    py_ext = importlib.machinery.SOURCE_SUFFIXES[0]

    # don't recurse into CVS directories
    if 'CVS' in names:
        names.remove('CVS')

    # add all *.py files to list
    list.extend(
        os.path.join(dirname, file) for file in names
        if os.path.splitext(file)[1] == py_ext
    )


def getFilesForName(name):
    """Get a list of module files for a filename, a module or package name,
    or a directory.

    A name that does not exist on disk is first tried as a glob pattern and
    then as an importable module name (resolved to its spec's origin).  A
    directory is searched recursively for Python source files, skipping
    'CVS' directories.  An empty list is returned when nothing is found.
    """
    if not os.path.exists(name):
        # check for glob chars
        if containsAny(name, "*?[]"):
            found = []
            for file in glob.glob(name):
                found.extend(getFilesForName(file))
            return found

        # try to find module or package
        try:
            spec = importlib.util.find_spec(name)
        except (ImportError, ValueError):
            spec = None
        # find_spec() returns None (rather than raising) for an unknown
        # top-level module, and origin may itself be None.
        name = spec.origin if spec is not None else None
        if not name:
            return []

    if os.path.isdir(name):
        # find all python files in directory
        found = []
        for root, dirs, files in os.walk(name):
            # don't recurse into CVS directories; pruning must be done in
            # place for os.walk() to honour it
            if 'CVS' in dirs:
                dirs.remove('CVS')
            _visit_pyfiles(found, root, files)
        return found
    elif os.path.exists(name):
        # a single file
        return [name]

    return []


class TokenEater:
    """State machine that collects translatable strings from a token stream.

    Instances are called once per token with the fields of a tokenize token
    (type, string, start, end, line).  Collected messages are stored as
    ``{msg: {(filename, lineno): isdocstring}}`` and written out in .pot
    format by write().
    """

    def __init__(self, options):
        self.__options = options
        self.__messages = {}
        self.__state = self.__waiting
        self.__data = []
        self.__lineno = -1
        self.__freshmodule = 1
        self.__curfile = None
        # nesting depth of (), [] and {} while scanning a def/class header
        self.__enclosurecount = 0

    def __call__(self, ttype, tstring, stup, etup, line):
        # dispatch to the current state handler; only the start row of the
        # token is needed
        self.__state(ttype, tstring, stup[0])

    def __waiting(self, ttype, tstring, lineno):
        opts = self.__options
        # Do docstring extractions, if enabled
        if opts.docstrings and not opts.nodocstrings.get(self.__curfile):
            # module docstring?
            if self.__freshmodule:
                if ttype == tokenize.STRING:
                    self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
                    self.__freshmodule = 0
                    return
                # tokenize.tokenize() always starts with an ENCODING token;
                # like comments and blank lines it may precede the docstring.
                if ttype in (tokenize.COMMENT, tokenize.NL,
                             tokenize.ENCODING):
                    return
                # First real token is not a string: no module docstring.
                # Fall through so this token is still examined below.
                self.__freshmodule = 0
            # class or func/method docstring?
            if ttype == tokenize.NAME and tstring in ('class', 'def'):
                self.__enclosurecount = 0
                self.__state = self.__suiteseen
                return
        if ttype == tokenize.NAME and tstring in opts.keywords:
            self.__state = self.__keywordseen

    def __suiteseen(self, ttype, tstring, lineno):
        # Skip the def/class header until the colon that ends it.  Colons
        # inside brackets (annotations, dict/lambda defaults, slices) do not
        # count, so track the bracket nesting depth.
        if ttype != tokenize.OP:
            return
        if tstring == ':' and self.__enclosurecount == 0:
            self.__state = self.__suitedocstring
        elif tstring in ('(', '[', '{'):
            self.__enclosurecount += 1
        elif tstring in (')', ']', '}'):
            self.__enclosurecount -= 1

    def __suitedocstring(self, ttype, tstring, lineno):
        # ignore any intervening noise
        if ttype == tokenize.STRING:
            self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
            self.__state = self.__waiting
        elif ttype not in (tokenize.NEWLINE, tokenize.INDENT,
                           tokenize.COMMENT, tokenize.NL):
            # there was no class docstring; hand the token to __waiting so
            # that a nested def/class or a keyword call that starts the suite
            # is not lost
            self.__state = self.__waiting
            self.__waiting(ttype, tstring, lineno)

    def __keywordseen(self, ttype, tstring, lineno):
        if ttype == tokenize.OP and tstring == '(':
            self.__data = []
            self.__lineno = lineno
            self.__state = self.__openseen
        else:
            self.__state = self.__waiting

    def __openseen(self, ttype, tstring, lineno):
        if ttype == tokenize.OP and tstring == ')':
            # We've seen the last of the translatable strings.  Record the
            # line number of the first line of the strings and update the list
            # of messages seen.  Reset state for the next batch.  If there
            # were no strings inside _(), then just ignore this entry.
            if self.__data:
                self.__addentry(''.join(self.__data))
            self.__state = self.__waiting
        elif ttype == tokenize.STRING:
            self.__data.append(safe_eval(tstring))
        elif ttype not in [tokenize.COMMENT, token.INDENT, token.DEDENT,
                           token.NEWLINE, tokenize.NL]:
            # warn if we see anything else than STRING or whitespace
            print(_(
                '*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"'
                ) % {
                'token': tstring,
                'file': self.__curfile,
                'lineno': self.__lineno
                }, file=sys.stderr)
            self.__state = self.__waiting

    def __addentry(self, msg, lineno=None, isdocstring=0):
        # Record msg at (current file, lineno) unless it is excluded; lineno
        # defaults to the line of the opening parenthesis of the call.
        if lineno is None:
            lineno = self.__lineno
        if msg not in self.__options.toexclude:
            entry = (self.__curfile, lineno)
            self.__messages.setdefault(msg, {})[entry] = isdocstring

    def set_filename(self, filename):
        """Start a new input file: later entries are attributed to it."""
        self.__curfile = filename
        self.__freshmodule = 1

    def write(self, fp):
        """Write the pot header and all collected messages to *fp*."""
        options = self.__options
        timestamp = time.strftime('%Y-%m-%d %H:%M%z')
        encoding = getattr(fp, 'encoding', None) or 'UTF-8'
        print(pot_header % {'time': timestamp, 'version': __version__,
                            'charset': encoding,
                            'encoding': '8bit'}, file=fp)
        # Sort the entries.  First sort each particular entry's keys, then
        # sort all the entries by their first item.
        reverse = {}
        for k, v in self.__messages.items():
            keys = sorted(v.keys())
            reverse.setdefault(tuple(keys), []).append((k, v))
        for rkey in sorted(reverse.keys()):
            # message strings are unique, so ordering by the message alone
            # is a total order
            rentries = sorted(reverse[rkey], key=lambda item: item[0])
            for k, v in rentries:
                # If the entry was gleaned out of a docstring, then add a
                # comment stating so.  This is to aid translators who may wish
                # to skip translating some unimportant docstrings.
                isdocstring = any(v.values())
                # k is the message string, v is a dictionary-set of (filename,
                # lineno) tuples.  We want to sort the entries in v first by
                # file name and then by line number.
                v = sorted(v.keys())
                if not options.writelocations:
                    pass
                # location comments are different b/w Solaris and GNU:
                elif options.locationstyle == options.SOLARIS:
                    for filename, lineno in v:
                        d = {'filename': filename, 'lineno': lineno}
                        print(_(
                            '# File: %(filename)s, line: %(lineno)d') % d,
                            file=fp)
                elif options.locationstyle == options.GNU:
                    # fit as many locations on one line, as long as the
                    # resulting line length doesn't exceed 'options.width'
                    locline = '#:'
                    for filename, lineno in v:
                        d = {'filename': filename, 'lineno': lineno}
                        s = _(' %(filename)s:%(lineno)d') % d
                        if len(locline) + len(s) <= options.width:
                            locline = locline + s
                        else:
                            print(locline, file=fp)
                            locline = "#:" + s
                    if len(locline) > 2:
                        print(locline, file=fp)
                if isdocstring:
                    print('#, docstring', file=fp)
                print('msgid', normalize(k, encoding), file=fp)
                print('msgstr ""\n', file=fp)


def _eat_file(eater, fp, filename):
    """Tokenize the binary stream *fp* and feed every token to *eater*."""
    eater.set_filename(filename)
    try:
        for _token in tokenize.tokenize(fp.readline):
            eater(*_token)
    except tokenize.TokenError as e:
        print('%s: %s, line %d, column %d' % (
            e.args[0], filename, e.args[1][0], e.args[1][1]),
            file=sys.stderr)


def main():
    """Command line entry point: parse options, scan inputs, write the pot.

    Reads its arguments from sys.argv.  Exits through usage() on invalid
    options and with status 1 when the exclude file cannot be read.
    """
    global default_keywords
    try:
        opts, args = getopt.getopt(
            sys.argv[1:],
            'ad:DEhk:Kno:p:S:Vvw:x:X:',
            ['extract-all', 'default-domain=', 'escape', 'help',
             'keyword=', 'no-default-keywords',
             'add-location', 'no-location', 'output=', 'output-dir=',
             'style=', 'verbose', 'version', 'width=', 'exclude-file=',
             # --no-docstrings takes a filename, hence the trailing '='
             'docstrings', 'no-docstrings=',
             ])
    except getopt.error as msg:
        usage(1, msg)

    # for holding option values
    class Options:
        # constants
        GNU = 1
        SOLARIS = 2
        # defaults
        extractall = 0  # FIXME: currently this option has no effect at all.
        escape = 0
        keywords = []
        outpath = ''
        outfile = 'messages.pot'
        writelocations = 1
        locationstyle = GNU
        verbose = 0
        width = 78
        excludefilename = ''
        docstrings = 0
        nodocstrings = {}

    options = Options()
    locations = {'gnu': options.GNU,
                 'solaris': options.SOLARIS,
                 }

    # parse options
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-a', '--extract-all'):
            options.extractall = 1
        elif opt in ('-d', '--default-domain'):
            options.outfile = arg + '.pot'
        elif opt in ('-E', '--escape'):
            options.escape = 1
        elif opt in ('-D', '--docstrings'):
            options.docstrings = 1
        elif opt in ('-k', '--keyword'):
            options.keywords.append(arg)
        elif opt in ('-K', '--no-default-keywords'):
            default_keywords = []
        elif opt in ('-n', '--add-location'):
            options.writelocations = 1
        elif opt in ('--no-location',):
            options.writelocations = 0
        elif opt in ('-S', '--style'):
            options.locationstyle = locations.get(arg.lower())
            if options.locationstyle is None:
                usage(1, _('Invalid value for --style: %s') % arg)
        elif opt in ('-o', '--output'):
            options.outfile = arg
        elif opt in ('-p', '--output-dir'):
            options.outpath = arg
        elif opt in ('-v', '--verbose'):
            options.verbose = 1
        elif opt in ('-V', '--version'):
            print(_('pygettext.py (xgettext for Python) %s') % __version__)
            sys.exit(0)
        elif opt in ('-w', '--width'):
            try:
                options.width = int(arg)
            except ValueError:
                usage(1, _('--width argument must be an integer: %s') % arg)
        elif opt in ('-x', '--exclude-file'):
            options.excludefilename = arg
        elif opt in ('-X', '--no-docstrings'):
            with open(arg) as fp:
                for line in fp:
                    # strip only the line terminator, so a last line without
                    # a trailing newline keeps its final character
                    options.nodocstrings[line.rstrip('\n')] = 1

    # calculate escapes
    make_escapes(not options.escape)

    # calculate all keywords
    options.keywords.extend(default_keywords)

    # initialize list of strings to exclude
    if options.excludefilename:
        try:
            with open(options.excludefilename) as fp:
                # one string per line; drop the line terminator so the
                # entries can actually compare equal to extracted messages
                options.toexclude = [line.rstrip('\n') for line in fp]
        except IOError:
            print(_(
                "Can't read --exclude-file: %s") % options.excludefilename,
                file=sys.stderr)
            sys.exit(1)
    else:
        options.toexclude = []

    # resolve args to module lists
    expanded = []
    for arg in args:
        if arg == '-':
            expanded.append(arg)
        else:
            expanded.extend(getFilesForName(arg))
    args = expanded

    # slurp through all the files
    eater = TokenEater(options)
    for filename in args:
        if filename == '-':
            if options.verbose:
                print(_('Reading standard input'))
            # stdin is not ours to close
            _eat_file(eater, sys.stdin.buffer, filename)
        else:
            if options.verbose:
                print(_('Working on %s') % filename)
            with open(filename, 'rb') as fp:
                _eat_file(eater, fp, filename)

    # write the output
    if options.outfile == '-':
        eater.write(sys.stdout)
    else:
        if options.outpath:
            options.outfile = os.path.join(options.outpath, options.outfile)
        with open(options.outfile, 'w') as fp:
            eater.write(fp)


if __name__ == '__main__':
    main()
    # some more test strings
    # this one creates a warning
    _('*** Seen unexpected token "%(token)s"') % {'token': 'test'}
    _('more' 'than' 'one' 'string')