#! /usr/bin/env python3
# -*- coding: iso-8859-1 -*-
# Originally written by Barry Warsaw <barry (at] python.org>
#
# Minimally patched to make it even more xgettext compatible
# by Peter Funk <pf (at] artcom-gmbh.de>
#
# 2002-11-22 Juergen Hermann <jh (at] web.de>
# Added checks that _() only contains string literals, and
# command line args are resolved to module lists, i.e. you
# can now pass a filename, a module or package name, or a
# directory (including globbing chars, important for Win32).
# Made docstring fit in 80 chars wide displays using pydoc.
#

# for selftesting: translate pygettext's own messages through the fintl
# catalog when it is available; otherwise _() is the identity function.
try:
    import fintl
    _ = fintl.gettext
except ImportError:
    _ = lambda s: s

__doc__ = _("""pygettext -- Python equivalent of xgettext(1)

Many systems (Solaris, Linux, Gnu) provide extensive tools that ease the
internationalization of C programs. Most of these tools are independent of
the programming language and can be used from within Python programs.
Martin von Loewis' work[1] helps considerably in this regard.

There's one problem though; xgettext is the program that scans source code
looking for message strings, but it groks only C (or C++). Python
introduces a few wrinkles, such as dual quoting characters, triple quoted
strings, and raw strings. xgettext understands none of this.

Enter pygettext, which uses Python's standard tokenize module to scan
Python source code, generating .pot files identical to what GNU xgettext[2]
generates for C and C++ code. From there, the standard GNU tools can be
used.

A word about marking Python strings as candidates for translation. GNU
xgettext recognizes the following keywords: gettext, dgettext, dcgettext,
and gettext_noop. But those can be a lot of text to include all over your
code. C and C++ have a trick: they use the C preprocessor. Most
internationalized C source includes a #define for gettext() to _() so that
what has to be written in the source is much less. Thus these are both
translatable strings:

    gettext("Translatable String")
    _("Translatable String")

Python of course has no preprocessor so this doesn't work so well. Thus,
pygettext searches only for _() by default, but see the -k/--keyword flag
below for how to augment this.

[1] http://www.python.org/workshops/1997-10/proceedings/loewis.html
[2] http://www.gnu.org/software/gettext/gettext.html

NOTE: pygettext attempts to be option and feature compatible with GNU
xgettext where ever possible. However some options are still missing or are
not fully implemented. Also, xgettext's use of command line switches with
option arguments is broken, and in these cases, pygettext just defines
additional switches.

Usage: pygettext [options] inputfile ...

Options:

    -a
    --extract-all
        Extract all strings.

    -d name
    --default-domain=name
        Rename the default output file from messages.pot to name.pot.

    -E
    --escape
        Replace non-ASCII characters with octal escape sequences.

    -D
    --docstrings
        Extract module, class, method, and function docstrings. These do
        not need to be wrapped in _() markers, and in fact cannot be for
        Python to consider them docstrings. (See also the -X option).

    -h
    --help
        Print this help message and exit.

    -k word
    --keyword=word
        Keywords to look for in addition to the default set, which are:
        %(DEFAULTKEYWORDS)s

        You can have multiple -k flags on the command line.

    -K
    --no-default-keywords
        Disable the default set of keywords (see above). Any keywords
        explicitly added with the -k/--keyword option are still recognized.

    --no-location
        Do not write filename/lineno location comments.

    -n
    --add-location
        Write filename/lineno location comments indicating where each
        extracted string is found in the source. These lines appear before
        each msgid. The style of comments is controlled by the -S/--style
        option. This is the default.

    -o filename
    --output=filename
        Rename the default output file from messages.pot to filename. If
        filename is `-' then the output is sent to standard out.

    -p dir
    --output-dir=dir
        Output files will be placed in directory dir.

    -S stylename
    --style stylename
        Specify which style to use for location comments. Two styles are
        supported:

        Solaris  # File: filename, line: line-number
        GNU      #: filename:line

        The style name is case insensitive. GNU style is the default.

    -v
    --verbose
        Print the names of the files being processed.

    -V
    --version
        Print the version of pygettext and exit.

    -w columns
    --width=columns
        Set width of output to columns.

    -x filename
    --exclude-file=filename
        Specify a file that contains a list of strings that are not be
        extracted from the input files. Each string to be excluded must
        appear on a line by itself in the file.

    -X filename
    --no-docstrings=filename
        Specify a file that contains a list of files (one per line) that
        should not have their docstrings extracted. This is only useful in
        conjunction with the -D option above.

If `inputfile' is -, standard input is read.
""")

import os
import importlib.machinery
import importlib.util
import sys
import glob
import time
import getopt
import token
import tokenize

__version__ = '1.5'

# Keywords searched for by default; -k adds to this set, -K empties it.
default_keywords = ['_']
DEFAULTKEYWORDS = ', '.join(default_keywords)

EMPTYSTRING = ''


# The normal pot-file header. msgmerge and Emacs's po-mode work better if it's
# there.
pot_header = _('''\
# SOME DESCRIPTIVE TITLE.
# Copyright (C) YEAR ORGANIZATION
# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
#
msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\\n"
"POT-Creation-Date: %(time)s\\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\\n"
"Language-Team: LANGUAGE <LL@li.org>\\n"
"MIME-Version: 1.0\\n"
"Content-Type: text/plain; charset=%(charset)s\\n"
"Content-Transfer-Encoding: %(encoding)s\\n"
"Generated-By: pygettext.py %(version)s\\n"

''')


def usage(code, msg=''):
    """Print the module docstring as a usage message (plus an optional
    extra message) to stderr, then exit with `code'."""
    print(__doc__ % globals(), file=sys.stderr)
    if msg:
        print(msg, file=sys.stderr)
    sys.exit(code)


def make_escapes(pass_nonascii):
    """Initialize the module-global `escapes` table and `escape` function.

    When pass_nonascii is true, only ASCII characters outside the 32..126
    range are octal-escaped and non-ASCII characters pass through
    unchanged; otherwise every encoded byte outside 32..126 is escaped.
    """
    global escapes, escape
    if pass_nonascii:
        # Allow non-ascii characters to pass through so that e.g. 'msgid
        # "Hoehe"' would not result in 'msgid "H\366he"'.  Otherwise we
        # escape any character outside the 32..126 range.
        mod = 128
        escape = escape_ascii
    else:
        mod = 256
        escape = escape_nonascii
    escapes = [r"\%03o" % i for i in range(mod)]
    for i in range(32, 127):
        escapes[i] = chr(i)
    escapes[ord('\\')] = r'\\'
    escapes[ord('\t')] = r'\t'
    escapes[ord('\r')] = r'\r'
    escapes[ord('\n')] = r'\n'
    escapes[ord('\"')] = r'\"'


def escape_ascii(s, encoding):
    """Escape ASCII control/special characters; non-ASCII passes through."""
    return ''.join(escapes[ord(c)] if ord(c) < 128 else c for c in s)


def escape_nonascii(s, encoding):
    """Escape every byte of s (encoded with `encoding') outside 32..126."""
    return ''.join(escapes[b] for b in s.encode(encoding))


def is_literal_string(s):
    """Return true if the token text s is a plain or r/R/u/U-prefixed
    string literal (other prefixes are deliberately not recognized)."""
    return s[0] in '\'"' or (s[0] in 'rRuU' and s[1] in '\'"')


def safe_eval(s):
    # unwrap quotes, safely: the expression is evaluated with no builtins
    # and no names in scope, so only literals can be produced.
    return eval(s, {'__builtins__': {}}, {})


def normalize(s, encoding):
    """Convert a Python string into a .po msgid, i.e. a C-style
    double-quoted form with one quoted fragment per source line."""
    lines = s.split('\n')
    if len(lines) == 1:
        s = '"' + escape(s, encoding) + '"'
    else:
        # Multi-line msgid: start with an empty "" line, then quote each
        # line separately, re-escaping the newlines split() removed.
        if not lines[-1]:
            del lines[-1]
        lines[-1] = lines[-1] + '\n'
        for i in range(len(lines)):
            lines[i] = escape(lines[i], encoding)
        lineterm = '\\n"\n"'
        s = '""\n"' + lineterm.join(lines) + '"'
    return s


def containsAny(str, set):
    """Check whether 'str' contains ANY of the chars in 'set'"""
    # NOTE: the parameter names shadow the builtins; they are kept for
    # backward compatibility with existing callers.
    return any(c in str for c in set)


def getFilesForName(name):
    """Get a list of module files for a filename, a module or package name,
    or a directory.
    """
    if not os.path.exists(name):
        # check for glob chars
        if containsAny(name, "*?[]"):
            files = glob.glob(name)
            filelist = []
            for f in files:
                filelist.extend(getFilesForName(f))
            return filelist

        # try to find module or package.  find_spec() may either raise
        # ImportError or return None when the module cannot be located,
        # and a namespace package's spec has origin None -- treat all of
        # those as "not found" instead of crashing on spec.origin.
        try:
            spec = importlib.util.find_spec(name)
        except ImportError:
            spec = None
        name = spec.origin if spec is not None else None
        if not name:
            return []

    if os.path.isdir(name):
        # find all python files in directory
        filelist = []
        # get extension for python source files
        _py_ext = importlib.machinery.SOURCE_SUFFIXES[0]
        for root, dirs, files in os.walk(name):
            # don't recurse into CVS directories
            if 'CVS' in dirs:
                dirs.remove('CVS')
            # add all *.py files to list
            filelist.extend(
                [os.path.join(root, f) for f in files
                 if os.path.splitext(f)[1] == _py_ext]
            )
        return filelist
    elif os.path.exists(name):
        # a single file
        return [name]

    return []


class TokenEater:
    """State machine fed one token at a time by tokenize; collects
    translatable strings found in keyword calls such as _() and,
    optionally, in module/class/function docstrings."""

    def __init__(self, options):
        self.__options = options
        self.__messages = {}           # msg -> {(filename, lineno): isdocstring}
        self.__state = self.__waiting  # current state handler
        self.__data = []               # string pieces seen inside _( ... )
        self.__lineno = -1             # line of the opening paren of _( ... )
        self.__freshmodule = 1         # true until past a possible module docstring
        self.__curfile = None
        self.__enclosurecount = 0      # bracket depth inside a def/class header

    def __call__(self, ttype, tstring, stup, etup, line):
        # dispatch to the current state handler
        self.__state(ttype, tstring, stup[0])

    def __waiting(self, ttype, tstring, lineno):
        """Default state: watch for docstrings and extraction keywords."""
        opts = self.__options
        # Do docstring extractions, if enabled
        if opts.docstrings and not opts.nodocstrings.get(self.__curfile):
            # module docstring?
            if self.__freshmodule:
                if ttype == tokenize.STRING and is_literal_string(tstring):
                    self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
                    self.__freshmodule = 0
                elif ttype not in (tokenize.COMMENT, tokenize.NL):
                    # any other real token means the module has no docstring
                    self.__freshmodule = 0
                return
            # class or func/method docstring?
            if ttype == tokenize.NAME and tstring in ('class', 'def'):
                self.__state = self.__suiteseen
                return
        if ttype == tokenize.NAME and tstring in opts.keywords:
            self.__state = self.__keywordseen

    def __suiteseen(self, ttype, tstring, lineno):
        # skip over any enclosure pairs until we see the colon
        if ttype == tokenize.OP:
            if tstring == ':' and self.__enclosurecount == 0:
                # we see a colon and we're not in an enclosure: end of def
                self.__state = self.__suitedocstring
            elif tstring in '([{':
                self.__enclosurecount += 1
            elif tstring in ')]}':
                self.__enclosurecount -= 1

    def __suitedocstring(self, ttype, tstring, lineno):
        # ignore any intervening noise
        if ttype == tokenize.STRING and is_literal_string(tstring):
            self.__addentry(safe_eval(tstring), lineno, isdocstring=1)
            self.__state = self.__waiting
        elif ttype not in (tokenize.NEWLINE, tokenize.INDENT,
                           tokenize.COMMENT):
            # there was no class docstring
            self.__state = self.__waiting

    def __keywordseen(self, ttype, tstring, lineno):
        # only a keyword immediately followed by '(' starts an extraction
        if ttype == tokenize.OP and tstring == '(':
            self.__data = []
            self.__lineno = lineno
            self.__state = self.__openseen
        else:
            self.__state = self.__waiting

    def __openseen(self, ttype, tstring, lineno):
        if ttype == tokenize.OP and tstring == ')':
            # We've seen the last of the translatable strings.  Record the
            # line number of the first line of the strings and update the list
            # of messages seen.  Reset state for the next batch.  If there
            # were no strings inside _(), then just ignore this entry.
            if self.__data:
                self.__addentry(EMPTYSTRING.join(self.__data))
            self.__state = self.__waiting
        elif ttype == tokenize.STRING and is_literal_string(tstring):
            self.__data.append(safe_eval(tstring))
        elif ttype not in [tokenize.COMMENT, token.INDENT, token.DEDENT,
                           token.NEWLINE, tokenize.NL]:
            # warn if we see anything else than STRING or whitespace
            print(_(
                '*** %(file)s:%(lineno)s: Seen unexpected token "%(token)s"'
                ) % {
                'token': tstring,
                'file': self.__curfile,
                'lineno': self.__lineno
                }, file=sys.stderr)
            self.__state = self.__waiting

    def __addentry(self, msg, lineno=None, isdocstring=0):
        """Record one occurrence of msg, unless it is excluded."""
        if lineno is None:
            lineno = self.__lineno
        if msg not in self.__options.toexclude:
            entry = (self.__curfile, lineno)
            self.__messages.setdefault(msg, {})[entry] = isdocstring

    def set_filename(self, filename):
        """Start scanning a new file; re-arm module-docstring detection."""
        self.__curfile = filename
        self.__freshmodule = 1

    def write(self, fp):
        """Emit all collected messages to fp in .pot format."""
        options = self.__options
        timestamp = time.strftime('%Y-%m-%d %H:%M%z')
        encoding = fp.encoding if fp.encoding else 'UTF-8'
        print(pot_header % {'time': timestamp, 'version': __version__,
                            'charset': encoding,
                            'encoding': '8bit'}, file=fp)
        # Sort the entries.  First sort each particular entry's keys, then
        # sort all the entries by their first item.
        reverse = {}
        for k, v in self.__messages.items():
            keys = sorted(v.keys())
            reverse.setdefault(tuple(keys), []).append((k, v))
        rkeys = sorted(reverse.keys())
        for rkey in rkeys:
            rentries = reverse[rkey]
            rentries.sort()
            for k, v in rentries:
                # If the entry was gleaned out of a docstring, then add a
                # comment stating so.  This is to aid translators who may wish
                # to skip translating some unimportant docstrings.
                isdocstring = any(v.values())
                # k is the message string, v is a dictionary-set of (filename,
                # lineno) tuples.  We want to sort the entries in v first by
                # file name and then by line number.
                v = sorted(v.keys())
                if not options.writelocations:
                    pass
                # location comments are different b/w Solaris and GNU:
                elif options.locationstyle == options.SOLARIS:
                    for filename, lineno in v:
                        d = {'filename': filename, 'lineno': lineno}
                        print(_(
                            '# File: %(filename)s, line: %(lineno)d') % d, file=fp)
                elif options.locationstyle == options.GNU:
                    # fit as many locations on one line, as long as the
                    # resulting line length doesn't exceed 'options.width'
                    locline = '#:'
                    for filename, lineno in v:
                        d = {'filename': filename, 'lineno': lineno}
                        s = _(' %(filename)s:%(lineno)d') % d
                        if len(locline) + len(s) <= options.width:
                            locline = locline + s
                        else:
                            print(locline, file=fp)
                            locline = "#:" + s
                    if len(locline) > 2:
                        print(locline, file=fp)
                if isdocstring:
                    print('#, docstring', file=fp)
                print('msgid', normalize(k, encoding), file=fp)
                print('msgstr ""\n', file=fp)


def main():
    """Command-line entry point: parse options, scan the input files, and
    write the resulting .pot file."""
    global default_keywords
    try:
        opts, args = getopt.getopt(
            sys.argv[1:],
            'ad:DEhk:Kno:p:S:Vvw:x:X:',
            ['extract-all', 'default-domain=', 'escape', 'help',
             'keyword=', 'no-default-keywords',
             'add-location', 'no-location', 'output=', 'output-dir=',
             'style=', 'verbose', 'version', 'width=', 'exclude-file=',
             'docstrings', 'no-docstrings',
             ])
    except getopt.error as msg:
        usage(1, msg)

    # for holding option values
    class Options:
        # constants
        GNU = 1
        SOLARIS = 2
        # defaults
        extractall = 0 # FIXME: currently this option has no effect at all.
        escape = 0
        keywords = []
        outpath = ''
        outfile = 'messages.pot'
        writelocations = 1
        locationstyle = GNU
        verbose = 0
        width = 78
        excludefilename = ''
        docstrings = 0
        nodocstrings = {}

    options = Options()
    locations = {'gnu' : options.GNU,
                 'solaris' : options.SOLARIS,
                 }

    # parse options
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage(0)
        elif opt in ('-a', '--extract-all'):
            options.extractall = 1
        elif opt in ('-d', '--default-domain'):
            options.outfile = arg + '.pot'
        elif opt in ('-E', '--escape'):
            options.escape = 1
        elif opt in ('-D', '--docstrings'):
            options.docstrings = 1
        elif opt in ('-k', '--keyword'):
            options.keywords.append(arg)
        elif opt in ('-K', '--no-default-keywords'):
            default_keywords = []
        elif opt in ('-n', '--add-location'):
            options.writelocations = 1
        elif opt in ('--no-location',):
            options.writelocations = 0
        elif opt in ('-S', '--style'):
            options.locationstyle = locations.get(arg.lower())
            if options.locationstyle is None:
                usage(1, _('Invalid value for --style: %s') % arg)
        elif opt in ('-o', '--output'):
            options.outfile = arg
        elif opt in ('-p', '--output-dir'):
            options.outpath = arg
        elif opt in ('-v', '--verbose'):
            options.verbose = 1
        elif opt in ('-V', '--version'):
            print(_('pygettext.py (xgettext for Python) %s') % __version__)
            sys.exit(0)
        elif opt in ('-w', '--width'):
            try:
                options.width = int(arg)
            except ValueError:
                usage(1, _('--width argument must be an integer: %s') % arg)
        elif opt in ('-x', '--exclude-file'):
            options.excludefilename = arg
        elif opt in ('-X', '--no-docstrings'):
            # one filename per line; strip only the newline so a final
            # line without one is not silently truncated
            with open(arg) as fp:
                for line in fp:
                    options.nodocstrings[line.rstrip('\n')] = 1

    # calculate escapes
    make_escapes(not options.escape)

    # calculate all keywords
    options.keywords.extend(default_keywords)

    # initialize list of strings to exclude
    if options.excludefilename:
        try:
            with open(options.excludefilename) as fp:
                options.toexclude = fp.readlines()
        except IOError:
            print(_(
                "Can't read --exclude-file: %s") % options.excludefilename, file=sys.stderr)
            sys.exit(1)
    else:
        options.toexclude = []

    # resolve args to module lists
    expanded = []
    for arg in args:
        if arg == '-':
            expanded.append(arg)
        else:
            expanded.extend(getFilesForName(arg))
    args = expanded

    # slurp through all the files
    eater = TokenEater(options)
    for filename in args:
        if filename == '-':
            if options.verbose:
                print(_('Reading standard input'))
            fp = sys.stdin.buffer
            closep = 0
        else:
            if options.verbose:
                print(_('Working on %s') % filename)
            fp = open(filename, 'rb')
            closep = 1
        try:
            eater.set_filename(filename)
            try:
                tokens = tokenize.tokenize(fp.readline)
                for _token in tokens:
                    eater(*_token)
            except tokenize.TokenError as e:
                print('%s: %s, line %d, column %d' % (
                    e.args[0], filename, e.args[1][0], e.args[1][1]),
                    file=sys.stderr)
        finally:
            if closep:
                fp.close()

    # write the output; keep the manual close dance because fp may be the
    # non-closable sys.stdout
    if options.outfile == '-':
        fp = sys.stdout
        closep = 0
    else:
        if options.outpath:
            options.outfile = os.path.join(options.outpath, options.outfile)
        fp = open(options.outfile, 'w')
        closep = 1
    try:
        eater.write(fp)
    finally:
        if closep:
            fp.close()


if __name__ == '__main__':
    main()
    # some more test strings
    # this one creates a warning
    _('*** Seen unexpected token "%(token)s"') % {'token': 'test'}
    _('more' 'than' 'one' 'string')