"""Internationalization and localization support.

This module provides internationalization (I18N) and localization (L10N)
support for your Python programs by providing an interface to the GNU gettext
message catalog library.

I18N refers to the operation by which a program is made aware of multiple
languages.  L10N refers to the adaptation of your program, once
internationalized, to the local language and cultural habits.

"""

# This module represents the integration of work, contributions, feedback, and
# suggestions from the following people:
#
# Martin von Loewis, who wrote the initial implementation of the underlying
# C-based libintlmodule (later renamed _gettext), along with a skeletal
# gettext.py implementation.
#
# Peter Funk, who wrote fintl.py, a fairly complete wrapper around intlmodule,
# which also included a pure-Python implementation to read .mo files if
# intlmodule wasn't available.
#
# James Henstridge, who also wrote a gettext.py module, which has some
# interesting, but currently unsupported experimental features: the notion of
# a Catalog class and instances, and the ability to add to a catalog file via
# a Python API.
#
# Barry Warsaw integrated these modules, wrote the .install() API and code,
# and conformed all C and Python code to Python's coding standards.
#
# Francois Pinard and Marc-Andre Lemburg also contributed valuably to this
# module.
#
# J. David Ibanez implemented plural forms.  Bruno Haible fixed some bugs.
#
# TODO:
# - Lazy loading of .mo files.  Currently the entire catalog is loaded into
#   memory, but that's probably bad for large translated programs.  Instead,
#   the lexical sort of original strings in GNU .mo files should be exploited
#   to do binary searches and lazy initializations.  Or you might want to use
#   the undocumented double-hash algorithm for .mo files with hash tables, but
#   you'll need to study the GNU gettext code to do this.
#
# - Support Solaris .mo file formats.  Unfortunately, we've been unable to
#   find this format documented anywhere.


import locale
import copy
import os
import re
import struct
import sys
from errno import ENOENT


__all__ = ['NullTranslations', 'GNUTranslations', 'Catalog',
           'find', 'translation', 'install', 'textdomain', 'bindtextdomain',
           'dgettext', 'dngettext', 'gettext', 'ngettext',
           ]

_default_localedir = os.path.join(sys.prefix, 'share', 'locale')

# The complete token set of a GNU gettext plural expression: whitespace,
# decimal numbers, the identifier "n", parentheses and the C operators.
# Anything else (quotes, dots, commas, other names, ...) is rejected by
# c2py() before the expression gets anywhere near eval().
_plural_token = re.compile(
    r'\s+|[0-9]+|n(?![0-9A-Za-z_])|[()]|[-+*/%?:]|[<>!]=?|==|&&|\|\|')

# Operators that are built from valid C tokens but mean something else (and
# potentially something expensive) once the text is evaluated as Python.
_plural_non_c = re.compile(r'\*\*|//|<<|>>|<>')


def test(condition, true, false):
    """
    Implements the C expression:

      condition ? true : false

    Required to correctly interpret plural forms.
    """
    if condition:
        return true
    else:
        return false


def c2py(plural):
    """Gets a C expression as used in PO files for plural forms and returns a
    Python lambda function that implements an equivalent expression.

    The expression comes from a message catalog, i.e. from data that may not
    be trustworthy, and is ultimately passed to eval().  It is therefore
    validated first: only the tokens of the gettext plural grammar are
    accepted.  ValueError is raised for anything else and for unbalanced
    parentheses.
    """
    # Security check: the whole string must consist of plural-grammar tokens.
    # Rejecting string literals matters in particular, because the textual
    # rewriting below would otherwise turn their contents into code.
    pos = 0
    end = len(plural)
    while pos < end:
        match = _plural_token.match(plural, pos)
        if match is None:
            raise ValueError('plural forms expression could be dangerous')
        pos = match.end()
    if _plural_non_c.search(plural):
        raise ValueError('plural forms expression could be dangerous')

    # Replace some C operators by their Python equivalents
    plural = plural.replace('&&', ' and ')
    plural = plural.replace('||', ' or ')

    expr = re.compile(r'\!([^=])')
    plural = expr.sub(' not \\1', plural)

    # Regular expression and replacement function used to transform
    # "a?b:c" to "test(a,b,c)".
    expr = re.compile(r'(.*?)\?(.*?):(.*)')
    def repl(x):
        return "test(%s, %s, %s)" % (x.group(1), x.group(2),
                                     expr.sub(repl, x.group(3)))

    # Code to transform the plural expression, taking care of parentheses
    stack = ['']
    for c in plural:
        if c == '(':
            stack.append('')
        elif c == ')':
            if len(stack) == 1:
                # A ")" without a matching "(".
                raise ValueError('unbalanced parenthesis in plural form')
            s = expr.sub(repl, stack.pop())
            stack[-1] += '(%s)' % s
        else:
            stack[-1] += c
    if len(stack) != 1:
        # A "(" that was never closed.
        raise ValueError(
            'plural forms expression error, maybe unbalanced parenthesis')
    plural = expr.sub(repl, stack.pop())

    # NOTE: eval() on catalog data; safe only because of the validation above.
    return eval('lambda n: int(%s)' % plural)



def _expand_lang(locale):
    """Return the list of locale names to try for `locale`.

    The name is normalized and split into language, territory, codeset and
    modifier; every combination of the components that are present is
    returned, most specific first (e.g. "de_DE.ISO8859-1", "de_DE",
    "de.ISO8859-1", "de").
    """
    # The parameter shadows the module-level "locale" import, hence the
    # local import of normalize().
    from locale import normalize
    locale = normalize(locale)
    COMPONENT_CODESET   = 1 << 0
    COMPONENT_TERRITORY = 1 << 1
    COMPONENT_MODIFIER  = 1 << 2
    # split up the locale into its base components
    mask = 0
    pos = locale.find('@')
    if pos >= 0:
        modifier = locale[pos:]
        locale = locale[:pos]
        mask |= COMPONENT_MODIFIER
    else:
        modifier = ''
    pos = locale.find('.')
    if pos >= 0:
        codeset = locale[pos:]
        locale = locale[:pos]
        mask |= COMPONENT_CODESET
    else:
        codeset = ''
    pos = locale.find('_')
    if pos >= 0:
        territory = locale[pos:]
        locale = locale[:pos]
        mask |= COMPONENT_TERRITORY
    else:
        territory = ''
    language = locale
    ret = []
    for i in range(mask+1):
        if not (i & ~mask):  # if all components for this combo exist ...
            val = language
            if i & COMPONENT_TERRITORY: val += territory
            if i & COMPONENT_CODESET:   val += codeset
            if i & COMPONENT_MODIFIER:  val += modifier
            ret.append(val)
    ret.reverse()
    return ret



class NullTranslations:
    """Base translation class: returns every message unchanged.

    Lookups are delegated to the fallback translation, if one has been
    added.  Subclasses override _parse() to load a catalog from a file.
    """

    def __init__(self, fp=None):
        """Initialize the instance and parse `fp` if it is not None."""
        self._info = {}
        self._charset = None
        self._output_charset = None
        self._fallback = None
        if fp is not None:
            self._parse(fp)

    def _parse(self, fp):
        """Read the catalog from `fp`; a no-op in the base class."""
        pass

    def add_fallback(self, fallback):
        """Append `fallback` to the end of the fallback chain."""
        if self._fallback:
            self._fallback.add_fallback(fallback)
        else:
            self._fallback = fallback

    def gettext(self, message):
        """Return the fallback's translation of `message`, else `message`."""
        if self._fallback:
            return self._fallback.gettext(message)
        return message

    def lgettext(self, message):
        """Like gettext(), delegating to the fallback's lgettext()."""
        if self._fallback:
            return self._fallback.lgettext(message)
        return message

    def ngettext(self, msgid1, msgid2, n):
        """Plural lookup: `msgid1` if n == 1, else `msgid2`, absent a fallback."""
        if self._fallback:
            return self._fallback.ngettext(msgid1, msgid2, n)
        if n == 1:
            return msgid1
        else:
            return msgid2

    def lngettext(self, msgid1, msgid2, n):
        """Like ngettext(), delegating to the fallback's lngettext()."""
        if self._fallback:
            return self._fallback.lngettext(msgid1, msgid2, n)
        if n == 1:
            return msgid1
        else:
            return msgid2

    def ugettext(self, message):
        """Like gettext(), but the result is a Unicode string."""
        if self._fallback:
            return self._fallback.ugettext(message)
        return unicode(message)

    def ungettext(self, msgid1, msgid2, n):
        """Like ngettext(), but the result is a Unicode string."""
        if self._fallback:
            return self._fallback.ungettext(msgid1, msgid2, n)
        if n == 1:
            return unicode(msgid1)
        else:
            return unicode(msgid2)

    def info(self):
        """Return the catalog metadata dictionary."""
        return self._info

    def charset(self):
        """Return the charset recorded for the catalog, or None."""
        return self._charset

    def output_charset(self):
        """Return the charset used to encode returned messages, or None."""
        return self._output_charset

    def set_output_charset(self, charset):
        """Set the charset used to encode returned messages."""
        self._output_charset = charset

    def install(self, unicode=False, names=None):
        """Bind _() in the builtin namespace to this object's lookup method.

        With a true `unicode` flag the Unicode variants are installed.
        `names` may be a container listing further functions to install:
        "gettext", "ngettext", "lgettext" and/or "lngettext".
        """
        import __builtin__
        namespace = __builtin__.__dict__
        namespace['_'] = self.ugettext if unicode else self.gettext
        if hasattr(names, "__contains__"):
            if "gettext" in names:
                namespace['gettext'] = namespace['_']
            if "ngettext" in names:
                namespace['ngettext'] = (self.ungettext if unicode
                                         else self.ngettext)
            if "lgettext" in names:
                namespace['lgettext'] = self.lgettext
            if "lngettext" in names:
                namespace['lngettext'] = self.lngettext


class GNUTranslations(NullTranslations):
    """Translations read from a GNU gettext .mo file."""

    # Magic number of .mo files
    LE_MAGIC = 0x950412de
    BE_MAGIC = 0xde120495

    def _parse(self, fp):
        """Override this method to support alternative .mo formats."""
        unpack = struct.unpack
        filename = getattr(fp, 'name', '')
        # Parse the .mo file header, which consists of 5 little endian 32
        # bit words.
        self._catalog = catalog = {}
        self.plural = lambda n: int(n != 1)  # germanic plural by default
        buf = fp.read()
        buflen = len(buf)
        # Are we big endian or little endian?
        magic = unpack('<I', buf[:4])[0]
        if magic == self.LE_MAGIC:
            version, msgcount, masteridx, transidx = unpack('<4I', buf[4:20])
            ii = '<II'
        elif magic == self.BE_MAGIC:
            version, msgcount, masteridx, transidx = unpack('>4I', buf[4:20])
            ii = '>II'
        else:
            raise IOError(0, 'Bad magic number', filename)
        # Now put all messages from the .mo file buffer into the catalog
        # dictionary.
        for i in xrange(0, msgcount):
            mlen, moff = unpack(ii, buf[masteridx:masteridx+8])
            mend = moff + mlen
            tlen, toff = unpack(ii, buf[transidx:transidx+8])
            tend = toff + tlen
            if mend < buflen and tend < buflen:
                msg = buf[moff:mend]
                tmsg = buf[toff:tend]
            else:
                raise IOError(0, 'File is corrupt', filename)
            # See if we're looking at GNU .mo conventions for metadata
            if mlen == 0:
                # Catalog description
                lastk = None
                for item in tmsg.splitlines():
                    item = item.strip()
                    if not item:
                        continue
                    if ':' in item:
                        k, v = item.split(':', 1)
                        k = k.strip().lower()
                        v = v.strip()
                        self._info[k] = v
                        lastk = k
                        # Header-specific handling happens only for a line
                        # that really carries the header.  Doing it for
                        # continuation lines too would reuse the previous
                        # key/value pair.
                        if k == 'content-type':
                            self._charset = v.split('charset=')[1]
                        elif k == 'plural-forms':
                            plural = v.split(';')[1].split('plural=')[1]
                            self.plural = c2py(plural)
                    elif lastk:
                        # Continuation of the previous header's value.
                        self._info[lastk] += '\n' + item
            # Note: we unconditionally convert both msgids and msgstrs to
            # Unicode using the character encoding specified in the charset
            # parameter of the Content-Type header.  The gettext documentation
            # strongly encourages msgids to be us-ascii, but some applications
            # require alternative encodings (e.g. Zope's ZCML and ZPT).  For
            # traditional gettext applications, the msgid conversion will
            # cause no problems since us-ascii should always be a subset of
            # the charset encoding.  We may want to fall back to 8-bit msgids
            # if the Unicode conversion fails.
            if '\x00' in msg:
                # Plural forms
                msgid1, msgid2 = msg.split('\x00')
                tmsg = tmsg.split('\x00')
                if self._charset:
                    msgid1 = unicode(msgid1, self._charset)
                    tmsg = [unicode(x, self._charset) for x in tmsg]
                for form, text in enumerate(tmsg):
                    catalog[(msgid1, form)] = text
            else:
                if self._charset:
                    msg = unicode(msg, self._charset)
                    tmsg = unicode(tmsg, self._charset)
                catalog[msg] = tmsg
            # advance to next entry in the seek tables
            masteridx += 8
            transidx += 8

    def gettext(self, message):
        """Return the translation of `message` as an 8-bit string.

        The result is encoded with the output charset if one is set, else
        with the catalog charset.  Untranslated messages go to the fallback,
        or are returned unchanged.
        """
        missing = object()
        tmsg = self._catalog.get(message, missing)
        if tmsg is missing:
            if self._fallback:
                return self._fallback.gettext(message)
            return message
        # Encode the Unicode tmsg back to an 8-bit string, if possible
        if self._output_charset:
            return tmsg.encode(self._output_charset)
        elif self._charset:
            return tmsg.encode(self._charset)
        return tmsg

    def lgettext(self, message):
        """Like gettext(), but encode with the preferred locale encoding
        when no output charset is set."""
        missing = object()
        tmsg = self._catalog.get(message, missing)
        if tmsg is missing:
            if self._fallback:
                return self._fallback.lgettext(message)
            return message
        if self._output_charset:
            return tmsg.encode(self._output_charset)
        return tmsg.encode(locale.getpreferredencoding())

    def ngettext(self, msgid1, msgid2, n):
        """Return the plural form of `msgid1` selected by `n`.

        Encoding follows gettext().  Without a catalog entry the fallback is
        consulted; failing that, `msgid1` is returned for n == 1 and
        `msgid2` otherwise.
        """
        try:
            tmsg = self._catalog[(msgid1, self.plural(n))]
            if self._output_charset:
                return tmsg.encode(self._output_charset)
            elif self._charset:
                return tmsg.encode(self._charset)
            return tmsg
        except KeyError:
            if self._fallback:
                return self._fallback.ngettext(msgid1, msgid2, n)
            if n == 1:
                return msgid1
            else:
                return msgid2

    def lngettext(self, msgid1, msgid2, n):
        """Like ngettext(), but encode with the preferred locale encoding
        when no output charset is set."""
        try:
            tmsg = self._catalog[(msgid1, self.plural(n))]
            if self._output_charset:
                return tmsg.encode(self._output_charset)
            return tmsg.encode(locale.getpreferredencoding())
        except KeyError:
            if self._fallback:
                return self._fallback.lngettext(msgid1, msgid2, n)
            if n == 1:
                return msgid1
            else:
                return msgid2

    def ugettext(self, message):
        """Return the catalog entry for `message` without re-encoding it."""
        missing = object()
        tmsg = self._catalog.get(message, missing)
        if tmsg is missing:
            if self._fallback:
                return self._fallback.ugettext(message)
            return unicode(message)
        return tmsg

    def ungettext(self, msgid1, msgid2, n):
        """Return the plural catalog entry selected by `n`, not re-encoded."""
        try:
            tmsg = self._catalog[(msgid1, self.plural(n))]
        except KeyError:
            if self._fallback:
                return self._fallback.ungettext(msgid1, msgid2, n)
            if n == 1:
                tmsg = unicode(msgid1)
            else:
                tmsg = unicode(msgid2)
        return tmsg


# Locate a .mo file using the gettext strategy
def find(domain, localedir=None, languages=None, all=0):
    """Locate the .mo file(s) for `domain`.

    `localedir` defaults to the system locale directory.  `languages`
    defaults to the first non-empty one of the environment variables
    LANGUAGE, LC_ALL, LC_MESSAGES and LANG (colon separated), with "C"
    appended.  Each language is expanded with _expand_lang(); the search
    stops at "C".

    Returns the first existing file name, or None.  With a true `all`,
    returns the list of all existing file names instead (possibly empty).
    """
    # Get some reasonable defaults for arguments that were not supplied
    if localedir is None:
        localedir = _default_localedir
    if languages is None:
        languages = []
        for envar in ('LANGUAGE', 'LC_ALL', 'LC_MESSAGES', 'LANG'):
            val = os.environ.get(envar)
            if val:
                languages = val.split(':')
                break
        if 'C' not in languages:
            languages.append('C')
    # now normalize and expand the languages
    nelangs = []
    for lang in languages:
        for nelang in _expand_lang(lang):
            if nelang not in nelangs:
                nelangs.append(nelang)
    # select a language
    if all:
        result = []
    else:
        result = None
    for lang in nelangs:
        if lang == 'C':
            break
        mofile = os.path.join(localedir, lang, 'LC_MESSAGES', '%s.mo' % domain)
        if os.path.exists(mofile):
            if all:
                result.append(mofile)
            else:
                return mofile
    return result



# a mapping between absolute .mo file path and Translation object
_translations = {}

def translation(domain, localedir=None, languages=None,
                class_=None, fallback=False, codeset=None):
    """Return a translation object for `domain`.

    All catalogs located by find() are loaded with `class_` (default
    GNUTranslations); the first becomes the result and the others are
    chained to it as fallbacks.  `codeset`, if given, becomes the output
    charset of every object in the chain.

    If no catalog is found, a NullTranslations instance is returned when
    `fallback` is true; otherwise IOError is raised.
    """
    if class_ is None:
        class_ = GNUTranslations
    mofiles = find(domain, localedir, languages, all=1)
    if not mofiles:
        if fallback:
            return NullTranslations()
        raise IOError(ENOENT, 'No translation file found for domain', domain)
    # Avoid opening, reading, and parsing the .mo file after it's been done
    # once.
    result = None
    for mofile in mofiles:
        key = (class_, os.path.abspath(mofile))
        t = _translations.get(key)
        if t is None:
            with open(mofile, 'rb') as fp:
                t = _translations.setdefault(key, class_(fp))
        # Copy the translation object to allow setting fallbacks and
        # output charset. All other instance data is shared with the
        # cached object.
        t = copy.copy(t)
        if codeset:
            t.set_output_charset(codeset)
        if result is None:
            result = t
        else:
            result.add_fallback(t)
    return result


def install(domain, localedir=None, unicode=False, codeset=None, names=None):
    """Install _() (and optionally `names`) in the builtin namespace for
    `domain`, falling back to NullTranslations if no catalog is found."""
    t = translation(domain, localedir, fallback=True, codeset=codeset)
    t.install(unicode, names)



# a mapping b/w domains and locale directories
_localedirs = {}
# a mapping b/w domains and codesets
_localecodesets = {}
# current global domain, `messages' used for compatibility w/ GNU gettext
_current_domain = 'messages'


def textdomain(domain=None):
    """Set the global domain if `domain` is not None; return the current one."""
    global _current_domain
    if domain is not None:
        _current_domain = domain
    return _current_domain


def bindtextdomain(domain, localedir=None):
    """Bind `domain` to `localedir` if given; return the directory bound to
    `domain` (the default locale directory if none is bound)."""
    if localedir is not None:
        _localedirs[domain] = localedir
    return _localedirs.get(domain, _default_localedir)


def bind_textdomain_codeset(domain, codeset=None):
    """Bind `domain` to `codeset` if given; return the codeset bound to
    `domain`, or None."""
    if codeset is not None:
        _localecodesets[domain] = codeset
    return _localecodesets.get(domain)


def _domain_translation(domain):
    """Return the translation for `domain` using its bound directory and
    codeset; raises IOError if no catalog is found."""
    return translation(domain, _localedirs.get(domain, None),
                       codeset=_localecodesets.get(domain))


def dgettext(domain, message):
    """Translate `message` in `domain`; return it unchanged if the domain
    has no catalog."""
    try:
        t = _domain_translation(domain)
    except IOError:
        return message
    return t.gettext(message)

def ldgettext(domain, message):
    """Like dgettext(), using the translation's lgettext()."""
    try:
        t = _domain_translation(domain)
    except IOError:
        return message
    return t.lgettext(message)

def dngettext(domain, msgid1, msgid2, n):
    """Plural-aware dgettext(); without a catalog returns `msgid1` for
    n == 1 and `msgid2` otherwise."""
    try:
        t = _domain_translation(domain)
    except IOError:
        if n == 1:
            return msgid1
        else:
            return msgid2
    return t.ngettext(msgid1, msgid2, n)

def ldngettext(domain, msgid1, msgid2, n):
    """Like dngettext(), using the translation's lngettext()."""
    try:
        t = _domain_translation(domain)
    except IOError:
        if n == 1:
            return msgid1
        else:
            return msgid2
    return t.lngettext(msgid1, msgid2, n)

def gettext(message):
    """Translate `message` in the current global domain."""
    return dgettext(_current_domain, message)

def lgettext(message):
    """Like gettext(), using ldgettext()."""
    return ldgettext(_current_domain, message)

def ngettext(msgid1, msgid2, n):
    """Plural-aware gettext() in the current global domain."""
    return dngettext(_current_domain, msgid1, msgid2, n)

def lngettext(msgid1, msgid2, n):
    """Like ngettext(), using ldngettext()."""
    return ldngettext(_current_domain, msgid1, msgid2, n)

# dcgettext() has been deemed unnecessary and is not implemented.

# James Henstridge's Catalog constructor from GNOME gettext.  Documented usage
# was:
#
#    import gettext
#    cat = gettext.Catalog(PACKAGE, localedir=LOCALEDIR)
#    _ = cat.gettext
#    print _('Hello World')

# The resulting catalog object currently doesn't support access through a
# dictionary API, which was supported (but apparently unused) in GNOME
# gettext.

Catalog = translation