Home | History | Annotate | Download | only in Lib
      1 """Internationalization and localization support.
      2 
      3 This module provides internationalization (I18N) and localization (L10N)
      4 support for your Python programs by providing an interface to the GNU gettext
      5 message catalog library.
      6 
      7 I18N refers to the operation by which a program is made aware of multiple
      8 languages.  L10N refers to the adaptation of your program, once
      9 internationalized, to the local language and cultural habits.
     10 
     11 """
     12 
     13 # This module represents the integration of work, contributions, feedback, and

     14 # suggestions from the following people:

     15 #

     16 # Martin von Loewis, who wrote the initial implementation of the underlying

     17 # C-based libintlmodule (later renamed _gettext), along with a skeletal

     18 # gettext.py implementation.

     19 #

     20 # Peter Funk, who wrote fintl.py, a fairly complete wrapper around intlmodule,

     21 # which also included a pure-Python implementation to read .mo files if

     22 # intlmodule wasn't available.

     23 #

     24 # James Henstridge, who also wrote a gettext.py module, which has some

     25 # interesting, but currently unsupported experimental features: the notion of

     26 # a Catalog class and instances, and the ability to add to a catalog file via

     27 # a Python API.

     28 #

     29 # Barry Warsaw integrated these modules, wrote the .install() API and code,

     30 # and conformed all C and Python code to Python's coding standards.

     31 #

     32 # Francois Pinard and Marc-Andre Lemburg also contributed valuably to this

     33 # module.

     34 #

     35 # J. David Ibanez implemented plural forms. Bruno Haible fixed some bugs.

     36 #

     37 # TODO:

     38 # - Lazy loading of .mo files.  Currently the entire catalog is loaded into

     39 #   memory, but that's probably bad for large translated programs.  Instead,

     40 #   the lexical sort of original strings in GNU .mo files should be exploited

     41 #   to do binary searches and lazy initializations.  Or you might want to use

     42 #   the undocumented double-hash algorithm for .mo files with hash tables, but

     43 #   you'll need to study the GNU gettext code to do this.

     44 #

     45 # - Support Solaris .mo file formats.  Unfortunately, we've been unable to

     46 #   find this format documented anywhere.

     47 
     48 
     49 import locale, copy, os, re, struct, sys
     50 from errno import ENOENT
     51 
     52 
# Names exported by "from gettext import *".
__all__ = ['NullTranslations', 'GNUTranslations', 'Catalog',
           'find', 'translation', 'install', 'textdomain', 'bindtextdomain',
           'bind_textdomain_codeset',
           'dgettext', 'dngettext', 'gettext', 'lgettext', 'ldgettext',
           'ldngettext', 'lngettext', 'ngettext',
           ]

# Locale directory used when the caller supplies none:
# <sys.prefix>/share/locale.
_default_localedir = os.path.join(sys.prefix, 'share', 'locale')
     61 
     62 
     63 def test(condition, true, false):
     64     """
     65     Implements the C expression:
     66 
     67       condition ? true : false
     68 
     69     Required to correctly interpret plural forms.
     70     """
     71     if condition:
     72         return true
     73     else:
     74         return false
     75 
     76 
     77 def c2py(plural):
     78     """Gets a C expression as used in PO files for plural forms and returns a
     79     Python lambda function that implements an equivalent expression.
     80     """
     81     # Security check, allow only the "n" identifier

     82     try:
     83         from cStringIO import StringIO
     84     except ImportError:
     85         from StringIO import StringIO
     86     import token, tokenize
     87     tokens = tokenize.generate_tokens(StringIO(plural).readline)
     88     try:
     89         danger = [x for x in tokens if x[0] == token.NAME and x[1] != 'n']
     90     except tokenize.TokenError:
     91         raise ValueError, \
     92               'plural forms expression error, maybe unbalanced parenthesis'
     93     else:
     94         if danger:
     95             raise ValueError, 'plural forms expression could be dangerous'
     96 
     97     # Replace some C operators by their Python equivalents

     98     plural = plural.replace('&&', ' and ')
     99     plural = plural.replace('||', ' or ')
    100 
    101     expr = re.compile(r'\!([^=])')
    102     plural = expr.sub(' not \\1', plural)
    103 
    104     # Regular expression and replacement function used to transform

    105     # "a?b:c" to "test(a,b,c)".

    106     expr = re.compile(r'(.*?)\?(.*?):(.*)')
    107     def repl(x):
    108         return "test(%s, %s, %s)" % (x.group(1), x.group(2),
    109                                      expr.sub(repl, x.group(3)))
    110 
    111     # Code to transform the plural expression, taking care of parentheses

    112     stack = ['']
    113     for c in plural:
    114         if c == '(':
    115             stack.append('')
    116         elif c == ')':
    117             if len(stack) == 1:
    118                 # Actually, we never reach this code, because unbalanced

    119                 # parentheses get caught in the security check at the

    120                 # beginning.

    121                 raise ValueError, 'unbalanced parenthesis in plural form'
    122             s = expr.sub(repl, stack.pop())
    123             stack[-1] += '(%s)' % s
    124         else:
    125             stack[-1] += c
    126     plural = expr.sub(repl, stack.pop())
    127 
    128     return eval('lambda n: int(%s)' % plural)
    129 
    130 
    131 
    132 def _expand_lang(locale):
    133     from locale import normalize
    134     locale = normalize(locale)
    135     COMPONENT_CODESET   = 1 << 0
    136     COMPONENT_TERRITORY = 1 << 1
    137     COMPONENT_MODIFIER  = 1 << 2
    138     # split up the locale into its base components

    139     mask = 0
    140     pos = locale.find('@')
    141     if pos >= 0:
    142         modifier = locale[pos:]
    143         locale = locale[:pos]
    144         mask |= COMPONENT_MODIFIER
    145     else:
    146         modifier = ''
    147     pos = locale.find('.')
    148     if pos >= 0:
    149         codeset = locale[pos:]
    150         locale = locale[:pos]
    151         mask |= COMPONENT_CODESET
    152     else:
    153         codeset = ''
    154     pos = locale.find('_')
    155     if pos >= 0:
    156         territory = locale[pos:]
    157         locale = locale[:pos]
    158         mask |= COMPONENT_TERRITORY
    159     else:
    160         territory = ''
    161     language = locale
    162     ret = []
    163     for i in range(mask+1):
    164         if not (i & ~mask):  # if all components for this combo exist ...

    165             val = language
    166             if i & COMPONENT_TERRITORY: val += territory
    167             if i & COMPONENT_CODESET:   val += codeset
    168             if i & COMPONENT_MODIFIER:  val += modifier
    169             ret.append(val)
    170     ret.reverse()
    171     return ret
    172 
    173 
    174 
    175 class NullTranslations:
    176     def __init__(self, fp=None):
    177         self._info = {}
    178         self._charset = None
    179         self._output_charset = None
    180         self._fallback = None
    181         if fp is not None:
    182             self._parse(fp)
    183 
    184     def _parse(self, fp):
    185         pass
    186 
    187     def add_fallback(self, fallback):
    188         if self._fallback:
    189             self._fallback.add_fallback(fallback)
    190         else:
    191             self._fallback = fallback
    192 
    193     def gettext(self, message):
    194         if self._fallback:
    195             return self._fallback.gettext(message)
    196         return message
    197 
    198     def lgettext(self, message):
    199         if self._fallback:
    200             return self._fallback.lgettext(message)
    201         return message
    202 
    203     def ngettext(self, msgid1, msgid2, n):
    204         if self._fallback:
    205             return self._fallback.ngettext(msgid1, msgid2, n)
    206         if n == 1:
    207             return msgid1
    208         else:
    209             return msgid2
    210 
    211     def lngettext(self, msgid1, msgid2, n):
    212         if self._fallback:
    213             return self._fallback.lngettext(msgid1, msgid2, n)
    214         if n == 1:
    215             return msgid1
    216         else:
    217             return msgid2
    218 
    219     def ugettext(self, message):
    220         if self._fallback:
    221             return self._fallback.ugettext(message)
    222         return unicode(message)
    223 
    224     def ungettext(self, msgid1, msgid2, n):
    225         if self._fallback:
    226             return self._fallback.ungettext(msgid1, msgid2, n)
    227         if n == 1:
    228             return unicode(msgid1)
    229         else:
    230             return unicode(msgid2)
    231 
    232     def info(self):
    233         return self._info
    234 
    235     def charset(self):
    236         return self._charset
    237 
    238     def output_charset(self):
    239         return self._output_charset
    240 
    241     def set_output_charset(self, charset):
    242         self._output_charset = charset
    243 
    244     def install(self, unicode=False, names=None):
    245         import __builtin__
    246         __builtin__.__dict__['_'] = unicode and self.ugettext or self.gettext
    247         if hasattr(names, "__contains__"):
    248             if "gettext" in names:
    249                 __builtin__.__dict__['gettext'] = __builtin__.__dict__['_']
    250             if "ngettext" in names:
    251                 __builtin__.__dict__['ngettext'] = (unicode and self.ungettext
    252                                                              or self.ngettext)
    253             if "lgettext" in names:
    254                 __builtin__.__dict__['lgettext'] = self.lgettext
    255             if "lngettext" in names:
    256                 __builtin__.__dict__['lngettext'] = self.lngettext
    257 
    258 
    259 class GNUTranslations(NullTranslations):
    260     # Magic number of .mo files

    261     LE_MAGIC = 0x950412deL
    262     BE_MAGIC = 0xde120495L
    263 
    264     def _parse(self, fp):
    265         """Override this method to support alternative .mo formats."""
    266         unpack = struct.unpack
    267         filename = getattr(fp, 'name', '')
    268         # Parse the .mo file header, which consists of 5 little endian 32

    269         # bit words.

    270         self._catalog = catalog = {}
    271         self.plural = lambda n: int(n != 1) # germanic plural by default

    272         buf = fp.read()
    273         buflen = len(buf)
    274         # Are we big endian or little endian?

    275         magic = unpack('<I', buf[:4])[0]
    276         if magic == self.LE_MAGIC:
    277             version, msgcount, masteridx, transidx = unpack('<4I', buf[4:20])
    278             ii = '<II'
    279         elif magic == self.BE_MAGIC:
    280             version, msgcount, masteridx, transidx = unpack('>4I', buf[4:20])
    281             ii = '>II'
    282         else:
    283             raise IOError(0, 'Bad magic number', filename)
    284         # Now put all messages from the .mo file buffer into the catalog

    285         # dictionary.

    286         for i in xrange(0, msgcount):
    287             mlen, moff = unpack(ii, buf[masteridx:masteridx+8])
    288             mend = moff + mlen
    289             tlen, toff = unpack(ii, buf[transidx:transidx+8])
    290             tend = toff + tlen
    291             if mend < buflen and tend < buflen:
    292                 msg = buf[moff:mend]
    293                 tmsg = buf[toff:tend]
    294             else:
    295                 raise IOError(0, 'File is corrupt', filename)
    296             # See if we're looking at GNU .mo conventions for metadata

    297             if mlen == 0:
    298                 # Catalog description

    299                 lastk = None
    300                 for item in tmsg.splitlines():
    301                     item = item.strip()
    302                     if not item:
    303                         continue
    304                     k = v = None
    305                     if ':' in item:
    306                         k, v = item.split(':', 1)
    307                         k = k.strip().lower()
    308                         v = v.strip()
    309                         self._info[k] = v
    310                         lastk = k
    311                     elif lastk:
    312                         self._info[lastk] += '\n' + item
    313                     if k == 'content-type':
    314                         self._charset = v.split('charset=')[1]
    315                     elif k == 'plural-forms':
    316                         v = v.split(';')
    317                         plural = v[1].split('plural=')[1]
    318                         self.plural = c2py(plural)
    319             # Note: we unconditionally convert both msgids and msgstrs to

    320             # Unicode using the character encoding specified in the charset

    321             # parameter of the Content-Type header.  The gettext documentation

    322             # strongly encourages msgids to be us-ascii, but some applications

    323             # require alternative encodings (e.g. Zope's ZCML and ZPT).  For

    324             # traditional gettext applications, the msgid conversion will

    325             # cause no problems since us-ascii should always be a subset of

    326             # the charset encoding.  We may want to fall back to 8-bit msgids

    327             # if the Unicode conversion fails.

    328             if '\x00' in msg:
    329                 # Plural forms

    330                 msgid1, msgid2 = msg.split('\x00')
    331                 tmsg = tmsg.split('\x00')
    332                 if self._charset:
    333                     msgid1 = unicode(msgid1, self._charset)
    334                     tmsg = [unicode(x, self._charset) for x in tmsg]
    335                 for i in range(len(tmsg)):
    336                     catalog[(msgid1, i)] = tmsg[i]
    337             else:
    338                 if self._charset:
    339                     msg = unicode(msg, self._charset)
    340                     tmsg = unicode(tmsg, self._charset)
    341                 catalog[msg] = tmsg
    342             # advance to next entry in the seek tables

    343             masteridx += 8
    344             transidx += 8
    345 
    346     def gettext(self, message):
    347         missing = object()
    348         tmsg = self._catalog.get(message, missing)
    349         if tmsg is missing:
    350             if self._fallback:
    351                 return self._fallback.gettext(message)
    352             return message
    353         # Encode the Unicode tmsg back to an 8-bit string, if possible

    354         if self._output_charset:
    355             return tmsg.encode(self._output_charset)
    356         elif self._charset:
    357             return tmsg.encode(self._charset)
    358         return tmsg
    359 
    360     def lgettext(self, message):
    361         missing = object()
    362         tmsg = self._catalog.get(message, missing)
    363         if tmsg is missing:
    364             if self._fallback:
    365                 return self._fallback.lgettext(message)
    366             return message
    367         if self._output_charset:
    368             return tmsg.encode(self._output_charset)
    369         return tmsg.encode(locale.getpreferredencoding())
    370 
    371     def ngettext(self, msgid1, msgid2, n):
    372         try:
    373             tmsg = self._catalog[(msgid1, self.plural(n))]
    374             if self._output_charset:
    375                 return tmsg.encode(self._output_charset)
    376             elif self._charset:
    377                 return tmsg.encode(self._charset)
    378             return tmsg
    379         except KeyError:
    380             if self._fallback:
    381                 return self._fallback.ngettext(msgid1, msgid2, n)
    382             if n == 1:
    383                 return msgid1
    384             else:
    385                 return msgid2
    386 
    387     def lngettext(self, msgid1, msgid2, n):
    388         try:
    389             tmsg = self._catalog[(msgid1, self.plural(n))]
    390             if self._output_charset:
    391                 return tmsg.encode(self._output_charset)
    392             return tmsg.encode(locale.getpreferredencoding())
    393         except KeyError:
    394             if self._fallback:
    395                 return self._fallback.lngettext(msgid1, msgid2, n)
    396             if n == 1:
    397                 return msgid1
    398             else:
    399                 return msgid2
    400 
    401     def ugettext(self, message):
    402         missing = object()
    403         tmsg = self._catalog.get(message, missing)
    404         if tmsg is missing:
    405             if self._fallback:
    406                 return self._fallback.ugettext(message)
    407             return unicode(message)
    408         return tmsg
    409 
    410     def ungettext(self, msgid1, msgid2, n):
    411         try:
    412             tmsg = self._catalog[(msgid1, self.plural(n))]
    413         except KeyError:
    414             if self._fallback:
    415                 return self._fallback.ungettext(msgid1, msgid2, n)
    416             if n == 1:
    417                 tmsg = unicode(msgid1)
    418             else:
    419                 tmsg = unicode(msgid2)
    420         return tmsg
    421 
    422 
    423 # Locate a .mo file using the gettext strategy

    424 def find(domain, localedir=None, languages=None, all=0):
    425     # Get some reasonable defaults for arguments that were not supplied

    426     if localedir is None:
    427         localedir = _default_localedir
    428     if languages is None:
    429         languages = []
    430         for envar in ('LANGUAGE', 'LC_ALL', 'LC_MESSAGES', 'LANG'):
    431             val = os.environ.get(envar)
    432             if val:
    433                 languages = val.split(':')
    434                 break
    435         if 'C' not in languages:
    436             languages.append('C')
    437     # now normalize and expand the languages

    438     nelangs = []
    439     for lang in languages:
    440         for nelang in _expand_lang(lang):
    441             if nelang not in nelangs:
    442                 nelangs.append(nelang)
    443     # select a language

    444     if all:
    445         result = []
    446     else:
    447         result = None
    448     for lang in nelangs:
    449         if lang == 'C':
    450             break
    451         mofile = os.path.join(localedir, lang, 'LC_MESSAGES', '%s.mo' % domain)
    452         if os.path.exists(mofile):
    453             if all:
    454                 result.append(mofile)
    455             else:
    456                 return mofile
    457     return result
    458 
    459 
    460 
# Cache of parsed catalogs, keyed by (translation class, absolute .mo file
# path) and holding the corresponding translation object.
_translations = {}
    463 
    464 def translation(domain, localedir=None, languages=None,
    465                 class_=None, fallback=False, codeset=None):
    466     if class_ is None:
    467         class_ = GNUTranslations
    468     mofiles = find(domain, localedir, languages, all=1)
    469     if not mofiles:
    470         if fallback:
    471             return NullTranslations()
    472         raise IOError(ENOENT, 'No translation file found for domain', domain)
    473     # Avoid opening, reading, and parsing the .mo file after it's been done

    474     # once.

    475     result = None
    476     for mofile in mofiles:
    477         key = (class_, os.path.abspath(mofile))
    478         t = _translations.get(key)
    479         if t is None:
    480             with open(mofile, 'rb') as fp:
    481                 t = _translations.setdefault(key, class_(fp))
    482         # Copy the translation object to allow setting fallbacks and

    483         # output charset. All other instance data is shared with the

    484         # cached object.

    485         t = copy.copy(t)
    486         if codeset:
    487             t.set_output_charset(codeset)
    488         if result is None:
    489             result = t
    490         else:
    491             result.add_fallback(t)
    492     return result
    493 
    494 
    495 def install(domain, localedir=None, unicode=False, codeset=None, names=None):
    496     t = translation(domain, localedir, fallback=True, codeset=codeset)
    497     t.install(unicode, names)
    498 
    499 
    500 
# Mapping between domains and locale directories.
_localedirs = {}
# Mapping between domains and codesets.
_localecodesets = {}
# Current global domain; `messages' is used for compatibility with GNU
# gettext.
_current_domain = 'messages'
    507 
    508 
    509 def textdomain(domain=None):
    510     global _current_domain
    511     if domain is not None:
    512         _current_domain = domain
    513     return _current_domain
    514 
    515 
    516 def bindtextdomain(domain, localedir=None):
    517     global _localedirs
    518     if localedir is not None:
    519         _localedirs[domain] = localedir
    520     return _localedirs.get(domain, _default_localedir)
    521 
    522 
    523 def bind_textdomain_codeset(domain, codeset=None):
    524     global _localecodesets
    525     if codeset is not None:
    526         _localecodesets[domain] = codeset
    527     return _localecodesets.get(domain)
    528 
    529 
    530 def dgettext(domain, message):
    531     try:
    532         t = translation(domain, _localedirs.get(domain, None),
    533                         codeset=_localecodesets.get(domain))
    534     except IOError:
    535         return message
    536     return t.gettext(message)
    537 
    538 def ldgettext(domain, message):
    539     try:
    540         t = translation(domain, _localedirs.get(domain, None),
    541                         codeset=_localecodesets.get(domain))
    542     except IOError:
    543         return message
    544     return t.lgettext(message)
    545 
    546 def dngettext(domain, msgid1, msgid2, n):
    547     try:
    548         t = translation(domain, _localedirs.get(domain, None),
    549                         codeset=_localecodesets.get(domain))
    550     except IOError:
    551         if n == 1:
    552             return msgid1
    553         else:
    554             return msgid2
    555     return t.ngettext(msgid1, msgid2, n)
    556 
    557 def ldngettext(domain, msgid1, msgid2, n):
    558     try:
    559         t = translation(domain, _localedirs.get(domain, None),
    560                         codeset=_localecodesets.get(domain))
    561     except IOError:
    562         if n == 1:
    563             return msgid1
    564         else:
    565             return msgid2
    566     return t.lngettext(msgid1, msgid2, n)
    567 
def gettext(message):
    """Return dgettext() of ``message`` for the current global domain."""
    return dgettext(_current_domain, message)
    570 
def lgettext(message):
    """Return ldgettext() of ``message`` for the current global domain."""
    return ldgettext(_current_domain, message)
    573 
def ngettext(msgid1, msgid2, n):
    """Return dngettext() of the arguments for the current global domain."""
    return dngettext(_current_domain, msgid1, msgid2, n)
    576 
def lngettext(msgid1, msgid2, n):
    """Return ldngettext() of the arguments for the current global domain."""
    return ldngettext(_current_domain, msgid1, msgid2, n)
    579 
    580 # dcgettext() has been deemed unnecessary and is not implemented.

    581 
    582 # James Henstridge's Catalog constructor from GNOME gettext.  Documented usage

    583 # was:

    584 #

    585 #    import gettext

    586 #    cat = gettext.Catalog(PACKAGE, localedir=LOCALEDIR)

    587 #    _ = cat.gettext

    588 #    print _('Hello World')

    589 
# The resulting catalog object currently doesn't support access through a
# dictionary API, which was supported (but apparently unused) in GNOME
# gettext.

    593 
# Catalog is an alias: calling it is the same as calling translation().
Catalog = translation
    595