Home | History | Annotate | Download | only in Lib
      1 """Internationalization and localization support.
      2 
      3 This module provides internationalization (I18N) and localization (L10N)
      4 support for your Python programs by providing an interface to the GNU gettext
      5 message catalog library.
      6 
      7 I18N refers to the operation by which a program is made aware of multiple
      8 languages.  L10N refers to the adaptation of your program, once
      9 internationalized, to the local language and cultural habits.
     10 
     11 """
     12 
     13 # This module represents the integration of work, contributions, feedback, and
     14 # suggestions from the following people:
     15 #
     16 # Martin von Loewis, who wrote the initial implementation of the underlying
     17 # C-based libintlmodule (later renamed _gettext), along with a skeletal
     18 # gettext.py implementation.
     19 #
     20 # Peter Funk, who wrote fintl.py, a fairly complete wrapper around intlmodule,
     21 # which also included a pure-Python implementation to read .mo files if
     22 # intlmodule wasn't available.
     23 #
     24 # James Henstridge, who also wrote a gettext.py module, which has some
     25 # interesting, but currently unsupported experimental features: the notion of
     26 # a Catalog class and instances, and the ability to add to a catalog file via
     27 # a Python API.
     28 #
     29 # Barry Warsaw integrated these modules, wrote the .install() API and code,
     30 # and conformed all C and Python code to Python's coding standards.
     31 #
     32 # Francois Pinard and Marc-Andre Lemburg also contributed valuably to this
     33 # module.
     34 #
     35 # J. David Ibanez implemented plural forms. Bruno Haible fixed some bugs.
     36 #
     37 # TODO:
     38 # - Lazy loading of .mo files.  Currently the entire catalog is loaded into
     39 #   memory, but that's probably bad for large translated programs.  Instead,
     40 #   the lexical sort of original strings in GNU .mo files should be exploited
     41 #   to do binary searches and lazy initializations.  Or you might want to use
     42 #   the undocumented double-hash algorithm for .mo files with hash tables, but
     43 #   you'll need to study the GNU gettext code to do this.
     44 #
     45 # - Support Solaris .mo file formats.  Unfortunately, we've been unable to
     46 #   find this format documented anywhere.
     47 
     48 
     49 import locale, copy, os, re, struct, sys
     50 from errno import ENOENT
     51 
     52 
# Public names exported by "from gettext import *".
__all__ = ['NullTranslations', 'GNUTranslations', 'Catalog',
           'find', 'translation', 'install', 'textdomain', 'bindtextdomain',
           'bind_textdomain_codeset',
           'dgettext', 'dngettext', 'gettext', 'lgettext', 'ldgettext',
           'ldngettext', 'lngettext', 'ngettext',
           ]

# Catalog directory used when a caller does not supply localedir:
# <sys.prefix>/share/locale.
_default_localedir = os.path.join(sys.prefix, 'share', 'locale')
     61 
     62 # Expression parsing for plural form selection.
     63 #
     64 # The gettext library supports a small subset of C syntax.  The only
     65 # incompatible difference is that integer literals starting with zero are
     66 # decimal.
     67 #
     68 # https://www.gnu.org/software/gettext/manual/gettext.html#Plural-forms
     69 # http://git.savannah.gnu.org/cgit/gettext.git/tree/gettext-runtime/intl/plural.y
     70 
# Tokenizer for plural-form expressions.  Every alternative is a named group,
# so match.lastgroup tells which kind of token matched; any text that none of
# the earlier alternatives accept is captured by INVALID.
_token_pattern = re.compile(r"""
        (?P<WHITESPACES>[ \t]+)                    | # spaces and horizontal tabs
        (?P<NUMBER>[0-9]+\b)                       | # decimal integer
        (?P<NAME>n\b)                              | # only n is allowed
        (?P<PARENTHESIS>[()])                      |
        (?P<OPERATOR>[-*/%+?:]|[><!]=?|==|&&|\|\|) | # !, *, /, %, +, -, <, >,
                                                     # <=, >=, ==, !=, &&, ||,
                                                     # ? :
                                                     # unary and bitwise ops
                                                     # not allowed
        (?P<INVALID>\w+|.)                           # invalid token
    """, re.VERBOSE|re.DOTALL)
     83 
     84 def _tokenize(plural):
     85     for mo in re.finditer(_token_pattern, plural):
     86         kind = mo.lastgroup
     87         if kind == 'WHITESPACES':
     88             continue
     89         value = mo.group(kind)
     90         if kind == 'INVALID':
     91             raise ValueError('invalid token in plural form: %s' % value)
     92         yield value
     93     yield ''
     94 
     95 def _error(value):
     96     if value:
     97         return ValueError('unexpected token in plural form: %s' % value)
     98     else:
     99         return ValueError('unexpected end of plural form')
    100 
    101 _binary_ops = (
    102     ('||',),
    103     ('&&',),
    104     ('==', '!='),
    105     ('<', '>', '<=', '>='),
    106     ('+', '-'),
    107     ('*', '/', '%'),
    108 )
    109 _binary_ops = {op: i for i, ops in enumerate(_binary_ops, 1) for op in ops}
    110 _c2py_ops = {'||': 'or', '&&': 'and', '/': '//'}
    111 
    112 def _parse(tokens, priority=-1):
    113     result = ''
    114     nexttok = next(tokens)
    115     while nexttok == '!':
    116         result += 'not '
    117         nexttok = next(tokens)
    118 
    119     if nexttok == '(':
    120         sub, nexttok = _parse(tokens)
    121         result = '%s(%s)' % (result, sub)
    122         if nexttok != ')':
    123             raise ValueError('unbalanced parenthesis in plural form')
    124     elif nexttok == 'n':
    125         result = '%s%s' % (result, nexttok)
    126     else:
    127         try:
    128             value = int(nexttok, 10)
    129         except ValueError:
    130             raise _error(nexttok)
    131         result = '%s%d' % (result, value)
    132     nexttok = next(tokens)
    133 
    134     j = 100
    135     while nexttok in _binary_ops:
    136         i = _binary_ops[nexttok]
    137         if i < priority:
    138             break
    139         # Break chained comparisons
    140         if i in (3, 4) and j in (3, 4):  # '==', '!=', '<', '>', '<=', '>='
    141             result = '(%s)' % result
    142         # Replace some C operators by their Python equivalents
    143         op = _c2py_ops.get(nexttok, nexttok)
    144         right, nexttok = _parse(tokens, i + 1)
    145         result = '%s %s %s' % (result, op, right)
    146         j = i
    147     if j == priority == 4:  # '<', '>', '<=', '>='
    148         result = '(%s)' % result
    149 
    150     if nexttok == '?' and priority <= 0:
    151         if_true, nexttok = _parse(tokens, 0)
    152         if nexttok != ':':
    153             raise _error(nexttok)
    154         if_false, nexttok = _parse(tokens)
    155         result = '%s if %s else %s' % (if_true, result, if_false)
    156         if priority == 0:
    157             result = '(%s)' % result
    158 
    159     return result, nexttok
    160 
    161 def _as_int(n):
    162     try:
    163         i = round(n)
    164     except TypeError:
    165         raise TypeError('Plural value must be an integer, got %s' %
    166                         (n.__class__.__name__,))
    167     return n
    168 
    169 def c2py(plural):
    170     """Gets a C expression as used in PO files for plural forms and returns a
    171     Python function that implements an equivalent expression.
    172     """
    173 
    174     if len(plural) > 1000:
    175         raise ValueError('plural form expression is too long')
    176     try:
    177         result, nexttok = _parse(_tokenize(plural))
    178         if nexttok:
    179             raise _error(nexttok)
    180 
    181         depth = 0
    182         for c in result:
    183             if c == '(':
    184                 depth += 1
    185                 if depth > 20:
    186                     # Python compiler limit is about 90.
    187                     # The most complex example has 2.
    188                     raise ValueError('plural form expression is too complex')
    189             elif c == ')':
    190                 depth -= 1
    191 
    192         ns = {'_as_int': _as_int}
    193         exec('''if 1:
    194             def func(n):
    195                 if not isinstance(n, int):
    196                     n = _as_int(n)
    197                 return int(%s)
    198             ''' % result, ns)
    199         return ns['func']
    200     except RuntimeError:
    201         # Recursion error can be raised in _parse() or exec().
    202         raise ValueError('plural form expression is too complex')
    203 
    204 
    205 def _expand_lang(locale):
    206     from locale import normalize
    207     locale = normalize(locale)
    208     COMPONENT_CODESET   = 1 << 0
    209     COMPONENT_TERRITORY = 1 << 1
    210     COMPONENT_MODIFIER  = 1 << 2
    211     # split up the locale into its base components
    212     mask = 0
    213     pos = locale.find('@')
    214     if pos >= 0:
    215         modifier = locale[pos:]
    216         locale = locale[:pos]
    217         mask |= COMPONENT_MODIFIER
    218     else:
    219         modifier = ''
    220     pos = locale.find('.')
    221     if pos >= 0:
    222         codeset = locale[pos:]
    223         locale = locale[:pos]
    224         mask |= COMPONENT_CODESET
    225     else:
    226         codeset = ''
    227     pos = locale.find('_')
    228     if pos >= 0:
    229         territory = locale[pos:]
    230         locale = locale[:pos]
    231         mask |= COMPONENT_TERRITORY
    232     else:
    233         territory = ''
    234     language = locale
    235     ret = []
    236     for i in range(mask+1):
    237         if not (i & ~mask):  # if all components for this combo exist ...
    238             val = language
    239             if i & COMPONENT_TERRITORY: val += territory
    240             if i & COMPONENT_CODESET:   val += codeset
    241             if i & COMPONENT_MODIFIER:  val += modifier
    242             ret.append(val)
    243     ret.reverse()
    244     return ret
    245 
    246 
    247 
    248 class NullTranslations:
    249     def __init__(self, fp=None):
    250         self._info = {}
    251         self._charset = None
    252         self._output_charset = None
    253         self._fallback = None
    254         if fp is not None:
    255             self._parse(fp)
    256 
    257     def _parse(self, fp):
    258         pass
    259 
    260     def add_fallback(self, fallback):
    261         if self._fallback:
    262             self._fallback.add_fallback(fallback)
    263         else:
    264             self._fallback = fallback
    265 
    266     def gettext(self, message):
    267         if self._fallback:
    268             return self._fallback.gettext(message)
    269         return message
    270 
    271     def lgettext(self, message):
    272         if self._fallback:
    273             return self._fallback.lgettext(message)
    274         return message
    275 
    276     def ngettext(self, msgid1, msgid2, n):
    277         if self._fallback:
    278             return self._fallback.ngettext(msgid1, msgid2, n)
    279         if n == 1:
    280             return msgid1
    281         else:
    282             return msgid2
    283 
    284     def lngettext(self, msgid1, msgid2, n):
    285         if self._fallback:
    286             return self._fallback.lngettext(msgid1, msgid2, n)
    287         if n == 1:
    288             return msgid1
    289         else:
    290             return msgid2
    291 
    292     def ugettext(self, message):
    293         if self._fallback:
    294             return self._fallback.ugettext(message)
    295         return unicode(message)
    296 
    297     def ungettext(self, msgid1, msgid2, n):
    298         if self._fallback:
    299             return self._fallback.ungettext(msgid1, msgid2, n)
    300         if n == 1:
    301             return unicode(msgid1)
    302         else:
    303             return unicode(msgid2)
    304 
    305     def info(self):
    306         return self._info
    307 
    308     def charset(self):
    309         return self._charset
    310 
    311     def output_charset(self):
    312         return self._output_charset
    313 
    314     def set_output_charset(self, charset):
    315         self._output_charset = charset
    316 
    317     def install(self, unicode=False, names=None):
    318         import __builtin__
    319         __builtin__.__dict__['_'] = unicode and self.ugettext or self.gettext
    320         if hasattr(names, "__contains__"):
    321             if "gettext" in names:
    322                 __builtin__.__dict__['gettext'] = __builtin__.__dict__['_']
    323             if "ngettext" in names:
    324                 __builtin__.__dict__['ngettext'] = (unicode and self.ungettext
    325                                                              or self.ngettext)
    326             if "lgettext" in names:
    327                 __builtin__.__dict__['lgettext'] = self.lgettext
    328             if "lngettext" in names:
    329                 __builtin__.__dict__['lngettext'] = self.lngettext
    330 
    331 
class GNUTranslations(NullTranslations):
    """Translations loaded from a GNU gettext binary (.mo) catalog.

    _parse() fills self._catalog, a dict mapping a message id to its
    translation; plural entries are stored under (msgid1, plural_index).
    The lookup methods consult that dict first and defer to the fallback
    object, or to the untranslated message, when the key is missing.

    NOTE(review): this code uses Python 2 constructs (long literals with an
    L suffix, xrange, unicode).
    """
    # Magic number of .mo files
    LE_MAGIC = 0x950412deL
    BE_MAGIC = 0xde120495L

    def _parse(self, fp):
        """Override this method to support alternative .mo formats.

        Reads the whole of fp, populates self._catalog, and records the
        catalog metadata in self._info, self._charset and self.plural.
        Raises IOError for an unknown magic number or when a string lies
        outside the file buffer.
        """
        unpack = struct.unpack
        filename = getattr(fp, 'name', '')
        # Parse the .mo file header: the magic number followed by four more
        # 32-bit words (version, message count, offset of the msgid table,
        # offset of the translation table).  The magic number tells which
        # byte order the remaining words use.
        self._catalog = catalog = {}
        self.plural = lambda n: int(n != 1) # germanic plural by default
        buf = fp.read()
        buflen = len(buf)
        # Are we big endian or little endian?
        magic = unpack('<I', buf[:4])[0]
        if magic == self.LE_MAGIC:
            # version is unpacked but not checked in this method
            version, msgcount, masteridx, transidx = unpack('<4I', buf[4:20])
            ii = '<II'
        elif magic == self.BE_MAGIC:
            version, msgcount, masteridx, transidx = unpack('>4I', buf[4:20])
            ii = '>II'
        else:
            raise IOError(0, 'Bad magic number', filename)
        # Now put all messages from the .mo file buffer into the catalog
        # dictionary.
        for i in xrange(0, msgcount):
            # Each table entry is a (length, offset) pair of 32-bit words.
            mlen, moff = unpack(ii, buf[masteridx:masteridx+8])
            mend = moff + mlen
            tlen, toff = unpack(ii, buf[transidx:transidx+8])
            tend = toff + tlen
            if mend < buflen and tend < buflen:
                msg = buf[moff:mend]
                tmsg = buf[toff:tend]
            else:
                raise IOError(0, 'File is corrupt', filename)
            # See if we're looking at GNU .mo conventions for metadata
            if mlen == 0:
                # Catalog description: the translation of the empty msgid is
                # a block of "key: value" lines.
                lastk = None
                for item in tmsg.splitlines():
                    item = item.strip()
                    if not item:
                        continue
                    k = v = None
                    if ':' in item:
                        k, v = item.split(':', 1)
                        k = k.strip().lower()
                        v = v.strip()
                        self._info[k] = v
                        lastk = k
                    elif lastk:
                        # A line without ':' continues the previous value.
                        self._info[lastk] += '\n' + item
                    if k == 'content-type':
                        # Raises IndexError if the value has no 'charset='.
                        self._charset = v.split('charset=')[1]
                    elif k == 'plural-forms':
                        # The expression is taken from the second
                        # ';'-separated field, after 'plural='.
                        v = v.split(';')
                        plural = v[1].split('plural=')[1]
                        self.plural = c2py(plural)
            # Note: we unconditionally convert both msgids and msgstrs to
            # Unicode using the character encoding specified in the charset
            # parameter of the Content-Type header.  The gettext documentation
            # strongly encourages msgids to be us-ascii, but some applications
            # require alternative encodings (e.g. Zope's ZCML and ZPT).  For
            # traditional gettext applications, the msgid conversion will
            # cause no problems since us-ascii should always be a subset of
            # the charset encoding.  We may want to fall back to 8-bit msgids
            # if the Unicode conversion fails.
            if '\x00' in msg:
                # Plural forms: the msgid holds singular and plural separated
                # by NUL; only the singular (msgid1) is used in the key.
                msgid1, msgid2 = msg.split('\x00')
                tmsg = tmsg.split('\x00')
                if self._charset:
                    msgid1 = unicode(msgid1, self._charset)
                    tmsg = [unicode(x, self._charset) for x in tmsg]
                # This inner loop rebinds i; the outer for statement assigns
                # i afresh on its next iteration.
                for i in range(len(tmsg)):
                    catalog[(msgid1, i)] = tmsg[i]
            else:
                if self._charset:
                    msg = unicode(msg, self._charset)
                    tmsg = unicode(tmsg, self._charset)
                catalog[msg] = tmsg
            # advance to next entry in the seek tables
            masteridx += 8
            transidx += 8

    def gettext(self, message):
        """Return the translation of message as an encoded string.

        The result is encoded with the output charset if one is set,
        otherwise with the catalog charset if known.  A message missing
        from the catalog goes to the fallback, or is returned unchanged.
        """
        # Unique sentinel used to detect a message missing from the catalog.
        missing = object()
        tmsg = self._catalog.get(message, missing)
        if tmsg is missing:
            if self._fallback:
                return self._fallback.gettext(message)
            return message
        # Encode the Unicode tmsg back to an 8-bit string, if possible
        if self._output_charset:
            return tmsg.encode(self._output_charset)
        elif self._charset:
            return tmsg.encode(self._charset)
        return tmsg

    def lgettext(self, message):
        """Like gettext(), but without an output charset the translation is
        encoded with locale.getpreferredencoding()."""
        missing = object()
        tmsg = self._catalog.get(message, missing)
        if tmsg is missing:
            if self._fallback:
                return self._fallback.lgettext(message)
            return message
        if self._output_charset:
            return tmsg.encode(self._output_charset)
        return tmsg.encode(locale.getpreferredencoding())

    def ngettext(self, msgid1, msgid2, n):
        """Return the plural-aware translation for count n as an encoded
        string.

        The catalog key is (msgid1, self.plural(n)).  On a miss the fallback
        is asked; without one, msgid1 is returned when n == 1, else msgid2.
        """
        try:
            tmsg = self._catalog[(msgid1, self.plural(n))]
            if self._output_charset:
                return tmsg.encode(self._output_charset)
            elif self._charset:
                return tmsg.encode(self._charset)
            return tmsg
        except KeyError:
            if self._fallback:
                return self._fallback.ngettext(msgid1, msgid2, n)
            if n == 1:
                return msgid1
            else:
                return msgid2

    def lngettext(self, msgid1, msgid2, n):
        """Like ngettext(), but without an output charset the translation is
        encoded with locale.getpreferredencoding()."""
        try:
            tmsg = self._catalog[(msgid1, self.plural(n))]
            if self._output_charset:
                return tmsg.encode(self._output_charset)
            return tmsg.encode(locale.getpreferredencoding())
        except KeyError:
            if self._fallback:
                return self._fallback.lngettext(msgid1, msgid2, n)
            if n == 1:
                return msgid1
            else:
                return msgid2

    def ugettext(self, message):
        """Return the catalog entry for message without encoding it; on a
        miss use the fallback or unicode(message)."""
        missing = object()
        tmsg = self._catalog.get(message, missing)
        if tmsg is missing:
            if self._fallback:
                return self._fallback.ugettext(message)
            return unicode(message)
        return tmsg

    def ungettext(self, msgid1, msgid2, n):
        """Return the plural catalog entry for count n without encoding it;
        on a miss use the fallback or unicode() of msgid1/msgid2."""
        try:
            tmsg = self._catalog[(msgid1, self.plural(n))]
        except KeyError:
            if self._fallback:
                return self._fallback.ungettext(msgid1, msgid2, n)
            if n == 1:
                tmsg = unicode(msgid1)
            else:
                tmsg = unicode(msgid2)
        return tmsg
    494 
    495 
    496 # Locate a .mo file using the gettext strategy
    497 def find(domain, localedir=None, languages=None, all=0):
    498     # Get some reasonable defaults for arguments that were not supplied
    499     if localedir is None:
    500         localedir = _default_localedir
    501     if languages is None:
    502         languages = []
    503         for envar in ('LANGUAGE', 'LC_ALL', 'LC_MESSAGES', 'LANG'):
    504             val = os.environ.get(envar)
    505             if val:
    506                 languages = val.split(':')
    507                 break
    508         if 'C' not in languages:
    509             languages.append('C')
    510     # now normalize and expand the languages
    511     nelangs = []
    512     for lang in languages:
    513         for nelang in _expand_lang(lang):
    514             if nelang not in nelangs:
    515                 nelangs.append(nelang)
    516     # select a language
    517     if all:
    518         result = []
    519     else:
    520         result = None
    521     for lang in nelangs:
    522         if lang == 'C':
    523             break
    524         mofile = os.path.join(localedir, lang, 'LC_MESSAGES', '%s.mo' % domain)
    525         if os.path.exists(mofile):
    526             if all:
    527                 result.append(mofile)
    528             else:
    529                 return mofile
    530     return result
    531 
    532 
    533 
# Cache of parsed catalogs used by translation(): maps
# (class, absolute .mo file path) to the translation object built from it.
_translations = {}
    536 
    537 def translation(domain, localedir=None, languages=None,
    538                 class_=None, fallback=False, codeset=None):
    539     if class_ is None:
    540         class_ = GNUTranslations
    541     mofiles = find(domain, localedir, languages, all=1)
    542     if not mofiles:
    543         if fallback:
    544             return NullTranslations()
    545         raise IOError(ENOENT, 'No translation file found for domain', domain)
    546     # Avoid opening, reading, and parsing the .mo file after it's been done
    547     # once.
    548     result = None
    549     for mofile in mofiles:
    550         key = (class_, os.path.abspath(mofile))
    551         t = _translations.get(key)
    552         if t is None:
    553             with open(mofile, 'rb') as fp:
    554                 t = _translations.setdefault(key, class_(fp))
    555         # Copy the translation object to allow setting fallbacks and
    556         # output charset. All other instance data is shared with the
    557         # cached object.
    558         t = copy.copy(t)
    559         if codeset:
    560             t.set_output_charset(codeset)
    561         if result is None:
    562             result = t
    563         else:
    564             result.add_fallback(t)
    565     return result
    566 
    567 
    568 def install(domain, localedir=None, unicode=False, codeset=None, names=None):
    569     t = translation(domain, localedir, fallback=True, codeset=codeset)
    570     t.install(unicode, names)
    571 
    572 
    573 
# Mapping from domain to locale directory, filled by bindtextdomain().
_localedirs = {}
# Mapping from domain to output codeset, filled by bind_textdomain_codeset().
_localecodesets = {}
# Current global domain; 'messages' is used for compatibility with GNU
# gettext.  Changed through textdomain().
_current_domain = 'messages'
    580 
    581 
    582 def textdomain(domain=None):
    583     global _current_domain
    584     if domain is not None:
    585         _current_domain = domain
    586     return _current_domain
    587 
    588 
    589 def bindtextdomain(domain, localedir=None):
    590     global _localedirs
    591     if localedir is not None:
    592         _localedirs[domain] = localedir
    593     return _localedirs.get(domain, _default_localedir)
    594 
    595 
    596 def bind_textdomain_codeset(domain, codeset=None):
    597     global _localecodesets
    598     if codeset is not None:
    599         _localecodesets[domain] = codeset
    600     return _localecodesets.get(domain)
    601 
    602 
    603 def dgettext(domain, message):
    604     try:
    605         t = translation(domain, _localedirs.get(domain, None),
    606                         codeset=_localecodesets.get(domain))
    607     except IOError:
    608         return message
    609     return t.gettext(message)
    610 
    611 def ldgettext(domain, message):
    612     try:
    613         t = translation(domain, _localedirs.get(domain, None),
    614                         codeset=_localecodesets.get(domain))
    615     except IOError:
    616         return message
    617     return t.lgettext(message)
    618 
    619 def dngettext(domain, msgid1, msgid2, n):
    620     try:
    621         t = translation(domain, _localedirs.get(domain, None),
    622                         codeset=_localecodesets.get(domain))
    623     except IOError:
    624         if n == 1:
    625             return msgid1
    626         else:
    627             return msgid2
    628     return t.ngettext(msgid1, msgid2, n)
    629 
    630 def ldngettext(domain, msgid1, msgid2, n):
    631     try:
    632         t = translation(domain, _localedirs.get(domain, None),
    633                         codeset=_localecodesets.get(domain))
    634     except IOError:
    635         if n == 1:
    636             return msgid1
    637         else:
    638             return msgid2
    639     return t.lngettext(msgid1, msgid2, n)
    640 
def gettext(message):
    """Translate message in the current global domain via dgettext()."""
    return dgettext(_current_domain, message)
    643 
def lgettext(message):
    """Translate message in the current global domain via ldgettext()."""
    return ldgettext(_current_domain, message)
    646 
def ngettext(msgid1, msgid2, n):
    """Plural-aware lookup in the current global domain via dngettext()."""
    return dngettext(_current_domain, msgid1, msgid2, n)
    649 
def lngettext(msgid1, msgid2, n):
    """Plural-aware lookup in the current global domain via ldngettext()."""
    return ldngettext(_current_domain, msgid1, msgid2, n)
    652 
    653 # dcgettext() has been deemed unnecessary and is not implemented.
    654 
# James Henstridge's Catalog constructor from GNOME gettext.  Documented usage
# was:
#
#    import gettext
#    cat = gettext.Catalog(PACKAGE, localedir=LOCALEDIR)
#    _ = cat.gettext
#    print _('Hello World')

# The resulting catalog object currently doesn't support access through a
# dictionary API, which was supported (but apparently unused) in GNOME
# gettext.

# Catalog is simply another name for translation().
Catalog = translation
    668