Home | History | Annotate | Download | only in Lib
      1 """Internationalization and localization support.
      2 
      3 This module provides internationalization (I18N) and localization (L10N)
      4 support for your Python programs by providing an interface to the GNU gettext
      5 message catalog library.
      6 
      7 I18N refers to the operation by which a program is made aware of multiple
      8 languages.  L10N refers to the adaptation of your program, once
      9 internationalized, to the local language and cultural habits.
     10 
     11 """
     12 
     13 # This module represents the integration of work, contributions, feedback, and
     14 # suggestions from the following people:
     15 #
     16 # Martin von Loewis, who wrote the initial implementation of the underlying
     17 # C-based libintlmodule (later renamed _gettext), along with a skeletal
     18 # gettext.py implementation.
     19 #
     20 # Peter Funk, who wrote fintl.py, a fairly complete wrapper around intlmodule,
     21 # which also included a pure-Python implementation to read .mo files if
     22 # intlmodule wasn't available.
     23 #
     24 # James Henstridge, who also wrote a gettext.py module, which has some
     25 # interesting, but currently unsupported experimental features: the notion of
     26 # a Catalog class and instances, and the ability to add to a catalog file via
     27 # a Python API.
     28 #
     29 # Barry Warsaw integrated these modules, wrote the .install() API and code,
     30 # and conformed all C and Python code to Python's coding standards.
     31 #
     32 # Francois Pinard and Marc-Andre Lemburg also contributed valuably to this
     33 # module.
     34 #
     35 # J. David Ibanez implemented plural forms. Bruno Haible fixed some bugs.
     36 #
     37 # TODO:
     38 # - Lazy loading of .mo files.  Currently the entire catalog is loaded into
     39 #   memory, but that's probably bad for large translated programs.  Instead,
     40 #   the lexical sort of original strings in GNU .mo files should be exploited
     41 #   to do binary searches and lazy initializations.  Or you might want to use
     42 #   the undocumented double-hash algorithm for .mo files with hash tables, but
     43 #   you'll need to study the GNU gettext code to do this.
     44 #
     45 # - Support Solaris .mo file formats.  Unfortunately, we've been unable to
     46 #   find this format documented anywhere.
     47 
     48 
     49 import locale
     50 import os
     51 import re
     52 import sys
     53 
     54 
# Public names exported by a star-import of this module.
__all__ = ['NullTranslations', 'GNUTranslations', 'Catalog',
           'find', 'translation', 'install', 'textdomain', 'bindtextdomain',
           'bind_textdomain_codeset',
           'dgettext', 'dngettext', 'gettext', 'lgettext', 'ldgettext',
           'ldngettext', 'lngettext', 'ngettext',
           ]

# Locale directory used whenever a caller does not supply one:
# <sys.base_prefix>/share/locale.
_default_localedir = os.path.join(sys.base_prefix, 'share', 'locale')
     63 
     64 # Expression parsing for plural form selection.
     65 #
     66 # The gettext library supports a small subset of C syntax.  The only
     67 # incompatible difference is that integer literals starting with zero are
     68 # decimal.
     69 #
     70 # https://www.gnu.org/software/gettext/manual/gettext.html#Plural-forms
     71 # http://git.savannah.gnu.org/cgit/gettext.git/tree/gettext-runtime/intl/plural.y
     72 
# Tokenizer for plural-form expressions.  The alternatives are tried in the
# order written; the name of the group that matched identifies the kind of
# token, and anything no earlier alternative accepts lands in INVALID.
_token_pattern = re.compile(r"""
        (?P<WHITESPACES>[ \t]+)                    | # spaces and horizontal tabs
        (?P<NUMBER>[0-9]+\b)                       | # decimal integer
        (?P<NAME>n\b)                              | # only n is allowed
        (?P<PARENTHESIS>[()])                      |
        (?P<OPERATOR>[-*/%+?:]|[><!]=?|==|&&|\|\|) | # !, *, /, %, +, -, <, >,
                                                     # <=, >=, ==, !=, &&, ||,
                                                     # ? :
                                                     # unary and bitwise ops
                                                     # not allowed
        (?P<INVALID>\w+|.)                           # invalid token
    """, re.VERBOSE|re.DOTALL)
     85 
     86 def _tokenize(plural):
     87     for mo in re.finditer(_token_pattern, plural):
     88         kind = mo.lastgroup
     89         if kind == 'WHITESPACES':
     90             continue
     91         value = mo.group(kind)
     92         if kind == 'INVALID':
     93             raise ValueError('invalid token in plural form: %s' % value)
     94         yield value
     95     yield ''
     96 
     97 def _error(value):
     98     if value:
     99         return ValueError('unexpected token in plural form: %s' % value)
    100     else:
    101         return ValueError('unexpected end of plural form')
    102 
    103 _binary_ops = (
    104     ('||',),
    105     ('&&',),
    106     ('==', '!='),
    107     ('<', '>', '<=', '>='),
    108     ('+', '-'),
    109     ('*', '/', '%'),
    110 )
    111 _binary_ops = {op: i for i, ops in enumerate(_binary_ops, 1) for op in ops}
    112 _c2py_ops = {'||': 'or', '&&': 'and', '/': '//'}
    113 
    114 def _parse(tokens, priority=-1):
    115     result = ''
    116     nexttok = next(tokens)
    117     while nexttok == '!':
    118         result += 'not '
    119         nexttok = next(tokens)
    120 
    121     if nexttok == '(':
    122         sub, nexttok = _parse(tokens)
    123         result = '%s(%s)' % (result, sub)
    124         if nexttok != ')':
    125             raise ValueError('unbalanced parenthesis in plural form')
    126     elif nexttok == 'n':
    127         result = '%s%s' % (result, nexttok)
    128     else:
    129         try:
    130             value = int(nexttok, 10)
    131         except ValueError:
    132             raise _error(nexttok) from None
    133         result = '%s%d' % (result, value)
    134     nexttok = next(tokens)
    135 
    136     j = 100
    137     while nexttok in _binary_ops:
    138         i = _binary_ops[nexttok]
    139         if i < priority:
    140             break
    141         # Break chained comparisons
    142         if i in (3, 4) and j in (3, 4):  # '==', '!=', '<', '>', '<=', '>='
    143             result = '(%s)' % result
    144         # Replace some C operators by their Python equivalents
    145         op = _c2py_ops.get(nexttok, nexttok)
    146         right, nexttok = _parse(tokens, i + 1)
    147         result = '%s %s %s' % (result, op, right)
    148         j = i
    149     if j == priority == 4:  # '<', '>', '<=', '>='
    150         result = '(%s)' % result
    151 
    152     if nexttok == '?' and priority <= 0:
    153         if_true, nexttok = _parse(tokens, 0)
    154         if nexttok != ':':
    155             raise _error(nexttok)
    156         if_false, nexttok = _parse(tokens)
    157         result = '%s if %s else %s' % (if_true, result, if_false)
    158         if priority == 0:
    159             result = '(%s)' % result
    160 
    161     return result, nexttok
    162 
    163 def _as_int(n):
    164     try:
    165         i = round(n)
    166     except TypeError:
    167         raise TypeError('Plural value must be an integer, got %s' %
    168                         (n.__class__.__name__,)) from None
    169     import warnings
    170     warnings.warn('Plural value must be an integer, got %s' %
    171                   (n.__class__.__name__,),
    172                   DeprecationWarning, 4)
    173     return n
    174 
    175 def c2py(plural):
    176     """Gets a C expression as used in PO files for plural forms and returns a
    177     Python function that implements an equivalent expression.
    178     """
    179 
    180     if len(plural) > 1000:
    181         raise ValueError('plural form expression is too long')
    182     try:
    183         result, nexttok = _parse(_tokenize(plural))
    184         if nexttok:
    185             raise _error(nexttok)
    186 
    187         depth = 0
    188         for c in result:
    189             if c == '(':
    190                 depth += 1
    191                 if depth > 20:
    192                     # Python compiler limit is about 90.
    193                     # The most complex example has 2.
    194                     raise ValueError('plural form expression is too complex')
    195             elif c == ')':
    196                 depth -= 1
    197 
    198         ns = {'_as_int': _as_int}
    199         exec('''if True:
    200             def func(n):
    201                 if not isinstance(n, int):
    202                     n = _as_int(n)
    203                 return int(%s)
    204             ''' % result, ns)
    205         return ns['func']
    206     except RecursionError:
    207         # Recursion error can be raised in _parse() or exec().
    208         raise ValueError('plural form expression is too complex')
    209 
    210 
    211 def _expand_lang(loc):
    212     loc = locale.normalize(loc)
    213     COMPONENT_CODESET   = 1 << 0
    214     COMPONENT_TERRITORY = 1 << 1
    215     COMPONENT_MODIFIER  = 1 << 2
    216     # split up the locale into its base components
    217     mask = 0
    218     pos = loc.find('@')
    219     if pos >= 0:
    220         modifier = loc[pos:]
    221         loc = loc[:pos]
    222         mask |= COMPONENT_MODIFIER
    223     else:
    224         modifier = ''
    225     pos = loc.find('.')
    226     if pos >= 0:
    227         codeset = loc[pos:]
    228         loc = loc[:pos]
    229         mask |= COMPONENT_CODESET
    230     else:
    231         codeset = ''
    232     pos = loc.find('_')
    233     if pos >= 0:
    234         territory = loc[pos:]
    235         loc = loc[:pos]
    236         mask |= COMPONENT_TERRITORY
    237     else:
    238         territory = ''
    239     language = loc
    240     ret = []
    241     for i in range(mask+1):
    242         if not (i & ~mask):  # if all components for this combo exist ...
    243             val = language
    244             if i & COMPONENT_TERRITORY: val += territory
    245             if i & COMPONENT_CODESET:   val += codeset
    246             if i & COMPONENT_MODIFIER:  val += modifier
    247             ret.append(val)
    248     ret.reverse()
    249     return ret
    250 
    251 
    252 
    253 class NullTranslations:
    254     def __init__(self, fp=None):
    255         self._info = {}
    256         self._charset = None
    257         self._output_charset = None
    258         self._fallback = None
    259         if fp is not None:
    260             self._parse(fp)
    261 
    262     def _parse(self, fp):
    263         pass
    264 
    265     def add_fallback(self, fallback):
    266         if self._fallback:
    267             self._fallback.add_fallback(fallback)
    268         else:
    269             self._fallback = fallback
    270 
    271     def gettext(self, message):
    272         if self._fallback:
    273             return self._fallback.gettext(message)
    274         return message
    275 
    276     def lgettext(self, message):
    277         if self._fallback:
    278             return self._fallback.lgettext(message)
    279         if self._output_charset:
    280             return message.encode(self._output_charset)
    281         return message.encode(locale.getpreferredencoding())
    282 
    283     def ngettext(self, msgid1, msgid2, n):
    284         if self._fallback:
    285             return self._fallback.ngettext(msgid1, msgid2, n)
    286         if n == 1:
    287             return msgid1
    288         else:
    289             return msgid2
    290 
    291     def lngettext(self, msgid1, msgid2, n):
    292         if self._fallback:
    293             return self._fallback.lngettext(msgid1, msgid2, n)
    294         if n == 1:
    295             tmsg = msgid1
    296         else:
    297             tmsg = msgid2
    298         if self._output_charset:
    299             return tmsg.encode(self._output_charset)
    300         return tmsg.encode(locale.getpreferredencoding())
    301 
    302     def info(self):
    303         return self._info
    304 
    305     def charset(self):
    306         return self._charset
    307 
    308     def output_charset(self):
    309         return self._output_charset
    310 
    311     def set_output_charset(self, charset):
    312         self._output_charset = charset
    313 
    314     def install(self, names=None):
    315         import builtins
    316         builtins.__dict__['_'] = self.gettext
    317         if hasattr(names, "__contains__"):
    318             if "gettext" in names:
    319                 builtins.__dict__['gettext'] = builtins.__dict__['_']
    320             if "ngettext" in names:
    321                 builtins.__dict__['ngettext'] = self.ngettext
    322             if "lgettext" in names:
    323                 builtins.__dict__['lgettext'] = self.lgettext
    324             if "lngettext" in names:
    325                 builtins.__dict__['lngettext'] = self.lngettext
    326 
    327 
    328 class GNUTranslations(NullTranslations):
    329     # Magic number of .mo files
    330     LE_MAGIC = 0x950412de
    331     BE_MAGIC = 0xde120495
    332 
    333     # Acceptable .mo versions
    334     VERSIONS = (0, 1)
    335 
    336     def _get_versions(self, version):
    337         """Returns a tuple of major version, minor version"""
    338         return (version >> 16, version & 0xffff)
    339 
    340     def _parse(self, fp):
    341         """Override this method to support alternative .mo formats."""
    342         # Delay struct import for speeding up gettext import when .mo files
    343         # are not used.
    344         from struct import unpack
    345         filename = getattr(fp, 'name', '')
    346         # Parse the .mo file header, which consists of 5 little endian 32
    347         # bit words.
    348         self._catalog = catalog = {}
    349         self.plural = lambda n: int(n != 1) # germanic plural by default
    350         buf = fp.read()
    351         buflen = len(buf)
    352         # Are we big endian or little endian?
    353         magic = unpack('<I', buf[:4])[0]
    354         if magic == self.LE_MAGIC:
    355             version, msgcount, masteridx, transidx = unpack('<4I', buf[4:20])
    356             ii = '<II'
    357         elif magic == self.BE_MAGIC:
    358             version, msgcount, masteridx, transidx = unpack('>4I', buf[4:20])
    359             ii = '>II'
    360         else:
    361             raise OSError(0, 'Bad magic number', filename)
    362 
    363         major_version, minor_version = self._get_versions(version)
    364 
    365         if major_version not in self.VERSIONS:
    366             raise OSError(0, 'Bad version number ' + str(major_version), filename)
    367 
    368         # Now put all messages from the .mo file buffer into the catalog
    369         # dictionary.
    370         for i in range(0, msgcount):
    371             mlen, moff = unpack(ii, buf[masteridx:masteridx+8])
    372             mend = moff + mlen
    373             tlen, toff = unpack(ii, buf[transidx:transidx+8])
    374             tend = toff + tlen
    375             if mend < buflen and tend < buflen:
    376                 msg = buf[moff:mend]
    377                 tmsg = buf[toff:tend]
    378             else:
    379                 raise OSError(0, 'File is corrupt', filename)
    380             # See if we're looking at GNU .mo conventions for metadata
    381             if mlen == 0:
    382                 # Catalog description
    383                 lastk = None
    384                 for b_item in tmsg.split(b'\n'):
    385                     item = b_item.decode().strip()
    386                     if not item:
    387                         continue
    388                     k = v = None
    389                     if ':' in item:
    390                         k, v = item.split(':', 1)
    391                         k = k.strip().lower()
    392                         v = v.strip()
    393                         self._info[k] = v
    394                         lastk = k
    395                     elif lastk:
    396                         self._info[lastk] += '\n' + item
    397                     if k == 'content-type':
    398                         self._charset = v.split('charset=')[1]
    399                     elif k == 'plural-forms':
    400                         v = v.split(';')
    401                         plural = v[1].split('plural=')[1]
    402                         self.plural = c2py(plural)
    403             # Note: we unconditionally convert both msgids and msgstrs to
    404             # Unicode using the character encoding specified in the charset
    405             # parameter of the Content-Type header.  The gettext documentation
    406             # strongly encourages msgids to be us-ascii, but some applications
    407             # require alternative encodings (e.g. Zope's ZCML and ZPT).  For
    408             # traditional gettext applications, the msgid conversion will
    409             # cause no problems since us-ascii should always be a subset of
    410             # the charset encoding.  We may want to fall back to 8-bit msgids
    411             # if the Unicode conversion fails.
    412             charset = self._charset or 'ascii'
    413             if b'\x00' in msg:
    414                 # Plural forms
    415                 msgid1, msgid2 = msg.split(b'\x00')
    416                 tmsg = tmsg.split(b'\x00')
    417                 msgid1 = str(msgid1, charset)
    418                 for i, x in enumerate(tmsg):
    419                     catalog[(msgid1, i)] = str(x, charset)
    420             else:
    421                 catalog[str(msg, charset)] = str(tmsg, charset)
    422             # advance to next entry in the seek tables
    423             masteridx += 8
    424             transidx += 8
    425 
    426     def lgettext(self, message):
    427         missing = object()
    428         tmsg = self._catalog.get(message, missing)
    429         if tmsg is missing:
    430             if self._fallback:
    431                 return self._fallback.lgettext(message)
    432             tmsg = message
    433         if self._output_charset:
    434             return tmsg.encode(self._output_charset)
    435         return tmsg.encode(locale.getpreferredencoding())
    436 
    437     def lngettext(self, msgid1, msgid2, n):
    438         try:
    439             tmsg = self._catalog[(msgid1, self.plural(n))]
    440         except KeyError:
    441             if self._fallback:
    442                 return self._fallback.lngettext(msgid1, msgid2, n)
    443             if n == 1:
    444                 tmsg = msgid1
    445             else:
    446                 tmsg = msgid2
    447         if self._output_charset:
    448             return tmsg.encode(self._output_charset)
    449         return tmsg.encode(locale.getpreferredencoding())
    450 
    451     def gettext(self, message):
    452         missing = object()
    453         tmsg = self._catalog.get(message, missing)
    454         if tmsg is missing:
    455             if self._fallback:
    456                 return self._fallback.gettext(message)
    457             return message
    458         return tmsg
    459 
    460     def ngettext(self, msgid1, msgid2, n):
    461         try:
    462             tmsg = self._catalog[(msgid1, self.plural(n))]
    463         except KeyError:
    464             if self._fallback:
    465                 return self._fallback.ngettext(msgid1, msgid2, n)
    466             if n == 1:
    467                 tmsg = msgid1
    468             else:
    469                 tmsg = msgid2
    470         return tmsg
    471 
    472 
    473 # Locate a .mo file using the gettext strategy
    474 def find(domain, localedir=None, languages=None, all=False):
    475     # Get some reasonable defaults for arguments that were not supplied
    476     if localedir is None:
    477         localedir = _default_localedir
    478     if languages is None:
    479         languages = []
    480         for envar in ('LANGUAGE', 'LC_ALL', 'LC_MESSAGES', 'LANG'):
    481             val = os.environ.get(envar)
    482             if val:
    483                 languages = val.split(':')
    484                 break
    485         if 'C' not in languages:
    486             languages.append('C')
    487     # now normalize and expand the languages
    488     nelangs = []
    489     for lang in languages:
    490         for nelang in _expand_lang(lang):
    491             if nelang not in nelangs:
    492                 nelangs.append(nelang)
    493     # select a language
    494     if all:
    495         result = []
    496     else:
    497         result = None
    498     for lang in nelangs:
    499         if lang == 'C':
    500             break
    501         mofile = os.path.join(localedir, lang, 'LC_MESSAGES', '%s.mo' % domain)
    502         if os.path.exists(mofile):
    503             if all:
    504                 result.append(mofile)
    505             else:
    506                 return mofile
    507     return result
    508 
    509 
    510 
    511 # a mapping between absolute .mo file path and Translation object
    512 _translations = {}
    513 
    514 def translation(domain, localedir=None, languages=None,
    515                 class_=None, fallback=False, codeset=None):
    516     if class_ is None:
    517         class_ = GNUTranslations
    518     mofiles = find(domain, localedir, languages, all=True)
    519     if not mofiles:
    520         if fallback:
    521             return NullTranslations()
    522         from errno import ENOENT
    523         raise FileNotFoundError(ENOENT,
    524                                 'No translation file found for domain', domain)
    525     # Avoid opening, reading, and parsing the .mo file after it's been done
    526     # once.
    527     result = None
    528     for mofile in mofiles:
    529         key = (class_, os.path.abspath(mofile))
    530         t = _translations.get(key)
    531         if t is None:
    532             with open(mofile, 'rb') as fp:
    533                 t = _translations.setdefault(key, class_(fp))
    534         # Copy the translation object to allow setting fallbacks and
    535         # output charset. All other instance data is shared with the
    536         # cached object.
    537         # Delay copy import for speeding up gettext import when .mo files
    538         # are not used.
    539         import copy
    540         t = copy.copy(t)
    541         if codeset:
    542             t.set_output_charset(codeset)
    543         if result is None:
    544             result = t
    545         else:
    546             result.add_fallback(t)
    547     return result
    548 
    549 
    550 def install(domain, localedir=None, codeset=None, names=None):
    551     t = translation(domain, localedir, fallback=True, codeset=codeset)
    552     t.install(names)
    553 
    554 
    555 
# a mapping between domains and locale directories, filled in by
# bindtextdomain()
_localedirs = {}
# a mapping between domains and codesets, filled in by
# bind_textdomain_codeset()
_localecodesets = {}
# current global domain, `messages' used for compatibility w/ GNU gettext
_current_domain = 'messages'
    562 
    563 
    564 def textdomain(domain=None):
    565     global _current_domain
    566     if domain is not None:
    567         _current_domain = domain
    568     return _current_domain
    569 
    570 
    571 def bindtextdomain(domain, localedir=None):
    572     global _localedirs
    573     if localedir is not None:
    574         _localedirs[domain] = localedir
    575     return _localedirs.get(domain, _default_localedir)
    576 
    577 
    578 def bind_textdomain_codeset(domain, codeset=None):
    579     global _localecodesets
    580     if codeset is not None:
    581         _localecodesets[domain] = codeset
    582     return _localecodesets.get(domain)
    583 
    584 
    585 def dgettext(domain, message):
    586     try:
    587         t = translation(domain, _localedirs.get(domain, None),
    588                         codeset=_localecodesets.get(domain))
    589     except OSError:
    590         return message
    591     return t.gettext(message)
    592 
    593 def ldgettext(domain, message):
    594     codeset = _localecodesets.get(domain)
    595     try:
    596         t = translation(domain, _localedirs.get(domain, None), codeset=codeset)
    597     except OSError:
    598         return message.encode(codeset or locale.getpreferredencoding())
    599     return t.lgettext(message)
    600 
    601 def dngettext(domain, msgid1, msgid2, n):
    602     try:
    603         t = translation(domain, _localedirs.get(domain, None),
    604                         codeset=_localecodesets.get(domain))
    605     except OSError:
    606         if n == 1:
    607             return msgid1
    608         else:
    609             return msgid2
    610     return t.ngettext(msgid1, msgid2, n)
    611 
    612 def ldngettext(domain, msgid1, msgid2, n):
    613     codeset = _localecodesets.get(domain)
    614     try:
    615         t = translation(domain, _localedirs.get(domain, None), codeset=codeset)
    616     except OSError:
    617         if n == 1:
    618             tmsg = msgid1
    619         else:
    620             tmsg = msgid2
    621         return tmsg.encode(codeset or locale.getpreferredencoding())
    622     return t.lngettext(msgid1, msgid2, n)
    623 
    624 def gettext(message):
    625     return dgettext(_current_domain, message)
    626 
    627 def lgettext(message):
    628     return ldgettext(_current_domain, message)
    629 
    630 def ngettext(msgid1, msgid2, n):
    631     return dngettext(_current_domain, msgid1, msgid2, n)
    632 
    633 def lngettext(msgid1, msgid2, n):
    634     return ldngettext(_current_domain, msgid1, msgid2, n)
    635 
    636 # dcgettext() has been deemed unnecessary and is not implemented.
    637 
# James Henstridge's Catalog constructor from GNOME gettext.  Documented usage
# was:
#
#    import gettext
#    cat = gettext.Catalog(PACKAGE, localedir=LOCALEDIR)
#    _ = cat.gettext
#    print _('Hello World')

# The resulting catalog object currently doesn't support access through a
# dictionary API, which was supported (but apparently unused) in GNOME
# gettext.

# Catalog is simply another name for translation().
Catalog = translation
    651