Home | History | Annotate | Download | only in Lib
      1 """Internationalization and localization support.
      2 
      3 This module provides internationalization (I18N) and localization (L10N)
      4 support for your Python programs by providing an interface to the GNU gettext
      5 message catalog library.
      6 
      7 I18N refers to the operation by which a program is made aware of multiple
      8 languages.  L10N refers to the adaptation of your program, once
      9 internationalized, to the local language and cultural habits.
     10 
     11 """
     12 
     13 # This module represents the integration of work, contributions, feedback, and

     14 # suggestions from the following people:

     15 #

     16 # Martin von Loewis, who wrote the initial implementation of the underlying

     17 # C-based libintlmodule (later renamed _gettext), along with a skeletal

     18 # gettext.py implementation.

     19 #

     20 # Peter Funk, who wrote fintl.py, a fairly complete wrapper around intlmodule,

     21 # which also included a pure-Python implementation to read .mo files if

     22 # intlmodule wasn't available.

     23 #

     24 # James Henstridge, who also wrote a gettext.py module, which has some

     25 # interesting, but currently unsupported experimental features: the notion of

     26 # a Catalog class and instances, and the ability to add to a catalog file via

     27 # a Python API.

     28 #

     29 # Barry Warsaw integrated these modules, wrote the .install() API and code,

     30 # and conformed all C and Python code to Python's coding standards.

     31 #

     32 # Francois Pinard and Marc-Andre Lemburg also contributed valuably to this

     33 # module.

     34 #

     35 # J. David Ibanez implemented plural forms. Bruno Haible fixed some bugs.

     36 #

     37 # TODO:

     38 # - Lazy loading of .mo files.  Currently the entire catalog is loaded into

     39 #   memory, but that's probably bad for large translated programs.  Instead,

     40 #   the lexical sort of original strings in GNU .mo files should be exploited

     41 #   to do binary searches and lazy initializations.  Or you might want to use

     42 #   the undocumented double-hash algorithm for .mo files with hash tables, but

     43 #   you'll need to study the GNU gettext code to do this.

     44 #

     45 # - Support Solaris .mo file formats.  Unfortunately, we've been unable to

     46 #   find this format documented anywhere.

     47 
     48 
     49 import locale, copy, os, re, struct, sys
     50 from errno import ENOENT
     51 
     52 
     53 __all__ = ['NullTranslations', 'GNUTranslations', 'Catalog',
     54            'find', 'translation', 'install', 'textdomain', 'bindtextdomain',
     55            'dgettext', 'dngettext', 'gettext', 'ngettext',
     56            ]
     57 
     58 _default_localedir = os.path.join(sys.prefix, 'share', 'locale')
     59 
     60 
     61 def test(condition, true, false):
     62     """
     63     Implements the C expression:
     64 
     65       condition ? true : false
     66 
     67     Required to correctly interpret plural forms.
     68     """
     69     if condition:
     70         return true
     71     else:
     72         return false
     73 
     74 
     75 def c2py(plural):
     76     """Gets a C expression as used in PO files for plural forms and returns a
     77     Python lambda function that implements an equivalent expression.
     78     """
     79     # Security check, allow only the "n" identifier

     80     try:
     81         from cStringIO import StringIO
     82     except ImportError:
     83         from StringIO import StringIO
     84     import token, tokenize
     85     tokens = tokenize.generate_tokens(StringIO(plural).readline)
     86     try:
     87         danger = [x for x in tokens if x[0] == token.NAME and x[1] != 'n']
     88     except tokenize.TokenError:
     89         raise ValueError, \
     90               'plural forms expression error, maybe unbalanced parenthesis'
     91     else:
     92         if danger:
     93             raise ValueError, 'plural forms expression could be dangerous'
     94 
     95     # Replace some C operators by their Python equivalents

     96     plural = plural.replace('&&', ' and ')
     97     plural = plural.replace('||', ' or ')
     98 
     99     expr = re.compile(r'\!([^=])')
    100     plural = expr.sub(' not \\1', plural)
    101 
    102     # Regular expression and replacement function used to transform

    103     # "a?b:c" to "test(a,b,c)".

    104     expr = re.compile(r'(.*?)\?(.*?):(.*)')
    105     def repl(x):
    106         return "test(%s, %s, %s)" % (x.group(1), x.group(2),
    107                                      expr.sub(repl, x.group(3)))
    108 
    109     # Code to transform the plural expression, taking care of parentheses

    110     stack = ['']
    111     for c in plural:
    112         if c == '(':
    113             stack.append('')
    114         elif c == ')':
    115             if len(stack) == 1:
    116                 # Actually, we never reach this code, because unbalanced

    117                 # parentheses get caught in the security check at the

    118                 # beginning.

    119                 raise ValueError, 'unbalanced parenthesis in plural form'
    120             s = expr.sub(repl, stack.pop())
    121             stack[-1] += '(%s)' % s
    122         else:
    123             stack[-1] += c
    124     plural = expr.sub(repl, stack.pop())
    125 
    126     return eval('lambda n: int(%s)' % plural)
    127 
    128 
    129 
    130 def _expand_lang(locale):
    131     from locale import normalize
    132     locale = normalize(locale)
    133     COMPONENT_CODESET   = 1 << 0
    134     COMPONENT_TERRITORY = 1 << 1
    135     COMPONENT_MODIFIER  = 1 << 2
    136     # split up the locale into its base components

    137     mask = 0
    138     pos = locale.find('@')
    139     if pos >= 0:
    140         modifier = locale[pos:]
    141         locale = locale[:pos]
    142         mask |= COMPONENT_MODIFIER
    143     else:
    144         modifier = ''
    145     pos = locale.find('.')
    146     if pos >= 0:
    147         codeset = locale[pos:]
    148         locale = locale[:pos]
    149         mask |= COMPONENT_CODESET
    150     else:
    151         codeset = ''
    152     pos = locale.find('_')
    153     if pos >= 0:
    154         territory = locale[pos:]
    155         locale = locale[:pos]
    156         mask |= COMPONENT_TERRITORY
    157     else:
    158         territory = ''
    159     language = locale
    160     ret = []
    161     for i in range(mask+1):
    162         if not (i & ~mask):  # if all components for this combo exist ...

    163             val = language
    164             if i & COMPONENT_TERRITORY: val += territory
    165             if i & COMPONENT_CODESET:   val += codeset
    166             if i & COMPONENT_MODIFIER:  val += modifier
    167             ret.append(val)
    168     ret.reverse()
    169     return ret
    170 
    171 
    172 
    173 class NullTranslations:
    174     def __init__(self, fp=None):
    175         self._info = {}
    176         self._charset = None
    177         self._output_charset = None
    178         self._fallback = None
    179         if fp is not None:
    180             self._parse(fp)
    181 
    182     def _parse(self, fp):
    183         pass
    184 
    185     def add_fallback(self, fallback):
    186         if self._fallback:
    187             self._fallback.add_fallback(fallback)
    188         else:
    189             self._fallback = fallback
    190 
    191     def gettext(self, message):
    192         if self._fallback:
    193             return self._fallback.gettext(message)
    194         return message
    195 
    196     def lgettext(self, message):
    197         if self._fallback:
    198             return self._fallback.lgettext(message)
    199         return message
    200 
    201     def ngettext(self, msgid1, msgid2, n):
    202         if self._fallback:
    203             return self._fallback.ngettext(msgid1, msgid2, n)
    204         if n == 1:
    205             return msgid1
    206         else:
    207             return msgid2
    208 
    209     def lngettext(self, msgid1, msgid2, n):
    210         if self._fallback:
    211             return self._fallback.lngettext(msgid1, msgid2, n)
    212         if n == 1:
    213             return msgid1
    214         else:
    215             return msgid2
    216 
    217     def ugettext(self, message):
    218         if self._fallback:
    219             return self._fallback.ugettext(message)
    220         return unicode(message)
    221 
    222     def ungettext(self, msgid1, msgid2, n):
    223         if self._fallback:
    224             return self._fallback.ungettext(msgid1, msgid2, n)
    225         if n == 1:
    226             return unicode(msgid1)
    227         else:
    228             return unicode(msgid2)
    229 
    230     def info(self):
    231         return self._info
    232 
    233     def charset(self):
    234         return self._charset
    235 
    236     def output_charset(self):
    237         return self._output_charset
    238 
    239     def set_output_charset(self, charset):
    240         self._output_charset = charset
    241 
    242     def install(self, unicode=False, names=None):
    243         import __builtin__
    244         __builtin__.__dict__['_'] = unicode and self.ugettext or self.gettext
    245         if hasattr(names, "__contains__"):
    246             if "gettext" in names:
    247                 __builtin__.__dict__['gettext'] = __builtin__.__dict__['_']
    248             if "ngettext" in names:
    249                 __builtin__.__dict__['ngettext'] = (unicode and self.ungettext
    250                                                              or self.ngettext)
    251             if "lgettext" in names:
    252                 __builtin__.__dict__['lgettext'] = self.lgettext
    253             if "lngettext" in names:
    254                 __builtin__.__dict__['lngettext'] = self.lngettext
    255 
    256 
    257 class GNUTranslations(NullTranslations):
    258     # Magic number of .mo files

    259     LE_MAGIC = 0x950412deL
    260     BE_MAGIC = 0xde120495L
    261 
    262     def _parse(self, fp):
    263         """Override this method to support alternative .mo formats."""
    264         unpack = struct.unpack
    265         filename = getattr(fp, 'name', '')
    266         # Parse the .mo file header, which consists of 5 little endian 32

    267         # bit words.

    268         self._catalog = catalog = {}
    269         self.plural = lambda n: int(n != 1) # germanic plural by default

    270         buf = fp.read()
    271         buflen = len(buf)
    272         # Are we big endian or little endian?

    273         magic = unpack('<I', buf[:4])[0]
    274         if magic == self.LE_MAGIC:
    275             version, msgcount, masteridx, transidx = unpack('<4I', buf[4:20])
    276             ii = '<II'
    277         elif magic == self.BE_MAGIC:
    278             version, msgcount, masteridx, transidx = unpack('>4I', buf[4:20])
    279             ii = '>II'
    280         else:
    281             raise IOError(0, 'Bad magic number', filename)
    282         # Now put all messages from the .mo file buffer into the catalog

    283         # dictionary.

    284         for i in xrange(0, msgcount):
    285             mlen, moff = unpack(ii, buf[masteridx:masteridx+8])
    286             mend = moff + mlen
    287             tlen, toff = unpack(ii, buf[transidx:transidx+8])
    288             tend = toff + tlen
    289             if mend < buflen and tend < buflen:
    290                 msg = buf[moff:mend]
    291                 tmsg = buf[toff:tend]
    292             else:
    293                 raise IOError(0, 'File is corrupt', filename)
    294             # See if we're looking at GNU .mo conventions for metadata

    295             if mlen == 0:
    296                 # Catalog description

    297                 lastk = k = None
    298                 for item in tmsg.splitlines():
    299                     item = item.strip()
    300                     if not item:
    301                         continue
    302                     if ':' in item:
    303                         k, v = item.split(':', 1)
    304                         k = k.strip().lower()
    305                         v = v.strip()
    306                         self._info[k] = v
    307                         lastk = k
    308                     elif lastk:
    309                         self._info[lastk] += '\n' + item
    310                     if k == 'content-type':
    311                         self._charset = v.split('charset=')[1]
    312                     elif k == 'plural-forms':
    313                         v = v.split(';')
    314                         plural = v[1].split('plural=')[1]
    315                         self.plural = c2py(plural)
    316             # Note: we unconditionally convert both msgids and msgstrs to

    317             # Unicode using the character encoding specified in the charset

    318             # parameter of the Content-Type header.  The gettext documentation

    319             # strongly encourages msgids to be us-ascii, but some applications

    320             # require alternative encodings (e.g. Zope's ZCML and ZPT).  For

    321             # traditional gettext applications, the msgid conversion will

    322             # cause no problems since us-ascii should always be a subset of

    323             # the charset encoding.  We may want to fall back to 8-bit msgids

    324             # if the Unicode conversion fails.

    325             if '\x00' in msg:
    326                 # Plural forms

    327                 msgid1, msgid2 = msg.split('\x00')
    328                 tmsg = tmsg.split('\x00')
    329                 if self._charset:
    330                     msgid1 = unicode(msgid1, self._charset)
    331                     tmsg = [unicode(x, self._charset) for x in tmsg]
    332                 for i in range(len(tmsg)):
    333                     catalog[(msgid1, i)] = tmsg[i]
    334             else:
    335                 if self._charset:
    336                     msg = unicode(msg, self._charset)
    337                     tmsg = unicode(tmsg, self._charset)
    338                 catalog[msg] = tmsg
    339             # advance to next entry in the seek tables

    340             masteridx += 8
    341             transidx += 8
    342 
    343     def gettext(self, message):
    344         missing = object()
    345         tmsg = self._catalog.get(message, missing)
    346         if tmsg is missing:
    347             if self._fallback:
    348                 return self._fallback.gettext(message)
    349             return message
    350         # Encode the Unicode tmsg back to an 8-bit string, if possible

    351         if self._output_charset:
    352             return tmsg.encode(self._output_charset)
    353         elif self._charset:
    354             return tmsg.encode(self._charset)
    355         return tmsg
    356 
    357     def lgettext(self, message):
    358         missing = object()
    359         tmsg = self._catalog.get(message, missing)
    360         if tmsg is missing:
    361             if self._fallback:
    362                 return self._fallback.lgettext(message)
    363             return message
    364         if self._output_charset:
    365             return tmsg.encode(self._output_charset)
    366         return tmsg.encode(locale.getpreferredencoding())
    367 
    368     def ngettext(self, msgid1, msgid2, n):
    369         try:
    370             tmsg = self._catalog[(msgid1, self.plural(n))]
    371             if self._output_charset:
    372                 return tmsg.encode(self._output_charset)
    373             elif self._charset:
    374                 return tmsg.encode(self._charset)
    375             return tmsg
    376         except KeyError:
    377             if self._fallback:
    378                 return self._fallback.ngettext(msgid1, msgid2, n)
    379             if n == 1:
    380                 return msgid1
    381             else:
    382                 return msgid2
    383 
    384     def lngettext(self, msgid1, msgid2, n):
    385         try:
    386             tmsg = self._catalog[(msgid1, self.plural(n))]
    387             if self._output_charset:
    388                 return tmsg.encode(self._output_charset)
    389             return tmsg.encode(locale.getpreferredencoding())
    390         except KeyError:
    391             if self._fallback:
    392                 return self._fallback.lngettext(msgid1, msgid2, n)
    393             if n == 1:
    394                 return msgid1
    395             else:
    396                 return msgid2
    397 
    398     def ugettext(self, message):
    399         missing = object()
    400         tmsg = self._catalog.get(message, missing)
    401         if tmsg is missing:
    402             if self._fallback:
    403                 return self._fallback.ugettext(message)
    404             return unicode(message)
    405         return tmsg
    406 
    407     def ungettext(self, msgid1, msgid2, n):
    408         try:
    409             tmsg = self._catalog[(msgid1, self.plural(n))]
    410         except KeyError:
    411             if self._fallback:
    412                 return self._fallback.ungettext(msgid1, msgid2, n)
    413             if n == 1:
    414                 tmsg = unicode(msgid1)
    415             else:
    416                 tmsg = unicode(msgid2)
    417         return tmsg
    418 
    419 
    420 # Locate a .mo file using the gettext strategy

    421 def find(domain, localedir=None, languages=None, all=0):
    422     # Get some reasonable defaults for arguments that were not supplied

    423     if localedir is None:
    424         localedir = _default_localedir
    425     if languages is None:
    426         languages = []
    427         for envar in ('LANGUAGE', 'LC_ALL', 'LC_MESSAGES', 'LANG'):
    428             val = os.environ.get(envar)
    429             if val:
    430                 languages = val.split(':')
    431                 break
    432         if 'C' not in languages:
    433             languages.append('C')
    434     # now normalize and expand the languages

    435     nelangs = []
    436     for lang in languages:
    437         for nelang in _expand_lang(lang):
    438             if nelang not in nelangs:
    439                 nelangs.append(nelang)
    440     # select a language

    441     if all:
    442         result = []
    443     else:
    444         result = None
    445     for lang in nelangs:
    446         if lang == 'C':
    447             break
    448         mofile = os.path.join(localedir, lang, 'LC_MESSAGES', '%s.mo' % domain)
    449         if os.path.exists(mofile):
    450             if all:
    451                 result.append(mofile)
    452             else:
    453                 return mofile
    454     return result
    455 
    456 
    457 
    458 # a mapping between absolute .mo file path and Translation object

    459 _translations = {}
    460 
    461 def translation(domain, localedir=None, languages=None,
    462                 class_=None, fallback=False, codeset=None):
    463     if class_ is None:
    464         class_ = GNUTranslations
    465     mofiles = find(domain, localedir, languages, all=1)
    466     if not mofiles:
    467         if fallback:
    468             return NullTranslations()
    469         raise IOError(ENOENT, 'No translation file found for domain', domain)
    470     # Avoid opening, reading, and parsing the .mo file after it's been done

    471     # once.

    472     result = None
    473     for mofile in mofiles:
    474         key = (class_, os.path.abspath(mofile))
    475         t = _translations.get(key)
    476         if t is None:
    477             with open(mofile, 'rb') as fp:
    478                 t = _translations.setdefault(key, class_(fp))
    479         # Copy the translation object to allow setting fallbacks and

    480         # output charset. All other instance data is shared with the

    481         # cached object.

    482         t = copy.copy(t)
    483         if codeset:
    484             t.set_output_charset(codeset)
    485         if result is None:
    486             result = t
    487         else:
    488             result.add_fallback(t)
    489     return result
    490 
    491 
    492 def install(domain, localedir=None, unicode=False, codeset=None, names=None):
    493     t = translation(domain, localedir, fallback=True, codeset=codeset)
    494     t.install(unicode, names)
    495 
    496 
    497 
    498 # a mapping b/w domains and locale directories

    499 _localedirs = {}
    500 # a mapping b/w domains and codesets

    501 _localecodesets = {}
    502 # current global domain, `messages' used for compatibility w/ GNU gettext

    503 _current_domain = 'messages'
    504 
    505 
    506 def textdomain(domain=None):
    507     global _current_domain
    508     if domain is not None:
    509         _current_domain = domain
    510     return _current_domain
    511 
    512 
    513 def bindtextdomain(domain, localedir=None):
    514     global _localedirs
    515     if localedir is not None:
    516         _localedirs[domain] = localedir
    517     return _localedirs.get(domain, _default_localedir)
    518 
    519 
    520 def bind_textdomain_codeset(domain, codeset=None):
    521     global _localecodesets
    522     if codeset is not None:
    523         _localecodesets[domain] = codeset
    524     return _localecodesets.get(domain)
    525 
    526 
    527 def dgettext(domain, message):
    528     try:
    529         t = translation(domain, _localedirs.get(domain, None),
    530                         codeset=_localecodesets.get(domain))
    531     except IOError:
    532         return message
    533     return t.gettext(message)
    534 
    535 def ldgettext(domain, message):
    536     try:
    537         t = translation(domain, _localedirs.get(domain, None),
    538                         codeset=_localecodesets.get(domain))
    539     except IOError:
    540         return message
    541     return t.lgettext(message)
    542 
    543 def dngettext(domain, msgid1, msgid2, n):
    544     try:
    545         t = translation(domain, _localedirs.get(domain, None),
    546                         codeset=_localecodesets.get(domain))
    547     except IOError:
    548         if n == 1:
    549             return msgid1
    550         else:
    551             return msgid2
    552     return t.ngettext(msgid1, msgid2, n)
    553 
    554 def ldngettext(domain, msgid1, msgid2, n):
    555     try:
    556         t = translation(domain, _localedirs.get(domain, None),
    557                         codeset=_localecodesets.get(domain))
    558     except IOError:
    559         if n == 1:
    560             return msgid1
    561         else:
    562             return msgid2
    563     return t.lngettext(msgid1, msgid2, n)
    564 
    565 def gettext(message):
    566     return dgettext(_current_domain, message)
    567 
    568 def lgettext(message):
    569     return ldgettext(_current_domain, message)
    570 
    571 def ngettext(msgid1, msgid2, n):
    572     return dngettext(_current_domain, msgid1, msgid2, n)
    573 
    574 def lngettext(msgid1, msgid2, n):
    575     return ldngettext(_current_domain, msgid1, msgid2, n)
    576 
    577 # dcgettext() has been deemed unnecessary and is not implemented.

    578 
    579 # James Henstridge's Catalog constructor from GNOME gettext.  Documented usage

    580 # was:

    581 #

    582 #    import gettext

    583 #    cat = gettext.Catalog(PACKAGE, localedir=LOCALEDIR)

    584 #    _ = cat.gettext

    585 #    print _('Hello World')

    586 
    587 # The resulting catalog object currently don't support access through a

    588 # dictionary API, which was supported (but apparently unused) in GNOME

    589 # gettext.

    590 
    591 Catalog = translation
    592