Home | History | Annotate | Download | only in email
      1 # Copyright (C) 2001-2007 Python Software Foundation
      2 # Author: Ben Gertzfield, Barry Warsaw
# Contact: email-sig@python.org
      4 
      5 __all__ = [
      6     'Charset',
      7     'add_alias',
      8     'add_charset',
      9     'add_codec',
     10     ]
     11 
     12 from functools import partial
     13 
     14 import email.base64mime
     15 import email.quoprimime
     16 
     17 from email import errors
     18 from email.encoders import encode_7or8bit
     19 
     20 
     21 
     23 # Flags for types of header encodings
     24 QP          = 1 # Quoted-Printable
     25 BASE64      = 2 # Base64
     26 SHORTEST    = 3 # the shorter of QP and base64, but only for headers
     27 
     28 # In "=?charset?q?hello_world?=", the =?, ?q?, and ?= add up to 7
     29 RFC2047_CHROME_LEN = 7
     30 
     31 DEFAULT_CHARSET = 'us-ascii'
     32 UNKNOWN8BIT = 'unknown-8bit'
     33 EMPTYSTRING = ''
     34 
     35 
     36 
     38 # Defaults
     39 CHARSETS = {
     40     # input        header enc  body enc output conv
     41     'iso-8859-1':  (QP,        QP,      None),
     42     'iso-8859-2':  (QP,        QP,      None),
     43     'iso-8859-3':  (QP,        QP,      None),
     44     'iso-8859-4':  (QP,        QP,      None),
     45     # iso-8859-5 is Cyrillic, and not especially used
     46     # iso-8859-6 is Arabic, also not particularly used
     47     # iso-8859-7 is Greek, QP will not make it readable
     48     # iso-8859-8 is Hebrew, QP will not make it readable
     49     'iso-8859-9':  (QP,        QP,      None),
     50     'iso-8859-10': (QP,        QP,      None),
     51     # iso-8859-11 is Thai, QP will not make it readable
     52     'iso-8859-13': (QP,        QP,      None),
     53     'iso-8859-14': (QP,        QP,      None),
     54     'iso-8859-15': (QP,        QP,      None),
     55     'iso-8859-16': (QP,        QP,      None),
     56     'windows-1252':(QP,        QP,      None),
     57     'viscii':      (QP,        QP,      None),
     58     'us-ascii':    (None,      None,    None),
     59     'big5':        (BASE64,    BASE64,  None),
     60     'gb2312':      (BASE64,    BASE64,  None),
     61     'euc-jp':      (BASE64,    None,    'iso-2022-jp'),
     62     'shift_jis':   (BASE64,    None,    'iso-2022-jp'),
     63     'iso-2022-jp': (BASE64,    None,    None),
     64     'koi8-r':      (BASE64,    BASE64,  None),
     65     'utf-8':       (SHORTEST,  BASE64, 'utf-8'),
     66     }
     67 
     68 # Aliases for other commonly-used names for character sets.  Map
     69 # them to the real ones used in email.
     70 ALIASES = {
     71     'latin_1': 'iso-8859-1',
     72     'latin-1': 'iso-8859-1',
     73     'latin_2': 'iso-8859-2',
     74     'latin-2': 'iso-8859-2',
     75     'latin_3': 'iso-8859-3',
     76     'latin-3': 'iso-8859-3',
     77     'latin_4': 'iso-8859-4',
     78     'latin-4': 'iso-8859-4',
     79     'latin_5': 'iso-8859-9',
     80     'latin-5': 'iso-8859-9',
     81     'latin_6': 'iso-8859-10',
     82     'latin-6': 'iso-8859-10',
     83     'latin_7': 'iso-8859-13',
     84     'latin-7': 'iso-8859-13',
     85     'latin_8': 'iso-8859-14',
     86     'latin-8': 'iso-8859-14',
     87     'latin_9': 'iso-8859-15',
     88     'latin-9': 'iso-8859-15',
     89     'latin_10':'iso-8859-16',
     90     'latin-10':'iso-8859-16',
     91     'cp949':   'ks_c_5601-1987',
     92     'euc_jp':  'euc-jp',
     93     'euc_kr':  'euc-kr',
     94     'ascii':   'us-ascii',
     95     }
     96 
     97 
     98 # Map charsets to their Unicode codec strings.
     99 CODEC_MAP = {
    100     'gb2312':      'eucgb2312_cn',
    101     'big5':        'big5_tw',
    102     # Hack: We don't want *any* conversion for stuff marked us-ascii, as all
    103     # sorts of garbage might be sent to us in the guise of 7-bit us-ascii.
    104     # Let that stuff pass through without conversion to/from Unicode.
    105     'us-ascii':    None,
    106     }
    107 
    108 
    109 
    111 # Convenience functions for extending the above mappings
    112 def add_charset(charset, header_enc=None, body_enc=None, output_charset=None):
    113     """Add character set properties to the global registry.
    114 
    115     charset is the input character set, and must be the canonical name of a
    116     character set.
    117 
    118     Optional header_enc and body_enc is either Charset.QP for
    119     quoted-printable, Charset.BASE64 for base64 encoding, Charset.SHORTEST for
    120     the shortest of qp or base64 encoding, or None for no encoding.  SHORTEST
    121     is only valid for header_enc.  It describes how message headers and
    122     message bodies in the input charset are to be encoded.  Default is no
    123     encoding.
    124 
    125     Optional output_charset is the character set that the output should be
    126     in.  Conversions will proceed from input charset, to Unicode, to the
    127     output charset when the method Charset.convert() is called.  The default
    128     is to output in the same character set as the input.
    129 
    130     Both input_charset and output_charset must have Unicode codec entries in
    131     the module's charset-to-codec mapping; use add_codec(charset, codecname)
    132     to add codecs the module does not know about.  See the codecs module's
    133     documentation for more information.
    134     """
    135     if body_enc == SHORTEST:
    136         raise ValueError('SHORTEST not allowed for body_enc')
    137     CHARSETS[charset] = (header_enc, body_enc, output_charset)
    138 
    139 
    140 def add_alias(alias, canonical):
    141     """Add a character set alias.
    142 
    143     alias is the alias name, e.g. latin-1
    144     canonical is the character set's canonical name, e.g. iso-8859-1
    145     """
    146     ALIASES[alias] = canonical
    147 
    148 
    149 def add_codec(charset, codecname):
    150     """Add a codec that map characters in the given charset to/from Unicode.
    151 
    152     charset is the canonical name of a character set.  codecname is the name
    153     of a Python codec, as appropriate for the second argument to the unicode()
    154     built-in, or to the encode() method of a Unicode string.
    155     """
    156     CODEC_MAP[charset] = codecname
    157 
    158 
    159 
    161 # Convenience function for encoding strings, taking into account
    162 # that they might be unknown-8bit (ie: have surrogate-escaped bytes)
    163 def _encode(string, codec):
    164     if codec == UNKNOWN8BIT:
    165         return string.encode('ascii', 'surrogateescape')
    166     else:
    167         return string.encode(codec)
    168 
    169 
    170 
    172 class Charset:
    173     """Map character sets to their email properties.
    174 
    175     This class provides information about the requirements imposed on email
    176     for a specific character set.  It also provides convenience routines for
    177     converting between character sets, given the availability of the
    178     applicable codecs.  Given a character set, it will do its best to provide
    179     information on how to use that character set in an email in an
    180     RFC-compliant way.
    181 
    182     Certain character sets must be encoded with quoted-printable or base64
    183     when used in email headers or bodies.  Certain character sets must be
    184     converted outright, and are not allowed in email.  Instances of this
    185     module expose the following information about a character set:
    186 
    187     input_charset: The initial character set specified.  Common aliases
    188                    are converted to their `official' email names (e.g. latin_1
    189                    is converted to iso-8859-1).  Defaults to 7-bit us-ascii.
    190 
    191     header_encoding: If the character set must be encoded before it can be
    192                      used in an email header, this attribute will be set to
    193                      Charset.QP (for quoted-printable), Charset.BASE64 (for
    194                      base64 encoding), or Charset.SHORTEST for the shortest of
    195                      QP or BASE64 encoding.  Otherwise, it will be None.
    196 
    197     body_encoding: Same as header_encoding, but describes the encoding for the
    198                    mail message's body, which indeed may be different than the
    199                    header encoding.  Charset.SHORTEST is not allowed for
    200                    body_encoding.
    201 
    202     output_charset: Some character sets must be converted before they can be
    203                     used in email headers or bodies.  If the input_charset is
    204                     one of them, this attribute will contain the name of the
    205                     charset output will be converted to.  Otherwise, it will
    206                     be None.
    207 
    208     input_codec: The name of the Python codec used to convert the
    209                  input_charset to Unicode.  If no conversion codec is
    210                  necessary, this attribute will be None.
    211 
    212     output_codec: The name of the Python codec used to convert Unicode
    213                   to the output_charset.  If no conversion codec is necessary,
    214                   this attribute will have the same value as the input_codec.
    215     """
    216     def __init__(self, input_charset=DEFAULT_CHARSET):
    217         # RFC 2046, $4.1.2 says charsets are not case sensitive.  We coerce to
    218         # unicode because its .lower() is locale insensitive.  If the argument
    219         # is already a unicode, we leave it at that, but ensure that the
    220         # charset is ASCII, as the standard (RFC XXX) requires.
    221         try:
    222             if isinstance(input_charset, str):
    223                 input_charset.encode('ascii')
    224             else:
    225                 input_charset = str(input_charset, 'ascii')
    226         except UnicodeError:
    227             raise errors.CharsetError(input_charset)
    228         input_charset = input_charset.lower()
    229         # Set the input charset after filtering through the aliases
    230         self.input_charset = ALIASES.get(input_charset, input_charset)
    231         # We can try to guess which encoding and conversion to use by the
    232         # charset_map dictionary.  Try that first, but let the user override
    233         # it.
    234         henc, benc, conv = CHARSETS.get(self.input_charset,
    235                                         (SHORTEST, BASE64, None))
    236         if not conv:
    237             conv = self.input_charset
    238         # Set the attributes, allowing the arguments to override the default.
    239         self.header_encoding = henc
    240         self.body_encoding = benc
    241         self.output_charset = ALIASES.get(conv, conv)
    242         # Now set the codecs.  If one isn't defined for input_charset,
    243         # guess and try a Unicode codec with the same name as input_codec.
    244         self.input_codec = CODEC_MAP.get(self.input_charset,
    245                                          self.input_charset)
    246         self.output_codec = CODEC_MAP.get(self.output_charset,
    247                                           self.output_charset)
    248 
    249     def __str__(self):
    250         return self.input_charset.lower()
    251 
    252     __repr__ = __str__
    253 
    254     def __eq__(self, other):
    255         return str(self) == str(other).lower()
    256 
    257     def get_body_encoding(self):
    258         """Return the content-transfer-encoding used for body encoding.
    259 
    260         This is either the string `quoted-printable' or `base64' depending on
    261         the encoding used, or it is a function in which case you should call
    262         the function with a single argument, the Message object being
    263         encoded.  The function should then set the Content-Transfer-Encoding
    264         header itself to whatever is appropriate.
    265 
    266         Returns "quoted-printable" if self.body_encoding is QP.
    267         Returns "base64" if self.body_encoding is BASE64.
    268         Returns conversion function otherwise.
    269         """
    270         assert self.body_encoding != SHORTEST
    271         if self.body_encoding == QP:
    272             return 'quoted-printable'
    273         elif self.body_encoding == BASE64:
    274             return 'base64'
    275         else:
    276             return encode_7or8bit
    277 
    278     def get_output_charset(self):
    279         """Return the output character set.
    280 
    281         This is self.output_charset if that is not None, otherwise it is
    282         self.input_charset.
    283         """
    284         return self.output_charset or self.input_charset
    285 
    286     def header_encode(self, string):
    287         """Header-encode a string by converting it first to bytes.
    288 
    289         The type of encoding (base64 or quoted-printable) will be based on
    290         this charset's `header_encoding`.
    291 
    292         :param string: A unicode string for the header.  It must be possible
    293             to encode this string to bytes using the character set's
    294             output codec.
    295         :return: The encoded string, with RFC 2047 chrome.
    296         """
    297         codec = self.output_codec or 'us-ascii'
    298         header_bytes = _encode(string, codec)
    299         # 7bit/8bit encodings return the string unchanged (modulo conversions)
    300         encoder_module = self._get_encoder(header_bytes)
    301         if encoder_module is None:
    302             return string
    303         return encoder_module.header_encode(header_bytes, codec)
    304 
    305     def header_encode_lines(self, string, maxlengths):
    306         """Header-encode a string by converting it first to bytes.
    307 
    308         This is similar to `header_encode()` except that the string is fit
    309         into maximum line lengths as given by the argument.
    310 
    311         :param string: A unicode string for the header.  It must be possible
    312             to encode this string to bytes using the character set's
    313             output codec.
    314         :param maxlengths: Maximum line length iterator.  Each element
    315             returned from this iterator will provide the next maximum line
    316             length.  This parameter is used as an argument to built-in next()
    317             and should never be exhausted.  The maximum line lengths should
    318             not count the RFC 2047 chrome.  These line lengths are only a
    319             hint; the splitter does the best it can.
    320         :return: Lines of encoded strings, each with RFC 2047 chrome.
    321         """
    322         # See which encoding we should use.
    323         codec = self.output_codec or 'us-ascii'
    324         header_bytes = _encode(string, codec)
    325         encoder_module = self._get_encoder(header_bytes)
    326         encoder = partial(encoder_module.header_encode, charset=codec)
    327         # Calculate the number of characters that the RFC 2047 chrome will
    328         # contribute to each line.
    329         charset = self.get_output_charset()
    330         extra = len(charset) + RFC2047_CHROME_LEN
    331         # Now comes the hard part.  We must encode bytes but we can't split on
    332         # bytes because some character sets are variable length and each
    333         # encoded word must stand on its own.  So the problem is you have to
    334         # encode to bytes to figure out this word's length, but you must split
    335         # on characters.  This causes two problems: first, we don't know how
    336         # many octets a specific substring of unicode characters will get
    337         # encoded to, and second, we don't know how many ASCII characters
    338         # those octets will get encoded to.  Unless we try it.  Which seems
    339         # inefficient.  In the interest of being correct rather than fast (and
    340         # in the hope that there will be few encoded headers in any such
    341         # message), brute force it. :(
    342         lines = []
    343         current_line = []
    344         maxlen = next(maxlengths) - extra
    345         for character in string:
    346             current_line.append(character)
    347             this_line = EMPTYSTRING.join(current_line)
    348             length = encoder_module.header_length(_encode(this_line, charset))
    349             if length > maxlen:
    350                 # This last character doesn't fit so pop it off.
    351                 current_line.pop()
    352                 # Does nothing fit on the first line?
    353                 if not lines and not current_line:
    354                     lines.append(None)
    355                 else:
    356                     separator = (' ' if lines else '')
    357                     joined_line = EMPTYSTRING.join(current_line)
    358                     header_bytes = _encode(joined_line, codec)
    359                     lines.append(encoder(header_bytes))
    360                 current_line = [character]
    361                 maxlen = next(maxlengths) - extra
    362         joined_line = EMPTYSTRING.join(current_line)
    363         header_bytes = _encode(joined_line, codec)
    364         lines.append(encoder(header_bytes))
    365         return lines
    366 
    367     def _get_encoder(self, header_bytes):
    368         if self.header_encoding == BASE64:
    369             return email.base64mime
    370         elif self.header_encoding == QP:
    371             return email.quoprimime
    372         elif self.header_encoding == SHORTEST:
    373             len64 = email.base64mime.header_length(header_bytes)
    374             lenqp = email.quoprimime.header_length(header_bytes)
    375             if len64 < lenqp:
    376                 return email.base64mime
    377             else:
    378                 return email.quoprimime
    379         else:
    380             return None
    381 
    382     def body_encode(self, string):
    383         """Body-encode a string by converting it first to bytes.
    384 
    385         The type of encoding (base64 or quoted-printable) will be based on
    386         self.body_encoding.  If body_encoding is None, we assume the
    387         output charset is a 7bit encoding, so re-encoding the decoded
    388         string using the ascii codec produces the correct string version
    389         of the content.
    390         """
    391         if not string:
    392             return string
    393         if self.body_encoding is BASE64:
    394             if isinstance(string, str):
    395                 string = string.encode(self.output_charset)
    396             return email.base64mime.body_encode(string)
    397         elif self.body_encoding is QP:
    398             # quopromime.body_encode takes a string, but operates on it as if
    399             # it were a list of byte codes.  For a (minimal) history on why
    400             # this is so, see changeset 0cf700464177.  To correctly encode a
    401             # character set, then, we must turn it into pseudo bytes via the
    402             # latin1 charset, which will encode any byte as a single code point
    403             # between 0 and 255, which is what body_encode is expecting.
    404             if isinstance(string, str):
    405                 string = string.encode(self.output_charset)
    406             string = string.decode('latin1')
    407             return email.quoprimime.body_encode(string)
    408         else:
    409             if isinstance(string, str):
    410                 string = string.encode(self.output_charset).decode('ascii')
    411             return string
    412