Home | History | Annotate | Download | only in email
      1 # Copyright (C) 2001-2006 Python Software Foundation
      2 # Author: Ben Gertzfield
      3 # Contact: email-sig@python.org
      4 
      5 """Quoted-printable content transfer encoding per RFCs 2045-2047.
      6 
      7 This module handles the content transfer encoding method defined in RFC 2045
      8 to encode US ASCII-like 8-bit data called `quoted-printable'.  It is used to
      9 safely encode text that is in a character set similar to the 7-bit US ASCII
     10 character set, but that includes some 8-bit characters that are normally not
     11 allowed in email bodies or headers.
     12 
     13 Quoted-printable is very space-inefficient for encoding binary files; use the
     14 email.base64mime module for that instead.
     15 
     16 This module provides an interface to encode and decode both headers and bodies
     17 with quoted-printable encoding.
     18 
     19 RFC 2047 defines a method for including character set information in an
     20 `encoded-word' in a header.  This method is commonly used for 8-bit real names
     21 in To:/From:/Cc: etc. fields, as well as Subject: lines.
     22 
     23 This module does not do the line wrapping or end-of-line character
     24 conversion necessary for proper internationalized headers; it only
     25 does dumb encoding and decoding.  To deal with the various line
     26 wrapping issues, use the email.header module.
     27 """
     28 
     29 __all__ = [
     30     'body_decode',
     31     'body_encode',
     32     'body_length',
     33     'decode',
     34     'decodestring',
     35     'header_decode',
     36     'header_encode',
     37     'header_length',
     38     'quote',
     39     'unquote',
     40     ]
     41 
     42 import re
     43 
     44 from string import ascii_letters, digits, hexdigits
     45 
     46 CRLF = '\r\n'
     47 NL = '\n'
     48 EMPTYSTRING = ''
     49 
     50 # Build a mapping of octets to the expansion of that octet.  Since we're only
     51 # going to have 256 of these things, this isn't terribly inefficient
     52 # space-wise.  Remember that headers and bodies have different sets of safe
     53 # characters.  Initialize both maps with the full expansion, and then override
     54 # the safe bytes with the more compact form.
     55 _QUOPRI_MAP = ['=%02X' % c for c in range(256)]
     56 _QUOPRI_HEADER_MAP = _QUOPRI_MAP[:]
     57 _QUOPRI_BODY_MAP = _QUOPRI_MAP[:]
     58 
     59 # Safe header bytes which need no encoding.
     60 for c in b'-!*+/' + ascii_letters.encode('ascii') + digits.encode('ascii'):
     61     _QUOPRI_HEADER_MAP[c] = chr(c)
     62 # Headers have one other special encoding; spaces become underscores.
     63 _QUOPRI_HEADER_MAP[ord(' ')] = '_'
     64 
     65 # Safe body bytes which need no encoding.
     66 for c in (b' !"#$%&\'()*+,-./0123456789:;<>'
     67           b'?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`'
     68           b'abcdefghijklmnopqrstuvwxyz{|}~\t'):
     69     _QUOPRI_BODY_MAP[c] = chr(c)
     70 
     71 
     72 
     73 # Helpers
     74 def header_check(octet):
     75     """Return True if the octet should be escaped with header quopri."""
     76     return chr(octet) != _QUOPRI_HEADER_MAP[octet]
     77 
     78 
     79 def body_check(octet):
     80     """Return True if the octet should be escaped with body quopri."""
     81     return chr(octet) != _QUOPRI_BODY_MAP[octet]
     82 
     83 
     84 def header_length(bytearray):
     85     """Return a header quoted-printable encoding length.
     86 
     87     Note that this does not include any RFC 2047 chrome added by
     88     `header_encode()`.
     89 
     90     :param bytearray: An array of bytes (a.k.a. octets).
     91     :return: The length in bytes of the byte array when it is encoded with
     92         quoted-printable for headers.
     93     """
     94     return sum(len(_QUOPRI_HEADER_MAP[octet]) for octet in bytearray)
     95 
     96 
     97 def body_length(bytearray):
     98     """Return a body quoted-printable encoding length.
     99 
    100     :param bytearray: An array of bytes (a.k.a. octets).
    101     :return: The length in bytes of the byte array when it is encoded with
    102         quoted-printable for bodies.
    103     """
    104     return sum(len(_QUOPRI_BODY_MAP[octet]) for octet in bytearray)
    105 
    106 
    107 def _max_append(L, s, maxlen, extra=''):
    108     if not isinstance(s, str):
    109         s = chr(s)
    110     if not L:
    111         L.append(s.lstrip())
    112     elif len(L[-1]) + len(s) <= maxlen:
    113         L[-1] += extra + s
    114     else:
    115         L.append(s.lstrip())
    116 
    117 
    118 def unquote(s):
    119     """Turn a string in the form =AB to the ASCII character with value 0xab"""
    120     return chr(int(s[1:3], 16))
    121 
    122 
    123 def quote(c):
    124     return _QUOPRI_MAP[ord(c)]
    125 
    126 
    127 def header_encode(header_bytes, charset='iso-8859-1'):
    128     """Encode a single header line with quoted-printable (like) encoding.
    129 
    130     Defined in RFC 2045, this `Q' encoding is similar to quoted-printable, but
    131     used specifically for email header fields to allow charsets with mostly 7
    132     bit characters (and some 8 bit) to remain more or less readable in non-RFC
    133     2045 aware mail clients.
    134 
    135     charset names the character set to use in the RFC 2046 header.  It
    136     defaults to iso-8859-1.
    137     """
    138     # Return empty headers as an empty string.
    139     if not header_bytes:
    140         return ''
    141     # Iterate over every byte, encoding if necessary.
    142     encoded = header_bytes.decode('latin1').translate(_QUOPRI_HEADER_MAP)
    143     # Now add the RFC chrome to each encoded chunk and glue the chunks
    144     # together.
    145     return '=?%s?q?%s?=' % (charset, encoded)
    146 
    147 
    148 _QUOPRI_BODY_ENCODE_MAP = _QUOPRI_BODY_MAP[:]
    149 for c in b'\r\n':
    150     _QUOPRI_BODY_ENCODE_MAP[c] = chr(c)
    151 
    152 def body_encode(body, maxlinelen=76, eol=NL):
    153     """Encode with quoted-printable, wrapping at maxlinelen characters.
    154 
    155     Each line of encoded text will end with eol, which defaults to "\\n".  Set
    156     this to "\\r\\n" if you will be using the result of this function directly
    157     in an email.
    158 
    159     Each line will be wrapped at, at most, maxlinelen characters before the
    160     eol string (maxlinelen defaults to 76 characters, the maximum value
    161     permitted by RFC 2045).  Long lines will have the 'soft line break'
    162     quoted-printable character "=" appended to them, so the decoded text will
    163     be identical to the original text.
    164 
    165     The minimum maxlinelen is 4 to have room for a quoted character ("=XX")
    166     followed by a soft line break.  Smaller values will generate a
    167     ValueError.
    168 
    169     """
    170 
    171     if maxlinelen < 4:
    172         raise ValueError("maxlinelen must be at least 4")
    173     if not body:
    174         return body
    175 
    176     # quote special characters
    177     body = body.translate(_QUOPRI_BODY_ENCODE_MAP)
    178 
    179     soft_break = '=' + eol
    180     # leave space for the '=' at the end of a line
    181     maxlinelen1 = maxlinelen - 1
    182 
    183     encoded_body = []
    184     append = encoded_body.append
    185 
    186     for line in body.splitlines():
    187         # break up the line into pieces no longer than maxlinelen - 1
    188         start = 0
    189         laststart = len(line) - 1 - maxlinelen
    190         while start <= laststart:
    191             stop = start + maxlinelen1
    192             # make sure we don't break up an escape sequence
    193             if line[stop - 2] == '=':
    194                 append(line[start:stop - 1])
    195                 start = stop - 2
    196             elif line[stop - 1] == '=':
    197                 append(line[start:stop])
    198                 start = stop - 1
    199             else:
    200                 append(line[start:stop] + '=')
    201                 start = stop
    202 
    203         # handle rest of line, special case if line ends in whitespace
    204         if line and line[-1] in ' \t':
    205             room = start - laststart
    206             if room >= 3:
    207                 # It's a whitespace character at end-of-line, and we have room
    208                 # for the three-character quoted encoding.
    209                 q = quote(line[-1])
    210             elif room == 2:
    211                 # There's room for the whitespace character and a soft break.
    212                 q = line[-1] + soft_break
    213             else:
    214                 # There's room only for a soft break.  The quoted whitespace
    215                 # will be the only content on the subsequent line.
    216                 q = soft_break + quote(line[-1])
    217             append(line[start:-1] + q)
    218         else:
    219             append(line[start:])
    220 
    221     # add back final newline if present
    222     if body[-1] in CRLF:
    223         append('')
    224 
    225     return eol.join(encoded_body)
    226 
    227 
    228 
    229 # BAW: I'm not sure if the intent was for the signature of this function to be
    230 # the same as base64MIME.decode() or not...
    231 def decode(encoded, eol=NL):
    232     """Decode a quoted-printable string.
    233 
    234     Lines are separated with eol, which defaults to \\n.
    235     """
    236     if not encoded:
    237         return encoded
    238     # BAW: see comment in encode() above.  Again, we're building up the
    239     # decoded string with string concatenation, which could be done much more
    240     # efficiently.
    241     decoded = ''
    242 
    243     for line in encoded.splitlines():
    244         line = line.rstrip()
    245         if not line:
    246             decoded += eol
    247             continue
    248 
    249         i = 0
    250         n = len(line)
    251         while i < n:
    252             c = line[i]
    253             if c != '=':
    254                 decoded += c
    255                 i += 1
    256             # Otherwise, c == "=".  Are we at the end of the line?  If so, add
    257             # a soft line break.
    258             elif i+1 == n:
    259                 i += 1
    260                 continue
    261             # Decode if in form =AB
    262             elif i+2 < n and line[i+1] in hexdigits and line[i+2] in hexdigits:
    263                 decoded += unquote(line[i:i+3])
    264                 i += 3
    265             # Otherwise, not in form =AB, pass literally
    266             else:
    267                 decoded += c
    268                 i += 1
    269 
    270             if i == n:
    271                 decoded += eol
    272     # Special case if original string did not end with eol
    273     if encoded[-1] not in '\r\n' and decoded.endswith(eol):
    274         decoded = decoded[:-1]
    275     return decoded
    276 
    277 
    278 # For convenience and backwards compatibility w/ standard base64 module
    279 body_decode = decode
    280 decodestring = decode
    281 
    282 
    283 
    284 def _unquote_match(match):
    285     """Turn a match in the form =AB to the ASCII character with value 0xab"""
    286     s = match.group(0)
    287     return unquote(s)
    288 
    289 
    290 # Header decoding is done a bit differently
    291 def header_decode(s):
    292     """Decode a string encoded with RFC 2045 MIME header `Q' encoding.
    293 
    294     This function does not parse a full MIME header value encoded with
    295     quoted-printable (like =?iso-8859-1?q?Hello_World?=) -- please use
    296     the high level email.header class for that functionality.
    297     """
    298     s = s.replace('_', ' ')
    299     return re.sub(r'=[a-fA-F0-9]{2}', _unquote_match, s, flags=re.ASCII)
    300