Home | History | Annotate | Download | only in email
      1 # Copyright (C) 2001-2006 Python Software Foundation
      2 # Author: Ben Gertzfield
# Contact: email-sig@python.org
      4 
      5 """Quoted-printable content transfer encoding per RFCs 2045-2047.
      6 
      7 This module handles the content transfer encoding method defined in RFC 2045
      8 to encode US ASCII-like 8-bit data called `quoted-printable'.  It is used to
      9 safely encode text that is in a character set similar to the 7-bit US ASCII
     10 character set, but that includes some 8-bit characters that are normally not
     11 allowed in email bodies or headers.
     12 
     13 Quoted-printable is very space-inefficient for encoding binary files; use the
     14 email.base64mime module for that instead.
     15 
     16 This module provides an interface to encode and decode both headers and bodies
     17 with quoted-printable encoding.
     18 
RFC 2047 defines a method for including character set information in an
`encoded-word' in a header.  This method is commonly used for 8-bit real names
in To:/From:/Cc: etc. fields, as well as Subject: lines.
     22 
     23 This module does not do the line wrapping or end-of-line character
     24 conversion necessary for proper internationalized headers; it only
     25 does dumb encoding and decoding.  To deal with the various line
     26 wrapping issues, use the email.header module.
     27 """
     28 
# Public names of this module; this list is what a star-import binds.  The
# private helpers (_max_append, _unquote_match) are deliberately left out.
__all__ = [
    'body_decode',
    'body_encode',
    'body_quopri_check',
    'body_quopri_len',
    'decode',
    'decodestring',
    'encode',
    'encodestring',
    'header_decode',
    'header_encode',
    'header_quopri_check',
    'header_quopri_len',
    'quote',
    'unquote',
    ]
     45 
     46 import re
     47 
     48 from string import hexdigits
     49 from email.utils import fix_eols
     50 
# Line terminators: the carriage-return/line-feed pair and the bare line feed.
CRLF = '\r\n'
NL = '\n'

# See also Charset.py
# Number of fixed characters header_encode() wraps around the charset name and
# the encoded text of each chunk: '=?' + '?q?' + '?=' is 7 characters.
MISC_LEN = 7

# Matches one character that must be escaped in a header: anything other than
# '-', ASCII letters and digits, '!', '*', '+', '/' and the space character.
hqre = re.compile(r'[^-a-zA-Z0-9!*+/ ]')
# Matches one character that must be escaped in a body: anything other than
# space, tab and the ranges '!'..'<' and '>'..'~' (so '=' itself is matched).
bqre = re.compile(r'[^ !-<>-~\t]')
     59 
     60 
     61 
     63 # Helpers
     64 def header_quopri_check(c):
     65     """Return True if the character should be escaped with header quopri."""
     66     return bool(hqre.match(c))
     67 
     68 
     69 def body_quopri_check(c):
     70     """Return True if the character should be escaped with body quopri."""
     71     return bool(bqre.match(c))
     72 
     73 
     74 def header_quopri_len(s):
     75     """Return the length of str when it is encoded with header quopri."""
     76     count = 0
     77     for c in s:
     78         if hqre.match(c):
     79             count += 3
     80         else:
     81             count += 1
     82     return count
     83 
     84 
     85 def body_quopri_len(str):
     86     """Return the length of str when it is encoded with body quopri."""
     87     count = 0
     88     for c in str:
     89         if bqre.match(c):
     90             count += 3
     91         else:
     92             count += 1
     93     return count
     94 
     95 
     96 def _max_append(L, s, maxlen, extra=''):
     97     if not L:
     98         L.append(s.lstrip())
     99     elif len(L[-1]) + len(s) <= maxlen:
    100         L[-1] += extra + s
    101     else:
    102         L.append(s.lstrip())
    103 
    104 
    105 def unquote(s):
    106     """Turn a string in the form =AB to the ASCII character with value 0xab"""
    107     return chr(int(s[1:3], 16))
    108 
    109 
    110 def quote(c):
    111     return "=%02X" % ord(c)
    112 
    113 
    114 
    116 def header_encode(header, charset="iso-8859-1", keep_eols=False,
    117                   maxlinelen=76, eol=NL):
    118     """Encode a single header line with quoted-printable (like) encoding.
    119 
    120     Defined in RFC 2045, this `Q' encoding is similar to quoted-printable, but
    121     used specifically for email header fields to allow charsets with mostly 7
    122     bit characters (and some 8 bit) to remain more or less readable in non-RFC
    123     2045 aware mail clients.
    124 
    125     charset names the character set to use to encode the header.  It defaults
    126     to iso-8859-1.
    127 
    128     The resulting string will be in the form:
    129 
    130     "=?charset?q?I_f=E2rt_in_your_g=E8n=E8ral_dire=E7tion?\\n
    131       =?charset?q?Silly_=C8nglish_Kn=EEghts?="
    132 
    133     with each line wrapped safely at, at most, maxlinelen characters (defaults
    134     to 76 characters).  If maxlinelen is None, the entire string is encoded in
    135     one chunk with no splitting.
    136 
    137     End-of-line characters (\\r, \\n, \\r\\n) will be automatically converted
    138     to the canonical email line separator \\r\\n unless the keep_eols
    139     parameter is True (the default is False).
    140 
    141     Each line of the header will be terminated in the value of eol, which
    142     defaults to "\\n".  Set this to "\\r\\n" if you are using the result of
    143     this function directly in email.
    144     """
    145     # Return empty headers unchanged
    146     if not header:
    147         return header
    148 
    149     if not keep_eols:
    150         header = fix_eols(header)
    151 
    152     # Quopri encode each line, in encoded chunks no greater than maxlinelen in
    153     # length, after the RFC chrome is added in.
    154     quoted = []
    155     if maxlinelen is None:
    156         # An obnoxiously large number that's good enough
    157         max_encoded = 100000
    158     else:
    159         max_encoded = maxlinelen - len(charset) - MISC_LEN - 1
    160 
    161     for c in header:
    162         # Space may be represented as _ instead of =20 for readability
    163         if c == ' ':
    164             _max_append(quoted, '_', max_encoded)
    165         # These characters can be included verbatim
    166         elif not hqre.match(c):
    167             _max_append(quoted, c, max_encoded)
    168         # Otherwise, replace with hex value like =E2
    169         else:
    170             _max_append(quoted, "=%02X" % ord(c), max_encoded)
    171 
    172     # Now add the RFC chrome to each encoded chunk and glue the chunks
    173     # together.  BAW: should we be able to specify the leading whitespace in
    174     # the joiner?
    175     joiner = eol + ' '
    176     return joiner.join(['=?%s?q?%s?=' % (charset, line) for line in quoted])
    177 
    178 
    179 
    181 def encode(body, binary=False, maxlinelen=76, eol=NL):
    182     """Encode with quoted-printable, wrapping at maxlinelen characters.
    183 
    184     If binary is False (the default), end-of-line characters will be converted
    185     to the canonical email end-of-line sequence \\r\\n.  Otherwise they will
    186     be left verbatim.
    187 
    188     Each line of encoded text will end with eol, which defaults to "\\n".  Set
    189     this to "\\r\\n" if you will be using the result of this function directly
    190     in an email.
    191 
    192     Each line will be wrapped at, at most, maxlinelen characters (defaults to
    193     76 characters).  Long lines will have the `soft linefeed' quoted-printable
    194     character "=" appended to them, so the decoded text will be identical to
    195     the original text.
    196     """
    197     if not body:
    198         return body
    199 
    200     if not binary:
    201         body = fix_eols(body)
    202 
    203     # BAW: We're accumulating the body text by string concatenation.  That
    204     # can't be very efficient, but I don't have time now to rewrite it.  It
    205     # just feels like this algorithm could be more efficient.
    206     encoded_body = ''
    207     lineno = -1
    208     # Preserve line endings here so we can check later to see an eol needs to
    209     # be added to the output later.
    210     lines = body.splitlines(1)
    211     for line in lines:
    212         # But strip off line-endings for processing this line.
    213         if line.endswith(CRLF):
    214             line = line[:-2]
    215         elif line[-1] in CRLF:
    216             line = line[:-1]
    217 
    218         lineno += 1
    219         encoded_line = ''
    220         prev = None
    221         linelen = len(line)
    222         # Now we need to examine every character to see if it needs to be
    223         # quopri encoded.  BAW: again, string concatenation is inefficient.
    224         for j in range(linelen):
    225             c = line[j]
    226             prev = c
    227             if bqre.match(c):
    228                 c = quote(c)
    229             elif j+1 == linelen:
    230                 # Check for whitespace at end of line; special case
    231                 if c not in ' \t':
    232                     encoded_line += c
    233                 prev = c
    234                 continue
    235             # Check to see to see if the line has reached its maximum length
    236             if len(encoded_line) + len(c) >= maxlinelen:
    237                 encoded_body += encoded_line + '=' + eol
    238                 encoded_line = ''
    239             encoded_line += c
    240         # Now at end of line..
    241         if prev and prev in ' \t':
    242             # Special case for whitespace at end of file
    243             if lineno + 1 == len(lines):
    244                 prev = quote(prev)
    245                 if len(encoded_line) + len(prev) > maxlinelen:
    246                     encoded_body += encoded_line + '=' + eol + prev
    247                 else:
    248                     encoded_body += encoded_line + prev
    249             # Just normal whitespace at end of line
    250             else:
    251                 encoded_body += encoded_line + prev + '=' + eol
    252             encoded_line = ''
    253         # Now look at the line we just finished and it has a line ending, we
    254         # need to add eol to the end of the line.
    255         if lines[lineno].endswith(CRLF) or lines[lineno][-1] in CRLF:
    256             encoded_body += encoded_line + eol
    257         else:
    258             encoded_body += encoded_line
    259         encoded_line = ''
    260     return encoded_body
    261 
    262 
# For convenience and backwards compatibility w/ standard base64 module:
# body_encode and encodestring are further names for encode() itself.
body_encode = encode
encodestring = encode
    266 
    267 
    268 
    270 # BAW: I'm not sure if the intent was for the signature of this function to be
    271 # the same as base64MIME.decode() or not...
    272 def decode(encoded, eol=NL):
    273     """Decode a quoted-printable string.
    274 
    275     Lines are separated with eol, which defaults to \\n.
    276     """
    277     if not encoded:
    278         return encoded
    279     # BAW: see comment in encode() above.  Again, we're building up the
    280     # decoded string with string concatenation, which could be done much more
    281     # efficiently.
    282     decoded = ''
    283 
    284     for line in encoded.splitlines():
    285         line = line.rstrip()
    286         if not line:
    287             decoded += eol
    288             continue
    289 
    290         i = 0
    291         n = len(line)
    292         while i < n:
    293             c = line[i]
    294             if c != '=':
    295                 decoded += c
    296                 i += 1
    297             # Otherwise, c == "=".  Are we at the end of the line?  If so, add
    298             # a soft line break.
    299             elif i+1 == n:
    300                 i += 1
    301                 continue
    302             # Decode if in form =AB
    303             elif i+2 < n and line[i+1] in hexdigits and line[i+2] in hexdigits:
    304                 decoded += unquote(line[i:i+3])
    305                 i += 3
    306             # Otherwise, not in form =AB, pass literally
    307             else:
    308                 decoded += c
    309                 i += 1
    310 
    311             if i == n:
    312                 decoded += eol
    313     # Special case if original string did not end with eol
    314     if not encoded.endswith(eol) and decoded.endswith(eol):
    315         decoded = decoded[:-1]
    316     return decoded
    317 
    318 
# For convenience and backwards compatibility w/ standard base64 module:
# body_decode and decodestring are further names for decode() itself.
body_decode = decode
decodestring = decode
    322 
    323 
    324 
    326 def _unquote_match(match):
    327     """Turn a match in the form =AB to the ASCII character with value 0xab"""
    328     s = match.group(0)
    329     return unquote(s)
    330 
    331 
    332 # Header decoding is done a bit differently
    333 def header_decode(s):
    334     """Decode a string encoded with RFC 2045 MIME header `Q' encoding.
    335 
    336     This function does not parse a full MIME header value encoded with
    337     quoted-printable (like =?iso-8895-1?q?Hello_World?=) -- please use
    338     the high level email.header class for that functionality.
    339     """
    340     s = s.replace('_', ' ')
    341     return re.sub(r'=[a-fA-F0-9]{2}', _unquote_match, s)
    342