Home | History | Annotate | Download | only in email
      1 # Copyright (C) 2001-2010 Python Software Foundation
      2 # Author: Barry Warsaw
# Contact: email-sig@python.org
      4 
      5 """Classes to generate plain text from a message object tree."""
      6 
# Names exported by ``from <this module> import *``: the three generator
# classes defined below.
__all__ = ['Generator', 'DecodedGenerator', 'BytesGenerator']
      8 
      9 import re
     10 import sys
     11 import time
     12 import random
     13 
     14 from copy import deepcopy
     15 from io import StringIO, BytesIO
     16 from email.utils import _has_surrogates
     17 
# Separator used by Generator._dispatch() to build '_handle_<main>_<sub>'
# method names from a MIME type.
UNDERSCORE = '_'
NL = '\n'  # XXX: no longer used by the code below.

# Matches any single line ending (CRLF, bare CR or bare LF); used by
# Generator._write_lines() to split text before re-joining it with the
# generator's configured line separator.
NLCRE = re.compile(r'\r\n|\r|\n')
# Matches 'From ' at the start of any line; used to escape such lines as
# '>From ' when From_ mangling is enabled.
fcre = re.compile(r'^From ', re.MULTILINE)
     23 
     24 
     25 
     27 class Generator:
     28     """Generates output from a Message object tree.
     29 
     30     This basic generator writes the message to the given file object as plain
     31     text.
     32     """
     33     #
     34     # Public interface
     35     #
     36 
     37     def __init__(self, outfp, mangle_from_=None, maxheaderlen=None, *,
     38                  policy=None):
     39         """Create the generator for message flattening.
     40 
     41         outfp is the output file-like object for writing the message to.  It
     42         must have a write() method.
     43 
     44         Optional mangle_from_ is a flag that, when True (the default if policy
     45         is not set), escapes From_ lines in the body of the message by putting
     46         a `>' in front of them.
     47 
     48         Optional maxheaderlen specifies the longest length for a non-continued
     49         header.  When a header line is longer (in characters, with tabs
     50         expanded to 8 spaces) than maxheaderlen, the header will split as
     51         defined in the Header class.  Set maxheaderlen to zero to disable
     52         header wrapping.  The default is 78, as recommended (but not required)
     53         by RFC 2822.
     54 
     55         The policy keyword specifies a policy object that controls a number of
     56         aspects of the generator's operation.  If no policy is specified,
     57         the policy associated with the Message object passed to the
     58         flatten method is used.
     59 
     60         """
     61 
     62         if mangle_from_ is None:
     63             mangle_from_ = True if policy is None else policy.mangle_from_
     64         self._fp = outfp
     65         self._mangle_from_ = mangle_from_
     66         self.maxheaderlen = maxheaderlen
     67         self.policy = policy
     68 
     69     def write(self, s):
     70         # Just delegate to the file object
     71         self._fp.write(s)
     72 
     73     def flatten(self, msg, unixfrom=False, linesep=None):
     74         r"""Print the message object tree rooted at msg to the output file
     75         specified when the Generator instance was created.
     76 
     77         unixfrom is a flag that forces the printing of a Unix From_ delimiter
     78         before the first object in the message tree.  If the original message
     79         has no From_ delimiter, a `standard' one is crafted.  By default, this
     80         is False to inhibit the printing of any From_ delimiter.
     81 
     82         Note that for subobjects, no From_ line is printed.
     83 
     84         linesep specifies the characters used to indicate a new line in
     85         the output.  The default value is determined by the policy specified
     86         when the Generator instance was created or, if none was specified,
     87         from the policy associated with the msg.
     88 
     89         """
     90         # We use the _XXX constants for operating on data that comes directly
     91         # from the msg, and _encoded_XXX constants for operating on data that
     92         # has already been converted (to bytes in the BytesGenerator) and
     93         # inserted into a temporary buffer.
     94         policy = msg.policy if self.policy is None else self.policy
     95         if linesep is not None:
     96             policy = policy.clone(linesep=linesep)
     97         if self.maxheaderlen is not None:
     98             policy = policy.clone(max_line_length=self.maxheaderlen)
     99         self._NL = policy.linesep
    100         self._encoded_NL = self._encode(self._NL)
    101         self._EMPTY = ''
    102         self._encoded_EMPTY = self._encode(self._EMPTY)
    103         # Because we use clone (below) when we recursively process message
    104         # subparts, and because clone uses the computed policy (not None),
    105         # submessages will automatically get set to the computed policy when
    106         # they are processed by this code.
    107         old_gen_policy = self.policy
    108         old_msg_policy = msg.policy
    109         try:
    110             self.policy = policy
    111             msg.policy = policy
    112             if unixfrom:
    113                 ufrom = msg.get_unixfrom()
    114                 if not ufrom:
    115                     ufrom = 'From nobody ' + time.ctime(time.time())
    116                 self.write(ufrom + self._NL)
    117             self._write(msg)
    118         finally:
    119             self.policy = old_gen_policy
    120             msg.policy = old_msg_policy
    121 
    122     def clone(self, fp):
    123         """Clone this generator with the exact same options."""
    124         return self.__class__(fp,
    125                               self._mangle_from_,
    126                               None, # Use policy setting, which we've adjusted
    127                               policy=self.policy)
    128 
    129     #
    130     # Protected interface - undocumented ;/
    131     #
    132 
    133     # Note that we use 'self.write' when what we are writing is coming from
    134     # the source, and self._fp.write when what we are writing is coming from a
    135     # buffer (because the Bytes subclass has already had a chance to transform
    136     # the data in its write method in that case).  This is an entirely
    137     # pragmatic split determined by experiment; we could be more general by
    138     # always using write and having the Bytes subclass write method detect when
    139     # it has already transformed the input; but, since this whole thing is a
    140     # hack anyway this seems good enough.
    141 
    142     def _new_buffer(self):
    143         # BytesGenerator overrides this to return BytesIO.
    144         return StringIO()
    145 
    146     def _encode(self, s):
    147         # BytesGenerator overrides this to encode strings to bytes.
    148         return s
    149 
    150     def _write_lines(self, lines):
    151         # We have to transform the line endings.
    152         if not lines:
    153             return
    154         lines = NLCRE.split(lines)
    155         for line in lines[:-1]:
    156             self.write(line)
    157             self.write(self._NL)
    158         if lines[-1]:
    159             self.write(lines[-1])
    160         # XXX logic tells me this else should be needed, but the tests fail
    161         # with it and pass without it.  (NLCRE.split ends with a blank element
    162         # if and only if there was a trailing newline.)
    163         #else:
    164         #    self.write(self._NL)
    165 
    166     def _write(self, msg):
    167         # We can't write the headers yet because of the following scenario:
    168         # say a multipart message includes the boundary string somewhere in
    169         # its body.  We'd have to calculate the new boundary /before/ we write
    170         # the headers so that we can write the correct Content-Type:
    171         # parameter.
    172         #
    173         # The way we do this, so as to make the _handle_*() methods simpler,
    174         # is to cache any subpart writes into a buffer.  The we write the
    175         # headers and the buffer contents.  That way, subpart handlers can
    176         # Do The Right Thing, and can still modify the Content-Type: header if
    177         # necessary.
    178         oldfp = self._fp
    179         try:
    180             self._munge_cte = None
    181             self._fp = sfp = self._new_buffer()
    182             self._dispatch(msg)
    183         finally:
    184             self._fp = oldfp
    185             munge_cte = self._munge_cte
    186             del self._munge_cte
    187         # If we munged the cte, copy the message again and re-fix the CTE.
    188         if munge_cte:
    189             msg = deepcopy(msg)
    190             msg.replace_header('content-transfer-encoding', munge_cte[0])
    191             msg.replace_header('content-type', munge_cte[1])
    192         # Write the headers.  First we see if the message object wants to
    193         # handle that itself.  If not, we'll do it generically.
    194         meth = getattr(msg, '_write_headers', None)
    195         if meth is None:
    196             self._write_headers(msg)
    197         else:
    198             meth(self)
    199         self._fp.write(sfp.getvalue())
    200 
    201     def _dispatch(self, msg):
    202         # Get the Content-Type: for the message, then try to dispatch to
    203         # self._handle_<maintype>_<subtype>().  If there's no handler for the
    204         # full MIME type, then dispatch to self._handle_<maintype>().  If
    205         # that's missing too, then dispatch to self._writeBody().
    206         main = msg.get_content_maintype()
    207         sub = msg.get_content_subtype()
    208         specific = UNDERSCORE.join((main, sub)).replace('-', '_')
    209         meth = getattr(self, '_handle_' + specific, None)
    210         if meth is None:
    211             generic = main.replace('-', '_')
    212             meth = getattr(self, '_handle_' + generic, None)
    213             if meth is None:
    214                 meth = self._writeBody
    215         meth(msg)
    216 
    217     #
    218     # Default handlers
    219     #
    220 
    221     def _write_headers(self, msg):
    222         for h, v in msg.raw_items():
    223             self.write(self.policy.fold(h, v))
    224         # A blank line always separates headers from body
    225         self.write(self._NL)
    226 
    227     #
    228     # Handlers for writing types and subtypes
    229     #
    230 
    231     def _handle_text(self, msg):
    232         payload = msg.get_payload()
    233         if payload is None:
    234             return
    235         if not isinstance(payload, str):
    236             raise TypeError('string payload expected: %s' % type(payload))
    237         if _has_surrogates(msg._payload):
    238             charset = msg.get_param('charset')
    239             if charset is not None:
    240                 # XXX: This copy stuff is an ugly hack to avoid modifying the
    241                 # existing message.
    242                 msg = deepcopy(msg)
    243                 del msg['content-transfer-encoding']
    244                 msg.set_payload(payload, charset)
    245                 payload = msg.get_payload()
    246                 self._munge_cte = (msg['content-transfer-encoding'],
    247                                    msg['content-type'])
    248         if self._mangle_from_:
    249             payload = fcre.sub('>From ', payload)
    250         self._write_lines(payload)
    251 
    252     # Default body handler
    253     _writeBody = _handle_text
    254 
    255     def _handle_multipart(self, msg):
    256         # The trick here is to write out each part separately, merge them all
    257         # together, and then make sure that the boundary we've chosen isn't
    258         # present in the payload.
    259         msgtexts = []
    260         subparts = msg.get_payload()
    261         if subparts is None:
    262             subparts = []
    263         elif isinstance(subparts, str):
    264             # e.g. a non-strict parse of a message with no starting boundary.
    265             self.write(subparts)
    266             return
    267         elif not isinstance(subparts, list):
    268             # Scalar payload
    269             subparts = [subparts]
    270         for part in subparts:
    271             s = self._new_buffer()
    272             g = self.clone(s)
    273             g.flatten(part, unixfrom=False, linesep=self._NL)
    274             msgtexts.append(s.getvalue())
    275         # BAW: What about boundaries that are wrapped in double-quotes?
    276         boundary = msg.get_boundary()
    277         if not boundary:
    278             # Create a boundary that doesn't appear in any of the
    279             # message texts.
    280             alltext = self._encoded_NL.join(msgtexts)
    281             boundary = self._make_boundary(alltext)
    282             msg.set_boundary(boundary)
    283         # If there's a preamble, write it out, with a trailing CRLF
    284         if msg.preamble is not None:
    285             if self._mangle_from_:
    286                 preamble = fcre.sub('>From ', msg.preamble)
    287             else:
    288                 preamble = msg.preamble
    289             self._write_lines(preamble)
    290             self.write(self._NL)
    291         # dash-boundary transport-padding CRLF
    292         self.write('--' + boundary + self._NL)
    293         # body-part
    294         if msgtexts:
    295             self._fp.write(msgtexts.pop(0))
    296         # *encapsulation
    297         # --> delimiter transport-padding
    298         # --> CRLF body-part
    299         for body_part in msgtexts:
    300             # delimiter transport-padding CRLF
    301             self.write(self._NL + '--' + boundary + self._NL)
    302             # body-part
    303             self._fp.write(body_part)
    304         # close-delimiter transport-padding
    305         self.write(self._NL + '--' + boundary + '--' + self._NL)
    306         if msg.epilogue is not None:
    307             if self._mangle_from_:
    308                 epilogue = fcre.sub('>From ', msg.epilogue)
    309             else:
    310                 epilogue = msg.epilogue
    311             self._write_lines(epilogue)
    312 
    313     def _handle_multipart_signed(self, msg):
    314         # The contents of signed parts has to stay unmodified in order to keep
    315         # the signature intact per RFC1847 2.1, so we disable header wrapping.
    316         # RDM: This isn't enough to completely preserve the part, but it helps.
    317         p = self.policy
    318         self.policy = p.clone(max_line_length=0)
    319         try:
    320             self._handle_multipart(msg)
    321         finally:
    322             self.policy = p
    323 
    324     def _handle_message_delivery_status(self, msg):
    325         # We can't just write the headers directly to self's file object
    326         # because this will leave an extra newline between the last header
    327         # block and the boundary.  Sigh.
    328         blocks = []
    329         for part in msg.get_payload():
    330             s = self._new_buffer()
    331             g = self.clone(s)
    332             g.flatten(part, unixfrom=False, linesep=self._NL)
    333             text = s.getvalue()
    334             lines = text.split(self._encoded_NL)
    335             # Strip off the unnecessary trailing empty line
    336             if lines and lines[-1] == self._encoded_EMPTY:
    337                 blocks.append(self._encoded_NL.join(lines[:-1]))
    338             else:
    339                 blocks.append(text)
    340         # Now join all the blocks with an empty line.  This has the lovely
    341         # effect of separating each block with an empty line, but not adding
    342         # an extra one after the last one.
    343         self._fp.write(self._encoded_NL.join(blocks))
    344 
    345     def _handle_message(self, msg):
    346         s = self._new_buffer()
    347         g = self.clone(s)
    348         # The payload of a message/rfc822 part should be a multipart sequence
    349         # of length 1.  The zeroth element of the list should be the Message
    350         # object for the subpart.  Extract that object, stringify it, and
    351         # write it out.
    352         # Except, it turns out, when it's a string instead, which happens when
    353         # and only when HeaderParser is used on a message of mime type
    354         # message/rfc822.  Such messages are generated by, for example,
    355         # Groupwise when forwarding unadorned messages.  (Issue 7970.)  So
    356         # in that case we just emit the string body.
    357         payload = msg._payload
    358         if isinstance(payload, list):
    359             g.flatten(msg.get_payload(0), unixfrom=False, linesep=self._NL)
    360             payload = s.getvalue()
    361         else:
    362             payload = self._encode(payload)
    363         self._fp.write(payload)
    364 
    365     # This used to be a module level function; we use a classmethod for this
    366     # and _compile_re so we can continue to provide the module level function
    367     # for backward compatibility by doing
    368     #   _make_boundary = Generator._make_boundary
    369     # at the end of the module.  It *is* internal, so we could drop that...
    370     @classmethod
    371     def _make_boundary(cls, text=None):
    372         # Craft a random boundary.  If text is given, ensure that the chosen
    373         # boundary doesn't appear in the text.
    374         token = random.randrange(sys.maxsize)
    375         boundary = ('=' * 15) + (_fmt % token) + '=='
    376         if text is None:
    377             return boundary
    378         b = boundary
    379         counter = 0
    380         while True:
    381             cre = cls._compile_re('^--' + re.escape(b) + '(--)?$', re.MULTILINE)
    382             if not cre.search(text):
    383                 break
    384             b = boundary + '.' + str(counter)
    385             counter += 1
    386         return b
    387 
    388     @classmethod
    389     def _compile_re(cls, s, flags):
    390         return re.compile(s, flags)
    391 
    392 
    394 class BytesGenerator(Generator):
    395     """Generates a bytes version of a Message object tree.
    396 
    397     Functionally identical to the base Generator except that the output is
    398     bytes and not string.  When surrogates were used in the input to encode
    399     bytes, these are decoded back to bytes for output.  If the policy has
    400     cte_type set to 7bit, then the message is transformed such that the
    401     non-ASCII bytes are properly content transfer encoded, using the charset
    402     unknown-8bit.
    403 
    404     The outfp object must accept bytes in its write method.
    405     """
    406 
    407     def write(self, s):
    408         self._fp.write(s.encode('ascii', 'surrogateescape'))
    409 
    410     def _new_buffer(self):
    411         return BytesIO()
    412 
    413     def _encode(self, s):
    414         return s.encode('ascii')
    415 
    416     def _write_headers(self, msg):
    417         # This is almost the same as the string version, except for handling
    418         # strings with 8bit bytes.
    419         for h, v in msg.raw_items():
    420             self._fp.write(self.policy.fold_binary(h, v))
    421         # A blank line always separates headers from body
    422         self.write(self._NL)
    423 
    424     def _handle_text(self, msg):
    425         # If the string has surrogates the original source was bytes, so
    426         # just write it back out.
    427         if msg._payload is None:
    428             return
    429         if _has_surrogates(msg._payload) and not self.policy.cte_type=='7bit':
    430             if self._mangle_from_:
    431                 msg._payload = fcre.sub(">From ", msg._payload)
    432             self._write_lines(msg._payload)
    433         else:
    434             super(BytesGenerator,self)._handle_text(msg)
    435 
    436     # Default body handler
    437     _writeBody = _handle_text
    438 
    439     @classmethod
    440     def _compile_re(cls, s, flags):
    441         return re.compile(s.encode('ascii'), flags)
    442 
    443 
    444 
# Default %-style template used by DecodedGenerator for non-text parts when
# no fmt argument is supplied; it is expanded with a mapping that provides
# the 'type' and 'filename' keys (among others).
_FMT = '[Non-text (%(type)s) part of message omitted, filename %(filename)s]'
    447 
    448 class DecodedGenerator(Generator):
    449     """Generates a text representation of a message.
    450 
    451     Like the Generator base class, except that non-text parts are substituted
    452     with a format string representing the part.
    453     """
    454     def __init__(self, outfp, mangle_from_=None, maxheaderlen=None, fmt=None, *,
    455                  policy=None):
    456         """Like Generator.__init__() except that an additional optional
    457         argument is allowed.
    458 
    459         Walks through all subparts of a message.  If the subpart is of main
    460         type `text', then it prints the decoded payload of the subpart.
    461 
    462         Otherwise, fmt is a format string that is used instead of the message
    463         payload.  fmt is expanded with the following keywords (in
    464         %(keyword)s format):
    465 
    466         type       : Full MIME type of the non-text part
    467         maintype   : Main MIME type of the non-text part
    468         subtype    : Sub-MIME type of the non-text part
    469         filename   : Filename of the non-text part
    470         description: Description associated with the non-text part
    471         encoding   : Content transfer encoding of the non-text part
    472 
    473         The default value for fmt is None, meaning
    474 
    475         [Non-text (%(type)s) part of message omitted, filename %(filename)s]
    476         """
    477         Generator.__init__(self, outfp, mangle_from_, maxheaderlen,
    478                            policy=policy)
    479         if fmt is None:
    480             self._fmt = _FMT
    481         else:
    482             self._fmt = fmt
    483 
    484     def _dispatch(self, msg):
    485         for part in msg.walk():
    486             maintype = part.get_content_maintype()
    487             if maintype == 'text':
    488                 print(part.get_payload(decode=False), file=self)
    489             elif maintype == 'multipart':
    490                 # Just skip this
    491                 pass
    492             else:
    493                 print(self._fmt % {
    494                     'type'       : part.get_content_type(),
    495                     'maintype'   : part.get_content_maintype(),
    496                     'subtype'    : part.get_content_subtype(),
    497                     'filename'   : part.get_filename('[no filename]'),
    498                     'description': part.get('Content-Description',
    499                                             '[no description]'),
    500                     'encoding'   : part.get('Content-Transfer-Encoding',
    501                                             '[no encoding]'),
    502                     }, file=self)
    503 
    504 
    505 
# Helper used by Generator._make_boundary
# _width is the number of characters in the decimal repr of sys.maxsize-1;
# _fmt is the matching zero-padded integer format ('%0<width>d'), so that
# the random token embedded in a boundary always has the same length.
_width = len(repr(sys.maxsize-1))
_fmt = '%%0%dd' % _width

# Backward compatibility
# Keeps the former module-level function name available as an alias of the
# classmethod.
_make_boundary = Generator._make_boundary
    513