Home | History | Annotate | Download | only in email
      1 # Copyright (C) 2004-2006 Python Software Foundation
      2 # Authors: Baxter, Wouters and Warsaw
# Contact: email-sig@python.org
      4 
      5 """FeedParser - An email feed parser.
      6 
      7 The feed parser implements an interface for incrementally parsing an email
      8 message, line by line.  This has advantages for certain applications, such as
      9 those reading email messages off a socket.
     10 
     11 FeedParser.feed() is the primary interface for pushing new data into the
     12 parser.  It returns when there's nothing more it can do with the available
     13 data.  When you have no more data to push into the parser, call .close().
     14 This completes the parsing and returns the root message object.
     15 
     16 The other advantage of this parser is that it will never raise a parsing
     17 exception.  Instead, when it finds something unexpected, it adds a 'defect' to
     18 the current message.  Defects are just instances that live on the message
     19 object's .defects attribute.
     20 """
     21 
# Public names exported by this module.
__all__ = ['FeedParser', 'BytesFeedParser']

import re

from email import errors
from email._policybase import compat32
from collections import deque
from io import StringIO

# Line-ending patterns.  Each one accepts CRLF, a bare CR, or a bare LF.
# NLCRE is used with .match() to test whether a line begins with a line
# ending, i.e. whether it is a blank line.
NLCRE = re.compile(r'\r\n|\r|\n')
# Matched against the first line of a multipart epilogue so that a leading
# line ending can be stripped from it.
NLCRE_bol = re.compile(r'(\r\n|\r|\n)')
# Anchored with \Z, so it only finds a line ending at the very end of a
# string; used to strip one trailing line ending.
NLCRE_eol = re.compile(r'(\r\n|\r|\n)\Z')
# Same pattern as NLCRE_bol.
NLCRE_crack = re.compile(r'(\r\n|\r|\n)')
# RFC 2822 section 3.6.8 Optional fields.  ftext is %d33-57 / %d59-126, any
# character except controls, SP, and ":".
# The pattern accepts three kinds of header-section line: a unix-from line
# ("From "), a field name followed by a colon, or a line starting with a tab
# or space (a continuation).  The field name may be empty, so a line that
# starts with ":" also matches.
headerRE = re.compile(r'^(From |[\041-\071\073-\176]*:|[\t ])')
EMPTYSTRING = ''
NL = '\n'

# Sentinel returned by BufferedSubFile.readline() when no complete line is
# buffered and the buffer has not been closed.
NeedMoreData = object()
     42 
     43 
     44 
     46 class BufferedSubFile(object):
     47     """A file-ish object that can have new data loaded into it.
     48 
     49     You can also push and pop line-matching predicates onto a stack.  When the
     50     current predicate matches the current line, a false EOF response
     51     (i.e. empty string) is returned instead.  This lets the parser adhere to a
     52     simple abstraction -- it parses until EOF closes the current message.
     53     """
     54     def __init__(self):
     55         # Text stream of the last partial line pushed into this object.
     56         # See issue 22233 for why this is a text stream and not a list.
     57         self._partial = StringIO(newline='')
     58         # A deque of full, pushed lines
     59         self._lines = deque()
     60         # The stack of false-EOF checking predicates.
     61         self._eofstack = []
     62         # A flag indicating whether the file has been closed or not.
     63         self._closed = False
     64 
     65     def push_eof_matcher(self, pred):
     66         self._eofstack.append(pred)
     67 
     68     def pop_eof_matcher(self):
     69         return self._eofstack.pop()
     70 
     71     def close(self):
     72         # Don't forget any trailing partial line.
     73         self._partial.seek(0)
     74         self.pushlines(self._partial.readlines())
     75         self._partial.seek(0)
     76         self._partial.truncate()
     77         self._closed = True
     78 
     79     def readline(self):
     80         if not self._lines:
     81             if self._closed:
     82                 return ''
     83             return NeedMoreData
     84         # Pop the line off the stack and see if it matches the current
     85         # false-EOF predicate.
     86         line = self._lines.popleft()
     87         # RFC 2046, section 5.1.2 requires us to recognize outer level
     88         # boundaries at any level of inner nesting.  Do this, but be sure it's
     89         # in the order of most to least nested.
     90         for ateof in reversed(self._eofstack):
     91             if ateof(line):
     92                 # We're at the false EOF.  But push the last line back first.
     93                 self._lines.appendleft(line)
     94                 return ''
     95         return line
     96 
     97     def unreadline(self, line):
     98         # Let the consumer push a line back into the buffer.
     99         assert line is not NeedMoreData
    100         self._lines.appendleft(line)
    101 
    102     def push(self, data):
    103         """Push some new data into this object."""
    104         self._partial.write(data)
    105         if '\n' not in data and '\r' not in data:
    106             # No new complete lines, wait for more.
    107             return
    108 
    109         # Crack into lines, preserving the linesep characters.
    110         self._partial.seek(0)
    111         parts = self._partial.readlines()
    112         self._partial.seek(0)
    113         self._partial.truncate()
    114 
    115         # If the last element of the list does not end in a newline, then treat
    116         # it as a partial line.  We only check for '\n' here because a line
    117         # ending with '\r' might be a line that was split in the middle of a
    118         # '\r\n' sequence (see bugs 1555570 and 1721862).
    119         if not parts[-1].endswith('\n'):
    120             self._partial.write(parts.pop())
    121         self.pushlines(parts)
    122 
    123     def pushlines(self, lines):
    124         self._lines.extend(lines)
    125 
    126     def __iter__(self):
    127         return self
    128 
    129     def __next__(self):
    130         line = self.readline()
    131         if line == '':
    132             raise StopIteration
    133         return line
    134 
    135 
    136 
    138 class FeedParser:
    139     """A feed-style parser of email."""
    140 
    141     def __init__(self, _factory=None, *, policy=compat32):
    142         """_factory is called with no arguments to create a new message obj
    143 
    144         The policy keyword specifies a policy object that controls a number of
    145         aspects of the parser's operation.  The default policy maintains
    146         backward compatibility.
    147 
    148         """
    149         self.policy = policy
    150         self._old_style_factory = False
    151         if _factory is None:
    152             if policy.message_factory is None:
    153                 from email.message import Message
    154                 self._factory = Message
    155             else:
    156                 self._factory = policy.message_factory
    157         else:
    158             self._factory = _factory
    159             try:
    160                 _factory(policy=self.policy)
    161             except TypeError:
    162                 # Assume this is an old-style factory
    163                 self._old_style_factory = True
    164         self._input = BufferedSubFile()
    165         self._msgstack = []
    166         self._parse = self._parsegen().__next__
    167         self._cur = None
    168         self._last = None
    169         self._headersonly = False
    170 
    171     # Non-public interface for supporting Parser's headersonly flag
    172     def _set_headersonly(self):
    173         self._headersonly = True
    174 
    175     def feed(self, data):
    176         """Push more data into the parser."""
    177         self._input.push(data)
    178         self._call_parse()
    179 
    180     def _call_parse(self):
    181         try:
    182             self._parse()
    183         except StopIteration:
    184             pass
    185 
    186     def close(self):
    187         """Parse all remaining data and return the root message object."""
    188         self._input.close()
    189         self._call_parse()
    190         root = self._pop_message()
    191         assert not self._msgstack
    192         # Look for final set of defects
    193         if root.get_content_maintype() == 'multipart' \
    194                and not root.is_multipart():
    195             defect = errors.MultipartInvariantViolationDefect()
    196             self.policy.handle_defect(root, defect)
    197         return root
    198 
    199     def _new_message(self):
    200         if self._old_style_factory:
    201             msg = self._factory()
    202         else:
    203             msg = self._factory(policy=self.policy)
    204         if self._cur and self._cur.get_content_type() == 'multipart/digest':
    205             msg.set_default_type('message/rfc822')
    206         if self._msgstack:
    207             self._msgstack[-1].attach(msg)
    208         self._msgstack.append(msg)
    209         self._cur = msg
    210         self._last = msg
    211 
    212     def _pop_message(self):
    213         retval = self._msgstack.pop()
    214         if self._msgstack:
    215             self._cur = self._msgstack[-1]
    216         else:
    217             self._cur = None
    218         return retval
    219 
    220     def _parsegen(self):
    221         # Create a new message and start by parsing headers.
    222         self._new_message()
    223         headers = []
    224         # Collect the headers, searching for a line that doesn't match the RFC
    225         # 2822 header or continuation pattern (including an empty line).
    226         for line in self._input:
    227             if line is NeedMoreData:
    228                 yield NeedMoreData
    229                 continue
    230             if not headerRE.match(line):
    231                 # If we saw the RFC defined header/body separator
    232                 # (i.e. newline), just throw it away. Otherwise the line is
    233                 # part of the body so push it back.
    234                 if not NLCRE.match(line):
    235                     defect = errors.MissingHeaderBodySeparatorDefect()
    236                     self.policy.handle_defect(self._cur, defect)
    237                     self._input.unreadline(line)
    238                 break
    239             headers.append(line)
    240         # Done with the headers, so parse them and figure out what we're
    241         # supposed to see in the body of the message.
    242         self._parse_headers(headers)
    243         # Headers-only parsing is a backwards compatibility hack, which was
    244         # necessary in the older parser, which could raise errors.  All
    245         # remaining lines in the input are thrown into the message body.
    246         if self._headersonly:
    247             lines = []
    248             while True:
    249                 line = self._input.readline()
    250                 if line is NeedMoreData:
    251                     yield NeedMoreData
    252                     continue
    253                 if line == '':
    254                     break
    255                 lines.append(line)
    256             self._cur.set_payload(EMPTYSTRING.join(lines))
    257             return
    258         if self._cur.get_content_type() == 'message/delivery-status':
    259             # message/delivery-status contains blocks of headers separated by
    260             # a blank line.  We'll represent each header block as a separate
    261             # nested message object, but the processing is a bit different
    262             # than standard message/* types because there is no body for the
    263             # nested messages.  A blank line separates the subparts.
    264             while True:
    265                 self._input.push_eof_matcher(NLCRE.match)
    266                 for retval in self._parsegen():
    267                     if retval is NeedMoreData:
    268                         yield NeedMoreData
    269                         continue
    270                     break
    271                 msg = self._pop_message()
    272                 # We need to pop the EOF matcher in order to tell if we're at
    273                 # the end of the current file, not the end of the last block
    274                 # of message headers.
    275                 self._input.pop_eof_matcher()
    276                 # The input stream must be sitting at the newline or at the
    277                 # EOF.  We want to see if we're at the end of this subpart, so
    278                 # first consume the blank line, then test the next line to see
    279                 # if we're at this subpart's EOF.
    280                 while True:
    281                     line = self._input.readline()
    282                     if line is NeedMoreData:
    283                         yield NeedMoreData
    284                         continue
    285                     break
    286                 while True:
    287                     line = self._input.readline()
    288                     if line is NeedMoreData:
    289                         yield NeedMoreData
    290                         continue
    291                     break
    292                 if line == '':
    293                     break
    294                 # Not at EOF so this is a line we're going to need.
    295                 self._input.unreadline(line)
    296             return
    297         if self._cur.get_content_maintype() == 'message':
    298             # The message claims to be a message/* type, then what follows is
    299             # another RFC 2822 message.
    300             for retval in self._parsegen():
    301                 if retval is NeedMoreData:
    302                     yield NeedMoreData
    303                     continue
    304                 break
    305             self._pop_message()
    306             return
    307         if self._cur.get_content_maintype() == 'multipart':
    308             boundary = self._cur.get_boundary()
    309             if boundary is None:
    310                 # The message /claims/ to be a multipart but it has not
    311                 # defined a boundary.  That's a problem which we'll handle by
    312                 # reading everything until the EOF and marking the message as
    313                 # defective.
    314                 defect = errors.NoBoundaryInMultipartDefect()
    315                 self.policy.handle_defect(self._cur, defect)
    316                 lines = []
    317                 for line in self._input:
    318                     if line is NeedMoreData:
    319                         yield NeedMoreData
    320                         continue
    321                     lines.append(line)
    322                 self._cur.set_payload(EMPTYSTRING.join(lines))
    323                 return
    324             # Make sure a valid content type was specified per RFC 2045:6.4.
    325             if (self._cur.get('content-transfer-encoding', '8bit').lower()
    326                     not in ('7bit', '8bit', 'binary')):
    327                 defect = errors.InvalidMultipartContentTransferEncodingDefect()
    328                 self.policy.handle_defect(self._cur, defect)
    329             # Create a line match predicate which matches the inter-part
    330             # boundary as well as the end-of-multipart boundary.  Don't push
    331             # this onto the input stream until we've scanned past the
    332             # preamble.
    333             separator = '--' + boundary
    334             boundaryre = re.compile(
    335                 '(?P<sep>' + re.escape(separator) +
    336                 r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
    337             capturing_preamble = True
    338             preamble = []
    339             linesep = False
    340             close_boundary_seen = False
    341             while True:
    342                 line = self._input.readline()
    343                 if line is NeedMoreData:
    344                     yield NeedMoreData
    345                     continue
    346                 if line == '':
    347                     break
    348                 mo = boundaryre.match(line)
    349                 if mo:
    350                     # If we're looking at the end boundary, we're done with
    351                     # this multipart.  If there was a newline at the end of
    352                     # the closing boundary, then we need to initialize the
    353                     # epilogue with the empty string (see below).
    354                     if mo.group('end'):
    355                         close_boundary_seen = True
    356                         linesep = mo.group('linesep')
    357                         break
    358                     # We saw an inter-part boundary.  Were we in the preamble?
    359                     if capturing_preamble:
    360                         if preamble:
    361                             # According to RFC 2046, the last newline belongs
    362                             # to the boundary.
    363                             lastline = preamble[-1]
    364                             eolmo = NLCRE_eol.search(lastline)
    365                             if eolmo:
    366                                 preamble[-1] = lastline[:-len(eolmo.group(0))]
    367                             self._cur.preamble = EMPTYSTRING.join(preamble)
    368                         capturing_preamble = False
    369                         self._input.unreadline(line)
    370                         continue
    371                     # We saw a boundary separating two parts.  Consume any
    372                     # multiple boundary lines that may be following.  Our
    373                     # interpretation of RFC 2046 BNF grammar does not produce
    374                     # body parts within such double boundaries.
    375                     while True:
    376                         line = self._input.readline()
    377                         if line is NeedMoreData:
    378                             yield NeedMoreData
    379                             continue
    380                         mo = boundaryre.match(line)
    381                         if not mo:
    382                             self._input.unreadline(line)
    383                             break
    384                     # Recurse to parse this subpart; the input stream points
    385                     # at the subpart's first line.
    386                     self._input.push_eof_matcher(boundaryre.match)
    387                     for retval in self._parsegen():
    388                         if retval is NeedMoreData:
    389                             yield NeedMoreData
    390                             continue
    391                         break
    392                     # Because of RFC 2046, the newline preceding the boundary
    393                     # separator actually belongs to the boundary, not the
    394                     # previous subpart's payload (or epilogue if the previous
    395                     # part is a multipart).
    396                     if self._last.get_content_maintype() == 'multipart':
    397                         epilogue = self._last.epilogue
    398                         if epilogue == '':
    399                             self._last.epilogue = None
    400                         elif epilogue is not None:
    401                             mo = NLCRE_eol.search(epilogue)
    402                             if mo:
    403                                 end = len(mo.group(0))
    404                                 self._last.epilogue = epilogue[:-end]
    405                     else:
    406                         payload = self._last._payload
    407                         if isinstance(payload, str):
    408                             mo = NLCRE_eol.search(payload)
    409                             if mo:
    410                                 payload = payload[:-len(mo.group(0))]
    411                                 self._last._payload = payload
    412                     self._input.pop_eof_matcher()
    413                     self._pop_message()
    414                     # Set the multipart up for newline cleansing, which will
    415                     # happen if we're in a nested multipart.
    416                     self._last = self._cur
    417                 else:
    418                     # I think we must be in the preamble
    419                     assert capturing_preamble
    420                     preamble.append(line)
    421             # We've seen either the EOF or the end boundary.  If we're still
    422             # capturing the preamble, we never saw the start boundary.  Note
    423             # that as a defect and store the captured text as the payload.
    424             if capturing_preamble:
    425                 defect = errors.StartBoundaryNotFoundDefect()
    426                 self.policy.handle_defect(self._cur, defect)
    427                 self._cur.set_payload(EMPTYSTRING.join(preamble))
    428                 epilogue = []
    429                 for line in self._input:
    430                     if line is NeedMoreData:
    431                         yield NeedMoreData
    432                         continue
    433                 self._cur.epilogue = EMPTYSTRING.join(epilogue)
    434                 return
    435             # If we're not processing the preamble, then we might have seen
    436             # EOF without seeing that end boundary...that is also a defect.
    437             if not close_boundary_seen:
    438                 defect = errors.CloseBoundaryNotFoundDefect()
    439                 self.policy.handle_defect(self._cur, defect)
    440                 return
    441             # Everything from here to the EOF is epilogue.  If the end boundary
    442             # ended in a newline, we'll need to make sure the epilogue isn't
    443             # None
    444             if linesep:
    445                 epilogue = ['']
    446             else:
    447                 epilogue = []
    448             for line in self._input:
    449                 if line is NeedMoreData:
    450                     yield NeedMoreData
    451                     continue
    452                 epilogue.append(line)
    453             # Any CRLF at the front of the epilogue is not technically part of
    454             # the epilogue.  Also, watch out for an empty string epilogue,
    455             # which means a single newline.
    456             if epilogue:
    457                 firstline = epilogue[0]
    458                 bolmo = NLCRE_bol.match(firstline)
    459                 if bolmo:
    460                     epilogue[0] = firstline[len(bolmo.group(0)):]
    461             self._cur.epilogue = EMPTYSTRING.join(epilogue)
    462             return
    463         # Otherwise, it's some non-multipart type, so the entire rest of the
    464         # file contents becomes the payload.
    465         lines = []
    466         for line in self._input:
    467             if line is NeedMoreData:
    468                 yield NeedMoreData
    469                 continue
    470             lines.append(line)
    471         self._cur.set_payload(EMPTYSTRING.join(lines))
    472 
    473     def _parse_headers(self, lines):
    474         # Passed a list of lines that make up the headers for the current msg
    475         lastheader = ''
    476         lastvalue = []
    477         for lineno, line in enumerate(lines):
    478             # Check for continuation
    479             if line[0] in ' \t':
    480                 if not lastheader:
    481                     # The first line of the headers was a continuation.  This
    482                     # is illegal, so let's note the defect, store the illegal
    483                     # line, and ignore it for purposes of headers.
    484                     defect = errors.FirstHeaderLineIsContinuationDefect(line)
    485                     self.policy.handle_defect(self._cur, defect)
    486                     continue
    487                 lastvalue.append(line)
    488                 continue
    489             if lastheader:
    490                 self._cur.set_raw(*self.policy.header_source_parse(lastvalue))
    491                 lastheader, lastvalue = '', []
    492             # Check for envelope header, i.e. unix-from
    493             if line.startswith('From '):
    494                 if lineno == 0:
    495                     # Strip off the trailing newline
    496                     mo = NLCRE_eol.search(line)
    497                     if mo:
    498                         line = line[:-len(mo.group(0))]
    499                     self._cur.set_unixfrom(line)
    500                     continue
    501                 elif lineno == len(lines) - 1:
    502                     # Something looking like a unix-from at the end - it's
    503                     # probably the first line of the body, so push back the
    504                     # line and stop.
    505                     self._input.unreadline(line)
    506                     return
    507                 else:
    508                     # Weirdly placed unix-from line.  Note this as a defect
    509                     # and ignore it.
    510                     defect = errors.MisplacedEnvelopeHeaderDefect(line)
    511                     self._cur.defects.append(defect)
    512                     continue
    513             # Split the line on the colon separating field name from value.
    514             # There will always be a colon, because if there wasn't the part of
    515             # the parser that calls us would have started parsing the body.
    516             i = line.find(':')
    517 
    518             # If the colon is on the start of the line the header is clearly
    519             # malformed, but we might be able to salvage the rest of the
    520             # message. Track the error but keep going.
    521             if i == 0:
    522                 defect = errors.InvalidHeaderDefect("Missing header name.")
    523                 self._cur.defects.append(defect)
    524                 continue
    525 
    526             assert i>0, "_parse_headers fed line with no : and no leading WS"
    527             lastheader = line[:i]
    528             lastvalue = [line]
    529         # Done with all the lines, so handle the last header.
    530         if lastheader:
    531             self._cur.set_raw(*self.policy.header_source_parse(lastvalue))
    532 
    533 
    534 class BytesFeedParser(FeedParser):
    535     """Like FeedParser, but feed accepts bytes."""
    536 
    537     def feed(self, data):
    538         super().feed(data.decode('ascii', 'surrogateescape'))
    539