Home | History | Annotate | Download | only in email
      1 # Copyright (C) 2004-2006 Python Software Foundation
      2 # Authors: Baxter, Wouters and Warsaw
      3 # Contact: email-sig@python.org
      4 
      5 """FeedParser - An email feed parser.
      6 
      7 The feed parser implements an interface for incrementally parsing an email
      8 message, line by line.  This has advantages for certain applications, such as
      9 those reading email messages off a socket.
     10 
     11 FeedParser.feed() is the primary interface for pushing new data into the
     12 parser.  It returns when there's nothing more it can do with the available
     13 data.  When you have no more data to push into the parser, call .close().
     14 This completes the parsing and returns the root message object.
     15 
     16 The other advantage of this parser is that it will never raise a parsing
     17 exception.  Instead, when it finds something unexpected, it adds a 'defect' to
     18 the current message.  Defects are just instances that live on the message
     19 object's .defects attribute.
     20 """
     21 
     22 __all__ = ['FeedParser']
     23 
     24 import re
     25 
     26 from email import errors
     27 from email import message
     28 
     29 NLCRE = re.compile('\r\n|\r|\n')
     30 NLCRE_bol = re.compile('(\r\n|\r|\n)')
     31 NLCRE_eol = re.compile('(\r\n|\r|\n)\Z')
     32 NLCRE_crack = re.compile('(\r\n|\r|\n)')
     33 # RFC 2822 $3.6.8 Optional fields.  ftext is %d33-57 / %d59-126, Any character
     34 # except controls, SP, and ":".
     35 headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:|[\t ])')
     36 EMPTYSTRING = ''
     37 NL = '\n'
     38 
     39 NeedMoreData = object()
     40 
     41 
     42 
     44 class BufferedSubFile(object):
     45     """A file-ish object that can have new data loaded into it.
     46 
     47     You can also push and pop line-matching predicates onto a stack.  When the
     48     current predicate matches the current line, a false EOF response
     49     (i.e. empty string) is returned instead.  This lets the parser adhere to a
     50     simple abstraction -- it parses until EOF closes the current message.
     51     """
     52     def __init__(self):
     53         # Chunks of the last partial line pushed into this object.
     54         self._partial = []
     55         # The list of full, pushed lines, in reverse order
     56         self._lines = []
     57         # The stack of false-EOF checking predicates.
     58         self._eofstack = []
     59         # A flag indicating whether the file has been closed or not.
     60         self._closed = False
     61 
     62     def push_eof_matcher(self, pred):
     63         self._eofstack.append(pred)
     64 
     65     def pop_eof_matcher(self):
     66         return self._eofstack.pop()
     67 
     68     def close(self):
     69         # Don't forget any trailing partial line.
     70         self.pushlines(''.join(self._partial).splitlines(True))
     71         self._partial = []
     72         self._closed = True
     73 
     74     def readline(self):
     75         if not self._lines:
     76             if self._closed:
     77                 return ''
     78             return NeedMoreData
     79         # Pop the line off the stack and see if it matches the current
     80         # false-EOF predicate.
     81         line = self._lines.pop()
     82         # RFC 2046, section 5.1.2 requires us to recognize outer level
     83         # boundaries at any level of inner nesting.  Do this, but be sure it's
     84         # in the order of most to least nested.
     85         for ateof in self._eofstack[::-1]:
     86             if ateof(line):
     87                 # We're at the false EOF.  But push the last line back first.
     88                 self._lines.append(line)
     89                 return ''
     90         return line
     91 
     92     def unreadline(self, line):
     93         # Let the consumer push a line back into the buffer.
     94         assert line is not NeedMoreData
     95         self._lines.append(line)
     96 
     97     def push(self, data):
     98         """Push some new data into this object."""
     99         # Crack into lines, but preserve the linesep characters on the end of each
    100         parts = data.splitlines(True)
    101 
    102         if not parts or not parts[0].endswith(('\n', '\r')):
    103             # No new complete lines, so just accumulate partials
    104             self._partial += parts
    105             return
    106 
    107         if self._partial:
    108             # If there are previous leftovers, complete them now
    109             self._partial.append(parts[0])
    110             parts[0:1] = ''.join(self._partial).splitlines(True)
    111             del self._partial[:]
    112 
    113         # If the last element of the list does not end in a newline, then treat
    114         # it as a partial line.  We only check for '\n' here because a line
    115         # ending with '\r' might be a line that was split in the middle of a
    116         # '\r\n' sequence (see bugs 1555570 and 1721862).
    117         if not parts[-1].endswith('\n'):
    118             self._partial = [parts.pop()]
    119         self.pushlines(parts)
    120 
    121     def pushlines(self, lines):
    122         # Crack into lines, but preserve the newlines on the end of each
    123         parts = NLCRE_crack.split(data)
    124         # The *ahem* interesting behaviour of re.split when supplied grouping
    125         # parentheses is that the last element of the resulting list is the
    126         # data after the final RE.  In the case of a NL/CR terminated string,
    127         # this is the empty string.
    128         self._partial = parts.pop()
    129         #GAN 29Mar09  bugs 1555570, 1721862  Confusion at 8K boundary ending with \r:
    130         # is there a \n to follow later?
    131         if not self._partial and parts and parts[-1].endswith('\r'):
    132             self._partial = parts.pop(-2)+parts.pop()
    133         # parts is a list of strings, alternating between the line contents
    134         # and the eol character(s).  Gather up a list of lines after
    135         # re-attaching the newlines.
    136         lines = []
    137         for i in range(len(parts) // 2):
    138             lines.append(parts[i*2] + parts[i*2+1])
    139         self.pushlines(lines)
    140 
    141     def pushlines(self, lines):
    142         # Reverse and insert at the front of the lines.
    143         self._lines[:0] = lines[::-1]
    144 
    145     def is_closed(self):
    146         return self._closed
    147 
    148     def __iter__(self):
    149         return self
    150 
    151     def next(self):
    152         line = self.readline()
    153         if line == '':
    154             raise StopIteration
    155         return line
    156 
    157 
    158 
class FeedParser:
    """A feed-style parser of email."""

    def __init__(self, _factory=message.Message):
        """_factory is called with no arguments to create a new message obj"""
        self._factory = _factory
        # Line buffer that feed() fills and the parser generator drains.
        self._input = BufferedSubFile()
        # Stack of messages under construction; the root is at the bottom.
        self._msgstack = []
        # Bound next() of the parser generator.  Nothing in _parsegen() runs
        # until the first call, so _cur/_last can be initialised below.
        self._parse = self._parsegen().next
        # Message currently being parsed (top of _msgstack), or None.
        self._cur = None
        # Most recently created message; used for trailing-newline cleanup.
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        """Make the parser store everything after the headers as payload."""
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        """Advance the parser generator until it yields or finishes."""
        try:
            self._parse()
        except StopIteration:
            # The generator has finished parsing; nothing more to do.
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        # Look for final set of defects
        if root.get_content_maintype() == 'multipart' \
               and not root.is_multipart():
            root.defects.append(errors.MultipartInvariantViolationDefect())
        return root

    def _new_message(self):
        """Create a message, attach it to its parent and make it current."""
        msg = self._factory()
        # Parts of a multipart/digest default to message/rfc822.
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._last = msg

    def _pop_message(self):
        """Pop and return the current message; its parent becomes current."""
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        """Generator that parses one message, recursing for nested parts.

        Yields NeedMoreData whenever the input buffer cannot supply a complete
        line yet, and finishes once the message (as delimited by the input's
        real or false EOF) has been consumed.  The parsed message is left on
        top of self._msgstack for the caller to pop.
        """
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away. Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could raise errors.  All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line.  We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages.  A blank line separates the subparts.
            while True:
                # A blank line acts as a false EOF for the nested header block.
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                msg = self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF.  We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary.  That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                self._cur.defects.append(errors.NoBoundaryInMultipartDefect())
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary.  Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            capturing_preamble = True
            preamble = []
            # Set to the closing boundary's line ending (if any) once seen.
            linesep = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart.  If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary.  Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        # Re-read the boundary line on the next iteration, now
                        # that we are no longer in the preamble.
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts.  Consume any
                    # multiple boundary lines that may be following.  Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last.get_payload()
                        if isinstance(payload, basestring):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last.set_payload(payload)
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary.  If we're still
            # capturing the preamble, we never saw the start boundary.  Note
            # that as a defect and store the captured text as the payload.
            # Everything from here to the EOF is epilogue.
            if capturing_preamble:
                self._cur.defects.append(errors.StartBoundaryNotFoundDefect())
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                epilogue = []
                # NOTE(review): the remaining lines are consumed but never
                # appended, so the epilogue set below is always ''.
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                self._cur.epilogue = EMPTYSTRING.join(epilogue)
                return
            # If the end boundary ended in a newline, we'll need to make sure
            # the epilogue isn't None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue.  Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        """Store the header block `lines` as fields on the current message.

        Continuation lines are joined onto the preceding header.  Problems
        (a leading continuation line, a misplaced Unix-From line, a line with
        no colon) are recorded in the current message's defects list rather
        than raised.
        """
        # Passed a list of lines that make up the headers for the current msg
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation.  This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = errors.FirstHeaderLineIsContinuationDefect(line)
                    self._cur.defects.append(defect)
                    continue
                lastvalue.append(line)
                continue
            # A new (non-continuation) line: flush the header collected so far.
            if lastheader:
                # XXX reconsider the joining of folded lines
                lhdr = EMPTYSTRING.join(lastvalue)[:-1].rstrip('\r\n')
                self._cur[lastheader] = lhdr
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line.  Note this as a defect
                    # and ignore it.
                    defect = errors.MisplacedEnvelopeHeaderDefect(line)
                    self._cur.defects.append(defect)
                    continue
            # Split the line on the colon separating field name from value.
            i = line.find(':')
            if i < 0:
                defect = errors.MalformedHeaderDefect(line)
                self._cur.defects.append(defect)
                continue
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Done with all the lines, so handle the last header.
        if lastheader:
            # XXX reconsider the joining of folded lines
            self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip('\r\n')
    508