Home | History | Annotate | Download | only in email
      1 # Copyright (C) 2004-2006 Python Software Foundation
      2 # Authors: Baxter, Wouters and Warsaw
      3 # Contact: email-sig@python.org
      4 
      5 """FeedParser - An email feed parser.
      6 
      7 The feed parser implements an interface for incrementally parsing an email
      8 message, line by line.  This has advantages for certain applications, such as
      9 those reading email messages off a socket.
     10 
     11 FeedParser.feed() is the primary interface for pushing new data into the
     12 parser.  It returns when there's nothing more it can do with the available
     13 data.  When you have no more data to push into the parser, call .close().
     14 This completes the parsing and returns the root message object.
     15 
     16 The other advantage of this parser is that it will never raise a parsing
     17 exception.  Instead, when it finds something unexpected, it adds a 'defect' to
     18 the current message.  Defects are just instances that live on the message
     19 object's .defects attribute.
     20 """
     21 
# Only the parser class is exported by ``from <module> import *``; the regex
# constants and BufferedSubFile defined below are left out of the star-import.
__all__ = ['FeedParser']
     23 
     24 import re
     25 
     26 from email import errors
     27 from email import message
     28 
     29 NLCRE = re.compile('\r\n|\r|\n')
     30 NLCRE_bol = re.compile('(\r\n|\r|\n)')
     31 NLCRE_eol = re.compile('(\r\n|\r|\n)\Z')
     32 NLCRE_crack = re.compile('(\r\n|\r|\n)')
     33 # RFC 2822 $3.6.8 Optional fields.  ftext is %d33-57 / %d59-126, Any character
     34 # except controls, SP, and ":".
     35 headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:|[\t ])')
     36 EMPTYSTRING = ''
     37 NL = '\n'
     38 
     39 NeedMoreData = object()
     40 
     41 
     42 
     44 class BufferedSubFile(object):
     45     """A file-ish object that can have new data loaded into it.
     46 
     47     You can also push and pop line-matching predicates onto a stack.  When the
     48     current predicate matches the current line, a false EOF response
     49     (i.e. empty string) is returned instead.  This lets the parser adhere to a
     50     simple abstraction -- it parses until EOF closes the current message.
     51     """
     52     def __init__(self):
     53         # The last partial line pushed into this object.
     54         self._partial = ''
     55         # The list of full, pushed lines, in reverse order
     56         self._lines = []
     57         # The stack of false-EOF checking predicates.
     58         self._eofstack = []
     59         # A flag indicating whether the file has been closed or not.
     60         self._closed = False
     61 
     62     def push_eof_matcher(self, pred):
     63         self._eofstack.append(pred)
     64 
     65     def pop_eof_matcher(self):
     66         return self._eofstack.pop()
     67 
     68     def close(self):
     69         # Don't forget any trailing partial line.
     70         self._lines.append(self._partial)
     71         self._partial = ''
     72         self._closed = True
     73 
     74     def readline(self):
     75         if not self._lines:
     76             if self._closed:
     77                 return ''
     78             return NeedMoreData
     79         # Pop the line off the stack and see if it matches the current
     80         # false-EOF predicate.
     81         line = self._lines.pop()
     82         # RFC 2046, section 5.1.2 requires us to recognize outer level
     83         # boundaries at any level of inner nesting.  Do this, but be sure it's
     84         # in the order of most to least nested.
     85         for ateof in self._eofstack[::-1]:
     86             if ateof(line):
     87                 # We're at the false EOF.  But push the last line back first.
     88                 self._lines.append(line)
     89                 return ''
     90         return line
     91 
     92     def unreadline(self, line):
     93         # Let the consumer push a line back into the buffer.
     94         assert line is not NeedMoreData
     95         self._lines.append(line)
     96 
     97     def push(self, data):
     98         """Push some new data into this object."""
     99         # Handle any previous leftovers
    100         data, self._partial = self._partial + data, ''
    101         # Crack into lines, but preserve the newlines on the end of each
    102         parts = NLCRE_crack.split(data)
    103         # The *ahem* interesting behaviour of re.split when supplied grouping
    104         # parentheses is that the last element of the resulting list is the
    105         # data after the final RE.  In the case of a NL/CR terminated string,
    106         # this is the empty string.
    107         self._partial = parts.pop()
    108         #GAN 29Mar09  bugs 1555570, 1721862  Confusion at 8K boundary ending with \r:
    109         # is there a \n to follow later?
    110         if not self._partial and parts and parts[-1].endswith('\r'):
    111             self._partial = parts.pop(-2)+parts.pop()
    112         # parts is a list of strings, alternating between the line contents
    113         # and the eol character(s).  Gather up a list of lines after
    114         # re-attaching the newlines.
    115         lines = []
    116         for i in range(len(parts) // 2):
    117             lines.append(parts[i*2] + parts[i*2+1])
    118         self.pushlines(lines)
    119 
    120     def pushlines(self, lines):
    121         # Reverse and insert at the front of the lines.
    122         self._lines[:0] = lines[::-1]
    123 
    124     def is_closed(self):
    125         return self._closed
    126 
    127     def __iter__(self):
    128         return self
    129 
    130     def next(self):
    131         line = self.readline()
    132         if line == '':
    133             raise StopIteration
    134         return line
    135 
    136 
    137 
class FeedParser:
    """A feed-style parser of email.

    Data is pushed in with feed(); close() finishes parsing and returns the
    root message object.  Problems found while parsing are recorded as
    defects on the affected message.
    """

    def __init__(self, _factory=message.Message):
        """_factory is called with no arguments to create a new message obj"""
        self._factory = _factory
        # Line buffer that feed() pushes into and the parsing generator reads.
        self._input = BufferedSubFile()
        # Stack of messages under construction; the last entry is the
        # innermost one.
        self._msgstack = []
        # Bound .next of the parsing generator.  Each call resumes parsing
        # until the generator yields (it needs more data) or finishes.
        self._parse = self._parsegen().next
        # The message currently being parsed (maintained by _new_message and
        # _pop_message).
        self._cur = None
        # The message whose trailing newline may have to be trimmed when a
        # boundary follows it (set by _new_message and by _parsegen).
        self._last = None
        # When true, everything after the headers is stored as one payload.
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        """Resume the parsing generator, ignoring its normal completion."""
        try:
            self._parse()
        except StopIteration:
            # The generator has finished; there is nothing left to resume.
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        # Every nested message must have been popped by the parser already.
        assert not self._msgstack
        # Look for final set of defects: the root declares a multipart main
        # type but is_multipart() reports false.
        if root.get_content_maintype() == 'multipart' \
               and not root.is_multipart():
            root.defects.append(errors.MultipartInvariantViolationDefect())
        return root

    def _new_message(self):
        """Create a message, attach it to its parent and make it current."""
        msg = self._factory()
        # Parts created while the current message is a multipart/digest
        # default to message/rfc822.
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._last = msg

    def _pop_message(self):
        """Pop and return the innermost message; its parent becomes current."""
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        """Generator that parses one message and, recursively, its subparts.

        It yields NeedMoreData whenever the input buffer has no complete line
        available, and finishes once the message it created has been fully
        parsed.  The message is left on the message stack for the caller to
        pop.
        """
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away. Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could raise errors.  All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line.  We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages.  A blank line separates the subparts.
            while True:
                # A line starting with a line ending ends the nested block.
                self._input.push_eof_matcher(NLCRE.match)
                # Run the nested parser, passing its requests for more data
                # up to our own caller.
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                msg = self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF.  We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary.  That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                self._cur.defects.append(errors.NoBoundaryInMultipartDefect())
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary.  Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            capturing_preamble = True
            preamble = []
            # Becomes the line ending of the closing boundary, if it had one.
            linesep = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart.  If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary.  Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        # Put the boundary back so the next pass through the
                        # loop handles it as a part separator.
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts.  Consume any
                    # multiple boundary lines that may be following.  Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last.get_payload()
                        if isinstance(payload, basestring):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last.set_payload(payload)
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary.  If we're still
            # capturing the preamble, we never saw the start boundary.  Note
            # that as a defect and store the captured text as the payload.
            # Everything from here to the EOF is epilogue.
            if capturing_preamble:
                self._cur.defects.append(errors.StartBoundaryNotFoundDefect())
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                epilogue = []
                # NOTE(review): this loop reads the remaining lines but never
                # appends them, so the epilogue set below is always ''.
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                self._cur.epilogue = EMPTYSTRING.join(epilogue)
                return
            # If the end boundary ended in a newline, we'll need to make sure
            # the epilogue isn't None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue.  Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        """Set the current message's headers from the raw header lines.

        lines is the list of lines collected by _parsegen.  Continuation
        lines (leading space or tab) are folded into the preceding header, a
        leading 'From ' line is stored as the unix-from, and lines that cannot
        be interpreted are recorded as defects on the current message.
        """
        # Passed a list of lines that make up the headers for the current msg
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation.  This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = errors.FirstHeaderLineIsContinuationDefect(line)
                    self._cur.defects.append(defect)
                    continue
                lastvalue.append(line)
                continue
            # A new (non-continuation) line completes the pending header.
            if lastheader:
                # XXX reconsider the joining of folded lines
                lhdr = EMPTYSTRING.join(lastvalue)[:-1].rstrip('\r\n')
                self._cur[lastheader] = lhdr
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line.  Note this as a defect
                    # and ignore it.
                    defect = errors.MisplacedEnvelopeHeaderDefect(line)
                    self._cur.defects.append(defect)
                    continue
            # Split the line on the colon separating field name from value.
            i = line.find(':')
            if i < 0:
                defect = errors.MalformedHeaderDefect(line)
                self._cur.defects.append(defect)
                continue
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Done with all the lines, so handle the last header.
        if lastheader:
            # XXX reconsider the joining of folded lines
            self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip('\r\n')
    487