# Copyright (C) 2004-2006 Python Software Foundation
# Authors: Baxter, Wouters and Warsaw
# Contact: email-sig@python.org

"""FeedParser - An email feed parser.

The feed parser implements an interface for incrementally parsing an email
message, line by line.  This has advantages for certain applications, such as
those reading email messages off a socket.

FeedParser.feed() is the primary interface for pushing new data into the
parser.  It returns when there's nothing more it can do with the available
data.  When you have no more data to push into the parser, call .close().
This completes the parsing and returns the root message object.

The other advantage of this parser is that it will never raise a parsing
exception.  Instead, when it finds something unexpected, it adds a 'defect' to
the current message.  Defects are just instances that live on the message
object's .defects attribute.
"""

__all__ = ['FeedParser']

import re

from email import errors
from email import message

try:
    # Python 2: both str and unicode payloads derive from basestring.
    _string_types = basestring
except NameError:
    # Python 3: there is only str.
    _string_types = str

NLCRE = re.compile('\r\n|\r|\n')
NLCRE_bol = re.compile('(\r\n|\r|\n)')
# Raw string: '\Z' is a regex anchor, not a valid string escape.
NLCRE_eol = re.compile(r'(\r\n|\r|\n)\Z')
# No longer used in this module; kept so the module-level name stays available.
NLCRE_crack = re.compile('(\r\n|\r|\n)')
# RFC 2822 $3.6.8 Optional fields.  ftext is %d33-57 / %d59-126, Any character
# except controls, SP, and ":".
headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:|[\t ])')
EMPTYSTRING = ''
NL = '\n'

# Sentinel returned by BufferedSubFile.readline() (and yielded by the parse
# generator) when no complete line is available yet and the input is not
# closed.
NeedMoreData = object()


class BufferedSubFile(object):
    """A file-ish object that can have new data loaded into it.

    You can also push and pop line-matching predicates onto a stack.  When the
    current predicate matches the current line, a false EOF response
    (i.e. empty string) is returned instead.  This lets the parser adhere to a
    simple abstraction -- it parses until EOF closes the current message.
    """

    def __init__(self):
        # Chunks of the last partial line pushed into this object.
        self._partial = []
        # The list of full, pushed lines, in reverse order
        self._lines = []
        # The stack of false-EOF checking predicates.
        self._eofstack = []
        # A flag indicating whether the file has been closed or not.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Push a predicate; a line for which it is true reads as EOF."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the most recently pushed EOF predicate."""
        return self._eofstack.pop()

    def close(self):
        """Flush any trailing partial line and mark the input as closed."""
        # Don't forget any trailing partial line.
        self.pushlines(''.join(self._partial).splitlines(True))
        self._partial = []
        self._closed = True

    def readline(self):
        """Return the next line, '' at (false) EOF, or NeedMoreData.

        NeedMoreData is returned when no line is buffered and the object has
        not been closed.  '' is returned when the object is closed and empty,
        or when the next line matches any predicate on the EOF stack; in the
        latter case the line is left in the buffer.
        """
        if not self._lines:
            if self._closed:
                return ''
            return NeedMoreData
        # Pop the line off the stack and see if it matches the current
        # false-EOF predicate.
        line = self._lines.pop()
        # RFC 2046, section 5.1.2 requires us to recognize outer level
        # boundaries at any level of inner nesting.  Do this, but be sure it's
        # in the order of most to least nested.
        for ateof in reversed(self._eofstack):
            if ateof(line):
                # We're at the false EOF.  But push the last line back first.
                self._lines.append(line)
                return ''
        return line

    def unreadline(self, line):
        """Push `line` back so that it is the next line read."""
        # Let the consumer push a line back into the buffer.
        assert line is not NeedMoreData
        self._lines.append(line)

    def push(self, data):
        """Push some new data into this object."""
        # Crack into lines, but preserve the linesep characters on the end of
        # each
        parts = data.splitlines(True)

        if not parts or not parts[0].endswith(('\n', '\r')):
            # No new complete lines, so just accumulate partials
            self._partial += parts
            return

        if self._partial:
            # If there are previous leftovers, complete them now
            self._partial.append(parts[0])
            parts[0:1] = ''.join(self._partial).splitlines(True)
            del self._partial[:]

        # If the last element of the list does not end in a newline, then treat
        # it as a partial line.  We only check for '\n' here because a line
        # ending with '\r' might be a line that was split in the middle of a
        # '\r\n' sequence (see bugs 1555570 and 1721862).
        if not parts[-1].endswith('\n'):
            self._partial = [parts.pop()]
        self.pushlines(parts)

    def pushlines(self, lines):
        """Queue complete `lines` so they are read after any buffered ones."""
        # self._lines is kept in reverse order (readline pops from the end),
        # so reverse the new lines and insert them at the front.
        self._lines[:0] = lines[::-1]

    def is_closed(self):
        """Return True once close() has been called."""
        return self._closed

    def __iter__(self):
        return self

    def next(self):
        """Return the next line; stop iterating at (false) EOF.

        NeedMoreData is passed through to the caller as an ordinary item.
        """
        line = self.readline()
        if line == '':
            raise StopIteration
        return line

    # Python 3 spelling of the iterator protocol.
    __next__ = next


class FeedParser:
    """A feed-style parser of email."""

    def __init__(self, _factory=message.Message):
        """_factory is called with no arguments to create a new message obj"""
        self._factory = _factory
        self._input = BufferedSubFile()
        self._msgstack = []
        parsegen = self._parsegen()
        # Generators expose ``next`` on Python 2 and ``__next__`` on Python 3.
        self._parse = getattr(parsegen, '__next__', None) or parsegen.next
        self._cur = None
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        """Advance the parse generator until it needs more data or ends."""
        try:
            self._parse()
        except StopIteration:
            # The generator has finished; there is nothing left to parse.
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        # Look for final set of defects
        if root.get_content_maintype() == 'multipart' \
               and not root.is_multipart():
            root.defects.append(errors.MultipartInvariantViolationDefect())
        return root

    def _new_message(self):
        """Create a message, attach it to its parent, and make it current."""
        msg = self._factory()
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._last = msg

    def _pop_message(self):
        """Pop and return the current message; its parent becomes current."""
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        """Generator that parses one message (recursively, for subparts).

        Yields NeedMoreData whenever the input buffer runs dry before the
        input has been closed; the caller resumes it after feeding more data.
        """
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away.  Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could raise errors.  All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line.  We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages.  A blank line separates the subparts.
            while True:
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF.  We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary.  That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                self._cur.defects.append(errors.NoBoundaryInMultipartDefect())
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary.  Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            capturing_preamble = True
            preamble = []
            linesep = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart.  If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary.  Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts.  Consume any
                    # multiple boundary lines that may be following.  Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last.get_payload()
                        if isinstance(payload, _string_types):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last.set_payload(payload)
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary.  If we're still
            # capturing the preamble, we never saw the start boundary.  Note
            # that as a defect and store the captured text as the payload.
            # Everything from here to the EOF is epilogue.
            if capturing_preamble:
                self._cur.defects.append(errors.StartBoundaryNotFoundDefect())
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                epilogue = []
                # The remaining lines are drained but not collected, so the
                # epilogue stored below is always the empty string.
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                self._cur.epilogue = EMPTYSTRING.join(epilogue)
                return
            # If the end boundary ended in a newline, we'll need to make sure
            # the epilogue isn't None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue.  Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        """Store the header `lines` on the current message.

        Folded continuation lines are joined onto the preceding header.
        Malformed input is recorded on the message's defects list rather than
        raised.
        """
        # Passed a list of lines that make up the headers for the current msg
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation.  This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = errors.FirstHeaderLineIsContinuationDefect(line)
                    self._cur.defects.append(defect)
                    continue
                lastvalue.append(line)
                continue
            if lastheader:
                # XXX reconsider the joining of folded lines
                lhdr = EMPTYSTRING.join(lastvalue)[:-1].rstrip('\r\n')
                self._cur[lastheader] = lhdr
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line.  Note this as a defect
                    # and ignore it.
                    defect = errors.MisplacedEnvelopeHeaderDefect(line)
                    self._cur.defects.append(defect)
                    continue
            # Split the line on the colon separating field name from value.
            i = line.find(':')
            if i < 0:
                defect = errors.MalformedHeaderDefect(line)
                self._cur.defects.append(defect)
                continue
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Done with all the lines, so handle the last header.
        if lastheader:
            # XXX reconsider the joining of folded lines
            self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip('\r\n')