# Copyright (C) 2004-2006 Python Software Foundation
# Authors: Baxter, Wouters and Warsaw
# Contact: email-sig@python.org

"""FeedParser - An email feed parser.

The feed parser implements an interface for incrementally parsing an email
message, line by line.  This has advantages for certain applications, such as
those reading email messages off a socket.

FeedParser.feed() is the primary interface for pushing new data into the
parser.  It returns when there's nothing more it can do with the available
data.  When you have no more data to push into the parser, call .close().
This completes the parsing and returns the root message object.

The other advantage of this parser is that it will never raise a parsing
exception.  Instead, when it finds something unexpected, it adds a 'defect' to
the current message.  Defects are just instances that live on the message
object's .defects attribute.  (How a defect is recorded is delegated to the
policy's handle_defect() method.)
"""

__all__ = ['FeedParser', 'BytesFeedParser']

import re

from email import errors
from email._policybase import compat32
from collections import deque
from io import StringIO

NLCRE = re.compile(r'\r\n|\r|\n')
NLCRE_bol = re.compile(r'(\r\n|\r|\n)')
NLCRE_eol = re.compile(r'(\r\n|\r|\n)\Z')
NLCRE_crack = re.compile(r'(\r\n|\r|\n)')
# RFC 2822 $3.6.8 Optional fields.  ftext is %d33-57 / %d59-126, Any character
# except controls, SP, and ":".
headerRE = re.compile(r'^(From |[\041-\071\073-\176]*:|[\t ])')
EMPTYSTRING = ''
NL = '\n'

# Sentinel returned/yielded when the buffered input is exhausted but the
# stream has not been closed yet.
NeedMoreData = object()



class BufferedSubFile(object):
    """A file-ish object that can have new data loaded into it.

    You can also push and pop line-matching predicates onto a stack.  When the
    current predicate matches the current line, a false EOF response
    (i.e. empty string) is returned instead.  This lets the parser adhere to a
    simple abstraction -- it parses until EOF closes the current message.
    """
    def __init__(self):
        # Text stream of the last partial line pushed into this object.
        # See issue 22233 for why this is a text stream and not a list.
        self._partial = StringIO(newline='')
        # A deque of full, pushed lines
        self._lines = deque()
        # The stack of false-EOF checking predicates.
        self._eofstack = []
        # A flag indicating whether the file has been closed or not.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Push a predicate; a line matching it is reported as a false EOF."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the most recently pushed false-EOF predicate."""
        return self._eofstack.pop()

    def close(self):
        """Flush any buffered partial line and mark the stream as closed."""
        # Don't forget any trailing partial line.
        self._partial.seek(0)
        self.pushlines(self._partial.readlines())
        self._partial.seek(0)
        self._partial.truncate()
        self._closed = True

    def readline(self):
        """Return the next line, '' at (false) EOF, or NeedMoreData.

        NeedMoreData is returned when no complete line is buffered and the
        stream has not been closed.  When a pushed predicate matches the next
        line, the line is left in the buffer and '' is returned.
        """
        if not self._lines:
            if self._closed:
                return ''
            return NeedMoreData
        # Pop the line off the stack and see if it matches the current
        # false-EOF predicate.
        line = self._lines.popleft()
        # RFC 2046, section 5.1.2 requires us to recognize outer level
        # boundaries at any level of inner nesting.  Do this, but be sure it's
        # in the order of most to least nested.
        for ateof in reversed(self._eofstack):
            if ateof(line):
                # We're at the false EOF.  But push the last line back first.
                self._lines.appendleft(line)
                return ''
        return line

    def unreadline(self, line):
        """Push `line` back so it is the next line returned by readline()."""
        # Let the consumer push a line back into the buffer.
        assert line is not NeedMoreData
        self._lines.appendleft(line)

    def push(self, data):
        """Push some new data into this object."""
        self._partial.write(data)
        if '\n' not in data and '\r' not in data:
            # No new complete lines, wait for more.
            return

        # Crack into lines, preserving the linesep characters.
        self._partial.seek(0)
        parts = self._partial.readlines()
        self._partial.seek(0)
        self._partial.truncate()

        # If the last element of the list does not end in a newline, then treat
        # it as a partial line.  We only check for '\n' here because a line
        # ending with '\r' might be a line that was split in the middle of a
        # '\r\n' sequence (see bugs 1555570 and 1721862).
        if not parts[-1].endswith('\n'):
            self._partial.write(parts.pop())
        self.pushlines(parts)

    def pushlines(self, lines):
        """Append already-split complete lines to the line buffer."""
        self._lines.extend(lines)

    def __iter__(self):
        return self

    def __next__(self):
        # Iteration stops only at a real or false EOF (''); NeedMoreData is
        # passed through to the consumer as an ordinary item.
        line = self.readline()
        if line == '':
            raise StopIteration
        return line



class FeedParser:
    """A feed-style parser of email."""

    def __init__(self, _factory=None, *, policy=compat32):
        """_factory is called with no arguments to create a new message obj

        The policy keyword specifies a policy object that controls a number of
        aspects of the parser's operation.  The default policy maintains
        backward compatibility.

        """
        self.policy = policy
        self._old_style_factory = False
        if _factory is None:
            if policy.message_factory is None:
                from email.message import Message
                self._factory = Message
            else:
                self._factory = policy.message_factory
        else:
            self._factory = _factory
            # Probe whether the supplied factory accepts a policy keyword.
            try:
                _factory(policy=self.policy)
            except TypeError:
                # Assume this is an old-style factory
                self._old_style_factory = True
        self._input = BufferedSubFile()
        # Stack of messages currently being parsed (outermost first).
        self._msgstack = []
        self._parse = self._parsegen().__next__
        self._cur = None
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        """Advance the parse generator, ignoring its normal exhaustion."""
        try:
            self._parse()
        except StopIteration:
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        # Look for final set of defects
        if root.get_content_maintype() == 'multipart' \
               and not root.is_multipart():
            defect = errors.MultipartInvariantViolationDefect()
            self.policy.handle_defect(root, defect)
        return root

    def _new_message(self):
        """Create a message, attach it to its parent and make it current."""
        if self._old_style_factory:
            msg = self._factory()
        else:
            msg = self._factory(policy=self.policy)
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._last = msg

    def _pop_message(self):
        """Pop the innermost message and make its parent (if any) current."""
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        """Generator driving the parse of one message (recursing for subparts).

        Yields NeedMoreData whenever the buffered input runs out before the
        stream is closed; the caller resumes it after more data was pushed.
        """
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away.  Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    defect = errors.MissingHeaderBodySeparatorDefect()
                    self.policy.handle_defect(self._cur, defect)
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could raise errors.  All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line.  We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages.  A blank line separates the subparts.
            while True:
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF.  We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary.  That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                defect = errors.NoBoundaryInMultipartDefect()
                self.policy.handle_defect(self._cur, defect)
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Make sure a valid content type was specified per RFC 2045:6.4.
            if (self._cur.get('content-transfer-encoding', '8bit').lower()
                    not in ('7bit', '8bit', 'binary')):
                defect = errors.InvalidMultipartContentTransferEncodingDefect()
                self.policy.handle_defect(self._cur, defect)
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary.  Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            capturing_preamble = True
            preamble = []
            linesep = False
            close_boundary_seen = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart.  If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        close_boundary_seen = True
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary.  Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts.  Consume any
                    # multiple boundary lines that may be following.  Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last._payload
                        if isinstance(payload, str):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last._payload = payload
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary.  If we're still
            # capturing the preamble, we never saw the start boundary.  Note
            # that as a defect and store the captured text as the payload.
            if capturing_preamble:
                defect = errors.StartBoundaryNotFoundDefect()
                self.policy.handle_defect(self._cur, defect)
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                epilogue = []
                # Drain whatever input remains; the lines are not collected,
                # so the epilogue set below is always the empty string.
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                self._cur.epilogue = EMPTYSTRING.join(epilogue)
                return
            # If we're not processing the preamble, then we might have seen
            # EOF without seeing that end boundary...that is also a defect.
            if not close_boundary_seen:
                defect = errors.CloseBoundaryNotFoundDefect()
                self.policy.handle_defect(self._cur, defect)
                return
            # Everything from here to the EOF is epilogue.  If the end boundary
            # ended in a newline, we'll need to make sure the epilogue isn't
            # None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue.  Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        """Parse the collected header lines into the current message.

        `lines` is the list of raw header lines (each matching headerRE)
        gathered by _parsegen().  Every defect found here is reported through
        the policy's handle_defect() so that the policy decides whether it is
        recorded or raised.
        """
        # Passed a list of lines that make up the headers for the current msg
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation.  This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = errors.FirstHeaderLineIsContinuationDefect(line)
                    self.policy.handle_defect(self._cur, defect)
                    continue
                lastvalue.append(line)
                continue
            if lastheader:
                self._cur.set_raw(*self.policy.header_source_parse(lastvalue))
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line.  Note this as a defect
                    # and ignore it.  Route it through the policy like every
                    # other defect so raise_on_defect is honoured.
                    defect = errors.MisplacedEnvelopeHeaderDefect(line)
                    self.policy.handle_defect(self._cur, defect)
                    continue
            # Split the line on the colon separating field name from value.
            # There will always be a colon, because if there wasn't the part of
            # the parser that calls us would have started parsing the body.
            i = line.find(':')

            # If the colon is on the start of the line the header is clearly
            # malformed, but we might be able to salvage the rest of the
            # message.  Track the error but keep going.
            if i == 0:
                defect = errors.InvalidHeaderDefect("Missing header name.")
                self.policy.handle_defect(self._cur, defect)
                continue

            assert i>0, "_parse_headers fed line with no : and no leading WS"
            lastheader = line[:i]
            lastvalue = [line]
        # Done with all the lines, so handle the last header.
        if lastheader:
            self._cur.set_raw(*self.policy.header_source_parse(lastvalue))


class BytesFeedParser(FeedParser):
    """Like FeedParser, but feed accepts bytes."""

    def feed(self, data):
        # Undecodable bytes survive as surrogate escapes rather than raising.
        super().feed(data.decode('ascii', 'surrogateescape'))