# Copyright (C) 2004-2006 Python Software Foundation
# Authors: Baxter, Wouters and Warsaw
# Contact: email-sig@python.org

"""FeedParser - An email feed parser.

The feed parser implements an interface for incrementally parsing an email
message, line by line.  This has advantages for certain applications, such as
those reading email messages off a socket.

FeedParser.feed() is the primary interface for pushing new data into the
parser.  It returns when there's nothing more it can do with the available
data.  When you have no more data to push into the parser, call .close().
This completes the parsing and returns the root message object.

The other advantage of this parser is that it will never raise a parsing
exception.  Instead, when it finds something unexpected, it adds a 'defect' to
the current message.  Defects are just instances that live on the message
object's .defects attribute.
"""

__all__ = ['FeedParser']

import re

from email import errors
from email import message

# Raw strings keep the regex escapes (notably \Z) out of the hands of the
# string-literal parser; the compiled patterns match exactly the same text.
NLCRE = re.compile(r'\r\n|\r|\n')
NLCRE_bol = re.compile(r'(\r\n|\r|\n)')
NLCRE_eol = re.compile(r'(\r\n|\r|\n)\Z')
NLCRE_crack = re.compile(r'(\r\n|\r|\n)')
# RFC 2822 section 3.6.8 Optional fields.  ftext is %d33-57 / %d59-126, Any
# character except controls, SP, and ":".
headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:|[\t ])')
EMPTYSTRING = ''
NL = '\n'

# Sentinel returned/yielded when the buffered input has run dry but the
# stream has not been closed yet.
NeedMoreData = object()

# `basestring` only exists on Python 2; fall back to `str` elsewhere so the
# payload type check in the multipart code works on either major version.
try:
    _string_types = basestring
except NameError:
    _string_types = str



class BufferedSubFile(object):
    """A file-ish object that can have new data loaded into it.

    You can also push and pop line-matching predicates onto a stack.  When the
    current predicate matches the current line, a false EOF response
    (i.e. empty string) is returned instead.  This lets the parser adhere to a
    simple abstraction -- it parses until EOF closes the current message.
    """
    def __init__(self):
        # The last partial line pushed into this object.
        self._partial = ''
        # The list of full, pushed lines, in reverse order (the next line to
        # be read is the last element).
        self._lines = []
        # The stack of false-EOF checking predicates.
        self._eofstack = []
        # A flag indicating whether the file has been closed or not.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Push a predicate; a line it matches is reported as a false EOF."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the most recently pushed false-EOF predicate."""
        return self._eofstack.pop()

    def close(self):
        """Mark the stream as closed, flushing any trailing partial line."""
        # Don't forget any trailing partial line.  It has to be queued
        # *behind* the lines that are still waiting to be read; appending it
        # to self._lines directly would make it the next line returned.  An
        # empty partial line is not queued at all, because an empty string
        # in the buffer would be read back as a premature EOF.
        if self._partial:
            self.pushlines([self._partial])
        self._partial = ''
        self._closed = True

    def readline(self):
        """Return the next line, '' at (false) EOF, or NeedMoreData.

        NeedMoreData is returned when no complete line is buffered and the
        stream has not been closed.
        """
        if not self._lines:
            if self._closed:
                return ''
            return NeedMoreData
        # Pop the line off the stack and see if it matches the current
        # false-EOF predicate.
        line = self._lines.pop()
        # RFC 2046, section 5.1.2 requires us to recognize outer level
        # boundaries at any level of inner nesting.  Do this, but be sure it's
        # in the order of most to least nested.
        for ateof in reversed(self._eofstack):
            if ateof(line):
                # We're at the false EOF.  But push the last line back first.
                self._lines.append(line)
                return ''
        return line

    def unreadline(self, line):
        """Push `line` back so that it is the next line returned."""
        # Let the consumer push a line back into the buffer.
        assert line is not NeedMoreData
        self._lines.append(line)

    def push(self, data):
        """Push some new data into this object."""
        # Handle any previous leftovers
        data, self._partial = self._partial + data, ''
        # Crack into lines, but preserve the newlines on the end of each
        parts = NLCRE_crack.split(data)
        # The *ahem* interesting behaviour of re.split when supplied grouping
        # parentheses is that the last element of the resulting list is the
        # data after the final RE.  In the case of a NL/CR terminated string,
        # this is the empty string.
        self._partial = parts.pop()
        # GAN 29Mar09 bugs 1555570, 1721862: confusion at 8K boundary ending
        # with \r: is there a \n to follow later?  Hold the whole last line
        # back as the partial line until more data (or close()) settles it.
        if not self._partial and parts and parts[-1].endswith('\r'):
            eol = parts.pop()
            self._partial = parts.pop() + eol
        # parts is a list of strings, alternating between the line contents
        # and the eol character(s).  Gather up a list of lines after
        # re-attaching the newlines.
        lines = [text + eol for text, eol in zip(parts[::2], parts[1::2])]
        self.pushlines(lines)

    def pushlines(self, lines):
        """Queue `lines` (in reading order) behind the lines already held."""
        # Reverse and insert at the front of the lines.
        self._lines[:0] = lines[::-1]

    def is_closed(self):
        """Return True once close() has been called."""
        return self._closed

    def __iter__(self):
        return self

    def next(self):
        """Return the next line; stop iterating at a (false) EOF.

        NeedMoreData may be produced as an item, so consumers have to check
        for it.
        """
        line = self.readline()
        if line == '':
            raise StopIteration
        return line

    # Python 3 spelling of the iterator protocol.
    __next__ = next



class FeedParser:
    """A feed-style parser of email."""

    def __init__(self, _factory=message.Message):
        """_factory is called with no arguments to create a new message obj"""
        self._factory = _factory
        self._input = BufferedSubFile()
        self._msgstack = []
        # Creating the generator does not run any of its code yet; the
        # builtin next() drives it identically on Python 2.6+ and Python 3.
        parsegen = self._parsegen()
        self._parse = lambda: next(parsegen)
        self._cur = None
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        """Advance the parser generator by one step.

        The generator stops at each NeedMoreData it yields; once it is
        exhausted the resulting StopIteration is swallowed.
        """
        try:
            self._parse()
        except StopIteration:
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        # Look for final set of defects
        if root.get_content_maintype() == 'multipart' \
               and not root.is_multipart():
            root.defects.append(errors.MultipartInvariantViolationDefect())
        return root

    def _new_message(self):
        """Create a message, attach it to its parent and make it current."""
        msg = self._factory()
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._last = msg

    def _pop_message(self):
        """Pop and return the current message; its parent becomes current."""
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        """Generator that parses one message (recursively, its subparts).

        It yields NeedMoreData whenever the input buffer runs dry and
        finishes once the (possibly false) EOF of the message is reached.
        """
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away.  Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could raise errors.  All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line.  We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages.  A blank line separates the subparts.
            while True:
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF.  We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary.  That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                self._cur.defects.append(errors.NoBoundaryInMultipartDefect())
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary.  Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            capturing_preamble = True
            preamble = []
            linesep = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart.  If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary.  Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts.  Consume any
                    # multiple boundary lines that may be following.  Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last.get_payload()
                        if isinstance(payload, _string_types):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last.set_payload(payload)
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary.  If we're still
            # capturing the preamble, we never saw the start boundary.  Note
            # that as a defect and store the captured text as the payload.
            # Everything from here to the EOF is epilogue.
            if capturing_preamble:
                self._cur.defects.append(errors.StartBoundaryNotFoundDefect())
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                epilogue = []
                # NOTE(review): the remaining lines are consumed here but
                # never added to `epilogue`, so the epilogue is always the
                # empty string in this branch.  Kept as-is to preserve
                # existing behaviour.
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                self._cur.epilogue = EMPTYSTRING.join(epilogue)
                return
            # If the end boundary ended in a newline, we'll need to make sure
            # the epilogue isn't None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue.  Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        """Store the header `lines` on the current message.

        `lines` is the list of raw header lines (folded continuation lines
        included) collected for the current message.  Malformed input is
        recorded on the message's defects list rather than raised.
        """
        # Passed a list of lines that make up the headers for the current msg
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation.  This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = errors.FirstHeaderLineIsContinuationDefect(line)
                    self._cur.defects.append(defect)
                    continue
                lastvalue.append(line)
                continue
            if lastheader:
                # XXX reconsider the joining of folded lines
                lhdr = EMPTYSTRING.join(lastvalue)[:-1].rstrip('\r\n')
                self._cur[lastheader] = lhdr
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line.  Note this as a defect
                    # and ignore it.
                    defect = errors.MisplacedEnvelopeHeaderDefect(line)
                    self._cur.defects.append(defect)
                    continue
            # Split the line on the colon separating field name from value.
            i = line.find(':')
            if i < 0:
                defect = errors.MalformedHeaderDefect(line)
                self._cur.defects.append(defect)
                continue
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Done with all the lines, so handle the last header.
        if lastheader:
            # XXX reconsider the joining of folded lines
            self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip('\r\n')