1 """Header value parser implementing various email-related RFC parsing rules. 2 3 The parsing methods defined in this module implement various email related 4 parsing rules. Principal among them is RFC 5322, which is the followon 5 to RFC 2822 and primarily a clarification of the former. It also implements 6 RFC 2047 encoded word decoding. 7 8 RFC 5322 goes to considerable trouble to maintain backward compatibility with 9 RFC 822 in the parse phase, while cleaning up the structure on the generation 10 phase. This parser supports correct RFC 5322 generation by tagging white space 11 as folding white space only when folding is allowed in the non-obsolete rule 12 sets. Actually, the parser is even more generous when accepting input than RFC 13 5322 mandates, following the spirit of Postel's Law, which RFC 5322 encourages. 14 Where possible deviations from the standard are annotated on the 'defects' 15 attribute of tokens that deviate. 16 17 The general structure of the parser follows RFC 5322, and uses its terminology 18 where there is a direct correspondence. Where the implementation requires a 19 somewhat different structure than that used by the formal grammar, new terms 20 that mimic the closest existing terms are used. Thus, it really helps to have 21 a copy of RFC 5322 handy when studying this code. 22 23 Input to the parser is a string that has already been unfolded according to 24 RFC 5322 rules. According to the RFC this unfolding is the very first step, and 25 this parser leaves the unfolding step to a higher level message parser, which 26 will have already detected the line breaks that need unfolding while 27 determining the beginning and end of each header. 28 29 The output of the parser is a TokenList object, which is a list subclass. A 30 TokenList is a recursive data structure. The terminal nodes of the structure 31 are Terminal objects, which are subclasses of str. 
These do not correspond 32 directly to terminal objects in the formal grammar, but are instead more 33 practical higher level combinations of true terminals. 34 35 All TokenList and Terminal objects have a 'value' attribute, which produces the 36 semantically meaningful value of that part of the parse subtree. The value of 37 all whitespace tokens (no matter how many sub-tokens they may contain) is a 38 single space, as per the RFC rules. This includes 'CFWS', which is herein 39 included in the general class of whitespace tokens. There is one exception to 40 the rule that whitespace tokens are collapsed into single spaces in values: in 41 the value of a 'bare-quoted-string' (a quoted-string with no leading or 42 trailing whitespace), any whitespace that appeared between the quotation marks 43 is preserved in the returned value. Note that in all Terminal strings quoted 44 pairs are turned into their unquoted values. 45 46 All TokenList and Terminal objects also have a string value, which attempts to 47 be a "canonical" representation of the RFC-compliant form of the substring that 48 produced the parsed subtree, including minimal use of quoted pair quoting. 49 Whitespace runs are not collapsed. 50 51 Comment tokens also have a 'content' attribute providing the string found 52 between the parens (including any nested comments) with whitespace preserved. 53 54 All TokenList and Terminal objects have a 'defects' attribute which is a 55 possibly empty list all of the defects found while creating the token. Defects 56 may appear on any token in the tree, and a composite list of all defects in the 57 subtree is available through the 'all_defects' attribute of any node. (For 58 Terminal notes x.defects == x.all_defects.) 59 60 Each object in a parse tree is called a 'token', and each has a 'token_type' 61 attribute that gives the name from the RFC 5322 grammar that it represents. 
62 Not all RFC 5322 nodes are produced, and there is one non-RFC 5322 node that 63 may be produced: 'ptext'. A 'ptext' is a string of printable ascii characters. 64 It is returned in place of lists of (ctext/quoted-pair) and 65 (qtext/quoted-pair). 66 67 XXX: provide complete list of token types. 68 """ 69 70 import re 71 import urllib # For urllib.parse.unquote 72 from string import hexdigits 73 from collections import OrderedDict 74 from operator import itemgetter 75 from email import _encoded_words as _ew 76 from email import errors 77 from email import utils 78 79 # 80 # Useful constants and functions 81 # 82 83 WSP = set(' \t') 84 CFWS_LEADER = WSP | set('(') 85 SPECIALS = set(r'()<>@,:;.\"[]') 86 ATOM_ENDS = SPECIALS | WSP 87 DOT_ATOM_ENDS = ATOM_ENDS - set('.') 88 # '.', '"', and '(' do not end phrases in order to support obs-phrase 89 PHRASE_ENDS = SPECIALS - set('."(') 90 TSPECIALS = (SPECIALS | set('/?=')) - set('.') 91 TOKEN_ENDS = TSPECIALS | WSP 92 ASPECIALS = TSPECIALS | set("*'%") 93 ATTRIBUTE_ENDS = ASPECIALS | WSP 94 EXTENDED_ATTRIBUTE_ENDS = ATTRIBUTE_ENDS - set('%') 95 96 def quote_string(value): 97 return '"'+str(value).replace('\\', '\\\\').replace('"', r'\"')+'"' 98 99 # 100 # Accumulator for header folding 101 # 102 103 class _Folded: 104 105 def __init__(self, maxlen, policy): 106 self.maxlen = maxlen 107 self.policy = policy 108 self.lastlen = 0 109 self.stickyspace = None 110 self.firstline = True 111 self.done = [] 112 self.current = [] 113 114 def newline(self): 115 self.done.extend(self.current) 116 self.done.append(self.policy.linesep) 117 self.current.clear() 118 self.lastlen = 0 119 120 def finalize(self): 121 if self.current: 122 self.newline() 123 124 def __str__(self): 125 return ''.join(self.done) 126 127 def append(self, stoken): 128 self.current.append(stoken) 129 130 def append_if_fits(self, token, stoken=None): 131 if stoken is None: 132 stoken = str(token) 133 l = len(stoken) 134 if self.stickyspace is not None: 135 
stickyspace_len = len(self.stickyspace) 136 if self.lastlen + stickyspace_len + l <= self.maxlen: 137 self.current.append(self.stickyspace) 138 self.lastlen += stickyspace_len 139 self.current.append(stoken) 140 self.lastlen += l 141 self.stickyspace = None 142 self.firstline = False 143 return True 144 if token.has_fws: 145 ws = token.pop_leading_fws() 146 if ws is not None: 147 self.stickyspace += str(ws) 148 stickyspace_len += len(ws) 149 token._fold(self) 150 return True 151 if stickyspace_len and l + 1 <= self.maxlen: 152 margin = self.maxlen - l 153 if 0 < margin < stickyspace_len: 154 trim = stickyspace_len - margin 155 self.current.append(self.stickyspace[:trim]) 156 self.stickyspace = self.stickyspace[trim:] 157 stickyspace_len = trim 158 self.newline() 159 self.current.append(self.stickyspace) 160 self.current.append(stoken) 161 self.lastlen = l + stickyspace_len 162 self.stickyspace = None 163 self.firstline = False 164 return True 165 if not self.firstline: 166 self.newline() 167 self.current.append(self.stickyspace) 168 self.current.append(stoken) 169 self.stickyspace = None 170 self.firstline = False 171 return True 172 if self.lastlen + l <= self.maxlen: 173 self.current.append(stoken) 174 self.lastlen += l 175 return True 176 if l < self.maxlen: 177 self.newline() 178 self.current.append(stoken) 179 self.lastlen = l 180 return True 181 return False 182 183 # 184 # TokenList and its subclasses 185 # 186 187 class TokenList(list): 188 189 token_type = None 190 191 def __init__(self, *args, **kw): 192 super().__init__(*args, **kw) 193 self.defects = [] 194 195 def __str__(self): 196 return ''.join(str(x) for x in self) 197 198 def __repr__(self): 199 return '{}({})'.format(self.__class__.__name__, 200 super().__repr__()) 201 202 @property 203 def value(self): 204 return ''.join(x.value for x in self if x.value) 205 206 @property 207 def all_defects(self): 208 return sum((x.all_defects for x in self), self.defects) 209 210 # 211 # Folding API 212 # 213 
# parts(): 214 # 215 # return a list of objects that constitute the "higher level syntactic 216 # objects" specified by the RFC as the best places to fold a header line. 217 # The returned objects must include leading folding white space, even if 218 # this means mutating the underlying parse tree of the object. Each object 219 # is only responsible for returning *its* parts, and should not drill down 220 # to any lower level except as required to meet the leading folding white 221 # space constraint. 222 # 223 # _fold(folded): 224 # 225 # folded: the result accumulator. This is an instance of _Folded. 226 # (XXX: I haven't finished factoring this out yet, the folding code 227 # pretty much uses this as a state object.) When the folded.current 228 # contains as much text as will fit, the _fold method should call 229 # folded.newline. 230 # folded.lastlen: the current length of the test stored in folded.current. 231 # folded.maxlen: The maximum number of characters that may appear on a 232 # folded line. Differs from the policy setting in that "no limit" is 233 # represented by +inf, which means it can be used in the trivially 234 # logical fashion in comparisons. 235 # 236 # Currently no subclasses implement parts, and I think this will remain 237 # true. A subclass only needs to implement _fold when the generic version 238 # isn't sufficient. _fold will need to be implemented primarily when it is 239 # possible for encoded words to appear in the specialized token-list, since 240 # there is no generic algorithm that can know where exactly the encoded 241 # words are allowed. A _fold implementation is responsible for filling 242 # lines in the same general way that the top level _fold does. It may, and 243 # should, call the _fold method of sub-objects in a similar fashion to that 244 # of the top level _fold. 245 # 246 # XXX: I'm hoping it will be possible to factor the existing code further 247 # to reduce redundancy and make the logic clearer. 
248 249 @property 250 def parts(self): 251 klass = self.__class__ 252 this = [] 253 for token in self: 254 if token.startswith_fws(): 255 if this: 256 yield this[0] if len(this)==1 else klass(this) 257 this.clear() 258 end_ws = token.pop_trailing_ws() 259 this.append(token) 260 if end_ws: 261 yield klass(this) 262 this = [end_ws] 263 if this: 264 yield this[0] if len(this)==1 else klass(this) 265 266 def startswith_fws(self): 267 return self[0].startswith_fws() 268 269 def pop_leading_fws(self): 270 if self[0].token_type == 'fws': 271 return self.pop(0) 272 return self[0].pop_leading_fws() 273 274 def pop_trailing_ws(self): 275 if self[-1].token_type == 'cfws': 276 return self.pop(-1) 277 return self[-1].pop_trailing_ws() 278 279 @property 280 def has_fws(self): 281 for part in self: 282 if part.has_fws: 283 return True 284 return False 285 286 def has_leading_comment(self): 287 return self[0].has_leading_comment() 288 289 @property 290 def comments(self): 291 comments = [] 292 for token in self: 293 comments.extend(token.comments) 294 return comments 295 296 def fold(self, *, policy): 297 # max_line_length 0/None means no limit, ie: infinitely long. 298 maxlen = policy.max_line_length or float("+inf") 299 folded = _Folded(maxlen, policy) 300 self._fold(folded) 301 folded.finalize() 302 return str(folded) 303 304 def as_encoded_word(self, charset): 305 # This works only for things returned by 'parts', which include 306 # the leading fws, if any, that should be used. 
307 res = [] 308 ws = self.pop_leading_fws() 309 if ws: 310 res.append(ws) 311 trailer = self.pop(-1) if self[-1].token_type=='fws' else '' 312 res.append(_ew.encode(str(self), charset)) 313 res.append(trailer) 314 return ''.join(res) 315 316 def cte_encode(self, charset, policy): 317 res = [] 318 for part in self: 319 res.append(part.cte_encode(charset, policy)) 320 return ''.join(res) 321 322 def _fold(self, folded): 323 encoding = 'utf-8' if folded.policy.utf8 else 'ascii' 324 for part in self.parts: 325 tstr = str(part) 326 tlen = len(tstr) 327 try: 328 str(part).encode(encoding) 329 except UnicodeEncodeError: 330 if any(isinstance(x, errors.UndecodableBytesDefect) 331 for x in part.all_defects): 332 charset = 'unknown-8bit' 333 else: 334 # XXX: this should be a policy setting when utf8 is False. 335 charset = 'utf-8' 336 tstr = part.cte_encode(charset, folded.policy) 337 tlen = len(tstr) 338 if folded.append_if_fits(part, tstr): 339 continue 340 # Peel off the leading whitespace if any and make it sticky, to 341 # avoid infinite recursion. 342 ws = part.pop_leading_fws() 343 if ws is not None: 344 # Peel off the leading whitespace and make it sticky, to 345 # avoid infinite recursion. 346 folded.stickyspace = str(part.pop(0)) 347 if folded.append_if_fits(part): 348 continue 349 if part.has_fws: 350 part._fold(folded) 351 continue 352 # There are no fold points in this one; it is too long for a single 353 # line and can't be split...we just have to put it on its own line. 354 folded.append(tstr) 355 folded.newline() 356 357 def pprint(self, indent=''): 358 print('\n'.join(self._pp(indent=''))) 359 360 def ppstr(self, indent=''): 361 return '\n'.join(self._pp(indent='')) 362 363 def _pp(self, indent=''): 364 yield '{}{}/{}('.format( 365 indent, 366 self.__class__.__name__, 367 self.token_type) 368 for token in self: 369 if not hasattr(token, '_pp'): 370 yield (indent + ' !! 
invalid element in token ' 371 'list: {!r}'.format(token)) 372 else: 373 yield from token._pp(indent+' ') 374 if self.defects: 375 extra = ' Defects: {}'.format(self.defects) 376 else: 377 extra = '' 378 yield '{}){}'.format(indent, extra) 379 380 381 class WhiteSpaceTokenList(TokenList): 382 383 @property 384 def value(self): 385 return ' ' 386 387 @property 388 def comments(self): 389 return [x.content for x in self if x.token_type=='comment'] 390 391 392 class UnstructuredTokenList(TokenList): 393 394 token_type = 'unstructured' 395 396 def _fold(self, folded): 397 last_ew = None 398 encoding = 'utf-8' if folded.policy.utf8 else 'ascii' 399 for part in self.parts: 400 tstr = str(part) 401 is_ew = False 402 try: 403 str(part).encode(encoding) 404 except UnicodeEncodeError: 405 if any(isinstance(x, errors.UndecodableBytesDefect) 406 for x in part.all_defects): 407 charset = 'unknown-8bit' 408 else: 409 charset = 'utf-8' 410 if last_ew is not None: 411 # We've already done an EW, combine this one with it 412 # if there's room. 413 chunk = get_unstructured( 414 ''.join(folded.current[last_ew:]+[tstr])).as_encoded_word(charset) 415 oldlastlen = sum(len(x) for x in folded.current[:last_ew]) 416 schunk = str(chunk) 417 lchunk = len(schunk) 418 if oldlastlen + lchunk <= folded.maxlen: 419 del folded.current[last_ew:] 420 folded.append(schunk) 421 folded.lastlen = oldlastlen + lchunk 422 continue 423 tstr = part.as_encoded_word(charset) 424 is_ew = True 425 if folded.append_if_fits(part, tstr): 426 if is_ew: 427 last_ew = len(folded.current) - 1 428 continue 429 if is_ew or last_ew: 430 # It's too big to fit on the line, but since we've 431 # got encoded words we can use encoded word folding. 432 part._fold_as_ew(folded) 433 continue 434 # Peel off the leading whitespace if any and make it sticky, to 435 # avoid infinite recursion. 
436 ws = part.pop_leading_fws() 437 if ws is not None: 438 folded.stickyspace = str(ws) 439 if folded.append_if_fits(part): 440 continue 441 if part.has_fws: 442 part._fold(folded) 443 continue 444 # It can't be split...we just have to put it on its own line. 445 folded.append(tstr) 446 folded.newline() 447 last_ew = None 448 449 def cte_encode(self, charset, policy): 450 res = [] 451 last_ew = None 452 for part in self: 453 spart = str(part) 454 try: 455 spart.encode('us-ascii') 456 res.append(spart) 457 except UnicodeEncodeError: 458 if last_ew is None: 459 res.append(part.cte_encode(charset, policy)) 460 last_ew = len(res) 461 else: 462 tl = get_unstructured(''.join(res[last_ew:] + [spart])) 463 res.append(tl.as_encoded_word(charset)) 464 return ''.join(res) 465 466 467 class Phrase(TokenList): 468 469 token_type = 'phrase' 470 471 def _fold(self, folded): 472 # As with Unstructured, we can have pure ASCII with or without 473 # surrogateescape encoded bytes, or we could have unicode. But this 474 # case is more complicated, since we have to deal with the various 475 # sub-token types and how they can be composed in the face of 476 # unicode-that-needs-CTE-encoding, and the fact that if a token a 477 # comment that becomes a barrier across which we can't compose encoded 478 # words. 479 last_ew = None 480 encoding = 'utf-8' if folded.policy.utf8 else 'ascii' 481 for part in self.parts: 482 tstr = str(part) 483 tlen = len(tstr) 484 has_ew = False 485 try: 486 str(part).encode(encoding) 487 except UnicodeEncodeError: 488 if any(isinstance(x, errors.UndecodableBytesDefect) 489 for x in part.all_defects): 490 charset = 'unknown-8bit' 491 else: 492 charset = 'utf-8' 493 if last_ew is not None and not part.has_leading_comment(): 494 # We've already done an EW, let's see if we can combine 495 # this one with it. The last_ew logic ensures that all we 496 # have at this point is atoms, no comments or quoted 497 # strings. 
So we can treat the text between the last 498 # encoded word and the content of this token as 499 # unstructured text, and things will work correctly. But 500 # we have to strip off any trailing comment on this token 501 # first, and if it is a quoted string we have to pull out 502 # the content (we're encoding it, so it no longer needs to 503 # be quoted). 504 if part[-1].token_type == 'cfws' and part.comments: 505 remainder = part.pop(-1) 506 else: 507 remainder = '' 508 for i, token in enumerate(part): 509 if token.token_type == 'bare-quoted-string': 510 part[i] = UnstructuredTokenList(token[:]) 511 chunk = get_unstructured( 512 ''.join(folded.current[last_ew:]+[tstr])).as_encoded_word(charset) 513 schunk = str(chunk) 514 lchunk = len(schunk) 515 if last_ew + lchunk <= folded.maxlen: 516 del folded.current[last_ew:] 517 folded.append(schunk) 518 folded.lastlen = sum(len(x) for x in folded.current) 519 continue 520 tstr = part.as_encoded_word(charset) 521 tlen = len(tstr) 522 has_ew = True 523 if folded.append_if_fits(part, tstr): 524 if has_ew and not part.comments: 525 last_ew = len(folded.current) - 1 526 elif part.comments or part.token_type == 'quoted-string': 527 # If a comment is involved we can't combine EWs. And if a 528 # quoted string is involved, it's not worth the effort to 529 # try to combine them. 
530 last_ew = None 531 continue 532 part._fold(folded) 533 534 def cte_encode(self, charset, policy): 535 res = [] 536 last_ew = None 537 is_ew = False 538 for part in self: 539 spart = str(part) 540 try: 541 spart.encode('us-ascii') 542 res.append(spart) 543 except UnicodeEncodeError: 544 is_ew = True 545 if last_ew is None: 546 if not part.comments: 547 last_ew = len(res) 548 res.append(part.cte_encode(charset, policy)) 549 elif not part.has_leading_comment(): 550 if part[-1].token_type == 'cfws' and part.comments: 551 remainder = part.pop(-1) 552 else: 553 remainder = '' 554 for i, token in enumerate(part): 555 if token.token_type == 'bare-quoted-string': 556 part[i] = UnstructuredTokenList(token[:]) 557 tl = get_unstructured(''.join(res[last_ew:] + [spart])) 558 res[last_ew:] = [tl.as_encoded_word(charset)] 559 if part.comments or (not is_ew and part.token_type == 'quoted-string'): 560 last_ew = None 561 return ''.join(res) 562 563 class Word(TokenList): 564 565 token_type = 'word' 566 567 568 class CFWSList(WhiteSpaceTokenList): 569 570 token_type = 'cfws' 571 572 def has_leading_comment(self): 573 return bool(self.comments) 574 575 576 class Atom(TokenList): 577 578 token_type = 'atom' 579 580 581 class Token(TokenList): 582 583 token_type = 'token' 584 585 586 class EncodedWord(TokenList): 587 588 token_type = 'encoded-word' 589 cte = None 590 charset = None 591 lang = None 592 593 @property 594 def encoded(self): 595 if self.cte is not None: 596 return self.cte 597 _ew.encode(str(self), self.charset) 598 599 600 601 class QuotedString(TokenList): 602 603 token_type = 'quoted-string' 604 605 @property 606 def content(self): 607 for x in self: 608 if x.token_type == 'bare-quoted-string': 609 return x.value 610 611 @property 612 def quoted_value(self): 613 res = [] 614 for x in self: 615 if x.token_type == 'bare-quoted-string': 616 res.append(str(x)) 617 else: 618 res.append(x.value) 619 return ''.join(res) 620 621 @property 622 def stripped_value(self): 623 
for token in self: 624 if token.token_type == 'bare-quoted-string': 625 return token.value 626 627 628 class BareQuotedString(QuotedString): 629 630 token_type = 'bare-quoted-string' 631 632 def __str__(self): 633 return quote_string(''.join(str(x) for x in self)) 634 635 @property 636 def value(self): 637 return ''.join(str(x) for x in self) 638 639 640 class Comment(WhiteSpaceTokenList): 641 642 token_type = 'comment' 643 644 def __str__(self): 645 return ''.join(sum([ 646 ["("], 647 [self.quote(x) for x in self], 648 [")"], 649 ], [])) 650 651 def quote(self, value): 652 if value.token_type == 'comment': 653 return str(value) 654 return str(value).replace('\\', '\\\\').replace( 655 '(', r'\(').replace( 656 ')', r'\)') 657 658 @property 659 def content(self): 660 return ''.join(str(x) for x in self) 661 662 @property 663 def comments(self): 664 return [self.content] 665 666 class AddressList(TokenList): 667 668 token_type = 'address-list' 669 670 @property 671 def addresses(self): 672 return [x for x in self if x.token_type=='address'] 673 674 @property 675 def mailboxes(self): 676 return sum((x.mailboxes 677 for x in self if x.token_type=='address'), []) 678 679 @property 680 def all_mailboxes(self): 681 return sum((x.all_mailboxes 682 for x in self if x.token_type=='address'), []) 683 684 685 class Address(TokenList): 686 687 token_type = 'address' 688 689 @property 690 def display_name(self): 691 if self[0].token_type == 'group': 692 return self[0].display_name 693 694 @property 695 def mailboxes(self): 696 if self[0].token_type == 'mailbox': 697 return [self[0]] 698 elif self[0].token_type == 'invalid-mailbox': 699 return [] 700 return self[0].mailboxes 701 702 @property 703 def all_mailboxes(self): 704 if self[0].token_type == 'mailbox': 705 return [self[0]] 706 elif self[0].token_type == 'invalid-mailbox': 707 return [self[0]] 708 return self[0].all_mailboxes 709 710 class MailboxList(TokenList): 711 712 token_type = 'mailbox-list' 713 714 @property 715 def 
mailboxes(self): 716 return [x for x in self if x.token_type=='mailbox'] 717 718 @property 719 def all_mailboxes(self): 720 return [x for x in self 721 if x.token_type in ('mailbox', 'invalid-mailbox')] 722 723 724 class GroupList(TokenList): 725 726 token_type = 'group-list' 727 728 @property 729 def mailboxes(self): 730 if not self or self[0].token_type != 'mailbox-list': 731 return [] 732 return self[0].mailboxes 733 734 @property 735 def all_mailboxes(self): 736 if not self or self[0].token_type != 'mailbox-list': 737 return [] 738 return self[0].all_mailboxes 739 740 741 class Group(TokenList): 742 743 token_type = "group" 744 745 @property 746 def mailboxes(self): 747 if self[2].token_type != 'group-list': 748 return [] 749 return self[2].mailboxes 750 751 @property 752 def all_mailboxes(self): 753 if self[2].token_type != 'group-list': 754 return [] 755 return self[2].all_mailboxes 756 757 @property 758 def display_name(self): 759 return self[0].display_name 760 761 762 class NameAddr(TokenList): 763 764 token_type = 'name-addr' 765 766 @property 767 def display_name(self): 768 if len(self) == 1: 769 return None 770 return self[0].display_name 771 772 @property 773 def local_part(self): 774 return self[-1].local_part 775 776 @property 777 def domain(self): 778 return self[-1].domain 779 780 @property 781 def route(self): 782 return self[-1].route 783 784 @property 785 def addr_spec(self): 786 return self[-1].addr_spec 787 788 789 class AngleAddr(TokenList): 790 791 token_type = 'angle-addr' 792 793 @property 794 def local_part(self): 795 for x in self: 796 if x.token_type == 'addr-spec': 797 return x.local_part 798 799 @property 800 def domain(self): 801 for x in self: 802 if x.token_type == 'addr-spec': 803 return x.domain 804 805 @property 806 def route(self): 807 for x in self: 808 if x.token_type == 'obs-route': 809 return x.domains 810 811 @property 812 def addr_spec(self): 813 for x in self: 814 if x.token_type == 'addr-spec': 815 return x.addr_spec 
816 else: 817 return '<>' 818 819 820 class ObsRoute(TokenList): 821 822 token_type = 'obs-route' 823 824 @property 825 def domains(self): 826 return [x.domain for x in self if x.token_type == 'domain'] 827 828 829 class Mailbox(TokenList): 830 831 token_type = 'mailbox' 832 833 @property 834 def display_name(self): 835 if self[0].token_type == 'name-addr': 836 return self[0].display_name 837 838 @property 839 def local_part(self): 840 return self[0].local_part 841 842 @property 843 def domain(self): 844 return self[0].domain 845 846 @property 847 def route(self): 848 if self[0].token_type == 'name-addr': 849 return self[0].route 850 851 @property 852 def addr_spec(self): 853 return self[0].addr_spec 854 855 856 class InvalidMailbox(TokenList): 857 858 token_type = 'invalid-mailbox' 859 860 @property 861 def display_name(self): 862 return None 863 864 local_part = domain = route = addr_spec = display_name 865 866 867 class Domain(TokenList): 868 869 token_type = 'domain' 870 871 @property 872 def domain(self): 873 return ''.join(super().value.split()) 874 875 876 class DotAtom(TokenList): 877 878 token_type = 'dot-atom' 879 880 881 class DotAtomText(TokenList): 882 883 token_type = 'dot-atom-text' 884 885 886 class AddrSpec(TokenList): 887 888 token_type = 'addr-spec' 889 890 @property 891 def local_part(self): 892 return self[0].local_part 893 894 @property 895 def domain(self): 896 if len(self) < 3: 897 return None 898 return self[-1].domain 899 900 @property 901 def value(self): 902 if len(self) < 3: 903 return self[0].value 904 return self[0].value.rstrip()+self[1].value+self[2].value.lstrip() 905 906 @property 907 def addr_spec(self): 908 nameset = set(self.local_part) 909 if len(nameset) > len(nameset-DOT_ATOM_ENDS): 910 lp = quote_string(self.local_part) 911 else: 912 lp = self.local_part 913 if self.domain is not None: 914 return lp + '@' + self.domain 915 return lp 916 917 918 class ObsLocalPart(TokenList): 919 920 token_type = 'obs-local-part' 921 922 923 
class DisplayName(Phrase): 924 925 token_type = 'display-name' 926 927 @property 928 def display_name(self): 929 res = TokenList(self) 930 if res[0].token_type == 'cfws': 931 res.pop(0) 932 else: 933 if res[0][0].token_type == 'cfws': 934 res[0] = TokenList(res[0][1:]) 935 if res[-1].token_type == 'cfws': 936 res.pop() 937 else: 938 if res[-1][-1].token_type == 'cfws': 939 res[-1] = TokenList(res[-1][:-1]) 940 return res.value 941 942 @property 943 def value(self): 944 quote = False 945 if self.defects: 946 quote = True 947 else: 948 for x in self: 949 if x.token_type == 'quoted-string': 950 quote = True 951 if quote: 952 pre = post = '' 953 if self[0].token_type=='cfws' or self[0][0].token_type=='cfws': 954 pre = ' ' 955 if self[-1].token_type=='cfws' or self[-1][-1].token_type=='cfws': 956 post = ' ' 957 return pre+quote_string(self.display_name)+post 958 else: 959 return super().value 960 961 962 class LocalPart(TokenList): 963 964 token_type = 'local-part' 965 966 @property 967 def value(self): 968 if self[0].token_type == "quoted-string": 969 return self[0].quoted_value 970 else: 971 return self[0].value 972 973 @property 974 def local_part(self): 975 # Strip whitespace from front, back, and around dots. 
976 res = [DOT] 977 last = DOT 978 last_is_tl = False 979 for tok in self[0] + [DOT]: 980 if tok.token_type == 'cfws': 981 continue 982 if (last_is_tl and tok.token_type == 'dot' and 983 last[-1].token_type == 'cfws'): 984 res[-1] = TokenList(last[:-1]) 985 is_tl = isinstance(tok, TokenList) 986 if (is_tl and last.token_type == 'dot' and 987 tok[0].token_type == 'cfws'): 988 res.append(TokenList(tok[1:])) 989 else: 990 res.append(tok) 991 last = res[-1] 992 last_is_tl = is_tl 993 res = TokenList(res[1:-1]) 994 return res.value 995 996 997 class DomainLiteral(TokenList): 998 999 token_type = 'domain-literal' 1000 1001 @property 1002 def domain(self): 1003 return ''.join(super().value.split()) 1004 1005 @property 1006 def ip(self): 1007 for x in self: 1008 if x.token_type == 'ptext': 1009 return x.value 1010 1011 1012 class MIMEVersion(TokenList): 1013 1014 token_type = 'mime-version' 1015 major = None 1016 minor = None 1017 1018 1019 class Parameter(TokenList): 1020 1021 token_type = 'parameter' 1022 sectioned = False 1023 extended = False 1024 charset = 'us-ascii' 1025 1026 @property 1027 def section_number(self): 1028 # Because the first token, the attribute (name) eats CFWS, the second 1029 # token is always the section if there is one. 1030 return self[1].number if self.sectioned else 0 1031 1032 @property 1033 def param_value(self): 1034 # This is part of the "handle quoted extended parameters" hack. 
        # (Continuation of a stripped_value property begun before this chunk.)
        # Return the first 'value' token's stripped value, descending through
        # quoted-string/bare-quoted-string wrappers if necessary.
        for token in self:
            if token.token_type == 'value':
                return token.stripped_value
            if token.token_type == 'quoted-string':
                for token in token:
                    if token.token_type == 'bare-quoted-string':
                        for token in token:
                            if token.token_type == 'value':
                                return token.stripped_value
        return ''


class InvalidParameter(Parameter):
    # A parameter that could not be parsed; keeps the Parameter API but is
    # tagged with a distinct token_type so consumers can detect it.

    token_type = 'invalid-parameter'


class Attribute(TokenList):

    token_type = 'attribute'

    @property
    def stripped_value(self):
        # The attribute's value is the text of its (extended-)attrtext token.
        for token in self:
            if token.token_type.endswith('attrtext'):
                return token.value

class Section(TokenList):

    token_type = 'section'
    # RFC 2231 section number; None means no (valid) section number was seen.
    number = None


class Value(TokenList):

    token_type = 'value'

    @property
    def stripped_value(self):
        # Skip any leading CFWS, then delegate to the real value token when it
        # is a type that knows how to strip itself.
        token = self[0]
        if token.token_type == 'cfws':
            token = self[1]
        if token.token_type.endswith(
                ('quoted-string', 'attribute', 'extended-attribute')):
            return token.stripped_value
        return self.value


class MimeParameters(TokenList):

    token_type = 'mime-parameters'

    @property
    def params(self):
        """Yield (name, value) pairs, reassembling RFC 2231 split parameters.

        Generator; values have RFC 2231 percent-encoding and charset
        decoding already applied.
        """
        # The RFC specifically states that the ordering of parameters is not
        # guaranteed and may be reordered by the transport layer.  So we have
        # to assume the RFC 2231 pieces can come in any order.  However, we
        # output them in the order that we first see a given name, which gives
        # us a stable __str__.
        params = OrderedDict()
        for token in self:
            if not token.token_type.endswith('parameter'):
                continue
            if token[0].token_type != 'attribute':
                continue
            name = token[0].value.strip()
            if name not in params:
                params[name] = []
            params[name].append((token.section_number, token))
        for name, parts in params.items():
            parts = sorted(parts, key=itemgetter(0))
            first_param = parts[0][1]
            charset = first_param.charset
            # Our arbitrary error recovery is to ignore duplicate parameters,
            # to use appearance order if there are duplicate rfc 2231 parts,
            # and to ignore gaps.  This mimics the error recovery of get_param.
            if not first_param.extended and len(parts) > 1:
                if parts[1][0] == 0:
                    parts[1][1].defects.append(errors.InvalidHeaderDefect(
                        'duplicate parameter name; duplicate(s) ignored'))
                parts = parts[:1]
                # Else assume the *0* was missing...note that this is different
                # from get_param, but we registered a defect for this earlier.
            value_parts = []
            i = 0
            for section_number, param in parts:
                if section_number != i:
                    # We could get fancier here and look for a complete
                    # duplicate extended parameter and ignore the second one
                    # seen.  But we're not doing that.  The old code didn't.
                    if not param.extended:
                        param.defects.append(errors.InvalidHeaderDefect(
                            'duplicate parameter name; duplicate ignored'))
                        continue
                    else:
                        param.defects.append(errors.InvalidHeaderDefect(
                            "inconsistent RFC2231 parameter numbering"))
                i += 1
                value = param.param_value
                if param.extended:
                    try:
                        value = urllib.parse.unquote_to_bytes(value)
                    except UnicodeEncodeError:
                        # source had surrogate escaped bytes.  What we do now
                        # is a bit of an open question.  I'm not sure this is
                        # the best choice, but it is what the old algorithm did
                        value = urllib.parse.unquote(value, encoding='latin-1')
                    else:
                        try:
                            value = value.decode(charset, 'surrogateescape')
                        except LookupError:
                            # XXX: there should really be a custom defect for
                            # unknown character set to make it easy to find,
                            # because otherwise unknown charset is a silent
                            # failure.
                            value = value.decode('us-ascii', 'surrogateescape')
                    if utils._has_surrogates(value):
                        param.defects.append(errors.UndecodableBytesDefect())
                value_parts.append(value)
            value = ''.join(value_parts)
            yield name, value

    def __str__(self):
        # Canonical form: '; '-separated name=value pairs with a leading space.
        params = []
        for name, value in self.params:
            if value:
                params.append('{}={}'.format(name, quote_string(value)))
            else:
                params.append(name)
        params = '; '.join(params)
        return ' ' + params if params else ''


class ParameterizedHeaderValue(TokenList):

    @property
    def params(self):
        # The mime-parameters token, if any, is searched for from the end
        # since it normally follows the primary value.
        for token in reversed(self):
            if token.token_type == 'mime-parameters':
                return token.params
        return {}

    @property
    def parts(self):
        if self and self[-1].token_type == 'mime-parameters':
            # We don't want to start a new line if all of the params don't fit
            # after the value, so unwrap the parameter list.
            return TokenList(self[:-1] + self[-1])
        return TokenList(self).parts


class ContentType(ParameterizedHeaderValue):

    token_type = 'content-type'
    # Defaults per RFC 2045: an unparseable Content-Type is text/plain.
    maintype = 'text'
    subtype = 'plain'


class ContentDisposition(ParameterizedHeaderValue):

    token_type = 'content-disposition'
    content_disposition = None


class ContentTransferEncoding(TokenList):

    token_type = 'content-transfer-encoding'
    # Default per RFC 2045 section 6.1.
    cte = '7bit'


class HeaderLabel(TokenList):

    token_type = 'header-label'


class Header(TokenList):

    token_type = 'header'

    def _fold(self, folded):
        # Expected structure: [header-label, optional cfws, value].  The
        # label is emitted verbatim on the first line.
        folded.append(str(self.pop(0)))
        folded.lastlen = len(folded.current[0])
        # The first line of the header is different from all others: we don't
        # want to start a new object on a new line if it has any fold points in
        # it that would allow part of it to be on the first header line.
        # Further, if the first fold point would fit on the new line, we want
        # to do that, but if it doesn't we want to put it on the first line.
        # Folded supports this via the stickyspace attribute.  If this
        # attribute is not None, it does the special handling.
        folded.stickyspace = str(self.pop(0)) if self[0].token_type == 'cfws' else ''
        rest = self.pop(0)
        if self:
            raise ValueError("Malformed Header token list")
        rest._fold(folded)


#
# Terminal classes and instances
#

class Terminal(str):
    """Terminal node of the parse tree: a str subclass with token metadata."""

    def __new__(cls, value, token_type):
        self = super().__new__(cls, value)
        self.token_type = token_type
        self.defects = []
        return self

    def __repr__(self):
        return "{}({})".format(self.__class__.__name__, super().__repr__())

    @property
    def all_defects(self):
        return list(self.defects)

    def _pp(self, indent=''):
        # Pretty-print helper used by TokenList.ppstr/pprint.
        return ["{}{}/{}({}){}".format(
            indent,
            self.__class__.__name__,
            self.token_type,
            super().__repr__(),
            '' if not self.defects else ' {}'.format(self.defects),
            )]

    def cte_encode(self, charset, policy):
        # Return the value as-is if pure ASCII, otherwise as an encoded word.
        value = str(self)
        try:
            value.encode('us-ascii')
            return value
        except UnicodeEncodeError:
            return _ew.encode(value, charset)

    def pop_trailing_ws(self):
        # This terminates the recursion.
        return None

    def pop_leading_fws(self):
        # This terminates the recursion.
        return None

    @property
    def comments(self):
        return []

    def has_leading_comment(self):
        return False

    def __getnewargs__(self):
        # Support copy/pickle despite the extra __new__ argument.
        return(str(self), self.token_type)


class WhiteSpaceTerminal(Terminal):
    # Any run of whitespace; semantic value is a single space per the RFC.

    @property
    def value(self):
        return ' '

    def startswith_fws(self):
        return True

    has_fws = True


class ValueTerminal(Terminal):
    # A run of non-whitespace characters; semantic value is the text itself.

    @property
    def value(self):
        return self

    def startswith_fws(self):
        return False

    has_fws = False

    def as_encoded_word(self, charset):
        return _ew.encode(str(self), charset)


class EWWhiteSpaceTerminal(WhiteSpaceTerminal):
    # Whitespace between two encoded words: invisible in the decoded value
    # (RFC 2047 says adjacent encoded words are concatenated).

    @property
    def value(self):
        return ''

    @property
    def encoded(self):
        return self[:]

    def __str__(self):
        return ''

    has_fws = True


# XXX these need to become classes and used as instances so
# that a program can't change them in a parse tree and screw
# up other parse trees.  Maybe should have tests for that, too.
DOT = ValueTerminal('.', 'dot')
ListSeparator = ValueTerminal(',', 'list-separator')
RouteComponentMarker = ValueTerminal('@', 'route-component-marker')

#
# Parser
#

# Parse strings according to RFC822/2047/2822/5322 rules.
#
# This is a stateless parser.  Each get_XXX function accepts a string and
# returns either a Terminal or a TokenList representing the RFC object named
# by the method and a string containing the remaining unparsed characters
# from the input.  Thus a parser method consumes the next syntactic construct
# of a given type and returns a token representing the construct plus the
# unparsed remainder of the input string.
#
# For example, if the first element of a structured header is a 'phrase',
# then:
#
#     phrase, value = get_phrase(value)
#
# returns the complete phrase from the start of the string value, plus any
# characters left in the string after the phrase is removed.

# Pre-compiled scanners; each *_ENDS set is escaped so '\' and ']' are safe
# inside the character class.
_wsp_splitter = re.compile(r'([{}]+)'.format(''.join(WSP))).split
_non_atom_end_matcher = re.compile(r"[^{}]+".format(
    ''.join(ATOM_ENDS).replace('\\','\\\\').replace(']',r'\]'))).match
_non_printable_finder = re.compile(r"[\x00-\x20\x7F]").findall
_non_token_end_matcher = re.compile(r"[^{}]+".format(
    ''.join(TOKEN_ENDS).replace('\\','\\\\').replace(']',r'\]'))).match
_non_attribute_end_matcher = re.compile(r"[^{}]+".format(
    ''.join(ATTRIBUTE_ENDS).replace('\\','\\\\').replace(']',r'\]'))).match
_non_extended_attribute_end_matcher = re.compile(r"[^{}]+".format(
    ''.join(EXTENDED_ATTRIBUTE_ENDS).replace(
        '\\','\\\\').replace(']',r'\]'))).match

def _validate_xtext(xtext):
    """If input token contains ASCII non-printables, register a defect."""

    non_printables = _non_printable_finder(xtext)
    if non_printables:
        xtext.defects.append(errors.NonPrintableDefect(non_printables))
    if utils._has_surrogates(xtext):
        xtext.defects.append(errors.UndecodableBytesDefect(
            "Non-ASCII characters found in header token"))

def _get_ptext_to_endchars(value, endchars):
    """Scan printables/quoted-pairs until endchars and return unquoted ptext.

    This function turns a run of qcontent, ccontent-without-comments, or
    dtext-with-quoted-printables into a single string by unquoting any
    quoted printables.  It returns the string, the remaining value, and
    a flag that is True iff there were any quoted printables decoded.

    """
    # Scanning stops at the first whitespace; the caller handles FWS.
    fragment, *remainder = _wsp_splitter(value, 1)
    vchars = []
    escape = False      # True when the previous char was an unescaped '\'
    had_qp = False
    for pos in range(len(fragment)):
        if fragment[pos] == '\\':
            if escape:
                escape = False
                had_qp = True
            else:
                escape = True
                continue
        if escape:
            escape = False
        elif fragment[pos] in endchars:
            break
        vchars.append(fragment[pos])
    else:
        # Consumed the whole fragment without hitting an endchar; make the
        # slice below empty.
        pos = pos + 1
    return ''.join(vchars), ''.join([fragment[pos:]] + remainder), had_qp

def get_fws(value):
    """FWS = 1*WSP

    This isn't the RFC definition.  We're using fws to represent tokens where
    folding can be done, but when we are parsing the *un*folding has already
    been done so we don't need to watch out for CRLF.

    """
    newvalue = value.lstrip()
    fws = WhiteSpaceTerminal(value[:len(value)-len(newvalue)], 'fws')
    return fws, newvalue

def get_encoded_word(value):
    """ encoded-word = "=?" charset "?" encoding "?" encoded-text "?="

    """
    ew = EncodedWord()
    if not value.startswith('=?'):
        raise errors.HeaderParseError(
            "expected encoded word but found {}".format(value))
    tok, *remainder = value[2:].split('?=', 1)
    if tok == value[2:]:
        # No '?=' terminator anywhere in the input.
        raise errors.HeaderParseError(
            "expected encoded word but found {}".format(value))
    remstr = ''.join(remainder)
    if len(remstr) > 1 and remstr[0] in hexdigits and remstr[1] in hexdigits:
        # The ? after the CTE was followed by an encoded word escape (=XX).
        rest, *remainder = remstr.split('?=', 1)
        tok = tok + '?=' + rest
    if len(tok.split()) > 1:
        ew.defects.append(errors.InvalidHeaderDefect(
            "whitespace inside encoded word"))
    ew.cte = value
    value = ''.join(remainder)
    try:
        text, charset, lang, defects = _ew.decode('=?' + tok + '?=')
    except ValueError:
        raise errors.HeaderParseError(
            "encoded word format invalid: '{}'".format(ew.cte))
    ew.charset = charset
    ew.lang = lang
    ew.defects.extend(defects)
    # Tokenize the decoded text into fws/vtext children.
    while text:
        if text[0] in WSP:
            token, text = get_fws(text)
            ew.append(token)
            continue
        chars, *remainder = _wsp_splitter(text, 1)
        vtext = ValueTerminal(chars, 'vtext')
        _validate_xtext(vtext)
        ew.append(vtext)
        text = ''.join(remainder)
    return ew, value

def get_unstructured(value):
    """unstructured = (*([FWS] vchar) *WSP) / obs-unstruct
       obs-unstruct = *((*LF *CR *(obs-utext) *LF *CR)) / FWS)
       obs-utext = %d0 / obs-NO-WS-CTL / LF / CR

    obs-NO-WS-CTL is control characters except WSP/CR/LF.

    So, basically, we have printable runs, plus control characters or nulls in
    the obsolete syntax, separated by whitespace.  Since RFC 2047 uses the
    obsolete syntax in its specification, but requires whitespace on either
    side of the encoded words, I can see no reason to need to separate the
    non-printable-non-whitespace from the printable runs if they occur, so we
    parse this into xtext tokens separated by WSP tokens.

    Because an 'unstructured' value must by definition constitute the entire
    value, this 'get' routine does not return a remaining value, only the
    parsed TokenList.

    """
    # XXX: but what about bare CR and LF?  They might signal the start or
    # end of an encoded word.  YAGNI for now, since our current parsers
    # will never send us strings with bare CR or LF.

    unstructured = UnstructuredTokenList()
    while value:
        if value[0] in WSP:
            token, value = get_fws(value)
            unstructured.append(token)
            continue
        if value.startswith('=?'):
            try:
                token, value = get_encoded_word(value)
            except errors.HeaderParseError:
                # XXX: Need to figure out how to register defects when
                # appropriate here.
                pass
            else:
                have_ws = True
                if len(unstructured) > 0:
                    if unstructured[-1].token_type != 'fws':
                        unstructured.defects.append(errors.InvalidHeaderDefect(
                            "missing whitespace before encoded word"))
                        have_ws = False
                if have_ws and len(unstructured) > 1:
                    if unstructured[-2].token_type == 'encoded-word':
                        # Whitespace between two encoded words is not part of
                        # the decoded value (RFC 2047).
                        unstructured[-1] = EWWhiteSpaceTerminal(
                            unstructured[-1], 'fws')
                unstructured.append(token)
                continue
        tok, *remainder = _wsp_splitter(value, 1)
        vtext = ValueTerminal(tok, 'vtext')
        _validate_xtext(vtext)
        unstructured.append(vtext)
        value = ''.join(remainder)
    return unstructured

def get_qp_ctext(value):
    r"""ctext = <printable ascii except \ ( )>

    This is not the RFC ctext, since we are handling nested comments in comment
    and unquoting quoted-pairs here.  We allow anything except the '()'
    characters, but if we find any ASCII other than the RFC defined printable
    ASCII, a NonPrintableDefect is added to the token's defects list.  Since
    quoted pairs are converted to their unquoted values, what is returned is
    a 'ptext' token.  In this case it is a WhiteSpaceTerminal, so its value
    is ' '.

    """
    ptext, value, _ = _get_ptext_to_endchars(value, '()')
    ptext = WhiteSpaceTerminal(ptext, 'ptext')
    _validate_xtext(ptext)
    return ptext, value

def get_qcontent(value):
    """qcontent = qtext / quoted-pair

    We allow anything except the DQUOTE character, but if we find any ASCII
    other than the RFC defined printable ASCII, a NonPrintableDefect is
    added to the token's defects list.  Any quoted pairs are converted to their
    unquoted values, so what is returned is a 'ptext' token.  In this case it
    is a ValueTerminal.

    """
    ptext, value, _ = _get_ptext_to_endchars(value, '"')
    ptext = ValueTerminal(ptext, 'ptext')
    _validate_xtext(ptext)
    return ptext, value

def get_atext(value):
    """atext = <matches _atext_matcher>

    We allow any non-ATOM_ENDS in atext, but add an InvalidATextDefect to
    the token's defects list if we find non-atext characters.
    """
    m = _non_atom_end_matcher(value)
    if not m:
        raise errors.HeaderParseError(
            "expected atext but found '{}'".format(value))
    atext = m.group()
    value = value[len(atext):]
    atext = ValueTerminal(atext, 'atext')
    _validate_xtext(atext)
    return atext, value

def get_bare_quoted_string(value):
    """bare-quoted-string = DQUOTE *([FWS] qcontent) [FWS] DQUOTE

    A quoted-string without the leading or trailing white space.  Its
    value is the text between the quote marks, with whitespace
    preserved and quoted pairs decoded.
    """
    if value[0] != '"':
        raise errors.HeaderParseError(
            "expected '\"' but found '{}'".format(value))
    bare_quoted_string = BareQuotedString()
    value = value[1:]
    while value and value[0] != '"':
        if value[0] in WSP:
            token, value = get_fws(value)
        elif value[:2] == '=?':
            # An encoded word inside a quoted string is not legal, but is
            # seen in the wild; decode it and record a defect.
            try:
                token, value = get_encoded_word(value)
                bare_quoted_string.defects.append(errors.InvalidHeaderDefect(
                    "encoded word inside quoted string"))
            except errors.HeaderParseError:
                token, value = get_qcontent(value)
        else:
            token, value = get_qcontent(value)
        bare_quoted_string.append(token)
    if not value:
        bare_quoted_string.defects.append(errors.InvalidHeaderDefect(
            "end of header inside quoted string"))
        return bare_quoted_string, value
    return bare_quoted_string, value[1:]

def get_comment(value):
    """comment = "(" *([FWS] ccontent) [FWS] ")"
       ccontent = ctext / quoted-pair / comment

    We handle nested comments here, and quoted-pair in our qp-ctext routine.
    """
    if value and value[0] != '(':
        raise errors.HeaderParseError(
            "expected '(' but found '{}'".format(value))
    comment = Comment()
    value = value[1:]
    while value and value[0] != ")":
        if value[0] in WSP:
            token, value = get_fws(value)
        elif value[0] == '(':
            # Nested comment: recurse.
            token, value = get_comment(value)
        else:
            token, value = get_qp_ctext(value)
        comment.append(token)
    if not value:
        comment.defects.append(errors.InvalidHeaderDefect(
            "end of header inside comment"))
        return comment, value
    return comment, value[1:]

def get_cfws(value):
    """CFWS = (1*([FWS] comment) [FWS]) / FWS

    """
    cfws = CFWSList()
    while value and value[0] in CFWS_LEADER:
        if value[0] in WSP:
            token, value = get_fws(value)
        else:
            token, value = get_comment(value)
        cfws.append(token)
    return cfws, value

def get_quoted_string(value):
    """quoted-string = [CFWS] <bare-quoted-string> [CFWS]

    'bare-quoted-string' is an intermediate class defined by this
    parser and not by the RFC grammar.  It is the quoted string
    without any attached CFWS.
    """
    quoted_string = QuotedString()
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        quoted_string.append(token)
    token, value = get_bare_quoted_string(value)
    quoted_string.append(token)
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        quoted_string.append(token)
    return quoted_string, value

def get_atom(value):
    """atom = [CFWS] 1*atext [CFWS]

    An atom could be an rfc2047 encoded word.
    """
    atom = Atom()
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        atom.append(token)
    if value and value[0] in ATOM_ENDS:
        raise errors.HeaderParseError(
            "expected atom but found '{}'".format(value))
    if value.startswith('=?'):
        try:
            token, value = get_encoded_word(value)
        except errors.HeaderParseError:
            # XXX: need to figure out how to register defects when
            # appropriate here.
            token, value = get_atext(value)
    else:
        token, value = get_atext(value)
    atom.append(token)
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        atom.append(token)
    return atom, value

def get_dot_atom_text(value):
    """ dot-text = 1*atext *("." 1*atext)

    """
    dot_atom_text = DotAtomText()
    if not value or value[0] in ATOM_ENDS:
        raise errors.HeaderParseError("expected atom at a start of "
            "dot-atom-text but found '{}'".format(value))
    while value and value[0] not in ATOM_ENDS:
        token, value = get_atext(value)
        dot_atom_text.append(token)
        if value and value[0] == '.':
            dot_atom_text.append(DOT)
            value = value[1:]
    if dot_atom_text[-1] is DOT:
        raise errors.HeaderParseError("expected atom at end of dot-atom-text "
            "but found '{}'".format('.'+value))
    return dot_atom_text, value

def get_dot_atom(value):
    """ dot-atom = [CFWS] dot-atom-text [CFWS]

    Any place we can have a dot atom, we could instead have an rfc2047 encoded
    word.
    """
    dot_atom = DotAtom()
    if value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        dot_atom.append(token)
    if value.startswith('=?'):
        try:
            token, value = get_encoded_word(value)
        except errors.HeaderParseError:
            # XXX: need to figure out how to register defects when
            # appropriate here.
            token, value = get_dot_atom_text(value)
    else:
        token, value = get_dot_atom_text(value)
    dot_atom.append(token)
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        dot_atom.append(token)
    return dot_atom, value

def get_word(value):
    """word = atom / quoted-string

    Either atom or quoted-string may start with CFWS.  We have to peel off this
    CFWS first to determine which type of word to parse.  Afterward we splice
    the leading CFWS, if any, into the parsed sub-token.

    If neither an atom nor a quoted-string is found before the next special, a
    HeaderParseError is raised.

    The token returned is either an Atom or a QuotedString, as appropriate.
    This means the 'word' level of the formal grammar is not represented in the
    parse tree; this is because having that extra layer when manipulating the
    parse tree is more confusing than it is helpful.

    """
    if value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
    else:
        leader = None
    if value[0]=='"':
        token, value = get_quoted_string(value)
    elif value[0] in SPECIALS:
        raise errors.HeaderParseError("Expected 'atom' or 'quoted-string' "
                                      "but found '{}'".format(value))
    else:
        token, value = get_atom(value)
    if leader is not None:
        # Splice the CFWS we peeled off back onto the front of the token.
        token[:0] = [leader]
    return token, value

def get_phrase(value):
    """ phrase = 1*word / obs-phrase
        obs-phrase = word *(word / "." / CFWS)

    This means a phrase can be a sequence of words, periods, and CFWS in any
    order as long as it starts with at least one word.  If anything other than
    words is detected, an ObsoleteHeaderDefect is added to the token's defect
    list.  We also accept a phrase that starts with CFWS followed by a dot;
    this is registered as an InvalidHeaderDefect, since it is not supported by
    even the obsolete grammar.

    """
    phrase = Phrase()
    try:
        token, value = get_word(value)
        phrase.append(token)
    except errors.HeaderParseError:
        phrase.defects.append(errors.InvalidHeaderDefect(
            "phrase does not start with word"))
    while value and value[0] not in PHRASE_ENDS:
        if value[0]=='.':
            phrase.append(DOT)
            phrase.defects.append(errors.ObsoleteHeaderDefect(
                "period in 'phrase'"))
            value = value[1:]
        else:
            try:
                token, value = get_word(value)
            except errors.HeaderParseError:
                if value[0] in CFWS_LEADER:
                    # Bare CFWS is only allowed by the obsolete grammar.
                    token, value = get_cfws(value)
                    phrase.defects.append(errors.ObsoleteHeaderDefect(
                        "comment found without atom"))
                else:
                    raise
            phrase.append(token)
    return phrase, value

def get_local_part(value):
    """ local-part = dot-atom / quoted-string / obs-local-part

    """
    local_part = LocalPart()
    leader = None
    if value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
    if not value:
        raise errors.HeaderParseError(
            "expected local-part but found '{}'".format(value))
    try:
        token, value = get_dot_atom(value)
    except errors.HeaderParseError:
        try:
            token, value = get_word(value)
        except errors.HeaderParseError:
            if value[0] != '\\' and value[0] in PHRASE_ENDS:
                raise
            token = TokenList()
    if leader is not None:
        token[:0] = [leader]
    local_part.append(token)
    if value and (value[0]=='\\' or value[0] not in PHRASE_ENDS):
        # There is more text that looks local-part-ish; reparse everything
        # consumed so far plus the remainder under the obsolete grammar.
        obs_local_part, value = get_obs_local_part(str(local_part) + value)
        if obs_local_part.token_type == 'invalid-obs-local-part':
            local_part.defects.append(errors.InvalidHeaderDefect(
                "local-part is not dot-atom, quoted-string, or obs-local-part"))
        else:
            local_part.defects.append(errors.ObsoleteHeaderDefect(
                "local-part is not a dot-atom (contains CFWS)"))
        local_part[0] = obs_local_part
    try:
local_part.value.encode('ascii') 1828 except UnicodeEncodeError: 1829 local_part.defects.append(errors.NonASCIILocalPartDefect( 1830 "local-part contains non-ASCII characters)")) 1831 return local_part, value 1832 1833 def get_obs_local_part(value): 1834 """ obs-local-part = word *("." word) 1835 """ 1836 obs_local_part = ObsLocalPart() 1837 last_non_ws_was_dot = False 1838 while value and (value[0]=='\\' or value[0] not in PHRASE_ENDS): 1839 if value[0] == '.': 1840 if last_non_ws_was_dot: 1841 obs_local_part.defects.append(errors.InvalidHeaderDefect( 1842 "invalid repeated '.'")) 1843 obs_local_part.append(DOT) 1844 last_non_ws_was_dot = True 1845 value = value[1:] 1846 continue 1847 elif value[0]=='\\': 1848 obs_local_part.append(ValueTerminal(value[0], 1849 'misplaced-special')) 1850 value = value[1:] 1851 obs_local_part.defects.append(errors.InvalidHeaderDefect( 1852 "'\\' character outside of quoted-string/ccontent")) 1853 last_non_ws_was_dot = False 1854 continue 1855 if obs_local_part and obs_local_part[-1].token_type != 'dot': 1856 obs_local_part.defects.append(errors.InvalidHeaderDefect( 1857 "missing '.' between words")) 1858 try: 1859 token, value = get_word(value) 1860 last_non_ws_was_dot = False 1861 except errors.HeaderParseError: 1862 if value[0] not in CFWS_LEADER: 1863 raise 1864 token, value = get_cfws(value) 1865 obs_local_part.append(token) 1866 if (obs_local_part[0].token_type == 'dot' or 1867 obs_local_part[0].token_type=='cfws' and 1868 obs_local_part[1].token_type=='dot'): 1869 obs_local_part.defects.append(errors.InvalidHeaderDefect( 1870 "Invalid leading '.' in local part")) 1871 if (obs_local_part[-1].token_type == 'dot' or 1872 obs_local_part[-1].token_type=='cfws' and 1873 obs_local_part[-2].token_type=='dot'): 1874 obs_local_part.defects.append(errors.InvalidHeaderDefect( 1875 "Invalid trailing '.' 
in local part")) 1876 if obs_local_part.defects: 1877 obs_local_part.token_type = 'invalid-obs-local-part' 1878 return obs_local_part, value 1879 1880 def get_dtext(value): 1881 r""" dtext = <printable ascii except \ [ ]> / obs-dtext 1882 obs-dtext = obs-NO-WS-CTL / quoted-pair 1883 1884 We allow anything except the excluded characters, but if we find any 1885 ASCII other than the RFC defined printable ASCII, a NonPrintableDefect is 1886 added to the token's defects list. Quoted pairs are converted to their 1887 unquoted values, so what is returned is a ptext token, in this case a 1888 ValueTerminal. If there were quoted-printables, an ObsoleteHeaderDefect is 1889 added to the returned token's defect list. 1890 1891 """ 1892 ptext, value, had_qp = _get_ptext_to_endchars(value, '[]') 1893 ptext = ValueTerminal(ptext, 'ptext') 1894 if had_qp: 1895 ptext.defects.append(errors.ObsoleteHeaderDefect( 1896 "quoted printable found in domain-literal")) 1897 _validate_xtext(ptext) 1898 return ptext, value 1899 1900 def _check_for_early_dl_end(value, domain_literal): 1901 if value: 1902 return False 1903 domain_literal.append(errors.InvalidHeaderDefect( 1904 "end of input inside domain-literal")) 1905 domain_literal.append(ValueTerminal(']', 'domain-literal-end')) 1906 return True 1907 1908 def get_domain_literal(value): 1909 """ domain-literal = [CFWS] "[" *([FWS] dtext) [FWS] "]" [CFWS] 1910 1911 """ 1912 domain_literal = DomainLiteral() 1913 if value[0] in CFWS_LEADER: 1914 token, value = get_cfws(value) 1915 domain_literal.append(token) 1916 if not value: 1917 raise errors.HeaderParseError("expected domain-literal") 1918 if value[0] != '[': 1919 raise errors.HeaderParseError("expected '[' at start of domain-literal " 1920 "but found '{}'".format(value)) 1921 value = value[1:] 1922 if _check_for_early_dl_end(value, domain_literal): 1923 return domain_literal, value 1924 domain_literal.append(ValueTerminal('[', 'domain-literal-start')) 1925 if value[0] in WSP: 1926 token, 
value = get_fws(value) 1927 domain_literal.append(token) 1928 token, value = get_dtext(value) 1929 domain_literal.append(token) 1930 if _check_for_early_dl_end(value, domain_literal): 1931 return domain_literal, value 1932 if value[0] in WSP: 1933 token, value = get_fws(value) 1934 domain_literal.append(token) 1935 if _check_for_early_dl_end(value, domain_literal): 1936 return domain_literal, value 1937 if value[0] != ']': 1938 raise errors.HeaderParseError("expected ']' at end of domain-literal " 1939 "but found '{}'".format(value)) 1940 domain_literal.append(ValueTerminal(']', 'domain-literal-end')) 1941 value = value[1:] 1942 if value and value[0] in CFWS_LEADER: 1943 token, value = get_cfws(value) 1944 domain_literal.append(token) 1945 return domain_literal, value 1946 1947 def get_domain(value): 1948 """ domain = dot-atom / domain-literal / obs-domain 1949 obs-domain = atom *("." atom)) 1950 1951 """ 1952 domain = Domain() 1953 leader = None 1954 if value[0] in CFWS_LEADER: 1955 leader, value = get_cfws(value) 1956 if not value: 1957 raise errors.HeaderParseError( 1958 "expected domain but found '{}'".format(value)) 1959 if value[0] == '[': 1960 token, value = get_domain_literal(value) 1961 if leader is not None: 1962 token[:0] = [leader] 1963 domain.append(token) 1964 return domain, value 1965 try: 1966 token, value = get_dot_atom(value) 1967 except errors.HeaderParseError: 1968 token, value = get_atom(value) 1969 if leader is not None: 1970 token[:0] = [leader] 1971 domain.append(token) 1972 if value and value[0] == '.': 1973 domain.defects.append(errors.ObsoleteHeaderDefect( 1974 "domain is not a dot-atom (contains CFWS)")) 1975 if domain[0].token_type == 'dot-atom': 1976 domain[:] = domain[0] 1977 while value and value[0] == '.': 1978 domain.append(DOT) 1979 token, value = get_atom(value[1:]) 1980 domain.append(token) 1981 return domain, value 1982 1983 def get_addr_spec(value): 1984 """ addr-spec = local-part "@" domain 1985 1986 """ 1987 addr_spec = 
AddrSpec() 1988 token, value = get_local_part(value) 1989 addr_spec.append(token) 1990 if not value or value[0] != '@': 1991 addr_spec.defects.append(errors.InvalidHeaderDefect( 1992 "add-spec local part with no domain")) 1993 return addr_spec, value 1994 addr_spec.append(ValueTerminal('@', 'address-at-symbol')) 1995 token, value = get_domain(value[1:]) 1996 addr_spec.append(token) 1997 return addr_spec, value 1998 1999 def get_obs_route(value): 2000 """ obs-route = obs-domain-list ":" 2001 obs-domain-list = *(CFWS / ",") "@" domain *("," [CFWS] ["@" domain]) 2002 2003 Returns an obs-route token with the appropriate sub-tokens (that is, 2004 there is no obs-domain-list in the parse tree). 2005 """ 2006 obs_route = ObsRoute() 2007 while value and (value[0]==',' or value[0] in CFWS_LEADER): 2008 if value[0] in CFWS_LEADER: 2009 token, value = get_cfws(value) 2010 obs_route.append(token) 2011 elif value[0] == ',': 2012 obs_route.append(ListSeparator) 2013 value = value[1:] 2014 if not value or value[0] != '@': 2015 raise errors.HeaderParseError( 2016 "expected obs-route domain but found '{}'".format(value)) 2017 obs_route.append(RouteComponentMarker) 2018 token, value = get_domain(value[1:]) 2019 obs_route.append(token) 2020 while value and value[0]==',': 2021 obs_route.append(ListSeparator) 2022 value = value[1:] 2023 if not value: 2024 break 2025 if value[0] in CFWS_LEADER: 2026 token, value = get_cfws(value) 2027 obs_route.append(token) 2028 if value[0] == '@': 2029 obs_route.append(RouteComponentMarker) 2030 token, value = get_domain(value[1:]) 2031 obs_route.append(token) 2032 if not value: 2033 raise errors.HeaderParseError("end of header while parsing obs-route") 2034 if value[0] != ':': 2035 raise errors.HeaderParseError( "expected ':' marking end of " 2036 "obs-route but found '{}'".format(value)) 2037 obs_route.append(ValueTerminal(':', 'end-of-obs-route-marker')) 2038 return obs_route, value[1:] 2039 2040 def get_angle_addr(value): 2041 """ angle-addr = 
[CFWS] "<" addr-spec ">" [CFWS] / obs-angle-addr 2042 obs-angle-addr = [CFWS] "<" obs-route addr-spec ">" [CFWS] 2043 2044 """ 2045 angle_addr = AngleAddr() 2046 if value[0] in CFWS_LEADER: 2047 token, value = get_cfws(value) 2048 angle_addr.append(token) 2049 if not value or value[0] != '<': 2050 raise errors.HeaderParseError( 2051 "expected angle-addr but found '{}'".format(value)) 2052 angle_addr.append(ValueTerminal('<', 'angle-addr-start')) 2053 value = value[1:] 2054 # Although it is not legal per RFC5322, SMTP uses '<>' in certain 2055 # circumstances. 2056 if value[0] == '>': 2057 angle_addr.append(ValueTerminal('>', 'angle-addr-end')) 2058 angle_addr.defects.append(errors.InvalidHeaderDefect( 2059 "null addr-spec in angle-addr")) 2060 value = value[1:] 2061 return angle_addr, value 2062 try: 2063 token, value = get_addr_spec(value) 2064 except errors.HeaderParseError: 2065 try: 2066 token, value = get_obs_route(value) 2067 angle_addr.defects.append(errors.ObsoleteHeaderDefect( 2068 "obsolete route specification in angle-addr")) 2069 except errors.HeaderParseError: 2070 raise errors.HeaderParseError( 2071 "expected addr-spec or obs-route but found '{}'".format(value)) 2072 angle_addr.append(token) 2073 token, value = get_addr_spec(value) 2074 angle_addr.append(token) 2075 if value and value[0] == '>': 2076 value = value[1:] 2077 else: 2078 angle_addr.defects.append(errors.InvalidHeaderDefect( 2079 "missing trailing '>' on angle-addr")) 2080 angle_addr.append(ValueTerminal('>', 'angle-addr-end')) 2081 if value and value[0] in CFWS_LEADER: 2082 token, value = get_cfws(value) 2083 angle_addr.append(token) 2084 return angle_addr, value 2085 2086 def get_display_name(value): 2087 """ display-name = phrase 2088 2089 Because this is simply a name-rule, we don't return a display-name 2090 token containing a phrase, but rather a display-name token with 2091 the content of the phrase. 

    """
    display_name = DisplayName()
    token, value = get_phrase(value)
    # Copy the phrase's children and defects directly into the display-name
    # token rather than nesting the phrase token.
    display_name.extend(token[:])
    display_name.defects = token.defects[:]
    return display_name, value


def get_name_addr(value):
    """ name-addr = [display-name] angle-addr

    """
    name_addr = NameAddr()
    # Both the optional display name and the angle-addr can start with cfws.
    leader = None
    if value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
        if not value:
            raise errors.HeaderParseError(
                "expected name-addr but found '{}'".format(leader))
    if value[0] != '<':
        if value[0] in PHRASE_ENDS:
            raise errors.HeaderParseError(
                "expected name-addr but found '{}'".format(value))
        token, value = get_display_name(value)
        if not value:
            raise errors.HeaderParseError(
                "expected name-addr but found '{}'".format(token))
        if leader is not None:
            # Fold the leading CFWS into the display name's first sub-token.
            token[0][:0] = [leader]
            leader = None
        name_addr.append(token)
    token, value = get_angle_addr(value)
    if leader is not None:
        # No display name: the CFWS belongs to the angle-addr instead.
        token[:0] = [leader]
    name_addr.append(token)
    return name_addr, value

def get_mailbox(value):
    """ mailbox = name-addr / addr-spec

    """
    # The only way to figure out if we are dealing with a name-addr or an
    # addr-spec is to try parsing each one.
    mailbox = Mailbox()
    try:
        token, value = get_name_addr(value)
    except errors.HeaderParseError:
        try:
            token, value = get_addr_spec(value)
        except errors.HeaderParseError:
            raise errors.HeaderParseError(
                "expected mailbox but found '{}'".format(value))
    # Any InvalidHeaderDefect anywhere in the subtree taints the mailbox.
    if any(isinstance(x, errors.InvalidHeaderDefect)
                   for x in token.all_defects):
        mailbox.token_type = 'invalid-mailbox'
    mailbox.append(token)
    return mailbox, value

def get_invalid_mailbox(value, endchars):
    """ Read everything up to one of the chars in endchars.

    This is outside the formal grammar.  The InvalidMailbox TokenList that is
    returned acts like a Mailbox, but the data attributes are None.

    """
    invalid_mailbox = InvalidMailbox()
    # Consume specials one at a time and phrases in runs until an end char.
    while value and value[0] not in endchars:
        if value[0] in PHRASE_ENDS:
            invalid_mailbox.append(ValueTerminal(value[0],
                                                 'misplaced-special'))
            value = value[1:]
        else:
            token, value = get_phrase(value)
            invalid_mailbox.append(token)
    return invalid_mailbox, value

def get_mailbox_list(value):
    """ mailbox-list = (mailbox *("," mailbox)) / obs-mbox-list
        obs-mbox-list = *([CFWS] ",") mailbox *("," [mailbox / CFWS])

    For this routine we go outside the formal grammar in order to improve error
    handling.  We recognize the end of the mailbox list only at the end of the
    value or at a ';' (the group terminator).  This is so that we can turn
    invalid mailboxes into InvalidMailbox tokens and continue parsing any
    remaining valid mailboxes.  We also allow all mailbox entries to be null,
    and this condition is handled appropriately at a higher level.

    """
    mailbox_list = MailboxList()
    while value and value[0] != ';':
        try:
            token, value = get_mailbox(value)
            mailbox_list.append(token)
        except errors.HeaderParseError:
            # Recovery path: classify what we are looking at instead of
            # giving up on the rest of the list.
            leader = None
            if value[0] in CFWS_LEADER:
                leader, value = get_cfws(value)
                if not value or value[0] in ',;':
                    # CFWS-only entry: an obsolete "null" list element.
                    mailbox_list.append(leader)
                    mailbox_list.defects.append(errors.ObsoleteHeaderDefect(
                        "empty element in mailbox-list"))
                else:
                    token, value = get_invalid_mailbox(value, ',;')
                    if leader is not None:
                        token[:0] = [leader]
                    mailbox_list.append(token)
                    mailbox_list.defects.append(errors.InvalidHeaderDefect(
                        "invalid mailbox in mailbox-list"))
            elif value[0] == ',':
                # Bare ',' with no content at all: obsolete null element.
                mailbox_list.defects.append(errors.ObsoleteHeaderDefect(
                    "empty element in mailbox-list"))
            else:
                token, value = get_invalid_mailbox(value, ',;')
                if leader is not None:
                    token[:0] = [leader]
                mailbox_list.append(token)
                mailbox_list.defects.append(errors.InvalidHeaderDefect(
                    "invalid mailbox in mailbox-list"))
        if value and value[0] not in ',;':
            # Crap after mailbox; treat it as an invalid mailbox.
            # The mailbox info will still be available.
            mailbox = mailbox_list[-1]
            mailbox.token_type = 'invalid-mailbox'
            token, value = get_invalid_mailbox(value, ',;')
            mailbox.extend(token)
            mailbox_list.defects.append(errors.InvalidHeaderDefect(
                "invalid mailbox in mailbox-list"))
        if value and value[0] == ',':
            mailbox_list.append(ListSeparator)
            value = value[1:]
    return mailbox_list, value


def get_group_list(value):
    """ group-list = mailbox-list / CFWS / obs-group-list
        obs-group-list = 1*([CFWS] ",") [CFWS]

    """
    group_list = GroupList()
    if not value:
        group_list.defects.append(errors.InvalidHeaderDefect(
            "end of header before group-list"))
        return group_list, value
    leader = None
    if value and value[0] in CFWS_LEADER:
        leader, value = get_cfws(value)
        if not value:
            # This should never happen in email parsing, since CFWS-only is a
            # legal alternative to group-list in a group, which is the only
            # place group-list appears.
2244 group_list.defects.append(errors.InvalidHeaderDefect( 2245 "end of header in group-list")) 2246 group_list.append(leader) 2247 return group_list, value 2248 if value[0] == ';': 2249 group_list.append(leader) 2250 return group_list, value 2251 token, value = get_mailbox_list(value) 2252 if len(token.all_mailboxes)==0: 2253 if leader is not None: 2254 group_list.append(leader) 2255 group_list.extend(token) 2256 group_list.defects.append(errors.ObsoleteHeaderDefect( 2257 "group-list with empty entries")) 2258 return group_list, value 2259 if leader is not None: 2260 token[:0] = [leader] 2261 group_list.append(token) 2262 return group_list, value 2263 2264 def get_group(value): 2265 """ group = display-name ":" [group-list] ";" [CFWS] 2266 2267 """ 2268 group = Group() 2269 token, value = get_display_name(value) 2270 if not value or value[0] != ':': 2271 raise errors.HeaderParseError("expected ':' at end of group " 2272 "display name but found '{}'".format(value)) 2273 group.append(token) 2274 group.append(ValueTerminal(':', 'group-display-name-terminator')) 2275 value = value[1:] 2276 if value and value[0] == ';': 2277 group.append(ValueTerminal(';', 'group-terminator')) 2278 return group, value[1:] 2279 token, value = get_group_list(value) 2280 group.append(token) 2281 if not value: 2282 group.defects.append(errors.InvalidHeaderDefect( 2283 "end of header in group")) 2284 if value[0] != ';': 2285 raise errors.HeaderParseError( 2286 "expected ';' at end of group but found {}".format(value)) 2287 group.append(ValueTerminal(';', 'group-terminator')) 2288 value = value[1:] 2289 if value and value[0] in CFWS_LEADER: 2290 token, value = get_cfws(value) 2291 group.append(token) 2292 return group, value 2293 2294 def get_address(value): 2295 """ address = mailbox / group 2296 2297 Note that counter-intuitively, an address can be either a single address or 2298 a list of addresses (a group). 
This is why the returned Address object has
    a 'mailboxes' attribute which treats a single address as a list of length
    one.  When you need to differentiate between the two cases, extract the single
    element, which is either a mailbox or a group token.

    """
    # The formal grammar isn't very helpful when parsing an address.  mailbox
    # and group, especially when allowing for obsolete forms, start off very
    # similarly.  It is only when you reach one of @, <, or : that you know
    # what you've got.  So, we try each one in turn, starting with the more
    # likely of the two.  We could perhaps make this more efficient by looking
    # for a phrase and then branching based on the next character, but that
    # would be a premature optimization.
    address = Address()
    try:
        token, value = get_group(value)
    except errors.HeaderParseError:
        try:
            token, value = get_mailbox(value)
        except errors.HeaderParseError:
            raise errors.HeaderParseError(
                "expected address but found '{}'".format(value))
    address.append(token)
    return address, value

def get_address_list(value):
    """ address_list = (address *("," address)) / obs-addr-list
        obs-addr-list = *([CFWS] ",") address *("," [address / CFWS])

    We depart from the formal grammar here by continuing to parse until the end
    of the input, assuming the input to be entirely composed of an
    address-list.  This is always true in email parsing, and allows us
    to skip invalid addresses to parse additional valid ones.
2331 2332 """ 2333 address_list = AddressList() 2334 while value: 2335 try: 2336 token, value = get_address(value) 2337 address_list.append(token) 2338 except errors.HeaderParseError as err: 2339 leader = None 2340 if value[0] in CFWS_LEADER: 2341 leader, value = get_cfws(value) 2342 if not value or value[0] == ',': 2343 address_list.append(leader) 2344 address_list.defects.append(errors.ObsoleteHeaderDefect( 2345 "address-list entry with no content")) 2346 else: 2347 token, value = get_invalid_mailbox(value, ',') 2348 if leader is not None: 2349 token[:0] = [leader] 2350 address_list.append(Address([token])) 2351 address_list.defects.append(errors.InvalidHeaderDefect( 2352 "invalid address in address-list")) 2353 elif value[0] == ',': 2354 address_list.defects.append(errors.ObsoleteHeaderDefect( 2355 "empty element in address-list")) 2356 else: 2357 token, value = get_invalid_mailbox(value, ',') 2358 if leader is not None: 2359 token[:0] = [leader] 2360 address_list.append(Address([token])) 2361 address_list.defects.append(errors.InvalidHeaderDefect( 2362 "invalid address in address-list")) 2363 if value and value[0] != ',': 2364 # Crap after address; treat it as an invalid mailbox. 2365 # The mailbox info will still be available. 2366 mailbox = address_list[-1][0] 2367 mailbox.token_type = 'invalid-mailbox' 2368 token, value = get_invalid_mailbox(value, ',') 2369 mailbox.extend(token) 2370 address_list.defects.append(errors.InvalidHeaderDefect( 2371 "invalid address in address-list")) 2372 if value: # Must be a , at this point. 2373 address_list.append(ValueTerminal(',', 'list-separator')) 2374 value = value[1:] 2375 return address_list, value 2376 2377 # 2378 # XXX: As I begin to add additional header parsers, I'm realizing we probably 2379 # have two level of parser routines: the get_XXX methods that get a token in 2380 # the grammar, and parse_XXX methods that parse an entire field value. 
So 2381 # get_address_list above should really be a parse_ method, as probably should 2382 # be get_unstructured. 2383 # 2384 2385 def parse_mime_version(value): 2386 """ mime-version = [CFWS] 1*digit [CFWS] "." [CFWS] 1*digit [CFWS] 2387 2388 """ 2389 # The [CFWS] is implicit in the RFC 2045 BNF. 2390 # XXX: This routine is a bit verbose, should factor out a get_int method. 2391 mime_version = MIMEVersion() 2392 if not value: 2393 mime_version.defects.append(errors.HeaderMissingRequiredValue( 2394 "Missing MIME version number (eg: 1.0)")) 2395 return mime_version 2396 if value[0] in CFWS_LEADER: 2397 token, value = get_cfws(value) 2398 mime_version.append(token) 2399 if not value: 2400 mime_version.defects.append(errors.HeaderMissingRequiredValue( 2401 "Expected MIME version number but found only CFWS")) 2402 digits = '' 2403 while value and value[0] != '.' and value[0] not in CFWS_LEADER: 2404 digits += value[0] 2405 value = value[1:] 2406 if not digits.isdigit(): 2407 mime_version.defects.append(errors.InvalidHeaderDefect( 2408 "Expected MIME major version number but found {!r}".format(digits))) 2409 mime_version.append(ValueTerminal(digits, 'xtext')) 2410 else: 2411 mime_version.major = int(digits) 2412 mime_version.append(ValueTerminal(digits, 'digits')) 2413 if value and value[0] in CFWS_LEADER: 2414 token, value = get_cfws(value) 2415 mime_version.append(token) 2416 if not value or value[0] != '.': 2417 if mime_version.major is not None: 2418 mime_version.defects.append(errors.InvalidHeaderDefect( 2419 "Incomplete MIME version; found only major number")) 2420 if value: 2421 mime_version.append(ValueTerminal(value, 'xtext')) 2422 return mime_version 2423 mime_version.append(ValueTerminal('.', 'version-separator')) 2424 value = value[1:] 2425 if value and value[0] in CFWS_LEADER: 2426 token, value = get_cfws(value) 2427 mime_version.append(token) 2428 if not value: 2429 if mime_version.major is not None: 2430 
            mime_version.defects.append(errors.InvalidHeaderDefect(
                "Incomplete MIME version; found only major number"))
        return mime_version
    # Accumulate the minor version digits.
    digits = ''
    while value and value[0] not in CFWS_LEADER:
        digits += value[0]
        value = value[1:]
    if not digits.isdigit():
        mime_version.defects.append(errors.InvalidHeaderDefect(
            "Expected MIME minor version number but found {!r}".format(digits)))
        mime_version.append(ValueTerminal(digits, 'xtext'))
    else:
        mime_version.minor = int(digits)
        mime_version.append(ValueTerminal(digits, 'digits'))
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        mime_version.append(token)
    if value:
        mime_version.defects.append(errors.InvalidHeaderDefect(
            "Excess non-CFWS text after MIME version"))
        mime_version.append(ValueTerminal(value, 'xtext'))
    return mime_version

def get_invalid_parameter(value):
    """ Read everything up to the next ';'.

    This is outside the formal grammar.  The InvalidParameter TokenList that is
    returned acts like a Parameter, but the data attributes are None.

    """
    invalid_parameter = InvalidParameter()
    # Same recovery shape as get_invalid_mailbox: specials one at a time,
    # phrases in runs.
    while value and value[0] != ';':
        if value[0] in PHRASE_ENDS:
            invalid_parameter.append(ValueTerminal(value[0],
                                                   'misplaced-special'))
            value = value[1:]
        else:
            token, value = get_phrase(value)
            invalid_parameter.append(token)
    return invalid_parameter, value

def get_ttext(value):
    """ttext = <matches _ttext_matcher>

    We allow any non-TOKEN_ENDS in ttext, but add defects to the token's
    defects list if we find non-ttext characters.  We also register defects for
    *any* non-printables even though the RFC doesn't exclude all of them,
    because we follow the spirit of RFC 5322.

    """
    m = _non_token_end_matcher(value)
    if not m:
        raise errors.HeaderParseError(
            "expected ttext but found '{}'".format(value))
    ttext = m.group()
    value = value[len(ttext):]
    ttext = ValueTerminal(ttext, 'ttext')
    # Records NonPrintableDefect/UndecodableBytesDefect on the terminal.
    _validate_xtext(ttext)
    return ttext, value

def get_token(value):
    """token = [CFWS] 1*ttext [CFWS]

    The RFC equivalent of ttext is any US-ASCII chars except space, ctls, or
    tspecials.  We also exclude tabs even though the RFC doesn't.

    The RFC implies the CFWS but is not explicit about it in the BNF.

    """
    mtoken = Token()
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        mtoken.append(token)
    if value and value[0] in TOKEN_ENDS:
        raise errors.HeaderParseError(
            "expected token but found '{}'".format(value))
    token, value = get_ttext(value)
    mtoken.append(token)
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        mtoken.append(token)
    return mtoken, value

def get_attrtext(value):
    """attrtext = 1*(any non-ATTRIBUTE_ENDS character)

    We allow any non-ATTRIBUTE_ENDS in attrtext, but add defects to the
    token's defects list if we find non-attrtext characters.  We also register
    defects for *any* non-printables even though the RFC doesn't exclude all of
    them, because we follow the spirit of RFC 5322.

    """
    m = _non_attribute_end_matcher(value)
    if not m:
        raise errors.HeaderParseError(
            "expected attrtext but found {!r}".format(value))
    attrtext = m.group()
    value = value[len(attrtext):]
    attrtext = ValueTerminal(attrtext, 'attrtext')
    _validate_xtext(attrtext)
    return attrtext, value

def get_attribute(value):
    """ [CFWS] 1*attrtext [CFWS]

    This version of the BNF makes the CFWS explicit, and as usual we use a
    value terminal for the actual run of characters.
 The RFC equivalent of
    attrtext is the token characters, with the subtraction of '*', "'", and '%'.
    We include tab in the excluded set just as we do for token.

    """
    attribute = Attribute()
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        attribute.append(token)
    if value and value[0] in ATTRIBUTE_ENDS:
        raise errors.HeaderParseError(
            "expected token but found '{}'".format(value))
    token, value = get_attrtext(value)
    attribute.append(token)
    if value and value[0] in CFWS_LEADER:
        token, value = get_cfws(value)
        attribute.append(token)
    return attribute, value

def get_extended_attrtext(value):
    """attrtext = 1*(any non-ATTRIBUTE_ENDS character plus '%')

    This is a special parsing routine so that we get a value that
    includes % escapes as a single string (which we decode as a single
    string later).

    """
    m = _non_extended_attribute_end_matcher(value)
    if not m:
        raise errors.HeaderParseError(
            "expected extended attrtext but found {!r}".format(value))
    attrtext = m.group()
    value = value[len(attrtext):]
    attrtext = ValueTerminal(attrtext, 'extended-attrtext')
    _validate_xtext(attrtext)
    return attrtext, value

def get_extended_attribute(value):
    """ [CFWS] 1*extended_attrtext [CFWS]

    This is like the non-extended version except we allow % characters, so that
    we can pick up an encoded value as a single string.

    """
    # XXX: should we have an ExtendedAttribute TokenList?
2581 attribute = Attribute() 2582 if value and value[0] in CFWS_LEADER: 2583 token, value = get_cfws(value) 2584 attribute.append(token) 2585 if value and value[0] in EXTENDED_ATTRIBUTE_ENDS: 2586 raise errors.HeaderParseError( 2587 "expected token but found '{}'".format(value)) 2588 token, value = get_extended_attrtext(value) 2589 attribute.append(token) 2590 if value and value[0] in CFWS_LEADER: 2591 token, value = get_cfws(value) 2592 attribute.append(token) 2593 return attribute, value 2594 2595 def get_section(value): 2596 """ '*' digits 2597 2598 The formal BNF is more complicated because leading 0s are not allowed. We 2599 check for that and add a defect. We also assume no CFWS is allowed between 2600 the '*' and the digits, though the RFC is not crystal clear on that. 2601 The caller should already have dealt with leading CFWS. 2602 2603 """ 2604 section = Section() 2605 if not value or value[0] != '*': 2606 raise errors.HeaderParseError("Expected section but found {}".format( 2607 value)) 2608 section.append(ValueTerminal('*', 'section-marker')) 2609 value = value[1:] 2610 if not value or not value[0].isdigit(): 2611 raise errors.HeaderParseError("Expected section number but " 2612 "found {}".format(value)) 2613 digits = '' 2614 while value and value[0].isdigit(): 2615 digits += value[0] 2616 value = value[1:] 2617 if digits[0] == '0' and digits != '0': 2618 section.defects.append(errors.InvalidHeaderError("section number" 2619 "has an invalid leading 0")) 2620 section.number = int(digits) 2621 section.append(ValueTerminal(digits, 'digits')) 2622 return section, value 2623 2624 2625 def get_value(value): 2626 """ quoted-string / attribute 2627 2628 """ 2629 v = Value() 2630 if not value: 2631 raise errors.HeaderParseError("Expected value but found end of string") 2632 leader = None 2633 if value[0] in CFWS_LEADER: 2634 leader, value = get_cfws(value) 2635 if not value: 2636 raise errors.HeaderParseError("Expected value but found " 2637 "only 
{}".format(leader)) 2638 if value[0] == '"': 2639 token, value = get_quoted_string(value) 2640 else: 2641 token, value = get_extended_attribute(value) 2642 if leader is not None: 2643 token[:0] = [leader] 2644 v.append(token) 2645 return v, value 2646 2647 def get_parameter(value): 2648 """ attribute [section] ["*"] [CFWS] "=" value 2649 2650 The CFWS is implied by the RFC but not made explicit in the BNF. This 2651 simplified form of the BNF from the RFC is made to conform with the RFC BNF 2652 through some extra checks. We do it this way because it makes both error 2653 recovery and working with the resulting parse tree easier. 2654 """ 2655 # It is possible CFWS would also be implicitly allowed between the section 2656 # and the 'extended-attribute' marker (the '*') , but we've never seen that 2657 # in the wild and we will therefore ignore the possibility. 2658 param = Parameter() 2659 token, value = get_attribute(value) 2660 param.append(token) 2661 if not value or value[0] == ';': 2662 param.defects.append(errors.InvalidHeaderDefect("Parameter contains " 2663 "name ({}) but no value".format(token))) 2664 return param, value 2665 if value[0] == '*': 2666 try: 2667 token, value = get_section(value) 2668 param.sectioned = True 2669 param.append(token) 2670 except errors.HeaderParseError: 2671 pass 2672 if not value: 2673 raise errors.HeaderParseError("Incomplete parameter") 2674 if value[0] == '*': 2675 param.append(ValueTerminal('*', 'extended-parameter-marker')) 2676 value = value[1:] 2677 param.extended = True 2678 if value[0] != '=': 2679 raise errors.HeaderParseError("Parameter not followed by '='") 2680 param.append(ValueTerminal('=', 'parameter-separator')) 2681 value = value[1:] 2682 leader = None 2683 if value and value[0] in CFWS_LEADER: 2684 token, value = get_cfws(value) 2685 param.append(token) 2686 remainder = None 2687 appendto = param 2688 if param.extended and value and value[0] == '"': 2689 # Now for some serious hackery to handle the common 
invalid case of 2690 # double quotes around an extended value. We also accept (with defect) 2691 # a value marked as encoded that isn't really. 2692 qstring, remainder = get_quoted_string(value) 2693 inner_value = qstring.stripped_value 2694 semi_valid = False 2695 if param.section_number == 0: 2696 if inner_value and inner_value[0] == "'": 2697 semi_valid = True 2698 else: 2699 token, rest = get_attrtext(inner_value) 2700 if rest and rest[0] == "'": 2701 semi_valid = True 2702 else: 2703 try: 2704 token, rest = get_extended_attrtext(inner_value) 2705 except: 2706 pass 2707 else: 2708 if not rest: 2709 semi_valid = True 2710 if semi_valid: 2711 param.defects.append(errors.InvalidHeaderDefect( 2712 "Quoted string value for extended parameter is invalid")) 2713 param.append(qstring) 2714 for t in qstring: 2715 if t.token_type == 'bare-quoted-string': 2716 t[:] = [] 2717 appendto = t 2718 break 2719 value = inner_value 2720 else: 2721 remainder = None 2722 param.defects.append(errors.InvalidHeaderDefect( 2723 "Parameter marked as extended but appears to have a " 2724 "quoted string value that is non-encoded")) 2725 if value and value[0] == "'": 2726 token = None 2727 else: 2728 token, value = get_value(value) 2729 if not param.extended or param.section_number > 0: 2730 if not value or value[0] != "'": 2731 appendto.append(token) 2732 if remainder is not None: 2733 assert not value, value 2734 value = remainder 2735 return param, value 2736 param.defects.append(errors.InvalidHeaderDefect( 2737 "Apparent initial-extended-value but attribute " 2738 "was not marked as extended or was not initial section")) 2739 if not value: 2740 # Assume the charset/lang is missing and the token is the value. 
2741 param.defects.append(errors.InvalidHeaderDefect( 2742 "Missing required charset/lang delimiters")) 2743 appendto.append(token) 2744 if remainder is None: 2745 return param, value 2746 else: 2747 if token is not None: 2748 for t in token: 2749 if t.token_type == 'extended-attrtext': 2750 break 2751 t.token_type == 'attrtext' 2752 appendto.append(t) 2753 param.charset = t.value 2754 if value[0] != "'": 2755 raise errors.HeaderParseError("Expected RFC2231 char/lang encoding " 2756 "delimiter, but found {!r}".format(value)) 2757 appendto.append(ValueTerminal("'", 'RFC2231 delimiter')) 2758 value = value[1:] 2759 if value and value[0] != "'": 2760 token, value = get_attrtext(value) 2761 appendto.append(token) 2762 param.lang = token.value 2763 if not value or value[0] != "'": 2764 raise errors.HeaderParseError("Expected RFC2231 char/lang encoding " 2765 "delimiter, but found {}".format(value)) 2766 appendto.append(ValueTerminal("'", 'RFC2231 delimiter')) 2767 value = value[1:] 2768 if remainder is not None: 2769 # Treat the rest of value as bare quoted string content. 2770 v = Value() 2771 while value: 2772 if value[0] in WSP: 2773 token, value = get_fws(value) 2774 else: 2775 token, value = get_qcontent(value) 2776 v.append(token) 2777 token = v 2778 else: 2779 token, value = get_value(value) 2780 appendto.append(token) 2781 if remainder is not None: 2782 assert not value, value 2783 value = remainder 2784 return param, value 2785 2786 def parse_mime_parameters(value): 2787 """ parameter *( ";" parameter ) 2788 2789 That BNF is meant to indicate this routine should only be called after 2790 finding and handling the leading ';'. There is no corresponding rule in 2791 the formal RFC grammar, but it is more convenient for us for the set of 2792 parameters to be treated as its own TokenList. 2793 2794 This is 'parse' routine because it consumes the reminaing value, but it 2795 would never be called to parse a full header. 
Instead it is called to 2796 parse everything after the non-parameter value of a specific MIME header. 2797 2798 """ 2799 mime_parameters = MimeParameters() 2800 while value: 2801 try: 2802 token, value = get_parameter(value) 2803 mime_parameters.append(token) 2804 except errors.HeaderParseError as err: 2805 leader = None 2806 if value[0] in CFWS_LEADER: 2807 leader, value = get_cfws(value) 2808 if not value: 2809 mime_parameters.append(leader) 2810 return mime_parameters 2811 if value[0] == ';': 2812 if leader is not None: 2813 mime_parameters.append(leader) 2814 mime_parameters.defects.append(errors.InvalidHeaderDefect( 2815 "parameter entry with no content")) 2816 else: 2817 token, value = get_invalid_parameter(value) 2818 if leader: 2819 token[:0] = [leader] 2820 mime_parameters.append(token) 2821 mime_parameters.defects.append(errors.InvalidHeaderDefect( 2822 "invalid parameter {!r}".format(token))) 2823 if value and value[0] != ';': 2824 # Junk after the otherwise valid parameter. Mark it as 2825 # invalid, but it will have a value. 2826 param = mime_parameters[-1] 2827 param.token_type = 'invalid-parameter' 2828 token, value = get_invalid_parameter(value) 2829 param.extend(token) 2830 mime_parameters.defects.append(errors.InvalidHeaderDefect( 2831 "parameter with invalid trailing text {!r}".format(token))) 2832 if value: 2833 # Must be a ';' at this point. 
2834 mime_parameters.append(ValueTerminal(';', 'parameter-separator')) 2835 value = value[1:] 2836 return mime_parameters 2837 2838 def _find_mime_parameters(tokenlist, value): 2839 """Do our best to find the parameters in an invalid MIME header 2840 2841 """ 2842 while value and value[0] != ';': 2843 if value[0] in PHRASE_ENDS: 2844 tokenlist.append(ValueTerminal(value[0], 'misplaced-special')) 2845 value = value[1:] 2846 else: 2847 token, value = get_phrase(value) 2848 tokenlist.append(token) 2849 if not value: 2850 return 2851 tokenlist.append(ValueTerminal(';', 'parameter-separator')) 2852 tokenlist.append(parse_mime_parameters(value[1:])) 2853 2854 def parse_content_type_header(value): 2855 """ maintype "/" subtype *( ";" parameter ) 2856 2857 The maintype and substype are tokens. Theoretically they could 2858 be checked against the official IANA list + x-token, but we 2859 don't do that. 2860 """ 2861 ctype = ContentType() 2862 recover = False 2863 if not value: 2864 ctype.defects.append(errors.HeaderMissingRequiredValue( 2865 "Missing content type specification")) 2866 return ctype 2867 try: 2868 token, value = get_token(value) 2869 except errors.HeaderParseError: 2870 ctype.defects.append(errors.InvalidHeaderDefect( 2871 "Expected content maintype but found {!r}".format(value))) 2872 _find_mime_parameters(ctype, value) 2873 return ctype 2874 ctype.append(token) 2875 # XXX: If we really want to follow the formal grammar we should make 2876 # mantype and subtype specialized TokenLists here. Probably not worth it. 
    if not value or value[0] != '/':
        ctype.defects.append(errors.InvalidHeaderDefect(
            "Invalid content type"))
        if value:
            _find_mime_parameters(ctype, value)
        return ctype
    ctype.maintype = token.value.strip().lower()
    ctype.append(ValueTerminal('/', 'content-type-separator'))
    value = value[1:]
    try:
        token, value = get_token(value)
    except errors.HeaderParseError:
        ctype.defects.append(errors.InvalidHeaderDefect(
            "Expected content subtype but found {!r}".format(value)))
        _find_mime_parameters(ctype, value)
        return ctype
    ctype.append(token)
    ctype.subtype = token.value.strip().lower()
    if not value:
        return ctype
    if value[0] != ';':
        ctype.defects.append(errors.InvalidHeaderDefect(
            "Only parameters are valid after content type, but "
            "found {!r}".format(value)))
        # The RFC requires that a syntactically invalid content-type be treated
        # as text/plain.  Perhaps we should postel this, but we should probably
        # only do that if we were checking the subtype value against IANA.
        # Resetting maintype/subtype makes the header fall back to its
        # class-level defaults (the RFC-mandated text/plain treatment).
        del ctype.maintype, ctype.subtype
        _find_mime_parameters(ctype, value)
        return ctype
    ctype.append(ValueTerminal(';', 'parameter-separator'))
    ctype.append(parse_mime_parameters(value[1:]))
    return ctype

def parse_content_disposition_header(value):
    """ disposition-type *( ";" parameter )

    """
    disp_header = ContentDisposition()
    if not value:
        disp_header.defects.append(errors.HeaderMissingRequiredValue(
            "Missing content disposition"))
        return disp_header
    try:
        token, value = get_token(value)
    except errors.HeaderParseError:
        disp_header.defects.append(errors.InvalidHeaderDefect(
            "Expected content disposition but found {!r}".format(value)))
        _find_mime_parameters(disp_header, value)
        return disp_header
    disp_header.append(token)
    disp_header.content_disposition = token.value.strip().lower()
    if not value:
        return disp_header
    if value[0] != ';':
        disp_header.defects.append(errors.InvalidHeaderDefect(
            "Only parameters are valid after content disposition, but "
            "found {!r}".format(value)))
        _find_mime_parameters(disp_header, value)
        return disp_header
    disp_header.append(ValueTerminal(';', 'parameter-separator'))
    disp_header.append(parse_mime_parameters(value[1:]))
    return disp_header

def parse_content_transfer_encoding_header(value):
    """ mechanism

    """
    # We should probably validate the values, since the list is fixed.
    cte_header = ContentTransferEncoding()
    if not value:
        cte_header.defects.append(errors.HeaderMissingRequiredValue(
            "Missing content transfer encoding"))
        return cte_header
    try:
        token, value = get_token(value)
    except errors.HeaderParseError:
        cte_header.defects.append(errors.InvalidHeaderDefect(
            "Expected content transfer encoding but found {!r}".format(value)))
    else:
        cte_header.append(token)
        cte_header.cte = token.value.strip().lower()
    if not value:
        return cte_header
    # Anything left over is invalid; keep it in the parse tree (one defect
    # per consumed chunk) so the original text is still recoverable.
    while value:
        cte_header.defects.append(errors.InvalidHeaderDefect(
            "Extra text after content transfer encoding"))
        if value[0] in PHRASE_ENDS:
            cte_header.append(ValueTerminal(value[0], 'misplaced-special'))
            value = value[1:]
        else:
            token, value = get_phrase(value)
            cte_header.append(token)
    return cte_header