"""RFC 2822 message manipulation.

Note: This is only a very rough sketch of a full RFC-822 parser; in particular
the tokenizing of addresses does not adhere to all the quoting rules.

Note: RFC 2822 is a long awaited update to RFC 822.  This module should
conform to RFC 2822, and is thus mis-named (it's not worth renaming it).  Some
effort at RFC 2822 updates has been made, but a thorough audit has not been
performed.  Consider any RFC 2822 non-conformance to be a bug.

    RFC 2822: http://www.faqs.org/rfcs/rfc2822.html
    RFC 822 : http://www.faqs.org/rfcs/rfc822.html (obsolete)

Directions for use:

To create a Message object: first open a file, e.g.:

  fp = open(file, 'r')

You can use any other legal way of getting an open file object, e.g. use
sys.stdin or call os.popen().  Then pass the open file object to the Message()
constructor:

  m = Message(fp)

This class can work with any input object that supports a readline method.  If
the input object has seek and tell capability, the rewindbody method will
work; also illegal lines will be pushed back onto the input stream.  If the
input object lacks seek but has an `unread' method that can push back a line
of input, Message will use that to push back illegal lines.  Thus this class
can be used to parse messages coming from a buffered stream.

The optional `seekable' argument is provided as a workaround for certain stdio
libraries in which tell() discards buffered data before discovering that the
lseek() system call doesn't work.  For maximum portability, you should set the
seekable argument to zero to prevent that initial tell() when passing in
an unseekable object such as a file object created from a socket object.  If
it is 1 on entry -- which it is by default -- the tell() method of the open
file object is called once; if this raises an exception, seekable is reset to
0.  For other nonzero values of seekable, this test is not made.

To get the text of a particular header there are several methods:

  str = m.getheader(name)
  str = m.getrawheader(name)

where name is the name of the header, e.g. 'Subject'.  The difference is that
getheader() strips the leading and trailing whitespace, while getrawheader()
doesn't.  Both functions retain embedded whitespace (including newlines)
exactly as they are specified in the header, and leave the case of the text
unchanged.

For addresses and address lists there are functions

  realname, mailaddress = m.getaddr(name)
  list = m.getaddrlist(name)

where the latter returns a list of (realname, mailaddr) tuples.

There is also a method

  time = m.getdate(name)

which parses a Date-like field and returns a time-compatible tuple,
i.e. a tuple such as returned by time.localtime() or accepted by
time.mktime().

See the class definition for lower level access methods.

There are also some utility functions here.
"""
# Cleanup and extensions by Eric S. Raymond <esr@thyrsus.com>

import calendar
import time

try:
    from warnings import warnpy3k
except ImportError:
    # warnpy3k is a Python 2 helper; where it is missing there is nothing
    # to warn about, so the deprecation notice is simply skipped.
    warnpy3k = None
if warnpy3k is not None:
    warnpy3k("in 3.x, rfc822 has been removed in favor of the email package",
             stacklevel=2)

__all__ = ["Message","AddressList","parsedate","parsedate_tz","mktime_tz"]

_blanklines = ('\r\n', '\n')            # Optimization for islast()


class Message:
    """Represents a single RFC 2822-compliant message.

    The headers are read from `fp' at construction time.  Afterwards they
    are available both as the raw list of lines (self.headers) and as a
    dictionary keyed by lower-cased header name (self.dict), which keeps
    only the *last* value seen for each name.
    """

    def __init__(self, fp, seekable = 1):
        """Initialize the class instance and read the headers.

        `fp' is any object with a readline() method.  `seekable' tells
        whether fp supports tell()/seek(); the value 1 (the default) means
        "probe it", see the module docstring.
        """
        if seekable == 1:
            # Exercise tell() to make sure it works
            # (and then assume seek() works, too)
            try:
                fp.tell()
            except (AttributeError, IOError):
                seekable = 0
        self.fp = fp
        self.seekable = seekable
        self.startofheaders = None
        self.startofbody = None
        #
        if self.seekable:
            try:
                self.startofheaders = self.fp.tell()
            except IOError:
                self.seekable = 0
        #
        self.readheaders()
        #
        if self.seekable:
            try:
                self.startofbody = self.fp.tell()
            except IOError:
                self.seekable = 0

    def rewindbody(self):
        """Rewind the file to the start of the body (if seekable).

        Raises IOError if the underlying file is not seekable.
        """
        if not self.seekable:
            # Call form of raise: valid on both Python 2 and Python 3.
            raise IOError("unseekable file")
        self.fp.seek(self.startofbody)

    def readheaders(self):
        """Read header lines.

        Read header lines up to the entirely blank line that terminates them.
        The (normally blank) line that ends the headers is skipped, but not
        included in the returned list.  If a non-header line ends the headers,
        (which is an error), an attempt is made to backspace over it; it is
        never included in the returned list.

        The variable self.status is set to the empty string if all went well,
        otherwise it is an error message.  The variable self.headers is a
        completely uninterpreted list of lines contained in the header (so
        printing them will reproduce the header exactly as it appears in the
        file).
        """
        self.dict = {}
        self.unixfrom = ''
        self.headers = lst = []
        self.status = ''
        headerseen = ""
        firstline = 1
        startofline = unread = tell = None
        # Pick the push-back strategy: an unread() method wins over seeking.
        if hasattr(self.fp, 'unread'):
            unread = self.fp.unread
        elif self.seekable:
            tell = self.fp.tell
        while 1:
            if tell:
                try:
                    startofline = tell()
                except IOError:
                    startofline = tell = None
                    self.seekable = 0
            line = self.fp.readline()
            if not line:
                self.status = 'EOF in headers'
                break
            # Skip unix From name time lines
            if firstline and line.startswith('From '):
                self.unixfrom = self.unixfrom + line
                continue
            firstline = 0
            if headerseen and line[0] in ' \t':
                # It's a continuation line.
                lst.append(line)
                x = (self.dict[headerseen] + "\n " + line.strip())
                self.dict[headerseen] = x.strip()
                continue
            elif self.iscomment(line):
                # It's a comment.  Ignore it.
                continue
            elif self.islast(line):
                # Note! No pushback here!  The delimiter line gets eaten.
                break
            headerseen = self.isheader(line)
            if headerseen:
                # It's a legal header line, save it.
                lst.append(line)
                self.dict[headerseen] = line[len(headerseen)+1:].strip()
                continue
            else:
                # It's not a header line; throw it back and stop here.
                if not self.dict:
                    self.status = 'No headers'
                else:
                    self.status = 'Non-header line where header expected'
                # Try to undo the read.
                if unread:
                    unread(line)
                elif tell:
                    self.fp.seek(startofline)
                else:
                    self.status = self.status + '; bad seek'
                break

    def isheader(self, line):
        """Determine whether a given line is a legal header.

        This method should return the header name, suitably canonicalized.
        You may override this method in order to use Message parsing on tagged
        data in RFC 2822-like formats with special header formats.
        """
        i = line.find(':')
        if i > 0:
            return line[:i].lower()
        return None

    def islast(self, line):
        """Determine whether a line is a legal end of RFC 2822 headers.

        You may override this method if your application wants to bend the
        rules, e.g. to strip trailing whitespace, or to recognize MH template
        separators ('--------').  For convenience (e.g. for code reading from
        sockets) a line consisting of \\r\\n also matches.
        """
        return line in _blanklines

    def iscomment(self, line):
        """Determine whether a line should be skipped entirely.

        You may override this method in order to use Message parsing on tagged
        data in RFC 2822-like formats that support embedded comments or
        free-text data.
        """
        return False

    def getallmatchingheaders(self, name):
        """Find all header lines matching a given header name.

        Look through the list of headers and find all lines matching a given
        header name (and their continuation lines).  A list of the lines is
        returned, without interpretation.  If the header does not occur, an
        empty list is returned.  If the header occurs multiple times, all
        occurrences are returned.  Case is not important in the header name.
        """
        name = name.lower() + ':'
        n = len(name)
        lst = []
        hit = 0
        for line in self.headers:
            if line[:n].lower() == name:
                hit = 1
            elif not line[:1].isspace():
                # A new, different header starts here; continuation lines
                # (leading whitespace) leave `hit' untouched.
                hit = 0
            if hit:
                lst.append(line)
        return lst

    def getfirstmatchingheader(self, name):
        """Get the first header line matching name.

        This is similar to getallmatchingheaders, but it returns only the
        first matching header (and its continuation lines).
        """
        name = name.lower() + ':'
        n = len(name)
        lst = []
        hit = 0
        for line in self.headers:
            if hit:
                if not line[:1].isspace():
                    break
            elif line[:n].lower() == name:
                hit = 1
            if hit:
                lst.append(line)
        return lst

    def getrawheader(self, name):
        """A higher-level interface to getfirstmatchingheader().

        Return a string containing the literal text of the header but with the
        keyword stripped.  All leading, trailing and embedded whitespace is
        kept in the string, however.  Return None if the header does not
        occur.
        """

        lst = self.getfirstmatchingheader(name)
        if not lst:
            return None
        # Drop "<name>:" from the first line only.
        lst[0] = lst[0][len(name) + 1:]
        return ''.join(lst)

    def getheader(self, name, default=None):
        """Get the header value for a name.

        This is the normal interface: it returns a stripped version of the
        header value for a given header name, or `default' (None unless
        given) if it doesn't exist.  This uses the dictionary version which
        finds the *last* such header.
        """
        return self.dict.get(name.lower(), default)
    get = getheader

    def getheaders(self, name):
        """Get all values for a header.

        This returns a list of values for headers given more than once; each
        value in the result list is stripped in the same way as the result of
        getheader().  If the header is not given, return an empty list.
        """
        result = []
        current = ''
        have_header = 0
        for s in self.getallmatchingheaders(name):
            if s[0].isspace():
                # Continuation line: fold it into the value being built.
                if current:
                    current = "%s\n %s" % (current, s.strip())
                else:
                    current = s.strip()
            else:
                if have_header:
                    result.append(current)
                current = s[s.find(":") + 1:].strip()
                have_header = 1
        if have_header:
            result.append(current)
        return result

    def getaddr(self, name):
        """Get a single address from a header, as a tuple.

        An example return value:
        ('Guido van Rossum', 'guido@cwi.nl')

        Returns (None, None) if the header yields no address.
        """
        # New, by Ben Escoto
        alist = self.getaddrlist(name)
        if alist:
            return alist[0]
        else:
            return (None, None)

    def getaddrlist(self, name):
        """Get a list of addresses from a header.

        Retrieves a list of addresses from a header, where each address is a
        tuple as returned by getaddr().  Scans all named headers, so it works
        properly with multiple To: or Cc: headers for example.
        """
        raw = []
        for h in self.getallmatchingheaders(name):
            if h[0] in ' \t':
                raw.append(h)
            else:
                if raw:
                    # Separate the values of repeated headers with a comma.
                    raw.append(', ')
                i = h.find(':')
                if i > 0:
                    addr = h[i+1:]
                raw.append(addr)
        alladdrs = ''.join(raw)
        a = AddressList(alladdrs)
        return a.addresslist

    def getdate(self, name):
        """Retrieve a date field from a header.

        Retrieves a date field from the named header, returning a tuple
        compatible with time.mktime().  Returns None if the header is
        missing or cannot be parsed.
        """
        try:
            data = self[name]
        except KeyError:
            return None
        return parsedate(data)

    def getdate_tz(self, name):
        """Retrieve a date field from a header as a 10-tuple.

        The first 9 elements make up a tuple compatible with time.mktime(),
        and the 10th is the offset of the poster's time zone from GMT/UTC.
        Returns None if the header is missing or cannot be parsed.
        """
        try:
            data = self[name]
        except KeyError:
            return None
        return parsedate_tz(data)


    # Access as a dictionary (only finds *last* header of each type):

    def __len__(self):
        """Get the number of headers in a message."""
        return len(self.dict)

    def __getitem__(self, name):
        """Get a specific header, as from a dictionary."""
        return self.dict[name.lower()]

    def __setitem__(self, name, value):
        """Set the value of a header.

        Note: This is not a perfect inversion of __getitem__, because any
        changed headers get stuck at the end of the raw-headers list rather
        than where the altered header was.
        """
        del self[name] # Won't fail if it doesn't exist
        self.dict[name.lower()] = value
        text = name + ": " + value
        for line in text.split("\n"):
            self.headers.append(line + "\n")

    def __delitem__(self, name):
        """Delete all occurrences of a specific header, if it is present."""
        name = name.lower()
        if name not in self.dict:
            return
        del self.dict[name]
        name = name + ':'
        n = len(name)
        lst = []
        hit = 0
        for i, line in enumerate(self.headers):
            if line[:n].lower() == name:
                hit = 1
            elif not line[:1].isspace():
                hit = 0
            if hit:
                lst.append(i)
        # Delete from the back so the remaining indices stay valid.
        for i in reversed(lst):
            del self.headers[i]

    def setdefault(self, name, default=""):
        """Return the value of header `name', adding it first if missing.

        When the header is absent it is appended to the raw header list and
        stored in the dictionary with the value `default'.
        """
        lowername = name.lower()
        if lowername in self.dict:
            return self.dict[lowername]
        else:
            text = name + ": " + default
            for line in text.split("\n"):
                self.headers.append(line + "\n")
            self.dict[lowername] = default
            return default

    def has_key(self, name):
        """Determine whether a message contains the named header."""
        return name.lower() in self.dict

    def __contains__(self, name):
        """Determine whether a message contains the named header."""
        return name.lower() in self.dict

    def __iter__(self):
        """Iterate over the (lower-cased) header field names."""
        return iter(self.dict)

    def keys(self):
        """Get all of a message's header field names."""
        return self.dict.keys()

    def values(self):
        """Get all of a message's header field values."""
        return self.dict.values()

    def items(self):
        """Get all of a message's headers.

        Returns a list of name, value tuples.
        """
        return self.dict.items()

    def __str__(self):
        """Return the raw header lines joined back into one string."""
        return ''.join(self.headers)


# Utility functions
# -----------------

# XXX Should fix unquote() and quote() to be really conformant.
# XXX The inverses of the parse functions may also be useful.


def unquote(s):
    """Remove quotes from a string.

    Strips one pair of surrounding double quotes (undoing backslash escapes)
    or one pair of surrounding angle brackets; other strings are returned
    unchanged.
    """
    if len(s) > 1:
        if s.startswith('"') and s.endswith('"'):
            return s[1:-1].replace('\\\\', '\\').replace('\\"', '"')
        if s.startswith('<') and s.endswith('>'):
            return s[1:-1]
    return s


def quote(s):
    """Backslash-escape backslashes and double quotes in a string.

    The surrounding quote characters themselves are not added.
    """
    return s.replace('\\', '\\\\').replace('"', '\\"')


def parseaddr(address):
    """Parse an address into a (realname, mailaddr) tuple.

    Returns (None, None) if no address could be parsed.
    """
    a = AddressList(address)
    lst = a.addresslist
    if not lst:
        return (None, None)
    return lst[0]


class AddrlistClass:
    """Address parser class by Ben Escoto.

    To understand what this class does, it helps to have a copy of
    RFC 2822 in front of you.

    http://www.faqs.org/rfcs/rfc2822.html

    Note: this class interface is deprecated and may be removed in the future.
    Use rfc822.AddressList instead.
    """

    def __init__(self, field):
        """Initialize a new instance.

        `field' is an unparsed address header field, containing one or more
        addresses.
        """
        self.specials = '()<>@,:;.\"[]'
        self.pos = 0
        self.LWS = ' \t'
        self.CR = '\r\n'
        self.atomends = self.specials + self.LWS + self.CR
        # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
        # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
        # syntax, so allow dots in phrases.
        self.phraseends = self.atomends.replace('.', '')
        self.field = field
        self.commentlist = []

    def gotonext(self):
        """Parse up to the start of the next address."""
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS + '\n\r':
                self.pos = self.pos + 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            else: break

    def getaddrlist(self):
        """Parse all addresses.

        Returns a list containing all of the addresses.
        """
        result = []
        ad = self.getaddress()
        while ad:
            result += ad
            ad = self.getaddress()
        return result

    def getaddress(self):
        """Parse the next address.

        Returns a list of (realname, mailaddr) tuples: one entry for a
        plain address, several for a group, empty if nothing was parsed.
        """
        self.commentlist = []
        self.gotonext()

        oldpos = self.pos
        oldcl = self.commentlist
        plist = self.getphraselist()

        self.gotonext()
        returnlist = []

        if self.pos >= len(self.field):
            # Bad email address technically, no domain.
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]

        elif self.field[self.pos] in '.@':
            # email address is just an addrspec
            # this isn't very efficient since we start over
            self.pos = oldpos
            self.commentlist = oldcl
            addrspec = self.getaddrspec()
            returnlist = [(' '.join(self.commentlist), addrspec)]

        elif self.field[self.pos] == ':':
            # address is a group
            returnlist = []

            fieldlen = len(self.field)
            self.pos += 1
            while self.pos < len(self.field):
                self.gotonext()
                if self.pos < fieldlen and self.field[self.pos] == ';':
                    self.pos += 1
                    break
                returnlist = returnlist + self.getaddress()

        elif self.field[self.pos] == '<':
            # Address is a phrase then a route addr
            routeaddr = self.getrouteaddr()

            if self.commentlist:
                returnlist = [(' '.join(plist) + ' (' + \
                         ' '.join(self.commentlist) + ')', routeaddr)]
            else: returnlist = [(' '.join(plist), routeaddr)]

        else:
            if plist:
                returnlist = [(' '.join(self.commentlist), plist[0])]
            elif self.field[self.pos] in self.specials:
                # Skip a stray special so parsing always makes progress.
                self.pos += 1

        self.gotonext()
        if self.pos < len(self.field) and self.field[self.pos] == ',':
            self.pos += 1
        return returnlist

    def getrouteaddr(self):
        """Parse a route address (Return-path value).

        This method just skips all the route stuff and returns the addrspec.
        Returns None if the parser is not positioned on a '<'.
        """
        if self.field[self.pos] != '<':
            return

        expectroute = 0
        self.pos += 1
        self.gotonext()
        adlist = ""
        while self.pos < len(self.field):
            if expectroute:
                self.getdomain()
                expectroute = 0
            elif self.field[self.pos] == '>':
                self.pos += 1
                break
            elif self.field[self.pos] == '@':
                self.pos += 1
                expectroute = 1
            elif self.field[self.pos] == ':':
                self.pos += 1
            else:
                adlist = self.getaddrspec()
                self.pos += 1
                break
            self.gotonext()

        return adlist

    def getaddrspec(self):
        """Parse an RFC 2822 addr-spec."""
        aslist = []

        self.gotonext()
        while self.pos < len(self.field):
            if self.field[self.pos] == '.':
                aslist.append('.')
                self.pos += 1
            elif self.field[self.pos] == '"':
                aslist.append('"%s"' % self.getquote())
            elif self.field[self.pos] in self.atomends:
                break
            else: aslist.append(self.getatom())
            self.gotonext()

        if self.pos >= len(self.field) or self.field[self.pos] != '@':
            # Local part only, no domain.
            return ''.join(aslist)

        aslist.append('@')
        self.pos += 1
        self.gotonext()
        return ''.join(aslist) + self.getdomain()

    def getdomain(self):
        """Get the complete domain name from an address."""
        sdlist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] == '[':
                sdlist.append(self.getdomainliteral())
            elif self.field[self.pos] == '.':
                self.pos += 1
                sdlist.append('.')
            elif self.field[self.pos] in self.atomends:
                break
            else: sdlist.append(self.getatom())
        return ''.join(sdlist)

    def getdelimited(self, beginchar, endchars, allowcomments = 1):
        """Parse a header fragment delimited by special characters.

        `beginchar' is the start character for the fragment.  If self is not
        looking at an instance of `beginchar' then getdelimited returns the
        empty string.

        `endchars' is a sequence of allowable end-delimiting characters.
        Parsing stops when one of these is encountered.

        If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
        within the parsed fragment.
        """
        if self.field[self.pos] != beginchar:
            return ''

        slist = ['']
        quote = 0
        self.pos += 1
        while self.pos < len(self.field):
            if quote == 1:
                # Previous character was a backslash: take this one as-is.
                slist.append(self.field[self.pos])
                quote = 0
            elif self.field[self.pos] in endchars:
                self.pos += 1
                break
            elif allowcomments and self.field[self.pos] == '(':
                slist.append(self.getcomment())
                continue        # have already advanced pos from getcomment
            elif self.field[self.pos] == '\\':
                quote = 1
            else:
                slist.append(self.field[self.pos])
            self.pos += 1

        return ''.join(slist)

    def getquote(self):
        """Get a quote-delimited fragment from self's field."""
        return self.getdelimited('"', '"\r', 0)

    def getcomment(self):
        """Get a parenthesis-delimited fragment from self's field."""
        return self.getdelimited('(', ')\r', 1)

    def getdomainliteral(self):
        """Parse an RFC 2822 domain-literal."""
        return '[%s]' % self.getdelimited('[', ']\r', 0)

    def getatom(self, atomends=None):
        """Parse an RFC 2822 atom.

        Optional atomends specifies a different set of end token delimiters
        (the default is to use self.atomends).  This is used e.g. in
        getphraselist() since phrase endings must not include the `.' (which
        is legal in phrases)."""
        atomlist = ['']
        if atomends is None:
            atomends = self.atomends

        while self.pos < len(self.field):
            if self.field[self.pos] in atomends:
                break
            else: atomlist.append(self.field[self.pos])
            self.pos += 1

        return ''.join(atomlist)

    def getphraselist(self):
        """Parse a sequence of RFC 2822 phrases.

        A phrase is a sequence of words, which are in turn either RFC 2822
        atoms or quoted-strings.  Phrases are canonicalized by squeezing all
        runs of continuous whitespace into one space.
        """
        plist = []

        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos += 1
            elif self.field[self.pos] == '"':
                plist.append(self.getquote())
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] in self.phraseends:
                break
            else:
                plist.append(self.getatom(self.phraseends))

        return plist

class AddressList(AddrlistClass):
    """An AddressList encapsulates a list of parsed RFC 2822 addresses."""
    def __init__(self, field):
        """Parse `field' into self.addresslist (empty for a false field)."""
        AddrlistClass.__init__(self, field)
        if field:
            self.addresslist = self.getaddrlist()
        else:
            self.addresslist = []

    def __len__(self):
        """Return the number of parsed addresses."""
        return len(self.addresslist)

    def __str__(self):
        """Return the addresses in canonical form, comma separated."""
        return ", ".join(map(dump_address_pair, self.addresslist))

    def __add__(self, other):
        # Set union
        newaddr = AddressList(None)
        newaddr.addresslist = self.addresslist[:]
        for x in other.addresslist:
            if x not in self.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __iadd__(self, other):
        # Set union, in-place
        for x in other.addresslist:
            if x not in self.addresslist:
                self.addresslist.append(x)
        return self

    def __sub__(self, other):
        # Set difference
        newaddr = AddressList(None)
        for x in self.addresslist:
            if x not in other.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __isub__(self, other):
        # Set difference, in-place
        for x in other.addresslist:
            if x in self.addresslist:
                self.addresslist.remove(x)
        return self

    def __getitem__(self, index):
        # Make indexing, slices, and 'in' work
        return self.addresslist[index]

def dump_address_pair(pair):
    """Dump a (name, address) pair in a canonicalized form."""
    if pair[0]:
        return '"' + pair[0] + '" <' + pair[1] + '>'
    else:
        return pair[1]

# Parse a date field

_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
               'aug', 'sep', 'oct', 'nov', 'dec',
               'january', 'february', 'march', 'april', 'may', 'june', 'july',
               'august', 'september', 'october', 'november', 'december']
_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.

_timezones = {'UT':0, 'UTC':0, 'GMT':0, 'Z':0,
              'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
              'EST': -500, 'EDT': -400,  # Eastern
              'CST': -600, 'CDT': -500,  # Central
              'MST': -700, 'MDT': -600,  # Mountain
              'PST': -800, 'PDT': -700   # Pacific
              }


def parsedate_tz(data):
    """Convert a date string to a time tuple.

    Accounts for military timezones.

    Returns a 10-tuple: the first nine items are in the layout accepted by
    time.mktime() (weekday, yearday and dst are always 0, 1 and 0), and the
    tenth is the timezone offset in seconds, or None if the zone is missing
    or unknown.  Returns None if the string cannot be parsed.
    """
    if not data:
        return None
    data = data.split()
    if not data:
        # Whitespace-only input: there is no first token to inspect.
        return None
    if data[0][-1] in (',', '.') or data[0].lower() in _daynames:
        # There's a dayname here. Skip it
        del data[0]
    else:
        # no space after the "weekday,"?
        i = data[0].rfind(',')
        if i >= 0:
            data[0] = data[0][i+1:]
    if len(data) == 3: # RFC 850 date, deprecated
        stuff = data[0].split('-')
        if len(stuff) == 3:
            data = stuff + data[1:]
    if len(data) == 4:
        # The zone may be glued to the time ("10:00:00+0100"); split it
        # off, keeping the sign so that negative offsets work as well.
        s = data[3]
        i = s.find('+')
        if i == -1:
            i = s.find('-')
        if i > 0:
            data[3:] = [s[:i], s[i:]]
        else:
            data.append('') # Dummy tz
    if len(data) < 5:
        return None
    data = data[:5]
    [dd, mm, yy, tm, tz] = data
    mm = mm.lower()
    if not mm in _monthnames:
        # Maybe the day and month are swapped ("Jan 1" instead of "1 Jan").
        dd, mm = mm, dd.lower()
        if not mm in _monthnames:
            return None
    mm = _monthnames.index(mm)+1
    if mm > 12: mm = mm - 12
    if not (dd and yy and tm):
        # An empty field (possible after the RFC 850 '-' split) would make
        # the [-1] / [0] lookups below fail.
        return None
    if dd[-1] == ',':
        dd = dd[:-1]
    i = yy.find(':')
    if i > 0:
        # Year and time are swapped.
        yy, tm = tm, yy
    if yy[-1] == ',':
        yy = yy[:-1]
    if not yy[0].isdigit():
        yy, tz = tz, yy
    if tm[-1] == ',':
        tm = tm[:-1]
    tm = tm.split(':')
    if len(tm) == 2:
        [thh, tmm] = tm
        tss = '0'
    elif len(tm) == 3:
        [thh, tmm, tss] = tm
    else:
        return None
    try:
        yy = int(yy)
        dd = int(dd)
        thh = int(thh)
        tmm = int(tmm)
        tss = int(tss)
    except ValueError:
        return None
    tzoffset = None
    tz = tz.upper()
    if tz in _timezones:
        tzoffset = _timezones[tz]
    else:
        try:
            tzoffset = int(tz)
        except ValueError:
            pass
    # Convert a timezone offset into seconds ; -0500 -> -18000
    if tzoffset:
        if tzoffset < 0:
            tzsign = -1
            tzoffset = -tzoffset
        else:
            tzsign = 1
        tzoffset = tzsign * ( (tzoffset//100)*3600 + (tzoffset % 100)*60)
    return (yy, mm, dd, thh, tmm, tss, 0, 1, 0, tzoffset)


def parsedate(data):
    """Convert a time string to a time tuple.

    Same as parsedate_tz() without the trailing timezone offset; returns
    None if the string cannot be parsed.
    """
    t = parsedate_tz(data)
    if t is None:
        return t
    return t[:9]


def mktime_tz(data):
    """Turn a 10-tuple as returned by parsedate_tz() into a UTC timestamp."""
    if data[9] is None:
        # No zone info, so localtime is better assumption than GMT
        return time.mktime(data[:8] + (-1,))
    else:
        # calendar.timegm() reads the fields as UTC, so the result does not
        # depend on the local timezone or its DST rules (time.mktime() minus
        # time.timezone is off whenever local DST applies to the date).
        # float() keeps the return type the same as the branch above.
        return float(calendar.timegm(data[:6]) - data[9])

def formatdate(timeval=None):
    """Returns time format preferred for Internet standards.

    Sun, 06 Nov 1994 08:49:37 GMT  ; RFC 822, updated by RFC 1123

    According to RFC 1123, day and month names must always be in
    English.  If not for that, this code could use strftime().  It
    can't because strftime() honors the locale and could generate
    non-English names.

    `timeval' is a timestamp in seconds since the epoch; the current time
    is used when it is None.
    """
    if timeval is None:
        timeval = time.time()
    timeval = time.gmtime(timeval)
    return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
            ("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")[timeval[6]],
            timeval[2],
            ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
             "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")[timeval[1]-1],
                                timeval[0], timeval[3], timeval[4], timeval[5])


# When used as script, run a small test program.
# The first command line argument must be a filename containing one
# message in RFC-822 format.
if __name__ == '__main__':
    # Small self-test: parse one message file and dump what was found.
    # Usage: <script> [FILE]; FILE defaults to $HOME/Mail/inbox/1.
    import os
    import sys

    def _emit(text=''):
        # One writer for all output, so the script parses on interpreters
        # with or without the print statement.
        sys.stdout.write(text + '\n')

    path = os.path.join(os.environ['HOME'], 'Mail/inbox/1')
    if sys.argv[1:]:
        path = sys.argv[1]
    f = open(path, 'r')
    try:
        m = Message(f)
        _emit('From: %s' % (m.getaddr('from'),))
        _emit('To: %s' % (m.getaddrlist('to'),))
        _emit('Subject: %s' % (m.getheader('subject'),))
        _emit('Date: %s' % (m.getheader('date'),))
        date = m.getdate_tz('date')
        if date:
            # Only touch the tuple once we know the Date header was both
            # present and parseable (getdate_tz returns None otherwise).
            tz = date[-1]
            parts = ['ParsedDate:',
                     time.asctime(time.localtime(mktime_tz(date)))]
            if tz is not None:
                # Split the magnitude and add the sign separately: floor
                # division on a negative offset would turn -05:30 into
                # -06:30.
                if tz < 0:
                    sign = '-'
                else:
                    sign = '+'
                hhmm, ss = divmod(abs(tz), 60)
                hh, mm = divmod(hhmm, 60)
                parts.append('%s%02d%02d' % (sign, hh, mm))
                if ss:
                    parts.append('.%02d' % ss)
            _emit(' '.join(parts))
        else:
            _emit('ParsedDate: None')
        m.rewindbody()
        n = 0
        while f.readline():
            n += 1
        _emit('Lines: %s' % (n,))
        _emit('-'*70)
        _emit('len = %s' % (len(m),))
        if 'Date' in m:
            _emit('Date = %s' % (m['Date'],))
        if 'X-Nonsense' in m:
            pass
        _emit('keys = %s' % (m.keys(),))
        _emit('values = %s' % (m.values(),))
        _emit('items = %s' % (m.items(),))
    finally:
        # Close the message file even if parsing or printing fails.
        f.close()