# Home | History | Annotate | Download | only in Lib   (source-browser navigation residue, not part of the module)
      1 """RFC 2822 message manipulation.
      2 
      3 Note: This is only a very rough sketch of a full RFC-822 parser; in particular
      4 the tokenizing of addresses does not adhere to all the quoting rules.
      5 
      6 Note: RFC 2822 is a long awaited update to RFC 822.  This module should
      7 conform to RFC 2822, and is thus mis-named (it's not worth renaming it).  Some
effort at RFC 2822 updates has been made, but a thorough audit has not been
      9 performed.  Consider any RFC 2822 non-conformance to be a bug.
     10 
     11     RFC 2822: http://www.faqs.org/rfcs/rfc2822.html
     12     RFC 822 : http://www.faqs.org/rfcs/rfc822.html (obsolete)
     13 
     14 Directions for use:
     15 
     16 To create a Message object: first open a file, e.g.:
     17 
     18   fp = open(file, 'r')
     19 
     20 You can use any other legal way of getting an open file object, e.g. use
     21 sys.stdin or call os.popen().  Then pass the open file object to the Message()
     22 constructor:
     23 
     24   m = Message(fp)
     25 
     26 This class can work with any input object that supports a readline method.  If
     27 the input object has seek and tell capability, the rewindbody method will
     28 work; also illegal lines will be pushed back onto the input stream.  If the
     29 input object lacks seek but has an `unread' method that can push back a line
     30 of input, Message will use that to push back illegal lines.  Thus this class
     31 can be used to parse messages coming from a buffered stream.
     32 
     33 The optional `seekable' argument is provided as a workaround for certain stdio
     34 libraries in which tell() discards buffered data before discovering that the
     35 lseek() system call doesn't work.  For maximum portability, you should set the
seekable argument to zero to prevent that initial tell() when passing in
     37 an unseekable object such as a file object created from a socket object.  If
     38 it is 1 on entry -- which it is by default -- the tell() method of the open
     39 file object is called once; if this raises an exception, seekable is reset to
     40 0.  For other nonzero values of seekable, this test is not made.
     41 
     42 To get the text of a particular header there are several methods:
     43 
     44   str = m.getheader(name)
     45   str = m.getrawheader(name)
     46 
     47 where name is the name of the header, e.g. 'Subject'.  The difference is that
     48 getheader() strips the leading and trailing whitespace, while getrawheader()
     49 doesn't.  Both functions retain embedded whitespace (including newlines)
     50 exactly as they are specified in the header, and leave the case of the text
     51 unchanged.
     52 
     53 For addresses and address lists there are functions
     54 
     55   realname, mailaddress = m.getaddr(name)
     56   list = m.getaddrlist(name)
     57 
     58 where the latter returns a list of (realname, mailaddr) tuples.
     59 
     60 There is also a method
     61 
     62   time = m.getdate(name)
     63 
     64 which parses a Date-like field and returns a time-compatible tuple,
     65 i.e. a tuple such as returned by time.localtime() or accepted by
     66 time.mktime().
     67 
     68 See the class definition for lower level access methods.
     69 
     70 There are also some utility functions here.
     71 """
# Cleanup and extensions by Eric S. Raymond <esr@thyrsus.com>
     73 
     74 import time
     75 
     76 from warnings import warnpy3k
# Issue the py3k notice when this module is imported; stacklevel=2 attributes
# the warning to the caller's frame rather than to this file.
warnpy3k("in 3.x, rfc822 has been removed in favor of the email package",
         stacklevel=2)

# Names exported by a star-import of this module.
__all__ = ["Message","AddressList","parsedate","parsedate_tz","mktime_tz"]

# The two line values that Message.islast() accepts as the end of the headers.
_blanklines = ('\r\n', '\n')            # Optimization for islast()
     83 
     84 
     85 class Message:
     86     """Represents a single RFC 2822-compliant message."""
     87 
     88     def __init__(self, fp, seekable = 1):
     89         """Initialize the class instance and read the headers."""
     90         if seekable == 1:
     91             # Exercise tell() to make sure it works
     92             # (and then assume seek() works, too)
     93             try:
     94                 fp.tell()
     95             except (AttributeError, IOError):
     96                 seekable = 0
     97         self.fp = fp
     98         self.seekable = seekable
     99         self.startofheaders = None
    100         self.startofbody = None
    101         #
    102         if self.seekable:
    103             try:
    104                 self.startofheaders = self.fp.tell()
    105             except IOError:
    106                 self.seekable = 0
    107         #
    108         self.readheaders()
    109         #
    110         if self.seekable:
    111             try:
    112                 self.startofbody = self.fp.tell()
    113             except IOError:
    114                 self.seekable = 0
    115 
    116     def rewindbody(self):
    117         """Rewind the file to the start of the body (if seekable)."""
    118         if not self.seekable:
    119             raise IOError, "unseekable file"
    120         self.fp.seek(self.startofbody)
    121 
    122     def readheaders(self):
    123         """Read header lines.
    124 
    125         Read header lines up to the entirely blank line that terminates them.
    126         The (normally blank) line that ends the headers is skipped, but not
    127         included in the returned list.  If a non-header line ends the headers,
    128         (which is an error), an attempt is made to backspace over it; it is
    129         never included in the returned list.
    130 
    131         The variable self.status is set to the empty string if all went well,
    132         otherwise it is an error message.  The variable self.headers is a
    133         completely uninterpreted list of lines contained in the header (so
    134         printing them will reproduce the header exactly as it appears in the
    135         file).
    136         """
    137         self.dict = {}
    138         self.unixfrom = ''
    139         self.headers = lst = []
    140         self.status = ''
    141         headerseen = ""
    142         firstline = 1
    143         startofline = unread = tell = None
    144         if hasattr(self.fp, 'unread'):
    145             unread = self.fp.unread
    146         elif self.seekable:
    147             tell = self.fp.tell
    148         while 1:
    149             if tell:
    150                 try:
    151                     startofline = tell()
    152                 except IOError:
    153                     startofline = tell = None
    154                     self.seekable = 0
    155             line = self.fp.readline()
    156             if not line:
    157                 self.status = 'EOF in headers'
    158                 break
    159             # Skip unix From name time lines
    160             if firstline and line.startswith('From '):
    161                 self.unixfrom = self.unixfrom + line
    162                 continue
    163             firstline = 0
    164             if headerseen and line[0] in ' \t':
    165                 # It's a continuation line.
    166                 lst.append(line)
    167                 x = (self.dict[headerseen] + "\n " + line.strip())
    168                 self.dict[headerseen] = x.strip()
    169                 continue
    170             elif self.iscomment(line):
    171                 # It's a comment.  Ignore it.
    172                 continue
    173             elif self.islast(line):
    174                 # Note! No pushback here!  The delimiter line gets eaten.
    175                 break
    176             headerseen = self.isheader(line)
    177             if headerseen:
    178                 # It's a legal header line, save it.
    179                 lst.append(line)
    180                 self.dict[headerseen] = line[len(headerseen)+1:].strip()
    181                 continue
    182             elif headerseen is not None:
    183                 # An empty header name. These aren't allowed in HTTP, but it's
    184                 # probably a benign mistake. Don't add the header, just keep
    185                 # going.
    186                 continue
    187             else:
    188                 # It's not a header line; throw it back and stop here.
    189                 if not self.dict:
    190                     self.status = 'No headers'
    191                 else:
    192                     self.status = 'Non-header line where header expected'
    193                 # Try to undo the read.
    194                 if unread:
    195                     unread(line)
    196                 elif tell:
    197                     self.fp.seek(startofline)
    198                 else:
    199                     self.status = self.status + '; bad seek'
    200                 break
    201 
    202     def isheader(self, line):
    203         """Determine whether a given line is a legal header.
    204 
    205         This method should return the header name, suitably canonicalized.
    206         You may override this method in order to use Message parsing on tagged
    207         data in RFC 2822-like formats with special header formats.
    208         """
    209         i = line.find(':')
    210         if i > -1:
    211             return line[:i].lower()
    212         return None
    213 
    214     def islast(self, line):
    215         """Determine whether a line is a legal end of RFC 2822 headers.
    216 
    217         You may override this method if your application wants to bend the
    218         rules, e.g. to strip trailing whitespace, or to recognize MH template
    219         separators ('--------').  For convenience (e.g. for code reading from
    220         sockets) a line consisting of \\r\\n also matches.
    221         """
    222         return line in _blanklines
    223 
    224     def iscomment(self, line):
    225         """Determine whether a line should be skipped entirely.
    226 
    227         You may override this method in order to use Message parsing on tagged
    228         data in RFC 2822-like formats that support embedded comments or
    229         free-text data.
    230         """
    231         return False
    232 
    233     def getallmatchingheaders(self, name):
    234         """Find all header lines matching a given header name.
    235 
    236         Look through the list of headers and find all lines matching a given
    237         header name (and their continuation lines).  A list of the lines is
    238         returned, without interpretation.  If the header does not occur, an
    239         empty list is returned.  If the header occurs multiple times, all
    240         occurrences are returned.  Case is not important in the header name.
    241         """
    242         name = name.lower() + ':'
    243         n = len(name)
    244         lst = []
    245         hit = 0
    246         for line in self.headers:
    247             if line[:n].lower() == name:
    248                 hit = 1
    249             elif not line[:1].isspace():
    250                 hit = 0
    251             if hit:
    252                 lst.append(line)
    253         return lst
    254 
    255     def getfirstmatchingheader(self, name):
    256         """Get the first header line matching name.
    257 
    258         This is similar to getallmatchingheaders, but it returns only the
    259         first matching header (and its continuation lines).
    260         """
    261         name = name.lower() + ':'
    262         n = len(name)
    263         lst = []
    264         hit = 0
    265         for line in self.headers:
    266             if hit:
    267                 if not line[:1].isspace():
    268                     break
    269             elif line[:n].lower() == name:
    270                 hit = 1
    271             if hit:
    272                 lst.append(line)
    273         return lst
    274 
    275     def getrawheader(self, name):
    276         """A higher-level interface to getfirstmatchingheader().
    277 
    278         Return a string containing the literal text of the header but with the
    279         keyword stripped.  All leading, trailing and embedded whitespace is
    280         kept in the string, however.  Return None if the header does not
    281         occur.
    282         """
    283 
    284         lst = self.getfirstmatchingheader(name)
    285         if not lst:
    286             return None
    287         lst[0] = lst[0][len(name) + 1:]
    288         return ''.join(lst)
    289 
    290     def getheader(self, name, default=None):
    291         """Get the header value for a name.
    292 
    293         This is the normal interface: it returns a stripped version of the
    294         header value for a given header name, or None if it doesn't exist.
    295         This uses the dictionary version which finds the *last* such header.
    296         """
    297         return self.dict.get(name.lower(), default)
    298     get = getheader
    299 
    300     def getheaders(self, name):
    301         """Get all values for a header.
    302 
    303         This returns a list of values for headers given more than once; each
    304         value in the result list is stripped in the same way as the result of
    305         getheader().  If the header is not given, return an empty list.
    306         """
    307         result = []
    308         current = ''
    309         have_header = 0
    310         for s in self.getallmatchingheaders(name):
    311             if s[0].isspace():
    312                 if current:
    313                     current = "%s\n %s" % (current, s.strip())
    314                 else:
    315                     current = s.strip()
    316             else:
    317                 if have_header:
    318                     result.append(current)
    319                 current = s[s.find(":") + 1:].strip()
    320                 have_header = 1
    321         if have_header:
    322             result.append(current)
    323         return result
    324 
    325     def getaddr(self, name):
    326         """Get a single address from a header, as a tuple.
    327 
    328         An example return value:
    329         ('Guido van Rossum', 'guido@cwi.nl')
    330         """
    331         # New, by Ben Escoto
    332         alist = self.getaddrlist(name)
    333         if alist:
    334             return alist[0]
    335         else:
    336             return (None, None)
    337 
    338     def getaddrlist(self, name):
    339         """Get a list of addresses from a header.
    340 
    341         Retrieves a list of addresses from a header, where each address is a
    342         tuple as returned by getaddr().  Scans all named headers, so it works
    343         properly with multiple To: or Cc: headers for example.
    344         """
    345         raw = []
    346         for h in self.getallmatchingheaders(name):
    347             if h[0] in ' \t':
    348                 raw.append(h)
    349             else:
    350                 if raw:
    351                     raw.append(', ')
    352                 i = h.find(':')
    353                 if i > 0:
    354                     addr = h[i+1:]
    355                 raw.append(addr)
    356         alladdrs = ''.join(raw)
    357         a = AddressList(alladdrs)
    358         return a.addresslist
    359 
    360     def getdate(self, name):
    361         """Retrieve a date field from a header.
    362 
    363         Retrieves a date field from the named header, returning a tuple
    364         compatible with time.mktime().
    365         """
    366         try:
    367             data = self[name]
    368         except KeyError:
    369             return None
    370         return parsedate(data)
    371 
    372     def getdate_tz(self, name):
    373         """Retrieve a date field from a header as a 10-tuple.
    374 
    375         The first 9 elements make up a tuple compatible with time.mktime(),
    376         and the 10th is the offset of the poster's time zone from GMT/UTC.
    377         """
    378         try:
    379             data = self[name]
    380         except KeyError:
    381             return None
    382         return parsedate_tz(data)
    383 
    384 
    385     # Access as a dictionary (only finds *last* header of each type):
    386 
    387     def __len__(self):
    388         """Get the number of headers in a message."""
    389         return len(self.dict)
    390 
    391     def __getitem__(self, name):
    392         """Get a specific header, as from a dictionary."""
    393         return self.dict[name.lower()]
    394 
    395     def __setitem__(self, name, value):
    396         """Set the value of a header.
    397 
    398         Note: This is not a perfect inversion of __getitem__, because any
    399         changed headers get stuck at the end of the raw-headers list rather
    400         than where the altered header was.
    401         """
    402         del self[name] # Won't fail if it doesn't exist
    403         self.dict[name.lower()] = value
    404         text = name + ": " + value
    405         for line in text.split("\n"):
    406             self.headers.append(line + "\n")
    407 
    408     def __delitem__(self, name):
    409         """Delete all occurrences of a specific header, if it is present."""
    410         name = name.lower()
    411         if not name in self.dict:
    412             return
    413         del self.dict[name]
    414         name = name + ':'
    415         n = len(name)
    416         lst = []
    417         hit = 0
    418         for i in range(len(self.headers)):
    419             line = self.headers[i]
    420             if line[:n].lower() == name:
    421                 hit = 1
    422             elif not line[:1].isspace():
    423                 hit = 0
    424             if hit:
    425                 lst.append(i)
    426         for i in reversed(lst):
    427             del self.headers[i]
    428 
    429     def setdefault(self, name, default=""):
    430         lowername = name.lower()
    431         if lowername in self.dict:
    432             return self.dict[lowername]
    433         else:
    434             text = name + ": " + default
    435             for line in text.split("\n"):
    436                 self.headers.append(line + "\n")
    437             self.dict[lowername] = default
    438             return default
    439 
    440     def has_key(self, name):
    441         """Determine whether a message contains the named header."""
    442         return name.lower() in self.dict
    443 
    444     def __contains__(self, name):
    445         """Determine whether a message contains the named header."""
    446         return name.lower() in self.dict
    447 
    448     def __iter__(self):
    449         return iter(self.dict)
    450 
    451     def keys(self):
    452         """Get all of a message's header field names."""
    453         return self.dict.keys()
    454 
    455     def values(self):
    456         """Get all of a message's header field values."""
    457         return self.dict.values()
    458 
    459     def items(self):
    460         """Get all of a message's headers.
    461 
    462         Returns a list of name, value tuples.
    463         """
    464         return self.dict.items()
    465 
    466     def __str__(self):
    467         return ''.join(self.headers)
    468 
    469 
    470 # Utility functions
    471 # -----------------
    472 
    473 # XXX Should fix unquote() and quote() to be really conformant.
    474 # XXX The inverses of the parse functions may also be useful.
    475 
    476 
    477 def unquote(s):
    478     """Remove quotes from a string."""
    479     if len(s) > 1:
    480         if s.startswith('"') and s.endswith('"'):
    481             return s[1:-1].replace('\\\\', '\\').replace('\\"', '"')
    482         if s.startswith('<') and s.endswith('>'):
    483             return s[1:-1]
    484     return s
    485 
    486 
    487 def quote(s):
    488     """Add quotes around a string."""
    489     return s.replace('\\', '\\\\').replace('"', '\\"')
    490 
    491 
    492 def parseaddr(address):
    493     """Parse an address into a (realname, mailaddr) tuple."""
    494     a = AddressList(address)
    495     lst = a.addresslist
    496     if not lst:
    497         return (None, None)
    498     return lst[0]
    499 
    500 
    501 class AddrlistClass:
    502     """Address parser class by Ben Escoto.
    503 
    504     To understand what this class does, it helps to have a copy of
    505     RFC 2822 in front of you.
    506 
    507     http://www.faqs.org/rfcs/rfc2822.html
    508 
    509     Note: this class interface is deprecated and may be removed in the future.
    510     Use rfc822.AddressList instead.
    511     """
    512 
    513     def __init__(self, field):
    514         """Initialize a new instance.
    515 
    516         `field' is an unparsed address header field, containing one or more
    517         addresses.
    518         """
    519         self.specials = '()<>@,:;.\"[]'
    520         self.pos = 0
    521         self.LWS = ' \t'
    522         self.CR = '\r\n'
    523         self.atomends = self.specials + self.LWS + self.CR
    524         # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
    525         # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
    526         # syntax, so allow dots in phrases.
    527         self.phraseends = self.atomends.replace('.', '')
    528         self.field = field
    529         self.commentlist = []
    530 
    531     def gotonext(self):
    532         """Parse up to the start of the next address."""
    533         while self.pos < len(self.field):
    534             if self.field[self.pos] in self.LWS + '\n\r':
    535                 self.pos = self.pos + 1
    536             elif self.field[self.pos] == '(':
    537                 self.commentlist.append(self.getcomment())
    538             else: break
    539 
    540     def getaddrlist(self):
    541         """Parse all addresses.
    542 
    543         Returns a list containing all of the addresses.
    544         """
    545         result = []
    546         ad = self.getaddress()
    547         while ad:
    548             result += ad
    549             ad = self.getaddress()
    550         return result
    551 
    552     def getaddress(self):
    553         """Parse the next address."""
    554         self.commentlist = []
    555         self.gotonext()
    556 
    557         oldpos = self.pos
    558         oldcl = self.commentlist
    559         plist = self.getphraselist()
    560 
    561         self.gotonext()
    562         returnlist = []
    563 
    564         if self.pos >= len(self.field):
    565             # Bad email address technically, no domain.
    566             if plist:
    567                 returnlist = [(' '.join(self.commentlist), plist[0])]
    568 
    569         elif self.field[self.pos] in '.@':
    570             # email address is just an addrspec
    571             # this isn't very efficient since we start over
    572             self.pos = oldpos
    573             self.commentlist = oldcl
    574             addrspec = self.getaddrspec()
    575             returnlist = [(' '.join(self.commentlist), addrspec)]
    576 
    577         elif self.field[self.pos] == ':':
    578             # address is a group
    579             returnlist = []
    580 
    581             fieldlen = len(self.field)
    582             self.pos += 1
    583             while self.pos < len(self.field):
    584                 self.gotonext()
    585                 if self.pos < fieldlen and self.field[self.pos] == ';':
    586                     self.pos += 1
    587                     break
    588                 returnlist = returnlist + self.getaddress()
    589 
    590         elif self.field[self.pos] == '<':
    591             # Address is a phrase then a route addr
    592             routeaddr = self.getrouteaddr()
    593 
    594             if self.commentlist:
    595                 returnlist = [(' '.join(plist) + ' (' + \
    596                          ' '.join(self.commentlist) + ')', routeaddr)]
    597             else: returnlist = [(' '.join(plist), routeaddr)]
    598 
    599         else:
    600             if plist:
    601                 returnlist = [(' '.join(self.commentlist), plist[0])]
    602             elif self.field[self.pos] in self.specials:
    603                 self.pos += 1
    604 
    605         self.gotonext()
    606         if self.pos < len(self.field) and self.field[self.pos] == ',':
    607             self.pos += 1
    608         return returnlist
    609 
    610     def getrouteaddr(self):
    611         """Parse a route address (Return-path value).
    612 
    613         This method just skips all the route stuff and returns the addrspec.
    614         """
    615         if self.field[self.pos] != '<':
    616             return
    617 
    618         expectroute = 0
    619         self.pos += 1
    620         self.gotonext()
    621         adlist = ""
    622         while self.pos < len(self.field):
    623             if expectroute:
    624                 self.getdomain()
    625                 expectroute = 0
    626             elif self.field[self.pos] == '>':
    627                 self.pos += 1
    628                 break
    629             elif self.field[self.pos] == '@':
    630                 self.pos += 1
    631                 expectroute = 1
    632             elif self.field[self.pos] == ':':
    633                 self.pos += 1
    634             else:
    635                 adlist = self.getaddrspec()
    636                 self.pos += 1
    637                 break
    638             self.gotonext()
    639 
    640         return adlist
    641 
    642     def getaddrspec(self):
    643         """Parse an RFC 2822 addr-spec."""
    644         aslist = []
    645 
    646         self.gotonext()
    647         while self.pos < len(self.field):
    648             if self.field[self.pos] == '.':
    649                 aslist.append('.')
    650                 self.pos += 1
    651             elif self.field[self.pos] == '"':
    652                 aslist.append('"%s"' % self.getquote())
    653             elif self.field[self.pos] in self.atomends:
    654                 break
    655             else: aslist.append(self.getatom())
    656             self.gotonext()
    657 
    658         if self.pos >= len(self.field) or self.field[self.pos] != '@':
    659             return ''.join(aslist)
    660 
    661         aslist.append('@')
    662         self.pos += 1
    663         self.gotonext()
    664         return ''.join(aslist) + self.getdomain()
    665 
    666     def getdomain(self):
    667         """Get the complete domain name from an address."""
    668         sdlist = []
    669         while self.pos < len(self.field):
    670             if self.field[self.pos] in self.LWS:
    671                 self.pos += 1
    672             elif self.field[self.pos] == '(':
    673                 self.commentlist.append(self.getcomment())
    674             elif self.field[self.pos] == '[':
    675                 sdlist.append(self.getdomainliteral())
    676             elif self.field[self.pos] == '.':
    677                 self.pos += 1
    678                 sdlist.append('.')
    679             elif self.field[self.pos] in self.atomends:
    680                 break
    681             else: sdlist.append(self.getatom())
    682         return ''.join(sdlist)
    683 
    684     def getdelimited(self, beginchar, endchars, allowcomments = 1):
    685         """Parse a header fragment delimited by special characters.
    686 
    687         `beginchar' is the start character for the fragment.  If self is not
    688         looking at an instance of `beginchar' then getdelimited returns the
    689         empty string.
    690 
    691         `endchars' is a sequence of allowable end-delimiting characters.
    692         Parsing stops when one of these is encountered.
    693 
    694         If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
    695         within the parsed fragment.
    696         """
    697         if self.field[self.pos] != beginchar:
    698             return ''
    699 
    700         slist = ['']
    701         quote = 0
    702         self.pos += 1
    703         while self.pos < len(self.field):
    704             if quote == 1:
    705                 slist.append(self.field[self.pos])
    706                 quote = 0
    707             elif self.field[self.pos] in endchars:
    708                 self.pos += 1
    709                 break
    710             elif allowcomments and self.field[self.pos] == '(':
    711                 slist.append(self.getcomment())
    712                 continue        # have already advanced pos from getcomment
    713             elif self.field[self.pos] == '\\':
    714                 quote = 1
    715             else:
    716                 slist.append(self.field[self.pos])
    717             self.pos += 1
    718 
    719         return ''.join(slist)
    720 
    721     def getquote(self):
    722         """Get a quote-delimited fragment from self's field."""
    723         return self.getdelimited('"', '"\r', 0)
    724 
    725     def getcomment(self):
    726         """Get a parenthesis-delimited fragment from self's field."""
    727         return self.getdelimited('(', ')\r', 1)
    728 
    729     def getdomainliteral(self):
    730         """Parse an RFC 2822 domain-literal."""
    731         return '[%s]' % self.getdelimited('[', ']\r', 0)
    732 
    733     def getatom(self, atomends=None):
    734         """Parse an RFC 2822 atom.
    735 
    736         Optional atomends specifies a different set of end token delimiters
    737         (the default is to use self.atomends).  This is used e.g. in
    738         getphraselist() since phrase endings must not include the `.' (which
    739         is legal in phrases)."""
    740         atomlist = ['']
    741         if atomends is None:
    742             atomends = self.atomends
    743 
    744         while self.pos < len(self.field):
    745             if self.field[self.pos] in atomends:
    746                 break
    747             else: atomlist.append(self.field[self.pos])
    748             self.pos += 1
    749 
    750         return ''.join(atomlist)
    751 
    752     def getphraselist(self):
    753         """Parse a sequence of RFC 2822 phrases.
    754 
    755         A phrase is a sequence of words, which are in turn either RFC 2822
    756         atoms or quoted-strings.  Phrases are canonicalized by squeezing all
    757         runs of continuous whitespace into one space.
    758         """
    759         plist = []
    760 
    761         while self.pos < len(self.field):
    762             if self.field[self.pos] in self.LWS:
    763                 self.pos += 1
    764             elif self.field[self.pos] == '"':
    765                 plist.append(self.getquote())
    766             elif self.field[self.pos] == '(':
    767                 self.commentlist.append(self.getcomment())
    768             elif self.field[self.pos] in self.phraseends:
    769                 break
    770             else:
    771                 plist.append(self.getatom(self.phraseends))
    772 
    773         return plist
    774 
    775 class AddressList(AddrlistClass):
    776     """An AddressList encapsulates a list of parsed RFC 2822 addresses."""
    777     def __init__(self, field):
    778         AddrlistClass.__init__(self, field)
    779         if field:
    780             self.addresslist = self.getaddrlist()
    781         else:
    782             self.addresslist = []
    783 
    784     def __len__(self):
    785         return len(self.addresslist)
    786 
    787     def __str__(self):
    788         return ", ".join(map(dump_address_pair, self.addresslist))
    789 
    790     def __add__(self, other):
    791         # Set union
    792         newaddr = AddressList(None)
    793         newaddr.addresslist = self.addresslist[:]
    794         for x in other.addresslist:
    795             if not x in self.addresslist:
    796                 newaddr.addresslist.append(x)
    797         return newaddr
    798 
    799     def __iadd__(self, other):
    800         # Set union, in-place
    801         for x in other.addresslist:
    802             if not x in self.addresslist:
    803                 self.addresslist.append(x)
    804         return self
    805 
    806     def __sub__(self, other):
    807         # Set difference
    808         newaddr = AddressList(None)
    809         for x in self.addresslist:
    810             if not x in other.addresslist:
    811                 newaddr.addresslist.append(x)
    812         return newaddr
    813 
    814     def __isub__(self, other):
    815         # Set difference, in-place
    816         for x in other.addresslist:
    817             if x in self.addresslist:
    818                 self.addresslist.remove(x)
    819         return self
    820 
    821     def __getitem__(self, index):
    822         # Make indexing, slices, and 'in' work
    823         return self.addresslist[index]
    824 
    825 def dump_address_pair(pair):
    826     """Dump a (name, address) pair in a canonicalized form."""
    827     if pair[0]:
    828         return '"' + pair[0] + '" <' + pair[1] + '>'
    829     else:
    830         return pair[1]
    831 
    832 # Parse a date field
    833 
    834 _monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
    835                'aug', 'sep', 'oct', 'nov', 'dec',
    836                'january', 'february', 'march', 'april', 'may', 'june', 'july',
    837                'august', 'september', 'october', 'november', 'december']
    838 _daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']
    839 
    840 # The timezone table does not include the military time zones defined
    841 # in RFC822, other than Z.  According to RFC1123, the description in
    842 # RFC822 gets the signs wrong, so we can't rely on any such time
    843 # zones.  RFC1123 recommends that numeric timezone indicators be used
    844 # instead of timezone names.
    845 
    846 _timezones = {'UT':0, 'UTC':0, 'GMT':0, 'Z':0,
    847               'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
    848               'EST': -500, 'EDT': -400,  # Eastern
    849               'CST': -600, 'CDT': -500,  # Central
    850               'MST': -700, 'MDT': -600,  # Mountain
    851               'PST': -800, 'PDT': -700   # Pacific
    852               }
    853 
    854 
    855 def parsedate_tz(data):
    856     """Convert a date string to a time tuple.
    857 
    858     Accounts for military timezones.
    859     """
    860     if not data:
    861         return None
    862     data = data.split()
    863     if data[0][-1] in (',', '.') or data[0].lower() in _daynames:
    864         # There's a dayname here. Skip it
    865         del data[0]
    866     else:
    867         # no space after the "weekday,"?
    868         i = data[0].rfind(',')
    869         if i >= 0:
    870             data[0] = data[0][i+1:]
    871     if len(data) == 3: # RFC 850 date, deprecated
    872         stuff = data[0].split('-')
    873         if len(stuff) == 3:
    874             data = stuff + data[1:]
    875     if len(data) == 4:
    876         s = data[3]
    877         i = s.find('+')
    878         if i > 0:
    879             data[3:] = [s[:i], s[i+1:]]
    880         else:
    881             data.append('') # Dummy tz
    882     if len(data) < 5:
    883         return None
    884     data = data[:5]
    885     [dd, mm, yy, tm, tz] = data
    886     mm = mm.lower()
    887     if not mm in _monthnames:
    888         dd, mm = mm, dd.lower()
    889         if not mm in _monthnames:
    890             return None
    891     mm = _monthnames.index(mm)+1
    892     if mm > 12: mm = mm - 12
    893     if dd[-1] == ',':
    894         dd = dd[:-1]
    895     i = yy.find(':')
    896     if i > 0:
    897         yy, tm = tm, yy
    898     if yy[-1] == ',':
    899         yy = yy[:-1]
    900     if not yy[0].isdigit():
    901         yy, tz = tz, yy
    902     if tm[-1] == ',':
    903         tm = tm[:-1]
    904     tm = tm.split(':')
    905     if len(tm) == 2:
    906         [thh, tmm] = tm
    907         tss = '0'
    908     elif len(tm) == 3:
    909         [thh, tmm, tss] = tm
    910     else:
    911         return None
    912     try:
    913         yy = int(yy)
    914         dd = int(dd)
    915         thh = int(thh)
    916         tmm = int(tmm)
    917         tss = int(tss)
    918     except ValueError:
    919         return None
    920     tzoffset = None
    921     tz = tz.upper()
    922     if tz in _timezones:
    923         tzoffset = _timezones[tz]
    924     else:
    925         try:
    926             tzoffset = int(tz)
    927         except ValueError:
    928             pass
    929     # Convert a timezone offset into seconds ; -0500 -> -18000
    930     if tzoffset:
    931         if tzoffset < 0:
    932             tzsign = -1
    933             tzoffset = -tzoffset
    934         else:
    935             tzsign = 1
    936         tzoffset = tzsign * ( (tzoffset//100)*3600 + (tzoffset % 100)*60)
    937     return (yy, mm, dd, thh, tmm, tss, 0, 1, 0, tzoffset)
    938 
    939 
    940 def parsedate(data):
    941     """Convert a time string to a time tuple."""
    942     t = parsedate_tz(data)
    943     if t is None:
    944         return t
    945     return t[:9]
    946 
    947 
    948 def mktime_tz(data):
    949     """Turn a 10-tuple as returned by parsedate_tz() into a UTC timestamp."""
    950     if data[9] is None:
    951         # No zone info, so localtime is better assumption than GMT
    952         return time.mktime(data[:8] + (-1,))
    953     else:
    954         t = time.mktime(data[:8] + (0,))
    955         return t - data[9] - time.timezone
    956 
    957 def formatdate(timeval=None):
    958     """Returns time format preferred for Internet standards.
    959 
    960     Sun, 06 Nov 1994 08:49:37 GMT  ; RFC 822, updated by RFC 1123
    961 
    962     According to RFC 1123, day and month names must always be in
    963     English.  If not for that, this code could use strftime().  It
    964     can't because strftime() honors the locale and could generate
    965     non-English names.
    966     """
    967     if timeval is None:
    968         timeval = time.time()
    969     timeval = time.gmtime(timeval)
    970     return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
    971             ("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")[timeval[6]],
    972             timeval[2],
    973             ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
    974              "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")[timeval[1]-1],
    975                                 timeval[0], timeval[3], timeval[4], timeval[5])
    976 
    977 
    978 # When used as script, run a small test program.
    979 # The first command line argument must be a filename containing one
    980 # message in RFC-822 format.
    981 
    982 if __name__ == '__main__':
    983     import sys, os
    984     file = os.path.join(os.environ['HOME'], 'Mail/inbox/1')
    985     if sys.argv[1:]: file = sys.argv[1]
    986     f = open(file, 'r')
    987     m = Message(f)
    988     print 'From:', m.getaddr('from')
    989     print 'To:', m.getaddrlist('to')
    990     print 'Subject:', m.getheader('subject')
    991     print 'Date:', m.getheader('date')
    992     date = m.getdate_tz('date')
    993     tz = date[-1]
    994     date = time.localtime(mktime_tz(date))
    995     if date:
    996         print 'ParsedDate:', time.asctime(date),
    997         hhmmss = tz
    998         hhmm, ss = divmod(hhmmss, 60)
    999         hh, mm = divmod(hhmm, 60)
   1000         print "%+03d%02d" % (hh, mm),
   1001         if ss: print ".%02d" % ss,
   1002         print
   1003     else:
   1004         print 'ParsedDate:', None
   1005     m.rewindbody()
   1006     n = 0
   1007     while f.readline():
   1008         n += 1
   1009     print 'Lines:', n
   1010     print '-'*70
   1011     print 'len =', len(m)
   1012     if 'Date' in m: print 'Date =', m['Date']
   1013     if 'X-Nonsense' in m: pass
   1014     print 'keys =', m.keys()
   1015     print 'values =', m.values()
   1016     print 'items =', m.items()
   1017