Home | History | Annotate | Download | only in python2.7
      1 """RFC 2822 message manipulation.
      2 
      3 Note: This is only a very rough sketch of a full RFC-822 parser; in particular
      4 the tokenizing of addresses does not adhere to all the quoting rules.
      5 
      6 Note: RFC 2822 is a long awaited update to RFC 822.  This module should
      7 conform to RFC 2822, and is thus mis-named (it's not worth renaming it).  Some
      8 efforts at RFC 2822 updates have been made, but a thorough audit has not been
      9 performed.  Consider any RFC 2822 non-conformance to be a bug.
     10 
     11     RFC 2822: http://www.faqs.org/rfcs/rfc2822.html
     12     RFC 822 : http://www.faqs.org/rfcs/rfc822.html (obsolete)
     13 
     14 Directions for use:
     15 
     16 To create a Message object: first open a file, e.g.:
     17 
     18   fp = open(file, 'r')
     19 
     20 You can use any other legal way of getting an open file object, e.g. use
     21 sys.stdin or call os.popen().  Then pass the open file object to the Message()
     22 constructor:
     23 
     24   m = Message(fp)
     25 
     26 This class can work with any input object that supports a readline method.  If
     27 the input object has seek and tell capability, the rewindbody method will
     28 work; also illegal lines will be pushed back onto the input stream.  If the
     29 input object lacks seek but has an `unread' method that can push back a line
     30 of input, Message will use that to push back illegal lines.  Thus this class
     31 can be used to parse messages coming from a buffered stream.
     32 
     33 The optional `seekable' argument is provided as a workaround for certain stdio
     34 libraries in which tell() discards buffered data before discovering that the
     35 lseek() system call doesn't work.  For maximum portability, you should set the
     36 seekable argument to zero to prevent that initial tell() call when passing in
     37 an unseekable object such as a file object created from a socket object.  If
     38 it is 1 on entry -- which it is by default -- the tell() method of the open
     39 file object is called once; if this raises an exception, seekable is reset to
     40 0.  For other nonzero values of seekable, this test is not made.
     41 
     42 To get the text of a particular header there are several methods:
     43 
     44   str = m.getheader(name)
     45   str = m.getrawheader(name)
     46 
     47 where name is the name of the header, e.g. 'Subject'.  The difference is that
     48 getheader() strips the leading and trailing whitespace, while getrawheader()
     49 doesn't.  Both functions retain embedded whitespace (including newlines)
     50 exactly as they are specified in the header, and leave the case of the text
     51 unchanged.
     52 
     53 For addresses and address lists there are functions
     54 
     55   realname, mailaddress = m.getaddr(name)
     56   list = m.getaddrlist(name)
     57 
     58 where the latter returns a list of (realname, mailaddr) tuples.
     59 
     60 There is also a method
     61 
     62   time = m.getdate(name)
     63 
     64 which parses a Date-like field and returns a time-compatible tuple,
     65 i.e. a tuple such as returned by time.localtime() or accepted by
     66 time.mktime().
     67 
     68 See the class definition for lower level access methods.
     69 
     70 There are also some utility functions here.
     71 """
     72 # Cleanup and extensions by Eric S. Raymond <esr@thyrsus.com>
     73 
     74 import time
     75 
     76 from warnings import warnpy3k
     77 warnpy3k("in 3.x, rfc822 has been removed in favor of the email package",
     78          stacklevel=2)
     79 
     80 __all__ = ["Message","AddressList","parsedate","parsedate_tz","mktime_tz"]
     81 
     82 _blanklines = ('\r\n', '\n')            # Optimization for islast()
     83 
     84 
     85 class Message:
     86     """Represents a single RFC 2822-compliant message."""
     87 
     88     def __init__(self, fp, seekable = 1):
     89         """Initialize the class instance and read the headers."""
     90         if seekable == 1:
     91             # Exercise tell() to make sure it works
     92             # (and then assume seek() works, too)
     93             try:
     94                 fp.tell()
     95             except (AttributeError, IOError):
     96                 seekable = 0
     97         self.fp = fp
     98         self.seekable = seekable
     99         self.startofheaders = None
    100         self.startofbody = None
    101         #
    102         if self.seekable:
    103             try:
    104                 self.startofheaders = self.fp.tell()
    105             except IOError:
    106                 self.seekable = 0
    107         #
    108         self.readheaders()
    109         #
    110         if self.seekable:
    111             try:
    112                 self.startofbody = self.fp.tell()
    113             except IOError:
    114                 self.seekable = 0
    115 
    116     def rewindbody(self):
    117         """Rewind the file to the start of the body (if seekable)."""
    118         if not self.seekable:
    119             raise IOError, "unseekable file"
    120         self.fp.seek(self.startofbody)
    121 
    122     def readheaders(self):
    123         """Read header lines.
    124 
    125         Read header lines up to the entirely blank line that terminates them.
    126         The (normally blank) line that ends the headers is skipped, but not
    127         included in the returned list.  If a non-header line ends the headers,
    128         (which is an error), an attempt is made to backspace over it; it is
    129         never included in the returned list.
    130 
    131         The variable self.status is set to the empty string if all went well,
    132         otherwise it is an error message.  The variable self.headers is a
    133         completely uninterpreted list of lines contained in the header (so
    134         printing them will reproduce the header exactly as it appears in the
    135         file).
    136         """
    137         self.dict = {}
    138         self.unixfrom = ''
    139         self.headers = lst = []
    140         self.status = ''
    141         headerseen = ""
    142         firstline = 1
    143         startofline = unread = tell = None
    144         if hasattr(self.fp, 'unread'):
    145             unread = self.fp.unread
    146         elif self.seekable:
    147             tell = self.fp.tell
    148         while 1:
    149             if tell:
    150                 try:
    151                     startofline = tell()
    152                 except IOError:
    153                     startofline = tell = None
    154                     self.seekable = 0
    155             line = self.fp.readline()
    156             if not line:
    157                 self.status = 'EOF in headers'
    158                 break
    159             # Skip unix From name time lines
    160             if firstline and line.startswith('From '):
    161                 self.unixfrom = self.unixfrom + line
    162                 continue
    163             firstline = 0
    164             if headerseen and line[0] in ' \t':
    165                 # It's a continuation line.
    166                 lst.append(line)
    167                 x = (self.dict[headerseen] + "\n " + line.strip())
    168                 self.dict[headerseen] = x.strip()
    169                 continue
    170             elif self.iscomment(line):
    171                 # It's a comment.  Ignore it.
    172                 continue
    173             elif self.islast(line):
    174                 # Note! No pushback here!  The delimiter line gets eaten.
    175                 break
    176             headerseen = self.isheader(line)
    177             if headerseen:
    178                 # It's a legal header line, save it.
    179                 lst.append(line)
    180                 self.dict[headerseen] = line[len(headerseen)+1:].strip()
    181                 continue
    182             else:
    183                 # It's not a header line; throw it back and stop here.
    184                 if not self.dict:
    185                     self.status = 'No headers'
    186                 else:
    187                     self.status = 'Non-header line where header expected'
    188                 # Try to undo the read.
    189                 if unread:
    190                     unread(line)
    191                 elif tell:
    192                     self.fp.seek(startofline)
    193                 else:
    194                     self.status = self.status + '; bad seek'
    195                 break
    196 
    197     def isheader(self, line):
    198         """Determine whether a given line is a legal header.
    199 
    200         This method should return the header name, suitably canonicalized.
    201         You may override this method in order to use Message parsing on tagged
    202         data in RFC 2822-like formats with special header formats.
    203         """
    204         i = line.find(':')
    205         if i > 0:
    206             return line[:i].lower()
    207         return None
    208 
    209     def islast(self, line):
    210         """Determine whether a line is a legal end of RFC 2822 headers.
    211 
    212         You may override this method if your application wants to bend the
    213         rules, e.g. to strip trailing whitespace, or to recognize MH template
    214         separators ('--------').  For convenience (e.g. for code reading from
    215         sockets) a line consisting of \\r\\n also matches.
    216         """
    217         return line in _blanklines
    218 
    219     def iscomment(self, line):
    220         """Determine whether a line should be skipped entirely.
    221 
    222         You may override this method in order to use Message parsing on tagged
    223         data in RFC 2822-like formats that support embedded comments or
    224         free-text data.
    225         """
    226         return False
    227 
    228     def getallmatchingheaders(self, name):
    229         """Find all header lines matching a given header name.
    230 
    231         Look through the list of headers and find all lines matching a given
    232         header name (and their continuation lines).  A list of the lines is
    233         returned, without interpretation.  If the header does not occur, an
    234         empty list is returned.  If the header occurs multiple times, all
    235         occurrences are returned.  Case is not important in the header name.
    236         """
    237         name = name.lower() + ':'
    238         n = len(name)
    239         lst = []
    240         hit = 0
    241         for line in self.headers:
    242             if line[:n].lower() == name:
    243                 hit = 1
    244             elif not line[:1].isspace():
    245                 hit = 0
    246             if hit:
    247                 lst.append(line)
    248         return lst
    249 
    250     def getfirstmatchingheader(self, name):
    251         """Get the first header line matching name.
    252 
    253         This is similar to getallmatchingheaders, but it returns only the
    254         first matching header (and its continuation lines).
    255         """
    256         name = name.lower() + ':'
    257         n = len(name)
    258         lst = []
    259         hit = 0
    260         for line in self.headers:
    261             if hit:
    262                 if not line[:1].isspace():
    263                     break
    264             elif line[:n].lower() == name:
    265                 hit = 1
    266             if hit:
    267                 lst.append(line)
    268         return lst
    269 
    270     def getrawheader(self, name):
    271         """A higher-level interface to getfirstmatchingheader().
    272 
    273         Return a string containing the literal text of the header but with the
    274         keyword stripped.  All leading, trailing and embedded whitespace is
    275         kept in the string, however.  Return None if the header does not
    276         occur.
    277         """
    278 
    279         lst = self.getfirstmatchingheader(name)
    280         if not lst:
    281             return None
    282         lst[0] = lst[0][len(name) + 1:]
    283         return ''.join(lst)
    284 
    285     def getheader(self, name, default=None):
    286         """Get the header value for a name.
    287 
    288         This is the normal interface: it returns a stripped version of the
    289         header value for a given header name, or None if it doesn't exist.
    290         This uses the dictionary version which finds the *last* such header.
    291         """
    292         return self.dict.get(name.lower(), default)
    293     get = getheader
    294 
    295     def getheaders(self, name):
    296         """Get all values for a header.
    297 
    298         This returns a list of values for headers given more than once; each
    299         value in the result list is stripped in the same way as the result of
    300         getheader().  If the header is not given, return an empty list.
    301         """
    302         result = []
    303         current = ''
    304         have_header = 0
    305         for s in self.getallmatchingheaders(name):
    306             if s[0].isspace():
    307                 if current:
    308                     current = "%s\n %s" % (current, s.strip())
    309                 else:
    310                     current = s.strip()
    311             else:
    312                 if have_header:
    313                     result.append(current)
    314                 current = s[s.find(":") + 1:].strip()
    315                 have_header = 1
    316         if have_header:
    317             result.append(current)
    318         return result
    319 
    320     def getaddr(self, name):
    321         """Get a single address from a header, as a tuple.
    322 
    323         An example return value:
    324         ('Guido van Rossum', 'guido@cwi.nl')
    325         """
    326         # New, by Ben Escoto
    327         alist = self.getaddrlist(name)
    328         if alist:
    329             return alist[0]
    330         else:
    331             return (None, None)
    332 
    333     def getaddrlist(self, name):
    334         """Get a list of addresses from a header.
    335 
    336         Retrieves a list of addresses from a header, where each address is a
    337         tuple as returned by getaddr().  Scans all named headers, so it works
    338         properly with multiple To: or Cc: headers for example.
    339         """
    340         raw = []
    341         for h in self.getallmatchingheaders(name):
    342             if h[0] in ' \t':
    343                 raw.append(h)
    344             else:
    345                 if raw:
    346                     raw.append(', ')
    347                 i = h.find(':')
    348                 if i > 0:
    349                     addr = h[i+1:]
    350                 raw.append(addr)
    351         alladdrs = ''.join(raw)
    352         a = AddressList(alladdrs)
    353         return a.addresslist
    354 
    355     def getdate(self, name):
    356         """Retrieve a date field from a header.
    357 
    358         Retrieves a date field from the named header, returning a tuple
    359         compatible with time.mktime().
    360         """
    361         try:
    362             data = self[name]
    363         except KeyError:
    364             return None
    365         return parsedate(data)
    366 
    367     def getdate_tz(self, name):
    368         """Retrieve a date field from a header as a 10-tuple.
    369 
    370         The first 9 elements make up a tuple compatible with time.mktime(),
    371         and the 10th is the offset of the poster's time zone from GMT/UTC.
    372         """
    373         try:
    374             data = self[name]
    375         except KeyError:
    376             return None
    377         return parsedate_tz(data)
    378 
    379 
    380     # Access as a dictionary (only finds *last* header of each type):
    381 
    382     def __len__(self):
    383         """Get the number of headers in a message."""
    384         return len(self.dict)
    385 
    386     def __getitem__(self, name):
    387         """Get a specific header, as from a dictionary."""
    388         return self.dict[name.lower()]
    389 
    390     def __setitem__(self, name, value):
    391         """Set the value of a header.
    392 
    393         Note: This is not a perfect inversion of __getitem__, because any
    394         changed headers get stuck at the end of the raw-headers list rather
    395         than where the altered header was.
    396         """
    397         del self[name] # Won't fail if it doesn't exist
    398         self.dict[name.lower()] = value
    399         text = name + ": " + value
    400         for line in text.split("\n"):
    401             self.headers.append(line + "\n")
    402 
    403     def __delitem__(self, name):
    404         """Delete all occurrences of a specific header, if it is present."""
    405         name = name.lower()
    406         if not name in self.dict:
    407             return
    408         del self.dict[name]
    409         name = name + ':'
    410         n = len(name)
    411         lst = []
    412         hit = 0
    413         for i in range(len(self.headers)):
    414             line = self.headers[i]
    415             if line[:n].lower() == name:
    416                 hit = 1
    417             elif not line[:1].isspace():
    418                 hit = 0
    419             if hit:
    420                 lst.append(i)
    421         for i in reversed(lst):
    422             del self.headers[i]
    423 
    424     def setdefault(self, name, default=""):
    425         lowername = name.lower()
    426         if lowername in self.dict:
    427             return self.dict[lowername]
    428         else:
    429             text = name + ": " + default
    430             for line in text.split("\n"):
    431                 self.headers.append(line + "\n")
    432             self.dict[lowername] = default
    433             return default
    434 
    435     def has_key(self, name):
    436         """Determine whether a message contains the named header."""
    437         return name.lower() in self.dict
    438 
    439     def __contains__(self, name):
    440         """Determine whether a message contains the named header."""
    441         return name.lower() in self.dict
    442 
    443     def __iter__(self):
    444         return iter(self.dict)
    445 
    446     def keys(self):
    447         """Get all of a message's header field names."""
    448         return self.dict.keys()
    449 
    450     def values(self):
    451         """Get all of a message's header field values."""
    452         return self.dict.values()
    453 
    454     def items(self):
    455         """Get all of a message's headers.
    456 
    457         Returns a list of name, value tuples.
    458         """
    459         return self.dict.items()
    460 
    461     def __str__(self):
    462         return ''.join(self.headers)
    463 
    464 
    465 # Utility functions
    466 # -----------------
    467 
    468 # XXX Should fix unquote() and quote() to be really conformant.
    469 # XXX The inverses of the parse functions may also be useful.
    470 
    471 
    472 def unquote(s):
    473     """Remove quotes from a string."""
    474     if len(s) > 1:
    475         if s.startswith('"') and s.endswith('"'):
    476             return s[1:-1].replace('\\\\', '\\').replace('\\"', '"')
    477         if s.startswith('<') and s.endswith('>'):
    478             return s[1:-1]
    479     return s
    480 
    481 
    482 def quote(s):
    483     """Add quotes around a string."""
    484     return s.replace('\\', '\\\\').replace('"', '\\"')
    485 
    486 
    487 def parseaddr(address):
    488     """Parse an address into a (realname, mailaddr) tuple."""
    489     a = AddressList(address)
    490     lst = a.addresslist
    491     if not lst:
    492         return (None, None)
    493     return lst[0]
    494 
    495 
    496 class AddrlistClass:
    497     """Address parser class by Ben Escoto.
    498 
    499     To understand what this class does, it helps to have a copy of
    500     RFC 2822 in front of you.
    501 
    502     http://www.faqs.org/rfcs/rfc2822.html
    503 
    504     Note: this class interface is deprecated and may be removed in the future.
    505     Use rfc822.AddressList instead.
    506     """
    507 
    508     def __init__(self, field):
    509         """Initialize a new instance.
    510 
    511         `field' is an unparsed address header field, containing one or more
    512         addresses.
    513         """
    514         self.specials = '()<>@,:;.\"[]'
    515         self.pos = 0
    516         self.LWS = ' \t'
    517         self.CR = '\r\n'
    518         self.atomends = self.specials + self.LWS + self.CR
    519         # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
    520         # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
    521         # syntax, so allow dots in phrases.
    522         self.phraseends = self.atomends.replace('.', '')
    523         self.field = field
    524         self.commentlist = []
    525 
    526     def gotonext(self):
    527         """Parse up to the start of the next address."""
    528         while self.pos < len(self.field):
    529             if self.field[self.pos] in self.LWS + '\n\r':
    530                 self.pos = self.pos + 1
    531             elif self.field[self.pos] == '(':
    532                 self.commentlist.append(self.getcomment())
    533             else: break
    534 
    535     def getaddrlist(self):
    536         """Parse all addresses.
    537 
    538         Returns a list containing all of the addresses.
    539         """
    540         result = []
    541         ad = self.getaddress()
    542         while ad:
    543             result += ad
    544             ad = self.getaddress()
    545         return result
    546 
    547     def getaddress(self):
    548         """Parse the next address."""
    549         self.commentlist = []
    550         self.gotonext()
    551 
    552         oldpos = self.pos
    553         oldcl = self.commentlist
    554         plist = self.getphraselist()
    555 
    556         self.gotonext()
    557         returnlist = []
    558 
    559         if self.pos >= len(self.field):
    560             # Bad email address technically, no domain.
    561             if plist:
    562                 returnlist = [(' '.join(self.commentlist), plist[0])]
    563 
    564         elif self.field[self.pos] in '.@':
    565             # email address is just an addrspec
    566             # this isn't very efficient since we start over
    567             self.pos = oldpos
    568             self.commentlist = oldcl
    569             addrspec = self.getaddrspec()
    570             returnlist = [(' '.join(self.commentlist), addrspec)]
    571 
    572         elif self.field[self.pos] == ':':
    573             # address is a group
    574             returnlist = []
    575 
    576             fieldlen = len(self.field)
    577             self.pos += 1
    578             while self.pos < len(self.field):
    579                 self.gotonext()
    580                 if self.pos < fieldlen and self.field[self.pos] == ';':
    581                     self.pos += 1
    582                     break
    583                 returnlist = returnlist + self.getaddress()
    584 
    585         elif self.field[self.pos] == '<':
    586             # Address is a phrase then a route addr
    587             routeaddr = self.getrouteaddr()
    588 
    589             if self.commentlist:
    590                 returnlist = [(' '.join(plist) + ' (' + \
    591                          ' '.join(self.commentlist) + ')', routeaddr)]
    592             else: returnlist = [(' '.join(plist), routeaddr)]
    593 
    594         else:
    595             if plist:
    596                 returnlist = [(' '.join(self.commentlist), plist[0])]
    597             elif self.field[self.pos] in self.specials:
    598                 self.pos += 1
    599 
    600         self.gotonext()
    601         if self.pos < len(self.field) and self.field[self.pos] == ',':
    602             self.pos += 1
    603         return returnlist
    604 
    605     def getrouteaddr(self):
    606         """Parse a route address (Return-path value).
    607 
    608         This method just skips all the route stuff and returns the addrspec.
    609         """
    610         if self.field[self.pos] != '<':
    611             return
    612 
    613         expectroute = 0
    614         self.pos += 1
    615         self.gotonext()
    616         adlist = ""
    617         while self.pos < len(self.field):
    618             if expectroute:
    619                 self.getdomain()
    620                 expectroute = 0
    621             elif self.field[self.pos] == '>':
    622                 self.pos += 1
    623                 break
    624             elif self.field[self.pos] == '@':
    625                 self.pos += 1
    626                 expectroute = 1
    627             elif self.field[self.pos] == ':':
    628                 self.pos += 1
    629             else:
    630                 adlist = self.getaddrspec()
    631                 self.pos += 1
    632                 break
    633             self.gotonext()
    634 
    635         return adlist
    636 
    637     def getaddrspec(self):
    638         """Parse an RFC 2822 addr-spec."""
    639         aslist = []
    640 
    641         self.gotonext()
    642         while self.pos < len(self.field):
    643             if self.field[self.pos] == '.':
    644                 aslist.append('.')
    645                 self.pos += 1
    646             elif self.field[self.pos] == '"':
    647                 aslist.append('"%s"' % self.getquote())
    648             elif self.field[self.pos] in self.atomends:
    649                 break
    650             else: aslist.append(self.getatom())
    651             self.gotonext()
    652 
    653         if self.pos >= len(self.field) or self.field[self.pos] != '@':
    654             return ''.join(aslist)
    655 
    656         aslist.append('@')
    657         self.pos += 1
    658         self.gotonext()
    659         return ''.join(aslist) + self.getdomain()
    660 
    661     def getdomain(self):
    662         """Get the complete domain name from an address."""
    663         sdlist = []
    664         while self.pos < len(self.field):
    665             if self.field[self.pos] in self.LWS:
    666                 self.pos += 1
    667             elif self.field[self.pos] == '(':
    668                 self.commentlist.append(self.getcomment())
    669             elif self.field[self.pos] == '[':
    670                 sdlist.append(self.getdomainliteral())
    671             elif self.field[self.pos] == '.':
    672                 self.pos += 1
    673                 sdlist.append('.')
    674             elif self.field[self.pos] in self.atomends:
    675                 break
    676             else: sdlist.append(self.getatom())
    677         return ''.join(sdlist)
    678 
    679     def getdelimited(self, beginchar, endchars, allowcomments = 1):
    680         """Parse a header fragment delimited by special characters.
    681 
    682         `beginchar' is the start character for the fragment.  If self is not
    683         looking at an instance of `beginchar' then getdelimited returns the
    684         empty string.
    685 
    686         `endchars' is a sequence of allowable end-delimiting characters.
    687         Parsing stops when one of these is encountered.
    688 
    689         If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
    690         within the parsed fragment.
    691         """
    692         if self.field[self.pos] != beginchar:
    693             return ''
    694 
    695         slist = ['']
    696         quote = 0
    697         self.pos += 1
    698         while self.pos < len(self.field):
    699             if quote == 1:
    700                 slist.append(self.field[self.pos])
    701                 quote = 0
    702             elif self.field[self.pos] in endchars:
    703                 self.pos += 1
    704                 break
    705             elif allowcomments and self.field[self.pos] == '(':
    706                 slist.append(self.getcomment())
    707                 continue        # have already advanced pos from getcomment
    708             elif self.field[self.pos] == '\\':
    709                 quote = 1
    710             else:
    711                 slist.append(self.field[self.pos])
    712             self.pos += 1
    713 
    714         return ''.join(slist)
    715 
    716     def getquote(self):
    717         """Get a quote-delimited fragment from self's field."""
    718         return self.getdelimited('"', '"\r', 0)
    719 
    720     def getcomment(self):
    721         """Get a parenthesis-delimited fragment from self's field."""
    722         return self.getdelimited('(', ')\r', 1)
    723 
    724     def getdomainliteral(self):
    725         """Parse an RFC 2822 domain-literal."""
    726         return '[%s]' % self.getdelimited('[', ']\r', 0)
    727 
    728     def getatom(self, atomends=None):
    729         """Parse an RFC 2822 atom.
    730 
    731         Optional atomends specifies a different set of end token delimiters
    732         (the default is to use self.atomends).  This is used e.g. in
    733         getphraselist() since phrase endings must not include the `.' (which
    734         is legal in phrases)."""
    735         atomlist = ['']
    736         if atomends is None:
    737             atomends = self.atomends
    738 
    739         while self.pos < len(self.field):
    740             if self.field[self.pos] in atomends:
    741                 break
    742             else: atomlist.append(self.field[self.pos])
    743             self.pos += 1
    744 
    745         return ''.join(atomlist)
    746 
    def getphraselist(self):
        """Parse a sequence of RFC 2822 phrase words.

        Starting at self.pos, collect words until the end of the field or
        until a character from self.phraseends is met.  A word is either a
        quoted string (read with getquote()) or an atom (read with
        getatom() using self.phraseends as terminators).  Whitespace from
        self.LWS between words is skipped, and any parenthesized comment is
        read with getcomment() and appended to self.commentlist instead of
        the result.

        Returns the list of words in the order they were found.
        """
        words = []

        while self.pos < len(self.field):
            current = self.field[self.pos]
            if current in self.LWS:
                self.pos += 1
                continue
            # Quotes and comments are tested before the generic end-of-phrase
            # check, exactly in this order.
            if current == '"':
                words.append(self.getquote())
                continue
            if current == '(':
                self.commentlist.append(self.getcomment())
                continue
            if current in self.phraseends:
                break
            words.append(self.getatom(self.phraseends))

        return words
    769 
    770 class AddressList(AddrlistClass):
    771     """An AddressList encapsulates a list of parsed RFC 2822 addresses."""
    772     def __init__(self, field):
    773         AddrlistClass.__init__(self, field)
    774         if field:
    775             self.addresslist = self.getaddrlist()
    776         else:
    777             self.addresslist = []
    778 
    779     def __len__(self):
    780         return len(self.addresslist)
    781 
    782     def __str__(self):
    783         return ", ".join(map(dump_address_pair, self.addresslist))
    784 
    785     def __add__(self, other):
    786         # Set union
    787         newaddr = AddressList(None)
    788         newaddr.addresslist = self.addresslist[:]
    789         for x in other.addresslist:
    790             if not x in self.addresslist:
    791                 newaddr.addresslist.append(x)
    792         return newaddr
    793 
    794     def __iadd__(self, other):
    795         # Set union, in-place
    796         for x in other.addresslist:
    797             if not x in self.addresslist:
    798                 self.addresslist.append(x)
    799         return self
    800 
    801     def __sub__(self, other):
    802         # Set difference
    803         newaddr = AddressList(None)
    804         for x in self.addresslist:
    805             if not x in other.addresslist:
    806                 newaddr.addresslist.append(x)
    807         return newaddr
    808 
    809     def __isub__(self, other):
    810         # Set difference, in-place
    811         for x in other.addresslist:
    812             if x in self.addresslist:
    813                 self.addresslist.remove(x)
    814         return self
    815 
    816     def __getitem__(self, index):
    817         # Make indexing, slices, and 'in' work
    818         return self.addresslist[index]
    819 
    820 def dump_address_pair(pair):
    821     """Dump a (name, address) pair in a canonicalized form."""
    822     if pair[0]:
    823         return '"' + pair[0] + '" <' + pair[1] + '>'
    824     else:
    825         return pair[1]
    826 
    827 # Parse a date field
    828 
    829 _monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
    830                'aug', 'sep', 'oct', 'nov', 'dec',
    831                'january', 'february', 'march', 'april', 'may', 'june', 'july',
    832                'august', 'september', 'october', 'november', 'december']
    833 _daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']
    834 
    835 # The timezone table does not include the military time zones defined
    836 # in RFC822, other than Z.  According to RFC1123, the description in
    837 # RFC822 gets the signs wrong, so we can't rely on any such time
    838 # zones.  RFC1123 recommends that numeric timezone indicators be used
    839 # instead of timezone names.
    840 
    841 _timezones = {'UT':0, 'UTC':0, 'GMT':0, 'Z':0,
    842               'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
    843               'EST': -500, 'EDT': -400,  # Eastern
    844               'CST': -600, 'CDT': -500,  # Central
    845               'MST': -700, 'MDT': -600,  # Mountain
    846               'PST': -800, 'PDT': -700   # Pacific
    847               }
    848 
    849 
    850 def parsedate_tz(data):
    851     """Convert a date string to a time tuple.
    852 
    853     Accounts for military timezones.
    854     """
    855     if not data:
    856         return None
    857     data = data.split()
    858     if data[0][-1] in (',', '.') or data[0].lower() in _daynames:
    859         # There's a dayname here. Skip it
    860         del data[0]
    861     else:
    862         # no space after the "weekday,"?
    863         i = data[0].rfind(',')
    864         if i >= 0:
    865             data[0] = data[0][i+1:]
    866     if len(data) == 3: # RFC 850 date, deprecated
    867         stuff = data[0].split('-')
    868         if len(stuff) == 3:
    869             data = stuff + data[1:]
    870     if len(data) == 4:
    871         s = data[3]
    872         i = s.find('+')
    873         if i > 0:
    874             data[3:] = [s[:i], s[i+1:]]
    875         else:
    876             data.append('') # Dummy tz
    877     if len(data) < 5:
    878         return None
    879     data = data[:5]
    880     [dd, mm, yy, tm, tz] = data
    881     mm = mm.lower()
    882     if not mm in _monthnames:
    883         dd, mm = mm, dd.lower()
    884         if not mm in _monthnames:
    885             return None
    886     mm = _monthnames.index(mm)+1
    887     if mm > 12: mm = mm - 12
    888     if dd[-1] == ',':
    889         dd = dd[:-1]
    890     i = yy.find(':')
    891     if i > 0:
    892         yy, tm = tm, yy
    893     if yy[-1] == ',':
    894         yy = yy[:-1]
    895     if not yy[0].isdigit():
    896         yy, tz = tz, yy
    897     if tm[-1] == ',':
    898         tm = tm[:-1]
    899     tm = tm.split(':')
    900     if len(tm) == 2:
    901         [thh, tmm] = tm
    902         tss = '0'
    903     elif len(tm) == 3:
    904         [thh, tmm, tss] = tm
    905     else:
    906         return None
    907     try:
    908         yy = int(yy)
    909         dd = int(dd)
    910         thh = int(thh)
    911         tmm = int(tmm)
    912         tss = int(tss)
    913     except ValueError:
    914         return None
    915     tzoffset = None
    916     tz = tz.upper()
    917     if tz in _timezones:
    918         tzoffset = _timezones[tz]
    919     else:
    920         try:
    921             tzoffset = int(tz)
    922         except ValueError:
    923             pass
    924     # Convert a timezone offset into seconds ; -0500 -> -18000
    925     if tzoffset:
    926         if tzoffset < 0:
    927             tzsign = -1
    928             tzoffset = -tzoffset
    929         else:
    930             tzsign = 1
    931         tzoffset = tzsign * ( (tzoffset//100)*3600 + (tzoffset % 100)*60)
    932     return (yy, mm, dd, thh, tmm, tss, 0, 1, 0, tzoffset)
    933 
    934 
    935 def parsedate(data):
    936     """Convert a time string to a time tuple."""
    937     t = parsedate_tz(data)
    938     if t is None:
    939         return t
    940     return t[:9]
    941 
    942 
    943 def mktime_tz(data):
    944     """Turn a 10-tuple as returned by parsedate_tz() into a UTC timestamp."""
    945     if data[9] is None:
    946         # No zone info, so localtime is better assumption than GMT
    947         return time.mktime(data[:8] + (-1,))
    948     else:
    949         t = time.mktime(data[:8] + (0,))
    950         return t - data[9] - time.timezone
    951 
    952 def formatdate(timeval=None):
    953     """Returns time format preferred for Internet standards.
    954 
    955     Sun, 06 Nov 1994 08:49:37 GMT  ; RFC 822, updated by RFC 1123
    956 
    957     According to RFC 1123, day and month names must always be in
    958     English.  If not for that, this code could use strftime().  It
    959     can't because strftime() honors the locale and could generated
    960     non-English names.
    961     """
    962     if timeval is None:
    963         timeval = time.time()
    964     timeval = time.gmtime(timeval)
    965     return "%s, %02d %s %04d %02d:%02d:%02d GMT" % (
    966             ("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")[timeval[6]],
    967             timeval[2],
    968             ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
    969              "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")[timeval[1]-1],
    970                                 timeval[0], timeval[3], timeval[4], timeval[5])
    971 
    972 
    973 # When used as script, run a small test program.
    974 # The first command line argument must be a filename containing one
    975 # message in RFC-822 format.
    976 
    977 if __name__ == '__main__':
    978     import sys, os
    979     file = os.path.join(os.environ['HOME'], 'Mail/inbox/1')
    980     if sys.argv[1:]: file = sys.argv[1]
    981     f = open(file, 'r')
    982     m = Message(f)
    983     print 'From:', m.getaddr('from')
    984     print 'To:', m.getaddrlist('to')
    985     print 'Subject:', m.getheader('subject')
    986     print 'Date:', m.getheader('date')
    987     date = m.getdate_tz('date')
    988     tz = date[-1]
    989     date = time.localtime(mktime_tz(date))
    990     if date:
    991         print 'ParsedDate:', time.asctime(date),
    992         hhmmss = tz
    993         hhmm, ss = divmod(hhmmss, 60)
    994         hh, mm = divmod(hhmm, 60)
    995         print "%+03d%02d" % (hh, mm),
    996         if ss: print ".%02d" % ss,
    997         print
    998     else:
    999         print 'ParsedDate:', None
   1000     m.rewindbody()
   1001     n = 0
   1002     while f.readline():
   1003         n += 1
   1004     print 'Lines:', n
   1005     print '-'*70
   1006     print 'len =', len(m)
   1007     if 'Date' in m: print 'Date =', m['Date']
   1008     if 'X-Nonsense' in m: pass
   1009     print 'keys =', m.keys()
   1010     print 'values =', m.values()
   1011     print 'items =', m.items()
   1012