Home | History | Annotate | Download | only in email
      1 # Copyright (C) 2002-2007 Python Software Foundation
# Contact: email-sig@python.org
      3 
      4 """Email address parsing code.
      5 
      6 Lifted directly from rfc822.py.  This should eventually be rewritten.
      7 """
      8 
# Public names exported by a star-import of this module.
__all__ = [
    'mktime_tz',
    'parsedate',
    'parsedate_tz',
    'quote',
    ]
     15 
     16 import time, calendar
     17 
# Frequently used string constants.
SPACE = ' '
EMPTYSTRING = ''
COMMASPACE = ', '
     21 
     22 # Parse a date field
     23 _monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
     24                'aug', 'sep', 'oct', 'nov', 'dec',
     25                'january', 'february', 'march', 'april', 'may', 'june', 'july',
     26                'august', 'september', 'october', 'november', 'december']
     27 
     28 _daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']
     29 
     30 # The timezone table does not include the military time zones defined
     31 # in RFC822, other than Z.  According to RFC1123, the description in
     32 # RFC822 gets the signs wrong, so we can't rely on any such time
     33 # zones.  RFC1123 recommends that numeric timezone indicators be used
     34 # instead of timezone names.
     35 
     36 _timezones = {'UT':0, 'UTC':0, 'GMT':0, 'Z':0,
     37               'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
     38               'EST': -500, 'EDT': -400,  # Eastern
     39               'CST': -600, 'CDT': -500,  # Central
     40               'MST': -700, 'MDT': -600,  # Mountain
     41               'PST': -800, 'PDT': -700   # Pacific
     42               }
     43 
     44 
     45 def parsedate_tz(data):
     46     """Convert a date string to a time tuple.
     47 
     48     Accounts for military timezones.
     49     """
     50     data = data.split()
     51     # The FWS after the comma after the day-of-week is optional, so search and
     52     # adjust for this.
     53     if data[0].endswith(',') or data[0].lower() in _daynames:
     54         # There's a dayname here. Skip it
     55         del data[0]
     56     else:
     57         i = data[0].rfind(',')
     58         if i >= 0:
     59             data[0] = data[0][i+1:]
     60     if len(data) == 3: # RFC 850 date, deprecated
     61         stuff = data[0].split('-')
     62         if len(stuff) == 3:
     63             data = stuff + data[1:]
     64     if len(data) == 4:
     65         s = data[3]
     66         i = s.find('+')
     67         if i > 0:
     68             data[3:] = [s[:i], s[i+1:]]
     69         else:
     70             data.append('') # Dummy tz
     71     if len(data) < 5:
     72         return None
     73     data = data[:5]
     74     [dd, mm, yy, tm, tz] = data
     75     mm = mm.lower()
     76     if mm not in _monthnames:
     77         dd, mm = mm, dd.lower()
     78         if mm not in _monthnames:
     79             return None
     80     mm = _monthnames.index(mm) + 1
     81     if mm > 12:
     82         mm -= 12
     83     if dd[-1] == ',':
     84         dd = dd[:-1]
     85     i = yy.find(':')
     86     if i > 0:
     87         yy, tm = tm, yy
     88     if yy[-1] == ',':
     89         yy = yy[:-1]
     90     if not yy[0].isdigit():
     91         yy, tz = tz, yy
     92     if tm[-1] == ',':
     93         tm = tm[:-1]
     94     tm = tm.split(':')
     95     if len(tm) == 2:
     96         [thh, tmm] = tm
     97         tss = '0'
     98     elif len(tm) == 3:
     99         [thh, tmm, tss] = tm
    100     else:
    101         return None
    102     try:
    103         yy = int(yy)
    104         dd = int(dd)
    105         thh = int(thh)
    106         tmm = int(tmm)
    107         tss = int(tss)
    108     except ValueError:
    109         return None
    110     # Check for a yy specified in two-digit format, then convert it to the
    111     # appropriate four-digit format, according to the POSIX standard. RFC 822
    112     # calls for a two-digit yy, but RFC 2822 (which obsoletes RFC 822)
    113     # mandates a 4-digit yy. For more information, see the documentation for
    114     # the time module.
    115     if yy < 100:
    116         # The year is between 1969 and 1999 (inclusive).
    117         if yy > 68:
    118             yy += 1900
    119         # The year is between 2000 and 2068 (inclusive).
    120         else:
    121             yy += 2000
    122     tzoffset = None
    123     tz = tz.upper()
    124     if tz in _timezones:
    125         tzoffset = _timezones[tz]
    126     else:
    127         try:
    128             tzoffset = int(tz)
    129         except ValueError:
    130             pass
    131     # Convert a timezone offset into seconds ; -0500 -> -18000
    132     if tzoffset:
    133         if tzoffset < 0:
    134             tzsign = -1
    135             tzoffset = -tzoffset
    136         else:
    137             tzsign = 1
    138         tzoffset = tzsign * ( (tzoffset//100)*3600 + (tzoffset % 100)*60)
    139     # Daylight Saving Time flag is set to -1, since DST is unknown.
    140     return yy, mm, dd, thh, tmm, tss, 0, 1, -1, tzoffset
    141 
    142 
    143 def parsedate(data):
    144     """Convert a time string to a time tuple."""
    145     t = parsedate_tz(data)
    146     if isinstance(t, tuple):
    147         return t[:9]
    148     else:
    149         return t
    150 
    151 
    152 def mktime_tz(data):
    153     """Turn a 10-tuple as returned by parsedate_tz() into a POSIX timestamp."""
    154     if data[9] is None:
    155         # No zone info, so localtime is better assumption than GMT
    156         return time.mktime(data[:8] + (-1,))
    157     else:
    158         t = calendar.timegm(data)
    159         return t - data[9]
    160 
    161 
    162 def quote(str):
    163     """Prepare string to be used in a quoted string.
    164 
    165     Turns backslash and double quote characters into quoted pairs.  These
    166     are the only characters that need to be quoted inside a quoted string.
    167     Does not add the surrounding double quotes.
    168     """
    169     return str.replace('\\', '\\\\').replace('"', '\\"')
    170 
    171 
    172 class AddrlistClass:
    173     """Address parser class by Ben Escoto.
    174 
    175     To understand what this class does, it helps to have a copy of RFC 2822 in
    176     front of you.
    177 
    178     Note: this class interface is deprecated and may be removed in the future.
    179     Use rfc822.AddressList instead.
    180     """
    181 
    182     def __init__(self, field):
    183         """Initialize a new instance.
    184 
    185         `field' is an unparsed address header field, containing
    186         one or more addresses.
    187         """
    188         self.specials = '()<>@,:;.\"[]'
    189         self.pos = 0
    190         self.LWS = ' \t'
    191         self.CR = '\r\n'
    192         self.FWS = self.LWS + self.CR
    193         self.atomends = self.specials + self.LWS + self.CR
    194         # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
    195         # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
    196         # syntax, so allow dots in phrases.
    197         self.phraseends = self.atomends.replace('.', '')
    198         self.field = field
    199         self.commentlist = []
    200 
    201     def gotonext(self):
    202         """Parse up to the start of the next address."""
    203         while self.pos < len(self.field):
    204             if self.field[self.pos] in self.LWS + '\n\r':
    205                 self.pos += 1
    206             elif self.field[self.pos] == '(':
    207                 self.commentlist.append(self.getcomment())
    208             else:
    209                 break
    210 
    211     def getaddrlist(self):
    212         """Parse all addresses.
    213 
    214         Returns a list containing all of the addresses.
    215         """
    216         result = []
    217         while self.pos < len(self.field):
    218             ad = self.getaddress()
    219             if ad:
    220                 result += ad
    221             else:
    222                 result.append(('', ''))
    223         return result
    224 
    225     def getaddress(self):
    226         """Parse the next address."""
    227         self.commentlist = []
    228         self.gotonext()
    229 
    230         oldpos = self.pos
    231         oldcl = self.commentlist
    232         plist = self.getphraselist()
    233 
    234         self.gotonext()
    235         returnlist = []
    236 
    237         if self.pos >= len(self.field):
    238             # Bad email address technically, no domain.
    239             if plist:
    240                 returnlist = [(SPACE.join(self.commentlist), plist[0])]
    241 
    242         elif self.field[self.pos] in '.@':
    243             # email address is just an addrspec
    244             # this isn't very efficient since we start over
    245             self.pos = oldpos
    246             self.commentlist = oldcl
    247             addrspec = self.getaddrspec()
    248             returnlist = [(SPACE.join(self.commentlist), addrspec)]
    249 
    250         elif self.field[self.pos] == ':':
    251             # address is a group
    252             returnlist = []
    253 
    254             fieldlen = len(self.field)
    255             self.pos += 1
    256             while self.pos < len(self.field):
    257                 self.gotonext()
    258                 if self.pos < fieldlen and self.field[self.pos] == ';':
    259                     self.pos += 1
    260                     break
    261                 returnlist = returnlist + self.getaddress()
    262 
    263         elif self.field[self.pos] == '<':
    264             # Address is a phrase then a route addr
    265             routeaddr = self.getrouteaddr()
    266 
    267             if self.commentlist:
    268                 returnlist = [(SPACE.join(plist) + ' (' +
    269                                ' '.join(self.commentlist) + ')', routeaddr)]
    270             else:
    271                 returnlist = [(SPACE.join(plist), routeaddr)]
    272 
    273         else:
    274             if plist:
    275                 returnlist = [(SPACE.join(self.commentlist), plist[0])]
    276             elif self.field[self.pos] in self.specials:
    277                 self.pos += 1
    278 
    279         self.gotonext()
    280         if self.pos < len(self.field) and self.field[self.pos] == ',':
    281             self.pos += 1
    282         return returnlist
    283 
    284     def getrouteaddr(self):
    285         """Parse a route address (Return-path value).
    286 
    287         This method just skips all the route stuff and returns the addrspec.
    288         """
    289         if self.field[self.pos] != '<':
    290             return
    291 
    292         expectroute = False
    293         self.pos += 1
    294         self.gotonext()
    295         adlist = ''
    296         while self.pos < len(self.field):
    297             if expectroute:
    298                 self.getdomain()
    299                 expectroute = False
    300             elif self.field[self.pos] == '>':
    301                 self.pos += 1
    302                 break
    303             elif self.field[self.pos] == '@':
    304                 self.pos += 1
    305                 expectroute = True
    306             elif self.field[self.pos] == ':':
    307                 self.pos += 1
    308             else:
    309                 adlist = self.getaddrspec()
    310                 self.pos += 1
    311                 break
    312             self.gotonext()
    313 
    314         return adlist
    315 
    316     def getaddrspec(self):
    317         """Parse an RFC 2822 addr-spec."""
    318         aslist = []
    319 
    320         self.gotonext()
    321         while self.pos < len(self.field):
    322             if self.field[self.pos] == '.':
    323                 aslist.append('.')
    324                 self.pos += 1
    325             elif self.field[self.pos] == '"':
    326                 aslist.append('"%s"' % quote(self.getquote()))
    327             elif self.field[self.pos] in self.atomends:
    328                 break
    329             else:
    330                 aslist.append(self.getatom())
    331             self.gotonext()
    332 
    333         if self.pos >= len(self.field) or self.field[self.pos] != '@':
    334             return EMPTYSTRING.join(aslist)
    335 
    336         aslist.append('@')
    337         self.pos += 1
    338         self.gotonext()
    339         return EMPTYSTRING.join(aslist) + self.getdomain()
    340 
    341     def getdomain(self):
    342         """Get the complete domain name from an address."""
    343         sdlist = []
    344         while self.pos < len(self.field):
    345             if self.field[self.pos] in self.LWS:
    346                 self.pos += 1
    347             elif self.field[self.pos] == '(':
    348                 self.commentlist.append(self.getcomment())
    349             elif self.field[self.pos] == '[':
    350                 sdlist.append(self.getdomainliteral())
    351             elif self.field[self.pos] == '.':
    352                 self.pos += 1
    353                 sdlist.append('.')
    354             elif self.field[self.pos] in self.atomends:
    355                 break
    356             else:
    357                 sdlist.append(self.getatom())
    358         return EMPTYSTRING.join(sdlist)
    359 
    360     def getdelimited(self, beginchar, endchars, allowcomments=True):
    361         """Parse a header fragment delimited by special characters.
    362 
    363         `beginchar' is the start character for the fragment.
    364         If self is not looking at an instance of `beginchar' then
    365         getdelimited returns the empty string.
    366 
    367         `endchars' is a sequence of allowable end-delimiting characters.
    368         Parsing stops when one of these is encountered.
    369 
    370         If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
    371         within the parsed fragment.
    372         """
    373         if self.field[self.pos] != beginchar:
    374             return ''
    375 
    376         slist = ['']
    377         quote = False
    378         self.pos += 1
    379         while self.pos < len(self.field):
    380             if quote:
    381                 slist.append(self.field[self.pos])
    382                 quote = False
    383             elif self.field[self.pos] in endchars:
    384                 self.pos += 1
    385                 break
    386             elif allowcomments and self.field[self.pos] == '(':
    387                 slist.append(self.getcomment())
    388                 continue        # have already advanced pos from getcomment
    389             elif self.field[self.pos] == '\\':
    390                 quote = True
    391             else:
    392                 slist.append(self.field[self.pos])
    393             self.pos += 1
    394 
    395         return EMPTYSTRING.join(slist)
    396 
    397     def getquote(self):
    398         """Get a quote-delimited fragment from self's field."""
    399         return self.getdelimited('"', '"\r', False)
    400 
    401     def getcomment(self):
    402         """Get a parenthesis-delimited fragment from self's field."""
    403         return self.getdelimited('(', ')\r', True)
    404 
    405     def getdomainliteral(self):
    406         """Parse an RFC 2822 domain-literal."""
    407         return '[%s]' % self.getdelimited('[', ']\r', False)
    408 
    409     def getatom(self, atomends=None):
    410         """Parse an RFC 2822 atom.
    411 
    412         Optional atomends specifies a different set of end token delimiters
    413         (the default is to use self.atomends).  This is used e.g. in
    414         getphraselist() since phrase endings must not include the `.' (which
    415         is legal in phrases)."""
    416         atomlist = ['']
    417         if atomends is None:
    418             atomends = self.atomends
    419 
    420         while self.pos < len(self.field):
    421             if self.field[self.pos] in atomends:
    422                 break
    423             else:
    424                 atomlist.append(self.field[self.pos])
    425             self.pos += 1
    426 
    427         return EMPTYSTRING.join(atomlist)
    428 
    429     def getphraselist(self):
    430         """Parse a sequence of RFC 2822 phrases.
    431 
    432         A phrase is a sequence of words, which are in turn either RFC 2822
    433         atoms or quoted-strings.  Phrases are canonicalized by squeezing all
    434         runs of continuous whitespace into one space.
    435         """
    436         plist = []
    437 
    438         while self.pos < len(self.field):
    439             if self.field[self.pos] in self.FWS:
    440                 self.pos += 1
    441             elif self.field[self.pos] == '"':
    442                 plist.append(self.getquote())
    443             elif self.field[self.pos] == '(':
    444                 self.commentlist.append(self.getcomment())
    445             elif self.field[self.pos] in self.phraseends:
    446                 break
    447             else:
    448                 plist.append(self.getatom(self.phraseends))
    449 
    450         return plist
    451 
    452 class AddressList(AddrlistClass):
    453     """An AddressList encapsulates a list of parsed RFC 2822 addresses."""
    454     def __init__(self, field):
    455         AddrlistClass.__init__(self, field)
    456         if field:
    457             self.addresslist = self.getaddrlist()
    458         else:
    459             self.addresslist = []
    460 
    461     def __len__(self):
    462         return len(self.addresslist)
    463 
    464     def __add__(self, other):
    465         # Set union
    466         newaddr = AddressList(None)
    467         newaddr.addresslist = self.addresslist[:]
    468         for x in other.addresslist:
    469             if not x in self.addresslist:
    470                 newaddr.addresslist.append(x)
    471         return newaddr
    472 
    473     def __iadd__(self, other):
    474         # Set union, in-place
    475         for x in other.addresslist:
    476             if not x in self.addresslist:
    477                 self.addresslist.append(x)
    478         return self
    479 
    480     def __sub__(self, other):
    481         # Set difference
    482         newaddr = AddressList(None)
    483         for x in self.addresslist:
    484             if not x in other.addresslist:
    485                 newaddr.addresslist.append(x)
    486         return newaddr
    487 
    488     def __isub__(self, other):
    489         # Set difference, in-place
    490         for x in other.addresslist:
    491             if x in self.addresslist:
    492                 self.addresslist.remove(x)
    493         return self
    494 
    495     def __getitem__(self, index):
    496         # Make indexing, slices, and 'in' work
    497         return self.addresslist[index]
    498