# Copyright (C) 2002-2007 Python Software Foundation
# Contact: email-sig@python.org

"""Email address parsing code.

Lifted directly from rfc822.py.  This should eventually be rewritten.
"""

__all__ = [
    'mktime_tz',
    'parsedate',
    'parsedate_tz',
    'quote',
    ]

import time
import calendar

SPACE = ' '
EMPTYSTRING = ''
COMMASPACE = ', '

# Parse a date field
#
# The list holds the abbreviated names first and the full names second, so
# that (index % 12) + 1 is the month number in both cases.
_monthnames = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul',
               'aug', 'sep', 'oct', 'nov', 'dec',
               'january', 'february', 'march', 'april', 'may', 'june', 'july',
               'august', 'september', 'october', 'november', 'december']

_daynames = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat', 'sun']

# The timezone table does not include the military time zones defined
# in RFC822, other than Z.  According to RFC1123, the description in
# RFC822 gets the signs wrong, so we can't rely on any such time
# zones.  RFC1123 recommends that numeric timezone indicators be used
# instead of timezone names.
#
# Values are in the +-HHMM form used by numeric zones (e.g. -500 == -05:00).

_timezones = {'UT': 0, 'UTC': 0, 'GMT': 0, 'Z': 0,
              'AST': -400, 'ADT': -300,  # Atlantic (used in Canada)
              'EST': -500, 'EDT': -400,  # Eastern
              'CST': -600, 'CDT': -500,  # Central
              'MST': -700, 'MDT': -600,  # Mountain
              'PST': -800, 'PDT': -700   # Pacific
              }


def parsedate_tz(data):
    """Convert a date string to a 10-item time tuple.

    `data' is a date string such as 'Mon, 16 Nov 2009 13:32:02 +0100'.  The
    leading day name is optional, the deprecated RFC 850 'DD-Mon-YY' form is
    accepted, and the zone may be numeric (+HHMM / -HHMM, optionally glued
    directly onto the time) or one of the names in `_timezones'.  Military
    zone letters other than Z are not recognized.

    Returns the tuple

        (year, month, day, hour, minute, second, 0, 1, -1, tzoffset)

    where `tzoffset' is the zone offset in seconds east of UTC, or None when
    no usable zone was found.  Items 6-8 are fixed placeholders (the DST flag
    is -1 because DST is unknown).  Returns None when the string cannot be
    parsed, including when it is empty or contains only whitespace.
    """
    if not data:
        return None
    data = data.split()
    if not data:
        # Whitespace-only input leaves nothing to parse.
        return None
    # The FWS after the comma after the day-of-week is optional, so search and
    # adjust for this.
    if data[0].endswith(',') or data[0].lower() in _daynames:
        # There's a dayname here.  Skip it
        del data[0]
    else:
        i = data[0].rfind(',')
        if i >= 0:
            data[0] = data[0][i+1:]
    if len(data) == 3:  # RFC 850 date, deprecated
        stuff = data[0].split('-')
        if len(stuff) == 3:
            data = stuff + data[1:]
    if len(data) == 4:
        # The zone may be glued onto the time field ('13:32:02+0100' or
        # '13:32:02-0800'); split it off, keeping the sign with the zone.
        s = data[3]
        i = s.find('+')
        if i == -1:
            i = s.find('-')
        if i > 0:
            data[3:] = [s[:i], s[i:]]
        else:
            data.append('')  # Dummy tz
    if len(data) < 5:
        return None
    data = data[:5]
    [dd, mm, yy, tm, tz] = data
    if not (dd and mm and yy):
        # An empty field (possible after the RFC 850 split on '-') would make
        # the [-1] / [0] indexing below raise IndexError.
        return None
    mm = mm.lower()
    if mm not in _monthnames:
        # Maybe the day and month are swapped ('Nov 16' instead of '16 Nov').
        dd, mm = mm, dd.lower()
        if mm not in _monthnames:
            return None
    mm = _monthnames.index(mm) + 1
    if mm > 12:
        # Full month names live in the second half of the table.
        mm -= 12
    if dd[-1] == ',':
        dd = dd[:-1]
    i = yy.find(':')
    if i > 0:
        # The "year" field looks like a time, so year and time are swapped.
        yy, tm = tm, yy
    if yy[-1] == ',':
        yy = yy[:-1]
    if not yy[0].isdigit():
        # The "year" field is not numeric, so it is really the zone.
        yy, tz = tz, yy
    if tm[-1] == ',':
        tm = tm[:-1]
    tm = tm.split(':')
    if len(tm) == 2:
        [thh, tmm] = tm
        tss = '0'
    elif len(tm) == 3:
        [thh, tmm, tss] = tm
    else:
        return None
    try:
        yy = int(yy)
        dd = int(dd)
        thh = int(thh)
        tmm = int(tmm)
        tss = int(tss)
    except ValueError:
        return None
    # Check for a yy specified in two-digit format, then convert it to the
    # appropriate four-digit format, according to the POSIX standard. RFC 822
    # calls for a two-digit yy, but RFC 2822 (which obsoletes RFC 822)
    # mandates a 4-digit yy. For more information, see the documentation for
    # the time module.
    if yy < 100:
        # The year is between 1969 and 1999 (inclusive).
        if yy > 68:
            yy += 1900
        # The year is between 2000 and 2068 (inclusive).
        else:
            yy += 2000
    tzoffset = None
    tz = tz.upper()
    if tz in _timezones:
        tzoffset = _timezones[tz]
    else:
        try:
            tzoffset = int(tz)
        except ValueError:
            # Unknown zone name or missing zone: leave tzoffset as None.
            pass
    # Convert a timezone offset into seconds ; -0500 -> -18000
    if tzoffset:
        if tzoffset < 0:
            tzsign = -1
            tzoffset = -tzoffset
        else:
            tzsign = 1
        tzoffset = tzsign * ((tzoffset//100)*3600 + (tzoffset % 100)*60)
    # Daylight Saving Time flag is set to -1, since DST is unknown.
    return yy, mm, dd, thh, tmm, tss, 0, 1, -1, tzoffset


def parsedate(data):
    """Convert a time string to a 9-item time tuple.

    Same as parsedate_tz() but with the trailing zone offset dropped.
    Returns None when parsedate_tz() does.
    """
    t = parsedate_tz(data)
    if isinstance(t, tuple):
        return t[:9]
    else:
        return t


def mktime_tz(data):
    """Turn a 10-tuple as returned by parsedate_tz() into a POSIX timestamp.

    When the zone offset (item 9) is None the first eight fields are
    interpreted as local time via time.mktime(); otherwise they are
    interpreted as UTC via calendar.timegm() and the offset is subtracted.
    """
    if data[9] is None:
        # No zone info, so localtime is better assumption than GMT
        return time.mktime(data[:8] + (-1,))
    else:
        t = calendar.timegm(data)
        return t - data[9]


def quote(str):
    """Prepare string to be used in a quoted string.

    Turns backslash and double quote characters into quoted pairs.  These
    are the only characters that need to be quoted inside a quoted string.
    Does not add the surrounding double quotes.
    """
    # NOTE: the parameter shadows the builtin `str'; the name is kept because
    # it is part of the public signature.  Backslashes must be escaped first
    # so the backslashes added for the quotes are not doubled.
    return str.replace('\\', '\\\\').replace('"', '\\"')


class AddrlistClass:
    """Address parser class by Ben Escoto.

    To understand what this class does, it helps to have a copy of RFC 2822 in
    front of you.

    The parser is a cursor (`self.pos') moving over `self.field'; every
    get*() method consumes input starting at the cursor and leaves the cursor
    just past what it consumed.

    Note: this class interface is deprecated and may be removed in the future.
    Use the AddressList subclass instead.
    """

    def __init__(self, field):
        """Initialize a new instance.

        `field' is an unparsed address header field, containing
        one or more addresses.
        """
        self.specials = '()<>@,:;.\"[]'
        self.pos = 0
        self.LWS = ' \t'
        self.CR = '\r\n'
        self.FWS = self.LWS + self.CR
        self.atomends = self.specials + self.LWS + self.CR
        # Note that RFC 2822 now specifies `.' as obs-phrase, meaning that it
        # is obsolete syntax.  RFC 2822 requires that we recognize obsolete
        # syntax, so allow dots in phrases.
        self.phraseends = self.atomends.replace('.', '')
        self.field = field
        self.commentlist = []

    def gotonext(self):
        """Parse up to the start of the next address.

        Skips whitespace and collects any comments into self.commentlist.
        """
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS + '\n\r':
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            else:
                break

    def getaddrlist(self):
        """Parse all addresses.

        Returns a list containing all of the addresses, each as a
        (name, address) pair.  An input stretch that yields no address
        contributes ('', '').
        """
        result = []
        while self.pos < len(self.field):
            ad = self.getaddress()
            if ad:
                result += ad
            else:
                result.append(('', ''))
        return result

    def getaddress(self):
        """Parse the next address.

        Returns a list of (name, address) pairs: usually one, several for a
        group, or an empty list when nothing could be parsed.
        """
        self.commentlist = []
        self.gotonext()

        # Remember where we started so a bare addr-spec can be re-parsed from
        # its beginning once we know that is what we are looking at.
        oldpos = self.pos
        oldcl = self.commentlist
        plist = self.getphraselist()

        self.gotonext()
        returnlist = []

        if self.pos >= len(self.field):
            # Bad email address technically, no domain.
            if plist:
                returnlist = [(SPACE.join(self.commentlist), plist[0])]

        elif self.field[self.pos] in '.@':
            # email address is just an addrspec
            # this isn't very efficient since we start over
            self.pos = oldpos
            self.commentlist = oldcl
            addrspec = self.getaddrspec()
            returnlist = [(SPACE.join(self.commentlist), addrspec)]

        elif self.field[self.pos] == ':':
            # address is a group
            returnlist = []

            fieldlen = len(self.field)
            self.pos += 1
            while self.pos < len(self.field):
                self.gotonext()
                if self.pos < fieldlen and self.field[self.pos] == ';':
                    self.pos += 1
                    break
                returnlist = returnlist + self.getaddress()

        elif self.field[self.pos] == '<':
            # Address is a phrase then a route addr
            routeaddr = self.getrouteaddr()

            if self.commentlist:
                returnlist = [(SPACE.join(plist) + ' (' +
                               ' '.join(self.commentlist) + ')', routeaddr)]
            else:
                returnlist = [(SPACE.join(plist), routeaddr)]

        else:
            if plist:
                returnlist = [(SPACE.join(self.commentlist), plist[0])]
            elif self.field[self.pos] in self.specials:
                # Skip a stray special so the caller's loop makes progress.
                self.pos += 1

        self.gotonext()
        if self.pos < len(self.field) and self.field[self.pos] == ',':
            self.pos += 1
        return returnlist

    def getrouteaddr(self):
        """Parse a route address (Return-path value).

        This method just skips all the route stuff and returns the addrspec.
        Returns None when the cursor is not on a '<'.
        """
        if self.field[self.pos] != '<':
            return

        expectroute = False
        self.pos += 1
        self.gotonext()
        adlist = ''
        while self.pos < len(self.field):
            if expectroute:
                # Route domains are parsed only to be discarded.
                self.getdomain()
                expectroute = False
            elif self.field[self.pos] == '>':
                self.pos += 1
                break
            elif self.field[self.pos] == '@':
                self.pos += 1
                expectroute = True
            elif self.field[self.pos] == ':':
                self.pos += 1
            else:
                adlist = self.getaddrspec()
                # Step over the character following the addr-spec
                # (the closing '>' in well-formed input).
                self.pos += 1
                break
            self.gotonext()

        return adlist

    def getaddrspec(self):
        """Parse an RFC 2822 addr-spec.

        Returns 'local@domain', or just the local part when no '@' follows
        it.  Returns the empty string when an '@' is present but no valid
        domain follows, so a malformed address is never reported as a
        truncated but plausible one.
        """
        aslist = []

        self.gotonext()
        while self.pos < len(self.field):
            if self.field[self.pos] == '.':
                aslist.append('.')
                self.pos += 1
            elif self.field[self.pos] == '"':
                aslist.append('"%s"' % quote(self.getquote()))
            elif self.field[self.pos] in self.atomends:
                break
            else:
                aslist.append(self.getatom())
            self.gotonext()

        if self.pos >= len(self.field) or self.field[self.pos] != '@':
            return EMPTYSTRING.join(aslist)

        aslist.append('@')
        self.pos += 1
        self.gotonext()
        domain = self.getdomain()
        if not domain:
            # Invalid domain, return an empty address instead of returning a
            # local part to denote failed parsing.
            return EMPTYSTRING
        return EMPTYSTRING.join(aslist) + domain

    def getdomain(self):
        """Get the complete domain name from an address.

        Returns the empty string when a second '@' is met, because an
        address such as 'a@malicious.org@important.com' must not be accepted
        as 'a@malicious.org'.
        """
        sdlist = []
        while self.pos < len(self.field):
            if self.field[self.pos] in self.LWS:
                self.pos += 1
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] == '[':
                sdlist.append(self.getdomainliteral())
            elif self.field[self.pos] == '.':
                self.pos += 1
                sdlist.append('.')
            elif self.field[self.pos] == '@':
                # Don't parse domains with two `@' like
                # `a@malicious.org@important.com'.
                return EMPTYSTRING
            elif self.field[self.pos] in self.atomends:
                break
            else:
                sdlist.append(self.getatom())
        return EMPTYSTRING.join(sdlist)

    def getdelimited(self, beginchar, endchars, allowcomments=True):
        """Parse a header fragment delimited by special characters.

        `beginchar' is the start character for the fragment.
        If self is not looking at an instance of `beginchar' then
        getdelimited returns the empty string.

        `endchars' is a sequence of allowable end-delimiting characters.
        Parsing stops when one of these is encountered.

        If `allowcomments' is non-zero, embedded RFC 2822 comments are allowed
        within the parsed fragment.

        A backslash makes the following character literal; the backslash
        itself is dropped from the result.
        """
        if self.field[self.pos] != beginchar:
            return ''

        slist = ['']
        escaped = False
        self.pos += 1
        while self.pos < len(self.field):
            if escaped:
                slist.append(self.field[self.pos])
                escaped = False
            elif self.field[self.pos] in endchars:
                self.pos += 1
                break
            elif allowcomments and self.field[self.pos] == '(':
                slist.append(self.getcomment())
                continue        # have already advanced pos from getcomment
            elif self.field[self.pos] == '\\':
                escaped = True
            else:
                slist.append(self.field[self.pos])
            self.pos += 1

        return EMPTYSTRING.join(slist)

    def getquote(self):
        """Get a quote-delimited fragment from self's field."""
        return self.getdelimited('"', '"\r', False)

    def getcomment(self):
        """Get a parenthesis-delimited fragment from self's field."""
        return self.getdelimited('(', ')\r', True)

    def getdomainliteral(self):
        """Parse an RFC 2822 domain-literal."""
        return '[%s]' % self.getdelimited('[', ']\r', False)

    def getatom(self, atomends=None):
        """Parse an RFC 2822 atom.

        Optional atomends specifies a different set of end token delimiters
        (the default is to use self.atomends).  This is used e.g. in
        getphraselist() since phrase endings must not include the `.' (which
        is legal in phrases)."""
        atomlist = ['']
        if atomends is None:
            atomends = self.atomends

        while self.pos < len(self.field):
            if self.field[self.pos] in atomends:
                break
            else:
                atomlist.append(self.field[self.pos])
            self.pos += 1

        return EMPTYSTRING.join(atomlist)

    def getphraselist(self):
        """Parse a sequence of RFC 2822 phrases.

        A phrase is a sequence of words, which are in turn either RFC 2822
        atoms or quoted-strings.  Phrases are canonicalized by squeezing all
        runs of continuous whitespace into one space.

        Returns the list of words; comments met on the way are appended to
        self.commentlist.
        """
        plist = []

        while self.pos < len(self.field):
            if self.field[self.pos] in self.FWS:
                self.pos += 1
            elif self.field[self.pos] == '"':
                plist.append(self.getquote())
            elif self.field[self.pos] == '(':
                self.commentlist.append(self.getcomment())
            elif self.field[self.pos] in self.phraseends:
                break
            else:
                plist.append(self.getatom(self.phraseends))

        return plist


class AddressList(AddrlistClass):
    """An AddressList encapsulates a list of parsed RFC 2822 addresses.

    The parsed (name, address) pairs are kept in `self.addresslist'.  The
    +, -, += and -= operators treat two lists as sets of such pairs.
    """

    def __init__(self, field):
        """Parse `field'; a false value (None or '') gives an empty list."""
        AddrlistClass.__init__(self, field)
        if field:
            self.addresslist = self.getaddrlist()
        else:
            self.addresslist = []

    def __len__(self):
        return len(self.addresslist)

    def __add__(self, other):
        # Set union
        newaddr = AddressList(None)
        newaddr.addresslist = self.addresslist[:]
        for x in other.addresslist:
            if x not in self.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __iadd__(self, other):
        # Set union, in-place
        for x in other.addresslist:
            if x not in self.addresslist:
                self.addresslist.append(x)
        return self

    def __sub__(self, other):
        # Set difference
        newaddr = AddressList(None)
        for x in self.addresslist:
            if x not in other.addresslist:
                newaddr.addresslist.append(x)
        return newaddr

    def __isub__(self, other):
        # Set difference, in-place
        for x in other.addresslist:
            if x in self.addresslist:
                self.addresslist.remove(x)
        return self

    def __getitem__(self, index):
        # Make indexing, slices, and 'in' work
        return self.addresslist[index]