1 // Copyright 2011 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 /* 6 Package mail implements parsing of mail messages. 7 8 For the most part, this package follows the syntax as specified by RFC 5322. 9 Notable divergences: 10 * Obsolete address formats are not parsed, including addresses with 11 embedded route information. 12 * Group addresses are not parsed. 13 * The full range of spacing (the CFWS syntax element) is not supported, 14 such as breaking addresses across lines. 15 */ 16 package mail 17 18 import ( 19 "bufio" 20 "bytes" 21 "errors" 22 "fmt" 23 "io" 24 "log" 25 "mime" 26 "net/textproto" 27 "strings" 28 "time" 29 ) 30 31 var debug = debugT(false) 32 33 type debugT bool 34 35 func (d debugT) Printf(format string, args ...interface{}) { 36 if d { 37 log.Printf(format, args...) 38 } 39 } 40 41 // A Message represents a parsed mail message. 42 type Message struct { 43 Header Header 44 Body io.Reader 45 } 46 47 // ReadMessage reads a message from r. 48 // The headers are parsed, and the body of the message will be available 49 // for reading from r. 50 func ReadMessage(r io.Reader) (msg *Message, err error) { 51 tp := textproto.NewReader(bufio.NewReader(r)) 52 53 hdr, err := tp.ReadMIMEHeader() 54 if err != nil { 55 return nil, err 56 } 57 58 return &Message{ 59 Header: Header(hdr), 60 Body: tp.R, 61 }, nil 62 } 63 64 // Layouts suitable for passing to time.Parse. 65 // These are tried in order. 66 var dateLayouts []string 67 68 func init() { 69 // Generate layouts based on RFC 5322, section 3.3. 70 71 dows := [...]string{"", "Mon, "} // day-of-week 72 days := [...]string{"2", "02"} // day = 1*2DIGIT 73 years := [...]string{"2006", "06"} // year = 4*DIGIT / 2*DIGIT 74 seconds := [...]string{":05", ""} // second 75 // "-0700 (MST)" is not in RFC 5322, but is common. 76 zones := [...]string{"-0700", "MST", "-0700 (MST)"} // zone = (("+" / "-") 4DIGIT) / "GMT" / ... 77 78 for _, dow := range dows { 79 for _, day := range days { 80 for _, year := range years { 81 for _, second := range seconds { 82 for _, zone := range zones { 83 s := dow + day + " Jan " + year + " 15:04" + second + " " + zone 84 dateLayouts = append(dateLayouts, s) 85 } 86 } 87 } 88 } 89 } 90 } 91 92 func parseDate(date string) (time.Time, error) { 93 for _, layout := range dateLayouts { 94 t, err := time.Parse(layout, date) 95 if err == nil { 96 return t, nil 97 } 98 } 99 return time.Time{}, errors.New("mail: header could not be parsed") 100 } 101 102 // A Header represents the key-value pairs in a mail message header. 103 type Header map[string][]string 104 105 // Get gets the first value associated with the given key. 106 // If there are no values associated with the key, Get returns "". 107 func (h Header) Get(key string) string { 108 return textproto.MIMEHeader(h).Get(key) 109 } 110 111 var ErrHeaderNotPresent = errors.New("mail: header not in message") 112 113 // Date parses the Date header field. 114 func (h Header) Date() (time.Time, error) { 115 hdr := h.Get("Date") 116 if hdr == "" { 117 return time.Time{}, ErrHeaderNotPresent 118 } 119 return parseDate(hdr) 120 } 121 122 // AddressList parses the named header field as a list of addresses. 123 func (h Header) AddressList(key string) ([]*Address, error) { 124 hdr := h.Get(key) 125 if hdr == "" { 126 return nil, ErrHeaderNotPresent 127 } 128 return ParseAddressList(hdr) 129 } 130 131 // Address represents a single mail address. 132 // An address such as "Barry Gibbs <bg (a] example.com>" is represented 133 // as Address{Name: "Barry Gibbs", Address: "bg (a] example.com"}. 134 type Address struct { 135 Name string // Proper name; may be empty. 136 Address string // user@domain 137 } 138 139 // Parses a single RFC 5322 address, e.g. "Barry Gibbs <bg (a] example.com>" 140 func ParseAddress(address string) (*Address, error) { 141 return (&addrParser{s: address}).parseAddress() 142 } 143 144 // ParseAddressList parses the given string as a list of addresses. 145 func ParseAddressList(list string) ([]*Address, error) { 146 return (&addrParser{s: list}).parseAddressList() 147 } 148 149 // An AddressParser is an RFC 5322 address parser. 150 type AddressParser struct { 151 // WordDecoder optionally specifies a decoder for RFC 2047 encoded-words. 152 WordDecoder *mime.WordDecoder 153 } 154 155 // Parse parses a single RFC 5322 address of the 156 // form "Gogh Fir <gf (a] example.com>" or "foo (a] example.com". 157 func (p *AddressParser) Parse(address string) (*Address, error) { 158 return (&addrParser{s: address, dec: p.WordDecoder}).parseAddress() 159 } 160 161 // ParseList parses the given string as a list of comma-separated addresses 162 // of the form "Gogh Fir <gf (a] example.com>" or "foo (a] example.com". 163 func (p *AddressParser) ParseList(list string) ([]*Address, error) { 164 return (&addrParser{s: list, dec: p.WordDecoder}).parseAddressList() 165 } 166 167 // String formats the address as a valid RFC 5322 address. 168 // If the address's name contains non-ASCII characters 169 // the name will be rendered according to RFC 2047. 170 func (a *Address) String() string { 171 172 // Format address local@domain 173 at := strings.LastIndex(a.Address, "@") 174 var local, domain string 175 if at < 0 { 176 // This is a malformed address ("@" is required in addr-spec); 177 // treat the whole address as local-part. 178 local = a.Address 179 } else { 180 local, domain = a.Address[:at], a.Address[at+1:] 181 } 182 183 // Add quotes if needed 184 // TODO: rendering quoted local part and rendering printable name 185 // should be merged in helper function. 186 quoteLocal := false 187 for i := 0; i < len(local); i++ { 188 ch := local[i] 189 if isAtext(ch, false) { 190 continue 191 } 192 if ch == '.' { 193 // Dots are okay if they are surrounded by atext. 194 // We only need to check that the previous byte is 195 // not a dot, and this isn't the end of the string. 196 if i > 0 && local[i-1] != '.' && i < len(local)-1 { 197 continue 198 } 199 } 200 quoteLocal = true 201 break 202 } 203 if quoteLocal { 204 local = quoteString(local) 205 206 } 207 208 s := "<" + local + "@" + domain + ">" 209 210 if a.Name == "" { 211 return s 212 } 213 214 // If every character is printable ASCII, quoting is simple. 215 allPrintable := true 216 for i := 0; i < len(a.Name); i++ { 217 // isWSP here should actually be isFWS, 218 // but we don't support folding yet. 219 if !isVchar(a.Name[i]) && !isWSP(a.Name[i]) { 220 allPrintable = false 221 break 222 } 223 } 224 if allPrintable { 225 b := bytes.NewBufferString(`"`) 226 for i := 0; i < len(a.Name); i++ { 227 if !isQtext(a.Name[i]) && !isWSP(a.Name[i]) { 228 b.WriteByte('\\') 229 } 230 b.WriteByte(a.Name[i]) 231 } 232 b.WriteString(`" `) 233 b.WriteString(s) 234 return b.String() 235 } 236 237 return mime.QEncoding.Encode("utf-8", a.Name) + " " + s 238 } 239 240 type addrParser struct { 241 s string 242 dec *mime.WordDecoder // may be nil 243 } 244 245 func (p *addrParser) parseAddressList() ([]*Address, error) { 246 var list []*Address 247 for { 248 p.skipSpace() 249 addr, err := p.parseAddress() 250 if err != nil { 251 return nil, err 252 } 253 list = append(list, addr) 254 255 p.skipSpace() 256 if p.empty() { 257 break 258 } 259 if !p.consume(',') { 260 return nil, errors.New("mail: expected comma") 261 } 262 } 263 return list, nil 264 } 265 266 // parseAddress parses a single RFC 5322 address at the start of p. 267 func (p *addrParser) parseAddress() (addr *Address, err error) { 268 debug.Printf("parseAddress: %q", p.s) 269 p.skipSpace() 270 if p.empty() { 271 return nil, errors.New("mail: no address") 272 } 273 274 // address = name-addr / addr-spec 275 // TODO(dsymonds): Support parsing group address. 276 277 // addr-spec has a more restricted grammar than name-addr, 278 // so try parsing it first, and fallback to name-addr. 279 // TODO(dsymonds): Is this really correct? 280 spec, err := p.consumeAddrSpec() 281 if err == nil { 282 return &Address{ 283 Address: spec, 284 }, err 285 } 286 debug.Printf("parseAddress: not an addr-spec: %v", err) 287 debug.Printf("parseAddress: state is now %q", p.s) 288 289 // display-name 290 var displayName string 291 if p.peek() != '<' { 292 displayName, err = p.consumePhrase() 293 if err != nil { 294 return nil, err 295 } 296 } 297 debug.Printf("parseAddress: displayName=%q", displayName) 298 299 // angle-addr = "<" addr-spec ">" 300 p.skipSpace() 301 if !p.consume('<') { 302 return nil, errors.New("mail: no angle-addr") 303 } 304 spec, err = p.consumeAddrSpec() 305 if err != nil { 306 return nil, err 307 } 308 if !p.consume('>') { 309 return nil, errors.New("mail: unclosed angle-addr") 310 } 311 debug.Printf("parseAddress: spec=%q", spec) 312 313 return &Address{ 314 Name: displayName, 315 Address: spec, 316 }, nil 317 } 318 319 // consumeAddrSpec parses a single RFC 5322 addr-spec at the start of p. 320 func (p *addrParser) consumeAddrSpec() (spec string, err error) { 321 debug.Printf("consumeAddrSpec: %q", p.s) 322 323 orig := *p 324 defer func() { 325 if err != nil { 326 *p = orig 327 } 328 }() 329 330 // local-part = dot-atom / quoted-string 331 var localPart string 332 p.skipSpace() 333 if p.empty() { 334 return "", errors.New("mail: no addr-spec") 335 } 336 if p.peek() == '"' { 337 // quoted-string 338 debug.Printf("consumeAddrSpec: parsing quoted-string") 339 localPart, err = p.consumeQuotedString() 340 } else { 341 // dot-atom 342 debug.Printf("consumeAddrSpec: parsing dot-atom") 343 localPart, err = p.consumeAtom(true, false) 344 } 345 if err != nil { 346 debug.Printf("consumeAddrSpec: failed: %v", err) 347 return "", err 348 } 349 350 if !p.consume('@') { 351 return "", errors.New("mail: missing @ in addr-spec") 352 } 353 354 // domain = dot-atom / domain-literal 355 var domain string 356 p.skipSpace() 357 if p.empty() { 358 return "", errors.New("mail: no domain in addr-spec") 359 } 360 // TODO(dsymonds): Handle domain-literal 361 domain, err = p.consumeAtom(true, false) 362 if err != nil { 363 return "", err 364 } 365 366 return localPart + "@" + domain, nil 367 } 368 369 // consumePhrase parses the RFC 5322 phrase at the start of p. 370 func (p *addrParser) consumePhrase() (phrase string, err error) { 371 debug.Printf("consumePhrase: [%s]", p.s) 372 // phrase = 1*word 373 var words []string 374 for { 375 // word = atom / quoted-string 376 var word string 377 p.skipSpace() 378 if p.empty() { 379 return "", errors.New("mail: missing phrase") 380 } 381 if p.peek() == '"' { 382 // quoted-string 383 word, err = p.consumeQuotedString() 384 } else { 385 // atom 386 // We actually parse dot-atom here to be more permissive 387 // than what RFC 5322 specifies. 388 word, err = p.consumeAtom(true, true) 389 } 390 391 if err == nil { 392 word, err = p.decodeRFC2047Word(word) 393 } 394 395 if err != nil { 396 break 397 } 398 debug.Printf("consumePhrase: consumed %q", word) 399 words = append(words, word) 400 } 401 // Ignore any error if we got at least one word. 402 if err != nil && len(words) == 0 { 403 debug.Printf("consumePhrase: hit err: %v", err) 404 return "", fmt.Errorf("mail: missing word in phrase: %v", err) 405 } 406 phrase = strings.Join(words, " ") 407 return phrase, nil 408 } 409 410 // consumeQuotedString parses the quoted string at the start of p. 411 func (p *addrParser) consumeQuotedString() (qs string, err error) { 412 // Assume first byte is '"'. 413 i := 1 414 qsb := make([]byte, 0, 10) 415 Loop: 416 for { 417 if i >= p.len() { 418 return "", errors.New("mail: unclosed quoted-string") 419 } 420 switch c := p.s[i]; { 421 case c == '"': 422 break Loop 423 case c == '\\': 424 if i+1 == p.len() { 425 return "", errors.New("mail: unclosed quoted-string") 426 } 427 qsb = append(qsb, p.s[i+1]) 428 i += 2 429 case isQtext(c), c == ' ': 430 // qtext (printable US-ASCII excluding " and \), or 431 // FWS (almost; we're ignoring CRLF) 432 qsb = append(qsb, c) 433 i++ 434 default: 435 return "", fmt.Errorf("mail: bad character in quoted-string: %q", c) 436 } 437 } 438 p.s = p.s[i+1:] 439 if len(qsb) == 0 { 440 return "", errors.New("mail: empty quoted-string") 441 } 442 return string(qsb), nil 443 } 444 445 // consumeAtom parses an RFC 5322 atom at the start of p. 446 // If dot is true, consumeAtom parses an RFC 5322 dot-atom instead. 447 // If permissive is true, consumeAtom will not fail on 448 // leading/trailing/double dots in the atom (see golang.org/issue/4938). 449 func (p *addrParser) consumeAtom(dot bool, permissive bool) (atom string, err error) { 450 if !isAtext(p.peek(), false) { 451 return "", errors.New("mail: invalid string") 452 } 453 i := 1 454 for ; i < p.len() && isAtext(p.s[i], dot); i++ { 455 } 456 atom, p.s = string(p.s[:i]), p.s[i:] 457 if !permissive { 458 if strings.HasPrefix(atom, ".") { 459 return "", errors.New("mail: leading dot in atom") 460 } 461 if strings.Contains(atom, "..") { 462 return "", errors.New("mail: double dot in atom") 463 } 464 if strings.HasSuffix(atom, ".") { 465 return "", errors.New("mail: trailing dot in atom") 466 } 467 } 468 return atom, nil 469 } 470 471 func (p *addrParser) consume(c byte) bool { 472 if p.empty() || p.peek() != c { 473 return false 474 } 475 p.s = p.s[1:] 476 return true 477 } 478 479 // skipSpace skips the leading space and tab characters. 480 func (p *addrParser) skipSpace() { 481 p.s = strings.TrimLeft(p.s, " \t") 482 } 483 484 func (p *addrParser) peek() byte { 485 return p.s[0] 486 } 487 488 func (p *addrParser) empty() bool { 489 return p.len() == 0 490 } 491 492 func (p *addrParser) len() int { 493 return len(p.s) 494 } 495 496 func (p *addrParser) decodeRFC2047Word(s string) (string, error) { 497 if p.dec != nil { 498 return p.dec.DecodeHeader(s) 499 } 500 501 dec, err := rfc2047Decoder.Decode(s) 502 if err == nil { 503 return dec, nil 504 } 505 506 if _, ok := err.(charsetError); ok { 507 return s, err 508 } 509 510 // Ignore invalid RFC 2047 encoded-word errors. 511 return s, nil 512 } 513 514 var rfc2047Decoder = mime.WordDecoder{ 515 CharsetReader: func(charset string, input io.Reader) (io.Reader, error) { 516 return nil, charsetError(charset) 517 }, 518 } 519 520 type charsetError string 521 522 func (e charsetError) Error() string { 523 return fmt.Sprintf("charset not supported: %q", string(e)) 524 } 525 526 var atextChars = []byte("ABCDEFGHIJKLMNOPQRSTUVWXYZ" + 527 "abcdefghijklmnopqrstuvwxyz" + 528 "0123456789" + 529 "!#$%&'*+-/=?^_`{|}~") 530 531 // isAtext reports whether c is an RFC 5322 atext character. 532 // If dot is true, period is included. 533 func isAtext(c byte, dot bool) bool { 534 if dot && c == '.' { 535 return true 536 } 537 return bytes.IndexByte(atextChars, c) >= 0 538 } 539 540 // isQtext reports whether c is an RFC 5322 qtext character. 541 func isQtext(c byte) bool { 542 // Printable US-ASCII, excluding backslash or quote. 543 if c == '\\' || c == '"' { 544 return false 545 } 546 return '!' <= c && c <= '~' 547 } 548 549 // quoteString renders a string as a RFC5322 quoted-string. 550 func quoteString(s string) string { 551 var buf bytes.Buffer 552 buf.WriteByte('"') 553 for _, c := range s { 554 ch := byte(c) 555 if isQtext(ch) || isWSP(ch) { 556 buf.WriteByte(ch) 557 } else if isVchar(ch) { 558 buf.WriteByte('\\') 559 buf.WriteByte(ch) 560 } 561 } 562 buf.WriteByte('"') 563 return buf.String() 564 } 565 566 // isVchar reports whether c is an RFC 5322 VCHAR character. 567 func isVchar(c byte) bool { 568 // Visible (printing) characters. 569 return '!' <= c && c <= '~' 570 } 571 572 // isWSP reports whether c is a WSP (white space). 573 // WSP is a space or horizontal tab (RFC5234 Appendix B). 574 func isWSP(c byte) bool { 575 return c == ' ' || c == '\t' 576 } 577