Home | History | Annotate | Download | only in mail
      1 // Copyright 2011 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 /*
      6 Package mail implements parsing of mail messages.
      7 
      8 For the most part, this package follows the syntax as specified by RFC 5322.
      9 Notable divergences:
     10 	* Obsolete address formats are not parsed, including addresses with
     11 	  embedded route information.
     12 	* Group addresses are not parsed.
     13 	* The full range of spacing (the CFWS syntax element) is not supported,
     14 	  such as breaking addresses across lines.
     15 */
     16 package mail
     17 
     18 import (
     19 	"bufio"
     20 	"bytes"
     21 	"errors"
     22 	"fmt"
     23 	"io"
     24 	"log"
     25 	"mime"
     26 	"net/textproto"
     27 	"strings"
     28 	"time"
     29 )
     30 
     31 var debug = debugT(false)
     32 
     33 type debugT bool
     34 
     35 func (d debugT) Printf(format string, args ...interface{}) {
     36 	if d {
     37 		log.Printf(format, args...)
     38 	}
     39 }
     40 
// A Message represents a parsed mail message.
type Message struct {
	Header Header    // the message's header fields
	Body   io.Reader // reader for the content that follows the header
}
     46 
     47 // ReadMessage reads a message from r.
     48 // The headers are parsed, and the body of the message will be available
     49 // for reading from r.
     50 func ReadMessage(r io.Reader) (msg *Message, err error) {
     51 	tp := textproto.NewReader(bufio.NewReader(r))
     52 
     53 	hdr, err := tp.ReadMIMEHeader()
     54 	if err != nil {
     55 		return nil, err
     56 	}
     57 
     58 	return &Message{
     59 		Header: Header(hdr),
     60 		Body:   tp.R,
     61 	}, nil
     62 }
     63 
     64 // Layouts suitable for passing to time.Parse.
     65 // These are tried in order.
     66 var dateLayouts []string
     67 
     68 func init() {
     69 	// Generate layouts based on RFC 5322, section 3.3.
     70 
     71 	dows := [...]string{"", "Mon, "}   // day-of-week
     72 	days := [...]string{"2", "02"}     // day = 1*2DIGIT
     73 	years := [...]string{"2006", "06"} // year = 4*DIGIT / 2*DIGIT
     74 	seconds := [...]string{":05", ""}  // second
     75 	// "-0700 (MST)" is not in RFC 5322, but is common.
     76 	zones := [...]string{"-0700", "MST", "-0700 (MST)"} // zone = (("+" / "-") 4DIGIT) / "GMT" / ...
     77 
     78 	for _, dow := range dows {
     79 		for _, day := range days {
     80 			for _, year := range years {
     81 				for _, second := range seconds {
     82 					for _, zone := range zones {
     83 						s := dow + day + " Jan " + year + " 15:04" + second + " " + zone
     84 						dateLayouts = append(dateLayouts, s)
     85 					}
     86 				}
     87 			}
     88 		}
     89 	}
     90 }
     91 
     92 func parseDate(date string) (time.Time, error) {
     93 	for _, layout := range dateLayouts {
     94 		t, err := time.Parse(layout, date)
     95 		if err == nil {
     96 			return t, nil
     97 		}
     98 	}
     99 	return time.Time{}, errors.New("mail: header could not be parsed")
    100 }
    101 
// A Header represents the key-value pairs in a mail message header.
// Each key maps to the list of values recorded for that header field.
type Header map[string][]string
    104 
    105 // Get gets the first value associated with the given key.
    106 // If there are no values associated with the key, Get returns "".
    107 func (h Header) Get(key string) string {
    108 	return textproto.MIMEHeader(h).Get(key)
    109 }
    110 
// ErrHeaderNotPresent is returned by Header.Date and Header.AddressList
// when the requested header field is absent or has an empty value.
var ErrHeaderNotPresent = errors.New("mail: header not in message")
    112 
    113 // Date parses the Date header field.
    114 func (h Header) Date() (time.Time, error) {
    115 	hdr := h.Get("Date")
    116 	if hdr == "" {
    117 		return time.Time{}, ErrHeaderNotPresent
    118 	}
    119 	return parseDate(hdr)
    120 }
    121 
    122 // AddressList parses the named header field as a list of addresses.
    123 func (h Header) AddressList(key string) ([]*Address, error) {
    124 	hdr := h.Get(key)
    125 	if hdr == "" {
    126 		return nil, ErrHeaderNotPresent
    127 	}
    128 	return ParseAddressList(hdr)
    129 }
    130 
// Address represents a single mail address.
// An address such as "Barry Gibbs <bg@example.com>" is represented
// as Address{Name: "Barry Gibbs", Address: "bg@example.com"}.
type Address struct {
	Name    string // Proper name; may be empty.
	Address string // user@domain
}
    138 
    139 // Parses a single RFC 5322 address, e.g. "Barry Gibbs <bg (a] example.com>"
    140 func ParseAddress(address string) (*Address, error) {
    141 	return (&addrParser{s: address}).parseAddress()
    142 }
    143 
    144 // ParseAddressList parses the given string as a list of addresses.
    145 func ParseAddressList(list string) ([]*Address, error) {
    146 	return (&addrParser{s: list}).parseAddressList()
    147 }
    148 
// An AddressParser is an RFC 5322 address parser.
type AddressParser struct {
	// WordDecoder optionally specifies a decoder for RFC 2047 encoded-words.
	// If it is nil, the package's built-in decoder is used instead.
	WordDecoder *mime.WordDecoder
}
    154 
    155 // Parse parses a single RFC 5322 address of the
    156 // form "Gogh Fir <gf (a] example.com>" or "foo (a] example.com".
    157 func (p *AddressParser) Parse(address string) (*Address, error) {
    158 	return (&addrParser{s: address, dec: p.WordDecoder}).parseAddress()
    159 }
    160 
    161 // ParseList parses the given string as a list of comma-separated addresses
    162 // of the form "Gogh Fir <gf (a] example.com>" or "foo (a] example.com".
    163 func (p *AddressParser) ParseList(list string) ([]*Address, error) {
    164 	return (&addrParser{s: list, dec: p.WordDecoder}).parseAddressList()
    165 }
    166 
    167 // String formats the address as a valid RFC 5322 address.
    168 // If the address's name contains non-ASCII characters
    169 // the name will be rendered according to RFC 2047.
    170 func (a *Address) String() string {
    171 
    172 	// Format address local@domain
    173 	at := strings.LastIndex(a.Address, "@")
    174 	var local, domain string
    175 	if at < 0 {
    176 		// This is a malformed address ("@" is required in addr-spec);
    177 		// treat the whole address as local-part.
    178 		local = a.Address
    179 	} else {
    180 		local, domain = a.Address[:at], a.Address[at+1:]
    181 	}
    182 
    183 	// Add quotes if needed
    184 	// TODO: rendering quoted local part and rendering printable name
    185 	//       should be merged in helper function.
    186 	quoteLocal := false
    187 	for i := 0; i < len(local); i++ {
    188 		ch := local[i]
    189 		if isAtext(ch, false) {
    190 			continue
    191 		}
    192 		if ch == '.' {
    193 			// Dots are okay if they are surrounded by atext.
    194 			// We only need to check that the previous byte is
    195 			// not a dot, and this isn't the end of the string.
    196 			if i > 0 && local[i-1] != '.' && i < len(local)-1 {
    197 				continue
    198 			}
    199 		}
    200 		quoteLocal = true
    201 		break
    202 	}
    203 	if quoteLocal {
    204 		local = quoteString(local)
    205 
    206 	}
    207 
    208 	s := "<" + local + "@" + domain + ">"
    209 
    210 	if a.Name == "" {
    211 		return s
    212 	}
    213 
    214 	// If every character is printable ASCII, quoting is simple.
    215 	allPrintable := true
    216 	for i := 0; i < len(a.Name); i++ {
    217 		// isWSP here should actually be isFWS,
    218 		// but we don't support folding yet.
    219 		if !isVchar(a.Name[i]) && !isWSP(a.Name[i]) {
    220 			allPrintable = false
    221 			break
    222 		}
    223 	}
    224 	if allPrintable {
    225 		b := bytes.NewBufferString(`"`)
    226 		for i := 0; i < len(a.Name); i++ {
    227 			if !isQtext(a.Name[i]) && !isWSP(a.Name[i]) {
    228 				b.WriteByte('\\')
    229 			}
    230 			b.WriteByte(a.Name[i])
    231 		}
    232 		b.WriteString(`" `)
    233 		b.WriteString(s)
    234 		return b.String()
    235 	}
    236 
    237 	return mime.QEncoding.Encode("utf-8", a.Name) + " " + s
    238 }
    239 
// addrParser holds the state of an in-progress address parse.
type addrParser struct {
	s   string            // input that has not been consumed yet
	dec *mime.WordDecoder // may be nil
}
    244 
    245 func (p *addrParser) parseAddressList() ([]*Address, error) {
    246 	var list []*Address
    247 	for {
    248 		p.skipSpace()
    249 		addr, err := p.parseAddress()
    250 		if err != nil {
    251 			return nil, err
    252 		}
    253 		list = append(list, addr)
    254 
    255 		p.skipSpace()
    256 		if p.empty() {
    257 			break
    258 		}
    259 		if !p.consume(',') {
    260 			return nil, errors.New("mail: expected comma")
    261 		}
    262 	}
    263 	return list, nil
    264 }
    265 
    266 // parseAddress parses a single RFC 5322 address at the start of p.
    267 func (p *addrParser) parseAddress() (addr *Address, err error) {
    268 	debug.Printf("parseAddress: %q", p.s)
    269 	p.skipSpace()
    270 	if p.empty() {
    271 		return nil, errors.New("mail: no address")
    272 	}
    273 
    274 	// address = name-addr / addr-spec
    275 	// TODO(dsymonds): Support parsing group address.
    276 
    277 	// addr-spec has a more restricted grammar than name-addr,
    278 	// so try parsing it first, and fallback to name-addr.
    279 	// TODO(dsymonds): Is this really correct?
    280 	spec, err := p.consumeAddrSpec()
    281 	if err == nil {
    282 		return &Address{
    283 			Address: spec,
    284 		}, err
    285 	}
    286 	debug.Printf("parseAddress: not an addr-spec: %v", err)
    287 	debug.Printf("parseAddress: state is now %q", p.s)
    288 
    289 	// display-name
    290 	var displayName string
    291 	if p.peek() != '<' {
    292 		displayName, err = p.consumePhrase()
    293 		if err != nil {
    294 			return nil, err
    295 		}
    296 	}
    297 	debug.Printf("parseAddress: displayName=%q", displayName)
    298 
    299 	// angle-addr = "<" addr-spec ">"
    300 	p.skipSpace()
    301 	if !p.consume('<') {
    302 		return nil, errors.New("mail: no angle-addr")
    303 	}
    304 	spec, err = p.consumeAddrSpec()
    305 	if err != nil {
    306 		return nil, err
    307 	}
    308 	if !p.consume('>') {
    309 		return nil, errors.New("mail: unclosed angle-addr")
    310 	}
    311 	debug.Printf("parseAddress: spec=%q", spec)
    312 
    313 	return &Address{
    314 		Name:    displayName,
    315 		Address: spec,
    316 	}, nil
    317 }
    318 
    319 // consumeAddrSpec parses a single RFC 5322 addr-spec at the start of p.
    320 func (p *addrParser) consumeAddrSpec() (spec string, err error) {
    321 	debug.Printf("consumeAddrSpec: %q", p.s)
    322 
    323 	orig := *p
    324 	defer func() {
    325 		if err != nil {
    326 			*p = orig
    327 		}
    328 	}()
    329 
    330 	// local-part = dot-atom / quoted-string
    331 	var localPart string
    332 	p.skipSpace()
    333 	if p.empty() {
    334 		return "", errors.New("mail: no addr-spec")
    335 	}
    336 	if p.peek() == '"' {
    337 		// quoted-string
    338 		debug.Printf("consumeAddrSpec: parsing quoted-string")
    339 		localPart, err = p.consumeQuotedString()
    340 	} else {
    341 		// dot-atom
    342 		debug.Printf("consumeAddrSpec: parsing dot-atom")
    343 		localPart, err = p.consumeAtom(true, false)
    344 	}
    345 	if err != nil {
    346 		debug.Printf("consumeAddrSpec: failed: %v", err)
    347 		return "", err
    348 	}
    349 
    350 	if !p.consume('@') {
    351 		return "", errors.New("mail: missing @ in addr-spec")
    352 	}
    353 
    354 	// domain = dot-atom / domain-literal
    355 	var domain string
    356 	p.skipSpace()
    357 	if p.empty() {
    358 		return "", errors.New("mail: no domain in addr-spec")
    359 	}
    360 	// TODO(dsymonds): Handle domain-literal
    361 	domain, err = p.consumeAtom(true, false)
    362 	if err != nil {
    363 		return "", err
    364 	}
    365 
    366 	return localPart + "@" + domain, nil
    367 }
    368 
    369 // consumePhrase parses the RFC 5322 phrase at the start of p.
    370 func (p *addrParser) consumePhrase() (phrase string, err error) {
    371 	debug.Printf("consumePhrase: [%s]", p.s)
    372 	// phrase = 1*word
    373 	var words []string
    374 	for {
    375 		// word = atom / quoted-string
    376 		var word string
    377 		p.skipSpace()
    378 		if p.empty() {
    379 			return "", errors.New("mail: missing phrase")
    380 		}
    381 		if p.peek() == '"' {
    382 			// quoted-string
    383 			word, err = p.consumeQuotedString()
    384 		} else {
    385 			// atom
    386 			// We actually parse dot-atom here to be more permissive
    387 			// than what RFC 5322 specifies.
    388 			word, err = p.consumeAtom(true, true)
    389 		}
    390 
    391 		if err == nil {
    392 			word, err = p.decodeRFC2047Word(word)
    393 		}
    394 
    395 		if err != nil {
    396 			break
    397 		}
    398 		debug.Printf("consumePhrase: consumed %q", word)
    399 		words = append(words, word)
    400 	}
    401 	// Ignore any error if we got at least one word.
    402 	if err != nil && len(words) == 0 {
    403 		debug.Printf("consumePhrase: hit err: %v", err)
    404 		return "", fmt.Errorf("mail: missing word in phrase: %v", err)
    405 	}
    406 	phrase = strings.Join(words, " ")
    407 	return phrase, nil
    408 }
    409 
    410 // consumeQuotedString parses the quoted string at the start of p.
    411 func (p *addrParser) consumeQuotedString() (qs string, err error) {
    412 	// Assume first byte is '"'.
    413 	i := 1
    414 	qsb := make([]byte, 0, 10)
    415 Loop:
    416 	for {
    417 		if i >= p.len() {
    418 			return "", errors.New("mail: unclosed quoted-string")
    419 		}
    420 		switch c := p.s[i]; {
    421 		case c == '"':
    422 			break Loop
    423 		case c == '\\':
    424 			if i+1 == p.len() {
    425 				return "", errors.New("mail: unclosed quoted-string")
    426 			}
    427 			qsb = append(qsb, p.s[i+1])
    428 			i += 2
    429 		case isQtext(c), c == ' ':
    430 			// qtext (printable US-ASCII excluding " and \), or
    431 			// FWS (almost; we're ignoring CRLF)
    432 			qsb = append(qsb, c)
    433 			i++
    434 		default:
    435 			return "", fmt.Errorf("mail: bad character in quoted-string: %q", c)
    436 		}
    437 	}
    438 	p.s = p.s[i+1:]
    439 	if len(qsb) == 0 {
    440 		return "", errors.New("mail: empty quoted-string")
    441 	}
    442 	return string(qsb), nil
    443 }
    444 
    445 // consumeAtom parses an RFC 5322 atom at the start of p.
    446 // If dot is true, consumeAtom parses an RFC 5322 dot-atom instead.
    447 // If permissive is true, consumeAtom will not fail on
    448 // leading/trailing/double dots in the atom (see golang.org/issue/4938).
    449 func (p *addrParser) consumeAtom(dot bool, permissive bool) (atom string, err error) {
    450 	if !isAtext(p.peek(), false) {
    451 		return "", errors.New("mail: invalid string")
    452 	}
    453 	i := 1
    454 	for ; i < p.len() && isAtext(p.s[i], dot); i++ {
    455 	}
    456 	atom, p.s = string(p.s[:i]), p.s[i:]
    457 	if !permissive {
    458 		if strings.HasPrefix(atom, ".") {
    459 			return "", errors.New("mail: leading dot in atom")
    460 		}
    461 		if strings.Contains(atom, "..") {
    462 			return "", errors.New("mail: double dot in atom")
    463 		}
    464 		if strings.HasSuffix(atom, ".") {
    465 			return "", errors.New("mail: trailing dot in atom")
    466 		}
    467 	}
    468 	return atom, nil
    469 }
    470 
    471 func (p *addrParser) consume(c byte) bool {
    472 	if p.empty() || p.peek() != c {
    473 		return false
    474 	}
    475 	p.s = p.s[1:]
    476 	return true
    477 }
    478 
    479 // skipSpace skips the leading space and tab characters.
    480 func (p *addrParser) skipSpace() {
    481 	p.s = strings.TrimLeft(p.s, " \t")
    482 }
    483 
// peek returns the next byte of input without consuming it.
// It indexes p.s[0] directly and therefore panics on empty input; callers
// must check p.empty() first.
func (p *addrParser) peek() byte {
	return p.s[0]
}
    487 
    488 func (p *addrParser) empty() bool {
    489 	return p.len() == 0
    490 }
    491 
// len returns the number of bytes of input that have not been consumed.
func (p *addrParser) len() int {
	return len(p.s)
}
    495 
    496 func (p *addrParser) decodeRFC2047Word(s string) (string, error) {
    497 	if p.dec != nil {
    498 		return p.dec.DecodeHeader(s)
    499 	}
    500 
    501 	dec, err := rfc2047Decoder.Decode(s)
    502 	if err == nil {
    503 		return dec, nil
    504 	}
    505 
    506 	if _, ok := err.(charsetError); ok {
    507 		return s, err
    508 	}
    509 
    510 	// Ignore invalid RFC 2047 encoded-word errors.
    511 	return s, nil
    512 }
    513 
    514 var rfc2047Decoder = mime.WordDecoder{
    515 	CharsetReader: func(charset string, input io.Reader) (io.Reader, error) {
    516 		return nil, charsetError(charset)
    517 	},
    518 }
    519 
    520 type charsetError string
    521 
    522 func (e charsetError) Error() string {
    523 	return fmt.Sprintf("charset not supported: %q", string(e))
    524 }
    525 
    526 var atextChars = []byte("ABCDEFGHIJKLMNOPQRSTUVWXYZ" +
    527 	"abcdefghijklmnopqrstuvwxyz" +
    528 	"0123456789" +
    529 	"!#$%&'*+-/=?^_`{|}~")
    530 
    531 // isAtext reports whether c is an RFC 5322 atext character.
    532 // If dot is true, period is included.
    533 func isAtext(c byte, dot bool) bool {
    534 	if dot && c == '.' {
    535 		return true
    536 	}
    537 	return bytes.IndexByte(atextChars, c) >= 0
    538 }
    539 
    540 // isQtext reports whether c is an RFC 5322 qtext character.
    541 func isQtext(c byte) bool {
    542 	// Printable US-ASCII, excluding backslash or quote.
    543 	if c == '\\' || c == '"' {
    544 		return false
    545 	}
    546 	return '!' <= c && c <= '~'
    547 }
    548 
    549 // quoteString renders a string as a RFC5322 quoted-string.
    550 func quoteString(s string) string {
    551 	var buf bytes.Buffer
    552 	buf.WriteByte('"')
    553 	for _, c := range s {
    554 		ch := byte(c)
    555 		if isQtext(ch) || isWSP(ch) {
    556 			buf.WriteByte(ch)
    557 		} else if isVchar(ch) {
    558 			buf.WriteByte('\\')
    559 			buf.WriteByte(ch)
    560 		}
    561 	}
    562 	buf.WriteByte('"')
    563 	return buf.String()
    564 }
    565 
    566 // isVchar reports whether c is an RFC 5322 VCHAR character.
    567 func isVchar(c byte) bool {
    568 	// Visible (printing) characters.
    569 	return '!' <= c && c <= '~'
    570 }
    571 
    572 // isWSP reports whether c is a WSP (white space).
    573 // WSP is a space or horizontal tab (RFC5234 Appendix B).
    574 func isWSP(c byte) bool {
    575 	return c == ' ' || c == '\t'
    576 }
    577