Home | History | Annotate | Download | only in mime
      1 // Copyright 2015 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 package mime
      6 
      7 import (
      8 	"bytes"
      9 	"encoding/base64"
     10 	"errors"
     11 	"fmt"
     12 	"io"
     13 	"strings"
     14 	"sync"
     15 	"unicode"
     16 	"unicode/utf8"
     17 )
     18 
     19 // A WordEncoder is an RFC 2047 encoded-word encoder.
     20 type WordEncoder byte
     21 
     22 const (
     23 	// BEncoding represents Base64 encoding scheme as defined by RFC 2045.
     24 	BEncoding = WordEncoder('b')
     25 	// QEncoding represents the Q-encoding scheme as defined by RFC 2047.
     26 	QEncoding = WordEncoder('q')
     27 )
     28 
     29 var (
     30 	errInvalidWord = errors.New("mime: invalid RFC 2047 encoded-word")
     31 )
     32 
     33 // Encode returns the encoded-word form of s. If s is ASCII without special
     34 // characters, it is returned unchanged. The provided charset is the IANA
     35 // charset name of s. It is case insensitive.
     36 func (e WordEncoder) Encode(charset, s string) string {
     37 	if !needsEncoding(s) {
     38 		return s
     39 	}
     40 	return e.encodeWord(charset, s)
     41 }
     42 
     43 func needsEncoding(s string) bool {
     44 	for _, b := range s {
     45 		if (b < ' ' || b > '~') && b != '\t' {
     46 			return true
     47 		}
     48 	}
     49 	return false
     50 }
     51 
     52 // encodeWord encodes a string into an encoded-word.
     53 func (e WordEncoder) encodeWord(charset, s string) string {
     54 	buf := getBuffer()
     55 	defer putBuffer(buf)
     56 
     57 	e.openWord(buf, charset)
     58 	if e == BEncoding {
     59 		e.bEncode(buf, charset, s)
     60 	} else {
     61 		e.qEncode(buf, charset, s)
     62 	}
     63 	closeWord(buf)
     64 
     65 	return buf.String()
     66 }
     67 
     68 const (
     69 	// The maximum length of an encoded-word is 75 characters.
     70 	// See RFC 2047, section 2.
     71 	maxEncodedWordLen = 75
     72 	// maxContentLen is how much content can be encoded, ignoring the header and
     73 	// 2-byte footer.
     74 	maxContentLen = maxEncodedWordLen - len("=?UTF-8?q?") - len("?=")
     75 )
     76 
     77 var maxBase64Len = base64.StdEncoding.DecodedLen(maxContentLen)
     78 
     79 // bEncode encodes s using base64 encoding and writes it to buf.
     80 func (e WordEncoder) bEncode(buf *bytes.Buffer, charset, s string) {
     81 	w := base64.NewEncoder(base64.StdEncoding, buf)
     82 	// If the charset is not UTF-8 or if the content is short, do not bother
     83 	// splitting the encoded-word.
     84 	if !isUTF8(charset) || base64.StdEncoding.EncodedLen(len(s)) <= maxContentLen {
     85 		io.WriteString(w, s)
     86 		w.Close()
     87 		return
     88 	}
     89 
     90 	var currentLen, last, runeLen int
     91 	for i := 0; i < len(s); i += runeLen {
     92 		// Multi-byte characters must not be split across encoded-words.
     93 		// See RFC 2047, section 5.3.
     94 		_, runeLen = utf8.DecodeRuneInString(s[i:])
     95 
     96 		if currentLen+runeLen <= maxBase64Len {
     97 			currentLen += runeLen
     98 		} else {
     99 			io.WriteString(w, s[last:i])
    100 			w.Close()
    101 			e.splitWord(buf, charset)
    102 			last = i
    103 			currentLen = runeLen
    104 		}
    105 	}
    106 	io.WriteString(w, s[last:])
    107 	w.Close()
    108 }
    109 
    110 // qEncode encodes s using Q encoding and writes it to buf. It splits the
    111 // encoded-words when necessary.
    112 func (e WordEncoder) qEncode(buf *bytes.Buffer, charset, s string) {
    113 	// We only split encoded-words when the charset is UTF-8.
    114 	if !isUTF8(charset) {
    115 		writeQString(buf, s)
    116 		return
    117 	}
    118 
    119 	var currentLen, runeLen int
    120 	for i := 0; i < len(s); i += runeLen {
    121 		b := s[i]
    122 		// Multi-byte characters must not be split across encoded-words.
    123 		// See RFC 2047, section 5.3.
    124 		var encLen int
    125 		if b >= ' ' && b <= '~' && b != '=' && b != '?' && b != '_' {
    126 			runeLen, encLen = 1, 1
    127 		} else {
    128 			_, runeLen = utf8.DecodeRuneInString(s[i:])
    129 			encLen = 3 * runeLen
    130 		}
    131 
    132 		if currentLen+encLen > maxContentLen {
    133 			e.splitWord(buf, charset)
    134 			currentLen = 0
    135 		}
    136 		writeQString(buf, s[i:i+runeLen])
    137 		currentLen += encLen
    138 	}
    139 }
    140 
    141 // writeQString encodes s using Q encoding and writes it to buf.
    142 func writeQString(buf *bytes.Buffer, s string) {
    143 	for i := 0; i < len(s); i++ {
    144 		switch b := s[i]; {
    145 		case b == ' ':
    146 			buf.WriteByte('_')
    147 		case b >= '!' && b <= '~' && b != '=' && b != '?' && b != '_':
    148 			buf.WriteByte(b)
    149 		default:
    150 			buf.WriteByte('=')
    151 			buf.WriteByte(upperhex[b>>4])
    152 			buf.WriteByte(upperhex[b&0x0f])
    153 		}
    154 	}
    155 }
    156 
    157 // openWord writes the beginning of an encoded-word into buf.
    158 func (e WordEncoder) openWord(buf *bytes.Buffer, charset string) {
    159 	buf.WriteString("=?")
    160 	buf.WriteString(charset)
    161 	buf.WriteByte('?')
    162 	buf.WriteByte(byte(e))
    163 	buf.WriteByte('?')
    164 }
    165 
    166 // closeWord writes the end of an encoded-word into buf.
    167 func closeWord(buf *bytes.Buffer) {
    168 	buf.WriteString("?=")
    169 }
    170 
    171 // splitWord closes the current encoded-word and opens a new one.
    172 func (e WordEncoder) splitWord(buf *bytes.Buffer, charset string) {
    173 	closeWord(buf)
    174 	buf.WriteByte(' ')
    175 	e.openWord(buf, charset)
    176 }
    177 
    178 func isUTF8(charset string) bool {
    179 	return strings.EqualFold(charset, "UTF-8")
    180 }
    181 
// upperhex maps a 4-bit value to its upper-case hexadecimal digit.
// writeQString indexes it with the high and low nibble of each byte it
// escapes.
const upperhex = "0123456789ABCDEF"
    183 
// A WordDecoder decodes MIME headers containing RFC 2047 encoded-words.
type WordDecoder struct {
	// CharsetReader, if non-nil, defines a function to generate
	// charset-conversion readers, converting from the provided
	// charset into UTF-8.
	// The charset name is lower-cased before it is passed to CharsetReader.
	// The utf-8, iso-8859-1 and us-ascii charsets (matched case-insensitively)
	// are handled by default and never reach CharsetReader.
	// When CharsetReader is nil, decoding a word in any other charset fails
	// with an "unhandled charset" error.
	// One of the CharsetReader's result values must be non-nil.
	CharsetReader func(charset string, input io.Reader) (io.Reader, error)
}
    194 
    195 // Decode decodes an RFC 2047 encoded-word.
    196 func (d *WordDecoder) Decode(word string) (string, error) {
    197 	// See https://tools.ietf.org/html/rfc2047#section-2 for details.
    198 	// Our decoder is permissive, we accept empty encoded-text.
    199 	if len(word) < 8 || !strings.HasPrefix(word, "=?") || !strings.HasSuffix(word, "?=") || strings.Count(word, "?") != 4 {
    200 		return "", errInvalidWord
    201 	}
    202 	word = word[2 : len(word)-2]
    203 
    204 	// split delimits the first 2 fields
    205 	split := strings.IndexByte(word, '?')
    206 
    207 	// split word "UTF-8?q?ascii" into "UTF-8", 'q', and "ascii"
    208 	charset := word[:split]
    209 	if len(charset) == 0 {
    210 		return "", errInvalidWord
    211 	}
    212 	if len(word) < split+3 {
    213 		return "", errInvalidWord
    214 	}
    215 	encoding := word[split+1]
    216 	// the field after split must only be one byte
    217 	if word[split+2] != '?' {
    218 		return "", errInvalidWord
    219 	}
    220 	text := word[split+3:]
    221 
    222 	content, err := decode(encoding, text)
    223 	if err != nil {
    224 		return "", err
    225 	}
    226 
    227 	buf := getBuffer()
    228 	defer putBuffer(buf)
    229 
    230 	if err := d.convert(buf, charset, content); err != nil {
    231 		return "", err
    232 	}
    233 
    234 	return buf.String(), nil
    235 }
    236 
    237 // DecodeHeader decodes all encoded-words of the given string. It returns an
    238 // error if and only if CharsetReader of d returns an error.
    239 func (d *WordDecoder) DecodeHeader(header string) (string, error) {
    240 	// If there is no encoded-word, returns before creating a buffer.
    241 	i := strings.Index(header, "=?")
    242 	if i == -1 {
    243 		return header, nil
    244 	}
    245 
    246 	buf := getBuffer()
    247 	defer putBuffer(buf)
    248 
    249 	buf.WriteString(header[:i])
    250 	header = header[i:]
    251 
    252 	betweenWords := false
    253 	for {
    254 		start := strings.Index(header, "=?")
    255 		if start == -1 {
    256 			break
    257 		}
    258 		cur := start + len("=?")
    259 
    260 		i := strings.Index(header[cur:], "?")
    261 		if i == -1 {
    262 			break
    263 		}
    264 		charset := header[cur : cur+i]
    265 		cur += i + len("?")
    266 
    267 		if len(header) < cur+len("Q??=") {
    268 			break
    269 		}
    270 		encoding := header[cur]
    271 		cur++
    272 
    273 		if header[cur] != '?' {
    274 			break
    275 		}
    276 		cur++
    277 
    278 		j := strings.Index(header[cur:], "?=")
    279 		if j == -1 {
    280 			break
    281 		}
    282 		text := header[cur : cur+j]
    283 		end := cur + j + len("?=")
    284 
    285 		content, err := decode(encoding, text)
    286 		if err != nil {
    287 			betweenWords = false
    288 			buf.WriteString(header[:start+2])
    289 			header = header[start+2:]
    290 			continue
    291 		}
    292 
    293 		// Write characters before the encoded-word. White-space and newline
    294 		// characters separating two encoded-words must be deleted.
    295 		if start > 0 && (!betweenWords || hasNonWhitespace(header[:start])) {
    296 			buf.WriteString(header[:start])
    297 		}
    298 
    299 		if err := d.convert(buf, charset, content); err != nil {
    300 			return "", err
    301 		}
    302 
    303 		header = header[end:]
    304 		betweenWords = true
    305 	}
    306 
    307 	if len(header) > 0 {
    308 		buf.WriteString(header)
    309 	}
    310 
    311 	return buf.String(), nil
    312 }
    313 
    314 func decode(encoding byte, text string) ([]byte, error) {
    315 	switch encoding {
    316 	case 'B', 'b':
    317 		return base64.StdEncoding.DecodeString(text)
    318 	case 'Q', 'q':
    319 		return qDecode(text)
    320 	default:
    321 		return nil, errInvalidWord
    322 	}
    323 }
    324 
    325 func (d *WordDecoder) convert(buf *bytes.Buffer, charset string, content []byte) error {
    326 	switch {
    327 	case strings.EqualFold("utf-8", charset):
    328 		buf.Write(content)
    329 	case strings.EqualFold("iso-8859-1", charset):
    330 		for _, c := range content {
    331 			buf.WriteRune(rune(c))
    332 		}
    333 	case strings.EqualFold("us-ascii", charset):
    334 		for _, c := range content {
    335 			if c >= utf8.RuneSelf {
    336 				buf.WriteRune(unicode.ReplacementChar)
    337 			} else {
    338 				buf.WriteByte(c)
    339 			}
    340 		}
    341 	default:
    342 		if d.CharsetReader == nil {
    343 			return fmt.Errorf("mime: unhandled charset %q", charset)
    344 		}
    345 		r, err := d.CharsetReader(strings.ToLower(charset), bytes.NewReader(content))
    346 		if err != nil {
    347 			return err
    348 		}
    349 		if _, err = buf.ReadFrom(r); err != nil {
    350 			return err
    351 		}
    352 	}
    353 	return nil
    354 }
    355 
    356 // hasNonWhitespace reports whether s (assumed to be ASCII) contains at least
    357 // one byte of non-whitespace.
    358 func hasNonWhitespace(s string) bool {
    359 	for _, b := range s {
    360 		switch b {
    361 		// Encoded-words can only be separated by linear white spaces which does
    362 		// not include vertical tabs (\v).
    363 		case ' ', '\t', '\n', '\r':
    364 		default:
    365 			return true
    366 		}
    367 	}
    368 	return false
    369 }
    370 
    371 // qDecode decodes a Q encoded string.
    372 func qDecode(s string) ([]byte, error) {
    373 	dec := make([]byte, len(s))
    374 	n := 0
    375 	for i := 0; i < len(s); i++ {
    376 		switch c := s[i]; {
    377 		case c == '_':
    378 			dec[n] = ' '
    379 		case c == '=':
    380 			if i+2 >= len(s) {
    381 				return nil, errInvalidWord
    382 			}
    383 			b, err := readHexByte(s[i+1], s[i+2])
    384 			if err != nil {
    385 				return nil, err
    386 			}
    387 			dec[n] = b
    388 			i += 2
    389 		case (c <= '~' && c >= ' ') || c == '\n' || c == '\r' || c == '\t':
    390 			dec[n] = c
    391 		default:
    392 			return nil, errInvalidWord
    393 		}
    394 		n++
    395 	}
    396 
    397 	return dec[:n], nil
    398 }
    399 
    400 // readHexByte returns the byte from its quoted-printable representation.
    401 func readHexByte(a, b byte) (byte, error) {
    402 	var hb, lb byte
    403 	var err error
    404 	if hb, err = fromHex(a); err != nil {
    405 		return 0, err
    406 	}
    407 	if lb, err = fromHex(b); err != nil {
    408 		return 0, err
    409 	}
    410 	return hb<<4 | lb, nil
    411 }
    412 
    413 func fromHex(b byte) (byte, error) {
    414 	switch {
    415 	case b >= '0' && b <= '9':
    416 		return b - '0', nil
    417 	case b >= 'A' && b <= 'F':
    418 		return b - 'A' + 10, nil
    419 	// Accept badly encoded bytes.
    420 	case b >= 'a' && b <= 'f':
    421 		return b - 'a' + 10, nil
    422 	}
    423 	return 0, fmt.Errorf("mime: invalid hex byte %#02x", b)
    424 }
    425 
    426 var bufPool = sync.Pool{
    427 	New: func() interface{} {
    428 		return new(bytes.Buffer)
    429 	},
    430 }
    431 
    432 func getBuffer() *bytes.Buffer {
    433 	return bufPool.Get().(*bytes.Buffer)
    434 }
    435 
    436 func putBuffer(buf *bytes.Buffer) {
    437 	if buf.Len() > 1024 {
    438 		return
    439 	}
    440 	buf.Reset()
    441 	bufPool.Put(buf)
    442 }
    443