Home | History | Annotate | Download | only in template
      1 // Copyright 2011 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 package template
      6 
      7 import (
      8 	"bytes"
      9 	"strings"
     10 )
     11 
// transitionFunc is the array of context transition functions for text nodes.
// A transition function takes a context and template text input, and returns
// the updated context and the number of bytes consumed from the front of the
// input.
//
// The array is indexed by state. Several states share a single function;
// where the states must be told apart the function switches on c.state
// (for example tJSDelimited, tBlockCmt, tLineCmt and tCSSStr).
var transitionFunc = [...]func(context, []byte) (context, int){
	stateText:        tText,
	stateTag:         tTag,
	stateAttrName:    tAttrName,
	stateAfterName:   tAfterName,
	stateBeforeValue: tBeforeValue,
	stateHTMLCmt:     tHTMLCmt,
	stateRCDATA:      tSpecialTagEnd,
	stateAttr:        tAttr,
	stateURL:         tURL,
	stateSrcset:      tURL,
	stateJS:          tJS,
	stateJSDqStr:     tJSDelimited,
	stateJSSqStr:     tJSDelimited,
	stateJSRegexp:    tJSDelimited,
	stateJSBlockCmt:  tBlockCmt,
	stateJSLineCmt:   tLineCmt,
	stateCSS:         tCSS,
	stateCSSDqStr:    tCSSStr,
	stateCSSSqStr:    tCSSStr,
	stateCSSDqURL:    tCSSStr,
	stateCSSSqURL:    tCSSStr,
	stateCSSURL:      tCSSStr,
	stateCSSBlockCmt: tBlockCmt,
	stateCSSLineCmt:  tLineCmt,
	stateError:       tError,
}
     43 
     44 var commentStart = []byte("<!--")
     45 var commentEnd = []byte("-->")
     46 
     47 // tText is the context transition function for the text state.
     48 func tText(c context, s []byte) (context, int) {
     49 	k := 0
     50 	for {
     51 		i := k + bytes.IndexByte(s[k:], '<')
     52 		if i < k || i+1 == len(s) {
     53 			return c, len(s)
     54 		} else if i+4 <= len(s) && bytes.Equal(commentStart, s[i:i+4]) {
     55 			return context{state: stateHTMLCmt}, i + 4
     56 		}
     57 		i++
     58 		end := false
     59 		if s[i] == '/' {
     60 			if i+1 == len(s) {
     61 				return c, len(s)
     62 			}
     63 			end, i = true, i+1
     64 		}
     65 		j, e := eatTagName(s, i)
     66 		if j != i {
     67 			if end {
     68 				e = elementNone
     69 			}
     70 			// We've found an HTML tag.
     71 			return context{state: stateTag, element: e}, j
     72 		}
     73 		k = j
     74 	}
     75 }
     76 
// elementContentType maps an element type to the state that tTag enters
// once the '>' closing that element's tag has been consumed.
var elementContentType = [...]state{
	elementNone:     stateText,
	elementScript:   stateJS,
	elementStyle:    stateCSS,
	elementTextarea: stateRCDATA,
	elementTitle:    stateRCDATA,
}
     84 
     85 // tTag is the context transition function for the tag state.
     86 func tTag(c context, s []byte) (context, int) {
     87 	// Find the attribute name.
     88 	i := eatWhiteSpace(s, 0)
     89 	if i == len(s) {
     90 		return c, len(s)
     91 	}
     92 	if s[i] == '>' {
     93 		return context{
     94 			state:   elementContentType[c.element],
     95 			element: c.element,
     96 		}, i + 1
     97 	}
     98 	j, err := eatAttrName(s, i)
     99 	if err != nil {
    100 		return context{state: stateError, err: err}, len(s)
    101 	}
    102 	state, attr := stateTag, attrNone
    103 	if i == j {
    104 		return context{
    105 			state: stateError,
    106 			err:   errorf(ErrBadHTML, nil, 0, "expected space, attr name, or end of tag, but got %q", s[i:]),
    107 		}, len(s)
    108 	}
    109 
    110 	attrName := strings.ToLower(string(s[i:j]))
    111 	if c.element == elementScript && attrName == "type" {
    112 		attr = attrScriptType
    113 	} else {
    114 		switch attrType(attrName) {
    115 		case contentTypeURL:
    116 			attr = attrURL
    117 		case contentTypeCSS:
    118 			attr = attrStyle
    119 		case contentTypeJS:
    120 			attr = attrScript
    121 		case contentTypeSrcset:
    122 			attr = attrSrcset
    123 		}
    124 	}
    125 
    126 	if j == len(s) {
    127 		state = stateAttrName
    128 	} else {
    129 		state = stateAfterName
    130 	}
    131 	return context{state: state, element: c.element, attr: attr}, j
    132 }
    133 
    134 // tAttrName is the context transition function for stateAttrName.
    135 func tAttrName(c context, s []byte) (context, int) {
    136 	i, err := eatAttrName(s, 0)
    137 	if err != nil {
    138 		return context{state: stateError, err: err}, len(s)
    139 	} else if i != len(s) {
    140 		c.state = stateAfterName
    141 	}
    142 	return c, i
    143 }
    144 
    145 // tAfterName is the context transition function for stateAfterName.
    146 func tAfterName(c context, s []byte) (context, int) {
    147 	// Look for the start of the value.
    148 	i := eatWhiteSpace(s, 0)
    149 	if i == len(s) {
    150 		return c, len(s)
    151 	} else if s[i] != '=' {
    152 		// Occurs due to tag ending '>', and valueless attribute.
    153 		c.state = stateTag
    154 		return c, i
    155 	}
    156 	c.state = stateBeforeValue
    157 	// Consume the "=".
    158 	return c, i + 1
    159 }
    160 
// attrStartStates maps an attribute kind to the state that tBeforeValue
// enters at the start of that attribute's value.
var attrStartStates = [...]state{
	attrNone:       stateAttr,
	attrScript:     stateJS,
	attrScriptType: stateAttr,
	attrStyle:      stateCSS,
	attrURL:        stateURL,
	attrSrcset:     stateSrcset,
}
    169 
    170 // tBeforeValue is the context transition function for stateBeforeValue.
    171 func tBeforeValue(c context, s []byte) (context, int) {
    172 	i := eatWhiteSpace(s, 0)
    173 	if i == len(s) {
    174 		return c, len(s)
    175 	}
    176 	// Find the attribute delimiter.
    177 	delim := delimSpaceOrTagEnd
    178 	switch s[i] {
    179 	case '\'':
    180 		delim, i = delimSingleQuote, i+1
    181 	case '"':
    182 		delim, i = delimDoubleQuote, i+1
    183 	}
    184 	c.state, c.delim = attrStartStates[c.attr], delim
    185 	return c, i
    186 }
    187 
    188 // tHTMLCmt is the context transition function for stateHTMLCmt.
    189 func tHTMLCmt(c context, s []byte) (context, int) {
    190 	if i := bytes.Index(s, commentEnd); i != -1 {
    191 		return context{}, i + 3
    192 	}
    193 	return c, len(s)
    194 }
    195 
// specialTagEndMarkers maps element types to the character sequence that
// case-insensitively signals the end of the special tag body.
// The entries are bare tag names; indexTagEnd looks for them immediately
// after specialTagEndPrefix.
var specialTagEndMarkers = [...][]byte{
	elementScript:   []byte("script"),
	elementStyle:    []byte("style"),
	elementTextarea: []byte("textarea"),
	elementTitle:    []byte("title"),
}
    204 
var (
	// specialTagEndPrefix is the sequence that opens an end tag.
	specialTagEndPrefix = []byte("</")
	// tagEndSeparators lists the bytes that indexTagEnd accepts directly
	// after a tag name for the end tag to count as a match.
	tagEndSeparators    = []byte("> \t\n\f/")
)
    209 
    210 // tSpecialTagEnd is the context transition function for raw text and RCDATA
    211 // element states.
    212 func tSpecialTagEnd(c context, s []byte) (context, int) {
    213 	if c.element != elementNone {
    214 		if i := indexTagEnd(s, specialTagEndMarkers[c.element]); i != -1 {
    215 			return context{}, i
    216 		}
    217 	}
    218 	return c, len(s)
    219 }
    220 
    221 // indexTagEnd finds the index of a special tag end in a case insensitive way, or returns -1
    222 func indexTagEnd(s []byte, tag []byte) int {
    223 	res := 0
    224 	plen := len(specialTagEndPrefix)
    225 	for len(s) > 0 {
    226 		// Try to find the tag end prefix first
    227 		i := bytes.Index(s, specialTagEndPrefix)
    228 		if i == -1 {
    229 			return i
    230 		}
    231 		s = s[i+plen:]
    232 		// Try to match the actual tag if there is still space for it
    233 		if len(tag) <= len(s) && bytes.EqualFold(tag, s[:len(tag)]) {
    234 			s = s[len(tag):]
    235 			// Check the tag is followed by a proper separator
    236 			if len(s) > 0 && bytes.IndexByte(tagEndSeparators, s[0]) != -1 {
    237 				return res + i
    238 			}
    239 			res += len(tag)
    240 		}
    241 		res += i + plen
    242 	}
    243 	return -1
    244 }
    245 
// tAttr is the context transition function for the attribute state.
// It consumes all of s and returns the context unchanged.
func tAttr(c context, s []byte) (context, int) {
	return c, len(s)
}
    250 
    251 // tURL is the context transition function for the URL state.
    252 func tURL(c context, s []byte) (context, int) {
    253 	if bytes.ContainsAny(s, "#?") {
    254 		c.urlPart = urlPartQueryOrFrag
    255 	} else if len(s) != eatWhiteSpace(s, 0) && c.urlPart == urlPartNone {
    256 		// HTML5 uses "Valid URL potentially surrounded by spaces" for
    257 		// attrs: http://www.w3.org/TR/html5/index.html#attributes-1
    258 		c.urlPart = urlPartPreQuery
    259 	}
    260 	return c, len(s)
    261 }
    262 
    263 // tJS is the context transition function for the JS state.
    264 func tJS(c context, s []byte) (context, int) {
    265 	i := bytes.IndexAny(s, `"'/`)
    266 	if i == -1 {
    267 		// Entire input is non string, comment, regexp tokens.
    268 		c.jsCtx = nextJSCtx(s, c.jsCtx)
    269 		return c, len(s)
    270 	}
    271 	c.jsCtx = nextJSCtx(s[:i], c.jsCtx)
    272 	switch s[i] {
    273 	case '"':
    274 		c.state, c.jsCtx = stateJSDqStr, jsCtxRegexp
    275 	case '\'':
    276 		c.state, c.jsCtx = stateJSSqStr, jsCtxRegexp
    277 	case '/':
    278 		switch {
    279 		case i+1 < len(s) && s[i+1] == '/':
    280 			c.state, i = stateJSLineCmt, i+1
    281 		case i+1 < len(s) && s[i+1] == '*':
    282 			c.state, i = stateJSBlockCmt, i+1
    283 		case c.jsCtx == jsCtxRegexp:
    284 			c.state = stateJSRegexp
    285 		case c.jsCtx == jsCtxDivOp:
    286 			c.jsCtx = jsCtxRegexp
    287 		default:
    288 			return context{
    289 				state: stateError,
    290 				err:   errorf(ErrSlashAmbig, nil, 0, "'/' could start a division or regexp: %.32q", s[i:]),
    291 			}, len(s)
    292 		}
    293 	default:
    294 		panic("unreachable")
    295 	}
    296 	return c, i + 1
    297 }
    298 
    299 // tJSDelimited is the context transition function for the JS string and regexp
    300 // states.
    301 func tJSDelimited(c context, s []byte) (context, int) {
    302 	specials := `\"`
    303 	switch c.state {
    304 	case stateJSSqStr:
    305 		specials = `\'`
    306 	case stateJSRegexp:
    307 		specials = `\/[]`
    308 	}
    309 
    310 	k, inCharset := 0, false
    311 	for {
    312 		i := k + bytes.IndexAny(s[k:], specials)
    313 		if i < k {
    314 			break
    315 		}
    316 		switch s[i] {
    317 		case '\\':
    318 			i++
    319 			if i == len(s) {
    320 				return context{
    321 					state: stateError,
    322 					err:   errorf(ErrPartialEscape, nil, 0, "unfinished escape sequence in JS string: %q", s),
    323 				}, len(s)
    324 			}
    325 		case '[':
    326 			inCharset = true
    327 		case ']':
    328 			inCharset = false
    329 		default:
    330 			// end delimiter
    331 			if !inCharset {
    332 				c.state, c.jsCtx = stateJS, jsCtxDivOp
    333 				return c, i + 1
    334 			}
    335 		}
    336 		k = i + 1
    337 	}
    338 
    339 	if inCharset {
    340 		// This can be fixed by making context richer if interpolation
    341 		// into charsets is desired.
    342 		return context{
    343 			state: stateError,
    344 			err:   errorf(ErrPartialCharset, nil, 0, "unfinished JS regexp charset: %q", s),
    345 		}, len(s)
    346 	}
    347 
    348 	return c, len(s)
    349 }
    350 
// blockCommentEnd is the sequence that tBlockCmt searches for to find the
// end of a /*comment*/.
var blockCommentEnd = []byte("*/")
    352 
    353 // tBlockCmt is the context transition function for /*comment*/ states.
    354 func tBlockCmt(c context, s []byte) (context, int) {
    355 	i := bytes.Index(s, blockCommentEnd)
    356 	if i == -1 {
    357 		return c, len(s)
    358 	}
    359 	switch c.state {
    360 	case stateJSBlockCmt:
    361 		c.state = stateJS
    362 	case stateCSSBlockCmt:
    363 		c.state = stateCSS
    364 	default:
    365 		panic(c.state.String())
    366 	}
    367 	return c, i + 2
    368 }
    369 
    370 // tLineCmt is the context transition function for //comment states.
    371 func tLineCmt(c context, s []byte) (context, int) {
    372 	var lineTerminators string
    373 	var endState state
    374 	switch c.state {
    375 	case stateJSLineCmt:
    376 		lineTerminators, endState = "\n\r\u2028\u2029", stateJS
    377 	case stateCSSLineCmt:
    378 		lineTerminators, endState = "\n\f\r", stateCSS
    379 		// Line comments are not part of any published CSS standard but
    380 		// are supported by the 4 major browsers.
    381 		// This defines line comments as
    382 		//     LINECOMMENT ::= "//" [^\n\f\d]*
    383 		// since http://www.w3.org/TR/css3-syntax/#SUBTOK-nl defines
    384 		// newlines:
    385 		//     nl ::= #xA | #xD #xA | #xD | #xC
    386 	default:
    387 		panic(c.state.String())
    388 	}
    389 
    390 	i := bytes.IndexAny(s, lineTerminators)
    391 	if i == -1 {
    392 		return c, len(s)
    393 	}
    394 	c.state = endState
    395 	// Per section 7.4 of EcmaScript 5 : http://es5.github.com/#x7.4
    396 	// "However, the LineTerminator at the end of the line is not
    397 	// considered to be part of the single-line comment; it is
    398 	// recognized separately by the lexical grammar and becomes part
    399 	// of the stream of input elements for the syntactic grammar."
    400 	return c, i
    401 }
    402 
    403 // tCSS is the context transition function for the CSS state.
    404 func tCSS(c context, s []byte) (context, int) {
    405 	// CSS quoted strings are almost never used except for:
    406 	// (1) URLs as in background: "/foo.png"
    407 	// (2) Multiword font-names as in font-family: "Times New Roman"
    408 	// (3) List separators in content values as in inline-lists:
    409 	//    <style>
    410 	//    ul.inlineList { list-style: none; padding:0 }
    411 	//    ul.inlineList > li { display: inline }
    412 	//    ul.inlineList > li:before { content: ", " }
    413 	//    ul.inlineList > li:first-child:before { content: "" }
    414 	//    </style>
    415 	//    <ul class=inlineList><li>One<li>Two<li>Three</ul>
    416 	// (4) Attribute value selectors as in a[href="http://example.com/"]
    417 	//
    418 	// We conservatively treat all strings as URLs, but make some
    419 	// allowances to avoid confusion.
    420 	//
    421 	// In (1), our conservative assumption is justified.
    422 	// In (2), valid font names do not contain ':', '?', or '#', so our
    423 	// conservative assumption is fine since we will never transition past
    424 	// urlPartPreQuery.
    425 	// In (3), our protocol heuristic should not be tripped, and there
    426 	// should not be non-space content after a '?' or '#', so as long as
    427 	// we only %-encode RFC 3986 reserved characters we are ok.
    428 	// In (4), we should URL escape for URL attributes, and for others we
    429 	// have the attribute name available if our conservative assumption
    430 	// proves problematic for real code.
    431 
    432 	k := 0
    433 	for {
    434 		i := k + bytes.IndexAny(s[k:], `("'/`)
    435 		if i < k {
    436 			return c, len(s)
    437 		}
    438 		switch s[i] {
    439 		case '(':
    440 			// Look for url to the left.
    441 			p := bytes.TrimRight(s[:i], "\t\n\f\r ")
    442 			if endsWithCSSKeyword(p, "url") {
    443 				j := len(s) - len(bytes.TrimLeft(s[i+1:], "\t\n\f\r "))
    444 				switch {
    445 				case j != len(s) && s[j] == '"':
    446 					c.state, j = stateCSSDqURL, j+1
    447 				case j != len(s) && s[j] == '\'':
    448 					c.state, j = stateCSSSqURL, j+1
    449 				default:
    450 					c.state = stateCSSURL
    451 				}
    452 				return c, j
    453 			}
    454 		case '/':
    455 			if i+1 < len(s) {
    456 				switch s[i+1] {
    457 				case '/':
    458 					c.state = stateCSSLineCmt
    459 					return c, i + 2
    460 				case '*':
    461 					c.state = stateCSSBlockCmt
    462 					return c, i + 2
    463 				}
    464 			}
    465 		case '"':
    466 			c.state = stateCSSDqStr
    467 			return c, i + 1
    468 		case '\'':
    469 			c.state = stateCSSSqStr
    470 			return c, i + 1
    471 		}
    472 		k = i + 1
    473 	}
    474 }
    475 
    476 // tCSSStr is the context transition function for the CSS string and URL states.
    477 func tCSSStr(c context, s []byte) (context, int) {
    478 	var endAndEsc string
    479 	switch c.state {
    480 	case stateCSSDqStr, stateCSSDqURL:
    481 		endAndEsc = `\"`
    482 	case stateCSSSqStr, stateCSSSqURL:
    483 		endAndEsc = `\'`
    484 	case stateCSSURL:
    485 		// Unquoted URLs end with a newline or close parenthesis.
    486 		// The below includes the wc (whitespace character) and nl.
    487 		endAndEsc = "\\\t\n\f\r )"
    488 	default:
    489 		panic(c.state.String())
    490 	}
    491 
    492 	k := 0
    493 	for {
    494 		i := k + bytes.IndexAny(s[k:], endAndEsc)
    495 		if i < k {
    496 			c, nread := tURL(c, decodeCSS(s[k:]))
    497 			return c, k + nread
    498 		}
    499 		if s[i] == '\\' {
    500 			i++
    501 			if i == len(s) {
    502 				return context{
    503 					state: stateError,
    504 					err:   errorf(ErrPartialEscape, nil, 0, "unfinished escape sequence in CSS string: %q", s),
    505 				}, len(s)
    506 			}
    507 		} else {
    508 			c.state = stateCSS
    509 			return c, i + 1
    510 		}
    511 		c, _ = tURL(c, decodeCSS(s[:i+1]))
    512 		k = i + 1
    513 	}
    514 }
    515 
// tError is the context transition function for the error state.
// It consumes all of s and returns the context unchanged.
func tError(c context, s []byte) (context, int) {
	return c, len(s)
}
    520 
    521 // eatAttrName returns the largest j such that s[i:j] is an attribute name.
    522 // It returns an error if s[i:] does not look like it begins with an
    523 // attribute name, such as encountering a quote mark without a preceding
    524 // equals sign.
    525 func eatAttrName(s []byte, i int) (int, *Error) {
    526 	for j := i; j < len(s); j++ {
    527 		switch s[j] {
    528 		case ' ', '\t', '\n', '\f', '\r', '=', '>':
    529 			return j, nil
    530 		case '\'', '"', '<':
    531 			// These result in a parse warning in HTML5 and are
    532 			// indicative of serious problems if seen in an attr
    533 			// name in a template.
    534 			return -1, errorf(ErrBadHTML, nil, 0, "%q in attribute name: %.32q", s[j:j+1], s)
    535 		default:
    536 			// No-op.
    537 		}
    538 	}
    539 	return len(s), nil
    540 }
    541 
// elementNameMap maps lower-case tag names to their element type.
// eatTagName lower-cases a tag name before looking it up here; names that
// are absent yield the map's zero value.
var elementNameMap = map[string]element{
	"script":   elementScript,
	"style":    elementStyle,
	"textarea": elementTextarea,
	"title":    elementTitle,
}
    548 
    549 // asciiAlpha reports whether c is an ASCII letter.
    550 func asciiAlpha(c byte) bool {
    551 	return 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'
    552 }
    553 
    554 // asciiAlphaNum reports whether c is an ASCII letter or digit.
    555 func asciiAlphaNum(c byte) bool {
    556 	return asciiAlpha(c) || '0' <= c && c <= '9'
    557 }
    558 
    559 // eatTagName returns the largest j such that s[i:j] is a tag name and the tag type.
    560 func eatTagName(s []byte, i int) (int, element) {
    561 	if i == len(s) || !asciiAlpha(s[i]) {
    562 		return i, elementNone
    563 	}
    564 	j := i + 1
    565 	for j < len(s) {
    566 		x := s[j]
    567 		if asciiAlphaNum(x) {
    568 			j++
    569 			continue
    570 		}
    571 		// Allow "x-y" or "x:y" but not "x-", "-y", or "x--y".
    572 		if (x == ':' || x == '-') && j+1 < len(s) && asciiAlphaNum(s[j+1]) {
    573 			j += 2
    574 			continue
    575 		}
    576 		break
    577 	}
    578 	return j, elementNameMap[strings.ToLower(string(s[i:j]))]
    579 }
    580 
    581 // eatWhiteSpace returns the largest j such that s[i:j] is white space.
    582 func eatWhiteSpace(s []byte, i int) int {
    583 	for j := i; j < len(s); j++ {
    584 		switch s[j] {
    585 		case ' ', '\t', '\n', '\f', '\r':
    586 			// No-op.
    587 		default:
    588 			return j
    589 		}
    590 	}
    591 	return len(s)
    592 }
    593