Home | History | Annotate | Download | only in template
      1 // Copyright 2011 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 package template
      6 
      7 import (
      8 	"bytes"
      9 	"strings"
     10 )
     11 
// transitionFunc is the array of context transition functions for text nodes.
// It is keyed by the state constants, so the function that handles a given
// state is transitionFunc[thatState]. Several states share one function
// (for example, every CSS string and URL state is handled by tCSSStr).
//
// A transition function takes a context and template text input, and returns
// the updated context and the number of bytes consumed from the front of the
// input.
var transitionFunc = [...]func(context, []byte) (context, int){
	stateText:        tText,
	stateTag:         tTag,
	stateAttrName:    tAttrName,
	stateAfterName:   tAfterName,
	stateBeforeValue: tBeforeValue,
	stateHTMLCmt:     tHTMLCmt,
	stateRCDATA:      tSpecialTagEnd,
	stateAttr:        tAttr,
	stateURL:         tURL,
	stateJS:          tJS,
	stateJSDqStr:     tJSDelimited,
	stateJSSqStr:     tJSDelimited,
	stateJSRegexp:    tJSDelimited,
	stateJSBlockCmt:  tBlockCmt,
	stateJSLineCmt:   tLineCmt,
	stateCSS:         tCSS,
	stateCSSDqStr:    tCSSStr,
	stateCSSSqStr:    tCSSStr,
	stateCSSDqURL:    tCSSStr,
	stateCSSSqURL:    tCSSStr,
	stateCSSURL:      tCSSStr,
	stateCSSBlockCmt: tBlockCmt,
	stateCSSLineCmt:  tLineCmt,
	stateError:       tError,
}
     42 
     43 var commentStart = []byte("<!--")
     44 var commentEnd = []byte("-->")
     45 
     46 // tText is the context transition function for the text state.
     47 func tText(c context, s []byte) (context, int) {
     48 	k := 0
     49 	for {
     50 		i := k + bytes.IndexByte(s[k:], '<')
     51 		if i < k || i+1 == len(s) {
     52 			return c, len(s)
     53 		} else if i+4 <= len(s) && bytes.Equal(commentStart, s[i:i+4]) {
     54 			return context{state: stateHTMLCmt}, i + 4
     55 		}
     56 		i++
     57 		end := false
     58 		if s[i] == '/' {
     59 			if i+1 == len(s) {
     60 				return c, len(s)
     61 			}
     62 			end, i = true, i+1
     63 		}
     64 		j, e := eatTagName(s, i)
     65 		if j != i {
     66 			if end {
     67 				e = elementNone
     68 			}
     69 			// We've found an HTML tag.
     70 			return context{state: stateTag, element: e}, j
     71 		}
     72 		k = j
     73 	}
     74 }
     75 
// elementContentType is keyed by element and gives the state that tTag
// switches to once the '>' closing that element's tag has been consumed,
// i.e. the state used for the element's body.
var elementContentType = [...]state{
	elementNone:     stateText,
	elementScript:   stateJS,
	elementStyle:    stateCSS,
	elementTextarea: stateRCDATA,
	elementTitle:    stateRCDATA,
}
     83 
     84 // tTag is the context transition function for the tag state.
     85 func tTag(c context, s []byte) (context, int) {
     86 	// Find the attribute name.
     87 	i := eatWhiteSpace(s, 0)
     88 	if i == len(s) {
     89 		return c, len(s)
     90 	}
     91 	if s[i] == '>' {
     92 		return context{
     93 			state:   elementContentType[c.element],
     94 			element: c.element,
     95 		}, i + 1
     96 	}
     97 	j, err := eatAttrName(s, i)
     98 	if err != nil {
     99 		return context{state: stateError, err: err}, len(s)
    100 	}
    101 	state, attr := stateTag, attrNone
    102 	if i == j {
    103 		return context{
    104 			state: stateError,
    105 			err:   errorf(ErrBadHTML, nil, 0, "expected space, attr name, or end of tag, but got %q", s[i:]),
    106 		}, len(s)
    107 	}
    108 	switch attrType(string(s[i:j])) {
    109 	case contentTypeURL:
    110 		attr = attrURL
    111 	case contentTypeCSS:
    112 		attr = attrStyle
    113 	case contentTypeJS:
    114 		attr = attrScript
    115 	}
    116 	if j == len(s) {
    117 		state = stateAttrName
    118 	} else {
    119 		state = stateAfterName
    120 	}
    121 	return context{state: state, element: c.element, attr: attr}, j
    122 }
    123 
    124 // tAttrName is the context transition function for stateAttrName.
    125 func tAttrName(c context, s []byte) (context, int) {
    126 	i, err := eatAttrName(s, 0)
    127 	if err != nil {
    128 		return context{state: stateError, err: err}, len(s)
    129 	} else if i != len(s) {
    130 		c.state = stateAfterName
    131 	}
    132 	return c, i
    133 }
    134 
    135 // tAfterName is the context transition function for stateAfterName.
    136 func tAfterName(c context, s []byte) (context, int) {
    137 	// Look for the start of the value.
    138 	i := eatWhiteSpace(s, 0)
    139 	if i == len(s) {
    140 		return c, len(s)
    141 	} else if s[i] != '=' {
    142 		// Occurs due to tag ending '>', and valueless attribute.
    143 		c.state = stateTag
    144 		return c, i
    145 	}
    146 	c.state = stateBeforeValue
    147 	// Consume the "=".
    148 	return c, i + 1
    149 }
    150 
// attrStartStates is keyed by attr and gives the state that tBeforeValue
// enters when the value of an attribute of that kind begins.
var attrStartStates = [...]state{
	attrNone:   stateAttr,
	attrScript: stateJS,
	attrStyle:  stateCSS,
	attrURL:    stateURL,
}
    157 
    158 // tBeforeValue is the context transition function for stateBeforeValue.
    159 func tBeforeValue(c context, s []byte) (context, int) {
    160 	i := eatWhiteSpace(s, 0)
    161 	if i == len(s) {
    162 		return c, len(s)
    163 	}
    164 	// Find the attribute delimiter.
    165 	delim := delimSpaceOrTagEnd
    166 	switch s[i] {
    167 	case '\'':
    168 		delim, i = delimSingleQuote, i+1
    169 	case '"':
    170 		delim, i = delimDoubleQuote, i+1
    171 	}
    172 	c.state, c.delim, c.attr = attrStartStates[c.attr], delim, attrNone
    173 	return c, i
    174 }
    175 
    176 // tHTMLCmt is the context transition function for stateHTMLCmt.
    177 func tHTMLCmt(c context, s []byte) (context, int) {
    178 	if i := bytes.Index(s, commentEnd); i != -1 {
    179 		return context{}, i + 3
    180 	}
    181 	return c, len(s)
    182 }
    183 
// specialTagEndMarkers maps element types to the character sequence that
// case-insensitively signals the end of the special tag body.
// The entries hold only the tag name; tSpecialTagEnd passes them to
// indexTagEnd, which looks for the name after a "</" prefix.
var specialTagEndMarkers = [...][]byte{
	elementScript:   []byte("script"),
	elementStyle:    []byte("style"),
	elementTextarea: []byte("textarea"),
	elementTitle:    []byte("title"),
}
    192 
var (
	// specialTagEndPrefix is the "</" that indexTagEnd searches for
	// before it tries to match a tag name.
	specialTagEndPrefix = []byte("</")
	// tagEndSeparators lists the bytes that must directly follow the tag
	// name for indexTagEnd to accept the match.
	tagEndSeparators    = []byte("> \t\n\f/")
)
    197 
    198 // tSpecialTagEnd is the context transition function for raw text and RCDATA
    199 // element states.
    200 func tSpecialTagEnd(c context, s []byte) (context, int) {
    201 	if c.element != elementNone {
    202 		if i := indexTagEnd(s, specialTagEndMarkers[c.element]); i != -1 {
    203 			return context{}, i
    204 		}
    205 	}
    206 	return c, len(s)
    207 }
    208 
    209 // indexTagEnd finds the index of a special tag end in a case insensitive way, or returns -1
    210 func indexTagEnd(s []byte, tag []byte) int {
    211 	res := 0
    212 	plen := len(specialTagEndPrefix)
    213 	for len(s) > 0 {
    214 		// Try to find the tag end prefix first
    215 		i := bytes.Index(s, specialTagEndPrefix)
    216 		if i == -1 {
    217 			return i
    218 		}
    219 		s = s[i+plen:]
    220 		// Try to match the actual tag if there is still space for it
    221 		if len(tag) <= len(s) && bytes.EqualFold(tag, s[:len(tag)]) {
    222 			s = s[len(tag):]
    223 			// Check the tag is followed by a proper separator
    224 			if len(s) > 0 && bytes.IndexByte(tagEndSeparators, s[0]) != -1 {
    225 				return res + i
    226 			}
    227 			res += len(tag)
    228 		}
    229 		res += i + plen
    230 	}
    231 	return -1
    232 }
    233 
// tAttr is the context transition function for the attribute state.
// It consumes all of s and returns c unchanged.
func tAttr(c context, s []byte) (context, int) {
	return c, len(s)
}
    238 
    239 // tURL is the context transition function for the URL state.
    240 func tURL(c context, s []byte) (context, int) {
    241 	if bytes.IndexAny(s, "#?") >= 0 {
    242 		c.urlPart = urlPartQueryOrFrag
    243 	} else if len(s) != eatWhiteSpace(s, 0) && c.urlPart == urlPartNone {
    244 		// HTML5 uses "Valid URL potentially surrounded by spaces" for
    245 		// attrs: http://www.w3.org/TR/html5/index.html#attributes-1
    246 		c.urlPart = urlPartPreQuery
    247 	}
    248 	return c, len(s)
    249 }
    250 
    251 // tJS is the context transition function for the JS state.
    252 func tJS(c context, s []byte) (context, int) {
    253 	i := bytes.IndexAny(s, `"'/`)
    254 	if i == -1 {
    255 		// Entire input is non string, comment, regexp tokens.
    256 		c.jsCtx = nextJSCtx(s, c.jsCtx)
    257 		return c, len(s)
    258 	}
    259 	c.jsCtx = nextJSCtx(s[:i], c.jsCtx)
    260 	switch s[i] {
    261 	case '"':
    262 		c.state, c.jsCtx = stateJSDqStr, jsCtxRegexp
    263 	case '\'':
    264 		c.state, c.jsCtx = stateJSSqStr, jsCtxRegexp
    265 	case '/':
    266 		switch {
    267 		case i+1 < len(s) && s[i+1] == '/':
    268 			c.state, i = stateJSLineCmt, i+1
    269 		case i+1 < len(s) && s[i+1] == '*':
    270 			c.state, i = stateJSBlockCmt, i+1
    271 		case c.jsCtx == jsCtxRegexp:
    272 			c.state = stateJSRegexp
    273 		case c.jsCtx == jsCtxDivOp:
    274 			c.jsCtx = jsCtxRegexp
    275 		default:
    276 			return context{
    277 				state: stateError,
    278 				err:   errorf(ErrSlashAmbig, nil, 0, "'/' could start a division or regexp: %.32q", s[i:]),
    279 			}, len(s)
    280 		}
    281 	default:
    282 		panic("unreachable")
    283 	}
    284 	return c, i + 1
    285 }
    286 
    287 // tJSDelimited is the context transition function for the JS string and regexp
    288 // states.
    289 func tJSDelimited(c context, s []byte) (context, int) {
    290 	specials := `\"`
    291 	switch c.state {
    292 	case stateJSSqStr:
    293 		specials = `\'`
    294 	case stateJSRegexp:
    295 		specials = `\/[]`
    296 	}
    297 
    298 	k, inCharset := 0, false
    299 	for {
    300 		i := k + bytes.IndexAny(s[k:], specials)
    301 		if i < k {
    302 			break
    303 		}
    304 		switch s[i] {
    305 		case '\\':
    306 			i++
    307 			if i == len(s) {
    308 				return context{
    309 					state: stateError,
    310 					err:   errorf(ErrPartialEscape, nil, 0, "unfinished escape sequence in JS string: %q", s),
    311 				}, len(s)
    312 			}
    313 		case '[':
    314 			inCharset = true
    315 		case ']':
    316 			inCharset = false
    317 		default:
    318 			// end delimiter
    319 			if !inCharset {
    320 				c.state, c.jsCtx = stateJS, jsCtxDivOp
    321 				return c, i + 1
    322 			}
    323 		}
    324 		k = i + 1
    325 	}
    326 
    327 	if inCharset {
    328 		// This can be fixed by making context richer if interpolation
    329 		// into charsets is desired.
    330 		return context{
    331 			state: stateError,
    332 			err:   errorf(ErrPartialCharset, nil, 0, "unfinished JS regexp charset: %q", s),
    333 		}, len(s)
    334 	}
    335 
    336 	return c, len(s)
    337 }
    338 
// blockCommentEnd is the byte sequence that tBlockCmt searches for to find
// the end of a /*comment*/.
var blockCommentEnd = []byte("*/")
    340 
    341 // tBlockCmt is the context transition function for /*comment*/ states.
    342 func tBlockCmt(c context, s []byte) (context, int) {
    343 	i := bytes.Index(s, blockCommentEnd)
    344 	if i == -1 {
    345 		return c, len(s)
    346 	}
    347 	switch c.state {
    348 	case stateJSBlockCmt:
    349 		c.state = stateJS
    350 	case stateCSSBlockCmt:
    351 		c.state = stateCSS
    352 	default:
    353 		panic(c.state.String())
    354 	}
    355 	return c, i + 2
    356 }
    357 
    358 // tLineCmt is the context transition function for //comment states.
    359 func tLineCmt(c context, s []byte) (context, int) {
    360 	var lineTerminators string
    361 	var endState state
    362 	switch c.state {
    363 	case stateJSLineCmt:
    364 		lineTerminators, endState = "\n\r\u2028\u2029", stateJS
    365 	case stateCSSLineCmt:
    366 		lineTerminators, endState = "\n\f\r", stateCSS
    367 		// Line comments are not part of any published CSS standard but
    368 		// are supported by the 4 major browsers.
    369 		// This defines line comments as
    370 		//     LINECOMMENT ::= "//" [^\n\f\d]*
    371 		// since http://www.w3.org/TR/css3-syntax/#SUBTOK-nl defines
    372 		// newlines:
    373 		//     nl ::= #xA | #xD #xA | #xD | #xC
    374 	default:
    375 		panic(c.state.String())
    376 	}
    377 
    378 	i := bytes.IndexAny(s, lineTerminators)
    379 	if i == -1 {
    380 		return c, len(s)
    381 	}
    382 	c.state = endState
    383 	// Per section 7.4 of EcmaScript 5 : http://es5.github.com/#x7.4
    384 	// "However, the LineTerminator at the end of the line is not
    385 	// considered to be part of the single-line comment; it is
    386 	// recognized separately by the lexical grammar and becomes part
    387 	// of the stream of input elements for the syntactic grammar."
    388 	return c, i
    389 }
    390 
    391 // tCSS is the context transition function for the CSS state.
    392 func tCSS(c context, s []byte) (context, int) {
    393 	// CSS quoted strings are almost never used except for:
    394 	// (1) URLs as in background: "/foo.png"
    395 	// (2) Multiword font-names as in font-family: "Times New Roman"
    396 	// (3) List separators in content values as in inline-lists:
    397 	//    <style>
    398 	//    ul.inlineList { list-style: none; padding:0 }
    399 	//    ul.inlineList > li { display: inline }
    400 	//    ul.inlineList > li:before { content: ", " }
    401 	//    ul.inlineList > li:first-child:before { content: "" }
    402 	//    </style>
    403 	//    <ul class=inlineList><li>One<li>Two<li>Three</ul>
    404 	// (4) Attribute value selectors as in a[href="http://example.com/"]
    405 	//
    406 	// We conservatively treat all strings as URLs, but make some
    407 	// allowances to avoid confusion.
    408 	//
    409 	// In (1), our conservative assumption is justified.
    410 	// In (2), valid font names do not contain ':', '?', or '#', so our
    411 	// conservative assumption is fine since we will never transition past
    412 	// urlPartPreQuery.
    413 	// In (3), our protocol heuristic should not be tripped, and there
    414 	// should not be non-space content after a '?' or '#', so as long as
    415 	// we only %-encode RFC 3986 reserved characters we are ok.
    416 	// In (4), we should URL escape for URL attributes, and for others we
    417 	// have the attribute name available if our conservative assumption
    418 	// proves problematic for real code.
    419 
    420 	k := 0
    421 	for {
    422 		i := k + bytes.IndexAny(s[k:], `("'/`)
    423 		if i < k {
    424 			return c, len(s)
    425 		}
    426 		switch s[i] {
    427 		case '(':
    428 			// Look for url to the left.
    429 			p := bytes.TrimRight(s[:i], "\t\n\f\r ")
    430 			if endsWithCSSKeyword(p, "url") {
    431 				j := len(s) - len(bytes.TrimLeft(s[i+1:], "\t\n\f\r "))
    432 				switch {
    433 				case j != len(s) && s[j] == '"':
    434 					c.state, j = stateCSSDqURL, j+1
    435 				case j != len(s) && s[j] == '\'':
    436 					c.state, j = stateCSSSqURL, j+1
    437 				default:
    438 					c.state = stateCSSURL
    439 				}
    440 				return c, j
    441 			}
    442 		case '/':
    443 			if i+1 < len(s) {
    444 				switch s[i+1] {
    445 				case '/':
    446 					c.state = stateCSSLineCmt
    447 					return c, i + 2
    448 				case '*':
    449 					c.state = stateCSSBlockCmt
    450 					return c, i + 2
    451 				}
    452 			}
    453 		case '"':
    454 			c.state = stateCSSDqStr
    455 			return c, i + 1
    456 		case '\'':
    457 			c.state = stateCSSSqStr
    458 			return c, i + 1
    459 		}
    460 		k = i + 1
    461 	}
    462 }
    463 
    464 // tCSSStr is the context transition function for the CSS string and URL states.
    465 func tCSSStr(c context, s []byte) (context, int) {
    466 	var endAndEsc string
    467 	switch c.state {
    468 	case stateCSSDqStr, stateCSSDqURL:
    469 		endAndEsc = `\"`
    470 	case stateCSSSqStr, stateCSSSqURL:
    471 		endAndEsc = `\'`
    472 	case stateCSSURL:
    473 		// Unquoted URLs end with a newline or close parenthesis.
    474 		// The below includes the wc (whitespace character) and nl.
    475 		endAndEsc = "\\\t\n\f\r )"
    476 	default:
    477 		panic(c.state.String())
    478 	}
    479 
    480 	k := 0
    481 	for {
    482 		i := k + bytes.IndexAny(s[k:], endAndEsc)
    483 		if i < k {
    484 			c, nread := tURL(c, decodeCSS(s[k:]))
    485 			return c, k + nread
    486 		}
    487 		if s[i] == '\\' {
    488 			i++
    489 			if i == len(s) {
    490 				return context{
    491 					state: stateError,
    492 					err:   errorf(ErrPartialEscape, nil, 0, "unfinished escape sequence in CSS string: %q", s),
    493 				}, len(s)
    494 			}
    495 		} else {
    496 			c.state = stateCSS
    497 			return c, i + 1
    498 		}
    499 		c, _ = tURL(c, decodeCSS(s[:i+1]))
    500 		k = i + 1
    501 	}
    502 }
    503 
// tError is the context transition function for the error state.
// It consumes all of s and returns c unchanged.
func tError(c context, s []byte) (context, int) {
	return c, len(s)
}
    508 
    509 // eatAttrName returns the largest j such that s[i:j] is an attribute name.
    510 // It returns an error if s[i:] does not look like it begins with an
    511 // attribute name, such as encountering a quote mark without a preceding
    512 // equals sign.
    513 func eatAttrName(s []byte, i int) (int, *Error) {
    514 	for j := i; j < len(s); j++ {
    515 		switch s[j] {
    516 		case ' ', '\t', '\n', '\f', '\r', '=', '>':
    517 			return j, nil
    518 		case '\'', '"', '<':
    519 			// These result in a parse warning in HTML5 and are
    520 			// indicative of serious problems if seen in an attr
    521 			// name in a template.
    522 			return -1, errorf(ErrBadHTML, nil, 0, "%q in attribute name: %.32q", s[j:j+1], s)
    523 		default:
    524 			// No-op.
    525 		}
    526 	}
    527 	return len(s), nil
    528 }
    529 
// elementNameMap maps the lower-case names of the specially handled tags
// to their element values. eatTagName lower-cases a tag name before looking
// it up here; names that are not listed yield the zero element value.
var elementNameMap = map[string]element{
	"script":   elementScript,
	"style":    elementStyle,
	"textarea": elementTextarea,
	"title":    elementTitle,
}
    536 
    537 // asciiAlpha reports whether c is an ASCII letter.
    538 func asciiAlpha(c byte) bool {
    539 	return 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'
    540 }
    541 
    542 // asciiAlphaNum reports whether c is an ASCII letter or digit.
    543 func asciiAlphaNum(c byte) bool {
    544 	return asciiAlpha(c) || '0' <= c && c <= '9'
    545 }
    546 
    547 // eatTagName returns the largest j such that s[i:j] is a tag name and the tag type.
    548 func eatTagName(s []byte, i int) (int, element) {
    549 	if i == len(s) || !asciiAlpha(s[i]) {
    550 		return i, elementNone
    551 	}
    552 	j := i + 1
    553 	for j < len(s) {
    554 		x := s[j]
    555 		if asciiAlphaNum(x) {
    556 			j++
    557 			continue
    558 		}
    559 		// Allow "x-y" or "x:y" but not "x-", "-y", or "x--y".
    560 		if (x == ':' || x == '-') && j+1 < len(s) && asciiAlphaNum(s[j+1]) {
    561 			j += 2
    562 			continue
    563 		}
    564 		break
    565 	}
    566 	return j, elementNameMap[strings.ToLower(string(s[i:j]))]
    567 }
    568 
    569 // eatWhiteSpace returns the largest j such that s[i:j] is white space.
    570 func eatWhiteSpace(s []byte, i int) int {
    571 	for j := i; j < len(s); j++ {
    572 		switch s[j] {
    573 		case ' ', '\t', '\n', '\f', '\r':
    574 			// No-op.
    575 		default:
    576 			return j
    577 		}
    578 	}
    579 	return len(s)
    580 }
    581