Home | History | Annotate | Download | only in parse
      1 // Copyright 2011 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 package parse
      6 
      7 import (
      8 	"fmt"
      9 	"strings"
     10 	"unicode"
     11 	"unicode/utf8"
     12 )
     13 
     14 // item represents a token or text string returned from the scanner.
     15 type item struct {
     16 	typ  itemType // The type of this item.
     17 	pos  Pos      // The starting position, in bytes, of this item in the input string.
     18 	val  string   // The value of this item.
     19 	line int      // The line number at the start of this item.
     20 }
     21 
     22 func (i item) String() string {
     23 	switch {
     24 	case i.typ == itemEOF:
     25 		return "EOF"
     26 	case i.typ == itemError:
     27 		return i.val
     28 	case i.typ > itemKeyword:
     29 		return fmt.Sprintf("<%s>", i.val)
     30 	case len(i.val) > 10:
     31 		return fmt.Sprintf("%.10q...", i.val)
     32 	}
     33 	return fmt.Sprintf("%q", i.val)
     34 }
     35 
     36 // itemType identifies the type of lex items.
     37 type itemType int
     38 
     39 const (
     40 	itemError        itemType = iota // error occurred; value is text of error
     41 	itemBool                         // boolean constant
     42 	itemChar                         // printable ASCII character; grab bag for comma etc.
     43 	itemCharConstant                 // character constant
     44 	itemComplex                      // complex constant (1+2i); imaginary is just a number
     45 	itemColonEquals                  // colon-equals (':=') introducing a declaration
     46 	itemEOF
     47 	itemField      // alphanumeric identifier starting with '.'
     48 	itemIdentifier // alphanumeric identifier not starting with '.'
     49 	itemLeftDelim  // left action delimiter
     50 	itemLeftParen  // '(' inside action
     51 	itemNumber     // simple number, including imaginary
     52 	itemPipe       // pipe symbol
     53 	itemRawString  // raw quoted string (includes quotes)
     54 	itemRightDelim // right action delimiter
     55 	itemRightParen // ')' inside action
     56 	itemSpace      // run of spaces separating arguments
     57 	itemString     // quoted string (includes quotes)
     58 	itemText       // plain text
     59 	itemVariable   // variable starting with '$', such as '$' or  '$1' or '$hello'
     60 	// Keywords appear after all the rest.
     61 	itemKeyword  // used only to delimit the keywords
     62 	itemBlock    // block keyword
     63 	itemDot      // the cursor, spelled '.'
     64 	itemDefine   // define keyword
     65 	itemElse     // else keyword
     66 	itemEnd      // end keyword
     67 	itemIf       // if keyword
     68 	itemNil      // the untyped nil constant, easiest to treat as a keyword
     69 	itemRange    // range keyword
     70 	itemTemplate // template keyword
     71 	itemWith     // with keyword
     72 )
     73 
     74 var key = map[string]itemType{
     75 	".":        itemDot,
     76 	"block":    itemBlock,
     77 	"define":   itemDefine,
     78 	"else":     itemElse,
     79 	"end":      itemEnd,
     80 	"if":       itemIf,
     81 	"range":    itemRange,
     82 	"nil":      itemNil,
     83 	"template": itemTemplate,
     84 	"with":     itemWith,
     85 }
     86 
     87 const eof = -1
     88 
     89 // Trimming spaces.
     90 // If the action begins "{{- " rather than "{{", then all space/tab/newlines
     91 // preceding the action are trimmed; conversely if it ends " -}}" the
     92 // leading spaces are trimmed. This is done entirely in the lexer; the
     93 // parser never sees it happen. We require an ASCII space to be
     94 // present to avoid ambiguity with things like "{{-3}}". It reads
     95 // better with the space present anyway. For simplicity, only ASCII
     96 // space does the job.
     97 const (
     98 	spaceChars      = " \t\r\n" // These are the space characters defined by Go itself.
     99 	leftTrimMarker  = "- "      // Attached to left delimiter, trims trailing spaces from preceding text.
    100 	rightTrimMarker = " -"      // Attached to right delimiter, trims leading spaces from following text.
    101 	trimMarkerLen   = Pos(len(leftTrimMarker))
    102 )
    103 
    104 // stateFn represents the state of the scanner as a function that returns the next state.
    105 type stateFn func(*lexer) stateFn
    106 
    107 // lexer holds the state of the scanner.
    108 type lexer struct {
    109 	name       string    // the name of the input; used only for error reports
    110 	input      string    // the string being scanned
    111 	leftDelim  string    // start of action
    112 	rightDelim string    // end of action
    113 	pos        Pos       // current position in the input
    114 	start      Pos       // start position of this item
    115 	width      Pos       // width of last rune read from input
    116 	items      chan item // channel of scanned items
    117 	parenDepth int       // nesting depth of ( ) exprs
    118 	line       int       // 1+number of newlines seen
    119 }
    120 
    121 // next returns the next rune in the input.
    122 func (l *lexer) next() rune {
    123 	if int(l.pos) >= len(l.input) {
    124 		l.width = 0
    125 		return eof
    126 	}
    127 	r, w := utf8.DecodeRuneInString(l.input[l.pos:])
    128 	l.width = Pos(w)
    129 	l.pos += l.width
    130 	if r == '\n' {
    131 		l.line++
    132 	}
    133 	return r
    134 }
    135 
    136 // peek returns but does not consume the next rune in the input.
    137 func (l *lexer) peek() rune {
    138 	r := l.next()
    139 	l.backup()
    140 	return r
    141 }
    142 
    143 // backup steps back one rune. Can only be called once per call of next.
    144 func (l *lexer) backup() {
    145 	l.pos -= l.width
    146 	// Correct newline count.
    147 	if l.width == 1 && l.input[l.pos] == '\n' {
    148 		l.line--
    149 	}
    150 }
    151 
    152 // emit passes an item back to the client.
    153 func (l *lexer) emit(t itemType) {
    154 	l.items <- item{t, l.start, l.input[l.start:l.pos], l.line}
    155 	// Some items contain text internally. If so, count their newlines.
    156 	switch t {
    157 	case itemText, itemRawString, itemLeftDelim, itemRightDelim:
    158 		l.line += strings.Count(l.input[l.start:l.pos], "\n")
    159 	}
    160 	l.start = l.pos
    161 }
    162 
    163 // ignore skips over the pending input before this point.
    164 func (l *lexer) ignore() {
    165 	l.line += strings.Count(l.input[l.start:l.pos], "\n")
    166 	l.start = l.pos
    167 }
    168 
    169 // accept consumes the next rune if it's from the valid set.
    170 func (l *lexer) accept(valid string) bool {
    171 	if strings.ContainsRune(valid, l.next()) {
    172 		return true
    173 	}
    174 	l.backup()
    175 	return false
    176 }
    177 
    178 // acceptRun consumes a run of runes from the valid set.
    179 func (l *lexer) acceptRun(valid string) {
    180 	for strings.ContainsRune(valid, l.next()) {
    181 	}
    182 	l.backup()
    183 }
    184 
    185 // errorf returns an error token and terminates the scan by passing
    186 // back a nil pointer that will be the next state, terminating l.nextItem.
    187 func (l *lexer) errorf(format string, args ...interface{}) stateFn {
    188 	l.items <- item{itemError, l.start, fmt.Sprintf(format, args...), l.line}
    189 	return nil
    190 }
    191 
    192 // nextItem returns the next item from the input.
    193 // Called by the parser, not in the lexing goroutine.
    194 func (l *lexer) nextItem() item {
    195 	return <-l.items
    196 }
    197 
    198 // drain drains the output so the lexing goroutine will exit.
    199 // Called by the parser, not in the lexing goroutine.
    200 func (l *lexer) drain() {
    201 	for range l.items {
    202 	}
    203 }
    204 
    205 // lex creates a new scanner for the input string.
    206 func lex(name, input, left, right string) *lexer {
    207 	if left == "" {
    208 		left = leftDelim
    209 	}
    210 	if right == "" {
    211 		right = rightDelim
    212 	}
    213 	l := &lexer{
    214 		name:       name,
    215 		input:      input,
    216 		leftDelim:  left,
    217 		rightDelim: right,
    218 		items:      make(chan item),
    219 		line:       1,
    220 	}
    221 	go l.run()
    222 	return l
    223 }
    224 
    225 // run runs the state machine for the lexer.
    226 func (l *lexer) run() {
    227 	for state := lexText; state != nil; {
    228 		state = state(l)
    229 	}
    230 	close(l.items)
    231 }
    232 
    233 // state functions
    234 
    235 const (
    236 	leftDelim    = "{{"
    237 	rightDelim   = "}}"
    238 	leftComment  = "/*"
    239 	rightComment = "*/"
    240 )
    241 
    242 // lexText scans until an opening action delimiter, "{{".
    243 func lexText(l *lexer) stateFn {
    244 	l.width = 0
    245 	if x := strings.Index(l.input[l.pos:], l.leftDelim); x >= 0 {
    246 		ldn := Pos(len(l.leftDelim))
    247 		l.pos += Pos(x)
    248 		trimLength := Pos(0)
    249 		if strings.HasPrefix(l.input[l.pos+ldn:], leftTrimMarker) {
    250 			trimLength = rightTrimLength(l.input[l.start:l.pos])
    251 		}
    252 		l.pos -= trimLength
    253 		if l.pos > l.start {
    254 			l.emit(itemText)
    255 		}
    256 		l.pos += trimLength
    257 		l.ignore()
    258 		return lexLeftDelim
    259 	} else {
    260 		l.pos = Pos(len(l.input))
    261 	}
    262 	// Correctly reached EOF.
    263 	if l.pos > l.start {
    264 		l.emit(itemText)
    265 	}
    266 	l.emit(itemEOF)
    267 	return nil
    268 }
    269 
    270 // rightTrimLength returns the length of the spaces at the end of the string.
    271 func rightTrimLength(s string) Pos {
    272 	return Pos(len(s) - len(strings.TrimRight(s, spaceChars)))
    273 }
    274 
    275 // atRightDelim reports whether the lexer is at a right delimiter, possibly preceded by a trim marker.
    276 func (l *lexer) atRightDelim() (delim, trimSpaces bool) {
    277 	if strings.HasPrefix(l.input[l.pos:], l.rightDelim) {
    278 		return true, false
    279 	}
    280 	// The right delim might have the marker before.
    281 	if strings.HasPrefix(l.input[l.pos:], rightTrimMarker) &&
    282 		strings.HasPrefix(l.input[l.pos+trimMarkerLen:], l.rightDelim) {
    283 		return true, true
    284 	}
    285 	return false, false
    286 }
    287 
    288 // leftTrimLength returns the length of the spaces at the beginning of the string.
    289 func leftTrimLength(s string) Pos {
    290 	return Pos(len(s) - len(strings.TrimLeft(s, spaceChars)))
    291 }
    292 
    293 // lexLeftDelim scans the left delimiter, which is known to be present, possibly with a trim marker.
    294 func lexLeftDelim(l *lexer) stateFn {
    295 	l.pos += Pos(len(l.leftDelim))
    296 	trimSpace := strings.HasPrefix(l.input[l.pos:], leftTrimMarker)
    297 	afterMarker := Pos(0)
    298 	if trimSpace {
    299 		afterMarker = trimMarkerLen
    300 	}
    301 	if strings.HasPrefix(l.input[l.pos+afterMarker:], leftComment) {
    302 		l.pos += afterMarker
    303 		l.ignore()
    304 		return lexComment
    305 	}
    306 	l.emit(itemLeftDelim)
    307 	l.pos += afterMarker
    308 	l.ignore()
    309 	l.parenDepth = 0
    310 	return lexInsideAction
    311 }
    312 
    313 // lexComment scans a comment. The left comment marker is known to be present.
    314 func lexComment(l *lexer) stateFn {
    315 	l.pos += Pos(len(leftComment))
    316 	i := strings.Index(l.input[l.pos:], rightComment)
    317 	if i < 0 {
    318 		return l.errorf("unclosed comment")
    319 	}
    320 	l.pos += Pos(i + len(rightComment))
    321 	delim, trimSpace := l.atRightDelim()
    322 	if !delim {
    323 		return l.errorf("comment ends before closing delimiter")
    324 	}
    325 	if trimSpace {
    326 		l.pos += trimMarkerLen
    327 	}
    328 	l.pos += Pos(len(l.rightDelim))
    329 	if trimSpace {
    330 		l.pos += leftTrimLength(l.input[l.pos:])
    331 	}
    332 	l.ignore()
    333 	return lexText
    334 }
    335 
    336 // lexRightDelim scans the right delimiter, which is known to be present, possibly with a trim marker.
    337 func lexRightDelim(l *lexer) stateFn {
    338 	trimSpace := strings.HasPrefix(l.input[l.pos:], rightTrimMarker)
    339 	if trimSpace {
    340 		l.pos += trimMarkerLen
    341 		l.ignore()
    342 	}
    343 	l.pos += Pos(len(l.rightDelim))
    344 	l.emit(itemRightDelim)
    345 	if trimSpace {
    346 		l.pos += leftTrimLength(l.input[l.pos:])
    347 		l.ignore()
    348 	}
    349 	return lexText
    350 }
    351 
    352 // lexInsideAction scans the elements inside action delimiters.
    353 func lexInsideAction(l *lexer) stateFn {
    354 	// Either number, quoted string, or identifier.
    355 	// Spaces separate arguments; runs of spaces turn into itemSpace.
    356 	// Pipe symbols separate and are emitted.
    357 	delim, _ := l.atRightDelim()
    358 	if delim {
    359 		if l.parenDepth == 0 {
    360 			return lexRightDelim
    361 		}
    362 		return l.errorf("unclosed left paren")
    363 	}
    364 	switch r := l.next(); {
    365 	case r == eof || isEndOfLine(r):
    366 		return l.errorf("unclosed action")
    367 	case isSpace(r):
    368 		return lexSpace
    369 	case r == ':':
    370 		if l.next() != '=' {
    371 			return l.errorf("expected :=")
    372 		}
    373 		l.emit(itemColonEquals)
    374 	case r == '|':
    375 		l.emit(itemPipe)
    376 	case r == '"':
    377 		return lexQuote
    378 	case r == '`':
    379 		return lexRawQuote
    380 	case r == '$':
    381 		return lexVariable
    382 	case r == '\'':
    383 		return lexChar
    384 	case r == '.':
    385 		// special look-ahead for ".field" so we don't break l.backup().
    386 		if l.pos < Pos(len(l.input)) {
    387 			r := l.input[l.pos]
    388 			if r < '0' || '9' < r {
    389 				return lexField
    390 			}
    391 		}
    392 		fallthrough // '.' can start a number.
    393 	case r == '+' || r == '-' || ('0' <= r && r <= '9'):
    394 		l.backup()
    395 		return lexNumber
    396 	case isAlphaNumeric(r):
    397 		l.backup()
    398 		return lexIdentifier
    399 	case r == '(':
    400 		l.emit(itemLeftParen)
    401 		l.parenDepth++
    402 	case r == ')':
    403 		l.emit(itemRightParen)
    404 		l.parenDepth--
    405 		if l.parenDepth < 0 {
    406 			return l.errorf("unexpected right paren %#U", r)
    407 		}
    408 	case r <= unicode.MaxASCII && unicode.IsPrint(r):
    409 		l.emit(itemChar)
    410 		return lexInsideAction
    411 	default:
    412 		return l.errorf("unrecognized character in action: %#U", r)
    413 	}
    414 	return lexInsideAction
    415 }
    416 
    417 // lexSpace scans a run of space characters.
    418 // One space has already been seen.
    419 func lexSpace(l *lexer) stateFn {
    420 	for isSpace(l.peek()) {
    421 		l.next()
    422 	}
    423 	l.emit(itemSpace)
    424 	return lexInsideAction
    425 }
    426 
    427 // lexIdentifier scans an alphanumeric.
    428 func lexIdentifier(l *lexer) stateFn {
    429 Loop:
    430 	for {
    431 		switch r := l.next(); {
    432 		case isAlphaNumeric(r):
    433 			// absorb.
    434 		default:
    435 			l.backup()
    436 			word := l.input[l.start:l.pos]
    437 			if !l.atTerminator() {
    438 				return l.errorf("bad character %#U", r)
    439 			}
    440 			switch {
    441 			case key[word] > itemKeyword:
    442 				l.emit(key[word])
    443 			case word[0] == '.':
    444 				l.emit(itemField)
    445 			case word == "true", word == "false":
    446 				l.emit(itemBool)
    447 			default:
    448 				l.emit(itemIdentifier)
    449 			}
    450 			break Loop
    451 		}
    452 	}
    453 	return lexInsideAction
    454 }
    455 
    456 // lexField scans a field: .Alphanumeric.
    457 // The . has been scanned.
    458 func lexField(l *lexer) stateFn {
    459 	return lexFieldOrVariable(l, itemField)
    460 }
    461 
    462 // lexVariable scans a Variable: $Alphanumeric.
    463 // The $ has been scanned.
    464 func lexVariable(l *lexer) stateFn {
    465 	if l.atTerminator() { // Nothing interesting follows -> "$".
    466 		l.emit(itemVariable)
    467 		return lexInsideAction
    468 	}
    469 	return lexFieldOrVariable(l, itemVariable)
    470 }
    471 
    472 // lexVariable scans a field or variable: [.$]Alphanumeric.
    473 // The . or $ has been scanned.
    474 func lexFieldOrVariable(l *lexer, typ itemType) stateFn {
    475 	if l.atTerminator() { // Nothing interesting follows -> "." or "$".
    476 		if typ == itemVariable {
    477 			l.emit(itemVariable)
    478 		} else {
    479 			l.emit(itemDot)
    480 		}
    481 		return lexInsideAction
    482 	}
    483 	var r rune
    484 	for {
    485 		r = l.next()
    486 		if !isAlphaNumeric(r) {
    487 			l.backup()
    488 			break
    489 		}
    490 	}
    491 	if !l.atTerminator() {
    492 		return l.errorf("bad character %#U", r)
    493 	}
    494 	l.emit(typ)
    495 	return lexInsideAction
    496 }
    497 
    498 // atTerminator reports whether the input is at valid termination character to
    499 // appear after an identifier. Breaks .X.Y into two pieces. Also catches cases
    500 // like "$x+2" not being acceptable without a space, in case we decide one
    501 // day to implement arithmetic.
    502 func (l *lexer) atTerminator() bool {
    503 	r := l.peek()
    504 	if isSpace(r) || isEndOfLine(r) {
    505 		return true
    506 	}
    507 	switch r {
    508 	case eof, '.', ',', '|', ':', ')', '(':
    509 		return true
    510 	}
    511 	// Does r start the delimiter? This can be ambiguous (with delim=="//", $x/2 will
    512 	// succeed but should fail) but only in extremely rare cases caused by willfully
    513 	// bad choice of delimiter.
    514 	if rd, _ := utf8.DecodeRuneInString(l.rightDelim); rd == r {
    515 		return true
    516 	}
    517 	return false
    518 }
    519 
    520 // lexChar scans a character constant. The initial quote is already
    521 // scanned. Syntax checking is done by the parser.
    522 func lexChar(l *lexer) stateFn {
    523 Loop:
    524 	for {
    525 		switch l.next() {
    526 		case '\\':
    527 			if r := l.next(); r != eof && r != '\n' {
    528 				break
    529 			}
    530 			fallthrough
    531 		case eof, '\n':
    532 			return l.errorf("unterminated character constant")
    533 		case '\'':
    534 			break Loop
    535 		}
    536 	}
    537 	l.emit(itemCharConstant)
    538 	return lexInsideAction
    539 }
    540 
    541 // lexNumber scans a number: decimal, octal, hex, float, or imaginary. This
    542 // isn't a perfect number scanner - for instance it accepts "." and "0x0.2"
    543 // and "089" - but when it's wrong the input is invalid and the parser (via
    544 // strconv) will notice.
    545 func lexNumber(l *lexer) stateFn {
    546 	if !l.scanNumber() {
    547 		return l.errorf("bad number syntax: %q", l.input[l.start:l.pos])
    548 	}
    549 	if sign := l.peek(); sign == '+' || sign == '-' {
    550 		// Complex: 1+2i. No spaces, must end in 'i'.
    551 		if !l.scanNumber() || l.input[l.pos-1] != 'i' {
    552 			return l.errorf("bad number syntax: %q", l.input[l.start:l.pos])
    553 		}
    554 		l.emit(itemComplex)
    555 	} else {
    556 		l.emit(itemNumber)
    557 	}
    558 	return lexInsideAction
    559 }
    560 
    561 func (l *lexer) scanNumber() bool {
    562 	// Optional leading sign.
    563 	l.accept("+-")
    564 	// Is it hex?
    565 	digits := "0123456789"
    566 	if l.accept("0") && l.accept("xX") {
    567 		digits = "0123456789abcdefABCDEF"
    568 	}
    569 	l.acceptRun(digits)
    570 	if l.accept(".") {
    571 		l.acceptRun(digits)
    572 	}
    573 	if l.accept("eE") {
    574 		l.accept("+-")
    575 		l.acceptRun("0123456789")
    576 	}
    577 	// Is it imaginary?
    578 	l.accept("i")
    579 	// Next thing mustn't be alphanumeric.
    580 	if isAlphaNumeric(l.peek()) {
    581 		l.next()
    582 		return false
    583 	}
    584 	return true
    585 }
    586 
    587 // lexQuote scans a quoted string.
    588 func lexQuote(l *lexer) stateFn {
    589 Loop:
    590 	for {
    591 		switch l.next() {
    592 		case '\\':
    593 			if r := l.next(); r != eof && r != '\n' {
    594 				break
    595 			}
    596 			fallthrough
    597 		case eof, '\n':
    598 			return l.errorf("unterminated quoted string")
    599 		case '"':
    600 			break Loop
    601 		}
    602 	}
    603 	l.emit(itemString)
    604 	return lexInsideAction
    605 }
    606 
    607 // lexRawQuote scans a raw quoted string.
    608 func lexRawQuote(l *lexer) stateFn {
    609 	startLine := l.line
    610 Loop:
    611 	for {
    612 		switch l.next() {
    613 		case eof:
    614 			// Restore line number to location of opening quote.
    615 			// We will error out so it's ok just to overwrite the field.
    616 			l.line = startLine
    617 			return l.errorf("unterminated raw quoted string")
    618 		case '`':
    619 			break Loop
    620 		}
    621 	}
    622 	l.emit(itemRawString)
    623 	return lexInsideAction
    624 }
    625 
    626 // isSpace reports whether r is a space character.
    627 func isSpace(r rune) bool {
    628 	return r == ' ' || r == '\t'
    629 }
    630 
    631 // isEndOfLine reports whether r is an end-of-line character.
    632 func isEndOfLine(r rune) bool {
    633 	return r == '\r' || r == '\n'
    634 }
    635 
    636 // isAlphaNumeric reports whether r is an alphabetic, digit, or underscore.
    637 func isAlphaNumeric(r rune) bool {
    638 	return r == '_' || unicode.IsLetter(r) || unicode.IsDigit(r)
    639 }
    640