Home | History | Annotate | Download | only in parse
      1 // Copyright 2011 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 package parse
      6 
      7 import (
      8 	"fmt"
      9 	"strings"
     10 	"unicode"
     11 	"unicode/utf8"
     12 )
     13 
     14 // item represents a token or text string returned from the scanner.
     15 type item struct {
     16 	typ itemType // The type of this item.
     17 	pos Pos      // The starting position, in bytes, of this item in the input string.
     18 	val string   // The value of this item.
     19 }
     20 
     21 func (i item) String() string {
     22 	switch {
     23 	case i.typ == itemEOF:
     24 		return "EOF"
     25 	case i.typ == itemError:
     26 		return i.val
     27 	case i.typ > itemKeyword:
     28 		return fmt.Sprintf("<%s>", i.val)
     29 	case len(i.val) > 10:
     30 		return fmt.Sprintf("%.10q...", i.val)
     31 	}
     32 	return fmt.Sprintf("%q", i.val)
     33 }
     34 
     35 // itemType identifies the type of lex items.
     36 type itemType int
     37 
     38 const (
     39 	itemError        itemType = iota // error occurred; value is text of error
     40 	itemBool                         // boolean constant
     41 	itemChar                         // printable ASCII character; grab bag for comma etc.
     42 	itemCharConstant                 // character constant
     43 	itemComplex                      // complex constant (1+2i); imaginary is just a number
     44 	itemColonEquals                  // colon-equals (':=') introducing a declaration
     45 	itemEOF
     46 	itemField      // alphanumeric identifier starting with '.'
     47 	itemIdentifier // alphanumeric identifier not starting with '.'
     48 	itemLeftDelim  // left action delimiter
     49 	itemLeftParen  // '(' inside action
     50 	itemNumber     // simple number, including imaginary
     51 	itemPipe       // pipe symbol
     52 	itemRawString  // raw quoted string (includes quotes)
     53 	itemRightDelim // right action delimiter
     54 	itemRightParen // ')' inside action
     55 	itemSpace      // run of spaces separating arguments
     56 	itemString     // quoted string (includes quotes)
     57 	itemText       // plain text
     58 	itemVariable   // variable starting with '$', such as '$' or  '$1' or '$hello'
     59 	// Keywords appear after all the rest.
     60 	itemKeyword  // used only to delimit the keywords
     61 	itemDot      // the cursor, spelled '.'
     62 	itemDefine   // define keyword
     63 	itemElse     // else keyword
     64 	itemEnd      // end keyword
     65 	itemIf       // if keyword
     66 	itemNil      // the untyped nil constant, easiest to treat as a keyword
     67 	itemRange    // range keyword
     68 	itemTemplate // template keyword
     69 	itemWith     // with keyword
     70 )
     71 
     72 var key = map[string]itemType{
     73 	".":        itemDot,
     74 	"define":   itemDefine,
     75 	"else":     itemElse,
     76 	"end":      itemEnd,
     77 	"if":       itemIf,
     78 	"range":    itemRange,
     79 	"nil":      itemNil,
     80 	"template": itemTemplate,
     81 	"with":     itemWith,
     82 }
     83 
     84 const eof = -1
     85 
     86 // stateFn represents the state of the scanner as a function that returns the next state.
     87 type stateFn func(*lexer) stateFn
     88 
     89 // lexer holds the state of the scanner.
     90 type lexer struct {
     91 	name       string    // the name of the input; used only for error reports
     92 	input      string    // the string being scanned
     93 	leftDelim  string    // start of action
     94 	rightDelim string    // end of action
     95 	state      stateFn   // the next lexing function to enter
     96 	pos        Pos       // current position in the input
     97 	start      Pos       // start position of this item
     98 	width      Pos       // width of last rune read from input
     99 	lastPos    Pos       // position of most recent item returned by nextItem
    100 	items      chan item // channel of scanned items
    101 	parenDepth int       // nesting depth of ( ) exprs
    102 }
    103 
    104 // next returns the next rune in the input.
    105 func (l *lexer) next() rune {
    106 	if int(l.pos) >= len(l.input) {
    107 		l.width = 0
    108 		return eof
    109 	}
    110 	r, w := utf8.DecodeRuneInString(l.input[l.pos:])
    111 	l.width = Pos(w)
    112 	l.pos += l.width
    113 	return r
    114 }
    115 
    116 // peek returns but does not consume the next rune in the input.
    117 func (l *lexer) peek() rune {
    118 	r := l.next()
    119 	l.backup()
    120 	return r
    121 }
    122 
    123 // backup steps back one rune. Can only be called once per call of next.
    124 func (l *lexer) backup() {
    125 	l.pos -= l.width
    126 }
    127 
    128 // emit passes an item back to the client.
    129 func (l *lexer) emit(t itemType) {
    130 	l.items <- item{t, l.start, l.input[l.start:l.pos]}
    131 	l.start = l.pos
    132 }
    133 
    134 // ignore skips over the pending input before this point.
    135 func (l *lexer) ignore() {
    136 	l.start = l.pos
    137 }
    138 
    139 // accept consumes the next rune if it's from the valid set.
    140 func (l *lexer) accept(valid string) bool {
    141 	if strings.IndexRune(valid, l.next()) >= 0 {
    142 		return true
    143 	}
    144 	l.backup()
    145 	return false
    146 }
    147 
    148 // acceptRun consumes a run of runes from the valid set.
    149 func (l *lexer) acceptRun(valid string) {
    150 	for strings.IndexRune(valid, l.next()) >= 0 {
    151 	}
    152 	l.backup()
    153 }
    154 
    155 // lineNumber reports which line we're on, based on the position of
    156 // the previous item returned by nextItem. Doing it this way
    157 // means we don't have to worry about peek double counting.
    158 func (l *lexer) lineNumber() int {
    159 	return 1 + strings.Count(l.input[:l.lastPos], "\n")
    160 }
    161 
    162 // errorf returns an error token and terminates the scan by passing
    163 // back a nil pointer that will be the next state, terminating l.nextItem.
    164 func (l *lexer) errorf(format string, args ...interface{}) stateFn {
    165 	l.items <- item{itemError, l.start, fmt.Sprintf(format, args...)}
    166 	return nil
    167 }
    168 
    169 // nextItem returns the next item from the input.
    170 // Called by the parser, not in the lexing goroutine.
    171 func (l *lexer) nextItem() item {
    172 	item := <-l.items
    173 	l.lastPos = item.pos
    174 	return item
    175 }
    176 
    177 // drain drains the output so the lexing goroutine will exit.
    178 // Called by the parser, not in the lexing goroutine.
    179 func (l *lexer) drain() {
    180 	for range l.items {
    181 	}
    182 }
    183 
    184 // lex creates a new scanner for the input string.
    185 func lex(name, input, left, right string) *lexer {
    186 	if left == "" {
    187 		left = leftDelim
    188 	}
    189 	if right == "" {
    190 		right = rightDelim
    191 	}
    192 	l := &lexer{
    193 		name:       name,
    194 		input:      input,
    195 		leftDelim:  left,
    196 		rightDelim: right,
    197 		items:      make(chan item),
    198 	}
    199 	go l.run()
    200 	return l
    201 }
    202 
    203 // run runs the state machine for the lexer.
    204 func (l *lexer) run() {
    205 	for l.state = lexText; l.state != nil; {
    206 		l.state = l.state(l)
    207 	}
    208 	close(l.items)
    209 }
    210 
    211 // state functions
    212 
    213 const (
    214 	leftDelim    = "{{"
    215 	rightDelim   = "}}"
    216 	leftComment  = "/*"
    217 	rightComment = "*/"
    218 )
    219 
    220 // lexText scans until an opening action delimiter, "{{".
    221 func lexText(l *lexer) stateFn {
    222 	for {
    223 		if strings.HasPrefix(l.input[l.pos:], l.leftDelim) {
    224 			if l.pos > l.start {
    225 				l.emit(itemText)
    226 			}
    227 			return lexLeftDelim
    228 		}
    229 		if l.next() == eof {
    230 			break
    231 		}
    232 	}
    233 	// Correctly reached EOF.
    234 	if l.pos > l.start {
    235 		l.emit(itemText)
    236 	}
    237 	l.emit(itemEOF)
    238 	return nil
    239 }
    240 
    241 // lexLeftDelim scans the left delimiter, which is known to be present.
    242 func lexLeftDelim(l *lexer) stateFn {
    243 	l.pos += Pos(len(l.leftDelim))
    244 	if strings.HasPrefix(l.input[l.pos:], leftComment) {
    245 		return lexComment
    246 	}
    247 	l.emit(itemLeftDelim)
    248 	l.parenDepth = 0
    249 	return lexInsideAction
    250 }
    251 
    252 // lexComment scans a comment. The left comment marker is known to be present.
    253 func lexComment(l *lexer) stateFn {
    254 	l.pos += Pos(len(leftComment))
    255 	i := strings.Index(l.input[l.pos:], rightComment)
    256 	if i < 0 {
    257 		return l.errorf("unclosed comment")
    258 	}
    259 	l.pos += Pos(i + len(rightComment))
    260 	if !strings.HasPrefix(l.input[l.pos:], l.rightDelim) {
    261 		return l.errorf("comment ends before closing delimiter")
    262 
    263 	}
    264 	l.pos += Pos(len(l.rightDelim))
    265 	l.ignore()
    266 	return lexText
    267 }
    268 
    269 // lexRightDelim scans the right delimiter, which is known to be present.
    270 func lexRightDelim(l *lexer) stateFn {
    271 	l.pos += Pos(len(l.rightDelim))
    272 	l.emit(itemRightDelim)
    273 	return lexText
    274 }
    275 
    276 // lexInsideAction scans the elements inside action delimiters.
    277 func lexInsideAction(l *lexer) stateFn {
    278 	// Either number, quoted string, or identifier.
    279 	// Spaces separate arguments; runs of spaces turn into itemSpace.
    280 	// Pipe symbols separate and are emitted.
    281 	if strings.HasPrefix(l.input[l.pos:], l.rightDelim) {
    282 		if l.parenDepth == 0 {
    283 			return lexRightDelim
    284 		}
    285 		return l.errorf("unclosed left paren")
    286 	}
    287 	switch r := l.next(); {
    288 	case r == eof || isEndOfLine(r):
    289 		return l.errorf("unclosed action")
    290 	case isSpace(r):
    291 		return lexSpace
    292 	case r == ':':
    293 		if l.next() != '=' {
    294 			return l.errorf("expected :=")
    295 		}
    296 		l.emit(itemColonEquals)
    297 	case r == '|':
    298 		l.emit(itemPipe)
    299 	case r == '"':
    300 		return lexQuote
    301 	case r == '`':
    302 		return lexRawQuote
    303 	case r == '$':
    304 		return lexVariable
    305 	case r == '\'':
    306 		return lexChar
    307 	case r == '.':
    308 		// special look-ahead for ".field" so we don't break l.backup().
    309 		if l.pos < Pos(len(l.input)) {
    310 			r := l.input[l.pos]
    311 			if r < '0' || '9' < r {
    312 				return lexField
    313 			}
    314 		}
    315 		fallthrough // '.' can start a number.
    316 	case r == '+' || r == '-' || ('0' <= r && r <= '9'):
    317 		l.backup()
    318 		return lexNumber
    319 	case isAlphaNumeric(r):
    320 		l.backup()
    321 		return lexIdentifier
    322 	case r == '(':
    323 		l.emit(itemLeftParen)
    324 		l.parenDepth++
    325 	case r == ')':
    326 		l.emit(itemRightParen)
    327 		l.parenDepth--
    328 		if l.parenDepth < 0 {
    329 			return l.errorf("unexpected right paren %#U", r)
    330 		}
    331 	case r <= unicode.MaxASCII && unicode.IsPrint(r):
    332 		l.emit(itemChar)
    333 		return lexInsideAction
    334 	default:
    335 		return l.errorf("unrecognized character in action: %#U", r)
    336 	}
    337 	return lexInsideAction
    338 }
    339 
    340 // lexSpace scans a run of space characters.
    341 // One space has already been seen.
    342 func lexSpace(l *lexer) stateFn {
    343 	for isSpace(l.peek()) {
    344 		l.next()
    345 	}
    346 	l.emit(itemSpace)
    347 	return lexInsideAction
    348 }
    349 
    350 // lexIdentifier scans an alphanumeric.
    351 func lexIdentifier(l *lexer) stateFn {
    352 Loop:
    353 	for {
    354 		switch r := l.next(); {
    355 		case isAlphaNumeric(r):
    356 			// absorb.
    357 		default:
    358 			l.backup()
    359 			word := l.input[l.start:l.pos]
    360 			if !l.atTerminator() {
    361 				return l.errorf("bad character %#U", r)
    362 			}
    363 			switch {
    364 			case key[word] > itemKeyword:
    365 				l.emit(key[word])
    366 			case word[0] == '.':
    367 				l.emit(itemField)
    368 			case word == "true", word == "false":
    369 				l.emit(itemBool)
    370 			default:
    371 				l.emit(itemIdentifier)
    372 			}
    373 			break Loop
    374 		}
    375 	}
    376 	return lexInsideAction
    377 }
    378 
    379 // lexField scans a field: .Alphanumeric.
    380 // The . has been scanned.
    381 func lexField(l *lexer) stateFn {
    382 	return lexFieldOrVariable(l, itemField)
    383 }
    384 
    385 // lexVariable scans a Variable: $Alphanumeric.
    386 // The $ has been scanned.
    387 func lexVariable(l *lexer) stateFn {
    388 	if l.atTerminator() { // Nothing interesting follows -> "$".
    389 		l.emit(itemVariable)
    390 		return lexInsideAction
    391 	}
    392 	return lexFieldOrVariable(l, itemVariable)
    393 }
    394 
    395 // lexVariable scans a field or variable: [.$]Alphanumeric.
    396 // The . or $ has been scanned.
    397 func lexFieldOrVariable(l *lexer, typ itemType) stateFn {
    398 	if l.atTerminator() { // Nothing interesting follows -> "." or "$".
    399 		if typ == itemVariable {
    400 			l.emit(itemVariable)
    401 		} else {
    402 			l.emit(itemDot)
    403 		}
    404 		return lexInsideAction
    405 	}
    406 	var r rune
    407 	for {
    408 		r = l.next()
    409 		if !isAlphaNumeric(r) {
    410 			l.backup()
    411 			break
    412 		}
    413 	}
    414 	if !l.atTerminator() {
    415 		return l.errorf("bad character %#U", r)
    416 	}
    417 	l.emit(typ)
    418 	return lexInsideAction
    419 }
    420 
    421 // atTerminator reports whether the input is at valid termination character to
    422 // appear after an identifier. Breaks .X.Y into two pieces. Also catches cases
    423 // like "$x+2" not being acceptable without a space, in case we decide one
    424 // day to implement arithmetic.
    425 func (l *lexer) atTerminator() bool {
    426 	r := l.peek()
    427 	if isSpace(r) || isEndOfLine(r) {
    428 		return true
    429 	}
    430 	switch r {
    431 	case eof, '.', ',', '|', ':', ')', '(':
    432 		return true
    433 	}
    434 	// Does r start the delimiter? This can be ambiguous (with delim=="//", $x/2 will
    435 	// succeed but should fail) but only in extremely rare cases caused by willfully
    436 	// bad choice of delimiter.
    437 	if rd, _ := utf8.DecodeRuneInString(l.rightDelim); rd == r {
    438 		return true
    439 	}
    440 	return false
    441 }
    442 
    443 // lexChar scans a character constant. The initial quote is already
    444 // scanned. Syntax checking is done by the parser.
    445 func lexChar(l *lexer) stateFn {
    446 Loop:
    447 	for {
    448 		switch l.next() {
    449 		case '\\':
    450 			if r := l.next(); r != eof && r != '\n' {
    451 				break
    452 			}
    453 			fallthrough
    454 		case eof, '\n':
    455 			return l.errorf("unterminated character constant")
    456 		case '\'':
    457 			break Loop
    458 		}
    459 	}
    460 	l.emit(itemCharConstant)
    461 	return lexInsideAction
    462 }
    463 
    464 // lexNumber scans a number: decimal, octal, hex, float, or imaginary. This
    465 // isn't a perfect number scanner - for instance it accepts "." and "0x0.2"
    466 // and "089" - but when it's wrong the input is invalid and the parser (via
    467 // strconv) will notice.
    468 func lexNumber(l *lexer) stateFn {
    469 	if !l.scanNumber() {
    470 		return l.errorf("bad number syntax: %q", l.input[l.start:l.pos])
    471 	}
    472 	if sign := l.peek(); sign == '+' || sign == '-' {
    473 		// Complex: 1+2i. No spaces, must end in 'i'.
    474 		if !l.scanNumber() || l.input[l.pos-1] != 'i' {
    475 			return l.errorf("bad number syntax: %q", l.input[l.start:l.pos])
    476 		}
    477 		l.emit(itemComplex)
    478 	} else {
    479 		l.emit(itemNumber)
    480 	}
    481 	return lexInsideAction
    482 }
    483 
    484 func (l *lexer) scanNumber() bool {
    485 	// Optional leading sign.
    486 	l.accept("+-")
    487 	// Is it hex?
    488 	digits := "0123456789"
    489 	if l.accept("0") && l.accept("xX") {
    490 		digits = "0123456789abcdefABCDEF"
    491 	}
    492 	l.acceptRun(digits)
    493 	if l.accept(".") {
    494 		l.acceptRun(digits)
    495 	}
    496 	if l.accept("eE") {
    497 		l.accept("+-")
    498 		l.acceptRun("0123456789")
    499 	}
    500 	// Is it imaginary?
    501 	l.accept("i")
    502 	// Next thing mustn't be alphanumeric.
    503 	if isAlphaNumeric(l.peek()) {
    504 		l.next()
    505 		return false
    506 	}
    507 	return true
    508 }
    509 
    510 // lexQuote scans a quoted string.
    511 func lexQuote(l *lexer) stateFn {
    512 Loop:
    513 	for {
    514 		switch l.next() {
    515 		case '\\':
    516 			if r := l.next(); r != eof && r != '\n' {
    517 				break
    518 			}
    519 			fallthrough
    520 		case eof, '\n':
    521 			return l.errorf("unterminated quoted string")
    522 		case '"':
    523 			break Loop
    524 		}
    525 	}
    526 	l.emit(itemString)
    527 	return lexInsideAction
    528 }
    529 
    530 // lexRawQuote scans a raw quoted string.
    531 func lexRawQuote(l *lexer) stateFn {
    532 Loop:
    533 	for {
    534 		switch l.next() {
    535 		case eof:
    536 			return l.errorf("unterminated raw quoted string")
    537 		case '`':
    538 			break Loop
    539 		}
    540 	}
    541 	l.emit(itemRawString)
    542 	return lexInsideAction
    543 }
    544 
    545 // isSpace reports whether r is a space character.
    546 func isSpace(r rune) bool {
    547 	return r == ' ' || r == '\t'
    548 }
    549 
    550 // isEndOfLine reports whether r is an end-of-line character.
    551 func isEndOfLine(r rune) bool {
    552 	return r == '\r' || r == '\n'
    553 }
    554 
    555 // isAlphaNumeric reports whether r is an alphabetic, digit, or underscore.
    556 func isAlphaNumeric(r rune) bool {
    557 	return r == '_' || unicode.IsLetter(r) || unicode.IsDigit(r)
    558 }
    559