Home | History | Annotate | Download | only in scanner
      1 // Copyright 2009 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // Package scanner provides a scanner and tokenizer for UTF-8-encoded text.
      6 // It takes an io.Reader providing the source, which then can be tokenized
      7 // through repeated calls to the Scan function. For compatibility with
      8 // existing tools, the NUL character is not allowed. If the first character
      9 // in the source is a UTF-8 encoded byte order mark (BOM), it is discarded.
     10 //
     11 // By default, a Scanner skips white space and Go comments and recognizes all
     12 // literals as defined by the Go language specification. It may be
     13 // customized to recognize only a subset of those literals and to recognize
     14 // different identifier and white space characters.
     15 package scanner
     16 
     17 import (
     18 	"bytes"
     19 	"fmt"
     20 	"io"
     21 	"os"
     22 	"unicode"
     23 	"unicode/utf8"
     24 )
     25 
     26 // A source position is represented by a Position value.
     27 // A position is valid if Line > 0.
     28 type Position struct {
     29 	Filename string // filename, if any
     30 	Offset   int    // byte offset, starting at 0
     31 	Line     int    // line number, starting at 1
     32 	Column   int    // column number, starting at 1 (character count per line)
     33 }
     34 
     35 // IsValid reports whether the position is valid.
     36 func (pos *Position) IsValid() bool { return pos.Line > 0 }
     37 
     38 func (pos Position) String() string {
     39 	s := pos.Filename
     40 	if s == "" {
     41 		s = "<input>"
     42 	}
     43 	if pos.IsValid() {
     44 		s += fmt.Sprintf(":%d:%d", pos.Line, pos.Column)
     45 	}
     46 	return s
     47 }
     48 
// Predefined mode bits to control recognition of tokens. For instance,
// to configure a Scanner such that it only recognizes (Go) identifiers,
// integers, and skips comments, set the Scanner's Mode field to:
//
//	ScanIdents | ScanInts | SkipComments
//
// With the exception of comments, which are skipped if SkipComments is
// set, unrecognized tokens are not ignored. Instead, the scanner simply
// returns the respective individual characters (or possibly sub-tokens).
// For instance, if the mode is ScanIdents (not ScanStrings), the string
// "foo" is scanned as the token sequence '"' Ident '"'.
//
// Each mode bit is 1 << -t for the (negative) token value t it enables.
// Note that Scan consults SkipComments only if ScanComments is set as well.
const (
	ScanIdents     = 1 << -Ident
	ScanInts       = 1 << -Int
	ScanFloats     = 1 << -Float // includes Ints
	ScanChars      = 1 << -Char
	ScanStrings    = 1 << -String
	ScanRawStrings = 1 << -RawString
	ScanComments   = 1 << -Comment
	SkipComments   = 1 << -skipComment // if set with ScanComments, comments become white space
	GoTokens       = ScanIdents | ScanFloats | ScanChars | ScanStrings | ScanRawStrings | ScanComments | SkipComments
)
     72 
// The result of Scan is one of these tokens or a Unicode character.
// Token values are negative: EOF is -1, Ident is -2, and so on.
const (
	EOF = -(iota + 1)
	Ident
	Int
	Float
	Char
	String
	RawString
	Comment
	skipComment // not a token returned by Scan; used to derive the SkipComments mode bit
)
     85 
// tokenString maps the predefined token values to their printable names.
// It is consulted by TokenString; skipComment has no entry.
var tokenString = map[rune]string{
	EOF:       "EOF",
	Ident:     "Ident",
	Int:       "Int",
	Float:     "Float",
	Char:      "Char",
	String:    "String",
	RawString: "RawString",
	Comment:   "Comment",
}
     96 
     97 // TokenString returns a printable string for a token or Unicode character.
     98 func TokenString(tok rune) string {
     99 	if s, found := tokenString[tok]; found {
    100 		return s
    101 	}
    102 	return fmt.Sprintf("%q", string(tok))
    103 }
    104 
// GoWhitespace is the default value for the Scanner's Whitespace field.
// Its value selects Go's white space characters: tab, newline, carriage
// return, and space (one bit per character value).
const GoWhitespace = 1<<'\t' | 1<<'\n' | 1<<'\r' | 1<<' '

// bufLen is the number of source bytes held in a Scanner's source buffer
// (the buffer itself has one extra byte for a sentinel).
const bufLen = 1024 // at least utf8.UTFMax
    110 
// A Scanner implements reading of Unicode characters and tokens from an io.Reader.
// A Scanner must be set up with Init before use.
type Scanner struct {
	// Input
	src io.Reader

	// Source buffer
	srcBuf [bufLen + 1]byte // +1 for sentinel for common case of s.next()
	srcPos int              // reading position (srcBuf index)
	srcEnd int              // source end (srcBuf index)

	// Source position
	srcBufOffset int // byte offset of srcBuf[0] in source
	line         int // line count
	column       int // character count
	lastLineLen  int // length of last line in characters (for correct column reporting)
	lastCharLen  int // length of last character in bytes

	// Token text buffer
	// Typically, token text is stored completely in srcBuf, but in general
	// the token text's head may be buffered in tokBuf while the token text's
	// tail is stored in srcBuf.
	tokBuf bytes.Buffer // token text head that is not in srcBuf anymore
	tokPos int          // token text tail position (srcBuf index); valid if >= 0
	tokEnd int          // token text tail end (srcBuf index)

	// One character look-ahead
	// (Init sets ch to -2, meaning that no character has been read yet.)
	ch rune // character before current srcPos

	// Error is called for each error encountered. If no Error
	// function is set, the error is reported to os.Stderr.
	Error func(s *Scanner, msg string)

	// ErrorCount is incremented by one for each error encountered.
	ErrorCount int

	// The Mode field controls which tokens are recognized. For instance,
	// to recognize Ints, set the ScanInts bit in Mode. The field may be
	// changed at any time.
	Mode uint

	// The Whitespace field controls which characters are recognized
	// as white space. To recognize a character ch <= ' ' as white space,
	// set the ch'th bit in Whitespace (the Scanner's behavior is undefined
	// for values ch > ' '). The field may be changed at any time.
	Whitespace uint64

	// IsIdentRune is a predicate controlling the characters accepted
	// as the ith rune in an identifier. The set of valid characters
	// must not intersect with the set of white space characters.
	// If no IsIdentRune function is set, regular Go identifiers are
	// accepted instead. The field may be changed at any time.
	IsIdentRune func(ch rune, i int) bool

	// Start position of most recently scanned token; set by Scan.
	// Calling Init or Next invalidates the position (Line == 0).
	// The Filename field is always left untouched by the Scanner.
	// If an error is reported (via Error) and Position is invalid,
	// the scanner is not inside a token. Call Pos to obtain an error
	// position in that case, or to obtain the position immediately
	// after the most recently scanned token.
	Position
}
    173 
    174 // Init initializes a Scanner with a new source and returns s.
    175 // Error is set to nil, ErrorCount is set to 0, Mode is set to GoTokens,
    176 // and Whitespace is set to GoWhitespace.
    177 func (s *Scanner) Init(src io.Reader) *Scanner {
    178 	s.src = src
    179 
    180 	// initialize source buffer
    181 	// (the first call to next() will fill it by calling src.Read)
    182 	s.srcBuf[0] = utf8.RuneSelf // sentinel
    183 	s.srcPos = 0
    184 	s.srcEnd = 0
    185 
    186 	// initialize source position
    187 	s.srcBufOffset = 0
    188 	s.line = 1
    189 	s.column = 0
    190 	s.lastLineLen = 0
    191 	s.lastCharLen = 0
    192 
    193 	// initialize token text buffer
    194 	// (required for first call to next()).
    195 	s.tokPos = -1
    196 
    197 	// initialize one character look-ahead
    198 	s.ch = -2 // no char read yet, not EOF
    199 
    200 	// initialize public fields
    201 	s.Error = nil
    202 	s.ErrorCount = 0
    203 	s.Mode = GoTokens
    204 	s.Whitespace = GoWhitespace
    205 	s.Line = 0 // invalidate token position
    206 
    207 	return s
    208 }
    209 
// next reads and returns the next Unicode character. It is designed such
// that only a minimal amount of work needs to be done in the common ASCII
// case (one test to check for both ASCII and end-of-buffer, and one test
// to check for newlines).
//
// The end-of-buffer test is folded into the ASCII test by keeping a
// utf8.RuneSelf sentinel byte at srcBuf[srcEnd]. next returns EOF once
// the source is exhausted and utf8.RuneError for an illegal UTF-8
// encoding. Read errors other than io.EOF, illegal encodings, and NUL
// characters are reported via s.error. Besides srcPos, next maintains
// line, column, lastLineLen, and lastCharLen.
func (s *Scanner) next() rune {
	ch, width := rune(s.srcBuf[s.srcPos]), 1

	if ch >= utf8.RuneSelf {
		// uncommon case: not ASCII or not enough bytes
		// (the sentinel at srcBuf[srcEnd] also leads here)
		for s.srcPos+utf8.UTFMax > s.srcEnd && !utf8.FullRune(s.srcBuf[s.srcPos:s.srcEnd]) {
			// not enough bytes: read some more, but first
			// save away token text if any
			if s.tokPos >= 0 {
				s.tokBuf.Write(s.srcBuf[s.tokPos:s.srcPos])
				s.tokPos = 0
				// s.tokEnd is set by Scan()
			}
			// move unread bytes to beginning of buffer
			copy(s.srcBuf[0:], s.srcBuf[s.srcPos:s.srcEnd])
			s.srcBufOffset += s.srcPos
			// read more bytes
			// (an io.Reader must return io.EOF when it reaches
			// the end of what it is reading - simply returning
			// n == 0 will make this loop retry forever; but the
			// error is in the reader implementation in that case)
			i := s.srcEnd - s.srcPos // number of unread bytes kept
			n, err := s.src.Read(s.srcBuf[i:bufLen])
			s.srcPos = 0
			s.srcEnd = i + n
			s.srcBuf[s.srcEnd] = utf8.RuneSelf // sentinel
			if err != nil {
				if err != io.EOF {
					s.error(err.Error())
				}
				if s.srcEnd == 0 {
					// no bytes left at all: report EOF
					if s.lastCharLen > 0 {
						// previous character was not EOF
						s.column++
					}
					s.lastCharLen = 0
					return EOF
				}
				// If err == EOF, we won't be getting more
				// bytes; break to avoid infinite loop. If
				// err is something else, we don't know if
				// we can get more bytes; thus also break.
				break
			}
		}
		// at least one byte
		ch = rune(s.srcBuf[s.srcPos])
		if ch >= utf8.RuneSelf {
			// uncommon case: not ASCII
			ch, width = utf8.DecodeRune(s.srcBuf[s.srcPos:s.srcEnd])
			if ch == utf8.RuneError && width == 1 {
				// advance for correct error position
				s.srcPos += width
				s.lastCharLen = width
				s.column++
				s.error("illegal UTF-8 encoding")
				return ch
			}
		}
	}

	// advance
	s.srcPos += width
	s.lastCharLen = width
	s.column++

	// special situations
	switch ch {
	case 0:
		// for compatibility with other tools
		s.error("illegal character NUL")
	case '\n':
		// start a new line, remembering the length of the one just
		// finished so that positions at the newline can be reported
		s.line++
		s.lastLineLen = s.column
		s.column = 0
	}

	return ch
}
    293 
    294 // Next reads and returns the next Unicode character.
    295 // It returns EOF at the end of the source. It reports
    296 // a read error by calling s.Error, if not nil; otherwise
    297 // it prints an error message to os.Stderr. Next does not
    298 // update the Scanner's Position field; use Pos() to
    299 // get the current position.
    300 func (s *Scanner) Next() rune {
    301 	s.tokPos = -1 // don't collect token text
    302 	s.Line = 0    // invalidate token position
    303 	ch := s.Peek()
    304 	if ch != EOF {
    305 		s.ch = s.next()
    306 	}
    307 	return ch
    308 }
    309 
    310 // Peek returns the next Unicode character in the source without advancing
    311 // the scanner. It returns EOF if the scanner's position is at the last
    312 // character of the source.
    313 func (s *Scanner) Peek() rune {
    314 	if s.ch == -2 {
    315 		// this code is only run for the very first character
    316 		s.ch = s.next()
    317 		if s.ch == '\uFEFF' {
    318 			s.ch = s.next() // ignore BOM
    319 		}
    320 	}
    321 	return s.ch
    322 }
    323 
    324 func (s *Scanner) error(msg string) {
    325 	s.ErrorCount++
    326 	if s.Error != nil {
    327 		s.Error(s, msg)
    328 		return
    329 	}
    330 	pos := s.Position
    331 	if !pos.IsValid() {
    332 		pos = s.Pos()
    333 	}
    334 	fmt.Fprintf(os.Stderr, "%s: %s\n", pos, msg)
    335 }
    336 
    337 func (s *Scanner) isIdentRune(ch rune, i int) bool {
    338 	if s.IsIdentRune != nil {
    339 		return s.IsIdentRune(ch, i)
    340 	}
    341 	return ch == '_' || unicode.IsLetter(ch) || unicode.IsDigit(ch) && i > 0
    342 }
    343 
    344 func (s *Scanner) scanIdentifier() rune {
    345 	// we know the zero'th rune is OK; start scanning at the next one
    346 	ch := s.next()
    347 	for i := 1; s.isIdentRune(ch, i); i++ {
    348 		ch = s.next()
    349 	}
    350 	return ch
    351 }
    352 
    353 func digitVal(ch rune) int {
    354 	switch {
    355 	case '0' <= ch && ch <= '9':
    356 		return int(ch - '0')
    357 	case 'a' <= ch && ch <= 'f':
    358 		return int(ch - 'a' + 10)
    359 	case 'A' <= ch && ch <= 'F':
    360 		return int(ch - 'A' + 10)
    361 	}
    362 	return 16 // larger than any legal digit val
    363 }
    364 
    365 func isDecimal(ch rune) bool { return '0' <= ch && ch <= '9' }
    366 
    367 func (s *Scanner) scanMantissa(ch rune) rune {
    368 	for isDecimal(ch) {
    369 		ch = s.next()
    370 	}
    371 	return ch
    372 }
    373 
    374 func (s *Scanner) scanFraction(ch rune) rune {
    375 	if ch == '.' {
    376 		ch = s.scanMantissa(s.next())
    377 	}
    378 	return ch
    379 }
    380 
    381 func (s *Scanner) scanExponent(ch rune) rune {
    382 	if ch == 'e' || ch == 'E' {
    383 		ch = s.next()
    384 		if ch == '-' || ch == '+' {
    385 			ch = s.next()
    386 		}
    387 		ch = s.scanMantissa(ch)
    388 	}
    389 	return ch
    390 }
    391 
    392 func (s *Scanner) scanNumber(ch rune) (rune, rune) {
    393 	// isDecimal(ch)
    394 	if ch == '0' {
    395 		// int or float
    396 		ch = s.next()
    397 		if ch == 'x' || ch == 'X' {
    398 			// hexadecimal int
    399 			ch = s.next()
    400 			hasMantissa := false
    401 			for digitVal(ch) < 16 {
    402 				ch = s.next()
    403 				hasMantissa = true
    404 			}
    405 			if !hasMantissa {
    406 				s.error("illegal hexadecimal number")
    407 			}
    408 		} else {
    409 			// octal int or float
    410 			has8or9 := false
    411 			for isDecimal(ch) {
    412 				if ch > '7' {
    413 					has8or9 = true
    414 				}
    415 				ch = s.next()
    416 			}
    417 			if s.Mode&ScanFloats != 0 && (ch == '.' || ch == 'e' || ch == 'E') {
    418 				// float
    419 				ch = s.scanFraction(ch)
    420 				ch = s.scanExponent(ch)
    421 				return Float, ch
    422 			}
    423 			// octal int
    424 			if has8or9 {
    425 				s.error("illegal octal number")
    426 			}
    427 		}
    428 		return Int, ch
    429 	}
    430 	// decimal int or float
    431 	ch = s.scanMantissa(ch)
    432 	if s.Mode&ScanFloats != 0 && (ch == '.' || ch == 'e' || ch == 'E') {
    433 		// float
    434 		ch = s.scanFraction(ch)
    435 		ch = s.scanExponent(ch)
    436 		return Float, ch
    437 	}
    438 	return Int, ch
    439 }
    440 
    441 func (s *Scanner) scanDigits(ch rune, base, n int) rune {
    442 	for n > 0 && digitVal(ch) < base {
    443 		ch = s.next()
    444 		n--
    445 	}
    446 	if n > 0 {
    447 		s.error("illegal char escape")
    448 	}
    449 	return ch
    450 }
    451 
    452 func (s *Scanner) scanEscape(quote rune) rune {
    453 	ch := s.next() // read character after '/'
    454 	switch ch {
    455 	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
    456 		// nothing to do
    457 		ch = s.next()
    458 	case '0', '1', '2', '3', '4', '5', '6', '7':
    459 		ch = s.scanDigits(ch, 8, 3)
    460 	case 'x':
    461 		ch = s.scanDigits(s.next(), 16, 2)
    462 	case 'u':
    463 		ch = s.scanDigits(s.next(), 16, 4)
    464 	case 'U':
    465 		ch = s.scanDigits(s.next(), 16, 8)
    466 	default:
    467 		s.error("illegal char escape")
    468 	}
    469 	return ch
    470 }
    471 
    472 func (s *Scanner) scanString(quote rune) (n int) {
    473 	ch := s.next() // read character after quote
    474 	for ch != quote {
    475 		if ch == '\n' || ch < 0 {
    476 			s.error("literal not terminated")
    477 			return
    478 		}
    479 		if ch == '\\' {
    480 			ch = s.scanEscape(quote)
    481 		} else {
    482 			ch = s.next()
    483 		}
    484 		n++
    485 	}
    486 	return
    487 }
    488 
    489 func (s *Scanner) scanRawString() {
    490 	ch := s.next() // read character after '`'
    491 	for ch != '`' {
    492 		if ch < 0 {
    493 			s.error("literal not terminated")
    494 			return
    495 		}
    496 		ch = s.next()
    497 	}
    498 }
    499 
    500 func (s *Scanner) scanChar() {
    501 	if s.scanString('\'') != 1 {
    502 		s.error("illegal char literal")
    503 	}
    504 }
    505 
    506 func (s *Scanner) scanComment(ch rune) rune {
    507 	// ch == '/' || ch == '*'
    508 	if ch == '/' {
    509 		// line comment
    510 		ch = s.next() // read character after "//"
    511 		for ch != '\n' && ch >= 0 {
    512 			ch = s.next()
    513 		}
    514 		return ch
    515 	}
    516 
    517 	// general comment
    518 	ch = s.next() // read character after "/*"
    519 	for {
    520 		if ch < 0 {
    521 			s.error("comment not terminated")
    522 			break
    523 		}
    524 		ch0 := ch
    525 		ch = s.next()
    526 		if ch0 == '*' && ch == '/' {
    527 			ch = s.next()
    528 			break
    529 		}
    530 	}
    531 	return ch
    532 }
    533 
    534 // Scan reads the next token or Unicode character from source and returns it.
    535 // It only recognizes tokens t for which the respective Mode bit (1<<-t) is set.
    536 // It returns EOF at the end of the source. It reports scanner errors (read and
    537 // token errors) by calling s.Error, if not nil; otherwise it prints an error
    538 // message to os.Stderr.
    539 func (s *Scanner) Scan() rune {
    540 	ch := s.Peek()
    541 
    542 	// reset token text position
    543 	s.tokPos = -1
    544 	s.Line = 0
    545 
    546 redo:
    547 	// skip white space
    548 	for s.Whitespace&(1<<uint(ch)) != 0 {
    549 		ch = s.next()
    550 	}
    551 
    552 	// start collecting token text
    553 	s.tokBuf.Reset()
    554 	s.tokPos = s.srcPos - s.lastCharLen
    555 
    556 	// set token position
    557 	// (this is a slightly optimized version of the code in Pos())
    558 	s.Offset = s.srcBufOffset + s.tokPos
    559 	if s.column > 0 {
    560 		// common case: last character was not a '\n'
    561 		s.Line = s.line
    562 		s.Column = s.column
    563 	} else {
    564 		// last character was a '\n'
    565 		// (we cannot be at the beginning of the source
    566 		// since we have called next() at least once)
    567 		s.Line = s.line - 1
    568 		s.Column = s.lastLineLen
    569 	}
    570 
    571 	// determine token value
    572 	tok := ch
    573 	switch {
    574 	case s.isIdentRune(ch, 0):
    575 		if s.Mode&ScanIdents != 0 {
    576 			tok = Ident
    577 			ch = s.scanIdentifier()
    578 		} else {
    579 			ch = s.next()
    580 		}
    581 	case isDecimal(ch):
    582 		if s.Mode&(ScanInts|ScanFloats) != 0 {
    583 			tok, ch = s.scanNumber(ch)
    584 		} else {
    585 			ch = s.next()
    586 		}
    587 	default:
    588 		switch ch {
    589 		case EOF:
    590 			break
    591 		case '"':
    592 			if s.Mode&ScanStrings != 0 {
    593 				s.scanString('"')
    594 				tok = String
    595 			}
    596 			ch = s.next()
    597 		case '\'':
    598 			if s.Mode&ScanChars != 0 {
    599 				s.scanChar()
    600 				tok = Char
    601 			}
    602 			ch = s.next()
    603 		case '.':
    604 			ch = s.next()
    605 			if isDecimal(ch) && s.Mode&ScanFloats != 0 {
    606 				tok = Float
    607 				ch = s.scanMantissa(ch)
    608 				ch = s.scanExponent(ch)
    609 			}
    610 		case '/':
    611 			ch = s.next()
    612 			if (ch == '/' || ch == '*') && s.Mode&ScanComments != 0 {
    613 				if s.Mode&SkipComments != 0 {
    614 					s.tokPos = -1 // don't collect token text
    615 					ch = s.scanComment(ch)
    616 					goto redo
    617 				}
    618 				ch = s.scanComment(ch)
    619 				tok = Comment
    620 			}
    621 		case '`':
    622 			if s.Mode&ScanRawStrings != 0 {
    623 				s.scanRawString()
    624 				tok = String
    625 			}
    626 			ch = s.next()
    627 		default:
    628 			ch = s.next()
    629 		}
    630 	}
    631 
    632 	// end of token text
    633 	s.tokEnd = s.srcPos - s.lastCharLen
    634 
    635 	s.ch = ch
    636 	return tok
    637 }
    638 
    639 // Pos returns the position of the character immediately after
    640 // the character or token returned by the last call to Next or Scan.
    641 // Use the Scanner's Position field for the start position of the most
    642 // recently scanned token.
    643 func (s *Scanner) Pos() (pos Position) {
    644 	pos.Filename = s.Filename
    645 	pos.Offset = s.srcBufOffset + s.srcPos - s.lastCharLen
    646 	switch {
    647 	case s.column > 0:
    648 		// common case: last character was not a '\n'
    649 		pos.Line = s.line
    650 		pos.Column = s.column
    651 	case s.lastLineLen > 0:
    652 		// last character was a '\n'
    653 		pos.Line = s.line - 1
    654 		pos.Column = s.lastLineLen
    655 	default:
    656 		// at the beginning of the source
    657 		pos.Line = 1
    658 		pos.Column = 1
    659 	}
    660 	return
    661 }
    662 
    663 // TokenText returns the string corresponding to the most recently scanned token.
    664 // Valid after calling Scan().
    665 func (s *Scanner) TokenText() string {
    666 	if s.tokPos < 0 {
    667 		// no token text
    668 		return ""
    669 	}
    670 
    671 	if s.tokEnd < 0 {
    672 		// if EOF was reached, s.tokEnd is set to -1 (s.srcPos == 0)
    673 		s.tokEnd = s.tokPos
    674 	}
    675 
    676 	if s.tokBuf.Len() == 0 {
    677 		// common case: the entire token text is still in srcBuf
    678 		return string(s.srcBuf[s.tokPos:s.tokEnd])
    679 	}
    680 
    681 	// part of the token text was saved in tokBuf: save the rest in
    682 	// tokBuf as well and return its content
    683 	s.tokBuf.Write(s.srcBuf[s.tokPos:s.tokEnd])
    684 	s.tokPos = s.tokEnd // ensure idempotency of TokenText() call
    685 	return s.tokBuf.String()
    686 }
    687