Home | History | Annotate | Download | only in scanner
      1 // Copyright 2009 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // Package scanner provides a scanner and tokenizer for UTF-8-encoded text.
      6 // It takes an io.Reader providing the source, which then can be tokenized
      7 // through repeated calls to the Scan function. For compatibility with
      8 // existing tools, the NUL character is not allowed. If the first character
      9 // in the source is a UTF-8 encoded byte order mark (BOM), it is discarded.
     10 //
     11 // By default, a Scanner skips white space and Go comments and recognizes all
     12 // literals as defined by the Go language specification. It may be
     13 // customized to recognize only a subset of those literals and to recognize
     14 // different identifier and white space characters.
     15 package scanner
     16 
     17 import (
     18 	"bytes"
     19 	"fmt"
     20 	"io"
     21 	"os"
     22 	"unicode"
     23 	"unicode/utf8"
     24 )
     25 
     26 // A source position is represented by a Position value.
     27 // A position is valid if Line > 0.
     28 type Position struct {
     29 	Filename string // filename, if any
     30 	Offset   int    // byte offset, starting at 0
     31 	Line     int    // line number, starting at 1
     32 	Column   int    // column number, starting at 1 (character count per line)
     33 }
     34 
     35 // IsValid reports whether the position is valid.
     36 func (pos *Position) IsValid() bool { return pos.Line > 0 }
     37 
     38 func (pos Position) String() string {
     39 	s := pos.Filename
     40 	if s == "" {
     41 		s = "<input>"
     42 	}
     43 	if pos.IsValid() {
     44 		s += fmt.Sprintf(":%d:%d", pos.Line, pos.Column)
     45 	}
     46 	return s
     47 }
     48 
// Predefined mode bits to control recognition of tokens. For instance,
// to configure a Scanner such that it only recognizes (Go) identifiers,
// integers, and skips comments, set the Scanner's Mode field to:
//
//	ScanIdents | ScanInts | ScanComments | SkipComments
//
// (SkipComments only has an effect when ScanComments is set as well;
// without ScanComments the characters of a comment are scanned like
// any other text.)
//
// With the exception of comments, which are skipped if SkipComments is
// set, unrecognized tokens are not ignored. Instead, the scanner simply
// returns the respective individual characters (or possibly sub-tokens).
// For instance, if the mode is ScanIdents (not ScanStrings), the string
// "foo" is scanned as the token sequence '"' Ident '"'.
//
// Each mode bit is 1 shifted left by the negated value of the matching
// (negative) token constant, so for a token t the bit is 1<<-t.
const (
	ScanIdents     = 1 << -Ident
	ScanInts       = 1 << -Int
	ScanFloats     = 1 << -Float // includes Ints
	ScanChars      = 1 << -Char
	ScanStrings    = 1 << -String
	ScanRawStrings = 1 << -RawString
	ScanComments   = 1 << -Comment
	SkipComments   = 1 << -skipComment // if set with ScanComments, comments become white space
	GoTokens       = ScanIdents | ScanFloats | ScanChars | ScanStrings | ScanRawStrings | ScanComments | SkipComments
)
     72 
// The result of Scan is one of these tokens or a Unicode character.
// The token values are negative, starting with EOF == -1 and decreasing
// by one per constant (Ident == -2, Int == -3, ...).
const (
	EOF = -(iota + 1)
	Ident
	Int
	Float
	Char
	String
	RawString
	Comment
	skipComment // unexported; used to derive the SkipComments mode bit
)
     85 
// tokenString maps the exported token constants to their printable names.
// It is consulted by TokenString; skipComment has no entry.
var tokenString = map[rune]string{
	EOF:       "EOF",
	Ident:     "Ident",
	Int:       "Int",
	Float:     "Float",
	Char:      "Char",
	String:    "String",
	RawString: "RawString",
	Comment:   "Comment",
}
     96 
     97 // TokenString returns a printable string for a token or Unicode character.
     98 func TokenString(tok rune) string {
     99 	if s, found := tokenString[tok]; found {
    100 		return s
    101 	}
    102 	return fmt.Sprintf("%q", string(tok))
    103 }
    104 
// GoWhitespace is the default value for the Scanner's Whitespace field.
// Its value selects Go's white space characters: tab, newline, carriage
// return, and space, each represented by the bit at the character's value.
const GoWhitespace = 1<<'\t' | 1<<'\n' | 1<<'\r' | 1<<' '

// bufLen is the number of source bytes held in the Scanner's source buffer.
const bufLen = 1024 // at least utf8.UTFMax
    110 
// A Scanner implements reading of Unicode characters and tokens from an io.Reader.
// A Scanner must be set up with Init before use. The exported fields
// configure scanning and error reporting; the embedded Position holds the
// start of the most recently scanned token.
type Scanner struct {
	// Input
	src io.Reader

	// Source buffer
	srcBuf [bufLen + 1]byte // +1 for sentinel for common case of s.next()
	srcPos int              // reading position (srcBuf index)
	srcEnd int              // source end (srcBuf index)

	// Source position
	srcBufOffset int // byte offset of srcBuf[0] in source
	line         int // line count
	column       int // character count
	lastLineLen  int // length of last line in characters (for correct column reporting)
	lastCharLen  int // length of last character in bytes

	// Token text buffer
	// Typically, token text is stored completely in srcBuf, but in general
	// the token text's head may be buffered in tokBuf while the token text's
	// tail is stored in srcBuf.
	tokBuf bytes.Buffer // token text head that is not in srcBuf anymore
	tokPos int          // token text tail position (srcBuf index); valid if >= 0
	tokEnd int          // token text tail end (srcBuf index)

	// One character look-ahead
	// (Init sets ch to -2, meaning that no character has been read yet;
	// Peek replaces that value with the first character of the source.)
	ch rune // character before current srcPos

	// Error is called for each error encountered. If no Error
	// function is set, the error is reported to os.Stderr.
	Error func(s *Scanner, msg string)

	// ErrorCount is incremented by one for each error encountered.
	ErrorCount int

	// The Mode field controls which tokens are recognized. For instance,
	// to recognize Ints, set the ScanInts bit in Mode. The field may be
	// changed at any time.
	Mode uint

	// The Whitespace field controls which characters are recognized
	// as white space. To recognize a character ch <= ' ' as white space,
	// set the ch'th bit in Whitespace (the Scanner's behavior is undefined
	// for values ch > ' '). The field may be changed at any time.
	Whitespace uint64

	// IsIdentRune is a predicate controlling the characters accepted
	// as the ith rune in an identifier. The set of valid characters
	// must not intersect with the set of white space characters.
	// If no IsIdentRune function is set, regular Go identifiers are
	// accepted instead. The field may be changed at any time.
	IsIdentRune func(ch rune, i int) bool

	// Start position of most recently scanned token; set by Scan.
	// Calling Init or Next invalidates the position (Line == 0).
	// The Filename field is always left untouched by the Scanner.
	// If an error is reported (via Error) and Position is invalid,
	// the scanner is not inside a token. Call Pos to obtain an error
	// position in that case.
	Position
}
    172 
    173 // Init initializes a Scanner with a new source and returns s.
    174 // Error is set to nil, ErrorCount is set to 0, Mode is set to GoTokens,
    175 // and Whitespace is set to GoWhitespace.
    176 func (s *Scanner) Init(src io.Reader) *Scanner {
    177 	s.src = src
    178 
    179 	// initialize source buffer
    180 	// (the first call to next() will fill it by calling src.Read)
    181 	s.srcBuf[0] = utf8.RuneSelf // sentinel
    182 	s.srcPos = 0
    183 	s.srcEnd = 0
    184 
    185 	// initialize source position
    186 	s.srcBufOffset = 0
    187 	s.line = 1
    188 	s.column = 0
    189 	s.lastLineLen = 0
    190 	s.lastCharLen = 0
    191 
    192 	// initialize token text buffer
    193 	// (required for first call to next()).
    194 	s.tokPos = -1
    195 
    196 	// initialize one character look-ahead
    197 	s.ch = -2 // no char read yet, not EOF
    198 
    199 	// initialize public fields
    200 	s.Error = nil
    201 	s.ErrorCount = 0
    202 	s.Mode = GoTokens
    203 	s.Whitespace = GoWhitespace
    204 	s.Line = 0 // invalidate token position
    205 
    206 	return s
    207 }
    208 
// next reads and returns the next Unicode character. It is designed such
// that only a minimal amount of work needs to be done in the common ASCII
// case (one test to check for both ASCII and end-of-buffer, and one test
// to check for newlines).
//
// The byte at srcBuf[srcEnd] is always the sentinel utf8.RuneSelf, so
// running off the end of the buffered data takes the same (uncommon)
// branch as a non-ASCII byte. In that branch the buffer is refilled from
// src as needed. next returns EOF once no bytes are left, reports read
// errors other than io.EOF and malformed UTF-8 via s.error, and keeps
// srcPos, lastCharLen, line, column and lastLineLen up to date.
func (s *Scanner) next() rune {
	ch, width := rune(s.srcBuf[s.srcPos]), 1

	if ch >= utf8.RuneSelf {
		// uncommon case: not ASCII or not enough bytes
		for s.srcPos+utf8.UTFMax > s.srcEnd && !utf8.FullRune(s.srcBuf[s.srcPos:s.srcEnd]) {
			// not enough bytes: read some more, but first
			// save away token text if any
			if s.tokPos >= 0 {
				s.tokBuf.Write(s.srcBuf[s.tokPos:s.srcPos])
				s.tokPos = 0
				// s.tokEnd is set by Scan()
			}
			// move unread bytes to beginning of buffer
			copy(s.srcBuf[0:], s.srcBuf[s.srcPos:s.srcEnd])
			s.srcBufOffset += s.srcPos
			// read more bytes
			// (an io.Reader must return io.EOF when it reaches
			// the end of what it is reading - simply returning
			// n == 0 will make this loop retry forever; but the
			// error is in the reader implementation in that case)
			i := s.srcEnd - s.srcPos // number of unread bytes kept at the front
			n, err := s.src.Read(s.srcBuf[i:bufLen])
			s.srcPos = 0
			s.srcEnd = i + n
			s.srcBuf[s.srcEnd] = utf8.RuneSelf // sentinel
			if err != nil {
				if err != io.EOF {
					s.error(err.Error())
				}
				if s.srcEnd == 0 {
					// nothing buffered and nothing read: end of source
					if s.lastCharLen > 0 {
						// previous character was not EOF
						s.column++
					}
					s.lastCharLen = 0
					return EOF
				}
				// If err == EOF, we won't be getting more
				// bytes; break to avoid infinite loop. If
				// err is something else, we don't know if
				// we can get more bytes; thus also break.
				break
			}
		}
		// at least one byte
		ch = rune(s.srcBuf[s.srcPos])
		if ch >= utf8.RuneSelf {
			// uncommon case: not ASCII
			ch, width = utf8.DecodeRune(s.srcBuf[s.srcPos:s.srcEnd])
			if ch == utf8.RuneError && width == 1 {
				// advance for correct error position
				s.srcPos += width
				s.lastCharLen = width
				s.column++
				s.error("illegal UTF-8 encoding")
				return ch
			}
		}
	}

	// advance
	s.srcPos += width
	s.lastCharLen = width
	s.column++

	// special situations
	switch ch {
	case 0:
		// for compatibility with other tools
		s.error("illegal character NUL")
	case '\n':
		// remember the finished line's length so that positions
		// right after the newline can still be reported
		s.line++
		s.lastLineLen = s.column
		s.column = 0
	}

	return ch
}
    292 
    293 // Next reads and returns the next Unicode character.
    294 // It returns EOF at the end of the source. It reports
    295 // a read error by calling s.Error, if not nil; otherwise
    296 // it prints an error message to os.Stderr. Next does not
    297 // update the Scanner's Position field; use Pos() to
    298 // get the current position.
    299 func (s *Scanner) Next() rune {
    300 	s.tokPos = -1 // don't collect token text
    301 	s.Line = 0    // invalidate token position
    302 	ch := s.Peek()
    303 	if ch != EOF {
    304 		s.ch = s.next()
    305 	}
    306 	return ch
    307 }
    308 
    309 // Peek returns the next Unicode character in the source without advancing
    310 // the scanner. It returns EOF if the scanner's position is at the last
    311 // character of the source.
    312 func (s *Scanner) Peek() rune {
    313 	if s.ch == -2 {
    314 		// this code is only run for the very first character
    315 		s.ch = s.next()
    316 		if s.ch == '\uFEFF' {
    317 			s.ch = s.next() // ignore BOM
    318 		}
    319 	}
    320 	return s.ch
    321 }
    322 
    323 func (s *Scanner) error(msg string) {
    324 	s.ErrorCount++
    325 	if s.Error != nil {
    326 		s.Error(s, msg)
    327 		return
    328 	}
    329 	pos := s.Position
    330 	if !pos.IsValid() {
    331 		pos = s.Pos()
    332 	}
    333 	fmt.Fprintf(os.Stderr, "%s: %s\n", pos, msg)
    334 }
    335 
    336 func (s *Scanner) isIdentRune(ch rune, i int) bool {
    337 	if s.IsIdentRune != nil {
    338 		return s.IsIdentRune(ch, i)
    339 	}
    340 	return ch == '_' || unicode.IsLetter(ch) || unicode.IsDigit(ch) && i > 0
    341 }
    342 
    343 func (s *Scanner) scanIdentifier() rune {
    344 	// we know the zero'th rune is OK; start scanning at the next one
    345 	ch := s.next()
    346 	for i := 1; s.isIdentRune(ch, i); i++ {
    347 		ch = s.next()
    348 	}
    349 	return ch
    350 }
    351 
    352 func digitVal(ch rune) int {
    353 	switch {
    354 	case '0' <= ch && ch <= '9':
    355 		return int(ch - '0')
    356 	case 'a' <= ch && ch <= 'f':
    357 		return int(ch - 'a' + 10)
    358 	case 'A' <= ch && ch <= 'F':
    359 		return int(ch - 'A' + 10)
    360 	}
    361 	return 16 // larger than any legal digit val
    362 }
    363 
    364 func isDecimal(ch rune) bool { return '0' <= ch && ch <= '9' }
    365 
    366 func (s *Scanner) scanMantissa(ch rune) rune {
    367 	for isDecimal(ch) {
    368 		ch = s.next()
    369 	}
    370 	return ch
    371 }
    372 
    373 func (s *Scanner) scanFraction(ch rune) rune {
    374 	if ch == '.' {
    375 		ch = s.scanMantissa(s.next())
    376 	}
    377 	return ch
    378 }
    379 
    380 func (s *Scanner) scanExponent(ch rune) rune {
    381 	if ch == 'e' || ch == 'E' {
    382 		ch = s.next()
    383 		if ch == '-' || ch == '+' {
    384 			ch = s.next()
    385 		}
    386 		ch = s.scanMantissa(ch)
    387 	}
    388 	return ch
    389 }
    390 
    391 func (s *Scanner) scanNumber(ch rune) (rune, rune) {
    392 	// isDecimal(ch)
    393 	if ch == '0' {
    394 		// int or float
    395 		ch = s.next()
    396 		if ch == 'x' || ch == 'X' {
    397 			// hexadecimal int
    398 			ch = s.next()
    399 			hasMantissa := false
    400 			for digitVal(ch) < 16 {
    401 				ch = s.next()
    402 				hasMantissa = true
    403 			}
    404 			if !hasMantissa {
    405 				s.error("illegal hexadecimal number")
    406 			}
    407 		} else {
    408 			// octal int or float
    409 			has8or9 := false
    410 			for isDecimal(ch) {
    411 				if ch > '7' {
    412 					has8or9 = true
    413 				}
    414 				ch = s.next()
    415 			}
    416 			if s.Mode&ScanFloats != 0 && (ch == '.' || ch == 'e' || ch == 'E') {
    417 				// float
    418 				ch = s.scanFraction(ch)
    419 				ch = s.scanExponent(ch)
    420 				return Float, ch
    421 			}
    422 			// octal int
    423 			if has8or9 {
    424 				s.error("illegal octal number")
    425 			}
    426 		}
    427 		return Int, ch
    428 	}
    429 	// decimal int or float
    430 	ch = s.scanMantissa(ch)
    431 	if s.Mode&ScanFloats != 0 && (ch == '.' || ch == 'e' || ch == 'E') {
    432 		// float
    433 		ch = s.scanFraction(ch)
    434 		ch = s.scanExponent(ch)
    435 		return Float, ch
    436 	}
    437 	return Int, ch
    438 }
    439 
    440 func (s *Scanner) scanDigits(ch rune, base, n int) rune {
    441 	for n > 0 && digitVal(ch) < base {
    442 		ch = s.next()
    443 		n--
    444 	}
    445 	if n > 0 {
    446 		s.error("illegal char escape")
    447 	}
    448 	return ch
    449 }
    450 
    451 func (s *Scanner) scanEscape(quote rune) rune {
    452 	ch := s.next() // read character after '/'
    453 	switch ch {
    454 	case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
    455 		// nothing to do
    456 		ch = s.next()
    457 	case '0', '1', '2', '3', '4', '5', '6', '7':
    458 		ch = s.scanDigits(ch, 8, 3)
    459 	case 'x':
    460 		ch = s.scanDigits(s.next(), 16, 2)
    461 	case 'u':
    462 		ch = s.scanDigits(s.next(), 16, 4)
    463 	case 'U':
    464 		ch = s.scanDigits(s.next(), 16, 8)
    465 	default:
    466 		s.error("illegal char escape")
    467 	}
    468 	return ch
    469 }
    470 
    471 func (s *Scanner) scanString(quote rune) (n int) {
    472 	ch := s.next() // read character after quote
    473 	for ch != quote {
    474 		if ch == '\n' || ch < 0 {
    475 			s.error("literal not terminated")
    476 			return
    477 		}
    478 		if ch == '\\' {
    479 			ch = s.scanEscape(quote)
    480 		} else {
    481 			ch = s.next()
    482 		}
    483 		n++
    484 	}
    485 	return
    486 }
    487 
    488 func (s *Scanner) scanRawString() {
    489 	ch := s.next() // read character after '`'
    490 	for ch != '`' {
    491 		if ch < 0 {
    492 			s.error("literal not terminated")
    493 			return
    494 		}
    495 		ch = s.next()
    496 	}
    497 }
    498 
    499 func (s *Scanner) scanChar() {
    500 	if s.scanString('\'') != 1 {
    501 		s.error("illegal char literal")
    502 	}
    503 }
    504 
    505 func (s *Scanner) scanComment(ch rune) rune {
    506 	// ch == '/' || ch == '*'
    507 	if ch == '/' {
    508 		// line comment
    509 		ch = s.next() // read character after "//"
    510 		for ch != '\n' && ch >= 0 {
    511 			ch = s.next()
    512 		}
    513 		return ch
    514 	}
    515 
    516 	// general comment
    517 	ch = s.next() // read character after "/*"
    518 	for {
    519 		if ch < 0 {
    520 			s.error("comment not terminated")
    521 			break
    522 		}
    523 		ch0 := ch
    524 		ch = s.next()
    525 		if ch0 == '*' && ch == '/' {
    526 			ch = s.next()
    527 			break
    528 		}
    529 	}
    530 	return ch
    531 }
    532 
    533 // Scan reads the next token or Unicode character from source and returns it.
    534 // It only recognizes tokens t for which the respective Mode bit (1<<-t) is set.
    535 // It returns EOF at the end of the source. It reports scanner errors (read and
    536 // token errors) by calling s.Error, if not nil; otherwise it prints an error
    537 // message to os.Stderr.
    538 func (s *Scanner) Scan() rune {
    539 	ch := s.Peek()
    540 
    541 	// reset token text position
    542 	s.tokPos = -1
    543 	s.Line = 0
    544 
    545 redo:
    546 	// skip white space
    547 	for s.Whitespace&(1<<uint(ch)) != 0 {
    548 		ch = s.next()
    549 	}
    550 
    551 	// start collecting token text
    552 	s.tokBuf.Reset()
    553 	s.tokPos = s.srcPos - s.lastCharLen
    554 
    555 	// set token position
    556 	// (this is a slightly optimized version of the code in Pos())
    557 	s.Offset = s.srcBufOffset + s.tokPos
    558 	if s.column > 0 {
    559 		// common case: last character was not a '\n'
    560 		s.Line = s.line
    561 		s.Column = s.column
    562 	} else {
    563 		// last character was a '\n'
    564 		// (we cannot be at the beginning of the source
    565 		// since we have called next() at least once)
    566 		s.Line = s.line - 1
    567 		s.Column = s.lastLineLen
    568 	}
    569 
    570 	// determine token value
    571 	tok := ch
    572 	switch {
    573 	case s.isIdentRune(ch, 0):
    574 		if s.Mode&ScanIdents != 0 {
    575 			tok = Ident
    576 			ch = s.scanIdentifier()
    577 		} else {
    578 			ch = s.next()
    579 		}
    580 	case isDecimal(ch):
    581 		if s.Mode&(ScanInts|ScanFloats) != 0 {
    582 			tok, ch = s.scanNumber(ch)
    583 		} else {
    584 			ch = s.next()
    585 		}
    586 	default:
    587 		switch ch {
    588 		case EOF:
    589 			break
    590 		case '"':
    591 			if s.Mode&ScanStrings != 0 {
    592 				s.scanString('"')
    593 				tok = String
    594 			}
    595 			ch = s.next()
    596 		case '\'':
    597 			if s.Mode&ScanChars != 0 {
    598 				s.scanChar()
    599 				tok = Char
    600 			}
    601 			ch = s.next()
    602 		case '.':
    603 			ch = s.next()
    604 			if isDecimal(ch) && s.Mode&ScanFloats != 0 {
    605 				tok = Float
    606 				ch = s.scanMantissa(ch)
    607 				ch = s.scanExponent(ch)
    608 			}
    609 		case '/':
    610 			ch = s.next()
    611 			if (ch == '/' || ch == '*') && s.Mode&ScanComments != 0 {
    612 				if s.Mode&SkipComments != 0 {
    613 					s.tokPos = -1 // don't collect token text
    614 					ch = s.scanComment(ch)
    615 					goto redo
    616 				}
    617 				ch = s.scanComment(ch)
    618 				tok = Comment
    619 			}
    620 		case '`':
    621 			if s.Mode&ScanRawStrings != 0 {
    622 				s.scanRawString()
    623 				tok = String
    624 			}
    625 			ch = s.next()
    626 		default:
    627 			ch = s.next()
    628 		}
    629 	}
    630 
    631 	// end of token text
    632 	s.tokEnd = s.srcPos - s.lastCharLen
    633 
    634 	s.ch = ch
    635 	return tok
    636 }
    637 
    638 // Pos returns the position of the character immediately after
    639 // the character or token returned by the last call to Next or Scan.
    640 func (s *Scanner) Pos() (pos Position) {
    641 	pos.Filename = s.Filename
    642 	pos.Offset = s.srcBufOffset + s.srcPos - s.lastCharLen
    643 	switch {
    644 	case s.column > 0:
    645 		// common case: last character was not a '\n'
    646 		pos.Line = s.line
    647 		pos.Column = s.column
    648 	case s.lastLineLen > 0:
    649 		// last character was a '\n'
    650 		pos.Line = s.line - 1
    651 		pos.Column = s.lastLineLen
    652 	default:
    653 		// at the beginning of the source
    654 		pos.Line = 1
    655 		pos.Column = 1
    656 	}
    657 	return
    658 }
    659 
    660 // TokenText returns the string corresponding to the most recently scanned token.
    661 // Valid after calling Scan().
    662 func (s *Scanner) TokenText() string {
    663 	if s.tokPos < 0 {
    664 		// no token text
    665 		return ""
    666 	}
    667 
    668 	if s.tokEnd < 0 {
    669 		// if EOF was reached, s.tokEnd is set to -1 (s.srcPos == 0)
    670 		s.tokEnd = s.tokPos
    671 	}
    672 
    673 	if s.tokBuf.Len() == 0 {
    674 		// common case: the entire token text is still in srcBuf
    675 		return string(s.srcBuf[s.tokPos:s.tokEnd])
    676 	}
    677 
    678 	// part of the token text was saved in tokBuf: save the rest in
    679 	// tokBuf as well and return its content
    680 	s.tokBuf.Write(s.srcBuf[s.tokPos:s.tokEnd])
    681 	s.tokPos = s.tokEnd // ensure idempotency of TokenText() call
    682 	return s.tokBuf.String()
    683 }
    684