Home | History | Annotate | Download | only in syntax
      1 // Copyright 2016 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // This file implements source, a buffered rune reader
      6 // which is specialized for the needs of the Go scanner:
      7 // Contiguous sequences of runes (literals) are extracted
      8 // directly as []byte without the need to re-encode the
      9 // runes in UTF-8 (as would be necessary with bufio.Reader).
     10 //
     11 // This file is self-contained (go tool compile source.go
     12 // compiles) and thus could be made into its own package.
     13 
     14 package syntax
     15 
     16 import (
     17 	"io"
     18 	"unicode/utf8"
     19 )
     20 
     21 // starting points for line and column numbers
     22 const linebase = 1
     23 const colbase = 1
     24 
     25 // buf [...read...|...|...unread...|s|...free...]
     26 //         ^      ^   ^            ^
     27 //         |      |   |            |
     28 //        suf     r0  r            w
     29 
     30 type source struct {
     31 	src  io.Reader
     32 	errh func(line, pos uint, msg string)
     33 
     34 	// source buffer
     35 	buf         [4 << 10]byte
     36 	offs        int   // source offset of buf
     37 	r0, r, w    int   // previous/current read and write buf positions, excluding sentinel
     38 	line0, line uint  // previous/current line
     39 	col0, col   uint  // previous/current column (byte offsets from line start)
     40 	ioerr       error // pending io error
     41 
     42 	// literal buffer
     43 	lit []byte // literal prefix
     44 	suf int    // literal suffix; suf >= 0 means we are scanning a literal
     45 }
     46 
     47 // init initializes source to read from src and to report errors via errh.
     48 // errh must not be nil.
     49 func (s *source) init(src io.Reader, errh func(line, pos uint, msg string)) {
     50 	s.src = src
     51 	s.errh = errh
     52 
     53 	s.buf[0] = utf8.RuneSelf // terminate with sentinel
     54 	s.offs = 0
     55 	s.r0, s.r, s.w = 0, 0, 0
     56 	s.line0, s.line = 0, linebase
     57 	s.col0, s.col = 0, colbase
     58 	s.ioerr = nil
     59 
     60 	s.lit = s.lit[:0]
     61 	s.suf = -1
     62 }
     63 
     64 // ungetr ungets the most recently read rune.
     65 func (s *source) ungetr() {
     66 	s.r, s.line, s.col = s.r0, s.line0, s.col0
     67 }
     68 
     69 // ungetr2 is like ungetr but enables a 2nd ungetr.
     70 // It must not be called if one of the runes seen
     71 // was a newline.
     72 func (s *source) ungetr2() {
     73 	s.ungetr()
     74 	// line must not have changed
     75 	s.r0--
     76 	s.col0--
     77 }
     78 
     79 func (s *source) error(msg string) {
     80 	s.errh(s.line0, s.col0, msg)
     81 }
     82 
     83 // getr reads and returns the next rune.
     84 //
     85 // If a read or source encoding error occurs, getr
     86 // calls the error handler installed with init.
     87 // The handler must exist.
     88 //
     89 // The (line, col) position passed to the error handler
     90 // is always at the current source reading position.
     91 func (s *source) getr() rune {
     92 redo:
     93 	s.r0, s.line0, s.col0 = s.r, s.line, s.col
     94 
     95 	// We could avoid at least one test that is always taken in the
     96 	// for loop below by duplicating the common case code (ASCII)
     97 	// here since we always have at least the sentinel (utf8.RuneSelf)
     98 	// in the buffer. Measure and optimize if necessary.
     99 
    100 	// make sure we have at least one rune in buffer, or we are at EOF
    101 	for s.r+utf8.UTFMax > s.w && !utf8.FullRune(s.buf[s.r:s.w]) && s.ioerr == nil && s.w-s.r < len(s.buf) {
    102 		s.fill() // s.w-s.r < len(s.buf) => buffer is not full
    103 	}
    104 
    105 	// common case: ASCII and enough bytes
    106 	// (invariant: s.buf[s.w] == utf8.RuneSelf)
    107 	if b := s.buf[s.r]; b < utf8.RuneSelf {
    108 		s.r++
    109 		// TODO(gri) Optimization: Instead of adjusting s.col for each character,
    110 		// remember the line offset instead and then compute the offset as needed
    111 		// (which is less often).
    112 		s.col++
    113 		if b == 0 {
    114 			s.error("invalid NUL character")
    115 			goto redo
    116 		}
    117 		if b == '\n' {
    118 			s.line++
    119 			s.col = colbase
    120 		}
    121 		return rune(b)
    122 	}
    123 
    124 	// EOF
    125 	if s.r == s.w {
    126 		if s.ioerr != io.EOF {
    127 			s.error(s.ioerr.Error())
    128 		}
    129 		return -1
    130 	}
    131 
    132 	// uncommon case: not ASCII
    133 	r, w := utf8.DecodeRune(s.buf[s.r:s.w])
    134 	s.r += w
    135 	s.col += uint(w)
    136 
    137 	if r == utf8.RuneError && w == 1 {
    138 		s.error("invalid UTF-8 encoding")
    139 		goto redo
    140 	}
    141 
    142 	// BOM's are only allowed as the first character in a file
    143 	const BOM = 0xfeff
    144 	if r == BOM {
    145 		if s.r0 > 0 { // s.r0 is always > 0 after 1st character (fill will set it to 1)
    146 			s.error("invalid BOM in the middle of the file")
    147 		}
    148 		goto redo
    149 	}
    150 
    151 	return r
    152 }
    153 
    154 func (s *source) fill() {
    155 	// Slide unread bytes to beginning but preserve last read char
    156 	// (for one ungetr call) plus one extra byte (for a 2nd ungetr
    157 	// call, only for ".." character sequence and float literals
    158 	// starting with ".").
    159 	if s.r0 > 1 {
    160 		// save literal prefix, if any
    161 		// (We see at most one ungetr call while reading
    162 		// a literal, so make sure s.r0 remains in buf.)
    163 		if s.suf >= 0 {
    164 			s.lit = append(s.lit, s.buf[s.suf:s.r0]...)
    165 			s.suf = 1 // == s.r0 after slide below
    166 		}
    167 		n := s.r0 - 1
    168 		copy(s.buf[:], s.buf[n:s.w])
    169 		s.offs += n
    170 		s.r0 = 1 // eqv: s.r0 -= n
    171 		s.r -= n
    172 		s.w -= n
    173 	}
    174 
    175 	// read more data: try a limited number of times
    176 	for i := 100; i > 0; i-- {
    177 		n, err := s.src.Read(s.buf[s.w : len(s.buf)-1]) // -1 to leave space for sentinel
    178 		if n < 0 {
    179 			panic("negative read") // incorrect underlying io.Reader implementation
    180 		}
    181 		s.w += n
    182 		if n > 0 || err != nil {
    183 			s.buf[s.w] = utf8.RuneSelf // sentinel
    184 			if err != nil {
    185 				s.ioerr = err
    186 			}
    187 			return
    188 		}
    189 	}
    190 
    191 	s.ioerr = io.ErrNoProgress
    192 }
    193 
    194 func (s *source) startLit() {
    195 	s.suf = s.r0
    196 	s.lit = s.lit[:0] // reuse lit
    197 }
    198 
    199 func (s *source) stopLit() []byte {
    200 	lit := s.buf[s.suf:s.r]
    201 	if len(s.lit) > 0 {
    202 		lit = append(s.lit, lit...)
    203 	}
    204 	s.suf = -1 // no pending literal
    205 	return lit
    206 }
    207