Home | History | Annotate | Download | only in syntax
      1 // Copyright 2016 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 package syntax
      6 
      7 import (
      8 	"io"
      9 	"unicode/utf8"
     10 )
     11 
     12 // buf [...read...|...|...unread...|s|...free...]
     13 //         ^      ^   ^            ^
     14 //         |      |   |            |
     15 //        suf     r0  r            w
     16 
     17 type source struct {
     18 	src   io.Reader
     19 	errh  ErrorHandler
     20 	first error // first error encountered
     21 
     22 	// source buffer
     23 	buf         [4 << 10]byte
     24 	offs        int   // source offset of buf
     25 	r0, r, w    int   // previous/current read and write buf positions, excluding sentinel
     26 	line0, line int   // previous/current line
     27 	err         error // pending io error
     28 
     29 	// literal buffer
     30 	lit []byte // literal prefix
     31 	suf int    // literal suffix; suf >= 0 means we are scanning a literal
     32 }
     33 
     34 func (s *source) init(src io.Reader, errh ErrorHandler) {
     35 	s.src = src
     36 	s.errh = errh
     37 	s.first = nil
     38 
     39 	s.buf[0] = utf8.RuneSelf // terminate with sentinel
     40 	s.offs = 0
     41 	s.r0, s.r, s.w = 0, 0, 0
     42 	s.line0, s.line = 1, 1
     43 	s.err = nil
     44 
     45 	s.lit = s.lit[:0]
     46 	s.suf = -1
     47 }
     48 
     49 func (s *source) error(msg string) {
     50 	s.error_at(s.pos0(), s.line0, msg)
     51 }
     52 
     53 func (s *source) error_at(pos, line int, msg string) {
     54 	err := Error{pos, line, msg}
     55 	if s.first == nil {
     56 		s.first = err
     57 	}
     58 	if s.errh == nil {
     59 		panic(s.first)
     60 	}
     61 	s.errh(err)
     62 }
     63 
     64 // pos0 returns the byte position of the last character read.
     65 func (s *source) pos0() int {
     66 	return s.offs + s.r0
     67 }
     68 
     69 func (s *source) ungetr() {
     70 	s.r, s.line = s.r0, s.line0
     71 }
     72 
     73 func (s *source) getr() rune {
     74 redo:
     75 	s.r0, s.line0 = s.r, s.line
     76 
     77 	// We could avoid at least one test that is always taken in the
     78 	// for loop below by duplicating the common case code (ASCII)
     79 	// here since we always have at least the sentinel (utf8.RuneSelf)
     80 	// in the buffer. Measure and optimize if necessary.
     81 
     82 	// make sure we have at least one rune in buffer, or we are at EOF
     83 	for s.r+utf8.UTFMax > s.w && !utf8.FullRune(s.buf[s.r:s.w]) && s.err == nil && s.w-s.r < len(s.buf) {
     84 		s.fill() // s.w-s.r < len(s.buf) => buffer is not full
     85 	}
     86 
     87 	// common case: ASCII and enough bytes
     88 	// (invariant: s.buf[s.w] == utf8.RuneSelf)
     89 	if b := s.buf[s.r]; b < utf8.RuneSelf {
     90 		s.r++
     91 		if b == 0 {
     92 			s.error("invalid NUL character")
     93 			goto redo
     94 		}
     95 		if b == '\n' {
     96 			s.line++
     97 		}
     98 		return rune(b)
     99 	}
    100 
    101 	// EOF
    102 	if s.r == s.w {
    103 		if s.err != io.EOF {
    104 			s.error(s.err.Error())
    105 		}
    106 		return -1
    107 	}
    108 
    109 	// uncommon case: not ASCII
    110 	r, w := utf8.DecodeRune(s.buf[s.r:s.w])
    111 	s.r += w
    112 
    113 	if r == utf8.RuneError && w == 1 {
    114 		s.error("invalid UTF-8 encoding")
    115 		goto redo
    116 	}
    117 
    118 	// BOM's are only allowed as the first character in a file
    119 	const BOM = 0xfeff
    120 	if r == BOM {
    121 		if s.r0 > 0 { // s.r0 is always > 0 after 1st character (fill will set it to 1)
    122 			s.error("invalid BOM in the middle of the file")
    123 		}
    124 		goto redo
    125 	}
    126 
    127 	return r
    128 }
    129 
    130 func (s *source) fill() {
    131 	// Slide unread bytes to beginning but preserve last read char
    132 	// (for one ungetr call) plus one extra byte (for a 2nd ungetr
    133 	// call, only for ".." character sequence and float literals
    134 	// starting with ".").
    135 	if s.r0 > 1 {
    136 		// save literal prefix, if any
    137 		// (We see at most one ungetr call while reading
    138 		// a literal, so make sure s.r0 remains in buf.)
    139 		if s.suf >= 0 {
    140 			s.lit = append(s.lit, s.buf[s.suf:s.r0]...)
    141 			s.suf = 1 // == s.r0 after slide below
    142 		}
    143 		s.offs += s.r0 - 1
    144 		r := s.r - s.r0 + 1 // last read char plus one byte
    145 		s.w = r + copy(s.buf[r:], s.buf[s.r:s.w])
    146 		s.r = r
    147 		s.r0 = 1
    148 	}
    149 
    150 	// read more data: try a limited number of times
    151 	for i := 100; i > 0; i-- {
    152 		n, err := s.src.Read(s.buf[s.w : len(s.buf)-1]) // -1 to leave space for sentinel
    153 		if n < 0 {
    154 			panic("negative read") // incorrect underlying io.Reader implementation
    155 		}
    156 		s.w += n
    157 		if n > 0 || err != nil {
    158 			s.buf[s.w] = utf8.RuneSelf // sentinel
    159 			if err != nil {
    160 				s.err = err
    161 			}
    162 			return
    163 		}
    164 	}
    165 
    166 	s.err = io.ErrNoProgress
    167 }
    168 
    169 func (s *source) startLit() {
    170 	s.suf = s.r0
    171 	s.lit = s.lit[:0] // reuse lit
    172 }
    173 
    174 func (s *source) stopLit() []byte {
    175 	lit := s.buf[s.suf:s.r]
    176 	if len(s.lit) > 0 {
    177 		lit = append(s.lit, lit...)
    178 	}
    179 	s.suf = -1 // no pending literal
    180 	return lit
    181 }
    182