Home | History | Annotate | Download | only in json
      1 // Copyright 2010 The Go Authors.  All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 package json
      6 
      7 // JSON value parser state machine.
      8 // Just about at the limit of what is reasonable to write by hand.
      9 // Some parts are a bit tedious, but overall it nicely factors out the
     10 // otherwise common code from the multiple scanning functions
     11 // in this package (Compact, Indent, checkValid, nextValue, etc).
     12 //
     13 // This file starts with two simple examples using the scanner
     14 // before diving into the scanner itself.
     15 
     16 import "strconv"
     17 
     18 // checkValid verifies that data is valid JSON-encoded data.
     19 // scan is passed in for use by checkValid to avoid an allocation.
     20 func checkValid(data []byte, scan *scanner) error {
     21 	scan.reset()
     22 	for _, c := range data {
     23 		scan.bytes++
     24 		if scan.step(scan, int(c)) == scanError {
     25 			return scan.err
     26 		}
     27 	}
     28 	if scan.eof() == scanError {
     29 		return scan.err
     30 	}
     31 	return nil
     32 }
     33 
     34 // nextValue splits data after the next whole JSON value,
     35 // returning that value and the bytes that follow it as separate slices.
     36 // scan is passed in for use by nextValue to avoid an allocation.
     37 func nextValue(data []byte, scan *scanner) (value, rest []byte, err error) {
     38 	scan.reset()
     39 	for i, c := range data {
     40 		v := scan.step(scan, int(c))
     41 		if v >= scanEndObject {
     42 			switch v {
     43 			// probe the scanner with a space to determine whether we will
     44 			// get scanEnd on the next character. Otherwise, if the next character
     45 			// is not a space, scanEndTop allocates a needless error.
     46 			case scanEndObject, scanEndArray:
     47 				if scan.step(scan, ' ') == scanEnd {
     48 					return data[:i+1], data[i+1:], nil
     49 				}
     50 			case scanError:
     51 				return nil, nil, scan.err
     52 			case scanEnd:
     53 				return data[0:i], data[i:], nil
     54 			}
     55 		}
     56 	}
     57 	if scan.eof() == scanError {
     58 		return nil, nil, scan.err
     59 	}
     60 	return data, nil, nil
     61 }
     62 
     63 // A SyntaxError is a description of a JSON syntax error.
     64 type SyntaxError struct {
     65 	msg    string // description of error
     66 	Offset int64  // error occurred after reading Offset bytes
     67 }
     68 
     69 func (e *SyntaxError) Error() string { return e.msg }
     70 
     71 // A scanner is a JSON scanning state machine.
     72 // Callers call scan.reset() and then pass bytes in one at a time
     73 // by calling scan.step(&scan, c) for each byte.
     74 // The return value, referred to as an opcode, tells the
     75 // caller about significant parsing events like beginning
     76 // and ending literals, objects, and arrays, so that the
     77 // caller can follow along if it wishes.
     78 // The return value scanEnd indicates that a single top-level
     79 // JSON value has been completed, *before* the byte that
     80 // just got passed in.  (The indication must be delayed in order
     81 // to recognize the end of numbers: is 123 a whole value or
     82 // the beginning of 12345e+6?).
     83 type scanner struct {
     84 	// The step is a func to be called to execute the next transition.
     85 	// Also tried using an integer constant and a single func
     86 	// with a switch, but using the func directly was 10% faster
     87 	// on a 64-bit Mac Mini, and it's nicer to read.
     88 	step func(*scanner, int) int
     89 
     90 	// Reached end of top-level value.
     91 	endTop bool
     92 
     93 	// Stack of what we're in the middle of - array values, object keys, object values.
     94 	parseState []int
     95 
     96 	// Error that happened, if any.
     97 	err error
     98 
     99 	// 1-byte redo (see undo method)
    100 	redo      bool
    101 	redoCode  int
    102 	redoState func(*scanner, int) int
    103 
    104 	// total bytes consumed, updated by decoder.Decode
    105 	bytes int64
    106 }
    107 
    108 // These values are returned by the state transition functions
    109 // assigned to scanner.state and the method scanner.eof.
    110 // They give details about the current state of the scan that
    111 // callers might be interested to know about.
    112 // It is okay to ignore the return value of any particular
    113 // call to scanner.state: if one call returns scanError,
    114 // every subsequent call will return scanError too.
    115 const (
    116 	// Continue.
    117 	scanContinue     = iota // uninteresting byte
    118 	scanBeginLiteral        // end implied by next result != scanContinue
    119 	scanBeginObject         // begin object
    120 	scanObjectKey           // just finished object key (string)
    121 	scanObjectValue         // just finished non-last object value
    122 	scanEndObject           // end object (implies scanObjectValue if possible)
    123 	scanBeginArray          // begin array
    124 	scanArrayValue          // just finished array value
    125 	scanEndArray            // end array (implies scanArrayValue if possible)
    126 	scanSkipSpace           // space byte; can skip; known to be last "continue" result
    127 
    128 	// Stop.
    129 	scanEnd   // top-level value ended *before* this byte; known to be first "stop" result
    130 	scanError // hit an error, scanner.err.
    131 )
    132 
    133 // These values are stored in the parseState stack.
    134 // They give the current state of a composite value
    135 // being scanned.  If the parser is inside a nested value
    136 // the parseState describes the nested state, outermost at entry 0.
    137 const (
    138 	parseObjectKey   = iota // parsing object key (before colon)
    139 	parseObjectValue        // parsing object value (after colon)
    140 	parseArrayValue         // parsing array value
    141 )
    142 
    143 // reset prepares the scanner for use.
    144 // It must be called before calling s.step.
    145 func (s *scanner) reset() {
    146 	s.step = stateBeginValue
    147 	s.parseState = s.parseState[0:0]
    148 	s.err = nil
    149 	s.redo = false
    150 	s.endTop = false
    151 }
    152 
    153 // eof tells the scanner that the end of input has been reached.
    154 // It returns a scan status just as s.step does.
    155 func (s *scanner) eof() int {
    156 	if s.err != nil {
    157 		return scanError
    158 	}
    159 	if s.endTop {
    160 		return scanEnd
    161 	}
    162 	s.step(s, ' ')
    163 	if s.endTop {
    164 		return scanEnd
    165 	}
    166 	if s.err == nil {
    167 		s.err = &SyntaxError{"unexpected end of JSON input", s.bytes}
    168 	}
    169 	return scanError
    170 }
    171 
    172 // pushParseState pushes a new parse state p onto the parse stack.
    173 func (s *scanner) pushParseState(p int) {
    174 	s.parseState = append(s.parseState, p)
    175 }
    176 
    177 // popParseState pops a parse state (already obtained) off the stack
    178 // and updates s.step accordingly.
    179 func (s *scanner) popParseState() {
    180 	n := len(s.parseState) - 1
    181 	s.parseState = s.parseState[0:n]
    182 	s.redo = false
    183 	if n == 0 {
    184 		s.step = stateEndTop
    185 		s.endTop = true
    186 	} else {
    187 		s.step = stateEndValue
    188 	}
    189 }
    190 
    191 func isSpace(c rune) bool {
    192 	return c == ' ' || c == '\t' || c == '\r' || c == '\n'
    193 }
    194 
    195 // stateBeginValueOrEmpty is the state after reading `[`.
    196 func stateBeginValueOrEmpty(s *scanner, c int) int {
    197 	if c <= ' ' && isSpace(rune(c)) {
    198 		return scanSkipSpace
    199 	}
    200 	if c == ']' {
    201 		return stateEndValue(s, c)
    202 	}
    203 	return stateBeginValue(s, c)
    204 }
    205 
    206 // stateBeginValue is the state at the beginning of the input.
    207 func stateBeginValue(s *scanner, c int) int {
    208 	if c <= ' ' && isSpace(rune(c)) {
    209 		return scanSkipSpace
    210 	}
    211 	switch c {
    212 	case '{':
    213 		s.step = stateBeginStringOrEmpty
    214 		s.pushParseState(parseObjectKey)
    215 		return scanBeginObject
    216 	case '[':
    217 		s.step = stateBeginValueOrEmpty
    218 		s.pushParseState(parseArrayValue)
    219 		return scanBeginArray
    220 	case '"':
    221 		s.step = stateInString
    222 		return scanBeginLiteral
    223 	case '-':
    224 		s.step = stateNeg
    225 		return scanBeginLiteral
    226 	case '0': // beginning of 0.123
    227 		s.step = state0
    228 		return scanBeginLiteral
    229 	case 't': // beginning of true
    230 		s.step = stateT
    231 		return scanBeginLiteral
    232 	case 'f': // beginning of false
    233 		s.step = stateF
    234 		return scanBeginLiteral
    235 	case 'n': // beginning of null
    236 		s.step = stateN
    237 		return scanBeginLiteral
    238 	}
    239 	if '1' <= c && c <= '9' { // beginning of 1234.5
    240 		s.step = state1
    241 		return scanBeginLiteral
    242 	}
    243 	return s.error(c, "looking for beginning of value")
    244 }
    245 
    246 // stateBeginStringOrEmpty is the state after reading `{`.
    247 func stateBeginStringOrEmpty(s *scanner, c int) int {
    248 	if c <= ' ' && isSpace(rune(c)) {
    249 		return scanSkipSpace
    250 	}
    251 	if c == '}' {
    252 		n := len(s.parseState)
    253 		s.parseState[n-1] = parseObjectValue
    254 		return stateEndValue(s, c)
    255 	}
    256 	return stateBeginString(s, c)
    257 }
    258 
    259 // stateBeginString is the state after reading `{"key": value,`.
    260 func stateBeginString(s *scanner, c int) int {
    261 	if c <= ' ' && isSpace(rune(c)) {
    262 		return scanSkipSpace
    263 	}
    264 	if c == '"' {
    265 		s.step = stateInString
    266 		return scanBeginLiteral
    267 	}
    268 	return s.error(c, "looking for beginning of object key string")
    269 }
    270 
    271 // stateEndValue is the state after completing a value,
    272 // such as after reading `{}` or `true` or `["x"`.
    273 func stateEndValue(s *scanner, c int) int {
    274 	n := len(s.parseState)
    275 	if n == 0 {
    276 		// Completed top-level before the current byte.
    277 		s.step = stateEndTop
    278 		s.endTop = true
    279 		return stateEndTop(s, c)
    280 	}
    281 	if c <= ' ' && isSpace(rune(c)) {
    282 		s.step = stateEndValue
    283 		return scanSkipSpace
    284 	}
    285 	ps := s.parseState[n-1]
    286 	switch ps {
    287 	case parseObjectKey:
    288 		if c == ':' {
    289 			s.parseState[n-1] = parseObjectValue
    290 			s.step = stateBeginValue
    291 			return scanObjectKey
    292 		}
    293 		return s.error(c, "after object key")
    294 	case parseObjectValue:
    295 		if c == ',' {
    296 			s.parseState[n-1] = parseObjectKey
    297 			s.step = stateBeginString
    298 			return scanObjectValue
    299 		}
    300 		if c == '}' {
    301 			s.popParseState()
    302 			return scanEndObject
    303 		}
    304 		return s.error(c, "after object key:value pair")
    305 	case parseArrayValue:
    306 		if c == ',' {
    307 			s.step = stateBeginValue
    308 			return scanArrayValue
    309 		}
    310 		if c == ']' {
    311 			s.popParseState()
    312 			return scanEndArray
    313 		}
    314 		return s.error(c, "after array element")
    315 	}
    316 	return s.error(c, "")
    317 }
    318 
    319 // stateEndTop is the state after finishing the top-level value,
    320 // such as after reading `{}` or `[1,2,3]`.
    321 // Only space characters should be seen now.
    322 func stateEndTop(s *scanner, c int) int {
    323 	if c != ' ' && c != '\t' && c != '\r' && c != '\n' {
    324 		// Complain about non-space byte on next call.
    325 		s.error(c, "after top-level value")
    326 	}
    327 	return scanEnd
    328 }
    329 
    330 // stateInString is the state after reading `"`.
    331 func stateInString(s *scanner, c int) int {
    332 	if c == '"' {
    333 		s.step = stateEndValue
    334 		return scanContinue
    335 	}
    336 	if c == '\\' {
    337 		s.step = stateInStringEsc
    338 		return scanContinue
    339 	}
    340 	if c < 0x20 {
    341 		return s.error(c, "in string literal")
    342 	}
    343 	return scanContinue
    344 }
    345 
    346 // stateInStringEsc is the state after reading `"\` during a quoted string.
    347 func stateInStringEsc(s *scanner, c int) int {
    348 	switch c {
    349 	case 'b', 'f', 'n', 'r', 't', '\\', '/', '"':
    350 		s.step = stateInString
    351 		return scanContinue
    352 	}
    353 	if c == 'u' {
    354 		s.step = stateInStringEscU
    355 		return scanContinue
    356 	}
    357 	return s.error(c, "in string escape code")
    358 }
    359 
    360 // stateInStringEscU is the state after reading `"\u` during a quoted string.
    361 func stateInStringEscU(s *scanner, c int) int {
    362 	if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
    363 		s.step = stateInStringEscU1
    364 		return scanContinue
    365 	}
    366 	// numbers
    367 	return s.error(c, "in \\u hexadecimal character escape")
    368 }
    369 
    370 // stateInStringEscU1 is the state after reading `"\u1` during a quoted string.
    371 func stateInStringEscU1(s *scanner, c int) int {
    372 	if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
    373 		s.step = stateInStringEscU12
    374 		return scanContinue
    375 	}
    376 	// numbers
    377 	return s.error(c, "in \\u hexadecimal character escape")
    378 }
    379 
    380 // stateInStringEscU12 is the state after reading `"\u12` during a quoted string.
    381 func stateInStringEscU12(s *scanner, c int) int {
    382 	if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
    383 		s.step = stateInStringEscU123
    384 		return scanContinue
    385 	}
    386 	// numbers
    387 	return s.error(c, "in \\u hexadecimal character escape")
    388 }
    389 
    390 // stateInStringEscU123 is the state after reading `"\u123` during a quoted string.
    391 func stateInStringEscU123(s *scanner, c int) int {
    392 	if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
    393 		s.step = stateInString
    394 		return scanContinue
    395 	}
    396 	// numbers
    397 	return s.error(c, "in \\u hexadecimal character escape")
    398 }
    399 
    400 // stateNeg is the state after reading `-` during a number.
    401 func stateNeg(s *scanner, c int) int {
    402 	if c == '0' {
    403 		s.step = state0
    404 		return scanContinue
    405 	}
    406 	if '1' <= c && c <= '9' {
    407 		s.step = state1
    408 		return scanContinue
    409 	}
    410 	return s.error(c, "in numeric literal")
    411 }
    412 
    413 // state1 is the state after reading a non-zero integer during a number,
    414 // such as after reading `1` or `100` but not `0`.
    415 func state1(s *scanner, c int) int {
    416 	if '0' <= c && c <= '9' {
    417 		s.step = state1
    418 		return scanContinue
    419 	}
    420 	return state0(s, c)
    421 }
    422 
    423 // state0 is the state after reading `0` during a number.
    424 func state0(s *scanner, c int) int {
    425 	if c == '.' {
    426 		s.step = stateDot
    427 		return scanContinue
    428 	}
    429 	if c == 'e' || c == 'E' {
    430 		s.step = stateE
    431 		return scanContinue
    432 	}
    433 	return stateEndValue(s, c)
    434 }
    435 
    436 // stateDot is the state after reading the integer and decimal point in a number,
    437 // such as after reading `1.`.
    438 func stateDot(s *scanner, c int) int {
    439 	if '0' <= c && c <= '9' {
    440 		s.step = stateDot0
    441 		return scanContinue
    442 	}
    443 	return s.error(c, "after decimal point in numeric literal")
    444 }
    445 
    446 // stateDot0 is the state after reading the integer, decimal point, and subsequent
    447 // digits of a number, such as after reading `3.14`.
    448 func stateDot0(s *scanner, c int) int {
    449 	if '0' <= c && c <= '9' {
    450 		s.step = stateDot0
    451 		return scanContinue
    452 	}
    453 	if c == 'e' || c == 'E' {
    454 		s.step = stateE
    455 		return scanContinue
    456 	}
    457 	return stateEndValue(s, c)
    458 }
    459 
    460 // stateE is the state after reading the mantissa and e in a number,
    461 // such as after reading `314e` or `0.314e`.
    462 func stateE(s *scanner, c int) int {
    463 	if c == '+' {
    464 		s.step = stateESign
    465 		return scanContinue
    466 	}
    467 	if c == '-' {
    468 		s.step = stateESign
    469 		return scanContinue
    470 	}
    471 	return stateESign(s, c)
    472 }
    473 
    474 // stateESign is the state after reading the mantissa, e, and sign in a number,
    475 // such as after reading `314e-` or `0.314e+`.
    476 func stateESign(s *scanner, c int) int {
    477 	if '0' <= c && c <= '9' {
    478 		s.step = stateE0
    479 		return scanContinue
    480 	}
    481 	return s.error(c, "in exponent of numeric literal")
    482 }
    483 
    484 // stateE0 is the state after reading the mantissa, e, optional sign,
    485 // and at least one digit of the exponent in a number,
    486 // such as after reading `314e-2` or `0.314e+1` or `3.14e0`.
    487 func stateE0(s *scanner, c int) int {
    488 	if '0' <= c && c <= '9' {
    489 		s.step = stateE0
    490 		return scanContinue
    491 	}
    492 	return stateEndValue(s, c)
    493 }
    494 
    495 // stateT is the state after reading `t`.
    496 func stateT(s *scanner, c int) int {
    497 	if c == 'r' {
    498 		s.step = stateTr
    499 		return scanContinue
    500 	}
    501 	return s.error(c, "in literal true (expecting 'r')")
    502 }
    503 
    504 // stateTr is the state after reading `tr`.
    505 func stateTr(s *scanner, c int) int {
    506 	if c == 'u' {
    507 		s.step = stateTru
    508 		return scanContinue
    509 	}
    510 	return s.error(c, "in literal true (expecting 'u')")
    511 }
    512 
    513 // stateTru is the state after reading `tru`.
    514 func stateTru(s *scanner, c int) int {
    515 	if c == 'e' {
    516 		s.step = stateEndValue
    517 		return scanContinue
    518 	}
    519 	return s.error(c, "in literal true (expecting 'e')")
    520 }
    521 
    522 // stateF is the state after reading `f`.
    523 func stateF(s *scanner, c int) int {
    524 	if c == 'a' {
    525 		s.step = stateFa
    526 		return scanContinue
    527 	}
    528 	return s.error(c, "in literal false (expecting 'a')")
    529 }
    530 
    531 // stateFa is the state after reading `fa`.
    532 func stateFa(s *scanner, c int) int {
    533 	if c == 'l' {
    534 		s.step = stateFal
    535 		return scanContinue
    536 	}
    537 	return s.error(c, "in literal false (expecting 'l')")
    538 }
    539 
    540 // stateFal is the state after reading `fal`.
    541 func stateFal(s *scanner, c int) int {
    542 	if c == 's' {
    543 		s.step = stateFals
    544 		return scanContinue
    545 	}
    546 	return s.error(c, "in literal false (expecting 's')")
    547 }
    548 
    549 // stateFals is the state after reading `fals`.
    550 func stateFals(s *scanner, c int) int {
    551 	if c == 'e' {
    552 		s.step = stateEndValue
    553 		return scanContinue
    554 	}
    555 	return s.error(c, "in literal false (expecting 'e')")
    556 }
    557 
    558 // stateN is the state after reading `n`.
    559 func stateN(s *scanner, c int) int {
    560 	if c == 'u' {
    561 		s.step = stateNu
    562 		return scanContinue
    563 	}
    564 	return s.error(c, "in literal null (expecting 'u')")
    565 }
    566 
    567 // stateNu is the state after reading `nu`.
    568 func stateNu(s *scanner, c int) int {
    569 	if c == 'l' {
    570 		s.step = stateNul
    571 		return scanContinue
    572 	}
    573 	return s.error(c, "in literal null (expecting 'l')")
    574 }
    575 
    576 // stateNul is the state after reading `nul`.
    577 func stateNul(s *scanner, c int) int {
    578 	if c == 'l' {
    579 		s.step = stateEndValue
    580 		return scanContinue
    581 	}
    582 	return s.error(c, "in literal null (expecting 'l')")
    583 }
    584 
    585 // stateError is the state after reaching a syntax error,
    586 // such as after reading `[1}` or `5.1.2`.
    587 func stateError(s *scanner, c int) int {
    588 	return scanError
    589 }
    590 
    591 // error records an error and switches to the error state.
    592 func (s *scanner) error(c int, context string) int {
    593 	s.step = stateError
    594 	s.err = &SyntaxError{"invalid character " + quoteChar(c) + " " + context, s.bytes}
    595 	return scanError
    596 }
    597 
    598 // quoteChar formats c as a quoted character literal
    599 func quoteChar(c int) string {
    600 	// special cases - different from quoted strings
    601 	if c == '\'' {
    602 		return `'\''`
    603 	}
    604 	if c == '"' {
    605 		return `'"'`
    606 	}
    607 
    608 	// use quoted string with different quotation marks
    609 	s := strconv.Quote(string(c))
    610 	return "'" + s[1:len(s)-1] + "'"
    611 }
    612 
    613 // undo causes the scanner to return scanCode from the next state transition.
    614 // This gives callers a simple 1-byte undo mechanism.
    615 func (s *scanner) undo(scanCode int) {
    616 	if s.redo {
    617 		panic("json: invalid use of scanner")
    618 	}
    619 	s.redoCode = scanCode
    620 	s.redoState = s.step
    621 	s.step = stateRedo
    622 	s.redo = true
    623 }
    624 
    625 // stateRedo helps implement the scanner's 1-byte undo.
    626 func stateRedo(s *scanner, c int) int {
    627 	s.redo = false
    628 	s.step = s.redoState
    629 	return s.redoCode
    630 }
    631