Home | History | Annotate | Download | only in json
      1 // Copyright 2010 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 package json
      6 
      7 // JSON value parser state machine.
      8 // Just about at the limit of what is reasonable to write by hand.
      9 // Some parts are a bit tedious, but overall it nicely factors out the
     10 // otherwise common code from the multiple scanning functions
     11 // in this package (Compact, Indent, checkValid, nextValue, etc).
     12 //
     13 // This file starts with two simple examples using the scanner
     14 // before diving into the scanner itself.
     15 
     16 import "strconv"
     17 
     18 // Valid reports whether data is a valid JSON encoding.
     19 func Valid(data []byte) bool {
     20 	return checkValid(data, &scanner{}) == nil
     21 }
     22 
     23 // checkValid verifies that data is valid JSON-encoded data.
     24 // scan is passed in for use by checkValid to avoid an allocation.
     25 func checkValid(data []byte, scan *scanner) error {
     26 	scan.reset()
     27 	for _, c := range data {
     28 		scan.bytes++
     29 		if scan.step(scan, c) == scanError {
     30 			return scan.err
     31 		}
     32 	}
     33 	if scan.eof() == scanError {
     34 		return scan.err
     35 	}
     36 	return nil
     37 }
     38 
     39 // nextValue splits data after the next whole JSON value,
     40 // returning that value and the bytes that follow it as separate slices.
     41 // scan is passed in for use by nextValue to avoid an allocation.
     42 func nextValue(data []byte, scan *scanner) (value, rest []byte, err error) {
     43 	scan.reset()
     44 	for i, c := range data {
     45 		v := scan.step(scan, c)
     46 		if v >= scanEndObject {
     47 			switch v {
     48 			// probe the scanner with a space to determine whether we will
     49 			// get scanEnd on the next character. Otherwise, if the next character
     50 			// is not a space, scanEndTop allocates a needless error.
     51 			case scanEndObject, scanEndArray:
     52 				if scan.step(scan, ' ') == scanEnd {
     53 					return data[:i+1], data[i+1:], nil
     54 				}
     55 			case scanError:
     56 				return nil, nil, scan.err
     57 			case scanEnd:
     58 				return data[:i], data[i:], nil
     59 			}
     60 		}
     61 	}
     62 	if scan.eof() == scanError {
     63 		return nil, nil, scan.err
     64 	}
     65 	return data, nil, nil
     66 }
     67 
     68 // A SyntaxError is a description of a JSON syntax error.
     69 type SyntaxError struct {
     70 	msg    string // description of error
     71 	Offset int64  // error occurred after reading Offset bytes
     72 }
     73 
     74 func (e *SyntaxError) Error() string { return e.msg }
     75 
     76 // A scanner is a JSON scanning state machine.
     77 // Callers call scan.reset() and then pass bytes in one at a time
     78 // by calling scan.step(&scan, c) for each byte.
     79 // The return value, referred to as an opcode, tells the
     80 // caller about significant parsing events like beginning
     81 // and ending literals, objects, and arrays, so that the
     82 // caller can follow along if it wishes.
     83 // The return value scanEnd indicates that a single top-level
     84 // JSON value has been completed, *before* the byte that
     85 // just got passed in.  (The indication must be delayed in order
     86 // to recognize the end of numbers: is 123 a whole value or
     87 // the beginning of 12345e+6?).
     88 type scanner struct {
     89 	// The step is a func to be called to execute the next transition.
     90 	// Also tried using an integer constant and a single func
     91 	// with a switch, but using the func directly was 10% faster
     92 	// on a 64-bit Mac Mini, and it's nicer to read.
     93 	step func(*scanner, byte) int
     94 
     95 	// Reached end of top-level value.
     96 	endTop bool
     97 
     98 	// Stack of what we're in the middle of - array values, object keys, object values.
     99 	parseState []int
    100 
    101 	// Error that happened, if any.
    102 	err error
    103 
    104 	// 1-byte redo (see undo method)
    105 	redo      bool
    106 	redoCode  int
    107 	redoState func(*scanner, byte) int
    108 
    109 	// total bytes consumed, updated by decoder.Decode
    110 	bytes int64
    111 }
    112 
    113 // These values are returned by the state transition functions
    114 // assigned to scanner.state and the method scanner.eof.
    115 // They give details about the current state of the scan that
    116 // callers might be interested to know about.
    117 // It is okay to ignore the return value of any particular
    118 // call to scanner.state: if one call returns scanError,
    119 // every subsequent call will return scanError too.
    120 const (
    121 	// Continue.
    122 	scanContinue     = iota // uninteresting byte
    123 	scanBeginLiteral        // end implied by next result != scanContinue
    124 	scanBeginObject         // begin object
    125 	scanObjectKey           // just finished object key (string)
    126 	scanObjectValue         // just finished non-last object value
    127 	scanEndObject           // end object (implies scanObjectValue if possible)
    128 	scanBeginArray          // begin array
    129 	scanArrayValue          // just finished array value
    130 	scanEndArray            // end array (implies scanArrayValue if possible)
    131 	scanSkipSpace           // space byte; can skip; known to be last "continue" result
    132 
    133 	// Stop.
    134 	scanEnd   // top-level value ended *before* this byte; known to be first "stop" result
    135 	scanError // hit an error, scanner.err.
    136 )
    137 
    138 // These values are stored in the parseState stack.
    139 // They give the current state of a composite value
    140 // being scanned. If the parser is inside a nested value
    141 // the parseState describes the nested state, outermost at entry 0.
    142 const (
    143 	parseObjectKey   = iota // parsing object key (before colon)
    144 	parseObjectValue        // parsing object value (after colon)
    145 	parseArrayValue         // parsing array value
    146 )
    147 
    148 // reset prepares the scanner for use.
    149 // It must be called before calling s.step.
    150 func (s *scanner) reset() {
    151 	s.step = stateBeginValue
    152 	s.parseState = s.parseState[0:0]
    153 	s.err = nil
    154 	s.redo = false
    155 	s.endTop = false
    156 }
    157 
    158 // eof tells the scanner that the end of input has been reached.
    159 // It returns a scan status just as s.step does.
    160 func (s *scanner) eof() int {
    161 	if s.err != nil {
    162 		return scanError
    163 	}
    164 	if s.endTop {
    165 		return scanEnd
    166 	}
    167 	s.step(s, ' ')
    168 	if s.endTop {
    169 		return scanEnd
    170 	}
    171 	if s.err == nil {
    172 		s.err = &SyntaxError{"unexpected end of JSON input", s.bytes}
    173 	}
    174 	return scanError
    175 }
    176 
    177 // pushParseState pushes a new parse state p onto the parse stack.
    178 func (s *scanner) pushParseState(p int) {
    179 	s.parseState = append(s.parseState, p)
    180 }
    181 
    182 // popParseState pops a parse state (already obtained) off the stack
    183 // and updates s.step accordingly.
    184 func (s *scanner) popParseState() {
    185 	n := len(s.parseState) - 1
    186 	s.parseState = s.parseState[0:n]
    187 	s.redo = false
    188 	if n == 0 {
    189 		s.step = stateEndTop
    190 		s.endTop = true
    191 	} else {
    192 		s.step = stateEndValue
    193 	}
    194 }
    195 
    196 func isSpace(c byte) bool {
    197 	return c == ' ' || c == '\t' || c == '\r' || c == '\n'
    198 }
    199 
    200 // stateBeginValueOrEmpty is the state after reading `[`.
    201 func stateBeginValueOrEmpty(s *scanner, c byte) int {
    202 	if c <= ' ' && isSpace(c) {
    203 		return scanSkipSpace
    204 	}
    205 	if c == ']' {
    206 		return stateEndValue(s, c)
    207 	}
    208 	return stateBeginValue(s, c)
    209 }
    210 
    211 // stateBeginValue is the state at the beginning of the input.
    212 func stateBeginValue(s *scanner, c byte) int {
    213 	if c <= ' ' && isSpace(c) {
    214 		return scanSkipSpace
    215 	}
    216 	switch c {
    217 	case '{':
    218 		s.step = stateBeginStringOrEmpty
    219 		s.pushParseState(parseObjectKey)
    220 		return scanBeginObject
    221 	case '[':
    222 		s.step = stateBeginValueOrEmpty
    223 		s.pushParseState(parseArrayValue)
    224 		return scanBeginArray
    225 	case '"':
    226 		s.step = stateInString
    227 		return scanBeginLiteral
    228 	case '-':
    229 		s.step = stateNeg
    230 		return scanBeginLiteral
    231 	case '0': // beginning of 0.123
    232 		s.step = state0
    233 		return scanBeginLiteral
    234 	case 't': // beginning of true
    235 		s.step = stateT
    236 		return scanBeginLiteral
    237 	case 'f': // beginning of false
    238 		s.step = stateF
    239 		return scanBeginLiteral
    240 	case 'n': // beginning of null
    241 		s.step = stateN
    242 		return scanBeginLiteral
    243 	}
    244 	if '1' <= c && c <= '9' { // beginning of 1234.5
    245 		s.step = state1
    246 		return scanBeginLiteral
    247 	}
    248 	return s.error(c, "looking for beginning of value")
    249 }
    250 
    251 // stateBeginStringOrEmpty is the state after reading `{`.
    252 func stateBeginStringOrEmpty(s *scanner, c byte) int {
    253 	if c <= ' ' && isSpace(c) {
    254 		return scanSkipSpace
    255 	}
    256 	if c == '}' {
    257 		n := len(s.parseState)
    258 		s.parseState[n-1] = parseObjectValue
    259 		return stateEndValue(s, c)
    260 	}
    261 	return stateBeginString(s, c)
    262 }
    263 
    264 // stateBeginString is the state after reading `{"key": value,`.
    265 func stateBeginString(s *scanner, c byte) int {
    266 	if c <= ' ' && isSpace(c) {
    267 		return scanSkipSpace
    268 	}
    269 	if c == '"' {
    270 		s.step = stateInString
    271 		return scanBeginLiteral
    272 	}
    273 	return s.error(c, "looking for beginning of object key string")
    274 }
    275 
    276 // stateEndValue is the state after completing a value,
    277 // such as after reading `{}` or `true` or `["x"`.
    278 func stateEndValue(s *scanner, c byte) int {
    279 	n := len(s.parseState)
    280 	if n == 0 {
    281 		// Completed top-level before the current byte.
    282 		s.step = stateEndTop
    283 		s.endTop = true
    284 		return stateEndTop(s, c)
    285 	}
    286 	if c <= ' ' && isSpace(c) {
    287 		s.step = stateEndValue
    288 		return scanSkipSpace
    289 	}
    290 	ps := s.parseState[n-1]
    291 	switch ps {
    292 	case parseObjectKey:
    293 		if c == ':' {
    294 			s.parseState[n-1] = parseObjectValue
    295 			s.step = stateBeginValue
    296 			return scanObjectKey
    297 		}
    298 		return s.error(c, "after object key")
    299 	case parseObjectValue:
    300 		if c == ',' {
    301 			s.parseState[n-1] = parseObjectKey
    302 			s.step = stateBeginString
    303 			return scanObjectValue
    304 		}
    305 		if c == '}' {
    306 			s.popParseState()
    307 			return scanEndObject
    308 		}
    309 		return s.error(c, "after object key:value pair")
    310 	case parseArrayValue:
    311 		if c == ',' {
    312 			s.step = stateBeginValue
    313 			return scanArrayValue
    314 		}
    315 		if c == ']' {
    316 			s.popParseState()
    317 			return scanEndArray
    318 		}
    319 		return s.error(c, "after array element")
    320 	}
    321 	return s.error(c, "")
    322 }
    323 
    324 // stateEndTop is the state after finishing the top-level value,
    325 // such as after reading `{}` or `[1,2,3]`.
    326 // Only space characters should be seen now.
    327 func stateEndTop(s *scanner, c byte) int {
    328 	if c != ' ' && c != '\t' && c != '\r' && c != '\n' {
    329 		// Complain about non-space byte on next call.
    330 		s.error(c, "after top-level value")
    331 	}
    332 	return scanEnd
    333 }
    334 
    335 // stateInString is the state after reading `"`.
    336 func stateInString(s *scanner, c byte) int {
    337 	if c == '"' {
    338 		s.step = stateEndValue
    339 		return scanContinue
    340 	}
    341 	if c == '\\' {
    342 		s.step = stateInStringEsc
    343 		return scanContinue
    344 	}
    345 	if c < 0x20 {
    346 		return s.error(c, "in string literal")
    347 	}
    348 	return scanContinue
    349 }
    350 
    351 // stateInStringEsc is the state after reading `"\` during a quoted string.
    352 func stateInStringEsc(s *scanner, c byte) int {
    353 	switch c {
    354 	case 'b', 'f', 'n', 'r', 't', '\\', '/', '"':
    355 		s.step = stateInString
    356 		return scanContinue
    357 	case 'u':
    358 		s.step = stateInStringEscU
    359 		return scanContinue
    360 	}
    361 	return s.error(c, "in string escape code")
    362 }
    363 
    364 // stateInStringEscU is the state after reading `"\u` during a quoted string.
    365 func stateInStringEscU(s *scanner, c byte) int {
    366 	if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
    367 		s.step = stateInStringEscU1
    368 		return scanContinue
    369 	}
    370 	// numbers
    371 	return s.error(c, "in \\u hexadecimal character escape")
    372 }
    373 
    374 // stateInStringEscU1 is the state after reading `"\u1` during a quoted string.
    375 func stateInStringEscU1(s *scanner, c byte) int {
    376 	if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
    377 		s.step = stateInStringEscU12
    378 		return scanContinue
    379 	}
    380 	// numbers
    381 	return s.error(c, "in \\u hexadecimal character escape")
    382 }
    383 
    384 // stateInStringEscU12 is the state after reading `"\u12` during a quoted string.
    385 func stateInStringEscU12(s *scanner, c byte) int {
    386 	if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
    387 		s.step = stateInStringEscU123
    388 		return scanContinue
    389 	}
    390 	// numbers
    391 	return s.error(c, "in \\u hexadecimal character escape")
    392 }
    393 
    394 // stateInStringEscU123 is the state after reading `"\u123` during a quoted string.
    395 func stateInStringEscU123(s *scanner, c byte) int {
    396 	if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
    397 		s.step = stateInString
    398 		return scanContinue
    399 	}
    400 	// numbers
    401 	return s.error(c, "in \\u hexadecimal character escape")
    402 }
    403 
    404 // stateNeg is the state after reading `-` during a number.
    405 func stateNeg(s *scanner, c byte) int {
    406 	if c == '0' {
    407 		s.step = state0
    408 		return scanContinue
    409 	}
    410 	if '1' <= c && c <= '9' {
    411 		s.step = state1
    412 		return scanContinue
    413 	}
    414 	return s.error(c, "in numeric literal")
    415 }
    416 
    417 // state1 is the state after reading a non-zero integer during a number,
    418 // such as after reading `1` or `100` but not `0`.
    419 func state1(s *scanner, c byte) int {
    420 	if '0' <= c && c <= '9' {
    421 		s.step = state1
    422 		return scanContinue
    423 	}
    424 	return state0(s, c)
    425 }
    426 
    427 // state0 is the state after reading `0` during a number.
    428 func state0(s *scanner, c byte) int {
    429 	if c == '.' {
    430 		s.step = stateDot
    431 		return scanContinue
    432 	}
    433 	if c == 'e' || c == 'E' {
    434 		s.step = stateE
    435 		return scanContinue
    436 	}
    437 	return stateEndValue(s, c)
    438 }
    439 
    440 // stateDot is the state after reading the integer and decimal point in a number,
    441 // such as after reading `1.`.
    442 func stateDot(s *scanner, c byte) int {
    443 	if '0' <= c && c <= '9' {
    444 		s.step = stateDot0
    445 		return scanContinue
    446 	}
    447 	return s.error(c, "after decimal point in numeric literal")
    448 }
    449 
    450 // stateDot0 is the state after reading the integer, decimal point, and subsequent
    451 // digits of a number, such as after reading `3.14`.
    452 func stateDot0(s *scanner, c byte) int {
    453 	if '0' <= c && c <= '9' {
    454 		return scanContinue
    455 	}
    456 	if c == 'e' || c == 'E' {
    457 		s.step = stateE
    458 		return scanContinue
    459 	}
    460 	return stateEndValue(s, c)
    461 }
    462 
    463 // stateE is the state after reading the mantissa and e in a number,
    464 // such as after reading `314e` or `0.314e`.
    465 func stateE(s *scanner, c byte) int {
    466 	if c == '+' || c == '-' {
    467 		s.step = stateESign
    468 		return scanContinue
    469 	}
    470 	return stateESign(s, c)
    471 }
    472 
    473 // stateESign is the state after reading the mantissa, e, and sign in a number,
    474 // such as after reading `314e-` or `0.314e+`.
    475 func stateESign(s *scanner, c byte) int {
    476 	if '0' <= c && c <= '9' {
    477 		s.step = stateE0
    478 		return scanContinue
    479 	}
    480 	return s.error(c, "in exponent of numeric literal")
    481 }
    482 
    483 // stateE0 is the state after reading the mantissa, e, optional sign,
    484 // and at least one digit of the exponent in a number,
    485 // such as after reading `314e-2` or `0.314e+1` or `3.14e0`.
    486 func stateE0(s *scanner, c byte) int {
    487 	if '0' <= c && c <= '9' {
    488 		return scanContinue
    489 	}
    490 	return stateEndValue(s, c)
    491 }
    492 
    493 // stateT is the state after reading `t`.
    494 func stateT(s *scanner, c byte) int {
    495 	if c == 'r' {
    496 		s.step = stateTr
    497 		return scanContinue
    498 	}
    499 	return s.error(c, "in literal true (expecting 'r')")
    500 }
    501 
    502 // stateTr is the state after reading `tr`.
    503 func stateTr(s *scanner, c byte) int {
    504 	if c == 'u' {
    505 		s.step = stateTru
    506 		return scanContinue
    507 	}
    508 	return s.error(c, "in literal true (expecting 'u')")
    509 }
    510 
    511 // stateTru is the state after reading `tru`.
    512 func stateTru(s *scanner, c byte) int {
    513 	if c == 'e' {
    514 		s.step = stateEndValue
    515 		return scanContinue
    516 	}
    517 	return s.error(c, "in literal true (expecting 'e')")
    518 }
    519 
    520 // stateF is the state after reading `f`.
    521 func stateF(s *scanner, c byte) int {
    522 	if c == 'a' {
    523 		s.step = stateFa
    524 		return scanContinue
    525 	}
    526 	return s.error(c, "in literal false (expecting 'a')")
    527 }
    528 
    529 // stateFa is the state after reading `fa`.
    530 func stateFa(s *scanner, c byte) int {
    531 	if c == 'l' {
    532 		s.step = stateFal
    533 		return scanContinue
    534 	}
    535 	return s.error(c, "in literal false (expecting 'l')")
    536 }
    537 
    538 // stateFal is the state after reading `fal`.
    539 func stateFal(s *scanner, c byte) int {
    540 	if c == 's' {
    541 		s.step = stateFals
    542 		return scanContinue
    543 	}
    544 	return s.error(c, "in literal false (expecting 's')")
    545 }
    546 
    547 // stateFals is the state after reading `fals`.
    548 func stateFals(s *scanner, c byte) int {
    549 	if c == 'e' {
    550 		s.step = stateEndValue
    551 		return scanContinue
    552 	}
    553 	return s.error(c, "in literal false (expecting 'e')")
    554 }
    555 
    556 // stateN is the state after reading `n`.
    557 func stateN(s *scanner, c byte) int {
    558 	if c == 'u' {
    559 		s.step = stateNu
    560 		return scanContinue
    561 	}
    562 	return s.error(c, "in literal null (expecting 'u')")
    563 }
    564 
    565 // stateNu is the state after reading `nu`.
    566 func stateNu(s *scanner, c byte) int {
    567 	if c == 'l' {
    568 		s.step = stateNul
    569 		return scanContinue
    570 	}
    571 	return s.error(c, "in literal null (expecting 'l')")
    572 }
    573 
    574 // stateNul is the state after reading `nul`.
    575 func stateNul(s *scanner, c byte) int {
    576 	if c == 'l' {
    577 		s.step = stateEndValue
    578 		return scanContinue
    579 	}
    580 	return s.error(c, "in literal null (expecting 'l')")
    581 }
    582 
    583 // stateError is the state after reaching a syntax error,
    584 // such as after reading `[1}` or `5.1.2`.
    585 func stateError(s *scanner, c byte) int {
    586 	return scanError
    587 }
    588 
    589 // error records an error and switches to the error state.
    590 func (s *scanner) error(c byte, context string) int {
    591 	s.step = stateError
    592 	s.err = &SyntaxError{"invalid character " + quoteChar(c) + " " + context, s.bytes}
    593 	return scanError
    594 }
    595 
    596 // quoteChar formats c as a quoted character literal
    597 func quoteChar(c byte) string {
    598 	// special cases - different from quoted strings
    599 	if c == '\'' {
    600 		return `'\''`
    601 	}
    602 	if c == '"' {
    603 		return `'"'`
    604 	}
    605 
    606 	// use quoted string with different quotation marks
    607 	s := strconv.Quote(string(c))
    608 	return "'" + s[1:len(s)-1] + "'"
    609 }
    610 
    611 // undo causes the scanner to return scanCode from the next state transition.
    612 // This gives callers a simple 1-byte undo mechanism.
    613 func (s *scanner) undo(scanCode int) {
    614 	if s.redo {
    615 		panic("json: invalid use of scanner")
    616 	}
    617 	s.redoCode = scanCode
    618 	s.redoState = s.step
    619 	s.step = stateRedo
    620 	s.redo = true
    621 }
    622 
    623 // stateRedo helps implement the scanner's 1-byte undo.
    624 func stateRedo(s *scanner, c byte) int {
    625 	s.redo = false
    626 	s.step = s.redoState
    627 	return s.redoCode
    628 }
    629