1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "base/json/json_reader.h" 6 7 #include "base/float_util.h" 8 #include "base/logging.h" 9 #include "base/memory/scoped_ptr.h" 10 #include "base/string_number_conversions.h" 11 #include "base/string_util.h" 12 #include "base/utf_string_conversions.h" 13 #include "base/values.h" 14 15 namespace base { 16 17 static const JSONReader::Token kInvalidToken(JSONReader::Token::INVALID_TOKEN, 18 0, 0); 19 static const int kStackLimit = 100; 20 21 namespace { 22 23 // A helper method for ParseNumberToken. It reads an int from the end of 24 // token. The method returns false if there is no valid integer at the end of 25 // the token. 26 bool ReadInt(JSONReader::Token& token, bool can_have_leading_zeros) { 27 wchar_t first = token.NextChar(); 28 int len = 0; 29 30 // Read in more digits 31 wchar_t c = first; 32 while ('\0' != c && '0' <= c && c <= '9') { 33 ++token.length; 34 ++len; 35 c = token.NextChar(); 36 } 37 // We need at least 1 digit. 38 if (len == 0) 39 return false; 40 41 if (!can_have_leading_zeros && len > 1 && '0' == first) 42 return false; 43 44 return true; 45 } 46 47 // A helper method for ParseStringToken. It reads |digits| hex digits from the 48 // token. If the sequence if digits is not valid (contains other characters), 49 // the method returns false. 50 bool ReadHexDigits(JSONReader::Token& token, int digits) { 51 for (int i = 1; i <= digits; ++i) { 52 wchar_t c = *(token.begin + token.length + i); 53 if ('\0' == c) 54 return false; 55 if (!(('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || 56 ('A' <= c && c <= 'F'))) { 57 return false; 58 } 59 } 60 61 token.length += digits; 62 return true; 63 } 64 65 } // anonymous namespace 66 67 const char* JSONReader::kBadRootElementType = 68 "Root value must be an array or object."; 69 const char* JSONReader::kInvalidEscape = 70 "Invalid escape sequence."; 71 const char* JSONReader::kSyntaxError = 72 "Syntax error."; 73 const char* JSONReader::kTrailingComma = 74 "Trailing comma not allowed."; 75 const char* JSONReader::kTooMuchNesting = 76 "Too much nesting."; 77 const char* JSONReader::kUnexpectedDataAfterRoot = 78 "Unexpected data after root element."; 79 const char* JSONReader::kUnsupportedEncoding = 80 "Unsupported encoding. JSON must be UTF-8."; 81 const char* JSONReader::kUnquotedDictionaryKey = 82 "Dictionary keys must be quoted."; 83 84 JSONReader::JSONReader() 85 : start_pos_(NULL), json_pos_(NULL), stack_depth_(0), 86 allow_trailing_comma_(false), 87 error_code_(JSON_NO_ERROR), error_line_(0), error_col_(0) {} 88 89 /* static */ 90 Value* JSONReader::Read(const std::string& json, 91 bool allow_trailing_comma) { 92 return ReadAndReturnError(json, allow_trailing_comma, NULL, NULL); 93 } 94 95 /* static */ 96 Value* JSONReader::ReadAndReturnError(const std::string& json, 97 bool allow_trailing_comma, 98 int* error_code_out, 99 std::string* error_msg_out) { 100 JSONReader reader = JSONReader(); 101 Value* root = reader.JsonToValue(json, true, allow_trailing_comma); 102 if (root) 103 return root; 104 105 if (error_code_out) 106 *error_code_out = reader.error_code(); 107 if (error_msg_out) 108 *error_msg_out = reader.GetErrorMessage(); 109 110 return NULL; 111 } 112 113 /* static */ 114 std::string JSONReader::ErrorCodeToString(JsonParseError error_code) { 115 switch (error_code) { 116 case JSON_NO_ERROR: 117 return std::string(); 118 case JSON_BAD_ROOT_ELEMENT_TYPE: 119 return kBadRootElementType; 120 case JSON_INVALID_ESCAPE: 121 return kInvalidEscape; 122 case JSON_SYNTAX_ERROR: 123 return kSyntaxError; 124 case JSON_TRAILING_COMMA: 125 return kTrailingComma; 126 case JSON_TOO_MUCH_NESTING: 127 return kTooMuchNesting; 128 case JSON_UNEXPECTED_DATA_AFTER_ROOT: 129 return kUnexpectedDataAfterRoot; 130 case JSON_UNSUPPORTED_ENCODING: 131 return kUnsupportedEncoding; 132 case JSON_UNQUOTED_DICTIONARY_KEY: 133 return kUnquotedDictionaryKey; 134 default: 135 NOTREACHED(); 136 return std::string(); 137 } 138 } 139 140 std::string JSONReader::GetErrorMessage() const { 141 return FormatErrorMessage(error_line_, error_col_, 142 ErrorCodeToString(error_code_)); 143 } 144 145 Value* JSONReader::JsonToValue(const std::string& json, bool check_root, 146 bool allow_trailing_comma) { 147 // The input must be in UTF-8. 148 if (!IsStringUTF8(json.c_str())) { 149 error_code_ = JSON_UNSUPPORTED_ENCODING; 150 return NULL; 151 } 152 153 // The conversion from UTF8 to wstring removes null bytes for us 154 // (a good thing). 155 std::wstring json_wide(UTF8ToWide(json)); 156 start_pos_ = json_wide.c_str(); 157 158 // When the input JSON string starts with a UTF-8 Byte-Order-Mark 159 // (0xEF, 0xBB, 0xBF), the UTF8ToWide() function converts it to a Unicode 160 // BOM (U+FEFF). To avoid the JSONReader::BuildValue() function from 161 // mis-treating a Unicode BOM as an invalid character and returning NULL, 162 // skip a converted Unicode BOM if it exists. 163 if (!json_wide.empty() && start_pos_[0] == 0xFEFF) { 164 ++start_pos_; 165 } 166 167 json_pos_ = start_pos_; 168 allow_trailing_comma_ = allow_trailing_comma; 169 stack_depth_ = 0; 170 error_code_ = JSON_NO_ERROR; 171 172 scoped_ptr<Value> root(BuildValue(check_root)); 173 if (root.get()) { 174 if (ParseToken().type == Token::END_OF_INPUT) { 175 return root.release(); 176 } else { 177 SetErrorCode(JSON_UNEXPECTED_DATA_AFTER_ROOT, json_pos_); 178 } 179 } 180 181 // Default to calling errors "syntax errors". 182 if (error_code_ == 0) 183 SetErrorCode(JSON_SYNTAX_ERROR, json_pos_); 184 185 return NULL; 186 } 187 188 /* static */ 189 std::string JSONReader::FormatErrorMessage(int line, int column, 190 const std::string& description) { 191 if (line || column) { 192 return StringPrintf("Line: %i, column: %i, %s", 193 line, column, description.c_str()); 194 } 195 return description; 196 } 197 198 Value* JSONReader::BuildValue(bool is_root) { 199 ++stack_depth_; 200 if (stack_depth_ > kStackLimit) { 201 SetErrorCode(JSON_TOO_MUCH_NESTING, json_pos_); 202 return NULL; 203 } 204 205 Token token = ParseToken(); 206 // The root token must be an array or an object. 207 if (is_root && token.type != Token::OBJECT_BEGIN && 208 token.type != Token::ARRAY_BEGIN) { 209 SetErrorCode(JSON_BAD_ROOT_ELEMENT_TYPE, json_pos_); 210 return NULL; 211 } 212 213 scoped_ptr<Value> node; 214 215 switch (token.type) { 216 case Token::END_OF_INPUT: 217 case Token::INVALID_TOKEN: 218 return NULL; 219 220 case Token::NULL_TOKEN: 221 node.reset(Value::CreateNullValue()); 222 break; 223 224 case Token::BOOL_TRUE: 225 node.reset(Value::CreateBooleanValue(true)); 226 break; 227 228 case Token::BOOL_FALSE: 229 node.reset(Value::CreateBooleanValue(false)); 230 break; 231 232 case Token::NUMBER: 233 node.reset(DecodeNumber(token)); 234 if (!node.get()) 235 return NULL; 236 break; 237 238 case Token::STRING: 239 node.reset(DecodeString(token)); 240 if (!node.get()) 241 return NULL; 242 break; 243 244 case Token::ARRAY_BEGIN: 245 { 246 json_pos_ += token.length; 247 token = ParseToken(); 248 249 node.reset(new ListValue()); 250 while (token.type != Token::ARRAY_END) { 251 Value* array_node = BuildValue(false); 252 if (!array_node) 253 return NULL; 254 static_cast<ListValue*>(node.get())->Append(array_node); 255 256 // After a list value, we expect a comma or the end of the list. 257 token = ParseToken(); 258 if (token.type == Token::LIST_SEPARATOR) { 259 json_pos_ += token.length; 260 token = ParseToken(); 261 // Trailing commas are invalid according to the JSON RFC, but some 262 // consumers need the parsing leniency, so handle accordingly. 263 if (token.type == Token::ARRAY_END) { 264 if (!allow_trailing_comma_) { 265 SetErrorCode(JSON_TRAILING_COMMA, json_pos_); 266 return NULL; 267 } 268 // Trailing comma OK, stop parsing the Array. 269 break; 270 } 271 } else if (token.type != Token::ARRAY_END) { 272 // Unexpected value after list value. Bail out. 273 return NULL; 274 } 275 } 276 if (token.type != Token::ARRAY_END) { 277 return NULL; 278 } 279 break; 280 } 281 282 case Token::OBJECT_BEGIN: 283 { 284 json_pos_ += token.length; 285 token = ParseToken(); 286 287 node.reset(new DictionaryValue); 288 while (token.type != Token::OBJECT_END) { 289 if (token.type != Token::STRING) { 290 SetErrorCode(JSON_UNQUOTED_DICTIONARY_KEY, json_pos_); 291 return NULL; 292 } 293 scoped_ptr<Value> dict_key_value(DecodeString(token)); 294 if (!dict_key_value.get()) 295 return NULL; 296 297 // Convert the key into a wstring. 298 std::string dict_key; 299 bool success = dict_key_value->GetAsString(&dict_key); 300 DCHECK(success); 301 302 json_pos_ += token.length; 303 token = ParseToken(); 304 if (token.type != Token::OBJECT_PAIR_SEPARATOR) 305 return NULL; 306 307 json_pos_ += token.length; 308 token = ParseToken(); 309 Value* dict_value = BuildValue(false); 310 if (!dict_value) 311 return NULL; 312 static_cast<DictionaryValue*>(node.get())->SetWithoutPathExpansion( 313 dict_key, dict_value); 314 315 // After a key/value pair, we expect a comma or the end of the 316 // object. 317 token = ParseToken(); 318 if (token.type == Token::LIST_SEPARATOR) { 319 json_pos_ += token.length; 320 token = ParseToken(); 321 // Trailing commas are invalid according to the JSON RFC, but some 322 // consumers need the parsing leniency, so handle accordingly. 323 if (token.type == Token::OBJECT_END) { 324 if (!allow_trailing_comma_) { 325 SetErrorCode(JSON_TRAILING_COMMA, json_pos_); 326 return NULL; 327 } 328 // Trailing comma OK, stop parsing the Object. 329 break; 330 } 331 } else if (token.type != Token::OBJECT_END) { 332 // Unexpected value after last object value. Bail out. 333 return NULL; 334 } 335 } 336 if (token.type != Token::OBJECT_END) 337 return NULL; 338 339 break; 340 } 341 342 default: 343 // We got a token that's not a value. 344 return NULL; 345 } 346 json_pos_ += token.length; 347 348 --stack_depth_; 349 return node.release(); 350 } 351 352 JSONReader::Token JSONReader::ParseNumberToken() { 353 // We just grab the number here. We validate the size in DecodeNumber. 354 // According to RFC4627, a valid number is: [minus] int [frac] [exp] 355 Token token(Token::NUMBER, json_pos_, 0); 356 wchar_t c = *json_pos_; 357 if ('-' == c) { 358 ++token.length; 359 c = token.NextChar(); 360 } 361 362 if (!ReadInt(token, false)) 363 return kInvalidToken; 364 365 // Optional fraction part 366 c = token.NextChar(); 367 if ('.' == c) { 368 ++token.length; 369 if (!ReadInt(token, true)) 370 return kInvalidToken; 371 c = token.NextChar(); 372 } 373 374 // Optional exponent part 375 if ('e' == c || 'E' == c) { 376 ++token.length; 377 c = token.NextChar(); 378 if ('-' == c || '+' == c) { 379 ++token.length; 380 c = token.NextChar(); 381 } 382 if (!ReadInt(token, true)) 383 return kInvalidToken; 384 } 385 386 return token; 387 } 388 389 Value* JSONReader::DecodeNumber(const Token& token) { 390 const std::wstring num_string(token.begin, token.length); 391 392 int num_int; 393 if (StringToInt(WideToUTF8(num_string), &num_int)) 394 return Value::CreateIntegerValue(num_int); 395 396 double num_double; 397 if (StringToDouble(WideToUTF8(num_string), &num_double) && 398 base::IsFinite(num_double)) 399 return Value::CreateDoubleValue(num_double); 400 401 return NULL; 402 } 403 404 JSONReader::Token JSONReader::ParseStringToken() { 405 Token token(Token::STRING, json_pos_, 1); 406 wchar_t c = token.NextChar(); 407 while ('\0' != c) { 408 if ('\\' == c) { 409 ++token.length; 410 c = token.NextChar(); 411 // Make sure the escaped char is valid. 412 switch (c) { 413 case 'x': 414 if (!ReadHexDigits(token, 2)) { 415 SetErrorCode(JSON_INVALID_ESCAPE, json_pos_ + token.length); 416 return kInvalidToken; 417 } 418 break; 419 case 'u': 420 if (!ReadHexDigits(token, 4)) { 421 SetErrorCode(JSON_INVALID_ESCAPE, json_pos_ + token.length); 422 return kInvalidToken; 423 } 424 break; 425 case '\\': 426 case '/': 427 case 'b': 428 case 'f': 429 case 'n': 430 case 'r': 431 case 't': 432 case 'v': 433 case '"': 434 break; 435 default: 436 SetErrorCode(JSON_INVALID_ESCAPE, json_pos_ + token.length); 437 return kInvalidToken; 438 } 439 } else if ('"' == c) { 440 ++token.length; 441 return token; 442 } 443 ++token.length; 444 c = token.NextChar(); 445 } 446 return kInvalidToken; 447 } 448 449 Value* JSONReader::DecodeString(const Token& token) { 450 std::wstring decoded_str; 451 decoded_str.reserve(token.length - 2); 452 453 for (int i = 1; i < token.length - 1; ++i) { 454 wchar_t c = *(token.begin + i); 455 if ('\\' == c) { 456 ++i; 457 c = *(token.begin + i); 458 switch (c) { 459 case '"': 460 case '/': 461 case '\\': 462 decoded_str.push_back(c); 463 break; 464 case 'b': 465 decoded_str.push_back('\b'); 466 break; 467 case 'f': 468 decoded_str.push_back('\f'); 469 break; 470 case 'n': 471 decoded_str.push_back('\n'); 472 break; 473 case 'r': 474 decoded_str.push_back('\r'); 475 break; 476 case 't': 477 decoded_str.push_back('\t'); 478 break; 479 case 'v': 480 decoded_str.push_back('\v'); 481 break; 482 483 case 'x': 484 decoded_str.push_back((HexDigitToInt(*(token.begin + i + 1)) << 4) + 485 HexDigitToInt(*(token.begin + i + 2))); 486 i += 2; 487 break; 488 case 'u': 489 decoded_str.push_back((HexDigitToInt(*(token.begin + i + 1)) << 12 ) + 490 (HexDigitToInt(*(token.begin + i + 2)) << 8) + 491 (HexDigitToInt(*(token.begin + i + 3)) << 4) + 492 HexDigitToInt(*(token.begin + i + 4))); 493 i += 4; 494 break; 495 496 default: 497 // We should only have valid strings at this point. If not, 498 // ParseStringToken didn't do it's job. 499 NOTREACHED(); 500 return NULL; 501 } 502 } else { 503 // Not escaped 504 decoded_str.push_back(c); 505 } 506 } 507 return Value::CreateStringValue(WideToUTF16Hack(decoded_str)); 508 } 509 510 JSONReader::Token JSONReader::ParseToken() { 511 static const std::wstring kNullString(L"null"); 512 static const std::wstring kTrueString(L"true"); 513 static const std::wstring kFalseString(L"false"); 514 515 EatWhitespaceAndComments(); 516 517 Token token(Token::INVALID_TOKEN, 0, 0); 518 switch (*json_pos_) { 519 case '\0': 520 token.type = Token::END_OF_INPUT; 521 break; 522 523 case 'n': 524 if (NextStringMatch(kNullString)) 525 token = Token(Token::NULL_TOKEN, json_pos_, 4); 526 break; 527 528 case 't': 529 if (NextStringMatch(kTrueString)) 530 token = Token(Token::BOOL_TRUE, json_pos_, 4); 531 break; 532 533 case 'f': 534 if (NextStringMatch(kFalseString)) 535 token = Token(Token::BOOL_FALSE, json_pos_, 5); 536 break; 537 538 case '[': 539 token = Token(Token::ARRAY_BEGIN, json_pos_, 1); 540 break; 541 542 case ']': 543 token = Token(Token::ARRAY_END, json_pos_, 1); 544 break; 545 546 case ',': 547 token = Token(Token::LIST_SEPARATOR, json_pos_, 1); 548 break; 549 550 case '{': 551 token = Token(Token::OBJECT_BEGIN, json_pos_, 1); 552 break; 553 554 case '}': 555 token = Token(Token::OBJECT_END, json_pos_, 1); 556 break; 557 558 case ':': 559 token = Token(Token::OBJECT_PAIR_SEPARATOR, json_pos_, 1); 560 break; 561 562 case '0': 563 case '1': 564 case '2': 565 case '3': 566 case '4': 567 case '5': 568 case '6': 569 case '7': 570 case '8': 571 case '9': 572 case '-': 573 token = ParseNumberToken(); 574 break; 575 576 case '"': 577 token = ParseStringToken(); 578 break; 579 } 580 return token; 581 } 582 583 void JSONReader::EatWhitespaceAndComments() { 584 while ('\0' != *json_pos_) { 585 switch (*json_pos_) { 586 case ' ': 587 case '\n': 588 case '\r': 589 case '\t': 590 ++json_pos_; 591 break; 592 case '/': 593 // TODO(tc): This isn't in the RFC so it should be a parser flag. 594 if (!EatComment()) 595 return; 596 break; 597 default: 598 // Not a whitespace char, just exit. 599 return; 600 } 601 } 602 } 603 604 bool JSONReader::EatComment() { 605 if ('/' != *json_pos_) 606 return false; 607 608 wchar_t next_char = *(json_pos_ + 1); 609 if ('/' == next_char) { 610 // Line comment, read until \n or \r 611 json_pos_ += 2; 612 while ('\0' != *json_pos_) { 613 switch (*json_pos_) { 614 case '\n': 615 case '\r': 616 ++json_pos_; 617 return true; 618 default: 619 ++json_pos_; 620 } 621 } 622 } else if ('*' == next_char) { 623 // Block comment, read until */ 624 json_pos_ += 2; 625 while ('\0' != *json_pos_) { 626 if ('*' == *json_pos_ && '/' == *(json_pos_ + 1)) { 627 json_pos_ += 2; 628 return true; 629 } 630 ++json_pos_; 631 } 632 } else { 633 return false; 634 } 635 return true; 636 } 637 638 bool JSONReader::NextStringMatch(const std::wstring& str) { 639 for (size_t i = 0; i < str.length(); ++i) { 640 if ('\0' == *json_pos_) 641 return false; 642 if (*(json_pos_ + i) != str[i]) 643 return false; 644 } 645 return true; 646 } 647 648 void JSONReader::SetErrorCode(JsonParseError error, 649 const wchar_t* error_pos) { 650 int line_number = 1; 651 int column_number = 1; 652 653 // Figure out the line and column the error occured at. 654 for (const wchar_t* pos = start_pos_; pos != error_pos; ++pos) { 655 if (*pos == '\0') { 656 NOTREACHED(); 657 return; 658 } 659 660 if (*pos == '\n') { 661 ++line_number; 662 column_number = 1; 663 } else { 664 ++column_number; 665 } 666 } 667 668 error_line_ = line_number; 669 error_col_ = column_number; 670 error_code_ = error; 671 } 672 673 } // namespace base 674