1 // Copyright (c) 2009 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "base/json/json_reader.h" 6 7 #include "base/float_util.h" 8 #include "base/logging.h" 9 #include "base/scoped_ptr.h" 10 #include "base/string_util.h" 11 #include "base/utf_string_conversions.h" 12 #include "base/values.h" 13 14 namespace base { 15 16 static const JSONReader::Token kInvalidToken(JSONReader::Token::INVALID_TOKEN, 17 0, 0); 18 static const int kStackLimit = 100; 19 20 namespace { 21 22 inline int HexToInt(wchar_t c) { 23 if ('0' <= c && c <= '9') { 24 return c - '0'; 25 } else if ('A' <= c && c <= 'F') { 26 return c - 'A' + 10; 27 } else if ('a' <= c && c <= 'f') { 28 return c - 'a' + 10; 29 } 30 NOTREACHED(); 31 return 0; 32 } 33 34 // A helper method for ParseNumberToken. It reads an int from the end of 35 // token. The method returns false if there is no valid integer at the end of 36 // the token. 37 bool ReadInt(JSONReader::Token& token, bool can_have_leading_zeros) { 38 wchar_t first = token.NextChar(); 39 int len = 0; 40 41 // Read in more digits 42 wchar_t c = first; 43 while ('\0' != c && '0' <= c && c <= '9') { 44 ++token.length; 45 ++len; 46 c = token.NextChar(); 47 } 48 // We need at least 1 digit. 49 if (len == 0) 50 return false; 51 52 if (!can_have_leading_zeros && len > 1 && '0' == first) 53 return false; 54 55 return true; 56 } 57 58 // A helper method for ParseStringToken. It reads |digits| hex digits from the 59 // token. If the sequence if digits is not valid (contains other characters), 60 // the method returns false. 61 bool ReadHexDigits(JSONReader::Token& token, int digits) { 62 for (int i = 1; i <= digits; ++i) { 63 wchar_t c = *(token.begin + token.length + i); 64 if ('\0' == c) 65 return false; 66 if (!(('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || 67 ('A' <= c && c <= 'F'))) { 68 return false; 69 } 70 } 71 72 token.length += digits; 73 return true; 74 } 75 76 } // anonymous namespace 77 78 const char* JSONReader::kBadRootElementType = 79 "Root value must be an array or object."; 80 const char* JSONReader::kInvalidEscape = 81 "Invalid escape sequence."; 82 const char* JSONReader::kSyntaxError = 83 "Syntax error."; 84 const char* JSONReader::kTrailingComma = 85 "Trailing comma not allowed."; 86 const char* JSONReader::kTooMuchNesting = 87 "Too much nesting."; 88 const char* JSONReader::kUnexpectedDataAfterRoot = 89 "Unexpected data after root element."; 90 const char* JSONReader::kUnsupportedEncoding = 91 "Unsupported encoding. JSON must be UTF-8."; 92 const char* JSONReader::kUnquotedDictionaryKey = 93 "Dictionary keys must be quoted."; 94 95 /* static */ 96 Value* JSONReader::Read(const std::string& json, 97 bool allow_trailing_comma) { 98 return ReadAndReturnError(json, allow_trailing_comma, NULL); 99 } 100 101 /* static */ 102 Value* JSONReader::ReadAndReturnError(const std::string& json, 103 bool allow_trailing_comma, 104 std::string *error_message_out) { 105 JSONReader reader = JSONReader(); 106 Value* root = reader.JsonToValue(json, true, allow_trailing_comma); 107 if (root) 108 return root; 109 110 if (error_message_out) 111 *error_message_out = reader.error_message(); 112 113 return NULL; 114 } 115 116 /* static */ 117 std::string JSONReader::FormatErrorMessage(int line, int column, 118 const char* description) { 119 return StringPrintf("Line: %i, column: %i, %s", 120 line, column, description); 121 } 122 123 JSONReader::JSONReader() 124 : start_pos_(NULL), json_pos_(NULL), stack_depth_(0), 125 allow_trailing_comma_(false) {} 126 127 Value* JSONReader::JsonToValue(const std::string& json, bool check_root, 128 bool allow_trailing_comma) { 129 // The input must be in UTF-8. 130 if (!IsStringUTF8(json.c_str())) { 131 error_message_ = kUnsupportedEncoding; 132 return NULL; 133 } 134 135 // The conversion from UTF8 to wstring removes null bytes for us 136 // (a good thing). 137 std::wstring json_wide(UTF8ToWide(json)); 138 start_pos_ = json_wide.c_str(); 139 140 // When the input JSON string starts with a UTF-8 Byte-Order-Mark 141 // (0xEF, 0xBB, 0xBF), the UTF8ToWide() function converts it to a Unicode 142 // BOM (U+FEFF). To avoid the JSONReader::BuildValue() function from 143 // mis-treating a Unicode BOM as an invalid character and returning NULL, 144 // skip a converted Unicode BOM if it exists. 145 if (!json_wide.empty() && start_pos_[0] == 0xFEFF) { 146 ++start_pos_; 147 } 148 149 json_pos_ = start_pos_; 150 allow_trailing_comma_ = allow_trailing_comma; 151 stack_depth_ = 0; 152 error_message_.clear(); 153 154 scoped_ptr<Value> root(BuildValue(check_root)); 155 if (root.get()) { 156 if (ParseToken().type == Token::END_OF_INPUT) { 157 return root.release(); 158 } else { 159 SetErrorMessage(kUnexpectedDataAfterRoot, json_pos_); 160 } 161 } 162 163 // Default to calling errors "syntax errors". 164 if (error_message_.empty()) 165 SetErrorMessage(kSyntaxError, json_pos_); 166 167 return NULL; 168 } 169 170 Value* JSONReader::BuildValue(bool is_root) { 171 ++stack_depth_; 172 if (stack_depth_ > kStackLimit) { 173 SetErrorMessage(kTooMuchNesting, json_pos_); 174 return NULL; 175 } 176 177 Token token = ParseToken(); 178 // The root token must be an array or an object. 179 if (is_root && token.type != Token::OBJECT_BEGIN && 180 token.type != Token::ARRAY_BEGIN) { 181 SetErrorMessage(kBadRootElementType, json_pos_); 182 return NULL; 183 } 184 185 scoped_ptr<Value> node; 186 187 switch (token.type) { 188 case Token::END_OF_INPUT: 189 case Token::INVALID_TOKEN: 190 return NULL; 191 192 case Token::NULL_TOKEN: 193 node.reset(Value::CreateNullValue()); 194 break; 195 196 case Token::BOOL_TRUE: 197 node.reset(Value::CreateBooleanValue(true)); 198 break; 199 200 case Token::BOOL_FALSE: 201 node.reset(Value::CreateBooleanValue(false)); 202 break; 203 204 case Token::NUMBER: 205 node.reset(DecodeNumber(token)); 206 if (!node.get()) 207 return NULL; 208 break; 209 210 case Token::STRING: 211 node.reset(DecodeString(token)); 212 if (!node.get()) 213 return NULL; 214 break; 215 216 case Token::ARRAY_BEGIN: 217 { 218 json_pos_ += token.length; 219 token = ParseToken(); 220 221 node.reset(new ListValue()); 222 while (token.type != Token::ARRAY_END) { 223 Value* array_node = BuildValue(false); 224 if (!array_node) 225 return NULL; 226 static_cast<ListValue*>(node.get())->Append(array_node); 227 228 // After a list value, we expect a comma or the end of the list. 229 token = ParseToken(); 230 if (token.type == Token::LIST_SEPARATOR) { 231 json_pos_ += token.length; 232 token = ParseToken(); 233 // Trailing commas are invalid according to the JSON RFC, but some 234 // consumers need the parsing leniency, so handle accordingly. 235 if (token.type == Token::ARRAY_END) { 236 if (!allow_trailing_comma_) { 237 SetErrorMessage(kTrailingComma, json_pos_); 238 return NULL; 239 } 240 // Trailing comma OK, stop parsing the Array. 241 break; 242 } 243 } else if (token.type != Token::ARRAY_END) { 244 // Unexpected value after list value. Bail out. 245 return NULL; 246 } 247 } 248 if (token.type != Token::ARRAY_END) { 249 return NULL; 250 } 251 break; 252 } 253 254 case Token::OBJECT_BEGIN: 255 { 256 json_pos_ += token.length; 257 token = ParseToken(); 258 259 node.reset(new DictionaryValue); 260 while (token.type != Token::OBJECT_END) { 261 if (token.type != Token::STRING) { 262 SetErrorMessage(kUnquotedDictionaryKey, json_pos_); 263 return NULL; 264 } 265 scoped_ptr<Value> dict_key_value(DecodeString(token)); 266 if (!dict_key_value.get()) 267 return NULL; 268 269 // Convert the key into a wstring. 270 std::wstring dict_key; 271 bool success = dict_key_value->GetAsString(&dict_key); 272 DCHECK(success); 273 274 json_pos_ += token.length; 275 token = ParseToken(); 276 if (token.type != Token::OBJECT_PAIR_SEPARATOR) 277 return NULL; 278 279 json_pos_ += token.length; 280 token = ParseToken(); 281 Value* dict_value = BuildValue(false); 282 if (!dict_value) 283 return NULL; 284 static_cast<DictionaryValue*>(node.get())->SetWithoutPathExpansion( 285 dict_key, dict_value); 286 287 // After a key/value pair, we expect a comma or the end of the 288 // object. 289 token = ParseToken(); 290 if (token.type == Token::LIST_SEPARATOR) { 291 json_pos_ += token.length; 292 token = ParseToken(); 293 // Trailing commas are invalid according to the JSON RFC, but some 294 // consumers need the parsing leniency, so handle accordingly. 295 if (token.type == Token::OBJECT_END) { 296 if (!allow_trailing_comma_) { 297 SetErrorMessage(kTrailingComma, json_pos_); 298 return NULL; 299 } 300 // Trailing comma OK, stop parsing the Object. 301 break; 302 } 303 } else if (token.type != Token::OBJECT_END) { 304 // Unexpected value after last object value. Bail out. 305 return NULL; 306 } 307 } 308 if (token.type != Token::OBJECT_END) 309 return NULL; 310 311 break; 312 } 313 314 default: 315 // We got a token that's not a value. 316 return NULL; 317 } 318 json_pos_ += token.length; 319 320 --stack_depth_; 321 return node.release(); 322 } 323 324 JSONReader::Token JSONReader::ParseNumberToken() { 325 // We just grab the number here. We validate the size in DecodeNumber. 326 // According to RFC4627, a valid number is: [minus] int [frac] [exp] 327 Token token(Token::NUMBER, json_pos_, 0); 328 wchar_t c = *json_pos_; 329 if ('-' == c) { 330 ++token.length; 331 c = token.NextChar(); 332 } 333 334 if (!ReadInt(token, false)) 335 return kInvalidToken; 336 337 // Optional fraction part 338 c = token.NextChar(); 339 if ('.' == c) { 340 ++token.length; 341 if (!ReadInt(token, true)) 342 return kInvalidToken; 343 c = token.NextChar(); 344 } 345 346 // Optional exponent part 347 if ('e' == c || 'E' == c) { 348 ++token.length; 349 c = token.NextChar(); 350 if ('-' == c || '+' == c) { 351 ++token.length; 352 c = token.NextChar(); 353 } 354 if (!ReadInt(token, true)) 355 return kInvalidToken; 356 } 357 358 return token; 359 } 360 361 Value* JSONReader::DecodeNumber(const Token& token) { 362 const std::wstring num_string(token.begin, token.length); 363 364 int num_int; 365 if (StringToInt(WideToUTF16Hack(num_string), &num_int)) 366 return Value::CreateIntegerValue(num_int); 367 368 double num_double; 369 if (StringToDouble(WideToUTF16Hack(num_string), &num_double) && 370 base::IsFinite(num_double)) 371 return Value::CreateRealValue(num_double); 372 373 return NULL; 374 } 375 376 JSONReader::Token JSONReader::ParseStringToken() { 377 Token token(Token::STRING, json_pos_, 1); 378 wchar_t c = token.NextChar(); 379 while ('\0' != c) { 380 if ('\\' == c) { 381 ++token.length; 382 c = token.NextChar(); 383 // Make sure the escaped char is valid. 384 switch (c) { 385 case 'x': 386 if (!ReadHexDigits(token, 2)) { 387 SetErrorMessage(kInvalidEscape, json_pos_ + token.length); 388 return kInvalidToken; 389 } 390 break; 391 case 'u': 392 if (!ReadHexDigits(token, 4)) { 393 SetErrorMessage(kInvalidEscape, json_pos_ + token.length); 394 return kInvalidToken; 395 } 396 break; 397 case '\\': 398 case '/': 399 case 'b': 400 case 'f': 401 case 'n': 402 case 'r': 403 case 't': 404 case 'v': 405 case '"': 406 break; 407 default: 408 SetErrorMessage(kInvalidEscape, json_pos_ + token.length); 409 return kInvalidToken; 410 } 411 } else if ('"' == c) { 412 ++token.length; 413 return token; 414 } 415 ++token.length; 416 c = token.NextChar(); 417 } 418 return kInvalidToken; 419 } 420 421 Value* JSONReader::DecodeString(const Token& token) { 422 std::wstring decoded_str; 423 decoded_str.reserve(token.length - 2); 424 425 for (int i = 1; i < token.length - 1; ++i) { 426 wchar_t c = *(token.begin + i); 427 if ('\\' == c) { 428 ++i; 429 c = *(token.begin + i); 430 switch (c) { 431 case '"': 432 case '/': 433 case '\\': 434 decoded_str.push_back(c); 435 break; 436 case 'b': 437 decoded_str.push_back('\b'); 438 break; 439 case 'f': 440 decoded_str.push_back('\f'); 441 break; 442 case 'n': 443 decoded_str.push_back('\n'); 444 break; 445 case 'r': 446 decoded_str.push_back('\r'); 447 break; 448 case 't': 449 decoded_str.push_back('\t'); 450 break; 451 case 'v': 452 decoded_str.push_back('\v'); 453 break; 454 455 case 'x': 456 decoded_str.push_back((HexToInt(*(token.begin + i + 1)) << 4) + 457 HexToInt(*(token.begin + i + 2))); 458 i += 2; 459 break; 460 case 'u': 461 decoded_str.push_back((HexToInt(*(token.begin + i + 1)) << 12 ) + 462 (HexToInt(*(token.begin + i + 2)) << 8) + 463 (HexToInt(*(token.begin + i + 3)) << 4) + 464 HexToInt(*(token.begin + i + 4))); 465 i += 4; 466 break; 467 468 default: 469 // We should only have valid strings at this point. If not, 470 // ParseStringToken didn't do it's job. 471 NOTREACHED(); 472 return NULL; 473 } 474 } else { 475 // Not escaped 476 decoded_str.push_back(c); 477 } 478 } 479 return Value::CreateStringValue(decoded_str); 480 } 481 482 JSONReader::Token JSONReader::ParseToken() { 483 static const std::wstring kNullString(L"null"); 484 static const std::wstring kTrueString(L"true"); 485 static const std::wstring kFalseString(L"false"); 486 487 EatWhitespaceAndComments(); 488 489 Token token(Token::INVALID_TOKEN, 0, 0); 490 switch (*json_pos_) { 491 case '\0': 492 token.type = Token::END_OF_INPUT; 493 break; 494 495 case 'n': 496 if (NextStringMatch(kNullString)) 497 token = Token(Token::NULL_TOKEN, json_pos_, 4); 498 break; 499 500 case 't': 501 if (NextStringMatch(kTrueString)) 502 token = Token(Token::BOOL_TRUE, json_pos_, 4); 503 break; 504 505 case 'f': 506 if (NextStringMatch(kFalseString)) 507 token = Token(Token::BOOL_FALSE, json_pos_, 5); 508 break; 509 510 case '[': 511 token = Token(Token::ARRAY_BEGIN, json_pos_, 1); 512 break; 513 514 case ']': 515 token = Token(Token::ARRAY_END, json_pos_, 1); 516 break; 517 518 case ',': 519 token = Token(Token::LIST_SEPARATOR, json_pos_, 1); 520 break; 521 522 case '{': 523 token = Token(Token::OBJECT_BEGIN, json_pos_, 1); 524 break; 525 526 case '}': 527 token = Token(Token::OBJECT_END, json_pos_, 1); 528 break; 529 530 case ':': 531 token = Token(Token::OBJECT_PAIR_SEPARATOR, json_pos_, 1); 532 break; 533 534 case '0': 535 case '1': 536 case '2': 537 case '3': 538 case '4': 539 case '5': 540 case '6': 541 case '7': 542 case '8': 543 case '9': 544 case '-': 545 token = ParseNumberToken(); 546 break; 547 548 case '"': 549 token = ParseStringToken(); 550 break; 551 } 552 return token; 553 } 554 555 bool JSONReader::NextStringMatch(const std::wstring& str) { 556 for (size_t i = 0; i < str.length(); ++i) { 557 if ('\0' == *json_pos_) 558 return false; 559 if (*(json_pos_ + i) != str[i]) 560 return false; 561 } 562 return true; 563 } 564 565 void JSONReader::EatWhitespaceAndComments() { 566 while ('\0' != *json_pos_) { 567 switch (*json_pos_) { 568 case ' ': 569 case '\n': 570 case '\r': 571 case '\t': 572 ++json_pos_; 573 break; 574 case '/': 575 // TODO(tc): This isn't in the RFC so it should be a parser flag. 576 if (!EatComment()) 577 return; 578 break; 579 default: 580 // Not a whitespace char, just exit. 581 return; 582 } 583 } 584 } 585 586 bool JSONReader::EatComment() { 587 if ('/' != *json_pos_) 588 return false; 589 590 wchar_t next_char = *(json_pos_ + 1); 591 if ('/' == next_char) { 592 // Line comment, read until \n or \r 593 json_pos_ += 2; 594 while ('\0' != *json_pos_) { 595 switch (*json_pos_) { 596 case '\n': 597 case '\r': 598 ++json_pos_; 599 return true; 600 default: 601 ++json_pos_; 602 } 603 } 604 } else if ('*' == next_char) { 605 // Block comment, read until */ 606 json_pos_ += 2; 607 while ('\0' != *json_pos_) { 608 if ('*' == *json_pos_ && '/' == *(json_pos_ + 1)) { 609 json_pos_ += 2; 610 return true; 611 } 612 ++json_pos_; 613 } 614 } else { 615 return false; 616 } 617 return true; 618 } 619 620 void JSONReader::SetErrorMessage(const char* description, 621 const wchar_t* error_pos) { 622 int line_number = 1; 623 int column_number = 1; 624 625 // Figure out the line and column the error occured at. 626 for (const wchar_t* pos = start_pos_; pos != error_pos; ++pos) { 627 if (*pos == '\0') { 628 NOTREACHED(); 629 return; 630 } 631 632 if (*pos == '\n') { 633 ++line_number; 634 column_number = 1; 635 } else { 636 ++column_number; 637 } 638 } 639 640 error_message_ = FormatErrorMessage(line_number, column_number, description); 641 } 642 643 } // namespace base 644