Home | History | Annotate | Download | only in json
      1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "base/json/json_reader.h"
      6 
      7 #include "base/float_util.h"
      8 #include "base/logging.h"
      9 #include "base/memory/scoped_ptr.h"
     10 #include "base/string_number_conversions.h"
     11 #include "base/string_util.h"
     12 #include "base/utf_string_conversions.h"
     13 #include "base/values.h"
     14 
     15 namespace base {
     16 
// Sentinel token returned by ParseNumberToken() and ParseStringToken() when
// the text at the current position cannot be tokenized.
static const JSONReader::Token kInvalidToken(JSONReader::Token::INVALID_TOKEN,
                                             0, 0);
// Deepest value nesting BuildValue() accepts; going past it fails the parse
// with JSON_TOO_MUCH_NESTING.
static const int kStackLimit = 100;
     20 
     21 namespace {
     22 
     23 // A helper method for ParseNumberToken.  It reads an int from the end of
     24 // token.  The method returns false if there is no valid integer at the end of
     25 // the token.
     26 bool ReadInt(JSONReader::Token& token, bool can_have_leading_zeros) {
     27   wchar_t first = token.NextChar();
     28   int len = 0;
     29 
     30   // Read in more digits
     31   wchar_t c = first;
     32   while ('\0' != c && '0' <= c && c <= '9') {
     33     ++token.length;
     34     ++len;
     35     c = token.NextChar();
     36   }
     37   // We need at least 1 digit.
     38   if (len == 0)
     39     return false;
     40 
     41   if (!can_have_leading_zeros && len > 1 && '0' == first)
     42     return false;
     43 
     44   return true;
     45 }
     46 
     47 // A helper method for ParseStringToken.  It reads |digits| hex digits from the
     48 // token. If the sequence if digits is not valid (contains other characters),
     49 // the method returns false.
     50 bool ReadHexDigits(JSONReader::Token& token, int digits) {
     51   for (int i = 1; i <= digits; ++i) {
     52     wchar_t c = *(token.begin + token.length + i);
     53     if ('\0' == c)
     54       return false;
     55     if (!(('0' <= c && c <= '9') || ('a' <= c && c <= 'f') ||
     56           ('A' <= c && c <= 'F'))) {
     57       return false;
     58     }
     59   }
     60 
     61   token.length += digits;
     62   return true;
     63 }
     64 
     65 }  // anonymous namespace
     66 
     67 const char* JSONReader::kBadRootElementType =
     68     "Root value must be an array or object.";
     69 const char* JSONReader::kInvalidEscape =
     70     "Invalid escape sequence.";
     71 const char* JSONReader::kSyntaxError =
     72     "Syntax error.";
     73 const char* JSONReader::kTrailingComma =
     74     "Trailing comma not allowed.";
     75 const char* JSONReader::kTooMuchNesting =
     76     "Too much nesting.";
     77 const char* JSONReader::kUnexpectedDataAfterRoot =
     78     "Unexpected data after root element.";
     79 const char* JSONReader::kUnsupportedEncoding =
     80     "Unsupported encoding. JSON must be UTF-8.";
     81 const char* JSONReader::kUnquotedDictionaryKey =
     82     "Dictionary keys must be quoted.";
     83 
     84 JSONReader::JSONReader()
     85     : start_pos_(NULL), json_pos_(NULL), stack_depth_(0),
     86       allow_trailing_comma_(false),
     87       error_code_(JSON_NO_ERROR), error_line_(0), error_col_(0) {}
     88 
     89 /* static */
     90 Value* JSONReader::Read(const std::string& json,
     91                         bool allow_trailing_comma) {
     92   return ReadAndReturnError(json, allow_trailing_comma, NULL, NULL);
     93 }
     94 
     95 /* static */
     96 Value* JSONReader::ReadAndReturnError(const std::string& json,
     97                                       bool allow_trailing_comma,
     98                                       int* error_code_out,
     99                                       std::string* error_msg_out) {
    100   JSONReader reader = JSONReader();
    101   Value* root = reader.JsonToValue(json, true, allow_trailing_comma);
    102   if (root)
    103     return root;
    104 
    105   if (error_code_out)
    106     *error_code_out = reader.error_code();
    107   if (error_msg_out)
    108     *error_msg_out = reader.GetErrorMessage();
    109 
    110   return NULL;
    111 }
    112 
    113 /* static */
    114 std::string JSONReader::ErrorCodeToString(JsonParseError error_code) {
    115   switch (error_code) {
    116     case JSON_NO_ERROR:
    117       return std::string();
    118     case JSON_BAD_ROOT_ELEMENT_TYPE:
    119       return kBadRootElementType;
    120     case JSON_INVALID_ESCAPE:
    121       return kInvalidEscape;
    122     case JSON_SYNTAX_ERROR:
    123       return kSyntaxError;
    124     case JSON_TRAILING_COMMA:
    125       return kTrailingComma;
    126     case JSON_TOO_MUCH_NESTING:
    127       return kTooMuchNesting;
    128     case JSON_UNEXPECTED_DATA_AFTER_ROOT:
    129       return kUnexpectedDataAfterRoot;
    130     case JSON_UNSUPPORTED_ENCODING:
    131       return kUnsupportedEncoding;
    132     case JSON_UNQUOTED_DICTIONARY_KEY:
    133       return kUnquotedDictionaryKey;
    134     default:
    135       NOTREACHED();
    136       return std::string();
    137   }
    138 }
    139 
    140 std::string JSONReader::GetErrorMessage() const {
    141   return FormatErrorMessage(error_line_, error_col_,
    142                             ErrorCodeToString(error_code_));
    143 }
    144 
    145 Value* JSONReader::JsonToValue(const std::string& json, bool check_root,
    146                                bool allow_trailing_comma) {
    147   // The input must be in UTF-8.
    148   if (!IsStringUTF8(json.c_str())) {
    149     error_code_ = JSON_UNSUPPORTED_ENCODING;
    150     return NULL;
    151   }
    152 
    153   // The conversion from UTF8 to wstring removes null bytes for us
    154   // (a good thing).
    155   std::wstring json_wide(UTF8ToWide(json));
    156   start_pos_ = json_wide.c_str();
    157 
    158   // When the input JSON string starts with a UTF-8 Byte-Order-Mark
    159   // (0xEF, 0xBB, 0xBF), the UTF8ToWide() function converts it to a Unicode
    160   // BOM (U+FEFF). To avoid the JSONReader::BuildValue() function from
    161   // mis-treating a Unicode BOM as an invalid character and returning NULL,
    162   // skip a converted Unicode BOM if it exists.
    163   if (!json_wide.empty() && start_pos_[0] == 0xFEFF) {
    164     ++start_pos_;
    165   }
    166 
    167   json_pos_ = start_pos_;
    168   allow_trailing_comma_ = allow_trailing_comma;
    169   stack_depth_ = 0;
    170   error_code_ = JSON_NO_ERROR;
    171 
    172   scoped_ptr<Value> root(BuildValue(check_root));
    173   if (root.get()) {
    174     if (ParseToken().type == Token::END_OF_INPUT) {
    175       return root.release();
    176     } else {
    177       SetErrorCode(JSON_UNEXPECTED_DATA_AFTER_ROOT, json_pos_);
    178     }
    179   }
    180 
    181   // Default to calling errors "syntax errors".
    182   if (error_code_ == 0)
    183     SetErrorCode(JSON_SYNTAX_ERROR, json_pos_);
    184 
    185   return NULL;
    186 }
    187 
    188 /* static */
    189 std::string JSONReader::FormatErrorMessage(int line, int column,
    190                                            const std::string& description) {
    191   if (line || column) {
    192     return StringPrintf("Line: %i, column: %i, %s",
    193                         line, column, description.c_str());
    194   }
    195   return description;
    196 }
    197 
// Recursively parses one JSON value starting at |json_pos_| and returns it, or
// NULL on failure.  The caller takes ownership of the returned value.
//
// When |is_root| is true the value must be an array or an object.
//
// On success |json_pos_| is left just past the parsed value and
// |stack_depth_| is restored.  On failure |stack_depth_| is NOT decremented;
// JsonToValue() resets it at the start of every parse.  Failure paths that do
// not call SetErrorCode() leave the error code unchanged, and JsonToValue()
// then reports them as JSON_SYNTAX_ERROR.
Value* JSONReader::BuildValue(bool is_root) {
  ++stack_depth_;
  if (stack_depth_ > kStackLimit) {
    SetErrorCode(JSON_TOO_MUCH_NESTING, json_pos_);
    return NULL;
  }

  Token token = ParseToken();
  // The root token must be an array or an object.
  if (is_root && token.type != Token::OBJECT_BEGIN &&
      token.type != Token::ARRAY_BEGIN) {
    SetErrorCode(JSON_BAD_ROOT_ELEMENT_TYPE, json_pos_);
    return NULL;
  }

  // Owns the value under construction so every early return frees it.
  scoped_ptr<Value> node;

  switch (token.type) {
    case Token::END_OF_INPUT:
    case Token::INVALID_TOKEN:
      return NULL;

    case Token::NULL_TOKEN:
      node.reset(Value::CreateNullValue());
      break;

    case Token::BOOL_TRUE:
      node.reset(Value::CreateBooleanValue(true));
      break;

    case Token::BOOL_FALSE:
      node.reset(Value::CreateBooleanValue(false));
      break;

    case Token::NUMBER:
      node.reset(DecodeNumber(token));
      if (!node.get())
        return NULL;
      break;

    case Token::STRING:
      node.reset(DecodeString(token));
      if (!node.get())
        return NULL;
      break;

    case Token::ARRAY_BEGIN:
      {
        // Step over '[' and peek at what follows; |token| is only used to
        // detect the end of the array, the elements themselves are parsed by
        // the recursive BuildValue() calls.
        json_pos_ += token.length;
        token = ParseToken();

        node.reset(new ListValue());
        while (token.type != Token::ARRAY_END) {
          Value* array_node = BuildValue(false);
          if (!array_node)
            return NULL;
          static_cast<ListValue*>(node.get())->Append(array_node);

          // After a list value, we expect a comma or the end of the list.
          token = ParseToken();
          if (token.type == Token::LIST_SEPARATOR) {
            json_pos_ += token.length;
            token = ParseToken();
            // Trailing commas are invalid according to the JSON RFC, but some
            // consumers need the parsing leniency, so handle accordingly.
            if (token.type == Token::ARRAY_END) {
              if (!allow_trailing_comma_) {
                SetErrorCode(JSON_TRAILING_COMMA, json_pos_);
                return NULL;
              }
              // Trailing comma OK, stop parsing the Array.
              break;
            }
          } else if (token.type != Token::ARRAY_END) {
            // Unexpected value after list value.  Bail out.
            return NULL;
          }
        }
        if (token.type != Token::ARRAY_END) {
          return NULL;
        }
        // |token| is now the closing ']'; it is consumed after the switch.
        break;
      }

    case Token::OBJECT_BEGIN:
      {
        // Step over '{' and read the first key (or the closing '}').
        json_pos_ += token.length;
        token = ParseToken();

        node.reset(new DictionaryValue);
        while (token.type != Token::OBJECT_END) {
          if (token.type != Token::STRING) {
            SetErrorCode(JSON_UNQUOTED_DICTIONARY_KEY, json_pos_);
            return NULL;
          }
          scoped_ptr<Value> dict_key_value(DecodeString(token));
          if (!dict_key_value.get())
            return NULL;

          // Extract the decoded key as a std::string.
          std::string dict_key;
          bool success = dict_key_value->GetAsString(&dict_key);
          DCHECK(success);

          // Consume the key, then require and consume the ':' separator.
          json_pos_ += token.length;
          token = ParseToken();
          if (token.type != Token::OBJECT_PAIR_SEPARATOR)
            return NULL;

          json_pos_ += token.length;
          // NOTE(review): the token parsed here is never consumed;
          // BuildValue() below tokenizes again from |json_pos_| and |token|
          // is overwritten after the value has been read.
          token = ParseToken();
          Value* dict_value = BuildValue(false);
          if (!dict_value)
            return NULL;
          static_cast<DictionaryValue*>(node.get())->SetWithoutPathExpansion(
              dict_key, dict_value);

          // After a key/value pair, we expect a comma or the end of the
          // object.
          token = ParseToken();
          if (token.type == Token::LIST_SEPARATOR) {
            json_pos_ += token.length;
            token = ParseToken();
            // Trailing commas are invalid according to the JSON RFC, but some
            // consumers need the parsing leniency, so handle accordingly.
            if (token.type == Token::OBJECT_END) {
              if (!allow_trailing_comma_) {
                SetErrorCode(JSON_TRAILING_COMMA, json_pos_);
                return NULL;
              }
              // Trailing comma OK, stop parsing the Object.
              break;
            }
          } else if (token.type != Token::OBJECT_END) {
            // Unexpected value after last object value.  Bail out.
            return NULL;
          }
        }
        if (token.type != Token::OBJECT_END)
          return NULL;

        // |token| is now the closing '}'; it is consumed after the switch.
        break;
      }

    default:
      // We got a token that's not a value.
      return NULL;
  }
  // Consume the scalar token, or the closing bracket of an array/object.
  json_pos_ += token.length;

  --stack_depth_;
  return node.release();
}
    351 
    352 JSONReader::Token JSONReader::ParseNumberToken() {
    353   // We just grab the number here.  We validate the size in DecodeNumber.
    354   // According   to RFC4627, a valid number is: [minus] int [frac] [exp]
    355   Token token(Token::NUMBER, json_pos_, 0);
    356   wchar_t c = *json_pos_;
    357   if ('-' == c) {
    358     ++token.length;
    359     c = token.NextChar();
    360   }
    361 
    362   if (!ReadInt(token, false))
    363     return kInvalidToken;
    364 
    365   // Optional fraction part
    366   c = token.NextChar();
    367   if ('.' == c) {
    368     ++token.length;
    369     if (!ReadInt(token, true))
    370       return kInvalidToken;
    371     c = token.NextChar();
    372   }
    373 
    374   // Optional exponent part
    375   if ('e' == c || 'E' == c) {
    376     ++token.length;
    377     c = token.NextChar();
    378     if ('-' == c || '+' == c) {
    379       ++token.length;
    380       c = token.NextChar();
    381     }
    382     if (!ReadInt(token, true))
    383       return kInvalidToken;
    384   }
    385 
    386   return token;
    387 }
    388 
    389 Value* JSONReader::DecodeNumber(const Token& token) {
    390   const std::wstring num_string(token.begin, token.length);
    391 
    392   int num_int;
    393   if (StringToInt(WideToUTF8(num_string), &num_int))
    394     return Value::CreateIntegerValue(num_int);
    395 
    396   double num_double;
    397   if (StringToDouble(WideToUTF8(num_string), &num_double) &&
    398       base::IsFinite(num_double))
    399     return Value::CreateDoubleValue(num_double);
    400 
    401   return NULL;
    402 }
    403 
    404 JSONReader::Token JSONReader::ParseStringToken() {
    405   Token token(Token::STRING, json_pos_, 1);
    406   wchar_t c = token.NextChar();
    407   while ('\0' != c) {
    408     if ('\\' == c) {
    409       ++token.length;
    410       c = token.NextChar();
    411       // Make sure the escaped char is valid.
    412       switch (c) {
    413         case 'x':
    414           if (!ReadHexDigits(token, 2)) {
    415             SetErrorCode(JSON_INVALID_ESCAPE, json_pos_ + token.length);
    416             return kInvalidToken;
    417           }
    418           break;
    419         case 'u':
    420           if (!ReadHexDigits(token, 4)) {
    421             SetErrorCode(JSON_INVALID_ESCAPE, json_pos_ + token.length);
    422             return kInvalidToken;
    423           }
    424           break;
    425         case '\\':
    426         case '/':
    427         case 'b':
    428         case 'f':
    429         case 'n':
    430         case 'r':
    431         case 't':
    432         case 'v':
    433         case '"':
    434           break;
    435         default:
    436           SetErrorCode(JSON_INVALID_ESCAPE, json_pos_ + token.length);
    437           return kInvalidToken;
    438       }
    439     } else if ('"' == c) {
    440       ++token.length;
    441       return token;
    442     }
    443     ++token.length;
    444     c = token.NextChar();
    445   }
    446   return kInvalidToken;
    447 }
    448 
    449 Value* JSONReader::DecodeString(const Token& token) {
    450   std::wstring decoded_str;
    451   decoded_str.reserve(token.length - 2);
    452 
    453   for (int i = 1; i < token.length - 1; ++i) {
    454     wchar_t c = *(token.begin + i);
    455     if ('\\' == c) {
    456       ++i;
    457       c = *(token.begin + i);
    458       switch (c) {
    459         case '"':
    460         case '/':
    461         case '\\':
    462           decoded_str.push_back(c);
    463           break;
    464         case 'b':
    465           decoded_str.push_back('\b');
    466           break;
    467         case 'f':
    468           decoded_str.push_back('\f');
    469           break;
    470         case 'n':
    471           decoded_str.push_back('\n');
    472           break;
    473         case 'r':
    474           decoded_str.push_back('\r');
    475           break;
    476         case 't':
    477           decoded_str.push_back('\t');
    478           break;
    479         case 'v':
    480           decoded_str.push_back('\v');
    481           break;
    482 
    483         case 'x':
    484           decoded_str.push_back((HexDigitToInt(*(token.begin + i + 1)) << 4) +
    485                                 HexDigitToInt(*(token.begin + i + 2)));
    486           i += 2;
    487           break;
    488         case 'u':
    489           decoded_str.push_back((HexDigitToInt(*(token.begin + i + 1)) << 12 ) +
    490                                 (HexDigitToInt(*(token.begin + i + 2)) << 8) +
    491                                 (HexDigitToInt(*(token.begin + i + 3)) << 4) +
    492                                 HexDigitToInt(*(token.begin + i + 4)));
    493           i += 4;
    494           break;
    495 
    496         default:
    497           // We should only have valid strings at this point.  If not,
    498           // ParseStringToken didn't do it's job.
    499           NOTREACHED();
    500           return NULL;
    501       }
    502     } else {
    503       // Not escaped
    504       decoded_str.push_back(c);
    505     }
    506   }
    507   return Value::CreateStringValue(WideToUTF16Hack(decoded_str));
    508 }
    509 
    510 JSONReader::Token JSONReader::ParseToken() {
    511   static const std::wstring kNullString(L"null");
    512   static const std::wstring kTrueString(L"true");
    513   static const std::wstring kFalseString(L"false");
    514 
    515   EatWhitespaceAndComments();
    516 
    517   Token token(Token::INVALID_TOKEN, 0, 0);
    518   switch (*json_pos_) {
    519     case '\0':
    520       token.type = Token::END_OF_INPUT;
    521       break;
    522 
    523     case 'n':
    524       if (NextStringMatch(kNullString))
    525         token = Token(Token::NULL_TOKEN, json_pos_, 4);
    526       break;
    527 
    528     case 't':
    529       if (NextStringMatch(kTrueString))
    530         token = Token(Token::BOOL_TRUE, json_pos_, 4);
    531       break;
    532 
    533     case 'f':
    534       if (NextStringMatch(kFalseString))
    535         token = Token(Token::BOOL_FALSE, json_pos_, 5);
    536       break;
    537 
    538     case '[':
    539       token = Token(Token::ARRAY_BEGIN, json_pos_, 1);
    540       break;
    541 
    542     case ']':
    543       token = Token(Token::ARRAY_END, json_pos_, 1);
    544       break;
    545 
    546     case ',':
    547       token = Token(Token::LIST_SEPARATOR, json_pos_, 1);
    548       break;
    549 
    550     case '{':
    551       token = Token(Token::OBJECT_BEGIN, json_pos_, 1);
    552       break;
    553 
    554     case '}':
    555       token = Token(Token::OBJECT_END, json_pos_, 1);
    556       break;
    557 
    558     case ':':
    559       token = Token(Token::OBJECT_PAIR_SEPARATOR, json_pos_, 1);
    560       break;
    561 
    562     case '0':
    563     case '1':
    564     case '2':
    565     case '3':
    566     case '4':
    567     case '5':
    568     case '6':
    569     case '7':
    570     case '8':
    571     case '9':
    572     case '-':
    573       token = ParseNumberToken();
    574       break;
    575 
    576     case '"':
    577       token = ParseStringToken();
    578       break;
    579   }
    580   return token;
    581 }
    582 
    583 void JSONReader::EatWhitespaceAndComments() {
    584   while ('\0' != *json_pos_) {
    585     switch (*json_pos_) {
    586       case ' ':
    587       case '\n':
    588       case '\r':
    589       case '\t':
    590         ++json_pos_;
    591         break;
    592       case '/':
    593         // TODO(tc): This isn't in the RFC so it should be a parser flag.
    594         if (!EatComment())
    595           return;
    596         break;
    597       default:
    598         // Not a whitespace char, just exit.
    599         return;
    600     }
    601   }
    602 }
    603 
    604 bool JSONReader::EatComment() {
    605   if ('/' != *json_pos_)
    606     return false;
    607 
    608   wchar_t next_char = *(json_pos_ + 1);
    609   if ('/' == next_char) {
    610     // Line comment, read until \n or \r
    611     json_pos_ += 2;
    612     while ('\0' != *json_pos_) {
    613       switch (*json_pos_) {
    614         case '\n':
    615         case '\r':
    616           ++json_pos_;
    617           return true;
    618         default:
    619           ++json_pos_;
    620       }
    621     }
    622   } else if ('*' == next_char) {
    623     // Block comment, read until */
    624     json_pos_ += 2;
    625     while ('\0' != *json_pos_) {
    626       if ('*' == *json_pos_ && '/' == *(json_pos_ + 1)) {
    627         json_pos_ += 2;
    628         return true;
    629       }
    630       ++json_pos_;
    631     }
    632   } else {
    633     return false;
    634   }
    635   return true;
    636 }
    637 
    638 bool JSONReader::NextStringMatch(const std::wstring& str) {
    639   for (size_t i = 0; i < str.length(); ++i) {
    640     if ('\0' == *json_pos_)
    641       return false;
    642     if (*(json_pos_ + i) != str[i])
    643       return false;
    644   }
    645   return true;
    646 }
    647 
    648 void JSONReader::SetErrorCode(JsonParseError error,
    649                               const wchar_t* error_pos) {
    650   int line_number = 1;
    651   int column_number = 1;
    652 
    653   // Figure out the line and column the error occured at.
    654   for (const wchar_t* pos = start_pos_; pos != error_pos; ++pos) {
    655     if (*pos == '\0') {
    656       NOTREACHED();
    657       return;
    658     }
    659 
    660     if (*pos == '\n') {
    661       ++line_number;
    662       column_number = 1;
    663     } else {
    664       ++column_number;
    665     }
    666   }
    667 
    668   error_line_ = line_number;
    669   error_col_ = column_number;
    670   error_code_ = error;
    671 }
    672 
    673 }  // namespace base
    674