Home | History | Annotate | Download | only in json
      1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "base/json/json_reader.h"
      6 
      7 #include "base/float_util.h"
      8 #include "base/logging.h"
      9 #include "base/scoped_ptr.h"
     10 #include "base/string_util.h"
     11 #include "base/utf_string_conversions.h"
     12 #include "base/values.h"
     13 
     14 namespace base {
     15 
     16 static const JSONReader::Token kInvalidToken(JSONReader::Token::INVALID_TOKEN,
     17                                              0, 0);
     18 static const int kStackLimit = 100;
     19 
     20 namespace {
     21 
     22 inline int HexToInt(wchar_t c) {
     23   if ('0' <= c && c <= '9') {
     24     return c - '0';
     25   } else if ('A' <= c && c <= 'F') {
     26     return c - 'A' + 10;
     27   } else if ('a' <= c && c <= 'f') {
     28     return c - 'a' + 10;
     29   }
     30   NOTREACHED();
     31   return 0;
     32 }
     33 
     34 // A helper method for ParseNumberToken.  It reads an int from the end of
     35 // token.  The method returns false if there is no valid integer at the end of
     36 // the token.
     37 bool ReadInt(JSONReader::Token& token, bool can_have_leading_zeros) {
     38   wchar_t first = token.NextChar();
     39   int len = 0;
     40 
     41   // Read in more digits
     42   wchar_t c = first;
     43   while ('\0' != c && '0' <= c && c <= '9') {
     44     ++token.length;
     45     ++len;
     46     c = token.NextChar();
     47   }
     48   // We need at least 1 digit.
     49   if (len == 0)
     50     return false;
     51 
     52   if (!can_have_leading_zeros && len > 1 && '0' == first)
     53     return false;
     54 
     55   return true;
     56 }
     57 
     58 // A helper method for ParseStringToken.  It reads |digits| hex digits from the
     59 // token. If the sequence if digits is not valid (contains other characters),
     60 // the method returns false.
     61 bool ReadHexDigits(JSONReader::Token& token, int digits) {
     62   for (int i = 1; i <= digits; ++i) {
     63     wchar_t c = *(token.begin + token.length + i);
     64     if ('\0' == c)
     65       return false;
     66     if (!(('0' <= c && c <= '9') || ('a' <= c && c <= 'f') ||
     67           ('A' <= c && c <= 'F'))) {
     68       return false;
     69     }
     70   }
     71 
     72   token.length += digits;
     73   return true;
     74 }
     75 
     76 }  // anonymous namespace
     77 
     78 const char* JSONReader::kBadRootElementType =
     79     "Root value must be an array or object.";
     80 const char* JSONReader::kInvalidEscape =
     81     "Invalid escape sequence.";
     82 const char* JSONReader::kSyntaxError =
     83     "Syntax error.";
     84 const char* JSONReader::kTrailingComma =
     85     "Trailing comma not allowed.";
     86 const char* JSONReader::kTooMuchNesting =
     87     "Too much nesting.";
     88 const char* JSONReader::kUnexpectedDataAfterRoot =
     89     "Unexpected data after root element.";
     90 const char* JSONReader::kUnsupportedEncoding =
     91     "Unsupported encoding. JSON must be UTF-8.";
     92 const char* JSONReader::kUnquotedDictionaryKey =
     93     "Dictionary keys must be quoted.";
     94 
     95 /* static */
     96 Value* JSONReader::Read(const std::string& json,
     97                         bool allow_trailing_comma) {
     98   return ReadAndReturnError(json, allow_trailing_comma, NULL);
     99 }
    100 
    101 /* static */
    102 Value* JSONReader::ReadAndReturnError(const std::string& json,
    103                                       bool allow_trailing_comma,
    104                                       std::string *error_message_out) {
    105   JSONReader reader = JSONReader();
    106   Value* root = reader.JsonToValue(json, true, allow_trailing_comma);
    107   if (root)
    108     return root;
    109 
    110   if (error_message_out)
    111     *error_message_out = reader.error_message();
    112 
    113   return NULL;
    114 }
    115 
    116 /* static */
    117 std::string JSONReader::FormatErrorMessage(int line, int column,
    118                                            const char* description) {
    119   return StringPrintf("Line: %i, column: %i, %s",
    120                       line, column, description);
    121 }
    122 
    123 JSONReader::JSONReader()
    124     : start_pos_(NULL), json_pos_(NULL), stack_depth_(0),
    125       allow_trailing_comma_(false) {}
    126 
    127 Value* JSONReader::JsonToValue(const std::string& json, bool check_root,
    128                                bool allow_trailing_comma) {
    129   // The input must be in UTF-8.
    130   if (!IsStringUTF8(json.c_str())) {
    131     error_message_ = kUnsupportedEncoding;
    132     return NULL;
    133   }
    134 
    135   // The conversion from UTF8 to wstring removes null bytes for us
    136   // (a good thing).
    137   std::wstring json_wide(UTF8ToWide(json));
    138   start_pos_ = json_wide.c_str();
    139 
    140   // When the input JSON string starts with a UTF-8 Byte-Order-Mark
    141   // (0xEF, 0xBB, 0xBF), the UTF8ToWide() function converts it to a Unicode
    142   // BOM (U+FEFF). To avoid the JSONReader::BuildValue() function from
    143   // mis-treating a Unicode BOM as an invalid character and returning NULL,
    144   // skip a converted Unicode BOM if it exists.
    145   if (!json_wide.empty() && start_pos_[0] == 0xFEFF) {
    146     ++start_pos_;
    147   }
    148 
    149   json_pos_ = start_pos_;
    150   allow_trailing_comma_ = allow_trailing_comma;
    151   stack_depth_ = 0;
    152   error_message_.clear();
    153 
    154   scoped_ptr<Value> root(BuildValue(check_root));
    155   if (root.get()) {
    156     if (ParseToken().type == Token::END_OF_INPUT) {
    157       return root.release();
    158     } else {
    159       SetErrorMessage(kUnexpectedDataAfterRoot, json_pos_);
    160     }
    161   }
    162 
    163   // Default to calling errors "syntax errors".
    164   if (error_message_.empty())
    165     SetErrorMessage(kSyntaxError, json_pos_);
    166 
    167   return NULL;
    168 }
    169 
    170 Value* JSONReader::BuildValue(bool is_root) {
    171   ++stack_depth_;
    172   if (stack_depth_ > kStackLimit) {
    173     SetErrorMessage(kTooMuchNesting, json_pos_);
    174     return NULL;
    175   }
    176 
    177   Token token = ParseToken();
    178   // The root token must be an array or an object.
    179   if (is_root && token.type != Token::OBJECT_BEGIN &&
    180       token.type != Token::ARRAY_BEGIN) {
    181     SetErrorMessage(kBadRootElementType, json_pos_);
    182     return NULL;
    183   }
    184 
    185   scoped_ptr<Value> node;
    186 
    187   switch (token.type) {
    188     case Token::END_OF_INPUT:
    189     case Token::INVALID_TOKEN:
    190       return NULL;
    191 
    192     case Token::NULL_TOKEN:
    193       node.reset(Value::CreateNullValue());
    194       break;
    195 
    196     case Token::BOOL_TRUE:
    197       node.reset(Value::CreateBooleanValue(true));
    198       break;
    199 
    200     case Token::BOOL_FALSE:
    201       node.reset(Value::CreateBooleanValue(false));
    202       break;
    203 
    204     case Token::NUMBER:
    205       node.reset(DecodeNumber(token));
    206       if (!node.get())
    207         return NULL;
    208       break;
    209 
    210     case Token::STRING:
    211       node.reset(DecodeString(token));
    212       if (!node.get())
    213         return NULL;
    214       break;
    215 
    216     case Token::ARRAY_BEGIN:
    217       {
    218         json_pos_ += token.length;
    219         token = ParseToken();
    220 
    221         node.reset(new ListValue());
    222         while (token.type != Token::ARRAY_END) {
    223           Value* array_node = BuildValue(false);
    224           if (!array_node)
    225             return NULL;
    226           static_cast<ListValue*>(node.get())->Append(array_node);
    227 
    228           // After a list value, we expect a comma or the end of the list.
    229           token = ParseToken();
    230           if (token.type == Token::LIST_SEPARATOR) {
    231             json_pos_ += token.length;
    232             token = ParseToken();
    233             // Trailing commas are invalid according to the JSON RFC, but some
    234             // consumers need the parsing leniency, so handle accordingly.
    235             if (token.type == Token::ARRAY_END) {
    236               if (!allow_trailing_comma_) {
    237                 SetErrorMessage(kTrailingComma, json_pos_);
    238                 return NULL;
    239               }
    240               // Trailing comma OK, stop parsing the Array.
    241               break;
    242             }
    243           } else if (token.type != Token::ARRAY_END) {
    244             // Unexpected value after list value.  Bail out.
    245             return NULL;
    246           }
    247         }
    248         if (token.type != Token::ARRAY_END) {
    249           return NULL;
    250         }
    251         break;
    252       }
    253 
    254     case Token::OBJECT_BEGIN:
    255       {
    256         json_pos_ += token.length;
    257         token = ParseToken();
    258 
    259         node.reset(new DictionaryValue);
    260         while (token.type != Token::OBJECT_END) {
    261           if (token.type != Token::STRING) {
    262             SetErrorMessage(kUnquotedDictionaryKey, json_pos_);
    263             return NULL;
    264           }
    265           scoped_ptr<Value> dict_key_value(DecodeString(token));
    266           if (!dict_key_value.get())
    267             return NULL;
    268 
    269           // Convert the key into a wstring.
    270           std::wstring dict_key;
    271           bool success = dict_key_value->GetAsString(&dict_key);
    272           DCHECK(success);
    273 
    274           json_pos_ += token.length;
    275           token = ParseToken();
    276           if (token.type != Token::OBJECT_PAIR_SEPARATOR)
    277             return NULL;
    278 
    279           json_pos_ += token.length;
    280           token = ParseToken();
    281           Value* dict_value = BuildValue(false);
    282           if (!dict_value)
    283             return NULL;
    284           static_cast<DictionaryValue*>(node.get())->SetWithoutPathExpansion(
    285               dict_key, dict_value);
    286 
    287           // After a key/value pair, we expect a comma or the end of the
    288           // object.
    289           token = ParseToken();
    290           if (token.type == Token::LIST_SEPARATOR) {
    291             json_pos_ += token.length;
    292             token = ParseToken();
    293             // Trailing commas are invalid according to the JSON RFC, but some
    294             // consumers need the parsing leniency, so handle accordingly.
    295             if (token.type == Token::OBJECT_END) {
    296               if (!allow_trailing_comma_) {
    297                 SetErrorMessage(kTrailingComma, json_pos_);
    298                 return NULL;
    299               }
    300               // Trailing comma OK, stop parsing the Object.
    301               break;
    302             }
    303           } else if (token.type != Token::OBJECT_END) {
    304             // Unexpected value after last object value.  Bail out.
    305             return NULL;
    306           }
    307         }
    308         if (token.type != Token::OBJECT_END)
    309           return NULL;
    310 
    311         break;
    312       }
    313 
    314     default:
    315       // We got a token that's not a value.
    316       return NULL;
    317   }
    318   json_pos_ += token.length;
    319 
    320   --stack_depth_;
    321   return node.release();
    322 }
    323 
    324 JSONReader::Token JSONReader::ParseNumberToken() {
    325   // We just grab the number here.  We validate the size in DecodeNumber.
    326   // According   to RFC4627, a valid number is: [minus] int [frac] [exp]
    327   Token token(Token::NUMBER, json_pos_, 0);
    328   wchar_t c = *json_pos_;
    329   if ('-' == c) {
    330     ++token.length;
    331     c = token.NextChar();
    332   }
    333 
    334   if (!ReadInt(token, false))
    335     return kInvalidToken;
    336 
    337   // Optional fraction part
    338   c = token.NextChar();
    339   if ('.' == c) {
    340     ++token.length;
    341     if (!ReadInt(token, true))
    342       return kInvalidToken;
    343     c = token.NextChar();
    344   }
    345 
    346   // Optional exponent part
    347   if ('e' == c || 'E' == c) {
    348     ++token.length;
    349     c = token.NextChar();
    350     if ('-' == c || '+' == c) {
    351       ++token.length;
    352       c = token.NextChar();
    353     }
    354     if (!ReadInt(token, true))
    355       return kInvalidToken;
    356   }
    357 
    358   return token;
    359 }
    360 
    361 Value* JSONReader::DecodeNumber(const Token& token) {
    362   const std::wstring num_string(token.begin, token.length);
    363 
    364   int num_int;
    365   if (StringToInt(WideToUTF16Hack(num_string), &num_int))
    366     return Value::CreateIntegerValue(num_int);
    367 
    368   double num_double;
    369   if (StringToDouble(WideToUTF16Hack(num_string), &num_double) &&
    370       base::IsFinite(num_double))
    371     return Value::CreateRealValue(num_double);
    372 
    373   return NULL;
    374 }
    375 
    376 JSONReader::Token JSONReader::ParseStringToken() {
    377   Token token(Token::STRING, json_pos_, 1);
    378   wchar_t c = token.NextChar();
    379   while ('\0' != c) {
    380     if ('\\' == c) {
    381       ++token.length;
    382       c = token.NextChar();
    383       // Make sure the escaped char is valid.
    384       switch (c) {
    385         case 'x':
    386           if (!ReadHexDigits(token, 2)) {
    387             SetErrorMessage(kInvalidEscape, json_pos_ + token.length);
    388             return kInvalidToken;
    389           }
    390           break;
    391         case 'u':
    392           if (!ReadHexDigits(token, 4)) {
    393             SetErrorMessage(kInvalidEscape, json_pos_ + token.length);
    394             return kInvalidToken;
    395           }
    396           break;
    397         case '\\':
    398         case '/':
    399         case 'b':
    400         case 'f':
    401         case 'n':
    402         case 'r':
    403         case 't':
    404         case 'v':
    405         case '"':
    406           break;
    407         default:
    408           SetErrorMessage(kInvalidEscape, json_pos_ + token.length);
    409           return kInvalidToken;
    410       }
    411     } else if ('"' == c) {
    412       ++token.length;
    413       return token;
    414     }
    415     ++token.length;
    416     c = token.NextChar();
    417   }
    418   return kInvalidToken;
    419 }
    420 
    421 Value* JSONReader::DecodeString(const Token& token) {
    422   std::wstring decoded_str;
    423   decoded_str.reserve(token.length - 2);
    424 
    425   for (int i = 1; i < token.length - 1; ++i) {
    426     wchar_t c = *(token.begin + i);
    427     if ('\\' == c) {
    428       ++i;
    429       c = *(token.begin + i);
    430       switch (c) {
    431         case '"':
    432         case '/':
    433         case '\\':
    434           decoded_str.push_back(c);
    435           break;
    436         case 'b':
    437           decoded_str.push_back('\b');
    438           break;
    439         case 'f':
    440           decoded_str.push_back('\f');
    441           break;
    442         case 'n':
    443           decoded_str.push_back('\n');
    444           break;
    445         case 'r':
    446           decoded_str.push_back('\r');
    447           break;
    448         case 't':
    449           decoded_str.push_back('\t');
    450           break;
    451         case 'v':
    452           decoded_str.push_back('\v');
    453           break;
    454 
    455         case 'x':
    456           decoded_str.push_back((HexToInt(*(token.begin + i + 1)) << 4) +
    457                                 HexToInt(*(token.begin + i + 2)));
    458           i += 2;
    459           break;
    460         case 'u':
    461           decoded_str.push_back((HexToInt(*(token.begin + i + 1)) << 12 ) +
    462                                 (HexToInt(*(token.begin + i + 2)) << 8) +
    463                                 (HexToInt(*(token.begin + i + 3)) << 4) +
    464                                 HexToInt(*(token.begin + i + 4)));
    465           i += 4;
    466           break;
    467 
    468         default:
    469           // We should only have valid strings at this point.  If not,
    470           // ParseStringToken didn't do it's job.
    471           NOTREACHED();
    472           return NULL;
    473       }
    474     } else {
    475       // Not escaped
    476       decoded_str.push_back(c);
    477     }
    478   }
    479   return Value::CreateStringValue(decoded_str);
    480 }
    481 
    482 JSONReader::Token JSONReader::ParseToken() {
    483   static const std::wstring kNullString(L"null");
    484   static const std::wstring kTrueString(L"true");
    485   static const std::wstring kFalseString(L"false");
    486 
    487   EatWhitespaceAndComments();
    488 
    489   Token token(Token::INVALID_TOKEN, 0, 0);
    490   switch (*json_pos_) {
    491     case '\0':
    492       token.type = Token::END_OF_INPUT;
    493       break;
    494 
    495     case 'n':
    496       if (NextStringMatch(kNullString))
    497         token = Token(Token::NULL_TOKEN, json_pos_, 4);
    498       break;
    499 
    500     case 't':
    501       if (NextStringMatch(kTrueString))
    502         token = Token(Token::BOOL_TRUE, json_pos_, 4);
    503       break;
    504 
    505     case 'f':
    506       if (NextStringMatch(kFalseString))
    507         token = Token(Token::BOOL_FALSE, json_pos_, 5);
    508       break;
    509 
    510     case '[':
    511       token = Token(Token::ARRAY_BEGIN, json_pos_, 1);
    512       break;
    513 
    514     case ']':
    515       token = Token(Token::ARRAY_END, json_pos_, 1);
    516       break;
    517 
    518     case ',':
    519       token = Token(Token::LIST_SEPARATOR, json_pos_, 1);
    520       break;
    521 
    522     case '{':
    523       token = Token(Token::OBJECT_BEGIN, json_pos_, 1);
    524       break;
    525 
    526     case '}':
    527       token = Token(Token::OBJECT_END, json_pos_, 1);
    528       break;
    529 
    530     case ':':
    531       token = Token(Token::OBJECT_PAIR_SEPARATOR, json_pos_, 1);
    532       break;
    533 
    534     case '0':
    535     case '1':
    536     case '2':
    537     case '3':
    538     case '4':
    539     case '5':
    540     case '6':
    541     case '7':
    542     case '8':
    543     case '9':
    544     case '-':
    545       token = ParseNumberToken();
    546       break;
    547 
    548     case '"':
    549       token = ParseStringToken();
    550       break;
    551   }
    552   return token;
    553 }
    554 
    555 bool JSONReader::NextStringMatch(const std::wstring& str) {
    556   for (size_t i = 0; i < str.length(); ++i) {
    557     if ('\0' == *json_pos_)
    558       return false;
    559     if (*(json_pos_ + i) != str[i])
    560       return false;
    561   }
    562   return true;
    563 }
    564 
    565 void JSONReader::EatWhitespaceAndComments() {
    566   while ('\0' != *json_pos_) {
    567     switch (*json_pos_) {
    568       case ' ':
    569       case '\n':
    570       case '\r':
    571       case '\t':
    572         ++json_pos_;
    573         break;
    574       case '/':
    575         // TODO(tc): This isn't in the RFC so it should be a parser flag.
    576         if (!EatComment())
    577           return;
    578         break;
    579       default:
    580         // Not a whitespace char, just exit.
    581         return;
    582     }
    583   }
    584 }
    585 
    586 bool JSONReader::EatComment() {
    587   if ('/' != *json_pos_)
    588     return false;
    589 
    590   wchar_t next_char = *(json_pos_ + 1);
    591   if ('/' == next_char) {
    592     // Line comment, read until \n or \r
    593     json_pos_ += 2;
    594     while ('\0' != *json_pos_) {
    595       switch (*json_pos_) {
    596         case '\n':
    597         case '\r':
    598           ++json_pos_;
    599           return true;
    600         default:
    601           ++json_pos_;
    602       }
    603     }
    604   } else if ('*' == next_char) {
    605     // Block comment, read until */
    606     json_pos_ += 2;
    607     while ('\0' != *json_pos_) {
    608       if ('*' == *json_pos_ && '/' == *(json_pos_ + 1)) {
    609         json_pos_ += 2;
    610         return true;
    611       }
    612       ++json_pos_;
    613     }
    614   } else {
    615     return false;
    616   }
    617   return true;
    618 }
    619 
    620 void JSONReader::SetErrorMessage(const char* description,
    621                                  const wchar_t* error_pos) {
    622   int line_number = 1;
    623   int column_number = 1;
    624 
    625   // Figure out the line and column the error occured at.
    626   for (const wchar_t* pos = start_pos_; pos != error_pos; ++pos) {
    627     if (*pos == '\0') {
    628       NOTREACHED();
    629       return;
    630     }
    631 
    632     if (*pos == '\n') {
    633       ++line_number;
    634       column_number = 1;
    635     } else {
    636       ++column_number;
    637     }
    638   }
    639 
    640   error_message_ = FormatErrorMessage(line_number, column_number, description);
    641 }
    642 
    643 }  // namespace base
    644