Home | History | Annotate | Download | only in Support
      1 //=== JSON.cpp - JSON value, parsing and serialization - C++ -----------*-===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===---------------------------------------------------------------------===//
      9 
#include "llvm/Support/JSON.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/Format.h"
#include <cctype>
#include <cerrno>
     14 
     15 namespace llvm {
     16 namespace json {
     17 
     18 Value &Object::operator[](const ObjectKey &K) {
     19   return try_emplace(K, nullptr).first->getSecond();
     20 }
     21 Value &Object::operator[](ObjectKey &&K) {
     22   return try_emplace(std::move(K), nullptr).first->getSecond();
     23 }
     24 Value *Object::get(StringRef K) {
     25   auto I = find(K);
     26   if (I == end())
     27     return nullptr;
     28   return &I->second;
     29 }
     30 const Value *Object::get(StringRef K) const {
     31   auto I = find(K);
     32   if (I == end())
     33     return nullptr;
     34   return &I->second;
     35 }
     36 llvm::Optional<std::nullptr_t> Object::getNull(StringRef K) const {
     37   if (auto *V = get(K))
     38     return V->getAsNull();
     39   return llvm::None;
     40 }
     41 llvm::Optional<bool> Object::getBoolean(StringRef K) const {
     42   if (auto *V = get(K))
     43     return V->getAsBoolean();
     44   return llvm::None;
     45 }
     46 llvm::Optional<double> Object::getNumber(StringRef K) const {
     47   if (auto *V = get(K))
     48     return V->getAsNumber();
     49   return llvm::None;
     50 }
     51 llvm::Optional<int64_t> Object::getInteger(StringRef K) const {
     52   if (auto *V = get(K))
     53     return V->getAsInteger();
     54   return llvm::None;
     55 }
     56 llvm::Optional<llvm::StringRef> Object::getString(StringRef K) const {
     57   if (auto *V = get(K))
     58     return V->getAsString();
     59   return llvm::None;
     60 }
     61 const json::Object *Object::getObject(StringRef K) const {
     62   if (auto *V = get(K))
     63     return V->getAsObject();
     64   return nullptr;
     65 }
     66 json::Object *Object::getObject(StringRef K) {
     67   if (auto *V = get(K))
     68     return V->getAsObject();
     69   return nullptr;
     70 }
     71 const json::Array *Object::getArray(StringRef K) const {
     72   if (auto *V = get(K))
     73     return V->getAsArray();
     74   return nullptr;
     75 }
     76 json::Array *Object::getArray(StringRef K) {
     77   if (auto *V = get(K))
     78     return V->getAsArray();
     79   return nullptr;
     80 }
     81 bool operator==(const Object &LHS, const Object &RHS) {
     82   if (LHS.size() != RHS.size())
     83     return false;
     84   for (const auto &L : LHS) {
     85     auto R = RHS.find(L.first);
     86     if (R == RHS.end() || L.second != R->second)
     87       return false;
     88   }
     89   return true;
     90 }
     91 
     92 Array::Array(std::initializer_list<Value> Elements) {
     93   V.reserve(Elements.size());
     94   for (const Value &V : Elements) {
     95     emplace_back(nullptr);
     96     back().moveFrom(std::move(V));
     97   }
     98 }
     99 
// Constructs a Value from an initializer list by first building a json::Array
// from the elements and delegating to the constructor taking that array.
Value::Value(std::initializer_list<Value> Elements)
    : Value(json::Array(Elements)) {}
    102 
    103 void Value::copyFrom(const Value &M) {
    104   Type = M.Type;
    105   switch (Type) {
    106   case T_Null:
    107   case T_Boolean:
    108   case T_Double:
    109   case T_Integer:
    110     memcpy(Union.buffer, M.Union.buffer, sizeof(Union.buffer));
    111     break;
    112   case T_StringRef:
    113     create<StringRef>(M.as<StringRef>());
    114     break;
    115   case T_String:
    116     create<std::string>(M.as<std::string>());
    117     break;
    118   case T_Object:
    119     create<json::Object>(M.as<json::Object>());
    120     break;
    121   case T_Array:
    122     create<json::Array>(M.as<json::Array>());
    123     break;
    124   }
    125 }
    126 
    127 void Value::moveFrom(const Value &&M) {
    128   Type = M.Type;
    129   switch (Type) {
    130   case T_Null:
    131   case T_Boolean:
    132   case T_Double:
    133   case T_Integer:
    134     memcpy(Union.buffer, M.Union.buffer, sizeof(Union.buffer));
    135     break;
    136   case T_StringRef:
    137     create<StringRef>(M.as<StringRef>());
    138     break;
    139   case T_String:
    140     create<std::string>(std::move(M.as<std::string>()));
    141     M.Type = T_Null;
    142     break;
    143   case T_Object:
    144     create<json::Object>(std::move(M.as<json::Object>()));
    145     M.Type = T_Null;
    146     break;
    147   case T_Array:
    148     create<json::Array>(std::move(M.as<json::Array>()));
    149     M.Type = T_Null;
    150     break;
    151   }
    152 }
    153 
    154 void Value::destroy() {
    155   switch (Type) {
    156   case T_Null:
    157   case T_Boolean:
    158   case T_Double:
    159   case T_Integer:
    160     break;
    161   case T_StringRef:
    162     as<StringRef>().~StringRef();
    163     break;
    164   case T_String:
    165     as<std::string>().~basic_string();
    166     break;
    167   case T_Object:
    168     as<json::Object>().~Object();
    169     break;
    170   case T_Array:
    171     as<json::Array>().~Array();
    172     break;
    173   }
    174 }
    175 
    176 bool operator==(const Value &L, const Value &R) {
    177   if (L.kind() != R.kind())
    178     return false;
    179   switch (L.kind()) {
    180   case Value::Null:
    181     return *L.getAsNull() == *R.getAsNull();
    182   case Value::Boolean:
    183     return *L.getAsBoolean() == *R.getAsBoolean();
    184   case Value::Number:
    185     return *L.getAsNumber() == *R.getAsNumber();
    186   case Value::String:
    187     return *L.getAsString() == *R.getAsString();
    188   case Value::Array:
    189     return *L.getAsArray() == *R.getAsArray();
    190   case Value::Object:
    191     return *L.getAsObject() == *R.getAsObject();
    192   }
    193   llvm_unreachable("Unknown value kind");
    194 }
    195 
    196 namespace {
    197 // Simple recursive-descent JSON parser.
    198 class Parser {
    199 public:
    200   Parser(StringRef JSON)
    201       : Start(JSON.begin()), P(JSON.begin()), End(JSON.end()) {}
    202 
    203   bool checkUTF8() {
    204     size_t ErrOffset;
    205     if (isUTF8(StringRef(Start, End - Start), &ErrOffset))
    206       return true;
    207     P = Start + ErrOffset; // For line/column calculation.
    208     return parseError("Invalid UTF-8 sequence");
    209   }
    210 
    211   bool parseValue(Value &Out);
    212 
    213   bool assertEnd() {
    214     eatWhitespace();
    215     if (P == End)
    216       return true;
    217     return parseError("Text after end of document");
    218   }
    219 
    220   Error takeError() {
    221     assert(Err);
    222     return std::move(*Err);
    223   }
    224 
    225 private:
    226   void eatWhitespace() {
    227     while (P != End && (*P == ' ' || *P == '\r' || *P == '\n' || *P == '\t'))
    228       ++P;
    229   }
    230 
    231   // On invalid syntax, parseX() functions return false and set Err.
    232   bool parseNumber(char First, Value &Out);
    233   bool parseString(std::string &Out);
    234   bool parseUnicode(std::string &Out);
    235   bool parseError(const char *Msg); // always returns false
    236 
    237   char next() { return P == End ? 0 : *P++; }
    238   char peek() { return P == End ? 0 : *P; }
    239   static bool isNumber(char C) {
    240     return C == '0' || C == '1' || C == '2' || C == '3' || C == '4' ||
    241            C == '5' || C == '6' || C == '7' || C == '8' || C == '9' ||
    242            C == 'e' || C == 'E' || C == '+' || C == '-' || C == '.';
    243   }
    244 
    245   Optional<Error> Err;
    246   const char *Start, *P, *End;
    247 };
    248 
    249 bool Parser::parseValue(Value &Out) {
    250   eatWhitespace();
    251   if (P == End)
    252     return parseError("Unexpected EOF");
    253   switch (char C = next()) {
    254   // Bare null/true/false are easy - first char identifies them.
    255   case 'n':
    256     Out = nullptr;
    257     return (next() == 'u' && next() == 'l' && next() == 'l') ||
    258            parseError("Invalid JSON value (null?)");
    259   case 't':
    260     Out = true;
    261     return (next() == 'r' && next() == 'u' && next() == 'e') ||
    262            parseError("Invalid JSON value (true?)");
    263   case 'f':
    264     Out = false;
    265     return (next() == 'a' && next() == 'l' && next() == 's' && next() == 'e') ||
    266            parseError("Invalid JSON value (false?)");
    267   case '"': {
    268     std::string S;
    269     if (parseString(S)) {
    270       Out = std::move(S);
    271       return true;
    272     }
    273     return false;
    274   }
    275   case '[': {
    276     Out = Array{};
    277     Array &A = *Out.getAsArray();
    278     eatWhitespace();
    279     if (peek() == ']') {
    280       ++P;
    281       return true;
    282     }
    283     for (;;) {
    284       A.emplace_back(nullptr);
    285       if (!parseValue(A.back()))
    286         return false;
    287       eatWhitespace();
    288       switch (next()) {
    289       case ',':
    290         eatWhitespace();
    291         continue;
    292       case ']':
    293         return true;
    294       default:
    295         return parseError("Expected , or ] after array element");
    296       }
    297     }
    298   }
    299   case '{': {
    300     Out = Object{};
    301     Object &O = *Out.getAsObject();
    302     eatWhitespace();
    303     if (peek() == '}') {
    304       ++P;
    305       return true;
    306     }
    307     for (;;) {
    308       if (next() != '"')
    309         return parseError("Expected object key");
    310       std::string K;
    311       if (!parseString(K))
    312         return false;
    313       eatWhitespace();
    314       if (next() != ':')
    315         return parseError("Expected : after object key");
    316       eatWhitespace();
    317       if (!parseValue(O[std::move(K)]))
    318         return false;
    319       eatWhitespace();
    320       switch (next()) {
    321       case ',':
    322         eatWhitespace();
    323         continue;
    324       case '}':
    325         return true;
    326       default:
    327         return parseError("Expected , or } after object property");
    328       }
    329     }
    330   }
    331   default:
    332     if (isNumber(C))
    333       return parseNumber(C, Out);
    334     return parseError("Invalid JSON value");
    335   }
    336 }
    337 
    338 bool Parser::parseNumber(char First, Value &Out) {
    339   // Read the number into a string. (Must be null-terminated for strto*).
    340   SmallString<24> S;
    341   S.push_back(First);
    342   while (isNumber(peek()))
    343     S.push_back(next());
    344   char *End;
    345   // Try first to parse as integer, and if so preserve full 64 bits.
    346   // strtoll returns long long >= 64 bits, so check it's in range too.
    347   auto I = std::strtoll(S.c_str(), &End, 10);
    348   if (End == S.end() && I >= std::numeric_limits<int64_t>::min() &&
    349       I <= std::numeric_limits<int64_t>::max()) {
    350     Out = int64_t(I);
    351     return true;
    352   }
    353   // If it's not an integer
    354   Out = std::strtod(S.c_str(), &End);
    355   return End == S.end() || parseError("Invalid JSON value (number?)");
    356 }
    357 
    358 bool Parser::parseString(std::string &Out) {
    359   // leading quote was already consumed.
    360   for (char C = next(); C != '"'; C = next()) {
    361     if (LLVM_UNLIKELY(P == End))
    362       return parseError("Unterminated string");
    363     if (LLVM_UNLIKELY((C & 0x1f) == C))
    364       return parseError("Control character in string");
    365     if (LLVM_LIKELY(C != '\\')) {
    366       Out.push_back(C);
    367       continue;
    368     }
    369     // Handle escape sequence.
    370     switch (C = next()) {
    371     case '"':
    372     case '\\':
    373     case '/':
    374       Out.push_back(C);
    375       break;
    376     case 'b':
    377       Out.push_back('\b');
    378       break;
    379     case 'f':
    380       Out.push_back('\f');
    381       break;
    382     case 'n':
    383       Out.push_back('\n');
    384       break;
    385     case 'r':
    386       Out.push_back('\r');
    387       break;
    388     case 't':
    389       Out.push_back('\t');
    390       break;
    391     case 'u':
    392       if (!parseUnicode(Out))
    393         return false;
    394       break;
    395     default:
    396       return parseError("Invalid escape sequence");
    397     }
    398   }
    399   return true;
    400 }
    401 
    402 static void encodeUtf8(uint32_t Rune, std::string &Out) {
    403   if (Rune < 0x80) {
    404     Out.push_back(Rune & 0x7F);
    405   } else if (Rune < 0x800) {
    406     uint8_t FirstByte = 0xC0 | ((Rune & 0x7C0) >> 6);
    407     uint8_t SecondByte = 0x80 | (Rune & 0x3F);
    408     Out.push_back(FirstByte);
    409     Out.push_back(SecondByte);
    410   } else if (Rune < 0x10000) {
    411     uint8_t FirstByte = 0xE0 | ((Rune & 0xF000) >> 12);
    412     uint8_t SecondByte = 0x80 | ((Rune & 0xFC0) >> 6);
    413     uint8_t ThirdByte = 0x80 | (Rune & 0x3F);
    414     Out.push_back(FirstByte);
    415     Out.push_back(SecondByte);
    416     Out.push_back(ThirdByte);
    417   } else if (Rune < 0x110000) {
    418     uint8_t FirstByte = 0xF0 | ((Rune & 0x1F0000) >> 18);
    419     uint8_t SecondByte = 0x80 | ((Rune & 0x3F000) >> 12);
    420     uint8_t ThirdByte = 0x80 | ((Rune & 0xFC0) >> 6);
    421     uint8_t FourthByte = 0x80 | (Rune & 0x3F);
    422     Out.push_back(FirstByte);
    423     Out.push_back(SecondByte);
    424     Out.push_back(ThirdByte);
    425     Out.push_back(FourthByte);
    426   } else {
    427     llvm_unreachable("Invalid codepoint");
    428   }
    429 }
    430 
    431 // Parse a UTF-16 \uNNNN escape sequence. "\u" has already been consumed.
    432 // May parse several sequential escapes to ensure proper surrogate handling.
    433 // We do not use ConvertUTF.h, it can't accept and replace unpaired surrogates.
    434 // These are invalid Unicode but valid JSON (RFC 8259, section 8.2).
    435 bool Parser::parseUnicode(std::string &Out) {
    436   // Invalid UTF is not a JSON error (RFC 85298.2). It gets replaced by U+FFFD.
    437   auto Invalid = [&] { Out.append(/* UTF-8 */ {'\xef', '\xbf', '\xbd'}); };
    438   // Decodes 4 hex digits from the stream into Out, returns false on error.
    439   auto Parse4Hex = [this](uint16_t &Out) -> bool {
    440     Out = 0;
    441     char Bytes[] = {next(), next(), next(), next()};
    442     for (unsigned char C : Bytes) {
    443       if (!std::isxdigit(C))
    444         return parseError("Invalid \\u escape sequence");
    445       Out <<= 4;
    446       Out |= (C > '9') ? (C & ~0x20) - 'A' + 10 : (C - '0');
    447     }
    448     return true;
    449   };
    450   uint16_t First; // UTF-16 code unit from the first \u escape.
    451   if (!Parse4Hex(First))
    452     return false;
    453 
    454   // We loop to allow proper surrogate-pair error handling.
    455   while (true) {
    456     // Case 1: the UTF-16 code unit is already a codepoint in the BMP.
    457     if (LLVM_LIKELY(First < 0xD800 || First >= 0xE000)) {
    458       encodeUtf8(First, Out);
    459       return true;
    460     }
    461 
    462     // Case 2: it's an (unpaired) trailing surrogate.
    463     if (LLVM_UNLIKELY(First >= 0xDC00)) {
    464       Invalid();
    465       return true;
    466     }
    467 
    468     // Case 3: it's a leading surrogate. We expect a trailing one next.
    469     // Case 3a: there's no trailing \u escape. Don't advance in the stream.
    470     if (LLVM_UNLIKELY(P + 2 > End || *P != '\\' || *(P + 1) != 'u')) {
    471       Invalid(); // Leading surrogate was unpaired.
    472       return true;
    473     }
    474     P += 2;
    475     uint16_t Second;
    476     if (!Parse4Hex(Second))
    477       return false;
    478     // Case 3b: there was another \u escape, but it wasn't a trailing surrogate.
    479     if (LLVM_UNLIKELY(Second < 0xDC00 || Second >= 0xE000)) {
    480       Invalid();      // Leading surrogate was unpaired.
    481       First = Second; // Second escape still needs to be processed.
    482       continue;
    483     }
    484     // Case 3c: a valid surrogate pair encoding an astral codepoint.
    485     encodeUtf8(0x10000 | ((First - 0xD800) << 10) | (Second - 0xDC00), Out);
    486     return true;
    487   }
    488 }
    489 
    490 bool Parser::parseError(const char *Msg) {
    491   int Line = 1;
    492   const char *StartOfLine = Start;
    493   for (const char *X = Start; X < P; ++X) {
    494     if (*X == 0x0A) {
    495       ++Line;
    496       StartOfLine = X + 1;
    497     }
    498   }
    499   Err.emplace(
    500       llvm::make_unique<ParseError>(Msg, Line, P - StartOfLine, P - Start));
    501   return false;
    502 }
    503 } // namespace
    504 
    505 Expected<Value> parse(StringRef JSON) {
    506   Parser P(JSON);
    507   Value E = nullptr;
    508   if (P.checkUTF8())
    509     if (P.parseValue(E))
    510       if (P.assertEnd())
    511         return std::move(E);
    512   return P.takeError();
    513 }
// Out-of-line definition of the static member ParseError::ID.
// NOTE(review): presumably serves as the type-identity anchor for LLVM's
// Error class hierarchy - confirm against the declaration in JSON.h.
char ParseError::ID = 0;
    515 
    516 static std::vector<const Object::value_type *> sortedElements(const Object &O) {
    517   std::vector<const Object::value_type *> Elements;
    518   for (const auto &E : O)
    519     Elements.push_back(&E);
    520   llvm::sort(Elements.begin(), Elements.end(),
    521              [](const Object::value_type *L, const Object::value_type *R) {
    522                return L->first < R->first;
    523              });
    524   return Elements;
    525 }
    526 
    527 bool isUTF8(llvm::StringRef S, size_t *ErrOffset) {
    528   // Fast-path for ASCII, which is valid UTF-8.
    529   if (LLVM_LIKELY(isASCII(S)))
    530     return true;
    531 
    532   const UTF8 *Data = reinterpret_cast<const UTF8 *>(S.data()), *Rest = Data;
    533   if (LLVM_LIKELY(isLegalUTF8String(&Rest, Data + S.size())))
    534     return true;
    535 
    536   if (ErrOffset)
    537     *ErrOffset = Rest - Data;
    538   return false;
    539 }
    540 
    541 std::string fixUTF8(llvm::StringRef S) {
    542   // This isn't particularly efficient, but is only for error-recovery.
    543   std::vector<UTF32> Codepoints(S.size()); // 1 codepoint per byte suffices.
    544   const UTF8 *In8 = reinterpret_cast<const UTF8 *>(S.data());
    545   UTF32 *Out32 = Codepoints.data();
    546   ConvertUTF8toUTF32(&In8, In8 + S.size(), &Out32, Out32 + Codepoints.size(),
    547                      lenientConversion);
    548   Codepoints.resize(Out32 - Codepoints.data());
    549   std::string Res(4 * Codepoints.size(), 0); // 4 bytes per codepoint suffice
    550   const UTF32 *In32 = Codepoints.data();
    551   UTF8 *Out8 = reinterpret_cast<UTF8 *>(&Res[0]);
    552   ConvertUTF32toUTF8(&In32, In32 + Codepoints.size(), &Out8, Out8 + Res.size(),
    553                      strictConversion);
    554   Res.resize(reinterpret_cast<char *>(Out8) - Res.data());
    555   return Res;
    556 }
    557 
    558 } // namespace json
    559 } // namespace llvm
    560 
    561 static void quote(llvm::raw_ostream &OS, llvm::StringRef S) {
    562   OS << '\"';
    563   for (unsigned char C : S) {
    564     if (C == 0x22 || C == 0x5C)
    565       OS << '\\';
    566     if (C >= 0x20) {
    567       OS << C;
    568       continue;
    569     }
    570     OS << '\\';
    571     switch (C) {
    572     // A few characters are common enough to make short escapes worthwhile.
    573     case '\t':
    574       OS << 't';
    575       break;
    576     case '\n':
    577       OS << 'n';
    578       break;
    579     case '\r':
    580       OS << 'r';
    581       break;
    582     default:
    583       OS << 'u';
    584       llvm::write_hex(OS, C, llvm::HexPrintStyle::Lower, 4);
    585       break;
    586     }
    587   }
    588   OS << '\"';
    589 }
    590 
// Formatting events that Value::print() reports to its indenter callback.
// The callback decides what, if anything, to write for each event.
enum IndenterAction {
  Indent,  // Reported after the opening '{' or '[' has been written.
  Outdent, // Reported before the closing '}' or ']' is written.
  Newline, // Reported before each element/property, and before the closing
           // bracket of a non-empty container.
  Space,   // Reported after the ':' that follows an object key.
};
    597 
    598 // Prints JSON. The indenter can be used to control formatting.
    599 template <typename Indenter>
    600 void llvm::json::Value::print(raw_ostream &OS, const Indenter &I) const {
    601   switch (Type) {
    602   case T_Null:
    603     OS << "null";
    604     break;
    605   case T_Boolean:
    606     OS << (as<bool>() ? "true" : "false");
    607     break;
    608   case T_Double:
    609     OS << format("%.*g", std::numeric_limits<double>::max_digits10,
    610                  as<double>());
    611     break;
    612   case T_Integer:
    613     OS << as<int64_t>();
    614     break;
    615   case T_StringRef:
    616     quote(OS, as<StringRef>());
    617     break;
    618   case T_String:
    619     quote(OS, as<std::string>());
    620     break;
    621   case T_Object: {
    622     bool Comma = false;
    623     OS << '{';
    624     I(Indent);
    625     for (const auto *P : sortedElements(as<json::Object>())) {
    626       if (Comma)
    627         OS << ',';
    628       Comma = true;
    629       I(Newline);
    630       quote(OS, P->first);
    631       OS << ':';
    632       I(Space);
    633       P->second.print(OS, I);
    634     }
    635     I(Outdent);
    636     if (Comma)
    637       I(Newline);
    638     OS << '}';
    639     break;
    640   }
    641   case T_Array: {
    642     bool Comma = false;
    643     OS << '[';
    644     I(Indent);
    645     for (const auto &E : as<json::Array>()) {
    646       if (Comma)
    647         OS << ',';
    648       Comma = true;
    649       I(Newline);
    650       E.print(OS, I);
    651     }
    652     I(Outdent);
    653     if (Comma)
    654       I(Newline);
    655     OS << ']';
    656     break;
    657   }
    658   }
    659 }
    660 
    661 void llvm::format_provider<llvm::json::Value>::format(
    662     const llvm::json::Value &E, raw_ostream &OS, StringRef Options) {
    663   if (Options.empty()) {
    664     OS << E;
    665     return;
    666   }
    667   unsigned IndentAmount = 0;
    668   if (Options.getAsInteger(/*Radix=*/10, IndentAmount))
    669     llvm_unreachable("json::Value format options should be an integer");
    670   unsigned IndentLevel = 0;
    671   E.print(OS, [&](IndenterAction A) {
    672     switch (A) {
    673     case Newline:
    674       OS << '\n';
    675       OS.indent(IndentLevel);
    676       break;
    677     case Space:
    678       OS << ' ';
    679       break;
    680     case Indent:
    681       IndentLevel += IndentAmount;
    682       break;
    683     case Outdent:
    684       IndentLevel -= IndentAmount;
    685       break;
    686     };
    687   });
    688 }
    689 
    690 llvm::raw_ostream &llvm::json::operator<<(raw_ostream &OS, const Value &E) {
    691   E.print(OS, [](IndenterAction A) { /*ignore*/ });
    692   return OS;
    693 }
    694