Home | History | Annotate | Download | only in json
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifndef BASE_JSON_JSON_PARSER_H_
      6 #define BASE_JSON_JSON_PARSER_H_
      7 
      8 #include <stddef.h>
      9 #include <stdint.h>
     10 
     11 #include <memory>
     12 #include <string>
     13 
     14 #include "base/base_export.h"
     15 #include "base/compiler_specific.h"
     16 #include "base/gtest_prod_util.h"
     17 #include "base/json/json_reader.h"
     18 #include "base/macros.h"
     19 #include "base/memory/manual_constructor.h"
     20 #include "base/strings/string_piece.h"
     21 
     22 namespace base {
     23 
     24 class Value;  // Forward declaration; only named via std::unique_ptr<Value> here.
     25 
     26 namespace internal {
     27 
     28 class JSONParserTest;  // Forward declaration for the friend in JSONParser.
     29 
     30 // The implementation behind the JSONReader interface. This class is not meant
     31 // to be used directly; it encapsulates logic that need not be exposed publicly.
     32 //
     33 // This parser guarantees O(n) time through the input string. It also optimizes
     34 // base::Value by using StringPiece where possible when returning Value
     35 // objects by using "hidden roots," discussed in the implementation.
     36 //
     37 // Iteration happens on the byte level, with the functions CanConsume and
     38 // NextChar. The conversion from byte to JSON token happens without advancing
     39 // the parser in GetNextToken/ParseToken; that is, tokenization operates on
     40 // the current parser position without advancing.
     41 //
     42 // Built on top of these are a family of Consume functions that iterate
     43 // internally. Invariant: on entry of a Consume function, the parser is wound
     44 // to the first byte of a valid JSON token. On exit, it is on the last byte
     45 // of a token, such that the next iteration of the parser will be at the byte
     46 // immediately following the token, which would likely be the first byte of the
     47 // next token.
     48 class BASE_EXPORT JSONParser {
     49  public:
     50   explicit JSONParser(int options);
     51   ~JSONParser();
     52 
     53   // Parses the input string according to the set options and returns the
     54   // result as a Value.
     55   // Wrap this in base::FooValue::From() to check the Value is of type Foo and
     56   // convert to a FooValue at the same time.
     57   std::unique_ptr<Value> Parse(StringPiece input);
     58 
     59   // Returns the error code.
     60   JSONReader::JsonParseError error_code() const;
     61 
     62   // Returns the human-friendly error message.
     63   std::string GetErrorMessage() const;
     64 
     65   // Returns the error line number if a parse error happened. Otherwise always
     66   // returns 0.
     67   int error_line() const;
     68 
     69   // Returns the error column number if a parse error happened. Otherwise always
     70   // returns 0.
     71   int error_column() const;
     72 
     73  private:
     74   enum Token {
     75     T_OBJECT_BEGIN,           // {
     76     T_OBJECT_END,             // }
     77     T_ARRAY_BEGIN,            // [
     78     T_ARRAY_END,              // ]
     79     T_STRING,
     80     T_NUMBER,
     81     T_BOOL_TRUE,              // true
     82     T_BOOL_FALSE,             // false
     83     T_NULL,                   // null
     84     T_LIST_SEPARATOR,         // ,
     85     T_OBJECT_PAIR_SEPARATOR,  // :
     86     T_END_OF_INPUT,
     87     T_INVALID_TOKEN,
     88   };
     89 
     90   // A helper class used for parsing strings. One optimization performed is to
     91   // create base::Value with a StringPiece to avoid unnecessary std::string
     92   // copies. This is not possible if the input string needs to be decoded from
     93   // UTF-16 to UTF-8, or if an escape sequence causes characters to be skipped.
     94   // This class centralizes that logic.
     95   class StringBuilder {
     96    public:
     97     // Empty constructor. Used for creating a builder to be assigned to later.
     98     StringBuilder();
     99 
    100     // |pos| is the beginning of an input string, excluding the |"|.
    101     explicit StringBuilder(const char* pos);
    102 
    103     ~StringBuilder();
    104 
    105     void operator=(StringBuilder&& other);  // Move assignment from |other|.
    106 
    107     // Either increases the |length_| of the string or copies the character if
    108     // the StringBuilder has been converted. |c| must be in the basic ASCII
    109     // plane; all other characters need to be in UTF-8 units, appended with
    110     // AppendString below.
    111     void Append(const char& c);
    112 
    113     // Appends a string to the std::string. Must be Convert()ed to use.
    114     void AppendString(const char* str, size_t len);
    115 
    116     // Converts the builder from its default StringPiece to a full std::string,
    117     // performing a copy. Once a builder is converted, it cannot be made a
    118     // StringPiece again.
    119     void Convert();
    120 
    121     // Returns the builder as a StringPiece.
    122     StringPiece AsStringPiece();
    123 
    124     // Returns the builder as a std::string.
    125     const std::string& AsString();
    126 
    127     // Returns the builder as a string, invalidating all state. This allows
    128     // the internal string buffer representation to be destructively moved
    129     // in cases where the builder will not be needed any more.
    130     std::string DestructiveAsString();
    131 
    132    private:
    133     // The beginning of the input string.
    134     const char* pos_;
    135 
    136     // Number of bytes in |pos_| that make up the string being built.
    137     size_t length_;
    138 
    139     // The copied string representation. Will be uninitialized until Convert()
    140     // is called, which will set has_string_ to true.
    141     bool has_string_;
    142     base::ManualConstructor<std::string> string_;
    143   };
    144 
    145   // Quick check that the stream has capacity to consume |length| more bytes.
    146   bool CanConsume(int length);
    147 
    148   // The basic way to consume a single character in the stream. Consumes one
    149   // byte of the input stream and returns a pointer to the rest of it.
    150   const char* NextChar();
    151 
    152   // Performs the equivalent of NextChar N times.
    153   void NextNChars(int n);
    154 
    155   // Skips over whitespace and comments to find the next token in the stream.
    156   // This does not advance the parser for non-whitespace or comment chars.
    157   Token GetNextToken();
    158 
    159   // Consumes whitespace characters and comments until a character that is
    160   // neither is encountered.
    161   void EatWhitespaceAndComments();
    162   // Helper function that consumes a comment, assuming that the parser is
    163   // currently wound to a '/'.
    164   bool EatComment();
    165 
    166   // Calls GetNextToken() and then ParseToken().
    167   std::unique_ptr<Value> ParseNextToken();
    168 
    169   // Takes a token that represents the start of a Value ("a structural token"
    170   // in RFC terms) and consumes it, returning the result as a Value.
    171   std::unique_ptr<Value> ParseToken(Token token);
    172 
    173   // Assuming that the parser is currently wound to '{', this parses a JSON
    174   // object into a DictionaryValue.
    175   std::unique_ptr<Value> ConsumeDictionary();
    176 
    177   // Assuming that the parser is wound to '[', this parses a JSON list into a
    178   // ListValue (returned as a std::unique_ptr<Value>).
    179   std::unique_ptr<Value> ConsumeList();
    180 
    181   // Calls through ConsumeStringRaw and wraps it in a value.
    182   std::unique_ptr<Value> ConsumeString();
    183 
    184   // Assuming that the parser is wound to a double quote, this parses a string,
    185   // decoding any escape sequences and converts UTF-16 to UTF-8. Returns true on
    186   // success and places result into |out|. Returns false on failure with
    187   // error information set.
    188   bool ConsumeStringRaw(StringBuilder* out);
    189   // Helper function for ConsumeStringRaw() that consumes the next four or 10
    190   // bytes (parser is wound to the first character of a HEX sequence, with the
    191   // potential for consuming another \uXXXX for a surrogate). Returns true on
    192   // success and places the UTF8 code units in |dest_string|, and false on
    193   // failure.
    194   bool DecodeUTF16(std::string* dest_string);
    195   // Helper function for ConsumeStringRaw() that takes a single code point,
    196   // encodes it as UTF-8 units, and appends them to the given builder. The
    197   // code point must be valid.
    198   void DecodeUTF8(const int32_t& point, StringBuilder* dest);
    199 
    200   // Assuming that the parser is wound to the start of a valid JSON number,
    201   // this parses and converts it to either an int or double value.
    202   std::unique_ptr<Value> ConsumeNumber();
    203   // Helper that reads characters that are ints. Returns true if a number was
    204   // read and false on error.
    205   bool ReadInt(bool allow_leading_zeros);
    206 
    207   // Consumes the literal values of |true|, |false|, and |null|, assuming the
    208   // parser is wound to the first character of any of those.
    209   std::unique_ptr<Value> ConsumeLiteral();
    210 
    211   // Compares two string buffers of a given length.
    212   static bool StringsAreEqual(const char* left, const char* right, size_t len);
    213 
    214   // Sets the error information to |code| at the current column, based on
    215   // |index_| and |index_last_line_|, with an optional positive/negative
    216   // adjustment by |column_adjust|.
    217   void ReportError(JSONReader::JsonParseError code, int column_adjust);
    218 
    219   // Given the line and column number of an error, formats one of the error
    220   // message constants from json_reader.h for human display.
    221   static std::string FormatErrorMessage(int line, int column,
    222                                         const std::string& description);
    223 
    224   // base::JSONParserOptions that control parsing.
    225   const int options_;
    226 
    227   // Pointer to the start of the input data.
    228   const char* start_pos_;
    229 
    230   // Pointer to the current position in the input data. Equivalent to
    231   // |start_pos_ + index_|.
    232   const char* pos_;
    233 
    234   // Pointer to the last character of the input data.
    235   const char* end_pos_;
    236 
    237   // The index in the input stream to which the parser is wound.
    238   int index_;
    239 
    240   // The number of times the parser has recursed (current stack depth).
    241   int stack_depth_;
    242 
    243   // The line number that the parser is at currently.
    244   int line_number_;
    245 
    246   // The last value of |index_| on the previous line.
    247   int index_last_line_;
    248 
    249   // Error information.
    250   JSONReader::JsonParseError error_code_;
    251   int error_line_;
    252   int error_column_;
    253 
    254   friend class JSONParserTest;
    255   FRIEND_TEST_ALL_PREFIXES(JSONParserTest, NextChar);
    256   FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeDictionary);
    257   FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeList);
    258   FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeString);
    259   FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeLiterals);
    260   FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeNumbers);
    261   FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ErrorMessages);
    262   FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ReplaceInvalidCharacters);
    263 
    264   DISALLOW_COPY_AND_ASSIGN(JSONParser);
    265 };
    266 
    266 // Used when an invalid UTF-8 sequence is encountered during decoding.
    267 BASE_EXPORT extern const char kUnicodeReplacementString[];
    269 
    270 }  // namespace internal
    271 }  // namespace base
    272 
    273 #endif  // BASE_JSON_JSON_PARSER_H_
    274