Home | History | Annotate | Download | only in json
      1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifndef BASE_JSON_JSON_PARSER_H_
      6 #define BASE_JSON_JSON_PARSER_H_
      7 
      8 #include <stddef.h>
      9 #include <stdint.h>
     10 
     11 #include <memory>
     12 #include <string>
     13 
     14 #include "base/base_export.h"
     15 #include "base/compiler_specific.h"
     16 #include "base/gtest_prod_util.h"
     17 #include "base/json/json_reader.h"
     18 #include "base/macros.h"
     19 #include "base/strings/string_piece.h"
     20 
     21 namespace base {
     22 
     23 class Value;
     24 
     25 namespace internal {
     26 
     27 class JSONParserTest;
     28 
     29 // The implementation behind the JSONReader interface. This class is not meant
     30 // to be used directly; it encapsulates logic that need not be exposed publicly.
     31 //
     32 // This parser guarantees O(n) time through the input string. It also optimizes
     33 // base::StringValue by using StringPiece where possible when returning Value
     34 // objects by using "hidden roots," discussed in the implementation.
     35 //
     36 // Iteration happens on the byte level, with the functions CanConsume and
     37 // NextChar. The conversion from byte to JSON token happens in
     38 // GetNextToken/ParseToken without advancing the parser; that is,
     39 // tokenization operates on the current parser position.
     40 //
     41 // Built on top of these are a family of Consume functions that iterate
     42 // internally. Invariant: on entry of a Consume function, the parser is wound
     43 // to the first byte of a valid JSON token. On exit, it is on the last byte
     44 // of a token, such that the next iteration of the parser will be at the byte
     45 // immediately following the token, which would likely be the first byte of the
     46 // next token.
     47 class BASE_EXPORT JSONParser {
     48  public:
          // |options| presumably carries the base::JSONParserOptions value that is
          // kept in |options_| -- confirm against the implementation.
     49   explicit JSONParser(int options);
     50   ~JSONParser();
     51 
     52   // Parses the input string according to the set options and returns the
     53   // result as a Value.
     54   // Wrap this in base::FooValue::From() to check the Value is of type Foo and
     55   // convert to a FooValue at the same time.
     56   std::unique_ptr<Value> Parse(StringPiece input);
     57 
     58   // Returns the error code.
     59   JSONReader::JsonParseError error_code() const;
     60 
     61   // Returns the human-friendly error message.
     62   std::string GetErrorMessage() const;
     63 
     64   // Returns the error line number if parse error happened. Otherwise always
     65   // returns 0.
     66   int error_line() const;
     67 
     68   // Returns the error column number if parse error happened. Otherwise always
     69   // returns 0.
     70   int error_column() const;
     71 
     72  private:
          // Token kinds returned by GetNextToken() and accepted by ParseToken().
     73   enum Token {
     74     T_OBJECT_BEGIN,           // {
     75     T_OBJECT_END,             // }
     76     T_ARRAY_BEGIN,            // [
     77     T_ARRAY_END,              // ]
     78     T_STRING,
     79     T_NUMBER,
     80     T_BOOL_TRUE,              // true
     81     T_BOOL_FALSE,             // false
     82     T_NULL,                   // null
     83     T_LIST_SEPARATOR,         // ,
     84     T_OBJECT_PAIR_SEPARATOR,  // :
     85     T_END_OF_INPUT,
     86     T_INVALID_TOKEN,
     87   };
     88 
     89   // A helper class used for parsing strings. One optimization performed is to
     90   // create base::Value with a StringPiece to avoid unnecessary std::string
     91   // copies. This is not possible if the input string needs to be decoded from
     92   // UTF-16 to UTF-8, or if an escape sequence causes characters to be skipped.
     93   // This class centralizes that logic.
     94   class StringBuilder {
     95    public:
     96     // Empty constructor. Used for creating a builder with which to Swap().
     97     StringBuilder();
     98 
     99     // |pos| is the beginning of an input string, excluding the |"|.
    100     explicit StringBuilder(const char* pos);
    101 
    102     ~StringBuilder();
    103 
    104     // Swaps the contents of |other| with this.
    105     void Swap(StringBuilder* other);
    106 
    107     // Either increases the |length_| of the string or copies the character if
    108     // the StringBuilder has been converted. |c| must be in the basic ASCII
    109     // plane; all other characters need to be in UTF-8 units, appended with
    110     // AppendString below.
    111     void Append(const char& c);
    112 
    113     // Appends a string to the std::string. Must be Convert()ed to use.
    114     void AppendString(const std::string& str);
    115 
    116     // Converts the builder from its default StringPiece to a full std::string,
    117     // performing a copy. Once a builder is converted, it cannot be made a
    118     // StringPiece again.
    119     void Convert();
    120 
    121     // Returns whether the builder can be converted to a StringPiece.
    122     bool CanBeStringPiece() const;
    123 
    124     // Returns the StringPiece representation. Returns an empty piece if it
    125     // cannot be converted.
    126     StringPiece AsStringPiece();
    127 
    128     // Returns the builder as a std::string.
    129     const std::string& AsString();
    130 
    131    private:
    132     // The beginning of the input string.
    133     const char* pos_;
    134 
    135     // Number of bytes in |pos_| that make up the string being built.
    136     size_t length_;
    137 
    138     // The copied string representation. NULL until Convert() is called.
    139     // Strong. std::unique_ptr<T> has too much of an overhead here.
            // NOTE(review): "Strong" presumably means this builder owns the pointee
            // and must delete it -- confirm in ~StringBuilder() and Swap().
    140     std::string* string_;
    141   };
    142 
    143   // Quick check that the stream has capacity to consume |length| more bytes.
    144   bool CanConsume(int length);
    145 
    146   // The basic way to consume a single character in the stream. Consumes one
    147   // byte of the input stream and returns a pointer to the rest of it.
    148   const char* NextChar();
    149 
    150   // Performs the equivalent of NextChar N times.
    151   void NextNChars(int n);
    152 
    153   // Skips over whitespace and comments to find the next token in the stream.
    154   // This does not advance the parser for non-whitespace or comment chars.
    155   Token GetNextToken();
    156 
    157   // Consumes whitespace characters and comments until a character that is
    158   // neither is encountered.
    159   void EatWhitespaceAndComments();
    160   // Helper function that consumes a comment, assuming that the parser is
    161   // currently wound to a '/'.
    162   bool EatComment();
    163 
    164   // Calls GetNextToken() and then ParseToken(). Caller owns the result.
    165   Value* ParseNextToken();
    166 
    167   // Takes a token that represents the start of a Value ("a structural token"
    168   // in RFC terms) and consumes it, returning the result as an object the
    169   // caller owns.
    170   Value* ParseToken(Token token);
    171 
    172   // Assuming that the parser is currently wound to '{', this parses a JSON
    173   // object into a DictionaryValue.
    174   Value* ConsumeDictionary();
    175 
    176   // Assuming that the parser is wound to '[', this parses a JSON list into a
    177   // ListValue.
    178   Value* ConsumeList();
    179 
    180   // Calls through ConsumeStringRaw and wraps it in a value.
    181   Value* ConsumeString();
    182 
    183   // Assuming that the parser is wound to a double quote, this parses a string,
    184   // decoding any escape sequences and converts UTF-16 to UTF-8. Returns true on
    185   // success and Swap()s the result into |out|. Returns false on failure with
    186   // error information set.
    187   bool ConsumeStringRaw(StringBuilder* out);
    188   // Helper function for ConsumeStringRaw() that consumes the next four or 10
    189   // bytes (parser is wound to the first character of a HEX sequence, with the
    190   // potential for consuming another \uXXXX for a surrogate). Returns true on
    191   // success and places the UTF8 code units in |dest_string|, and false on
    192   // failure.
    193   bool DecodeUTF16(std::string* dest_string);
    194   // Helper function for ConsumeStringRaw() that takes a single code point,
    195   // decodes it into UTF-8 units, and appends it to the given builder. The
    196   // point must be valid.
    197   void DecodeUTF8(const int32_t& point, StringBuilder* dest);
    198 
    199   // Assuming that the parser is wound to the start of a valid JSON number,
    200   // this parses and converts it to either an int or double value.
    201   Value* ConsumeNumber();
    202   // Helper that reads characters that are ints. Returns true if a number was
    203   // read and false on error.
          // NOTE(review): |allow_leading_zeros| presumably decides whether digit runs
          // such as "007" are accepted -- confirm in the implementation.
    204   bool ReadInt(bool allow_leading_zeros);
    205 
    206   // Consumes the literal values of |true|, |false|, and |null|, assuming the
    207   // parser is wound to the first character of any of those.
    208   Value* ConsumeLiteral();
    209 
    210   // Compares two string buffers of a given length.
    211   static bool StringsAreEqual(const char* left, const char* right, size_t len);
    212 
    213   // Sets the error information to |code| at the current column, based on
    214   // |index_| and |index_last_line_|, with an optional positive/negative
    215   // adjustment by |column_adjust|.
    216   void ReportError(JSONReader::JsonParseError code, int column_adjust);
    217 
    218   // Given the line and column number of an error, formats one of the error
    219   // message constants from json_reader.h for human display.
    220   static std::string FormatErrorMessage(int line, int column,
    221                                         const std::string& description);
    222 
    223   // base::JSONParserOptions that control parsing.
    224   const int options_;
    225 
    226   // Pointer to the start of the input data.
    227   const char* start_pos_;
    228 
    229   // Pointer to the current position in the input data. Equivalent to
    230   // |start_pos_ + index_|.
    231   const char* pos_;
    232 
    233   // Pointer to the last character of the input data.
          // NOTE(review): confirm whether this addresses the final byte itself or
          // one past it; the bound used by CanConsume() depends on that.
    234   const char* end_pos_;
    235 
    236   // The index in the input stream to which the parser is wound.
          // NOTE(review): this is an int while Parse() takes a StringPiece; inputs
          // longer than INT_MAX are presumably unsupported -- confirm.
    237   int index_;
    238 
    239   // The number of times the parser has recursed (current stack depth).
    240   int stack_depth_;
    241 
    242   // The line number that the parser is at currently.
    243   int line_number_;
    244 
    245   // The last value of |index_| on the previous line.
    246   int index_last_line_;
    247 
    248   // Error information, presumably what error_code(), error_line() and
          // error_column() report.
    249   JSONReader::JsonParseError error_code_;
    250   int error_line_;
    251   int error_column_;
    252 
          // The unit tests below are befriended so they can reach the private
          // members declared above.
    253   friend class JSONParserTest;
    254   FRIEND_TEST_ALL_PREFIXES(JSONParserTest, NextChar);
    255   FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeDictionary);
    256   FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeList);
    257   FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeString);
    258   FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeLiterals);
    259   FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeNumbers);
    260   FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ErrorMessages);
    261 
    262   DISALLOW_COPY_AND_ASSIGN(JSONParser);
    263 };
    264 
    265 }  // namespace internal
    266 }  // namespace base
    267 
    268 #endif  // BASE_JSON_JSON_PARSER_H_
    269