// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
      4 
      5 #ifndef BASE_JSON_JSON_PARSER_H_
      6 #define BASE_JSON_JSON_PARSER_H_
      7 
      8 #include <stddef.h>
      9 #include <stdint.h>
     10 
     11 #include <string>
     12 
     13 #include "base/base_export.h"
     14 #include "base/compiler_specific.h"
     15 #include "base/gtest_prod_util.h"
     16 #include "base/json/json_reader.h"
     17 #include "base/macros.h"
     18 #include "base/strings/string_piece.h"
     19 
     20 namespace base {
     21 
     22 class Value;
     23 
     24 namespace internal {
     25 
     26 class JSONParserTest;
     27 
// The implementation behind the JSONReader interface. This class is not meant
// to be used directly; it encapsulates logic that need not be exposed publicly.
//
// This parser guarantees O(n) time through the input string. It also avoids
// string copies where possible: the base::StringValue objects it returns can
// refer to the input through a StringPiece, using the "hidden roots" discussed
// in the implementation.
//
// Iteration happens on the byte level, with the functions CanConsume and
// NextChar. The conversion from byte to JSON token happens in
// GetNextToken/ParseToken; that is, tokenization operates on the current
// parser position without advancing it.
//
// Built on top of these is a family of Consume functions that iterate
// internally. Invariant: on entry of a Consume function, the parser is wound
// to the first byte of a valid JSON token. On exit, it is on the last byte
// of a token, such that the next iteration of the parser will be at the byte
// immediately following the token, which would likely be the first byte of the
// next token.
     46 class BASE_EXPORT JSONParser {
     47  public:
     48   explicit JSONParser(int options);
     49   ~JSONParser();
     50 
     51   // Parses the input string according to the set options and returns the
     52   // result as a Value owned by the caller.
     53   Value* Parse(const StringPiece& input);
     54 
     55   // Returns the error code.
     56   JSONReader::JsonParseError error_code() const;
     57 
     58   // Returns the human-friendly error message.
     59   std::string GetErrorMessage() const;
     60 
     61   // Returns the error line number if parse error happened. Otherwise always
     62   // returns 0.
     63   int error_line() const;
     64 
     65   // Returns the error column number if parse error happened. Otherwise always
     66   // returns 0.
     67   int error_column() const;
     68 
     69  private:
     70   enum Token {
     71     T_OBJECT_BEGIN,           // {
     72     T_OBJECT_END,             // }
     73     T_ARRAY_BEGIN,            // [
     74     T_ARRAY_END,              // ]
     75     T_STRING,
     76     T_NUMBER,
     77     T_BOOL_TRUE,              // true
     78     T_BOOL_FALSE,             // false
     79     T_NULL,                   // null
     80     T_LIST_SEPARATOR,         // ,
     81     T_OBJECT_PAIR_SEPARATOR,  // :
     82     T_END_OF_INPUT,
     83     T_INVALID_TOKEN,
     84   };
     85 
     86   // A helper class used for parsing strings. One optimization performed is to
     87   // create base::Value with a StringPiece to avoid unnecessary std::string
     88   // copies. This is not possible if the input string needs to be decoded from
     89   // UTF-16 to UTF-8, or if an escape sequence causes characters to be skipped.
     90   // This class centralizes that logic.
     91   class StringBuilder {
     92    public:
     93     // Empty constructor. Used for creating a builder with which to Swap().
     94     StringBuilder();
     95 
     96     // |pos| is the beginning of an input string, excluding the |"|.
     97     explicit StringBuilder(const char* pos);
     98 
     99     ~StringBuilder();
    100 
    101     // Swaps the contents of |other| with this.
    102     void Swap(StringBuilder* other);
    103 
    104     // Either increases the |length_| of the string or copies the character if
    105     // the StringBuilder has been converted. |c| must be in the basic ASCII
    106     // plane; all other characters need to be in UTF-8 units, appended with
    107     // AppendString below.
    108     void Append(const char& c);
    109 
    110     // Appends a string to the std::string. Must be Convert()ed to use.
    111     void AppendString(const std::string& str);
    112 
    113     // Converts the builder from its default StringPiece to a full std::string,
    114     // performing a copy. Once a builder is converted, it cannot be made a
    115     // StringPiece again.
    116     void Convert();
    117 
    118     // Returns whether the builder can be converted to a StringPiece.
    119     bool CanBeStringPiece() const;
    120 
    121     // Returns the StringPiece representation. Returns an empty piece if it
    122     // cannot be converted.
    123     StringPiece AsStringPiece();
    124 
    125     // Returns the builder as a std::string.
    126     const std::string& AsString();
    127 
    128    private:
    129     // The beginning of the input string.
    130     const char* pos_;
    131 
    132     // Number of bytes in |pos_| that make up the string being built.
    133     size_t length_;
    134 
    135     // The copied string representation. NULL until Convert() is called.
    136     // Strong. scoped_ptr<T> has too much of an overhead here.
    137     std::string* string_;
    138   };
    139 
    140   // Quick check that the stream has capacity to consume |length| more bytes.
    141   bool CanConsume(int length);
    142 
    143   // The basic way to consume a single character in the stream. Consumes one
    144   // byte of the input stream and returns a pointer to the rest of it.
    145   const char* NextChar();
    146 
    147   // Performs the equivalent of NextChar N times.
    148   void NextNChars(int n);
    149 
    150   // Skips over whitespace and comments to find the next token in the stream.
    151   // This does not advance the parser for non-whitespace or comment chars.
    152   Token GetNextToken();
    153 
    154   // Consumes whitespace characters and comments until the next non-that is
    155   // encountered.
    156   void EatWhitespaceAndComments();
    157   // Helper function that consumes a comment, assuming that the parser is
    158   // currently wound to a '/'.
    159   bool EatComment();
    160 
    161   // Calls GetNextToken() and then ParseToken(). Caller owns the result.
    162   Value* ParseNextToken();
    163 
    164   // Takes a token that represents the start of a Value ("a structural token"
    165   // in RFC terms) and consumes it, returning the result as an object the
    166   // caller owns.
    167   Value* ParseToken(Token token);
    168 
    169   // Assuming that the parser is currently wound to '{', this parses a JSON
    170   // object into a DictionaryValue.
    171   Value* ConsumeDictionary();
    172 
    173   // Assuming that the parser is wound to '[', this parses a JSON list into a
    174   // ListValue.
    175   Value* ConsumeList();
    176 
    177   // Calls through ConsumeStringRaw and wraps it in a value.
    178   Value* ConsumeString();
    179 
    180   // Assuming that the parser is wound to a double quote, this parses a string,
    181   // decoding any escape sequences and converts UTF-16 to UTF-8. Returns true on
    182   // success and Swap()s the result into |out|. Returns false on failure with
    183   // error information set.
    184   bool ConsumeStringRaw(StringBuilder* out);
    185   // Helper function for ConsumeStringRaw() that consumes the next four or 10
    186   // bytes (parser is wound to the first character of a HEX sequence, with the
    187   // potential for consuming another \uXXXX for a surrogate). Returns true on
    188   // success and places the UTF8 code units in |dest_string|, and false on
    189   // failure.
    190   bool DecodeUTF16(std::string* dest_string);
    191   // Helper function for ConsumeStringRaw() that takes a single code point,
    192   // decodes it into UTF-8 units, and appends it to the given builder. The
    193   // point must be valid.
    194   void DecodeUTF8(const int32_t& point, StringBuilder* dest);
    195 
    196   // Assuming that the parser is wound to the start of a valid JSON number,
    197   // this parses and converts it to either an int or double value.
    198   Value* ConsumeNumber();
    199   // Helper that reads characters that are ints. Returns true if a number was
    200   // read and false on error.
    201   bool ReadInt(bool allow_leading_zeros);
    202 
    203   // Consumes the literal values of |true|, |false|, and |null|, assuming the
    204   // parser is wound to the first character of any of those.
    205   Value* ConsumeLiteral();
    206 
    207   // Compares two string buffers of a given length.
    208   static bool StringsAreEqual(const char* left, const char* right, size_t len);
    209 
    210   // Sets the error information to |code| at the current column, based on
    211   // |index_| and |index_last_line_|, with an optional positive/negative
    212   // adjustment by |column_adjust|.
    213   void ReportError(JSONReader::JsonParseError code, int column_adjust);
    214 
    215   // Given the line and column number of an error, formats one of the error
    216   // message contants from json_reader.h for human display.
    217   static std::string FormatErrorMessage(int line, int column,
    218                                         const std::string& description);
    219 
    220   // base::JSONParserOptions that control parsing.
    221   int options_;
    222 
    223   // Pointer to the start of the input data.
    224   const char* start_pos_;
    225 
    226   // Pointer to the current position in the input data. Equivalent to
    227   // |start_pos_ + index_|.
    228   const char* pos_;
    229 
    230   // Pointer to the last character of the input data.
    231   const char* end_pos_;
    232 
    233   // The index in the input stream to which the parser is wound.
    234   int index_;
    235 
    236   // The number of times the parser has recursed (current stack depth).
    237   int stack_depth_;
    238 
    239   // The line number that the parser is at currently.
    240   int line_number_;
    241 
    242   // The last value of |index_| on the previous line.
    243   int index_last_line_;
    244 
    245   // Error information.
    246   JSONReader::JsonParseError error_code_;
    247   int error_line_;
    248   int error_column_;
    249 
    250   friend class JSONParserTest;
    251   FRIEND_TEST_ALL_PREFIXES(JSONParserTest, NextChar);
    252   FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeDictionary);
    253   FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeList);
    254   FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeString);
    255   FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeLiterals);
    256   FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ConsumeNumbers);
    257   FRIEND_TEST_ALL_PREFIXES(JSONParserTest, ErrorMessages);
    258 
    259   DISALLOW_COPY_AND_ASSIGN(JSONParser);
    260 };
    261 
    262 }  // namespace internal
    263 }  // namespace base
    264 
    265 #endif  // BASE_JSON_JSON_PARSER_H_
    266