Home | History | Annotate | Download | only in compiler
      1 // Protocol Buffers - Google's data interchange format
      2 // Copyright 2008 Google Inc.  All rights reserved.
      3 // http://code.google.com/p/protobuf/
      4 //
      5 // Redistribution and use in source and binary forms, with or without
      6 // modification, are permitted provided that the following conditions are
      7 // met:
      8 //
      9 //     * Redistributions of source code must retain the above copyright
     10 // notice, this list of conditions and the following disclaimer.
     11 //     * Redistributions in binary form must reproduce the above
     12 // copyright notice, this list of conditions and the following disclaimer
     13 // in the documentation and/or other materials provided with the
     14 // distribution.
     15 //     * Neither the name of Google Inc. nor the names of its
     16 // contributors may be used to endorse or promote products derived from
     17 // this software without specific prior written permission.
     18 //
     19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     30 
     31 // Author: kenton (at) google.com (Kenton Varda)
     32 //  Based on original Protocol Buffers design by
     33 //  Sanjay Ghemawat, Jeff Dean, and others.
     34 //
     35 // Implements parsing of .proto files to FileDescriptorProtos.
     36 
     37 #ifndef GOOGLE_PROTOBUF_COMPILER_PARSER_H__
     38 #define GOOGLE_PROTOBUF_COMPILER_PARSER_H__
     39 
     40 #include <map>
     41 #include <string>
     42 #include <utility>
     43 #include <google/protobuf/stubs/common.h>
     44 #include <google/protobuf/descriptor.h>
     45 #include <google/protobuf/descriptor.pb.h>
     46 #include <google/protobuf/repeated_field.h>
     47 #include <google/protobuf/io/tokenizer.h>
     48 
     49 namespace google {
     50 namespace protobuf { class Message; }
     51 
     52 namespace protobuf {
     53 namespace compiler {
     54 
     55 // Defined in this file.
     56 class Parser;
     57 class SourceLocationTable;
     58 
     59 // Implements parsing of protocol definitions (such as .proto files).
     60 //
     61 // Note that most users will be more interested in the Importer class.
     62 // Parser is a lower-level class which simply converts a single .proto file
     63 // to a FileDescriptorProto.  It does not resolve import directives or perform
     64 // many other kinds of validation needed to construct a complete
     65 // FileDescriptor.
     66 class LIBPROTOBUF_EXPORT Parser {
     67  public:
     68   Parser();
     69   ~Parser();
     70 
     71   // Parse the entire input and construct a FileDescriptorProto representing
     72   // it.  Returns true if no errors occurred, false otherwise.
     73   bool Parse(io::Tokenizer* input, FileDescriptorProto* file);
     74 
     75   // Optional fetaures:
     76 
     77   // Requests that locations of certain definitions be recorded to the given
     78   // SourceLocationTable while parsing.  This can be used to look up exact line
     79   // and column numbers for errors reported by DescriptorPool during validation.
     80   // Set to NULL (the default) to discard source location information.
     81   void RecordSourceLocationsTo(SourceLocationTable* location_table) {
     82     source_location_table_ = location_table;
     83   }
     84 
     85   // Requsets that errors be recorded to the given ErrorCollector while
     86   // parsing.  Set to NULL (the default) to discard error messages.
     87   void RecordErrorsTo(io::ErrorCollector* error_collector) {
     88     error_collector_ = error_collector;
     89   }
     90 
     91   // Returns the identifier used in the "syntax = " declaration, if one was
     92   // seen during the last call to Parse(), or the empty string otherwise.
     93   const string& GetSyntaxIdentifier() { return syntax_identifier_; }
     94 
     95   // If set true, input files will be required to begin with a syntax
     96   // identifier.  Otherwise, files may omit this.  If a syntax identifier
     97   // is provided, it must be 'syntax = "proto2";' and must appear at the
     98   // top of this file regardless of whether or not it was required.
     99   void SetRequireSyntaxIdentifier(bool value) {
    100     require_syntax_identifier_ = value;
    101   }
    102 
    103   // Call SetStopAfterSyntaxIdentifier(true) to tell the parser to stop
    104   // parsing as soon as it has seen the syntax identifier, or lack thereof.
    105   // This is useful for quickly identifying the syntax of the file without
    106   // parsing the whole thing.  If this is enabled, no error will be recorded
    107   // if the syntax identifier is something other than "proto2" (since
    108   // presumably the caller intends to deal with that), but other kinds of
    109   // errors (e.g. parse errors) will still be reported.  When this is enabled,
    110   // you may pass a NULL FileDescriptorProto to Parse().
    111   void SetStopAfterSyntaxIdentifier(bool value) {
    112     stop_after_syntax_identifier_ = value;
    113   }
    114 
    115  private:
    116   // =================================================================
    117   // Error recovery helpers
    118 
    119   // Consume the rest of the current statement.  This consumes tokens
    120   // until it sees one of:
    121   //   ';'  Consumes the token and returns.
    122   //   '{'  Consumes the brace then calls SkipRestOfBlock().
    123   //   '}'  Returns without consuming.
    124   //   EOF  Returns (can't consume).
    125   // The Parser often calls SkipStatement() after encountering a syntax
    126   // error.  This allows it to go on parsing the following lines, allowing
    127   // it to report more than just one error in the file.
    128   void SkipStatement();
    129 
    130   // Consume the rest of the current block, including nested blocks,
    131   // ending after the closing '}' is encountered and consumed, or at EOF.
    132   void SkipRestOfBlock();
    133 
    134   // -----------------------------------------------------------------
    135   // Single-token consuming helpers
    136   //
    137   // These make parsing code more readable.
    138 
    139   // True if the current token is TYPE_END.
    140   inline bool AtEnd();
    141 
    142   // True if the next token matches the given text.
    143   inline bool LookingAt(const char* text);
    144   // True if the next token is of the given type.
    145   inline bool LookingAtType(io::Tokenizer::TokenType token_type);
    146 
    147   // If the next token exactly matches the text given, consume it and return
    148   // true.  Otherwise, return false without logging an error.
    149   bool TryConsume(const char* text);
    150 
    151   // These attempt to read some kind of token from the input.  If successful,
    152   // they return true.  Otherwise they return false and add the given error
    153   // to the error list.
    154 
    155   // Consume a token with the exact text given.
    156   bool Consume(const char* text, const char* error);
    157   // Same as above, but automatically generates the error "Expected \"text\".",
    158   // where "text" is the expected token text.
    159   bool Consume(const char* text);
    160   // Consume a token of type IDENTIFIER and store its text in "output".
    161   bool ConsumeIdentifier(string* output, const char* error);
    162   // Consume an integer and store its value in "output".
    163   bool ConsumeInteger(int* output, const char* error);
    164   // Consume a 64-bit integer and store its value in "output".  If the value
    165   // is greater than max_value, an error will be reported.
    166   bool ConsumeInteger64(uint64 max_value, uint64* output, const char* error);
    167   // Consume a number and store its value in "output".  This will accept
    168   // tokens of either INTEGER or FLOAT type.
    169   bool ConsumeNumber(double* output, const char* error);
    170   // Consume a string literal and store its (unescaped) value in "output".
    171   bool ConsumeString(string* output, const char* error);
    172 
    173   // -----------------------------------------------------------------
    174   // Error logging helpers
    175 
    176   // Invokes error_collector_->AddError(), if error_collector_ is not NULL.
    177   void AddError(int line, int column, const string& error);
    178 
    179   // Invokes error_collector_->AddError() with the line and column number
    180   // of the current token.
    181   void AddError(const string& error);
    182 
    183   // Record the given line and column and associate it with this descriptor
    184   // in the SourceLocationTable.
    185   void RecordLocation(const Message* descriptor,
    186                       DescriptorPool::ErrorCollector::ErrorLocation location,
    187                       int line, int column);
    188 
    189   // Record the current line and column and associate it with this descriptor
    190   // in the SourceLocationTable.
    191   void RecordLocation(const Message* descriptor,
    192                       DescriptorPool::ErrorCollector::ErrorLocation location);
    193 
    194   // =================================================================
    195   // Parsers for various language constructs
    196 
    197   // Parses the "syntax = \"proto2\";" line at the top of the file.  Returns
    198   // false if it failed to parse or if the syntax identifier was not
    199   // recognized.
    200   bool ParseSyntaxIdentifier();
    201 
    202   // These methods parse various individual bits of code.  They return
    203   // false if they completely fail to parse the construct.  In this case,
    204   // it is probably necessary to skip the rest of the statement to recover.
    205   // However, if these methods return true, it does NOT mean that there
    206   // were no errors; only that there were no *syntax* errors.  For instance,
    207   // if a service method is defined using proper syntax but uses a primitive
    208   // type as its input or output, ParseMethodField() still returns true
    209   // and only reports the error by calling AddError().  In practice, this
    210   // makes logic much simpler for the caller.
    211 
    212   // Parse a top-level message, enum, service, etc.
    213   bool ParseTopLevelStatement(FileDescriptorProto* file);
    214 
    215   // Parse various language high-level language construrcts.
    216   bool ParseMessageDefinition(DescriptorProto* message);
    217   bool ParseEnumDefinition(EnumDescriptorProto* enum_type);
    218   bool ParseServiceDefinition(ServiceDescriptorProto* service);
    219   bool ParsePackage(FileDescriptorProto* file);
    220   bool ParseImport(string* import_filename);
    221   bool ParseOption(Message* options);
    222 
    223   // These methods parse the contents of a message, enum, or service type and
    224   // add them to the given object.  They consume the entire block including
    225   // the beginning and ending brace.
    226   bool ParseMessageBlock(DescriptorProto* message);
    227   bool ParseEnumBlock(EnumDescriptorProto* enum_type);
    228   bool ParseServiceBlock(ServiceDescriptorProto* service);
    229 
    230   // Parse one statement within a message, enum, or service block, inclunding
    231   // final semicolon.
    232   bool ParseMessageStatement(DescriptorProto* message);
    233   bool ParseEnumStatement(EnumDescriptorProto* message);
    234   bool ParseServiceStatement(ServiceDescriptorProto* message);
    235 
    236   // Parse a field of a message.  If the field is a group, its type will be
    237   // added to "messages".
    238   bool ParseMessageField(FieldDescriptorProto* field,
    239                          RepeatedPtrField<DescriptorProto>* messages);
    240 
    241   // Parse an "extensions" declaration.
    242   bool ParseExtensions(DescriptorProto* message);
    243 
    244   // Parse an "extend" declaration.
    245   bool ParseExtend(RepeatedPtrField<FieldDescriptorProto>* extensions,
    246                    RepeatedPtrField<DescriptorProto>* messages);
    247 
    248   // Parse a single enum value within an enum block.
    249   bool ParseEnumConstant(EnumValueDescriptorProto* enum_value);
    250 
    251   // Parse enum constant options, i.e. the list in square brackets at the end
    252   // of the enum constant value definition.
    253   bool ParseEnumConstantOptions(EnumValueDescriptorProto* value);
    254 
    255   // Parse a single method within a service definition.
    256   bool ParseServiceMethod(MethodDescriptorProto* method);
    257 
    258   // Parse "required", "optional", or "repeated" and fill in "label"
    259   // with the value.
    260   bool ParseLabel(FieldDescriptorProto::Label* label);
    261 
    262   // Parse a type name and fill in "type" (if it is a primitive) or
    263   // "type_name" (if it is not) with the type parsed.
    264   bool ParseType(FieldDescriptorProto::Type* type,
    265                  string* type_name);
    266   // Parse a user-defined type and fill in "type_name" with the name.
    267   // If a primitive type is named, it is treated as an error.
    268   bool ParseUserDefinedType(string* type_name);
    269 
    270   // Parses field options, i.e. the stuff in square brackets at the end
    271   // of a field definition.  Also parses default value.
    272   bool ParseFieldOptions(FieldDescriptorProto* field);
    273 
    274   // Parse the "default" option.  This needs special handling because its
    275   // type is the field's type.
    276   bool ParseDefaultAssignment(FieldDescriptorProto* field);
    277 
    278   // Parse a single option name/value pair, e.g. "ctype = CORD".  The name
    279   // identifies a field of the given Message, and the value of that field
    280   // is set to the parsed value.
    281   bool ParseOptionAssignment(Message* options);
    282 
    283   // Parses a single part of a multipart option name. A multipart name consists
    284   // of names separated by dots. Each name is either an identifier or a series
    285   // of identifiers separated by dots and enclosed in parentheses. E.g.,
    286   // "foo.(bar.baz).qux".
    287   bool ParseOptionNamePart(UninterpretedOption* uninterpreted_option);
    288 
    289   // =================================================================
    290 
    291   io::Tokenizer* input_;
    292   io::ErrorCollector* error_collector_;
    293   SourceLocationTable* source_location_table_;
    294   bool had_errors_;
    295   bool require_syntax_identifier_;
    296   bool stop_after_syntax_identifier_;
    297   string syntax_identifier_;
    298 
    299   GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(Parser);
    300 };
    301 
    302 // A table mapping (descriptor, ErrorLocation) pairs -- as reported by
    303 // DescriptorPool when validating descriptors -- to line and column numbers
    304 // within the original source code.
    305 class LIBPROTOBUF_EXPORT SourceLocationTable {
    306  public:
    307   SourceLocationTable();
    308   ~SourceLocationTable();
    309 
    310   // Finds the precise location of the given error and fills in *line and
    311   // *column with the line and column numbers.  If not found, sets *line to
    312   // -1 and *column to 0 (since line = -1 is used to mean "error has no exact
    313   // location" in the ErrorCollector interface).  Returns true if found, false
    314   // otherwise.
    315   bool Find(const Message* descriptor,
    316             DescriptorPool::ErrorCollector::ErrorLocation location,
    317             int* line, int* column) const;
    318 
    319   // Adds a location to the table.
    320   void Add(const Message* descriptor,
    321            DescriptorPool::ErrorCollector::ErrorLocation location,
    322            int line, int column);
    323 
    324   // Clears the contents of the table.
    325   void Clear();
    326 
    327  private:
    328   typedef map<
    329     pair<const Message*, DescriptorPool::ErrorCollector::ErrorLocation>,
    330     pair<int, int> > LocationMap;
    331   LocationMap location_map_;
    332 };
    333 
    334 }  // namespace compiler
    335 }  // namespace protobuf
    336 
    337 }  // namespace google
    338 #endif  // GOOGLE_PROTOBUF_COMPILER_PARSER_H__
    339