Home | History | Annotate | Download | only in compiler
      1 // Protocol Buffers - Google's data interchange format
      2 // Copyright 2008 Google Inc.  All rights reserved.
      3 // https://developers.google.com/protocol-buffers/
      4 //
      5 // Redistribution and use in source and binary forms, with or without
      6 // modification, are permitted provided that the following conditions are
      7 // met:
      8 //
      9 //     * Redistributions of source code must retain the above copyright
     10 // notice, this list of conditions and the following disclaimer.
     11 //     * Redistributions in binary form must reproduce the above
     12 // copyright notice, this list of conditions and the following disclaimer
     13 // in the documentation and/or other materials provided with the
     14 // distribution.
     15 //     * Neither the name of Google Inc. nor the names of its
     16 // contributors may be used to endorse or promote products derived from
     17 // this software without specific prior written permission.
     18 //
     19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     30 
     31 // Author: kenton (at) google.com (Kenton Varda)
     32 //  Based on original Protocol Buffers design by
     33 //  Sanjay Ghemawat, Jeff Dean, and others.
     34 //
     35 // Implements parsing of .proto files to FileDescriptorProtos.
     36 
     37 #ifndef GOOGLE_PROTOBUF_COMPILER_PARSER_H__
     38 #define GOOGLE_PROTOBUF_COMPILER_PARSER_H__
     39 
     40 #include <map>
     41 #include <string>
     42 #include <utility>
     43 #include <google/protobuf/descriptor.h>
     44 #include <google/protobuf/descriptor.pb.h>
     45 #include <google/protobuf/repeated_field.h>
     46 #include <google/protobuf/io/tokenizer.h>
     47 
     48 namespace google {
     49 namespace protobuf { class Message; }
     50 
     51 namespace protobuf {
     52 namespace compiler {
     53 
     54 // Defined in this file.
     55 class Parser;
     56 class SourceLocationTable;
     57 
     58 // Implements parsing of protocol definitions (such as .proto files).
     59 //
     60 // Note that most users will be more interested in the Importer class.
     61 // Parser is a lower-level class which simply converts a single .proto file
     62 // to a FileDescriptorProto.  It does not resolve import directives or perform
     63 // many other kinds of validation needed to construct a complete
     64 // FileDescriptor.
     65 class LIBPROTOBUF_EXPORT Parser {
     66  public:
     67   Parser();
     68   ~Parser();
     69 
     70   // Parse the entire input and construct a FileDescriptorProto representing
     71   // it.  Returns true if no errors occurred, false otherwise.
     72   bool Parse(io::Tokenizer* input, FileDescriptorProto* file);
     73 
     74   // Optional features:
     75 
     76   // DEPRECATED:  New code should use the SourceCodeInfo embedded in the
     77   //   FileDescriptorProto.
     78   //
     79   // Requests that locations of certain definitions be recorded to the given
     80   // SourceLocationTable while parsing.  This can be used to look up exact line
     81   // and column numbers for errors reported by DescriptorPool during validation.
     82   // Set to NULL (the default) to discard source location information.
     83   void RecordSourceLocationsTo(SourceLocationTable* location_table) {
     84     source_location_table_ = location_table;  // NULL => locations discarded.
     85   }
     86 
     87   // Requests that errors be recorded to the given ErrorCollector while
     88   // parsing.  Set to NULL (the default) to discard error messages.
     89   void RecordErrorsTo(io::ErrorCollector* error_collector) {
     90     error_collector_ = error_collector;  // NULL => error messages discarded.
     91   }
     92 
     93   // Returns the identifier from the "syntax = " declaration seen during the
     94   // last Parse() call, or "" if none was seen.  Callable on a const Parser.
     95   const string& GetSyntaxIdentifier() const { return syntax_identifier_; }
     96 
     97   // If set true, input files will be required to begin with a syntax
     98   // identifier.  Otherwise, files may omit this.  If a syntax identifier
     99   // is provided, it must be 'syntax = "proto2";' and must appear at the
    100   // top of this file regardless of whether or not it was required.
    101   void SetRequireSyntaxIdentifier(bool value) {
    102     require_syntax_identifier_ = value;  // Only stores the flag.
    103   }
    104 
    105   // Call SetStopAfterSyntaxIdentifier(true) to tell the parser to stop
    106   // parsing as soon as it has seen the syntax identifier, or lack thereof.
    107   // This is useful for quickly identifying the syntax of the file without
    108   // parsing the whole thing.  If this is enabled, no error will be recorded
    109   // if the syntax identifier is something other than "proto2" (since
    110   // presumably the caller intends to deal with that), but other kinds of
    111   // errors (e.g. parse errors) will still be reported.  When this is enabled,
    112   // you may pass a NULL FileDescriptorProto to Parse().
    113   void SetStopAfterSyntaxIdentifier(bool value) {
    114     stop_after_syntax_identifier_ = value;  // Only stores the flag.
    115   }
    116 
    117  private:
    118   class LocationRecorder;
    119 
    120   // =================================================================
    121   // Error recovery helpers
    122 
    123   // Consume the rest of the current statement.  This consumes tokens
    124   // until it sees one of:
    125   //   ';'  Consumes the token and returns.
    126   //   '{'  Consumes the brace then calls SkipRestOfBlock().
    127   //   '}'  Returns without consuming.
    128   //   EOF  Returns (can't consume).
    129   // The Parser often calls SkipStatement() after encountering a syntax
    130   // error.  This allows it to go on parsing the following lines, allowing
    131   // it to report more than just one error in the file.
    132   void SkipStatement();
    133 
    134   // Consume the rest of the current block, including nested blocks,
    135   // ending after the closing '}' is encountered and consumed, or at EOF.
    136   void SkipRestOfBlock();
    137 
    138   // -----------------------------------------------------------------
    139   // Single-token consuming helpers
    140   //
    141   // These make parsing code more readable.
    142 
    143   // True if the current token is TYPE_END.
    144   inline bool AtEnd();
    145 
    146   // True if the next token matches the given text.
    147   inline bool LookingAt(const char* text);
    148   // True if the next token is of the given type.
    149   inline bool LookingAtType(io::Tokenizer::TokenType token_type);
    150 
    151   // If the next token exactly matches the text given, consume it and return
    152   // true.  Otherwise, return false without logging an error.
    153   bool TryConsume(const char* text);
    154 
    155   // These attempt to read some kind of token from the input.  If successful,
    156   // they return true.  Otherwise they return false and add the given error
    157   // to the error list.
    158 
    159   // Consume a token with the exact text given.
    160   bool Consume(const char* text, const char* error);
    161   // Same as above, but automatically generates the error "Expected \"text\".",
    162   // where "text" is the expected token text.
    163   bool Consume(const char* text);
    164   // Consume a token of type IDENTIFIER and store its text in "output".
    165   bool ConsumeIdentifier(string* output, const char* error);
    166   // Consume an integer and store its value in "output".
    167   bool ConsumeInteger(int* output, const char* error);
    168   // Consume a signed integer and store its value in "output".
    169   bool ConsumeSignedInteger(int* output, const char* error);
    170   // Consume a 64-bit integer and store its value in "output".  If the value
    171   // is greater than max_value, an error will be reported.
    172   bool ConsumeInteger64(uint64 max_value, uint64* output, const char* error);
    173   // Consume a number and store its value in "output".  This will accept
    174   // tokens of either INTEGER or FLOAT type.
    175   bool ConsumeNumber(double* output, const char* error);
    176   // Consume a string literal and store its (unescaped) value in "output".
    177   bool ConsumeString(string* output, const char* error);
    178 
    179   // Consume a token representing the end of the statement.  Comments between
    180   // this token and the next will be harvested for documentation.  The given
    181   // LocationRecorder should refer to the declaration that was just parsed;
    182   // it will be populated with these comments.
    183   //
    184   // TODO(kenton):  The LocationRecorder is const because historically locations
    185   //   have been passed around by const reference, for no particularly good
    186   //   reason.  We should probably go through and change them all to mutable
    187   //   pointer to make this more intuitive.
    188   bool TryConsumeEndOfDeclaration(const char* text,
    189                                   const LocationRecorder* location);
    190   bool ConsumeEndOfDeclaration(const char* text,
    191                                const LocationRecorder* location);
    192 
    193   // -----------------------------------------------------------------
    194   // Error logging helpers
    195 
    196   // Invokes error_collector_->AddError(), if error_collector_ is not NULL.
    197   void AddError(int line, int column, const string& error);
    198 
    199   // Invokes error_collector_->AddError() with the line and column number
    200   // of the current token.
    201   void AddError(const string& error);
    202 
    203   // Records a location in the SourceCodeInfo.location table (see
    204   // descriptor.proto).  We use RAII to ensure that the start and end locations
    205   // are recorded -- the constructor records the start location and the
    206   // destructor records the end location.  Since the parser is
    207   // recursive-descent, this works out beautifully.
    208   class LIBPROTOBUF_EXPORT LocationRecorder {
    209    public:
    210     // Construct the file's "root" location.  NOTE(review): not explicit.
    211     LocationRecorder(Parser* parser);
    212 
    213     // Construct a location for a declaration nested within the given parent,
    214     // e.g. a field's location nested within its message type's location.
    215     // NOTE: despite the copy-constructor signature, this creates a child.
    216     // The parent's path is copied, so call AddPath() only for the components
    217     // leading from the parent to the child (not from the root to the child).
    218     LocationRecorder(const LocationRecorder& parent);
    219 
    220     // Convenience constructors that call AddPath() one or two times.
    221     LocationRecorder(const LocationRecorder& parent, int path1);
    222     LocationRecorder(const LocationRecorder& parent, int path1, int path2);
    223 
    224     ~LocationRecorder();
    225 
    226     // Add a path component.  See SourceCodeInfo.Location.path in
    227     // descriptor.proto.
    228     void AddPath(int path_component);
    229 
    230     // By default the location is considered to start at the current token at
    231     // the time the LocationRecorder is created.  StartAt() sets the start
    232     // location to the given token instead.
    233     void StartAt(const io::Tokenizer::Token& token);
    234 
    235     // Start at the same location as some other LocationRecorder.
    236     void StartAt(const LocationRecorder& other);
    237 
    238     // By default the location is considered to end at the previous token at
    239     // the time the LocationRecorder is destroyed.  EndAt() sets the end
    240     // location to the given token instead.
    241     void EndAt(const io::Tokenizer::Token& token);
    242 
    243     // Records the start point of this location to the SourceLocationTable that
    244     // was passed to RecordSourceLocationsTo(), if any.  SourceLocationTable
    245     // is an older way of keeping track of source locations which is still
    246     // used in some places.
    247     void RecordLegacyLocation(const Message* descriptor,
    248         DescriptorPool::ErrorCollector::ErrorLocation location);
    249 
    250     // Attaches leading and trailing comments to the location.  The two strings
    251     // will be swapped into place, so after this is called *leading and
    252     // *trailing will be empty.
    253     //
    254     // TODO(kenton):  See comment on TryConsumeEndOfDeclaration(), above, for
    255     //   why this is const.
    256     void AttachComments(string* leading, string* trailing) const;
    257 
    258    private:
    259     Parser* parser_;  // Presumably the Parser given to the root recorder.
    260     SourceCodeInfo::Location* location_;  // Presumably the entry being filled.
    261 
    262     void Init(const LocationRecorder& parent);  // Presumably shared ctor setup.
    263   };
    264 
    265   // =================================================================
    266   // Parsers for various language constructs
    267 
    268   // Parses the "syntax = \"proto2\";" line at the top of the file.  Returns
    269   // false if it failed to parse or if the syntax identifier was not
    270   // recognized.
    271   bool ParseSyntaxIdentifier();
    272 
    273   // These methods parse various individual bits of code.  They return
    274   // false if they completely fail to parse the construct.  In this case,
    275   // it is probably necessary to skip the rest of the statement to recover.
    276   // However, if these methods return true, it does NOT mean that there
    277   // were no errors; only that there were no *syntax* errors.  For instance,
    278   // if a service method is defined using proper syntax but uses a primitive
    279   // type as its input or output, ParseServiceMethod() still returns true
    280   // and only reports the error by calling AddError().  In practice, this
    281   // makes logic much simpler for the caller.
    282 
    283   // Parse a top-level message, enum, service, etc.
    284   bool ParseTopLevelStatement(FileDescriptorProto* file,
    285                               const LocationRecorder& root_location);
    286 
    287   // Parse various high-level language constructs.
    288   bool ParseMessageDefinition(DescriptorProto* message,
    289                               const LocationRecorder& message_location,
    290                               const FileDescriptorProto* containing_file);
    291   bool ParseEnumDefinition(EnumDescriptorProto* enum_type,
    292                            const LocationRecorder& enum_location,
    293                            const FileDescriptorProto* containing_file);
    294   bool ParseServiceDefinition(ServiceDescriptorProto* service,
    295                               const LocationRecorder& service_location,
    296                               const FileDescriptorProto* containing_file);
    297   bool ParsePackage(FileDescriptorProto* file,
    298                     const LocationRecorder& root_location,
    299                     const FileDescriptorProto* containing_file);
    300   bool ParseImport(RepeatedPtrField<string>* dependency,
    301                    RepeatedField<int32>* public_dependency,
    302                    RepeatedField<int32>* weak_dependency,
    303                    const LocationRecorder& root_location,
    304                    const FileDescriptorProto* containing_file);
    305   bool ParseOption(Message* options,
    306                    const LocationRecorder& options_location,
    307                    const FileDescriptorProto* containing_file);
    308 
    309   // These methods parse the contents of a message, enum, or service type and
    310   // add them to the given object.  They consume the entire block including
    311   // the beginning and ending brace.
    312   bool ParseMessageBlock(DescriptorProto* message,
    313                          const LocationRecorder& message_location,
    314                          const FileDescriptorProto* containing_file);
    315   bool ParseEnumBlock(EnumDescriptorProto* enum_type,
    316                       const LocationRecorder& enum_location,
    317                       const FileDescriptorProto* containing_file);
    318   bool ParseServiceBlock(ServiceDescriptorProto* service,
    319                          const LocationRecorder& service_location,
    320                          const FileDescriptorProto* containing_file);
    321 
    322   // Parse one statement within a message, enum, or service block, including
    323   // final semicolon.
    324   bool ParseMessageStatement(DescriptorProto* message,
    325                              const LocationRecorder& message_location,
    326                              const FileDescriptorProto* containing_file);
    327   bool ParseEnumStatement(EnumDescriptorProto* message,
    328                           const LocationRecorder& enum_location,
    329                           const FileDescriptorProto* containing_file);
    330   bool ParseServiceStatement(ServiceDescriptorProto* message,
    331                              const LocationRecorder& service_location,
    332                              const FileDescriptorProto* containing_file);
    333 
    334   // Parse a field of a message.  If the field is a group, its type will be
    335   // added to "messages".
    336   //
    337   // parent_location and location_field_number_for_nested_type are needed when
    338   // parsing groups -- we need to generate a nested message type within the
    339   // parent and record its location accordingly.  Since the parent could be
    340   // either a FileDescriptorProto or a DescriptorProto, we must pass in the
    341   // correct field number to use.
    342   bool ParseMessageField(FieldDescriptorProto* field,
    343                          RepeatedPtrField<DescriptorProto>* messages,
    344                          const LocationRecorder& parent_location,
    345                          int location_field_number_for_nested_type,
    346                          const LocationRecorder& field_location,
    347                          const FileDescriptorProto* containing_file);
    348 
    349   // Like ParseMessageField() but expects the label has already been filled in
    350   // by the caller.
    351   bool ParseMessageFieldNoLabel(FieldDescriptorProto* field,
    352                                 RepeatedPtrField<DescriptorProto>* messages,
    353                                 const LocationRecorder& parent_location,
    354                                 int location_field_number_for_nested_type,
    355                                 const LocationRecorder& field_location,
    356                                 const FileDescriptorProto* containing_file);
    357 
    358   // Parse an "extensions" declaration.
    359   bool ParseExtensions(DescriptorProto* message,
    360                        const LocationRecorder& extensions_location,
    361                        const FileDescriptorProto* containing_file);
    362 
    363   // Parse an "extend" declaration.  (See also comments for
    364   // ParseMessageField().)
    365   bool ParseExtend(RepeatedPtrField<FieldDescriptorProto>* extensions,
    366                    RepeatedPtrField<DescriptorProto>* messages,
    367                    const LocationRecorder& parent_location,
    368                    int location_field_number_for_nested_type,
    369                    const LocationRecorder& extend_location,
    370                    const FileDescriptorProto* containing_file);
    371 
    372   // Parse a "oneof" declaration.  The caller is responsible for setting
    373   // oneof_decl->label() since it will have had to parse the label before it
    374   // knew it was parsing a oneof.
    375   bool ParseOneof(OneofDescriptorProto* oneof_decl,
    376                   DescriptorProto* containing_type,
    377                   int oneof_index,
    378                   const LocationRecorder& oneof_location,
    379                   const LocationRecorder& containing_type_location,
    380                   const FileDescriptorProto* containing_file);
    381 
    382   // Parse a single enum value within an enum block.
    383   bool ParseEnumConstant(EnumValueDescriptorProto* enum_value,
    384                          const LocationRecorder& enum_value_location,
    385                          const FileDescriptorProto* containing_file);
    386 
    387   // Parse enum constant options, i.e. the list in square brackets at the end
    388   // of the enum constant value definition.
    389   bool ParseEnumConstantOptions(EnumValueDescriptorProto* value,
    390                                 const LocationRecorder& enum_value_location,
    391                                 const FileDescriptorProto* containing_file);
    392 
    393   // Parse a single method within a service definition.
    394   bool ParseServiceMethod(MethodDescriptorProto* method,
    395                           const LocationRecorder& method_location,
    396                           const FileDescriptorProto* containing_file);
    397 
    398 
    399   // Parse options of a single method or stream.
    400   bool ParseOptions(const LocationRecorder& parent_location,
    401                     const FileDescriptorProto* containing_file,
    402                     const int optionsFieldNumber,
    403                     Message* mutable_options);
    404 
    405   // Parse "required", "optional", or "repeated" and fill in "label"
    406   // with the value.
    407   bool ParseLabel(FieldDescriptorProto::Label* label,
    408                   const FileDescriptorProto* containing_file);
    409 
    410   // Parse a type name and fill in "type" (if it is a primitive) or
    411   // "type_name" (if it is not) with the type parsed.
    412   bool ParseType(FieldDescriptorProto::Type* type,
    413                  string* type_name);
    414   // Parse a user-defined type and fill in "type_name" with the name.
    415   // If a primitive type is named, it is treated as an error.
    416   bool ParseUserDefinedType(string* type_name);
    417 
    418   // Parses field options, i.e. the stuff in square brackets at the end
    419   // of a field definition.  Also parses default value.
    420   bool ParseFieldOptions(FieldDescriptorProto* field,
    421                          const LocationRecorder& field_location,
    422                          const FileDescriptorProto* containing_file);
    423 
    424   // Parse the "default" option.  This needs special handling because its
    425   // type is the field's type.
    426   bool ParseDefaultAssignment(FieldDescriptorProto* field,
    427                               const LocationRecorder& field_location,
    428                               const FileDescriptorProto* containing_file);
    429 
    430   enum OptionStyle {
    431     OPTION_ASSIGNMENT,  // Bare assignment form: "name = value"
    432     OPTION_STATEMENT    // Full statement form: "option name = value;"
    433   };
    434 
    435   // Parse a single option name/value pair, e.g. "ctype = CORD".  The name
    436   // identifies a field of the given Message, and the value of that field
    437   // is set to the parsed value.
    438   bool ParseOption(Message* options,
    439                    const LocationRecorder& options_location,
    440                    const FileDescriptorProto* containing_file,
    441                    OptionStyle style);
    442 
    443   // Parses a single part of a multipart option name. A multipart name consists
    444   // of names separated by dots. Each name is either an identifier or a series
    445   // of identifiers separated by dots and enclosed in parentheses. E.g.,
    446   // "foo.(bar.baz).qux".
    447   bool ParseOptionNamePart(UninterpretedOption* uninterpreted_option,
    448                            const LocationRecorder& part_location,
    449                            const FileDescriptorProto* containing_file);
    450 
    451   // Parses a string surrounded by balanced braces.  Strips off the outer
    452   // braces and stores the enclosed string in *value.
    453   // E.g.,
    454   //     { foo }                     *value gets 'foo'
    455   //     { foo { bar: box } }        *value gets 'foo { bar: box }'
    456   //     {}                          *value gets ''
    457   //
    458   // REQUIRES: LookingAt("{")
    459   // When finished successfully, we are looking at the first token past
    460   // the ending brace.
    461   bool ParseUninterpretedBlock(string* value);
    462 
    463   // =================================================================
    464 
    465   io::Tokenizer* input_;
    466   io::ErrorCollector* error_collector_;
    467   SourceCodeInfo* source_code_info_;
    468   SourceLocationTable* source_location_table_;  // legacy
    469   bool had_errors_;
    470   bool require_syntax_identifier_;
    471   bool stop_after_syntax_identifier_;
    472   string syntax_identifier_;
    473 
    474   // Leading doc comments for the next declaration.  These are not complete
    475   // yet; use ConsumeEndOfDeclaration() to get the complete comments.
    476   string upcoming_doc_comments_;
    477 
    478   GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(Parser);
    479 };
    480 
    481 // A table mapping (descriptor, ErrorLocation) pairs -- as reported by
    482 // DescriptorPool when validating descriptors -- to line and column numbers
    483 // within the original source code.
    484 //
    485 // This is semi-obsolete:  FileDescriptorProto.source_code_info now contains
    486 // far more complete information about source locations.  However, as of this
    487 // writing you still need to use SourceLocationTable when integrating with
    488 // DescriptorPool.
    489 class LIBPROTOBUF_EXPORT SourceLocationTable {
    490  public:
    491   SourceLocationTable();
    492   ~SourceLocationTable();
    493 
    494   // Finds the precise location of the given error and fills in *line and
    495   // *column with the line and column numbers.  If not found, sets *line to
    496   // -1 and *column to 0 (since line = -1 is used to mean "error has no exact
    497   // location" in the ErrorCollector interface).  Returns true if found, false
    498   // otherwise.
    499   bool Find(const Message* descriptor,
    500             DescriptorPool::ErrorCollector::ErrorLocation location,
    501             int* line, int* column) const;
    502 
    503   // Records (line, column) for the given (descriptor, location) pair.
    504   void Add(const Message* descriptor,
    505            DescriptorPool::ErrorCollector::ErrorLocation location,
    506            int line, int column);
    507 
    508   // Clears the contents of the table.
    509   void Clear();
    510 
    511  private:
    512   typedef map<
    513     pair<const Message*, DescriptorPool::ErrorCollector::ErrorLocation>,  // key
    514     pair<int, int> > LocationMap;  // value: presumably (line, column)
    515   LocationMap location_map_;  // Presumably backs Find()/Add()/Clear().
    516 };
    517 
    518 }  // namespace compiler
    519 }  // namespace protobuf
    520 
    521 }  // namespace google
    522 #endif  // GOOGLE_PROTOBUF_COMPILER_PARSER_H__
    523