Home | History | Annotate | Download | only in compiler
      1 // Protocol Buffers - Google's data interchange format
      2 // Copyright 2008 Google Inc.  All rights reserved.
      3 // http://code.google.com/p/protobuf/
      4 //
      5 // Redistribution and use in source and binary forms, with or without
      6 // modification, are permitted provided that the following conditions are
      7 // met:
      8 //
      9 //     * Redistributions of source code must retain the above copyright
     10 // notice, this list of conditions and the following disclaimer.
     11 //     * Redistributions in binary form must reproduce the above
     12 // copyright notice, this list of conditions and the following disclaimer
     13 // in the documentation and/or other materials provided with the
     14 // distribution.
     15 //     * Neither the name of Google Inc. nor the names of its
     16 // contributors may be used to endorse or promote products derived from
     17 // this software without specific prior written permission.
     18 //
     19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     30 
     31 // Author: kenton (at) google.com (Kenton Varda)
     32 //  Based on original Protocol Buffers design by
     33 //  Sanjay Ghemawat, Jeff Dean, and others.
     34 //
     35 // Implements parsing of .proto files to FileDescriptorProtos.
     36 
     37 #ifndef GOOGLE_PROTOBUF_COMPILER_PARSER_H__
     38 #define GOOGLE_PROTOBUF_COMPILER_PARSER_H__
     39 
     40 #include <map>
     41 #include <string>
     42 #include <utility>
     43 #include <google/protobuf/stubs/common.h>
     44 #include <google/protobuf/descriptor.h>
     45 #include <google/protobuf/descriptor.pb.h>
     46 #include <google/protobuf/repeated_field.h>
     47 #include <google/protobuf/io/tokenizer.h>
     48 
     49 namespace google {
     50 namespace protobuf { class Message; }
     51 
     52 namespace protobuf {
     53 namespace compiler {
     54 
     55 // Forward declarations of the two classes defined later in this header.
     56 class Parser;
     57 class SourceLocationTable;
     58 
     59 // Implements parsing of protocol definitions (such as .proto files).
     60 //
     61 // Note that most users will be more interested in the Importer class.
     62 // Parser is a lower-level class which simply converts a single .proto file
     63 // to a FileDescriptorProto.  It does not resolve import directives or perform
     64 // many other kinds of validation needed to construct a complete
     65 // FileDescriptor.
     66 class LIBPROTOBUF_EXPORT Parser {
     67  public:
     68   Parser();
     69   ~Parser();
     70 
     71   // Parse the entire input and construct a FileDescriptorProto representing
     72   // it.  Returns true if no errors occurred, false otherwise.
     73   bool Parse(io::Tokenizer* input, FileDescriptorProto* file);
     74 
     75   // Optional features:
     76 
     77   // DEPRECATED:  New code should use the SourceCodeInfo embedded in the
     78   //   FileDescriptorProto.
     79   //
     80   // Requests that locations of certain definitions be recorded to the given
     81   // SourceLocationTable while parsing.  This can be used to look up exact line
     82   // and column numbers for errors reported by DescriptorPool during validation.
     83   // Set to NULL (the default) to discard source location information.
     84   void RecordSourceLocationsTo(SourceLocationTable* location_table) {
     85     source_location_table_ = location_table;
     86   }
     87 
     88   // Requests that errors be recorded to the given ErrorCollector while
     89   // parsing.  Set to NULL (the default) to discard error messages.
     90   void RecordErrorsTo(io::ErrorCollector* error_collector) {
     91     error_collector_ = error_collector;
     92   }
     93 
     94   // Returns the identifier used in the "syntax = " declaration, if one was
     95   // seen during the last call to Parse(), or the empty string otherwise.
     96   const std::string& GetSyntaxIdentifier() const { return syntax_identifier_; }
     97 
     98   // If set true, input files will be required to begin with a syntax
     99   // identifier.  Otherwise, files may omit this.  If a syntax identifier
    100   // is provided, it must be 'syntax = "proto2";' and must appear at the
    101   // top of this file regardless of whether or not it was required.
    102   void SetRequireSyntaxIdentifier(bool value) {
    103     require_syntax_identifier_ = value;
    104   }
    105 
    106   // Call SetStopAfterSyntaxIdentifier(true) to tell the parser to stop
    107   // parsing as soon as it has seen the syntax identifier, or lack thereof.
    108   // This is useful for quickly identifying the syntax of the file without
    109   // parsing the whole thing.  If this is enabled, no error will be recorded
    110   // if the syntax identifier is something other than "proto2" (since
    111   // presumably the caller intends to deal with that), but other kinds of
    112   // errors (e.g. parse errors) will still be reported.  When this is enabled,
    113   // you may pass a NULL FileDescriptorProto to Parse().
    114   void SetStopAfterSyntaxIdentifier(bool value) {
    115     stop_after_syntax_identifier_ = value;
    116   }
    117 
    118  private:
    119   class LocationRecorder;
    120 
    121   // =================================================================
    122   // Error recovery helpers
    123 
    124   // Consume the rest of the current statement.  This consumes tokens
    125   // until it sees one of:
    126   //   ';'  Consumes the token and returns.
    127   //   '{'  Consumes the brace then calls SkipRestOfBlock().
    128   //   '}'  Returns without consuming.
    129   //   EOF  Returns (can't consume).
    130   // The Parser often calls SkipStatement() after encountering a syntax
    131   // error.  This allows it to go on parsing the following lines, allowing
    132   // it to report more than just one error in the file.
    133   void SkipStatement();
    134 
    135   // Consume the rest of the current block, including nested blocks,
    136   // ending after the closing '}' is encountered and consumed, or at EOF.
    137   void SkipRestOfBlock();
    138 
    139   // -----------------------------------------------------------------
    140   // Single-token consuming helpers
    141   //
    142   // These make parsing code more readable.
    143 
    144   // True if the current token is TYPE_END.
    145   inline bool AtEnd();
    146 
    147   // True if the next token matches the given text.
    148   inline bool LookingAt(const char* text);
    149   // True if the next token is of the given type.
    150   inline bool LookingAtType(io::Tokenizer::TokenType token_type);
    151 
    152   // If the next token exactly matches the text given, consume it and return
    153   // true.  Otherwise, return false without logging an error.
    154   bool TryConsume(const char* text);
    155 
    156   // These attempt to read some kind of token from the input.  If successful,
    157   // they return true.  Otherwise they return false and add the given error
    158   // to the error list.
    159 
    160   // Consume a token with the exact text given.
    161   bool Consume(const char* text, const char* error);
    162   // Same as above, but automatically generates the error "Expected \"text\".",
    163   // where "text" is the expected token text.
    164   bool Consume(const char* text);
    165   // Consume a token of type IDENTIFIER and store its text in "output".
    166   bool ConsumeIdentifier(std::string* output, const char* error);
    167   // Consume an integer and store its value in "output".
    168   bool ConsumeInteger(int* output, const char* error);
    169   // Consume a signed integer and store its value in "output".
    170   bool ConsumeSignedInteger(int* output, const char* error);
    171   // Consume a 64-bit integer and store its value in "output".  If the value
    172   // is greater than max_value, an error will be reported.
    173   bool ConsumeInteger64(uint64 max_value, uint64* output, const char* error);
    174   // Consume a number and store its value in "output".  This will accept
    175   // tokens of either INTEGER or FLOAT type.
    176   bool ConsumeNumber(double* output, const char* error);
    177   // Consume a string literal and store its (unescaped) value in "output".
    178   bool ConsumeString(std::string* output, const char* error);
    179 
    180   // Consume a token representing the end of the statement.  Comments between
    181   // this token and the next will be harvested for documentation.  The given
    182   // LocationRecorder should refer to the declaration that was just parsed;
    183   // it will be populated with these comments.
    184   //
    185   // TODO(kenton):  The LocationRecorder is const because historically locations
    186   //   have been passed around by const reference, for no particularly good
    187   //   reason.  We should probably go through and change them all to mutable
    188   //   pointer to make this more intuitive.
    189   bool TryConsumeEndOfDeclaration(const char* text,
    190                                   const LocationRecorder* location);
    191   bool ConsumeEndOfDeclaration(const char* text,
    192                                const LocationRecorder* location);
    193 
    194   // -----------------------------------------------------------------
    195   // Error logging helpers
    196 
    197   // Invokes error_collector_->AddError(), if error_collector_ is not NULL.
    198   void AddError(int line, int column, const std::string& error);
    199 
    200   // Invokes error_collector_->AddError() with the line and column number
    201   // of the current token.
    202   void AddError(const std::string& error);
    203 
    204   // Records a location in the SourceCodeInfo.location table (see
    205   // descriptor.proto).  We use RAII to ensure that the start and end locations
    206   // are recorded -- the constructor records the start location and the
    207   // destructor records the end location.  Since the parser is
    208   // recursive-descent, this works out beautifully.
    209   class LIBPROTOBUF_EXPORT LocationRecorder {
    210    public:
    211     // Construct the file's "root" location.
    212     LocationRecorder(Parser* parser);
    213 
    214     // Construct a location that represents a declaration nested within the
    215     // given parent.  E.g. a field's location is nested within the location
    216     // for a message type.  The parent's path will be copied, so you should
    217     // call AddPath() only to add the path components leading from the parent
    218     // to the child (as opposed to leading from the root to the child).
    219     LocationRecorder(const LocationRecorder& parent);
    220 
    221     // Convenience constructors that call AddPath() one or two times.
    222     LocationRecorder(const LocationRecorder& parent, int path1);
    223     LocationRecorder(const LocationRecorder& parent, int path1, int path2);
    224 
    225     ~LocationRecorder();
    226 
    227     // Add a path component.  See SourceCodeInfo.Location.path in
    228     // descriptor.proto.
    229     void AddPath(int path_component);
    230 
    231     // By default the location is considered to start at the current token at
    232     // the time the LocationRecorder is created.  StartAt() sets the start
    233     // location to the given token instead.
    234     void StartAt(const io::Tokenizer::Token& token);
    235 
    236     // By default the location is considered to end at the previous token at
    237     // the time the LocationRecorder is destroyed.  EndAt() sets the end
    238     // location to the given token instead.
    239     void EndAt(const io::Tokenizer::Token& token);
    240 
    241     // Records the start point of this location to the SourceLocationTable that
    242     // was passed to RecordSourceLocationsTo(), if any.  SourceLocationTable
    243     // is an older way of keeping track of source locations which is still
    244     // used in some places.
    245     void RecordLegacyLocation(const Message* descriptor,
    246         DescriptorPool::ErrorCollector::ErrorLocation location);
    247 
    248     // Attaches leading and trailing comments to the location.  The two strings
    249     // will be swapped into place, so after this is called *leading and
    250     // *trailing will be empty.
    251     //
    252     // TODO(kenton):  See comment on TryConsumeEndOfDeclaration(), above, for
    253     //   why this is const.
    254     void AttachComments(std::string* leading, std::string* trailing) const;
    255 
    256    private:
    257     Parser* parser_;
    258     SourceCodeInfo::Location* location_;
    259 
    260     void Init(const LocationRecorder& parent);
    261   };
    262 
    263   // =================================================================
    264   // Parsers for various language constructs
    265 
    266   // Parses the "syntax = \"proto2\";" line at the top of the file.  Returns
    267   // false if it failed to parse or if the syntax identifier was not
    268   // recognized.
    269   bool ParseSyntaxIdentifier();
    270 
    271   // These methods parse various individual bits of code.  They return
    272   // false if they completely fail to parse the construct.  In this case,
    273   // it is probably necessary to skip the rest of the statement to recover.
    274   // However, if these methods return true, it does NOT mean that there
    275   // were no errors; only that there were no *syntax* errors.  For instance,
    276   // if a service method is defined using proper syntax but uses a primitive
    277   // type as its input or output, ParseServiceMethod() still returns true
    278   // and only reports the error by calling AddError().  In practice, this
    279   // makes logic much simpler for the caller.
    280 
    281   // Parse a top-level message, enum, service, etc.
    282   bool ParseTopLevelStatement(FileDescriptorProto* file,
    283                               const LocationRecorder& root_location);
    284 
    285   // Parse various high-level language constructs.
    286   bool ParseMessageDefinition(DescriptorProto* message,
    287                               const LocationRecorder& message_location);
    288   bool ParseEnumDefinition(EnumDescriptorProto* enum_type,
    289                            const LocationRecorder& enum_location);
    290   bool ParseServiceDefinition(ServiceDescriptorProto* service,
    291                               const LocationRecorder& service_location);
    292   bool ParsePackage(FileDescriptorProto* file,
    293                     const LocationRecorder& root_location);
    294   bool ParseImport(RepeatedPtrField<std::string>* dependency,
    295                    RepeatedField<int32>* public_dependency,
    296                    RepeatedField<int32>* weak_dependency,
    297                    const LocationRecorder& root_location);
    298   bool ParseOption(Message* options,
    299                    const LocationRecorder& options_location);
    300 
    301   // These methods parse the contents of a message, enum, or service type and
    302   // add them to the given object.  They consume the entire block including
    303   // the beginning and ending brace.
    304   bool ParseMessageBlock(DescriptorProto* message,
    305                          const LocationRecorder& message_location);
    306   bool ParseEnumBlock(EnumDescriptorProto* enum_type,
    307                       const LocationRecorder& enum_location);
    308   bool ParseServiceBlock(ServiceDescriptorProto* service,
    309                          const LocationRecorder& service_location);
    310 
    311   // Parse one statement within a message, enum, or service block, including
    312   // final semicolon.
    313   bool ParseMessageStatement(DescriptorProto* message,
    314                              const LocationRecorder& message_location);
    315   bool ParseEnumStatement(EnumDescriptorProto* enum_type,
    316                           const LocationRecorder& enum_location);
    317   bool ParseServiceStatement(ServiceDescriptorProto* service,
    318                              const LocationRecorder& service_location);
    319 
    320   // Parse a field of a message.  If the field is a group, its type will be
    321   // added to "messages".
    322   //
    323   // parent_location and location_field_number_for_nested_type are needed when
    324   // parsing groups -- we need to generate a nested message type within the
    325   // parent and record its location accordingly.  Since the parent could be
    326   // either a FileDescriptorProto or a DescriptorProto, we must pass in the
    327   // correct field number to use.
    328   bool ParseMessageField(FieldDescriptorProto* field,
    329                          RepeatedPtrField<DescriptorProto>* messages,
    330                          const LocationRecorder& parent_location,
    331                          int location_field_number_for_nested_type,
    332                          const LocationRecorder& field_location);
    333 
    334   // Parse an "extensions" declaration.
    335   bool ParseExtensions(DescriptorProto* message,
    336                        const LocationRecorder& extensions_location);
    337 
    338   // Parse an "extend" declaration.  (See also comments for
    339   // ParseMessageField().)
    340   bool ParseExtend(RepeatedPtrField<FieldDescriptorProto>* extensions,
    341                    RepeatedPtrField<DescriptorProto>* messages,
    342                    const LocationRecorder& parent_location,
    343                    int location_field_number_for_nested_type,
    344                    const LocationRecorder& extend_location);
    345 
    346   // Parse a single enum value within an enum block.
    347   bool ParseEnumConstant(EnumValueDescriptorProto* enum_value,
    348                          const LocationRecorder& enum_value_location);
    349 
    350   // Parse enum constant options, i.e. the list in square brackets at the end
    351   // of the enum constant value definition.
    352   bool ParseEnumConstantOptions(EnumValueDescriptorProto* value,
    353                                 const LocationRecorder& enum_value_location);
    354 
    355   // Parse a single method within a service definition.
    356   bool ParseServiceMethod(MethodDescriptorProto* method,
    357                           const LocationRecorder& method_location);
    358 
    359 
    360   // Parse options of a single method or stream.
    361   bool ParseOptions(const LocationRecorder& parent_location,
    362                     int options_field_number,
    363                     Message* mutable_options);
    364 
    365   // Parse "required", "optional", or "repeated" and fill in "label"
    366   // with the value.
    367   bool ParseLabel(FieldDescriptorProto::Label* label);
    368 
    369   // Parse a type name and fill in "type" (if it is a primitive) or
    370   // "type_name" (if it is not) with the type parsed.
    371   bool ParseType(FieldDescriptorProto::Type* type,
    372                  std::string* type_name);
    373   // Parse a user-defined type and fill in "type_name" with the name.
    374   // If a primitive type is named, it is treated as an error.
    375   bool ParseUserDefinedType(std::string* type_name);
    376 
    377   // Parses field options, i.e. the stuff in square brackets at the end
    378   // of a field definition.  Also parses default value.
    379   bool ParseFieldOptions(FieldDescriptorProto* field,
    380                          const LocationRecorder& field_location);
    381 
    382   // Parse the "default" option.  This needs special handling because its
    383   // type is the field's type.
    384   bool ParseDefaultAssignment(FieldDescriptorProto* field,
    385                               const LocationRecorder& field_location);
    386 
    387   enum OptionStyle {
    388     OPTION_ASSIGNMENT,  // just "name = value"
    389     OPTION_STATEMENT    // "option name = value;"
    390   };
    391 
    392   // Parse a single option name/value pair, e.g. "ctype = CORD".  The name
    393   // identifies a field of the given Message, and the value of that field
    394   // is set to the parsed value.
    395   bool ParseOption(Message* options,
    396                    const LocationRecorder& options_location,
    397                    OptionStyle style);
    398 
    399   // Parses a single part of a multipart option name. A multipart name consists
    400   // of names separated by dots. Each name is either an identifier or a series
    401   // of identifiers separated by dots and enclosed in parentheses. E.g.,
    402   // "foo.(bar.baz).qux".
    403   bool ParseOptionNamePart(UninterpretedOption* uninterpreted_option,
    404                            const LocationRecorder& part_location);
    405 
    406   // Parses a string surrounded by balanced braces.  Strips off the outer
    407   // braces and stores the enclosed string in *value.
    408   // E.g.,
    409   //     { foo }                     *value gets 'foo'
    410   //     { foo { bar: box } }        *value gets 'foo { bar: box }'
    411   //     {}                          *value gets ''
    412   //
    413   // REQUIRES: LookingAt("{")
    414   // When finished successfully, we are looking at the first token past
    415   // the ending brace.
    416   bool ParseUninterpretedBlock(std::string* value);
    417 
    418   // =================================================================
    419 
    420   io::Tokenizer* input_;
    421   io::ErrorCollector* error_collector_;
    422   SourceCodeInfo* source_code_info_;
    423   SourceLocationTable* source_location_table_;  // legacy
    424   bool had_errors_;
    425   bool require_syntax_identifier_;
    426   bool stop_after_syntax_identifier_;
    427   std::string syntax_identifier_;
    428 
    429   // Leading doc comments for the next declaration.  These are not complete
    430   // yet; use ConsumeEndOfDeclaration() to get the complete comments.
    431   std::string upcoming_doc_comments_;
    432 
    433   GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(Parser);
    434 };
    435 
    436 // A table mapping (descriptor, ErrorLocation) pairs -- as reported by
    437 // DescriptorPool when validating descriptors -- to line and column numbers
    438 // within the original source code.
    439 //
    440 // This is semi-obsolete:  FileDescriptorProto.source_code_info now contains
    441 // far more complete information about source locations.  However, as of this
    442 // writing you still need to use SourceLocationTable when integrating with
    443 // DescriptorPool.
    444 class LIBPROTOBUF_EXPORT SourceLocationTable {
    445  public:
    446   SourceLocationTable();
    447   ~SourceLocationTable();
    448 
    449   // Finds the precise location of the given error and fills in *line and
    450   // *column with the line and column numbers.  If not found, sets *line to
    451   // -1 and *column to 0 (since line = -1 is used to mean "error has no exact
    452   // location" in the ErrorCollector interface).  Returns true if found, false
    453   // otherwise.
    454   bool Find(const Message* descriptor,
    455             DescriptorPool::ErrorCollector::ErrorLocation location,
    456             int* line, int* column) const;
    457 
    458   // Adds a location to the table.
    459   void Add(const Message* descriptor,
    460            DescriptorPool::ErrorCollector::ErrorLocation location,
    461            int line, int column);
    462 
    463   // Clears the contents of the table.
    464   void Clear();
    465 
    466  private:
    467   typedef std::map<
    468     std::pair<const Message*, DescriptorPool::ErrorCollector::ErrorLocation>,
    469     std::pair<int, int> > LocationMap;
    470   LocationMap location_map_;
    471 };
    472 
    473 }  // namespace compiler
    474 }  // namespace protobuf
    475 
    476 }  // namespace google
    477 #endif  // GOOGLE_PROTOBUF_COMPILER_PARSER_H__
    478