// Protocol Buffers - Google's data interchange format
// Copyright 2008 Google Inc. All rights reserved.
// https://developers.google.com/protocol-buffers/
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

// Author: kenton (at) google.com (Kenton Varda)
//  Based on original Protocol Buffers design by
//  Sanjay Ghemawat, Jeff Dean, and others.
//
// Implements parsing of .proto files to FileDescriptorProtos.
36 37 #ifndef GOOGLE_PROTOBUF_COMPILER_PARSER_H__ 38 #define GOOGLE_PROTOBUF_COMPILER_PARSER_H__ 39 40 #include <map> 41 #include <string> 42 #include <utility> 43 #include <google/protobuf/descriptor.h> 44 #include <google/protobuf/descriptor.pb.h> 45 #include <google/protobuf/repeated_field.h> 46 #include <google/protobuf/io/tokenizer.h> 47 48 namespace google { 49 namespace protobuf { class Message; } 50 51 namespace protobuf { 52 namespace compiler { 53 54 // Defined in this file. 55 class Parser; 56 class SourceLocationTable; 57 58 // Implements parsing of protocol definitions (such as .proto files). 59 // 60 // Note that most users will be more interested in the Importer class. 61 // Parser is a lower-level class which simply converts a single .proto file 62 // to a FileDescriptorProto. It does not resolve import directives or perform 63 // many other kinds of validation needed to construct a complete 64 // FileDescriptor. 65 class LIBPROTOBUF_EXPORT Parser { 66 public: 67 Parser(); 68 ~Parser(); 69 70 // Parse the entire input and construct a FileDescriptorProto representing 71 // it. Returns true if no errors occurred, false otherwise. 72 bool Parse(io::Tokenizer* input, FileDescriptorProto* file); 73 74 // Optional fetaures: 75 76 // DEPRECATED: New code should use the SourceCodeInfo embedded in the 77 // FileDescriptorProto. 78 // 79 // Requests that locations of certain definitions be recorded to the given 80 // SourceLocationTable while parsing. This can be used to look up exact line 81 // and column numbers for errors reported by DescriptorPool during validation. 82 // Set to NULL (the default) to discard source location information. 83 void RecordSourceLocationsTo(SourceLocationTable* location_table) { 84 source_location_table_ = location_table; 85 } 86 87 // Requests that errors be recorded to the given ErrorCollector while 88 // parsing. Set to NULL (the default) to discard error messages. 
89 void RecordErrorsTo(io::ErrorCollector* error_collector) { 90 error_collector_ = error_collector; 91 } 92 93 // Returns the identifier used in the "syntax = " declaration, if one was 94 // seen during the last call to Parse(), or the empty string otherwise. 95 const string& GetSyntaxIdentifier() { return syntax_identifier_; } 96 97 // If set true, input files will be required to begin with a syntax 98 // identifier. Otherwise, files may omit this. If a syntax identifier 99 // is provided, it must be 'syntax = "proto2";' and must appear at the 100 // top of this file regardless of whether or not it was required. 101 void SetRequireSyntaxIdentifier(bool value) { 102 require_syntax_identifier_ = value; 103 } 104 105 // Call SetStopAfterSyntaxIdentifier(true) to tell the parser to stop 106 // parsing as soon as it has seen the syntax identifier, or lack thereof. 107 // This is useful for quickly identifying the syntax of the file without 108 // parsing the whole thing. If this is enabled, no error will be recorded 109 // if the syntax identifier is something other than "proto2" (since 110 // presumably the caller intends to deal with that), but other kinds of 111 // errors (e.g. parse errors) will still be reported. When this is enabled, 112 // you may pass a NULL FileDescriptorProto to Parse(). 113 void SetStopAfterSyntaxIdentifier(bool value) { 114 stop_after_syntax_identifier_ = value; 115 } 116 117 private: 118 class LocationRecorder; 119 120 // ================================================================= 121 // Error recovery helpers 122 123 // Consume the rest of the current statement. This consumes tokens 124 // until it sees one of: 125 // ';' Consumes the token and returns. 126 // '{' Consumes the brace then calls SkipRestOfBlock(). 127 // '}' Returns without consuming. 128 // EOF Returns (can't consume). 129 // The Parser often calls SkipStatement() after encountering a syntax 130 // error. 
This allows it to go on parsing the following lines, allowing 131 // it to report more than just one error in the file. 132 void SkipStatement(); 133 134 // Consume the rest of the current block, including nested blocks, 135 // ending after the closing '}' is encountered and consumed, or at EOF. 136 void SkipRestOfBlock(); 137 138 // ----------------------------------------------------------------- 139 // Single-token consuming helpers 140 // 141 // These make parsing code more readable. 142 143 // True if the current token is TYPE_END. 144 inline bool AtEnd(); 145 146 // True if the next token matches the given text. 147 inline bool LookingAt(const char* text); 148 // True if the next token is of the given type. 149 inline bool LookingAtType(io::Tokenizer::TokenType token_type); 150 151 // If the next token exactly matches the text given, consume it and return 152 // true. Otherwise, return false without logging an error. 153 bool TryConsume(const char* text); 154 155 // These attempt to read some kind of token from the input. If successful, 156 // they return true. Otherwise they return false and add the given error 157 // to the error list. 158 159 // Consume a token with the exact text given. 160 bool Consume(const char* text, const char* error); 161 // Same as above, but automatically generates the error "Expected \"text\".", 162 // where "text" is the expected token text. 163 bool Consume(const char* text); 164 // Consume a token of type IDENTIFIER and store its text in "output". 165 bool ConsumeIdentifier(string* output, const char* error); 166 // Consume an integer and store its value in "output". 167 bool ConsumeInteger(int* output, const char* error); 168 // Consume a signed integer and store its value in "output". 169 bool ConsumeSignedInteger(int* output, const char* error); 170 // Consume a 64-bit integer and store its value in "output". If the value 171 // is greater than max_value, an error will be reported. 
172 bool ConsumeInteger64(uint64 max_value, uint64* output, const char* error); 173 // Consume a number and store its value in "output". This will accept 174 // tokens of either INTEGER or FLOAT type. 175 bool ConsumeNumber(double* output, const char* error); 176 // Consume a string literal and store its (unescaped) value in "output". 177 bool ConsumeString(string* output, const char* error); 178 179 // Consume a token representing the end of the statement. Comments between 180 // this token and the next will be harvested for documentation. The given 181 // LocationRecorder should refer to the declaration that was just parsed; 182 // it will be populated with these comments. 183 // 184 // TODO(kenton): The LocationRecorder is const because historically locations 185 // have been passed around by const reference, for no particularly good 186 // reason. We should probably go through and change them all to mutable 187 // pointer to make this more intuitive. 188 bool TryConsumeEndOfDeclaration( 189 const char* text, const LocationRecorder* location); 190 bool TryConsumeEndOfDeclarationFinishScope( 191 const char* text, const LocationRecorder* location); 192 193 bool ConsumeEndOfDeclaration( 194 const char* text, const LocationRecorder* location); 195 196 // ----------------------------------------------------------------- 197 // Error logging helpers 198 199 // Invokes error_collector_->AddError(), if error_collector_ is not NULL. 200 void AddError(int line, int column, const string& error); 201 202 // Invokes error_collector_->AddError() with the line and column number 203 // of the current token. 204 void AddError(const string& error); 205 206 // Records a location in the SourceCodeInfo.location table (see 207 // descriptor.proto). We use RAII to ensure that the start and end locations 208 // are recorded -- the constructor records the start location and the 209 // destructor records the end location. 
Since the parser is 210 // recursive-descent, this works out beautifully. 211 class LIBPROTOBUF_EXPORT LocationRecorder { 212 public: 213 // Construct the file's "root" location. 214 LocationRecorder(Parser* parser); 215 216 // Construct a location that represents a declaration nested within the 217 // given parent. E.g. a field's location is nested within the location 218 // for a message type. The parent's path will be copied, so you should 219 // call AddPath() only to add the path components leading from the parent 220 // to the child (as opposed to leading from the root to the child). 221 LocationRecorder(const LocationRecorder& parent); 222 223 // Convenience constructors that call AddPath() one or two times. 224 LocationRecorder(const LocationRecorder& parent, int path1); 225 LocationRecorder(const LocationRecorder& parent, int path1, int path2); 226 227 ~LocationRecorder(); 228 229 // Add a path component. See SourceCodeInfo.Location.path in 230 // descriptor.proto. 231 void AddPath(int path_component); 232 233 // By default the location is considered to start at the current token at 234 // the time the LocationRecorder is created. StartAt() sets the start 235 // location to the given token instead. 236 void StartAt(const io::Tokenizer::Token& token); 237 238 // Start at the same location as some other LocationRecorder. 239 void StartAt(const LocationRecorder& other); 240 241 // By default the location is considered to end at the previous token at 242 // the time the LocationRecorder is destroyed. EndAt() sets the end 243 // location to the given token instead. 244 void EndAt(const io::Tokenizer::Token& token); 245 246 // Records the start point of this location to the SourceLocationTable that 247 // was passed to RecordSourceLocationsTo(), if any. SourceLocationTable 248 // is an older way of keeping track of source locations which is still 249 // used in some places. 
250 void RecordLegacyLocation(const Message* descriptor, 251 DescriptorPool::ErrorCollector::ErrorLocation location); 252 253 // Attaches leading and trailing comments to the location. The two strings 254 // will be swapped into place, so after this is called *leading and 255 // *trailing will be empty. 256 // 257 // TODO(kenton): See comment on TryConsumeEndOfDeclaration(), above, for 258 // why this is const. 259 void AttachComments(string* leading, string* trailing, 260 vector<string>* detached_comments) const; 261 262 private: 263 // Indexes of parent and current location in the parent 264 // SourceCodeInfo.location repeated field. For top-level elements, 265 // parent_index_ is -1. 266 Parser* parser_; 267 SourceCodeInfo::Location* location_; 268 269 void Init(const LocationRecorder& parent); 270 }; 271 272 // ================================================================= 273 // Parsers for various language constructs 274 275 // Parses the "syntax = \"proto2\";" line at the top of the file. Returns 276 // false if it failed to parse or if the syntax identifier was not 277 // recognized. 278 bool ParseSyntaxIdentifier(const LocationRecorder& parent); 279 280 // These methods parse various individual bits of code. They return 281 // false if they completely fail to parse the construct. In this case, 282 // it is probably necessary to skip the rest of the statement to recover. 283 // However, if these methods return true, it does NOT mean that there 284 // were no errors; only that there were no *syntax* errors. For instance, 285 // if a service method is defined using proper syntax but uses a primitive 286 // type as its input or output, ParseMethodField() still returns true 287 // and only reports the error by calling AddError(). In practice, this 288 // makes logic much simpler for the caller. 289 290 // Parse a top-level message, enum, service, etc. 
291 bool ParseTopLevelStatement(FileDescriptorProto* file, 292 const LocationRecorder& root_location); 293 294 // Parse various language high-level language construrcts. 295 bool ParseMessageDefinition(DescriptorProto* message, 296 const LocationRecorder& message_location, 297 const FileDescriptorProto* containing_file); 298 bool ParseEnumDefinition(EnumDescriptorProto* enum_type, 299 const LocationRecorder& enum_location, 300 const FileDescriptorProto* containing_file); 301 bool ParseServiceDefinition(ServiceDescriptorProto* service, 302 const LocationRecorder& service_location, 303 const FileDescriptorProto* containing_file); 304 bool ParsePackage(FileDescriptorProto* file, 305 const LocationRecorder& root_location, 306 const FileDescriptorProto* containing_file); 307 bool ParseImport(RepeatedPtrField<string>* dependency, 308 RepeatedField<int32>* public_dependency, 309 RepeatedField<int32>* weak_dependency, 310 const LocationRecorder& root_location, 311 const FileDescriptorProto* containing_file); 312 313 // These methods parse the contents of a message, enum, or service type and 314 // add them to the given object. They consume the entire block including 315 // the beginning and ending brace. 316 bool ParseMessageBlock(DescriptorProto* message, 317 const LocationRecorder& message_location, 318 const FileDescriptorProto* containing_file); 319 bool ParseEnumBlock(EnumDescriptorProto* enum_type, 320 const LocationRecorder& enum_location, 321 const FileDescriptorProto* containing_file); 322 bool ParseServiceBlock(ServiceDescriptorProto* service, 323 const LocationRecorder& service_location, 324 const FileDescriptorProto* containing_file); 325 326 // Parse one statement within a message, enum, or service block, including 327 // final semicolon. 
328 bool ParseMessageStatement(DescriptorProto* message, 329 const LocationRecorder& message_location, 330 const FileDescriptorProto* containing_file); 331 bool ParseEnumStatement(EnumDescriptorProto* message, 332 const LocationRecorder& enum_location, 333 const FileDescriptorProto* containing_file); 334 bool ParseServiceStatement(ServiceDescriptorProto* message, 335 const LocationRecorder& service_location, 336 const FileDescriptorProto* containing_file); 337 338 // Parse a field of a message. If the field is a group, its type will be 339 // added to "messages". 340 // 341 // parent_location and location_field_number_for_nested_type are needed when 342 // parsing groups -- we need to generate a nested message type within the 343 // parent and record its location accordingly. Since the parent could be 344 // either a FileDescriptorProto or a DescriptorProto, we must pass in the 345 // correct field number to use. 346 bool ParseMessageField(FieldDescriptorProto* field, 347 RepeatedPtrField<DescriptorProto>* messages, 348 const LocationRecorder& parent_location, 349 int location_field_number_for_nested_type, 350 const LocationRecorder& field_location, 351 const FileDescriptorProto* containing_file); 352 353 // Like ParseMessageField() but expects the label has already been filled in 354 // by the caller. 355 bool ParseMessageFieldNoLabel(FieldDescriptorProto* field, 356 RepeatedPtrField<DescriptorProto>* messages, 357 const LocationRecorder& parent_location, 358 int location_field_number_for_nested_type, 359 const LocationRecorder& field_location, 360 const FileDescriptorProto* containing_file); 361 362 // Parse an "extensions" declaration. 363 bool ParseExtensions(DescriptorProto* message, 364 const LocationRecorder& extensions_location, 365 const FileDescriptorProto* containing_file); 366 367 // Parse a "reserved" declaration. 
368 bool ParseReserved(DescriptorProto* message, 369 const LocationRecorder& message_location); 370 bool ParseReservedNames(DescriptorProto* message, 371 const LocationRecorder& parent_location); 372 bool ParseReservedNumbers(DescriptorProto* message, 373 const LocationRecorder& parent_location); 374 375 // Parse an "extend" declaration. (See also comments for 376 // ParseMessageField().) 377 bool ParseExtend(RepeatedPtrField<FieldDescriptorProto>* extensions, 378 RepeatedPtrField<DescriptorProto>* messages, 379 const LocationRecorder& parent_location, 380 int location_field_number_for_nested_type, 381 const LocationRecorder& extend_location, 382 const FileDescriptorProto* containing_file); 383 384 // Parse a "oneof" declaration. The caller is responsible for setting 385 // oneof_decl->label() since it will have had to parse the label before it 386 // knew it was parsing a oneof. 387 bool ParseOneof(OneofDescriptorProto* oneof_decl, 388 DescriptorProto* containing_type, 389 int oneof_index, 390 const LocationRecorder& oneof_location, 391 const LocationRecorder& containing_type_location, 392 const FileDescriptorProto* containing_file); 393 394 // Parse a single enum value within an enum block. 395 bool ParseEnumConstant(EnumValueDescriptorProto* enum_value, 396 const LocationRecorder& enum_value_location, 397 const FileDescriptorProto* containing_file); 398 399 // Parse enum constant options, i.e. the list in square brackets at the end 400 // of the enum constant value definition. 401 bool ParseEnumConstantOptions(EnumValueDescriptorProto* value, 402 const LocationRecorder& enum_value_location, 403 const FileDescriptorProto* containing_file); 404 405 // Parse a single method within a service definition. 406 bool ParseServiceMethod(MethodDescriptorProto* method, 407 const LocationRecorder& method_location, 408 const FileDescriptorProto* containing_file); 409 410 411 // Parse options of a single method or stream. 
412 bool ParseMethodOptions(const LocationRecorder& parent_location, 413 const FileDescriptorProto* containing_file, 414 const int optionsFieldNumber, 415 Message* mutable_options); 416 417 // Parse "required", "optional", or "repeated" and fill in "label" 418 // with the value. Returns true if such a label is consumed. 419 bool ParseLabel(FieldDescriptorProto::Label* label, 420 const FileDescriptorProto* containing_file); 421 422 // Parse a type name and fill in "type" (if it is a primitive) or 423 // "type_name" (if it is not) with the type parsed. 424 bool ParseType(FieldDescriptorProto::Type* type, 425 string* type_name); 426 // Parse a user-defined type and fill in "type_name" with the name. 427 // If a primitive type is named, it is treated as an error. 428 bool ParseUserDefinedType(string* type_name); 429 430 // Parses field options, i.e. the stuff in square brackets at the end 431 // of a field definition. Also parses default value. 432 bool ParseFieldOptions(FieldDescriptorProto* field, 433 const LocationRecorder& field_location, 434 const FileDescriptorProto* containing_file); 435 436 // Parse the "default" option. This needs special handling because its 437 // type is the field's type. 438 bool ParseDefaultAssignment(FieldDescriptorProto* field, 439 const LocationRecorder& field_location, 440 const FileDescriptorProto* containing_file); 441 442 bool ParseJsonName(FieldDescriptorProto* field, 443 const LocationRecorder& field_location, 444 const FileDescriptorProto* containing_file); 445 446 enum OptionStyle { 447 OPTION_ASSIGNMENT, // just "name = value" 448 OPTION_STATEMENT // "option name = value;" 449 }; 450 451 // Parse a single option name/value pair, e.g. "ctype = CORD". The name 452 // identifies a field of the given Message, and the value of that field 453 // is set to the parsed value. 
454 bool ParseOption(Message* options, 455 const LocationRecorder& options_location, 456 const FileDescriptorProto* containing_file, 457 OptionStyle style); 458 459 // Parses a single part of a multipart option name. A multipart name consists 460 // of names separated by dots. Each name is either an identifier or a series 461 // of identifiers separated by dots and enclosed in parentheses. E.g., 462 // "foo.(bar.baz).qux". 463 bool ParseOptionNamePart(UninterpretedOption* uninterpreted_option, 464 const LocationRecorder& part_location, 465 const FileDescriptorProto* containing_file); 466 467 // Parses a string surrounded by balanced braces. Strips off the outer 468 // braces and stores the enclosed string in *value. 469 // E.g., 470 // { foo } *value gets 'foo' 471 // { foo { bar: box } } *value gets 'foo { bar: box }' 472 // {} *value gets '' 473 // 474 // REQUIRES: LookingAt("{") 475 // When finished successfully, we are looking at the first token past 476 // the ending brace. 477 bool ParseUninterpretedBlock(string* value); 478 479 struct MapField { 480 // Whether the field is a map field. 481 bool is_map_field; 482 // The types of the key and value if they are primitive types. 483 FieldDescriptorProto::Type key_type; 484 FieldDescriptorProto::Type value_type; 485 // Or the type names string if the types are customized types. 486 string key_type_name; 487 string value_type_name; 488 489 MapField() : is_map_field(false) {} 490 }; 491 // Desugar the map syntax to generate a nested map entry message. 492 void GenerateMapEntry(const MapField& map_field, FieldDescriptorProto* field, 493 RepeatedPtrField<DescriptorProto>* messages); 494 495 // Whether fields without label default to optional fields. 
496 bool DefaultToOptionalFields() const { 497 return syntax_identifier_ == "proto3"; 498 } 499 500 501 bool ValidateEnum(const EnumDescriptorProto* proto); 502 503 // ================================================================= 504 505 io::Tokenizer* input_; 506 io::ErrorCollector* error_collector_; 507 SourceCodeInfo* source_code_info_; 508 SourceLocationTable* source_location_table_; // legacy 509 bool had_errors_; 510 bool require_syntax_identifier_; 511 bool stop_after_syntax_identifier_; 512 string syntax_identifier_; 513 514 // Leading doc comments for the next declaration. These are not complete 515 // yet; use ConsumeEndOfDeclaration() to get the complete comments. 516 string upcoming_doc_comments_; 517 518 // Detached comments are not connected to any syntax entities. Elements in 519 // this vector are paragraphs of comments separated by empty lines. The 520 // detached comments will be put into the leading_detached_comments field for 521 // the next element (See SourceCodeInfo.Location in descriptor.proto), when 522 // ConsumeEndOfDeclaration() is called. 523 vector<string> upcoming_detached_comments_; 524 525 GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(Parser); 526 }; 527 528 // A table mapping (descriptor, ErrorLocation) pairs -- as reported by 529 // DescriptorPool when validating descriptors -- to line and column numbers 530 // within the original source code. 531 // 532 // This is semi-obsolete: FileDescriptorProto.source_code_info now contains 533 // far more complete information about source locations. However, as of this 534 // writing you still need to use SourceLocationTable when integrating with 535 // DescriptorPool. 536 class LIBPROTOBUF_EXPORT SourceLocationTable { 537 public: 538 SourceLocationTable(); 539 ~SourceLocationTable(); 540 541 // Finds the precise location of the given error and fills in *line and 542 // *column with the line and column numbers. 
If not found, sets *line to 543 // -1 and *column to 0 (since line = -1 is used to mean "error has no exact 544 // location" in the ErrorCollector interface). Returns true if found, false 545 // otherwise. 546 bool Find(const Message* descriptor, 547 DescriptorPool::ErrorCollector::ErrorLocation location, 548 int* line, int* column) const; 549 550 // Adds a location to the table. 551 void Add(const Message* descriptor, 552 DescriptorPool::ErrorCollector::ErrorLocation location, 553 int line, int column); 554 555 // Clears the contents of the table. 556 void Clear(); 557 558 private: 559 typedef map< 560 pair<const Message*, DescriptorPool::ErrorCollector::ErrorLocation>, 561 pair<int, int> > LocationMap; 562 LocationMap location_map_; 563 }; 564 565 } // namespace compiler 566 } // namespace protobuf 567 568 } // namespace google 569 #endif // GOOGLE_PROTOBUF_COMPILER_PARSER_H__ 570