1 // Protocol Buffers - Google's data interchange format 2 // Copyright 2008 Google Inc. All rights reserved. 3 // http://code.google.com/p/protobuf/ 4 // 5 // Redistribution and use in source and binary forms, with or without 6 // modification, are permitted provided that the following conditions are 7 // met: 8 // 9 // * Redistributions of source code must retain the above copyright 10 // notice, this list of conditions and the following disclaimer. 11 // * Redistributions in binary form must reproduce the above 12 // copyright notice, this list of conditions and the following disclaimer 13 // in the documentation and/or other materials provided with the 14 // distribution. 15 // * Neither the name of Google Inc. nor the names of its 16 // contributors may be used to endorse or promote products derived from 17 // this software without specific prior written permission. 18 // 19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 30 31 // Author: kenton (at) google.com (Kenton Varda) 32 // Based on original Protocol Buffers design by 33 // Sanjay Ghemawat, Jeff Dean, and others. 34 // 35 // Implements parsing of .proto files to FileDescriptorProtos. 36 37 #ifndef GOOGLE_PROTOBUF_COMPILER_PARSER_H__ 38 #define GOOGLE_PROTOBUF_COMPILER_PARSER_H__ 39 40 #include <map> 41 #include <string> 42 #include <utility> 43 #include <google/protobuf/stubs/common.h> 44 #include <google/protobuf/descriptor.h> 45 #include <google/protobuf/descriptor.pb.h> 46 #include <google/protobuf/repeated_field.h> 47 #include <google/protobuf/io/tokenizer.h> 48 49 namespace google { 50 namespace protobuf { class Message; } 51 52 namespace protobuf { 53 namespace compiler { 54 55 // Defined in this file. 56 class Parser; 57 class SourceLocationTable; 58 59 // Implements parsing of protocol definitions (such as .proto files). 60 // 61 // Note that most users will be more interested in the Importer class. 62 // Parser is a lower-level class which simply converts a single .proto file 63 // to a FileDescriptorProto. It does not resolve import directives or perform 64 // many other kinds of validation needed to construct a complete 65 // FileDescriptor. 66 class LIBPROTOBUF_EXPORT Parser { 67 public: 68 Parser(); 69 ~Parser(); 70 71 // Parse the entire input and construct a FileDescriptorProto representing 72 // it. Returns true if no errors occurred, false otherwise. 73 bool Parse(io::Tokenizer* input, FileDescriptorProto* file); 74 75 // Optional fetaures: 76 77 // DEPRECATED: New code should use the SourceCodeInfo embedded in the 78 // FileDescriptorProto. 79 // 80 // Requests that locations of certain definitions be recorded to the given 81 // SourceLocationTable while parsing. This can be used to look up exact line 82 // and column numbers for errors reported by DescriptorPool during validation. 83 // Set to NULL (the default) to discard source location information. 84 void RecordSourceLocationsTo(SourceLocationTable* location_table) { 85 source_location_table_ = location_table; 86 } 87 88 // Requests that errors be recorded to the given ErrorCollector while 89 // parsing. Set to NULL (the default) to discard error messages. 90 void RecordErrorsTo(io::ErrorCollector* error_collector) { 91 error_collector_ = error_collector; 92 } 93 94 // Returns the identifier used in the "syntax = " declaration, if one was 95 // seen during the last call to Parse(), or the empty string otherwise. 96 const string& GetSyntaxIdentifier() { return syntax_identifier_; } 97 98 // If set true, input files will be required to begin with a syntax 99 // identifier. Otherwise, files may omit this. If a syntax identifier 100 // is provided, it must be 'syntax = "proto2";' and must appear at the 101 // top of this file regardless of whether or not it was required. 102 void SetRequireSyntaxIdentifier(bool value) { 103 require_syntax_identifier_ = value; 104 } 105 106 // Call SetStopAfterSyntaxIdentifier(true) to tell the parser to stop 107 // parsing as soon as it has seen the syntax identifier, or lack thereof. 108 // This is useful for quickly identifying the syntax of the file without 109 // parsing the whole thing. If this is enabled, no error will be recorded 110 // if the syntax identifier is something other than "proto2" (since 111 // presumably the caller intends to deal with that), but other kinds of 112 // errors (e.g. parse errors) will still be reported. When this is enabled, 113 // you may pass a NULL FileDescriptorProto to Parse(). 114 void SetStopAfterSyntaxIdentifier(bool value) { 115 stop_after_syntax_identifier_ = value; 116 } 117 118 private: 119 class LocationRecorder; 120 121 // ================================================================= 122 // Error recovery helpers 123 124 // Consume the rest of the current statement. This consumes tokens 125 // until it sees one of: 126 // ';' Consumes the token and returns. 127 // '{' Consumes the brace then calls SkipRestOfBlock(). 128 // '}' Returns without consuming. 129 // EOF Returns (can't consume). 130 // The Parser often calls SkipStatement() after encountering a syntax 131 // error. This allows it to go on parsing the following lines, allowing 132 // it to report more than just one error in the file. 133 void SkipStatement(); 134 135 // Consume the rest of the current block, including nested blocks, 136 // ending after the closing '}' is encountered and consumed, or at EOF. 137 void SkipRestOfBlock(); 138 139 // ----------------------------------------------------------------- 140 // Single-token consuming helpers 141 // 142 // These make parsing code more readable. 143 144 // True if the current token is TYPE_END. 145 inline bool AtEnd(); 146 147 // True if the next token matches the given text. 148 inline bool LookingAt(const char* text); 149 // True if the next token is of the given type. 150 inline bool LookingAtType(io::Tokenizer::TokenType token_type); 151 152 // If the next token exactly matches the text given, consume it and return 153 // true. Otherwise, return false without logging an error. 154 bool TryConsume(const char* text); 155 156 // These attempt to read some kind of token from the input. If successful, 157 // they return true. Otherwise they return false and add the given error 158 // to the error list. 159 160 // Consume a token with the exact text given. 161 bool Consume(const char* text, const char* error); 162 // Same as above, but automatically generates the error "Expected \"text\".", 163 // where "text" is the expected token text. 164 bool Consume(const char* text); 165 // Consume a token of type IDENTIFIER and store its text in "output". 166 bool ConsumeIdentifier(string* output, const char* error); 167 // Consume an integer and store its value in "output". 168 bool ConsumeInteger(int* output, const char* error); 169 // Consume a signed integer and store its value in "output". 170 bool ConsumeSignedInteger(int* output, const char* error); 171 // Consume a 64-bit integer and store its value in "output". If the value 172 // is greater than max_value, an error will be reported. 173 bool ConsumeInteger64(uint64 max_value, uint64* output, const char* error); 174 // Consume a number and store its value in "output". This will accept 175 // tokens of either INTEGER or FLOAT type. 176 bool ConsumeNumber(double* output, const char* error); 177 // Consume a string literal and store its (unescaped) value in "output". 178 bool ConsumeString(string* output, const char* error); 179 180 // Consume a token representing the end of the statement. Comments between 181 // this token and the next will be harvested for documentation. The given 182 // LocationRecorder should refer to the declaration that was just parsed; 183 // it will be populated with these comments. 184 // 185 // TODO(kenton): The LocationRecorder is const because historically locations 186 // have been passed around by const reference, for no particularly good 187 // reason. We should probably go through and change them all to mutable 188 // pointer to make this more intuitive. 189 bool TryConsumeEndOfDeclaration(const char* text, 190 const LocationRecorder* location); 191 bool ConsumeEndOfDeclaration(const char* text, 192 const LocationRecorder* location); 193 194 // ----------------------------------------------------------------- 195 // Error logging helpers 196 197 // Invokes error_collector_->AddError(), if error_collector_ is not NULL. 198 void AddError(int line, int column, const string& error); 199 200 // Invokes error_collector_->AddError() with the line and column number 201 // of the current token. 202 void AddError(const string& error); 203 204 // Records a location in the SourceCodeInfo.location table (see 205 // descriptor.proto). We use RAII to ensure that the start and end locations 206 // are recorded -- the constructor records the start location and the 207 // destructor records the end location. Since the parser is 208 // recursive-descent, this works out beautifully. 209 class LIBPROTOBUF_EXPORT LocationRecorder { 210 public: 211 // Construct the file's "root" location. 212 LocationRecorder(Parser* parser); 213 214 // Construct a location that represents a declaration nested within the 215 // given parent. E.g. a field's location is nested within the location 216 // for a message type. The parent's path will be copied, so you should 217 // call AddPath() only to add the path components leading from the parent 218 // to the child (as opposed to leading from the root to the child). 219 LocationRecorder(const LocationRecorder& parent); 220 221 // Convenience constructors that call AddPath() one or two times. 222 LocationRecorder(const LocationRecorder& parent, int path1); 223 LocationRecorder(const LocationRecorder& parent, int path1, int path2); 224 225 ~LocationRecorder(); 226 227 // Add a path component. See SourceCodeInfo.Location.path in 228 // descriptor.proto. 229 void AddPath(int path_component); 230 231 // By default the location is considered to start at the current token at 232 // the time the LocationRecorder is created. StartAt() sets the start 233 // location to the given token instead. 234 void StartAt(const io::Tokenizer::Token& token); 235 236 // By default the location is considered to end at the previous token at 237 // the time the LocationRecorder is destroyed. EndAt() sets the end 238 // location to the given token instead. 239 void EndAt(const io::Tokenizer::Token& token); 240 241 // Records the start point of this location to the SourceLocationTable that 242 // was passed to RecordSourceLocationsTo(), if any. SourceLocationTable 243 // is an older way of keeping track of source locations which is still 244 // used in some places. 245 void RecordLegacyLocation(const Message* descriptor, 246 DescriptorPool::ErrorCollector::ErrorLocation location); 247 248 // Attaches leading and trailing comments to the location. The two strings 249 // will be swapped into place, so after this is called *leading and 250 // *trailing will be empty. 251 // 252 // TODO(kenton): See comment on TryConsumeEndOfDeclaration(), above, for 253 // why this is const. 254 void AttachComments(string* leading, string* trailing) const; 255 256 private: 257 Parser* parser_; 258 SourceCodeInfo::Location* location_; 259 260 void Init(const LocationRecorder& parent); 261 }; 262 263 // ================================================================= 264 // Parsers for various language constructs 265 266 // Parses the "syntax = \"proto2\";" line at the top of the file. Returns 267 // false if it failed to parse or if the syntax identifier was not 268 // recognized. 269 bool ParseSyntaxIdentifier(); 270 271 // These methods parse various individual bits of code. They return 272 // false if they completely fail to parse the construct. In this case, 273 // it is probably necessary to skip the rest of the statement to recover. 274 // However, if these methods return true, it does NOT mean that there 275 // were no errors; only that there were no *syntax* errors. For instance, 276 // if a service method is defined using proper syntax but uses a primitive 277 // type as its input or output, ParseMethodField() still returns true 278 // and only reports the error by calling AddError(). In practice, this 279 // makes logic much simpler for the caller. 280 281 // Parse a top-level message, enum, service, etc. 282 bool ParseTopLevelStatement(FileDescriptorProto* file, 283 const LocationRecorder& root_location); 284 285 // Parse various language high-level language construrcts. 286 bool ParseMessageDefinition(DescriptorProto* message, 287 const LocationRecorder& message_location); 288 bool ParseEnumDefinition(EnumDescriptorProto* enum_type, 289 const LocationRecorder& enum_location); 290 bool ParseServiceDefinition(ServiceDescriptorProto* service, 291 const LocationRecorder& service_location); 292 bool ParsePackage(FileDescriptorProto* file, 293 const LocationRecorder& root_location); 294 bool ParseImport(RepeatedPtrField<string>* dependency, 295 RepeatedField<int32>* public_dependency, 296 RepeatedField<int32>* weak_dependency, 297 const LocationRecorder& root_location); 298 bool ParseOption(Message* options, 299 const LocationRecorder& options_location); 300 301 // These methods parse the contents of a message, enum, or service type and 302 // add them to the given object. They consume the entire block including 303 // the beginning and ending brace. 304 bool ParseMessageBlock(DescriptorProto* message, 305 const LocationRecorder& message_location); 306 bool ParseEnumBlock(EnumDescriptorProto* enum_type, 307 const LocationRecorder& enum_location); 308 bool ParseServiceBlock(ServiceDescriptorProto* service, 309 const LocationRecorder& service_location); 310 311 // Parse one statement within a message, enum, or service block, inclunding 312 // final semicolon. 313 bool ParseMessageStatement(DescriptorProto* message, 314 const LocationRecorder& message_location); 315 bool ParseEnumStatement(EnumDescriptorProto* message, 316 const LocationRecorder& enum_location); 317 bool ParseServiceStatement(ServiceDescriptorProto* message, 318 const LocationRecorder& service_location); 319 320 // Parse a field of a message. If the field is a group, its type will be 321 // added to "messages". 322 // 323 // parent_location and location_field_number_for_nested_type are needed when 324 // parsing groups -- we need to generate a nested message type within the 325 // parent and record its location accordingly. Since the parent could be 326 // either a FileDescriptorProto or a DescriptorProto, we must pass in the 327 // correct field number to use. 328 bool ParseMessageField(FieldDescriptorProto* field, 329 RepeatedPtrField<DescriptorProto>* messages, 330 const LocationRecorder& parent_location, 331 int location_field_number_for_nested_type, 332 const LocationRecorder& field_location); 333 334 // Parse an "extensions" declaration. 335 bool ParseExtensions(DescriptorProto* message, 336 const LocationRecorder& extensions_location); 337 338 // Parse an "extend" declaration. (See also comments for 339 // ParseMessageField().) 340 bool ParseExtend(RepeatedPtrField<FieldDescriptorProto>* extensions, 341 RepeatedPtrField<DescriptorProto>* messages, 342 const LocationRecorder& parent_location, 343 int location_field_number_for_nested_type, 344 const LocationRecorder& extend_location); 345 346 // Parse a single enum value within an enum block. 347 bool ParseEnumConstant(EnumValueDescriptorProto* enum_value, 348 const LocationRecorder& enum_value_location); 349 350 // Parse enum constant options, i.e. the list in square brackets at the end 351 // of the enum constant value definition. 352 bool ParseEnumConstantOptions(EnumValueDescriptorProto* value, 353 const LocationRecorder& enum_value_location); 354 355 // Parse a single method within a service definition. 356 bool ParseServiceMethod(MethodDescriptorProto* method, 357 const LocationRecorder& method_location); 358 359 360 // Parse options of a single method or stream. 361 bool ParseOptions(const LocationRecorder& parent_location, 362 const int optionsFieldNumber, 363 Message* mutable_options); 364 365 // Parse "required", "optional", or "repeated" and fill in "label" 366 // with the value. 367 bool ParseLabel(FieldDescriptorProto::Label* label); 368 369 // Parse a type name and fill in "type" (if it is a primitive) or 370 // "type_name" (if it is not) with the type parsed. 371 bool ParseType(FieldDescriptorProto::Type* type, 372 string* type_name); 373 // Parse a user-defined type and fill in "type_name" with the name. 374 // If a primitive type is named, it is treated as an error. 375 bool ParseUserDefinedType(string* type_name); 376 377 // Parses field options, i.e. the stuff in square brackets at the end 378 // of a field definition. Also parses default value. 379 bool ParseFieldOptions(FieldDescriptorProto* field, 380 const LocationRecorder& field_location); 381 382 // Parse the "default" option. This needs special handling because its 383 // type is the field's type. 384 bool ParseDefaultAssignment(FieldDescriptorProto* field, 385 const LocationRecorder& field_location); 386 387 enum OptionStyle { 388 OPTION_ASSIGNMENT, // just "name = value" 389 OPTION_STATEMENT // "option name = value;" 390 }; 391 392 // Parse a single option name/value pair, e.g. "ctype = CORD". The name 393 // identifies a field of the given Message, and the value of that field 394 // is set to the parsed value. 395 bool ParseOption(Message* options, 396 const LocationRecorder& options_location, 397 OptionStyle style); 398 399 // Parses a single part of a multipart option name. A multipart name consists 400 // of names separated by dots. Each name is either an identifier or a series 401 // of identifiers separated by dots and enclosed in parentheses. E.g., 402 // "foo.(bar.baz).qux". 403 bool ParseOptionNamePart(UninterpretedOption* uninterpreted_option, 404 const LocationRecorder& part_location); 405 406 // Parses a string surrounded by balanced braces. Strips off the outer 407 // braces and stores the enclosed string in *value. 408 // E.g., 409 // { foo } *value gets 'foo' 410 // { foo { bar: box } } *value gets 'foo { bar: box }' 411 // {} *value gets '' 412 // 413 // REQUIRES: LookingAt("{") 414 // When finished successfully, we are looking at the first token past 415 // the ending brace. 416 bool ParseUninterpretedBlock(string* value); 417 418 // ================================================================= 419 420 io::Tokenizer* input_; 421 io::ErrorCollector* error_collector_; 422 SourceCodeInfo* source_code_info_; 423 SourceLocationTable* source_location_table_; // legacy 424 bool had_errors_; 425 bool require_syntax_identifier_; 426 bool stop_after_syntax_identifier_; 427 string syntax_identifier_; 428 429 // Leading doc comments for the next declaration. These are not complete 430 // yet; use ConsumeEndOfDeclaration() to get the complete comments. 431 string upcoming_doc_comments_; 432 433 GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(Parser); 434 }; 435 436 // A table mapping (descriptor, ErrorLocation) pairs -- as reported by 437 // DescriptorPool when validating descriptors -- to line and column numbers 438 // within the original source code. 439 // 440 // This is semi-obsolete: FileDescriptorProto.source_code_info now contains 441 // far more complete information about source locations. However, as of this 442 // writing you still need to use SourceLocationTable when integrating with 443 // DescriptorPool. 444 class LIBPROTOBUF_EXPORT SourceLocationTable { 445 public: 446 SourceLocationTable(); 447 ~SourceLocationTable(); 448 449 // Finds the precise location of the given error and fills in *line and 450 // *column with the line and column numbers. If not found, sets *line to 451 // -1 and *column to 0 (since line = -1 is used to mean "error has no exact 452 // location" in the ErrorCollector interface). Returns true if found, false 453 // otherwise. 454 bool Find(const Message* descriptor, 455 DescriptorPool::ErrorCollector::ErrorLocation location, 456 int* line, int* column) const; 457 458 // Adds a location to the table. 459 void Add(const Message* descriptor, 460 DescriptorPool::ErrorCollector::ErrorLocation location, 461 int line, int column); 462 463 // Clears the contents of the table. 464 void Clear(); 465 466 private: 467 typedef map< 468 pair<const Message*, DescriptorPool::ErrorCollector::ErrorLocation>, 469 pair<int, int> > LocationMap; 470 LocationMap location_map_; 471 }; 472 473 } // namespace compiler 474 } // namespace protobuf 475 476 } // namespace google 477 #endif // GOOGLE_PROTOBUF_COMPILER_PARSER_H__ 478