1 //===--- YAMLParser.cpp - Simple YAML parser ------------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file implements a YAML parser. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "llvm/Support/YAMLParser.h" 15 #include "llvm/ADT/SmallVector.h" 16 #include "llvm/ADT/StringExtras.h" 17 #include "llvm/ADT/Twine.h" 18 #include "llvm/ADT/ilist.h" 19 #include "llvm/ADT/ilist_node.h" 20 #include "llvm/Support/ErrorHandling.h" 21 #include "llvm/Support/MemoryBuffer.h" 22 #include "llvm/Support/SourceMgr.h" 23 #include "llvm/Support/raw_ostream.h" 24 25 using namespace llvm; 26 using namespace yaml; 27 28 enum UnicodeEncodingForm { 29 UEF_UTF32_LE, ///< UTF-32 Little Endian 30 UEF_UTF32_BE, ///< UTF-32 Big Endian 31 UEF_UTF16_LE, ///< UTF-16 Little Endian 32 UEF_UTF16_BE, ///< UTF-16 Big Endian 33 UEF_UTF8, ///< UTF-8 or ascii. 34 UEF_Unknown ///< Not a valid Unicode encoding. 35 }; 36 37 /// EncodingInfo - Holds the encoding type and length of the byte order mark if 38 /// it exists. Length is in {0, 2, 3, 4}. 39 typedef std::pair<UnicodeEncodingForm, unsigned> EncodingInfo; 40 41 /// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode 42 /// encoding form of \a Input. 43 /// 44 /// @param Input A string of length 0 or more. 45 /// @returns An EncodingInfo indicating the Unicode encoding form of the input 46 /// and how long the byte order mark is if one exists. 
47 static EncodingInfo getUnicodeEncoding(StringRef Input) { 48 if (Input.size() == 0) 49 return std::make_pair(UEF_Unknown, 0); 50 51 switch (uint8_t(Input[0])) { 52 case 0x00: 53 if (Input.size() >= 4) { 54 if ( Input[1] == 0 55 && uint8_t(Input[2]) == 0xFE 56 && uint8_t(Input[3]) == 0xFF) 57 return std::make_pair(UEF_UTF32_BE, 4); 58 if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0) 59 return std::make_pair(UEF_UTF32_BE, 0); 60 } 61 62 if (Input.size() >= 2 && Input[1] != 0) 63 return std::make_pair(UEF_UTF16_BE, 0); 64 return std::make_pair(UEF_Unknown, 0); 65 case 0xFF: 66 if ( Input.size() >= 4 67 && uint8_t(Input[1]) == 0xFE 68 && Input[2] == 0 69 && Input[3] == 0) 70 return std::make_pair(UEF_UTF32_LE, 4); 71 72 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE) 73 return std::make_pair(UEF_UTF16_LE, 2); 74 return std::make_pair(UEF_Unknown, 0); 75 case 0xFE: 76 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF) 77 return std::make_pair(UEF_UTF16_BE, 2); 78 return std::make_pair(UEF_Unknown, 0); 79 case 0xEF: 80 if ( Input.size() >= 3 81 && uint8_t(Input[1]) == 0xBB 82 && uint8_t(Input[2]) == 0xBF) 83 return std::make_pair(UEF_UTF8, 3); 84 return std::make_pair(UEF_Unknown, 0); 85 } 86 87 // It could still be utf-32 or utf-16. 88 if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0) 89 return std::make_pair(UEF_UTF32_LE, 0); 90 91 if (Input.size() >= 2 && Input[1] == 0) 92 return std::make_pair(UEF_UTF16_LE, 0); 93 94 return std::make_pair(UEF_UTF8, 0); 95 } 96 97 namespace llvm { 98 namespace yaml { 99 /// Token - A single YAML token. 100 struct Token : ilist_node<Token> { 101 enum TokenKind { 102 TK_Error, // Uninitialized token. 
103 TK_StreamStart, 104 TK_StreamEnd, 105 TK_VersionDirective, 106 TK_TagDirective, 107 TK_DocumentStart, 108 TK_DocumentEnd, 109 TK_BlockEntry, 110 TK_BlockEnd, 111 TK_BlockSequenceStart, 112 TK_BlockMappingStart, 113 TK_FlowEntry, 114 TK_FlowSequenceStart, 115 TK_FlowSequenceEnd, 116 TK_FlowMappingStart, 117 TK_FlowMappingEnd, 118 TK_Key, 119 TK_Value, 120 TK_Scalar, 121 TK_Alias, 122 TK_Anchor, 123 TK_Tag 124 } Kind; 125 126 /// A string of length 0 or more whose begin() points to the logical location 127 /// of the token in the input. 128 StringRef Range; 129 130 Token() : Kind(TK_Error) {} 131 }; 132 } 133 } 134 135 namespace llvm { 136 template<> 137 struct ilist_sentinel_traits<Token> { 138 Token *createSentinel() const { 139 return &Sentinel; 140 } 141 static void destroySentinel(Token*) {} 142 143 Token *provideInitialHead() const { return createSentinel(); } 144 Token *ensureHead(Token*) const { return createSentinel(); } 145 static void noteHead(Token*, Token*) {} 146 147 private: 148 mutable Token Sentinel; 149 }; 150 151 template<> 152 struct ilist_node_traits<Token> { 153 Token *createNode(const Token &V) { 154 return new (Alloc.Allocate<Token>()) Token(V); 155 } 156 static void deleteNode(Token *V) {} 157 158 void addNodeToList(Token *) {} 159 void removeNodeFromList(Token *) {} 160 void transferNodesFromList(ilist_node_traits & /*SrcTraits*/, 161 ilist_iterator<Token> /*first*/, 162 ilist_iterator<Token> /*last*/) {} 163 164 BumpPtrAllocator Alloc; 165 }; 166 } 167 168 typedef ilist<Token> TokenQueueT; 169 170 namespace { 171 /// @brief This struct is used to track simple keys. 172 /// 173 /// Simple keys are handled by creating an entry in SimpleKeys for each Token 174 /// which could legally be the start of a simple key. 
When peekNext is called, 175 /// if the Token To be returned is referenced by a SimpleKey, we continue 176 /// tokenizing until that potential simple key has either been found to not be 177 /// a simple key (we moved on to the next line or went further than 1024 chars). 178 /// Or when we run into a Value, and then insert a Key token (and possibly 179 /// others) before the SimpleKey's Tok. 180 struct SimpleKey { 181 TokenQueueT::iterator Tok; 182 unsigned Column; 183 unsigned Line; 184 unsigned FlowLevel; 185 bool IsRequired; 186 187 bool operator ==(const SimpleKey &Other) { 188 return Tok == Other.Tok; 189 } 190 }; 191 } 192 193 /// @brief The Unicode scalar value of a UTF-8 minimal well-formed code unit 194 /// subsequence and the subsequence's length in code units (uint8_t). 195 /// A length of 0 represents an error. 196 typedef std::pair<uint32_t, unsigned> UTF8Decoded; 197 198 static UTF8Decoded decodeUTF8(StringRef Range) { 199 StringRef::iterator Position= Range.begin(); 200 StringRef::iterator End = Range.end(); 201 // 1 byte: [0x00, 0x7f] 202 // Bit pattern: 0xxxxxxx 203 if ((*Position & 0x80) == 0) { 204 return std::make_pair(*Position, 1); 205 } 206 // 2 bytes: [0x80, 0x7ff] 207 // Bit pattern: 110xxxxx 10xxxxxx 208 if (Position + 1 != End && 209 ((*Position & 0xE0) == 0xC0) && 210 ((*(Position + 1) & 0xC0) == 0x80)) { 211 uint32_t codepoint = ((*Position & 0x1F) << 6) | 212 (*(Position + 1) & 0x3F); 213 if (codepoint >= 0x80) 214 return std::make_pair(codepoint, 2); 215 } 216 // 3 bytes: [0x8000, 0xffff] 217 // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx 218 if (Position + 2 != End && 219 ((*Position & 0xF0) == 0xE0) && 220 ((*(Position + 1) & 0xC0) == 0x80) && 221 ((*(Position + 2) & 0xC0) == 0x80)) { 222 uint32_t codepoint = ((*Position & 0x0F) << 12) | 223 ((*(Position + 1) & 0x3F) << 6) | 224 (*(Position + 2) & 0x3F); 225 // Codepoints between 0xD800 and 0xDFFF are invalid, as 226 // they are high / low surrogate halves used by UTF-16. 
227 if (codepoint >= 0x800 && 228 (codepoint < 0xD800 || codepoint > 0xDFFF)) 229 return std::make_pair(codepoint, 3); 230 } 231 // 4 bytes: [0x10000, 0x10FFFF] 232 // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 233 if (Position + 3 != End && 234 ((*Position & 0xF8) == 0xF0) && 235 ((*(Position + 1) & 0xC0) == 0x80) && 236 ((*(Position + 2) & 0xC0) == 0x80) && 237 ((*(Position + 3) & 0xC0) == 0x80)) { 238 uint32_t codepoint = ((*Position & 0x07) << 18) | 239 ((*(Position + 1) & 0x3F) << 12) | 240 ((*(Position + 2) & 0x3F) << 6) | 241 (*(Position + 3) & 0x3F); 242 if (codepoint >= 0x10000 && codepoint <= 0x10FFFF) 243 return std::make_pair(codepoint, 4); 244 } 245 return std::make_pair(0, 0); 246 } 247 248 namespace llvm { 249 namespace yaml { 250 /// @brief Scans YAML tokens from a MemoryBuffer. 251 class Scanner { 252 public: 253 Scanner(const StringRef Input, SourceMgr &SM); 254 Scanner(MemoryBuffer *Buffer, SourceMgr &SM_); 255 256 /// @brief Parse the next token and return it without popping it. 257 Token &peekNext(); 258 259 /// @brief Parse the next token and pop it from the queue. 260 Token getNext(); 261 262 void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message, 263 ArrayRef<SMRange> Ranges = ArrayRef<SMRange>()) { 264 SM.PrintMessage(Loc, Kind, Message, Ranges); 265 } 266 267 void setError(const Twine &Message, StringRef::iterator Position) { 268 if (Current >= End) 269 Current = End - 1; 270 271 // Don't print out more errors after the first one we encounter. The rest 272 // are just the result of the first, and have no meaning. 273 if (!Failed) 274 printError(SMLoc::getFromPointer(Current), SourceMgr::DK_Error, Message); 275 Failed = true; 276 } 277 278 void setError(const Twine &Message) { 279 setError(Message, Current); 280 } 281 282 /// @brief Returns true if an error occurred while parsing. 
283 bool failed() { 284 return Failed; 285 } 286 287 private: 288 StringRef currentInput() { 289 return StringRef(Current, End - Current); 290 } 291 292 /// @brief Decode a UTF-8 minimal well-formed code unit subsequence starting 293 /// at \a Position. 294 /// 295 /// If the UTF-8 code units starting at Position do not form a well-formed 296 /// code unit subsequence, then the Unicode scalar value is 0, and the length 297 /// is 0. 298 UTF8Decoded decodeUTF8(StringRef::iterator Position) { 299 return ::decodeUTF8(StringRef(Position, End - Position)); 300 } 301 302 // The following functions are based on the gramar rules in the YAML spec. The 303 // style of the function names it meant to closely match how they are written 304 // in the spec. The number within the [] is the number of the grammar rule in 305 // the spec. 306 // 307 // See 4.2 [Production Naming Conventions] for the meaning of the prefixes. 308 // 309 // c- 310 // A production starting and ending with a special character. 311 // b- 312 // A production matching a single line break. 313 // nb- 314 // A production starting and ending with a non-break character. 315 // s- 316 // A production starting and ending with a white space character. 317 // ns- 318 // A production starting and ending with a non-space character. 319 // l- 320 // A production matching complete line(s). 321 322 /// @brief Skip a single nb-char[27] starting at Position. 323 /// 324 /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE] 325 /// | [0xFF00-0xFFFD] | [0x10000-0x10FFFF] 326 /// 327 /// @returns The code unit after the nb-char, or Position if it's not an 328 /// nb-char. 329 StringRef::iterator skip_nb_char(StringRef::iterator Position); 330 331 /// @brief Skip a single b-break[28] starting at Position. 332 /// 333 /// A b-break is 0xD 0xA | 0xD | 0xA 334 /// 335 /// @returns The code unit after the b-break, or Position if it's not a 336 /// b-break. 
337 StringRef::iterator skip_b_break(StringRef::iterator Position); 338 339 /// @brief Skip a single s-white[33] starting at Position. 340 /// 341 /// A s-white is 0x20 | 0x9 342 /// 343 /// @returns The code unit after the s-white, or Position if it's not a 344 /// s-white. 345 StringRef::iterator skip_s_white(StringRef::iterator Position); 346 347 /// @brief Skip a single ns-char[34] starting at Position. 348 /// 349 /// A ns-char is nb-char - s-white 350 /// 351 /// @returns The code unit after the ns-char, or Position if it's not a 352 /// ns-char. 353 StringRef::iterator skip_ns_char(StringRef::iterator Position); 354 355 typedef StringRef::iterator (Scanner::*SkipWhileFunc)(StringRef::iterator); 356 /// @brief Skip minimal well-formed code unit subsequences until Func 357 /// returns its input. 358 /// 359 /// @returns The code unit after the last minimal well-formed code unit 360 /// subsequence that Func accepted. 361 StringRef::iterator skip_while( SkipWhileFunc Func 362 , StringRef::iterator Position); 363 364 /// @brief Scan ns-uri-char[39]s starting at Cur. 365 /// 366 /// This updates Cur and Column while scanning. 367 /// 368 /// @returns A StringRef starting at Cur which covers the longest contiguous 369 /// sequence of ns-uri-char. 370 StringRef scan_ns_uri_char(); 371 372 /// @brief Scan ns-plain-one-line[133] starting at \a Cur. 373 StringRef scan_ns_plain_one_line(); 374 375 /// @brief Consume a minimal well-formed code unit subsequence starting at 376 /// \a Cur. Return false if it is not the same Unicode scalar value as 377 /// \a Expected. This updates \a Column. 378 bool consume(uint32_t Expected); 379 380 /// @brief Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column. 
381 void skip(uint32_t Distance); 382 383 /// @brief Return true if the minimal well-formed code unit subsequence at 384 /// Pos is whitespace or a new line 385 bool isBlankOrBreak(StringRef::iterator Position); 386 387 /// @brief If IsSimpleKeyAllowed, create and push_back a new SimpleKey. 388 void saveSimpleKeyCandidate( TokenQueueT::iterator Tok 389 , unsigned AtColumn 390 , bool IsRequired); 391 392 /// @brief Remove simple keys that can no longer be valid simple keys. 393 /// 394 /// Invalid simple keys are not on the current line or are further than 1024 395 /// columns back. 396 void removeStaleSimpleKeyCandidates(); 397 398 /// @brief Remove all simple keys on FlowLevel \a Level. 399 void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level); 400 401 /// @brief Unroll indentation in \a Indents back to \a Col. Creates BlockEnd 402 /// tokens if needed. 403 bool unrollIndent(int ToColumn); 404 405 /// @brief Increase indent to \a Col. Creates \a Kind token at \a InsertPoint 406 /// if needed. 407 bool rollIndent( int ToColumn 408 , Token::TokenKind Kind 409 , TokenQueueT::iterator InsertPoint); 410 411 /// @brief Skip whitespace and comments until the start of the next token. 412 void scanToNextToken(); 413 414 /// @brief Must be the first token generated. 415 bool scanStreamStart(); 416 417 /// @brief Generate tokens needed to close out the stream. 418 bool scanStreamEnd(); 419 420 /// @brief Scan a %BLAH directive. 421 bool scanDirective(); 422 423 /// @brief Scan a ... or ---. 424 bool scanDocumentIndicator(bool IsStart); 425 426 /// @brief Scan a [ or { and generate the proper flow collection start token. 427 bool scanFlowCollectionStart(bool IsSequence); 428 429 /// @brief Scan a ] or } and generate the proper flow collection end token. 430 bool scanFlowCollectionEnd(bool IsSequence); 431 432 /// @brief Scan the , that separates entries in a flow collection. 433 bool scanFlowEntry(); 434 435 /// @brief Scan the - that starts block sequence entries. 
436 bool scanBlockEntry(); 437 438 /// @brief Scan an explicit ? indicating a key. 439 bool scanKey(); 440 441 /// @brief Scan an explicit : indicating a value. 442 bool scanValue(); 443 444 /// @brief Scan a quoted scalar. 445 bool scanFlowScalar(bool IsDoubleQuoted); 446 447 /// @brief Scan an unquoted scalar. 448 bool scanPlainScalar(); 449 450 /// @brief Scan an Alias or Anchor starting with * or &. 451 bool scanAliasOrAnchor(bool IsAlias); 452 453 /// @brief Scan a block scalar starting with | or >. 454 bool scanBlockScalar(bool IsLiteral); 455 456 /// @brief Scan a tag of the form !stuff. 457 bool scanTag(); 458 459 /// @brief Dispatch to the next scanning function based on \a *Cur. 460 bool fetchMoreTokens(); 461 462 /// @brief The SourceMgr used for diagnostics and buffer management. 463 SourceMgr &SM; 464 465 /// @brief The original input. 466 MemoryBuffer *InputBuffer; 467 468 /// @brief The current position of the scanner. 469 StringRef::iterator Current; 470 471 /// @brief The end of the input (one past the last character). 472 StringRef::iterator End; 473 474 /// @brief Current YAML indentation level in spaces. 475 int Indent; 476 477 /// @brief Current column number in Unicode code points. 478 unsigned Column; 479 480 /// @brief Current line number. 481 unsigned Line; 482 483 /// @brief How deep we are in flow style containers. 0 Means at block level. 484 unsigned FlowLevel; 485 486 /// @brief Are we at the start of the stream? 487 bool IsStartOfStream; 488 489 /// @brief Can the next token be the start of a simple key? 490 bool IsSimpleKeyAllowed; 491 492 /// @brief True if an error has occurred. 493 bool Failed; 494 495 /// @brief Queue of tokens. This is required to queue up tokens while looking 496 /// for the end of a simple key. And for cases where a single character 497 /// can produce multiple tokens (e.g. BlockEnd). 498 TokenQueueT TokenQueue; 499 500 /// @brief Indentation levels. 
501 SmallVector<int, 4> Indents; 502 503 /// @brief Potential simple keys. 504 SmallVector<SimpleKey, 4> SimpleKeys; 505 }; 506 507 } // end namespace yaml 508 } // end namespace llvm 509 510 /// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result. 511 static void encodeUTF8( uint32_t UnicodeScalarValue 512 , SmallVectorImpl<char> &Result) { 513 if (UnicodeScalarValue <= 0x7F) { 514 Result.push_back(UnicodeScalarValue & 0x7F); 515 } else if (UnicodeScalarValue <= 0x7FF) { 516 uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6); 517 uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F); 518 Result.push_back(FirstByte); 519 Result.push_back(SecondByte); 520 } else if (UnicodeScalarValue <= 0xFFFF) { 521 uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12); 522 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 523 uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F); 524 Result.push_back(FirstByte); 525 Result.push_back(SecondByte); 526 Result.push_back(ThirdByte); 527 } else if (UnicodeScalarValue <= 0x10FFFF) { 528 uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18); 529 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12); 530 uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 531 uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F); 532 Result.push_back(FirstByte); 533 Result.push_back(SecondByte); 534 Result.push_back(ThirdByte); 535 Result.push_back(FourthByte); 536 } 537 } 538 539 bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) { 540 SourceMgr SM; 541 Scanner scanner(Input, SM); 542 while (true) { 543 Token T = scanner.getNext(); 544 switch (T.Kind) { 545 case Token::TK_StreamStart: 546 OS << "Stream-Start: "; 547 break; 548 case Token::TK_StreamEnd: 549 OS << "Stream-End: "; 550 break; 551 case Token::TK_VersionDirective: 552 OS << "Version-Directive: "; 553 break; 554 case Token::TK_TagDirective: 555 OS << "Tag-Directive: "; 556 break; 557 
case Token::TK_DocumentStart: 558 OS << "Document-Start: "; 559 break; 560 case Token::TK_DocumentEnd: 561 OS << "Document-End: "; 562 break; 563 case Token::TK_BlockEntry: 564 OS << "Block-Entry: "; 565 break; 566 case Token::TK_BlockEnd: 567 OS << "Block-End: "; 568 break; 569 case Token::TK_BlockSequenceStart: 570 OS << "Block-Sequence-Start: "; 571 break; 572 case Token::TK_BlockMappingStart: 573 OS << "Block-Mapping-Start: "; 574 break; 575 case Token::TK_FlowEntry: 576 OS << "Flow-Entry: "; 577 break; 578 case Token::TK_FlowSequenceStart: 579 OS << "Flow-Sequence-Start: "; 580 break; 581 case Token::TK_FlowSequenceEnd: 582 OS << "Flow-Sequence-End: "; 583 break; 584 case Token::TK_FlowMappingStart: 585 OS << "Flow-Mapping-Start: "; 586 break; 587 case Token::TK_FlowMappingEnd: 588 OS << "Flow-Mapping-End: "; 589 break; 590 case Token::TK_Key: 591 OS << "Key: "; 592 break; 593 case Token::TK_Value: 594 OS << "Value: "; 595 break; 596 case Token::TK_Scalar: 597 OS << "Scalar: "; 598 break; 599 case Token::TK_Alias: 600 OS << "Alias: "; 601 break; 602 case Token::TK_Anchor: 603 OS << "Anchor: "; 604 break; 605 case Token::TK_Tag: 606 OS << "Tag: "; 607 break; 608 case Token::TK_Error: 609 break; 610 } 611 OS << T.Range << "\n"; 612 if (T.Kind == Token::TK_StreamEnd) 613 break; 614 else if (T.Kind == Token::TK_Error) 615 return false; 616 } 617 return true; 618 } 619 620 bool yaml::scanTokens(StringRef Input) { 621 llvm::SourceMgr SM; 622 llvm::yaml::Scanner scanner(Input, SM); 623 for (;;) { 624 llvm::yaml::Token T = scanner.getNext(); 625 if (T.Kind == Token::TK_StreamEnd) 626 break; 627 else if (T.Kind == Token::TK_Error) 628 return false; 629 } 630 return true; 631 } 632 633 std::string yaml::escape(StringRef Input) { 634 std::string EscapedInput; 635 for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) { 636 if (*i == '\\') 637 EscapedInput += "\\\\"; 638 else if (*i == '"') 639 EscapedInput += "\\\""; 640 else if (*i == 0) 641 
EscapedInput += "\\0"; 642 else if (*i == 0x07) 643 EscapedInput += "\\a"; 644 else if (*i == 0x08) 645 EscapedInput += "\\b"; 646 else if (*i == 0x09) 647 EscapedInput += "\\t"; 648 else if (*i == 0x0A) 649 EscapedInput += "\\n"; 650 else if (*i == 0x0B) 651 EscapedInput += "\\v"; 652 else if (*i == 0x0C) 653 EscapedInput += "\\f"; 654 else if (*i == 0x0D) 655 EscapedInput += "\\r"; 656 else if (*i == 0x1B) 657 EscapedInput += "\\e"; 658 else if ((unsigned char)*i < 0x20) { // Control characters not handled above. 659 std::string HexStr = utohexstr(*i); 660 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 661 } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence. 662 UTF8Decoded UnicodeScalarValue 663 = decodeUTF8(StringRef(i, Input.end() - i)); 664 if (UnicodeScalarValue.second == 0) { 665 // Found invalid char. 666 SmallString<4> Val; 667 encodeUTF8(0xFFFD, Val); 668 EscapedInput.insert(EscapedInput.end(), Val.begin(), Val.end()); 669 // FIXME: Error reporting. 
670 return EscapedInput; 671 } 672 if (UnicodeScalarValue.first == 0x85) 673 EscapedInput += "\\N"; 674 else if (UnicodeScalarValue.first == 0xA0) 675 EscapedInput += "\\_"; 676 else if (UnicodeScalarValue.first == 0x2028) 677 EscapedInput += "\\L"; 678 else if (UnicodeScalarValue.first == 0x2029) 679 EscapedInput += "\\P"; 680 else { 681 std::string HexStr = utohexstr(UnicodeScalarValue.first); 682 if (HexStr.size() <= 2) 683 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 684 else if (HexStr.size() <= 4) 685 EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr; 686 else if (HexStr.size() <= 8) 687 EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr; 688 } 689 i += UnicodeScalarValue.second - 1; 690 } else 691 EscapedInput.push_back(*i); 692 } 693 return EscapedInput; 694 } 695 696 Scanner::Scanner(StringRef Input, SourceMgr &sm) 697 : SM(sm) 698 , Indent(-1) 699 , Column(0) 700 , Line(0) 701 , FlowLevel(0) 702 , IsStartOfStream(true) 703 , IsSimpleKeyAllowed(true) 704 , Failed(false) { 705 InputBuffer = MemoryBuffer::getMemBuffer(Input, "YAML"); 706 SM.AddNewSourceBuffer(InputBuffer, SMLoc()); 707 Current = InputBuffer->getBufferStart(); 708 End = InputBuffer->getBufferEnd(); 709 } 710 711 Scanner::Scanner(MemoryBuffer *Buffer, SourceMgr &SM_) 712 : SM(SM_) 713 , InputBuffer(Buffer) 714 , Current(InputBuffer->getBufferStart()) 715 , End(InputBuffer->getBufferEnd()) 716 , Indent(-1) 717 , Column(0) 718 , Line(0) 719 , FlowLevel(0) 720 , IsStartOfStream(true) 721 , IsSimpleKeyAllowed(true) 722 , Failed(false) { 723 SM.AddNewSourceBuffer(InputBuffer, SMLoc()); 724 } 725 726 Token &Scanner::peekNext() { 727 // If the current token is a possible simple key, keep parsing until we 728 // can confirm. 
729 bool NeedMore = false; 730 while (true) { 731 if (TokenQueue.empty() || NeedMore) { 732 if (!fetchMoreTokens()) { 733 TokenQueue.clear(); 734 TokenQueue.push_back(Token()); 735 return TokenQueue.front(); 736 } 737 } 738 assert(!TokenQueue.empty() && 739 "fetchMoreTokens lied about getting tokens!"); 740 741 removeStaleSimpleKeyCandidates(); 742 SimpleKey SK; 743 SK.Tok = TokenQueue.front(); 744 if (std::find(SimpleKeys.begin(), SimpleKeys.end(), SK) 745 == SimpleKeys.end()) 746 break; 747 else 748 NeedMore = true; 749 } 750 return TokenQueue.front(); 751 } 752 753 Token Scanner::getNext() { 754 Token Ret = peekNext(); 755 // TokenQueue can be empty if there was an error getting the next token. 756 if (!TokenQueue.empty()) 757 TokenQueue.pop_front(); 758 759 // There cannot be any referenced Token's if the TokenQueue is empty. So do a 760 // quick deallocation of them all. 761 if (TokenQueue.empty()) { 762 TokenQueue.Alloc.Reset(); 763 } 764 765 return Ret; 766 } 767 768 StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) { 769 if (Position == End) 770 return Position; 771 // Check 7 bit c-printable - b-char. 772 if ( *Position == 0x09 773 || (*Position >= 0x20 && *Position <= 0x7E)) 774 return Position + 1; 775 776 // Check for valid UTF-8. 
777 if (uint8_t(*Position) & 0x80) { 778 UTF8Decoded u8d = decodeUTF8(Position); 779 if ( u8d.second != 0 780 && u8d.first != 0xFEFF 781 && ( u8d.first == 0x85 782 || ( u8d.first >= 0xA0 783 && u8d.first <= 0xD7FF) 784 || ( u8d.first >= 0xE000 785 && u8d.first <= 0xFFFD) 786 || ( u8d.first >= 0x10000 787 && u8d.first <= 0x10FFFF))) 788 return Position + u8d.second; 789 } 790 return Position; 791 } 792 793 StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) { 794 if (Position == End) 795 return Position; 796 if (*Position == 0x0D) { 797 if (Position + 1 != End && *(Position + 1) == 0x0A) 798 return Position + 2; 799 return Position + 1; 800 } 801 802 if (*Position == 0x0A) 803 return Position + 1; 804 return Position; 805 } 806 807 808 StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) { 809 if (Position == End) 810 return Position; 811 if (*Position == ' ' || *Position == '\t') 812 return Position + 1; 813 return Position; 814 } 815 816 StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) { 817 if (Position == End) 818 return Position; 819 if (*Position == ' ' || *Position == '\t') 820 return Position; 821 return skip_nb_char(Position); 822 } 823 824 StringRef::iterator Scanner::skip_while( SkipWhileFunc Func 825 , StringRef::iterator Position) { 826 while (true) { 827 StringRef::iterator i = (this->*Func)(Position); 828 if (i == Position) 829 break; 830 Position = i; 831 } 832 return Position; 833 } 834 835 static bool is_ns_hex_digit(const char C) { 836 return (C >= '0' && C <= '9') 837 || (C >= 'a' && C <= 'z') 838 || (C >= 'A' && C <= 'Z'); 839 } 840 841 static bool is_ns_word_char(const char C) { 842 return C == '-' 843 || (C >= 'a' && C <= 'z') 844 || (C >= 'A' && C <= 'Z'); 845 } 846 847 StringRef Scanner::scan_ns_uri_char() { 848 StringRef::iterator Start = Current; 849 while (true) { 850 if (Current == End) 851 break; 852 if (( *Current == '%' 853 && Current + 2 < End 854 && 
is_ns_hex_digit(*(Current + 1)) 855 && is_ns_hex_digit(*(Current + 2))) 856 || is_ns_word_char(*Current) 857 || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]") 858 != StringRef::npos) { 859 ++Current; 860 ++Column; 861 } else 862 break; 863 } 864 return StringRef(Start, Current - Start); 865 } 866 867 StringRef Scanner::scan_ns_plain_one_line() { 868 StringRef::iterator start = Current; 869 // The first character must already be verified. 870 ++Current; 871 while (true) { 872 if (Current == End) { 873 break; 874 } else if (*Current == ':') { 875 // Check if the next character is a ns-char. 876 if (Current + 1 == End) 877 break; 878 StringRef::iterator i = skip_ns_char(Current + 1); 879 if (Current + 1 != i) { 880 Current = i; 881 Column += 2; // Consume both the ':' and ns-char. 882 } else 883 break; 884 } else if (*Current == '#') { 885 // Check if the previous character was a ns-char. 886 // The & 0x80 check is to check for the trailing byte of a utf-8 887 if (*(Current - 1) & 0x80 || skip_ns_char(Current - 1) == Current) { 888 ++Current; 889 ++Column; 890 } else 891 break; 892 } else { 893 StringRef::iterator i = skip_nb_char(Current); 894 if (i == Current) 895 break; 896 Current = i; 897 ++Column; 898 } 899 } 900 return StringRef(start, Current - start); 901 } 902 903 bool Scanner::consume(uint32_t Expected) { 904 if (Expected >= 0x80) 905 report_fatal_error("Not dealing with this yet"); 906 if (Current == End) 907 return false; 908 if (uint8_t(*Current) >= 0x80) 909 report_fatal_error("Not dealing with this yet"); 910 if (uint8_t(*Current) == Expected) { 911 ++Current; 912 ++Column; 913 return true; 914 } 915 return false; 916 } 917 918 void Scanner::skip(uint32_t Distance) { 919 Current += Distance; 920 Column += Distance; 921 assert(Current <= End && "Skipped past the end"); 922 } 923 924 bool Scanner::isBlankOrBreak(StringRef::iterator Position) { 925 if (Position == End) 926 return false; 927 if ( *Position == ' ' || *Position == '\t' 928 || 
*Position == '\r' || *Position == '\n') 929 return true; 930 return false; 931 } 932 933 void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok 934 , unsigned AtColumn 935 , bool IsRequired) { 936 if (IsSimpleKeyAllowed) { 937 SimpleKey SK; 938 SK.Tok = Tok; 939 SK.Line = Line; 940 SK.Column = AtColumn; 941 SK.IsRequired = IsRequired; 942 SK.FlowLevel = FlowLevel; 943 SimpleKeys.push_back(SK); 944 } 945 } 946 947 void Scanner::removeStaleSimpleKeyCandidates() { 948 for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin(); 949 i != SimpleKeys.end();) { 950 if (i->Line != Line || i->Column + 1024 < Column) { 951 if (i->IsRequired) 952 setError( "Could not find expected : for simple key" 953 , i->Tok->Range.begin()); 954 i = SimpleKeys.erase(i); 955 } else 956 ++i; 957 } 958 } 959 960 void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) { 961 if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level) 962 SimpleKeys.pop_back(); 963 } 964 965 bool Scanner::unrollIndent(int ToColumn) { 966 Token T; 967 // Indentation is ignored in flow. 968 if (FlowLevel != 0) 969 return true; 970 971 while (Indent > ToColumn) { 972 T.Kind = Token::TK_BlockEnd; 973 T.Range = StringRef(Current, 1); 974 TokenQueue.push_back(T); 975 Indent = Indents.pop_back_val(); 976 } 977 978 return true; 979 } 980 981 bool Scanner::rollIndent( int ToColumn 982 , Token::TokenKind Kind 983 , TokenQueueT::iterator InsertPoint) { 984 if (FlowLevel) 985 return true; 986 if (Indent < ToColumn) { 987 Indents.push_back(Indent); 988 Indent = ToColumn; 989 990 Token T; 991 T.Kind = Kind; 992 T.Range = StringRef(Current, 0); 993 TokenQueue.insert(InsertPoint, T); 994 } 995 return true; 996 } 997 998 void Scanner::scanToNextToken() { 999 while (true) { 1000 while (*Current == ' ' || *Current == '\t') { 1001 skip(1); 1002 } 1003 1004 // Skip comment. 
1005 if (*Current == '#') { 1006 while (true) { 1007 // This may skip more than one byte, thus Column is only incremented 1008 // for code points. 1009 StringRef::iterator i = skip_nb_char(Current); 1010 if (i == Current) 1011 break; 1012 Current = i; 1013 ++Column; 1014 } 1015 } 1016 1017 // Skip EOL. 1018 StringRef::iterator i = skip_b_break(Current); 1019 if (i == Current) 1020 break; 1021 Current = i; 1022 ++Line; 1023 Column = 0; 1024 // New lines may start a simple key. 1025 if (!FlowLevel) 1026 IsSimpleKeyAllowed = true; 1027 } 1028 } 1029 1030 bool Scanner::scanStreamStart() { 1031 IsStartOfStream = false; 1032 1033 EncodingInfo EI = getUnicodeEncoding(currentInput()); 1034 1035 Token T; 1036 T.Kind = Token::TK_StreamStart; 1037 T.Range = StringRef(Current, EI.second); 1038 TokenQueue.push_back(T); 1039 Current += EI.second; 1040 return true; 1041 } 1042 1043 bool Scanner::scanStreamEnd() { 1044 // Force an ending new line if one isn't present. 1045 if (Column != 0) { 1046 Column = 0; 1047 ++Line; 1048 } 1049 1050 unrollIndent(-1); 1051 SimpleKeys.clear(); 1052 IsSimpleKeyAllowed = false; 1053 1054 Token T; 1055 T.Kind = Token::TK_StreamEnd; 1056 T.Range = StringRef(Current, 0); 1057 TokenQueue.push_back(T); 1058 return true; 1059 } 1060 1061 bool Scanner::scanDirective() { 1062 // Reset the indentation level. 
1063 unrollIndent(-1); 1064 SimpleKeys.clear(); 1065 IsSimpleKeyAllowed = false; 1066 1067 StringRef::iterator Start = Current; 1068 consume('%'); 1069 StringRef::iterator NameStart = Current; 1070 Current = skip_while(&Scanner::skip_ns_char, Current); 1071 StringRef Name(NameStart, Current - NameStart); 1072 Current = skip_while(&Scanner::skip_s_white, Current); 1073 1074 if (Name == "YAML") { 1075 Current = skip_while(&Scanner::skip_ns_char, Current); 1076 Token T; 1077 T.Kind = Token::TK_VersionDirective; 1078 T.Range = StringRef(Start, Current - Start); 1079 TokenQueue.push_back(T); 1080 return true; 1081 } 1082 return false; 1083 } 1084 1085 bool Scanner::scanDocumentIndicator(bool IsStart) { 1086 unrollIndent(-1); 1087 SimpleKeys.clear(); 1088 IsSimpleKeyAllowed = false; 1089 1090 Token T; 1091 T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd; 1092 T.Range = StringRef(Current, 3); 1093 skip(3); 1094 TokenQueue.push_back(T); 1095 return true; 1096 } 1097 1098 bool Scanner::scanFlowCollectionStart(bool IsSequence) { 1099 Token T; 1100 T.Kind = IsSequence ? Token::TK_FlowSequenceStart 1101 : Token::TK_FlowMappingStart; 1102 T.Range = StringRef(Current, 1); 1103 skip(1); 1104 TokenQueue.push_back(T); 1105 1106 // [ and { may begin a simple key. 1107 saveSimpleKeyCandidate(TokenQueue.back(), Column - 1, false); 1108 1109 // And may also be followed by a simple key. 1110 IsSimpleKeyAllowed = true; 1111 ++FlowLevel; 1112 return true; 1113 } 1114 1115 bool Scanner::scanFlowCollectionEnd(bool IsSequence) { 1116 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1117 IsSimpleKeyAllowed = false; 1118 Token T; 1119 T.Kind = IsSequence ? 
Token::TK_FlowSequenceEnd 1120 : Token::TK_FlowMappingEnd; 1121 T.Range = StringRef(Current, 1); 1122 skip(1); 1123 TokenQueue.push_back(T); 1124 if (FlowLevel) 1125 --FlowLevel; 1126 return true; 1127 } 1128 1129 bool Scanner::scanFlowEntry() { 1130 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1131 IsSimpleKeyAllowed = true; 1132 Token T; 1133 T.Kind = Token::TK_FlowEntry; 1134 T.Range = StringRef(Current, 1); 1135 skip(1); 1136 TokenQueue.push_back(T); 1137 return true; 1138 } 1139 1140 bool Scanner::scanBlockEntry() { 1141 rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end()); 1142 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1143 IsSimpleKeyAllowed = true; 1144 Token T; 1145 T.Kind = Token::TK_BlockEntry; 1146 T.Range = StringRef(Current, 1); 1147 skip(1); 1148 TokenQueue.push_back(T); 1149 return true; 1150 } 1151 1152 bool Scanner::scanKey() { 1153 if (!FlowLevel) 1154 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1155 1156 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1157 IsSimpleKeyAllowed = !FlowLevel; 1158 1159 Token T; 1160 T.Kind = Token::TK_Key; 1161 T.Range = StringRef(Current, 1); 1162 skip(1); 1163 TokenQueue.push_back(T); 1164 return true; 1165 } 1166 1167 bool Scanner::scanValue() { 1168 // If the previous token could have been a simple key, insert the key token 1169 // into the token queue. 1170 if (!SimpleKeys.empty()) { 1171 SimpleKey SK = SimpleKeys.pop_back_val(); 1172 Token T; 1173 T.Kind = Token::TK_Key; 1174 T.Range = SK.Tok->Range; 1175 TokenQueueT::iterator i, e; 1176 for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) { 1177 if (i == SK.Tok) 1178 break; 1179 } 1180 assert(i != e && "SimpleKey not in token queue!"); 1181 i = TokenQueue.insert(i, T); 1182 1183 // We may also need to add a Block-Mapping-Start token. 
1184 rollIndent(SK.Column, Token::TK_BlockMappingStart, i); 1185 1186 IsSimpleKeyAllowed = false; 1187 } else { 1188 if (!FlowLevel) 1189 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1190 IsSimpleKeyAllowed = !FlowLevel; 1191 } 1192 1193 Token T; 1194 T.Kind = Token::TK_Value; 1195 T.Range = StringRef(Current, 1); 1196 skip(1); 1197 TokenQueue.push_back(T); 1198 return true; 1199 } 1200 1201 // Forbidding inlining improves performance by roughly 20%. 1202 // FIXME: Remove once llvm optimizes this to the faster version without hints. 1203 LLVM_ATTRIBUTE_NOINLINE static bool 1204 wasEscaped(StringRef::iterator First, StringRef::iterator Position); 1205 1206 // Returns whether a character at 'Position' was escaped with a leading '\'. 1207 // 'First' specifies the position of the first character in the string. 1208 static bool wasEscaped(StringRef::iterator First, 1209 StringRef::iterator Position) { 1210 assert(Position - 1 >= First); 1211 StringRef::iterator I = Position - 1; 1212 // We calculate the number of consecutive '\'s before the current position 1213 // by iterating backwards through our string. 1214 while (I >= First && *I == '\\') --I; 1215 // (Position - 1 - I) now contains the number of '\'s before the current 1216 // position. If it is odd, the character at 'Position' was escaped. 1217 return (Position - 1 - I) % 2 == 1; 1218 } 1219 1220 bool Scanner::scanFlowScalar(bool IsDoubleQuoted) { 1221 StringRef::iterator Start = Current; 1222 unsigned ColStart = Column; 1223 if (IsDoubleQuoted) { 1224 do { 1225 ++Current; 1226 while (Current != End && *Current != '"') 1227 ++Current; 1228 // Repeat until the previous character was not a '\' or was an escaped 1229 // backslash. 1230 } while ( Current != End 1231 && *(Current - 1) == '\\' 1232 && wasEscaped(Start + 1, Current)); 1233 } else { 1234 skip(1); 1235 while (true) { 1236 // Skip a ' followed by another '. 
/// scanPlainScalar - Scan an unquoted (plain) scalar starting at Current and
/// queue it as a Token::TK_Scalar.
///
/// The scalar is scanned as alternating runs of non-blank characters and
/// blanks/line breaks. Scanning stops at a '#', at a ':' followed by a blank
/// or break, at any of ",:?[]{}" inside a flow collection, or when a
/// continuation line in block context is indented less than required.
///
/// @returns true if a scalar token was queued; false after recording an error
///          (unexpected ':' in flow context, tab in indentation, or an empty
///          scalar).
bool Scanner::scanPlainScalar() {
  StringRef::iterator Start = Current;
  unsigned ColStart = Column;
  // Set to 1 once a line break has been consumed inside the scalar.
  unsigned LeadingBlanks = 0;
  assert(Indent >= -1 && "Indent must be >= -1 !");
  // Minimum column a continuation line must reach to stay part of the scalar.
  unsigned indent = static_cast<unsigned>(Indent + 1);
  while (true) {
    // A '#' at the start of a run ends the scalar.
    if (*Current == '#')
      break;

    // Consume one run of non-blank characters.
    while (!isBlankOrBreak(Current)) {
      // In flow context a ':' must be followed by a blank, a break or ','.
      if (  FlowLevel && *Current == ':'
          && !(isBlankOrBreak(Current + 1) || *(Current + 1) == ',')) {
        setError("Found unexpected ':' while scanning a plain scalar", Current);
        return false;
      }

      // Check for the end of the plain scalar.
      if (  (*Current == ':' && isBlankOrBreak(Current + 1))
          || (  FlowLevel
              && (StringRef(Current, 1).find_first_of(",:?[]{}")
                  != StringRef::npos)))
        break;

      // skip_nb_char returns its argument unchanged when it cannot advance.
      StringRef::iterator i = skip_nb_char(Current);
      if (i == Current)
        break;
      Current = i;
      ++Column;
    }

    // Are we at the end?
    if (!isBlankOrBreak(Current))
      break;

    // Eat blanks.
    // Tmp looks ahead without committing: Current is only moved to Tmp at the
    // bottom of the loop, once the indentation check has passed.
    StringRef::iterator Tmp = Current;
    while (isBlankOrBreak(Tmp)) {
      StringRef::iterator i = skip_s_white(Tmp);
      if (i != Tmp) {
        // White space: a tab is rejected when it occurs after a line break and
        // before the required indentation column.
        if (LeadingBlanks && (Column < indent) && *Tmp == '\t') {
          setError("Found invalid tab character in indentation", Tmp);
          return false;
        }
        Tmp = i;
        ++Column;
      } else {
        // Not white space, so it must be a line break.
        i = skip_b_break(Tmp);
        if (!LeadingBlanks)
          LeadingBlanks = 1;
        Tmp = i;
        Column = 0;
        ++Line;
      }
    }

    // In block context an under-indented continuation line ends the scalar.
    // NOTE(review): Column and Line were already advanced by the look-ahead
    // above even though Current is left untouched on this path - confirm the
    // caller re-synchronizes them.
    if (!FlowLevel && Column < indent)
      break;

    Current = Tmp;
  }
  if (Start == Current) {
    setError("Got empty plain scalar", Start);
    return false;
  }
  Token T;
  T.Kind = Token::TK_Scalar;
  T.Range = StringRef(Start, Current - Start);
  TokenQueue.push_back(T);

  // Plain scalars can be simple keys.
  saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false);

  IsSimpleKeyAllowed = false;

  return true;
}
1383 saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); 1384 1385 IsSimpleKeyAllowed = false; 1386 1387 return true; 1388 } 1389 1390 bool Scanner::scanBlockScalar(bool IsLiteral) { 1391 StringRef::iterator Start = Current; 1392 skip(1); // Eat | or > 1393 while(true) { 1394 StringRef::iterator i = skip_nb_char(Current); 1395 if (i == Current) { 1396 if (Column == 0) 1397 break; 1398 i = skip_b_break(Current); 1399 if (i != Current) { 1400 // We got a line break. 1401 Column = 0; 1402 ++Line; 1403 Current = i; 1404 continue; 1405 } else { 1406 // There was an error, which should already have been printed out. 1407 return false; 1408 } 1409 } 1410 Current = i; 1411 ++Column; 1412 } 1413 1414 if (Start == Current) { 1415 setError("Got empty block scalar", Start); 1416 return false; 1417 } 1418 1419 Token T; 1420 T.Kind = Token::TK_Scalar; 1421 T.Range = StringRef(Start, Current - Start); 1422 TokenQueue.push_back(T); 1423 return true; 1424 } 1425 1426 bool Scanner::scanTag() { 1427 StringRef::iterator Start = Current; 1428 unsigned ColStart = Column; 1429 skip(1); // Eat !. 1430 if (Current == End || isBlankOrBreak(Current)); // An empty tag. 1431 else if (*Current == '<') { 1432 skip(1); 1433 scan_ns_uri_char(); 1434 if (!consume('>')) 1435 return false; 1436 } else { 1437 // FIXME: Actually parse the c-ns-shorthand-tag rule. 1438 Current = skip_while(&Scanner::skip_ns_char, Current); 1439 } 1440 1441 Token T; 1442 T.Kind = Token::TK_Tag; 1443 T.Range = StringRef(Start, Current - Start); 1444 TokenQueue.push_back(T); 1445 1446 // Tags can be simple keys. 
1447 saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); 1448 1449 IsSimpleKeyAllowed = false; 1450 1451 return true; 1452 } 1453 1454 bool Scanner::fetchMoreTokens() { 1455 if (IsStartOfStream) 1456 return scanStreamStart(); 1457 1458 scanToNextToken(); 1459 1460 if (Current == End) 1461 return scanStreamEnd(); 1462 1463 removeStaleSimpleKeyCandidates(); 1464 1465 unrollIndent(Column); 1466 1467 if (Column == 0 && *Current == '%') 1468 return scanDirective(); 1469 1470 if (Column == 0 && Current + 4 <= End 1471 && *Current == '-' 1472 && *(Current + 1) == '-' 1473 && *(Current + 2) == '-' 1474 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1475 return scanDocumentIndicator(true); 1476 1477 if (Column == 0 && Current + 4 <= End 1478 && *Current == '.' 1479 && *(Current + 1) == '.' 1480 && *(Current + 2) == '.' 1481 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1482 return scanDocumentIndicator(false); 1483 1484 if (*Current == '[') 1485 return scanFlowCollectionStart(true); 1486 1487 if (*Current == '{') 1488 return scanFlowCollectionStart(false); 1489 1490 if (*Current == ']') 1491 return scanFlowCollectionEnd(true); 1492 1493 if (*Current == '}') 1494 return scanFlowCollectionEnd(false); 1495 1496 if (*Current == ',') 1497 return scanFlowEntry(); 1498 1499 if (*Current == '-' && isBlankOrBreak(Current + 1)) 1500 return scanBlockEntry(); 1501 1502 if (*Current == '?' 
&& (FlowLevel || isBlankOrBreak(Current + 1))) 1503 return scanKey(); 1504 1505 if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1))) 1506 return scanValue(); 1507 1508 if (*Current == '*') 1509 return scanAliasOrAnchor(true); 1510 1511 if (*Current == '&') 1512 return scanAliasOrAnchor(false); 1513 1514 if (*Current == '!') 1515 return scanTag(); 1516 1517 if (*Current == '|' && !FlowLevel) 1518 return scanBlockScalar(true); 1519 1520 if (*Current == '>' && !FlowLevel) 1521 return scanBlockScalar(false); 1522 1523 if (*Current == '\'') 1524 return scanFlowScalar(false); 1525 1526 if (*Current == '"') 1527 return scanFlowScalar(true); 1528 1529 // Get a plain scalar. 1530 StringRef FirstChar(Current, 1); 1531 if (!(isBlankOrBreak(Current) 1532 || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos) 1533 || (*Current == '-' && !isBlankOrBreak(Current + 1)) 1534 || (!FlowLevel && (*Current == '?' || *Current == ':') 1535 && isBlankOrBreak(Current + 1)) 1536 || (!FlowLevel && *Current == ':' 1537 && Current + 2 < End 1538 && *(Current + 1) == ':' 1539 && !isBlankOrBreak(Current + 2))) 1540 return scanPlainScalar(); 1541 1542 setError("Unrecognized character while tokenizing."); 1543 return false; 1544 } 1545 1546 Stream::Stream(StringRef Input, SourceMgr &SM) 1547 : scanner(new Scanner(Input, SM)) 1548 , CurrentDoc(0) {} 1549 1550 Stream::Stream(MemoryBuffer *InputBuffer, SourceMgr &SM) 1551 : scanner(new Scanner(InputBuffer, SM)) 1552 , CurrentDoc(0) {} 1553 1554 Stream::~Stream() {} 1555 1556 bool Stream::failed() { return scanner->failed(); } 1557 1558 void Stream::printError(Node *N, const Twine &Msg) { 1559 SmallVector<SMRange, 1> Ranges; 1560 Ranges.push_back(N->getSourceRange()); 1561 scanner->printError( N->getSourceRange().Start 1562 , SourceMgr::DK_Error 1563 , Msg 1564 , Ranges); 1565 } 1566 1567 void Stream::handleYAMLDirective(const Token &t) { 1568 // TODO: Ensure version is 1.x. 
1569 } 1570 1571 document_iterator Stream::begin() { 1572 if (CurrentDoc) 1573 report_fatal_error("Can only iterate over the stream once"); 1574 1575 // Skip Stream-Start. 1576 scanner->getNext(); 1577 1578 CurrentDoc.reset(new Document(*this)); 1579 return document_iterator(CurrentDoc); 1580 } 1581 1582 document_iterator Stream::end() { 1583 return document_iterator(); 1584 } 1585 1586 void Stream::skip() { 1587 for (document_iterator i = begin(), e = end(); i != e; ++i) 1588 i->skip(); 1589 } 1590 1591 Node::Node(unsigned int Type, OwningPtr<Document> &D, StringRef A) 1592 : Doc(D) 1593 , TypeID(Type) 1594 , Anchor(A) { 1595 SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin()); 1596 SourceRange = SMRange(Start, Start); 1597 } 1598 1599 Token &Node::peekNext() { 1600 return Doc->peekNext(); 1601 } 1602 1603 Token Node::getNext() { 1604 return Doc->getNext(); 1605 } 1606 1607 Node *Node::parseBlockNode() { 1608 return Doc->parseBlockNode(); 1609 } 1610 1611 BumpPtrAllocator &Node::getAllocator() { 1612 return Doc->NodeAllocator; 1613 } 1614 1615 void Node::setError(const Twine &Msg, Token &Tok) const { 1616 Doc->setError(Msg, Tok); 1617 } 1618 1619 bool Node::failed() const { 1620 return Doc->failed(); 1621 } 1622 1623 1624 1625 StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const { 1626 // TODO: Handle newlines properly. We need to remove leading whitespace. 1627 if (Value[0] == '"') { // Double quoted. 1628 // Pull off the leading and trailing "s. 1629 StringRef UnquotedValue = Value.substr(1, Value.size() - 2); 1630 // Search for characters that would require unescaping the value. 1631 StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n"); 1632 if (i != StringRef::npos) 1633 return unescapeDoubleQuoted(UnquotedValue, i, Storage); 1634 return UnquotedValue; 1635 } else if (Value[0] == '\'') { // Single quoted. 1636 // Pull off the leading and trailing 's. 
1637 StringRef UnquotedValue = Value.substr(1, Value.size() - 2); 1638 StringRef::size_type i = UnquotedValue.find('\''); 1639 if (i != StringRef::npos) { 1640 // We're going to need Storage. 1641 Storage.clear(); 1642 Storage.reserve(UnquotedValue.size()); 1643 for (; i != StringRef::npos; i = UnquotedValue.find('\'')) { 1644 StringRef Valid(UnquotedValue.begin(), i); 1645 Storage.insert(Storage.end(), Valid.begin(), Valid.end()); 1646 Storage.push_back('\''); 1647 UnquotedValue = UnquotedValue.substr(i + 2); 1648 } 1649 Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); 1650 return StringRef(Storage.begin(), Storage.size()); 1651 } 1652 return UnquotedValue; 1653 } 1654 // Plain or block. 1655 return Value.rtrim(" "); 1656 } 1657 1658 StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue 1659 , StringRef::size_type i 1660 , SmallVectorImpl<char> &Storage) 1661 const { 1662 // Use Storage to build proper value. 1663 Storage.clear(); 1664 Storage.reserve(UnquotedValue.size()); 1665 for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) { 1666 // Insert all previous chars into Storage. 1667 StringRef Valid(UnquotedValue.begin(), i); 1668 Storage.insert(Storage.end(), Valid.begin(), Valid.end()); 1669 // Chop off inserted chars. 1670 UnquotedValue = UnquotedValue.substr(i); 1671 1672 assert(!UnquotedValue.empty() && "Can't be empty!"); 1673 1674 // Parse escape or line break. 1675 switch (UnquotedValue[0]) { 1676 case '\r': 1677 case '\n': 1678 Storage.push_back('\n'); 1679 if ( UnquotedValue.size() > 1 1680 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 1681 UnquotedValue = UnquotedValue.substr(1); 1682 UnquotedValue = UnquotedValue.substr(1); 1683 break; 1684 default: 1685 if (UnquotedValue.size() == 1) 1686 // TODO: Report error. 
1687 break; 1688 UnquotedValue = UnquotedValue.substr(1); 1689 switch (UnquotedValue[0]) { 1690 default: { 1691 Token T; 1692 T.Range = StringRef(UnquotedValue.begin(), 1); 1693 setError("Unrecognized escape code!", T); 1694 return ""; 1695 } 1696 case '\r': 1697 case '\n': 1698 // Remove the new line. 1699 if ( UnquotedValue.size() > 1 1700 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 1701 UnquotedValue = UnquotedValue.substr(1); 1702 // If this was just a single byte newline, it will get skipped 1703 // below. 1704 break; 1705 case '0': 1706 Storage.push_back(0x00); 1707 break; 1708 case 'a': 1709 Storage.push_back(0x07); 1710 break; 1711 case 'b': 1712 Storage.push_back(0x08); 1713 break; 1714 case 't': 1715 case 0x09: 1716 Storage.push_back(0x09); 1717 break; 1718 case 'n': 1719 Storage.push_back(0x0A); 1720 break; 1721 case 'v': 1722 Storage.push_back(0x0B); 1723 break; 1724 case 'f': 1725 Storage.push_back(0x0C); 1726 break; 1727 case 'r': 1728 Storage.push_back(0x0D); 1729 break; 1730 case 'e': 1731 Storage.push_back(0x1B); 1732 break; 1733 case ' ': 1734 Storage.push_back(0x20); 1735 break; 1736 case '"': 1737 Storage.push_back(0x22); 1738 break; 1739 case '/': 1740 Storage.push_back(0x2F); 1741 break; 1742 case '\\': 1743 Storage.push_back(0x5C); 1744 break; 1745 case 'N': 1746 encodeUTF8(0x85, Storage); 1747 break; 1748 case '_': 1749 encodeUTF8(0xA0, Storage); 1750 break; 1751 case 'L': 1752 encodeUTF8(0x2028, Storage); 1753 break; 1754 case 'P': 1755 encodeUTF8(0x2029, Storage); 1756 break; 1757 case 'x': { 1758 if (UnquotedValue.size() < 3) 1759 // TODO: Report error. 1760 break; 1761 unsigned int UnicodeScalarValue; 1762 if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue)) 1763 // TODO: Report error. 1764 UnicodeScalarValue = 0xFFFD; 1765 encodeUTF8(UnicodeScalarValue, Storage); 1766 UnquotedValue = UnquotedValue.substr(2); 1767 break; 1768 } 1769 case 'u': { 1770 if (UnquotedValue.size() < 5) 1771 // TODO: Report error. 
1772 break; 1773 unsigned int UnicodeScalarValue; 1774 if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue)) 1775 // TODO: Report error. 1776 UnicodeScalarValue = 0xFFFD; 1777 encodeUTF8(UnicodeScalarValue, Storage); 1778 UnquotedValue = UnquotedValue.substr(4); 1779 break; 1780 } 1781 case 'U': { 1782 if (UnquotedValue.size() < 9) 1783 // TODO: Report error. 1784 break; 1785 unsigned int UnicodeScalarValue; 1786 if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue)) 1787 // TODO: Report error. 1788 UnicodeScalarValue = 0xFFFD; 1789 encodeUTF8(UnicodeScalarValue, Storage); 1790 UnquotedValue = UnquotedValue.substr(8); 1791 break; 1792 } 1793 } 1794 UnquotedValue = UnquotedValue.substr(1); 1795 } 1796 } 1797 Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); 1798 return StringRef(Storage.begin(), Storage.size()); 1799 } 1800 1801 Node *KeyValueNode::getKey() { 1802 if (Key) 1803 return Key; 1804 // Handle implicit null keys. 1805 { 1806 Token &t = peekNext(); 1807 if ( t.Kind == Token::TK_BlockEnd 1808 || t.Kind == Token::TK_Value 1809 || t.Kind == Token::TK_Error) { 1810 return Key = new (getAllocator()) NullNode(Doc); 1811 } 1812 if (t.Kind == Token::TK_Key) 1813 getNext(); // skip TK_Key. 1814 } 1815 1816 // Handle explicit null keys. 1817 Token &t = peekNext(); 1818 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) { 1819 return Key = new (getAllocator()) NullNode(Doc); 1820 } 1821 1822 // We've got a normal key. 1823 return Key = parseBlockNode(); 1824 } 1825 1826 Node *KeyValueNode::getValue() { 1827 if (Value) 1828 return Value; 1829 getKey()->skip(); 1830 if (failed()) 1831 return Value = new (getAllocator()) NullNode(Doc); 1832 1833 // Handle implicit null values. 
1834 { 1835 Token &t = peekNext(); 1836 if ( t.Kind == Token::TK_BlockEnd 1837 || t.Kind == Token::TK_FlowMappingEnd 1838 || t.Kind == Token::TK_Key 1839 || t.Kind == Token::TK_FlowEntry 1840 || t.Kind == Token::TK_Error) { 1841 return Value = new (getAllocator()) NullNode(Doc); 1842 } 1843 1844 if (t.Kind != Token::TK_Value) { 1845 setError("Unexpected token in Key Value.", t); 1846 return Value = new (getAllocator()) NullNode(Doc); 1847 } 1848 getNext(); // skip TK_Value. 1849 } 1850 1851 // Handle explicit null values. 1852 Token &t = peekNext(); 1853 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) { 1854 return Value = new (getAllocator()) NullNode(Doc); 1855 } 1856 1857 // We got a normal value. 1858 return Value = parseBlockNode(); 1859 } 1860 1861 void MappingNode::increment() { 1862 if (failed()) { 1863 IsAtEnd = true; 1864 CurrentEntry = 0; 1865 return; 1866 } 1867 if (CurrentEntry) { 1868 CurrentEntry->skip(); 1869 if (Type == MT_Inline) { 1870 IsAtEnd = true; 1871 CurrentEntry = 0; 1872 return; 1873 } 1874 } 1875 Token T = peekNext(); 1876 if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) { 1877 // KeyValueNode eats the TK_Key. That way it can detect null keys. 1878 CurrentEntry = new (getAllocator()) KeyValueNode(Doc); 1879 } else if (Type == MT_Block) { 1880 switch (T.Kind) { 1881 case Token::TK_BlockEnd: 1882 getNext(); 1883 IsAtEnd = true; 1884 CurrentEntry = 0; 1885 break; 1886 default: 1887 setError("Unexpected token. Expected Key or Block End", T); 1888 case Token::TK_Error: 1889 IsAtEnd = true; 1890 CurrentEntry = 0; 1891 } 1892 } else { 1893 switch (T.Kind) { 1894 case Token::TK_FlowEntry: 1895 // Eat the flow entry and recurse. 1896 getNext(); 1897 return increment(); 1898 case Token::TK_FlowMappingEnd: 1899 getNext(); 1900 case Token::TK_Error: 1901 // Set this to end iterator. 1902 IsAtEnd = true; 1903 CurrentEntry = 0; 1904 break; 1905 default: 1906 setError( "Unexpected token. 
Expected Key, Flow Entry, or Flow " 1907 "Mapping End." 1908 , T); 1909 IsAtEnd = true; 1910 CurrentEntry = 0; 1911 } 1912 } 1913 } 1914 1915 void SequenceNode::increment() { 1916 if (failed()) { 1917 IsAtEnd = true; 1918 CurrentEntry = 0; 1919 return; 1920 } 1921 if (CurrentEntry) 1922 CurrentEntry->skip(); 1923 Token T = peekNext(); 1924 if (SeqType == ST_Block) { 1925 switch (T.Kind) { 1926 case Token::TK_BlockEntry: 1927 getNext(); 1928 CurrentEntry = parseBlockNode(); 1929 if (CurrentEntry == 0) { // An error occurred. 1930 IsAtEnd = true; 1931 CurrentEntry = 0; 1932 } 1933 break; 1934 case Token::TK_BlockEnd: 1935 getNext(); 1936 IsAtEnd = true; 1937 CurrentEntry = 0; 1938 break; 1939 default: 1940 setError( "Unexpected token. Expected Block Entry or Block End." 1941 , T); 1942 case Token::TK_Error: 1943 IsAtEnd = true; 1944 CurrentEntry = 0; 1945 } 1946 } else if (SeqType == ST_Indentless) { 1947 switch (T.Kind) { 1948 case Token::TK_BlockEntry: 1949 getNext(); 1950 CurrentEntry = parseBlockNode(); 1951 if (CurrentEntry == 0) { // An error occurred. 1952 IsAtEnd = true; 1953 CurrentEntry = 0; 1954 } 1955 break; 1956 default: 1957 case Token::TK_Error: 1958 IsAtEnd = true; 1959 CurrentEntry = 0; 1960 } 1961 } else if (SeqType == ST_Flow) { 1962 switch (T.Kind) { 1963 case Token::TK_FlowEntry: 1964 // Eat the flow entry and recurse. 1965 getNext(); 1966 WasPreviousTokenFlowEntry = true; 1967 return increment(); 1968 case Token::TK_FlowSequenceEnd: 1969 getNext(); 1970 case Token::TK_Error: 1971 // Set this to end iterator. 1972 IsAtEnd = true; 1973 CurrentEntry = 0; 1974 break; 1975 case Token::TK_StreamEnd: 1976 case Token::TK_DocumentEnd: 1977 case Token::TK_DocumentStart: 1978 setError("Could not find closing ]!", T); 1979 // Set this to end iterator. 
1980 IsAtEnd = true; 1981 CurrentEntry = 0; 1982 break; 1983 default: 1984 if (!WasPreviousTokenFlowEntry) { 1985 setError("Expected , between entries!", T); 1986 IsAtEnd = true; 1987 CurrentEntry = 0; 1988 break; 1989 } 1990 // Otherwise it must be a flow entry. 1991 CurrentEntry = parseBlockNode(); 1992 if (!CurrentEntry) { 1993 IsAtEnd = true; 1994 } 1995 WasPreviousTokenFlowEntry = false; 1996 break; 1997 } 1998 } 1999 } 2000 2001 Document::Document(Stream &S) : stream(S), Root(0) { 2002 if (parseDirectives()) 2003 expectToken(Token::TK_DocumentStart); 2004 Token &T = peekNext(); 2005 if (T.Kind == Token::TK_DocumentStart) 2006 getNext(); 2007 } 2008 2009 bool Document::skip() { 2010 if (stream.scanner->failed()) 2011 return false; 2012 if (!Root) 2013 getRoot(); 2014 Root->skip(); 2015 Token &T = peekNext(); 2016 if (T.Kind == Token::TK_StreamEnd) 2017 return false; 2018 if (T.Kind == Token::TK_DocumentEnd) { 2019 getNext(); 2020 return skip(); 2021 } 2022 return true; 2023 } 2024 2025 Token &Document::peekNext() { 2026 return stream.scanner->peekNext(); 2027 } 2028 2029 Token Document::getNext() { 2030 return stream.scanner->getNext(); 2031 } 2032 2033 void Document::setError(const Twine &Message, Token &Location) const { 2034 stream.scanner->setError(Message, Location.Range.begin()); 2035 } 2036 2037 bool Document::failed() const { 2038 return stream.scanner->failed(); 2039 } 2040 2041 Node *Document::parseBlockNode() { 2042 Token T = peekNext(); 2043 // Handle properties. 2044 Token AnchorInfo; 2045 parse_property: 2046 switch (T.Kind) { 2047 case Token::TK_Alias: 2048 getNext(); 2049 return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1)); 2050 case Token::TK_Anchor: 2051 if (AnchorInfo.Kind == Token::TK_Anchor) { 2052 setError("Already encountered an anchor for this node!", T); 2053 return 0; 2054 } 2055 AnchorInfo = getNext(); // Consume TK_Anchor. 
2056 T = peekNext(); 2057 goto parse_property; 2058 case Token::TK_Tag: 2059 getNext(); // Skip TK_Tag. 2060 T = peekNext(); 2061 goto parse_property; 2062 default: 2063 break; 2064 } 2065 2066 switch (T.Kind) { 2067 case Token::TK_BlockEntry: 2068 // We got an unindented BlockEntry sequence. This is not terminated with 2069 // a BlockEnd. 2070 // Don't eat the TK_BlockEntry, SequenceNode needs it. 2071 return new (NodeAllocator) SequenceNode( stream.CurrentDoc 2072 , AnchorInfo.Range.substr(1) 2073 , SequenceNode::ST_Indentless); 2074 case Token::TK_BlockSequenceStart: 2075 getNext(); 2076 return new (NodeAllocator) 2077 SequenceNode( stream.CurrentDoc 2078 , AnchorInfo.Range.substr(1) 2079 , SequenceNode::ST_Block); 2080 case Token::TK_BlockMappingStart: 2081 getNext(); 2082 return new (NodeAllocator) 2083 MappingNode( stream.CurrentDoc 2084 , AnchorInfo.Range.substr(1) 2085 , MappingNode::MT_Block); 2086 case Token::TK_FlowSequenceStart: 2087 getNext(); 2088 return new (NodeAllocator) 2089 SequenceNode( stream.CurrentDoc 2090 , AnchorInfo.Range.substr(1) 2091 , SequenceNode::ST_Flow); 2092 case Token::TK_FlowMappingStart: 2093 getNext(); 2094 return new (NodeAllocator) 2095 MappingNode( stream.CurrentDoc 2096 , AnchorInfo.Range.substr(1) 2097 , MappingNode::MT_Flow); 2098 case Token::TK_Scalar: 2099 getNext(); 2100 return new (NodeAllocator) 2101 ScalarNode( stream.CurrentDoc 2102 , AnchorInfo.Range.substr(1) 2103 , T.Range); 2104 case Token::TK_Key: 2105 // Don't eat the TK_Key, KeyValueNode expects it. 2106 return new (NodeAllocator) 2107 MappingNode( stream.CurrentDoc 2108 , AnchorInfo.Range.substr(1) 2109 , MappingNode::MT_Inline); 2110 case Token::TK_DocumentStart: 2111 case Token::TK_DocumentEnd: 2112 case Token::TK_StreamEnd: 2113 default: 2114 // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not 2115 // !!null null. 
2116 return new (NodeAllocator) NullNode(stream.CurrentDoc); 2117 case Token::TK_Error: 2118 return 0; 2119 } 2120 llvm_unreachable("Control flow shouldn't reach here."); 2121 return 0; 2122 } 2123 2124 bool Document::parseDirectives() { 2125 bool isDirective = false; 2126 while (true) { 2127 Token T = peekNext(); 2128 if (T.Kind == Token::TK_TagDirective) { 2129 handleTagDirective(getNext()); 2130 isDirective = true; 2131 } else if (T.Kind == Token::TK_VersionDirective) { 2132 stream.handleYAMLDirective(getNext()); 2133 isDirective = true; 2134 } else 2135 break; 2136 } 2137 return isDirective; 2138 } 2139 2140 bool Document::expectToken(int TK) { 2141 Token T = getNext(); 2142 if (T.Kind != TK) { 2143 setError("Unexpected token", T); 2144 return false; 2145 } 2146 return true; 2147 } 2148