1 //===--- YAMLParser.cpp - Simple YAML parser ------------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file implements a YAML parser. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "llvm/Support/YAMLParser.h" 15 #include "llvm/ADT/SmallVector.h" 16 #include "llvm/ADT/StringExtras.h" 17 #include "llvm/ADT/Twine.h" 18 #include "llvm/ADT/ilist.h" 19 #include "llvm/ADT/ilist_node.h" 20 #include "llvm/Support/ErrorHandling.h" 21 #include "llvm/Support/MemoryBuffer.h" 22 #include "llvm/Support/SourceMgr.h" 23 #include "llvm/Support/raw_ostream.h" 24 25 using namespace llvm; 26 using namespace yaml; 27 28 enum UnicodeEncodingForm { 29 UEF_UTF32_LE, ///< UTF-32 Little Endian 30 UEF_UTF32_BE, ///< UTF-32 Big Endian 31 UEF_UTF16_LE, ///< UTF-16 Little Endian 32 UEF_UTF16_BE, ///< UTF-16 Big Endian 33 UEF_UTF8, ///< UTF-8 or ascii. 34 UEF_Unknown ///< Not a valid Unicode encoding. 35 }; 36 37 /// EncodingInfo - Holds the encoding type and length of the byte order mark if 38 /// it exists. Length is in {0, 2, 3, 4}. 39 typedef std::pair<UnicodeEncodingForm, unsigned> EncodingInfo; 40 41 /// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode 42 /// encoding form of \a Input. 43 /// 44 /// @param Input A string of length 0 or more. 45 /// @returns An EncodingInfo indicating the Unicode encoding form of the input 46 /// and how long the byte order mark is if one exists. 47 static EncodingInfo getUnicodeEncoding(StringRef Input) { 48 if (Input.size() == 0) 49 return std::make_pair(UEF_Unknown, 0); 50 51 switch (uint8_t(Input[0])) { 52 case 0x00: 53 if (Input.size() >= 4) { 54 if ( Input[1] == 0 55 && uint8_t(Input[2]) == 0xFE 56 && uint8_t(Input[3]) == 0xFF) 57 return std::make_pair(UEF_UTF32_BE, 4); 58 if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0) 59 return std::make_pair(UEF_UTF32_BE, 0); 60 } 61 62 if (Input.size() >= 2 && Input[1] != 0) 63 return std::make_pair(UEF_UTF16_BE, 0); 64 return std::make_pair(UEF_Unknown, 0); 65 case 0xFF: 66 if ( Input.size() >= 4 67 && uint8_t(Input[1]) == 0xFE 68 && Input[2] == 0 69 && Input[3] == 0) 70 return std::make_pair(UEF_UTF32_LE, 4); 71 72 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE) 73 return std::make_pair(UEF_UTF16_LE, 2); 74 return std::make_pair(UEF_Unknown, 0); 75 case 0xFE: 76 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF) 77 return std::make_pair(UEF_UTF16_BE, 2); 78 return std::make_pair(UEF_Unknown, 0); 79 case 0xEF: 80 if ( Input.size() >= 3 81 && uint8_t(Input[1]) == 0xBB 82 && uint8_t(Input[2]) == 0xBF) 83 return std::make_pair(UEF_UTF8, 3); 84 return std::make_pair(UEF_Unknown, 0); 85 } 86 87 // It could still be utf-32 or utf-16. 88 if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0) 89 return std::make_pair(UEF_UTF32_LE, 0); 90 91 if (Input.size() >= 2 && Input[1] == 0) 92 return std::make_pair(UEF_UTF16_LE, 0); 93 94 return std::make_pair(UEF_UTF8, 0); 95 } 96 97 namespace llvm { 98 namespace yaml { 99 /// Pin the vtables to this file. 100 void Node::anchor() {} 101 void NullNode::anchor() {} 102 void ScalarNode::anchor() {} 103 void KeyValueNode::anchor() {} 104 void MappingNode::anchor() {} 105 void SequenceNode::anchor() {} 106 void AliasNode::anchor() {} 107 108 /// Token - A single YAML token. 109 struct Token : ilist_node<Token> { 110 enum TokenKind { 111 TK_Error, // Uninitialized token. 112 TK_StreamStart, 113 TK_StreamEnd, 114 TK_VersionDirective, 115 TK_TagDirective, 116 TK_DocumentStart, 117 TK_DocumentEnd, 118 TK_BlockEntry, 119 TK_BlockEnd, 120 TK_BlockSequenceStart, 121 TK_BlockMappingStart, 122 TK_FlowEntry, 123 TK_FlowSequenceStart, 124 TK_FlowSequenceEnd, 125 TK_FlowMappingStart, 126 TK_FlowMappingEnd, 127 TK_Key, 128 TK_Value, 129 TK_Scalar, 130 TK_Alias, 131 TK_Anchor, 132 TK_Tag 133 } Kind; 134 135 /// A string of length 0 or more whose begin() points to the logical location 136 /// of the token in the input. 137 StringRef Range; 138 139 Token() : Kind(TK_Error) {} 140 }; 141 } 142 } 143 144 namespace llvm { 145 template<> 146 struct ilist_sentinel_traits<Token> { 147 Token *createSentinel() const { 148 return &Sentinel; 149 } 150 static void destroySentinel(Token*) {} 151 152 Token *provideInitialHead() const { return createSentinel(); } 153 Token *ensureHead(Token*) const { return createSentinel(); } 154 static void noteHead(Token*, Token*) {} 155 156 private: 157 mutable Token Sentinel; 158 }; 159 160 template<> 161 struct ilist_node_traits<Token> { 162 Token *createNode(const Token &V) { 163 return new (Alloc.Allocate<Token>()) Token(V); 164 } 165 static void deleteNode(Token *V) {} 166 167 void addNodeToList(Token *) {} 168 void removeNodeFromList(Token *) {} 169 void transferNodesFromList(ilist_node_traits & /*SrcTraits*/, 170 ilist_iterator<Token> /*first*/, 171 ilist_iterator<Token> /*last*/) {} 172 173 BumpPtrAllocator Alloc; 174 }; 175 } 176 177 typedef ilist<Token> TokenQueueT; 178 179 namespace { 180 /// @brief This struct is used to track simple keys. 181 /// 182 /// Simple keys are handled by creating an entry in SimpleKeys for each Token 183 /// which could legally be the start of a simple key. When peekNext is called, 184 /// if the Token To be returned is referenced by a SimpleKey, we continue 185 /// tokenizing until that potential simple key has either been found to not be 186 /// a simple key (we moved on to the next line or went further than 1024 chars). 187 /// Or when we run into a Value, and then insert a Key token (and possibly 188 /// others) before the SimpleKey's Tok. 189 struct SimpleKey { 190 TokenQueueT::iterator Tok; 191 unsigned Column; 192 unsigned Line; 193 unsigned FlowLevel; 194 bool IsRequired; 195 196 bool operator ==(const SimpleKey &Other) { 197 return Tok == Other.Tok; 198 } 199 }; 200 } 201 202 /// @brief The Unicode scalar value of a UTF-8 minimal well-formed code unit 203 /// subsequence and the subsequence's length in code units (uint8_t). 204 /// A length of 0 represents an error. 205 typedef std::pair<uint32_t, unsigned> UTF8Decoded; 206 207 static UTF8Decoded decodeUTF8(StringRef Range) { 208 StringRef::iterator Position= Range.begin(); 209 StringRef::iterator End = Range.end(); 210 // 1 byte: [0x00, 0x7f] 211 // Bit pattern: 0xxxxxxx 212 if ((*Position & 0x80) == 0) { 213 return std::make_pair(*Position, 1); 214 } 215 // 2 bytes: [0x80, 0x7ff] 216 // Bit pattern: 110xxxxx 10xxxxxx 217 if (Position + 1 != End && 218 ((*Position & 0xE0) == 0xC0) && 219 ((*(Position + 1) & 0xC0) == 0x80)) { 220 uint32_t codepoint = ((*Position & 0x1F) << 6) | 221 (*(Position + 1) & 0x3F); 222 if (codepoint >= 0x80) 223 return std::make_pair(codepoint, 2); 224 } 225 // 3 bytes: [0x8000, 0xffff] 226 // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx 227 if (Position + 2 != End && 228 ((*Position & 0xF0) == 0xE0) && 229 ((*(Position + 1) & 0xC0) == 0x80) && 230 ((*(Position + 2) & 0xC0) == 0x80)) { 231 uint32_t codepoint = ((*Position & 0x0F) << 12) | 232 ((*(Position + 1) & 0x3F) << 6) | 233 (*(Position + 2) & 0x3F); 234 // Codepoints between 0xD800 and 0xDFFF are invalid, as 235 // they are high / low surrogate halves used by UTF-16. 236 if (codepoint >= 0x800 && 237 (codepoint < 0xD800 || codepoint > 0xDFFF)) 238 return std::make_pair(codepoint, 3); 239 } 240 // 4 bytes: [0x10000, 0x10FFFF] 241 // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 242 if (Position + 3 != End && 243 ((*Position & 0xF8) == 0xF0) && 244 ((*(Position + 1) & 0xC0) == 0x80) && 245 ((*(Position + 2) & 0xC0) == 0x80) && 246 ((*(Position + 3) & 0xC0) == 0x80)) { 247 uint32_t codepoint = ((*Position & 0x07) << 18) | 248 ((*(Position + 1) & 0x3F) << 12) | 249 ((*(Position + 2) & 0x3F) << 6) | 250 (*(Position + 3) & 0x3F); 251 if (codepoint >= 0x10000 && codepoint <= 0x10FFFF) 252 return std::make_pair(codepoint, 4); 253 } 254 return std::make_pair(0, 0); 255 } 256 257 namespace llvm { 258 namespace yaml { 259 /// @brief Scans YAML tokens from a MemoryBuffer. 260 class Scanner { 261 public: 262 Scanner(const StringRef Input, SourceMgr &SM); 263 Scanner(MemoryBuffer *Buffer, SourceMgr &SM_); 264 265 /// @brief Parse the next token and return it without popping it. 266 Token &peekNext(); 267 268 /// @brief Parse the next token and pop it from the queue. 269 Token getNext(); 270 271 void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message, 272 ArrayRef<SMRange> Ranges = None) { 273 SM.PrintMessage(Loc, Kind, Message, Ranges); 274 } 275 276 void setError(const Twine &Message, StringRef::iterator Position) { 277 if (Current >= End) 278 Current = End - 1; 279 280 // Don't print out more errors after the first one we encounter. The rest 281 // are just the result of the first, and have no meaning. 282 if (!Failed) 283 printError(SMLoc::getFromPointer(Current), SourceMgr::DK_Error, Message); 284 Failed = true; 285 } 286 287 void setError(const Twine &Message) { 288 setError(Message, Current); 289 } 290 291 /// @brief Returns true if an error occurred while parsing. 292 bool failed() { 293 return Failed; 294 } 295 296 private: 297 StringRef currentInput() { 298 return StringRef(Current, End - Current); 299 } 300 301 /// @brief Decode a UTF-8 minimal well-formed code unit subsequence starting 302 /// at \a Position. 303 /// 304 /// If the UTF-8 code units starting at Position do not form a well-formed 305 /// code unit subsequence, then the Unicode scalar value is 0, and the length 306 /// is 0. 307 UTF8Decoded decodeUTF8(StringRef::iterator Position) { 308 return ::decodeUTF8(StringRef(Position, End - Position)); 309 } 310 311 // The following functions are based on the gramar rules in the YAML spec. The 312 // style of the function names it meant to closely match how they are written 313 // in the spec. The number within the [] is the number of the grammar rule in 314 // the spec. 315 // 316 // See 4.2 [Production Naming Conventions] for the meaning of the prefixes. 317 // 318 // c- 319 // A production starting and ending with a special character. 320 // b- 321 // A production matching a single line break. 322 // nb- 323 // A production starting and ending with a non-break character. 324 // s- 325 // A production starting and ending with a white space character. 326 // ns- 327 // A production starting and ending with a non-space character. 328 // l- 329 // A production matching complete line(s). 330 331 /// @brief Skip a single nb-char[27] starting at Position. 332 /// 333 /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE] 334 /// | [0xFF00-0xFFFD] | [0x10000-0x10FFFF] 335 /// 336 /// @returns The code unit after the nb-char, or Position if it's not an 337 /// nb-char. 338 StringRef::iterator skip_nb_char(StringRef::iterator Position); 339 340 /// @brief Skip a single b-break[28] starting at Position. 341 /// 342 /// A b-break is 0xD 0xA | 0xD | 0xA 343 /// 344 /// @returns The code unit after the b-break, or Position if it's not a 345 /// b-break. 346 StringRef::iterator skip_b_break(StringRef::iterator Position); 347 348 /// @brief Skip a single s-white[33] starting at Position. 349 /// 350 /// A s-white is 0x20 | 0x9 351 /// 352 /// @returns The code unit after the s-white, or Position if it's not a 353 /// s-white. 354 StringRef::iterator skip_s_white(StringRef::iterator Position); 355 356 /// @brief Skip a single ns-char[34] starting at Position. 357 /// 358 /// A ns-char is nb-char - s-white 359 /// 360 /// @returns The code unit after the ns-char, or Position if it's not a 361 /// ns-char. 362 StringRef::iterator skip_ns_char(StringRef::iterator Position); 363 364 typedef StringRef::iterator (Scanner::*SkipWhileFunc)(StringRef::iterator); 365 /// @brief Skip minimal well-formed code unit subsequences until Func 366 /// returns its input. 367 /// 368 /// @returns The code unit after the last minimal well-formed code unit 369 /// subsequence that Func accepted. 370 StringRef::iterator skip_while( SkipWhileFunc Func 371 , StringRef::iterator Position); 372 373 /// @brief Scan ns-uri-char[39]s starting at Cur. 374 /// 375 /// This updates Cur and Column while scanning. 376 /// 377 /// @returns A StringRef starting at Cur which covers the longest contiguous 378 /// sequence of ns-uri-char. 379 StringRef scan_ns_uri_char(); 380 381 /// @brief Consume a minimal well-formed code unit subsequence starting at 382 /// \a Cur. Return false if it is not the same Unicode scalar value as 383 /// \a Expected. This updates \a Column. 384 bool consume(uint32_t Expected); 385 386 /// @brief Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column. 387 void skip(uint32_t Distance); 388 389 /// @brief Return true if the minimal well-formed code unit subsequence at 390 /// Pos is whitespace or a new line 391 bool isBlankOrBreak(StringRef::iterator Position); 392 393 /// @brief If IsSimpleKeyAllowed, create and push_back a new SimpleKey. 394 void saveSimpleKeyCandidate( TokenQueueT::iterator Tok 395 , unsigned AtColumn 396 , bool IsRequired); 397 398 /// @brief Remove simple keys that can no longer be valid simple keys. 399 /// 400 /// Invalid simple keys are not on the current line or are further than 1024 401 /// columns back. 402 void removeStaleSimpleKeyCandidates(); 403 404 /// @brief Remove all simple keys on FlowLevel \a Level. 405 void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level); 406 407 /// @brief Unroll indentation in \a Indents back to \a Col. Creates BlockEnd 408 /// tokens if needed. 409 bool unrollIndent(int ToColumn); 410 411 /// @brief Increase indent to \a Col. Creates \a Kind token at \a InsertPoint 412 /// if needed. 413 bool rollIndent( int ToColumn 414 , Token::TokenKind Kind 415 , TokenQueueT::iterator InsertPoint); 416 417 /// @brief Skip whitespace and comments until the start of the next token. 418 void scanToNextToken(); 419 420 /// @brief Must be the first token generated. 421 bool scanStreamStart(); 422 423 /// @brief Generate tokens needed to close out the stream. 424 bool scanStreamEnd(); 425 426 /// @brief Scan a %BLAH directive. 427 bool scanDirective(); 428 429 /// @brief Scan a ... or ---. 430 bool scanDocumentIndicator(bool IsStart); 431 432 /// @brief Scan a [ or { and generate the proper flow collection start token. 433 bool scanFlowCollectionStart(bool IsSequence); 434 435 /// @brief Scan a ] or } and generate the proper flow collection end token. 436 bool scanFlowCollectionEnd(bool IsSequence); 437 438 /// @brief Scan the , that separates entries in a flow collection. 439 bool scanFlowEntry(); 440 441 /// @brief Scan the - that starts block sequence entries. 442 bool scanBlockEntry(); 443 444 /// @brief Scan an explicit ? indicating a key. 445 bool scanKey(); 446 447 /// @brief Scan an explicit : indicating a value. 448 bool scanValue(); 449 450 /// @brief Scan a quoted scalar. 451 bool scanFlowScalar(bool IsDoubleQuoted); 452 453 /// @brief Scan an unquoted scalar. 454 bool scanPlainScalar(); 455 456 /// @brief Scan an Alias or Anchor starting with * or &. 457 bool scanAliasOrAnchor(bool IsAlias); 458 459 /// @brief Scan a block scalar starting with | or >. 460 bool scanBlockScalar(bool IsLiteral); 461 462 /// @brief Scan a tag of the form !stuff. 463 bool scanTag(); 464 465 /// @brief Dispatch to the next scanning function based on \a *Cur. 466 bool fetchMoreTokens(); 467 468 /// @brief The SourceMgr used for diagnostics and buffer management. 469 SourceMgr &SM; 470 471 /// @brief The original input. 472 MemoryBuffer *InputBuffer; 473 474 /// @brief The current position of the scanner. 475 StringRef::iterator Current; 476 477 /// @brief The end of the input (one past the last character). 478 StringRef::iterator End; 479 480 /// @brief Current YAML indentation level in spaces. 481 int Indent; 482 483 /// @brief Current column number in Unicode code points. 484 unsigned Column; 485 486 /// @brief Current line number. 487 unsigned Line; 488 489 /// @brief How deep we are in flow style containers. 0 Means at block level. 490 unsigned FlowLevel; 491 492 /// @brief Are we at the start of the stream? 493 bool IsStartOfStream; 494 495 /// @brief Can the next token be the start of a simple key? 496 bool IsSimpleKeyAllowed; 497 498 /// @brief True if an error has occurred. 499 bool Failed; 500 501 /// @brief Queue of tokens. This is required to queue up tokens while looking 502 /// for the end of a simple key. And for cases where a single character 503 /// can produce multiple tokens (e.g. BlockEnd). 504 TokenQueueT TokenQueue; 505 506 /// @brief Indentation levels. 507 SmallVector<int, 4> Indents; 508 509 /// @brief Potential simple keys. 510 SmallVector<SimpleKey, 4> SimpleKeys; 511 }; 512 513 } // end namespace yaml 514 } // end namespace llvm 515 516 /// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result. 517 static void encodeUTF8( uint32_t UnicodeScalarValue 518 , SmallVectorImpl<char> &Result) { 519 if (UnicodeScalarValue <= 0x7F) { 520 Result.push_back(UnicodeScalarValue & 0x7F); 521 } else if (UnicodeScalarValue <= 0x7FF) { 522 uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6); 523 uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F); 524 Result.push_back(FirstByte); 525 Result.push_back(SecondByte); 526 } else if (UnicodeScalarValue <= 0xFFFF) { 527 uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12); 528 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 529 uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F); 530 Result.push_back(FirstByte); 531 Result.push_back(SecondByte); 532 Result.push_back(ThirdByte); 533 } else if (UnicodeScalarValue <= 0x10FFFF) { 534 uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18); 535 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12); 536 uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 537 uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F); 538 Result.push_back(FirstByte); 539 Result.push_back(SecondByte); 540 Result.push_back(ThirdByte); 541 Result.push_back(FourthByte); 542 } 543 } 544 545 bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) { 546 SourceMgr SM; 547 Scanner scanner(Input, SM); 548 while (true) { 549 Token T = scanner.getNext(); 550 switch (T.Kind) { 551 case Token::TK_StreamStart: 552 OS << "Stream-Start: "; 553 break; 554 case Token::TK_StreamEnd: 555 OS << "Stream-End: "; 556 break; 557 case Token::TK_VersionDirective: 558 OS << "Version-Directive: "; 559 break; 560 case Token::TK_TagDirective: 561 OS << "Tag-Directive: "; 562 break; 563 case Token::TK_DocumentStart: 564 OS << "Document-Start: "; 565 break; 566 case Token::TK_DocumentEnd: 567 OS << "Document-End: "; 568 break; 569 case Token::TK_BlockEntry: 570 OS << "Block-Entry: "; 571 break; 572 case Token::TK_BlockEnd: 573 OS << "Block-End: "; 574 break; 575 case Token::TK_BlockSequenceStart: 576 OS << "Block-Sequence-Start: "; 577 break; 578 case Token::TK_BlockMappingStart: 579 OS << "Block-Mapping-Start: "; 580 break; 581 case Token::TK_FlowEntry: 582 OS << "Flow-Entry: "; 583 break; 584 case Token::TK_FlowSequenceStart: 585 OS << "Flow-Sequence-Start: "; 586 break; 587 case Token::TK_FlowSequenceEnd: 588 OS << "Flow-Sequence-End: "; 589 break; 590 case Token::TK_FlowMappingStart: 591 OS << "Flow-Mapping-Start: "; 592 break; 593 case Token::TK_FlowMappingEnd: 594 OS << "Flow-Mapping-End: "; 595 break; 596 case Token::TK_Key: 597 OS << "Key: "; 598 break; 599 case Token::TK_Value: 600 OS << "Value: "; 601 break; 602 case Token::TK_Scalar: 603 OS << "Scalar: "; 604 break; 605 case Token::TK_Alias: 606 OS << "Alias: "; 607 break; 608 case Token::TK_Anchor: 609 OS << "Anchor: "; 610 break; 611 case Token::TK_Tag: 612 OS << "Tag: "; 613 break; 614 case Token::TK_Error: 615 break; 616 } 617 OS << T.Range << "\n"; 618 if (T.Kind == Token::TK_StreamEnd) 619 break; 620 else if (T.Kind == Token::TK_Error) 621 return false; 622 } 623 return true; 624 } 625 626 bool yaml::scanTokens(StringRef Input) { 627 llvm::SourceMgr SM; 628 llvm::yaml::Scanner scanner(Input, SM); 629 for (;;) { 630 llvm::yaml::Token T = scanner.getNext(); 631 if (T.Kind == Token::TK_StreamEnd) 632 break; 633 else if (T.Kind == Token::TK_Error) 634 return false; 635 } 636 return true; 637 } 638 639 std::string yaml::escape(StringRef Input) { 640 std::string EscapedInput; 641 for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) { 642 if (*i == '\\') 643 EscapedInput += "\\\\"; 644 else if (*i == '"') 645 EscapedInput += "\\\""; 646 else if (*i == 0) 647 EscapedInput += "\\0"; 648 else if (*i == 0x07) 649 EscapedInput += "\\a"; 650 else if (*i == 0x08) 651 EscapedInput += "\\b"; 652 else if (*i == 0x09) 653 EscapedInput += "\\t"; 654 else if (*i == 0x0A) 655 EscapedInput += "\\n"; 656 else if (*i == 0x0B) 657 EscapedInput += "\\v"; 658 else if (*i == 0x0C) 659 EscapedInput += "\\f"; 660 else if (*i == 0x0D) 661 EscapedInput += "\\r"; 662 else if (*i == 0x1B) 663 EscapedInput += "\\e"; 664 else if ((unsigned char)*i < 0x20) { // Control characters not handled above. 665 std::string HexStr = utohexstr(*i); 666 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 667 } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence. 668 UTF8Decoded UnicodeScalarValue 669 = decodeUTF8(StringRef(i, Input.end() - i)); 670 if (UnicodeScalarValue.second == 0) { 671 // Found invalid char. 672 SmallString<4> Val; 673 encodeUTF8(0xFFFD, Val); 674 EscapedInput.insert(EscapedInput.end(), Val.begin(), Val.end()); 675 // FIXME: Error reporting. 676 return EscapedInput; 677 } 678 if (UnicodeScalarValue.first == 0x85) 679 EscapedInput += "\\N"; 680 else if (UnicodeScalarValue.first == 0xA0) 681 EscapedInput += "\\_"; 682 else if (UnicodeScalarValue.first == 0x2028) 683 EscapedInput += "\\L"; 684 else if (UnicodeScalarValue.first == 0x2029) 685 EscapedInput += "\\P"; 686 else { 687 std::string HexStr = utohexstr(UnicodeScalarValue.first); 688 if (HexStr.size() <= 2) 689 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 690 else if (HexStr.size() <= 4) 691 EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr; 692 else if (HexStr.size() <= 8) 693 EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr; 694 } 695 i += UnicodeScalarValue.second - 1; 696 } else 697 EscapedInput.push_back(*i); 698 } 699 return EscapedInput; 700 } 701 702 Scanner::Scanner(StringRef Input, SourceMgr &sm) 703 : SM(sm) 704 , Indent(-1) 705 , Column(0) 706 , Line(0) 707 , FlowLevel(0) 708 , IsStartOfStream(true) 709 , IsSimpleKeyAllowed(true) 710 , Failed(false) { 711 InputBuffer = MemoryBuffer::getMemBuffer(Input, "YAML"); 712 SM.AddNewSourceBuffer(InputBuffer, SMLoc()); 713 Current = InputBuffer->getBufferStart(); 714 End = InputBuffer->getBufferEnd(); 715 } 716 717 Scanner::Scanner(MemoryBuffer *Buffer, SourceMgr &SM_) 718 : SM(SM_) 719 , InputBuffer(Buffer) 720 , Current(InputBuffer->getBufferStart()) 721 , End(InputBuffer->getBufferEnd()) 722 , Indent(-1) 723 , Column(0) 724 , Line(0) 725 , FlowLevel(0) 726 , IsStartOfStream(true) 727 , IsSimpleKeyAllowed(true) 728 , Failed(false) { 729 SM.AddNewSourceBuffer(InputBuffer, SMLoc()); 730 } 731 732 Token &Scanner::peekNext() { 733 // If the current token is a possible simple key, keep parsing until we 734 // can confirm. 735 bool NeedMore = false; 736 while (true) { 737 if (TokenQueue.empty() || NeedMore) { 738 if (!fetchMoreTokens()) { 739 TokenQueue.clear(); 740 TokenQueue.push_back(Token()); 741 return TokenQueue.front(); 742 } 743 } 744 assert(!TokenQueue.empty() && 745 "fetchMoreTokens lied about getting tokens!"); 746 747 removeStaleSimpleKeyCandidates(); 748 SimpleKey SK; 749 SK.Tok = TokenQueue.front(); 750 if (std::find(SimpleKeys.begin(), SimpleKeys.end(), SK) 751 == SimpleKeys.end()) 752 break; 753 else 754 NeedMore = true; 755 } 756 return TokenQueue.front(); 757 } 758 759 Token Scanner::getNext() { 760 Token Ret = peekNext(); 761 // TokenQueue can be empty if there was an error getting the next token. 762 if (!TokenQueue.empty()) 763 TokenQueue.pop_front(); 764 765 // There cannot be any referenced Token's if the TokenQueue is empty. So do a 766 // quick deallocation of them all. 767 if (TokenQueue.empty()) { 768 TokenQueue.Alloc.Reset(); 769 } 770 771 return Ret; 772 } 773 774 StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) { 775 if (Position == End) 776 return Position; 777 // Check 7 bit c-printable - b-char. 778 if ( *Position == 0x09 779 || (*Position >= 0x20 && *Position <= 0x7E)) 780 return Position + 1; 781 782 // Check for valid UTF-8. 783 if (uint8_t(*Position) & 0x80) { 784 UTF8Decoded u8d = decodeUTF8(Position); 785 if ( u8d.second != 0 786 && u8d.first != 0xFEFF 787 && ( u8d.first == 0x85 788 || ( u8d.first >= 0xA0 789 && u8d.first <= 0xD7FF) 790 || ( u8d.first >= 0xE000 791 && u8d.first <= 0xFFFD) 792 || ( u8d.first >= 0x10000 793 && u8d.first <= 0x10FFFF))) 794 return Position + u8d.second; 795 } 796 return Position; 797 } 798 799 StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) { 800 if (Position == End) 801 return Position; 802 if (*Position == 0x0D) { 803 if (Position + 1 != End && *(Position + 1) == 0x0A) 804 return Position + 2; 805 return Position + 1; 806 } 807 808 if (*Position == 0x0A) 809 return Position + 1; 810 return Position; 811 } 812 813 814 StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) { 815 if (Position == End) 816 return Position; 817 if (*Position == ' ' || *Position == '\t') 818 return Position + 1; 819 return Position; 820 } 821 822 StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) { 823 if (Position == End) 824 return Position; 825 if (*Position == ' ' || *Position == '\t') 826 return Position; 827 return skip_nb_char(Position); 828 } 829 830 StringRef::iterator Scanner::skip_while( SkipWhileFunc Func 831 , StringRef::iterator Position) { 832 while (true) { 833 StringRef::iterator i = (this->*Func)(Position); 834 if (i == Position) 835 break; 836 Position = i; 837 } 838 return Position; 839 } 840 841 static bool is_ns_hex_digit(const char C) { 842 return (C >= '0' && C <= '9') 843 || (C >= 'a' && C <= 'z') 844 || (C >= 'A' && C <= 'Z'); 845 } 846 847 static bool is_ns_word_char(const char C) { 848 return C == '-' 849 || (C >= 'a' && C <= 'z') 850 || (C >= 'A' && C <= 'Z'); 851 } 852 853 StringRef Scanner::scan_ns_uri_char() { 854 StringRef::iterator Start = Current; 855 while (true) { 856 if (Current == End) 857 break; 858 if (( *Current == '%' 859 && Current + 2 < End 860 && is_ns_hex_digit(*(Current + 1)) 861 && is_ns_hex_digit(*(Current + 2))) 862 || is_ns_word_char(*Current) 863 || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]") 864 != StringRef::npos) { 865 ++Current; 866 ++Column; 867 } else 868 break; 869 } 870 return StringRef(Start, Current - Start); 871 } 872 873 bool Scanner::consume(uint32_t Expected) { 874 if (Expected >= 0x80) 875 report_fatal_error("Not dealing with this yet"); 876 if (Current == End) 877 return false; 878 if (uint8_t(*Current) >= 0x80) 879 report_fatal_error("Not dealing with this yet"); 880 if (uint8_t(*Current) == Expected) { 881 ++Current; 882 ++Column; 883 return true; 884 } 885 return false; 886 } 887 888 void Scanner::skip(uint32_t Distance) { 889 Current += Distance; 890 Column += Distance; 891 assert(Current <= End && "Skipped past the end"); 892 } 893 894 bool Scanner::isBlankOrBreak(StringRef::iterator Position) { 895 if (Position == End) 896 return false; 897 if ( *Position == ' ' || *Position == '\t' 898 || *Position == '\r' || *Position == '\n') 899 return true; 900 return false; 901 } 902 903 void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok 904 , unsigned AtColumn 905 , bool IsRequired) { 906 if (IsSimpleKeyAllowed) { 907 SimpleKey SK; 908 SK.Tok = Tok; 909 SK.Line = Line; 910 SK.Column = AtColumn; 911 SK.IsRequired = IsRequired; 912 SK.FlowLevel = FlowLevel; 913 SimpleKeys.push_back(SK); 914 } 915 } 916 917 void Scanner::removeStaleSimpleKeyCandidates() { 918 for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin(); 919 i != SimpleKeys.end();) { 920 if (i->Line != Line || i->Column + 1024 < Column) { 921 if (i->IsRequired) 922 setError( "Could not find expected : for simple key" 923 , i->Tok->Range.begin()); 924 i = SimpleKeys.erase(i); 925 } else 926 ++i; 927 } 928 } 929 930 void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) { 931 if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level) 932 SimpleKeys.pop_back(); 933 } 934 935 bool Scanner::unrollIndent(int ToColumn) { 936 Token T; 937 // Indentation is ignored in flow. 938 if (FlowLevel != 0) 939 return true; 940 941 while (Indent > ToColumn) { 942 T.Kind = Token::TK_BlockEnd; 943 T.Range = StringRef(Current, 1); 944 TokenQueue.push_back(T); 945 Indent = Indents.pop_back_val(); 946 } 947 948 return true; 949 } 950 951 bool Scanner::rollIndent( int ToColumn 952 , Token::TokenKind Kind 953 , TokenQueueT::iterator InsertPoint) { 954 if (FlowLevel) 955 return true; 956 if (Indent < ToColumn) { 957 Indents.push_back(Indent); 958 Indent = ToColumn; 959 960 Token T; 961 T.Kind = Kind; 962 T.Range = StringRef(Current, 0); 963 TokenQueue.insert(InsertPoint, T); 964 } 965 return true; 966 } 967 968 void Scanner::scanToNextToken() { 969 while (true) { 970 while (*Current == ' ' || *Current == '\t') { 971 skip(1); 972 } 973 974 // Skip comment. 975 if (*Current == '#') { 976 while (true) { 977 // This may skip more than one byte, thus Column is only incremented 978 // for code points. 979 StringRef::iterator i = skip_nb_char(Current); 980 if (i == Current) 981 break; 982 Current = i; 983 ++Column; 984 } 985 } 986 987 // Skip EOL. 988 StringRef::iterator i = skip_b_break(Current); 989 if (i == Current) 990 break; 991 Current = i; 992 ++Line; 993 Column = 0; 994 // New lines may start a simple key. 995 if (!FlowLevel) 996 IsSimpleKeyAllowed = true; 997 } 998 } 999 1000 bool Scanner::scanStreamStart() { 1001 IsStartOfStream = false; 1002 1003 EncodingInfo EI = getUnicodeEncoding(currentInput()); 1004 1005 Token T; 1006 T.Kind = Token::TK_StreamStart; 1007 T.Range = StringRef(Current, EI.second); 1008 TokenQueue.push_back(T); 1009 Current += EI.second; 1010 return true; 1011 } 1012 1013 bool Scanner::scanStreamEnd() { 1014 // Force an ending new line if one isn't present. 1015 if (Column != 0) { 1016 Column = 0; 1017 ++Line; 1018 } 1019 1020 unrollIndent(-1); 1021 SimpleKeys.clear(); 1022 IsSimpleKeyAllowed = false; 1023 1024 Token T; 1025 T.Kind = Token::TK_StreamEnd; 1026 T.Range = StringRef(Current, 0); 1027 TokenQueue.push_back(T); 1028 return true; 1029 } 1030 1031 bool Scanner::scanDirective() { 1032 // Reset the indentation level. 1033 unrollIndent(-1); 1034 SimpleKeys.clear(); 1035 IsSimpleKeyAllowed = false; 1036 1037 StringRef::iterator Start = Current; 1038 consume('%'); 1039 StringRef::iterator NameStart = Current; 1040 Current = skip_while(&Scanner::skip_ns_char, Current); 1041 StringRef Name(NameStart, Current - NameStart); 1042 Current = skip_while(&Scanner::skip_s_white, Current); 1043 1044 Token T; 1045 if (Name == "YAML") { 1046 Current = skip_while(&Scanner::skip_ns_char, Current); 1047 T.Kind = Token::TK_VersionDirective; 1048 T.Range = StringRef(Start, Current - Start); 1049 TokenQueue.push_back(T); 1050 return true; 1051 } else if(Name == "TAG") { 1052 Current = skip_while(&Scanner::skip_ns_char, Current); 1053 Current = skip_while(&Scanner::skip_s_white, Current); 1054 Current = skip_while(&Scanner::skip_ns_char, Current); 1055 T.Kind = Token::TK_TagDirective; 1056 T.Range = StringRef(Start, Current - Start); 1057 TokenQueue.push_back(T); 1058 return true; 1059 } 1060 return false; 1061 } 1062 1063 bool Scanner::scanDocumentIndicator(bool IsStart) { 1064 unrollIndent(-1); 1065 SimpleKeys.clear(); 1066 IsSimpleKeyAllowed = false; 1067 1068 Token T; 1069 T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd; 1070 T.Range = StringRef(Current, 3); 1071 skip(3); 1072 TokenQueue.push_back(T); 1073 return true; 1074 } 1075 1076 bool Scanner::scanFlowCollectionStart(bool IsSequence) { 1077 Token T; 1078 T.Kind = IsSequence ? Token::TK_FlowSequenceStart 1079 : Token::TK_FlowMappingStart; 1080 T.Range = StringRef(Current, 1); 1081 skip(1); 1082 TokenQueue.push_back(T); 1083 1084 // [ and { may begin a simple key. 1085 saveSimpleKeyCandidate(TokenQueue.back(), Column - 1, false); 1086 1087 // And may also be followed by a simple key. 1088 IsSimpleKeyAllowed = true; 1089 ++FlowLevel; 1090 return true; 1091 } 1092 1093 bool Scanner::scanFlowCollectionEnd(bool IsSequence) { 1094 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1095 IsSimpleKeyAllowed = false; 1096 Token T; 1097 T.Kind = IsSequence ? Token::TK_FlowSequenceEnd 1098 : Token::TK_FlowMappingEnd; 1099 T.Range = StringRef(Current, 1); 1100 skip(1); 1101 TokenQueue.push_back(T); 1102 if (FlowLevel) 1103 --FlowLevel; 1104 return true; 1105 } 1106 1107 bool Scanner::scanFlowEntry() { 1108 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1109 IsSimpleKeyAllowed = true; 1110 Token T; 1111 T.Kind = Token::TK_FlowEntry; 1112 T.Range = StringRef(Current, 1); 1113 skip(1); 1114 TokenQueue.push_back(T); 1115 return true; 1116 } 1117 1118 bool Scanner::scanBlockEntry() { 1119 rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end()); 1120 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1121 IsSimpleKeyAllowed = true; 1122 Token T; 1123 T.Kind = Token::TK_BlockEntry; 1124 T.Range = StringRef(Current, 1); 1125 skip(1); 1126 TokenQueue.push_back(T); 1127 return true; 1128 } 1129 1130 bool Scanner::scanKey() { 1131 if (!FlowLevel) 1132 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1133 1134 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1135 IsSimpleKeyAllowed = !FlowLevel; 1136 1137 Token T; 1138 T.Kind = Token::TK_Key; 1139 T.Range = StringRef(Current, 1); 1140 skip(1); 1141 TokenQueue.push_back(T); 1142 return true; 1143 } 1144 1145 bool Scanner::scanValue() { 1146 // If the previous token could have been a simple key, insert the key token 1147 // into the token queue. 1148 if (!SimpleKeys.empty()) { 1149 SimpleKey SK = SimpleKeys.pop_back_val(); 1150 Token T; 1151 T.Kind = Token::TK_Key; 1152 T.Range = SK.Tok->Range; 1153 TokenQueueT::iterator i, e; 1154 for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) { 1155 if (i == SK.Tok) 1156 break; 1157 } 1158 assert(i != e && "SimpleKey not in token queue!"); 1159 i = TokenQueue.insert(i, T); 1160 1161 // We may also need to add a Block-Mapping-Start token. 1162 rollIndent(SK.Column, Token::TK_BlockMappingStart, i); 1163 1164 IsSimpleKeyAllowed = false; 1165 } else { 1166 if (!FlowLevel) 1167 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1168 IsSimpleKeyAllowed = !FlowLevel; 1169 } 1170 1171 Token T; 1172 T.Kind = Token::TK_Value; 1173 T.Range = StringRef(Current, 1); 1174 skip(1); 1175 TokenQueue.push_back(T); 1176 return true; 1177 } 1178 1179 // Forbidding inlining improves performance by roughly 20%. 1180 // FIXME: Remove once llvm optimizes this to the faster version without hints. 1181 LLVM_ATTRIBUTE_NOINLINE static bool 1182 wasEscaped(StringRef::iterator First, StringRef::iterator Position); 1183 1184 // Returns whether a character at 'Position' was escaped with a leading '\'. 1185 // 'First' specifies the position of the first character in the string. 1186 static bool wasEscaped(StringRef::iterator First, 1187 StringRef::iterator Position) { 1188 assert(Position - 1 >= First); 1189 StringRef::iterator I = Position - 1; 1190 // We calculate the number of consecutive '\'s before the current position 1191 // by iterating backwards through our string. 1192 while (I >= First && *I == '\\') --I; 1193 // (Position - 1 - I) now contains the number of '\'s before the current 1194 // position. If it is odd, the character at 'Position' was escaped. 1195 return (Position - 1 - I) % 2 == 1; 1196 } 1197 1198 bool Scanner::scanFlowScalar(bool IsDoubleQuoted) { 1199 StringRef::iterator Start = Current; 1200 unsigned ColStart = Column; 1201 if (IsDoubleQuoted) { 1202 do { 1203 ++Current; 1204 while (Current != End && *Current != '"') 1205 ++Current; 1206 // Repeat until the previous character was not a '\' or was an escaped 1207 // backslash. 1208 } while ( Current != End 1209 && *(Current - 1) == '\\' 1210 && wasEscaped(Start + 1, Current)); 1211 } else { 1212 skip(1); 1213 while (true) { 1214 // Skip a ' followed by another '. 1215 if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') { 1216 skip(2); 1217 continue; 1218 } else if (*Current == '\'') 1219 break; 1220 StringRef::iterator i = skip_nb_char(Current); 1221 if (i == Current) { 1222 i = skip_b_break(Current); 1223 if (i == Current) 1224 break; 1225 Current = i; 1226 Column = 0; 1227 ++Line; 1228 } else { 1229 if (i == End) 1230 break; 1231 Current = i; 1232 ++Column; 1233 } 1234 } 1235 } 1236 1237 if (Current == End) { 1238 setError("Expected quote at end of scalar", Current); 1239 return false; 1240 } 1241 1242 skip(1); // Skip ending quote. 1243 Token T; 1244 T.Kind = Token::TK_Scalar; 1245 T.Range = StringRef(Start, Current - Start); 1246 TokenQueue.push_back(T); 1247 1248 saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); 1249 1250 IsSimpleKeyAllowed = false; 1251 1252 return true; 1253 } 1254 1255 bool Scanner::scanPlainScalar() { 1256 StringRef::iterator Start = Current; 1257 unsigned ColStart = Column; 1258 unsigned LeadingBlanks = 0; 1259 assert(Indent >= -1 && "Indent must be >= -1 !"); 1260 unsigned indent = static_cast<unsigned>(Indent + 1); 1261 while (true) { 1262 if (*Current == '#') 1263 break; 1264 1265 while (!isBlankOrBreak(Current)) { 1266 if ( FlowLevel && *Current == ':' 1267 && !(isBlankOrBreak(Current + 1) || *(Current + 1) == ',')) { 1268 setError("Found unexpected ':' while scanning a plain scalar", Current); 1269 return false; 1270 } 1271 1272 // Check for the end of the plain scalar. 1273 if ( (*Current == ':' && isBlankOrBreak(Current + 1)) 1274 || ( FlowLevel 1275 && (StringRef(Current, 1).find_first_of(",:?[]{}") 1276 != StringRef::npos))) 1277 break; 1278 1279 StringRef::iterator i = skip_nb_char(Current); 1280 if (i == Current) 1281 break; 1282 Current = i; 1283 ++Column; 1284 } 1285 1286 // Are we at the end? 1287 if (!isBlankOrBreak(Current)) 1288 break; 1289 1290 // Eat blanks. 1291 StringRef::iterator Tmp = Current; 1292 while (isBlankOrBreak(Tmp)) { 1293 StringRef::iterator i = skip_s_white(Tmp); 1294 if (i != Tmp) { 1295 if (LeadingBlanks && (Column < indent) && *Tmp == '\t') { 1296 setError("Found invalid tab character in indentation", Tmp); 1297 return false; 1298 } 1299 Tmp = i; 1300 ++Column; 1301 } else { 1302 i = skip_b_break(Tmp); 1303 if (!LeadingBlanks) 1304 LeadingBlanks = 1; 1305 Tmp = i; 1306 Column = 0; 1307 ++Line; 1308 } 1309 } 1310 1311 if (!FlowLevel && Column < indent) 1312 break; 1313 1314 Current = Tmp; 1315 } 1316 if (Start == Current) { 1317 setError("Got empty plain scalar", Start); 1318 return false; 1319 } 1320 Token T; 1321 T.Kind = Token::TK_Scalar; 1322 T.Range = StringRef(Start, Current - Start); 1323 TokenQueue.push_back(T); 1324 1325 // Plain scalars can be simple keys. 1326 saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); 1327 1328 IsSimpleKeyAllowed = false; 1329 1330 return true; 1331 } 1332 1333 bool Scanner::scanAliasOrAnchor(bool IsAlias) { 1334 StringRef::iterator Start = Current; 1335 unsigned ColStart = Column; 1336 skip(1); 1337 while(true) { 1338 if ( *Current == '[' || *Current == ']' 1339 || *Current == '{' || *Current == '}' 1340 || *Current == ',' 1341 || *Current == ':') 1342 break; 1343 StringRef::iterator i = skip_ns_char(Current); 1344 if (i == Current) 1345 break; 1346 Current = i; 1347 ++Column; 1348 } 1349 1350 if (Start == Current) { 1351 setError("Got empty alias or anchor", Start); 1352 return false; 1353 } 1354 1355 Token T; 1356 T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor; 1357 T.Range = StringRef(Start, Current - Start); 1358 TokenQueue.push_back(T); 1359 1360 // Alias and anchors can be simple keys. 1361 saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); 1362 1363 IsSimpleKeyAllowed = false; 1364 1365 return true; 1366 } 1367 1368 bool Scanner::scanBlockScalar(bool IsLiteral) { 1369 StringRef::iterator Start = Current; 1370 skip(1); // Eat | or > 1371 while(true) { 1372 StringRef::iterator i = skip_nb_char(Current); 1373 if (i == Current) { 1374 if (Column == 0) 1375 break; 1376 i = skip_b_break(Current); 1377 if (i != Current) { 1378 // We got a line break. 1379 Column = 0; 1380 ++Line; 1381 Current = i; 1382 continue; 1383 } else { 1384 // There was an error, which should already have been printed out. 1385 return false; 1386 } 1387 } 1388 Current = i; 1389 ++Column; 1390 } 1391 1392 if (Start == Current) { 1393 setError("Got empty block scalar", Start); 1394 return false; 1395 } 1396 1397 Token T; 1398 T.Kind = Token::TK_Scalar; 1399 T.Range = StringRef(Start, Current - Start); 1400 TokenQueue.push_back(T); 1401 return true; 1402 } 1403 1404 bool Scanner::scanTag() { 1405 StringRef::iterator Start = Current; 1406 unsigned ColStart = Column; 1407 skip(1); // Eat !. 1408 if (Current == End || isBlankOrBreak(Current)); // An empty tag. 1409 else if (*Current == '<') { 1410 skip(1); 1411 scan_ns_uri_char(); 1412 if (!consume('>')) 1413 return false; 1414 } else { 1415 // FIXME: Actually parse the c-ns-shorthand-tag rule. 1416 Current = skip_while(&Scanner::skip_ns_char, Current); 1417 } 1418 1419 Token T; 1420 T.Kind = Token::TK_Tag; 1421 T.Range = StringRef(Start, Current - Start); 1422 TokenQueue.push_back(T); 1423 1424 // Tags can be simple keys. 1425 saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); 1426 1427 IsSimpleKeyAllowed = false; 1428 1429 return true; 1430 } 1431 1432 bool Scanner::fetchMoreTokens() { 1433 if (IsStartOfStream) 1434 return scanStreamStart(); 1435 1436 scanToNextToken(); 1437 1438 if (Current == End) 1439 return scanStreamEnd(); 1440 1441 removeStaleSimpleKeyCandidates(); 1442 1443 unrollIndent(Column); 1444 1445 if (Column == 0 && *Current == '%') 1446 return scanDirective(); 1447 1448 if (Column == 0 && Current + 4 <= End 1449 && *Current == '-' 1450 && *(Current + 1) == '-' 1451 && *(Current + 2) == '-' 1452 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1453 return scanDocumentIndicator(true); 1454 1455 if (Column == 0 && Current + 4 <= End 1456 && *Current == '.' 1457 && *(Current + 1) == '.' 1458 && *(Current + 2) == '.' 1459 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1460 return scanDocumentIndicator(false); 1461 1462 if (*Current == '[') 1463 return scanFlowCollectionStart(true); 1464 1465 if (*Current == '{') 1466 return scanFlowCollectionStart(false); 1467 1468 if (*Current == ']') 1469 return scanFlowCollectionEnd(true); 1470 1471 if (*Current == '}') 1472 return scanFlowCollectionEnd(false); 1473 1474 if (*Current == ',') 1475 return scanFlowEntry(); 1476 1477 if (*Current == '-' && isBlankOrBreak(Current + 1)) 1478 return scanBlockEntry(); 1479 1480 if (*Current == '?' && (FlowLevel || isBlankOrBreak(Current + 1))) 1481 return scanKey(); 1482 1483 if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1))) 1484 return scanValue(); 1485 1486 if (*Current == '*') 1487 return scanAliasOrAnchor(true); 1488 1489 if (*Current == '&') 1490 return scanAliasOrAnchor(false); 1491 1492 if (*Current == '!') 1493 return scanTag(); 1494 1495 if (*Current == '|' && !FlowLevel) 1496 return scanBlockScalar(true); 1497 1498 if (*Current == '>' && !FlowLevel) 1499 return scanBlockScalar(false); 1500 1501 if (*Current == '\'') 1502 return scanFlowScalar(false); 1503 1504 if (*Current == '"') 1505 return scanFlowScalar(true); 1506 1507 // Get a plain scalar. 1508 StringRef FirstChar(Current, 1); 1509 if (!(isBlankOrBreak(Current) 1510 || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos) 1511 || (*Current == '-' && !isBlankOrBreak(Current + 1)) 1512 || (!FlowLevel && (*Current == '?' || *Current == ':') 1513 && isBlankOrBreak(Current + 1)) 1514 || (!FlowLevel && *Current == ':' 1515 && Current + 2 < End 1516 && *(Current + 1) == ':' 1517 && !isBlankOrBreak(Current + 2))) 1518 return scanPlainScalar(); 1519 1520 setError("Unrecognized character while tokenizing."); 1521 return false; 1522 } 1523 1524 Stream::Stream(StringRef Input, SourceMgr &SM) 1525 : scanner(new Scanner(Input, SM)), CurrentDoc() {} 1526 1527 Stream::Stream(MemoryBuffer *InputBuffer, SourceMgr &SM) 1528 : scanner(new Scanner(InputBuffer, SM)), CurrentDoc() {} 1529 1530 Stream::~Stream() {} 1531 1532 bool Stream::failed() { return scanner->failed(); } 1533 1534 void Stream::printError(Node *N, const Twine &Msg) { 1535 SmallVector<SMRange, 1> Ranges; 1536 Ranges.push_back(N->getSourceRange()); 1537 scanner->printError( N->getSourceRange().Start 1538 , SourceMgr::DK_Error 1539 , Msg 1540 , Ranges); 1541 } 1542 1543 document_iterator Stream::begin() { 1544 if (CurrentDoc) 1545 report_fatal_error("Can only iterate over the stream once"); 1546 1547 // Skip Stream-Start. 1548 scanner->getNext(); 1549 1550 CurrentDoc.reset(new Document(*this)); 1551 return document_iterator(CurrentDoc); 1552 } 1553 1554 document_iterator Stream::end() { 1555 return document_iterator(); 1556 } 1557 1558 void Stream::skip() { 1559 for (document_iterator i = begin(), e = end(); i != e; ++i) 1560 i->skip(); 1561 } 1562 1563 Node::Node(unsigned int Type, std::unique_ptr<Document> &D, StringRef A, 1564 StringRef T) 1565 : Doc(D), TypeID(Type), Anchor(A), Tag(T) { 1566 SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin()); 1567 SourceRange = SMRange(Start, Start); 1568 } 1569 1570 std::string Node::getVerbatimTag() const { 1571 StringRef Raw = getRawTag(); 1572 if (!Raw.empty() && Raw != "!") { 1573 std::string Ret; 1574 if (Raw.find_last_of('!') == 0) { 1575 Ret = Doc->getTagMap().find("!")->second; 1576 Ret += Raw.substr(1); 1577 return std::move(Ret); 1578 } else if (Raw.startswith("!!")) { 1579 Ret = Doc->getTagMap().find("!!")->second; 1580 Ret += Raw.substr(2); 1581 return std::move(Ret); 1582 } else { 1583 StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1); 1584 std::map<StringRef, StringRef>::const_iterator It = 1585 Doc->getTagMap().find(TagHandle); 1586 if (It != Doc->getTagMap().end()) 1587 Ret = It->second; 1588 else { 1589 Token T; 1590 T.Kind = Token::TK_Tag; 1591 T.Range = TagHandle; 1592 setError(Twine("Unknown tag handle ") + TagHandle, T); 1593 } 1594 Ret += Raw.substr(Raw.find_last_of('!') + 1); 1595 return std::move(Ret); 1596 } 1597 } 1598 1599 switch (getType()) { 1600 case NK_Null: 1601 return "tag:yaml.org,2002:null"; 1602 case NK_Scalar: 1603 // TODO: Tag resolution. 1604 return "tag:yaml.org,2002:str"; 1605 case NK_Mapping: 1606 return "tag:yaml.org,2002:map"; 1607 case NK_Sequence: 1608 return "tag:yaml.org,2002:seq"; 1609 } 1610 1611 return ""; 1612 } 1613 1614 Token &Node::peekNext() { 1615 return Doc->peekNext(); 1616 } 1617 1618 Token Node::getNext() { 1619 return Doc->getNext(); 1620 } 1621 1622 Node *Node::parseBlockNode() { 1623 return Doc->parseBlockNode(); 1624 } 1625 1626 BumpPtrAllocator &Node::getAllocator() { 1627 return Doc->NodeAllocator; 1628 } 1629 1630 void Node::setError(const Twine &Msg, Token &Tok) const { 1631 Doc->setError(Msg, Tok); 1632 } 1633 1634 bool Node::failed() const { 1635 return Doc->failed(); 1636 } 1637 1638 1639 1640 StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const { 1641 // TODO: Handle newlines properly. We need to remove leading whitespace. 1642 if (Value[0] == '"') { // Double quoted. 1643 // Pull off the leading and trailing "s. 1644 StringRef UnquotedValue = Value.substr(1, Value.size() - 2); 1645 // Search for characters that would require unescaping the value. 1646 StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n"); 1647 if (i != StringRef::npos) 1648 return unescapeDoubleQuoted(UnquotedValue, i, Storage); 1649 return UnquotedValue; 1650 } else if (Value[0] == '\'') { // Single quoted. 1651 // Pull off the leading and trailing 's. 1652 StringRef UnquotedValue = Value.substr(1, Value.size() - 2); 1653 StringRef::size_type i = UnquotedValue.find('\''); 1654 if (i != StringRef::npos) { 1655 // We're going to need Storage. 1656 Storage.clear(); 1657 Storage.reserve(UnquotedValue.size()); 1658 for (; i != StringRef::npos; i = UnquotedValue.find('\'')) { 1659 StringRef Valid(UnquotedValue.begin(), i); 1660 Storage.insert(Storage.end(), Valid.begin(), Valid.end()); 1661 Storage.push_back('\''); 1662 UnquotedValue = UnquotedValue.substr(i + 2); 1663 } 1664 Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); 1665 return StringRef(Storage.begin(), Storage.size()); 1666 } 1667 return UnquotedValue; 1668 } 1669 // Plain or block. 1670 return Value.rtrim(" "); 1671 } 1672 1673 StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue 1674 , StringRef::size_type i 1675 , SmallVectorImpl<char> &Storage) 1676 const { 1677 // Use Storage to build proper value. 1678 Storage.clear(); 1679 Storage.reserve(UnquotedValue.size()); 1680 for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) { 1681 // Insert all previous chars into Storage. 1682 StringRef Valid(UnquotedValue.begin(), i); 1683 Storage.insert(Storage.end(), Valid.begin(), Valid.end()); 1684 // Chop off inserted chars. 1685 UnquotedValue = UnquotedValue.substr(i); 1686 1687 assert(!UnquotedValue.empty() && "Can't be empty!"); 1688 1689 // Parse escape or line break. 1690 switch (UnquotedValue[0]) { 1691 case '\r': 1692 case '\n': 1693 Storage.push_back('\n'); 1694 if ( UnquotedValue.size() > 1 1695 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 1696 UnquotedValue = UnquotedValue.substr(1); 1697 UnquotedValue = UnquotedValue.substr(1); 1698 break; 1699 default: 1700 if (UnquotedValue.size() == 1) 1701 // TODO: Report error. 1702 break; 1703 UnquotedValue = UnquotedValue.substr(1); 1704 switch (UnquotedValue[0]) { 1705 default: { 1706 Token T; 1707 T.Range = StringRef(UnquotedValue.begin(), 1); 1708 setError("Unrecognized escape code!", T); 1709 return ""; 1710 } 1711 case '\r': 1712 case '\n': 1713 // Remove the new line. 1714 if ( UnquotedValue.size() > 1 1715 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 1716 UnquotedValue = UnquotedValue.substr(1); 1717 // If this was just a single byte newline, it will get skipped 1718 // below. 1719 break; 1720 case '0': 1721 Storage.push_back(0x00); 1722 break; 1723 case 'a': 1724 Storage.push_back(0x07); 1725 break; 1726 case 'b': 1727 Storage.push_back(0x08); 1728 break; 1729 case 't': 1730 case 0x09: 1731 Storage.push_back(0x09); 1732 break; 1733 case 'n': 1734 Storage.push_back(0x0A); 1735 break; 1736 case 'v': 1737 Storage.push_back(0x0B); 1738 break; 1739 case 'f': 1740 Storage.push_back(0x0C); 1741 break; 1742 case 'r': 1743 Storage.push_back(0x0D); 1744 break; 1745 case 'e': 1746 Storage.push_back(0x1B); 1747 break; 1748 case ' ': 1749 Storage.push_back(0x20); 1750 break; 1751 case '"': 1752 Storage.push_back(0x22); 1753 break; 1754 case '/': 1755 Storage.push_back(0x2F); 1756 break; 1757 case '\\': 1758 Storage.push_back(0x5C); 1759 break; 1760 case 'N': 1761 encodeUTF8(0x85, Storage); 1762 break; 1763 case '_': 1764 encodeUTF8(0xA0, Storage); 1765 break; 1766 case 'L': 1767 encodeUTF8(0x2028, Storage); 1768 break; 1769 case 'P': 1770 encodeUTF8(0x2029, Storage); 1771 break; 1772 case 'x': { 1773 if (UnquotedValue.size() < 3) 1774 // TODO: Report error. 1775 break; 1776 unsigned int UnicodeScalarValue; 1777 if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue)) 1778 // TODO: Report error. 1779 UnicodeScalarValue = 0xFFFD; 1780 encodeUTF8(UnicodeScalarValue, Storage); 1781 UnquotedValue = UnquotedValue.substr(2); 1782 break; 1783 } 1784 case 'u': { 1785 if (UnquotedValue.size() < 5) 1786 // TODO: Report error. 1787 break; 1788 unsigned int UnicodeScalarValue; 1789 if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue)) 1790 // TODO: Report error. 1791 UnicodeScalarValue = 0xFFFD; 1792 encodeUTF8(UnicodeScalarValue, Storage); 1793 UnquotedValue = UnquotedValue.substr(4); 1794 break; 1795 } 1796 case 'U': { 1797 if (UnquotedValue.size() < 9) 1798 // TODO: Report error. 1799 break; 1800 unsigned int UnicodeScalarValue; 1801 if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue)) 1802 // TODO: Report error. 1803 UnicodeScalarValue = 0xFFFD; 1804 encodeUTF8(UnicodeScalarValue, Storage); 1805 UnquotedValue = UnquotedValue.substr(8); 1806 break; 1807 } 1808 } 1809 UnquotedValue = UnquotedValue.substr(1); 1810 } 1811 } 1812 Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); 1813 return StringRef(Storage.begin(), Storage.size()); 1814 } 1815 1816 Node *KeyValueNode::getKey() { 1817 if (Key) 1818 return Key; 1819 // Handle implicit null keys. 1820 { 1821 Token &t = peekNext(); 1822 if ( t.Kind == Token::TK_BlockEnd 1823 || t.Kind == Token::TK_Value 1824 || t.Kind == Token::TK_Error) { 1825 return Key = new (getAllocator()) NullNode(Doc); 1826 } 1827 if (t.Kind == Token::TK_Key) 1828 getNext(); // skip TK_Key. 1829 } 1830 1831 // Handle explicit null keys. 1832 Token &t = peekNext(); 1833 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) { 1834 return Key = new (getAllocator()) NullNode(Doc); 1835 } 1836 1837 // We've got a normal key. 1838 return Key = parseBlockNode(); 1839 } 1840 1841 Node *KeyValueNode::getValue() { 1842 if (Value) 1843 return Value; 1844 getKey()->skip(); 1845 if (failed()) 1846 return Value = new (getAllocator()) NullNode(Doc); 1847 1848 // Handle implicit null values. 1849 { 1850 Token &t = peekNext(); 1851 if ( t.Kind == Token::TK_BlockEnd 1852 || t.Kind == Token::TK_FlowMappingEnd 1853 || t.Kind == Token::TK_Key 1854 || t.Kind == Token::TK_FlowEntry 1855 || t.Kind == Token::TK_Error) { 1856 return Value = new (getAllocator()) NullNode(Doc); 1857 } 1858 1859 if (t.Kind != Token::TK_Value) { 1860 setError("Unexpected token in Key Value.", t); 1861 return Value = new (getAllocator()) NullNode(Doc); 1862 } 1863 getNext(); // skip TK_Value. 1864 } 1865 1866 // Handle explicit null values. 1867 Token &t = peekNext(); 1868 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) { 1869 return Value = new (getAllocator()) NullNode(Doc); 1870 } 1871 1872 // We got a normal value. 1873 return Value = parseBlockNode(); 1874 } 1875 1876 void MappingNode::increment() { 1877 if (failed()) { 1878 IsAtEnd = true; 1879 CurrentEntry = nullptr; 1880 return; 1881 } 1882 if (CurrentEntry) { 1883 CurrentEntry->skip(); 1884 if (Type == MT_Inline) { 1885 IsAtEnd = true; 1886 CurrentEntry = nullptr; 1887 return; 1888 } 1889 } 1890 Token T = peekNext(); 1891 if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) { 1892 // KeyValueNode eats the TK_Key. That way it can detect null keys. 1893 CurrentEntry = new (getAllocator()) KeyValueNode(Doc); 1894 } else if (Type == MT_Block) { 1895 switch (T.Kind) { 1896 case Token::TK_BlockEnd: 1897 getNext(); 1898 IsAtEnd = true; 1899 CurrentEntry = nullptr; 1900 break; 1901 default: 1902 setError("Unexpected token. Expected Key or Block End", T); 1903 case Token::TK_Error: 1904 IsAtEnd = true; 1905 CurrentEntry = nullptr; 1906 } 1907 } else { 1908 switch (T.Kind) { 1909 case Token::TK_FlowEntry: 1910 // Eat the flow entry and recurse. 1911 getNext(); 1912 return increment(); 1913 case Token::TK_FlowMappingEnd: 1914 getNext(); 1915 case Token::TK_Error: 1916 // Set this to end iterator. 1917 IsAtEnd = true; 1918 CurrentEntry = nullptr; 1919 break; 1920 default: 1921 setError( "Unexpected token. Expected Key, Flow Entry, or Flow " 1922 "Mapping End." 1923 , T); 1924 IsAtEnd = true; 1925 CurrentEntry = nullptr; 1926 } 1927 } 1928 } 1929 1930 void SequenceNode::increment() { 1931 if (failed()) { 1932 IsAtEnd = true; 1933 CurrentEntry = nullptr; 1934 return; 1935 } 1936 if (CurrentEntry) 1937 CurrentEntry->skip(); 1938 Token T = peekNext(); 1939 if (SeqType == ST_Block) { 1940 switch (T.Kind) { 1941 case Token::TK_BlockEntry: 1942 getNext(); 1943 CurrentEntry = parseBlockNode(); 1944 if (!CurrentEntry) { // An error occurred. 1945 IsAtEnd = true; 1946 CurrentEntry = nullptr; 1947 } 1948 break; 1949 case Token::TK_BlockEnd: 1950 getNext(); 1951 IsAtEnd = true; 1952 CurrentEntry = nullptr; 1953 break; 1954 default: 1955 setError( "Unexpected token. Expected Block Entry or Block End." 1956 , T); 1957 case Token::TK_Error: 1958 IsAtEnd = true; 1959 CurrentEntry = nullptr; 1960 } 1961 } else if (SeqType == ST_Indentless) { 1962 switch (T.Kind) { 1963 case Token::TK_BlockEntry: 1964 getNext(); 1965 CurrentEntry = parseBlockNode(); 1966 if (!CurrentEntry) { // An error occurred. 1967 IsAtEnd = true; 1968 CurrentEntry = nullptr; 1969 } 1970 break; 1971 default: 1972 case Token::TK_Error: 1973 IsAtEnd = true; 1974 CurrentEntry = nullptr; 1975 } 1976 } else if (SeqType == ST_Flow) { 1977 switch (T.Kind) { 1978 case Token::TK_FlowEntry: 1979 // Eat the flow entry and recurse. 1980 getNext(); 1981 WasPreviousTokenFlowEntry = true; 1982 return increment(); 1983 case Token::TK_FlowSequenceEnd: 1984 getNext(); 1985 case Token::TK_Error: 1986 // Set this to end iterator. 1987 IsAtEnd = true; 1988 CurrentEntry = nullptr; 1989 break; 1990 case Token::TK_StreamEnd: 1991 case Token::TK_DocumentEnd: 1992 case Token::TK_DocumentStart: 1993 setError("Could not find closing ]!", T); 1994 // Set this to end iterator. 1995 IsAtEnd = true; 1996 CurrentEntry = nullptr; 1997 break; 1998 default: 1999 if (!WasPreviousTokenFlowEntry) { 2000 setError("Expected , between entries!", T); 2001 IsAtEnd = true; 2002 CurrentEntry = nullptr; 2003 break; 2004 } 2005 // Otherwise it must be a flow entry. 2006 CurrentEntry = parseBlockNode(); 2007 if (!CurrentEntry) { 2008 IsAtEnd = true; 2009 } 2010 WasPreviousTokenFlowEntry = false; 2011 break; 2012 } 2013 } 2014 } 2015 2016 Document::Document(Stream &S) : stream(S), Root(nullptr) { 2017 // Tag maps starts with two default mappings. 2018 TagMap["!"] = "!"; 2019 TagMap["!!"] = "tag:yaml.org,2002:"; 2020 2021 if (parseDirectives()) 2022 expectToken(Token::TK_DocumentStart); 2023 Token &T = peekNext(); 2024 if (T.Kind == Token::TK_DocumentStart) 2025 getNext(); 2026 } 2027 2028 bool Document::skip() { 2029 if (stream.scanner->failed()) 2030 return false; 2031 if (!Root) 2032 getRoot(); 2033 Root->skip(); 2034 Token &T = peekNext(); 2035 if (T.Kind == Token::TK_StreamEnd) 2036 return false; 2037 if (T.Kind == Token::TK_DocumentEnd) { 2038 getNext(); 2039 return skip(); 2040 } 2041 return true; 2042 } 2043 2044 Token &Document::peekNext() { 2045 return stream.scanner->peekNext(); 2046 } 2047 2048 Token Document::getNext() { 2049 return stream.scanner->getNext(); 2050 } 2051 2052 void Document::setError(const Twine &Message, Token &Location) const { 2053 stream.scanner->setError(Message, Location.Range.begin()); 2054 } 2055 2056 bool Document::failed() const { 2057 return stream.scanner->failed(); 2058 } 2059 2060 Node *Document::parseBlockNode() { 2061 Token T = peekNext(); 2062 // Handle properties. 2063 Token AnchorInfo; 2064 Token TagInfo; 2065 parse_property: 2066 switch (T.Kind) { 2067 case Token::TK_Alias: 2068 getNext(); 2069 return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1)); 2070 case Token::TK_Anchor: 2071 if (AnchorInfo.Kind == Token::TK_Anchor) { 2072 setError("Already encountered an anchor for this node!", T); 2073 return nullptr; 2074 } 2075 AnchorInfo = getNext(); // Consume TK_Anchor. 2076 T = peekNext(); 2077 goto parse_property; 2078 case Token::TK_Tag: 2079 if (TagInfo.Kind == Token::TK_Tag) { 2080 setError("Already encountered a tag for this node!", T); 2081 return nullptr; 2082 } 2083 TagInfo = getNext(); // Consume TK_Tag. 2084 T = peekNext(); 2085 goto parse_property; 2086 default: 2087 break; 2088 } 2089 2090 switch (T.Kind) { 2091 case Token::TK_BlockEntry: 2092 // We got an unindented BlockEntry sequence. This is not terminated with 2093 // a BlockEnd. 2094 // Don't eat the TK_BlockEntry, SequenceNode needs it. 2095 return new (NodeAllocator) SequenceNode( stream.CurrentDoc 2096 , AnchorInfo.Range.substr(1) 2097 , TagInfo.Range 2098 , SequenceNode::ST_Indentless); 2099 case Token::TK_BlockSequenceStart: 2100 getNext(); 2101 return new (NodeAllocator) 2102 SequenceNode( stream.CurrentDoc 2103 , AnchorInfo.Range.substr(1) 2104 , TagInfo.Range 2105 , SequenceNode::ST_Block); 2106 case Token::TK_BlockMappingStart: 2107 getNext(); 2108 return new (NodeAllocator) 2109 MappingNode( stream.CurrentDoc 2110 , AnchorInfo.Range.substr(1) 2111 , TagInfo.Range 2112 , MappingNode::MT_Block); 2113 case Token::TK_FlowSequenceStart: 2114 getNext(); 2115 return new (NodeAllocator) 2116 SequenceNode( stream.CurrentDoc 2117 , AnchorInfo.Range.substr(1) 2118 , TagInfo.Range 2119 , SequenceNode::ST_Flow); 2120 case Token::TK_FlowMappingStart: 2121 getNext(); 2122 return new (NodeAllocator) 2123 MappingNode( stream.CurrentDoc 2124 , AnchorInfo.Range.substr(1) 2125 , TagInfo.Range 2126 , MappingNode::MT_Flow); 2127 case Token::TK_Scalar: 2128 getNext(); 2129 return new (NodeAllocator) 2130 ScalarNode( stream.CurrentDoc 2131 , AnchorInfo.Range.substr(1) 2132 , TagInfo.Range 2133 , T.Range); 2134 case Token::TK_Key: 2135 // Don't eat the TK_Key, KeyValueNode expects it. 2136 return new (NodeAllocator) 2137 MappingNode( stream.CurrentDoc 2138 , AnchorInfo.Range.substr(1) 2139 , TagInfo.Range 2140 , MappingNode::MT_Inline); 2141 case Token::TK_DocumentStart: 2142 case Token::TK_DocumentEnd: 2143 case Token::TK_StreamEnd: 2144 default: 2145 // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not 2146 // !!null null. 2147 return new (NodeAllocator) NullNode(stream.CurrentDoc); 2148 case Token::TK_Error: 2149 return nullptr; 2150 } 2151 llvm_unreachable("Control flow shouldn't reach here."); 2152 return nullptr; 2153 } 2154 2155 bool Document::parseDirectives() { 2156 bool isDirective = false; 2157 while (true) { 2158 Token T = peekNext(); 2159 if (T.Kind == Token::TK_TagDirective) { 2160 parseTAGDirective(); 2161 isDirective = true; 2162 } else if (T.Kind == Token::TK_VersionDirective) { 2163 parseYAMLDirective(); 2164 isDirective = true; 2165 } else 2166 break; 2167 } 2168 return isDirective; 2169 } 2170 2171 void Document::parseYAMLDirective() { 2172 getNext(); // Eat %YAML <version> 2173 } 2174 2175 void Document::parseTAGDirective() { 2176 Token Tag = getNext(); // %TAG <handle> <prefix> 2177 StringRef T = Tag.Range; 2178 // Strip %TAG 2179 T = T.substr(T.find_first_of(" \t")).ltrim(" \t"); 2180 std::size_t HandleEnd = T.find_first_of(" \t"); 2181 StringRef TagHandle = T.substr(0, HandleEnd); 2182 StringRef TagPrefix = T.substr(HandleEnd).ltrim(" \t"); 2183 TagMap[TagHandle] = TagPrefix; 2184 } 2185 2186 bool Document::expectToken(int TK) { 2187 Token T = getNext(); 2188 if (T.Kind != TK) { 2189 setError("Unexpected token", T); 2190 return false; 2191 } 2192 return true; 2193 } 2194