1 //===--- YAMLParser.cpp - Simple YAML parser ------------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file implements a YAML parser. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "llvm/Support/YAMLParser.h" 15 #include "llvm/ADT/SmallString.h" 16 #include "llvm/ADT/SmallVector.h" 17 #include "llvm/ADT/StringExtras.h" 18 #include "llvm/ADT/Twine.h" 19 #include "llvm/ADT/ilist.h" 20 #include "llvm/ADT/ilist_node.h" 21 #include "llvm/Support/ErrorHandling.h" 22 #include "llvm/Support/MemoryBuffer.h" 23 #include "llvm/Support/SourceMgr.h" 24 #include "llvm/Support/raw_ostream.h" 25 26 using namespace llvm; 27 using namespace yaml; 28 29 enum UnicodeEncodingForm { 30 UEF_UTF32_LE, ///< UTF-32 Little Endian 31 UEF_UTF32_BE, ///< UTF-32 Big Endian 32 UEF_UTF16_LE, ///< UTF-16 Little Endian 33 UEF_UTF16_BE, ///< UTF-16 Big Endian 34 UEF_UTF8, ///< UTF-8 or ascii. 35 UEF_Unknown ///< Not a valid Unicode encoding. 36 }; 37 38 /// EncodingInfo - Holds the encoding type and length of the byte order mark if 39 /// it exists. Length is in {0, 2, 3, 4}. 40 typedef std::pair<UnicodeEncodingForm, unsigned> EncodingInfo; 41 42 /// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode 43 /// encoding form of \a Input. 44 /// 45 /// @param Input A string of length 0 or more. 46 /// @returns An EncodingInfo indicating the Unicode encoding form of the input 47 /// and how long the byte order mark is if one exists. 48 static EncodingInfo getUnicodeEncoding(StringRef Input) { 49 if (Input.size() == 0) 50 return std::make_pair(UEF_Unknown, 0); 51 52 switch (uint8_t(Input[0])) { 53 case 0x00: 54 if (Input.size() >= 4) { 55 if ( Input[1] == 0 56 && uint8_t(Input[2]) == 0xFE 57 && uint8_t(Input[3]) == 0xFF) 58 return std::make_pair(UEF_UTF32_BE, 4); 59 if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0) 60 return std::make_pair(UEF_UTF32_BE, 0); 61 } 62 63 if (Input.size() >= 2 && Input[1] != 0) 64 return std::make_pair(UEF_UTF16_BE, 0); 65 return std::make_pair(UEF_Unknown, 0); 66 case 0xFF: 67 if ( Input.size() >= 4 68 && uint8_t(Input[1]) == 0xFE 69 && Input[2] == 0 70 && Input[3] == 0) 71 return std::make_pair(UEF_UTF32_LE, 4); 72 73 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE) 74 return std::make_pair(UEF_UTF16_LE, 2); 75 return std::make_pair(UEF_Unknown, 0); 76 case 0xFE: 77 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF) 78 return std::make_pair(UEF_UTF16_BE, 2); 79 return std::make_pair(UEF_Unknown, 0); 80 case 0xEF: 81 if ( Input.size() >= 3 82 && uint8_t(Input[1]) == 0xBB 83 && uint8_t(Input[2]) == 0xBF) 84 return std::make_pair(UEF_UTF8, 3); 85 return std::make_pair(UEF_Unknown, 0); 86 } 87 88 // It could still be utf-32 or utf-16. 89 if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0) 90 return std::make_pair(UEF_UTF32_LE, 0); 91 92 if (Input.size() >= 2 && Input[1] == 0) 93 return std::make_pair(UEF_UTF16_LE, 0); 94 95 return std::make_pair(UEF_UTF8, 0); 96 } 97 98 namespace llvm { 99 namespace yaml { 100 /// Pin the vtables to this file. 101 void Node::anchor() {} 102 void NullNode::anchor() {} 103 void ScalarNode::anchor() {} 104 void KeyValueNode::anchor() {} 105 void MappingNode::anchor() {} 106 void SequenceNode::anchor() {} 107 void AliasNode::anchor() {} 108 109 /// Token - A single YAML token. 110 struct Token : ilist_node<Token> { 111 enum TokenKind { 112 TK_Error, // Uninitialized token. 113 TK_StreamStart, 114 TK_StreamEnd, 115 TK_VersionDirective, 116 TK_TagDirective, 117 TK_DocumentStart, 118 TK_DocumentEnd, 119 TK_BlockEntry, 120 TK_BlockEnd, 121 TK_BlockSequenceStart, 122 TK_BlockMappingStart, 123 TK_FlowEntry, 124 TK_FlowSequenceStart, 125 TK_FlowSequenceEnd, 126 TK_FlowMappingStart, 127 TK_FlowMappingEnd, 128 TK_Key, 129 TK_Value, 130 TK_Scalar, 131 TK_Alias, 132 TK_Anchor, 133 TK_Tag 134 } Kind; 135 136 /// A string of length 0 or more whose begin() points to the logical location 137 /// of the token in the input. 138 StringRef Range; 139 140 Token() : Kind(TK_Error) {} 141 }; 142 } 143 } 144 145 namespace llvm { 146 template<> 147 struct ilist_sentinel_traits<Token> { 148 Token *createSentinel() const { 149 return &Sentinel; 150 } 151 static void destroySentinel(Token*) {} 152 153 Token *provideInitialHead() const { return createSentinel(); } 154 Token *ensureHead(Token*) const { return createSentinel(); } 155 static void noteHead(Token*, Token*) {} 156 157 private: 158 mutable Token Sentinel; 159 }; 160 161 template<> 162 struct ilist_node_traits<Token> { 163 Token *createNode(const Token &V) { 164 return new (Alloc.Allocate<Token>()) Token(V); 165 } 166 static void deleteNode(Token *V) {} 167 168 void addNodeToList(Token *) {} 169 void removeNodeFromList(Token *) {} 170 void transferNodesFromList(ilist_node_traits & /*SrcTraits*/, 171 ilist_iterator<Token> /*first*/, 172 ilist_iterator<Token> /*last*/) {} 173 174 BumpPtrAllocator Alloc; 175 }; 176 } 177 178 typedef ilist<Token> TokenQueueT; 179 180 namespace { 181 /// @brief This struct is used to track simple keys. 182 /// 183 /// Simple keys are handled by creating an entry in SimpleKeys for each Token 184 /// which could legally be the start of a simple key. When peekNext is called, 185 /// if the Token To be returned is referenced by a SimpleKey, we continue 186 /// tokenizing until that potential simple key has either been found to not be 187 /// a simple key (we moved on to the next line or went further than 1024 chars). 188 /// Or when we run into a Value, and then insert a Key token (and possibly 189 /// others) before the SimpleKey's Tok. 190 struct SimpleKey { 191 TokenQueueT::iterator Tok; 192 unsigned Column; 193 unsigned Line; 194 unsigned FlowLevel; 195 bool IsRequired; 196 197 bool operator ==(const SimpleKey &Other) { 198 return Tok == Other.Tok; 199 } 200 }; 201 } 202 203 /// @brief The Unicode scalar value of a UTF-8 minimal well-formed code unit 204 /// subsequence and the subsequence's length in code units (uint8_t). 205 /// A length of 0 represents an error. 206 typedef std::pair<uint32_t, unsigned> UTF8Decoded; 207 208 static UTF8Decoded decodeUTF8(StringRef Range) { 209 StringRef::iterator Position= Range.begin(); 210 StringRef::iterator End = Range.end(); 211 // 1 byte: [0x00, 0x7f] 212 // Bit pattern: 0xxxxxxx 213 if ((*Position & 0x80) == 0) { 214 return std::make_pair(*Position, 1); 215 } 216 // 2 bytes: [0x80, 0x7ff] 217 // Bit pattern: 110xxxxx 10xxxxxx 218 if (Position + 1 != End && 219 ((*Position & 0xE0) == 0xC0) && 220 ((*(Position + 1) & 0xC0) == 0x80)) { 221 uint32_t codepoint = ((*Position & 0x1F) << 6) | 222 (*(Position + 1) & 0x3F); 223 if (codepoint >= 0x80) 224 return std::make_pair(codepoint, 2); 225 } 226 // 3 bytes: [0x8000, 0xffff] 227 // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx 228 if (Position + 2 != End && 229 ((*Position & 0xF0) == 0xE0) && 230 ((*(Position + 1) & 0xC0) == 0x80) && 231 ((*(Position + 2) & 0xC0) == 0x80)) { 232 uint32_t codepoint = ((*Position & 0x0F) << 12) | 233 ((*(Position + 1) & 0x3F) << 6) | 234 (*(Position + 2) & 0x3F); 235 // Codepoints between 0xD800 and 0xDFFF are invalid, as 236 // they are high / low surrogate halves used by UTF-16. 237 if (codepoint >= 0x800 && 238 (codepoint < 0xD800 || codepoint > 0xDFFF)) 239 return std::make_pair(codepoint, 3); 240 } 241 // 4 bytes: [0x10000, 0x10FFFF] 242 // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 243 if (Position + 3 != End && 244 ((*Position & 0xF8) == 0xF0) && 245 ((*(Position + 1) & 0xC0) == 0x80) && 246 ((*(Position + 2) & 0xC0) == 0x80) && 247 ((*(Position + 3) & 0xC0) == 0x80)) { 248 uint32_t codepoint = ((*Position & 0x07) << 18) | 249 ((*(Position + 1) & 0x3F) << 12) | 250 ((*(Position + 2) & 0x3F) << 6) | 251 (*(Position + 3) & 0x3F); 252 if (codepoint >= 0x10000 && codepoint <= 0x10FFFF) 253 return std::make_pair(codepoint, 4); 254 } 255 return std::make_pair(0, 0); 256 } 257 258 namespace llvm { 259 namespace yaml { 260 /// @brief Scans YAML tokens from a MemoryBuffer. 261 class Scanner { 262 public: 263 Scanner(StringRef Input, SourceMgr &SM); 264 Scanner(MemoryBufferRef Buffer, SourceMgr &SM_); 265 266 /// @brief Parse the next token and return it without popping it. 267 Token &peekNext(); 268 269 /// @brief Parse the next token and pop it from the queue. 270 Token getNext(); 271 272 void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message, 273 ArrayRef<SMRange> Ranges = None) { 274 SM.PrintMessage(Loc, Kind, Message, Ranges); 275 } 276 277 void setError(const Twine &Message, StringRef::iterator Position) { 278 if (Current >= End) 279 Current = End - 1; 280 281 // Don't print out more errors after the first one we encounter. The rest 282 // are just the result of the first, and have no meaning. 283 if (!Failed) 284 printError(SMLoc::getFromPointer(Current), SourceMgr::DK_Error, Message); 285 Failed = true; 286 } 287 288 void setError(const Twine &Message) { 289 setError(Message, Current); 290 } 291 292 /// @brief Returns true if an error occurred while parsing. 293 bool failed() { 294 return Failed; 295 } 296 297 private: 298 void init(MemoryBufferRef Buffer); 299 300 StringRef currentInput() { 301 return StringRef(Current, End - Current); 302 } 303 304 /// @brief Decode a UTF-8 minimal well-formed code unit subsequence starting 305 /// at \a Position. 306 /// 307 /// If the UTF-8 code units starting at Position do not form a well-formed 308 /// code unit subsequence, then the Unicode scalar value is 0, and the length 309 /// is 0. 310 UTF8Decoded decodeUTF8(StringRef::iterator Position) { 311 return ::decodeUTF8(StringRef(Position, End - Position)); 312 } 313 314 // The following functions are based on the gramar rules in the YAML spec. The 315 // style of the function names it meant to closely match how they are written 316 // in the spec. The number within the [] is the number of the grammar rule in 317 // the spec. 318 // 319 // See 4.2 [Production Naming Conventions] for the meaning of the prefixes. 320 // 321 // c- 322 // A production starting and ending with a special character. 323 // b- 324 // A production matching a single line break. 325 // nb- 326 // A production starting and ending with a non-break character. 327 // s- 328 // A production starting and ending with a white space character. 329 // ns- 330 // A production starting and ending with a non-space character. 331 // l- 332 // A production matching complete line(s). 333 334 /// @brief Skip a single nb-char[27] starting at Position. 335 /// 336 /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE] 337 /// | [0xFF00-0xFFFD] | [0x10000-0x10FFFF] 338 /// 339 /// @returns The code unit after the nb-char, or Position if it's not an 340 /// nb-char. 341 StringRef::iterator skip_nb_char(StringRef::iterator Position); 342 343 /// @brief Skip a single b-break[28] starting at Position. 344 /// 345 /// A b-break is 0xD 0xA | 0xD | 0xA 346 /// 347 /// @returns The code unit after the b-break, or Position if it's not a 348 /// b-break. 349 StringRef::iterator skip_b_break(StringRef::iterator Position); 350 351 /// @brief Skip a single s-white[33] starting at Position. 352 /// 353 /// A s-white is 0x20 | 0x9 354 /// 355 /// @returns The code unit after the s-white, or Position if it's not a 356 /// s-white. 357 StringRef::iterator skip_s_white(StringRef::iterator Position); 358 359 /// @brief Skip a single ns-char[34] starting at Position. 360 /// 361 /// A ns-char is nb-char - s-white 362 /// 363 /// @returns The code unit after the ns-char, or Position if it's not a 364 /// ns-char. 365 StringRef::iterator skip_ns_char(StringRef::iterator Position); 366 367 typedef StringRef::iterator (Scanner::*SkipWhileFunc)(StringRef::iterator); 368 /// @brief Skip minimal well-formed code unit subsequences until Func 369 /// returns its input. 370 /// 371 /// @returns The code unit after the last minimal well-formed code unit 372 /// subsequence that Func accepted. 373 StringRef::iterator skip_while( SkipWhileFunc Func 374 , StringRef::iterator Position); 375 376 /// @brief Scan ns-uri-char[39]s starting at Cur. 377 /// 378 /// This updates Cur and Column while scanning. 379 /// 380 /// @returns A StringRef starting at Cur which covers the longest contiguous 381 /// sequence of ns-uri-char. 382 StringRef scan_ns_uri_char(); 383 384 /// @brief Consume a minimal well-formed code unit subsequence starting at 385 /// \a Cur. Return false if it is not the same Unicode scalar value as 386 /// \a Expected. This updates \a Column. 387 bool consume(uint32_t Expected); 388 389 /// @brief Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column. 390 void skip(uint32_t Distance); 391 392 /// @brief Return true if the minimal well-formed code unit subsequence at 393 /// Pos is whitespace or a new line 394 bool isBlankOrBreak(StringRef::iterator Position); 395 396 /// @brief If IsSimpleKeyAllowed, create and push_back a new SimpleKey. 397 void saveSimpleKeyCandidate( TokenQueueT::iterator Tok 398 , unsigned AtColumn 399 , bool IsRequired); 400 401 /// @brief Remove simple keys that can no longer be valid simple keys. 402 /// 403 /// Invalid simple keys are not on the current line or are further than 1024 404 /// columns back. 405 void removeStaleSimpleKeyCandidates(); 406 407 /// @brief Remove all simple keys on FlowLevel \a Level. 408 void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level); 409 410 /// @brief Unroll indentation in \a Indents back to \a Col. Creates BlockEnd 411 /// tokens if needed. 412 bool unrollIndent(int ToColumn); 413 414 /// @brief Increase indent to \a Col. Creates \a Kind token at \a InsertPoint 415 /// if needed. 416 bool rollIndent( int ToColumn 417 , Token::TokenKind Kind 418 , TokenQueueT::iterator InsertPoint); 419 420 /// @brief Skip whitespace and comments until the start of the next token. 421 void scanToNextToken(); 422 423 /// @brief Must be the first token generated. 424 bool scanStreamStart(); 425 426 /// @brief Generate tokens needed to close out the stream. 427 bool scanStreamEnd(); 428 429 /// @brief Scan a %BLAH directive. 430 bool scanDirective(); 431 432 /// @brief Scan a ... or ---. 433 bool scanDocumentIndicator(bool IsStart); 434 435 /// @brief Scan a [ or { and generate the proper flow collection start token. 436 bool scanFlowCollectionStart(bool IsSequence); 437 438 /// @brief Scan a ] or } and generate the proper flow collection end token. 439 bool scanFlowCollectionEnd(bool IsSequence); 440 441 /// @brief Scan the , that separates entries in a flow collection. 442 bool scanFlowEntry(); 443 444 /// @brief Scan the - that starts block sequence entries. 445 bool scanBlockEntry(); 446 447 /// @brief Scan an explicit ? indicating a key. 448 bool scanKey(); 449 450 /// @brief Scan an explicit : indicating a value. 451 bool scanValue(); 452 453 /// @brief Scan a quoted scalar. 454 bool scanFlowScalar(bool IsDoubleQuoted); 455 456 /// @brief Scan an unquoted scalar. 457 bool scanPlainScalar(); 458 459 /// @brief Scan an Alias or Anchor starting with * or &. 460 bool scanAliasOrAnchor(bool IsAlias); 461 462 /// @brief Scan a block scalar starting with | or >. 463 bool scanBlockScalar(bool IsLiteral); 464 465 /// @brief Scan a tag of the form !stuff. 466 bool scanTag(); 467 468 /// @brief Dispatch to the next scanning function based on \a *Cur. 469 bool fetchMoreTokens(); 470 471 /// @brief The SourceMgr used for diagnostics and buffer management. 472 SourceMgr &SM; 473 474 /// @brief The original input. 475 MemoryBufferRef InputBuffer; 476 477 /// @brief The current position of the scanner. 478 StringRef::iterator Current; 479 480 /// @brief The end of the input (one past the last character). 481 StringRef::iterator End; 482 483 /// @brief Current YAML indentation level in spaces. 484 int Indent; 485 486 /// @brief Current column number in Unicode code points. 487 unsigned Column; 488 489 /// @brief Current line number. 490 unsigned Line; 491 492 /// @brief How deep we are in flow style containers. 0 Means at block level. 493 unsigned FlowLevel; 494 495 /// @brief Are we at the start of the stream? 496 bool IsStartOfStream; 497 498 /// @brief Can the next token be the start of a simple key? 499 bool IsSimpleKeyAllowed; 500 501 /// @brief True if an error has occurred. 502 bool Failed; 503 504 /// @brief Queue of tokens. This is required to queue up tokens while looking 505 /// for the end of a simple key. And for cases where a single character 506 /// can produce multiple tokens (e.g. BlockEnd). 507 TokenQueueT TokenQueue; 508 509 /// @brief Indentation levels. 510 SmallVector<int, 4> Indents; 511 512 /// @brief Potential simple keys. 513 SmallVector<SimpleKey, 4> SimpleKeys; 514 }; 515 516 } // end namespace yaml 517 } // end namespace llvm 518 519 /// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result. 520 static void encodeUTF8( uint32_t UnicodeScalarValue 521 , SmallVectorImpl<char> &Result) { 522 if (UnicodeScalarValue <= 0x7F) { 523 Result.push_back(UnicodeScalarValue & 0x7F); 524 } else if (UnicodeScalarValue <= 0x7FF) { 525 uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6); 526 uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F); 527 Result.push_back(FirstByte); 528 Result.push_back(SecondByte); 529 } else if (UnicodeScalarValue <= 0xFFFF) { 530 uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12); 531 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 532 uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F); 533 Result.push_back(FirstByte); 534 Result.push_back(SecondByte); 535 Result.push_back(ThirdByte); 536 } else if (UnicodeScalarValue <= 0x10FFFF) { 537 uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18); 538 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12); 539 uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 540 uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F); 541 Result.push_back(FirstByte); 542 Result.push_back(SecondByte); 543 Result.push_back(ThirdByte); 544 Result.push_back(FourthByte); 545 } 546 } 547 548 bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) { 549 SourceMgr SM; 550 Scanner scanner(Input, SM); 551 while (true) { 552 Token T = scanner.getNext(); 553 switch (T.Kind) { 554 case Token::TK_StreamStart: 555 OS << "Stream-Start: "; 556 break; 557 case Token::TK_StreamEnd: 558 OS << "Stream-End: "; 559 break; 560 case Token::TK_VersionDirective: 561 OS << "Version-Directive: "; 562 break; 563 case Token::TK_TagDirective: 564 OS << "Tag-Directive: "; 565 break; 566 case Token::TK_DocumentStart: 567 OS << "Document-Start: "; 568 break; 569 case Token::TK_DocumentEnd: 570 OS << "Document-End: "; 571 break; 572 case Token::TK_BlockEntry: 573 OS << "Block-Entry: "; 574 break; 575 case Token::TK_BlockEnd: 576 OS << "Block-End: "; 577 break; 578 case Token::TK_BlockSequenceStart: 579 OS << "Block-Sequence-Start: "; 580 break; 581 case Token::TK_BlockMappingStart: 582 OS << "Block-Mapping-Start: "; 583 break; 584 case Token::TK_FlowEntry: 585 OS << "Flow-Entry: "; 586 break; 587 case Token::TK_FlowSequenceStart: 588 OS << "Flow-Sequence-Start: "; 589 break; 590 case Token::TK_FlowSequenceEnd: 591 OS << "Flow-Sequence-End: "; 592 break; 593 case Token::TK_FlowMappingStart: 594 OS << "Flow-Mapping-Start: "; 595 break; 596 case Token::TK_FlowMappingEnd: 597 OS << "Flow-Mapping-End: "; 598 break; 599 case Token::TK_Key: 600 OS << "Key: "; 601 break; 602 case Token::TK_Value: 603 OS << "Value: "; 604 break; 605 case Token::TK_Scalar: 606 OS << "Scalar: "; 607 break; 608 case Token::TK_Alias: 609 OS << "Alias: "; 610 break; 611 case Token::TK_Anchor: 612 OS << "Anchor: "; 613 break; 614 case Token::TK_Tag: 615 OS << "Tag: "; 616 break; 617 case Token::TK_Error: 618 break; 619 } 620 OS << T.Range << "\n"; 621 if (T.Kind == Token::TK_StreamEnd) 622 break; 623 else if (T.Kind == Token::TK_Error) 624 return false; 625 } 626 return true; 627 } 628 629 bool yaml::scanTokens(StringRef Input) { 630 llvm::SourceMgr SM; 631 llvm::yaml::Scanner scanner(Input, SM); 632 for (;;) { 633 llvm::yaml::Token T = scanner.getNext(); 634 if (T.Kind == Token::TK_StreamEnd) 635 break; 636 else if (T.Kind == Token::TK_Error) 637 return false; 638 } 639 return true; 640 } 641 642 std::string yaml::escape(StringRef Input) { 643 std::string EscapedInput; 644 for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) { 645 if (*i == '\\') 646 EscapedInput += "\\\\"; 647 else if (*i == '"') 648 EscapedInput += "\\\""; 649 else if (*i == 0) 650 EscapedInput += "\\0"; 651 else if (*i == 0x07) 652 EscapedInput += "\\a"; 653 else if (*i == 0x08) 654 EscapedInput += "\\b"; 655 else if (*i == 0x09) 656 EscapedInput += "\\t"; 657 else if (*i == 0x0A) 658 EscapedInput += "\\n"; 659 else if (*i == 0x0B) 660 EscapedInput += "\\v"; 661 else if (*i == 0x0C) 662 EscapedInput += "\\f"; 663 else if (*i == 0x0D) 664 EscapedInput += "\\r"; 665 else if (*i == 0x1B) 666 EscapedInput += "\\e"; 667 else if ((unsigned char)*i < 0x20) { // Control characters not handled above. 668 std::string HexStr = utohexstr(*i); 669 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 670 } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence. 671 UTF8Decoded UnicodeScalarValue 672 = decodeUTF8(StringRef(i, Input.end() - i)); 673 if (UnicodeScalarValue.second == 0) { 674 // Found invalid char. 675 SmallString<4> Val; 676 encodeUTF8(0xFFFD, Val); 677 EscapedInput.insert(EscapedInput.end(), Val.begin(), Val.end()); 678 // FIXME: Error reporting. 679 return EscapedInput; 680 } 681 if (UnicodeScalarValue.first == 0x85) 682 EscapedInput += "\\N"; 683 else if (UnicodeScalarValue.first == 0xA0) 684 EscapedInput += "\\_"; 685 else if (UnicodeScalarValue.first == 0x2028) 686 EscapedInput += "\\L"; 687 else if (UnicodeScalarValue.first == 0x2029) 688 EscapedInput += "\\P"; 689 else { 690 std::string HexStr = utohexstr(UnicodeScalarValue.first); 691 if (HexStr.size() <= 2) 692 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 693 else if (HexStr.size() <= 4) 694 EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr; 695 else if (HexStr.size() <= 8) 696 EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr; 697 } 698 i += UnicodeScalarValue.second - 1; 699 } else 700 EscapedInput.push_back(*i); 701 } 702 return EscapedInput; 703 } 704 705 Scanner::Scanner(StringRef Input, SourceMgr &sm) : SM(sm) { 706 init(MemoryBufferRef(Input, "YAML")); 707 } 708 709 Scanner::Scanner(MemoryBufferRef Buffer, SourceMgr &SM_) : SM(SM_) { 710 init(Buffer); 711 } 712 713 void Scanner::init(MemoryBufferRef Buffer) { 714 InputBuffer = Buffer; 715 Current = InputBuffer.getBufferStart(); 716 End = InputBuffer.getBufferEnd(); 717 Indent = -1; 718 Column = 0; 719 Line = 0; 720 FlowLevel = 0; 721 IsStartOfStream = true; 722 IsSimpleKeyAllowed = true; 723 Failed = false; 724 std::unique_ptr<MemoryBuffer> InputBufferOwner = 725 MemoryBuffer::getMemBuffer(Buffer); 726 SM.AddNewSourceBuffer(std::move(InputBufferOwner), SMLoc()); 727 } 728 729 Token &Scanner::peekNext() { 730 // If the current token is a possible simple key, keep parsing until we 731 // can confirm. 732 bool NeedMore = false; 733 while (true) { 734 if (TokenQueue.empty() || NeedMore) { 735 if (!fetchMoreTokens()) { 736 TokenQueue.clear(); 737 TokenQueue.push_back(Token()); 738 return TokenQueue.front(); 739 } 740 } 741 assert(!TokenQueue.empty() && 742 "fetchMoreTokens lied about getting tokens!"); 743 744 removeStaleSimpleKeyCandidates(); 745 SimpleKey SK; 746 SK.Tok = TokenQueue.front(); 747 if (std::find(SimpleKeys.begin(), SimpleKeys.end(), SK) 748 == SimpleKeys.end()) 749 break; 750 else 751 NeedMore = true; 752 } 753 return TokenQueue.front(); 754 } 755 756 Token Scanner::getNext() { 757 Token Ret = peekNext(); 758 // TokenQueue can be empty if there was an error getting the next token. 759 if (!TokenQueue.empty()) 760 TokenQueue.pop_front(); 761 762 // There cannot be any referenced Token's if the TokenQueue is empty. So do a 763 // quick deallocation of them all. 764 if (TokenQueue.empty()) { 765 TokenQueue.Alloc.Reset(); 766 } 767 768 return Ret; 769 } 770 771 StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) { 772 if (Position == End) 773 return Position; 774 // Check 7 bit c-printable - b-char. 775 if ( *Position == 0x09 776 || (*Position >= 0x20 && *Position <= 0x7E)) 777 return Position + 1; 778 779 // Check for valid UTF-8. 780 if (uint8_t(*Position) & 0x80) { 781 UTF8Decoded u8d = decodeUTF8(Position); 782 if ( u8d.second != 0 783 && u8d.first != 0xFEFF 784 && ( u8d.first == 0x85 785 || ( u8d.first >= 0xA0 786 && u8d.first <= 0xD7FF) 787 || ( u8d.first >= 0xE000 788 && u8d.first <= 0xFFFD) 789 || ( u8d.first >= 0x10000 790 && u8d.first <= 0x10FFFF))) 791 return Position + u8d.second; 792 } 793 return Position; 794 } 795 796 StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) { 797 if (Position == End) 798 return Position; 799 if (*Position == 0x0D) { 800 if (Position + 1 != End && *(Position + 1) == 0x0A) 801 return Position + 2; 802 return Position + 1; 803 } 804 805 if (*Position == 0x0A) 806 return Position + 1; 807 return Position; 808 } 809 810 811 StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) { 812 if (Position == End) 813 return Position; 814 if (*Position == ' ' || *Position == '\t') 815 return Position + 1; 816 return Position; 817 } 818 819 StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) { 820 if (Position == End) 821 return Position; 822 if (*Position == ' ' || *Position == '\t') 823 return Position; 824 return skip_nb_char(Position); 825 } 826 827 StringRef::iterator Scanner::skip_while( SkipWhileFunc Func 828 , StringRef::iterator Position) { 829 while (true) { 830 StringRef::iterator i = (this->*Func)(Position); 831 if (i == Position) 832 break; 833 Position = i; 834 } 835 return Position; 836 } 837 838 static bool is_ns_hex_digit(const char C) { 839 return (C >= '0' && C <= '9') 840 || (C >= 'a' && C <= 'z') 841 || (C >= 'A' && C <= 'Z'); 842 } 843 844 static bool is_ns_word_char(const char C) { 845 return C == '-' 846 || (C >= 'a' && C <= 'z') 847 || (C >= 'A' && C <= 'Z'); 848 } 849 850 StringRef Scanner::scan_ns_uri_char() { 851 StringRef::iterator Start = Current; 852 while (true) { 853 if (Current == End) 854 break; 855 if (( *Current == '%' 856 && Current + 2 < End 857 && is_ns_hex_digit(*(Current + 1)) 858 && is_ns_hex_digit(*(Current + 2))) 859 || is_ns_word_char(*Current) 860 || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]") 861 != StringRef::npos) { 862 ++Current; 863 ++Column; 864 } else 865 break; 866 } 867 return StringRef(Start, Current - Start); 868 } 869 870 bool Scanner::consume(uint32_t Expected) { 871 if (Expected >= 0x80) 872 report_fatal_error("Not dealing with this yet"); 873 if (Current == End) 874 return false; 875 if (uint8_t(*Current) >= 0x80) 876 report_fatal_error("Not dealing with this yet"); 877 if (uint8_t(*Current) == Expected) { 878 ++Current; 879 ++Column; 880 return true; 881 } 882 return false; 883 } 884 885 void Scanner::skip(uint32_t Distance) { 886 Current += Distance; 887 Column += Distance; 888 assert(Current <= End && "Skipped past the end"); 889 } 890 891 bool Scanner::isBlankOrBreak(StringRef::iterator Position) { 892 if (Position == End) 893 return false; 894 if ( *Position == ' ' || *Position == '\t' 895 || *Position == '\r' || *Position == '\n') 896 return true; 897 return false; 898 } 899 900 void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok 901 , unsigned AtColumn 902 , bool IsRequired) { 903 if (IsSimpleKeyAllowed) { 904 SimpleKey SK; 905 SK.Tok = Tok; 906 SK.Line = Line; 907 SK.Column = AtColumn; 908 SK.IsRequired = IsRequired; 909 SK.FlowLevel = FlowLevel; 910 SimpleKeys.push_back(SK); 911 } 912 } 913 914 void Scanner::removeStaleSimpleKeyCandidates() { 915 for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin(); 916 i != SimpleKeys.end();) { 917 if (i->Line != Line || i->Column + 1024 < Column) { 918 if (i->IsRequired) 919 setError( "Could not find expected : for simple key" 920 , i->Tok->Range.begin()); 921 i = SimpleKeys.erase(i); 922 } else 923 ++i; 924 } 925 } 926 927 void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) { 928 if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level) 929 SimpleKeys.pop_back(); 930 } 931 932 bool Scanner::unrollIndent(int ToColumn) { 933 Token T; 934 // Indentation is ignored in flow. 935 if (FlowLevel != 0) 936 return true; 937 938 while (Indent > ToColumn) { 939 T.Kind = Token::TK_BlockEnd; 940 T.Range = StringRef(Current, 1); 941 TokenQueue.push_back(T); 942 Indent = Indents.pop_back_val(); 943 } 944 945 return true; 946 } 947 948 bool Scanner::rollIndent( int ToColumn 949 , Token::TokenKind Kind 950 , TokenQueueT::iterator InsertPoint) { 951 if (FlowLevel) 952 return true; 953 if (Indent < ToColumn) { 954 Indents.push_back(Indent); 955 Indent = ToColumn; 956 957 Token T; 958 T.Kind = Kind; 959 T.Range = StringRef(Current, 0); 960 TokenQueue.insert(InsertPoint, T); 961 } 962 return true; 963 } 964 965 void Scanner::scanToNextToken() { 966 while (true) { 967 while (*Current == ' ' || *Current == '\t') { 968 skip(1); 969 } 970 971 // Skip comment. 972 if (*Current == '#') { 973 while (true) { 974 // This may skip more than one byte, thus Column is only incremented 975 // for code points. 976 StringRef::iterator i = skip_nb_char(Current); 977 if (i == Current) 978 break; 979 Current = i; 980 ++Column; 981 } 982 } 983 984 // Skip EOL. 985 StringRef::iterator i = skip_b_break(Current); 986 if (i == Current) 987 break; 988 Current = i; 989 ++Line; 990 Column = 0; 991 // New lines may start a simple key. 992 if (!FlowLevel) 993 IsSimpleKeyAllowed = true; 994 } 995 } 996 997 bool Scanner::scanStreamStart() { 998 IsStartOfStream = false; 999 1000 EncodingInfo EI = getUnicodeEncoding(currentInput()); 1001 1002 Token T; 1003 T.Kind = Token::TK_StreamStart; 1004 T.Range = StringRef(Current, EI.second); 1005 TokenQueue.push_back(T); 1006 Current += EI.second; 1007 return true; 1008 } 1009 1010 bool Scanner::scanStreamEnd() { 1011 // Force an ending new line if one isn't present. 1012 if (Column != 0) { 1013 Column = 0; 1014 ++Line; 1015 } 1016 1017 unrollIndent(-1); 1018 SimpleKeys.clear(); 1019 IsSimpleKeyAllowed = false; 1020 1021 Token T; 1022 T.Kind = Token::TK_StreamEnd; 1023 T.Range = StringRef(Current, 0); 1024 TokenQueue.push_back(T); 1025 return true; 1026 } 1027 1028 bool Scanner::scanDirective() { 1029 // Reset the indentation level. 1030 unrollIndent(-1); 1031 SimpleKeys.clear(); 1032 IsSimpleKeyAllowed = false; 1033 1034 StringRef::iterator Start = Current; 1035 consume('%'); 1036 StringRef::iterator NameStart = Current; 1037 Current = skip_while(&Scanner::skip_ns_char, Current); 1038 StringRef Name(NameStart, Current - NameStart); 1039 Current = skip_while(&Scanner::skip_s_white, Current); 1040 1041 Token T; 1042 if (Name == "YAML") { 1043 Current = skip_while(&Scanner::skip_ns_char, Current); 1044 T.Kind = Token::TK_VersionDirective; 1045 T.Range = StringRef(Start, Current - Start); 1046 TokenQueue.push_back(T); 1047 return true; 1048 } else if(Name == "TAG") { 1049 Current = skip_while(&Scanner::skip_ns_char, Current); 1050 Current = skip_while(&Scanner::skip_s_white, Current); 1051 Current = skip_while(&Scanner::skip_ns_char, Current); 1052 T.Kind = Token::TK_TagDirective; 1053 T.Range = StringRef(Start, Current - Start); 1054 TokenQueue.push_back(T); 1055 return true; 1056 } 1057 return false; 1058 } 1059 1060 bool Scanner::scanDocumentIndicator(bool IsStart) { 1061 unrollIndent(-1); 1062 SimpleKeys.clear(); 1063 IsSimpleKeyAllowed = false; 1064 1065 Token T; 1066 T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd; 1067 T.Range = StringRef(Current, 3); 1068 skip(3); 1069 TokenQueue.push_back(T); 1070 return true; 1071 } 1072 1073 bool Scanner::scanFlowCollectionStart(bool IsSequence) { 1074 Token T; 1075 T.Kind = IsSequence ? Token::TK_FlowSequenceStart 1076 : Token::TK_FlowMappingStart; 1077 T.Range = StringRef(Current, 1); 1078 skip(1); 1079 TokenQueue.push_back(T); 1080 1081 // [ and { may begin a simple key. 1082 saveSimpleKeyCandidate(TokenQueue.back(), Column - 1, false); 1083 1084 // And may also be followed by a simple key. 1085 IsSimpleKeyAllowed = true; 1086 ++FlowLevel; 1087 return true; 1088 } 1089 1090 bool Scanner::scanFlowCollectionEnd(bool IsSequence) { 1091 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1092 IsSimpleKeyAllowed = false; 1093 Token T; 1094 T.Kind = IsSequence ? Token::TK_FlowSequenceEnd 1095 : Token::TK_FlowMappingEnd; 1096 T.Range = StringRef(Current, 1); 1097 skip(1); 1098 TokenQueue.push_back(T); 1099 if (FlowLevel) 1100 --FlowLevel; 1101 return true; 1102 } 1103 1104 bool Scanner::scanFlowEntry() { 1105 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1106 IsSimpleKeyAllowed = true; 1107 Token T; 1108 T.Kind = Token::TK_FlowEntry; 1109 T.Range = StringRef(Current, 1); 1110 skip(1); 1111 TokenQueue.push_back(T); 1112 return true; 1113 } 1114 1115 bool Scanner::scanBlockEntry() { 1116 rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end()); 1117 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1118 IsSimpleKeyAllowed = true; 1119 Token T; 1120 T.Kind = Token::TK_BlockEntry; 1121 T.Range = StringRef(Current, 1); 1122 skip(1); 1123 TokenQueue.push_back(T); 1124 return true; 1125 } 1126 1127 bool Scanner::scanKey() { 1128 if (!FlowLevel) 1129 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1130 1131 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1132 IsSimpleKeyAllowed = !FlowLevel; 1133 1134 Token T; 1135 T.Kind = Token::TK_Key; 1136 T.Range = StringRef(Current, 1); 1137 skip(1); 1138 TokenQueue.push_back(T); 1139 return true; 1140 } 1141 1142 bool Scanner::scanValue() { 1143 // If the previous token could have been a simple key, insert the key token 1144 // into the token queue. 1145 if (!SimpleKeys.empty()) { 1146 SimpleKey SK = SimpleKeys.pop_back_val(); 1147 Token T; 1148 T.Kind = Token::TK_Key; 1149 T.Range = SK.Tok->Range; 1150 TokenQueueT::iterator i, e; 1151 for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) { 1152 if (i == SK.Tok) 1153 break; 1154 } 1155 assert(i != e && "SimpleKey not in token queue!"); 1156 i = TokenQueue.insert(i, T); 1157 1158 // We may also need to add a Block-Mapping-Start token. 1159 rollIndent(SK.Column, Token::TK_BlockMappingStart, i); 1160 1161 IsSimpleKeyAllowed = false; 1162 } else { 1163 if (!FlowLevel) 1164 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1165 IsSimpleKeyAllowed = !FlowLevel; 1166 } 1167 1168 Token T; 1169 T.Kind = Token::TK_Value; 1170 T.Range = StringRef(Current, 1); 1171 skip(1); 1172 TokenQueue.push_back(T); 1173 return true; 1174 } 1175 1176 // Forbidding inlining improves performance by roughly 20%. 1177 // FIXME: Remove once llvm optimizes this to the faster version without hints. 1178 LLVM_ATTRIBUTE_NOINLINE static bool 1179 wasEscaped(StringRef::iterator First, StringRef::iterator Position); 1180 1181 // Returns whether a character at 'Position' was escaped with a leading '\'. 1182 // 'First' specifies the position of the first character in the string. 1183 static bool wasEscaped(StringRef::iterator First, 1184 StringRef::iterator Position) { 1185 assert(Position - 1 >= First); 1186 StringRef::iterator I = Position - 1; 1187 // We calculate the number of consecutive '\'s before the current position 1188 // by iterating backwards through our string. 1189 while (I >= First && *I == '\\') --I; 1190 // (Position - 1 - I) now contains the number of '\'s before the current 1191 // position. If it is odd, the character at 'Position' was escaped. 1192 return (Position - 1 - I) % 2 == 1; 1193 } 1194 1195 bool Scanner::scanFlowScalar(bool IsDoubleQuoted) { 1196 StringRef::iterator Start = Current; 1197 unsigned ColStart = Column; 1198 if (IsDoubleQuoted) { 1199 do { 1200 ++Current; 1201 while (Current != End && *Current != '"') 1202 ++Current; 1203 // Repeat until the previous character was not a '\' or was an escaped 1204 // backslash. 1205 } while ( Current != End 1206 && *(Current - 1) == '\\' 1207 && wasEscaped(Start + 1, Current)); 1208 } else { 1209 skip(1); 1210 while (true) { 1211 // Skip a ' followed by another '. 1212 if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') { 1213 skip(2); 1214 continue; 1215 } else if (*Current == '\'') 1216 break; 1217 StringRef::iterator i = skip_nb_char(Current); 1218 if (i == Current) { 1219 i = skip_b_break(Current); 1220 if (i == Current) 1221 break; 1222 Current = i; 1223 Column = 0; 1224 ++Line; 1225 } else { 1226 if (i == End) 1227 break; 1228 Current = i; 1229 ++Column; 1230 } 1231 } 1232 } 1233 1234 if (Current == End) { 1235 setError("Expected quote at end of scalar", Current); 1236 return false; 1237 } 1238 1239 skip(1); // Skip ending quote. 1240 Token T; 1241 T.Kind = Token::TK_Scalar; 1242 T.Range = StringRef(Start, Current - Start); 1243 TokenQueue.push_back(T); 1244 1245 saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); 1246 1247 IsSimpleKeyAllowed = false; 1248 1249 return true; 1250 } 1251 1252 bool Scanner::scanPlainScalar() { 1253 StringRef::iterator Start = Current; 1254 unsigned ColStart = Column; 1255 unsigned LeadingBlanks = 0; 1256 assert(Indent >= -1 && "Indent must be >= -1 !"); 1257 unsigned indent = static_cast<unsigned>(Indent + 1); 1258 while (true) { 1259 if (*Current == '#') 1260 break; 1261 1262 while (!isBlankOrBreak(Current)) { 1263 if ( FlowLevel && *Current == ':' 1264 && !(isBlankOrBreak(Current + 1) || *(Current + 1) == ',')) { 1265 setError("Found unexpected ':' while scanning a plain scalar", Current); 1266 return false; 1267 } 1268 1269 // Check for the end of the plain scalar. 1270 if ( (*Current == ':' && isBlankOrBreak(Current + 1)) 1271 || ( FlowLevel 1272 && (StringRef(Current, 1).find_first_of(",:?[]{}") 1273 != StringRef::npos))) 1274 break; 1275 1276 StringRef::iterator i = skip_nb_char(Current); 1277 if (i == Current) 1278 break; 1279 Current = i; 1280 ++Column; 1281 } 1282 1283 // Are we at the end? 1284 if (!isBlankOrBreak(Current)) 1285 break; 1286 1287 // Eat blanks. 1288 StringRef::iterator Tmp = Current; 1289 while (isBlankOrBreak(Tmp)) { 1290 StringRef::iterator i = skip_s_white(Tmp); 1291 if (i != Tmp) { 1292 if (LeadingBlanks && (Column < indent) && *Tmp == '\t') { 1293 setError("Found invalid tab character in indentation", Tmp); 1294 return false; 1295 } 1296 Tmp = i; 1297 ++Column; 1298 } else { 1299 i = skip_b_break(Tmp); 1300 if (!LeadingBlanks) 1301 LeadingBlanks = 1; 1302 Tmp = i; 1303 Column = 0; 1304 ++Line; 1305 } 1306 } 1307 1308 if (!FlowLevel && Column < indent) 1309 break; 1310 1311 Current = Tmp; 1312 } 1313 if (Start == Current) { 1314 setError("Got empty plain scalar", Start); 1315 return false; 1316 } 1317 Token T; 1318 T.Kind = Token::TK_Scalar; 1319 T.Range = StringRef(Start, Current - Start); 1320 TokenQueue.push_back(T); 1321 1322 // Plain scalars can be simple keys. 1323 saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); 1324 1325 IsSimpleKeyAllowed = false; 1326 1327 return true; 1328 } 1329 1330 bool Scanner::scanAliasOrAnchor(bool IsAlias) { 1331 StringRef::iterator Start = Current; 1332 unsigned ColStart = Column; 1333 skip(1); 1334 while(true) { 1335 if ( *Current == '[' || *Current == ']' 1336 || *Current == '{' || *Current == '}' 1337 || *Current == ',' 1338 || *Current == ':') 1339 break; 1340 StringRef::iterator i = skip_ns_char(Current); 1341 if (i == Current) 1342 break; 1343 Current = i; 1344 ++Column; 1345 } 1346 1347 if (Start == Current) { 1348 setError("Got empty alias or anchor", Start); 1349 return false; 1350 } 1351 1352 Token T; 1353 T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor; 1354 T.Range = StringRef(Start, Current - Start); 1355 TokenQueue.push_back(T); 1356 1357 // Alias and anchors can be simple keys. 1358 saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); 1359 1360 IsSimpleKeyAllowed = false; 1361 1362 return true; 1363 } 1364 1365 bool Scanner::scanBlockScalar(bool IsLiteral) { 1366 StringRef::iterator Start = Current; 1367 skip(1); // Eat | or > 1368 while(true) { 1369 StringRef::iterator i = skip_nb_char(Current); 1370 if (i == Current) { 1371 if (Column == 0) 1372 break; 1373 i = skip_b_break(Current); 1374 if (i != Current) { 1375 // We got a line break. 1376 Column = 0; 1377 ++Line; 1378 Current = i; 1379 continue; 1380 } else { 1381 // There was an error, which should already have been printed out. 1382 return false; 1383 } 1384 } 1385 Current = i; 1386 ++Column; 1387 } 1388 1389 if (Start == Current) { 1390 setError("Got empty block scalar", Start); 1391 return false; 1392 } 1393 1394 Token T; 1395 T.Kind = Token::TK_Scalar; 1396 T.Range = StringRef(Start, Current - Start); 1397 TokenQueue.push_back(T); 1398 return true; 1399 } 1400 1401 bool Scanner::scanTag() { 1402 StringRef::iterator Start = Current; 1403 unsigned ColStart = Column; 1404 skip(1); // Eat !. 1405 if (Current == End || isBlankOrBreak(Current)); // An empty tag. 1406 else if (*Current == '<') { 1407 skip(1); 1408 scan_ns_uri_char(); 1409 if (!consume('>')) 1410 return false; 1411 } else { 1412 // FIXME: Actually parse the c-ns-shorthand-tag rule. 1413 Current = skip_while(&Scanner::skip_ns_char, Current); 1414 } 1415 1416 Token T; 1417 T.Kind = Token::TK_Tag; 1418 T.Range = StringRef(Start, Current - Start); 1419 TokenQueue.push_back(T); 1420 1421 // Tags can be simple keys. 1422 saveSimpleKeyCandidate(TokenQueue.back(), ColStart, false); 1423 1424 IsSimpleKeyAllowed = false; 1425 1426 return true; 1427 } 1428 1429 bool Scanner::fetchMoreTokens() { 1430 if (IsStartOfStream) 1431 return scanStreamStart(); 1432 1433 scanToNextToken(); 1434 1435 if (Current == End) 1436 return scanStreamEnd(); 1437 1438 removeStaleSimpleKeyCandidates(); 1439 1440 unrollIndent(Column); 1441 1442 if (Column == 0 && *Current == '%') 1443 return scanDirective(); 1444 1445 if (Column == 0 && Current + 4 <= End 1446 && *Current == '-' 1447 && *(Current + 1) == '-' 1448 && *(Current + 2) == '-' 1449 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1450 return scanDocumentIndicator(true); 1451 1452 if (Column == 0 && Current + 4 <= End 1453 && *Current == '.' 1454 && *(Current + 1) == '.' 1455 && *(Current + 2) == '.' 1456 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1457 return scanDocumentIndicator(false); 1458 1459 if (*Current == '[') 1460 return scanFlowCollectionStart(true); 1461 1462 if (*Current == '{') 1463 return scanFlowCollectionStart(false); 1464 1465 if (*Current == ']') 1466 return scanFlowCollectionEnd(true); 1467 1468 if (*Current == '}') 1469 return scanFlowCollectionEnd(false); 1470 1471 if (*Current == ',') 1472 return scanFlowEntry(); 1473 1474 if (*Current == '-' && isBlankOrBreak(Current + 1)) 1475 return scanBlockEntry(); 1476 1477 if (*Current == '?' && (FlowLevel || isBlankOrBreak(Current + 1))) 1478 return scanKey(); 1479 1480 if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1))) 1481 return scanValue(); 1482 1483 if (*Current == '*') 1484 return scanAliasOrAnchor(true); 1485 1486 if (*Current == '&') 1487 return scanAliasOrAnchor(false); 1488 1489 if (*Current == '!') 1490 return scanTag(); 1491 1492 if (*Current == '|' && !FlowLevel) 1493 return scanBlockScalar(true); 1494 1495 if (*Current == '>' && !FlowLevel) 1496 return scanBlockScalar(false); 1497 1498 if (*Current == '\'') 1499 return scanFlowScalar(false); 1500 1501 if (*Current == '"') 1502 return scanFlowScalar(true); 1503 1504 // Get a plain scalar. 1505 StringRef FirstChar(Current, 1); 1506 if (!(isBlankOrBreak(Current) 1507 || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos) 1508 || (*Current == '-' && !isBlankOrBreak(Current + 1)) 1509 || (!FlowLevel && (*Current == '?' || *Current == ':') 1510 && isBlankOrBreak(Current + 1)) 1511 || (!FlowLevel && *Current == ':' 1512 && Current + 2 < End 1513 && *(Current + 1) == ':' 1514 && !isBlankOrBreak(Current + 2))) 1515 return scanPlainScalar(); 1516 1517 setError("Unrecognized character while tokenizing."); 1518 return false; 1519 } 1520 1521 Stream::Stream(StringRef Input, SourceMgr &SM) 1522 : scanner(new Scanner(Input, SM)), CurrentDoc() {} 1523 1524 Stream::Stream(MemoryBufferRef InputBuffer, SourceMgr &SM) 1525 : scanner(new Scanner(InputBuffer, SM)), CurrentDoc() {} 1526 1527 Stream::~Stream() {} 1528 1529 bool Stream::failed() { return scanner->failed(); } 1530 1531 void Stream::printError(Node *N, const Twine &Msg) { 1532 scanner->printError( N->getSourceRange().Start 1533 , SourceMgr::DK_Error 1534 , Msg 1535 , N->getSourceRange()); 1536 } 1537 1538 document_iterator Stream::begin() { 1539 if (CurrentDoc) 1540 report_fatal_error("Can only iterate over the stream once"); 1541 1542 // Skip Stream-Start. 1543 scanner->getNext(); 1544 1545 CurrentDoc.reset(new Document(*this)); 1546 return document_iterator(CurrentDoc); 1547 } 1548 1549 document_iterator Stream::end() { 1550 return document_iterator(); 1551 } 1552 1553 void Stream::skip() { 1554 for (document_iterator i = begin(), e = end(); i != e; ++i) 1555 i->skip(); 1556 } 1557 1558 Node::Node(unsigned int Type, std::unique_ptr<Document> &D, StringRef A, 1559 StringRef T) 1560 : Doc(D), TypeID(Type), Anchor(A), Tag(T) { 1561 SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin()); 1562 SourceRange = SMRange(Start, Start); 1563 } 1564 1565 std::string Node::getVerbatimTag() const { 1566 StringRef Raw = getRawTag(); 1567 if (!Raw.empty() && Raw != "!") { 1568 std::string Ret; 1569 if (Raw.find_last_of('!') == 0) { 1570 Ret = Doc->getTagMap().find("!")->second; 1571 Ret += Raw.substr(1); 1572 return Ret; 1573 } else if (Raw.startswith("!!")) { 1574 Ret = Doc->getTagMap().find("!!")->second; 1575 Ret += Raw.substr(2); 1576 return Ret; 1577 } else { 1578 StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1); 1579 std::map<StringRef, StringRef>::const_iterator It = 1580 Doc->getTagMap().find(TagHandle); 1581 if (It != Doc->getTagMap().end()) 1582 Ret = It->second; 1583 else { 1584 Token T; 1585 T.Kind = Token::TK_Tag; 1586 T.Range = TagHandle; 1587 setError(Twine("Unknown tag handle ") + TagHandle, T); 1588 } 1589 Ret += Raw.substr(Raw.find_last_of('!') + 1); 1590 return Ret; 1591 } 1592 } 1593 1594 switch (getType()) { 1595 case NK_Null: 1596 return "tag:yaml.org,2002:null"; 1597 case NK_Scalar: 1598 // TODO: Tag resolution. 1599 return "tag:yaml.org,2002:str"; 1600 case NK_Mapping: 1601 return "tag:yaml.org,2002:map"; 1602 case NK_Sequence: 1603 return "tag:yaml.org,2002:seq"; 1604 } 1605 1606 return ""; 1607 } 1608 1609 Token &Node::peekNext() { 1610 return Doc->peekNext(); 1611 } 1612 1613 Token Node::getNext() { 1614 return Doc->getNext(); 1615 } 1616 1617 Node *Node::parseBlockNode() { 1618 return Doc->parseBlockNode(); 1619 } 1620 1621 BumpPtrAllocator &Node::getAllocator() { 1622 return Doc->NodeAllocator; 1623 } 1624 1625 void Node::setError(const Twine &Msg, Token &Tok) const { 1626 Doc->setError(Msg, Tok); 1627 } 1628 1629 bool Node::failed() const { 1630 return Doc->failed(); 1631 } 1632 1633 1634 1635 StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const { 1636 // TODO: Handle newlines properly. We need to remove leading whitespace. 1637 if (Value[0] == '"') { // Double quoted. 1638 // Pull off the leading and trailing "s. 1639 StringRef UnquotedValue = Value.substr(1, Value.size() - 2); 1640 // Search for characters that would require unescaping the value. 1641 StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n"); 1642 if (i != StringRef::npos) 1643 return unescapeDoubleQuoted(UnquotedValue, i, Storage); 1644 return UnquotedValue; 1645 } else if (Value[0] == '\'') { // Single quoted. 1646 // Pull off the leading and trailing 's. 1647 StringRef UnquotedValue = Value.substr(1, Value.size() - 2); 1648 StringRef::size_type i = UnquotedValue.find('\''); 1649 if (i != StringRef::npos) { 1650 // We're going to need Storage. 1651 Storage.clear(); 1652 Storage.reserve(UnquotedValue.size()); 1653 for (; i != StringRef::npos; i = UnquotedValue.find('\'')) { 1654 StringRef Valid(UnquotedValue.begin(), i); 1655 Storage.insert(Storage.end(), Valid.begin(), Valid.end()); 1656 Storage.push_back('\''); 1657 UnquotedValue = UnquotedValue.substr(i + 2); 1658 } 1659 Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); 1660 return StringRef(Storage.begin(), Storage.size()); 1661 } 1662 return UnquotedValue; 1663 } 1664 // Plain or block. 1665 return Value.rtrim(" "); 1666 } 1667 1668 StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue 1669 , StringRef::size_type i 1670 , SmallVectorImpl<char> &Storage) 1671 const { 1672 // Use Storage to build proper value. 1673 Storage.clear(); 1674 Storage.reserve(UnquotedValue.size()); 1675 for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) { 1676 // Insert all previous chars into Storage. 1677 StringRef Valid(UnquotedValue.begin(), i); 1678 Storage.insert(Storage.end(), Valid.begin(), Valid.end()); 1679 // Chop off inserted chars. 1680 UnquotedValue = UnquotedValue.substr(i); 1681 1682 assert(!UnquotedValue.empty() && "Can't be empty!"); 1683 1684 // Parse escape or line break. 1685 switch (UnquotedValue[0]) { 1686 case '\r': 1687 case '\n': 1688 Storage.push_back('\n'); 1689 if ( UnquotedValue.size() > 1 1690 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 1691 UnquotedValue = UnquotedValue.substr(1); 1692 UnquotedValue = UnquotedValue.substr(1); 1693 break; 1694 default: 1695 if (UnquotedValue.size() == 1) 1696 // TODO: Report error. 1697 break; 1698 UnquotedValue = UnquotedValue.substr(1); 1699 switch (UnquotedValue[0]) { 1700 default: { 1701 Token T; 1702 T.Range = StringRef(UnquotedValue.begin(), 1); 1703 setError("Unrecognized escape code!", T); 1704 return ""; 1705 } 1706 case '\r': 1707 case '\n': 1708 // Remove the new line. 1709 if ( UnquotedValue.size() > 1 1710 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 1711 UnquotedValue = UnquotedValue.substr(1); 1712 // If this was just a single byte newline, it will get skipped 1713 // below. 1714 break; 1715 case '0': 1716 Storage.push_back(0x00); 1717 break; 1718 case 'a': 1719 Storage.push_back(0x07); 1720 break; 1721 case 'b': 1722 Storage.push_back(0x08); 1723 break; 1724 case 't': 1725 case 0x09: 1726 Storage.push_back(0x09); 1727 break; 1728 case 'n': 1729 Storage.push_back(0x0A); 1730 break; 1731 case 'v': 1732 Storage.push_back(0x0B); 1733 break; 1734 case 'f': 1735 Storage.push_back(0x0C); 1736 break; 1737 case 'r': 1738 Storage.push_back(0x0D); 1739 break; 1740 case 'e': 1741 Storage.push_back(0x1B); 1742 break; 1743 case ' ': 1744 Storage.push_back(0x20); 1745 break; 1746 case '"': 1747 Storage.push_back(0x22); 1748 break; 1749 case '/': 1750 Storage.push_back(0x2F); 1751 break; 1752 case '\\': 1753 Storage.push_back(0x5C); 1754 break; 1755 case 'N': 1756 encodeUTF8(0x85, Storage); 1757 break; 1758 case '_': 1759 encodeUTF8(0xA0, Storage); 1760 break; 1761 case 'L': 1762 encodeUTF8(0x2028, Storage); 1763 break; 1764 case 'P': 1765 encodeUTF8(0x2029, Storage); 1766 break; 1767 case 'x': { 1768 if (UnquotedValue.size() < 3) 1769 // TODO: Report error. 1770 break; 1771 unsigned int UnicodeScalarValue; 1772 if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue)) 1773 // TODO: Report error. 1774 UnicodeScalarValue = 0xFFFD; 1775 encodeUTF8(UnicodeScalarValue, Storage); 1776 UnquotedValue = UnquotedValue.substr(2); 1777 break; 1778 } 1779 case 'u': { 1780 if (UnquotedValue.size() < 5) 1781 // TODO: Report error. 1782 break; 1783 unsigned int UnicodeScalarValue; 1784 if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue)) 1785 // TODO: Report error. 1786 UnicodeScalarValue = 0xFFFD; 1787 encodeUTF8(UnicodeScalarValue, Storage); 1788 UnquotedValue = UnquotedValue.substr(4); 1789 break; 1790 } 1791 case 'U': { 1792 if (UnquotedValue.size() < 9) 1793 // TODO: Report error. 1794 break; 1795 unsigned int UnicodeScalarValue; 1796 if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue)) 1797 // TODO: Report error. 1798 UnicodeScalarValue = 0xFFFD; 1799 encodeUTF8(UnicodeScalarValue, Storage); 1800 UnquotedValue = UnquotedValue.substr(8); 1801 break; 1802 } 1803 } 1804 UnquotedValue = UnquotedValue.substr(1); 1805 } 1806 } 1807 Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); 1808 return StringRef(Storage.begin(), Storage.size()); 1809 } 1810 1811 Node *KeyValueNode::getKey() { 1812 if (Key) 1813 return Key; 1814 // Handle implicit null keys. 1815 { 1816 Token &t = peekNext(); 1817 if ( t.Kind == Token::TK_BlockEnd 1818 || t.Kind == Token::TK_Value 1819 || t.Kind == Token::TK_Error) { 1820 return Key = new (getAllocator()) NullNode(Doc); 1821 } 1822 if (t.Kind == Token::TK_Key) 1823 getNext(); // skip TK_Key. 1824 } 1825 1826 // Handle explicit null keys. 1827 Token &t = peekNext(); 1828 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) { 1829 return Key = new (getAllocator()) NullNode(Doc); 1830 } 1831 1832 // We've got a normal key. 1833 return Key = parseBlockNode(); 1834 } 1835 1836 Node *KeyValueNode::getValue() { 1837 if (Value) 1838 return Value; 1839 getKey()->skip(); 1840 if (failed()) 1841 return Value = new (getAllocator()) NullNode(Doc); 1842 1843 // Handle implicit null values. 1844 { 1845 Token &t = peekNext(); 1846 if ( t.Kind == Token::TK_BlockEnd 1847 || t.Kind == Token::TK_FlowMappingEnd 1848 || t.Kind == Token::TK_Key 1849 || t.Kind == Token::TK_FlowEntry 1850 || t.Kind == Token::TK_Error) { 1851 return Value = new (getAllocator()) NullNode(Doc); 1852 } 1853 1854 if (t.Kind != Token::TK_Value) { 1855 setError("Unexpected token in Key Value.", t); 1856 return Value = new (getAllocator()) NullNode(Doc); 1857 } 1858 getNext(); // skip TK_Value. 1859 } 1860 1861 // Handle explicit null values. 1862 Token &t = peekNext(); 1863 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) { 1864 return Value = new (getAllocator()) NullNode(Doc); 1865 } 1866 1867 // We got a normal value. 1868 return Value = parseBlockNode(); 1869 } 1870 1871 void MappingNode::increment() { 1872 if (failed()) { 1873 IsAtEnd = true; 1874 CurrentEntry = nullptr; 1875 return; 1876 } 1877 if (CurrentEntry) { 1878 CurrentEntry->skip(); 1879 if (Type == MT_Inline) { 1880 IsAtEnd = true; 1881 CurrentEntry = nullptr; 1882 return; 1883 } 1884 } 1885 Token T = peekNext(); 1886 if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) { 1887 // KeyValueNode eats the TK_Key. That way it can detect null keys. 1888 CurrentEntry = new (getAllocator()) KeyValueNode(Doc); 1889 } else if (Type == MT_Block) { 1890 switch (T.Kind) { 1891 case Token::TK_BlockEnd: 1892 getNext(); 1893 IsAtEnd = true; 1894 CurrentEntry = nullptr; 1895 break; 1896 default: 1897 setError("Unexpected token. Expected Key or Block End", T); 1898 case Token::TK_Error: 1899 IsAtEnd = true; 1900 CurrentEntry = nullptr; 1901 } 1902 } else { 1903 switch (T.Kind) { 1904 case Token::TK_FlowEntry: 1905 // Eat the flow entry and recurse. 1906 getNext(); 1907 return increment(); 1908 case Token::TK_FlowMappingEnd: 1909 getNext(); 1910 case Token::TK_Error: 1911 // Set this to end iterator. 1912 IsAtEnd = true; 1913 CurrentEntry = nullptr; 1914 break; 1915 default: 1916 setError( "Unexpected token. Expected Key, Flow Entry, or Flow " 1917 "Mapping End." 1918 , T); 1919 IsAtEnd = true; 1920 CurrentEntry = nullptr; 1921 } 1922 } 1923 } 1924 1925 void SequenceNode::increment() { 1926 if (failed()) { 1927 IsAtEnd = true; 1928 CurrentEntry = nullptr; 1929 return; 1930 } 1931 if (CurrentEntry) 1932 CurrentEntry->skip(); 1933 Token T = peekNext(); 1934 if (SeqType == ST_Block) { 1935 switch (T.Kind) { 1936 case Token::TK_BlockEntry: 1937 getNext(); 1938 CurrentEntry = parseBlockNode(); 1939 if (!CurrentEntry) { // An error occurred. 1940 IsAtEnd = true; 1941 CurrentEntry = nullptr; 1942 } 1943 break; 1944 case Token::TK_BlockEnd: 1945 getNext(); 1946 IsAtEnd = true; 1947 CurrentEntry = nullptr; 1948 break; 1949 default: 1950 setError( "Unexpected token. Expected Block Entry or Block End." 1951 , T); 1952 case Token::TK_Error: 1953 IsAtEnd = true; 1954 CurrentEntry = nullptr; 1955 } 1956 } else if (SeqType == ST_Indentless) { 1957 switch (T.Kind) { 1958 case Token::TK_BlockEntry: 1959 getNext(); 1960 CurrentEntry = parseBlockNode(); 1961 if (!CurrentEntry) { // An error occurred. 1962 IsAtEnd = true; 1963 CurrentEntry = nullptr; 1964 } 1965 break; 1966 default: 1967 case Token::TK_Error: 1968 IsAtEnd = true; 1969 CurrentEntry = nullptr; 1970 } 1971 } else if (SeqType == ST_Flow) { 1972 switch (T.Kind) { 1973 case Token::TK_FlowEntry: 1974 // Eat the flow entry and recurse. 1975 getNext(); 1976 WasPreviousTokenFlowEntry = true; 1977 return increment(); 1978 case Token::TK_FlowSequenceEnd: 1979 getNext(); 1980 case Token::TK_Error: 1981 // Set this to end iterator. 1982 IsAtEnd = true; 1983 CurrentEntry = nullptr; 1984 break; 1985 case Token::TK_StreamEnd: 1986 case Token::TK_DocumentEnd: 1987 case Token::TK_DocumentStart: 1988 setError("Could not find closing ]!", T); 1989 // Set this to end iterator. 1990 IsAtEnd = true; 1991 CurrentEntry = nullptr; 1992 break; 1993 default: 1994 if (!WasPreviousTokenFlowEntry) { 1995 setError("Expected , between entries!", T); 1996 IsAtEnd = true; 1997 CurrentEntry = nullptr; 1998 break; 1999 } 2000 // Otherwise it must be a flow entry. 2001 CurrentEntry = parseBlockNode(); 2002 if (!CurrentEntry) { 2003 IsAtEnd = true; 2004 } 2005 WasPreviousTokenFlowEntry = false; 2006 break; 2007 } 2008 } 2009 } 2010 2011 Document::Document(Stream &S) : stream(S), Root(nullptr) { 2012 // Tag maps starts with two default mappings. 2013 TagMap["!"] = "!"; 2014 TagMap["!!"] = "tag:yaml.org,2002:"; 2015 2016 if (parseDirectives()) 2017 expectToken(Token::TK_DocumentStart); 2018 Token &T = peekNext(); 2019 if (T.Kind == Token::TK_DocumentStart) 2020 getNext(); 2021 } 2022 2023 bool Document::skip() { 2024 if (stream.scanner->failed()) 2025 return false; 2026 if (!Root) 2027 getRoot(); 2028 Root->skip(); 2029 Token &T = peekNext(); 2030 if (T.Kind == Token::TK_StreamEnd) 2031 return false; 2032 if (T.Kind == Token::TK_DocumentEnd) { 2033 getNext(); 2034 return skip(); 2035 } 2036 return true; 2037 } 2038 2039 Token &Document::peekNext() { 2040 return stream.scanner->peekNext(); 2041 } 2042 2043 Token Document::getNext() { 2044 return stream.scanner->getNext(); 2045 } 2046 2047 void Document::setError(const Twine &Message, Token &Location) const { 2048 stream.scanner->setError(Message, Location.Range.begin()); 2049 } 2050 2051 bool Document::failed() const { 2052 return stream.scanner->failed(); 2053 } 2054 2055 Node *Document::parseBlockNode() { 2056 Token T = peekNext(); 2057 // Handle properties. 2058 Token AnchorInfo; 2059 Token TagInfo; 2060 parse_property: 2061 switch (T.Kind) { 2062 case Token::TK_Alias: 2063 getNext(); 2064 return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1)); 2065 case Token::TK_Anchor: 2066 if (AnchorInfo.Kind == Token::TK_Anchor) { 2067 setError("Already encountered an anchor for this node!", T); 2068 return nullptr; 2069 } 2070 AnchorInfo = getNext(); // Consume TK_Anchor. 2071 T = peekNext(); 2072 goto parse_property; 2073 case Token::TK_Tag: 2074 if (TagInfo.Kind == Token::TK_Tag) { 2075 setError("Already encountered a tag for this node!", T); 2076 return nullptr; 2077 } 2078 TagInfo = getNext(); // Consume TK_Tag. 2079 T = peekNext(); 2080 goto parse_property; 2081 default: 2082 break; 2083 } 2084 2085 switch (T.Kind) { 2086 case Token::TK_BlockEntry: 2087 // We got an unindented BlockEntry sequence. This is not terminated with 2088 // a BlockEnd. 2089 // Don't eat the TK_BlockEntry, SequenceNode needs it. 2090 return new (NodeAllocator) SequenceNode( stream.CurrentDoc 2091 , AnchorInfo.Range.substr(1) 2092 , TagInfo.Range 2093 , SequenceNode::ST_Indentless); 2094 case Token::TK_BlockSequenceStart: 2095 getNext(); 2096 return new (NodeAllocator) 2097 SequenceNode( stream.CurrentDoc 2098 , AnchorInfo.Range.substr(1) 2099 , TagInfo.Range 2100 , SequenceNode::ST_Block); 2101 case Token::TK_BlockMappingStart: 2102 getNext(); 2103 return new (NodeAllocator) 2104 MappingNode( stream.CurrentDoc 2105 , AnchorInfo.Range.substr(1) 2106 , TagInfo.Range 2107 , MappingNode::MT_Block); 2108 case Token::TK_FlowSequenceStart: 2109 getNext(); 2110 return new (NodeAllocator) 2111 SequenceNode( stream.CurrentDoc 2112 , AnchorInfo.Range.substr(1) 2113 , TagInfo.Range 2114 , SequenceNode::ST_Flow); 2115 case Token::TK_FlowMappingStart: 2116 getNext(); 2117 return new (NodeAllocator) 2118 MappingNode( stream.CurrentDoc 2119 , AnchorInfo.Range.substr(1) 2120 , TagInfo.Range 2121 , MappingNode::MT_Flow); 2122 case Token::TK_Scalar: 2123 getNext(); 2124 return new (NodeAllocator) 2125 ScalarNode( stream.CurrentDoc 2126 , AnchorInfo.Range.substr(1) 2127 , TagInfo.Range 2128 , T.Range); 2129 case Token::TK_Key: 2130 // Don't eat the TK_Key, KeyValueNode expects it. 2131 return new (NodeAllocator) 2132 MappingNode( stream.CurrentDoc 2133 , AnchorInfo.Range.substr(1) 2134 , TagInfo.Range 2135 , MappingNode::MT_Inline); 2136 case Token::TK_DocumentStart: 2137 case Token::TK_DocumentEnd: 2138 case Token::TK_StreamEnd: 2139 default: 2140 // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not 2141 // !!null null. 2142 return new (NodeAllocator) NullNode(stream.CurrentDoc); 2143 case Token::TK_Error: 2144 return nullptr; 2145 } 2146 llvm_unreachable("Control flow shouldn't reach here."); 2147 return nullptr; 2148 } 2149 2150 bool Document::parseDirectives() { 2151 bool isDirective = false; 2152 while (true) { 2153 Token T = peekNext(); 2154 if (T.Kind == Token::TK_TagDirective) { 2155 parseTAGDirective(); 2156 isDirective = true; 2157 } else if (T.Kind == Token::TK_VersionDirective) { 2158 parseYAMLDirective(); 2159 isDirective = true; 2160 } else 2161 break; 2162 } 2163 return isDirective; 2164 } 2165 2166 void Document::parseYAMLDirective() { 2167 getNext(); // Eat %YAML <version> 2168 } 2169 2170 void Document::parseTAGDirective() { 2171 Token Tag = getNext(); // %TAG <handle> <prefix> 2172 StringRef T = Tag.Range; 2173 // Strip %TAG 2174 T = T.substr(T.find_first_of(" \t")).ltrim(" \t"); 2175 std::size_t HandleEnd = T.find_first_of(" \t"); 2176 StringRef TagHandle = T.substr(0, HandleEnd); 2177 StringRef TagPrefix = T.substr(HandleEnd).ltrim(" \t"); 2178 TagMap[TagHandle] = TagPrefix; 2179 } 2180 2181 bool Document::expectToken(int TK) { 2182 Token T = getNext(); 2183 if (T.Kind != TK) { 2184 setError("Unexpected token", T); 2185 return false; 2186 } 2187 return true; 2188 } 2189