1 //===--- YAMLParser.cpp - Simple YAML parser ------------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file implements a YAML parser. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #include "llvm/Support/YAMLParser.h" 15 #include "llvm/ADT/SmallString.h" 16 #include "llvm/ADT/SmallVector.h" 17 #include "llvm/ADT/StringExtras.h" 18 #include "llvm/ADT/Twine.h" 19 #include "llvm/ADT/ilist.h" 20 #include "llvm/ADT/ilist_node.h" 21 #include "llvm/Support/ErrorHandling.h" 22 #include "llvm/Support/MemoryBuffer.h" 23 #include "llvm/Support/SourceMgr.h" 24 #include "llvm/Support/raw_ostream.h" 25 26 using namespace llvm; 27 using namespace yaml; 28 29 enum UnicodeEncodingForm { 30 UEF_UTF32_LE, ///< UTF-32 Little Endian 31 UEF_UTF32_BE, ///< UTF-32 Big Endian 32 UEF_UTF16_LE, ///< UTF-16 Little Endian 33 UEF_UTF16_BE, ///< UTF-16 Big Endian 34 UEF_UTF8, ///< UTF-8 or ascii. 35 UEF_Unknown ///< Not a valid Unicode encoding. 36 }; 37 38 /// EncodingInfo - Holds the encoding type and length of the byte order mark if 39 /// it exists. Length is in {0, 2, 3, 4}. 40 typedef std::pair<UnicodeEncodingForm, unsigned> EncodingInfo; 41 42 /// getUnicodeEncoding - Reads up to the first 4 bytes to determine the Unicode 43 /// encoding form of \a Input. 44 /// 45 /// @param Input A string of length 0 or more. 46 /// @returns An EncodingInfo indicating the Unicode encoding form of the input 47 /// and how long the byte order mark is if one exists. 48 static EncodingInfo getUnicodeEncoding(StringRef Input) { 49 if (Input.size() == 0) 50 return std::make_pair(UEF_Unknown, 0); 51 52 switch (uint8_t(Input[0])) { 53 case 0x00: 54 if (Input.size() >= 4) { 55 if ( Input[1] == 0 56 && uint8_t(Input[2]) == 0xFE 57 && uint8_t(Input[3]) == 0xFF) 58 return std::make_pair(UEF_UTF32_BE, 4); 59 if (Input[1] == 0 && Input[2] == 0 && Input[3] != 0) 60 return std::make_pair(UEF_UTF32_BE, 0); 61 } 62 63 if (Input.size() >= 2 && Input[1] != 0) 64 return std::make_pair(UEF_UTF16_BE, 0); 65 return std::make_pair(UEF_Unknown, 0); 66 case 0xFF: 67 if ( Input.size() >= 4 68 && uint8_t(Input[1]) == 0xFE 69 && Input[2] == 0 70 && Input[3] == 0) 71 return std::make_pair(UEF_UTF32_LE, 4); 72 73 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFE) 74 return std::make_pair(UEF_UTF16_LE, 2); 75 return std::make_pair(UEF_Unknown, 0); 76 case 0xFE: 77 if (Input.size() >= 2 && uint8_t(Input[1]) == 0xFF) 78 return std::make_pair(UEF_UTF16_BE, 2); 79 return std::make_pair(UEF_Unknown, 0); 80 case 0xEF: 81 if ( Input.size() >= 3 82 && uint8_t(Input[1]) == 0xBB 83 && uint8_t(Input[2]) == 0xBF) 84 return std::make_pair(UEF_UTF8, 3); 85 return std::make_pair(UEF_Unknown, 0); 86 } 87 88 // It could still be utf-32 or utf-16. 89 if (Input.size() >= 4 && Input[1] == 0 && Input[2] == 0 && Input[3] == 0) 90 return std::make_pair(UEF_UTF32_LE, 0); 91 92 if (Input.size() >= 2 && Input[1] == 0) 93 return std::make_pair(UEF_UTF16_LE, 0); 94 95 return std::make_pair(UEF_UTF8, 0); 96 } 97 98 namespace llvm { 99 namespace yaml { 100 /// Pin the vtables to this file. 101 void Node::anchor() {} 102 void NullNode::anchor() {} 103 void ScalarNode::anchor() {} 104 void BlockScalarNode::anchor() {} 105 void KeyValueNode::anchor() {} 106 void MappingNode::anchor() {} 107 void SequenceNode::anchor() {} 108 void AliasNode::anchor() {} 109 110 /// Token - A single YAML token. 111 struct Token : ilist_node<Token> { 112 enum TokenKind { 113 TK_Error, // Uninitialized token. 114 TK_StreamStart, 115 TK_StreamEnd, 116 TK_VersionDirective, 117 TK_TagDirective, 118 TK_DocumentStart, 119 TK_DocumentEnd, 120 TK_BlockEntry, 121 TK_BlockEnd, 122 TK_BlockSequenceStart, 123 TK_BlockMappingStart, 124 TK_FlowEntry, 125 TK_FlowSequenceStart, 126 TK_FlowSequenceEnd, 127 TK_FlowMappingStart, 128 TK_FlowMappingEnd, 129 TK_Key, 130 TK_Value, 131 TK_Scalar, 132 TK_BlockScalar, 133 TK_Alias, 134 TK_Anchor, 135 TK_Tag 136 } Kind; 137 138 /// A string of length 0 or more whose begin() points to the logical location 139 /// of the token in the input. 140 StringRef Range; 141 142 /// The value of a block scalar node. 143 std::string Value; 144 145 Token() : Kind(TK_Error) {} 146 }; 147 } 148 } 149 150 namespace llvm { 151 template<> 152 struct ilist_sentinel_traits<Token> { 153 Token *createSentinel() const { 154 return &Sentinel; 155 } 156 static void destroySentinel(Token*) {} 157 158 Token *provideInitialHead() const { return createSentinel(); } 159 Token *ensureHead(Token*) const { return createSentinel(); } 160 static void noteHead(Token*, Token*) {} 161 162 private: 163 mutable Token Sentinel; 164 }; 165 166 template<> 167 struct ilist_node_traits<Token> { 168 Token *createNode(const Token &V) { 169 return new (Alloc.Allocate<Token>()) Token(V); 170 } 171 static void deleteNode(Token *V) { V->~Token(); } 172 173 void addNodeToList(Token *) {} 174 void removeNodeFromList(Token *) {} 175 void transferNodesFromList(ilist_node_traits & /*SrcTraits*/, 176 ilist_iterator<Token> /*first*/, 177 ilist_iterator<Token> /*last*/) {} 178 179 BumpPtrAllocator Alloc; 180 }; 181 } 182 183 typedef ilist<Token> TokenQueueT; 184 185 namespace { 186 /// @brief This struct is used to track simple keys. 187 /// 188 /// Simple keys are handled by creating an entry in SimpleKeys for each Token 189 /// which could legally be the start of a simple key. When peekNext is called, 190 /// if the Token To be returned is referenced by a SimpleKey, we continue 191 /// tokenizing until that potential simple key has either been found to not be 192 /// a simple key (we moved on to the next line or went further than 1024 chars). 193 /// Or when we run into a Value, and then insert a Key token (and possibly 194 /// others) before the SimpleKey's Tok. 195 struct SimpleKey { 196 TokenQueueT::iterator Tok; 197 unsigned Column; 198 unsigned Line; 199 unsigned FlowLevel; 200 bool IsRequired; 201 202 bool operator ==(const SimpleKey &Other) { 203 return Tok == Other.Tok; 204 } 205 }; 206 } 207 208 /// @brief The Unicode scalar value of a UTF-8 minimal well-formed code unit 209 /// subsequence and the subsequence's length in code units (uint8_t). 210 /// A length of 0 represents an error. 211 typedef std::pair<uint32_t, unsigned> UTF8Decoded; 212 213 static UTF8Decoded decodeUTF8(StringRef Range) { 214 StringRef::iterator Position= Range.begin(); 215 StringRef::iterator End = Range.end(); 216 // 1 byte: [0x00, 0x7f] 217 // Bit pattern: 0xxxxxxx 218 if ((*Position & 0x80) == 0) { 219 return std::make_pair(*Position, 1); 220 } 221 // 2 bytes: [0x80, 0x7ff] 222 // Bit pattern: 110xxxxx 10xxxxxx 223 if (Position + 1 != End && 224 ((*Position & 0xE0) == 0xC0) && 225 ((*(Position + 1) & 0xC0) == 0x80)) { 226 uint32_t codepoint = ((*Position & 0x1F) << 6) | 227 (*(Position + 1) & 0x3F); 228 if (codepoint >= 0x80) 229 return std::make_pair(codepoint, 2); 230 } 231 // 3 bytes: [0x8000, 0xffff] 232 // Bit pattern: 1110xxxx 10xxxxxx 10xxxxxx 233 if (Position + 2 != End && 234 ((*Position & 0xF0) == 0xE0) && 235 ((*(Position + 1) & 0xC0) == 0x80) && 236 ((*(Position + 2) & 0xC0) == 0x80)) { 237 uint32_t codepoint = ((*Position & 0x0F) << 12) | 238 ((*(Position + 1) & 0x3F) << 6) | 239 (*(Position + 2) & 0x3F); 240 // Codepoints between 0xD800 and 0xDFFF are invalid, as 241 // they are high / low surrogate halves used by UTF-16. 242 if (codepoint >= 0x800 && 243 (codepoint < 0xD800 || codepoint > 0xDFFF)) 244 return std::make_pair(codepoint, 3); 245 } 246 // 4 bytes: [0x10000, 0x10FFFF] 247 // Bit pattern: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx 248 if (Position + 3 != End && 249 ((*Position & 0xF8) == 0xF0) && 250 ((*(Position + 1) & 0xC0) == 0x80) && 251 ((*(Position + 2) & 0xC0) == 0x80) && 252 ((*(Position + 3) & 0xC0) == 0x80)) { 253 uint32_t codepoint = ((*Position & 0x07) << 18) | 254 ((*(Position + 1) & 0x3F) << 12) | 255 ((*(Position + 2) & 0x3F) << 6) | 256 (*(Position + 3) & 0x3F); 257 if (codepoint >= 0x10000 && codepoint <= 0x10FFFF) 258 return std::make_pair(codepoint, 4); 259 } 260 return std::make_pair(0, 0); 261 } 262 263 namespace llvm { 264 namespace yaml { 265 /// @brief Scans YAML tokens from a MemoryBuffer. 266 class Scanner { 267 public: 268 Scanner(StringRef Input, SourceMgr &SM, bool ShowColors = true); 269 Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors = true); 270 271 /// @brief Parse the next token and return it without popping it. 272 Token &peekNext(); 273 274 /// @brief Parse the next token and pop it from the queue. 275 Token getNext(); 276 277 void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message, 278 ArrayRef<SMRange> Ranges = None) { 279 SM.PrintMessage(Loc, Kind, Message, Ranges, /* FixIts= */ None, ShowColors); 280 } 281 282 void setError(const Twine &Message, StringRef::iterator Position) { 283 if (Current >= End) 284 Current = End - 1; 285 286 // Don't print out more errors after the first one we encounter. The rest 287 // are just the result of the first, and have no meaning. 288 if (!Failed) 289 printError(SMLoc::getFromPointer(Current), SourceMgr::DK_Error, Message); 290 Failed = true; 291 } 292 293 void setError(const Twine &Message) { 294 setError(Message, Current); 295 } 296 297 /// @brief Returns true if an error occurred while parsing. 298 bool failed() { 299 return Failed; 300 } 301 302 private: 303 void init(MemoryBufferRef Buffer); 304 305 StringRef currentInput() { 306 return StringRef(Current, End - Current); 307 } 308 309 /// @brief Decode a UTF-8 minimal well-formed code unit subsequence starting 310 /// at \a Position. 311 /// 312 /// If the UTF-8 code units starting at Position do not form a well-formed 313 /// code unit subsequence, then the Unicode scalar value is 0, and the length 314 /// is 0. 315 UTF8Decoded decodeUTF8(StringRef::iterator Position) { 316 return ::decodeUTF8(StringRef(Position, End - Position)); 317 } 318 319 // The following functions are based on the gramar rules in the YAML spec. The 320 // style of the function names it meant to closely match how they are written 321 // in the spec. The number within the [] is the number of the grammar rule in 322 // the spec. 323 // 324 // See 4.2 [Production Naming Conventions] for the meaning of the prefixes. 325 // 326 // c- 327 // A production starting and ending with a special character. 328 // b- 329 // A production matching a single line break. 330 // nb- 331 // A production starting and ending with a non-break character. 332 // s- 333 // A production starting and ending with a white space character. 334 // ns- 335 // A production starting and ending with a non-space character. 336 // l- 337 // A production matching complete line(s). 338 339 /// @brief Skip a single nb-char[27] starting at Position. 340 /// 341 /// A nb-char is 0x9 | [0x20-0x7E] | 0x85 | [0xA0-0xD7FF] | [0xE000-0xFEFE] 342 /// | [0xFF00-0xFFFD] | [0x10000-0x10FFFF] 343 /// 344 /// @returns The code unit after the nb-char, or Position if it's not an 345 /// nb-char. 346 StringRef::iterator skip_nb_char(StringRef::iterator Position); 347 348 /// @brief Skip a single b-break[28] starting at Position. 349 /// 350 /// A b-break is 0xD 0xA | 0xD | 0xA 351 /// 352 /// @returns The code unit after the b-break, or Position if it's not a 353 /// b-break. 354 StringRef::iterator skip_b_break(StringRef::iterator Position); 355 356 /// Skip a single s-space[31] starting at Position. 357 /// 358 /// An s-space is 0x20 359 /// 360 /// @returns The code unit after the s-space, or Position if it's not a 361 /// s-space. 362 StringRef::iterator skip_s_space(StringRef::iterator Position); 363 364 /// @brief Skip a single s-white[33] starting at Position. 365 /// 366 /// A s-white is 0x20 | 0x9 367 /// 368 /// @returns The code unit after the s-white, or Position if it's not a 369 /// s-white. 370 StringRef::iterator skip_s_white(StringRef::iterator Position); 371 372 /// @brief Skip a single ns-char[34] starting at Position. 373 /// 374 /// A ns-char is nb-char - s-white 375 /// 376 /// @returns The code unit after the ns-char, or Position if it's not a 377 /// ns-char. 378 StringRef::iterator skip_ns_char(StringRef::iterator Position); 379 380 typedef StringRef::iterator (Scanner::*SkipWhileFunc)(StringRef::iterator); 381 /// @brief Skip minimal well-formed code unit subsequences until Func 382 /// returns its input. 383 /// 384 /// @returns The code unit after the last minimal well-formed code unit 385 /// subsequence that Func accepted. 386 StringRef::iterator skip_while( SkipWhileFunc Func 387 , StringRef::iterator Position); 388 389 /// Skip minimal well-formed code unit subsequences until Func returns its 390 /// input. 391 void advanceWhile(SkipWhileFunc Func); 392 393 /// @brief Scan ns-uri-char[39]s starting at Cur. 394 /// 395 /// This updates Cur and Column while scanning. 396 /// 397 /// @returns A StringRef starting at Cur which covers the longest contiguous 398 /// sequence of ns-uri-char. 399 StringRef scan_ns_uri_char(); 400 401 /// @brief Consume a minimal well-formed code unit subsequence starting at 402 /// \a Cur. Return false if it is not the same Unicode scalar value as 403 /// \a Expected. This updates \a Column. 404 bool consume(uint32_t Expected); 405 406 /// @brief Skip \a Distance UTF-8 code units. Updates \a Cur and \a Column. 407 void skip(uint32_t Distance); 408 409 /// @brief Return true if the minimal well-formed code unit subsequence at 410 /// Pos is whitespace or a new line 411 bool isBlankOrBreak(StringRef::iterator Position); 412 413 /// Consume a single b-break[28] if it's present at the current position. 414 /// 415 /// Return false if the code unit at the current position isn't a line break. 416 bool consumeLineBreakIfPresent(); 417 418 /// @brief If IsSimpleKeyAllowed, create and push_back a new SimpleKey. 419 void saveSimpleKeyCandidate( TokenQueueT::iterator Tok 420 , unsigned AtColumn 421 , bool IsRequired); 422 423 /// @brief Remove simple keys that can no longer be valid simple keys. 424 /// 425 /// Invalid simple keys are not on the current line or are further than 1024 426 /// columns back. 427 void removeStaleSimpleKeyCandidates(); 428 429 /// @brief Remove all simple keys on FlowLevel \a Level. 430 void removeSimpleKeyCandidatesOnFlowLevel(unsigned Level); 431 432 /// @brief Unroll indentation in \a Indents back to \a Col. Creates BlockEnd 433 /// tokens if needed. 434 bool unrollIndent(int ToColumn); 435 436 /// @brief Increase indent to \a Col. Creates \a Kind token at \a InsertPoint 437 /// if needed. 438 bool rollIndent( int ToColumn 439 , Token::TokenKind Kind 440 , TokenQueueT::iterator InsertPoint); 441 442 /// @brief Skip a single-line comment when the comment starts at the current 443 /// position of the scanner. 444 void skipComment(); 445 446 /// @brief Skip whitespace and comments until the start of the next token. 447 void scanToNextToken(); 448 449 /// @brief Must be the first token generated. 450 bool scanStreamStart(); 451 452 /// @brief Generate tokens needed to close out the stream. 453 bool scanStreamEnd(); 454 455 /// @brief Scan a %BLAH directive. 456 bool scanDirective(); 457 458 /// @brief Scan a ... or ---. 459 bool scanDocumentIndicator(bool IsStart); 460 461 /// @brief Scan a [ or { and generate the proper flow collection start token. 462 bool scanFlowCollectionStart(bool IsSequence); 463 464 /// @brief Scan a ] or } and generate the proper flow collection end token. 465 bool scanFlowCollectionEnd(bool IsSequence); 466 467 /// @brief Scan the , that separates entries in a flow collection. 468 bool scanFlowEntry(); 469 470 /// @brief Scan the - that starts block sequence entries. 471 bool scanBlockEntry(); 472 473 /// @brief Scan an explicit ? indicating a key. 474 bool scanKey(); 475 476 /// @brief Scan an explicit : indicating a value. 477 bool scanValue(); 478 479 /// @brief Scan a quoted scalar. 480 bool scanFlowScalar(bool IsDoubleQuoted); 481 482 /// @brief Scan an unquoted scalar. 483 bool scanPlainScalar(); 484 485 /// @brief Scan an Alias or Anchor starting with * or &. 486 bool scanAliasOrAnchor(bool IsAlias); 487 488 /// @brief Scan a block scalar starting with | or >. 489 bool scanBlockScalar(bool IsLiteral); 490 491 /// Scan a chomping indicator in a block scalar header. 492 char scanBlockChompingIndicator(); 493 494 /// Scan an indentation indicator in a block scalar header. 495 unsigned scanBlockIndentationIndicator(); 496 497 /// Scan a block scalar header. 498 /// 499 /// Return false if an error occurred. 500 bool scanBlockScalarHeader(char &ChompingIndicator, unsigned &IndentIndicator, 501 bool &IsDone); 502 503 /// Look for the indentation level of a block scalar. 504 /// 505 /// Return false if an error occurred. 506 bool findBlockScalarIndent(unsigned &BlockIndent, unsigned BlockExitIndent, 507 unsigned &LineBreaks, bool &IsDone); 508 509 /// Scan the indentation of a text line in a block scalar. 510 /// 511 /// Return false if an error occurred. 512 bool scanBlockScalarIndent(unsigned BlockIndent, unsigned BlockExitIndent, 513 bool &IsDone); 514 515 /// @brief Scan a tag of the form !stuff. 516 bool scanTag(); 517 518 /// @brief Dispatch to the next scanning function based on \a *Cur. 519 bool fetchMoreTokens(); 520 521 /// @brief The SourceMgr used for diagnostics and buffer management. 522 SourceMgr &SM; 523 524 /// @brief The original input. 525 MemoryBufferRef InputBuffer; 526 527 /// @brief The current position of the scanner. 528 StringRef::iterator Current; 529 530 /// @brief The end of the input (one past the last character). 531 StringRef::iterator End; 532 533 /// @brief Current YAML indentation level in spaces. 534 int Indent; 535 536 /// @brief Current column number in Unicode code points. 537 unsigned Column; 538 539 /// @brief Current line number. 540 unsigned Line; 541 542 /// @brief How deep we are in flow style containers. 0 Means at block level. 543 unsigned FlowLevel; 544 545 /// @brief Are we at the start of the stream? 546 bool IsStartOfStream; 547 548 /// @brief Can the next token be the start of a simple key? 549 bool IsSimpleKeyAllowed; 550 551 /// @brief True if an error has occurred. 552 bool Failed; 553 554 /// @brief Should colors be used when printing out the diagnostic messages? 555 bool ShowColors; 556 557 /// @brief Queue of tokens. This is required to queue up tokens while looking 558 /// for the end of a simple key. And for cases where a single character 559 /// can produce multiple tokens (e.g. BlockEnd). 560 TokenQueueT TokenQueue; 561 562 /// @brief Indentation levels. 563 SmallVector<int, 4> Indents; 564 565 /// @brief Potential simple keys. 566 SmallVector<SimpleKey, 4> SimpleKeys; 567 }; 568 569 } // end namespace yaml 570 } // end namespace llvm 571 572 /// encodeUTF8 - Encode \a UnicodeScalarValue in UTF-8 and append it to result. 573 static void encodeUTF8( uint32_t UnicodeScalarValue 574 , SmallVectorImpl<char> &Result) { 575 if (UnicodeScalarValue <= 0x7F) { 576 Result.push_back(UnicodeScalarValue & 0x7F); 577 } else if (UnicodeScalarValue <= 0x7FF) { 578 uint8_t FirstByte = 0xC0 | ((UnicodeScalarValue & 0x7C0) >> 6); 579 uint8_t SecondByte = 0x80 | (UnicodeScalarValue & 0x3F); 580 Result.push_back(FirstByte); 581 Result.push_back(SecondByte); 582 } else if (UnicodeScalarValue <= 0xFFFF) { 583 uint8_t FirstByte = 0xE0 | ((UnicodeScalarValue & 0xF000) >> 12); 584 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 585 uint8_t ThirdByte = 0x80 | (UnicodeScalarValue & 0x3F); 586 Result.push_back(FirstByte); 587 Result.push_back(SecondByte); 588 Result.push_back(ThirdByte); 589 } else if (UnicodeScalarValue <= 0x10FFFF) { 590 uint8_t FirstByte = 0xF0 | ((UnicodeScalarValue & 0x1F0000) >> 18); 591 uint8_t SecondByte = 0x80 | ((UnicodeScalarValue & 0x3F000) >> 12); 592 uint8_t ThirdByte = 0x80 | ((UnicodeScalarValue & 0xFC0) >> 6); 593 uint8_t FourthByte = 0x80 | (UnicodeScalarValue & 0x3F); 594 Result.push_back(FirstByte); 595 Result.push_back(SecondByte); 596 Result.push_back(ThirdByte); 597 Result.push_back(FourthByte); 598 } 599 } 600 601 bool yaml::dumpTokens(StringRef Input, raw_ostream &OS) { 602 SourceMgr SM; 603 Scanner scanner(Input, SM); 604 while (true) { 605 Token T = scanner.getNext(); 606 switch (T.Kind) { 607 case Token::TK_StreamStart: 608 OS << "Stream-Start: "; 609 break; 610 case Token::TK_StreamEnd: 611 OS << "Stream-End: "; 612 break; 613 case Token::TK_VersionDirective: 614 OS << "Version-Directive: "; 615 break; 616 case Token::TK_TagDirective: 617 OS << "Tag-Directive: "; 618 break; 619 case Token::TK_DocumentStart: 620 OS << "Document-Start: "; 621 break; 622 case Token::TK_DocumentEnd: 623 OS << "Document-End: "; 624 break; 625 case Token::TK_BlockEntry: 626 OS << "Block-Entry: "; 627 break; 628 case Token::TK_BlockEnd: 629 OS << "Block-End: "; 630 break; 631 case Token::TK_BlockSequenceStart: 632 OS << "Block-Sequence-Start: "; 633 break; 634 case Token::TK_BlockMappingStart: 635 OS << "Block-Mapping-Start: "; 636 break; 637 case Token::TK_FlowEntry: 638 OS << "Flow-Entry: "; 639 break; 640 case Token::TK_FlowSequenceStart: 641 OS << "Flow-Sequence-Start: "; 642 break; 643 case Token::TK_FlowSequenceEnd: 644 OS << "Flow-Sequence-End: "; 645 break; 646 case Token::TK_FlowMappingStart: 647 OS << "Flow-Mapping-Start: "; 648 break; 649 case Token::TK_FlowMappingEnd: 650 OS << "Flow-Mapping-End: "; 651 break; 652 case Token::TK_Key: 653 OS << "Key: "; 654 break; 655 case Token::TK_Value: 656 OS << "Value: "; 657 break; 658 case Token::TK_Scalar: 659 OS << "Scalar: "; 660 break; 661 case Token::TK_BlockScalar: 662 OS << "Block Scalar: "; 663 break; 664 case Token::TK_Alias: 665 OS << "Alias: "; 666 break; 667 case Token::TK_Anchor: 668 OS << "Anchor: "; 669 break; 670 case Token::TK_Tag: 671 OS << "Tag: "; 672 break; 673 case Token::TK_Error: 674 break; 675 } 676 OS << T.Range << "\n"; 677 if (T.Kind == Token::TK_StreamEnd) 678 break; 679 else if (T.Kind == Token::TK_Error) 680 return false; 681 } 682 return true; 683 } 684 685 bool yaml::scanTokens(StringRef Input) { 686 llvm::SourceMgr SM; 687 llvm::yaml::Scanner scanner(Input, SM); 688 for (;;) { 689 llvm::yaml::Token T = scanner.getNext(); 690 if (T.Kind == Token::TK_StreamEnd) 691 break; 692 else if (T.Kind == Token::TK_Error) 693 return false; 694 } 695 return true; 696 } 697 698 std::string yaml::escape(StringRef Input) { 699 std::string EscapedInput; 700 for (StringRef::iterator i = Input.begin(), e = Input.end(); i != e; ++i) { 701 if (*i == '\\') 702 EscapedInput += "\\\\"; 703 else if (*i == '"') 704 EscapedInput += "\\\""; 705 else if (*i == 0) 706 EscapedInput += "\\0"; 707 else if (*i == 0x07) 708 EscapedInput += "\\a"; 709 else if (*i == 0x08) 710 EscapedInput += "\\b"; 711 else if (*i == 0x09) 712 EscapedInput += "\\t"; 713 else if (*i == 0x0A) 714 EscapedInput += "\\n"; 715 else if (*i == 0x0B) 716 EscapedInput += "\\v"; 717 else if (*i == 0x0C) 718 EscapedInput += "\\f"; 719 else if (*i == 0x0D) 720 EscapedInput += "\\r"; 721 else if (*i == 0x1B) 722 EscapedInput += "\\e"; 723 else if ((unsigned char)*i < 0x20) { // Control characters not handled above. 724 std::string HexStr = utohexstr(*i); 725 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 726 } else if (*i & 0x80) { // UTF-8 multiple code unit subsequence. 727 UTF8Decoded UnicodeScalarValue 728 = decodeUTF8(StringRef(i, Input.end() - i)); 729 if (UnicodeScalarValue.second == 0) { 730 // Found invalid char. 731 SmallString<4> Val; 732 encodeUTF8(0xFFFD, Val); 733 EscapedInput.insert(EscapedInput.end(), Val.begin(), Val.end()); 734 // FIXME: Error reporting. 735 return EscapedInput; 736 } 737 if (UnicodeScalarValue.first == 0x85) 738 EscapedInput += "\\N"; 739 else if (UnicodeScalarValue.first == 0xA0) 740 EscapedInput += "\\_"; 741 else if (UnicodeScalarValue.first == 0x2028) 742 EscapedInput += "\\L"; 743 else if (UnicodeScalarValue.first == 0x2029) 744 EscapedInput += "\\P"; 745 else { 746 std::string HexStr = utohexstr(UnicodeScalarValue.first); 747 if (HexStr.size() <= 2) 748 EscapedInput += "\\x" + std::string(2 - HexStr.size(), '0') + HexStr; 749 else if (HexStr.size() <= 4) 750 EscapedInput += "\\u" + std::string(4 - HexStr.size(), '0') + HexStr; 751 else if (HexStr.size() <= 8) 752 EscapedInput += "\\U" + std::string(8 - HexStr.size(), '0') + HexStr; 753 } 754 i += UnicodeScalarValue.second - 1; 755 } else 756 EscapedInput.push_back(*i); 757 } 758 return EscapedInput; 759 } 760 761 Scanner::Scanner(StringRef Input, SourceMgr &sm, bool ShowColors) 762 : SM(sm), ShowColors(ShowColors) { 763 init(MemoryBufferRef(Input, "YAML")); 764 } 765 766 Scanner::Scanner(MemoryBufferRef Buffer, SourceMgr &SM_, bool ShowColors) 767 : SM(SM_), ShowColors(ShowColors) { 768 init(Buffer); 769 } 770 771 void Scanner::init(MemoryBufferRef Buffer) { 772 InputBuffer = Buffer; 773 Current = InputBuffer.getBufferStart(); 774 End = InputBuffer.getBufferEnd(); 775 Indent = -1; 776 Column = 0; 777 Line = 0; 778 FlowLevel = 0; 779 IsStartOfStream = true; 780 IsSimpleKeyAllowed = true; 781 Failed = false; 782 std::unique_ptr<MemoryBuffer> InputBufferOwner = 783 MemoryBuffer::getMemBuffer(Buffer); 784 SM.AddNewSourceBuffer(std::move(InputBufferOwner), SMLoc()); 785 } 786 787 Token &Scanner::peekNext() { 788 // If the current token is a possible simple key, keep parsing until we 789 // can confirm. 790 bool NeedMore = false; 791 while (true) { 792 if (TokenQueue.empty() || NeedMore) { 793 if (!fetchMoreTokens()) { 794 TokenQueue.clear(); 795 TokenQueue.push_back(Token()); 796 return TokenQueue.front(); 797 } 798 } 799 assert(!TokenQueue.empty() && 800 "fetchMoreTokens lied about getting tokens!"); 801 802 removeStaleSimpleKeyCandidates(); 803 SimpleKey SK; 804 SK.Tok = TokenQueue.begin(); 805 if (std::find(SimpleKeys.begin(), SimpleKeys.end(), SK) 806 == SimpleKeys.end()) 807 break; 808 else 809 NeedMore = true; 810 } 811 return TokenQueue.front(); 812 } 813 814 Token Scanner::getNext() { 815 Token Ret = peekNext(); 816 // TokenQueue can be empty if there was an error getting the next token. 817 if (!TokenQueue.empty()) 818 TokenQueue.pop_front(); 819 820 // There cannot be any referenced Token's if the TokenQueue is empty. So do a 821 // quick deallocation of them all. 822 if (TokenQueue.empty()) { 823 TokenQueue.Alloc.Reset(); 824 } 825 826 return Ret; 827 } 828 829 StringRef::iterator Scanner::skip_nb_char(StringRef::iterator Position) { 830 if (Position == End) 831 return Position; 832 // Check 7 bit c-printable - b-char. 833 if ( *Position == 0x09 834 || (*Position >= 0x20 && *Position <= 0x7E)) 835 return Position + 1; 836 837 // Check for valid UTF-8. 838 if (uint8_t(*Position) & 0x80) { 839 UTF8Decoded u8d = decodeUTF8(Position); 840 if ( u8d.second != 0 841 && u8d.first != 0xFEFF 842 && ( u8d.first == 0x85 843 || ( u8d.first >= 0xA0 844 && u8d.first <= 0xD7FF) 845 || ( u8d.first >= 0xE000 846 && u8d.first <= 0xFFFD) 847 || ( u8d.first >= 0x10000 848 && u8d.first <= 0x10FFFF))) 849 return Position + u8d.second; 850 } 851 return Position; 852 } 853 854 StringRef::iterator Scanner::skip_b_break(StringRef::iterator Position) { 855 if (Position == End) 856 return Position; 857 if (*Position == 0x0D) { 858 if (Position + 1 != End && *(Position + 1) == 0x0A) 859 return Position + 2; 860 return Position + 1; 861 } 862 863 if (*Position == 0x0A) 864 return Position + 1; 865 return Position; 866 } 867 868 StringRef::iterator Scanner::skip_s_space(StringRef::iterator Position) { 869 if (Position == End) 870 return Position; 871 if (*Position == ' ') 872 return Position + 1; 873 return Position; 874 } 875 876 StringRef::iterator Scanner::skip_s_white(StringRef::iterator Position) { 877 if (Position == End) 878 return Position; 879 if (*Position == ' ' || *Position == '\t') 880 return Position + 1; 881 return Position; 882 } 883 884 StringRef::iterator Scanner::skip_ns_char(StringRef::iterator Position) { 885 if (Position == End) 886 return Position; 887 if (*Position == ' ' || *Position == '\t') 888 return Position; 889 return skip_nb_char(Position); 890 } 891 892 StringRef::iterator Scanner::skip_while( SkipWhileFunc Func 893 , StringRef::iterator Position) { 894 while (true) { 895 StringRef::iterator i = (this->*Func)(Position); 896 if (i == Position) 897 break; 898 Position = i; 899 } 900 return Position; 901 } 902 903 void Scanner::advanceWhile(SkipWhileFunc Func) { 904 auto Final = skip_while(Func, Current); 905 Column += Final - Current; 906 Current = Final; 907 } 908 909 static bool is_ns_hex_digit(const char C) { 910 return (C >= '0' && C <= '9') 911 || (C >= 'a' && C <= 'z') 912 || (C >= 'A' && C <= 'Z'); 913 } 914 915 static bool is_ns_word_char(const char C) { 916 return C == '-' 917 || (C >= 'a' && C <= 'z') 918 || (C >= 'A' && C <= 'Z'); 919 } 920 921 StringRef Scanner::scan_ns_uri_char() { 922 StringRef::iterator Start = Current; 923 while (true) { 924 if (Current == End) 925 break; 926 if (( *Current == '%' 927 && Current + 2 < End 928 && is_ns_hex_digit(*(Current + 1)) 929 && is_ns_hex_digit(*(Current + 2))) 930 || is_ns_word_char(*Current) 931 || StringRef(Current, 1).find_first_of("#;/?:@&=+$,_.!~*'()[]") 932 != StringRef::npos) { 933 ++Current; 934 ++Column; 935 } else 936 break; 937 } 938 return StringRef(Start, Current - Start); 939 } 940 941 bool Scanner::consume(uint32_t Expected) { 942 if (Expected >= 0x80) 943 report_fatal_error("Not dealing with this yet"); 944 if (Current == End) 945 return false; 946 if (uint8_t(*Current) >= 0x80) 947 report_fatal_error("Not dealing with this yet"); 948 if (uint8_t(*Current) == Expected) { 949 ++Current; 950 ++Column; 951 return true; 952 } 953 return false; 954 } 955 956 void Scanner::skip(uint32_t Distance) { 957 Current += Distance; 958 Column += Distance; 959 assert(Current <= End && "Skipped past the end"); 960 } 961 962 bool Scanner::isBlankOrBreak(StringRef::iterator Position) { 963 if (Position == End) 964 return false; 965 if ( *Position == ' ' || *Position == '\t' 966 || *Position == '\r' || *Position == '\n') 967 return true; 968 return false; 969 } 970 971 bool Scanner::consumeLineBreakIfPresent() { 972 auto Next = skip_b_break(Current); 973 if (Next == Current) 974 return false; 975 Column = 0; 976 ++Line; 977 Current = Next; 978 return true; 979 } 980 981 void Scanner::saveSimpleKeyCandidate( TokenQueueT::iterator Tok 982 , unsigned AtColumn 983 , bool IsRequired) { 984 if (IsSimpleKeyAllowed) { 985 SimpleKey SK; 986 SK.Tok = Tok; 987 SK.Line = Line; 988 SK.Column = AtColumn; 989 SK.IsRequired = IsRequired; 990 SK.FlowLevel = FlowLevel; 991 SimpleKeys.push_back(SK); 992 } 993 } 994 995 void Scanner::removeStaleSimpleKeyCandidates() { 996 for (SmallVectorImpl<SimpleKey>::iterator i = SimpleKeys.begin(); 997 i != SimpleKeys.end();) { 998 if (i->Line != Line || i->Column + 1024 < Column) { 999 if (i->IsRequired) 1000 setError( "Could not find expected : for simple key" 1001 , i->Tok->Range.begin()); 1002 i = SimpleKeys.erase(i); 1003 } else 1004 ++i; 1005 } 1006 } 1007 1008 void Scanner::removeSimpleKeyCandidatesOnFlowLevel(unsigned Level) { 1009 if (!SimpleKeys.empty() && (SimpleKeys.end() - 1)->FlowLevel == Level) 1010 SimpleKeys.pop_back(); 1011 } 1012 1013 bool Scanner::unrollIndent(int ToColumn) { 1014 Token T; 1015 // Indentation is ignored in flow. 1016 if (FlowLevel != 0) 1017 return true; 1018 1019 while (Indent > ToColumn) { 1020 T.Kind = Token::TK_BlockEnd; 1021 T.Range = StringRef(Current, 1); 1022 TokenQueue.push_back(T); 1023 Indent = Indents.pop_back_val(); 1024 } 1025 1026 return true; 1027 } 1028 1029 bool Scanner::rollIndent( int ToColumn 1030 , Token::TokenKind Kind 1031 , TokenQueueT::iterator InsertPoint) { 1032 if (FlowLevel) 1033 return true; 1034 if (Indent < ToColumn) { 1035 Indents.push_back(Indent); 1036 Indent = ToColumn; 1037 1038 Token T; 1039 T.Kind = Kind; 1040 T.Range = StringRef(Current, 0); 1041 TokenQueue.insert(InsertPoint, T); 1042 } 1043 return true; 1044 } 1045 1046 void Scanner::skipComment() { 1047 if (*Current != '#') 1048 return; 1049 while (true) { 1050 // This may skip more than one byte, thus Column is only incremented 1051 // for code points. 1052 StringRef::iterator I = skip_nb_char(Current); 1053 if (I == Current) 1054 break; 1055 Current = I; 1056 ++Column; 1057 } 1058 } 1059 1060 void Scanner::scanToNextToken() { 1061 while (true) { 1062 while (*Current == ' ' || *Current == '\t') { 1063 skip(1); 1064 } 1065 1066 skipComment(); 1067 1068 // Skip EOL. 1069 StringRef::iterator i = skip_b_break(Current); 1070 if (i == Current) 1071 break; 1072 Current = i; 1073 ++Line; 1074 Column = 0; 1075 // New lines may start a simple key. 1076 if (!FlowLevel) 1077 IsSimpleKeyAllowed = true; 1078 } 1079 } 1080 1081 bool Scanner::scanStreamStart() { 1082 IsStartOfStream = false; 1083 1084 EncodingInfo EI = getUnicodeEncoding(currentInput()); 1085 1086 Token T; 1087 T.Kind = Token::TK_StreamStart; 1088 T.Range = StringRef(Current, EI.second); 1089 TokenQueue.push_back(T); 1090 Current += EI.second; 1091 return true; 1092 } 1093 1094 bool Scanner::scanStreamEnd() { 1095 // Force an ending new line if one isn't present. 1096 if (Column != 0) { 1097 Column = 0; 1098 ++Line; 1099 } 1100 1101 unrollIndent(-1); 1102 SimpleKeys.clear(); 1103 IsSimpleKeyAllowed = false; 1104 1105 Token T; 1106 T.Kind = Token::TK_StreamEnd; 1107 T.Range = StringRef(Current, 0); 1108 TokenQueue.push_back(T); 1109 return true; 1110 } 1111 1112 bool Scanner::scanDirective() { 1113 // Reset the indentation level. 1114 unrollIndent(-1); 1115 SimpleKeys.clear(); 1116 IsSimpleKeyAllowed = false; 1117 1118 StringRef::iterator Start = Current; 1119 consume('%'); 1120 StringRef::iterator NameStart = Current; 1121 Current = skip_while(&Scanner::skip_ns_char, Current); 1122 StringRef Name(NameStart, Current - NameStart); 1123 Current = skip_while(&Scanner::skip_s_white, Current); 1124 1125 Token T; 1126 if (Name == "YAML") { 1127 Current = skip_while(&Scanner::skip_ns_char, Current); 1128 T.Kind = Token::TK_VersionDirective; 1129 T.Range = StringRef(Start, Current - Start); 1130 TokenQueue.push_back(T); 1131 return true; 1132 } else if(Name == "TAG") { 1133 Current = skip_while(&Scanner::skip_ns_char, Current); 1134 Current = skip_while(&Scanner::skip_s_white, Current); 1135 Current = skip_while(&Scanner::skip_ns_char, Current); 1136 T.Kind = Token::TK_TagDirective; 1137 T.Range = StringRef(Start, Current - Start); 1138 TokenQueue.push_back(T); 1139 return true; 1140 } 1141 return false; 1142 } 1143 1144 bool Scanner::scanDocumentIndicator(bool IsStart) { 1145 unrollIndent(-1); 1146 SimpleKeys.clear(); 1147 IsSimpleKeyAllowed = false; 1148 1149 Token T; 1150 T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd; 1151 T.Range = StringRef(Current, 3); 1152 skip(3); 1153 TokenQueue.push_back(T); 1154 return true; 1155 } 1156 1157 bool Scanner::scanFlowCollectionStart(bool IsSequence) { 1158 Token T; 1159 T.Kind = IsSequence ? Token::TK_FlowSequenceStart 1160 : Token::TK_FlowMappingStart; 1161 T.Range = StringRef(Current, 1); 1162 skip(1); 1163 TokenQueue.push_back(T); 1164 1165 // [ and { may begin a simple key. 1166 saveSimpleKeyCandidate(--TokenQueue.end(), Column - 1, false); 1167 1168 // And may also be followed by a simple key. 1169 IsSimpleKeyAllowed = true; 1170 ++FlowLevel; 1171 return true; 1172 } 1173 1174 bool Scanner::scanFlowCollectionEnd(bool IsSequence) { 1175 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1176 IsSimpleKeyAllowed = false; 1177 Token T; 1178 T.Kind = IsSequence ? Token::TK_FlowSequenceEnd 1179 : Token::TK_FlowMappingEnd; 1180 T.Range = StringRef(Current, 1); 1181 skip(1); 1182 TokenQueue.push_back(T); 1183 if (FlowLevel) 1184 --FlowLevel; 1185 return true; 1186 } 1187 1188 bool Scanner::scanFlowEntry() { 1189 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1190 IsSimpleKeyAllowed = true; 1191 Token T; 1192 T.Kind = Token::TK_FlowEntry; 1193 T.Range = StringRef(Current, 1); 1194 skip(1); 1195 TokenQueue.push_back(T); 1196 return true; 1197 } 1198 1199 bool Scanner::scanBlockEntry() { 1200 rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end()); 1201 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1202 IsSimpleKeyAllowed = true; 1203 Token T; 1204 T.Kind = Token::TK_BlockEntry; 1205 T.Range = StringRef(Current, 1); 1206 skip(1); 1207 TokenQueue.push_back(T); 1208 return true; 1209 } 1210 1211 bool Scanner::scanKey() { 1212 if (!FlowLevel) 1213 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1214 1215 removeSimpleKeyCandidatesOnFlowLevel(FlowLevel); 1216 IsSimpleKeyAllowed = !FlowLevel; 1217 1218 Token T; 1219 T.Kind = Token::TK_Key; 1220 T.Range = StringRef(Current, 1); 1221 skip(1); 1222 TokenQueue.push_back(T); 1223 return true; 1224 } 1225 1226 bool Scanner::scanValue() { 1227 // If the previous token could have been a simple key, insert the key token 1228 // into the token queue. 1229 if (!SimpleKeys.empty()) { 1230 SimpleKey SK = SimpleKeys.pop_back_val(); 1231 Token T; 1232 T.Kind = Token::TK_Key; 1233 T.Range = SK.Tok->Range; 1234 TokenQueueT::iterator i, e; 1235 for (i = TokenQueue.begin(), e = TokenQueue.end(); i != e; ++i) { 1236 if (i == SK.Tok) 1237 break; 1238 } 1239 assert(i != e && "SimpleKey not in token queue!"); 1240 i = TokenQueue.insert(i, T); 1241 1242 // We may also need to add a Block-Mapping-Start token. 1243 rollIndent(SK.Column, Token::TK_BlockMappingStart, i); 1244 1245 IsSimpleKeyAllowed = false; 1246 } else { 1247 if (!FlowLevel) 1248 rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end()); 1249 IsSimpleKeyAllowed = !FlowLevel; 1250 } 1251 1252 Token T; 1253 T.Kind = Token::TK_Value; 1254 T.Range = StringRef(Current, 1); 1255 skip(1); 1256 TokenQueue.push_back(T); 1257 return true; 1258 } 1259 1260 // Forbidding inlining improves performance by roughly 20%. 1261 // FIXME: Remove once llvm optimizes this to the faster version without hints. 1262 LLVM_ATTRIBUTE_NOINLINE static bool 1263 wasEscaped(StringRef::iterator First, StringRef::iterator Position); 1264 1265 // Returns whether a character at 'Position' was escaped with a leading '\'. 1266 // 'First' specifies the position of the first character in the string. 1267 static bool wasEscaped(StringRef::iterator First, 1268 StringRef::iterator Position) { 1269 assert(Position - 1 >= First); 1270 StringRef::iterator I = Position - 1; 1271 // We calculate the number of consecutive '\'s before the current position 1272 // by iterating backwards through our string. 1273 while (I >= First && *I == '\\') --I; 1274 // (Position - 1 - I) now contains the number of '\'s before the current 1275 // position. If it is odd, the character at 'Position' was escaped. 1276 return (Position - 1 - I) % 2 == 1; 1277 } 1278 1279 bool Scanner::scanFlowScalar(bool IsDoubleQuoted) { 1280 StringRef::iterator Start = Current; 1281 unsigned ColStart = Column; 1282 if (IsDoubleQuoted) { 1283 do { 1284 ++Current; 1285 while (Current != End && *Current != '"') 1286 ++Current; 1287 // Repeat until the previous character was not a '\' or was an escaped 1288 // backslash. 1289 } while ( Current != End 1290 && *(Current - 1) == '\\' 1291 && wasEscaped(Start + 1, Current)); 1292 } else { 1293 skip(1); 1294 while (true) { 1295 // Skip a ' followed by another '. 1296 if (Current + 1 < End && *Current == '\'' && *(Current + 1) == '\'') { 1297 skip(2); 1298 continue; 1299 } else if (*Current == '\'') 1300 break; 1301 StringRef::iterator i = skip_nb_char(Current); 1302 if (i == Current) { 1303 i = skip_b_break(Current); 1304 if (i == Current) 1305 break; 1306 Current = i; 1307 Column = 0; 1308 ++Line; 1309 } else { 1310 if (i == End) 1311 break; 1312 Current = i; 1313 ++Column; 1314 } 1315 } 1316 } 1317 1318 if (Current == End) { 1319 setError("Expected quote at end of scalar", Current); 1320 return false; 1321 } 1322 1323 skip(1); // Skip ending quote. 1324 Token T; 1325 T.Kind = Token::TK_Scalar; 1326 T.Range = StringRef(Start, Current - Start); 1327 TokenQueue.push_back(T); 1328 1329 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1330 1331 IsSimpleKeyAllowed = false; 1332 1333 return true; 1334 } 1335 1336 bool Scanner::scanPlainScalar() { 1337 StringRef::iterator Start = Current; 1338 unsigned ColStart = Column; 1339 unsigned LeadingBlanks = 0; 1340 assert(Indent >= -1 && "Indent must be >= -1 !"); 1341 unsigned indent = static_cast<unsigned>(Indent + 1); 1342 while (true) { 1343 if (*Current == '#') 1344 break; 1345 1346 while (!isBlankOrBreak(Current)) { 1347 if ( FlowLevel && *Current == ':' 1348 && !(isBlankOrBreak(Current + 1) || *(Current + 1) == ',')) { 1349 setError("Found unexpected ':' while scanning a plain scalar", Current); 1350 return false; 1351 } 1352 1353 // Check for the end of the plain scalar. 1354 if ( (*Current == ':' && isBlankOrBreak(Current + 1)) 1355 || ( FlowLevel 1356 && (StringRef(Current, 1).find_first_of(",:?[]{}") 1357 != StringRef::npos))) 1358 break; 1359 1360 StringRef::iterator i = skip_nb_char(Current); 1361 if (i == Current) 1362 break; 1363 Current = i; 1364 ++Column; 1365 } 1366 1367 // Are we at the end? 1368 if (!isBlankOrBreak(Current)) 1369 break; 1370 1371 // Eat blanks. 1372 StringRef::iterator Tmp = Current; 1373 while (isBlankOrBreak(Tmp)) { 1374 StringRef::iterator i = skip_s_white(Tmp); 1375 if (i != Tmp) { 1376 if (LeadingBlanks && (Column < indent) && *Tmp == '\t') { 1377 setError("Found invalid tab character in indentation", Tmp); 1378 return false; 1379 } 1380 Tmp = i; 1381 ++Column; 1382 } else { 1383 i = skip_b_break(Tmp); 1384 if (!LeadingBlanks) 1385 LeadingBlanks = 1; 1386 Tmp = i; 1387 Column = 0; 1388 ++Line; 1389 } 1390 } 1391 1392 if (!FlowLevel && Column < indent) 1393 break; 1394 1395 Current = Tmp; 1396 } 1397 if (Start == Current) { 1398 setError("Got empty plain scalar", Start); 1399 return false; 1400 } 1401 Token T; 1402 T.Kind = Token::TK_Scalar; 1403 T.Range = StringRef(Start, Current - Start); 1404 TokenQueue.push_back(T); 1405 1406 // Plain scalars can be simple keys. 1407 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1408 1409 IsSimpleKeyAllowed = false; 1410 1411 return true; 1412 } 1413 1414 bool Scanner::scanAliasOrAnchor(bool IsAlias) { 1415 StringRef::iterator Start = Current; 1416 unsigned ColStart = Column; 1417 skip(1); 1418 while(true) { 1419 if ( *Current == '[' || *Current == ']' 1420 || *Current == '{' || *Current == '}' 1421 || *Current == ',' 1422 || *Current == ':') 1423 break; 1424 StringRef::iterator i = skip_ns_char(Current); 1425 if (i == Current) 1426 break; 1427 Current = i; 1428 ++Column; 1429 } 1430 1431 if (Start == Current) { 1432 setError("Got empty alias or anchor", Start); 1433 return false; 1434 } 1435 1436 Token T; 1437 T.Kind = IsAlias ? Token::TK_Alias : Token::TK_Anchor; 1438 T.Range = StringRef(Start, Current - Start); 1439 TokenQueue.push_back(T); 1440 1441 // Alias and anchors can be simple keys. 1442 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1443 1444 IsSimpleKeyAllowed = false; 1445 1446 return true; 1447 } 1448 1449 char Scanner::scanBlockChompingIndicator() { 1450 char Indicator = ' '; 1451 if (Current != End && (*Current == '+' || *Current == '-')) { 1452 Indicator = *Current; 1453 skip(1); 1454 } 1455 return Indicator; 1456 } 1457 1458 /// Get the number of line breaks after chomping. 1459 /// 1460 /// Return the number of trailing line breaks to emit, depending on 1461 /// \p ChompingIndicator. 1462 static unsigned getChompedLineBreaks(char ChompingIndicator, 1463 unsigned LineBreaks, StringRef Str) { 1464 if (ChompingIndicator == '-') // Strip all line breaks. 1465 return 0; 1466 if (ChompingIndicator == '+') // Keep all line breaks. 1467 return LineBreaks; 1468 // Clip trailing lines. 1469 return Str.empty() ? 0 : 1; 1470 } 1471 1472 unsigned Scanner::scanBlockIndentationIndicator() { 1473 unsigned Indent = 0; 1474 if (Current != End && (*Current >= '1' && *Current <= '9')) { 1475 Indent = unsigned(*Current - '0'); 1476 skip(1); 1477 } 1478 return Indent; 1479 } 1480 1481 bool Scanner::scanBlockScalarHeader(char &ChompingIndicator, 1482 unsigned &IndentIndicator, bool &IsDone) { 1483 auto Start = Current; 1484 1485 ChompingIndicator = scanBlockChompingIndicator(); 1486 IndentIndicator = scanBlockIndentationIndicator(); 1487 // Check for the chomping indicator once again. 1488 if (ChompingIndicator == ' ') 1489 ChompingIndicator = scanBlockChompingIndicator(); 1490 Current = skip_while(&Scanner::skip_s_white, Current); 1491 skipComment(); 1492 1493 if (Current == End) { // EOF, we have an empty scalar. 1494 Token T; 1495 T.Kind = Token::TK_BlockScalar; 1496 T.Range = StringRef(Start, Current - Start); 1497 TokenQueue.push_back(T); 1498 IsDone = true; 1499 return true; 1500 } 1501 1502 if (!consumeLineBreakIfPresent()) { 1503 setError("Expected a line break after block scalar header", Current); 1504 return false; 1505 } 1506 return true; 1507 } 1508 1509 bool Scanner::findBlockScalarIndent(unsigned &BlockIndent, 1510 unsigned BlockExitIndent, 1511 unsigned &LineBreaks, bool &IsDone) { 1512 unsigned MaxAllSpaceLineCharacters = 0; 1513 StringRef::iterator LongestAllSpaceLine; 1514 1515 while (true) { 1516 advanceWhile(&Scanner::skip_s_space); 1517 if (skip_nb_char(Current) != Current) { 1518 // This line isn't empty, so try and find the indentation. 1519 if (Column <= BlockExitIndent) { // End of the block literal. 1520 IsDone = true; 1521 return true; 1522 } 1523 // We found the block's indentation. 1524 BlockIndent = Column; 1525 if (MaxAllSpaceLineCharacters > BlockIndent) { 1526 setError( 1527 "Leading all-spaces line must be smaller than the block indent", 1528 LongestAllSpaceLine); 1529 return false; 1530 } 1531 return true; 1532 } 1533 if (skip_b_break(Current) != Current && 1534 Column > MaxAllSpaceLineCharacters) { 1535 // Record the longest all-space line in case it's longer than the 1536 // discovered block indent. 1537 MaxAllSpaceLineCharacters = Column; 1538 LongestAllSpaceLine = Current; 1539 } 1540 1541 // Check for EOF. 1542 if (Current == End) { 1543 IsDone = true; 1544 return true; 1545 } 1546 1547 if (!consumeLineBreakIfPresent()) { 1548 IsDone = true; 1549 return true; 1550 } 1551 ++LineBreaks; 1552 } 1553 return true; 1554 } 1555 1556 bool Scanner::scanBlockScalarIndent(unsigned BlockIndent, 1557 unsigned BlockExitIndent, bool &IsDone) { 1558 // Skip the indentation. 1559 while (Column < BlockIndent) { 1560 auto I = skip_s_space(Current); 1561 if (I == Current) 1562 break; 1563 Current = I; 1564 ++Column; 1565 } 1566 1567 if (skip_nb_char(Current) == Current) 1568 return true; 1569 1570 if (Column <= BlockExitIndent) { // End of the block literal. 1571 IsDone = true; 1572 return true; 1573 } 1574 1575 if (Column < BlockIndent) { 1576 if (Current != End && *Current == '#') { // Trailing comment. 1577 IsDone = true; 1578 return true; 1579 } 1580 setError("A text line is less indented than the block scalar", Current); 1581 return false; 1582 } 1583 return true; // A normal text line. 1584 } 1585 1586 bool Scanner::scanBlockScalar(bool IsLiteral) { 1587 // Eat '|' or '>' 1588 assert(*Current == '|' || *Current == '>'); 1589 skip(1); 1590 1591 char ChompingIndicator; 1592 unsigned BlockIndent; 1593 bool IsDone = false; 1594 if (!scanBlockScalarHeader(ChompingIndicator, BlockIndent, IsDone)) 1595 return false; 1596 if (IsDone) 1597 return true; 1598 1599 auto Start = Current; 1600 unsigned BlockExitIndent = Indent < 0 ? 0 : (unsigned)Indent; 1601 unsigned LineBreaks = 0; 1602 if (BlockIndent == 0) { 1603 if (!findBlockScalarIndent(BlockIndent, BlockExitIndent, LineBreaks, 1604 IsDone)) 1605 return false; 1606 } 1607 1608 // Scan the block's scalars body. 1609 SmallString<256> Str; 1610 while (!IsDone) { 1611 if (!scanBlockScalarIndent(BlockIndent, BlockExitIndent, IsDone)) 1612 return false; 1613 if (IsDone) 1614 break; 1615 1616 // Parse the current line. 1617 auto LineStart = Current; 1618 advanceWhile(&Scanner::skip_nb_char); 1619 if (LineStart != Current) { 1620 Str.append(LineBreaks, '\n'); 1621 Str.append(StringRef(LineStart, Current - LineStart)); 1622 LineBreaks = 0; 1623 } 1624 1625 // Check for EOF. 1626 if (Current == End) 1627 break; 1628 1629 if (!consumeLineBreakIfPresent()) 1630 break; 1631 ++LineBreaks; 1632 } 1633 1634 if (Current == End && !LineBreaks) 1635 // Ensure that there is at least one line break before the end of file. 1636 LineBreaks = 1; 1637 Str.append(getChompedLineBreaks(ChompingIndicator, LineBreaks, Str), '\n'); 1638 1639 // New lines may start a simple key. 1640 if (!FlowLevel) 1641 IsSimpleKeyAllowed = true; 1642 1643 Token T; 1644 T.Kind = Token::TK_BlockScalar; 1645 T.Range = StringRef(Start, Current - Start); 1646 T.Value = Str.str().str(); 1647 TokenQueue.push_back(T); 1648 return true; 1649 } 1650 1651 bool Scanner::scanTag() { 1652 StringRef::iterator Start = Current; 1653 unsigned ColStart = Column; 1654 skip(1); // Eat !. 1655 if (Current == End || isBlankOrBreak(Current)); // An empty tag. 1656 else if (*Current == '<') { 1657 skip(1); 1658 scan_ns_uri_char(); 1659 if (!consume('>')) 1660 return false; 1661 } else { 1662 // FIXME: Actually parse the c-ns-shorthand-tag rule. 1663 Current = skip_while(&Scanner::skip_ns_char, Current); 1664 } 1665 1666 Token T; 1667 T.Kind = Token::TK_Tag; 1668 T.Range = StringRef(Start, Current - Start); 1669 TokenQueue.push_back(T); 1670 1671 // Tags can be simple keys. 1672 saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false); 1673 1674 IsSimpleKeyAllowed = false; 1675 1676 return true; 1677 } 1678 1679 bool Scanner::fetchMoreTokens() { 1680 if (IsStartOfStream) 1681 return scanStreamStart(); 1682 1683 scanToNextToken(); 1684 1685 if (Current == End) 1686 return scanStreamEnd(); 1687 1688 removeStaleSimpleKeyCandidates(); 1689 1690 unrollIndent(Column); 1691 1692 if (Column == 0 && *Current == '%') 1693 return scanDirective(); 1694 1695 if (Column == 0 && Current + 4 <= End 1696 && *Current == '-' 1697 && *(Current + 1) == '-' 1698 && *(Current + 2) == '-' 1699 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1700 return scanDocumentIndicator(true); 1701 1702 if (Column == 0 && Current + 4 <= End 1703 && *Current == '.' 1704 && *(Current + 1) == '.' 1705 && *(Current + 2) == '.' 1706 && (Current + 3 == End || isBlankOrBreak(Current + 3))) 1707 return scanDocumentIndicator(false); 1708 1709 if (*Current == '[') 1710 return scanFlowCollectionStart(true); 1711 1712 if (*Current == '{') 1713 return scanFlowCollectionStart(false); 1714 1715 if (*Current == ']') 1716 return scanFlowCollectionEnd(true); 1717 1718 if (*Current == '}') 1719 return scanFlowCollectionEnd(false); 1720 1721 if (*Current == ',') 1722 return scanFlowEntry(); 1723 1724 if (*Current == '-' && isBlankOrBreak(Current + 1)) 1725 return scanBlockEntry(); 1726 1727 if (*Current == '?' && (FlowLevel || isBlankOrBreak(Current + 1))) 1728 return scanKey(); 1729 1730 if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1))) 1731 return scanValue(); 1732 1733 if (*Current == '*') 1734 return scanAliasOrAnchor(true); 1735 1736 if (*Current == '&') 1737 return scanAliasOrAnchor(false); 1738 1739 if (*Current == '!') 1740 return scanTag(); 1741 1742 if (*Current == '|' && !FlowLevel) 1743 return scanBlockScalar(true); 1744 1745 if (*Current == '>' && !FlowLevel) 1746 return scanBlockScalar(false); 1747 1748 if (*Current == '\'') 1749 return scanFlowScalar(false); 1750 1751 if (*Current == '"') 1752 return scanFlowScalar(true); 1753 1754 // Get a plain scalar. 1755 StringRef FirstChar(Current, 1); 1756 if (!(isBlankOrBreak(Current) 1757 || FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos) 1758 || (*Current == '-' && !isBlankOrBreak(Current + 1)) 1759 || (!FlowLevel && (*Current == '?' || *Current == ':') 1760 && isBlankOrBreak(Current + 1)) 1761 || (!FlowLevel && *Current == ':' 1762 && Current + 2 < End 1763 && *(Current + 1) == ':' 1764 && !isBlankOrBreak(Current + 2))) 1765 return scanPlainScalar(); 1766 1767 setError("Unrecognized character while tokenizing."); 1768 return false; 1769 } 1770 1771 Stream::Stream(StringRef Input, SourceMgr &SM, bool ShowColors) 1772 : scanner(new Scanner(Input, SM, ShowColors)), CurrentDoc() {} 1773 1774 Stream::Stream(MemoryBufferRef InputBuffer, SourceMgr &SM, bool ShowColors) 1775 : scanner(new Scanner(InputBuffer, SM, ShowColors)), CurrentDoc() {} 1776 1777 Stream::~Stream() {} 1778 1779 bool Stream::failed() { return scanner->failed(); } 1780 1781 void Stream::printError(Node *N, const Twine &Msg) { 1782 scanner->printError( N->getSourceRange().Start 1783 , SourceMgr::DK_Error 1784 , Msg 1785 , N->getSourceRange()); 1786 } 1787 1788 document_iterator Stream::begin() { 1789 if (CurrentDoc) 1790 report_fatal_error("Can only iterate over the stream once"); 1791 1792 // Skip Stream-Start. 1793 scanner->getNext(); 1794 1795 CurrentDoc.reset(new Document(*this)); 1796 return document_iterator(CurrentDoc); 1797 } 1798 1799 document_iterator Stream::end() { 1800 return document_iterator(); 1801 } 1802 1803 void Stream::skip() { 1804 for (document_iterator i = begin(), e = end(); i != e; ++i) 1805 i->skip(); 1806 } 1807 1808 Node::Node(unsigned int Type, std::unique_ptr<Document> &D, StringRef A, 1809 StringRef T) 1810 : Doc(D), TypeID(Type), Anchor(A), Tag(T) { 1811 SMLoc Start = SMLoc::getFromPointer(peekNext().Range.begin()); 1812 SourceRange = SMRange(Start, Start); 1813 } 1814 1815 std::string Node::getVerbatimTag() const { 1816 StringRef Raw = getRawTag(); 1817 if (!Raw.empty() && Raw != "!") { 1818 std::string Ret; 1819 if (Raw.find_last_of('!') == 0) { 1820 Ret = Doc->getTagMap().find("!")->second; 1821 Ret += Raw.substr(1); 1822 return Ret; 1823 } else if (Raw.startswith("!!")) { 1824 Ret = Doc->getTagMap().find("!!")->second; 1825 Ret += Raw.substr(2); 1826 return Ret; 1827 } else { 1828 StringRef TagHandle = Raw.substr(0, Raw.find_last_of('!') + 1); 1829 std::map<StringRef, StringRef>::const_iterator It = 1830 Doc->getTagMap().find(TagHandle); 1831 if (It != Doc->getTagMap().end()) 1832 Ret = It->second; 1833 else { 1834 Token T; 1835 T.Kind = Token::TK_Tag; 1836 T.Range = TagHandle; 1837 setError(Twine("Unknown tag handle ") + TagHandle, T); 1838 } 1839 Ret += Raw.substr(Raw.find_last_of('!') + 1); 1840 return Ret; 1841 } 1842 } 1843 1844 switch (getType()) { 1845 case NK_Null: 1846 return "tag:yaml.org,2002:null"; 1847 case NK_Scalar: 1848 case NK_BlockScalar: 1849 // TODO: Tag resolution. 1850 return "tag:yaml.org,2002:str"; 1851 case NK_Mapping: 1852 return "tag:yaml.org,2002:map"; 1853 case NK_Sequence: 1854 return "tag:yaml.org,2002:seq"; 1855 } 1856 1857 return ""; 1858 } 1859 1860 Token &Node::peekNext() { 1861 return Doc->peekNext(); 1862 } 1863 1864 Token Node::getNext() { 1865 return Doc->getNext(); 1866 } 1867 1868 Node *Node::parseBlockNode() { 1869 return Doc->parseBlockNode(); 1870 } 1871 1872 BumpPtrAllocator &Node::getAllocator() { 1873 return Doc->NodeAllocator; 1874 } 1875 1876 void Node::setError(const Twine &Msg, Token &Tok) const { 1877 Doc->setError(Msg, Tok); 1878 } 1879 1880 bool Node::failed() const { 1881 return Doc->failed(); 1882 } 1883 1884 1885 1886 StringRef ScalarNode::getValue(SmallVectorImpl<char> &Storage) const { 1887 // TODO: Handle newlines properly. We need to remove leading whitespace. 1888 if (Value[0] == '"') { // Double quoted. 1889 // Pull off the leading and trailing "s. 1890 StringRef UnquotedValue = Value.substr(1, Value.size() - 2); 1891 // Search for characters that would require unescaping the value. 1892 StringRef::size_type i = UnquotedValue.find_first_of("\\\r\n"); 1893 if (i != StringRef::npos) 1894 return unescapeDoubleQuoted(UnquotedValue, i, Storage); 1895 return UnquotedValue; 1896 } else if (Value[0] == '\'') { // Single quoted. 1897 // Pull off the leading and trailing 's. 1898 StringRef UnquotedValue = Value.substr(1, Value.size() - 2); 1899 StringRef::size_type i = UnquotedValue.find('\''); 1900 if (i != StringRef::npos) { 1901 // We're going to need Storage. 1902 Storage.clear(); 1903 Storage.reserve(UnquotedValue.size()); 1904 for (; i != StringRef::npos; i = UnquotedValue.find('\'')) { 1905 StringRef Valid(UnquotedValue.begin(), i); 1906 Storage.insert(Storage.end(), Valid.begin(), Valid.end()); 1907 Storage.push_back('\''); 1908 UnquotedValue = UnquotedValue.substr(i + 2); 1909 } 1910 Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); 1911 return StringRef(Storage.begin(), Storage.size()); 1912 } 1913 return UnquotedValue; 1914 } 1915 // Plain or block. 1916 return Value.rtrim(" "); 1917 } 1918 1919 StringRef ScalarNode::unescapeDoubleQuoted( StringRef UnquotedValue 1920 , StringRef::size_type i 1921 , SmallVectorImpl<char> &Storage) 1922 const { 1923 // Use Storage to build proper value. 1924 Storage.clear(); 1925 Storage.reserve(UnquotedValue.size()); 1926 for (; i != StringRef::npos; i = UnquotedValue.find_first_of("\\\r\n")) { 1927 // Insert all previous chars into Storage. 1928 StringRef Valid(UnquotedValue.begin(), i); 1929 Storage.insert(Storage.end(), Valid.begin(), Valid.end()); 1930 // Chop off inserted chars. 1931 UnquotedValue = UnquotedValue.substr(i); 1932 1933 assert(!UnquotedValue.empty() && "Can't be empty!"); 1934 1935 // Parse escape or line break. 1936 switch (UnquotedValue[0]) { 1937 case '\r': 1938 case '\n': 1939 Storage.push_back('\n'); 1940 if ( UnquotedValue.size() > 1 1941 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 1942 UnquotedValue = UnquotedValue.substr(1); 1943 UnquotedValue = UnquotedValue.substr(1); 1944 break; 1945 default: 1946 if (UnquotedValue.size() == 1) 1947 // TODO: Report error. 1948 break; 1949 UnquotedValue = UnquotedValue.substr(1); 1950 switch (UnquotedValue[0]) { 1951 default: { 1952 Token T; 1953 T.Range = StringRef(UnquotedValue.begin(), 1); 1954 setError("Unrecognized escape code!", T); 1955 return ""; 1956 } 1957 case '\r': 1958 case '\n': 1959 // Remove the new line. 1960 if ( UnquotedValue.size() > 1 1961 && (UnquotedValue[1] == '\r' || UnquotedValue[1] == '\n')) 1962 UnquotedValue = UnquotedValue.substr(1); 1963 // If this was just a single byte newline, it will get skipped 1964 // below. 1965 break; 1966 case '0': 1967 Storage.push_back(0x00); 1968 break; 1969 case 'a': 1970 Storage.push_back(0x07); 1971 break; 1972 case 'b': 1973 Storage.push_back(0x08); 1974 break; 1975 case 't': 1976 case 0x09: 1977 Storage.push_back(0x09); 1978 break; 1979 case 'n': 1980 Storage.push_back(0x0A); 1981 break; 1982 case 'v': 1983 Storage.push_back(0x0B); 1984 break; 1985 case 'f': 1986 Storage.push_back(0x0C); 1987 break; 1988 case 'r': 1989 Storage.push_back(0x0D); 1990 break; 1991 case 'e': 1992 Storage.push_back(0x1B); 1993 break; 1994 case ' ': 1995 Storage.push_back(0x20); 1996 break; 1997 case '"': 1998 Storage.push_back(0x22); 1999 break; 2000 case '/': 2001 Storage.push_back(0x2F); 2002 break; 2003 case '\\': 2004 Storage.push_back(0x5C); 2005 break; 2006 case 'N': 2007 encodeUTF8(0x85, Storage); 2008 break; 2009 case '_': 2010 encodeUTF8(0xA0, Storage); 2011 break; 2012 case 'L': 2013 encodeUTF8(0x2028, Storage); 2014 break; 2015 case 'P': 2016 encodeUTF8(0x2029, Storage); 2017 break; 2018 case 'x': { 2019 if (UnquotedValue.size() < 3) 2020 // TODO: Report error. 2021 break; 2022 unsigned int UnicodeScalarValue; 2023 if (UnquotedValue.substr(1, 2).getAsInteger(16, UnicodeScalarValue)) 2024 // TODO: Report error. 2025 UnicodeScalarValue = 0xFFFD; 2026 encodeUTF8(UnicodeScalarValue, Storage); 2027 UnquotedValue = UnquotedValue.substr(2); 2028 break; 2029 } 2030 case 'u': { 2031 if (UnquotedValue.size() < 5) 2032 // TODO: Report error. 2033 break; 2034 unsigned int UnicodeScalarValue; 2035 if (UnquotedValue.substr(1, 4).getAsInteger(16, UnicodeScalarValue)) 2036 // TODO: Report error. 2037 UnicodeScalarValue = 0xFFFD; 2038 encodeUTF8(UnicodeScalarValue, Storage); 2039 UnquotedValue = UnquotedValue.substr(4); 2040 break; 2041 } 2042 case 'U': { 2043 if (UnquotedValue.size() < 9) 2044 // TODO: Report error. 2045 break; 2046 unsigned int UnicodeScalarValue; 2047 if (UnquotedValue.substr(1, 8).getAsInteger(16, UnicodeScalarValue)) 2048 // TODO: Report error. 2049 UnicodeScalarValue = 0xFFFD; 2050 encodeUTF8(UnicodeScalarValue, Storage); 2051 UnquotedValue = UnquotedValue.substr(8); 2052 break; 2053 } 2054 } 2055 UnquotedValue = UnquotedValue.substr(1); 2056 } 2057 } 2058 Storage.insert(Storage.end(), UnquotedValue.begin(), UnquotedValue.end()); 2059 return StringRef(Storage.begin(), Storage.size()); 2060 } 2061 2062 Node *KeyValueNode::getKey() { 2063 if (Key) 2064 return Key; 2065 // Handle implicit null keys. 2066 { 2067 Token &t = peekNext(); 2068 if ( t.Kind == Token::TK_BlockEnd 2069 || t.Kind == Token::TK_Value 2070 || t.Kind == Token::TK_Error) { 2071 return Key = new (getAllocator()) NullNode(Doc); 2072 } 2073 if (t.Kind == Token::TK_Key) 2074 getNext(); // skip TK_Key. 2075 } 2076 2077 // Handle explicit null keys. 2078 Token &t = peekNext(); 2079 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Value) { 2080 return Key = new (getAllocator()) NullNode(Doc); 2081 } 2082 2083 // We've got a normal key. 2084 return Key = parseBlockNode(); 2085 } 2086 2087 Node *KeyValueNode::getValue() { 2088 if (Value) 2089 return Value; 2090 getKey()->skip(); 2091 if (failed()) 2092 return Value = new (getAllocator()) NullNode(Doc); 2093 2094 // Handle implicit null values. 2095 { 2096 Token &t = peekNext(); 2097 if ( t.Kind == Token::TK_BlockEnd 2098 || t.Kind == Token::TK_FlowMappingEnd 2099 || t.Kind == Token::TK_Key 2100 || t.Kind == Token::TK_FlowEntry 2101 || t.Kind == Token::TK_Error) { 2102 return Value = new (getAllocator()) NullNode(Doc); 2103 } 2104 2105 if (t.Kind != Token::TK_Value) { 2106 setError("Unexpected token in Key Value.", t); 2107 return Value = new (getAllocator()) NullNode(Doc); 2108 } 2109 getNext(); // skip TK_Value. 2110 } 2111 2112 // Handle explicit null values. 2113 Token &t = peekNext(); 2114 if (t.Kind == Token::TK_BlockEnd || t.Kind == Token::TK_Key) { 2115 return Value = new (getAllocator()) NullNode(Doc); 2116 } 2117 2118 // We got a normal value. 2119 return Value = parseBlockNode(); 2120 } 2121 2122 void MappingNode::increment() { 2123 if (failed()) { 2124 IsAtEnd = true; 2125 CurrentEntry = nullptr; 2126 return; 2127 } 2128 if (CurrentEntry) { 2129 CurrentEntry->skip(); 2130 if (Type == MT_Inline) { 2131 IsAtEnd = true; 2132 CurrentEntry = nullptr; 2133 return; 2134 } 2135 } 2136 Token T = peekNext(); 2137 if (T.Kind == Token::TK_Key || T.Kind == Token::TK_Scalar) { 2138 // KeyValueNode eats the TK_Key. That way it can detect null keys. 2139 CurrentEntry = new (getAllocator()) KeyValueNode(Doc); 2140 } else if (Type == MT_Block) { 2141 switch (T.Kind) { 2142 case Token::TK_BlockEnd: 2143 getNext(); 2144 IsAtEnd = true; 2145 CurrentEntry = nullptr; 2146 break; 2147 default: 2148 setError("Unexpected token. Expected Key or Block End", T); 2149 case Token::TK_Error: 2150 IsAtEnd = true; 2151 CurrentEntry = nullptr; 2152 } 2153 } else { 2154 switch (T.Kind) { 2155 case Token::TK_FlowEntry: 2156 // Eat the flow entry and recurse. 2157 getNext(); 2158 return increment(); 2159 case Token::TK_FlowMappingEnd: 2160 getNext(); 2161 case Token::TK_Error: 2162 // Set this to end iterator. 2163 IsAtEnd = true; 2164 CurrentEntry = nullptr; 2165 break; 2166 default: 2167 setError( "Unexpected token. Expected Key, Flow Entry, or Flow " 2168 "Mapping End." 2169 , T); 2170 IsAtEnd = true; 2171 CurrentEntry = nullptr; 2172 } 2173 } 2174 } 2175 2176 void SequenceNode::increment() { 2177 if (failed()) { 2178 IsAtEnd = true; 2179 CurrentEntry = nullptr; 2180 return; 2181 } 2182 if (CurrentEntry) 2183 CurrentEntry->skip(); 2184 Token T = peekNext(); 2185 if (SeqType == ST_Block) { 2186 switch (T.Kind) { 2187 case Token::TK_BlockEntry: 2188 getNext(); 2189 CurrentEntry = parseBlockNode(); 2190 if (!CurrentEntry) { // An error occurred. 2191 IsAtEnd = true; 2192 CurrentEntry = nullptr; 2193 } 2194 break; 2195 case Token::TK_BlockEnd: 2196 getNext(); 2197 IsAtEnd = true; 2198 CurrentEntry = nullptr; 2199 break; 2200 default: 2201 setError( "Unexpected token. Expected Block Entry or Block End." 2202 , T); 2203 case Token::TK_Error: 2204 IsAtEnd = true; 2205 CurrentEntry = nullptr; 2206 } 2207 } else if (SeqType == ST_Indentless) { 2208 switch (T.Kind) { 2209 case Token::TK_BlockEntry: 2210 getNext(); 2211 CurrentEntry = parseBlockNode(); 2212 if (!CurrentEntry) { // An error occurred. 2213 IsAtEnd = true; 2214 CurrentEntry = nullptr; 2215 } 2216 break; 2217 default: 2218 case Token::TK_Error: 2219 IsAtEnd = true; 2220 CurrentEntry = nullptr; 2221 } 2222 } else if (SeqType == ST_Flow) { 2223 switch (T.Kind) { 2224 case Token::TK_FlowEntry: 2225 // Eat the flow entry and recurse. 2226 getNext(); 2227 WasPreviousTokenFlowEntry = true; 2228 return increment(); 2229 case Token::TK_FlowSequenceEnd: 2230 getNext(); 2231 case Token::TK_Error: 2232 // Set this to end iterator. 2233 IsAtEnd = true; 2234 CurrentEntry = nullptr; 2235 break; 2236 case Token::TK_StreamEnd: 2237 case Token::TK_DocumentEnd: 2238 case Token::TK_DocumentStart: 2239 setError("Could not find closing ]!", T); 2240 // Set this to end iterator. 2241 IsAtEnd = true; 2242 CurrentEntry = nullptr; 2243 break; 2244 default: 2245 if (!WasPreviousTokenFlowEntry) { 2246 setError("Expected , between entries!", T); 2247 IsAtEnd = true; 2248 CurrentEntry = nullptr; 2249 break; 2250 } 2251 // Otherwise it must be a flow entry. 2252 CurrentEntry = parseBlockNode(); 2253 if (!CurrentEntry) { 2254 IsAtEnd = true; 2255 } 2256 WasPreviousTokenFlowEntry = false; 2257 break; 2258 } 2259 } 2260 } 2261 2262 Document::Document(Stream &S) : stream(S), Root(nullptr) { 2263 // Tag maps starts with two default mappings. 2264 TagMap["!"] = "!"; 2265 TagMap["!!"] = "tag:yaml.org,2002:"; 2266 2267 if (parseDirectives()) 2268 expectToken(Token::TK_DocumentStart); 2269 Token &T = peekNext(); 2270 if (T.Kind == Token::TK_DocumentStart) 2271 getNext(); 2272 } 2273 2274 bool Document::skip() { 2275 if (stream.scanner->failed()) 2276 return false; 2277 if (!Root) 2278 getRoot(); 2279 Root->skip(); 2280 Token &T = peekNext(); 2281 if (T.Kind == Token::TK_StreamEnd) 2282 return false; 2283 if (T.Kind == Token::TK_DocumentEnd) { 2284 getNext(); 2285 return skip(); 2286 } 2287 return true; 2288 } 2289 2290 Token &Document::peekNext() { 2291 return stream.scanner->peekNext(); 2292 } 2293 2294 Token Document::getNext() { 2295 return stream.scanner->getNext(); 2296 } 2297 2298 void Document::setError(const Twine &Message, Token &Location) const { 2299 stream.scanner->setError(Message, Location.Range.begin()); 2300 } 2301 2302 bool Document::failed() const { 2303 return stream.scanner->failed(); 2304 } 2305 2306 Node *Document::parseBlockNode() { 2307 Token T = peekNext(); 2308 // Handle properties. 2309 Token AnchorInfo; 2310 Token TagInfo; 2311 parse_property: 2312 switch (T.Kind) { 2313 case Token::TK_Alias: 2314 getNext(); 2315 return new (NodeAllocator) AliasNode(stream.CurrentDoc, T.Range.substr(1)); 2316 case Token::TK_Anchor: 2317 if (AnchorInfo.Kind == Token::TK_Anchor) { 2318 setError("Already encountered an anchor for this node!", T); 2319 return nullptr; 2320 } 2321 AnchorInfo = getNext(); // Consume TK_Anchor. 2322 T = peekNext(); 2323 goto parse_property; 2324 case Token::TK_Tag: 2325 if (TagInfo.Kind == Token::TK_Tag) { 2326 setError("Already encountered a tag for this node!", T); 2327 return nullptr; 2328 } 2329 TagInfo = getNext(); // Consume TK_Tag. 2330 T = peekNext(); 2331 goto parse_property; 2332 default: 2333 break; 2334 } 2335 2336 switch (T.Kind) { 2337 case Token::TK_BlockEntry: 2338 // We got an unindented BlockEntry sequence. This is not terminated with 2339 // a BlockEnd. 2340 // Don't eat the TK_BlockEntry, SequenceNode needs it. 2341 return new (NodeAllocator) SequenceNode( stream.CurrentDoc 2342 , AnchorInfo.Range.substr(1) 2343 , TagInfo.Range 2344 , SequenceNode::ST_Indentless); 2345 case Token::TK_BlockSequenceStart: 2346 getNext(); 2347 return new (NodeAllocator) 2348 SequenceNode( stream.CurrentDoc 2349 , AnchorInfo.Range.substr(1) 2350 , TagInfo.Range 2351 , SequenceNode::ST_Block); 2352 case Token::TK_BlockMappingStart: 2353 getNext(); 2354 return new (NodeAllocator) 2355 MappingNode( stream.CurrentDoc 2356 , AnchorInfo.Range.substr(1) 2357 , TagInfo.Range 2358 , MappingNode::MT_Block); 2359 case Token::TK_FlowSequenceStart: 2360 getNext(); 2361 return new (NodeAllocator) 2362 SequenceNode( stream.CurrentDoc 2363 , AnchorInfo.Range.substr(1) 2364 , TagInfo.Range 2365 , SequenceNode::ST_Flow); 2366 case Token::TK_FlowMappingStart: 2367 getNext(); 2368 return new (NodeAllocator) 2369 MappingNode( stream.CurrentDoc 2370 , AnchorInfo.Range.substr(1) 2371 , TagInfo.Range 2372 , MappingNode::MT_Flow); 2373 case Token::TK_Scalar: 2374 getNext(); 2375 return new (NodeAllocator) 2376 ScalarNode( stream.CurrentDoc 2377 , AnchorInfo.Range.substr(1) 2378 , TagInfo.Range 2379 , T.Range); 2380 case Token::TK_BlockScalar: { 2381 getNext(); 2382 StringRef NullTerminatedStr(T.Value.c_str(), T.Value.length() + 1); 2383 StringRef StrCopy = NullTerminatedStr.copy(NodeAllocator).drop_back(); 2384 return new (NodeAllocator) 2385 BlockScalarNode(stream.CurrentDoc, AnchorInfo.Range.substr(1), 2386 TagInfo.Range, StrCopy, T.Range); 2387 } 2388 case Token::TK_Key: 2389 // Don't eat the TK_Key, KeyValueNode expects it. 2390 return new (NodeAllocator) 2391 MappingNode( stream.CurrentDoc 2392 , AnchorInfo.Range.substr(1) 2393 , TagInfo.Range 2394 , MappingNode::MT_Inline); 2395 case Token::TK_DocumentStart: 2396 case Token::TK_DocumentEnd: 2397 case Token::TK_StreamEnd: 2398 default: 2399 // TODO: Properly handle tags. "[!!str ]" should resolve to !!str "", not 2400 // !!null null. 2401 return new (NodeAllocator) NullNode(stream.CurrentDoc); 2402 case Token::TK_Error: 2403 return nullptr; 2404 } 2405 llvm_unreachable("Control flow shouldn't reach here."); 2406 return nullptr; 2407 } 2408 2409 bool Document::parseDirectives() { 2410 bool isDirective = false; 2411 while (true) { 2412 Token T = peekNext(); 2413 if (T.Kind == Token::TK_TagDirective) { 2414 parseTAGDirective(); 2415 isDirective = true; 2416 } else if (T.Kind == Token::TK_VersionDirective) { 2417 parseYAMLDirective(); 2418 isDirective = true; 2419 } else 2420 break; 2421 } 2422 return isDirective; 2423 } 2424 2425 void Document::parseYAMLDirective() { 2426 getNext(); // Eat %YAML <version> 2427 } 2428 2429 void Document::parseTAGDirective() { 2430 Token Tag = getNext(); // %TAG <handle> <prefix> 2431 StringRef T = Tag.Range; 2432 // Strip %TAG 2433 T = T.substr(T.find_first_of(" \t")).ltrim(" \t"); 2434 std::size_t HandleEnd = T.find_first_of(" \t"); 2435 StringRef TagHandle = T.substr(0, HandleEnd); 2436 StringRef TagPrefix = T.substr(HandleEnd).ltrim(" \t"); 2437 TagMap[TagHandle] = TagPrefix; 2438 } 2439 2440 bool Document::expectToken(int TK) { 2441 Token T = getNext(); 2442 if (T.Kind != TK) { 2443 setError("Unexpected token", T); 2444 return false; 2445 } 2446 return true; 2447 } 2448