1 //===- YAMLParser.h - Simple YAML parser ------------------------*- C++ -*-===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This is a YAML 1.2 parser. 11 // 12 // See http://www.yaml.org/spec/1.2/spec.html for the full standard. 13 // 14 // This currently does not implement the following: 15 // * Multi-line literal folding. 16 // * Tag resolution. 17 // * UTF-16. 18 // * BOMs anywhere other than the first Unicode scalar value in the file. 19 // 20 // The most important class here is Stream. This represents a YAML stream with 21 // 0, 1, or many documents. 22 // 23 // SourceMgr sm; 24 // StringRef input = getInput(); 25 // yaml::Stream stream(input, sm); 26 // 27 // for (yaml::document_iterator di = stream.begin(), de = stream.end(); 28 // di != de; ++di) { 29 // yaml::Node *n = di->getRoot(); 30 // if (n) { 31 // // Do something with n... 32 // } else 33 // break; 34 // } 35 // 36 //===----------------------------------------------------------------------===// 37 38 #ifndef LLVM_SUPPORT_YAMLPARSER_H 39 #define LLVM_SUPPORT_YAMLPARSER_H 40 41 #include "llvm/ADT/StringRef.h" 42 #include "llvm/Support/Allocator.h" 43 #include "llvm/Support/SMLoc.h" 44 #include <cassert> 45 #include <cstddef> 46 #include <iterator> 47 #include <map> 48 #include <memory> 49 #include <string> 50 #include <system_error> 51 52 namespace llvm { 53 54 class MemoryBufferRef; 55 class SourceMgr; 56 class raw_ostream; 57 class Twine; 58 59 namespace yaml { 60 61 class Document; 62 class document_iterator; 63 class Node; 64 class Scanner; 65 struct Token; 66 67 /// Dump all the tokens in this stream to OS. 68 /// \returns true if there was an error, false otherwise. 69 bool dumpTokens(StringRef Input, raw_ostream &); 70 71 /// Scans all tokens in input without outputting anything. This is used 72 /// for benchmarking the tokenizer. 73 /// \returns true if there was an error, false otherwise. 74 bool scanTokens(StringRef Input); 75 76 /// Escape \a Input for a double quoted scalar; if \p EscapePrintable 77 /// is true, all UTF8 sequences will be escaped, if \p EscapePrintable is 78 /// false, those UTF8 sequences encoding printable unicode scalars will not be 79 /// escaped, but emitted verbatim. 80 std::string escape(StringRef Input, bool EscapePrintable = true); 81 82 /// This class represents a YAML stream potentially containing multiple 83 /// documents. 84 class Stream { 85 public: 86 /// This keeps a reference to the string referenced by \p Input. 87 Stream(StringRef Input, SourceMgr &, bool ShowColors = true, 88 std::error_code *EC = nullptr); 89 90 Stream(MemoryBufferRef InputBuffer, SourceMgr &, bool ShowColors = true, 91 std::error_code *EC = nullptr); 92 ~Stream(); 93 94 document_iterator begin(); 95 document_iterator end(); 96 void skip(); 97 bool failed(); 98 99 bool validate() { 100 skip(); 101 return !failed(); 102 } 103 104 void printError(Node *N, const Twine &Msg); 105 106 private: 107 friend class Document; 108 109 std::unique_ptr<Scanner> scanner; 110 std::unique_ptr<Document> CurrentDoc; 111 }; 112 113 /// Abstract base class for all Nodes. 114 class Node { 115 virtual void anchor(); 116 117 public: 118 enum NodeKind { 119 NK_Null, 120 NK_Scalar, 121 NK_BlockScalar, 122 NK_KeyValue, 123 NK_Mapping, 124 NK_Sequence, 125 NK_Alias 126 }; 127 128 Node(unsigned int Type, std::unique_ptr<Document> &, StringRef Anchor, 129 StringRef Tag); 130 131 // It's not safe to copy YAML nodes; the document is streamed and the position 132 // is part of the state. 133 Node(const Node &) = delete; 134 void operator=(const Node &) = delete; 135 136 void *operator new(size_t Size, BumpPtrAllocator &Alloc, 137 size_t Alignment = 16) noexcept { 138 return Alloc.Allocate(Size, Alignment); 139 } 140 141 void operator delete(void *Ptr, BumpPtrAllocator &Alloc, 142 size_t Size) noexcept { 143 Alloc.Deallocate(Ptr, Size); 144 } 145 146 void operator delete(void *) noexcept = delete; 147 148 /// Get the value of the anchor attached to this node. If it does not 149 /// have one, getAnchor().size() will be 0. 150 StringRef getAnchor() const { return Anchor; } 151 152 /// Get the tag as it was written in the document. This does not 153 /// perform tag resolution. 154 StringRef getRawTag() const { return Tag; } 155 156 /// Get the verbatium tag for a given Node. This performs tag resoluton 157 /// and substitution. 158 std::string getVerbatimTag() const; 159 160 SMRange getSourceRange() const { return SourceRange; } 161 void setSourceRange(SMRange SR) { SourceRange = SR; } 162 163 // These functions forward to Document and Scanner. 164 Token &peekNext(); 165 Token getNext(); 166 Node *parseBlockNode(); 167 BumpPtrAllocator &getAllocator(); 168 void setError(const Twine &Message, Token &Location) const; 169 bool failed() const; 170 171 virtual void skip() {} 172 173 unsigned int getType() const { return TypeID; } 174 175 protected: 176 std::unique_ptr<Document> &Doc; 177 SMRange SourceRange; 178 179 ~Node() = default; 180 181 private: 182 unsigned int TypeID; 183 StringRef Anchor; 184 /// The tag as typed in the document. 185 StringRef Tag; 186 }; 187 188 /// A null value. 189 /// 190 /// Example: 191 /// !!null null 192 class NullNode final : public Node { 193 void anchor() override; 194 195 public: 196 NullNode(std::unique_ptr<Document> &D) 197 : Node(NK_Null, D, StringRef(), StringRef()) {} 198 199 static bool classof(const Node *N) { return N->getType() == NK_Null; } 200 }; 201 202 /// A scalar node is an opaque datum that can be presented as a 203 /// series of zero or more Unicode scalar values. 204 /// 205 /// Example: 206 /// Adena 207 class ScalarNode final : public Node { 208 void anchor() override; 209 210 public: 211 ScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag, 212 StringRef Val) 213 : Node(NK_Scalar, D, Anchor, Tag), Value(Val) { 214 SMLoc Start = SMLoc::getFromPointer(Val.begin()); 215 SMLoc End = SMLoc::getFromPointer(Val.end()); 216 SourceRange = SMRange(Start, End); 217 } 218 219 // Return Value without any escaping or folding or other fun YAML stuff. This 220 // is the exact bytes that are contained in the file (after conversion to 221 // utf8). 222 StringRef getRawValue() const { return Value; } 223 224 /// Gets the value of this node as a StringRef. 225 /// 226 /// \param Storage is used to store the content of the returned StringRef iff 227 /// it requires any modification from how it appeared in the source. 228 /// This happens with escaped characters and multi-line literals. 229 StringRef getValue(SmallVectorImpl<char> &Storage) const; 230 231 static bool classof(const Node *N) { 232 return N->getType() == NK_Scalar; 233 } 234 235 private: 236 StringRef Value; 237 238 StringRef unescapeDoubleQuoted(StringRef UnquotedValue, 239 StringRef::size_type Start, 240 SmallVectorImpl<char> &Storage) const; 241 }; 242 243 /// A block scalar node is an opaque datum that can be presented as a 244 /// series of zero or more Unicode scalar values. 245 /// 246 /// Example: 247 /// | 248 /// Hello 249 /// World 250 class BlockScalarNode final : public Node { 251 void anchor() override; 252 253 public: 254 BlockScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag, 255 StringRef Value, StringRef RawVal) 256 : Node(NK_BlockScalar, D, Anchor, Tag), Value(Value) { 257 SMLoc Start = SMLoc::getFromPointer(RawVal.begin()); 258 SMLoc End = SMLoc::getFromPointer(RawVal.end()); 259 SourceRange = SMRange(Start, End); 260 } 261 262 /// Gets the value of this node as a StringRef. 263 StringRef getValue() const { return Value; } 264 265 static bool classof(const Node *N) { 266 return N->getType() == NK_BlockScalar; 267 } 268 269 private: 270 StringRef Value; 271 }; 272 273 /// A key and value pair. While not technically a Node under the YAML 274 /// representation graph, it is easier to treat them this way. 275 /// 276 /// TODO: Consider making this not a child of Node. 277 /// 278 /// Example: 279 /// Section: .text 280 class KeyValueNode final : public Node { 281 void anchor() override; 282 283 public: 284 KeyValueNode(std::unique_ptr<Document> &D) 285 : Node(NK_KeyValue, D, StringRef(), StringRef()) {} 286 287 /// Parse and return the key. 288 /// 289 /// This may be called multiple times. 290 /// 291 /// \returns The key, or nullptr if failed() == true. 292 Node *getKey(); 293 294 /// Parse and return the value. 295 /// 296 /// This may be called multiple times. 297 /// 298 /// \returns The value, or nullptr if failed() == true. 299 Node *getValue(); 300 301 void skip() override { 302 if (Node *Key = getKey()) { 303 Key->skip(); 304 if (Node *Val = getValue()) 305 Val->skip(); 306 } 307 } 308 309 static bool classof(const Node *N) { 310 return N->getType() == NK_KeyValue; 311 } 312 313 private: 314 Node *Key = nullptr; 315 Node *Value = nullptr; 316 }; 317 318 /// This is an iterator abstraction over YAML collections shared by both 319 /// sequences and maps. 320 /// 321 /// BaseT must have a ValueT* member named CurrentEntry and a member function 322 /// increment() which must set CurrentEntry to 0 to create an end iterator. 323 template <class BaseT, class ValueT> 324 class basic_collection_iterator 325 : public std::iterator<std::input_iterator_tag, ValueT> { 326 public: 327 basic_collection_iterator() = default; 328 basic_collection_iterator(BaseT *B) : Base(B) {} 329 330 ValueT *operator->() const { 331 assert(Base && Base->CurrentEntry && "Attempted to access end iterator!"); 332 return Base->CurrentEntry; 333 } 334 335 ValueT &operator*() const { 336 assert(Base && Base->CurrentEntry && 337 "Attempted to dereference end iterator!"); 338 return *Base->CurrentEntry; 339 } 340 341 operator ValueT *() const { 342 assert(Base && Base->CurrentEntry && "Attempted to access end iterator!"); 343 return Base->CurrentEntry; 344 } 345 346 /// Note on EqualityComparable: 347 /// 348 /// The iterator is not re-entrant, 349 /// it is meant to be used for parsing YAML on-demand 350 /// Once iteration started - it can point only to one entry at a time 351 /// hence Base.CurrentEntry and Other.Base.CurrentEntry are equal 352 /// iff Base and Other.Base are equal. 353 bool operator==(const basic_collection_iterator &Other) const { 354 if (Base && (Base == Other.Base)) { 355 assert((Base->CurrentEntry == Other.Base->CurrentEntry) 356 && "Equal Bases expected to point to equal Entries"); 357 } 358 359 return Base == Other.Base; 360 } 361 362 bool operator!=(const basic_collection_iterator &Other) const { 363 return !(Base == Other.Base); 364 } 365 366 basic_collection_iterator &operator++() { 367 assert(Base && "Attempted to advance iterator past end!"); 368 Base->increment(); 369 // Create an end iterator. 370 if (!Base->CurrentEntry) 371 Base = nullptr; 372 return *this; 373 } 374 375 private: 376 BaseT *Base = nullptr; 377 }; 378 379 // The following two templates are used for both MappingNode and Sequence Node. 380 template <class CollectionType> 381 typename CollectionType::iterator begin(CollectionType &C) { 382 assert(C.IsAtBeginning && "You may only iterate over a collection once!"); 383 C.IsAtBeginning = false; 384 typename CollectionType::iterator ret(&C); 385 ++ret; 386 return ret; 387 } 388 389 template <class CollectionType> void skip(CollectionType &C) { 390 // TODO: support skipping from the middle of a parsed collection ;/ 391 assert((C.IsAtBeginning || C.IsAtEnd) && "Cannot skip mid parse!"); 392 if (C.IsAtBeginning) 393 for (typename CollectionType::iterator i = begin(C), e = C.end(); i != e; 394 ++i) 395 i->skip(); 396 } 397 398 /// Represents a YAML map created from either a block map for a flow map. 399 /// 400 /// This parses the YAML stream as increment() is called. 401 /// 402 /// Example: 403 /// Name: _main 404 /// Scope: Global 405 class MappingNode final : public Node { 406 void anchor() override; 407 408 public: 409 enum MappingType { 410 MT_Block, 411 MT_Flow, 412 MT_Inline ///< An inline mapping node is used for "[key: value]". 413 }; 414 415 MappingNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag, 416 MappingType MT) 417 : Node(NK_Mapping, D, Anchor, Tag), Type(MT) {} 418 419 friend class basic_collection_iterator<MappingNode, KeyValueNode>; 420 421 using iterator = basic_collection_iterator<MappingNode, KeyValueNode>; 422 423 template <class T> friend typename T::iterator yaml::begin(T &); 424 template <class T> friend void yaml::skip(T &); 425 426 iterator begin() { return yaml::begin(*this); } 427 428 iterator end() { return iterator(); } 429 430 void skip() override { yaml::skip(*this); } 431 432 static bool classof(const Node *N) { 433 return N->getType() == NK_Mapping; 434 } 435 436 private: 437 MappingType Type; 438 bool IsAtBeginning = true; 439 bool IsAtEnd = false; 440 KeyValueNode *CurrentEntry = nullptr; 441 442 void increment(); 443 }; 444 445 /// Represents a YAML sequence created from either a block sequence for a 446 /// flow sequence. 447 /// 448 /// This parses the YAML stream as increment() is called. 449 /// 450 /// Example: 451 /// - Hello 452 /// - World 453 class SequenceNode final : public Node { 454 void anchor() override; 455 456 public: 457 enum SequenceType { 458 ST_Block, 459 ST_Flow, 460 // Use for: 461 // 462 // key: 463 // - val1 464 // - val2 465 // 466 // As a BlockMappingEntry and BlockEnd are not created in this case. 467 ST_Indentless 468 }; 469 470 SequenceNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag, 471 SequenceType ST) 472 : Node(NK_Sequence, D, Anchor, Tag), SeqType(ST) {} 473 474 friend class basic_collection_iterator<SequenceNode, Node>; 475 476 using iterator = basic_collection_iterator<SequenceNode, Node>; 477 478 template <class T> friend typename T::iterator yaml::begin(T &); 479 template <class T> friend void yaml::skip(T &); 480 481 void increment(); 482 483 iterator begin() { return yaml::begin(*this); } 484 485 iterator end() { return iterator(); } 486 487 void skip() override { yaml::skip(*this); } 488 489 static bool classof(const Node *N) { 490 return N->getType() == NK_Sequence; 491 } 492 493 private: 494 SequenceType SeqType; 495 bool IsAtBeginning = true; 496 bool IsAtEnd = false; 497 bool WasPreviousTokenFlowEntry = true; // Start with an imaginary ','. 498 Node *CurrentEntry = nullptr; 499 }; 500 501 /// Represents an alias to a Node with an anchor. 502 /// 503 /// Example: 504 /// *AnchorName 505 class AliasNode final : public Node { 506 void anchor() override; 507 508 public: 509 AliasNode(std::unique_ptr<Document> &D, StringRef Val) 510 : Node(NK_Alias, D, StringRef(), StringRef()), Name(Val) {} 511 512 StringRef getName() const { return Name; } 513 Node *getTarget(); 514 515 static bool classof(const Node *N) { return N->getType() == NK_Alias; } 516 517 private: 518 StringRef Name; 519 }; 520 521 /// A YAML Stream is a sequence of Documents. A document contains a root 522 /// node. 523 class Document { 524 public: 525 Document(Stream &ParentStream); 526 527 /// Root for parsing a node. Returns a single node. 528 Node *parseBlockNode(); 529 530 /// Finish parsing the current document and return true if there are 531 /// more. Return false otherwise. 532 bool skip(); 533 534 /// Parse and return the root level node. 535 Node *getRoot() { 536 if (Root) 537 return Root; 538 return Root = parseBlockNode(); 539 } 540 541 const std::map<StringRef, StringRef> &getTagMap() const { return TagMap; } 542 543 private: 544 friend class Node; 545 friend class document_iterator; 546 547 /// Stream to read tokens from. 548 Stream &stream; 549 550 /// Used to allocate nodes to. All are destroyed without calling their 551 /// destructor when the document is destroyed. 552 BumpPtrAllocator NodeAllocator; 553 554 /// The root node. Used to support skipping a partially parsed 555 /// document. 556 Node *Root; 557 558 /// Maps tag prefixes to their expansion. 559 std::map<StringRef, StringRef> TagMap; 560 561 Token &peekNext(); 562 Token getNext(); 563 void setError(const Twine &Message, Token &Location) const; 564 bool failed() const; 565 566 /// Parse %BLAH directives and return true if any were encountered. 567 bool parseDirectives(); 568 569 /// Parse %YAML 570 void parseYAMLDirective(); 571 572 /// Parse %TAG 573 void parseTAGDirective(); 574 575 /// Consume the next token and error if it is not \a TK. 576 bool expectToken(int TK); 577 }; 578 579 /// Iterator abstraction for Documents over a Stream. 580 class document_iterator { 581 public: 582 document_iterator() = default; 583 document_iterator(std::unique_ptr<Document> &D) : Doc(&D) {} 584 585 bool operator==(const document_iterator &Other) const { 586 if (isAtEnd() || Other.isAtEnd()) 587 return isAtEnd() && Other.isAtEnd(); 588 589 return Doc == Other.Doc; 590 } 591 bool operator!=(const document_iterator &Other) const { 592 return !(*this == Other); 593 } 594 595 document_iterator operator++() { 596 assert(Doc && "incrementing iterator past the end."); 597 if (!(*Doc)->skip()) { 598 Doc->reset(nullptr); 599 } else { 600 Stream &S = (*Doc)->stream; 601 Doc->reset(new Document(S)); 602 } 603 return *this; 604 } 605 606 Document &operator*() { return *Doc->get(); } 607 608 std::unique_ptr<Document> &operator->() { return *Doc; } 609 610 private: 611 bool isAtEnd() const { return !Doc || !*Doc; } 612 613 std::unique_ptr<Document> *Doc = nullptr; 614 }; 615 616 } // end namespace yaml 617 618 } // end namespace llvm 619 620 #endif // LLVM_SUPPORT_YAMLPARSER_H 621