1 //===--- YAMLParser.h - Simple YAML parser --------------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This is a YAML 1.2 parser. 11 // 12 // See http://www.yaml.org/spec/1.2/spec.html for the full standard. 13 // 14 // This currently does not implement the following: 15 // * Multi-line literal folding. 16 // * Tag resolution. 17 // * UTF-16. 18 // * BOMs anywhere other than the first Unicode scalar value in the file. 19 // 20 // The most important class here is Stream. This represents a YAML stream with 21 // 0, 1, or many documents. 22 // 23 // SourceMgr sm; 24 // StringRef input = getInput(); 25 // yaml::Stream stream(input, sm); 26 // 27 // for (yaml::document_iterator di = stream.begin(), de = stream.end(); 28 // di != de; ++di) { 29 // yaml::Node *n = di->getRoot(); 30 // if (n) { 31 // // Do something with n... 32 // } else 33 // break; 34 // } 35 // 36 //===----------------------------------------------------------------------===// 37 38 #ifndef LLVM_SUPPORT_YAMLPARSER_H 39 #define LLVM_SUPPORT_YAMLPARSER_H 40 41 #include "llvm/ADT/StringRef.h" 42 #include "llvm/Support/Allocator.h" 43 #include "llvm/Support/SMLoc.h" 44 #include <limits> 45 #include <map> 46 #include <utility> 47 48 namespace llvm { 49 class MemoryBufferRef; 50 class SourceMgr; 51 class Twine; 52 class raw_ostream; 53 54 namespace yaml { 55 56 class document_iterator; 57 class Document; 58 class Node; 59 class Scanner; 60 struct Token; 61 62 /// \brief Dump all the tokens in this stream to OS. 63 /// \returns true if there was an error, false otherwise. 64 bool dumpTokens(StringRef Input, raw_ostream &); 65 66 /// \brief Scans all tokens in input without outputting anything. This is used 67 /// for benchmarking the tokenizer. 68 /// \returns true if there was an error, false otherwise. 69 bool scanTokens(StringRef Input); 70 71 /// \brief Escape \a Input for a double quoted scalar. 72 std::string escape(StringRef Input); 73 74 /// \brief This class represents a YAML stream potentially containing multiple 75 /// documents. 76 class Stream { 77 public: 78 /// \brief This keeps a reference to the string referenced by \p Input. 79 Stream(StringRef Input, SourceMgr &, bool ShowColors = true); 80 81 Stream(MemoryBufferRef InputBuffer, SourceMgr &, bool ShowColors = true); 82 ~Stream(); 83 84 document_iterator begin(); 85 document_iterator end(); 86 void skip(); 87 bool failed(); 88 bool validate() { 89 skip(); 90 return !failed(); 91 } 92 93 void printError(Node *N, const Twine &Msg); 94 95 private: 96 std::unique_ptr<Scanner> scanner; 97 std::unique_ptr<Document> CurrentDoc; 98 99 friend class Document; 100 }; 101 102 /// \brief Abstract base class for all Nodes. 103 class Node { 104 virtual void anchor(); 105 106 public: 107 enum NodeKind { 108 NK_Null, 109 NK_Scalar, 110 NK_BlockScalar, 111 NK_KeyValue, 112 NK_Mapping, 113 NK_Sequence, 114 NK_Alias 115 }; 116 117 Node(unsigned int Type, std::unique_ptr<Document> &, StringRef Anchor, 118 StringRef Tag); 119 120 /// \brief Get the value of the anchor attached to this node. If it does not 121 /// have one, getAnchor().size() will be 0. 122 StringRef getAnchor() const { return Anchor; } 123 124 /// \brief Get the tag as it was written in the document. This does not 125 /// perform tag resolution. 126 StringRef getRawTag() const { return Tag; } 127 128 /// \brief Get the verbatium tag for a given Node. This performs tag resoluton 129 /// and substitution. 130 std::string getVerbatimTag() const; 131 132 SMRange getSourceRange() const { return SourceRange; } 133 void setSourceRange(SMRange SR) { SourceRange = SR; } 134 135 // These functions forward to Document and Scanner. 136 Token &peekNext(); 137 Token getNext(); 138 Node *parseBlockNode(); 139 BumpPtrAllocator &getAllocator(); 140 void setError(const Twine &Message, Token &Location) const; 141 bool failed() const; 142 143 virtual void skip() {} 144 145 unsigned int getType() const { return TypeID; } 146 147 void *operator new(size_t Size, BumpPtrAllocator &Alloc, 148 size_t Alignment = 16) LLVM_NOEXCEPT { 149 return Alloc.Allocate(Size, Alignment); 150 } 151 152 void operator delete(void *Ptr, BumpPtrAllocator &Alloc, 153 size_t Size) LLVM_NOEXCEPT { 154 Alloc.Deallocate(Ptr, Size); 155 } 156 157 protected: 158 std::unique_ptr<Document> &Doc; 159 SMRange SourceRange; 160 161 void operator delete(void *) LLVM_NOEXCEPT = delete; 162 163 ~Node() = default; 164 165 private: 166 unsigned int TypeID; 167 StringRef Anchor; 168 /// \brief The tag as typed in the document. 169 StringRef Tag; 170 }; 171 172 /// \brief A null value. 173 /// 174 /// Example: 175 /// !!null null 176 class NullNode final : public Node { 177 void anchor() override; 178 179 public: 180 NullNode(std::unique_ptr<Document> &D) 181 : Node(NK_Null, D, StringRef(), StringRef()) {} 182 183 static inline bool classof(const Node *N) { return N->getType() == NK_Null; } 184 }; 185 186 /// \brief A scalar node is an opaque datum that can be presented as a 187 /// series of zero or more Unicode scalar values. 188 /// 189 /// Example: 190 /// Adena 191 class ScalarNode final : public Node { 192 void anchor() override; 193 194 public: 195 ScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag, 196 StringRef Val) 197 : Node(NK_Scalar, D, Anchor, Tag), Value(Val) { 198 SMLoc Start = SMLoc::getFromPointer(Val.begin()); 199 SMLoc End = SMLoc::getFromPointer(Val.end()); 200 SourceRange = SMRange(Start, End); 201 } 202 203 // Return Value without any escaping or folding or other fun YAML stuff. This 204 // is the exact bytes that are contained in the file (after conversion to 205 // utf8). 206 StringRef getRawValue() const { return Value; } 207 208 /// \brief Gets the value of this node as a StringRef. 209 /// 210 /// \param Storage is used to store the content of the returned StringRef iff 211 /// it requires any modification from how it appeared in the source. 212 /// This happens with escaped characters and multi-line literals. 213 StringRef getValue(SmallVectorImpl<char> &Storage) const; 214 215 static inline bool classof(const Node *N) { 216 return N->getType() == NK_Scalar; 217 } 218 219 private: 220 StringRef Value; 221 222 StringRef unescapeDoubleQuoted(StringRef UnquotedValue, 223 StringRef::size_type Start, 224 SmallVectorImpl<char> &Storage) const; 225 }; 226 227 /// \brief A block scalar node is an opaque datum that can be presented as a 228 /// series of zero or more Unicode scalar values. 229 /// 230 /// Example: 231 /// | 232 /// Hello 233 /// World 234 class BlockScalarNode final : public Node { 235 void anchor() override; 236 237 public: 238 BlockScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag, 239 StringRef Value, StringRef RawVal) 240 : Node(NK_BlockScalar, D, Anchor, Tag), Value(Value) { 241 SMLoc Start = SMLoc::getFromPointer(RawVal.begin()); 242 SMLoc End = SMLoc::getFromPointer(RawVal.end()); 243 SourceRange = SMRange(Start, End); 244 } 245 246 /// \brief Gets the value of this node as a StringRef. 247 StringRef getValue() const { return Value; } 248 249 static inline bool classof(const Node *N) { 250 return N->getType() == NK_BlockScalar; 251 } 252 253 private: 254 StringRef Value; 255 }; 256 257 /// \brief A key and value pair. While not technically a Node under the YAML 258 /// representation graph, it is easier to treat them this way. 259 /// 260 /// TODO: Consider making this not a child of Node. 261 /// 262 /// Example: 263 /// Section: .text 264 class KeyValueNode final : public Node { 265 void anchor() override; 266 267 public: 268 KeyValueNode(std::unique_ptr<Document> &D) 269 : Node(NK_KeyValue, D, StringRef(), StringRef()), Key(nullptr), 270 Value(nullptr) {} 271 272 /// \brief Parse and return the key. 273 /// 274 /// This may be called multiple times. 275 /// 276 /// \returns The key, or nullptr if failed() == true. 277 Node *getKey(); 278 279 /// \brief Parse and return the value. 280 /// 281 /// This may be called multiple times. 282 /// 283 /// \returns The value, or nullptr if failed() == true. 284 Node *getValue(); 285 286 void skip() override { 287 getKey()->skip(); 288 if (Node *Val = getValue()) 289 Val->skip(); 290 } 291 292 static inline bool classof(const Node *N) { 293 return N->getType() == NK_KeyValue; 294 } 295 296 private: 297 Node *Key; 298 Node *Value; 299 }; 300 301 /// \brief This is an iterator abstraction over YAML collections shared by both 302 /// sequences and maps. 303 /// 304 /// BaseT must have a ValueT* member named CurrentEntry and a member function 305 /// increment() which must set CurrentEntry to 0 to create an end iterator. 306 template <class BaseT, class ValueT> 307 class basic_collection_iterator 308 : public std::iterator<std::forward_iterator_tag, ValueT> { 309 public: 310 basic_collection_iterator() : Base(nullptr) {} 311 basic_collection_iterator(BaseT *B) : Base(B) {} 312 313 ValueT *operator->() const { 314 assert(Base && Base->CurrentEntry && "Attempted to access end iterator!"); 315 return Base->CurrentEntry; 316 } 317 318 ValueT &operator*() const { 319 assert(Base && Base->CurrentEntry && 320 "Attempted to dereference end iterator!"); 321 return *Base->CurrentEntry; 322 } 323 324 operator ValueT *() const { 325 assert(Base && Base->CurrentEntry && "Attempted to access end iterator!"); 326 return Base->CurrentEntry; 327 } 328 329 bool operator!=(const basic_collection_iterator &Other) const { 330 if (Base != Other.Base) 331 return true; 332 return (Base && Other.Base) && 333 Base->CurrentEntry != Other.Base->CurrentEntry; 334 } 335 336 basic_collection_iterator &operator++() { 337 assert(Base && "Attempted to advance iterator past end!"); 338 Base->increment(); 339 // Create an end iterator. 340 if (!Base->CurrentEntry) 341 Base = nullptr; 342 return *this; 343 } 344 345 private: 346 BaseT *Base; 347 }; 348 349 // The following two templates are used for both MappingNode and Sequence Node. 350 template <class CollectionType> 351 typename CollectionType::iterator begin(CollectionType &C) { 352 assert(C.IsAtBeginning && "You may only iterate over a collection once!"); 353 C.IsAtBeginning = false; 354 typename CollectionType::iterator ret(&C); 355 ++ret; 356 return ret; 357 } 358 359 template <class CollectionType> void skip(CollectionType &C) { 360 // TODO: support skipping from the middle of a parsed collection ;/ 361 assert((C.IsAtBeginning || C.IsAtEnd) && "Cannot skip mid parse!"); 362 if (C.IsAtBeginning) 363 for (typename CollectionType::iterator i = begin(C), e = C.end(); i != e; 364 ++i) 365 i->skip(); 366 } 367 368 /// \brief Represents a YAML map created from either a block map for a flow map. 369 /// 370 /// This parses the YAML stream as increment() is called. 371 /// 372 /// Example: 373 /// Name: _main 374 /// Scope: Global 375 class MappingNode final : public Node { 376 void anchor() override; 377 378 public: 379 enum MappingType { 380 MT_Block, 381 MT_Flow, 382 MT_Inline ///< An inline mapping node is used for "[key: value]". 383 }; 384 385 MappingNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag, 386 MappingType MT) 387 : Node(NK_Mapping, D, Anchor, Tag), Type(MT), IsAtBeginning(true), 388 IsAtEnd(false), CurrentEntry(nullptr) {} 389 390 friend class basic_collection_iterator<MappingNode, KeyValueNode>; 391 typedef basic_collection_iterator<MappingNode, KeyValueNode> iterator; 392 template <class T> friend typename T::iterator yaml::begin(T &); 393 template <class T> friend void yaml::skip(T &); 394 395 iterator begin() { return yaml::begin(*this); } 396 397 iterator end() { return iterator(); } 398 399 void skip() override { yaml::skip(*this); } 400 401 static inline bool classof(const Node *N) { 402 return N->getType() == NK_Mapping; 403 } 404 405 private: 406 MappingType Type; 407 bool IsAtBeginning; 408 bool IsAtEnd; 409 KeyValueNode *CurrentEntry; 410 411 void increment(); 412 }; 413 414 /// \brief Represents a YAML sequence created from either a block sequence for a 415 /// flow sequence. 416 /// 417 /// This parses the YAML stream as increment() is called. 418 /// 419 /// Example: 420 /// - Hello 421 /// - World 422 class SequenceNode final : public Node { 423 void anchor() override; 424 425 public: 426 enum SequenceType { 427 ST_Block, 428 ST_Flow, 429 // Use for: 430 // 431 // key: 432 // - val1 433 // - val2 434 // 435 // As a BlockMappingEntry and BlockEnd are not created in this case. 436 ST_Indentless 437 }; 438 439 SequenceNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag, 440 SequenceType ST) 441 : Node(NK_Sequence, D, Anchor, Tag), SeqType(ST), IsAtBeginning(true), 442 IsAtEnd(false), 443 WasPreviousTokenFlowEntry(true), // Start with an imaginary ','. 444 CurrentEntry(nullptr) {} 445 446 friend class basic_collection_iterator<SequenceNode, Node>; 447 typedef basic_collection_iterator<SequenceNode, Node> iterator; 448 template <class T> friend typename T::iterator yaml::begin(T &); 449 template <class T> friend void yaml::skip(T &); 450 451 void increment(); 452 453 iterator begin() { return yaml::begin(*this); } 454 455 iterator end() { return iterator(); } 456 457 void skip() override { yaml::skip(*this); } 458 459 static inline bool classof(const Node *N) { 460 return N->getType() == NK_Sequence; 461 } 462 463 private: 464 SequenceType SeqType; 465 bool IsAtBeginning; 466 bool IsAtEnd; 467 bool WasPreviousTokenFlowEntry; 468 Node *CurrentEntry; 469 }; 470 471 /// \brief Represents an alias to a Node with an anchor. 472 /// 473 /// Example: 474 /// *AnchorName 475 class AliasNode final : public Node { 476 void anchor() override; 477 478 public: 479 AliasNode(std::unique_ptr<Document> &D, StringRef Val) 480 : Node(NK_Alias, D, StringRef(), StringRef()), Name(Val) {} 481 482 StringRef getName() const { return Name; } 483 Node *getTarget(); 484 485 static inline bool classof(const Node *N) { return N->getType() == NK_Alias; } 486 487 private: 488 StringRef Name; 489 }; 490 491 /// \brief A YAML Stream is a sequence of Documents. A document contains a root 492 /// node. 493 class Document { 494 public: 495 /// \brief Root for parsing a node. Returns a single node. 496 Node *parseBlockNode(); 497 498 Document(Stream &ParentStream); 499 500 /// \brief Finish parsing the current document and return true if there are 501 /// more. Return false otherwise. 502 bool skip(); 503 504 /// \brief Parse and return the root level node. 505 Node *getRoot() { 506 if (Root) 507 return Root; 508 return Root = parseBlockNode(); 509 } 510 511 const std::map<StringRef, StringRef> &getTagMap() const { return TagMap; } 512 513 private: 514 friend class Node; 515 friend class document_iterator; 516 517 /// \brief Stream to read tokens from. 518 Stream &stream; 519 520 /// \brief Used to allocate nodes to. All are destroyed without calling their 521 /// destructor when the document is destroyed. 522 BumpPtrAllocator NodeAllocator; 523 524 /// \brief The root node. Used to support skipping a partially parsed 525 /// document. 526 Node *Root; 527 528 /// \brief Maps tag prefixes to their expansion. 529 std::map<StringRef, StringRef> TagMap; 530 531 Token &peekNext(); 532 Token getNext(); 533 void setError(const Twine &Message, Token &Location) const; 534 bool failed() const; 535 536 /// \brief Parse %BLAH directives and return true if any were encountered. 537 bool parseDirectives(); 538 539 /// \brief Parse %YAML 540 void parseYAMLDirective(); 541 542 /// \brief Parse %TAG 543 void parseTAGDirective(); 544 545 /// \brief Consume the next token and error if it is not \a TK. 546 bool expectToken(int TK); 547 }; 548 549 /// \brief Iterator abstraction for Documents over a Stream. 550 class document_iterator { 551 public: 552 document_iterator() : Doc(nullptr) {} 553 document_iterator(std::unique_ptr<Document> &D) : Doc(&D) {} 554 555 bool operator==(const document_iterator &Other) { 556 if (isAtEnd() || Other.isAtEnd()) 557 return isAtEnd() && Other.isAtEnd(); 558 559 return Doc == Other.Doc; 560 } 561 bool operator!=(const document_iterator &Other) { return !(*this == Other); } 562 563 document_iterator operator++() { 564 assert(Doc && "incrementing iterator past the end."); 565 if (!(*Doc)->skip()) { 566 Doc->reset(nullptr); 567 } else { 568 Stream &S = (*Doc)->stream; 569 Doc->reset(new Document(S)); 570 } 571 return *this; 572 } 573 574 Document &operator*() { return *Doc->get(); } 575 576 std::unique_ptr<Document> &operator->() { return *Doc; } 577 578 private: 579 bool isAtEnd() const { return !Doc || !*Doc; } 580 581 std::unique_ptr<Document> *Doc; 582 }; 583 584 } // End namespace yaml. 585 586 } // End namespace llvm. 587 588 #endif 589