1 //===--- YAMLParser.h - Simple YAML parser --------------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This is a YAML 1.2 parser. 11 // 12 // See http://www.yaml.org/spec/1.2/spec.html for the full standard. 13 // 14 // This currently does not implement the following: 15 // * Multi-line literal folding. 16 // * Tag resolution. 17 // * UTF-16. 18 // * BOMs anywhere other than the first Unicode scalar value in the file. 19 // 20 // The most important class here is Stream. This represents a YAML stream with 21 // 0, 1, or many documents. 22 // 23 // SourceMgr sm; 24 // StringRef input = getInput(); 25 // yaml::Stream stream(input, sm); 26 // 27 // for (yaml::document_iterator di = stream.begin(), de = stream.end(); 28 // di != de; ++di) { 29 // yaml::Node *n = di->getRoot(); 30 // if (n) { 31 // // Do something with n... 32 // } else 33 // break; 34 // } 35 // 36 //===----------------------------------------------------------------------===// 37 38 #ifndef LLVM_SUPPORT_YAMLPARSER_H 39 #define LLVM_SUPPORT_YAMLPARSER_H 40 41 #include "llvm/ADT/SmallString.h" 42 #include "llvm/ADT/StringRef.h" 43 #include "llvm/Support/Allocator.h" 44 #include "llvm/Support/SMLoc.h" 45 #include <limits> 46 #include <map> 47 #include <utility> 48 49 namespace llvm { 50 class MemoryBuffer; 51 class SourceMgr; 52 class raw_ostream; 53 class Twine; 54 55 namespace yaml { 56 57 class document_iterator; 58 class Document; 59 class Node; 60 class Scanner; 61 struct Token; 62 63 /// \brief Dump all the tokens in this stream to OS. 64 /// \returns true if there was an error, false otherwise. 65 bool dumpTokens(StringRef Input, raw_ostream &); 66 67 /// \brief Scans all tokens in input without outputting anything. This is used 68 /// for benchmarking the tokenizer. 69 /// \returns true if there was an error, false otherwise. 70 bool scanTokens(StringRef Input); 71 72 /// \brief Escape \a Input for a double quoted scalar. 73 std::string escape(StringRef Input); 74 75 /// \brief This class represents a YAML stream potentially containing multiple 76 /// documents. 77 class Stream { 78 public: 79 /// \brief This keeps a reference to the string referenced by \p Input. 80 Stream(StringRef Input, SourceMgr &); 81 82 /// \brief This takes ownership of \p InputBuffer. 83 Stream(MemoryBuffer *InputBuffer, SourceMgr &); 84 ~Stream(); 85 86 document_iterator begin(); 87 document_iterator end(); 88 void skip(); 89 bool failed(); 90 bool validate() { 91 skip(); 92 return !failed(); 93 } 94 95 void printError(Node *N, const Twine &Msg); 96 97 private: 98 std::unique_ptr<Scanner> scanner; 99 std::unique_ptr<Document> CurrentDoc; 100 101 friend class Document; 102 }; 103 104 /// \brief Abstract base class for all Nodes. 105 class Node { 106 virtual void anchor(); 107 108 public: 109 enum NodeKind { 110 NK_Null, 111 NK_Scalar, 112 NK_KeyValue, 113 NK_Mapping, 114 NK_Sequence, 115 NK_Alias 116 }; 117 118 Node(unsigned int Type, std::unique_ptr<Document> &, StringRef Anchor, 119 StringRef Tag); 120 121 /// \brief Get the value of the anchor attached to this node. If it does not 122 /// have one, getAnchor().size() will be 0. 123 StringRef getAnchor() const { return Anchor; } 124 125 /// \brief Get the tag as it was written in the document. This does not 126 /// perform tag resolution. 127 StringRef getRawTag() const { return Tag; } 128 129 /// \brief Get the verbatium tag for a given Node. This performs tag resoluton 130 /// and substitution. 131 std::string getVerbatimTag() const; 132 133 SMRange getSourceRange() const { return SourceRange; } 134 void setSourceRange(SMRange SR) { SourceRange = SR; } 135 136 // These functions forward to Document and Scanner. 137 Token &peekNext(); 138 Token getNext(); 139 Node *parseBlockNode(); 140 BumpPtrAllocator &getAllocator(); 141 void setError(const Twine &Message, Token &Location) const; 142 bool failed() const; 143 144 virtual void skip() {} 145 146 unsigned int getType() const { return TypeID; } 147 148 void *operator new(size_t Size, BumpPtrAllocator &Alloc, 149 size_t Alignment = 16) throw() { 150 return Alloc.Allocate(Size, Alignment); 151 } 152 153 void operator delete(void *Ptr, BumpPtrAllocator &Alloc, size_t Size) throw() { 154 Alloc.Deallocate(Ptr, Size); 155 } 156 157 protected: 158 std::unique_ptr<Document> &Doc; 159 SMRange SourceRange; 160 161 void operator delete(void *) throw() {} 162 163 virtual ~Node() {} 164 165 private: 166 unsigned int TypeID; 167 StringRef Anchor; 168 /// \brief The tag as typed in the document. 169 StringRef Tag; 170 }; 171 172 /// \brief A null value. 173 /// 174 /// Example: 175 /// !!null null 176 class NullNode : public Node { 177 void anchor() override; 178 179 public: 180 NullNode(std::unique_ptr<Document> &D) 181 : Node(NK_Null, D, StringRef(), StringRef()) {} 182 183 static inline bool classof(const Node *N) { return N->getType() == NK_Null; } 184 }; 185 186 /// \brief A scalar node is an opaque datum that can be presented as a 187 /// series of zero or more Unicode scalar values. 188 /// 189 /// Example: 190 /// Adena 191 class ScalarNode : public Node { 192 void anchor() override; 193 194 public: 195 ScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag, 196 StringRef Val) 197 : Node(NK_Scalar, D, Anchor, Tag), Value(Val) { 198 SMLoc Start = SMLoc::getFromPointer(Val.begin()); 199 SMLoc End = SMLoc::getFromPointer(Val.end()); 200 SourceRange = SMRange(Start, End); 201 } 202 203 // Return Value without any escaping or folding or other fun YAML stuff. This 204 // is the exact bytes that are contained in the file (after conversion to 205 // utf8). 206 StringRef getRawValue() const { return Value; } 207 208 /// \brief Gets the value of this node as a StringRef. 209 /// 210 /// \param Storage is used to store the content of the returned StringRef iff 211 /// it requires any modification from how it appeared in the source. 212 /// This happens with escaped characters and multi-line literals. 213 StringRef getValue(SmallVectorImpl<char> &Storage) const; 214 215 static inline bool classof(const Node *N) { 216 return N->getType() == NK_Scalar; 217 } 218 219 private: 220 StringRef Value; 221 222 StringRef unescapeDoubleQuoted(StringRef UnquotedValue, 223 StringRef::size_type Start, 224 SmallVectorImpl<char> &Storage) const; 225 }; 226 227 /// \brief A key and value pair. While not technically a Node under the YAML 228 /// representation graph, it is easier to treat them this way. 229 /// 230 /// TODO: Consider making this not a child of Node. 231 /// 232 /// Example: 233 /// Section: .text 234 class KeyValueNode : public Node { 235 void anchor() override; 236 237 public: 238 KeyValueNode(std::unique_ptr<Document> &D) 239 : Node(NK_KeyValue, D, StringRef(), StringRef()), Key(nullptr), 240 Value(nullptr) {} 241 242 /// \brief Parse and return the key. 243 /// 244 /// This may be called multiple times. 245 /// 246 /// \returns The key, or nullptr if failed() == true. 247 Node *getKey(); 248 249 /// \brief Parse and return the value. 250 /// 251 /// This may be called multiple times. 252 /// 253 /// \returns The value, or nullptr if failed() == true. 254 Node *getValue(); 255 256 void skip() override { 257 getKey()->skip(); 258 getValue()->skip(); 259 } 260 261 static inline bool classof(const Node *N) { 262 return N->getType() == NK_KeyValue; 263 } 264 265 private: 266 Node *Key; 267 Node *Value; 268 }; 269 270 /// \brief This is an iterator abstraction over YAML collections shared by both 271 /// sequences and maps. 272 /// 273 /// BaseT must have a ValueT* member named CurrentEntry and a member function 274 /// increment() which must set CurrentEntry to 0 to create an end iterator. 275 template <class BaseT, class ValueT> 276 class basic_collection_iterator 277 : public std::iterator<std::forward_iterator_tag, ValueT> { 278 public: 279 basic_collection_iterator() : Base(nullptr) {} 280 basic_collection_iterator(BaseT *B) : Base(B) {} 281 282 ValueT *operator->() const { 283 assert(Base && Base->CurrentEntry && "Attempted to access end iterator!"); 284 return Base->CurrentEntry; 285 } 286 287 ValueT &operator*() const { 288 assert(Base && Base->CurrentEntry && 289 "Attempted to dereference end iterator!"); 290 return *Base->CurrentEntry; 291 } 292 293 operator ValueT *() const { 294 assert(Base && Base->CurrentEntry && "Attempted to access end iterator!"); 295 return Base->CurrentEntry; 296 } 297 298 bool operator!=(const basic_collection_iterator &Other) const { 299 if (Base != Other.Base) 300 return true; 301 return (Base && Other.Base) && 302 Base->CurrentEntry != Other.Base->CurrentEntry; 303 } 304 305 basic_collection_iterator &operator++() { 306 assert(Base && "Attempted to advance iterator past end!"); 307 Base->increment(); 308 // Create an end iterator. 309 if (!Base->CurrentEntry) 310 Base = nullptr; 311 return *this; 312 } 313 314 private: 315 BaseT *Base; 316 }; 317 318 // The following two templates are used for both MappingNode and Sequence Node. 319 template <class CollectionType> 320 typename CollectionType::iterator begin(CollectionType &C) { 321 assert(C.IsAtBeginning && "You may only iterate over a collection once!"); 322 C.IsAtBeginning = false; 323 typename CollectionType::iterator ret(&C); 324 ++ret; 325 return ret; 326 } 327 328 template <class CollectionType> void skip(CollectionType &C) { 329 // TODO: support skipping from the middle of a parsed collection ;/ 330 assert((C.IsAtBeginning || C.IsAtEnd) && "Cannot skip mid parse!"); 331 if (C.IsAtBeginning) 332 for (typename CollectionType::iterator i = begin(C), e = C.end(); i != e; 333 ++i) 334 i->skip(); 335 } 336 337 /// \brief Represents a YAML map created from either a block map for a flow map. 338 /// 339 /// This parses the YAML stream as increment() is called. 340 /// 341 /// Example: 342 /// Name: _main 343 /// Scope: Global 344 class MappingNode : public Node { 345 void anchor() override; 346 347 public: 348 enum MappingType { 349 MT_Block, 350 MT_Flow, 351 MT_Inline ///< An inline mapping node is used for "[key: value]". 352 }; 353 354 MappingNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag, 355 MappingType MT) 356 : Node(NK_Mapping, D, Anchor, Tag), Type(MT), IsAtBeginning(true), 357 IsAtEnd(false), CurrentEntry(nullptr) {} 358 359 friend class basic_collection_iterator<MappingNode, KeyValueNode>; 360 typedef basic_collection_iterator<MappingNode, KeyValueNode> iterator; 361 template <class T> friend typename T::iterator yaml::begin(T &); 362 template <class T> friend void yaml::skip(T &); 363 364 iterator begin() { return yaml::begin(*this); } 365 366 iterator end() { return iterator(); } 367 368 void skip() override { yaml::skip(*this); } 369 370 static inline bool classof(const Node *N) { 371 return N->getType() == NK_Mapping; 372 } 373 374 private: 375 MappingType Type; 376 bool IsAtBeginning; 377 bool IsAtEnd; 378 KeyValueNode *CurrentEntry; 379 380 void increment(); 381 }; 382 383 /// \brief Represents a YAML sequence created from either a block sequence for a 384 /// flow sequence. 385 /// 386 /// This parses the YAML stream as increment() is called. 387 /// 388 /// Example: 389 /// - Hello 390 /// - World 391 class SequenceNode : public Node { 392 void anchor() override; 393 394 public: 395 enum SequenceType { 396 ST_Block, 397 ST_Flow, 398 // Use for: 399 // 400 // key: 401 // - val1 402 // - val2 403 // 404 // As a BlockMappingEntry and BlockEnd are not created in this case. 405 ST_Indentless 406 }; 407 408 SequenceNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag, 409 SequenceType ST) 410 : Node(NK_Sequence, D, Anchor, Tag), SeqType(ST), IsAtBeginning(true), 411 IsAtEnd(false), 412 WasPreviousTokenFlowEntry(true), // Start with an imaginary ','. 413 CurrentEntry(nullptr) {} 414 415 friend class basic_collection_iterator<SequenceNode, Node>; 416 typedef basic_collection_iterator<SequenceNode, Node> iterator; 417 template <class T> friend typename T::iterator yaml::begin(T &); 418 template <class T> friend void yaml::skip(T &); 419 420 void increment(); 421 422 iterator begin() { return yaml::begin(*this); } 423 424 iterator end() { return iterator(); } 425 426 void skip() override { yaml::skip(*this); } 427 428 static inline bool classof(const Node *N) { 429 return N->getType() == NK_Sequence; 430 } 431 432 private: 433 SequenceType SeqType; 434 bool IsAtBeginning; 435 bool IsAtEnd; 436 bool WasPreviousTokenFlowEntry; 437 Node *CurrentEntry; 438 }; 439 440 /// \brief Represents an alias to a Node with an anchor. 441 /// 442 /// Example: 443 /// *AnchorName 444 class AliasNode : public Node { 445 void anchor() override; 446 447 public: 448 AliasNode(std::unique_ptr<Document> &D, StringRef Val) 449 : Node(NK_Alias, D, StringRef(), StringRef()), Name(Val) {} 450 451 StringRef getName() const { return Name; } 452 Node *getTarget(); 453 454 static inline bool classof(const Node *N) { return N->getType() == NK_Alias; } 455 456 private: 457 StringRef Name; 458 }; 459 460 /// \brief A YAML Stream is a sequence of Documents. A document contains a root 461 /// node. 462 class Document { 463 public: 464 /// \brief Root for parsing a node. Returns a single node. 465 Node *parseBlockNode(); 466 467 Document(Stream &ParentStream); 468 469 /// \brief Finish parsing the current document and return true if there are 470 /// more. Return false otherwise. 471 bool skip(); 472 473 /// \brief Parse and return the root level node. 474 Node *getRoot() { 475 if (Root) 476 return Root; 477 return Root = parseBlockNode(); 478 } 479 480 const std::map<StringRef, StringRef> &getTagMap() const { return TagMap; } 481 482 private: 483 friend class Node; 484 friend class document_iterator; 485 486 /// \brief Stream to read tokens from. 487 Stream &stream; 488 489 /// \brief Used to allocate nodes to. All are destroyed without calling their 490 /// destructor when the document is destroyed. 491 BumpPtrAllocator NodeAllocator; 492 493 /// \brief The root node. Used to support skipping a partially parsed 494 /// document. 495 Node *Root; 496 497 /// \brief Maps tag prefixes to their expansion. 498 std::map<StringRef, StringRef> TagMap; 499 500 Token &peekNext(); 501 Token getNext(); 502 void setError(const Twine &Message, Token &Location) const; 503 bool failed() const; 504 505 /// \brief Parse %BLAH directives and return true if any were encountered. 506 bool parseDirectives(); 507 508 /// \brief Parse %YAML 509 void parseYAMLDirective(); 510 511 /// \brief Parse %TAG 512 void parseTAGDirective(); 513 514 /// \brief Consume the next token and error if it is not \a TK. 515 bool expectToken(int TK); 516 }; 517 518 /// \brief Iterator abstraction for Documents over a Stream. 519 class document_iterator { 520 public: 521 document_iterator() : Doc(nullptr) {} 522 document_iterator(std::unique_ptr<Document> &D) : Doc(&D) {} 523 524 bool operator==(const document_iterator &Other) { 525 if (isAtEnd() || Other.isAtEnd()) 526 return isAtEnd() && Other.isAtEnd(); 527 528 return Doc == Other.Doc; 529 } 530 bool operator!=(const document_iterator &Other) { return !(*this == Other); } 531 532 document_iterator operator++() { 533 assert(Doc && "incrementing iterator past the end."); 534 if (!(*Doc)->skip()) { 535 Doc->reset(nullptr); 536 } else { 537 Stream &S = (*Doc)->stream; 538 Doc->reset(new Document(S)); 539 } 540 return *this; 541 } 542 543 Document &operator*() { return *Doc->get(); } 544 545 std::unique_ptr<Document> &operator->() { return *Doc; } 546 547 private: 548 bool isAtEnd() const { return !Doc || !*Doc; } 549 550 std::unique_ptr<Document> *Doc; 551 }; 552 553 } // End namespace yaml. 554 555 } // End namespace llvm. 556 557 #endif 558