Home | History | Annotate | Download | only in Support
      1 //===- YAMLParser.h - Simple YAML parser ------------------------*- C++ -*-===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 //  This is a YAML 1.2 parser.
     11 //
     12 //  See http://www.yaml.org/spec/1.2/spec.html for the full standard.
     13 //
     14 //  This currently does not implement the following:
     15 //    * Multi-line literal folding.
     16 //    * Tag resolution.
     17 //    * UTF-16.
     18 //    * BOMs anywhere other than the first Unicode scalar value in the file.
     19 //
     20 //  The most important class here is Stream. This represents a YAML stream with
     21 //  0, 1, or many documents.
     22 //
     23 //  SourceMgr sm;
     24 //  StringRef input = getInput();
     25 //  yaml::Stream stream(input, sm);
     26 //
     27 //  for (yaml::document_iterator di = stream.begin(), de = stream.end();
     28 //       di != de; ++di) {
     29 //    yaml::Node *n = di->getRoot();
     30 //    if (n) {
     31 //      // Do something with n...
     32 //    } else
     33 //      break;
     34 //  }
     35 //
     36 //===----------------------------------------------------------------------===//
     37 
     38 #ifndef LLVM_SUPPORT_YAMLPARSER_H
     39 #define LLVM_SUPPORT_YAMLPARSER_H
     40 
     41 #include "llvm/ADT/StringRef.h"
     42 #include "llvm/Support/Allocator.h"
     43 #include "llvm/Support/SMLoc.h"
     44 #include <cassert>
     45 #include <cstddef>
     46 #include <iterator>
     47 #include <map>
     48 #include <memory>
     49 #include <string>
     50 #include <system_error>
     51 
     52 namespace llvm {
     53 
     54 class MemoryBufferRef;
     55 class SourceMgr;
     56 class raw_ostream;
     57 class Twine;
     58 
     59 namespace yaml {
     60 
     61 class Document;
     62 class document_iterator;
     63 class Node;
     64 class Scanner;
     65 struct Token;
     66 
     67 /// \brief Dump all the tokens in this stream to OS.
     68 /// \returns true if there was an error, false otherwise.
     69 bool dumpTokens(StringRef Input, raw_ostream &);
     70 
     71 /// \brief Scans all tokens in input without outputting anything. This is used
     72 ///        for benchmarking the tokenizer.
     73 /// \returns true if there was an error, false otherwise.
     74 bool scanTokens(StringRef Input);
     75 
     76 /// \brief Escape \a Input for a double quoted scalar.
     77 std::string escape(StringRef Input);
     78 
     79 /// \brief This class represents a YAML stream potentially containing multiple
     80 ///        documents.
     81 class Stream {
     82 public:
     83   /// \brief This keeps a reference to the string referenced by \p Input.
     84   Stream(StringRef Input, SourceMgr &, bool ShowColors = true,
     85          std::error_code *EC = nullptr);
     86 
     87   Stream(MemoryBufferRef InputBuffer, SourceMgr &, bool ShowColors = true,
     88          std::error_code *EC = nullptr);
     89   ~Stream();
     90 
     91   document_iterator begin();
     92   document_iterator end();
     93   void skip();
     94   bool failed();
     95 
     96   bool validate() {
     97     skip();
     98     return !failed();
     99   }
    100 
    101   void printError(Node *N, const Twine &Msg);
    102 
    103 private:
    104   friend class Document;
    105 
    106   std::unique_ptr<Scanner> scanner;
    107   std::unique_ptr<Document> CurrentDoc;
    108 };
    109 
    110 /// \brief Abstract base class for all Nodes.
    111 class Node {
    112   virtual void anchor();
    113 
    114 public:
    115   enum NodeKind {
    116     NK_Null,
    117     NK_Scalar,
    118     NK_BlockScalar,
    119     NK_KeyValue,
    120     NK_Mapping,
    121     NK_Sequence,
    122     NK_Alias
    123   };
    124 
    125   Node(unsigned int Type, std::unique_ptr<Document> &, StringRef Anchor,
    126        StringRef Tag);
    127 
    128   void *operator new(size_t Size, BumpPtrAllocator &Alloc,
    129                      size_t Alignment = 16) noexcept {
    130     return Alloc.Allocate(Size, Alignment);
    131   }
    132 
    133   void operator delete(void *Ptr, BumpPtrAllocator &Alloc,
    134                        size_t Size) noexcept {
    135     Alloc.Deallocate(Ptr, Size);
    136   }
    137 
    138   void operator delete(void *) noexcept = delete;
    139 
    140   /// \brief Get the value of the anchor attached to this node. If it does not
    141   ///        have one, getAnchor().size() will be 0.
    142   StringRef getAnchor() const { return Anchor; }
    143 
    144   /// \brief Get the tag as it was written in the document. This does not
    145   ///   perform tag resolution.
    146   StringRef getRawTag() const { return Tag; }
    147 
    148   /// \brief Get the verbatium tag for a given Node. This performs tag resoluton
    149   ///   and substitution.
    150   std::string getVerbatimTag() const;
    151 
    152   SMRange getSourceRange() const { return SourceRange; }
    153   void setSourceRange(SMRange SR) { SourceRange = SR; }
    154 
    155   // These functions forward to Document and Scanner.
    156   Token &peekNext();
    157   Token getNext();
    158   Node *parseBlockNode();
    159   BumpPtrAllocator &getAllocator();
    160   void setError(const Twine &Message, Token &Location) const;
    161   bool failed() const;
    162 
    163   virtual void skip() {}
    164 
    165   unsigned int getType() const { return TypeID; }
    166 
    167 protected:
    168   std::unique_ptr<Document> &Doc;
    169   SMRange SourceRange;
    170 
    171   ~Node() = default;
    172 
    173 private:
    174   unsigned int TypeID;
    175   StringRef Anchor;
    176   /// \brief The tag as typed in the document.
    177   StringRef Tag;
    178 };
    179 
    180 /// \brief A null value.
    181 ///
    182 /// Example:
    183 ///   !!null null
    184 class NullNode final : public Node {
    185   void anchor() override;
    186 
    187 public:
    188   NullNode(std::unique_ptr<Document> &D)
    189       : Node(NK_Null, D, StringRef(), StringRef()) {}
    190 
    191   static bool classof(const Node *N) { return N->getType() == NK_Null; }
    192 };
    193 
    194 /// \brief A scalar node is an opaque datum that can be presented as a
    195 ///        series of zero or more Unicode scalar values.
    196 ///
    197 /// Example:
    198 ///   Adena
    199 class ScalarNode final : public Node {
    200   void anchor() override;
    201 
    202 public:
    203   ScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
    204              StringRef Val)
    205       : Node(NK_Scalar, D, Anchor, Tag), Value(Val) {
    206     SMLoc Start = SMLoc::getFromPointer(Val.begin());
    207     SMLoc End = SMLoc::getFromPointer(Val.end());
    208     SourceRange = SMRange(Start, End);
    209   }
    210 
    211   // Return Value without any escaping or folding or other fun YAML stuff. This
    212   // is the exact bytes that are contained in the file (after conversion to
    213   // utf8).
    214   StringRef getRawValue() const { return Value; }
    215 
    216   /// \brief Gets the value of this node as a StringRef.
    217   ///
    218   /// \param Storage is used to store the content of the returned StringRef iff
    219   ///        it requires any modification from how it appeared in the source.
    220   ///        This happens with escaped characters and multi-line literals.
    221   StringRef getValue(SmallVectorImpl<char> &Storage) const;
    222 
    223   static bool classof(const Node *N) {
    224     return N->getType() == NK_Scalar;
    225   }
    226 
    227 private:
    228   StringRef Value;
    229 
    230   StringRef unescapeDoubleQuoted(StringRef UnquotedValue,
    231                                  StringRef::size_type Start,
    232                                  SmallVectorImpl<char> &Storage) const;
    233 };
    234 
    235 /// \brief A block scalar node is an opaque datum that can be presented as a
    236 ///        series of zero or more Unicode scalar values.
    237 ///
    238 /// Example:
    239 ///   |
    240 ///     Hello
    241 ///     World
    242 class BlockScalarNode final : public Node {
    243   void anchor() override;
    244 
    245 public:
    246   BlockScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
    247                   StringRef Value, StringRef RawVal)
    248       : Node(NK_BlockScalar, D, Anchor, Tag), Value(Value) {
    249     SMLoc Start = SMLoc::getFromPointer(RawVal.begin());
    250     SMLoc End = SMLoc::getFromPointer(RawVal.end());
    251     SourceRange = SMRange(Start, End);
    252   }
    253 
    254   /// \brief Gets the value of this node as a StringRef.
    255   StringRef getValue() const { return Value; }
    256 
    257   static bool classof(const Node *N) {
    258     return N->getType() == NK_BlockScalar;
    259   }
    260 
    261 private:
    262   StringRef Value;
    263 };
    264 
    265 /// \brief A key and value pair. While not technically a Node under the YAML
    266 ///        representation graph, it is easier to treat them this way.
    267 ///
    268 /// TODO: Consider making this not a child of Node.
    269 ///
    270 /// Example:
    271 ///   Section: .text
    272 class KeyValueNode final : public Node {
    273   void anchor() override;
    274 
    275 public:
    276   KeyValueNode(std::unique_ptr<Document> &D)
    277       : Node(NK_KeyValue, D, StringRef(), StringRef()) {}
    278 
    279   /// \brief Parse and return the key.
    280   ///
    281   /// This may be called multiple times.
    282   ///
    283   /// \returns The key, or nullptr if failed() == true.
    284   Node *getKey();
    285 
    286   /// \brief Parse and return the value.
    287   ///
    288   /// This may be called multiple times.
    289   ///
    290   /// \returns The value, or nullptr if failed() == true.
    291   Node *getValue();
    292 
    293   void skip() override {
    294     getKey()->skip();
    295     if (Node *Val = getValue())
    296       Val->skip();
    297   }
    298 
    299   static bool classof(const Node *N) {
    300     return N->getType() == NK_KeyValue;
    301   }
    302 
    303 private:
    304   Node *Key = nullptr;
    305   Node *Value = nullptr;
    306 };
    307 
    308 /// \brief This is an iterator abstraction over YAML collections shared by both
    309 ///        sequences and maps.
    310 ///
    311 /// BaseT must have a ValueT* member named CurrentEntry and a member function
    312 /// increment() which must set CurrentEntry to 0 to create an end iterator.
    313 template <class BaseT, class ValueT>
    314 class basic_collection_iterator
    315     : public std::iterator<std::input_iterator_tag, ValueT> {
    316 public:
    317   basic_collection_iterator() = default;
    318   basic_collection_iterator(BaseT *B) : Base(B) {}
    319 
    320   ValueT *operator->() const {
    321     assert(Base && Base->CurrentEntry && "Attempted to access end iterator!");
    322     return Base->CurrentEntry;
    323   }
    324 
    325   ValueT &operator*() const {
    326     assert(Base && Base->CurrentEntry &&
    327            "Attempted to dereference end iterator!");
    328     return *Base->CurrentEntry;
    329   }
    330 
    331   operator ValueT *() const {
    332     assert(Base && Base->CurrentEntry && "Attempted to access end iterator!");
    333     return Base->CurrentEntry;
    334   }
    335 
    336   /// Note on EqualityComparable:
    337   ///
    338   /// The iterator is not re-entrant,
    339   /// it is meant to be used for parsing YAML on-demand
    340   /// Once iteration started - it can point only to one entry at a time
    341   /// hence Base.CurrentEntry and Other.Base.CurrentEntry are equal
    342   /// iff Base and Other.Base are equal.
    343   bool operator==(const basic_collection_iterator &Other) const {
    344     if (Base && (Base == Other.Base)) {
    345       assert((Base->CurrentEntry == Other.Base->CurrentEntry)
    346              && "Equal Bases expected to point to equal Entries");
    347     }
    348 
    349     return Base == Other.Base;
    350   }
    351 
    352   bool operator!=(const basic_collection_iterator &Other) const {
    353     return !(Base == Other.Base);
    354   }
    355 
    356   basic_collection_iterator &operator++() {
    357     assert(Base && "Attempted to advance iterator past end!");
    358     Base->increment();
    359     // Create an end iterator.
    360     if (!Base->CurrentEntry)
    361       Base = nullptr;
    362     return *this;
    363   }
    364 
    365 private:
    366   BaseT *Base = nullptr;
    367 };
    368 
    369 // The following two templates are used for both MappingNode and Sequence Node.
    370 template <class CollectionType>
    371 typename CollectionType::iterator begin(CollectionType &C) {
    372   assert(C.IsAtBeginning && "You may only iterate over a collection once!");
    373   C.IsAtBeginning = false;
    374   typename CollectionType::iterator ret(&C);
    375   ++ret;
    376   return ret;
    377 }
    378 
    379 template <class CollectionType> void skip(CollectionType &C) {
    380   // TODO: support skipping from the middle of a parsed collection ;/
    381   assert((C.IsAtBeginning || C.IsAtEnd) && "Cannot skip mid parse!");
    382   if (C.IsAtBeginning)
    383     for (typename CollectionType::iterator i = begin(C), e = C.end(); i != e;
    384          ++i)
    385       i->skip();
    386 }
    387 
    388 /// \brief Represents a YAML map created from either a block map for a flow map.
    389 ///
    390 /// This parses the YAML stream as increment() is called.
    391 ///
    392 /// Example:
    393 ///   Name: _main
    394 ///   Scope: Global
    395 class MappingNode final : public Node {
    396   void anchor() override;
    397 
    398 public:
    399   enum MappingType {
    400     MT_Block,
    401     MT_Flow,
    402     MT_Inline ///< An inline mapping node is used for "[key: value]".
    403   };
    404 
    405   MappingNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
    406               MappingType MT)
    407       : Node(NK_Mapping, D, Anchor, Tag), Type(MT) {}
    408 
    409   friend class basic_collection_iterator<MappingNode, KeyValueNode>;
    410 
    411   using iterator = basic_collection_iterator<MappingNode, KeyValueNode>;
    412 
    413   template <class T> friend typename T::iterator yaml::begin(T &);
    414   template <class T> friend void yaml::skip(T &);
    415 
    416   iterator begin() { return yaml::begin(*this); }
    417 
    418   iterator end() { return iterator(); }
    419 
    420   void skip() override { yaml::skip(*this); }
    421 
    422   static bool classof(const Node *N) {
    423     return N->getType() == NK_Mapping;
    424   }
    425 
    426 private:
    427   MappingType Type;
    428   bool IsAtBeginning = true;
    429   bool IsAtEnd = false;
    430   KeyValueNode *CurrentEntry = nullptr;
    431 
    432   void increment();
    433 };
    434 
    435 /// \brief Represents a YAML sequence created from either a block sequence for a
    436 ///        flow sequence.
    437 ///
    438 /// This parses the YAML stream as increment() is called.
    439 ///
    440 /// Example:
    441 ///   - Hello
    442 ///   - World
    443 class SequenceNode final : public Node {
    444   void anchor() override;
    445 
    446 public:
    447   enum SequenceType {
    448     ST_Block,
    449     ST_Flow,
    450     // Use for:
    451     //
    452     // key:
    453     // - val1
    454     // - val2
    455     //
    456     // As a BlockMappingEntry and BlockEnd are not created in this case.
    457     ST_Indentless
    458   };
    459 
    460   SequenceNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
    461                SequenceType ST)
    462       : Node(NK_Sequence, D, Anchor, Tag), SeqType(ST) {}
    463 
    464   friend class basic_collection_iterator<SequenceNode, Node>;
    465 
    466   using iterator = basic_collection_iterator<SequenceNode, Node>;
    467 
    468   template <class T> friend typename T::iterator yaml::begin(T &);
    469   template <class T> friend void yaml::skip(T &);
    470 
    471   void increment();
    472 
    473   iterator begin() { return yaml::begin(*this); }
    474 
    475   iterator end() { return iterator(); }
    476 
    477   void skip() override { yaml::skip(*this); }
    478 
    479   static bool classof(const Node *N) {
    480     return N->getType() == NK_Sequence;
    481   }
    482 
    483 private:
    484   SequenceType SeqType;
    485   bool IsAtBeginning = true;
    486   bool IsAtEnd = false;
    487   bool WasPreviousTokenFlowEntry = true; // Start with an imaginary ','.
    488   Node *CurrentEntry = nullptr;
    489 };
    490 
    491 /// \brief Represents an alias to a Node with an anchor.
    492 ///
    493 /// Example:
    494 ///   *AnchorName
    495 class AliasNode final : public Node {
    496   void anchor() override;
    497 
    498 public:
    499   AliasNode(std::unique_ptr<Document> &D, StringRef Val)
    500       : Node(NK_Alias, D, StringRef(), StringRef()), Name(Val) {}
    501 
    502   StringRef getName() const { return Name; }
    503   Node *getTarget();
    504 
    505   static bool classof(const Node *N) { return N->getType() == NK_Alias; }
    506 
    507 private:
    508   StringRef Name;
    509 };
    510 
    511 /// \brief A YAML Stream is a sequence of Documents. A document contains a root
    512 ///        node.
    513 class Document {
    514 public:
    515   Document(Stream &ParentStream);
    516 
    517   /// \brief Root for parsing a node. Returns a single node.
    518   Node *parseBlockNode();
    519 
    520   /// \brief Finish parsing the current document and return true if there are
    521   ///        more. Return false otherwise.
    522   bool skip();
    523 
    524   /// \brief Parse and return the root level node.
    525   Node *getRoot() {
    526     if (Root)
    527       return Root;
    528     return Root = parseBlockNode();
    529   }
    530 
    531   const std::map<StringRef, StringRef> &getTagMap() const { return TagMap; }
    532 
    533 private:
    534   friend class Node;
    535   friend class document_iterator;
    536 
    537   /// \brief Stream to read tokens from.
    538   Stream &stream;
    539 
    540   /// \brief Used to allocate nodes to. All are destroyed without calling their
    541   ///        destructor when the document is destroyed.
    542   BumpPtrAllocator NodeAllocator;
    543 
    544   /// \brief The root node. Used to support skipping a partially parsed
    545   ///        document.
    546   Node *Root;
    547 
    548   /// \brief Maps tag prefixes to their expansion.
    549   std::map<StringRef, StringRef> TagMap;
    550 
    551   Token &peekNext();
    552   Token getNext();
    553   void setError(const Twine &Message, Token &Location) const;
    554   bool failed() const;
    555 
    556   /// \brief Parse %BLAH directives and return true if any were encountered.
    557   bool parseDirectives();
    558 
    559   /// \brief Parse %YAML
    560   void parseYAMLDirective();
    561 
    562   /// \brief Parse %TAG
    563   void parseTAGDirective();
    564 
    565   /// \brief Consume the next token and error if it is not \a TK.
    566   bool expectToken(int TK);
    567 };
    568 
    569 /// \brief Iterator abstraction for Documents over a Stream.
    570 class document_iterator {
    571 public:
    572   document_iterator() = default;
    573   document_iterator(std::unique_ptr<Document> &D) : Doc(&D) {}
    574 
    575   bool operator==(const document_iterator &Other) const {
    576     if (isAtEnd() || Other.isAtEnd())
    577       return isAtEnd() && Other.isAtEnd();
    578 
    579     return Doc == Other.Doc;
    580   }
    581   bool operator!=(const document_iterator &Other) const {
    582     return !(*this == Other);
    583   }
    584 
    585   document_iterator operator++() {
    586     assert(Doc && "incrementing iterator past the end.");
    587     if (!(*Doc)->skip()) {
    588       Doc->reset(nullptr);
    589     } else {
    590       Stream &S = (*Doc)->stream;
    591       Doc->reset(new Document(S));
    592     }
    593     return *this;
    594   }
    595 
    596   Document &operator*() { return *Doc->get(); }
    597 
    598   std::unique_ptr<Document> &operator->() { return *Doc; }
    599 
    600 private:
    601   bool isAtEnd() const { return !Doc || !*Doc; }
    602 
    603   std::unique_ptr<Document> *Doc = nullptr;
    604 };
    605 
    606 } // end namespace yaml
    607 
    608 } // end namespace llvm
    609 
    610 #endif // LLVM_SUPPORT_YAMLPARSER_H
    611