Home | History | Annotate | Download | only in Support
      1 //===--- YAMLParser.h - Simple YAML parser --------------------------------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 //  This is a YAML 1.2 parser.
     11 //
     12 //  See http://www.yaml.org/spec/1.2/spec.html for the full standard.
     13 //
     14 //  This currently does not implement the following:
     15 //    * Multi-line literal folding.
     16 //    * Tag resolution.
     17 //    * UTF-16.
     18 //    * BOMs anywhere other than the first Unicode scalar value in the file.
     19 //
     20 //  The most important class here is Stream. This represents a YAML stream with
     21 //  0, 1, or many documents.
     22 //
     23 //  SourceMgr sm;
     24 //  StringRef input = getInput();
     25 //  yaml::Stream stream(input, sm);
     26 //
     27 //  for (yaml::document_iterator di = stream.begin(), de = stream.end();
     28 //       di != de; ++di) {
     29 //    yaml::Node *n = di->getRoot();
     30 //    if (n) {
     31 //      // Do something with n...
     32 //    } else
     33 //      break;
     34 //  }
     35 //
     36 //===----------------------------------------------------------------------===//
     37 
     38 #ifndef LLVM_SUPPORT_YAMLPARSER_H
     39 #define LLVM_SUPPORT_YAMLPARSER_H
     40 
     41 #include "llvm/ADT/StringRef.h"
     42 #include "llvm/Support/Allocator.h"
     43 #include "llvm/Support/SMLoc.h"
     44 #include <map>
     45 #include <system_error>
     46 #include <utility>
     47 
     48 namespace llvm {
     49 class MemoryBufferRef;
     50 class SourceMgr;
     51 class Twine;
     52 class raw_ostream;
     53 
     54 namespace yaml {
     55 
     56 class document_iterator;
     57 class Document;
     58 class Node;
     59 class Scanner;
     60 struct Token;
     61 
     62 /// \brief Dump all the tokens in this stream to OS.
     63 /// \returns true if there was an error, false otherwise.
     64 bool dumpTokens(StringRef Input, raw_ostream &);
     65 
     66 /// \brief Scans all tokens in input without outputting anything. This is used
     67 ///        for benchmarking the tokenizer.
     68 /// \returns true if there was an error, false otherwise.
     69 bool scanTokens(StringRef Input);
     70 
     71 /// \brief Escape \a Input for a double quoted scalar.
     72 std::string escape(StringRef Input);
     73 
     74 /// \brief This class represents a YAML stream potentially containing multiple
     75 ///        documents.
     76 class Stream {
     77 public:
     78   /// \brief This keeps a reference to the string referenced by \p Input.
     79   Stream(StringRef Input, SourceMgr &, bool ShowColors = true,
     80          std::error_code *EC = nullptr);
     81 
     82   Stream(MemoryBufferRef InputBuffer, SourceMgr &, bool ShowColors = true,
     83          std::error_code *EC = nullptr);
     84   ~Stream();
     85 
     86   document_iterator begin();
     87   document_iterator end();
     88   void skip();
     89   bool failed();
     90   bool validate() {
     91     skip();
     92     return !failed();
     93   }
     94 
     95   void printError(Node *N, const Twine &Msg);
     96 
     97 private:
     98   std::unique_ptr<Scanner> scanner;
     99   std::unique_ptr<Document> CurrentDoc;
    100 
    101   friend class Document;
    102 };
    103 
    104 /// \brief Abstract base class for all Nodes.
    105 class Node {
    106   virtual void anchor();
    107 
    108 public:
    109   enum NodeKind {
    110     NK_Null,
    111     NK_Scalar,
    112     NK_BlockScalar,
    113     NK_KeyValue,
    114     NK_Mapping,
    115     NK_Sequence,
    116     NK_Alias
    117   };
    118 
    119   Node(unsigned int Type, std::unique_ptr<Document> &, StringRef Anchor,
    120        StringRef Tag);
    121 
    122   /// \brief Get the value of the anchor attached to this node. If it does not
    123   ///        have one, getAnchor().size() will be 0.
    124   StringRef getAnchor() const { return Anchor; }
    125 
    126   /// \brief Get the tag as it was written in the document. This does not
    127   ///   perform tag resolution.
    128   StringRef getRawTag() const { return Tag; }
    129 
    130   /// \brief Get the verbatium tag for a given Node. This performs tag resoluton
    131   ///   and substitution.
    132   std::string getVerbatimTag() const;
    133 
    134   SMRange getSourceRange() const { return SourceRange; }
    135   void setSourceRange(SMRange SR) { SourceRange = SR; }
    136 
    137   // These functions forward to Document and Scanner.
    138   Token &peekNext();
    139   Token getNext();
    140   Node *parseBlockNode();
    141   BumpPtrAllocator &getAllocator();
    142   void setError(const Twine &Message, Token &Location) const;
    143   bool failed() const;
    144 
    145   virtual void skip() {}
    146 
    147   unsigned int getType() const { return TypeID; }
    148 
    149   void *operator new(size_t Size, BumpPtrAllocator &Alloc,
    150                      size_t Alignment = 16) noexcept {
    151     return Alloc.Allocate(Size, Alignment);
    152   }
    153 
    154   void operator delete(void *Ptr, BumpPtrAllocator &Alloc,
    155                        size_t Size) noexcept {
    156     Alloc.Deallocate(Ptr, Size);
    157   }
    158 
    159 protected:
    160   std::unique_ptr<Document> &Doc;
    161   SMRange SourceRange;
    162 
    163   void operator delete(void *) noexcept = delete;
    164 
    165   ~Node() = default;
    166 
    167 private:
    168   unsigned int TypeID;
    169   StringRef Anchor;
    170   /// \brief The tag as typed in the document.
    171   StringRef Tag;
    172 };
    173 
    174 /// \brief A null value.
    175 ///
    176 /// Example:
    177 ///   !!null null
    178 class NullNode final : public Node {
    179   void anchor() override;
    180 
    181 public:
    182   NullNode(std::unique_ptr<Document> &D)
    183       : Node(NK_Null, D, StringRef(), StringRef()) {}
    184 
    185   static inline bool classof(const Node *N) { return N->getType() == NK_Null; }
    186 };
    187 
    188 /// \brief A scalar node is an opaque datum that can be presented as a
    189 ///        series of zero or more Unicode scalar values.
    190 ///
    191 /// Example:
    192 ///   Adena
    193 class ScalarNode final : public Node {
    194   void anchor() override;
    195 
    196 public:
    197   ScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
    198              StringRef Val)
    199       : Node(NK_Scalar, D, Anchor, Tag), Value(Val) {
    200     SMLoc Start = SMLoc::getFromPointer(Val.begin());
    201     SMLoc End = SMLoc::getFromPointer(Val.end());
    202     SourceRange = SMRange(Start, End);
    203   }
    204 
    205   // Return Value without any escaping or folding or other fun YAML stuff. This
    206   // is the exact bytes that are contained in the file (after conversion to
    207   // utf8).
    208   StringRef getRawValue() const { return Value; }
    209 
    210   /// \brief Gets the value of this node as a StringRef.
    211   ///
    212   /// \param Storage is used to store the content of the returned StringRef iff
    213   ///        it requires any modification from how it appeared in the source.
    214   ///        This happens with escaped characters and multi-line literals.
    215   StringRef getValue(SmallVectorImpl<char> &Storage) const;
    216 
    217   static inline bool classof(const Node *N) {
    218     return N->getType() == NK_Scalar;
    219   }
    220 
    221 private:
    222   StringRef Value;
    223 
    224   StringRef unescapeDoubleQuoted(StringRef UnquotedValue,
    225                                  StringRef::size_type Start,
    226                                  SmallVectorImpl<char> &Storage) const;
    227 };
    228 
    229 /// \brief A block scalar node is an opaque datum that can be presented as a
    230 ///        series of zero or more Unicode scalar values.
    231 ///
    232 /// Example:
    233 ///   |
    234 ///     Hello
    235 ///     World
    236 class BlockScalarNode final : public Node {
    237   void anchor() override;
    238 
    239 public:
    240   BlockScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
    241                   StringRef Value, StringRef RawVal)
    242       : Node(NK_BlockScalar, D, Anchor, Tag), Value(Value) {
    243     SMLoc Start = SMLoc::getFromPointer(RawVal.begin());
    244     SMLoc End = SMLoc::getFromPointer(RawVal.end());
    245     SourceRange = SMRange(Start, End);
    246   }
    247 
    248   /// \brief Gets the value of this node as a StringRef.
    249   StringRef getValue() const { return Value; }
    250 
    251   static inline bool classof(const Node *N) {
    252     return N->getType() == NK_BlockScalar;
    253   }
    254 
    255 private:
    256   StringRef Value;
    257 };
    258 
    259 /// \brief A key and value pair. While not technically a Node under the YAML
    260 ///        representation graph, it is easier to treat them this way.
    261 ///
    262 /// TODO: Consider making this not a child of Node.
    263 ///
    264 /// Example:
    265 ///   Section: .text
    266 class KeyValueNode final : public Node {
    267   void anchor() override;
    268 
    269 public:
    270   KeyValueNode(std::unique_ptr<Document> &D)
    271       : Node(NK_KeyValue, D, StringRef(), StringRef()), Key(nullptr),
    272         Value(nullptr) {}
    273 
    274   /// \brief Parse and return the key.
    275   ///
    276   /// This may be called multiple times.
    277   ///
    278   /// \returns The key, or nullptr if failed() == true.
    279   Node *getKey();
    280 
    281   /// \brief Parse and return the value.
    282   ///
    283   /// This may be called multiple times.
    284   ///
    285   /// \returns The value, or nullptr if failed() == true.
    286   Node *getValue();
    287 
    288   void skip() override {
    289     getKey()->skip();
    290     if (Node *Val = getValue())
    291       Val->skip();
    292   }
    293 
    294   static inline bool classof(const Node *N) {
    295     return N->getType() == NK_KeyValue;
    296   }
    297 
    298 private:
    299   Node *Key;
    300   Node *Value;
    301 };
    302 
    303 /// \brief This is an iterator abstraction over YAML collections shared by both
    304 ///        sequences and maps.
    305 ///
    306 /// BaseT must have a ValueT* member named CurrentEntry and a member function
    307 /// increment() which must set CurrentEntry to 0 to create an end iterator.
    308 template <class BaseT, class ValueT>
    309 class basic_collection_iterator
    310     : public std::iterator<std::input_iterator_tag, ValueT> {
    311 public:
    312   basic_collection_iterator() : Base(nullptr) {}
    313   basic_collection_iterator(BaseT *B) : Base(B) {}
    314 
    315   ValueT *operator->() const {
    316     assert(Base && Base->CurrentEntry && "Attempted to access end iterator!");
    317     return Base->CurrentEntry;
    318   }
    319 
    320   ValueT &operator*() const {
    321     assert(Base && Base->CurrentEntry &&
    322            "Attempted to dereference end iterator!");
    323     return *Base->CurrentEntry;
    324   }
    325 
    326   operator ValueT *() const {
    327     assert(Base && Base->CurrentEntry && "Attempted to access end iterator!");
    328     return Base->CurrentEntry;
    329   }
    330 
    331   /// Note on EqualityComparable:
    332   ///
    333   /// The iterator is not re-entrant,
    334   /// it is meant to be used for parsing YAML on-demand
    335   /// Once iteration started - it can point only to one entry at a time
    336   /// hence Base.CurrentEntry and Other.Base.CurrentEntry are equal
    337   /// iff Base and Other.Base are equal.
    338   bool operator==(const basic_collection_iterator &Other) const {
    339     if (Base && (Base == Other.Base)) {
    340       assert((Base->CurrentEntry == Other.Base->CurrentEntry)
    341              && "Equal Bases expected to point to equal Entries");
    342     }
    343 
    344     return Base == Other.Base;
    345   }
    346 
    347   bool operator!=(const basic_collection_iterator &Other) const {
    348     return !(Base == Other.Base);
    349   }
    350 
    351   basic_collection_iterator &operator++() {
    352     assert(Base && "Attempted to advance iterator past end!");
    353     Base->increment();
    354     // Create an end iterator.
    355     if (!Base->CurrentEntry)
    356       Base = nullptr;
    357     return *this;
    358   }
    359 
    360 private:
    361   BaseT *Base;
    362 };
    363 
    364 // The following two templates are used for both MappingNode and Sequence Node.
    365 template <class CollectionType>
    366 typename CollectionType::iterator begin(CollectionType &C) {
    367   assert(C.IsAtBeginning && "You may only iterate over a collection once!");
    368   C.IsAtBeginning = false;
    369   typename CollectionType::iterator ret(&C);
    370   ++ret;
    371   return ret;
    372 }
    373 
    374 template <class CollectionType> void skip(CollectionType &C) {
    375   // TODO: support skipping from the middle of a parsed collection ;/
    376   assert((C.IsAtBeginning || C.IsAtEnd) && "Cannot skip mid parse!");
    377   if (C.IsAtBeginning)
    378     for (typename CollectionType::iterator i = begin(C), e = C.end(); i != e;
    379          ++i)
    380       i->skip();
    381 }
    382 
    383 /// \brief Represents a YAML map created from either a block map for a flow map.
    384 ///
    385 /// This parses the YAML stream as increment() is called.
    386 ///
    387 /// Example:
    388 ///   Name: _main
    389 ///   Scope: Global
    390 class MappingNode final : public Node {
    391   void anchor() override;
    392 
    393 public:
    394   enum MappingType {
    395     MT_Block,
    396     MT_Flow,
    397     MT_Inline ///< An inline mapping node is used for "[key: value]".
    398   };
    399 
    400   MappingNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
    401               MappingType MT)
    402       : Node(NK_Mapping, D, Anchor, Tag), Type(MT), IsAtBeginning(true),
    403         IsAtEnd(false), CurrentEntry(nullptr) {}
    404 
    405   friend class basic_collection_iterator<MappingNode, KeyValueNode>;
    406   typedef basic_collection_iterator<MappingNode, KeyValueNode> iterator;
    407   template <class T> friend typename T::iterator yaml::begin(T &);
    408   template <class T> friend void yaml::skip(T &);
    409 
    410   iterator begin() { return yaml::begin(*this); }
    411 
    412   iterator end() { return iterator(); }
    413 
    414   void skip() override { yaml::skip(*this); }
    415 
    416   static inline bool classof(const Node *N) {
    417     return N->getType() == NK_Mapping;
    418   }
    419 
    420 private:
    421   MappingType Type;
    422   bool IsAtBeginning;
    423   bool IsAtEnd;
    424   KeyValueNode *CurrentEntry;
    425 
    426   void increment();
    427 };
    428 
    429 /// \brief Represents a YAML sequence created from either a block sequence for a
    430 ///        flow sequence.
    431 ///
    432 /// This parses the YAML stream as increment() is called.
    433 ///
    434 /// Example:
    435 ///   - Hello
    436 ///   - World
    437 class SequenceNode final : public Node {
    438   void anchor() override;
    439 
    440 public:
    441   enum SequenceType {
    442     ST_Block,
    443     ST_Flow,
    444     // Use for:
    445     //
    446     // key:
    447     // - val1
    448     // - val2
    449     //
    450     // As a BlockMappingEntry and BlockEnd are not created in this case.
    451     ST_Indentless
    452   };
    453 
    454   SequenceNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
    455                SequenceType ST)
    456       : Node(NK_Sequence, D, Anchor, Tag), SeqType(ST), IsAtBeginning(true),
    457         IsAtEnd(false),
    458         WasPreviousTokenFlowEntry(true), // Start with an imaginary ','.
    459         CurrentEntry(nullptr) {}
    460 
    461   friend class basic_collection_iterator<SequenceNode, Node>;
    462   typedef basic_collection_iterator<SequenceNode, Node> iterator;
    463   template <class T> friend typename T::iterator yaml::begin(T &);
    464   template <class T> friend void yaml::skip(T &);
    465 
    466   void increment();
    467 
    468   iterator begin() { return yaml::begin(*this); }
    469 
    470   iterator end() { return iterator(); }
    471 
    472   void skip() override { yaml::skip(*this); }
    473 
    474   static inline bool classof(const Node *N) {
    475     return N->getType() == NK_Sequence;
    476   }
    477 
    478 private:
    479   SequenceType SeqType;
    480   bool IsAtBeginning;
    481   bool IsAtEnd;
    482   bool WasPreviousTokenFlowEntry;
    483   Node *CurrentEntry;
    484 };
    485 
    486 /// \brief Represents an alias to a Node with an anchor.
    487 ///
    488 /// Example:
    489 ///   *AnchorName
    490 class AliasNode final : public Node {
    491   void anchor() override;
    492 
    493 public:
    494   AliasNode(std::unique_ptr<Document> &D, StringRef Val)
    495       : Node(NK_Alias, D, StringRef(), StringRef()), Name(Val) {}
    496 
    497   StringRef getName() const { return Name; }
    498   Node *getTarget();
    499 
    500   static inline bool classof(const Node *N) { return N->getType() == NK_Alias; }
    501 
    502 private:
    503   StringRef Name;
    504 };
    505 
    506 /// \brief A YAML Stream is a sequence of Documents. A document contains a root
    507 ///        node.
    508 class Document {
    509 public:
    510   /// \brief Root for parsing a node. Returns a single node.
    511   Node *parseBlockNode();
    512 
    513   Document(Stream &ParentStream);
    514 
    515   /// \brief Finish parsing the current document and return true if there are
    516   ///        more. Return false otherwise.
    517   bool skip();
    518 
    519   /// \brief Parse and return the root level node.
    520   Node *getRoot() {
    521     if (Root)
    522       return Root;
    523     return Root = parseBlockNode();
    524   }
    525 
    526   const std::map<StringRef, StringRef> &getTagMap() const { return TagMap; }
    527 
    528 private:
    529   friend class Node;
    530   friend class document_iterator;
    531 
    532   /// \brief Stream to read tokens from.
    533   Stream &stream;
    534 
    535   /// \brief Used to allocate nodes to. All are destroyed without calling their
    536   ///        destructor when the document is destroyed.
    537   BumpPtrAllocator NodeAllocator;
    538 
    539   /// \brief The root node. Used to support skipping a partially parsed
    540   ///        document.
    541   Node *Root;
    542 
    543   /// \brief Maps tag prefixes to their expansion.
    544   std::map<StringRef, StringRef> TagMap;
    545 
    546   Token &peekNext();
    547   Token getNext();
    548   void setError(const Twine &Message, Token &Location) const;
    549   bool failed() const;
    550 
    551   /// \brief Parse %BLAH directives and return true if any were encountered.
    552   bool parseDirectives();
    553 
    554   /// \brief Parse %YAML
    555   void parseYAMLDirective();
    556 
    557   /// \brief Parse %TAG
    558   void parseTAGDirective();
    559 
    560   /// \brief Consume the next token and error if it is not \a TK.
    561   bool expectToken(int TK);
    562 };
    563 
    564 /// \brief Iterator abstraction for Documents over a Stream.
    565 class document_iterator {
    566 public:
    567   document_iterator() : Doc(nullptr) {}
    568   document_iterator(std::unique_ptr<Document> &D) : Doc(&D) {}
    569 
    570   bool operator==(const document_iterator &Other) {
    571     if (isAtEnd() || Other.isAtEnd())
    572       return isAtEnd() && Other.isAtEnd();
    573 
    574     return Doc == Other.Doc;
    575   }
    576   bool operator!=(const document_iterator &Other) { return !(*this == Other); }
    577 
    578   document_iterator operator++() {
    579     assert(Doc && "incrementing iterator past the end.");
    580     if (!(*Doc)->skip()) {
    581       Doc->reset(nullptr);
    582     } else {
    583       Stream &S = (*Doc)->stream;
    584       Doc->reset(new Document(S));
    585     }
    586     return *this;
    587   }
    588 
    589   Document &operator*() { return *Doc->get(); }
    590 
    591   std::unique_ptr<Document> &operator->() { return *Doc; }
    592 
    593 private:
    594   bool isAtEnd() const { return !Doc || !*Doc; }
    595 
    596   std::unique_ptr<Document> *Doc;
    597 };
    598 
    599 } // End namespace yaml.
    600 
    601 } // End namespace llvm.
    602 
    603 #endif
    604