Home | History | Annotate | Download | only in Support
      1 //===- YAMLParser.h - Simple YAML parser ------------------------*- C++ -*-===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 //  This is a YAML 1.2 parser.
     11 //
     12 //  See http://www.yaml.org/spec/1.2/spec.html for the full standard.
     13 //
     14 //  This currently does not implement the following:
     15 //    * Multi-line literal folding.
     16 //    * Tag resolution.
     17 //    * UTF-16.
     18 //    * BOMs anywhere other than the first Unicode scalar value in the file.
     19 //
     20 //  The most important class here is Stream. This represents a YAML stream with
     21 //  0, 1, or many documents.
     22 //
     23 //  SourceMgr sm;
     24 //  StringRef input = getInput();
     25 //  yaml::Stream stream(input, sm);
     26 //
     27 //  for (yaml::document_iterator di = stream.begin(), de = stream.end();
     28 //       di != de; ++di) {
     29 //    yaml::Node *n = di->getRoot();
     30 //    if (n) {
     31 //      // Do something with n...
     32 //    } else
     33 //      break;
     34 //  }
     35 //
     36 //===----------------------------------------------------------------------===//
     37 
     38 #ifndef LLVM_SUPPORT_YAMLPARSER_H
     39 #define LLVM_SUPPORT_YAMLPARSER_H
     40 
     41 #include "llvm/ADT/StringRef.h"
     42 #include "llvm/Support/Allocator.h"
     43 #include "llvm/Support/SMLoc.h"
     44 #include <cassert>
     45 #include <cstddef>
     46 #include <iterator>
     47 #include <map>
     48 #include <memory>
     49 #include <string>
     50 #include <system_error>
     51 
     52 namespace llvm {
     53 
     54 class MemoryBufferRef;
     55 class SourceMgr;
     56 class raw_ostream;
     57 class Twine;
     58 
     59 namespace yaml {
     60 
     61 class Document;
     62 class document_iterator;
     63 class Node;
     64 class Scanner;
     65 struct Token;
     66 
     67 /// Dump all the tokens in this stream to OS.
     68 /// \returns true if there was an error, false otherwise.
     69 bool dumpTokens(StringRef Input, raw_ostream &);
     70 
     71 /// Scans all tokens in input without outputting anything. This is used
     72 ///        for benchmarking the tokenizer.
     73 /// \returns true if there was an error, false otherwise.
     74 bool scanTokens(StringRef Input);
     75 
     76 /// Escape \a Input for a double quoted scalar; if \p EscapePrintable
     77 /// is true, all UTF8 sequences will be escaped, if \p EscapePrintable is
     78 /// false, those UTF8 sequences encoding printable unicode scalars will not be
     79 /// escaped, but emitted verbatim.
     80 std::string escape(StringRef Input, bool EscapePrintable = true);
     81 
     82 /// This class represents a YAML stream potentially containing multiple
     83 ///        documents.
     84 class Stream {
     85 public:
     86   /// This keeps a reference to the string referenced by \p Input.
     87   Stream(StringRef Input, SourceMgr &, bool ShowColors = true,
     88          std::error_code *EC = nullptr);
     89 
     90   Stream(MemoryBufferRef InputBuffer, SourceMgr &, bool ShowColors = true,
     91          std::error_code *EC = nullptr);
     92   ~Stream();
     93 
     94   document_iterator begin();
     95   document_iterator end();
     96   void skip();
     97   bool failed();
     98 
     99   bool validate() {
    100     skip();
    101     return !failed();
    102   }
    103 
    104   void printError(Node *N, const Twine &Msg);
    105 
    106 private:
    107   friend class Document;
    108 
    109   std::unique_ptr<Scanner> scanner;
    110   std::unique_ptr<Document> CurrentDoc;
    111 };
    112 
    113 /// Abstract base class for all Nodes.
    114 class Node {
    115   virtual void anchor();
    116 
    117 public:
    118   enum NodeKind {
    119     NK_Null,
    120     NK_Scalar,
    121     NK_BlockScalar,
    122     NK_KeyValue,
    123     NK_Mapping,
    124     NK_Sequence,
    125     NK_Alias
    126   };
    127 
    128   Node(unsigned int Type, std::unique_ptr<Document> &, StringRef Anchor,
    129        StringRef Tag);
    130 
    131   // It's not safe to copy YAML nodes; the document is streamed and the position
    132   // is part of the state.
    133   Node(const Node &) = delete;
    134   void operator=(const Node &) = delete;
    135 
    136   void *operator new(size_t Size, BumpPtrAllocator &Alloc,
    137                      size_t Alignment = 16) noexcept {
    138     return Alloc.Allocate(Size, Alignment);
    139   }
    140 
    141   void operator delete(void *Ptr, BumpPtrAllocator &Alloc,
    142                        size_t Size) noexcept {
    143     Alloc.Deallocate(Ptr, Size);
    144   }
    145 
    146   void operator delete(void *) noexcept = delete;
    147 
    148   /// Get the value of the anchor attached to this node. If it does not
    149   ///        have one, getAnchor().size() will be 0.
    150   StringRef getAnchor() const { return Anchor; }
    151 
    152   /// Get the tag as it was written in the document. This does not
    153   ///   perform tag resolution.
    154   StringRef getRawTag() const { return Tag; }
    155 
    156   /// Get the verbatium tag for a given Node. This performs tag resoluton
    157   ///   and substitution.
    158   std::string getVerbatimTag() const;
    159 
    160   SMRange getSourceRange() const { return SourceRange; }
    161   void setSourceRange(SMRange SR) { SourceRange = SR; }
    162 
    163   // These functions forward to Document and Scanner.
    164   Token &peekNext();
    165   Token getNext();
    166   Node *parseBlockNode();
    167   BumpPtrAllocator &getAllocator();
    168   void setError(const Twine &Message, Token &Location) const;
    169   bool failed() const;
    170 
    171   virtual void skip() {}
    172 
    173   unsigned int getType() const { return TypeID; }
    174 
    175 protected:
    176   std::unique_ptr<Document> &Doc;
    177   SMRange SourceRange;
    178 
    179   ~Node() = default;
    180 
    181 private:
    182   unsigned int TypeID;
    183   StringRef Anchor;
    184   /// The tag as typed in the document.
    185   StringRef Tag;
    186 };
    187 
    188 /// A null value.
    189 ///
    190 /// Example:
    191 ///   !!null null
    192 class NullNode final : public Node {
    193   void anchor() override;
    194 
    195 public:
    196   NullNode(std::unique_ptr<Document> &D)
    197       : Node(NK_Null, D, StringRef(), StringRef()) {}
    198 
    199   static bool classof(const Node *N) { return N->getType() == NK_Null; }
    200 };
    201 
    202 /// A scalar node is an opaque datum that can be presented as a
    203 ///        series of zero or more Unicode scalar values.
    204 ///
    205 /// Example:
    206 ///   Adena
    207 class ScalarNode final : public Node {
    208   void anchor() override;
    209 
    210 public:
    211   ScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
    212              StringRef Val)
    213       : Node(NK_Scalar, D, Anchor, Tag), Value(Val) {
    214     SMLoc Start = SMLoc::getFromPointer(Val.begin());
    215     SMLoc End = SMLoc::getFromPointer(Val.end());
    216     SourceRange = SMRange(Start, End);
    217   }
    218 
    219   // Return Value without any escaping or folding or other fun YAML stuff. This
    220   // is the exact bytes that are contained in the file (after conversion to
    221   // utf8).
    222   StringRef getRawValue() const { return Value; }
    223 
    224   /// Gets the value of this node as a StringRef.
    225   ///
    226   /// \param Storage is used to store the content of the returned StringRef iff
    227   ///        it requires any modification from how it appeared in the source.
    228   ///        This happens with escaped characters and multi-line literals.
    229   StringRef getValue(SmallVectorImpl<char> &Storage) const;
    230 
    231   static bool classof(const Node *N) {
    232     return N->getType() == NK_Scalar;
    233   }
    234 
    235 private:
    236   StringRef Value;
    237 
    238   StringRef unescapeDoubleQuoted(StringRef UnquotedValue,
    239                                  StringRef::size_type Start,
    240                                  SmallVectorImpl<char> &Storage) const;
    241 };
    242 
    243 /// A block scalar node is an opaque datum that can be presented as a
    244 ///        series of zero or more Unicode scalar values.
    245 ///
    246 /// Example:
    247 ///   |
    248 ///     Hello
    249 ///     World
    250 class BlockScalarNode final : public Node {
    251   void anchor() override;
    252 
    253 public:
    254   BlockScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
    255                   StringRef Value, StringRef RawVal)
    256       : Node(NK_BlockScalar, D, Anchor, Tag), Value(Value) {
    257     SMLoc Start = SMLoc::getFromPointer(RawVal.begin());
    258     SMLoc End = SMLoc::getFromPointer(RawVal.end());
    259     SourceRange = SMRange(Start, End);
    260   }
    261 
    262   /// Gets the value of this node as a StringRef.
    263   StringRef getValue() const { return Value; }
    264 
    265   static bool classof(const Node *N) {
    266     return N->getType() == NK_BlockScalar;
    267   }
    268 
    269 private:
    270   StringRef Value;
    271 };
    272 
    273 /// A key and value pair. While not technically a Node under the YAML
    274 ///        representation graph, it is easier to treat them this way.
    275 ///
    276 /// TODO: Consider making this not a child of Node.
    277 ///
    278 /// Example:
    279 ///   Section: .text
    280 class KeyValueNode final : public Node {
    281   void anchor() override;
    282 
    283 public:
    284   KeyValueNode(std::unique_ptr<Document> &D)
    285       : Node(NK_KeyValue, D, StringRef(), StringRef()) {}
    286 
    287   /// Parse and return the key.
    288   ///
    289   /// This may be called multiple times.
    290   ///
    291   /// \returns The key, or nullptr if failed() == true.
    292   Node *getKey();
    293 
    294   /// Parse and return the value.
    295   ///
    296   /// This may be called multiple times.
    297   ///
    298   /// \returns The value, or nullptr if failed() == true.
    299   Node *getValue();
    300 
    301   void skip() override {
    302     if (Node *Key = getKey()) {
    303       Key->skip();
    304       if (Node *Val = getValue())
    305         Val->skip();
    306     }
    307   }
    308 
    309   static bool classof(const Node *N) {
    310     return N->getType() == NK_KeyValue;
    311   }
    312 
    313 private:
    314   Node *Key = nullptr;
    315   Node *Value = nullptr;
    316 };
    317 
    318 /// This is an iterator abstraction over YAML collections shared by both
    319 ///        sequences and maps.
    320 ///
    321 /// BaseT must have a ValueT* member named CurrentEntry and a member function
    322 /// increment() which must set CurrentEntry to 0 to create an end iterator.
    323 template <class BaseT, class ValueT>
    324 class basic_collection_iterator
    325     : public std::iterator<std::input_iterator_tag, ValueT> {
    326 public:
    327   basic_collection_iterator() = default;
    328   basic_collection_iterator(BaseT *B) : Base(B) {}
    329 
    330   ValueT *operator->() const {
    331     assert(Base && Base->CurrentEntry && "Attempted to access end iterator!");
    332     return Base->CurrentEntry;
    333   }
    334 
    335   ValueT &operator*() const {
    336     assert(Base && Base->CurrentEntry &&
    337            "Attempted to dereference end iterator!");
    338     return *Base->CurrentEntry;
    339   }
    340 
    341   operator ValueT *() const {
    342     assert(Base && Base->CurrentEntry && "Attempted to access end iterator!");
    343     return Base->CurrentEntry;
    344   }
    345 
    346   /// Note on EqualityComparable:
    347   ///
    348   /// The iterator is not re-entrant,
    349   /// it is meant to be used for parsing YAML on-demand
    350   /// Once iteration started - it can point only to one entry at a time
    351   /// hence Base.CurrentEntry and Other.Base.CurrentEntry are equal
    352   /// iff Base and Other.Base are equal.
    353   bool operator==(const basic_collection_iterator &Other) const {
    354     if (Base && (Base == Other.Base)) {
    355       assert((Base->CurrentEntry == Other.Base->CurrentEntry)
    356              && "Equal Bases expected to point to equal Entries");
    357     }
    358 
    359     return Base == Other.Base;
    360   }
    361 
    362   bool operator!=(const basic_collection_iterator &Other) const {
    363     return !(Base == Other.Base);
    364   }
    365 
    366   basic_collection_iterator &operator++() {
    367     assert(Base && "Attempted to advance iterator past end!");
    368     Base->increment();
    369     // Create an end iterator.
    370     if (!Base->CurrentEntry)
    371       Base = nullptr;
    372     return *this;
    373   }
    374 
    375 private:
    376   BaseT *Base = nullptr;
    377 };
    378 
    379 // The following two templates are used for both MappingNode and Sequence Node.
    380 template <class CollectionType>
    381 typename CollectionType::iterator begin(CollectionType &C) {
    382   assert(C.IsAtBeginning && "You may only iterate over a collection once!");
    383   C.IsAtBeginning = false;
    384   typename CollectionType::iterator ret(&C);
    385   ++ret;
    386   return ret;
    387 }
    388 
    389 template <class CollectionType> void skip(CollectionType &C) {
    390   // TODO: support skipping from the middle of a parsed collection ;/
    391   assert((C.IsAtBeginning || C.IsAtEnd) && "Cannot skip mid parse!");
    392   if (C.IsAtBeginning)
    393     for (typename CollectionType::iterator i = begin(C), e = C.end(); i != e;
    394          ++i)
    395       i->skip();
    396 }
    397 
    398 /// Represents a YAML map created from either a block map for a flow map.
    399 ///
    400 /// This parses the YAML stream as increment() is called.
    401 ///
    402 /// Example:
    403 ///   Name: _main
    404 ///   Scope: Global
    405 class MappingNode final : public Node {
    406   void anchor() override;
    407 
    408 public:
    409   enum MappingType {
    410     MT_Block,
    411     MT_Flow,
    412     MT_Inline ///< An inline mapping node is used for "[key: value]".
    413   };
    414 
    415   MappingNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
    416               MappingType MT)
    417       : Node(NK_Mapping, D, Anchor, Tag), Type(MT) {}
    418 
    419   friend class basic_collection_iterator<MappingNode, KeyValueNode>;
    420 
    421   using iterator = basic_collection_iterator<MappingNode, KeyValueNode>;
    422 
    423   template <class T> friend typename T::iterator yaml::begin(T &);
    424   template <class T> friend void yaml::skip(T &);
    425 
    426   iterator begin() { return yaml::begin(*this); }
    427 
    428   iterator end() { return iterator(); }
    429 
    430   void skip() override { yaml::skip(*this); }
    431 
    432   static bool classof(const Node *N) {
    433     return N->getType() == NK_Mapping;
    434   }
    435 
    436 private:
    437   MappingType Type;
    438   bool IsAtBeginning = true;
    439   bool IsAtEnd = false;
    440   KeyValueNode *CurrentEntry = nullptr;
    441 
    442   void increment();
    443 };
    444 
    445 /// Represents a YAML sequence created from either a block sequence for a
    446 ///        flow sequence.
    447 ///
    448 /// This parses the YAML stream as increment() is called.
    449 ///
    450 /// Example:
    451 ///   - Hello
    452 ///   - World
    453 class SequenceNode final : public Node {
    454   void anchor() override;
    455 
    456 public:
    457   enum SequenceType {
    458     ST_Block,
    459     ST_Flow,
    460     // Use for:
    461     //
    462     // key:
    463     // - val1
    464     // - val2
    465     //
    466     // As a BlockMappingEntry and BlockEnd are not created in this case.
    467     ST_Indentless
    468   };
    469 
    470   SequenceNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
    471                SequenceType ST)
    472       : Node(NK_Sequence, D, Anchor, Tag), SeqType(ST) {}
    473 
    474   friend class basic_collection_iterator<SequenceNode, Node>;
    475 
    476   using iterator = basic_collection_iterator<SequenceNode, Node>;
    477 
    478   template <class T> friend typename T::iterator yaml::begin(T &);
    479   template <class T> friend void yaml::skip(T &);
    480 
    481   void increment();
    482 
    483   iterator begin() { return yaml::begin(*this); }
    484 
    485   iterator end() { return iterator(); }
    486 
    487   void skip() override { yaml::skip(*this); }
    488 
    489   static bool classof(const Node *N) {
    490     return N->getType() == NK_Sequence;
    491   }
    492 
    493 private:
    494   SequenceType SeqType;
    495   bool IsAtBeginning = true;
    496   bool IsAtEnd = false;
    497   bool WasPreviousTokenFlowEntry = true; // Start with an imaginary ','.
    498   Node *CurrentEntry = nullptr;
    499 };
    500 
    501 /// Represents an alias to a Node with an anchor.
    502 ///
    503 /// Example:
    504 ///   *AnchorName
    505 class AliasNode final : public Node {
    506   void anchor() override;
    507 
    508 public:
    509   AliasNode(std::unique_ptr<Document> &D, StringRef Val)
    510       : Node(NK_Alias, D, StringRef(), StringRef()), Name(Val) {}
    511 
    512   StringRef getName() const { return Name; }
    513   Node *getTarget();
    514 
    515   static bool classof(const Node *N) { return N->getType() == NK_Alias; }
    516 
    517 private:
    518   StringRef Name;
    519 };
    520 
    521 /// A YAML Stream is a sequence of Documents. A document contains a root
    522 ///        node.
    523 class Document {
    524 public:
    525   Document(Stream &ParentStream);
    526 
    527   /// Root for parsing a node. Returns a single node.
    528   Node *parseBlockNode();
    529 
    530   /// Finish parsing the current document and return true if there are
    531   ///        more. Return false otherwise.
    532   bool skip();
    533 
    534   /// Parse and return the root level node.
    535   Node *getRoot() {
    536     if (Root)
    537       return Root;
    538     return Root = parseBlockNode();
    539   }
    540 
    541   const std::map<StringRef, StringRef> &getTagMap() const { return TagMap; }
    542 
    543 private:
    544   friend class Node;
    545   friend class document_iterator;
    546 
    547   /// Stream to read tokens from.
    548   Stream &stream;
    549 
    550   /// Used to allocate nodes to. All are destroyed without calling their
    551   ///        destructor when the document is destroyed.
    552   BumpPtrAllocator NodeAllocator;
    553 
    554   /// The root node. Used to support skipping a partially parsed
    555   ///        document.
    556   Node *Root;
    557 
    558   /// Maps tag prefixes to their expansion.
    559   std::map<StringRef, StringRef> TagMap;
    560 
    561   Token &peekNext();
    562   Token getNext();
    563   void setError(const Twine &Message, Token &Location) const;
    564   bool failed() const;
    565 
    566   /// Parse %BLAH directives and return true if any were encountered.
    567   bool parseDirectives();
    568 
    569   /// Parse %YAML
    570   void parseYAMLDirective();
    571 
    572   /// Parse %TAG
    573   void parseTAGDirective();
    574 
    575   /// Consume the next token and error if it is not \a TK.
    576   bool expectToken(int TK);
    577 };
    578 
    579 /// Iterator abstraction for Documents over a Stream.
    580 class document_iterator {
    581 public:
    582   document_iterator() = default;
    583   document_iterator(std::unique_ptr<Document> &D) : Doc(&D) {}
    584 
    585   bool operator==(const document_iterator &Other) const {
    586     if (isAtEnd() || Other.isAtEnd())
    587       return isAtEnd() && Other.isAtEnd();
    588 
    589     return Doc == Other.Doc;
    590   }
    591   bool operator!=(const document_iterator &Other) const {
    592     return !(*this == Other);
    593   }
    594 
    595   document_iterator operator++() {
    596     assert(Doc && "incrementing iterator past the end.");
    597     if (!(*Doc)->skip()) {
    598       Doc->reset(nullptr);
    599     } else {
    600       Stream &S = (*Doc)->stream;
    601       Doc->reset(new Document(S));
    602     }
    603     return *this;
    604   }
    605 
    606   Document &operator*() { return *Doc->get(); }
    607 
    608   std::unique_ptr<Document> &operator->() { return *Doc; }
    609 
    610 private:
    611   bool isAtEnd() const { return !Doc || !*Doc; }
    612 
    613   std::unique_ptr<Document> *Doc = nullptr;
    614 };
    615 
    616 } // end namespace yaml
    617 
    618 } // end namespace llvm
    619 
    620 #endif // LLVM_SUPPORT_YAMLPARSER_H
    621