Home | History | Annotate | Download | only in AST
      1 //===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 //  This file defines lexer for structured comments and supporting token class.
     11 //
     12 //===----------------------------------------------------------------------===//
     13 
     14 #ifndef LLVM_CLANG_AST_COMMENTLEXER_H
     15 #define LLVM_CLANG_AST_COMMENTLEXER_H
     16 
     17 #include "clang/Basic/Diagnostic.h"
     18 #include "clang/Basic/SourceManager.h"
     19 #include "llvm/ADT/SmallString.h"
     20 #include "llvm/ADT/StringRef.h"
     21 #include "llvm/Support/Allocator.h"
     22 #include "llvm/Support/raw_ostream.h"
     23 
     24 namespace clang {
     25 namespace comments {
     26 
     27 class Lexer;
     28 class TextTokenRetokenizer;
     29 struct CommandInfo;
     30 class CommandTraits;
     31 
     32 namespace tok {
     33 enum TokenKind {
     34   eof,
     35   newline,
     36   text,
     37   unknown_command,   // Command that does not have an ID.
     38   backslash_command, // Command with an ID, that used backslash marker.
     39   at_command,        // Command with an ID, that used 'at' marker.
     40   verbatim_block_begin,
     41   verbatim_block_line,
     42   verbatim_block_end,
     43   verbatim_line_name,
     44   verbatim_line_text,
     45   html_start_tag,     // <tag
     46   html_ident,         // attr
     47   html_equals,        // =
     48   html_quoted_string, // "blah\"blah" or 'blah\'blah'
     49   html_greater,       // >
     50   html_slash_greater, // />
     51   html_end_tag        // </tag
     52 };
     53 } // end namespace tok
     54 
     55 /// \brief Comment token.
     56 class Token {
     57   friend class Lexer;
     58   friend class TextTokenRetokenizer;
     59 
     60   /// The location of the token.
     61   SourceLocation Loc;
     62 
     63   /// The actual kind of the token.
     64   tok::TokenKind Kind;
     65 
     66   /// Length of the token spelling in comment.  Can be 0 for synthenized
     67   /// tokens.
     68   unsigned Length;
     69 
     70   /// Contains text value associated with a token.
     71   const char *TextPtr;
     72 
     73   /// Integer value associated with a token.
     74   ///
     75   /// If the token is a konwn command, contains command ID and TextPtr is
     76   /// unused (command spelling can be found with CommandTraits).  Otherwise,
     77   /// contains the length of the string that starts at TextPtr.
     78   unsigned IntVal;
     79 
     80 public:
     81   SourceLocation getLocation() const LLVM_READONLY { return Loc; }
     82   void setLocation(SourceLocation SL) { Loc = SL; }
     83 
     84   SourceLocation getEndLocation() const LLVM_READONLY {
     85     if (Length == 0 || Length == 1)
     86       return Loc;
     87     return Loc.getLocWithOffset(Length - 1);
     88   }
     89 
     90   tok::TokenKind getKind() const LLVM_READONLY { return Kind; }
     91   void setKind(tok::TokenKind K) { Kind = K; }
     92 
     93   bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; }
     94   bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; }
     95 
     96   unsigned getLength() const LLVM_READONLY { return Length; }
     97   void setLength(unsigned L) { Length = L; }
     98 
     99   StringRef getText() const LLVM_READONLY {
    100     assert(is(tok::text));
    101     return StringRef(TextPtr, IntVal);
    102   }
    103 
    104   void setText(StringRef Text) {
    105     assert(is(tok::text));
    106     TextPtr = Text.data();
    107     IntVal = Text.size();
    108   }
    109 
    110   StringRef getUnknownCommandName() const LLVM_READONLY {
    111     assert(is(tok::unknown_command));
    112     return StringRef(TextPtr, IntVal);
    113   }
    114 
    115   void setUnknownCommandName(StringRef Name) {
    116     assert(is(tok::unknown_command));
    117     TextPtr = Name.data();
    118     IntVal = Name.size();
    119   }
    120 
    121   unsigned getCommandID() const LLVM_READONLY {
    122     assert(is(tok::backslash_command) || is(tok::at_command));
    123     return IntVal;
    124   }
    125 
    126   void setCommandID(unsigned ID) {
    127     assert(is(tok::backslash_command) || is(tok::at_command));
    128     IntVal = ID;
    129   }
    130 
    131   unsigned getVerbatimBlockID() const LLVM_READONLY {
    132     assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
    133     return IntVal;
    134   }
    135 
    136   void setVerbatimBlockID(unsigned ID) {
    137     assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
    138     IntVal = ID;
    139   }
    140 
    141   StringRef getVerbatimBlockText() const LLVM_READONLY {
    142     assert(is(tok::verbatim_block_line));
    143     return StringRef(TextPtr, IntVal);
    144   }
    145 
    146   void setVerbatimBlockText(StringRef Text) {
    147     assert(is(tok::verbatim_block_line));
    148     TextPtr = Text.data();
    149     IntVal = Text.size();
    150   }
    151 
    152   unsigned getVerbatimLineID() const LLVM_READONLY {
    153     assert(is(tok::verbatim_line_name));
    154     return IntVal;
    155   }
    156 
    157   void setVerbatimLineID(unsigned ID) {
    158     assert(is(tok::verbatim_line_name));
    159     IntVal = ID;
    160   }
    161 
    162   StringRef getVerbatimLineText() const LLVM_READONLY {
    163     assert(is(tok::verbatim_line_text));
    164     return StringRef(TextPtr, IntVal);
    165   }
    166 
    167   void setVerbatimLineText(StringRef Text) {
    168     assert(is(tok::verbatim_line_text));
    169     TextPtr = Text.data();
    170     IntVal = Text.size();
    171   }
    172 
    173   StringRef getHTMLTagStartName() const LLVM_READONLY {
    174     assert(is(tok::html_start_tag));
    175     return StringRef(TextPtr, IntVal);
    176   }
    177 
    178   void setHTMLTagStartName(StringRef Name) {
    179     assert(is(tok::html_start_tag));
    180     TextPtr = Name.data();
    181     IntVal = Name.size();
    182   }
    183 
    184   StringRef getHTMLIdent() const LLVM_READONLY {
    185     assert(is(tok::html_ident));
    186     return StringRef(TextPtr, IntVal);
    187   }
    188 
    189   void setHTMLIdent(StringRef Name) {
    190     assert(is(tok::html_ident));
    191     TextPtr = Name.data();
    192     IntVal = Name.size();
    193   }
    194 
    195   StringRef getHTMLQuotedString() const LLVM_READONLY {
    196     assert(is(tok::html_quoted_string));
    197     return StringRef(TextPtr, IntVal);
    198   }
    199 
    200   void setHTMLQuotedString(StringRef Str) {
    201     assert(is(tok::html_quoted_string));
    202     TextPtr = Str.data();
    203     IntVal = Str.size();
    204   }
    205 
    206   StringRef getHTMLTagEndName() const LLVM_READONLY {
    207     assert(is(tok::html_end_tag));
    208     return StringRef(TextPtr, IntVal);
    209   }
    210 
    211   void setHTMLTagEndName(StringRef Name) {
    212     assert(is(tok::html_end_tag));
    213     TextPtr = Name.data();
    214     IntVal = Name.size();
    215   }
    216 
    217   void dump(const Lexer &L, const SourceManager &SM) const;
    218 };
    219 
    220 /// \brief Comment lexer.
    221 class Lexer {
    222 private:
    223   Lexer(const Lexer &) = delete;
    224   void operator=(const Lexer &) = delete;
    225 
    226   /// Allocator for strings that are semantic values of tokens and have to be
    227   /// computed (for example, resolved decimal character references).
    228   llvm::BumpPtrAllocator &Allocator;
    229 
    230   DiagnosticsEngine &Diags;
    231 
    232   const CommandTraits &Traits;
    233 
    234   const char *const BufferStart;
    235   const char *const BufferEnd;
    236   SourceLocation FileLoc;
    237 
    238   const char *BufferPtr;
    239 
    240   /// One past end pointer for the current comment.  For BCPL comments points
    241   /// to newline or BufferEnd, for C comments points to star in '*/'.
    242   const char *CommentEnd;
    243 
    244   enum LexerCommentState {
    245     LCS_BeforeComment,
    246     LCS_InsideBCPLComment,
    247     LCS_InsideCComment,
    248     LCS_BetweenComments
    249   };
    250 
    251   /// Low-level lexer state, track if we are inside or outside of comment.
    252   LexerCommentState CommentState;
    253 
    254   enum LexerState {
    255     /// Lexing normal comment text
    256     LS_Normal,
    257 
    258     /// Finished lexing verbatim block beginning command, will lex first body
    259     /// line.
    260     LS_VerbatimBlockFirstLine,
    261 
    262     /// Lexing verbatim block body line-by-line, skipping line-starting
    263     /// decorations.
    264     LS_VerbatimBlockBody,
    265 
    266     /// Finished lexing verbatim line beginning command, will lex text (one
    267     /// line).
    268     LS_VerbatimLineText,
    269 
    270     /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes.
    271     LS_HTMLStartTag,
    272 
    273     /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'.
    274     LS_HTMLEndTag
    275   };
    276 
    277   /// Current lexing mode.
    278   LexerState State;
    279 
    280   /// If State is LS_VerbatimBlock, contains the name of verbatim end
    281   /// command, including command marker.
    282   SmallString<16> VerbatimBlockEndCommandName;
    283 
    284   /// Given a character reference name (e.g., "lt"), return the character that
    285   /// it stands for (e.g., "<").
    286   StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
    287 
    288   /// Given a Unicode codepoint as base-10 integer, return the character.
    289   StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const;
    290 
    291   /// Given a Unicode codepoint as base-16 integer, return the character.
    292   StringRef resolveHTMLHexCharacterReference(StringRef Name) const;
    293 
    294   void formTokenWithChars(Token &Result, const char *TokEnd,
    295                           tok::TokenKind Kind);
    296 
    297   void formTextToken(Token &Result, const char *TokEnd) {
    298     StringRef Text(BufferPtr, TokEnd - BufferPtr);
    299     formTokenWithChars(Result, TokEnd, tok::text);
    300     Result.setText(Text);
    301   }
    302 
    303   SourceLocation getSourceLocation(const char *Loc) const {
    304     assert(Loc >= BufferStart && Loc <= BufferEnd &&
    305            "Location out of range for this buffer!");
    306 
    307     const unsigned CharNo = Loc - BufferStart;
    308     return FileLoc.getLocWithOffset(CharNo);
    309   }
    310 
    311   DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) {
    312     return Diags.Report(Loc, DiagID);
    313   }
    314 
    315   /// Eat string matching regexp \code \s*\* \endcode.
    316   void skipLineStartingDecorations();
    317 
    318   /// Lex stuff inside comments.  CommentEnd should be set correctly.
    319   void lexCommentText(Token &T);
    320 
    321   void setupAndLexVerbatimBlock(Token &T,
    322                                 const char *TextBegin,
    323                                 char Marker, const CommandInfo *Info);
    324 
    325   void lexVerbatimBlockFirstLine(Token &T);
    326 
    327   void lexVerbatimBlockBody(Token &T);
    328 
    329   void setupAndLexVerbatimLine(Token &T, const char *TextBegin,
    330                                const CommandInfo *Info);
    331 
    332   void lexVerbatimLineText(Token &T);
    333 
    334   void lexHTMLCharacterReference(Token &T);
    335 
    336   void setupAndLexHTMLStartTag(Token &T);
    337 
    338   void lexHTMLStartTag(Token &T);
    339 
    340   void setupAndLexHTMLEndTag(Token &T);
    341 
    342   void lexHTMLEndTag(Token &T);
    343 
    344 public:
    345   Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
    346         const CommandTraits &Traits,
    347         SourceLocation FileLoc,
    348         const char *BufferStart, const char *BufferEnd);
    349 
    350   void lex(Token &T);
    351 
    352   StringRef getSpelling(const Token &Tok,
    353                         const SourceManager &SourceMgr,
    354                         bool *Invalid = nullptr) const;
    355 };
    356 
    357 } // end namespace comments
    358 } // end namespace clang
    359 
    360 #endif
    361 
    362