Home | History | Annotate | Download | only in AST
      1 //===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
      10 //  This file defines the lexer for structured comments and its token class.
     11 //
     12 //===----------------------------------------------------------------------===//
     13 
     14 #ifndef LLVM_CLANG_AST_COMMENT_LEXER_H
     15 #define LLVM_CLANG_AST_COMMENT_LEXER_H
     16 
     17 #include "clang/Basic/SourceManager.h"
     18 #include "clang/Basic/Diagnostic.h"
     19 #include "llvm/ADT/SmallString.h"
     20 #include "llvm/ADT/SmallVector.h"
     21 #include "llvm/ADT/StringRef.h"
     22 #include "llvm/Support/Allocator.h"
     23 #include "llvm/Support/raw_ostream.h"
     24 
     25 namespace clang {
     26 namespace comments {
     27 
     28 class Lexer;
     29 class TextTokenRetokenizer;
     30 struct CommandInfo;
     31 class CommandTraits;
     32 
     33 namespace tok {
     34 enum TokenKind {
     35   eof,
     36   newline,
     37   text,
     38   unknown_command,   // Command that does not have an ID.
     39   backslash_command, // Command with an ID, that used backslash marker.
     40   at_command,        // Command with an ID, that used 'at' marker.
     41   verbatim_block_begin,
     42   verbatim_block_line,
     43   verbatim_block_end,
     44   verbatim_line_name,
     45   verbatim_line_text,
     46   html_start_tag,     // <tag
     47   html_ident,         // attr
     48   html_equals,        // =
     49   html_quoted_string, // "blah\"blah" or 'blah\'blah'
     50   html_greater,       // >
     51   html_slash_greater, // />
     52   html_end_tag        // </tag
     53 };
     54 } // end namespace tok
     55 
     56 /// \brief Comment token.
     57 class Token {
     58   friend class Lexer;
     59   friend class TextTokenRetokenizer;
     60 
     61   /// The location of the token.
     62   SourceLocation Loc;
     63 
     64   /// The actual kind of the token.
     65   tok::TokenKind Kind;
     66 
     67   /// Length of the token spelling in comment.  Can be 0 for synthenized
     68   /// tokens.
     69   unsigned Length;
     70 
     71   /// Contains text value associated with a token.
     72   const char *TextPtr;
     73 
     74   /// Integer value associated with a token.
     75   ///
     76   /// If the token is a konwn command, contains command ID and TextPtr is
     77   /// unused (command spelling can be found with CommandTraits).  Otherwise,
     78   /// contains the length of the string that starts at TextPtr.
     79   unsigned IntVal;
     80 
     81 public:
     82   SourceLocation getLocation() const LLVM_READONLY { return Loc; }
     83   void setLocation(SourceLocation SL) { Loc = SL; }
     84 
     85   SourceLocation getEndLocation() const LLVM_READONLY {
     86     if (Length == 0 || Length == 1)
     87       return Loc;
     88     return Loc.getLocWithOffset(Length - 1);
     89   }
     90 
     91   tok::TokenKind getKind() const LLVM_READONLY { return Kind; }
     92   void setKind(tok::TokenKind K) { Kind = K; }
     93 
     94   bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; }
     95   bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; }
     96 
     97   unsigned getLength() const LLVM_READONLY { return Length; }
     98   void setLength(unsigned L) { Length = L; }
     99 
    100   StringRef getText() const LLVM_READONLY {
    101     assert(is(tok::text));
    102     return StringRef(TextPtr, IntVal);
    103   }
    104 
    105   void setText(StringRef Text) {
    106     assert(is(tok::text));
    107     TextPtr = Text.data();
    108     IntVal = Text.size();
    109   }
    110 
    111   StringRef getUnknownCommandName() const LLVM_READONLY {
    112     assert(is(tok::unknown_command));
    113     return StringRef(TextPtr, IntVal);
    114   }
    115 
    116   void setUnknownCommandName(StringRef Name) {
    117     assert(is(tok::unknown_command));
    118     TextPtr = Name.data();
    119     IntVal = Name.size();
    120   }
    121 
    122   unsigned getCommandID() const LLVM_READONLY {
    123     assert(is(tok::backslash_command) || is(tok::at_command));
    124     return IntVal;
    125   }
    126 
    127   void setCommandID(unsigned ID) {
    128     assert(is(tok::backslash_command) || is(tok::at_command));
    129     IntVal = ID;
    130   }
    131 
    132   unsigned getVerbatimBlockID() const LLVM_READONLY {
    133     assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
    134     return IntVal;
    135   }
    136 
    137   void setVerbatimBlockID(unsigned ID) {
    138     assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
    139     IntVal = ID;
    140   }
    141 
    142   StringRef getVerbatimBlockText() const LLVM_READONLY {
    143     assert(is(tok::verbatim_block_line));
    144     return StringRef(TextPtr, IntVal);
    145   }
    146 
    147   void setVerbatimBlockText(StringRef Text) {
    148     assert(is(tok::verbatim_block_line));
    149     TextPtr = Text.data();
    150     IntVal = Text.size();
    151   }
    152 
    153   unsigned getVerbatimLineID() const LLVM_READONLY {
    154     assert(is(tok::verbatim_line_name));
    155     return IntVal;
    156   }
    157 
    158   void setVerbatimLineID(unsigned ID) {
    159     assert(is(tok::verbatim_line_name));
    160     IntVal = ID;
    161   }
    162 
    163   StringRef getVerbatimLineText() const LLVM_READONLY {
    164     assert(is(tok::verbatim_line_text));
    165     return StringRef(TextPtr, IntVal);
    166   }
    167 
    168   void setVerbatimLineText(StringRef Text) {
    169     assert(is(tok::verbatim_line_text));
    170     TextPtr = Text.data();
    171     IntVal = Text.size();
    172   }
    173 
    174   StringRef getHTMLTagStartName() const LLVM_READONLY {
    175     assert(is(tok::html_start_tag));
    176     return StringRef(TextPtr, IntVal);
    177   }
    178 
    179   void setHTMLTagStartName(StringRef Name) {
    180     assert(is(tok::html_start_tag));
    181     TextPtr = Name.data();
    182     IntVal = Name.size();
    183   }
    184 
    185   StringRef getHTMLIdent() const LLVM_READONLY {
    186     assert(is(tok::html_ident));
    187     return StringRef(TextPtr, IntVal);
    188   }
    189 
    190   void setHTMLIdent(StringRef Name) {
    191     assert(is(tok::html_ident));
    192     TextPtr = Name.data();
    193     IntVal = Name.size();
    194   }
    195 
    196   StringRef getHTMLQuotedString() const LLVM_READONLY {
    197     assert(is(tok::html_quoted_string));
    198     return StringRef(TextPtr, IntVal);
    199   }
    200 
    201   void setHTMLQuotedString(StringRef Str) {
    202     assert(is(tok::html_quoted_string));
    203     TextPtr = Str.data();
    204     IntVal = Str.size();
    205   }
    206 
    207   StringRef getHTMLTagEndName() const LLVM_READONLY {
    208     assert(is(tok::html_end_tag));
    209     return StringRef(TextPtr, IntVal);
    210   }
    211 
    212   void setHTMLTagEndName(StringRef Name) {
    213     assert(is(tok::html_end_tag));
    214     TextPtr = Name.data();
    215     IntVal = Name.size();
    216   }
    217 
    218   void dump(const Lexer &L, const SourceManager &SM) const;
    219 };
    220 
    221 /// \brief Comment lexer.
    222 class Lexer {
    223 private:
    224   Lexer(const Lexer &) LLVM_DELETED_FUNCTION;
    225   void operator=(const Lexer &) LLVM_DELETED_FUNCTION;
    226 
    227   /// Allocator for strings that are semantic values of tokens and have to be
    228   /// computed (for example, resolved decimal character references).
    229   llvm::BumpPtrAllocator &Allocator;
    230 
    231   DiagnosticsEngine &Diags;
    232 
    233   const CommandTraits &Traits;
    234 
    235   const char *const BufferStart;
    236   const char *const BufferEnd;
    237   SourceLocation FileLoc;
    238 
    239   const char *BufferPtr;
    240 
    241   /// One past end pointer for the current comment.  For BCPL comments points
    242   /// to newline or BufferEnd, for C comments points to star in '*/'.
    243   const char *CommentEnd;
    244 
    245   enum LexerCommentState {
    246     LCS_BeforeComment,
    247     LCS_InsideBCPLComment,
    248     LCS_InsideCComment,
    249     LCS_BetweenComments
    250   };
    251 
    252   /// Low-level lexer state, track if we are inside or outside of comment.
    253   LexerCommentState CommentState;
    254 
    255   enum LexerState {
    256     /// Lexing normal comment text
    257     LS_Normal,
    258 
    259     /// Finished lexing verbatim block beginning command, will lex first body
    260     /// line.
    261     LS_VerbatimBlockFirstLine,
    262 
    263     /// Lexing verbatim block body line-by-line, skipping line-starting
    264     /// decorations.
    265     LS_VerbatimBlockBody,
    266 
    267     /// Finished lexing verbatim line beginning command, will lex text (one
    268     /// line).
    269     LS_VerbatimLineText,
    270 
    271     /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes.
    272     LS_HTMLStartTag,
    273 
    274     /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'.
    275     LS_HTMLEndTag
    276   };
    277 
    278   /// Current lexing mode.
    279   LexerState State;
    280 
    281   /// If State is LS_VerbatimBlock, contains the name of verbatim end
    282   /// command, including command marker.
    283   SmallString<16> VerbatimBlockEndCommandName;
    284 
    285   /// Given a character reference name (e.g., "lt"), return the character that
    286   /// it stands for (e.g., "<").
    287   StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
    288 
    289   /// Given a Unicode codepoint as base-10 integer, return the character.
    290   StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const;
    291 
    292   /// Given a Unicode codepoint as base-16 integer, return the character.
    293   StringRef resolveHTMLHexCharacterReference(StringRef Name) const;
    294 
    295   void formTokenWithChars(Token &Result, const char *TokEnd,
    296                           tok::TokenKind Kind) {
    297     const unsigned TokLen = TokEnd - BufferPtr;
    298     Result.setLocation(getSourceLocation(BufferPtr));
    299     Result.setKind(Kind);
    300     Result.setLength(TokLen);
    301 #ifndef NDEBUG
    302     Result.TextPtr = "<UNSET>";
    303     Result.IntVal = 7;
    304 #endif
    305     BufferPtr = TokEnd;
    306   }
    307 
    308   void formTextToken(Token &Result, const char *TokEnd) {
    309     StringRef Text(BufferPtr, TokEnd - BufferPtr);
    310     formTokenWithChars(Result, TokEnd, tok::text);
    311     Result.setText(Text);
    312   }
    313 
    314   SourceLocation getSourceLocation(const char *Loc) const {
    315     assert(Loc >= BufferStart && Loc <= BufferEnd &&
    316            "Location out of range for this buffer!");
    317 
    318     const unsigned CharNo = Loc - BufferStart;
    319     return FileLoc.getLocWithOffset(CharNo);
    320   }
    321 
    322   DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) {
    323     return Diags.Report(Loc, DiagID);
    324   }
    325 
    326   /// Eat string matching regexp \code \s*\* \endcode.
    327   void skipLineStartingDecorations();
    328 
    329   /// Lex stuff inside comments.  CommentEnd should be set correctly.
    330   void lexCommentText(Token &T);
    331 
    332   void setupAndLexVerbatimBlock(Token &T,
    333                                 const char *TextBegin,
    334                                 char Marker, const CommandInfo *Info);
    335 
    336   void lexVerbatimBlockFirstLine(Token &T);
    337 
    338   void lexVerbatimBlockBody(Token &T);
    339 
    340   void setupAndLexVerbatimLine(Token &T, const char *TextBegin,
    341                                const CommandInfo *Info);
    342 
    343   void lexVerbatimLineText(Token &T);
    344 
    345   void lexHTMLCharacterReference(Token &T);
    346 
    347   void setupAndLexHTMLStartTag(Token &T);
    348 
    349   void lexHTMLStartTag(Token &T);
    350 
    351   void setupAndLexHTMLEndTag(Token &T);
    352 
    353   void lexHTMLEndTag(Token &T);
    354 
    355 public:
    356   Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
    357         const CommandTraits &Traits,
    358         SourceLocation FileLoc,
    359         const char *BufferStart, const char *BufferEnd);
    360 
    361   void lex(Token &T);
    362 
    363   StringRef getSpelling(const Token &Tok,
    364                         const SourceManager &SourceMgr,
    365                         bool *Invalid = NULL) const;
    366 };
    367 
    368 } // end namespace comments
    369 } // end namespace clang
    370 
    371 #endif
    372 
    373