Home | History | Annotate | Download | only in AST
      1 //===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 //  This file defines the lexer for structured comments and its supporting
     10 //  token class.
     11 //
     12 //===----------------------------------------------------------------------===//
     13 
     14 #ifndef LLVM_CLANG_AST_COMMENT_LEXER_H
     15 #define LLVM_CLANG_AST_COMMENT_LEXER_H
     16 
     17 #include "clang/Basic/SourceManager.h"
     18 #include "llvm/ADT/SmallString.h"
     19 #include "llvm/ADT/SmallVector.h"
     20 #include "llvm/ADT/StringRef.h"
     21 #include "llvm/Support/Allocator.h"
     22 #include "llvm/Support/raw_ostream.h"
     23 
     24 namespace clang {
     25 namespace comments {
     26 
     27 class Lexer;
     28 class TextTokenRetokenizer;
     29 struct CommandInfo;
     30 class CommandTraits;
     31 
     32 namespace tok {
     33 enum TokenKind {
     34   eof,
     35   newline,
     36   text,
     37   unknown_command,   // Command that does not have an ID.
     38   backslash_command, // Command with an ID, that used backslash marker.
     39   at_command,        // Command with an ID, that used 'at' marker.
     40   verbatim_block_begin,
     41   verbatim_block_line,
     42   verbatim_block_end,
     43   verbatim_line_name,
     44   verbatim_line_text,
     45   html_start_tag,     // <tag
     46   html_ident,         // attr
     47   html_equals,        // =
     48   html_quoted_string, // "blah\"blah" or 'blah\'blah'
     49   html_greater,       // >
     50   html_slash_greater, // />
     51   html_end_tag        // </tag
     52 };
     53 } // end namespace tok
     54 
     55 /// \brief Comment token.
     56 class Token {
     57   friend class Lexer;
     58   friend class TextTokenRetokenizer;
     59 
     60   /// The location of the token.
     61   SourceLocation Loc;
     62 
     63   /// The actual kind of the token.
     64   tok::TokenKind Kind;
     65 
     66   /// Length of the token spelling in comment.  Can be 0 for synthenized
     67   /// tokens.
     68   unsigned Length;
     69 
     70   /// Contains text value associated with a token.
     71   const char *TextPtr;
     72 
     73   /// Integer value associated with a token.
     74   ///
     75   /// If the token is a konwn command, contains command ID and TextPtr is
     76   /// unused (command spelling can be found with CommandTraits).  Otherwise,
     77   /// contains the length of the string that starts at TextPtr.
     78   unsigned IntVal;
     79 
     80 public:
     81   SourceLocation getLocation() const LLVM_READONLY { return Loc; }
     82   void setLocation(SourceLocation SL) { Loc = SL; }
     83 
     84   SourceLocation getEndLocation() const LLVM_READONLY {
     85     if (Length == 0 || Length == 1)
     86       return Loc;
     87     return Loc.getLocWithOffset(Length - 1);
     88   }
     89 
     90   tok::TokenKind getKind() const LLVM_READONLY { return Kind; }
     91   void setKind(tok::TokenKind K) { Kind = K; }
     92 
     93   bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; }
     94   bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; }
     95 
     96   unsigned getLength() const LLVM_READONLY { return Length; }
     97   void setLength(unsigned L) { Length = L; }
     98 
     99   StringRef getText() const LLVM_READONLY {
    100     assert(is(tok::text));
    101     return StringRef(TextPtr, IntVal);
    102   }
    103 
    104   void setText(StringRef Text) {
    105     assert(is(tok::text));
    106     TextPtr = Text.data();
    107     IntVal = Text.size();
    108   }
    109 
    110   StringRef getUnknownCommandName() const LLVM_READONLY {
    111     assert(is(tok::unknown_command));
    112     return StringRef(TextPtr, IntVal);
    113   }
    114 
    115   void setUnknownCommandName(StringRef Name) {
    116     assert(is(tok::unknown_command));
    117     TextPtr = Name.data();
    118     IntVal = Name.size();
    119   }
    120 
    121   unsigned getCommandID() const LLVM_READONLY {
    122     assert(is(tok::backslash_command) || is(tok::at_command));
    123     return IntVal;
    124   }
    125 
    126   void setCommandID(unsigned ID) {
    127     assert(is(tok::backslash_command) || is(tok::at_command));
    128     IntVal = ID;
    129   }
    130 
    131   unsigned getVerbatimBlockID() const LLVM_READONLY {
    132     assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
    133     return IntVal;
    134   }
    135 
    136   void setVerbatimBlockID(unsigned ID) {
    137     assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
    138     IntVal = ID;
    139   }
    140 
    141   StringRef getVerbatimBlockText() const LLVM_READONLY {
    142     assert(is(tok::verbatim_block_line));
    143     return StringRef(TextPtr, IntVal);
    144   }
    145 
    146   void setVerbatimBlockText(StringRef Text) {
    147     assert(is(tok::verbatim_block_line));
    148     TextPtr = Text.data();
    149     IntVal = Text.size();
    150   }
    151 
    152   unsigned getVerbatimLineID() const LLVM_READONLY {
    153     assert(is(tok::verbatim_line_name));
    154     return IntVal;
    155   }
    156 
    157   void setVerbatimLineID(unsigned ID) {
    158     assert(is(tok::verbatim_line_name));
    159     IntVal = ID;
    160   }
    161 
    162   StringRef getVerbatimLineText() const LLVM_READONLY {
    163     assert(is(tok::verbatim_line_text));
    164     return StringRef(TextPtr, IntVal);
    165   }
    166 
    167   void setVerbatimLineText(StringRef Text) {
    168     assert(is(tok::verbatim_line_text));
    169     TextPtr = Text.data();
    170     IntVal = Text.size();
    171   }
    172 
    173   StringRef getHTMLTagStartName() const LLVM_READONLY {
    174     assert(is(tok::html_start_tag));
    175     return StringRef(TextPtr, IntVal);
    176   }
    177 
    178   void setHTMLTagStartName(StringRef Name) {
    179     assert(is(tok::html_start_tag));
    180     TextPtr = Name.data();
    181     IntVal = Name.size();
    182   }
    183 
    184   StringRef getHTMLIdent() const LLVM_READONLY {
    185     assert(is(tok::html_ident));
    186     return StringRef(TextPtr, IntVal);
    187   }
    188 
    189   void setHTMLIdent(StringRef Name) {
    190     assert(is(tok::html_ident));
    191     TextPtr = Name.data();
    192     IntVal = Name.size();
    193   }
    194 
    195   StringRef getHTMLQuotedString() const LLVM_READONLY {
    196     assert(is(tok::html_quoted_string));
    197     return StringRef(TextPtr, IntVal);
    198   }
    199 
    200   void setHTMLQuotedString(StringRef Str) {
    201     assert(is(tok::html_quoted_string));
    202     TextPtr = Str.data();
    203     IntVal = Str.size();
    204   }
    205 
    206   StringRef getHTMLTagEndName() const LLVM_READONLY {
    207     assert(is(tok::html_end_tag));
    208     return StringRef(TextPtr, IntVal);
    209   }
    210 
    211   void setHTMLTagEndName(StringRef Name) {
    212     assert(is(tok::html_end_tag));
    213     TextPtr = Name.data();
    214     IntVal = Name.size();
    215   }
    216 
    217   void dump(const Lexer &L, const SourceManager &SM) const;
    218 };
    219 
    220 /// \brief Comment lexer.
    221 class Lexer {
    222 private:
    223   Lexer(const Lexer &) LLVM_DELETED_FUNCTION;
    224   void operator=(const Lexer &) LLVM_DELETED_FUNCTION;
    225 
    226   /// Allocator for strings that are semantic values of tokens and have to be
    227   /// computed (for example, resolved decimal character references).
    228   llvm::BumpPtrAllocator &Allocator;
    229 
    230   const CommandTraits &Traits;
    231 
    232   const char *const BufferStart;
    233   const char *const BufferEnd;
    234   SourceLocation FileLoc;
    235 
    236   const char *BufferPtr;
    237 
    238   /// One past end pointer for the current comment.  For BCPL comments points
    239   /// to newline or BufferEnd, for C comments points to star in '*/'.
    240   const char *CommentEnd;
    241 
    242   enum LexerCommentState {
    243     LCS_BeforeComment,
    244     LCS_InsideBCPLComment,
    245     LCS_InsideCComment,
    246     LCS_BetweenComments
    247   };
    248 
    249   /// Low-level lexer state, track if we are inside or outside of comment.
    250   LexerCommentState CommentState;
    251 
    252   enum LexerState {
    253     /// Lexing normal comment text
    254     LS_Normal,
    255 
    256     /// Finished lexing verbatim block beginning command, will lex first body
    257     /// line.
    258     LS_VerbatimBlockFirstLine,
    259 
    260     /// Lexing verbatim block body line-by-line, skipping line-starting
    261     /// decorations.
    262     LS_VerbatimBlockBody,
    263 
    264     /// Finished lexing verbatim line beginning command, will lex text (one
    265     /// line).
    266     LS_VerbatimLineText,
    267 
    268     /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes.
    269     LS_HTMLStartTag,
    270 
    271     /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'.
    272     LS_HTMLEndTag
    273   };
    274 
    275   /// Current lexing mode.
    276   LexerState State;
    277 
    278   /// If State is LS_VerbatimBlock, contains the name of verbatim end
    279   /// command, including command marker.
    280   SmallString<16> VerbatimBlockEndCommandName;
    281 
    282   /// Given a character reference name (e.g., "lt"), return the character that
    283   /// it stands for (e.g., "<").
    284   StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
    285 
    286   /// Given a Unicode codepoint as base-10 integer, return the character.
    287   StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const;
    288 
    289   /// Given a Unicode codepoint as base-16 integer, return the character.
    290   StringRef resolveHTMLHexCharacterReference(StringRef Name) const;
    291 
    292   void formTokenWithChars(Token &Result, const char *TokEnd,
    293                           tok::TokenKind Kind) {
    294     const unsigned TokLen = TokEnd - BufferPtr;
    295     Result.setLocation(getSourceLocation(BufferPtr));
    296     Result.setKind(Kind);
    297     Result.setLength(TokLen);
    298 #ifndef NDEBUG
    299     Result.TextPtr = "<UNSET>";
    300     Result.IntVal = 7;
    301 #endif
    302     BufferPtr = TokEnd;
    303   }
    304 
    305   void formTextToken(Token &Result, const char *TokEnd) {
    306     StringRef Text(BufferPtr, TokEnd - BufferPtr);
    307     formTokenWithChars(Result, TokEnd, tok::text);
    308     Result.setText(Text);
    309   }
    310 
    311   SourceLocation getSourceLocation(const char *Loc) const {
    312     assert(Loc >= BufferStart && Loc <= BufferEnd &&
    313            "Location out of range for this buffer!");
    314 
    315     const unsigned CharNo = Loc - BufferStart;
    316     return FileLoc.getLocWithOffset(CharNo);
    317   }
    318 
    319   /// Eat string matching regexp \code \s*\* \endcode.
    320   void skipLineStartingDecorations();
    321 
    322   /// Lex stuff inside comments.  CommentEnd should be set correctly.
    323   void lexCommentText(Token &T);
    324 
    325   void setupAndLexVerbatimBlock(Token &T,
    326                                 const char *TextBegin,
    327                                 char Marker, const CommandInfo *Info);
    328 
    329   void lexVerbatimBlockFirstLine(Token &T);
    330 
    331   void lexVerbatimBlockBody(Token &T);
    332 
    333   void setupAndLexVerbatimLine(Token &T, const char *TextBegin,
    334                                const CommandInfo *Info);
    335 
    336   void lexVerbatimLineText(Token &T);
    337 
    338   void lexHTMLCharacterReference(Token &T);
    339 
    340   void setupAndLexHTMLStartTag(Token &T);
    341 
    342   void lexHTMLStartTag(Token &T);
    343 
    344   void setupAndLexHTMLEndTag(Token &T);
    345 
    346   void lexHTMLEndTag(Token &T);
    347 
    348 public:
    349   Lexer(llvm::BumpPtrAllocator &Allocator, const CommandTraits &Traits,
    350         SourceLocation FileLoc,
    351         const char *BufferStart, const char *BufferEnd);
    352 
    353   void lex(Token &T);
    354 
    355   StringRef getSpelling(const Token &Tok,
    356                         const SourceManager &SourceMgr,
    357                         bool *Invalid = NULL) const;
    358 };
    359 
    360 } // end namespace comments
    361 } // end namespace clang
    362 
    363 #endif
    364 
    365