1 //===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file defines lexer for structured comments and supporting token class. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #ifndef LLVM_CLANG_AST_COMMENT_LEXER_H 15 #define LLVM_CLANG_AST_COMMENT_LEXER_H 16 17 #include "clang/Basic/Diagnostic.h" 18 #include "clang/Basic/SourceManager.h" 19 #include "llvm/ADT/SmallString.h" 20 #include "llvm/ADT/SmallVector.h" 21 #include "llvm/ADT/StringRef.h" 22 #include "llvm/Support/Allocator.h" 23 #include "llvm/Support/raw_ostream.h" 24 25 namespace clang { 26 namespace comments { 27 28 class Lexer; 29 class TextTokenRetokenizer; 30 struct CommandInfo; 31 class CommandTraits; 32 33 namespace tok { 34 enum TokenKind { 35 eof, 36 newline, 37 text, 38 unknown_command, // Command that does not have an ID. 39 backslash_command, // Command with an ID, that used backslash marker. 40 at_command, // Command with an ID, that used 'at' marker. 41 verbatim_block_begin, 42 verbatim_block_line, 43 verbatim_block_end, 44 verbatim_line_name, 45 verbatim_line_text, 46 html_start_tag, // <tag 47 html_ident, // attr 48 html_equals, // = 49 html_quoted_string, // "blah\"blah" or 'blah\'blah' 50 html_greater, // > 51 html_slash_greater, // /> 52 html_end_tag // </tag 53 }; 54 } // end namespace tok 55 56 /// \brief Comment token. 57 class Token { 58 friend class Lexer; 59 friend class TextTokenRetokenizer; 60 61 /// The location of the token. 62 SourceLocation Loc; 63 64 /// The actual kind of the token. 65 tok::TokenKind Kind; 66 67 /// Length of the token spelling in comment. Can be 0 for synthenized 68 /// tokens. 69 unsigned Length; 70 71 /// Contains text value associated with a token. 72 const char *TextPtr; 73 74 /// Integer value associated with a token. 75 /// 76 /// If the token is a konwn command, contains command ID and TextPtr is 77 /// unused (command spelling can be found with CommandTraits). Otherwise, 78 /// contains the length of the string that starts at TextPtr. 79 unsigned IntVal; 80 81 public: 82 SourceLocation getLocation() const LLVM_READONLY { return Loc; } 83 void setLocation(SourceLocation SL) { Loc = SL; } 84 85 SourceLocation getEndLocation() const LLVM_READONLY { 86 if (Length == 0 || Length == 1) 87 return Loc; 88 return Loc.getLocWithOffset(Length - 1); 89 } 90 91 tok::TokenKind getKind() const LLVM_READONLY { return Kind; } 92 void setKind(tok::TokenKind K) { Kind = K; } 93 94 bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; } 95 bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; } 96 97 unsigned getLength() const LLVM_READONLY { return Length; } 98 void setLength(unsigned L) { Length = L; } 99 100 StringRef getText() const LLVM_READONLY { 101 assert(is(tok::text)); 102 return StringRef(TextPtr, IntVal); 103 } 104 105 void setText(StringRef Text) { 106 assert(is(tok::text)); 107 TextPtr = Text.data(); 108 IntVal = Text.size(); 109 } 110 111 StringRef getUnknownCommandName() const LLVM_READONLY { 112 assert(is(tok::unknown_command)); 113 return StringRef(TextPtr, IntVal); 114 } 115 116 void setUnknownCommandName(StringRef Name) { 117 assert(is(tok::unknown_command)); 118 TextPtr = Name.data(); 119 IntVal = Name.size(); 120 } 121 122 unsigned getCommandID() const LLVM_READONLY { 123 assert(is(tok::backslash_command) || is(tok::at_command)); 124 return IntVal; 125 } 126 127 void setCommandID(unsigned ID) { 128 assert(is(tok::backslash_command) || is(tok::at_command)); 129 IntVal = ID; 130 } 131 132 unsigned getVerbatimBlockID() const LLVM_READONLY { 133 assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end)); 134 return IntVal; 135 } 136 137 void setVerbatimBlockID(unsigned ID) { 138 assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end)); 139 IntVal = ID; 140 } 141 142 StringRef getVerbatimBlockText() const LLVM_READONLY { 143 assert(is(tok::verbatim_block_line)); 144 return StringRef(TextPtr, IntVal); 145 } 146 147 void setVerbatimBlockText(StringRef Text) { 148 assert(is(tok::verbatim_block_line)); 149 TextPtr = Text.data(); 150 IntVal = Text.size(); 151 } 152 153 unsigned getVerbatimLineID() const LLVM_READONLY { 154 assert(is(tok::verbatim_line_name)); 155 return IntVal; 156 } 157 158 void setVerbatimLineID(unsigned ID) { 159 assert(is(tok::verbatim_line_name)); 160 IntVal = ID; 161 } 162 163 StringRef getVerbatimLineText() const LLVM_READONLY { 164 assert(is(tok::verbatim_line_text)); 165 return StringRef(TextPtr, IntVal); 166 } 167 168 void setVerbatimLineText(StringRef Text) { 169 assert(is(tok::verbatim_line_text)); 170 TextPtr = Text.data(); 171 IntVal = Text.size(); 172 } 173 174 StringRef getHTMLTagStartName() const LLVM_READONLY { 175 assert(is(tok::html_start_tag)); 176 return StringRef(TextPtr, IntVal); 177 } 178 179 void setHTMLTagStartName(StringRef Name) { 180 assert(is(tok::html_start_tag)); 181 TextPtr = Name.data(); 182 IntVal = Name.size(); 183 } 184 185 StringRef getHTMLIdent() const LLVM_READONLY { 186 assert(is(tok::html_ident)); 187 return StringRef(TextPtr, IntVal); 188 } 189 190 void setHTMLIdent(StringRef Name) { 191 assert(is(tok::html_ident)); 192 TextPtr = Name.data(); 193 IntVal = Name.size(); 194 } 195 196 StringRef getHTMLQuotedString() const LLVM_READONLY { 197 assert(is(tok::html_quoted_string)); 198 return StringRef(TextPtr, IntVal); 199 } 200 201 void setHTMLQuotedString(StringRef Str) { 202 assert(is(tok::html_quoted_string)); 203 TextPtr = Str.data(); 204 IntVal = Str.size(); 205 } 206 207 StringRef getHTMLTagEndName() const LLVM_READONLY { 208 assert(is(tok::html_end_tag)); 209 return StringRef(TextPtr, IntVal); 210 } 211 212 void setHTMLTagEndName(StringRef Name) { 213 assert(is(tok::html_end_tag)); 214 TextPtr = Name.data(); 215 IntVal = Name.size(); 216 } 217 218 void dump(const Lexer &L, const SourceManager &SM) const; 219 }; 220 221 /// \brief Comment lexer. 222 class Lexer { 223 private: 224 Lexer(const Lexer &) LLVM_DELETED_FUNCTION; 225 void operator=(const Lexer &) LLVM_DELETED_FUNCTION; 226 227 /// Allocator for strings that are semantic values of tokens and have to be 228 /// computed (for example, resolved decimal character references). 229 llvm::BumpPtrAllocator &Allocator; 230 231 DiagnosticsEngine &Diags; 232 233 const CommandTraits &Traits; 234 235 const char *const BufferStart; 236 const char *const BufferEnd; 237 SourceLocation FileLoc; 238 239 const char *BufferPtr; 240 241 /// One past end pointer for the current comment. For BCPL comments points 242 /// to newline or BufferEnd, for C comments points to star in '*/'. 243 const char *CommentEnd; 244 245 enum LexerCommentState { 246 LCS_BeforeComment, 247 LCS_InsideBCPLComment, 248 LCS_InsideCComment, 249 LCS_BetweenComments 250 }; 251 252 /// Low-level lexer state, track if we are inside or outside of comment. 253 LexerCommentState CommentState; 254 255 enum LexerState { 256 /// Lexing normal comment text 257 LS_Normal, 258 259 /// Finished lexing verbatim block beginning command, will lex first body 260 /// line. 261 LS_VerbatimBlockFirstLine, 262 263 /// Lexing verbatim block body line-by-line, skipping line-starting 264 /// decorations. 265 LS_VerbatimBlockBody, 266 267 /// Finished lexing verbatim line beginning command, will lex text (one 268 /// line). 269 LS_VerbatimLineText, 270 271 /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes. 272 LS_HTMLStartTag, 273 274 /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'. 275 LS_HTMLEndTag 276 }; 277 278 /// Current lexing mode. 279 LexerState State; 280 281 /// If State is LS_VerbatimBlock, contains the name of verbatim end 282 /// command, including command marker. 283 SmallString<16> VerbatimBlockEndCommandName; 284 285 /// Given a character reference name (e.g., "lt"), return the character that 286 /// it stands for (e.g., "<"). 287 StringRef resolveHTMLNamedCharacterReference(StringRef Name) const; 288 289 /// Given a Unicode codepoint as base-10 integer, return the character. 290 StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const; 291 292 /// Given a Unicode codepoint as base-16 integer, return the character. 293 StringRef resolveHTMLHexCharacterReference(StringRef Name) const; 294 295 void formTokenWithChars(Token &Result, const char *TokEnd, 296 tok::TokenKind Kind); 297 298 void formTextToken(Token &Result, const char *TokEnd) { 299 StringRef Text(BufferPtr, TokEnd - BufferPtr); 300 formTokenWithChars(Result, TokEnd, tok::text); 301 Result.setText(Text); 302 } 303 304 SourceLocation getSourceLocation(const char *Loc) const { 305 assert(Loc >= BufferStart && Loc <= BufferEnd && 306 "Location out of range for this buffer!"); 307 308 const unsigned CharNo = Loc - BufferStart; 309 return FileLoc.getLocWithOffset(CharNo); 310 } 311 312 DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) { 313 return Diags.Report(Loc, DiagID); 314 } 315 316 /// Eat string matching regexp \code \s*\* \endcode. 317 void skipLineStartingDecorations(); 318 319 /// Lex stuff inside comments. CommentEnd should be set correctly. 320 void lexCommentText(Token &T); 321 322 void setupAndLexVerbatimBlock(Token &T, 323 const char *TextBegin, 324 char Marker, const CommandInfo *Info); 325 326 void lexVerbatimBlockFirstLine(Token &T); 327 328 void lexVerbatimBlockBody(Token &T); 329 330 void setupAndLexVerbatimLine(Token &T, const char *TextBegin, 331 const CommandInfo *Info); 332 333 void lexVerbatimLineText(Token &T); 334 335 void lexHTMLCharacterReference(Token &T); 336 337 void setupAndLexHTMLStartTag(Token &T); 338 339 void lexHTMLStartTag(Token &T); 340 341 void setupAndLexHTMLEndTag(Token &T); 342 343 void lexHTMLEndTag(Token &T); 344 345 public: 346 Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags, 347 const CommandTraits &Traits, 348 SourceLocation FileLoc, 349 const char *BufferStart, const char *BufferEnd); 350 351 void lex(Token &T); 352 353 StringRef getSpelling(const Token &Tok, 354 const SourceManager &SourceMgr, 355 bool *Invalid = nullptr) const; 356 }; 357 358 } // end namespace comments 359 } // end namespace clang 360 361 #endif 362 363