1 //===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file defines lexer for structured comments and supporting token class. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #ifndef LLVM_CLANG_AST_COMMENTLEXER_H 15 #define LLVM_CLANG_AST_COMMENTLEXER_H 16 17 #include "clang/Basic/Diagnostic.h" 18 #include "clang/Basic/SourceManager.h" 19 #include "llvm/ADT/SmallString.h" 20 #include "llvm/ADT/StringRef.h" 21 #include "llvm/Support/Allocator.h" 22 #include "llvm/Support/raw_ostream.h" 23 24 namespace clang { 25 namespace comments { 26 27 class Lexer; 28 class TextTokenRetokenizer; 29 struct CommandInfo; 30 class CommandTraits; 31 32 namespace tok { 33 enum TokenKind { 34 eof, 35 newline, 36 text, 37 unknown_command, // Command that does not have an ID. 38 backslash_command, // Command with an ID, that used backslash marker. 39 at_command, // Command with an ID, that used 'at' marker. 40 verbatim_block_begin, 41 verbatim_block_line, 42 verbatim_block_end, 43 verbatim_line_name, 44 verbatim_line_text, 45 html_start_tag, // <tag 46 html_ident, // attr 47 html_equals, // = 48 html_quoted_string, // "blah\"blah" or 'blah\'blah' 49 html_greater, // > 50 html_slash_greater, // /> 51 html_end_tag // </tag 52 }; 53 } // end namespace tok 54 55 /// \brief Comment token. 56 class Token { 57 friend class Lexer; 58 friend class TextTokenRetokenizer; 59 60 /// The location of the token. 61 SourceLocation Loc; 62 63 /// The actual kind of the token. 64 tok::TokenKind Kind; 65 66 /// Length of the token spelling in comment. Can be 0 for synthenized 67 /// tokens. 68 unsigned Length; 69 70 /// Contains text value associated with a token. 71 const char *TextPtr; 72 73 /// Integer value associated with a token. 74 /// 75 /// If the token is a konwn command, contains command ID and TextPtr is 76 /// unused (command spelling can be found with CommandTraits). Otherwise, 77 /// contains the length of the string that starts at TextPtr. 78 unsigned IntVal; 79 80 public: 81 SourceLocation getLocation() const LLVM_READONLY { return Loc; } 82 void setLocation(SourceLocation SL) { Loc = SL; } 83 84 SourceLocation getEndLocation() const LLVM_READONLY { 85 if (Length == 0 || Length == 1) 86 return Loc; 87 return Loc.getLocWithOffset(Length - 1); 88 } 89 90 tok::TokenKind getKind() const LLVM_READONLY { return Kind; } 91 void setKind(tok::TokenKind K) { Kind = K; } 92 93 bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; } 94 bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; } 95 96 unsigned getLength() const LLVM_READONLY { return Length; } 97 void setLength(unsigned L) { Length = L; } 98 99 StringRef getText() const LLVM_READONLY { 100 assert(is(tok::text)); 101 return StringRef(TextPtr, IntVal); 102 } 103 104 void setText(StringRef Text) { 105 assert(is(tok::text)); 106 TextPtr = Text.data(); 107 IntVal = Text.size(); 108 } 109 110 StringRef getUnknownCommandName() const LLVM_READONLY { 111 assert(is(tok::unknown_command)); 112 return StringRef(TextPtr, IntVal); 113 } 114 115 void setUnknownCommandName(StringRef Name) { 116 assert(is(tok::unknown_command)); 117 TextPtr = Name.data(); 118 IntVal = Name.size(); 119 } 120 121 unsigned getCommandID() const LLVM_READONLY { 122 assert(is(tok::backslash_command) || is(tok::at_command)); 123 return IntVal; 124 } 125 126 void setCommandID(unsigned ID) { 127 assert(is(tok::backslash_command) || is(tok::at_command)); 128 IntVal = ID; 129 } 130 131 unsigned getVerbatimBlockID() const LLVM_READONLY { 132 assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end)); 133 return IntVal; 134 } 135 136 void setVerbatimBlockID(unsigned ID) { 137 assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end)); 138 IntVal = ID; 139 } 140 141 StringRef getVerbatimBlockText() const LLVM_READONLY { 142 assert(is(tok::verbatim_block_line)); 143 return StringRef(TextPtr, IntVal); 144 } 145 146 void setVerbatimBlockText(StringRef Text) { 147 assert(is(tok::verbatim_block_line)); 148 TextPtr = Text.data(); 149 IntVal = Text.size(); 150 } 151 152 unsigned getVerbatimLineID() const LLVM_READONLY { 153 assert(is(tok::verbatim_line_name)); 154 return IntVal; 155 } 156 157 void setVerbatimLineID(unsigned ID) { 158 assert(is(tok::verbatim_line_name)); 159 IntVal = ID; 160 } 161 162 StringRef getVerbatimLineText() const LLVM_READONLY { 163 assert(is(tok::verbatim_line_text)); 164 return StringRef(TextPtr, IntVal); 165 } 166 167 void setVerbatimLineText(StringRef Text) { 168 assert(is(tok::verbatim_line_text)); 169 TextPtr = Text.data(); 170 IntVal = Text.size(); 171 } 172 173 StringRef getHTMLTagStartName() const LLVM_READONLY { 174 assert(is(tok::html_start_tag)); 175 return StringRef(TextPtr, IntVal); 176 } 177 178 void setHTMLTagStartName(StringRef Name) { 179 assert(is(tok::html_start_tag)); 180 TextPtr = Name.data(); 181 IntVal = Name.size(); 182 } 183 184 StringRef getHTMLIdent() const LLVM_READONLY { 185 assert(is(tok::html_ident)); 186 return StringRef(TextPtr, IntVal); 187 } 188 189 void setHTMLIdent(StringRef Name) { 190 assert(is(tok::html_ident)); 191 TextPtr = Name.data(); 192 IntVal = Name.size(); 193 } 194 195 StringRef getHTMLQuotedString() const LLVM_READONLY { 196 assert(is(tok::html_quoted_string)); 197 return StringRef(TextPtr, IntVal); 198 } 199 200 void setHTMLQuotedString(StringRef Str) { 201 assert(is(tok::html_quoted_string)); 202 TextPtr = Str.data(); 203 IntVal = Str.size(); 204 } 205 206 StringRef getHTMLTagEndName() const LLVM_READONLY { 207 assert(is(tok::html_end_tag)); 208 return StringRef(TextPtr, IntVal); 209 } 210 211 void setHTMLTagEndName(StringRef Name) { 212 assert(is(tok::html_end_tag)); 213 TextPtr = Name.data(); 214 IntVal = Name.size(); 215 } 216 217 void dump(const Lexer &L, const SourceManager &SM) const; 218 }; 219 220 /// \brief Comment lexer. 221 class Lexer { 222 private: 223 Lexer(const Lexer &) = delete; 224 void operator=(const Lexer &) = delete; 225 226 /// Allocator for strings that are semantic values of tokens and have to be 227 /// computed (for example, resolved decimal character references). 228 llvm::BumpPtrAllocator &Allocator; 229 230 DiagnosticsEngine &Diags; 231 232 const CommandTraits &Traits; 233 234 const char *const BufferStart; 235 const char *const BufferEnd; 236 SourceLocation FileLoc; 237 238 const char *BufferPtr; 239 240 /// One past end pointer for the current comment. For BCPL comments points 241 /// to newline or BufferEnd, for C comments points to star in '*/'. 242 const char *CommentEnd; 243 244 enum LexerCommentState { 245 LCS_BeforeComment, 246 LCS_InsideBCPLComment, 247 LCS_InsideCComment, 248 LCS_BetweenComments 249 }; 250 251 /// Low-level lexer state, track if we are inside or outside of comment. 252 LexerCommentState CommentState; 253 254 enum LexerState { 255 /// Lexing normal comment text 256 LS_Normal, 257 258 /// Finished lexing verbatim block beginning command, will lex first body 259 /// line. 260 LS_VerbatimBlockFirstLine, 261 262 /// Lexing verbatim block body line-by-line, skipping line-starting 263 /// decorations. 264 LS_VerbatimBlockBody, 265 266 /// Finished lexing verbatim line beginning command, will lex text (one 267 /// line). 268 LS_VerbatimLineText, 269 270 /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes. 271 LS_HTMLStartTag, 272 273 /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'. 274 LS_HTMLEndTag 275 }; 276 277 /// Current lexing mode. 278 LexerState State; 279 280 /// If State is LS_VerbatimBlock, contains the name of verbatim end 281 /// command, including command marker. 282 SmallString<16> VerbatimBlockEndCommandName; 283 284 /// Given a character reference name (e.g., "lt"), return the character that 285 /// it stands for (e.g., "<"). 286 StringRef resolveHTMLNamedCharacterReference(StringRef Name) const; 287 288 /// Given a Unicode codepoint as base-10 integer, return the character. 289 StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const; 290 291 /// Given a Unicode codepoint as base-16 integer, return the character. 292 StringRef resolveHTMLHexCharacterReference(StringRef Name) const; 293 294 void formTokenWithChars(Token &Result, const char *TokEnd, 295 tok::TokenKind Kind); 296 297 void formTextToken(Token &Result, const char *TokEnd) { 298 StringRef Text(BufferPtr, TokEnd - BufferPtr); 299 formTokenWithChars(Result, TokEnd, tok::text); 300 Result.setText(Text); 301 } 302 303 SourceLocation getSourceLocation(const char *Loc) const { 304 assert(Loc >= BufferStart && Loc <= BufferEnd && 305 "Location out of range for this buffer!"); 306 307 const unsigned CharNo = Loc - BufferStart; 308 return FileLoc.getLocWithOffset(CharNo); 309 } 310 311 DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) { 312 return Diags.Report(Loc, DiagID); 313 } 314 315 /// Eat string matching regexp \code \s*\* \endcode. 316 void skipLineStartingDecorations(); 317 318 /// Lex stuff inside comments. CommentEnd should be set correctly. 319 void lexCommentText(Token &T); 320 321 void setupAndLexVerbatimBlock(Token &T, 322 const char *TextBegin, 323 char Marker, const CommandInfo *Info); 324 325 void lexVerbatimBlockFirstLine(Token &T); 326 327 void lexVerbatimBlockBody(Token &T); 328 329 void setupAndLexVerbatimLine(Token &T, const char *TextBegin, 330 const CommandInfo *Info); 331 332 void lexVerbatimLineText(Token &T); 333 334 void lexHTMLCharacterReference(Token &T); 335 336 void setupAndLexHTMLStartTag(Token &T); 337 338 void lexHTMLStartTag(Token &T); 339 340 void setupAndLexHTMLEndTag(Token &T); 341 342 void lexHTMLEndTag(Token &T); 343 344 public: 345 Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags, 346 const CommandTraits &Traits, 347 SourceLocation FileLoc, 348 const char *BufferStart, const char *BufferEnd); 349 350 void lex(Token &T); 351 352 StringRef getSpelling(const Token &Tok, 353 const SourceManager &SourceMgr, 354 bool *Invalid = nullptr) const; 355 }; 356 357 } // end namespace comments 358 } // end namespace clang 359 360 #endif 361 362