1 #include "clang/AST/CommentLexer.h" 2 #include "clang/AST/CommentCommandTraits.h" 3 #include "clang/AST/CommentDiagnostic.h" 4 #include "clang/Basic/CharInfo.h" 5 #include "llvm/ADT/StringExtras.h" 6 #include "llvm/ADT/StringSwitch.h" 7 #include "llvm/Support/ConvertUTF.h" 8 #include "llvm/Support/ErrorHandling.h" 9 10 namespace clang { 11 namespace comments { 12 13 void Token::dump(const Lexer &L, const SourceManager &SM) const { 14 llvm::errs() << "comments::Token Kind=" << Kind << " "; 15 Loc.dump(SM); 16 llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n"; 17 } 18 19 static inline bool isHTMLNamedCharacterReferenceCharacter(char C) { 20 return isLetter(C); 21 } 22 23 static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) { 24 return isDigit(C); 25 } 26 27 static inline bool isHTMLHexCharacterReferenceCharacter(char C) { 28 return isHexDigit(C); 29 } 30 31 static inline StringRef convertCodePointToUTF8( 32 llvm::BumpPtrAllocator &Allocator, 33 unsigned CodePoint) { 34 char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT); 35 char *ResolvedPtr = Resolved; 36 if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr)) 37 return StringRef(Resolved, ResolvedPtr - Resolved); 38 else 39 return StringRef(); 40 } 41 42 namespace { 43 44 #include "clang/AST/CommentHTMLTags.inc" 45 #include "clang/AST/CommentHTMLNamedCharacterReferences.inc" 46 47 } // unnamed namespace 48 49 StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const { 50 // Fast path, first check a few most widely used named character references. 51 return llvm::StringSwitch<StringRef>(Name) 52 .Case("amp", "&") 53 .Case("lt", "<") 54 .Case("gt", ">") 55 .Case("quot", "\"") 56 .Case("apos", "\'") 57 // Slow path. 58 .Default(translateHTMLNamedCharacterReferenceToUTF8(Name)); 59 } 60 61 StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const { 62 unsigned CodePoint = 0; 63 for (unsigned i = 0, e = Name.size(); i != e; ++i) { 64 assert(isHTMLDecimalCharacterReferenceCharacter(Name[i])); 65 CodePoint *= 10; 66 CodePoint += Name[i] - '0'; 67 } 68 return convertCodePointToUTF8(Allocator, CodePoint); 69 } 70 71 StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const { 72 unsigned CodePoint = 0; 73 for (unsigned i = 0, e = Name.size(); i != e; ++i) { 74 CodePoint *= 16; 75 const char C = Name[i]; 76 assert(isHTMLHexCharacterReferenceCharacter(C)); 77 CodePoint += llvm::hexDigitValue(C); 78 } 79 return convertCodePointToUTF8(Allocator, CodePoint); 80 } 81 82 void Lexer::skipLineStartingDecorations() { 83 // This function should be called only for C comments 84 assert(CommentState == LCS_InsideCComment); 85 86 if (BufferPtr == CommentEnd) 87 return; 88 89 switch (*BufferPtr) { 90 case ' ': 91 case '\t': 92 case '\f': 93 case '\v': { 94 const char *NewBufferPtr = BufferPtr; 95 NewBufferPtr++; 96 if (NewBufferPtr == CommentEnd) 97 return; 98 99 char C = *NewBufferPtr; 100 while (isHorizontalWhitespace(C)) { 101 NewBufferPtr++; 102 if (NewBufferPtr == CommentEnd) 103 return; 104 C = *NewBufferPtr; 105 } 106 if (C == '*') 107 BufferPtr = NewBufferPtr + 1; 108 break; 109 } 110 case '*': 111 BufferPtr++; 112 break; 113 } 114 } 115 116 namespace { 117 /// Returns pointer to the first newline character in the string. 118 const char *findNewline(const char *BufferPtr, const char *BufferEnd) { 119 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 120 if (isVerticalWhitespace(*BufferPtr)) 121 return BufferPtr; 122 } 123 return BufferEnd; 124 } 125 126 const char *skipNewline(const char *BufferPtr, const char *BufferEnd) { 127 if (BufferPtr == BufferEnd) 128 return BufferPtr; 129 130 if (*BufferPtr == '\n') 131 BufferPtr++; 132 else { 133 assert(*BufferPtr == '\r'); 134 BufferPtr++; 135 if (BufferPtr != BufferEnd && *BufferPtr == '\n') 136 BufferPtr++; 137 } 138 return BufferPtr; 139 } 140 141 const char *skipNamedCharacterReference(const char *BufferPtr, 142 const char *BufferEnd) { 143 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 144 if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr)) 145 return BufferPtr; 146 } 147 return BufferEnd; 148 } 149 150 const char *skipDecimalCharacterReference(const char *BufferPtr, 151 const char *BufferEnd) { 152 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 153 if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr)) 154 return BufferPtr; 155 } 156 return BufferEnd; 157 } 158 159 const char *skipHexCharacterReference(const char *BufferPtr, 160 const char *BufferEnd) { 161 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 162 if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr)) 163 return BufferPtr; 164 } 165 return BufferEnd; 166 } 167 168 bool isHTMLIdentifierStartingCharacter(char C) { 169 return isLetter(C); 170 } 171 172 bool isHTMLIdentifierCharacter(char C) { 173 return isAlphanumeric(C); 174 } 175 176 const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) { 177 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 178 if (!isHTMLIdentifierCharacter(*BufferPtr)) 179 return BufferPtr; 180 } 181 return BufferEnd; 182 } 183 184 /// Skip HTML string quoted in single or double quotes. Escaping quotes inside 185 /// string allowed. 186 /// 187 /// Returns pointer to closing quote. 188 const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd) 189 { 190 const char Quote = *BufferPtr; 191 assert(Quote == '\"' || Quote == '\''); 192 193 BufferPtr++; 194 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 195 const char C = *BufferPtr; 196 if (C == Quote && BufferPtr[-1] != '\\') 197 return BufferPtr; 198 } 199 return BufferEnd; 200 } 201 202 const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) { 203 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 204 if (!isWhitespace(*BufferPtr)) 205 return BufferPtr; 206 } 207 return BufferEnd; 208 } 209 210 bool isWhitespace(const char *BufferPtr, const char *BufferEnd) { 211 return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd; 212 } 213 214 bool isCommandNameStartCharacter(char C) { 215 return isLetter(C); 216 } 217 218 bool isCommandNameCharacter(char C) { 219 return isAlphanumeric(C); 220 } 221 222 const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) { 223 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 224 if (!isCommandNameCharacter(*BufferPtr)) 225 return BufferPtr; 226 } 227 return BufferEnd; 228 } 229 230 /// Return the one past end pointer for BCPL comments. 231 /// Handles newlines escaped with backslash or trigraph for backslahs. 232 const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) { 233 const char *CurPtr = BufferPtr; 234 while (CurPtr != BufferEnd) { 235 while (!isVerticalWhitespace(*CurPtr)) { 236 CurPtr++; 237 if (CurPtr == BufferEnd) 238 return BufferEnd; 239 } 240 // We found a newline, check if it is escaped. 241 const char *EscapePtr = CurPtr - 1; 242 while(isHorizontalWhitespace(*EscapePtr)) 243 EscapePtr--; 244 245 if (*EscapePtr == '\\' || 246 (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' && 247 EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) { 248 // We found an escaped newline. 249 CurPtr = skipNewline(CurPtr, BufferEnd); 250 } else 251 return CurPtr; // Not an escaped newline. 252 } 253 return BufferEnd; 254 } 255 256 /// Return the one past end pointer for C comments. 257 /// Very dumb, does not handle escaped newlines or trigraphs. 258 const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) { 259 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 260 if (*BufferPtr == '*') { 261 assert(BufferPtr + 1 != BufferEnd); 262 if (*(BufferPtr + 1) == '/') 263 return BufferPtr; 264 } 265 } 266 llvm_unreachable("buffer end hit before '*/' was seen"); 267 } 268 269 } // unnamed namespace 270 271 void Lexer::lexCommentText(Token &T) { 272 assert(CommentState == LCS_InsideBCPLComment || 273 CommentState == LCS_InsideCComment); 274 275 switch (State) { 276 case LS_Normal: 277 break; 278 case LS_VerbatimBlockFirstLine: 279 lexVerbatimBlockFirstLine(T); 280 return; 281 case LS_VerbatimBlockBody: 282 lexVerbatimBlockBody(T); 283 return; 284 case LS_VerbatimLineText: 285 lexVerbatimLineText(T); 286 return; 287 case LS_HTMLStartTag: 288 lexHTMLStartTag(T); 289 return; 290 case LS_HTMLEndTag: 291 lexHTMLEndTag(T); 292 return; 293 } 294 295 assert(State == LS_Normal); 296 297 const char *TokenPtr = BufferPtr; 298 assert(TokenPtr < CommentEnd); 299 while (TokenPtr != CommentEnd) { 300 switch(*TokenPtr) { 301 case '\\': 302 case '@': { 303 // Commands that start with a backslash and commands that start with 304 // 'at' have equivalent semantics. But we keep information about the 305 // exact syntax in AST for comments. 306 tok::TokenKind CommandKind = 307 (*TokenPtr == '@') ? tok::at_command : tok::backslash_command; 308 TokenPtr++; 309 if (TokenPtr == CommentEnd) { 310 formTextToken(T, TokenPtr); 311 return; 312 } 313 char C = *TokenPtr; 314 switch (C) { 315 default: 316 break; 317 318 case '\\': case '@': case '&': case '$': 319 case '#': case '<': case '>': case '%': 320 case '\"': case '.': case ':': 321 // This is one of \\ \@ \& \$ etc escape sequences. 322 TokenPtr++; 323 if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') { 324 // This is the \:: escape sequence. 325 TokenPtr++; 326 } 327 StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1)); 328 formTokenWithChars(T, TokenPtr, tok::text); 329 T.setText(UnescapedText); 330 return; 331 } 332 333 // Don't make zero-length commands. 334 if (!isCommandNameStartCharacter(*TokenPtr)) { 335 formTextToken(T, TokenPtr); 336 return; 337 } 338 339 TokenPtr = skipCommandName(TokenPtr, CommentEnd); 340 unsigned Length = TokenPtr - (BufferPtr + 1); 341 342 // Hardcoded support for lexing LaTeX formula commands 343 // \f$ \f[ \f] \f{ \f} as a single command. 344 if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) { 345 C = *TokenPtr; 346 if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') { 347 TokenPtr++; 348 Length++; 349 } 350 } 351 352 const StringRef CommandName(BufferPtr + 1, Length); 353 354 const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName); 355 if (!Info) { 356 formTokenWithChars(T, TokenPtr, tok::unknown_command); 357 T.setUnknownCommandName(CommandName); 358 if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) { 359 StringRef CorrectedName = Info->Name; 360 SourceRange CommandRange(T.getLocation().getLocWithOffset(1), 361 T.getEndLocation()); 362 Diag(T.getLocation(), diag::warn_correct_comment_command_name) 363 << CommandName << CorrectedName 364 << FixItHint::CreateReplacement(CommandRange, CorrectedName); 365 } else { 366 Diag(T.getLocation(), diag::warn_unknown_comment_command_name); 367 return; 368 } 369 } 370 if (Info->IsVerbatimBlockCommand) { 371 setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info); 372 return; 373 } 374 if (Info->IsVerbatimLineCommand) { 375 setupAndLexVerbatimLine(T, TokenPtr, Info); 376 return; 377 } 378 formTokenWithChars(T, TokenPtr, CommandKind); 379 T.setCommandID(Info->getID()); 380 return; 381 } 382 383 case '&': 384 lexHTMLCharacterReference(T); 385 return; 386 387 case '<': { 388 TokenPtr++; 389 if (TokenPtr == CommentEnd) { 390 formTextToken(T, TokenPtr); 391 return; 392 } 393 const char C = *TokenPtr; 394 if (isHTMLIdentifierStartingCharacter(C)) 395 setupAndLexHTMLStartTag(T); 396 else if (C == '/') 397 setupAndLexHTMLEndTag(T); 398 else 399 formTextToken(T, TokenPtr); 400 401 return; 402 } 403 404 case '\n': 405 case '\r': 406 TokenPtr = skipNewline(TokenPtr, CommentEnd); 407 formTokenWithChars(T, TokenPtr, tok::newline); 408 409 if (CommentState == LCS_InsideCComment) 410 skipLineStartingDecorations(); 411 return; 412 413 default: { 414 size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr). 415 find_first_of("\n\r\\@&<"); 416 if (End != StringRef::npos) 417 TokenPtr += End; 418 else 419 TokenPtr = CommentEnd; 420 formTextToken(T, TokenPtr); 421 return; 422 } 423 } 424 } 425 } 426 427 void Lexer::setupAndLexVerbatimBlock(Token &T, 428 const char *TextBegin, 429 char Marker, const CommandInfo *Info) { 430 assert(Info->IsVerbatimBlockCommand); 431 432 VerbatimBlockEndCommandName.clear(); 433 VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@"); 434 VerbatimBlockEndCommandName.append(Info->EndCommandName); 435 436 formTokenWithChars(T, TextBegin, tok::verbatim_block_begin); 437 T.setVerbatimBlockID(Info->getID()); 438 439 // If there is a newline following the verbatim opening command, skip the 440 // newline so that we don't create an tok::verbatim_block_line with empty 441 // text content. 442 if (BufferPtr != CommentEnd && 443 isVerticalWhitespace(*BufferPtr)) { 444 BufferPtr = skipNewline(BufferPtr, CommentEnd); 445 State = LS_VerbatimBlockBody; 446 return; 447 } 448 449 State = LS_VerbatimBlockFirstLine; 450 } 451 452 void Lexer::lexVerbatimBlockFirstLine(Token &T) { 453 again: 454 assert(BufferPtr < CommentEnd); 455 456 // FIXME: It would be better to scan the text once, finding either the block 457 // end command or newline. 458 // 459 // Extract current line. 460 const char *Newline = findNewline(BufferPtr, CommentEnd); 461 StringRef Line(BufferPtr, Newline - BufferPtr); 462 463 // Look for end command in current line. 464 size_t Pos = Line.find(VerbatimBlockEndCommandName); 465 const char *TextEnd; 466 const char *NextLine; 467 if (Pos == StringRef::npos) { 468 // Current line is completely verbatim. 469 TextEnd = Newline; 470 NextLine = skipNewline(Newline, CommentEnd); 471 } else if (Pos == 0) { 472 // Current line contains just an end command. 473 const char *End = BufferPtr + VerbatimBlockEndCommandName.size(); 474 StringRef Name(BufferPtr + 1, End - (BufferPtr + 1)); 475 formTokenWithChars(T, End, tok::verbatim_block_end); 476 T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID()); 477 State = LS_Normal; 478 return; 479 } else { 480 // There is some text, followed by end command. Extract text first. 481 TextEnd = BufferPtr + Pos; 482 NextLine = TextEnd; 483 // If there is only whitespace before end command, skip whitespace. 484 if (isWhitespace(BufferPtr, TextEnd)) { 485 BufferPtr = TextEnd; 486 goto again; 487 } 488 } 489 490 StringRef Text(BufferPtr, TextEnd - BufferPtr); 491 formTokenWithChars(T, NextLine, tok::verbatim_block_line); 492 T.setVerbatimBlockText(Text); 493 494 State = LS_VerbatimBlockBody; 495 } 496 497 void Lexer::lexVerbatimBlockBody(Token &T) { 498 assert(State == LS_VerbatimBlockBody); 499 500 if (CommentState == LCS_InsideCComment) 501 skipLineStartingDecorations(); 502 503 lexVerbatimBlockFirstLine(T); 504 } 505 506 void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin, 507 const CommandInfo *Info) { 508 assert(Info->IsVerbatimLineCommand); 509 formTokenWithChars(T, TextBegin, tok::verbatim_line_name); 510 T.setVerbatimLineID(Info->getID()); 511 512 State = LS_VerbatimLineText; 513 } 514 515 void Lexer::lexVerbatimLineText(Token &T) { 516 assert(State == LS_VerbatimLineText); 517 518 // Extract current line. 519 const char *Newline = findNewline(BufferPtr, CommentEnd); 520 const StringRef Text(BufferPtr, Newline - BufferPtr); 521 formTokenWithChars(T, Newline, tok::verbatim_line_text); 522 T.setVerbatimLineText(Text); 523 524 State = LS_Normal; 525 } 526 527 void Lexer::lexHTMLCharacterReference(Token &T) { 528 const char *TokenPtr = BufferPtr; 529 assert(*TokenPtr == '&'); 530 TokenPtr++; 531 if (TokenPtr == CommentEnd) { 532 formTextToken(T, TokenPtr); 533 return; 534 } 535 const char *NamePtr; 536 bool isNamed = false; 537 bool isDecimal = false; 538 char C = *TokenPtr; 539 if (isHTMLNamedCharacterReferenceCharacter(C)) { 540 NamePtr = TokenPtr; 541 TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd); 542 isNamed = true; 543 } else if (C == '#') { 544 TokenPtr++; 545 if (TokenPtr == CommentEnd) { 546 formTextToken(T, TokenPtr); 547 return; 548 } 549 C = *TokenPtr; 550 if (isHTMLDecimalCharacterReferenceCharacter(C)) { 551 NamePtr = TokenPtr; 552 TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd); 553 isDecimal = true; 554 } else if (C == 'x' || C == 'X') { 555 TokenPtr++; 556 NamePtr = TokenPtr; 557 TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd); 558 } else { 559 formTextToken(T, TokenPtr); 560 return; 561 } 562 } else { 563 formTextToken(T, TokenPtr); 564 return; 565 } 566 if (NamePtr == TokenPtr || TokenPtr == CommentEnd || 567 *TokenPtr != ';') { 568 formTextToken(T, TokenPtr); 569 return; 570 } 571 StringRef Name(NamePtr, TokenPtr - NamePtr); 572 TokenPtr++; // Skip semicolon. 573 StringRef Resolved; 574 if (isNamed) 575 Resolved = resolveHTMLNamedCharacterReference(Name); 576 else if (isDecimal) 577 Resolved = resolveHTMLDecimalCharacterReference(Name); 578 else 579 Resolved = resolveHTMLHexCharacterReference(Name); 580 581 if (Resolved.empty()) { 582 formTextToken(T, TokenPtr); 583 return; 584 } 585 formTokenWithChars(T, TokenPtr, tok::text); 586 T.setText(Resolved); 587 return; 588 } 589 590 void Lexer::setupAndLexHTMLStartTag(Token &T) { 591 assert(BufferPtr[0] == '<' && 592 isHTMLIdentifierStartingCharacter(BufferPtr[1])); 593 const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd); 594 StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1)); 595 if (!isHTMLTagName(Name)) { 596 formTextToken(T, TagNameEnd); 597 return; 598 } 599 600 formTokenWithChars(T, TagNameEnd, tok::html_start_tag); 601 T.setHTMLTagStartName(Name); 602 603 BufferPtr = skipWhitespace(BufferPtr, CommentEnd); 604 605 const char C = *BufferPtr; 606 if (BufferPtr != CommentEnd && 607 (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C))) 608 State = LS_HTMLStartTag; 609 } 610 611 void Lexer::lexHTMLStartTag(Token &T) { 612 assert(State == LS_HTMLStartTag); 613 614 const char *TokenPtr = BufferPtr; 615 char C = *TokenPtr; 616 if (isHTMLIdentifierCharacter(C)) { 617 TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd); 618 StringRef Ident(BufferPtr, TokenPtr - BufferPtr); 619 formTokenWithChars(T, TokenPtr, tok::html_ident); 620 T.setHTMLIdent(Ident); 621 } else { 622 switch (C) { 623 case '=': 624 TokenPtr++; 625 formTokenWithChars(T, TokenPtr, tok::html_equals); 626 break; 627 case '\"': 628 case '\'': { 629 const char *OpenQuote = TokenPtr; 630 TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd); 631 const char *ClosingQuote = TokenPtr; 632 if (TokenPtr != CommentEnd) // Skip closing quote. 633 TokenPtr++; 634 formTokenWithChars(T, TokenPtr, tok::html_quoted_string); 635 T.setHTMLQuotedString(StringRef(OpenQuote + 1, 636 ClosingQuote - (OpenQuote + 1))); 637 break; 638 } 639 case '>': 640 TokenPtr++; 641 formTokenWithChars(T, TokenPtr, tok::html_greater); 642 State = LS_Normal; 643 return; 644 case '/': 645 TokenPtr++; 646 if (TokenPtr != CommentEnd && *TokenPtr == '>') { 647 TokenPtr++; 648 formTokenWithChars(T, TokenPtr, tok::html_slash_greater); 649 } else 650 formTextToken(T, TokenPtr); 651 652 State = LS_Normal; 653 return; 654 } 655 } 656 657 // Now look ahead and return to normal state if we don't see any HTML tokens 658 // ahead. 659 BufferPtr = skipWhitespace(BufferPtr, CommentEnd); 660 if (BufferPtr == CommentEnd) { 661 State = LS_Normal; 662 return; 663 } 664 665 C = *BufferPtr; 666 if (!isHTMLIdentifierStartingCharacter(C) && 667 C != '=' && C != '\"' && C != '\'' && C != '>') { 668 State = LS_Normal; 669 return; 670 } 671 } 672 673 void Lexer::setupAndLexHTMLEndTag(Token &T) { 674 assert(BufferPtr[0] == '<' && BufferPtr[1] == '/'); 675 676 const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd); 677 const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd); 678 StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin); 679 if (!isHTMLTagName(Name)) { 680 formTextToken(T, TagNameEnd); 681 return; 682 } 683 684 const char *End = skipWhitespace(TagNameEnd, CommentEnd); 685 686 formTokenWithChars(T, End, tok::html_end_tag); 687 T.setHTMLTagEndName(Name); 688 689 if (BufferPtr != CommentEnd && *BufferPtr == '>') 690 State = LS_HTMLEndTag; 691 } 692 693 void Lexer::lexHTMLEndTag(Token &T) { 694 assert(BufferPtr != CommentEnd && *BufferPtr == '>'); 695 696 formTokenWithChars(T, BufferPtr + 1, tok::html_greater); 697 State = LS_Normal; 698 } 699 700 Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags, 701 const CommandTraits &Traits, 702 SourceLocation FileLoc, 703 const char *BufferStart, const char *BufferEnd): 704 Allocator(Allocator), Diags(Diags), Traits(Traits), 705 BufferStart(BufferStart), BufferEnd(BufferEnd), 706 FileLoc(FileLoc), BufferPtr(BufferStart), 707 CommentState(LCS_BeforeComment), State(LS_Normal) { 708 } 709 710 void Lexer::lex(Token &T) { 711 again: 712 switch (CommentState) { 713 case LCS_BeforeComment: 714 if (BufferPtr == BufferEnd) { 715 formTokenWithChars(T, BufferPtr, tok::eof); 716 return; 717 } 718 719 assert(*BufferPtr == '/'); 720 BufferPtr++; // Skip first slash. 721 switch(*BufferPtr) { 722 case '/': { // BCPL comment. 723 BufferPtr++; // Skip second slash. 724 725 if (BufferPtr != BufferEnd) { 726 // Skip Doxygen magic marker, if it is present. 727 // It might be missing because of a typo //< or /*<, or because we 728 // merged this non-Doxygen comment into a bunch of Doxygen comments 729 // around it: /** ... */ /* ... */ /** ... */ 730 const char C = *BufferPtr; 731 if (C == '/' || C == '!') 732 BufferPtr++; 733 } 734 735 // Skip less-than symbol that marks trailing comments. 736 // Skip it even if the comment is not a Doxygen one, because //< and /*< 737 // are frequent typos. 738 if (BufferPtr != BufferEnd && *BufferPtr == '<') 739 BufferPtr++; 740 741 CommentState = LCS_InsideBCPLComment; 742 if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine) 743 State = LS_Normal; 744 CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd); 745 goto again; 746 } 747 case '*': { // C comment. 748 BufferPtr++; // Skip star. 749 750 // Skip Doxygen magic marker. 751 const char C = *BufferPtr; 752 if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!') 753 BufferPtr++; 754 755 // Skip less-than symbol that marks trailing comments. 756 if (BufferPtr != BufferEnd && *BufferPtr == '<') 757 BufferPtr++; 758 759 CommentState = LCS_InsideCComment; 760 State = LS_Normal; 761 CommentEnd = findCCommentEnd(BufferPtr, BufferEnd); 762 goto again; 763 } 764 default: 765 llvm_unreachable("second character of comment should be '/' or '*'"); 766 } 767 768 case LCS_BetweenComments: { 769 // Consecutive comments are extracted only if there is only whitespace 770 // between them. So we can search for the start of the next comment. 771 const char *EndWhitespace = BufferPtr; 772 while(EndWhitespace != BufferEnd && *EndWhitespace != '/') 773 EndWhitespace++; 774 775 // Turn any whitespace between comments (and there is only whitespace 776 // between them -- guaranteed by comment extraction) into a newline. We 777 // have two newlines between C comments in total (first one was synthesized 778 // after a comment). 779 formTokenWithChars(T, EndWhitespace, tok::newline); 780 781 CommentState = LCS_BeforeComment; 782 break; 783 } 784 785 case LCS_InsideBCPLComment: 786 case LCS_InsideCComment: 787 if (BufferPtr != CommentEnd) { 788 lexCommentText(T); 789 break; 790 } else { 791 // Skip C comment closing sequence. 792 if (CommentState == LCS_InsideCComment) { 793 assert(BufferPtr[0] == '*' && BufferPtr[1] == '/'); 794 BufferPtr += 2; 795 assert(BufferPtr <= BufferEnd); 796 797 // Synthenize newline just after the C comment, regardless if there is 798 // actually a newline. 799 formTokenWithChars(T, BufferPtr, tok::newline); 800 801 CommentState = LCS_BetweenComments; 802 break; 803 } else { 804 // Don't synthesized a newline after BCPL comment. 805 CommentState = LCS_BetweenComments; 806 goto again; 807 } 808 } 809 } 810 } 811 812 StringRef Lexer::getSpelling(const Token &Tok, 813 const SourceManager &SourceMgr, 814 bool *Invalid) const { 815 SourceLocation Loc = Tok.getLocation(); 816 std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc); 817 818 bool InvalidTemp = false; 819 StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp); 820 if (InvalidTemp) { 821 *Invalid = true; 822 return StringRef(); 823 } 824 825 const char *Begin = File.data() + LocInfo.second; 826 return StringRef(Begin, Tok.getLength()); 827 } 828 829 } // end namespace comments 830 } // end namespace clang 831 832