1 #include "clang/AST/CommentLexer.h" 2 #include "clang/AST/CommentCommandTraits.h" 3 #include "clang/AST/CommentDiagnostic.h" 4 #include "clang/Basic/CharInfo.h" 5 #include "llvm/ADT/StringExtras.h" 6 #include "llvm/ADT/StringSwitch.h" 7 #include "llvm/Support/ConvertUTF.h" 8 #include "llvm/Support/ErrorHandling.h" 9 10 namespace clang { 11 namespace comments { 12 13 void Token::dump(const Lexer &L, const SourceManager &SM) const { 14 llvm::errs() << "comments::Token Kind=" << Kind << " "; 15 Loc.dump(SM); 16 llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n"; 17 } 18 19 static inline bool isHTMLNamedCharacterReferenceCharacter(char C) { 20 return isLetter(C); 21 } 22 23 static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) { 24 return isDigit(C); 25 } 26 27 static inline bool isHTMLHexCharacterReferenceCharacter(char C) { 28 return isHexDigit(C); 29 } 30 31 static inline StringRef convertCodePointToUTF8( 32 llvm::BumpPtrAllocator &Allocator, 33 unsigned CodePoint) { 34 char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT); 35 char *ResolvedPtr = Resolved; 36 if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr)) 37 return StringRef(Resolved, ResolvedPtr - Resolved); 38 else 39 return StringRef(); 40 } 41 42 namespace { 43 44 #include "clang/AST/CommentHTMLTags.inc" 45 #include "clang/AST/CommentHTMLNamedCharacterReferences.inc" 46 47 } // unnamed namespace 48 49 StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const { 50 // Fast path, first check a few most widely used named character references. 51 return llvm::StringSwitch<StringRef>(Name) 52 .Case("amp", "&") 53 .Case("lt", "<") 54 .Case("gt", ">") 55 .Case("quot", "\"") 56 .Case("apos", "\'") 57 // Slow path. 58 .Default(translateHTMLNamedCharacterReferenceToUTF8(Name)); 59 } 60 61 StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const { 62 unsigned CodePoint = 0; 63 for (unsigned i = 0, e = Name.size(); i != e; ++i) { 64 assert(isHTMLDecimalCharacterReferenceCharacter(Name[i])); 65 CodePoint *= 10; 66 CodePoint += Name[i] - '0'; 67 } 68 return convertCodePointToUTF8(Allocator, CodePoint); 69 } 70 71 StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const { 72 unsigned CodePoint = 0; 73 for (unsigned i = 0, e = Name.size(); i != e; ++i) { 74 CodePoint *= 16; 75 const char C = Name[i]; 76 assert(isHTMLHexCharacterReferenceCharacter(C)); 77 CodePoint += llvm::hexDigitValue(C); 78 } 79 return convertCodePointToUTF8(Allocator, CodePoint); 80 } 81 82 void Lexer::skipLineStartingDecorations() { 83 // This function should be called only for C comments 84 assert(CommentState == LCS_InsideCComment); 85 86 if (BufferPtr == CommentEnd) 87 return; 88 89 switch (*BufferPtr) { 90 case ' ': 91 case '\t': 92 case '\f': 93 case '\v': { 94 const char *NewBufferPtr = BufferPtr; 95 NewBufferPtr++; 96 if (NewBufferPtr == CommentEnd) 97 return; 98 99 char C = *NewBufferPtr; 100 while (isHorizontalWhitespace(C)) { 101 NewBufferPtr++; 102 if (NewBufferPtr == CommentEnd) 103 return; 104 C = *NewBufferPtr; 105 } 106 if (C == '*') 107 BufferPtr = NewBufferPtr + 1; 108 break; 109 } 110 case '*': 111 BufferPtr++; 112 break; 113 } 114 } 115 116 namespace { 117 /// Returns pointer to the first newline character in the string. 118 const char *findNewline(const char *BufferPtr, const char *BufferEnd) { 119 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 120 if (isVerticalWhitespace(*BufferPtr)) 121 return BufferPtr; 122 } 123 return BufferEnd; 124 } 125 126 const char *skipNewline(const char *BufferPtr, const char *BufferEnd) { 127 if (BufferPtr == BufferEnd) 128 return BufferPtr; 129 130 if (*BufferPtr == '\n') 131 BufferPtr++; 132 else { 133 assert(*BufferPtr == '\r'); 134 BufferPtr++; 135 if (BufferPtr != BufferEnd && *BufferPtr == '\n') 136 BufferPtr++; 137 } 138 return BufferPtr; 139 } 140 141 const char *skipNamedCharacterReference(const char *BufferPtr, 142 const char *BufferEnd) { 143 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 144 if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr)) 145 return BufferPtr; 146 } 147 return BufferEnd; 148 } 149 150 const char *skipDecimalCharacterReference(const char *BufferPtr, 151 const char *BufferEnd) { 152 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 153 if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr)) 154 return BufferPtr; 155 } 156 return BufferEnd; 157 } 158 159 const char *skipHexCharacterReference(const char *BufferPtr, 160 const char *BufferEnd) { 161 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 162 if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr)) 163 return BufferPtr; 164 } 165 return BufferEnd; 166 } 167 168 bool isHTMLIdentifierStartingCharacter(char C) { 169 return isLetter(C); 170 } 171 172 bool isHTMLIdentifierCharacter(char C) { 173 return isAlphanumeric(C); 174 } 175 176 const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) { 177 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 178 if (!isHTMLIdentifierCharacter(*BufferPtr)) 179 return BufferPtr; 180 } 181 return BufferEnd; 182 } 183 184 /// Skip HTML string quoted in single or double quotes. Escaping quotes inside 185 /// string allowed. 186 /// 187 /// Returns pointer to closing quote. 188 const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd) 189 { 190 const char Quote = *BufferPtr; 191 assert(Quote == '\"' || Quote == '\''); 192 193 BufferPtr++; 194 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 195 const char C = *BufferPtr; 196 if (C == Quote && BufferPtr[-1] != '\\') 197 return BufferPtr; 198 } 199 return BufferEnd; 200 } 201 202 const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) { 203 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 204 if (!isWhitespace(*BufferPtr)) 205 return BufferPtr; 206 } 207 return BufferEnd; 208 } 209 210 bool isWhitespace(const char *BufferPtr, const char *BufferEnd) { 211 return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd; 212 } 213 214 bool isCommandNameStartCharacter(char C) { 215 return isLetter(C); 216 } 217 218 bool isCommandNameCharacter(char C) { 219 return isAlphanumeric(C); 220 } 221 222 const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) { 223 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 224 if (!isCommandNameCharacter(*BufferPtr)) 225 return BufferPtr; 226 } 227 return BufferEnd; 228 } 229 230 /// Return the one past end pointer for BCPL comments. 231 /// Handles newlines escaped with backslash or trigraph for backslahs. 232 const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) { 233 const char *CurPtr = BufferPtr; 234 while (CurPtr != BufferEnd) { 235 while (!isVerticalWhitespace(*CurPtr)) { 236 CurPtr++; 237 if (CurPtr == BufferEnd) 238 return BufferEnd; 239 } 240 // We found a newline, check if it is escaped. 241 const char *EscapePtr = CurPtr - 1; 242 while(isHorizontalWhitespace(*EscapePtr)) 243 EscapePtr--; 244 245 if (*EscapePtr == '\\' || 246 (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' && 247 EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) { 248 // We found an escaped newline. 249 CurPtr = skipNewline(CurPtr, BufferEnd); 250 } else 251 return CurPtr; // Not an escaped newline. 252 } 253 return BufferEnd; 254 } 255 256 /// Return the one past end pointer for C comments. 257 /// Very dumb, does not handle escaped newlines or trigraphs. 258 const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) { 259 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 260 if (*BufferPtr == '*') { 261 assert(BufferPtr + 1 != BufferEnd); 262 if (*(BufferPtr + 1) == '/') 263 return BufferPtr; 264 } 265 } 266 llvm_unreachable("buffer end hit before '*/' was seen"); 267 } 268 269 } // unnamed namespace 270 271 void Lexer::formTokenWithChars(Token &Result, const char *TokEnd, 272 tok::TokenKind Kind) { 273 const unsigned TokLen = TokEnd - BufferPtr; 274 Result.setLocation(getSourceLocation(BufferPtr)); 275 Result.setKind(Kind); 276 Result.setLength(TokLen); 277 #ifndef NDEBUG 278 Result.TextPtr = "<UNSET>"; 279 Result.IntVal = 7; 280 #endif 281 BufferPtr = TokEnd; 282 } 283 284 void Lexer::lexCommentText(Token &T) { 285 assert(CommentState == LCS_InsideBCPLComment || 286 CommentState == LCS_InsideCComment); 287 288 switch (State) { 289 case LS_Normal: 290 break; 291 case LS_VerbatimBlockFirstLine: 292 lexVerbatimBlockFirstLine(T); 293 return; 294 case LS_VerbatimBlockBody: 295 lexVerbatimBlockBody(T); 296 return; 297 case LS_VerbatimLineText: 298 lexVerbatimLineText(T); 299 return; 300 case LS_HTMLStartTag: 301 lexHTMLStartTag(T); 302 return; 303 case LS_HTMLEndTag: 304 lexHTMLEndTag(T); 305 return; 306 } 307 308 assert(State == LS_Normal); 309 310 const char *TokenPtr = BufferPtr; 311 assert(TokenPtr < CommentEnd); 312 while (TokenPtr != CommentEnd) { 313 switch(*TokenPtr) { 314 case '\\': 315 case '@': { 316 // Commands that start with a backslash and commands that start with 317 // 'at' have equivalent semantics. But we keep information about the 318 // exact syntax in AST for comments. 319 tok::TokenKind CommandKind = 320 (*TokenPtr == '@') ? tok::at_command : tok::backslash_command; 321 TokenPtr++; 322 if (TokenPtr == CommentEnd) { 323 formTextToken(T, TokenPtr); 324 return; 325 } 326 char C = *TokenPtr; 327 switch (C) { 328 default: 329 break; 330 331 case '\\': case '@': case '&': case '$': 332 case '#': case '<': case '>': case '%': 333 case '\"': case '.': case ':': 334 // This is one of \\ \@ \& \$ etc escape sequences. 335 TokenPtr++; 336 if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') { 337 // This is the \:: escape sequence. 338 TokenPtr++; 339 } 340 StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1)); 341 formTokenWithChars(T, TokenPtr, tok::text); 342 T.setText(UnescapedText); 343 return; 344 } 345 346 // Don't make zero-length commands. 347 if (!isCommandNameStartCharacter(*TokenPtr)) { 348 formTextToken(T, TokenPtr); 349 return; 350 } 351 352 TokenPtr = skipCommandName(TokenPtr, CommentEnd); 353 unsigned Length = TokenPtr - (BufferPtr + 1); 354 355 // Hardcoded support for lexing LaTeX formula commands 356 // \f$ \f[ \f] \f{ \f} as a single command. 357 if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) { 358 C = *TokenPtr; 359 if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') { 360 TokenPtr++; 361 Length++; 362 } 363 } 364 365 StringRef CommandName(BufferPtr + 1, Length); 366 367 const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName); 368 if (!Info) { 369 if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) { 370 StringRef CorrectedName = Info->Name; 371 SourceLocation Loc = getSourceLocation(BufferPtr); 372 SourceRange CommandRange(Loc.getLocWithOffset(1), 373 getSourceLocation(TokenPtr)); 374 Diag(Loc, diag::warn_correct_comment_command_name) 375 << CommandName << CorrectedName 376 << FixItHint::CreateReplacement(CommandRange, CorrectedName); 377 } else { 378 formTokenWithChars(T, TokenPtr, tok::unknown_command); 379 T.setUnknownCommandName(CommandName); 380 Diag(T.getLocation(), diag::warn_unknown_comment_command_name); 381 return; 382 } 383 } 384 if (Info->IsVerbatimBlockCommand) { 385 setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info); 386 return; 387 } 388 if (Info->IsVerbatimLineCommand) { 389 setupAndLexVerbatimLine(T, TokenPtr, Info); 390 return; 391 } 392 formTokenWithChars(T, TokenPtr, CommandKind); 393 T.setCommandID(Info->getID()); 394 return; 395 } 396 397 case '&': 398 lexHTMLCharacterReference(T); 399 return; 400 401 case '<': { 402 TokenPtr++; 403 if (TokenPtr == CommentEnd) { 404 formTextToken(T, TokenPtr); 405 return; 406 } 407 const char C = *TokenPtr; 408 if (isHTMLIdentifierStartingCharacter(C)) 409 setupAndLexHTMLStartTag(T); 410 else if (C == '/') 411 setupAndLexHTMLEndTag(T); 412 else 413 formTextToken(T, TokenPtr); 414 415 return; 416 } 417 418 case '\n': 419 case '\r': 420 TokenPtr = skipNewline(TokenPtr, CommentEnd); 421 formTokenWithChars(T, TokenPtr, tok::newline); 422 423 if (CommentState == LCS_InsideCComment) 424 skipLineStartingDecorations(); 425 return; 426 427 default: { 428 size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr). 429 find_first_of("\n\r\\@&<"); 430 if (End != StringRef::npos) 431 TokenPtr += End; 432 else 433 TokenPtr = CommentEnd; 434 formTextToken(T, TokenPtr); 435 return; 436 } 437 } 438 } 439 } 440 441 void Lexer::setupAndLexVerbatimBlock(Token &T, 442 const char *TextBegin, 443 char Marker, const CommandInfo *Info) { 444 assert(Info->IsVerbatimBlockCommand); 445 446 VerbatimBlockEndCommandName.clear(); 447 VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@"); 448 VerbatimBlockEndCommandName.append(Info->EndCommandName); 449 450 formTokenWithChars(T, TextBegin, tok::verbatim_block_begin); 451 T.setVerbatimBlockID(Info->getID()); 452 453 // If there is a newline following the verbatim opening command, skip the 454 // newline so that we don't create an tok::verbatim_block_line with empty 455 // text content. 456 if (BufferPtr != CommentEnd && 457 isVerticalWhitespace(*BufferPtr)) { 458 BufferPtr = skipNewline(BufferPtr, CommentEnd); 459 State = LS_VerbatimBlockBody; 460 return; 461 } 462 463 State = LS_VerbatimBlockFirstLine; 464 } 465 466 void Lexer::lexVerbatimBlockFirstLine(Token &T) { 467 again: 468 assert(BufferPtr < CommentEnd); 469 470 // FIXME: It would be better to scan the text once, finding either the block 471 // end command or newline. 472 // 473 // Extract current line. 474 const char *Newline = findNewline(BufferPtr, CommentEnd); 475 StringRef Line(BufferPtr, Newline - BufferPtr); 476 477 // Look for end command in current line. 478 size_t Pos = Line.find(VerbatimBlockEndCommandName); 479 const char *TextEnd; 480 const char *NextLine; 481 if (Pos == StringRef::npos) { 482 // Current line is completely verbatim. 483 TextEnd = Newline; 484 NextLine = skipNewline(Newline, CommentEnd); 485 } else if (Pos == 0) { 486 // Current line contains just an end command. 487 const char *End = BufferPtr + VerbatimBlockEndCommandName.size(); 488 StringRef Name(BufferPtr + 1, End - (BufferPtr + 1)); 489 formTokenWithChars(T, End, tok::verbatim_block_end); 490 T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID()); 491 State = LS_Normal; 492 return; 493 } else { 494 // There is some text, followed by end command. Extract text first. 495 TextEnd = BufferPtr + Pos; 496 NextLine = TextEnd; 497 // If there is only whitespace before end command, skip whitespace. 498 if (isWhitespace(BufferPtr, TextEnd)) { 499 BufferPtr = TextEnd; 500 goto again; 501 } 502 } 503 504 StringRef Text(BufferPtr, TextEnd - BufferPtr); 505 formTokenWithChars(T, NextLine, tok::verbatim_block_line); 506 T.setVerbatimBlockText(Text); 507 508 State = LS_VerbatimBlockBody; 509 } 510 511 void Lexer::lexVerbatimBlockBody(Token &T) { 512 assert(State == LS_VerbatimBlockBody); 513 514 if (CommentState == LCS_InsideCComment) 515 skipLineStartingDecorations(); 516 517 if (BufferPtr == CommentEnd) { 518 formTokenWithChars(T, BufferPtr, tok::verbatim_block_line); 519 T.setVerbatimBlockText(""); 520 return; 521 } 522 523 lexVerbatimBlockFirstLine(T); 524 } 525 526 void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin, 527 const CommandInfo *Info) { 528 assert(Info->IsVerbatimLineCommand); 529 formTokenWithChars(T, TextBegin, tok::verbatim_line_name); 530 T.setVerbatimLineID(Info->getID()); 531 532 State = LS_VerbatimLineText; 533 } 534 535 void Lexer::lexVerbatimLineText(Token &T) { 536 assert(State == LS_VerbatimLineText); 537 538 // Extract current line. 539 const char *Newline = findNewline(BufferPtr, CommentEnd); 540 StringRef Text(BufferPtr, Newline - BufferPtr); 541 formTokenWithChars(T, Newline, tok::verbatim_line_text); 542 T.setVerbatimLineText(Text); 543 544 State = LS_Normal; 545 } 546 547 void Lexer::lexHTMLCharacterReference(Token &T) { 548 const char *TokenPtr = BufferPtr; 549 assert(*TokenPtr == '&'); 550 TokenPtr++; 551 if (TokenPtr == CommentEnd) { 552 formTextToken(T, TokenPtr); 553 return; 554 } 555 const char *NamePtr; 556 bool isNamed = false; 557 bool isDecimal = false; 558 char C = *TokenPtr; 559 if (isHTMLNamedCharacterReferenceCharacter(C)) { 560 NamePtr = TokenPtr; 561 TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd); 562 isNamed = true; 563 } else if (C == '#') { 564 TokenPtr++; 565 if (TokenPtr == CommentEnd) { 566 formTextToken(T, TokenPtr); 567 return; 568 } 569 C = *TokenPtr; 570 if (isHTMLDecimalCharacterReferenceCharacter(C)) { 571 NamePtr = TokenPtr; 572 TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd); 573 isDecimal = true; 574 } else if (C == 'x' || C == 'X') { 575 TokenPtr++; 576 NamePtr = TokenPtr; 577 TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd); 578 } else { 579 formTextToken(T, TokenPtr); 580 return; 581 } 582 } else { 583 formTextToken(T, TokenPtr); 584 return; 585 } 586 if (NamePtr == TokenPtr || TokenPtr == CommentEnd || 587 *TokenPtr != ';') { 588 formTextToken(T, TokenPtr); 589 return; 590 } 591 StringRef Name(NamePtr, TokenPtr - NamePtr); 592 TokenPtr++; // Skip semicolon. 593 StringRef Resolved; 594 if (isNamed) 595 Resolved = resolveHTMLNamedCharacterReference(Name); 596 else if (isDecimal) 597 Resolved = resolveHTMLDecimalCharacterReference(Name); 598 else 599 Resolved = resolveHTMLHexCharacterReference(Name); 600 601 if (Resolved.empty()) { 602 formTextToken(T, TokenPtr); 603 return; 604 } 605 formTokenWithChars(T, TokenPtr, tok::text); 606 T.setText(Resolved); 607 return; 608 } 609 610 void Lexer::setupAndLexHTMLStartTag(Token &T) { 611 assert(BufferPtr[0] == '<' && 612 isHTMLIdentifierStartingCharacter(BufferPtr[1])); 613 const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd); 614 StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1)); 615 if (!isHTMLTagName(Name)) { 616 formTextToken(T, TagNameEnd); 617 return; 618 } 619 620 formTokenWithChars(T, TagNameEnd, tok::html_start_tag); 621 T.setHTMLTagStartName(Name); 622 623 BufferPtr = skipWhitespace(BufferPtr, CommentEnd); 624 625 const char C = *BufferPtr; 626 if (BufferPtr != CommentEnd && 627 (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C))) 628 State = LS_HTMLStartTag; 629 } 630 631 void Lexer::lexHTMLStartTag(Token &T) { 632 assert(State == LS_HTMLStartTag); 633 634 const char *TokenPtr = BufferPtr; 635 char C = *TokenPtr; 636 if (isHTMLIdentifierCharacter(C)) { 637 TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd); 638 StringRef Ident(BufferPtr, TokenPtr - BufferPtr); 639 formTokenWithChars(T, TokenPtr, tok::html_ident); 640 T.setHTMLIdent(Ident); 641 } else { 642 switch (C) { 643 case '=': 644 TokenPtr++; 645 formTokenWithChars(T, TokenPtr, tok::html_equals); 646 break; 647 case '\"': 648 case '\'': { 649 const char *OpenQuote = TokenPtr; 650 TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd); 651 const char *ClosingQuote = TokenPtr; 652 if (TokenPtr != CommentEnd) // Skip closing quote. 653 TokenPtr++; 654 formTokenWithChars(T, TokenPtr, tok::html_quoted_string); 655 T.setHTMLQuotedString(StringRef(OpenQuote + 1, 656 ClosingQuote - (OpenQuote + 1))); 657 break; 658 } 659 case '>': 660 TokenPtr++; 661 formTokenWithChars(T, TokenPtr, tok::html_greater); 662 State = LS_Normal; 663 return; 664 case '/': 665 TokenPtr++; 666 if (TokenPtr != CommentEnd && *TokenPtr == '>') { 667 TokenPtr++; 668 formTokenWithChars(T, TokenPtr, tok::html_slash_greater); 669 } else 670 formTextToken(T, TokenPtr); 671 672 State = LS_Normal; 673 return; 674 } 675 } 676 677 // Now look ahead and return to normal state if we don't see any HTML tokens 678 // ahead. 679 BufferPtr = skipWhitespace(BufferPtr, CommentEnd); 680 if (BufferPtr == CommentEnd) { 681 State = LS_Normal; 682 return; 683 } 684 685 C = *BufferPtr; 686 if (!isHTMLIdentifierStartingCharacter(C) && 687 C != '=' && C != '\"' && C != '\'' && C != '>') { 688 State = LS_Normal; 689 return; 690 } 691 } 692 693 void Lexer::setupAndLexHTMLEndTag(Token &T) { 694 assert(BufferPtr[0] == '<' && BufferPtr[1] == '/'); 695 696 const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd); 697 const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd); 698 StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin); 699 if (!isHTMLTagName(Name)) { 700 formTextToken(T, TagNameEnd); 701 return; 702 } 703 704 const char *End = skipWhitespace(TagNameEnd, CommentEnd); 705 706 formTokenWithChars(T, End, tok::html_end_tag); 707 T.setHTMLTagEndName(Name); 708 709 if (BufferPtr != CommentEnd && *BufferPtr == '>') 710 State = LS_HTMLEndTag; 711 } 712 713 void Lexer::lexHTMLEndTag(Token &T) { 714 assert(BufferPtr != CommentEnd && *BufferPtr == '>'); 715 716 formTokenWithChars(T, BufferPtr + 1, tok::html_greater); 717 State = LS_Normal; 718 } 719 720 Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags, 721 const CommandTraits &Traits, 722 SourceLocation FileLoc, 723 const char *BufferStart, const char *BufferEnd): 724 Allocator(Allocator), Diags(Diags), Traits(Traits), 725 BufferStart(BufferStart), BufferEnd(BufferEnd), 726 FileLoc(FileLoc), BufferPtr(BufferStart), 727 CommentState(LCS_BeforeComment), State(LS_Normal) { 728 } 729 730 void Lexer::lex(Token &T) { 731 again: 732 switch (CommentState) { 733 case LCS_BeforeComment: 734 if (BufferPtr == BufferEnd) { 735 formTokenWithChars(T, BufferPtr, tok::eof); 736 return; 737 } 738 739 assert(*BufferPtr == '/'); 740 BufferPtr++; // Skip first slash. 741 switch(*BufferPtr) { 742 case '/': { // BCPL comment. 743 BufferPtr++; // Skip second slash. 744 745 if (BufferPtr != BufferEnd) { 746 // Skip Doxygen magic marker, if it is present. 747 // It might be missing because of a typo //< or /*<, or because we 748 // merged this non-Doxygen comment into a bunch of Doxygen comments 749 // around it: /** ... */ /* ... */ /** ... */ 750 const char C = *BufferPtr; 751 if (C == '/' || C == '!') 752 BufferPtr++; 753 } 754 755 // Skip less-than symbol that marks trailing comments. 756 // Skip it even if the comment is not a Doxygen one, because //< and /*< 757 // are frequent typos. 758 if (BufferPtr != BufferEnd && *BufferPtr == '<') 759 BufferPtr++; 760 761 CommentState = LCS_InsideBCPLComment; 762 if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine) 763 State = LS_Normal; 764 CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd); 765 goto again; 766 } 767 case '*': { // C comment. 768 BufferPtr++; // Skip star. 769 770 // Skip Doxygen magic marker. 771 const char C = *BufferPtr; 772 if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!') 773 BufferPtr++; 774 775 // Skip less-than symbol that marks trailing comments. 776 if (BufferPtr != BufferEnd && *BufferPtr == '<') 777 BufferPtr++; 778 779 CommentState = LCS_InsideCComment; 780 State = LS_Normal; 781 CommentEnd = findCCommentEnd(BufferPtr, BufferEnd); 782 goto again; 783 } 784 default: 785 llvm_unreachable("second character of comment should be '/' or '*'"); 786 } 787 788 case LCS_BetweenComments: { 789 // Consecutive comments are extracted only if there is only whitespace 790 // between them. So we can search for the start of the next comment. 791 const char *EndWhitespace = BufferPtr; 792 while(EndWhitespace != BufferEnd && *EndWhitespace != '/') 793 EndWhitespace++; 794 795 // Turn any whitespace between comments (and there is only whitespace 796 // between them -- guaranteed by comment extraction) into a newline. We 797 // have two newlines between C comments in total (first one was synthesized 798 // after a comment). 799 formTokenWithChars(T, EndWhitespace, tok::newline); 800 801 CommentState = LCS_BeforeComment; 802 break; 803 } 804 805 case LCS_InsideBCPLComment: 806 case LCS_InsideCComment: 807 if (BufferPtr != CommentEnd) { 808 lexCommentText(T); 809 break; 810 } else { 811 // Skip C comment closing sequence. 812 if (CommentState == LCS_InsideCComment) { 813 assert(BufferPtr[0] == '*' && BufferPtr[1] == '/'); 814 BufferPtr += 2; 815 assert(BufferPtr <= BufferEnd); 816 817 // Synthenize newline just after the C comment, regardless if there is 818 // actually a newline. 819 formTokenWithChars(T, BufferPtr, tok::newline); 820 821 CommentState = LCS_BetweenComments; 822 break; 823 } else { 824 // Don't synthesized a newline after BCPL comment. 825 CommentState = LCS_BetweenComments; 826 goto again; 827 } 828 } 829 } 830 } 831 832 StringRef Lexer::getSpelling(const Token &Tok, 833 const SourceManager &SourceMgr, 834 bool *Invalid) const { 835 SourceLocation Loc = Tok.getLocation(); 836 std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc); 837 838 bool InvalidTemp = false; 839 StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp); 840 if (InvalidTemp) { 841 *Invalid = true; 842 return StringRef(); 843 } 844 845 const char *Begin = File.data() + LocInfo.second; 846 return StringRef(Begin, Tok.getLength()); 847 } 848 849 } // end namespace comments 850 } // end namespace clang 851 852