1 #include "clang/AST/CommentLexer.h" 2 #include "clang/AST/CommentCommandTraits.h" 3 #include "clang/AST/CommentDiagnostic.h" 4 #include "clang/Basic/CharInfo.h" 5 #include "llvm/ADT/StringExtras.h" 6 #include "llvm/ADT/StringSwitch.h" 7 #include "llvm/Support/ConvertUTF.h" 8 #include "llvm/Support/ErrorHandling.h" 9 10 namespace clang { 11 namespace comments { 12 13 void Token::dump(const Lexer &L, const SourceManager &SM) const { 14 llvm::errs() << "comments::Token Kind=" << Kind << " "; 15 Loc.dump(SM); 16 llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n"; 17 } 18 19 static inline bool isHTMLNamedCharacterReferenceCharacter(char C) { 20 return isLetter(C); 21 } 22 23 static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) { 24 return isDigit(C); 25 } 26 27 static inline bool isHTMLHexCharacterReferenceCharacter(char C) { 28 return isHexDigit(C); 29 } 30 31 static inline StringRef convertCodePointToUTF8( 32 llvm::BumpPtrAllocator &Allocator, 33 unsigned CodePoint) { 34 char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT); 35 char *ResolvedPtr = Resolved; 36 if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr)) 37 return StringRef(Resolved, ResolvedPtr - Resolved); 38 else 39 return StringRef(); 40 } 41 42 namespace { 43 44 #include "clang/AST/CommentHTMLTags.inc" 45 #include "clang/AST/CommentHTMLNamedCharacterReferences.inc" 46 47 } // unnamed namespace 48 49 StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const { 50 // Fast path, first check a few most widely used named character references. 51 return llvm::StringSwitch<StringRef>(Name) 52 .Case("amp", "&") 53 .Case("lt", "<") 54 .Case("gt", ">") 55 .Case("quot", "\"") 56 .Case("apos", "\'") 57 // Slow path. 58 .Default(translateHTMLNamedCharacterReferenceToUTF8(Name)); 59 } 60 61 StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const { 62 unsigned CodePoint = 0; 63 for (unsigned i = 0, e = Name.size(); i != e; ++i) { 64 assert(isHTMLDecimalCharacterReferenceCharacter(Name[i])); 65 CodePoint *= 10; 66 CodePoint += Name[i] - '0'; 67 } 68 return convertCodePointToUTF8(Allocator, CodePoint); 69 } 70 71 StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const { 72 unsigned CodePoint = 0; 73 for (unsigned i = 0, e = Name.size(); i != e; ++i) { 74 CodePoint *= 16; 75 const char C = Name[i]; 76 assert(isHTMLHexCharacterReferenceCharacter(C)); 77 CodePoint += llvm::hexDigitValue(C); 78 } 79 return convertCodePointToUTF8(Allocator, CodePoint); 80 } 81 82 void Lexer::skipLineStartingDecorations() { 83 // This function should be called only for C comments 84 assert(CommentState == LCS_InsideCComment); 85 86 if (BufferPtr == CommentEnd) 87 return; 88 89 switch (*BufferPtr) { 90 case ' ': 91 case '\t': 92 case '\f': 93 case '\v': { 94 const char *NewBufferPtr = BufferPtr; 95 NewBufferPtr++; 96 if (NewBufferPtr == CommentEnd) 97 return; 98 99 char C = *NewBufferPtr; 100 while (isHorizontalWhitespace(C)) { 101 NewBufferPtr++; 102 if (NewBufferPtr == CommentEnd) 103 return; 104 C = *NewBufferPtr; 105 } 106 if (C == '*') 107 BufferPtr = NewBufferPtr + 1; 108 break; 109 } 110 case '*': 111 BufferPtr++; 112 break; 113 } 114 } 115 116 namespace { 117 /// Returns pointer to the first newline character in the string. 118 const char *findNewline(const char *BufferPtr, const char *BufferEnd) { 119 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 120 if (isVerticalWhitespace(*BufferPtr)) 121 return BufferPtr; 122 } 123 return BufferEnd; 124 } 125 126 const char *skipNewline(const char *BufferPtr, const char *BufferEnd) { 127 if (BufferPtr == BufferEnd) 128 return BufferPtr; 129 130 if (*BufferPtr == '\n') 131 BufferPtr++; 132 else { 133 assert(*BufferPtr == '\r'); 134 BufferPtr++; 135 if (BufferPtr != BufferEnd && *BufferPtr == '\n') 136 BufferPtr++; 137 } 138 return BufferPtr; 139 } 140 141 const char *skipNamedCharacterReference(const char *BufferPtr, 142 const char *BufferEnd) { 143 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 144 if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr)) 145 return BufferPtr; 146 } 147 return BufferEnd; 148 } 149 150 const char *skipDecimalCharacterReference(const char *BufferPtr, 151 const char *BufferEnd) { 152 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 153 if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr)) 154 return BufferPtr; 155 } 156 return BufferEnd; 157 } 158 159 const char *skipHexCharacterReference(const char *BufferPtr, 160 const char *BufferEnd) { 161 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 162 if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr)) 163 return BufferPtr; 164 } 165 return BufferEnd; 166 } 167 168 bool isHTMLIdentifierStartingCharacter(char C) { 169 return isLetter(C); 170 } 171 172 bool isHTMLIdentifierCharacter(char C) { 173 return isAlphanumeric(C); 174 } 175 176 const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) { 177 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 178 if (!isHTMLIdentifierCharacter(*BufferPtr)) 179 return BufferPtr; 180 } 181 return BufferEnd; 182 } 183 184 /// Skip HTML string quoted in single or double quotes. Escaping quotes inside 185 /// string allowed. 186 /// 187 /// Returns pointer to closing quote. 188 const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd) 189 { 190 const char Quote = *BufferPtr; 191 assert(Quote == '\"' || Quote == '\''); 192 193 BufferPtr++; 194 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 195 const char C = *BufferPtr; 196 if (C == Quote && BufferPtr[-1] != '\\') 197 return BufferPtr; 198 } 199 return BufferEnd; 200 } 201 202 const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) { 203 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 204 if (!isWhitespace(*BufferPtr)) 205 return BufferPtr; 206 } 207 return BufferEnd; 208 } 209 210 bool isWhitespace(const char *BufferPtr, const char *BufferEnd) { 211 return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd; 212 } 213 214 bool isCommandNameStartCharacter(char C) { 215 return isLetter(C); 216 } 217 218 bool isCommandNameCharacter(char C) { 219 return isAlphanumeric(C); 220 } 221 222 const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) { 223 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 224 if (!isCommandNameCharacter(*BufferPtr)) 225 return BufferPtr; 226 } 227 return BufferEnd; 228 } 229 230 /// Return the one past end pointer for BCPL comments. 231 /// Handles newlines escaped with backslash or trigraph for backslahs. 232 const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) { 233 const char *CurPtr = BufferPtr; 234 while (CurPtr != BufferEnd) { 235 while (!isVerticalWhitespace(*CurPtr)) { 236 CurPtr++; 237 if (CurPtr == BufferEnd) 238 return BufferEnd; 239 } 240 // We found a newline, check if it is escaped. 241 const char *EscapePtr = CurPtr - 1; 242 while(isHorizontalWhitespace(*EscapePtr)) 243 EscapePtr--; 244 245 if (*EscapePtr == '\\' || 246 (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' && 247 EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) { 248 // We found an escaped newline. 249 CurPtr = skipNewline(CurPtr, BufferEnd); 250 } else 251 return CurPtr; // Not an escaped newline. 252 } 253 return BufferEnd; 254 } 255 256 /// Return the one past end pointer for C comments. 257 /// Very dumb, does not handle escaped newlines or trigraphs. 258 const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) { 259 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 260 if (*BufferPtr == '*') { 261 assert(BufferPtr + 1 != BufferEnd); 262 if (*(BufferPtr + 1) == '/') 263 return BufferPtr; 264 } 265 } 266 llvm_unreachable("buffer end hit before '*/' was seen"); 267 } 268 269 } // unnamed namespace 270 271 void Lexer::formTokenWithChars(Token &Result, const char *TokEnd, 272 tok::TokenKind Kind) { 273 const unsigned TokLen = TokEnd - BufferPtr; 274 Result.setLocation(getSourceLocation(BufferPtr)); 275 Result.setKind(Kind); 276 Result.setLength(TokLen); 277 #ifndef NDEBUG 278 Result.TextPtr = "<UNSET>"; 279 Result.IntVal = 7; 280 #endif 281 BufferPtr = TokEnd; 282 } 283 284 void Lexer::lexCommentText(Token &T) { 285 assert(CommentState == LCS_InsideBCPLComment || 286 CommentState == LCS_InsideCComment); 287 288 switch (State) { 289 case LS_Normal: 290 break; 291 case LS_VerbatimBlockFirstLine: 292 lexVerbatimBlockFirstLine(T); 293 return; 294 case LS_VerbatimBlockBody: 295 lexVerbatimBlockBody(T); 296 return; 297 case LS_VerbatimLineText: 298 lexVerbatimLineText(T); 299 return; 300 case LS_HTMLStartTag: 301 lexHTMLStartTag(T); 302 return; 303 case LS_HTMLEndTag: 304 lexHTMLEndTag(T); 305 return; 306 } 307 308 assert(State == LS_Normal); 309 310 const char *TokenPtr = BufferPtr; 311 assert(TokenPtr < CommentEnd); 312 while (TokenPtr != CommentEnd) { 313 switch(*TokenPtr) { 314 case '\\': 315 case '@': { 316 // Commands that start with a backslash and commands that start with 317 // 'at' have equivalent semantics. But we keep information about the 318 // exact syntax in AST for comments. 319 tok::TokenKind CommandKind = 320 (*TokenPtr == '@') ? tok::at_command : tok::backslash_command; 321 TokenPtr++; 322 if (TokenPtr == CommentEnd) { 323 formTextToken(T, TokenPtr); 324 return; 325 } 326 char C = *TokenPtr; 327 switch (C) { 328 default: 329 break; 330 331 case '\\': case '@': case '&': case '$': 332 case '#': case '<': case '>': case '%': 333 case '\"': case '.': case ':': 334 // This is one of \\ \@ \& \$ etc escape sequences. 335 TokenPtr++; 336 if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') { 337 // This is the \:: escape sequence. 338 TokenPtr++; 339 } 340 StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1)); 341 formTokenWithChars(T, TokenPtr, tok::text); 342 T.setText(UnescapedText); 343 return; 344 } 345 346 // Don't make zero-length commands. 347 if (!isCommandNameStartCharacter(*TokenPtr)) { 348 formTextToken(T, TokenPtr); 349 return; 350 } 351 352 TokenPtr = skipCommandName(TokenPtr, CommentEnd); 353 unsigned Length = TokenPtr - (BufferPtr + 1); 354 355 // Hardcoded support for lexing LaTeX formula commands 356 // \f$ \f[ \f] \f{ \f} as a single command. 357 if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) { 358 C = *TokenPtr; 359 if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') { 360 TokenPtr++; 361 Length++; 362 } 363 } 364 365 const StringRef CommandName(BufferPtr + 1, Length); 366 367 const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName); 368 if (!Info) { 369 if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) { 370 StringRef CorrectedName = Info->Name; 371 SourceLocation Loc = getSourceLocation(BufferPtr); 372 SourceRange CommandRange(Loc.getLocWithOffset(1), 373 getSourceLocation(TokenPtr)); 374 Diag(Loc, diag::warn_correct_comment_command_name) 375 << CommandName << CorrectedName 376 << FixItHint::CreateReplacement(CommandRange, CorrectedName); 377 } else { 378 formTokenWithChars(T, TokenPtr, tok::unknown_command); 379 T.setUnknownCommandName(CommandName); 380 Diag(T.getLocation(), diag::warn_unknown_comment_command_name); 381 return; 382 } 383 } 384 if (Info->IsVerbatimBlockCommand) { 385 setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info); 386 return; 387 } 388 if (Info->IsVerbatimLineCommand) { 389 setupAndLexVerbatimLine(T, TokenPtr, Info); 390 return; 391 } 392 formTokenWithChars(T, TokenPtr, CommandKind); 393 T.setCommandID(Info->getID()); 394 return; 395 } 396 397 case '&': 398 lexHTMLCharacterReference(T); 399 return; 400 401 case '<': { 402 TokenPtr++; 403 if (TokenPtr == CommentEnd) { 404 formTextToken(T, TokenPtr); 405 return; 406 } 407 const char C = *TokenPtr; 408 if (isHTMLIdentifierStartingCharacter(C)) 409 setupAndLexHTMLStartTag(T); 410 else if (C == '/') 411 setupAndLexHTMLEndTag(T); 412 else 413 formTextToken(T, TokenPtr); 414 415 return; 416 } 417 418 case '\n': 419 case '\r': 420 TokenPtr = skipNewline(TokenPtr, CommentEnd); 421 formTokenWithChars(T, TokenPtr, tok::newline); 422 423 if (CommentState == LCS_InsideCComment) 424 skipLineStartingDecorations(); 425 return; 426 427 default: { 428 size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr). 429 find_first_of("\n\r\\@&<"); 430 if (End != StringRef::npos) 431 TokenPtr += End; 432 else 433 TokenPtr = CommentEnd; 434 formTextToken(T, TokenPtr); 435 return; 436 } 437 } 438 } 439 } 440 441 void Lexer::setupAndLexVerbatimBlock(Token &T, 442 const char *TextBegin, 443 char Marker, const CommandInfo *Info) { 444 assert(Info->IsVerbatimBlockCommand); 445 446 VerbatimBlockEndCommandName.clear(); 447 VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@"); 448 VerbatimBlockEndCommandName.append(Info->EndCommandName); 449 450 formTokenWithChars(T, TextBegin, tok::verbatim_block_begin); 451 T.setVerbatimBlockID(Info->getID()); 452 453 // If there is a newline following the verbatim opening command, skip the 454 // newline so that we don't create an tok::verbatim_block_line with empty 455 // text content. 456 if (BufferPtr != CommentEnd && 457 isVerticalWhitespace(*BufferPtr)) { 458 BufferPtr = skipNewline(BufferPtr, CommentEnd); 459 State = LS_VerbatimBlockBody; 460 return; 461 } 462 463 State = LS_VerbatimBlockFirstLine; 464 } 465 466 void Lexer::lexVerbatimBlockFirstLine(Token &T) { 467 again: 468 assert(BufferPtr < CommentEnd); 469 470 // FIXME: It would be better to scan the text once, finding either the block 471 // end command or newline. 472 // 473 // Extract current line. 474 const char *Newline = findNewline(BufferPtr, CommentEnd); 475 StringRef Line(BufferPtr, Newline - BufferPtr); 476 477 // Look for end command in current line. 478 size_t Pos = Line.find(VerbatimBlockEndCommandName); 479 const char *TextEnd; 480 const char *NextLine; 481 if (Pos == StringRef::npos) { 482 // Current line is completely verbatim. 483 TextEnd = Newline; 484 NextLine = skipNewline(Newline, CommentEnd); 485 } else if (Pos == 0) { 486 // Current line contains just an end command. 487 const char *End = BufferPtr + VerbatimBlockEndCommandName.size(); 488 StringRef Name(BufferPtr + 1, End - (BufferPtr + 1)); 489 formTokenWithChars(T, End, tok::verbatim_block_end); 490 T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID()); 491 State = LS_Normal; 492 return; 493 } else { 494 // There is some text, followed by end command. Extract text first. 495 TextEnd = BufferPtr + Pos; 496 NextLine = TextEnd; 497 // If there is only whitespace before end command, skip whitespace. 498 if (isWhitespace(BufferPtr, TextEnd)) { 499 BufferPtr = TextEnd; 500 goto again; 501 } 502 } 503 504 StringRef Text(BufferPtr, TextEnd - BufferPtr); 505 formTokenWithChars(T, NextLine, tok::verbatim_block_line); 506 T.setVerbatimBlockText(Text); 507 508 State = LS_VerbatimBlockBody; 509 } 510 511 void Lexer::lexVerbatimBlockBody(Token &T) { 512 assert(State == LS_VerbatimBlockBody); 513 514 if (CommentState == LCS_InsideCComment) 515 skipLineStartingDecorations(); 516 517 lexVerbatimBlockFirstLine(T); 518 } 519 520 void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin, 521 const CommandInfo *Info) { 522 assert(Info->IsVerbatimLineCommand); 523 formTokenWithChars(T, TextBegin, tok::verbatim_line_name); 524 T.setVerbatimLineID(Info->getID()); 525 526 State = LS_VerbatimLineText; 527 } 528 529 void Lexer::lexVerbatimLineText(Token &T) { 530 assert(State == LS_VerbatimLineText); 531 532 // Extract current line. 533 const char *Newline = findNewline(BufferPtr, CommentEnd); 534 const StringRef Text(BufferPtr, Newline - BufferPtr); 535 formTokenWithChars(T, Newline, tok::verbatim_line_text); 536 T.setVerbatimLineText(Text); 537 538 State = LS_Normal; 539 } 540 541 void Lexer::lexHTMLCharacterReference(Token &T) { 542 const char *TokenPtr = BufferPtr; 543 assert(*TokenPtr == '&'); 544 TokenPtr++; 545 if (TokenPtr == CommentEnd) { 546 formTextToken(T, TokenPtr); 547 return; 548 } 549 const char *NamePtr; 550 bool isNamed = false; 551 bool isDecimal = false; 552 char C = *TokenPtr; 553 if (isHTMLNamedCharacterReferenceCharacter(C)) { 554 NamePtr = TokenPtr; 555 TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd); 556 isNamed = true; 557 } else if (C == '#') { 558 TokenPtr++; 559 if (TokenPtr == CommentEnd) { 560 formTextToken(T, TokenPtr); 561 return; 562 } 563 C = *TokenPtr; 564 if (isHTMLDecimalCharacterReferenceCharacter(C)) { 565 NamePtr = TokenPtr; 566 TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd); 567 isDecimal = true; 568 } else if (C == 'x' || C == 'X') { 569 TokenPtr++; 570 NamePtr = TokenPtr; 571 TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd); 572 } else { 573 formTextToken(T, TokenPtr); 574 return; 575 } 576 } else { 577 formTextToken(T, TokenPtr); 578 return; 579 } 580 if (NamePtr == TokenPtr || TokenPtr == CommentEnd || 581 *TokenPtr != ';') { 582 formTextToken(T, TokenPtr); 583 return; 584 } 585 StringRef Name(NamePtr, TokenPtr - NamePtr); 586 TokenPtr++; // Skip semicolon. 587 StringRef Resolved; 588 if (isNamed) 589 Resolved = resolveHTMLNamedCharacterReference(Name); 590 else if (isDecimal) 591 Resolved = resolveHTMLDecimalCharacterReference(Name); 592 else 593 Resolved = resolveHTMLHexCharacterReference(Name); 594 595 if (Resolved.empty()) { 596 formTextToken(T, TokenPtr); 597 return; 598 } 599 formTokenWithChars(T, TokenPtr, tok::text); 600 T.setText(Resolved); 601 return; 602 } 603 604 void Lexer::setupAndLexHTMLStartTag(Token &T) { 605 assert(BufferPtr[0] == '<' && 606 isHTMLIdentifierStartingCharacter(BufferPtr[1])); 607 const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd); 608 StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1)); 609 if (!isHTMLTagName(Name)) { 610 formTextToken(T, TagNameEnd); 611 return; 612 } 613 614 formTokenWithChars(T, TagNameEnd, tok::html_start_tag); 615 T.setHTMLTagStartName(Name); 616 617 BufferPtr = skipWhitespace(BufferPtr, CommentEnd); 618 619 const char C = *BufferPtr; 620 if (BufferPtr != CommentEnd && 621 (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C))) 622 State = LS_HTMLStartTag; 623 } 624 625 void Lexer::lexHTMLStartTag(Token &T) { 626 assert(State == LS_HTMLStartTag); 627 628 const char *TokenPtr = BufferPtr; 629 char C = *TokenPtr; 630 if (isHTMLIdentifierCharacter(C)) { 631 TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd); 632 StringRef Ident(BufferPtr, TokenPtr - BufferPtr); 633 formTokenWithChars(T, TokenPtr, tok::html_ident); 634 T.setHTMLIdent(Ident); 635 } else { 636 switch (C) { 637 case '=': 638 TokenPtr++; 639 formTokenWithChars(T, TokenPtr, tok::html_equals); 640 break; 641 case '\"': 642 case '\'': { 643 const char *OpenQuote = TokenPtr; 644 TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd); 645 const char *ClosingQuote = TokenPtr; 646 if (TokenPtr != CommentEnd) // Skip closing quote. 647 TokenPtr++; 648 formTokenWithChars(T, TokenPtr, tok::html_quoted_string); 649 T.setHTMLQuotedString(StringRef(OpenQuote + 1, 650 ClosingQuote - (OpenQuote + 1))); 651 break; 652 } 653 case '>': 654 TokenPtr++; 655 formTokenWithChars(T, TokenPtr, tok::html_greater); 656 State = LS_Normal; 657 return; 658 case '/': 659 TokenPtr++; 660 if (TokenPtr != CommentEnd && *TokenPtr == '>') { 661 TokenPtr++; 662 formTokenWithChars(T, TokenPtr, tok::html_slash_greater); 663 } else 664 formTextToken(T, TokenPtr); 665 666 State = LS_Normal; 667 return; 668 } 669 } 670 671 // Now look ahead and return to normal state if we don't see any HTML tokens 672 // ahead. 673 BufferPtr = skipWhitespace(BufferPtr, CommentEnd); 674 if (BufferPtr == CommentEnd) { 675 State = LS_Normal; 676 return; 677 } 678 679 C = *BufferPtr; 680 if (!isHTMLIdentifierStartingCharacter(C) && 681 C != '=' && C != '\"' && C != '\'' && C != '>') { 682 State = LS_Normal; 683 return; 684 } 685 } 686 687 void Lexer::setupAndLexHTMLEndTag(Token &T) { 688 assert(BufferPtr[0] == '<' && BufferPtr[1] == '/'); 689 690 const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd); 691 const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd); 692 StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin); 693 if (!isHTMLTagName(Name)) { 694 formTextToken(T, TagNameEnd); 695 return; 696 } 697 698 const char *End = skipWhitespace(TagNameEnd, CommentEnd); 699 700 formTokenWithChars(T, End, tok::html_end_tag); 701 T.setHTMLTagEndName(Name); 702 703 if (BufferPtr != CommentEnd && *BufferPtr == '>') 704 State = LS_HTMLEndTag; 705 } 706 707 void Lexer::lexHTMLEndTag(Token &T) { 708 assert(BufferPtr != CommentEnd && *BufferPtr == '>'); 709 710 formTokenWithChars(T, BufferPtr + 1, tok::html_greater); 711 State = LS_Normal; 712 } 713 714 Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags, 715 const CommandTraits &Traits, 716 SourceLocation FileLoc, 717 const char *BufferStart, const char *BufferEnd): 718 Allocator(Allocator), Diags(Diags), Traits(Traits), 719 BufferStart(BufferStart), BufferEnd(BufferEnd), 720 FileLoc(FileLoc), BufferPtr(BufferStart), 721 CommentState(LCS_BeforeComment), State(LS_Normal) { 722 } 723 724 void Lexer::lex(Token &T) { 725 again: 726 switch (CommentState) { 727 case LCS_BeforeComment: 728 if (BufferPtr == BufferEnd) { 729 formTokenWithChars(T, BufferPtr, tok::eof); 730 return; 731 } 732 733 assert(*BufferPtr == '/'); 734 BufferPtr++; // Skip first slash. 735 switch(*BufferPtr) { 736 case '/': { // BCPL comment. 737 BufferPtr++; // Skip second slash. 738 739 if (BufferPtr != BufferEnd) { 740 // Skip Doxygen magic marker, if it is present. 741 // It might be missing because of a typo //< or /*<, or because we 742 // merged this non-Doxygen comment into a bunch of Doxygen comments 743 // around it: /** ... */ /* ... */ /** ... */ 744 const char C = *BufferPtr; 745 if (C == '/' || C == '!') 746 BufferPtr++; 747 } 748 749 // Skip less-than symbol that marks trailing comments. 750 // Skip it even if the comment is not a Doxygen one, because //< and /*< 751 // are frequent typos. 752 if (BufferPtr != BufferEnd && *BufferPtr == '<') 753 BufferPtr++; 754 755 CommentState = LCS_InsideBCPLComment; 756 if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine) 757 State = LS_Normal; 758 CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd); 759 goto again; 760 } 761 case '*': { // C comment. 762 BufferPtr++; // Skip star. 763 764 // Skip Doxygen magic marker. 765 const char C = *BufferPtr; 766 if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!') 767 BufferPtr++; 768 769 // Skip less-than symbol that marks trailing comments. 770 if (BufferPtr != BufferEnd && *BufferPtr == '<') 771 BufferPtr++; 772 773 CommentState = LCS_InsideCComment; 774 State = LS_Normal; 775 CommentEnd = findCCommentEnd(BufferPtr, BufferEnd); 776 goto again; 777 } 778 default: 779 llvm_unreachable("second character of comment should be '/' or '*'"); 780 } 781 782 case LCS_BetweenComments: { 783 // Consecutive comments are extracted only if there is only whitespace 784 // between them. So we can search for the start of the next comment. 785 const char *EndWhitespace = BufferPtr; 786 while(EndWhitespace != BufferEnd && *EndWhitespace != '/') 787 EndWhitespace++; 788 789 // Turn any whitespace between comments (and there is only whitespace 790 // between them -- guaranteed by comment extraction) into a newline. We 791 // have two newlines between C comments in total (first one was synthesized 792 // after a comment). 793 formTokenWithChars(T, EndWhitespace, tok::newline); 794 795 CommentState = LCS_BeforeComment; 796 break; 797 } 798 799 case LCS_InsideBCPLComment: 800 case LCS_InsideCComment: 801 if (BufferPtr != CommentEnd) { 802 lexCommentText(T); 803 break; 804 } else { 805 // Skip C comment closing sequence. 806 if (CommentState == LCS_InsideCComment) { 807 assert(BufferPtr[0] == '*' && BufferPtr[1] == '/'); 808 BufferPtr += 2; 809 assert(BufferPtr <= BufferEnd); 810 811 // Synthenize newline just after the C comment, regardless if there is 812 // actually a newline. 813 formTokenWithChars(T, BufferPtr, tok::newline); 814 815 CommentState = LCS_BetweenComments; 816 break; 817 } else { 818 // Don't synthesized a newline after BCPL comment. 819 CommentState = LCS_BetweenComments; 820 goto again; 821 } 822 } 823 } 824 } 825 826 StringRef Lexer::getSpelling(const Token &Tok, 827 const SourceManager &SourceMgr, 828 bool *Invalid) const { 829 SourceLocation Loc = Tok.getLocation(); 830 std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc); 831 832 bool InvalidTemp = false; 833 StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp); 834 if (InvalidTemp) { 835 *Invalid = true; 836 return StringRef(); 837 } 838 839 const char *Begin = File.data() + LocInfo.second; 840 return StringRef(Begin, Tok.getLength()); 841 } 842 843 } // end namespace comments 844 } // end namespace clang 845 846