Home | History | Annotate | Download | only in AST
      1 //===--- CommentLexer.cpp -------------------------------------------------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 
     10 #include "clang/AST/CommentLexer.h"
     11 #include "clang/AST/CommentCommandTraits.h"
     12 #include "clang/AST/CommentDiagnostic.h"
     13 #include "clang/Basic/CharInfo.h"
     14 #include "llvm/ADT/StringExtras.h"
     15 #include "llvm/ADT/StringSwitch.h"
     16 #include "llvm/Support/ConvertUTF.h"
     17 #include "llvm/Support/ErrorHandling.h"
     18 
     19 namespace clang {
     20 namespace comments {
     21 
     22 void Token::dump(const Lexer &L, const SourceManager &SM) const {
     23   llvm::errs() << "comments::Token Kind=" << Kind << " ";
     24   Loc.dump(SM);
     25   llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
     26 }
     27 
     28 static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
     29   return isLetter(C);
     30 }
     31 
     32 static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
     33   return isDigit(C);
     34 }
     35 
     36 static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
     37   return isHexDigit(C);
     38 }
     39 
     40 static inline StringRef convertCodePointToUTF8(
     41                                       llvm::BumpPtrAllocator &Allocator,
     42                                       unsigned CodePoint) {
     43   char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
     44   char *ResolvedPtr = Resolved;
     45   if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
     46     return StringRef(Resolved, ResolvedPtr - Resolved);
     47   else
     48     return StringRef();
     49 }
     50 
     51 namespace {
     52 
     53 #include "clang/AST/CommentHTMLTags.inc"
     54 #include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
     55 
     56 } // end anonymous namespace
     57 
     58 StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
     59   // Fast path, first check a few most widely used named character references.
     60   return llvm::StringSwitch<StringRef>(Name)
     61       .Case("amp", "&")
     62       .Case("lt", "<")
     63       .Case("gt", ">")
     64       .Case("quot", "\"")
     65       .Case("apos", "\'")
     66       // Slow path.
     67       .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
     68 }
     69 
     70 StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
     71   unsigned CodePoint = 0;
     72   for (unsigned i = 0, e = Name.size(); i != e; ++i) {
     73     assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
     74     CodePoint *= 10;
     75     CodePoint += Name[i] - '0';
     76   }
     77   return convertCodePointToUTF8(Allocator, CodePoint);
     78 }
     79 
     80 StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
     81   unsigned CodePoint = 0;
     82   for (unsigned i = 0, e = Name.size(); i != e; ++i) {
     83     CodePoint *= 16;
     84     const char C = Name[i];
     85     assert(isHTMLHexCharacterReferenceCharacter(C));
     86     CodePoint += llvm::hexDigitValue(C);
     87   }
     88   return convertCodePointToUTF8(Allocator, CodePoint);
     89 }
     90 
     91 void Lexer::skipLineStartingDecorations() {
     92   // This function should be called only for C comments
     93   assert(CommentState == LCS_InsideCComment);
     94 
     95   if (BufferPtr == CommentEnd)
     96     return;
     97 
     98   switch (*BufferPtr) {
     99   case ' ':
    100   case '\t':
    101   case '\f':
    102   case '\v': {
    103     const char *NewBufferPtr = BufferPtr;
    104     NewBufferPtr++;
    105     if (NewBufferPtr == CommentEnd)
    106       return;
    107 
    108     char C = *NewBufferPtr;
    109     while (isHorizontalWhitespace(C)) {
    110       NewBufferPtr++;
    111       if (NewBufferPtr == CommentEnd)
    112         return;
    113       C = *NewBufferPtr;
    114     }
    115     if (C == '*')
    116       BufferPtr = NewBufferPtr + 1;
    117     break;
    118   }
    119   case '*':
    120     BufferPtr++;
    121     break;
    122   }
    123 }
    124 
    125 namespace {
    126 /// Returns pointer to the first newline character in the string.
    127 const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
    128   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    129     if (isVerticalWhitespace(*BufferPtr))
    130       return BufferPtr;
    131   }
    132   return BufferEnd;
    133 }
    134 
    135 const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
    136   if (BufferPtr == BufferEnd)
    137     return BufferPtr;
    138 
    139   if (*BufferPtr == '\n')
    140     BufferPtr++;
    141   else {
    142     assert(*BufferPtr == '\r');
    143     BufferPtr++;
    144     if (BufferPtr != BufferEnd && *BufferPtr == '\n')
    145       BufferPtr++;
    146   }
    147   return BufferPtr;
    148 }
    149 
    150 const char *skipNamedCharacterReference(const char *BufferPtr,
    151                                         const char *BufferEnd) {
    152   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    153     if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
    154       return BufferPtr;
    155   }
    156   return BufferEnd;
    157 }
    158 
    159 const char *skipDecimalCharacterReference(const char *BufferPtr,
    160                                           const char *BufferEnd) {
    161   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    162     if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
    163       return BufferPtr;
    164   }
    165   return BufferEnd;
    166 }
    167 
    168 const char *skipHexCharacterReference(const char *BufferPtr,
    169                                       const char *BufferEnd) {
    170   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    171     if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
    172       return BufferPtr;
    173   }
    174   return BufferEnd;
    175 }
    176 
    177 bool isHTMLIdentifierStartingCharacter(char C) {
    178   return isLetter(C);
    179 }
    180 
    181 bool isHTMLIdentifierCharacter(char C) {
    182   return isAlphanumeric(C);
    183 }
    184 
    185 const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
    186   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    187     if (!isHTMLIdentifierCharacter(*BufferPtr))
    188       return BufferPtr;
    189   }
    190   return BufferEnd;
    191 }
    192 
    193 /// Skip HTML string quoted in single or double quotes.  Escaping quotes inside
    194 /// string allowed.
    195 ///
    196 /// Returns pointer to closing quote.
    197 const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
    198 {
    199   const char Quote = *BufferPtr;
    200   assert(Quote == '\"' || Quote == '\'');
    201 
    202   BufferPtr++;
    203   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    204     const char C = *BufferPtr;
    205     if (C == Quote && BufferPtr[-1] != '\\')
    206       return BufferPtr;
    207   }
    208   return BufferEnd;
    209 }
    210 
    211 const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
    212   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    213     if (!isWhitespace(*BufferPtr))
    214       return BufferPtr;
    215   }
    216   return BufferEnd;
    217 }
    218 
    219 bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
    220   return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
    221 }
    222 
    223 bool isCommandNameStartCharacter(char C) {
    224   return isLetter(C);
    225 }
    226 
    227 bool isCommandNameCharacter(char C) {
    228   return isAlphanumeric(C);
    229 }
    230 
    231 const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
    232   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    233     if (!isCommandNameCharacter(*BufferPtr))
    234       return BufferPtr;
    235   }
    236   return BufferEnd;
    237 }
    238 
    239 /// Return the one past end pointer for BCPL comments.
    240 /// Handles newlines escaped with backslash or trigraph for backslahs.
    241 const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
    242   const char *CurPtr = BufferPtr;
    243   while (CurPtr != BufferEnd) {
    244     while (!isVerticalWhitespace(*CurPtr)) {
    245       CurPtr++;
    246       if (CurPtr == BufferEnd)
    247         return BufferEnd;
    248     }
    249     // We found a newline, check if it is escaped.
    250     const char *EscapePtr = CurPtr - 1;
    251     while(isHorizontalWhitespace(*EscapePtr))
    252       EscapePtr--;
    253 
    254     if (*EscapePtr == '\\' ||
    255         (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
    256          EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
    257       // We found an escaped newline.
    258       CurPtr = skipNewline(CurPtr, BufferEnd);
    259     } else
    260       return CurPtr; // Not an escaped newline.
    261   }
    262   return BufferEnd;
    263 }
    264 
    265 /// Return the one past end pointer for C comments.
    266 /// Very dumb, does not handle escaped newlines or trigraphs.
    267 const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
    268   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    269     if (*BufferPtr == '*') {
    270       assert(BufferPtr + 1 != BufferEnd);
    271       if (*(BufferPtr + 1) == '/')
    272         return BufferPtr;
    273     }
    274   }
    275   llvm_unreachable("buffer end hit before '*/' was seen");
    276 }
    277 
    278 } // end anonymous namespace
    279 
    280 void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,
    281                                tok::TokenKind Kind) {
    282   const unsigned TokLen = TokEnd - BufferPtr;
    283   Result.setLocation(getSourceLocation(BufferPtr));
    284   Result.setKind(Kind);
    285   Result.setLength(TokLen);
    286 #ifndef NDEBUG
    287   Result.TextPtr = "<UNSET>";
    288   Result.IntVal = 7;
    289 #endif
    290   BufferPtr = TokEnd;
    291 }
    292 
/// Lex one token from the body of the current comment, starting at BufferPtr.
///
/// When the lexer is in one of the verbatim or HTML tag states, the work is
/// delegated to the corresponding state-specific routine.  In the normal
/// state the first character decides what is produced: a command or escape
/// sequence ('\\' or '@'), an HTML character reference ('&'), an HTML tag
/// ('<'), a newline, or a run of plain text.
///
/// Must only be called while inside a comment and with BufferPtr before
/// CommentEnd.
void Lexer::lexCommentText(Token &T) {
  assert(CommentState == LCS_InsideBCPLComment ||
         CommentState == LCS_InsideCComment);

  // Dispatch to the handler for the current lexer state, if it is not the
  // normal one.
  switch (State) {
  case LS_Normal:
    break;
  case LS_VerbatimBlockFirstLine:
    lexVerbatimBlockFirstLine(T);
    return;
  case LS_VerbatimBlockBody:
    lexVerbatimBlockBody(T);
    return;
  case LS_VerbatimLineText:
    lexVerbatimLineText(T);
    return;
  case LS_HTMLStartTag:
    lexHTMLStartTag(T);
    return;
  case LS_HTMLEndTag:
    lexHTMLEndTag(T);
    return;
  }

  assert(State == LS_Normal);

  const char *TokenPtr = BufferPtr;
  assert(TokenPtr < CommentEnd);
  // Every case below returns, so this loop body runs at most once.
  while (TokenPtr != CommentEnd) {
    switch(*TokenPtr) {
      case '\\':
      case '@': {
        // Commands that start with a backslash and commands that start with
        // 'at' have equivalent semantics.  But we keep information about the
        // exact syntax in AST for comments.
        tok::TokenKind CommandKind =
            (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
        TokenPtr++;
        if (TokenPtr == CommentEnd) {
          // A lone marker at the very end of the comment is just text.
          formTextToken(T, TokenPtr);
          return;
        }
        char C = *TokenPtr;
        switch (C) {
        default:
          break;

        case '\\': case '@': case '&': case '$':
        case '#':  case '<': case '>': case '%':
        case '\"': case '.': case ':':
          // This is one of \\ \@ \& \$ etc escape sequences.
          TokenPtr++;
          if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
            // This is the \:: escape sequence.
            TokenPtr++;
          }
          // The token covers the marker too, but its text is only the
          // escaped character(s).
          StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
          formTokenWithChars(T, TokenPtr, tok::text);
          T.setText(UnescapedText);
          return;
        }

        // Don't make zero-length commands.
        if (!isCommandNameStartCharacter(*TokenPtr)) {
          formTextToken(T, TokenPtr);
          return;
        }

        TokenPtr = skipCommandName(TokenPtr, CommentEnd);
        // Length of the command name, excluding the leading marker.
        unsigned Length = TokenPtr - (BufferPtr + 1);

        // Hardcoded support for lexing LaTeX formula commands
        // \f$ \f[ \f] \f{ \f} as a single command.
        if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
          C = *TokenPtr;
          if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
            TokenPtr++;
            Length++;
          }
        }

        StringRef CommandName(BufferPtr + 1, Length);

        const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
        if (!Info) {
          if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
            // A typo correction was found: warn with a fix-it that replaces
            // the name (but not the marker), then continue lexing as if the
            // corrected command had been written.
            StringRef CorrectedName = Info->Name;
            SourceLocation Loc = getSourceLocation(BufferPtr);
            SourceRange CommandRange(Loc.getLocWithOffset(1),
                                     getSourceLocation(TokenPtr));
            Diag(Loc, diag::warn_correct_comment_command_name)
              << CommandName << CorrectedName
              << FixItHint::CreateReplacement(CommandRange, CorrectedName);
          } else {
            // No known command and no correction: emit an unknown-command
            // token that remembers the spelled name.
            formTokenWithChars(T, TokenPtr, tok::unknown_command);
            T.setUnknownCommandName(CommandName);
            Diag(T.getLocation(), diag::warn_unknown_comment_command_name);
            return;
          }
        }
        if (Info->IsVerbatimBlockCommand) {
          // *BufferPtr is the marker character ('\\' or '@') that introduced
          // the command; it is used to build the matching end command name.
          setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
          return;
        }
        if (Info->IsVerbatimLineCommand) {
          setupAndLexVerbatimLine(T, TokenPtr, Info);
          return;
        }
        formTokenWithChars(T, TokenPtr, CommandKind);
        T.setCommandID(Info->getID());
        return;
      }

      case '&':
        lexHTMLCharacterReference(T);
        return;

      case '<': {
        TokenPtr++;
        if (TokenPtr == CommentEnd) {
          formTextToken(T, TokenPtr);
          return;
        }
        // Decide between a start tag, an end tag, and a plain '<'.
        const char C = *TokenPtr;
        if (isHTMLIdentifierStartingCharacter(C))
          setupAndLexHTMLStartTag(T);
        else if (C == '/')
          setupAndLexHTMLEndTag(T);
        else
          formTextToken(T, TokenPtr);
        return;
      }

      case '\n':
      case '\r':
        TokenPtr = skipNewline(TokenPtr, CommentEnd);
        formTokenWithChars(T, TokenPtr, tok::newline);

        // In C comments, the next line may begin with a decoration that
        // should not become part of the text.
        if (CommentState == LCS_InsideCComment)
          skipLineStartingDecorations();
        return;

      default: {
        // Plain text: extend the token up to the next character that starts
        // a different kind of token, or to the end of the comment.
        size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr).
                         find_first_of("\n\r\\@&<");
        if (End != StringRef::npos)
          TokenPtr += End;
        else
          TokenPtr = CommentEnd;
        formTextToken(T, TokenPtr);
        return;
      }
    }
  }
}
    448 
    449 void Lexer::setupAndLexVerbatimBlock(Token &T,
    450                                      const char *TextBegin,
    451                                      char Marker, const CommandInfo *Info) {
    452   assert(Info->IsVerbatimBlockCommand);
    453 
    454   VerbatimBlockEndCommandName.clear();
    455   VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
    456   VerbatimBlockEndCommandName.append(Info->EndCommandName);
    457 
    458   formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
    459   T.setVerbatimBlockID(Info->getID());
    460 
    461   // If there is a newline following the verbatim opening command, skip the
    462   // newline so that we don't create an tok::verbatim_block_line with empty
    463   // text content.
    464   if (BufferPtr != CommentEnd &&
    465       isVerticalWhitespace(*BufferPtr)) {
    466     BufferPtr = skipNewline(BufferPtr, CommentEnd);
    467     State = LS_VerbatimBlockBody;
    468     return;
    469   }
    470 
    471   State = LS_VerbatimBlockFirstLine;
    472 }
    473 
    474 void Lexer::lexVerbatimBlockFirstLine(Token &T) {
    475 again:
    476   assert(BufferPtr < CommentEnd);
    477 
    478   // FIXME: It would be better to scan the text once, finding either the block
    479   // end command or newline.
    480   //
    481   // Extract current line.
    482   const char *Newline = findNewline(BufferPtr, CommentEnd);
    483   StringRef Line(BufferPtr, Newline - BufferPtr);
    484 
    485   // Look for end command in current line.
    486   size_t Pos = Line.find(VerbatimBlockEndCommandName);
    487   const char *TextEnd;
    488   const char *NextLine;
    489   if (Pos == StringRef::npos) {
    490     // Current line is completely verbatim.
    491     TextEnd = Newline;
    492     NextLine = skipNewline(Newline, CommentEnd);
    493   } else if (Pos == 0) {
    494     // Current line contains just an end command.
    495     const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
    496     StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
    497     formTokenWithChars(T, End, tok::verbatim_block_end);
    498     T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
    499     State = LS_Normal;
    500     return;
    501   } else {
    502     // There is some text, followed by end command.  Extract text first.
    503     TextEnd = BufferPtr + Pos;
    504     NextLine = TextEnd;
    505     // If there is only whitespace before end command, skip whitespace.
    506     if (isWhitespace(BufferPtr, TextEnd)) {
    507       BufferPtr = TextEnd;
    508       goto again;
    509     }
    510   }
    511 
    512   StringRef Text(BufferPtr, TextEnd - BufferPtr);
    513   formTokenWithChars(T, NextLine, tok::verbatim_block_line);
    514   T.setVerbatimBlockText(Text);
    515 
    516   State = LS_VerbatimBlockBody;
    517 }
    518 
    519 void Lexer::lexVerbatimBlockBody(Token &T) {
    520   assert(State == LS_VerbatimBlockBody);
    521 
    522   if (CommentState == LCS_InsideCComment)
    523     skipLineStartingDecorations();
    524 
    525   if (BufferPtr == CommentEnd) {
    526     formTokenWithChars(T, BufferPtr, tok::verbatim_block_line);
    527     T.setVerbatimBlockText("");
    528     return;
    529   }
    530 
    531   lexVerbatimBlockFirstLine(T);
    532 }
    533 
    534 void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
    535                                     const CommandInfo *Info) {
    536   assert(Info->IsVerbatimLineCommand);
    537   formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
    538   T.setVerbatimLineID(Info->getID());
    539 
    540   State = LS_VerbatimLineText;
    541 }
    542 
    543 void Lexer::lexVerbatimLineText(Token &T) {
    544   assert(State == LS_VerbatimLineText);
    545 
    546   // Extract current line.
    547   const char *Newline = findNewline(BufferPtr, CommentEnd);
    548   StringRef Text(BufferPtr, Newline - BufferPtr);
    549   formTokenWithChars(T, Newline, tok::verbatim_line_text);
    550   T.setVerbatimLineText(Text);
    551 
    552   State = LS_Normal;
    553 }
    554 
    555 void Lexer::lexHTMLCharacterReference(Token &T) {
    556   const char *TokenPtr = BufferPtr;
    557   assert(*TokenPtr == '&');
    558   TokenPtr++;
    559   if (TokenPtr == CommentEnd) {
    560     formTextToken(T, TokenPtr);
    561     return;
    562   }
    563   const char *NamePtr;
    564   bool isNamed = false;
    565   bool isDecimal = false;
    566   char C = *TokenPtr;
    567   if (isHTMLNamedCharacterReferenceCharacter(C)) {
    568     NamePtr = TokenPtr;
    569     TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
    570     isNamed = true;
    571   } else if (C == '#') {
    572     TokenPtr++;
    573     if (TokenPtr == CommentEnd) {
    574       formTextToken(T, TokenPtr);
    575       return;
    576     }
    577     C = *TokenPtr;
    578     if (isHTMLDecimalCharacterReferenceCharacter(C)) {
    579       NamePtr = TokenPtr;
    580       TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
    581       isDecimal = true;
    582     } else if (C == 'x' || C == 'X') {
    583       TokenPtr++;
    584       NamePtr = TokenPtr;
    585       TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
    586     } else {
    587       formTextToken(T, TokenPtr);
    588       return;
    589     }
    590   } else {
    591     formTextToken(T, TokenPtr);
    592     return;
    593   }
    594   if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
    595       *TokenPtr != ';') {
    596     formTextToken(T, TokenPtr);
    597     return;
    598   }
    599   StringRef Name(NamePtr, TokenPtr - NamePtr);
    600   TokenPtr++; // Skip semicolon.
    601   StringRef Resolved;
    602   if (isNamed)
    603     Resolved = resolveHTMLNamedCharacterReference(Name);
    604   else if (isDecimal)
    605     Resolved = resolveHTMLDecimalCharacterReference(Name);
    606   else
    607     Resolved = resolveHTMLHexCharacterReference(Name);
    608 
    609   if (Resolved.empty()) {
    610     formTextToken(T, TokenPtr);
    611     return;
    612   }
    613   formTokenWithChars(T, TokenPtr, tok::text);
    614   T.setText(Resolved);
    615 }
    616 
    617 void Lexer::setupAndLexHTMLStartTag(Token &T) {
    618   assert(BufferPtr[0] == '<' &&
    619          isHTMLIdentifierStartingCharacter(BufferPtr[1]));
    620   const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
    621   StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
    622   if (!isHTMLTagName(Name)) {
    623     formTextToken(T, TagNameEnd);
    624     return;
    625   }
    626 
    627   formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
    628   T.setHTMLTagStartName(Name);
    629 
    630   BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
    631 
    632   const char C = *BufferPtr;
    633   if (BufferPtr != CommentEnd &&
    634       (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
    635     State = LS_HTMLStartTag;
    636 }
    637 
    638 void Lexer::lexHTMLStartTag(Token &T) {
    639   assert(State == LS_HTMLStartTag);
    640 
    641   const char *TokenPtr = BufferPtr;
    642   char C = *TokenPtr;
    643   if (isHTMLIdentifierCharacter(C)) {
    644     TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
    645     StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
    646     formTokenWithChars(T, TokenPtr, tok::html_ident);
    647     T.setHTMLIdent(Ident);
    648   } else {
    649     switch (C) {
    650     case '=':
    651       TokenPtr++;
    652       formTokenWithChars(T, TokenPtr, tok::html_equals);
    653       break;
    654     case '\"':
    655     case '\'': {
    656       const char *OpenQuote = TokenPtr;
    657       TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
    658       const char *ClosingQuote = TokenPtr;
    659       if (TokenPtr != CommentEnd) // Skip closing quote.
    660         TokenPtr++;
    661       formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
    662       T.setHTMLQuotedString(StringRef(OpenQuote + 1,
    663                                       ClosingQuote - (OpenQuote + 1)));
    664       break;
    665     }
    666     case '>':
    667       TokenPtr++;
    668       formTokenWithChars(T, TokenPtr, tok::html_greater);
    669       State = LS_Normal;
    670       return;
    671     case '/':
    672       TokenPtr++;
    673       if (TokenPtr != CommentEnd && *TokenPtr == '>') {
    674         TokenPtr++;
    675         formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
    676       } else
    677         formTextToken(T, TokenPtr);
    678 
    679       State = LS_Normal;
    680       return;
    681     }
    682   }
    683 
    684   // Now look ahead and return to normal state if we don't see any HTML tokens
    685   // ahead.
    686   BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
    687   if (BufferPtr == CommentEnd) {
    688     State = LS_Normal;
    689     return;
    690   }
    691 
    692   C = *BufferPtr;
    693   if (!isHTMLIdentifierStartingCharacter(C) &&
    694       C != '=' && C != '\"' && C != '\'' && C != '>') {
    695     State = LS_Normal;
    696     return;
    697   }
    698 }
    699 
    700 void Lexer::setupAndLexHTMLEndTag(Token &T) {
    701   assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
    702 
    703   const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
    704   const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
    705   StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
    706   if (!isHTMLTagName(Name)) {
    707     formTextToken(T, TagNameEnd);
    708     return;
    709   }
    710 
    711   const char *End = skipWhitespace(TagNameEnd, CommentEnd);
    712 
    713   formTokenWithChars(T, End, tok::html_end_tag);
    714   T.setHTMLTagEndName(Name);
    715 
    716   if (BufferPtr != CommentEnd && *BufferPtr == '>')
    717     State = LS_HTMLEndTag;
    718 }
    719 
    720 void Lexer::lexHTMLEndTag(Token &T) {
    721   assert(BufferPtr != CommentEnd && *BufferPtr == '>');
    722 
    723   formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
    724   State = LS_Normal;
    725 }
    726 
    727 Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
    728              const CommandTraits &Traits,
    729              SourceLocation FileLoc,
    730              const char *BufferStart, const char *BufferEnd):
    731     Allocator(Allocator), Diags(Diags), Traits(Traits),
    732     BufferStart(BufferStart), BufferEnd(BufferEnd),
    733     FileLoc(FileLoc), BufferPtr(BufferStart),
    734     CommentState(LCS_BeforeComment), State(LS_Normal) {
    735 }
    736 
/// Lex the next token into \p T.
///
/// The lexer walks a buffer holding one or more comments and tracks its
/// position relative to them in CommentState: before a comment, inside a BCPL
/// ("//") or C ("/* */") comment, or between two comments.  An eof token is
/// produced once the buffer is exhausted.
void Lexer::lex(Token &T) {
again:
  switch (CommentState) {
  case LCS_BeforeComment:
    if (BufferPtr == BufferEnd) {
      formTokenWithChars(T, BufferPtr, tok::eof);
      return;
    }

    assert(*BufferPtr == '/');
    BufferPtr++; // Skip first slash.
    switch(*BufferPtr) {
    case '/': { // BCPL comment.
      BufferPtr++; // Skip second slash.

      if (BufferPtr != BufferEnd) {
        // Skip Doxygen magic marker, if it is present.
        // It might be missing because of a typo //< or /*<, or because we
        // merged this non-Doxygen comment into a bunch of Doxygen comments
        // around it: /** ... */ /* ... */ /** ... */
        const char C = *BufferPtr;
        if (C == '/' || C == '!')
          BufferPtr++;
      }

      // Skip less-than symbol that marks trailing comments.
      // Skip it even if the comment is not a Doxygen one, because //< and /*<
      // are frequent typos.
      if (BufferPtr != BufferEnd && *BufferPtr == '<')
        BufferPtr++;

      CommentState = LCS_InsideBCPLComment;
      // The verbatim block states are kept as they are when a new BCPL
      // comment starts; any other state is reset to normal.
      if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
        State = LS_Normal;
      CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
      goto again;
    }
    case '*': { // C comment.
      BufferPtr++; // Skip star.

      // Skip Doxygen magic marker.  A '*' that is immediately followed by
      // '/' is the start of the closing sequence, not a marker.
      const char C = *BufferPtr;
      if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
        BufferPtr++;

      // Skip less-than symbol that marks trailing comments.
      if (BufferPtr != BufferEnd && *BufferPtr == '<')
        BufferPtr++;

      CommentState = LCS_InsideCComment;
      State = LS_Normal;
      CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
      goto again;
    }
    default:
      llvm_unreachable("second character of comment should be '/' or '*'");
    }

  case LCS_BetweenComments: {
    // Consecutive comments are extracted only if there is only whitespace
    // between them.  So we can search for the start of the next comment.
    const char *EndWhitespace = BufferPtr;
    while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
      EndWhitespace++;

    // Turn any whitespace between comments (and there is only whitespace
    // between them -- guaranteed by comment extraction) into a newline.  We
    // have two newlines between C comments in total (first one was synthesized
    // after a comment).
    formTokenWithChars(T, EndWhitespace, tok::newline);

    CommentState = LCS_BeforeComment;
    break;
  }

  case LCS_InsideBCPLComment:
  case LCS_InsideCComment:
    if (BufferPtr != CommentEnd) {
      lexCommentText(T);
      break;
    } else {
      // Skip C comment closing sequence.
      if (CommentState == LCS_InsideCComment) {
        assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
        BufferPtr += 2;
        assert(BufferPtr <= BufferEnd);

        // Synthesize newline just after the C comment, regardless if there is
        // actually a newline.
        formTokenWithChars(T, BufferPtr, tok::newline);

        CommentState = LCS_BetweenComments;
        break;
      } else {
        // Don't synthesize a newline after BCPL comment.
        CommentState = LCS_BetweenComments;
        goto again;
      }
    }
  }
}
    838 
    839 StringRef Lexer::getSpelling(const Token &Tok,
    840                              const SourceManager &SourceMgr,
    841                              bool *Invalid) const {
    842   SourceLocation Loc = Tok.getLocation();
    843   std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
    844 
    845   bool InvalidTemp = false;
    846   StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
    847   if (InvalidTemp) {
    848     *Invalid = true;
    849     return StringRef();
    850   }
    851 
    852   const char *Begin = File.data() + LocInfo.second;
    853   return StringRef(Begin, Tok.getLength());
    854 }
    855 
    856 } // end namespace comments
    857 } // end namespace clang
    858