Home | History | Annotate | Download | only in AST
      1 #include "clang/AST/CommentLexer.h"
      2 #include "clang/AST/CommentCommandTraits.h"
      3 #include "clang/AST/CommentDiagnostic.h"
      4 #include "clang/Basic/CharInfo.h"
      5 #include "llvm/ADT/StringExtras.h"
      6 #include "llvm/ADT/StringSwitch.h"
      7 #include "llvm/Support/ConvertUTF.h"
      8 #include "llvm/Support/ErrorHandling.h"
      9 
     10 namespace clang {
     11 namespace comments {
     12 
     13 void Token::dump(const Lexer &L, const SourceManager &SM) const {
     14   llvm::errs() << "comments::Token Kind=" << Kind << " ";
     15   Loc.dump(SM);
     16   llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
     17 }
     18 
     19 static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
     20   return isLetter(C);
     21 }
     22 
     23 static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
     24   return isDigit(C);
     25 }
     26 
     27 static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
     28   return isHexDigit(C);
     29 }
     30 
     31 static inline StringRef convertCodePointToUTF8(
     32                                       llvm::BumpPtrAllocator &Allocator,
     33                                       unsigned CodePoint) {
     34   char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
     35   char *ResolvedPtr = Resolved;
     36   if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
     37     return StringRef(Resolved, ResolvedPtr - Resolved);
     38   else
     39     return StringRef();
     40 }
     41 
     42 namespace {
     43 
     44 #include "clang/AST/CommentHTMLTags.inc"
     45 #include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
     46 
     47 } // unnamed namespace
     48 
     49 StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
     50   // Fast path, first check a few most widely used named character references.
     51   return llvm::StringSwitch<StringRef>(Name)
     52       .Case("amp", "&")
     53       .Case("lt", "<")
     54       .Case("gt", ">")
     55       .Case("quot", "\"")
     56       .Case("apos", "\'")
     57       // Slow path.
     58       .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
     59 }
     60 
     61 StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
     62   unsigned CodePoint = 0;
     63   for (unsigned i = 0, e = Name.size(); i != e; ++i) {
     64     assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
     65     CodePoint *= 10;
     66     CodePoint += Name[i] - '0';
     67   }
     68   return convertCodePointToUTF8(Allocator, CodePoint);
     69 }
     70 
     71 StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
     72   unsigned CodePoint = 0;
     73   for (unsigned i = 0, e = Name.size(); i != e; ++i) {
     74     CodePoint *= 16;
     75     const char C = Name[i];
     76     assert(isHTMLHexCharacterReferenceCharacter(C));
     77     CodePoint += llvm::hexDigitValue(C);
     78   }
     79   return convertCodePointToUTF8(Allocator, CodePoint);
     80 }
     81 
     82 void Lexer::skipLineStartingDecorations() {
     83   // This function should be called only for C comments
     84   assert(CommentState == LCS_InsideCComment);
     85 
     86   if (BufferPtr == CommentEnd)
     87     return;
     88 
     89   switch (*BufferPtr) {
     90   case ' ':
     91   case '\t':
     92   case '\f':
     93   case '\v': {
     94     const char *NewBufferPtr = BufferPtr;
     95     NewBufferPtr++;
     96     if (NewBufferPtr == CommentEnd)
     97       return;
     98 
     99     char C = *NewBufferPtr;
    100     while (isHorizontalWhitespace(C)) {
    101       NewBufferPtr++;
    102       if (NewBufferPtr == CommentEnd)
    103         return;
    104       C = *NewBufferPtr;
    105     }
    106     if (C == '*')
    107       BufferPtr = NewBufferPtr + 1;
    108     break;
    109   }
    110   case '*':
    111     BufferPtr++;
    112     break;
    113   }
    114 }
    115 
    116 namespace {
    117 /// Returns pointer to the first newline character in the string.
    118 const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
    119   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    120     if (isVerticalWhitespace(*BufferPtr))
    121       return BufferPtr;
    122   }
    123   return BufferEnd;
    124 }
    125 
    126 const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
    127   if (BufferPtr == BufferEnd)
    128     return BufferPtr;
    129 
    130   if (*BufferPtr == '\n')
    131     BufferPtr++;
    132   else {
    133     assert(*BufferPtr == '\r');
    134     BufferPtr++;
    135     if (BufferPtr != BufferEnd && *BufferPtr == '\n')
    136       BufferPtr++;
    137   }
    138   return BufferPtr;
    139 }
    140 
    141 const char *skipNamedCharacterReference(const char *BufferPtr,
    142                                         const char *BufferEnd) {
    143   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    144     if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
    145       return BufferPtr;
    146   }
    147   return BufferEnd;
    148 }
    149 
    150 const char *skipDecimalCharacterReference(const char *BufferPtr,
    151                                           const char *BufferEnd) {
    152   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    153     if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
    154       return BufferPtr;
    155   }
    156   return BufferEnd;
    157 }
    158 
    159 const char *skipHexCharacterReference(const char *BufferPtr,
    160                                       const char *BufferEnd) {
    161   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    162     if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
    163       return BufferPtr;
    164   }
    165   return BufferEnd;
    166 }
    167 
    168 bool isHTMLIdentifierStartingCharacter(char C) {
    169   return isLetter(C);
    170 }
    171 
    172 bool isHTMLIdentifierCharacter(char C) {
    173   return isAlphanumeric(C);
    174 }
    175 
    176 const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
    177   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    178     if (!isHTMLIdentifierCharacter(*BufferPtr))
    179       return BufferPtr;
    180   }
    181   return BufferEnd;
    182 }
    183 
    184 /// Skip HTML string quoted in single or double quotes.  Escaping quotes inside
    185 /// string allowed.
    186 ///
    187 /// Returns pointer to closing quote.
    188 const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
    189 {
    190   const char Quote = *BufferPtr;
    191   assert(Quote == '\"' || Quote == '\'');
    192 
    193   BufferPtr++;
    194   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    195     const char C = *BufferPtr;
    196     if (C == Quote && BufferPtr[-1] != '\\')
    197       return BufferPtr;
    198   }
    199   return BufferEnd;
    200 }
    201 
    202 const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
    203   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    204     if (!isWhitespace(*BufferPtr))
    205       return BufferPtr;
    206   }
    207   return BufferEnd;
    208 }
    209 
    210 bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
    211   return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
    212 }
    213 
    214 bool isCommandNameStartCharacter(char C) {
    215   return isLetter(C);
    216 }
    217 
    218 bool isCommandNameCharacter(char C) {
    219   return isAlphanumeric(C);
    220 }
    221 
    222 const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
    223   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    224     if (!isCommandNameCharacter(*BufferPtr))
    225       return BufferPtr;
    226   }
    227   return BufferEnd;
    228 }
    229 
    230 /// Return the one past end pointer for BCPL comments.
    231 /// Handles newlines escaped with backslash or trigraph for backslahs.
    232 const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
    233   const char *CurPtr = BufferPtr;
    234   while (CurPtr != BufferEnd) {
    235     while (!isVerticalWhitespace(*CurPtr)) {
    236       CurPtr++;
    237       if (CurPtr == BufferEnd)
    238         return BufferEnd;
    239     }
    240     // We found a newline, check if it is escaped.
    241     const char *EscapePtr = CurPtr - 1;
    242     while(isHorizontalWhitespace(*EscapePtr))
    243       EscapePtr--;
    244 
    245     if (*EscapePtr == '\\' ||
    246         (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
    247          EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
    248       // We found an escaped newline.
    249       CurPtr = skipNewline(CurPtr, BufferEnd);
    250     } else
    251       return CurPtr; // Not an escaped newline.
    252   }
    253   return BufferEnd;
    254 }
    255 
    256 /// Return the one past end pointer for C comments.
    257 /// Very dumb, does not handle escaped newlines or trigraphs.
    258 const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
    259   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    260     if (*BufferPtr == '*') {
    261       assert(BufferPtr + 1 != BufferEnd);
    262       if (*(BufferPtr + 1) == '/')
    263         return BufferPtr;
    264     }
    265   }
    266   llvm_unreachable("buffer end hit before '*/' was seen");
    267 }
    268 
    269 } // unnamed namespace
    270 
    271 void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,
    272                                tok::TokenKind Kind) {
    273   const unsigned TokLen = TokEnd - BufferPtr;
    274   Result.setLocation(getSourceLocation(BufferPtr));
    275   Result.setKind(Kind);
    276   Result.setLength(TokLen);
    277 #ifndef NDEBUG
    278   Result.TextPtr = "<UNSET>";
    279   Result.IntVal = 7;
    280 #endif
    281   BufferPtr = TokEnd;
    282 }
    283 
/// Lex one token from the text of the current comment, starting at BufferPtr.
///
/// When the lexer is in one of the verbatim or HTML tag states, the work is
/// delegated to the matching sub-lexer.  In the normal state the first
/// character decides what is produced: a command ('\\' or '@'), an HTML
/// character reference ('&'), an HTML tag ('<'), a newline, or plain text
/// running up to the next of those characters.
///
/// Must be called only inside a comment with BufferPtr before CommentEnd.
void Lexer::lexCommentText(Token &T) {
  assert(CommentState == LCS_InsideBCPLComment ||
         CommentState == LCS_InsideCComment);

  // Non-normal states are handled entirely by dedicated functions.
  switch (State) {
  case LS_Normal:
    break;
  case LS_VerbatimBlockFirstLine:
    lexVerbatimBlockFirstLine(T);
    return;
  case LS_VerbatimBlockBody:
    lexVerbatimBlockBody(T);
    return;
  case LS_VerbatimLineText:
    lexVerbatimLineText(T);
    return;
  case LS_HTMLStartTag:
    lexHTMLStartTag(T);
    return;
  case LS_HTMLEndTag:
    lexHTMLEndTag(T);
    return;
  }

  assert(State == LS_Normal);

  const char *TokenPtr = BufferPtr;
  assert(TokenPtr < CommentEnd);
  // Every case below returns, so the loop body runs at most once.
  while (TokenPtr != CommentEnd) {
    switch(*TokenPtr) {
      case '\\':
      case '@': {
        // Commands that start with a backslash and commands that start with
        // 'at' have equivalent semantics.  But we keep information about the
        // exact syntax in AST for comments.
        tok::TokenKind CommandKind =
            (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
        TokenPtr++;
        // A lone marker at the very end of the comment is plain text.
        if (TokenPtr == CommentEnd) {
          formTextToken(T, TokenPtr);
          return;
        }
        char C = *TokenPtr;
        switch (C) {
        default:
          break;

        case '\\': case '@': case '&': case '$':
        case '#':  case '<': case '>': case '%':
        case '\"': case '.': case ':':
          // This is one of \\ \@ \& \$ etc escape sequences.
          TokenPtr++;
          if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
            // This is the \:: escape sequence.
            TokenPtr++;
          }
          // The token covers the marker too, but its text omits it.
          StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
          formTokenWithChars(T, TokenPtr, tok::text);
          T.setText(UnescapedText);
          return;
        }

        // Don't make zero-length commands.
        if (!isCommandNameStartCharacter(*TokenPtr)) {
          formTextToken(T, TokenPtr);
          return;
        }

        TokenPtr = skipCommandName(TokenPtr, CommentEnd);
        // Length of the command name, not counting the leading marker.
        unsigned Length = TokenPtr - (BufferPtr + 1);

        // Hardcoded support for lexing LaTeX formula commands
        // \f$ \f[ \f] \f{ \f} as a single command.
        if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
          C = *TokenPtr;
          if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
            TokenPtr++;
            Length++;
          }
        }

        StringRef CommandName(BufferPtr + 1, Length);

        const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
        if (!Info) {
          // Unknown name: either accept a typo correction (with a warning and
          // a fix-it) and continue as if the corrected command was written, or
          // emit an unknown_command token.
          if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
            StringRef CorrectedName = Info->Name;
            SourceLocation Loc = getSourceLocation(BufferPtr);
            SourceRange CommandRange(Loc.getLocWithOffset(1),
                                     getSourceLocation(TokenPtr));
            Diag(Loc, diag::warn_correct_comment_command_name)
              << CommandName << CorrectedName
              << FixItHint::CreateReplacement(CommandRange, CorrectedName);
          } else {
            formTokenWithChars(T, TokenPtr, tok::unknown_command);
            T.setUnknownCommandName(CommandName);
            Diag(T.getLocation(), diag::warn_unknown_comment_command_name);
            return;
          }
        }
        if (Info->IsVerbatimBlockCommand) {
          // *BufferPtr is still the marker character ('\\' or '@') here.
          setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
          return;
        }
        if (Info->IsVerbatimLineCommand) {
          setupAndLexVerbatimLine(T, TokenPtr, Info);
          return;
        }
        formTokenWithChars(T, TokenPtr, CommandKind);
        T.setCommandID(Info->getID());
        return;
      }

      case '&':
        lexHTMLCharacterReference(T);
        return;

      case '<': {
        TokenPtr++;
        if (TokenPtr == CommentEnd) {
          formTextToken(T, TokenPtr);
          return;
        }
        // The character after '<' decides between a start tag, an end tag,
        // and a '<' that is just text.
        const char C = *TokenPtr;
        if (isHTMLIdentifierStartingCharacter(C))
          setupAndLexHTMLStartTag(T);
        else if (C == '/')
          setupAndLexHTMLEndTag(T);
        else
          formTextToken(T, TokenPtr);

        return;
      }

      case '\n':
      case '\r':
        TokenPtr = skipNewline(TokenPtr, CommentEnd);
        formTokenWithChars(T, TokenPtr, tok::newline);

        // In a C comment the next line may begin with a decoration to skip.
        if (CommentState == LCS_InsideCComment)
          skipLineStartingDecorations();
        return;

      default: {
        // Plain text: extend the token up to the next character that starts
        // another kind of token, or to the end of the comment.
        size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr).
                         find_first_of("\n\r\\@&<");
        if (End != StringRef::npos)
          TokenPtr += End;
        else
          TokenPtr = CommentEnd;
        formTextToken(T, TokenPtr);
        return;
      }
    }
  }
}
    440 
    441 void Lexer::setupAndLexVerbatimBlock(Token &T,
    442                                      const char *TextBegin,
    443                                      char Marker, const CommandInfo *Info) {
    444   assert(Info->IsVerbatimBlockCommand);
    445 
    446   VerbatimBlockEndCommandName.clear();
    447   VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
    448   VerbatimBlockEndCommandName.append(Info->EndCommandName);
    449 
    450   formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
    451   T.setVerbatimBlockID(Info->getID());
    452 
    453   // If there is a newline following the verbatim opening command, skip the
    454   // newline so that we don't create an tok::verbatim_block_line with empty
    455   // text content.
    456   if (BufferPtr != CommentEnd &&
    457       isVerticalWhitespace(*BufferPtr)) {
    458     BufferPtr = skipNewline(BufferPtr, CommentEnd);
    459     State = LS_VerbatimBlockBody;
    460     return;
    461   }
    462 
    463   State = LS_VerbatimBlockFirstLine;
    464 }
    465 
    466 void Lexer::lexVerbatimBlockFirstLine(Token &T) {
    467 again:
    468   assert(BufferPtr < CommentEnd);
    469 
    470   // FIXME: It would be better to scan the text once, finding either the block
    471   // end command or newline.
    472   //
    473   // Extract current line.
    474   const char *Newline = findNewline(BufferPtr, CommentEnd);
    475   StringRef Line(BufferPtr, Newline - BufferPtr);
    476 
    477   // Look for end command in current line.
    478   size_t Pos = Line.find(VerbatimBlockEndCommandName);
    479   const char *TextEnd;
    480   const char *NextLine;
    481   if (Pos == StringRef::npos) {
    482     // Current line is completely verbatim.
    483     TextEnd = Newline;
    484     NextLine = skipNewline(Newline, CommentEnd);
    485   } else if (Pos == 0) {
    486     // Current line contains just an end command.
    487     const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
    488     StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
    489     formTokenWithChars(T, End, tok::verbatim_block_end);
    490     T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
    491     State = LS_Normal;
    492     return;
    493   } else {
    494     // There is some text, followed by end command.  Extract text first.
    495     TextEnd = BufferPtr + Pos;
    496     NextLine = TextEnd;
    497     // If there is only whitespace before end command, skip whitespace.
    498     if (isWhitespace(BufferPtr, TextEnd)) {
    499       BufferPtr = TextEnd;
    500       goto again;
    501     }
    502   }
    503 
    504   StringRef Text(BufferPtr, TextEnd - BufferPtr);
    505   formTokenWithChars(T, NextLine, tok::verbatim_block_line);
    506   T.setVerbatimBlockText(Text);
    507 
    508   State = LS_VerbatimBlockBody;
    509 }
    510 
    511 void Lexer::lexVerbatimBlockBody(Token &T) {
    512   assert(State == LS_VerbatimBlockBody);
    513 
    514   if (CommentState == LCS_InsideCComment)
    515     skipLineStartingDecorations();
    516 
    517   if (BufferPtr == CommentEnd) {
    518     formTokenWithChars(T, BufferPtr, tok::verbatim_block_line);
    519     T.setVerbatimBlockText("");
    520     return;
    521   }
    522 
    523   lexVerbatimBlockFirstLine(T);
    524 }
    525 
    526 void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
    527                                     const CommandInfo *Info) {
    528   assert(Info->IsVerbatimLineCommand);
    529   formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
    530   T.setVerbatimLineID(Info->getID());
    531 
    532   State = LS_VerbatimLineText;
    533 }
    534 
    535 void Lexer::lexVerbatimLineText(Token &T) {
    536   assert(State == LS_VerbatimLineText);
    537 
    538   // Extract current line.
    539   const char *Newline = findNewline(BufferPtr, CommentEnd);
    540   StringRef Text(BufferPtr, Newline - BufferPtr);
    541   formTokenWithChars(T, Newline, tok::verbatim_line_text);
    542   T.setVerbatimLineText(Text);
    543 
    544   State = LS_Normal;
    545 }
    546 
    547 void Lexer::lexHTMLCharacterReference(Token &T) {
    548   const char *TokenPtr = BufferPtr;
    549   assert(*TokenPtr == '&');
    550   TokenPtr++;
    551   if (TokenPtr == CommentEnd) {
    552     formTextToken(T, TokenPtr);
    553     return;
    554   }
    555   const char *NamePtr;
    556   bool isNamed = false;
    557   bool isDecimal = false;
    558   char C = *TokenPtr;
    559   if (isHTMLNamedCharacterReferenceCharacter(C)) {
    560     NamePtr = TokenPtr;
    561     TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
    562     isNamed = true;
    563   } else if (C == '#') {
    564     TokenPtr++;
    565     if (TokenPtr == CommentEnd) {
    566       formTextToken(T, TokenPtr);
    567       return;
    568     }
    569     C = *TokenPtr;
    570     if (isHTMLDecimalCharacterReferenceCharacter(C)) {
    571       NamePtr = TokenPtr;
    572       TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
    573       isDecimal = true;
    574     } else if (C == 'x' || C == 'X') {
    575       TokenPtr++;
    576       NamePtr = TokenPtr;
    577       TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
    578     } else {
    579       formTextToken(T, TokenPtr);
    580       return;
    581     }
    582   } else {
    583     formTextToken(T, TokenPtr);
    584     return;
    585   }
    586   if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
    587       *TokenPtr != ';') {
    588     formTextToken(T, TokenPtr);
    589     return;
    590   }
    591   StringRef Name(NamePtr, TokenPtr - NamePtr);
    592   TokenPtr++; // Skip semicolon.
    593   StringRef Resolved;
    594   if (isNamed)
    595     Resolved = resolveHTMLNamedCharacterReference(Name);
    596   else if (isDecimal)
    597     Resolved = resolveHTMLDecimalCharacterReference(Name);
    598   else
    599     Resolved = resolveHTMLHexCharacterReference(Name);
    600 
    601   if (Resolved.empty()) {
    602     formTextToken(T, TokenPtr);
    603     return;
    604   }
    605   formTokenWithChars(T, TokenPtr, tok::text);
    606   T.setText(Resolved);
    607   return;
    608 }
    609 
    610 void Lexer::setupAndLexHTMLStartTag(Token &T) {
    611   assert(BufferPtr[0] == '<' &&
    612          isHTMLIdentifierStartingCharacter(BufferPtr[1]));
    613   const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
    614   StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
    615   if (!isHTMLTagName(Name)) {
    616     formTextToken(T, TagNameEnd);
    617     return;
    618   }
    619 
    620   formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
    621   T.setHTMLTagStartName(Name);
    622 
    623   BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
    624 
    625   const char C = *BufferPtr;
    626   if (BufferPtr != CommentEnd &&
    627       (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
    628     State = LS_HTMLStartTag;
    629 }
    630 
    631 void Lexer::lexHTMLStartTag(Token &T) {
    632   assert(State == LS_HTMLStartTag);
    633 
    634   const char *TokenPtr = BufferPtr;
    635   char C = *TokenPtr;
    636   if (isHTMLIdentifierCharacter(C)) {
    637     TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
    638     StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
    639     formTokenWithChars(T, TokenPtr, tok::html_ident);
    640     T.setHTMLIdent(Ident);
    641   } else {
    642     switch (C) {
    643     case '=':
    644       TokenPtr++;
    645       formTokenWithChars(T, TokenPtr, tok::html_equals);
    646       break;
    647     case '\"':
    648     case '\'': {
    649       const char *OpenQuote = TokenPtr;
    650       TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
    651       const char *ClosingQuote = TokenPtr;
    652       if (TokenPtr != CommentEnd) // Skip closing quote.
    653         TokenPtr++;
    654       formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
    655       T.setHTMLQuotedString(StringRef(OpenQuote + 1,
    656                                       ClosingQuote - (OpenQuote + 1)));
    657       break;
    658     }
    659     case '>':
    660       TokenPtr++;
    661       formTokenWithChars(T, TokenPtr, tok::html_greater);
    662       State = LS_Normal;
    663       return;
    664     case '/':
    665       TokenPtr++;
    666       if (TokenPtr != CommentEnd && *TokenPtr == '>') {
    667         TokenPtr++;
    668         formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
    669       } else
    670         formTextToken(T, TokenPtr);
    671 
    672       State = LS_Normal;
    673       return;
    674     }
    675   }
    676 
    677   // Now look ahead and return to normal state if we don't see any HTML tokens
    678   // ahead.
    679   BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
    680   if (BufferPtr == CommentEnd) {
    681     State = LS_Normal;
    682     return;
    683   }
    684 
    685   C = *BufferPtr;
    686   if (!isHTMLIdentifierStartingCharacter(C) &&
    687       C != '=' && C != '\"' && C != '\'' && C != '>') {
    688     State = LS_Normal;
    689     return;
    690   }
    691 }
    692 
    693 void Lexer::setupAndLexHTMLEndTag(Token &T) {
    694   assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
    695 
    696   const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
    697   const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
    698   StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
    699   if (!isHTMLTagName(Name)) {
    700     formTextToken(T, TagNameEnd);
    701     return;
    702   }
    703 
    704   const char *End = skipWhitespace(TagNameEnd, CommentEnd);
    705 
    706   formTokenWithChars(T, End, tok::html_end_tag);
    707   T.setHTMLTagEndName(Name);
    708 
    709   if (BufferPtr != CommentEnd && *BufferPtr == '>')
    710     State = LS_HTMLEndTag;
    711 }
    712 
    713 void Lexer::lexHTMLEndTag(Token &T) {
    714   assert(BufferPtr != CommentEnd && *BufferPtr == '>');
    715 
    716   formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
    717   State = LS_Normal;
    718 }
    719 
    720 Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
    721              const CommandTraits &Traits,
    722              SourceLocation FileLoc,
    723              const char *BufferStart, const char *BufferEnd):
    724     Allocator(Allocator), Diags(Diags), Traits(Traits),
    725     BufferStart(BufferStart), BufferEnd(BufferEnd),
    726     FileLoc(FileLoc), BufferPtr(BufferStart),
    727     CommentState(LCS_BeforeComment), State(LS_Normal) {
    728 }
    729 
/// Produce the next token of the comment buffer in \p T.
///
/// The function is driven by CommentState: before a comment it consumes the
/// comment opener (and optional Doxygen markers) and computes CommentEnd;
/// inside a comment it delegates to lexCommentText until CommentEnd is
/// reached; between comments it turns the separating characters into a
/// newline token.  At the end of the buffer an eof token is produced.
void Lexer::lex(Token &T) {
again:
  switch (CommentState) {
  case LCS_BeforeComment:
    if (BufferPtr == BufferEnd) {
      formTokenWithChars(T, BufferPtr, tok::eof);
      return;
    }

    assert(*BufferPtr == '/');
    BufferPtr++; // Skip first slash.
    switch(*BufferPtr) {
    case '/': { // BCPL comment.
      BufferPtr++; // Skip second slash.

      if (BufferPtr != BufferEnd) {
        // Skip Doxygen magic marker, if it is present.
        // It might be missing because of a typo //< or /*<, or because we
        // merged this non-Doxygen comment into a bunch of Doxygen comments
        // around it: /** ... */ /* ... */ /** ... */
        const char C = *BufferPtr;
        if (C == '/' || C == '!')
          BufferPtr++;
      }

      // Skip less-than symbol that marks trailing comments.
      // Skip it even if the comment is not a Doxygen one, because //< and /*<
      // are frequent typos.
      if (BufferPtr != BufferEnd && *BufferPtr == '<')
        BufferPtr++;

      CommentState = LCS_InsideBCPLComment;
      // A verbatim block that is still open keeps its state across the start
      // of a new BCPL comment; every other state is reset.
      if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
        State = LS_Normal;
      CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
      goto again;
    }
    case '*': { // C comment.
      BufferPtr++; // Skip star.

      // Skip Doxygen magic marker.
      // A '*' directly followed by '/' is the comment terminator, not a
      // marker, so it is left alone.
      const char C = *BufferPtr;
      if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
        BufferPtr++;

      // Skip less-than symbol that marks trailing comments.
      if (BufferPtr != BufferEnd && *BufferPtr == '<')
        BufferPtr++;

      CommentState = LCS_InsideCComment;
      State = LS_Normal;
      CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
      goto again;
    }
    default:
      llvm_unreachable("second character of comment should be '/' or '*'");
    }

  case LCS_BetweenComments: {
    // Consecutive comments are extracted only if there is only whitespace
    // between them.  So we can search for the start of the next comment.
    const char *EndWhitespace = BufferPtr;
    while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
      EndWhitespace++;

    // Turn any whitespace between comments (and there is only whitespace
    // between them -- guaranteed by comment extraction) into a newline.  We
    // have two newlines between C comments in total (first one was synthesized
    // after a comment).
    formTokenWithChars(T, EndWhitespace, tok::newline);

    CommentState = LCS_BeforeComment;
    break;
  }

  case LCS_InsideBCPLComment:
  case LCS_InsideCComment:
    if (BufferPtr != CommentEnd) {
      lexCommentText(T);
      break;
    } else {
      // Skip C comment closing sequence.
      if (CommentState == LCS_InsideCComment) {
        assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
        BufferPtr += 2;
        assert(BufferPtr <= BufferEnd);

        // Synthesize newline just after the C comment, regardless if there is
        // actually a newline.
        formTokenWithChars(T, BufferPtr, tok::newline);

        CommentState = LCS_BetweenComments;
        break;
      } else {
        // Don't synthesize a newline after BCPL comment.
        CommentState = LCS_BetweenComments;
        goto again;
      }
    }
  }
}
    831 
    832 StringRef Lexer::getSpelling(const Token &Tok,
    833                              const SourceManager &SourceMgr,
    834                              bool *Invalid) const {
    835   SourceLocation Loc = Tok.getLocation();
    836   std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
    837 
    838   bool InvalidTemp = false;
    839   StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
    840   if (InvalidTemp) {
    841     *Invalid = true;
    842     return StringRef();
    843   }
    844 
    845   const char *Begin = File.data() + LocInfo.second;
    846   return StringRef(Begin, Tok.getLength());
    847 }
    848 
    849 } // end namespace comments
    850 } // end namespace clang
    851 
    852