Home | History | Annotate | Download | only in AST
      1 #include "clang/AST/CommentLexer.h"
      2 #include "clang/AST/CommentCommandTraits.h"
      3 #include "clang/Basic/CharInfo.h"
      4 #include "llvm/ADT/StringExtras.h"
      5 #include "llvm/ADT/StringSwitch.h"
      6 #include "llvm/Support/ConvertUTF.h"
      7 #include "llvm/Support/ErrorHandling.h"
      8 
      9 namespace clang {
     10 namespace comments {
     11 
     12 void Token::dump(const Lexer &L, const SourceManager &SM) const {
     13   llvm::errs() << "comments::Token Kind=" << Kind << " ";
     14   Loc.dump(SM);
     15   llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
     16 }
     17 
     18 static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
     19   return isLetter(C);
     20 }
     21 
     22 static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
     23   return isDigit(C);
     24 }
     25 
     26 static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
     27   return isHexDigit(C);
     28 }
     29 
     30 static inline StringRef convertCodePointToUTF8(
     31                                       llvm::BumpPtrAllocator &Allocator,
     32                                       unsigned CodePoint) {
     33   char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
     34   char *ResolvedPtr = Resolved;
     35   if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
     36     return StringRef(Resolved, ResolvedPtr - Resolved);
     37   else
     38     return StringRef();
     39 }
     40 
     41 namespace {
     42 
     43 #include "clang/AST/CommentHTMLTags.inc"
     44 #include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
     45 
     46 } // unnamed namespace
     47 
     48 StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
     49   // Fast path, first check a few most widely used named character references.
     50   return llvm::StringSwitch<StringRef>(Name)
     51       .Case("amp", "&")
     52       .Case("lt", "<")
     53       .Case("gt", ">")
     54       .Case("quot", "\"")
     55       .Case("apos", "\'")
     56       // Slow path.
     57       .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
     58 }
     59 
     60 StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
     61   unsigned CodePoint = 0;
     62   for (unsigned i = 0, e = Name.size(); i != e; ++i) {
     63     assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
     64     CodePoint *= 10;
     65     CodePoint += Name[i] - '0';
     66   }
     67   return convertCodePointToUTF8(Allocator, CodePoint);
     68 }
     69 
     70 StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
     71   unsigned CodePoint = 0;
     72   for (unsigned i = 0, e = Name.size(); i != e; ++i) {
     73     CodePoint *= 16;
     74     const char C = Name[i];
     75     assert(isHTMLHexCharacterReferenceCharacter(C));
     76     CodePoint += llvm::hexDigitValue(C);
     77   }
     78   return convertCodePointToUTF8(Allocator, CodePoint);
     79 }
     80 
     81 void Lexer::skipLineStartingDecorations() {
     82   // This function should be called only for C comments
     83   assert(CommentState == LCS_InsideCComment);
     84 
     85   if (BufferPtr == CommentEnd)
     86     return;
     87 
     88   switch (*BufferPtr) {
     89   case ' ':
     90   case '\t':
     91   case '\f':
     92   case '\v': {
     93     const char *NewBufferPtr = BufferPtr;
     94     NewBufferPtr++;
     95     if (NewBufferPtr == CommentEnd)
     96       return;
     97 
     98     char C = *NewBufferPtr;
     99     while (isHorizontalWhitespace(C)) {
    100       NewBufferPtr++;
    101       if (NewBufferPtr == CommentEnd)
    102         return;
    103       C = *NewBufferPtr;
    104     }
    105     if (C == '*')
    106       BufferPtr = NewBufferPtr + 1;
    107     break;
    108   }
    109   case '*':
    110     BufferPtr++;
    111     break;
    112   }
    113 }
    114 
    115 namespace {
    116 /// Returns pointer to the first newline character in the string.
    117 const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
    118   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    119     if (isVerticalWhitespace(*BufferPtr))
    120       return BufferPtr;
    121   }
    122   return BufferEnd;
    123 }
    124 
    125 const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
    126   if (BufferPtr == BufferEnd)
    127     return BufferPtr;
    128 
    129   if (*BufferPtr == '\n')
    130     BufferPtr++;
    131   else {
    132     assert(*BufferPtr == '\r');
    133     BufferPtr++;
    134     if (BufferPtr != BufferEnd && *BufferPtr == '\n')
    135       BufferPtr++;
    136   }
    137   return BufferPtr;
    138 }
    139 
    140 const char *skipNamedCharacterReference(const char *BufferPtr,
    141                                         const char *BufferEnd) {
    142   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    143     if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
    144       return BufferPtr;
    145   }
    146   return BufferEnd;
    147 }
    148 
    149 const char *skipDecimalCharacterReference(const char *BufferPtr,
    150                                           const char *BufferEnd) {
    151   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    152     if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
    153       return BufferPtr;
    154   }
    155   return BufferEnd;
    156 }
    157 
    158 const char *skipHexCharacterReference(const char *BufferPtr,
    159                                           const char *BufferEnd) {
    160   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    161     if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
    162       return BufferPtr;
    163   }
    164   return BufferEnd;
    165 }
    166 
    167 bool isHTMLIdentifierStartingCharacter(char C) {
    168   return isLetter(C);
    169 }
    170 
    171 bool isHTMLIdentifierCharacter(char C) {
    172   return isAlphanumeric(C);
    173 }
    174 
    175 const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
    176   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    177     if (!isHTMLIdentifierCharacter(*BufferPtr))
    178       return BufferPtr;
    179   }
    180   return BufferEnd;
    181 }
    182 
    183 /// Skip HTML string quoted in single or double quotes.  Escaping quotes inside
    184 /// string allowed.
    185 ///
    186 /// Returns pointer to closing quote.
    187 const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
    188 {
    189   const char Quote = *BufferPtr;
    190   assert(Quote == '\"' || Quote == '\'');
    191 
    192   BufferPtr++;
    193   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    194     const char C = *BufferPtr;
    195     if (C == Quote && BufferPtr[-1] != '\\')
    196       return BufferPtr;
    197   }
    198   return BufferEnd;
    199 }
    200 
    201 const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
    202   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    203     if (!isWhitespace(*BufferPtr))
    204       return BufferPtr;
    205   }
    206   return BufferEnd;
    207 }
    208 
    209 bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
    210   return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
    211 }
    212 
    213 bool isCommandNameStartCharacter(char C) {
    214   return isLetter(C);
    215 }
    216 
    217 bool isCommandNameCharacter(char C) {
    218   return isAlphanumeric(C);
    219 }
    220 
    221 const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
    222   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    223     if (!isCommandNameCharacter(*BufferPtr))
    224       return BufferPtr;
    225   }
    226   return BufferEnd;
    227 }
    228 
/// Return the one past end pointer for BCPL comments.
/// Handles newlines escaped with a backslash or with the trigraph for
/// backslash ("??/").
///
/// \param BufferPtr start of the comment text to scan.
/// \param BufferEnd end of the buffer.
/// \returns a pointer to the first newline that is not escaped, or
/// \p BufferEnd if no such newline exists.
const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
  const char *CurPtr = BufferPtr;
  while (CurPtr != BufferEnd) {
    // Advance to the next newline character.
    while (!isVerticalWhitespace(*CurPtr)) {
      CurPtr++;
      if (CurPtr == BufferEnd)
        return BufferEnd;
    }
    // We found a newline, check if it is escaped.
    // Walk backwards over horizontal whitespace that separates the potential
    // escape character from the newline.
    // NOTE(review): this reads characters before CurPtr without comparing
    // against BufferPtr; presumably the comment introducer precedes the
    // scanned text in the buffer -- confirm against callers.
    const char *EscapePtr = CurPtr - 1;
    while(isHorizontalWhitespace(*EscapePtr))
      EscapePtr--;

    // Either a plain backslash, or the three-character sequence "??/" that
    // lies entirely within the scanned text.
    if (*EscapePtr == '\\' ||
        (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
         EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
      // We found an escaped newline.
      CurPtr = skipNewline(CurPtr, BufferEnd);
    } else
      return CurPtr; // Not an escaped newline.
  }
  return BufferEnd;
}
    254 
    255 /// Return the one past end pointer for C comments.
    256 /// Very dumb, does not handle escaped newlines or trigraphs.
    257 const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
    258   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
    259     if (*BufferPtr == '*') {
    260       assert(BufferPtr + 1 != BufferEnd);
    261       if (*(BufferPtr + 1) == '/')
    262         return BufferPtr;
    263     }
    264   }
    265   llvm_unreachable("buffer end hit before '*/' was seen");
    266 }
    267 } // unnamed namespace
    268 
/// Lex one token from the text of the current comment, starting at BufferPtr.
///
/// When the lexer is in one of the special states (verbatim block, verbatim
/// line, HTML tag) the work is delegated to the matching helper.  In the
/// normal state the first character decides what is lexed: a command or
/// escape sequence ('\\' or '@'), an HTML character reference ('&'), an HTML
/// tag ('<'), a newline, or a run of plain text.
///
/// Must only be called while inside a comment and with BufferPtr before
/// CommentEnd.
void Lexer::lexCommentText(Token &T) {
  assert(CommentState == LCS_InsideBCPLComment ||
         CommentState == LCS_InsideCComment);

  // Special lexing modes have dedicated routines.
  switch (State) {
  case LS_Normal:
    break;
  case LS_VerbatimBlockFirstLine:
    lexVerbatimBlockFirstLine(T);
    return;
  case LS_VerbatimBlockBody:
    lexVerbatimBlockBody(T);
    return;
  case LS_VerbatimLineText:
    lexVerbatimLineText(T);
    return;
  case LS_HTMLStartTag:
    lexHTMLStartTag(T);
    return;
  case LS_HTMLEndTag:
    lexHTMLEndTag(T);
    return;
  }

  assert(State == LS_Normal);

  const char *TokenPtr = BufferPtr;
  assert(TokenPtr < CommentEnd);
  // Every branch of the switch below returns, so this loop body runs at most
  // once per call.
  while (TokenPtr != CommentEnd) {
    switch(*TokenPtr) {
      case '\\':
      case '@': {
        // Commands that start with a backslash and commands that start with
        // 'at' have equivalent semantics.  But we keep information about the
        // exact syntax in AST for comments.
        tok::TokenKind CommandKind =
            (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
        TokenPtr++;
        // A lone marker at the very end of the comment is plain text.
        if (TokenPtr == CommentEnd) {
          formTextToken(T, TokenPtr);
          return;
        }
        char C = *TokenPtr;
        switch (C) {
        default:
          break;

        case '\\': case '@': case '&': case '$':
        case '#':  case '<': case '>': case '%':
        case '\"': case '.': case ':':
          // This is one of \\ \@ \& \$ etc escape sequences.
          TokenPtr++;
          if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
            // This is the \:: escape sequence.
            TokenPtr++;
          }
          // The token covers the marker as well, but its text is only the
          // escaped character(s) that follow the marker.
          StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
          formTokenWithChars(T, TokenPtr, tok::text);
          T.setText(UnescapedText);
          return;
        }

        // Don't make zero-length commands.
        if (!isCommandNameStartCharacter(*TokenPtr)) {
          formTextToken(T, TokenPtr);
          return;
        }

        TokenPtr = skipCommandName(TokenPtr, CommentEnd);
        // Length of the command name, not counting the marker character.
        unsigned Length = TokenPtr - (BufferPtr + 1);

        // Hardcoded support for lexing LaTeX formula commands
        // \f$ \f[ \f] \f{ \f} as a single command.
        if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
          C = *TokenPtr;
          if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
            TokenPtr++;
            Length++;
          }
        }

        const StringRef CommandName(BufferPtr + 1, Length);

        // Classify the command; unknown names get their own token kind.
        const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
        if (!Info) {
          formTokenWithChars(T, TokenPtr, tok::unknown_command);
          T.setUnknownCommandName(CommandName);
          return;
        }
        if (Info->IsVerbatimBlockCommand) {
          // *BufferPtr is the marker ('\\' or '@') that introduced the
          // command.
          setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
          return;
        }
        if (Info->IsVerbatimLineCommand) {
          setupAndLexVerbatimLine(T, TokenPtr, Info);
          return;
        }
        formTokenWithChars(T, TokenPtr, CommandKind);
        T.setCommandID(Info->getID());
        return;
      }

      case '&':
        lexHTMLCharacterReference(T);
        return;

      case '<': {
        TokenPtr++;
        // A lone '<' at the very end of the comment is plain text.
        if (TokenPtr == CommentEnd) {
          formTextToken(T, TokenPtr);
          return;
        }
        const char C = *TokenPtr;
        if (isHTMLIdentifierStartingCharacter(C))
          setupAndLexHTMLStartTag(T);
        else if (C == '/')
          setupAndLexHTMLEndTag(T);
        else
          formTextToken(T, TokenPtr);

        return;
      }

      case '\n':
      case '\r':
        TokenPtr = skipNewline(TokenPtr, CommentEnd);
        formTokenWithChars(T, TokenPtr, tok::newline);

        // In a C comment the next line may start with decorations that are
        // skipped before the next token is lexed.
        if (CommentState == LCS_InsideCComment)
          skipLineStartingDecorations();
        return;

      default: {
        // Plain text: extend the token up to the next character that needs
        // special handling, or to the end of the comment.
        size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr).
                         find_first_of("\n\r\\@&<");
        if (End != StringRef::npos)
          TokenPtr += End;
        else
          TokenPtr = CommentEnd;
        formTextToken(T, TokenPtr);
        return;
      }
    }
  }
}
    414 
    415 void Lexer::setupAndLexVerbatimBlock(Token &T,
    416                                      const char *TextBegin,
    417                                      char Marker, const CommandInfo *Info) {
    418   assert(Info->IsVerbatimBlockCommand);
    419 
    420   VerbatimBlockEndCommandName.clear();
    421   VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
    422   VerbatimBlockEndCommandName.append(Info->EndCommandName);
    423 
    424   formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
    425   T.setVerbatimBlockID(Info->getID());
    426 
    427   // If there is a newline following the verbatim opening command, skip the
    428   // newline so that we don't create an tok::verbatim_block_line with empty
    429   // text content.
    430   if (BufferPtr != CommentEnd &&
    431       isVerticalWhitespace(*BufferPtr)) {
    432     BufferPtr = skipNewline(BufferPtr, CommentEnd);
    433     State = LS_VerbatimBlockBody;
    434     return;
    435   }
    436 
    437   State = LS_VerbatimBlockFirstLine;
    438 }
    439 
    440 void Lexer::lexVerbatimBlockFirstLine(Token &T) {
    441 again:
    442   assert(BufferPtr < CommentEnd);
    443 
    444   // FIXME: It would be better to scan the text once, finding either the block
    445   // end command or newline.
    446   //
    447   // Extract current line.
    448   const char *Newline = findNewline(BufferPtr, CommentEnd);
    449   StringRef Line(BufferPtr, Newline - BufferPtr);
    450 
    451   // Look for end command in current line.
    452   size_t Pos = Line.find(VerbatimBlockEndCommandName);
    453   const char *TextEnd;
    454   const char *NextLine;
    455   if (Pos == StringRef::npos) {
    456     // Current line is completely verbatim.
    457     TextEnd = Newline;
    458     NextLine = skipNewline(Newline, CommentEnd);
    459   } else if (Pos == 0) {
    460     // Current line contains just an end command.
    461     const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
    462     StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
    463     formTokenWithChars(T, End, tok::verbatim_block_end);
    464     T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
    465     State = LS_Normal;
    466     return;
    467   } else {
    468     // There is some text, followed by end command.  Extract text first.
    469     TextEnd = BufferPtr + Pos;
    470     NextLine = TextEnd;
    471     // If there is only whitespace before end command, skip whitespace.
    472     if (isWhitespace(BufferPtr, TextEnd)) {
    473       BufferPtr = TextEnd;
    474       goto again;
    475     }
    476   }
    477 
    478   StringRef Text(BufferPtr, TextEnd - BufferPtr);
    479   formTokenWithChars(T, NextLine, tok::verbatim_block_line);
    480   T.setVerbatimBlockText(Text);
    481 
    482   State = LS_VerbatimBlockBody;
    483 }
    484 
    485 void Lexer::lexVerbatimBlockBody(Token &T) {
    486   assert(State == LS_VerbatimBlockBody);
    487 
    488   if (CommentState == LCS_InsideCComment)
    489     skipLineStartingDecorations();
    490 
    491   lexVerbatimBlockFirstLine(T);
    492 }
    493 
/// Form the token for the name of a verbatim line command and switch the
/// lexer into the state in which the rest of the line is lexed as verbatim
/// text.
///
/// \param T token to fill in.
/// \param TextBegin pointer just past the command name.
/// \param Info description of the command; must be a verbatim line command.
void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
                                    const CommandInfo *Info) {
  assert(Info->IsVerbatimLineCommand);
  formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
  T.setVerbatimLineID(Info->getID());

  // The next call to lexCommentText() dispatches to lexVerbatimLineText().
  State = LS_VerbatimLineText;
}
    502 
    503 void Lexer::lexVerbatimLineText(Token &T) {
    504   assert(State == LS_VerbatimLineText);
    505 
    506   // Extract current line.
    507   const char *Newline = findNewline(BufferPtr, CommentEnd);
    508   const StringRef Text(BufferPtr, Newline - BufferPtr);
    509   formTokenWithChars(T, Newline, tok::verbatim_line_text);
    510   T.setVerbatimLineText(Text);
    511 
    512   State = LS_Normal;
    513 }
    514 
    515 void Lexer::lexHTMLCharacterReference(Token &T) {
    516   const char *TokenPtr = BufferPtr;
    517   assert(*TokenPtr == '&');
    518   TokenPtr++;
    519   if (TokenPtr == CommentEnd) {
    520     formTextToken(T, TokenPtr);
    521     return;
    522   }
    523   const char *NamePtr;
    524   bool isNamed = false;
    525   bool isDecimal = false;
    526   char C = *TokenPtr;
    527   if (isHTMLNamedCharacterReferenceCharacter(C)) {
    528     NamePtr = TokenPtr;
    529     TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
    530     isNamed = true;
    531   } else if (C == '#') {
    532     TokenPtr++;
    533     if (TokenPtr == CommentEnd) {
    534       formTextToken(T, TokenPtr);
    535       return;
    536     }
    537     C = *TokenPtr;
    538     if (isHTMLDecimalCharacterReferenceCharacter(C)) {
    539       NamePtr = TokenPtr;
    540       TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
    541       isDecimal = true;
    542     } else if (C == 'x' || C == 'X') {
    543       TokenPtr++;
    544       NamePtr = TokenPtr;
    545       TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
    546     } else {
    547       formTextToken(T, TokenPtr);
    548       return;
    549     }
    550   } else {
    551     formTextToken(T, TokenPtr);
    552     return;
    553   }
    554   if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
    555       *TokenPtr != ';') {
    556     formTextToken(T, TokenPtr);
    557     return;
    558   }
    559   StringRef Name(NamePtr, TokenPtr - NamePtr);
    560   TokenPtr++; // Skip semicolon.
    561   StringRef Resolved;
    562   if (isNamed)
    563     Resolved = resolveHTMLNamedCharacterReference(Name);
    564   else if (isDecimal)
    565     Resolved = resolveHTMLDecimalCharacterReference(Name);
    566   else
    567     Resolved = resolveHTMLHexCharacterReference(Name);
    568 
    569   if (Resolved.empty()) {
    570     formTextToken(T, TokenPtr);
    571     return;
    572   }
    573   formTokenWithChars(T, TokenPtr, tok::text);
    574   T.setText(Resolved);
    575   return;
    576 }
    577 
    578 void Lexer::setupAndLexHTMLStartTag(Token &T) {
    579   assert(BufferPtr[0] == '<' &&
    580          isHTMLIdentifierStartingCharacter(BufferPtr[1]));
    581   const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
    582   StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
    583   if (!isHTMLTagName(Name)) {
    584     formTextToken(T, TagNameEnd);
    585     return;
    586   }
    587 
    588   formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
    589   T.setHTMLTagStartName(Name);
    590 
    591   BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
    592 
    593   const char C = *BufferPtr;
    594   if (BufferPtr != CommentEnd &&
    595       (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
    596     State = LS_HTMLStartTag;
    597 }
    598 
    599 void Lexer::lexHTMLStartTag(Token &T) {
    600   assert(State == LS_HTMLStartTag);
    601 
    602   const char *TokenPtr = BufferPtr;
    603   char C = *TokenPtr;
    604   if (isHTMLIdentifierCharacter(C)) {
    605     TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
    606     StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
    607     formTokenWithChars(T, TokenPtr, tok::html_ident);
    608     T.setHTMLIdent(Ident);
    609   } else {
    610     switch (C) {
    611     case '=':
    612       TokenPtr++;
    613       formTokenWithChars(T, TokenPtr, tok::html_equals);
    614       break;
    615     case '\"':
    616     case '\'': {
    617       const char *OpenQuote = TokenPtr;
    618       TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
    619       const char *ClosingQuote = TokenPtr;
    620       if (TokenPtr != CommentEnd) // Skip closing quote.
    621         TokenPtr++;
    622       formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
    623       T.setHTMLQuotedString(StringRef(OpenQuote + 1,
    624                                       ClosingQuote - (OpenQuote + 1)));
    625       break;
    626     }
    627     case '>':
    628       TokenPtr++;
    629       formTokenWithChars(T, TokenPtr, tok::html_greater);
    630       State = LS_Normal;
    631       return;
    632     case '/':
    633       TokenPtr++;
    634       if (TokenPtr != CommentEnd && *TokenPtr == '>') {
    635         TokenPtr++;
    636         formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
    637       } else
    638         formTextToken(T, TokenPtr);
    639 
    640       State = LS_Normal;
    641       return;
    642     }
    643   }
    644 
    645   // Now look ahead and return to normal state if we don't see any HTML tokens
    646   // ahead.
    647   BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
    648   if (BufferPtr == CommentEnd) {
    649     State = LS_Normal;
    650     return;
    651   }
    652 
    653   C = *BufferPtr;
    654   if (!isHTMLIdentifierStartingCharacter(C) &&
    655       C != '=' && C != '\"' && C != '\'' && C != '>') {
    656     State = LS_Normal;
    657     return;
    658   }
    659 }
    660 
    661 void Lexer::setupAndLexHTMLEndTag(Token &T) {
    662   assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
    663 
    664   const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
    665   const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
    666   StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
    667   if (!isHTMLTagName(Name)) {
    668     formTextToken(T, TagNameEnd);
    669     return;
    670   }
    671 
    672   const char *End = skipWhitespace(TagNameEnd, CommentEnd);
    673 
    674   formTokenWithChars(T, End, tok::html_end_tag);
    675   T.setHTMLTagEndName(Name);
    676 
    677   if (BufferPtr != CommentEnd && *BufferPtr == '>')
    678     State = LS_HTMLEndTag;
    679 }
    680 
/// Lex the '>' that closes an HTML end tag and return the lexer to the
/// normal state.  BufferPtr must point at that '>'.
void Lexer::lexHTMLEndTag(Token &T) {
  assert(BufferPtr != CommentEnd && *BufferPtr == '>');

  // The token is the single '>' character.
  formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
  State = LS_Normal;
}
    687 
/// Construct a comment lexer over the characters [BufferStart, BufferEnd).
///
/// Lexing starts at \p BufferStart, in the "before comment" comment state and
/// the normal lexing state.
///
/// \param Allocator allocator stored in the lexer; this file uses it for the
/// UTF-8 text of resolved numeric character references.
/// \param Traits command table used to look up command names.
/// \param FileLoc source location stored in the lexer.
/// NOTE(review): presumably the location that corresponds to BufferStart --
/// confirm against the header.
Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, const CommandTraits &Traits,
             SourceLocation FileLoc,
             const char *BufferStart, const char *BufferEnd):
    Allocator(Allocator), Traits(Traits),
    BufferStart(BufferStart), BufferEnd(BufferEnd),
    FileLoc(FileLoc), BufferPtr(BufferStart),
    CommentState(LCS_BeforeComment), State(LS_Normal) {
}
    696 
/// Lex the next token from the buffer into \p T.
///
/// The buffer is expected to contain one or more comments.  The function is
/// a small state machine over CommentState:
///  - before a comment: recognize the comment introducer, skip Doxygen
///    markers, compute CommentEnd, then lex from inside the comment;
///  - inside a comment: lex comment text, or handle the end of the comment;
///  - between comments: turn the separating characters into one newline
///    token.
/// An eof token is produced once the buffer is exhausted.
void Lexer::lex(Token &T) {
again:
  switch (CommentState) {
  case LCS_BeforeComment:
    if (BufferPtr == BufferEnd) {
      formTokenWithChars(T, BufferPtr, tok::eof);
      return;
    }

    assert(*BufferPtr == '/');
    BufferPtr++; // Skip first slash.
    switch(*BufferPtr) {
    case '/': { // BCPL comment.
      BufferPtr++; // Skip second slash.

      if (BufferPtr != BufferEnd) {
        // Skip Doxygen magic marker, if it is present.
        // It might be missing because of a typo //< or /*<, or because we
        // merged this non-Doxygen comment into a bunch of Doxygen comments
        // around it: /** ... */ /* ... */ /** ... */
        const char C = *BufferPtr;
        if (C == '/' || C == '!')
          BufferPtr++;
      }

      // Skip less-than symbol that marks trailing comments.
      // Skip it even if the comment is not a Doxygen one, because //< and /*<
      // are frequent typos.
      if (BufferPtr != BufferEnd && *BufferPtr == '<')
        BufferPtr++;

      CommentState = LCS_InsideBCPLComment;
      // A verbatim block that is still open keeps its state across
      // consecutive BCPL comments; every other state is reset.
      if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
        State = LS_Normal;
      CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
      goto again;
    }
    case '*': { // C comment.
      BufferPtr++; // Skip star.

      // Skip Doxygen magic marker.
      // A second star counts as a marker only if it is not the first
      // character of the closing "*/".
      const char C = *BufferPtr;
      if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
        BufferPtr++;

      // Skip less-than symbol that marks trailing comments.
      if (BufferPtr != BufferEnd && *BufferPtr == '<')
        BufferPtr++;

      CommentState = LCS_InsideCComment;
      State = LS_Normal;
      CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
      goto again;
    }
    default:
      llvm_unreachable("second character of comment should be '/' or '*'");
    }

  case LCS_BetweenComments: {
    // Consecutive comments are extracted only if there is only whitespace
    // between them.  So we can search for the start of the next comment.
    const char *EndWhitespace = BufferPtr;
    while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
      EndWhitespace++;

    // Turn any whitespace between comments (and there is only whitespace
    // between them -- guaranteed by comment extraction) into a newline.  We
    // have two newlines between C comments in total (first one was synthesized
    // after a comment).
    formTokenWithChars(T, EndWhitespace, tok::newline);

    CommentState = LCS_BeforeComment;
    break;
  }

  case LCS_InsideBCPLComment:
  case LCS_InsideCComment:
    if (BufferPtr != CommentEnd) {
      lexCommentText(T);
      break;
    } else {
      // Skip C comment closing sequence.
      if (CommentState == LCS_InsideCComment) {
        assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
        BufferPtr += 2;
        assert(BufferPtr <= BufferEnd);

        // Synthesize a newline just after the C comment, regardless of
        // whether there is actually a newline.
        formTokenWithChars(T, BufferPtr, tok::newline);

        CommentState = LCS_BetweenComments;
        break;
      } else {
        // Don't synthesize a newline after a BCPL comment.
        CommentState = LCS_BetweenComments;
        goto again;
      }
    }
  }
}
    798 
    799 StringRef Lexer::getSpelling(const Token &Tok,
    800                              const SourceManager &SourceMgr,
    801                              bool *Invalid) const {
    802   SourceLocation Loc = Tok.getLocation();
    803   std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
    804 
    805   bool InvalidTemp = false;
    806   StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
    807   if (InvalidTemp) {
    808     *Invalid = true;
    809     return StringRef();
    810   }
    811 
    812   const char *Begin = File.data() + LocInfo.second;
    813   return StringRef(Begin, Tok.getLength());
    814 }
    815 
    816 } // end namespace comments
    817 } // end namespace clang
    818 
    819