Home | History | Annotate | Download | only in Lex
      1 //===--- Lexer.cpp - C Language Family Lexer ------------------------------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 //  This file implements the Lexer and Token interfaces.
     11 //
     12 //===----------------------------------------------------------------------===//
     13 //
     14 // TODO: GCC Diagnostics emitted by the lexer:
     15 // PEDWARN: (form feed|vertical tab) in preprocessing directive
     16 //
     17 // Universal characters, unicode, char mapping:
     18 // WARNING: `%.*s' is not in NFKC
     19 // WARNING: `%.*s' is not in NFC
     20 //
     21 // Other:
     22 // TODO: Options to support:
     23 //    -fexec-charset,-fwide-exec-charset
     24 //
     25 //===----------------------------------------------------------------------===//
     26 
     27 #include "clang/Lex/Lexer.h"
     28 #include "clang/Basic/CharInfo.h"
     29 #include "clang/Basic/SourceManager.h"
     30 #include "clang/Lex/CodeCompletionHandler.h"
     31 #include "clang/Lex/LexDiagnostic.h"
     32 #include "clang/Lex/Preprocessor.h"
     33 #include "llvm/ADT/STLExtras.h"
     34 #include "llvm/ADT/StringExtras.h"
     35 #include "llvm/ADT/StringSwitch.h"
     36 #include "llvm/Support/Compiler.h"
     37 #include "llvm/Support/ConvertUTF.h"
     38 #include "llvm/Support/MemoryBuffer.h"
     39 #include "UnicodeCharSets.h"
     40 #include <cstring>
     41 using namespace clang;
     42 
     43 //===----------------------------------------------------------------------===//
     44 // Token Class Implementation
     45 //===----------------------------------------------------------------------===//
     46 
     47 /// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
     48 bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const {
     49   if (IdentifierInfo *II = getIdentifierInfo())
     50     return II->getObjCKeywordID() == objcKey;
     51   return false;
     52 }
     53 
     54 /// getObjCKeywordID - Return the ObjC keyword kind.
     55 tok::ObjCKeywordKind Token::getObjCKeywordID() const {
     56   IdentifierInfo *specId = getIdentifierInfo();
     57   return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;
     58 }
     59 
     60 
     61 //===----------------------------------------------------------------------===//
     62 // Lexer Class Implementation
     63 //===----------------------------------------------------------------------===//
     64 
     65 void Lexer::anchor() { }
     66 
     67 void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
     68                       const char *BufEnd) {
     69   BufferStart = BufStart;
     70   BufferPtr = BufPtr;
     71   BufferEnd = BufEnd;
     72 
     73   assert(BufEnd[0] == 0 &&
     74          "We assume that the input buffer has a null character at the end"
     75          " to simplify lexing!");
     76 
     77   // Check whether we have a BOM in the beginning of the buffer. If yes - act
     78   // accordingly. Right now we support only UTF-8 with and without BOM, so, just
     79   // skip the UTF-8 BOM if it's present.
     80   if (BufferStart == BufferPtr) {
     81     // Determine the size of the BOM.
     82     StringRef Buf(BufferStart, BufferEnd - BufferStart);
     83     size_t BOMLength = llvm::StringSwitch<size_t>(Buf)
     84       .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM
     85       .Default(0);
     86 
     87     // Skip the BOM.
     88     BufferPtr += BOMLength;
     89   }
     90 
     91   Is_PragmaLexer = false;
     92   CurrentConflictMarkerState = CMK_None;
     93 
     94   // Start of the file is a start of line.
     95   IsAtStartOfLine = true;
     96 
     97   // We are not after parsing a #.
     98   ParsingPreprocessorDirective = false;
     99 
    100   // We are not after parsing #include.
    101   ParsingFilename = false;
    102 
    103   // We are not in raw mode.  Raw mode disables diagnostics and interpretation
    104   // of tokens (e.g. identifiers, thus disabling macro expansion).  It is used
    105   // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
    106   // or otherwise skipping over tokens.
    107   LexingRawMode = false;
    108 
    109   // Default to not keeping comments.
    110   ExtendedTokenMode = 0;
    111 }
    112 
    113 /// Lexer constructor - Create a new lexer object for the specified buffer
    114 /// with the specified preprocessor managing the lexing process.  This lexer
    115 /// assumes that the associated file buffer and Preprocessor objects will
    116 /// outlive it, so it doesn't take ownership of either of them.
    117 Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *InputFile, Preprocessor &PP)
    118   : PreprocessorLexer(&PP, FID),
    119     FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
    120     LangOpts(PP.getLangOpts()) {
    121 
    122   InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(),
    123             InputFile->getBufferEnd());
    124 
    125   resetExtendedTokenMode();
    126 }
    127 
    128 void Lexer::resetExtendedTokenMode() {
    129   assert(PP && "Cannot reset token mode without a preprocessor");
    130   if (LangOpts.TraditionalCPP)
    131     SetKeepWhitespaceMode(true);
    132   else
    133     SetCommentRetentionState(PP->getCommentRetentionState());
    134 }
    135 
    136 /// Lexer constructor - Create a new raw lexer object.  This object is only
    137 /// suitable for calls to 'LexFromRawLexer'.  This lexer assumes that the text
    138 /// range will outlive it, so it doesn't take ownership of it.
    139 Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts,
    140              const char *BufStart, const char *BufPtr, const char *BufEnd)
    141   : FileLoc(fileloc), LangOpts(langOpts) {
    142 
    143   InitLexer(BufStart, BufPtr, BufEnd);
    144 
    145   // We *are* in raw mode.
    146   LexingRawMode = true;
    147 }
    148 
    149 /// Lexer constructor - Create a new raw lexer object.  This object is only
    150 /// suitable for calls to 'LexFromRawLexer'.  This lexer assumes that the text
    151 /// range will outlive it, so it doesn't take ownership of it.
    152 Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *FromFile,
    153              const SourceManager &SM, const LangOptions &langOpts)
    154   : FileLoc(SM.getLocForStartOfFile(FID)), LangOpts(langOpts) {
    155 
    156   InitLexer(FromFile->getBufferStart(), FromFile->getBufferStart(),
    157             FromFile->getBufferEnd());
    158 
    159   // We *are* in raw mode.
    160   LexingRawMode = true;
    161 }
    162 
    163 /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
    164 /// _Pragma expansion.  This has a variety of magic semantics that this method
    165 /// sets up.  It returns a new'd Lexer that must be delete'd when done.
    166 ///
    167 /// On entrance to this routine, TokStartLoc is a macro location which has a
    168 /// spelling loc that indicates the bytes to be lexed for the token and an
    169 /// expansion location that indicates where all lexed tokens should be
    170 /// "expanded from".
    171 ///
    172 /// FIXME: It would really be nice to make _Pragma just be a wrapper around a
    173 /// normal lexer that remaps tokens as they fly by.  This would require making
    174 /// Preprocessor::Lex virtual.  Given that, we could just dump in a magic lexer
    175 /// interface that could handle this stuff.  This would pull GetMappedTokenLoc
    176 /// out of the critical path of the lexer!
    177 ///
    178 Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc,
    179                                  SourceLocation ExpansionLocStart,
    180                                  SourceLocation ExpansionLocEnd,
    181                                  unsigned TokLen, Preprocessor &PP) {
    182   SourceManager &SM = PP.getSourceManager();
    183 
    184   // Create the lexer as if we were going to lex the file normally.
    185   FileID SpellingFID = SM.getFileID(SpellingLoc);
    186   const llvm::MemoryBuffer *InputFile = SM.getBuffer(SpellingFID);
    187   Lexer *L = new Lexer(SpellingFID, InputFile, PP);
    188 
    189   // Now that the lexer is created, change the start/end locations so that we
    190   // just lex the subsection of the file that we want.  This is lexing from a
    191   // scratch buffer.
    192   const char *StrData = SM.getCharacterData(SpellingLoc);
    193 
    194   L->BufferPtr = StrData;
    195   L->BufferEnd = StrData+TokLen;
    196   assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!");
    197 
    198   // Set the SourceLocation with the remapping information.  This ensures that
    199   // GetMappedTokenLoc will remap the tokens as they are lexed.
    200   L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID),
    201                                      ExpansionLocStart,
    202                                      ExpansionLocEnd, TokLen);
    203 
    204   // Ensure that the lexer thinks it is inside a directive, so that end \n will
    205   // return an EOD token.
    206   L->ParsingPreprocessorDirective = true;
    207 
    208   // This lexer really is for _Pragma.
    209   L->Is_PragmaLexer = true;
    210   return L;
    211 }
    212 
    213 
    214 /// Stringify - Convert the specified string into a C string, with surrounding
    215 /// ""'s, and with escaped \ and " characters.
    216 std::string Lexer::Stringify(const std::string &Str, bool Charify) {
    217   std::string Result = Str;
    218   char Quote = Charify ? '\'' : '"';
    219   for (unsigned i = 0, e = Result.size(); i != e; ++i) {
    220     if (Result[i] == '\\' || Result[i] == Quote) {
    221       Result.insert(Result.begin()+i, '\\');
    222       ++i; ++e;
    223     }
    224   }
    225   return Result;
    226 }
    227 
    228 /// Stringify - Convert the specified string into a C string by escaping '\'
    229 /// and " characters.  This does not add surrounding ""'s to the string.
    230 void Lexer::Stringify(SmallVectorImpl<char> &Str) {
    231   for (unsigned i = 0, e = Str.size(); i != e; ++i) {
    232     if (Str[i] == '\\' || Str[i] == '"') {
    233       Str.insert(Str.begin()+i, '\\');
    234       ++i; ++e;
    235     }
    236   }
    237 }
    238 
    239 //===----------------------------------------------------------------------===//
    240 // Token Spelling
    241 //===----------------------------------------------------------------------===//
    242 
    243 /// \brief Slow case of getSpelling. Extract the characters comprising the
    244 /// spelling of this token from the provided input buffer.
    245 static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
    246                               const LangOptions &LangOpts, char *Spelling) {
    247   assert(Tok.needsCleaning() && "getSpellingSlow called on simple token");
    248 
    249   size_t Length = 0;
    250   const char *BufEnd = BufPtr + Tok.getLength();
    251 
    252   if (Tok.is(tok::string_literal)) {
    253     // Munch the encoding-prefix and opening double-quote.
    254     while (BufPtr < BufEnd) {
    255       unsigned Size;
    256       Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
    257       BufPtr += Size;
    258 
    259       if (Spelling[Length - 1] == '"')
    260         break;
    261     }
    262 
    263     // Raw string literals need special handling; trigraph expansion and line
    264     // splicing do not occur within their d-char-sequence nor within their
    265     // r-char-sequence.
    266     if (Length >= 2 &&
    267         Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') {
    268       // Search backwards from the end of the token to find the matching closing
    269       // quote.
    270       const char *RawEnd = BufEnd;
    271       do --RawEnd; while (*RawEnd != '"');
    272       size_t RawLength = RawEnd - BufPtr + 1;
    273 
    274       // Everything between the quotes is included verbatim in the spelling.
    275       memcpy(Spelling + Length, BufPtr, RawLength);
    276       Length += RawLength;
    277       BufPtr += RawLength;
    278 
    279       // The rest of the token is lexed normally.
    280     }
    281   }
    282 
    283   while (BufPtr < BufEnd) {
    284     unsigned Size;
    285     Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
    286     BufPtr += Size;
    287   }
    288 
    289   assert(Length < Tok.getLength() &&
    290          "NeedsCleaning flag set on token that didn't need cleaning!");
    291   return Length;
    292 }
    293 
    294 /// getSpelling() - Return the 'spelling' of this token.  The spelling of a
    295 /// token are the characters used to represent the token in the source file
    296 /// after trigraph expansion and escaped-newline folding.  In particular, this
    297 /// wants to get the true, uncanonicalized, spelling of things like digraphs
    298 /// UCNs, etc.
    299 StringRef Lexer::getSpelling(SourceLocation loc,
    300                              SmallVectorImpl<char> &buffer,
    301                              const SourceManager &SM,
    302                              const LangOptions &options,
    303                              bool *invalid) {
    304   // Break down the source location.
    305   std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc);
    306 
    307   // Try to the load the file buffer.
    308   bool invalidTemp = false;
    309   StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);
    310   if (invalidTemp) {
    311     if (invalid) *invalid = true;
    312     return StringRef();
    313   }
    314 
    315   const char *tokenBegin = file.data() + locInfo.second;
    316 
    317   // Lex from the start of the given location.
    318   Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options,
    319               file.begin(), tokenBegin, file.end());
    320   Token token;
    321   lexer.LexFromRawLexer(token);
    322 
    323   unsigned length = token.getLength();
    324 
    325   // Common case:  no need for cleaning.
    326   if (!token.needsCleaning())
    327     return StringRef(tokenBegin, length);
    328 
    329   // Hard case, we need to relex the characters into the string.
    330   buffer.resize(length);
    331   buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data()));
    332   return StringRef(buffer.data(), buffer.size());
    333 }
    334 
    335 /// getSpelling() - Return the 'spelling' of this token.  The spelling of a
    336 /// token are the characters used to represent the token in the source file
    337 /// after trigraph expansion and escaped-newline folding.  In particular, this
    338 /// wants to get the true, uncanonicalized, spelling of things like digraphs
    339 /// UCNs, etc.
    340 std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
    341                                const LangOptions &LangOpts, bool *Invalid) {
    342   assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
    343 
    344   bool CharDataInvalid = false;
    345   const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(),
    346                                                     &CharDataInvalid);
    347   if (Invalid)
    348     *Invalid = CharDataInvalid;
    349   if (CharDataInvalid)
    350     return std::string();
    351 
    352   // If this token contains nothing interesting, return it directly.
    353   if (!Tok.needsCleaning())
    354     return std::string(TokStart, TokStart + Tok.getLength());
    355 
    356   std::string Result;
    357   Result.resize(Tok.getLength());
    358   Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin()));
    359   return Result;
    360 }
    361 
    362 /// getSpelling - This method is used to get the spelling of a token into a
    363 /// preallocated buffer, instead of as an std::string.  The caller is required
    364 /// to allocate enough space for the token, which is guaranteed to be at least
    365 /// Tok.getLength() bytes long.  The actual length of the token is returned.
    366 ///
    367 /// Note that this method may do two possible things: it may either fill in
    368 /// the buffer specified with characters, or it may *change the input pointer*
    369 /// to point to a constant buffer with the data already in it (avoiding a
    370 /// copy).  The caller is not allowed to modify the returned buffer pointer
    371 /// if an internal buffer is returned.
    372 unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
    373                             const SourceManager &SourceMgr,
    374                             const LangOptions &LangOpts, bool *Invalid) {
    375   assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
    376 
    377   const char *TokStart = 0;
    378   // NOTE: this has to be checked *before* testing for an IdentifierInfo.
    379   if (Tok.is(tok::raw_identifier))
    380     TokStart = Tok.getRawIdentifierData();
    381   else if (!Tok.hasUCN()) {
    382     if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
    383       // Just return the string from the identifier table, which is very quick.
    384       Buffer = II->getNameStart();
    385       return II->getLength();
    386     }
    387   }
    388 
    389   // NOTE: this can be checked even after testing for an IdentifierInfo.
    390   if (Tok.isLiteral())
    391     TokStart = Tok.getLiteralData();
    392 
    393   if (TokStart == 0) {
    394     // Compute the start of the token in the input lexer buffer.
    395     bool CharDataInvalid = false;
    396     TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid);
    397     if (Invalid)
    398       *Invalid = CharDataInvalid;
    399     if (CharDataInvalid) {
    400       Buffer = "";
    401       return 0;
    402     }
    403   }
    404 
    405   // If this token contains nothing interesting, return it directly.
    406   if (!Tok.needsCleaning()) {
    407     Buffer = TokStart;
    408     return Tok.getLength();
    409   }
    410 
    411   // Otherwise, hard case, relex the characters into the string.
    412   return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer));
    413 }
    414 
    415 
    416 /// MeasureTokenLength - Relex the token at the specified location and return
    417 /// its length in bytes in the input file.  If the token needs cleaning (e.g.
    418 /// includes a trigraph or an escaped newline) then this count includes bytes
    419 /// that are part of that.
    420 unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
    421                                    const SourceManager &SM,
    422                                    const LangOptions &LangOpts) {
    423   Token TheTok;
    424   if (getRawToken(Loc, TheTok, SM, LangOpts))
    425     return 0;
    426   return TheTok.getLength();
    427 }
    428 
    429 /// \brief Relex the token at the specified location.
    430 /// \returns true if there was a failure, false on success.
    431 bool Lexer::getRawToken(SourceLocation Loc, Token &Result,
    432                         const SourceManager &SM,
    433                         const LangOptions &LangOpts) {
    434   // TODO: this could be special cased for common tokens like identifiers, ')',
    435   // etc to make this faster, if it mattered.  Just look at StrData[0] to handle
    436   // all obviously single-char tokens.  This could use
    437   // Lexer::isObviouslySimpleCharacter for example to handle identifiers or
    438   // something.
    439 
    440   // If this comes from a macro expansion, we really do want the macro name, not
    441   // the token this macro expanded to.
    442   Loc = SM.getExpansionLoc(Loc);
    443   std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
    444   bool Invalid = false;
    445   StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
    446   if (Invalid)
    447     return true;
    448 
    449   const char *StrData = Buffer.data()+LocInfo.second;
    450 
    451   if (isWhitespace(StrData[0]))
    452     return true;
    453 
    454   // Create a lexer starting at the beginning of this token.
    455   Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts,
    456                  Buffer.begin(), StrData, Buffer.end());
    457   TheLexer.SetCommentRetentionState(true);
    458   TheLexer.LexFromRawLexer(Result);
    459   return false;
    460 }
    461 
    462 static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
    463                                               const SourceManager &SM,
    464                                               const LangOptions &LangOpts) {
    465   assert(Loc.isFileID());
    466   std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
    467   if (LocInfo.first.isInvalid())
    468     return Loc;
    469 
    470   bool Invalid = false;
    471   StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
    472   if (Invalid)
    473     return Loc;
    474 
    475   // Back up from the current location until we hit the beginning of a line
    476   // (or the buffer). We'll relex from that point.
    477   const char *BufStart = Buffer.data();
    478   if (LocInfo.second >= Buffer.size())
    479     return Loc;
    480 
    481   const char *StrData = BufStart+LocInfo.second;
    482   if (StrData[0] == '\n' || StrData[0] == '\r')
    483     return Loc;
    484 
    485   const char *LexStart = StrData;
    486   while (LexStart != BufStart) {
    487     if (LexStart[0] == '\n' || LexStart[0] == '\r') {
    488       ++LexStart;
    489       break;
    490     }
    491 
    492     --LexStart;
    493   }
    494 
    495   // Create a lexer starting at the beginning of this token.
    496   SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
    497   Lexer TheLexer(LexerStartLoc, LangOpts, BufStart, LexStart, Buffer.end());
    498   TheLexer.SetCommentRetentionState(true);
    499 
    500   // Lex tokens until we find the token that contains the source location.
    501   Token TheTok;
    502   do {
    503     TheLexer.LexFromRawLexer(TheTok);
    504 
    505     if (TheLexer.getBufferLocation() > StrData) {
    506       // Lexing this token has taken the lexer past the source location we're
    507       // looking for. If the current token encompasses our source location,
    508       // return the beginning of that token.
    509       if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
    510         return TheTok.getLocation();
    511 
    512       // We ended up skipping over the source location entirely, which means
    513       // that it points into whitespace. We're done here.
    514       break;
    515     }
    516   } while (TheTok.getKind() != tok::eof);
    517 
    518   // We've passed our source location; just return the original source location.
    519   return Loc;
    520 }
    521 
    522 SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
    523                                           const SourceManager &SM,
    524                                           const LangOptions &LangOpts) {
    525  if (Loc.isFileID())
    526    return getBeginningOfFileToken(Loc, SM, LangOpts);
    527 
    528  if (!SM.isMacroArgExpansion(Loc))
    529    return Loc;
    530 
    531  SourceLocation FileLoc = SM.getSpellingLoc(Loc);
    532  SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
    533  std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
    534  std::pair<FileID, unsigned> BeginFileLocInfo
    535    = SM.getDecomposedLoc(BeginFileLoc);
    536  assert(FileLocInfo.first == BeginFileLocInfo.first &&
    537         FileLocInfo.second >= BeginFileLocInfo.second);
    538  return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
    539 }
    540 
    541 namespace {
    542   enum PreambleDirectiveKind {
    543     PDK_Skipped,
    544     PDK_StartIf,
    545     PDK_EndIf,
    546     PDK_Unknown
    547   };
    548 }
    549 
    550 std::pair<unsigned, bool>
    551 Lexer::ComputePreamble(const llvm::MemoryBuffer *Buffer,
    552                        const LangOptions &LangOpts, unsigned MaxLines) {
    553   // Create a lexer starting at the beginning of the file. Note that we use a
    554   // "fake" file source location at offset 1 so that the lexer will track our
    555   // position within the file.
    556   const unsigned StartOffset = 1;
    557   SourceLocation FileLoc = SourceLocation::getFromRawEncoding(StartOffset);
    558   Lexer TheLexer(FileLoc, LangOpts, Buffer->getBufferStart(),
    559                  Buffer->getBufferStart(), Buffer->getBufferEnd());
    560 
    561   // StartLoc will differ from FileLoc if there is a BOM that was skipped.
    562   SourceLocation StartLoc = TheLexer.getSourceLocation();
    563 
    564   bool InPreprocessorDirective = false;
    565   Token TheTok;
    566   Token IfStartTok;
    567   unsigned IfCount = 0;
    568 
    569   unsigned MaxLineOffset = 0;
    570   if (MaxLines) {
    571     const char *CurPtr = Buffer->getBufferStart();
    572     unsigned CurLine = 0;
    573     while (CurPtr != Buffer->getBufferEnd()) {
    574       char ch = *CurPtr++;
    575       if (ch == '\n') {
    576         ++CurLine;
    577         if (CurLine == MaxLines)
    578           break;
    579       }
    580     }
    581     if (CurPtr != Buffer->getBufferEnd())
    582       MaxLineOffset = CurPtr - Buffer->getBufferStart();
    583   }
    584 
    585   do {
    586     TheLexer.LexFromRawLexer(TheTok);
    587 
    588     if (InPreprocessorDirective) {
    589       // If we've hit the end of the file, we're done.
    590       if (TheTok.getKind() == tok::eof) {
    591         break;
    592       }
    593 
    594       // If we haven't hit the end of the preprocessor directive, skip this
    595       // token.
    596       if (!TheTok.isAtStartOfLine())
    597         continue;
    598 
    599       // We've passed the end of the preprocessor directive, and will look
    600       // at this token again below.
    601       InPreprocessorDirective = false;
    602     }
    603 
    604     // Keep track of the # of lines in the preamble.
    605     if (TheTok.isAtStartOfLine()) {
    606       unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;
    607 
    608       // If we were asked to limit the number of lines in the preamble,
    609       // and we're about to exceed that limit, we're done.
    610       if (MaxLineOffset && TokOffset >= MaxLineOffset)
    611         break;
    612     }
    613 
    614     // Comments are okay; skip over them.
    615     if (TheTok.getKind() == tok::comment)
    616       continue;
    617 
    618     if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) {
    619       // This is the start of a preprocessor directive.
    620       Token HashTok = TheTok;
    621       InPreprocessorDirective = true;
    622 
    623       // Figure out which directive this is. Since we're lexing raw tokens,
    624       // we don't have an identifier table available. Instead, just look at
    625       // the raw identifier to recognize and categorize preprocessor directives.
    626       TheLexer.LexFromRawLexer(TheTok);
    627       if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) {
    628         StringRef Keyword(TheTok.getRawIdentifierData(),
    629                                 TheTok.getLength());
    630         PreambleDirectiveKind PDK
    631           = llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
    632               .Case("include", PDK_Skipped)
    633               .Case("__include_macros", PDK_Skipped)
    634               .Case("define", PDK_Skipped)
    635               .Case("undef", PDK_Skipped)
    636               .Case("line", PDK_Skipped)
    637               .Case("error", PDK_Skipped)
    638               .Case("pragma", PDK_Skipped)
    639               .Case("import", PDK_Skipped)
    640               .Case("include_next", PDK_Skipped)
    641               .Case("warning", PDK_Skipped)
    642               .Case("ident", PDK_Skipped)
    643               .Case("sccs", PDK_Skipped)
    644               .Case("assert", PDK_Skipped)
    645               .Case("unassert", PDK_Skipped)
    646               .Case("if", PDK_StartIf)
    647               .Case("ifdef", PDK_StartIf)
    648               .Case("ifndef", PDK_StartIf)
    649               .Case("elif", PDK_Skipped)
    650               .Case("else", PDK_Skipped)
    651               .Case("endif", PDK_EndIf)
    652               .Default(PDK_Unknown);
    653 
    654         switch (PDK) {
    655         case PDK_Skipped:
    656           continue;
    657 
    658         case PDK_StartIf:
    659           if (IfCount == 0)
    660             IfStartTok = HashTok;
    661 
    662           ++IfCount;
    663           continue;
    664 
    665         case PDK_EndIf:
    666           // Mismatched #endif. The preamble ends here.
    667           if (IfCount == 0)
    668             break;
    669 
    670           --IfCount;
    671           continue;
    672 
    673         case PDK_Unknown:
    674           // We don't know what this directive is; stop at the '#'.
    675           break;
    676         }
    677       }
    678 
    679       // We only end up here if we didn't recognize the preprocessor
    680       // directive or it was one that can't occur in the preamble at this
    681       // point. Roll back the current token to the location of the '#'.
    682       InPreprocessorDirective = false;
    683       TheTok = HashTok;
    684     }
    685 
    686     // We hit a token that we don't recognize as being in the
    687     // "preprocessing only" part of the file, so we're no longer in
    688     // the preamble.
    689     break;
    690   } while (true);
    691 
    692   SourceLocation End = IfCount? IfStartTok.getLocation() : TheTok.getLocation();
    693   return std::make_pair(End.getRawEncoding() - StartLoc.getRawEncoding(),
    694                         IfCount? IfStartTok.isAtStartOfLine()
    695                                : TheTok.isAtStartOfLine());
    696 }
    697 
    698 
    699 /// AdvanceToTokenCharacter - Given a location that specifies the start of a
    700 /// token, return a new location that specifies a character within the token.
    701 SourceLocation Lexer::AdvanceToTokenCharacter(SourceLocation TokStart,
    702                                               unsigned CharNo,
    703                                               const SourceManager &SM,
    704                                               const LangOptions &LangOpts) {
    705   // Figure out how many physical characters away the specified expansion
    706   // character is.  This needs to take into consideration newlines and
    707   // trigraphs.
    708   bool Invalid = false;
    709   const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);
    710 
    711   // If they request the first char of the token, we're trivially done.
    712   if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr)))
    713     return TokStart;
    714 
    715   unsigned PhysOffset = 0;
    716 
    717   // The usual case is that tokens don't contain anything interesting.  Skip
    718   // over the uninteresting characters.  If a token only consists of simple
    719   // chars, this method is extremely fast.
    720   while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {
    721     if (CharNo == 0)
    722       return TokStart.getLocWithOffset(PhysOffset);
    723     ++TokPtr, --CharNo, ++PhysOffset;
    724   }
    725 
    726   // If we have a character that may be a trigraph or escaped newline, use a
    727   // lexer to parse it correctly.
    728   for (; CharNo; --CharNo) {
    729     unsigned Size;
    730     Lexer::getCharAndSizeNoWarn(TokPtr, Size, LangOpts);
    731     TokPtr += Size;
    732     PhysOffset += Size;
    733   }
    734 
    735   // Final detail: if we end up on an escaped newline, we want to return the
    736   // location of the actual byte of the token.  For example foo\<newline>bar
    737   // advanced by 3 should return the location of b, not of \\.  One compounding
    738   // detail of this is that the escape may be made by a trigraph.
    739   if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
    740     PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;
    741 
    742   return TokStart.getLocWithOffset(PhysOffset);
    743 }
    744 
    745 /// \brief Computes the source location just past the end of the
    746 /// token at this source location.
    747 ///
    748 /// This routine can be used to produce a source location that
    749 /// points just past the end of the token referenced by \p Loc, and
    750 /// is generally used when a diagnostic needs to point just after a
    751 /// token where it expected something different that it received. If
    752 /// the returned source location would not be meaningful (e.g., if
    753 /// it points into a macro), this routine returns an invalid
    754 /// source location.
    755 ///
    756 /// \param Offset an offset from the end of the token, where the source
    757 /// location should refer to. The default offset (0) produces a source
    758 /// location pointing just past the end of the token; an offset of 1 produces
    759 /// a source location pointing to the last character in the token, etc.
    760 SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
    761                                           const SourceManager &SM,
    762                                           const LangOptions &LangOpts) {
    763   if (Loc.isInvalid())
    764     return SourceLocation();
    765 
    766   if (Loc.isMacroID()) {
    767     if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
    768       return SourceLocation(); // Points inside the macro expansion.
    769   }
    770 
    771   unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
    772   if (Len > Offset)
    773     Len = Len - Offset;
    774   else
    775     return Loc;
    776 
    777   return Loc.getLocWithOffset(Len);
    778 }
    779 
    780 /// \brief Returns true if the given MacroID location points at the first
    781 /// token of the macro expansion.
    782 bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc,
    783                                       const SourceManager &SM,
    784                                       const LangOptions &LangOpts,
    785                                       SourceLocation *MacroBegin) {
    786   assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
    787 
    788   std::pair<FileID, unsigned> infoLoc = SM.getDecomposedLoc(loc);
    789   // FIXME: If the token comes from the macro token paste operator ('##')
    790   // this function will always return false;
    791   if (infoLoc.second > 0)
    792     return false; // Does not point at the start of token.
    793 
    794   SourceLocation expansionLoc =
    795     SM.getSLocEntry(infoLoc.first).getExpansion().getExpansionLocStart();
    796   if (expansionLoc.isFileID()) {
    797     // No other macro expansions, this is the first.
    798     if (MacroBegin)
    799       *MacroBegin = expansionLoc;
    800     return true;
    801   }
    802 
    803   return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin);
    804 }
    805 
    806 /// \brief Returns true if the given MacroID location points at the last
    807 /// token of the macro expansion.
    808 bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc,
    809                                     const SourceManager &SM,
    810                                     const LangOptions &LangOpts,
    811                                     SourceLocation *MacroEnd) {
    812   assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
    813 
    814   SourceLocation spellLoc = SM.getSpellingLoc(loc);
    815   unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts);
    816   if (tokLen == 0)
    817     return false;
    818 
    819   FileID FID = SM.getFileID(loc);
    820   SourceLocation afterLoc = loc.getLocWithOffset(tokLen+1);
    821   if (SM.isInFileID(afterLoc, FID))
    822     return false; // Still in the same FileID, does not point to the last token.
    823 
    824   // FIXME: If the token comes from the macro token paste operator ('##')
    825   // or the stringify operator ('#') this function will always return false;
    826 
    827   SourceLocation expansionLoc =
    828     SM.getSLocEntry(FID).getExpansion().getExpansionLocEnd();
    829   if (expansionLoc.isFileID()) {
    830     // No other macro expansions.
    831     if (MacroEnd)
    832       *MacroEnd = expansionLoc;
    833     return true;
    834   }
    835 
    836   return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd);
    837 }
    838 
    839 static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range,
    840                                              const SourceManager &SM,
    841                                              const LangOptions &LangOpts) {
    842   SourceLocation Begin = Range.getBegin();
    843   SourceLocation End = Range.getEnd();
    844   assert(Begin.isFileID() && End.isFileID());
    845   if (Range.isTokenRange()) {
    846     End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts);
    847     if (End.isInvalid())
    848       return CharSourceRange();
    849   }
    850 
    851   // Break down the source locations.
    852   FileID FID;
    853   unsigned BeginOffs;
    854   llvm::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin);
    855   if (FID.isInvalid())
    856     return CharSourceRange();
    857 
    858   unsigned EndOffs;
    859   if (!SM.isInFileID(End, FID, &EndOffs) ||
    860       BeginOffs > EndOffs)
    861     return CharSourceRange();
    862 
    863   return CharSourceRange::getCharRange(Begin, End);
    864 }
    865 
    866 CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
    867                                          const SourceManager &SM,
    868                                          const LangOptions &LangOpts) {
    869   SourceLocation Begin = Range.getBegin();
    870   SourceLocation End = Range.getEnd();
    871   if (Begin.isInvalid() || End.isInvalid())
    872     return CharSourceRange();
    873 
    874   if (Begin.isFileID() && End.isFileID())
    875     return makeRangeFromFileLocs(Range, SM, LangOpts);
    876 
    877   if (Begin.isMacroID() && End.isFileID()) {
    878     if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin))
    879       return CharSourceRange();
    880     Range.setBegin(Begin);
    881     return makeRangeFromFileLocs(Range, SM, LangOpts);
    882   }
    883 
    884   if (Begin.isFileID() && End.isMacroID()) {
    885     if ((Range.isTokenRange() && !isAtEndOfMacroExpansion(End, SM, LangOpts,
    886                                                           &End)) ||
    887         (Range.isCharRange() && !isAtStartOfMacroExpansion(End, SM, LangOpts,
    888                                                            &End)))
    889       return CharSourceRange();
    890     Range.setEnd(End);
    891     return makeRangeFromFileLocs(Range, SM, LangOpts);
    892   }
    893 
    894   assert(Begin.isMacroID() && End.isMacroID());
    895   SourceLocation MacroBegin, MacroEnd;
    896   if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) &&
    897       ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts,
    898                                                         &MacroEnd)) ||
    899        (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts,
    900                                                          &MacroEnd)))) {
    901     Range.setBegin(MacroBegin);
    902     Range.setEnd(MacroEnd);
    903     return makeRangeFromFileLocs(Range, SM, LangOpts);
    904   }
    905 
    906   FileID FID;
    907   unsigned BeginOffs;
    908   llvm::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin);
    909   if (FID.isInvalid())
    910     return CharSourceRange();
    911 
    912   unsigned EndOffs;
    913   if (!SM.isInFileID(End, FID, &EndOffs) ||
    914       BeginOffs > EndOffs)
    915     return CharSourceRange();
    916 
    917   const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
    918   const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
    919   if (Expansion.isMacroArgExpansion() &&
    920       Expansion.getSpellingLoc().isFileID()) {
    921     SourceLocation SpellLoc = Expansion.getSpellingLoc();
    922     Range.setBegin(SpellLoc.getLocWithOffset(BeginOffs));
    923     Range.setEnd(SpellLoc.getLocWithOffset(EndOffs));
    924     return makeRangeFromFileLocs(Range, SM, LangOpts);
    925   }
    926 
    927   return CharSourceRange();
    928 }
    929 
    930 StringRef Lexer::getSourceText(CharSourceRange Range,
    931                                const SourceManager &SM,
    932                                const LangOptions &LangOpts,
    933                                bool *Invalid) {
    934   Range = makeFileCharRange(Range, SM, LangOpts);
    935   if (Range.isInvalid()) {
    936     if (Invalid) *Invalid = true;
    937     return StringRef();
    938   }
    939 
    940   // Break down the source location.
    941   std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin());
    942   if (beginInfo.first.isInvalid()) {
    943     if (Invalid) *Invalid = true;
    944     return StringRef();
    945   }
    946 
    947   unsigned EndOffs;
    948   if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||
    949       beginInfo.second > EndOffs) {
    950     if (Invalid) *Invalid = true;
    951     return StringRef();
    952   }
    953 
    954   // Try to the load the file buffer.
    955   bool invalidTemp = false;
    956   StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);
    957   if (invalidTemp) {
    958     if (Invalid) *Invalid = true;
    959     return StringRef();
    960   }
    961 
    962   if (Invalid) *Invalid = false;
    963   return file.substr(beginInfo.second, EndOffs - beginInfo.second);
    964 }
    965 
    966 StringRef Lexer::getImmediateMacroName(SourceLocation Loc,
    967                                        const SourceManager &SM,
    968                                        const LangOptions &LangOpts) {
    969   assert(Loc.isMacroID() && "Only reasonble to call this on macros");
    970 
    971   // Find the location of the immediate macro expansion.
    972   while (1) {
    973     FileID FID = SM.getFileID(Loc);
    974     const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
    975     const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
    976     Loc = Expansion.getExpansionLocStart();
    977     if (!Expansion.isMacroArgExpansion())
    978       break;
    979 
    980     // For macro arguments we need to check that the argument did not come
    981     // from an inner macro, e.g: "MAC1( MAC2(foo) )"
    982 
    983     // Loc points to the argument id of the macro definition, move to the
    984     // macro expansion.
    985     Loc = SM.getImmediateExpansionRange(Loc).first;
    986     SourceLocation SpellLoc = Expansion.getSpellingLoc();
    987     if (SpellLoc.isFileID())
    988       break; // No inner macro.
    989 
    990     // If spelling location resides in the same FileID as macro expansion
    991     // location, it means there is no inner macro.
    992     FileID MacroFID = SM.getFileID(Loc);
    993     if (SM.isInFileID(SpellLoc, MacroFID))
    994       break;
    995 
    996     // Argument came from inner macro.
    997     Loc = SpellLoc;
    998   }
    999 
   1000   // Find the spelling location of the start of the non-argument expansion
   1001   // range. This is where the macro name was spelled in order to begin
   1002   // expanding this macro.
   1003   Loc = SM.getSpellingLoc(Loc);
   1004 
   1005   // Dig out the buffer where the macro name was spelled and the extents of the
   1006   // name so that we can render it into the expansion note.
   1007   std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
   1008   unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
   1009   StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
   1010   return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
   1011 }
   1012 
   1013 bool Lexer::isIdentifierBodyChar(char c, const LangOptions &LangOpts) {
   1014   return isIdentifierBody(c, LangOpts.DollarIdents);
   1015 }
   1016 
   1017 
   1018 //===----------------------------------------------------------------------===//
   1019 // Diagnostics forwarding code.
   1020 //===----------------------------------------------------------------------===//
   1021 
   1022 /// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
   1023 /// lexer buffer was all expanded at a single point, perform the mapping.
   1024 /// This is currently only used for _Pragma implementation, so it is the slow
   1025 /// path of the hot getSourceLocation method.  Do not allow it to be inlined.
   1026 static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(
   1027     Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen);
   1028 static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
   1029                                         SourceLocation FileLoc,
   1030                                         unsigned CharNo, unsigned TokLen) {
   1031   assert(FileLoc.isMacroID() && "Must be a macro expansion");
   1032 
   1033   // Otherwise, we're lexing "mapped tokens".  This is used for things like
   1034   // _Pragma handling.  Combine the expansion location of FileLoc with the
   1035   // spelling location.
   1036   SourceManager &SM = PP.getSourceManager();
   1037 
   1038   // Create a new SLoc which is expanded from Expansion(FileLoc) but whose
   1039   // characters come from spelling(FileLoc)+Offset.
   1040   SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc);
   1041   SpellingLoc = SpellingLoc.getLocWithOffset(CharNo);
   1042 
   1043   // Figure out the expansion loc range, which is the range covered by the
   1044   // original _Pragma(...) sequence.
   1045   std::pair<SourceLocation,SourceLocation> II =
   1046     SM.getImmediateExpansionRange(FileLoc);
   1047 
   1048   return SM.createExpansionLoc(SpellingLoc, II.first, II.second, TokLen);
   1049 }
   1050 
   1051 /// getSourceLocation - Return a source location identifier for the specified
   1052 /// offset in the current file.
   1053 SourceLocation Lexer::getSourceLocation(const char *Loc,
   1054                                         unsigned TokLen) const {
   1055   assert(Loc >= BufferStart && Loc <= BufferEnd &&
   1056          "Location out of range for this buffer!");
   1057 
   1058   // In the normal case, we're just lexing from a simple file buffer, return
   1059   // the file id from FileLoc with the offset specified.
   1060   unsigned CharNo = Loc-BufferStart;
   1061   if (FileLoc.isFileID())
   1062     return FileLoc.getLocWithOffset(CharNo);
   1063 
   1064   // Otherwise, this is the _Pragma lexer case, which pretends that all of the
   1065   // tokens are lexed from where the _Pragma was defined.
   1066   assert(PP && "This doesn't work on raw lexers");
   1067   return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen);
   1068 }
   1069 
   1070 /// Diag - Forwarding function for diagnostics.  This translate a source
   1071 /// position in the current buffer into a SourceLocation object for rendering.
   1072 DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
   1073   return PP->Diag(getSourceLocation(Loc), DiagID);
   1074 }
   1075 
   1076 //===----------------------------------------------------------------------===//
   1077 // Trigraph and Escaped Newline Handling Code.
   1078 //===----------------------------------------------------------------------===//
   1079 
   1080 /// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair,
   1081 /// return the decoded trigraph letter it corresponds to, or '\0' if nothing.
   1082 static char GetTrigraphCharForLetter(char Letter) {
   1083   switch (Letter) {
   1084   default:   return 0;
   1085   case '=':  return '#';
   1086   case ')':  return ']';
   1087   case '(':  return '[';
   1088   case '!':  return '|';
   1089   case '\'': return '^';
   1090   case '>':  return '}';
   1091   case '/':  return '\\';
   1092   case '<':  return '{';
   1093   case '-':  return '~';
   1094   }
   1095 }
   1096 
   1097 /// DecodeTrigraphChar - If the specified character is a legal trigraph when
   1098 /// prefixed with ??, emit a trigraph warning.  If trigraphs are enabled,
   1099 /// return the result character.  Finally, emit a warning about trigraph use
   1100 /// whether trigraphs are enabled or not.
   1101 static char DecodeTrigraphChar(const char *CP, Lexer *L) {
   1102   char Res = GetTrigraphCharForLetter(*CP);
   1103   if (!Res || !L) return Res;
   1104 
   1105   if (!L->getLangOpts().Trigraphs) {
   1106     if (!L->isLexingRawMode())
   1107       L->Diag(CP-2, diag::trigraph_ignored);
   1108     return 0;
   1109   }
   1110 
   1111   if (!L->isLexingRawMode())
   1112     L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);
   1113   return Res;
   1114 }
   1115 
   1116 /// getEscapedNewLineSize - Return the size of the specified escaped newline,
   1117 /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a
   1118 /// trigraph equivalent on entry to this function.
   1119 unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
   1120   unsigned Size = 0;
   1121   while (isWhitespace(Ptr[Size])) {
   1122     ++Size;
   1123 
   1124     if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r')
   1125       continue;
   1126 
   1127     // If this is a \r\n or \n\r, skip the other half.
   1128     if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') &&
   1129         Ptr[Size-1] != Ptr[Size])
   1130       ++Size;
   1131 
   1132     return Size;
   1133   }
   1134 
   1135   // Not an escaped newline, must be a \t or something else.
   1136   return 0;
   1137 }
   1138 
   1139 /// SkipEscapedNewLines - If P points to an escaped newline (or a series of
   1140 /// them), skip over them and return the first non-escaped-newline found,
   1141 /// otherwise return P.
   1142 const char *Lexer::SkipEscapedNewLines(const char *P) {
   1143   while (1) {
   1144     const char *AfterEscape;
   1145     if (*P == '\\') {
   1146       AfterEscape = P+1;
   1147     } else if (*P == '?') {
   1148       // If not a trigraph for escape, bail out.
   1149       if (P[1] != '?' || P[2] != '/')
   1150         return P;
   1151       AfterEscape = P+3;
   1152     } else {
   1153       return P;
   1154     }
   1155 
   1156     unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
   1157     if (NewLineSize == 0) return P;
   1158     P = AfterEscape+NewLineSize;
   1159   }
   1160 }
   1161 
   1162 /// \brief Checks that the given token is the first token that occurs after the
   1163 /// given location (this excludes comments and whitespace). Returns the location
   1164 /// immediately after the specified token. If the token is not found or the
   1165 /// location is inside a macro, the returned source location will be invalid.
   1166 SourceLocation Lexer::findLocationAfterToken(SourceLocation Loc,
   1167                                         tok::TokenKind TKind,
   1168                                         const SourceManager &SM,
   1169                                         const LangOptions &LangOpts,
   1170                                         bool SkipTrailingWhitespaceAndNewLine) {
   1171   if (Loc.isMacroID()) {
   1172     if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
   1173       return SourceLocation();
   1174   }
   1175   Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts);
   1176 
   1177   // Break down the source location.
   1178   std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
   1179 
   1180   // Try to load the file buffer.
   1181   bool InvalidTemp = false;
   1182   StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
   1183   if (InvalidTemp)
   1184     return SourceLocation();
   1185 
   1186   const char *TokenBegin = File.data() + LocInfo.second;
   1187 
   1188   // Lex from the start of the given location.
   1189   Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
   1190                                       TokenBegin, File.end());
   1191   // Find the token.
   1192   Token Tok;
   1193   lexer.LexFromRawLexer(Tok);
   1194   if (Tok.isNot(TKind))
   1195     return SourceLocation();
   1196   SourceLocation TokenLoc = Tok.getLocation();
   1197 
   1198   // Calculate how much whitespace needs to be skipped if any.
   1199   unsigned NumWhitespaceChars = 0;
   1200   if (SkipTrailingWhitespaceAndNewLine) {
   1201     const char *TokenEnd = SM.getCharacterData(TokenLoc) +
   1202                            Tok.getLength();
   1203     unsigned char C = *TokenEnd;
   1204     while (isHorizontalWhitespace(C)) {
   1205       C = *(++TokenEnd);
   1206       NumWhitespaceChars++;
   1207     }
   1208 
   1209     // Skip \r, \n, \r\n, or \n\r
   1210     if (C == '\n' || C == '\r') {
   1211       char PrevC = C;
   1212       C = *(++TokenEnd);
   1213       NumWhitespaceChars++;
   1214       if ((C == '\n' || C == '\r') && C != PrevC)
   1215         NumWhitespaceChars++;
   1216     }
   1217   }
   1218 
   1219   return TokenLoc.getLocWithOffset(Tok.getLength() + NumWhitespaceChars);
   1220 }
   1221 
   1222 /// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
   1223 /// get its size, and return it.  This is tricky in several cases:
   1224 ///   1. If currently at the start of a trigraph, we warn about the trigraph,
   1225 ///      then either return the trigraph (skipping 3 chars) or the '?',
   1226 ///      depending on whether trigraphs are enabled or not.
   1227 ///   2. If this is an escaped newline (potentially with whitespace between
   1228 ///      the backslash and newline), implicitly skip the newline and return
   1229 ///      the char after it.
   1230 ///
   1231 /// This handles the slow/uncommon case of the getCharAndSize method.  Here we
   1232 /// know that we can accumulate into Size, and that we have already incremented
   1233 /// Ptr by Size bytes.
   1234 ///
   1235 /// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
   1236 /// be updated to match.
   1237 ///
   1238 char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
   1239                                Token *Tok) {
   1240   // If we have a slash, look for an escaped newline.
   1241   if (Ptr[0] == '\\') {
   1242     ++Size;
   1243     ++Ptr;
   1244 Slash:
   1245     // Common case, backslash-char where the char is not whitespace.
   1246     if (!isWhitespace(Ptr[0])) return '\\';
   1247 
   1248     // See if we have optional whitespace characters between the slash and
   1249     // newline.
   1250     if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
   1251       // Remember that this token needs to be cleaned.
   1252       if (Tok) Tok->setFlag(Token::NeedsCleaning);
   1253 
   1254       // Warn if there was whitespace between the backslash and newline.
   1255       if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode())
   1256         Diag(Ptr, diag::backslash_newline_space);
   1257 
   1258       // Found backslash<whitespace><newline>.  Parse the char after it.
   1259       Size += EscapedNewLineSize;
   1260       Ptr  += EscapedNewLineSize;
   1261 
   1262       // If the char that we finally got was a \n, then we must have had
   1263       // something like \<newline><newline>.  We don't want to consume the
   1264       // second newline.
   1265       if (*Ptr == '\n' || *Ptr == '\r' || *Ptr == '\0')
   1266         return ' ';
   1267 
   1268       // Use slow version to accumulate a correct size field.
   1269       return getCharAndSizeSlow(Ptr, Size, Tok);
   1270     }
   1271 
   1272     // Otherwise, this is not an escaped newline, just return the slash.
   1273     return '\\';
   1274   }
   1275 
   1276   // If this is a trigraph, process it.
   1277   if (Ptr[0] == '?' && Ptr[1] == '?') {
   1278     // If this is actually a legal trigraph (not something like "??x"), emit
   1279     // a trigraph warning.  If so, and if trigraphs are enabled, return it.
   1280     if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : 0)) {
   1281       // Remember that this token needs to be cleaned.
   1282       if (Tok) Tok->setFlag(Token::NeedsCleaning);
   1283 
   1284       Ptr += 3;
   1285       Size += 3;
   1286       if (C == '\\') goto Slash;
   1287       return C;
   1288     }
   1289   }
   1290 
   1291   // If this is neither, return a single character.
   1292   ++Size;
   1293   return *Ptr;
   1294 }
   1295 
   1296 
   1297 /// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
   1298 /// getCharAndSizeNoWarn method.  Here we know that we can accumulate into Size,
   1299 /// and that we have already incremented Ptr by Size bytes.
   1300 ///
   1301 /// NOTE: When this method is updated, getCharAndSizeSlow (above) should
   1302 /// be updated to match.
   1303 char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
   1304                                      const LangOptions &LangOpts) {
   1305   // If we have a slash, look for an escaped newline.
   1306   if (Ptr[0] == '\\') {
   1307     ++Size;
   1308     ++Ptr;
   1309 Slash:
   1310     // Common case, backslash-char where the char is not whitespace.
   1311     if (!isWhitespace(Ptr[0])) return '\\';
   1312 
   1313     // See if we have optional whitespace characters followed by a newline.
   1314     if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
   1315       // Found backslash<whitespace><newline>.  Parse the char after it.
   1316       Size += EscapedNewLineSize;
   1317       Ptr  += EscapedNewLineSize;
   1318 
   1319       // If the char that we finally got was a \n, then we must have had
   1320       // something like \<newline><newline>.  We don't want to consume the
   1321       // second newline.
   1322       if (*Ptr == '\n' || *Ptr == '\r' || *Ptr == '\0')
   1323         return ' ';
   1324 
   1325       // Use slow version to accumulate a correct size field.
   1326       return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
   1327     }
   1328 
   1329     // Otherwise, this is not an escaped newline, just return the slash.
   1330     return '\\';
   1331   }
   1332 
   1333   // If this is a trigraph, process it.
   1334   if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
   1335     // If this is actually a legal trigraph (not something like "??x"), return
   1336     // it.
   1337     if (char C = GetTrigraphCharForLetter(Ptr[2])) {
   1338       Ptr += 3;
   1339       Size += 3;
   1340       if (C == '\\') goto Slash;
   1341       return C;
   1342     }
   1343   }
   1344 
   1345   // If this is neither, return a single character.
   1346   ++Size;
   1347   return *Ptr;
   1348 }
   1349 
   1350 //===----------------------------------------------------------------------===//
   1351 // Helper methods for lexing.
   1352 //===----------------------------------------------------------------------===//
   1353 
   1354 /// \brief Routine that indiscriminately skips bytes in the source file.
   1355 void Lexer::SkipBytes(unsigned Bytes, bool StartOfLine) {
   1356   BufferPtr += Bytes;
   1357   if (BufferPtr > BufferEnd)
   1358     BufferPtr = BufferEnd;
   1359   IsAtStartOfLine = StartOfLine;
   1360 }
   1361 
   1362 static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) {
   1363   if (LangOpts.CPlusPlus11 || LangOpts.C11)
   1364     return isCharInSet(C, C11AllowedIDChars);
   1365   else if (LangOpts.CPlusPlus)
   1366     return isCharInSet(C, CXX03AllowedIDChars);
   1367   else
   1368     return isCharInSet(C, C99AllowedIDChars);
   1369 }
   1370 
   1371 static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts) {
   1372   assert(isAllowedIDChar(C, LangOpts));
   1373   if (LangOpts.CPlusPlus11 || LangOpts.C11)
   1374     return !isCharInSet(C, C11DisallowedInitialIDChars);
   1375   else if (LangOpts.CPlusPlus)
   1376     return true;
   1377   else
   1378     return !isCharInSet(C, C99DisallowedInitialIDChars);
   1379 }
   1380 
   1381 static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin,
   1382                                             const char *End) {
   1383   return CharSourceRange::getCharRange(L.getSourceLocation(Begin),
   1384                                        L.getSourceLocation(End));
   1385 }
   1386 
   1387 static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
   1388                                       CharSourceRange Range, bool IsFirst) {
   1389   // Check C99 compatibility.
   1390   if (Diags.getDiagnosticLevel(diag::warn_c99_compat_unicode_id,
   1391                                Range.getBegin()) > DiagnosticsEngine::Ignored) {
   1392     enum {
   1393       CannotAppearInIdentifier = 0,
   1394       CannotStartIdentifier
   1395     };
   1396 
   1397     if (!isCharInSet(C, C99AllowedIDChars)) {
   1398       Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
   1399         << Range
   1400         << CannotAppearInIdentifier;
   1401     } else if (IsFirst && isCharInSet(C, C99DisallowedInitialIDChars)) {
   1402       Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
   1403         << Range
   1404         << CannotStartIdentifier;
   1405     }
   1406   }
   1407 
   1408   // Check C++98 compatibility.
   1409   if (Diags.getDiagnosticLevel(diag::warn_cxx98_compat_unicode_id,
   1410                                Range.getBegin()) > DiagnosticsEngine::Ignored) {
   1411     if (!isCharInSet(C, CXX03AllowedIDChars)) {
   1412       Diags.Report(Range.getBegin(), diag::warn_cxx98_compat_unicode_id)
   1413         << Range;
   1414     }
   1415   }
   1416  }
   1417 
   1418 void Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
          // Lex the remainder of an identifier and form a tok::raw_identifier in
          // Result.  Outside raw mode the token is then looked up in the
          // preprocessor's identifier table and, when the identifier is flagged
          // for it, handed to PP->HandleIdentifier.
          //
          // Result - token being formed.
          // CurPtr - position just past the identifier characters matched so far.
          //
   1419   // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
   1420   unsigned Size;
   1421   unsigned char C = *CurPtr++;
   1422   while (isIdentifierBody(C))
   1423     C = *CurPtr++;
   1424 
   1425   --CurPtr;   // Back up over the skipped character.
   1426 
   1427   // Fast path, no $,\,? in identifier found.  '\' might be an escaped newline
   1428   // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
   1429   // A non-ASCII byte also takes the slow path (it may start a UTF-8 sequence).
   1430   // TODO: Could merge these checks into an InfoTable flag to make the
   1431   // comparison cheaper
   1432   if (isASCII(C) && C != '\\' && C != '?' &&
   1433       (C != '$' || !LangOpts.DollarIdents)) {
          // This label is also the exit of the slow path below, which jumps here
          // with CurPtr pointing just past the last identifier character.
   1434 FinishIdentifier:
          // Remember the token start first; NOTE(review): FormTokenWithChars
          // presumably advances BufferPtr - confirm.
   1435     const char *IdStart = BufferPtr;
   1436     FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
   1437     Result.setRawIdentifierData(IdStart);
   1438 
   1439     // If we are in raw mode, return this identifier raw.  There is no need to
   1440     // look up identifier information or attempt to macro expand it.
   1441     if (LexingRawMode)
   1442       return;
   1443 
   1444     // Fill in Result.IdentifierInfo and update the token kind,
   1445     // looking up the identifier in the identifier table.
   1446     IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);
   1447 
   1448     // Finally, now that we know we have an identifier, pass this off to the
   1449     // preprocessor, which may macro expand it or something.
   1450     if (II->isHandleIdentifierCase())
   1451       PP->HandleIdentifier(Result);
   1452 
   1453     return;
   1454   }
   1455 
   1456   // Otherwise, $,\,? or a non-ASCII byte was found.  Enter slower path.
   1457 
   1458   C = getCharAndSize(CurPtr, Size);
   1459   while (1) {
   1460     if (C == '$') {
   1461       // If we hit a $ and they are not supported in identifiers, we are done.
   1462       if (!LangOpts.DollarIdents) goto FinishIdentifier;
   1463 
   1464       // Otherwise, emit a diagnostic and continue.
   1465       if (!isLexingRawMode())
   1466         Diag(CurPtr, diag::ext_dollar_in_identifier);
   1467       CurPtr = ConsumeChar(CurPtr, Size, Result);
   1468       C = getCharAndSize(CurPtr, Size);
   1469       continue;
   1470 
   1471     } else if (C == '\\') {
              // Possible universal character name.  A zero code point, or one
              // that isAllowedIDChar rejects, ends the identifier before the '\'.
   1472       const char *UCNPtr = CurPtr + Size;
   1473       uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/0);
   1474       if (CodePoint == 0 || !isAllowedIDChar(CodePoint, LangOpts))
   1475         goto FinishIdentifier;
   1476 
   1477       if (!isLexingRawMode()) {
   1478         maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
   1479                                   makeCharRange(*this, CurPtr, UCNPtr),
   1480                                   /*IsFirst=*/false);
   1481       }
   1482 
   1483       Result.setFlag(Token::HasUCN);
              // A UCN spelled in exactly 6 (\uXXXX) or 10 (\UXXXXXXXX) characters
              // is skipped in one step.  Any other length is walked with
              // getAndAdvanceChar, which also receives Result - presumably because
              // the spelling contains trigraphs or escaped newlines (TODO confirm).
   1484       if ((UCNPtr - CurPtr ==  6 && CurPtr[1] == 'u') ||
   1485           (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
   1486         CurPtr = UCNPtr;
   1487       else
   1488         while (CurPtr != UCNPtr)
   1489           (void)getAndAdvanceChar(CurPtr, Result);
   1490 
   1491       C = getCharAndSize(CurPtr, Size);
   1492       continue;
   1493     } else if (!isASCII(C)) {
              // Non-ASCII byte: decode one UTF-8 sequence starting at CurPtr.
              // NOTE: the local 'Result' declared below shadows the Token
              // parameter for the rest of this branch.
   1494       const char *UnicodePtr = CurPtr;
   1495       UTF32 CodePoint;
   1496       ConversionResult Result =
   1497           llvm::convertUTF8Sequence((const UTF8 **)&UnicodePtr,
   1498                                     (const UTF8 *)BufferEnd,
   1499                                     &CodePoint,
   1500                                     strictConversion);
              // A failed conversion or a disallowed code point ends the identifier
              // without consuming the byte sequence.
   1501       if (Result != conversionOK ||
   1502           !isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts))
   1503         goto FinishIdentifier;
   1504 
   1505       if (!isLexingRawMode()) {
   1506         maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
   1507                                   makeCharRange(*this, CurPtr, UnicodePtr),
   1508                                   /*IsFirst=*/false);
   1509       }
   1510 
   1511       CurPtr = UnicodePtr;
   1512       C = getCharAndSize(CurPtr, Size);
   1513       continue;
   1514     } else if (!isIdentifierBody(C)) {
   1515       goto FinishIdentifier;
   1516     }
   1517 
   1518     // Otherwise, this character is good, consume it.
   1519     CurPtr = ConsumeChar(CurPtr, Size, Result);
   1520 
            // Consume the following run of ordinary identifier characters before
            // going back to the special-case checks at the top of the loop.
   1521     C = getCharAndSize(CurPtr, Size);
   1522     while (isIdentifierBody(C)) {
   1523       CurPtr = ConsumeChar(CurPtr, Size, Result);
   1524       C = getCharAndSize(CurPtr, Size);
   1525     }
   1526   }
   1527 }
   1528 
   1529 /// isHexaLiteral - Return true if Start points to a hex constant.
   1530 /// in microsoft mode (where this is supposed to be several different tokens).
   1531 bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
   1532   unsigned Size;
   1533   char C1 = Lexer::getCharAndSizeNoWarn(Start, Size, LangOpts);
   1534   if (C1 != '0')
   1535     return false;
   1536   char C2 = Lexer::getCharAndSizeNoWarn(Start + Size, Size, LangOpts);
   1537   return (C2 == 'x' || C2 == 'X');
   1538 }
   1539 
   1540 /// LexNumericConstant - Lex the remainder of a integer or floating point
   1541 /// constant. From[-1] is the first character lexed.  Return the end of the
   1542 /// constant.
   1543 void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
   1544   unsigned Size;
   1545   char C = getCharAndSize(CurPtr, Size);
   1546   char PrevCh = 0;
   1547   while (isPreprocessingNumberBody(C)) { // FIXME: UCNs in ud-suffix.
   1548     CurPtr = ConsumeChar(CurPtr, Size, Result);
   1549     PrevCh = C;
   1550     C = getCharAndSize(CurPtr, Size);
   1551   }
   1552 
   1553   // If we fell out, check for a sign, due to 1e+12.  If we have one, continue.
   1554   if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) {
   1555     // If we are in Microsoft mode, don't continue if the constant is hex.
   1556     // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
   1557     if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts))
   1558       return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
   1559   }
   1560 
   1561   // If we have a hex FP constant, continue.
   1562   if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) {
   1563     // Outside C99, we accept hexadecimal floating point numbers as a
   1564     // not-quite-conforming extension. Only do so if this looks like it's
   1565     // actually meant to be a hexfloat, and not if it has a ud-suffix.
   1566     bool IsHexFloat = true;
   1567     if (!LangOpts.C99) {
   1568       if (!isHexaLiteral(BufferPtr, LangOpts))
   1569         IsHexFloat = false;
   1570       else if (std::find(BufferPtr, CurPtr, '_') != CurPtr)
   1571         IsHexFloat = false;
   1572     }
   1573     if (IsHexFloat)
   1574       return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
   1575   }
   1576 
   1577   // Update the location of token as well as BufferPtr.
   1578   const char *TokStart = BufferPtr;
   1579   FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
   1580   Result.setLiteralData(TokStart);
   1581 }
   1582 
   1583 /// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes
   1584 /// in C++11, or warn on a ud-suffix in C++98.
   1585 const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr) {
   1586   assert(getLangOpts().CPlusPlus);
   1587 
   1588   // Maximally munch an identifier. FIXME: UCNs.
   1589   unsigned Size;
   1590   char C = getCharAndSize(CurPtr, Size);
   1591   if (isIdentifierHead(C)) {
   1592     if (!getLangOpts().CPlusPlus11) {
   1593       if (!isLexingRawMode())
   1594         Diag(CurPtr,
   1595              C == '_' ? diag::warn_cxx11_compat_user_defined_literal
   1596                       : diag::warn_cxx11_compat_reserved_user_defined_literal)
   1597           << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
   1598       return CurPtr;
   1599     }
   1600 
   1601     // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
   1602     // that does not start with an underscore is ill-formed. As a conforming
   1603     // extension, we treat all such suffixes as if they had whitespace before
   1604     // them.
   1605     if (C != '_') {
   1606       if (!isLexingRawMode())
   1607         Diag(CurPtr, getLangOpts().MicrosoftMode ?
   1608             diag::ext_ms_reserved_user_defined_literal :
   1609             diag::ext_reserved_user_defined_literal)
   1610           << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
   1611       return CurPtr;
   1612     }
   1613 
   1614     Result.setFlag(Token::HasUDSuffix);
   1615     do {
   1616       CurPtr = ConsumeChar(CurPtr, Size, Result);
   1617       C = getCharAndSize(CurPtr, Size);
   1618     } while (isIdentifierBody(C));
   1619   }
   1620   return CurPtr;
   1621 }
   1622 
   1623 /// LexStringLiteral - Lex the remainder of a string literal, after having lexed
   1624 /// either " or L" or u8" or u" or U".
   1625 void Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
   1626                              tok::TokenKind Kind) {
   1627   const char *NulCharacter = 0; // Does this string contain the \0 character?
   1628 
   1629   if (!isLexingRawMode() &&
   1630       (Kind == tok::utf8_string_literal ||
   1631        Kind == tok::utf16_string_literal ||
   1632        Kind == tok::utf32_string_literal))
   1633     Diag(BufferPtr, getLangOpts().CPlusPlus
   1634            ? diag::warn_cxx98_compat_unicode_literal
   1635            : diag::warn_c99_compat_unicode_literal);
   1636 
   1637   char C = getAndAdvanceChar(CurPtr, Result);
   1638   while (C != '"') {
   1639     // Skip escaped characters.  Escaped newlines will already be processed by
   1640     // getAndAdvanceChar.
   1641     if (C == '\\')
   1642       C = getAndAdvanceChar(CurPtr, Result);
   1643 
   1644     if (C == '\n' || C == '\r' ||             // Newline.
   1645         (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
   1646       if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
   1647         Diag(BufferPtr, diag::ext_unterminated_string);
   1648       FormTokenWithChars(Result, CurPtr-1, tok::unknown);
   1649       return;
   1650     }
   1651 
   1652     if (C == 0) {
   1653       if (isCodeCompletionPoint(CurPtr-1)) {
   1654         PP->CodeCompleteNaturalLanguage();
   1655         FormTokenWithChars(Result, CurPtr-1, tok::unknown);
   1656         return cutOffLexing();
   1657       }
   1658 
   1659       NulCharacter = CurPtr-1;
   1660     }
   1661     C = getAndAdvanceChar(CurPtr, Result);
   1662   }
   1663 
   1664   // If we are in C++11, lex the optional ud-suffix.
   1665   if (getLangOpts().CPlusPlus)
   1666     CurPtr = LexUDSuffix(Result, CurPtr);
   1667 
   1668   // If a nul character existed in the string, warn about it.
   1669   if (NulCharacter && !isLexingRawMode())
   1670     Diag(NulCharacter, diag::null_in_string);
   1671 
   1672   // Update the location of the token as well as the BufferPtr instance var.
   1673   const char *TokStart = BufferPtr;
   1674   FormTokenWithChars(Result, CurPtr, Kind);
   1675   Result.setLiteralData(TokStart);
   1676 }
   1677 
   1678 /// LexRawStringLiteral - Lex the remainder of a raw string literal, after
   1679 /// having lexed R", LR", u8R", uR", or UR".
   1680 void Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
   1681                                 tok::TokenKind Kind) {
   1682   // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
   1683   //  Between the initial and final double quote characters of the raw string,
   1684   //  any transformations performed in phases 1 and 2 (trigraphs,
   1685   //  universal-character-names, and line splicing) are reverted.
   1686 
   1687   if (!isLexingRawMode())
   1688     Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);
   1689 
   1690   unsigned PrefixLen = 0;
   1691 
   1692   while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen]))
   1693     ++PrefixLen;
   1694 
   1695   // If the last character was not a '(', then we didn't lex a valid delimiter.
   1696   if (CurPtr[PrefixLen] != '(') {
   1697     if (!isLexingRawMode()) {
   1698       const char *PrefixEnd = &CurPtr[PrefixLen];
   1699       if (PrefixLen == 16) {
   1700         Diag(PrefixEnd, diag::err_raw_delim_too_long);
   1701       } else {
   1702         Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
   1703           << StringRef(PrefixEnd, 1);
   1704       }
   1705     }
   1706 
   1707     // Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
   1708     // it's possible the '"' was intended to be part of the raw string, but
   1709     // there's not much we can do about that.
   1710     while (1) {
   1711       char C = *CurPtr++;
   1712 
   1713       if (C == '"')
   1714         break;
   1715       if (C == 0 && CurPtr-1 == BufferEnd) {
   1716         --CurPtr;
   1717         break;
   1718       }
   1719     }
   1720 
   1721     FormTokenWithChars(Result, CurPtr, tok::unknown);
   1722     return;
   1723   }
   1724 
   1725   // Save prefix and move CurPtr past it
   1726   const char *Prefix = CurPtr;
   1727   CurPtr += PrefixLen + 1; // skip over prefix and '('
   1728 
   1729   while (1) {
   1730     char C = *CurPtr++;
   1731 
   1732     if (C == ')') {
   1733       // Check for prefix match and closing quote.
   1734       if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') {
   1735         CurPtr += PrefixLen + 1; // skip over prefix and '"'
   1736         break;
   1737       }
   1738     } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file.
   1739       if (!isLexingRawMode())
   1740         Diag(BufferPtr, diag::err_unterminated_raw_string)
   1741           << StringRef(Prefix, PrefixLen);
   1742       FormTokenWithChars(Result, CurPtr-1, tok::unknown);
   1743       return;
   1744     }
   1745   }
   1746 
   1747   // If we are in C++11, lex the optional ud-suffix.
   1748   if (getLangOpts().CPlusPlus)
   1749     CurPtr = LexUDSuffix(Result, CurPtr);
   1750 
   1751   // Update the location of token as well as BufferPtr.
   1752   const char *TokStart = BufferPtr;
   1753   FormTokenWithChars(Result, CurPtr, Kind);
   1754   Result.setLiteralData(TokStart);
   1755 }
   1756 
   1757 /// LexAngledStringLiteral - Lex the remainder of an angled string literal,
   1758 /// after having lexed the '<' character.  This is used for #include filenames.
   1759 void Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
   1760   const char *NulCharacter = 0; // Does this string contain the \0 character?
   1761   const char *AfterLessPos = CurPtr;
   1762   char C = getAndAdvanceChar(CurPtr, Result);
   1763   while (C != '>') {
   1764     // Skip escaped characters.
   1765     if (C == '\\') {
   1766       // Skip the escaped character.
   1767       getAndAdvanceChar(CurPtr, Result);
   1768     } else if (C == '\n' || C == '\r' ||             // Newline.
   1769                (C == 0 && (CurPtr-1 == BufferEnd ||  // End of file.
   1770                            isCodeCompletionPoint(CurPtr-1)))) {
   1771       // If the filename is unterminated, then it must just be a lone <
   1772       // character.  Return this as such.
   1773       FormTokenWithChars(Result, AfterLessPos, tok::less);
   1774       return;
   1775     } else if (C == 0) {
   1776       NulCharacter = CurPtr-1;
   1777     }
   1778     C = getAndAdvanceChar(CurPtr, Result);
   1779   }
   1780 
   1781   // If a nul character existed in the string, warn about it.
   1782   if (NulCharacter && !isLexingRawMode())
   1783     Diag(NulCharacter, diag::null_in_string);
   1784 
   1785   // Update the location of token as well as BufferPtr.
   1786   const char *TokStart = BufferPtr;
   1787   FormTokenWithChars(Result, CurPtr, tok::angle_string_literal);
   1788   Result.setLiteralData(TokStart);
   1789 }
   1790 
   1791 
   1792 /// LexCharConstant - Lex the remainder of a character constant, after having
   1793 /// lexed either ' or L' or u' or U'.
   1794 void Lexer::LexCharConstant(Token &Result, const char *CurPtr,
   1795                             tok::TokenKind Kind) {
   1796   const char *NulCharacter = 0; // Does this character contain the \0 character?
   1797 
   1798   if (!isLexingRawMode() &&
   1799       (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant))
   1800     Diag(BufferPtr, getLangOpts().CPlusPlus
   1801            ? diag::warn_cxx98_compat_unicode_literal
   1802            : diag::warn_c99_compat_unicode_literal);
   1803 
   1804   char C = getAndAdvanceChar(CurPtr, Result);
   1805   if (C == '\'') {
   1806     if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
   1807       Diag(BufferPtr, diag::ext_empty_character);
   1808     FormTokenWithChars(Result, CurPtr, tok::unknown);
   1809     return;
   1810   }
   1811 
   1812   while (C != '\'') {
   1813     // Skip escaped characters.
   1814     if (C == '\\')
   1815       C = getAndAdvanceChar(CurPtr, Result);
   1816 
   1817     if (C == '\n' || C == '\r' ||             // Newline.
   1818         (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
   1819       if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
   1820         Diag(BufferPtr, diag::ext_unterminated_char);
   1821       FormTokenWithChars(Result, CurPtr-1, tok::unknown);
   1822       return;
   1823     }
   1824 
   1825     if (C == 0) {
   1826       if (isCodeCompletionPoint(CurPtr-1)) {
   1827         PP->CodeCompleteNaturalLanguage();
   1828         FormTokenWithChars(Result, CurPtr-1, tok::unknown);
   1829         return cutOffLexing();
   1830       }
   1831 
   1832       NulCharacter = CurPtr-1;
   1833     }
   1834     C = getAndAdvanceChar(CurPtr, Result);
   1835   }
   1836 
   1837   // If we are in C++11, lex the optional ud-suffix.
   1838   if (getLangOpts().CPlusPlus)
   1839     CurPtr = LexUDSuffix(Result, CurPtr);
   1840 
   1841   // If a nul character existed in the character, warn about it.
   1842   if (NulCharacter && !isLexingRawMode())
   1843     Diag(NulCharacter, diag::null_in_char);
   1844 
   1845   // Update the location of token as well as BufferPtr.
   1846   const char *TokStart = BufferPtr;
   1847   FormTokenWithChars(Result, CurPtr, Kind);
   1848   Result.setLiteralData(TokStart);
   1849 }
   1850 
   1851 /// SkipWhitespace - Efficiently skip over a series of whitespace characters.
   1852 /// Update BufferPtr to point to the next non-whitespace character and return.
   1853 ///
   1854 /// This method forms a token and returns true if KeepWhitespaceMode is enabled.
   1855 ///
   1856 bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr) {
   1857   // Whitespace - Skip it, then return the token after the whitespace.
   1858   bool SawNewline = isVerticalWhitespace(CurPtr[-1]);
   1859 
   1860   unsigned char Char = *CurPtr;  // Skip consequtive spaces efficiently.
   1861   while (1) {
   1862     // Skip horizontal whitespace very aggressively.
   1863     while (isHorizontalWhitespace(Char))
   1864       Char = *++CurPtr;
   1865 
   1866     // Otherwise if we have something other than whitespace, we're done.
   1867     if (!isVerticalWhitespace(Char))
   1868       break;
   1869 
   1870     if (ParsingPreprocessorDirective) {
   1871       // End of preprocessor directive line, let LexTokenInternal handle this.
   1872       BufferPtr = CurPtr;
   1873       return false;
   1874     }
   1875 
   1876     // ok, but handle newline.
   1877     SawNewline = true;
   1878     Char = *++CurPtr;
   1879   }
   1880 
   1881   // If the client wants us to return whitespace, return it now.
   1882   if (isKeepWhitespaceMode()) {
   1883     FormTokenWithChars(Result, CurPtr, tok::unknown);
   1884     if (SawNewline)
   1885       IsAtStartOfLine = true;
   1886     // FIXME: The next token will not have LeadingSpace set.
   1887     return true;
   1888   }
   1889 
   1890   // If this isn't immediately after a newline, there is leading space.
   1891   char PrevChar = CurPtr[-1];
   1892   bool HasLeadingSpace = !isVerticalWhitespace(PrevChar);
   1893 
   1894   Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
   1895   if (SawNewline)
   1896     Result.setFlag(Token::StartOfLine);
   1897 
   1898   BufferPtr = CurPtr;
   1899   return false;
   1900 }
   1901 
   1902 /// We have just read the // characters from input.  Skip until we find the
   1903 /// newline character that terminates the comment.  Then update BufferPtr and
   1904 /// return.
   1905 ///
   1906 /// If we're in KeepCommentMode or any CommentHandler has inserted
   1907 /// some tokens, this will store the first token and return true.
   1908 bool Lexer::SkipLineComment(Token &Result, const char *CurPtr) {
   1909   // If Line comments aren't explicitly enabled for this language, emit an
   1910   // extension warning.
   1911   if (!LangOpts.LineComment && !isLexingRawMode()) {
   1912     Diag(BufferPtr, diag::ext_line_comment);
   1913 
   1914     // Mark them enabled so we only emit one warning for this translation
   1915     // unit.
   1916     LangOpts.LineComment = true;
   1917   }
   1918 
   1919   // Scan over the body of the comment.  The common case, when scanning, is that
   1920   // the comment contains normal ascii characters with nothing interesting in
   1921   // them.  As such, optimize for this case with the inner loop.
   1922   char C;
   1923   do {
   1924     C = *CurPtr;
   1925     // Skip over characters in the fast loop.
   1926     while (C != 0 &&                // Potentially EOF.
   1927            C != '\n' && C != '\r')  // Newline or DOS-style newline.
   1928       C = *++CurPtr;
   1929 
            // Remember where the newline (or nul) is; CurPtr may be rewound below.
   1930     const char *NextLine = CurPtr;
   1931     if (C != 0) {
   1932       // We found a newline, see if it's escaped.
   1933       const char *EscapePtr = CurPtr-1;
   1934       while (isHorizontalWhitespace(*EscapePtr)) // Skip whitespace.
   1935         --EscapePtr;
   1936 
              // Rewind CurPtr to the start of the escape so that getAndAdvanceChar
              // below decodes the escaped newline as a unit.
   1937       if (*EscapePtr == '\\') // Escaped newline.
   1938         CurPtr = EscapePtr;
   1939       else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
   1940                EscapePtr[-2] == '?') // Trigraph-escaped newline.
   1941         CurPtr = EscapePtr-2;
   1942       else
   1943         break; // This is a newline, we're done.
   1944     }
   1945 
   1946     // Otherwise, this is a hard case.  Fall back on getAndAdvanceChar to
   1947     // properly decode the character.  Read it in raw mode to avoid emitting
   1948     // diagnostics about things like trigraphs.  If we see an escaped newline,
   1949     // we'll handle it below.
   1950     const char *OldPtr = CurPtr;
   1951     bool OldRawMode = isLexingRawMode();
   1952     LexingRawMode = true;
   1953     C = getAndAdvanceChar(CurPtr, Result);
   1954     LexingRawMode = OldRawMode;
   1955 
   1956     // If we read only one character, then no special handling is needed.
   1957     // We're done and can skip forward to the newline.
   1958     if (C != 0 && CurPtr == OldPtr+1) {
   1959       CurPtr = NextLine;
   1960       break;
   1961     }
   1962 
   1963     // If we read multiple characters, and one of those characters was a \r or
   1964     // \n, then we had an escaped newline within the comment.  Emit diagnostic
   1965     // unless the next line is also a // comment.
   1966     if (CurPtr != OldPtr+1 && C != '/' && CurPtr[0] != '/') {
              // Both 'break' statements in this loop leave only the for loop.
   1967       for (; OldPtr != CurPtr; ++OldPtr)
   1968         if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
   1969           // Okay, we found a // comment that ends in a newline, if the next
   1970           // line is also a // comment, but has spaces, don't emit a diagnostic.
   1971           if (isWhitespace(C)) {
   1972             const char *ForwardPtr = CurPtr;
   1973             while (isWhitespace(*ForwardPtr))  // Skip whitespace.
   1974               ++ForwardPtr;
   1975             if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
   1976               break;
   1977           }
   1978 
   1979           if (!isLexingRawMode())
   1980             Diag(OldPtr-1, diag::ext_multi_line_line_comment);
   1981           break;
   1982         }
   1983     }
   1984 
            // We consumed the nul at the end of the buffer: back up onto it and stop.
   1985     if (CurPtr == BufferEnd+1) {
   1986       --CurPtr;
   1987       break;
   1988     }
   1989 
            // A nul elsewhere may be the code-completion marker.
   1990     if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
   1991       PP->CodeCompleteNaturalLanguage();
   1992       cutOffLexing();
   1993       return false;
   1994     }
   1995 
   1996   } while (C != '\n' && C != '\r');
   1997 
   1998   // Found but did not consume the newline.  Notify comment handlers about the
   1999   // comment unless we're in a #if 0 block.
   2000   if (PP && !isLexingRawMode() &&
   2001       PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
   2002                                             getSourceLocation(CurPtr)))) {
   2003     BufferPtr = CurPtr;
   2004     return true; // A token has to be returned.
   2005   }
   2006 
   2007   // If we are returning comments as tokens, return this comment as a token.
   2008   if (inKeepCommentMode())
   2009     return SaveLineComment(Result, CurPtr);
   2010 
   2011   // If we are inside a preprocessor directive and we see the end of line,
   2012   // return immediately, so that the lexer can return this as an EOD token.
          // The same early return is taken when the comment ran to the end of the
          // buffer, so the increment below never steps past BufferEnd.
   2013   if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
   2014     BufferPtr = CurPtr;
   2015     return false;
   2016   }
   2017 
   2018   // Otherwise, eat the \n character.  We don't care if this is a \n\r or
   2019   // \r\n sequence.  This is an efficiency hack (because we know the \n can't
   2020   // contribute to another token), it isn't needed for correctness.  Note that
   2021   // this is ok even in KeepWhitespaceMode, because we would have returned the
   2022   // comment above in that mode.
   2023   ++CurPtr;
   2024 
   2025   // The next returned token is at the start of the line.
   2026   Result.setFlag(Token::StartOfLine);
   2027   // No leading whitespace seen so far.
   2028   Result.clearFlag(Token::LeadingSpace);
   2029   BufferPtr = CurPtr;
   2030   return false;
   2031 }
   2032 
   2033 /// If in save-comment mode, package up this Line comment in an appropriate
   2034 /// way and return it.
   2035 bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
   2036   // If we're not in a preprocessor directive, just return the // comment
   2037   // directly.
   2038   FormTokenWithChars(Result, CurPtr, tok::comment);
   2039 
   2040   if (!ParsingPreprocessorDirective || LexingRawMode)
   2041     return true;
   2042 
   2043   // If this Line-style comment is in a macro definition, transmogrify it into
   2044   // a C-style block comment.
   2045   bool Invalid = false;
   2046   std::string Spelling = PP->getSpelling(Result, &Invalid);
   2047   if (Invalid)
   2048     return true;
   2049 
   2050   assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?");
   2051   Spelling[1] = '*';   // Change prefix to "/*".
   2052   Spelling += "*/";    // add suffix.
   2053 
   2054   Result.setKind(tok::comment);
   2055   PP->CreateString(Spelling, Result,
   2056                    Result.getLocation(), Result.getLocation());
   2057   return true;
   2058 }
   2059 
   2060 /// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline
   2061 /// character (either \\n or \\r) is part of an escaped newline sequence.  Issue
   2062 /// a diagnostic if so.  We know that the newline is inside of a block comment.
   2063 static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr,
   2064                                                   Lexer *L) {
   2065   assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');
   2066 
   2067   // Back up off the newline.
   2068   --CurPtr;
   2069 
   2070   // If this is a two-character newline sequence, skip the other character.
   2071   if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
   2072     // \n\n or \r\r -> not escaped newline.
   2073     if (CurPtr[0] == CurPtr[1])
   2074       return false;
   2075     // \n\r or \r\n -> skip the newline.
   2076     --CurPtr;
   2077   }
   2078 
   2079   // If we have horizontal whitespace, skip over it.  We allow whitespace
   2080   // between the slash and newline.
   2081   bool HasSpace = false;
   2082   while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
   2083     --CurPtr;
   2084     HasSpace = true;
   2085   }
   2086 
   2087   // If we have a slash, we know this is an escaped newline.
   2088   if (*CurPtr == '\\') {
   2089     if (CurPtr[-1] != '*') return false;
   2090   } else {
   2091     // It isn't a slash, is it the ?? / trigraph?
   2092     if (CurPtr[0] != '/' || CurPtr[-1] != '?' || CurPtr[-2] != '?' ||
   2093         CurPtr[-3] != '*')
   2094       return false;
   2095 
   2096     // This is the trigraph ending the comment.  Emit a stern warning!
   2097     CurPtr -= 2;
   2098 
   2099     // If no trigraphs are enabled, warn that we ignored this trigraph and
   2100     // ignore this * character.
   2101     if (!L->getLangOpts().Trigraphs) {
   2102       if (!L->isLexingRawMode())
   2103         L->Diag(CurPtr, diag::trigraph_ignored_block_comment);
   2104       return false;
   2105     }
   2106     if (!L->isLexingRawMode())
   2107       L->Diag(CurPtr, diag::trigraph_ends_block_comment);
   2108   }
   2109 
   2110   // Warn about having an escaped newline between the */ characters.
   2111   if (!L->isLexingRawMode())
   2112     L->Diag(CurPtr, diag::escaped_newline_block_comment_end);
   2113 
   2114   // If there was space between the backslash and newline, warn about it.
   2115   if (HasSpace && !L->isLexingRawMode())
   2116     L->Diag(CurPtr, diag::backslash_newline_space);
   2117 
   2118   return true;
   2119 }
   2120 
   2121 #ifdef __SSE2__
   2122 #include <emmintrin.h>
   2123 #elif __ALTIVEC__
   2124 #include <altivec.h>
   2125 #undef bool
   2126 #endif
   2127 
   2128 /// We have just read from input the / and * characters that started a comment.
   2129 /// Read until we find the * and / characters that terminate the comment.
   2130 /// Note that we don't bother decoding trigraphs or escaped newlines in block
   2131 /// comments, because they cannot cause the comment to end.  The only thing
   2132 /// that can happen is the comment could end with an escaped newline between
   2133 /// the terminating * and /.
   2134 ///
   2135 /// If we're in KeepCommentMode or any CommentHandler has inserted
   2136 /// some tokens, this will store the first token and return true.
   2137 bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) {
   2138   // Scan one character past where we should, looking for a '/' character.  Once
   2139   // we find it, check to see if it was preceded by a *.  This common
   2140   // optimization helps people who like to put a lot of * characters in their
   2141   // comments.
   2142 
   2143   // The first character we get with newlines and trigraphs skipped to handle
   2144   // the degenerate /*/ case below correctly if the * has an escaped newline
   2145   // after it.
   2146   unsigned CharSize;
   2147   unsigned char C = getCharAndSize(CurPtr, CharSize);
   2148   CurPtr += CharSize;
   2149   if (C == 0 && CurPtr == BufferEnd+1) {
   2150     if (!isLexingRawMode())
   2151       Diag(BufferPtr, diag::err_unterminated_block_comment);
   2152     --CurPtr;
   2153 
   2154     // KeepWhitespaceMode should return this broken comment as a token.  Since
   2155     // it isn't a well formed comment, just return it as an 'unknown' token.
   2156     if (isKeepWhitespaceMode()) {
   2157       FormTokenWithChars(Result, CurPtr, tok::unknown);
   2158       return true;
   2159     }
   2160 
   2161     BufferPtr = CurPtr;
   2162     return false;
   2163   }
   2164 
   2165   // Check to see if the first character after the '/*' is another /.  If so,
   2166   // then this slash does not end the block comment, it is part of it.
   2167   if (C == '/')
   2168     C = *CurPtr++;
   2169 
   2170   while (1) {
   2171     // Skip over all non-interesting characters until we find end of buffer or a
   2172     // (probably ending) '/' character.
   2173     if (CurPtr + 24 < BufferEnd &&
   2174         // If there is a code-completion point avoid the fast scan because it
   2175         // doesn't check for '\0'.
   2176         !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
   2177       // While not aligned to a 16-byte boundary.
   2178       while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0)
   2179         C = *CurPtr++;
   2180 
   2181       if (C == '/') goto FoundSlash;
   2182 
   2183 #ifdef __SSE2__
   2184       __m128i Slashes = _mm_set1_epi8('/');
   2185       while (CurPtr+16 <= BufferEnd) {
   2186         int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
   2187                                     Slashes));
   2188         if (cmp != 0) {
   2189           // Adjust the pointer to point directly after the first slash. It's
   2190           // not necessary to set C here, it will be overwritten at the end of
   2191           // the outer loop.
   2192           CurPtr += llvm::CountTrailingZeros_32(cmp) + 1;
   2193           goto FoundSlash;
   2194         }
   2195         CurPtr += 16;
   2196       }
   2197 #elif __ALTIVEC__
   2198       __vector unsigned char Slashes = {
   2199         '/', '/', '/', '/',  '/', '/', '/', '/',
   2200         '/', '/', '/', '/',  '/', '/', '/', '/'
   2201       };
   2202       while (CurPtr+16 <= BufferEnd &&
   2203              !vec_any_eq(*(vector unsigned char*)CurPtr, Slashes))
   2204         CurPtr += 16;
   2205 #else
   2206       // Scan for '/' quickly.  Many block comments are very large.
   2207       while (CurPtr[0] != '/' &&
   2208              CurPtr[1] != '/' &&
   2209              CurPtr[2] != '/' &&
   2210              CurPtr[3] != '/' &&
   2211              CurPtr+4 < BufferEnd) {
   2212         CurPtr += 4;
   2213       }
   2214 #endif
   2215 
   2216       // It has to be one of the bytes scanned, increment to it and read one.
   2217       C = *CurPtr++;
   2218     }
   2219 
   2220     // Loop to scan the remainder.
   2221     while (C != '/' && C != '\0')
   2222       C = *CurPtr++;
   2223 
   2224     if (C == '/') {
   2225   FoundSlash:
   2226       if (CurPtr[-2] == '*')  // We found the final */.  We're done!
   2227         break;
   2228 
   2229       if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
   2230         if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) {
   2231           // We found the final */, though it had an escaped newline between the
   2232           // * and /.  We're done!
   2233           break;
   2234         }
   2235       }
   2236       if (CurPtr[0] == '*' && CurPtr[1] != '/') {
   2237         // If this is a /* inside of the comment, emit a warning.  Don't do this
   2238         // if this is a /*/, which will end the comment.  This misses cases with
   2239         // embedded escaped newlines, but oh well.
   2240         if (!isLexingRawMode())
   2241           Diag(CurPtr-1, diag::warn_nested_block_comment);
   2242       }
   2243     } else if (C == 0 && CurPtr == BufferEnd+1) {
   2244       if (!isLexingRawMode())
   2245         Diag(BufferPtr, diag::err_unterminated_block_comment);
   2246       // Note: the user probably forgot a */.  We could continue immediately
   2247       // after the /*, but this would involve lexing a lot of what really is the
   2248       // comment, which surely would confuse the parser.
   2249       --CurPtr;
   2250 
   2251       // KeepWhitespaceMode should return this broken comment as a token.  Since
   2252       // it isn't a well formed comment, just return it as an 'unknown' token.
   2253       if (isKeepWhitespaceMode()) {
   2254         FormTokenWithChars(Result, CurPtr, tok::unknown);
   2255         return true;
   2256       }
   2257 
   2258       BufferPtr = CurPtr;
   2259       return false;
   2260     } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
   2261       PP->CodeCompleteNaturalLanguage();
   2262       cutOffLexing();
   2263       return false;
   2264     }
   2265 
   2266     C = *CurPtr++;
   2267   }
   2268 
   2269   // Notify comment handlers about the comment unless we're in a #if 0 block.
   2270   if (PP && !isLexingRawMode() &&
   2271       PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
   2272                                             getSourceLocation(CurPtr)))) {
   2273     BufferPtr = CurPtr;
   2274     return true; // A token has to be returned.
   2275   }
   2276 
   2277   // If we are returning comments as tokens, return this comment as a token.
   2278   if (inKeepCommentMode()) {
   2279     FormTokenWithChars(Result, CurPtr, tok::comment);
   2280     return true;
   2281   }
   2282 
   2283   // It is common for the tokens immediately after a /**/ comment to be
   2284   // whitespace.  Instead of going through the big switch, handle it
   2285   // efficiently now.  This is safe even in KeepWhitespaceMode because we would
   2286   // have already returned above with the comment as a token.
   2287   if (isHorizontalWhitespace(*CurPtr)) {
   2288     SkipWhitespace(Result, CurPtr+1);
   2289     return false;
   2290   }
   2291 
   2292   // Otherwise, just return so that the next character will be lexed as a token.
   2293   BufferPtr = CurPtr;
   2294   Result.setFlag(Token::LeadingSpace);
   2295   return false;
   2296 }
   2297 
   2298 //===----------------------------------------------------------------------===//
   2299 // Primary Lexing Entry Points
   2300 //===----------------------------------------------------------------------===//
   2301 
   2302 /// ReadToEndOfLine - Read the rest of the current preprocessor line as an
   2303 /// uninterpreted string.  This switches the lexer out of directive mode.
   2304 void Lexer::ReadToEndOfLine(SmallVectorImpl<char> *Result) {
   2305   assert(ParsingPreprocessorDirective && ParsingFilename == false &&
   2306          "Must be in a preprocessing directive!");
   2307   Token Tmp;
   2308 
   2309   // CurPtr - Cache BufferPtr in an automatic variable.
   2310   const char *CurPtr = BufferPtr;
   2311   while (1) {
   2312     char Char = getAndAdvanceChar(CurPtr, Tmp);
   2313     switch (Char) {
   2314     default:
   2315       if (Result)
   2316         Result->push_back(Char);
   2317       break;
   2318     case 0:  // Null.
   2319       // Found end of file?
   2320       if (CurPtr-1 != BufferEnd) {
   2321         if (isCodeCompletionPoint(CurPtr-1)) {
   2322           PP->CodeCompleteNaturalLanguage();
   2323           cutOffLexing();
   2324           return;
   2325         }
   2326 
   2327         // Nope, normal character, continue.
   2328         if (Result)
   2329           Result->push_back(Char);
   2330         break;
   2331       }
   2332       // FALL THROUGH.
   2333     case '\r':
   2334     case '\n':
   2335       // Okay, we found the end of the line. First, back up past the \0, \r, \n.
   2336       assert(CurPtr[-1] == Char && "Trigraphs for newline?");
   2337       BufferPtr = CurPtr-1;
   2338 
   2339       // Next, lex the character, which should handle the EOD transition.
   2340       Lex(Tmp);
   2341       if (Tmp.is(tok::code_completion)) {
   2342         if (PP)
   2343           PP->CodeCompleteNaturalLanguage();
   2344         Lex(Tmp);
   2345       }
   2346       assert(Tmp.is(tok::eod) && "Unexpected token!");
   2347 
   2348       // Finally, we're done;
   2349       return;
   2350     }
   2351   }
   2352 }
   2353 
   2354 /// LexEndOfFile - CurPtr points to the end of this file.  Handle this
   2355 /// condition, reporting diagnostics and handling other edge cases as required.
   2356 /// This returns true if Result contains a token, false if PP.Lex should be
   2357 /// called again.
   2358 bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
   2359   // If we hit the end of the file while parsing a preprocessor directive,
   2360   // end the preprocessor directive first.  The next token returned will
   2361   // then be the end of file.
   2362   if (ParsingPreprocessorDirective) {
   2363     // Done parsing the "line".
   2364     ParsingPreprocessorDirective = false;
   2365     // Update the location of token as well as BufferPtr.
   2366     FormTokenWithChars(Result, CurPtr, tok::eod);
   2367 
   2368     // Restore comment saving mode, in case it was disabled for directive.
   2369     resetExtendedTokenMode();
   2370     return true;  // Have a token.
   2371   }
   2372 
   2373   // If we are in raw mode, return this event as an EOF token.  Let the caller
   2374   // that put us in raw mode handle the event.
   2375   if (isLexingRawMode()) {
   2376     Result.startToken();
   2377     BufferPtr = BufferEnd;
   2378     FormTokenWithChars(Result, BufferEnd, tok::eof);
   2379     return true;
   2380   }
   2381 
   2382   // Issue diagnostics for unterminated #if and missing newline.
   2383 
   2384   // If we are in a #if directive, emit an error.
   2385   while (!ConditionalStack.empty()) {
   2386     if (PP->getCodeCompletionFileLoc() != FileLoc)
   2387       PP->Diag(ConditionalStack.back().IfLoc,
   2388                diag::err_pp_unterminated_conditional);
   2389     ConditionalStack.pop_back();
   2390   }
   2391 
   2392   // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
   2393   // a pedwarn.
   2394   if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r'))
   2395     Diag(BufferEnd, LangOpts.CPlusPlus11 ? // C++11 [lex.phases] 2.2 p2
   2396          diag::warn_cxx98_compat_no_newline_eof : diag::ext_no_newline_eof)
   2397     << FixItHint::CreateInsertion(getSourceLocation(BufferEnd), "\n");
   2398 
   2399   BufferPtr = CurPtr;
   2400 
   2401   // Finally, let the preprocessor handle this.
   2402   return PP->HandleEndOfFile(Result, isPragmaLexer());
   2403 }
   2404 
   2405 /// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
   2406 /// the specified lexer will return a tok::l_paren token, 0 if it is something
   2407 /// else and 2 if there are no more tokens in the buffer controlled by the
   2408 /// lexer.
   2409 unsigned Lexer::isNextPPTokenLParen() {
   2410   assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?");
   2411 
   2412   // Switch to 'skipping' mode.  This will ensure that we can lex a token
   2413   // without emitting diagnostics, disables macro expansion, and will cause EOF
   2414   // to return an EOF token instead of popping the include stack.
   2415   LexingRawMode = true;
   2416 
   2417   // Save state that can be changed while lexing so that we can restore it.
   2418   const char *TmpBufferPtr = BufferPtr;
   2419   bool inPPDirectiveMode = ParsingPreprocessorDirective;
   2420 
   2421   Token Tok;
   2422   Tok.startToken();
   2423   LexTokenInternal(Tok);
   2424 
   2425   // Restore state that may have changed.
   2426   BufferPtr = TmpBufferPtr;
   2427   ParsingPreprocessorDirective = inPPDirectiveMode;
   2428 
   2429   // Restore the lexer back to non-skipping mode.
   2430   LexingRawMode = false;
   2431 
   2432   if (Tok.is(tok::eof))
   2433     return 2;
   2434   return Tok.is(tok::l_paren);
   2435 }
   2436 
   2437 /// \brief Find the end of a version control conflict marker.
   2438 static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd,
   2439                                    ConflictMarkerKind CMK) {
   2440   const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>";
   2441   size_t TermLen = CMK == CMK_Perforce ? 5 : 7;
   2442   StringRef RestOfBuffer(CurPtr+TermLen, BufferEnd-CurPtr-TermLen);
   2443   size_t Pos = RestOfBuffer.find(Terminator);
   2444   while (Pos != StringRef::npos) {
   2445     // Must occur at start of line.
   2446     if (RestOfBuffer[Pos-1] != '\r' &&
   2447         RestOfBuffer[Pos-1] != '\n') {
   2448       RestOfBuffer = RestOfBuffer.substr(Pos+TermLen);
   2449       Pos = RestOfBuffer.find(Terminator);
   2450       continue;
   2451     }
   2452     return RestOfBuffer.data()+Pos;
   2453   }
   2454   return 0;
   2455 }
   2456 
   2457 /// IsStartOfConflictMarker - If the specified pointer is the start of a version
   2458 /// control conflict marker like '<<<<<<<', recognize it as such, emit an error
   2459 /// and recover nicely.  This returns true if it is a conflict marker and false
   2460 /// if not.
   2461 bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
   2462   // Only a conflict marker if it starts at the beginning of a line.
   2463   if (CurPtr != BufferStart &&
   2464       CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
   2465     return false;
   2466 
   2467   // Check to see if we have <<<<<<< or >>>>.
   2468   if ((BufferEnd-CurPtr < 8 || StringRef(CurPtr, 7) != "<<<<<<<") &&
   2469       (BufferEnd-CurPtr < 6 || StringRef(CurPtr, 5) != ">>>> "))
   2470     return false;
   2471 
   2472   // If we have a situation where we don't care about conflict markers, ignore
   2473   // it.
   2474   if (CurrentConflictMarkerState || isLexingRawMode())
   2475     return false;
   2476 
   2477   ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce;
   2478 
   2479   // Check to see if there is an ending marker somewhere in the buffer at the
   2480   // start of a line to terminate this conflict marker.
   2481   if (FindConflictEnd(CurPtr, BufferEnd, Kind)) {
   2482     // We found a match.  We are really in a conflict marker.
   2483     // Diagnose this, and ignore to the end of line.
   2484     Diag(CurPtr, diag::err_conflict_marker);
   2485     CurrentConflictMarkerState = Kind;
   2486 
   2487     // Skip ahead to the end of line.  We know this exists because the
   2488     // end-of-conflict marker starts with \r or \n.
   2489     while (*CurPtr != '\r' && *CurPtr != '\n') {
   2490       assert(CurPtr != BufferEnd && "Didn't find end of line");
   2491       ++CurPtr;
   2492     }
   2493     BufferPtr = CurPtr;
   2494     return true;
   2495   }
   2496 
   2497   // No end of conflict marker found.
   2498   return false;
   2499 }
   2500 
   2501 
   2502 /// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if
   2503 /// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it
   2504 /// is the end of a conflict marker.  Handle it by ignoring up until the end of
   2505 /// the line.  This returns true if it is a conflict marker and false if not.
   2506 bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
   2507   // Only a conflict marker if it starts at the beginning of a line.
   2508   if (CurPtr != BufferStart &&
   2509       CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
   2510     return false;
   2511 
   2512   // If we have a situation where we don't care about conflict markers, ignore
   2513   // it.
   2514   if (!CurrentConflictMarkerState || isLexingRawMode())
   2515     return false;
   2516 
   2517   // Check to see if we have the marker (4 characters in a row).
   2518   for (unsigned i = 1; i != 4; ++i)
   2519     if (CurPtr[i] != CurPtr[0])
   2520       return false;
   2521 
   2522   // If we do have it, search for the end of the conflict marker.  This could
   2523   // fail if it got skipped with a '#if 0' or something.  Note that CurPtr might
   2524   // be the end of conflict marker.
   2525   if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
   2526                                         CurrentConflictMarkerState)) {
   2527     CurPtr = End;
   2528 
   2529     // Skip ahead to the end of line.
   2530     while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')
   2531       ++CurPtr;
   2532 
   2533     BufferPtr = CurPtr;
   2534 
   2535     // No longer in the conflict marker.
   2536     CurrentConflictMarkerState = CMK_None;
   2537     return true;
   2538   }
   2539 
   2540   return false;
   2541 }
   2542 
   2543 bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
   2544   if (PP && PP->isCodeCompletionEnabled()) {
   2545     SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart);
   2546     return Loc == PP->getCodeCompletionLoc();
   2547   }
   2548 
   2549   return false;
   2550 }
   2551 
   2552 uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
   2553                            Token *Result) {
   2554   unsigned CharSize;
   2555   char Kind = getCharAndSize(StartPtr, CharSize);
   2556 
   2557   unsigned NumHexDigits;
   2558   if (Kind == 'u')
   2559     NumHexDigits = 4;
   2560   else if (Kind == 'U')
   2561     NumHexDigits = 8;
   2562   else
   2563     return 0;
   2564 
   2565   if (!LangOpts.CPlusPlus && !LangOpts.C99) {
   2566     if (Result && !isLexingRawMode())
   2567       Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
   2568     return 0;
   2569   }
   2570 
   2571   const char *CurPtr = StartPtr + CharSize;
   2572   const char *KindLoc = &CurPtr[-1];
   2573 
   2574   uint32_t CodePoint = 0;
   2575   for (unsigned i = 0; i < NumHexDigits; ++i) {
   2576     char C = getCharAndSize(CurPtr, CharSize);
   2577 
   2578     unsigned Value = llvm::hexDigitValue(C);
   2579     if (Value == -1U) {
   2580       if (Result && !isLexingRawMode()) {
   2581         if (i == 0) {
   2582           Diag(BufferPtr, diag::warn_ucn_escape_no_digits)
   2583             << StringRef(KindLoc, 1);
   2584         } else {
   2585           Diag(BufferPtr, diag::warn_ucn_escape_incomplete);
   2586 
   2587           // If the user wrote \U1234, suggest a fixit to \u.
   2588           if (i == 4 && NumHexDigits == 8) {
   2589             CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1);
   2590             Diag(KindLoc, diag::note_ucn_four_not_eight)
   2591               << FixItHint::CreateReplacement(URange, "u");
   2592           }
   2593         }
   2594       }
   2595 
   2596       return 0;
   2597     }
   2598 
   2599     CodePoint <<= 4;
   2600     CodePoint += Value;
   2601 
   2602     CurPtr += CharSize;
   2603   }
   2604 
   2605   if (Result) {
   2606     Result->setFlag(Token::HasUCN);
   2607     if (CurPtr - StartPtr == (ptrdiff_t)NumHexDigits + 2)
   2608       StartPtr = CurPtr;
   2609     else
   2610       while (StartPtr != CurPtr)
   2611         (void)getAndAdvanceChar(StartPtr, *Result);
   2612   } else {
   2613     StartPtr = CurPtr;
   2614   }
   2615 
   2616   // C99 6.4.3p2: A universal character name shall not specify a character whose
   2617   //   short identifier is less than 00A0 other than 0024 ($), 0040 (@), or
   2618   //   0060 (`), nor one in the range D800 through DFFF inclusive.)
   2619   // C++11 [lex.charset]p2: If the hexadecimal value for a
   2620   //   universal-character-name corresponds to a surrogate code point (in the
   2621   //   range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally,
   2622   //   if the hexadecimal value for a universal-character-name outside the
   2623   //   c-char-sequence, s-char-sequence, or r-char-sequence of a character or
   2624   //   string literal corresponds to a control character (in either of the
   2625   //   ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the
   2626   //   basic source character set, the program is ill-formed.
   2627   if (CodePoint < 0xA0) {
   2628     if (CodePoint == 0x24 || CodePoint == 0x40 || CodePoint == 0x60)
   2629       return CodePoint;
   2630 
   2631     // We don't use isLexingRawMode() here because we need to warn about bad
   2632     // UCNs even when skipping preprocessing tokens in a #if block.
   2633     if (Result && PP) {
   2634       if (CodePoint < 0x20 || CodePoint >= 0x7F)
   2635         Diag(BufferPtr, diag::err_ucn_control_character);
   2636       else {
   2637         char C = static_cast<char>(CodePoint);
   2638         Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1);
   2639       }
   2640     }
   2641 
   2642     return 0;
   2643 
   2644   } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) {
   2645     // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't.
   2646     // We don't use isLexingRawMode() here because we need to diagnose bad
   2647     // UCNs even when skipping preprocessing tokens in a #if block.
   2648     if (Result && PP) {
   2649       if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11)
   2650         Diag(BufferPtr, diag::warn_ucn_escape_surrogate);
   2651       else
   2652         Diag(BufferPtr, diag::err_ucn_escape_invalid);
   2653     }
   2654     return 0;
   2655   }
   2656 
   2657   return CodePoint;
   2658 }
   2659 
   2660 void Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) {
   2661   if (!isLexingRawMode() && !PP->isPreprocessedOutput() &&
   2662       isCharInSet(C, UnicodeWhitespaceChars)) {
   2663     Diag(BufferPtr, diag::ext_unicode_whitespace)
   2664       << makeCharRange(*this, BufferPtr, CurPtr);
   2665 
   2666     Result.setFlag(Token::LeadingSpace);
   2667     if (SkipWhitespace(Result, CurPtr))
   2668       return; // KeepWhitespaceMode
   2669 
   2670     return LexTokenInternal(Result);
   2671   }
   2672 
   2673   if (isAllowedIDChar(C, LangOpts) && isAllowedInitiallyIDChar(C, LangOpts)) {
   2674     if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
   2675         !PP->isPreprocessedOutput()) {
   2676       maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C,
   2677                                 makeCharRange(*this, BufferPtr, CurPtr),
   2678                                 /*IsFirst=*/true);
   2679     }
   2680 
   2681     MIOpt.ReadToken();
   2682     return LexIdentifier(Result, CurPtr);
   2683   }
   2684 
   2685   if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
   2686       !PP->isPreprocessedOutput() &&
   2687       !isASCII(*BufferPtr) && !isAllowedIDChar(C, LangOpts)) {
   2688     // Non-ASCII characters tend to creep into source code unintentionally.
   2689     // Instead of letting the parser complain about the unknown token,
   2690     // just drop the character.
   2691     // Note that we can /only/ do this when the non-ASCII character is actually
   2692     // spelled as Unicode, not written as a UCN. The standard requires that
   2693     // we not throw away any possible preprocessor tokens, but there's a
   2694     // loophole in the mapping of Unicode characters to basic character set
   2695     // characters that allows us to map these particular characters to, say,
   2696     // whitespace.
   2697     Diag(BufferPtr, diag::err_non_ascii)
   2698       << FixItHint::CreateRemoval(makeCharRange(*this, BufferPtr, CurPtr));
   2699 
   2700     BufferPtr = CurPtr;
   2701     return LexTokenInternal(Result);
   2702   }
   2703 
   2704   // Otherwise, we have an explicit UCN or a character that's unlikely to show
   2705   // up by accident.
   2706   MIOpt.ReadToken();
   2707   FormTokenWithChars(Result, CurPtr, tok::unknown);
   2708 }
   2709 
   2710 
   2711 /// LexTokenInternal - This implements a simple C family lexer.  It is an
   2712 /// extremely performance critical piece of code.  This assumes that the buffer
   2713 /// has a null character at the end of the file.  This returns a preprocessing
   2714 /// token, not a normal token, as such, it is an internal interface.  It assumes
   2715 /// that the Flags of result have been cleared before calling this.
   2716 void Lexer::LexTokenInternal(Token &Result) {
   2717 LexNextToken:
   2718   // New token, can't need cleaning yet.
   2719   Result.clearFlag(Token::NeedsCleaning);
   2720   Result.setIdentifierInfo(0);
   2721 
   2722   // CurPtr - Cache BufferPtr in an automatic variable.
   2723   const char *CurPtr = BufferPtr;
   2724 
   2725   // Small amounts of horizontal whitespace is very common between tokens.
   2726   if ((*CurPtr == ' ') || (*CurPtr == '\t')) {
   2727     ++CurPtr;
   2728     while ((*CurPtr == ' ') || (*CurPtr == '\t'))
   2729       ++CurPtr;
   2730 
   2731     // If we are keeping whitespace and other tokens, just return what we just
   2732     // skipped.  The next lexer invocation will return the token after the
   2733     // whitespace.
   2734     if (isKeepWhitespaceMode()) {
   2735       FormTokenWithChars(Result, CurPtr, tok::unknown);
   2736       // FIXME: The next token will not have LeadingSpace set.
   2737       return;
   2738     }
   2739 
   2740     BufferPtr = CurPtr;
   2741     Result.setFlag(Token::LeadingSpace);
   2742   }
   2743 
   2744   unsigned SizeTmp, SizeTmp2;   // Temporaries for use in cases below.
   2745 
   2746   // Read a character, advancing over it.
   2747   char Char = getAndAdvanceChar(CurPtr, Result);
   2748   tok::TokenKind Kind;
   2749 
   2750   switch (Char) {
   2751   case 0:  // Null.
   2752     // Found end of file?
   2753     if (CurPtr-1 == BufferEnd) {
   2754       // Read the PP instance variable into an automatic variable, because
   2755       // LexEndOfFile will often delete 'this'.
   2756       Preprocessor *PPCache = PP;
   2757       if (LexEndOfFile(Result, CurPtr-1))  // Retreat back into the file.
   2758         return;   // Got a token to return.
   2759       assert(PPCache && "Raw buffer::LexEndOfFile should return a token");
   2760       return PPCache->Lex(Result);
   2761     }
   2762 
   2763     // Check if we are performing code completion.
   2764     if (isCodeCompletionPoint(CurPtr-1)) {
   2765       // Return the code-completion token.
   2766       Result.startToken();
   2767       FormTokenWithChars(Result, CurPtr, tok::code_completion);
   2768       return;
   2769     }
   2770 
   2771     if (!isLexingRawMode())
   2772       Diag(CurPtr-1, diag::null_in_file);
   2773     Result.setFlag(Token::LeadingSpace);
   2774     if (SkipWhitespace(Result, CurPtr))
   2775       return; // KeepWhitespaceMode
   2776 
   2777     goto LexNextToken;   // GCC isn't tail call eliminating.
   2778 
   2779   case 26:  // DOS & CP/M EOF: "^Z".
   2780     // If we're in Microsoft extensions mode, treat this as end of file.
   2781     if (LangOpts.MicrosoftExt) {
   2782       // Read the PP instance variable into an automatic variable, because
   2783       // LexEndOfFile will often delete 'this'.
   2784       Preprocessor *PPCache = PP;
   2785       if (LexEndOfFile(Result, CurPtr-1))  // Retreat back into the file.
   2786         return;   // Got a token to return.
   2787       assert(PPCache && "Raw buffer::LexEndOfFile should return a token");
   2788       return PPCache->Lex(Result);
   2789     }
   2790     // If Microsoft extensions are disabled, this is just random garbage.
   2791     Kind = tok::unknown;
   2792     break;
   2793 
   2794   case '\n':
   2795   case '\r':
   2796     // If we are inside a preprocessor directive and we see the end of line,
   2797     // we know we are done with the directive, so return an EOD token.
   2798     if (ParsingPreprocessorDirective) {
   2799       // Done parsing the "line".
   2800       ParsingPreprocessorDirective = false;
   2801 
   2802       // Restore comment saving mode, in case it was disabled for directive.
   2803       if (PP)
   2804         resetExtendedTokenMode();
   2805 
   2806       // Since we consumed a newline, we are back at the start of a line.
   2807       IsAtStartOfLine = true;
   2808 
   2809       Kind = tok::eod;
   2810       break;
   2811     }
   2812 
   2813     // No leading whitespace seen so far.
   2814     Result.clearFlag(Token::LeadingSpace);
   2815 
   2816     if (SkipWhitespace(Result, CurPtr))
   2817       return; // KeepWhitespaceMode
   2818     goto LexNextToken;   // GCC isn't tail call eliminating.
   2819   case ' ':
   2820   case '\t':
   2821   case '\f':
   2822   case '\v':
   2823   SkipHorizontalWhitespace:
   2824     Result.setFlag(Token::LeadingSpace);
   2825     if (SkipWhitespace(Result, CurPtr))
   2826       return; // KeepWhitespaceMode
   2827 
   2828   SkipIgnoredUnits:
   2829     CurPtr = BufferPtr;
   2830 
   2831     // If the next token is obviously a // or /* */ comment, skip it efficiently
   2832     // too (without going through the big switch stmt).
   2833     if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
   2834         LangOpts.LineComment && !LangOpts.TraditionalCPP) {
   2835       if (SkipLineComment(Result, CurPtr+2))
   2836         return; // There is a token to return.
   2837       goto SkipIgnoredUnits;
   2838     } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) {
   2839       if (SkipBlockComment(Result, CurPtr+2))
   2840         return; // There is a token to return.
   2841       goto SkipIgnoredUnits;
   2842     } else if (isHorizontalWhitespace(*CurPtr)) {
   2843       goto SkipHorizontalWhitespace;
   2844     }
   2845     goto LexNextToken;   // GCC isn't tail call eliminating.
   2846 
   2847   // C99 6.4.4.1: Integer Constants.
   2848   // C99 6.4.4.2: Floating Constants.
   2849   case '0': case '1': case '2': case '3': case '4':
   2850   case '5': case '6': case '7': case '8': case '9':
   2851     // Notify MIOpt that we read a non-whitespace/non-comment token.
   2852     MIOpt.ReadToken();
   2853     return LexNumericConstant(Result, CurPtr);
   2854 
   2855   case 'u':   // Identifier (uber) or C11/C++11 UTF-8 or UTF-16 string literal
   2856     // Notify MIOpt that we read a non-whitespace/non-comment token.
   2857     MIOpt.ReadToken();
   2858 
   2859     if (LangOpts.CPlusPlus11 || LangOpts.C11) {
   2860       Char = getCharAndSize(CurPtr, SizeTmp);
   2861 
   2862       // UTF-16 string literal
   2863       if (Char == '"')
   2864         return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
   2865                                 tok::utf16_string_literal);
   2866 
   2867       // UTF-16 character constant
   2868       if (Char == '\'')
   2869         return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
   2870                                tok::utf16_char_constant);
   2871 
   2872       // UTF-16 raw string literal
   2873       if (Char == 'R' && LangOpts.CPlusPlus11 &&
   2874           getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
   2875         return LexRawStringLiteral(Result,
   2876                                ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
   2877                                            SizeTmp2, Result),
   2878                                tok::utf16_string_literal);
   2879 
   2880       if (Char == '8') {
   2881         char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);
   2882 
   2883         // UTF-8 string literal
   2884         if (Char2 == '"')
   2885           return LexStringLiteral(Result,
   2886                                ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
   2887                                            SizeTmp2, Result),
   2888                                tok::utf8_string_literal);
   2889 
   2890         if (Char2 == 'R' && LangOpts.CPlusPlus11) {
   2891           unsigned SizeTmp3;
   2892           char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
   2893           // UTF-8 raw string literal
   2894           if (Char3 == '"') {
   2895             return LexRawStringLiteral(Result,
   2896                    ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
   2897                                            SizeTmp2, Result),
   2898                                SizeTmp3, Result),
   2899                    tok::utf8_string_literal);
   2900           }
   2901         }
   2902       }
   2903     }
   2904 
   2905     // treat u like the start of an identifier.
   2906     return LexIdentifier(Result, CurPtr);
   2907 
   2908   case 'U':   // Identifier (Uber) or C11/C++11 UTF-32 string literal
   2909     // Notify MIOpt that we read a non-whitespace/non-comment token.
   2910     MIOpt.ReadToken();
   2911 
   2912     if (LangOpts.CPlusPlus11 || LangOpts.C11) {
   2913       Char = getCharAndSize(CurPtr, SizeTmp);
   2914 
   2915       // UTF-32 string literal
   2916       if (Char == '"')
   2917         return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
   2918                                 tok::utf32_string_literal);
   2919 
   2920       // UTF-32 character constant
   2921       if (Char == '\'')
   2922         return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
   2923                                tok::utf32_char_constant);
   2924 
   2925       // UTF-32 raw string literal
   2926       if (Char == 'R' && LangOpts.CPlusPlus11 &&
   2927           getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
   2928         return LexRawStringLiteral(Result,
   2929                                ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
   2930                                            SizeTmp2, Result),
   2931                                tok::utf32_string_literal);
   2932     }
   2933 
   2934     // treat U like the start of an identifier.
   2935     return LexIdentifier(Result, CurPtr);
   2936 
   2937   case 'R': // Identifier or C++0x raw string literal
   2938     // Notify MIOpt that we read a non-whitespace/non-comment token.
   2939     MIOpt.ReadToken();
   2940 
   2941     if (LangOpts.CPlusPlus11) {
   2942       Char = getCharAndSize(CurPtr, SizeTmp);
   2943 
   2944       if (Char == '"')
   2945         return LexRawStringLiteral(Result,
   2946                                    ConsumeChar(CurPtr, SizeTmp, Result),
   2947                                    tok::string_literal);
   2948     }
   2949 
   2950     // treat R like the start of an identifier.
   2951     return LexIdentifier(Result, CurPtr);
   2952 
   2953   case 'L':   // Identifier (Loony) or wide literal (L'x' or L"xyz").
   2954     // Notify MIOpt that we read a non-whitespace/non-comment token.
   2955     MIOpt.ReadToken();
   2956     Char = getCharAndSize(CurPtr, SizeTmp);
   2957 
   2958     // Wide string literal.
   2959     if (Char == '"')
   2960       return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
   2961                               tok::wide_string_literal);
   2962 
   2963     // Wide raw string literal.
   2964     if (LangOpts.CPlusPlus11 && Char == 'R' &&
   2965         getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
   2966       return LexRawStringLiteral(Result,
   2967                                ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
   2968                                            SizeTmp2, Result),
   2969                                tok::wide_string_literal);
   2970 
   2971     // Wide character constant.
   2972     if (Char == '\'')
   2973       return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
   2974                              tok::wide_char_constant);
   2975     // FALL THROUGH, treating L like the start of an identifier.
   2976 
   2977   // C99 6.4.2: Identifiers.
   2978   case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
   2979   case 'H': case 'I': case 'J': case 'K':    /*'L'*/case 'M': case 'N':
   2980   case 'O': case 'P': case 'Q':    /*'R'*/case 'S': case 'T':    /*'U'*/
   2981   case 'V': case 'W': case 'X': case 'Y': case 'Z':
   2982   case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
   2983   case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
   2984   case 'o': case 'p': case 'q': case 'r': case 's': case 't':    /*'u'*/
   2985   case 'v': case 'w': case 'x': case 'y': case 'z':
   2986   case '_':
   2987     // Notify MIOpt that we read a non-whitespace/non-comment token.
   2988     MIOpt.ReadToken();
   2989     return LexIdentifier(Result, CurPtr);
   2990 
   2991   case '$':   // $ in identifiers.
   2992     if (LangOpts.DollarIdents) {
   2993       if (!isLexingRawMode())
   2994         Diag(CurPtr-1, diag::ext_dollar_in_identifier);
   2995       // Notify MIOpt that we read a non-whitespace/non-comment token.
   2996       MIOpt.ReadToken();
   2997       return LexIdentifier(Result, CurPtr);
   2998     }
   2999 
   3000     Kind = tok::unknown;
   3001     break;
   3002 
   3003   // C99 6.4.4: Character Constants.
   3004   case '\'':
   3005     // Notify MIOpt that we read a non-whitespace/non-comment token.
   3006     MIOpt.ReadToken();
   3007     return LexCharConstant(Result, CurPtr, tok::char_constant);
   3008 
   3009   // C99 6.4.5: String Literals.
   3010   case '"':
   3011     // Notify MIOpt that we read a non-whitespace/non-comment token.
   3012     MIOpt.ReadToken();
   3013     return LexStringLiteral(Result, CurPtr, tok::string_literal);
   3014 
   3015   // C99 6.4.6: Punctuators.
   3016   case '?':
   3017     Kind = tok::question;
   3018     break;
   3019   case '[':
   3020     Kind = tok::l_square;
   3021     break;
   3022   case ']':
   3023     Kind = tok::r_square;
   3024     break;
   3025   case '(':
   3026     Kind = tok::l_paren;
   3027     break;
   3028   case ')':
   3029     Kind = tok::r_paren;
   3030     break;
   3031   case '{':
   3032     Kind = tok::l_brace;
   3033     break;
   3034   case '}':
   3035     Kind = tok::r_brace;
   3036     break;
   3037   case '.':
   3038     Char = getCharAndSize(CurPtr, SizeTmp);
   3039     if (Char >= '0' && Char <= '9') {
   3040       // Notify MIOpt that we read a non-whitespace/non-comment token.
   3041       MIOpt.ReadToken();
   3042 
   3043       return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
   3044     } else if (LangOpts.CPlusPlus && Char == '*') {
   3045       Kind = tok::periodstar;
   3046       CurPtr += SizeTmp;
   3047     } else if (Char == '.' &&
   3048                getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
   3049       Kind = tok::ellipsis;
   3050       CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
   3051                            SizeTmp2, Result);
   3052     } else {
   3053       Kind = tok::period;
   3054     }
   3055     break;
   3056   case '&':
   3057     Char = getCharAndSize(CurPtr, SizeTmp);
   3058     if (Char == '&') {
   3059       Kind = tok::ampamp;
   3060       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3061     } else if (Char == '=') {
   3062       Kind = tok::ampequal;
   3063       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3064     } else {
   3065       Kind = tok::amp;
   3066     }
   3067     break;
   3068   case '*':
   3069     if (getCharAndSize(CurPtr, SizeTmp) == '=') {
   3070       Kind = tok::starequal;
   3071       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3072     } else {
   3073       Kind = tok::star;
   3074     }
   3075     break;
   3076   case '+':
   3077     Char = getCharAndSize(CurPtr, SizeTmp);
   3078     if (Char == '+') {
   3079       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3080       Kind = tok::plusplus;
   3081     } else if (Char == '=') {
   3082       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3083       Kind = tok::plusequal;
   3084     } else {
   3085       Kind = tok::plus;
   3086     }
   3087     break;
   3088   case '-':
   3089     Char = getCharAndSize(CurPtr, SizeTmp);
   3090     if (Char == '-') {      // --
   3091       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3092       Kind = tok::minusminus;
   3093     } else if (Char == '>' && LangOpts.CPlusPlus &&
   3094                getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') {  // C++ ->*
   3095       CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
   3096                            SizeTmp2, Result);
   3097       Kind = tok::arrowstar;
   3098     } else if (Char == '>') {   // ->
   3099       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3100       Kind = tok::arrow;
   3101     } else if (Char == '=') {   // -=
   3102       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3103       Kind = tok::minusequal;
   3104     } else {
   3105       Kind = tok::minus;
   3106     }
   3107     break;
   3108   case '~':
   3109     Kind = tok::tilde;
   3110     break;
   3111   case '!':
   3112     if (getCharAndSize(CurPtr, SizeTmp) == '=') {
   3113       Kind = tok::exclaimequal;
   3114       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3115     } else {
   3116       Kind = tok::exclaim;
   3117     }
   3118     break;
   3119   case '/':
   3120     // 6.4.9: Comments
   3121     Char = getCharAndSize(CurPtr, SizeTmp);
   3122     if (Char == '/') {         // Line comment.
   3123       // Even if Line comments are disabled (e.g. in C89 mode), we generally
   3124       // want to lex this as a comment.  There is one problem with this though,
   3125       // that in one particular corner case, this can change the behavior of the
   3126       // resultant program.  For example, In  "foo //**/ bar", C89 would lex
   3127       // this as "foo / bar" and languages with Line comments would lex it as
   3128       // "foo".  Check to see if the character after the second slash is a '*'.
   3129       // If so, we will lex that as a "/" instead of the start of a comment.
   3130       // However, we never do this if we are just preprocessing.
   3131       bool TreatAsComment = LangOpts.LineComment && !LangOpts.TraditionalCPP;
   3132       if (!TreatAsComment)
   3133         if (!(PP && PP->isPreprocessedOutput()))
   3134           TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*';
   3135 
   3136       if (TreatAsComment) {
   3137         if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result)))
   3138           return; // There is a token to return.
   3139 
   3140         // It is common for the tokens immediately after a // comment to be
   3141         // whitespace (indentation for the next line).  Instead of going through
   3142         // the big switch, handle it efficiently now.
   3143         goto SkipIgnoredUnits;
   3144       }
   3145     }
   3146 
   3147     if (Char == '*') {  // /**/ comment.
   3148       if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result)))
   3149         return; // There is a token to return.
   3150       goto LexNextToken;   // GCC isn't tail call eliminating.
   3151     }
   3152 
   3153     if (Char == '=') {
   3154       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3155       Kind = tok::slashequal;
   3156     } else {
   3157       Kind = tok::slash;
   3158     }
   3159     break;
   3160   case '%':
   3161     Char = getCharAndSize(CurPtr, SizeTmp);
   3162     if (Char == '=') {
   3163       Kind = tok::percentequal;
   3164       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3165     } else if (LangOpts.Digraphs && Char == '>') {
   3166       Kind = tok::r_brace;                             // '%>' -> '}'
   3167       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3168     } else if (LangOpts.Digraphs && Char == ':') {
   3169       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3170       Char = getCharAndSize(CurPtr, SizeTmp);
   3171       if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {
   3172         Kind = tok::hashhash;                          // '%:%:' -> '##'
   3173         CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
   3174                              SizeTmp2, Result);
   3175       } else if (Char == '@' && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize
   3176         CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3177         if (!isLexingRawMode())
   3178           Diag(BufferPtr, diag::ext_charize_microsoft);
   3179         Kind = tok::hashat;
   3180       } else {                                         // '%:' -> '#'
   3181         // We parsed a # character.  If this occurs at the start of the line,
   3182         // it's actually the start of a preprocessing directive.  Callback to
   3183         // the preprocessor to handle it.
   3184         // FIXME: -fpreprocessed mode??
   3185         if (Result.isAtStartOfLine() && !LexingRawMode && !Is_PragmaLexer)
   3186           goto HandleDirective;
   3187 
   3188         Kind = tok::hash;
   3189       }
   3190     } else {
   3191       Kind = tok::percent;
   3192     }
   3193     break;
   3194   case '<':
   3195     Char = getCharAndSize(CurPtr, SizeTmp);
   3196     if (ParsingFilename) {
   3197       return LexAngledStringLiteral(Result, CurPtr);
   3198     } else if (Char == '<') {
   3199       char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
   3200       if (After == '=') {
   3201         Kind = tok::lesslessequal;
   3202         CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
   3203                              SizeTmp2, Result);
   3204       } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) {
   3205         // If this is actually a '<<<<<<<' version control conflict marker,
   3206         // recognize it as such and recover nicely.
   3207         goto LexNextToken;
   3208       } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) {
   3209         // If this is '<<<<' and we're in a Perforce-style conflict marker,
   3210         // ignore it.
   3211         goto LexNextToken;
   3212       } else if (LangOpts.CUDA && After == '<') {
   3213         Kind = tok::lesslessless;
   3214         CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
   3215                              SizeTmp2, Result);
   3216       } else {
   3217         CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3218         Kind = tok::lessless;
   3219       }
   3220     } else if (Char == '=') {
   3221       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3222       Kind = tok::lessequal;
   3223     } else if (LangOpts.Digraphs && Char == ':') {     // '<:' -> '['
   3224       if (LangOpts.CPlusPlus11 &&
   3225           getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') {
   3226         // C++0x [lex.pptoken]p3:
   3227         //  Otherwise, if the next three characters are <:: and the subsequent
   3228         //  character is neither : nor >, the < is treated as a preprocessor
   3229         //  token by itself and not as the first character of the alternative
   3230         //  token <:.
   3231         unsigned SizeTmp3;
   3232         char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
   3233         if (After != ':' && After != '>') {
   3234           Kind = tok::less;
   3235           if (!isLexingRawMode())
   3236             Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon);
   3237           break;
   3238         }
   3239       }
   3240 
   3241       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3242       Kind = tok::l_square;
   3243     } else if (LangOpts.Digraphs && Char == '%') {     // '<%' -> '{'
   3244       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3245       Kind = tok::l_brace;
   3246     } else {
   3247       Kind = tok::less;
   3248     }
   3249     break;
   3250   case '>':
   3251     Char = getCharAndSize(CurPtr, SizeTmp);
   3252     if (Char == '=') {
   3253       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3254       Kind = tok::greaterequal;
   3255     } else if (Char == '>') {
   3256       char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
   3257       if (After == '=') {
   3258         CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
   3259                              SizeTmp2, Result);
   3260         Kind = tok::greatergreaterequal;
   3261       } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) {
   3262         // If this is actually a '>>>>' conflict marker, recognize it as such
   3263         // and recover nicely.
   3264         goto LexNextToken;
   3265       } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) {
   3266         // If this is '>>>>>>>' and we're in a conflict marker, ignore it.
   3267         goto LexNextToken;
   3268       } else if (LangOpts.CUDA && After == '>') {
   3269         Kind = tok::greatergreatergreater;
   3270         CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
   3271                              SizeTmp2, Result);
   3272       } else {
   3273         CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3274         Kind = tok::greatergreater;
   3275       }
   3276 
   3277     } else {
   3278       Kind = tok::greater;
   3279     }
   3280     break;
   3281   case '^':
   3282     Char = getCharAndSize(CurPtr, SizeTmp);
   3283     if (Char == '=') {
   3284       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3285       Kind = tok::caretequal;
   3286     } else {
   3287       Kind = tok::caret;
   3288     }
   3289     break;
   3290   case '|':
   3291     Char = getCharAndSize(CurPtr, SizeTmp);
   3292     if (Char == '=') {
   3293       Kind = tok::pipeequal;
   3294       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3295     } else if (Char == '|') {
   3296       // If this is '|||||||' and we're in a conflict marker, ignore it.
   3297       if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1))
   3298         goto LexNextToken;
   3299       Kind = tok::pipepipe;
   3300       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3301     } else {
   3302       Kind = tok::pipe;
   3303     }
   3304     break;
   3305   case ':':
   3306     Char = getCharAndSize(CurPtr, SizeTmp);
   3307     if (LangOpts.Digraphs && Char == '>') {
   3308       Kind = tok::r_square; // ':>' -> ']'
   3309       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3310     } else if (LangOpts.CPlusPlus && Char == ':') {
   3311       Kind = tok::coloncolon;
   3312       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3313     } else {
   3314       Kind = tok::colon;
   3315     }
   3316     break;
   3317   case ';':
   3318     Kind = tok::semi;
   3319     break;
   3320   case '=':
   3321     Char = getCharAndSize(CurPtr, SizeTmp);
   3322     if (Char == '=') {
   3323       // If this is '====' and we're in a conflict marker, ignore it.
   3324       if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))
   3325         goto LexNextToken;
   3326 
   3327       Kind = tok::equalequal;
   3328       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3329     } else {
   3330       Kind = tok::equal;
   3331     }
   3332     break;
   3333   case ',':
   3334     Kind = tok::comma;
   3335     break;
   3336   case '#':
   3337     Char = getCharAndSize(CurPtr, SizeTmp);
   3338     if (Char == '#') {
   3339       Kind = tok::hashhash;
   3340       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3341     } else if (Char == '@' && LangOpts.MicrosoftExt) {  // #@ -> Charize
   3342       Kind = tok::hashat;
   3343       if (!isLexingRawMode())
   3344         Diag(BufferPtr, diag::ext_charize_microsoft);
   3345       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3346     } else {
   3347       // We parsed a # character.  If this occurs at the start of the line,
   3348       // it's actually the start of a preprocessing directive.  Callback to
   3349       // the preprocessor to handle it.
   3350       // FIXME: -fpreprocessed mode??
   3351       if (Result.isAtStartOfLine() && !LexingRawMode && !Is_PragmaLexer)
   3352         goto HandleDirective;
   3353 
   3354       Kind = tok::hash;
   3355     }
   3356     break;
   3357 
   3358   case '@':
   3359     // Objective C support.
   3360     if (CurPtr[-1] == '@' && LangOpts.ObjC1)
   3361       Kind = tok::at;
   3362     else
   3363       Kind = tok::unknown;
   3364     break;
   3365 
   3366   // UCNs (C99 6.4.3, C++11 [lex.charset]p2)
   3367   case '\\':
   3368     if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result))
   3369       return LexUnicode(Result, CodePoint, CurPtr);
   3370 
   3371     Kind = tok::unknown;
   3372     break;
   3373 
   3374   default: {
   3375     if (isASCII(Char)) {
   3376       Kind = tok::unknown;
   3377       break;
   3378     }
   3379 
   3380     UTF32 CodePoint;
   3381 
   3382     // We can't just reset CurPtr to BufferPtr because BufferPtr may point to
   3383     // an escaped newline.
   3384     --CurPtr;
   3385     ConversionResult Status =
   3386         llvm::convertUTF8Sequence((const UTF8 **)&CurPtr,
   3387                                   (const UTF8 *)BufferEnd,
   3388                                   &CodePoint,
   3389                                   strictConversion);
   3390     if (Status == conversionOK)
   3391       return LexUnicode(Result, CodePoint, CurPtr);
   3392 
   3393     if (isLexingRawMode() || ParsingPreprocessorDirective ||
   3394         PP->isPreprocessedOutput()) {
   3395       ++CurPtr;
   3396       Kind = tok::unknown;
   3397       break;
   3398     }
   3399 
   3400     // Non-ASCII characters tend to creep into source code unintentionally.
   3401     // Instead of letting the parser complain about the unknown token,
   3402     // just diagnose the invalid UTF-8, then drop the character.
   3403     Diag(CurPtr, diag::err_invalid_utf8);
   3404 
   3405     BufferPtr = CurPtr+1;
   3406     goto LexNextToken;
   3407   }
   3408   }
   3409 
   3410   // Notify MIOpt that we read a non-whitespace/non-comment token.
   3411   MIOpt.ReadToken();
   3412 
   3413   // Update the location of token as well as BufferPtr.
   3414   FormTokenWithChars(Result, CurPtr, Kind);
   3415   return;
   3416 
   3417 HandleDirective:
   3418   // We parsed a # character and it's the start of a preprocessing directive.
   3419 
   3420   FormTokenWithChars(Result, CurPtr, tok::hash);
   3421   PP->HandleDirective(Result);
   3422 
   3423   // As an optimization, if the preprocessor didn't switch lexers, tail
   3424   // recurse.
   3425   if (PP->isCurrentLexer(this)) {
   3426     // Start a new token.  If this is a #include or something, the PP may
   3427     // want us starting at the beginning of the line again.  If so, set
   3428     // the StartOfLine flag and clear LeadingSpace.
   3429     if (IsAtStartOfLine) {
   3430       Result.setFlag(Token::StartOfLine);
   3431       Result.clearFlag(Token::LeadingSpace);
   3432       IsAtStartOfLine = false;
   3433     }
   3434     goto LexNextToken;   // GCC isn't tail call eliminating.
   3435   }
   3436   return PP->Lex(Result);
   3437 }
   3438