Home | History | Annotate | Download | only in Lex
      1 //===--- Lexer.cpp - C Language Family Lexer ------------------------------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 //  This file implements the Lexer and Token interfaces.
     11 //
     12 //===----------------------------------------------------------------------===//
     13 //
     14 // TODO: GCC Diagnostics emitted by the lexer:
     15 // PEDWARN: (form feed|vertical tab) in preprocessing directive
     16 //
     17 // Universal characters, unicode, char mapping:
     18 // WARNING: `%.*s' is not in NFKC
     19 // WARNING: `%.*s' is not in NFC
     20 //
     21 // Other:
     22 // TODO: Options to support:
     23 //    -fexec-charset,-fwide-exec-charset
     24 //
     25 //===----------------------------------------------------------------------===//
     26 
     27 #include "clang/Lex/Lexer.h"
     28 #include "clang/Basic/CharInfo.h"
     29 #include "clang/Basic/SourceManager.h"
     30 #include "clang/Lex/CodeCompletionHandler.h"
     31 #include "clang/Lex/LexDiagnostic.h"
     32 #include "clang/Lex/Preprocessor.h"
     33 #include "llvm/ADT/STLExtras.h"
     34 #include "llvm/ADT/StringExtras.h"
     35 #include "llvm/ADT/StringSwitch.h"
     36 #include "llvm/Support/Compiler.h"
     37 #include "llvm/Support/ConvertUTF.h"
     38 #include "llvm/Support/MemoryBuffer.h"
     39 #include "UnicodeCharSets.h"
     40 #include <cstring>
     41 using namespace clang;
     42 
     43 //===----------------------------------------------------------------------===//
     44 // Token Class Implementation
     45 //===----------------------------------------------------------------------===//
     46 
     47 /// isObjCAtKeyword - True iff this token's identifier is the ObjC keyword objcKey.
     48 bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const {
     49   IdentifierInfo *Ident = getIdentifierInfo();
     50   // A token without identifier info never matches.
     51   return Ident && Ident->getObjCKeywordID() == objcKey;
     52 }
     53 
     54 /// getObjCKeywordID - ObjC keyword kind of this token's identifier, if any.
     55 tok::ObjCKeywordKind Token::getObjCKeywordID() const {
     56   if (IdentifierInfo *II = getIdentifierInfo()) return II->getObjCKeywordID();
     57   return tok::objc_not_keyword; // no identifier info on this token
     58 }
     59 
     60 
     61 //===----------------------------------------------------------------------===//
     62 // Lexer Class Implementation
     63 //===----------------------------------------------------------------------===//
     64 
     65 void Lexer::anchor() { } // Intentionally empty.  NOTE(review): presumably pins Lexer's vtable to this file -- confirm.
     66 
     67 void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
     68                       const char *BufEnd) {
     69   // The lexer assumes the buffer is nul-terminated at BufEnd.
     70   assert(BufEnd[0] == 0 &&
     71          "We assume that the input buffer has a null character at the end"
     72          " to simplify lexing!");
     73 
     74   BufferStart = BufStart;
     75   BufferEnd = BufEnd;
     76   BufferPtr = BufPtr;
     77 
     78   // Only UTF-8 input, with or without a byte-order mark, is supported right
     79   // now.  So when lexing begins at the very start of the buffer, step over a
     80   // leading UTF-8 BOM if there is one.
     81   if (BufferPtr == BufferStart) {
     82     const StringRef Contents(BufferStart, BufferEnd - BufferStart);
     83     const bool HasUTF8BOM = Contents.startswith("\xEF\xBB\xBF");
     84     if (HasUTF8BOM)
     85       BufferPtr += 3; // the UTF-8 BOM is three bytes long
     86   }
     87 
     88   // Everything below resets lexer state to its defaults.
     89 
     90   // Not a _Pragma lexer; the conflict-marker state starts at CMK_None.
     91   Is_PragmaLexer = false;
     92   CurrentConflictMarkerState = CMK_None;
     93 
     94   // Start of the file is a start of line.
     95   IsAtStartOfLine = true;
     96 
     97   // Not in the middle of a '#' directive...
     98   ParsingPreprocessorDirective = false;
     99 
    100   // ...and not lexing the filename of an #include.
    101   ParsingFilename = false;
    102 
    103   // Not in raw mode.  Raw mode turns off diagnostics and the interpretation
    104   // of tokens (e.g. identifiers, which also disables macro expansion); it is
    105   // used to lex quickly through a buffer, e.g. when handling a "#if 0" block
    106   // or otherwise skipping over tokens.
    107   LexingRawMode = false;
    108 
    109   // Comments are not kept by default.
    110   ExtendedTokenMode = 0;
    111 }
    112 
    113 /// Lexer constructor - Build a lexer over the whole of InputFile, with the
    114 /// preprocessor PP managing the lexing process.  The lexer does not take
    115 /// ownership of the buffer or of the preprocessor, so both are assumed to
    116 /// outlive it.  Lexing starts at the beginning of the buffer.
    117 Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *InputFile, Preprocessor &PP)
    118   : PreprocessorLexer(&PP, FID),
    119     FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
    120     LangOpts(PP.getLangOpts()) {
    121   const char *const Begin = InputFile->getBufferStart();
    122   const char *const End = InputFile->getBufferEnd();
    123   InitLexer(Begin, Begin, End);
    124 
    125   resetExtendedTokenMode();
    126 }
    127 
    128 void Lexer::resetExtendedTokenMode() {
    129   assert(PP && "Cannot reset token mode without a preprocessor");
    130   const bool KeepWhitespace = LangOpts.TraditionalCPP;
    131   if (KeepWhitespace) { SetKeepWhitespaceMode(true); return; }
    132   // Otherwise mirror the preprocessor's comment-retention setting.
    133   SetCommentRetentionState(PP->getCommentRetentionState());
    134 }
    135 
    136 /// Lexer constructor - Create a new raw lexer object.  This object is only
    137 /// suitable for calls to 'LexFromRawLexer'.  The text range BufStart..BufEnd
    138 /// is borrowed, not owned, so it must outlive the lexer; lexing starts at BufPtr.
    139 Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts,
    140              const char *BufStart, const char *BufPtr, const char *BufEnd)
    141   : FileLoc(fileloc), LangOpts(langOpts) {
    142   // InitLexer asserts that BufEnd[0] is nul and resets the lexing-mode flags.
    143   InitLexer(BufStart, BufPtr, BufEnd);
    144 
    145   // We *are* in raw mode.  Set after InitLexer, which clears this flag.
    146   LexingRawMode = true;
    147 }
    148 
    149 /// Lexer constructor - Create a raw lexer covering the whole of FromFile.
    150 /// The result is only suitable for calls to 'LexFromRawLexer'.  The buffer
    151 /// is borrowed rather than owned, so it is assumed to outlive the lexer.
    152 Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *FromFile,
    153              const SourceManager &SM, const LangOptions &langOpts)
    154   : FileLoc(SM.getLocForStartOfFile(FID)), LangOpts(langOpts) {
    155   const char *const Begin = FromFile->getBufferStart();
    156   const char *const End = FromFile->getBufferEnd();
    157   // Lex from the start of the buffer, then switch to raw mode (InitLexer
    158   // clears that flag, so it has to be set afterwards).
    159   InitLexer(Begin, Begin, End);
    160   LexingRawMode = true;
    161 }
    162 
    163 /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
    164 /// _Pragma expansion.  This has a variety of magic semantics that this method
    165 /// sets up.  It returns a new'd Lexer that must be delete'd when done.
    166 ///
    167 /// \param SpellingLoc location whose character data holds the bytes to lex.
    168 /// \param ExpansionLocStart,ExpansionLocEnd the expansion range that all lexed
    169 ///        tokens should appear to be "expanded from".
    170 /// \param TokLen number of bytes to lex; the byte after them must be nul.
    171 /// \param PP preprocessor supplying the SourceManager and given to the new lexer.
    172 /// FIXME: It would really be nice to make _Pragma just be a wrapper around a
    173 /// normal lexer that remaps tokens as they fly by.  This would require making
    174 /// Preprocessor::Lex virtual.  Given that, we could just dump in a magic lexer
    175 /// interface that could handle this stuff.  This would pull GetMappedTokenLoc
    176 /// out of the critical path of the lexer!
    177 ///
    178 Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc,
    179                                  SourceLocation ExpansionLocStart,
    180                                  SourceLocation ExpansionLocEnd,
    181                                  unsigned TokLen, Preprocessor &PP) {
    182   SourceManager &SM = PP.getSourceManager();
    183 
    184   // Create the lexer as if we were going to lex the file normally.
    185   FileID SpellingFID = SM.getFileID(SpellingLoc);
    186   const llvm::MemoryBuffer *InputFile = SM.getBuffer(SpellingFID);
    187   Lexer *L = new Lexer(SpellingFID, InputFile, PP);
    188 
    189   // Now that the lexer is created, change the start/end locations so that we
    190   // just lex the subsection of the file that we want.  This is lexing from a
    191   // scratch buffer.
    192   const char *StrData = SM.getCharacterData(SpellingLoc);
    193   // Only BufferPtr and BufferEnd are narrowed; BufferStart is left untouched.
    194   L->BufferPtr = StrData;
    195   L->BufferEnd = StrData+TokLen;
    196   assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!");
    197 
    198   // Set the SourceLocation with the remapping information.  This ensures that
    199   // GetMappedTokenLoc will remap the tokens as they are lexed.
    200   L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID),
    201                                      ExpansionLocStart,
    202                                      ExpansionLocEnd, TokLen);
    203 
    204   // Ensure that the lexer thinks it is inside a directive, so that end \n will
    205   // return an EOD token.
    206   L->ParsingPreprocessorDirective = true;
    207 
    208   // This lexer really is for _Pragma.
    209   L->Is_PragmaLexer = true;
    210   return L; // ownership passes to the caller (see the note above on delete)
    211 }
    212 
    213 
    214 /// Stringify - Return a copy of Str with a '\' inserted before each '\' and
    215 /// each quote (' if Charify, otherwise ").  No surrounding quotes are added.
    216 std::string Lexer::Stringify(const std::string &Str, bool Charify) {
    217   const char Quote = Charify ? '\'' : '"';
    218   std::string Result;
    219   Result.reserve(Str.size()); // lower bound; built in one linear pass
    220   for (std::string::size_type i = 0, e = Str.size(); i != e; ++i) {
    221     if (Str[i] == '\\' || Str[i] == Quote)
    222       Result += '\\'; // escape the character appended next
    223     Result += Str[i];
    224   }
    225   return Result;
    226 }
    227 
    228 /// Stringify - Escape Str in place: every '\' and '"' gets a '\' inserted in
    229 /// front of it.  No surrounding ""'s are added to the string.
    230 void Lexer::Stringify(SmallVectorImpl<char> &Str) {
    231   unsigned Pos = 0, End = Str.size();
    232   while (Pos != End) {
    233     const bool NeedsEscape = Str[Pos] == '\\' || Str[Pos] == '"';
    234     if (NeedsEscape) { Str.insert(Str.begin() + Pos, '\\'); ++Pos; ++End; }
    235     ++Pos;
    236   }
    237 }
    238 
    239 //===----------------------------------------------------------------------===//
    240 // Token Spelling
    241 //===----------------------------------------------------------------------===//
    242 
    243 /// \brief Slow case of getSpelling. Extract the characters comprising the
    244 /// spelling of this token from the provided input buffer into Spelling.
    245 static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
    246                               const LangOptions &LangOpts, char *Spelling) {
    247   assert(Tok.needsCleaning() && "getSpellingSlow called on simple token");
    248   // Length counts bytes written to Spelling; BufEnd is one past the raw token.
    249   size_t Length = 0;
    250   const char *BufEnd = BufPtr + Tok.getLength();
    251 
    252   if (Tok.is(tok::string_literal)) {
    253     // Munch the encoding-prefix and opening double-quote.
    254     while (BufPtr < BufEnd) {
    255       unsigned Size;
    256       Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
    257       BufPtr += Size;
    258       // Stop as soon as the opening quote has been copied.
    259       if (Spelling[Length - 1] == '"')
    260         break;
    261     }
    262 
    263     // Raw string literals need special handling; trigraph expansion and line
    264     // splicing do not occur within their d-char-sequence nor within their
    265     // r-char-sequence.
    266     if (Length >= 2 &&
    267         Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') {
    268       // Search backwards from the end of the token to find the matching closing
    269       // quote.
    270       const char *RawEnd = BufEnd;
    271       do --RawEnd; while (*RawEnd != '"');
    272       size_t RawLength = RawEnd - BufPtr + 1;
    273 
    274       // Everything up to and including the closing quote is copied verbatim.
    275       memcpy(Spelling + Length, BufPtr, RawLength);
    276       Length += RawLength;
    277       BufPtr += RawLength;
    278 
    279       // The rest of the token is lexed normally.
    280     }
    281   }
    282   // Decode whatever remains one character at a time.
    283   while (BufPtr < BufEnd) {
    284     unsigned Size;
    285     Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts);
    286     BufPtr += Size;
    287   }
    288   // A token that needed cleaning must come out shorter than its raw form.
    289   assert(Length < Tok.getLength() &&
    290          "NeedsCleaning flag set on token that didn't need cleaning!");
    291   return Length; // number of characters written to Spelling
    292 }
    293 
    294 /// getSpelling() - Return the 'spelling' of the token that starts at loc:
    295 /// the characters that represent it in the source file after trigraph
    296 /// expansion and escaped-newline folding.  The true, uncanonicalized
    297 /// spelling of things like digraphs and UCNs is what gets returned.  buffer
    298 /// is only used when the token needs cleaning; invalid is only set on failure.
    299 StringRef Lexer::getSpelling(SourceLocation loc,
    300                              SmallVectorImpl<char> &buffer,
    301                              const SourceManager &SM,
    302                              const LangOptions &options,
    303                              bool *invalid) {
    304   // Split the location into (file, offset) and fetch that file's contents.
    305   const std::pair<FileID, unsigned> decomposed = SM.getDecomposedLoc(loc);
    306   bool loadFailed = false;
    307   StringRef contents = SM.getBufferData(decomposed.first, &loadFailed);
    308   if (loadFailed) {
    309     if (invalid)
    310       *invalid = true;
    311     return StringRef();
    312   }
    313 
    314   // Raw-lex exactly one token starting at the requested offset.
    315   const char *start = contents.data() + decomposed.second;
    316   Lexer rawLexer(SM.getLocForStartOfFile(decomposed.first), options,
    317                  contents.begin(), start, contents.end());
    318   Token rawTok;
    319   rawLexer.LexFromRawLexer(rawTok);
    320   const unsigned rawLength = rawTok.getLength();
    321 
    322   // Hard case: the token needs cleaning, so its characters are relexed into
    323   // the caller's buffer, which is then trimmed to the cleaned length.
    324   if (rawTok.needsCleaning()) {
    325     buffer.resize(rawLength);
    326     size_t cleanLen = getSpellingSlow(rawTok, start, options, buffer.data());
    327     buffer.resize(cleanLen);
    328     return StringRef(buffer.data(), buffer.size());
    329   }
    330 
    331   // Common case: the spelling is just the bytes in the file buffer.
    332   return StringRef(start, rawLength);
    333 }
    334 
    335 /// getSpelling() - Return the 'spelling' of Tok as a std::string: the
    336 /// characters that represent the token in the source file after trigraph
    337 /// expansion and escaped-newline folding, keeping the true, uncanonicalized
    338 /// spelling of things like digraphs and UCNs.  If Invalid is non-null it
    339 /// receives whether the token's character data could not be obtained.
    340 std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
    341                                const LangOptions &LangOpts, bool *Invalid) {
    342   assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
    343   // Locate the token's bytes in its source buffer.
    344   bool BadData = false;
    345   const char *Begin = SourceMgr.getCharacterData(Tok.getLocation(), &BadData);
    346   if (Invalid)
    347     *Invalid = BadData;
    348   if (BadData)
    349     return std::string();
    350 
    351   const unsigned RawLength = Tok.getLength();
    352   if (Tok.needsCleaning()) {
    353     // Relex into a string of the raw length, then shrink to what was used.
    354     std::string Cleaned(RawLength, '\0');
    355     Cleaned.resize(getSpellingSlow(Tok, Begin, LangOpts, &*Cleaned.begin()));
    356     return Cleaned;
    357   }
    358   // Nothing to clean: the spelling is exactly the bytes in the buffer.
    359   return std::string(Begin, Begin + RawLength);
    360 }
    361 
    362 /// getSpelling - This method is used to get the spelling of a token into a
    363 /// preallocated buffer, instead of as an std::string.  The caller is required
    364 /// to allocate enough space for the token, which is guaranteed to be at least
    365 /// Tok.getLength() bytes long.  The actual length of the token is returned.
    366 ///
    367 /// Note that this method may do two possible things: it may either fill in
    368 /// the buffer specified with characters, or it may *change the input pointer*
    369 /// to point to a constant buffer with the data already in it (avoiding a
    370 /// copy).  The caller is not allowed to modify the returned buffer pointer
    371 /// if an internal buffer is returned.
    372 unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
    373                             const SourceManager &SourceMgr,
    374                             const LangOptions &LangOpts, bool *Invalid) {
    375   assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
    376   // TokStart stays null until one of the cases below finds the token's bytes.
    377   const char *TokStart = 0;
    378   // NOTE: this has to be checked *before* testing for an IdentifierInfo.
    379   if (Tok.is(tok::raw_identifier))
    380     TokStart = Tok.getRawIdentifierData();
    381   else if (!Tok.hasUCN()) {
    382     if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
    383       // Just return the string from the identifier table, which is very quick.
    384       Buffer = II->getNameStart();
    385       return II->getLength();
    386     }
    387   }
    388 
    389   // NOTE: this can be checked even after testing for an IdentifierInfo.
    390   if (Tok.isLiteral())
    391     TokStart = Tok.getLiteralData();
    392   // Note that *Invalid is written only on this SourceMgr lookup path.
    393   if (TokStart == 0) {
    394     // Compute the start of the token in the input lexer buffer.
    395     bool CharDataInvalid = false;
    396     TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid);
    397     if (Invalid)
    398       *Invalid = CharDataInvalid;
    399     if (CharDataInvalid) {
    400       Buffer = "";
    401       return 0;
    402     }
    403   }
    404 
    405   // If this token contains nothing interesting, return it directly.
    406   if (!Tok.needsCleaning()) {
    407     Buffer = TokStart;
    408     return Tok.getLength();
    409   }
    410 
    411   // Otherwise, hard case, relex the characters into the caller's buffer.
    412   return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer));
    413 }
    414 
    415 
    416 /// MeasureTokenLength - Relex the token at the specified location and return
    417 /// its length in bytes in the input file, or 0 if it could not be relexed.
    418 /// For a token that needs cleaning (e.g. it includes a trigraph or an escaped
    419 /// newline) the count includes the bytes that are part of that.
    420 unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
    421                                    const SourceManager &SM,
    422                                    const LangOptions &LangOpts) {
    423   Token Relexed;
    424   const bool Failed = getRawToken(Loc, Relexed, SM, LangOpts);
    425   // getRawToken reports failure by returning true.
    426   return Failed ? 0 : Relexed.getLength();
    427 }
    428 
    429 /// \brief Relex the token at the specified location into Result.
    430 /// \returns true if there was a failure, false on success.
    431 bool Lexer::getRawToken(SourceLocation Loc, Token &Result,
    432                         const SourceManager &SM,
    433                         const LangOptions &LangOpts) {
    434   // TODO: this could be special cased for common tokens like identifiers, ')',
    435   // etc to make this faster, if it mattered.  Just look at the first character
    436   // to handle all obviously single-char tokens.  This could use
    437   // Lexer::isObviouslySimpleCharacter for example to handle identifiers or
    438   // something.
    439 
    440   // For a location inside a macro expansion, relex at the expansion point: we
    441   // want the macro name, not the token the macro expanded to.
    442   const SourceLocation ExpansionLoc = SM.getExpansionLoc(Loc);
    443   const std::pair<FileID, unsigned> Where = SM.getDecomposedLoc(ExpansionLoc);
    444   bool BufferInvalid = false;
    445   const StringRef Text = SM.getBufferData(Where.first, &BufferInvalid);
    446   if (BufferInvalid)
    447     return true;
    448 
    449   // A location that points at whitespace is reported as a failure.
    450   const char *TokBegin = Text.data() + Where.second;
    451   if (isWhitespace(TokBegin[0]))
    452     return true;
    453 
    454   // Raw-lex a single token from there, with comment retention switched on.
    455   Lexer RawLexer(SM.getLocForStartOfFile(Where.first), LangOpts,
    456                  Text.begin(), TokBegin, Text.end());
    457   RawLexer.SetCommentRetentionState(true);
    458   RawLexer.LexFromRawLexer(Result);
    459   return false;
    460 }
    461 /// Return the start of the token containing file location Loc, or Loc itself if none is found.
    462 static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
    463                                               const SourceManager &SM,
    464                                               const LangOptions &LangOpts) {
    465   assert(Loc.isFileID());
    466   std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
    467   if (LocInfo.first.isInvalid())
    468     return Loc;
    469   // Fetch the file contents; return Loc unchanged if they are unavailable.
    470   bool Invalid = false;
    471   StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
    472   if (Invalid)
    473     return Loc;
    474 
    475   // Back up from the current location until we hit the beginning of a line
    476   // (or the buffer). We'll relex from that point.
    477   const char *BufStart = Buffer.data();
    478   if (LocInfo.second >= Buffer.size())
    479     return Loc;
    480   // A location sitting on a line terminator is returned unchanged.
    481   const char *StrData = BufStart+LocInfo.second;
    482   if (StrData[0] == '\n' || StrData[0] == '\r')
    483     return Loc;
    484   // Walk back to just after the previous '\n' or '\r', or to the buffer start.
    485   const char *LexStart = StrData;
    486   while (LexStart != BufStart) {
    487     if (LexStart[0] == '\n' || LexStart[0] == '\r') {
    488       ++LexStart;
    489       break;
    490     }
    491 
    492     --LexStart;
    493   }
    494   // Create a raw lexer positioned at LexStart.  LexerStartLoc is Loc moved
    495   // back by its own offset, i.e. the location of the buffer's first byte.
    496   SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
    497   Lexer TheLexer(LexerStartLoc, LangOpts, BufStart, LexStart, Buffer.end());
    498   TheLexer.SetCommentRetentionState(true);
    499 
    500   // Lex tokens until we find the token that contains the source location.
    501   Token TheTok;
    502   do {
    503     TheLexer.LexFromRawLexer(TheTok);
    504 
    505     if (TheLexer.getBufferLocation() > StrData) {
    506       // Lexing this token has taken the lexer past the source location we're
    507       // looking for. If the current token encompasses our source location,
    508       // return the beginning of that token.
    509       if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
    510         return TheTok.getLocation();
    511 
    512       // We ended up skipping over the source location entirely, which means
    513       // that it points into whitespace. We're done here.
    514       break;
    515     }
    516   } while (TheTok.getKind() != tok::eof);
    517 
    518   // We've passed our source location; just return the original source location.
    519   return Loc;
    520 }
    521 
    522 SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
    523                                           const SourceManager &SM,
    524                                           const LangOptions &LangOpts) {
    525   if (Loc.isFileID())
    526     return getBeginningOfFileToken(Loc, SM, LangOpts);
    527   if (!SM.isMacroArgExpansion(Loc))
    528     return Loc; // other macro locations are returned unchanged
    529 
    530   // Macro argument: find the token start in the spelling file, then move Loc
    531   // back by the same distance.
    532   const SourceLocation Spelling = SM.getSpellingLoc(Loc);
    533   const SourceLocation Begin = getBeginningOfFileToken(Spelling, SM, LangOpts);
    534   std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(Spelling);
    535   std::pair<FileID, unsigned> BeginFileLocInfo = SM.getDecomposedLoc(Begin);
    536   assert(FileLocInfo.first == BeginFileLocInfo.first &&
    537          FileLocInfo.second >= BeginFileLocInfo.second);
    538   return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
    539 }
    540 
    541 namespace { // Classification of directive names used by Lexer::ComputePreamble.
    542   enum PreambleDirectiveKind {
    543     PDK_Skipped, // e.g. include, define, pragma, elif, else: stepped over
    544     PDK_StartIf, // if, ifdef, ifndef: opens a conditional
    545     PDK_EndIf,   // endif: closes a conditional
    546     PDK_Unknown  // any other directive name
    547   };
    548 }
    549 /// Compute the extent of Buffer's preamble: its leading run of comments and recognized directives.
    550 std::pair<unsigned, bool>
    551 Lexer::ComputePreamble(const llvm::MemoryBuffer *Buffer,
    552                        const LangOptions &LangOpts, unsigned MaxLines) {
    553   // Create a lexer starting at the beginning of the file. Note that we use a
    554   // "fake" file source location at offset 1 so that the lexer will track our
    555   // position within the file.
    556   const unsigned StartOffset = 1;
    557   SourceLocation FileLoc = SourceLocation::getFromRawEncoding(StartOffset);
    558   Lexer TheLexer(FileLoc, LangOpts, Buffer->getBufferStart(),
    559                  Buffer->getBufferStart(), Buffer->getBufferEnd());
    560   TheLexer.SetCommentRetentionState(true);
    561 
    562   // StartLoc will differ from FileLoc if there is a BOM that was skipped.
    563   SourceLocation StartLoc = TheLexer.getSourceLocation();
    564 
    565   bool InPreprocessorDirective = false;
    566   Token TheTok;
    567   Token IfStartTok;
    568   unsigned IfCount = 0;
    569   SourceLocation ActiveCommentLoc;
    570   // MaxLineOffset: offset just past the MaxLines-th '\n'; 0 means "no limit".
    571   unsigned MaxLineOffset = 0;
    572   if (MaxLines) {
    573     const char *CurPtr = Buffer->getBufferStart();
    574     unsigned CurLine = 0;
    575     while (CurPtr != Buffer->getBufferEnd()) {
    576       char ch = *CurPtr++;
    577       if (ch == '\n') {
    578         ++CurLine;
    579         if (CurLine == MaxLines)
    580           break;
    581       }
    582     }
    583     if (CurPtr != Buffer->getBufferEnd())
    584       MaxLineOffset = CurPtr - Buffer->getBufferStart();
    585   }
    586 
    587   do {
    588     TheLexer.LexFromRawLexer(TheTok);
    589 
    590     if (InPreprocessorDirective) {
    591       // If we've hit the end of the file, we're done.
    592       if (TheTok.getKind() == tok::eof) {
    593         break;
    594       }
    595 
    596       // If we haven't hit the end of the preprocessor directive, skip this
    597       // token.
    598       if (!TheTok.isAtStartOfLine())
    599         continue;
    600 
    601       // We've passed the end of the preprocessor directive, and will look
    602       // at this token again below.
    603       InPreprocessorDirective = false;
    604     }
    605 
    606     // Enforce the MaxLines limit, checked at each token that starts a line.
    607     if (TheTok.isAtStartOfLine()) {
    608       unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;
    609 
    610       // If we were asked to limit the number of lines in the preamble,
    611       // and we're about to exceed that limit, we're done.
    612       if (MaxLineOffset && TokOffset >= MaxLineOffset)
    613         break;
    614     }
    615 
    616     // Comments are okay; skip them, remembering where the current run began.
    617     if (TheTok.getKind() == tok::comment) {
    618       if (ActiveCommentLoc.isInvalid())
    619         ActiveCommentLoc = TheTok.getLocation();
    620       continue;
    621     }
    622 
    623     if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) {
    624       // This is the start of a preprocessor directive.
    625       Token HashTok = TheTok;
    626       InPreprocessorDirective = true;
    627       ActiveCommentLoc = SourceLocation();
    628 
    629       // Figure out which directive this is. Since we're lexing raw tokens,
    630       // we don't have an identifier table available. Instead, just look at
    631       // the raw identifier to recognize and categorize preprocessor directives.
    632       TheLexer.LexFromRawLexer(TheTok);
    633       if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) {
    634         StringRef Keyword(TheTok.getRawIdentifierData(),
    635                                 TheTok.getLength());
    636         PreambleDirectiveKind PDK
    637           = llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
    638               .Case("include", PDK_Skipped)
    639               .Case("__include_macros", PDK_Skipped)
    640               .Case("define", PDK_Skipped)
    641               .Case("undef", PDK_Skipped)
    642               .Case("line", PDK_Skipped)
    643               .Case("error", PDK_Skipped)
    644               .Case("pragma", PDK_Skipped)
    645               .Case("import", PDK_Skipped)
    646               .Case("include_next", PDK_Skipped)
    647               .Case("warning", PDK_Skipped)
    648               .Case("ident", PDK_Skipped)
    649               .Case("sccs", PDK_Skipped)
    650               .Case("assert", PDK_Skipped)
    651               .Case("unassert", PDK_Skipped)
    652               .Case("if", PDK_StartIf)
    653               .Case("ifdef", PDK_StartIf)
    654               .Case("ifndef", PDK_StartIf)
    655               .Case("elif", PDK_Skipped)
    656               .Case("else", PDK_Skipped)
    657               .Case("endif", PDK_EndIf)
    658               .Default(PDK_Unknown);
    659         // 'continue' resumes the lexing loop; 'break' only leaves the switch.
    660         switch (PDK) {
    661         case PDK_Skipped:
    662           continue;
    663 
    664         case PDK_StartIf:
    665           if (IfCount == 0)
    666             IfStartTok = HashTok;
    667 
    668           ++IfCount;
    669           continue;
    670 
    671         case PDK_EndIf:
    672           // Mismatched #endif. The preamble ends here.
    673           if (IfCount == 0)
    674             break;
    675 
    676           --IfCount;
    677           continue;
    678 
    679         case PDK_Unknown:
    680           // We don't know what this directive is; stop at the '#'.
    681           break;
    682         }
    683       }
    684 
    685       // We only end up here if we didn't recognize the preprocessor
    686       // directive or it was one that can't occur in the preamble at this
    687       // point. Roll back the current token to the location of the '#'.
    688       InPreprocessorDirective = false;
    689       TheTok = HashTok;
    690     }
    691 
    692     // We hit a token that we don't recognize as being in the
    693     // "preprocessing only" part of the file, so we're no longer in
    694     // the preamble.
    695     break;
    696   } while (true);
    697   // Pick where the preamble ends.
    698   SourceLocation End;
    699   if (IfCount)
    700     End = IfStartTok.getLocation(); // cut before the outermost unclosed #if
    701   else if (ActiveCommentLoc.isValid())
    702     End = ActiveCommentLoc; // don't truncate a decl comment.
    703   else
    704     End = TheTok.getLocation();
    705   // Result: (byte length measured from StartLoc, whether the end token starts a line).
    706   return std::make_pair(End.getRawEncoding() - StartLoc.getRawEncoding(),
    707                         IfCount? IfStartTok.isAtStartOfLine()
    708                                : TheTok.isAtStartOfLine());
    709 }
    710 
    711 
    712 /// AdvanceToTokenCharacter - Given a location that specifies the start of a
    713 /// token, return the location of character number CharNo (0-based) within it.
    714 SourceLocation Lexer::AdvanceToTokenCharacter(SourceLocation TokStart,
    715                                               unsigned CharNo,
    716                                               const SourceManager &SM,
    717                                               const LangOptions &LangOpts) {
    718   // Figure out how many physical characters away the specified expansion
    719   // character is.  This needs to take into consideration newlines and
    720   // trigraphs.
    721   bool Invalid = false;
    722   const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);
    723   // Invalid character data also returns TokStart unchanged.
    724   // If they request the first char of the token, we're trivially done.
    725   if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr)))
    726     return TokStart;
    727   // PhysOffset counts the bytes of the buffer consumed so far.
    728   unsigned PhysOffset = 0;
    729 
    730   // The usual case is that tokens don't contain anything interesting.  Skip
    731   // over the uninteresting characters.  If a token only consists of simple
    732   // chars, this method is extremely fast.
    733   while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {
    734     if (CharNo == 0)
    735       return TokStart.getLocWithOffset(PhysOffset);
    736     ++TokPtr, --CharNo, ++PhysOffset;
    737   }
    738 
    739   // If we have a character that may be a trigraph or escaped newline, use a
    740   // lexer to parse it correctly.
    741   for (; CharNo; --CharNo) {
    742     unsigned Size;
    743     Lexer::getCharAndSizeNoWarn(TokPtr, Size, LangOpts);
    744     TokPtr += Size;
    745     PhysOffset += Size;
    746   }
    747 
    748   // Final detail: if we end up on an escaped newline, we want to return the
    749   // location of the actual byte of the token.  For example foo\<newline>bar
    750   // advanced by 3 should return the location of b, not of \\.  One compounding
    751   // detail of this is that the escape may be made by a trigraph.
    752   if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
    753     PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;
    754 
    755   return TokStart.getLocWithOffset(PhysOffset);
    756 }
    757 
    758 /// \brief Computes the source location just past the end of the
    759 /// token at this source location.
    760 ///
    761 /// This routine can be used to produce a source location that
    762 /// points just past the end of the token referenced by \p Loc, and
    763 /// is generally used when a diagnostic needs to point just after a
    764 /// token where it expected something different that it received. If
    765 /// the returned source location would not be meaningful (e.g., if
    766 /// it points into a macro), this routine returns an invalid
    767 /// source location.
    768 ///
    769 /// \param Offset an offset from the end of the token, where the source
    770 /// location should refer to. The default offset (0) produces a source
    771 /// location pointing just past the end of the token; an offset of 1 produces
    772 /// a source location pointing to the last character in the token, etc.
    773 SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
    774                                           const SourceManager &SM,
    775                                           const LangOptions &LangOpts) {
    776   if (Loc.isInvalid())
    777     return SourceLocation();
    778 
    779   if (Loc.isMacroID()) {
    780     if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
    781       return SourceLocation(); // Points inside the macro expansion.
    782   }
    783 
    784   unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
    785   if (Len > Offset)
    786     Len = Len - Offset;
    787   else
    788     return Loc;
    789 
    790   return Loc.getLocWithOffset(Len);
    791 }
    792 
    793 /// \brief Returns true if the given MacroID location points at the first
    794 /// token of the macro expansion.
    795 bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc,
    796                                       const SourceManager &SM,
    797                                       const LangOptions &LangOpts,
    798                                       SourceLocation *MacroBegin) {
    799   assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
    800 
    801   SourceLocation expansionLoc;
    802   if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc))
    803     return false;
    804 
    805   if (expansionLoc.isFileID()) {
    806     // No other macro expansions, this is the first.
    807     if (MacroBegin)
    808       *MacroBegin = expansionLoc;
    809     return true;
    810   }
    811 
    812   return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin);
    813 }
    814 
    815 /// \brief Returns true if the given MacroID location points at the last
    816 /// token of the macro expansion.
    817 bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc,
    818                                     const SourceManager &SM,
    819                                     const LangOptions &LangOpts,
    820                                     SourceLocation *MacroEnd) {
    821   assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
    822 
    823   SourceLocation spellLoc = SM.getSpellingLoc(loc);
    824   unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts);
    825   if (tokLen == 0)
    826     return false;
    827 
    828   SourceLocation afterLoc = loc.getLocWithOffset(tokLen);
    829   SourceLocation expansionLoc;
    830   if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc))
    831     return false;
    832 
    833   if (expansionLoc.isFileID()) {
    834     // No other macro expansions.
    835     if (MacroEnd)
    836       *MacroEnd = expansionLoc;
    837     return true;
    838   }
    839 
    840   return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd);
    841 }
    842 
    843 static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range,
    844                                              const SourceManager &SM,
    845                                              const LangOptions &LangOpts) {
    846   SourceLocation Begin = Range.getBegin();
    847   SourceLocation End = Range.getEnd();
    848   assert(Begin.isFileID() && End.isFileID());
    849   if (Range.isTokenRange()) {
    850     End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts);
    851     if (End.isInvalid())
    852       return CharSourceRange();
    853   }
    854 
    855   // Break down the source locations.
    856   FileID FID;
    857   unsigned BeginOffs;
    858   llvm::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin);
    859   if (FID.isInvalid())
    860     return CharSourceRange();
    861 
    862   unsigned EndOffs;
    863   if (!SM.isInFileID(End, FID, &EndOffs) ||
    864       BeginOffs > EndOffs)
    865     return CharSourceRange();
    866 
    867   return CharSourceRange::getCharRange(Begin, End);
    868 }
    869 
    870 CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
    871                                          const SourceManager &SM,
    872                                          const LangOptions &LangOpts) {
    873   SourceLocation Begin = Range.getBegin();
    874   SourceLocation End = Range.getEnd();
    875   if (Begin.isInvalid() || End.isInvalid())
    876     return CharSourceRange();
    877 
    878   if (Begin.isFileID() && End.isFileID())
    879     return makeRangeFromFileLocs(Range, SM, LangOpts);
    880 
    881   if (Begin.isMacroID() && End.isFileID()) {
    882     if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin))
    883       return CharSourceRange();
    884     Range.setBegin(Begin);
    885     return makeRangeFromFileLocs(Range, SM, LangOpts);
    886   }
    887 
    888   if (Begin.isFileID() && End.isMacroID()) {
    889     if ((Range.isTokenRange() && !isAtEndOfMacroExpansion(End, SM, LangOpts,
    890                                                           &End)) ||
    891         (Range.isCharRange() && !isAtStartOfMacroExpansion(End, SM, LangOpts,
    892                                                            &End)))
    893       return CharSourceRange();
    894     Range.setEnd(End);
    895     return makeRangeFromFileLocs(Range, SM, LangOpts);
    896   }
    897 
    898   assert(Begin.isMacroID() && End.isMacroID());
    899   SourceLocation MacroBegin, MacroEnd;
    900   if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) &&
    901       ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts,
    902                                                         &MacroEnd)) ||
    903        (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts,
    904                                                          &MacroEnd)))) {
    905     Range.setBegin(MacroBegin);
    906     Range.setEnd(MacroEnd);
    907     return makeRangeFromFileLocs(Range, SM, LangOpts);
    908   }
    909 
    910   bool Invalid = false;
    911   const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin),
    912                                                         &Invalid);
    913   if (Invalid)
    914     return CharSourceRange();
    915 
    916   if (BeginEntry.getExpansion().isMacroArgExpansion()) {
    917     const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End),
    918                                                         &Invalid);
    919     if (Invalid)
    920       return CharSourceRange();
    921 
    922     if (EndEntry.getExpansion().isMacroArgExpansion() &&
    923         BeginEntry.getExpansion().getExpansionLocStart() ==
    924             EndEntry.getExpansion().getExpansionLocStart()) {
    925       Range.setBegin(SM.getImmediateSpellingLoc(Begin));
    926       Range.setEnd(SM.getImmediateSpellingLoc(End));
    927       return makeFileCharRange(Range, SM, LangOpts);
    928     }
    929   }
    930 
    931   return CharSourceRange();
    932 }
    933 
    934 StringRef Lexer::getSourceText(CharSourceRange Range,
    935                                const SourceManager &SM,
    936                                const LangOptions &LangOpts,
    937                                bool *Invalid) {
    938   Range = makeFileCharRange(Range, SM, LangOpts);
    939   if (Range.isInvalid()) {
    940     if (Invalid) *Invalid = true;
    941     return StringRef();
    942   }
    943 
    944   // Break down the source location.
    945   std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin());
    946   if (beginInfo.first.isInvalid()) {
    947     if (Invalid) *Invalid = true;
    948     return StringRef();
    949   }
    950 
    951   unsigned EndOffs;
    952   if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||
    953       beginInfo.second > EndOffs) {
    954     if (Invalid) *Invalid = true;
    955     return StringRef();
    956   }
    957 
    958   // Try to the load the file buffer.
    959   bool invalidTemp = false;
    960   StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);
    961   if (invalidTemp) {
    962     if (Invalid) *Invalid = true;
    963     return StringRef();
    964   }
    965 
    966   if (Invalid) *Invalid = false;
    967   return file.substr(beginInfo.second, EndOffs - beginInfo.second);
    968 }
    969 
    970 StringRef Lexer::getImmediateMacroName(SourceLocation Loc,
    971                                        const SourceManager &SM,
    972                                        const LangOptions &LangOpts) {
    973   assert(Loc.isMacroID() && "Only reasonble to call this on macros");
    974 
    975   // Find the location of the immediate macro expansion.
    976   while (1) {
    977     FileID FID = SM.getFileID(Loc);
    978     const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
    979     const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
    980     Loc = Expansion.getExpansionLocStart();
    981     if (!Expansion.isMacroArgExpansion())
    982       break;
    983 
    984     // For macro arguments we need to check that the argument did not come
    985     // from an inner macro, e.g: "MAC1( MAC2(foo) )"
    986 
    987     // Loc points to the argument id of the macro definition, move to the
    988     // macro expansion.
    989     Loc = SM.getImmediateExpansionRange(Loc).first;
    990     SourceLocation SpellLoc = Expansion.getSpellingLoc();
    991     if (SpellLoc.isFileID())
    992       break; // No inner macro.
    993 
    994     // If spelling location resides in the same FileID as macro expansion
    995     // location, it means there is no inner macro.
    996     FileID MacroFID = SM.getFileID(Loc);
    997     if (SM.isInFileID(SpellLoc, MacroFID))
    998       break;
    999 
   1000     // Argument came from inner macro.
   1001     Loc = SpellLoc;
   1002   }
   1003 
   1004   // Find the spelling location of the start of the non-argument expansion
   1005   // range. This is where the macro name was spelled in order to begin
   1006   // expanding this macro.
   1007   Loc = SM.getSpellingLoc(Loc);
   1008 
   1009   // Dig out the buffer where the macro name was spelled and the extents of the
   1010   // name so that we can render it into the expansion note.
   1011   std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
   1012   unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
   1013   StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
   1014   return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
   1015 }
   1016 
   1017 bool Lexer::isIdentifierBodyChar(char c, const LangOptions &LangOpts) {
   1018   return isIdentifierBody(c, LangOpts.DollarIdents);
   1019 }
   1020 
   1021 
   1022 //===----------------------------------------------------------------------===//
   1023 // Diagnostics forwarding code.
   1024 //===----------------------------------------------------------------------===//
   1025 
   1026 /// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
   1027 /// lexer buffer was all expanded at a single point, perform the mapping.
   1028 /// This is currently only used for _Pragma implementation, so it is the slow
   1029 /// path of the hot getSourceLocation method.  Do not allow it to be inlined.
   1030 static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(
   1031     Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen);
   1032 static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
   1033                                         SourceLocation FileLoc,
   1034                                         unsigned CharNo, unsigned TokLen) {
   1035   assert(FileLoc.isMacroID() && "Must be a macro expansion");
   1036 
   1037   // Otherwise, we're lexing "mapped tokens".  This is used for things like
   1038   // _Pragma handling.  Combine the expansion location of FileLoc with the
   1039   // spelling location.
   1040   SourceManager &SM = PP.getSourceManager();
   1041 
   1042   // Create a new SLoc which is expanded from Expansion(FileLoc) but whose
   1043   // characters come from spelling(FileLoc)+Offset.
   1044   SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc);
   1045   SpellingLoc = SpellingLoc.getLocWithOffset(CharNo);
   1046 
   1047   // Figure out the expansion loc range, which is the range covered by the
   1048   // original _Pragma(...) sequence.
   1049   std::pair<SourceLocation,SourceLocation> II =
   1050     SM.getImmediateExpansionRange(FileLoc);
   1051 
   1052   return SM.createExpansionLoc(SpellingLoc, II.first, II.second, TokLen);
   1053 }
   1054 
   1055 /// getSourceLocation - Return a source location identifier for the specified
   1056 /// offset in the current file.
   1057 SourceLocation Lexer::getSourceLocation(const char *Loc,
   1058                                         unsigned TokLen) const {
   1059   assert(Loc >= BufferStart && Loc <= BufferEnd &&
   1060          "Location out of range for this buffer!");
   1061 
   1062   // In the normal case, we're just lexing from a simple file buffer, return
   1063   // the file id from FileLoc with the offset specified.
   1064   unsigned CharNo = Loc-BufferStart;
   1065   if (FileLoc.isFileID())
   1066     return FileLoc.getLocWithOffset(CharNo);
   1067 
   1068   // Otherwise, this is the _Pragma lexer case, which pretends that all of the
   1069   // tokens are lexed from where the _Pragma was defined.
   1070   assert(PP && "This doesn't work on raw lexers");
   1071   return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen);
   1072 }
   1073 
   1074 /// Diag - Forwarding function for diagnostics.  This translate a source
   1075 /// position in the current buffer into a SourceLocation object for rendering.
   1076 DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
   1077   return PP->Diag(getSourceLocation(Loc), DiagID);
   1078 }
   1079 
   1080 //===----------------------------------------------------------------------===//
   1081 // Trigraph and Escaped Newline Handling Code.
   1082 //===----------------------------------------------------------------------===//
   1083 
   1084 /// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair,
   1085 /// return the decoded trigraph letter it corresponds to, or '\0' if nothing.
   1086 static char GetTrigraphCharForLetter(char Letter) {
   1087   switch (Letter) {
   1088   default:   return 0;
   1089   case '=':  return '#';
   1090   case ')':  return ']';
   1091   case '(':  return '[';
   1092   case '!':  return '|';
   1093   case '\'': return '^';
   1094   case '>':  return '}';
   1095   case '/':  return '\\';
   1096   case '<':  return '{';
   1097   case '-':  return '~';
   1098   }
   1099 }
   1100 
   1101 /// DecodeTrigraphChar - If the specified character is a legal trigraph when
   1102 /// prefixed with ??, emit a trigraph warning.  If trigraphs are enabled,
   1103 /// return the result character.  Finally, emit a warning about trigraph use
   1104 /// whether trigraphs are enabled or not.
   1105 static char DecodeTrigraphChar(const char *CP, Lexer *L) {
   1106   char Res = GetTrigraphCharForLetter(*CP);
   1107   if (!Res || !L) return Res;
   1108 
   1109   if (!L->getLangOpts().Trigraphs) {
   1110     if (!L->isLexingRawMode())
   1111       L->Diag(CP-2, diag::trigraph_ignored);
   1112     return 0;
   1113   }
   1114 
   1115   if (!L->isLexingRawMode())
   1116     L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);
   1117   return Res;
   1118 }
   1119 
   1120 /// getEscapedNewLineSize - Return the size of the specified escaped newline,
   1121 /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a
   1122 /// trigraph equivalent on entry to this function.
   1123 unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
   1124   unsigned Size = 0;
   1125   while (isWhitespace(Ptr[Size])) {
   1126     ++Size;
   1127 
   1128     if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r')
   1129       continue;
   1130 
   1131     // If this is a \r\n or \n\r, skip the other half.
   1132     if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') &&
   1133         Ptr[Size-1] != Ptr[Size])
   1134       ++Size;
   1135 
   1136     return Size;
   1137   }
   1138 
   1139   // Not an escaped newline, must be a \t or something else.
   1140   return 0;
   1141 }
   1142 
   1143 /// SkipEscapedNewLines - If P points to an escaped newline (or a series of
   1144 /// them), skip over them and return the first non-escaped-newline found,
   1145 /// otherwise return P.
   1146 const char *Lexer::SkipEscapedNewLines(const char *P) {
   1147   while (1) {
   1148     const char *AfterEscape;
   1149     if (*P == '\\') {
   1150       AfterEscape = P+1;
   1151     } else if (*P == '?') {
   1152       // If not a trigraph for escape, bail out.
   1153       if (P[1] != '?' || P[2] != '/')
   1154         return P;
   1155       AfterEscape = P+3;
   1156     } else {
   1157       return P;
   1158     }
   1159 
   1160     unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
   1161     if (NewLineSize == 0) return P;
   1162     P = AfterEscape+NewLineSize;
   1163   }
   1164 }
   1165 
   1166 /// \brief Checks that the given token is the first token that occurs after the
   1167 /// given location (this excludes comments and whitespace). Returns the location
   1168 /// immediately after the specified token. If the token is not found or the
   1169 /// location is inside a macro, the returned source location will be invalid.
   1170 SourceLocation Lexer::findLocationAfterToken(SourceLocation Loc,
   1171                                         tok::TokenKind TKind,
   1172                                         const SourceManager &SM,
   1173                                         const LangOptions &LangOpts,
   1174                                         bool SkipTrailingWhitespaceAndNewLine) {
   1175   if (Loc.isMacroID()) {
   1176     if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
   1177       return SourceLocation();
   1178   }
   1179   Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts);
   1180 
   1181   // Break down the source location.
   1182   std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
   1183 
   1184   // Try to load the file buffer.
   1185   bool InvalidTemp = false;
   1186   StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
   1187   if (InvalidTemp)
   1188     return SourceLocation();
   1189 
   1190   const char *TokenBegin = File.data() + LocInfo.second;
   1191 
   1192   // Lex from the start of the given location.
   1193   Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
   1194                                       TokenBegin, File.end());
   1195   // Find the token.
   1196   Token Tok;
   1197   lexer.LexFromRawLexer(Tok);
   1198   if (Tok.isNot(TKind))
   1199     return SourceLocation();
   1200   SourceLocation TokenLoc = Tok.getLocation();
   1201 
   1202   // Calculate how much whitespace needs to be skipped if any.
   1203   unsigned NumWhitespaceChars = 0;
   1204   if (SkipTrailingWhitespaceAndNewLine) {
   1205     const char *TokenEnd = SM.getCharacterData(TokenLoc) +
   1206                            Tok.getLength();
   1207     unsigned char C = *TokenEnd;
   1208     while (isHorizontalWhitespace(C)) {
   1209       C = *(++TokenEnd);
   1210       NumWhitespaceChars++;
   1211     }
   1212 
   1213     // Skip \r, \n, \r\n, or \n\r
   1214     if (C == '\n' || C == '\r') {
   1215       char PrevC = C;
   1216       C = *(++TokenEnd);
   1217       NumWhitespaceChars++;
   1218       if ((C == '\n' || C == '\r') && C != PrevC)
   1219         NumWhitespaceChars++;
   1220     }
   1221   }
   1222 
   1223   return TokenLoc.getLocWithOffset(Tok.getLength() + NumWhitespaceChars);
   1224 }
   1225 
   1226 /// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
   1227 /// get its size, and return it.  This is tricky in several cases:
   1228 ///   1. If currently at the start of a trigraph, we warn about the trigraph,
   1229 ///      then either return the trigraph (skipping 3 chars) or the '?',
   1230 ///      depending on whether trigraphs are enabled or not.
   1231 ///   2. If this is an escaped newline (potentially with whitespace between
   1232 ///      the backslash and newline), implicitly skip the newline and return
   1233 ///      the char after it.
   1234 ///
   1235 /// This handles the slow/uncommon case of the getCharAndSize method.  Here we
   1236 /// know that we can accumulate into Size, and that we have already incremented
   1237 /// Ptr by Size bytes.
   1238 ///
   1239 /// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
   1240 /// be updated to match.
   1241 ///
   1242 char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
   1243                                Token *Tok) {
   1244   // If we have a slash, look for an escaped newline.
   1245   if (Ptr[0] == '\\') {
   1246     ++Size;
   1247     ++Ptr;
   1248 Slash:
   1249     // Common case, backslash-char where the char is not whitespace.
   1250     if (!isWhitespace(Ptr[0])) return '\\';
   1251 
   1252     // See if we have optional whitespace characters between the slash and
   1253     // newline.
   1254     if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
   1255       // Remember that this token needs to be cleaned.
   1256       if (Tok) Tok->setFlag(Token::NeedsCleaning);
   1257 
   1258       // Warn if there was whitespace between the backslash and newline.
   1259       if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode())
   1260         Diag(Ptr, diag::backslash_newline_space);
   1261 
   1262       // Found backslash<whitespace><newline>.  Parse the char after it.
   1263       Size += EscapedNewLineSize;
   1264       Ptr  += EscapedNewLineSize;
   1265 
   1266       // If the char that we finally got was a \n, then we must have had
   1267       // something like \<newline><newline>.  We don't want to consume the
   1268       // second newline.
   1269       if (*Ptr == '\n' || *Ptr == '\r' || *Ptr == '\0')
   1270         return ' ';
   1271 
   1272       // Use slow version to accumulate a correct size field.
   1273       return getCharAndSizeSlow(Ptr, Size, Tok);
   1274     }
   1275 
   1276     // Otherwise, this is not an escaped newline, just return the slash.
   1277     return '\\';
   1278   }
   1279 
   1280   // If this is a trigraph, process it.
   1281   if (Ptr[0] == '?' && Ptr[1] == '?') {
   1282     // If this is actually a legal trigraph (not something like "??x"), emit
   1283     // a trigraph warning.  If so, and if trigraphs are enabled, return it.
   1284     if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : 0)) {
   1285       // Remember that this token needs to be cleaned.
   1286       if (Tok) Tok->setFlag(Token::NeedsCleaning);
   1287 
   1288       Ptr += 3;
   1289       Size += 3;
   1290       if (C == '\\') goto Slash;
   1291       return C;
   1292     }
   1293   }
   1294 
   1295   // If this is neither, return a single character.
   1296   ++Size;
   1297   return *Ptr;
   1298 }
   1299 
   1300 
   1301 /// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
   1302 /// getCharAndSizeNoWarn method.  Here we know that we can accumulate into Size,
   1303 /// and that we have already incremented Ptr by Size bytes.
   1304 ///
   1305 /// NOTE: When this method is updated, getCharAndSizeSlow (above) should
   1306 /// be updated to match.
   1307 char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
   1308                                      const LangOptions &LangOpts) {
   1309   // If we have a slash, look for an escaped newline.
   1310   if (Ptr[0] == '\\') {
   1311     ++Size;
   1312     ++Ptr;
   1313 Slash:
   1314     // Common case, backslash-char where the char is not whitespace.
   1315     if (!isWhitespace(Ptr[0])) return '\\';
   1316 
   1317     // See if we have optional whitespace characters followed by a newline.
   1318     if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
   1319       // Found backslash<whitespace><newline>.  Parse the char after it.
   1320       Size += EscapedNewLineSize;
   1321       Ptr  += EscapedNewLineSize;
   1322 
   1323       // If the char that we finally got was a \n, then we must have had
   1324       // something like \<newline><newline>.  We don't want to consume the
   1325       // second newline.
   1326       if (*Ptr == '\n' || *Ptr == '\r' || *Ptr == '\0')
   1327         return ' ';
   1328 
   1329       // Use slow version to accumulate a correct size field.
   1330       return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
   1331     }
   1332 
   1333     // Otherwise, this is not an escaped newline, just return the slash.
   1334     return '\\';
   1335   }
   1336 
   1337   // If this is a trigraph, process it.
   1338   if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
   1339     // If this is actually a legal trigraph (not something like "??x"), return
   1340     // it.
   1341     if (char C = GetTrigraphCharForLetter(Ptr[2])) {
   1342       Ptr += 3;
   1343       Size += 3;
   1344       if (C == '\\') goto Slash;
   1345       return C;
   1346     }
   1347   }
   1348 
   1349   // If this is neither, return a single character.
   1350   ++Size;
   1351   return *Ptr;
   1352 }
   1353 
   1354 //===----------------------------------------------------------------------===//
   1355 // Helper methods for lexing.
   1356 //===----------------------------------------------------------------------===//
   1357 
   1358 /// \brief Routine that indiscriminately skips bytes in the source file.
   1359 void Lexer::SkipBytes(unsigned Bytes, bool StartOfLine) {
   1360   BufferPtr += Bytes;
   1361   if (BufferPtr > BufferEnd)
   1362     BufferPtr = BufferEnd;
   1363   IsAtStartOfLine = StartOfLine;
   1364 }
   1365 
   1366 static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts) {
   1367   if (LangOpts.CPlusPlus11 || LangOpts.C11)
   1368     return isCharInSet(C, C11AllowedIDChars);
   1369   else if (LangOpts.CPlusPlus)
   1370     return isCharInSet(C, CXX03AllowedIDChars);
   1371   else
   1372     return isCharInSet(C, C99AllowedIDChars);
   1373 }
   1374 
   1375 static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts) {
   1376   assert(isAllowedIDChar(C, LangOpts));
   1377   if (LangOpts.CPlusPlus11 || LangOpts.C11)
   1378     return !isCharInSet(C, C11DisallowedInitialIDChars);
   1379   else if (LangOpts.CPlusPlus)
   1380     return true;
   1381   else
   1382     return !isCharInSet(C, C99DisallowedInitialIDChars);
   1383 }
   1384 
   1385 static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin,
   1386                                             const char *End) {
   1387   return CharSourceRange::getCharRange(L.getSourceLocation(Begin),
   1388                                        L.getSourceLocation(End));
   1389 }
   1390 
   1391 static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
   1392                                       CharSourceRange Range, bool IsFirst) {
   1393   // Check C99 compatibility.
   1394   if (Diags.getDiagnosticLevel(diag::warn_c99_compat_unicode_id,
   1395                                Range.getBegin()) > DiagnosticsEngine::Ignored) {
   1396     enum {
   1397       CannotAppearInIdentifier = 0,
   1398       CannotStartIdentifier
   1399     };
   1400 
   1401     if (!isCharInSet(C, C99AllowedIDChars)) {
   1402       Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
   1403         << Range
   1404         << CannotAppearInIdentifier;
   1405     } else if (IsFirst && isCharInSet(C, C99DisallowedInitialIDChars)) {
   1406       Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
   1407         << Range
   1408         << CannotStartIdentifier;
   1409     }
   1410   }
   1411 
   1412   // Check C++98 compatibility.
   1413   if (Diags.getDiagnosticLevel(diag::warn_cxx98_compat_unicode_id,
   1414                                Range.getBegin()) > DiagnosticsEngine::Ignored) {
   1415     if (!isCharInSet(C, CXX03AllowedIDChars)) {
   1416       Diags.Report(Range.getBegin(), diag::warn_cxx98_compat_unicode_id)
   1417         << Range;
   1418     }
   1419   }
   1420  }
   1421 
   1422 void Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
   1423   // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
   1424   unsigned Size;
   1425   unsigned char C = *CurPtr++;
   1426   while (isIdentifierBody(C))
   1427     C = *CurPtr++;
   1428 
   1429   --CurPtr;   // Back up over the skipped character.
   1430 
   1431   // Fast path, no $,\,? in identifier found.  '\' might be an escaped newline
   1432   // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
   1433   //
   1434   // TODO: Could merge these checks into an InfoTable flag to make the
   1435   // comparison cheaper
   1436   if (isASCII(C) && C != '\\' && C != '?' &&
   1437       (C != '$' || !LangOpts.DollarIdents)) {
   1438 FinishIdentifier:
   1439     const char *IdStart = BufferPtr;
   1440     FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
   1441     Result.setRawIdentifierData(IdStart);
   1442 
   1443     // If we are in raw mode, return this identifier raw.  There is no need to
   1444     // look up identifier information or attempt to macro expand it.
   1445     if (LexingRawMode)
   1446       return;
   1447 
   1448     // Fill in Result.IdentifierInfo and update the token kind,
   1449     // looking up the identifier in the identifier table.
   1450     IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);
   1451 
   1452     // Finally, now that we know we have an identifier, pass this off to the
   1453     // preprocessor, which may macro expand it or something.
   1454     if (II->isHandleIdentifierCase())
   1455       PP->HandleIdentifier(Result);
   1456 
   1457     return;
   1458   }
   1459 
   1460   // Otherwise, $,\,? in identifier found.  Enter slower path.
   1461 
   1462   C = getCharAndSize(CurPtr, Size);
   1463   while (1) {
   1464     if (C == '$') {
   1465       // If we hit a $ and they are not supported in identifiers, we are done.
   1466       if (!LangOpts.DollarIdents) goto FinishIdentifier;
   1467 
   1468       // Otherwise, emit a diagnostic and continue.
   1469       if (!isLexingRawMode())
   1470         Diag(CurPtr, diag::ext_dollar_in_identifier);
   1471       CurPtr = ConsumeChar(CurPtr, Size, Result);
   1472       C = getCharAndSize(CurPtr, Size);
   1473       continue;
   1474 
   1475     } else if (C == '\\') {
   1476       const char *UCNPtr = CurPtr + Size;
   1477       uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/0);
   1478       if (CodePoint == 0 || !isAllowedIDChar(CodePoint, LangOpts))
   1479         goto FinishIdentifier;
   1480 
   1481       if (!isLexingRawMode()) {
   1482         maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
   1483                                   makeCharRange(*this, CurPtr, UCNPtr),
   1484                                   /*IsFirst=*/false);
   1485       }
   1486 
   1487       Result.setFlag(Token::HasUCN);
   1488       if ((UCNPtr - CurPtr ==  6 && CurPtr[1] == 'u') ||
   1489           (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
   1490         CurPtr = UCNPtr;
   1491       else
   1492         while (CurPtr != UCNPtr)
   1493           (void)getAndAdvanceChar(CurPtr, Result);
   1494 
   1495       C = getCharAndSize(CurPtr, Size);
   1496       continue;
   1497     } else if (!isASCII(C)) {
   1498       const char *UnicodePtr = CurPtr;
   1499       UTF32 CodePoint;
   1500       ConversionResult Result =
   1501           llvm::convertUTF8Sequence((const UTF8 **)&UnicodePtr,
   1502                                     (const UTF8 *)BufferEnd,
   1503                                     &CodePoint,
   1504                                     strictConversion);
   1505       if (Result != conversionOK ||
   1506           !isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts))
   1507         goto FinishIdentifier;
   1508 
   1509       if (!isLexingRawMode()) {
   1510         maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
   1511                                   makeCharRange(*this, CurPtr, UnicodePtr),
   1512                                   /*IsFirst=*/false);
   1513       }
   1514 
   1515       CurPtr = UnicodePtr;
   1516       C = getCharAndSize(CurPtr, Size);
   1517       continue;
   1518     } else if (!isIdentifierBody(C)) {
   1519       goto FinishIdentifier;
   1520     }
   1521 
   1522     // Otherwise, this character is good, consume it.
   1523     CurPtr = ConsumeChar(CurPtr, Size, Result);
   1524 
   1525     C = getCharAndSize(CurPtr, Size);
   1526     while (isIdentifierBody(C)) {
   1527       CurPtr = ConsumeChar(CurPtr, Size, Result);
   1528       C = getCharAndSize(CurPtr, Size);
   1529     }
   1530   }
   1531 }
   1532 
   1533 /// isHexaLiteral - Return true if Start points to a hex constant.
   1534 /// in microsoft mode (where this is supposed to be several different tokens).
   1535 bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
   1536   unsigned Size;
   1537   char C1 = Lexer::getCharAndSizeNoWarn(Start, Size, LangOpts);
   1538   if (C1 != '0')
   1539     return false;
   1540   char C2 = Lexer::getCharAndSizeNoWarn(Start + Size, Size, LangOpts);
   1541   return (C2 == 'x' || C2 == 'X');
   1542 }
   1543 
   1544 /// LexNumericConstant - Lex the remainder of a integer or floating point
   1545 /// constant. From[-1] is the first character lexed.  Return the end of the
   1546 /// constant.
   1547 void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
   1548   unsigned Size;
   1549   char C = getCharAndSize(CurPtr, Size);
   1550   char PrevCh = 0;
   1551   while (isPreprocessingNumberBody(C)) { // FIXME: UCNs in ud-suffix.
   1552     CurPtr = ConsumeChar(CurPtr, Size, Result);
   1553     PrevCh = C;
   1554     C = getCharAndSize(CurPtr, Size);
   1555   }
   1556 
   1557   // If we fell out, check for a sign, due to 1e+12.  If we have one, continue.
   1558   if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) {
   1559     // If we are in Microsoft mode, don't continue if the constant is hex.
   1560     // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
   1561     if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts))
   1562       return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
   1563   }
   1564 
   1565   // If we have a hex FP constant, continue.
   1566   if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) {
   1567     // Outside C99, we accept hexadecimal floating point numbers as a
   1568     // not-quite-conforming extension. Only do so if this looks like it's
   1569     // actually meant to be a hexfloat, and not if it has a ud-suffix.
   1570     bool IsHexFloat = true;
   1571     if (!LangOpts.C99) {
   1572       if (!isHexaLiteral(BufferPtr, LangOpts))
   1573         IsHexFloat = false;
   1574       else if (std::find(BufferPtr, CurPtr, '_') != CurPtr)
   1575         IsHexFloat = false;
   1576     }
   1577     if (IsHexFloat)
   1578       return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
   1579   }
   1580 
   1581   // Update the location of token as well as BufferPtr.
   1582   const char *TokStart = BufferPtr;
   1583   FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
   1584   Result.setLiteralData(TokStart);
   1585 }
   1586 
   1587 /// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes
   1588 /// in C++11, or warn on a ud-suffix in C++98.
   1589 const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
   1590                                bool IsStringLiteral) {
   1591   assert(getLangOpts().CPlusPlus);
   1592 
   1593   // Maximally munch an identifier. FIXME: UCNs.
   1594   unsigned Size;
   1595   char C = getCharAndSize(CurPtr, Size);
   1596   if (isIdentifierHead(C)) {
   1597     if (!getLangOpts().CPlusPlus11) {
   1598       if (!isLexingRawMode())
   1599         Diag(CurPtr,
   1600              C == '_' ? diag::warn_cxx11_compat_user_defined_literal
   1601                       : diag::warn_cxx11_compat_reserved_user_defined_literal)
   1602           << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
   1603       return CurPtr;
   1604     }
   1605 
   1606     // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
   1607     // that does not start with an underscore is ill-formed. As a conforming
   1608     // extension, we treat all such suffixes as if they had whitespace before
   1609     // them.
   1610     bool IsUDSuffix = false;
   1611     if (C == '_')
   1612       IsUDSuffix = true;
   1613     else if (IsStringLiteral && C == 's' && getLangOpts().CPlusPlus1y) {
   1614       // In C++1y, "s" is a valid ud-suffix for a string literal.
   1615       unsigned NextSize;
   1616       if (!isIdentifierBody(getCharAndSizeNoWarn(CurPtr + Size, NextSize,
   1617                                                  getLangOpts())))
   1618         IsUDSuffix = true;
   1619     }
   1620 
   1621     if (!IsUDSuffix) {
   1622       if (!isLexingRawMode())
   1623         Diag(CurPtr, getLangOpts().MicrosoftMode ?
   1624             diag::ext_ms_reserved_user_defined_literal :
   1625             diag::ext_reserved_user_defined_literal)
   1626           << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
   1627       return CurPtr;
   1628     }
   1629 
   1630     Result.setFlag(Token::HasUDSuffix);
   1631     do {
   1632       CurPtr = ConsumeChar(CurPtr, Size, Result);
   1633       C = getCharAndSize(CurPtr, Size);
   1634     } while (isIdentifierBody(C));
   1635   }
   1636   return CurPtr;
   1637 }
   1638 
   1639 /// LexStringLiteral - Lex the remainder of a string literal, after having lexed
   1640 /// either " or L" or u8" or u" or U".
   1641 void Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
   1642                              tok::TokenKind Kind) {
   1643   const char *NulCharacter = 0; // Does this string contain the \0 character?
   1644 
   1645   if (!isLexingRawMode() &&
   1646       (Kind == tok::utf8_string_literal ||
   1647        Kind == tok::utf16_string_literal ||
   1648        Kind == tok::utf32_string_literal))
   1649     Diag(BufferPtr, getLangOpts().CPlusPlus
   1650            ? diag::warn_cxx98_compat_unicode_literal
   1651            : diag::warn_c99_compat_unicode_literal);
   1652 
   1653   char C = getAndAdvanceChar(CurPtr, Result);
   1654   while (C != '"') {
   1655     // Skip escaped characters.  Escaped newlines will already be processed by
   1656     // getAndAdvanceChar.
   1657     if (C == '\\')
   1658       C = getAndAdvanceChar(CurPtr, Result);
   1659 
   1660     if (C == '\n' || C == '\r' ||             // Newline.
   1661         (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
   1662       if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
   1663         Diag(BufferPtr, diag::ext_unterminated_string);
   1664       FormTokenWithChars(Result, CurPtr-1, tok::unknown);
   1665       return;
   1666     }
   1667 
   1668     if (C == 0) {
   1669       if (isCodeCompletionPoint(CurPtr-1)) {
   1670         PP->CodeCompleteNaturalLanguage();
   1671         FormTokenWithChars(Result, CurPtr-1, tok::unknown);
   1672         return cutOffLexing();
   1673       }
   1674 
   1675       NulCharacter = CurPtr-1;
   1676     }
   1677     C = getAndAdvanceChar(CurPtr, Result);
   1678   }
   1679 
   1680   // If we are in C++11, lex the optional ud-suffix.
   1681   if (getLangOpts().CPlusPlus)
   1682     CurPtr = LexUDSuffix(Result, CurPtr, true);
   1683 
   1684   // If a nul character existed in the string, warn about it.
   1685   if (NulCharacter && !isLexingRawMode())
   1686     Diag(NulCharacter, diag::null_in_string);
   1687 
   1688   // Update the location of the token as well as the BufferPtr instance var.
   1689   const char *TokStart = BufferPtr;
   1690   FormTokenWithChars(Result, CurPtr, Kind);
   1691   Result.setLiteralData(TokStart);
   1692 }
   1693 
   1694 /// LexRawStringLiteral - Lex the remainder of a raw string literal, after
   1695 /// having lexed R", LR", u8R", uR", or UR".
   1696 void Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
   1697                                 tok::TokenKind Kind) {
   1698   // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
   1699   //  Between the initial and final double quote characters of the raw string,
   1700   //  any transformations performed in phases 1 and 2 (trigraphs,
   1701   //  universal-character-names, and line splicing) are reverted.
   1702 
   1703   if (!isLexingRawMode())
   1704     Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);
   1705 
   1706   unsigned PrefixLen = 0;
   1707 
   1708   while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen]))
   1709     ++PrefixLen;
   1710 
   1711   // If the last character was not a '(', then we didn't lex a valid delimiter.
   1712   if (CurPtr[PrefixLen] != '(') {
   1713     if (!isLexingRawMode()) {
   1714       const char *PrefixEnd = &CurPtr[PrefixLen];
   1715       if (PrefixLen == 16) {
   1716         Diag(PrefixEnd, diag::err_raw_delim_too_long);
   1717       } else {
   1718         Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
   1719           << StringRef(PrefixEnd, 1);
   1720       }
   1721     }
   1722 
   1723     // Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
   1724     // it's possible the '"' was intended to be part of the raw string, but
   1725     // there's not much we can do about that.
   1726     while (1) {
   1727       char C = *CurPtr++;
   1728 
   1729       if (C == '"')
   1730         break;
   1731       if (C == 0 && CurPtr-1 == BufferEnd) {
   1732         --CurPtr;
   1733         break;
   1734       }
   1735     }
   1736 
   1737     FormTokenWithChars(Result, CurPtr, tok::unknown);
   1738     return;
   1739   }
   1740 
   1741   // Save prefix and move CurPtr past it
   1742   const char *Prefix = CurPtr;
   1743   CurPtr += PrefixLen + 1; // skip over prefix and '('
   1744 
   1745   while (1) {
   1746     char C = *CurPtr++;
   1747 
   1748     if (C == ')') {
   1749       // Check for prefix match and closing quote.
   1750       if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') {
   1751         CurPtr += PrefixLen + 1; // skip over prefix and '"'
   1752         break;
   1753       }
   1754     } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file.
   1755       if (!isLexingRawMode())
   1756         Diag(BufferPtr, diag::err_unterminated_raw_string)
   1757           << StringRef(Prefix, PrefixLen);
   1758       FormTokenWithChars(Result, CurPtr-1, tok::unknown);
   1759       return;
   1760     }
   1761   }
   1762 
   1763   // If we are in C++11, lex the optional ud-suffix.
   1764   if (getLangOpts().CPlusPlus)
   1765     CurPtr = LexUDSuffix(Result, CurPtr, true);
   1766 
   1767   // Update the location of token as well as BufferPtr.
   1768   const char *TokStart = BufferPtr;
   1769   FormTokenWithChars(Result, CurPtr, Kind);
   1770   Result.setLiteralData(TokStart);
   1771 }
   1772 
   1773 /// LexAngledStringLiteral - Lex the remainder of an angled string literal,
   1774 /// after having lexed the '<' character.  This is used for #include filenames.
   1775 void Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
   1776   const char *NulCharacter = 0; // Does this string contain the \0 character?
   1777   const char *AfterLessPos = CurPtr;
   1778   char C = getAndAdvanceChar(CurPtr, Result);
   1779   while (C != '>') {
   1780     // Skip escaped characters.
   1781     if (C == '\\') {
   1782       // Skip the escaped character.
   1783       getAndAdvanceChar(CurPtr, Result);
   1784     } else if (C == '\n' || C == '\r' ||             // Newline.
   1785                (C == 0 && (CurPtr-1 == BufferEnd ||  // End of file.
   1786                            isCodeCompletionPoint(CurPtr-1)))) {
   1787       // If the filename is unterminated, then it must just be a lone <
   1788       // character.  Return this as such.
   1789       FormTokenWithChars(Result, AfterLessPos, tok::less);
   1790       return;
   1791     } else if (C == 0) {
   1792       NulCharacter = CurPtr-1;
   1793     }
   1794     C = getAndAdvanceChar(CurPtr, Result);
   1795   }
   1796 
   1797   // If a nul character existed in the string, warn about it.
   1798   if (NulCharacter && !isLexingRawMode())
   1799     Diag(NulCharacter, diag::null_in_string);
   1800 
   1801   // Update the location of token as well as BufferPtr.
   1802   const char *TokStart = BufferPtr;
   1803   FormTokenWithChars(Result, CurPtr, tok::angle_string_literal);
   1804   Result.setLiteralData(TokStart);
   1805 }
   1806 
   1807 
   1808 /// LexCharConstant - Lex the remainder of a character constant, after having
   1809 /// lexed either ' or L' or u' or U'.
   1810 void Lexer::LexCharConstant(Token &Result, const char *CurPtr,
   1811                             tok::TokenKind Kind) {
   1812   const char *NulCharacter = 0; // Does this character contain the \0 character?
   1813 
   1814   if (!isLexingRawMode() &&
   1815       (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant))
   1816     Diag(BufferPtr, getLangOpts().CPlusPlus
   1817            ? diag::warn_cxx98_compat_unicode_literal
   1818            : diag::warn_c99_compat_unicode_literal);
   1819 
   1820   char C = getAndAdvanceChar(CurPtr, Result);
   1821   if (C == '\'') {
   1822     if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
   1823       Diag(BufferPtr, diag::ext_empty_character);
   1824     FormTokenWithChars(Result, CurPtr, tok::unknown);
   1825     return;
   1826   }
   1827 
   1828   while (C != '\'') {
   1829     // Skip escaped characters.
   1830     if (C == '\\')
   1831       C = getAndAdvanceChar(CurPtr, Result);
   1832 
   1833     if (C == '\n' || C == '\r' ||             // Newline.
   1834         (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
   1835       if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
   1836         Diag(BufferPtr, diag::ext_unterminated_char);
   1837       FormTokenWithChars(Result, CurPtr-1, tok::unknown);
   1838       return;
   1839     }
   1840 
   1841     if (C == 0) {
   1842       if (isCodeCompletionPoint(CurPtr-1)) {
   1843         PP->CodeCompleteNaturalLanguage();
   1844         FormTokenWithChars(Result, CurPtr-1, tok::unknown);
   1845         return cutOffLexing();
   1846       }
   1847 
   1848       NulCharacter = CurPtr-1;
   1849     }
   1850     C = getAndAdvanceChar(CurPtr, Result);
   1851   }
   1852 
   1853   // If we are in C++11, lex the optional ud-suffix.
   1854   if (getLangOpts().CPlusPlus)
   1855     CurPtr = LexUDSuffix(Result, CurPtr, false);
   1856 
   1857   // If a nul character existed in the character, warn about it.
   1858   if (NulCharacter && !isLexingRawMode())
   1859     Diag(NulCharacter, diag::null_in_char);
   1860 
   1861   // Update the location of token as well as BufferPtr.
   1862   const char *TokStart = BufferPtr;
   1863   FormTokenWithChars(Result, CurPtr, Kind);
   1864   Result.setLiteralData(TokStart);
   1865 }
   1866 
   1867 /// SkipWhitespace - Efficiently skip over a series of whitespace characters.
   1868 /// Update BufferPtr to point to the next non-whitespace character and return.
   1869 ///
   1870 /// This method forms a token and returns true if KeepWhitespaceMode is enabled.
   1871 ///
   1872 bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr) {
   1873   // Whitespace - Skip it, then return the token after the whitespace.
   1874   bool SawNewline = isVerticalWhitespace(CurPtr[-1]);
   1875 
   1876   unsigned char Char = *CurPtr;
   1877 
   1878   // Skip consecutive spaces efficiently.
   1879   while (1) {
   1880     // Skip horizontal whitespace very aggressively.
   1881     while (isHorizontalWhitespace(Char))
   1882       Char = *++CurPtr;
   1883 
   1884     // Otherwise if we have something other than whitespace, we're done.
   1885     if (!isVerticalWhitespace(Char))
   1886       break;
   1887 
   1888     if (ParsingPreprocessorDirective) {
   1889       // End of preprocessor directive line, let LexTokenInternal handle this.
   1890       BufferPtr = CurPtr;
   1891       return false;
   1892     }
   1893 
   1894     // OK, but handle newline.
   1895     SawNewline = true;
   1896     Char = *++CurPtr;
   1897   }
   1898 
   1899   // If the client wants us to return whitespace, return it now.
   1900   if (isKeepWhitespaceMode()) {
   1901     FormTokenWithChars(Result, CurPtr, tok::unknown);
   1902     if (SawNewline)
   1903       IsAtStartOfLine = true;
   1904     // FIXME: The next token will not have LeadingSpace set.
   1905     return true;
   1906   }
   1907 
   1908   // If this isn't immediately after a newline, there is leading space.
   1909   char PrevChar = CurPtr[-1];
   1910   bool HasLeadingSpace = !isVerticalWhitespace(PrevChar);
   1911 
   1912   Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
   1913   if (SawNewline)
   1914     Result.setFlag(Token::StartOfLine);
   1915 
   1916   BufferPtr = CurPtr;
   1917   return false;
   1918 }
   1919 
   1920 /// We have just read the // characters from input.  Skip until we find the
   1921 /// newline character thats terminate the comment.  Then update BufferPtr and
   1922 /// return.
   1923 ///
   1924 /// If we're in KeepCommentMode or any CommentHandler has inserted
   1925 /// some tokens, this will store the first token and return true.
   1926 bool Lexer::SkipLineComment(Token &Result, const char *CurPtr) {
   1927   // If Line comments aren't explicitly enabled for this language, emit an
   1928   // extension warning.
   1929   if (!LangOpts.LineComment && !isLexingRawMode()) {
   1930     Diag(BufferPtr, diag::ext_line_comment);
   1931 
   1932     // Mark them enabled so we only emit one warning for this translation
   1933     // unit.
   1934     LangOpts.LineComment = true;
   1935   }
   1936 
   1937   // Scan over the body of the comment.  The common case, when scanning, is that
   1938   // the comment contains normal ascii characters with nothing interesting in
   1939   // them.  As such, optimize for this case with the inner loop.
   1940   char C;
   1941   do {
   1942     C = *CurPtr;
   1943     // Skip over characters in the fast loop.
   1944     while (C != 0 &&                // Potentially EOF.
   1945            C != '\n' && C != '\r')  // Newline or DOS-style newline.
   1946       C = *++CurPtr;
   1947 
   1948     const char *NextLine = CurPtr;
   1949     if (C != 0) {
   1950       // We found a newline, see if it's escaped.
   1951       const char *EscapePtr = CurPtr-1;
   1952       while (isHorizontalWhitespace(*EscapePtr)) // Skip whitespace.
   1953         --EscapePtr;
   1954 
   1955       if (*EscapePtr == '\\') // Escaped newline.
   1956         CurPtr = EscapePtr;
   1957       else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
   1958                EscapePtr[-2] == '?') // Trigraph-escaped newline.
   1959         CurPtr = EscapePtr-2;
   1960       else
   1961         break; // This is a newline, we're done.
   1962     }
   1963 
   1964     // Otherwise, this is a hard case.  Fall back on getAndAdvanceChar to
   1965     // properly decode the character.  Read it in raw mode to avoid emitting
   1966     // diagnostics about things like trigraphs.  If we see an escaped newline,
   1967     // we'll handle it below.
   1968     const char *OldPtr = CurPtr;
   1969     bool OldRawMode = isLexingRawMode();
   1970     LexingRawMode = true;
   1971     C = getAndAdvanceChar(CurPtr, Result);
   1972     LexingRawMode = OldRawMode;
   1973 
   1974     // If we only read only one character, then no special handling is needed.
   1975     // We're done and can skip forward to the newline.
   1976     if (C != 0 && CurPtr == OldPtr+1) {
   1977       CurPtr = NextLine;
   1978       break;
   1979     }
   1980 
   1981     // If we read multiple characters, and one of those characters was a \r or
   1982     // \n, then we had an escaped newline within the comment.  Emit diagnostic
   1983     // unless the next line is also a // comment.
   1984     if (CurPtr != OldPtr+1 && C != '/' && CurPtr[0] != '/') {
   1985       for (; OldPtr != CurPtr; ++OldPtr)
   1986         if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
   1987           // Okay, we found a // comment that ends in a newline, if the next
   1988           // line is also a // comment, but has spaces, don't emit a diagnostic.
   1989           if (isWhitespace(C)) {
   1990             const char *ForwardPtr = CurPtr;
   1991             while (isWhitespace(*ForwardPtr))  // Skip whitespace.
   1992               ++ForwardPtr;
   1993             if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
   1994               break;
   1995           }
   1996 
   1997           if (!isLexingRawMode())
   1998             Diag(OldPtr-1, diag::ext_multi_line_line_comment);
   1999           break;
   2000         }
   2001     }
   2002 
   2003     if (CurPtr == BufferEnd+1) {
   2004       --CurPtr;
   2005       break;
   2006     }
   2007 
   2008     if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
   2009       PP->CodeCompleteNaturalLanguage();
   2010       cutOffLexing();
   2011       return false;
   2012     }
   2013 
   2014   } while (C != '\n' && C != '\r');
   2015 
   2016   // Found but did not consume the newline.  Notify comment handlers about the
   2017   // comment unless we're in a #if 0 block.
   2018   if (PP && !isLexingRawMode() &&
   2019       PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
   2020                                             getSourceLocation(CurPtr)))) {
   2021     BufferPtr = CurPtr;
   2022     return true; // A token has to be returned.
   2023   }
   2024 
   2025   // If we are returning comments as tokens, return this comment as a token.
   2026   if (inKeepCommentMode())
   2027     return SaveLineComment(Result, CurPtr);
   2028 
   2029   // If we are inside a preprocessor directive and we see the end of line,
   2030   // return immediately, so that the lexer can return this as an EOD token.
   2031   if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
   2032     BufferPtr = CurPtr;
   2033     return false;
   2034   }
   2035 
   2036   // Otherwise, eat the \n character.  We don't care if this is a \n\r or
   2037   // \r\n sequence.  This is an efficiency hack (because we know the \n can't
   2038   // contribute to another token), it isn't needed for correctness.  Note that
   2039   // this is ok even in KeepWhitespaceMode, because we would have returned the
   2040   /// comment above in that mode.
   2041   ++CurPtr;
   2042 
   2043   // The next returned token is at the start of the line.
   2044   Result.setFlag(Token::StartOfLine);
   2045   // No leading whitespace seen so far.
   2046   Result.clearFlag(Token::LeadingSpace);
   2047   BufferPtr = CurPtr;
   2048   return false;
   2049 }
   2050 
   2051 /// If in save-comment mode, package up this Line comment in an appropriate
   2052 /// way and return it.
   2053 bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
   2054   // If we're not in a preprocessor directive, just return the // comment
   2055   // directly.
   2056   FormTokenWithChars(Result, CurPtr, tok::comment);
   2057 
   2058   if (!ParsingPreprocessorDirective || LexingRawMode)
   2059     return true;
   2060 
   2061   // If this Line-style comment is in a macro definition, transmogrify it into
   2062   // a C-style block comment.
   2063   bool Invalid = false;
   2064   std::string Spelling = PP->getSpelling(Result, &Invalid);
   2065   if (Invalid)
   2066     return true;
   2067 
   2068   assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?");
   2069   Spelling[1] = '*';   // Change prefix to "/*".
   2070   Spelling += "*/";    // add suffix.
   2071 
   2072   Result.setKind(tok::comment);
   2073   PP->CreateString(Spelling, Result,
   2074                    Result.getLocation(), Result.getLocation());
   2075   return true;
   2076 }
   2077 
   2078 /// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline
   2079 /// character (either \\n or \\r) is part of an escaped newline sequence.  Issue
   2080 /// a diagnostic if so.  We know that the newline is inside of a block comment.
   2081 static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr,
   2082                                                   Lexer *L) {
   2083   assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');
   2084 
   2085   // Back up off the newline.
   2086   --CurPtr;
   2087 
   2088   // If this is a two-character newline sequence, skip the other character.
   2089   if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
   2090     // \n\n or \r\r -> not escaped newline.
   2091     if (CurPtr[0] == CurPtr[1])
   2092       return false;
   2093     // \n\r or \r\n -> skip the newline.
   2094     --CurPtr;
   2095   }
   2096 
   2097   // If we have horizontal whitespace, skip over it.  We allow whitespace
   2098   // between the slash and newline.
   2099   bool HasSpace = false;
   2100   while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
   2101     --CurPtr;
   2102     HasSpace = true;
   2103   }
   2104 
   2105   // If we have a slash, we know this is an escaped newline.
   2106   if (*CurPtr == '\\') {
   2107     if (CurPtr[-1] != '*') return false;
   2108   } else {
   2109     // It isn't a slash, is it the ?? / trigraph?
   2110     if (CurPtr[0] != '/' || CurPtr[-1] != '?' || CurPtr[-2] != '?' ||
   2111         CurPtr[-3] != '*')
   2112       return false;
   2113 
   2114     // This is the trigraph ending the comment.  Emit a stern warning!
   2115     CurPtr -= 2;
   2116 
   2117     // If no trigraphs are enabled, warn that we ignored this trigraph and
   2118     // ignore this * character.
   2119     if (!L->getLangOpts().Trigraphs) {
   2120       if (!L->isLexingRawMode())
   2121         L->Diag(CurPtr, diag::trigraph_ignored_block_comment);
   2122       return false;
   2123     }
   2124     if (!L->isLexingRawMode())
   2125       L->Diag(CurPtr, diag::trigraph_ends_block_comment);
   2126   }
   2127 
   2128   // Warn about having an escaped newline between the */ characters.
   2129   if (!L->isLexingRawMode())
   2130     L->Diag(CurPtr, diag::escaped_newline_block_comment_end);
   2131 
   2132   // If there was space between the backslash and newline, warn about it.
   2133   if (HasSpace && !L->isLexingRawMode())
   2134     L->Diag(CurPtr, diag::backslash_newline_space);
   2135 
   2136   return true;
   2137 }
   2138 
   2139 #ifdef __SSE2__
   2140 #include <emmintrin.h>
   2141 #elif __ALTIVEC__
   2142 #include <altivec.h>
   2143 #undef bool
   2144 #endif
   2145 
/// We have just read from input the / and * characters that started a comment.
/// Read until we find the * and / characters that terminate the comment.
/// Note that we don't bother decoding trigraphs or escaped newlines in block
/// comments, because they cannot cause the comment to end.  The only thing
/// that can happen is the comment could end with an escaped newline between
/// the terminating * and /.
///
/// \param Result The token that is filled in when a token has to be returned.
/// \param CurPtr Points just past the "/*" that opened the comment.
///
/// \returns true if Result must be returned by the caller: the comment itself
/// in keep-comment mode, an 'unknown' token for an unterminated comment in
/// keep-whitespace mode, or when PP->HandleComment reports that a token has to
/// be returned.  Returns false when the comment was skipped (BufferPtr is
/// moved past it), when an unterminated comment was diagnosed and dropped, or
/// when lexing was cut off at a code-completion point.
bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) {
  // Scan one character past where we should, looking for a '/' character.  Once
  // we find it, check to see if it was preceded by a *.  This common
  // optimization helps people who like to put a lot of * characters in their
  // comments.

  // The first character we get with newlines and trigraphs skipped to handle
  // the degenerate /*/ case below correctly if the * has an escaped newline
  // after it.
  unsigned CharSize;
  unsigned char C = getCharAndSize(CurPtr, CharSize);
  CurPtr += CharSize;
  // A NUL that was read from BufferEnd itself (CurPtr is now one past it) means
  // the buffer ended right after the "/*".
  if (C == 0 && CurPtr == BufferEnd+1) {
    if (!isLexingRawMode())
      Diag(BufferPtr, diag::err_unterminated_block_comment);
    // Back up so that CurPtr points at BufferEnd again.
    --CurPtr;

    // KeepWhitespaceMode should return this broken comment as a token.  Since
    // it isn't a well formed comment, just return it as an 'unknown' token.
    if (isKeepWhitespaceMode()) {
      FormTokenWithChars(Result, CurPtr, tok::unknown);
      return true;
    }

    BufferPtr = CurPtr;
    return false;
  }

  // Check to see if the first character after the '/*' is another /.  If so,
  // then this slash does not end the block comment, it is part of it.
  if (C == '/')
    C = *CurPtr++;

  // Invariant for the loop below: C is the character most recently read and
  // CurPtr points one past it.
  while (1) {
    // Skip over all non-interesting characters until we find end of buffer or a
    // (probably ending) '/' character.
    if (CurPtr + 24 < BufferEnd &&
        // If there is a code-completion point avoid the fast scan because it
        // doesn't check for '\0'.
        !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
      // While not aligned to a 16-byte boundary.
      while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0)
        C = *CurPtr++;

      if (C == '/') goto FoundSlash;

#ifdef __SSE2__
      // Compare 16 bytes at a time against '/'; each set bit in the mask marks
      // a matching byte.
      __m128i Slashes = _mm_set1_epi8('/');
      while (CurPtr+16 <= BufferEnd) {
        int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
                                    Slashes));
        if (cmp != 0) {
          // Adjust the pointer to point directly after the first slash. It's
          // not necessary to set C here, it will be overwritten at the end of
          // the outer loop.
          CurPtr += llvm::countTrailingZeros<unsigned>(cmp) + 1;
          goto FoundSlash;
        }
        CurPtr += 16;
      }
#elif __ALTIVEC__
      // Skip whole 16-byte chunks that contain no '/'.  The chunk that stops
      // this loop is rescanned byte by byte by the code that follows.
      __vector unsigned char Slashes = {
        '/', '/', '/', '/',  '/', '/', '/', '/',
        '/', '/', '/', '/',  '/', '/', '/', '/'
      };
      while (CurPtr+16 <= BufferEnd &&
             !vec_any_eq(*(vector unsigned char*)CurPtr, Slashes))
        CurPtr += 16;
#else
      // Scan for '/' quickly.  Many block comments are very large.
      while (CurPtr[0] != '/' &&
             CurPtr[1] != '/' &&
             CurPtr[2] != '/' &&
             CurPtr[3] != '/' &&
             CurPtr+4 < BufferEnd) {
        CurPtr += 4;
      }
#endif

      // It has to be one of the bytes scanned, increment to it and read one.
      C = *CurPtr++;
    }

    // Loop to scan the remainder.
    while (C != '/' && C != '\0')
      C = *CurPtr++;

    if (C == '/') {
      // The fast paths above jump straight here once they have located a '/'.
      // CurPtr points one past the slash, so CurPtr[-2] is the character that
      // precedes it.
  FoundSlash:
      if (CurPtr[-2] == '*')  // We found the final */.  We're done!
        break;

      if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
        if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) {
          // We found the final */, though it had an escaped newline between the
          // * and /.  We're done!
          break;
        }
      }
      if (CurPtr[0] == '*' && CurPtr[1] != '/') {
        // If this is a /* inside of the comment, emit a warning.  Don't do this
        // if this is a /*/, which will end the comment.  This misses cases with
        // embedded escaped newlines, but oh well.
        if (!isLexingRawMode())
          Diag(CurPtr-1, diag::warn_nested_block_comment);
      }
    } else if (C == 0 && CurPtr == BufferEnd+1) {
      // The NUL came from BufferEnd itself: the comment was never closed.
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::err_unterminated_block_comment);
      // Note: the user probably forgot a */.  We could continue immediately
      // after the /*, but this would involve lexing a lot of what really is the
      // comment, which surely would confuse the parser.
      --CurPtr;

      // KeepWhitespaceMode should return this broken comment as a token.  Since
      // it isn't a well formed comment, just return it as an 'unknown' token.
      if (isKeepWhitespaceMode()) {
        FormTokenWithChars(Result, CurPtr, tok::unknown);
        return true;
      }

      BufferPtr = CurPtr;
      return false;
    } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
      // A NUL inside the buffer that marks the code-completion point: report
      // it to the preprocessor and stop lexing.
      PP->CodeCompleteNaturalLanguage();
      cutOffLexing();
      return false;
    }

    // Not the end of the comment; read the next character and keep scanning.
    C = *CurPtr++;
  }

  // Notify comment handlers about the comment unless we're in a #if 0 block.
  if (PP && !isLexingRawMode() &&
      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
                                            getSourceLocation(CurPtr)))) {
    BufferPtr = CurPtr;
    return true; // A token has to be returned.
  }

  // If we are returning comments as tokens, return this comment as a token.
  if (inKeepCommentMode()) {
    FormTokenWithChars(Result, CurPtr, tok::comment);
    return true;
  }

  // It is common for the tokens immediately after a /**/ comment to be
  // whitespace.  Instead of going through the big switch, handle it
  // efficiently now.  This is safe even in KeepWhitespaceMode because we would
  // have already returned above with the comment as a token.
  if (isHorizontalWhitespace(*CurPtr)) {
    SkipWhitespace(Result, CurPtr+1);
    return false;
  }

  // Otherwise, just return so that the next character will be lexed as a token.
  // The skipped comment counts as leading space for that token.
  BufferPtr = CurPtr;
  Result.setFlag(Token::LeadingSpace);
  return false;
}
   2315 
   2316 //===----------------------------------------------------------------------===//
   2317 // Primary Lexing Entry Points
   2318 //===----------------------------------------------------------------------===//
   2319 
   2320 /// ReadToEndOfLine - Read the rest of the current preprocessor line as an
   2321 /// uninterpreted string.  This switches the lexer out of directive mode.
   2322 void Lexer::ReadToEndOfLine(SmallVectorImpl<char> *Result) {
   2323   assert(ParsingPreprocessorDirective && ParsingFilename == false &&
   2324          "Must be in a preprocessing directive!");
   2325   Token Tmp;
   2326 
   2327   // CurPtr - Cache BufferPtr in an automatic variable.
   2328   const char *CurPtr = BufferPtr;
   2329   while (1) {
   2330     char Char = getAndAdvanceChar(CurPtr, Tmp);
   2331     switch (Char) {
   2332     default:
   2333       if (Result)
   2334         Result->push_back(Char);
   2335       break;
   2336     case 0:  // Null.
   2337       // Found end of file?
   2338       if (CurPtr-1 != BufferEnd) {
   2339         if (isCodeCompletionPoint(CurPtr-1)) {
   2340           PP->CodeCompleteNaturalLanguage();
   2341           cutOffLexing();
   2342           return;
   2343         }
   2344 
   2345         // Nope, normal character, continue.
   2346         if (Result)
   2347           Result->push_back(Char);
   2348         break;
   2349       }
   2350       // FALL THROUGH.
   2351     case '\r':
   2352     case '\n':
   2353       // Okay, we found the end of the line. First, back up past the \0, \r, \n.
   2354       assert(CurPtr[-1] == Char && "Trigraphs for newline?");
   2355       BufferPtr = CurPtr-1;
   2356 
   2357       // Next, lex the character, which should handle the EOD transition.
   2358       Lex(Tmp);
   2359       if (Tmp.is(tok::code_completion)) {
   2360         if (PP)
   2361           PP->CodeCompleteNaturalLanguage();
   2362         Lex(Tmp);
   2363       }
   2364       assert(Tmp.is(tok::eod) && "Unexpected token!");
   2365 
   2366       // Finally, we're done;
   2367       return;
   2368     }
   2369   }
   2370 }
   2371 
   2372 /// LexEndOfFile - CurPtr points to the end of this file.  Handle this
   2373 /// condition, reporting diagnostics and handling other edge cases as required.
   2374 /// This returns true if Result contains a token, false if PP.Lex should be
   2375 /// called again.
   2376 bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
   2377   // If we hit the end of the file while parsing a preprocessor directive,
   2378   // end the preprocessor directive first.  The next token returned will
   2379   // then be the end of file.
   2380   if (ParsingPreprocessorDirective) {
   2381     // Done parsing the "line".
   2382     ParsingPreprocessorDirective = false;
   2383     // Update the location of token as well as BufferPtr.
   2384     FormTokenWithChars(Result, CurPtr, tok::eod);
   2385 
   2386     // Restore comment saving mode, in case it was disabled for directive.
   2387     resetExtendedTokenMode();
   2388     return true;  // Have a token.
   2389   }
   2390 
   2391   // If we are in raw mode, return this event as an EOF token.  Let the caller
   2392   // that put us in raw mode handle the event.
   2393   if (isLexingRawMode()) {
   2394     Result.startToken();
   2395     BufferPtr = BufferEnd;
   2396     FormTokenWithChars(Result, BufferEnd, tok::eof);
   2397     return true;
   2398   }
   2399 
   2400   // Issue diagnostics for unterminated #if and missing newline.
   2401 
   2402   // If we are in a #if directive, emit an error.
   2403   while (!ConditionalStack.empty()) {
   2404     if (PP->getCodeCompletionFileLoc() != FileLoc)
   2405       PP->Diag(ConditionalStack.back().IfLoc,
   2406                diag::err_pp_unterminated_conditional);
   2407     ConditionalStack.pop_back();
   2408   }
   2409 
   2410   // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
   2411   // a pedwarn.
   2412   if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r'))
   2413     Diag(BufferEnd, LangOpts.CPlusPlus11 ? // C++11 [lex.phases] 2.2 p2
   2414          diag::warn_cxx98_compat_no_newline_eof : diag::ext_no_newline_eof)
   2415     << FixItHint::CreateInsertion(getSourceLocation(BufferEnd), "\n");
   2416 
   2417   BufferPtr = CurPtr;
   2418 
   2419   // Finally, let the preprocessor handle this.
   2420   return PP->HandleEndOfFile(Result, isPragmaLexer());
   2421 }
   2422 
   2423 /// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
   2424 /// the specified lexer will return a tok::l_paren token, 0 if it is something
   2425 /// else and 2 if there are no more tokens in the buffer controlled by the
   2426 /// lexer.
   2427 unsigned Lexer::isNextPPTokenLParen() {
   2428   assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?");
   2429 
   2430   // Switch to 'skipping' mode.  This will ensure that we can lex a token
   2431   // without emitting diagnostics, disables macro expansion, and will cause EOF
   2432   // to return an EOF token instead of popping the include stack.
   2433   LexingRawMode = true;
   2434 
   2435   // Save state that can be changed while lexing so that we can restore it.
   2436   const char *TmpBufferPtr = BufferPtr;
   2437   bool inPPDirectiveMode = ParsingPreprocessorDirective;
   2438 
   2439   Token Tok;
   2440   Tok.startToken();
   2441   LexTokenInternal(Tok);
   2442 
   2443   // Restore state that may have changed.
   2444   BufferPtr = TmpBufferPtr;
   2445   ParsingPreprocessorDirective = inPPDirectiveMode;
   2446 
   2447   // Restore the lexer back to non-skipping mode.
   2448   LexingRawMode = false;
   2449 
   2450   if (Tok.is(tok::eof))
   2451     return 2;
   2452   return Tok.is(tok::l_paren);
   2453 }
   2454 
   2455 /// \brief Find the end of a version control conflict marker.
   2456 static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd,
   2457                                    ConflictMarkerKind CMK) {
   2458   const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>";
   2459   size_t TermLen = CMK == CMK_Perforce ? 5 : 7;
   2460   StringRef RestOfBuffer(CurPtr+TermLen, BufferEnd-CurPtr-TermLen);
   2461   size_t Pos = RestOfBuffer.find(Terminator);
   2462   while (Pos != StringRef::npos) {
   2463     // Must occur at start of line.
   2464     if (RestOfBuffer[Pos-1] != '\r' &&
   2465         RestOfBuffer[Pos-1] != '\n') {
   2466       RestOfBuffer = RestOfBuffer.substr(Pos+TermLen);
   2467       Pos = RestOfBuffer.find(Terminator);
   2468       continue;
   2469     }
   2470     return RestOfBuffer.data()+Pos;
   2471   }
   2472   return 0;
   2473 }
   2474 
   2475 /// IsStartOfConflictMarker - If the specified pointer is the start of a version
   2476 /// control conflict marker like '<<<<<<<', recognize it as such, emit an error
   2477 /// and recover nicely.  This returns true if it is a conflict marker and false
   2478 /// if not.
   2479 bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
   2480   // Only a conflict marker if it starts at the beginning of a line.
   2481   if (CurPtr != BufferStart &&
   2482       CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
   2483     return false;
   2484 
   2485   // Check to see if we have <<<<<<< or >>>>.
   2486   if ((BufferEnd-CurPtr < 8 || StringRef(CurPtr, 7) != "<<<<<<<") &&
   2487       (BufferEnd-CurPtr < 6 || StringRef(CurPtr, 5) != ">>>> "))
   2488     return false;
   2489 
   2490   // If we have a situation where we don't care about conflict markers, ignore
   2491   // it.
   2492   if (CurrentConflictMarkerState || isLexingRawMode())
   2493     return false;
   2494 
   2495   ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce;
   2496 
   2497   // Check to see if there is an ending marker somewhere in the buffer at the
   2498   // start of a line to terminate this conflict marker.
   2499   if (FindConflictEnd(CurPtr, BufferEnd, Kind)) {
   2500     // We found a match.  We are really in a conflict marker.
   2501     // Diagnose this, and ignore to the end of line.
   2502     Diag(CurPtr, diag::err_conflict_marker);
   2503     CurrentConflictMarkerState = Kind;
   2504 
   2505     // Skip ahead to the end of line.  We know this exists because the
   2506     // end-of-conflict marker starts with \r or \n.
   2507     while (*CurPtr != '\r' && *CurPtr != '\n') {
   2508       assert(CurPtr != BufferEnd && "Didn't find end of line");
   2509       ++CurPtr;
   2510     }
   2511     BufferPtr = CurPtr;
   2512     return true;
   2513   }
   2514 
   2515   // No end of conflict marker found.
   2516   return false;
   2517 }
   2518 
   2519 
   2520 /// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if
   2521 /// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it
   2522 /// is the end of a conflict marker.  Handle it by ignoring up until the end of
   2523 /// the line.  This returns true if it is a conflict marker and false if not.
   2524 bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
   2525   // Only a conflict marker if it starts at the beginning of a line.
   2526   if (CurPtr != BufferStart &&
   2527       CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
   2528     return false;
   2529 
   2530   // If we have a situation where we don't care about conflict markers, ignore
   2531   // it.
   2532   if (!CurrentConflictMarkerState || isLexingRawMode())
   2533     return false;
   2534 
   2535   // Check to see if we have the marker (4 characters in a row).
   2536   for (unsigned i = 1; i != 4; ++i)
   2537     if (CurPtr[i] != CurPtr[0])
   2538       return false;
   2539 
   2540   // If we do have it, search for the end of the conflict marker.  This could
   2541   // fail if it got skipped with a '#if 0' or something.  Note that CurPtr might
   2542   // be the end of conflict marker.
   2543   if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
   2544                                         CurrentConflictMarkerState)) {
   2545     CurPtr = End;
   2546 
   2547     // Skip ahead to the end of line.
   2548     while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')
   2549       ++CurPtr;
   2550 
   2551     BufferPtr = CurPtr;
   2552 
   2553     // No longer in the conflict marker.
   2554     CurrentConflictMarkerState = CMK_None;
   2555     return true;
   2556   }
   2557 
   2558   return false;
   2559 }
   2560 
   2561 bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
   2562   if (PP && PP->isCodeCompletionEnabled()) {
   2563     SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart);
   2564     return Loc == PP->getCodeCompletionLoc();
   2565   }
   2566 
   2567   return false;
   2568 }
   2569 
   2570 uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
   2571                            Token *Result) {
   2572   unsigned CharSize;
   2573   char Kind = getCharAndSize(StartPtr, CharSize);
   2574 
   2575   unsigned NumHexDigits;
   2576   if (Kind == 'u')
   2577     NumHexDigits = 4;
   2578   else if (Kind == 'U')
   2579     NumHexDigits = 8;
   2580   else
   2581     return 0;
   2582 
   2583   if (!LangOpts.CPlusPlus && !LangOpts.C99) {
   2584     if (Result && !isLexingRawMode())
   2585       Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
   2586     return 0;
   2587   }
   2588 
   2589   const char *CurPtr = StartPtr + CharSize;
   2590   const char *KindLoc = &CurPtr[-1];
   2591 
   2592   uint32_t CodePoint = 0;
   2593   for (unsigned i = 0; i < NumHexDigits; ++i) {
   2594     char C = getCharAndSize(CurPtr, CharSize);
   2595 
   2596     unsigned Value = llvm::hexDigitValue(C);
   2597     if (Value == -1U) {
   2598       if (Result && !isLexingRawMode()) {
   2599         if (i == 0) {
   2600           Diag(BufferPtr, diag::warn_ucn_escape_no_digits)
   2601             << StringRef(KindLoc, 1);
   2602         } else {
   2603           Diag(BufferPtr, diag::warn_ucn_escape_incomplete);
   2604 
   2605           // If the user wrote \U1234, suggest a fixit to \u.
   2606           if (i == 4 && NumHexDigits == 8) {
   2607             CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1);
   2608             Diag(KindLoc, diag::note_ucn_four_not_eight)
   2609               << FixItHint::CreateReplacement(URange, "u");
   2610           }
   2611         }
   2612       }
   2613 
   2614       return 0;
   2615     }
   2616 
   2617     CodePoint <<= 4;
   2618     CodePoint += Value;
   2619 
   2620     CurPtr += CharSize;
   2621   }
   2622 
   2623   if (Result) {
   2624     Result->setFlag(Token::HasUCN);
   2625     if (CurPtr - StartPtr == (ptrdiff_t)NumHexDigits + 2)
   2626       StartPtr = CurPtr;
   2627     else
   2628       while (StartPtr != CurPtr)
   2629         (void)getAndAdvanceChar(StartPtr, *Result);
   2630   } else {
   2631     StartPtr = CurPtr;
   2632   }
   2633 
   2634   // C99 6.4.3p2: A universal character name shall not specify a character whose
   2635   //   short identifier is less than 00A0 other than 0024 ($), 0040 (@), or
   2636   //   0060 (`), nor one in the range D800 through DFFF inclusive.)
   2637   // C++11 [lex.charset]p2: If the hexadecimal value for a
   2638   //   universal-character-name corresponds to a surrogate code point (in the
   2639   //   range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally,
   2640   //   if the hexadecimal value for a universal-character-name outside the
   2641   //   c-char-sequence, s-char-sequence, or r-char-sequence of a character or
   2642   //   string literal corresponds to a control character (in either of the
   2643   //   ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the
   2644   //   basic source character set, the program is ill-formed.
   2645   if (CodePoint < 0xA0) {
   2646     if (CodePoint == 0x24 || CodePoint == 0x40 || CodePoint == 0x60)
   2647       return CodePoint;
   2648 
   2649     // We don't use isLexingRawMode() here because we need to warn about bad
   2650     // UCNs even when skipping preprocessing tokens in a #if block.
   2651     if (Result && PP) {
   2652       if (CodePoint < 0x20 || CodePoint >= 0x7F)
   2653         Diag(BufferPtr, diag::err_ucn_control_character);
   2654       else {
   2655         char C = static_cast<char>(CodePoint);
   2656         Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1);
   2657       }
   2658     }
   2659 
   2660     return 0;
   2661 
   2662   } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) {
   2663     // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't.
   2664     // We don't use isLexingRawMode() here because we need to diagnose bad
   2665     // UCNs even when skipping preprocessing tokens in a #if block.
   2666     if (Result && PP) {
   2667       if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11)
   2668         Diag(BufferPtr, diag::warn_ucn_escape_surrogate);
   2669       else
   2670         Diag(BufferPtr, diag::err_ucn_escape_invalid);
   2671     }
   2672     return 0;
   2673   }
   2674 
   2675   return CodePoint;
   2676 }
   2677 
   2678 void Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) {
   2679   if (!isLexingRawMode() && !PP->isPreprocessedOutput() &&
   2680       isCharInSet(C, UnicodeWhitespaceChars)) {
   2681     Diag(BufferPtr, diag::ext_unicode_whitespace)
   2682       << makeCharRange(*this, BufferPtr, CurPtr);
   2683 
   2684     Result.setFlag(Token::LeadingSpace);
   2685     if (SkipWhitespace(Result, CurPtr))
   2686       return; // KeepWhitespaceMode
   2687 
   2688     return LexTokenInternal(Result);
   2689   }
   2690 
   2691   if (isAllowedIDChar(C, LangOpts) && isAllowedInitiallyIDChar(C, LangOpts)) {
   2692     if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
   2693         !PP->isPreprocessedOutput()) {
   2694       maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C,
   2695                                 makeCharRange(*this, BufferPtr, CurPtr),
   2696                                 /*IsFirst=*/true);
   2697     }
   2698 
   2699     MIOpt.ReadToken();
   2700     return LexIdentifier(Result, CurPtr);
   2701   }
   2702 
   2703   if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
   2704       !PP->isPreprocessedOutput() &&
   2705       !isASCII(*BufferPtr) && !isAllowedIDChar(C, LangOpts)) {
   2706     // Non-ASCII characters tend to creep into source code unintentionally.
   2707     // Instead of letting the parser complain about the unknown token,
   2708     // just drop the character.
   2709     // Note that we can /only/ do this when the non-ASCII character is actually
   2710     // spelled as Unicode, not written as a UCN. The standard requires that
   2711     // we not throw away any possible preprocessor tokens, but there's a
   2712     // loophole in the mapping of Unicode characters to basic character set
   2713     // characters that allows us to map these particular characters to, say,
   2714     // whitespace.
   2715     Diag(BufferPtr, diag::err_non_ascii)
   2716       << FixItHint::CreateRemoval(makeCharRange(*this, BufferPtr, CurPtr));
   2717 
   2718     BufferPtr = CurPtr;
   2719     return LexTokenInternal(Result);
   2720   }
   2721 
   2722   // Otherwise, we have an explicit UCN or a character that's unlikely to show
   2723   // up by accident.
   2724   MIOpt.ReadToken();
   2725   FormTokenWithChars(Result, CurPtr, tok::unknown);
   2726 }
   2727 
   2728 
   2729 /// LexTokenInternal - This implements a simple C family lexer.  It is an
   2730 /// extremely performance critical piece of code.  This assumes that the buffer
   2731 /// has a null character at the end of the file.  This returns a preprocessing
   2732 /// token, not a normal token, as such, it is an internal interface.  It assumes
   2733 /// that the Flags of result have been cleared before calling this.
   2734 void Lexer::LexTokenInternal(Token &Result) {
   2735 LexNextToken:
   2736   // New token, can't need cleaning yet.
   2737   Result.clearFlag(Token::NeedsCleaning);
   2738   Result.setIdentifierInfo(0);
   2739 
   2740   // CurPtr - Cache BufferPtr in an automatic variable.
   2741   const char *CurPtr = BufferPtr;
   2742 
   2743   // Small amounts of horizontal whitespace is very common between tokens.
   2744   if ((*CurPtr == ' ') || (*CurPtr == '\t')) {
   2745     ++CurPtr;
   2746     while ((*CurPtr == ' ') || (*CurPtr == '\t'))
   2747       ++CurPtr;
   2748 
   2749     // If we are keeping whitespace and other tokens, just return what we just
   2750     // skipped.  The next lexer invocation will return the token after the
   2751     // whitespace.
   2752     if (isKeepWhitespaceMode()) {
   2753       FormTokenWithChars(Result, CurPtr, tok::unknown);
   2754       // FIXME: The next token will not have LeadingSpace set.
   2755       return;
   2756     }
   2757 
   2758     BufferPtr = CurPtr;
   2759     Result.setFlag(Token::LeadingSpace);
   2760   }
   2761 
   2762   unsigned SizeTmp, SizeTmp2;   // Temporaries for use in cases below.
   2763 
   2764   // Read a character, advancing over it.
   2765   char Char = getAndAdvanceChar(CurPtr, Result);
   2766   tok::TokenKind Kind;
   2767 
   2768   switch (Char) {
   2769   case 0:  // Null.
   2770     // Found end of file?
   2771     if (CurPtr-1 == BufferEnd) {
   2772       // Read the PP instance variable into an automatic variable, because
   2773       // LexEndOfFile will often delete 'this'.
   2774       Preprocessor *PPCache = PP;
   2775       if (LexEndOfFile(Result, CurPtr-1))  // Retreat back into the file.
   2776         return;   // Got a token to return.
   2777       assert(PPCache && "Raw buffer::LexEndOfFile should return a token");
   2778       return PPCache->Lex(Result);
   2779     }
   2780 
   2781     // Check if we are performing code completion.
   2782     if (isCodeCompletionPoint(CurPtr-1)) {
   2783       // Return the code-completion token.
   2784       Result.startToken();
   2785       FormTokenWithChars(Result, CurPtr, tok::code_completion);
   2786       return;
   2787     }
   2788 
   2789     if (!isLexingRawMode())
   2790       Diag(CurPtr-1, diag::null_in_file);
   2791     Result.setFlag(Token::LeadingSpace);
   2792     if (SkipWhitespace(Result, CurPtr))
   2793       return; // KeepWhitespaceMode
   2794 
   2795     goto LexNextToken;   // GCC isn't tail call eliminating.
   2796 
   2797   case 26:  // DOS & CP/M EOF: "^Z".
   2798     // If we're in Microsoft extensions mode, treat this as end of file.
   2799     if (LangOpts.MicrosoftExt) {
   2800       // Read the PP instance variable into an automatic variable, because
   2801       // LexEndOfFile will often delete 'this'.
   2802       Preprocessor *PPCache = PP;
   2803       if (LexEndOfFile(Result, CurPtr-1))  // Retreat back into the file.
   2804         return;   // Got a token to return.
   2805       assert(PPCache && "Raw buffer::LexEndOfFile should return a token");
   2806       return PPCache->Lex(Result);
   2807     }
   2808     // If Microsoft extensions are disabled, this is just random garbage.
   2809     Kind = tok::unknown;
   2810     break;
   2811 
   2812   case '\n':
   2813   case '\r':
   2814     // If we are inside a preprocessor directive and we see the end of line,
   2815     // we know we are done with the directive, so return an EOD token.
   2816     if (ParsingPreprocessorDirective) {
   2817       // Done parsing the "line".
   2818       ParsingPreprocessorDirective = false;
   2819 
   2820       // Restore comment saving mode, in case it was disabled for directive.
   2821       if (PP)
   2822         resetExtendedTokenMode();
   2823 
   2824       // Since we consumed a newline, we are back at the start of a line.
   2825       IsAtStartOfLine = true;
   2826 
   2827       Kind = tok::eod;
   2828       break;
   2829     }
   2830 
   2831     // No leading whitespace seen so far.
   2832     Result.clearFlag(Token::LeadingSpace);
   2833 
   2834     if (SkipWhitespace(Result, CurPtr))
   2835       return; // KeepWhitespaceMode
   2836     goto LexNextToken;   // GCC isn't tail call eliminating.
   2837   case ' ':
   2838   case '\t':
   2839   case '\f':
   2840   case '\v':
   2841   SkipHorizontalWhitespace:
   2842     Result.setFlag(Token::LeadingSpace);
   2843     if (SkipWhitespace(Result, CurPtr))
   2844       return; // KeepWhitespaceMode
   2845 
   2846   SkipIgnoredUnits:
   2847     CurPtr = BufferPtr;
   2848 
   2849     // If the next token is obviously a // or /* */ comment, skip it efficiently
   2850     // too (without going through the big switch stmt).
   2851     if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
   2852         LangOpts.LineComment && !LangOpts.TraditionalCPP) {
   2853       if (SkipLineComment(Result, CurPtr+2))
   2854         return; // There is a token to return.
   2855       goto SkipIgnoredUnits;
   2856     } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) {
   2857       if (SkipBlockComment(Result, CurPtr+2))
   2858         return; // There is a token to return.
   2859       goto SkipIgnoredUnits;
   2860     } else if (isHorizontalWhitespace(*CurPtr)) {
   2861       goto SkipHorizontalWhitespace;
   2862     }
   2863     goto LexNextToken;   // GCC isn't tail call eliminating.
   2864 
   2865   // C99 6.4.4.1: Integer Constants.
   2866   // C99 6.4.4.2: Floating Constants.
   2867   case '0': case '1': case '2': case '3': case '4':
   2868   case '5': case '6': case '7': case '8': case '9':
   2869     // Notify MIOpt that we read a non-whitespace/non-comment token.
   2870     MIOpt.ReadToken();
   2871     return LexNumericConstant(Result, CurPtr);
   2872 
   2873   case 'u':   // Identifier (uber) or C11/C++11 UTF-8 or UTF-16 string literal
   2874     // Notify MIOpt that we read a non-whitespace/non-comment token.
   2875     MIOpt.ReadToken();
   2876 
   2877     if (LangOpts.CPlusPlus11 || LangOpts.C11) {
   2878       Char = getCharAndSize(CurPtr, SizeTmp);
   2879 
   2880       // UTF-16 string literal
   2881       if (Char == '"')
   2882         return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
   2883                                 tok::utf16_string_literal);
   2884 
   2885       // UTF-16 character constant
   2886       if (Char == '\'')
   2887         return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
   2888                                tok::utf16_char_constant);
   2889 
   2890       // UTF-16 raw string literal
   2891       if (Char == 'R' && LangOpts.CPlusPlus11 &&
   2892           getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
   2893         return LexRawStringLiteral(Result,
   2894                                ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
   2895                                            SizeTmp2, Result),
   2896                                tok::utf16_string_literal);
   2897 
   2898       if (Char == '8') {
   2899         char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);
   2900 
   2901         // UTF-8 string literal
   2902         if (Char2 == '"')
   2903           return LexStringLiteral(Result,
   2904                                ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
   2905                                            SizeTmp2, Result),
   2906                                tok::utf8_string_literal);
   2907 
   2908         if (Char2 == 'R' && LangOpts.CPlusPlus11) {
   2909           unsigned SizeTmp3;
   2910           char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
   2911           // UTF-8 raw string literal
   2912           if (Char3 == '"') {
   2913             return LexRawStringLiteral(Result,
   2914                    ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
   2915                                            SizeTmp2, Result),
   2916                                SizeTmp3, Result),
   2917                    tok::utf8_string_literal);
   2918           }
   2919         }
   2920       }
   2921     }
   2922 
   2923     // treat u like the start of an identifier.
   2924     return LexIdentifier(Result, CurPtr);
   2925 
   2926   case 'U':   // Identifier (Uber) or C11/C++11 UTF-32 string literal
   2927     // Notify MIOpt that we read a non-whitespace/non-comment token.
   2928     MIOpt.ReadToken();
   2929 
   2930     if (LangOpts.CPlusPlus11 || LangOpts.C11) {
   2931       Char = getCharAndSize(CurPtr, SizeTmp);
   2932 
   2933       // UTF-32 string literal
   2934       if (Char == '"')
   2935         return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
   2936                                 tok::utf32_string_literal);
   2937 
   2938       // UTF-32 character constant
   2939       if (Char == '\'')
   2940         return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
   2941                                tok::utf32_char_constant);
   2942 
   2943       // UTF-32 raw string literal
   2944       if (Char == 'R' && LangOpts.CPlusPlus11 &&
   2945           getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
   2946         return LexRawStringLiteral(Result,
   2947                                ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
   2948                                            SizeTmp2, Result),
   2949                                tok::utf32_string_literal);
   2950     }
   2951 
   2952     // treat U like the start of an identifier.
   2953     return LexIdentifier(Result, CurPtr);
   2954 
   2955   case 'R': // Identifier or C++0x raw string literal
   2956     // Notify MIOpt that we read a non-whitespace/non-comment token.
   2957     MIOpt.ReadToken();
   2958 
   2959     if (LangOpts.CPlusPlus11) {
   2960       Char = getCharAndSize(CurPtr, SizeTmp);
   2961 
   2962       if (Char == '"')
   2963         return LexRawStringLiteral(Result,
   2964                                    ConsumeChar(CurPtr, SizeTmp, Result),
   2965                                    tok::string_literal);
   2966     }
   2967 
   2968     // treat R like the start of an identifier.
   2969     return LexIdentifier(Result, CurPtr);
   2970 
   2971   case 'L':   // Identifier (Loony) or wide literal (L'x' or L"xyz").
   2972     // Notify MIOpt that we read a non-whitespace/non-comment token.
   2973     MIOpt.ReadToken();
   2974     Char = getCharAndSize(CurPtr, SizeTmp);
   2975 
   2976     // Wide string literal.
   2977     if (Char == '"')
   2978       return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
   2979                               tok::wide_string_literal);
   2980 
   2981     // Wide raw string literal.
   2982     if (LangOpts.CPlusPlus11 && Char == 'R' &&
   2983         getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
   2984       return LexRawStringLiteral(Result,
   2985                                ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
   2986                                            SizeTmp2, Result),
   2987                                tok::wide_string_literal);
   2988 
   2989     // Wide character constant.
   2990     if (Char == '\'')
   2991       return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
   2992                              tok::wide_char_constant);
   2993     // FALL THROUGH, treating L like the start of an identifier.
   2994 
   2995   // C99 6.4.2: Identifiers.
   2996   case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
   2997   case 'H': case 'I': case 'J': case 'K':    /*'L'*/case 'M': case 'N':
   2998   case 'O': case 'P': case 'Q':    /*'R'*/case 'S': case 'T':    /*'U'*/
   2999   case 'V': case 'W': case 'X': case 'Y': case 'Z':
   3000   case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
   3001   case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
   3002   case 'o': case 'p': case 'q': case 'r': case 's': case 't':    /*'u'*/
   3003   case 'v': case 'w': case 'x': case 'y': case 'z':
   3004   case '_':
   3005     // Notify MIOpt that we read a non-whitespace/non-comment token.
   3006     MIOpt.ReadToken();
   3007     return LexIdentifier(Result, CurPtr);
   3008 
   3009   case '$':   // $ in identifiers.
   3010     if (LangOpts.DollarIdents) {
   3011       if (!isLexingRawMode())
   3012         Diag(CurPtr-1, diag::ext_dollar_in_identifier);
   3013       // Notify MIOpt that we read a non-whitespace/non-comment token.
   3014       MIOpt.ReadToken();
   3015       return LexIdentifier(Result, CurPtr);
   3016     }
   3017 
   3018     Kind = tok::unknown;
   3019     break;
   3020 
   3021   // C99 6.4.4: Character Constants.
   3022   case '\'':
   3023     // Notify MIOpt that we read a non-whitespace/non-comment token.
   3024     MIOpt.ReadToken();
   3025     return LexCharConstant(Result, CurPtr, tok::char_constant);
   3026 
   3027   // C99 6.4.5: String Literals.
   3028   case '"':
   3029     // Notify MIOpt that we read a non-whitespace/non-comment token.
   3030     MIOpt.ReadToken();
   3031     return LexStringLiteral(Result, CurPtr, tok::string_literal);
   3032 
   3033   // C99 6.4.6: Punctuators.
   3034   case '?':
   3035     Kind = tok::question;
   3036     break;
   3037   case '[':
   3038     Kind = tok::l_square;
   3039     break;
   3040   case ']':
   3041     Kind = tok::r_square;
   3042     break;
   3043   case '(':
   3044     Kind = tok::l_paren;
   3045     break;
   3046   case ')':
   3047     Kind = tok::r_paren;
   3048     break;
   3049   case '{':
   3050     Kind = tok::l_brace;
   3051     break;
   3052   case '}':
   3053     Kind = tok::r_brace;
   3054     break;
   3055   case '.':
   3056     Char = getCharAndSize(CurPtr, SizeTmp);
   3057     if (Char >= '0' && Char <= '9') {
   3058       // Notify MIOpt that we read a non-whitespace/non-comment token.
   3059       MIOpt.ReadToken();
   3060 
   3061       return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
   3062     } else if (LangOpts.CPlusPlus && Char == '*') {
   3063       Kind = tok::periodstar;
   3064       CurPtr += SizeTmp;
   3065     } else if (Char == '.' &&
   3066                getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
   3067       Kind = tok::ellipsis;
   3068       CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
   3069                            SizeTmp2, Result);
   3070     } else {
   3071       Kind = tok::period;
   3072     }
   3073     break;
   3074   case '&':
   3075     Char = getCharAndSize(CurPtr, SizeTmp);
   3076     if (Char == '&') {
   3077       Kind = tok::ampamp;
   3078       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3079     } else if (Char == '=') {
   3080       Kind = tok::ampequal;
   3081       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3082     } else {
   3083       Kind = tok::amp;
   3084     }
   3085     break;
   3086   case '*':
   3087     if (getCharAndSize(CurPtr, SizeTmp) == '=') {
   3088       Kind = tok::starequal;
   3089       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3090     } else {
   3091       Kind = tok::star;
   3092     }
   3093     break;
   3094   case '+':
   3095     Char = getCharAndSize(CurPtr, SizeTmp);
   3096     if (Char == '+') {
   3097       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3098       Kind = tok::plusplus;
   3099     } else if (Char == '=') {
   3100       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3101       Kind = tok::plusequal;
   3102     } else {
   3103       Kind = tok::plus;
   3104     }
   3105     break;
   3106   case '-':
   3107     Char = getCharAndSize(CurPtr, SizeTmp);
   3108     if (Char == '-') {      // --
   3109       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3110       Kind = tok::minusminus;
   3111     } else if (Char == '>' && LangOpts.CPlusPlus &&
   3112                getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') {  // C++ ->*
   3113       CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
   3114                            SizeTmp2, Result);
   3115       Kind = tok::arrowstar;
   3116     } else if (Char == '>') {   // ->
   3117       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3118       Kind = tok::arrow;
   3119     } else if (Char == '=') {   // -=
   3120       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3121       Kind = tok::minusequal;
   3122     } else {
   3123       Kind = tok::minus;
   3124     }
   3125     break;
   3126   case '~':
   3127     Kind = tok::tilde;
   3128     break;
   3129   case '!':
   3130     if (getCharAndSize(CurPtr, SizeTmp) == '=') {
   3131       Kind = tok::exclaimequal;
   3132       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3133     } else {
   3134       Kind = tok::exclaim;
   3135     }
   3136     break;
   3137   case '/':
   3138     // 6.4.9: Comments
   3139     Char = getCharAndSize(CurPtr, SizeTmp);
   3140     if (Char == '/') {         // Line comment.
   3141       // Even if Line comments are disabled (e.g. in C89 mode), we generally
   3142       // want to lex this as a comment.  There is one problem with this though,
   3143       // that in one particular corner case, this can change the behavior of the
   3144       // resultant program.  For example, In  "foo //**/ bar", C89 would lex
   3145       // this as "foo / bar" and languages with Line comments would lex it as
   3146       // "foo".  Check to see if the character after the second slash is a '*'.
   3147       // If so, we will lex that as a "/" instead of the start of a comment.
   3148       // However, we never do this if we are just preprocessing.
   3149       bool TreatAsComment = LangOpts.LineComment && !LangOpts.TraditionalCPP;
   3150       if (!TreatAsComment)
   3151         if (!(PP && PP->isPreprocessedOutput()))
   3152           TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*';
   3153 
   3154       if (TreatAsComment) {
   3155         if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result)))
   3156           return; // There is a token to return.
   3157 
   3158         // It is common for the tokens immediately after a // comment to be
   3159         // whitespace (indentation for the next line).  Instead of going through
   3160         // the big switch, handle it efficiently now.
   3161         goto SkipIgnoredUnits;
   3162       }
   3163     }
   3164 
   3165     if (Char == '*') {  // /**/ comment.
   3166       if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result)))
   3167         return; // There is a token to return.
   3168       goto LexNextToken;   // GCC isn't tail call eliminating.
   3169     }
   3170 
   3171     if (Char == '=') {
   3172       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3173       Kind = tok::slashequal;
   3174     } else {
   3175       Kind = tok::slash;
   3176     }
   3177     break;
   3178   case '%':
   3179     Char = getCharAndSize(CurPtr, SizeTmp);
   3180     if (Char == '=') {
   3181       Kind = tok::percentequal;
   3182       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3183     } else if (LangOpts.Digraphs && Char == '>') {
   3184       Kind = tok::r_brace;                             // '%>' -> '}'
   3185       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3186     } else if (LangOpts.Digraphs && Char == ':') {
   3187       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3188       Char = getCharAndSize(CurPtr, SizeTmp);
   3189       if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {
   3190         Kind = tok::hashhash;                          // '%:%:' -> '##'
   3191         CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
   3192                              SizeTmp2, Result);
   3193       } else if (Char == '@' && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize
   3194         CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3195         if (!isLexingRawMode())
   3196           Diag(BufferPtr, diag::ext_charize_microsoft);
   3197         Kind = tok::hashat;
   3198       } else {                                         // '%:' -> '#'
   3199         // We parsed a # character.  If this occurs at the start of the line,
   3200         // it's actually the start of a preprocessing directive.  Callback to
   3201         // the preprocessor to handle it.
   3202         // FIXME: -fpreprocessed mode??
   3203         if (Result.isAtStartOfLine() && !LexingRawMode && !Is_PragmaLexer)
   3204           goto HandleDirective;
   3205 
   3206         Kind = tok::hash;
   3207       }
   3208     } else {
   3209       Kind = tok::percent;
   3210     }
   3211     break;
   3212   case '<':
   3213     Char = getCharAndSize(CurPtr, SizeTmp);
   3214     if (ParsingFilename) {
   3215       return LexAngledStringLiteral(Result, CurPtr);
   3216     } else if (Char == '<') {
   3217       char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
   3218       if (After == '=') {
   3219         Kind = tok::lesslessequal;
   3220         CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
   3221                              SizeTmp2, Result);
   3222       } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) {
   3223         // If this is actually a '<<<<<<<' version control conflict marker,
   3224         // recognize it as such and recover nicely.
   3225         goto LexNextToken;
   3226       } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) {
   3227         // If this is '<<<<' and we're in a Perforce-style conflict marker,
   3228         // ignore it.
   3229         goto LexNextToken;
   3230       } else if (LangOpts.CUDA && After == '<') {
   3231         Kind = tok::lesslessless;
   3232         CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
   3233                              SizeTmp2, Result);
   3234       } else {
   3235         CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3236         Kind = tok::lessless;
   3237       }
   3238     } else if (Char == '=') {
   3239       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3240       Kind = tok::lessequal;
   3241     } else if (LangOpts.Digraphs && Char == ':') {     // '<:' -> '['
   3242       if (LangOpts.CPlusPlus11 &&
   3243           getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') {
   3244         // C++0x [lex.pptoken]p3:
   3245         //  Otherwise, if the next three characters are <:: and the subsequent
   3246         //  character is neither : nor >, the < is treated as a preprocessor
   3247         //  token by itself and not as the first character of the alternative
   3248         //  token <:.
   3249         unsigned SizeTmp3;
   3250         char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
   3251         if (After != ':' && After != '>') {
   3252           Kind = tok::less;
   3253           if (!isLexingRawMode())
   3254             Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon);
   3255           break;
   3256         }
   3257       }
   3258 
   3259       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3260       Kind = tok::l_square;
   3261     } else if (LangOpts.Digraphs && Char == '%') {     // '<%' -> '{'
   3262       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3263       Kind = tok::l_brace;
   3264     } else {
   3265       Kind = tok::less;
   3266     }
   3267     break;
   3268   case '>':
   3269     Char = getCharAndSize(CurPtr, SizeTmp);
   3270     if (Char == '=') {
   3271       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3272       Kind = tok::greaterequal;
   3273     } else if (Char == '>') {
   3274       char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
   3275       if (After == '=') {
   3276         CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
   3277                              SizeTmp2, Result);
   3278         Kind = tok::greatergreaterequal;
   3279       } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) {
   3280         // If this is actually a '>>>>' conflict marker, recognize it as such
   3281         // and recover nicely.
   3282         goto LexNextToken;
   3283       } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) {
   3284         // If this is '>>>>>>>' and we're in a conflict marker, ignore it.
   3285         goto LexNextToken;
   3286       } else if (LangOpts.CUDA && After == '>') {
   3287         Kind = tok::greatergreatergreater;
   3288         CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
   3289                              SizeTmp2, Result);
   3290       } else {
   3291         CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3292         Kind = tok::greatergreater;
   3293       }
   3294 
   3295     } else {
   3296       Kind = tok::greater;
   3297     }
   3298     break;
   3299   case '^':
   3300     Char = getCharAndSize(CurPtr, SizeTmp);
   3301     if (Char == '=') {
   3302       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3303       Kind = tok::caretequal;
   3304     } else {
   3305       Kind = tok::caret;
   3306     }
   3307     break;
   3308   case '|':
   3309     Char = getCharAndSize(CurPtr, SizeTmp);
   3310     if (Char == '=') {
   3311       Kind = tok::pipeequal;
   3312       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3313     } else if (Char == '|') {
   3314       // If this is '|||||||' and we're in a conflict marker, ignore it.
   3315       if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1))
   3316         goto LexNextToken;
   3317       Kind = tok::pipepipe;
   3318       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3319     } else {
   3320       Kind = tok::pipe;
   3321     }
   3322     break;
   3323   case ':':
   3324     Char = getCharAndSize(CurPtr, SizeTmp);
   3325     if (LangOpts.Digraphs && Char == '>') {
   3326       Kind = tok::r_square; // ':>' -> ']'
   3327       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3328     } else if (LangOpts.CPlusPlus && Char == ':') {
   3329       Kind = tok::coloncolon;
   3330       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3331     } else {
   3332       Kind = tok::colon;
   3333     }
   3334     break;
   3335   case ';':
   3336     Kind = tok::semi;
   3337     break;
   3338   case '=':
   3339     Char = getCharAndSize(CurPtr, SizeTmp);
   3340     if (Char == '=') {
   3341       // If this is '====' and we're in a conflict marker, ignore it.
   3342       if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))
   3343         goto LexNextToken;
   3344 
   3345       Kind = tok::equalequal;
   3346       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3347     } else {
   3348       Kind = tok::equal;
   3349     }
   3350     break;
   3351   case ',':
   3352     Kind = tok::comma;
   3353     break;
   3354   case '#':
   3355     Char = getCharAndSize(CurPtr, SizeTmp);
   3356     if (Char == '#') {
   3357       Kind = tok::hashhash;
   3358       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3359     } else if (Char == '@' && LangOpts.MicrosoftExt) {  // #@ -> Charize
   3360       Kind = tok::hashat;
   3361       if (!isLexingRawMode())
   3362         Diag(BufferPtr, diag::ext_charize_microsoft);
   3363       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
   3364     } else {
   3365       // We parsed a # character.  If this occurs at the start of the line,
   3366       // it's actually the start of a preprocessing directive.  Callback to
   3367       // the preprocessor to handle it.
   3368       // FIXME: -fpreprocessed mode??
   3369       if (Result.isAtStartOfLine() && !LexingRawMode && !Is_PragmaLexer)
   3370         goto HandleDirective;
   3371 
   3372       Kind = tok::hash;
   3373     }
   3374     break;
   3375 
   3376   case '@':
   3377     // Objective C support.
   3378     if (CurPtr[-1] == '@' && LangOpts.ObjC1)
   3379       Kind = tok::at;
   3380     else
   3381       Kind = tok::unknown;
   3382     break;
   3383 
   3384   // UCNs (C99 6.4.3, C++11 [lex.charset]p2)
   3385   case '\\':
   3386     if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result))
   3387       return LexUnicode(Result, CodePoint, CurPtr);
   3388 
   3389     Kind = tok::unknown;
   3390     break;
   3391 
   3392   default: {
   3393     if (isASCII(Char)) {
   3394       Kind = tok::unknown;
   3395       break;
   3396     }
   3397 
   3398     UTF32 CodePoint;
   3399 
   3400     // We can't just reset CurPtr to BufferPtr because BufferPtr may point to
   3401     // an escaped newline.
   3402     --CurPtr;
   3403     ConversionResult Status =
   3404         llvm::convertUTF8Sequence((const UTF8 **)&CurPtr,
   3405                                   (const UTF8 *)BufferEnd,
   3406                                   &CodePoint,
   3407                                   strictConversion);
   3408     if (Status == conversionOK)
   3409       return LexUnicode(Result, CodePoint, CurPtr);
   3410 
   3411     if (isLexingRawMode() || ParsingPreprocessorDirective ||
   3412         PP->isPreprocessedOutput()) {
   3413       ++CurPtr;
   3414       Kind = tok::unknown;
   3415       break;
   3416     }
   3417 
   3418     // Non-ASCII characters tend to creep into source code unintentionally.
   3419     // Instead of letting the parser complain about the unknown token,
   3420     // just diagnose the invalid UTF-8, then drop the character.
   3421     Diag(CurPtr, diag::err_invalid_utf8);
   3422 
   3423     BufferPtr = CurPtr+1;
   3424     goto LexNextToken;
   3425   }
   3426   }
   3427 
   3428   // Notify MIOpt that we read a non-whitespace/non-comment token.
   3429   MIOpt.ReadToken();
   3430 
   3431   // Update the location of token as well as BufferPtr.
   3432   FormTokenWithChars(Result, CurPtr, Kind);
   3433   return;
   3434 
   3435 HandleDirective:
   3436   // We parsed a # character and it's the start of a preprocessing directive.
   3437 
   3438   FormTokenWithChars(Result, CurPtr, tok::hash);
   3439   PP->HandleDirective(Result);
   3440 
   3441   if (PP->hadModuleLoaderFatalFailure()) {
   3442     // With a fatal failure in the module loader, we abort parsing.
   3443     assert(Result.is(tok::eof) && "Preprocessor did not set tok:eof");
   3444     return;
   3445   }
   3446 
   3447   // As an optimization, if the preprocessor didn't switch lexers, tail
   3448   // recurse.
   3449   if (PP->isCurrentLexer(this)) {
   3450     // Start a new token.  If this is a #include or something, the PP may
   3451     // want us starting at the beginning of the line again.  If so, set
   3452     // the StartOfLine flag and clear LeadingSpace.
   3453     if (IsAtStartOfLine) {
   3454       Result.setFlag(Token::StartOfLine);
   3455       Result.clearFlag(Token::LeadingSpace);
   3456       IsAtStartOfLine = false;
   3457     }
   3458     goto LexNextToken;   // GCC isn't tail call eliminating.
   3459   }
   3460   return PP->Lex(Result);
   3461 }
   3462