1 //===--- Lexer.cpp - C Language Family Lexer ------------------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file implements the Lexer and Token interfaces. 11 // 12 //===----------------------------------------------------------------------===// 13 // 14 // TODO: GCC Diagnostics emitted by the lexer: 15 // PEDWARN: (form feed|vertical tab) in preprocessing directive 16 // 17 // Universal characters, unicode, char mapping: 18 // WARNING: `%.*s' is not in NFKC 19 // WARNING: `%.*s' is not in NFC 20 // 21 // Other: 22 // TODO: Options to support: 23 // -fexec-charset,-fwide-exec-charset 24 // 25 //===----------------------------------------------------------------------===// 26 27 #include "clang/Lex/Lexer.h" 28 #include "clang/Lex/Preprocessor.h" 29 #include "clang/Lex/LexDiagnostic.h" 30 #include "clang/Lex/CodeCompletionHandler.h" 31 #include "clang/Basic/SourceManager.h" 32 #include "llvm/ADT/StringSwitch.h" 33 #include "llvm/Support/Compiler.h" 34 #include "llvm/Support/MemoryBuffer.h" 35 #include <cstring> 36 using namespace clang; 37 38 static void InitCharacterInfo(); 39 40 //===----------------------------------------------------------------------===// 41 // Token Class Implementation 42 //===----------------------------------------------------------------------===// 43 44 /// isObjCAtKeyword - Return true if we have an ObjC keyword identifier. 45 bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const { 46 if (IdentifierInfo *II = getIdentifierInfo()) 47 return II->getObjCKeywordID() == objcKey; 48 return false; 49 } 50 51 /// getObjCKeywordID - Return the ObjC keyword kind. 52 tok::ObjCKeywordKind Token::getObjCKeywordID() const { 53 IdentifierInfo *specId = getIdentifierInfo(); 54 return specId ? 
specId->getObjCKeywordID() : tok::objc_not_keyword; 55 } 56 57 58 //===----------------------------------------------------------------------===// 59 // Lexer Class Implementation 60 //===----------------------------------------------------------------------===// 61 62 void Lexer::InitLexer(const char *BufStart, const char *BufPtr, 63 const char *BufEnd) { 64 InitCharacterInfo(); 65 66 BufferStart = BufStart; 67 BufferPtr = BufPtr; 68 BufferEnd = BufEnd; 69 70 assert(BufEnd[0] == 0 && 71 "We assume that the input buffer has a null character at the end" 72 " to simplify lexing!"); 73 74 // Check whether we have a BOM in the beginning of the buffer. If yes - act 75 // accordingly. Right now we support only UTF-8 with and without BOM, so, just 76 // skip the UTF-8 BOM if it's present. 77 if (BufferStart == BufferPtr) { 78 // Determine the size of the BOM. 79 StringRef Buf(BufferStart, BufferEnd - BufferStart); 80 size_t BOMLength = llvm::StringSwitch<size_t>(Buf) 81 .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM 82 .Default(0); 83 84 // Skip the BOM. 85 BufferPtr += BOMLength; 86 } 87 88 Is_PragmaLexer = false; 89 CurrentConflictMarkerState = CMK_None; 90 91 // Start of the file is a start of line. 92 IsAtStartOfLine = true; 93 94 // We are not after parsing a #. 95 ParsingPreprocessorDirective = false; 96 97 // We are not after parsing #include. 98 ParsingFilename = false; 99 100 // We are not in raw mode. Raw mode disables diagnostics and interpretation 101 // of tokens (e.g. identifiers, thus disabling macro expansion). It is used 102 // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block 103 // or otherwise skipping over tokens. 104 LexingRawMode = false; 105 106 // Default to not keeping comments. 107 ExtendedTokenMode = 0; 108 } 109 110 /// Lexer constructor - Create a new lexer object for the specified buffer 111 /// with the specified preprocessor managing the lexing process. 
This lexer 112 /// assumes that the associated file buffer and Preprocessor objects will 113 /// outlive it, so it doesn't take ownership of either of them. 114 Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *InputFile, Preprocessor &PP) 115 : PreprocessorLexer(&PP, FID), 116 FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)), 117 Features(PP.getLangOptions()) { 118 119 InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(), 120 InputFile->getBufferEnd()); 121 122 // Default to keeping comments if the preprocessor wants them. 123 SetCommentRetentionState(PP.getCommentRetentionState()); 124 } 125 126 /// Lexer constructor - Create a new raw lexer object. This object is only 127 /// suitable for calls to 'LexRawToken'. This lexer assumes that the text 128 /// range will outlive it, so it doesn't take ownership of it. 129 Lexer::Lexer(SourceLocation fileloc, const LangOptions &features, 130 const char *BufStart, const char *BufPtr, const char *BufEnd) 131 : FileLoc(fileloc), Features(features) { 132 133 InitLexer(BufStart, BufPtr, BufEnd); 134 135 // We *are* in raw mode. 136 LexingRawMode = true; 137 } 138 139 /// Lexer constructor - Create a new raw lexer object. This object is only 140 /// suitable for calls to 'LexRawToken'. This lexer assumes that the text 141 /// range will outlive it, so it doesn't take ownership of it. 142 Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *FromFile, 143 const SourceManager &SM, const LangOptions &features) 144 : FileLoc(SM.getLocForStartOfFile(FID)), Features(features) { 145 146 InitLexer(FromFile->getBufferStart(), FromFile->getBufferStart(), 147 FromFile->getBufferEnd()); 148 149 // We *are* in raw mode. 150 LexingRawMode = true; 151 } 152 153 /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for 154 /// _Pragma expansion. This has a variety of magic semantics that this method 155 /// sets up. It returns a new'd Lexer that must be delete'd when done. 
156 /// 157 /// On entrance to this routine, TokStartLoc is a macro location which has a 158 /// spelling loc that indicates the bytes to be lexed for the token and an 159 /// expansion location that indicates where all lexed tokens should be 160 /// "expanded from". 161 /// 162 /// FIXME: It would really be nice to make _Pragma just be a wrapper around a 163 /// normal lexer that remaps tokens as they fly by. This would require making 164 /// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer 165 /// interface that could handle this stuff. This would pull GetMappedTokenLoc 166 /// out of the critical path of the lexer! 167 /// 168 Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc, 169 SourceLocation ExpansionLocStart, 170 SourceLocation ExpansionLocEnd, 171 unsigned TokLen, Preprocessor &PP) { 172 SourceManager &SM = PP.getSourceManager(); 173 174 // Create the lexer as if we were going to lex the file normally. 175 FileID SpellingFID = SM.getFileID(SpellingLoc); 176 const llvm::MemoryBuffer *InputFile = SM.getBuffer(SpellingFID); 177 Lexer *L = new Lexer(SpellingFID, InputFile, PP); 178 179 // Now that the lexer is created, change the start/end locations so that we 180 // just lex the subsection of the file that we want. This is lexing from a 181 // scratch buffer. 182 const char *StrData = SM.getCharacterData(SpellingLoc); 183 184 L->BufferPtr = StrData; 185 L->BufferEnd = StrData+TokLen; 186 assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!"); 187 188 // Set the SourceLocation with the remapping information. This ensures that 189 // GetMappedTokenLoc will remap the tokens as they are lexed. 190 L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID), 191 ExpansionLocStart, 192 ExpansionLocEnd, TokLen); 193 194 // Ensure that the lexer thinks it is inside a directive, so that end \n will 195 // return an EOD token. 
196 L->ParsingPreprocessorDirective = true; 197 198 // This lexer really is for _Pragma. 199 L->Is_PragmaLexer = true; 200 return L; 201 } 202 203 204 /// Stringify - Convert the specified string into a C string, with surrounding 205 /// ""'s, and with escaped \ and " characters. 206 std::string Lexer::Stringify(const std::string &Str, bool Charify) { 207 std::string Result = Str; 208 char Quote = Charify ? '\'' : '"'; 209 for (unsigned i = 0, e = Result.size(); i != e; ++i) { 210 if (Result[i] == '\\' || Result[i] == Quote) { 211 Result.insert(Result.begin()+i, '\\'); 212 ++i; ++e; 213 } 214 } 215 return Result; 216 } 217 218 /// Stringify - Convert the specified string into a C string by escaping '\' 219 /// and " characters. This does not add surrounding ""'s to the string. 220 void Lexer::Stringify(SmallVectorImpl<char> &Str) { 221 for (unsigned i = 0, e = Str.size(); i != e; ++i) { 222 if (Str[i] == '\\' || Str[i] == '"') { 223 Str.insert(Str.begin()+i, '\\'); 224 ++i; ++e; 225 } 226 } 227 } 228 229 //===----------------------------------------------------------------------===// 230 // Token Spelling 231 //===----------------------------------------------------------------------===// 232 233 /// getSpelling() - Return the 'spelling' of this token. The spelling of a 234 /// token are the characters used to represent the token in the source file 235 /// after trigraph expansion and escaped-newline folding. In particular, this 236 /// wants to get the true, uncanonicalized, spelling of things like digraphs 237 /// UCNs, etc. 238 StringRef Lexer::getSpelling(SourceLocation loc, 239 SmallVectorImpl<char> &buffer, 240 const SourceManager &SM, 241 const LangOptions &options, 242 bool *invalid) { 243 // Break down the source location. 244 std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc); 245 246 // Try to the load the file buffer. 
247 bool invalidTemp = false; 248 StringRef file = SM.getBufferData(locInfo.first, &invalidTemp); 249 if (invalidTemp) { 250 if (invalid) *invalid = true; 251 return StringRef(); 252 } 253 254 const char *tokenBegin = file.data() + locInfo.second; 255 256 // Lex from the start of the given location. 257 Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options, 258 file.begin(), tokenBegin, file.end()); 259 Token token; 260 lexer.LexFromRawLexer(token); 261 262 unsigned length = token.getLength(); 263 264 // Common case: no need for cleaning. 265 if (!token.needsCleaning()) 266 return StringRef(tokenBegin, length); 267 268 // Hard case, we need to relex the characters into the string. 269 buffer.clear(); 270 buffer.reserve(length); 271 272 for (const char *ti = tokenBegin, *te = ti + length; ti != te; ) { 273 unsigned charSize; 274 buffer.push_back(Lexer::getCharAndSizeNoWarn(ti, charSize, options)); 275 ti += charSize; 276 } 277 278 return StringRef(buffer.data(), buffer.size()); 279 } 280 281 /// getSpelling() - Return the 'spelling' of this token. The spelling of a 282 /// token are the characters used to represent the token in the source file 283 /// after trigraph expansion and escaped-newline folding. In particular, this 284 /// wants to get the true, uncanonicalized, spelling of things like digraphs 285 /// UCNs, etc. 286 std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr, 287 const LangOptions &Features, bool *Invalid) { 288 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!"); 289 290 // If this token contains nothing interesting, return it directly. 
291 bool CharDataInvalid = false; 292 const char* TokStart = SourceMgr.getCharacterData(Tok.getLocation(), 293 &CharDataInvalid); 294 if (Invalid) 295 *Invalid = CharDataInvalid; 296 if (CharDataInvalid) 297 return std::string(); 298 299 if (!Tok.needsCleaning()) 300 return std::string(TokStart, TokStart+Tok.getLength()); 301 302 std::string Result; 303 Result.reserve(Tok.getLength()); 304 305 // Otherwise, hard case, relex the characters into the string. 306 for (const char *Ptr = TokStart, *End = TokStart+Tok.getLength(); 307 Ptr != End; ) { 308 unsigned CharSize; 309 Result.push_back(Lexer::getCharAndSizeNoWarn(Ptr, CharSize, Features)); 310 Ptr += CharSize; 311 } 312 assert(Result.size() != unsigned(Tok.getLength()) && 313 "NeedsCleaning flag set on something that didn't need cleaning!"); 314 return Result; 315 } 316 317 /// getSpelling - This method is used to get the spelling of a token into a 318 /// preallocated buffer, instead of as an std::string. The caller is required 319 /// to allocate enough space for the token, which is guaranteed to be at least 320 /// Tok.getLength() bytes long. The actual length of the token is returned. 321 /// 322 /// Note that this method may do two possible things: it may either fill in 323 /// the buffer specified with characters, or it may *change the input pointer* 324 /// to point to a constant buffer with the data already in it (avoiding a 325 /// copy). The caller is not allowed to modify the returned buffer pointer 326 /// if an internal buffer is returned. 327 unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer, 328 const SourceManager &SourceMgr, 329 const LangOptions &Features, bool *Invalid) { 330 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!"); 331 332 const char *TokStart = 0; 333 // NOTE: this has to be checked *before* testing for an IdentifierInfo. 
334 if (Tok.is(tok::raw_identifier)) 335 TokStart = Tok.getRawIdentifierData(); 336 else if (const IdentifierInfo *II = Tok.getIdentifierInfo()) { 337 // Just return the string from the identifier table, which is very quick. 338 Buffer = II->getNameStart(); 339 return II->getLength(); 340 } 341 342 // NOTE: this can be checked even after testing for an IdentifierInfo. 343 if (Tok.isLiteral()) 344 TokStart = Tok.getLiteralData(); 345 346 if (TokStart == 0) { 347 // Compute the start of the token in the input lexer buffer. 348 bool CharDataInvalid = false; 349 TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid); 350 if (Invalid) 351 *Invalid = CharDataInvalid; 352 if (CharDataInvalid) { 353 Buffer = ""; 354 return 0; 355 } 356 } 357 358 // If this token contains nothing interesting, return it directly. 359 if (!Tok.needsCleaning()) { 360 Buffer = TokStart; 361 return Tok.getLength(); 362 } 363 364 // Otherwise, hard case, relex the characters into the string. 365 char *OutBuf = const_cast<char*>(Buffer); 366 for (const char *Ptr = TokStart, *End = TokStart+Tok.getLength(); 367 Ptr != End; ) { 368 unsigned CharSize; 369 *OutBuf++ = Lexer::getCharAndSizeNoWarn(Ptr, CharSize, Features); 370 Ptr += CharSize; 371 } 372 assert(unsigned(OutBuf-Buffer) != Tok.getLength() && 373 "NeedsCleaning flag set on something that didn't need cleaning!"); 374 375 return OutBuf-Buffer; 376 } 377 378 379 380 static bool isWhitespace(unsigned char c); 381 382 /// MeasureTokenLength - Relex the token at the specified location and return 383 /// its length in bytes in the input file. If the token needs cleaning (e.g. 384 /// includes a trigraph or an escaped newline) then this count includes bytes 385 /// that are part of that. 
386 unsigned Lexer::MeasureTokenLength(SourceLocation Loc, 387 const SourceManager &SM, 388 const LangOptions &LangOpts) { 389 // TODO: this could be special cased for common tokens like identifiers, ')', 390 // etc to make this faster, if it mattered. Just look at StrData[0] to handle 391 // all obviously single-char tokens. This could use 392 // Lexer::isObviouslySimpleCharacter for example to handle identifiers or 393 // something. 394 395 // If this comes from a macro expansion, we really do want the macro name, not 396 // the token this macro expanded to. 397 Loc = SM.getExpansionLoc(Loc); 398 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 399 bool Invalid = false; 400 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); 401 if (Invalid) 402 return 0; 403 404 const char *StrData = Buffer.data()+LocInfo.second; 405 406 if (isWhitespace(StrData[0])) 407 return 0; 408 409 // Create a lexer starting at the beginning of this token. 410 Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, 411 Buffer.begin(), StrData, Buffer.end()); 412 TheLexer.SetCommentRetentionState(true); 413 Token TheTok; 414 TheLexer.LexFromRawLexer(TheTok); 415 return TheTok.getLength(); 416 } 417 418 static SourceLocation getBeginningOfFileToken(SourceLocation Loc, 419 const SourceManager &SM, 420 const LangOptions &LangOpts) { 421 assert(Loc.isFileID()); 422 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 423 if (LocInfo.first.isInvalid()) 424 return Loc; 425 426 bool Invalid = false; 427 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); 428 if (Invalid) 429 return Loc; 430 431 // Back up from the current location until we hit the beginning of a line 432 // (or the buffer). We'll relex from that point. 
433 const char *BufStart = Buffer.data(); 434 if (LocInfo.second >= Buffer.size()) 435 return Loc; 436 437 const char *StrData = BufStart+LocInfo.second; 438 if (StrData[0] == '\n' || StrData[0] == '\r') 439 return Loc; 440 441 const char *LexStart = StrData; 442 while (LexStart != BufStart) { 443 if (LexStart[0] == '\n' || LexStart[0] == '\r') { 444 ++LexStart; 445 break; 446 } 447 448 --LexStart; 449 } 450 451 // Create a lexer starting at the beginning of this token. 452 SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second); 453 Lexer TheLexer(LexerStartLoc, LangOpts, BufStart, LexStart, Buffer.end()); 454 TheLexer.SetCommentRetentionState(true); 455 456 // Lex tokens until we find the token that contains the source location. 457 Token TheTok; 458 do { 459 TheLexer.LexFromRawLexer(TheTok); 460 461 if (TheLexer.getBufferLocation() > StrData) { 462 // Lexing this token has taken the lexer past the source location we're 463 // looking for. If the current token encompasses our source location, 464 // return the beginning of that token. 465 if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData) 466 return TheTok.getLocation(); 467 468 // We ended up skipping over the source location entirely, which means 469 // that it points into whitespace. We're done here. 470 break; 471 } 472 } while (TheTok.getKind() != tok::eof); 473 474 // We've passed our source location; just return the original source location. 
475 return Loc; 476 } 477 478 SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc, 479 const SourceManager &SM, 480 const LangOptions &LangOpts) { 481 if (Loc.isFileID()) 482 return getBeginningOfFileToken(Loc, SM, LangOpts); 483 484 if (!SM.isMacroArgExpansion(Loc)) 485 return Loc; 486 487 SourceLocation FileLoc = SM.getSpellingLoc(Loc); 488 SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts); 489 std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc); 490 std::pair<FileID, unsigned> BeginFileLocInfo= SM.getDecomposedLoc(BeginFileLoc); 491 assert(FileLocInfo.first == BeginFileLocInfo.first && 492 FileLocInfo.second >= BeginFileLocInfo.second); 493 return Loc.getLocWithOffset(SM.getDecomposedLoc(BeginFileLoc).second - 494 SM.getDecomposedLoc(FileLoc).second); 495 } 496 497 namespace { 498 enum PreambleDirectiveKind { 499 PDK_Skipped, 500 PDK_StartIf, 501 PDK_EndIf, 502 PDK_Unknown 503 }; 504 } 505 506 std::pair<unsigned, bool> 507 Lexer::ComputePreamble(const llvm::MemoryBuffer *Buffer, 508 const LangOptions &Features, unsigned MaxLines) { 509 // Create a lexer starting at the beginning of the file. Note that we use a 510 // "fake" file source location at offset 1 so that the lexer will track our 511 // position within the file. 
512 const unsigned StartOffset = 1; 513 SourceLocation StartLoc = SourceLocation::getFromRawEncoding(StartOffset); 514 Lexer TheLexer(StartLoc, Features, Buffer->getBufferStart(), 515 Buffer->getBufferStart(), Buffer->getBufferEnd()); 516 517 bool InPreprocessorDirective = false; 518 Token TheTok; 519 Token IfStartTok; 520 unsigned IfCount = 0; 521 522 unsigned MaxLineOffset = 0; 523 if (MaxLines) { 524 const char *CurPtr = Buffer->getBufferStart(); 525 unsigned CurLine = 0; 526 while (CurPtr != Buffer->getBufferEnd()) { 527 char ch = *CurPtr++; 528 if (ch == '\n') { 529 ++CurLine; 530 if (CurLine == MaxLines) 531 break; 532 } 533 } 534 if (CurPtr != Buffer->getBufferEnd()) 535 MaxLineOffset = CurPtr - Buffer->getBufferStart(); 536 } 537 538 do { 539 TheLexer.LexFromRawLexer(TheTok); 540 541 if (InPreprocessorDirective) { 542 // If we've hit the end of the file, we're done. 543 if (TheTok.getKind() == tok::eof) { 544 InPreprocessorDirective = false; 545 break; 546 } 547 548 // If we haven't hit the end of the preprocessor directive, skip this 549 // token. 550 if (!TheTok.isAtStartOfLine()) 551 continue; 552 553 // We've passed the end of the preprocessor directive, and will look 554 // at this token again below. 555 InPreprocessorDirective = false; 556 } 557 558 // Keep track of the # of lines in the preamble. 559 if (TheTok.isAtStartOfLine()) { 560 unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset; 561 562 // If we were asked to limit the number of lines in the preamble, 563 // and we're about to exceed that limit, we're done. 564 if (MaxLineOffset && TokOffset >= MaxLineOffset) 565 break; 566 } 567 568 // Comments are okay; skip over them. 569 if (TheTok.getKind() == tok::comment) 570 continue; 571 572 if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) { 573 // This is the start of a preprocessor directive. 574 Token HashTok = TheTok; 575 InPreprocessorDirective = true; 576 577 // Figure out which directive this is. 
Since we're lexing raw tokens, 578 // we don't have an identifier table available. Instead, just look at 579 // the raw identifier to recognize and categorize preprocessor directives. 580 TheLexer.LexFromRawLexer(TheTok); 581 if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) { 582 StringRef Keyword(TheTok.getRawIdentifierData(), 583 TheTok.getLength()); 584 PreambleDirectiveKind PDK 585 = llvm::StringSwitch<PreambleDirectiveKind>(Keyword) 586 .Case("include", PDK_Skipped) 587 .Case("__include_macros", PDK_Skipped) 588 .Case("define", PDK_Skipped) 589 .Case("undef", PDK_Skipped) 590 .Case("line", PDK_Skipped) 591 .Case("error", PDK_Skipped) 592 .Case("pragma", PDK_Skipped) 593 .Case("import", PDK_Skipped) 594 .Case("include_next", PDK_Skipped) 595 .Case("warning", PDK_Skipped) 596 .Case("ident", PDK_Skipped) 597 .Case("sccs", PDK_Skipped) 598 .Case("assert", PDK_Skipped) 599 .Case("unassert", PDK_Skipped) 600 .Case("if", PDK_StartIf) 601 .Case("ifdef", PDK_StartIf) 602 .Case("ifndef", PDK_StartIf) 603 .Case("elif", PDK_Skipped) 604 .Case("else", PDK_Skipped) 605 .Case("endif", PDK_EndIf) 606 .Default(PDK_Unknown); 607 608 switch (PDK) { 609 case PDK_Skipped: 610 continue; 611 612 case PDK_StartIf: 613 if (IfCount == 0) 614 IfStartTok = HashTok; 615 616 ++IfCount; 617 continue; 618 619 case PDK_EndIf: 620 // Mismatched #endif. The preamble ends here. 621 if (IfCount == 0) 622 break; 623 624 --IfCount; 625 continue; 626 627 case PDK_Unknown: 628 // We don't know what this directive is; stop at the '#'. 629 break; 630 } 631 } 632 633 // We only end up here if we didn't recognize the preprocessor 634 // directive or it was one that can't occur in the preamble at this 635 // point. Roll back the current token to the location of the '#'. 
636 InPreprocessorDirective = false; 637 TheTok = HashTok; 638 } 639 640 // We hit a token that we don't recognize as being in the 641 // "preprocessing only" part of the file, so we're no longer in 642 // the preamble. 643 break; 644 } while (true); 645 646 SourceLocation End = IfCount? IfStartTok.getLocation() : TheTok.getLocation(); 647 return std::make_pair(End.getRawEncoding() - StartLoc.getRawEncoding(), 648 IfCount? IfStartTok.isAtStartOfLine() 649 : TheTok.isAtStartOfLine()); 650 } 651 652 653 /// AdvanceToTokenCharacter - Given a location that specifies the start of a 654 /// token, return a new location that specifies a character within the token. 655 SourceLocation Lexer::AdvanceToTokenCharacter(SourceLocation TokStart, 656 unsigned CharNo, 657 const SourceManager &SM, 658 const LangOptions &Features) { 659 // Figure out how many physical characters away the specified expansion 660 // character is. This needs to take into consideration newlines and 661 // trigraphs. 662 bool Invalid = false; 663 const char *TokPtr = SM.getCharacterData(TokStart, &Invalid); 664 665 // If they request the first char of the token, we're trivially done. 666 if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr))) 667 return TokStart; 668 669 unsigned PhysOffset = 0; 670 671 // The usual case is that tokens don't contain anything interesting. Skip 672 // over the uninteresting characters. If a token only consists of simple 673 // chars, this method is extremely fast. 674 while (Lexer::isObviouslySimpleCharacter(*TokPtr)) { 675 if (CharNo == 0) 676 return TokStart.getLocWithOffset(PhysOffset); 677 ++TokPtr, --CharNo, ++PhysOffset; 678 } 679 680 // If we have a character that may be a trigraph or escaped newline, use a 681 // lexer to parse it correctly. 
682 for (; CharNo; --CharNo) { 683 unsigned Size; 684 Lexer::getCharAndSizeNoWarn(TokPtr, Size, Features); 685 TokPtr += Size; 686 PhysOffset += Size; 687 } 688 689 // Final detail: if we end up on an escaped newline, we want to return the 690 // location of the actual byte of the token. For example foo\<newline>bar 691 // advanced by 3 should return the location of b, not of \\. One compounding 692 // detail of this is that the escape may be made by a trigraph. 693 if (!Lexer::isObviouslySimpleCharacter(*TokPtr)) 694 PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr; 695 696 return TokStart.getLocWithOffset(PhysOffset); 697 } 698 699 /// \brief Computes the source location just past the end of the 700 /// token at this source location. 701 /// 702 /// This routine can be used to produce a source location that 703 /// points just past the end of the token referenced by \p Loc, and 704 /// is generally used when a diagnostic needs to point just after a 705 /// token where it expected something different that it received. If 706 /// the returned source location would not be meaningful (e.g., if 707 /// it points into a macro), this routine returns an invalid 708 /// source location. 709 /// 710 /// \param Offset an offset from the end of the token, where the source 711 /// location should refer to. The default offset (0) produces a source 712 /// location pointing just past the end of the token; an offset of 1 produces 713 /// a source location pointing to the last character in the token, etc. 714 SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset, 715 const SourceManager &SM, 716 const LangOptions &Features) { 717 if (Loc.isInvalid()) 718 return SourceLocation(); 719 720 if (Loc.isMacroID()) { 721 if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, Features)) 722 return SourceLocation(); // Points inside the macro expansion. 723 724 // Continue and find the location just after the macro expansion. 
725 Loc = SM.getExpansionRange(Loc).second; 726 } 727 728 unsigned Len = Lexer::MeasureTokenLength(Loc, SM, Features); 729 if (Len > Offset) 730 Len = Len - Offset; 731 else 732 return Loc; 733 734 return Loc.getLocWithOffset(Len); 735 } 736 737 /// \brief Returns true if the given MacroID location points at the first 738 /// token of the macro expansion. 739 bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc, 740 const SourceManager &SM, 741 const LangOptions &LangOpts) { 742 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc"); 743 744 std::pair<FileID, unsigned> infoLoc = SM.getDecomposedLoc(loc); 745 // FIXME: If the token comes from the macro token paste operator ('##') 746 // this function will always return false; 747 if (infoLoc.second > 0) 748 return false; // Does not point at the start of token. 749 750 SourceLocation expansionLoc = 751 SM.getSLocEntry(infoLoc.first).getExpansion().getExpansionLocStart(); 752 if (expansionLoc.isFileID()) 753 return true; // No other macro expansions, this is the first. 754 755 return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts); 756 } 757 758 /// \brief Returns true if the given MacroID location points at the last 759 /// token of the macro expansion. 760 bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc, 761 const SourceManager &SM, 762 const LangOptions &LangOpts) { 763 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc"); 764 765 SourceLocation spellLoc = SM.getSpellingLoc(loc); 766 unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts); 767 if (tokLen == 0) 768 return false; 769 770 FileID FID = SM.getFileID(loc); 771 SourceLocation afterLoc = loc.getLocWithOffset(tokLen+1); 772 if (SM.isInFileID(afterLoc, FID)) 773 return false; // Still in the same FileID, does not point to the last token. 
774 775 // FIXME: If the token comes from the macro token paste operator ('##') 776 // or the stringify operator ('#') this function will always return false; 777 778 SourceLocation expansionLoc = 779 SM.getSLocEntry(FID).getExpansion().getExpansionLocEnd(); 780 if (expansionLoc.isFileID()) 781 return true; // No other macro expansions. 782 783 return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts); 784 } 785 786 //===----------------------------------------------------------------------===// 787 // Character information. 788 //===----------------------------------------------------------------------===// 789 790 enum { 791 CHAR_HORZ_WS = 0x01, // ' ', '\t', '\f', '\v'. Note, no '\0' 792 CHAR_VERT_WS = 0x02, // '\r', '\n' 793 CHAR_LETTER = 0x04, // a-z,A-Z 794 CHAR_NUMBER = 0x08, // 0-9 795 CHAR_UNDER = 0x10, // _ 796 CHAR_PERIOD = 0x20, // . 797 CHAR_RAWDEL = 0x40 // {}[]#<>%:;?*+-/^&|~!=,"' 798 }; 799 800 // Statically initialize CharInfo table based on ASCII character set 801 // Reference: FreeBSD 7.2 /usr/share/misc/ascii 802 static const unsigned char CharInfo[256] = 803 { 804 // 0 NUL 1 SOH 2 STX 3 ETX 805 // 4 EOT 5 ENQ 6 ACK 7 BEL 806 0 , 0 , 0 , 0 , 807 0 , 0 , 0 , 0 , 808 // 8 BS 9 HT 10 NL 11 VT 809 //12 NP 13 CR 14 SO 15 SI 810 0 , CHAR_HORZ_WS, CHAR_VERT_WS, CHAR_HORZ_WS, 811 CHAR_HORZ_WS, CHAR_VERT_WS, 0 , 0 , 812 //16 DLE 17 DC1 18 DC2 19 DC3 813 //20 DC4 21 NAK 22 SYN 23 ETB 814 0 , 0 , 0 , 0 , 815 0 , 0 , 0 , 0 , 816 //24 CAN 25 EM 26 SUB 27 ESC 817 //28 FS 29 GS 30 RS 31 US 818 0 , 0 , 0 , 0 , 819 0 , 0 , 0 , 0 , 820 //32 SP 33 ! 34 " 35 # 821 //36 $ 37 % 38 & 39 ' 822 CHAR_HORZ_WS, CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , 823 0 , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , 824 //40 ( 41 ) 42 * 43 + 825 //44 , 45 - 46 . 
//44 ,    45 -    46 .    47 /
  0           , 0           , CHAR_RAWDEL , CHAR_RAWDEL ,
  CHAR_RAWDEL , CHAR_RAWDEL , CHAR_PERIOD , CHAR_RAWDEL ,
//48 0    49 1    50 2    51 3
//52 4    53 5    54 6    55 7
  CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER ,
  CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER ,
//56 8    57 9    58 :    59 ;
//60 <    61 =    62 >    63 ?
  CHAR_NUMBER , CHAR_NUMBER , CHAR_RAWDEL , CHAR_RAWDEL ,
  CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
//64 @    65 A    66 B    67 C
//68 D    69 E    70 F    71 G
  0           , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
  CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
//72 H    73 I    74 J    75 K
//76 L    77 M    78 N    79 O
  CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
  CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
//80 P    81 Q    82 R    83 S
//84 T    85 U    86 V    87 W
  CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
  CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
//88 X    89 Y    90 Z    91 [
//92 \    93 ]    94 ^    95 _
  CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_RAWDEL ,
  0           , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_UNDER  ,
//96 `    97 a    98 b    99 c
//100 d   101 e   102 f   103 g
  0           , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
  CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
//104 h   105 i   106 j   107 k
//108 l   109 m   110 n   111 o
  CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
  CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
//112 p   113 q   114 r   115 s
//116 t   117 u   118 v   119 w
  CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
  CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
//120 x   121 y   122 z   123 {
//124 |   125 }   126 ~   127 DEL
  CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_RAWDEL ,
  CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , 0
};

/// InitCharacterInfo - Sanity-check the statically-initialized CharInfo table
/// against the CHAR_* classification flags.  The checks are asserts only (they
/// compile away in release builds) and are guarded so they run at most once.
static void InitCharacterInfo() {
  static bool isInited = false;
  if (isInited) return;
  // check the statically-initialized CharInfo table
  assert(CHAR_HORZ_WS == CharInfo[(int)' ']);
  assert(CHAR_HORZ_WS == CharInfo[(int)'\t']);
  assert(CHAR_HORZ_WS == CharInfo[(int)'\f']);
  assert(CHAR_HORZ_WS == CharInfo[(int)'\v']);
  assert(CHAR_VERT_WS == CharInfo[(int)'\n']);
  assert(CHAR_VERT_WS == CharInfo[(int)'\r']);
  assert(CHAR_UNDER  == CharInfo[(int)'_']);
  assert(CHAR_PERIOD == CharInfo[(int)'.']);
  for (unsigned i = 'a'; i <= 'z'; ++i) {
    assert(CHAR_LETTER == CharInfo[i]);
    assert(CHAR_LETTER == CharInfo[i+'A'-'a']);  // corresponding uppercase
  }
  for (unsigned i = '0'; i <= '9'; ++i)
    assert(CHAR_NUMBER == CharInfo[i]);

  isInited = true;
}


/// isIdentifierBody - Return true if this is the body character of an
/// identifier, which is [a-zA-Z0-9_].
static inline bool isIdentifierBody(unsigned char c) {
  return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER)) ? true : false;
}

/// isHorizontalWhitespace - Return true if this character is horizontal
/// whitespace: ' ', '\t', '\f', '\v'.  Note that this returns false for '\0'.
static inline bool isHorizontalWhitespace(unsigned char c) {
  return (CharInfo[c] & CHAR_HORZ_WS) ? true : false;
}

/// isVerticalWhitespace - Return true if this character is vertical
/// whitespace: '\n', '\r'.  Note that this returns false for '\0'.
static inline bool isVerticalWhitespace(unsigned char c) {
  return (CharInfo[c] & CHAR_VERT_WS) ? true : false;
}

/// isWhitespace - Return true if this character is horizontal or vertical
/// whitespace: ' ', '\t', '\f', '\v', '\n', '\r'.  Note that this returns
/// false for '\0'.
static inline bool isWhitespace(unsigned char c) {
  return (CharInfo[c] & (CHAR_HORZ_WS|CHAR_VERT_WS)) ? true : false;
}

/// isNumberBody - Return true if this is the body character of a
/// preprocessing number, which is [a-zA-Z0-9_.].
920 static inline bool isNumberBody(unsigned char c) { 921 return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD)) ? 922 true : false; 923 } 924 925 /// isRawStringDelimBody - Return true if this is the body character of a 926 /// raw string delimiter. 927 static inline bool isRawStringDelimBody(unsigned char c) { 928 return (CharInfo[c] & 929 (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD|CHAR_RAWDEL)) ? 930 true : false; 931 } 932 933 934 //===----------------------------------------------------------------------===// 935 // Diagnostics forwarding code. 936 //===----------------------------------------------------------------------===// 937 938 /// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the 939 /// lexer buffer was all expanded at a single point, perform the mapping. 940 /// This is currently only used for _Pragma implementation, so it is the slow 941 /// path of the hot getSourceLocation method. Do not allow it to be inlined. 942 static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc( 943 Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen); 944 static SourceLocation GetMappedTokenLoc(Preprocessor &PP, 945 SourceLocation FileLoc, 946 unsigned CharNo, unsigned TokLen) { 947 assert(FileLoc.isMacroID() && "Must be a macro expansion"); 948 949 // Otherwise, we're lexing "mapped tokens". This is used for things like 950 // _Pragma handling. Combine the expansion location of FileLoc with the 951 // spelling location. 952 SourceManager &SM = PP.getSourceManager(); 953 954 // Create a new SLoc which is expanded from Expansion(FileLoc) but whose 955 // characters come from spelling(FileLoc)+Offset. 956 SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc); 957 SpellingLoc = SpellingLoc.getLocWithOffset(CharNo); 958 959 // Figure out the expansion loc range, which is the range covered by the 960 // original _Pragma(...) sequence. 
961 std::pair<SourceLocation,SourceLocation> II = 962 SM.getImmediateExpansionRange(FileLoc); 963 964 return SM.createExpansionLoc(SpellingLoc, II.first, II.second, TokLen); 965 } 966 967 /// getSourceLocation - Return a source location identifier for the specified 968 /// offset in the current file. 969 SourceLocation Lexer::getSourceLocation(const char *Loc, 970 unsigned TokLen) const { 971 assert(Loc >= BufferStart && Loc <= BufferEnd && 972 "Location out of range for this buffer!"); 973 974 // In the normal case, we're just lexing from a simple file buffer, return 975 // the file id from FileLoc with the offset specified. 976 unsigned CharNo = Loc-BufferStart; 977 if (FileLoc.isFileID()) 978 return FileLoc.getLocWithOffset(CharNo); 979 980 // Otherwise, this is the _Pragma lexer case, which pretends that all of the 981 // tokens are lexed from where the _Pragma was defined. 982 assert(PP && "This doesn't work on raw lexers"); 983 return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen); 984 } 985 986 /// Diag - Forwarding function for diagnostics. This translate a source 987 /// position in the current buffer into a SourceLocation object for rendering. 988 DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const { 989 return PP->Diag(getSourceLocation(Loc), DiagID); 990 } 991 992 //===----------------------------------------------------------------------===// 993 // Trigraph and Escaped Newline Handling Code. 994 //===----------------------------------------------------------------------===// 995 996 /// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair, 997 /// return the decoded trigraph letter it corresponds to, or '\0' if nothing. 
998 static char GetTrigraphCharForLetter(char Letter) { 999 switch (Letter) { 1000 default: return 0; 1001 case '=': return '#'; 1002 case ')': return ']'; 1003 case '(': return '['; 1004 case '!': return '|'; 1005 case '\'': return '^'; 1006 case '>': return '}'; 1007 case '/': return '\\'; 1008 case '<': return '{'; 1009 case '-': return '~'; 1010 } 1011 } 1012 1013 /// DecodeTrigraphChar - If the specified character is a legal trigraph when 1014 /// prefixed with ??, emit a trigraph warning. If trigraphs are enabled, 1015 /// return the result character. Finally, emit a warning about trigraph use 1016 /// whether trigraphs are enabled or not. 1017 static char DecodeTrigraphChar(const char *CP, Lexer *L) { 1018 char Res = GetTrigraphCharForLetter(*CP); 1019 if (!Res || !L) return Res; 1020 1021 if (!L->getFeatures().Trigraphs) { 1022 if (!L->isLexingRawMode()) 1023 L->Diag(CP-2, diag::trigraph_ignored); 1024 return 0; 1025 } 1026 1027 if (!L->isLexingRawMode()) 1028 L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1); 1029 return Res; 1030 } 1031 1032 /// getEscapedNewLineSize - Return the size of the specified escaped newline, 1033 /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a 1034 /// trigraph equivalent on entry to this function. 1035 unsigned Lexer::getEscapedNewLineSize(const char *Ptr) { 1036 unsigned Size = 0; 1037 while (isWhitespace(Ptr[Size])) { 1038 ++Size; 1039 1040 if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r') 1041 continue; 1042 1043 // If this is a \r\n or \n\r, skip the other half. 1044 if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') && 1045 Ptr[Size-1] != Ptr[Size]) 1046 ++Size; 1047 1048 return Size; 1049 } 1050 1051 // Not an escaped newline, must be a \t or something else. 1052 return 0; 1053 } 1054 1055 /// SkipEscapedNewLines - If P points to an escaped newline (or a series of 1056 /// them), skip over them and return the first non-escaped-newline found, 1057 /// otherwise return P. 
const char *Lexer::SkipEscapedNewLines(const char *P) {
  while (1) {
    const char *AfterEscape;
    if (*P == '\\') {
      AfterEscape = P+1;
    } else if (*P == '?') {
      // If not a trigraph for escape, bail out.
      if (P[1] != '?' || P[2] != '/')
        return P;
      AfterEscape = P+3;
    } else {
      return P;
    }

    unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
    if (NewLineSize == 0) return P;
    P = AfterEscape+NewLineSize;
  }
}

/// \brief Checks that the given token is the first token that occurs after the
/// given location (this excludes comments and whitespace). Returns the location
/// immediately after the specified token. If the token is not found or the
/// location is inside a macro, the returned source location will be invalid.
SourceLocation Lexer::findLocationAfterToken(SourceLocation Loc,
                                        tok::TokenKind TKind,
                                        const SourceManager &SM,
                                        const LangOptions &LangOpts,
                                        bool SkipTrailingWhitespaceAndNewLine) {
  if (Loc.isMacroID()) {
    // Only look past a macro if Loc is at the very end of its expansion;
    // otherwise "after the token" would be ambiguous, so give up.
    if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts))
      return SourceLocation();
    Loc = SM.getExpansionRange(Loc).second;
  }
  Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts);

  // Break down the source location.
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);

  // Try to load the file buffer.
  bool InvalidTemp = false;
  llvm::StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
  if (InvalidTemp)
    return SourceLocation();

  const char *TokenBegin = File.data() + LocInfo.second;

  // Lex from the start of the given location.
  Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
              TokenBegin, File.end());
  // Find the token.
  Token Tok;
  lexer.LexFromRawLexer(Tok);
  if (Tok.isNot(TKind))
    return SourceLocation();
  SourceLocation TokenLoc = Tok.getLocation();

  // Calculate how much whitespace needs to be skipped if any.
  unsigned NumWhitespaceChars = 0;
  if (SkipTrailingWhitespaceAndNewLine) {
    const char *TokenEnd = SM.getCharacterData(TokenLoc) +
                           Tok.getLength();
    unsigned char C = *TokenEnd;
    while (isHorizontalWhitespace(C)) {
      C = *(++TokenEnd);
      NumWhitespaceChars++;
    }
    // Skip a single trailing newline character, but not the next line.
    if (isVerticalWhitespace(C))
      NumWhitespaceChars++;
  }

  return TokenLoc.getLocWithOffset(Tok.getLength() + NumWhitespaceChars);
}

/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
/// get its size, and return it.  This is tricky in several cases:
///   1. If currently at the start of a trigraph, we warn about the trigraph,
///      then either return the trigraph (skipping 3 chars) or the '?',
///      depending on whether trigraphs are enabled or not.
///   2. If this is an escaped newline (potentially with whitespace between
///      the backslash and newline), implicitly skip the newline and return
///      the char after it.
///   3. If this is a UCN, return it.  FIXME: C++ UCN's?
///
/// This handles the slow/uncommon case of the getCharAndSize method.  Here we
/// know that we can accumulate into Size, and that we have already incremented
/// Ptr by Size bytes.
///
/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
/// be updated to match.
///
char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
                               Token *Tok) {
  // If we have a slash, look for an escaped newline.
  if (Ptr[0] == '\\') {
    ++Size;
    ++Ptr;
Slash:
    // Common case, backslash-char where the char is not whitespace.
    if (!isWhitespace(Ptr[0])) return '\\';

    // See if we have optional whitespace characters between the slash and
    // newline.
    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
      // Remember that this token needs to be cleaned.
      if (Tok) Tok->setFlag(Token::NeedsCleaning);

      // Warn if there was whitespace between the backslash and newline.
      if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode())
        Diag(Ptr, diag::backslash_newline_space);

      // Found backslash<whitespace><newline>.  Parse the char after it.
      Size += EscapedNewLineSize;
      Ptr  += EscapedNewLineSize;
      // Use slow version to accumulate a correct size field.
      return getCharAndSizeSlow(Ptr, Size, Tok);
    }

    // Otherwise, this is not an escaped newline, just return the slash.
    return '\\';
  }

  // If this is a trigraph, process it.
  if (Ptr[0] == '?' && Ptr[1] == '?') {
    // If this is actually a legal trigraph (not something like "??x"), emit
    // a trigraph warning.  If so, and if trigraphs are enabled, return it.
    if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : 0)) {
      // Remember that this token needs to be cleaned.
      if (Tok) Tok->setFlag(Token::NeedsCleaning);

      Ptr += 3;
      Size += 3;
      // A trigraph may decode to a backslash, which can in turn begin an
      // escaped newline; re-enter the slash handling above.
      if (C == '\\') goto Slash;
      return C;
    }
  }

  // If this is neither, return a single character.
  ++Size;
  return *Ptr;
}


/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
/// getCharAndSizeNoWarn method.  Here we know that we can accumulate into
/// Size, and that we have already incremented Ptr by Size bytes.
///
/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
/// be updated to match.
char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
                                     const LangOptions &Features) {
  // If we have a slash, look for an escaped newline.
  if (Ptr[0] == '\\') {
    ++Size;
    ++Ptr;
Slash:
    // Common case, backslash-char where the char is not whitespace.
    if (!isWhitespace(Ptr[0])) return '\\';

    // See if we have optional whitespace characters followed by a newline.
    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
      // Found backslash<whitespace><newline>.  Parse the char after it.
      Size += EscapedNewLineSize;
      Ptr  += EscapedNewLineSize;

      // Use slow version to accumulate a correct size field.
      return getCharAndSizeSlowNoWarn(Ptr, Size, Features);
    }

    // Otherwise, this is not an escaped newline, just return the slash.
    return '\\';
  }

  // If this is a trigraph, process it.
  if (Features.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
    // If this is actually a legal trigraph (not something like "??x"), return
    // it.
    if (char C = GetTrigraphCharForLetter(Ptr[2])) {
      Ptr += 3;
      Size += 3;
      // The decoded trigraph may be a backslash starting an escaped newline.
      if (C == '\\') goto Slash;
      return C;
    }
  }

  // If this is neither, return a single character.
  ++Size;
  return *Ptr;
}

//===----------------------------------------------------------------------===//
// Helper methods for lexing.
//===----------------------------------------------------------------------===//

/// \brief Routine that indiscriminately skips bytes in the source file.
void Lexer::SkipBytes(unsigned Bytes, bool StartOfLine) {
  BufferPtr += Bytes;
  // Never step past the end of the buffer (where the terminating nul lives).
  if (BufferPtr > BufferEnd)
    BufferPtr = BufferEnd;
  IsAtStartOfLine = StartOfLine;
}

void Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
  // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
  unsigned Size;
  unsigned char C = *CurPtr++;
  while (isIdentifierBody(C))
    C = *CurPtr++;

  --CurPtr;   // Back up over the skipped character.

  // Fast path, no $,\,? in identifier found.  '\' might be an escaped newline
  // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
  // FIXME: UCNs.
  //
  // TODO: Could merge these checks into a CharInfo flag to make the comparison
  // cheaper
  if (C != '\\' && C != '?' && (C != '$' || !Features.DollarIdents)) {
FinishIdentifier:
    const char *IdStart = BufferPtr;
    FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
    Result.setRawIdentifierData(IdStart);

    // If we are in raw mode, return this identifier raw.  There is no need to
    // look up identifier information or attempt to macro expand it.
    if (LexingRawMode)
      return;

    // Fill in Result.IdentifierInfo and update the token kind,
    // looking up the identifier in the identifier table.
    IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);

    // Finally, now that we know we have an identifier, pass this off to the
    // preprocessor, which may macro expand it or something.
    if (II->isHandleIdentifierCase())
      PP->HandleIdentifier(Result);

    return;
  }

  // Otherwise, $,\,? in identifier found.  Enter slower path.

  C = getCharAndSize(CurPtr, Size);
  while (1) {
    if (C == '$') {
      // If we hit a $ and they are not supported in identifiers, we are done.
      if (!Features.DollarIdents) goto FinishIdentifier;

      // Otherwise, emit a diagnostic and continue.
      if (!isLexingRawMode())
        Diag(CurPtr, diag::ext_dollar_in_identifier);
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      C = getCharAndSize(CurPtr, Size);
      continue;
    } else if (!isIdentifierBody(C)) { // FIXME: UCNs.
      // Found end of identifier.
      goto FinishIdentifier;
    }

    // Otherwise, this character is good, consume it.
    CurPtr = ConsumeChar(CurPtr, Size, Result);

    C = getCharAndSize(CurPtr, Size);
    while (isIdentifierBody(C)) { // FIXME: UCNs.
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      C = getCharAndSize(CurPtr, Size);
    }
  }
}

/// isHexaLiteral - Return true if Start points to a hex constant.  Used in
/// Microsoft mode, where something like "0x1234e+1" is supposed to lex as
/// several different tokens rather than one pp-number.
static bool isHexaLiteral(const char *Start, const LangOptions &Features) {
  unsigned Size;
  char C1 = Lexer::getCharAndSizeNoWarn(Start, Size, Features);
  if (C1 != '0')
    return false;
  char C2 = Lexer::getCharAndSizeNoWarn(Start + Size, Size, Features);
  return (C2 == 'x' || C2 == 'X');
}

/// LexNumericConstant - Lex the remainder of an integer or floating point
/// constant.  From[-1] is the first character lexed.  Return the end of the
/// constant.
void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
  unsigned Size;
  char C = getCharAndSize(CurPtr, Size);
  char PrevCh = 0;
  while (isNumberBody(C)) { // FIXME: UCNs?
    CurPtr = ConsumeChar(CurPtr, Size, Result);
    PrevCh = C;
    C = getCharAndSize(CurPtr, Size);
  }

  // If we fell out, check for a sign, due to 1e+12.  If we have one, continue.
  if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) {
    // If we are in Microsoft mode, don't continue if the constant is hex.
    // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
    if (!Features.MicrosoftExt || !isHexaLiteral(BufferPtr, Features))
      return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
  }

  // If we have a hex FP constant, continue.
  if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p'))
    return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
  Result.setLiteralData(TokStart);
}

/// LexStringLiteral - Lex the remainder of a string literal, after having
/// lexed either " or L" or u8" or u" or U".
void Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
                             tok::TokenKind Kind) {
  const char *NulCharacter = 0; // Does this string contain the \0 character?

  if (!isLexingRawMode() &&
      (Kind == tok::utf8_string_literal ||
       Kind == tok::utf16_string_literal ||
       Kind == tok::utf32_string_literal))
    Diag(BufferPtr, diag::warn_cxx98_compat_unicode_literal);

  char C = getAndAdvanceChar(CurPtr, Result);
  while (C != '"') {
    // Skip escaped characters.  Escaped newlines will already be processed by
    // getAndAdvanceChar.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||             // Newline.
        (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
      if (!isLexingRawMode() && !Features.AsmPreprocessor)
        Diag(BufferPtr, diag::warn_unterminated_string);
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return;
    }

    if (C == 0) {
      // A nul inside the string is either the code-completion point or an
      // embedded nul character, which we warn about below.
      if (isCodeCompletionPoint(CurPtr-1)) {
        PP->CodeCompleteNaturalLanguage();
        FormTokenWithChars(Result, CurPtr-1, tok::unknown);
        return cutOffLexing();
      }

      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If a nul character existed in the string, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_string);

  // Update the location of the token as well as the BufferPtr instance var.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
}

/// LexRawStringLiteral - Lex the remainder of a raw string literal, after
/// having lexed R", LR", u8R", uR", or UR".
void Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
                                tok::TokenKind Kind) {
  // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
  //  Between the initial and final double quote characters of the raw string,
  //  any transformations performed in phases 1 and 2 (trigraphs,
  //  universal-character-names, and line splicing) are reverted.

  if (!isLexingRawMode())
    Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);

  unsigned PrefixLen = 0;

  // A raw string delimiter is at most 16 characters long.
  while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen]))
    ++PrefixLen;

  // If the last character was not a '(', then we didn't lex a valid delimiter.
  if (CurPtr[PrefixLen] != '(') {
    if (!isLexingRawMode()) {
      const char *PrefixEnd = &CurPtr[PrefixLen];
      if (PrefixLen == 16) {
        Diag(PrefixEnd, diag::err_raw_delim_too_long);
      } else {
        Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
          << StringRef(PrefixEnd, 1);
      }
    }

    // Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
    // it's possible the '"' was intended to be part of the raw string, but
    // there's not much we can do about that.
    while (1) {
      char C = *CurPtr++;

      if (C == '"')
        break;
      if (C == 0 && CurPtr-1 == BufferEnd) {
        --CurPtr;   // Don't step past the end-of-buffer nul.
        break;
      }
    }

    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return;
  }

  // Save prefix and move CurPtr past it
  const char *Prefix = CurPtr;
  CurPtr += PrefixLen + 1; // skip over prefix and '('

  // Scan for the closing ')prefix"' sequence.
  while (1) {
    char C = *CurPtr++;

    if (C == ')') {
      // Check for prefix match and closing quote.
      if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') {
        CurPtr += PrefixLen + 1; // skip over prefix and '"'
        break;
      }
    } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file.
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::err_unterminated_raw_string)
          << StringRef(Prefix, PrefixLen);
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return;
    }
  }

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
}

/// LexAngledStringLiteral - Lex the remainder of an angled string literal,
/// after having lexed the '<' character.  This is used for #include filenames.
void Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
  const char *NulCharacter = 0; // Does this string contain the \0 character?
  const char *AfterLessPos = CurPtr;
  char C = getAndAdvanceChar(CurPtr, Result);
  while (C != '>') {
    // Skip escaped characters.
    if (C == '\\') {
      // Skip the escaped character.
      C = getAndAdvanceChar(CurPtr, Result);
    } else if (C == '\n' || C == '\r' ||             // Newline.
               (C == 0 && (CurPtr-1 == BufferEnd ||  // End of file.
                           isCodeCompletionPoint(CurPtr-1)))) {
      // If the filename is unterminated, then it must just be a lone <
      // character.  Return this as such.
      FormTokenWithChars(Result, AfterLessPos, tok::less);
      return;
    } else if (C == 0) {
      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If a nul character existed in the string, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_string);

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::angle_string_literal);
  Result.setLiteralData(TokStart);
}


/// LexCharConstant - Lex the remainder of a character constant, after having
/// lexed either ' or L' or u' or U'.
void Lexer::LexCharConstant(Token &Result, const char *CurPtr,
                            tok::TokenKind Kind) {
  const char *NulCharacter = 0; // Does this character contain the \0 character?

  if (!isLexingRawMode() &&
      (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant))
    Diag(BufferPtr, diag::warn_cxx98_compat_unicode_literal);

  char C = getAndAdvanceChar(CurPtr, Result);
  if (C == '\'') {
    // '' is not a valid character constant.
    if (!isLexingRawMode() && !Features.AsmPreprocessor)
      Diag(BufferPtr, diag::err_empty_character);
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return;
  }

  while (C != '\'') {
    // Skip escaped characters.
    if (C == '\\') {
      // Skip the escaped character.
      // FIXME: UCN's
      C = getAndAdvanceChar(CurPtr, Result);
    } else if (C == '\n' || C == '\r' ||            // Newline.
               (C == 0 && CurPtr-1 == BufferEnd)) { // End of file.
      if (!isLexingRawMode() && !Features.AsmPreprocessor)
        Diag(BufferPtr, diag::warn_unterminated_char);
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return;
    } else if (C == 0) {
      // A nul is either the code-completion point or an embedded nul
      // character, which we warn about below.
      if (isCodeCompletionPoint(CurPtr-1)) {
        PP->CodeCompleteNaturalLanguage();
        FormTokenWithChars(Result, CurPtr-1, tok::unknown);
        return cutOffLexing();
      }

      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If a nul character existed in the character, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char);

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
}

/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
/// Update BufferPtr to point to the next non-whitespace character and return.
///
/// This method forms a token and returns true if KeepWhitespaceMode is
/// enabled.
///
bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr) {
  // Whitespace - Skip it, then return the token after the whitespace.
  unsigned char Char = *CurPtr;  // Skip consecutive spaces efficiently.
  while (1) {
    // Skip horizontal whitespace very aggressively.
    while (isHorizontalWhitespace(Char))
      Char = *++CurPtr;

    // Otherwise if we have something other than whitespace, we're done.
    if (Char != '\n' && Char != '\r')
      break;

    if (ParsingPreprocessorDirective) {
      // End of preprocessor directive line, let LexTokenInternal handle this.
      BufferPtr = CurPtr;
      return false;
    }

    // ok, but handle newline.
    // The returned token is at the start of the line.
    Result.setFlag(Token::StartOfLine);
    // No leading whitespace seen so far.
    Result.clearFlag(Token::LeadingSpace);
    Char = *++CurPtr;
  }

  // If this isn't immediately after a newline, there is leading space.
  char PrevChar = CurPtr[-1];
  if (PrevChar != '\n' && PrevChar != '\r')
    Result.setFlag(Token::LeadingSpace);

  // If the client wants us to return whitespace, return it now.
  if (isKeepWhitespaceMode()) {
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return true;
  }

  BufferPtr = CurPtr;
  return false;
}

/// SkipBCPLComment - We have just read the // characters from input.  Skip
/// until we find the newline character that terminates the comment.  Then
/// update BufferPtr and return.
///
/// If we're in KeepCommentMode or any CommentHandler has inserted
/// some tokens, this will store the first token and return true.
bool Lexer::SkipBCPLComment(Token &Result, const char *CurPtr) {
  // If BCPL comments aren't explicitly enabled for this language, emit an
  // extension warning.
  if (!Features.BCPLComment && !isLexingRawMode()) {
    Diag(BufferPtr, diag::ext_bcpl_comment);

    // Mark them enabled so we only emit one warning for this translation
    // unit.
    Features.BCPLComment = true;
  }

  // Scan over the body of the comment.  The common case, when scanning, is
  // that the comment contains normal ascii characters with nothing interesting
  // in them.  As such, optimize for this case with the inner loop.
  char C;
  do {
    C = *CurPtr;
    // Skip over characters in the fast loop.
    while (C != 0 &&                // Potentially EOF.
           C != '\n' && C != '\r')  // Newline or DOS-style newline.
      C = *++CurPtr;

    const char *NextLine = CurPtr;
    if (C != 0) {
      // We found a newline, see if it's escaped.
      const char *EscapePtr = CurPtr-1;
      while (isHorizontalWhitespace(*EscapePtr)) // Skip whitespace.
        --EscapePtr;

      if (*EscapePtr == '\\') // Escaped newline.
        CurPtr = EscapePtr;
      else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
               EscapePtr[-2] == '?') // Trigraph-escaped newline.
        CurPtr = EscapePtr-2;
      else
        break; // This is a newline, we're done.

      C = *CurPtr;
    }

    // Otherwise, this is a hard case.  Fall back on getAndAdvanceChar to
    // properly decode the character.  Read it in raw mode to avoid emitting
    // diagnostics about things like trigraphs.  If we see an escaped newline,
    // we'll handle it below.
    const char *OldPtr = CurPtr;
    bool OldRawMode = isLexingRawMode();
    LexingRawMode = true;
    C = getAndAdvanceChar(CurPtr, Result);
    LexingRawMode = OldRawMode;

    // If we only read only one character, then no special handling is needed.
    // We're done and can skip forward to the newline.
    if (C != 0 && CurPtr == OldPtr+1) {
      CurPtr = NextLine;
      break;
    }

    // If the char that we finally got was a \n, then we must have had
    // something like \<newline><newline>.  We don't want to have consumed the
    // second newline, we want CurPtr, to end up pointing to it down below.
    if (C == '\n' || C == '\r') {
      --CurPtr;
      C = 'x'; // doesn't matter what this is.
    }

    // If we read multiple characters, and one of those characters was a \r or
    // \n, then we had an escaped newline within the comment.  Emit diagnostic
    // unless the next line is also a // comment.
    if (CurPtr != OldPtr+1 && C != '/' && CurPtr[0] != '/') {
      for (; OldPtr != CurPtr; ++OldPtr)
        if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
          // Okay, we found a // comment that ends in a newline, if the next
          // line is also a // comment, but has spaces, don't emit a
          // diagnostic.
          if (isWhitespace(C)) {
            const char *ForwardPtr = CurPtr;
            while (isWhitespace(*ForwardPtr)) // Skip whitespace.
              ++ForwardPtr;
            if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
              break;
          }

          if (!isLexingRawMode())
            Diag(OldPtr-1, diag::ext_multi_line_bcpl_comment);
          break;
        }
    }

    if (CurPtr == BufferEnd+1) {
      // We stepped past the end-of-buffer nul; back up onto it and stop.
      --CurPtr;
      break;
    }

    if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
      PP->CodeCompleteNaturalLanguage();
      cutOffLexing();
      return false;
    }

  } while (C != '\n' && C != '\r');

  // Found but did not consume the newline.  Notify comment handlers about the
  // comment unless we're in a #if 0 block.
  if (PP && !isLexingRawMode() &&
      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
                                            getSourceLocation(CurPtr)))) {
    BufferPtr = CurPtr;
    return true; // A token has to be returned.
  }

  // If we are returning comments as tokens, return this comment as a token.
  if (inKeepCommentMode())
    return SaveBCPLComment(Result, CurPtr);

  // If we are inside a preprocessor directive and we see the end of line,
  // return immediately, so that the lexer can return this as an EOD token.
  if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
    BufferPtr = CurPtr;
    return false;
  }

  // Otherwise, eat the \n character.  We don't care if this is a \n\r or
  // \r\n sequence.  This is an efficiency hack (because we know the \n can't
  // contribute to another token), it isn't needed for correctness.  Note that
  // this is ok even in KeepWhitespaceMode, because we would have returned the
  // comment above in that mode.
  ++CurPtr;

  // The next returned token is at the start of the line.
  Result.setFlag(Token::StartOfLine);
  // No leading whitespace seen so far.
1763 Result.clearFlag(Token::LeadingSpace); 1764 BufferPtr = CurPtr; 1765 return false; 1766 } 1767 1768 /// SaveBCPLComment - If in save-comment mode, package up this BCPL comment in 1769 /// an appropriate way and return it. 1770 bool Lexer::SaveBCPLComment(Token &Result, const char *CurPtr) { 1771 // If we're not in a preprocessor directive, just return the // comment 1772 // directly. 1773 FormTokenWithChars(Result, CurPtr, tok::comment); 1774 1775 if (!ParsingPreprocessorDirective) 1776 return true; 1777 1778 // If this BCPL-style comment is in a macro definition, transmogrify it into 1779 // a C-style block comment. 1780 bool Invalid = false; 1781 std::string Spelling = PP->getSpelling(Result, &Invalid); 1782 if (Invalid) 1783 return true; 1784 1785 assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not bcpl comment?"); 1786 Spelling[1] = '*'; // Change prefix to "/*". 1787 Spelling += "*/"; // add suffix. 1788 1789 Result.setKind(tok::comment); 1790 PP->CreateString(&Spelling[0], Spelling.size(), Result, 1791 Result.getLocation(), Result.getLocation()); 1792 return true; 1793 } 1794 1795 /// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline 1796 /// character (either \n or \r) is part of an escaped newline sequence. Issue a 1797 /// diagnostic if so. We know that the newline is inside of a block comment. 1798 static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, 1799 Lexer *L) { 1800 assert(CurPtr[0] == '\n' || CurPtr[0] == '\r'); 1801 1802 // Back up off the newline. 1803 --CurPtr; 1804 1805 // If this is a two-character newline sequence, skip the other character. 1806 if (CurPtr[0] == '\n' || CurPtr[0] == '\r') { 1807 // \n\n or \r\r -> not escaped newline. 1808 if (CurPtr[0] == CurPtr[1]) 1809 return false; 1810 // \n\r or \r\n -> skip the newline. 1811 --CurPtr; 1812 } 1813 1814 // If we have horizontal whitespace, skip over it. We allow whitespace 1815 // between the slash and newline. 
1816 bool HasSpace = false; 1817 while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) { 1818 --CurPtr; 1819 HasSpace = true; 1820 } 1821 1822 // If we have a slash, we know this is an escaped newline. 1823 if (*CurPtr == '\\') { 1824 if (CurPtr[-1] != '*') return false; 1825 } else { 1826 // It isn't a slash, is it the ?? / trigraph? 1827 if (CurPtr[0] != '/' || CurPtr[-1] != '?' || CurPtr[-2] != '?' || 1828 CurPtr[-3] != '*') 1829 return false; 1830 1831 // This is the trigraph ending the comment. Emit a stern warning! 1832 CurPtr -= 2; 1833 1834 // If no trigraphs are enabled, warn that we ignored this trigraph and 1835 // ignore this * character. 1836 if (!L->getFeatures().Trigraphs) { 1837 if (!L->isLexingRawMode()) 1838 L->Diag(CurPtr, diag::trigraph_ignored_block_comment); 1839 return false; 1840 } 1841 if (!L->isLexingRawMode()) 1842 L->Diag(CurPtr, diag::trigraph_ends_block_comment); 1843 } 1844 1845 // Warn about having an escaped newline between the */ characters. 1846 if (!L->isLexingRawMode()) 1847 L->Diag(CurPtr, diag::escaped_newline_block_comment_end); 1848 1849 // If there was space between the backslash and newline, warn about it. 1850 if (HasSpace && !L->isLexingRawMode()) 1851 L->Diag(CurPtr, diag::backslash_newline_space); 1852 1853 return true; 1854 } 1855 1856 #ifdef __SSE2__ 1857 #include <emmintrin.h> 1858 #elif __ALTIVEC__ 1859 #include <altivec.h> 1860 #undef bool 1861 #endif 1862 1863 /// SkipBlockComment - We have just read the /* characters from input. Read 1864 /// until we find the */ characters that terminate the comment. Note that we 1865 /// don't bother decoding trigraphs or escaped newlines in block comments, 1866 /// because they cannot cause the comment to end. The only thing that can 1867 /// happen is the comment could end with an escaped newline between the */ end 1868 /// of comment. 
///
/// If we're in KeepCommentMode or any CommentHandler has inserted
/// some tokens, this will store the first token and return true.
bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) {
  // Scan one character past where we should, looking for a '/' character.  Once
  // we find it, check to see if it was preceded by a *.  This common
  // optimization helps people who like to put a lot of * characters in their
  // comments.

  // The first character we get with newlines and trigraphs skipped to handle
  // the degenerate /*/ case below correctly if the * has an escaped newline
  // after it.
  unsigned CharSize;
  unsigned char C = getCharAndSize(CurPtr, CharSize);
  CurPtr += CharSize;
  // C == 0 with CurPtr past BufferEnd means we consumed the nul sentinel that
  // terminates the buffer: the comment is unterminated at end of file.
  if (C == 0 && CurPtr == BufferEnd+1) {
    if (!isLexingRawMode())
      Diag(BufferPtr, diag::err_unterminated_block_comment);
    --CurPtr;

    // KeepWhitespaceMode should return this broken comment as a token.  Since
    // it isn't a well formed comment, just return it as an 'unknown' token.
    if (isKeepWhitespaceMode()) {
      FormTokenWithChars(Result, CurPtr, tok::unknown);
      return true;
    }

    BufferPtr = CurPtr;
    return false;
  }

  // Check to see if the first character after the '/*' is another /.  If so,
  // then this slash does not end the block comment, it is part of it.
  if (C == '/')
    C = *CurPtr++;

  while (1) {
    // Skip over all non-interesting characters until we find end of buffer or a
    // (probably ending) '/' character.
    if (CurPtr + 24 < BufferEnd &&
        // If there is a code-completion point avoid the fast scan because it
        // doesn't check for '\0'.
        !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
      // While not aligned to a 16-byte boundary.
      while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0)
        C = *CurPtr++;

      if (C == '/') goto FoundSlash;

#ifdef __SSE2__
      // Compare 16 bytes at a time against '/'; any nonzero movemask means at
      // least one slash is in this chunk.
      __m128i Slashes = _mm_set_epi8('/', '/', '/', '/', '/', '/', '/', '/',
                                     '/', '/', '/', '/', '/', '/', '/', '/');
      while (CurPtr+16 <= BufferEnd &&
             _mm_movemask_epi8(_mm_cmpeq_epi8(*(__m128i*)CurPtr, Slashes)) == 0)
        CurPtr += 16;
#elif __ALTIVEC__
      __vector unsigned char Slashes = {
        '/', '/', '/', '/', '/', '/', '/', '/',
        '/', '/', '/', '/', '/', '/', '/', '/'
      };
      while (CurPtr+16 <= BufferEnd &&
             !vec_any_eq(*(vector unsigned char*)CurPtr, Slashes))
        CurPtr += 16;
#else
      // Scan for '/' quickly.  Many block comments are very large.
      while (CurPtr[0] != '/' &&
             CurPtr[1] != '/' &&
             CurPtr[2] != '/' &&
             CurPtr[3] != '/' &&
             CurPtr+4 < BufferEnd) {
        CurPtr += 4;
      }
#endif

      // It has to be one of the bytes scanned, increment to it and read one.
      C = *CurPtr++;
    }

    // Loop to scan the remainder.
    while (C != '/' && C != '\0')
      C = *CurPtr++;

  FoundSlash:
    if (C == '/') {
      if (CurPtr[-2] == '*')  // We found the final */.  We're done!
        break;

      if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
        if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) {
          // We found the final */, though it had an escaped newline between the
          // * and /.  We're done!
          break;
        }
      }
      if (CurPtr[0] == '*' && CurPtr[1] != '/') {
        // If this is a /* inside of the comment, emit a warning.  Don't do this
        // if this is a /*/, which will end the comment.  This misses cases with
        // embedded escaped newlines, but oh well.
        if (!isLexingRawMode())
          Diag(CurPtr-1, diag::warn_nested_block_comment);
      }
    } else if (C == 0 && CurPtr == BufferEnd+1) {
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::err_unterminated_block_comment);
      // Note: the user probably forgot a */.  We could continue immediately
      // after the /*, but this would involve lexing a lot of what really is the
      // comment, which surely would confuse the parser.
      --CurPtr;

      // KeepWhitespaceMode should return this broken comment as a token.  Since
      // it isn't a well formed comment, just return it as an 'unknown' token.
      if (isKeepWhitespaceMode()) {
        FormTokenWithChars(Result, CurPtr, tok::unknown);
        return true;
      }

      BufferPtr = CurPtr;
      return false;
    } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
      // An embedded nul marking the code-completion point.
      PP->CodeCompleteNaturalLanguage();
      cutOffLexing();
      return false;
    }

    C = *CurPtr++;
  }

  // Notify comment handlers about the comment unless we're in a #if 0 block.
  if (PP && !isLexingRawMode() &&
      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
                                            getSourceLocation(CurPtr)))) {
    BufferPtr = CurPtr;
    return true; // A token has to be returned.
  }

  // If we are returning comments as tokens, return this comment as a token.
  if (inKeepCommentMode()) {
    FormTokenWithChars(Result, CurPtr, tok::comment);
    return true;
  }

  // It is common for the tokens immediately after a /**/ comment to be
  // whitespace.  Instead of going through the big switch, handle it
  // efficiently now.  This is safe even in KeepWhitespaceMode because we would
  // have already returned above with the comment as a token.
  if (isHorizontalWhitespace(*CurPtr)) {
    Result.setFlag(Token::LeadingSpace);
    SkipWhitespace(Result, CurPtr+1);
    return false;
  }

  // Otherwise, just return so that the next character will be lexed as a token.
  BufferPtr = CurPtr;
  Result.setFlag(Token::LeadingSpace);
  return false;
}

//===----------------------------------------------------------------------===//
// Primary Lexing Entry Points
//===----------------------------------------------------------------------===//

/// ReadToEndOfLine - Read the rest of the current preprocessor line as an
/// uninterpreted string.  This switches the lexer out of directive mode.
std::string Lexer::ReadToEndOfLine() {
  assert(ParsingPreprocessorDirective && ParsingFilename == false &&
         "Must be in a preprocessing directive!");
  std::string Result;
  Token Tmp;

  // CurPtr - Cache BufferPtr in an automatic variable.
  const char *CurPtr = BufferPtr;
  while (1) {
    char Char = getAndAdvanceChar(CurPtr, Tmp);
    switch (Char) {
    default:
      Result += Char;
      break;
    case 0:  // Null.
      // Found end of file?
      if (CurPtr-1 != BufferEnd) {
        if (isCodeCompletionPoint(CurPtr-1)) {
          PP->CodeCompleteNaturalLanguage();
          cutOffLexing();
          return Result;
        }

        // Nope, normal character, continue.
        Result += Char;
        break;
      }
      // FALL THROUGH.
    case '\r':
    case '\n':
      // Okay, we found the end of the line. First, back up past the \0, \r, \n.
      assert(CurPtr[-1] == Char && "Trigraphs for newline?");
      BufferPtr = CurPtr-1;

      // Next, lex the character, which should handle the EOD transition.
      Lex(Tmp);
      if (Tmp.is(tok::code_completion)) {
        if (PP)
          PP->CodeCompleteNaturalLanguage();
        Lex(Tmp);
      }
      assert(Tmp.is(tok::eod) && "Unexpected token!");

      // Finally, we're done, return the string we found.
2076 return Result; 2077 } 2078 } 2079 } 2080 2081 /// LexEndOfFile - CurPtr points to the end of this file. Handle this 2082 /// condition, reporting diagnostics and handling other edge cases as required. 2083 /// This returns true if Result contains a token, false if PP.Lex should be 2084 /// called again. 2085 bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) { 2086 // If we hit the end of the file while parsing a preprocessor directive, 2087 // end the preprocessor directive first. The next token returned will 2088 // then be the end of file. 2089 if (ParsingPreprocessorDirective) { 2090 // Done parsing the "line". 2091 ParsingPreprocessorDirective = false; 2092 // Update the location of token as well as BufferPtr. 2093 FormTokenWithChars(Result, CurPtr, tok::eod); 2094 2095 // Restore comment saving mode, in case it was disabled for directive. 2096 SetCommentRetentionState(PP->getCommentRetentionState()); 2097 return true; // Have a token. 2098 } 2099 2100 // If we are in raw mode, return this event as an EOF token. Let the caller 2101 // that put us in raw mode handle the event. 2102 if (isLexingRawMode()) { 2103 Result.startToken(); 2104 BufferPtr = BufferEnd; 2105 FormTokenWithChars(Result, BufferEnd, tok::eof); 2106 return true; 2107 } 2108 2109 // Issue diagnostics for unterminated #if and missing newline. 2110 2111 // If we are in a #if directive, emit an error. 2112 while (!ConditionalStack.empty()) { 2113 if (PP->getCodeCompletionFileLoc() != FileLoc) 2114 PP->Diag(ConditionalStack.back().IfLoc, 2115 diag::err_pp_unterminated_conditional); 2116 ConditionalStack.pop_back(); 2117 } 2118 2119 // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue 2120 // a pedwarn. 
2121 if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) 2122 Diag(BufferEnd, diag::ext_no_newline_eof) 2123 << FixItHint::CreateInsertion(getSourceLocation(BufferEnd), "\n"); 2124 2125 BufferPtr = CurPtr; 2126 2127 // Finally, let the preprocessor handle this. 2128 return PP->HandleEndOfFile(Result); 2129 } 2130 2131 /// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from 2132 /// the specified lexer will return a tok::l_paren token, 0 if it is something 2133 /// else and 2 if there are no more tokens in the buffer controlled by the 2134 /// lexer. 2135 unsigned Lexer::isNextPPTokenLParen() { 2136 assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?"); 2137 2138 // Switch to 'skipping' mode. This will ensure that we can lex a token 2139 // without emitting diagnostics, disables macro expansion, and will cause EOF 2140 // to return an EOF token instead of popping the include stack. 2141 LexingRawMode = true; 2142 2143 // Save state that can be changed while lexing so that we can restore it. 2144 const char *TmpBufferPtr = BufferPtr; 2145 bool inPPDirectiveMode = ParsingPreprocessorDirective; 2146 2147 Token Tok; 2148 Tok.startToken(); 2149 LexTokenInternal(Tok); 2150 2151 // Restore state that may have changed. 2152 BufferPtr = TmpBufferPtr; 2153 ParsingPreprocessorDirective = inPPDirectiveMode; 2154 2155 // Restore the lexer back to non-skipping mode. 2156 LexingRawMode = false; 2157 2158 if (Tok.is(tok::eof)) 2159 return 2; 2160 return Tok.is(tok::l_paren); 2161 } 2162 2163 /// FindConflictEnd - Find the end of a version control conflict marker. 2164 static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd, 2165 ConflictMarkerKind CMK) { 2166 const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>"; 2167 size_t TermLen = CMK == CMK_Perforce ? 
5 : 7; 2168 StringRef RestOfBuffer(CurPtr+TermLen, BufferEnd-CurPtr-TermLen); 2169 size_t Pos = RestOfBuffer.find(Terminator); 2170 while (Pos != StringRef::npos) { 2171 // Must occur at start of line. 2172 if (RestOfBuffer[Pos-1] != '\r' && 2173 RestOfBuffer[Pos-1] != '\n') { 2174 RestOfBuffer = RestOfBuffer.substr(Pos+TermLen); 2175 Pos = RestOfBuffer.find(Terminator); 2176 continue; 2177 } 2178 return RestOfBuffer.data()+Pos; 2179 } 2180 return 0; 2181 } 2182 2183 /// IsStartOfConflictMarker - If the specified pointer is the start of a version 2184 /// control conflict marker like '<<<<<<<', recognize it as such, emit an error 2185 /// and recover nicely. This returns true if it is a conflict marker and false 2186 /// if not. 2187 bool Lexer::IsStartOfConflictMarker(const char *CurPtr) { 2188 // Only a conflict marker if it starts at the beginning of a line. 2189 if (CurPtr != BufferStart && 2190 CurPtr[-1] != '\n' && CurPtr[-1] != '\r') 2191 return false; 2192 2193 // Check to see if we have <<<<<<< or >>>>. 2194 if ((BufferEnd-CurPtr < 8 || StringRef(CurPtr, 7) != "<<<<<<<") && 2195 (BufferEnd-CurPtr < 6 || StringRef(CurPtr, 5) != ">>>> ")) 2196 return false; 2197 2198 // If we have a situation where we don't care about conflict markers, ignore 2199 // it. 2200 if (CurrentConflictMarkerState || isLexingRawMode()) 2201 return false; 2202 2203 ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce; 2204 2205 // Check to see if there is an ending marker somewhere in the buffer at the 2206 // start of a line to terminate this conflict marker. 2207 if (FindConflictEnd(CurPtr, BufferEnd, Kind)) { 2208 // We found a match. We are really in a conflict marker. 2209 // Diagnose this, and ignore to the end of line. 2210 Diag(CurPtr, diag::err_conflict_marker); 2211 CurrentConflictMarkerState = Kind; 2212 2213 // Skip ahead to the end of line. We know this exists because the 2214 // end-of-conflict marker starts with \r or \n. 
2215 while (*CurPtr != '\r' && *CurPtr != '\n') { 2216 assert(CurPtr != BufferEnd && "Didn't find end of line"); 2217 ++CurPtr; 2218 } 2219 BufferPtr = CurPtr; 2220 return true; 2221 } 2222 2223 // No end of conflict marker found. 2224 return false; 2225 } 2226 2227 2228 /// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if 2229 /// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it 2230 /// is the end of a conflict marker. Handle it by ignoring up until the end of 2231 /// the line. This returns true if it is a conflict marker and false if not. 2232 bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) { 2233 // Only a conflict marker if it starts at the beginning of a line. 2234 if (CurPtr != BufferStart && 2235 CurPtr[-1] != '\n' && CurPtr[-1] != '\r') 2236 return false; 2237 2238 // If we have a situation where we don't care about conflict markers, ignore 2239 // it. 2240 if (!CurrentConflictMarkerState || isLexingRawMode()) 2241 return false; 2242 2243 // Check to see if we have the marker (4 characters in a row). 2244 for (unsigned i = 1; i != 4; ++i) 2245 if (CurPtr[i] != CurPtr[0]) 2246 return false; 2247 2248 // If we do have it, search for the end of the conflict marker. This could 2249 // fail if it got skipped with a '#if 0' or something. Note that CurPtr might 2250 // be the end of conflict marker. 2251 if (const char *End = FindConflictEnd(CurPtr, BufferEnd, 2252 CurrentConflictMarkerState)) { 2253 CurPtr = End; 2254 2255 // Skip ahead to the end of line. 2256 while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n') 2257 ++CurPtr; 2258 2259 BufferPtr = CurPtr; 2260 2261 // No longer in the conflict marker. 
2262 CurrentConflictMarkerState = CMK_None; 2263 return true; 2264 } 2265 2266 return false; 2267 } 2268 2269 bool Lexer::isCodeCompletionPoint(const char *CurPtr) const { 2270 if (PP && PP->isCodeCompletionEnabled()) { 2271 SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart); 2272 return Loc == PP->getCodeCompletionLoc(); 2273 } 2274 2275 return false; 2276 } 2277 2278 2279 /// LexTokenInternal - This implements a simple C family lexer. It is an 2280 /// extremely performance critical piece of code. This assumes that the buffer 2281 /// has a null character at the end of the file. This returns a preprocessing 2282 /// token, not a normal token, as such, it is an internal interface. It assumes 2283 /// that the Flags of result have been cleared before calling this. 2284 void Lexer::LexTokenInternal(Token &Result) { 2285 LexNextToken: 2286 // New token, can't need cleaning yet. 2287 Result.clearFlag(Token::NeedsCleaning); 2288 Result.setIdentifierInfo(0); 2289 2290 // CurPtr - Cache BufferPtr in an automatic variable. 2291 const char *CurPtr = BufferPtr; 2292 2293 // Small amounts of horizontal whitespace is very common between tokens. 2294 if ((*CurPtr == ' ') || (*CurPtr == '\t')) { 2295 ++CurPtr; 2296 while ((*CurPtr == ' ') || (*CurPtr == '\t')) 2297 ++CurPtr; 2298 2299 // If we are keeping whitespace and other tokens, just return what we just 2300 // skipped. The next lexer invocation will return the token after the 2301 // whitespace. 2302 if (isKeepWhitespaceMode()) { 2303 FormTokenWithChars(Result, CurPtr, tok::unknown); 2304 return; 2305 } 2306 2307 BufferPtr = CurPtr; 2308 Result.setFlag(Token::LeadingSpace); 2309 } 2310 2311 unsigned SizeTmp, SizeTmp2; // Temporaries for use in cases below. 2312 2313 // Read a character, advancing over it. 2314 char Char = getAndAdvanceChar(CurPtr, Result); 2315 tok::TokenKind Kind; 2316 2317 switch (Char) { 2318 case 0: // Null. 2319 // Found end of file? 
2320 if (CurPtr-1 == BufferEnd) { 2321 // Read the PP instance variable into an automatic variable, because 2322 // LexEndOfFile will often delete 'this'. 2323 Preprocessor *PPCache = PP; 2324 if (LexEndOfFile(Result, CurPtr-1)) // Retreat back into the file. 2325 return; // Got a token to return. 2326 assert(PPCache && "Raw buffer::LexEndOfFile should return a token"); 2327 return PPCache->Lex(Result); 2328 } 2329 2330 // Check if we are performing code completion. 2331 if (isCodeCompletionPoint(CurPtr-1)) { 2332 // Return the code-completion token. 2333 Result.startToken(); 2334 FormTokenWithChars(Result, CurPtr, tok::code_completion); 2335 return; 2336 } 2337 2338 if (!isLexingRawMode()) 2339 Diag(CurPtr-1, diag::null_in_file); 2340 Result.setFlag(Token::LeadingSpace); 2341 if (SkipWhitespace(Result, CurPtr)) 2342 return; // KeepWhitespaceMode 2343 2344 goto LexNextToken; // GCC isn't tail call eliminating. 2345 2346 case 26: // DOS & CP/M EOF: "^Z". 2347 // If we're in Microsoft extensions mode, treat this as end of file. 2348 if (Features.MicrosoftExt) { 2349 // Read the PP instance variable into an automatic variable, because 2350 // LexEndOfFile will often delete 'this'. 2351 Preprocessor *PPCache = PP; 2352 if (LexEndOfFile(Result, CurPtr-1)) // Retreat back into the file. 2353 return; // Got a token to return. 2354 assert(PPCache && "Raw buffer::LexEndOfFile should return a token"); 2355 return PPCache->Lex(Result); 2356 } 2357 // If Microsoft extensions are disabled, this is just random garbage. 2358 Kind = tok::unknown; 2359 break; 2360 2361 case '\n': 2362 case '\r': 2363 // If we are inside a preprocessor directive and we see the end of line, 2364 // we know we are done with the directive, so return an EOD token. 2365 if (ParsingPreprocessorDirective) { 2366 // Done parsing the "line". 2367 ParsingPreprocessorDirective = false; 2368 2369 // Restore comment saving mode, in case it was disabled for directive. 
2370 SetCommentRetentionState(PP->getCommentRetentionState()); 2371 2372 // Since we consumed a newline, we are back at the start of a line. 2373 IsAtStartOfLine = true; 2374 2375 Kind = tok::eod; 2376 break; 2377 } 2378 // The returned token is at the start of the line. 2379 Result.setFlag(Token::StartOfLine); 2380 // No leading whitespace seen so far. 2381 Result.clearFlag(Token::LeadingSpace); 2382 2383 if (SkipWhitespace(Result, CurPtr)) 2384 return; // KeepWhitespaceMode 2385 goto LexNextToken; // GCC isn't tail call eliminating. 2386 case ' ': 2387 case '\t': 2388 case '\f': 2389 case '\v': 2390 SkipHorizontalWhitespace: 2391 Result.setFlag(Token::LeadingSpace); 2392 if (SkipWhitespace(Result, CurPtr)) 2393 return; // KeepWhitespaceMode 2394 2395 SkipIgnoredUnits: 2396 CurPtr = BufferPtr; 2397 2398 // If the next token is obviously a // or /* */ comment, skip it efficiently 2399 // too (without going through the big switch stmt). 2400 if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() && 2401 Features.BCPLComment && !Features.TraditionalCPP) { 2402 if (SkipBCPLComment(Result, CurPtr+2)) 2403 return; // There is a token to return. 2404 goto SkipIgnoredUnits; 2405 } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) { 2406 if (SkipBlockComment(Result, CurPtr+2)) 2407 return; // There is a token to return. 2408 goto SkipIgnoredUnits; 2409 } else if (isHorizontalWhitespace(*CurPtr)) { 2410 goto SkipHorizontalWhitespace; 2411 } 2412 goto LexNextToken; // GCC isn't tail call eliminating. 2413 2414 // C99 6.4.4.1: Integer Constants. 2415 // C99 6.4.4.2: Floating Constants. 2416 case '0': case '1': case '2': case '3': case '4': 2417 case '5': case '6': case '7': case '8': case '9': 2418 // Notify MIOpt that we read a non-whitespace/non-comment token. 
2419 MIOpt.ReadToken(); 2420 return LexNumericConstant(Result, CurPtr); 2421 2422 case 'u': // Identifier (uber) or C++0x UTF-8 or UTF-16 string literal 2423 // Notify MIOpt that we read a non-whitespace/non-comment token. 2424 MIOpt.ReadToken(); 2425 2426 if (Features.CPlusPlus0x) { 2427 Char = getCharAndSize(CurPtr, SizeTmp); 2428 2429 // UTF-16 string literal 2430 if (Char == '"') 2431 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 2432 tok::utf16_string_literal); 2433 2434 // UTF-16 character constant 2435 if (Char == '\'') 2436 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), 2437 tok::utf16_char_constant); 2438 2439 // UTF-16 raw string literal 2440 if (Char == 'R' && getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') 2441 return LexRawStringLiteral(Result, 2442 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 2443 SizeTmp2, Result), 2444 tok::utf16_string_literal); 2445 2446 if (Char == '8') { 2447 char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2); 2448 2449 // UTF-8 string literal 2450 if (Char2 == '"') 2451 return LexStringLiteral(Result, 2452 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 2453 SizeTmp2, Result), 2454 tok::utf8_string_literal); 2455 2456 if (Char2 == 'R') { 2457 unsigned SizeTmp3; 2458 char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3); 2459 // UTF-8 raw string literal 2460 if (Char3 == '"') { 2461 return LexRawStringLiteral(Result, 2462 ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 2463 SizeTmp2, Result), 2464 SizeTmp3, Result), 2465 tok::utf8_string_literal); 2466 } 2467 } 2468 } 2469 } 2470 2471 // treat u like the start of an identifier. 2472 return LexIdentifier(Result, CurPtr); 2473 2474 case 'U': // Identifier (Uber) or C++0x UTF-32 string literal 2475 // Notify MIOpt that we read a non-whitespace/non-comment token. 
2476 MIOpt.ReadToken(); 2477 2478 if (Features.CPlusPlus0x) { 2479 Char = getCharAndSize(CurPtr, SizeTmp); 2480 2481 // UTF-32 string literal 2482 if (Char == '"') 2483 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 2484 tok::utf32_string_literal); 2485 2486 // UTF-32 character constant 2487 if (Char == '\'') 2488 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), 2489 tok::utf32_char_constant); 2490 2491 // UTF-32 raw string literal 2492 if (Char == 'R' && getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') 2493 return LexRawStringLiteral(Result, 2494 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 2495 SizeTmp2, Result), 2496 tok::utf32_string_literal); 2497 } 2498 2499 // treat U like the start of an identifier. 2500 return LexIdentifier(Result, CurPtr); 2501 2502 case 'R': // Identifier or C++0x raw string literal 2503 // Notify MIOpt that we read a non-whitespace/non-comment token. 2504 MIOpt.ReadToken(); 2505 2506 if (Features.CPlusPlus0x) { 2507 Char = getCharAndSize(CurPtr, SizeTmp); 2508 2509 if (Char == '"') 2510 return LexRawStringLiteral(Result, 2511 ConsumeChar(CurPtr, SizeTmp, Result), 2512 tok::string_literal); 2513 } 2514 2515 // treat R like the start of an identifier. 2516 return LexIdentifier(Result, CurPtr); 2517 2518 case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz"). 2519 // Notify MIOpt that we read a non-whitespace/non-comment token. 2520 MIOpt.ReadToken(); 2521 Char = getCharAndSize(CurPtr, SizeTmp); 2522 2523 // Wide string literal. 2524 if (Char == '"') 2525 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 2526 tok::wide_string_literal); 2527 2528 // Wide raw string literal. 2529 if (Features.CPlusPlus0x && Char == 'R' && 2530 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') 2531 return LexRawStringLiteral(Result, 2532 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 2533 SizeTmp2, Result), 2534 tok::wide_string_literal); 2535 2536 // Wide character constant. 
2537 if (Char == '\'') 2538 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), 2539 tok::wide_char_constant); 2540 // FALL THROUGH, treating L like the start of an identifier. 2541 2542 // C99 6.4.2: Identifiers. 2543 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': 2544 case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N': 2545 case 'O': case 'P': case 'Q': /*'R'*/case 'S': case 'T': /*'U'*/ 2546 case 'V': case 'W': case 'X': case 'Y': case 'Z': 2547 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': 2548 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': 2549 case 'o': case 'p': case 'q': case 'r': case 's': case 't': /*'u'*/ 2550 case 'v': case 'w': case 'x': case 'y': case 'z': 2551 case '_': 2552 // Notify MIOpt that we read a non-whitespace/non-comment token. 2553 MIOpt.ReadToken(); 2554 return LexIdentifier(Result, CurPtr); 2555 2556 case '$': // $ in identifiers. 2557 if (Features.DollarIdents) { 2558 if (!isLexingRawMode()) 2559 Diag(CurPtr-1, diag::ext_dollar_in_identifier); 2560 // Notify MIOpt that we read a non-whitespace/non-comment token. 2561 MIOpt.ReadToken(); 2562 return LexIdentifier(Result, CurPtr); 2563 } 2564 2565 Kind = tok::unknown; 2566 break; 2567 2568 // C99 6.4.4: Character Constants. 2569 case '\'': 2570 // Notify MIOpt that we read a non-whitespace/non-comment token. 2571 MIOpt.ReadToken(); 2572 return LexCharConstant(Result, CurPtr, tok::char_constant); 2573 2574 // C99 6.4.5: String Literals. 2575 case '"': 2576 // Notify MIOpt that we read a non-whitespace/non-comment token. 2577 MIOpt.ReadToken(); 2578 return LexStringLiteral(Result, CurPtr, tok::string_literal); 2579 2580 // C99 6.4.6: Punctuators. 
2581 case '?': 2582 Kind = tok::question; 2583 break; 2584 case '[': 2585 Kind = tok::l_square; 2586 break; 2587 case ']': 2588 Kind = tok::r_square; 2589 break; 2590 case '(': 2591 Kind = tok::l_paren; 2592 break; 2593 case ')': 2594 Kind = tok::r_paren; 2595 break; 2596 case '{': 2597 Kind = tok::l_brace; 2598 break; 2599 case '}': 2600 Kind = tok::r_brace; 2601 break; 2602 case '.': 2603 Char = getCharAndSize(CurPtr, SizeTmp); 2604 if (Char >= '0' && Char <= '9') { 2605 // Notify MIOpt that we read a non-whitespace/non-comment token. 2606 MIOpt.ReadToken(); 2607 2608 return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result)); 2609 } else if (Features.CPlusPlus && Char == '*') { 2610 Kind = tok::periodstar; 2611 CurPtr += SizeTmp; 2612 } else if (Char == '.' && 2613 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') { 2614 Kind = tok::ellipsis; 2615 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 2616 SizeTmp2, Result); 2617 } else { 2618 Kind = tok::period; 2619 } 2620 break; 2621 case '&': 2622 Char = getCharAndSize(CurPtr, SizeTmp); 2623 if (Char == '&') { 2624 Kind = tok::ampamp; 2625 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2626 } else if (Char == '=') { 2627 Kind = tok::ampequal; 2628 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2629 } else { 2630 Kind = tok::amp; 2631 } 2632 break; 2633 case '*': 2634 if (getCharAndSize(CurPtr, SizeTmp) == '=') { 2635 Kind = tok::starequal; 2636 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2637 } else { 2638 Kind = tok::star; 2639 } 2640 break; 2641 case '+': 2642 Char = getCharAndSize(CurPtr, SizeTmp); 2643 if (Char == '+') { 2644 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2645 Kind = tok::plusplus; 2646 } else if (Char == '=') { 2647 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2648 Kind = tok::plusequal; 2649 } else { 2650 Kind = tok::plus; 2651 } 2652 break; 2653 case '-': 2654 Char = getCharAndSize(CurPtr, SizeTmp); 2655 if (Char == '-') { // -- 2656 CurPtr = ConsumeChar(CurPtr, 
SizeTmp, Result); 2657 Kind = tok::minusminus; 2658 } else if (Char == '>' && Features.CPlusPlus && 2659 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') { // C++ ->* 2660 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 2661 SizeTmp2, Result); 2662 Kind = tok::arrowstar; 2663 } else if (Char == '>') { // -> 2664 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2665 Kind = tok::arrow; 2666 } else if (Char == '=') { // -= 2667 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2668 Kind = tok::minusequal; 2669 } else { 2670 Kind = tok::minus; 2671 } 2672 break; 2673 case '~': 2674 Kind = tok::tilde; 2675 break; 2676 case '!': 2677 if (getCharAndSize(CurPtr, SizeTmp) == '=') { 2678 Kind = tok::exclaimequal; 2679 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2680 } else { 2681 Kind = tok::exclaim; 2682 } 2683 break; 2684 case '/': 2685 // 6.4.9: Comments 2686 Char = getCharAndSize(CurPtr, SizeTmp); 2687 if (Char == '/') { // BCPL comment. 2688 // Even if BCPL comments are disabled (e.g. in C89 mode), we generally 2689 // want to lex this as a comment. There is one problem with this though, 2690 // that in one particular corner case, this can change the behavior of the 2691 // resultant program. For example, In "foo //**/ bar", C89 would lex 2692 // this as "foo / bar" and langauges with BCPL comments would lex it as 2693 // "foo". Check to see if the character after the second slash is a '*'. 2694 // If so, we will lex that as a "/" instead of the start of a comment. 2695 // However, we never do this in -traditional-cpp mode. 2696 if ((Features.BCPLComment || 2697 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*') && 2698 !Features.TraditionalCPP) { 2699 if (SkipBCPLComment(Result, ConsumeChar(CurPtr, SizeTmp, Result))) 2700 return; // There is a token to return. 2701 2702 // It is common for the tokens immediately after a // comment to be 2703 // whitespace (indentation for the next line). Instead of going through 2704 // the big switch, handle it efficiently now. 
2705 goto SkipIgnoredUnits; 2706 } 2707 } 2708 2709 if (Char == '*') { // /**/ comment. 2710 if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result))) 2711 return; // There is a token to return. 2712 goto LexNextToken; // GCC isn't tail call eliminating. 2713 } 2714 2715 if (Char == '=') { 2716 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2717 Kind = tok::slashequal; 2718 } else { 2719 Kind = tok::slash; 2720 } 2721 break; 2722 case '%': 2723 Char = getCharAndSize(CurPtr, SizeTmp); 2724 if (Char == '=') { 2725 Kind = tok::percentequal; 2726 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2727 } else if (Features.Digraphs && Char == '>') { 2728 Kind = tok::r_brace; // '%>' -> '}' 2729 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2730 } else if (Features.Digraphs && Char == ':') { 2731 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2732 Char = getCharAndSize(CurPtr, SizeTmp); 2733 if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') { 2734 Kind = tok::hashhash; // '%:%:' -> '##' 2735 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 2736 SizeTmp2, Result); 2737 } else if (Char == '@' && Features.MicrosoftExt) {// %:@ -> #@ -> Charize 2738 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2739 if (!isLexingRawMode()) 2740 Diag(BufferPtr, diag::ext_charize_microsoft); 2741 Kind = tok::hashat; 2742 } else { // '%:' -> '#' 2743 // We parsed a # character. If this occurs at the start of the line, 2744 // it's actually the start of a preprocessing directive. Callback to 2745 // the preprocessor to handle it. 2746 // FIXME: -fpreprocessed mode?? 2747 if (Result.isAtStartOfLine() && !LexingRawMode && !Is_PragmaLexer) { 2748 FormTokenWithChars(Result, CurPtr, tok::hash); 2749 PP->HandleDirective(Result); 2750 2751 // As an optimization, if the preprocessor didn't switch lexers, tail 2752 // recurse. 2753 if (PP->isCurrentLexer(this)) { 2754 // Start a new token. 
If this is a #include or something, the PP may 2755 // want us starting at the beginning of the line again. If so, set 2756 // the StartOfLine flag and clear LeadingSpace. 2757 if (IsAtStartOfLine) { 2758 Result.setFlag(Token::StartOfLine); 2759 Result.clearFlag(Token::LeadingSpace); 2760 IsAtStartOfLine = false; 2761 } 2762 goto LexNextToken; // GCC isn't tail call eliminating. 2763 } 2764 2765 return PP->Lex(Result); 2766 } 2767 2768 Kind = tok::hash; 2769 } 2770 } else { 2771 Kind = tok::percent; 2772 } 2773 break; 2774 case '<': 2775 Char = getCharAndSize(CurPtr, SizeTmp); 2776 if (ParsingFilename) { 2777 return LexAngledStringLiteral(Result, CurPtr); 2778 } else if (Char == '<') { 2779 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 2780 if (After == '=') { 2781 Kind = tok::lesslessequal; 2782 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 2783 SizeTmp2, Result); 2784 } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) { 2785 // If this is actually a '<<<<<<<' version control conflict marker, 2786 // recognize it as such and recover nicely. 2787 goto LexNextToken; 2788 } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) { 2789 // If this is '<<<<' and we're in a Perforce-style conflict marker, 2790 // ignore it. 
2791 goto LexNextToken; 2792 } else if (Features.CUDA && After == '<') { 2793 Kind = tok::lesslessless; 2794 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 2795 SizeTmp2, Result); 2796 } else { 2797 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2798 Kind = tok::lessless; 2799 } 2800 } else if (Char == '=') { 2801 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2802 Kind = tok::lessequal; 2803 } else if (Features.Digraphs && Char == ':') { // '<:' -> '[' 2804 if (Features.CPlusPlus0x && 2805 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') { 2806 // C++0x [lex.pptoken]p3: 2807 // Otherwise, if the next three characters are <:: and the subsequent 2808 // character is neither : nor >, the < is treated as a preprocessor 2809 // token by itself and not as the first character of the alternative 2810 // token <:. 2811 unsigned SizeTmp3; 2812 char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3); 2813 if (After != ':' && After != '>') { 2814 Kind = tok::less; 2815 if (!isLexingRawMode()) 2816 Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon); 2817 break; 2818 } 2819 } 2820 2821 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2822 Kind = tok::l_square; 2823 } else if (Features.Digraphs && Char == '%') { // '<%' -> '{' 2824 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2825 Kind = tok::l_brace; 2826 } else { 2827 Kind = tok::less; 2828 } 2829 break; 2830 case '>': 2831 Char = getCharAndSize(CurPtr, SizeTmp); 2832 if (Char == '=') { 2833 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2834 Kind = tok::greaterequal; 2835 } else if (Char == '>') { 2836 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 2837 if (After == '=') { 2838 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 2839 SizeTmp2, Result); 2840 Kind = tok::greatergreaterequal; 2841 } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) { 2842 // If this is actually a '>>>>' conflict marker, recognize it as such 2843 // and recover nicely. 
2844 goto LexNextToken; 2845 } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) { 2846 // If this is '>>>>>>>' and we're in a conflict marker, ignore it. 2847 goto LexNextToken; 2848 } else if (Features.CUDA && After == '>') { 2849 Kind = tok::greatergreatergreater; 2850 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 2851 SizeTmp2, Result); 2852 } else { 2853 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2854 Kind = tok::greatergreater; 2855 } 2856 2857 } else { 2858 Kind = tok::greater; 2859 } 2860 break; 2861 case '^': 2862 Char = getCharAndSize(CurPtr, SizeTmp); 2863 if (Char == '=') { 2864 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2865 Kind = tok::caretequal; 2866 } else { 2867 Kind = tok::caret; 2868 } 2869 break; 2870 case '|': 2871 Char = getCharAndSize(CurPtr, SizeTmp); 2872 if (Char == '=') { 2873 Kind = tok::pipeequal; 2874 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2875 } else if (Char == '|') { 2876 // If this is '|||||||' and we're in a conflict marker, ignore it. 2877 if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1)) 2878 goto LexNextToken; 2879 Kind = tok::pipepipe; 2880 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2881 } else { 2882 Kind = tok::pipe; 2883 } 2884 break; 2885 case ':': 2886 Char = getCharAndSize(CurPtr, SizeTmp); 2887 if (Features.Digraphs && Char == '>') { 2888 Kind = tok::r_square; // ':>' -> ']' 2889 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2890 } else if (Features.CPlusPlus && Char == ':') { 2891 Kind = tok::coloncolon; 2892 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2893 } else { 2894 Kind = tok::colon; 2895 } 2896 break; 2897 case ';': 2898 Kind = tok::semi; 2899 break; 2900 case '=': 2901 Char = getCharAndSize(CurPtr, SizeTmp); 2902 if (Char == '=') { 2903 // If this is '====' and we're in a conflict marker, ignore it. 
2904 if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1)) 2905 goto LexNextToken; 2906 2907 Kind = tok::equalequal; 2908 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2909 } else { 2910 Kind = tok::equal; 2911 } 2912 break; 2913 case ',': 2914 Kind = tok::comma; 2915 break; 2916 case '#': 2917 Char = getCharAndSize(CurPtr, SizeTmp); 2918 if (Char == '#') { 2919 Kind = tok::hashhash; 2920 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2921 } else if (Char == '@' && Features.MicrosoftExt) { // #@ -> Charize 2922 Kind = tok::hashat; 2923 if (!isLexingRawMode()) 2924 Diag(BufferPtr, diag::ext_charize_microsoft); 2925 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2926 } else { 2927 // We parsed a # character. If this occurs at the start of the line, 2928 // it's actually the start of a preprocessing directive. Callback to 2929 // the preprocessor to handle it. 2930 // FIXME: -fpreprocessed mode?? 2931 if (Result.isAtStartOfLine() && !LexingRawMode && !Is_PragmaLexer) { 2932 FormTokenWithChars(Result, CurPtr, tok::hash); 2933 PP->HandleDirective(Result); 2934 2935 // As an optimization, if the preprocessor didn't switch lexers, tail 2936 // recurse. 2937 if (PP->isCurrentLexer(this)) { 2938 // Start a new token. If this is a #include or something, the PP may 2939 // want us starting at the beginning of the line again. If so, set 2940 // the StartOfLine flag and clear LeadingSpace. 2941 if (IsAtStartOfLine) { 2942 Result.setFlag(Token::StartOfLine); 2943 Result.clearFlag(Token::LeadingSpace); 2944 IsAtStartOfLine = false; 2945 } 2946 goto LexNextToken; // GCC isn't tail call eliminating. 2947 } 2948 return PP->Lex(Result); 2949 } 2950 2951 Kind = tok::hash; 2952 } 2953 break; 2954 2955 case '@': 2956 // Objective C support. 2957 if (CurPtr[-1] == '@' && Features.ObjC1) 2958 Kind = tok::at; 2959 else 2960 Kind = tok::unknown; 2961 break; 2962 2963 case '\\': 2964 // FIXME: UCN's. 2965 // FALL THROUGH. 
2966 default: 2967 Kind = tok::unknown; 2968 break; 2969 } 2970 2971 // Notify MIOpt that we read a non-whitespace/non-comment token. 2972 MIOpt.ReadToken(); 2973 2974 // Update the location of token as well as BufferPtr. 2975 FormTokenWithChars(Result, CurPtr, Kind); 2976 } 2977