//===--- Lexer.cpp - C Language Family Lexer ------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//  This file implements the Lexer and Token interfaces.
//
//===----------------------------------------------------------------------===//
//
// TODO: GCC Diagnostics emitted by the lexer:
// PEDWARN: (form feed|vertical tab) in preprocessing directive
//
// Universal characters, unicode, char mapping:
// WARNING: `%.*s' is not in NFKC
// WARNING: `%.*s' is not in NFC
//
// Other:
// TODO: Options to support:
//    -fexec-charset,-fwide-exec-charset
//
//===----------------------------------------------------------------------===//

#include "clang/Lex/Lexer.h"
#include "clang/Lex/Preprocessor.h"
#include "clang/Lex/LexDiagnostic.h"
#include "clang/Lex/CodeCompletionHandler.h"
#include "clang/Basic/SourceManager.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/MemoryBuffer.h"
#include <cstring>
using namespace clang;

// Defined alongside the character-information tables later in this file;
// called from InitLexer before any characters are consumed.
static void InitCharacterInfo();

//===----------------------------------------------------------------------===//
// Token Class Implementation
//===----------------------------------------------------------------------===//

/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
/// Tokens that carry no IdentifierInfo (literals, punctuation, etc.) are
/// never ObjC keywords.
bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const {
  if (IdentifierInfo *II = getIdentifierInfo())
    return II->getObjCKeywordID() == objcKey;
  return false;
}

/// getObjCKeywordID - Return the ObjC keyword kind.
tok::ObjCKeywordKind Token::getObjCKeywordID() const {
  // Tokens without identifier info can never be ObjC keywords.
  IdentifierInfo *specId = getIdentifierInfo();
  return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;
}


//===----------------------------------------------------------------------===//
// Lexer Class Implementation
//===----------------------------------------------------------------------===//

/// anchor - Out-of-line definition used to anchor the class to this
/// translation unit (standard LLVM idiom for pinning the vtable).
void Lexer::anchor() { }

/// InitLexer - Shared initialization used by every Lexer constructor below:
/// records the buffer bounds, skips a UTF-8 BOM if one is present at the
/// very start of the buffer, and resets all per-lexer state to defaults.
/// Requires the buffer to be NUL-terminated (asserted below).
void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
                      const char *BufEnd) {
  InitCharacterInfo();

  BufferStart = BufStart;
  BufferPtr = BufPtr;
  BufferEnd = BufEnd;

  assert(BufEnd[0] == 0 &&
         "We assume that the input buffer has a null character at the end"
         " to simplify lexing!");

  // Check whether we have a BOM in the beginning of the buffer. If yes - act
  // accordingly. Right now we support only UTF-8 with and without BOM, so, just
  // skip the UTF-8 BOM if it's present.
  // (Only done when lexing from the very start of the buffer; a mid-buffer
  // BufPtr means someone is re-lexing a subrange and must not skip bytes.)
  if (BufferStart == BufferPtr) {
    // Determine the size of the BOM.
    StringRef Buf(BufferStart, BufferEnd - BufferStart);
    size_t BOMLength = llvm::StringSwitch<size_t>(Buf)
      .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM
      .Default(0);

    // Skip the BOM.
    BufferPtr += BOMLength;
  }

  Is_PragmaLexer = false;
  CurrentConflictMarkerState = CMK_None;

  // Start of the file is a start of line.
  IsAtStartOfLine = true;

  // We are not after parsing a #.
  ParsingPreprocessorDirective = false;

  // We are not after parsing #include.
  ParsingFilename = false;

  // We are not in raw mode.  Raw mode disables diagnostics and interpretation
  // of tokens (e.g. identifiers, thus disabling macro expansion).  It is used
  // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
  // or otherwise skipping over tokens.
  LexingRawMode = false;

  // Default to not keeping comments.
  ExtendedTokenMode = 0;
}

/// Lexer constructor - Create a new lexer object for the specified buffer
/// with the specified preprocessor managing the lexing process.  This lexer
/// assumes that the associated file buffer and Preprocessor objects will
/// outlive it, so it doesn't take ownership of either of them.
Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *InputFile, Preprocessor &PP)
  : PreprocessorLexer(&PP, FID),
    FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
    LangOpts(PP.getLangOpts()) {

  InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(),
            InputFile->getBufferEnd());

  // Default to keeping comments if the preprocessor wants them.
  SetCommentRetentionState(PP.getCommentRetentionState());
}

/// Lexer constructor - Create a new raw lexer object.  This object is only
/// suitable for calls to 'LexFromRawLexer'.  This lexer assumes that the text
/// range will outlive it, so it doesn't take ownership of it.
Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts,
             const char *BufStart, const char *BufPtr, const char *BufEnd)
  : FileLoc(fileloc), LangOpts(langOpts) {

  InitLexer(BufStart, BufPtr, BufEnd);

  // We *are* in raw mode.
  LexingRawMode = true;
}

/// Lexer constructor - Create a new raw lexer object.  This object is only
/// suitable for calls to 'LexFromRawLexer'.  This lexer assumes that the text
/// range will outlive it, so it doesn't take ownership of it.
Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *FromFile,
             const SourceManager &SM, const LangOptions &langOpts)
  : FileLoc(SM.getLocForStartOfFile(FID)), LangOpts(langOpts) {

  InitLexer(FromFile->getBufferStart(), FromFile->getBufferStart(),
            FromFile->getBufferEnd());

  // We *are* in raw mode.
  LexingRawMode = true;
}

/// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
/// _Pragma expansion.  This has a variety of magic semantics that this method
/// sets up.  It returns a new'd Lexer that must be delete'd when done.
///
/// On entrance to this routine, TokStartLoc is a macro location which has a
/// spelling loc that indicates the bytes to be lexed for the token and an
/// expansion location that indicates where all lexed tokens should be
/// "expanded from".
///
/// FIXME: It would really be nice to make _Pragma just be a wrapper around a
/// normal lexer that remaps tokens as they fly by.  This would require making
/// Preprocessor::Lex virtual.  Given that, we could just dump in a magic lexer
/// interface that could handle this stuff.  This would pull GetMappedTokenLoc
/// out of the critical path of the lexer!
///
Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc,
                                 SourceLocation ExpansionLocStart,
                                 SourceLocation ExpansionLocEnd,
                                 unsigned TokLen, Preprocessor &PP) {
  SourceManager &SM = PP.getSourceManager();

  // Create the lexer as if we were going to lex the file normally.
  FileID SpellingFID = SM.getFileID(SpellingLoc);
  const llvm::MemoryBuffer *InputFile = SM.getBuffer(SpellingFID);
  Lexer *L = new Lexer(SpellingFID, InputFile, PP);

  // Now that the lexer is created, change the start/end locations so that we
  // just lex the subsection of the file that we want.  This is lexing from a
  // scratch buffer.
  const char *StrData = SM.getCharacterData(SpellingLoc);

  L->BufferPtr = StrData;
  L->BufferEnd = StrData+TokLen;
  assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!");

  // Set the SourceLocation with the remapping information.  This ensures that
  // GetMappedTokenLoc will remap the tokens as they are lexed.
  L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID),
                                     ExpansionLocStart,
                                     ExpansionLocEnd, TokLen);

  // Ensure that the lexer thinks it is inside a directive, so that end \n will
  // return an EOD token.
  L->ParsingPreprocessorDirective = true;

  // This lexer really is for _Pragma.
  L->Is_PragmaLexer = true;
  return L;
}


/// Stringify - Convert the specified string into a C string, with surrounding
/// ""'s, and with escaped \ and " characters.  (Note: the quote characters
/// themselves are not appended here; this routine only performs the escaping.
/// When Charify is set, ' is escaped instead of ", for character literals.)
std::string Lexer::Stringify(const std::string &Str, bool Charify) {
  std::string Result = Str;
  char Quote = Charify ? '\'' : '"';
  for (unsigned i = 0, e = Result.size(); i != e; ++i) {
    if (Result[i] == '\\' || Result[i] == Quote) {
      // In-place escape: insert a backslash before position i, then step
      // past the (shifted) escaped character.  The string grew by one, so
      // the end index 'e' is bumped as well.
      Result.insert(Result.begin()+i, '\\');
      ++i; ++e;
    }
  }
  return Result;
}

/// Stringify - Convert the specified string into a C string by escaping '\'
/// and " characters.  This does not add surrounding ""'s to the string.
void Lexer::Stringify(SmallVectorImpl<char> &Str) {
  for (unsigned i = 0, e = Str.size(); i != e; ++i) {
    if (Str[i] == '\\' || Str[i] == '"') {
      // Same in-place escaping scheme as the std::string overload above.
      Str.insert(Str.begin()+i, '\\');
      ++i; ++e;
    }
  }
}

//===----------------------------------------------------------------------===//
// Token Spelling
//===----------------------------------------------------------------------===//

/// getSpelling() - Return the 'spelling' of this token.  The spelling of a
/// token are the characters used to represent the token in the source file
/// after trigraph expansion and escaped-newline folding.  In particular, this
/// wants to get the true, uncanonicalized, spelling of things like digraphs
/// UCNs, etc.
// NOTE: On the needs-cleaning path the returned StringRef points into
// 'buffer'; the caller must keep 'buffer' alive for as long as the result
// is in use.  On the common path it points directly into the file buffer.
StringRef Lexer::getSpelling(SourceLocation loc,
                             SmallVectorImpl<char> &buffer,
                             const SourceManager &SM,
                             const LangOptions &options,
                             bool *invalid) {
  // Break down the source location.
  std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc);

  // Try to load the file buffer.
  bool invalidTemp = false;
  StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);
  if (invalidTemp) {
    if (invalid) *invalid = true;
    return StringRef();
  }

  const char *tokenBegin = file.data() + locInfo.second;

  // Lex from the start of the given location.
  Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options,
              file.begin(), tokenBegin, file.end());
  Token token;
  lexer.LexFromRawLexer(token);

  unsigned length = token.getLength();

  // Common case: no need for cleaning.
  if (!token.needsCleaning())
    return StringRef(tokenBegin, length);

  // Hard case, we need to relex the characters into the string: walk the raw
  // bytes, folding each trigraph / escaped newline into its single cleaned
  // character.
  buffer.clear();
  buffer.reserve(length);

  for (const char *ti = tokenBegin, *te = ti + length; ti != te; ) {
    unsigned charSize;
    buffer.push_back(Lexer::getCharAndSizeNoWarn(ti, charSize, options));
    ti += charSize;
  }

  return StringRef(buffer.data(), buffer.size());
}

/// getSpelling() - Return the 'spelling' of this token.  The spelling of a
/// token are the characters used to represent the token in the source file
/// after trigraph expansion and escaped-newline folding.  In particular, this
/// wants to get the true, uncanonicalized, spelling of things like digraphs
/// UCNs, etc.
std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
                               const LangOptions &LangOpts, bool *Invalid) {
  assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");

  // If this token contains nothing interesting, return it directly.
  bool CharDataInvalid = false;
  const char* TokStart = SourceMgr.getCharacterData(Tok.getLocation(),
                                                    &CharDataInvalid);
  if (Invalid)
    *Invalid = CharDataInvalid;
  if (CharDataInvalid)
    return std::string();

  if (!Tok.needsCleaning())
    return std::string(TokStart, TokStart+Tok.getLength());

  std::string Result;
  Result.reserve(Tok.getLength());

  // Otherwise, hard case, relex the characters into the string.
  for (const char *Ptr = TokStart, *End = TokStart+Tok.getLength();
       Ptr != End; ) {
    unsigned CharSize;
    Result.push_back(Lexer::getCharAndSizeNoWarn(Ptr, CharSize, LangOpts));
    Ptr += CharSize;
  }
  // Cleaning collapses multi-byte sequences into single characters, so a
  // cleaned spelling must differ in length from the raw token text.
  assert(Result.size() != unsigned(Tok.getLength()) &&
         "NeedsCleaning flag set on something that didn't need cleaning!");
  return Result;
}

/// getSpelling - This method is used to get the spelling of a token into a
/// preallocated buffer, instead of as an std::string.  The caller is required
/// to allocate enough space for the token, which is guaranteed to be at least
/// Tok.getLength() bytes long.  The actual length of the token is returned.
///
/// Note that this method may do two possible things: it may either fill in
/// the buffer specified with characters, or it may *change the input pointer*
/// to point to a constant buffer with the data already in it (avoiding a
/// copy).  The caller is not allowed to modify the returned buffer pointer
/// if an internal buffer is returned.
unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
                            const SourceManager &SourceMgr,
                            const LangOptions &LangOpts, bool *Invalid) {
  assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");

  const char *TokStart = 0;
  // NOTE: this has to be checked *before* testing for an IdentifierInfo.
  if (Tok.is(tok::raw_identifier))
    TokStart = Tok.getRawIdentifierData();
  else if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
    // Just return the string from the identifier table, which is very quick.
    Buffer = II->getNameStart();
    return II->getLength();
  }

  // NOTE: this can be checked even after testing for an IdentifierInfo.
  if (Tok.isLiteral())
    TokStart = Tok.getLiteralData();

  if (TokStart == 0) {
    // Compute the start of the token in the input lexer buffer.
    bool CharDataInvalid = false;
    TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid);
    if (Invalid)
      *Invalid = CharDataInvalid;
    if (CharDataInvalid) {
      Buffer = "";
      return 0;
    }
  }

  // If this token contains nothing interesting, return it directly.
  if (!Tok.needsCleaning()) {
    Buffer = TokStart;
    return Tok.getLength();
  }

  // Otherwise, hard case, relex the characters into the string.
  char *OutBuf = const_cast<char*>(Buffer);
  for (const char *Ptr = TokStart, *End = TokStart+Tok.getLength();
       Ptr != End; ) {
    unsigned CharSize;
    *OutBuf++ = Lexer::getCharAndSizeNoWarn(Ptr, CharSize, LangOpts);
    Ptr += CharSize;
  }
  // As above: cleaning must have shortened the text relative to the raw
  // token length, otherwise the NeedsCleaning flag was set spuriously.
  assert(unsigned(OutBuf-Buffer) != Tok.getLength() &&
         "NeedsCleaning flag set on something that didn't need cleaning!");

  return OutBuf-Buffer;
}



// Defined with the character-information tables later in this file.
static bool isWhitespace(unsigned char c);

/// MeasureTokenLength - Relex the token at the specified location and return
/// its length in bytes in the input file.  If the token needs cleaning (e.g.
/// includes a trigraph or an escaped newline) then this count includes bytes
/// that are part of that.
389 unsigned Lexer::MeasureTokenLength(SourceLocation Loc, 390 const SourceManager &SM, 391 const LangOptions &LangOpts) { 392 // TODO: this could be special cased for common tokens like identifiers, ')', 393 // etc to make this faster, if it mattered. Just look at StrData[0] to handle 394 // all obviously single-char tokens. This could use 395 // Lexer::isObviouslySimpleCharacter for example to handle identifiers or 396 // something. 397 398 // If this comes from a macro expansion, we really do want the macro name, not 399 // the token this macro expanded to. 400 Loc = SM.getExpansionLoc(Loc); 401 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 402 bool Invalid = false; 403 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); 404 if (Invalid) 405 return 0; 406 407 const char *StrData = Buffer.data()+LocInfo.second; 408 409 if (isWhitespace(StrData[0])) 410 return 0; 411 412 // Create a lexer starting at the beginning of this token. 413 Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, 414 Buffer.begin(), StrData, Buffer.end()); 415 TheLexer.SetCommentRetentionState(true); 416 Token TheTok; 417 TheLexer.LexFromRawLexer(TheTok); 418 return TheTok.getLength(); 419 } 420 421 static SourceLocation getBeginningOfFileToken(SourceLocation Loc, 422 const SourceManager &SM, 423 const LangOptions &LangOpts) { 424 assert(Loc.isFileID()); 425 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 426 if (LocInfo.first.isInvalid()) 427 return Loc; 428 429 bool Invalid = false; 430 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); 431 if (Invalid) 432 return Loc; 433 434 // Back up from the current location until we hit the beginning of a line 435 // (or the buffer). We'll relex from that point. 
436 const char *BufStart = Buffer.data(); 437 if (LocInfo.second >= Buffer.size()) 438 return Loc; 439 440 const char *StrData = BufStart+LocInfo.second; 441 if (StrData[0] == '\n' || StrData[0] == '\r') 442 return Loc; 443 444 const char *LexStart = StrData; 445 while (LexStart != BufStart) { 446 if (LexStart[0] == '\n' || LexStart[0] == '\r') { 447 ++LexStart; 448 break; 449 } 450 451 --LexStart; 452 } 453 454 // Create a lexer starting at the beginning of this token. 455 SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second); 456 Lexer TheLexer(LexerStartLoc, LangOpts, BufStart, LexStart, Buffer.end()); 457 TheLexer.SetCommentRetentionState(true); 458 459 // Lex tokens until we find the token that contains the source location. 460 Token TheTok; 461 do { 462 TheLexer.LexFromRawLexer(TheTok); 463 464 if (TheLexer.getBufferLocation() > StrData) { 465 // Lexing this token has taken the lexer past the source location we're 466 // looking for. If the current token encompasses our source location, 467 // return the beginning of that token. 468 if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData) 469 return TheTok.getLocation(); 470 471 // We ended up skipping over the source location entirely, which means 472 // that it points into whitespace. We're done here. 473 break; 474 } 475 } while (TheTok.getKind() != tok::eof); 476 477 // We've passed our source location; just return the original source location. 
478 return Loc; 479 } 480 481 SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc, 482 const SourceManager &SM, 483 const LangOptions &LangOpts) { 484 if (Loc.isFileID()) 485 return getBeginningOfFileToken(Loc, SM, LangOpts); 486 487 if (!SM.isMacroArgExpansion(Loc)) 488 return Loc; 489 490 SourceLocation FileLoc = SM.getSpellingLoc(Loc); 491 SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts); 492 std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc); 493 std::pair<FileID, unsigned> BeginFileLocInfo 494 = SM.getDecomposedLoc(BeginFileLoc); 495 assert(FileLocInfo.first == BeginFileLocInfo.first && 496 FileLocInfo.second >= BeginFileLocInfo.second); 497 return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second); 498 } 499 500 namespace { 501 enum PreambleDirectiveKind { 502 PDK_Skipped, 503 PDK_StartIf, 504 PDK_EndIf, 505 PDK_Unknown 506 }; 507 } 508 509 std::pair<unsigned, bool> 510 Lexer::ComputePreamble(const llvm::MemoryBuffer *Buffer, 511 const LangOptions &LangOpts, unsigned MaxLines) { 512 // Create a lexer starting at the beginning of the file. Note that we use a 513 // "fake" file source location at offset 1 so that the lexer will track our 514 // position within the file. 
515 const unsigned StartOffset = 1; 516 SourceLocation StartLoc = SourceLocation::getFromRawEncoding(StartOffset); 517 Lexer TheLexer(StartLoc, LangOpts, Buffer->getBufferStart(), 518 Buffer->getBufferStart(), Buffer->getBufferEnd()); 519 520 bool InPreprocessorDirective = false; 521 Token TheTok; 522 Token IfStartTok; 523 unsigned IfCount = 0; 524 525 unsigned MaxLineOffset = 0; 526 if (MaxLines) { 527 const char *CurPtr = Buffer->getBufferStart(); 528 unsigned CurLine = 0; 529 while (CurPtr != Buffer->getBufferEnd()) { 530 char ch = *CurPtr++; 531 if (ch == '\n') { 532 ++CurLine; 533 if (CurLine == MaxLines) 534 break; 535 } 536 } 537 if (CurPtr != Buffer->getBufferEnd()) 538 MaxLineOffset = CurPtr - Buffer->getBufferStart(); 539 } 540 541 do { 542 TheLexer.LexFromRawLexer(TheTok); 543 544 if (InPreprocessorDirective) { 545 // If we've hit the end of the file, we're done. 546 if (TheTok.getKind() == tok::eof) { 547 break; 548 } 549 550 // If we haven't hit the end of the preprocessor directive, skip this 551 // token. 552 if (!TheTok.isAtStartOfLine()) 553 continue; 554 555 // We've passed the end of the preprocessor directive, and will look 556 // at this token again below. 557 InPreprocessorDirective = false; 558 } 559 560 // Keep track of the # of lines in the preamble. 561 if (TheTok.isAtStartOfLine()) { 562 unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset; 563 564 // If we were asked to limit the number of lines in the preamble, 565 // and we're about to exceed that limit, we're done. 566 if (MaxLineOffset && TokOffset >= MaxLineOffset) 567 break; 568 } 569 570 // Comments are okay; skip over them. 571 if (TheTok.getKind() == tok::comment) 572 continue; 573 574 if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) { 575 // This is the start of a preprocessor directive. 576 Token HashTok = TheTok; 577 InPreprocessorDirective = true; 578 579 // Figure out which directive this is. 
Since we're lexing raw tokens, 580 // we don't have an identifier table available. Instead, just look at 581 // the raw identifier to recognize and categorize preprocessor directives. 582 TheLexer.LexFromRawLexer(TheTok); 583 if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) { 584 StringRef Keyword(TheTok.getRawIdentifierData(), 585 TheTok.getLength()); 586 PreambleDirectiveKind PDK 587 = llvm::StringSwitch<PreambleDirectiveKind>(Keyword) 588 .Case("include", PDK_Skipped) 589 .Case("__include_macros", PDK_Skipped) 590 .Case("define", PDK_Skipped) 591 .Case("undef", PDK_Skipped) 592 .Case("line", PDK_Skipped) 593 .Case("error", PDK_Skipped) 594 .Case("pragma", PDK_Skipped) 595 .Case("import", PDK_Skipped) 596 .Case("include_next", PDK_Skipped) 597 .Case("warning", PDK_Skipped) 598 .Case("ident", PDK_Skipped) 599 .Case("sccs", PDK_Skipped) 600 .Case("assert", PDK_Skipped) 601 .Case("unassert", PDK_Skipped) 602 .Case("if", PDK_StartIf) 603 .Case("ifdef", PDK_StartIf) 604 .Case("ifndef", PDK_StartIf) 605 .Case("elif", PDK_Skipped) 606 .Case("else", PDK_Skipped) 607 .Case("endif", PDK_EndIf) 608 .Default(PDK_Unknown); 609 610 switch (PDK) { 611 case PDK_Skipped: 612 continue; 613 614 case PDK_StartIf: 615 if (IfCount == 0) 616 IfStartTok = HashTok; 617 618 ++IfCount; 619 continue; 620 621 case PDK_EndIf: 622 // Mismatched #endif. The preamble ends here. 623 if (IfCount == 0) 624 break; 625 626 --IfCount; 627 continue; 628 629 case PDK_Unknown: 630 // We don't know what this directive is; stop at the '#'. 631 break; 632 } 633 } 634 635 // We only end up here if we didn't recognize the preprocessor 636 // directive or it was one that can't occur in the preamble at this 637 // point. Roll back the current token to the location of the '#'. 
638 InPreprocessorDirective = false; 639 TheTok = HashTok; 640 } 641 642 // We hit a token that we don't recognize as being in the 643 // "preprocessing only" part of the file, so we're no longer in 644 // the preamble. 645 break; 646 } while (true); 647 648 SourceLocation End = IfCount? IfStartTok.getLocation() : TheTok.getLocation(); 649 return std::make_pair(End.getRawEncoding() - StartLoc.getRawEncoding(), 650 IfCount? IfStartTok.isAtStartOfLine() 651 : TheTok.isAtStartOfLine()); 652 } 653 654 655 /// AdvanceToTokenCharacter - Given a location that specifies the start of a 656 /// token, return a new location that specifies a character within the token. 657 SourceLocation Lexer::AdvanceToTokenCharacter(SourceLocation TokStart, 658 unsigned CharNo, 659 const SourceManager &SM, 660 const LangOptions &LangOpts) { 661 // Figure out how many physical characters away the specified expansion 662 // character is. This needs to take into consideration newlines and 663 // trigraphs. 664 bool Invalid = false; 665 const char *TokPtr = SM.getCharacterData(TokStart, &Invalid); 666 667 // If they request the first char of the token, we're trivially done. 668 if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr))) 669 return TokStart; 670 671 unsigned PhysOffset = 0; 672 673 // The usual case is that tokens don't contain anything interesting. Skip 674 // over the uninteresting characters. If a token only consists of simple 675 // chars, this method is extremely fast. 676 while (Lexer::isObviouslySimpleCharacter(*TokPtr)) { 677 if (CharNo == 0) 678 return TokStart.getLocWithOffset(PhysOffset); 679 ++TokPtr, --CharNo, ++PhysOffset; 680 } 681 682 // If we have a character that may be a trigraph or escaped newline, use a 683 // lexer to parse it correctly. 
684 for (; CharNo; --CharNo) { 685 unsigned Size; 686 Lexer::getCharAndSizeNoWarn(TokPtr, Size, LangOpts); 687 TokPtr += Size; 688 PhysOffset += Size; 689 } 690 691 // Final detail: if we end up on an escaped newline, we want to return the 692 // location of the actual byte of the token. For example foo\<newline>bar 693 // advanced by 3 should return the location of b, not of \\. One compounding 694 // detail of this is that the escape may be made by a trigraph. 695 if (!Lexer::isObviouslySimpleCharacter(*TokPtr)) 696 PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr; 697 698 return TokStart.getLocWithOffset(PhysOffset); 699 } 700 701 /// \brief Computes the source location just past the end of the 702 /// token at this source location. 703 /// 704 /// This routine can be used to produce a source location that 705 /// points just past the end of the token referenced by \p Loc, and 706 /// is generally used when a diagnostic needs to point just after a 707 /// token where it expected something different that it received. If 708 /// the returned source location would not be meaningful (e.g., if 709 /// it points into a macro), this routine returns an invalid 710 /// source location. 711 /// 712 /// \param Offset an offset from the end of the token, where the source 713 /// location should refer to. The default offset (0) produces a source 714 /// location pointing just past the end of the token; an offset of 1 produces 715 /// a source location pointing to the last character in the token, etc. 716 SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset, 717 const SourceManager &SM, 718 const LangOptions &LangOpts) { 719 if (Loc.isInvalid()) 720 return SourceLocation(); 721 722 if (Loc.isMacroID()) { 723 if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc)) 724 return SourceLocation(); // Points inside the macro expansion. 
725 } 726 727 unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts); 728 if (Len > Offset) 729 Len = Len - Offset; 730 else 731 return Loc; 732 733 return Loc.getLocWithOffset(Len); 734 } 735 736 /// \brief Returns true if the given MacroID location points at the first 737 /// token of the macro expansion. 738 bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc, 739 const SourceManager &SM, 740 const LangOptions &LangOpts, 741 SourceLocation *MacroBegin) { 742 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc"); 743 744 std::pair<FileID, unsigned> infoLoc = SM.getDecomposedLoc(loc); 745 // FIXME: If the token comes from the macro token paste operator ('##') 746 // this function will always return false; 747 if (infoLoc.second > 0) 748 return false; // Does not point at the start of token. 749 750 SourceLocation expansionLoc = 751 SM.getSLocEntry(infoLoc.first).getExpansion().getExpansionLocStart(); 752 if (expansionLoc.isFileID()) { 753 // No other macro expansions, this is the first. 754 if (MacroBegin) 755 *MacroBegin = expansionLoc; 756 return true; 757 } 758 759 return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin); 760 } 761 762 /// \brief Returns true if the given MacroID location points at the last 763 /// token of the macro expansion. 764 bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc, 765 const SourceManager &SM, 766 const LangOptions &LangOpts, 767 SourceLocation *MacroEnd) { 768 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc"); 769 770 SourceLocation spellLoc = SM.getSpellingLoc(loc); 771 unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts); 772 if (tokLen == 0) 773 return false; 774 775 FileID FID = SM.getFileID(loc); 776 SourceLocation afterLoc = loc.getLocWithOffset(tokLen+1); 777 if (SM.isInFileID(afterLoc, FID)) 778 return false; // Still in the same FileID, does not point to the last token. 
779 780 // FIXME: If the token comes from the macro token paste operator ('##') 781 // or the stringify operator ('#') this function will always return false; 782 783 SourceLocation expansionLoc = 784 SM.getSLocEntry(FID).getExpansion().getExpansionLocEnd(); 785 if (expansionLoc.isFileID()) { 786 // No other macro expansions. 787 if (MacroEnd) 788 *MacroEnd = expansionLoc; 789 return true; 790 } 791 792 return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd); 793 } 794 795 static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range, 796 const SourceManager &SM, 797 const LangOptions &LangOpts) { 798 SourceLocation Begin = Range.getBegin(); 799 SourceLocation End = Range.getEnd(); 800 assert(Begin.isFileID() && End.isFileID()); 801 if (Range.isTokenRange()) { 802 End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts); 803 if (End.isInvalid()) 804 return CharSourceRange(); 805 } 806 807 // Break down the source locations. 808 FileID FID; 809 unsigned BeginOffs; 810 llvm::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin); 811 if (FID.isInvalid()) 812 return CharSourceRange(); 813 814 unsigned EndOffs; 815 if (!SM.isInFileID(End, FID, &EndOffs) || 816 BeginOffs > EndOffs) 817 return CharSourceRange(); 818 819 return CharSourceRange::getCharRange(Begin, End); 820 } 821 822 CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range, 823 const SourceManager &SM, 824 const LangOptions &LangOpts) { 825 SourceLocation Begin = Range.getBegin(); 826 SourceLocation End = Range.getEnd(); 827 if (Begin.isInvalid() || End.isInvalid()) 828 return CharSourceRange(); 829 830 if (Begin.isFileID() && End.isFileID()) 831 return makeRangeFromFileLocs(Range, SM, LangOpts); 832 833 if (Begin.isMacroID() && End.isFileID()) { 834 if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin)) 835 return CharSourceRange(); 836 Range.setBegin(Begin); 837 return makeRangeFromFileLocs(Range, SM, LangOpts); 838 } 839 840 if (Begin.isFileID() && End.isMacroID()) { 841 if 
((Range.isTokenRange() && !isAtEndOfMacroExpansion(End, SM, LangOpts, 842 &End)) || 843 (Range.isCharRange() && !isAtStartOfMacroExpansion(End, SM, LangOpts, 844 &End))) 845 return CharSourceRange(); 846 Range.setEnd(End); 847 return makeRangeFromFileLocs(Range, SM, LangOpts); 848 } 849 850 assert(Begin.isMacroID() && End.isMacroID()); 851 SourceLocation MacroBegin, MacroEnd; 852 if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) && 853 ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts, 854 &MacroEnd)) || 855 (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts, 856 &MacroEnd)))) { 857 Range.setBegin(MacroBegin); 858 Range.setEnd(MacroEnd); 859 return makeRangeFromFileLocs(Range, SM, LangOpts); 860 } 861 862 FileID FID; 863 unsigned BeginOffs; 864 llvm::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin); 865 if (FID.isInvalid()) 866 return CharSourceRange(); 867 868 unsigned EndOffs; 869 if (!SM.isInFileID(End, FID, &EndOffs) || 870 BeginOffs > EndOffs) 871 return CharSourceRange(); 872 873 const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID); 874 const SrcMgr::ExpansionInfo &Expansion = E->getExpansion(); 875 if (Expansion.isMacroArgExpansion() && 876 Expansion.getSpellingLoc().isFileID()) { 877 SourceLocation SpellLoc = Expansion.getSpellingLoc(); 878 Range.setBegin(SpellLoc.getLocWithOffset(BeginOffs)); 879 Range.setEnd(SpellLoc.getLocWithOffset(EndOffs)); 880 return makeRangeFromFileLocs(Range, SM, LangOpts); 881 } 882 883 return CharSourceRange(); 884 } 885 886 StringRef Lexer::getSourceText(CharSourceRange Range, 887 const SourceManager &SM, 888 const LangOptions &LangOpts, 889 bool *Invalid) { 890 Range = makeFileCharRange(Range, SM, LangOpts); 891 if (Range.isInvalid()) { 892 if (Invalid) *Invalid = true; 893 return StringRef(); 894 } 895 896 // Break down the source location. 
897 std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin()); 898 if (beginInfo.first.isInvalid()) { 899 if (Invalid) *Invalid = true; 900 return StringRef(); 901 } 902 903 unsigned EndOffs; 904 if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) || 905 beginInfo.second > EndOffs) { 906 if (Invalid) *Invalid = true; 907 return StringRef(); 908 } 909 910 // Try to the load the file buffer. 911 bool invalidTemp = false; 912 StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp); 913 if (invalidTemp) { 914 if (Invalid) *Invalid = true; 915 return StringRef(); 916 } 917 918 if (Invalid) *Invalid = false; 919 return file.substr(beginInfo.second, EndOffs - beginInfo.second); 920 } 921 922 StringRef Lexer::getImmediateMacroName(SourceLocation Loc, 923 const SourceManager &SM, 924 const LangOptions &LangOpts) { 925 assert(Loc.isMacroID() && "Only reasonble to call this on macros"); 926 927 // Find the location of the immediate macro expansion. 928 while (1) { 929 FileID FID = SM.getFileID(Loc); 930 const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID); 931 const SrcMgr::ExpansionInfo &Expansion = E->getExpansion(); 932 Loc = Expansion.getExpansionLocStart(); 933 if (!Expansion.isMacroArgExpansion()) 934 break; 935 936 // For macro arguments we need to check that the argument did not come 937 // from an inner macro, e.g: "MAC1( MAC2(foo) )" 938 939 // Loc points to the argument id of the macro definition, move to the 940 // macro expansion. 941 Loc = SM.getImmediateExpansionRange(Loc).first; 942 SourceLocation SpellLoc = Expansion.getSpellingLoc(); 943 if (SpellLoc.isFileID()) 944 break; // No inner macro. 945 946 // If spelling location resides in the same FileID as macro expansion 947 // location, it means there is no inner macro. 948 FileID MacroFID = SM.getFileID(Loc); 949 if (SM.isInFileID(SpellLoc, MacroFID)) 950 break; 951 952 // Argument came from inner macro. 
953 Loc = SpellLoc; 954 } 955 956 // Find the spelling location of the start of the non-argument expansion 957 // range. This is where the macro name was spelled in order to begin 958 // expanding this macro. 959 Loc = SM.getSpellingLoc(Loc); 960 961 // Dig out the buffer where the macro name was spelled and the extents of the 962 // name so that we can render it into the expansion note. 963 std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc); 964 unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts); 965 StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first); 966 return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength); 967 } 968 969 //===----------------------------------------------------------------------===// 970 // Character information. 971 //===----------------------------------------------------------------------===// 972 973 enum { 974 CHAR_HORZ_WS = 0x01, // ' ', '\t', '\f', '\v'. Note, no '\0' 975 CHAR_VERT_WS = 0x02, // '\r', '\n' 976 CHAR_LETTER = 0x04, // a-z,A-Z 977 CHAR_NUMBER = 0x08, // 0-9 978 CHAR_UNDER = 0x10, // _ 979 CHAR_PERIOD = 0x20, // . 980 CHAR_RAWDEL = 0x40 // {}[]#<>%:;?*+-/^&|~!=,"' 981 }; 982 983 // Statically initialize CharInfo table based on ASCII character set 984 // Reference: FreeBSD 7.2 /usr/share/misc/ascii 985 static const unsigned char CharInfo[256] = 986 { 987 // 0 NUL 1 SOH 2 STX 3 ETX 988 // 4 EOT 5 ENQ 6 ACK 7 BEL 989 0 , 0 , 0 , 0 , 990 0 , 0 , 0 , 0 , 991 // 8 BS 9 HT 10 NL 11 VT 992 //12 NP 13 CR 14 SO 15 SI 993 0 , CHAR_HORZ_WS, CHAR_VERT_WS, CHAR_HORZ_WS, 994 CHAR_HORZ_WS, CHAR_VERT_WS, 0 , 0 , 995 //16 DLE 17 DC1 18 DC2 19 DC3 996 //20 DC4 21 NAK 22 SYN 23 ETB 997 0 , 0 , 0 , 0 , 998 0 , 0 , 0 , 0 , 999 //24 CAN 25 EM 26 SUB 27 ESC 1000 //28 FS 29 GS 30 RS 31 US 1001 0 , 0 , 0 , 0 , 1002 0 , 0 , 0 , 0 , 1003 //32 SP 33 ! 
34 " 35 # 1004 //36 $ 37 % 38 & 39 ' 1005 CHAR_HORZ_WS, CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , 1006 0 , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , 1007 //40 ( 41 ) 42 * 43 + 1008 //44 , 45 - 46 . 47 / 1009 0 , 0 , CHAR_RAWDEL , CHAR_RAWDEL , 1010 CHAR_RAWDEL , CHAR_RAWDEL , CHAR_PERIOD , CHAR_RAWDEL , 1011 //48 0 49 1 50 2 51 3 1012 //52 4 53 5 54 6 55 7 1013 CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , 1014 CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , 1015 //56 8 57 9 58 : 59 ; 1016 //60 < 61 = 62 > 63 ? 1017 CHAR_NUMBER , CHAR_NUMBER , CHAR_RAWDEL , CHAR_RAWDEL , 1018 CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , 1019 //64 @ 65 A 66 B 67 C 1020 //68 D 69 E 70 F 71 G 1021 0 , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 1022 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 1023 //72 H 73 I 74 J 75 K 1024 //76 L 77 M 78 N 79 O 1025 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 1026 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 1027 //80 P 81 Q 82 R 83 S 1028 //84 T 85 U 86 V 87 W 1029 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 1030 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 1031 //88 X 89 Y 90 Z 91 [ 1032 //92 \ 93 ] 94 ^ 95 _ 1033 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_RAWDEL , 1034 0 , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_UNDER , 1035 //96 ` 97 a 98 b 99 c 1036 //100 d 101 e 102 f 103 g 1037 0 , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 1038 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 1039 //104 h 105 i 106 j 107 k 1040 //108 l 109 m 110 n 111 o 1041 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 1042 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 1043 //112 p 113 q 114 r 115 s 1044 //116 t 117 u 118 v 119 w 1045 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 1046 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 1047 //120 x 121 y 122 z 123 { 1048 //124 | 125 } 126 ~ 127 DEL 1049 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 
CHAR_RAWDEL , 1050 CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , 0 1051 }; 1052 1053 static void InitCharacterInfo() { 1054 static bool isInited = false; 1055 if (isInited) return; 1056 // check the statically-initialized CharInfo table 1057 assert(CHAR_HORZ_WS == CharInfo[(int)' ']); 1058 assert(CHAR_HORZ_WS == CharInfo[(int)'\t']); 1059 assert(CHAR_HORZ_WS == CharInfo[(int)'\f']); 1060 assert(CHAR_HORZ_WS == CharInfo[(int)'\v']); 1061 assert(CHAR_VERT_WS == CharInfo[(int)'\n']); 1062 assert(CHAR_VERT_WS == CharInfo[(int)'\r']); 1063 assert(CHAR_UNDER == CharInfo[(int)'_']); 1064 assert(CHAR_PERIOD == CharInfo[(int)'.']); 1065 for (unsigned i = 'a'; i <= 'z'; ++i) { 1066 assert(CHAR_LETTER == CharInfo[i]); 1067 assert(CHAR_LETTER == CharInfo[i+'A'-'a']); 1068 } 1069 for (unsigned i = '0'; i <= '9'; ++i) 1070 assert(CHAR_NUMBER == CharInfo[i]); 1071 1072 isInited = true; 1073 } 1074 1075 1076 /// isIdentifierHead - Return true if this is the first character of an 1077 /// identifier, which is [a-zA-Z_]. 1078 static inline bool isIdentifierHead(unsigned char c) { 1079 return (CharInfo[c] & (CHAR_LETTER|CHAR_UNDER)) ? true : false; 1080 } 1081 1082 /// isIdentifierBody - Return true if this is the body character of an 1083 /// identifier, which is [a-zA-Z0-9_]. 1084 static inline bool isIdentifierBody(unsigned char c) { 1085 return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER)) ? true : false; 1086 } 1087 1088 /// isHorizontalWhitespace - Return true if this character is horizontal 1089 /// whitespace: ' ', '\\t', '\\f', '\\v'. Note that this returns false for 1090 /// '\\0'. 1091 static inline bool isHorizontalWhitespace(unsigned char c) { 1092 return (CharInfo[c] & CHAR_HORZ_WS) ? true : false; 1093 } 1094 1095 /// isVerticalWhitespace - Return true if this character is vertical 1096 /// whitespace: '\\n', '\\r'. Note that this returns false for '\\0'. 1097 static inline bool isVerticalWhitespace(unsigned char c) { 1098 return (CharInfo[c] & CHAR_VERT_WS) ? 
true : false; 1099 } 1100 1101 /// isWhitespace - Return true if this character is horizontal or vertical 1102 /// whitespace: ' ', '\\t', '\\f', '\\v', '\\n', '\\r'. Note that this returns 1103 /// false for '\\0'. 1104 static inline bool isWhitespace(unsigned char c) { 1105 return (CharInfo[c] & (CHAR_HORZ_WS|CHAR_VERT_WS)) ? true : false; 1106 } 1107 1108 /// isNumberBody - Return true if this is the body character of an 1109 /// preprocessing number, which is [a-zA-Z0-9_.]. 1110 static inline bool isNumberBody(unsigned char c) { 1111 return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD)) ? 1112 true : false; 1113 } 1114 1115 /// isRawStringDelimBody - Return true if this is the body character of a 1116 /// raw string delimiter. 1117 static inline bool isRawStringDelimBody(unsigned char c) { 1118 return (CharInfo[c] & 1119 (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD|CHAR_RAWDEL)) ? 1120 true : false; 1121 } 1122 1123 // Allow external clients to make use of CharInfo. 1124 bool Lexer::isIdentifierBodyChar(char c, const LangOptions &LangOpts) { 1125 return isIdentifierBody(c) || (c == '$' && LangOpts.DollarIdents); 1126 } 1127 1128 1129 //===----------------------------------------------------------------------===// 1130 // Diagnostics forwarding code. 1131 //===----------------------------------------------------------------------===// 1132 1133 /// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the 1134 /// lexer buffer was all expanded at a single point, perform the mapping. 1135 /// This is currently only used for _Pragma implementation, so it is the slow 1136 /// path of the hot getSourceLocation method. Do not allow it to be inlined. 
1137 static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc( 1138 Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen); 1139 static SourceLocation GetMappedTokenLoc(Preprocessor &PP, 1140 SourceLocation FileLoc, 1141 unsigned CharNo, unsigned TokLen) { 1142 assert(FileLoc.isMacroID() && "Must be a macro expansion"); 1143 1144 // Otherwise, we're lexing "mapped tokens". This is used for things like 1145 // _Pragma handling. Combine the expansion location of FileLoc with the 1146 // spelling location. 1147 SourceManager &SM = PP.getSourceManager(); 1148 1149 // Create a new SLoc which is expanded from Expansion(FileLoc) but whose 1150 // characters come from spelling(FileLoc)+Offset. 1151 SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc); 1152 SpellingLoc = SpellingLoc.getLocWithOffset(CharNo); 1153 1154 // Figure out the expansion loc range, which is the range covered by the 1155 // original _Pragma(...) sequence. 1156 std::pair<SourceLocation,SourceLocation> II = 1157 SM.getImmediateExpansionRange(FileLoc); 1158 1159 return SM.createExpansionLoc(SpellingLoc, II.first, II.second, TokLen); 1160 } 1161 1162 /// getSourceLocation - Return a source location identifier for the specified 1163 /// offset in the current file. 1164 SourceLocation Lexer::getSourceLocation(const char *Loc, 1165 unsigned TokLen) const { 1166 assert(Loc >= BufferStart && Loc <= BufferEnd && 1167 "Location out of range for this buffer!"); 1168 1169 // In the normal case, we're just lexing from a simple file buffer, return 1170 // the file id from FileLoc with the offset specified. 1171 unsigned CharNo = Loc-BufferStart; 1172 if (FileLoc.isFileID()) 1173 return FileLoc.getLocWithOffset(CharNo); 1174 1175 // Otherwise, this is the _Pragma lexer case, which pretends that all of the 1176 // tokens are lexed from where the _Pragma was defined. 
1177 assert(PP && "This doesn't work on raw lexers"); 1178 return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen); 1179 } 1180 1181 /// Diag - Forwarding function for diagnostics. This translate a source 1182 /// position in the current buffer into a SourceLocation object for rendering. 1183 DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const { 1184 return PP->Diag(getSourceLocation(Loc), DiagID); 1185 } 1186 1187 //===----------------------------------------------------------------------===// 1188 // Trigraph and Escaped Newline Handling Code. 1189 //===----------------------------------------------------------------------===// 1190 1191 /// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair, 1192 /// return the decoded trigraph letter it corresponds to, or '\0' if nothing. 1193 static char GetTrigraphCharForLetter(char Letter) { 1194 switch (Letter) { 1195 default: return 0; 1196 case '=': return '#'; 1197 case ')': return ']'; 1198 case '(': return '['; 1199 case '!': return '|'; 1200 case '\'': return '^'; 1201 case '>': return '}'; 1202 case '/': return '\\'; 1203 case '<': return '{'; 1204 case '-': return '~'; 1205 } 1206 } 1207 1208 /// DecodeTrigraphChar - If the specified character is a legal trigraph when 1209 /// prefixed with ??, emit a trigraph warning. If trigraphs are enabled, 1210 /// return the result character. Finally, emit a warning about trigraph use 1211 /// whether trigraphs are enabled or not. 
1212 static char DecodeTrigraphChar(const char *CP, Lexer *L) { 1213 char Res = GetTrigraphCharForLetter(*CP); 1214 if (!Res || !L) return Res; 1215 1216 if (!L->getLangOpts().Trigraphs) { 1217 if (!L->isLexingRawMode()) 1218 L->Diag(CP-2, diag::trigraph_ignored); 1219 return 0; 1220 } 1221 1222 if (!L->isLexingRawMode()) 1223 L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1); 1224 return Res; 1225 } 1226 1227 /// getEscapedNewLineSize - Return the size of the specified escaped newline, 1228 /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a 1229 /// trigraph equivalent on entry to this function. 1230 unsigned Lexer::getEscapedNewLineSize(const char *Ptr) { 1231 unsigned Size = 0; 1232 while (isWhitespace(Ptr[Size])) { 1233 ++Size; 1234 1235 if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r') 1236 continue; 1237 1238 // If this is a \r\n or \n\r, skip the other half. 1239 if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') && 1240 Ptr[Size-1] != Ptr[Size]) 1241 ++Size; 1242 1243 return Size; 1244 } 1245 1246 // Not an escaped newline, must be a \t or something else. 1247 return 0; 1248 } 1249 1250 /// SkipEscapedNewLines - If P points to an escaped newline (or a series of 1251 /// them), skip over them and return the first non-escaped-newline found, 1252 /// otherwise return P. 1253 const char *Lexer::SkipEscapedNewLines(const char *P) { 1254 while (1) { 1255 const char *AfterEscape; 1256 if (*P == '\\') { 1257 AfterEscape = P+1; 1258 } else if (*P == '?') { 1259 // If not a trigraph for escape, bail out. 1260 if (P[1] != '?' || P[2] != '/') 1261 return P; 1262 AfterEscape = P+3; 1263 } else { 1264 return P; 1265 } 1266 1267 unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape); 1268 if (NewLineSize == 0) return P; 1269 P = AfterEscape+NewLineSize; 1270 } 1271 } 1272 1273 /// \brief Checks that the given token is the first token that occurs after the 1274 /// given location (this excludes comments and whitespace). 
Returns the location 1275 /// immediately after the specified token. If the token is not found or the 1276 /// location is inside a macro, the returned source location will be invalid. 1277 SourceLocation Lexer::findLocationAfterToken(SourceLocation Loc, 1278 tok::TokenKind TKind, 1279 const SourceManager &SM, 1280 const LangOptions &LangOpts, 1281 bool SkipTrailingWhitespaceAndNewLine) { 1282 if (Loc.isMacroID()) { 1283 if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc)) 1284 return SourceLocation(); 1285 } 1286 Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts); 1287 1288 // Break down the source location. 1289 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 1290 1291 // Try to load the file buffer. 1292 bool InvalidTemp = false; 1293 llvm::StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp); 1294 if (InvalidTemp) 1295 return SourceLocation(); 1296 1297 const char *TokenBegin = File.data() + LocInfo.second; 1298 1299 // Lex from the start of the given location. 1300 Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(), 1301 TokenBegin, File.end()); 1302 // Find the token. 1303 Token Tok; 1304 lexer.LexFromRawLexer(Tok); 1305 if (Tok.isNot(TKind)) 1306 return SourceLocation(); 1307 SourceLocation TokenLoc = Tok.getLocation(); 1308 1309 // Calculate how much whitespace needs to be skipped if any. 1310 unsigned NumWhitespaceChars = 0; 1311 if (SkipTrailingWhitespaceAndNewLine) { 1312 const char *TokenEnd = SM.getCharacterData(TokenLoc) + 1313 Tok.getLength(); 1314 unsigned char C = *TokenEnd; 1315 while (isHorizontalWhitespace(C)) { 1316 C = *(++TokenEnd); 1317 NumWhitespaceChars++; 1318 } 1319 if (isVerticalWhitespace(C)) 1320 NumWhitespaceChars++; 1321 } 1322 1323 return TokenLoc.getLocWithOffset(Tok.getLength() + NumWhitespaceChars); 1324 } 1325 1326 /// getCharAndSizeSlow - Peek a single 'character' from the specified buffer, 1327 /// get its size, and return it. 
This is tricky in several cases: 1328 /// 1. If currently at the start of a trigraph, we warn about the trigraph, 1329 /// then either return the trigraph (skipping 3 chars) or the '?', 1330 /// depending on whether trigraphs are enabled or not. 1331 /// 2. If this is an escaped newline (potentially with whitespace between 1332 /// the backslash and newline), implicitly skip the newline and return 1333 /// the char after it. 1334 /// 3. If this is a UCN, return it. FIXME: C++ UCN's? 1335 /// 1336 /// This handles the slow/uncommon case of the getCharAndSize method. Here we 1337 /// know that we can accumulate into Size, and that we have already incremented 1338 /// Ptr by Size bytes. 1339 /// 1340 /// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should 1341 /// be updated to match. 1342 /// 1343 char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size, 1344 Token *Tok) { 1345 // If we have a slash, look for an escaped newline. 1346 if (Ptr[0] == '\\') { 1347 ++Size; 1348 ++Ptr; 1349 Slash: 1350 // Common case, backslash-char where the char is not whitespace. 1351 if (!isWhitespace(Ptr[0])) return '\\'; 1352 1353 // See if we have optional whitespace characters between the slash and 1354 // newline. 1355 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) { 1356 // Remember that this token needs to be cleaned. 1357 if (Tok) Tok->setFlag(Token::NeedsCleaning); 1358 1359 // Warn if there was whitespace between the backslash and newline. 1360 if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode()) 1361 Diag(Ptr, diag::backslash_newline_space); 1362 1363 // Found backslash<whitespace><newline>. Parse the char after it. 1364 Size += EscapedNewLineSize; 1365 Ptr += EscapedNewLineSize; 1366 1367 // If the char that we finally got was a \n, then we must have had 1368 // something like \<newline><newline>. We don't want to consume the 1369 // second newline. 
1370 if (*Ptr == '\n' || *Ptr == '\r' || *Ptr == '\0') 1371 return ' '; 1372 1373 // Use slow version to accumulate a correct size field. 1374 return getCharAndSizeSlow(Ptr, Size, Tok); 1375 } 1376 1377 // Otherwise, this is not an escaped newline, just return the slash. 1378 return '\\'; 1379 } 1380 1381 // If this is a trigraph, process it. 1382 if (Ptr[0] == '?' && Ptr[1] == '?') { 1383 // If this is actually a legal trigraph (not something like "??x"), emit 1384 // a trigraph warning. If so, and if trigraphs are enabled, return it. 1385 if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : 0)) { 1386 // Remember that this token needs to be cleaned. 1387 if (Tok) Tok->setFlag(Token::NeedsCleaning); 1388 1389 Ptr += 3; 1390 Size += 3; 1391 if (C == '\\') goto Slash; 1392 return C; 1393 } 1394 } 1395 1396 // If this is neither, return a single character. 1397 ++Size; 1398 return *Ptr; 1399 } 1400 1401 1402 /// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the 1403 /// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size, 1404 /// and that we have already incremented Ptr by Size bytes. 1405 /// 1406 /// NOTE: When this method is updated, getCharAndSizeSlow (above) should 1407 /// be updated to match. 1408 char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size, 1409 const LangOptions &LangOpts) { 1410 // If we have a slash, look for an escaped newline. 1411 if (Ptr[0] == '\\') { 1412 ++Size; 1413 ++Ptr; 1414 Slash: 1415 // Common case, backslash-char where the char is not whitespace. 1416 if (!isWhitespace(Ptr[0])) return '\\'; 1417 1418 // See if we have optional whitespace characters followed by a newline. 1419 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) { 1420 // Found backslash<whitespace><newline>. Parse the char after it. 
1421 Size += EscapedNewLineSize; 1422 Ptr += EscapedNewLineSize; 1423 1424 // If the char that we finally got was a \n, then we must have had 1425 // something like \<newline><newline>. We don't want to consume the 1426 // second newline. 1427 if (*Ptr == '\n' || *Ptr == '\r' || *Ptr == '\0') 1428 return ' '; 1429 1430 // Use slow version to accumulate a correct size field. 1431 return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts); 1432 } 1433 1434 // Otherwise, this is not an escaped newline, just return the slash. 1435 return '\\'; 1436 } 1437 1438 // If this is a trigraph, process it. 1439 if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') { 1440 // If this is actually a legal trigraph (not something like "??x"), return 1441 // it. 1442 if (char C = GetTrigraphCharForLetter(Ptr[2])) { 1443 Ptr += 3; 1444 Size += 3; 1445 if (C == '\\') goto Slash; 1446 return C; 1447 } 1448 } 1449 1450 // If this is neither, return a single character. 1451 ++Size; 1452 return *Ptr; 1453 } 1454 1455 //===----------------------------------------------------------------------===// 1456 // Helper methods for lexing. 1457 //===----------------------------------------------------------------------===// 1458 1459 /// \brief Routine that indiscriminately skips bytes in the source file. 1460 void Lexer::SkipBytes(unsigned Bytes, bool StartOfLine) { 1461 BufferPtr += Bytes; 1462 if (BufferPtr > BufferEnd) 1463 BufferPtr = BufferEnd; 1464 IsAtStartOfLine = StartOfLine; 1465 } 1466 1467 void Lexer::LexIdentifier(Token &Result, const char *CurPtr) { 1468 // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$] 1469 unsigned Size; 1470 unsigned char C = *CurPtr++; 1471 while (isIdentifierBody(C)) 1472 C = *CurPtr++; 1473 1474 --CurPtr; // Back up over the skipped character. 1475 1476 // Fast path, no $,\,? in identifier found. '\' might be an escaped newline 1477 // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN. 1478 // FIXME: UCNs. 
1479 // 1480 // TODO: Could merge these checks into a CharInfo flag to make the comparison 1481 // cheaper 1482 if (C != '\\' && C != '?' && (C != '$' || !LangOpts.DollarIdents)) { 1483 FinishIdentifier: 1484 const char *IdStart = BufferPtr; 1485 FormTokenWithChars(Result, CurPtr, tok::raw_identifier); 1486 Result.setRawIdentifierData(IdStart); 1487 1488 // If we are in raw mode, return this identifier raw. There is no need to 1489 // look up identifier information or attempt to macro expand it. 1490 if (LexingRawMode) 1491 return; 1492 1493 // Fill in Result.IdentifierInfo and update the token kind, 1494 // looking up the identifier in the identifier table. 1495 IdentifierInfo *II = PP->LookUpIdentifierInfo(Result); 1496 1497 // Finally, now that we know we have an identifier, pass this off to the 1498 // preprocessor, which may macro expand it or something. 1499 if (II->isHandleIdentifierCase()) 1500 PP->HandleIdentifier(Result); 1501 1502 return; 1503 } 1504 1505 // Otherwise, $,\,? in identifier found. Enter slower path. 1506 1507 C = getCharAndSize(CurPtr, Size); 1508 while (1) { 1509 if (C == '$') { 1510 // If we hit a $ and they are not supported in identifiers, we are done. 1511 if (!LangOpts.DollarIdents) goto FinishIdentifier; 1512 1513 // Otherwise, emit a diagnostic and continue. 1514 if (!isLexingRawMode()) 1515 Diag(CurPtr, diag::ext_dollar_in_identifier); 1516 CurPtr = ConsumeChar(CurPtr, Size, Result); 1517 C = getCharAndSize(CurPtr, Size); 1518 continue; 1519 } else if (!isIdentifierBody(C)) { // FIXME: UCNs. 1520 // Found end of identifier. 1521 goto FinishIdentifier; 1522 } 1523 1524 // Otherwise, this character is good, consume it. 1525 CurPtr = ConsumeChar(CurPtr, Size, Result); 1526 1527 C = getCharAndSize(CurPtr, Size); 1528 while (isIdentifierBody(C)) { // FIXME: UCNs. 
1529 CurPtr = ConsumeChar(CurPtr, Size, Result); 1530 C = getCharAndSize(CurPtr, Size); 1531 } 1532 } 1533 } 1534 1535 /// isHexaLiteral - Return true if Start points to a hex constant. 1536 /// in microsoft mode (where this is supposed to be several different tokens). 1537 bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) { 1538 unsigned Size; 1539 char C1 = Lexer::getCharAndSizeNoWarn(Start, Size, LangOpts); 1540 if (C1 != '0') 1541 return false; 1542 char C2 = Lexer::getCharAndSizeNoWarn(Start + Size, Size, LangOpts); 1543 return (C2 == 'x' || C2 == 'X'); 1544 } 1545 1546 /// LexNumericConstant - Lex the remainder of a integer or floating point 1547 /// constant. From[-1] is the first character lexed. Return the end of the 1548 /// constant. 1549 void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) { 1550 unsigned Size; 1551 char C = getCharAndSize(CurPtr, Size); 1552 char PrevCh = 0; 1553 while (isNumberBody(C)) { // FIXME: UCNs. 1554 CurPtr = ConsumeChar(CurPtr, Size, Result); 1555 PrevCh = C; 1556 C = getCharAndSize(CurPtr, Size); 1557 } 1558 1559 // If we fell out, check for a sign, due to 1e+12. If we have one, continue. 1560 if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) { 1561 // If we are in Microsoft mode, don't continue if the constant is hex. 1562 // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1 1563 if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts)) 1564 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result)); 1565 } 1566 1567 // If we have a hex FP constant, continue. 1568 if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) { 1569 // Outside C99, we accept hexadecimal floating point numbers as a 1570 // not-quite-conforming extension. Only do so if this looks like it's 1571 // actually meant to be a hexfloat, and not if it has a ud-suffix. 
1572 bool IsHexFloat = true; 1573 if (!LangOpts.C99) { 1574 if (!isHexaLiteral(BufferPtr, LangOpts)) 1575 IsHexFloat = false; 1576 else if (std::find(BufferPtr, CurPtr, '_') != CurPtr) 1577 IsHexFloat = false; 1578 } 1579 if (IsHexFloat) 1580 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result)); 1581 } 1582 1583 // Update the location of token as well as BufferPtr. 1584 const char *TokStart = BufferPtr; 1585 FormTokenWithChars(Result, CurPtr, tok::numeric_constant); 1586 Result.setLiteralData(TokStart); 1587 } 1588 1589 /// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes 1590 /// in C++11, or warn on a ud-suffix in C++98. 1591 const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr) { 1592 assert(getLangOpts().CPlusPlus); 1593 1594 // Maximally munch an identifier. FIXME: UCNs. 1595 unsigned Size; 1596 char C = getCharAndSize(CurPtr, Size); 1597 if (isIdentifierHead(C)) { 1598 if (!getLangOpts().CPlusPlus0x) { 1599 if (!isLexingRawMode()) 1600 Diag(CurPtr, 1601 C == '_' ? diag::warn_cxx11_compat_user_defined_literal 1602 : diag::warn_cxx11_compat_reserved_user_defined_literal) 1603 << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " "); 1604 return CurPtr; 1605 } 1606 1607 // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix 1608 // that does not start with an underscore is ill-formed. As a conforming 1609 // extension, we treat all such suffixes as if they had whitespace before 1610 // them. 1611 if (C != '_') { 1612 if (!isLexingRawMode()) 1613 Diag(CurPtr, getLangOpts().MicrosoftMode ? 
1614 diag::ext_ms_reserved_user_defined_literal : 1615 diag::ext_reserved_user_defined_literal) 1616 << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " "); 1617 return CurPtr; 1618 } 1619 1620 Result.setFlag(Token::HasUDSuffix); 1621 do { 1622 CurPtr = ConsumeChar(CurPtr, Size, Result); 1623 C = getCharAndSize(CurPtr, Size); 1624 } while (isIdentifierBody(C)); 1625 } 1626 return CurPtr; 1627 } 1628 1629 /// LexStringLiteral - Lex the remainder of a string literal, after having lexed 1630 /// either " or L" or u8" or u" or U". 1631 void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, 1632 tok::TokenKind Kind) { 1633 const char *NulCharacter = 0; // Does this string contain the \0 character? 1634 1635 if (!isLexingRawMode() && 1636 (Kind == tok::utf8_string_literal || 1637 Kind == tok::utf16_string_literal || 1638 Kind == tok::utf32_string_literal)) 1639 Diag(BufferPtr, diag::warn_cxx98_compat_unicode_literal); 1640 1641 char C = getAndAdvanceChar(CurPtr, Result); 1642 while (C != '"') { 1643 // Skip escaped characters. Escaped newlines will already be processed by 1644 // getAndAdvanceChar. 1645 if (C == '\\') 1646 C = getAndAdvanceChar(CurPtr, Result); 1647 1648 if (C == '\n' || C == '\r' || // Newline. 1649 (C == 0 && CurPtr-1 == BufferEnd)) { // End of file. 1650 if (!isLexingRawMode() && !LangOpts.AsmPreprocessor) 1651 Diag(BufferPtr, diag::ext_unterminated_string); 1652 FormTokenWithChars(Result, CurPtr-1, tok::unknown); 1653 return; 1654 } 1655 1656 if (C == 0) { 1657 if (isCodeCompletionPoint(CurPtr-1)) { 1658 PP->CodeCompleteNaturalLanguage(); 1659 FormTokenWithChars(Result, CurPtr-1, tok::unknown); 1660 return cutOffLexing(); 1661 } 1662 1663 NulCharacter = CurPtr-1; 1664 } 1665 C = getAndAdvanceChar(CurPtr, Result); 1666 } 1667 1668 // If we are in C++11, lex the optional ud-suffix. 1669 if (getLangOpts().CPlusPlus) 1670 CurPtr = LexUDSuffix(Result, CurPtr); 1671 1672 // If a nul character existed in the string, warn about it. 
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_string);

  // Update the location of the token as well as the BufferPtr instance var.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
}

/// LexRawStringLiteral - Lex the remainder of a raw string literal, after
/// having lexed R", LR", u8R", uR", or UR".
void Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
                                tok::TokenKind Kind) {
  // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
  //  Between the initial and final double quote characters of the raw string,
  //  any transformations performed in phases 1 and 2 (trigraphs,
  //  universal-character-names, and line splicing) are reverted.

  if (!isLexingRawMode())
    Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);

  unsigned PrefixLen = 0;

  // Collect the d-char-sequence delimiter; it is limited to 16 characters
  // (C++11 [lex.string]), which is why the loop stops at 16.
  while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen]))
    ++PrefixLen;

  // If the last character was not a '(', then we didn't lex a valid delimiter.
  if (CurPtr[PrefixLen] != '(') {
    if (!isLexingRawMode()) {
      const char *PrefixEnd = &CurPtr[PrefixLen];
      if (PrefixLen == 16) {
        Diag(PrefixEnd, diag::err_raw_delim_too_long);
      } else {
        Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
          << StringRef(PrefixEnd, 1);
      }
    }

    // Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
    // it's possible the '"' was intended to be part of the raw string, but
    // there's not much we can do about that.
    while (1) {
      char C = *CurPtr++;

      if (C == '"')
        break;
      if (C == 0 && CurPtr-1 == BufferEnd) {
        --CurPtr;
        break;
      }
    }

    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return;
  }

  // Save prefix and move CurPtr past it
  const char *Prefix = CurPtr;
  CurPtr += PrefixLen + 1; // skip over prefix and '('

  // Scan for the matching ")delimiter"" that closes the raw string.
  while (1) {
    char C = *CurPtr++;

    if (C == ')') {
      // Check for prefix match and closing quote.
      if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') {
        CurPtr += PrefixLen + 1; // skip over prefix and '"'
        break;
      }
    } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file.
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::err_unterminated_raw_string)
          << StringRef(Prefix, PrefixLen);
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return;
    }
  }

  // If we are in C++11, lex the optional ud-suffix.
  if (getLangOpts().CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr);

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
}

/// LexAngledStringLiteral - Lex the remainder of an angled string literal,
/// after having lexed the '<' character.  This is used for #include filenames.
void Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
  const char *NulCharacter = 0; // Does this string contain the \0 character?
  const char *AfterLessPos = CurPtr;
  char C = getAndAdvanceChar(CurPtr, Result);
  while (C != '>') {
    // Skip escaped characters.
    if (C == '\\') {
      // Skip the escaped character.
      getAndAdvanceChar(CurPtr, Result);
    } else if (C == '\n' || C == '\r' ||             // Newline.
               (C == 0 && (CurPtr-1 == BufferEnd ||  // End of file.
                           isCodeCompletionPoint(CurPtr-1)))) {
      // If the filename is unterminated, then it must just be a lone <
      // character.  Return this as such.
      FormTokenWithChars(Result, AfterLessPos, tok::less);
      return;
    } else if (C == 0) {
      // Embedded NUL that is not EOF/code-completion: remember it so we can
      // warn after the literal is fully lexed.
      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If a nul character existed in the string, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_string);

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::angle_string_literal);
  Result.setLiteralData(TokStart);
}


/// LexCharConstant - Lex the remainder of a character constant, after having
/// lexed either ' or L' or u' or U'.
void Lexer::LexCharConstant(Token &Result, const char *CurPtr,
                            tok::TokenKind Kind) {
  const char *NulCharacter = 0; // Does this character contain the \0 character?

  if (!isLexingRawMode() &&
      (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant))
    Diag(BufferPtr, diag::warn_cxx98_compat_unicode_literal);

  char C = getAndAdvanceChar(CurPtr, Result);
  if (C == '\'') {
    // Empty character constant ('').  Diagnose (unless preprocessing
    // assembler, which tolerates it) and return an unknown token.
    if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
      Diag(BufferPtr, diag::ext_empty_character);
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return;
  }

  while (C != '\'') {
    // Skip escaped characters.
    if (C == '\\') {
      // Skip the escaped character.
      // FIXME: UCN's
      getAndAdvanceChar(CurPtr, Result);
    } else if (C == '\n' || C == '\r' ||             // Newline.
               (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
      if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
        Diag(BufferPtr, diag::ext_unterminated_char);
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return;
    } else if (C == 0) {
      if (isCodeCompletionPoint(CurPtr-1)) {
        PP->CodeCompleteNaturalLanguage();
        FormTokenWithChars(Result, CurPtr-1, tok::unknown);
        return cutOffLexing();
      }

      // Remember the embedded NUL so we can warn once the constant is lexed.
      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If we are in C++11, lex the optional ud-suffix.
  if (getLangOpts().CPlusPlus)
    CurPtr = LexUDSuffix(Result, CurPtr);

  // If a nul character existed in the character, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char);

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
}

/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
/// Update BufferPtr to point to the next non-whitespace character and return.
///
/// This method forms a token and returns true if KeepWhitespaceMode is enabled.
///
bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr) {
  // Whitespace - Skip it, then return the token after the whitespace.
  unsigned char Char = *CurPtr;  // Skip consecutive spaces efficiently.
  while (1) {
    // Skip horizontal whitespace very aggressively.
    while (isHorizontalWhitespace(Char))
      Char = *++CurPtr;

    // Otherwise if we have something other than whitespace, we're done.
    if (Char != '\n' && Char != '\r')
      break;

    if (ParsingPreprocessorDirective) {
      // End of preprocessor directive line, let LexTokenInternal handle this.
      BufferPtr = CurPtr;
      return false;
    }

    // ok, but handle newline.
    // The returned token is at the start of the line.
    Result.setFlag(Token::StartOfLine);
    // No leading whitespace seen so far.
    Result.clearFlag(Token::LeadingSpace);
    Char = *++CurPtr;
  }

  // If this isn't immediately after a newline, there is leading space.
  char PrevChar = CurPtr[-1];
  if (PrevChar != '\n' && PrevChar != '\r')
    Result.setFlag(Token::LeadingSpace);

  // If the client wants us to return whitespace, return it now.
  if (isKeepWhitespaceMode()) {
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return true;
  }

  BufferPtr = CurPtr;
  return false;
}

/// SkipBCPLComment - We have just read the // characters from input.  Skip
/// until we find the newline character that terminates the comment.  Then
/// update BufferPtr and return.
///
/// If we're in KeepCommentMode or any CommentHandler has inserted
/// some tokens, this will store the first token and return true.
bool Lexer::SkipBCPLComment(Token &Result, const char *CurPtr) {
  // If BCPL comments aren't explicitly enabled for this language, emit an
  // extension warning.
  if (!LangOpts.BCPLComment && !isLexingRawMode()) {
    Diag(BufferPtr, diag::ext_bcpl_comment);

    // Mark them enabled so we only emit one warning for this translation
    // unit.
    LangOpts.BCPLComment = true;
  }

  // Scan over the body of the comment.  The common case, when scanning, is that
  // the comment contains normal ascii characters with nothing interesting in
  // them.  As such, optimize for this case with the inner loop.
  char C;
  do {
    C = *CurPtr;
    // Skip over characters in the fast loop.
    while (C != 0 &&                // Potentially EOF.
           C != '\n' && C != '\r')  // Newline or DOS-style newline.
      C = *++CurPtr;

    const char *NextLine = CurPtr;
    if (C != 0) {
      // We found a newline, see if it's escaped.
      const char *EscapePtr = CurPtr-1;
      while (isHorizontalWhitespace(*EscapePtr)) // Skip whitespace.
        --EscapePtr;

      if (*EscapePtr == '\\') // Escaped newline.
        CurPtr = EscapePtr;
      else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
               EscapePtr[-2] == '?') // Trigraph-escaped newline.
        CurPtr = EscapePtr-2;
      else
        break; // This is a newline, we're done.
    }

    // Otherwise, this is a hard case.  Fall back on getAndAdvanceChar to
    // properly decode the character.  Read it in raw mode to avoid emitting
    // diagnostics about things like trigraphs.  If we see an escaped newline,
    // we'll handle it below.
    const char *OldPtr = CurPtr;
    bool OldRawMode = isLexingRawMode();
    LexingRawMode = true;
    C = getAndAdvanceChar(CurPtr, Result);
    LexingRawMode = OldRawMode;

    // If we only read one character, then no special handling is needed.
    // We're done and can skip forward to the newline.
    if (C != 0 && CurPtr == OldPtr+1) {
      CurPtr = NextLine;
      break;
    }

    // If we read multiple characters, and one of those characters was a \r or
    // \n, then we had an escaped newline within the comment.  Emit diagnostic
    // unless the next line is also a // comment.
    if (CurPtr != OldPtr+1 && C != '/' && CurPtr[0] != '/') {
      for (; OldPtr != CurPtr; ++OldPtr)
        if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
          // Okay, we found a // comment that ends in a newline, if the next
          // line is also a // comment, but has spaces, don't emit a diagnostic.
          if (isWhitespace(C)) {
            const char *ForwardPtr = CurPtr;
            while (isWhitespace(*ForwardPtr))  // Skip whitespace.
              ++ForwardPtr;
            if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
              break;
          }

          if (!isLexingRawMode())
            Diag(OldPtr-1, diag::ext_multi_line_bcpl_comment);
          break;
        }
    }

    // getAndAdvanceChar ran off the end of the buffer.
    if (CurPtr == BufferEnd+1) {
      --CurPtr;
      break;
    }

    if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
      PP->CodeCompleteNaturalLanguage();
      cutOffLexing();
      return false;
    }

  } while (C != '\n' && C != '\r');

  // Found but did not consume the newline.  Notify comment handlers about the
  // comment unless we're in a #if 0 block.
  if (PP && !isLexingRawMode() &&
      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
                                            getSourceLocation(CurPtr)))) {
    BufferPtr = CurPtr;
    return true; // A token has to be returned.
  }

  // If we are returning comments as tokens, return this comment as a token.
  if (inKeepCommentMode())
    return SaveBCPLComment(Result, CurPtr);

  // If we are inside a preprocessor directive and we see the end of line,
  // return immediately, so that the lexer can return this as an EOD token.
  if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
    BufferPtr = CurPtr;
    return false;
  }

  // Otherwise, eat the \n character.  We don't care if this is a \n\r or
  // \r\n sequence.  This is an efficiency hack (because we know the \n can't
  // contribute to another token), it isn't needed for correctness.  Note that
  // this is ok even in KeepWhitespaceMode, because we would have returned the
  // comment above in that mode.
  ++CurPtr;

  // The next returned token is at the start of the line.
  Result.setFlag(Token::StartOfLine);
  // No leading whitespace seen so far.
  Result.clearFlag(Token::LeadingSpace);
  BufferPtr = CurPtr;
  return false;
}

/// SaveBCPLComment - If in save-comment mode, package up this BCPL comment in
/// an appropriate way and return it.
bool Lexer::SaveBCPLComment(Token &Result, const char *CurPtr) {
  // If we're not in a preprocessor directive, just return the // comment
  // directly.
  FormTokenWithChars(Result, CurPtr, tok::comment);

  if (!ParsingPreprocessorDirective || LexingRawMode)
    return true;

  // If this BCPL-style comment is in a macro definition, transmogrify it into
  // a C-style block comment.
  bool Invalid = false;
  std::string Spelling = PP->getSpelling(Result, &Invalid);
  if (Invalid)
    return true;

  assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not bcpl comment?");
  Spelling[1] = '*';   // Change prefix to "/*".
  Spelling += "*/";    // add suffix.

  Result.setKind(tok::comment);
  PP->CreateString(&Spelling[0], Spelling.size(), Result,
                   Result.getLocation(), Result.getLocation());
  return true;
}

/// isEndOfBlockCommentWithEscapedNewLine - Return true if the specified newline
/// character (either \\n or \\r) is part of an escaped newline sequence.  Issue
/// a diagnostic if so.  We know that the newline is inside of a block comment.
static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr,
                                                  Lexer *L) {
  assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');

  // Back up off the newline.
  --CurPtr;

  // If this is a two-character newline sequence, skip the other character.
  if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
    // \n\n or \r\r -> not escaped newline.
    if (CurPtr[0] == CurPtr[1])
      return false;
    // \n\r or \r\n -> skip the newline.
    --CurPtr;
  }

  // If we have horizontal whitespace, skip over it.  We allow whitespace
  // between the slash and newline.
  bool HasSpace = false;
  // Skip over horizontal whitespace (and any NUL bytes) between the
  // backslash/trigraph and the newline.
  while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
    --CurPtr;
    HasSpace = true;
  }

  // If we have a slash, we know this is an escaped newline.
  if (*CurPtr == '\\') {
    // Only an escaped newline *ending the comment* if preceded by '*'.
    if (CurPtr[-1] != '*') return false;
  } else {
    // It isn't a slash, is it the ?? / trigraph?
    if (CurPtr[0] != '/' || CurPtr[-1] != '?' || CurPtr[-2] != '?' ||
        CurPtr[-3] != '*')
      return false;

    // This is the trigraph ending the comment.  Emit a stern warning!
    CurPtr -= 2;

    // If no trigraphs are enabled, warn that we ignored this trigraph and
    // ignore this * character.
    if (!L->getLangOpts().Trigraphs) {
      if (!L->isLexingRawMode())
        L->Diag(CurPtr, diag::trigraph_ignored_block_comment);
      return false;
    }
    if (!L->isLexingRawMode())
      L->Diag(CurPtr, diag::trigraph_ends_block_comment);
  }

  // Warn about having an escaped newline between the */ characters.
  if (!L->isLexingRawMode())
    L->Diag(CurPtr, diag::escaped_newline_block_comment_end);

  // If there was space between the backslash and newline, warn about it.
  if (HasSpace && !L->isLexingRawMode())
    L->Diag(CurPtr, diag::backslash_newline_space);

  return true;
}

#ifdef __SSE2__
#include <emmintrin.h>
#elif __ALTIVEC__
#include <altivec.h>
#undef bool
#endif

/// We have just read from input the / and * characters that started a comment.
/// Read until we find the * and / characters that terminate the comment.
/// Note that we don't bother decoding trigraphs or escaped newlines in block
/// comments, because they cannot cause the comment to end.  The only thing
/// that can happen is the comment could end with an escaped newline between
/// the terminating * and /.
///
/// If we're in KeepCommentMode or any CommentHandler has inserted
/// some tokens, this will store the first token and return true.
bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) {
  // Scan one character past where we should, looking for a '/' character.  Once
  // we find it, check to see if it was preceded by a *.  This common
  // optimization helps people who like to put a lot of * characters in their
  // comments.

  // The first character we get with newlines and trigraphs skipped to handle
  // the degenerate /*/ case below correctly if the * has an escaped newline
  // after it.
  unsigned CharSize;
  unsigned char C = getCharAndSize(CurPtr, CharSize);
  CurPtr += CharSize;
  if (C == 0 && CurPtr == BufferEnd+1) {
    if (!isLexingRawMode())
      Diag(BufferPtr, diag::err_unterminated_block_comment);
    --CurPtr;

    // KeepWhitespaceMode should return this broken comment as a token.  Since
    // it isn't a well formed comment, just return it as an 'unknown' token.
    if (isKeepWhitespaceMode()) {
      FormTokenWithChars(Result, CurPtr, tok::unknown);
      return true;
    }

    BufferPtr = CurPtr;
    return false;
  }

  // Check to see if the first character after the '/*' is another /.  If so,
  // then this slash does not end the block comment, it is part of it.
  if (C == '/')
    C = *CurPtr++;

  while (1) {
    // Skip over all non-interesting characters until we find end of buffer or a
    // (probably ending) '/' character.
    if (CurPtr + 24 < BufferEnd &&
        // If there is a code-completion point avoid the fast scan because it
        // doesn't check for '\0'.
        !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
      // While not aligned to a 16-byte boundary.
      while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0)
        C = *CurPtr++;

      if (C == '/') goto FoundSlash;

#ifdef __SSE2__
      // Compare 16 bytes at a time against '/'; the CurPtr+16 <= BufferEnd
      // check keeps the vector load in bounds.
      __m128i Slashes = _mm_set1_epi8('/');
      while (CurPtr+16 <= BufferEnd) {
        int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
                                    Slashes));
        if (cmp != 0) {
          // Adjust the pointer to point directly after the first slash. It's
          // not necessary to set C here, it will be overwritten at the end of
          // the outer loop.
          CurPtr += llvm::CountTrailingZeros_32(cmp) + 1;
          goto FoundSlash;
        }
        CurPtr += 16;
      }
#elif __ALTIVEC__
      __vector unsigned char Slashes = {
        '/', '/', '/', '/',  '/', '/', '/', '/',
        '/', '/', '/', '/',  '/', '/', '/', '/'
      };
      while (CurPtr+16 <= BufferEnd &&
             !vec_any_eq(*(vector unsigned char*)CurPtr, Slashes))
        CurPtr += 16;
#else
      // Scan for '/' quickly.  Many block comments are very large.
      // Reading CurPtr[0..3] before the bounds test is safe here because of
      // the CurPtr + 24 < BufferEnd guard on this whole fast path.
      while (CurPtr[0] != '/' &&
             CurPtr[1] != '/' &&
             CurPtr[2] != '/' &&
             CurPtr[3] != '/' &&
             CurPtr+4 < BufferEnd) {
        CurPtr += 4;
      }
#endif

      // It has to be one of the bytes scanned, increment to it and read one.
      C = *CurPtr++;
    }

    // Loop to scan the remainder.
    while (C != '/' && C != '\0')
      C = *CurPtr++;

    if (C == '/') {
  FoundSlash:
      if (CurPtr[-2] == '*')  // We found the final */.  We're done!
        break;

      if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
        if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) {
          // We found the final */, though it had an escaped newline between the
          // * and /.  We're done!
          break;
        }
      }
      if (CurPtr[0] == '*' && CurPtr[1] != '/') {
        // If this is a /* inside of the comment, emit a warning.  Don't do this
        // if this is a /*/, which will end the comment.  This misses cases with
        // embedded escaped newlines, but oh well.
        if (!isLexingRawMode())
          Diag(CurPtr-1, diag::warn_nested_block_comment);
      }
    } else if (C == 0 && CurPtr == BufferEnd+1) {
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::err_unterminated_block_comment);
      // Note: the user probably forgot a */.  We could continue immediately
      // after the /*, but this would involve lexing a lot of what really is the
      // comment, which surely would confuse the parser.
      --CurPtr;

      // KeepWhitespaceMode should return this broken comment as a token.  Since
      // it isn't a well formed comment, just return it as an 'unknown' token.
      if (isKeepWhitespaceMode()) {
        FormTokenWithChars(Result, CurPtr, tok::unknown);
        return true;
      }

      BufferPtr = CurPtr;
      return false;
    } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
      PP->CodeCompleteNaturalLanguage();
      cutOffLexing();
      return false;
    }

    C = *CurPtr++;
  }

  // Notify comment handlers about the comment unless we're in a #if 0 block.
  if (PP && !isLexingRawMode() &&
      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
                                            getSourceLocation(CurPtr)))) {
    BufferPtr = CurPtr;
    return true; // A token has to be returned.
  }

  // If we are returning comments as tokens, return this comment as a token.
  if (inKeepCommentMode()) {
    FormTokenWithChars(Result, CurPtr, tok::comment);
    return true;
  }

  // It is common for the tokens immediately after a /**/ comment to be
  // whitespace.  Instead of going through the big switch, handle it
  // efficiently now.  This is safe even in KeepWhitespaceMode because we would
  // have already returned above with the comment as a token.
  if (isHorizontalWhitespace(*CurPtr)) {
    Result.setFlag(Token::LeadingSpace);
    SkipWhitespace(Result, CurPtr+1);
    return false;
  }

  // Otherwise, just return so that the next character will be lexed as a token.
  BufferPtr = CurPtr;
  Result.setFlag(Token::LeadingSpace);
  return false;
}

//===----------------------------------------------------------------------===//
// Primary Lexing Entry Points
//===----------------------------------------------------------------------===//

/// ReadToEndOfLine - Read the rest of the current preprocessor line as an
/// uninterpreted string.  This switches the lexer out of directive mode.
void Lexer::ReadToEndOfLine(SmallVectorImpl<char> *Result) {
  assert(ParsingPreprocessorDirective && ParsingFilename == false &&
         "Must be in a preprocessing directive!");
  Token Tmp;

  // CurPtr - Cache BufferPtr in an automatic variable.
  const char *CurPtr = BufferPtr;
  while (1) {
    char Char = getAndAdvanceChar(CurPtr, Tmp);
    switch (Char) {
    default:
      if (Result)
        Result->push_back(Char);
      break;
    case 0:  // Null.
      // Found end of file?
      if (CurPtr-1 != BufferEnd) {
        if (isCodeCompletionPoint(CurPtr-1)) {
          PP->CodeCompleteNaturalLanguage();
          cutOffLexing();
          return;
        }

        // Nope, normal character, continue.
        if (Result)
          Result->push_back(Char);
        break;
      }
      // FALL THROUGH.
    case '\r':
    case '\n':
      // Okay, we found the end of the line. First, back up past the \0, \r, \n.
      assert(CurPtr[-1] == Char && "Trigraphs for newline?");
      BufferPtr = CurPtr-1;

      // Next, lex the character, which should handle the EOD transition.
2337 Lex(Tmp); 2338 if (Tmp.is(tok::code_completion)) { 2339 if (PP) 2340 PP->CodeCompleteNaturalLanguage(); 2341 Lex(Tmp); 2342 } 2343 assert(Tmp.is(tok::eod) && "Unexpected token!"); 2344 2345 // Finally, we're done; 2346 return; 2347 } 2348 } 2349 } 2350 2351 /// LexEndOfFile - CurPtr points to the end of this file. Handle this 2352 /// condition, reporting diagnostics and handling other edge cases as required. 2353 /// This returns true if Result contains a token, false if PP.Lex should be 2354 /// called again. 2355 bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) { 2356 // If we hit the end of the file while parsing a preprocessor directive, 2357 // end the preprocessor directive first. The next token returned will 2358 // then be the end of file. 2359 if (ParsingPreprocessorDirective) { 2360 // Done parsing the "line". 2361 ParsingPreprocessorDirective = false; 2362 // Update the location of token as well as BufferPtr. 2363 FormTokenWithChars(Result, CurPtr, tok::eod); 2364 2365 // Restore comment saving mode, in case it was disabled for directive. 2366 SetCommentRetentionState(PP->getCommentRetentionState()); 2367 return true; // Have a token. 2368 } 2369 2370 // If we are in raw mode, return this event as an EOF token. Let the caller 2371 // that put us in raw mode handle the event. 2372 if (isLexingRawMode()) { 2373 Result.startToken(); 2374 BufferPtr = BufferEnd; 2375 FormTokenWithChars(Result, BufferEnd, tok::eof); 2376 return true; 2377 } 2378 2379 // Issue diagnostics for unterminated #if and missing newline. 2380 2381 // If we are in a #if directive, emit an error. 2382 while (!ConditionalStack.empty()) { 2383 if (PP->getCodeCompletionFileLoc() != FileLoc) 2384 PP->Diag(ConditionalStack.back().IfLoc, 2385 diag::err_pp_unterminated_conditional); 2386 ConditionalStack.pop_back(); 2387 } 2388 2389 // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue 2390 // a pedwarn. 
2391 if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) 2392 Diag(BufferEnd, LangOpts.CPlusPlus0x ? // C++11 [lex.phases] 2.2 p2 2393 diag::warn_cxx98_compat_no_newline_eof : diag::ext_no_newline_eof) 2394 << FixItHint::CreateInsertion(getSourceLocation(BufferEnd), "\n"); 2395 2396 BufferPtr = CurPtr; 2397 2398 // Finally, let the preprocessor handle this. 2399 return PP->HandleEndOfFile(Result, isPragmaLexer()); 2400 } 2401 2402 /// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from 2403 /// the specified lexer will return a tok::l_paren token, 0 if it is something 2404 /// else and 2 if there are no more tokens in the buffer controlled by the 2405 /// lexer. 2406 unsigned Lexer::isNextPPTokenLParen() { 2407 assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?"); 2408 2409 // Switch to 'skipping' mode. This will ensure that we can lex a token 2410 // without emitting diagnostics, disables macro expansion, and will cause EOF 2411 // to return an EOF token instead of popping the include stack. 2412 LexingRawMode = true; 2413 2414 // Save state that can be changed while lexing so that we can restore it. 2415 const char *TmpBufferPtr = BufferPtr; 2416 bool inPPDirectiveMode = ParsingPreprocessorDirective; 2417 2418 Token Tok; 2419 Tok.startToken(); 2420 LexTokenInternal(Tok); 2421 2422 // Restore state that may have changed. 2423 BufferPtr = TmpBufferPtr; 2424 ParsingPreprocessorDirective = inPPDirectiveMode; 2425 2426 // Restore the lexer back to non-skipping mode. 2427 LexingRawMode = false; 2428 2429 if (Tok.is(tok::eof)) 2430 return 2; 2431 return Tok.is(tok::l_paren); 2432 } 2433 2434 /// \brief Find the end of a version control conflict marker. 2435 static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd, 2436 ConflictMarkerKind CMK) { 2437 const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>"; 2438 size_t TermLen = CMK == CMK_Perforce ? 
5 : 7; 2439 StringRef RestOfBuffer(CurPtr+TermLen, BufferEnd-CurPtr-TermLen); 2440 size_t Pos = RestOfBuffer.find(Terminator); 2441 while (Pos != StringRef::npos) { 2442 // Must occur at start of line. 2443 if (RestOfBuffer[Pos-1] != '\r' && 2444 RestOfBuffer[Pos-1] != '\n') { 2445 RestOfBuffer = RestOfBuffer.substr(Pos+TermLen); 2446 Pos = RestOfBuffer.find(Terminator); 2447 continue; 2448 } 2449 return RestOfBuffer.data()+Pos; 2450 } 2451 return 0; 2452 } 2453 2454 /// IsStartOfConflictMarker - If the specified pointer is the start of a version 2455 /// control conflict marker like '<<<<<<<', recognize it as such, emit an error 2456 /// and recover nicely. This returns true if it is a conflict marker and false 2457 /// if not. 2458 bool Lexer::IsStartOfConflictMarker(const char *CurPtr) { 2459 // Only a conflict marker if it starts at the beginning of a line. 2460 if (CurPtr != BufferStart && 2461 CurPtr[-1] != '\n' && CurPtr[-1] != '\r') 2462 return false; 2463 2464 // Check to see if we have <<<<<<< or >>>>. 2465 if ((BufferEnd-CurPtr < 8 || StringRef(CurPtr, 7) != "<<<<<<<") && 2466 (BufferEnd-CurPtr < 6 || StringRef(CurPtr, 5) != ">>>> ")) 2467 return false; 2468 2469 // If we have a situation where we don't care about conflict markers, ignore 2470 // it. 2471 if (CurrentConflictMarkerState || isLexingRawMode()) 2472 return false; 2473 2474 ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce; 2475 2476 // Check to see if there is an ending marker somewhere in the buffer at the 2477 // start of a line to terminate this conflict marker. 2478 if (FindConflictEnd(CurPtr, BufferEnd, Kind)) { 2479 // We found a match. We are really in a conflict marker. 2480 // Diagnose this, and ignore to the end of line. 2481 Diag(CurPtr, diag::err_conflict_marker); 2482 CurrentConflictMarkerState = Kind; 2483 2484 // Skip ahead to the end of line. We know this exists because the 2485 // end-of-conflict marker starts with \r or \n. 
2486 while (*CurPtr != '\r' && *CurPtr != '\n') { 2487 assert(CurPtr != BufferEnd && "Didn't find end of line"); 2488 ++CurPtr; 2489 } 2490 BufferPtr = CurPtr; 2491 return true; 2492 } 2493 2494 // No end of conflict marker found. 2495 return false; 2496 } 2497 2498 2499 /// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if 2500 /// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it 2501 /// is the end of a conflict marker. Handle it by ignoring up until the end of 2502 /// the line. This returns true if it is a conflict marker and false if not. 2503 bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) { 2504 // Only a conflict marker if it starts at the beginning of a line. 2505 if (CurPtr != BufferStart && 2506 CurPtr[-1] != '\n' && CurPtr[-1] != '\r') 2507 return false; 2508 2509 // If we have a situation where we don't care about conflict markers, ignore 2510 // it. 2511 if (!CurrentConflictMarkerState || isLexingRawMode()) 2512 return false; 2513 2514 // Check to see if we have the marker (4 characters in a row). 2515 for (unsigned i = 1; i != 4; ++i) 2516 if (CurPtr[i] != CurPtr[0]) 2517 return false; 2518 2519 // If we do have it, search for the end of the conflict marker. This could 2520 // fail if it got skipped with a '#if 0' or something. Note that CurPtr might 2521 // be the end of conflict marker. 2522 if (const char *End = FindConflictEnd(CurPtr, BufferEnd, 2523 CurrentConflictMarkerState)) { 2524 CurPtr = End; 2525 2526 // Skip ahead to the end of line. 2527 while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n') 2528 ++CurPtr; 2529 2530 BufferPtr = CurPtr; 2531 2532 // No longer in the conflict marker. 
    CurrentConflictMarkerState = CMK_None;
    return true;
  }

  return false;
}

bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
  // True only when code completion is enabled and CurPtr maps exactly to the
  // registered code-completion source location.
  if (PP && PP->isCodeCompletionEnabled()) {
    SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart);
    return Loc == PP->getCodeCompletionLoc();
  }

  return false;
}


/// LexTokenInternal - This implements a simple C family lexer.  It is an
/// extremely performance critical piece of code.  This assumes that the buffer
/// has a null character at the end of the file.  This returns a preprocessing
/// token, not a normal token, as such, it is an internal interface.  It assumes
/// that the Flags of result have been cleared before calling this.
void Lexer::LexTokenInternal(Token &Result) {
LexNextToken:
  // New token, can't need cleaning yet.
  Result.clearFlag(Token::NeedsCleaning);
  Result.setIdentifierInfo(0);

  // CurPtr - Cache BufferPtr in an automatic variable.
  const char *CurPtr = BufferPtr;

  // Small amounts of horizontal whitespace are very common between tokens.
  if ((*CurPtr == ' ') || (*CurPtr == '\t')) {
    ++CurPtr;
    while ((*CurPtr == ' ') || (*CurPtr == '\t'))
      ++CurPtr;

    // If we are keeping whitespace and other tokens, just return what we just
    // skipped.  The next lexer invocation will return the token after the
    // whitespace.
    if (isKeepWhitespaceMode()) {
      FormTokenWithChars(Result, CurPtr, tok::unknown);
      return;
    }

    BufferPtr = CurPtr;
    Result.setFlag(Token::LeadingSpace);
  }

  unsigned SizeTmp, SizeTmp2;   // Temporaries for use in cases below.

  // Read a character, advancing over it.
  char Char = getAndAdvanceChar(CurPtr, Result);
  tok::TokenKind Kind;

  switch (Char) {
  case 0:  // Null.
    // Found end of file?
    if (CurPtr-1 == BufferEnd) {
      // Read the PP instance variable into an automatic variable, because
      // LexEndOfFile will often delete 'this'.
      Preprocessor *PPCache = PP;
      if (LexEndOfFile(Result, CurPtr-1))  // Retreat back into the file.
        return;   // Got a token to return.
      assert(PPCache && "Raw buffer::LexEndOfFile should return a token");
      return PPCache->Lex(Result);
    }

    // Check if we are performing code completion.
    if (isCodeCompletionPoint(CurPtr-1)) {
      // Return the code-completion token.
      Result.startToken();
      FormTokenWithChars(Result, CurPtr, tok::code_completion);
      return;
    }

    if (!isLexingRawMode())
      Diag(CurPtr-1, diag::null_in_file);
    Result.setFlag(Token::LeadingSpace);
    if (SkipWhitespace(Result, CurPtr))
      return; // KeepWhitespaceMode

    goto LexNextToken;   // GCC isn't tail call eliminating.

  case 26:  // DOS & CP/M EOF: "^Z".
    // If we're in Microsoft extensions mode, treat this as end of file.
    if (LangOpts.MicrosoftExt) {
      // Read the PP instance variable into an automatic variable, because
      // LexEndOfFile will often delete 'this'.
      Preprocessor *PPCache = PP;
      if (LexEndOfFile(Result, CurPtr-1))  // Retreat back into the file.
        return;   // Got a token to return.
      assert(PPCache && "Raw buffer::LexEndOfFile should return a token");
      return PPCache->Lex(Result);
    }
    // If Microsoft extensions are disabled, this is just random garbage.
    Kind = tok::unknown;
    break;

  case '\n':
  case '\r':
    // If we are inside a preprocessor directive and we see the end of line,
    // we know we are done with the directive, so return an EOD token.
    if (ParsingPreprocessorDirective) {
      // Done parsing the "line".
      ParsingPreprocessorDirective = false;

      // Restore comment saving mode, in case it was disabled for directive.
      if (PP)
        SetCommentRetentionState(PP->getCommentRetentionState());

      // Since we consumed a newline, we are back at the start of a line.
      IsAtStartOfLine = true;

      Kind = tok::eod;
      break;
    }
    // The returned token is at the start of the line.
    Result.setFlag(Token::StartOfLine);
    // No leading whitespace seen so far.
    Result.clearFlag(Token::LeadingSpace);

    if (SkipWhitespace(Result, CurPtr))
      return; // KeepWhitespaceMode
    goto LexNextToken;   // GCC isn't tail call eliminating.
  case ' ':
  case '\t':
  case '\f':
  case '\v':
  SkipHorizontalWhitespace:
    Result.setFlag(Token::LeadingSpace);
    if (SkipWhitespace(Result, CurPtr))
      return; // KeepWhitespaceMode

  SkipIgnoredUnits:
    CurPtr = BufferPtr;

    // If the next token is obviously a // or /* */ comment, skip it efficiently
    // too (without going through the big switch stmt).
    if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
        LangOpts.BCPLComment && !LangOpts.TraditionalCPP) {
      if (SkipBCPLComment(Result, CurPtr+2))
        return; // There is a token to return.
      goto SkipIgnoredUnits;
    } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) {
      if (SkipBlockComment(Result, CurPtr+2))
        return; // There is a token to return.
      goto SkipIgnoredUnits;
    } else if (isHorizontalWhitespace(*CurPtr)) {
      goto SkipHorizontalWhitespace;
    }
    goto LexNextToken;   // GCC isn't tail call eliminating.

  // C99 6.4.4.1: Integer Constants.
  // C99 6.4.4.2: Floating Constants.
  case '0': case '1': case '2': case '3': case '4':
  case '5': case '6': case '7': case '8': case '9':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexNumericConstant(Result, CurPtr);

  case 'u':   // Identifier (uber) or C++0x UTF-8 or UTF-16 string literal
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();

    if (LangOpts.CPlusPlus0x) {
      Char = getCharAndSize(CurPtr, SizeTmp);

      // UTF-16 string literal
      if (Char == '"')
        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                                tok::utf16_string_literal);

      // UTF-16 character constant
      if (Char == '\'')
        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                               tok::utf16_char_constant);

      // UTF-16 raw string literal
      if (Char == 'R' && getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
        return LexRawStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::utf16_string_literal);

      if (Char == '8') {
        char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);

        // UTF-8 string literal
        if (Char2 == '"')
          return LexStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::utf8_string_literal);

        if (Char2 == 'R') {
          unsigned SizeTmp3;
          char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
          // UTF-8 raw string literal
          if (Char3 == '"') {
            return LexRawStringLiteral(Result,
                   ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               SizeTmp3, Result),
                   tok::utf8_string_literal);
          }
        }
      }
    }

    // treat u like the start of an identifier.
    return LexIdentifier(Result, CurPtr);

  case 'U':   // Identifier (Uber) or C++0x UTF-32 string literal
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();

    if (LangOpts.CPlusPlus0x) {
      Char = getCharAndSize(CurPtr, SizeTmp);

      // UTF-32 string literal
      if (Char == '"')
        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                                tok::utf32_string_literal);

      // UTF-32 character constant
      if (Char == '\'')
        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                               tok::utf32_char_constant);

      // UTF-32 raw string literal
      if (Char == 'R' && getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
        return LexRawStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::utf32_string_literal);
    }

    // treat U like the start of an identifier.
    return LexIdentifier(Result, CurPtr);

  case 'R': // Identifier or C++0x raw string literal
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();

    if (LangOpts.CPlusPlus0x) {
      Char = getCharAndSize(CurPtr, SizeTmp);

      if (Char == '"')
        return LexRawStringLiteral(Result,
                                   ConsumeChar(CurPtr, SizeTmp, Result),
                                   tok::string_literal);
    }

    // treat R like the start of an identifier.
    return LexIdentifier(Result, CurPtr);

  case 'L':   // Identifier (Loony) or wide literal (L'x' or L"xyz").
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    Char = getCharAndSize(CurPtr, SizeTmp);

    // Wide string literal.
    if (Char == '"')
      return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                              tok::wide_string_literal);

    // Wide raw string literal.
    if (LangOpts.CPlusPlus0x && Char == 'R' &&
        getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
      return LexRawStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::wide_string_literal);

    // Wide character constant.
2809 if (Char == '\'') 2810 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), 2811 tok::wide_char_constant); 2812 // FALL THROUGH, treating L like the start of an identifier. 2813 2814 // C99 6.4.2: Identifiers. 2815 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': 2816 case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N': 2817 case 'O': case 'P': case 'Q': /*'R'*/case 'S': case 'T': /*'U'*/ 2818 case 'V': case 'W': case 'X': case 'Y': case 'Z': 2819 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': 2820 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': 2821 case 'o': case 'p': case 'q': case 'r': case 's': case 't': /*'u'*/ 2822 case 'v': case 'w': case 'x': case 'y': case 'z': 2823 case '_': 2824 // Notify MIOpt that we read a non-whitespace/non-comment token. 2825 MIOpt.ReadToken(); 2826 return LexIdentifier(Result, CurPtr); 2827 2828 case '$': // $ in identifiers. 2829 if (LangOpts.DollarIdents) { 2830 if (!isLexingRawMode()) 2831 Diag(CurPtr-1, diag::ext_dollar_in_identifier); 2832 // Notify MIOpt that we read a non-whitespace/non-comment token. 2833 MIOpt.ReadToken(); 2834 return LexIdentifier(Result, CurPtr); 2835 } 2836 2837 Kind = tok::unknown; 2838 break; 2839 2840 // C99 6.4.4: Character Constants. 2841 case '\'': 2842 // Notify MIOpt that we read a non-whitespace/non-comment token. 2843 MIOpt.ReadToken(); 2844 return LexCharConstant(Result, CurPtr, tok::char_constant); 2845 2846 // C99 6.4.5: String Literals. 2847 case '"': 2848 // Notify MIOpt that we read a non-whitespace/non-comment token. 2849 MIOpt.ReadToken(); 2850 return LexStringLiteral(Result, CurPtr, tok::string_literal); 2851 2852 // C99 6.4.6: Punctuators. 
2853 case '?': 2854 Kind = tok::question; 2855 break; 2856 case '[': 2857 Kind = tok::l_square; 2858 break; 2859 case ']': 2860 Kind = tok::r_square; 2861 break; 2862 case '(': 2863 Kind = tok::l_paren; 2864 break; 2865 case ')': 2866 Kind = tok::r_paren; 2867 break; 2868 case '{': 2869 Kind = tok::l_brace; 2870 break; 2871 case '}': 2872 Kind = tok::r_brace; 2873 break; 2874 case '.': 2875 Char = getCharAndSize(CurPtr, SizeTmp); 2876 if (Char >= '0' && Char <= '9') { 2877 // Notify MIOpt that we read a non-whitespace/non-comment token. 2878 MIOpt.ReadToken(); 2879 2880 return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result)); 2881 } else if (LangOpts.CPlusPlus && Char == '*') { 2882 Kind = tok::periodstar; 2883 CurPtr += SizeTmp; 2884 } else if (Char == '.' && 2885 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') { 2886 Kind = tok::ellipsis; 2887 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 2888 SizeTmp2, Result); 2889 } else { 2890 Kind = tok::period; 2891 } 2892 break; 2893 case '&': 2894 Char = getCharAndSize(CurPtr, SizeTmp); 2895 if (Char == '&') { 2896 Kind = tok::ampamp; 2897 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2898 } else if (Char == '=') { 2899 Kind = tok::ampequal; 2900 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2901 } else { 2902 Kind = tok::amp; 2903 } 2904 break; 2905 case '*': 2906 if (getCharAndSize(CurPtr, SizeTmp) == '=') { 2907 Kind = tok::starequal; 2908 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2909 } else { 2910 Kind = tok::star; 2911 } 2912 break; 2913 case '+': 2914 Char = getCharAndSize(CurPtr, SizeTmp); 2915 if (Char == '+') { 2916 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2917 Kind = tok::plusplus; 2918 } else if (Char == '=') { 2919 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2920 Kind = tok::plusequal; 2921 } else { 2922 Kind = tok::plus; 2923 } 2924 break; 2925 case '-': 2926 Char = getCharAndSize(CurPtr, SizeTmp); 2927 if (Char == '-') { // -- 2928 CurPtr = ConsumeChar(CurPtr, 
SizeTmp, Result); 2929 Kind = tok::minusminus; 2930 } else if (Char == '>' && LangOpts.CPlusPlus && 2931 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') { // C++ ->* 2932 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 2933 SizeTmp2, Result); 2934 Kind = tok::arrowstar; 2935 } else if (Char == '>') { // -> 2936 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2937 Kind = tok::arrow; 2938 } else if (Char == '=') { // -= 2939 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2940 Kind = tok::minusequal; 2941 } else { 2942 Kind = tok::minus; 2943 } 2944 break; 2945 case '~': 2946 Kind = tok::tilde; 2947 break; 2948 case '!': 2949 if (getCharAndSize(CurPtr, SizeTmp) == '=') { 2950 Kind = tok::exclaimequal; 2951 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2952 } else { 2953 Kind = tok::exclaim; 2954 } 2955 break; 2956 case '/': 2957 // 6.4.9: Comments 2958 Char = getCharAndSize(CurPtr, SizeTmp); 2959 if (Char == '/') { // BCPL comment. 2960 // Even if BCPL comments are disabled (e.g. in C89 mode), we generally 2961 // want to lex this as a comment. There is one problem with this though, 2962 // that in one particular corner case, this can change the behavior of the 2963 // resultant program. For example, in "foo //**/ bar", C89 would lex 2964 // this as "foo / bar" and languages with BCPL comments would lex it as 2965 // "foo". Check to see if the character after the second slash is a '*'. 2966 // If so, we will lex that as a "/" instead of the start of a comment. 2967 // However, we never do this in -traditional-cpp mode. 2968 if ((LangOpts.BCPLComment || 2969 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*') && 2970 !LangOpts.TraditionalCPP) { 2971 if (SkipBCPLComment(Result, ConsumeChar(CurPtr, SizeTmp, Result))) 2972 return; // There is a token to return. 2973 2974 // It is common for the tokens immediately after a // comment to be 2975 // whitespace (indentation for the next line). Instead of going through 2976 // the big switch, handle it efficiently now. 
2977 goto SkipIgnoredUnits; 2978 } 2979 } 2980 2981 if (Char == '*') { // /**/ comment. 2982 if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result))) 2983 return; // There is a token to return. 2984 goto LexNextToken; // GCC isn't tail call eliminating. 2985 } 2986 2987 if (Char == '=') { 2988 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2989 Kind = tok::slashequal; 2990 } else { 2991 Kind = tok::slash; 2992 } 2993 break; 2994 case '%': 2995 Char = getCharAndSize(CurPtr, SizeTmp); 2996 if (Char == '=') { 2997 Kind = tok::percentequal; 2998 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2999 } else if (LangOpts.Digraphs && Char == '>') { 3000 Kind = tok::r_brace; // '%>' -> '}' 3001 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3002 } else if (LangOpts.Digraphs && Char == ':') { 3003 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3004 Char = getCharAndSize(CurPtr, SizeTmp); 3005 if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') { 3006 Kind = tok::hashhash; // '%:%:' -> '##' 3007 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3008 SizeTmp2, Result); 3009 } else if (Char == '@' && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize 3010 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3011 if (!isLexingRawMode()) 3012 Diag(BufferPtr, diag::ext_charize_microsoft); 3013 Kind = tok::hashat; 3014 } else { // '%:' -> '#' 3015 // We parsed a # character. If this occurs at the start of the line, 3016 // it's actually the start of a preprocessing directive. Callback to 3017 // the preprocessor to handle it. 3018 // FIXME: -fpreprocessed mode?? 3019 if (Result.isAtStartOfLine() && !LexingRawMode && !Is_PragmaLexer) { 3020 FormTokenWithChars(Result, CurPtr, tok::hash); 3021 PP->HandleDirective(Result); 3022 3023 // As an optimization, if the preprocessor didn't switch lexers, tail 3024 // recurse. 3025 if (PP->isCurrentLexer(this)) { 3026 // Start a new token. 
If this is a #include or something, the PP may 3027 // want us starting at the beginning of the line again. If so, set 3028 // the StartOfLine flag and clear LeadingSpace. 3029 if (IsAtStartOfLine) { 3030 Result.setFlag(Token::StartOfLine); 3031 Result.clearFlag(Token::LeadingSpace); 3032 IsAtStartOfLine = false; 3033 } 3034 goto LexNextToken; // GCC isn't tail call eliminating. 3035 } 3036 3037 return PP->Lex(Result); 3038 } 3039 3040 Kind = tok::hash; 3041 } 3042 } else { 3043 Kind = tok::percent; 3044 } 3045 break; 3046 case '<': 3047 Char = getCharAndSize(CurPtr, SizeTmp); 3048 if (ParsingFilename) { 3049 return LexAngledStringLiteral(Result, CurPtr); 3050 } else if (Char == '<') { 3051 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 3052 if (After == '=') { 3053 Kind = tok::lesslessequal; 3054 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3055 SizeTmp2, Result); 3056 } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) { 3057 // If this is actually a '<<<<<<<' version control conflict marker, 3058 // recognize it as such and recover nicely. 3059 goto LexNextToken; 3060 } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) { 3061 // If this is '<<<<' and we're in a Perforce-style conflict marker, 3062 // ignore it. 
3063 goto LexNextToken; 3064 } else if (LangOpts.CUDA && After == '<') { 3065 Kind = tok::lesslessless; 3066 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3067 SizeTmp2, Result); 3068 } else { 3069 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3070 Kind = tok::lessless; 3071 } 3072 } else if (Char == '=') { 3073 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3074 Kind = tok::lessequal; 3075 } else if (LangOpts.Digraphs && Char == ':') { // '<:' -> '[' 3076 if (LangOpts.CPlusPlus0x && 3077 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') { 3078 // C++0x [lex.pptoken]p3: 3079 // Otherwise, if the next three characters are <:: and the subsequent 3080 // character is neither : nor >, the < is treated as a preprocessor 3081 // token by itself and not as the first character of the alternative 3082 // token <:. 3083 unsigned SizeTmp3; 3084 char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3); 3085 if (After != ':' && After != '>') { 3086 Kind = tok::less; 3087 if (!isLexingRawMode()) 3088 Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon); 3089 break; 3090 } 3091 } 3092 3093 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3094 Kind = tok::l_square; 3095 } else if (LangOpts.Digraphs && Char == '%') { // '<%' -> '{' 3096 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3097 Kind = tok::l_brace; 3098 } else { 3099 Kind = tok::less; 3100 } 3101 break; 3102 case '>': 3103 Char = getCharAndSize(CurPtr, SizeTmp); 3104 if (Char == '=') { 3105 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3106 Kind = tok::greaterequal; 3107 } else if (Char == '>') { 3108 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 3109 if (After == '=') { 3110 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3111 SizeTmp2, Result); 3112 Kind = tok::greatergreaterequal; 3113 } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) { 3114 // If this is actually a '>>>>' conflict marker, recognize it as such 3115 // and recover nicely. 
3116 goto LexNextToken; 3117 } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) { 3118 // If this is '>>>>>>>' and we're in a conflict marker, ignore it. 3119 goto LexNextToken; 3120 } else if (LangOpts.CUDA && After == '>') { 3121 Kind = tok::greatergreatergreater; 3122 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3123 SizeTmp2, Result); 3124 } else { 3125 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3126 Kind = tok::greatergreater; 3127 } 3128 3129 } else { 3130 Kind = tok::greater; 3131 } 3132 break; 3133 case '^': 3134 Char = getCharAndSize(CurPtr, SizeTmp); 3135 if (Char == '=') { 3136 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3137 Kind = tok::caretequal; 3138 } else { 3139 Kind = tok::caret; 3140 } 3141 break; 3142 case '|': 3143 Char = getCharAndSize(CurPtr, SizeTmp); 3144 if (Char == '=') { 3145 Kind = tok::pipeequal; 3146 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3147 } else if (Char == '|') { 3148 // If this is '|||||||' and we're in a conflict marker, ignore it. 3149 if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1)) 3150 goto LexNextToken; 3151 Kind = tok::pipepipe; 3152 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3153 } else { 3154 Kind = tok::pipe; 3155 } 3156 break; 3157 case ':': 3158 Char = getCharAndSize(CurPtr, SizeTmp); 3159 if (LangOpts.Digraphs && Char == '>') { 3160 Kind = tok::r_square; // ':>' -> ']' 3161 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3162 } else if (LangOpts.CPlusPlus && Char == ':') { 3163 Kind = tok::coloncolon; 3164 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3165 } else { 3166 Kind = tok::colon; 3167 } 3168 break; 3169 case ';': 3170 Kind = tok::semi; 3171 break; 3172 case '=': 3173 Char = getCharAndSize(CurPtr, SizeTmp); 3174 if (Char == '=') { 3175 // If this is '====' and we're in a conflict marker, ignore it. 
3176 if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1)) 3177 goto LexNextToken; 3178 3179 Kind = tok::equalequal; 3180 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3181 } else { 3182 Kind = tok::equal; 3183 } 3184 break; 3185 case ',': 3186 Kind = tok::comma; 3187 break; 3188 case '#': 3189 Char = getCharAndSize(CurPtr, SizeTmp); 3190 if (Char == '#') { 3191 Kind = tok::hashhash; 3192 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3193 } else if (Char == '@' && LangOpts.MicrosoftExt) { // #@ -> Charize 3194 Kind = tok::hashat; 3195 if (!isLexingRawMode()) 3196 Diag(BufferPtr, diag::ext_charize_microsoft); 3197 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3198 } else { 3199 // We parsed a # character. If this occurs at the start of the line, 3200 // it's actually the start of a preprocessing directive. Callback to 3201 // the preprocessor to handle it. 3202 // FIXME: -fpreprocessed mode?? 3203 if (Result.isAtStartOfLine() && !LexingRawMode && !Is_PragmaLexer) { 3204 FormTokenWithChars(Result, CurPtr, tok::hash); 3205 PP->HandleDirective(Result); 3206 3207 // As an optimization, if the preprocessor didn't switch lexers, tail 3208 // recurse. 3209 if (PP->isCurrentLexer(this)) { 3210 // Start a new token. If this is a #include or something, the PP may 3211 // want us starting at the beginning of the line again. If so, set 3212 // the StartOfLine flag and clear LeadingSpace. 3213 if (IsAtStartOfLine) { 3214 Result.setFlag(Token::StartOfLine); 3215 Result.clearFlag(Token::LeadingSpace); 3216 IsAtStartOfLine = false; 3217 } 3218 goto LexNextToken; // GCC isn't tail call eliminating. 3219 } 3220 return PP->Lex(Result); 3221 } 3222 3223 Kind = tok::hash; 3224 } 3225 break; 3226 3227 case '@': 3228 // Objective C support. 3229 if (CurPtr[-1] == '@' && LangOpts.ObjC1) 3230 Kind = tok::at; 3231 else 3232 Kind = tok::unknown; 3233 break; 3234 3235 case '\\': 3236 // FIXME: UCN's. 3237 // FALL THROUGH. 
3238 default: 3239 Kind = tok::unknown; 3240 break; 3241 } 3242 3243 // Notify MIOpt that we read a non-whitespace/non-comment token. 3244 MIOpt.ReadToken(); 3245 3246 // Update the location of token as well as BufferPtr. 3247 FormTokenWithChars(Result, CurPtr, Kind); 3248 } 3249