1 //===--- LiteralSupport.cpp - Code to parse and process literals ----------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file implements the NumericLiteralParser, CharLiteralParser, and 11 // StringLiteralParser interfaces. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #include "clang/Lex/LiteralSupport.h" 16 #include "clang/Lex/Preprocessor.h" 17 #include "clang/Lex/LexDiagnostic.h" 18 #include "clang/Basic/TargetInfo.h" 19 #include "llvm/ADT/StringExtras.h" 20 #include "llvm/Support/ErrorHandling.h" 21 using namespace clang; 22 23 /// HexDigitValue - Return the value of the specified hex digit, or -1 if it's 24 /// not valid. 25 static int HexDigitValue(char C) { 26 if (C >= '0' && C <= '9') return C-'0'; 27 if (C >= 'a' && C <= 'f') return C-'a'+10; 28 if (C >= 'A' && C <= 'F') return C-'A'+10; 29 return -1; 30 } 31 32 static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) { 33 switch (kind) { 34 default: llvm_unreachable("Unknown token type!"); 35 case tok::char_constant: 36 case tok::string_literal: 37 case tok::utf8_string_literal: 38 return Target.getCharWidth(); 39 case tok::wide_char_constant: 40 case tok::wide_string_literal: 41 return Target.getWCharWidth(); 42 case tok::utf16_char_constant: 43 case tok::utf16_string_literal: 44 return Target.getChar16Width(); 45 case tok::utf32_char_constant: 46 case tok::utf32_string_literal: 47 return Target.getChar32Width(); 48 } 49 } 50 51 /// ProcessCharEscape - Parse a standard C escape sequence, which can occur in 52 /// either a character or a string literal. 53 static unsigned ProcessCharEscape(const char *&ThisTokBuf, 54 const char *ThisTokEnd, bool &HadError, 55 FullSourceLoc Loc, unsigned CharWidth, 56 DiagnosticsEngine *Diags) { 57 // Skip the '\' char. 58 ++ThisTokBuf; 59 60 // We know that this character can't be off the end of the buffer, because 61 // that would have been \", which would not have been the end of string. 62 unsigned ResultChar = *ThisTokBuf++; 63 switch (ResultChar) { 64 // These map to themselves. 65 case '\\': case '\'': case '"': case '?': break; 66 67 // These have fixed mappings. 68 case 'a': 69 // TODO: K&R: the meaning of '\\a' is different in traditional C 70 ResultChar = 7; 71 break; 72 case 'b': 73 ResultChar = 8; 74 break; 75 case 'e': 76 if (Diags) 77 Diags->Report(Loc, diag::ext_nonstandard_escape) << "e"; 78 ResultChar = 27; 79 break; 80 case 'E': 81 if (Diags) 82 Diags->Report(Loc, diag::ext_nonstandard_escape) << "E"; 83 ResultChar = 27; 84 break; 85 case 'f': 86 ResultChar = 12; 87 break; 88 case 'n': 89 ResultChar = 10; 90 break; 91 case 'r': 92 ResultChar = 13; 93 break; 94 case 't': 95 ResultChar = 9; 96 break; 97 case 'v': 98 ResultChar = 11; 99 break; 100 case 'x': { // Hex escape. 101 ResultChar = 0; 102 if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) { 103 if (Diags) 104 Diags->Report(Loc, diag::err_hex_escape_no_digits); 105 HadError = 1; 106 break; 107 } 108 109 // Hex escapes are a maximal series of hex digits. 110 bool Overflow = false; 111 for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) { 112 int CharVal = HexDigitValue(ThisTokBuf[0]); 113 if (CharVal == -1) break; 114 // About to shift out a digit? 115 Overflow |= (ResultChar & 0xF0000000) ? true : false; 116 ResultChar <<= 4; 117 ResultChar |= CharVal; 118 } 119 120 // See if any bits will be truncated when evaluated as a character. 121 if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) { 122 Overflow = true; 123 ResultChar &= ~0U >> (32-CharWidth); 124 } 125 126 // Check for overflow. 127 if (Overflow && Diags) // Too many digits to fit in 128 Diags->Report(Loc, diag::warn_hex_escape_too_large); 129 break; 130 } 131 case '0': case '1': case '2': case '3': 132 case '4': case '5': case '6': case '7': { 133 // Octal escapes. 134 --ThisTokBuf; 135 ResultChar = 0; 136 137 // Octal escapes are a series of octal digits with maximum length 3. 138 // "\0123" is a two digit sequence equal to "\012" "3". 139 unsigned NumDigits = 0; 140 do { 141 ResultChar <<= 3; 142 ResultChar |= *ThisTokBuf++ - '0'; 143 ++NumDigits; 144 } while (ThisTokBuf != ThisTokEnd && NumDigits < 3 && 145 ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7'); 146 147 // Check for overflow. Reject '\777', but not L'\777'. 148 if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) { 149 if (Diags) 150 Diags->Report(Loc, diag::warn_octal_escape_too_large); 151 ResultChar &= ~0U >> (32-CharWidth); 152 } 153 break; 154 } 155 156 // Otherwise, these are not valid escapes. 157 case '(': case '{': case '[': case '%': 158 // GCC accepts these as extensions. We warn about them as such though. 159 if (Diags) 160 Diags->Report(Loc, diag::ext_nonstandard_escape) 161 << std::string()+(char)ResultChar; 162 break; 163 default: 164 if (Diags == 0) 165 break; 166 167 if (isgraph(ResultChar)) 168 Diags->Report(Loc, diag::ext_unknown_escape) 169 << std::string()+(char)ResultChar; 170 else 171 Diags->Report(Loc, diag::ext_unknown_escape) 172 << "x"+llvm::utohexstr(ResultChar); 173 break; 174 } 175 176 return ResultChar; 177 } 178 179 /// ProcessUCNEscape - Read the Universal Character Name, check constraints and 180 /// return the UTF32. 181 static bool ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd, 182 uint32_t &UcnVal, unsigned short &UcnLen, 183 FullSourceLoc Loc, DiagnosticsEngine *Diags, 184 const LangOptions &Features) { 185 if (!Features.CPlusPlus && !Features.C99 && Diags) 186 Diags->Report(Loc, diag::warn_ucn_not_valid_in_c89); 187 188 // Save the beginning of the string (for error diagnostics). 189 const char *ThisTokBegin = ThisTokBuf; 190 191 // Skip the '\u' char's. 192 ThisTokBuf += 2; 193 194 if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) { 195 if (Diags) 196 Diags->Report(Loc, diag::err_ucn_escape_no_digits); 197 return false; 198 } 199 UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8); 200 unsigned short UcnLenSave = UcnLen; 201 for (; ThisTokBuf != ThisTokEnd && UcnLenSave; ++ThisTokBuf, UcnLenSave--) { 202 int CharVal = HexDigitValue(ThisTokBuf[0]); 203 if (CharVal == -1) break; 204 UcnVal <<= 4; 205 UcnVal |= CharVal; 206 } 207 // If we didn't consume the proper number of digits, there is a problem. 208 if (UcnLenSave) { 209 if (Diags) { 210 SourceLocation L = 211 Lexer::AdvanceToTokenCharacter(Loc, ThisTokBuf-ThisTokBegin, 212 Loc.getManager(), Features); 213 Diags->Report(FullSourceLoc(L, Loc.getManager()), 214 diag::err_ucn_escape_incomplete); 215 } 216 return false; 217 } 218 // Check UCN constraints (C99 6.4.3p2). 219 if ((UcnVal < 0xa0 && 220 (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60 )) // $, @, ` 221 || (UcnVal >= 0xD800 && UcnVal <= 0xDFFF) 222 || (UcnVal > 0x10FFFF)) /* the maximum legal UTF32 value */ { 223 if (Diags) 224 Diags->Report(Loc, diag::err_ucn_escape_invalid); 225 return false; 226 } 227 return true; 228 } 229 230 /// EncodeUCNEscape - Read the Universal Character Name, check constraints and 231 /// convert the UTF32 to UTF8 or UTF16. This is a subroutine of 232 /// StringLiteralParser. When we decide to implement UCN's for identifiers, 233 /// we will likely rework our support for UCN's. 234 static void EncodeUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd, 235 char *&ResultBuf, bool &HadError, 236 FullSourceLoc Loc, unsigned CharByteWidth, 237 DiagnosticsEngine *Diags, 238 const LangOptions &Features) { 239 typedef uint32_t UTF32; 240 UTF32 UcnVal = 0; 241 unsigned short UcnLen = 0; 242 if (!ProcessUCNEscape(ThisTokBuf, ThisTokEnd, UcnVal, UcnLen, Loc, Diags, 243 Features)) { 244 HadError = 1; 245 return; 246 } 247 248 assert((CharByteWidth == 1 || CharByteWidth == 2 || CharByteWidth) && 249 "only character widths of 1, 2, or 4 bytes supported"); 250 251 (void)UcnLen; 252 assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported"); 253 254 if (CharByteWidth == 4) { 255 // Note: our internal rep of wide char tokens is always little-endian. 256 *ResultBuf++ = (UcnVal & 0x000000FF); 257 *ResultBuf++ = (UcnVal & 0x0000FF00) >> 8; 258 *ResultBuf++ = (UcnVal & 0x00FF0000) >> 16; 259 *ResultBuf++ = (UcnVal & 0xFF000000) >> 24; 260 return; 261 } 262 263 if (CharByteWidth == 2) { 264 // Convert to UTF16. 265 if (UcnVal < (UTF32)0xFFFF) { 266 *ResultBuf++ = (UcnVal & 0x000000FF); 267 *ResultBuf++ = (UcnVal & 0x0000FF00) >> 8; 268 return; 269 } 270 if (Diags) Diags->Report(Loc, diag::warn_ucn_escape_too_large); 271 272 typedef uint16_t UTF16; 273 UcnVal -= 0x10000; 274 UTF16 surrogate1 = 0xD800 + (UcnVal >> 10); 275 UTF16 surrogate2 = 0xDC00 + (UcnVal & 0x3FF); 276 *ResultBuf++ = (surrogate1 & 0x000000FF); 277 *ResultBuf++ = (surrogate1 & 0x0000FF00) >> 8; 278 *ResultBuf++ = (surrogate2 & 0x000000FF); 279 *ResultBuf++ = (surrogate2 & 0x0000FF00) >> 8; 280 return; 281 } 282 283 assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters"); 284 285 // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8. 286 // The conversion below was inspired by: 287 // http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c 288 // First, we determine how many bytes the result will require. 289 typedef uint8_t UTF8; 290 291 unsigned short bytesToWrite = 0; 292 if (UcnVal < (UTF32)0x80) 293 bytesToWrite = 1; 294 else if (UcnVal < (UTF32)0x800) 295 bytesToWrite = 2; 296 else if (UcnVal < (UTF32)0x10000) 297 bytesToWrite = 3; 298 else 299 bytesToWrite = 4; 300 301 const unsigned byteMask = 0xBF; 302 const unsigned byteMark = 0x80; 303 304 // Once the bits are split out into bytes of UTF8, this is a mask OR-ed 305 // into the first byte, depending on how many bytes follow. 306 static const UTF8 firstByteMark[5] = { 307 0x00, 0x00, 0xC0, 0xE0, 0xF0 308 }; 309 // Finally, we write the bytes into ResultBuf. 310 ResultBuf += bytesToWrite; 311 switch (bytesToWrite) { // note: everything falls through. 312 case 4: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6; 313 case 3: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6; 314 case 2: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6; 315 case 1: *--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]); 316 } 317 // Update the buffer. 318 ResultBuf += bytesToWrite; 319 } 320 321 322 /// integer-constant: [C99 6.4.4.1] 323 /// decimal-constant integer-suffix 324 /// octal-constant integer-suffix 325 /// hexadecimal-constant integer-suffix 326 /// decimal-constant: 327 /// nonzero-digit 328 /// decimal-constant digit 329 /// octal-constant: 330 /// 0 331 /// octal-constant octal-digit 332 /// hexadecimal-constant: 333 /// hexadecimal-prefix hexadecimal-digit 334 /// hexadecimal-constant hexadecimal-digit 335 /// hexadecimal-prefix: one of 336 /// 0x 0X 337 /// integer-suffix: 338 /// unsigned-suffix [long-suffix] 339 /// unsigned-suffix [long-long-suffix] 340 /// long-suffix [unsigned-suffix] 341 /// long-long-suffix [unsigned-sufix] 342 /// nonzero-digit: 343 /// 1 2 3 4 5 6 7 8 9 344 /// octal-digit: 345 /// 0 1 2 3 4 5 6 7 346 /// hexadecimal-digit: 347 /// 0 1 2 3 4 5 6 7 8 9 348 /// a b c d e f 349 /// A B C D E F 350 /// unsigned-suffix: one of 351 /// u U 352 /// long-suffix: one of 353 /// l L 354 /// long-long-suffix: one of 355 /// ll LL 356 /// 357 /// floating-constant: [C99 6.4.4.2] 358 /// TODO: add rules... 359 /// 360 NumericLiteralParser:: 361 NumericLiteralParser(const char *begin, const char *end, 362 SourceLocation TokLoc, Preprocessor &pp) 363 : PP(pp), ThisTokBegin(begin), ThisTokEnd(end) { 364 365 // This routine assumes that the range begin/end matches the regex for integer 366 // and FP constants (specifically, the 'pp-number' regex), and assumes that 367 // the byte at "*end" is both valid and not part of the regex. Because of 368 // this, it doesn't have to check for 'overscan' in various places. 369 assert(!isalnum(*end) && *end != '.' && *end != '_' && 370 "Lexer didn't maximally munch?"); 371 372 s = DigitsBegin = begin; 373 saw_exponent = false; 374 saw_period = false; 375 isLong = false; 376 isUnsigned = false; 377 isLongLong = false; 378 isFloat = false; 379 isImaginary = false; 380 isMicrosoftInteger = false; 381 hadError = false; 382 383 if (*s == '0') { // parse radix 384 ParseNumberStartingWithZero(TokLoc); 385 if (hadError) 386 return; 387 } else { // the first digit is non-zero 388 radix = 10; 389 s = SkipDigits(s); 390 if (s == ThisTokEnd) { 391 // Done. 392 } else if (isxdigit(*s) && !(*s == 'e' || *s == 'E')) { 393 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin), 394 diag::err_invalid_decimal_digit) << StringRef(s, 1); 395 hadError = true; 396 return; 397 } else if (*s == '.') { 398 s++; 399 saw_period = true; 400 s = SkipDigits(s); 401 } 402 if ((*s == 'e' || *s == 'E')) { // exponent 403 const char *Exponent = s; 404 s++; 405 saw_exponent = true; 406 if (*s == '+' || *s == '-') s++; // sign 407 const char *first_non_digit = SkipDigits(s); 408 if (first_non_digit != s) { 409 s = first_non_digit; 410 } else { 411 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-begin), 412 diag::err_exponent_has_no_digits); 413 hadError = true; 414 return; 415 } 416 } 417 } 418 419 SuffixBegin = s; 420 421 // Parse the suffix. At this point we can classify whether we have an FP or 422 // integer constant. 423 bool isFPConstant = isFloatingLiteral(); 424 425 // Loop over all of the characters of the suffix. If we see something bad, 426 // we break out of the loop. 427 for (; s != ThisTokEnd; ++s) { 428 switch (*s) { 429 case 'f': // FP Suffix for "float" 430 case 'F': 431 if (!isFPConstant) break; // Error for integer constant. 432 if (isFloat || isLong) break; // FF, LF invalid. 433 isFloat = true; 434 continue; // Success. 435 case 'u': 436 case 'U': 437 if (isFPConstant) break; // Error for floating constant. 438 if (isUnsigned) break; // Cannot be repeated. 439 isUnsigned = true; 440 continue; // Success. 441 case 'l': 442 case 'L': 443 if (isLong || isLongLong) break; // Cannot be repeated. 444 if (isFloat) break; // LF invalid. 445 446 // Check for long long. The L's need to be adjacent and the same case. 447 if (s+1 != ThisTokEnd && s[1] == s[0]) { 448 if (isFPConstant) break; // long long invalid for floats. 449 isLongLong = true; 450 ++s; // Eat both of them. 451 } else { 452 isLong = true; 453 } 454 continue; // Success. 455 case 'i': 456 case 'I': 457 if (PP.getLangOptions().MicrosoftExt) { 458 if (isFPConstant || isLong || isLongLong) break; 459 460 // Allow i8, i16, i32, i64, and i128. 461 if (s + 1 != ThisTokEnd) { 462 switch (s[1]) { 463 case '8': 464 s += 2; // i8 suffix 465 isMicrosoftInteger = true; 466 break; 467 case '1': 468 if (s + 2 == ThisTokEnd) break; 469 if (s[2] == '6') { 470 s += 3; // i16 suffix 471 isMicrosoftInteger = true; 472 } 473 else if (s[2] == '2') { 474 if (s + 3 == ThisTokEnd) break; 475 if (s[3] == '8') { 476 s += 4; // i128 suffix 477 isMicrosoftInteger = true; 478 } 479 } 480 break; 481 case '3': 482 if (s + 2 == ThisTokEnd) break; 483 if (s[2] == '2') { 484 s += 3; // i32 suffix 485 isLong = true; 486 isMicrosoftInteger = true; 487 } 488 break; 489 case '6': 490 if (s + 2 == ThisTokEnd) break; 491 if (s[2] == '4') { 492 s += 3; // i64 suffix 493 isLongLong = true; 494 isMicrosoftInteger = true; 495 } 496 break; 497 default: 498 break; 499 } 500 break; 501 } 502 } 503 // fall through. 504 case 'j': 505 case 'J': 506 if (isImaginary) break; // Cannot be repeated. 507 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin), 508 diag::ext_imaginary_constant); 509 isImaginary = true; 510 continue; // Success. 511 } 512 // If we reached here, there was an error. 513 break; 514 } 515 516 // Report an error if there are any. 517 if (s != ThisTokEnd) { 518 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin), 519 isFPConstant ? diag::err_invalid_suffix_float_constant : 520 diag::err_invalid_suffix_integer_constant) 521 << StringRef(SuffixBegin, ThisTokEnd-SuffixBegin); 522 hadError = true; 523 return; 524 } 525 } 526 527 /// ParseNumberStartingWithZero - This method is called when the first character 528 /// of the number is found to be a zero. This means it is either an octal 529 /// number (like '04') or a hex number ('0x123a') a binary number ('0b1010') or 530 /// a floating point number (01239.123e4). Eat the prefix, determining the 531 /// radix etc. 532 void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) { 533 assert(s[0] == '0' && "Invalid method call"); 534 s++; 535 536 // Handle a hex number like 0x1234. 537 if ((*s == 'x' || *s == 'X') && (isxdigit(s[1]) || s[1] == '.')) { 538 s++; 539 radix = 16; 540 DigitsBegin = s; 541 s = SkipHexDigits(s); 542 if (s == ThisTokEnd) { 543 // Done. 544 } else if (*s == '.') { 545 s++; 546 saw_period = true; 547 s = SkipHexDigits(s); 548 } 549 // A binary exponent can appear with or with a '.'. If dotted, the 550 // binary exponent is required. 551 if (*s == 'p' || *s == 'P') { 552 const char *Exponent = s; 553 s++; 554 saw_exponent = true; 555 if (*s == '+' || *s == '-') s++; // sign 556 const char *first_non_digit = SkipDigits(s); 557 if (first_non_digit == s) { 558 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin), 559 diag::err_exponent_has_no_digits); 560 hadError = true; 561 return; 562 } 563 s = first_non_digit; 564 565 if (!PP.getLangOptions().HexFloats) 566 PP.Diag(TokLoc, diag::ext_hexconstant_invalid); 567 } else if (saw_period) { 568 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin), 569 diag::err_hexconstant_requires_exponent); 570 hadError = true; 571 } 572 return; 573 } 574 575 // Handle simple binary numbers 0b01010 576 if (*s == 'b' || *s == 'B') { 577 // 0b101010 is a GCC extension. 578 PP.Diag(TokLoc, diag::ext_binary_literal); 579 ++s; 580 radix = 2; 581 DigitsBegin = s; 582 s = SkipBinaryDigits(s); 583 if (s == ThisTokEnd) { 584 // Done. 585 } else if (isxdigit(*s)) { 586 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin), 587 diag::err_invalid_binary_digit) << StringRef(s, 1); 588 hadError = true; 589 } 590 // Other suffixes will be diagnosed by the caller. 591 return; 592 } 593 594 // For now, the radix is set to 8. If we discover that we have a 595 // floating point constant, the radix will change to 10. Octal floating 596 // point constants are not permitted (only decimal and hexadecimal). 597 radix = 8; 598 DigitsBegin = s; 599 s = SkipOctalDigits(s); 600 if (s == ThisTokEnd) 601 return; // Done, simple octal number like 01234 602 603 // If we have some other non-octal digit that *is* a decimal digit, see if 604 // this is part of a floating point number like 094.123 or 09e1. 605 if (isdigit(*s)) { 606 const char *EndDecimal = SkipDigits(s); 607 if (EndDecimal[0] == '.' || EndDecimal[0] == 'e' || EndDecimal[0] == 'E') { 608 s = EndDecimal; 609 radix = 10; 610 } 611 } 612 613 // If we have a hex digit other than 'e' (which denotes a FP exponent) then 614 // the code is using an incorrect base. 615 if (isxdigit(*s) && *s != 'e' && *s != 'E') { 616 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin), 617 diag::err_invalid_octal_digit) << StringRef(s, 1); 618 hadError = true; 619 return; 620 } 621 622 if (*s == '.') { 623 s++; 624 radix = 10; 625 saw_period = true; 626 s = SkipDigits(s); // Skip suffix. 627 } 628 if (*s == 'e' || *s == 'E') { // exponent 629 const char *Exponent = s; 630 s++; 631 radix = 10; 632 saw_exponent = true; 633 if (*s == '+' || *s == '-') s++; // sign 634 const char *first_non_digit = SkipDigits(s); 635 if (first_non_digit != s) { 636 s = first_non_digit; 637 } else { 638 PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin), 639 diag::err_exponent_has_no_digits); 640 hadError = true; 641 return; 642 } 643 } 644 } 645 646 647 /// GetIntegerValue - Convert this numeric literal value to an APInt that 648 /// matches Val's input width. If there is an overflow, set Val to the low bits 649 /// of the result and return true. Otherwise, return false. 650 bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) { 651 // Fast path: Compute a conservative bound on the maximum number of 652 // bits per digit in this radix. If we can't possibly overflow a 653 // uint64 based on that bound then do the simple conversion to 654 // integer. This avoids the expensive overflow checking below, and 655 // handles the common cases that matter (small decimal integers and 656 // hex/octal values which don't overflow). 657 unsigned MaxBitsPerDigit = 1; 658 while ((1U << MaxBitsPerDigit) < radix) 659 MaxBitsPerDigit += 1; 660 if ((SuffixBegin - DigitsBegin) * MaxBitsPerDigit <= 64) { 661 uint64_t N = 0; 662 for (s = DigitsBegin; s != SuffixBegin; ++s) 663 N = N*radix + HexDigitValue(*s); 664 665 // This will truncate the value to Val's input width. Simply check 666 // for overflow by comparing. 667 Val = N; 668 return Val.getZExtValue() != N; 669 } 670 671 Val = 0; 672 s = DigitsBegin; 673 674 llvm::APInt RadixVal(Val.getBitWidth(), radix); 675 llvm::APInt CharVal(Val.getBitWidth(), 0); 676 llvm::APInt OldVal = Val; 677 678 bool OverflowOccurred = false; 679 while (s < SuffixBegin) { 680 unsigned C = HexDigitValue(*s++); 681 682 // If this letter is out of bound for this radix, reject it. 683 assert(C < radix && "NumericLiteralParser ctor should have rejected this"); 684 685 CharVal = C; 686 687 // Add the digit to the value in the appropriate radix. If adding in digits 688 // made the value smaller, then this overflowed. 689 OldVal = Val; 690 691 // Multiply by radix, did overflow occur on the multiply? 692 Val *= RadixVal; 693 OverflowOccurred |= Val.udiv(RadixVal) != OldVal; 694 695 // Add value, did overflow occur on the value? 696 // (a + b) ult b <=> overflow 697 Val += CharVal; 698 OverflowOccurred |= Val.ult(CharVal); 699 } 700 return OverflowOccurred; 701 } 702 703 llvm::APFloat::opStatus 704 NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) { 705 using llvm::APFloat; 706 707 unsigned n = std::min(SuffixBegin - ThisTokBegin, ThisTokEnd - ThisTokBegin); 708 return Result.convertFromString(StringRef(ThisTokBegin, n), 709 APFloat::rmNearestTiesToEven); 710 } 711 712 713 /// character-literal: [C++0x lex.ccon] 714 /// ' c-char-sequence ' 715 /// u' c-char-sequence ' 716 /// U' c-char-sequence ' 717 /// L' c-char-sequence ' 718 /// c-char-sequence: 719 /// c-char 720 /// c-char-sequence c-char 721 /// c-char: 722 /// any member of the source character set except the single-quote ', 723 /// backslash \, or new-line character 724 /// escape-sequence 725 /// universal-character-name 726 /// escape-sequence: [C++0x lex.ccon] 727 /// simple-escape-sequence 728 /// octal-escape-sequence 729 /// hexadecimal-escape-sequence 730 /// simple-escape-sequence: 731 /// one of \' \" \? \\ \a \b \f \n \r \t \v 732 /// octal-escape-sequence: 733 /// \ octal-digit 734 /// \ octal-digit octal-digit 735 /// \ octal-digit octal-digit octal-digit 736 /// hexadecimal-escape-sequence: 737 /// \x hexadecimal-digit 738 /// hexadecimal-escape-sequence hexadecimal-digit 739 /// universal-character-name: 740 /// \u hex-quad 741 /// \U hex-quad hex-quad 742 /// hex-quad: 743 /// hex-digit hex-digit hex-digit hex-digit 744 /// 745 CharLiteralParser::CharLiteralParser(const char *begin, const char *end, 746 SourceLocation Loc, Preprocessor &PP, 747 tok::TokenKind kind) { 748 // At this point we know that the character matches the regex "L?'.*'". 749 HadError = false; 750 751 Kind = kind; 752 753 // Determine if this is a wide or UTF character. 754 if (Kind == tok::wide_char_constant || Kind == tok::utf16_char_constant || 755 Kind == tok::utf32_char_constant) { 756 ++begin; 757 } 758 759 // Skip over the entry quote. 760 assert(begin[0] == '\'' && "Invalid token lexed"); 761 ++begin; 762 763 // FIXME: The "Value" is an uint64_t so we can handle char literals of 764 // up to 64-bits. 765 // FIXME: This extensively assumes that 'char' is 8-bits. 766 assert(PP.getTargetInfo().getCharWidth() == 8 && 767 "Assumes char is 8 bits"); 768 assert(PP.getTargetInfo().getIntWidth() <= 64 && 769 (PP.getTargetInfo().getIntWidth() & 7) == 0 && 770 "Assumes sizeof(int) on target is <= 64 and a multiple of char"); 771 assert(PP.getTargetInfo().getWCharWidth() <= 64 && 772 "Assumes sizeof(wchar) on target is <= 64"); 773 774 // This is what we will use for overflow detection 775 llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0); 776 777 unsigned NumCharsSoFar = 0; 778 bool Warned = false; 779 while (begin[0] != '\'') { 780 uint64_t ResultChar; 781 782 // Is this a Universal Character Name escape? 783 if (begin[0] != '\\') // If this is a normal character, consume it. 784 ResultChar = (unsigned char)*begin++; 785 else { // Otherwise, this is an escape character. 786 unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo()); 787 // Check for UCN. 788 if (begin[1] == 'u' || begin[1] == 'U') { 789 uint32_t utf32 = 0; 790 unsigned short UcnLen = 0; 791 if (!ProcessUCNEscape(begin, end, utf32, UcnLen, 792 FullSourceLoc(Loc, PP.getSourceManager()), 793 &PP.getDiagnostics(), PP.getLangOptions())) { 794 HadError = 1; 795 } 796 ResultChar = utf32; 797 if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) { 798 PP.Diag(Loc, diag::warn_ucn_escape_too_large); 799 ResultChar &= ~0U >> (32-CharWidth); 800 } 801 } else { 802 // Otherwise, this is a non-UCN escape character. Process it. 803 ResultChar = ProcessCharEscape(begin, end, HadError, 804 FullSourceLoc(Loc,PP.getSourceManager()), 805 CharWidth, &PP.getDiagnostics()); 806 } 807 } 808 809 // If this is a multi-character constant (e.g. 'abc'), handle it. These are 810 // implementation defined (C99 6.4.4.4p10). 811 if (NumCharsSoFar) { 812 if (!isAscii()) { 813 // Emulate GCC's (unintentional?) behavior: L'ab' -> L'b'. 814 LitVal = 0; 815 } else { 816 // Narrow character literals act as though their value is concatenated 817 // in this implementation, but warn on overflow. 818 if (LitVal.countLeadingZeros() < 8 && !Warned) { 819 PP.Diag(Loc, diag::warn_char_constant_too_large); 820 Warned = true; 821 } 822 LitVal <<= 8; 823 } 824 } 825 826 LitVal = LitVal + ResultChar; 827 ++NumCharsSoFar; 828 } 829 830 // If this is the second character being processed, do special handling. 831 if (NumCharsSoFar > 1) { 832 // Warn about discarding the top bits for multi-char wide-character 833 // constants (L'abcd'). 834 if (!isAscii()) 835 PP.Diag(Loc, diag::warn_extraneous_char_constant); 836 else if (NumCharsSoFar != 4) 837 PP.Diag(Loc, diag::ext_multichar_character_literal); 838 else 839 PP.Diag(Loc, diag::ext_four_char_character_literal); 840 IsMultiChar = true; 841 } else 842 IsMultiChar = false; 843 844 // Transfer the value from APInt to uint64_t 845 Value = LitVal.getZExtValue(); 846 847 // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1") 848 // if 'char' is signed for this target (C99 6.4.4.4p10). Note that multiple 849 // character constants are not sign extended in the this implementation: 850 // '\xFF\xFF' = 65536 and '\x0\xFF' = 255, which matches GCC. 851 if (isAscii() && NumCharsSoFar == 1 && (Value & 128) && 852 PP.getLangOptions().CharIsSigned) 853 Value = (signed char)Value; 854 } 855 856 857 /// string-literal: [C++0x lex.string] 858 /// encoding-prefix " [s-char-sequence] " 859 /// encoding-prefix R raw-string 860 /// encoding-prefix: 861 /// u8 862 /// u 863 /// U 864 /// L 865 /// s-char-sequence: 866 /// s-char 867 /// s-char-sequence s-char 868 /// s-char: 869 /// any member of the source character set except the double-quote ", 870 /// backslash \, or new-line character 871 /// escape-sequence 872 /// universal-character-name 873 /// raw-string: 874 /// " d-char-sequence ( r-char-sequence ) d-char-sequence " 875 /// r-char-sequence: 876 /// r-char 877 /// r-char-sequence r-char 878 /// r-char: 879 /// any member of the source character set, except a right parenthesis ) 880 /// followed by the initial d-char-sequence (which may be empty) 881 /// followed by a double quote ". 882 /// d-char-sequence: 883 /// d-char 884 /// d-char-sequence d-char 885 /// d-char: 886 /// any member of the basic source character set except: 887 /// space, the left parenthesis (, the right parenthesis ), 888 /// the backslash \, and the control characters representing horizontal 889 /// tab, vertical tab, form feed, and newline. 890 /// escape-sequence: [C++0x lex.ccon] 891 /// simple-escape-sequence 892 /// octal-escape-sequence 893 /// hexadecimal-escape-sequence 894 /// simple-escape-sequence: 895 /// one of \' \" \? \\ \a \b \f \n \r \t \v 896 /// octal-escape-sequence: 897 /// \ octal-digit 898 /// \ octal-digit octal-digit 899 /// \ octal-digit octal-digit octal-digit 900 /// hexadecimal-escape-sequence: 901 /// \x hexadecimal-digit 902 /// hexadecimal-escape-sequence hexadecimal-digit 903 /// universal-character-name: 904 /// \u hex-quad 905 /// \U hex-quad hex-quad 906 /// hex-quad: 907 /// hex-digit hex-digit hex-digit hex-digit 908 /// 909 StringLiteralParser:: 910 StringLiteralParser(const Token *StringToks, unsigned NumStringToks, 911 Preprocessor &PP, bool Complain) 912 : SM(PP.getSourceManager()), Features(PP.getLangOptions()), 913 Target(PP.getTargetInfo()), Diags(Complain ? &PP.getDiagnostics() : 0), 914 MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown), 915 ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) { 916 init(StringToks, NumStringToks); 917 } 918 919 void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){ 920 // The literal token may have come from an invalid source location (e.g. due 921 // to a PCH error), in which case the token length will be 0. 922 if (NumStringToks == 0 || StringToks[0].getLength() < 2) { 923 hadError = true; 924 return; 925 } 926 927 // Scan all of the string portions, remember the max individual token length, 928 // computing a bound on the concatenated string length, and see whether any 929 // piece is a wide-string. If any of the string portions is a wide-string 930 // literal, the result is a wide-string literal [C99 6.4.5p4]. 931 assert(NumStringToks && "expected at least one token"); 932 MaxTokenLength = StringToks[0].getLength(); 933 assert(StringToks[0].getLength() >= 2 && "literal token is invalid!"); 934 SizeBound = StringToks[0].getLength()-2; // -2 for "". 935 Kind = StringToks[0].getKind(); 936 937 hadError = false; 938 939 // Implement Translation Phase #6: concatenation of string literals 940 /// (C99 5.1.1.2p1). The common case is only one string fragment. 941 for (unsigned i = 1; i != NumStringToks; ++i) { 942 if (StringToks[i].getLength() < 2) { 943 hadError = true; 944 return; 945 } 946 947 // The string could be shorter than this if it needs cleaning, but this is a 948 // reasonable bound, which is all we need. 949 assert(StringToks[i].getLength() >= 2 && "literal token is invalid!"); 950 SizeBound += StringToks[i].getLength()-2; // -2 for "". 951 952 // Remember maximum string piece length. 953 if (StringToks[i].getLength() > MaxTokenLength) 954 MaxTokenLength = StringToks[i].getLength(); 955 956 // Remember if we see any wide or utf-8/16/32 strings. 957 // Also check for illegal concatenations. 958 if (StringToks[i].isNot(Kind) && StringToks[i].isNot(tok::string_literal)) { 959 if (isAscii()) { 960 Kind = StringToks[i].getKind(); 961 } else { 962 if (Diags) 963 Diags->Report(FullSourceLoc(StringToks[i].getLocation(), SM), 964 diag::err_unsupported_string_concat); 965 hadError = true; 966 } 967 } 968 } 969 970 // Include space for the null terminator. 971 ++SizeBound; 972 973 // TODO: K&R warning: "traditional C rejects string constant concatenation" 974 975 // Get the width in bytes of char/wchar_t/char16_t/char32_t 976 CharByteWidth = getCharWidth(Kind, Target); 977 assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple"); 978 CharByteWidth /= 8; 979 980 // The output buffer size needs to be large enough to hold wide characters. 981 // This is a worst-case assumption which basically corresponds to L"" "long". 982 SizeBound *= CharByteWidth; 983 984 // Size the temporary buffer to hold the result string data. 985 ResultBuf.resize(SizeBound); 986 987 // Likewise, but for each string piece. 988 llvm::SmallString<512> TokenBuf; 989 TokenBuf.resize(MaxTokenLength); 990 991 // Loop over all the strings, getting their spelling, and expanding them to 992 // wide strings as appropriate. 993 ResultPtr = &ResultBuf[0]; // Next byte to fill in. 994 995 Pascal = false; 996 997 for (unsigned i = 0, e = NumStringToks; i != e; ++i) { 998 const char *ThisTokBuf = &TokenBuf[0]; 999 // Get the spelling of the token, which eliminates trigraphs, etc. We know 1000 // that ThisTokBuf points to a buffer that is big enough for the whole token 1001 // and 'spelled' tokens can only shrink. 1002 bool StringInvalid = false; 1003 unsigned ThisTokLen = 1004 Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features, 1005 &StringInvalid); 1006 if (StringInvalid) { 1007 hadError = true; 1008 continue; 1009 } 1010 1011 const char *ThisTokEnd = ThisTokBuf+ThisTokLen-1; // Skip end quote. 1012 // TODO: Input character set mapping support. 1013 1014 // Skip marker for wide or unicode strings. 1015 if (ThisTokBuf[0] == 'L' || ThisTokBuf[0] == 'u' || ThisTokBuf[0] == 'U') { 1016 ++ThisTokBuf; 1017 // Skip 8 of u8 marker for utf8 strings. 1018 if (ThisTokBuf[0] == '8') 1019 ++ThisTokBuf; 1020 } 1021 1022 // Check for raw string 1023 if (ThisTokBuf[0] == 'R') { 1024 ThisTokBuf += 2; // skip R" 1025 1026 const char *Prefix = ThisTokBuf; 1027 while (ThisTokBuf[0] != '(') 1028 ++ThisTokBuf; 1029 ++ThisTokBuf; // skip '(' 1030 1031 // remove same number of characters from the end 1032 if (ThisTokEnd >= ThisTokBuf + (ThisTokBuf - Prefix)) 1033 ThisTokEnd -= (ThisTokBuf - Prefix); 1034 1035 // Copy the string over 1036 CopyStringFragment(StringRef(ThisTokBuf, ThisTokEnd - ThisTokBuf)); 1037 } else { 1038 assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?"); 1039 ++ThisTokBuf; // skip " 1040 1041 // Check if this is a pascal string 1042 if (Features.PascalStrings && ThisTokBuf + 1 != ThisTokEnd && 1043 ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') { 1044 1045 // If the \p sequence is found in the first token, we have a pascal string 1046 // Otherwise, if we already have a pascal string, ignore the first \p 1047 if (i == 0) { 1048 ++ThisTokBuf; 1049 Pascal = true; 1050 } else if (Pascal) 1051 ThisTokBuf += 2; 1052 } 1053 1054 while (ThisTokBuf != ThisTokEnd) { 1055 // Is this a span of non-escape characters? 1056 if (ThisTokBuf[0] != '\\') { 1057 const char *InStart = ThisTokBuf; 1058 do { 1059 ++ThisTokBuf; 1060 } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\'); 1061 1062 // Copy the character span over. 1063 CopyStringFragment(StringRef(InStart, ThisTokBuf - InStart)); 1064 continue; 1065 } 1066 // Is this a Universal Character Name escape? 1067 if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') { 1068 EncodeUCNEscape(ThisTokBuf, ThisTokEnd, ResultPtr, 1069 hadError, FullSourceLoc(StringToks[i].getLocation(),SM), 1070 CharByteWidth, Diags, Features); 1071 continue; 1072 } 1073 // Otherwise, this is a non-UCN escape character. Process it. 1074 unsigned ResultChar = 1075 ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError, 1076 FullSourceLoc(StringToks[i].getLocation(), SM), 1077 CharByteWidth*8, Diags); 1078 1079 // Note: our internal rep of wide char tokens is always little-endian. 1080 *ResultPtr++ = ResultChar & 0xFF; 1081 1082 for (unsigned i = 1, e = CharByteWidth; i != e; ++i) 1083 *ResultPtr++ = ResultChar >> i*8; 1084 } 1085 } 1086 } 1087 1088 if (Pascal) { 1089 ResultBuf[0] = ResultPtr-&ResultBuf[0]-1; 1090 ResultBuf[0] /= CharByteWidth; 1091 1092 // Verify that pascal strings aren't too large. 1093 if (GetStringLength() > 256) { 1094 if (Diags) 1095 Diags->Report(FullSourceLoc(StringToks[0].getLocation(), SM), 1096 diag::err_pascal_string_too_long) 1097 << SourceRange(StringToks[0].getLocation(), 1098 StringToks[NumStringToks-1].getLocation()); 1099 hadError = true; 1100 return; 1101 } 1102 } else if (Diags) { 1103 // Complain if this string literal has too many characters. 1104 unsigned MaxChars = Features.CPlusPlus? 65536 : Features.C99 ? 4095 : 509; 1105 1106 if (GetNumStringChars() > MaxChars) 1107 Diags->Report(FullSourceLoc(StringToks[0].getLocation(), SM), 1108 diag::ext_string_too_long) 1109 << GetNumStringChars() << MaxChars 1110 << (Features.CPlusPlus ? 2 : Features.C99 ? 1 : 0) 1111 << SourceRange(StringToks[0].getLocation(), 1112 StringToks[NumStringToks-1].getLocation()); 1113 } 1114 } 1115 1116 1117 /// copyStringFragment - This function copies from Start to End into ResultPtr. 1118 /// Performs widening for multi-byte characters. 1119 void StringLiteralParser::CopyStringFragment(StringRef Fragment) { 1120 // Copy the character span over. 1121 if (CharByteWidth == 1) { 1122 memcpy(ResultPtr, Fragment.data(), Fragment.size()); 1123 ResultPtr += Fragment.size(); 1124 } else { 1125 // Note: our internal rep of wide char tokens is always little-endian. 1126 for (StringRef::iterator I=Fragment.begin(), E=Fragment.end(); I!=E; ++I) { 1127 *ResultPtr++ = *I; 1128 // Add zeros at the end. 1129 for (unsigned i = 1, e = CharByteWidth; i != e; ++i) 1130 *ResultPtr++ = 0; 1131 } 1132 } 1133 } 1134 1135 1136 /// getOffsetOfStringByte - This function returns the offset of the 1137 /// specified byte of the string data represented by Token. This handles 1138 /// advancing over escape sequences in the string. 1139 unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok, 1140 unsigned ByteNo) const { 1141 // Get the spelling of the token. 1142 llvm::SmallString<32> SpellingBuffer; 1143 SpellingBuffer.resize(Tok.getLength()); 1144 1145 bool StringInvalid = false; 1146 const char *SpellingPtr = &SpellingBuffer[0]; 1147 unsigned TokLen = Lexer::getSpelling(Tok, SpellingPtr, SM, Features, 1148 &StringInvalid); 1149 if (StringInvalid) 1150 return 0; 1151 1152 assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' && 1153 SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet"); 1154 1155 1156 const char *SpellingStart = SpellingPtr; 1157 const char *SpellingEnd = SpellingPtr+TokLen; 1158 1159 // Skip over the leading quote. 1160 assert(SpellingPtr[0] == '"' && "Should be a string literal!"); 1161 ++SpellingPtr; 1162 1163 // Skip over bytes until we find the offset we're looking for. 1164 while (ByteNo) { 1165 assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!"); 1166 1167 // Step over non-escapes simply. 1168 if (*SpellingPtr != '\\') { 1169 ++SpellingPtr; 1170 --ByteNo; 1171 continue; 1172 } 1173 1174 // Otherwise, this is an escape character. Advance over it. 1175 bool HadError = false; 1176 ProcessCharEscape(SpellingPtr, SpellingEnd, HadError, 1177 FullSourceLoc(Tok.getLocation(), SM), 1178 CharByteWidth*8, Diags); 1179 assert(!HadError && "This method isn't valid on erroneous strings"); 1180 --ByteNo; 1181 } 1182 1183 return SpellingPtr-SpellingStart; 1184 } 1185