Home | History | Annotate | Download | only in Lex
      1 //===--- LiteralSupport.cpp - Code to parse and process literals ----------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 // This file implements the NumericLiteralParser, CharLiteralParser, and
     11 // StringLiteralParser interfaces.
     12 //
     13 //===----------------------------------------------------------------------===//
     14 
     15 #include "clang/Lex/LiteralSupport.h"
     16 #include "clang/Lex/Preprocessor.h"
     17 #include "clang/Lex/LexDiagnostic.h"
     18 #include "clang/Basic/TargetInfo.h"
     19 #include "llvm/ADT/StringExtras.h"
     20 #include "llvm/Support/ErrorHandling.h"
     21 using namespace clang;
     22 
     23 /// HexDigitValue - Return the value of the specified hex digit, or -1 if it's
     24 /// not valid.
     25 static int HexDigitValue(char C) {
     26   if (C >= '0' && C <= '9') return C-'0';
     27   if (C >= 'a' && C <= 'f') return C-'a'+10;
     28   if (C >= 'A' && C <= 'F') return C-'A'+10;
     29   return -1;
     30 }
     31 
     32 static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {
     33   switch (kind) {
     34   default: llvm_unreachable("Unknown token type!");
     35   case tok::char_constant:
     36   case tok::string_literal:
     37   case tok::utf8_string_literal:
     38     return Target.getCharWidth();
     39   case tok::wide_char_constant:
     40   case tok::wide_string_literal:
     41     return Target.getWCharWidth();
     42   case tok::utf16_char_constant:
     43   case tok::utf16_string_literal:
     44     return Target.getChar16Width();
     45   case tok::utf32_char_constant:
     46   case tok::utf32_string_literal:
     47     return Target.getChar32Width();
     48   }
     49 }
     50 
     51 /// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
     52 /// either a character or a string literal.
     53 static unsigned ProcessCharEscape(const char *&ThisTokBuf,
     54                                   const char *ThisTokEnd, bool &HadError,
     55                                   FullSourceLoc Loc, unsigned CharWidth,
     56                                   DiagnosticsEngine *Diags) {
     57   // Skip the '\' char.
     58   ++ThisTokBuf;
     59 
     60   // We know that this character can't be off the end of the buffer, because
     61   // that would have been \", which would not have been the end of string.
     62   unsigned ResultChar = *ThisTokBuf++;
     63   switch (ResultChar) {
     64   // These map to themselves.
     65   case '\\': case '\'': case '"': case '?': break;
     66 
     67     // These have fixed mappings.
     68   case 'a':
     69     // TODO: K&R: the meaning of '\\a' is different in traditional C
     70     ResultChar = 7;
     71     break;
     72   case 'b':
     73     ResultChar = 8;
     74     break;
     75   case 'e':
     76     if (Diags)
     77       Diags->Report(Loc, diag::ext_nonstandard_escape) << "e";
     78     ResultChar = 27;
     79     break;
     80   case 'E':
     81     if (Diags)
     82       Diags->Report(Loc, diag::ext_nonstandard_escape) << "E";
     83     ResultChar = 27;
     84     break;
     85   case 'f':
     86     ResultChar = 12;
     87     break;
     88   case 'n':
     89     ResultChar = 10;
     90     break;
     91   case 'r':
     92     ResultChar = 13;
     93     break;
     94   case 't':
     95     ResultChar = 9;
     96     break;
     97   case 'v':
     98     ResultChar = 11;
     99     break;
    100   case 'x': { // Hex escape.
    101     ResultChar = 0;
    102     if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {
    103       if (Diags)
    104         Diags->Report(Loc, diag::err_hex_escape_no_digits);
    105       HadError = 1;
    106       break;
    107     }
    108 
    109     // Hex escapes are a maximal series of hex digits.
    110     bool Overflow = false;
    111     for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) {
    112       int CharVal = HexDigitValue(ThisTokBuf[0]);
    113       if (CharVal == -1) break;
    114       // About to shift out a digit?
    115       Overflow |= (ResultChar & 0xF0000000) ? true : false;
    116       ResultChar <<= 4;
    117       ResultChar |= CharVal;
    118     }
    119 
    120     // See if any bits will be truncated when evaluated as a character.
    121     if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
    122       Overflow = true;
    123       ResultChar &= ~0U >> (32-CharWidth);
    124     }
    125 
    126     // Check for overflow.
    127     if (Overflow && Diags)   // Too many digits to fit in
    128       Diags->Report(Loc, diag::warn_hex_escape_too_large);
    129     break;
    130   }
    131   case '0': case '1': case '2': case '3':
    132   case '4': case '5': case '6': case '7': {
    133     // Octal escapes.
    134     --ThisTokBuf;
    135     ResultChar = 0;
    136 
    137     // Octal escapes are a series of octal digits with maximum length 3.
    138     // "\0123" is a two digit sequence equal to "\012" "3".
    139     unsigned NumDigits = 0;
    140     do {
    141       ResultChar <<= 3;
    142       ResultChar |= *ThisTokBuf++ - '0';
    143       ++NumDigits;
    144     } while (ThisTokBuf != ThisTokEnd && NumDigits < 3 &&
    145              ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7');
    146 
    147     // Check for overflow.  Reject '\777', but not L'\777'.
    148     if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
    149       if (Diags)
    150         Diags->Report(Loc, diag::warn_octal_escape_too_large);
    151       ResultChar &= ~0U >> (32-CharWidth);
    152     }
    153     break;
    154   }
    155 
    156     // Otherwise, these are not valid escapes.
    157   case '(': case '{': case '[': case '%':
    158     // GCC accepts these as extensions.  We warn about them as such though.
    159     if (Diags)
    160       Diags->Report(Loc, diag::ext_nonstandard_escape)
    161         << std::string()+(char)ResultChar;
    162     break;
    163   default:
    164     if (Diags == 0)
    165       break;
    166 
    167     if (isgraph(ResultChar))
    168       Diags->Report(Loc, diag::ext_unknown_escape)
    169         << std::string()+(char)ResultChar;
    170     else
    171       Diags->Report(Loc, diag::ext_unknown_escape)
    172         << "x"+llvm::utohexstr(ResultChar);
    173     break;
    174   }
    175 
    176   return ResultChar;
    177 }
    178 
    179 /// ProcessUCNEscape - Read the Universal Character Name, check constraints and
    180 /// return the UTF32.
    181 static bool ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
    182                              uint32_t &UcnVal, unsigned short &UcnLen,
    183                              FullSourceLoc Loc, DiagnosticsEngine *Diags,
    184                              const LangOptions &Features) {
    185   if (!Features.CPlusPlus && !Features.C99 && Diags)
    186     Diags->Report(Loc, diag::warn_ucn_not_valid_in_c89);
    187 
    188   // Save the beginning of the string (for error diagnostics).
    189   const char *ThisTokBegin = ThisTokBuf;
    190 
    191   // Skip the '\u' char's.
    192   ThisTokBuf += 2;
    193 
    194   if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {
    195     if (Diags)
    196       Diags->Report(Loc, diag::err_ucn_escape_no_digits);
    197     return false;
    198   }
    199   UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
    200   unsigned short UcnLenSave = UcnLen;
    201   for (; ThisTokBuf != ThisTokEnd && UcnLenSave; ++ThisTokBuf, UcnLenSave--) {
    202     int CharVal = HexDigitValue(ThisTokBuf[0]);
    203     if (CharVal == -1) break;
    204     UcnVal <<= 4;
    205     UcnVal |= CharVal;
    206   }
    207   // If we didn't consume the proper number of digits, there is a problem.
    208   if (UcnLenSave) {
    209     if (Diags) {
    210       SourceLocation L =
    211         Lexer::AdvanceToTokenCharacter(Loc, ThisTokBuf-ThisTokBegin,
    212                                        Loc.getManager(), Features);
    213       Diags->Report(FullSourceLoc(L, Loc.getManager()),
    214                     diag::err_ucn_escape_incomplete);
    215     }
    216     return false;
    217   }
    218   // Check UCN constraints (C99 6.4.3p2).
    219   if ((UcnVal < 0xa0 &&
    220       (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60 )) // $, @, `
    221       || (UcnVal >= 0xD800 && UcnVal <= 0xDFFF)
    222       || (UcnVal > 0x10FFFF)) /* the maximum legal UTF32 value */ {
    223     if (Diags)
    224       Diags->Report(Loc, diag::err_ucn_escape_invalid);
    225     return false;
    226   }
    227   return true;
    228 }
    229 
    230 /// EncodeUCNEscape - Read the Universal Character Name, check constraints and
    231 /// convert the UTF32 to UTF8 or UTF16. This is a subroutine of
    232 /// StringLiteralParser. When we decide to implement UCN's for identifiers,
    233 /// we will likely rework our support for UCN's.
    234 static void EncodeUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
    235                             char *&ResultBuf, bool &HadError,
    236                             FullSourceLoc Loc, unsigned CharByteWidth,
    237                             DiagnosticsEngine *Diags,
    238                             const LangOptions &Features) {
    239   typedef uint32_t UTF32;
    240   UTF32 UcnVal = 0;
    241   unsigned short UcnLen = 0;
    242   if (!ProcessUCNEscape(ThisTokBuf, ThisTokEnd, UcnVal, UcnLen, Loc, Diags,
    243                         Features)) {
    244     HadError = 1;
    245     return;
    246   }
    247 
    248   assert((CharByteWidth == 1 || CharByteWidth == 2 || CharByteWidth) &&
    249          "only character widths of 1, 2, or 4 bytes supported");
    250 
    251   (void)UcnLen;
    252   assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported");
    253 
    254   if (CharByteWidth == 4) {
    255     // Note: our internal rep of wide char tokens is always little-endian.
    256     *ResultBuf++ = (UcnVal & 0x000000FF);
    257     *ResultBuf++ = (UcnVal & 0x0000FF00) >> 8;
    258     *ResultBuf++ = (UcnVal & 0x00FF0000) >> 16;
    259     *ResultBuf++ = (UcnVal & 0xFF000000) >> 24;
    260     return;
    261   }
    262 
    263   if (CharByteWidth == 2) {
    264     // Convert to UTF16.
    265     if (UcnVal < (UTF32)0xFFFF) {
    266       *ResultBuf++ = (UcnVal & 0x000000FF);
    267       *ResultBuf++ = (UcnVal & 0x0000FF00) >> 8;
    268       return;
    269     }
    270     if (Diags) Diags->Report(Loc, diag::warn_ucn_escape_too_large);
    271 
    272     typedef uint16_t UTF16;
    273     UcnVal -= 0x10000;
    274     UTF16 surrogate1 = 0xD800 + (UcnVal >> 10);
    275     UTF16 surrogate2 = 0xDC00 + (UcnVal & 0x3FF);
    276     *ResultBuf++ = (surrogate1 & 0x000000FF);
    277     *ResultBuf++ = (surrogate1 & 0x0000FF00) >> 8;
    278     *ResultBuf++ = (surrogate2 & 0x000000FF);
    279     *ResultBuf++ = (surrogate2 & 0x0000FF00) >> 8;
    280     return;
    281   }
    282 
    283   assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters");
    284 
    285   // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
    286   // The conversion below was inspired by:
    287   //   http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
    288   // First, we determine how many bytes the result will require.
    289   typedef uint8_t UTF8;
    290 
    291   unsigned short bytesToWrite = 0;
    292   if (UcnVal < (UTF32)0x80)
    293     bytesToWrite = 1;
    294   else if (UcnVal < (UTF32)0x800)
    295     bytesToWrite = 2;
    296   else if (UcnVal < (UTF32)0x10000)
    297     bytesToWrite = 3;
    298   else
    299     bytesToWrite = 4;
    300 
    301   const unsigned byteMask = 0xBF;
    302   const unsigned byteMark = 0x80;
    303 
    304   // Once the bits are split out into bytes of UTF8, this is a mask OR-ed
    305   // into the first byte, depending on how many bytes follow.
    306   static const UTF8 firstByteMark[5] = {
    307     0x00, 0x00, 0xC0, 0xE0, 0xF0
    308   };
    309   // Finally, we write the bytes into ResultBuf.
    310   ResultBuf += bytesToWrite;
    311   switch (bytesToWrite) { // note: everything falls through.
    312     case 4: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
    313     case 3: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
    314     case 2: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
    315     case 1: *--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]);
    316   }
    317   // Update the buffer.
    318   ResultBuf += bytesToWrite;
    319 }
    320 
    321 
    322 ///       integer-constant: [C99 6.4.4.1]
    323 ///         decimal-constant integer-suffix
    324 ///         octal-constant integer-suffix
    325 ///         hexadecimal-constant integer-suffix
    326 ///       decimal-constant:
    327 ///         nonzero-digit
    328 ///         decimal-constant digit
    329 ///       octal-constant:
    330 ///         0
    331 ///         octal-constant octal-digit
    332 ///       hexadecimal-constant:
    333 ///         hexadecimal-prefix hexadecimal-digit
    334 ///         hexadecimal-constant hexadecimal-digit
    335 ///       hexadecimal-prefix: one of
    336 ///         0x 0X
    337 ///       integer-suffix:
    338 ///         unsigned-suffix [long-suffix]
    339 ///         unsigned-suffix [long-long-suffix]
    340 ///         long-suffix [unsigned-suffix]
    341 ///         long-long-suffix [unsigned-sufix]
    342 ///       nonzero-digit:
    343 ///         1 2 3 4 5 6 7 8 9
    344 ///       octal-digit:
    345 ///         0 1 2 3 4 5 6 7
    346 ///       hexadecimal-digit:
    347 ///         0 1 2 3 4 5 6 7 8 9
    348 ///         a b c d e f
    349 ///         A B C D E F
    350 ///       unsigned-suffix: one of
    351 ///         u U
    352 ///       long-suffix: one of
    353 ///         l L
    354 ///       long-long-suffix: one of
    355 ///         ll LL
    356 ///
    357 ///       floating-constant: [C99 6.4.4.2]
    358 ///         TODO: add rules...
    359 ///
    360 NumericLiteralParser::
    361 NumericLiteralParser(const char *begin, const char *end,
    362                      SourceLocation TokLoc, Preprocessor &pp)
    363   : PP(pp), ThisTokBegin(begin), ThisTokEnd(end) {
    364 
    365   // This routine assumes that the range begin/end matches the regex for integer
    366   // and FP constants (specifically, the 'pp-number' regex), and assumes that
    367   // the byte at "*end" is both valid and not part of the regex.  Because of
    368   // this, it doesn't have to check for 'overscan' in various places.
    369   assert(!isalnum(*end) && *end != '.' && *end != '_' &&
    370          "Lexer didn't maximally munch?");
    371 
    372   s = DigitsBegin = begin;
    373   saw_exponent = false;
    374   saw_period = false;
    375   isLong = false;
    376   isUnsigned = false;
    377   isLongLong = false;
    378   isFloat = false;
    379   isImaginary = false;
    380   isMicrosoftInteger = false;
    381   hadError = false;
    382 
    383   if (*s == '0') { // parse radix
    384     ParseNumberStartingWithZero(TokLoc);
    385     if (hadError)
    386       return;
    387   } else { // the first digit is non-zero
    388     radix = 10;
    389     s = SkipDigits(s);
    390     if (s == ThisTokEnd) {
    391       // Done.
    392     } else if (isxdigit(*s) && !(*s == 'e' || *s == 'E')) {
    393       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin),
    394               diag::err_invalid_decimal_digit) << StringRef(s, 1);
    395       hadError = true;
    396       return;
    397     } else if (*s == '.') {
    398       s++;
    399       saw_period = true;
    400       s = SkipDigits(s);
    401     }
    402     if ((*s == 'e' || *s == 'E')) { // exponent
    403       const char *Exponent = s;
    404       s++;
    405       saw_exponent = true;
    406       if (*s == '+' || *s == '-')  s++; // sign
    407       const char *first_non_digit = SkipDigits(s);
    408       if (first_non_digit != s) {
    409         s = first_non_digit;
    410       } else {
    411         PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-begin),
    412                 diag::err_exponent_has_no_digits);
    413         hadError = true;
    414         return;
    415       }
    416     }
    417   }
    418 
    419   SuffixBegin = s;
    420 
    421   // Parse the suffix.  At this point we can classify whether we have an FP or
    422   // integer constant.
    423   bool isFPConstant = isFloatingLiteral();
    424 
    425   // Loop over all of the characters of the suffix.  If we see something bad,
    426   // we break out of the loop.
    427   for (; s != ThisTokEnd; ++s) {
    428     switch (*s) {
    429     case 'f':      // FP Suffix for "float"
    430     case 'F':
    431       if (!isFPConstant) break;  // Error for integer constant.
    432       if (isFloat || isLong) break; // FF, LF invalid.
    433       isFloat = true;
    434       continue;  // Success.
    435     case 'u':
    436     case 'U':
    437       if (isFPConstant) break;  // Error for floating constant.
    438       if (isUnsigned) break;    // Cannot be repeated.
    439       isUnsigned = true;
    440       continue;  // Success.
    441     case 'l':
    442     case 'L':
    443       if (isLong || isLongLong) break;  // Cannot be repeated.
    444       if (isFloat) break;               // LF invalid.
    445 
    446       // Check for long long.  The L's need to be adjacent and the same case.
    447       if (s+1 != ThisTokEnd && s[1] == s[0]) {
    448         if (isFPConstant) break;        // long long invalid for floats.
    449         isLongLong = true;
    450         ++s;  // Eat both of them.
    451       } else {
    452         isLong = true;
    453       }
    454       continue;  // Success.
    455     case 'i':
    456     case 'I':
    457       if (PP.getLangOptions().MicrosoftExt) {
    458         if (isFPConstant || isLong || isLongLong) break;
    459 
    460         // Allow i8, i16, i32, i64, and i128.
    461         if (s + 1 != ThisTokEnd) {
    462           switch (s[1]) {
    463             case '8':
    464               s += 2; // i8 suffix
    465               isMicrosoftInteger = true;
    466               break;
    467             case '1':
    468               if (s + 2 == ThisTokEnd) break;
    469               if (s[2] == '6') {
    470                 s += 3; // i16 suffix
    471                 isMicrosoftInteger = true;
    472               }
    473               else if (s[2] == '2') {
    474                 if (s + 3 == ThisTokEnd) break;
    475                 if (s[3] == '8') {
    476                   s += 4; // i128 suffix
    477                   isMicrosoftInteger = true;
    478                 }
    479               }
    480               break;
    481             case '3':
    482               if (s + 2 == ThisTokEnd) break;
    483               if (s[2] == '2') {
    484                 s += 3; // i32 suffix
    485                 isLong = true;
    486                 isMicrosoftInteger = true;
    487               }
    488               break;
    489             case '6':
    490               if (s + 2 == ThisTokEnd) break;
    491               if (s[2] == '4') {
    492                 s += 3; // i64 suffix
    493                 isLongLong = true;
    494                 isMicrosoftInteger = true;
    495               }
    496               break;
    497             default:
    498               break;
    499           }
    500           break;
    501         }
    502       }
    503       // fall through.
    504     case 'j':
    505     case 'J':
    506       if (isImaginary) break;   // Cannot be repeated.
    507       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin),
    508               diag::ext_imaginary_constant);
    509       isImaginary = true;
    510       continue;  // Success.
    511     }
    512     // If we reached here, there was an error.
    513     break;
    514   }
    515 
    516   // Report an error if there are any.
    517   if (s != ThisTokEnd) {
    518     PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin),
    519             isFPConstant ? diag::err_invalid_suffix_float_constant :
    520                            diag::err_invalid_suffix_integer_constant)
    521       << StringRef(SuffixBegin, ThisTokEnd-SuffixBegin);
    522     hadError = true;
    523     return;
    524   }
    525 }
    526 
    527 /// ParseNumberStartingWithZero - This method is called when the first character
    528 /// of the number is found to be a zero.  This means it is either an octal
    529 /// number (like '04') or a hex number ('0x123a') a binary number ('0b1010') or
    530 /// a floating point number (01239.123e4).  Eat the prefix, determining the
    531 /// radix etc.
    532 void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
    533   assert(s[0] == '0' && "Invalid method call");
    534   s++;
    535 
    536   // Handle a hex number like 0x1234.
    537   if ((*s == 'x' || *s == 'X') && (isxdigit(s[1]) || s[1] == '.')) {
    538     s++;
    539     radix = 16;
    540     DigitsBegin = s;
    541     s = SkipHexDigits(s);
    542     if (s == ThisTokEnd) {
    543       // Done.
    544     } else if (*s == '.') {
    545       s++;
    546       saw_period = true;
    547       s = SkipHexDigits(s);
    548     }
    549     // A binary exponent can appear with or with a '.'. If dotted, the
    550     // binary exponent is required.
    551     if (*s == 'p' || *s == 'P') {
    552       const char *Exponent = s;
    553       s++;
    554       saw_exponent = true;
    555       if (*s == '+' || *s == '-')  s++; // sign
    556       const char *first_non_digit = SkipDigits(s);
    557       if (first_non_digit == s) {
    558         PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
    559                 diag::err_exponent_has_no_digits);
    560         hadError = true;
    561         return;
    562       }
    563       s = first_non_digit;
    564 
    565       if (!PP.getLangOptions().HexFloats)
    566         PP.Diag(TokLoc, diag::ext_hexconstant_invalid);
    567     } else if (saw_period) {
    568       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
    569               diag::err_hexconstant_requires_exponent);
    570       hadError = true;
    571     }
    572     return;
    573   }
    574 
    575   // Handle simple binary numbers 0b01010
    576   if (*s == 'b' || *s == 'B') {
    577     // 0b101010 is a GCC extension.
    578     PP.Diag(TokLoc, diag::ext_binary_literal);
    579     ++s;
    580     radix = 2;
    581     DigitsBegin = s;
    582     s = SkipBinaryDigits(s);
    583     if (s == ThisTokEnd) {
    584       // Done.
    585     } else if (isxdigit(*s)) {
    586       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
    587               diag::err_invalid_binary_digit) << StringRef(s, 1);
    588       hadError = true;
    589     }
    590     // Other suffixes will be diagnosed by the caller.
    591     return;
    592   }
    593 
    594   // For now, the radix is set to 8. If we discover that we have a
    595   // floating point constant, the radix will change to 10. Octal floating
    596   // point constants are not permitted (only decimal and hexadecimal).
    597   radix = 8;
    598   DigitsBegin = s;
    599   s = SkipOctalDigits(s);
    600   if (s == ThisTokEnd)
    601     return; // Done, simple octal number like 01234
    602 
    603   // If we have some other non-octal digit that *is* a decimal digit, see if
    604   // this is part of a floating point number like 094.123 or 09e1.
    605   if (isdigit(*s)) {
    606     const char *EndDecimal = SkipDigits(s);
    607     if (EndDecimal[0] == '.' || EndDecimal[0] == 'e' || EndDecimal[0] == 'E') {
    608       s = EndDecimal;
    609       radix = 10;
    610     }
    611   }
    612 
    613   // If we have a hex digit other than 'e' (which denotes a FP exponent) then
    614   // the code is using an incorrect base.
    615   if (isxdigit(*s) && *s != 'e' && *s != 'E') {
    616     PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
    617             diag::err_invalid_octal_digit) << StringRef(s, 1);
    618     hadError = true;
    619     return;
    620   }
    621 
    622   if (*s == '.') {
    623     s++;
    624     radix = 10;
    625     saw_period = true;
    626     s = SkipDigits(s); // Skip suffix.
    627   }
    628   if (*s == 'e' || *s == 'E') { // exponent
    629     const char *Exponent = s;
    630     s++;
    631     radix = 10;
    632     saw_exponent = true;
    633     if (*s == '+' || *s == '-')  s++; // sign
    634     const char *first_non_digit = SkipDigits(s);
    635     if (first_non_digit != s) {
    636       s = first_non_digit;
    637     } else {
    638       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
    639               diag::err_exponent_has_no_digits);
    640       hadError = true;
    641       return;
    642     }
    643   }
    644 }
    645 
    646 
    647 /// GetIntegerValue - Convert this numeric literal value to an APInt that
    648 /// matches Val's input width.  If there is an overflow, set Val to the low bits
    649 /// of the result and return true.  Otherwise, return false.
    650 bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) {
    651   // Fast path: Compute a conservative bound on the maximum number of
    652   // bits per digit in this radix. If we can't possibly overflow a
    653   // uint64 based on that bound then do the simple conversion to
    654   // integer. This avoids the expensive overflow checking below, and
    655   // handles the common cases that matter (small decimal integers and
    656   // hex/octal values which don't overflow).
    657   unsigned MaxBitsPerDigit = 1;
    658   while ((1U << MaxBitsPerDigit) < radix)
    659     MaxBitsPerDigit += 1;
    660   if ((SuffixBegin - DigitsBegin) * MaxBitsPerDigit <= 64) {
    661     uint64_t N = 0;
    662     for (s = DigitsBegin; s != SuffixBegin; ++s)
    663       N = N*radix + HexDigitValue(*s);
    664 
    665     // This will truncate the value to Val's input width. Simply check
    666     // for overflow by comparing.
    667     Val = N;
    668     return Val.getZExtValue() != N;
    669   }
    670 
    671   Val = 0;
    672   s = DigitsBegin;
    673 
    674   llvm::APInt RadixVal(Val.getBitWidth(), radix);
    675   llvm::APInt CharVal(Val.getBitWidth(), 0);
    676   llvm::APInt OldVal = Val;
    677 
    678   bool OverflowOccurred = false;
    679   while (s < SuffixBegin) {
    680     unsigned C = HexDigitValue(*s++);
    681 
    682     // If this letter is out of bound for this radix, reject it.
    683     assert(C < radix && "NumericLiteralParser ctor should have rejected this");
    684 
    685     CharVal = C;
    686 
    687     // Add the digit to the value in the appropriate radix.  If adding in digits
    688     // made the value smaller, then this overflowed.
    689     OldVal = Val;
    690 
    691     // Multiply by radix, did overflow occur on the multiply?
    692     Val *= RadixVal;
    693     OverflowOccurred |= Val.udiv(RadixVal) != OldVal;
    694 
    695     // Add value, did overflow occur on the value?
    696     //   (a + b) ult b  <=> overflow
    697     Val += CharVal;
    698     OverflowOccurred |= Val.ult(CharVal);
    699   }
    700   return OverflowOccurred;
    701 }
    702 
    703 llvm::APFloat::opStatus
    704 NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) {
    705   using llvm::APFloat;
    706 
    707   unsigned n = std::min(SuffixBegin - ThisTokBegin, ThisTokEnd - ThisTokBegin);
    708   return Result.convertFromString(StringRef(ThisTokBegin, n),
    709                                   APFloat::rmNearestTiesToEven);
    710 }
    711 
    712 
    713 ///       character-literal: [C++0x lex.ccon]
    714 ///         ' c-char-sequence '
    715 ///         u' c-char-sequence '
    716 ///         U' c-char-sequence '
    717 ///         L' c-char-sequence '
    718 ///       c-char-sequence:
    719 ///         c-char
    720 ///         c-char-sequence c-char
    721 ///       c-char:
    722 ///         any member of the source character set except the single-quote ',
    723 ///           backslash \, or new-line character
    724 ///         escape-sequence
    725 ///         universal-character-name
    726 ///       escape-sequence: [C++0x lex.ccon]
    727 ///         simple-escape-sequence
    728 ///         octal-escape-sequence
    729 ///         hexadecimal-escape-sequence
    730 ///       simple-escape-sequence:
    731 ///         one of \' \" \? \\ \a \b \f \n \r \t \v
    732 ///       octal-escape-sequence:
    733 ///         \ octal-digit
    734 ///         \ octal-digit octal-digit
    735 ///         \ octal-digit octal-digit octal-digit
    736 ///       hexadecimal-escape-sequence:
    737 ///         \x hexadecimal-digit
    738 ///         hexadecimal-escape-sequence hexadecimal-digit
    739 ///       universal-character-name:
    740 ///         \u hex-quad
    741 ///         \U hex-quad hex-quad
    742 ///       hex-quad:
    743 ///         hex-digit hex-digit hex-digit hex-digit
    744 ///
    745 CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
    746                                      SourceLocation Loc, Preprocessor &PP,
    747                                      tok::TokenKind kind) {
    748   // At this point we know that the character matches the regex "L?'.*'".
    749   HadError = false;
    750 
    751   Kind = kind;
    752 
    753   // Determine if this is a wide or UTF character.
    754   if (Kind == tok::wide_char_constant || Kind == tok::utf16_char_constant ||
    755       Kind == tok::utf32_char_constant) {
    756     ++begin;
    757   }
    758 
    759   // Skip over the entry quote.
    760   assert(begin[0] == '\'' && "Invalid token lexed");
    761   ++begin;
    762 
    763   // FIXME: The "Value" is an uint64_t so we can handle char literals of
    764   // up to 64-bits.
    765   // FIXME: This extensively assumes that 'char' is 8-bits.
    766   assert(PP.getTargetInfo().getCharWidth() == 8 &&
    767          "Assumes char is 8 bits");
    768   assert(PP.getTargetInfo().getIntWidth() <= 64 &&
    769          (PP.getTargetInfo().getIntWidth() & 7) == 0 &&
    770          "Assumes sizeof(int) on target is <= 64 and a multiple of char");
    771   assert(PP.getTargetInfo().getWCharWidth() <= 64 &&
    772          "Assumes sizeof(wchar) on target is <= 64");
    773 
    774   // This is what we will use for overflow detection
    775   llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);
    776 
    777   unsigned NumCharsSoFar = 0;
    778   bool Warned = false;
    779   while (begin[0] != '\'') {
    780     uint64_t ResultChar;
    781 
    782       // Is this a Universal Character Name escape?
    783     if (begin[0] != '\\')     // If this is a normal character, consume it.
    784       ResultChar = (unsigned char)*begin++;
    785     else {                    // Otherwise, this is an escape character.
    786       unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
    787       // Check for UCN.
    788       if (begin[1] == 'u' || begin[1] == 'U') {
    789         uint32_t utf32 = 0;
    790         unsigned short UcnLen = 0;
    791         if (!ProcessUCNEscape(begin, end, utf32, UcnLen,
    792                               FullSourceLoc(Loc, PP.getSourceManager()),
    793                               &PP.getDiagnostics(), PP.getLangOptions())) {
    794           HadError = 1;
    795         }
    796         ResultChar = utf32;
    797         if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
    798           PP.Diag(Loc, diag::warn_ucn_escape_too_large);
    799           ResultChar &= ~0U >> (32-CharWidth);
    800         }
    801       } else {
    802         // Otherwise, this is a non-UCN escape character.  Process it.
    803         ResultChar = ProcessCharEscape(begin, end, HadError,
    804                                        FullSourceLoc(Loc,PP.getSourceManager()),
    805                                        CharWidth, &PP.getDiagnostics());
    806       }
    807     }
    808 
    809     // If this is a multi-character constant (e.g. 'abc'), handle it.  These are
    810     // implementation defined (C99 6.4.4.4p10).
    811     if (NumCharsSoFar) {
    812       if (!isAscii()) {
    813         // Emulate GCC's (unintentional?) behavior: L'ab' -> L'b'.
    814         LitVal = 0;
    815       } else {
    816         // Narrow character literals act as though their value is concatenated
    817         // in this implementation, but warn on overflow.
    818         if (LitVal.countLeadingZeros() < 8 && !Warned) {
    819           PP.Diag(Loc, diag::warn_char_constant_too_large);
    820           Warned = true;
    821         }
    822         LitVal <<= 8;
    823       }
    824     }
    825 
    826     LitVal = LitVal + ResultChar;
    827     ++NumCharsSoFar;
    828   }
    829 
    830   // If this is the second character being processed, do special handling.
    831   if (NumCharsSoFar > 1) {
    832     // Warn about discarding the top bits for multi-char wide-character
    833     // constants (L'abcd').
    834     if (!isAscii())
    835       PP.Diag(Loc, diag::warn_extraneous_char_constant);
    836     else if (NumCharsSoFar != 4)
    837       PP.Diag(Loc, diag::ext_multichar_character_literal);
    838     else
    839       PP.Diag(Loc, diag::ext_four_char_character_literal);
    840     IsMultiChar = true;
    841   } else
    842     IsMultiChar = false;
    843 
    844   // Transfer the value from APInt to uint64_t
    845   Value = LitVal.getZExtValue();
    846 
    847   // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
    848   // if 'char' is signed for this target (C99 6.4.4.4p10).  Note that multiple
    849   // character constants are not sign extended in the this implementation:
    850   // '\xFF\xFF' = 65536 and '\x0\xFF' = 255, which matches GCC.
    851   if (isAscii() && NumCharsSoFar == 1 && (Value & 128) &&
    852       PP.getLangOptions().CharIsSigned)
    853     Value = (signed char)Value;
    854 }
    855 
    856 
    857 ///       string-literal: [C++0x lex.string]
    858 ///         encoding-prefix " [s-char-sequence] "
    859 ///         encoding-prefix R raw-string
    860 ///       encoding-prefix:
    861 ///         u8
    862 ///         u
    863 ///         U
    864 ///         L
    865 ///       s-char-sequence:
    866 ///         s-char
    867 ///         s-char-sequence s-char
    868 ///       s-char:
    869 ///         any member of the source character set except the double-quote ",
    870 ///           backslash \, or new-line character
    871 ///         escape-sequence
    872 ///         universal-character-name
    873 ///       raw-string:
    874 ///         " d-char-sequence ( r-char-sequence ) d-char-sequence "
    875 ///       r-char-sequence:
    876 ///         r-char
    877 ///         r-char-sequence r-char
    878 ///       r-char:
    879 ///         any member of the source character set, except a right parenthesis )
    880 ///           followed by the initial d-char-sequence (which may be empty)
    881 ///           followed by a double quote ".
    882 ///       d-char-sequence:
    883 ///         d-char
    884 ///         d-char-sequence d-char
    885 ///       d-char:
    886 ///         any member of the basic source character set except:
    887 ///           space, the left parenthesis (, the right parenthesis ),
    888 ///           the backslash \, and the control characters representing horizontal
    889 ///           tab, vertical tab, form feed, and newline.
    890 ///       escape-sequence: [C++0x lex.ccon]
    891 ///         simple-escape-sequence
    892 ///         octal-escape-sequence
    893 ///         hexadecimal-escape-sequence
    894 ///       simple-escape-sequence:
    895 ///         one of \' \" \? \\ \a \b \f \n \r \t \v
    896 ///       octal-escape-sequence:
    897 ///         \ octal-digit
    898 ///         \ octal-digit octal-digit
    899 ///         \ octal-digit octal-digit octal-digit
    900 ///       hexadecimal-escape-sequence:
    901 ///         \x hexadecimal-digit
    902 ///         hexadecimal-escape-sequence hexadecimal-digit
    903 ///       universal-character-name:
    904 ///         \u hex-quad
    905 ///         \U hex-quad hex-quad
    906 ///       hex-quad:
    907 ///         hex-digit hex-digit hex-digit hex-digit
    908 ///
    909 StringLiteralParser::
    910 StringLiteralParser(const Token *StringToks, unsigned NumStringToks,
    911                     Preprocessor &PP, bool Complain)
    912   : SM(PP.getSourceManager()), Features(PP.getLangOptions()),
    913     Target(PP.getTargetInfo()), Diags(Complain ? &PP.getDiagnostics() : 0),
    914     MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
    915     ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) {
    916   init(StringToks, NumStringToks);
    917 }
    918 
    919 void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
    920   // The literal token may have come from an invalid source location (e.g. due
    921   // to a PCH error), in which case the token length will be 0.
    922   if (NumStringToks == 0 || StringToks[0].getLength() < 2) {
    923     hadError = true;
    924     return;
    925   }
    926 
    927   // Scan all of the string portions, remember the max individual token length,
    928   // computing a bound on the concatenated string length, and see whether any
    929   // piece is a wide-string.  If any of the string portions is a wide-string
    930   // literal, the result is a wide-string literal [C99 6.4.5p4].
    931   assert(NumStringToks && "expected at least one token");
    932   MaxTokenLength = StringToks[0].getLength();
    933   assert(StringToks[0].getLength() >= 2 && "literal token is invalid!");
    934   SizeBound = StringToks[0].getLength()-2;  // -2 for "".
    935   Kind = StringToks[0].getKind();
    936 
    937   hadError = false;
    938 
    939   // Implement Translation Phase #6: concatenation of string literals
    940   /// (C99 5.1.1.2p1).  The common case is only one string fragment.
    941   for (unsigned i = 1; i != NumStringToks; ++i) {
    942     if (StringToks[i].getLength() < 2) {
    943       hadError = true;
    944       return;
    945     }
    946 
    947     // The string could be shorter than this if it needs cleaning, but this is a
    948     // reasonable bound, which is all we need.
    949     assert(StringToks[i].getLength() >= 2 && "literal token is invalid!");
    950     SizeBound += StringToks[i].getLength()-2;  // -2 for "".
    951 
    952     // Remember maximum string piece length.
    953     if (StringToks[i].getLength() > MaxTokenLength)
    954       MaxTokenLength = StringToks[i].getLength();
    955 
    956     // Remember if we see any wide or utf-8/16/32 strings.
    957     // Also check for illegal concatenations.
    958     if (StringToks[i].isNot(Kind) && StringToks[i].isNot(tok::string_literal)) {
    959       if (isAscii()) {
    960         Kind = StringToks[i].getKind();
    961       } else {
    962         if (Diags)
    963           Diags->Report(FullSourceLoc(StringToks[i].getLocation(), SM),
    964                         diag::err_unsupported_string_concat);
    965         hadError = true;
    966       }
    967     }
    968   }
    969 
    970   // Include space for the null terminator.
    971   ++SizeBound;
    972 
    973   // TODO: K&R warning: "traditional C rejects string constant concatenation"
    974 
    975   // Get the width in bytes of char/wchar_t/char16_t/char32_t
    976   CharByteWidth = getCharWidth(Kind, Target);
    977   assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple");
    978   CharByteWidth /= 8;
    979 
    980   // The output buffer size needs to be large enough to hold wide characters.
    981   // This is a worst-case assumption which basically corresponds to L"" "long".
    982   SizeBound *= CharByteWidth;
    983 
    984   // Size the temporary buffer to hold the result string data.
    985   ResultBuf.resize(SizeBound);
    986 
    987   // Likewise, but for each string piece.
    988   llvm::SmallString<512> TokenBuf;
    989   TokenBuf.resize(MaxTokenLength);
    990 
    991   // Loop over all the strings, getting their spelling, and expanding them to
    992   // wide strings as appropriate.
    993   ResultPtr = &ResultBuf[0];   // Next byte to fill in.
    994 
    995   Pascal = false;
    996 
    997   for (unsigned i = 0, e = NumStringToks; i != e; ++i) {
    998     const char *ThisTokBuf = &TokenBuf[0];
    999     // Get the spelling of the token, which eliminates trigraphs, etc.  We know
   1000     // that ThisTokBuf points to a buffer that is big enough for the whole token
   1001     // and 'spelled' tokens can only shrink.
   1002     bool StringInvalid = false;
   1003     unsigned ThisTokLen =
   1004       Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features,
   1005                          &StringInvalid);
   1006     if (StringInvalid) {
   1007       hadError = true;
   1008       continue;
   1009     }
   1010 
   1011     const char *ThisTokEnd = ThisTokBuf+ThisTokLen-1;  // Skip end quote.
   1012     // TODO: Input character set mapping support.
   1013 
   1014     // Skip marker for wide or unicode strings.
   1015     if (ThisTokBuf[0] == 'L' || ThisTokBuf[0] == 'u' || ThisTokBuf[0] == 'U') {
   1016       ++ThisTokBuf;
   1017       // Skip 8 of u8 marker for utf8 strings.
   1018       if (ThisTokBuf[0] == '8')
   1019         ++ThisTokBuf;
   1020     }
   1021 
   1022     // Check for raw string
   1023     if (ThisTokBuf[0] == 'R') {
   1024       ThisTokBuf += 2; // skip R"
   1025 
   1026       const char *Prefix = ThisTokBuf;
   1027       while (ThisTokBuf[0] != '(')
   1028         ++ThisTokBuf;
   1029       ++ThisTokBuf; // skip '('
   1030 
   1031       // remove same number of characters from the end
   1032       if (ThisTokEnd >= ThisTokBuf + (ThisTokBuf - Prefix))
   1033         ThisTokEnd -= (ThisTokBuf - Prefix);
   1034 
   1035       // Copy the string over
   1036       CopyStringFragment(StringRef(ThisTokBuf, ThisTokEnd - ThisTokBuf));
   1037     } else {
   1038       assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?");
   1039       ++ThisTokBuf; // skip "
   1040 
   1041       // Check if this is a pascal string
   1042       if (Features.PascalStrings && ThisTokBuf + 1 != ThisTokEnd &&
   1043           ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') {
   1044 
   1045         // If the \p sequence is found in the first token, we have a pascal string
   1046         // Otherwise, if we already have a pascal string, ignore the first \p
   1047         if (i == 0) {
   1048           ++ThisTokBuf;
   1049           Pascal = true;
   1050         } else if (Pascal)
   1051           ThisTokBuf += 2;
   1052       }
   1053 
   1054       while (ThisTokBuf != ThisTokEnd) {
   1055         // Is this a span of non-escape characters?
   1056         if (ThisTokBuf[0] != '\\') {
   1057           const char *InStart = ThisTokBuf;
   1058           do {
   1059             ++ThisTokBuf;
   1060           } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
   1061 
   1062           // Copy the character span over.
   1063           CopyStringFragment(StringRef(InStart, ThisTokBuf - InStart));
   1064           continue;
   1065         }
   1066         // Is this a Universal Character Name escape?
   1067         if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
   1068           EncodeUCNEscape(ThisTokBuf, ThisTokEnd, ResultPtr,
   1069                           hadError, FullSourceLoc(StringToks[i].getLocation(),SM),
   1070                           CharByteWidth, Diags, Features);
   1071           continue;
   1072         }
   1073         // Otherwise, this is a non-UCN escape character.  Process it.
   1074         unsigned ResultChar =
   1075           ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError,
   1076                             FullSourceLoc(StringToks[i].getLocation(), SM),
   1077                             CharByteWidth*8, Diags);
   1078 
   1079         // Note: our internal rep of wide char tokens is always little-endian.
   1080         *ResultPtr++ = ResultChar & 0xFF;
   1081 
   1082         for (unsigned i = 1, e = CharByteWidth; i != e; ++i)
   1083           *ResultPtr++ = ResultChar >> i*8;
   1084       }
   1085     }
   1086   }
   1087 
   1088   if (Pascal) {
   1089     ResultBuf[0] = ResultPtr-&ResultBuf[0]-1;
   1090     ResultBuf[0] /= CharByteWidth;
   1091 
   1092     // Verify that pascal strings aren't too large.
   1093     if (GetStringLength() > 256) {
   1094       if (Diags)
   1095         Diags->Report(FullSourceLoc(StringToks[0].getLocation(), SM),
   1096                       diag::err_pascal_string_too_long)
   1097           << SourceRange(StringToks[0].getLocation(),
   1098                          StringToks[NumStringToks-1].getLocation());
   1099       hadError = true;
   1100       return;
   1101     }
   1102   } else if (Diags) {
   1103     // Complain if this string literal has too many characters.
   1104     unsigned MaxChars = Features.CPlusPlus? 65536 : Features.C99 ? 4095 : 509;
   1105 
   1106     if (GetNumStringChars() > MaxChars)
   1107       Diags->Report(FullSourceLoc(StringToks[0].getLocation(), SM),
   1108                     diag::ext_string_too_long)
   1109         << GetNumStringChars() << MaxChars
   1110         << (Features.CPlusPlus ? 2 : Features.C99 ? 1 : 0)
   1111         << SourceRange(StringToks[0].getLocation(),
   1112                        StringToks[NumStringToks-1].getLocation());
   1113   }
   1114 }
   1115 
   1116 
   1117 /// copyStringFragment - This function copies from Start to End into ResultPtr.
   1118 /// Performs widening for multi-byte characters.
   1119 void StringLiteralParser::CopyStringFragment(StringRef Fragment) {
   1120   // Copy the character span over.
   1121   if (CharByteWidth == 1) {
   1122     memcpy(ResultPtr, Fragment.data(), Fragment.size());
   1123     ResultPtr += Fragment.size();
   1124   } else {
   1125     // Note: our internal rep of wide char tokens is always little-endian.
   1126     for (StringRef::iterator I=Fragment.begin(), E=Fragment.end(); I!=E; ++I) {
   1127       *ResultPtr++ = *I;
   1128       // Add zeros at the end.
   1129       for (unsigned i = 1, e = CharByteWidth; i != e; ++i)
   1130         *ResultPtr++ = 0;
   1131     }
   1132   }
   1133 }
   1134 
   1135 
   1136 /// getOffsetOfStringByte - This function returns the offset of the
   1137 /// specified byte of the string data represented by Token.  This handles
   1138 /// advancing over escape sequences in the string.
   1139 unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
   1140                                                     unsigned ByteNo) const {
   1141   // Get the spelling of the token.
   1142   llvm::SmallString<32> SpellingBuffer;
   1143   SpellingBuffer.resize(Tok.getLength());
   1144 
   1145   bool StringInvalid = false;
   1146   const char *SpellingPtr = &SpellingBuffer[0];
   1147   unsigned TokLen = Lexer::getSpelling(Tok, SpellingPtr, SM, Features,
   1148                                        &StringInvalid);
   1149   if (StringInvalid)
   1150     return 0;
   1151 
   1152   assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&
   1153          SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet");
   1154 
   1155 
   1156   const char *SpellingStart = SpellingPtr;
   1157   const char *SpellingEnd = SpellingPtr+TokLen;
   1158 
   1159   // Skip over the leading quote.
   1160   assert(SpellingPtr[0] == '"' && "Should be a string literal!");
   1161   ++SpellingPtr;
   1162 
   1163   // Skip over bytes until we find the offset we're looking for.
   1164   while (ByteNo) {
   1165     assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!");
   1166 
   1167     // Step over non-escapes simply.
   1168     if (*SpellingPtr != '\\') {
   1169       ++SpellingPtr;
   1170       --ByteNo;
   1171       continue;
   1172     }
   1173 
   1174     // Otherwise, this is an escape character.  Advance over it.
   1175     bool HadError = false;
   1176     ProcessCharEscape(SpellingPtr, SpellingEnd, HadError,
   1177                       FullSourceLoc(Tok.getLocation(), SM),
   1178                       CharByteWidth*8, Diags);
   1179     assert(!HadError && "This method isn't valid on erroneous strings");
   1180     --ByteNo;
   1181   }
   1182 
   1183   return SpellingPtr-SpellingStart;
   1184 }
   1185