Home | History | Annotate | Download | only in Lex
      1 //===--- LiteralSupport.cpp - Code to parse and process literals ----------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 //
     10 // This file implements the NumericLiteralParser, CharLiteralParser, and
     11 // StringLiteralParser interfaces.
     12 //
     13 //===----------------------------------------------------------------------===//
     14 
     15 #include "clang/Lex/LiteralSupport.h"
     16 #include "clang/Lex/Preprocessor.h"
     17 #include "clang/Lex/LexDiagnostic.h"
     18 #include "clang/Basic/TargetInfo.h"
     19 #include "clang/Basic/ConvertUTF.h"
     20 #include "llvm/ADT/StringExtras.h"
     21 #include "llvm/Support/ErrorHandling.h"
     22 using namespace clang;
     23 
     24 /// HexDigitValue - Return the value of the specified hex digit, or -1 if it's
     25 /// not valid.
     26 static int HexDigitValue(char C) {
     27   if (C >= '0' && C <= '9') return C-'0';
     28   if (C >= 'a' && C <= 'f') return C-'a'+10;
     29   if (C >= 'A' && C <= 'F') return C-'A'+10;
     30   return -1;
     31 }
     32 
     33 static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {
     34   switch (kind) {
     35   default: llvm_unreachable("Unknown token type!");
     36   case tok::char_constant:
     37   case tok::string_literal:
     38   case tok::utf8_string_literal:
     39     return Target.getCharWidth();
     40   case tok::wide_char_constant:
     41   case tok::wide_string_literal:
     42     return Target.getWCharWidth();
     43   case tok::utf16_char_constant:
     44   case tok::utf16_string_literal:
     45     return Target.getChar16Width();
     46   case tok::utf32_char_constant:
     47   case tok::utf32_string_literal:
     48     return Target.getChar32Width();
     49   }
     50 }
     51 
     52 /// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
     53 /// either a character or a string literal.
     54 static unsigned ProcessCharEscape(const char *&ThisTokBuf,
     55                                   const char *ThisTokEnd, bool &HadError,
     56                                   FullSourceLoc Loc, unsigned CharWidth,
     57                                   DiagnosticsEngine *Diags) {
     58   // Skip the '\' char.
     59   ++ThisTokBuf;
     60 
     61   // We know that this character can't be off the end of the buffer, because
     62   // that would have been \", which would not have been the end of string.
     63   unsigned ResultChar = *ThisTokBuf++;
     64   switch (ResultChar) {
     65   // These map to themselves.
     66   case '\\': case '\'': case '"': case '?': break;
     67 
     68     // These have fixed mappings.
     69   case 'a':
     70     // TODO: K&R: the meaning of '\\a' is different in traditional C
     71     ResultChar = 7;
     72     break;
     73   case 'b':
     74     ResultChar = 8;
     75     break;
     76   case 'e':
     77     if (Diags)
     78       Diags->Report(Loc, diag::ext_nonstandard_escape) << "e";
     79     ResultChar = 27;
     80     break;
     81   case 'E':
     82     if (Diags)
     83       Diags->Report(Loc, diag::ext_nonstandard_escape) << "E";
     84     ResultChar = 27;
     85     break;
     86   case 'f':
     87     ResultChar = 12;
     88     break;
     89   case 'n':
     90     ResultChar = 10;
     91     break;
     92   case 'r':
     93     ResultChar = 13;
     94     break;
     95   case 't':
     96     ResultChar = 9;
     97     break;
     98   case 'v':
     99     ResultChar = 11;
    100     break;
    101   case 'x': { // Hex escape.
    102     ResultChar = 0;
    103     if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {
    104       if (Diags)
    105         Diags->Report(Loc, diag::err_hex_escape_no_digits);
    106       HadError = 1;
    107       break;
    108     }
    109 
    110     // Hex escapes are a maximal series of hex digits.
    111     bool Overflow = false;
    112     for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) {
    113       int CharVal = HexDigitValue(ThisTokBuf[0]);
    114       if (CharVal == -1) break;
    115       // About to shift out a digit?
    116       Overflow |= (ResultChar & 0xF0000000) ? true : false;
    117       ResultChar <<= 4;
    118       ResultChar |= CharVal;
    119     }
    120 
    121     // See if any bits will be truncated when evaluated as a character.
    122     if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
    123       Overflow = true;
    124       ResultChar &= ~0U >> (32-CharWidth);
    125     }
    126 
    127     // Check for overflow.
    128     if (Overflow && Diags)   // Too many digits to fit in
    129       Diags->Report(Loc, diag::warn_hex_escape_too_large);
    130     break;
    131   }
    132   case '0': case '1': case '2': case '3':
    133   case '4': case '5': case '6': case '7': {
    134     // Octal escapes.
    135     --ThisTokBuf;
    136     ResultChar = 0;
    137 
    138     // Octal escapes are a series of octal digits with maximum length 3.
    139     // "\0123" is a two digit sequence equal to "\012" "3".
    140     unsigned NumDigits = 0;
    141     do {
    142       ResultChar <<= 3;
    143       ResultChar |= *ThisTokBuf++ - '0';
    144       ++NumDigits;
    145     } while (ThisTokBuf != ThisTokEnd && NumDigits < 3 &&
    146              ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7');
    147 
    148     // Check for overflow.  Reject '\777', but not L'\777'.
    149     if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
    150       if (Diags)
    151         Diags->Report(Loc, diag::warn_octal_escape_too_large);
    152       ResultChar &= ~0U >> (32-CharWidth);
    153     }
    154     break;
    155   }
    156 
    157     // Otherwise, these are not valid escapes.
    158   case '(': case '{': case '[': case '%':
    159     // GCC accepts these as extensions.  We warn about them as such though.
    160     if (Diags)
    161       Diags->Report(Loc, diag::ext_nonstandard_escape)
    162         << std::string()+(char)ResultChar;
    163     break;
    164   default:
    165     if (Diags == 0)
    166       break;
    167 
    168     if (isgraph(ResultChar))
    169       Diags->Report(Loc, diag::ext_unknown_escape)
    170         << std::string()+(char)ResultChar;
    171     else
    172       Diags->Report(Loc, diag::ext_unknown_escape)
    173         << "x"+llvm::utohexstr(ResultChar);
    174     break;
    175   }
    176 
    177   return ResultChar;
    178 }
    179 
    180 /// ProcessUCNEscape - Read the Universal Character Name, check constraints and
    181 /// return the UTF32.
    182 static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
    183                              const char *ThisTokEnd,
    184                              uint32_t &UcnVal, unsigned short &UcnLen,
    185                              FullSourceLoc Loc, DiagnosticsEngine *Diags,
    186                              const LangOptions &Features,
    187                              bool in_char_string_literal = false) {
    188   if (!Features.CPlusPlus && !Features.C99 && Diags)
    189     Diags->Report(Loc, diag::warn_ucn_not_valid_in_c89);
    190 
    191   const char *UcnBegin = ThisTokBuf;
    192 
    193   // Skip the '\u' char's.
    194   ThisTokBuf += 2;
    195 
    196   if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {
    197     if (Diags)
    198       Diags->Report(Loc, diag::err_ucn_escape_no_digits);
    199     return false;
    200   }
    201   UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
    202   unsigned short UcnLenSave = UcnLen;
    203   for (; ThisTokBuf != ThisTokEnd && UcnLenSave; ++ThisTokBuf, UcnLenSave--) {
    204     int CharVal = HexDigitValue(ThisTokBuf[0]);
    205     if (CharVal == -1) break;
    206     UcnVal <<= 4;
    207     UcnVal |= CharVal;
    208   }
    209   // If we didn't consume the proper number of digits, there is a problem.
    210   if (UcnLenSave) {
    211     if (Diags) {
    212       SourceLocation L =
    213         Lexer::AdvanceToTokenCharacter(Loc, UcnBegin - ThisTokBegin,
    214                                        Loc.getManager(), Features);
    215       Diags->Report(L, diag::err_ucn_escape_incomplete);
    216     }
    217     return false;
    218   }
    219 
    220   // Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2]
    221   if ((0xD800 <= UcnVal && UcnVal <= 0xDFFF) || // surrogate codepoints
    222       UcnVal > 0x10FFFF) {                      // maximum legal UTF32 value
    223     if (Diags)
    224       Diags->Report(Loc, diag::err_ucn_escape_invalid);
    225     return false;
    226   }
    227 
    228   // C++11 allows UCNs that refer to control characters and basic source
    229   // characters inside character and string literals
    230   if (UcnVal < 0xa0 &&
    231       (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60)) {  // $, @, `
    232     bool IsError = (!Features.CPlusPlus0x || !in_char_string_literal);
    233     if (Diags) {
    234       SourceLocation UcnBeginLoc =
    235         Lexer::AdvanceToTokenCharacter(Loc, UcnBegin - ThisTokBegin,
    236                                        Loc.getManager(), Features);
    237       char BasicSCSChar = UcnVal;
    238       if (UcnVal >= 0x20 && UcnVal < 0x7f)
    239         Diags->Report(UcnBeginLoc, IsError ? diag::err_ucn_escape_basic_scs :
    240                       diag::warn_cxx98_compat_literal_ucn_escape_basic_scs)
    241           << StringRef(&BasicSCSChar, 1);
    242       else
    243         Diags->Report(UcnBeginLoc, IsError ? diag::err_ucn_control_character :
    244                       diag::warn_cxx98_compat_literal_ucn_control_character);
    245     }
    246     if (IsError)
    247       return false;
    248   }
    249 
    250   return true;
    251 }
    252 
    253 /// EncodeUCNEscape - Read the Universal Character Name, check constraints and
    254 /// convert the UTF32 to UTF8 or UTF16. This is a subroutine of
    255 /// StringLiteralParser. When we decide to implement UCN's for identifiers,
    256 /// we will likely rework our support for UCN's.
    257 static void EncodeUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
    258                             const char *ThisTokEnd,
    259                             char *&ResultBuf, bool &HadError,
    260                             FullSourceLoc Loc, unsigned CharByteWidth,
    261                             DiagnosticsEngine *Diags,
    262                             const LangOptions &Features) {
    263   typedef uint32_t UTF32;
    264   UTF32 UcnVal = 0;
    265   unsigned short UcnLen = 0;
    266   if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, UcnLen,
    267                         Loc, Diags, Features, true)) {
    268     HadError = 1;
    269     return;
    270   }
    271 
    272   assert((CharByteWidth == 1 || CharByteWidth == 2 || CharByteWidth) &&
    273          "only character widths of 1, 2, or 4 bytes supported");
    274 
    275   (void)UcnLen;
    276   assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported");
    277 
    278   if (CharByteWidth == 4) {
    279     // FIXME: Make the type of the result buffer correct instead of
    280     // using reinterpret_cast.
    281     UTF32 *ResultPtr = reinterpret_cast<UTF32*>(ResultBuf);
    282     *ResultPtr = UcnVal;
    283     ResultBuf += 4;
    284     return;
    285   }
    286 
    287   if (CharByteWidth == 2) {
    288     // FIXME: Make the type of the result buffer correct instead of
    289     // using reinterpret_cast.
    290     UTF16 *ResultPtr = reinterpret_cast<UTF16*>(ResultBuf);
    291 
    292     if (UcnVal < (UTF32)0xFFFF) {
    293       *ResultPtr = UcnVal;
    294       ResultBuf += 2;
    295       return;
    296     }
    297 
    298     // Convert to UTF16.
    299     UcnVal -= 0x10000;
    300     *ResultPtr     = 0xD800 + (UcnVal >> 10);
    301     *(ResultPtr+1) = 0xDC00 + (UcnVal & 0x3FF);
    302     ResultBuf += 4;
    303     return;
    304   }
    305 
    306   assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters");
    307 
    308   // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
    309   // The conversion below was inspired by:
    310   //   http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
    311   // First, we determine how many bytes the result will require.
    312   typedef uint8_t UTF8;
    313 
    314   unsigned short bytesToWrite = 0;
    315   if (UcnVal < (UTF32)0x80)
    316     bytesToWrite = 1;
    317   else if (UcnVal < (UTF32)0x800)
    318     bytesToWrite = 2;
    319   else if (UcnVal < (UTF32)0x10000)
    320     bytesToWrite = 3;
    321   else
    322     bytesToWrite = 4;
    323 
    324   const unsigned byteMask = 0xBF;
    325   const unsigned byteMark = 0x80;
    326 
    327   // Once the bits are split out into bytes of UTF8, this is a mask OR-ed
    328   // into the first byte, depending on how many bytes follow.
    329   static const UTF8 firstByteMark[5] = {
    330     0x00, 0x00, 0xC0, 0xE0, 0xF0
    331   };
    332   // Finally, we write the bytes into ResultBuf.
    333   ResultBuf += bytesToWrite;
    334   switch (bytesToWrite) { // note: everything falls through.
    335     case 4: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
    336     case 3: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
    337     case 2: *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
    338     case 1: *--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]);
    339   }
    340   // Update the buffer.
    341   ResultBuf += bytesToWrite;
    342 }
    343 
    344 
    345 ///       integer-constant: [C99 6.4.4.1]
    346 ///         decimal-constant integer-suffix
    347 ///         octal-constant integer-suffix
    348 ///         hexadecimal-constant integer-suffix
    349 ///       user-defined-integer-literal: [C++11 lex.ext]
    350 ///         decimal-literal ud-suffix
    351 ///         octal-literal ud-suffix
    352 ///         hexadecimal-literal ud-suffix
    353 ///       decimal-constant:
    354 ///         nonzero-digit
    355 ///         decimal-constant digit
    356 ///       octal-constant:
    357 ///         0
    358 ///         octal-constant octal-digit
    359 ///       hexadecimal-constant:
    360 ///         hexadecimal-prefix hexadecimal-digit
    361 ///         hexadecimal-constant hexadecimal-digit
    362 ///       hexadecimal-prefix: one of
    363 ///         0x 0X
    364 ///       integer-suffix:
    365 ///         unsigned-suffix [long-suffix]
    366 ///         unsigned-suffix [long-long-suffix]
    367 ///         long-suffix [unsigned-suffix]
    368 ///         long-long-suffix [unsigned-suffix]
    369 ///       nonzero-digit:
    370 ///         1 2 3 4 5 6 7 8 9
    371 ///       octal-digit:
    372 ///         0 1 2 3 4 5 6 7
    373 ///       hexadecimal-digit:
    374 ///         0 1 2 3 4 5 6 7 8 9
    375 ///         a b c d e f
    376 ///         A B C D E F
    377 ///       unsigned-suffix: one of
    378 ///         u U
    379 ///       long-suffix: one of
    380 ///         l L
    381 ///       long-long-suffix: one of
    382 ///         ll LL
    383 ///
    384 ///       floating-constant: [C99 6.4.4.2]
    385 ///         TODO: add rules...
    386 ///
    387 NumericLiteralParser::
    388 NumericLiteralParser(const char *begin, const char *end,
    389                      SourceLocation TokLoc, Preprocessor &pp)
    390   : PP(pp), ThisTokBegin(begin), ThisTokEnd(end) {
    391 
    392   // This routine assumes that the range begin/end matches the regex for integer
    393   // and FP constants (specifically, the 'pp-number' regex), and assumes that
    394   // the byte at "*end" is both valid and not part of the regex.  Because of
    395   // this, it doesn't have to check for 'overscan' in various places.
    396   assert(!isalnum(*end) && *end != '.' && *end != '_' &&
    397          "Lexer didn't maximally munch?");
    398 
    399   s = DigitsBegin = begin;
    400   saw_exponent = false;
    401   saw_period = false;
    402   saw_ud_suffix = false;
    403   isLong = false;
    404   isUnsigned = false;
    405   isLongLong = false;
    406   isFloat = false;
    407   isImaginary = false;
    408   isMicrosoftInteger = false;
    409   hadError = false;
    410 
    411   if (*s == '0') { // parse radix
    412     ParseNumberStartingWithZero(TokLoc);
    413     if (hadError)
    414       return;
    415   } else { // the first digit is non-zero
    416     radix = 10;
    417     s = SkipDigits(s);
    418     if (s == ThisTokEnd) {
    419       // Done.
    420     } else if (isxdigit(*s) && !(*s == 'e' || *s == 'E')) {
    421       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin),
    422               diag::err_invalid_decimal_digit) << StringRef(s, 1);
    423       hadError = true;
    424       return;
    425     } else if (*s == '.') {
    426       s++;
    427       saw_period = true;
    428       s = SkipDigits(s);
    429     }
    430     if ((*s == 'e' || *s == 'E')) { // exponent
    431       const char *Exponent = s;
    432       s++;
    433       saw_exponent = true;
    434       if (*s == '+' || *s == '-')  s++; // sign
    435       const char *first_non_digit = SkipDigits(s);
    436       if (first_non_digit != s) {
    437         s = first_non_digit;
    438       } else {
    439         PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-begin),
    440                 diag::err_exponent_has_no_digits);
    441         hadError = true;
    442         return;
    443       }
    444     }
    445   }
    446 
    447   SuffixBegin = s;
    448 
    449   // Parse the suffix.  At this point we can classify whether we have an FP or
    450   // integer constant.
    451   bool isFPConstant = isFloatingLiteral();
    452 
    453   // Loop over all of the characters of the suffix.  If we see something bad,
    454   // we break out of the loop.
    455   for (; s != ThisTokEnd; ++s) {
    456     switch (*s) {
    457     case 'f':      // FP Suffix for "float"
    458     case 'F':
    459       if (!isFPConstant) break;  // Error for integer constant.
    460       if (isFloat || isLong) break; // FF, LF invalid.
    461       isFloat = true;
    462       continue;  // Success.
    463     case 'u':
    464     case 'U':
    465       if (isFPConstant) break;  // Error for floating constant.
    466       if (isUnsigned) break;    // Cannot be repeated.
    467       isUnsigned = true;
    468       continue;  // Success.
    469     case 'l':
    470     case 'L':
    471       if (isLong || isLongLong) break;  // Cannot be repeated.
    472       if (isFloat) break;               // LF invalid.
    473 
    474       // Check for long long.  The L's need to be adjacent and the same case.
    475       if (s+1 != ThisTokEnd && s[1] == s[0]) {
    476         if (isFPConstant) break;        // long long invalid for floats.
    477         isLongLong = true;
    478         ++s;  // Eat both of them.
    479       } else {
    480         isLong = true;
    481       }
    482       continue;  // Success.
    483     case 'i':
    484     case 'I':
    485       if (PP.getLangOpts().MicrosoftExt) {
    486         if (isFPConstant || isLong || isLongLong) break;
    487 
    488         // Allow i8, i16, i32, i64, and i128.
    489         if (s + 1 != ThisTokEnd) {
    490           switch (s[1]) {
    491             case '8':
    492               s += 2; // i8 suffix
    493               isMicrosoftInteger = true;
    494               break;
    495             case '1':
    496               if (s + 2 == ThisTokEnd) break;
    497               if (s[2] == '6') {
    498                 s += 3; // i16 suffix
    499                 isMicrosoftInteger = true;
    500               }
    501               else if (s[2] == '2') {
    502                 if (s + 3 == ThisTokEnd) break;
    503                 if (s[3] == '8') {
    504                   s += 4; // i128 suffix
    505                   isMicrosoftInteger = true;
    506                 }
    507               }
    508               break;
    509             case '3':
    510               if (s + 2 == ThisTokEnd) break;
    511               if (s[2] == '2') {
    512                 s += 3; // i32 suffix
    513                 isLong = true;
    514                 isMicrosoftInteger = true;
    515               }
    516               break;
    517             case '6':
    518               if (s + 2 == ThisTokEnd) break;
    519               if (s[2] == '4') {
    520                 s += 3; // i64 suffix
    521                 isLongLong = true;
    522                 isMicrosoftInteger = true;
    523               }
    524               break;
    525             default:
    526               break;
    527           }
    528           break;
    529         }
    530       }
    531       // fall through.
    532     case 'j':
    533     case 'J':
    534       if (isImaginary) break;   // Cannot be repeated.
    535       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-begin),
    536               diag::ext_imaginary_constant);
    537       isImaginary = true;
    538       continue;  // Success.
    539     }
    540     // If we reached here, there was an error or a ud-suffix.
    541     break;
    542   }
    543 
    544   if (s != ThisTokEnd) {
    545     if (PP.getLangOpts().CPlusPlus0x && s == SuffixBegin && *s == '_') {
    546       // We have a ud-suffix! By C++11 [lex.ext]p10, ud-suffixes not starting
    547       // with an '_' are ill-formed.
    548       saw_ud_suffix = true;
    549       return;
    550     }
    551 
    552     // Report an error if there are any.
    553     PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, SuffixBegin-begin),
    554             isFPConstant ? diag::err_invalid_suffix_float_constant :
    555                            diag::err_invalid_suffix_integer_constant)
    556       << StringRef(SuffixBegin, ThisTokEnd-SuffixBegin);
    557     hadError = true;
    558     return;
    559   }
    560 }
    561 
    562 /// ParseNumberStartingWithZero - This method is called when the first character
    563 /// of the number is found to be a zero.  This means it is either an octal
    564 /// number (like '04') or a hex number ('0x123a') a binary number ('0b1010') or
    565 /// a floating point number (01239.123e4).  Eat the prefix, determining the
    566 /// radix etc.
    567 void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
    568   assert(s[0] == '0' && "Invalid method call");
    569   s++;
    570 
    571   // Handle a hex number like 0x1234.
    572   if ((*s == 'x' || *s == 'X') && (isxdigit(s[1]) || s[1] == '.')) {
    573     s++;
    574     radix = 16;
    575     DigitsBegin = s;
    576     s = SkipHexDigits(s);
    577     bool noSignificand = (s == DigitsBegin);
    578     if (s == ThisTokEnd) {
    579       // Done.
    580     } else if (*s == '.') {
    581       s++;
    582       saw_period = true;
    583       const char *floatDigitsBegin = s;
    584       s = SkipHexDigits(s);
    585       noSignificand &= (floatDigitsBegin == s);
    586     }
    587 
    588     if (noSignificand) {
    589       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin), \
    590         diag::err_hexconstant_requires_digits);
    591       hadError = true;
    592       return;
    593     }
    594 
    595     // A binary exponent can appear with or with a '.'. If dotted, the
    596     // binary exponent is required.
    597     if (*s == 'p' || *s == 'P') {
    598       const char *Exponent = s;
    599       s++;
    600       saw_exponent = true;
    601       if (*s == '+' || *s == '-')  s++; // sign
    602       const char *first_non_digit = SkipDigits(s);
    603       if (first_non_digit == s) {
    604         PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
    605                 diag::err_exponent_has_no_digits);
    606         hadError = true;
    607         return;
    608       }
    609       s = first_non_digit;
    610 
    611       if (!PP.getLangOpts().HexFloats)
    612         PP.Diag(TokLoc, diag::ext_hexconstant_invalid);
    613     } else if (saw_period) {
    614       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
    615               diag::err_hexconstant_requires_exponent);
    616       hadError = true;
    617     }
    618     return;
    619   }
    620 
    621   // Handle simple binary numbers 0b01010
    622   if (*s == 'b' || *s == 'B') {
    623     // 0b101010 is a GCC extension.
    624     PP.Diag(TokLoc, diag::ext_binary_literal);
    625     ++s;
    626     radix = 2;
    627     DigitsBegin = s;
    628     s = SkipBinaryDigits(s);
    629     if (s == ThisTokEnd) {
    630       // Done.
    631     } else if (isxdigit(*s)) {
    632       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
    633               diag::err_invalid_binary_digit) << StringRef(s, 1);
    634       hadError = true;
    635     }
    636     // Other suffixes will be diagnosed by the caller.
    637     return;
    638   }
    639 
    640   // For now, the radix is set to 8. If we discover that we have a
    641   // floating point constant, the radix will change to 10. Octal floating
    642   // point constants are not permitted (only decimal and hexadecimal).
    643   radix = 8;
    644   DigitsBegin = s;
    645   s = SkipOctalDigits(s);
    646   if (s == ThisTokEnd)
    647     return; // Done, simple octal number like 01234
    648 
    649   // If we have some other non-octal digit that *is* a decimal digit, see if
    650   // this is part of a floating point number like 094.123 or 09e1.
    651   if (isdigit(*s)) {
    652     const char *EndDecimal = SkipDigits(s);
    653     if (EndDecimal[0] == '.' || EndDecimal[0] == 'e' || EndDecimal[0] == 'E') {
    654       s = EndDecimal;
    655       radix = 10;
    656     }
    657   }
    658 
    659   // If we have a hex digit other than 'e' (which denotes a FP exponent) then
    660   // the code is using an incorrect base.
    661   if (isxdigit(*s) && *s != 'e' && *s != 'E') {
    662     PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, s-ThisTokBegin),
    663             diag::err_invalid_octal_digit) << StringRef(s, 1);
    664     hadError = true;
    665     return;
    666   }
    667 
    668   if (*s == '.') {
    669     s++;
    670     radix = 10;
    671     saw_period = true;
    672     s = SkipDigits(s); // Skip suffix.
    673   }
    674   if (*s == 'e' || *s == 'E') { // exponent
    675     const char *Exponent = s;
    676     s++;
    677     radix = 10;
    678     saw_exponent = true;
    679     if (*s == '+' || *s == '-')  s++; // sign
    680     const char *first_non_digit = SkipDigits(s);
    681     if (first_non_digit != s) {
    682       s = first_non_digit;
    683     } else {
    684       PP.Diag(PP.AdvanceToTokenCharacter(TokLoc, Exponent-ThisTokBegin),
    685               diag::err_exponent_has_no_digits);
    686       hadError = true;
    687       return;
    688     }
    689   }
    690 }
    691 
    692 
    693 /// GetIntegerValue - Convert this numeric literal value to an APInt that
    694 /// matches Val's input width.  If there is an overflow, set Val to the low bits
    695 /// of the result and return true.  Otherwise, return false.
    696 bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) {
    697   // Fast path: Compute a conservative bound on the maximum number of
    698   // bits per digit in this radix. If we can't possibly overflow a
    699   // uint64 based on that bound then do the simple conversion to
    700   // integer. This avoids the expensive overflow checking below, and
    701   // handles the common cases that matter (small decimal integers and
    702   // hex/octal values which don't overflow).
    703   unsigned MaxBitsPerDigit = 1;
    704   while ((1U << MaxBitsPerDigit) < radix)
    705     MaxBitsPerDigit += 1;
    706   if ((SuffixBegin - DigitsBegin) * MaxBitsPerDigit <= 64) {
    707     uint64_t N = 0;
    708     for (s = DigitsBegin; s != SuffixBegin; ++s)
    709       N = N*radix + HexDigitValue(*s);
    710 
    711     // This will truncate the value to Val's input width. Simply check
    712     // for overflow by comparing.
    713     Val = N;
    714     return Val.getZExtValue() != N;
    715   }
    716 
    717   Val = 0;
    718   s = DigitsBegin;
    719 
    720   llvm::APInt RadixVal(Val.getBitWidth(), radix);
    721   llvm::APInt CharVal(Val.getBitWidth(), 0);
    722   llvm::APInt OldVal = Val;
    723 
    724   bool OverflowOccurred = false;
    725   while (s < SuffixBegin) {
    726     unsigned C = HexDigitValue(*s++);
    727 
    728     // If this letter is out of bound for this radix, reject it.
    729     assert(C < radix && "NumericLiteralParser ctor should have rejected this");
    730 
    731     CharVal = C;
    732 
    733     // Add the digit to the value in the appropriate radix.  If adding in digits
    734     // made the value smaller, then this overflowed.
    735     OldVal = Val;
    736 
    737     // Multiply by radix, did overflow occur on the multiply?
    738     Val *= RadixVal;
    739     OverflowOccurred |= Val.udiv(RadixVal) != OldVal;
    740 
    741     // Add value, did overflow occur on the value?
    742     //   (a + b) ult b  <=> overflow
    743     Val += CharVal;
    744     OverflowOccurred |= Val.ult(CharVal);
    745   }
    746   return OverflowOccurred;
    747 }
    748 
    749 llvm::APFloat::opStatus
    750 NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) {
    751   using llvm::APFloat;
    752 
    753   unsigned n = std::min(SuffixBegin - ThisTokBegin, ThisTokEnd - ThisTokBegin);
    754   return Result.convertFromString(StringRef(ThisTokBegin, n),
    755                                   APFloat::rmNearestTiesToEven);
    756 }
    757 
    758 
    759 ///       user-defined-character-literal: [C++11 lex.ext]
    760 ///         character-literal ud-suffix
    761 ///       ud-suffix:
    762 ///         identifier
    763 ///       character-literal: [C++11 lex.ccon]
    764 ///         ' c-char-sequence '
    765 ///         u' c-char-sequence '
    766 ///         U' c-char-sequence '
    767 ///         L' c-char-sequence '
    768 ///       c-char-sequence:
    769 ///         c-char
    770 ///         c-char-sequence c-char
    771 ///       c-char:
    772 ///         any member of the source character set except the single-quote ',
    773 ///           backslash \, or new-line character
    774 ///         escape-sequence
    775 ///         universal-character-name
    776 ///       escape-sequence:
    777 ///         simple-escape-sequence
    778 ///         octal-escape-sequence
    779 ///         hexadecimal-escape-sequence
    780 ///       simple-escape-sequence:
    781 ///         one of \' \" \? \\ \a \b \f \n \r \t \v
    782 ///       octal-escape-sequence:
    783 ///         \ octal-digit
    784 ///         \ octal-digit octal-digit
    785 ///         \ octal-digit octal-digit octal-digit
    786 ///       hexadecimal-escape-sequence:
    787 ///         \x hexadecimal-digit
    788 ///         hexadecimal-escape-sequence hexadecimal-digit
    789 ///       universal-character-name: [C++11 lex.charset]
    790 ///         \u hex-quad
    791 ///         \U hex-quad hex-quad
    792 ///       hex-quad:
    793 ///         hex-digit hex-digit hex-digit hex-digit
    794 ///
    795 CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
    796                                      SourceLocation Loc, Preprocessor &PP,
    797                                      tok::TokenKind kind) {
    798   // At this point we know that the character matches the regex "(L|u|U)?'.*'".
    799   HadError = false;
    800 
    801   Kind = kind;
    802 
    803   const char *TokBegin = begin;
    804 
    805   // Skip over wide character determinant.
    806   if (Kind != tok::char_constant) {
    807     ++begin;
    808   }
    809 
    810   // Skip over the entry quote.
    811   assert(begin[0] == '\'' && "Invalid token lexed");
    812   ++begin;
    813 
    814   // Remove an optional ud-suffix.
    815   if (end[-1] != '\'') {
    816     const char *UDSuffixEnd = end;
    817     do {
    818       --end;
    819     } while (end[-1] != '\'');
    820     UDSuffixBuf.assign(end, UDSuffixEnd);
    821     UDSuffixOffset = end - TokBegin;
    822   }
    823 
    824   // Trim the ending quote.
    825   assert(end != begin && "Invalid token lexed");
    826   --end;
    827 
    828   // FIXME: The "Value" is an uint64_t so we can handle char literals of
    829   // up to 64-bits.
    830   // FIXME: This extensively assumes that 'char' is 8-bits.
    831   assert(PP.getTargetInfo().getCharWidth() == 8 &&
    832          "Assumes char is 8 bits");
    833   assert(PP.getTargetInfo().getIntWidth() <= 64 &&
    834          (PP.getTargetInfo().getIntWidth() & 7) == 0 &&
    835          "Assumes sizeof(int) on target is <= 64 and a multiple of char");
    836   assert(PP.getTargetInfo().getWCharWidth() <= 64 &&
    837          "Assumes sizeof(wchar) on target is <= 64");
    838 
    839   SmallVector<uint32_t,4> codepoint_buffer;
    840   codepoint_buffer.resize(end-begin);
    841   uint32_t *buffer_begin = &codepoint_buffer.front();
    842   uint32_t *buffer_end = buffer_begin + codepoint_buffer.size();
    843 
    844   // Unicode escapes representing characters that cannot be correctly
    845   // represented in a single code unit are disallowed in character literals
    846   // by this implementation.
    847   uint32_t largest_character_for_kind;
    848   if (tok::wide_char_constant == Kind) {
    849     largest_character_for_kind = 0xFFFFFFFFu >> (32-PP.getTargetInfo().getWCharWidth());
    850   } else if (tok::utf16_char_constant == Kind) {
    851     largest_character_for_kind = 0xFFFF;
    852   } else if (tok::utf32_char_constant == Kind) {
    853     largest_character_for_kind = 0x10FFFF;
    854   } else {
    855     largest_character_for_kind = 0x7Fu;
    856   }
    857 
    858   while (begin!=end) {
    859     // Is this a span of non-escape characters?
    860     if (begin[0] != '\\') {
    861       char const *start = begin;
    862       do {
    863         ++begin;
    864       } while (begin != end && *begin != '\\');
    865 
    866       char const *tmp_in_start = start;
    867       uint32_t *tmp_out_start = buffer_begin;
    868       ConversionResult res =
    869       ConvertUTF8toUTF32(reinterpret_cast<UTF8 const **>(&start),
    870                          reinterpret_cast<UTF8 const *>(begin),
    871                          &buffer_begin,buffer_end,strictConversion);
    872       if (res!=conversionOK) {
    873         // If we see bad encoding for unprefixed character literals, warn and
    874         // simply copy the byte values, for compatibility with gcc and
    875         // older versions of clang.
    876         bool NoErrorOnBadEncoding = isAscii();
    877         unsigned Msg = diag::err_bad_character_encoding;
    878         if (NoErrorOnBadEncoding)
    879           Msg = diag::warn_bad_character_encoding;
    880         PP.Diag(Loc, Msg);
    881         if (NoErrorOnBadEncoding) {
    882           start = tmp_in_start;
    883           buffer_begin = tmp_out_start;
    884           for ( ; start != begin; ++start, ++buffer_begin)
    885             *buffer_begin = static_cast<uint8_t>(*start);
    886         } else {
    887           HadError = true;
    888         }
    889       } else {
    890         for (; tmp_out_start <buffer_begin; ++tmp_out_start) {
    891           if (*tmp_out_start > largest_character_for_kind) {
    892             HadError = true;
    893             PP.Diag(Loc, diag::err_character_too_large);
    894           }
    895         }
    896       }
    897 
    898       continue;
    899     }
    900     // Is this a Universal Character Name excape?
    901     if (begin[1] == 'u' || begin[1] == 'U') {
    902       unsigned short UcnLen = 0;
    903       if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen,
    904                             FullSourceLoc(Loc, PP.getSourceManager()),
    905                             &PP.getDiagnostics(), PP.getLangOpts(),
    906                             true))
    907       {
    908         HadError = true;
    909       } else if (*buffer_begin > largest_character_for_kind) {
    910         HadError = true;
    911         PP.Diag(Loc,diag::err_character_too_large);
    912       }
    913 
    914       ++buffer_begin;
    915       continue;
    916     }
    917     unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
    918     uint64_t result =
    919     ProcessCharEscape(begin, end, HadError,
    920                       FullSourceLoc(Loc,PP.getSourceManager()),
    921                       CharWidth, &PP.getDiagnostics());
    922     *buffer_begin++ = result;
    923   }
    924 
    925   unsigned NumCharsSoFar = buffer_begin-&codepoint_buffer.front();
    926 
    927   if (NumCharsSoFar > 1) {
    928     if (isWide())
    929       PP.Diag(Loc, diag::warn_extraneous_char_constant);
    930     else if (isAscii() && NumCharsSoFar == 4)
    931       PP.Diag(Loc, diag::ext_four_char_character_literal);
    932     else if (isAscii())
    933       PP.Diag(Loc, diag::ext_multichar_character_literal);
    934     else
    935       PP.Diag(Loc, diag::err_multichar_utf_character_literal);
    936     IsMultiChar = true;
    937   } else
    938     IsMultiChar = false;
    939 
    940   llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);
    941 
    942   // Narrow character literals act as though their value is concatenated
    943   // in this implementation, but warn on overflow.
    944   bool multi_char_too_long = false;
    945   if (isAscii() && isMultiChar()) {
    946     LitVal = 0;
    947     for (size_t i=0;i<NumCharsSoFar;++i) {
    948       // check for enough leading zeros to shift into
    949       multi_char_too_long |= (LitVal.countLeadingZeros() < 8);
    950       LitVal <<= 8;
    951       LitVal = LitVal + (codepoint_buffer[i] & 0xFF);
    952     }
    953   } else if (NumCharsSoFar > 0) {
    954     // otherwise just take the last character
    955     LitVal = buffer_begin[-1];
    956   }
    957 
    958   if (!HadError && multi_char_too_long) {
    959     PP.Diag(Loc,diag::warn_char_constant_too_large);
    960   }
    961 
    962   // Transfer the value from APInt to uint64_t
    963   Value = LitVal.getZExtValue();
    964 
    965   // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
    966   // if 'char' is signed for this target (C99 6.4.4.4p10).  Note that multiple
    967   // character constants are not sign extended in the this implementation:
    968   // '\xFF\xFF' = 65536 and '\x0\xFF' = 255, which matches GCC.
    969   if (isAscii() && NumCharsSoFar == 1 && (Value & 128) &&
    970       PP.getLangOpts().CharIsSigned)
    971     Value = (signed char)Value;
    972 }
    973 
    974 
    ///       string-literal: [C++11 lex.string]
    ///         [encoding-prefix] " [s-char-sequence] "
    ///         [encoding-prefix] R raw-string
    978 ///       encoding-prefix:
    979 ///         u8
    980 ///         u
    981 ///         U
    982 ///         L
    983 ///       s-char-sequence:
    984 ///         s-char
    985 ///         s-char-sequence s-char
    986 ///       s-char:
    987 ///         any member of the source character set except the double-quote ",
    988 ///           backslash \, or new-line character
    989 ///         escape-sequence
    990 ///         universal-character-name
    991 ///       raw-string:
    992 ///         " d-char-sequence ( r-char-sequence ) d-char-sequence "
    993 ///       r-char-sequence:
    994 ///         r-char
    995 ///         r-char-sequence r-char
    996 ///       r-char:
    997 ///         any member of the source character set, except a right parenthesis )
    998 ///           followed by the initial d-char-sequence (which may be empty)
    999 ///           followed by a double quote ".
   1000 ///       d-char-sequence:
   1001 ///         d-char
   1002 ///         d-char-sequence d-char
   1003 ///       d-char:
   1004 ///         any member of the basic source character set except:
   1005 ///           space, the left parenthesis (, the right parenthesis ),
   1006 ///           the backslash \, and the control characters representing horizontal
   1007 ///           tab, vertical tab, form feed, and newline.
   ///       escape-sequence: [C++11 lex.ccon]
   1009 ///         simple-escape-sequence
   1010 ///         octal-escape-sequence
   1011 ///         hexadecimal-escape-sequence
   1012 ///       simple-escape-sequence:
   1013 ///         one of \' \" \? \\ \a \b \f \n \r \t \v
   1014 ///       octal-escape-sequence:
   1015 ///         \ octal-digit
   1016 ///         \ octal-digit octal-digit
   1017 ///         \ octal-digit octal-digit octal-digit
   1018 ///       hexadecimal-escape-sequence:
   1019 ///         \x hexadecimal-digit
   1020 ///         hexadecimal-escape-sequence hexadecimal-digit
   1021 ///       universal-character-name:
   1022 ///         \u hex-quad
   1023 ///         \U hex-quad hex-quad
   1024 ///       hex-quad:
   1025 ///         hex-digit hex-digit hex-digit hex-digit
   1026 ///
   1027 StringLiteralParser::
   1028 StringLiteralParser(const Token *StringToks, unsigned NumStringToks,
   1029                     Preprocessor &PP, bool Complain)
   1030   : SM(PP.getSourceManager()), Features(PP.getLangOpts()),
   1031     Target(PP.getTargetInfo()), Diags(Complain ? &PP.getDiagnostics() : 0),
   1032     MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
   1033     ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) {
   1034   init(StringToks, NumStringToks);
   1035 }
   1036 
   1037 void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
   1038   // The literal token may have come from an invalid source location (e.g. due
   1039   // to a PCH error), in which case the token length will be 0.
   1040   if (NumStringToks == 0 || StringToks[0].getLength() < 2) {
   1041     hadError = true;
   1042     return;
   1043   }
   1044 
   1045   // Scan all of the string portions, remember the max individual token length,
   1046   // computing a bound on the concatenated string length, and see whether any
   1047   // piece is a wide-string.  If any of the string portions is a wide-string
   1048   // literal, the result is a wide-string literal [C99 6.4.5p4].
   1049   assert(NumStringToks && "expected at least one token");
   1050   MaxTokenLength = StringToks[0].getLength();
   1051   assert(StringToks[0].getLength() >= 2 && "literal token is invalid!");
   1052   SizeBound = StringToks[0].getLength()-2;  // -2 for "".
   1053   Kind = StringToks[0].getKind();
   1054 
   1055   hadError = false;
   1056 
   1057   // Implement Translation Phase #6: concatenation of string literals
   1058   /// (C99 5.1.1.2p1).  The common case is only one string fragment.
   1059   for (unsigned i = 1; i != NumStringToks; ++i) {
   1060     if (StringToks[i].getLength() < 2) {
   1061       hadError = true;
   1062       return;
   1063     }
   1064 
   1065     // The string could be shorter than this if it needs cleaning, but this is a
   1066     // reasonable bound, which is all we need.
   1067     assert(StringToks[i].getLength() >= 2 && "literal token is invalid!");
   1068     SizeBound += StringToks[i].getLength()-2;  // -2 for "".
   1069 
   1070     // Remember maximum string piece length.
   1071     if (StringToks[i].getLength() > MaxTokenLength)
   1072       MaxTokenLength = StringToks[i].getLength();
   1073 
   1074     // Remember if we see any wide or utf-8/16/32 strings.
   1075     // Also check for illegal concatenations.
   1076     if (StringToks[i].isNot(Kind) && StringToks[i].isNot(tok::string_literal)) {
   1077       if (isAscii()) {
   1078         Kind = StringToks[i].getKind();
   1079       } else {
   1080         if (Diags)
   1081           Diags->Report(FullSourceLoc(StringToks[i].getLocation(), SM),
   1082                         diag::err_unsupported_string_concat);
   1083         hadError = true;
   1084       }
   1085     }
   1086   }
   1087 
   1088   // Include space for the null terminator.
   1089   ++SizeBound;
   1090 
   1091   // TODO: K&R warning: "traditional C rejects string constant concatenation"
   1092 
   1093   // Get the width in bytes of char/wchar_t/char16_t/char32_t
   1094   CharByteWidth = getCharWidth(Kind, Target);
   1095   assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple");
   1096   CharByteWidth /= 8;
   1097 
   1098   // The output buffer size needs to be large enough to hold wide characters.
   1099   // This is a worst-case assumption which basically corresponds to L"" "long".
   1100   SizeBound *= CharByteWidth;
   1101 
   1102   // Size the temporary buffer to hold the result string data.
   1103   ResultBuf.resize(SizeBound);
   1104 
   1105   // Likewise, but for each string piece.
   1106   SmallString<512> TokenBuf;
   1107   TokenBuf.resize(MaxTokenLength);
   1108 
   1109   // Loop over all the strings, getting their spelling, and expanding them to
   1110   // wide strings as appropriate.
   1111   ResultPtr = &ResultBuf[0];   // Next byte to fill in.
   1112 
   1113   Pascal = false;
   1114 
   1115   SourceLocation UDSuffixTokLoc;
   1116 
   1117   for (unsigned i = 0, e = NumStringToks; i != e; ++i) {
   1118     const char *ThisTokBuf = &TokenBuf[0];
   1119     // Get the spelling of the token, which eliminates trigraphs, etc.  We know
   1120     // that ThisTokBuf points to a buffer that is big enough for the whole token
   1121     // and 'spelled' tokens can only shrink.
   1122     bool StringInvalid = false;
   1123     unsigned ThisTokLen =
   1124       Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features,
   1125                          &StringInvalid);
   1126     if (StringInvalid) {
   1127       hadError = true;
   1128       continue;
   1129     }
   1130 
   1131     const char *ThisTokBegin = ThisTokBuf;
   1132     const char *ThisTokEnd = ThisTokBuf+ThisTokLen;
   1133 
   1134     // Remove an optional ud-suffix.
   1135     if (ThisTokEnd[-1] != '"') {
   1136       const char *UDSuffixEnd = ThisTokEnd;
   1137       do {
   1138         --ThisTokEnd;
   1139       } while (ThisTokEnd[-1] != '"');
   1140 
   1141       StringRef UDSuffix(ThisTokEnd, UDSuffixEnd - ThisTokEnd);
   1142 
   1143       if (UDSuffixBuf.empty()) {
   1144         UDSuffixBuf.assign(UDSuffix);
   1145         UDSuffixToken = i;
   1146         UDSuffixOffset = ThisTokEnd - ThisTokBuf;
   1147         UDSuffixTokLoc = StringToks[i].getLocation();
   1148       } else if (!UDSuffixBuf.equals(UDSuffix)) {
   1149         // C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the
   1150         // result of a concatenation involving at least one user-defined-string-
   1151         // literal, all the participating user-defined-string-literals shall
   1152         // have the same ud-suffix.
   1153         if (Diags) {
   1154           SourceLocation TokLoc = StringToks[i].getLocation();
   1155           Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix)
   1156             << UDSuffixBuf << UDSuffix
   1157             << SourceRange(UDSuffixTokLoc, UDSuffixTokLoc)
   1158             << SourceRange(TokLoc, TokLoc);
   1159         }
   1160         hadError = true;
   1161       }
   1162     }
   1163 
   1164     // Strip the end quote.
   1165     --ThisTokEnd;
   1166 
   1167     // TODO: Input character set mapping support.
   1168 
   1169     // Skip marker for wide or unicode strings.
   1170     if (ThisTokBuf[0] == 'L' || ThisTokBuf[0] == 'u' || ThisTokBuf[0] == 'U') {
   1171       ++ThisTokBuf;
   1172       // Skip 8 of u8 marker for utf8 strings.
   1173       if (ThisTokBuf[0] == '8')
   1174         ++ThisTokBuf;
   1175     }
   1176 
   1177     // Check for raw string
   1178     if (ThisTokBuf[0] == 'R') {
   1179       ThisTokBuf += 2; // skip R"
   1180 
   1181       const char *Prefix = ThisTokBuf;
   1182       while (ThisTokBuf[0] != '(')
   1183         ++ThisTokBuf;
   1184       ++ThisTokBuf; // skip '('
   1185 
   1186       // Remove same number of characters from the end
   1187       ThisTokEnd -= ThisTokBuf - Prefix;
   1188       assert(ThisTokEnd >= ThisTokBuf && "malformed raw string literal");
   1189 
   1190       // Copy the string over
   1191       if (CopyStringFragment(StringRef(ThisTokBuf, ThisTokEnd - ThisTokBuf)))
   1192         if (DiagnoseBadString(StringToks[i]))
   1193           hadError = true;
   1194     } else {
   1195       assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?");
   1196       ++ThisTokBuf; // skip "
   1197 
   1198       // Check if this is a pascal string
   1199       if (Features.PascalStrings && ThisTokBuf + 1 != ThisTokEnd &&
   1200           ThisTokBuf[0] == '\\' && ThisTokBuf[1] == 'p') {
   1201 
   1202         // If the \p sequence is found in the first token, we have a pascal string
   1203         // Otherwise, if we already have a pascal string, ignore the first \p
   1204         if (i == 0) {
   1205           ++ThisTokBuf;
   1206           Pascal = true;
   1207         } else if (Pascal)
   1208           ThisTokBuf += 2;
   1209       }
   1210 
   1211       while (ThisTokBuf != ThisTokEnd) {
   1212         // Is this a span of non-escape characters?
   1213         if (ThisTokBuf[0] != '\\') {
   1214           const char *InStart = ThisTokBuf;
   1215           do {
   1216             ++ThisTokBuf;
   1217           } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
   1218 
   1219           // Copy the character span over.
   1220           if (CopyStringFragment(StringRef(InStart, ThisTokBuf - InStart)))
   1221             if (DiagnoseBadString(StringToks[i]))
   1222               hadError = true;
   1223           continue;
   1224         }
   1225         // Is this a Universal Character Name escape?
   1226         if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
   1227           EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
   1228                           ResultPtr, hadError,
   1229                           FullSourceLoc(StringToks[i].getLocation(), SM),
   1230                           CharByteWidth, Diags, Features);
   1231           continue;
   1232         }
   1233         // Otherwise, this is a non-UCN escape character.  Process it.
   1234         unsigned ResultChar =
   1235           ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError,
   1236                             FullSourceLoc(StringToks[i].getLocation(), SM),
   1237                             CharByteWidth*8, Diags);
   1238 
   1239         if (CharByteWidth == 4) {
   1240           // FIXME: Make the type of the result buffer correct instead of
   1241           // using reinterpret_cast.
   1242           UTF32 *ResultWidePtr = reinterpret_cast<UTF32*>(ResultPtr);
   1243           *ResultWidePtr = ResultChar;
   1244           ResultPtr += 4;
   1245         } else if (CharByteWidth == 2) {
   1246           // FIXME: Make the type of the result buffer correct instead of
   1247           // using reinterpret_cast.
   1248           UTF16 *ResultWidePtr = reinterpret_cast<UTF16*>(ResultPtr);
   1249           *ResultWidePtr = ResultChar & 0xFFFF;
   1250           ResultPtr += 2;
   1251         } else {
   1252           assert(CharByteWidth == 1 && "Unexpected char width");
   1253           *ResultPtr++ = ResultChar & 0xFF;
   1254         }
   1255       }
   1256     }
   1257   }
   1258 
   1259   if (Pascal) {
   1260     if (CharByteWidth == 4) {
   1261       // FIXME: Make the type of the result buffer correct instead of
   1262       // using reinterpret_cast.
   1263       UTF32 *ResultWidePtr = reinterpret_cast<UTF32*>(ResultBuf.data());
   1264       ResultWidePtr[0] = GetNumStringChars() - 1;
   1265     } else if (CharByteWidth == 2) {
   1266       // FIXME: Make the type of the result buffer correct instead of
   1267       // using reinterpret_cast.
   1268       UTF16 *ResultWidePtr = reinterpret_cast<UTF16*>(ResultBuf.data());
   1269       ResultWidePtr[0] = GetNumStringChars() - 1;
   1270     } else {
   1271       assert(CharByteWidth == 1 && "Unexpected char width");
   1272       ResultBuf[0] = GetNumStringChars() - 1;
   1273     }
   1274 
   1275     // Verify that pascal strings aren't too large.
   1276     if (GetStringLength() > 256) {
   1277       if (Diags)
   1278         Diags->Report(FullSourceLoc(StringToks[0].getLocation(), SM),
   1279                       diag::err_pascal_string_too_long)
   1280           << SourceRange(StringToks[0].getLocation(),
   1281                          StringToks[NumStringToks-1].getLocation());
   1282       hadError = true;
   1283       return;
   1284     }
   1285   } else if (Diags) {
   1286     // Complain if this string literal has too many characters.
   1287     unsigned MaxChars = Features.CPlusPlus? 65536 : Features.C99 ? 4095 : 509;
   1288 
   1289     if (GetNumStringChars() > MaxChars)
   1290       Diags->Report(FullSourceLoc(StringToks[0].getLocation(), SM),
   1291                     diag::ext_string_too_long)
   1292         << GetNumStringChars() << MaxChars
   1293         << (Features.CPlusPlus ? 2 : Features.C99 ? 1 : 0)
   1294         << SourceRange(StringToks[0].getLocation(),
   1295                        StringToks[NumStringToks-1].getLocation());
   1296   }
   1297 }
   1298 
   1299 
   1300 /// copyStringFragment - This function copies from Start to End into ResultPtr.
   1301 /// Performs widening for multi-byte characters.
   1302 bool StringLiteralParser::CopyStringFragment(StringRef Fragment) {
   1303   assert(CharByteWidth==1 || CharByteWidth==2 || CharByteWidth==4);
   1304   ConversionResult result = conversionOK;
   1305   // Copy the character span over.
   1306   if (CharByteWidth == 1) {
   1307     if (!isLegalUTF8String(reinterpret_cast<const UTF8*>(Fragment.begin()),
   1308                            reinterpret_cast<const UTF8*>(Fragment.end())))
   1309       result = sourceIllegal;
   1310     memcpy(ResultPtr, Fragment.data(), Fragment.size());
   1311     ResultPtr += Fragment.size();
   1312   } else if (CharByteWidth == 2) {
   1313     UTF8 const *sourceStart = (UTF8 const *)Fragment.data();
   1314     // FIXME: Make the type of the result buffer correct instead of
   1315     // using reinterpret_cast.
   1316     UTF16 *targetStart = reinterpret_cast<UTF16*>(ResultPtr);
   1317     ConversionFlags flags = strictConversion;
   1318     result = ConvertUTF8toUTF16(
   1319 	    &sourceStart,sourceStart + Fragment.size(),
   1320         &targetStart,targetStart + 2*Fragment.size(),flags);
   1321     if (result==conversionOK)
   1322       ResultPtr = reinterpret_cast<char*>(targetStart);
   1323   } else if (CharByteWidth == 4) {
   1324     UTF8 const *sourceStart = (UTF8 const *)Fragment.data();
   1325     // FIXME: Make the type of the result buffer correct instead of
   1326     // using reinterpret_cast.
   1327     UTF32 *targetStart = reinterpret_cast<UTF32*>(ResultPtr);
   1328     ConversionFlags flags = strictConversion;
   1329     result = ConvertUTF8toUTF32(
   1330         &sourceStart,sourceStart + Fragment.size(),
   1331         &targetStart,targetStart + 4*Fragment.size(),flags);
   1332     if (result==conversionOK)
   1333       ResultPtr = reinterpret_cast<char*>(targetStart);
   1334   }
   1335   assert((result != targetExhausted)
   1336          && "ConvertUTF8toUTFXX exhausted target buffer");
   1337   return result != conversionOK;
   1338 }
   1339 
   1340 bool StringLiteralParser::DiagnoseBadString(const Token &Tok) {
   1341   // If we see bad encoding for unprefixed string literals, warn and
   1342   // simply copy the byte values, for compatibility with gcc and older
   1343   // versions of clang.
   1344   bool NoErrorOnBadEncoding = isAscii();
   1345   unsigned Msg = NoErrorOnBadEncoding ? diag::warn_bad_string_encoding :
   1346                                         diag::err_bad_string_encoding;
   1347   if (Diags)
   1348     Diags->Report(FullSourceLoc(Tok.getLocation(), SM), Msg);
   1349   return !NoErrorOnBadEncoding;
   1350 }
   1351 
   1352 /// getOffsetOfStringByte - This function returns the offset of the
   1353 /// specified byte of the string data represented by Token.  This handles
   1354 /// advancing over escape sequences in the string.
   1355 unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
   1356                                                     unsigned ByteNo) const {
   1357   // Get the spelling of the token.
   1358   SmallString<32> SpellingBuffer;
   1359   SpellingBuffer.resize(Tok.getLength());
   1360 
   1361   bool StringInvalid = false;
   1362   const char *SpellingPtr = &SpellingBuffer[0];
   1363   unsigned TokLen = Lexer::getSpelling(Tok, SpellingPtr, SM, Features,
   1364                                        &StringInvalid);
   1365   if (StringInvalid)
   1366     return 0;
   1367 
   1368   assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&
   1369          SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet");
   1370 
   1371 
   1372   const char *SpellingStart = SpellingPtr;
   1373   const char *SpellingEnd = SpellingPtr+TokLen;
   1374 
   1375   // Skip over the leading quote.
   1376   assert(SpellingPtr[0] == '"' && "Should be a string literal!");
   1377   ++SpellingPtr;
   1378 
   1379   // Skip over bytes until we find the offset we're looking for.
   1380   while (ByteNo) {
   1381     assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!");
   1382 
   1383     // Step over non-escapes simply.
   1384     if (*SpellingPtr != '\\') {
   1385       ++SpellingPtr;
   1386       --ByteNo;
   1387       continue;
   1388     }
   1389 
   1390     // Otherwise, this is an escape character.  Advance over it.
   1391     bool HadError = false;
   1392     ProcessCharEscape(SpellingPtr, SpellingEnd, HadError,
   1393                       FullSourceLoc(Tok.getLocation(), SM),
   1394                       CharByteWidth*8, Diags);
   1395     assert(!HadError && "This method isn't valid on erroneous strings");
   1396     --ByteNo;
   1397   }
   1398 
   1399   return SpellingPtr-SpellingStart;
   1400 }
   1401