Home | History | Annotate | Download | only in Support
      1 //===-- StringRef.cpp - Lightweight String References ---------------------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 
     10 #include "llvm/ADT/StringRef.h"
     11 #include "llvm/ADT/APInt.h"
     12 #include "llvm/ADT/Hashing.h"
     13 #include "llvm/ADT/edit_distance.h"
     14 #include <bitset>
     15 
     16 using namespace llvm;
     17 
     18 // MSVC emits references to this into the translation units which reference it.
     19 #ifndef _MSC_VER
     20 const size_t StringRef::npos;
     21 #endif
     22 
     23 static char ascii_tolower(char x) {
     24   if (x >= 'A' && x <= 'Z')
     25     return x - 'A' + 'a';
     26   return x;
     27 }
     28 
     29 static char ascii_toupper(char x) {
     30   if (x >= 'a' && x <= 'z')
     31     return x - 'a' + 'A';
     32   return x;
     33 }
     34 
     35 static bool ascii_isdigit(char x) {
     36   return x >= '0' && x <= '9';
     37 }
     38 
     39 // strncasecmp() is not available on non-POSIX systems, so define an
     40 // alternative function here.
     41 static int ascii_strncasecmp(const char *LHS, const char *RHS, size_t Length) {
     42   for (size_t I = 0; I < Length; ++I) {
     43     unsigned char LHC = ascii_tolower(LHS[I]);
     44     unsigned char RHC = ascii_tolower(RHS[I]);
     45     if (LHC != RHC)
     46       return LHC < RHC ? -1 : 1;
     47   }
     48   return 0;
     49 }
     50 
     51 /// compare_lower - Compare strings, ignoring case.
     52 int StringRef::compare_lower(StringRef RHS) const {
     53   if (int Res = ascii_strncasecmp(Data, RHS.Data, std::min(Length, RHS.Length)))
     54     return Res;
     55   if (Length == RHS.Length)
     56     return 0;
     57   return Length < RHS.Length ? -1 : 1;
     58 }
     59 
     60 /// Check if this string starts with the given \p Prefix, ignoring case.
     61 bool StringRef::startswith_lower(StringRef Prefix) const {
     62   return Length >= Prefix.Length &&
     63       ascii_strncasecmp(Data, Prefix.Data, Prefix.Length) == 0;
     64 }
     65 
     66 /// Check if this string ends with the given \p Suffix, ignoring case.
     67 bool StringRef::endswith_lower(StringRef Suffix) const {
     68   return Length >= Suffix.Length &&
     69       ascii_strncasecmp(end() - Suffix.Length, Suffix.Data, Suffix.Length) == 0;
     70 }
     71 
     72 size_t StringRef::find_lower(char C, size_t From) const {
     73   char L = ascii_tolower(C);
     74   return find_if([L](char D) { return ascii_tolower(D) == L; }, From);
     75 }
     76 
     77 /// compare_numeric - Compare strings, handle embedded numbers.
     78 int StringRef::compare_numeric(StringRef RHS) const {
     79   for (size_t I = 0, E = std::min(Length, RHS.Length); I != E; ++I) {
     80     // Check for sequences of digits.
     81     if (ascii_isdigit(Data[I]) && ascii_isdigit(RHS.Data[I])) {
     82       // The longer sequence of numbers is considered larger.
     83       // This doesn't really handle prefixed zeros well.
     84       size_t J;
     85       for (J = I + 1; J != E + 1; ++J) {
     86         bool ld = J < Length && ascii_isdigit(Data[J]);
     87         bool rd = J < RHS.Length && ascii_isdigit(RHS.Data[J]);
     88         if (ld != rd)
     89           return rd ? -1 : 1;
     90         if (!rd)
     91           break;
     92       }
     93       // The two number sequences have the same length (J-I), just memcmp them.
     94       if (int Res = compareMemory(Data + I, RHS.Data + I, J - I))
     95         return Res < 0 ? -1 : 1;
     96       // Identical number sequences, continue search after the numbers.
     97       I = J - 1;
     98       continue;
     99     }
    100     if (Data[I] != RHS.Data[I])
    101       return (unsigned char)Data[I] < (unsigned char)RHS.Data[I] ? -1 : 1;
    102   }
    103   if (Length == RHS.Length)
    104     return 0;
    105   return Length < RHS.Length ? -1 : 1;
    106 }
    107 
    108 // Compute the edit distance between the two given strings.
    109 unsigned StringRef::edit_distance(llvm::StringRef Other,
    110                                   bool AllowReplacements,
    111                                   unsigned MaxEditDistance) const {
    112   return llvm::ComputeEditDistance(
    113       makeArrayRef(data(), size()),
    114       makeArrayRef(Other.data(), Other.size()),
    115       AllowReplacements, MaxEditDistance);
    116 }
    117 
    118 //===----------------------------------------------------------------------===//
    119 // String Operations
    120 //===----------------------------------------------------------------------===//
    121 
    122 std::string StringRef::lower() const {
    123   std::string Result(size(), char());
    124   for (size_type i = 0, e = size(); i != e; ++i) {
    125     Result[i] = ascii_tolower(Data[i]);
    126   }
    127   return Result;
    128 }
    129 
    130 std::string StringRef::upper() const {
    131   std::string Result(size(), char());
    132   for (size_type i = 0, e = size(); i != e; ++i) {
    133     Result[i] = ascii_toupper(Data[i]);
    134   }
    135   return Result;
    136 }
    137 
    138 //===----------------------------------------------------------------------===//
    139 // String Searching
    140 //===----------------------------------------------------------------------===//
    141 
    142 
    143 /// find - Search for the first string \arg Str in the string.
    144 ///
    145 /// \return - The index of the first occurrence of \arg Str, or npos if not
    146 /// found.
    147 size_t StringRef::find(StringRef Str, size_t From) const {
    148   if (From > Length)
    149     return npos;
    150 
    151   const char *Start = Data + From;
    152   size_t Size = Length - From;
    153 
    154   const char *Needle = Str.data();
    155   size_t N = Str.size();
    156   if (N == 0)
    157     return From;
    158   if (Size < N)
    159     return npos;
    160   if (N == 1) {
    161     const char *Ptr = (const char *)::memchr(Start, Needle[0], Size);
    162     return Ptr == nullptr ? npos : Ptr - Data;
    163   }
    164 
    165   const char *Stop = Start + (Size - N + 1);
    166 
    167   // For short haystacks or unsupported needles fall back to the naive algorithm
    168   if (Size < 16 || N > 255) {
    169     do {
    170       if (std::memcmp(Start, Needle, N) == 0)
    171         return Start - Data;
    172       ++Start;
    173     } while (Start < Stop);
    174     return npos;
    175   }
    176 
    177   // Build the bad char heuristic table, with uint8_t to reduce cache thrashing.
    178   uint8_t BadCharSkip[256];
    179   std::memset(BadCharSkip, N, 256);
    180   for (unsigned i = 0; i != N-1; ++i)
    181     BadCharSkip[(uint8_t)Str[i]] = N-1-i;
    182 
    183   do {
    184     uint8_t Last = Start[N - 1];
    185     if (LLVM_UNLIKELY(Last == (uint8_t)Needle[N - 1]))
    186       if (std::memcmp(Start, Needle, N - 1) == 0)
    187         return Start - Data;
    188 
    189     // Otherwise skip the appropriate number of bytes.
    190     Start += BadCharSkip[Last];
    191   } while (Start < Stop);
    192 
    193   return npos;
    194 }
    195 
    196 size_t StringRef::find_lower(StringRef Str, size_t From) const {
    197   StringRef This = substr(From);
    198   while (This.size() >= Str.size()) {
    199     if (This.startswith_lower(Str))
    200       return From;
    201     This = This.drop_front();
    202     ++From;
    203   }
    204   return npos;
    205 }
    206 
    207 size_t StringRef::rfind_lower(char C, size_t From) const {
    208   From = std::min(From, Length);
    209   size_t i = From;
    210   while (i != 0) {
    211     --i;
    212     if (ascii_tolower(Data[i]) == ascii_tolower(C))
    213       return i;
    214   }
    215   return npos;
    216 }
    217 
    218 /// rfind - Search for the last string \arg Str in the string.
    219 ///
    220 /// \return - The index of the last occurrence of \arg Str, or npos if not
    221 /// found.
    222 size_t StringRef::rfind(StringRef Str) const {
    223   size_t N = Str.size();
    224   if (N > Length)
    225     return npos;
    226   for (size_t i = Length - N + 1, e = 0; i != e;) {
    227     --i;
    228     if (substr(i, N).equals(Str))
    229       return i;
    230   }
    231   return npos;
    232 }
    233 
    234 size_t StringRef::rfind_lower(StringRef Str) const {
    235   size_t N = Str.size();
    236   if (N > Length)
    237     return npos;
    238   for (size_t i = Length - N + 1, e = 0; i != e;) {
    239     --i;
    240     if (substr(i, N).equals_lower(Str))
    241       return i;
    242   }
    243   return npos;
    244 }
    245 
    246 /// find_first_of - Find the first character in the string that is in \arg
    247 /// Chars, or npos if not found.
    248 ///
    249 /// Note: O(size() + Chars.size())
    250 StringRef::size_type StringRef::find_first_of(StringRef Chars,
    251                                               size_t From) const {
    252   std::bitset<1 << CHAR_BIT> CharBits;
    253   for (size_type i = 0; i != Chars.size(); ++i)
    254     CharBits.set((unsigned char)Chars[i]);
    255 
    256   for (size_type i = std::min(From, Length), e = Length; i != e; ++i)
    257     if (CharBits.test((unsigned char)Data[i]))
    258       return i;
    259   return npos;
    260 }
    261 
    262 /// find_first_not_of - Find the first character in the string that is not
    263 /// \arg C or npos if not found.
    264 StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const {
    265   for (size_type i = std::min(From, Length), e = Length; i != e; ++i)
    266     if (Data[i] != C)
    267       return i;
    268   return npos;
    269 }
    270 
    271 /// find_first_not_of - Find the first character in the string that is not
    272 /// in the string \arg Chars, or npos if not found.
    273 ///
    274 /// Note: O(size() + Chars.size())
    275 StringRef::size_type StringRef::find_first_not_of(StringRef Chars,
    276                                                   size_t From) const {
    277   std::bitset<1 << CHAR_BIT> CharBits;
    278   for (size_type i = 0; i != Chars.size(); ++i)
    279     CharBits.set((unsigned char)Chars[i]);
    280 
    281   for (size_type i = std::min(From, Length), e = Length; i != e; ++i)
    282     if (!CharBits.test((unsigned char)Data[i]))
    283       return i;
    284   return npos;
    285 }
    286 
    287 /// find_last_of - Find the last character in the string that is in \arg C,
    288 /// or npos if not found.
    289 ///
    290 /// Note: O(size() + Chars.size())
    291 StringRef::size_type StringRef::find_last_of(StringRef Chars,
    292                                              size_t From) const {
    293   std::bitset<1 << CHAR_BIT> CharBits;
    294   for (size_type i = 0; i != Chars.size(); ++i)
    295     CharBits.set((unsigned char)Chars[i]);
    296 
    297   for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i)
    298     if (CharBits.test((unsigned char)Data[i]))
    299       return i;
    300   return npos;
    301 }
    302 
    303 /// find_last_not_of - Find the last character in the string that is not
    304 /// \arg C, or npos if not found.
    305 StringRef::size_type StringRef::find_last_not_of(char C, size_t From) const {
    306   for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i)
    307     if (Data[i] != C)
    308       return i;
    309   return npos;
    310 }
    311 
    312 /// find_last_not_of - Find the last character in the string that is not in
    313 /// \arg Chars, or npos if not found.
    314 ///
    315 /// Note: O(size() + Chars.size())
    316 StringRef::size_type StringRef::find_last_not_of(StringRef Chars,
    317                                                  size_t From) const {
    318   std::bitset<1 << CHAR_BIT> CharBits;
    319   for (size_type i = 0, e = Chars.size(); i != e; ++i)
    320     CharBits.set((unsigned char)Chars[i]);
    321 
    322   for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i)
    323     if (!CharBits.test((unsigned char)Data[i]))
    324       return i;
    325   return npos;
    326 }
    327 
    328 void StringRef::split(SmallVectorImpl<StringRef> &A,
    329                       StringRef Separator, int MaxSplit,
    330                       bool KeepEmpty) const {
    331   StringRef S = *this;
    332 
    333   // Count down from MaxSplit. When MaxSplit is -1, this will just split
    334   // "forever". This doesn't support splitting more than 2^31 times
    335   // intentionally; if we ever want that we can make MaxSplit a 64-bit integer
    336   // but that seems unlikely to be useful.
    337   while (MaxSplit-- != 0) {
    338     size_t Idx = S.find(Separator);
    339     if (Idx == npos)
    340       break;
    341 
    342     // Push this split.
    343     if (KeepEmpty || Idx > 0)
    344       A.push_back(S.slice(0, Idx));
    345 
    346     // Jump forward.
    347     S = S.slice(Idx + Separator.size(), npos);
    348   }
    349 
    350   // Push the tail.
    351   if (KeepEmpty || !S.empty())
    352     A.push_back(S);
    353 }
    354 
    355 void StringRef::split(SmallVectorImpl<StringRef> &A, char Separator,
    356                       int MaxSplit, bool KeepEmpty) const {
    357   StringRef S = *this;
    358 
    359   // Count down from MaxSplit. When MaxSplit is -1, this will just split
    360   // "forever". This doesn't support splitting more than 2^31 times
    361   // intentionally; if we ever want that we can make MaxSplit a 64-bit integer
    362   // but that seems unlikely to be useful.
    363   while (MaxSplit-- != 0) {
    364     size_t Idx = S.find(Separator);
    365     if (Idx == npos)
    366       break;
    367 
    368     // Push this split.
    369     if (KeepEmpty || Idx > 0)
    370       A.push_back(S.slice(0, Idx));
    371 
    372     // Jump forward.
    373     S = S.slice(Idx + 1, npos);
    374   }
    375 
    376   // Push the tail.
    377   if (KeepEmpty || !S.empty())
    378     A.push_back(S);
    379 }
    380 
    381 //===----------------------------------------------------------------------===//
    382 // Helpful Algorithms
    383 //===----------------------------------------------------------------------===//
    384 
    385 /// count - Return the number of non-overlapped occurrences of \arg Str in
    386 /// the string.
    387 size_t StringRef::count(StringRef Str) const {
    388   size_t Count = 0;
    389   size_t N = Str.size();
    390   if (N > Length)
    391     return 0;
    392   for (size_t i = 0, e = Length - N + 1; i != e; ++i)
    393     if (substr(i, N).equals(Str))
    394       ++Count;
    395   return Count;
    396 }
    397 
    398 static unsigned GetAutoSenseRadix(StringRef &Str) {
    399   if (Str.empty())
    400     return 10;
    401 
    402   if (Str.startswith("0x") || Str.startswith("0X")) {
    403     Str = Str.substr(2);
    404     return 16;
    405   }
    406 
    407   if (Str.startswith("0b") || Str.startswith("0B")) {
    408     Str = Str.substr(2);
    409     return 2;
    410   }
    411 
    412   if (Str.startswith("0o")) {
    413     Str = Str.substr(2);
    414     return 8;
    415   }
    416 
    417   if (Str[0] == '0' && Str.size() > 1 && ascii_isdigit(Str[1])) {
    418     Str = Str.substr(1);
    419     return 8;
    420   }
    421 
    422   return 10;
    423 }
    424 
    425 bool llvm::consumeUnsignedInteger(StringRef &Str, unsigned Radix,
    426                                   unsigned long long &Result) {
    427   // Autosense radix if not specified.
    428   if (Radix == 0)
    429     Radix = GetAutoSenseRadix(Str);
    430 
    431   // Empty strings (after the radix autosense) are invalid.
    432   if (Str.empty()) return true;
    433 
    434   // Parse all the bytes of the string given this radix.  Watch for overflow.
    435   StringRef Str2 = Str;
    436   Result = 0;
    437   while (!Str2.empty()) {
    438     unsigned CharVal;
    439     if (Str2[0] >= '0' && Str2[0] <= '9')
    440       CharVal = Str2[0] - '0';
    441     else if (Str2[0] >= 'a' && Str2[0] <= 'z')
    442       CharVal = Str2[0] - 'a' + 10;
    443     else if (Str2[0] >= 'A' && Str2[0] <= 'Z')
    444       CharVal = Str2[0] - 'A' + 10;
    445     else
    446       break;
    447 
    448     // If the parsed value is larger than the integer radix, we cannot
    449     // consume any more characters.
    450     if (CharVal >= Radix)
    451       break;
    452 
    453     // Add in this character.
    454     unsigned long long PrevResult = Result;
    455     Result = Result * Radix + CharVal;
    456 
    457     // Check for overflow by shifting back and seeing if bits were lost.
    458     if (Result / Radix < PrevResult)
    459       return true;
    460 
    461     Str2 = Str2.substr(1);
    462   }
    463 
    464   // We consider the operation a failure if no characters were consumed
    465   // successfully.
    466   if (Str.size() == Str2.size())
    467     return true;
    468 
    469   Str = Str2;
    470   return false;
    471 }
    472 
    473 bool llvm::consumeSignedInteger(StringRef &Str, unsigned Radix,
    474                                 long long &Result) {
    475   unsigned long long ULLVal;
    476 
    477   // Handle positive strings first.
    478   if (Str.empty() || Str.front() != '-') {
    479     if (consumeUnsignedInteger(Str, Radix, ULLVal) ||
    480         // Check for value so large it overflows a signed value.
    481         (long long)ULLVal < 0)
    482       return true;
    483     Result = ULLVal;
    484     return false;
    485   }
    486 
    487   // Get the positive part of the value.
    488   StringRef Str2 = Str.drop_front(1);
    489   if (consumeUnsignedInteger(Str2, Radix, ULLVal) ||
    490       // Reject values so large they'd overflow as negative signed, but allow
    491       // "-0".  This negates the unsigned so that the negative isn't undefined
    492       // on signed overflow.
    493       (long long)-ULLVal > 0)
    494     return true;
    495 
    496   Str = Str2;
    497   Result = -ULLVal;
    498   return false;
    499 }
    500 
    501 /// GetAsUnsignedInteger - Workhorse method that converts a integer character
    502 /// sequence of radix up to 36 to an unsigned long long value.
    503 bool llvm::getAsUnsignedInteger(StringRef Str, unsigned Radix,
    504                                 unsigned long long &Result) {
    505   if (consumeUnsignedInteger(Str, Radix, Result))
    506     return true;
    507 
    508   // For getAsUnsignedInteger, we require the whole string to be consumed or
    509   // else we consider it a failure.
    510   return !Str.empty();
    511 }
    512 
    513 bool llvm::getAsSignedInteger(StringRef Str, unsigned Radix,
    514                               long long &Result) {
    515   if (consumeSignedInteger(Str, Radix, Result))
    516     return true;
    517 
    518   // For getAsSignedInteger, we require the whole string to be consumed or else
    519   // we consider it a failure.
    520   return !Str.empty();
    521 }
    522 
    523 bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const {
    524   StringRef Str = *this;
    525 
    526   // Autosense radix if not specified.
    527   if (Radix == 0)
    528     Radix = GetAutoSenseRadix(Str);
    529 
    530   assert(Radix > 1 && Radix <= 36);
    531 
    532   // Empty strings (after the radix autosense) are invalid.
    533   if (Str.empty()) return true;
    534 
    535   // Skip leading zeroes.  This can be a significant improvement if
    536   // it means we don't need > 64 bits.
    537   while (!Str.empty() && Str.front() == '0')
    538     Str = Str.substr(1);
    539 
    540   // If it was nothing but zeroes....
    541   if (Str.empty()) {
    542     Result = APInt(64, 0);
    543     return false;
    544   }
    545 
    546   // (Over-)estimate the required number of bits.
    547   unsigned Log2Radix = 0;
    548   while ((1U << Log2Radix) < Radix) Log2Radix++;
    549   bool IsPowerOf2Radix = ((1U << Log2Radix) == Radix);
    550 
    551   unsigned BitWidth = Log2Radix * Str.size();
    552   if (BitWidth < Result.getBitWidth())
    553     BitWidth = Result.getBitWidth(); // don't shrink the result
    554   else if (BitWidth > Result.getBitWidth())
    555     Result = Result.zext(BitWidth);
    556 
    557   APInt RadixAP, CharAP; // unused unless !IsPowerOf2Radix
    558   if (!IsPowerOf2Radix) {
    559     // These must have the same bit-width as Result.
    560     RadixAP = APInt(BitWidth, Radix);
    561     CharAP = APInt(BitWidth, 0);
    562   }
    563 
    564   // Parse all the bytes of the string given this radix.
    565   Result = 0;
    566   while (!Str.empty()) {
    567     unsigned CharVal;
    568     if (Str[0] >= '0' && Str[0] <= '9')
    569       CharVal = Str[0]-'0';
    570     else if (Str[0] >= 'a' && Str[0] <= 'z')
    571       CharVal = Str[0]-'a'+10;
    572     else if (Str[0] >= 'A' && Str[0] <= 'Z')
    573       CharVal = Str[0]-'A'+10;
    574     else
    575       return true;
    576 
    577     // If the parsed value is larger than the integer radix, the string is
    578     // invalid.
    579     if (CharVal >= Radix)
    580       return true;
    581 
    582     // Add in this character.
    583     if (IsPowerOf2Radix) {
    584       Result <<= Log2Radix;
    585       Result |= CharVal;
    586     } else {
    587       Result *= RadixAP;
    588       CharAP = CharVal;
    589       Result += CharAP;
    590     }
    591 
    592     Str = Str.substr(1);
    593   }
    594 
    595   return false;
    596 }
    597 
    598 
    599 // Implementation of StringRef hashing.
    600 hash_code llvm::hash_value(StringRef S) {
    601   return hash_combine_range(S.begin(), S.end());
    602 }
    603