Home | History | Annotate | Download | only in Support
      1 //===-- StringRef.cpp - Lightweight String References ---------------------===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 
     10 #include "llvm/ADT/StringRef.h"
     11 #include "llvm/ADT/APInt.h"
     12 #include "llvm/ADT/OwningPtr.h"
     13 #include <bitset>
     14 
     15 using namespace llvm;
     16 
     17 // MSVC emits references to this into the translation units which reference it.
     18 #ifndef _MSC_VER
     19 const size_t StringRef::npos;
     20 #endif
     21 
     22 static char ascii_tolower(char x) {
     23   if (x >= 'A' && x <= 'Z')
     24     return x - 'A' + 'a';
     25   return x;
     26 }
     27 
     28 static bool ascii_isdigit(char x) {
     29   return x >= '0' && x <= '9';
     30 }
     31 
     32 /// compare_lower - Compare strings, ignoring case.
     33 int StringRef::compare_lower(StringRef RHS) const {
     34   for (size_t I = 0, E = min(Length, RHS.Length); I != E; ++I) {
     35     unsigned char LHC = ascii_tolower(Data[I]);
     36     unsigned char RHC = ascii_tolower(RHS.Data[I]);
     37     if (LHC != RHC)
     38       return LHC < RHC ? -1 : 1;
     39   }
     40 
     41   if (Length == RHS.Length)
     42     return 0;
     43   return Length < RHS.Length ? -1 : 1;
     44 }
     45 
     46 /// compare_numeric - Compare strings, handle embedded numbers.
     47 int StringRef::compare_numeric(StringRef RHS) const {
     48   for (size_t I = 0, E = min(Length, RHS.Length); I != E; ++I) {
     49     // Check for sequences of digits.
     50     if (ascii_isdigit(Data[I]) && ascii_isdigit(RHS.Data[I])) {
     51       // The longer sequence of numbers is considered larger.
     52       // This doesn't really handle prefixed zeros well.
     53       size_t J;
     54       for (J = I + 1; J != E + 1; ++J) {
     55         bool ld = J < Length && ascii_isdigit(Data[J]);
     56         bool rd = J < RHS.Length && ascii_isdigit(RHS.Data[J]);
     57         if (ld != rd)
     58           return rd ? -1 : 1;
     59         if (!rd)
     60           break;
     61       }
     62       // The two number sequences have the same length (J-I), just memcmp them.
     63       if (int Res = compareMemory(Data + I, RHS.Data + I, J - I))
     64         return Res < 0 ? -1 : 1;
     65       // Identical number sequences, continue search after the numbers.
     66       I = J - 1;
     67       continue;
     68     }
     69     if (Data[I] != RHS.Data[I])
     70       return (unsigned char)Data[I] < (unsigned char)RHS.Data[I] ? -1 : 1;
     71   }
     72   if (Length == RHS.Length)
     73     return 0;
     74   return Length < RHS.Length ? -1 : 1;
     75 }
     76 
     77 // Compute the edit distance between the two given strings.
     78 unsigned StringRef::edit_distance(llvm::StringRef Other,
     79                                   bool AllowReplacements,
     80                                   unsigned MaxEditDistance) {
     81   // The algorithm implemented below is the "classic"
     82   // dynamic-programming algorithm for computing the Levenshtein
     83   // distance, which is described here:
     84   //
     85   //   http://en.wikipedia.org/wiki/Levenshtein_distance
     86   //
     87   // Although the algorithm is typically described using an m x n
     88   // array, only two rows are used at a time, so this implemenation
     89   // just keeps two separate vectors for those two rows.
     90   size_type m = size();
     91   size_type n = Other.size();
     92 
     93   const unsigned SmallBufferSize = 64;
     94   unsigned SmallBuffer[SmallBufferSize];
     95   llvm::OwningArrayPtr<unsigned> Allocated;
     96   unsigned *previous = SmallBuffer;
     97   if (2*(n + 1) > SmallBufferSize) {
     98     previous = new unsigned [2*(n+1)];
     99     Allocated.reset(previous);
    100   }
    101   unsigned *current = previous + (n + 1);
    102 
    103   for (unsigned i = 0; i <= n; ++i)
    104     previous[i] = i;
    105 
    106   for (size_type y = 1; y <= m; ++y) {
    107     current[0] = y;
    108     unsigned BestThisRow = current[0];
    109 
    110     for (size_type x = 1; x <= n; ++x) {
    111       if (AllowReplacements) {
    112         current[x] = min(previous[x-1] + ((*this)[y-1] == Other[x-1]? 0u:1u),
    113                          min(current[x-1], previous[x])+1);
    114       }
    115       else {
    116         if ((*this)[y-1] == Other[x-1]) current[x] = previous[x-1];
    117         else current[x] = min(current[x-1], previous[x]) + 1;
    118       }
    119       BestThisRow = min(BestThisRow, current[x]);
    120     }
    121 
    122     if (MaxEditDistance && BestThisRow > MaxEditDistance)
    123       return MaxEditDistance + 1;
    124 
    125     unsigned *tmp = current;
    126     current = previous;
    127     previous = tmp;
    128   }
    129 
    130   unsigned Result = previous[n];
    131   return Result;
    132 }
    133 
    134 //===----------------------------------------------------------------------===//
    135 // String Searching
    136 //===----------------------------------------------------------------------===//
    137 
    138 
    139 /// find - Search for the first string \arg Str in the string.
    140 ///
    141 /// \return - The index of the first occurrence of \arg Str, or npos if not
    142 /// found.
    143 size_t StringRef::find(StringRef Str, size_t From) const {
    144   size_t N = Str.size();
    145   if (N > Length)
    146     return npos;
    147   for (size_t e = Length - N + 1, i = min(From, e); i != e; ++i)
    148     if (substr(i, N).equals(Str))
    149       return i;
    150   return npos;
    151 }
    152 
    153 /// rfind - Search for the last string \arg Str in the string.
    154 ///
    155 /// \return - The index of the last occurrence of \arg Str, or npos if not
    156 /// found.
    157 size_t StringRef::rfind(StringRef Str) const {
    158   size_t N = Str.size();
    159   if (N > Length)
    160     return npos;
    161   for (size_t i = Length - N + 1, e = 0; i != e;) {
    162     --i;
    163     if (substr(i, N).equals(Str))
    164       return i;
    165   }
    166   return npos;
    167 }
    168 
    169 /// find_first_of - Find the first character in the string that is in \arg
    170 /// Chars, or npos if not found.
    171 ///
    172 /// Note: O(size() + Chars.size())
    173 StringRef::size_type StringRef::find_first_of(StringRef Chars,
    174                                               size_t From) const {
    175   std::bitset<1 << CHAR_BIT> CharBits;
    176   for (size_type i = 0; i != Chars.size(); ++i)
    177     CharBits.set((unsigned char)Chars[i]);
    178 
    179   for (size_type i = min(From, Length), e = Length; i != e; ++i)
    180     if (CharBits.test((unsigned char)Data[i]))
    181       return i;
    182   return npos;
    183 }
    184 
    185 /// find_first_not_of - Find the first character in the string that is not
    186 /// \arg C or npos if not found.
    187 StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const {
    188   for (size_type i = min(From, Length), e = Length; i != e; ++i)
    189     if (Data[i] != C)
    190       return i;
    191   return npos;
    192 }
    193 
    194 /// find_first_not_of - Find the first character in the string that is not
    195 /// in the string \arg Chars, or npos if not found.
    196 ///
    197 /// Note: O(size() + Chars.size())
    198 StringRef::size_type StringRef::find_first_not_of(StringRef Chars,
    199                                                   size_t From) const {
    200   std::bitset<1 << CHAR_BIT> CharBits;
    201   for (size_type i = 0; i != Chars.size(); ++i)
    202     CharBits.set((unsigned char)Chars[i]);
    203 
    204   for (size_type i = min(From, Length), e = Length; i != e; ++i)
    205     if (!CharBits.test((unsigned char)Data[i]))
    206       return i;
    207   return npos;
    208 }
    209 
    210 /// find_last_of - Find the last character in the string that is in \arg C,
    211 /// or npos if not found.
    212 ///
    213 /// Note: O(size() + Chars.size())
    214 StringRef::size_type StringRef::find_last_of(StringRef Chars,
    215                                              size_t From) const {
    216   std::bitset<1 << CHAR_BIT> CharBits;
    217   for (size_type i = 0; i != Chars.size(); ++i)
    218     CharBits.set((unsigned char)Chars[i]);
    219 
    220   for (size_type i = min(From, Length) - 1, e = -1; i != e; --i)
    221     if (CharBits.test((unsigned char)Data[i]))
    222       return i;
    223   return npos;
    224 }
    225 
    226 //===----------------------------------------------------------------------===//
    227 // Helpful Algorithms
    228 //===----------------------------------------------------------------------===//
    229 
    230 /// count - Return the number of non-overlapped occurrences of \arg Str in
    231 /// the string.
    232 size_t StringRef::count(StringRef Str) const {
    233   size_t Count = 0;
    234   size_t N = Str.size();
    235   if (N > Length)
    236     return 0;
    237   for (size_t i = 0, e = Length - N + 1; i != e; ++i)
    238     if (substr(i, N).equals(Str))
    239       ++Count;
    240   return Count;
    241 }
    242 
    243 static unsigned GetAutoSenseRadix(StringRef &Str) {
    244   if (Str.startswith("0x")) {
    245     Str = Str.substr(2);
    246     return 16;
    247   } else if (Str.startswith("0b")) {
    248     Str = Str.substr(2);
    249     return 2;
    250   } else if (Str.startswith("0")) {
    251     return 8;
    252   } else {
    253     return 10;
    254   }
    255 }
    256 
    257 
    258 /// GetAsUnsignedInteger - Workhorse method that converts a integer character
    259 /// sequence of radix up to 36 to an unsigned long long value.
    260 static bool GetAsUnsignedInteger(StringRef Str, unsigned Radix,
    261                                  unsigned long long &Result) {
    262   // Autosense radix if not specified.
    263   if (Radix == 0)
    264     Radix = GetAutoSenseRadix(Str);
    265 
    266   // Empty strings (after the radix autosense) are invalid.
    267   if (Str.empty()) return true;
    268 
    269   // Parse all the bytes of the string given this radix.  Watch for overflow.
    270   Result = 0;
    271   while (!Str.empty()) {
    272     unsigned CharVal;
    273     if (Str[0] >= '0' && Str[0] <= '9')
    274       CharVal = Str[0]-'0';
    275     else if (Str[0] >= 'a' && Str[0] <= 'z')
    276       CharVal = Str[0]-'a'+10;
    277     else if (Str[0] >= 'A' && Str[0] <= 'Z')
    278       CharVal = Str[0]-'A'+10;
    279     else
    280       return true;
    281 
    282     // If the parsed value is larger than the integer radix, the string is
    283     // invalid.
    284     if (CharVal >= Radix)
    285       return true;
    286 
    287     // Add in this character.
    288     unsigned long long PrevResult = Result;
    289     Result = Result*Radix+CharVal;
    290 
    291     // Check for overflow.
    292     if (Result < PrevResult)
    293       return true;
    294 
    295     Str = Str.substr(1);
    296   }
    297 
    298   return false;
    299 }
    300 
    301 bool StringRef::getAsInteger(unsigned Radix, unsigned long long &Result) const {
    302   return GetAsUnsignedInteger(*this, Radix, Result);
    303 }
    304 
    305 
    306 bool StringRef::getAsInteger(unsigned Radix, long long &Result) const {
    307   unsigned long long ULLVal;
    308 
    309   // Handle positive strings first.
    310   if (empty() || front() != '-') {
    311     if (GetAsUnsignedInteger(*this, Radix, ULLVal) ||
    312         // Check for value so large it overflows a signed value.
    313         (long long)ULLVal < 0)
    314       return true;
    315     Result = ULLVal;
    316     return false;
    317   }
    318 
    319   // Get the positive part of the value.
    320   if (GetAsUnsignedInteger(substr(1), Radix, ULLVal) ||
    321       // Reject values so large they'd overflow as negative signed, but allow
    322       // "-0".  This negates the unsigned so that the negative isn't undefined
    323       // on signed overflow.
    324       (long long)-ULLVal > 0)
    325     return true;
    326 
    327   Result = -ULLVal;
    328   return false;
    329 }
    330 
    331 bool StringRef::getAsInteger(unsigned Radix, int &Result) const {
    332   long long Val;
    333   if (getAsInteger(Radix, Val) ||
    334       (int)Val != Val)
    335     return true;
    336   Result = Val;
    337   return false;
    338 }
    339 
    340 bool StringRef::getAsInteger(unsigned Radix, unsigned &Result) const {
    341   unsigned long long Val;
    342   if (getAsInteger(Radix, Val) ||
    343       (unsigned)Val != Val)
    344     return true;
    345   Result = Val;
    346   return false;
    347 }
    348 
    349 bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const {
    350   StringRef Str = *this;
    351 
    352   // Autosense radix if not specified.
    353   if (Radix == 0)
    354     Radix = GetAutoSenseRadix(Str);
    355 
    356   assert(Radix > 1 && Radix <= 36);
    357 
    358   // Empty strings (after the radix autosense) are invalid.
    359   if (Str.empty()) return true;
    360 
    361   // Skip leading zeroes.  This can be a significant improvement if
    362   // it means we don't need > 64 bits.
    363   while (!Str.empty() && Str.front() == '0')
    364     Str = Str.substr(1);
    365 
    366   // If it was nothing but zeroes....
    367   if (Str.empty()) {
    368     Result = APInt(64, 0);
    369     return false;
    370   }
    371 
    372   // (Over-)estimate the required number of bits.
    373   unsigned Log2Radix = 0;
    374   while ((1U << Log2Radix) < Radix) Log2Radix++;
    375   bool IsPowerOf2Radix = ((1U << Log2Radix) == Radix);
    376 
    377   unsigned BitWidth = Log2Radix * Str.size();
    378   if (BitWidth < Result.getBitWidth())
    379     BitWidth = Result.getBitWidth(); // don't shrink the result
    380   else
    381     Result = Result.zext(BitWidth);
    382 
    383   APInt RadixAP, CharAP; // unused unless !IsPowerOf2Radix
    384   if (!IsPowerOf2Radix) {
    385     // These must have the same bit-width as Result.
    386     RadixAP = APInt(BitWidth, Radix);
    387     CharAP = APInt(BitWidth, 0);
    388   }
    389 
    390   // Parse all the bytes of the string given this radix.
    391   Result = 0;
    392   while (!Str.empty()) {
    393     unsigned CharVal;
    394     if (Str[0] >= '0' && Str[0] <= '9')
    395       CharVal = Str[0]-'0';
    396     else if (Str[0] >= 'a' && Str[0] <= 'z')
    397       CharVal = Str[0]-'a'+10;
    398     else if (Str[0] >= 'A' && Str[0] <= 'Z')
    399       CharVal = Str[0]-'A'+10;
    400     else
    401       return true;
    402 
    403     // If the parsed value is larger than the integer radix, the string is
    404     // invalid.
    405     if (CharVal >= Radix)
    406       return true;
    407 
    408     // Add in this character.
    409     if (IsPowerOf2Radix) {
    410       Result <<= Log2Radix;
    411       Result |= CharVal;
    412     } else {
    413       Result *= RadixAP;
    414       CharAP = CharVal;
    415       Result += CharAP;
    416     }
    417 
    418     Str = Str.substr(1);
    419   }
    420 
    421   return false;
    422 }
    423