1 //===-- StringRef.cpp - Lightweight String References ---------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 10 #include "llvm/ADT/StringRef.h" 11 #include "llvm/ADT/APInt.h" 12 #include "llvm/ADT/Hashing.h" 13 #include "llvm/ADT/edit_distance.h" 14 #include <bitset> 15 16 using namespace llvm; 17 18 // MSVC emits references to this into the translation units which reference it. 19 #ifndef _MSC_VER 20 const size_t StringRef::npos; 21 #endif 22 23 static char ascii_tolower(char x) { 24 if (x >= 'A' && x <= 'Z') 25 return x - 'A' + 'a'; 26 return x; 27 } 28 29 static char ascii_toupper(char x) { 30 if (x >= 'a' && x <= 'z') 31 return x - 'a' + 'A'; 32 return x; 33 } 34 35 static bool ascii_isdigit(char x) { 36 return x >= '0' && x <= '9'; 37 } 38 39 // strncasecmp() is not available on non-POSIX systems, so define an 40 // alternative function here. 41 static int ascii_strncasecmp(const char *LHS, const char *RHS, size_t Length) { 42 for (size_t I = 0; I < Length; ++I) { 43 unsigned char LHC = ascii_tolower(LHS[I]); 44 unsigned char RHC = ascii_tolower(RHS[I]); 45 if (LHC != RHC) 46 return LHC < RHC ? -1 : 1; 47 } 48 return 0; 49 } 50 51 /// compare_lower - Compare strings, ignoring case. 52 int StringRef::compare_lower(StringRef RHS) const { 53 if (int Res = ascii_strncasecmp(Data, RHS.Data, std::min(Length, RHS.Length))) 54 return Res; 55 if (Length == RHS.Length) 56 return 0; 57 return Length < RHS.Length ? -1 : 1; 58 } 59 60 /// Check if this string starts with the given \p Prefix, ignoring case. 61 bool StringRef::startswith_lower(StringRef Prefix) const { 62 return Length >= Prefix.Length && 63 ascii_strncasecmp(Data, Prefix.Data, Prefix.Length) == 0; 64 } 65 66 /// Check if this string ends with the given \p Suffix, ignoring case. 67 bool StringRef::endswith_lower(StringRef Suffix) const { 68 return Length >= Suffix.Length && 69 ascii_strncasecmp(end() - Suffix.Length, Suffix.Data, Suffix.Length) == 0; 70 } 71 72 /// compare_numeric - Compare strings, handle embedded numbers. 73 int StringRef::compare_numeric(StringRef RHS) const { 74 for (size_t I = 0, E = std::min(Length, RHS.Length); I != E; ++I) { 75 // Check for sequences of digits. 76 if (ascii_isdigit(Data[I]) && ascii_isdigit(RHS.Data[I])) { 77 // The longer sequence of numbers is considered larger. 78 // This doesn't really handle prefixed zeros well. 79 size_t J; 80 for (J = I + 1; J != E + 1; ++J) { 81 bool ld = J < Length && ascii_isdigit(Data[J]); 82 bool rd = J < RHS.Length && ascii_isdigit(RHS.Data[J]); 83 if (ld != rd) 84 return rd ? -1 : 1; 85 if (!rd) 86 break; 87 } 88 // The two number sequences have the same length (J-I), just memcmp them. 89 if (int Res = compareMemory(Data + I, RHS.Data + I, J - I)) 90 return Res < 0 ? -1 : 1; 91 // Identical number sequences, continue search after the numbers. 92 I = J - 1; 93 continue; 94 } 95 if (Data[I] != RHS.Data[I]) 96 return (unsigned char)Data[I] < (unsigned char)RHS.Data[I] ? -1 : 1; 97 } 98 if (Length == RHS.Length) 99 return 0; 100 return Length < RHS.Length ? -1 : 1; 101 } 102 103 // Compute the edit distance between the two given strings. 104 unsigned StringRef::edit_distance(llvm::StringRef Other, 105 bool AllowReplacements, 106 unsigned MaxEditDistance) const { 107 return llvm::ComputeEditDistance( 108 makeArrayRef(data(), size()), 109 makeArrayRef(Other.data(), Other.size()), 110 AllowReplacements, MaxEditDistance); 111 } 112 113 //===----------------------------------------------------------------------===// 114 // String Operations 115 //===----------------------------------------------------------------------===// 116 117 std::string StringRef::lower() const { 118 std::string Result(size(), char()); 119 for (size_type i = 0, e = size(); i != e; ++i) { 120 Result[i] = ascii_tolower(Data[i]); 121 } 122 return Result; 123 } 124 125 std::string StringRef::upper() const { 126 std::string Result(size(), char()); 127 for (size_type i = 0, e = size(); i != e; ++i) { 128 Result[i] = ascii_toupper(Data[i]); 129 } 130 return Result; 131 } 132 133 //===----------------------------------------------------------------------===// 134 // String Searching 135 //===----------------------------------------------------------------------===// 136 137 138 /// find - Search for the first string \arg Str in the string. 139 /// 140 /// \return - The index of the first occurrence of \arg Str, or npos if not 141 /// found. 142 size_t StringRef::find(StringRef Str, size_t From) const { 143 if (From > Length) 144 return npos; 145 146 const char *Needle = Str.data(); 147 size_t N = Str.size(); 148 if (N == 0) 149 return From; 150 151 size_t Size = Length - From; 152 if (Size < N) 153 return npos; 154 155 const char *Start = Data + From; 156 const char *Stop = Start + (Size - N + 1); 157 158 // For short haystacks or unsupported needles fall back to the naive algorithm 159 if (Size < 16 || N > 255) { 160 do { 161 if (std::memcmp(Start, Needle, N) == 0) 162 return Start - Data; 163 ++Start; 164 } while (Start < Stop); 165 return npos; 166 } 167 168 // Build the bad char heuristic table, with uint8_t to reduce cache thrashing. 169 uint8_t BadCharSkip[256]; 170 std::memset(BadCharSkip, N, 256); 171 for (unsigned i = 0; i != N-1; ++i) 172 BadCharSkip[(uint8_t)Str[i]] = N-1-i; 173 174 do { 175 if (std::memcmp(Start, Needle, N) == 0) 176 return Start - Data; 177 178 // Otherwise skip the appropriate number of bytes. 179 Start += BadCharSkip[(uint8_t)Start[N-1]]; 180 } while (Start < Stop); 181 182 return npos; 183 } 184 185 /// rfind - Search for the last string \arg Str in the string. 186 /// 187 /// \return - The index of the last occurrence of \arg Str, or npos if not 188 /// found. 189 size_t StringRef::rfind(StringRef Str) const { 190 size_t N = Str.size(); 191 if (N > Length) 192 return npos; 193 for (size_t i = Length - N + 1, e = 0; i != e;) { 194 --i; 195 if (substr(i, N).equals(Str)) 196 return i; 197 } 198 return npos; 199 } 200 201 /// find_first_of - Find the first character in the string that is in \arg 202 /// Chars, or npos if not found. 203 /// 204 /// Note: O(size() + Chars.size()) 205 StringRef::size_type StringRef::find_first_of(StringRef Chars, 206 size_t From) const { 207 std::bitset<1 << CHAR_BIT> CharBits; 208 for (size_type i = 0; i != Chars.size(); ++i) 209 CharBits.set((unsigned char)Chars[i]); 210 211 for (size_type i = std::min(From, Length), e = Length; i != e; ++i) 212 if (CharBits.test((unsigned char)Data[i])) 213 return i; 214 return npos; 215 } 216 217 /// find_first_not_of - Find the first character in the string that is not 218 /// \arg C or npos if not found. 219 StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const { 220 for (size_type i = std::min(From, Length), e = Length; i != e; ++i) 221 if (Data[i] != C) 222 return i; 223 return npos; 224 } 225 226 /// find_first_not_of - Find the first character in the string that is not 227 /// in the string \arg Chars, or npos if not found. 228 /// 229 /// Note: O(size() + Chars.size()) 230 StringRef::size_type StringRef::find_first_not_of(StringRef Chars, 231 size_t From) const { 232 std::bitset<1 << CHAR_BIT> CharBits; 233 for (size_type i = 0; i != Chars.size(); ++i) 234 CharBits.set((unsigned char)Chars[i]); 235 236 for (size_type i = std::min(From, Length), e = Length; i != e; ++i) 237 if (!CharBits.test((unsigned char)Data[i])) 238 return i; 239 return npos; 240 } 241 242 /// find_last_of - Find the last character in the string that is in \arg C, 243 /// or npos if not found. 244 /// 245 /// Note: O(size() + Chars.size()) 246 StringRef::size_type StringRef::find_last_of(StringRef Chars, 247 size_t From) const { 248 std::bitset<1 << CHAR_BIT> CharBits; 249 for (size_type i = 0; i != Chars.size(); ++i) 250 CharBits.set((unsigned char)Chars[i]); 251 252 for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) 253 if (CharBits.test((unsigned char)Data[i])) 254 return i; 255 return npos; 256 } 257 258 /// find_last_not_of - Find the last character in the string that is not 259 /// \arg C, or npos if not found. 260 StringRef::size_type StringRef::find_last_not_of(char C, size_t From) const { 261 for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) 262 if (Data[i] != C) 263 return i; 264 return npos; 265 } 266 267 /// find_last_not_of - Find the last character in the string that is not in 268 /// \arg Chars, or npos if not found. 269 /// 270 /// Note: O(size() + Chars.size()) 271 StringRef::size_type StringRef::find_last_not_of(StringRef Chars, 272 size_t From) const { 273 std::bitset<1 << CHAR_BIT> CharBits; 274 for (size_type i = 0, e = Chars.size(); i != e; ++i) 275 CharBits.set((unsigned char)Chars[i]); 276 277 for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) 278 if (!CharBits.test((unsigned char)Data[i])) 279 return i; 280 return npos; 281 } 282 283 void StringRef::split(SmallVectorImpl<StringRef> &A, 284 StringRef Separator, int MaxSplit, 285 bool KeepEmpty) const { 286 StringRef S = *this; 287 288 // Count down from MaxSplit. When MaxSplit is -1, this will just split 289 // "forever". This doesn't support splitting more than 2^31 times 290 // intentionally; if we ever want that we can make MaxSplit a 64-bit integer 291 // but that seems unlikely to be useful. 292 while (MaxSplit-- != 0) { 293 size_t Idx = S.find(Separator); 294 if (Idx == npos) 295 break; 296 297 // Push this split. 298 if (KeepEmpty || Idx > 0) 299 A.push_back(S.slice(0, Idx)); 300 301 // Jump forward. 302 S = S.slice(Idx + Separator.size(), npos); 303 } 304 305 // Push the tail. 306 if (KeepEmpty || !S.empty()) 307 A.push_back(S); 308 } 309 310 void StringRef::split(SmallVectorImpl<StringRef> &A, char Separator, 311 int MaxSplit, bool KeepEmpty) const { 312 StringRef S = *this; 313 314 // Count down from MaxSplit. When MaxSplit is -1, this will just split 315 // "forever". This doesn't support splitting more than 2^31 times 316 // intentionally; if we ever want that we can make MaxSplit a 64-bit integer 317 // but that seems unlikely to be useful. 318 while (MaxSplit-- != 0) { 319 size_t Idx = S.find(Separator); 320 if (Idx == npos) 321 break; 322 323 // Push this split. 324 if (KeepEmpty || Idx > 0) 325 A.push_back(S.slice(0, Idx)); 326 327 // Jump forward. 328 S = S.slice(Idx + 1, npos); 329 } 330 331 // Push the tail. 332 if (KeepEmpty || !S.empty()) 333 A.push_back(S); 334 } 335 336 //===----------------------------------------------------------------------===// 337 // Helpful Algorithms 338 //===----------------------------------------------------------------------===// 339 340 /// count - Return the number of non-overlapped occurrences of \arg Str in 341 /// the string. 342 size_t StringRef::count(StringRef Str) const { 343 size_t Count = 0; 344 size_t N = Str.size(); 345 if (N > Length) 346 return 0; 347 for (size_t i = 0, e = Length - N + 1; i != e; ++i) 348 if (substr(i, N).equals(Str)) 349 ++Count; 350 return Count; 351 } 352 353 static unsigned GetAutoSenseRadix(StringRef &Str) { 354 if (Str.startswith("0x") || Str.startswith("0X")) { 355 Str = Str.substr(2); 356 return 16; 357 } 358 359 if (Str.startswith("0b") || Str.startswith("0B")) { 360 Str = Str.substr(2); 361 return 2; 362 } 363 364 if (Str.startswith("0o")) { 365 Str = Str.substr(2); 366 return 8; 367 } 368 369 if (Str.startswith("0")) 370 return 8; 371 372 return 10; 373 } 374 375 376 /// GetAsUnsignedInteger - Workhorse method that converts a integer character 377 /// sequence of radix up to 36 to an unsigned long long value. 378 bool llvm::getAsUnsignedInteger(StringRef Str, unsigned Radix, 379 unsigned long long &Result) { 380 // Autosense radix if not specified. 381 if (Radix == 0) 382 Radix = GetAutoSenseRadix(Str); 383 384 // Empty strings (after the radix autosense) are invalid. 385 if (Str.empty()) return true; 386 387 // Parse all the bytes of the string given this radix. Watch for overflow. 388 Result = 0; 389 while (!Str.empty()) { 390 unsigned CharVal; 391 if (Str[0] >= '0' && Str[0] <= '9') 392 CharVal = Str[0]-'0'; 393 else if (Str[0] >= 'a' && Str[0] <= 'z') 394 CharVal = Str[0]-'a'+10; 395 else if (Str[0] >= 'A' && Str[0] <= 'Z') 396 CharVal = Str[0]-'A'+10; 397 else 398 return true; 399 400 // If the parsed value is larger than the integer radix, the string is 401 // invalid. 402 if (CharVal >= Radix) 403 return true; 404 405 // Add in this character. 406 unsigned long long PrevResult = Result; 407 Result = Result*Radix+CharVal; 408 409 // Check for overflow by shifting back and seeing if bits were lost. 410 if (Result/Radix < PrevResult) 411 return true; 412 413 Str = Str.substr(1); 414 } 415 416 return false; 417 } 418 419 bool llvm::getAsSignedInteger(StringRef Str, unsigned Radix, 420 long long &Result) { 421 unsigned long long ULLVal; 422 423 // Handle positive strings first. 424 if (Str.empty() || Str.front() != '-') { 425 if (getAsUnsignedInteger(Str, Radix, ULLVal) || 426 // Check for value so large it overflows a signed value. 427 (long long)ULLVal < 0) 428 return true; 429 Result = ULLVal; 430 return false; 431 } 432 433 // Get the positive part of the value. 434 if (getAsUnsignedInteger(Str.substr(1), Radix, ULLVal) || 435 // Reject values so large they'd overflow as negative signed, but allow 436 // "-0". This negates the unsigned so that the negative isn't undefined 437 // on signed overflow. 438 (long long)-ULLVal > 0) 439 return true; 440 441 Result = -ULLVal; 442 return false; 443 } 444 445 bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const { 446 StringRef Str = *this; 447 448 // Autosense radix if not specified. 449 if (Radix == 0) 450 Radix = GetAutoSenseRadix(Str); 451 452 assert(Radix > 1 && Radix <= 36); 453 454 // Empty strings (after the radix autosense) are invalid. 455 if (Str.empty()) return true; 456 457 // Skip leading zeroes. This can be a significant improvement if 458 // it means we don't need > 64 bits. 459 while (!Str.empty() && Str.front() == '0') 460 Str = Str.substr(1); 461 462 // If it was nothing but zeroes.... 463 if (Str.empty()) { 464 Result = APInt(64, 0); 465 return false; 466 } 467 468 // (Over-)estimate the required number of bits. 469 unsigned Log2Radix = 0; 470 while ((1U << Log2Radix) < Radix) Log2Radix++; 471 bool IsPowerOf2Radix = ((1U << Log2Radix) == Radix); 472 473 unsigned BitWidth = Log2Radix * Str.size(); 474 if (BitWidth < Result.getBitWidth()) 475 BitWidth = Result.getBitWidth(); // don't shrink the result 476 else if (BitWidth > Result.getBitWidth()) 477 Result = Result.zext(BitWidth); 478 479 APInt RadixAP, CharAP; // unused unless !IsPowerOf2Radix 480 if (!IsPowerOf2Radix) { 481 // These must have the same bit-width as Result. 482 RadixAP = APInt(BitWidth, Radix); 483 CharAP = APInt(BitWidth, 0); 484 } 485 486 // Parse all the bytes of the string given this radix. 487 Result = 0; 488 while (!Str.empty()) { 489 unsigned CharVal; 490 if (Str[0] >= '0' && Str[0] <= '9') 491 CharVal = Str[0]-'0'; 492 else if (Str[0] >= 'a' && Str[0] <= 'z') 493 CharVal = Str[0]-'a'+10; 494 else if (Str[0] >= 'A' && Str[0] <= 'Z') 495 CharVal = Str[0]-'A'+10; 496 else 497 return true; 498 499 // If the parsed value is larger than the integer radix, the string is 500 // invalid. 501 if (CharVal >= Radix) 502 return true; 503 504 // Add in this character. 505 if (IsPowerOf2Radix) { 506 Result <<= Log2Radix; 507 Result |= CharVal; 508 } else { 509 Result *= RadixAP; 510 CharAP = CharVal; 511 Result += CharAP; 512 } 513 514 Str = Str.substr(1); 515 } 516 517 return false; 518 } 519 520 521 // Implementation of StringRef hashing. 522 hash_code llvm::hash_value(StringRef S) { 523 return hash_combine_range(S.begin(), S.end()); 524 } 525