1 //===-- StringRef.cpp - Lightweight String References ---------------------===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 10 #include "llvm/ADT/StringRef.h" 11 #include "llvm/ADT/APInt.h" 12 #include "llvm/ADT/Hashing.h" 13 #include "llvm/ADT/edit_distance.h" 14 #include <bitset> 15 16 using namespace llvm; 17 18 // MSVC emits references to this into the translation units which reference it. 19 #ifndef _MSC_VER 20 const size_t StringRef::npos; 21 #endif 22 23 static char ascii_tolower(char x) { 24 if (x >= 'A' && x <= 'Z') 25 return x - 'A' + 'a'; 26 return x; 27 } 28 29 static char ascii_toupper(char x) { 30 if (x >= 'a' && x <= 'z') 31 return x - 'a' + 'A'; 32 return x; 33 } 34 35 static bool ascii_isdigit(char x) { 36 return x >= '0' && x <= '9'; 37 } 38 39 // strncasecmp() is not available on non-POSIX systems, so define an 40 // alternative function here. 41 static int ascii_strncasecmp(const char *LHS, const char *RHS, size_t Length) { 42 for (size_t I = 0; I < Length; ++I) { 43 unsigned char LHC = ascii_tolower(LHS[I]); 44 unsigned char RHC = ascii_tolower(RHS[I]); 45 if (LHC != RHC) 46 return LHC < RHC ? -1 : 1; 47 } 48 return 0; 49 } 50 51 /// compare_lower - Compare strings, ignoring case. 52 int StringRef::compare_lower(StringRef RHS) const { 53 if (int Res = ascii_strncasecmp(Data, RHS.Data, std::min(Length, RHS.Length))) 54 return Res; 55 if (Length == RHS.Length) 56 return 0; 57 return Length < RHS.Length ? -1 : 1; 58 } 59 60 /// Check if this string starts with the given \p Prefix, ignoring case. 61 bool StringRef::startswith_lower(StringRef Prefix) const { 62 return Length >= Prefix.Length && 63 ascii_strncasecmp(Data, Prefix.Data, Prefix.Length) == 0; 64 } 65 66 /// Check if this string ends with the given \p Suffix, ignoring case. 67 bool StringRef::endswith_lower(StringRef Suffix) const { 68 return Length >= Suffix.Length && 69 ascii_strncasecmp(end() - Suffix.Length, Suffix.Data, Suffix.Length) == 0; 70 } 71 72 size_t StringRef::find_lower(char C, size_t From) const { 73 char L = ascii_tolower(C); 74 return find_if([L](char D) { return ascii_tolower(D) == L; }, From); 75 } 76 77 /// compare_numeric - Compare strings, handle embedded numbers. 78 int StringRef::compare_numeric(StringRef RHS) const { 79 for (size_t I = 0, E = std::min(Length, RHS.Length); I != E; ++I) { 80 // Check for sequences of digits. 81 if (ascii_isdigit(Data[I]) && ascii_isdigit(RHS.Data[I])) { 82 // The longer sequence of numbers is considered larger. 83 // This doesn't really handle prefixed zeros well. 84 size_t J; 85 for (J = I + 1; J != E + 1; ++J) { 86 bool ld = J < Length && ascii_isdigit(Data[J]); 87 bool rd = J < RHS.Length && ascii_isdigit(RHS.Data[J]); 88 if (ld != rd) 89 return rd ? -1 : 1; 90 if (!rd) 91 break; 92 } 93 // The two number sequences have the same length (J-I), just memcmp them. 94 if (int Res = compareMemory(Data + I, RHS.Data + I, J - I)) 95 return Res < 0 ? -1 : 1; 96 // Identical number sequences, continue search after the numbers. 97 I = J - 1; 98 continue; 99 } 100 if (Data[I] != RHS.Data[I]) 101 return (unsigned char)Data[I] < (unsigned char)RHS.Data[I] ? -1 : 1; 102 } 103 if (Length == RHS.Length) 104 return 0; 105 return Length < RHS.Length ? -1 : 1; 106 } 107 108 // Compute the edit distance between the two given strings. 109 unsigned StringRef::edit_distance(llvm::StringRef Other, 110 bool AllowReplacements, 111 unsigned MaxEditDistance) const { 112 return llvm::ComputeEditDistance( 113 makeArrayRef(data(), size()), 114 makeArrayRef(Other.data(), Other.size()), 115 AllowReplacements, MaxEditDistance); 116 } 117 118 //===----------------------------------------------------------------------===// 119 // String Operations 120 //===----------------------------------------------------------------------===// 121 122 std::string StringRef::lower() const { 123 std::string Result(size(), char()); 124 for (size_type i = 0, e = size(); i != e; ++i) { 125 Result[i] = ascii_tolower(Data[i]); 126 } 127 return Result; 128 } 129 130 std::string StringRef::upper() const { 131 std::string Result(size(), char()); 132 for (size_type i = 0, e = size(); i != e; ++i) { 133 Result[i] = ascii_toupper(Data[i]); 134 } 135 return Result; 136 } 137 138 //===----------------------------------------------------------------------===// 139 // String Searching 140 //===----------------------------------------------------------------------===// 141 142 143 /// find - Search for the first string \arg Str in the string. 144 /// 145 /// \return - The index of the first occurrence of \arg Str, or npos if not 146 /// found. 147 size_t StringRef::find(StringRef Str, size_t From) const { 148 if (From > Length) 149 return npos; 150 151 const char *Start = Data + From; 152 size_t Size = Length - From; 153 154 const char *Needle = Str.data(); 155 size_t N = Str.size(); 156 if (N == 0) 157 return From; 158 if (Size < N) 159 return npos; 160 if (N == 1) { 161 const char *Ptr = (const char *)::memchr(Start, Needle[0], Size); 162 return Ptr == nullptr ? npos : Ptr - Data; 163 } 164 165 const char *Stop = Start + (Size - N + 1); 166 167 // For short haystacks or unsupported needles fall back to the naive algorithm 168 if (Size < 16 || N > 255) { 169 do { 170 if (std::memcmp(Start, Needle, N) == 0) 171 return Start - Data; 172 ++Start; 173 } while (Start < Stop); 174 return npos; 175 } 176 177 // Build the bad char heuristic table, with uint8_t to reduce cache thrashing. 178 uint8_t BadCharSkip[256]; 179 std::memset(BadCharSkip, N, 256); 180 for (unsigned i = 0; i != N-1; ++i) 181 BadCharSkip[(uint8_t)Str[i]] = N-1-i; 182 183 do { 184 uint8_t Last = Start[N - 1]; 185 if (LLVM_UNLIKELY(Last == (uint8_t)Needle[N - 1])) 186 if (std::memcmp(Start, Needle, N - 1) == 0) 187 return Start - Data; 188 189 // Otherwise skip the appropriate number of bytes. 190 Start += BadCharSkip[Last]; 191 } while (Start < Stop); 192 193 return npos; 194 } 195 196 size_t StringRef::find_lower(StringRef Str, size_t From) const { 197 StringRef This = substr(From); 198 while (This.size() >= Str.size()) { 199 if (This.startswith_lower(Str)) 200 return From; 201 This = This.drop_front(); 202 ++From; 203 } 204 return npos; 205 } 206 207 size_t StringRef::rfind_lower(char C, size_t From) const { 208 From = std::min(From, Length); 209 size_t i = From; 210 while (i != 0) { 211 --i; 212 if (ascii_tolower(Data[i]) == ascii_tolower(C)) 213 return i; 214 } 215 return npos; 216 } 217 218 /// rfind - Search for the last string \arg Str in the string. 219 /// 220 /// \return - The index of the last occurrence of \arg Str, or npos if not 221 /// found. 222 size_t StringRef::rfind(StringRef Str) const { 223 size_t N = Str.size(); 224 if (N > Length) 225 return npos; 226 for (size_t i = Length - N + 1, e = 0; i != e;) { 227 --i; 228 if (substr(i, N).equals(Str)) 229 return i; 230 } 231 return npos; 232 } 233 234 size_t StringRef::rfind_lower(StringRef Str) const { 235 size_t N = Str.size(); 236 if (N > Length) 237 return npos; 238 for (size_t i = Length - N + 1, e = 0; i != e;) { 239 --i; 240 if (substr(i, N).equals_lower(Str)) 241 return i; 242 } 243 return npos; 244 } 245 246 /// find_first_of - Find the first character in the string that is in \arg 247 /// Chars, or npos if not found. 248 /// 249 /// Note: O(size() + Chars.size()) 250 StringRef::size_type StringRef::find_first_of(StringRef Chars, 251 size_t From) const { 252 std::bitset<1 << CHAR_BIT> CharBits; 253 for (size_type i = 0; i != Chars.size(); ++i) 254 CharBits.set((unsigned char)Chars[i]); 255 256 for (size_type i = std::min(From, Length), e = Length; i != e; ++i) 257 if (CharBits.test((unsigned char)Data[i])) 258 return i; 259 return npos; 260 } 261 262 /// find_first_not_of - Find the first character in the string that is not 263 /// \arg C or npos if not found. 264 StringRef::size_type StringRef::find_first_not_of(char C, size_t From) const { 265 for (size_type i = std::min(From, Length), e = Length; i != e; ++i) 266 if (Data[i] != C) 267 return i; 268 return npos; 269 } 270 271 /// find_first_not_of - Find the first character in the string that is not 272 /// in the string \arg Chars, or npos if not found. 273 /// 274 /// Note: O(size() + Chars.size()) 275 StringRef::size_type StringRef::find_first_not_of(StringRef Chars, 276 size_t From) const { 277 std::bitset<1 << CHAR_BIT> CharBits; 278 for (size_type i = 0; i != Chars.size(); ++i) 279 CharBits.set((unsigned char)Chars[i]); 280 281 for (size_type i = std::min(From, Length), e = Length; i != e; ++i) 282 if (!CharBits.test((unsigned char)Data[i])) 283 return i; 284 return npos; 285 } 286 287 /// find_last_of - Find the last character in the string that is in \arg C, 288 /// or npos if not found. 289 /// 290 /// Note: O(size() + Chars.size()) 291 StringRef::size_type StringRef::find_last_of(StringRef Chars, 292 size_t From) const { 293 std::bitset<1 << CHAR_BIT> CharBits; 294 for (size_type i = 0; i != Chars.size(); ++i) 295 CharBits.set((unsigned char)Chars[i]); 296 297 for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) 298 if (CharBits.test((unsigned char)Data[i])) 299 return i; 300 return npos; 301 } 302 303 /// find_last_not_of - Find the last character in the string that is not 304 /// \arg C, or npos if not found. 305 StringRef::size_type StringRef::find_last_not_of(char C, size_t From) const { 306 for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) 307 if (Data[i] != C) 308 return i; 309 return npos; 310 } 311 312 /// find_last_not_of - Find the last character in the string that is not in 313 /// \arg Chars, or npos if not found. 314 /// 315 /// Note: O(size() + Chars.size()) 316 StringRef::size_type StringRef::find_last_not_of(StringRef Chars, 317 size_t From) const { 318 std::bitset<1 << CHAR_BIT> CharBits; 319 for (size_type i = 0, e = Chars.size(); i != e; ++i) 320 CharBits.set((unsigned char)Chars[i]); 321 322 for (size_type i = std::min(From, Length) - 1, e = -1; i != e; --i) 323 if (!CharBits.test((unsigned char)Data[i])) 324 return i; 325 return npos; 326 } 327 328 void StringRef::split(SmallVectorImpl<StringRef> &A, 329 StringRef Separator, int MaxSplit, 330 bool KeepEmpty) const { 331 StringRef S = *this; 332 333 // Count down from MaxSplit. When MaxSplit is -1, this will just split 334 // "forever". This doesn't support splitting more than 2^31 times 335 // intentionally; if we ever want that we can make MaxSplit a 64-bit integer 336 // but that seems unlikely to be useful. 337 while (MaxSplit-- != 0) { 338 size_t Idx = S.find(Separator); 339 if (Idx == npos) 340 break; 341 342 // Push this split. 343 if (KeepEmpty || Idx > 0) 344 A.push_back(S.slice(0, Idx)); 345 346 // Jump forward. 347 S = S.slice(Idx + Separator.size(), npos); 348 } 349 350 // Push the tail. 351 if (KeepEmpty || !S.empty()) 352 A.push_back(S); 353 } 354 355 void StringRef::split(SmallVectorImpl<StringRef> &A, char Separator, 356 int MaxSplit, bool KeepEmpty) const { 357 StringRef S = *this; 358 359 // Count down from MaxSplit. When MaxSplit is -1, this will just split 360 // "forever". This doesn't support splitting more than 2^31 times 361 // intentionally; if we ever want that we can make MaxSplit a 64-bit integer 362 // but that seems unlikely to be useful. 363 while (MaxSplit-- != 0) { 364 size_t Idx = S.find(Separator); 365 if (Idx == npos) 366 break; 367 368 // Push this split. 369 if (KeepEmpty || Idx > 0) 370 A.push_back(S.slice(0, Idx)); 371 372 // Jump forward. 373 S = S.slice(Idx + 1, npos); 374 } 375 376 // Push the tail. 377 if (KeepEmpty || !S.empty()) 378 A.push_back(S); 379 } 380 381 //===----------------------------------------------------------------------===// 382 // Helpful Algorithms 383 //===----------------------------------------------------------------------===// 384 385 /// count - Return the number of non-overlapped occurrences of \arg Str in 386 /// the string. 387 size_t StringRef::count(StringRef Str) const { 388 size_t Count = 0; 389 size_t N = Str.size(); 390 if (N > Length) 391 return 0; 392 for (size_t i = 0, e = Length - N + 1; i != e; ++i) 393 if (substr(i, N).equals(Str)) 394 ++Count; 395 return Count; 396 } 397 398 static unsigned GetAutoSenseRadix(StringRef &Str) { 399 if (Str.empty()) 400 return 10; 401 402 if (Str.startswith("0x") || Str.startswith("0X")) { 403 Str = Str.substr(2); 404 return 16; 405 } 406 407 if (Str.startswith("0b") || Str.startswith("0B")) { 408 Str = Str.substr(2); 409 return 2; 410 } 411 412 if (Str.startswith("0o")) { 413 Str = Str.substr(2); 414 return 8; 415 } 416 417 if (Str[0] == '0' && Str.size() > 1 && ascii_isdigit(Str[1])) { 418 Str = Str.substr(1); 419 return 8; 420 } 421 422 return 10; 423 } 424 425 bool llvm::consumeUnsignedInteger(StringRef &Str, unsigned Radix, 426 unsigned long long &Result) { 427 // Autosense radix if not specified. 428 if (Radix == 0) 429 Radix = GetAutoSenseRadix(Str); 430 431 // Empty strings (after the radix autosense) are invalid. 432 if (Str.empty()) return true; 433 434 // Parse all the bytes of the string given this radix. Watch for overflow. 435 StringRef Str2 = Str; 436 Result = 0; 437 while (!Str2.empty()) { 438 unsigned CharVal; 439 if (Str2[0] >= '0' && Str2[0] <= '9') 440 CharVal = Str2[0] - '0'; 441 else if (Str2[0] >= 'a' && Str2[0] <= 'z') 442 CharVal = Str2[0] - 'a' + 10; 443 else if (Str2[0] >= 'A' && Str2[0] <= 'Z') 444 CharVal = Str2[0] - 'A' + 10; 445 else 446 break; 447 448 // If the parsed value is larger than the integer radix, we cannot 449 // consume any more characters. 450 if (CharVal >= Radix) 451 break; 452 453 // Add in this character. 454 unsigned long long PrevResult = Result; 455 Result = Result * Radix + CharVal; 456 457 // Check for overflow by shifting back and seeing if bits were lost. 458 if (Result / Radix < PrevResult) 459 return true; 460 461 Str2 = Str2.substr(1); 462 } 463 464 // We consider the operation a failure if no characters were consumed 465 // successfully. 466 if (Str.size() == Str2.size()) 467 return true; 468 469 Str = Str2; 470 return false; 471 } 472 473 bool llvm::consumeSignedInteger(StringRef &Str, unsigned Radix, 474 long long &Result) { 475 unsigned long long ULLVal; 476 477 // Handle positive strings first. 478 if (Str.empty() || Str.front() != '-') { 479 if (consumeUnsignedInteger(Str, Radix, ULLVal) || 480 // Check for value so large it overflows a signed value. 481 (long long)ULLVal < 0) 482 return true; 483 Result = ULLVal; 484 return false; 485 } 486 487 // Get the positive part of the value. 488 StringRef Str2 = Str.drop_front(1); 489 if (consumeUnsignedInteger(Str2, Radix, ULLVal) || 490 // Reject values so large they'd overflow as negative signed, but allow 491 // "-0". This negates the unsigned so that the negative isn't undefined 492 // on signed overflow. 493 (long long)-ULLVal > 0) 494 return true; 495 496 Str = Str2; 497 Result = -ULLVal; 498 return false; 499 } 500 501 /// GetAsUnsignedInteger - Workhorse method that converts a integer character 502 /// sequence of radix up to 36 to an unsigned long long value. 503 bool llvm::getAsUnsignedInteger(StringRef Str, unsigned Radix, 504 unsigned long long &Result) { 505 if (consumeUnsignedInteger(Str, Radix, Result)) 506 return true; 507 508 // For getAsUnsignedInteger, we require the whole string to be consumed or 509 // else we consider it a failure. 510 return !Str.empty(); 511 } 512 513 bool llvm::getAsSignedInteger(StringRef Str, unsigned Radix, 514 long long &Result) { 515 if (consumeSignedInteger(Str, Radix, Result)) 516 return true; 517 518 // For getAsSignedInteger, we require the whole string to be consumed or else 519 // we consider it a failure. 520 return !Str.empty(); 521 } 522 523 bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const { 524 StringRef Str = *this; 525 526 // Autosense radix if not specified. 527 if (Radix == 0) 528 Radix = GetAutoSenseRadix(Str); 529 530 assert(Radix > 1 && Radix <= 36); 531 532 // Empty strings (after the radix autosense) are invalid. 533 if (Str.empty()) return true; 534 535 // Skip leading zeroes. This can be a significant improvement if 536 // it means we don't need > 64 bits. 537 while (!Str.empty() && Str.front() == '0') 538 Str = Str.substr(1); 539 540 // If it was nothing but zeroes.... 541 if (Str.empty()) { 542 Result = APInt(64, 0); 543 return false; 544 } 545 546 // (Over-)estimate the required number of bits. 547 unsigned Log2Radix = 0; 548 while ((1U << Log2Radix) < Radix) Log2Radix++; 549 bool IsPowerOf2Radix = ((1U << Log2Radix) == Radix); 550 551 unsigned BitWidth = Log2Radix * Str.size(); 552 if (BitWidth < Result.getBitWidth()) 553 BitWidth = Result.getBitWidth(); // don't shrink the result 554 else if (BitWidth > Result.getBitWidth()) 555 Result = Result.zext(BitWidth); 556 557 APInt RadixAP, CharAP; // unused unless !IsPowerOf2Radix 558 if (!IsPowerOf2Radix) { 559 // These must have the same bit-width as Result. 560 RadixAP = APInt(BitWidth, Radix); 561 CharAP = APInt(BitWidth, 0); 562 } 563 564 // Parse all the bytes of the string given this radix. 565 Result = 0; 566 while (!Str.empty()) { 567 unsigned CharVal; 568 if (Str[0] >= '0' && Str[0] <= '9') 569 CharVal = Str[0]-'0'; 570 else if (Str[0] >= 'a' && Str[0] <= 'z') 571 CharVal = Str[0]-'a'+10; 572 else if (Str[0] >= 'A' && Str[0] <= 'Z') 573 CharVal = Str[0]-'A'+10; 574 else 575 return true; 576 577 // If the parsed value is larger than the integer radix, the string is 578 // invalid. 579 if (CharVal >= Radix) 580 return true; 581 582 // Add in this character. 583 if (IsPowerOf2Radix) { 584 Result <<= Log2Radix; 585 Result |= CharVal; 586 } else { 587 Result *= RadixAP; 588 CharAP = CharVal; 589 Result += CharAP; 590 } 591 592 Str = Str.substr(1); 593 } 594 595 return false; 596 } 597 598 599 // Implementation of StringRef hashing. 600 hash_code llvm::hash_value(StringRef S) { 601 return hash_combine_range(S.begin(), S.end()); 602 } 603