1 //===--- StringRef.h - Constant String Reference Wrapper --------*- C++ -*-===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 10 #ifndef LLVM_ADT_STRINGREF_H 11 #define LLVM_ADT_STRINGREF_H 12 13 #include "llvm/ADT/iterator_range.h" 14 #include "llvm/Support/Compiler.h" 15 #include <algorithm> 16 #include <cassert> 17 #include <cstring> 18 #include <limits> 19 #include <string> 20 #include <utility> 21 22 namespace llvm { 23 template <typename T> 24 class SmallVectorImpl; 25 class APInt; 26 class hash_code; 27 class StringRef; 28 29 /// Helper functions for StringRef::getAsInteger. 30 bool getAsUnsignedInteger(StringRef Str, unsigned Radix, 31 unsigned long long &Result); 32 33 bool getAsSignedInteger(StringRef Str, unsigned Radix, long long &Result); 34 35 /// StringRef - Represent a constant reference to a string, i.e. a character 36 /// array and a length, which need not be null terminated. 37 /// 38 /// This class does not own the string data, it is expected to be used in 39 /// situations where the character data resides in some other buffer, whose 40 /// lifetime extends past that of the StringRef. For this reason, it is not in 41 /// general safe to store a StringRef. 42 class StringRef { 43 public: 44 typedef const char *iterator; 45 typedef const char *const_iterator; 46 static const size_t npos = ~size_t(0); 47 typedef size_t size_type; 48 49 private: 50 /// The start of the string, in an external buffer. 51 const char *Data; 52 53 /// The length of the string. 54 size_t Length; 55 56 // Workaround memcmp issue with null pointers (undefined behavior) 57 // by providing a specialized version 58 LLVM_ATTRIBUTE_ALWAYS_INLINE 59 static int compareMemory(const char *Lhs, const char *Rhs, size_t Length) { 60 if (Length == 0) { return 0; } 61 return ::memcmp(Lhs,Rhs,Length); 62 } 63 64 public: 65 /// @name Constructors 66 /// @{ 67 68 /// Construct an empty string ref. 69 /*implicit*/ StringRef() : Data(nullptr), Length(0) {} 70 71 /// Construct a string ref from a cstring. 72 /*implicit*/ StringRef(const char *Str) 73 : Data(Str) { 74 assert(Str && "StringRef cannot be built from a NULL argument"); 75 Length = ::strlen(Str); // invoking strlen(NULL) is undefined behavior 76 } 77 78 /// Construct a string ref from a pointer and length. 79 LLVM_ATTRIBUTE_ALWAYS_INLINE 80 /*implicit*/ StringRef(const char *data, size_t length) 81 : Data(data), Length(length) { 82 assert((data || length == 0) && 83 "StringRef cannot be built from a NULL argument with non-null length"); 84 } 85 86 /// Construct a string ref from an std::string. 87 LLVM_ATTRIBUTE_ALWAYS_INLINE 88 /*implicit*/ StringRef(const std::string &Str) 89 : Data(Str.data()), Length(Str.length()) {} 90 91 /// @} 92 /// @name Iterators 93 /// @{ 94 95 iterator begin() const { return Data; } 96 97 iterator end() const { return Data + Length; } 98 99 const unsigned char *bytes_begin() const { 100 return reinterpret_cast<const unsigned char *>(begin()); 101 } 102 const unsigned char *bytes_end() const { 103 return reinterpret_cast<const unsigned char *>(end()); 104 } 105 iterator_range<const unsigned char *> bytes() const { 106 return make_range(bytes_begin(), bytes_end()); 107 } 108 109 /// @} 110 /// @name String Operations 111 /// @{ 112 113 /// data - Get a pointer to the start of the string (which may not be null 114 /// terminated). 115 LLVM_ATTRIBUTE_ALWAYS_INLINE 116 const char *data() const { return Data; } 117 118 /// empty - Check if the string is empty. 119 LLVM_ATTRIBUTE_ALWAYS_INLINE 120 bool empty() const { return Length == 0; } 121 122 /// size - Get the string size. 123 LLVM_ATTRIBUTE_ALWAYS_INLINE 124 size_t size() const { return Length; } 125 126 /// front - Get the first character in the string. 127 char front() const { 128 assert(!empty()); 129 return Data[0]; 130 } 131 132 /// back - Get the last character in the string. 133 char back() const { 134 assert(!empty()); 135 return Data[Length-1]; 136 } 137 138 // copy - Allocate copy in Allocator and return StringRef to it. 139 template <typename Allocator> StringRef copy(Allocator &A) const { 140 // Don't request a length 0 copy from the allocator. 141 if (empty()) 142 return StringRef(); 143 char *S = A.template Allocate<char>(Length); 144 std::copy(begin(), end(), S); 145 return StringRef(S, Length); 146 } 147 148 /// equals - Check for string equality, this is more efficient than 149 /// compare() when the relative ordering of inequal strings isn't needed. 150 LLVM_ATTRIBUTE_ALWAYS_INLINE 151 bool equals(StringRef RHS) const { 152 return (Length == RHS.Length && 153 compareMemory(Data, RHS.Data, RHS.Length) == 0); 154 } 155 156 /// equals_lower - Check for string equality, ignoring case. 157 bool equals_lower(StringRef RHS) const { 158 return Length == RHS.Length && compare_lower(RHS) == 0; 159 } 160 161 /// compare - Compare two strings; the result is -1, 0, or 1 if this string 162 /// is lexicographically less than, equal to, or greater than the \p RHS. 163 LLVM_ATTRIBUTE_ALWAYS_INLINE 164 int compare(StringRef RHS) const { 165 // Check the prefix for a mismatch. 166 if (int Res = compareMemory(Data, RHS.Data, std::min(Length, RHS.Length))) 167 return Res < 0 ? -1 : 1; 168 169 // Otherwise the prefixes match, so we only need to check the lengths. 170 if (Length == RHS.Length) 171 return 0; 172 return Length < RHS.Length ? -1 : 1; 173 } 174 175 /// compare_lower - Compare two strings, ignoring case. 176 int compare_lower(StringRef RHS) const; 177 178 /// compare_numeric - Compare two strings, treating sequences of digits as 179 /// numbers. 180 int compare_numeric(StringRef RHS) const; 181 182 /// \brief Determine the edit distance between this string and another 183 /// string. 184 /// 185 /// \param Other the string to compare this string against. 186 /// 187 /// \param AllowReplacements whether to allow character 188 /// replacements (change one character into another) as a single 189 /// operation, rather than as two operations (an insertion and a 190 /// removal). 191 /// 192 /// \param MaxEditDistance If non-zero, the maximum edit distance that 193 /// this routine is allowed to compute. If the edit distance will exceed 194 /// that maximum, returns \c MaxEditDistance+1. 195 /// 196 /// \returns the minimum number of character insertions, removals, 197 /// or (if \p AllowReplacements is \c true) replacements needed to 198 /// transform one of the given strings into the other. If zero, 199 /// the strings are identical. 200 unsigned edit_distance(StringRef Other, bool AllowReplacements = true, 201 unsigned MaxEditDistance = 0) const; 202 203 /// str - Get the contents as an std::string. 204 std::string str() const { 205 if (!Data) return std::string(); 206 return std::string(Data, Length); 207 } 208 209 /// @} 210 /// @name Operator Overloads 211 /// @{ 212 213 char operator[](size_t Index) const { 214 assert(Index < Length && "Invalid index!"); 215 return Data[Index]; 216 } 217 218 /// @} 219 /// @name Type Conversions 220 /// @{ 221 222 operator std::string() const { 223 return str(); 224 } 225 226 /// @} 227 /// @name String Predicates 228 /// @{ 229 230 /// Check if this string starts with the given \p Prefix. 231 LLVM_ATTRIBUTE_ALWAYS_INLINE 232 bool startswith(StringRef Prefix) const { 233 return Length >= Prefix.Length && 234 compareMemory(Data, Prefix.Data, Prefix.Length) == 0; 235 } 236 237 /// Check if this string starts with the given \p Prefix, ignoring case. 238 bool startswith_lower(StringRef Prefix) const; 239 240 /// Check if this string ends with the given \p Suffix. 241 LLVM_ATTRIBUTE_ALWAYS_INLINE 242 bool endswith(StringRef Suffix) const { 243 return Length >= Suffix.Length && 244 compareMemory(end() - Suffix.Length, Suffix.Data, Suffix.Length) == 0; 245 } 246 247 /// Check if this string ends with the given \p Suffix, ignoring case. 248 bool endswith_lower(StringRef Suffix) const; 249 250 /// @} 251 /// @name String Searching 252 /// @{ 253 254 /// Search for the first character \p C in the string. 255 /// 256 /// \returns The index of the first occurrence of \p C, or npos if not 257 /// found. 258 LLVM_ATTRIBUTE_ALWAYS_INLINE 259 size_t find(char C, size_t From = 0) const { 260 size_t FindBegin = std::min(From, Length); 261 if (FindBegin < Length) { // Avoid calling memchr with nullptr. 262 // Just forward to memchr, which is faster than a hand-rolled loop. 263 if (const void *P = ::memchr(Data + FindBegin, C, Length - FindBegin)) 264 return static_cast<const char *>(P) - Data; 265 } 266 return npos; 267 } 268 269 /// Search for the first string \p Str in the string. 270 /// 271 /// \returns The index of the first occurrence of \p Str, or npos if not 272 /// found. 273 size_t find(StringRef Str, size_t From = 0) const; 274 275 /// Search for the last character \p C in the string. 276 /// 277 /// \returns The index of the last occurrence of \p C, or npos if not 278 /// found. 279 size_t rfind(char C, size_t From = npos) const { 280 From = std::min(From, Length); 281 size_t i = From; 282 while (i != 0) { 283 --i; 284 if (Data[i] == C) 285 return i; 286 } 287 return npos; 288 } 289 290 /// Search for the last string \p Str in the string. 291 /// 292 /// \returns The index of the last occurrence of \p Str, or npos if not 293 /// found. 294 size_t rfind(StringRef Str) const; 295 296 /// Find the first character in the string that is \p C, or npos if not 297 /// found. Same as find. 298 size_t find_first_of(char C, size_t From = 0) const { 299 return find(C, From); 300 } 301 302 /// Find the first character in the string that is in \p Chars, or npos if 303 /// not found. 304 /// 305 /// Complexity: O(size() + Chars.size()) 306 size_t find_first_of(StringRef Chars, size_t From = 0) const; 307 308 /// Find the first character in the string that is not \p C or npos if not 309 /// found. 310 size_t find_first_not_of(char C, size_t From = 0) const; 311 312 /// Find the first character in the string that is not in the string 313 /// \p Chars, or npos if not found. 314 /// 315 /// Complexity: O(size() + Chars.size()) 316 size_t find_first_not_of(StringRef Chars, size_t From = 0) const; 317 318 /// Find the last character in the string that is \p C, or npos if not 319 /// found. 320 size_t find_last_of(char C, size_t From = npos) const { 321 return rfind(C, From); 322 } 323 324 /// Find the last character in the string that is in \p C, or npos if not 325 /// found. 326 /// 327 /// Complexity: O(size() + Chars.size()) 328 size_t find_last_of(StringRef Chars, size_t From = npos) const; 329 330 /// Find the last character in the string that is not \p C, or npos if not 331 /// found. 332 size_t find_last_not_of(char C, size_t From = npos) const; 333 334 /// Find the last character in the string that is not in \p Chars, or 335 /// npos if not found. 336 /// 337 /// Complexity: O(size() + Chars.size()) 338 size_t find_last_not_of(StringRef Chars, size_t From = npos) const; 339 340 /// @} 341 /// @name Helpful Algorithms 342 /// @{ 343 344 /// Return the number of occurrences of \p C in the string. 345 size_t count(char C) const { 346 size_t Count = 0; 347 for (size_t i = 0, e = Length; i != e; ++i) 348 if (Data[i] == C) 349 ++Count; 350 return Count; 351 } 352 353 /// Return the number of non-overlapped occurrences of \p Str in 354 /// the string. 355 size_t count(StringRef Str) const; 356 357 /// Parse the current string as an integer of the specified radix. If 358 /// \p Radix is specified as zero, this does radix autosensing using 359 /// extended C rules: 0 is octal, 0x is hex, 0b is binary. 360 /// 361 /// If the string is invalid or if only a subset of the string is valid, 362 /// this returns true to signify the error. The string is considered 363 /// erroneous if empty or if it overflows T. 364 template <typename T> 365 typename std::enable_if<std::numeric_limits<T>::is_signed, bool>::type 366 getAsInteger(unsigned Radix, T &Result) const { 367 long long LLVal; 368 if (getAsSignedInteger(*this, Radix, LLVal) || 369 static_cast<T>(LLVal) != LLVal) 370 return true; 371 Result = LLVal; 372 return false; 373 } 374 375 template <typename T> 376 typename std::enable_if<!std::numeric_limits<T>::is_signed, bool>::type 377 getAsInteger(unsigned Radix, T &Result) const { 378 unsigned long long ULLVal; 379 // The additional cast to unsigned long long is required to avoid the 380 // Visual C++ warning C4805: '!=' : unsafe mix of type 'bool' and type 381 // 'unsigned __int64' when instantiating getAsInteger with T = bool. 382 if (getAsUnsignedInteger(*this, Radix, ULLVal) || 383 static_cast<unsigned long long>(static_cast<T>(ULLVal)) != ULLVal) 384 return true; 385 Result = ULLVal; 386 return false; 387 } 388 389 /// Parse the current string as an integer of the specified \p Radix, or of 390 /// an autosensed radix if the \p Radix given is 0. The current value in 391 /// \p Result is discarded, and the storage is changed to be wide enough to 392 /// store the parsed integer. 393 /// 394 /// \returns true if the string does not solely consist of a valid 395 /// non-empty number in the appropriate base. 396 /// 397 /// APInt::fromString is superficially similar but assumes the 398 /// string is well-formed in the given radix. 399 bool getAsInteger(unsigned Radix, APInt &Result) const; 400 401 /// @} 402 /// @name String Operations 403 /// @{ 404 405 // Convert the given ASCII string to lowercase. 406 std::string lower() const; 407 408 /// Convert the given ASCII string to uppercase. 409 std::string upper() const; 410 411 /// @} 412 /// @name Substring Operations 413 /// @{ 414 415 /// Return a reference to the substring from [Start, Start + N). 416 /// 417 /// \param Start The index of the starting character in the substring; if 418 /// the index is npos or greater than the length of the string then the 419 /// empty substring will be returned. 420 /// 421 /// \param N The number of characters to included in the substring. If N 422 /// exceeds the number of characters remaining in the string, the string 423 /// suffix (starting with \p Start) will be returned. 424 LLVM_ATTRIBUTE_ALWAYS_INLINE 425 StringRef substr(size_t Start, size_t N = npos) const { 426 Start = std::min(Start, Length); 427 return StringRef(Data + Start, std::min(N, Length - Start)); 428 } 429 430 /// Return a StringRef equal to 'this' but with the first \p N elements 431 /// dropped. 432 LLVM_ATTRIBUTE_ALWAYS_INLINE 433 StringRef drop_front(size_t N = 1) const { 434 assert(size() >= N && "Dropping more elements than exist"); 435 return substr(N); 436 } 437 438 /// Return a StringRef equal to 'this' but with the last \p N elements 439 /// dropped. 440 LLVM_ATTRIBUTE_ALWAYS_INLINE 441 StringRef drop_back(size_t N = 1) const { 442 assert(size() >= N && "Dropping more elements than exist"); 443 return substr(0, size()-N); 444 } 445 446 /// Return a reference to the substring from [Start, End). 447 /// 448 /// \param Start The index of the starting character in the substring; if 449 /// the index is npos or greater than the length of the string then the 450 /// empty substring will be returned. 451 /// 452 /// \param End The index following the last character to include in the 453 /// substring. If this is npos or exceeds the number of characters 454 /// remaining in the string, the string suffix (starting with \p Start) 455 /// will be returned. If this is less than \p Start, an empty string will 456 /// be returned. 457 LLVM_ATTRIBUTE_ALWAYS_INLINE 458 StringRef slice(size_t Start, size_t End) const { 459 Start = std::min(Start, Length); 460 End = std::min(std::max(Start, End), Length); 461 return StringRef(Data + Start, End - Start); 462 } 463 464 /// Split into two substrings around the first occurrence of a separator 465 /// character. 466 /// 467 /// If \p Separator is in the string, then the result is a pair (LHS, RHS) 468 /// such that (*this == LHS + Separator + RHS) is true and RHS is 469 /// maximal. If \p Separator is not in the string, then the result is a 470 /// pair (LHS, RHS) where (*this == LHS) and (RHS == ""). 471 /// 472 /// \param Separator The character to split on. 473 /// \returns The split substrings. 474 std::pair<StringRef, StringRef> split(char Separator) const { 475 size_t Idx = find(Separator); 476 if (Idx == npos) 477 return std::make_pair(*this, StringRef()); 478 return std::make_pair(slice(0, Idx), slice(Idx+1, npos)); 479 } 480 481 /// Split into two substrings around the first occurrence of a separator 482 /// string. 483 /// 484 /// If \p Separator is in the string, then the result is a pair (LHS, RHS) 485 /// such that (*this == LHS + Separator + RHS) is true and RHS is 486 /// maximal. If \p Separator is not in the string, then the result is a 487 /// pair (LHS, RHS) where (*this == LHS) and (RHS == ""). 488 /// 489 /// \param Separator - The string to split on. 490 /// \return - The split substrings. 491 std::pair<StringRef, StringRef> split(StringRef Separator) const { 492 size_t Idx = find(Separator); 493 if (Idx == npos) 494 return std::make_pair(*this, StringRef()); 495 return std::make_pair(slice(0, Idx), slice(Idx + Separator.size(), npos)); 496 } 497 498 /// Split into substrings around the occurrences of a separator string. 499 /// 500 /// Each substring is stored in \p A. If \p MaxSplit is >= 0, at most 501 /// \p MaxSplit splits are done and consequently <= \p MaxSplit + 1 502 /// elements are added to A. 503 /// If \p KeepEmpty is false, empty strings are not added to \p A. They 504 /// still count when considering \p MaxSplit 505 /// An useful invariant is that 506 /// Separator.join(A) == *this if MaxSplit == -1 and KeepEmpty == true 507 /// 508 /// \param A - Where to put the substrings. 509 /// \param Separator - The string to split on. 510 /// \param MaxSplit - The maximum number of times the string is split. 511 /// \param KeepEmpty - True if empty substring should be added. 512 void split(SmallVectorImpl<StringRef> &A, 513 StringRef Separator, int MaxSplit = -1, 514 bool KeepEmpty = true) const; 515 516 /// Split into substrings around the occurrences of a separator character. 517 /// 518 /// Each substring is stored in \p A. If \p MaxSplit is >= 0, at most 519 /// \p MaxSplit splits are done and consequently <= \p MaxSplit + 1 520 /// elements are added to A. 521 /// If \p KeepEmpty is false, empty strings are not added to \p A. They 522 /// still count when considering \p MaxSplit 523 /// An useful invariant is that 524 /// Separator.join(A) == *this if MaxSplit == -1 and KeepEmpty == true 525 /// 526 /// \param A - Where to put the substrings. 527 /// \param Separator - The string to split on. 528 /// \param MaxSplit - The maximum number of times the string is split. 529 /// \param KeepEmpty - True if empty substring should be added. 530 void split(SmallVectorImpl<StringRef> &A, char Separator, int MaxSplit = -1, 531 bool KeepEmpty = true) const; 532 533 /// Split into two substrings around the last occurrence of a separator 534 /// character. 535 /// 536 /// If \p Separator is in the string, then the result is a pair (LHS, RHS) 537 /// such that (*this == LHS + Separator + RHS) is true and RHS is 538 /// minimal. If \p Separator is not in the string, then the result is a 539 /// pair (LHS, RHS) where (*this == LHS) and (RHS == ""). 540 /// 541 /// \param Separator - The character to split on. 542 /// \return - The split substrings. 543 std::pair<StringRef, StringRef> rsplit(char Separator) const { 544 size_t Idx = rfind(Separator); 545 if (Idx == npos) 546 return std::make_pair(*this, StringRef()); 547 return std::make_pair(slice(0, Idx), slice(Idx+1, npos)); 548 } 549 550 /// Return string with consecutive \p Char characters starting from the 551 /// the left removed. 552 StringRef ltrim(char Char) const { 553 return drop_front(std::min(Length, find_first_not_of(Char))); 554 } 555 556 /// Return string with consecutive characters in \p Chars starting from 557 /// the left removed. 558 StringRef ltrim(StringRef Chars = " \t\n\v\f\r") const { 559 return drop_front(std::min(Length, find_first_not_of(Chars))); 560 } 561 562 /// Return string with consecutive \p Char characters starting from the 563 /// right removed. 564 StringRef rtrim(char Char) const { 565 return drop_back(Length - std::min(Length, find_last_not_of(Char) + 1)); 566 } 567 568 /// Return string with consecutive characters in \p Chars starting from 569 /// the right removed. 570 StringRef rtrim(StringRef Chars = " \t\n\v\f\r") const { 571 return drop_back(Length - std::min(Length, find_last_not_of(Chars) + 1)); 572 } 573 574 /// Return string with consecutive \p Char characters starting from the 575 /// left and right removed. 576 StringRef trim(char Char) const { 577 return ltrim(Char).rtrim(Char); 578 } 579 580 /// Return string with consecutive characters in \p Chars starting from 581 /// the left and right removed. 582 StringRef trim(StringRef Chars = " \t\n\v\f\r") const { 583 return ltrim(Chars).rtrim(Chars); 584 } 585 586 /// @} 587 }; 588 589 /// @name StringRef Comparison Operators 590 /// @{ 591 592 LLVM_ATTRIBUTE_ALWAYS_INLINE 593 inline bool operator==(StringRef LHS, StringRef RHS) { 594 return LHS.equals(RHS); 595 } 596 597 LLVM_ATTRIBUTE_ALWAYS_INLINE 598 inline bool operator!=(StringRef LHS, StringRef RHS) { 599 return !(LHS == RHS); 600 } 601 602 inline bool operator<(StringRef LHS, StringRef RHS) { 603 return LHS.compare(RHS) == -1; 604 } 605 606 inline bool operator<=(StringRef LHS, StringRef RHS) { 607 return LHS.compare(RHS) != 1; 608 } 609 610 inline bool operator>(StringRef LHS, StringRef RHS) { 611 return LHS.compare(RHS) == 1; 612 } 613 614 inline bool operator>=(StringRef LHS, StringRef RHS) { 615 return LHS.compare(RHS) != -1; 616 } 617 618 inline std::string &operator+=(std::string &buffer, StringRef string) { 619 return buffer.append(string.data(), string.size()); 620 } 621 622 /// @} 623 624 /// \brief Compute a hash_code for a StringRef. 625 hash_code hash_value(StringRef S); 626 627 // StringRefs can be treated like a POD type. 628 template <typename T> struct isPodLike; 629 template <> struct isPodLike<StringRef> { static const bool value = true; }; 630 } 631 632 #endif 633