1 // Copyright 2013 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "base/strings/string_util.h" 6 7 #include <ctype.h> 8 #include <errno.h> 9 #include <math.h> 10 #include <stdarg.h> 11 #include <stdint.h> 12 #include <stdio.h> 13 #include <stdlib.h> 14 #include <string.h> 15 #include <time.h> 16 #include <wchar.h> 17 #include <wctype.h> 18 19 #include <algorithm> 20 #include <limits> 21 #include <vector> 22 23 #include "base/logging.h" 24 #include "base/macros.h" 25 #include "base/memory/singleton.h" 26 #include "base/strings/utf_string_conversion_utils.h" 27 #include "base/strings/utf_string_conversions.h" 28 #include "base/third_party/icu/icu_utf.h" 29 #include "build/build_config.h" 30 31 namespace base { 32 33 namespace { 34 35 // Force the singleton used by EmptyString[16] to be a unique type. This 36 // prevents other code that might accidentally use Singleton<string> from 37 // getting our internal one. 38 struct EmptyStrings { 39 EmptyStrings() = default; 40 const std::string s; 41 const string16 s16; 42 43 static EmptyStrings* GetInstance() { 44 return Singleton<EmptyStrings>::get(); 45 } 46 }; 47 48 // Used by ReplaceStringPlaceholders to track the position in the string of 49 // replaced parameters. 50 struct ReplacementOffset { 51 ReplacementOffset(uintptr_t parameter, size_t offset) 52 : parameter(parameter), 53 offset(offset) {} 54 55 // Index of the parameter. 56 uintptr_t parameter; 57 58 // Starting position in the string. 59 size_t offset; 60 }; 61 62 static bool CompareParameter(const ReplacementOffset& elem1, 63 const ReplacementOffset& elem2) { 64 return elem1.parameter < elem2.parameter; 65 } 66 67 // Overloaded function to append one string onto the end of another. Having a 68 // separate overload for |source| as both string and StringPiece allows for more 69 // efficient usage from functions templated to work with either type (avoiding a 70 // redundant call to the BasicStringPiece constructor in both cases). 71 template <typename string_type> 72 inline void AppendToString(string_type* target, const string_type& source) { 73 target->append(source); 74 } 75 76 template <typename string_type> 77 inline void AppendToString(string_type* target, 78 const BasicStringPiece<string_type>& source) { 79 source.AppendToString(target); 80 } 81 82 // Assuming that a pointer is the size of a "machine word", then 83 // uintptr_t is an integer type that is also a machine word. 84 typedef uintptr_t MachineWord; 85 const uintptr_t kMachineWordAlignmentMask = sizeof(MachineWord) - 1; 86 87 inline bool IsAlignedToMachineWord(const void* pointer) { 88 return !(reinterpret_cast<MachineWord>(pointer) & kMachineWordAlignmentMask); 89 } 90 91 template<typename T> inline T* AlignToMachineWord(T* pointer) { 92 return reinterpret_cast<T*>(reinterpret_cast<MachineWord>(pointer) & 93 ~kMachineWordAlignmentMask); 94 } 95 96 template<size_t size, typename CharacterType> struct NonASCIIMask; 97 template<> struct NonASCIIMask<4, char16> { 98 static inline uint32_t value() { return 0xFF80FF80U; } 99 }; 100 template<> struct NonASCIIMask<4, char> { 101 static inline uint32_t value() { return 0x80808080U; } 102 }; 103 template<> struct NonASCIIMask<8, char16> { 104 static inline uint64_t value() { return 0xFF80FF80FF80FF80ULL; } 105 }; 106 template<> struct NonASCIIMask<8, char> { 107 static inline uint64_t value() { return 0x8080808080808080ULL; } 108 }; 109 #if defined(WCHAR_T_IS_UTF32) 110 template<> struct NonASCIIMask<4, wchar_t> { 111 static inline uint32_t value() { return 0xFFFFFF80U; } 112 }; 113 template<> struct NonASCIIMask<8, wchar_t> { 114 static inline uint64_t value() { return 0xFFFFFF80FFFFFF80ULL; } 115 }; 116 #endif // WCHAR_T_IS_UTF32 117 118 } // namespace 119 120 bool IsWprintfFormatPortable(const wchar_t* format) { 121 for (const wchar_t* position = format; *position != '\0'; ++position) { 122 if (*position == '%') { 123 bool in_specification = true; 124 bool modifier_l = false; 125 while (in_specification) { 126 // Eat up characters until reaching a known specifier. 127 if (*++position == '\0') { 128 // The format string ended in the middle of a specification. Call 129 // it portable because no unportable specifications were found. The 130 // string is equally broken on all platforms. 131 return true; 132 } 133 134 if (*position == 'l') { 135 // 'l' is the only thing that can save the 's' and 'c' specifiers. 136 modifier_l = true; 137 } else if (((*position == 's' || *position == 'c') && !modifier_l) || 138 *position == 'S' || *position == 'C' || *position == 'F' || 139 *position == 'D' || *position == 'O' || *position == 'U') { 140 // Not portable. 141 return false; 142 } 143 144 if (wcschr(L"diouxXeEfgGaAcspn%", *position)) { 145 // Portable, keep scanning the rest of the format string. 146 in_specification = false; 147 } 148 } 149 } 150 } 151 152 return true; 153 } 154 155 namespace { 156 157 template<typename StringType> 158 StringType ToLowerASCIIImpl(BasicStringPiece<StringType> str) { 159 StringType ret; 160 ret.reserve(str.size()); 161 for (size_t i = 0; i < str.size(); i++) 162 ret.push_back(ToLowerASCII(str[i])); 163 return ret; 164 } 165 166 template<typename StringType> 167 StringType ToUpperASCIIImpl(BasicStringPiece<StringType> str) { 168 StringType ret; 169 ret.reserve(str.size()); 170 for (size_t i = 0; i < str.size(); i++) 171 ret.push_back(ToUpperASCII(str[i])); 172 return ret; 173 } 174 175 } // namespace 176 177 std::string ToLowerASCII(StringPiece str) { 178 return ToLowerASCIIImpl<std::string>(str); 179 } 180 181 string16 ToLowerASCII(StringPiece16 str) { 182 return ToLowerASCIIImpl<string16>(str); 183 } 184 185 std::string ToUpperASCII(StringPiece str) { 186 return ToUpperASCIIImpl<std::string>(str); 187 } 188 189 string16 ToUpperASCII(StringPiece16 str) { 190 return ToUpperASCIIImpl<string16>(str); 191 } 192 193 template<class StringType> 194 int CompareCaseInsensitiveASCIIT(BasicStringPiece<StringType> a, 195 BasicStringPiece<StringType> b) { 196 // Find the first characters that aren't equal and compare them. If the end 197 // of one of the strings is found before a nonequal character, the lengths 198 // of the strings are compared. 199 size_t i = 0; 200 while (i < a.length() && i < b.length()) { 201 typename StringType::value_type lower_a = ToLowerASCII(a[i]); 202 typename StringType::value_type lower_b = ToLowerASCII(b[i]); 203 if (lower_a < lower_b) 204 return -1; 205 if (lower_a > lower_b) 206 return 1; 207 i++; 208 } 209 210 // End of one string hit before finding a different character. Expect the 211 // common case to be "strings equal" at this point so check that first. 212 if (a.length() == b.length()) 213 return 0; 214 215 if (a.length() < b.length()) 216 return -1; 217 return 1; 218 } 219 220 int CompareCaseInsensitiveASCII(StringPiece a, StringPiece b) { 221 return CompareCaseInsensitiveASCIIT<std::string>(a, b); 222 } 223 224 int CompareCaseInsensitiveASCII(StringPiece16 a, StringPiece16 b) { 225 return CompareCaseInsensitiveASCIIT<string16>(a, b); 226 } 227 228 bool EqualsCaseInsensitiveASCII(StringPiece a, StringPiece b) { 229 if (a.length() != b.length()) 230 return false; 231 return CompareCaseInsensitiveASCIIT<std::string>(a, b) == 0; 232 } 233 234 bool EqualsCaseInsensitiveASCII(StringPiece16 a, StringPiece16 b) { 235 if (a.length() != b.length()) 236 return false; 237 return CompareCaseInsensitiveASCIIT<string16>(a, b) == 0; 238 } 239 240 const std::string& EmptyString() { 241 return EmptyStrings::GetInstance()->s; 242 } 243 244 const string16& EmptyString16() { 245 return EmptyStrings::GetInstance()->s16; 246 } 247 248 template <class StringType> 249 bool ReplaceCharsT(const StringType& input, 250 BasicStringPiece<StringType> find_any_of_these, 251 BasicStringPiece<StringType> replace_with, 252 StringType* output); 253 254 bool ReplaceChars(const string16& input, 255 StringPiece16 replace_chars, 256 const string16& replace_with, 257 string16* output) { 258 return ReplaceCharsT(input, replace_chars, StringPiece16(replace_with), 259 output); 260 } 261 262 bool ReplaceChars(const std::string& input, 263 StringPiece replace_chars, 264 const std::string& replace_with, 265 std::string* output) { 266 return ReplaceCharsT(input, replace_chars, StringPiece(replace_with), output); 267 } 268 269 bool RemoveChars(const string16& input, 270 StringPiece16 remove_chars, 271 string16* output) { 272 return ReplaceCharsT(input, remove_chars, StringPiece16(), output); 273 } 274 275 bool RemoveChars(const std::string& input, 276 StringPiece remove_chars, 277 std::string* output) { 278 return ReplaceCharsT(input, remove_chars, StringPiece(), output); 279 } 280 281 template<typename Str> 282 TrimPositions TrimStringT(const Str& input, 283 BasicStringPiece<Str> trim_chars, 284 TrimPositions positions, 285 Str* output) { 286 // Find the edges of leading/trailing whitespace as desired. Need to use 287 // a StringPiece version of input to be able to call find* on it with the 288 // StringPiece version of trim_chars (normally the trim_chars will be a 289 // constant so avoid making a copy). 290 BasicStringPiece<Str> input_piece(input); 291 const size_t last_char = input.length() - 1; 292 const size_t first_good_char = (positions & TRIM_LEADING) ? 293 input_piece.find_first_not_of(trim_chars) : 0; 294 const size_t last_good_char = (positions & TRIM_TRAILING) ? 295 input_piece.find_last_not_of(trim_chars) : last_char; 296 297 // When the string was all trimmed, report that we stripped off characters 298 // from whichever position the caller was interested in. For empty input, we 299 // stripped no characters, but we still need to clear |output|. 300 if (input.empty() || 301 (first_good_char == Str::npos) || (last_good_char == Str::npos)) { 302 bool input_was_empty = input.empty(); // in case output == &input 303 output->clear(); 304 return input_was_empty ? TRIM_NONE : positions; 305 } 306 307 // Trim. 308 *output = 309 input.substr(first_good_char, last_good_char - first_good_char + 1); 310 311 // Return where we trimmed from. 312 return static_cast<TrimPositions>( 313 ((first_good_char == 0) ? TRIM_NONE : TRIM_LEADING) | 314 ((last_good_char == last_char) ? TRIM_NONE : TRIM_TRAILING)); 315 } 316 317 bool TrimString(const string16& input, 318 StringPiece16 trim_chars, 319 string16* output) { 320 return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE; 321 } 322 323 bool TrimString(const std::string& input, 324 StringPiece trim_chars, 325 std::string* output) { 326 return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE; 327 } 328 329 template<typename Str> 330 BasicStringPiece<Str> TrimStringPieceT(BasicStringPiece<Str> input, 331 BasicStringPiece<Str> trim_chars, 332 TrimPositions positions) { 333 size_t begin = (positions & TRIM_LEADING) ? 334 input.find_first_not_of(trim_chars) : 0; 335 size_t end = (positions & TRIM_TRAILING) ? 336 input.find_last_not_of(trim_chars) + 1 : input.size(); 337 return input.substr(begin, end - begin); 338 } 339 340 StringPiece16 TrimString(StringPiece16 input, 341 StringPiece16 trim_chars, 342 TrimPositions positions) { 343 return TrimStringPieceT(input, trim_chars, positions); 344 } 345 346 StringPiece TrimString(StringPiece input, 347 StringPiece trim_chars, 348 TrimPositions positions) { 349 return TrimStringPieceT(input, trim_chars, positions); 350 } 351 352 void TruncateUTF8ToByteSize(const std::string& input, 353 const size_t byte_size, 354 std::string* output) { 355 DCHECK(output); 356 if (byte_size > input.length()) { 357 *output = input; 358 return; 359 } 360 DCHECK_LE(byte_size, 361 static_cast<uint32_t>(std::numeric_limits<int32_t>::max())); 362 // Note: This cast is necessary because CBU8_NEXT uses int32_ts. 363 int32_t truncation_length = static_cast<int32_t>(byte_size); 364 int32_t char_index = truncation_length - 1; 365 const char* data = input.data(); 366 367 // Using CBU8, we will move backwards from the truncation point 368 // to the beginning of the string looking for a valid UTF8 369 // character. Once a full UTF8 character is found, we will 370 // truncate the string to the end of that character. 371 while (char_index >= 0) { 372 int32_t prev = char_index; 373 base_icu::UChar32 code_point = 0; 374 CBU8_NEXT(data, char_index, truncation_length, code_point); 375 if (!IsValidCharacter(code_point) || 376 !IsValidCodepoint(code_point)) { 377 char_index = prev - 1; 378 } else { 379 break; 380 } 381 } 382 383 if (char_index >= 0 ) 384 *output = input.substr(0, char_index); 385 else 386 output->clear(); 387 } 388 389 TrimPositions TrimWhitespace(const string16& input, 390 TrimPositions positions, 391 string16* output) { 392 return TrimStringT(input, StringPiece16(kWhitespaceUTF16), positions, output); 393 } 394 395 StringPiece16 TrimWhitespace(StringPiece16 input, 396 TrimPositions positions) { 397 return TrimStringPieceT(input, StringPiece16(kWhitespaceUTF16), positions); 398 } 399 400 TrimPositions TrimWhitespaceASCII(const std::string& input, 401 TrimPositions positions, 402 std::string* output) { 403 return TrimStringT(input, StringPiece(kWhitespaceASCII), positions, output); 404 } 405 406 StringPiece TrimWhitespaceASCII(StringPiece input, TrimPositions positions) { 407 return TrimStringPieceT(input, StringPiece(kWhitespaceASCII), positions); 408 } 409 410 template<typename STR> 411 STR CollapseWhitespaceT(const STR& text, 412 bool trim_sequences_with_line_breaks) { 413 STR result; 414 result.resize(text.size()); 415 416 // Set flags to pretend we're already in a trimmed whitespace sequence, so we 417 // will trim any leading whitespace. 418 bool in_whitespace = true; 419 bool already_trimmed = true; 420 421 int chars_written = 0; 422 for (typename STR::const_iterator i(text.begin()); i != text.end(); ++i) { 423 if (IsUnicodeWhitespace(*i)) { 424 if (!in_whitespace) { 425 // Reduce all whitespace sequences to a single space. 426 in_whitespace = true; 427 result[chars_written++] = L' '; 428 } 429 if (trim_sequences_with_line_breaks && !already_trimmed && 430 ((*i == '\n') || (*i == '\r'))) { 431 // Whitespace sequences containing CR or LF are eliminated entirely. 432 already_trimmed = true; 433 --chars_written; 434 } 435 } else { 436 // Non-whitespace chracters are copied straight across. 437 in_whitespace = false; 438 already_trimmed = false; 439 result[chars_written++] = *i; 440 } 441 } 442 443 if (in_whitespace && !already_trimmed) { 444 // Any trailing whitespace is eliminated. 445 --chars_written; 446 } 447 448 result.resize(chars_written); 449 return result; 450 } 451 452 string16 CollapseWhitespace(const string16& text, 453 bool trim_sequences_with_line_breaks) { 454 return CollapseWhitespaceT(text, trim_sequences_with_line_breaks); 455 } 456 457 std::string CollapseWhitespaceASCII(const std::string& text, 458 bool trim_sequences_with_line_breaks) { 459 return CollapseWhitespaceT(text, trim_sequences_with_line_breaks); 460 } 461 462 bool ContainsOnlyChars(StringPiece input, StringPiece characters) { 463 return input.find_first_not_of(characters) == StringPiece::npos; 464 } 465 466 bool ContainsOnlyChars(StringPiece16 input, StringPiece16 characters) { 467 return input.find_first_not_of(characters) == StringPiece16::npos; 468 } 469 470 template <class Char> 471 inline bool DoIsStringASCII(const Char* characters, size_t length) { 472 MachineWord all_char_bits = 0; 473 const Char* end = characters + length; 474 475 // Prologue: align the input. 476 while (!IsAlignedToMachineWord(characters) && characters != end) { 477 all_char_bits |= *characters; 478 ++characters; 479 } 480 481 // Compare the values of CPU word size. 482 const Char* word_end = AlignToMachineWord(end); 483 const size_t loop_increment = sizeof(MachineWord) / sizeof(Char); 484 while (characters < word_end) { 485 all_char_bits |= *(reinterpret_cast<const MachineWord*>(characters)); 486 characters += loop_increment; 487 } 488 489 // Process the remaining bytes. 490 while (characters != end) { 491 all_char_bits |= *characters; 492 ++characters; 493 } 494 495 MachineWord non_ascii_bit_mask = 496 NonASCIIMask<sizeof(MachineWord), Char>::value(); 497 return !(all_char_bits & non_ascii_bit_mask); 498 } 499 500 bool IsStringASCII(StringPiece str) { 501 return DoIsStringASCII(str.data(), str.length()); 502 } 503 504 bool IsStringASCII(StringPiece16 str) { 505 return DoIsStringASCII(str.data(), str.length()); 506 } 507 508 #if defined(WCHAR_T_IS_UTF32) 509 bool IsStringASCII(WStringPiece str) { 510 return DoIsStringASCII(str.data(), str.length()); 511 } 512 #endif 513 514 bool IsStringUTF8(StringPiece str) { 515 const char *src = str.data(); 516 int32_t src_len = static_cast<int32_t>(str.length()); 517 int32_t char_index = 0; 518 519 while (char_index < src_len) { 520 int32_t code_point; 521 CBU8_NEXT(src, char_index, src_len, code_point); 522 if (!IsValidCharacter(code_point)) 523 return false; 524 } 525 return true; 526 } 527 528 // Implementation note: Normally this function will be called with a hardcoded 529 // constant for the lowercase_ascii parameter. Constructing a StringPiece from 530 // a C constant requires running strlen, so the result will be two passes 531 // through the buffers, one to file the length of lowercase_ascii, and one to 532 // compare each letter. 533 // 534 // This function could have taken a const char* to avoid this and only do one 535 // pass through the string. But the strlen is faster than the case-insensitive 536 // compares and lets us early-exit in the case that the strings are different 537 // lengths (will often be the case for non-matches). So whether one approach or 538 // the other will be faster depends on the case. 539 // 540 // The hardcoded strings are typically very short so it doesn't matter, and the 541 // string piece gives additional flexibility for the caller (doesn't have to be 542 // null terminated) so we choose the StringPiece route. 543 template<typename Str> 544 static inline bool DoLowerCaseEqualsASCII(BasicStringPiece<Str> str, 545 StringPiece lowercase_ascii) { 546 if (str.size() != lowercase_ascii.size()) 547 return false; 548 for (size_t i = 0; i < str.size(); i++) { 549 if (ToLowerASCII(str[i]) != lowercase_ascii[i]) 550 return false; 551 } 552 return true; 553 } 554 555 bool LowerCaseEqualsASCII(StringPiece str, StringPiece lowercase_ascii) { 556 return DoLowerCaseEqualsASCII<std::string>(str, lowercase_ascii); 557 } 558 559 bool LowerCaseEqualsASCII(StringPiece16 str, StringPiece lowercase_ascii) { 560 return DoLowerCaseEqualsASCII<string16>(str, lowercase_ascii); 561 } 562 563 bool EqualsASCII(StringPiece16 str, StringPiece ascii) { 564 if (str.length() != ascii.length()) 565 return false; 566 return std::equal(ascii.begin(), ascii.end(), str.begin()); 567 } 568 569 template<typename Str> 570 bool StartsWithT(BasicStringPiece<Str> str, 571 BasicStringPiece<Str> search_for, 572 CompareCase case_sensitivity) { 573 if (search_for.size() > str.size()) 574 return false; 575 576 BasicStringPiece<Str> source = str.substr(0, search_for.size()); 577 578 switch (case_sensitivity) { 579 case CompareCase::SENSITIVE: 580 return source == search_for; 581 582 case CompareCase::INSENSITIVE_ASCII: 583 return std::equal( 584 search_for.begin(), search_for.end(), 585 source.begin(), 586 CaseInsensitiveCompareASCII<typename Str::value_type>()); 587 588 default: 589 NOTREACHED(); 590 return false; 591 } 592 } 593 594 bool StartsWith(StringPiece str, 595 StringPiece search_for, 596 CompareCase case_sensitivity) { 597 return StartsWithT<std::string>(str, search_for, case_sensitivity); 598 } 599 600 bool StartsWith(StringPiece16 str, 601 StringPiece16 search_for, 602 CompareCase case_sensitivity) { 603 return StartsWithT<string16>(str, search_for, case_sensitivity); 604 } 605 606 template <typename Str> 607 bool EndsWithT(BasicStringPiece<Str> str, 608 BasicStringPiece<Str> search_for, 609 CompareCase case_sensitivity) { 610 if (search_for.size() > str.size()) 611 return false; 612 613 BasicStringPiece<Str> source = str.substr(str.size() - search_for.size(), 614 search_for.size()); 615 616 switch (case_sensitivity) { 617 case CompareCase::SENSITIVE: 618 return source == search_for; 619 620 case CompareCase::INSENSITIVE_ASCII: 621 return std::equal( 622 source.begin(), source.end(), 623 search_for.begin(), 624 CaseInsensitiveCompareASCII<typename Str::value_type>()); 625 626 default: 627 NOTREACHED(); 628 return false; 629 } 630 } 631 632 bool EndsWith(StringPiece str, 633 StringPiece search_for, 634 CompareCase case_sensitivity) { 635 return EndsWithT<std::string>(str, search_for, case_sensitivity); 636 } 637 638 bool EndsWith(StringPiece16 str, 639 StringPiece16 search_for, 640 CompareCase case_sensitivity) { 641 return EndsWithT<string16>(str, search_for, case_sensitivity); 642 } 643 644 char HexDigitToInt(wchar_t c) { 645 DCHECK(IsHexDigit(c)); 646 if (c >= '0' && c <= '9') 647 return static_cast<char>(c - '0'); 648 if (c >= 'A' && c <= 'F') 649 return static_cast<char>(c - 'A' + 10); 650 if (c >= 'a' && c <= 'f') 651 return static_cast<char>(c - 'a' + 10); 652 return 0; 653 } 654 655 bool IsUnicodeWhitespace(wchar_t c) { 656 // kWhitespaceWide is a NULL-terminated string 657 for (const wchar_t* cur = kWhitespaceWide; *cur; ++cur) { 658 if (*cur == c) 659 return true; 660 } 661 return false; 662 } 663 664 static const char* const kByteStringsUnlocalized[] = { 665 " B", 666 " kB", 667 " MB", 668 " GB", 669 " TB", 670 " PB" 671 }; 672 673 string16 FormatBytesUnlocalized(int64_t bytes) { 674 double unit_amount = static_cast<double>(bytes); 675 size_t dimension = 0; 676 const int kKilo = 1024; 677 while (unit_amount >= kKilo && 678 dimension < arraysize(kByteStringsUnlocalized) - 1) { 679 unit_amount /= kKilo; 680 dimension++; 681 } 682 683 char buf[64]; 684 if (bytes != 0 && dimension > 0 && unit_amount < 100) { 685 base::snprintf(buf, arraysize(buf), "%.1lf%s", unit_amount, 686 kByteStringsUnlocalized[dimension]); 687 } else { 688 base::snprintf(buf, arraysize(buf), "%.0lf%s", unit_amount, 689 kByteStringsUnlocalized[dimension]); 690 } 691 692 return ASCIIToUTF16(buf); 693 } 694 695 // A Matcher for DoReplaceMatchesAfterOffset() that matches substrings. 696 template <class StringType> 697 struct SubstringMatcher { 698 BasicStringPiece<StringType> find_this; 699 700 size_t Find(const StringType& input, size_t pos) { 701 return input.find(find_this.data(), pos, find_this.length()); 702 } 703 size_t MatchSize() { return find_this.length(); } 704 }; 705 706 // A Matcher for DoReplaceMatchesAfterOffset() that matches single characters. 707 template <class StringType> 708 struct CharacterMatcher { 709 BasicStringPiece<StringType> find_any_of_these; 710 711 size_t Find(const StringType& input, size_t pos) { 712 return input.find_first_of(find_any_of_these.data(), pos, 713 find_any_of_these.length()); 714 } 715 constexpr size_t MatchSize() { return 1; } 716 }; 717 718 enum class ReplaceType { REPLACE_ALL, REPLACE_FIRST }; 719 720 // Runs in O(n) time in the length of |str|, and transforms the string without 721 // reallocating when possible. Returns |true| if any matches were found. 722 // 723 // This is parameterized on a |Matcher| traits type, so that it can be the 724 // implementation for both ReplaceChars() and ReplaceSubstringsAfterOffset(). 725 template <class StringType, class Matcher> 726 bool DoReplaceMatchesAfterOffset(StringType* str, 727 size_t initial_offset, 728 Matcher matcher, 729 BasicStringPiece<StringType> replace_with, 730 ReplaceType replace_type) { 731 using CharTraits = typename StringType::traits_type; 732 733 const size_t find_length = matcher.MatchSize(); 734 if (!find_length) 735 return false; 736 737 // If the find string doesn't appear, there's nothing to do. 738 size_t first_match = matcher.Find(*str, initial_offset); 739 if (first_match == StringType::npos) 740 return false; 741 742 // If we're only replacing one instance, there's no need to do anything 743 // complicated. 744 const size_t replace_length = replace_with.length(); 745 if (replace_type == ReplaceType::REPLACE_FIRST) { 746 str->replace(first_match, find_length, replace_with.data(), replace_length); 747 return true; 748 } 749 750 // If the find and replace strings are the same length, we can simply use 751 // replace() on each instance, and finish the entire operation in O(n) time. 752 if (find_length == replace_length) { 753 auto* buffer = &((*str)[0]); 754 for (size_t offset = first_match; offset != StringType::npos; 755 offset = matcher.Find(*str, offset + replace_length)) { 756 CharTraits::copy(buffer + offset, replace_with.data(), replace_length); 757 } 758 return true; 759 } 760 761 // Since the find and replace strings aren't the same length, a loop like the 762 // one above would be O(n^2) in the worst case, as replace() will shift the 763 // entire remaining string each time. We need to be more clever to keep things 764 // O(n). 765 // 766 // When the string is being shortened, it's possible to just shift the matches 767 // down in one pass while finding, and truncate the length at the end of the 768 // search. 769 // 770 // If the string is being lengthened, more work is required. The strategy used 771 // here is to make two find() passes through the string. The first pass counts 772 // the number of matches to determine the new size. The second pass will 773 // either construct the new string into a new buffer (if the existing buffer 774 // lacked capacity), or else -- if there is room -- create a region of scratch 775 // space after |first_match| by shifting the tail of the string to a higher 776 // index, and doing in-place moves from the tail to lower indices thereafter. 777 size_t str_length = str->length(); 778 size_t expansion = 0; 779 if (replace_length > find_length) { 780 // This operation lengthens the string; determine the new length by counting 781 // matches. 782 const size_t expansion_per_match = (replace_length - find_length); 783 size_t num_matches = 0; 784 for (size_t match = first_match; match != StringType::npos; 785 match = matcher.Find(*str, match + find_length)) { 786 expansion += expansion_per_match; 787 ++num_matches; 788 } 789 const size_t final_length = str_length + expansion; 790 791 if (str->capacity() < final_length) { 792 // If we'd have to allocate a new buffer to grow the string, build the 793 // result directly into the new allocation via append(). 794 StringType src(str->get_allocator()); 795 str->swap(src); 796 str->reserve(final_length); 797 798 size_t pos = 0; 799 for (size_t match = first_match;; match = matcher.Find(src, pos)) { 800 str->append(src, pos, match - pos); 801 str->append(replace_with.data(), replace_length); 802 pos = match + find_length; 803 804 // A mid-loop test/break enables skipping the final Find() call; the 805 // number of matches is known, so don't search past the last one. 806 if (!--num_matches) 807 break; 808 } 809 810 // Handle substring after the final match. 811 str->append(src, pos, str_length - pos); 812 return true; 813 } 814 815 // Prepare for the copy/move loop below -- expand the string to its final 816 // size by shifting the data after the first match to the end of the resized 817 // string. 818 size_t shift_src = first_match + find_length; 819 size_t shift_dst = shift_src + expansion; 820 821 // Big |expansion| factors (relative to |str_length|) require padding up to 822 // |shift_dst|. 823 if (shift_dst > str_length) 824 str->resize(shift_dst); 825 826 str->replace(shift_dst, str_length - shift_src, *str, shift_src, 827 str_length - shift_src); 828 str_length = final_length; 829 } 830 831 // We can alternate replacement and move operations. This won't overwrite the 832 // unsearched region of the string so long as |write_offset| <= |read_offset|; 833 // that condition is always satisfied because: 834 // 835 // (a) If the string is being shortened, |expansion| is zero and 836 // |write_offset| grows slower than |read_offset|. 837 // 838 // (b) If the string is being lengthened, |write_offset| grows faster than 839 // |read_offset|, but |expansion| is big enough so that |write_offset| 840 // will only catch up to |read_offset| at the point of the last match. 841 auto* buffer = &((*str)[0]); 842 size_t write_offset = first_match; 843 size_t read_offset = first_match + expansion; 844 do { 845 if (replace_length) { 846 CharTraits::copy(buffer + write_offset, replace_with.data(), 847 replace_length); 848 write_offset += replace_length; 849 } 850 read_offset += find_length; 851 852 // min() clamps StringType::npos (the largest unsigned value) to str_length. 853 size_t match = std::min(matcher.Find(*str, read_offset), str_length); 854 855 size_t length = match - read_offset; 856 if (length) { 857 CharTraits::move(buffer + write_offset, buffer + read_offset, length); 858 write_offset += length; 859 read_offset += length; 860 } 861 } while (read_offset < str_length); 862 863 // If we're shortening the string, truncate it now. 864 str->resize(write_offset); 865 return true; 866 } 867 868 template <class StringType> 869 bool ReplaceCharsT(const StringType& input, 870 BasicStringPiece<StringType> find_any_of_these, 871 BasicStringPiece<StringType> replace_with, 872 StringType* output) { 873 // Commonly, this is called with output and input being the same string; in 874 // that case, this assignment is inexpensive. 875 *output = input; 876 877 return DoReplaceMatchesAfterOffset( 878 output, 0, CharacterMatcher<StringType>{find_any_of_these}, replace_with, 879 ReplaceType::REPLACE_ALL); 880 } 881 882 void ReplaceFirstSubstringAfterOffset(string16* str, 883 size_t start_offset, 884 StringPiece16 find_this, 885 StringPiece16 replace_with) { 886 DoReplaceMatchesAfterOffset(str, start_offset, 887 SubstringMatcher<string16>{find_this}, 888 replace_with, ReplaceType::REPLACE_FIRST); 889 } 890 891 void ReplaceFirstSubstringAfterOffset(std::string* str, 892 size_t start_offset, 893 StringPiece find_this, 894 StringPiece replace_with) { 895 DoReplaceMatchesAfterOffset(str, start_offset, 896 SubstringMatcher<std::string>{find_this}, 897 replace_with, ReplaceType::REPLACE_FIRST); 898 } 899 900 void ReplaceSubstringsAfterOffset(string16* str, 901 size_t start_offset, 902 StringPiece16 find_this, 903 StringPiece16 replace_with) { 904 DoReplaceMatchesAfterOffset(str, start_offset, 905 SubstringMatcher<string16>{find_this}, 906 replace_with, ReplaceType::REPLACE_ALL); 907 } 908 909 void ReplaceSubstringsAfterOffset(std::string* str, 910 size_t start_offset, 911 StringPiece find_this, 912 StringPiece replace_with) { 913 DoReplaceMatchesAfterOffset(str, start_offset, 914 SubstringMatcher<std::string>{find_this}, 915 replace_with, ReplaceType::REPLACE_ALL); 916 } 917 918 template <class string_type> 919 inline typename string_type::value_type* WriteIntoT(string_type* str, 920 size_t length_with_null) { 921 DCHECK_GT(length_with_null, 1u); 922 str->reserve(length_with_null); 923 str->resize(length_with_null - 1); 924 return &((*str)[0]); 925 } 926 927 char* WriteInto(std::string* str, size_t length_with_null) { 928 return WriteIntoT(str, length_with_null); 929 } 930 931 char16* WriteInto(string16* str, size_t length_with_null) { 932 return WriteIntoT(str, length_with_null); 933 } 934 935 #if defined(_MSC_VER) && !defined(__clang__) 936 // Work around VC++ code-gen bug. https://crbug.com/804884 937 #pragma optimize("", off) 938 #endif 939 940 // Generic version for all JoinString overloads. |list_type| must be a sequence 941 // (std::vector or std::initializer_list) of strings/StringPieces (std::string, 942 // string16, StringPiece or StringPiece16). |string_type| is either std::string 943 // or string16. 944 template <typename list_type, typename string_type> 945 static string_type JoinStringT(const list_type& parts, 946 BasicStringPiece<string_type> sep) { 947 if (parts.size() == 0) 948 return string_type(); 949 950 // Pre-allocate the eventual size of the string. Start with the size of all of 951 // the separators (note that this *assumes* parts.size() > 0). 952 size_t total_size = (parts.size() - 1) * sep.size(); 953 for (const auto& part : parts) 954 total_size += part.size(); 955 string_type result; 956 result.reserve(total_size); 957 958 auto iter = parts.begin(); 959 DCHECK(iter != parts.end()); 960 AppendToString(&result, *iter); 961 ++iter; 962 963 for (; iter != parts.end(); ++iter) { 964 sep.AppendToString(&result); 965 // Using the overloaded AppendToString allows this template function to work 966 // on both strings and StringPieces without creating an intermediate 967 // StringPiece object. 968 AppendToString(&result, *iter); 969 } 970 971 // Sanity-check that we pre-allocated correctly. 972 DCHECK_EQ(total_size, result.size()); 973 974 return result; 975 } 976 977 std::string JoinString(const std::vector<std::string>& parts, 978 StringPiece separator) { 979 return JoinStringT(parts, separator); 980 } 981 982 string16 JoinString(const std::vector<string16>& parts, 983 StringPiece16 separator) { 984 return JoinStringT(parts, separator); 985 } 986 987 #if defined(_MSC_VER) && !defined(__clang__) 988 // Work around VC++ code-gen bug. https://crbug.com/804884 989 #pragma optimize("", on) 990 #endif 991 992 std::string JoinString(const std::vector<StringPiece>& parts, 993 StringPiece separator) { 994 return JoinStringT(parts, separator); 995 } 996 997 string16 JoinString(const std::vector<StringPiece16>& parts, 998 StringPiece16 separator) { 999 return JoinStringT(parts, separator); 1000 } 1001 1002 std::string JoinString(std::initializer_list<StringPiece> parts, 1003 StringPiece separator) { 1004 return JoinStringT(parts, separator); 1005 } 1006 1007 string16 JoinString(std::initializer_list<StringPiece16> parts, 1008 StringPiece16 separator) { 1009 return JoinStringT(parts, separator); 1010 } 1011 1012 template<class FormatStringType, class OutStringType> 1013 OutStringType DoReplaceStringPlaceholders( 1014 const FormatStringType& format_string, 1015 const std::vector<OutStringType>& subst, 1016 std::vector<size_t>* offsets) { 1017 size_t substitutions = subst.size(); 1018 DCHECK_LT(substitutions, 10U); 1019 1020 size_t sub_length = 0; 1021 for (const auto& cur : subst) 1022 sub_length += cur.length(); 1023 1024 OutStringType formatted; 1025 formatted.reserve(format_string.length() + sub_length); 1026 1027 std::vector<ReplacementOffset> r_offsets; 1028 for (auto i = format_string.begin(); i != format_string.end(); ++i) { 1029 if ('$' == *i) { 1030 if (i + 1 != format_string.end()) { 1031 ++i; 1032 if ('$' == *i) { 1033 while (i != format_string.end() && '$' == *i) { 1034 formatted.push_back('$'); 1035 ++i; 1036 } 1037 --i; 1038 } else { 1039 if (*i < '1' || *i > '9') { 1040 DLOG(ERROR) << "Invalid placeholder: $" << *i; 1041 continue; 1042 } 1043 uintptr_t index = *i - '1'; 1044 if (offsets) { 1045 ReplacementOffset r_offset(index, 1046 static_cast<int>(formatted.size())); 1047 r_offsets.insert( 1048 std::upper_bound(r_offsets.begin(), r_offsets.end(), r_offset, 1049 &CompareParameter), 1050 r_offset); 1051 } 1052 if (index < substitutions) 1053 formatted.append(subst.at(index)); 1054 } 1055 } 1056 } else { 1057 formatted.push_back(*i); 1058 } 1059 } 1060 if (offsets) { 1061 for (const auto& cur : r_offsets) 1062 offsets->push_back(cur.offset); 1063 } 1064 return formatted; 1065 } 1066 1067 string16 ReplaceStringPlaceholders(const string16& format_string, 1068 const std::vector<string16>& subst, 1069 std::vector<size_t>* offsets) { 1070 return DoReplaceStringPlaceholders(format_string, subst, offsets); 1071 } 1072 1073 std::string ReplaceStringPlaceholders(StringPiece format_string, 1074 const std::vector<std::string>& subst, 1075 std::vector<size_t>* offsets) { 1076 return DoReplaceStringPlaceholders(format_string, subst, offsets); 1077 } 1078 1079 string16 ReplaceStringPlaceholders(const string16& format_string, 1080 const string16& a, 1081 size_t* offset) { 1082 std::vector<size_t> offsets; 1083 std::vector<string16> subst; 1084 subst.push_back(a); 1085 string16 result = ReplaceStringPlaceholders(format_string, subst, &offsets); 1086 1087 DCHECK_EQ(1U, offsets.size()); 1088 if (offset) 1089 *offset = offsets[0]; 1090 return result; 1091 } 1092 1093 // The following code is compatible with the OpenBSD lcpy interface. See: 1094 // http://www.gratisoft.us/todd/papers/strlcpy.html 1095 // ftp://ftp.openbsd.org/pub/OpenBSD/src/lib/libc/string/{wcs,str}lcpy.c 1096 1097 namespace { 1098 1099 template <typename CHAR> 1100 size_t lcpyT(CHAR* dst, const CHAR* src, size_t dst_size) { 1101 for (size_t i = 0; i < dst_size; ++i) { 1102 if ((dst[i] = src[i]) == 0) // We hit and copied the terminating NULL. 1103 return i; 1104 } 1105 1106 // We were left off at dst_size. We over copied 1 byte. Null terminate. 1107 if (dst_size != 0) 1108 dst[dst_size - 1] = 0; 1109 1110 // Count the rest of the |src|, and return it's length in characters. 1111 while (src[dst_size]) ++dst_size; 1112 return dst_size; 1113 } 1114 1115 } // namespace 1116 1117 size_t strlcpy(char* dst, const char* src, size_t dst_size) { 1118 return lcpyT<char>(dst, src, dst_size); 1119 } 1120 size_t wcslcpy(wchar_t* dst, const wchar_t* src, size_t dst_size) { 1121 return lcpyT<wchar_t>(dst, src, dst_size); 1122 } 1123 1124 } // namespace base 1125