1 // Copyright (c) 2011 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "base/string_util.h" 6 7 #include "build/build_config.h" 8 9 #include <ctype.h> 10 #include <errno.h> 11 #include <math.h> 12 #include <stdarg.h> 13 #include <stdio.h> 14 #include <stdlib.h> 15 #include <string.h> 16 #include <time.h> 17 #include <wchar.h> 18 #include <wctype.h> 19 20 #include <algorithm> 21 #include <vector> 22 23 #include "base/basictypes.h" 24 #include "base/logging.h" 25 #include "base/memory/singleton.h" 26 #include "base/third_party/dmg_fp/dmg_fp.h" 27 #include "base/utf_string_conversion_utils.h" 28 #include "base/utf_string_conversions.h" 29 #include "base/third_party/icu/icu_utf.h" 30 31 namespace { 32 33 // Force the singleton used by Empty[W]String[16] to be a unique type. This 34 // prevents other code that might accidentally use Singleton<string> from 35 // getting our internal one. 36 struct EmptyStrings { 37 EmptyStrings() {} 38 const std::string s; 39 const std::wstring ws; 40 const string16 s16; 41 42 static EmptyStrings* GetInstance() { 43 return Singleton<EmptyStrings>::get(); 44 } 45 }; 46 47 // Used by ReplaceStringPlaceholders to track the position in the string of 48 // replaced parameters. 49 struct ReplacementOffset { 50 ReplacementOffset(uintptr_t parameter, size_t offset) 51 : parameter(parameter), 52 offset(offset) {} 53 54 // Index of the parameter. 55 uintptr_t parameter; 56 57 // Starting position in the string. 58 size_t offset; 59 }; 60 61 static bool CompareParameter(const ReplacementOffset& elem1, 62 const ReplacementOffset& elem2) { 63 return elem1.parameter < elem2.parameter; 64 } 65 66 } // namespace 67 68 namespace base { 69 70 bool IsWprintfFormatPortable(const wchar_t* format) { 71 for (const wchar_t* position = format; *position != '\0'; ++position) { 72 if (*position == '%') { 73 bool in_specification = true; 74 bool modifier_l = false; 75 while (in_specification) { 76 // Eat up characters until reaching a known specifier. 77 if (*++position == '\0') { 78 // The format string ended in the middle of a specification. Call 79 // it portable because no unportable specifications were found. The 80 // string is equally broken on all platforms. 81 return true; 82 } 83 84 if (*position == 'l') { 85 // 'l' is the only thing that can save the 's' and 'c' specifiers. 86 modifier_l = true; 87 } else if (((*position == 's' || *position == 'c') && !modifier_l) || 88 *position == 'S' || *position == 'C' || *position == 'F' || 89 *position == 'D' || *position == 'O' || *position == 'U') { 90 // Not portable. 91 return false; 92 } 93 94 if (wcschr(L"diouxXeEfgGaAcspn%", *position)) { 95 // Portable, keep scanning the rest of the format string. 96 in_specification = false; 97 } 98 } 99 } 100 } 101 102 return true; 103 } 104 105 } // namespace base 106 107 108 const std::string& EmptyString() { 109 return EmptyStrings::GetInstance()->s; 110 } 111 112 const std::wstring& EmptyWString() { 113 return EmptyStrings::GetInstance()->ws; 114 } 115 116 const string16& EmptyString16() { 117 return EmptyStrings::GetInstance()->s16; 118 } 119 120 #define WHITESPACE_UNICODE \ 121 0x0009, /* <control-0009> to <control-000D> */ \ 122 0x000A, \ 123 0x000B, \ 124 0x000C, \ 125 0x000D, \ 126 0x0020, /* Space */ \ 127 0x0085, /* <control-0085> */ \ 128 0x00A0, /* No-Break Space */ \ 129 0x1680, /* Ogham Space Mark */ \ 130 0x180E, /* Mongolian Vowel Separator */ \ 131 0x2000, /* En Quad to Hair Space */ \ 132 0x2001, \ 133 0x2002, \ 134 0x2003, \ 135 0x2004, \ 136 0x2005, \ 137 0x2006, \ 138 0x2007, \ 139 0x2008, \ 140 0x2009, \ 141 0x200A, \ 142 0x200C, /* Zero Width Non-Joiner */ \ 143 0x2028, /* Line Separator */ \ 144 0x2029, /* Paragraph Separator */ \ 145 0x202F, /* Narrow No-Break Space */ \ 146 0x205F, /* Medium Mathematical Space */ \ 147 0x3000, /* Ideographic Space */ \ 148 0 149 150 const wchar_t kWhitespaceWide[] = { 151 WHITESPACE_UNICODE 152 }; 153 const char16 kWhitespaceUTF16[] = { 154 WHITESPACE_UNICODE 155 }; 156 const char kWhitespaceASCII[] = { 157 0x09, // <control-0009> to <control-000D> 158 0x0A, 159 0x0B, 160 0x0C, 161 0x0D, 162 0x20, // Space 163 0 164 }; 165 166 const char kUtf8ByteOrderMark[] = "\xEF\xBB\xBF"; 167 168 template<typename STR> 169 bool RemoveCharsT(const STR& input, 170 const typename STR::value_type remove_chars[], 171 STR* output) { 172 bool removed = false; 173 size_t found; 174 175 *output = input; 176 177 found = output->find_first_of(remove_chars); 178 while (found != STR::npos) { 179 removed = true; 180 output->replace(found, 1, STR()); 181 found = output->find_first_of(remove_chars, found); 182 } 183 184 return removed; 185 } 186 187 bool RemoveChars(const std::wstring& input, 188 const wchar_t remove_chars[], 189 std::wstring* output) { 190 return RemoveCharsT(input, remove_chars, output); 191 } 192 193 #if !defined(WCHAR_T_IS_UTF16) 194 bool RemoveChars(const string16& input, 195 const char16 remove_chars[], 196 string16* output) { 197 return RemoveCharsT(input, remove_chars, output); 198 } 199 #endif 200 201 bool RemoveChars(const std::string& input, 202 const char remove_chars[], 203 std::string* output) { 204 return RemoveCharsT(input, remove_chars, output); 205 } 206 207 template<typename STR> 208 TrimPositions TrimStringT(const STR& input, 209 const typename STR::value_type trim_chars[], 210 TrimPositions positions, 211 STR* output) { 212 // Find the edges of leading/trailing whitespace as desired. 213 const typename STR::size_type last_char = input.length() - 1; 214 const typename STR::size_type first_good_char = (positions & TRIM_LEADING) ? 215 input.find_first_not_of(trim_chars) : 0; 216 const typename STR::size_type last_good_char = (positions & TRIM_TRAILING) ? 217 input.find_last_not_of(trim_chars) : last_char; 218 219 // When the string was all whitespace, report that we stripped off whitespace 220 // from whichever position the caller was interested in. For empty input, we 221 // stripped no whitespace, but we still need to clear |output|. 222 if (input.empty() || 223 (first_good_char == STR::npos) || (last_good_char == STR::npos)) { 224 bool input_was_empty = input.empty(); // in case output == &input 225 output->clear(); 226 return input_was_empty ? TRIM_NONE : positions; 227 } 228 229 // Trim the whitespace. 230 *output = 231 input.substr(first_good_char, last_good_char - first_good_char + 1); 232 233 // Return where we trimmed from. 234 return static_cast<TrimPositions>( 235 ((first_good_char == 0) ? TRIM_NONE : TRIM_LEADING) | 236 ((last_good_char == last_char) ? TRIM_NONE : TRIM_TRAILING)); 237 } 238 239 bool TrimString(const std::wstring& input, 240 const wchar_t trim_chars[], 241 std::wstring* output) { 242 return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE; 243 } 244 245 #if !defined(WCHAR_T_IS_UTF16) 246 bool TrimString(const string16& input, 247 const char16 trim_chars[], 248 string16* output) { 249 return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE; 250 } 251 #endif 252 253 bool TrimString(const std::string& input, 254 const char trim_chars[], 255 std::string* output) { 256 return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE; 257 } 258 259 void TruncateUTF8ToByteSize(const std::string& input, 260 const size_t byte_size, 261 std::string* output) { 262 DCHECK(output); 263 if (byte_size > input.length()) { 264 *output = input; 265 return; 266 } 267 DCHECK_LE(byte_size, static_cast<uint32>(kint32max)); 268 // Note: This cast is necessary because CBU8_NEXT uses int32s. 269 int32 truncation_length = static_cast<int32>(byte_size); 270 int32 char_index = truncation_length - 1; 271 const char* data = input.data(); 272 273 // Using CBU8, we will move backwards from the truncation point 274 // to the beginning of the string looking for a valid UTF8 275 // character. Once a full UTF8 character is found, we will 276 // truncate the string to the end of that character. 277 while (char_index >= 0) { 278 int32 prev = char_index; 279 uint32 code_point = 0; 280 CBU8_NEXT(data, char_index, truncation_length, code_point); 281 if (!base::IsValidCharacter(code_point) || 282 !base::IsValidCodepoint(code_point)) { 283 char_index = prev - 1; 284 } else { 285 break; 286 } 287 } 288 289 if (char_index >= 0 ) 290 *output = input.substr(0, char_index); 291 else 292 output->clear(); 293 } 294 295 TrimPositions TrimWhitespace(const std::wstring& input, 296 TrimPositions positions, 297 std::wstring* output) { 298 return TrimStringT(input, kWhitespaceWide, positions, output); 299 } 300 301 #if !defined(WCHAR_T_IS_UTF16) 302 TrimPositions TrimWhitespace(const string16& input, 303 TrimPositions positions, 304 string16* output) { 305 return TrimStringT(input, kWhitespaceUTF16, positions, output); 306 } 307 #endif 308 309 TrimPositions TrimWhitespaceASCII(const std::string& input, 310 TrimPositions positions, 311 std::string* output) { 312 return TrimStringT(input, kWhitespaceASCII, positions, output); 313 } 314 315 // This function is only for backward-compatibility. 316 // To be removed when all callers are updated. 317 TrimPositions TrimWhitespace(const std::string& input, 318 TrimPositions positions, 319 std::string* output) { 320 return TrimWhitespaceASCII(input, positions, output); 321 } 322 323 template<typename STR> 324 STR CollapseWhitespaceT(const STR& text, 325 bool trim_sequences_with_line_breaks) { 326 STR result; 327 result.resize(text.size()); 328 329 // Set flags to pretend we're already in a trimmed whitespace sequence, so we 330 // will trim any leading whitespace. 331 bool in_whitespace = true; 332 bool already_trimmed = true; 333 334 int chars_written = 0; 335 for (typename STR::const_iterator i(text.begin()); i != text.end(); ++i) { 336 if (IsWhitespace(*i)) { 337 if (!in_whitespace) { 338 // Reduce all whitespace sequences to a single space. 339 in_whitespace = true; 340 result[chars_written++] = L' '; 341 } 342 if (trim_sequences_with_line_breaks && !already_trimmed && 343 ((*i == '\n') || (*i == '\r'))) { 344 // Whitespace sequences containing CR or LF are eliminated entirely. 345 already_trimmed = true; 346 --chars_written; 347 } 348 } else { 349 // Non-whitespace chracters are copied straight across. 350 in_whitespace = false; 351 already_trimmed = false; 352 result[chars_written++] = *i; 353 } 354 } 355 356 if (in_whitespace && !already_trimmed) { 357 // Any trailing whitespace is eliminated. 358 --chars_written; 359 } 360 361 result.resize(chars_written); 362 return result; 363 } 364 365 std::wstring CollapseWhitespace(const std::wstring& text, 366 bool trim_sequences_with_line_breaks) { 367 return CollapseWhitespaceT(text, trim_sequences_with_line_breaks); 368 } 369 370 #if !defined(WCHAR_T_IS_UTF16) 371 string16 CollapseWhitespace(const string16& text, 372 bool trim_sequences_with_line_breaks) { 373 return CollapseWhitespaceT(text, trim_sequences_with_line_breaks); 374 } 375 #endif 376 377 std::string CollapseWhitespaceASCII(const std::string& text, 378 bool trim_sequences_with_line_breaks) { 379 return CollapseWhitespaceT(text, trim_sequences_with_line_breaks); 380 } 381 382 bool ContainsOnlyWhitespaceASCII(const std::string& str) { 383 for (std::string::const_iterator i(str.begin()); i != str.end(); ++i) { 384 if (!IsAsciiWhitespace(*i)) 385 return false; 386 } 387 return true; 388 } 389 390 bool ContainsOnlyWhitespace(const string16& str) { 391 for (string16::const_iterator i(str.begin()); i != str.end(); ++i) { 392 if (!IsWhitespace(*i)) 393 return false; 394 } 395 return true; 396 } 397 398 template<typename STR> 399 static bool ContainsOnlyCharsT(const STR& input, const STR& characters) { 400 for (typename STR::const_iterator iter = input.begin(); 401 iter != input.end(); ++iter) { 402 if (characters.find(*iter) == STR::npos) 403 return false; 404 } 405 return true; 406 } 407 408 bool ContainsOnlyChars(const std::wstring& input, 409 const std::wstring& characters) { 410 return ContainsOnlyCharsT(input, characters); 411 } 412 413 #if !defined(WCHAR_T_IS_UTF16) 414 bool ContainsOnlyChars(const string16& input, const string16& characters) { 415 return ContainsOnlyCharsT(input, characters); 416 } 417 #endif 418 419 bool ContainsOnlyChars(const std::string& input, 420 const std::string& characters) { 421 return ContainsOnlyCharsT(input, characters); 422 } 423 424 std::string WideToASCII(const std::wstring& wide) { 425 DCHECK(IsStringASCII(wide)) << wide; 426 return std::string(wide.begin(), wide.end()); 427 } 428 429 std::string UTF16ToASCII(const string16& utf16) { 430 DCHECK(IsStringASCII(utf16)) << utf16; 431 return std::string(utf16.begin(), utf16.end()); 432 } 433 434 // Latin1 is just the low range of Unicode, so we can copy directly to convert. 435 bool WideToLatin1(const std::wstring& wide, std::string* latin1) { 436 std::string output; 437 output.resize(wide.size()); 438 latin1->clear(); 439 for (size_t i = 0; i < wide.size(); i++) { 440 if (wide[i] > 255) 441 return false; 442 output[i] = static_cast<char>(wide[i]); 443 } 444 latin1->swap(output); 445 return true; 446 } 447 448 template<class STR> 449 static bool DoIsStringASCII(const STR& str) { 450 for (size_t i = 0; i < str.length(); i++) { 451 typename ToUnsigned<typename STR::value_type>::Unsigned c = str[i]; 452 if (c > 0x7F) 453 return false; 454 } 455 return true; 456 } 457 458 bool IsStringASCII(const std::wstring& str) { 459 return DoIsStringASCII(str); 460 } 461 462 #if !defined(WCHAR_T_IS_UTF16) 463 bool IsStringASCII(const string16& str) { 464 return DoIsStringASCII(str); 465 } 466 #endif 467 468 bool IsStringASCII(const base::StringPiece& str) { 469 return DoIsStringASCII(str); 470 } 471 472 bool IsStringUTF8(const std::string& str) { 473 const char *src = str.data(); 474 int32 src_len = static_cast<int32>(str.length()); 475 int32 char_index = 0; 476 477 while (char_index < src_len) { 478 int32 code_point; 479 CBU8_NEXT(src, char_index, src_len, code_point); 480 if (!base::IsValidCharacter(code_point)) 481 return false; 482 } 483 return true; 484 } 485 486 template<typename Iter> 487 static inline bool DoLowerCaseEqualsASCII(Iter a_begin, 488 Iter a_end, 489 const char* b) { 490 for (Iter it = a_begin; it != a_end; ++it, ++b) { 491 if (!*b || base::ToLowerASCII(*it) != *b) 492 return false; 493 } 494 return *b == 0; 495 } 496 497 // Front-ends for LowerCaseEqualsASCII. 498 bool LowerCaseEqualsASCII(const std::string& a, const char* b) { 499 return DoLowerCaseEqualsASCII(a.begin(), a.end(), b); 500 } 501 502 bool LowerCaseEqualsASCII(const std::wstring& a, const char* b) { 503 return DoLowerCaseEqualsASCII(a.begin(), a.end(), b); 504 } 505 506 #if !defined(WCHAR_T_IS_UTF16) 507 bool LowerCaseEqualsASCII(const string16& a, const char* b) { 508 return DoLowerCaseEqualsASCII(a.begin(), a.end(), b); 509 } 510 #endif 511 512 bool LowerCaseEqualsASCII(std::string::const_iterator a_begin, 513 std::string::const_iterator a_end, 514 const char* b) { 515 return DoLowerCaseEqualsASCII(a_begin, a_end, b); 516 } 517 518 bool LowerCaseEqualsASCII(std::wstring::const_iterator a_begin, 519 std::wstring::const_iterator a_end, 520 const char* b) { 521 return DoLowerCaseEqualsASCII(a_begin, a_end, b); 522 } 523 524 #if !defined(WCHAR_T_IS_UTF16) 525 bool LowerCaseEqualsASCII(string16::const_iterator a_begin, 526 string16::const_iterator a_end, 527 const char* b) { 528 return DoLowerCaseEqualsASCII(a_begin, a_end, b); 529 } 530 #endif 531 532 #if !defined(ANDROID) 533 bool LowerCaseEqualsASCII(const char* a_begin, 534 const char* a_end, 535 const char* b) { 536 return DoLowerCaseEqualsASCII(a_begin, a_end, b); 537 } 538 #endif // !ANDROID 539 540 #if !defined(ANDROID) 541 bool LowerCaseEqualsASCII(const wchar_t* a_begin, 542 const wchar_t* a_end, 543 const char* b) { 544 return DoLowerCaseEqualsASCII(a_begin, a_end, b); 545 } 546 #endif // !ANDROID 547 548 #if !defined(WCHAR_T_IS_UTF16) && !defined(ANDROID) 549 bool LowerCaseEqualsASCII(const char16* a_begin, 550 const char16* a_end, 551 const char* b) { 552 return DoLowerCaseEqualsASCII(a_begin, a_end, b); 553 } 554 #endif 555 556 bool EqualsASCII(const string16& a, const base::StringPiece& b) { 557 if (a.length() != b.length()) 558 return false; 559 return std::equal(b.begin(), b.end(), a.begin()); 560 } 561 562 bool StartsWithASCII(const std::string& str, 563 const std::string& search, 564 bool case_sensitive) { 565 if (case_sensitive) 566 return str.compare(0, search.length(), search) == 0; 567 else 568 return base::strncasecmp(str.c_str(), search.c_str(), search.length()) == 0; 569 } 570 571 template <typename STR> 572 bool StartsWithT(const STR& str, const STR& search, bool case_sensitive) { 573 if (case_sensitive) { 574 return str.compare(0, search.length(), search) == 0; 575 } else { 576 if (search.size() > str.size()) 577 return false; 578 return std::equal(search.begin(), search.end(), str.begin(), 579 base::CaseInsensitiveCompare<typename STR::value_type>()); 580 } 581 } 582 583 bool StartsWith(const std::wstring& str, const std::wstring& search, 584 bool case_sensitive) { 585 return StartsWithT(str, search, case_sensitive); 586 } 587 588 #if !defined(WCHAR_T_IS_UTF16) 589 bool StartsWith(const string16& str, const string16& search, 590 bool case_sensitive) { 591 return StartsWithT(str, search, case_sensitive); 592 } 593 #endif 594 595 template <typename STR> 596 bool EndsWithT(const STR& str, const STR& search, bool case_sensitive) { 597 typename STR::size_type str_length = str.length(); 598 typename STR::size_type search_length = search.length(); 599 if (search_length > str_length) 600 return false; 601 if (case_sensitive) { 602 return str.compare(str_length - search_length, search_length, search) == 0; 603 } else { 604 return std::equal(search.begin(), search.end(), 605 str.begin() + (str_length - search_length), 606 base::CaseInsensitiveCompare<typename STR::value_type>()); 607 } 608 } 609 610 bool EndsWith(const std::string& str, const std::string& search, 611 bool case_sensitive) { 612 return EndsWithT(str, search, case_sensitive); 613 } 614 615 bool EndsWith(const std::wstring& str, const std::wstring& search, 616 bool case_sensitive) { 617 return EndsWithT(str, search, case_sensitive); 618 } 619 620 #if !defined(WCHAR_T_IS_UTF16) 621 bool EndsWith(const string16& str, const string16& search, 622 bool case_sensitive) { 623 return EndsWithT(str, search, case_sensitive); 624 } 625 #endif 626 627 DataUnits GetByteDisplayUnits(int64 bytes) { 628 // The byte thresholds at which we display amounts. A byte count is displayed 629 // in unit U when kUnitThresholds[U] <= bytes < kUnitThresholds[U+1]. 630 // This must match the DataUnits enum. 631 static const int64 kUnitThresholds[] = { 632 0, // DATA_UNITS_BYTE, 633 3*1024, // DATA_UNITS_KIBIBYTE, 634 2*1024*1024, // DATA_UNITS_MEBIBYTE, 635 1024*1024*1024 // DATA_UNITS_GIBIBYTE, 636 }; 637 638 if (bytes < 0) { 639 NOTREACHED() << "Negative bytes value"; 640 return DATA_UNITS_BYTE; 641 } 642 643 int unit_index = arraysize(kUnitThresholds); 644 while (--unit_index > 0) { 645 if (bytes >= kUnitThresholds[unit_index]) 646 break; 647 } 648 649 DCHECK(unit_index >= DATA_UNITS_BYTE && unit_index <= DATA_UNITS_GIBIBYTE); 650 return DataUnits(unit_index); 651 } 652 653 // TODO(mpcomplete): deal with locale 654 // Byte suffixes. This must match the DataUnits enum. 655 static const char* const kByteStrings[] = { 656 "B", 657 "kB", 658 "MB", 659 "GB" 660 }; 661 662 static const char* const kSpeedStrings[] = { 663 "B/s", 664 "kB/s", 665 "MB/s", 666 "GB/s" 667 }; 668 669 string16 FormatBytesInternal(int64 bytes, 670 DataUnits units, 671 bool show_units, 672 const char* const* suffix) { 673 if (bytes < 0) { 674 NOTREACHED() << "Negative bytes value"; 675 return string16(); 676 } 677 678 DCHECK(units >= DATA_UNITS_BYTE && units <= DATA_UNITS_GIBIBYTE); 679 680 // Put the quantity in the right units. 681 double unit_amount = static_cast<double>(bytes); 682 for (int i = 0; i < units; ++i) 683 unit_amount /= 1024.0; 684 685 char buf[64]; 686 if (bytes != 0 && units != DATA_UNITS_BYTE && unit_amount < 100) 687 base::snprintf(buf, arraysize(buf), "%.1lf", unit_amount); 688 else 689 base::snprintf(buf, arraysize(buf), "%.0lf", unit_amount); 690 691 std::string ret(buf); 692 if (show_units) { 693 ret += " "; 694 ret += suffix[units]; 695 } 696 697 return ASCIIToUTF16(ret); 698 } 699 700 string16 FormatBytes(int64 bytes, DataUnits units, bool show_units) { 701 return FormatBytesInternal(bytes, units, show_units, kByteStrings); 702 } 703 704 string16 FormatSpeed(int64 bytes, DataUnits units, bool show_units) { 705 return FormatBytesInternal(bytes, units, show_units, kSpeedStrings); 706 } 707 708 template<class StringType> 709 void DoReplaceSubstringsAfterOffset(StringType* str, 710 typename StringType::size_type start_offset, 711 const StringType& find_this, 712 const StringType& replace_with, 713 bool replace_all) { 714 if ((start_offset == StringType::npos) || (start_offset >= str->length())) 715 return; 716 717 DCHECK(!find_this.empty()); 718 for (typename StringType::size_type offs(str->find(find_this, start_offset)); 719 offs != StringType::npos; offs = str->find(find_this, offs)) { 720 str->replace(offs, find_this.length(), replace_with); 721 offs += replace_with.length(); 722 723 if (!replace_all) 724 break; 725 } 726 } 727 728 void ReplaceFirstSubstringAfterOffset(string16* str, 729 string16::size_type start_offset, 730 const string16& find_this, 731 const string16& replace_with) { 732 DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with, 733 false); // replace first instance 734 } 735 736 void ReplaceFirstSubstringAfterOffset(std::string* str, 737 std::string::size_type start_offset, 738 const std::string& find_this, 739 const std::string& replace_with) { 740 DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with, 741 false); // replace first instance 742 } 743 744 void ReplaceSubstringsAfterOffset(string16* str, 745 string16::size_type start_offset, 746 const string16& find_this, 747 const string16& replace_with) { 748 DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with, 749 true); // replace all instances 750 } 751 752 void ReplaceSubstringsAfterOffset(std::string* str, 753 std::string::size_type start_offset, 754 const std::string& find_this, 755 const std::string& replace_with) { 756 DoReplaceSubstringsAfterOffset(str, start_offset, find_this, replace_with, 757 true); // replace all instances 758 } 759 760 761 template<typename STR> 762 static size_t TokenizeT(const STR& str, 763 const STR& delimiters, 764 std::vector<STR>* tokens) { 765 tokens->clear(); 766 767 typename STR::size_type start = str.find_first_not_of(delimiters); 768 while (start != STR::npos) { 769 typename STR::size_type end = str.find_first_of(delimiters, start + 1); 770 if (end == STR::npos) { 771 tokens->push_back(str.substr(start)); 772 break; 773 } else { 774 tokens->push_back(str.substr(start, end - start)); 775 start = str.find_first_not_of(delimiters, end + 1); 776 } 777 } 778 779 return tokens->size(); 780 } 781 782 size_t Tokenize(const std::wstring& str, 783 const std::wstring& delimiters, 784 std::vector<std::wstring>* tokens) { 785 return TokenizeT(str, delimiters, tokens); 786 } 787 788 #if !defined(WCHAR_T_IS_UTF16) 789 size_t Tokenize(const string16& str, 790 const string16& delimiters, 791 std::vector<string16>* tokens) { 792 return TokenizeT(str, delimiters, tokens); 793 } 794 #endif 795 796 size_t Tokenize(const std::string& str, 797 const std::string& delimiters, 798 std::vector<std::string>* tokens) { 799 return TokenizeT(str, delimiters, tokens); 800 } 801 802 size_t Tokenize(const base::StringPiece& str, 803 const base::StringPiece& delimiters, 804 std::vector<base::StringPiece>* tokens) { 805 return TokenizeT(str, delimiters, tokens); 806 } 807 808 template<typename STR> 809 static STR JoinStringT(const std::vector<STR>& parts, 810 typename STR::value_type sep) { 811 if (parts.empty()) 812 return STR(); 813 814 STR result(parts[0]); 815 typename std::vector<STR>::const_iterator iter = parts.begin(); 816 ++iter; 817 818 for (; iter != parts.end(); ++iter) { 819 result += sep; 820 result += *iter; 821 } 822 823 return result; 824 } 825 826 std::string JoinString(const std::vector<std::string>& parts, char sep) { 827 return JoinStringT(parts, sep); 828 } 829 830 string16 JoinString(const std::vector<string16>& parts, char16 sep) { 831 return JoinStringT(parts, sep); 832 } 833 834 template<class FormatStringType, class OutStringType> 835 OutStringType DoReplaceStringPlaceholders(const FormatStringType& format_string, 836 const std::vector<OutStringType>& subst, std::vector<size_t>* offsets) { 837 size_t substitutions = subst.size(); 838 DCHECK(substitutions < 10); 839 840 size_t sub_length = 0; 841 for (typename std::vector<OutStringType>::const_iterator iter = subst.begin(); 842 iter != subst.end(); ++iter) { 843 sub_length += iter->length(); 844 } 845 846 OutStringType formatted; 847 formatted.reserve(format_string.length() + sub_length); 848 849 std::vector<ReplacementOffset> r_offsets; 850 for (typename FormatStringType::const_iterator i = format_string.begin(); 851 i != format_string.end(); ++i) { 852 if ('$' == *i) { 853 if (i + 1 != format_string.end()) { 854 ++i; 855 DCHECK('$' == *i || '1' <= *i) << "Invalid placeholder: " << *i; 856 if ('$' == *i) { 857 while (i != format_string.end() && '$' == *i) { 858 formatted.push_back('$'); 859 ++i; 860 } 861 --i; 862 } else { 863 uintptr_t index = *i - '1'; 864 if (offsets) { 865 ReplacementOffset r_offset(index, 866 static_cast<int>(formatted.size())); 867 r_offsets.insert(std::lower_bound(r_offsets.begin(), 868 r_offsets.end(), 869 r_offset, 870 &CompareParameter), 871 r_offset); 872 } 873 if (index < substitutions) 874 formatted.append(subst.at(index)); 875 } 876 } 877 } else { 878 formatted.push_back(*i); 879 } 880 } 881 if (offsets) { 882 for (std::vector<ReplacementOffset>::const_iterator i = r_offsets.begin(); 883 i != r_offsets.end(); ++i) { 884 offsets->push_back(i->offset); 885 } 886 } 887 return formatted; 888 } 889 890 string16 ReplaceStringPlaceholders(const string16& format_string, 891 const std::vector<string16>& subst, 892 std::vector<size_t>* offsets) { 893 return DoReplaceStringPlaceholders(format_string, subst, offsets); 894 } 895 896 std::string ReplaceStringPlaceholders(const base::StringPiece& format_string, 897 const std::vector<std::string>& subst, 898 std::vector<size_t>* offsets) { 899 return DoReplaceStringPlaceholders(format_string, subst, offsets); 900 } 901 902 string16 ReplaceStringPlaceholders(const string16& format_string, 903 const string16& a, 904 size_t* offset) { 905 std::vector<size_t> offsets; 906 std::vector<string16> subst; 907 subst.push_back(a); 908 string16 result = ReplaceStringPlaceholders(format_string, subst, &offsets); 909 910 DCHECK(offsets.size() == 1); 911 if (offset) { 912 *offset = offsets[0]; 913 } 914 return result; 915 } 916 917 static bool IsWildcard(base_icu::UChar32 character) { 918 return character == '*' || character == '?'; 919 } 920 921 // Move the strings pointers to the point where they start to differ. 922 template <typename CHAR, typename NEXT> 923 static void EatSameChars(const CHAR** pattern, const CHAR* pattern_end, 924 const CHAR** string, const CHAR* string_end, 925 NEXT next) { 926 const CHAR* escape = NULL; 927 while (*pattern != pattern_end && *string != string_end) { 928 if (!escape && IsWildcard(**pattern)) { 929 // We don't want to match wildcard here, except if it's escaped. 930 return; 931 } 932 933 // Check if the escapement char is found. If so, skip it and move to the 934 // next character. 935 if (!escape && **pattern == '\\') { 936 escape = *pattern; 937 next(pattern, pattern_end); 938 continue; 939 } 940 941 // Check if the chars match, if so, increment the ptrs. 942 const CHAR* pattern_next = *pattern; 943 const CHAR* string_next = *string; 944 base_icu::UChar32 pattern_char = next(&pattern_next, pattern_end); 945 if (pattern_char == next(&string_next, string_end) && 946 pattern_char != (base_icu::UChar32) CBU_SENTINEL) { 947 *pattern = pattern_next; 948 *string = string_next; 949 } else { 950 // Uh ho, it did not match, we are done. If the last char was an 951 // escapement, that means that it was an error to advance the ptr here, 952 // let's put it back where it was. This also mean that the MatchPattern 953 // function will return false because if we can't match an escape char 954 // here, then no one will. 955 if (escape) { 956 *pattern = escape; 957 } 958 return; 959 } 960 961 escape = NULL; 962 } 963 } 964 965 template <typename CHAR, typename NEXT> 966 static void EatWildcard(const CHAR** pattern, const CHAR* end, NEXT next) { 967 while (*pattern != end) { 968 if (!IsWildcard(**pattern)) 969 return; 970 next(pattern, end); 971 } 972 } 973 974 template <typename CHAR, typename NEXT> 975 static bool MatchPatternT(const CHAR* eval, const CHAR* eval_end, 976 const CHAR* pattern, const CHAR* pattern_end, 977 int depth, 978 NEXT next) { 979 const int kMaxDepth = 16; 980 if (depth > kMaxDepth) 981 return false; 982 983 // Eat all the matching chars. 984 EatSameChars(&pattern, pattern_end, &eval, eval_end, next); 985 986 // If the string is empty, then the pattern must be empty too, or contains 987 // only wildcards. 988 if (eval == eval_end) { 989 EatWildcard(&pattern, pattern_end, next); 990 return pattern == pattern_end; 991 } 992 993 // Pattern is empty but not string, this is not a match. 994 if (pattern == pattern_end) 995 return false; 996 997 // If this is a question mark, then we need to compare the rest with 998 // the current string or the string with one character eaten. 999 const CHAR* next_pattern = pattern; 1000 next(&next_pattern, pattern_end); 1001 if (pattern[0] == '?') { 1002 if (MatchPatternT(eval, eval_end, next_pattern, pattern_end, 1003 depth + 1, next)) 1004 return true; 1005 const CHAR* next_eval = eval; 1006 next(&next_eval, eval_end); 1007 if (MatchPatternT(next_eval, eval_end, next_pattern, pattern_end, 1008 depth + 1, next)) 1009 return true; 1010 } 1011 1012 // This is a *, try to match all the possible substrings with the remainder 1013 // of the pattern. 1014 if (pattern[0] == '*') { 1015 // Collapse duplicate wild cards (********** into *) so that the 1016 // method does not recurse unnecessarily. http://crbug.com/52839 1017 EatWildcard(&next_pattern, pattern_end, next); 1018 1019 while (eval != eval_end) { 1020 if (MatchPatternT(eval, eval_end, next_pattern, pattern_end, 1021 depth + 1, next)) 1022 return true; 1023 eval++; 1024 } 1025 1026 // We reached the end of the string, let see if the pattern contains only 1027 // wildcards. 1028 if (eval == eval_end) { 1029 EatWildcard(&pattern, pattern_end, next); 1030 if (pattern != pattern_end) 1031 return false; 1032 return true; 1033 } 1034 } 1035 1036 return false; 1037 } 1038 1039 struct NextCharUTF8 { 1040 base_icu::UChar32 operator()(const char** p, const char* end) { 1041 base_icu::UChar32 c; 1042 int offset = 0; 1043 CBU8_NEXT(*p, offset, end - *p, c); 1044 *p += offset; 1045 return c; 1046 } 1047 }; 1048 1049 struct NextCharUTF16 { 1050 base_icu::UChar32 operator()(const char16** p, const char16* end) { 1051 base_icu::UChar32 c; 1052 int offset = 0; 1053 CBU16_NEXT(*p, offset, end - *p, c); 1054 *p += offset; 1055 return c; 1056 } 1057 }; 1058 1059 bool MatchPattern(const base::StringPiece& eval, 1060 const base::StringPiece& pattern) { 1061 return MatchPatternT(eval.data(), eval.data() + eval.size(), 1062 pattern.data(), pattern.data() + pattern.size(), 1063 0, NextCharUTF8()); 1064 } 1065 1066 bool MatchPattern(const string16& eval, const string16& pattern) { 1067 return MatchPatternT(eval.c_str(), eval.c_str() + eval.size(), 1068 pattern.c_str(), pattern.c_str() + pattern.size(), 1069 0, NextCharUTF16()); 1070 } 1071 1072 // The following code is compatible with the OpenBSD lcpy interface. See: 1073 // http://www.gratisoft.us/todd/papers/strlcpy.html 1074 // ftp://ftp.openbsd.org/pub/OpenBSD/src/lib/libc/string/{wcs,str}lcpy.c 1075 1076 namespace { 1077 1078 template <typename CHAR> 1079 size_t lcpyT(CHAR* dst, const CHAR* src, size_t dst_size) { 1080 for (size_t i = 0; i < dst_size; ++i) { 1081 if ((dst[i] = src[i]) == 0) // We hit and copied the terminating NULL. 1082 return i; 1083 } 1084 1085 // We were left off at dst_size. We over copied 1 byte. Null terminate. 1086 if (dst_size != 0) 1087 dst[dst_size - 1] = 0; 1088 1089 // Count the rest of the |src|, and return it's length in characters. 1090 while (src[dst_size]) ++dst_size; 1091 return dst_size; 1092 } 1093 1094 } // namespace 1095 1096 size_t base::strlcpy(char* dst, const char* src, size_t dst_size) { 1097 return lcpyT<char>(dst, src, dst_size); 1098 } 1099 size_t base::wcslcpy(wchar_t* dst, const wchar_t* src, size_t dst_size) { 1100 return lcpyT<wchar_t>(dst, src, dst_size); 1101 } 1102