1 /* 2 * (C) 1999 Lars Knoll (knoll (at) kde.org) 3 * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2010, 2012 Apple Inc. All rights reserved. 4 * Copyright (C) 2007-2009 Torch Mobile, Inc. 5 * 6 * This library is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU Library General Public 8 * License as published by the Free Software Foundation; either 9 * version 2 of the License, or (at your option) any later version. 10 * 11 * This library is distributed in the hope that it will be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 * Library General Public License for more details. 15 * 16 * You should have received a copy of the GNU Library General Public License 17 * along with this library; see the file COPYING.LIB. If not, write to 18 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 19 * Boston, MA 02110-1301, USA. 20 */ 21 22 #include "config.h" 23 #include "WTFString.h" 24 25 #include "IntegerToStringConversion.h" 26 #include <stdarg.h> 27 #include "wtf/ASCIICType.h" 28 #include "wtf/DataLog.h" 29 #include "wtf/HexNumber.h" 30 #include "wtf/MathExtras.h" 31 #include "wtf/text/CString.h" 32 #include "wtf/StringExtras.h" 33 #include "wtf/Vector.h" 34 #include "wtf/dtoa.h" 35 #include "wtf/unicode/CharacterNames.h" 36 #include "wtf/unicode/UTF8.h" 37 #include "wtf/unicode/Unicode.h" 38 39 using namespace std; 40 41 namespace WTF { 42 43 using namespace Unicode; 44 using namespace std; 45 46 // Construct a string with UTF-16 data. 47 String::String(const UChar* characters, unsigned length) 48 : m_impl(characters ? StringImpl::create(characters, length) : 0) 49 { 50 } 51 52 // Construct a string with UTF-16 data, from a null-terminated source. 53 String::String(const UChar* str) 54 { 55 if (!str) 56 return; 57 m_impl = StringImpl::create(str, lengthOfNullTerminatedString(str)); 58 } 59 60 // Construct a string with latin1 data. 61 String::String(const LChar* characters, unsigned length) 62 : m_impl(characters ? StringImpl::create(characters, length) : 0) 63 { 64 } 65 66 String::String(const char* characters, unsigned length) 67 : m_impl(characters ? StringImpl::create(reinterpret_cast<const LChar*>(characters), length) : 0) 68 { 69 } 70 71 // Construct a string with latin1 data, from a null-terminated source. 72 String::String(const LChar* characters) 73 : m_impl(characters ? StringImpl::create(characters) : 0) 74 { 75 } 76 77 String::String(const char* characters) 78 : m_impl(characters ? StringImpl::create(reinterpret_cast<const LChar*>(characters)) : 0) 79 { 80 } 81 82 void String::append(const String& string) 83 { 84 if (string.isEmpty()) 85 return; 86 if (!m_impl) { 87 m_impl = string.m_impl; 88 return; 89 } 90 91 // FIXME: This is extremely inefficient. So much so that we might want to take this 92 // out of String's API. We can make it better by optimizing the case where exactly 93 // one String is pointing at this StringImpl, but even then it's going to require a 94 // call into the allocator every single time. 95 96 if (m_impl->is8Bit() && string.m_impl->is8Bit()) { 97 LChar* data; 98 RELEASE_ASSERT(string.length() <= numeric_limits<unsigned>::max() - m_impl->length()); 99 RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(m_impl->length() + string.length(), data); 100 memcpy(data, m_impl->characters8(), m_impl->length() * sizeof(LChar)); 101 memcpy(data + m_impl->length(), string.characters8(), string.length() * sizeof(LChar)); 102 m_impl = newImpl.release(); 103 return; 104 } 105 106 UChar* data; 107 RELEASE_ASSERT(string.length() <= numeric_limits<unsigned>::max() - m_impl->length()); 108 RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(m_impl->length() + string.length(), data); 109 110 if (m_impl->is8Bit()) 111 StringImpl::copyChars(data, m_impl->characters8(), m_impl->length()); 112 else 113 StringImpl::copyChars(data, m_impl->characters16(), m_impl->length()); 114 115 if (string.impl()->is8Bit()) 116 StringImpl::copyChars(data + m_impl->length(), string.impl()->characters8(), string.impl()->length()); 117 else 118 StringImpl::copyChars(data + m_impl->length(), string.impl()->characters16(), string.impl()->length()); 119 120 m_impl = newImpl.release(); 121 } 122 123 template <typename CharacterType> 124 inline void String::appendInternal(CharacterType c) 125 { 126 // FIXME: This is extremely inefficient. So much so that we might want to take this 127 // out of String's API. We can make it better by optimizing the case where exactly 128 // one String is pointing at this StringImpl, but even then it's going to require a 129 // call into the allocator every single time. 130 if (!m_impl) { 131 m_impl = StringImpl::create(&c, 1); 132 return; 133 } 134 135 UChar* data; // FIXME: We should be able to create an 8 bit string via this code path. 136 RELEASE_ASSERT(m_impl->length() < numeric_limits<unsigned>::max()); 137 RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(m_impl->length() + 1, data); 138 if (m_impl->is8Bit()) 139 StringImpl::copyChars(data, m_impl->characters8(), m_impl->length()); 140 else 141 StringImpl::copyChars(data, m_impl->characters16(), m_impl->length()); 142 data[m_impl->length()] = c; 143 m_impl = newImpl.release(); 144 } 145 146 void String::append(LChar c) 147 { 148 appendInternal(c); 149 } 150 151 void String::append(UChar c) 152 { 153 appendInternal(c); 154 } 155 156 int codePointCompare(const String& a, const String& b) 157 { 158 return codePointCompare(a.impl(), b.impl()); 159 } 160 161 void String::insert(const String& string, unsigned position) 162 { 163 if (string.isEmpty()) { 164 if (string.isNull()) 165 return; 166 if (isNull()) 167 m_impl = string.impl(); 168 return; 169 } 170 171 if (string.is8Bit()) 172 insert(string.impl()->characters8(), string.length(), position); 173 else 174 insert(string.impl()->characters16(), string.length(), position); 175 } 176 177 void String::append(const LChar* charactersToAppend, unsigned lengthToAppend) 178 { 179 if (!m_impl) { 180 if (!charactersToAppend) 181 return; 182 m_impl = StringImpl::create(charactersToAppend, lengthToAppend); 183 return; 184 } 185 186 if (!lengthToAppend) 187 return; 188 189 ASSERT(charactersToAppend); 190 191 unsigned strLength = m_impl->length(); 192 193 if (m_impl->is8Bit()) { 194 RELEASE_ASSERT(lengthToAppend <= numeric_limits<unsigned>::max() - strLength); 195 LChar* data; 196 RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(strLength + lengthToAppend, data); 197 StringImpl::copyChars(data, m_impl->characters8(), strLength); 198 StringImpl::copyChars(data + strLength, charactersToAppend, lengthToAppend); 199 m_impl = newImpl.release(); 200 return; 201 } 202 203 RELEASE_ASSERT(lengthToAppend <= numeric_limits<unsigned>::max() - strLength); 204 UChar* data; 205 RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(length() + lengthToAppend, data); 206 StringImpl::copyChars(data, m_impl->characters16(), strLength); 207 StringImpl::copyChars(data + strLength, charactersToAppend, lengthToAppend); 208 m_impl = newImpl.release(); 209 } 210 211 void String::append(const UChar* charactersToAppend, unsigned lengthToAppend) 212 { 213 if (!m_impl) { 214 if (!charactersToAppend) 215 return; 216 m_impl = StringImpl::create(charactersToAppend, lengthToAppend); 217 return; 218 } 219 220 if (!lengthToAppend) 221 return; 222 223 unsigned strLength = m_impl->length(); 224 225 ASSERT(charactersToAppend); 226 RELEASE_ASSERT(lengthToAppend <= numeric_limits<unsigned>::max() - strLength); 227 UChar* data; 228 RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(strLength + lengthToAppend, data); 229 if (m_impl->is8Bit()) 230 StringImpl::copyChars(data, characters8(), strLength); 231 else 232 StringImpl::copyChars(data, characters16(), strLength); 233 StringImpl::copyChars(data + strLength, charactersToAppend, lengthToAppend); 234 m_impl = newImpl.release(); 235 } 236 237 template<typename CharType> 238 PassRefPtr<StringImpl> insertInternal(PassRefPtr<StringImpl> impl, const CharType* charactersToInsert, unsigned lengthToInsert, unsigned position) 239 { 240 if (!lengthToInsert) 241 return impl; 242 243 ASSERT(charactersToInsert); 244 UChar* data; // FIXME: We should be able to create an 8 bit string here. 245 RELEASE_ASSERT(lengthToInsert <= numeric_limits<unsigned>::max() - impl->length()); 246 RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(impl->length() + lengthToInsert, data); 247 248 if (impl->is8Bit()) 249 StringImpl::copyChars(data, impl->characters8(), position); 250 else 251 StringImpl::copyChars(data, impl->characters16(), position); 252 253 StringImpl::copyChars(data + position, charactersToInsert, lengthToInsert); 254 255 if (impl->is8Bit()) 256 StringImpl::copyChars(data + position + lengthToInsert, impl->characters8() + position, impl->length() - position); 257 else 258 StringImpl::copyChars(data + position + lengthToInsert, impl->characters16() + position, impl->length() - position); 259 260 return newImpl.release(); 261 } 262 263 void String::insert(const UChar* charactersToInsert, unsigned lengthToInsert, unsigned position) 264 { 265 if (position >= length()) { 266 append(charactersToInsert, lengthToInsert); 267 return; 268 } 269 ASSERT(m_impl); 270 m_impl = insertInternal(m_impl.release(), charactersToInsert, lengthToInsert, position); 271 } 272 273 void String::insert(const LChar* charactersToInsert, unsigned lengthToInsert, unsigned position) 274 { 275 if (position >= length()) { 276 append(charactersToInsert, lengthToInsert); 277 return; 278 } 279 ASSERT(m_impl); 280 m_impl = insertInternal(m_impl.release(), charactersToInsert, lengthToInsert, position); 281 } 282 283 UChar32 String::characterStartingAt(unsigned i) const 284 { 285 if (!m_impl || i >= m_impl->length()) 286 return 0; 287 return m_impl->characterStartingAt(i); 288 } 289 290 void String::ensure16Bit() 291 { 292 unsigned length = this->length(); 293 if (!length || !is8Bit()) 294 return; 295 m_impl = make16BitFrom8BitSource(m_impl->characters8(), length).impl(); 296 } 297 298 void String::truncate(unsigned position) 299 { 300 if (position >= length()) 301 return; 302 if (m_impl->is8Bit()) { 303 LChar* data; 304 RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(position, data); 305 memcpy(data, m_impl->characters8(), position * sizeof(LChar)); 306 m_impl = newImpl.release(); 307 } else { 308 UChar* data; 309 RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(position, data); 310 memcpy(data, m_impl->characters16(), position * sizeof(UChar)); 311 m_impl = newImpl.release(); 312 } 313 } 314 315 template <typename CharacterType> 316 inline void String::removeInternal(const CharacterType* characters, unsigned position, int lengthToRemove) 317 { 318 CharacterType* data; 319 RefPtr<StringImpl> newImpl = StringImpl::createUninitialized(length() - lengthToRemove, data); 320 memcpy(data, characters, position * sizeof(CharacterType)); 321 memcpy(data + position, characters + position + lengthToRemove, 322 (length() - lengthToRemove - position) * sizeof(CharacterType)); 323 324 m_impl = newImpl.release(); 325 } 326 327 void String::remove(unsigned position, int lengthToRemove) 328 { 329 if (lengthToRemove <= 0) 330 return; 331 if (position >= length()) 332 return; 333 if (static_cast<unsigned>(lengthToRemove) > length() - position) 334 lengthToRemove = length() - position; 335 336 if (is8Bit()) { 337 removeInternal(characters8(), position, lengthToRemove); 338 339 return; 340 } 341 342 removeInternal(characters16(), position, lengthToRemove); 343 } 344 345 String String::substring(unsigned pos, unsigned len) const 346 { 347 if (!m_impl) 348 return String(); 349 return m_impl->substring(pos, len); 350 } 351 352 String String::lower() const 353 { 354 if (!m_impl) 355 return String(); 356 return m_impl->lower(); 357 } 358 359 String String::upper() const 360 { 361 if (!m_impl) 362 return String(); 363 return m_impl->upper(); 364 } 365 366 String String::lower(const AtomicString& localeIdentifier) const 367 { 368 if (!m_impl) 369 return String(); 370 return m_impl->lower(localeIdentifier); 371 } 372 373 String String::upper(const AtomicString& localeIdentifier) const 374 { 375 if (!m_impl) 376 return String(); 377 return m_impl->upper(localeIdentifier); 378 } 379 380 String String::stripWhiteSpace() const 381 { 382 if (!m_impl) 383 return String(); 384 return m_impl->stripWhiteSpace(); 385 } 386 387 String String::stripWhiteSpace(IsWhiteSpaceFunctionPtr isWhiteSpace) const 388 { 389 if (!m_impl) 390 return String(); 391 return m_impl->stripWhiteSpace(isWhiteSpace); 392 } 393 394 String String::simplifyWhiteSpace(StripBehavior stripBehavior) const 395 { 396 if (!m_impl) 397 return String(); 398 return m_impl->simplifyWhiteSpace(stripBehavior); 399 } 400 401 String String::simplifyWhiteSpace(IsWhiteSpaceFunctionPtr isWhiteSpace, StripBehavior stripBehavior) const 402 { 403 if (!m_impl) 404 return String(); 405 return m_impl->simplifyWhiteSpace(isWhiteSpace, stripBehavior); 406 } 407 408 String String::removeCharacters(CharacterMatchFunctionPtr findMatch) const 409 { 410 if (!m_impl) 411 return String(); 412 return m_impl->removeCharacters(findMatch); 413 } 414 415 String String::foldCase() const 416 { 417 if (!m_impl) 418 return String(); 419 return m_impl->foldCase(); 420 } 421 422 bool String::percentage(int& result) const 423 { 424 if (!m_impl || !m_impl->length()) 425 return false; 426 427 if ((*m_impl)[m_impl->length() - 1] != '%') 428 return false; 429 430 if (m_impl->is8Bit()) 431 result = charactersToIntStrict(m_impl->characters8(), m_impl->length() - 1); 432 else 433 result = charactersToIntStrict(m_impl->characters16(), m_impl->length() - 1); 434 435 return true; 436 } 437 438 Vector<UChar> String::charactersWithNullTermination() const 439 { 440 if (!m_impl) 441 return Vector<UChar>(); 442 443 Vector<UChar> result; 444 result.reserveInitialCapacity(length() + 1); 445 appendTo(result); 446 result.append(0); 447 return result; 448 } 449 450 unsigned String::copyTo(UChar* buffer, unsigned pos, unsigned maxLength) const 451 { 452 unsigned length = this->length(); 453 RELEASE_ASSERT(pos <= length); 454 unsigned numCharacters = std::min(length - pos, maxLength); 455 if (!numCharacters) 456 return 0; 457 if (is8Bit()) 458 StringImpl::copyChars(buffer, characters8() + pos, numCharacters); 459 else 460 StringImpl::copyChars(buffer, characters16() + pos, numCharacters); 461 return numCharacters; 462 } 463 464 String String::format(const char *format, ...) 465 { 466 va_list args; 467 va_start(args, format); 468 469 Vector<char, 256> buffer; 470 471 // Do the format once to get the length. 472 #if COMPILER(MSVC) 473 int result = _vscprintf(format, args); 474 #else 475 char ch; 476 int result = vsnprintf(&ch, 1, format, args); 477 // We need to call va_end() and then va_start() again here, as the 478 // contents of args is undefined after the call to vsnprintf 479 // according to http://man.cx/snprintf(3) 480 // 481 // Not calling va_end/va_start here happens to work on lots of 482 // systems, but fails e.g. on 64bit Linux. 483 va_end(args); 484 va_start(args, format); 485 #endif 486 487 if (result == 0) 488 return String(""); 489 if (result < 0) 490 return String(); 491 unsigned len = result; 492 buffer.grow(len + 1); 493 494 // Now do the formatting again, guaranteed to fit. 495 vsnprintf(buffer.data(), buffer.size(), format, args); 496 497 va_end(args); 498 499 return StringImpl::create(reinterpret_cast<const LChar*>(buffer.data()), len); 500 } 501 502 String String::number(int number) 503 { 504 return numberToStringSigned<String>(number); 505 } 506 507 String String::number(unsigned int number) 508 { 509 return numberToStringUnsigned<String>(number); 510 } 511 512 String String::number(long number) 513 { 514 return numberToStringSigned<String>(number); 515 } 516 517 String String::number(unsigned long number) 518 { 519 return numberToStringUnsigned<String>(number); 520 } 521 522 String String::number(long long number) 523 { 524 return numberToStringSigned<String>(number); 525 } 526 527 String String::number(unsigned long long number) 528 { 529 return numberToStringUnsigned<String>(number); 530 } 531 532 String String::number(double number, unsigned precision, TrailingZerosTruncatingPolicy trailingZerosTruncatingPolicy) 533 { 534 NumberToStringBuffer buffer; 535 return String(numberToFixedPrecisionString(number, precision, buffer, trailingZerosTruncatingPolicy == TruncateTrailingZeros)); 536 } 537 538 String String::numberToStringECMAScript(double number) 539 { 540 NumberToStringBuffer buffer; 541 return String(numberToString(number, buffer)); 542 } 543 544 String String::numberToStringFixedWidth(double number, unsigned decimalPlaces) 545 { 546 NumberToStringBuffer buffer; 547 return String(numberToFixedWidthString(number, decimalPlaces, buffer)); 548 } 549 550 int String::toIntStrict(bool* ok, int base) const 551 { 552 if (!m_impl) { 553 if (ok) 554 *ok = false; 555 return 0; 556 } 557 return m_impl->toIntStrict(ok, base); 558 } 559 560 unsigned String::toUIntStrict(bool* ok, int base) const 561 { 562 if (!m_impl) { 563 if (ok) 564 *ok = false; 565 return 0; 566 } 567 return m_impl->toUIntStrict(ok, base); 568 } 569 570 int64_t String::toInt64Strict(bool* ok, int base) const 571 { 572 if (!m_impl) { 573 if (ok) 574 *ok = false; 575 return 0; 576 } 577 return m_impl->toInt64Strict(ok, base); 578 } 579 580 uint64_t String::toUInt64Strict(bool* ok, int base) const 581 { 582 if (!m_impl) { 583 if (ok) 584 *ok = false; 585 return 0; 586 } 587 return m_impl->toUInt64Strict(ok, base); 588 } 589 590 intptr_t String::toIntPtrStrict(bool* ok, int base) const 591 { 592 if (!m_impl) { 593 if (ok) 594 *ok = false; 595 return 0; 596 } 597 return m_impl->toIntPtrStrict(ok, base); 598 } 599 600 int String::toInt(bool* ok) const 601 { 602 if (!m_impl) { 603 if (ok) 604 *ok = false; 605 return 0; 606 } 607 return m_impl->toInt(ok); 608 } 609 610 unsigned String::toUInt(bool* ok) const 611 { 612 if (!m_impl) { 613 if (ok) 614 *ok = false; 615 return 0; 616 } 617 return m_impl->toUInt(ok); 618 } 619 620 int64_t String::toInt64(bool* ok) const 621 { 622 if (!m_impl) { 623 if (ok) 624 *ok = false; 625 return 0; 626 } 627 return m_impl->toInt64(ok); 628 } 629 630 uint64_t String::toUInt64(bool* ok) const 631 { 632 if (!m_impl) { 633 if (ok) 634 *ok = false; 635 return 0; 636 } 637 return m_impl->toUInt64(ok); 638 } 639 640 intptr_t String::toIntPtr(bool* ok) const 641 { 642 if (!m_impl) { 643 if (ok) 644 *ok = false; 645 return 0; 646 } 647 return m_impl->toIntPtr(ok); 648 } 649 650 double String::toDouble(bool* ok) const 651 { 652 if (!m_impl) { 653 if (ok) 654 *ok = false; 655 return 0.0; 656 } 657 return m_impl->toDouble(ok); 658 } 659 660 float String::toFloat(bool* ok) const 661 { 662 if (!m_impl) { 663 if (ok) 664 *ok = false; 665 return 0.0f; 666 } 667 return m_impl->toFloat(ok); 668 } 669 670 String String::isolatedCopy() const 671 { 672 if (!m_impl) 673 return String(); 674 return m_impl->isolatedCopy(); 675 } 676 677 bool String::isSafeToSendToAnotherThread() const 678 { 679 if (!impl()) 680 return true; 681 if (impl()->isStatic()) 682 return true; 683 // AtomicStrings are not safe to send between threads as ~StringImpl() 684 // will try to remove them from the wrong AtomicStringTable. 685 if (impl()->isAtomic()) 686 return false; 687 if (impl()->hasOneRef()) 688 return true; 689 return false; 690 } 691 692 void String::split(const String& separator, bool allowEmptyEntries, Vector<String>& result) const 693 { 694 result.clear(); 695 696 unsigned startPos = 0; 697 size_t endPos; 698 while ((endPos = find(separator, startPos)) != kNotFound) { 699 if (allowEmptyEntries || startPos != endPos) 700 result.append(substring(startPos, endPos - startPos)); 701 startPos = endPos + separator.length(); 702 } 703 if (allowEmptyEntries || startPos != length()) 704 result.append(substring(startPos)); 705 } 706 707 void String::split(UChar separator, bool allowEmptyEntries, Vector<String>& result) const 708 { 709 result.clear(); 710 711 unsigned startPos = 0; 712 size_t endPos; 713 while ((endPos = find(separator, startPos)) != kNotFound) { 714 if (allowEmptyEntries || startPos != endPos) 715 result.append(substring(startPos, endPos - startPos)); 716 startPos = endPos + 1; 717 } 718 if (allowEmptyEntries || startPos != length()) 719 result.append(substring(startPos)); 720 } 721 722 CString String::ascii() const 723 { 724 // Printable ASCII characters 32..127 and the null character are 725 // preserved, characters outside of this range are converted to '?'. 726 727 unsigned length = this->length(); 728 if (!length) { 729 char* characterBuffer; 730 return CString::newUninitialized(length, characterBuffer); 731 } 732 733 if (this->is8Bit()) { 734 const LChar* characters = this->characters8(); 735 736 char* characterBuffer; 737 CString result = CString::newUninitialized(length, characterBuffer); 738 739 for (unsigned i = 0; i < length; ++i) { 740 LChar ch = characters[i]; 741 characterBuffer[i] = ch && (ch < 0x20 || ch > 0x7f) ? '?' : ch; 742 } 743 744 return result; 745 } 746 747 const UChar* characters = this->characters16(); 748 749 char* characterBuffer; 750 CString result = CString::newUninitialized(length, characterBuffer); 751 752 for (unsigned i = 0; i < length; ++i) { 753 UChar ch = characters[i]; 754 characterBuffer[i] = ch && (ch < 0x20 || ch > 0x7f) ? '?' : ch; 755 } 756 757 return result; 758 } 759 760 CString String::latin1() const 761 { 762 // Basic Latin1 (ISO) encoding - Unicode characters 0..255 are 763 // preserved, characters outside of this range are converted to '?'. 764 765 unsigned length = this->length(); 766 767 if (!length) 768 return CString("", 0); 769 770 if (is8Bit()) 771 return CString(reinterpret_cast<const char*>(this->characters8()), length); 772 773 const UChar* characters = this->characters16(); 774 775 char* characterBuffer; 776 CString result = CString::newUninitialized(length, characterBuffer); 777 778 for (unsigned i = 0; i < length; ++i) { 779 UChar ch = characters[i]; 780 characterBuffer[i] = ch > 0xff ? '?' : ch; 781 } 782 783 return result; 784 } 785 786 // Helper to write a three-byte UTF-8 code point to the buffer, caller must check room is available. 787 static inline void putUTF8Triple(char*& buffer, UChar ch) 788 { 789 ASSERT(ch >= 0x0800); 790 *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) | 0xE0); 791 *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) | 0x80); 792 *buffer++ = static_cast<char>((ch & 0x3F) | 0x80); 793 } 794 795 CString String::utf8(ConversionMode mode) const 796 { 797 unsigned length = this->length(); 798 799 if (!length) 800 return CString("", 0); 801 802 // Allocate a buffer big enough to hold all the characters 803 // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes). 804 // Optimization ideas, if we find this function is hot: 805 // * We could speculatively create a CStringBuffer to contain 'length' 806 // characters, and resize if necessary (i.e. if the buffer contains 807 // non-ascii characters). (Alternatively, scan the buffer first for 808 // ascii characters, so we know this will be sufficient). 809 // * We could allocate a CStringBuffer with an appropriate size to 810 // have a good chance of being able to write the string into the 811 // buffer without reallocing (say, 1.5 x length). 812 if (length > numeric_limits<unsigned>::max() / 3) 813 return CString(); 814 Vector<char, 1024> bufferVector(length * 3); 815 816 char* buffer = bufferVector.data(); 817 818 if (is8Bit()) { 819 const LChar* characters = this->characters8(); 820 821 ConversionResult result = convertLatin1ToUTF8(&characters, characters + length, &buffer, buffer + bufferVector.size()); 822 ASSERT_UNUSED(result, result != targetExhausted); // (length * 3) should be sufficient for any conversion 823 } else { 824 const UChar* characters = this->characters16(); 825 826 if (mode == StrictConversionReplacingUnpairedSurrogatesWithFFFD) { 827 const UChar* charactersEnd = characters + length; 828 char* bufferEnd = buffer + bufferVector.size(); 829 while (characters < charactersEnd) { 830 // Use strict conversion to detect unpaired surrogates. 831 ConversionResult result = convertUTF16ToUTF8(&characters, charactersEnd, &buffer, bufferEnd, true); 832 ASSERT(result != targetExhausted); 833 // Conversion fails when there is an unpaired surrogate. 834 // Put replacement character (U+FFFD) instead of the unpaired surrogate. 835 if (result != conversionOK) { 836 ASSERT((0xD800 <= *characters && *characters <= 0xDFFF)); 837 // There should be room left, since one UChar hasn't been converted. 838 ASSERT((buffer + 3) <= bufferEnd); 839 putUTF8Triple(buffer, replacementCharacter); 840 ++characters; 841 } 842 } 843 } else { 844 bool strict = mode == StrictConversion; 845 ConversionResult result = convertUTF16ToUTF8(&characters, characters + length, &buffer, buffer + bufferVector.size(), strict); 846 ASSERT(result != targetExhausted); // (length * 3) should be sufficient for any conversion 847 848 // Only produced from strict conversion. 849 if (result == sourceIllegal) { 850 ASSERT(strict); 851 return CString(); 852 } 853 854 // Check for an unconverted high surrogate. 855 if (result == sourceExhausted) { 856 if (strict) 857 return CString(); 858 // This should be one unpaired high surrogate. Treat it the same 859 // was as an unpaired high surrogate would have been handled in 860 // the middle of a string with non-strict conversion - which is 861 // to say, simply encode it to UTF-8. 862 ASSERT((characters + 1) == (this->characters16() + length)); 863 ASSERT((*characters >= 0xD800) && (*characters <= 0xDBFF)); 864 // There should be room left, since one UChar hasn't been converted. 865 ASSERT((buffer + 3) <= (buffer + bufferVector.size())); 866 putUTF8Triple(buffer, *characters); 867 } 868 } 869 } 870 871 return CString(bufferVector.data(), buffer - bufferVector.data()); 872 } 873 874 String String::make8BitFrom16BitSource(const UChar* source, size_t length) 875 { 876 if (!length) 877 return String(); 878 879 LChar* destination; 880 String result = String::createUninitialized(length, destination); 881 882 copyLCharsFromUCharSource(destination, source, length); 883 884 return result; 885 } 886 887 String String::make16BitFrom8BitSource(const LChar* source, size_t length) 888 { 889 if (!length) 890 return String(); 891 892 UChar* destination; 893 String result = String::createUninitialized(length, destination); 894 895 StringImpl::copyChars(destination, source, length); 896 897 return result; 898 } 899 900 String String::fromUTF8(const LChar* stringStart, size_t length) 901 { 902 RELEASE_ASSERT(length <= numeric_limits<unsigned>::max()); 903 904 if (!stringStart) 905 return String(); 906 907 if (!length) 908 return emptyString(); 909 910 if (charactersAreAllASCII(stringStart, length)) 911 return StringImpl::create(stringStart, length); 912 913 Vector<UChar, 1024> buffer(length); 914 UChar* bufferStart = buffer.data(); 915 916 UChar* bufferCurrent = bufferStart; 917 const char* stringCurrent = reinterpret_cast<const char*>(stringStart); 918 if (convertUTF8ToUTF16(&stringCurrent, reinterpret_cast<const char *>(stringStart + length), &bufferCurrent, bufferCurrent + buffer.size()) != conversionOK) 919 return String(); 920 921 unsigned utf16Length = bufferCurrent - bufferStart; 922 ASSERT(utf16Length < length); 923 return StringImpl::create(bufferStart, utf16Length); 924 } 925 926 String String::fromUTF8(const LChar* string) 927 { 928 if (!string) 929 return String(); 930 return fromUTF8(string, strlen(reinterpret_cast<const char*>(string))); 931 } 932 933 String String::fromUTF8(const CString& s) 934 { 935 return fromUTF8(s.data()); 936 } 937 938 String String::fromUTF8WithLatin1Fallback(const LChar* string, size_t size) 939 { 940 String utf8 = fromUTF8(string, size); 941 if (!utf8) 942 return String(string, size); 943 return utf8; 944 } 945 946 // String Operations 947 948 static bool isCharacterAllowedInBase(UChar c, int base) 949 { 950 if (c > 0x7F) 951 return false; 952 if (isASCIIDigit(c)) 953 return c - '0' < base; 954 if (isASCIIAlpha(c)) { 955 if (base > 36) 956 base = 36; 957 return (c >= 'a' && c < 'a' + base - 10) 958 || (c >= 'A' && c < 'A' + base - 10); 959 } 960 return false; 961 } 962 963 template <typename IntegralType, typename CharType> 964 static inline IntegralType toIntegralType(const CharType* data, size_t length, bool* ok, int base) 965 { 966 static const IntegralType integralMax = numeric_limits<IntegralType>::max(); 967 static const bool isSigned = numeric_limits<IntegralType>::is_signed; 968 const IntegralType maxMultiplier = integralMax / base; 969 970 IntegralType value = 0; 971 bool isOk = false; 972 bool isNegative = false; 973 974 if (!data) 975 goto bye; 976 977 // skip leading whitespace 978 while (length && isSpaceOrNewline(*data)) { 979 --length; 980 ++data; 981 } 982 983 if (isSigned && length && *data == '-') { 984 --length; 985 ++data; 986 isNegative = true; 987 } else if (length && *data == '+') { 988 --length; 989 ++data; 990 } 991 992 if (!length || !isCharacterAllowedInBase(*data, base)) 993 goto bye; 994 995 while (length && isCharacterAllowedInBase(*data, base)) { 996 --length; 997 IntegralType digitValue; 998 CharType c = *data; 999 if (isASCIIDigit(c)) 1000 digitValue = c - '0'; 1001 else if (c >= 'a') 1002 digitValue = c - 'a' + 10; 1003 else 1004 digitValue = c - 'A' + 10; 1005 1006 if (value > maxMultiplier || (value == maxMultiplier && digitValue > (integralMax % base) + isNegative)) 1007 goto bye; 1008 1009 value = base * value + digitValue; 1010 ++data; 1011 } 1012 1013 #if COMPILER(MSVC) 1014 #pragma warning(push, 0) 1015 #pragma warning(disable:4146) 1016 #endif 1017 1018 if (isNegative) 1019 value = -value; 1020 1021 #if COMPILER(MSVC) 1022 #pragma warning(pop) 1023 #endif 1024 1025 // skip trailing space 1026 while (length && isSpaceOrNewline(*data)) { 1027 --length; 1028 ++data; 1029 } 1030 1031 if (!length) 1032 isOk = true; 1033 bye: 1034 if (ok) 1035 *ok = isOk; 1036 return isOk ? value : 0; 1037 } 1038 1039 template <typename CharType> 1040 static unsigned lengthOfCharactersAsInteger(const CharType* data, size_t length) 1041 { 1042 size_t i = 0; 1043 1044 // Allow leading spaces. 1045 for (; i != length; ++i) { 1046 if (!isSpaceOrNewline(data[i])) 1047 break; 1048 } 1049 1050 // Allow sign. 1051 if (i != length && (data[i] == '+' || data[i] == '-')) 1052 ++i; 1053 1054 // Allow digits. 1055 for (; i != length; ++i) { 1056 if (!isASCIIDigit(data[i])) 1057 break; 1058 } 1059 1060 return i; 1061 } 1062 1063 int charactersToIntStrict(const LChar* data, size_t length, bool* ok, int base) 1064 { 1065 return toIntegralType<int, LChar>(data, length, ok, base); 1066 } 1067 1068 int charactersToIntStrict(const UChar* data, size_t length, bool* ok, int base) 1069 { 1070 return toIntegralType<int, UChar>(data, length, ok, base); 1071 } 1072 1073 unsigned charactersToUIntStrict(const LChar* data, size_t length, bool* ok, int base) 1074 { 1075 return toIntegralType<unsigned, LChar>(data, length, ok, base); 1076 } 1077 1078 unsigned charactersToUIntStrict(const UChar* data, size_t length, bool* ok, int base) 1079 { 1080 return toIntegralType<unsigned, UChar>(data, length, ok, base); 1081 } 1082 1083 int64_t charactersToInt64Strict(const LChar* data, size_t length, bool* ok, int base) 1084 { 1085 return toIntegralType<int64_t, LChar>(data, length, ok, base); 1086 } 1087 1088 int64_t charactersToInt64Strict(const UChar* data, size_t length, bool* ok, int base) 1089 { 1090 return toIntegralType<int64_t, UChar>(data, length, ok, base); 1091 } 1092 1093 uint64_t charactersToUInt64Strict(const LChar* data, size_t length, bool* ok, int base) 1094 { 1095 return toIntegralType<uint64_t, LChar>(data, length, ok, base); 1096 } 1097 1098 uint64_t charactersToUInt64Strict(const UChar* data, size_t length, bool* ok, int base) 1099 { 1100 return toIntegralType<uint64_t, UChar>(data, length, ok, base); 1101 } 1102 1103 intptr_t charactersToIntPtrStrict(const LChar* data, size_t length, bool* ok, int base) 1104 { 1105 return toIntegralType<intptr_t, LChar>(data, length, ok, base); 1106 } 1107 1108 intptr_t charactersToIntPtrStrict(const UChar* data, size_t length, bool* ok, int base) 1109 { 1110 return toIntegralType<intptr_t, UChar>(data, length, ok, base); 1111 } 1112 1113 int charactersToInt(const LChar* data, size_t length, bool* ok) 1114 { 1115 return toIntegralType<int, LChar>(data, lengthOfCharactersAsInteger<LChar>(data, length), ok, 10); 1116 } 1117 1118 int charactersToInt(const UChar* data, size_t length, bool* ok) 1119 { 1120 return toIntegralType<int, UChar>(data, lengthOfCharactersAsInteger(data, length), ok, 10); 1121 } 1122 1123 unsigned charactersToUInt(const LChar* data, size_t length, bool* ok) 1124 { 1125 return toIntegralType<unsigned, LChar>(data, lengthOfCharactersAsInteger<LChar>(data, length), ok, 10); 1126 } 1127 1128 unsigned charactersToUInt(const UChar* data, size_t length, bool* ok) 1129 { 1130 return toIntegralType<unsigned, UChar>(data, lengthOfCharactersAsInteger<UChar>(data, length), ok, 10); 1131 } 1132 1133 int64_t charactersToInt64(const LChar* data, size_t length, bool* ok) 1134 { 1135 return toIntegralType<int64_t, LChar>(data, lengthOfCharactersAsInteger<LChar>(data, length), ok, 10); 1136 } 1137 1138 int64_t charactersToInt64(const UChar* data, size_t length, bool* ok) 1139 { 1140 return toIntegralType<int64_t, UChar>(data, lengthOfCharactersAsInteger<UChar>(data, length), ok, 10); 1141 } 1142 1143 uint64_t charactersToUInt64(const LChar* data, size_t length, bool* ok) 1144 { 1145 return toIntegralType<uint64_t, LChar>(data, lengthOfCharactersAsInteger<LChar>(data, length), ok, 10); 1146 } 1147 1148 uint64_t charactersToUInt64(const UChar* data, size_t length, bool* ok) 1149 { 1150 return toIntegralType<uint64_t, UChar>(data, lengthOfCharactersAsInteger<UChar>(data, length), ok, 10); 1151 } 1152 1153 intptr_t charactersToIntPtr(const LChar* data, size_t length, bool* ok) 1154 { 1155 return toIntegralType<intptr_t, LChar>(data, lengthOfCharactersAsInteger<LChar>(data, length), ok, 10); 1156 } 1157 1158 intptr_t charactersToIntPtr(const UChar* data, size_t length, bool* ok) 1159 { 1160 return toIntegralType<intptr_t, UChar>(data, lengthOfCharactersAsInteger<UChar>(data, length), ok, 10); 1161 } 1162 1163 enum TrailingJunkPolicy { DisallowTrailingJunk, AllowTrailingJunk }; 1164 1165 template <typename CharType, TrailingJunkPolicy policy> 1166 static inline double toDoubleType(const CharType* data, size_t length, bool* ok, size_t& parsedLength) 1167 { 1168 size_t leadingSpacesLength = 0; 1169 while (leadingSpacesLength < length && isASCIISpace(data[leadingSpacesLength])) 1170 ++leadingSpacesLength; 1171 1172 double number = parseDouble(data + leadingSpacesLength, length - leadingSpacesLength, parsedLength); 1173 if (!parsedLength) { 1174 if (ok) 1175 *ok = false; 1176 return 0.0; 1177 } 1178 1179 parsedLength += leadingSpacesLength; 1180 if (ok) 1181 *ok = policy == AllowTrailingJunk || parsedLength == length; 1182 return number; 1183 } 1184 1185 double charactersToDouble(const LChar* data, size_t length, bool* ok) 1186 { 1187 size_t parsedLength; 1188 return toDoubleType<LChar, DisallowTrailingJunk>(data, length, ok, parsedLength); 1189 } 1190 1191 double charactersToDouble(const UChar* data, size_t length, bool* ok) 1192 { 1193 size_t parsedLength; 1194 return toDoubleType<UChar, DisallowTrailingJunk>(data, length, ok, parsedLength); 1195 } 1196 1197 float charactersToFloat(const LChar* data, size_t length, bool* ok) 1198 { 1199 // FIXME: This will return ok even when the string fits into a double but not a float. 1200 size_t parsedLength; 1201 return static_cast<float>(toDoubleType<LChar, DisallowTrailingJunk>(data, length, ok, parsedLength)); 1202 } 1203 1204 float charactersToFloat(const UChar* data, size_t length, bool* ok) 1205 { 1206 // FIXME: This will return ok even when the string fits into a double but not a float. 1207 size_t parsedLength; 1208 return static_cast<float>(toDoubleType<UChar, DisallowTrailingJunk>(data, length, ok, parsedLength)); 1209 } 1210 1211 float charactersToFloat(const LChar* data, size_t length, size_t& parsedLength) 1212 { 1213 // FIXME: This will return ok even when the string fits into a double but not a float. 1214 return static_cast<float>(toDoubleType<LChar, AllowTrailingJunk>(data, length, 0, parsedLength)); 1215 } 1216 1217 float charactersToFloat(const UChar* data, size_t length, size_t& parsedLength) 1218 { 1219 // FIXME: This will return ok even when the string fits into a double but not a float. 1220 return static_cast<float>(toDoubleType<UChar, AllowTrailingJunk>(data, length, 0, parsedLength)); 1221 } 1222 1223 const String& emptyString() 1224 { 1225 DEFINE_STATIC_LOCAL(String, emptyString, (StringImpl::empty())); 1226 return emptyString; 1227 } 1228 1229 } // namespace WTF 1230 1231 #ifndef NDEBUG 1232 // For use in the debugger 1233 String* string(const char*); 1234 Vector<char> asciiDebug(StringImpl* impl); 1235 Vector<char> asciiDebug(String& string); 1236 1237 void String::show() const 1238 { 1239 dataLogF("%s\n", asciiDebug(impl()).data()); 1240 } 1241 1242 String* string(const char* s) 1243 { 1244 // leaks memory! 1245 return new String(s); 1246 } 1247 1248 Vector<char> asciiDebug(StringImpl* impl) 1249 { 1250 if (!impl) 1251 return asciiDebug(String("[null]").impl()); 1252 1253 Vector<char> buffer; 1254 for (unsigned i = 0; i < impl->length(); ++i) { 1255 UChar ch = (*impl)[i]; 1256 if (isASCIIPrintable(ch)) { 1257 if (ch == '\\') 1258 buffer.append(ch); 1259 buffer.append(ch); 1260 } else { 1261 buffer.append('\\'); 1262 buffer.append('u'); 1263 appendUnsignedAsHexFixedSize(ch, buffer, 4); 1264 } 1265 } 1266 buffer.append('\0'); 1267 return buffer; 1268 } 1269 1270 Vector<char> asciiDebug(String& string) 1271 { 1272 return asciiDebug(string.impl()); 1273 } 1274 1275 #endif 1276