1 /* 2 * Copyright (C) 1999-2000 Harri Porten (porten (at) kde.org) 3 * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009 Apple Inc. All rights reserved. 4 * Copyright (C) 2007 Cameron Zwarich (cwzwarich (at) uwaterloo.ca) 5 * Copyright (C) 2009 Google Inc. All rights reserved. 6 * 7 * This library is free software; you can redistribute it and/or 8 * modify it under the terms of the GNU Library General Public 9 * License as published by the Free Software Foundation; either 10 * version 2 of the License, or (at your option) any later version. 11 * 12 * This library is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 * Library General Public License for more details. 16 * 17 * You should have received a copy of the GNU Library General Public License 18 * along with this library; see the file COPYING.LIB. If not, write to 19 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 20 * Boston, MA 02110-1301, USA. 21 * 22 */ 23 24 #include "config.h" 25 #include "UString.h" 26 27 #include "JSGlobalObjectFunctions.h" 28 #include "Heap.h" 29 #include "Identifier.h" 30 #include "Operations.h" 31 #include <ctype.h> 32 #include <limits.h> 33 #include <limits> 34 #include <stdio.h> 35 #include <stdlib.h> 36 #include <wtf/ASCIICType.h> 37 #include <wtf/Assertions.h> 38 #include <wtf/DecimalNumber.h> 39 #include <wtf/MathExtras.h> 40 #include <wtf/StringExtras.h> 41 #include <wtf/Vector.h> 42 #include <wtf/unicode/UTF8.h> 43 44 #if HAVE(STRINGS_H) 45 #include <strings.h> 46 #endif 47 48 using namespace WTF; 49 using namespace WTF::Unicode; 50 using namespace std; 51 52 namespace JSC { 53 54 extern const double NaN; 55 extern const double Inf; 56 57 COMPILE_ASSERT(sizeof(UString) == sizeof(void*), UString_should_stay_small); 58 59 // Construct a string with UTF-16 data. 60 UString::UString(const UChar* characters, unsigned length) 61 : m_impl(characters ? StringImpl::create(characters, length) : 0) 62 { 63 } 64 65 // Construct a string with UTF-16 data, from a null-terminated source. 66 UString::UString(const UChar* characters) 67 { 68 if (!characters) 69 return; 70 71 int length = 0; 72 while (characters[length] != UChar(0)) 73 ++length; 74 75 m_impl = StringImpl::create(characters, length); 76 } 77 78 // Construct a string with latin1 data. 79 UString::UString(const char* characters, unsigned length) 80 : m_impl(characters ? StringImpl::create(characters, length) : 0) 81 { 82 } 83 84 // Construct a string with latin1 data, from a null-terminated source. 85 UString::UString(const char* characters) 86 : m_impl(characters ? StringImpl::create(characters) : 0) 87 { 88 } 89 90 UString UString::number(int i) 91 { 92 UChar buf[1 + sizeof(i) * 3]; 93 UChar* end = buf + WTF_ARRAY_LENGTH(buf); 94 UChar* p = end; 95 96 if (i == 0) 97 *--p = '0'; 98 else if (i == INT_MIN) { 99 char minBuf[1 + sizeof(i) * 3]; 100 snprintf(minBuf, sizeof(minBuf), "%d", INT_MIN); 101 return UString(minBuf); 102 } else { 103 bool negative = false; 104 if (i < 0) { 105 negative = true; 106 i = -i; 107 } 108 while (i) { 109 *--p = static_cast<unsigned short>((i % 10) + '0'); 110 i /= 10; 111 } 112 if (negative) 113 *--p = '-'; 114 } 115 116 return UString(p, static_cast<unsigned>(end - p)); 117 } 118 119 UString UString::number(long long i) 120 { 121 UChar buf[1 + sizeof(i) * 3]; 122 UChar* end = buf + WTF_ARRAY_LENGTH(buf); 123 UChar* p = end; 124 125 if (i == 0) 126 *--p = '0'; 127 else if (i == std::numeric_limits<long long>::min()) { 128 char minBuf[1 + sizeof(i) * 3]; 129 #if OS(WINDOWS) 130 snprintf(minBuf, sizeof(minBuf), "%I64d", std::numeric_limits<long long>::min()); 131 #else 132 snprintf(minBuf, sizeof(minBuf), "%lld", std::numeric_limits<long long>::min()); 133 #endif 134 return UString(minBuf); 135 } else { 136 bool negative = false; 137 if (i < 0) { 138 negative = true; 139 i = -i; 140 } 141 while (i) { 142 *--p = static_cast<unsigned short>((i % 10) + '0'); 143 i /= 10; 144 } 145 if (negative) 146 *--p = '-'; 147 } 148 149 return UString(p, static_cast<unsigned>(end - p)); 150 } 151 152 UString UString::number(unsigned u) 153 { 154 UChar buf[sizeof(u) * 3]; 155 UChar* end = buf + WTF_ARRAY_LENGTH(buf); 156 UChar* p = end; 157 158 if (u == 0) 159 *--p = '0'; 160 else { 161 while (u) { 162 *--p = static_cast<unsigned short>((u % 10) + '0'); 163 u /= 10; 164 } 165 } 166 167 return UString(p, static_cast<unsigned>(end - p)); 168 } 169 170 UString UString::number(long l) 171 { 172 UChar buf[1 + sizeof(l) * 3]; 173 UChar* end = buf + WTF_ARRAY_LENGTH(buf); 174 UChar* p = end; 175 176 if (l == 0) 177 *--p = '0'; 178 else if (l == LONG_MIN) { 179 char minBuf[1 + sizeof(l) * 3]; 180 snprintf(minBuf, sizeof(minBuf), "%ld", LONG_MIN); 181 return UString(minBuf); 182 } else { 183 bool negative = false; 184 if (l < 0) { 185 negative = true; 186 l = -l; 187 } 188 while (l) { 189 *--p = static_cast<unsigned short>((l % 10) + '0'); 190 l /= 10; 191 } 192 if (negative) 193 *--p = '-'; 194 } 195 196 return UString(p, end - p); 197 } 198 199 UString UString::number(double d) 200 { 201 NumberToStringBuffer buffer; 202 unsigned length = numberToString(d, buffer); 203 return UString(buffer, length); 204 } 205 206 UString UString::substringSharingImpl(unsigned offset, unsigned length) const 207 { 208 // FIXME: We used to check against a limit of Heap::minExtraCost / sizeof(UChar). 209 210 unsigned stringLength = this->length(); 211 offset = min(offset, stringLength); 212 length = min(length, stringLength - offset); 213 214 if (!offset && length == stringLength) 215 return *this; 216 return UString(StringImpl::create(m_impl, offset, length)); 217 } 218 219 bool operator==(const UString& s1, const char *s2) 220 { 221 if (s2 == 0) 222 return s1.isEmpty(); 223 224 const UChar* u = s1.characters(); 225 const UChar* uend = u + s1.length(); 226 while (u != uend && *s2) { 227 if (u[0] != (unsigned char)*s2) 228 return false; 229 s2++; 230 u++; 231 } 232 233 return u == uend && *s2 == 0; 234 } 235 236 bool operator<(const UString& s1, const UString& s2) 237 { 238 const unsigned l1 = s1.length(); 239 const unsigned l2 = s2.length(); 240 const unsigned lmin = l1 < l2 ? l1 : l2; 241 const UChar* c1 = s1.characters(); 242 const UChar* c2 = s2.characters(); 243 unsigned l = 0; 244 while (l < lmin && *c1 == *c2) { 245 c1++; 246 c2++; 247 l++; 248 } 249 if (l < lmin) 250 return (c1[0] < c2[0]); 251 252 return (l1 < l2); 253 } 254 255 bool operator>(const UString& s1, const UString& s2) 256 { 257 const unsigned l1 = s1.length(); 258 const unsigned l2 = s2.length(); 259 const unsigned lmin = l1 < l2 ? l1 : l2; 260 const UChar* c1 = s1.characters(); 261 const UChar* c2 = s2.characters(); 262 unsigned l = 0; 263 while (l < lmin && *c1 == *c2) { 264 c1++; 265 c2++; 266 l++; 267 } 268 if (l < lmin) 269 return (c1[0] > c2[0]); 270 271 return (l1 > l2); 272 } 273 274 CString UString::ascii() const 275 { 276 // Basic Latin1 (ISO) encoding - Unicode characters 0..255 are 277 // preserved, characters outside of this range are converted to '?'. 278 279 unsigned length = this->length(); 280 const UChar* characters = this->characters(); 281 282 char* characterBuffer; 283 CString result = CString::newUninitialized(length, characterBuffer); 284 285 for (unsigned i = 0; i < length; ++i) { 286 UChar ch = characters[i]; 287 characterBuffer[i] = ch && (ch < 0x20 || ch >= 0x7f) ? '?' : ch; 288 } 289 290 return result; 291 } 292 293 CString UString::latin1() const 294 { 295 // Basic Latin1 (ISO) encoding - Unicode characters 0..255 are 296 // preserved, characters outside of this range are converted to '?'. 297 298 unsigned length = this->length(); 299 const UChar* characters = this->characters(); 300 301 char* characterBuffer; 302 CString result = CString::newUninitialized(length, characterBuffer); 303 304 for (unsigned i = 0; i < length; ++i) { 305 UChar ch = characters[i]; 306 characterBuffer[i] = ch > 0xff ? '?' : ch; 307 } 308 309 return result; 310 } 311 312 // Helper to write a three-byte UTF-8 code point to the buffer, caller must check room is available. 313 static inline void putUTF8Triple(char*& buffer, UChar ch) 314 { 315 ASSERT(ch >= 0x0800); 316 *buffer++ = static_cast<char>(((ch >> 12) & 0x0F) | 0xE0); 317 *buffer++ = static_cast<char>(((ch >> 6) & 0x3F) | 0x80); 318 *buffer++ = static_cast<char>((ch & 0x3F) | 0x80); 319 } 320 321 CString UString::utf8(bool strict) const 322 { 323 unsigned length = this->length(); 324 const UChar* characters = this->characters(); 325 326 // Allocate a buffer big enough to hold all the characters 327 // (an individual UTF-16 UChar can only expand to 3 UTF-8 bytes). 328 // Optimization ideas, if we find this function is hot: 329 // * We could speculatively create a CStringBuffer to contain 'length' 330 // characters, and resize if necessary (i.e. if the buffer contains 331 // non-ascii characters). (Alternatively, scan the buffer first for 332 // ascii characters, so we know this will be sufficient). 333 // * We could allocate a CStringBuffer with an appropriate size to 334 // have a good chance of being able to write the string into the 335 // buffer without reallocing (say, 1.5 x length). 336 if (length > numeric_limits<unsigned>::max() / 3) 337 return CString(); 338 Vector<char, 1024> bufferVector(length * 3); 339 340 char* buffer = bufferVector.data(); 341 ConversionResult result = convertUTF16ToUTF8(&characters, characters + length, &buffer, buffer + bufferVector.size(), strict); 342 ASSERT(result != targetExhausted); // (length * 3) should be sufficient for any conversion 343 344 // Only produced from strict conversion. 345 if (result == sourceIllegal) 346 return CString(); 347 348 // Check for an unconverted high surrogate. 349 if (result == sourceExhausted) { 350 if (strict) 351 return CString(); 352 // This should be one unpaired high surrogate. Treat it the same 353 // was as an unpaired high surrogate would have been handled in 354 // the middle of a string with non-strict conversion - which is 355 // to say, simply encode it to UTF-8. 356 ASSERT((characters + 1) == (this->characters() + length)); 357 ASSERT((*characters >= 0xD800) && (*characters <= 0xDBFF)); 358 // There should be room left, since one UChar hasn't been converted. 359 ASSERT((buffer + 3) <= (buffer + bufferVector.size())); 360 putUTF8Triple(buffer, *characters); 361 } 362 363 return CString(bufferVector.data(), buffer - bufferVector.data()); 364 } 365 366 } // namespace JSC 367