1 // Copyright 2013 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef V8_INTL_SUPPORT 6 #error Internationalization is expected to be enabled. 7 #endif // V8_INTL_SUPPORT 8 9 #include "src/intl.h" 10 11 #include <memory> 12 13 #include "src/heap/factory.h" 14 #include "src/isolate.h" 15 #include "src/objects-inl.h" 16 #include "src/string-case.h" 17 #include "unicode/basictz.h" 18 #include "unicode/calendar.h" 19 #include "unicode/gregocal.h" 20 #include "unicode/timezone.h" 21 #include "unicode/ustring.h" 22 #include "unicode/uvernum.h" 23 #include "unicode/uversion.h" 24 25 namespace v8 { 26 namespace internal { 27 28 namespace { 29 inline bool IsASCIIUpper(uint16_t ch) { return ch >= 'A' && ch <= 'Z'; } 30 31 const uint8_t kToLower[256] = { 32 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 33 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 34 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, 0x23, 35 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F, 36 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 37 0x3C, 0x3D, 0x3E, 0x3F, 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 38 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 39 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F, 40 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 41 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 42 0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F, 0x80, 0x81, 0x82, 0x83, 43 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F, 44 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B, 45 0x9C, 0x9D, 0x9E, 0x9F, 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7, 46 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF, 0xB0, 0xB1, 0xB2, 0xB3, 47 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF, 48 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 49 0xEC, 0xED, 0xEE, 0xEF, 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xD7, 50 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF, 0xE0, 0xE1, 0xE2, 0xE3, 51 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 52 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB, 53 0xFC, 0xFD, 0xFE, 0xFF, 54 }; 55 56 inline uint16_t ToLatin1Lower(uint16_t ch) { 57 return static_cast<uint16_t>(kToLower[ch]); 58 } 59 60 inline uint16_t ToASCIIUpper(uint16_t ch) { 61 return ch & ~((ch >= 'a' && ch <= 'z') << 5); 62 } 63 64 // Does not work for U+00DF (sharp-s), U+00B5 (micron), U+00FF. 65 inline uint16_t ToLatin1Upper(uint16_t ch) { 66 DCHECK(ch != 0xDF && ch != 0xB5 && ch != 0xFF); 67 return ch & 68 ~(((ch >= 'a' && ch <= 'z') || (((ch & 0xE0) == 0xE0) && ch != 0xF7)) 69 << 5); 70 } 71 72 template <typename Char> 73 bool ToUpperFastASCII(const Vector<const Char>& src, 74 Handle<SeqOneByteString> result) { 75 // Do a faster loop for the case where all the characters are ASCII. 76 uint16_t ored = 0; 77 int32_t index = 0; 78 for (auto it = src.begin(); it != src.end(); ++it) { 79 uint16_t ch = static_cast<uint16_t>(*it); 80 ored |= ch; 81 result->SeqOneByteStringSet(index++, ToASCIIUpper(ch)); 82 } 83 return !(ored & ~0x7F); 84 } 85 86 const uint16_t sharp_s = 0xDF; 87 88 template <typename Char> 89 bool ToUpperOneByte(const Vector<const Char>& src, uint8_t* dest, 90 int* sharp_s_count) { 91 // Still pretty-fast path for the input with non-ASCII Latin-1 characters. 92 93 // There are two special cases. 94 // 1. U+00B5 and U+00FF are mapped to a character beyond U+00FF. 95 // 2. Lower case sharp-S converts to "SS" (two characters) 96 *sharp_s_count = 0; 97 for (auto it = src.begin(); it != src.end(); ++it) { 98 uint16_t ch = static_cast<uint16_t>(*it); 99 if (V8_UNLIKELY(ch == sharp_s)) { 100 ++(*sharp_s_count); 101 continue; 102 } 103 if (V8_UNLIKELY(ch == 0xB5 || ch == 0xFF)) { 104 // Since this upper-cased character does not fit in an 8-bit string, we 105 // need to take the 16-bit path. 106 return false; 107 } 108 *dest++ = ToLatin1Upper(ch); 109 } 110 111 return true; 112 } 113 114 template <typename Char> 115 void ToUpperWithSharpS(const Vector<const Char>& src, 116 Handle<SeqOneByteString> result) { 117 int32_t dest_index = 0; 118 for (auto it = src.begin(); it != src.end(); ++it) { 119 uint16_t ch = static_cast<uint16_t>(*it); 120 if (ch == sharp_s) { 121 result->SeqOneByteStringSet(dest_index++, 'S'); 122 result->SeqOneByteStringSet(dest_index++, 'S'); 123 } else { 124 result->SeqOneByteStringSet(dest_index++, ToLatin1Upper(ch)); 125 } 126 } 127 } 128 129 inline int FindFirstUpperOrNonAscii(String* s, int length) { 130 for (int index = 0; index < length; ++index) { 131 uint16_t ch = s->Get(index); 132 if (V8_UNLIKELY(IsASCIIUpper(ch) || ch & ~0x7F)) { 133 return index; 134 } 135 } 136 return length; 137 } 138 139 } // namespace 140 141 const uint8_t* ToLatin1LowerTable() { return &kToLower[0]; } 142 143 const UChar* GetUCharBufferFromFlat(const String::FlatContent& flat, 144 std::unique_ptr<uc16[]>* dest, 145 int32_t length) { 146 DCHECK(flat.IsFlat()); 147 if (flat.IsOneByte()) { 148 if (!*dest) { 149 dest->reset(NewArray<uc16>(length)); 150 CopyChars(dest->get(), flat.ToOneByteVector().start(), length); 151 } 152 return reinterpret_cast<const UChar*>(dest->get()); 153 } else { 154 return reinterpret_cast<const UChar*>(flat.ToUC16Vector().start()); 155 } 156 } 157 158 MaybeHandle<String> LocaleConvertCase(Handle<String> s, Isolate* isolate, 159 bool is_to_upper, const char* lang) { 160 auto case_converter = is_to_upper ? u_strToUpper : u_strToLower; 161 int32_t src_length = s->length(); 162 int32_t dest_length = src_length; 163 UErrorCode status; 164 Handle<SeqTwoByteString> result; 165 std::unique_ptr<uc16[]> sap; 166 167 if (dest_length == 0) return ReadOnlyRoots(isolate).empty_string_handle(); 168 169 // This is not a real loop. It'll be executed only once (no overflow) or 170 // twice (overflow). 171 for (int i = 0; i < 2; ++i) { 172 // Case conversion can increase the string length (e.g. sharp-S => SS) so 173 // that we have to handle RangeError exceptions here. 174 ASSIGN_RETURN_ON_EXCEPTION( 175 isolate, result, isolate->factory()->NewRawTwoByteString(dest_length), 176 String); 177 DisallowHeapAllocation no_gc; 178 DCHECK(s->IsFlat()); 179 String::FlatContent flat = s->GetFlatContent(); 180 const UChar* src = GetUCharBufferFromFlat(flat, &sap, src_length); 181 status = U_ZERO_ERROR; 182 dest_length = case_converter(reinterpret_cast<UChar*>(result->GetChars()), 183 dest_length, src, src_length, lang, &status); 184 if (status != U_BUFFER_OVERFLOW_ERROR) break; 185 } 186 187 // In most cases, the output will fill the destination buffer completely 188 // leading to an unterminated string (U_STRING_NOT_TERMINATED_WARNING). 189 // Only in rare cases, it'll be shorter than the destination buffer and 190 // |result| has to be truncated. 191 DCHECK(U_SUCCESS(status)); 192 if (V8_LIKELY(status == U_STRING_NOT_TERMINATED_WARNING)) { 193 DCHECK(dest_length == result->length()); 194 return result; 195 } 196 DCHECK(dest_length < result->length()); 197 return SeqString::Truncate(result, dest_length); 198 } 199 200 // A stripped-down version of ConvertToLower that can only handle flat one-byte 201 // strings and does not allocate. Note that {src} could still be, e.g., a 202 // one-byte sliced string with a two-byte parent string. 203 // Called from TF builtins. 204 V8_WARN_UNUSED_RESULT String* ConvertOneByteToLower(String* src, String* dst) { 205 DCHECK_EQ(src->length(), dst->length()); 206 DCHECK(src->HasOnlyOneByteChars()); 207 DCHECK(src->IsFlat()); 208 DCHECK(dst->IsSeqOneByteString()); 209 210 DisallowHeapAllocation no_gc; 211 212 const int length = src->length(); 213 String::FlatContent src_flat = src->GetFlatContent(); 214 uint8_t* dst_data = SeqOneByteString::cast(dst)->GetChars(); 215 216 if (src_flat.IsOneByte()) { 217 const uint8_t* src_data = src_flat.ToOneByteVector().start(); 218 219 bool has_changed_character = false; 220 int index_to_first_unprocessed = 221 FastAsciiConvert<true>(reinterpret_cast<char*>(dst_data), 222 reinterpret_cast<const char*>(src_data), length, 223 &has_changed_character); 224 225 if (index_to_first_unprocessed == length) { 226 return has_changed_character ? dst : src; 227 } 228 229 // If not ASCII, we keep the result up to index_to_first_unprocessed and 230 // process the rest. 231 for (int index = index_to_first_unprocessed; index < length; ++index) { 232 dst_data[index] = ToLatin1Lower(static_cast<uint16_t>(src_data[index])); 233 } 234 } else { 235 DCHECK(src_flat.IsTwoByte()); 236 int index_to_first_unprocessed = FindFirstUpperOrNonAscii(src, length); 237 if (index_to_first_unprocessed == length) return src; 238 239 const uint16_t* src_data = src_flat.ToUC16Vector().start(); 240 CopyChars(dst_data, src_data, index_to_first_unprocessed); 241 for (int index = index_to_first_unprocessed; index < length; ++index) { 242 dst_data[index] = ToLatin1Lower(static_cast<uint16_t>(src_data[index])); 243 } 244 } 245 246 return dst; 247 } 248 249 MaybeHandle<String> ConvertToLower(Handle<String> s, Isolate* isolate) { 250 if (!s->HasOnlyOneByteChars()) { 251 // Use a slower implementation for strings with characters beyond U+00FF. 252 return LocaleConvertCase(s, isolate, false, ""); 253 } 254 255 int length = s->length(); 256 257 // We depend here on the invariant that the length of a Latin1 258 // string is invariant under ToLowerCase, and the result always 259 // fits in the Latin1 range in the *root locale*. It does not hold 260 // for ToUpperCase even in the root locale. 261 262 // Scan the string for uppercase and non-ASCII characters for strings 263 // shorter than a machine-word without any memory allocation overhead. 264 // TODO(jshin): Apply this to a longer input by breaking FastAsciiConvert() 265 // to two parts, one for scanning the prefix with no change and the other for 266 // handling ASCII-only characters. 267 268 bool is_short = length < static_cast<int>(sizeof(uintptr_t)); 269 if (is_short) { 270 bool is_lower_ascii = FindFirstUpperOrNonAscii(*s, length) == length; 271 if (is_lower_ascii) return s; 272 } 273 274 Handle<SeqOneByteString> result = 275 isolate->factory()->NewRawOneByteString(length).ToHandleChecked(); 276 277 return Handle<String>(ConvertOneByteToLower(*s, *result), isolate); 278 } 279 280 MaybeHandle<String> ConvertToUpper(Handle<String> s, Isolate* isolate) { 281 int32_t length = s->length(); 282 if (s->HasOnlyOneByteChars() && length > 0) { 283 Handle<SeqOneByteString> result = 284 isolate->factory()->NewRawOneByteString(length).ToHandleChecked(); 285 286 DCHECK(s->IsFlat()); 287 int sharp_s_count; 288 bool is_result_single_byte; 289 { 290 DisallowHeapAllocation no_gc; 291 String::FlatContent flat = s->GetFlatContent(); 292 uint8_t* dest = result->GetChars(); 293 if (flat.IsOneByte()) { 294 Vector<const uint8_t> src = flat.ToOneByteVector(); 295 bool has_changed_character = false; 296 int index_to_first_unprocessed = 297 FastAsciiConvert<false>(reinterpret_cast<char*>(result->GetChars()), 298 reinterpret_cast<const char*>(src.start()), 299 length, &has_changed_character); 300 if (index_to_first_unprocessed == length) { 301 return has_changed_character ? result : s; 302 } 303 // If not ASCII, we keep the result up to index_to_first_unprocessed and 304 // process the rest. 305 is_result_single_byte = 306 ToUpperOneByte(src.SubVector(index_to_first_unprocessed, length), 307 dest + index_to_first_unprocessed, &sharp_s_count); 308 } else { 309 DCHECK(flat.IsTwoByte()); 310 Vector<const uint16_t> src = flat.ToUC16Vector(); 311 if (ToUpperFastASCII(src, result)) return result; 312 is_result_single_byte = ToUpperOneByte(src, dest, &sharp_s_count); 313 } 314 } 315 316 // Go to the full Unicode path if there are characters whose uppercase 317 // is beyond the Latin-1 range (cannot be represented in OneByteString). 318 if (V8_UNLIKELY(!is_result_single_byte)) { 319 return LocaleConvertCase(s, isolate, true, ""); 320 } 321 322 if (sharp_s_count == 0) return result; 323 324 // We have sharp_s_count sharp-s characters, but the result is still 325 // in the Latin-1 range. 326 ASSIGN_RETURN_ON_EXCEPTION( 327 isolate, result, 328 isolate->factory()->NewRawOneByteString(length + sharp_s_count), 329 String); 330 DisallowHeapAllocation no_gc; 331 String::FlatContent flat = s->GetFlatContent(); 332 if (flat.IsOneByte()) { 333 ToUpperWithSharpS(flat.ToOneByteVector(), result); 334 } else { 335 ToUpperWithSharpS(flat.ToUC16Vector(), result); 336 } 337 338 return result; 339 } 340 341 return LocaleConvertCase(s, isolate, true, ""); 342 } 343 344 MaybeHandle<String> ConvertCase(Handle<String> s, bool is_upper, 345 Isolate* isolate) { 346 return is_upper ? ConvertToUpper(s, isolate) : ConvertToLower(s, isolate); 347 } 348 349 ICUTimezoneCache::ICUTimezoneCache() : timezone_(nullptr) { Clear(); } 350 351 ICUTimezoneCache::~ICUTimezoneCache() { Clear(); } 352 353 const char* ICUTimezoneCache::LocalTimezone(double time_ms) { 354 bool is_dst = DaylightSavingsOffset(time_ms) != 0; 355 std::string* name = is_dst ? &dst_timezone_name_ : &timezone_name_; 356 if (name->empty()) { 357 icu::UnicodeString result; 358 GetTimeZone()->getDisplayName(is_dst, icu::TimeZone::LONG, result); 359 result += '\0'; 360 361 icu::StringByteSink<std::string> byte_sink(name); 362 result.toUTF8(byte_sink); 363 } 364 DCHECK(!name->empty()); 365 return name->c_str(); 366 } 367 368 icu::TimeZone* ICUTimezoneCache::GetTimeZone() { 369 if (timezone_ == nullptr) { 370 timezone_ = icu::TimeZone::createDefault(); 371 } 372 return timezone_; 373 } 374 375 bool ICUTimezoneCache::GetOffsets(double time_ms, bool is_utc, 376 int32_t* raw_offset, int32_t* dst_offset) { 377 UErrorCode status = U_ZERO_ERROR; 378 // TODO(jshin): ICU TimeZone class handles skipped time differently from 379 // Ecma 262 (https://github.com/tc39/ecma262/pull/778) and icu::TimeZone 380 // class does not expose the necessary API. Fixing 381 // http://bugs.icu-project.org/trac/ticket/13268 would make it easy to 382 // implement the proposed spec change. A proposed fix for ICU is 383 // https://chromium-review.googlesource.com/851265 . 384 // In the meantime, use an internal (still public) API of icu::BasicTimeZone. 385 // Once it's accepted by the upstream, get rid of cast. Note that casting 386 // TimeZone to BasicTimeZone is safe because we know that icu::TimeZone used 387 // here is a BasicTimeZone. 388 if (is_utc) { 389 GetTimeZone()->getOffset(time_ms, false, *raw_offset, *dst_offset, status); 390 } else { 391 static_cast<const icu::BasicTimeZone*>(GetTimeZone()) 392 ->getOffsetFromLocal(time_ms, icu::BasicTimeZone::kFormer, 393 icu::BasicTimeZone::kFormer, *raw_offset, 394 *dst_offset, status); 395 } 396 397 return U_SUCCESS(status); 398 } 399 400 double ICUTimezoneCache::DaylightSavingsOffset(double time_ms) { 401 int32_t raw_offset, dst_offset; 402 if (!GetOffsets(time_ms, true, &raw_offset, &dst_offset)) return 0; 403 return dst_offset; 404 } 405 406 double ICUTimezoneCache::LocalTimeOffset(double time_ms, bool is_utc) { 407 int32_t raw_offset, dst_offset; 408 if (!GetOffsets(time_ms, is_utc, &raw_offset, &dst_offset)) return 0; 409 return raw_offset + dst_offset; 410 } 411 412 void ICUTimezoneCache::Clear() { 413 delete timezone_; 414 timezone_ = nullptr; 415 timezone_name_.clear(); 416 dst_timezone_name_.clear(); 417 } 418 419 } // namespace internal 420 } // namespace v8 421