1 /* 2 * Copyright (C) 2006 Lars Knoll <lars (at) trolltech.com> 3 * Copyright (C) 2007, 2011, 2012 Apple Inc. All rights reserved. 4 * 5 * This library is free software; you can redistribute it and/or 6 * modify it under the terms of the GNU Library General Public 7 * License as published by the Free Software Foundation; either 8 * version 2 of the License, or (at your option) any later version. 9 * 10 * This library is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 * Library General Public License for more details. 14 * 15 * You should have received a copy of the GNU Library General Public License 16 * along with this library; see the file COPYING.LIB. If not, write to 17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 18 * Boston, MA 02110-1301, USA. 19 * 20 */ 21 22 #include "config.h" 23 #include "platform/text/TextBreakIterator.h" 24 25 #include "platform/text/TextBreakIteratorInternalICU.h" 26 #include "wtf/Assertions.h" 27 #include "wtf/HashMap.h" 28 #include "wtf/PassOwnPtr.h" 29 #include "wtf/ThreadSpecific.h" 30 #include "wtf/ThreadingPrimitives.h" 31 #include "wtf/text/AtomicString.h" 32 #include "wtf/text/CString.h" 33 #include "wtf/text/WTFString.h" 34 #include <unicode/ubrk.h> 35 36 using namespace WTF; 37 using namespace std; 38 39 namespace WebCore { 40 41 class LineBreakIteratorPool { 42 WTF_MAKE_NONCOPYABLE(LineBreakIteratorPool); 43 public: 44 static LineBreakIteratorPool& sharedPool() 45 { 46 static WTF::ThreadSpecific<LineBreakIteratorPool>* pool = new WTF::ThreadSpecific<LineBreakIteratorPool>; 47 return **pool; 48 } 49 50 static PassOwnPtr<LineBreakIteratorPool> create() { return adoptPtr(new LineBreakIteratorPool); } 51 52 icu::BreakIterator* take(const AtomicString& locale) 53 { 54 icu::BreakIterator* iterator = 0; 55 for (size_t i = 0; i < m_pool.size(); ++i) { 56 if (m_pool[i].first == locale) { 57 iterator = m_pool[i].second; 58 m_pool.remove(i); 59 break; 60 } 61 } 62 63 if (!iterator) { 64 UErrorCode openStatus = U_ZERO_ERROR; 65 bool localeIsEmpty = locale.isEmpty(); 66 iterator = icu::BreakIterator::createLineInstance(localeIsEmpty ? icu::Locale(currentTextBreakLocaleID()) : icu::Locale(locale.string().utf8().data()), openStatus); 67 // locale comes from a web page and it can be invalid, leading ICU 68 // to fail, in which case we fall back to the default locale. 69 if (!localeIsEmpty && U_FAILURE(openStatus)) { 70 openStatus = U_ZERO_ERROR; 71 iterator = icu::BreakIterator::createLineInstance(icu::Locale(currentTextBreakLocaleID()), openStatus); 72 } 73 74 if (U_FAILURE(openStatus)) { 75 WTF_LOG_ERROR("icu::BreakIterator construction failed with status %d", openStatus); 76 return 0; 77 } 78 } 79 80 ASSERT(!m_vendedIterators.contains(iterator)); 81 m_vendedIterators.set(iterator, locale); 82 return iterator; 83 } 84 85 void put(icu::BreakIterator* iterator) 86 { 87 ASSERT_ARG(iterator, m_vendedIterators.contains(iterator)); 88 89 if (m_pool.size() == capacity) { 90 delete(m_pool[0].second); 91 m_pool.remove(0); 92 } 93 94 m_pool.append(Entry(m_vendedIterators.take(iterator), iterator)); 95 } 96 97 private: 98 LineBreakIteratorPool() { } 99 100 static const size_t capacity = 4; 101 102 typedef pair<AtomicString, icu::BreakIterator*> Entry; 103 typedef Vector<Entry, capacity> Pool; 104 Pool m_pool; 105 HashMap<icu::BreakIterator*, AtomicString> m_vendedIterators; 106 107 friend WTF::ThreadSpecific<LineBreakIteratorPool>::operator LineBreakIteratorPool*(); 108 }; 109 110 enum TextContext { NoContext, PriorContext, PrimaryContext }; 111 112 const int textBufferCapacity = 16; 113 114 typedef struct { 115 UText text; 116 UChar buffer[textBufferCapacity]; 117 } UTextWithBuffer; 118 119 static inline int64_t textPinIndex(int64_t& index, int64_t limit) 120 { 121 if (index < 0) 122 index = 0; 123 else if (index > limit) 124 index = limit; 125 return index; 126 } 127 128 static inline int64_t textNativeLength(UText* text) 129 { 130 return text->a + text->b; 131 } 132 133 // Relocate pointer from source into destination as required. 134 static void textFixPointer(const UText* source, UText* destination, const void*& pointer) 135 { 136 if (pointer >= source->pExtra && pointer < static_cast<char*>(source->pExtra) + source->extraSize) { 137 // Pointer references source extra buffer. 138 pointer = static_cast<char*>(destination->pExtra) + (static_cast<const char*>(pointer) - static_cast<const char*>(source->pExtra)); 139 } else if (pointer >= source && pointer < reinterpret_cast<const char*>(source) + source->sizeOfStruct) { 140 // Pointer references source text structure, but not source extra buffer. 141 pointer = reinterpret_cast<char*>(destination) + (static_cast<const char*>(pointer) - reinterpret_cast<const char*>(source)); 142 } 143 } 144 145 static UText* textClone(UText* destination, const UText* source, UBool deep, UErrorCode* status) 146 { 147 ASSERT_UNUSED(deep, !deep); 148 if (U_FAILURE(*status)) 149 return 0; 150 int32_t extraSize = source->extraSize; 151 destination = utext_setup(destination, extraSize, status); 152 if (U_FAILURE(*status)) 153 return destination; 154 void* extraNew = destination->pExtra; 155 int32_t flags = destination->flags; 156 int sizeToCopy = min(source->sizeOfStruct, destination->sizeOfStruct); 157 memcpy(destination, source, sizeToCopy); 158 destination->pExtra = extraNew; 159 destination->flags = flags; 160 memcpy(destination->pExtra, source->pExtra, extraSize); 161 textFixPointer(source, destination, destination->context); 162 textFixPointer(source, destination, destination->p); 163 textFixPointer(source, destination, destination->q); 164 ASSERT(!destination->r); 165 const void * chunkContents = static_cast<const void*>(destination->chunkContents); 166 textFixPointer(source, destination, chunkContents); 167 destination->chunkContents = static_cast<const UChar*>(chunkContents); 168 return destination; 169 } 170 171 static int32_t textExtract(UText*, int64_t, int64_t, UChar*, int32_t, UErrorCode* errorCode) 172 { 173 // In the present context, this text provider is used only with ICU functions 174 // that do not perform an extract operation. 175 ASSERT_NOT_REACHED(); 176 *errorCode = U_UNSUPPORTED_ERROR; 177 return 0; 178 } 179 180 static void textClose(UText* text) 181 { 182 text->context = 0; 183 } 184 185 static inline TextContext textGetContext(const UText* text, int64_t nativeIndex, UBool forward) 186 { 187 if (!text->b || nativeIndex > text->b) 188 return PrimaryContext; 189 if (nativeIndex == text->b) 190 return forward ? PrimaryContext : PriorContext; 191 return PriorContext; 192 } 193 194 static inline TextContext textLatin1GetCurrentContext(const UText* text) 195 { 196 if (!text->chunkContents) 197 return NoContext; 198 return text->chunkContents == text->pExtra ? PrimaryContext : PriorContext; 199 } 200 201 static void textLatin1MoveInPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward) 202 { 203 ASSERT(text->chunkContents == text->pExtra); 204 if (forward) { 205 ASSERT(nativeIndex >= text->b && nativeIndex < nativeLength); 206 text->chunkNativeStart = nativeIndex; 207 text->chunkNativeLimit = nativeIndex + text->extraSize / sizeof(UChar); 208 if (text->chunkNativeLimit > nativeLength) 209 text->chunkNativeLimit = nativeLength; 210 } else { 211 ASSERT(nativeIndex > text->b && nativeIndex <= nativeLength); 212 text->chunkNativeLimit = nativeIndex; 213 text->chunkNativeStart = nativeIndex - text->extraSize / sizeof(UChar); 214 if (text->chunkNativeStart < text->b) 215 text->chunkNativeStart = text->b; 216 } 217 int64_t length = text->chunkNativeLimit - text->chunkNativeStart; 218 // Ensure chunk length is well defined if computed length exceeds int32_t range. 219 ASSERT(length <= numeric_limits<int32_t>::max()); 220 text->chunkLength = length <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(length) : 0; 221 text->nativeIndexingLimit = text->chunkLength; 222 text->chunkOffset = forward ? 0 : text->chunkLength; 223 StringImpl::copyChars(const_cast<UChar*>(text->chunkContents), static_cast<const LChar*>(text->p) + (text->chunkNativeStart - text->b), static_cast<unsigned>(text->chunkLength)); 224 } 225 226 static void textLatin1SwitchToPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward) 227 { 228 ASSERT(!text->chunkContents || text->chunkContents == text->q); 229 text->chunkContents = static_cast<const UChar*>(text->pExtra); 230 textLatin1MoveInPrimaryContext(text, nativeIndex, nativeLength, forward); 231 } 232 233 static void textLatin1MoveInPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward) 234 { 235 ASSERT(text->chunkContents == text->q); 236 ASSERT(forward ? nativeIndex < text->b : nativeIndex <= text->b); 237 ASSERT_UNUSED(nativeLength, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength); 238 ASSERT_UNUSED(forward, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength); 239 text->chunkNativeStart = 0; 240 text->chunkNativeLimit = text->b; 241 text->chunkLength = text->b; 242 text->nativeIndexingLimit = text->chunkLength; 243 int64_t offset = nativeIndex - text->chunkNativeStart; 244 // Ensure chunk offset is well defined if computed offset exceeds int32_t range or chunk length. 245 ASSERT(offset <= numeric_limits<int32_t>::max()); 246 text->chunkOffset = min(offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0, text->chunkLength); 247 } 248 249 static void textLatin1SwitchToPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward) 250 { 251 ASSERT(!text->chunkContents || text->chunkContents == text->pExtra); 252 text->chunkContents = static_cast<const UChar*>(text->q); 253 textLatin1MoveInPriorContext(text, nativeIndex, nativeLength, forward); 254 } 255 256 static inline bool textInChunkOrOutOfRange(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward, UBool& isAccessible) 257 { 258 if (forward) { 259 if (nativeIndex >= text->chunkNativeStart && nativeIndex < text->chunkNativeLimit) { 260 int64_t offset = nativeIndex - text->chunkNativeStart; 261 // Ensure chunk offset is well formed if computed offset exceeds int32_t range. 262 ASSERT(offset <= numeric_limits<int32_t>::max()); 263 text->chunkOffset = offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0; 264 isAccessible = TRUE; 265 return true; 266 } 267 if (nativeIndex >= nativeLength && text->chunkNativeLimit == nativeLength) { 268 text->chunkOffset = text->chunkLength; 269 isAccessible = FALSE; 270 return true; 271 } 272 } else { 273 if (nativeIndex > text->chunkNativeStart && nativeIndex <= text->chunkNativeLimit) { 274 int64_t offset = nativeIndex - text->chunkNativeStart; 275 // Ensure chunk offset is well formed if computed offset exceeds int32_t range. 276 ASSERT(offset <= numeric_limits<int32_t>::max()); 277 text->chunkOffset = offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0; 278 isAccessible = TRUE; 279 return true; 280 } 281 if (nativeIndex <= 0 && !text->chunkNativeStart) { 282 text->chunkOffset = 0; 283 isAccessible = FALSE; 284 return true; 285 } 286 } 287 return false; 288 } 289 290 static UBool textLatin1Access(UText* text, int64_t nativeIndex, UBool forward) 291 { 292 if (!text->context) 293 return FALSE; 294 int64_t nativeLength = textNativeLength(text); 295 UBool isAccessible; 296 if (textInChunkOrOutOfRange(text, nativeIndex, nativeLength, forward, isAccessible)) 297 return isAccessible; 298 nativeIndex = textPinIndex(nativeIndex, nativeLength - 1); 299 TextContext currentContext = textLatin1GetCurrentContext(text); 300 TextContext newContext = textGetContext(text, nativeIndex, forward); 301 ASSERT(newContext != NoContext); 302 if (newContext == currentContext) { 303 if (currentContext == PrimaryContext) { 304 textLatin1MoveInPrimaryContext(text, nativeIndex, nativeLength, forward); 305 } else { 306 textLatin1MoveInPriorContext(text, nativeIndex, nativeLength, forward); 307 } 308 } else if (newContext == PrimaryContext) { 309 textLatin1SwitchToPrimaryContext(text, nativeIndex, nativeLength, forward); 310 } else { 311 ASSERT(newContext == PriorContext); 312 textLatin1SwitchToPriorContext(text, nativeIndex, nativeLength, forward); 313 } 314 return TRUE; 315 } 316 317 static const struct UTextFuncs textLatin1Funcs = { 318 sizeof(UTextFuncs), 319 0, 0, 0, 320 textClone, 321 textNativeLength, 322 textLatin1Access, 323 textExtract, 324 0, 0, 0, 0, 325 textClose, 326 0, 0, 0, 327 }; 328 329 static void textInit(UText* text, const UTextFuncs* funcs, const void* string, unsigned length, const UChar* priorContext, int priorContextLength) 330 { 331 text->pFuncs = funcs; 332 text->providerProperties = 1 << UTEXT_PROVIDER_STABLE_CHUNKS; 333 text->context = string; 334 text->p = string; 335 text->a = length; 336 text->q = priorContext; 337 text->b = priorContextLength; 338 } 339 340 static UText* textOpenLatin1(UTextWithBuffer* utWithBuffer, const LChar* string, unsigned length, const UChar* priorContext, int priorContextLength, UErrorCode* status) 341 { 342 if (U_FAILURE(*status)) 343 return 0; 344 345 if (!string || length > static_cast<unsigned>(numeric_limits<int32_t>::max())) { 346 *status = U_ILLEGAL_ARGUMENT_ERROR; 347 return 0; 348 } 349 UText* text = utext_setup(&utWithBuffer->text, sizeof(utWithBuffer->buffer), status); 350 if (U_FAILURE(*status)) { 351 ASSERT(!text); 352 return 0; 353 } 354 textInit(text, &textLatin1Funcs, string, length, priorContext, priorContextLength); 355 return text; 356 } 357 358 static inline TextContext textUTF16GetCurrentContext(const UText* text) 359 { 360 if (!text->chunkContents) 361 return NoContext; 362 return text->chunkContents == text->p ? PrimaryContext : PriorContext; 363 } 364 365 static void textUTF16MoveInPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward) 366 { 367 ASSERT(text->chunkContents == text->p); 368 ASSERT_UNUSED(forward, forward ? nativeIndex >= text->b : nativeIndex > text->b); 369 ASSERT_UNUSED(forward, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength); 370 text->chunkNativeStart = text->b; 371 text->chunkNativeLimit = nativeLength; 372 int64_t length = text->chunkNativeLimit - text->chunkNativeStart; 373 // Ensure chunk length is well defined if computed length exceeds int32_t range. 374 ASSERT(length <= numeric_limits<int32_t>::max()); 375 text->chunkLength = length <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(length) : 0; 376 text->nativeIndexingLimit = text->chunkLength; 377 int64_t offset = nativeIndex - text->chunkNativeStart; 378 // Ensure chunk offset is well defined if computed offset exceeds int32_t range or chunk length. 379 ASSERT(offset <= numeric_limits<int32_t>::max()); 380 text->chunkOffset = min(offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0, text->chunkLength); 381 } 382 383 static void textUTF16SwitchToPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward) 384 { 385 ASSERT(!text->chunkContents || text->chunkContents == text->q); 386 text->chunkContents = static_cast<const UChar*>(text->p); 387 textUTF16MoveInPrimaryContext(text, nativeIndex, nativeLength, forward); 388 } 389 390 static void textUTF16MoveInPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward) 391 { 392 ASSERT(text->chunkContents == text->q); 393 ASSERT(forward ? nativeIndex < text->b : nativeIndex <= text->b); 394 ASSERT_UNUSED(nativeLength, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength); 395 ASSERT_UNUSED(forward, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength); 396 text->chunkNativeStart = 0; 397 text->chunkNativeLimit = text->b; 398 text->chunkLength = text->b; 399 text->nativeIndexingLimit = text->chunkLength; 400 int64_t offset = nativeIndex - text->chunkNativeStart; 401 // Ensure chunk offset is well defined if computed offset exceeds int32_t range or chunk length. 402 ASSERT(offset <= numeric_limits<int32_t>::max()); 403 text->chunkOffset = min(offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0, text->chunkLength); 404 } 405 406 static void textUTF16SwitchToPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward) 407 { 408 ASSERT(!text->chunkContents || text->chunkContents == text->p); 409 text->chunkContents = static_cast<const UChar*>(text->q); 410 textUTF16MoveInPriorContext(text, nativeIndex, nativeLength, forward); 411 } 412 413 static UBool textUTF16Access(UText* text, int64_t nativeIndex, UBool forward) 414 { 415 if (!text->context) 416 return FALSE; 417 int64_t nativeLength = textNativeLength(text); 418 UBool isAccessible; 419 if (textInChunkOrOutOfRange(text, nativeIndex, nativeLength, forward, isAccessible)) 420 return isAccessible; 421 nativeIndex = textPinIndex(nativeIndex, nativeLength - 1); 422 TextContext currentContext = textUTF16GetCurrentContext(text); 423 TextContext newContext = textGetContext(text, nativeIndex, forward); 424 ASSERT(newContext != NoContext); 425 if (newContext == currentContext) { 426 if (currentContext == PrimaryContext) { 427 textUTF16MoveInPrimaryContext(text, nativeIndex, nativeLength, forward); 428 } else { 429 textUTF16MoveInPriorContext(text, nativeIndex, nativeLength, forward); 430 } 431 } else if (newContext == PrimaryContext) { 432 textUTF16SwitchToPrimaryContext(text, nativeIndex, nativeLength, forward); 433 } else { 434 ASSERT(newContext == PriorContext); 435 textUTF16SwitchToPriorContext(text, nativeIndex, nativeLength, forward); 436 } 437 return TRUE; 438 } 439 440 static const struct UTextFuncs textUTF16Funcs = { 441 sizeof(UTextFuncs), 442 0, 0, 0, 443 textClone, 444 textNativeLength, 445 textUTF16Access, 446 textExtract, 447 0, 0, 0, 0, 448 textClose, 449 0, 0, 0, 450 }; 451 452 static UText* textOpenUTF16(UText* text, const UChar* string, unsigned length, const UChar* priorContext, int priorContextLength, UErrorCode* status) 453 { 454 if (U_FAILURE(*status)) 455 return 0; 456 457 if (!string || length > static_cast<unsigned>(numeric_limits<int32_t>::max())) { 458 *status = U_ILLEGAL_ARGUMENT_ERROR; 459 return 0; 460 } 461 462 text = utext_setup(text, 0, status); 463 if (U_FAILURE(*status)) { 464 ASSERT(!text); 465 return 0; 466 } 467 textInit(text, &textUTF16Funcs, string, length, priorContext, priorContextLength); 468 return text; 469 } 470 471 static UText emptyText = UTEXT_INITIALIZER; 472 473 static TextBreakIterator* wordBreakIterator(const LChar* string, int length) 474 { 475 UErrorCode errorCode = U_ZERO_ERROR; 476 static TextBreakIterator* breakIter = 0; 477 if (!breakIter) { 478 breakIter = icu::BreakIterator::createWordInstance(icu::Locale(currentTextBreakLocaleID()), errorCode); 479 ASSERT_WITH_MESSAGE(U_SUCCESS(errorCode), "ICU could not open a break iterator: %s (%d)", u_errorName(errorCode), errorCode); 480 if (!breakIter) 481 return 0; 482 } 483 484 UTextWithBuffer textLocal; 485 textLocal.text = emptyText; 486 textLocal.text.extraSize = sizeof(textLocal.buffer); 487 textLocal.text.pExtra = textLocal.buffer; 488 489 UErrorCode openStatus = U_ZERO_ERROR; 490 UText* text = textOpenLatin1(&textLocal, string, length, 0, 0, &openStatus); 491 if (U_FAILURE(openStatus)) { 492 WTF_LOG_ERROR("textOpenLatin1 failed with status %d", openStatus); 493 return 0; 494 } 495 496 UErrorCode setTextStatus = U_ZERO_ERROR; 497 breakIter->setText(text, setTextStatus); 498 if (U_FAILURE(setTextStatus)) 499 WTF_LOG_ERROR("BreakIterator::seText failed with status %d", setTextStatus); 500 501 utext_close(text); 502 503 return breakIter; 504 } 505 506 static void setText16(TextBreakIterator* iter, const UChar* string, int length) 507 { 508 UErrorCode errorCode = U_ZERO_ERROR; 509 UText uText = UTEXT_INITIALIZER; 510 utext_openUChars(&uText, string, length, &errorCode); 511 if (U_FAILURE(errorCode)) 512 return; 513 iter->setText(&uText, errorCode); 514 } 515 516 TextBreakIterator* wordBreakIterator(const UChar* string, int length) 517 { 518 UErrorCode errorCode = U_ZERO_ERROR; 519 static TextBreakIterator* breakIter = 0; 520 if (!breakIter) { 521 breakIter = icu::BreakIterator::createWordInstance(icu::Locale(currentTextBreakLocaleID()), errorCode); 522 ASSERT_WITH_MESSAGE(U_SUCCESS(errorCode), "ICU could not open a break iterator: %s (%d)", u_errorName(errorCode), errorCode); 523 if (!breakIter) 524 return 0; 525 } 526 setText16(breakIter, string, length); 527 return breakIter; 528 } 529 530 TextBreakIterator* wordBreakIterator(const String& string, int start, int length) 531 { 532 if (string.isEmpty()) 533 return 0; 534 if (string.is8Bit()) 535 return wordBreakIterator(string.characters8() + start, length); 536 return wordBreakIterator(string.characters16() + start, length); 537 } 538 539 TextBreakIterator* acquireLineBreakIterator(const LChar* string, int length, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength) 540 { 541 TextBreakIterator* iterator = LineBreakIteratorPool::sharedPool().take(locale); 542 if (!iterator) 543 return 0; 544 545 UTextWithBuffer textLocal; 546 textLocal.text = emptyText; 547 textLocal.text.extraSize = sizeof(textLocal.buffer); 548 textLocal.text.pExtra = textLocal.buffer; 549 550 UErrorCode openStatus = U_ZERO_ERROR; 551 UText* text = textOpenLatin1(&textLocal, string, length, priorContext, priorContextLength, &openStatus); 552 if (U_FAILURE(openStatus)) { 553 WTF_LOG_ERROR("textOpenLatin1 failed with status %d", openStatus); 554 return 0; 555 } 556 557 UErrorCode setTextStatus = U_ZERO_ERROR; 558 iterator->setText(text, setTextStatus); 559 if (U_FAILURE(setTextStatus)) { 560 WTF_LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus); 561 return 0; 562 } 563 564 utext_close(text); 565 566 return iterator; 567 } 568 569 TextBreakIterator* acquireLineBreakIterator(const UChar* string, int length, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength) 570 { 571 TextBreakIterator* iterator = LineBreakIteratorPool::sharedPool().take(locale); 572 if (!iterator) 573 return 0; 574 575 UText textLocal = UTEXT_INITIALIZER; 576 577 UErrorCode openStatus = U_ZERO_ERROR; 578 UText* text = textOpenUTF16(&textLocal, string, length, priorContext, priorContextLength, &openStatus); 579 if (U_FAILURE(openStatus)) { 580 WTF_LOG_ERROR("textOpenUTF16 failed with status %d", openStatus); 581 return 0; 582 } 583 584 UErrorCode setTextStatus = U_ZERO_ERROR; 585 iterator->setText(text, setTextStatus); 586 if (U_FAILURE(setTextStatus)) { 587 WTF_LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus); 588 return 0; 589 } 590 591 utext_close(text); 592 593 return iterator; 594 } 595 596 void releaseLineBreakIterator(TextBreakIterator* iterator) 597 { 598 ASSERT_ARG(iterator, iterator); 599 600 LineBreakIteratorPool::sharedPool().put(iterator); 601 } 602 603 static TextBreakIterator* nonSharedCharacterBreakIterator; 604 605 static inline bool compareAndSwapNonSharedCharacterBreakIterator(TextBreakIterator* expected, TextBreakIterator* newValue) 606 { 607 DEFINE_STATIC_LOCAL(Mutex, nonSharedCharacterBreakIteratorMutex, ()); 608 MutexLocker locker(nonSharedCharacterBreakIteratorMutex); 609 if (nonSharedCharacterBreakIterator != expected) 610 return false; 611 nonSharedCharacterBreakIterator = newValue; 612 return true; 613 } 614 615 NonSharedCharacterBreakIterator::NonSharedCharacterBreakIterator(const String& string) 616 : m_is8Bit(true) 617 , m_charaters8(0) 618 , m_offset(0) 619 , m_length(0) 620 , m_iterator(0) 621 { 622 if (string.isEmpty()) 623 return; 624 625 m_is8Bit = string.is8Bit(); 626 627 if (m_is8Bit) { 628 m_charaters8 = string.characters8(); 629 m_offset = 0; 630 m_length = string.length(); 631 return; 632 } 633 634 createIteratorForBuffer(string.characters16(), string.length()); 635 } 636 637 NonSharedCharacterBreakIterator::NonSharedCharacterBreakIterator(const UChar* buffer, unsigned length) 638 : m_is8Bit(false) 639 , m_charaters8(0) 640 , m_offset(0) 641 , m_length(0) 642 , m_iterator(0) 643 { 644 createIteratorForBuffer(buffer, length); 645 } 646 647 void NonSharedCharacterBreakIterator::createIteratorForBuffer(const UChar* buffer, unsigned length) 648 { 649 m_iterator = nonSharedCharacterBreakIterator; 650 bool createdIterator = m_iterator && compareAndSwapNonSharedCharacterBreakIterator(m_iterator, 0); 651 if (!createdIterator) { 652 UErrorCode errorCode = U_ZERO_ERROR; 653 m_iterator = icu::BreakIterator::createCharacterInstance(icu::Locale(currentTextBreakLocaleID()), errorCode); 654 ASSERT_WITH_MESSAGE(U_SUCCESS(errorCode), "ICU could not open a break iterator: %s (%d)", u_errorName(errorCode), errorCode); 655 } 656 657 setText16(m_iterator, buffer, length); 658 } 659 660 NonSharedCharacterBreakIterator::~NonSharedCharacterBreakIterator() 661 { 662 if (m_is8Bit) 663 return; 664 if (!compareAndSwapNonSharedCharacterBreakIterator(0, m_iterator)) 665 delete m_iterator; 666 } 667 668 int NonSharedCharacterBreakIterator::next() 669 { 670 if (!m_is8Bit) 671 return m_iterator->next(); 672 673 if (m_offset >= m_length) 674 return TextBreakDone; 675 676 m_offset += clusterLengthStartingAt(m_offset); 677 return m_offset; 678 } 679 680 int NonSharedCharacterBreakIterator::current() 681 { 682 if (!m_is8Bit) 683 return m_iterator->current(); 684 return m_offset; 685 } 686 687 bool NonSharedCharacterBreakIterator::isBreak(int offset) const 688 { 689 if (!m_is8Bit) 690 return m_iterator->isBoundary(offset); 691 return !isLFAfterCR(offset); 692 } 693 694 int NonSharedCharacterBreakIterator::preceding(int offset) const 695 { 696 if (!m_is8Bit) 697 return m_iterator->preceding(offset); 698 if (offset <= 0) 699 return TextBreakDone; 700 if (isLFAfterCR(offset)) 701 return offset - 2; 702 return offset - 1; 703 } 704 705 int NonSharedCharacterBreakIterator::following(int offset) const 706 { 707 if (!m_is8Bit) 708 return m_iterator->following(offset); 709 if (static_cast<unsigned>(offset) >= m_length) 710 return TextBreakDone; 711 return offset + clusterLengthStartingAt(offset); 712 } 713 714 TextBreakIterator* sentenceBreakIterator(const UChar* string, int length) 715 { 716 UErrorCode openStatus = U_ZERO_ERROR; 717 static TextBreakIterator* iterator = 0; 718 if (!iterator) { 719 iterator = icu::BreakIterator::createSentenceInstance(icu::Locale(currentTextBreakLocaleID()), openStatus); 720 ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus); 721 if (!iterator) 722 return 0; 723 } 724 725 setText16(iterator, string, length); 726 return iterator; 727 } 728 729 bool isWordTextBreak(TextBreakIterator* iterator) 730 { 731 icu::RuleBasedBreakIterator* ruleBasedBreakIterator = static_cast<icu::RuleBasedBreakIterator*>(iterator); 732 int ruleStatus = ruleBasedBreakIterator->getRuleStatus(); 733 return ruleStatus != UBRK_WORD_NONE; 734 } 735 736 static TextBreakIterator* setUpIteratorWithRules(const char* breakRules, const UChar* string, int length) 737 { 738 if (!string) 739 return 0; 740 741 static TextBreakIterator* iterator = 0; 742 if (!iterator) { 743 UParseError parseStatus; 744 UErrorCode openStatus = U_ZERO_ERROR; 745 Vector<UChar> rules; 746 String(breakRules).appendTo(rules); 747 748 iterator = new icu::RuleBasedBreakIterator(icu::UnicodeString(rules.data(), rules.size()), parseStatus, openStatus); 749 ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus); 750 if (!iterator) 751 return 0; 752 } 753 754 setText16(iterator, string, length); 755 return iterator; 756 } 757 758 TextBreakIterator* cursorMovementIterator(const UChar* string, int length) 759 { 760 // This rule set is based on character-break iterator rules of ICU 4.0 761 // <http://source.icu-project.org/repos/icu/icu/tags/release-4-0/source/data/brkitr/char.txt>. 762 // The major differences from the original ones are listed below: 763 // * Replaced '[\p{Grapheme_Cluster_Break = SpacingMark}]' with '[\p{General_Category = Spacing Mark} - $Extend]' for ICU 3.8 or earlier; 764 // * Removed rules that prevent a cursor from moving after prepend characters (Bug 24342); 765 // * Added rules that prevent a cursor from moving after virama signs of Indic languages except Tamil (Bug 15790), and; 766 // * Added rules that prevent a cursor from moving before Japanese half-width katakara voiced marks. 767 // * Added rules for regional indicator symbols. 768 static const char* const kRules = 769 "$CR = [\\p{Grapheme_Cluster_Break = CR}];" 770 "$LF = [\\p{Grapheme_Cluster_Break = LF}];" 771 "$Control = [\\p{Grapheme_Cluster_Break = Control}];" 772 "$VoiceMarks = [\\uFF9E\\uFF9F];" // Japanese half-width katakana voiced marks 773 "$Extend = [\\p{Grapheme_Cluster_Break = Extend} $VoiceMarks - [\\u0E30 \\u0E32 \\u0E45 \\u0EB0 \\u0EB2]];" 774 "$SpacingMark = [[\\p{General_Category = Spacing Mark}] - $Extend];" 775 "$L = [\\p{Grapheme_Cluster_Break = L}];" 776 "$V = [\\p{Grapheme_Cluster_Break = V}];" 777 "$T = [\\p{Grapheme_Cluster_Break = T}];" 778 "$LV = [\\p{Grapheme_Cluster_Break = LV}];" 779 "$LVT = [\\p{Grapheme_Cluster_Break = LVT}];" 780 "$Hin0 = [\\u0905-\\u0939];" // Devanagari Letter A,...,Ha 781 "$HinV = \\u094D;" // Devanagari Sign Virama 782 "$Hin1 = [\\u0915-\\u0939];" // Devanagari Letter Ka,...,Ha 783 "$Ben0 = [\\u0985-\\u09B9];" // Bengali Letter A,...,Ha 784 "$BenV = \\u09CD;" // Bengali Sign Virama 785 "$Ben1 = [\\u0995-\\u09B9];" // Bengali Letter Ka,...,Ha 786 "$Pan0 = [\\u0A05-\\u0A39];" // Gurmukhi Letter A,...,Ha 787 "$PanV = \\u0A4D;" // Gurmukhi Sign Virama 788 "$Pan1 = [\\u0A15-\\u0A39];" // Gurmukhi Letter Ka,...,Ha 789 "$Guj0 = [\\u0A85-\\u0AB9];" // Gujarati Letter A,...,Ha 790 "$GujV = \\u0ACD;" // Gujarati Sign Virama 791 "$Guj1 = [\\u0A95-\\u0AB9];" // Gujarati Letter Ka,...,Ha 792 "$Ori0 = [\\u0B05-\\u0B39];" // Oriya Letter A,...,Ha 793 "$OriV = \\u0B4D;" // Oriya Sign Virama 794 "$Ori1 = [\\u0B15-\\u0B39];" // Oriya Letter Ka,...,Ha 795 "$Tel0 = [\\u0C05-\\u0C39];" // Telugu Letter A,...,Ha 796 "$TelV = \\u0C4D;" // Telugu Sign Virama 797 "$Tel1 = [\\u0C14-\\u0C39];" // Telugu Letter Ka,...,Ha 798 "$Kan0 = [\\u0C85-\\u0CB9];" // Kannada Letter A,...,Ha 799 "$KanV = \\u0CCD;" // Kannada Sign Virama 800 "$Kan1 = [\\u0C95-\\u0CB9];" // Kannada Letter A,...,Ha 801 "$Mal0 = [\\u0D05-\\u0D39];" // Malayalam Letter A,...,Ha 802 "$MalV = \\u0D4D;" // Malayalam Sign Virama 803 "$Mal1 = [\\u0D15-\\u0D39];" // Malayalam Letter A,...,Ha 804 "$RI = [\\U0001F1E6-\\U0001F1FF];" // Emoji regional indicators 805 "!!chain;" 806 "!!forward;" 807 "$CR $LF;" 808 "$L ($L | $V | $LV | $LVT);" 809 "($LV | $V) ($V | $T);" 810 "($LVT | $T) $T;" 811 "[^$Control $CR $LF] $Extend;" 812 "[^$Control $CR $LF] $SpacingMark;" 813 "$RI $RI / $RI;" 814 "$RI $RI;" 815 "$Hin0 $HinV $Hin1;" // Devanagari Virama (forward) 816 "$Ben0 $BenV $Ben1;" // Bengali Virama (forward) 817 "$Pan0 $PanV $Pan1;" // Gurmukhi Virama (forward) 818 "$Guj0 $GujV $Guj1;" // Gujarati Virama (forward) 819 "$Ori0 $OriV $Ori1;" // Oriya Virama (forward) 820 "$Tel0 $TelV $Tel1;" // Telugu Virama (forward) 821 "$Kan0 $KanV $Kan1;" // Kannada Virama (forward) 822 "$Mal0 $MalV $Mal1;" // Malayalam Virama (forward) 823 "!!reverse;" 824 "$LF $CR;" 825 "($L | $V | $LV | $LVT) $L;" 826 "($V | $T) ($LV | $V);" 827 "$T ($LVT | $T);" 828 "$Extend [^$Control $CR $LF];" 829 "$SpacingMark [^$Control $CR $LF];" 830 "$RI $RI / $RI $RI;" 831 "$RI $RI;" 832 "$Hin1 $HinV $Hin0;" // Devanagari Virama (backward) 833 "$Ben1 $BenV $Ben0;" // Bengali Virama (backward) 834 "$Pan1 $PanV $Pan0;" // Gurmukhi Virama (backward) 835 "$Guj1 $GujV $Guj0;" // Gujarati Virama (backward) 836 "$Ori1 $OriV $Ori0;" // Gujarati Virama (backward) 837 "$Tel1 $TelV $Tel0;" // Telugu Virama (backward) 838 "$Kan1 $KanV $Kan0;" // Kannada Virama (backward) 839 "$Mal1 $MalV $Mal0;" // Malayalam Virama (backward) 840 "!!safe_reverse;" 841 "!!safe_forward;"; 842 843 return setUpIteratorWithRules(kRules, string, length); 844 } 845 846 } 847