1 /* 2 * Copyright (C) 2006 Lars Knoll <lars (at) trolltech.com> 3 * Copyright (C) 2007, 2011, 2012 Apple Inc. All rights reserved. 4 * 5 * This library is free software; you can redistribute it and/or 6 * modify it under the terms of the GNU Library General Public 7 * License as published by the Free Software Foundation; either 8 * version 2 of the License, or (at your option) any later version. 9 * 10 * This library is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 * Library General Public License for more details. 14 * 15 * You should have received a copy of the GNU Library General Public License 16 * along with this library; see the file COPYING.LIB. If not, write to 17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 18 * Boston, MA 02110-1301, USA. 19 * 20 */ 21 22 #include "config.h" 23 #include "platform/text/TextBreakIterator.h" 24 25 #include "platform/text/TextBreakIteratorInternalICU.h" 26 #include "wtf/Assertions.h" 27 #include "wtf/HashMap.h" 28 #include "wtf/PassOwnPtr.h" 29 #include "wtf/ThreadSpecific.h" 30 #include "wtf/ThreadingPrimitives.h" 31 #include "wtf/text/AtomicString.h" 32 #include "wtf/text/CString.h" 33 #include "wtf/text/WTFString.h" 34 #include <unicode/rbbi.h> 35 #include <unicode/ubrk.h> 36 37 using namespace WTF; 38 using namespace std; 39 40 namespace WebCore { 41 42 class LineBreakIteratorPool { 43 WTF_MAKE_NONCOPYABLE(LineBreakIteratorPool); 44 public: 45 static LineBreakIteratorPool& sharedPool() 46 { 47 static WTF::ThreadSpecific<LineBreakIteratorPool>* pool = new WTF::ThreadSpecific<LineBreakIteratorPool>; 48 return **pool; 49 } 50 51 static PassOwnPtr<LineBreakIteratorPool> create() { return adoptPtr(new LineBreakIteratorPool); } 52 53 icu::BreakIterator* take(const AtomicString& locale) 54 { 55 icu::BreakIterator* iterator = 0; 56 for (size_t i = 0; i < m_pool.size(); ++i) { 57 if (m_pool[i].first == locale) { 58 iterator = m_pool[i].second; 59 m_pool.remove(i); 60 break; 61 } 62 } 63 64 if (!iterator) { 65 UErrorCode openStatus = U_ZERO_ERROR; 66 bool localeIsEmpty = locale.isEmpty(); 67 iterator = icu::BreakIterator::createLineInstance(localeIsEmpty ? icu::Locale(currentTextBreakLocaleID()) : icu::Locale(locale.utf8().data()), openStatus); 68 // locale comes from a web page and it can be invalid, leading ICU 69 // to fail, in which case we fall back to the default locale. 70 if (!localeIsEmpty && U_FAILURE(openStatus)) { 71 openStatus = U_ZERO_ERROR; 72 iterator = icu::BreakIterator::createLineInstance(icu::Locale(currentTextBreakLocaleID()), openStatus); 73 } 74 75 if (U_FAILURE(openStatus)) { 76 WTF_LOG_ERROR("icu::BreakIterator construction failed with status %d", openStatus); 77 return 0; 78 } 79 } 80 81 ASSERT(!m_vendedIterators.contains(iterator)); 82 m_vendedIterators.set(iterator, locale); 83 return iterator; 84 } 85 86 void put(icu::BreakIterator* iterator) 87 { 88 ASSERT_ARG(iterator, m_vendedIterators.contains(iterator)); 89 90 if (m_pool.size() == capacity) { 91 delete(m_pool[0].second); 92 m_pool.remove(0); 93 } 94 95 m_pool.append(Entry(m_vendedIterators.take(iterator), iterator)); 96 } 97 98 private: 99 LineBreakIteratorPool() { } 100 101 static const size_t capacity = 4; 102 103 typedef pair<AtomicString, icu::BreakIterator*> Entry; 104 typedef Vector<Entry, capacity> Pool; 105 Pool m_pool; 106 HashMap<icu::BreakIterator*, AtomicString> m_vendedIterators; 107 108 friend WTF::ThreadSpecific<LineBreakIteratorPool>::operator LineBreakIteratorPool*(); 109 }; 110 111 enum TextContext { NoContext, PriorContext, PrimaryContext }; 112 113 const int textBufferCapacity = 16; 114 115 typedef struct { 116 UText text; 117 UChar buffer[textBufferCapacity]; 118 } UTextWithBuffer; 119 120 static inline int64_t textPinIndex(int64_t& index, int64_t limit) 121 { 122 if (index < 0) 123 index = 0; 124 else if (index > limit) 125 index = limit; 126 return index; 127 } 128 129 static inline int64_t textNativeLength(UText* text) 130 { 131 return text->a + text->b; 132 } 133 134 // Relocate pointer from source into destination as required. 135 static void textFixPointer(const UText* source, UText* destination, const void*& pointer) 136 { 137 if (pointer >= source->pExtra && pointer < static_cast<char*>(source->pExtra) + source->extraSize) { 138 // Pointer references source extra buffer. 139 pointer = static_cast<char*>(destination->pExtra) + (static_cast<const char*>(pointer) - static_cast<const char*>(source->pExtra)); 140 } else if (pointer >= source && pointer < reinterpret_cast<const char*>(source) + source->sizeOfStruct) { 141 // Pointer references source text structure, but not source extra buffer. 142 pointer = reinterpret_cast<char*>(destination) + (static_cast<const char*>(pointer) - reinterpret_cast<const char*>(source)); 143 } 144 } 145 146 static UText* textClone(UText* destination, const UText* source, UBool deep, UErrorCode* status) 147 { 148 ASSERT_UNUSED(deep, !deep); 149 if (U_FAILURE(*status)) 150 return 0; 151 int32_t extraSize = source->extraSize; 152 destination = utext_setup(destination, extraSize, status); 153 if (U_FAILURE(*status)) 154 return destination; 155 void* extraNew = destination->pExtra; 156 int32_t flags = destination->flags; 157 int sizeToCopy = min(source->sizeOfStruct, destination->sizeOfStruct); 158 memcpy(destination, source, sizeToCopy); 159 destination->pExtra = extraNew; 160 destination->flags = flags; 161 memcpy(destination->pExtra, source->pExtra, extraSize); 162 textFixPointer(source, destination, destination->context); 163 textFixPointer(source, destination, destination->p); 164 textFixPointer(source, destination, destination->q); 165 ASSERT(!destination->r); 166 const void * chunkContents = static_cast<const void*>(destination->chunkContents); 167 textFixPointer(source, destination, chunkContents); 168 destination->chunkContents = static_cast<const UChar*>(chunkContents); 169 return destination; 170 } 171 172 static int32_t textExtract(UText*, int64_t, int64_t, UChar*, int32_t, UErrorCode* errorCode) 173 { 174 // In the present context, this text provider is used only with ICU functions 175 // that do not perform an extract operation. 176 ASSERT_NOT_REACHED(); 177 *errorCode = U_UNSUPPORTED_ERROR; 178 return 0; 179 } 180 181 static void textClose(UText* text) 182 { 183 text->context = 0; 184 } 185 186 static inline TextContext textGetContext(const UText* text, int64_t nativeIndex, UBool forward) 187 { 188 if (!text->b || nativeIndex > text->b) 189 return PrimaryContext; 190 if (nativeIndex == text->b) 191 return forward ? PrimaryContext : PriorContext; 192 return PriorContext; 193 } 194 195 static inline TextContext textLatin1GetCurrentContext(const UText* text) 196 { 197 if (!text->chunkContents) 198 return NoContext; 199 return text->chunkContents == text->pExtra ? PrimaryContext : PriorContext; 200 } 201 202 static void textLatin1MoveInPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward) 203 { 204 ASSERT(text->chunkContents == text->pExtra); 205 if (forward) { 206 ASSERT(nativeIndex >= text->b && nativeIndex < nativeLength); 207 text->chunkNativeStart = nativeIndex; 208 text->chunkNativeLimit = nativeIndex + text->extraSize / sizeof(UChar); 209 if (text->chunkNativeLimit > nativeLength) 210 text->chunkNativeLimit = nativeLength; 211 } else { 212 ASSERT(nativeIndex > text->b && nativeIndex <= nativeLength); 213 text->chunkNativeLimit = nativeIndex; 214 text->chunkNativeStart = nativeIndex - text->extraSize / sizeof(UChar); 215 if (text->chunkNativeStart < text->b) 216 text->chunkNativeStart = text->b; 217 } 218 int64_t length = text->chunkNativeLimit - text->chunkNativeStart; 219 // Ensure chunk length is well defined if computed length exceeds int32_t range. 220 ASSERT(length <= numeric_limits<int32_t>::max()); 221 text->chunkLength = length <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(length) : 0; 222 text->nativeIndexingLimit = text->chunkLength; 223 text->chunkOffset = forward ? 0 : text->chunkLength; 224 StringImpl::copyChars(const_cast<UChar*>(text->chunkContents), static_cast<const LChar*>(text->p) + (text->chunkNativeStart - text->b), static_cast<unsigned>(text->chunkLength)); 225 } 226 227 static void textLatin1SwitchToPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward) 228 { 229 ASSERT(!text->chunkContents || text->chunkContents == text->q); 230 text->chunkContents = static_cast<const UChar*>(text->pExtra); 231 textLatin1MoveInPrimaryContext(text, nativeIndex, nativeLength, forward); 232 } 233 234 static void textLatin1MoveInPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward) 235 { 236 ASSERT(text->chunkContents == text->q); 237 ASSERT(forward ? nativeIndex < text->b : nativeIndex <= text->b); 238 ASSERT_UNUSED(nativeLength, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength); 239 ASSERT_UNUSED(forward, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength); 240 text->chunkNativeStart = 0; 241 text->chunkNativeLimit = text->b; 242 text->chunkLength = text->b; 243 text->nativeIndexingLimit = text->chunkLength; 244 int64_t offset = nativeIndex - text->chunkNativeStart; 245 // Ensure chunk offset is well defined if computed offset exceeds int32_t range or chunk length. 246 ASSERT(offset <= numeric_limits<int32_t>::max()); 247 text->chunkOffset = min(offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0, text->chunkLength); 248 } 249 250 static void textLatin1SwitchToPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward) 251 { 252 ASSERT(!text->chunkContents || text->chunkContents == text->pExtra); 253 text->chunkContents = static_cast<const UChar*>(text->q); 254 textLatin1MoveInPriorContext(text, nativeIndex, nativeLength, forward); 255 } 256 257 static inline bool textInChunkOrOutOfRange(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward, UBool& isAccessible) 258 { 259 if (forward) { 260 if (nativeIndex >= text->chunkNativeStart && nativeIndex < text->chunkNativeLimit) { 261 int64_t offset = nativeIndex - text->chunkNativeStart; 262 // Ensure chunk offset is well formed if computed offset exceeds int32_t range. 263 ASSERT(offset <= numeric_limits<int32_t>::max()); 264 text->chunkOffset = offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0; 265 isAccessible = TRUE; 266 return true; 267 } 268 if (nativeIndex >= nativeLength && text->chunkNativeLimit == nativeLength) { 269 text->chunkOffset = text->chunkLength; 270 isAccessible = FALSE; 271 return true; 272 } 273 } else { 274 if (nativeIndex > text->chunkNativeStart && nativeIndex <= text->chunkNativeLimit) { 275 int64_t offset = nativeIndex - text->chunkNativeStart; 276 // Ensure chunk offset is well formed if computed offset exceeds int32_t range. 277 ASSERT(offset <= numeric_limits<int32_t>::max()); 278 text->chunkOffset = offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0; 279 isAccessible = TRUE; 280 return true; 281 } 282 if (nativeIndex <= 0 && !text->chunkNativeStart) { 283 text->chunkOffset = 0; 284 isAccessible = FALSE; 285 return true; 286 } 287 } 288 return false; 289 } 290 291 static UBool textLatin1Access(UText* text, int64_t nativeIndex, UBool forward) 292 { 293 if (!text->context) 294 return FALSE; 295 int64_t nativeLength = textNativeLength(text); 296 UBool isAccessible; 297 if (textInChunkOrOutOfRange(text, nativeIndex, nativeLength, forward, isAccessible)) 298 return isAccessible; 299 nativeIndex = textPinIndex(nativeIndex, nativeLength - 1); 300 TextContext currentContext = textLatin1GetCurrentContext(text); 301 TextContext newContext = textGetContext(text, nativeIndex, forward); 302 ASSERT(newContext != NoContext); 303 if (newContext == currentContext) { 304 if (currentContext == PrimaryContext) { 305 textLatin1MoveInPrimaryContext(text, nativeIndex, nativeLength, forward); 306 } else { 307 textLatin1MoveInPriorContext(text, nativeIndex, nativeLength, forward); 308 } 309 } else if (newContext == PrimaryContext) { 310 textLatin1SwitchToPrimaryContext(text, nativeIndex, nativeLength, forward); 311 } else { 312 ASSERT(newContext == PriorContext); 313 textLatin1SwitchToPriorContext(text, nativeIndex, nativeLength, forward); 314 } 315 return TRUE; 316 } 317 318 static const struct UTextFuncs textLatin1Funcs = { 319 sizeof(UTextFuncs), 320 0, 0, 0, 321 textClone, 322 textNativeLength, 323 textLatin1Access, 324 textExtract, 325 0, 0, 0, 0, 326 textClose, 327 0, 0, 0, 328 }; 329 330 static void textInit(UText* text, const UTextFuncs* funcs, const void* string, unsigned length, const UChar* priorContext, int priorContextLength) 331 { 332 text->pFuncs = funcs; 333 text->providerProperties = 1 << UTEXT_PROVIDER_STABLE_CHUNKS; 334 text->context = string; 335 text->p = string; 336 text->a = length; 337 text->q = priorContext; 338 text->b = priorContextLength; 339 } 340 341 static UText* textOpenLatin1(UTextWithBuffer* utWithBuffer, const LChar* string, unsigned length, const UChar* priorContext, int priorContextLength, UErrorCode* status) 342 { 343 if (U_FAILURE(*status)) 344 return 0; 345 346 if (!string || length > static_cast<unsigned>(numeric_limits<int32_t>::max())) { 347 *status = U_ILLEGAL_ARGUMENT_ERROR; 348 return 0; 349 } 350 UText* text = utext_setup(&utWithBuffer->text, sizeof(utWithBuffer->buffer), status); 351 if (U_FAILURE(*status)) { 352 ASSERT(!text); 353 return 0; 354 } 355 textInit(text, &textLatin1Funcs, string, length, priorContext, priorContextLength); 356 return text; 357 } 358 359 static inline TextContext textUTF16GetCurrentContext(const UText* text) 360 { 361 if (!text->chunkContents) 362 return NoContext; 363 return text->chunkContents == text->p ? PrimaryContext : PriorContext; 364 } 365 366 static void textUTF16MoveInPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward) 367 { 368 ASSERT(text->chunkContents == text->p); 369 ASSERT_UNUSED(forward, forward ? nativeIndex >= text->b : nativeIndex > text->b); 370 ASSERT_UNUSED(forward, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength); 371 text->chunkNativeStart = text->b; 372 text->chunkNativeLimit = nativeLength; 373 int64_t length = text->chunkNativeLimit - text->chunkNativeStart; 374 // Ensure chunk length is well defined if computed length exceeds int32_t range. 375 ASSERT(length <= numeric_limits<int32_t>::max()); 376 text->chunkLength = length <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(length) : 0; 377 text->nativeIndexingLimit = text->chunkLength; 378 int64_t offset = nativeIndex - text->chunkNativeStart; 379 // Ensure chunk offset is well defined if computed offset exceeds int32_t range or chunk length. 380 ASSERT(offset <= numeric_limits<int32_t>::max()); 381 text->chunkOffset = min(offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0, text->chunkLength); 382 } 383 384 static void textUTF16SwitchToPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward) 385 { 386 ASSERT(!text->chunkContents || text->chunkContents == text->q); 387 text->chunkContents = static_cast<const UChar*>(text->p); 388 textUTF16MoveInPrimaryContext(text, nativeIndex, nativeLength, forward); 389 } 390 391 static void textUTF16MoveInPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward) 392 { 393 ASSERT(text->chunkContents == text->q); 394 ASSERT(forward ? nativeIndex < text->b : nativeIndex <= text->b); 395 ASSERT_UNUSED(nativeLength, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength); 396 ASSERT_UNUSED(forward, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength); 397 text->chunkNativeStart = 0; 398 text->chunkNativeLimit = text->b; 399 text->chunkLength = text->b; 400 text->nativeIndexingLimit = text->chunkLength; 401 int64_t offset = nativeIndex - text->chunkNativeStart; 402 // Ensure chunk offset is well defined if computed offset exceeds int32_t range or chunk length. 403 ASSERT(offset <= numeric_limits<int32_t>::max()); 404 text->chunkOffset = min(offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0, text->chunkLength); 405 } 406 407 static void textUTF16SwitchToPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward) 408 { 409 ASSERT(!text->chunkContents || text->chunkContents == text->p); 410 text->chunkContents = static_cast<const UChar*>(text->q); 411 textUTF16MoveInPriorContext(text, nativeIndex, nativeLength, forward); 412 } 413 414 static UBool textUTF16Access(UText* text, int64_t nativeIndex, UBool forward) 415 { 416 if (!text->context) 417 return FALSE; 418 int64_t nativeLength = textNativeLength(text); 419 UBool isAccessible; 420 if (textInChunkOrOutOfRange(text, nativeIndex, nativeLength, forward, isAccessible)) 421 return isAccessible; 422 nativeIndex = textPinIndex(nativeIndex, nativeLength - 1); 423 TextContext currentContext = textUTF16GetCurrentContext(text); 424 TextContext newContext = textGetContext(text, nativeIndex, forward); 425 ASSERT(newContext != NoContext); 426 if (newContext == currentContext) { 427 if (currentContext == PrimaryContext) { 428 textUTF16MoveInPrimaryContext(text, nativeIndex, nativeLength, forward); 429 } else { 430 textUTF16MoveInPriorContext(text, nativeIndex, nativeLength, forward); 431 } 432 } else if (newContext == PrimaryContext) { 433 textUTF16SwitchToPrimaryContext(text, nativeIndex, nativeLength, forward); 434 } else { 435 ASSERT(newContext == PriorContext); 436 textUTF16SwitchToPriorContext(text, nativeIndex, nativeLength, forward); 437 } 438 return TRUE; 439 } 440 441 static const struct UTextFuncs textUTF16Funcs = { 442 sizeof(UTextFuncs), 443 0, 0, 0, 444 textClone, 445 textNativeLength, 446 textUTF16Access, 447 textExtract, 448 0, 0, 0, 0, 449 textClose, 450 0, 0, 0, 451 }; 452 453 static UText* textOpenUTF16(UText* text, const UChar* string, unsigned length, const UChar* priorContext, int priorContextLength, UErrorCode* status) 454 { 455 if (U_FAILURE(*status)) 456 return 0; 457 458 if (!string || length > static_cast<unsigned>(numeric_limits<int32_t>::max())) { 459 *status = U_ILLEGAL_ARGUMENT_ERROR; 460 return 0; 461 } 462 463 text = utext_setup(text, 0, status); 464 if (U_FAILURE(*status)) { 465 ASSERT(!text); 466 return 0; 467 } 468 textInit(text, &textUTF16Funcs, string, length, priorContext, priorContextLength); 469 return text; 470 } 471 472 static UText emptyText = UTEXT_INITIALIZER; 473 474 static TextBreakIterator* wordBreakIterator(const LChar* string, int length) 475 { 476 UErrorCode errorCode = U_ZERO_ERROR; 477 static TextBreakIterator* breakIter = 0; 478 if (!breakIter) { 479 breakIter = icu::BreakIterator::createWordInstance(icu::Locale(currentTextBreakLocaleID()), errorCode); 480 ASSERT_WITH_MESSAGE(U_SUCCESS(errorCode), "ICU could not open a break iterator: %s (%d)", u_errorName(errorCode), errorCode); 481 if (!breakIter) 482 return 0; 483 } 484 485 UTextWithBuffer textLocal; 486 textLocal.text = emptyText; 487 textLocal.text.extraSize = sizeof(textLocal.buffer); 488 textLocal.text.pExtra = textLocal.buffer; 489 490 UErrorCode openStatus = U_ZERO_ERROR; 491 UText* text = textOpenLatin1(&textLocal, string, length, 0, 0, &openStatus); 492 if (U_FAILURE(openStatus)) { 493 WTF_LOG_ERROR("textOpenLatin1 failed with status %d", openStatus); 494 return 0; 495 } 496 497 UErrorCode setTextStatus = U_ZERO_ERROR; 498 breakIter->setText(text, setTextStatus); 499 if (U_FAILURE(setTextStatus)) 500 WTF_LOG_ERROR("BreakIterator::seText failed with status %d", setTextStatus); 501 502 utext_close(text); 503 504 return breakIter; 505 } 506 507 static void setText16(TextBreakIterator* iter, const UChar* string, int length) 508 { 509 UErrorCode errorCode = U_ZERO_ERROR; 510 UText uText = UTEXT_INITIALIZER; 511 utext_openUChars(&uText, string, length, &errorCode); 512 if (U_FAILURE(errorCode)) 513 return; 514 iter->setText(&uText, errorCode); 515 } 516 517 TextBreakIterator* wordBreakIterator(const UChar* string, int length) 518 { 519 UErrorCode errorCode = U_ZERO_ERROR; 520 static TextBreakIterator* breakIter = 0; 521 if (!breakIter) { 522 breakIter = icu::BreakIterator::createWordInstance(icu::Locale(currentTextBreakLocaleID()), errorCode); 523 ASSERT_WITH_MESSAGE(U_SUCCESS(errorCode), "ICU could not open a break iterator: %s (%d)", u_errorName(errorCode), errorCode); 524 if (!breakIter) 525 return 0; 526 } 527 setText16(breakIter, string, length); 528 return breakIter; 529 } 530 531 TextBreakIterator* wordBreakIterator(const String& string, int start, int length) 532 { 533 if (string.isEmpty()) 534 return 0; 535 if (string.is8Bit()) 536 return wordBreakIterator(string.characters8() + start, length); 537 return wordBreakIterator(string.characters16() + start, length); 538 } 539 540 TextBreakIterator* acquireLineBreakIterator(const LChar* string, int length, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength) 541 { 542 TextBreakIterator* iterator = LineBreakIteratorPool::sharedPool().take(locale); 543 if (!iterator) 544 return 0; 545 546 UTextWithBuffer textLocal; 547 textLocal.text = emptyText; 548 textLocal.text.extraSize = sizeof(textLocal.buffer); 549 textLocal.text.pExtra = textLocal.buffer; 550 551 UErrorCode openStatus = U_ZERO_ERROR; 552 UText* text = textOpenLatin1(&textLocal, string, length, priorContext, priorContextLength, &openStatus); 553 if (U_FAILURE(openStatus)) { 554 WTF_LOG_ERROR("textOpenLatin1 failed with status %d", openStatus); 555 return 0; 556 } 557 558 UErrorCode setTextStatus = U_ZERO_ERROR; 559 iterator->setText(text, setTextStatus); 560 if (U_FAILURE(setTextStatus)) { 561 WTF_LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus); 562 return 0; 563 } 564 565 utext_close(text); 566 567 return iterator; 568 } 569 570 TextBreakIterator* acquireLineBreakIterator(const UChar* string, int length, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength) 571 { 572 TextBreakIterator* iterator = LineBreakIteratorPool::sharedPool().take(locale); 573 if (!iterator) 574 return 0; 575 576 UText textLocal = UTEXT_INITIALIZER; 577 578 UErrorCode openStatus = U_ZERO_ERROR; 579 UText* text = textOpenUTF16(&textLocal, string, length, priorContext, priorContextLength, &openStatus); 580 if (U_FAILURE(openStatus)) { 581 WTF_LOG_ERROR("textOpenUTF16 failed with status %d", openStatus); 582 return 0; 583 } 584 585 UErrorCode setTextStatus = U_ZERO_ERROR; 586 iterator->setText(text, setTextStatus); 587 if (U_FAILURE(setTextStatus)) { 588 WTF_LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus); 589 return 0; 590 } 591 592 utext_close(text); 593 594 return iterator; 595 } 596 597 void releaseLineBreakIterator(TextBreakIterator* iterator) 598 { 599 ASSERT_ARG(iterator, iterator); 600 601 LineBreakIteratorPool::sharedPool().put(iterator); 602 } 603 604 static TextBreakIterator* nonSharedCharacterBreakIterator; 605 606 static inline bool compareAndSwapNonSharedCharacterBreakIterator(TextBreakIterator* expected, TextBreakIterator* newValue) 607 { 608 DEFINE_STATIC_LOCAL(Mutex, nonSharedCharacterBreakIteratorMutex, ()); 609 MutexLocker locker(nonSharedCharacterBreakIteratorMutex); 610 if (nonSharedCharacterBreakIterator != expected) 611 return false; 612 nonSharedCharacterBreakIterator = newValue; 613 return true; 614 } 615 616 NonSharedCharacterBreakIterator::NonSharedCharacterBreakIterator(const String& string) 617 : m_is8Bit(true) 618 , m_charaters8(0) 619 , m_offset(0) 620 , m_length(0) 621 , m_iterator(0) 622 { 623 if (string.isEmpty()) 624 return; 625 626 m_is8Bit = string.is8Bit(); 627 628 if (m_is8Bit) { 629 m_charaters8 = string.characters8(); 630 m_offset = 0; 631 m_length = string.length(); 632 return; 633 } 634 635 createIteratorForBuffer(string.characters16(), string.length()); 636 } 637 638 NonSharedCharacterBreakIterator::NonSharedCharacterBreakIterator(const UChar* buffer, unsigned length) 639 : m_is8Bit(false) 640 , m_charaters8(0) 641 , m_offset(0) 642 , m_length(0) 643 , m_iterator(0) 644 { 645 createIteratorForBuffer(buffer, length); 646 } 647 648 void NonSharedCharacterBreakIterator::createIteratorForBuffer(const UChar* buffer, unsigned length) 649 { 650 m_iterator = nonSharedCharacterBreakIterator; 651 bool createdIterator = m_iterator && compareAndSwapNonSharedCharacterBreakIterator(m_iterator, 0); 652 if (!createdIterator) { 653 UErrorCode errorCode = U_ZERO_ERROR; 654 m_iterator = icu::BreakIterator::createCharacterInstance(icu::Locale(currentTextBreakLocaleID()), errorCode); 655 ASSERT_WITH_MESSAGE(U_SUCCESS(errorCode), "ICU could not open a break iterator: %s (%d)", u_errorName(errorCode), errorCode); 656 } 657 658 setText16(m_iterator, buffer, length); 659 } 660 661 NonSharedCharacterBreakIterator::~NonSharedCharacterBreakIterator() 662 { 663 if (m_is8Bit) 664 return; 665 if (!compareAndSwapNonSharedCharacterBreakIterator(0, m_iterator)) 666 delete m_iterator; 667 } 668 669 int NonSharedCharacterBreakIterator::next() 670 { 671 if (!m_is8Bit) 672 return m_iterator->next(); 673 674 if (m_offset >= m_length) 675 return TextBreakDone; 676 677 m_offset += clusterLengthStartingAt(m_offset); 678 return m_offset; 679 } 680 681 int NonSharedCharacterBreakIterator::current() 682 { 683 if (!m_is8Bit) 684 return m_iterator->current(); 685 return m_offset; 686 } 687 688 bool NonSharedCharacterBreakIterator::isBreak(int offset) const 689 { 690 if (!m_is8Bit) 691 return m_iterator->isBoundary(offset); 692 return !isLFAfterCR(offset); 693 } 694 695 int NonSharedCharacterBreakIterator::preceding(int offset) const 696 { 697 if (!m_is8Bit) 698 return m_iterator->preceding(offset); 699 if (offset <= 0) 700 return TextBreakDone; 701 if (isLFAfterCR(offset)) 702 return offset - 2; 703 return offset - 1; 704 } 705 706 int NonSharedCharacterBreakIterator::following(int offset) const 707 { 708 if (!m_is8Bit) 709 return m_iterator->following(offset); 710 if (static_cast<unsigned>(offset) >= m_length) 711 return TextBreakDone; 712 return offset + clusterLengthStartingAt(offset); 713 } 714 715 TextBreakIterator* sentenceBreakIterator(const UChar* string, int length) 716 { 717 UErrorCode openStatus = U_ZERO_ERROR; 718 static TextBreakIterator* iterator = 0; 719 if (!iterator) { 720 iterator = icu::BreakIterator::createSentenceInstance(icu::Locale(currentTextBreakLocaleID()), openStatus); 721 ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus); 722 if (!iterator) 723 return 0; 724 } 725 726 setText16(iterator, string, length); 727 return iterator; 728 } 729 730 bool isWordTextBreak(TextBreakIterator* iterator) 731 { 732 icu::RuleBasedBreakIterator* ruleBasedBreakIterator = static_cast<icu::RuleBasedBreakIterator*>(iterator); 733 int ruleStatus = ruleBasedBreakIterator->getRuleStatus(); 734 return ruleStatus != UBRK_WORD_NONE; 735 } 736 737 static TextBreakIterator* setUpIteratorWithRules(const char* breakRules, const UChar* string, int length) 738 { 739 if (!string) 740 return 0; 741 742 static TextBreakIterator* iterator = 0; 743 if (!iterator) { 744 UParseError parseStatus; 745 UErrorCode openStatus = U_ZERO_ERROR; 746 Vector<UChar> rules; 747 String(breakRules).appendTo(rules); 748 749 iterator = new icu::RuleBasedBreakIterator(icu::UnicodeString(rules.data(), rules.size()), parseStatus, openStatus); 750 ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus); 751 if (!iterator) 752 return 0; 753 } 754 755 setText16(iterator, string, length); 756 return iterator; 757 } 758 759 TextBreakIterator* cursorMovementIterator(const UChar* string, int length) 760 { 761 // This rule set is based on character-break iterator rules of ICU 4.0 762 // <http://source.icu-project.org/repos/icu/icu/tags/release-4-0/source/data/brkitr/char.txt>. 763 // The major differences from the original ones are listed below: 764 // * Replaced '[\p{Grapheme_Cluster_Break = SpacingMark}]' with '[\p{General_Category = Spacing Mark} - $Extend]' for ICU 3.8 or earlier; 765 // * Removed rules that prevent a cursor from moving after prepend characters (Bug 24342); 766 // * Added rules that prevent a cursor from moving after virama signs of Indic languages except Tamil (Bug 15790), and; 767 // * Added rules that prevent a cursor from moving before Japanese half-width katakara voiced marks. 768 // * Added rules for regional indicator symbols. 769 static const char* const kRules = 770 "$CR = [\\p{Grapheme_Cluster_Break = CR}];" 771 "$LF = [\\p{Grapheme_Cluster_Break = LF}];" 772 "$Control = [\\p{Grapheme_Cluster_Break = Control}];" 773 "$VoiceMarks = [\\uFF9E\\uFF9F];" // Japanese half-width katakana voiced marks 774 "$Extend = [\\p{Grapheme_Cluster_Break = Extend} $VoiceMarks - [\\u0E30 \\u0E32 \\u0E45 \\u0EB0 \\u0EB2]];" 775 "$SpacingMark = [[\\p{General_Category = Spacing Mark}] - $Extend];" 776 "$L = [\\p{Grapheme_Cluster_Break = L}];" 777 "$V = [\\p{Grapheme_Cluster_Break = V}];" 778 "$T = [\\p{Grapheme_Cluster_Break = T}];" 779 "$LV = [\\p{Grapheme_Cluster_Break = LV}];" 780 "$LVT = [\\p{Grapheme_Cluster_Break = LVT}];" 781 "$Hin0 = [\\u0905-\\u0939];" // Devanagari Letter A,...,Ha 782 "$HinV = \\u094D;" // Devanagari Sign Virama 783 "$Hin1 = [\\u0915-\\u0939];" // Devanagari Letter Ka,...,Ha 784 "$Ben0 = [\\u0985-\\u09B9];" // Bengali Letter A,...,Ha 785 "$BenV = \\u09CD;" // Bengali Sign Virama 786 "$Ben1 = [\\u0995-\\u09B9];" // Bengali Letter Ka,...,Ha 787 "$Pan0 = [\\u0A05-\\u0A39];" // Gurmukhi Letter A,...,Ha 788 "$PanV = \\u0A4D;" // Gurmukhi Sign Virama 789 "$Pan1 = [\\u0A15-\\u0A39];" // Gurmukhi Letter Ka,...,Ha 790 "$Guj0 = [\\u0A85-\\u0AB9];" // Gujarati Letter A,...,Ha 791 "$GujV = \\u0ACD;" // Gujarati Sign Virama 792 "$Guj1 = [\\u0A95-\\u0AB9];" // Gujarati Letter Ka,...,Ha 793 "$Ori0 = [\\u0B05-\\u0B39];" // Oriya Letter A,...,Ha 794 "$OriV = \\u0B4D;" // Oriya Sign Virama 795 "$Ori1 = [\\u0B15-\\u0B39];" // Oriya Letter Ka,...,Ha 796 "$Tel0 = [\\u0C05-\\u0C39];" // Telugu Letter A,...,Ha 797 "$TelV = \\u0C4D;" // Telugu Sign Virama 798 "$Tel1 = [\\u0C14-\\u0C39];" // Telugu Letter Ka,...,Ha 799 "$Kan0 = [\\u0C85-\\u0CB9];" // Kannada Letter A,...,Ha 800 "$KanV = \\u0CCD;" // Kannada Sign Virama 801 "$Kan1 = [\\u0C95-\\u0CB9];" // Kannada Letter A,...,Ha 802 "$Mal0 = [\\u0D05-\\u0D39];" // Malayalam Letter A,...,Ha 803 "$MalV = \\u0D4D;" // Malayalam Sign Virama 804 "$Mal1 = [\\u0D15-\\u0D39];" // Malayalam Letter A,...,Ha 805 "$RI = [\\U0001F1E6-\\U0001F1FF];" // Emoji regional indicators 806 "!!chain;" 807 "!!forward;" 808 "$CR $LF;" 809 "$L ($L | $V | $LV | $LVT);" 810 "($LV | $V) ($V | $T);" 811 "($LVT | $T) $T;" 812 "[^$Control $CR $LF] $Extend;" 813 "[^$Control $CR $LF] $SpacingMark;" 814 "$RI $RI / $RI;" 815 "$RI $RI;" 816 "$Hin0 $HinV $Hin1;" // Devanagari Virama (forward) 817 "$Ben0 $BenV $Ben1;" // Bengali Virama (forward) 818 "$Pan0 $PanV $Pan1;" // Gurmukhi Virama (forward) 819 "$Guj0 $GujV $Guj1;" // Gujarati Virama (forward) 820 "$Ori0 $OriV $Ori1;" // Oriya Virama (forward) 821 "$Tel0 $TelV $Tel1;" // Telugu Virama (forward) 822 "$Kan0 $KanV $Kan1;" // Kannada Virama (forward) 823 "$Mal0 $MalV $Mal1;" // Malayalam Virama (forward) 824 "!!reverse;" 825 "$LF $CR;" 826 "($L | $V | $LV | $LVT) $L;" 827 "($V | $T) ($LV | $V);" 828 "$T ($LVT | $T);" 829 "$Extend [^$Control $CR $LF];" 830 "$SpacingMark [^$Control $CR $LF];" 831 "$RI $RI / $RI $RI;" 832 "$RI $RI;" 833 "$Hin1 $HinV $Hin0;" // Devanagari Virama (backward) 834 "$Ben1 $BenV $Ben0;" // Bengali Virama (backward) 835 "$Pan1 $PanV $Pan0;" // Gurmukhi Virama (backward) 836 "$Guj1 $GujV $Guj0;" // Gujarati Virama (backward) 837 "$Ori1 $OriV $Ori0;" // Gujarati Virama (backward) 838 "$Tel1 $TelV $Tel0;" // Telugu Virama (backward) 839 "$Kan1 $KanV $Kan0;" // Kannada Virama (backward) 840 "$Mal1 $MalV $Mal0;" // Malayalam Virama (backward) 841 "!!safe_reverse;" 842 "!!safe_forward;"; 843 844 return setUpIteratorWithRules(kRules, string, length); 845 } 846 847 } 848