1 /* 2 * Copyright (C) 2006 Lars Knoll <lars (at) trolltech.com> 3 * Copyright (C) 2007, 2011, 2012 Apple Inc. All rights reserved. 4 * 5 * This library is free software; you can redistribute it and/or 6 * modify it under the terms of the GNU Library General Public 7 * License as published by the Free Software Foundation; either 8 * version 2 of the License, or (at your option) any later version. 9 * 10 * This library is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 * Library General Public License for more details. 14 * 15 * You should have received a copy of the GNU Library General Public License 16 * along with this library; see the file COPYING.LIB. If not, write to 17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 18 * Boston, MA 02110-1301, USA. 19 * 20 */ 21 22 #include "config.h" 23 #include "core/platform/text/TextBreakIterator.h" 24 25 #include "core/platform/text/LineBreakIteratorPoolICU.h" 26 #include "wtf/text/WTFString.h" 27 28 using namespace WTF; 29 using namespace std; 30 31 namespace WebCore { 32 33 static TextBreakIterator* ensureIterator(bool& createdIterator, TextBreakIterator*& iterator, UBreakIteratorType type) 34 { 35 if (!createdIterator) { 36 UErrorCode openStatus = U_ZERO_ERROR; 37 iterator = reinterpret_cast<TextBreakIterator*>(ubrk_open(type, currentTextBreakLocaleID(), 0, 0, &openStatus)); 38 createdIterator = true; 39 ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus); 40 } 41 return iterator; 42 } 43 44 enum TextContext { NoContext, PriorContext, PrimaryContext }; 45 46 const int textBufferCapacity = 16; 47 48 typedef struct { 49 UText text; 50 UChar buffer[textBufferCapacity]; 51 } UTextWithBuffer; 52 53 static inline int64_t textPinIndex(int64_t& index, int64_t limit) 54 { 55 if (index < 0) 56 index = 0; 57 else if (index > limit) 58 index = limit; 59 return index; 60 } 61 62 static inline int64_t textNativeLength(UText* text) 63 { 64 return text->a + text->b; 65 } 66 67 // Relocate pointer from source into destination as required. 68 static void textFixPointer(const UText* source, UText* destination, const void*& pointer) 69 { 70 if (pointer >= source->pExtra && pointer < static_cast<char*>(source->pExtra) + source->extraSize) { 71 // Pointer references source extra buffer. 72 pointer = static_cast<char*>(destination->pExtra) + (static_cast<const char*>(pointer) - static_cast<const char*>(source->pExtra)); 73 } else if (pointer >= source && pointer < reinterpret_cast<const char*>(source) + source->sizeOfStruct) { 74 // Pointer references source text structure, but not source extra buffer. 75 pointer = reinterpret_cast<char*>(destination) + (static_cast<const char*>(pointer) - reinterpret_cast<const char*>(source)); 76 } 77 } 78 79 static UText* textClone(UText* destination, const UText* source, UBool deep, UErrorCode* status) 80 { 81 ASSERT_UNUSED(deep, !deep); 82 if (U_FAILURE(*status)) 83 return 0; 84 int32_t extraSize = source->extraSize; 85 destination = utext_setup(destination, extraSize, status); 86 if (U_FAILURE(*status)) 87 return destination; 88 void* extraNew = destination->pExtra; 89 int32_t flags = destination->flags; 90 int sizeToCopy = min(source->sizeOfStruct, destination->sizeOfStruct); 91 memcpy(destination, source, sizeToCopy); 92 destination->pExtra = extraNew; 93 destination->flags = flags; 94 memcpy(destination->pExtra, source->pExtra, extraSize); 95 textFixPointer(source, destination, destination->context); 96 textFixPointer(source, destination, destination->p); 97 textFixPointer(source, destination, destination->q); 98 ASSERT(!destination->r); 99 const void * chunkContents = static_cast<const void*>(destination->chunkContents); 100 textFixPointer(source, destination, chunkContents); 101 destination->chunkContents = static_cast<const UChar*>(chunkContents); 102 return destination; 103 } 104 105 static int32_t textExtract(UText* text, int64_t start, int64_t limit, UChar* destination, int32_t destinationCapacity, UErrorCode* errorCode) 106 { 107 UNUSED_PARAM(text); 108 UNUSED_PARAM(start); 109 UNUSED_PARAM(limit); 110 UNUSED_PARAM(destination); 111 UNUSED_PARAM(destinationCapacity); 112 // In the present context, this text provider is used only with ICU functions 113 // that do not perform an extract operation. 114 ASSERT_NOT_REACHED(); 115 *errorCode = U_UNSUPPORTED_ERROR; 116 return 0; 117 } 118 119 static void textClose(UText* text) 120 { 121 text->context = 0; 122 } 123 124 static inline TextContext textGetContext(const UText* text, int64_t nativeIndex, UBool forward) 125 { 126 if (!text->b || nativeIndex > text->b) 127 return PrimaryContext; 128 if (nativeIndex == text->b) 129 return forward ? PrimaryContext : PriorContext; 130 return PriorContext; 131 } 132 133 static inline TextContext textLatin1GetCurrentContext(const UText* text) 134 { 135 if (!text->chunkContents) 136 return NoContext; 137 return text->chunkContents == text->pExtra ? PrimaryContext : PriorContext; 138 } 139 140 static void textLatin1MoveInPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward) 141 { 142 ASSERT(text->chunkContents == text->pExtra); 143 if (forward) { 144 ASSERT(nativeIndex >= text->b && nativeIndex < nativeLength); 145 text->chunkNativeStart = nativeIndex; 146 text->chunkNativeLimit = nativeIndex + text->extraSize / sizeof(UChar); 147 if (text->chunkNativeLimit > nativeLength) 148 text->chunkNativeLimit = nativeLength; 149 } else { 150 ASSERT(nativeIndex > text->b && nativeIndex <= nativeLength); 151 text->chunkNativeLimit = nativeIndex; 152 text->chunkNativeStart = nativeIndex - text->extraSize / sizeof(UChar); 153 if (text->chunkNativeStart < text->b) 154 text->chunkNativeStart = text->b; 155 } 156 int64_t length = text->chunkNativeLimit - text->chunkNativeStart; 157 // Ensure chunk length is well defined if computed length exceeds int32_t range. 158 ASSERT(length <= numeric_limits<int32_t>::max()); 159 text->chunkLength = length <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(length) : 0; 160 text->nativeIndexingLimit = text->chunkLength; 161 text->chunkOffset = forward ? 0 : text->chunkLength; 162 StringImpl::copyChars(const_cast<UChar*>(text->chunkContents), static_cast<const LChar*>(text->p) + (text->chunkNativeStart - text->b), static_cast<unsigned>(text->chunkLength)); 163 } 164 165 static void textLatin1SwitchToPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward) 166 { 167 ASSERT(!text->chunkContents || text->chunkContents == text->q); 168 text->chunkContents = static_cast<const UChar*>(text->pExtra); 169 textLatin1MoveInPrimaryContext(text, nativeIndex, nativeLength, forward); 170 } 171 172 static void textLatin1MoveInPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward) 173 { 174 ASSERT(text->chunkContents == text->q); 175 ASSERT(forward ? nativeIndex < text->b : nativeIndex <= text->b); 176 ASSERT_UNUSED(nativeLength, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength); 177 ASSERT_UNUSED(forward, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength); 178 text->chunkNativeStart = 0; 179 text->chunkNativeLimit = text->b; 180 text->chunkLength = text->b; 181 text->nativeIndexingLimit = text->chunkLength; 182 int64_t offset = nativeIndex - text->chunkNativeStart; 183 // Ensure chunk offset is well defined if computed offset exceeds int32_t range or chunk length. 184 ASSERT(offset <= numeric_limits<int32_t>::max()); 185 text->chunkOffset = min(offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0, text->chunkLength); 186 } 187 188 static void textLatin1SwitchToPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward) 189 { 190 ASSERT(!text->chunkContents || text->chunkContents == text->pExtra); 191 text->chunkContents = static_cast<const UChar*>(text->q); 192 textLatin1MoveInPriorContext(text, nativeIndex, nativeLength, forward); 193 } 194 195 static inline bool textInChunkOrOutOfRange(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward, UBool& isAccessible) 196 { 197 if (forward) { 198 if (nativeIndex >= text->chunkNativeStart && nativeIndex < text->chunkNativeLimit) { 199 int64_t offset = nativeIndex - text->chunkNativeStart; 200 // Ensure chunk offset is well formed if computed offset exceeds int32_t range. 201 ASSERT(offset <= numeric_limits<int32_t>::max()); 202 text->chunkOffset = offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0; 203 isAccessible = TRUE; 204 return true; 205 } 206 if (nativeIndex >= nativeLength && text->chunkNativeLimit == nativeLength) { 207 text->chunkOffset = text->chunkLength; 208 isAccessible = FALSE; 209 return true; 210 } 211 } else { 212 if (nativeIndex > text->chunkNativeStart && nativeIndex <= text->chunkNativeLimit) { 213 int64_t offset = nativeIndex - text->chunkNativeStart; 214 // Ensure chunk offset is well formed if computed offset exceeds int32_t range. 215 ASSERT(offset <= numeric_limits<int32_t>::max()); 216 text->chunkOffset = offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0; 217 isAccessible = TRUE; 218 return true; 219 } 220 if (nativeIndex <= 0 && !text->chunkNativeStart) { 221 text->chunkOffset = 0; 222 isAccessible = FALSE; 223 return true; 224 } 225 } 226 return false; 227 } 228 229 static UBool textLatin1Access(UText* text, int64_t nativeIndex, UBool forward) 230 { 231 if (!text->context) 232 return FALSE; 233 int64_t nativeLength = textNativeLength(text); 234 UBool isAccessible; 235 if (textInChunkOrOutOfRange(text, nativeIndex, nativeLength, forward, isAccessible)) 236 return isAccessible; 237 nativeIndex = textPinIndex(nativeIndex, nativeLength - 1); 238 TextContext currentContext = textLatin1GetCurrentContext(text); 239 TextContext newContext = textGetContext(text, nativeIndex, forward); 240 ASSERT(newContext != NoContext); 241 if (newContext == currentContext) { 242 if (currentContext == PrimaryContext) { 243 textLatin1MoveInPrimaryContext(text, nativeIndex, nativeLength, forward); 244 } else { 245 textLatin1MoveInPriorContext(text, nativeIndex, nativeLength, forward); 246 } 247 } else if (newContext == PrimaryContext) { 248 textLatin1SwitchToPrimaryContext(text, nativeIndex, nativeLength, forward); 249 } else { 250 ASSERT(newContext == PriorContext); 251 textLatin1SwitchToPriorContext(text, nativeIndex, nativeLength, forward); 252 } 253 return TRUE; 254 } 255 256 static const struct UTextFuncs textLatin1Funcs = { 257 sizeof(UTextFuncs), 258 0, 0, 0, 259 textClone, 260 textNativeLength, 261 textLatin1Access, 262 textExtract, 263 0, 0, 0, 0, 264 textClose, 265 0, 0, 0, 266 }; 267 268 static void textInit(UText* text, const UTextFuncs* funcs, const void* string, unsigned length, const UChar* priorContext, int priorContextLength) 269 { 270 text->pFuncs = funcs; 271 text->providerProperties = 1 << UTEXT_PROVIDER_STABLE_CHUNKS; 272 text->context = string; 273 text->p = string; 274 text->a = length; 275 text->q = priorContext; 276 text->b = priorContextLength; 277 } 278 279 static UText* textOpenLatin1(UTextWithBuffer* utWithBuffer, const LChar* string, unsigned length, const UChar* priorContext, int priorContextLength, UErrorCode* status) 280 { 281 if (U_FAILURE(*status)) 282 return 0; 283 284 if (!string || length > static_cast<unsigned>(numeric_limits<int32_t>::max())) { 285 *status = U_ILLEGAL_ARGUMENT_ERROR; 286 return 0; 287 } 288 UText* text = utext_setup(&utWithBuffer->text, sizeof(utWithBuffer->buffer), status); 289 if (U_FAILURE(*status)) { 290 ASSERT(!text); 291 return 0; 292 } 293 textInit(text, &textLatin1Funcs, string, length, priorContext, priorContextLength); 294 return text; 295 } 296 297 static inline TextContext textUTF16GetCurrentContext(const UText* text) 298 { 299 if (!text->chunkContents) 300 return NoContext; 301 return text->chunkContents == text->p ? PrimaryContext : PriorContext; 302 } 303 304 static void textUTF16MoveInPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward) 305 { 306 ASSERT(text->chunkContents == text->p); 307 ASSERT_UNUSED(forward, forward ? nativeIndex >= text->b : nativeIndex > text->b); 308 ASSERT_UNUSED(forward, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength); 309 text->chunkNativeStart = text->b; 310 text->chunkNativeLimit = nativeLength; 311 int64_t length = text->chunkNativeLimit - text->chunkNativeStart; 312 // Ensure chunk length is well defined if computed length exceeds int32_t range. 313 ASSERT(length <= numeric_limits<int32_t>::max()); 314 text->chunkLength = length <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(length) : 0; 315 text->nativeIndexingLimit = text->chunkLength; 316 int64_t offset = nativeIndex - text->chunkNativeStart; 317 // Ensure chunk offset is well defined if computed offset exceeds int32_t range or chunk length. 318 ASSERT(offset <= numeric_limits<int32_t>::max()); 319 text->chunkOffset = min(offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0, text->chunkLength); 320 } 321 322 static void textUTF16SwitchToPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward) 323 { 324 ASSERT(!text->chunkContents || text->chunkContents == text->q); 325 text->chunkContents = static_cast<const UChar*>(text->p); 326 textUTF16MoveInPrimaryContext(text, nativeIndex, nativeLength, forward); 327 } 328 329 static void textUTF16MoveInPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward) 330 { 331 ASSERT(text->chunkContents == text->q); 332 ASSERT(forward ? nativeIndex < text->b : nativeIndex <= text->b); 333 ASSERT_UNUSED(nativeLength, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength); 334 ASSERT_UNUSED(forward, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength); 335 text->chunkNativeStart = 0; 336 text->chunkNativeLimit = text->b; 337 text->chunkLength = text->b; 338 text->nativeIndexingLimit = text->chunkLength; 339 int64_t offset = nativeIndex - text->chunkNativeStart; 340 // Ensure chunk offset is well defined if computed offset exceeds int32_t range or chunk length. 341 ASSERT(offset <= numeric_limits<int32_t>::max()); 342 text->chunkOffset = min(offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0, text->chunkLength); 343 } 344 345 static void textUTF16SwitchToPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward) 346 { 347 ASSERT(!text->chunkContents || text->chunkContents == text->p); 348 text->chunkContents = static_cast<const UChar*>(text->q); 349 textUTF16MoveInPriorContext(text, nativeIndex, nativeLength, forward); 350 } 351 352 static UBool textUTF16Access(UText* text, int64_t nativeIndex, UBool forward) 353 { 354 if (!text->context) 355 return FALSE; 356 int64_t nativeLength = textNativeLength(text); 357 UBool isAccessible; 358 if (textInChunkOrOutOfRange(text, nativeIndex, nativeLength, forward, isAccessible)) 359 return isAccessible; 360 nativeIndex = textPinIndex(nativeIndex, nativeLength - 1); 361 TextContext currentContext = textUTF16GetCurrentContext(text); 362 TextContext newContext = textGetContext(text, nativeIndex, forward); 363 ASSERT(newContext != NoContext); 364 if (newContext == currentContext) { 365 if (currentContext == PrimaryContext) { 366 textUTF16MoveInPrimaryContext(text, nativeIndex, nativeLength, forward); 367 } else { 368 textUTF16MoveInPriorContext(text, nativeIndex, nativeLength, forward); 369 } 370 } else if (newContext == PrimaryContext) { 371 textUTF16SwitchToPrimaryContext(text, nativeIndex, nativeLength, forward); 372 } else { 373 ASSERT(newContext == PriorContext); 374 textUTF16SwitchToPriorContext(text, nativeIndex, nativeLength, forward); 375 } 376 return TRUE; 377 } 378 379 static const struct UTextFuncs textUTF16Funcs = { 380 sizeof(UTextFuncs), 381 0, 0, 0, 382 textClone, 383 textNativeLength, 384 textUTF16Access, 385 textExtract, 386 0, 0, 0, 0, 387 textClose, 388 0, 0, 0, 389 }; 390 391 static UText* textOpenUTF16(UText* text, const UChar* string, unsigned length, const UChar* priorContext, int priorContextLength, UErrorCode* status) 392 { 393 if (U_FAILURE(*status)) 394 return 0; 395 396 if (!string || length > static_cast<unsigned>(numeric_limits<int32_t>::max())) { 397 *status = U_ILLEGAL_ARGUMENT_ERROR; 398 return 0; 399 } 400 401 text = utext_setup(text, 0, status); 402 if (U_FAILURE(*status)) { 403 ASSERT(!text); 404 return 0; 405 } 406 textInit(text, &textUTF16Funcs, string, length, priorContext, priorContextLength); 407 return text; 408 } 409 410 static UText emptyText = UTEXT_INITIALIZER; 411 412 static TextBreakIterator* setUpIterator(bool& createdIterator, TextBreakIterator*& iterator, UBreakIteratorType type, const UChar* string, int length) 413 { 414 if (!string) 415 return 0; 416 417 iterator = ensureIterator(createdIterator, iterator, type); 418 if (!iterator) 419 return 0; 420 421 UErrorCode setTextStatus = U_ZERO_ERROR; 422 ubrk_setText(reinterpret_cast<UBreakIterator*>(iterator), string, length, &setTextStatus); 423 if (U_FAILURE(setTextStatus)) 424 return 0; 425 426 return iterator; 427 } 428 429 static TextBreakIterator* setUpIterator(bool& createdIterator, TextBreakIterator*& iterator, UBreakIteratorType type, const LChar* string, int length) 430 { 431 if (!string) 432 return 0; 433 434 iterator = ensureIterator(createdIterator, iterator, type); 435 if (!iterator) 436 return 0; 437 438 UTextWithBuffer textLocal; 439 textLocal.text = emptyText; 440 textLocal.text.extraSize = sizeof(textLocal.buffer); 441 textLocal.text.pExtra = textLocal.buffer; 442 443 UErrorCode openStatus = U_ZERO_ERROR; 444 UText* text = textOpenLatin1(&textLocal, string, length, 0, 0, &openStatus); 445 if (U_FAILURE(openStatus)) { 446 LOG_ERROR("textOpenLatin1 failed with status %d", openStatus); 447 return 0; 448 } 449 450 UErrorCode setTextStatus = U_ZERO_ERROR; 451 ubrk_setUText(reinterpret_cast<UBreakIterator*>(iterator), text, &setTextStatus); 452 if (U_FAILURE(setTextStatus)) { 453 LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus); 454 // FIXME: Do we need to call utext_close(text) here? 455 return 0; 456 } 457 458 utext_close(text); 459 460 return iterator; 461 } 462 463 static TextBreakIterator* wordBreakIterator(const LChar* string, int length) 464 { 465 static bool createdWordBreakIterator8 = false; 466 static TextBreakIterator* staticWordBreakIterator8; 467 return setUpIterator(createdWordBreakIterator8, 468 staticWordBreakIterator8, UBRK_WORD, string, length); 469 } 470 471 TextBreakIterator* wordBreakIterator(const UChar* string, int length) 472 { 473 static bool createdWordBreakIterator16 = false; 474 static TextBreakIterator* staticWordBreakIterator16; 475 return setUpIterator(createdWordBreakIterator16, 476 staticWordBreakIterator16, UBRK_WORD, string, length); 477 } 478 479 TextBreakIterator* wordBreakIterator(const String& string, int start, int length) 480 { 481 if (string.isEmpty()) 482 return 0; 483 if (string.is8Bit()) 484 return wordBreakIterator(string.characters8() + start, length); 485 return wordBreakIterator(string.characters16() + start, length); 486 } 487 488 TextBreakIterator* acquireLineBreakIterator(const LChar* string, int length, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength) 489 { 490 UBreakIterator* iterator = LineBreakIteratorPool::sharedPool().take(locale); 491 if (!iterator) 492 return 0; 493 494 UTextWithBuffer textLocal; 495 textLocal.text = emptyText; 496 textLocal.text.extraSize = sizeof(textLocal.buffer); 497 textLocal.text.pExtra = textLocal.buffer; 498 499 UErrorCode openStatus = U_ZERO_ERROR; 500 UText* text = textOpenLatin1(&textLocal, string, length, priorContext, priorContextLength, &openStatus); 501 if (U_FAILURE(openStatus)) { 502 LOG_ERROR("textOpenLatin1 failed with status %d", openStatus); 503 return 0; 504 } 505 506 UErrorCode setTextStatus = U_ZERO_ERROR; 507 ubrk_setUText(iterator, text, &setTextStatus); 508 if (U_FAILURE(setTextStatus)) { 509 // FIXME: Do we need to call utext_close(text) here? 510 LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus); 511 return 0; 512 } 513 514 utext_close(text); 515 516 return reinterpret_cast<TextBreakIterator*>(iterator); 517 } 518 519 TextBreakIterator* acquireLineBreakIterator(const UChar* string, int length, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength) 520 { 521 UBreakIterator* iterator = LineBreakIteratorPool::sharedPool().take(locale); 522 if (!iterator) 523 return 0; 524 525 UText textLocal = UTEXT_INITIALIZER; 526 527 UErrorCode openStatus = U_ZERO_ERROR; 528 UText* text = textOpenUTF16(&textLocal, string, length, priorContext, priorContextLength, &openStatus); 529 if (U_FAILURE(openStatus)) { 530 LOG_ERROR("textOpenUTF16 failed with status %d", openStatus); 531 return 0; 532 } 533 534 UErrorCode setTextStatus = U_ZERO_ERROR; 535 ubrk_setUText(iterator, text, &setTextStatus); 536 if (U_FAILURE(setTextStatus)) { 537 // FIXME: Do we need to call utext_close(text) here? 538 LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus); 539 return 0; 540 } 541 542 utext_close(text); 543 544 return reinterpret_cast<TextBreakIterator*>(iterator); 545 } 546 547 void releaseLineBreakIterator(TextBreakIterator* iterator) 548 { 549 ASSERT_ARG(iterator, iterator); 550 551 LineBreakIteratorPool::sharedPool().put(reinterpret_cast<UBreakIterator*>(iterator)); 552 } 553 554 static TextBreakIterator* nonSharedCharacterBreakIterator; 555 556 static inline bool compareAndSwapNonSharedCharacterBreakIterator(TextBreakIterator* expected, TextBreakIterator* newValue) 557 { 558 DEFINE_STATIC_LOCAL(Mutex, nonSharedCharacterBreakIteratorMutex, ()); 559 MutexLocker locker(nonSharedCharacterBreakIteratorMutex); 560 if (nonSharedCharacterBreakIterator != expected) 561 return false; 562 nonSharedCharacterBreakIterator = newValue; 563 return true; 564 } 565 566 NonSharedCharacterBreakIterator::NonSharedCharacterBreakIterator(const String& string) 567 : m_is8Bit(true) 568 , m_charaters8(0) 569 , m_offset(0) 570 , m_length(0) 571 , m_iterator(0) 572 { 573 if (string.isEmpty()) 574 return; 575 576 m_is8Bit = string.is8Bit(); 577 578 if (m_is8Bit) { 579 m_charaters8 = string.characters8(); 580 m_offset = 0; 581 m_length = string.length(); 582 return; 583 } 584 585 createIteratorForBuffer(string.characters16(), string.length()); 586 } 587 588 NonSharedCharacterBreakIterator::NonSharedCharacterBreakIterator(const UChar* buffer, unsigned length) 589 : m_is8Bit(false) 590 , m_charaters8(0) 591 , m_offset(0) 592 , m_length(0) 593 , m_iterator(0) 594 { 595 createIteratorForBuffer(buffer, length); 596 } 597 598 void NonSharedCharacterBreakIterator::createIteratorForBuffer(const UChar* buffer, unsigned length) 599 { 600 m_iterator = nonSharedCharacterBreakIterator; 601 bool createdIterator = m_iterator && compareAndSwapNonSharedCharacterBreakIterator(m_iterator, 0); 602 m_iterator = setUpIterator(createdIterator, m_iterator, UBRK_CHARACTER, buffer, length); 603 } 604 605 NonSharedCharacterBreakIterator::~NonSharedCharacterBreakIterator() 606 { 607 if (m_is8Bit) 608 return; 609 if (!compareAndSwapNonSharedCharacterBreakIterator(0, m_iterator)) 610 ubrk_close(reinterpret_cast<UBreakIterator*>(m_iterator)); 611 } 612 613 int NonSharedCharacterBreakIterator::next() 614 { 615 if (!m_is8Bit) 616 return textBreakNext(m_iterator); 617 618 if (m_offset >= m_length) 619 return TextBreakDone; 620 621 m_offset += clusterLengthStartingAt(m_offset); 622 return m_offset; 623 } 624 625 int NonSharedCharacterBreakIterator::current() 626 { 627 if (!m_is8Bit) 628 return textBreakCurrent(m_iterator); 629 return m_offset; 630 } 631 632 bool NonSharedCharacterBreakIterator::isBreak(int offset) const 633 { 634 if (!m_is8Bit) 635 return isTextBreak(m_iterator, offset); 636 return !isLFAfterCR(offset); 637 } 638 639 int NonSharedCharacterBreakIterator::preceding(int offset) const 640 { 641 if (!m_is8Bit) 642 return textBreakPreceding(m_iterator, offset); 643 if (offset <= 0) 644 return TextBreakDone; 645 if (isLFAfterCR(offset)) 646 return offset - 2; 647 return offset - 1; 648 } 649 650 int NonSharedCharacterBreakIterator::following(int offset) const 651 { 652 if (!m_is8Bit) 653 return textBreakFollowing(m_iterator, offset); 654 if (static_cast<unsigned>(offset) >= m_length) 655 return TextBreakDone; 656 return offset + clusterLengthStartingAt(offset); 657 } 658 659 TextBreakIterator* sentenceBreakIterator(const UChar* string, int length) 660 { 661 static bool createdSentenceBreakIterator = false; 662 static TextBreakIterator* staticSentenceBreakIterator; 663 return setUpIterator(createdSentenceBreakIterator, 664 staticSentenceBreakIterator, UBRK_SENTENCE, string, length); 665 } 666 667 int textBreakFirst(TextBreakIterator* iterator) 668 { 669 return ubrk_first(reinterpret_cast<UBreakIterator*>(iterator)); 670 } 671 672 int textBreakLast(TextBreakIterator* iterator) 673 { 674 return ubrk_last(reinterpret_cast<UBreakIterator*>(iterator)); 675 } 676 677 int textBreakNext(TextBreakIterator* iterator) 678 { 679 return ubrk_next(reinterpret_cast<UBreakIterator*>(iterator)); 680 } 681 682 int textBreakPrevious(TextBreakIterator* iterator) 683 { 684 return ubrk_previous(reinterpret_cast<UBreakIterator*>(iterator)); 685 } 686 687 int textBreakPreceding(TextBreakIterator* iterator, int pos) 688 { 689 return ubrk_preceding(reinterpret_cast<UBreakIterator*>(iterator), pos); 690 } 691 692 int textBreakFollowing(TextBreakIterator* iterator, int pos) 693 { 694 return ubrk_following(reinterpret_cast<UBreakIterator*>(iterator), pos); 695 } 696 697 int textBreakCurrent(TextBreakIterator* iterator) 698 { 699 return ubrk_current(reinterpret_cast<UBreakIterator*>(iterator)); 700 } 701 702 bool isTextBreak(TextBreakIterator* iterator, int position) 703 { 704 return ubrk_isBoundary(reinterpret_cast<UBreakIterator*>(iterator), position); 705 } 706 707 bool isWordTextBreak(TextBreakIterator* iterator) 708 { 709 int ruleStatus = ubrk_getRuleStatus(reinterpret_cast<UBreakIterator*>(iterator)); 710 return ruleStatus != UBRK_WORD_NONE; 711 } 712 713 static TextBreakIterator* setUpIteratorWithRules(bool& createdIterator, TextBreakIterator*& iterator, 714 const char* breakRules, const UChar* string, int length) 715 { 716 if (!string) 717 return 0; 718 719 if (!createdIterator) { 720 UParseError parseStatus; 721 UErrorCode openStatus = U_ZERO_ERROR; 722 Vector<UChar> rules; 723 String(breakRules).appendTo(rules); 724 iterator = reinterpret_cast<TextBreakIterator*>(ubrk_openRules(rules.data(), rules.size(), 0, 0, &parseStatus, &openStatus)); 725 createdIterator = true; 726 ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus); 727 } 728 if (!iterator) 729 return 0; 730 731 UErrorCode setTextStatus = U_ZERO_ERROR; 732 ubrk_setText(reinterpret_cast<UBreakIterator*>(iterator), string, length, &setTextStatus); 733 if (U_FAILURE(setTextStatus)) 734 return 0; 735 736 return iterator; 737 } 738 739 TextBreakIterator* cursorMovementIterator(const UChar* string, int length) 740 { 741 // This rule set is based on character-break iterator rules of ICU 4.0 742 // <http://source.icu-project.org/repos/icu/icu/tags/release-4-0/source/data/brkitr/char.txt>. 743 // The major differences from the original ones are listed below: 744 // * Replaced '[\p{Grapheme_Cluster_Break = SpacingMark}]' with '[\p{General_Category = Spacing Mark} - $Extend]' for ICU 3.8 or earlier; 745 // * Removed rules that prevent a cursor from moving after prepend characters (Bug 24342); 746 // * Added rules that prevent a cursor from moving after virama signs of Indic languages except Tamil (Bug 15790), and; 747 // * Added rules that prevent a cursor from moving before Japanese half-width katakara voiced marks. 748 // * Added rules for regional indicator symbols. 749 static const char* kRules = 750 "$CR = [\\p{Grapheme_Cluster_Break = CR}];" 751 "$LF = [\\p{Grapheme_Cluster_Break = LF}];" 752 "$Control = [\\p{Grapheme_Cluster_Break = Control}];" 753 "$VoiceMarks = [\\uFF9E\\uFF9F];" // Japanese half-width katakana voiced marks 754 "$Extend = [\\p{Grapheme_Cluster_Break = Extend} $VoiceMarks - [\\u0E30 \\u0E32 \\u0E45 \\u0EB0 \\u0EB2]];" 755 "$SpacingMark = [[\\p{General_Category = Spacing Mark}] - $Extend];" 756 "$L = [\\p{Grapheme_Cluster_Break = L}];" 757 "$V = [\\p{Grapheme_Cluster_Break = V}];" 758 "$T = [\\p{Grapheme_Cluster_Break = T}];" 759 "$LV = [\\p{Grapheme_Cluster_Break = LV}];" 760 "$LVT = [\\p{Grapheme_Cluster_Break = LVT}];" 761 "$Hin0 = [\\u0905-\\u0939];" // Devanagari Letter A,...,Ha 762 "$HinV = \\u094D;" // Devanagari Sign Virama 763 "$Hin1 = [\\u0915-\\u0939];" // Devanagari Letter Ka,...,Ha 764 "$Ben0 = [\\u0985-\\u09B9];" // Bengali Letter A,...,Ha 765 "$BenV = \\u09CD;" // Bengali Sign Virama 766 "$Ben1 = [\\u0995-\\u09B9];" // Bengali Letter Ka,...,Ha 767 "$Pan0 = [\\u0A05-\\u0A39];" // Gurmukhi Letter A,...,Ha 768 "$PanV = \\u0A4D;" // Gurmukhi Sign Virama 769 "$Pan1 = [\\u0A15-\\u0A39];" // Gurmukhi Letter Ka,...,Ha 770 "$Guj0 = [\\u0A85-\\u0AB9];" // Gujarati Letter A,...,Ha 771 "$GujV = \\u0ACD;" // Gujarati Sign Virama 772 "$Guj1 = [\\u0A95-\\u0AB9];" // Gujarati Letter Ka,...,Ha 773 "$Ori0 = [\\u0B05-\\u0B39];" // Oriya Letter A,...,Ha 774 "$OriV = \\u0B4D;" // Oriya Sign Virama 775 "$Ori1 = [\\u0B15-\\u0B39];" // Oriya Letter Ka,...,Ha 776 "$Tel0 = [\\u0C05-\\u0C39];" // Telugu Letter A,...,Ha 777 "$TelV = \\u0C4D;" // Telugu Sign Virama 778 "$Tel1 = [\\u0C14-\\u0C39];" // Telugu Letter Ka,...,Ha 779 "$Kan0 = [\\u0C85-\\u0CB9];" // Kannada Letter A,...,Ha 780 "$KanV = \\u0CCD;" // Kannada Sign Virama 781 "$Kan1 = [\\u0C95-\\u0CB9];" // Kannada Letter A,...,Ha 782 "$Mal0 = [\\u0D05-\\u0D39];" // Malayalam Letter A,...,Ha 783 "$MalV = \\u0D4D;" // Malayalam Sign Virama 784 "$Mal1 = [\\u0D15-\\u0D39];" // Malayalam Letter A,...,Ha 785 "$RI = [\\U0001F1E6-\\U0001F1FF];" // Emoji regional indicators 786 "!!chain;" 787 "!!forward;" 788 "$CR $LF;" 789 "$L ($L | $V | $LV | $LVT);" 790 "($LV | $V) ($V | $T);" 791 "($LVT | $T) $T;" 792 "[^$Control $CR $LF] $Extend;" 793 "[^$Control $CR $LF] $SpacingMark;" 794 "$RI $RI / $RI;" 795 "$RI $RI;" 796 "$Hin0 $HinV $Hin1;" // Devanagari Virama (forward) 797 "$Ben0 $BenV $Ben1;" // Bengali Virama (forward) 798 "$Pan0 $PanV $Pan1;" // Gurmukhi Virama (forward) 799 "$Guj0 $GujV $Guj1;" // Gujarati Virama (forward) 800 "$Ori0 $OriV $Ori1;" // Oriya Virama (forward) 801 "$Tel0 $TelV $Tel1;" // Telugu Virama (forward) 802 "$Kan0 $KanV $Kan1;" // Kannada Virama (forward) 803 "$Mal0 $MalV $Mal1;" // Malayalam Virama (forward) 804 "!!reverse;" 805 "$LF $CR;" 806 "($L | $V | $LV | $LVT) $L;" 807 "($V | $T) ($LV | $V);" 808 "$T ($LVT | $T);" 809 "$Extend [^$Control $CR $LF];" 810 "$SpacingMark [^$Control $CR $LF];" 811 "$RI $RI / $RI $RI;" 812 "$RI $RI;" 813 "$Hin1 $HinV $Hin0;" // Devanagari Virama (backward) 814 "$Ben1 $BenV $Ben0;" // Bengali Virama (backward) 815 "$Pan1 $PanV $Pan0;" // Gurmukhi Virama (backward) 816 "$Guj1 $GujV $Guj0;" // Gujarati Virama (backward) 817 "$Ori1 $OriV $Ori0;" // Gujarati Virama (backward) 818 "$Tel1 $TelV $Tel0;" // Telugu Virama (backward) 819 "$Kan1 $KanV $Kan0;" // Kannada Virama (backward) 820 "$Mal1 $MalV $Mal0;" // Malayalam Virama (backward) 821 "!!safe_reverse;" 822 "!!safe_forward;"; 823 static bool createdCursorMovementIterator = false; 824 static TextBreakIterator* staticCursorMovementIterator; 825 return setUpIteratorWithRules(createdCursorMovementIterator, staticCursorMovementIterator, kRules, string, length); 826 } 827 828 } 829