Home | History | Annotate | Download | only in text
      1 /*
      2  * Copyright (C) 2006 Lars Knoll <lars (at) trolltech.com>
      3  * Copyright (C) 2007, 2011, 2012 Apple Inc. All rights reserved.
      4  *
      5  * This library is free software; you can redistribute it and/or
      6  * modify it under the terms of the GNU Library General Public
      7  * License as published by the Free Software Foundation; either
      8  * version 2 of the License, or (at your option) any later version.
      9  *
     10  * This library is distributed in the hope that it will be useful,
     11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
     12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     13  * Library General Public License for more details.
     14  *
     15  * You should have received a copy of the GNU Library General Public License
     16  * along with this library; see the file COPYING.LIB.  If not, write to
     17  * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
     18  * Boston, MA 02110-1301, USA.
     19  *
     20  */
     21 
     22 #include "config.h"
     23 #include "platform/text/TextBreakIterator.h"
     24 
     25 #include "platform/text/TextBreakIteratorInternalICU.h"
     26 #include "wtf/Assertions.h"
     27 #include "wtf/HashMap.h"
     28 #include "wtf/PassOwnPtr.h"
     29 #include "wtf/ThreadSpecific.h"
     30 #include "wtf/ThreadingPrimitives.h"
     31 #include "wtf/text/AtomicString.h"
     32 #include "wtf/text/CString.h"
     33 #include "wtf/text/WTFString.h"
     34 #include <unicode/rbbi.h>
     35 #include <unicode/ubrk.h>
     36 
     37 using namespace WTF;
     38 using namespace std;
     39 
     40 namespace WebCore {
     41 
     42 class LineBreakIteratorPool {
     43     WTF_MAKE_NONCOPYABLE(LineBreakIteratorPool);
     44 public:
     45     static LineBreakIteratorPool& sharedPool()
     46     {
     47         static WTF::ThreadSpecific<LineBreakIteratorPool>* pool = new WTF::ThreadSpecific<LineBreakIteratorPool>;
     48         return **pool;
     49     }
     50 
     51     static PassOwnPtr<LineBreakIteratorPool> create() { return adoptPtr(new LineBreakIteratorPool); }
     52 
     53     icu::BreakIterator* take(const AtomicString& locale)
     54     {
     55         icu::BreakIterator* iterator = 0;
     56         for (size_t i = 0; i < m_pool.size(); ++i) {
     57             if (m_pool[i].first == locale) {
     58                 iterator = m_pool[i].second;
     59                 m_pool.remove(i);
     60                 break;
     61             }
     62         }
     63 
     64         if (!iterator) {
     65             UErrorCode openStatus = U_ZERO_ERROR;
     66             bool localeIsEmpty = locale.isEmpty();
     67             iterator = icu::BreakIterator::createLineInstance(localeIsEmpty ? icu::Locale(currentTextBreakLocaleID()) : icu::Locale(locale.utf8().data()), openStatus);
     68             // locale comes from a web page and it can be invalid, leading ICU
     69             // to fail, in which case we fall back to the default locale.
     70             if (!localeIsEmpty && U_FAILURE(openStatus)) {
     71                 openStatus = U_ZERO_ERROR;
     72                 iterator = icu::BreakIterator::createLineInstance(icu::Locale(currentTextBreakLocaleID()), openStatus);
     73             }
     74 
     75             if (U_FAILURE(openStatus)) {
     76                 WTF_LOG_ERROR("icu::BreakIterator construction failed with status %d", openStatus);
     77                 return 0;
     78             }
     79         }
     80 
     81         ASSERT(!m_vendedIterators.contains(iterator));
     82         m_vendedIterators.set(iterator, locale);
     83         return iterator;
     84     }
     85 
     86     void put(icu::BreakIterator* iterator)
     87     {
     88         ASSERT_ARG(iterator, m_vendedIterators.contains(iterator));
     89 
     90         if (m_pool.size() == capacity) {
     91             delete(m_pool[0].second);
     92             m_pool.remove(0);
     93         }
     94 
     95         m_pool.append(Entry(m_vendedIterators.take(iterator), iterator));
     96     }
     97 
     98 private:
     99     LineBreakIteratorPool() { }
    100 
    101     static const size_t capacity = 4;
    102 
    103     typedef pair<AtomicString, icu::BreakIterator*> Entry;
    104     typedef Vector<Entry, capacity> Pool;
    105     Pool m_pool;
    106     HashMap<icu::BreakIterator*, AtomicString> m_vendedIterators;
    107 
    108     friend WTF::ThreadSpecific<LineBreakIteratorPool>::operator LineBreakIteratorPool*();
    109 };
    110 
    111 enum TextContext { NoContext, PriorContext, PrimaryContext };
    112 
    113 const int textBufferCapacity = 16;
    114 
    115 typedef struct {
    116     UText text;
    117     UChar buffer[textBufferCapacity];
    118 } UTextWithBuffer;
    119 
    120 static inline int64_t textPinIndex(int64_t& index, int64_t limit)
    121 {
    122     if (index < 0)
    123         index = 0;
    124     else if (index > limit)
    125         index = limit;
    126     return index;
    127 }
    128 
    129 static inline int64_t textNativeLength(UText* text)
    130 {
    131     return text->a + text->b;
    132 }
    133 
    134 // Relocate pointer from source into destination as required.
    135 static void textFixPointer(const UText* source, UText* destination, const void*& pointer)
    136 {
    137     if (pointer >= source->pExtra && pointer < static_cast<char*>(source->pExtra) + source->extraSize) {
    138         // Pointer references source extra buffer.
    139         pointer = static_cast<char*>(destination->pExtra) + (static_cast<const char*>(pointer) - static_cast<const char*>(source->pExtra));
    140     } else if (pointer >= source && pointer < reinterpret_cast<const char*>(source) + source->sizeOfStruct) {
    141         // Pointer references source text structure, but not source extra buffer.
    142         pointer = reinterpret_cast<char*>(destination) + (static_cast<const char*>(pointer) - reinterpret_cast<const char*>(source));
    143     }
    144 }
    145 
    146 static UText* textClone(UText* destination, const UText* source, UBool deep, UErrorCode* status)
    147 {
    148     ASSERT_UNUSED(deep, !deep);
    149     if (U_FAILURE(*status))
    150         return 0;
    151     int32_t extraSize = source->extraSize;
    152     destination = utext_setup(destination, extraSize, status);
    153     if (U_FAILURE(*status))
    154         return destination;
    155     void* extraNew = destination->pExtra;
    156     int32_t flags = destination->flags;
    157     int sizeToCopy = min(source->sizeOfStruct, destination->sizeOfStruct);
    158     memcpy(destination, source, sizeToCopy);
    159     destination->pExtra = extraNew;
    160     destination->flags = flags;
    161     memcpy(destination->pExtra, source->pExtra, extraSize);
    162     textFixPointer(source, destination, destination->context);
    163     textFixPointer(source, destination, destination->p);
    164     textFixPointer(source, destination, destination->q);
    165     ASSERT(!destination->r);
    166     const void * chunkContents = static_cast<const void*>(destination->chunkContents);
    167     textFixPointer(source, destination, chunkContents);
    168     destination->chunkContents = static_cast<const UChar*>(chunkContents);
    169     return destination;
    170 }
    171 
    172 static int32_t textExtract(UText*, int64_t, int64_t, UChar*, int32_t, UErrorCode* errorCode)
    173 {
    174     // In the present context, this text provider is used only with ICU functions
    175     // that do not perform an extract operation.
    176     ASSERT_NOT_REACHED();
    177     *errorCode = U_UNSUPPORTED_ERROR;
    178     return 0;
    179 }
    180 
    181 static void textClose(UText* text)
    182 {
    183     text->context = 0;
    184 }
    185 
    186 static inline TextContext textGetContext(const UText* text, int64_t nativeIndex, UBool forward)
    187 {
    188     if (!text->b || nativeIndex > text->b)
    189         return PrimaryContext;
    190     if (nativeIndex == text->b)
    191         return forward ? PrimaryContext : PriorContext;
    192     return PriorContext;
    193 }
    194 
    195 static inline TextContext textLatin1GetCurrentContext(const UText* text)
    196 {
    197     if (!text->chunkContents)
    198         return NoContext;
    199     return text->chunkContents == text->pExtra ? PrimaryContext : PriorContext;
    200 }
    201 
    202 static void textLatin1MoveInPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
    203 {
    204     ASSERT(text->chunkContents == text->pExtra);
    205     if (forward) {
    206         ASSERT(nativeIndex >= text->b && nativeIndex < nativeLength);
    207         text->chunkNativeStart = nativeIndex;
    208         text->chunkNativeLimit = nativeIndex + text->extraSize / sizeof(UChar);
    209         if (text->chunkNativeLimit > nativeLength)
    210             text->chunkNativeLimit = nativeLength;
    211     } else {
    212         ASSERT(nativeIndex > text->b && nativeIndex <= nativeLength);
    213         text->chunkNativeLimit = nativeIndex;
    214         text->chunkNativeStart = nativeIndex - text->extraSize / sizeof(UChar);
    215         if (text->chunkNativeStart < text->b)
    216             text->chunkNativeStart = text->b;
    217     }
    218     int64_t length = text->chunkNativeLimit - text->chunkNativeStart;
    219     // Ensure chunk length is well defined if computed length exceeds int32_t range.
    220     ASSERT(length <= numeric_limits<int32_t>::max());
    221     text->chunkLength = length <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(length) : 0;
    222     text->nativeIndexingLimit = text->chunkLength;
    223     text->chunkOffset = forward ? 0 : text->chunkLength;
    224     StringImpl::copyChars(const_cast<UChar*>(text->chunkContents), static_cast<const LChar*>(text->p) + (text->chunkNativeStart - text->b), static_cast<unsigned>(text->chunkLength));
    225 }
    226 
    227 static void textLatin1SwitchToPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
    228 {
    229     ASSERT(!text->chunkContents || text->chunkContents == text->q);
    230     text->chunkContents = static_cast<const UChar*>(text->pExtra);
    231     textLatin1MoveInPrimaryContext(text, nativeIndex, nativeLength, forward);
    232 }
    233 
    234 static void textLatin1MoveInPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
    235 {
    236     ASSERT(text->chunkContents == text->q);
    237     ASSERT(forward ? nativeIndex < text->b : nativeIndex <= text->b);
    238     ASSERT_UNUSED(nativeLength, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
    239     ASSERT_UNUSED(forward, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
    240     text->chunkNativeStart = 0;
    241     text->chunkNativeLimit = text->b;
    242     text->chunkLength = text->b;
    243     text->nativeIndexingLimit = text->chunkLength;
    244     int64_t offset = nativeIndex - text->chunkNativeStart;
    245     // Ensure chunk offset is well defined if computed offset exceeds int32_t range or chunk length.
    246     ASSERT(offset <= numeric_limits<int32_t>::max());
    247     text->chunkOffset = min(offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0, text->chunkLength);
    248 }
    249 
    250 static void textLatin1SwitchToPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
    251 {
    252     ASSERT(!text->chunkContents || text->chunkContents == text->pExtra);
    253     text->chunkContents = static_cast<const UChar*>(text->q);
    254     textLatin1MoveInPriorContext(text, nativeIndex, nativeLength, forward);
    255 }
    256 
    257 static inline bool textInChunkOrOutOfRange(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward, UBool& isAccessible)
    258 {
    259     if (forward) {
    260         if (nativeIndex >= text->chunkNativeStart && nativeIndex < text->chunkNativeLimit) {
    261             int64_t offset = nativeIndex - text->chunkNativeStart;
    262             // Ensure chunk offset is well formed if computed offset exceeds int32_t range.
    263             ASSERT(offset <= numeric_limits<int32_t>::max());
    264             text->chunkOffset = offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0;
    265             isAccessible = TRUE;
    266             return true;
    267         }
    268         if (nativeIndex >= nativeLength && text->chunkNativeLimit == nativeLength) {
    269             text->chunkOffset = text->chunkLength;
    270             isAccessible = FALSE;
    271             return true;
    272         }
    273     } else {
    274         if (nativeIndex > text->chunkNativeStart && nativeIndex <= text->chunkNativeLimit) {
    275             int64_t offset = nativeIndex - text->chunkNativeStart;
    276             // Ensure chunk offset is well formed if computed offset exceeds int32_t range.
    277             ASSERT(offset <= numeric_limits<int32_t>::max());
    278             text->chunkOffset = offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0;
    279             isAccessible = TRUE;
    280             return true;
    281         }
    282         if (nativeIndex <= 0 && !text->chunkNativeStart) {
    283             text->chunkOffset = 0;
    284             isAccessible = FALSE;
    285             return true;
    286         }
    287     }
    288     return false;
    289 }
    290 
    291 static UBool textLatin1Access(UText* text, int64_t nativeIndex, UBool forward)
    292 {
    293     if (!text->context)
    294         return FALSE;
    295     int64_t nativeLength = textNativeLength(text);
    296     UBool isAccessible;
    297     if (textInChunkOrOutOfRange(text, nativeIndex, nativeLength, forward, isAccessible))
    298         return isAccessible;
    299     nativeIndex = textPinIndex(nativeIndex, nativeLength - 1);
    300     TextContext currentContext = textLatin1GetCurrentContext(text);
    301     TextContext newContext = textGetContext(text, nativeIndex, forward);
    302     ASSERT(newContext != NoContext);
    303     if (newContext == currentContext) {
    304         if (currentContext == PrimaryContext) {
    305             textLatin1MoveInPrimaryContext(text, nativeIndex, nativeLength, forward);
    306         } else {
    307             textLatin1MoveInPriorContext(text, nativeIndex, nativeLength, forward);
    308         }
    309     } else if (newContext == PrimaryContext) {
    310         textLatin1SwitchToPrimaryContext(text, nativeIndex, nativeLength, forward);
    311     } else {
    312         ASSERT(newContext == PriorContext);
    313         textLatin1SwitchToPriorContext(text, nativeIndex, nativeLength, forward);
    314     }
    315     return TRUE;
    316 }
    317 
    318 static const struct UTextFuncs textLatin1Funcs = {
    319     sizeof(UTextFuncs),
    320     0, 0, 0,
    321     textClone,
    322     textNativeLength,
    323     textLatin1Access,
    324     textExtract,
    325     0, 0, 0, 0,
    326     textClose,
    327     0, 0, 0,
    328 };
    329 
    330 static void textInit(UText* text, const UTextFuncs* funcs, const void* string, unsigned length, const UChar* priorContext, int priorContextLength)
    331 {
    332     text->pFuncs = funcs;
    333     text->providerProperties = 1 << UTEXT_PROVIDER_STABLE_CHUNKS;
    334     text->context = string;
    335     text->p = string;
    336     text->a = length;
    337     text->q = priorContext;
    338     text->b = priorContextLength;
    339 }
    340 
    341 static UText* textOpenLatin1(UTextWithBuffer* utWithBuffer, const LChar* string, unsigned length, const UChar* priorContext, int priorContextLength, UErrorCode* status)
    342 {
    343     if (U_FAILURE(*status))
    344         return 0;
    345 
    346     if (!string || length > static_cast<unsigned>(numeric_limits<int32_t>::max())) {
    347         *status = U_ILLEGAL_ARGUMENT_ERROR;
    348         return 0;
    349     }
    350     UText* text = utext_setup(&utWithBuffer->text, sizeof(utWithBuffer->buffer), status);
    351     if (U_FAILURE(*status)) {
    352         ASSERT(!text);
    353         return 0;
    354     }
    355     textInit(text, &textLatin1Funcs, string, length, priorContext, priorContextLength);
    356     return text;
    357 }
    358 
    359 static inline TextContext textUTF16GetCurrentContext(const UText* text)
    360 {
    361     if (!text->chunkContents)
    362         return NoContext;
    363     return text->chunkContents == text->p ? PrimaryContext : PriorContext;
    364 }
    365 
    366 static void textUTF16MoveInPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
    367 {
    368     ASSERT(text->chunkContents == text->p);
    369     ASSERT_UNUSED(forward, forward ? nativeIndex >= text->b : nativeIndex > text->b);
    370     ASSERT_UNUSED(forward, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
    371     text->chunkNativeStart = text->b;
    372     text->chunkNativeLimit = nativeLength;
    373     int64_t length = text->chunkNativeLimit - text->chunkNativeStart;
    374     // Ensure chunk length is well defined if computed length exceeds int32_t range.
    375     ASSERT(length <= numeric_limits<int32_t>::max());
    376     text->chunkLength = length <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(length) : 0;
    377     text->nativeIndexingLimit = text->chunkLength;
    378     int64_t offset = nativeIndex - text->chunkNativeStart;
    379     // Ensure chunk offset is well defined if computed offset exceeds int32_t range or chunk length.
    380     ASSERT(offset <= numeric_limits<int32_t>::max());
    381     text->chunkOffset = min(offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0, text->chunkLength);
    382 }
    383 
    384 static void textUTF16SwitchToPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
    385 {
    386     ASSERT(!text->chunkContents || text->chunkContents == text->q);
    387     text->chunkContents = static_cast<const UChar*>(text->p);
    388     textUTF16MoveInPrimaryContext(text, nativeIndex, nativeLength, forward);
    389 }
    390 
    391 static void textUTF16MoveInPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
    392 {
    393     ASSERT(text->chunkContents == text->q);
    394     ASSERT(forward ? nativeIndex < text->b : nativeIndex <= text->b);
    395     ASSERT_UNUSED(nativeLength, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
    396     ASSERT_UNUSED(forward, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
    397     text->chunkNativeStart = 0;
    398     text->chunkNativeLimit = text->b;
    399     text->chunkLength = text->b;
    400     text->nativeIndexingLimit = text->chunkLength;
    401     int64_t offset = nativeIndex - text->chunkNativeStart;
    402     // Ensure chunk offset is well defined if computed offset exceeds int32_t range or chunk length.
    403     ASSERT(offset <= numeric_limits<int32_t>::max());
    404     text->chunkOffset = min(offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0, text->chunkLength);
    405 }
    406 
    407 static void textUTF16SwitchToPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
    408 {
    409     ASSERT(!text->chunkContents || text->chunkContents == text->p);
    410     text->chunkContents = static_cast<const UChar*>(text->q);
    411     textUTF16MoveInPriorContext(text, nativeIndex, nativeLength, forward);
    412 }
    413 
    414 static UBool textUTF16Access(UText* text, int64_t nativeIndex, UBool forward)
    415 {
    416     if (!text->context)
    417         return FALSE;
    418     int64_t nativeLength = textNativeLength(text);
    419     UBool isAccessible;
    420     if (textInChunkOrOutOfRange(text, nativeIndex, nativeLength, forward, isAccessible))
    421         return isAccessible;
    422     nativeIndex = textPinIndex(nativeIndex, nativeLength - 1);
    423     TextContext currentContext = textUTF16GetCurrentContext(text);
    424     TextContext newContext = textGetContext(text, nativeIndex, forward);
    425     ASSERT(newContext != NoContext);
    426     if (newContext == currentContext) {
    427         if (currentContext == PrimaryContext) {
    428             textUTF16MoveInPrimaryContext(text, nativeIndex, nativeLength, forward);
    429         } else {
    430             textUTF16MoveInPriorContext(text, nativeIndex, nativeLength, forward);
    431         }
    432     } else if (newContext == PrimaryContext) {
    433         textUTF16SwitchToPrimaryContext(text, nativeIndex, nativeLength, forward);
    434     } else {
    435         ASSERT(newContext == PriorContext);
    436         textUTF16SwitchToPriorContext(text, nativeIndex, nativeLength, forward);
    437     }
    438     return TRUE;
    439 }
    440 
    441 static const struct UTextFuncs textUTF16Funcs = {
    442     sizeof(UTextFuncs),
    443     0, 0, 0,
    444     textClone,
    445     textNativeLength,
    446     textUTF16Access,
    447     textExtract,
    448     0, 0, 0, 0,
    449     textClose,
    450     0, 0, 0,
    451 };
    452 
    453 static UText* textOpenUTF16(UText* text, const UChar* string, unsigned length, const UChar* priorContext, int priorContextLength, UErrorCode* status)
    454 {
    455     if (U_FAILURE(*status))
    456         return 0;
    457 
    458     if (!string || length > static_cast<unsigned>(numeric_limits<int32_t>::max())) {
    459         *status = U_ILLEGAL_ARGUMENT_ERROR;
    460         return 0;
    461     }
    462 
    463     text = utext_setup(text, 0, status);
    464     if (U_FAILURE(*status)) {
    465         ASSERT(!text);
    466         return 0;
    467     }
    468     textInit(text, &textUTF16Funcs, string, length, priorContext, priorContextLength);
    469     return text;
    470 }
    471 
// Pristine UText value. Stack-allocated UTextWithBuffer instances copy it
// into their |text| member to start from a known-clean state before the text
// is opened.
static UText emptyText = UTEXT_INITIALIZER;
    473 
    474 static TextBreakIterator* wordBreakIterator(const LChar* string, int length)
    475 {
    476     UErrorCode errorCode = U_ZERO_ERROR;
    477     static TextBreakIterator* breakIter = 0;
    478     if (!breakIter) {
    479         breakIter = icu::BreakIterator::createWordInstance(icu::Locale(currentTextBreakLocaleID()), errorCode);
    480         ASSERT_WITH_MESSAGE(U_SUCCESS(errorCode), "ICU could not open a break iterator: %s (%d)", u_errorName(errorCode), errorCode);
    481         if (!breakIter)
    482             return 0;
    483     }
    484 
    485     UTextWithBuffer textLocal;
    486     textLocal.text = emptyText;
    487     textLocal.text.extraSize = sizeof(textLocal.buffer);
    488     textLocal.text.pExtra = textLocal.buffer;
    489 
    490     UErrorCode openStatus = U_ZERO_ERROR;
    491     UText* text = textOpenLatin1(&textLocal, string, length, 0, 0, &openStatus);
    492     if (U_FAILURE(openStatus)) {
    493         WTF_LOG_ERROR("textOpenLatin1 failed with status %d", openStatus);
    494         return 0;
    495     }
    496 
    497     UErrorCode setTextStatus = U_ZERO_ERROR;
    498     breakIter->setText(text, setTextStatus);
    499     if (U_FAILURE(setTextStatus))
    500         WTF_LOG_ERROR("BreakIterator::seText failed with status %d", setTextStatus);
    501 
    502     utext_close(text);
    503 
    504     return breakIter;
    505 }
    506 
    507 static void setText16(TextBreakIterator* iter, const UChar* string, int length)
    508 {
    509     UErrorCode errorCode = U_ZERO_ERROR;
    510     UText uText = UTEXT_INITIALIZER;
    511     utext_openUChars(&uText, string, length, &errorCode);
    512     if (U_FAILURE(errorCode))
    513         return;
    514     iter->setText(&uText, errorCode);
    515 }
    516 
    517 TextBreakIterator* wordBreakIterator(const UChar* string, int length)
    518 {
    519     UErrorCode errorCode = U_ZERO_ERROR;
    520     static TextBreakIterator* breakIter = 0;
    521     if (!breakIter) {
    522         breakIter = icu::BreakIterator::createWordInstance(icu::Locale(currentTextBreakLocaleID()), errorCode);
    523         ASSERT_WITH_MESSAGE(U_SUCCESS(errorCode), "ICU could not open a break iterator: %s (%d)", u_errorName(errorCode), errorCode);
    524         if (!breakIter)
    525             return 0;
    526     }
    527     setText16(breakIter, string, length);
    528     return breakIter;
    529 }
    530 
    531 TextBreakIterator* wordBreakIterator(const String& string, int start, int length)
    532 {
    533     if (string.isEmpty())
    534         return 0;
    535     if (string.is8Bit())
    536         return wordBreakIterator(string.characters8() + start, length);
    537     return wordBreakIterator(string.characters16() + start, length);
    538 }
    539 
    540 TextBreakIterator* acquireLineBreakIterator(const LChar* string, int length, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength)
    541 {
    542     TextBreakIterator* iterator = LineBreakIteratorPool::sharedPool().take(locale);
    543     if (!iterator)
    544         return 0;
    545 
    546     UTextWithBuffer textLocal;
    547     textLocal.text = emptyText;
    548     textLocal.text.extraSize = sizeof(textLocal.buffer);
    549     textLocal.text.pExtra = textLocal.buffer;
    550 
    551     UErrorCode openStatus = U_ZERO_ERROR;
    552     UText* text = textOpenLatin1(&textLocal, string, length, priorContext, priorContextLength, &openStatus);
    553     if (U_FAILURE(openStatus)) {
    554         WTF_LOG_ERROR("textOpenLatin1 failed with status %d", openStatus);
    555         return 0;
    556     }
    557 
    558     UErrorCode setTextStatus = U_ZERO_ERROR;
    559     iterator->setText(text, setTextStatus);
    560     if (U_FAILURE(setTextStatus)) {
    561         WTF_LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus);
    562         return 0;
    563     }
    564 
    565     utext_close(text);
    566 
    567     return iterator;
    568 }
    569 
    570 TextBreakIterator* acquireLineBreakIterator(const UChar* string, int length, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength)
    571 {
    572     TextBreakIterator* iterator = LineBreakIteratorPool::sharedPool().take(locale);
    573     if (!iterator)
    574         return 0;
    575 
    576     UText textLocal = UTEXT_INITIALIZER;
    577 
    578     UErrorCode openStatus = U_ZERO_ERROR;
    579     UText* text = textOpenUTF16(&textLocal, string, length, priorContext, priorContextLength, &openStatus);
    580     if (U_FAILURE(openStatus)) {
    581         WTF_LOG_ERROR("textOpenUTF16 failed with status %d", openStatus);
    582         return 0;
    583     }
    584 
    585     UErrorCode setTextStatus = U_ZERO_ERROR;
    586     iterator->setText(text, setTextStatus);
    587     if (U_FAILURE(setTextStatus)) {
    588         WTF_LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus);
    589         return 0;
    590     }
    591 
    592     utext_close(text);
    593 
    594     return iterator;
    595 }
    596 
    597 void releaseLineBreakIterator(TextBreakIterator* iterator)
    598 {
    599     ASSERT_ARG(iterator, iterator);
    600 
    601     LineBreakIteratorPool::sharedPool().put(iterator);
    602 }
    603 
    604 static TextBreakIterator* nonSharedCharacterBreakIterator;
    605 
    606 static inline bool compareAndSwapNonSharedCharacterBreakIterator(TextBreakIterator* expected, TextBreakIterator* newValue)
    607 {
    608     DEFINE_STATIC_LOCAL(Mutex, nonSharedCharacterBreakIteratorMutex, ());
    609     MutexLocker locker(nonSharedCharacterBreakIteratorMutex);
    610     if (nonSharedCharacterBreakIterator != expected)
    611         return false;
    612     nonSharedCharacterBreakIterator = newValue;
    613     return true;
    614 }
    615 
    616 NonSharedCharacterBreakIterator::NonSharedCharacterBreakIterator(const String& string)
    617     : m_is8Bit(true)
    618     , m_charaters8(0)
    619     , m_offset(0)
    620     , m_length(0)
    621     , m_iterator(0)
    622 {
    623     if (string.isEmpty())
    624         return;
    625 
    626     m_is8Bit = string.is8Bit();
    627 
    628     if (m_is8Bit) {
    629         m_charaters8 = string.characters8();
    630         m_offset = 0;
    631         m_length = string.length();
    632         return;
    633     }
    634 
    635     createIteratorForBuffer(string.characters16(), string.length());
    636 }
    637 
    638 NonSharedCharacterBreakIterator::NonSharedCharacterBreakIterator(const UChar* buffer, unsigned length)
    639     : m_is8Bit(false)
    640     , m_charaters8(0)
    641     , m_offset(0)
    642     , m_length(0)
    643     , m_iterator(0)
    644 {
    645     createIteratorForBuffer(buffer, length);
    646 }
    647 
    648 void NonSharedCharacterBreakIterator::createIteratorForBuffer(const UChar* buffer, unsigned length)
    649 {
    650     m_iterator = nonSharedCharacterBreakIterator;
    651     bool createdIterator = m_iterator && compareAndSwapNonSharedCharacterBreakIterator(m_iterator, 0);
    652     if (!createdIterator) {
    653         UErrorCode errorCode = U_ZERO_ERROR;
    654         m_iterator = icu::BreakIterator::createCharacterInstance(icu::Locale(currentTextBreakLocaleID()), errorCode);
    655         ASSERT_WITH_MESSAGE(U_SUCCESS(errorCode), "ICU could not open a break iterator: %s (%d)", u_errorName(errorCode), errorCode);
    656     }
    657 
    658     setText16(m_iterator, buffer, length);
    659 }
    660 
    661 NonSharedCharacterBreakIterator::~NonSharedCharacterBreakIterator()
    662 {
    663     if (m_is8Bit)
    664         return;
    665     if (!compareAndSwapNonSharedCharacterBreakIterator(0, m_iterator))
    666         delete m_iterator;
    667 }
    668 
    669 int NonSharedCharacterBreakIterator::next()
    670 {
    671     if (!m_is8Bit)
    672         return m_iterator->next();
    673 
    674     if (m_offset >= m_length)
    675         return TextBreakDone;
    676 
    677     m_offset += clusterLengthStartingAt(m_offset);
    678     return m_offset;
    679 }
    680 
    681 int NonSharedCharacterBreakIterator::current()
    682 {
    683     if (!m_is8Bit)
    684         return m_iterator->current();
    685     return m_offset;
    686 }
    687 
    688 bool NonSharedCharacterBreakIterator::isBreak(int offset) const
    689 {
    690     if (!m_is8Bit)
    691         return m_iterator->isBoundary(offset);
    692     return !isLFAfterCR(offset);
    693 }
    694 
    695 int NonSharedCharacterBreakIterator::preceding(int offset) const
    696 {
    697     if (!m_is8Bit)
    698         return m_iterator->preceding(offset);
    699     if (offset <= 0)
    700         return TextBreakDone;
    701     if (isLFAfterCR(offset))
    702         return offset - 2;
    703     return offset - 1;
    704 }
    705 
    706 int NonSharedCharacterBreakIterator::following(int offset) const
    707 {
    708     if (!m_is8Bit)
    709         return m_iterator->following(offset);
    710     if (static_cast<unsigned>(offset) >= m_length)
    711         return TextBreakDone;
    712     return offset + clusterLengthStartingAt(offset);
    713 }
    714 
    715 TextBreakIterator* sentenceBreakIterator(const UChar* string, int length)
    716 {
    717     UErrorCode openStatus = U_ZERO_ERROR;
    718     static TextBreakIterator* iterator = 0;
    719     if (!iterator) {
    720         iterator =  icu::BreakIterator::createSentenceInstance(icu::Locale(currentTextBreakLocaleID()), openStatus);
    721         ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);
    722         if (!iterator)
    723             return 0;
    724     }
    725 
    726     setText16(iterator, string, length);
    727     return iterator;
    728 }
    729 
    730 bool isWordTextBreak(TextBreakIterator* iterator)
    731 {
    732     icu::RuleBasedBreakIterator* ruleBasedBreakIterator = static_cast<icu::RuleBasedBreakIterator*>(iterator);
    733     int ruleStatus = ruleBasedBreakIterator->getRuleStatus();
    734     return ruleStatus != UBRK_WORD_NONE;
    735 }
    736 
    737 static TextBreakIterator* setUpIteratorWithRules(const char* breakRules, const UChar* string, int length)
    738 {
    739     if (!string)
    740         return 0;
    741 
    742     static TextBreakIterator* iterator = 0;
    743     if (!iterator) {
    744         UParseError parseStatus;
    745         UErrorCode openStatus = U_ZERO_ERROR;
    746         Vector<UChar> rules;
    747         String(breakRules).appendTo(rules);
    748 
    749         iterator = new icu::RuleBasedBreakIterator(icu::UnicodeString(rules.data(), rules.size()), parseStatus, openStatus);
    750         ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);
    751         if (!iterator)
    752             return 0;
    753     }
    754 
    755     setText16(iterator, string, length);
    756     return iterator;
    757 }
    758 
    759 TextBreakIterator* cursorMovementIterator(const UChar* string, int length)
    760 {
    761     // This rule set is based on character-break iterator rules of ICU 4.0
    762     // <http://source.icu-project.org/repos/icu/icu/tags/release-4-0/source/data/brkitr/char.txt>.
    763     // The major differences from the original ones are listed below:
    764     // * Replaced '[\p{Grapheme_Cluster_Break = SpacingMark}]' with '[\p{General_Category = Spacing Mark} - $Extend]' for ICU 3.8 or earlier;
    765     // * Removed rules that prevent a cursor from moving after prepend characters (Bug 24342);
    766     // * Added rules that prevent a cursor from moving after virama signs of Indic languages except Tamil (Bug 15790), and;
    767     // * Added rules that prevent a cursor from moving before Japanese half-width katakara voiced marks.
    768     // * Added rules for regional indicator symbols.
    769     static const char* const kRules =
    770         "$CR      = [\\p{Grapheme_Cluster_Break = CR}];"
    771         "$LF      = [\\p{Grapheme_Cluster_Break = LF}];"
    772         "$Control = [\\p{Grapheme_Cluster_Break = Control}];"
    773         "$VoiceMarks = [\\uFF9E\\uFF9F];"  // Japanese half-width katakana voiced marks
    774         "$Extend  = [\\p{Grapheme_Cluster_Break = Extend} $VoiceMarks - [\\u0E30 \\u0E32 \\u0E45 \\u0EB0 \\u0EB2]];"
    775         "$SpacingMark = [[\\p{General_Category = Spacing Mark}] - $Extend];"
    776         "$L       = [\\p{Grapheme_Cluster_Break = L}];"
    777         "$V       = [\\p{Grapheme_Cluster_Break = V}];"
    778         "$T       = [\\p{Grapheme_Cluster_Break = T}];"
    779         "$LV      = [\\p{Grapheme_Cluster_Break = LV}];"
    780         "$LVT     = [\\p{Grapheme_Cluster_Break = LVT}];"
    781         "$Hin0    = [\\u0905-\\u0939];"    // Devanagari Letter A,...,Ha
    782         "$HinV    = \\u094D;"              // Devanagari Sign Virama
    783         "$Hin1    = [\\u0915-\\u0939];"    // Devanagari Letter Ka,...,Ha
    784         "$Ben0    = [\\u0985-\\u09B9];"    // Bengali Letter A,...,Ha
    785         "$BenV    = \\u09CD;"              // Bengali Sign Virama
    786         "$Ben1    = [\\u0995-\\u09B9];"    // Bengali Letter Ka,...,Ha
    787         "$Pan0    = [\\u0A05-\\u0A39];"    // Gurmukhi Letter A,...,Ha
    788         "$PanV    = \\u0A4D;"              // Gurmukhi Sign Virama
    789         "$Pan1    = [\\u0A15-\\u0A39];"    // Gurmukhi Letter Ka,...,Ha
    790         "$Guj0    = [\\u0A85-\\u0AB9];"    // Gujarati Letter A,...,Ha
    791         "$GujV    = \\u0ACD;"              // Gujarati Sign Virama
    792         "$Guj1    = [\\u0A95-\\u0AB9];"    // Gujarati Letter Ka,...,Ha
    793         "$Ori0    = [\\u0B05-\\u0B39];"    // Oriya Letter A,...,Ha
    794         "$OriV    = \\u0B4D;"              // Oriya Sign Virama
    795         "$Ori1    = [\\u0B15-\\u0B39];"    // Oriya Letter Ka,...,Ha
    796         "$Tel0    = [\\u0C05-\\u0C39];"    // Telugu Letter A,...,Ha
    797         "$TelV    = \\u0C4D;"              // Telugu Sign Virama
    798         "$Tel1    = [\\u0C14-\\u0C39];"    // Telugu Letter Ka,...,Ha
    799         "$Kan0    = [\\u0C85-\\u0CB9];"    // Kannada Letter A,...,Ha
    800         "$KanV    = \\u0CCD;"              // Kannada Sign Virama
    801         "$Kan1    = [\\u0C95-\\u0CB9];"    // Kannada Letter A,...,Ha
    802         "$Mal0    = [\\u0D05-\\u0D39];"    // Malayalam Letter A,...,Ha
    803         "$MalV    = \\u0D4D;"              // Malayalam Sign Virama
    804         "$Mal1    = [\\u0D15-\\u0D39];"    // Malayalam Letter A,...,Ha
    805         "$RI      = [\\U0001F1E6-\\U0001F1FF];" // Emoji regional indicators
    806         "!!chain;"
    807         "!!forward;"
    808         "$CR $LF;"
    809         "$L ($L | $V | $LV | $LVT);"
    810         "($LV | $V) ($V | $T);"
    811         "($LVT | $T) $T;"
    812         "[^$Control $CR $LF] $Extend;"
    813         "[^$Control $CR $LF] $SpacingMark;"
    814         "$RI $RI / $RI;"
    815         "$RI $RI;"
    816         "$Hin0 $HinV $Hin1;"               // Devanagari Virama (forward)
    817         "$Ben0 $BenV $Ben1;"               // Bengali Virama (forward)
    818         "$Pan0 $PanV $Pan1;"               // Gurmukhi Virama (forward)
    819         "$Guj0 $GujV $Guj1;"               // Gujarati Virama (forward)
    820         "$Ori0 $OriV $Ori1;"               // Oriya Virama (forward)
    821         "$Tel0 $TelV $Tel1;"               // Telugu Virama (forward)
    822         "$Kan0 $KanV $Kan1;"               // Kannada Virama (forward)
    823         "$Mal0 $MalV $Mal1;"               // Malayalam Virama (forward)
    824         "!!reverse;"
    825         "$LF $CR;"
    826         "($L | $V | $LV | $LVT) $L;"
    827         "($V | $T) ($LV | $V);"
    828         "$T ($LVT | $T);"
    829         "$Extend      [^$Control $CR $LF];"
    830         "$SpacingMark [^$Control $CR $LF];"
    831         "$RI $RI / $RI $RI;"
    832         "$RI $RI;"
    833         "$Hin1 $HinV $Hin0;"               // Devanagari Virama (backward)
    834         "$Ben1 $BenV $Ben0;"               // Bengali Virama (backward)
    835         "$Pan1 $PanV $Pan0;"               // Gurmukhi Virama (backward)
    836         "$Guj1 $GujV $Guj0;"               // Gujarati Virama (backward)
    837         "$Ori1 $OriV $Ori0;"               // Gujarati Virama (backward)
    838         "$Tel1 $TelV $Tel0;"               // Telugu Virama (backward)
    839         "$Kan1 $KanV $Kan0;"               // Kannada Virama (backward)
    840         "$Mal1 $MalV $Mal0;"               // Malayalam Virama (backward)
    841         "!!safe_reverse;"
    842         "!!safe_forward;";
    843 
    844     return setUpIteratorWithRules(kRules, string, length);
    845 }
    846 
    847 }
    848