Home | History | Annotate | Download | only in text
      1 /*
      2  * Copyright (C) 2006 Lars Knoll <lars (at) trolltech.com>
      3  * Copyright (C) 2007, 2011, 2012 Apple Inc. All rights reserved.
      4  *
      5  * This library is free software; you can redistribute it and/or
      6  * modify it under the terms of the GNU Library General Public
      7  * License as published by the Free Software Foundation; either
      8  * version 2 of the License, or (at your option) any later version.
      9  *
     10  * This library is distributed in the hope that it will be useful,
     11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
     12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     13  * Library General Public License for more details.
     14  *
     15  * You should have received a copy of the GNU Library General Public License
     16  * along with this library; see the file COPYING.LIB.  If not, write to
     17  * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
     18  * Boston, MA 02110-1301, USA.
     19  *
     20  */
     21 
     22 #include "config.h"
     23 #include "platform/text/TextBreakIterator.h"
     24 
     25 #include "platform/text/TextBreakIteratorInternalICU.h"
     26 #include "wtf/Assertions.h"
     27 #include "wtf/HashMap.h"
     28 #include "wtf/PassOwnPtr.h"
     29 #include "wtf/ThreadSpecific.h"
     30 #include "wtf/ThreadingPrimitives.h"
     31 #include "wtf/text/AtomicString.h"
     32 #include "wtf/text/CString.h"
     33 #include "wtf/text/WTFString.h"
     34 #include <unicode/ubrk.h>
     35 
     36 using namespace WTF;
     37 using namespace std;
     38 
     39 namespace WebCore {
     40 
     41 class LineBreakIteratorPool {
     42     WTF_MAKE_NONCOPYABLE(LineBreakIteratorPool);
     43 public:
     44     static LineBreakIteratorPool& sharedPool()
     45     {
     46         static WTF::ThreadSpecific<LineBreakIteratorPool>* pool = new WTF::ThreadSpecific<LineBreakIteratorPool>;
     47         return **pool;
     48     }
     49 
     50     static PassOwnPtr<LineBreakIteratorPool> create() { return adoptPtr(new LineBreakIteratorPool); }
     51 
     52     icu::BreakIterator* take(const AtomicString& locale)
     53     {
     54         icu::BreakIterator* iterator = 0;
     55         for (size_t i = 0; i < m_pool.size(); ++i) {
     56             if (m_pool[i].first == locale) {
     57                 iterator = m_pool[i].second;
     58                 m_pool.remove(i);
     59                 break;
     60             }
     61         }
     62 
     63         if (!iterator) {
     64             UErrorCode openStatus = U_ZERO_ERROR;
     65             bool localeIsEmpty = locale.isEmpty();
     66             iterator = icu::BreakIterator::createLineInstance(localeIsEmpty ? icu::Locale(currentTextBreakLocaleID()) : icu::Locale(locale.string().utf8().data()), openStatus);
     67             // locale comes from a web page and it can be invalid, leading ICU
     68             // to fail, in which case we fall back to the default locale.
     69             if (!localeIsEmpty && U_FAILURE(openStatus)) {
     70                 openStatus = U_ZERO_ERROR;
     71                 iterator = icu::BreakIterator::createLineInstance(icu::Locale(currentTextBreakLocaleID()), openStatus);
     72             }
     73 
     74             if (U_FAILURE(openStatus)) {
     75                 WTF_LOG_ERROR("icu::BreakIterator construction failed with status %d", openStatus);
     76                 return 0;
     77             }
     78         }
     79 
     80         ASSERT(!m_vendedIterators.contains(iterator));
     81         m_vendedIterators.set(iterator, locale);
     82         return iterator;
     83     }
     84 
     85     void put(icu::BreakIterator* iterator)
     86     {
     87         ASSERT_ARG(iterator, m_vendedIterators.contains(iterator));
     88 
     89         if (m_pool.size() == capacity) {
     90             delete(m_pool[0].second);
     91             m_pool.remove(0);
     92         }
     93 
     94         m_pool.append(Entry(m_vendedIterators.take(iterator), iterator));
     95     }
     96 
     97 private:
     98     LineBreakIteratorPool() { }
     99 
    100     static const size_t capacity = 4;
    101 
    102     typedef pair<AtomicString, icu::BreakIterator*> Entry;
    103     typedef Vector<Entry, capacity> Pool;
    104     Pool m_pool;
    105     HashMap<icu::BreakIterator*, AtomicString> m_vendedIterators;
    106 
    107     friend WTF::ThreadSpecific<LineBreakIteratorPool>::operator LineBreakIteratorPool*();
    108 };
    109 
    110 enum TextContext { NoContext, PriorContext, PrimaryContext };
    111 
    112 const int textBufferCapacity = 16;
    113 
    114 typedef struct {
    115     UText text;
    116     UChar buffer[textBufferCapacity];
    117 } UTextWithBuffer;
    118 
    119 static inline int64_t textPinIndex(int64_t& index, int64_t limit)
    120 {
    121     if (index < 0)
    122         index = 0;
    123     else if (index > limit)
    124         index = limit;
    125     return index;
    126 }
    127 
    128 static inline int64_t textNativeLength(UText* text)
    129 {
    130     return text->a + text->b;
    131 }
    132 
    133 // Relocate pointer from source into destination as required.
    134 static void textFixPointer(const UText* source, UText* destination, const void*& pointer)
    135 {
    136     if (pointer >= source->pExtra && pointer < static_cast<char*>(source->pExtra) + source->extraSize) {
    137         // Pointer references source extra buffer.
    138         pointer = static_cast<char*>(destination->pExtra) + (static_cast<const char*>(pointer) - static_cast<const char*>(source->pExtra));
    139     } else if (pointer >= source && pointer < reinterpret_cast<const char*>(source) + source->sizeOfStruct) {
    140         // Pointer references source text structure, but not source extra buffer.
    141         pointer = reinterpret_cast<char*>(destination) + (static_cast<const char*>(pointer) - reinterpret_cast<const char*>(source));
    142     }
    143 }
    144 
    145 static UText* textClone(UText* destination, const UText* source, UBool deep, UErrorCode* status)
    146 {
    147     ASSERT_UNUSED(deep, !deep);
    148     if (U_FAILURE(*status))
    149         return 0;
    150     int32_t extraSize = source->extraSize;
    151     destination = utext_setup(destination, extraSize, status);
    152     if (U_FAILURE(*status))
    153         return destination;
    154     void* extraNew = destination->pExtra;
    155     int32_t flags = destination->flags;
    156     int sizeToCopy = min(source->sizeOfStruct, destination->sizeOfStruct);
    157     memcpy(destination, source, sizeToCopy);
    158     destination->pExtra = extraNew;
    159     destination->flags = flags;
    160     memcpy(destination->pExtra, source->pExtra, extraSize);
    161     textFixPointer(source, destination, destination->context);
    162     textFixPointer(source, destination, destination->p);
    163     textFixPointer(source, destination, destination->q);
    164     ASSERT(!destination->r);
    165     const void * chunkContents = static_cast<const void*>(destination->chunkContents);
    166     textFixPointer(source, destination, chunkContents);
    167     destination->chunkContents = static_cast<const UChar*>(chunkContents);
    168     return destination;
    169 }
    170 
    171 static int32_t textExtract(UText*, int64_t, int64_t, UChar*, int32_t, UErrorCode* errorCode)
    172 {
    173     // In the present context, this text provider is used only with ICU functions
    174     // that do not perform an extract operation.
    175     ASSERT_NOT_REACHED();
    176     *errorCode = U_UNSUPPORTED_ERROR;
    177     return 0;
    178 }
    179 
    180 static void textClose(UText* text)
    181 {
    182     text->context = 0;
    183 }
    184 
    185 static inline TextContext textGetContext(const UText* text, int64_t nativeIndex, UBool forward)
    186 {
    187     if (!text->b || nativeIndex > text->b)
    188         return PrimaryContext;
    189     if (nativeIndex == text->b)
    190         return forward ? PrimaryContext : PriorContext;
    191     return PriorContext;
    192 }
    193 
    194 static inline TextContext textLatin1GetCurrentContext(const UText* text)
    195 {
    196     if (!text->chunkContents)
    197         return NoContext;
    198     return text->chunkContents == text->pExtra ? PrimaryContext : PriorContext;
    199 }
    200 
    201 static void textLatin1MoveInPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
    202 {
    203     ASSERT(text->chunkContents == text->pExtra);
    204     if (forward) {
    205         ASSERT(nativeIndex >= text->b && nativeIndex < nativeLength);
    206         text->chunkNativeStart = nativeIndex;
    207         text->chunkNativeLimit = nativeIndex + text->extraSize / sizeof(UChar);
    208         if (text->chunkNativeLimit > nativeLength)
    209             text->chunkNativeLimit = nativeLength;
    210     } else {
    211         ASSERT(nativeIndex > text->b && nativeIndex <= nativeLength);
    212         text->chunkNativeLimit = nativeIndex;
    213         text->chunkNativeStart = nativeIndex - text->extraSize / sizeof(UChar);
    214         if (text->chunkNativeStart < text->b)
    215             text->chunkNativeStart = text->b;
    216     }
    217     int64_t length = text->chunkNativeLimit - text->chunkNativeStart;
    218     // Ensure chunk length is well defined if computed length exceeds int32_t range.
    219     ASSERT(length <= numeric_limits<int32_t>::max());
    220     text->chunkLength = length <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(length) : 0;
    221     text->nativeIndexingLimit = text->chunkLength;
    222     text->chunkOffset = forward ? 0 : text->chunkLength;
    223     StringImpl::copyChars(const_cast<UChar*>(text->chunkContents), static_cast<const LChar*>(text->p) + (text->chunkNativeStart - text->b), static_cast<unsigned>(text->chunkLength));
    224 }
    225 
    226 static void textLatin1SwitchToPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
    227 {
    228     ASSERT(!text->chunkContents || text->chunkContents == text->q);
    229     text->chunkContents = static_cast<const UChar*>(text->pExtra);
    230     textLatin1MoveInPrimaryContext(text, nativeIndex, nativeLength, forward);
    231 }
    232 
    233 static void textLatin1MoveInPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
    234 {
    235     ASSERT(text->chunkContents == text->q);
    236     ASSERT(forward ? nativeIndex < text->b : nativeIndex <= text->b);
    237     ASSERT_UNUSED(nativeLength, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
    238     ASSERT_UNUSED(forward, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
    239     text->chunkNativeStart = 0;
    240     text->chunkNativeLimit = text->b;
    241     text->chunkLength = text->b;
    242     text->nativeIndexingLimit = text->chunkLength;
    243     int64_t offset = nativeIndex - text->chunkNativeStart;
    244     // Ensure chunk offset is well defined if computed offset exceeds int32_t range or chunk length.
    245     ASSERT(offset <= numeric_limits<int32_t>::max());
    246     text->chunkOffset = min(offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0, text->chunkLength);
    247 }
    248 
    249 static void textLatin1SwitchToPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
    250 {
    251     ASSERT(!text->chunkContents || text->chunkContents == text->pExtra);
    252     text->chunkContents = static_cast<const UChar*>(text->q);
    253     textLatin1MoveInPriorContext(text, nativeIndex, nativeLength, forward);
    254 }
    255 
    256 static inline bool textInChunkOrOutOfRange(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward, UBool& isAccessible)
    257 {
    258     if (forward) {
    259         if (nativeIndex >= text->chunkNativeStart && nativeIndex < text->chunkNativeLimit) {
    260             int64_t offset = nativeIndex - text->chunkNativeStart;
    261             // Ensure chunk offset is well formed if computed offset exceeds int32_t range.
    262             ASSERT(offset <= numeric_limits<int32_t>::max());
    263             text->chunkOffset = offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0;
    264             isAccessible = TRUE;
    265             return true;
    266         }
    267         if (nativeIndex >= nativeLength && text->chunkNativeLimit == nativeLength) {
    268             text->chunkOffset = text->chunkLength;
    269             isAccessible = FALSE;
    270             return true;
    271         }
    272     } else {
    273         if (nativeIndex > text->chunkNativeStart && nativeIndex <= text->chunkNativeLimit) {
    274             int64_t offset = nativeIndex - text->chunkNativeStart;
    275             // Ensure chunk offset is well formed if computed offset exceeds int32_t range.
    276             ASSERT(offset <= numeric_limits<int32_t>::max());
    277             text->chunkOffset = offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0;
    278             isAccessible = TRUE;
    279             return true;
    280         }
    281         if (nativeIndex <= 0 && !text->chunkNativeStart) {
    282             text->chunkOffset = 0;
    283             isAccessible = FALSE;
    284             return true;
    285         }
    286     }
    287     return false;
    288 }
    289 
    290 static UBool textLatin1Access(UText* text, int64_t nativeIndex, UBool forward)
    291 {
    292     if (!text->context)
    293         return FALSE;
    294     int64_t nativeLength = textNativeLength(text);
    295     UBool isAccessible;
    296     if (textInChunkOrOutOfRange(text, nativeIndex, nativeLength, forward, isAccessible))
    297         return isAccessible;
    298     nativeIndex = textPinIndex(nativeIndex, nativeLength - 1);
    299     TextContext currentContext = textLatin1GetCurrentContext(text);
    300     TextContext newContext = textGetContext(text, nativeIndex, forward);
    301     ASSERT(newContext != NoContext);
    302     if (newContext == currentContext) {
    303         if (currentContext == PrimaryContext) {
    304             textLatin1MoveInPrimaryContext(text, nativeIndex, nativeLength, forward);
    305         } else {
    306             textLatin1MoveInPriorContext(text, nativeIndex, nativeLength, forward);
    307         }
    308     } else if (newContext == PrimaryContext) {
    309         textLatin1SwitchToPrimaryContext(text, nativeIndex, nativeLength, forward);
    310     } else {
    311         ASSERT(newContext == PriorContext);
    312         textLatin1SwitchToPriorContext(text, nativeIndex, nativeLength, forward);
    313     }
    314     return TRUE;
    315 }
    316 
    317 static const struct UTextFuncs textLatin1Funcs = {
    318     sizeof(UTextFuncs),
    319     0, 0, 0,
    320     textClone,
    321     textNativeLength,
    322     textLatin1Access,
    323     textExtract,
    324     0, 0, 0, 0,
    325     textClose,
    326     0, 0, 0,
    327 };
    328 
    329 static void textInit(UText* text, const UTextFuncs* funcs, const void* string, unsigned length, const UChar* priorContext, int priorContextLength)
    330 {
    331     text->pFuncs = funcs;
    332     text->providerProperties = 1 << UTEXT_PROVIDER_STABLE_CHUNKS;
    333     text->context = string;
    334     text->p = string;
    335     text->a = length;
    336     text->q = priorContext;
    337     text->b = priorContextLength;
    338 }
    339 
    340 static UText* textOpenLatin1(UTextWithBuffer* utWithBuffer, const LChar* string, unsigned length, const UChar* priorContext, int priorContextLength, UErrorCode* status)
    341 {
    342     if (U_FAILURE(*status))
    343         return 0;
    344 
    345     if (!string || length > static_cast<unsigned>(numeric_limits<int32_t>::max())) {
    346         *status = U_ILLEGAL_ARGUMENT_ERROR;
    347         return 0;
    348     }
    349     UText* text = utext_setup(&utWithBuffer->text, sizeof(utWithBuffer->buffer), status);
    350     if (U_FAILURE(*status)) {
    351         ASSERT(!text);
    352         return 0;
    353     }
    354     textInit(text, &textLatin1Funcs, string, length, priorContext, priorContextLength);
    355     return text;
    356 }
    357 
    358 static inline TextContext textUTF16GetCurrentContext(const UText* text)
    359 {
    360     if (!text->chunkContents)
    361         return NoContext;
    362     return text->chunkContents == text->p ? PrimaryContext : PriorContext;
    363 }
    364 
    365 static void textUTF16MoveInPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
    366 {
    367     ASSERT(text->chunkContents == text->p);
    368     ASSERT_UNUSED(forward, forward ? nativeIndex >= text->b : nativeIndex > text->b);
    369     ASSERT_UNUSED(forward, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
    370     text->chunkNativeStart = text->b;
    371     text->chunkNativeLimit = nativeLength;
    372     int64_t length = text->chunkNativeLimit - text->chunkNativeStart;
    373     // Ensure chunk length is well defined if computed length exceeds int32_t range.
    374     ASSERT(length <= numeric_limits<int32_t>::max());
    375     text->chunkLength = length <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(length) : 0;
    376     text->nativeIndexingLimit = text->chunkLength;
    377     int64_t offset = nativeIndex - text->chunkNativeStart;
    378     // Ensure chunk offset is well defined if computed offset exceeds int32_t range or chunk length.
    379     ASSERT(offset <= numeric_limits<int32_t>::max());
    380     text->chunkOffset = min(offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0, text->chunkLength);
    381 }
    382 
    383 static void textUTF16SwitchToPrimaryContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
    384 {
    385     ASSERT(!text->chunkContents || text->chunkContents == text->q);
    386     text->chunkContents = static_cast<const UChar*>(text->p);
    387     textUTF16MoveInPrimaryContext(text, nativeIndex, nativeLength, forward);
    388 }
    389 
    390 static void textUTF16MoveInPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
    391 {
    392     ASSERT(text->chunkContents == text->q);
    393     ASSERT(forward ? nativeIndex < text->b : nativeIndex <= text->b);
    394     ASSERT_UNUSED(nativeLength, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
    395     ASSERT_UNUSED(forward, forward ? nativeIndex < nativeLength : nativeIndex <= nativeLength);
    396     text->chunkNativeStart = 0;
    397     text->chunkNativeLimit = text->b;
    398     text->chunkLength = text->b;
    399     text->nativeIndexingLimit = text->chunkLength;
    400     int64_t offset = nativeIndex - text->chunkNativeStart;
    401     // Ensure chunk offset is well defined if computed offset exceeds int32_t range or chunk length.
    402     ASSERT(offset <= numeric_limits<int32_t>::max());
    403     text->chunkOffset = min(offset <= numeric_limits<int32_t>::max() ? static_cast<int32_t>(offset) : 0, text->chunkLength);
    404 }
    405 
    406 static void textUTF16SwitchToPriorContext(UText* text, int64_t nativeIndex, int64_t nativeLength, UBool forward)
    407 {
    408     ASSERT(!text->chunkContents || text->chunkContents == text->p);
    409     text->chunkContents = static_cast<const UChar*>(text->q);
    410     textUTF16MoveInPriorContext(text, nativeIndex, nativeLength, forward);
    411 }
    412 
    413 static UBool textUTF16Access(UText* text, int64_t nativeIndex, UBool forward)
    414 {
    415     if (!text->context)
    416         return FALSE;
    417     int64_t nativeLength = textNativeLength(text);
    418     UBool isAccessible;
    419     if (textInChunkOrOutOfRange(text, nativeIndex, nativeLength, forward, isAccessible))
    420         return isAccessible;
    421     nativeIndex = textPinIndex(nativeIndex, nativeLength - 1);
    422     TextContext currentContext = textUTF16GetCurrentContext(text);
    423     TextContext newContext = textGetContext(text, nativeIndex, forward);
    424     ASSERT(newContext != NoContext);
    425     if (newContext == currentContext) {
    426         if (currentContext == PrimaryContext) {
    427             textUTF16MoveInPrimaryContext(text, nativeIndex, nativeLength, forward);
    428         } else {
    429             textUTF16MoveInPriorContext(text, nativeIndex, nativeLength, forward);
    430         }
    431     } else if (newContext == PrimaryContext) {
    432         textUTF16SwitchToPrimaryContext(text, nativeIndex, nativeLength, forward);
    433     } else {
    434         ASSERT(newContext == PriorContext);
    435         textUTF16SwitchToPriorContext(text, nativeIndex, nativeLength, forward);
    436     }
    437     return TRUE;
    438 }
    439 
    440 static const struct UTextFuncs textUTF16Funcs = {
    441     sizeof(UTextFuncs),
    442     0, 0, 0,
    443     textClone,
    444     textNativeLength,
    445     textUTF16Access,
    446     textExtract,
    447     0, 0, 0, 0,
    448     textClose,
    449     0, 0, 0,
    450 };
    451 
    452 static UText* textOpenUTF16(UText* text, const UChar* string, unsigned length, const UChar* priorContext, int priorContextLength, UErrorCode* status)
    453 {
    454     if (U_FAILURE(*status))
    455         return 0;
    456 
    457     if (!string || length > static_cast<unsigned>(numeric_limits<int32_t>::max())) {
    458         *status = U_ILLEGAL_ARGUMENT_ERROR;
    459         return 0;
    460     }
    461 
    462     text = utext_setup(text, 0, status);
    463     if (U_FAILURE(*status)) {
    464         ASSERT(!text);
    465         return 0;
    466     }
    467     textInit(text, &textUTF16Funcs, string, length, priorContext, priorContextLength);
    468     return text;
    469 }
    470 
// Pristine UText value. It is copied into stack-allocated UTextWithBuffer
// instances before their extra buffer fields are filled in.
static UText emptyText = UTEXT_INITIALIZER;
    472 
    473 static TextBreakIterator* wordBreakIterator(const LChar* string, int length)
    474 {
    475     UErrorCode errorCode = U_ZERO_ERROR;
    476     static TextBreakIterator* breakIter = 0;
    477     if (!breakIter) {
    478         breakIter = icu::BreakIterator::createWordInstance(icu::Locale(currentTextBreakLocaleID()), errorCode);
    479         ASSERT_WITH_MESSAGE(U_SUCCESS(errorCode), "ICU could not open a break iterator: %s (%d)", u_errorName(errorCode), errorCode);
    480         if (!breakIter)
    481             return 0;
    482     }
    483 
    484     UTextWithBuffer textLocal;
    485     textLocal.text = emptyText;
    486     textLocal.text.extraSize = sizeof(textLocal.buffer);
    487     textLocal.text.pExtra = textLocal.buffer;
    488 
    489     UErrorCode openStatus = U_ZERO_ERROR;
    490     UText* text = textOpenLatin1(&textLocal, string, length, 0, 0, &openStatus);
    491     if (U_FAILURE(openStatus)) {
    492         WTF_LOG_ERROR("textOpenLatin1 failed with status %d", openStatus);
    493         return 0;
    494     }
    495 
    496     UErrorCode setTextStatus = U_ZERO_ERROR;
    497     breakIter->setText(text, setTextStatus);
    498     if (U_FAILURE(setTextStatus))
    499         WTF_LOG_ERROR("BreakIterator::seText failed with status %d", setTextStatus);
    500 
    501     utext_close(text);
    502 
    503     return breakIter;
    504 }
    505 
    506 static void setText16(TextBreakIterator* iter, const UChar* string, int length)
    507 {
    508     UErrorCode errorCode = U_ZERO_ERROR;
    509     UText uText = UTEXT_INITIALIZER;
    510     utext_openUChars(&uText, string, length, &errorCode);
    511     if (U_FAILURE(errorCode))
    512         return;
    513     iter->setText(&uText, errorCode);
    514 }
    515 
    516 TextBreakIterator* wordBreakIterator(const UChar* string, int length)
    517 {
    518     UErrorCode errorCode = U_ZERO_ERROR;
    519     static TextBreakIterator* breakIter = 0;
    520     if (!breakIter) {
    521         breakIter = icu::BreakIterator::createWordInstance(icu::Locale(currentTextBreakLocaleID()), errorCode);
    522         ASSERT_WITH_MESSAGE(U_SUCCESS(errorCode), "ICU could not open a break iterator: %s (%d)", u_errorName(errorCode), errorCode);
    523         if (!breakIter)
    524             return 0;
    525     }
    526     setText16(breakIter, string, length);
    527     return breakIter;
    528 }
    529 
    530 TextBreakIterator* wordBreakIterator(const String& string, int start, int length)
    531 {
    532     if (string.isEmpty())
    533         return 0;
    534     if (string.is8Bit())
    535         return wordBreakIterator(string.characters8() + start, length);
    536     return wordBreakIterator(string.characters16() + start, length);
    537 }
    538 
    539 TextBreakIterator* acquireLineBreakIterator(const LChar* string, int length, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength)
    540 {
    541     TextBreakIterator* iterator = LineBreakIteratorPool::sharedPool().take(locale);
    542     if (!iterator)
    543         return 0;
    544 
    545     UTextWithBuffer textLocal;
    546     textLocal.text = emptyText;
    547     textLocal.text.extraSize = sizeof(textLocal.buffer);
    548     textLocal.text.pExtra = textLocal.buffer;
    549 
    550     UErrorCode openStatus = U_ZERO_ERROR;
    551     UText* text = textOpenLatin1(&textLocal, string, length, priorContext, priorContextLength, &openStatus);
    552     if (U_FAILURE(openStatus)) {
    553         WTF_LOG_ERROR("textOpenLatin1 failed with status %d", openStatus);
    554         return 0;
    555     }
    556 
    557     UErrorCode setTextStatus = U_ZERO_ERROR;
    558     iterator->setText(text, setTextStatus);
    559     if (U_FAILURE(setTextStatus)) {
    560         WTF_LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus);
    561         return 0;
    562     }
    563 
    564     utext_close(text);
    565 
    566     return iterator;
    567 }
    568 
    569 TextBreakIterator* acquireLineBreakIterator(const UChar* string, int length, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength)
    570 {
    571     TextBreakIterator* iterator = LineBreakIteratorPool::sharedPool().take(locale);
    572     if (!iterator)
    573         return 0;
    574 
    575     UText textLocal = UTEXT_INITIALIZER;
    576 
    577     UErrorCode openStatus = U_ZERO_ERROR;
    578     UText* text = textOpenUTF16(&textLocal, string, length, priorContext, priorContextLength, &openStatus);
    579     if (U_FAILURE(openStatus)) {
    580         WTF_LOG_ERROR("textOpenUTF16 failed with status %d", openStatus);
    581         return 0;
    582     }
    583 
    584     UErrorCode setTextStatus = U_ZERO_ERROR;
    585     iterator->setText(text, setTextStatus);
    586     if (U_FAILURE(setTextStatus)) {
    587         WTF_LOG_ERROR("ubrk_setUText failed with status %d", setTextStatus);
    588         return 0;
    589     }
    590 
    591     utext_close(text);
    592 
    593     return iterator;
    594 }
    595 
    596 void releaseLineBreakIterator(TextBreakIterator* iterator)
    597 {
    598     ASSERT_ARG(iterator, iterator);
    599 
    600     LineBreakIteratorPool::sharedPool().put(iterator);
    601 }
    602 
// Holds at most one idle character break iterator for reuse by
// NonSharedCharacterBreakIterator; null while no iterator is parked here.
// Writes go through compareAndSwapNonSharedCharacterBreakIterator().
static TextBreakIterator* nonSharedCharacterBreakIterator;
    604 
    605 static inline bool compareAndSwapNonSharedCharacterBreakIterator(TextBreakIterator* expected, TextBreakIterator* newValue)
    606 {
    607     DEFINE_STATIC_LOCAL(Mutex, nonSharedCharacterBreakIteratorMutex, ());
    608     MutexLocker locker(nonSharedCharacterBreakIteratorMutex);
    609     if (nonSharedCharacterBreakIterator != expected)
    610         return false;
    611     nonSharedCharacterBreakIterator = newValue;
    612     return true;
    613 }
    614 
    615 NonSharedCharacterBreakIterator::NonSharedCharacterBreakIterator(const String& string)
    616     : m_is8Bit(true)
    617     , m_charaters8(0)
    618     , m_offset(0)
    619     , m_length(0)
    620     , m_iterator(0)
    621 {
    622     if (string.isEmpty())
    623         return;
    624 
    625     m_is8Bit = string.is8Bit();
    626 
    627     if (m_is8Bit) {
    628         m_charaters8 = string.characters8();
    629         m_offset = 0;
    630         m_length = string.length();
    631         return;
    632     }
    633 
    634     createIteratorForBuffer(string.characters16(), string.length());
    635 }
    636 
    637 NonSharedCharacterBreakIterator::NonSharedCharacterBreakIterator(const UChar* buffer, unsigned length)
    638     : m_is8Bit(false)
    639     , m_charaters8(0)
    640     , m_offset(0)
    641     , m_length(0)
    642     , m_iterator(0)
    643 {
    644     createIteratorForBuffer(buffer, length);
    645 }
    646 
    647 void NonSharedCharacterBreakIterator::createIteratorForBuffer(const UChar* buffer, unsigned length)
    648 {
    649     m_iterator = nonSharedCharacterBreakIterator;
    650     bool createdIterator = m_iterator && compareAndSwapNonSharedCharacterBreakIterator(m_iterator, 0);
    651     if (!createdIterator) {
    652         UErrorCode errorCode = U_ZERO_ERROR;
    653         m_iterator = icu::BreakIterator::createCharacterInstance(icu::Locale(currentTextBreakLocaleID()), errorCode);
    654         ASSERT_WITH_MESSAGE(U_SUCCESS(errorCode), "ICU could not open a break iterator: %s (%d)", u_errorName(errorCode), errorCode);
    655     }
    656 
    657     setText16(m_iterator, buffer, length);
    658 }
    659 
    660 NonSharedCharacterBreakIterator::~NonSharedCharacterBreakIterator()
    661 {
    662     if (m_is8Bit)
    663         return;
    664     if (!compareAndSwapNonSharedCharacterBreakIterator(0, m_iterator))
    665         delete m_iterator;
    666 }
    667 
    668 int NonSharedCharacterBreakIterator::next()
    669 {
    670     if (!m_is8Bit)
    671         return m_iterator->next();
    672 
    673     if (m_offset >= m_length)
    674         return TextBreakDone;
    675 
    676     m_offset += clusterLengthStartingAt(m_offset);
    677     return m_offset;
    678 }
    679 
    680 int NonSharedCharacterBreakIterator::current()
    681 {
    682     if (!m_is8Bit)
    683         return m_iterator->current();
    684     return m_offset;
    685 }
    686 
    687 bool NonSharedCharacterBreakIterator::isBreak(int offset) const
    688 {
    689     if (!m_is8Bit)
    690         return m_iterator->isBoundary(offset);
    691     return !isLFAfterCR(offset);
    692 }
    693 
    694 int NonSharedCharacterBreakIterator::preceding(int offset) const
    695 {
    696     if (!m_is8Bit)
    697         return m_iterator->preceding(offset);
    698     if (offset <= 0)
    699         return TextBreakDone;
    700     if (isLFAfterCR(offset))
    701         return offset - 2;
    702     return offset - 1;
    703 }
    704 
    705 int NonSharedCharacterBreakIterator::following(int offset) const
    706 {
    707     if (!m_is8Bit)
    708         return m_iterator->following(offset);
    709     if (static_cast<unsigned>(offset) >= m_length)
    710         return TextBreakDone;
    711     return offset + clusterLengthStartingAt(offset);
    712 }
    713 
    714 TextBreakIterator* sentenceBreakIterator(const UChar* string, int length)
    715 {
    716     UErrorCode openStatus = U_ZERO_ERROR;
    717     static TextBreakIterator* iterator = 0;
    718     if (!iterator) {
    719         iterator =  icu::BreakIterator::createSentenceInstance(icu::Locale(currentTextBreakLocaleID()), openStatus);
    720         ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);
    721         if (!iterator)
    722             return 0;
    723     }
    724 
    725     setText16(iterator, string, length);
    726     return iterator;
    727 }
    728 
    729 bool isWordTextBreak(TextBreakIterator* iterator)
    730 {
    731     icu::RuleBasedBreakIterator* ruleBasedBreakIterator = static_cast<icu::RuleBasedBreakIterator*>(iterator);
    732     int ruleStatus = ruleBasedBreakIterator->getRuleStatus();
    733     return ruleStatus != UBRK_WORD_NONE;
    734 }
    735 
    736 static TextBreakIterator* setUpIteratorWithRules(const char* breakRules, const UChar* string, int length)
    737 {
    738     if (!string)
    739         return 0;
    740 
    741     static TextBreakIterator* iterator = 0;
    742     if (!iterator) {
    743         UParseError parseStatus;
    744         UErrorCode openStatus = U_ZERO_ERROR;
    745         Vector<UChar> rules;
    746         String(breakRules).appendTo(rules);
    747 
    748         iterator = new icu::RuleBasedBreakIterator(icu::UnicodeString(rules.data(), rules.size()), parseStatus, openStatus);
    749         ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);
    750         if (!iterator)
    751             return 0;
    752     }
    753 
    754     setText16(iterator, string, length);
    755     return iterator;
    756 }
    757 
    758 TextBreakIterator* cursorMovementIterator(const UChar* string, int length)
    759 {
    760     // This rule set is based on character-break iterator rules of ICU 4.0
    761     // <http://source.icu-project.org/repos/icu/icu/tags/release-4-0/source/data/brkitr/char.txt>.
    762     // The major differences from the original ones are listed below:
    763     // * Replaced '[\p{Grapheme_Cluster_Break = SpacingMark}]' with '[\p{General_Category = Spacing Mark} - $Extend]' for ICU 3.8 or earlier;
    764     // * Removed rules that prevent a cursor from moving after prepend characters (Bug 24342);
    765     // * Added rules that prevent a cursor from moving after virama signs of Indic languages except Tamil (Bug 15790), and;
    766     // * Added rules that prevent a cursor from moving before Japanese half-width katakara voiced marks.
    767     // * Added rules for regional indicator symbols.
    768     static const char* const kRules =
    769         "$CR      = [\\p{Grapheme_Cluster_Break = CR}];"
    770         "$LF      = [\\p{Grapheme_Cluster_Break = LF}];"
    771         "$Control = [\\p{Grapheme_Cluster_Break = Control}];"
    772         "$VoiceMarks = [\\uFF9E\\uFF9F];"  // Japanese half-width katakana voiced marks
    773         "$Extend  = [\\p{Grapheme_Cluster_Break = Extend} $VoiceMarks - [\\u0E30 \\u0E32 \\u0E45 \\u0EB0 \\u0EB2]];"
    774         "$SpacingMark = [[\\p{General_Category = Spacing Mark}] - $Extend];"
    775         "$L       = [\\p{Grapheme_Cluster_Break = L}];"
    776         "$V       = [\\p{Grapheme_Cluster_Break = V}];"
    777         "$T       = [\\p{Grapheme_Cluster_Break = T}];"
    778         "$LV      = [\\p{Grapheme_Cluster_Break = LV}];"
    779         "$LVT     = [\\p{Grapheme_Cluster_Break = LVT}];"
    780         "$Hin0    = [\\u0905-\\u0939];"    // Devanagari Letter A,...,Ha
    781         "$HinV    = \\u094D;"              // Devanagari Sign Virama
    782         "$Hin1    = [\\u0915-\\u0939];"    // Devanagari Letter Ka,...,Ha
    783         "$Ben0    = [\\u0985-\\u09B9];"    // Bengali Letter A,...,Ha
    784         "$BenV    = \\u09CD;"              // Bengali Sign Virama
    785         "$Ben1    = [\\u0995-\\u09B9];"    // Bengali Letter Ka,...,Ha
    786         "$Pan0    = [\\u0A05-\\u0A39];"    // Gurmukhi Letter A,...,Ha
    787         "$PanV    = \\u0A4D;"              // Gurmukhi Sign Virama
    788         "$Pan1    = [\\u0A15-\\u0A39];"    // Gurmukhi Letter Ka,...,Ha
    789         "$Guj0    = [\\u0A85-\\u0AB9];"    // Gujarati Letter A,...,Ha
    790         "$GujV    = \\u0ACD;"              // Gujarati Sign Virama
    791         "$Guj1    = [\\u0A95-\\u0AB9];"    // Gujarati Letter Ka,...,Ha
    792         "$Ori0    = [\\u0B05-\\u0B39];"    // Oriya Letter A,...,Ha
    793         "$OriV    = \\u0B4D;"              // Oriya Sign Virama
    794         "$Ori1    = [\\u0B15-\\u0B39];"    // Oriya Letter Ka,...,Ha
    795         "$Tel0    = [\\u0C05-\\u0C39];"    // Telugu Letter A,...,Ha
    796         "$TelV    = \\u0C4D;"              // Telugu Sign Virama
    797         "$Tel1    = [\\u0C14-\\u0C39];"    // Telugu Letter Ka,...,Ha
    798         "$Kan0    = [\\u0C85-\\u0CB9];"    // Kannada Letter A,...,Ha
    799         "$KanV    = \\u0CCD;"              // Kannada Sign Virama
    800         "$Kan1    = [\\u0C95-\\u0CB9];"    // Kannada Letter A,...,Ha
    801         "$Mal0    = [\\u0D05-\\u0D39];"    // Malayalam Letter A,...,Ha
    802         "$MalV    = \\u0D4D;"              // Malayalam Sign Virama
    803         "$Mal1    = [\\u0D15-\\u0D39];"    // Malayalam Letter A,...,Ha
    804         "$RI      = [\\U0001F1E6-\\U0001F1FF];" // Emoji regional indicators
    805         "!!chain;"
    806         "!!forward;"
    807         "$CR $LF;"
    808         "$L ($L | $V | $LV | $LVT);"
    809         "($LV | $V) ($V | $T);"
    810         "($LVT | $T) $T;"
    811         "[^$Control $CR $LF] $Extend;"
    812         "[^$Control $CR $LF] $SpacingMark;"
    813         "$RI $RI / $RI;"
    814         "$RI $RI;"
    815         "$Hin0 $HinV $Hin1;"               // Devanagari Virama (forward)
    816         "$Ben0 $BenV $Ben1;"               // Bengali Virama (forward)
    817         "$Pan0 $PanV $Pan1;"               // Gurmukhi Virama (forward)
    818         "$Guj0 $GujV $Guj1;"               // Gujarati Virama (forward)
    819         "$Ori0 $OriV $Ori1;"               // Oriya Virama (forward)
    820         "$Tel0 $TelV $Tel1;"               // Telugu Virama (forward)
    821         "$Kan0 $KanV $Kan1;"               // Kannada Virama (forward)
    822         "$Mal0 $MalV $Mal1;"               // Malayalam Virama (forward)
    823         "!!reverse;"
    824         "$LF $CR;"
    825         "($L | $V | $LV | $LVT) $L;"
    826         "($V | $T) ($LV | $V);"
    827         "$T ($LVT | $T);"
    828         "$Extend      [^$Control $CR $LF];"
    829         "$SpacingMark [^$Control $CR $LF];"
    830         "$RI $RI / $RI $RI;"
    831         "$RI $RI;"
    832         "$Hin1 $HinV $Hin0;"               // Devanagari Virama (backward)
    833         "$Ben1 $BenV $Ben0;"               // Bengali Virama (backward)
    834         "$Pan1 $PanV $Pan0;"               // Gurmukhi Virama (backward)
    835         "$Guj1 $GujV $Guj0;"               // Gujarati Virama (backward)
    836         "$Ori1 $OriV $Ori0;"               // Gujarati Virama (backward)
    837         "$Tel1 $TelV $Tel0;"               // Telugu Virama (backward)
    838         "$Kan1 $KanV $Kan0;"               // Kannada Virama (backward)
    839         "$Mal1 $MalV $Mal0;"               // Malayalam Virama (backward)
    840         "!!safe_reverse;"
    841         "!!safe_forward;";
    842 
    843     return setUpIteratorWithRules(kRules, string, length);
    844 }
    845 
    846 }
    847