Home | History | Annotate | Download | only in text
      1 /*
      2  * Copyright (C) 2006 Lars Knoll <lars (at) trolltech.com>
      3  * Copyright (C) 2007, 2011, 2012 Apple Inc. All rights reserved.
      4  *
      5  * This library is free software; you can redistribute it and/or
      6  * modify it under the terms of the GNU Library General Public
      7  * License as published by the Free Software Foundation; either
      8  * version 2 of the License, or (at your option) any later version.
      9  *
     10  * This library is distributed in the hope that it will be useful,
     11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
     12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     13  * Library General Public License for more details.
     14  *
     15  * You should have received a copy of the GNU Library General Public License
     16  * along with this library; see the file COPYING.LIB.  If not, write to
     17  * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
     18  * Boston, MA 02110-1301, USA.
     19  *
     20  */
     21 
     22 #ifndef TextBreakIterator_h
     23 #define TextBreakIterator_h
     24 
     25 #include "platform/PlatformExport.h"
     26 #include "wtf/text/AtomicString.h"
     27 #include "wtf/unicode/Unicode.h"
     28 
     29 #include <unicode/brkiter.h>
     30 
     31 namespace blink {
     32 
     33 typedef icu::BreakIterator TextBreakIterator;
     34 
     35 // Note: The returned iterator is good only until you get another iterator, with the exception of acquireLineBreakIterator.
     36 
     37 // This is similar to character break iterator in most cases, but is subject to
     38 // platform UI conventions. One notable example where this can be different
     39 // from character break iterator is Thai prepend characters, see bug 24342.
     40 // Use this for insertion point and selection manipulations.
     41 PLATFORM_EXPORT TextBreakIterator* cursorMovementIterator(const UChar*, int length);
     42 
     43 PLATFORM_EXPORT TextBreakIterator* wordBreakIterator(const String&, int start, int length);
     44 PLATFORM_EXPORT TextBreakIterator* wordBreakIterator(const UChar*, int length);
     45 PLATFORM_EXPORT TextBreakIterator* acquireLineBreakIterator(const LChar*, int length, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength);
     46 PLATFORM_EXPORT TextBreakIterator* acquireLineBreakIterator(const UChar*, int length, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength);
     47 PLATFORM_EXPORT void releaseLineBreakIterator(TextBreakIterator*);
     48 PLATFORM_EXPORT TextBreakIterator* sentenceBreakIterator(const UChar*, int length);
     49 
     50 PLATFORM_EXPORT bool isWordTextBreak(TextBreakIterator*);
     51 
     52 const int TextBreakDone = -1;
     53 
     54 class PLATFORM_EXPORT LazyLineBreakIterator {
     55 public:
     56     LazyLineBreakIterator()
     57         : m_iterator(0)
     58         , m_cachedPriorContext(0)
     59         , m_cachedPriorContextLength(0)
     60     {
     61         resetPriorContext();
     62     }
     63 
     64     LazyLineBreakIterator(String string, const AtomicString& locale = AtomicString())
     65         : m_string(string)
     66         , m_locale(locale)
     67         , m_iterator(0)
     68         , m_cachedPriorContext(0)
     69         , m_cachedPriorContextLength(0)
     70     {
     71         resetPriorContext();
     72     }
     73 
     74     ~LazyLineBreakIterator()
     75     {
     76         if (m_iterator)
     77             releaseLineBreakIterator(m_iterator);
     78     }
     79 
     80     String string() const { return m_string; }
     81 
     82     UChar lastCharacter() const
     83     {
     84         COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length);
     85         return m_priorContext[1];
     86     }
     87 
     88     UChar secondToLastCharacter() const
     89     {
     90         COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length);
     91         return m_priorContext[0];
     92     }
     93 
     94     void setPriorContext(UChar last, UChar secondToLast)
     95     {
     96         COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length);
     97         m_priorContext[0] = secondToLast;
     98         m_priorContext[1] = last;
     99     }
    100 
    101     void updatePriorContext(UChar last)
    102     {
    103         COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length);
    104         m_priorContext[0] = m_priorContext[1];
    105         m_priorContext[1] = last;
    106     }
    107 
    108     void resetPriorContext()
    109     {
    110         COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length);
    111         m_priorContext[0] = 0;
    112         m_priorContext[1] = 0;
    113     }
    114 
    115     unsigned priorContextLength() const
    116     {
    117         unsigned priorContextLength = 0;
    118         COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length);
    119         if (m_priorContext[1]) {
    120             ++priorContextLength;
    121             if (m_priorContext[0])
    122                 ++priorContextLength;
    123         }
    124         return priorContextLength;
    125     }
    126 
    127     // Obtain text break iterator, possibly previously cached, where this iterator is (or has been)
    128     // initialized to use the previously stored string as the primary breaking context and using
    129     // previously stored prior context if non-empty.
    130     TextBreakIterator* get(unsigned priorContextLength)
    131     {
    132         ASSERT(priorContextLength <= priorContextCapacity);
    133         const UChar* priorContext = priorContextLength ? &m_priorContext[priorContextCapacity - priorContextLength] : 0;
    134         if (!m_iterator) {
    135             if (m_string.is8Bit())
    136                 m_iterator = acquireLineBreakIterator(m_string.characters8(), m_string.length(), m_locale, priorContext, priorContextLength);
    137             else
    138                 m_iterator = acquireLineBreakIterator(m_string.characters16(), m_string.length(), m_locale, priorContext, priorContextLength);
    139             m_cachedPriorContext = priorContext;
    140             m_cachedPriorContextLength = priorContextLength;
    141         } else if (priorContext != m_cachedPriorContext || priorContextLength != m_cachedPriorContextLength) {
    142             this->resetStringAndReleaseIterator(m_string, m_locale);
    143             return this->get(priorContextLength);
    144         }
    145         return m_iterator;
    146     }
    147 
    148     void resetStringAndReleaseIterator(String string, const AtomicString& locale)
    149     {
    150         if (m_iterator)
    151             releaseLineBreakIterator(m_iterator);
    152 
    153         m_string = string;
    154         m_locale = locale;
    155         m_iterator = 0;
    156         m_cachedPriorContext = 0;
    157         m_cachedPriorContextLength = 0;
    158     }
    159 
    160 private:
    161     static const unsigned priorContextCapacity = 2;
    162     String m_string;
    163     AtomicString m_locale;
    164     TextBreakIterator* m_iterator;
    165     UChar m_priorContext[priorContextCapacity];
    166     const UChar* m_cachedPriorContext;
    167     unsigned m_cachedPriorContextLength;
    168 };
    169 
    170 // Iterates over "extended grapheme clusters", as defined in UAX #29.
    171 // Note that platform implementations may be less sophisticated - e.g. ICU prior to
    172 // version 4.0 only supports "legacy grapheme clusters".
    173 // Use this for general text processing, e.g. string truncation.
    174 
    175 class PLATFORM_EXPORT NonSharedCharacterBreakIterator {
    176     WTF_MAKE_NONCOPYABLE(NonSharedCharacterBreakIterator);
    177 public:
    178     explicit NonSharedCharacterBreakIterator(const String&);
    179     NonSharedCharacterBreakIterator(const UChar*, unsigned length);
    180     ~NonSharedCharacterBreakIterator();
    181 
    182     int next();
    183     int current();
    184 
    185     bool isBreak(int offset) const;
    186     int preceding(int offset) const;
    187     int following(int offset) const;
    188 
    189     bool operator!() const
    190     {
    191         return !m_is8Bit && !m_iterator;
    192     }
    193 
    194 private:
    195     void createIteratorForBuffer(const UChar*, unsigned length);
    196 
    197     unsigned clusterLengthStartingAt(unsigned offset) const
    198     {
    199         ASSERT(m_is8Bit);
    200         // The only Latin-1 Extended Grapheme Cluster is CR LF
    201         return isCRBeforeLF(offset) ? 2 : 1;
    202     }
    203 
    204     bool isCRBeforeLF(unsigned offset) const
    205     {
    206         ASSERT(m_is8Bit);
    207         return m_charaters8[offset] == '\r' && offset + 1 < m_length && m_charaters8[offset + 1] == '\n';
    208     }
    209 
    210     bool isLFAfterCR(unsigned offset) const
    211     {
    212         ASSERT(m_is8Bit);
    213         return m_charaters8[offset] == '\n' && offset >= 1 && m_charaters8[offset - 1] == '\r';
    214     }
    215 
    216     bool m_is8Bit;
    217 
    218     // For 8 bit strings, we implement the iterator ourselves.
    219     const LChar* m_charaters8;
    220     unsigned m_offset;
    221     unsigned m_length;
    222 
    223     // For 16 bit strings, we use a TextBreakIterator.
    224     TextBreakIterator* m_iterator;
    225 };
    226 
    227 // Counts the number of grapheme clusters. A surrogate pair or a sequence
    228 // of a non-combining character and following combining characters is
    229 // counted as 1 grapheme cluster.
    230 PLATFORM_EXPORT unsigned numGraphemeClusters(const String&);
    231 // Returns the number of characters which will be less than or equal to
    232 // the specified grapheme cluster length.
    233 PLATFORM_EXPORT unsigned numCharactersInGraphemeClusters(const String&, unsigned);
    234 
    235 }
    236 
    237 #endif
    238