Home | History | Annotate | Download | only in text
      1 /*
      2  * Copyright (C) 2006 Lars Knoll <lars (at) trolltech.com>
      3  * Copyright (C) 2007, 2011, 2012 Apple Inc. All rights reserved.
      4  *
      5  * This library is free software; you can redistribute it and/or
      6  * modify it under the terms of the GNU Library General Public
      7  * License as published by the Free Software Foundation; either
      8  * version 2 of the License, or (at your option) any later version.
      9  *
     10  * This library is distributed in the hope that it will be useful,
     11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
     12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     13  * Library General Public License for more details.
     14  *
     15  * You should have received a copy of the GNU Library General Public License
     16  * along with this library; see the file COPYING.LIB.  If not, write to
     17  * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
     18  * Boston, MA 02110-1301, USA.
     19  *
     20  */
     21 
     22 #ifndef TextBreakIterator_h
     23 #define TextBreakIterator_h
     24 
     25 #include "wtf/text/AtomicString.h"
     26 #include "wtf/unicode/Unicode.h"
     27 
     28 namespace WebCore {
     29 
     30 class TextBreakIterator;
     31 
     32 // Note: The returned iterator is good only until you get another iterator, with the exception of acquireLineBreakIterator.
     33 
     34 // This is similar to character break iterator in most cases, but is subject to
     35 // platform UI conventions. One notable example where this can be different
     36 // from character break iterator is Thai prepend characters, see bug 24342.
     37 // Use this for insertion point and selection manipulations.
     38 TextBreakIterator* cursorMovementIterator(const UChar*, int length);
     39 
     40 TextBreakIterator* wordBreakIterator(const String&, int start, int length);
     41 TextBreakIterator* wordBreakIterator(const UChar*, int length);
     42 TextBreakIterator* acquireLineBreakIterator(const LChar*, int length, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength);
     43 TextBreakIterator* acquireLineBreakIterator(const UChar*, int length, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength);
     44 void releaseLineBreakIterator(TextBreakIterator*);
     45 TextBreakIterator* sentenceBreakIterator(const UChar*, int length);
     46 
     47 int textBreakFirst(TextBreakIterator*);
     48 int textBreakLast(TextBreakIterator*);
     49 int textBreakNext(TextBreakIterator*);
     50 int textBreakPrevious(TextBreakIterator*);
     51 int textBreakCurrent(TextBreakIterator*);
     52 int textBreakPreceding(TextBreakIterator*, int);
     53 int textBreakFollowing(TextBreakIterator*, int);
     54 bool isTextBreak(TextBreakIterator*, int);
     55 bool isWordTextBreak(TextBreakIterator*);
     56 
     57 const int TextBreakDone = -1;
     58 
     59 class LazyLineBreakIterator {
     60 public:
     61     LazyLineBreakIterator()
     62         : m_iterator(0)
     63         , m_cachedPriorContext(0)
     64         , m_cachedPriorContextLength(0)
     65     {
     66         resetPriorContext();
     67     }
     68 
     69     LazyLineBreakIterator(String string, const AtomicString& locale = AtomicString())
     70         : m_string(string)
     71         , m_locale(locale)
     72         , m_iterator(0)
     73         , m_cachedPriorContext(0)
     74         , m_cachedPriorContextLength(0)
     75     {
     76         resetPriorContext();
     77     }
     78 
     79     ~LazyLineBreakIterator()
     80     {
     81         if (m_iterator)
     82             releaseLineBreakIterator(m_iterator);
     83     }
     84 
     85     String string() const { return m_string; }
     86 
     87     UChar lastCharacter() const
     88     {
     89         COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length);
     90         return m_priorContext[1];
     91     }
     92 
     93     UChar secondToLastCharacter() const
     94     {
     95         COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length);
     96         return m_priorContext[0];
     97     }
     98 
     99     void setPriorContext(UChar last, UChar secondToLast)
    100     {
    101         COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length);
    102         m_priorContext[0] = secondToLast;
    103         m_priorContext[1] = last;
    104     }
    105 
    106     void updatePriorContext(UChar last)
    107     {
    108         COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length);
    109         m_priorContext[0] = m_priorContext[1];
    110         m_priorContext[1] = last;
    111     }
    112 
    113     void resetPriorContext()
    114     {
    115         COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length);
    116         m_priorContext[0] = 0;
    117         m_priorContext[1] = 0;
    118     }
    119 
    120     unsigned priorContextLength() const
    121     {
    122         unsigned priorContextLength = 0;
    123         COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length);
    124         if (m_priorContext[1]) {
    125             ++priorContextLength;
    126             if (m_priorContext[0])
    127                 ++priorContextLength;
    128         }
    129         return priorContextLength;
    130     }
    131 
    132     // Obtain text break iterator, possibly previously cached, where this iterator is (or has been)
    133     // initialized to use the previously stored string as the primary breaking context and using
    134     // previously stored prior context if non-empty.
    135     TextBreakIterator* get(unsigned priorContextLength)
    136     {
    137         ASSERT(priorContextLength <= priorContextCapacity);
    138         const UChar* priorContext = priorContextLength ? &m_priorContext[priorContextCapacity - priorContextLength] : 0;
    139         if (!m_iterator) {
    140             if (m_string.is8Bit())
    141                 m_iterator = acquireLineBreakIterator(m_string.characters8(), m_string.length(), m_locale, priorContext, priorContextLength);
    142             else
    143                 m_iterator = acquireLineBreakIterator(m_string.characters16(), m_string.length(), m_locale, priorContext, priorContextLength);
    144             m_cachedPriorContext = priorContext;
    145             m_cachedPriorContextLength = priorContextLength;
    146         } else if (priorContext != m_cachedPriorContext || priorContextLength != m_cachedPriorContextLength) {
    147             this->resetStringAndReleaseIterator(m_string, m_locale);
    148             return this->get(priorContextLength);
    149         }
    150         return m_iterator;
    151     }
    152 
    153     void resetStringAndReleaseIterator(String string, const AtomicString& locale)
    154     {
    155         if (m_iterator)
    156             releaseLineBreakIterator(m_iterator);
    157 
    158         m_string = string;
    159         m_locale = locale;
    160         m_iterator = 0;
    161         m_cachedPriorContext = 0;
    162         m_cachedPriorContextLength = 0;
    163     }
    164 
    165 private:
    166     static const unsigned priorContextCapacity = 2;
    167     String m_string;
    168     AtomicString m_locale;
    169     TextBreakIterator* m_iterator;
    170     UChar m_priorContext[priorContextCapacity];
    171     const UChar* m_cachedPriorContext;
    172     unsigned m_cachedPriorContextLength;
    173 };
    174 
    175 // Iterates over "extended grapheme clusters", as defined in UAX #29.
    176 // Note that platform implementations may be less sophisticated - e.g. ICU prior to
    177 // version 4.0 only supports "legacy grapheme clusters".
    178 // Use this for general text processing, e.g. string truncation.
    179 
    180 class NonSharedCharacterBreakIterator {
    181     WTF_MAKE_NONCOPYABLE(NonSharedCharacterBreakIterator);
    182 public:
    183     explicit NonSharedCharacterBreakIterator(const String&);
    184     NonSharedCharacterBreakIterator(const UChar*, unsigned length);
    185     ~NonSharedCharacterBreakIterator();
    186 
    187     int next();
    188     int current();
    189 
    190     bool isBreak(int offset) const;
    191     int preceding(int offset) const;
    192     int following(int offset) const;
    193 
    194     bool operator!() const
    195     {
    196         return !m_is8Bit && !m_iterator;
    197     }
    198 
    199 private:
    200     void createIteratorForBuffer(const UChar*, unsigned length);
    201 
    202     unsigned clusterLengthStartingAt(unsigned offset) const
    203     {
    204         ASSERT(m_is8Bit);
    205         // The only Latin-1 Extended Grapheme Cluster is CR LF
    206         return isCRBeforeLF(offset) ? 2 : 1;
    207     }
    208 
    209     bool isCRBeforeLF(unsigned offset) const
    210     {
    211         ASSERT(m_is8Bit);
    212         return m_charaters8[offset] == '\r' && offset + 1 < m_length && m_charaters8[offset + 1] == '\n';
    213     }
    214 
    215     bool isLFAfterCR(unsigned offset) const
    216     {
    217         ASSERT(m_is8Bit);
    218         return m_charaters8[offset] == '\n' && offset >= 1 && m_charaters8[offset - 1] == '\r';
    219     }
    220 
    221     bool m_is8Bit;
    222 
    223     // For 8 bit strings, we implement the iterator ourselves.
    224     const LChar* m_charaters8;
    225     unsigned m_offset;
    226     unsigned m_length;
    227 
    228     // For 16 bit strings, we use a TextBreakIterator.
    229     TextBreakIterator* m_iterator;
    230 };
    231 
    232 // Counts the number of grapheme clusters. A surrogate pair or a sequence
    233 // of a non-combining character and following combining characters is
    234 // counted as 1 grapheme cluster.
    235 unsigned numGraphemeClusters(const String&);
    236 // Returns the number of characters which will be less than or equal to
    237 // the specified grapheme cluster length.
    238 unsigned numCharactersInGraphemeClusters(const String&, unsigned);
    239 
    240 }
    241 
    242 #endif
    243