1 /* 2 * Copyright (C) 2006 Lars Knoll <lars (at) trolltech.com> 3 * Copyright (C) 2007, 2011, 2012 Apple Inc. All rights reserved. 4 * 5 * This library is free software; you can redistribute it and/or 6 * modify it under the terms of the GNU Library General Public 7 * License as published by the Free Software Foundation; either 8 * version 2 of the License, or (at your option) any later version. 9 * 10 * This library is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 * Library General Public License for more details. 14 * 15 * You should have received a copy of the GNU Library General Public License 16 * along with this library; see the file COPYING.LIB. If not, write to 17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 18 * Boston, MA 02110-1301, USA. 19 * 20 */ 21 22 #ifndef TextBreakIterator_h 23 #define TextBreakIterator_h 24 25 #include "platform/PlatformExport.h" 26 #include "wtf/text/AtomicString.h" 27 #include "wtf/unicode/Unicode.h" 28 29 #include <unicode/brkiter.h> 30 31 namespace blink { 32 33 typedef icu::BreakIterator TextBreakIterator; 34 35 // Note: The returned iterator is good only until you get another iterator, with the exception of acquireLineBreakIterator. 36 37 // This is similar to character break iterator in most cases, but is subject to 38 // platform UI conventions. One notable example where this can be different 39 // from character break iterator is Thai prepend characters, see bug 24342. 40 // Use this for insertion point and selection manipulations. 41 PLATFORM_EXPORT TextBreakIterator* cursorMovementIterator(const UChar*, int length); 42 43 PLATFORM_EXPORT TextBreakIterator* wordBreakIterator(const String&, int start, int length); 44 PLATFORM_EXPORT TextBreakIterator* wordBreakIterator(const UChar*, int length); 45 PLATFORM_EXPORT TextBreakIterator* acquireLineBreakIterator(const LChar*, int length, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength); 46 PLATFORM_EXPORT TextBreakIterator* acquireLineBreakIterator(const UChar*, int length, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength); 47 PLATFORM_EXPORT void releaseLineBreakIterator(TextBreakIterator*); 48 PLATFORM_EXPORT TextBreakIterator* sentenceBreakIterator(const UChar*, int length); 49 50 PLATFORM_EXPORT bool isWordTextBreak(TextBreakIterator*); 51 52 const int TextBreakDone = -1; 53 54 class PLATFORM_EXPORT LazyLineBreakIterator { 55 public: 56 LazyLineBreakIterator() 57 : m_iterator(0) 58 , m_cachedPriorContext(0) 59 , m_cachedPriorContextLength(0) 60 { 61 resetPriorContext(); 62 } 63 64 LazyLineBreakIterator(String string, const AtomicString& locale = AtomicString()) 65 : m_string(string) 66 , m_locale(locale) 67 , m_iterator(0) 68 , m_cachedPriorContext(0) 69 , m_cachedPriorContextLength(0) 70 { 71 resetPriorContext(); 72 } 73 74 ~LazyLineBreakIterator() 75 { 76 if (m_iterator) 77 releaseLineBreakIterator(m_iterator); 78 } 79 80 String string() const { return m_string; } 81 82 UChar lastCharacter() const 83 { 84 COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length); 85 return m_priorContext[1]; 86 } 87 88 UChar secondToLastCharacter() const 89 { 90 COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length); 91 return m_priorContext[0]; 92 } 93 94 void setPriorContext(UChar last, UChar secondToLast) 95 { 96 COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length); 97 m_priorContext[0] = secondToLast; 98 m_priorContext[1] = last; 99 } 100 101 void updatePriorContext(UChar last) 102 { 103 COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length); 104 m_priorContext[0] = m_priorContext[1]; 105 m_priorContext[1] = last; 106 } 107 108 void resetPriorContext() 109 { 110 COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length); 111 m_priorContext[0] = 0; 112 m_priorContext[1] = 0; 113 } 114 115 unsigned priorContextLength() const 116 { 117 unsigned priorContextLength = 0; 118 COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length); 119 if (m_priorContext[1]) { 120 ++priorContextLength; 121 if (m_priorContext[0]) 122 ++priorContextLength; 123 } 124 return priorContextLength; 125 } 126 127 // Obtain text break iterator, possibly previously cached, where this iterator is (or has been) 128 // initialized to use the previously stored string as the primary breaking context and using 129 // previously stored prior context if non-empty. 130 TextBreakIterator* get(unsigned priorContextLength) 131 { 132 ASSERT(priorContextLength <= priorContextCapacity); 133 const UChar* priorContext = priorContextLength ? &m_priorContext[priorContextCapacity - priorContextLength] : 0; 134 if (!m_iterator) { 135 if (m_string.is8Bit()) 136 m_iterator = acquireLineBreakIterator(m_string.characters8(), m_string.length(), m_locale, priorContext, priorContextLength); 137 else 138 m_iterator = acquireLineBreakIterator(m_string.characters16(), m_string.length(), m_locale, priorContext, priorContextLength); 139 m_cachedPriorContext = priorContext; 140 m_cachedPriorContextLength = priorContextLength; 141 } else if (priorContext != m_cachedPriorContext || priorContextLength != m_cachedPriorContextLength) { 142 this->resetStringAndReleaseIterator(m_string, m_locale); 143 return this->get(priorContextLength); 144 } 145 return m_iterator; 146 } 147 148 void resetStringAndReleaseIterator(String string, const AtomicString& locale) 149 { 150 if (m_iterator) 151 releaseLineBreakIterator(m_iterator); 152 153 m_string = string; 154 m_locale = locale; 155 m_iterator = 0; 156 m_cachedPriorContext = 0; 157 m_cachedPriorContextLength = 0; 158 } 159 160 private: 161 static const unsigned priorContextCapacity = 2; 162 String m_string; 163 AtomicString m_locale; 164 TextBreakIterator* m_iterator; 165 UChar m_priorContext[priorContextCapacity]; 166 const UChar* m_cachedPriorContext; 167 unsigned m_cachedPriorContextLength; 168 }; 169 170 // Iterates over "extended grapheme clusters", as defined in UAX #29. 171 // Note that platform implementations may be less sophisticated - e.g. ICU prior to 172 // version 4.0 only supports "legacy grapheme clusters". 173 // Use this for general text processing, e.g. string truncation. 174 175 class PLATFORM_EXPORT NonSharedCharacterBreakIterator { 176 WTF_MAKE_NONCOPYABLE(NonSharedCharacterBreakIterator); 177 public: 178 explicit NonSharedCharacterBreakIterator(const String&); 179 NonSharedCharacterBreakIterator(const UChar*, unsigned length); 180 ~NonSharedCharacterBreakIterator(); 181 182 int next(); 183 int current(); 184 185 bool isBreak(int offset) const; 186 int preceding(int offset) const; 187 int following(int offset) const; 188 189 bool operator!() const 190 { 191 return !m_is8Bit && !m_iterator; 192 } 193 194 private: 195 void createIteratorForBuffer(const UChar*, unsigned length); 196 197 unsigned clusterLengthStartingAt(unsigned offset) const 198 { 199 ASSERT(m_is8Bit); 200 // The only Latin-1 Extended Grapheme Cluster is CR LF 201 return isCRBeforeLF(offset) ? 2 : 1; 202 } 203 204 bool isCRBeforeLF(unsigned offset) const 205 { 206 ASSERT(m_is8Bit); 207 return m_charaters8[offset] == '\r' && offset + 1 < m_length && m_charaters8[offset + 1] == '\n'; 208 } 209 210 bool isLFAfterCR(unsigned offset) const 211 { 212 ASSERT(m_is8Bit); 213 return m_charaters8[offset] == '\n' && offset >= 1 && m_charaters8[offset - 1] == '\r'; 214 } 215 216 bool m_is8Bit; 217 218 // For 8 bit strings, we implement the iterator ourselves. 219 const LChar* m_charaters8; 220 unsigned m_offset; 221 unsigned m_length; 222 223 // For 16 bit strings, we use a TextBreakIterator. 224 TextBreakIterator* m_iterator; 225 }; 226 227 // Counts the number of grapheme clusters. A surrogate pair or a sequence 228 // of a non-combining character and following combining characters is 229 // counted as 1 grapheme cluster. 230 PLATFORM_EXPORT unsigned numGraphemeClusters(const String&); 231 // Returns the number of characters which will be less than or equal to 232 // the specified grapheme cluster length. 233 PLATFORM_EXPORT unsigned numCharactersInGraphemeClusters(const String&, unsigned); 234 235 } 236 237 #endif 238