1 /* 2 * Copyright (C) 2006 Lars Knoll <lars (at) trolltech.com> 3 * Copyright (C) 2007, 2011, 2012 Apple Inc. All rights reserved. 4 * 5 * This library is free software; you can redistribute it and/or 6 * modify it under the terms of the GNU Library General Public 7 * License as published by the Free Software Foundation; either 8 * version 2 of the License, or (at your option) any later version. 9 * 10 * This library is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 * Library General Public License for more details. 14 * 15 * You should have received a copy of the GNU Library General Public License 16 * along with this library; see the file COPYING.LIB. If not, write to 17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 18 * Boston, MA 02110-1301, USA. 19 * 20 */ 21 22 #ifndef TextBreakIterator_h 23 #define TextBreakIterator_h 24 25 #include "wtf/text/AtomicString.h" 26 #include "wtf/unicode/Unicode.h" 27 28 namespace WebCore { 29 30 class TextBreakIterator; 31 32 // Note: The returned iterator is good only until you get another iterator, with the exception of acquireLineBreakIterator. 33 34 // This is similar to character break iterator in most cases, but is subject to 35 // platform UI conventions. One notable example where this can be different 36 // from character break iterator is Thai prepend characters, see bug 24342. 37 // Use this for insertion point and selection manipulations. 38 TextBreakIterator* cursorMovementIterator(const UChar*, int length); 39 40 TextBreakIterator* wordBreakIterator(const String&, int start, int length); 41 TextBreakIterator* wordBreakIterator(const UChar*, int length); 42 TextBreakIterator* acquireLineBreakIterator(const LChar*, int length, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength); 43 TextBreakIterator* acquireLineBreakIterator(const UChar*, int length, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength); 44 void releaseLineBreakIterator(TextBreakIterator*); 45 TextBreakIterator* sentenceBreakIterator(const UChar*, int length); 46 47 int textBreakFirst(TextBreakIterator*); 48 int textBreakLast(TextBreakIterator*); 49 int textBreakNext(TextBreakIterator*); 50 int textBreakPrevious(TextBreakIterator*); 51 int textBreakCurrent(TextBreakIterator*); 52 int textBreakPreceding(TextBreakIterator*, int); 53 int textBreakFollowing(TextBreakIterator*, int); 54 bool isTextBreak(TextBreakIterator*, int); 55 bool isWordTextBreak(TextBreakIterator*); 56 57 const int TextBreakDone = -1; 58 59 class LazyLineBreakIterator { 60 public: 61 LazyLineBreakIterator() 62 : m_iterator(0) 63 , m_cachedPriorContext(0) 64 , m_cachedPriorContextLength(0) 65 { 66 resetPriorContext(); 67 } 68 69 LazyLineBreakIterator(String string, const AtomicString& locale = AtomicString()) 70 : m_string(string) 71 , m_locale(locale) 72 , m_iterator(0) 73 , m_cachedPriorContext(0) 74 , m_cachedPriorContextLength(0) 75 { 76 resetPriorContext(); 77 } 78 79 ~LazyLineBreakIterator() 80 { 81 if (m_iterator) 82 releaseLineBreakIterator(m_iterator); 83 } 84 85 String string() const { return m_string; } 86 87 UChar lastCharacter() const 88 { 89 COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length); 90 return m_priorContext[1]; 91 } 92 93 UChar secondToLastCharacter() const 94 { 95 COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length); 96 return m_priorContext[0]; 97 } 98 99 void setPriorContext(UChar last, UChar secondToLast) 100 { 101 COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length); 102 m_priorContext[0] = secondToLast; 103 m_priorContext[1] = last; 104 } 105 106 void updatePriorContext(UChar last) 107 { 108 COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length); 109 m_priorContext[0] = m_priorContext[1]; 110 m_priorContext[1] = last; 111 } 112 113 void resetPriorContext() 114 { 115 COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length); 116 m_priorContext[0] = 0; 117 m_priorContext[1] = 0; 118 } 119 120 unsigned priorContextLength() const 121 { 122 unsigned priorContextLength = 0; 123 COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length); 124 if (m_priorContext[1]) { 125 ++priorContextLength; 126 if (m_priorContext[0]) 127 ++priorContextLength; 128 } 129 return priorContextLength; 130 } 131 132 // Obtain text break iterator, possibly previously cached, where this iterator is (or has been) 133 // initialized to use the previously stored string as the primary breaking context and using 134 // previously stored prior context if non-empty. 135 TextBreakIterator* get(unsigned priorContextLength) 136 { 137 ASSERT(priorContextLength <= priorContextCapacity); 138 const UChar* priorContext = priorContextLength ? &m_priorContext[priorContextCapacity - priorContextLength] : 0; 139 if (!m_iterator) { 140 if (m_string.is8Bit()) 141 m_iterator = acquireLineBreakIterator(m_string.characters8(), m_string.length(), m_locale, priorContext, priorContextLength); 142 else 143 m_iterator = acquireLineBreakIterator(m_string.characters16(), m_string.length(), m_locale, priorContext, priorContextLength); 144 m_cachedPriorContext = priorContext; 145 m_cachedPriorContextLength = priorContextLength; 146 } else if (priorContext != m_cachedPriorContext || priorContextLength != m_cachedPriorContextLength) { 147 this->resetStringAndReleaseIterator(m_string, m_locale); 148 return this->get(priorContextLength); 149 } 150 return m_iterator; 151 } 152 153 void resetStringAndReleaseIterator(String string, const AtomicString& locale) 154 { 155 if (m_iterator) 156 releaseLineBreakIterator(m_iterator); 157 158 m_string = string; 159 m_locale = locale; 160 m_iterator = 0; 161 m_cachedPriorContext = 0; 162 m_cachedPriorContextLength = 0; 163 } 164 165 private: 166 static const unsigned priorContextCapacity = 2; 167 String m_string; 168 AtomicString m_locale; 169 TextBreakIterator* m_iterator; 170 UChar m_priorContext[priorContextCapacity]; 171 const UChar* m_cachedPriorContext; 172 unsigned m_cachedPriorContextLength; 173 }; 174 175 // Iterates over "extended grapheme clusters", as defined in UAX #29. 176 // Note that platform implementations may be less sophisticated - e.g. ICU prior to 177 // version 4.0 only supports "legacy grapheme clusters". 178 // Use this for general text processing, e.g. string truncation. 179 180 class NonSharedCharacterBreakIterator { 181 WTF_MAKE_NONCOPYABLE(NonSharedCharacterBreakIterator); 182 public: 183 explicit NonSharedCharacterBreakIterator(const String&); 184 NonSharedCharacterBreakIterator(const UChar*, unsigned length); 185 ~NonSharedCharacterBreakIterator(); 186 187 int next(); 188 int current(); 189 190 bool isBreak(int offset) const; 191 int preceding(int offset) const; 192 int following(int offset) const; 193 194 bool operator!() const 195 { 196 return !m_is8Bit && !m_iterator; 197 } 198 199 private: 200 void createIteratorForBuffer(const UChar*, unsigned length); 201 202 unsigned clusterLengthStartingAt(unsigned offset) const 203 { 204 ASSERT(m_is8Bit); 205 // The only Latin-1 Extended Grapheme Cluster is CR LF 206 return isCRBeforeLF(offset) ? 2 : 1; 207 } 208 209 bool isCRBeforeLF(unsigned offset) const 210 { 211 ASSERT(m_is8Bit); 212 return m_charaters8[offset] == '\r' && offset + 1 < m_length && m_charaters8[offset + 1] == '\n'; 213 } 214 215 bool isLFAfterCR(unsigned offset) const 216 { 217 ASSERT(m_is8Bit); 218 return m_charaters8[offset] == '\n' && offset >= 1 && m_charaters8[offset - 1] == '\r'; 219 } 220 221 bool m_is8Bit; 222 223 // For 8 bit strings, we implement the iterator ourselves. 224 const LChar* m_charaters8; 225 unsigned m_offset; 226 unsigned m_length; 227 228 // For 16 bit strings, we use a TextBreakIterator. 229 TextBreakIterator* m_iterator; 230 }; 231 232 // Counts the number of grapheme clusters. A surrogate pair or a sequence 233 // of a non-combining character and following combining characters is 234 // counted as 1 grapheme cluster. 235 unsigned numGraphemeClusters(const String&); 236 // Returns the number of characters which will be less than or equal to 237 // the specified grapheme cluster length. 238 unsigned numCharactersInGraphemeClusters(const String&, unsigned); 239 240 } 241 242 #endif 243