1 /* 2 * Copyright (C) 2006 Lars Knoll <lars (at) trolltech.com> 3 * Copyright (C) 2007, 2011, 2012 Apple Inc. All rights reserved. 4 * 5 * This library is free software; you can redistribute it and/or 6 * modify it under the terms of the GNU Library General Public 7 * License as published by the Free Software Foundation; either 8 * version 2 of the License, or (at your option) any later version. 9 * 10 * This library is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 * Library General Public License for more details. 14 * 15 * You should have received a copy of the GNU Library General Public License 16 * along with this library; see the file COPYING.LIB. If not, write to 17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 18 * Boston, MA 02110-1301, USA. 19 * 20 */ 21 22 #ifndef TextBreakIterator_h 23 #define TextBreakIterator_h 24 25 #include "platform/PlatformExport.h" 26 #include "wtf/text/AtomicString.h" 27 #include "wtf/unicode/Unicode.h" 28 29 namespace WebCore { 30 31 typedef icu::BreakIterator TextBreakIterator; 32 33 // Note: The returned iterator is good only until you get another iterator, with the exception of acquireLineBreakIterator. 34 35 // This is similar to character break iterator in most cases, but is subject to 36 // platform UI conventions. One notable example where this can be different 37 // from character break iterator is Thai prepend characters, see bug 24342. 38 // Use this for insertion point and selection manipulations. 39 PLATFORM_EXPORT TextBreakIterator* cursorMovementIterator(const UChar*, int length); 40 41 PLATFORM_EXPORT TextBreakIterator* wordBreakIterator(const String&, int start, int length); 42 PLATFORM_EXPORT TextBreakIterator* wordBreakIterator(const UChar*, int length); 43 PLATFORM_EXPORT TextBreakIterator* acquireLineBreakIterator(const LChar*, int length, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength); 44 PLATFORM_EXPORT TextBreakIterator* acquireLineBreakIterator(const UChar*, int length, const AtomicString& locale, const UChar* priorContext, unsigned priorContextLength); 45 PLATFORM_EXPORT void releaseLineBreakIterator(TextBreakIterator*); 46 PLATFORM_EXPORT TextBreakIterator* sentenceBreakIterator(const UChar*, int length); 47 48 PLATFORM_EXPORT bool isWordTextBreak(TextBreakIterator*); 49 50 const int TextBreakDone = -1; 51 52 class PLATFORM_EXPORT LazyLineBreakIterator { 53 public: 54 LazyLineBreakIterator() 55 : m_iterator(0) 56 , m_cachedPriorContext(0) 57 , m_cachedPriorContextLength(0) 58 { 59 resetPriorContext(); 60 } 61 62 LazyLineBreakIterator(String string, const AtomicString& locale = AtomicString()) 63 : m_string(string) 64 , m_locale(locale) 65 , m_iterator(0) 66 , m_cachedPriorContext(0) 67 , m_cachedPriorContextLength(0) 68 { 69 resetPriorContext(); 70 } 71 72 ~LazyLineBreakIterator() 73 { 74 if (m_iterator) 75 releaseLineBreakIterator(m_iterator); 76 } 77 78 String string() const { return m_string; } 79 80 UChar lastCharacter() const 81 { 82 COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length); 83 return m_priorContext[1]; 84 } 85 86 UChar secondToLastCharacter() const 87 { 88 COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length); 89 return m_priorContext[0]; 90 } 91 92 void setPriorContext(UChar last, UChar secondToLast) 93 { 94 COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length); 95 m_priorContext[0] = secondToLast; 96 m_priorContext[1] = last; 97 } 98 99 void updatePriorContext(UChar last) 100 { 101 COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length); 102 m_priorContext[0] = m_priorContext[1]; 103 m_priorContext[1] = last; 104 } 105 106 void resetPriorContext() 107 { 108 COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length); 109 m_priorContext[0] = 0; 110 m_priorContext[1] = 0; 111 } 112 113 unsigned priorContextLength() const 114 { 115 unsigned priorContextLength = 0; 116 COMPILE_ASSERT(WTF_ARRAY_LENGTH(m_priorContext) == 2, TextBreakIterator_unexpected_prior_context_length); 117 if (m_priorContext[1]) { 118 ++priorContextLength; 119 if (m_priorContext[0]) 120 ++priorContextLength; 121 } 122 return priorContextLength; 123 } 124 125 // Obtain text break iterator, possibly previously cached, where this iterator is (or has been) 126 // initialized to use the previously stored string as the primary breaking context and using 127 // previously stored prior context if non-empty. 128 TextBreakIterator* get(unsigned priorContextLength) 129 { 130 ASSERT(priorContextLength <= priorContextCapacity); 131 const UChar* priorContext = priorContextLength ? &m_priorContext[priorContextCapacity - priorContextLength] : 0; 132 if (!m_iterator) { 133 if (m_string.is8Bit()) 134 m_iterator = acquireLineBreakIterator(m_string.characters8(), m_string.length(), m_locale, priorContext, priorContextLength); 135 else 136 m_iterator = acquireLineBreakIterator(m_string.characters16(), m_string.length(), m_locale, priorContext, priorContextLength); 137 m_cachedPriorContext = priorContext; 138 m_cachedPriorContextLength = priorContextLength; 139 } else if (priorContext != m_cachedPriorContext || priorContextLength != m_cachedPriorContextLength) { 140 this->resetStringAndReleaseIterator(m_string, m_locale); 141 return this->get(priorContextLength); 142 } 143 return m_iterator; 144 } 145 146 void resetStringAndReleaseIterator(String string, const AtomicString& locale) 147 { 148 if (m_iterator) 149 releaseLineBreakIterator(m_iterator); 150 151 m_string = string; 152 m_locale = locale; 153 m_iterator = 0; 154 m_cachedPriorContext = 0; 155 m_cachedPriorContextLength = 0; 156 } 157 158 private: 159 static const unsigned priorContextCapacity = 2; 160 String m_string; 161 AtomicString m_locale; 162 TextBreakIterator* m_iterator; 163 UChar m_priorContext[priorContextCapacity]; 164 const UChar* m_cachedPriorContext; 165 unsigned m_cachedPriorContextLength; 166 }; 167 168 // Iterates over "extended grapheme clusters", as defined in UAX #29. 169 // Note that platform implementations may be less sophisticated - e.g. ICU prior to 170 // version 4.0 only supports "legacy grapheme clusters". 171 // Use this for general text processing, e.g. string truncation. 172 173 class PLATFORM_EXPORT NonSharedCharacterBreakIterator { 174 WTF_MAKE_NONCOPYABLE(NonSharedCharacterBreakIterator); 175 public: 176 explicit NonSharedCharacterBreakIterator(const String&); 177 NonSharedCharacterBreakIterator(const UChar*, unsigned length); 178 ~NonSharedCharacterBreakIterator(); 179 180 int next(); 181 int current(); 182 183 bool isBreak(int offset) const; 184 int preceding(int offset) const; 185 int following(int offset) const; 186 187 bool operator!() const 188 { 189 return !m_is8Bit && !m_iterator; 190 } 191 192 private: 193 void createIteratorForBuffer(const UChar*, unsigned length); 194 195 unsigned clusterLengthStartingAt(unsigned offset) const 196 { 197 ASSERT(m_is8Bit); 198 // The only Latin-1 Extended Grapheme Cluster is CR LF 199 return isCRBeforeLF(offset) ? 2 : 1; 200 } 201 202 bool isCRBeforeLF(unsigned offset) const 203 { 204 ASSERT(m_is8Bit); 205 return m_charaters8[offset] == '\r' && offset + 1 < m_length && m_charaters8[offset + 1] == '\n'; 206 } 207 208 bool isLFAfterCR(unsigned offset) const 209 { 210 ASSERT(m_is8Bit); 211 return m_charaters8[offset] == '\n' && offset >= 1 && m_charaters8[offset - 1] == '\r'; 212 } 213 214 bool m_is8Bit; 215 216 // For 8 bit strings, we implement the iterator ourselves. 217 const LChar* m_charaters8; 218 unsigned m_offset; 219 unsigned m_length; 220 221 // For 16 bit strings, we use a TextBreakIterator. 222 TextBreakIterator* m_iterator; 223 }; 224 225 // Counts the number of grapheme clusters. A surrogate pair or a sequence 226 // of a non-combining character and following combining characters is 227 // counted as 1 grapheme cluster. 228 PLATFORM_EXPORT unsigned numGraphemeClusters(const String&); 229 // Returns the number of characters which will be less than or equal to 230 // the specified grapheme cluster length. 231 PLATFORM_EXPORT unsigned numCharactersInGraphemeClusters(const String&, unsigned); 232 233 } 234 235 #endif 236