1 /* 2 * Copyright (C) 2006 Lars Knoll <lars (at) trolltech.com> 3 * Copyright (C) 2007 Apple Inc. All rights reserved. 4 * Copyright (C) 2008 Jrg Billeter <j (at) bitron.ch> 5 * Copyright (C) 2008 Dominik Rttsches <dominik.roettsches (at) access-company.com> 6 * Copyright (C) 2010 Igalia S.L. 7 * 8 * This library is free software; you can redistribute it and/or 9 * modify it under the terms of the GNU Library General Public 10 * License as published by the Free Software Foundation; either 11 * version 2 of the License, or (at your option) any later version. 12 * 13 * This library is distributed in the hope that it will be useful, 14 * but WITHOUT ANY WARRANTY; without even the implied warranty of 15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 16 * Library General Public License for more details. 17 * 18 * You should have received a copy of the GNU Library General Public License 19 * along with this library; see the file COPYING.LIB. If not, write to 20 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 21 * Boston, MA 02110-1301, USA. 22 * 23 */ 24 25 #include "config.h" 26 27 #include "TextBreakIterator.h" 28 29 #include "GOwnPtr.h" 30 #include <pango/pango.h> 31 using namespace std; 32 33 #define UTF8_IS_SURROGATE(character) (character >= 0x10000 && character <= 0x10FFFF) 34 35 namespace WebCore { 36 37 class CharacterIterator { 38 public: 39 bool setText(const UChar* string, int length); 40 const gchar* getText() { return m_utf8.get(); } 41 int getLength() { return m_length; } 42 glong getSize() { return m_size; } 43 void setIndex(int index); 44 int getIndex() { return m_index; } 45 void setUTF16Index(int index); 46 int getUTF16Index() { return m_utf16Index; } 47 int getUTF16Length() { return m_utf16Length; } 48 int first(); 49 int last(); 50 int next(); 51 int previous(); 52 private: 53 int characterSize(int index); 54 55 GOwnPtr<char> m_utf8; 56 int m_length; 57 long m_size; 58 int m_index; 59 int m_utf16Index; 60 int m_utf16Length; 61 }; 62 63 int CharacterIterator::characterSize(int index) 64 { 65 if (index == m_length || index < 0) 66 return 0; 67 if (m_length == m_utf16Length) 68 return 1; 69 70 gchar* indexPtr = g_utf8_offset_to_pointer(m_utf8.get(), index); 71 gunichar character = g_utf8_get_char(indexPtr); 72 return UTF8_IS_SURROGATE(character) ? 2 : 1; 73 } 74 75 bool CharacterIterator::setText(const UChar* string, int length) 76 { 77 long utf8Size = 0; 78 m_utf8.set(g_utf16_to_utf8(string, length, 0, &utf8Size, 0)); 79 if (!utf8Size) 80 return false; 81 82 m_utf16Length = length; 83 m_length = g_utf8_strlen(m_utf8.get(), utf8Size); 84 m_size = utf8Size; 85 m_index = 0; 86 m_utf16Index = 0; 87 88 return true; 89 } 90 91 void CharacterIterator::setIndex(int index) 92 { 93 if (index == m_index) 94 return; 95 if (index <= 0) 96 m_index = m_utf16Index = 0; 97 else if (index >= m_length) { 98 m_index = m_length; 99 m_utf16Index = m_utf16Length; 100 } else if (m_length == m_utf16Length) 101 m_index = m_utf16Index = index; 102 else { 103 m_index = index; 104 int utf16Index = 0; 105 int utf8Index = 0; 106 while (utf8Index < index) { 107 utf16Index += characterSize(utf8Index); 108 utf8Index++; 109 } 110 m_utf16Index = utf16Index; 111 } 112 } 113 114 void CharacterIterator::setUTF16Index(int index) 115 { 116 if (index == m_utf16Index) 117 return; 118 if (index <= 0) 119 m_utf16Index = m_index = 0; 120 else if (index >= m_utf16Length) { 121 m_utf16Index = m_utf16Length; 122 m_index = m_length; 123 } else if (m_length == m_utf16Length) 124 m_utf16Index = m_index = index; 125 else { 126 m_utf16Index = index; 127 int utf16Index = 0; 128 int utf8Index = 0; 129 while (utf16Index < index) { 130 utf16Index += characterSize(utf8Index); 131 utf8Index++; 132 } 133 m_index = utf8Index; 134 } 135 } 136 137 int CharacterIterator::first() 138 { 139 m_index = m_utf16Index = 0; 140 return m_index; 141 } 142 143 int CharacterIterator::last() 144 { 145 m_index = m_length; 146 m_utf16Index = m_utf16Length; 147 return m_index; 148 } 149 150 int CharacterIterator::next() 151 { 152 int next = m_index + 1; 153 154 if (next <= m_length) { 155 m_utf16Index = min(m_utf16Index + characterSize(m_index), m_utf16Length); 156 m_index = next; 157 } else { 158 m_index = TextBreakDone; 159 m_utf16Index = TextBreakDone; 160 } 161 162 return m_index; 163 } 164 165 int CharacterIterator::previous() 166 { 167 int previous = m_index - 1; 168 169 if (previous >= 0) { 170 m_utf16Index = max(m_utf16Index - characterSize(previous), 0); 171 m_index = previous; 172 } else { 173 m_index = TextBreakDone; 174 m_utf16Index = TextBreakDone; 175 } 176 177 return m_index; 178 } 179 180 enum UBreakIteratorType { 181 UBRK_CHARACTER, 182 UBRK_WORD, 183 UBRK_LINE, 184 UBRK_SENTENCE 185 }; 186 187 class TextBreakIterator { 188 public: 189 UBreakIteratorType m_type; 190 PangoLogAttr* m_logAttrs; 191 CharacterIterator m_charIterator; 192 }; 193 194 static TextBreakIterator* setUpIterator(bool& createdIterator, TextBreakIterator*& iterator, 195 UBreakIteratorType type, const UChar* string, int length) 196 { 197 if (!string) 198 return 0; 199 200 if (!createdIterator) { 201 iterator = new TextBreakIterator(); 202 createdIterator = true; 203 } 204 if (!iterator) 205 return 0; 206 207 if (!iterator->m_charIterator.setText(string, length)) 208 return 0; 209 210 int charLength = iterator->m_charIterator.getLength(); 211 212 iterator->m_type = type; 213 if (createdIterator) 214 g_free(iterator->m_logAttrs); 215 iterator->m_logAttrs = g_new0(PangoLogAttr, charLength + 1); 216 pango_get_log_attrs(iterator->m_charIterator.getText(), iterator->m_charIterator.getSize(), 217 -1, 0, iterator->m_logAttrs, charLength + 1); 218 219 return iterator; 220 } 221 222 TextBreakIterator* characterBreakIterator(const UChar* string, int length) 223 { 224 static bool createdCharacterBreakIterator = false; 225 static TextBreakIterator* staticCharacterBreakIterator; 226 return setUpIterator(createdCharacterBreakIterator, staticCharacterBreakIterator, UBRK_CHARACTER, string, length); 227 } 228 229 TextBreakIterator* cursorMovementIterator(const UChar* string, int length) 230 { 231 // FIXME: This needs closer inspection to achieve behaviour identical to the ICU version. 232 return characterBreakIterator(string, length); 233 } 234 235 TextBreakIterator* wordBreakIterator(const UChar* string, int length) 236 { 237 static bool createdWordBreakIterator = false; 238 static TextBreakIterator* staticWordBreakIterator; 239 return setUpIterator(createdWordBreakIterator, staticWordBreakIterator, UBRK_WORD, string, length); 240 } 241 242 static bool createdLineBreakIterator = false; 243 static TextBreakIterator* staticLineBreakIterator; 244 245 TextBreakIterator* acquireLineBreakIterator(const UChar* string, int length) 246 { 247 TextBreakIterator* lineBreakIterator = 0; 248 if (!createdLineBreakIterator || staticLineBreakIterator) { 249 setUpIterator(createdLineBreakIterator, staticLineBreakIterator, UBRK_LINE, string, length); 250 swap(staticLineBreakIterator, lineBreakIterator); 251 } 252 253 if (!lineBreakIterator) { 254 bool createdNewLineBreakIterator = false; 255 setUpIterator(createdNewLineBreakIterator, lineBreakIterator, UBRK_LINE, string, length); 256 } 257 258 return lineBreakIterator; 259 } 260 261 void releaseLineBreakIterator(TextBreakIterator* iterator) 262 { 263 ASSERT(createdLineBreakIterator); 264 ASSERT(iterator); 265 266 if (!staticLineBreakIterator) 267 staticLineBreakIterator = iterator; 268 else 269 delete iterator; 270 } 271 272 TextBreakIterator* sentenceBreakIterator(const UChar* string, int length) 273 { 274 static bool createdSentenceBreakIterator = false; 275 static TextBreakIterator* staticSentenceBreakIterator; 276 return setUpIterator(createdSentenceBreakIterator, staticSentenceBreakIterator, UBRK_SENTENCE, string, length); 277 } 278 279 int textBreakFirst(TextBreakIterator* iterator) 280 { 281 iterator->m_charIterator.first(); 282 return iterator->m_charIterator.getUTF16Index(); 283 } 284 285 int textBreakLast(TextBreakIterator* iterator) 286 { 287 // TextBreakLast is not meant to find just any break according to bi->m_type 288 // but really the one near the last character. 289 // (cmp ICU documentation for ubrk_first and ubrk_last) 290 // From ICU docs for ubrk_last: 291 // "Determine the index immediately beyond the last character in the text being scanned." 292 293 // So we should advance or traverse back based on bi->m_logAttrs cursor positions. 294 // If last character position in the original string is a whitespace, 295 // traverse to the left until the first non-white character position is found 296 // and return the position of the first white-space char after this one. 297 // Otherwise return m_length, as "the first character beyond the last" is outside our string. 298 299 bool whiteSpaceAtTheEnd = true; 300 int nextWhiteSpacePos = iterator->m_charIterator.getLength(); 301 302 int pos = iterator->m_charIterator.last(); 303 while (pos >= 0 && whiteSpaceAtTheEnd) { 304 if (iterator->m_logAttrs[pos].is_cursor_position) { 305 if (whiteSpaceAtTheEnd = iterator->m_logAttrs[pos].is_white) 306 nextWhiteSpacePos = pos; 307 } 308 pos = iterator->m_charIterator.previous(); 309 } 310 iterator->m_charIterator.setIndex(nextWhiteSpacePos); 311 return iterator->m_charIterator.getUTF16Index(); 312 } 313 314 int textBreakNext(TextBreakIterator* iterator) 315 { 316 while (iterator->m_charIterator.next() != TextBreakDone) { 317 int index = iterator->m_charIterator.getIndex(); 318 319 // FIXME: UBRK_WORD case: Single multibyte characters (i.e. white space around them), such as the euro symbol , 320 // are not marked as word_start & word_end as opposed to the way ICU does it. 321 // This leads to - for example - different word selection behaviour when right clicking. 322 323 if ((iterator->m_type == UBRK_LINE && iterator->m_logAttrs[index].is_line_break) 324 || (iterator->m_type == UBRK_WORD && (iterator->m_logAttrs[index].is_word_start || iterator->m_logAttrs[index].is_word_end)) 325 || (iterator->m_type == UBRK_CHARACTER && iterator->m_logAttrs[index].is_cursor_position) 326 || (iterator->m_type == UBRK_SENTENCE && iterator->m_logAttrs[index].is_sentence_boundary)) { 327 break; 328 } 329 } 330 return iterator->m_charIterator.getUTF16Index(); 331 } 332 333 int textBreakPrevious(TextBreakIterator* iterator) 334 { 335 while (iterator->m_charIterator.previous() != TextBreakDone) { 336 int index = iterator->m_charIterator.getIndex(); 337 338 if ((iterator->m_type == UBRK_LINE && iterator->m_logAttrs[index].is_line_break) 339 || (iterator->m_type == UBRK_WORD && (iterator->m_logAttrs[index].is_word_start || iterator->m_logAttrs[index].is_word_end)) 340 || (iterator->m_type == UBRK_CHARACTER && iterator->m_logAttrs[index].is_cursor_position) 341 || (iterator->m_type == UBRK_SENTENCE && iterator->m_logAttrs[index].is_sentence_boundary)) { 342 break; 343 } 344 } 345 return iterator->m_charIterator.getUTF16Index(); 346 } 347 348 int textBreakPreceding(TextBreakIterator* iterator, int offset) 349 { 350 if (offset > iterator->m_charIterator.getUTF16Length()) 351 return TextBreakDone; 352 if (offset < 0) 353 return 0; 354 iterator->m_charIterator.setUTF16Index(offset); 355 return textBreakPrevious(iterator); 356 } 357 358 int textBreakFollowing(TextBreakIterator* iterator, int offset) 359 { 360 if (offset > iterator->m_charIterator.getUTF16Length()) 361 return TextBreakDone; 362 if (offset < 0) 363 return 0; 364 iterator->m_charIterator.setUTF16Index(offset); 365 return textBreakNext(iterator); 366 } 367 368 int textBreakCurrent(TextBreakIterator* iterator) 369 { 370 return iterator->m_charIterator.getUTF16Index(); 371 } 372 373 bool isTextBreak(TextBreakIterator* iterator, int offset) 374 { 375 if (!offset) 376 return true; 377 if (offset > iterator->m_charIterator.getUTF16Length()) 378 return false; 379 380 iterator->m_charIterator.setUTF16Index(offset); 381 382 int index = iterator->m_charIterator.getIndex(); 383 iterator->m_charIterator.previous(); 384 textBreakNext(iterator); 385 return iterator->m_charIterator.getIndex() == index; 386 } 387 388 } 389