1 /* 2 * Copyright (C) 2006 Lars Knoll <lars (at) trolltech.com> 3 * Copyright (C) 2007 Apple Inc. All rights reserved. 4 * Copyright (C) 2008 Jrg Billeter <j (at) bitron.ch> 5 * Copyright (C) 2008 Dominik Rttsches <dominik.roettsches (at) access-company.com> 6 * 7 * This library is free software; you can redistribute it and/or 8 * modify it under the terms of the GNU Library General Public 9 * License as published by the Free Software Foundation; either 10 * version 2 of the License, or (at your option) any later version. 11 * 12 * This library is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 * Library General Public License for more details. 16 * 17 * You should have received a copy of the GNU Library General Public License 18 * along with this library; see the file COPYING.LIB. If not, write to 19 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 20 * Boston, MA 02110-1301, USA. 21 * 22 */ 23 24 #include "config.h" 25 #include "TextBreakIterator.h" 26 27 #include <pango/pango.h> 28 #include <wtf/gtk/GOwnPtr.h> 29 30 namespace WebCore { 31 32 enum UBreakIteratorType { 33 UBRK_CHARACTER, 34 UBRK_WORD, 35 UBRK_LINE, 36 UBRK_SENTENCE 37 }; 38 39 class TextBreakIterator { 40 public: 41 UBreakIteratorType m_type; 42 int m_length; 43 PangoLogAttr* m_logAttrs; 44 int m_index; 45 }; 46 47 static TextBreakIterator* setUpIterator(bool& createdIterator, TextBreakIterator*& iterator, 48 UBreakIteratorType type, const UChar* string, int length) 49 { 50 if (!string) 51 return 0; 52 53 if (!createdIterator) { 54 iterator = new TextBreakIterator(); 55 createdIterator = true; 56 } 57 if (!iterator) 58 return 0; 59 60 long utf8len; 61 GOwnPtr<char> utf8; 62 utf8.set(g_utf16_to_utf8(string, length, 0, &utf8len, 0)); 63 64 // FIXME: assumes no surrogate pairs 65 66 iterator->m_type = type; 67 iterator->m_length = length; 68 if (createdIterator) 69 g_free(iterator->m_logAttrs); 70 iterator->m_logAttrs = g_new0(PangoLogAttr, length + 1); 71 iterator->m_index = -1; 72 pango_get_log_attrs(utf8.get(), utf8len, -1, 0, iterator->m_logAttrs, length + 1); 73 74 return iterator; 75 } 76 77 TextBreakIterator* characterBreakIterator(const UChar* string, int length) 78 { 79 static bool createdCharacterBreakIterator = false; 80 static TextBreakIterator* staticCharacterBreakIterator; 81 return setUpIterator(createdCharacterBreakIterator, staticCharacterBreakIterator, UBRK_CHARACTER, string, length); 82 } 83 84 TextBreakIterator* cursorMovementIterator(const UChar* string, int length) 85 { 86 // FIXME: This needs closer inspection to achieve behaviour identical to the ICU version. 87 return characterBreakIterator(string, length); 88 } 89 90 TextBreakIterator* wordBreakIterator(const UChar* string, int length) 91 { 92 static bool createdWordBreakIterator = false; 93 static TextBreakIterator* staticWordBreakIterator; 94 return setUpIterator(createdWordBreakIterator, staticWordBreakIterator, UBRK_WORD, string, length); 95 } 96 97 TextBreakIterator* lineBreakIterator(const UChar* string, int length) 98 { 99 static bool createdLineBreakIterator = false; 100 static TextBreakIterator* staticLineBreakIterator; 101 return setUpIterator(createdLineBreakIterator, staticLineBreakIterator, UBRK_LINE, string, length); 102 } 103 104 TextBreakIterator* sentenceBreakIterator(const UChar* string, int length) 105 { 106 static bool createdSentenceBreakIterator = false; 107 static TextBreakIterator* staticSentenceBreakIterator; 108 return setUpIterator(createdSentenceBreakIterator, staticSentenceBreakIterator, UBRK_SENTENCE, string, length); 109 } 110 111 int textBreakFirst(TextBreakIterator* bi) 112 { 113 // see textBreakLast 114 115 int firstCursorPosition = -1; 116 int pos = 0; 117 while (pos <= bi->m_length && (firstCursorPosition < 0)) { 118 if (bi->m_logAttrs[pos].is_cursor_position) 119 firstCursorPosition = pos; 120 } 121 bi->m_index = firstCursorPosition; 122 return firstCursorPosition; 123 } 124 125 int textBreakLast(TextBreakIterator* bi) 126 { 127 // TextBreakLast is not meant to find just any break according to bi->m_type 128 // but really the one near the last character. 129 // (cmp ICU documentation for ubrk_first and ubrk_last) 130 // From ICU docs for ubrk_last: 131 // "Determine the index immediately beyond the last character in the text being scanned." 132 133 // So we should advance or traverse back based on bi->m_logAttrs cursor positions. 134 // If last character position in the original string is a whitespace, 135 // traverse to the left until the first non-white character position is found 136 // and return the position of the first white-space char after this one. 137 // Otherwise return m_length, as "the first character beyond the last" is outside our string. 138 139 bool whiteSpaceAtTheEnd = true; 140 int nextWhiteSpacePos = bi->m_length; 141 142 int pos = bi->m_length; 143 while (pos >= 0 && whiteSpaceAtTheEnd) { 144 if (bi->m_logAttrs[pos].is_cursor_position) { 145 if (whiteSpaceAtTheEnd = bi->m_logAttrs[pos].is_white) 146 nextWhiteSpacePos = pos; 147 } 148 pos--; 149 } 150 bi->m_index = nextWhiteSpacePos; 151 return nextWhiteSpacePos; 152 } 153 154 int textBreakNext(TextBreakIterator* bi) 155 { 156 for (int i = bi->m_index + 1; i <= bi->m_length; i++) { 157 158 // FIXME: UBRK_WORD case: Single multibyte characters (i.e. white space around them), such as the euro symbol , 159 // are not marked as word_start & word_end as opposed to the way ICU does it. 160 // This leads to - for example - different word selection behaviour when right clicking. 161 162 if ((bi->m_type == UBRK_LINE && bi->m_logAttrs[i].is_line_break) 163 || (bi->m_type == UBRK_WORD && (bi->m_logAttrs[i].is_word_start || bi->m_logAttrs[i].is_word_end)) 164 || (bi->m_type == UBRK_CHARACTER && bi->m_logAttrs[i].is_cursor_position) 165 || (bi->m_type == UBRK_SENTENCE && (bi->m_logAttrs[i].is_sentence_start || bi->m_logAttrs[i].is_sentence_end)) ) { 166 bi->m_index = i; 167 return i; 168 } 169 } 170 return TextBreakDone; 171 } 172 173 int textBreakPrevious(TextBreakIterator* bi) 174 { 175 for (int i = bi->m_index - 1; i >= 0; i--) { 176 if ((bi->m_type == UBRK_LINE && bi->m_logAttrs[i].is_line_break) 177 || (bi->m_type == UBRK_WORD && (bi->m_logAttrs[i].is_word_start || bi->m_logAttrs[i].is_word_end)) 178 || (bi->m_type == UBRK_CHARACTER && bi->m_logAttrs[i].is_cursor_position) 179 || (bi->m_type == UBRK_SENTENCE && (bi->m_logAttrs[i].is_sentence_start || bi->m_logAttrs[i].is_sentence_end)) ) { 180 bi->m_index = i; 181 return i; 182 } 183 } 184 return textBreakFirst(bi); 185 } 186 187 int textBreakPreceding(TextBreakIterator* bi, int pos) 188 { 189 bi->m_index = pos; 190 return textBreakPrevious(bi); 191 } 192 193 int textBreakFollowing(TextBreakIterator* bi, int pos) 194 { 195 if (pos < 0) 196 pos = -1; 197 bi->m_index = pos; 198 return textBreakNext(bi); 199 } 200 201 int textBreakCurrent(TextBreakIterator* bi) 202 { 203 return bi->m_index; 204 } 205 206 bool isTextBreak(TextBreakIterator* bi, int pos) 207 { 208 if (bi->m_index < 0) 209 return false; 210 211 return ((bi->m_type == UBRK_LINE && bi->m_logAttrs[bi->m_index].is_line_break) 212 || (bi->m_type == UBRK_WORD && bi->m_logAttrs[bi->m_index].is_word_end) 213 || (bi->m_type == UBRK_CHARACTER && bi->m_logAttrs[bi->m_index].is_char_break) 214 || (bi->m_type == UBRK_SENTENCE && bi->m_logAttrs[bi->m_index].is_sentence_end) ); 215 } 216 217 } 218