Home | History | Annotate | Download | only in gtk
      1 /*
      2  * Copyright (C) 2006 Lars Knoll <lars (at) trolltech.com>
      3  * Copyright (C) 2007 Apple Inc. All rights reserved.
      4  * Copyright (C) 2008 Jürg Billeter <j (at) bitron.ch>
      5  * Copyright (C) 2008 Dominik Röttsches <dominik.roettsches (at) access-company.com>
      6  * Copyright (C) 2010 Igalia S.L.
      7  *
      8  * This library is free software; you can redistribute it and/or
      9  * modify it under the terms of the GNU Library General Public
     10  * License as published by the Free Software Foundation; either
     11  * version 2 of the License, or (at your option) any later version.
     12  *
     13  * This library is distributed in the hope that it will be useful,
     14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
     15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     16  * Library General Public License for more details.
     17  *
     18  * You should have received a copy of the GNU Library General Public License
     19  * along with this library; see the file COPYING.LIB.  If not, write to
     20  * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
     21  * Boston, MA 02110-1301, USA.
     22  *
     23  */
     24 
     25 #include "config.h"
     26 
     27 #include "TextBreakIterator.h"
     28 
     29 #include "GOwnPtr.h"
     30 #include <pango/pango.h>
     31 using namespace std;
     32 
     33 #define UTF8_IS_SURROGATE(character) (character >= 0x10000 && character <= 0x10FFFF)
     34 
     35 namespace WebCore {
     36 
     37 class CharacterIterator {
     38 public:
     39     bool setText(const UChar* string, int length);
     40     const gchar* getText() { return m_utf8.get(); }
     41     int getLength() { return m_length; }
     42     glong getSize() { return m_size; }
     43     void setIndex(int index);
     44     int getIndex() { return m_index; }
     45     void setUTF16Index(int index);
     46     int getUTF16Index() { return m_utf16Index; }
     47     int getUTF16Length() { return m_utf16Length; }
     48     int first();
     49     int last();
     50     int next();
     51     int previous();
     52 private:
     53     int characterSize(int index);
     54 
     55     GOwnPtr<char> m_utf8;
     56     int m_length;
     57     long m_size;
     58     int m_index;
     59     int m_utf16Index;
     60     int m_utf16Length;
     61 };
     62 
     63 int CharacterIterator::characterSize(int index)
     64 {
     65     if (index == m_length || index < 0)
     66         return 0;
     67     if (m_length == m_utf16Length)
     68         return 1;
     69 
     70     gchar* indexPtr = g_utf8_offset_to_pointer(m_utf8.get(), index);
     71     gunichar character = g_utf8_get_char(indexPtr);
     72     return UTF8_IS_SURROGATE(character) ? 2 : 1;
     73 }
     74 
     75 bool CharacterIterator::setText(const UChar* string, int length)
     76 {
     77     long utf8Size = 0;
     78     m_utf8.set(g_utf16_to_utf8(string, length, 0, &utf8Size, 0));
     79     if (!utf8Size)
     80         return false;
     81 
     82     m_utf16Length = length;
     83     m_length = g_utf8_strlen(m_utf8.get(), utf8Size);
     84     m_size = utf8Size;
     85     m_index = 0;
     86     m_utf16Index = 0;
     87 
     88     return true;
     89 }
     90 
     91 void CharacterIterator::setIndex(int index)
     92 {
     93     if (index == m_index)
     94         return;
     95     if (index <= 0)
     96         m_index = m_utf16Index = 0;
     97     else if (index >= m_length) {
     98         m_index = m_length;
     99         m_utf16Index = m_utf16Length;
    100     } else if (m_length == m_utf16Length)
    101         m_index = m_utf16Index = index;
    102     else {
    103         m_index = index;
    104         int utf16Index = 0;
    105         int utf8Index = 0;
    106         while (utf8Index < index) {
    107             utf16Index += characterSize(utf8Index);
    108             utf8Index++;
    109         }
    110         m_utf16Index = utf16Index;
    111     }
    112 }
    113 
    114 void CharacterIterator::setUTF16Index(int index)
    115 {
    116     if (index == m_utf16Index)
    117         return;
    118     if (index <= 0)
    119         m_utf16Index = m_index = 0;
    120     else if (index >= m_utf16Length) {
    121         m_utf16Index = m_utf16Length;
    122         m_index = m_length;
    123     } else if (m_length == m_utf16Length)
    124         m_utf16Index = m_index = index;
    125     else {
    126         m_utf16Index = index;
    127         int utf16Index = 0;
    128         int utf8Index = 0;
    129         while (utf16Index < index) {
    130             utf16Index += characterSize(utf8Index);
    131             utf8Index++;
    132         }
    133         m_index = utf8Index;
    134     }
    135 }
    136 
    137 int CharacterIterator::first()
    138 {
    139     m_index = m_utf16Index = 0;
    140     return m_index;
    141 }
    142 
    143 int CharacterIterator::last()
    144 {
    145     m_index = m_length;
    146     m_utf16Index = m_utf16Length;
    147     return m_index;
    148 }
    149 
    150 int CharacterIterator::next()
    151 {
    152     int next = m_index + 1;
    153 
    154     if (next <= m_length) {
    155         m_utf16Index = min(m_utf16Index + characterSize(m_index), m_utf16Length);
    156         m_index = next;
    157     } else {
    158         m_index = TextBreakDone;
    159         m_utf16Index = TextBreakDone;
    160     }
    161 
    162     return m_index;
    163 }
    164 
    165 int CharacterIterator::previous()
    166 {
    167     int previous = m_index - 1;
    168 
    169     if (previous >= 0) {
    170         m_utf16Index = max(m_utf16Index - characterSize(previous), 0);
    171         m_index = previous;
    172     } else {
    173         m_index = TextBreakDone;
    174         m_utf16Index = TextBreakDone;
    175     }
    176 
    177     return m_index;
    178 }
    179 
    180 enum UBreakIteratorType {
    181     UBRK_CHARACTER,
    182     UBRK_WORD,
    183     UBRK_LINE,
    184     UBRK_SENTENCE
    185 };
    186 
    187 class TextBreakIterator {
    188 public:
    189     UBreakIteratorType m_type;
    190     PangoLogAttr* m_logAttrs;
    191     CharacterIterator m_charIterator;
    192 };
    193 
    194 static TextBreakIterator* setUpIterator(bool& createdIterator, TextBreakIterator*& iterator,
    195     UBreakIteratorType type, const UChar* string, int length)
    196 {
    197     if (!string)
    198         return 0;
    199 
    200     if (!createdIterator) {
    201         iterator = new TextBreakIterator();
    202         createdIterator = true;
    203     }
    204     if (!iterator)
    205         return 0;
    206 
    207     if (!iterator->m_charIterator.setText(string, length))
    208         return 0;
    209 
    210     int charLength = iterator->m_charIterator.getLength();
    211 
    212     iterator->m_type = type;
    213     if (createdIterator)
    214         g_free(iterator->m_logAttrs);
    215     iterator->m_logAttrs = g_new0(PangoLogAttr, charLength + 1);
    216     pango_get_log_attrs(iterator->m_charIterator.getText(), iterator->m_charIterator.getSize(),
    217                         -1, 0, iterator->m_logAttrs, charLength + 1);
    218 
    219     return iterator;
    220 }
    221 
    222 TextBreakIterator* characterBreakIterator(const UChar* string, int length)
    223 {
    224     static bool createdCharacterBreakIterator = false;
    225     static TextBreakIterator* staticCharacterBreakIterator;
    226     return setUpIterator(createdCharacterBreakIterator, staticCharacterBreakIterator, UBRK_CHARACTER, string, length);
    227 }
    228 
    229 TextBreakIterator* cursorMovementIterator(const UChar* string, int length)
    230 {
    231     // FIXME: This needs closer inspection to achieve behaviour identical to the ICU version.
    232     return characterBreakIterator(string, length);
    233 }
    234 
    235 TextBreakIterator* wordBreakIterator(const UChar* string, int length)
    236 {
    237     static bool createdWordBreakIterator = false;
    238     static TextBreakIterator* staticWordBreakIterator;
    239     return setUpIterator(createdWordBreakIterator, staticWordBreakIterator, UBRK_WORD, string, length);
    240 }
    241 
    242 static bool createdLineBreakIterator = false;
    243 static TextBreakIterator* staticLineBreakIterator;
    244 
    245 TextBreakIterator* acquireLineBreakIterator(const UChar* string, int length)
    246 {
    247     TextBreakIterator* lineBreakIterator = 0;
    248     if (!createdLineBreakIterator || staticLineBreakIterator) {
    249         setUpIterator(createdLineBreakIterator, staticLineBreakIterator, UBRK_LINE, string, length);
    250         swap(staticLineBreakIterator, lineBreakIterator);
    251     }
    252 
    253     if (!lineBreakIterator) {
    254         bool createdNewLineBreakIterator = false;
    255         setUpIterator(createdNewLineBreakIterator, lineBreakIterator, UBRK_LINE, string, length);
    256     }
    257 
    258     return lineBreakIterator;
    259 }
    260 
    261 void releaseLineBreakIterator(TextBreakIterator* iterator)
    262 {
    263     ASSERT(createdLineBreakIterator);
    264     ASSERT(iterator);
    265 
    266     if (!staticLineBreakIterator)
    267         staticLineBreakIterator = iterator;
    268     else
    269         delete iterator;
    270 }
    271 
    272 TextBreakIterator* sentenceBreakIterator(const UChar* string, int length)
    273 {
    274     static bool createdSentenceBreakIterator = false;
    275     static TextBreakIterator* staticSentenceBreakIterator;
    276     return setUpIterator(createdSentenceBreakIterator, staticSentenceBreakIterator, UBRK_SENTENCE, string, length);
    277 }
    278 
    279 int textBreakFirst(TextBreakIterator* iterator)
    280 {
    281     iterator->m_charIterator.first();
    282     return iterator->m_charIterator.getUTF16Index();
    283 }
    284 
    285 int textBreakLast(TextBreakIterator* iterator)
    286 {
    287     // TextBreakLast is not meant to find just any break according to bi->m_type
    288     // but really the one near the last character.
    289     // (cmp ICU documentation for ubrk_first and ubrk_last)
    290     // From ICU docs for ubrk_last:
    291     // "Determine the index immediately beyond the last character in the text being scanned."
    292 
    293     // So we should advance or traverse back based on bi->m_logAttrs cursor positions.
    294     // If last character position in the original string is a whitespace,
    295     // traverse to the left until the first non-white character position is found
    296     // and return the position of the first white-space char after this one.
    297     // Otherwise return m_length, as "the first character beyond the last" is outside our string.
    298 
    299     bool whiteSpaceAtTheEnd = true;
    300     int nextWhiteSpacePos = iterator->m_charIterator.getLength();
    301 
    302     int pos = iterator->m_charIterator.last();
    303     while (pos >= 0 && whiteSpaceAtTheEnd) {
    304         if (iterator->m_logAttrs[pos].is_cursor_position) {
    305             if (whiteSpaceAtTheEnd = iterator->m_logAttrs[pos].is_white)
    306                 nextWhiteSpacePos = pos;
    307         }
    308         pos = iterator->m_charIterator.previous();
    309     }
    310     iterator->m_charIterator.setIndex(nextWhiteSpacePos);
    311     return iterator->m_charIterator.getUTF16Index();
    312 }
    313 
    314 int textBreakNext(TextBreakIterator* iterator)
    315 {
    316     while (iterator->m_charIterator.next() != TextBreakDone) {
    317         int index = iterator->m_charIterator.getIndex();
    318 
    319         // FIXME: UBRK_WORD case: Single multibyte characters (i.e. white space around them), such as the euro symbol ,
    320         // are not marked as word_start & word_end as opposed to the way ICU does it.
    321         // This leads to - for example - different word selection behaviour when right clicking.
    322 
    323         if ((iterator->m_type == UBRK_LINE && iterator->m_logAttrs[index].is_line_break)
    324             || (iterator->m_type == UBRK_WORD && (iterator->m_logAttrs[index].is_word_start || iterator->m_logAttrs[index].is_word_end))
    325             || (iterator->m_type == UBRK_CHARACTER && iterator->m_logAttrs[index].is_cursor_position)
    326             || (iterator->m_type == UBRK_SENTENCE && iterator->m_logAttrs[index].is_sentence_boundary)) {
    327             break;
    328         }
    329     }
    330     return iterator->m_charIterator.getUTF16Index();
    331 }
    332 
    333 int textBreakPrevious(TextBreakIterator* iterator)
    334 {
    335     while (iterator->m_charIterator.previous() != TextBreakDone) {
    336         int index = iterator->m_charIterator.getIndex();
    337 
    338         if ((iterator->m_type == UBRK_LINE && iterator->m_logAttrs[index].is_line_break)
    339             || (iterator->m_type == UBRK_WORD && (iterator->m_logAttrs[index].is_word_start || iterator->m_logAttrs[index].is_word_end))
    340             || (iterator->m_type == UBRK_CHARACTER && iterator->m_logAttrs[index].is_cursor_position)
    341             || (iterator->m_type == UBRK_SENTENCE && iterator->m_logAttrs[index].is_sentence_boundary)) {
    342             break;
    343         }
    344     }
    345     return iterator->m_charIterator.getUTF16Index();
    346 }
    347 
    348 int textBreakPreceding(TextBreakIterator* iterator, int offset)
    349 {
    350     if (offset > iterator->m_charIterator.getUTF16Length())
    351         return TextBreakDone;
    352     if (offset < 0)
    353         return 0;
    354     iterator->m_charIterator.setUTF16Index(offset);
    355     return textBreakPrevious(iterator);
    356 }
    357 
    358 int textBreakFollowing(TextBreakIterator* iterator, int offset)
    359 {
    360     if (offset > iterator->m_charIterator.getUTF16Length())
    361         return TextBreakDone;
    362     if (offset < 0)
    363         return 0;
    364     iterator->m_charIterator.setUTF16Index(offset);
    365     return textBreakNext(iterator);
    366 }
    367 
    368 int textBreakCurrent(TextBreakIterator* iterator)
    369 {
    370     return iterator->m_charIterator.getUTF16Index();
    371 }
    372 
    373 bool isTextBreak(TextBreakIterator* iterator, int offset)
    374 {
    375     if (!offset)
    376         return true;
    377     if (offset > iterator->m_charIterator.getUTF16Length())
    378         return false;
    379 
    380     iterator->m_charIterator.setUTF16Index(offset);
    381 
    382     int index = iterator->m_charIterator.getIndex();
    383     iterator->m_charIterator.previous();
    384     textBreakNext(iterator);
    385     return iterator->m_charIterator.getIndex() == index;
    386 }
    387 
    388 }
    389