text/gtk/TextBreakIteratorGtk.cpp

/*
 * Copyright (C) 2006 Lars Knoll <lars (at) trolltech.com>
 * Copyright (C) 2007 Apple Inc. All rights reserved.
 * Copyright (C) 2008 Jürg Billeter <j (at) bitron.ch>
 * Copyright (C) 2008 Dominik Röttsches <dominik.roettsches (at) access-company.com>
 * Copyright (C) 2010 Igalia S.L.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public License
 * along with this library; see the file COPYING.LIB.  If not, write to
 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 * Boston, MA 02110-1301, USA.
 *
 */

#include "config.h"

#include "TextBreakIterator.h"

#include "GOwnPtr.h"
#include <pango/pango.h>
using namespace std;

// True when |character| is a code point in the range U+10000..U+10FFFF, i.e.
// one that takes two UTF-16 code units (a surrogate pair) instead of one.
// The argument is parenthesized so that expression arguments (ternaries,
// bitwise operators, ...) expand correctly. It is still evaluated twice, so
// do not pass expressions with side effects.
#define UTF8_IS_SURROGATE(character) ((character) >= 0x10000 && (character) <= 0x10FFFF)

namespace WebCore {

// Cursor over a text that is kept as UTF-8 (for Pango) while callers address
// it in UTF-16 code units. The current position is tracked in both index
// spaces at once: m_index counts characters of the UTF-8 text and
// m_utf16Index counts UTF-16 code units.
class CharacterIterator {
public:
    // Converts |string| (|length| UTF-16 code units) to UTF-8 and resets both
    // indices to 0. Returns false if the conversion produced no bytes.
    bool setText(const UChar* string, int length);
    // The UTF-8 text, owned by this object.
    const gchar* getText() { return m_utf8.get(); }
    // Number of characters in the UTF-8 text.
    int getLength() { return m_length; }
    // Size of the UTF-8 text in bytes.
    glong getSize() { return m_size; }
    // Moves to character |index| (clamped to [0, getLength()]) and updates the
    // UTF-16 index to match.
    void setIndex(int index);
    // Current position in characters, or TextBreakDone after next()/previous()
    // ran off either end.
    int getIndex() { return m_index; }
    // Moves to UTF-16 offset |index| (clamped to [0, getUTF16Length()]) and
    // updates the character index to match.
    void setUTF16Index(int index);
    // Current position in UTF-16 code units, or TextBreakDone after
    // next()/previous() ran off either end.
    int getUTF16Index() { return m_utf16Index; }
    // The |length| that was passed to setText().
    int getUTF16Length() { return m_utf16Length; }
    // Move to the start / the end of the text; both return the new character index.
    int first();
    int last();
    // Step one character forward / backward; both return the new character
    // index, or TextBreakDone when the step would leave the text.
    int next();
    int previous();
private:
    // Number of UTF-16 code units (0, 1 or 2) taken by the character at |index|.
    int characterSize(int index);

    GOwnPtr<char> m_utf8; // UTF-8 copy of the text.
    int m_length; // Length of m_utf8 in characters.
    long m_size; // Length of m_utf8 in bytes.
    int m_index; // Current position, in characters.
    int m_utf16Index; // Current position, in UTF-16 code units.
    int m_utf16Length; // Length of the original text, in UTF-16 code units.
};

int CharacterIterator::characterSize(int index)
{
    if (index == m_length || index < 0)
        return 0;
    if (m_length == m_utf16Length)
        return 1;

    gchar* indexPtr = g_utf8_offset_to_pointer(m_utf8.get(), index);
    gunichar character = g_utf8_get_char(indexPtr);
    return UTF8_IS_SURROGATE(character) ? 2 : 1;
}

bool CharacterIterator::setText(const UChar* string, int length)
{
    long utf8Size = 0;
    m_utf8.set(g_utf16_to_utf8(string, length, 0, &utf8Size, 0));
    if (!utf8Size)
        return false;

    m_utf16Length = length;
    m_length = g_utf8_strlen(m_utf8.get(), utf8Size);
    m_size = utf8Size;
    m_index = 0;
    m_utf16Index = 0;

    return true;
}

void CharacterIterator::setIndex(int index)
{
    if (index == m_index)
        return;
    if (index <= 0)
        m_index = m_utf16Index = 0;
    else if (index >= m_length) {
        m_index = m_length;
        m_utf16Index = m_utf16Length;
    } else if (m_length == m_utf16Length)
        m_index = m_utf16Index = index;
    else {
        m_index = index;
        int utf16Index = 0;
        int utf8Index = 0;
        while (utf8Index < index) {
            utf16Index += characterSize(utf8Index);
            utf8Index++;
        }
        m_utf16Index = utf16Index;
    }
}

void CharacterIterator::setUTF16Index(int index)
{
    if (index == m_utf16Index)
        return;
    if (index <= 0)
        m_utf16Index = m_index = 0;
    else if (index >= m_utf16Length) {
        m_utf16Index = m_utf16Length;
        m_index = m_length;
    } else if (m_length == m_utf16Length)
        m_utf16Index = m_index = index;
    else {
        m_utf16Index = index;
        int utf16Index = 0;
        int utf8Index = 0;
        while (utf16Index < index) {
            utf16Index += characterSize(utf8Index);
            utf8Index++;
        }
        m_index = utf8Index;
    }
}

// Rewinds to the start of the text in both index spaces and returns the new
// character index, which is always 0.
int CharacterIterator::first()
{
    m_utf16Index = 0;
    m_index = 0;
    return m_index;
}

// Moves to the position just past the last character in both index spaces
// and returns the new character index (m_length).
int CharacterIterator::last()
{
    m_utf16Index = m_utf16Length;
    m_index = m_length;
    return m_index;
}

// Steps one character forward. Returns the new character index, or
// TextBreakDone (stored in both indices) when the iterator is already at the
// end of the text.
int CharacterIterator::next()
{
    const int target = m_index + 1;

    if (target > m_length) {
        m_utf16Index = TextBreakDone;
        m_index = TextBreakDone;
        return m_index;
    }

    // Skip the UTF-16 code units of the character being left behind, without
    // ever moving past the end of the UTF-16 text.
    const int advanced = m_utf16Index + characterSize(m_index);
    m_utf16Index = m_utf16Length < advanced ? m_utf16Length : advanced;
    m_index = target;
    return m_index;
}

// Steps one character backward. Returns the new character index, or
// TextBreakDone (stored in both indices) when the iterator is already at the
// start of the text.
int CharacterIterator::previous()
{
    const int target = m_index - 1;

    if (target < 0) {
        m_utf16Index = TextBreakDone;
        m_index = TextBreakDone;
        return m_index;
    }

    // Give back the UTF-16 code units of the character being entered, without
    // ever moving in front of the start of the text.
    const int rewound = m_utf16Index - characterSize(target);
    m_utf16Index = rewound < 0 ? 0 : rewound;
    m_index = target;
    return m_index;
}

// Kind of boundary a TextBreakIterator stops at. The values are spelled out
// but are the same ones the compiler assigns by default.
enum UBreakIteratorType {
    UBRK_CHARACTER = 0, // Stops at Pango cursor positions.
    UBRK_WORD = 1, // Stops at word starts and word ends.
    UBRK_LINE = 2, // Stops at line break opportunities.
    UBRK_SENTENCE = 3 // Stops at sentence boundaries.
};

class TextBreakIterator {
public:
    UBreakIteratorType m_type;
    PangoLogAttr* m_logAttrs;
    CharacterIterator m_charIterator;
};

// Points |iterator| at |string| (|length| UTF-16 code units) and configures
// it to report boundaries of kind |type|.
//
// |createdIterator| and |iterator| are the caller's cache slot: when
// |createdIterator| is false a new iterator is allocated, stored in
// |iterator|, and the flag is set. Returns |iterator| on success, or 0 when
// |string| is null, the slot holds no iterator, or the text cannot be set.
// On a failed text set-up the slot keeps its (possibly new) iterator.
static TextBreakIterator* setUpIterator(bool& createdIterator, TextBreakIterator*& iterator,
    UBreakIteratorType type, const UChar* string, int length)
{
    if (!string)
        return 0;

    if (!createdIterator) {
        iterator = new TextBreakIterator();
        createdIterator = true;
    }
    if (!iterator)
        return 0;

    CharacterIterator& characters = iterator->m_charIterator;
    if (!characters.setText(string, length))
        return 0;

    iterator->m_type = type;

    // Pango reports one attribute per character plus one for the position
    // after the last character.
    const int attrCount = characters.getLength() + 1;

    // |createdIterator| is always true by now, so the attributes of the
    // previous text are released unconditionally before being replaced.
    g_free(iterator->m_logAttrs);
    iterator->m_logAttrs = g_new0(PangoLogAttr, attrCount);
    pango_get_log_attrs(characters.getText(), characters.getSize(),
                        -1, 0, iterator->m_logAttrs, attrCount);

    return iterator;
}

// Returns the shared character break iterator, re-targeted at |string|
// (|length| UTF-16 code units), or 0 if setUpIterator() fails. The iterator
// is allocated on first use and reused by every later call, so a previously
// returned pointer refers to the new text as well.
TextBreakIterator* characterBreakIterator(const UChar* string, int length)
{
    static TextBreakIterator* sharedIterator = 0;
    static bool sharedIteratorCreated = false;
    return setUpIterator(sharedIteratorCreated, sharedIterator, UBRK_CHARACTER, string, length);
}

// Returns the iterator used for cursor movement over |string|; currently this
// is simply the shared character break iterator.
TextBreakIterator* cursorMovementIterator(const UChar* string, int length)
{
    // FIXME: This needs closer inspection to achieve behaviour identical to the ICU version.
    TextBreakIterator* characterIterator = characterBreakIterator(string, length);
    return characterIterator;
}

// Returns the shared word break iterator, re-targeted at |string| (|length|
// UTF-16 code units), or 0 if setUpIterator() fails. The iterator is
// allocated on first use and reused by every later call.
TextBreakIterator* wordBreakIterator(const UChar* string, int length)
{
    static TextBreakIterator* sharedIterator = 0;
    static bool sharedIteratorCreated = false;
    return setUpIterator(sharedIteratorCreated, sharedIterator, UBRK_WORD, string, length);
}

// Cache slot for one reusable line break iterator. createdLineBreakIterator
// records whether the cached iterator has ever been allocated;
// staticLineBreakIterator is null while that iterator is lent out.
static bool createdLineBreakIterator = false;
static TextBreakIterator* staticLineBreakIterator;

// Returns a line break iterator set up for |string| (|length| UTF-16 code
// units), to be handed back with releaseLineBreakIterator(). The cached
// iterator is lent out when it is available; otherwise a new one is
// allocated.
//
// Returns 0 when the iterator cannot be set up for the text, exactly like the
// other *BreakIterator() factories. Handing out the iterator regardless of
// setUpIterator()'s result would give the caller an iterator that still
// describes a previous text, or one whose attribute array was never
// allocated.
TextBreakIterator* acquireLineBreakIterator(const UChar* string, int length)
{
    if (!createdLineBreakIterator || staticLineBreakIterator) {
        // The cached iterator does not exist yet or is available. On failure
        // it stays in the cache for the next caller.
        if (!setUpIterator(createdLineBreakIterator, staticLineBreakIterator, UBRK_LINE, string, length))
            return 0;

        TextBreakIterator* lineBreakIterator = staticLineBreakIterator;
        staticLineBreakIterator = 0;
        return lineBreakIterator;
    }

    // The cached iterator is lent out: create a separate one for this caller.
    bool createdNewLineBreakIterator = false;
    TextBreakIterator* lineBreakIterator = 0;
    if (!setUpIterator(createdNewLineBreakIterator, lineBreakIterator, UBRK_LINE, string, length)) {
        // setUpIterator() may have allocated the iterator before failing.
        delete lineBreakIterator;
        return 0;
    }

    return lineBreakIterator;
}

// Takes back an iterator obtained from acquireLineBreakIterator(). It becomes
// the cached iterator if the cache slot is empty; otherwise it is deleted.
void releaseLineBreakIterator(TextBreakIterator* iterator)
{
    ASSERT(createdLineBreakIterator);
    ASSERT(iterator);

    if (staticLineBreakIterator) {
        // The cache slot is already occupied, so this one is surplus.
        delete iterator;
        return;
    }

    staticLineBreakIterator = iterator;
}

// Returns the shared sentence break iterator, re-targeted at |string|
// (|length| UTF-16 code units), or 0 if setUpIterator() fails. The iterator
// is allocated on first use and reused by every later call.
TextBreakIterator* sentenceBreakIterator(const UChar* string, int length)
{
    static TextBreakIterator* sharedIterator = 0;
    static bool sharedIteratorCreated = false;
    return setUpIterator(sharedIteratorCreated, sharedIterator, UBRK_SENTENCE, string, length);
}

// Moves |iterator| to the start of its text and returns the resulting UTF-16
// offset.
int textBreakFirst(TextBreakIterator* iterator)
{
    CharacterIterator& characters = iterator->m_charIterator;
    characters.first();
    return characters.getUTF16Index();
}

// Positions |iterator| at the end of its text and returns the resulting
// UTF-16 offset.
//
// This does not look for a break of the iterator's own type. Like ICU's
// ubrk_last() ("Determine the index immediately beyond the last character in
// the text being scanned"), it is about the end of the text.
//
// The scan starts at the position after the last character and walks
// backwards over cursor positions for as long as they are flagged as white
// space, remembering the left-most one seen. It stops at the first cursor
// position that is not white space. If no white-space cursor position was
// seen, the result is the text length.
int textBreakLast(TextBreakIterator* iterator)
{
    CharacterIterator& characters = iterator->m_charIterator;

    int breakPosition = characters.getLength();
    bool stillInWhiteSpace = true;

    for (int pos = characters.last(); pos >= 0 && stillInWhiteSpace; pos = characters.previous()) {
        const PangoLogAttr& attr = iterator->m_logAttrs[pos];
        // Positions that are not cursor positions neither end the scan nor
        // move the result.
        if (!attr.is_cursor_position)
            continue;

        stillInWhiteSpace = attr.is_white;
        if (stillInWhiteSpace)
            breakPosition = pos;
    }

    characters.setIndex(breakPosition);
    return characters.getUTF16Index();
}

// Advances |iterator| to the next boundary of its type after the current
// position and returns that boundary's UTF-16 offset, or TextBreakDone if the
// end of the text is passed without finding one.
int textBreakNext(TextBreakIterator* iterator)
{
    CharacterIterator& characters = iterator->m_charIterator;

    while (characters.next() != TextBreakDone) {
        const PangoLogAttr& attr = iterator->m_logAttrs[characters.getIndex()];

        // FIXME: UBRK_WORD case: Single multibyte characters (i.e. white space around them), such as the euro symbol (€),
        // are not marked as word_start & word_end as opposed to the way ICU does it.
        // This leads to - for example - different word selection behaviour when right clicking.

        bool atBoundary = false;
        switch (iterator->m_type) {
        case UBRK_LINE:
            atBoundary = attr.is_line_break;
            break;
        case UBRK_WORD:
            atBoundary = attr.is_word_start || attr.is_word_end;
            break;
        case UBRK_CHARACTER:
            atBoundary = attr.is_cursor_position;
            break;
        case UBRK_SENTENCE:
            atBoundary = attr.is_sentence_boundary;
            break;
        }

        if (atBoundary)
            break;
    }

    return characters.getUTF16Index();
}

// Moves |iterator| back to the closest boundary of its type before the
// current position and returns that boundary's UTF-16 offset, or
// TextBreakDone if the start of the text is passed without finding one.
int textBreakPrevious(TextBreakIterator* iterator)
{
    CharacterIterator& characters = iterator->m_charIterator;

    while (characters.previous() != TextBreakDone) {
        const PangoLogAttr& attr = iterator->m_logAttrs[characters.getIndex()];

        bool atBoundary = false;
        switch (iterator->m_type) {
        case UBRK_LINE:
            atBoundary = attr.is_line_break;
            break;
        case UBRK_WORD:
            atBoundary = attr.is_word_start || attr.is_word_end;
            break;
        case UBRK_CHARACTER:
            atBoundary = attr.is_cursor_position;
            break;
        case UBRK_SENTENCE:
            atBoundary = attr.is_sentence_boundary;
            break;
        }

        if (atBoundary)
            break;
    }

    return characters.getUTF16Index();
}

// Returns the UTF-16 offset of the closest boundary before |offset|, leaving
// |iterator| positioned there. Offsets beyond the text yield TextBreakDone
// and negative offsets yield 0; in both cases the iterator is not moved.
int textBreakPreceding(TextBreakIterator* iterator, int offset)
{
    CharacterIterator& characters = iterator->m_charIterator;

    if (characters.getUTF16Length() < offset)
        return TextBreakDone;
    if (offset < 0)
        return 0;

    characters.setUTF16Index(offset);
    return textBreakPrevious(iterator);
}

// Returns the UTF-16 offset of the closest boundary after |offset|, leaving
// |iterator| positioned there. Offsets beyond the text yield TextBreakDone
// and negative offsets yield 0; in both cases the iterator is not moved.
int textBreakFollowing(TextBreakIterator* iterator, int offset)
{
    CharacterIterator& characters = iterator->m_charIterator;

    if (characters.getUTF16Length() < offset)
        return TextBreakDone;
    if (offset < 0)
        return 0;

    characters.setUTF16Index(offset);
    return textBreakNext(iterator);
}

// Returns the current position of |iterator| as a UTF-16 offset without
// moving it.
int textBreakCurrent(TextBreakIterator* iterator)
{
    CharacterIterator& characters = iterator->m_charIterator;
    return characters.getUTF16Index();
}

// Returns true if UTF-16 offset |offset| is a boundary of the iterator's
// type. Offset 0 always counts as a boundary; offsets outside the text never
// do. For any other offset the iterator is repositioned as a side effect.
bool isTextBreak(TextBreakIterator* iterator, int offset)
{
    if (!offset)
        return true;

    CharacterIterator& characters = iterator->m_charIterator;

    // Negative offsets are rejected together with offsets past the end.
    // Otherwise they would be clamped to the start of the text and could be
    // reported as a boundary.
    if (offset < 0 || offset > characters.getUTF16Length())
        return false;

    characters.setUTF16Index(offset);

    // Step one character back and search forward again: |offset| is a
    // boundary exactly when that search stops on the character it started at.
    int index = characters.getIndex();
    characters.previous();
    textBreakNext(iterator);
    return characters.getIndex() == index;
}

}