Home | History | Annotate | Download | only in text
      1 /*
      2  * Copyright (C) 2006 Lars Knoll <lars (at) trolltech.com>
      3  * Copyright (C) 2007 Apple Inc. All rights reserved.
      4  *
      5  * This library is free software; you can redistribute it and/or
      6  * modify it under the terms of the GNU Library General Public
      7  * License as published by the Free Software Foundation; either
      8  * version 2 of the License, or (at your option) any later version.
      9  *
     10  * This library is distributed in the hope that it will be useful,
     11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
     12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     13  * Library General Public License for more details.
     14  *
     15  * You should have received a copy of the GNU Library General Public License
     16  * along with this library; see the file COPYING.LIB.  If not, write to
     17  * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
     18  * Boston, MA 02110-1301, USA.
     19  *
     20  */
     21 
     22 #include "config.h"
     23 #include "TextBreakIterator.h"
     24 
     25 #include "PlatformString.h"
     26 #include "TextBreakIteratorInternalICU.h"
     27 #include <unicode/ubrk.h>
     28 #include <wtf/Assertions.h>
     29 
     30 using namespace std;
     31 
     32 namespace WebCore {
     33 
     34 static TextBreakIterator* setUpIterator(bool& createdIterator, TextBreakIterator*& iterator,
     35     UBreakIteratorType type, const UChar* string, int length)
     36 {
     37     if (!string)
     38         return 0;
     39 
     40     if (!createdIterator) {
     41         UErrorCode openStatus = U_ZERO_ERROR;
     42         iterator = reinterpret_cast<TextBreakIterator*>(ubrk_open(type, currentTextBreakLocaleID(), 0, 0, &openStatus));
     43         createdIterator = true;
     44         ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);
     45     }
     46     if (!iterator)
     47         return 0;
     48 
     49     UErrorCode setTextStatus = U_ZERO_ERROR;
     50     ubrk_setText(reinterpret_cast<UBreakIterator*>(iterator), string, length, &setTextStatus);
     51     if (U_FAILURE(setTextStatus))
     52         return 0;
     53 
     54     return iterator;
     55 }
     56 
     57 TextBreakIterator* characterBreakIterator(const UChar* string, int length)
     58 {
     59     static bool createdCharacterBreakIterator = false;
     60     static TextBreakIterator* staticCharacterBreakIterator;
     61     return setUpIterator(createdCharacterBreakIterator,
     62         staticCharacterBreakIterator, UBRK_CHARACTER, string, length);
     63 }
     64 
     65 TextBreakIterator* wordBreakIterator(const UChar* string, int length)
     66 {
     67     static bool createdWordBreakIterator = false;
     68     static TextBreakIterator* staticWordBreakIterator;
     69     return setUpIterator(createdWordBreakIterator,
     70         staticWordBreakIterator, UBRK_WORD, string, length);
     71 }
     72 
     73 static bool createdLineBreakIterator = false;
     74 static TextBreakIterator* staticLineBreakIterator;
     75 
     76 TextBreakIterator* acquireLineBreakIterator(const UChar* string, int length)
     77 {
     78     TextBreakIterator* lineBreakIterator = 0;
     79     if (!createdLineBreakIterator || staticLineBreakIterator) {
     80         setUpIterator(createdLineBreakIterator, staticLineBreakIterator, UBRK_LINE, string, length);
     81         swap(staticLineBreakIterator, lineBreakIterator);
     82     }
     83 
     84     if (!lineBreakIterator) {
     85         bool createdNewLineBreakIterator = false;
     86         setUpIterator(createdNewLineBreakIterator, lineBreakIterator, UBRK_LINE, string, length);
     87     }
     88 
     89     return lineBreakIterator;
     90 }
     91 
     92 void releaseLineBreakIterator(TextBreakIterator* iterator)
     93 {
     94     ASSERT(createdLineBreakIterator);
     95     ASSERT(iterator);
     96 
     97     if (!staticLineBreakIterator)
     98         staticLineBreakIterator = iterator;
     99     else
    100         ubrk_close(reinterpret_cast<UBreakIterator*>(iterator));
    101 }
    102 
    103 TextBreakIterator* sentenceBreakIterator(const UChar* string, int length)
    104 {
    105     static bool createdSentenceBreakIterator = false;
    106     static TextBreakIterator* staticSentenceBreakIterator;
    107     return setUpIterator(createdSentenceBreakIterator,
    108         staticSentenceBreakIterator, UBRK_SENTENCE, string, length);
    109 }
    110 
    111 int textBreakFirst(TextBreakIterator* iterator)
    112 {
    113     return ubrk_first(reinterpret_cast<UBreakIterator*>(iterator));
    114 }
    115 
    116 int textBreakLast(TextBreakIterator* iterator)
    117 {
    118     return ubrk_last(reinterpret_cast<UBreakIterator*>(iterator));
    119 }
    120 
    121 int textBreakNext(TextBreakIterator* iterator)
    122 {
    123     return ubrk_next(reinterpret_cast<UBreakIterator*>(iterator));
    124 }
    125 
    126 int textBreakPrevious(TextBreakIterator* iterator)
    127 {
    128     return ubrk_previous(reinterpret_cast<UBreakIterator*>(iterator));
    129 }
    130 
    131 int textBreakPreceding(TextBreakIterator* iterator, int pos)
    132 {
    133     return ubrk_preceding(reinterpret_cast<UBreakIterator*>(iterator), pos);
    134 }
    135 
    136 int textBreakFollowing(TextBreakIterator* iterator, int pos)
    137 {
    138     return ubrk_following(reinterpret_cast<UBreakIterator*>(iterator), pos);
    139 }
    140 
    141 int textBreakCurrent(TextBreakIterator* iterator)
    142 {
    143     return ubrk_current(reinterpret_cast<UBreakIterator*>(iterator));
    144 }
    145 
    146 bool isTextBreak(TextBreakIterator* iterator, int position)
    147 {
    148     return ubrk_isBoundary(reinterpret_cast<UBreakIterator*>(iterator), position);
    149 }
    150 
    151 #ifndef BUILDING_ON_TIGER
    152 static TextBreakIterator* setUpIteratorWithRules(bool& createdIterator, TextBreakIterator*& iterator,
    153     const char* breakRules, const UChar* string, int length)
    154 {
    155     if (!string)
    156         return 0;
    157 
    158     if (!createdIterator) {
    159         UParseError parseStatus;
    160         UErrorCode openStatus = U_ZERO_ERROR;
    161         String rules(breakRules);
    162         iterator = reinterpret_cast<TextBreakIterator*>(ubrk_openRules(rules.characters(), rules.length(), 0, 0, &parseStatus, &openStatus));
    163         createdIterator = true;
    164         ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus);
    165     }
    166     if (!iterator)
    167         return 0;
    168 
    169     UErrorCode setTextStatus = U_ZERO_ERROR;
    170     ubrk_setText(reinterpret_cast<UBreakIterator*>(iterator), string, length, &setTextStatus);
    171     if (U_FAILURE(setTextStatus))
    172         return 0;
    173 
    174     return iterator;
    175 }
    176 #endif // BUILDING_ON_TIGER
    177 
    178 TextBreakIterator* cursorMovementIterator(const UChar* string, int length)
    179 {
    180 #ifdef BUILDING_ON_TIGER
    181     // ICU 3.2 cannot compile the below rules.
    182     return characterBreakIterator(string, length);
    183 #else
    184     // This rule set is based on character-break iterator rules of ICU 4.0
    185     // <http://source.icu-project.org/repos/icu/icu/tags/release-4-0/source/data/brkitr/char.txt>.
    186     // The major differences from the original ones are listed below:
    187     // * Replaced '[\p{Grapheme_Cluster_Break = SpacingMark}]' with '[\p{General_Category = Spacing Mark} - $Extend]' for ICU 3.8 or earlier;
    188     // * Removed rules that prevent a cursor from moving after prepend characters (Bug 24342);
    189     // * Added rules that prevent a cursor from moving after virama signs of Indic languages except Tamil (Bug 15790), and;
    190     // * Added rules that prevent a cursor from moving before Japanese half-width katakara voiced marks.
    191     static const char* kRules =
    192         "$CR      = [\\p{Grapheme_Cluster_Break = CR}];"
    193         "$LF      = [\\p{Grapheme_Cluster_Break = LF}];"
    194         "$Control = [\\p{Grapheme_Cluster_Break = Control}];"
    195         "$VoiceMarks = [\\uFF9E\\uFF9F];"  // Japanese half-width katakana voiced marks
    196         "$Extend  = [\\p{Grapheme_Cluster_Break = Extend} $VoiceMarks - [\\u0E30 \\u0E32 \\u0E45 \\u0EB0 \\u0EB2]];"
    197         "$SpacingMark = [[\\p{General_Category = Spacing Mark}] - $Extend];"
    198         "$L       = [\\p{Grapheme_Cluster_Break = L}];"
    199         "$V       = [\\p{Grapheme_Cluster_Break = V}];"
    200         "$T       = [\\p{Grapheme_Cluster_Break = T}];"
    201         "$LV      = [\\p{Grapheme_Cluster_Break = LV}];"
    202         "$LVT     = [\\p{Grapheme_Cluster_Break = LVT}];"
    203         "$Hin0    = [\\u0905-\\u0939];"    // Devanagari Letter A,...,Ha
    204         "$HinV    = \\u094D;"              // Devanagari Sign Virama
    205         "$Hin1    = [\\u0915-\\u0939];"    // Devanagari Letter Ka,...,Ha
    206         "$Ben0    = [\\u0985-\\u09B9];"    // Bengali Letter A,...,Ha
    207         "$BenV    = \\u09CD;"              // Bengali Sign Virama
    208         "$Ben1    = [\\u0995-\\u09B9];"    // Bengali Letter Ka,...,Ha
    209         "$Pan0    = [\\u0A05-\\u0A39];"    // Gurmukhi Letter A,...,Ha
    210         "$PanV    = \\u0A4D;"              // Gurmukhi Sign Virama
    211         "$Pan1    = [\\u0A15-\\u0A39];"    // Gurmukhi Letter Ka,...,Ha
    212         "$Guj0    = [\\u0A85-\\u0AB9];"    // Gujarati Letter A,...,Ha
    213         "$GujV    = \\u0ACD;"              // Gujarati Sign Virama
    214         "$Guj1    = [\\u0A95-\\u0AB9];"    // Gujarati Letter Ka,...,Ha
    215         "$Ori0    = [\\u0B05-\\u0B39];"    // Oriya Letter A,...,Ha
    216         "$OriV    = \\u0B4D;"              // Oriya Sign Virama
    217         "$Ori1    = [\\u0B15-\\u0B39];"    // Oriya Letter Ka,...,Ha
    218         "$Tel0    = [\\u0C05-\\u0C39];"    // Telugu Letter A,...,Ha
    219         "$TelV    = \\u0C4D;"              // Telugu Sign Virama
    220         "$Tel1    = [\\u0C14-\\u0C39];"    // Telugu Letter Ka,...,Ha
    221         "$Kan0    = [\\u0C85-\\u0CB9];"    // Kannada Letter A,...,Ha
    222         "$KanV    = \\u0CCD;"              // Kannada Sign Virama
    223         "$Kan1    = [\\u0C95-\\u0CB9];"    // Kannada Letter A,...,Ha
    224         "$Mal0    = [\\u0D05-\\u0D39];"    // Malayalam Letter A,...,Ha
    225         "$MalV    = \\u0D4D;"              // Malayalam Sign Virama
    226         "$Mal1    = [\\u0D15-\\u0D39];"    // Malayalam Letter A,...,Ha
    227         "!!chain;"
    228         "!!forward;"
    229         "$CR $LF;"
    230         "$L ($L | $V | $LV | $LVT);"
    231         "($LV | $V) ($V | $T);"
    232         "($LVT | $T) $T;"
    233         "[^$Control $CR $LF] $Extend;"
    234         "[^$Control $CR $LF] $SpacingMark;"
    235         "$Hin0 $HinV $Hin1;"               // Devanagari Virama (forward)
    236         "$Ben0 $BenV $Ben1;"               // Bengali Virama (forward)
    237         "$Pan0 $PanV $Pan1;"               // Gurmukhi Virama (forward)
    238         "$Guj0 $GujV $Guj1;"               // Gujarati Virama (forward)
    239         "$Ori0 $OriV $Ori1;"               // Oriya Virama (forward)
    240         "$Tel0 $TelV $Tel1;"               // Telugu Virama (forward)
    241         "$Kan0 $KanV $Kan1;"               // Kannada Virama (forward)
    242         "$Mal0 $MalV $Mal1;"               // Malayalam Virama (forward)
    243         "!!reverse;"
    244         "$LF $CR;"
    245         "($L | $V | $LV | $LVT) $L;"
    246         "($V | $T) ($LV | $V);"
    247         "$T ($LVT | $T);"
    248         "$Extend      [^$Control $CR $LF];"
    249         "$SpacingMark [^$Control $CR $LF];"
    250         "$Hin1 $HinV $Hin0;"               // Devanagari Virama (backward)
    251         "$Ben1 $BenV $Ben0;"               // Bengali Virama (backward)
    252         "$Pan1 $PanV $Pan0;"               // Gurmukhi Virama (backward)
    253         "$Guj1 $GujV $Guj0;"               // Gujarati Virama (backward)
    254         "$Ori1 $OriV $Ori0;"               // Gujarati Virama (backward)
    255         "$Tel1 $TelV $Tel0;"               // Telugu Virama (backward)
    256         "$Kan1 $KanV $Kan0;"               // Kannada Virama (backward)
    257         "$Mal1 $MalV $Mal0;"               // Malayalam Virama (backward)
    258         "!!safe_reverse;"
    259         "!!safe_forward;";
    260     static bool createdCursorMovementIterator = false;
    261     static TextBreakIterator* staticCursorMovementIterator;
    262     return setUpIteratorWithRules(createdCursorMovementIterator, staticCursorMovementIterator, kRules, string, length);
    263 #endif // BUILDING_ON_TIGER
    264 }
    265 
    266 }
    267