1 /* 2 * Copyright (C) 2006 Lars Knoll <lars (at) trolltech.com> 3 * Copyright (C) 2007 Apple Inc. All rights reserved. 4 * 5 * This library is free software; you can redistribute it and/or 6 * modify it under the terms of the GNU Library General Public 7 * License as published by the Free Software Foundation; either 8 * version 2 of the License, or (at your option) any later version. 9 * 10 * This library is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 * Library General Public License for more details. 14 * 15 * You should have received a copy of the GNU Library General Public License 16 * along with this library; see the file COPYING.LIB. If not, write to 17 * the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 18 * Boston, MA 02110-1301, USA. 19 * 20 */ 21 22 #include "config.h" 23 #include "TextBreakIterator.h" 24 25 #include "PlatformString.h" 26 #include "TextBreakIteratorInternalICU.h" 27 28 #include <unicode/ubrk.h> 29 #include <wtf/Assertions.h> 30 31 namespace WebCore { 32 33 static TextBreakIterator* setUpIterator(bool& createdIterator, TextBreakIterator*& iterator, 34 UBreakIteratorType type, const UChar* string, int length) 35 { 36 if (!string) 37 return 0; 38 39 if (!createdIterator) { 40 UErrorCode openStatus = U_ZERO_ERROR; 41 iterator = reinterpret_cast<TextBreakIterator*>(ubrk_open(type, currentTextBreakLocaleID(), 0, 0, &openStatus)); 42 createdIterator = true; 43 ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus); 44 } 45 if (!iterator) 46 return 0; 47 48 UErrorCode setTextStatus = U_ZERO_ERROR; 49 ubrk_setText(reinterpret_cast<UBreakIterator*>(iterator), string, length, &setTextStatus); 50 if (U_FAILURE(setTextStatus)) 51 return 0; 52 53 return iterator; 54 } 55 56 TextBreakIterator* characterBreakIterator(const UChar* string, int length) 57 { 58 static bool createdCharacterBreakIterator = false; 59 static TextBreakIterator* staticCharacterBreakIterator; 60 return setUpIterator(createdCharacterBreakIterator, 61 staticCharacterBreakIterator, UBRK_CHARACTER, string, length); 62 } 63 64 TextBreakIterator* wordBreakIterator(const UChar* string, int length) 65 { 66 static bool createdWordBreakIterator = false; 67 static TextBreakIterator* staticWordBreakIterator; 68 return setUpIterator(createdWordBreakIterator, 69 staticWordBreakIterator, UBRK_WORD, string, length); 70 } 71 72 TextBreakIterator* lineBreakIterator(const UChar* string, int length) 73 { 74 static bool createdLineBreakIterator = false; 75 static TextBreakIterator* staticLineBreakIterator; 76 return setUpIterator(createdLineBreakIterator, 77 staticLineBreakIterator, UBRK_LINE, string, length); 78 } 79 80 TextBreakIterator* sentenceBreakIterator(const UChar* string, int length) 81 { 82 static bool createdSentenceBreakIterator = false; 83 static TextBreakIterator* staticSentenceBreakIterator; 84 return setUpIterator(createdSentenceBreakIterator, 85 staticSentenceBreakIterator, UBRK_SENTENCE, string, length); 86 } 87 88 int textBreakFirst(TextBreakIterator* iterator) 89 { 90 return ubrk_first(reinterpret_cast<UBreakIterator*>(iterator)); 91 } 92 93 int textBreakLast(TextBreakIterator* iterator) 94 { 95 return ubrk_last(reinterpret_cast<UBreakIterator*>(iterator)); 96 } 97 98 int textBreakNext(TextBreakIterator* iterator) 99 { 100 return ubrk_next(reinterpret_cast<UBreakIterator*>(iterator)); 101 } 102 103 int textBreakPrevious(TextBreakIterator* iterator) 104 { 105 return ubrk_previous(reinterpret_cast<UBreakIterator*>(iterator)); 106 } 107 108 int textBreakPreceding(TextBreakIterator* iterator, int pos) 109 { 110 return ubrk_preceding(reinterpret_cast<UBreakIterator*>(iterator), pos); 111 } 112 113 int textBreakFollowing(TextBreakIterator* iterator, int pos) 114 { 115 return ubrk_following(reinterpret_cast<UBreakIterator*>(iterator), pos); 116 } 117 118 int textBreakCurrent(TextBreakIterator* iterator) 119 { 120 return ubrk_current(reinterpret_cast<UBreakIterator*>(iterator)); 121 } 122 123 bool isTextBreak(TextBreakIterator* iterator, int pos) 124 { 125 return ubrk_isBoundary(reinterpret_cast<UBreakIterator*>(iterator), pos); 126 } 127 128 #ifndef BUILDING_ON_TIGER 129 static TextBreakIterator* setUpIteratorWithRules(bool& createdIterator, TextBreakIterator*& iterator, 130 const char* breakRules, const UChar* string, int length) 131 { 132 if (!string) 133 return 0; 134 135 if (!createdIterator) { 136 UParseError parseStatus; 137 UErrorCode openStatus = U_ZERO_ERROR; 138 String rules(breakRules); 139 iterator = reinterpret_cast<TextBreakIterator*>(ubrk_openRules(rules.characters(), rules.length(), 0, 0, &parseStatus, &openStatus)); 140 createdIterator = true; 141 ASSERT_WITH_MESSAGE(U_SUCCESS(openStatus), "ICU could not open a break iterator: %s (%d)", u_errorName(openStatus), openStatus); 142 } 143 if (!iterator) 144 return 0; 145 146 UErrorCode setTextStatus = U_ZERO_ERROR; 147 ubrk_setText(reinterpret_cast<UBreakIterator*>(iterator), string, length, &setTextStatus); 148 if (U_FAILURE(setTextStatus)) 149 return 0; 150 151 return iterator; 152 } 153 #endif // BUILDING_ON_TIGER 154 155 TextBreakIterator* cursorMovementIterator(const UChar* string, int length) 156 { 157 #ifdef BUILDING_ON_TIGER 158 // ICU 3.2 cannot compile the below rules. 159 return characterBreakIterator(string, length); 160 #else 161 // This rule set is based on character-break iterator rules of ICU 4.0 162 // <http://source.icu-project.org/repos/icu/icu/tags/release-4-0/source/data/brkitr/char.txt>. 163 // The major differences from the original ones are listed below: 164 // * Replaced '[\p{Grapheme_Cluster_Break = SpacingMark}]' with '[\p{General_Category = Spacing Mark} - $Extend]' for ICU 3.8 or earlier; 165 // * Removed rules that prevent a cursor from moving after prepend characters (Bug 24342); 166 // * Added rules that prevent a cursor from moving after virama signs of Indic languages except Tamil (Bug 15790), and; 167 // * Added rules that prevent a cursor from moving before Japanese half-width katakara voiced marks. 168 static const char* kRules = 169 "$CR = [\\p{Grapheme_Cluster_Break = CR}];" 170 "$LF = [\\p{Grapheme_Cluster_Break = LF}];" 171 "$Control = [\\p{Grapheme_Cluster_Break = Control}];" 172 "$VoiceMarks = [\\uFF9E\\uFF9F];" // Japanese half-width katakana voiced marks 173 "$Extend = [\\p{Grapheme_Cluster_Break = Extend} $VoiceMarks - [\\u0E30 \\u0E32 \\u0E45 \\u0EB0 \\u0EB2]];" 174 "$SpacingMark = [[\\p{General_Category = Spacing Mark}] - $Extend];" 175 "$L = [\\p{Grapheme_Cluster_Break = L}];" 176 "$V = [\\p{Grapheme_Cluster_Break = V}];" 177 "$T = [\\p{Grapheme_Cluster_Break = T}];" 178 "$LV = [\\p{Grapheme_Cluster_Break = LV}];" 179 "$LVT = [\\p{Grapheme_Cluster_Break = LVT}];" 180 "$Hin0 = [\\u0905-\\u0939];" // Devanagari Letter A,...,Ha 181 "$HinV = \\u094D;" // Devanagari Sign Virama 182 "$Hin1 = [\\u0915-\\u0939];" // Devanagari Letter Ka,...,Ha 183 "$Ben0 = [\\u0985-\\u09B9];" // Bengali Letter A,...,Ha 184 "$BenV = \\u09CD;" // Bengali Sign Virama 185 "$Ben1 = [\\u0995-\\u09B9];" // Bengali Letter Ka,...,Ha 186 "$Pan0 = [\\u0A05-\\u0A39];" // Gurmukhi Letter A,...,Ha 187 "$PanV = \\u0A4D;" // Gurmukhi Sign Virama 188 "$Pan1 = [\\u0A15-\\u0A39];" // Gurmukhi Letter Ka,...,Ha 189 "$Guj0 = [\\u0A85-\\u0AB9];" // Gujarati Letter A,...,Ha 190 "$GujV = \\u0ACD;" // Gujarati Sign Virama 191 "$Guj1 = [\\u0A95-\\u0AB9];" // Gujarati Letter Ka,...,Ha 192 "$Ori0 = [\\u0B05-\\u0B39];" // Oriya Letter A,...,Ha 193 "$OriV = \\u0B4D;" // Oriya Sign Virama 194 "$Ori1 = [\\u0B15-\\u0B39];" // Oriya Letter Ka,...,Ha 195 "$Tel0 = [\\u0C05-\\u0C39];" // Telugu Letter A,...,Ha 196 "$TelV = \\u0C4D;" // Telugu Sign Virama 197 "$Tel1 = [\\u0C14-\\u0C39];" // Telugu Letter Ka,...,Ha 198 "$Kan0 = [\\u0C85-\\u0CB9];" // Kannada Letter A,...,Ha 199 "$KanV = \\u0CCD;" // Kannada Sign Virama 200 "$Kan1 = [\\u0C95-\\u0CB9];" // Kannada Letter A,...,Ha 201 "$Mal0 = [\\u0D05-\\u0D39];" // Malayalam Letter A,...,Ha 202 "$MalV = \\u0D4D;" // Malayalam Sign Virama 203 "$Mal1 = [\\u0D15-\\u0D39];" // Malayalam Letter A,...,Ha 204 "!!chain;" 205 "!!forward;" 206 "$CR $LF;" 207 "$L ($L | $V | $LV | $LVT);" 208 "($LV | $V) ($V | $T);" 209 "($LVT | $T) $T;" 210 "[^$Control $CR $LF] $Extend;" 211 "[^$Control $CR $LF] $SpacingMark;" 212 "$Hin0 $HinV $Hin1;" // Devanagari Virama (forward) 213 "$Ben0 $BenV $Ben1;" // Bengali Virama (forward) 214 "$Pan0 $PanV $Pan1;" // Gurmukhi Virama (forward) 215 "$Guj0 $GujV $Guj1;" // Gujarati Virama (forward) 216 "$Ori0 $OriV $Ori1;" // Oriya Virama (forward) 217 "$Tel0 $TelV $Tel1;" // Telugu Virama (forward) 218 "$Kan0 $KanV $Kan1;" // Kannada Virama (forward) 219 "$Mal0 $MalV $Mal1;" // Malayalam Virama (forward) 220 "!!reverse;" 221 "$LF $CR;" 222 "($L | $V | $LV | $LVT) $L;" 223 "($V | $T) ($LV | $V);" 224 "$T ($LVT | $T);" 225 "$Extend [^$Control $CR $LF];" 226 "$SpacingMark [^$Control $CR $LF];" 227 "$Hin1 $HinV $Hin0;" // Devanagari Virama (backward) 228 "$Ben1 $BenV $Ben0;" // Bengali Virama (backward) 229 "$Pan1 $PanV $Pan0;" // Gurmukhi Virama (backward) 230 "$Guj1 $GujV $Guj0;" // Gujarati Virama (backward) 231 "$Ori1 $OriV $Ori0;" // Gujarati Virama (backward) 232 "$Tel1 $TelV $Tel0;" // Telugu Virama (backward) 233 "$Kan1 $KanV $Kan0;" // Kannada Virama (backward) 234 "$Mal1 $MalV $Mal0;" // Malayalam Virama (backward) 235 "!!safe_reverse;" 236 "!!safe_forward;"; 237 static bool createdCursorMovementIterator = false; 238 static TextBreakIterator* staticCursorMovementIterator; 239 return setUpIteratorWithRules(createdCursorMovementIterator, staticCursorMovementIterator, kRules, string, length); 240 #endif // BUILDING_ON_TIGER 241 } 242 243 } 244