1 /* 2 ************************************************************************************ 3 * Copyright (C) 2006-2015, International Business Machines Corporation 4 * and others. All Rights Reserved. 5 ************************************************************************************ 6 */ 7 8 #include "unicode/utypes.h" 9 10 #if !UCONFIG_NO_BREAK_ITERATION 11 12 #include "brkeng.h" 13 #include "dictbe.h" 14 #include "unicode/uchar.h" 15 #include "unicode/uniset.h" 16 #include "unicode/chariter.h" 17 #include "unicode/ures.h" 18 #include "unicode/udata.h" 19 #include "unicode/putil.h" 20 #include "unicode/ustring.h" 21 #include "unicode/uscript.h" 22 #include "unicode/ucharstrie.h" 23 #include "unicode/bytestrie.h" 24 #include "charstr.h" 25 #include "dictionarydata.h" 26 #include "mutex.h" 27 #include "uvector.h" 28 #include "umutex.h" 29 #include "uresimp.h" 30 #include "ubrkimpl.h" 31 32 U_NAMESPACE_BEGIN 33 34 /* 35 ****************************************************************** 36 */ 37 38 LanguageBreakEngine::LanguageBreakEngine() { 39 } 40 41 LanguageBreakEngine::~LanguageBreakEngine() { 42 } 43 44 /* 45 ****************************************************************** 46 */ 47 48 LanguageBreakFactory::LanguageBreakFactory() { 49 } 50 51 LanguageBreakFactory::~LanguageBreakFactory() { 52 } 53 54 /* 55 ****************************************************************** 56 */ 57 58 UnhandledEngine::UnhandledEngine(UErrorCode &/*status*/) { 59 for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) { 60 fHandled[i] = 0; 61 } 62 } 63 64 UnhandledEngine::~UnhandledEngine() { 65 for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) { 66 if (fHandled[i] != 0) { 67 delete fHandled[i]; 68 } 69 } 70 } 71 72 UBool 73 UnhandledEngine::handles(UChar32 c, int32_t breakType) const { 74 return (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])) 75 && fHandled[breakType] != 0 && fHandled[breakType]->contains(c)); 76 } 77 78 int32_t 79 UnhandledEngine::findBreaks( UText *text, 80 int32_t startPos, 81 int32_t endPos, 82 UBool reverse, 83 int32_t breakType, 84 UStack &/*foundBreaks*/ ) const { 85 if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) { 86 UChar32 c = utext_current32(text); 87 if (reverse) { 88 while((int32_t)utext_getNativeIndex(text) > startPos && fHandled[breakType]->contains(c)) { 89 c = utext_previous32(text); 90 } 91 } 92 else { 93 while((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) { 94 utext_next32(text); // TODO: recast loop to work with post-increment operations. 95 c = utext_current32(text); 96 } 97 } 98 } 99 return 0; 100 } 101 102 void 103 UnhandledEngine::handleCharacter(UChar32 c, int32_t breakType) { 104 if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) { 105 if (fHandled[breakType] == 0) { 106 fHandled[breakType] = new UnicodeSet(); 107 if (fHandled[breakType] == 0) { 108 return; 109 } 110 } 111 if (!fHandled[breakType]->contains(c)) { 112 UErrorCode status = U_ZERO_ERROR; 113 // Apply the entire script of the character. 114 int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT); 115 fHandled[breakType]->applyIntPropertyValue(UCHAR_SCRIPT, script, status); 116 } 117 } 118 } 119 120 /* 121 ****************************************************************** 122 */ 123 124 ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) { 125 fEngines = 0; 126 } 127 128 ICULanguageBreakFactory::~ICULanguageBreakFactory() { 129 if (fEngines != 0) { 130 delete fEngines; 131 } 132 } 133 134 U_NAMESPACE_END 135 U_CDECL_BEGIN 136 static void U_CALLCONV _deleteEngine(void *obj) { 137 delete (const icu::LanguageBreakEngine *) obj; 138 } 139 U_CDECL_END 140 U_NAMESPACE_BEGIN 141 142 static UMutex gBreakEngineMutex = U_MUTEX_INITIALIZER; 143 144 const LanguageBreakEngine * 145 ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) { 146 const LanguageBreakEngine *lbe = NULL; 147 UErrorCode status = U_ZERO_ERROR; 148 149 Mutex m(&gBreakEngineMutex); 150 151 if (fEngines == NULL) { 152 UStack *engines = new UStack(_deleteEngine, NULL, status); 153 if (U_FAILURE(status) || engines == NULL) { 154 // Note: no way to return error code to caller. 155 delete engines; 156 return NULL; 157 } 158 fEngines = engines; 159 } else { 160 int32_t i = fEngines->size(); 161 while (--i >= 0) { 162 lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i)); 163 if (lbe != NULL && lbe->handles(c, breakType)) { 164 return lbe; 165 } 166 } 167 } 168 169 // We didn't find an engine. Create one. 170 lbe = loadEngineFor(c, breakType); 171 if (lbe != NULL) { 172 fEngines->push((void *)lbe, status); 173 } 174 return lbe; 175 } 176 177 const LanguageBreakEngine * 178 ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) { 179 UErrorCode status = U_ZERO_ERROR; 180 UScriptCode code = uscript_getScript(c, &status); 181 if (U_SUCCESS(status)) { 182 DictionaryMatcher *m = loadDictionaryMatcherFor(code, breakType); 183 if (m != NULL) { 184 const LanguageBreakEngine *engine = NULL; 185 switch(code) { 186 case USCRIPT_THAI: 187 engine = new ThaiBreakEngine(m, status); 188 break; 189 case USCRIPT_LAO: 190 engine = new LaoBreakEngine(m, status); 191 break; 192 case USCRIPT_MYANMAR: 193 engine = new BurmeseBreakEngine(m, status); 194 break; 195 case USCRIPT_KHMER: 196 engine = new KhmerBreakEngine(m, status); 197 break; 198 199 #if !UCONFIG_NO_NORMALIZATION 200 // CJK not available w/o normalization 201 case USCRIPT_HANGUL: 202 engine = new CjkBreakEngine(m, kKorean, status); 203 break; 204 205 // use same BreakEngine and dictionary for both Chinese and Japanese 206 case USCRIPT_HIRAGANA: 207 case USCRIPT_KATAKANA: 208 case USCRIPT_HAN: 209 engine = new CjkBreakEngine(m, kChineseJapanese, status); 210 break; 211 #if 0 212 // TODO: Have to get some characters with script=common handled 213 // by CjkBreakEngine (e.g. U+309B). Simply subjecting 214 // them to CjkBreakEngine does not work. The engine has to 215 // special-case them. 216 case USCRIPT_COMMON: 217 { 218 UBlockCode block = ublock_getCode(code); 219 if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA) 220 engine = new CjkBreakEngine(dict, kChineseJapanese, status); 221 break; 222 } 223 #endif 224 #endif 225 226 default: 227 break; 228 } 229 if (engine == NULL) { 230 delete m; 231 } 232 else if (U_FAILURE(status)) { 233 delete engine; 234 engine = NULL; 235 } 236 return engine; 237 } 238 } 239 return NULL; 240 } 241 242 DictionaryMatcher * 243 ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script, int32_t /* brkType */) { 244 UErrorCode status = U_ZERO_ERROR; 245 // open root from brkitr tree. 246 UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status); 247 b = ures_getByKeyWithFallback(b, "dictionaries", b, &status); 248 int32_t dictnlength = 0; 249 const UChar *dictfname = 250 ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status); 251 if (U_FAILURE(status)) { 252 ures_close(b); 253 return NULL; 254 } 255 CharString dictnbuf; 256 CharString ext; 257 const UChar *extStart = u_memrchr(dictfname, 0x002e, dictnlength); // last dot 258 if (extStart != NULL) { 259 int32_t len = (int32_t)(extStart - dictfname); 260 ext.appendInvariantChars(UnicodeString(FALSE, extStart + 1, dictnlength - len - 1), status); 261 dictnlength = len; 262 } 263 dictnbuf.appendInvariantChars(UnicodeString(FALSE, dictfname, dictnlength), status); 264 ures_close(b); 265 266 UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status); 267 if (U_SUCCESS(status)) { 268 // build trie 269 const uint8_t *data = (const uint8_t *)udata_getMemory(file); 270 const int32_t *indexes = (const int32_t *)data; 271 const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET]; 272 const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK; 273 DictionaryMatcher *m = NULL; 274 if (trieType == DictionaryData::TRIE_TYPE_BYTES) { 275 const int32_t transform = indexes[DictionaryData::IX_TRANSFORM]; 276 const char *characters = (const char *)(data + offset); 277 m = new BytesDictionaryMatcher(characters, transform, file); 278 } 279 else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) { 280 const UChar *characters = (const UChar *)(data + offset); 281 m = new UCharsDictionaryMatcher(characters, file); 282 } 283 if (m == NULL) { 284 // no matcher exists to take ownership - either we are an invalid 285 // type or memory allocation failed 286 udata_close(file); 287 } 288 return m; 289 } else if (dictfname != NULL) { 290 // we don't have a dictionary matcher. 291 // returning NULL here will cause us to fail to find a dictionary break engine, as expected 292 status = U_ZERO_ERROR; 293 return NULL; 294 } 295 return NULL; 296 } 297 298 U_NAMESPACE_END 299 300 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 301