1 /* 2 ************************************************************************************ 3 * Copyright (C) 2006-2013, International Business Machines Corporation 4 * and others. All Rights Reserved. 5 ************************************************************************************ 6 */ 7 8 #include "unicode/utypes.h" 9 10 #if !UCONFIG_NO_BREAK_ITERATION 11 12 #include "brkeng.h" 13 #include "dictbe.h" 14 #include "unicode/uchar.h" 15 #include "unicode/uniset.h" 16 #include "unicode/chariter.h" 17 #include "unicode/ures.h" 18 #include "unicode/udata.h" 19 #include "unicode/putil.h" 20 #include "unicode/ustring.h" 21 #include "unicode/uscript.h" 22 #include "unicode/ucharstrie.h" 23 #include "unicode/bytestrie.h" 24 #include "charstr.h" 25 #include "dictionarydata.h" 26 #include "uvector.h" 27 #include "umutex.h" 28 #include "uresimp.h" 29 #include "ubrkimpl.h" 30 31 U_NAMESPACE_BEGIN 32 33 /* 34 ****************************************************************** 35 */ 36 37 LanguageBreakEngine::LanguageBreakEngine() { 38 } 39 40 LanguageBreakEngine::~LanguageBreakEngine() { 41 } 42 43 /* 44 ****************************************************************** 45 */ 46 47 LanguageBreakFactory::LanguageBreakFactory() { 48 } 49 50 LanguageBreakFactory::~LanguageBreakFactory() { 51 } 52 53 /* 54 ****************************************************************** 55 */ 56 57 UnhandledEngine::UnhandledEngine(UErrorCode &/*status*/) { 58 for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) { 59 fHandled[i] = 0; 60 } 61 } 62 63 UnhandledEngine::~UnhandledEngine() { 64 for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) { 65 if (fHandled[i] != 0) { 66 delete fHandled[i]; 67 } 68 } 69 } 70 71 UBool 72 UnhandledEngine::handles(UChar32 c, int32_t breakType) const { 73 return (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])) 74 && fHandled[breakType] != 0 && fHandled[breakType]->contains(c)); 75 } 76 77 int32_t 78 UnhandledEngine::findBreaks( UText *text, 79 int32_t startPos, 80 int32_t endPos, 81 UBool reverse, 82 int32_t breakType, 83 UStack &/*foundBreaks*/ ) const { 84 if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) { 85 UChar32 c = utext_current32(text); 86 if (reverse) { 87 while((int32_t)utext_getNativeIndex(text) > startPos && fHandled[breakType]->contains(c)) { 88 c = utext_previous32(text); 89 } 90 } 91 else { 92 while((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) { 93 utext_next32(text); // TODO: recast loop to work with post-increment operations. 94 c = utext_current32(text); 95 } 96 } 97 } 98 return 0; 99 } 100 101 void 102 UnhandledEngine::handleCharacter(UChar32 c, int32_t breakType) { 103 if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) { 104 if (fHandled[breakType] == 0) { 105 fHandled[breakType] = new UnicodeSet(); 106 if (fHandled[breakType] == 0) { 107 return; 108 } 109 } 110 if (!fHandled[breakType]->contains(c)) { 111 UErrorCode status = U_ZERO_ERROR; 112 // Apply the entire script of the character. 113 int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT); 114 fHandled[breakType]->applyIntPropertyValue(UCHAR_SCRIPT, script, status); 115 } 116 } 117 } 118 119 /* 120 ****************************************************************** 121 */ 122 123 ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) { 124 fEngines = 0; 125 } 126 127 ICULanguageBreakFactory::~ICULanguageBreakFactory() { 128 if (fEngines != 0) { 129 delete fEngines; 130 } 131 } 132 133 U_NAMESPACE_END 134 U_CDECL_BEGIN 135 static void U_CALLCONV _deleteEngine(void *obj) { 136 delete (const icu::LanguageBreakEngine *) obj; 137 } 138 U_CDECL_END 139 U_NAMESPACE_BEGIN 140 141 const LanguageBreakEngine * 142 ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) { 143 UBool needsInit; 144 int32_t i; 145 const LanguageBreakEngine *lbe = NULL; 146 UErrorCode status = U_ZERO_ERROR; 147 148 // TODO: The global mutex should not be used. 149 // The global mutex should only be used for short periods. 150 // A ICULanguageBreakFactory specific mutex should be used. 151 umtx_lock(NULL); 152 needsInit = (UBool)(fEngines == NULL); 153 if (!needsInit) { 154 i = fEngines->size(); 155 while (--i >= 0) { 156 lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i)); 157 if (lbe != NULL && lbe->handles(c, breakType)) { 158 break; 159 } 160 lbe = NULL; 161 } 162 } 163 umtx_unlock(NULL); 164 165 if (lbe != NULL) { 166 return lbe; 167 } 168 169 if (needsInit) { 170 UStack *engines = new UStack(_deleteEngine, NULL, status); 171 if (U_SUCCESS(status) && engines == NULL) { 172 status = U_MEMORY_ALLOCATION_ERROR; 173 } 174 else if (U_FAILURE(status)) { 175 delete engines; 176 engines = NULL; 177 } 178 else { 179 umtx_lock(NULL); 180 if (fEngines == NULL) { 181 fEngines = engines; 182 engines = NULL; 183 } 184 umtx_unlock(NULL); 185 delete engines; 186 } 187 } 188 189 if (fEngines == NULL) { 190 return NULL; 191 } 192 193 // We didn't find an engine the first time through, or there was no 194 // stack. Create an engine. 195 const LanguageBreakEngine *newlbe = loadEngineFor(c, breakType); 196 197 // Now get the lock, and see if someone else has created it in the 198 // meantime 199 umtx_lock(NULL); 200 i = fEngines->size(); 201 while (--i >= 0) { 202 lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i)); 203 if (lbe != NULL && lbe->handles(c, breakType)) { 204 break; 205 } 206 lbe = NULL; 207 } 208 if (lbe == NULL && newlbe != NULL) { 209 fEngines->push((void *)newlbe, status); 210 lbe = newlbe; 211 newlbe = NULL; 212 } 213 umtx_unlock(NULL); 214 215 delete newlbe; 216 217 return lbe; 218 } 219 220 const LanguageBreakEngine * 221 ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) { 222 UErrorCode status = U_ZERO_ERROR; 223 UScriptCode code = uscript_getScript(c, &status); 224 if (U_SUCCESS(status)) { 225 DictionaryMatcher *m = loadDictionaryMatcherFor(code, breakType); 226 if (m != NULL) { 227 const LanguageBreakEngine *engine = NULL; 228 switch(code) { 229 case USCRIPT_THAI: 230 engine = new ThaiBreakEngine(m, status); 231 break; 232 case USCRIPT_LAO: 233 engine = new LaoBreakEngine(m, status); 234 break; 235 case USCRIPT_KHMER: 236 engine = new KhmerBreakEngine(m, status); 237 break; 238 239 #if !UCONFIG_NO_NORMALIZATION 240 // CJK not available w/o normalization 241 case USCRIPT_HANGUL: 242 engine = new CjkBreakEngine(m, kKorean, status); 243 break; 244 245 // use same BreakEngine and dictionary for both Chinese and Japanese 246 case USCRIPT_HIRAGANA: 247 case USCRIPT_KATAKANA: 248 case USCRIPT_HAN: 249 engine = new CjkBreakEngine(m, kChineseJapanese, status); 250 break; 251 #if 0 252 // TODO: Have to get some characters with script=common handled 253 // by CjkBreakEngine (e.g. U+309B). Simply subjecting 254 // them to CjkBreakEngine does not work. The engine has to 255 // special-case them. 256 case USCRIPT_COMMON: 257 { 258 UBlockCode block = ublock_getCode(code); 259 if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA) 260 engine = new CjkBreakEngine(dict, kChineseJapanese, status); 261 break; 262 } 263 #endif 264 #endif 265 266 default: 267 break; 268 } 269 if (engine == NULL) { 270 delete m; 271 } 272 else if (U_FAILURE(status)) { 273 delete engine; 274 engine = NULL; 275 } 276 return engine; 277 } 278 } 279 return NULL; 280 } 281 282 DictionaryMatcher * 283 ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script, int32_t /* brkType */) { 284 UErrorCode status = U_ZERO_ERROR; 285 // open root from brkitr tree. 286 UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status); 287 b = ures_getByKeyWithFallback(b, "dictionaries", b, &status); 288 int32_t dictnlength = 0; 289 const UChar *dictfname = 290 ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status); 291 if (U_FAILURE(status)) { 292 ures_close(b); 293 return NULL; 294 } 295 CharString dictnbuf; 296 CharString ext; 297 const UChar *extStart = u_memrchr(dictfname, 0x002e, dictnlength); // last dot 298 if (extStart != NULL) { 299 int32_t len = (int32_t)(extStart - dictfname); 300 ext.appendInvariantChars(UnicodeString(FALSE, extStart + 1, dictnlength - len - 1), status); 301 dictnlength = len; 302 } 303 dictnbuf.appendInvariantChars(UnicodeString(FALSE, dictfname, dictnlength), status); 304 ures_close(b); 305 306 UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status); 307 if (U_SUCCESS(status)) { 308 // build trie 309 const uint8_t *data = (const uint8_t *)udata_getMemory(file); 310 const int32_t *indexes = (const int32_t *)data; 311 const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET]; 312 const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK; 313 DictionaryMatcher *m = NULL; 314 if (trieType == DictionaryData::TRIE_TYPE_BYTES) { 315 const int32_t transform = indexes[DictionaryData::IX_TRANSFORM]; 316 const char *characters = (const char *)(data + offset); 317 m = new BytesDictionaryMatcher(characters, transform, file); 318 } 319 else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) { 320 const UChar *characters = (const UChar *)(data + offset); 321 m = new UCharsDictionaryMatcher(characters, file); 322 } 323 if (m == NULL) { 324 // no matcher exists to take ownership - either we are an invalid 325 // type or memory allocation failed 326 udata_close(file); 327 } 328 return m; 329 } else if (dictfname != NULL) { 330 // we don't have a dictionary matcher. 331 // returning NULL here will cause us to fail to find a dictionary break engine, as expected 332 status = U_ZERO_ERROR; 333 return NULL; 334 } 335 return NULL; 336 } 337 338 U_NAMESPACE_END 339 340 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 341