1 /* 2 ************************************************************************************ 3 * Copyright (C) 2006-2012, International Business Machines Corporation 4 * and others. All Rights Reserved. 5 ************************************************************************************ 6 */ 7 8 #include "unicode/utypes.h" 9 10 #if !UCONFIG_NO_BREAK_ITERATION 11 12 #include "brkeng.h" 13 #include "dictbe.h" 14 #include "unicode/uchar.h" 15 #include "unicode/uniset.h" 16 #include "unicode/chariter.h" 17 #include "unicode/ures.h" 18 #include "unicode/udata.h" 19 #include "unicode/putil.h" 20 #include "unicode/ustring.h" 21 #include "unicode/uscript.h" 22 #include "unicode/ucharstrie.h" 23 #include "unicode/bytestrie.h" 24 #include "charstr.h" 25 #include "dictionarydata.h" 26 #include "uvector.h" 27 #include "umutex.h" 28 #include "uresimp.h" 29 #include "ubrkimpl.h" 30 31 U_NAMESPACE_BEGIN 32 33 /* 34 ****************************************************************** 35 */ 36 37 LanguageBreakEngine::LanguageBreakEngine() { 38 } 39 40 LanguageBreakEngine::~LanguageBreakEngine() { 41 } 42 43 /* 44 ****************************************************************** 45 */ 46 47 LanguageBreakFactory::LanguageBreakFactory() { 48 } 49 50 LanguageBreakFactory::~LanguageBreakFactory() { 51 } 52 53 /* 54 ****************************************************************** 55 */ 56 57 UnhandledEngine::UnhandledEngine(UErrorCode &/*status*/) { 58 for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) { 59 fHandled[i] = 0; 60 } 61 } 62 63 UnhandledEngine::~UnhandledEngine() { 64 for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) { 65 if (fHandled[i] != 0) { 66 delete fHandled[i]; 67 } 68 } 69 } 70 71 UBool 72 UnhandledEngine::handles(UChar32 c, int32_t breakType) const { 73 return (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])) 74 && fHandled[breakType] != 0 && fHandled[breakType]->contains(c)); 75 } 76 77 int32_t 78 UnhandledEngine::findBreaks( UText *text, 79 int32_t startPos, 80 int32_t endPos, 81 UBool reverse, 82 int32_t breakType, 83 UStack &/*foundBreaks*/ ) const { 84 if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) { 85 UChar32 c = utext_current32(text); 86 if (reverse) { 87 while((int32_t)utext_getNativeIndex(text) > startPos && fHandled[breakType]->contains(c)) { 88 c = utext_previous32(text); 89 } 90 } 91 else { 92 while((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) { 93 utext_next32(text); // TODO: recast loop to work with post-increment operations. 94 c = utext_current32(text); 95 } 96 } 97 } 98 return 0; 99 } 100 101 void 102 UnhandledEngine::handleCharacter(UChar32 c, int32_t breakType) { 103 if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) { 104 if (fHandled[breakType] == 0) { 105 fHandled[breakType] = new UnicodeSet(); 106 if (fHandled[breakType] == 0) { 107 return; 108 } 109 } 110 if (!fHandled[breakType]->contains(c)) { 111 UErrorCode status = U_ZERO_ERROR; 112 // Apply the entire script of the character. 113 int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT); 114 fHandled[breakType]->applyIntPropertyValue(UCHAR_SCRIPT, script, status); 115 } 116 } 117 } 118 119 /* 120 ****************************************************************** 121 */ 122 123 ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) { 124 fEngines = 0; 125 } 126 127 ICULanguageBreakFactory::~ICULanguageBreakFactory() { 128 if (fEngines != 0) { 129 delete fEngines; 130 } 131 } 132 133 U_NAMESPACE_END 134 U_CDECL_BEGIN 135 static void U_CALLCONV _deleteEngine(void *obj) { 136 delete (const icu::LanguageBreakEngine *) obj; 137 } 138 U_CDECL_END 139 U_NAMESPACE_BEGIN 140 141 const LanguageBreakEngine * 142 ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) { 143 UBool needsInit; 144 int32_t i; 145 const LanguageBreakEngine *lbe = NULL; 146 UErrorCode status = U_ZERO_ERROR; 147 148 // TODO: The global mutex should not be used. 149 // The global mutex should only be used for short periods. 150 // A ICULanguageBreakFactory specific mutex should be used. 151 umtx_lock(NULL); 152 needsInit = (UBool)(fEngines == NULL); 153 if (!needsInit) { 154 i = fEngines->size(); 155 while (--i >= 0) { 156 lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i)); 157 if (lbe != NULL && lbe->handles(c, breakType)) { 158 break; 159 } 160 lbe = NULL; 161 } 162 } 163 umtx_unlock(NULL); 164 165 if (lbe != NULL) { 166 return lbe; 167 } 168 169 if (needsInit) { 170 UStack *engines = new UStack(_deleteEngine, NULL, status); 171 if (U_SUCCESS(status) && engines == NULL) { 172 status = U_MEMORY_ALLOCATION_ERROR; 173 } 174 else if (U_FAILURE(status)) { 175 delete engines; 176 engines = NULL; 177 } 178 else { 179 umtx_lock(NULL); 180 if (fEngines == NULL) { 181 fEngines = engines; 182 engines = NULL; 183 } 184 umtx_unlock(NULL); 185 delete engines; 186 } 187 } 188 189 if (fEngines == NULL) { 190 return NULL; 191 } 192 193 // We didn't find an engine the first time through, or there was no 194 // stack. Create an engine. 195 const LanguageBreakEngine *newlbe = loadEngineFor(c, breakType); 196 197 // Now get the lock, and see if someone else has created it in the 198 // meantime 199 umtx_lock(NULL); 200 i = fEngines->size(); 201 while (--i >= 0) { 202 lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i)); 203 if (lbe != NULL && lbe->handles(c, breakType)) { 204 break; 205 } 206 lbe = NULL; 207 } 208 if (lbe == NULL && newlbe != NULL) { 209 fEngines->push((void *)newlbe, status); 210 lbe = newlbe; 211 newlbe = NULL; 212 } 213 umtx_unlock(NULL); 214 215 delete newlbe; 216 217 return lbe; 218 } 219 220 const LanguageBreakEngine * 221 ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) { 222 UErrorCode status = U_ZERO_ERROR; 223 UScriptCode code = uscript_getScript(c, &status); 224 if (U_SUCCESS(status)) { 225 DictionaryMatcher *m = loadDictionaryMatcherFor(code, breakType); 226 if (m != NULL) { 227 const LanguageBreakEngine *engine = NULL; 228 switch(code) { 229 case USCRIPT_THAI: 230 engine = new ThaiBreakEngine(m, status); 231 break; 232 case USCRIPT_KHMER: 233 engine = new KhmerBreakEngine(m, status); 234 break; 235 236 #if !UCONFIG_NO_NORMALIZATION 237 // CJK not available w/o normalization 238 case USCRIPT_HANGUL: 239 engine = new CjkBreakEngine(m, kKorean, status); 240 break; 241 242 // use same BreakEngine and dictionary for both Chinese and Japanese 243 case USCRIPT_HIRAGANA: 244 case USCRIPT_KATAKANA: 245 case USCRIPT_HAN: 246 engine = new CjkBreakEngine(m, kChineseJapanese, status); 247 break; 248 #if 0 249 // TODO: Have to get some characters with script=common handled 250 // by CjkBreakEngine (e.g. U+309B). Simply subjecting 251 // them to CjkBreakEngine does not work. The engine has to 252 // special-case them. 253 case USCRIPT_COMMON: 254 { 255 UBlockCode block = ublock_getCode(code); 256 if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA) 257 engine = new CjkBreakEngine(dict, kChineseJapanese, status); 258 break; 259 } 260 #endif 261 #endif 262 263 default: 264 break; 265 } 266 if (engine == NULL) { 267 delete m; 268 } 269 else if (U_FAILURE(status)) { 270 delete engine; 271 engine = NULL; 272 } 273 return engine; 274 } 275 } 276 return NULL; 277 } 278 279 DictionaryMatcher * 280 ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script, int32_t /* brkType */) { 281 UErrorCode status = U_ZERO_ERROR; 282 // open root from brkitr tree. 283 UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status); 284 b = ures_getByKeyWithFallback(b, "dictionaries", b, &status); 285 int32_t dictnlength = 0; 286 const UChar *dictfname = 287 ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status); 288 if (U_FAILURE(status)) { 289 ures_close(b); 290 return NULL; 291 } 292 CharString dictnbuf; 293 CharString ext; 294 const UChar *extStart = u_memrchr(dictfname, 0x002e, dictnlength); // last dot 295 if (extStart != NULL) { 296 int32_t len = (int32_t)(extStart - dictfname); 297 ext.appendInvariantChars(UnicodeString(FALSE, extStart + 1, dictnlength - len - 1), status); 298 dictnlength = len; 299 } 300 dictnbuf.appendInvariantChars(UnicodeString(FALSE, dictfname, dictnlength), status); 301 ures_close(b); 302 303 UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status); 304 if (U_SUCCESS(status)) { 305 // build trie 306 const uint8_t *data = (const uint8_t *)udata_getMemory(file); 307 const int32_t *indexes = (const int32_t *)data; 308 const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET]; 309 const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK; 310 DictionaryMatcher *m = NULL; 311 if (trieType == DictionaryData::TRIE_TYPE_BYTES) { 312 const int32_t transform = indexes[DictionaryData::IX_TRANSFORM]; 313 const char *characters = (const char *)(data + offset); 314 m = new BytesDictionaryMatcher(characters, transform, file); 315 } 316 else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) { 317 const UChar *characters = (const UChar *)(data + offset); 318 m = new UCharsDictionaryMatcher(characters, file); 319 } 320 if (m == NULL) { 321 // no matcher exists to take ownership - either we are an invalid 322 // type or memory allocation failed 323 udata_close(file); 324 } 325 return m; 326 } else if (dictfname != NULL) { 327 // we don't have a dictionary matcher. 328 // returning NULL here will cause us to fail to find a dictionary break engine, as expected 329 status = U_ZERO_ERROR; 330 return NULL; 331 } 332 return NULL; 333 } 334 335 U_NAMESPACE_END 336 337 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 338