1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ************************************************************************************ 5 * Copyright (C) 2006-2016, International Business Machines Corporation 6 * and others. All Rights Reserved. 7 ************************************************************************************ 8 */ 9 10 #include "unicode/utypes.h" 11 12 #if !UCONFIG_NO_BREAK_ITERATION 13 14 #include "brkeng.h" 15 #include "cmemory.h" 16 #include "dictbe.h" 17 #include "unicode/uchar.h" 18 #include "unicode/uniset.h" 19 #include "unicode/chariter.h" 20 #include "unicode/ures.h" 21 #include "unicode/udata.h" 22 #include "unicode/putil.h" 23 #include "unicode/ustring.h" 24 #include "unicode/uscript.h" 25 #include "unicode/ucharstrie.h" 26 #include "unicode/bytestrie.h" 27 #include "charstr.h" 28 #include "dictionarydata.h" 29 #include "mutex.h" 30 #include "uvector.h" 31 #include "umutex.h" 32 #include "uresimp.h" 33 #include "ubrkimpl.h" 34 35 U_NAMESPACE_BEGIN 36 37 /* 38 ****************************************************************** 39 */ 40 41 LanguageBreakEngine::LanguageBreakEngine() { 42 } 43 44 LanguageBreakEngine::~LanguageBreakEngine() { 45 } 46 47 /* 48 ****************************************************************** 49 */ 50 51 LanguageBreakFactory::LanguageBreakFactory() { 52 } 53 54 LanguageBreakFactory::~LanguageBreakFactory() { 55 } 56 57 /* 58 ****************************************************************** 59 */ 60 61 UnhandledEngine::UnhandledEngine(UErrorCode &/*status*/) { 62 for (int32_t i = 0; i < UPRV_LENGTHOF(fHandled); ++i) { 63 fHandled[i] = 0; 64 } 65 } 66 67 UnhandledEngine::~UnhandledEngine() { 68 for (int32_t i = 0; i < UPRV_LENGTHOF(fHandled); ++i) { 69 if (fHandled[i] != 0) { 70 delete fHandled[i]; 71 } 72 } 73 } 74 75 UBool 76 UnhandledEngine::handles(UChar32 c, int32_t breakType) const { 77 return (breakType >= 0 && breakType < UPRV_LENGTHOF(fHandled) 78 && fHandled[breakType] != 0 && fHandled[breakType]->contains(c)); 79 } 80 81 int32_t 82 UnhandledEngine::findBreaks( UText *text, 83 int32_t startPos, 84 int32_t endPos, 85 UBool reverse, 86 int32_t breakType, 87 UStack &/*foundBreaks*/ ) const { 88 if (breakType >= 0 && breakType < UPRV_LENGTHOF(fHandled)) { 89 UChar32 c = utext_current32(text); 90 if (reverse) { 91 while((int32_t)utext_getNativeIndex(text) > startPos && fHandled[breakType]->contains(c)) { 92 c = utext_previous32(text); 93 } 94 } 95 else { 96 while((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) { 97 utext_next32(text); // TODO: recast loop to work with post-increment operations. 98 c = utext_current32(text); 99 } 100 } 101 } 102 return 0; 103 } 104 105 void 106 UnhandledEngine::handleCharacter(UChar32 c, int32_t breakType) { 107 if (breakType >= 0 && breakType < UPRV_LENGTHOF(fHandled)) { 108 if (fHandled[breakType] == 0) { 109 fHandled[breakType] = new UnicodeSet(); 110 if (fHandled[breakType] == 0) { 111 return; 112 } 113 } 114 if (!fHandled[breakType]->contains(c)) { 115 UErrorCode status = U_ZERO_ERROR; 116 // Apply the entire script of the character. 117 int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT); 118 fHandled[breakType]->applyIntPropertyValue(UCHAR_SCRIPT, script, status); 119 } 120 } 121 } 122 123 /* 124 ****************************************************************** 125 */ 126 127 ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) { 128 fEngines = 0; 129 } 130 131 ICULanguageBreakFactory::~ICULanguageBreakFactory() { 132 if (fEngines != 0) { 133 delete fEngines; 134 } 135 } 136 137 U_NAMESPACE_END 138 U_CDECL_BEGIN 139 static void U_CALLCONV _deleteEngine(void *obj) { 140 delete (const icu::LanguageBreakEngine *) obj; 141 } 142 U_CDECL_END 143 U_NAMESPACE_BEGIN 144 145 static UMutex gBreakEngineMutex = U_MUTEX_INITIALIZER; 146 147 const LanguageBreakEngine * 148 ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) { 149 const LanguageBreakEngine *lbe = NULL; 150 UErrorCode status = U_ZERO_ERROR; 151 152 Mutex m(&gBreakEngineMutex); 153 154 if (fEngines == NULL) { 155 UStack *engines = new UStack(_deleteEngine, NULL, status); 156 if (U_FAILURE(status) || engines == NULL) { 157 // Note: no way to return error code to caller. 158 delete engines; 159 return NULL; 160 } 161 fEngines = engines; 162 } else { 163 int32_t i = fEngines->size(); 164 while (--i >= 0) { 165 lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i)); 166 if (lbe != NULL && lbe->handles(c, breakType)) { 167 return lbe; 168 } 169 } 170 } 171 172 // We didn't find an engine. Create one. 173 lbe = loadEngineFor(c, breakType); 174 if (lbe != NULL) { 175 fEngines->push((void *)lbe, status); 176 } 177 return lbe; 178 } 179 180 const LanguageBreakEngine * 181 ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) { 182 UErrorCode status = U_ZERO_ERROR; 183 UScriptCode code = uscript_getScript(c, &status); 184 if (U_SUCCESS(status)) { 185 DictionaryMatcher *m = loadDictionaryMatcherFor(code, breakType); 186 if (m != NULL) { 187 const LanguageBreakEngine *engine = NULL; 188 switch(code) { 189 case USCRIPT_THAI: 190 engine = new ThaiBreakEngine(m, status); 191 break; 192 case USCRIPT_LAO: 193 engine = new LaoBreakEngine(m, status); 194 break; 195 case USCRIPT_MYANMAR: 196 engine = new BurmeseBreakEngine(m, status); 197 break; 198 case USCRIPT_KHMER: 199 engine = new KhmerBreakEngine(m, status); 200 break; 201 202 #if !UCONFIG_NO_NORMALIZATION 203 // CJK not available w/o normalization 204 case USCRIPT_HANGUL: 205 engine = new CjkBreakEngine(m, kKorean, status); 206 break; 207 208 // use same BreakEngine and dictionary for both Chinese and Japanese 209 case USCRIPT_HIRAGANA: 210 case USCRIPT_KATAKANA: 211 case USCRIPT_HAN: 212 engine = new CjkBreakEngine(m, kChineseJapanese, status); 213 break; 214 #if 0 215 // TODO: Have to get some characters with script=common handled 216 // by CjkBreakEngine (e.g. U+309B). Simply subjecting 217 // them to CjkBreakEngine does not work. The engine has to 218 // special-case them. 219 case USCRIPT_COMMON: 220 { 221 UBlockCode block = ublock_getCode(code); 222 if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA) 223 engine = new CjkBreakEngine(dict, kChineseJapanese, status); 224 break; 225 } 226 #endif 227 #endif 228 229 default: 230 break; 231 } 232 if (engine == NULL) { 233 delete m; 234 } 235 else if (U_FAILURE(status)) { 236 delete engine; 237 engine = NULL; 238 } 239 return engine; 240 } 241 } 242 return NULL; 243 } 244 245 DictionaryMatcher * 246 ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script, int32_t /* brkType */) { 247 UErrorCode status = U_ZERO_ERROR; 248 // open root from brkitr tree. 249 UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status); 250 b = ures_getByKeyWithFallback(b, "dictionaries", b, &status); 251 int32_t dictnlength = 0; 252 const UChar *dictfname = 253 ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status); 254 if (U_FAILURE(status)) { 255 ures_close(b); 256 return NULL; 257 } 258 CharString dictnbuf; 259 CharString ext; 260 const UChar *extStart = u_memrchr(dictfname, 0x002e, dictnlength); // last dot 261 if (extStart != NULL) { 262 int32_t len = (int32_t)(extStart - dictfname); 263 ext.appendInvariantChars(UnicodeString(FALSE, extStart + 1, dictnlength - len - 1), status); 264 dictnlength = len; 265 } 266 dictnbuf.appendInvariantChars(UnicodeString(FALSE, dictfname, dictnlength), status); 267 ures_close(b); 268 269 UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status); 270 if (U_SUCCESS(status)) { 271 // build trie 272 const uint8_t *data = (const uint8_t *)udata_getMemory(file); 273 const int32_t *indexes = (const int32_t *)data; 274 const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET]; 275 const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK; 276 DictionaryMatcher *m = NULL; 277 if (trieType == DictionaryData::TRIE_TYPE_BYTES) { 278 const int32_t transform = indexes[DictionaryData::IX_TRANSFORM]; 279 const char *characters = (const char *)(data + offset); 280 m = new BytesDictionaryMatcher(characters, transform, file); 281 } 282 else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) { 283 const UChar *characters = (const UChar *)(data + offset); 284 m = new UCharsDictionaryMatcher(characters, file); 285 } 286 if (m == NULL) { 287 // no matcher exists to take ownership - either we are an invalid 288 // type or memory allocation failed 289 udata_close(file); 290 } 291 return m; 292 } else if (dictfname != NULL) { 293 // we don't have a dictionary matcher. 294 // returning NULL here will cause us to fail to find a dictionary break engine, as expected 295 status = U_ZERO_ERROR; 296 return NULL; 297 } 298 return NULL; 299 } 300 301 U_NAMESPACE_END 302 303 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 304