1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ************************************************************************************ 5 * Copyright (C) 2006-2016, International Business Machines Corporation 6 * and others. All Rights Reserved. 7 ************************************************************************************ 8 */ 9 10 #include "unicode/utypes.h" 11 12 #if !UCONFIG_NO_BREAK_ITERATION 13 14 #include "unicode/uchar.h" 15 #include "unicode/uniset.h" 16 #include "unicode/chariter.h" 17 #include "unicode/ures.h" 18 #include "unicode/udata.h" 19 #include "unicode/putil.h" 20 #include "unicode/ustring.h" 21 #include "unicode/uscript.h" 22 #include "unicode/ucharstrie.h" 23 #include "unicode/bytestrie.h" 24 25 #include "brkeng.h" 26 #include "cmemory.h" 27 #include "dictbe.h" 28 #include "charstr.h" 29 #include "dictionarydata.h" 30 #include "mutex.h" 31 #include "uvector.h" 32 #include "umutex.h" 33 #include "uresimp.h" 34 #include "ubrkimpl.h" 35 36 U_NAMESPACE_BEGIN 37 38 /* 39 ****************************************************************** 40 */ 41 42 LanguageBreakEngine::LanguageBreakEngine() { 43 } 44 45 LanguageBreakEngine::~LanguageBreakEngine() { 46 } 47 48 /* 49 ****************************************************************** 50 */ 51 52 LanguageBreakFactory::LanguageBreakFactory() { 53 } 54 55 LanguageBreakFactory::~LanguageBreakFactory() { 56 } 57 58 /* 59 ****************************************************************** 60 */ 61 62 UnhandledEngine::UnhandledEngine(UErrorCode &status) : fHandled(nullptr) { 63 (void)status; 64 } 65 66 UnhandledEngine::~UnhandledEngine() { 67 delete fHandled; 68 fHandled = nullptr; 69 } 70 71 UBool 72 UnhandledEngine::handles(UChar32 c) const { 73 return fHandled && fHandled->contains(c); 74 } 75 76 int32_t 77 UnhandledEngine::findBreaks( UText *text, 78 int32_t /* startPos */, 79 int32_t endPos, 80 UVector32 &/*foundBreaks*/ ) const { 81 UChar32 c = utext_current32(text); 82 while((int32_t)utext_getNativeIndex(text) < endPos && fHandled->contains(c)) { 83 utext_next32(text); // TODO: recast loop to work with post-increment operations. 84 c = utext_current32(text); 85 } 86 return 0; 87 } 88 89 void 90 UnhandledEngine::handleCharacter(UChar32 c) { 91 if (fHandled == nullptr) { 92 fHandled = new UnicodeSet(); 93 if (fHandled == nullptr) { 94 return; 95 } 96 } 97 if (!fHandled->contains(c)) { 98 UErrorCode status = U_ZERO_ERROR; 99 // Apply the entire script of the character. 100 int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT); 101 fHandled->applyIntPropertyValue(UCHAR_SCRIPT, script, status); 102 } 103 } 104 105 /* 106 ****************************************************************** 107 */ 108 109 ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) { 110 fEngines = 0; 111 } 112 113 ICULanguageBreakFactory::~ICULanguageBreakFactory() { 114 if (fEngines != 0) { 115 delete fEngines; 116 } 117 } 118 119 U_NAMESPACE_END 120 U_CDECL_BEGIN 121 static void U_CALLCONV _deleteEngine(void *obj) { 122 delete (const icu::LanguageBreakEngine *) obj; 123 } 124 U_CDECL_END 125 U_NAMESPACE_BEGIN 126 127 static UMutex gBreakEngineMutex = U_MUTEX_INITIALIZER; 128 129 const LanguageBreakEngine * 130 ICULanguageBreakFactory::getEngineFor(UChar32 c) { 131 const LanguageBreakEngine *lbe = NULL; 132 UErrorCode status = U_ZERO_ERROR; 133 134 Mutex m(&gBreakEngineMutex); 135 136 if (fEngines == NULL) { 137 UStack *engines = new UStack(_deleteEngine, NULL, status); 138 if (U_FAILURE(status) || engines == NULL) { 139 // Note: no way to return error code to caller. 140 delete engines; 141 return NULL; 142 } 143 fEngines = engines; 144 } else { 145 int32_t i = fEngines->size(); 146 while (--i >= 0) { 147 lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i)); 148 if (lbe != NULL && lbe->handles(c)) { 149 return lbe; 150 } 151 } 152 } 153 154 // We didn't find an engine. Create one. 155 lbe = loadEngineFor(c); 156 if (lbe != NULL) { 157 fEngines->push((void *)lbe, status); 158 } 159 return lbe; 160 } 161 162 const LanguageBreakEngine * 163 ICULanguageBreakFactory::loadEngineFor(UChar32 c) { 164 UErrorCode status = U_ZERO_ERROR; 165 UScriptCode code = uscript_getScript(c, &status); 166 if (U_SUCCESS(status)) { 167 DictionaryMatcher *m = loadDictionaryMatcherFor(code); 168 if (m != NULL) { 169 const LanguageBreakEngine *engine = NULL; 170 switch(code) { 171 case USCRIPT_THAI: 172 engine = new ThaiBreakEngine(m, status); 173 break; 174 case USCRIPT_LAO: 175 engine = new LaoBreakEngine(m, status); 176 break; 177 case USCRIPT_MYANMAR: 178 engine = new BurmeseBreakEngine(m, status); 179 break; 180 case USCRIPT_KHMER: 181 engine = new KhmerBreakEngine(m, status); 182 break; 183 184 #if !UCONFIG_NO_NORMALIZATION 185 // CJK not available w/o normalization 186 case USCRIPT_HANGUL: 187 engine = new CjkBreakEngine(m, kKorean, status); 188 break; 189 190 // use same BreakEngine and dictionary for both Chinese and Japanese 191 case USCRIPT_HIRAGANA: 192 case USCRIPT_KATAKANA: 193 case USCRIPT_HAN: 194 engine = new CjkBreakEngine(m, kChineseJapanese, status); 195 break; 196 #if 0 197 // TODO: Have to get some characters with script=common handled 198 // by CjkBreakEngine (e.g. U+309B). Simply subjecting 199 // them to CjkBreakEngine does not work. The engine has to 200 // special-case them. 201 case USCRIPT_COMMON: 202 { 203 UBlockCode block = ublock_getCode(code); 204 if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA) 205 engine = new CjkBreakEngine(dict, kChineseJapanese, status); 206 break; 207 } 208 #endif 209 #endif 210 211 default: 212 break; 213 } 214 if (engine == NULL) { 215 delete m; 216 } 217 else if (U_FAILURE(status)) { 218 delete engine; 219 engine = NULL; 220 } 221 return engine; 222 } 223 } 224 return NULL; 225 } 226 227 DictionaryMatcher * 228 ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script) { 229 UErrorCode status = U_ZERO_ERROR; 230 // open root from brkitr tree. 231 UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status); 232 b = ures_getByKeyWithFallback(b, "dictionaries", b, &status); 233 int32_t dictnlength = 0; 234 const UChar *dictfname = 235 ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status); 236 if (U_FAILURE(status)) { 237 ures_close(b); 238 return NULL; 239 } 240 CharString dictnbuf; 241 CharString ext; 242 const UChar *extStart = u_memrchr(dictfname, 0x002e, dictnlength); // last dot 243 if (extStart != NULL) { 244 int32_t len = (int32_t)(extStart - dictfname); 245 ext.appendInvariantChars(UnicodeString(FALSE, extStart + 1, dictnlength - len - 1), status); 246 dictnlength = len; 247 } 248 dictnbuf.appendInvariantChars(UnicodeString(FALSE, dictfname, dictnlength), status); 249 ures_close(b); 250 251 UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status); 252 if (U_SUCCESS(status)) { 253 // build trie 254 const uint8_t *data = (const uint8_t *)udata_getMemory(file); 255 const int32_t *indexes = (const int32_t *)data; 256 const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET]; 257 const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK; 258 DictionaryMatcher *m = NULL; 259 if (trieType == DictionaryData::TRIE_TYPE_BYTES) { 260 const int32_t transform = indexes[DictionaryData::IX_TRANSFORM]; 261 const char *characters = (const char *)(data + offset); 262 m = new BytesDictionaryMatcher(characters, transform, file); 263 } 264 else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) { 265 const UChar *characters = (const UChar *)(data + offset); 266 m = new UCharsDictionaryMatcher(characters, file); 267 } 268 if (m == NULL) { 269 // no matcher exists to take ownership - either we are an invalid 270 // type or memory allocation failed 271 udata_close(file); 272 } 273 return m; 274 } else if (dictfname != NULL) { 275 // we don't have a dictionary matcher. 276 // returning NULL here will cause us to fail to find a dictionary break engine, as expected 277 status = U_ZERO_ERROR; 278 return NULL; 279 } 280 return NULL; 281 } 282 283 U_NAMESPACE_END 284 285 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 286