1 /** 2 ************************************************************************************ 3 * Copyright (C) 2006-2009, International Business Machines Corporation and others. * 4 * All Rights Reserved. * 5 ************************************************************************************ 6 */ 7 8 #include "unicode/utypes.h" 9 10 #if !UCONFIG_NO_BREAK_ITERATION 11 12 #include "brkeng.h" 13 #include "dictbe.h" 14 #include "triedict.h" 15 #include "unicode/uchar.h" 16 #include "unicode/uniset.h" 17 #include "unicode/chariter.h" 18 #include "unicode/ures.h" 19 #include "unicode/udata.h" 20 #include "unicode/putil.h" 21 #include "unicode/ustring.h" 22 #include "unicode/uscript.h" 23 #include "uvector.h" 24 #include "umutex.h" 25 #include "uresimp.h" 26 #include "ubrkimpl.h" 27 28 U_NAMESPACE_BEGIN 29 30 /* 31 ****************************************************************** 32 */ 33 34 LanguageBreakEngine::LanguageBreakEngine() { 35 } 36 37 LanguageBreakEngine::~LanguageBreakEngine() { 38 } 39 40 /* 41 ****************************************************************** 42 */ 43 44 LanguageBreakFactory::LanguageBreakFactory() { 45 } 46 47 LanguageBreakFactory::~LanguageBreakFactory() { 48 } 49 50 /* 51 ****************************************************************** 52 */ 53 54 UnhandledEngine::UnhandledEngine(UErrorCode &/*status*/) { 55 for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) { 56 fHandled[i] = 0; 57 } 58 } 59 60 UnhandledEngine::~UnhandledEngine() { 61 for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) { 62 if (fHandled[i] != 0) { 63 delete fHandled[i]; 64 } 65 } 66 } 67 68 UBool 69 UnhandledEngine::handles(UChar32 c, int32_t breakType) const { 70 return (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])) 71 && fHandled[breakType] != 0 && fHandled[breakType]->contains(c)); 72 } 73 74 int32_t 75 UnhandledEngine::findBreaks( UText *text, 76 int32_t startPos, 77 int32_t endPos, 78 UBool reverse, 79 int32_t breakType, 80 UStack &/*foundBreaks*/ ) const { 81 if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) { 82 UChar32 c = utext_current32(text); 83 if (reverse) { 84 while((int32_t)utext_getNativeIndex(text) > startPos && fHandled[breakType]->contains(c)) { 85 c = utext_previous32(text); 86 } 87 } 88 else { 89 while((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) { 90 utext_next32(text); // TODO: recast loop to work with post-increment operations. 91 c = utext_current32(text); 92 } 93 } 94 } 95 return 0; 96 } 97 98 void 99 UnhandledEngine::handleCharacter(UChar32 c, int32_t breakType) { 100 if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) { 101 if (fHandled[breakType] == 0) { 102 fHandled[breakType] = new UnicodeSet(); 103 if (fHandled[breakType] == 0) { 104 return; 105 } 106 } 107 if (!fHandled[breakType]->contains(c)) { 108 UErrorCode status = U_ZERO_ERROR; 109 // Apply the entire script of the character. 110 int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT); 111 fHandled[breakType]->applyIntPropertyValue(UCHAR_SCRIPT, script, status); 112 } 113 } 114 } 115 116 /* 117 ****************************************************************** 118 */ 119 120 ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) { 121 fEngines = 0; 122 } 123 124 ICULanguageBreakFactory::~ICULanguageBreakFactory() { 125 if (fEngines != 0) { 126 delete fEngines; 127 } 128 } 129 130 U_NAMESPACE_END 131 U_CDECL_BEGIN 132 static void U_CALLCONV _deleteEngine(void *obj) { 133 delete (const U_NAMESPACE_QUALIFIER LanguageBreakEngine *) obj; 134 } 135 U_CDECL_END 136 U_NAMESPACE_BEGIN 137 138 const LanguageBreakEngine * 139 ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) { 140 UBool needsInit; 141 int32_t i; 142 const LanguageBreakEngine *lbe = NULL; 143 UErrorCode status = U_ZERO_ERROR; 144 145 // TODO: The global mutex should not be used. 146 // The global mutex should only be used for short periods. 147 // A ICULanguageBreakFactory specific mutex should be used. 148 umtx_lock(NULL); 149 needsInit = (UBool)(fEngines == NULL); 150 if (!needsInit) { 151 i = fEngines->size(); 152 while (--i >= 0) { 153 lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i)); 154 if (lbe != NULL && lbe->handles(c, breakType)) { 155 break; 156 } 157 lbe = NULL; 158 } 159 } 160 umtx_unlock(NULL); 161 162 if (lbe != NULL) { 163 return lbe; 164 } 165 166 if (needsInit) { 167 UStack *engines = new UStack(_deleteEngine, NULL, status); 168 if (U_SUCCESS(status) && engines == NULL) { 169 status = U_MEMORY_ALLOCATION_ERROR; 170 } 171 else if (U_FAILURE(status)) { 172 delete engines; 173 engines = NULL; 174 } 175 else { 176 umtx_lock(NULL); 177 if (fEngines == NULL) { 178 fEngines = engines; 179 engines = NULL; 180 } 181 umtx_unlock(NULL); 182 delete engines; 183 } 184 } 185 186 if (fEngines == NULL) { 187 return NULL; 188 } 189 190 // We didn't find an engine the first time through, or there was no 191 // stack. Create an engine. 192 const LanguageBreakEngine *newlbe = loadEngineFor(c, breakType); 193 194 // Now get the lock, and see if someone else has created it in the 195 // meantime 196 umtx_lock(NULL); 197 i = fEngines->size(); 198 while (--i >= 0) { 199 lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i)); 200 if (lbe != NULL && lbe->handles(c, breakType)) { 201 break; 202 } 203 lbe = NULL; 204 } 205 if (lbe == NULL && newlbe != NULL) { 206 fEngines->push((void *)newlbe, status); 207 lbe = newlbe; 208 newlbe = NULL; 209 } 210 umtx_unlock(NULL); 211 212 delete newlbe; 213 214 return lbe; 215 } 216 217 const LanguageBreakEngine * 218 ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) { 219 UErrorCode status = U_ZERO_ERROR; 220 UScriptCode code = uscript_getScript(c, &status); 221 if (U_SUCCESS(status)) { 222 const CompactTrieDictionary *dict = loadDictionaryFor(code, breakType); 223 if (dict != NULL) { 224 const LanguageBreakEngine *engine = NULL; 225 switch(code) { 226 case USCRIPT_THAI: 227 engine = new ThaiBreakEngine(dict, status); 228 break; 229 230 case USCRIPT_HANGUL: 231 engine = new CjkBreakEngine(dict, kKorean, status); 232 break; 233 234 // use same BreakEngine and dictionary for both Chinese and Japanese 235 case USCRIPT_HIRAGANA: 236 case USCRIPT_KATAKANA: 237 case USCRIPT_HAN: 238 engine = new CjkBreakEngine(dict, kChineseJapanese, status); 239 break; 240 #if 0 241 // TODO: Have to get some characters with script=common handled 242 // by CjkBreakEngine (e.g. U+309B). Simply subjecting 243 // them to CjkBreakEngine does not work. The engine has to 244 // special-case them. 245 case USCRIPT_COMMON: 246 { 247 UBlockCode block = ublock_getCode(code); 248 if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA) 249 engine = new CjkBreakEngine(dict, kChineseJapanese, status); 250 break; 251 } 252 #endif 253 default: 254 break; 255 } 256 if (engine == NULL) { 257 delete dict; 258 } 259 else if (U_FAILURE(status)) { 260 delete engine; 261 engine = NULL; 262 } 263 return engine; 264 } 265 } 266 return NULL; 267 } 268 269 const CompactTrieDictionary * 270 ICULanguageBreakFactory::loadDictionaryFor(UScriptCode script, int32_t /*breakType*/) { 271 UErrorCode status = U_ZERO_ERROR; 272 // Open root from brkitr tree. 273 char dictnbuff[256]; 274 char ext[4]={'\0'}; 275 276 UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status); 277 b = ures_getByKeyWithFallback(b, "dictionaries", b, &status); 278 b = ures_getByKeyWithFallback(b, uscript_getShortName(script), b, &status); 279 int32_t dictnlength = 0; 280 const UChar *dictfname = ures_getString(b, &dictnlength, &status); 281 if (U_SUCCESS(status) && (size_t)dictnlength >= sizeof(dictnbuff)) { 282 dictnlength = 0; 283 status = U_BUFFER_OVERFLOW_ERROR; 284 } 285 if (U_SUCCESS(status) && dictfname) { 286 UChar* extStart=u_strchr(dictfname, 0x002e); 287 int len = 0; 288 if(extStart!=NULL){ 289 len = (int)(extStart-dictfname); 290 u_UCharsToChars(extStart+1, ext, sizeof(ext)); // nul terminates the buff 291 u_UCharsToChars(dictfname, dictnbuff, len); 292 } 293 dictnbuff[len]=0; // nul terminate 294 } 295 ures_close(b); 296 UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext, dictnbuff, &status); 297 if (U_SUCCESS(status)) { 298 const CompactTrieDictionary *dict = new CompactTrieDictionary( 299 file, status); 300 if (U_SUCCESS(status) && dict == NULL) { 301 status = U_MEMORY_ALLOCATION_ERROR; 302 } 303 if (U_FAILURE(status)) { 304 delete dict; 305 dict = NULL; 306 } 307 return dict; 308 } else if (dictfname != NULL){ 309 //create dummy dict if dictionary filename not valid 310 UChar c = 0x0020; 311 status = U_ZERO_ERROR; 312 MutableTrieDictionary *mtd = new MutableTrieDictionary(c, status, TRUE); 313 mtd->addWord(&c, 1, status, 1); 314 return new CompactTrieDictionary(*mtd, status); 315 } 316 return NULL; 317 } 318 319 U_NAMESPACE_END 320 321 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 322