1 /** 2 ************************************************************************************ 3 * Copyright (C) 2006-2007, International Business Machines Corporation and others. * 4 * All Rights Reserved. * 5 ************************************************************************************ 6 */ 7 8 #include "unicode/utypes.h" 9 10 #if !UCONFIG_NO_BREAK_ITERATION 11 12 #include "brkeng.h" 13 #include "dictbe.h" 14 #include "triedict.h" 15 #include "unicode/uchar.h" 16 #include "unicode/uniset.h" 17 #include "unicode/chariter.h" 18 #include "unicode/ures.h" 19 #include "unicode/udata.h" 20 #include "unicode/putil.h" 21 #include "unicode/ustring.h" 22 #include "unicode/uscript.h" 23 #include "uvector.h" 24 #include "umutex.h" 25 #include "uresimp.h" 26 #include "ubrkimpl.h" 27 #include <stdio.h> 28 29 U_NAMESPACE_BEGIN 30 31 /* 32 ****************************************************************** 33 */ 34 35 LanguageBreakEngine::LanguageBreakEngine() { 36 } 37 38 LanguageBreakEngine::~LanguageBreakEngine() { 39 } 40 41 /* 42 ****************************************************************** 43 */ 44 45 LanguageBreakFactory::LanguageBreakFactory() { 46 } 47 48 LanguageBreakFactory::~LanguageBreakFactory() { 49 } 50 51 /* 52 ****************************************************************** 53 */ 54 55 UnhandledEngine::UnhandledEngine(UErrorCode &/*status*/) { 56 for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) { 57 fHandled[i] = 0; 58 } 59 } 60 61 UnhandledEngine::~UnhandledEngine() { 62 for (int32_t i = 0; i < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])); ++i) { 63 if (fHandled[i] != 0) { 64 delete fHandled[i]; 65 } 66 } 67 } 68 69 UBool 70 UnhandledEngine::handles(UChar32 c, int32_t breakType) const { 71 return (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0])) 72 && fHandled[breakType] != 0 && fHandled[breakType]->contains(c)); 73 } 74 75 int32_t 76 UnhandledEngine::findBreaks( UText *text, 77 int32_t startPos, 78 int32_t endPos, 79 UBool reverse, 80 int32_t breakType, 81 UStack &/*foundBreaks*/ ) const { 82 if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) { 83 UChar32 c = utext_current32(text); 84 if (reverse) { 85 while((int32_t)utext_getNativeIndex(text) > startPos && fHandled[breakType]->contains(c)) { 86 c = utext_previous32(text); 87 } 88 } 89 else { 90 while((int32_t)utext_getNativeIndex(text) < endPos && fHandled[breakType]->contains(c)) { 91 utext_next32(text); // TODO: recast loop to work with post-increment operations. 92 c = utext_current32(text); 93 } 94 } 95 } 96 return 0; 97 } 98 99 void 100 UnhandledEngine::handleCharacter(UChar32 c, int32_t breakType) { 101 if (breakType >= 0 && breakType < (int32_t)(sizeof(fHandled)/sizeof(fHandled[0]))) { 102 if (fHandled[breakType] == 0) { 103 fHandled[breakType] = new UnicodeSet(); 104 if (fHandled[breakType] == 0) { 105 return; 106 } 107 } 108 if (!fHandled[breakType]->contains(c)) { 109 UErrorCode status = U_ZERO_ERROR; 110 // Apply the entire script of the character. 111 int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT); 112 fHandled[breakType]->applyIntPropertyValue(UCHAR_SCRIPT, script, status); 113 } 114 } 115 } 116 117 /* 118 ****************************************************************** 119 */ 120 121 ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) { 122 fEngines = 0; 123 } 124 125 ICULanguageBreakFactory::~ICULanguageBreakFactory() { 126 if (fEngines != 0) { 127 delete fEngines; 128 } 129 } 130 131 U_NAMESPACE_END 132 U_CDECL_BEGIN 133 static void U_CALLCONV _deleteEngine(void *obj) { 134 delete (const U_NAMESPACE_QUALIFIER LanguageBreakEngine *) obj; 135 } 136 U_CDECL_END 137 U_NAMESPACE_BEGIN 138 139 const LanguageBreakEngine * 140 ICULanguageBreakFactory::getEngineFor(UChar32 c, int32_t breakType) { 141 UBool needsInit; 142 int32_t i; 143 const LanguageBreakEngine *lbe = NULL; 144 UErrorCode status = U_ZERO_ERROR; 145 146 // TODO: The global mutex should not be used. 147 // The global mutex should only be used for short periods. 148 // A ICULanguageBreakFactory specific mutex should be used. 149 umtx_lock(NULL); 150 needsInit = (UBool)(fEngines == NULL); 151 if (!needsInit) { 152 i = fEngines->size(); 153 while (--i >= 0) { 154 lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i)); 155 if (lbe != NULL && lbe->handles(c, breakType)) { 156 break; 157 } 158 lbe = NULL; 159 } 160 } 161 umtx_unlock(NULL); 162 163 if (lbe != NULL) { 164 return lbe; 165 } 166 167 if (needsInit) { 168 UStack *engines = new UStack(_deleteEngine, NULL, status); 169 if (U_SUCCESS(status) && engines == NULL) { 170 status = U_MEMORY_ALLOCATION_ERROR; 171 } 172 else if (U_FAILURE(status)) { 173 delete engines; 174 engines = NULL; 175 } 176 else { 177 umtx_lock(NULL); 178 if (fEngines == NULL) { 179 fEngines = engines; 180 engines = NULL; 181 } 182 umtx_unlock(NULL); 183 delete engines; 184 } 185 } 186 187 if (fEngines == NULL) { 188 return NULL; 189 } 190 191 // We didn't find an engine the first time through, or there was no 192 // stack. Create an engine. 193 const LanguageBreakEngine *newlbe = loadEngineFor(c, breakType); 194 195 // Now get the lock, and see if someone else has created it in the 196 // meantime 197 umtx_lock(NULL); 198 i = fEngines->size(); 199 while (--i >= 0) { 200 lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i)); 201 if (lbe != NULL && lbe->handles(c, breakType)) { 202 break; 203 } 204 lbe = NULL; 205 } 206 if (lbe == NULL && newlbe != NULL) { 207 fEngines->push((void *)newlbe, status); 208 lbe = newlbe; 209 newlbe = NULL; 210 } 211 umtx_unlock(NULL); 212 213 delete newlbe; 214 215 return lbe; 216 } 217 218 const LanguageBreakEngine * 219 ICULanguageBreakFactory::loadEngineFor(UChar32 c, int32_t breakType) { 220 UErrorCode status = U_ZERO_ERROR; 221 UScriptCode code = uscript_getScript(c, &status); 222 if (U_SUCCESS(status)) { 223 const CompactTrieDictionary *dict = loadDictionaryFor(code, breakType); 224 if (dict != NULL) { 225 const LanguageBreakEngine *engine = NULL; 226 switch(code) { 227 case USCRIPT_THAI: 228 engine = new ThaiBreakEngine(dict, status); 229 break; 230 231 case USCRIPT_HANGUL: 232 engine = new CjkBreakEngine(dict, kKorean, status); 233 break; 234 235 // use same BreakEngine and dictionary for both Chinese and Japanese 236 case USCRIPT_HIRAGANA: 237 case USCRIPT_KATAKANA: 238 case USCRIPT_HAN: 239 engine = new CjkBreakEngine(dict, kChineseJapanese, status); 240 break; 241 #if 0 242 // TODO: Have to get some characters with script=common handled 243 // by CjkBreakEngine (e.g. U+309B). Simply subjecting 244 // them to CjkBreakEngine does not work. The engine has to 245 // special-case them. 246 case USCRIPT_COMMON: 247 { 248 UBlockCode block = ublock_getCode(code); 249 if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA) 250 engine = new CjkBreakEngine(dict, kChineseJapanese, status); 251 break; 252 } 253 #endif 254 default: 255 break; 256 } 257 if (engine == NULL) { 258 delete dict; 259 } 260 else if (U_FAILURE(status)) { 261 delete engine; 262 engine = NULL; 263 } 264 return engine; 265 } 266 } 267 return NULL; 268 } 269 270 const CompactTrieDictionary * 271 ICULanguageBreakFactory::loadDictionaryFor(UScriptCode script, int32_t /*breakType*/) { 272 UErrorCode status = U_ZERO_ERROR; 273 // Open root from brkitr tree. 274 char dictnbuff[256]; 275 char ext[4]={'\0'}; 276 277 UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status); 278 b = ures_getByKeyWithFallback(b, "dictionaries", b, &status); 279 b = ures_getByKeyWithFallback(b, uscript_getShortName(script), b, &status); 280 int32_t dictnlength = 0; 281 const UChar *dictfname = ures_getString(b, &dictnlength, &status); 282 if (U_SUCCESS(status) && (size_t)dictnlength >= sizeof(dictnbuff)) { 283 dictnlength = 0; 284 status = U_BUFFER_OVERFLOW_ERROR; 285 } 286 if (U_SUCCESS(status) && dictfname) { 287 UChar* extStart=u_strchr(dictfname, 0x002e); 288 int len = 0; 289 if(extStart!=NULL){ 290 len = extStart-dictfname; 291 u_UCharsToChars(extStart+1, ext, sizeof(ext)); // nul terminates the buff 292 u_UCharsToChars(dictfname, dictnbuff, len); 293 } 294 dictnbuff[len]=0; // nul terminate 295 } 296 ures_close(b); 297 UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext, dictnbuff, &status); 298 if (U_SUCCESS(status)) { 299 const CompactTrieDictionary *dict = new CompactTrieDictionary( 300 file, status); 301 if (U_SUCCESS(status) && dict == NULL) { 302 status = U_MEMORY_ALLOCATION_ERROR; 303 } 304 if (U_FAILURE(status)) { 305 delete dict; 306 dict = NULL; 307 } 308 return dict; 309 } else if (dictfname != NULL){ 310 //create dummy dict if dictionary filename not valid 311 UChar c = 0x0020; 312 status = U_ZERO_ERROR; 313 MutableTrieDictionary *mtd = new MutableTrieDictionary(c, status, TRUE); 314 mtd->addWord(&c, 1, status, 1); 315 return new CompactTrieDictionary(*mtd, status); 316 } 317 return NULL; 318 } 319 320 U_NAMESPACE_END 321 322 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 323