1 /* 2 ***************************************************************** 3 * Copyright (c) 2002-2014, International Business Machines Corporation 4 * and others. All Rights Reserved. 5 ***************************************************************** 6 * Date Name Description 7 * 06/06/2002 aliu Creation. 8 ***************************************************************** 9 */ 10 11 #include "unicode/utypes.h" 12 13 #if !UCONFIG_NO_TRANSLITERATION 14 15 #include "unicode/uobject.h" 16 #include "unicode/uscript.h" 17 18 #include "anytrans.h" 19 #include "hash.h" 20 #include "mutex.h" 21 #include "nultrans.h" 22 #include "putilimp.h" 23 #include "tridpars.h" 24 #include "uinvchar.h" 25 #include "uvector.h" 26 27 //------------------------------------------------------------ 28 // Constants 29 30 static const UChar TARGET_SEP = 45; // '-' 31 static const UChar VARIANT_SEP = 47; // '/' 32 static const UChar ANY[] = {65,110,121,0}; // "Any" 33 static const UChar NULL_ID[] = {78,117,108,108,0}; // "Null" 34 static const UChar LATIN_PIVOT[] = {45,76,97,116,105,110,59,76,97,116,105,110,45,0}; // "-Latin;Latin-" 35 36 //------------------------------------------------------------ 37 38 U_CDECL_BEGIN 39 /** 40 * Deleter function for Transliterator*. 41 */ 42 static void U_CALLCONV 43 _deleteTransliterator(void *obj) { 44 delete (icu::Transliterator*) obj; 45 } 46 U_CDECL_END 47 48 //------------------------------------------------------------ 49 50 U_NAMESPACE_BEGIN 51 52 //------------------------------------------------------------ 53 // ScriptRunIterator 54 55 /** 56 * Returns a series of ranges corresponding to scripts. They will be 57 * of the form: 58 * 59 * ccccSScSSccccTTcTcccc - c = common, S = first script, T = second 60 * | | - first run (start, limit) 61 * | | - second run (start, limit) 62 * 63 * That is, the runs will overlap. The reason for this is so that a 64 * transliterator can consider common characters both before and after 65 * the scripts. 66 */ 67 class ScriptRunIterator : public UMemory { 68 private: 69 const Replaceable& text; 70 int32_t textStart; 71 int32_t textLimit; 72 73 public: 74 /** 75 * The code of the current run, valid after next() returns. May 76 * be USCRIPT_INVALID_CODE if and only if the entire text is 77 * COMMON/INHERITED. 78 */ 79 UScriptCode scriptCode; 80 81 /** 82 * The start of the run, inclusive, valid after next() returns. 83 */ 84 int32_t start; 85 86 /** 87 * The end of the run, exclusive, valid after next() returns. 88 */ 89 int32_t limit; 90 91 /** 92 * Constructs a run iterator over the given text from start 93 * (inclusive) to limit (exclusive). 94 */ 95 ScriptRunIterator(const Replaceable& text, int32_t start, int32_t limit); 96 97 /** 98 * Returns TRUE if there are any more runs. TRUE is always 99 * returned at least once. Upon return, the caller should 100 * examine scriptCode, start, and limit. 101 */ 102 UBool next(); 103 104 /** 105 * Adjusts internal indices for a change in the limit index of the 106 * given delta. A positive delta means the limit has increased. 107 */ 108 void adjustLimit(int32_t delta); 109 110 private: 111 ScriptRunIterator(const ScriptRunIterator &other); // forbid copying of this class 112 ScriptRunIterator &operator=(const ScriptRunIterator &other); // forbid copying of this class 113 }; 114 115 ScriptRunIterator::ScriptRunIterator(const Replaceable& theText, 116 int32_t myStart, int32_t myLimit) : 117 text(theText) 118 { 119 textStart = myStart; 120 textLimit = myLimit; 121 limit = myStart; 122 } 123 124 UBool ScriptRunIterator::next() { 125 UChar32 ch; 126 UScriptCode s; 127 UErrorCode ec = U_ZERO_ERROR; 128 129 scriptCode = USCRIPT_INVALID_CODE; // don't know script yet 130 start = limit; 131 132 // Are we done? 133 if (start == textLimit) { 134 return FALSE; 135 } 136 137 // Move start back to include adjacent COMMON or INHERITED 138 // characters 139 while (start > textStart) { 140 ch = text.char32At(start - 1); // look back 141 s = uscript_getScript(ch, &ec); 142 if (s == USCRIPT_COMMON || s == USCRIPT_INHERITED) { 143 --start; 144 } else { 145 break; 146 } 147 } 148 149 // Move limit ahead to include COMMON, INHERITED, and characters 150 // of the current script. 151 while (limit < textLimit) { 152 ch = text.char32At(limit); // look ahead 153 s = uscript_getScript(ch, &ec); 154 if (s != USCRIPT_COMMON && s != USCRIPT_INHERITED) { 155 if (scriptCode == USCRIPT_INVALID_CODE) { 156 scriptCode = s; 157 } else if (s != scriptCode) { 158 break; 159 } 160 } 161 ++limit; 162 } 163 164 // Return TRUE even if the entire text is COMMON / INHERITED, in 165 // which case scriptCode will be USCRIPT_INVALID_CODE. 166 return TRUE; 167 } 168 169 void ScriptRunIterator::adjustLimit(int32_t delta) { 170 limit += delta; 171 textLimit += delta; 172 } 173 174 //------------------------------------------------------------ 175 // AnyTransliterator 176 177 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(AnyTransliterator) 178 179 AnyTransliterator::AnyTransliterator(const UnicodeString& id, 180 const UnicodeString& theTarget, 181 const UnicodeString& theVariant, 182 UScriptCode theTargetScript, 183 UErrorCode& ec) : 184 Transliterator(id, NULL), 185 targetScript(theTargetScript) 186 { 187 cache = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &ec); 188 if (U_FAILURE(ec)) { 189 return; 190 } 191 uhash_setValueDeleter(cache, _deleteTransliterator); 192 193 target = theTarget; 194 if (theVariant.length() > 0) { 195 target.append(VARIANT_SEP).append(theVariant); 196 } 197 } 198 199 AnyTransliterator::~AnyTransliterator() { 200 uhash_close(cache); 201 } 202 203 /** 204 * Copy constructor. 205 */ 206 AnyTransliterator::AnyTransliterator(const AnyTransliterator& o) : 207 Transliterator(o), 208 target(o.target), 209 targetScript(o.targetScript) 210 { 211 // Don't copy the cache contents 212 UErrorCode ec = U_ZERO_ERROR; 213 cache = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &ec); 214 if (U_FAILURE(ec)) { 215 return; 216 } 217 uhash_setValueDeleter(cache, _deleteTransliterator); 218 } 219 220 /** 221 * Transliterator API. 222 */ 223 Transliterator* AnyTransliterator::clone() const { 224 return new AnyTransliterator(*this); 225 } 226 227 /** 228 * Implements {@link Transliterator#handleTransliterate}. 229 */ 230 void AnyTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos, 231 UBool isIncremental) const { 232 int32_t allStart = pos.start; 233 int32_t allLimit = pos.limit; 234 235 ScriptRunIterator it(text, pos.contextStart, pos.contextLimit); 236 237 while (it.next()) { 238 // Ignore runs in the ante context 239 if (it.limit <= allStart) continue; 240 241 // Try to instantiate transliterator from it.scriptCode to 242 // our target or target/variant 243 Transliterator* t = getTransliterator(it.scriptCode); 244 245 if (t == NULL) { 246 // We have no transliterator. Do nothing, but keep 247 // pos.start up to date. 248 pos.start = it.limit; 249 continue; 250 } 251 252 // If the run end is before the transliteration limit, do 253 // a non-incremental transliteration. Otherwise do an 254 // incremental one. 255 UBool incremental = isIncremental && (it.limit >= allLimit); 256 257 pos.start = uprv_max(allStart, it.start); 258 pos.limit = uprv_min(allLimit, it.limit); 259 int32_t limit = pos.limit; 260 t->filteredTransliterate(text, pos, incremental); 261 int32_t delta = pos.limit - limit; 262 allLimit += delta; 263 it.adjustLimit(delta); 264 265 // We're done if we enter the post context 266 if (it.limit >= allLimit) break; 267 } 268 269 // Restore limit. pos.start is fine where the last transliterator 270 // left it, or at the end of the last run. 271 pos.limit = allLimit; 272 } 273 274 Transliterator* AnyTransliterator::getTransliterator(UScriptCode source) const { 275 276 if (source == targetScript || source == USCRIPT_INVALID_CODE) { 277 return NULL; 278 } 279 280 Transliterator* t = NULL; 281 { 282 Mutex m(NULL); 283 t = (Transliterator*) uhash_iget(cache, (int32_t) source); 284 } 285 if (t == NULL) { 286 UErrorCode ec = U_ZERO_ERROR; 287 UnicodeString sourceName(uscript_getName(source), -1, US_INV); 288 UnicodeString id(sourceName); 289 id.append(TARGET_SEP).append(target); 290 291 t = Transliterator::createInstance(id, UTRANS_FORWARD, ec); 292 if (U_FAILURE(ec) || t == NULL) { 293 delete t; 294 295 // Try to pivot around Latin, our most common script 296 id = sourceName; 297 id.append(LATIN_PIVOT, -1).append(target); 298 t = Transliterator::createInstance(id, UTRANS_FORWARD, ec); 299 if (U_FAILURE(ec) || t == NULL) { 300 delete t; 301 t = NULL; 302 } 303 } 304 305 if (t != NULL) { 306 Transliterator *rt = NULL; 307 { 308 Mutex m(NULL); 309 rt = static_cast<Transliterator *> (uhash_iget(cache, (int32_t) source)); 310 if (rt == NULL) { 311 // Common case, no race to cache this new transliterator. 312 uhash_iput(cache, (int32_t) source, t, &ec); 313 } else { 314 // Race case, some other thread beat us to caching this transliterator. 315 Transliterator *temp = rt; 316 rt = t; // Our newly created transliterator that lost the race & now needs deleting. 317 t = temp; // The transliterator from the cache that we will return. 318 } 319 } 320 delete rt; // will be non-null only in case of races. 321 } 322 } 323 return t; 324 } 325 326 /** 327 * Return the script code for a given name, or -1 if not found. 328 */ 329 static UScriptCode scriptNameToCode(const UnicodeString& name) { 330 char buf[128]; 331 UScriptCode code; 332 UErrorCode ec = U_ZERO_ERROR; 333 int32_t nameLen = name.length(); 334 UBool isInvariant = uprv_isInvariantUString(name.getBuffer(), nameLen); 335 336 if (isInvariant) { 337 name.extract(0, nameLen, buf, (int32_t)sizeof(buf), US_INV); 338 buf[127] = 0; // Make sure that we NULL terminate the string. 339 } 340 if (!isInvariant || uscript_getCode(buf, &code, 1, &ec) != 1 || U_FAILURE(ec)) 341 { 342 code = USCRIPT_INVALID_CODE; 343 } 344 return code; 345 } 346 347 /** 348 * Registers standard transliterators with the system. Called by 349 * Transliterator during initialization. Scan all current targets and 350 * register those that are scripts T as Any-T/V. 351 */ 352 void AnyTransliterator::registerIDs() { 353 354 UErrorCode ec = U_ZERO_ERROR; 355 Hashtable seen(TRUE, ec); 356 357 int32_t sourceCount = Transliterator::_countAvailableSources(); 358 for (int32_t s=0; s<sourceCount; ++s) { 359 UnicodeString source; 360 Transliterator::_getAvailableSource(s, source); 361 362 // Ignore the "Any" source 363 if (source.caseCompare(ANY, 3, 0 /*U_FOLD_CASE_DEFAULT*/) == 0) continue; 364 365 int32_t targetCount = Transliterator::_countAvailableTargets(source); 366 for (int32_t t=0; t<targetCount; ++t) { 367 UnicodeString target; 368 Transliterator::_getAvailableTarget(t, source, target); 369 370 // Only process each target once 371 if (seen.geti(target) != 0) continue; 372 ec = U_ZERO_ERROR; 373 seen.puti(target, 1, ec); 374 375 // Get the script code for the target. If not a script, ignore. 376 UScriptCode targetScript = scriptNameToCode(target); 377 if (targetScript == USCRIPT_INVALID_CODE) continue; 378 379 int32_t variantCount = Transliterator::_countAvailableVariants(source, target); 380 // assert(variantCount >= 1); 381 for (int32_t v=0; v<variantCount; ++v) { 382 UnicodeString variant; 383 Transliterator::_getAvailableVariant(v, source, target, variant); 384 385 UnicodeString id; 386 TransliteratorIDParser::STVtoID(UnicodeString(TRUE, ANY, 3), target, variant, id); 387 ec = U_ZERO_ERROR; 388 AnyTransliterator* t = new AnyTransliterator(id, target, variant, 389 targetScript, ec); 390 if (U_FAILURE(ec)) { 391 delete t; 392 } else { 393 Transliterator::_registerInstance(t); 394 Transliterator::_registerSpecialInverse(target, UnicodeString(TRUE, NULL_ID, 4), FALSE); 395 } 396 } 397 } 398 } 399 } 400 401 U_NAMESPACE_END 402 403 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ 404 405 //eof 406