1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ***************************************************************** 5 * Copyright (c) 2002-2014, International Business Machines Corporation 6 * and others. All Rights Reserved. 7 ***************************************************************** 8 * Date Name Description 9 * 06/06/2002 aliu Creation. 10 ***************************************************************** 11 */ 12 13 #include "unicode/utypes.h" 14 15 #if !UCONFIG_NO_TRANSLITERATION 16 17 #include "unicode/uobject.h" 18 #include "unicode/uscript.h" 19 20 #include "anytrans.h" 21 #include "hash.h" 22 #include "mutex.h" 23 #include "nultrans.h" 24 #include "putilimp.h" 25 #include "tridpars.h" 26 #include "uinvchar.h" 27 #include "uvector.h" 28 29 //------------------------------------------------------------ 30 // Constants 31 32 static const UChar TARGET_SEP = 45; // '-' 33 static const UChar VARIANT_SEP = 47; // '/' 34 static const UChar ANY[] = {0x41,0x6E,0x79,0}; // "Any" 35 static const UChar NULL_ID[] = {78,117,108,108,0}; // "Null" 36 static const UChar LATIN_PIVOT[] = {0x2D,0x4C,0x61,0x74,0x6E,0x3B,0x4C,0x61,0x74,0x6E,0x2D,0}; // "-Latn;Latn-" 37 38 // initial size for an Any-XXXX transform's cache of script-XXXX transforms 39 // (will grow as necessary, but we don't expect to have source text with more than 7 scripts) 40 #define ANY_TRANS_CACHE_INIT_SIZE 7 41 42 //------------------------------------------------------------ 43 44 U_CDECL_BEGIN 45 /** 46 * Deleter function for Transliterator*. 47 */ 48 static void U_CALLCONV 49 _deleteTransliterator(void *obj) { 50 delete (icu::Transliterator*) obj; 51 } 52 U_CDECL_END 53 54 //------------------------------------------------------------ 55 56 U_NAMESPACE_BEGIN 57 58 //------------------------------------------------------------ 59 // ScriptRunIterator 60 61 /** 62 * Returns a series of ranges corresponding to scripts. They will be 63 * of the form: 64 * 65 * ccccSScSSccccTTcTcccc - c = common, S = first script, T = second 66 * | | - first run (start, limit) 67 * | | - second run (start, limit) 68 * 69 * That is, the runs will overlap. The reason for this is so that a 70 * transliterator can consider common characters both before and after 71 * the scripts. 72 */ 73 class ScriptRunIterator : public UMemory { 74 private: 75 const Replaceable& text; 76 int32_t textStart; 77 int32_t textLimit; 78 79 public: 80 /** 81 * The code of the current run, valid after next() returns. May 82 * be USCRIPT_INVALID_CODE if and only if the entire text is 83 * COMMON/INHERITED. 84 */ 85 UScriptCode scriptCode; 86 87 /** 88 * The start of the run, inclusive, valid after next() returns. 89 */ 90 int32_t start; 91 92 /** 93 * The end of the run, exclusive, valid after next() returns. 94 */ 95 int32_t limit; 96 97 /** 98 * Constructs a run iterator over the given text from start 99 * (inclusive) to limit (exclusive). 100 */ 101 ScriptRunIterator(const Replaceable& text, int32_t start, int32_t limit); 102 103 /** 104 * Returns TRUE if there are any more runs. TRUE is always 105 * returned at least once. Upon return, the caller should 106 * examine scriptCode, start, and limit. 107 */ 108 UBool next(); 109 110 /** 111 * Adjusts internal indices for a change in the limit index of the 112 * given delta. A positive delta means the limit has increased. 113 */ 114 void adjustLimit(int32_t delta); 115 116 private: 117 ScriptRunIterator(const ScriptRunIterator &other); // forbid copying of this class 118 ScriptRunIterator &operator=(const ScriptRunIterator &other); // forbid copying of this class 119 }; 120 121 ScriptRunIterator::ScriptRunIterator(const Replaceable& theText, 122 int32_t myStart, int32_t myLimit) : 123 text(theText) 124 { 125 textStart = myStart; 126 textLimit = myLimit; 127 limit = myStart; 128 } 129 130 UBool ScriptRunIterator::next() { 131 UChar32 ch; 132 UScriptCode s; 133 UErrorCode ec = U_ZERO_ERROR; 134 135 scriptCode = USCRIPT_INVALID_CODE; // don't know script yet 136 start = limit; 137 138 // Are we done? 139 if (start == textLimit) { 140 return FALSE; 141 } 142 143 // Move start back to include adjacent COMMON or INHERITED 144 // characters 145 while (start > textStart) { 146 ch = text.char32At(start - 1); // look back 147 s = uscript_getScript(ch, &ec); 148 if (s == USCRIPT_COMMON || s == USCRIPT_INHERITED) { 149 --start; 150 } else { 151 break; 152 } 153 } 154 155 // Move limit ahead to include COMMON, INHERITED, and characters 156 // of the current script. 157 while (limit < textLimit) { 158 ch = text.char32At(limit); // look ahead 159 s = uscript_getScript(ch, &ec); 160 if (s != USCRIPT_COMMON && s != USCRIPT_INHERITED) { 161 if (scriptCode == USCRIPT_INVALID_CODE) { 162 scriptCode = s; 163 } else if (s != scriptCode) { 164 break; 165 } 166 } 167 ++limit; 168 } 169 170 // Return TRUE even if the entire text is COMMON / INHERITED, in 171 // which case scriptCode will be USCRIPT_INVALID_CODE. 172 return TRUE; 173 } 174 175 void ScriptRunIterator::adjustLimit(int32_t delta) { 176 limit += delta; 177 textLimit += delta; 178 } 179 180 //------------------------------------------------------------ 181 // AnyTransliterator 182 183 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(AnyTransliterator) 184 185 AnyTransliterator::AnyTransliterator(const UnicodeString& id, 186 const UnicodeString& theTarget, 187 const UnicodeString& theVariant, 188 UScriptCode theTargetScript, 189 UErrorCode& ec) : 190 Transliterator(id, NULL), 191 targetScript(theTargetScript) 192 { 193 cache = uhash_openSize(uhash_hashLong, uhash_compareLong, NULL, ANY_TRANS_CACHE_INIT_SIZE, &ec); 194 if (U_FAILURE(ec)) { 195 return; 196 } 197 uhash_setValueDeleter(cache, _deleteTransliterator); 198 199 target = theTarget; 200 if (theVariant.length() > 0) { 201 target.append(VARIANT_SEP).append(theVariant); 202 } 203 } 204 205 AnyTransliterator::~AnyTransliterator() { 206 uhash_close(cache); 207 } 208 209 /** 210 * Copy constructor. 211 */ 212 AnyTransliterator::AnyTransliterator(const AnyTransliterator& o) : 213 Transliterator(o), 214 target(o.target), 215 targetScript(o.targetScript) 216 { 217 // Don't copy the cache contents 218 UErrorCode ec = U_ZERO_ERROR; 219 cache = uhash_openSize(uhash_hashLong, uhash_compareLong, NULL, ANY_TRANS_CACHE_INIT_SIZE, &ec); 220 if (U_FAILURE(ec)) { 221 return; 222 } 223 uhash_setValueDeleter(cache, _deleteTransliterator); 224 } 225 226 /** 227 * Transliterator API. 228 */ 229 Transliterator* AnyTransliterator::clone() const { 230 return new AnyTransliterator(*this); 231 } 232 233 /** 234 * Implements {@link Transliterator#handleTransliterate}. 235 */ 236 void AnyTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos, 237 UBool isIncremental) const { 238 int32_t allStart = pos.start; 239 int32_t allLimit = pos.limit; 240 241 ScriptRunIterator it(text, pos.contextStart, pos.contextLimit); 242 243 while (it.next()) { 244 // Ignore runs in the ante context 245 if (it.limit <= allStart) continue; 246 247 // Try to instantiate transliterator from it.scriptCode to 248 // our target or target/variant 249 Transliterator* t = getTransliterator(it.scriptCode); 250 251 if (t == NULL) { 252 // We have no transliterator. Do nothing, but keep 253 // pos.start up to date. 254 pos.start = it.limit; 255 continue; 256 } 257 258 // If the run end is before the transliteration limit, do 259 // a non-incremental transliteration. Otherwise do an 260 // incremental one. 261 UBool incremental = isIncremental && (it.limit >= allLimit); 262 263 pos.start = uprv_max(allStart, it.start); 264 pos.limit = uprv_min(allLimit, it.limit); 265 int32_t limit = pos.limit; 266 t->filteredTransliterate(text, pos, incremental); 267 int32_t delta = pos.limit - limit; 268 allLimit += delta; 269 it.adjustLimit(delta); 270 271 // We're done if we enter the post context 272 if (it.limit >= allLimit) break; 273 } 274 275 // Restore limit. pos.start is fine where the last transliterator 276 // left it, or at the end of the last run. 277 pos.limit = allLimit; 278 } 279 280 Transliterator* AnyTransliterator::getTransliterator(UScriptCode source) const { 281 282 if (source == targetScript || source == USCRIPT_INVALID_CODE) { 283 return NULL; 284 } 285 286 Transliterator* t = NULL; 287 { 288 Mutex m(NULL); 289 t = (Transliterator*) uhash_iget(cache, (int32_t) source); 290 } 291 if (t == NULL) { 292 UErrorCode ec = U_ZERO_ERROR; 293 UnicodeString sourceName(uscript_getShortName(source), -1, US_INV); 294 UnicodeString id(sourceName); 295 id.append(TARGET_SEP).append(target); 296 297 t = Transliterator::createInstance(id, UTRANS_FORWARD, ec); 298 if (U_FAILURE(ec) || t == NULL) { 299 delete t; 300 301 // Try to pivot around Latin, our most common script 302 id = sourceName; 303 id.append(LATIN_PIVOT, -1).append(target); 304 t = Transliterator::createInstance(id, UTRANS_FORWARD, ec); 305 if (U_FAILURE(ec) || t == NULL) { 306 delete t; 307 t = NULL; 308 } 309 } 310 311 if (t != NULL) { 312 Transliterator *rt = NULL; 313 { 314 Mutex m(NULL); 315 rt = static_cast<Transliterator *> (uhash_iget(cache, (int32_t) source)); 316 if (rt == NULL) { 317 // Common case, no race to cache this new transliterator. 318 uhash_iput(cache, (int32_t) source, t, &ec); 319 } else { 320 // Race case, some other thread beat us to caching this transliterator. 321 Transliterator *temp = rt; 322 rt = t; // Our newly created transliterator that lost the race & now needs deleting. 323 t = temp; // The transliterator from the cache that we will return. 324 } 325 } 326 delete rt; // will be non-null only in case of races. 327 } 328 } 329 return t; 330 } 331 332 /** 333 * Return the script code for a given name, or -1 if not found. 334 */ 335 static UScriptCode scriptNameToCode(const UnicodeString& name) { 336 char buf[128]; 337 UScriptCode code; 338 UErrorCode ec = U_ZERO_ERROR; 339 int32_t nameLen = name.length(); 340 UBool isInvariant = uprv_isInvariantUString(name.getBuffer(), nameLen); 341 342 if (isInvariant) { 343 name.extract(0, nameLen, buf, (int32_t)sizeof(buf), US_INV); 344 buf[127] = 0; // Make sure that we NULL terminate the string. 345 } 346 if (!isInvariant || uscript_getCode(buf, &code, 1, &ec) != 1 || U_FAILURE(ec)) 347 { 348 code = USCRIPT_INVALID_CODE; 349 } 350 return code; 351 } 352 353 /** 354 * Registers standard transliterators with the system. Called by 355 * Transliterator during initialization. Scan all current targets and 356 * register those that are scripts T as Any-T/V. 357 */ 358 void AnyTransliterator::registerIDs() { 359 360 UErrorCode ec = U_ZERO_ERROR; 361 Hashtable seen(TRUE, ec); 362 363 int32_t sourceCount = Transliterator::_countAvailableSources(); 364 for (int32_t s=0; s<sourceCount; ++s) { 365 UnicodeString source; 366 Transliterator::_getAvailableSource(s, source); 367 368 // Ignore the "Any" source 369 if (source.caseCompare(ANY, 3, 0 /*U_FOLD_CASE_DEFAULT*/) == 0) continue; 370 371 int32_t targetCount = Transliterator::_countAvailableTargets(source); 372 for (int32_t t=0; t<targetCount; ++t) { 373 UnicodeString target; 374 Transliterator::_getAvailableTarget(t, source, target); 375 376 // Only process each target once 377 if (seen.geti(target) != 0) continue; 378 ec = U_ZERO_ERROR; 379 seen.puti(target, 1, ec); 380 381 // Get the script code for the target. If not a script, ignore. 382 UScriptCode targetScript = scriptNameToCode(target); 383 if (targetScript == USCRIPT_INVALID_CODE) continue; 384 385 int32_t variantCount = Transliterator::_countAvailableVariants(source, target); 386 // assert(variantCount >= 1); 387 for (int32_t v=0; v<variantCount; ++v) { 388 UnicodeString variant; 389 Transliterator::_getAvailableVariant(v, source, target, variant); 390 391 UnicodeString id; 392 TransliteratorIDParser::STVtoID(UnicodeString(TRUE, ANY, 3), target, variant, id); 393 ec = U_ZERO_ERROR; 394 AnyTransliterator* t = new AnyTransliterator(id, target, variant, 395 targetScript, ec); 396 if (U_FAILURE(ec)) { 397 delete t; 398 } else { 399 Transliterator::_registerInstance(t); 400 Transliterator::_registerSpecialInverse(target, UnicodeString(TRUE, NULL_ID, 4), FALSE); 401 } 402 } 403 } 404 } 405 } 406 407 U_NAMESPACE_END 408 409 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ 410 411 //eof 412