1 /* 2 ***************************************************************** 3 * Copyright (c) 2002-2008, International Business Machines Corporation 4 * and others. All Rights Reserved. 5 ***************************************************************** 6 * Date Name Description 7 * 06/06/2002 aliu Creation. 8 ***************************************************************** 9 */ 10 11 #include "unicode/utypes.h" 12 13 #if !UCONFIG_NO_TRANSLITERATION 14 15 #include "unicode/uobject.h" 16 #include "unicode/uscript.h" 17 #include "nultrans.h" 18 #include "anytrans.h" 19 #include "uvector.h" 20 #include "tridpars.h" 21 #include "hash.h" 22 #include "putilimp.h" 23 #include "uinvchar.h" 24 25 //------------------------------------------------------------ 26 // Constants 27 28 static const UChar TARGET_SEP = 45; // '-' 29 static const UChar VARIANT_SEP = 47; // '/' 30 static const UChar ANY[] = {65,110,121,0}; // "Any" 31 static const UChar NULL_ID[] = {78,117,108,108,0}; // "Null" 32 static const UChar LATIN_PIVOT[] = {45,76,97,116,105,110,59,76,97,116,105,110,45,0}; // "-Latin;Latin-" 33 34 //------------------------------------------------------------ 35 36 U_CDECL_BEGIN 37 /** 38 * Deleter function for Transliterator*. 39 */ 40 static void U_CALLCONV 41 _deleteTransliterator(void *obj) { 42 delete (U_NAMESPACE_QUALIFIER Transliterator*) obj; 43 } 44 U_CDECL_END 45 46 //------------------------------------------------------------ 47 48 U_NAMESPACE_BEGIN 49 50 //------------------------------------------------------------ 51 // ScriptRunIterator 52 53 /** 54 * Returns a series of ranges corresponding to scripts. They will be 55 * of the form: 56 * 57 * ccccSScSSccccTTcTcccc - c = common, S = first script, T = second 58 * | | - first run (start, limit) 59 * | | - second run (start, limit) 60 * 61 * That is, the runs will overlap. The reason for this is so that a 62 * transliterator can consider common characters both before and after 63 * the scripts. 64 */ 65 class ScriptRunIterator : public UMemory { 66 private: 67 const Replaceable& text; 68 int32_t textStart; 69 int32_t textLimit; 70 71 public: 72 /** 73 * The code of the current run, valid after next() returns. May 74 * be USCRIPT_INVALID_CODE if and only if the entire text is 75 * COMMON/INHERITED. 76 */ 77 UScriptCode scriptCode; 78 79 /** 80 * The start of the run, inclusive, valid after next() returns. 81 */ 82 int32_t start; 83 84 /** 85 * The end of the run, exclusive, valid after next() returns. 86 */ 87 int32_t limit; 88 89 /** 90 * Constructs a run iterator over the given text from start 91 * (inclusive) to limit (exclusive). 92 */ 93 ScriptRunIterator(const Replaceable& text, int32_t start, int32_t limit); 94 95 /** 96 * Returns TRUE if there are any more runs. TRUE is always 97 * returned at least once. Upon return, the caller should 98 * examine scriptCode, start, and limit. 99 */ 100 UBool next(); 101 102 /** 103 * Adjusts internal indices for a change in the limit index of the 104 * given delta. A positive delta means the limit has increased. 105 */ 106 void adjustLimit(int32_t delta); 107 108 private: 109 ScriptRunIterator(const ScriptRunIterator &other); // forbid copying of this class 110 ScriptRunIterator &operator=(const ScriptRunIterator &other); // forbid copying of this class 111 }; 112 113 ScriptRunIterator::ScriptRunIterator(const Replaceable& theText, 114 int32_t myStart, int32_t myLimit) : 115 text(theText) 116 { 117 textStart = myStart; 118 textLimit = myLimit; 119 limit = myStart; 120 } 121 122 UBool ScriptRunIterator::next() { 123 UChar32 ch; 124 UScriptCode s; 125 UErrorCode ec = U_ZERO_ERROR; 126 127 scriptCode = USCRIPT_INVALID_CODE; // don't know script yet 128 start = limit; 129 130 // Are we done? 131 if (start == textLimit) { 132 return FALSE; 133 } 134 135 // Move start back to include adjacent COMMON or INHERITED 136 // characters 137 while (start > textStart) { 138 ch = text.char32At(start - 1); // look back 139 s = uscript_getScript(ch, &ec); 140 if (s == USCRIPT_COMMON || s == USCRIPT_INHERITED) { 141 --start; 142 } else { 143 break; 144 } 145 } 146 147 // Move limit ahead to include COMMON, INHERITED, and characters 148 // of the current script. 149 while (limit < textLimit) { 150 ch = text.char32At(limit); // look ahead 151 s = uscript_getScript(ch, &ec); 152 if (s != USCRIPT_COMMON && s != USCRIPT_INHERITED) { 153 if (scriptCode == USCRIPT_INVALID_CODE) { 154 scriptCode = s; 155 } else if (s != scriptCode) { 156 break; 157 } 158 } 159 ++limit; 160 } 161 162 // Return TRUE even if the entire text is COMMON / INHERITED, in 163 // which case scriptCode will be USCRIPT_INVALID_CODE. 164 return TRUE; 165 } 166 167 void ScriptRunIterator::adjustLimit(int32_t delta) { 168 limit += delta; 169 textLimit += delta; 170 } 171 172 //------------------------------------------------------------ 173 // AnyTransliterator 174 175 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(AnyTransliterator) 176 177 AnyTransliterator::AnyTransliterator(const UnicodeString& id, 178 const UnicodeString& theTarget, 179 const UnicodeString& theVariant, 180 UScriptCode theTargetScript, 181 UErrorCode& ec) : 182 Transliterator(id, NULL), 183 targetScript(theTargetScript) 184 { 185 cache = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &ec); 186 if (U_FAILURE(ec)) { 187 return; 188 } 189 uhash_setValueDeleter(cache, _deleteTransliterator); 190 191 target = theTarget; 192 if (theVariant.length() > 0) { 193 target.append(VARIANT_SEP).append(theVariant); 194 } 195 } 196 197 AnyTransliterator::~AnyTransliterator() { 198 uhash_close(cache); 199 } 200 201 /** 202 * Copy constructor. 203 */ 204 AnyTransliterator::AnyTransliterator(const AnyTransliterator& o) : 205 Transliterator(o), 206 target(o.target), 207 targetScript(o.targetScript) 208 { 209 // Don't copy the cache contents 210 UErrorCode ec = U_ZERO_ERROR; 211 cache = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &ec); 212 if (U_FAILURE(ec)) { 213 return; 214 } 215 uhash_setValueDeleter(cache, _deleteTransliterator); 216 } 217 218 /** 219 * Transliterator API. 220 */ 221 Transliterator* AnyTransliterator::clone() const { 222 return new AnyTransliterator(*this); 223 } 224 225 /** 226 * Implements {@link Transliterator#handleTransliterate}. 227 */ 228 void AnyTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos, 229 UBool isIncremental) const { 230 int32_t allStart = pos.start; 231 int32_t allLimit = pos.limit; 232 233 ScriptRunIterator it(text, pos.contextStart, pos.contextLimit); 234 235 while (it.next()) { 236 // Ignore runs in the ante context 237 if (it.limit <= allStart) continue; 238 239 // Try to instantiate transliterator from it.scriptCode to 240 // our target or target/variant 241 Transliterator* t = getTransliterator(it.scriptCode); 242 243 if (t == NULL) { 244 // We have no transliterator. Do nothing, but keep 245 // pos.start up to date. 246 pos.start = it.limit; 247 continue; 248 } 249 250 // If the run end is before the transliteration limit, do 251 // a non-incremental transliteration. Otherwise do an 252 // incremental one. 253 UBool incremental = isIncremental && (it.limit >= allLimit); 254 255 pos.start = uprv_max(allStart, it.start); 256 pos.limit = uprv_min(allLimit, it.limit); 257 int32_t limit = pos.limit; 258 t->filteredTransliterate(text, pos, incremental); 259 int32_t delta = pos.limit - limit; 260 allLimit += delta; 261 it.adjustLimit(delta); 262 263 // We're done if we enter the post context 264 if (it.limit >= allLimit) break; 265 } 266 267 // Restore limit. pos.start is fine where the last transliterator 268 // left it, or at the end of the last run. 269 pos.limit = allLimit; 270 } 271 272 Transliterator* AnyTransliterator::getTransliterator(UScriptCode source) const { 273 274 if (source == targetScript || source == USCRIPT_INVALID_CODE) { 275 return NULL; 276 } 277 278 Transliterator* t = (Transliterator*) uhash_iget(cache, (int32_t) source); 279 if (t == NULL) { 280 UErrorCode ec = U_ZERO_ERROR; 281 UnicodeString sourceName(uscript_getName(source), -1, US_INV); 282 UnicodeString id(sourceName); 283 id.append(TARGET_SEP).append(target); 284 285 t = Transliterator::createInstance(id, UTRANS_FORWARD, ec); 286 if (U_FAILURE(ec) || t == NULL) { 287 delete t; 288 289 // Try to pivot around Latin, our most common script 290 id = sourceName; 291 id.append(LATIN_PIVOT).append(target); 292 t = Transliterator::createInstance(id, UTRANS_FORWARD, ec); 293 if (U_FAILURE(ec) || t == NULL) { 294 delete t; 295 t = NULL; 296 } 297 } 298 299 if (t != NULL) { 300 uhash_iput(cache, (int32_t) source, t, &ec); 301 } 302 } 303 304 return t; 305 } 306 307 /** 308 * Return the script code for a given name, or -1 if not found. 309 */ 310 static UScriptCode scriptNameToCode(const UnicodeString& name) { 311 char buf[128]; 312 UScriptCode code; 313 UErrorCode ec = U_ZERO_ERROR; 314 int32_t nameLen = name.length(); 315 UBool isInvariant = uprv_isInvariantUString(name.getBuffer(), nameLen); 316 317 if (isInvariant) { 318 name.extract(0, nameLen, buf, (int32_t)sizeof(buf), US_INV); 319 buf[127] = 0; // Make sure that we NULL terminate the string. 320 } 321 if (!isInvariant || uscript_getCode(buf, &code, 1, &ec) != 1 || U_FAILURE(ec)) 322 { 323 code = USCRIPT_INVALID_CODE; 324 } 325 return code; 326 } 327 328 /** 329 * Registers standard transliterators with the system. Called by 330 * Transliterator during initialization. Scan all current targets and 331 * register those that are scripts T as Any-T/V. 332 */ 333 void AnyTransliterator::registerIDs() { 334 335 UErrorCode ec = U_ZERO_ERROR; 336 Hashtable seen(TRUE, ec); 337 338 int32_t sourceCount = Transliterator::_countAvailableSources(); 339 for (int32_t s=0; s<sourceCount; ++s) { 340 UnicodeString source; 341 Transliterator::_getAvailableSource(s, source); 342 343 // Ignore the "Any" source 344 if (source.caseCompare(ANY, 0 /*U_FOLD_CASE_DEFAULT*/) == 0) continue; 345 346 int32_t targetCount = Transliterator::_countAvailableTargets(source); 347 for (int32_t t=0; t<targetCount; ++t) { 348 UnicodeString target; 349 Transliterator::_getAvailableTarget(t, source, target); 350 351 // Only process each target once 352 if (seen.geti(target) != 0) continue; 353 ec = U_ZERO_ERROR; 354 seen.puti(target, 1, ec); 355 356 // Get the script code for the target. If not a script, ignore. 357 UScriptCode targetScript = scriptNameToCode(target); 358 if (targetScript == USCRIPT_INVALID_CODE) continue; 359 360 int32_t variantCount = Transliterator::_countAvailableVariants(source, target); 361 // assert(variantCount >= 1); 362 for (int32_t v=0; v<variantCount; ++v) { 363 UnicodeString variant; 364 Transliterator::_getAvailableVariant(v, source, target, variant); 365 366 UnicodeString id; 367 TransliteratorIDParser::STVtoID(ANY, target, variant, id); 368 ec = U_ZERO_ERROR; 369 AnyTransliterator* t = new AnyTransliterator(id, target, variant, 370 targetScript, ec); 371 if (U_FAILURE(ec)) { 372 delete t; 373 } else { 374 Transliterator::_registerInstance(t); 375 Transliterator::_registerSpecialInverse(target, NULL_ID, FALSE); 376 } 377 } 378 } 379 } 380 } 381 382 U_NAMESPACE_END 383 384 #endif /* #if !UCONFIG_NO_TRANSLITERATION */ 385 386 //eof 387