Home | History | Annotate | Download | only in i18n
      1 /*
      2 *****************************************************************
      3 * Copyright (c) 2002-2014, International Business Machines Corporation
      4 * and others.  All Rights Reserved.
      5 *****************************************************************
      6 * Date        Name        Description
      7 * 06/06/2002  aliu        Creation.
      8 *****************************************************************
      9 */
     10 
     11 #include "unicode/utypes.h"
     12 
     13 #if !UCONFIG_NO_TRANSLITERATION
     14 
     15 #include "unicode/uobject.h"
     16 #include "unicode/uscript.h"
     17 
     18 #include "anytrans.h"
     19 #include "hash.h"
     20 #include "mutex.h"
     21 #include "nultrans.h"
     22 #include "putilimp.h"
     23 #include "tridpars.h"
     24 #include "uinvchar.h"
     25 #include "uvector.h"
     26 
     27 //------------------------------------------------------------
     28 // Constants
     29 
     30 static const UChar TARGET_SEP = 45; // '-'
     31 static const UChar VARIANT_SEP = 47; // '/'
     32 static const UChar ANY[] = {65,110,121,0}; // "Any"
     33 static const UChar NULL_ID[] = {78,117,108,108,0}; // "Null"
     34 static const UChar LATIN_PIVOT[] = {45,76,97,116,105,110,59,76,97,116,105,110,45,0}; // "-Latin;Latin-"
     35 
     36 //------------------------------------------------------------
     37 
     38 U_CDECL_BEGIN
     39 /**
     40  * Deleter function for Transliterator*.
     41  */
     42 static void U_CALLCONV
     43 _deleteTransliterator(void *obj) {
     44     delete (icu::Transliterator*) obj;
     45 }
     46 U_CDECL_END
     47 
     48 //------------------------------------------------------------
     49 
     50 U_NAMESPACE_BEGIN
     51 
     52 //------------------------------------------------------------
     53 // ScriptRunIterator
     54 
     55 /**
     56  * Returns a series of ranges corresponding to scripts. They will be
     57  * of the form:
     58  *
     59  * ccccSScSSccccTTcTcccc   - c = common, S = first script, T = second
     60  * |            |          - first run (start, limit)
     61  *          |           |  - second run (start, limit)
     62  *
     63  * That is, the runs will overlap. The reason for this is so that a
     64  * transliterator can consider common characters both before and after
     65  * the scripts.
     66  */
     67 class ScriptRunIterator : public UMemory {
     68 private:
     69     const Replaceable& text;
     70     int32_t textStart;
     71     int32_t textLimit;
     72 
     73 public:
     74     /**
     75      * The code of the current run, valid after next() returns.  May
     76      * be USCRIPT_INVALID_CODE if and only if the entire text is
     77      * COMMON/INHERITED.
     78      */
     79     UScriptCode scriptCode;
     80 
     81     /**
     82      * The start of the run, inclusive, valid after next() returns.
     83      */
     84     int32_t start;
     85 
     86     /**
     87      * The end of the run, exclusive, valid after next() returns.
     88      */
     89     int32_t limit;
     90 
     91     /**
     92      * Constructs a run iterator over the given text from start
     93      * (inclusive) to limit (exclusive).
     94      */
     95     ScriptRunIterator(const Replaceable& text, int32_t start, int32_t limit);
     96 
     97     /**
     98      * Returns TRUE if there are any more runs.  TRUE is always
     99      * returned at least once.  Upon return, the caller should
    100      * examine scriptCode, start, and limit.
    101      */
    102     UBool next();
    103 
    104     /**
    105      * Adjusts internal indices for a change in the limit index of the
    106      * given delta.  A positive delta means the limit has increased.
    107      */
    108     void adjustLimit(int32_t delta);
    109 
    110 private:
    111     ScriptRunIterator(const ScriptRunIterator &other); // forbid copying of this class
    112     ScriptRunIterator &operator=(const ScriptRunIterator &other); // forbid copying of this class
    113 };
    114 
    115 ScriptRunIterator::ScriptRunIterator(const Replaceable& theText,
    116                                      int32_t myStart, int32_t myLimit) :
    117     text(theText)
    118 {
    119     textStart = myStart;
    120     textLimit = myLimit;
    121     limit = myStart;
    122 }
    123 
    124 UBool ScriptRunIterator::next() {
    125     UChar32 ch;
    126     UScriptCode s;
    127     UErrorCode ec = U_ZERO_ERROR;
    128 
    129     scriptCode = USCRIPT_INVALID_CODE; // don't know script yet
    130     start = limit;
    131 
    132     // Are we done?
    133     if (start == textLimit) {
    134         return FALSE;
    135     }
    136 
    137     // Move start back to include adjacent COMMON or INHERITED
    138     // characters
    139     while (start > textStart) {
    140         ch = text.char32At(start - 1); // look back
    141         s = uscript_getScript(ch, &ec);
    142         if (s == USCRIPT_COMMON || s == USCRIPT_INHERITED) {
    143             --start;
    144         } else {
    145             break;
    146         }
    147     }
    148 
    149     // Move limit ahead to include COMMON, INHERITED, and characters
    150     // of the current script.
    151     while (limit < textLimit) {
    152         ch = text.char32At(limit); // look ahead
    153         s = uscript_getScript(ch, &ec);
    154         if (s != USCRIPT_COMMON && s != USCRIPT_INHERITED) {
    155             if (scriptCode == USCRIPT_INVALID_CODE) {
    156                 scriptCode = s;
    157             } else if (s != scriptCode) {
    158                 break;
    159             }
    160         }
    161         ++limit;
    162     }
    163 
    164     // Return TRUE even if the entire text is COMMON / INHERITED, in
    165     // which case scriptCode will be USCRIPT_INVALID_CODE.
    166     return TRUE;
    167 }
    168 
    169 void ScriptRunIterator::adjustLimit(int32_t delta) {
    170     limit += delta;
    171     textLimit += delta;
    172 }
    173 
    174 //------------------------------------------------------------
    175 // AnyTransliterator
    176 
    177 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(AnyTransliterator)
    178 
    179 AnyTransliterator::AnyTransliterator(const UnicodeString& id,
    180                                      const UnicodeString& theTarget,
    181                                      const UnicodeString& theVariant,
    182                                      UScriptCode theTargetScript,
    183                                      UErrorCode& ec) :
    184     Transliterator(id, NULL),
    185     targetScript(theTargetScript)
    186 {
    187     cache = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &ec);
    188     if (U_FAILURE(ec)) {
    189         return;
    190     }
    191     uhash_setValueDeleter(cache, _deleteTransliterator);
    192 
    193     target = theTarget;
    194     if (theVariant.length() > 0) {
    195         target.append(VARIANT_SEP).append(theVariant);
    196     }
    197 }
    198 
    199 AnyTransliterator::~AnyTransliterator() {
    200     uhash_close(cache);
    201 }
    202 
    203 /**
    204  * Copy constructor.
    205  */
    206 AnyTransliterator::AnyTransliterator(const AnyTransliterator& o) :
    207     Transliterator(o),
    208     target(o.target),
    209     targetScript(o.targetScript)
    210 {
    211     // Don't copy the cache contents
    212     UErrorCode ec = U_ZERO_ERROR;
    213     cache = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &ec);
    214     if (U_FAILURE(ec)) {
    215         return;
    216     }
    217     uhash_setValueDeleter(cache, _deleteTransliterator);
    218 }
    219 
    220 /**
    221  * Transliterator API.
    222  */
    223 Transliterator* AnyTransliterator::clone() const {
    224     return new AnyTransliterator(*this);
    225 }
    226 
    227 /**
    228  * Implements {@link Transliterator#handleTransliterate}.
    229  */
    230 void AnyTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
    231                                             UBool isIncremental) const {
    232     int32_t allStart = pos.start;
    233     int32_t allLimit = pos.limit;
    234 
    235     ScriptRunIterator it(text, pos.contextStart, pos.contextLimit);
    236 
    237     while (it.next()) {
    238         // Ignore runs in the ante context
    239         if (it.limit <= allStart) continue;
    240 
    241         // Try to instantiate transliterator from it.scriptCode to
    242         // our target or target/variant
    243         Transliterator* t = getTransliterator(it.scriptCode);
    244 
    245         if (t == NULL) {
    246             // We have no transliterator.  Do nothing, but keep
    247             // pos.start up to date.
    248             pos.start = it.limit;
    249             continue;
    250         }
    251 
    252         // If the run end is before the transliteration limit, do
    253         // a non-incremental transliteration.  Otherwise do an
    254         // incremental one.
    255         UBool incremental = isIncremental && (it.limit >= allLimit);
    256 
    257         pos.start = uprv_max(allStart, it.start);
    258         pos.limit = uprv_min(allLimit, it.limit);
    259         int32_t limit = pos.limit;
    260         t->filteredTransliterate(text, pos, incremental);
    261         int32_t delta = pos.limit - limit;
    262         allLimit += delta;
    263         it.adjustLimit(delta);
    264 
    265         // We're done if we enter the post context
    266         if (it.limit >= allLimit) break;
    267     }
    268 
    269     // Restore limit.  pos.start is fine where the last transliterator
    270     // left it, or at the end of the last run.
    271     pos.limit = allLimit;
    272 }
    273 
    274 Transliterator* AnyTransliterator::getTransliterator(UScriptCode source) const {
    275 
    276     if (source == targetScript || source == USCRIPT_INVALID_CODE) {
    277         return NULL;
    278     }
    279 
    280     Transliterator* t = NULL;
    281     {
    282         Mutex m(NULL);
    283         t = (Transliterator*) uhash_iget(cache, (int32_t) source);
    284     }
    285     if (t == NULL) {
    286         UErrorCode ec = U_ZERO_ERROR;
    287         UnicodeString sourceName(uscript_getName(source), -1, US_INV);
    288         UnicodeString id(sourceName);
    289         id.append(TARGET_SEP).append(target);
    290 
    291         t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
    292         if (U_FAILURE(ec) || t == NULL) {
    293             delete t;
    294 
    295             // Try to pivot around Latin, our most common script
    296             id = sourceName;
    297             id.append(LATIN_PIVOT, -1).append(target);
    298             t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
    299             if (U_FAILURE(ec) || t == NULL) {
    300                 delete t;
    301                 t = NULL;
    302             }
    303         }
    304 
    305         if (t != NULL) {
    306             Transliterator *rt = NULL;
    307             {
    308                 Mutex m(NULL);
    309                 rt = static_cast<Transliterator *> (uhash_iget(cache, (int32_t) source));
    310                 if (rt == NULL) {
    311                     // Common case, no race to cache this new transliterator.
    312                     uhash_iput(cache, (int32_t) source, t, &ec);
    313                 } else {
    314                     // Race case, some other thread beat us to caching this transliterator.
    315                     Transliterator *temp = rt;
    316                     rt = t;    // Our newly created transliterator that lost the race & now needs deleting.
    317                     t  = temp; // The transliterator from the cache that we will return.
    318                 }
    319             }
    320             delete rt;    // will be non-null only in case of races.
    321         }
    322     }
    323     return t;
    324 }
    325 
    326 /**
    327  * Return the script code for a given name, or -1 if not found.
    328  */
    329 static UScriptCode scriptNameToCode(const UnicodeString& name) {
    330     char buf[128];
    331     UScriptCode code;
    332     UErrorCode ec = U_ZERO_ERROR;
    333     int32_t nameLen = name.length();
    334     UBool isInvariant = uprv_isInvariantUString(name.getBuffer(), nameLen);
    335 
    336     if (isInvariant) {
    337         name.extract(0, nameLen, buf, (int32_t)sizeof(buf), US_INV);
    338         buf[127] = 0;   // Make sure that we NULL terminate the string.
    339     }
    340     if (!isInvariant || uscript_getCode(buf, &code, 1, &ec) != 1 || U_FAILURE(ec))
    341     {
    342         code = USCRIPT_INVALID_CODE;
    343     }
    344     return code;
    345 }
    346 
    347 /**
    348  * Registers standard transliterators with the system.  Called by
    349  * Transliterator during initialization.  Scan all current targets and
    350  * register those that are scripts T as Any-T/V.
    351  */
    352 void AnyTransliterator::registerIDs() {
    353 
    354     UErrorCode ec = U_ZERO_ERROR;
    355     Hashtable seen(TRUE, ec);
    356 
    357     int32_t sourceCount = Transliterator::_countAvailableSources();
    358     for (int32_t s=0; s<sourceCount; ++s) {
    359         UnicodeString source;
    360         Transliterator::_getAvailableSource(s, source);
    361 
    362         // Ignore the "Any" source
    363         if (source.caseCompare(ANY, 3, 0 /*U_FOLD_CASE_DEFAULT*/) == 0) continue;
    364 
    365         int32_t targetCount = Transliterator::_countAvailableTargets(source);
    366         for (int32_t t=0; t<targetCount; ++t) {
    367             UnicodeString target;
    368             Transliterator::_getAvailableTarget(t, source, target);
    369 
    370             // Only process each target once
    371             if (seen.geti(target) != 0) continue;
    372             ec = U_ZERO_ERROR;
    373             seen.puti(target, 1, ec);
    374 
    375             // Get the script code for the target.  If not a script, ignore.
    376             UScriptCode targetScript = scriptNameToCode(target);
    377             if (targetScript == USCRIPT_INVALID_CODE) continue;
    378 
    379             int32_t variantCount = Transliterator::_countAvailableVariants(source, target);
    380             // assert(variantCount >= 1);
    381             for (int32_t v=0; v<variantCount; ++v) {
    382                 UnicodeString variant;
    383                 Transliterator::_getAvailableVariant(v, source, target, variant);
    384 
    385                 UnicodeString id;
    386                 TransliteratorIDParser::STVtoID(UnicodeString(TRUE, ANY, 3), target, variant, id);
    387                 ec = U_ZERO_ERROR;
    388                 AnyTransliterator* t = new AnyTransliterator(id, target, variant,
    389                                                              targetScript, ec);
    390                 if (U_FAILURE(ec)) {
    391                     delete t;
    392                 } else {
    393                     Transliterator::_registerInstance(t);
    394                     Transliterator::_registerSpecialInverse(target, UnicodeString(TRUE, NULL_ID, 4), FALSE);
    395                 }
    396             }
    397         }
    398     }
    399 }
    400 
    401 U_NAMESPACE_END
    402 
    403 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
    404 
    405 //eof
    406