Home | History | Annotate | Download | only in i18n
      1 /*
      2 *****************************************************************
      3 * Copyright (c) 2002-2008, International Business Machines Corporation
      4 * and others.  All Rights Reserved.
      5 *****************************************************************
      6 * Date        Name        Description
      7 * 06/06/2002  aliu        Creation.
      8 *****************************************************************
      9 */
     10 
     11 #include "unicode/utypes.h"
     12 
     13 #if !UCONFIG_NO_TRANSLITERATION
     14 
     15 #include "unicode/uobject.h"
     16 #include "unicode/uscript.h"
     17 #include "nultrans.h"
     18 #include "anytrans.h"
     19 #include "uvector.h"
     20 #include "tridpars.h"
     21 #include "hash.h"
     22 #include "putilimp.h"
     23 #include "uinvchar.h"
     24 
     25 //------------------------------------------------------------
     26 // Constants
     27 
     28 static const UChar TARGET_SEP = 45; // '-'
     29 static const UChar VARIANT_SEP = 47; // '/'
     30 static const UChar ANY[] = {65,110,121,0}; // "Any"
     31 static const UChar NULL_ID[] = {78,117,108,108,0}; // "Null"
     32 static const UChar LATIN_PIVOT[] = {45,76,97,116,105,110,59,76,97,116,105,110,45,0}; // "-Latin;Latin-"
     33 
     34 //------------------------------------------------------------
     35 
     36 U_CDECL_BEGIN
     37 /**
     38  * Deleter function for Transliterator*.
     39  */
     40 static void U_CALLCONV
     41 _deleteTransliterator(void *obj) {
     42     delete (U_NAMESPACE_QUALIFIER Transliterator*) obj;
     43 }
     44 U_CDECL_END
     45 
     46 //------------------------------------------------------------
     47 
     48 U_NAMESPACE_BEGIN
     49 
     50 //------------------------------------------------------------
     51 // ScriptRunIterator
     52 
     53 /**
     54  * Returns a series of ranges corresponding to scripts. They will be
     55  * of the form:
     56  *
     57  * ccccSScSSccccTTcTcccc   - c = common, S = first script, T = second
     58  * |            |          - first run (start, limit)
     59  *          |           |  - second run (start, limit)
     60  *
     61  * That is, the runs will overlap. The reason for this is so that a
     62  * transliterator can consider common characters both before and after
     63  * the scripts.
     64  */
     65 class ScriptRunIterator : public UMemory {
     66 private:
     67     const Replaceable& text;
     68     int32_t textStart;
     69     int32_t textLimit;
     70 
     71 public:
     72     /**
     73      * The code of the current run, valid after next() returns.  May
     74      * be USCRIPT_INVALID_CODE if and only if the entire text is
     75      * COMMON/INHERITED.
     76      */
     77     UScriptCode scriptCode;
     78 
     79     /**
     80      * The start of the run, inclusive, valid after next() returns.
     81      */
     82     int32_t start;
     83 
     84     /**
     85      * The end of the run, exclusive, valid after next() returns.
     86      */
     87     int32_t limit;
     88 
     89     /**
     90      * Constructs a run iterator over the given text from start
     91      * (inclusive) to limit (exclusive).
     92      */
     93     ScriptRunIterator(const Replaceable& text, int32_t start, int32_t limit);
     94 
     95     /**
     96      * Returns TRUE if there are any more runs.  TRUE is always
     97      * returned at least once.  Upon return, the caller should
     98      * examine scriptCode, start, and limit.
     99      */
    100     UBool next();
    101 
    102     /**
    103      * Adjusts internal indices for a change in the limit index of the
    104      * given delta.  A positive delta means the limit has increased.
    105      */
    106     void adjustLimit(int32_t delta);
    107 
    108 private:
    109     ScriptRunIterator(const ScriptRunIterator &other); // forbid copying of this class
    110     ScriptRunIterator &operator=(const ScriptRunIterator &other); // forbid copying of this class
    111 };
    112 
    113 ScriptRunIterator::ScriptRunIterator(const Replaceable& theText,
    114                                      int32_t myStart, int32_t myLimit) :
    115     text(theText)
    116 {
    117     textStart = myStart;
    118     textLimit = myLimit;
    119     limit = myStart;
    120 }
    121 
    122 UBool ScriptRunIterator::next() {
    123     UChar32 ch;
    124     UScriptCode s;
    125     UErrorCode ec = U_ZERO_ERROR;
    126 
    127     scriptCode = USCRIPT_INVALID_CODE; // don't know script yet
    128     start = limit;
    129 
    130     // Are we done?
    131     if (start == textLimit) {
    132         return FALSE;
    133     }
    134 
    135     // Move start back to include adjacent COMMON or INHERITED
    136     // characters
    137     while (start > textStart) {
    138         ch = text.char32At(start - 1); // look back
    139         s = uscript_getScript(ch, &ec);
    140         if (s == USCRIPT_COMMON || s == USCRIPT_INHERITED) {
    141             --start;
    142         } else {
    143             break;
    144         }
    145     }
    146 
    147     // Move limit ahead to include COMMON, INHERITED, and characters
    148     // of the current script.
    149     while (limit < textLimit) {
    150         ch = text.char32At(limit); // look ahead
    151         s = uscript_getScript(ch, &ec);
    152         if (s != USCRIPT_COMMON && s != USCRIPT_INHERITED) {
    153             if (scriptCode == USCRIPT_INVALID_CODE) {
    154                 scriptCode = s;
    155             } else if (s != scriptCode) {
    156                 break;
    157             }
    158         }
    159         ++limit;
    160     }
    161 
    162     // Return TRUE even if the entire text is COMMON / INHERITED, in
    163     // which case scriptCode will be USCRIPT_INVALID_CODE.
    164     return TRUE;
    165 }
    166 
    167 void ScriptRunIterator::adjustLimit(int32_t delta) {
    168     limit += delta;
    169     textLimit += delta;
    170 }
    171 
    172 //------------------------------------------------------------
    173 // AnyTransliterator
    174 
    175 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(AnyTransliterator)
    176 
    177 AnyTransliterator::AnyTransliterator(const UnicodeString& id,
    178                                      const UnicodeString& theTarget,
    179                                      const UnicodeString& theVariant,
    180                                      UScriptCode theTargetScript,
    181                                      UErrorCode& ec) :
    182     Transliterator(id, NULL),
    183     targetScript(theTargetScript)
    184 {
    185     cache = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &ec);
    186     if (U_FAILURE(ec)) {
    187         return;
    188     }
    189     uhash_setValueDeleter(cache, _deleteTransliterator);
    190 
    191     target = theTarget;
    192     if (theVariant.length() > 0) {
    193         target.append(VARIANT_SEP).append(theVariant);
    194     }
    195 }
    196 
    197 AnyTransliterator::~AnyTransliterator() {
    198     uhash_close(cache);
    199 }
    200 
    201 /**
    202  * Copy constructor.
    203  */
    204 AnyTransliterator::AnyTransliterator(const AnyTransliterator& o) :
    205     Transliterator(o),
    206     target(o.target),
    207     targetScript(o.targetScript)
    208 {
    209     // Don't copy the cache contents
    210     UErrorCode ec = U_ZERO_ERROR;
    211     cache = uhash_open(uhash_hashLong, uhash_compareLong, NULL, &ec);
    212     if (U_FAILURE(ec)) {
    213         return;
    214     }
    215     uhash_setValueDeleter(cache, _deleteTransliterator);
    216 }
    217 
    218 /**
    219  * Transliterator API.
    220  */
    221 Transliterator* AnyTransliterator::clone() const {
    222     return new AnyTransliterator(*this);
    223 }
    224 
    225 /**
    226  * Implements {@link Transliterator#handleTransliterate}.
    227  */
    228 void AnyTransliterator::handleTransliterate(Replaceable& text, UTransPosition& pos,
    229                                             UBool isIncremental) const {
    230     int32_t allStart = pos.start;
    231     int32_t allLimit = pos.limit;
    232 
    233     ScriptRunIterator it(text, pos.contextStart, pos.contextLimit);
    234 
    235     while (it.next()) {
    236         // Ignore runs in the ante context
    237         if (it.limit <= allStart) continue;
    238 
    239         // Try to instantiate transliterator from it.scriptCode to
    240         // our target or target/variant
    241         Transliterator* t = getTransliterator(it.scriptCode);
    242 
    243         if (t == NULL) {
    244             // We have no transliterator.  Do nothing, but keep
    245             // pos.start up to date.
    246             pos.start = it.limit;
    247             continue;
    248         }
    249 
    250         // If the run end is before the transliteration limit, do
    251         // a non-incremental transliteration.  Otherwise do an
    252         // incremental one.
    253         UBool incremental = isIncremental && (it.limit >= allLimit);
    254 
    255         pos.start = uprv_max(allStart, it.start);
    256         pos.limit = uprv_min(allLimit, it.limit);
    257         int32_t limit = pos.limit;
    258         t->filteredTransliterate(text, pos, incremental);
    259         int32_t delta = pos.limit - limit;
    260         allLimit += delta;
    261         it.adjustLimit(delta);
    262 
    263         // We're done if we enter the post context
    264         if (it.limit >= allLimit) break;
    265     }
    266 
    267     // Restore limit.  pos.start is fine where the last transliterator
    268     // left it, or at the end of the last run.
    269     pos.limit = allLimit;
    270 }
    271 
    272 Transliterator* AnyTransliterator::getTransliterator(UScriptCode source) const {
    273 
    274     if (source == targetScript || source == USCRIPT_INVALID_CODE) {
    275         return NULL;
    276     }
    277 
    278     Transliterator* t = (Transliterator*) uhash_iget(cache, (int32_t) source);
    279     if (t == NULL) {
    280         UErrorCode ec = U_ZERO_ERROR;
    281         UnicodeString sourceName(uscript_getName(source), -1, US_INV);
    282         UnicodeString id(sourceName);
    283         id.append(TARGET_SEP).append(target);
    284 
    285         t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
    286         if (U_FAILURE(ec) || t == NULL) {
    287             delete t;
    288 
    289             // Try to pivot around Latin, our most common script
    290             id = sourceName;
    291             id.append(LATIN_PIVOT).append(target);
    292             t = Transliterator::createInstance(id, UTRANS_FORWARD, ec);
    293             if (U_FAILURE(ec) || t == NULL) {
    294                 delete t;
    295                 t = NULL;
    296             }
    297         }
    298 
    299         if (t != NULL) {
    300             uhash_iput(cache, (int32_t) source, t, &ec);
    301         }
    302     }
    303 
    304     return t;
    305 }
    306 
    307 /**
    308  * Return the script code for a given name, or -1 if not found.
    309  */
    310 static UScriptCode scriptNameToCode(const UnicodeString& name) {
    311     char buf[128];
    312     UScriptCode code;
    313     UErrorCode ec = U_ZERO_ERROR;
    314     int32_t nameLen = name.length();
    315     UBool isInvariant = uprv_isInvariantUString(name.getBuffer(), nameLen);
    316 
    317     if (isInvariant) {
    318         name.extract(0, nameLen, buf, (int32_t)sizeof(buf), US_INV);
    319         buf[127] = 0;   // Make sure that we NULL terminate the string.
    320     }
    321     if (!isInvariant || uscript_getCode(buf, &code, 1, &ec) != 1 || U_FAILURE(ec))
    322     {
    323         code = USCRIPT_INVALID_CODE;
    324     }
    325     return code;
    326 }
    327 
    328 /**
    329  * Registers standard transliterators with the system.  Called by
    330  * Transliterator during initialization.  Scan all current targets and
    331  * register those that are scripts T as Any-T/V.
    332  */
    333 void AnyTransliterator::registerIDs() {
    334 
    335     UErrorCode ec = U_ZERO_ERROR;
    336     Hashtable seen(TRUE, ec);
    337 
    338     int32_t sourceCount = Transliterator::_countAvailableSources();
    339     for (int32_t s=0; s<sourceCount; ++s) {
    340         UnicodeString source;
    341         Transliterator::_getAvailableSource(s, source);
    342 
    343         // Ignore the "Any" source
    344         if (source.caseCompare(ANY, 0 /*U_FOLD_CASE_DEFAULT*/) == 0) continue;
    345 
    346         int32_t targetCount = Transliterator::_countAvailableTargets(source);
    347         for (int32_t t=0; t<targetCount; ++t) {
    348             UnicodeString target;
    349             Transliterator::_getAvailableTarget(t, source, target);
    350 
    351             // Only process each target once
    352             if (seen.geti(target) != 0) continue;
    353             ec = U_ZERO_ERROR;
    354             seen.puti(target, 1, ec);
    355 
    356             // Get the script code for the target.  If not a script, ignore.
    357             UScriptCode targetScript = scriptNameToCode(target);
    358             if (targetScript == USCRIPT_INVALID_CODE) continue;
    359 
    360             int32_t variantCount = Transliterator::_countAvailableVariants(source, target);
    361             // assert(variantCount >= 1);
    362             for (int32_t v=0; v<variantCount; ++v) {
    363                 UnicodeString variant;
    364                 Transliterator::_getAvailableVariant(v, source, target, variant);
    365 
    366                 UnicodeString id;
    367                 TransliteratorIDParser::STVtoID(ANY, target, variant, id);
    368                 ec = U_ZERO_ERROR;
    369                 AnyTransliterator* t = new AnyTransliterator(id, target, variant,
    370                                                              targetScript, ec);
    371                 if (U_FAILURE(ec)) {
    372                     delete t;
    373                 } else {
    374                     Transliterator::_registerInstance(t);
    375                     Transliterator::_registerSpecialInverse(target, NULL_ID, FALSE);
    376                 }
    377             }
    378         }
    379     }
    380 }
    381 
    382 U_NAMESPACE_END
    383 
    384 #endif /* #if !UCONFIG_NO_TRANSLITERATION */
    385 
    386 //eof
    387