Home | History | Annotate | Download | only in i18n
      1 /*
      2 **********************************************************************
      3 *   Copyright (C) 2012-2014, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 **********************************************************************
      6 */
      7 
      8 #include "unicode/utypes.h"
      9 
     10 #include "unicode/uchar.h"
     11 #include "unicode/utf16.h"
     12 
     13 #include "identifier_info.h"
     14 #include "mutex.h"
     15 #include "scriptset.h"
     16 #include "ucln_in.h"
     17 #include "uvector.h"
     18 
     19 U_NAMESPACE_BEGIN
     20 
     21 static UnicodeSet *ASCII;
     22 static ScriptSet *JAPANESE;
     23 static ScriptSet *CHINESE;
     24 static ScriptSet *KOREAN;
     25 static ScriptSet *CONFUSABLE_WITH_LATIN;
     26 static UInitOnce gIdentifierInfoInitOnce = U_INITONCE_INITIALIZER;
     27 
     28 
     29 U_CDECL_BEGIN
     30 static UBool U_CALLCONV
     31 IdentifierInfo_cleanup(void) {
     32     delete ASCII;
     33     ASCII = NULL;
     34     delete JAPANESE;
     35     JAPANESE = NULL;
     36     delete CHINESE;
     37     CHINESE = NULL;
     38     delete KOREAN;
     39     KOREAN = NULL;
     40     delete CONFUSABLE_WITH_LATIN;
     41     CONFUSABLE_WITH_LATIN = NULL;
     42     gIdentifierInfoInitOnce.reset();
     43     return TRUE;
     44 }
     45 
     46 static void U_CALLCONV
     47 IdentifierInfo_init(UErrorCode &status) {
     48     ASCII    = new UnicodeSet(0, 0x7f);
     49     JAPANESE = new ScriptSet();
     50     CHINESE  = new ScriptSet();
     51     KOREAN   = new ScriptSet();
     52     CONFUSABLE_WITH_LATIN = new ScriptSet();
     53     if (ASCII == NULL || JAPANESE == NULL || CHINESE == NULL || KOREAN == NULL
     54             || CONFUSABLE_WITH_LATIN == NULL) {
     55         status = U_MEMORY_ALLOCATION_ERROR;
     56         return;
     57     }
     58     ASCII->freeze();
     59     JAPANESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HIRAGANA, status)
     60              .set(USCRIPT_KATAKANA, status);
     61     CHINESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_BOPOMOFO, status);
     62     KOREAN->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HANGUL, status);
     63     CONFUSABLE_WITH_LATIN->set(USCRIPT_CYRILLIC, status).set(USCRIPT_GREEK, status)
     64               .set(USCRIPT_CHEROKEE, status);
     65     ucln_i18n_registerCleanup(UCLN_I18N_IDENTIFIER_INFO, IdentifierInfo_cleanup);
     66 }
     67 U_CDECL_END
     68 
     69 
     70 IdentifierInfo::IdentifierInfo(UErrorCode &status):
     71          fIdentifier(NULL), fRequiredScripts(NULL), fScriptSetSet(NULL),
     72          fCommonAmongAlternates(NULL), fNumerics(NULL), fIdentifierProfile(NULL) {
     73     umtx_initOnce(gIdentifierInfoInitOnce, &IdentifierInfo_init, status);
     74     if (U_FAILURE(status)) {
     75         return;
     76     }
     77 
     78     fIdentifier = new UnicodeString();
     79     fRequiredScripts = new ScriptSet();
     80     fScriptSetSet = uhash_open(uhash_hashScriptSet, uhash_compareScriptSet, NULL, &status);
     81     uhash_setKeyDeleter(fScriptSetSet, uhash_deleteScriptSet);
     82     fCommonAmongAlternates = new ScriptSet();
     83     fNumerics = new UnicodeSet();
     84     fIdentifierProfile = new UnicodeSet(0, 0x10FFFF);
     85 
     86     if (U_SUCCESS(status) && (fIdentifier == NULL || fRequiredScripts == NULL || fScriptSetSet == NULL ||
     87                               fCommonAmongAlternates == NULL || fNumerics == NULL || fIdentifierProfile == NULL)) {
     88         status = U_MEMORY_ALLOCATION_ERROR;
     89     }
     90 }
     91 
     92 IdentifierInfo::~IdentifierInfo() {
     93     delete fIdentifier;
     94     delete fRequiredScripts;
     95     uhash_close(fScriptSetSet);
     96     delete fCommonAmongAlternates;
     97     delete fNumerics;
     98     delete fIdentifierProfile;
     99 }
    100 
    101 
    102 IdentifierInfo &IdentifierInfo::clear() {
    103     fRequiredScripts->resetAll();
    104     uhash_removeAll(fScriptSetSet);
    105     fNumerics->clear();
    106     fCommonAmongAlternates->resetAll();
    107     return *this;
    108 }
    109 
    110 
    111 IdentifierInfo &IdentifierInfo::setIdentifierProfile(const UnicodeSet &identifierProfile) {
    112     *fIdentifierProfile = identifierProfile;
    113     return *this;
    114 }
    115 
    116 
    117 const UnicodeSet &IdentifierInfo::getIdentifierProfile() const {
    118     return *fIdentifierProfile;
    119 }
    120 
    121 
    122 IdentifierInfo &IdentifierInfo::setIdentifier(const UnicodeString &identifier, UErrorCode &status) {
    123     if (U_FAILURE(status)) {
    124         return *this;
    125     }
    126     *fIdentifier = identifier;
    127     clear();
    128     ScriptSet scriptsForCP;
    129     UChar32 cp;
    130     for (int32_t i = 0; i < identifier.length(); i += U16_LENGTH(cp)) {
    131         cp = identifier.char32At(i);
    132         // Store a representative character for each kind of decimal digit
    133         if (u_charType(cp) == U_DECIMAL_DIGIT_NUMBER) {
    134             // Just store the zero character as a representative for comparison. Unicode guarantees it is cp - value
    135             fNumerics->add(cp - (UChar32)u_getNumericValue(cp));
    136         }
    137         UScriptCode extensions[500];
    138         int32_t extensionsCount = uscript_getScriptExtensions(cp, extensions, UPRV_LENGTHOF(extensions), &status);
    139         if (U_FAILURE(status)) {
    140             return *this;
    141         }
    142         scriptsForCP.resetAll();
    143         for (int32_t j=0; j<extensionsCount; j++) {
    144             scriptsForCP.set(extensions[j], status);
    145         }
    146         scriptsForCP.reset(USCRIPT_COMMON, status);
    147         scriptsForCP.reset(USCRIPT_INHERITED, status);
    148         switch (scriptsForCP.countMembers()) {
    149           case 0: break;
    150           case 1:
    151             // Single script, record it.
    152             fRequiredScripts->Union(scriptsForCP);
    153             break;
    154           default:
    155             if (!fRequiredScripts->intersects(scriptsForCP)
    156                     && !uhash_geti(fScriptSetSet, &scriptsForCP)) {
    157                 // If the set hasn't been added already, add it
    158                 //    (Add a copy, fScriptSetSet takes ownership of the copy.)
    159                 uhash_puti(fScriptSetSet, new ScriptSet(scriptsForCP), 1, &status);
    160             }
    161             break;
    162         }
    163     }
    164     // Now make a final pass through ScriptSetSet to remove alternates that came before singles.
    165     // [Kana], [Kana Hira] => [Kana]
    166     // This is relatively infrequent, so doesn't have to be optimized.
    167     // We also compute any commonalities among the alternates.
    168     if (uhash_count(fScriptSetSet) > 0) {
    169         fCommonAmongAlternates->setAll();
    170         for (int32_t it = UHASH_FIRST;;) {
    171             const UHashElement *nextHashEl = uhash_nextElement(fScriptSetSet, &it);
    172             if (nextHashEl == NULL) {
    173                 break;
    174             }
    175             ScriptSet *next = static_cast<ScriptSet *>(nextHashEl->key.pointer);
    176             // [Kana], [Kana Hira] => [Kana]
    177             if (fRequiredScripts->intersects(*next)) {
    178                 uhash_removeElement(fScriptSetSet, nextHashEl);
    179             } else {
    180                 fCommonAmongAlternates->intersect(*next);
    181                 // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]]
    182                 for (int32_t otherIt = UHASH_FIRST;;) {
    183                     const UHashElement *otherHashEl = uhash_nextElement(fScriptSetSet, &otherIt);
    184                     if (otherHashEl == NULL) {
    185                         break;
    186                     }
    187                     ScriptSet *other = static_cast<ScriptSet *>(otherHashEl->key.pointer);
    188                     if (next != other && next->contains(*other)) {
    189                         uhash_removeElement(fScriptSetSet, nextHashEl);
    190                         break;
    191                     }
    192                 }
    193             }
    194         }
    195     }
    196     if (uhash_count(fScriptSetSet) == 0) {
    197         fCommonAmongAlternates->resetAll();
    198     }
    199     return *this;
    200 }
    201 
    202 
    203 const UnicodeString *IdentifierInfo::getIdentifier() const {
    204     return fIdentifier;
    205 }
    206 
    207 const ScriptSet *IdentifierInfo::getScripts() const {
    208     return fRequiredScripts;
    209 }
    210 
    211 const UHashtable *IdentifierInfo::getAlternates() const {
    212     return fScriptSetSet;
    213 }
    214 
    215 
    216 const UnicodeSet *IdentifierInfo::getNumerics() const {
    217     return fNumerics;
    218 }
    219 
    220 const ScriptSet *IdentifierInfo::getCommonAmongAlternates() const {
    221     return fCommonAmongAlternates;
    222 }
    223 
    224 #if !UCONFIG_NO_NORMALIZATION
    225 
    226 URestrictionLevel IdentifierInfo::getRestrictionLevel(UErrorCode &status) const {
    227     if (!fIdentifierProfile->containsAll(*fIdentifier) || getNumerics()->size() > 1) {
    228         return USPOOF_UNRESTRICTIVE;
    229     }
    230     if (ASCII->containsAll(*fIdentifier)) {
    231         return USPOOF_ASCII;
    232     }
    233     // This is a bit tricky. We look at a number of factors.
    234     // The number of scripts in the text.
    235     // Plus 1 if there is some commonality among the alternates (eg [Arab Thaa]; [Arab Syrc])
    236     // Plus number of alternates otherwise (this only works because we only test cardinality up to 2.)
    237 
    238     // Note: the requiredScripts set omits COMMON and INHERITED; they are taken out at the
    239     //       time it is created, in setIdentifier().
    240     int32_t cardinalityPlus = fRequiredScripts->countMembers() +
    241             (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1);
    242     if (cardinalityPlus < 2) {
    243         return USPOOF_SINGLE_SCRIPT_RESTRICTIVE;
    244     }
    245     if (containsWithAlternates(*JAPANESE, *fRequiredScripts) || containsWithAlternates(*CHINESE, *fRequiredScripts)
    246             || containsWithAlternates(*KOREAN, *fRequiredScripts)) {
    247         return USPOOF_HIGHLY_RESTRICTIVE;
    248     }
    249     if (cardinalityPlus == 2 &&
    250             fRequiredScripts->test(USCRIPT_LATIN, status) &&
    251             !fRequiredScripts->intersects(*CONFUSABLE_WITH_LATIN)) {
    252         return USPOOF_MODERATELY_RESTRICTIVE;
    253     }
    254     return USPOOF_MINIMALLY_RESTRICTIVE;
    255 }
    256 
    257 #endif /* !UCONFIG_NO_NORMALIZATION */
    258 
    259 int32_t IdentifierInfo::getScriptCount() const {
    260     // Note: Common and Inherited scripts were removed by setIdentifier(), and do not appear in fRequiredScripts.
    261     int32_t count = fRequiredScripts->countMembers() +
    262             (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1);
    263     return count;
    264 }
    265 
    266 
    267 
    268 UBool IdentifierInfo::containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const {
    269     if (!container.contains(containee)) {
    270         return FALSE;
    271     }
    272     for (int32_t iter = UHASH_FIRST; ;) {
    273         const UHashElement *hashEl = uhash_nextElement(fScriptSetSet, &iter);
    274         if (hashEl == NULL) {
    275             break;
    276         }
    277         ScriptSet *alternatives = static_cast<ScriptSet *>(hashEl->key.pointer);
    278         if (!container.intersects(*alternatives)) {
    279             return false;
    280         }
    281     }
    282     return true;
    283 }
    284 
    285 UnicodeString &IdentifierInfo::displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status) {
    286     UVector sorted(status);
    287     if (U_FAILURE(status)) {
    288         return dest;
    289     }
    290     for (int32_t pos = UHASH_FIRST; ;) {
    291         const UHashElement *el = uhash_nextElement(alternates, &pos);
    292         if (el == NULL) {
    293             break;
    294         }
    295         ScriptSet *ss = static_cast<ScriptSet *>(el->key.pointer);
    296         sorted.addElement(ss, status);
    297     }
    298     sorted.sort(uhash_compareScriptSet, status);
    299     UnicodeString separator = UNICODE_STRING_SIMPLE("; ");
    300     for (int32_t i=0; i<sorted.size(); i++) {
    301         if (i>0) {
    302             dest.append(separator);
    303         }
    304         ScriptSet *ss = static_cast<ScriptSet *>(sorted.elementAt(i));
    305         ss->displayScripts(dest);
    306     }
    307     return dest;
    308 }
    309 
    310 U_NAMESPACE_END
    311 
    312