Home | History | Annotate | Download | only in i18n
      1 /*
      2 **********************************************************************
      3 *   Copyright (C) 2012-2014, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 **********************************************************************
      6 */
      7 
      8 #include "unicode/utypes.h"
      9 
     10 #include "unicode/uchar.h"
     11 #include "unicode/utf16.h"
     12 
     13 #include "identifier_info.h"
     14 #include "mutex.h"
     15 #include "scriptset.h"
     16 #include "ucln_in.h"
     17 #include "uvector.h"
     18 
     19 U_NAMESPACE_BEGIN
     20 
     21 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
     22 
     23 static UnicodeSet *ASCII;
     24 static ScriptSet *JAPANESE;
     25 static ScriptSet *CHINESE;
     26 static ScriptSet *KOREAN;
     27 static ScriptSet *CONFUSABLE_WITH_LATIN;
     28 static UInitOnce gIdentifierInfoInitOnce = U_INITONCE_INITIALIZER;
     29 
     30 
     31 U_CDECL_BEGIN
     32 static UBool U_CALLCONV
     33 IdentifierInfo_cleanup(void) {
     34     delete ASCII;
     35     ASCII = NULL;
     36     delete JAPANESE;
     37     JAPANESE = NULL;
     38     delete CHINESE;
     39     CHINESE = NULL;
     40     delete KOREAN;
     41     KOREAN = NULL;
     42     delete CONFUSABLE_WITH_LATIN;
     43     CONFUSABLE_WITH_LATIN = NULL;
     44     gIdentifierInfoInitOnce.reset();
     45     return TRUE;
     46 }
     47 
     48 static void U_CALLCONV
     49 IdentifierInfo_init(UErrorCode &status) {
     50     ASCII    = new UnicodeSet(0, 0x7f);
     51     JAPANESE = new ScriptSet();
     52     CHINESE  = new ScriptSet();
     53     KOREAN   = new ScriptSet();
     54     CONFUSABLE_WITH_LATIN = new ScriptSet();
     55     if (ASCII == NULL || JAPANESE == NULL || CHINESE == NULL || KOREAN == NULL
     56             || CONFUSABLE_WITH_LATIN == NULL) {
     57         status = U_MEMORY_ALLOCATION_ERROR;
     58         return;
     59     }
     60     ASCII->freeze();
     61     JAPANESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HIRAGANA, status)
     62              .set(USCRIPT_KATAKANA, status);
     63     CHINESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_BOPOMOFO, status);
     64     KOREAN->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HANGUL, status);
     65     CONFUSABLE_WITH_LATIN->set(USCRIPT_CYRILLIC, status).set(USCRIPT_GREEK, status)
     66               .set(USCRIPT_CHEROKEE, status);
     67     ucln_i18n_registerCleanup(UCLN_I18N_IDENTIFIER_INFO, IdentifierInfo_cleanup);
     68 }
     69 U_CDECL_END
     70 
     71 
     72 IdentifierInfo::IdentifierInfo(UErrorCode &status):
     73          fIdentifier(NULL), fRequiredScripts(NULL), fScriptSetSet(NULL),
     74          fCommonAmongAlternates(NULL), fNumerics(NULL), fIdentifierProfile(NULL) {
     75     umtx_initOnce(gIdentifierInfoInitOnce, &IdentifierInfo_init, status);
     76     if (U_FAILURE(status)) {
     77         return;
     78     }
     79 
     80     fIdentifier = new UnicodeString();
     81     fRequiredScripts = new ScriptSet();
     82     fScriptSetSet = uhash_open(uhash_hashScriptSet, uhash_compareScriptSet, NULL, &status);
     83     uhash_setKeyDeleter(fScriptSetSet, uhash_deleteScriptSet);
     84     fCommonAmongAlternates = new ScriptSet();
     85     fNumerics = new UnicodeSet();
     86     fIdentifierProfile = new UnicodeSet(0, 0x10FFFF);
     87 
     88     if (U_SUCCESS(status) && (fIdentifier == NULL || fRequiredScripts == NULL || fScriptSetSet == NULL ||
     89                               fCommonAmongAlternates == NULL || fNumerics == NULL || fIdentifierProfile == NULL)) {
     90         status = U_MEMORY_ALLOCATION_ERROR;
     91     }
     92 }
     93 
     94 IdentifierInfo::~IdentifierInfo() {
     95     delete fIdentifier;
     96     delete fRequiredScripts;
     97     uhash_close(fScriptSetSet);
     98     delete fCommonAmongAlternates;
     99     delete fNumerics;
    100     delete fIdentifierProfile;
    101 }
    102 
    103 
    104 IdentifierInfo &IdentifierInfo::clear() {
    105     fRequiredScripts->resetAll();
    106     uhash_removeAll(fScriptSetSet);
    107     fNumerics->clear();
    108     fCommonAmongAlternates->resetAll();
    109     return *this;
    110 }
    111 
    112 
    113 IdentifierInfo &IdentifierInfo::setIdentifierProfile(const UnicodeSet &identifierProfile) {
    114     *fIdentifierProfile = identifierProfile;
    115     return *this;
    116 }
    117 
    118 
    119 const UnicodeSet &IdentifierInfo::getIdentifierProfile() const {
    120     return *fIdentifierProfile;
    121 }
    122 
    123 
    124 IdentifierInfo &IdentifierInfo::setIdentifier(const UnicodeString &identifier, UErrorCode &status) {
    125     if (U_FAILURE(status)) {
    126         return *this;
    127     }
    128     *fIdentifier = identifier;
    129     clear();
    130     ScriptSet scriptsForCP;
    131     UChar32 cp;
    132     for (int32_t i = 0; i < identifier.length(); i += U16_LENGTH(cp)) {
    133         cp = identifier.char32At(i);
    134         // Store a representative character for each kind of decimal digit
    135         if (u_charType(cp) == U_DECIMAL_DIGIT_NUMBER) {
    136             // Just store the zero character as a representative for comparison. Unicode guarantees it is cp - value
    137             fNumerics->add(cp - (UChar32)u_getNumericValue(cp));
    138         }
    139         UScriptCode extensions[500];
    140         int32_t extensionsCount = uscript_getScriptExtensions(cp, extensions, LENGTHOF(extensions), &status);
    141         if (U_FAILURE(status)) {
    142             return *this;
    143         }
    144         scriptsForCP.resetAll();
    145         for (int32_t j=0; j<extensionsCount; j++) {
    146             scriptsForCP.set(extensions[j], status);
    147         }
    148         scriptsForCP.reset(USCRIPT_COMMON, status);
    149         scriptsForCP.reset(USCRIPT_INHERITED, status);
    150         switch (scriptsForCP.countMembers()) {
    151           case 0: break;
    152           case 1:
    153             // Single script, record it.
    154             fRequiredScripts->Union(scriptsForCP);
    155             break;
    156           default:
    157             if (!fRequiredScripts->intersects(scriptsForCP)
    158                     && !uhash_geti(fScriptSetSet, &scriptsForCP)) {
    159                 // If the set hasn't been added already, add it
    160                 //    (Add a copy, fScriptSetSet takes ownership of the copy.)
    161                 uhash_puti(fScriptSetSet, new ScriptSet(scriptsForCP), 1, &status);
    162             }
    163             break;
    164         }
    165     }
    166     // Now make a final pass through ScriptSetSet to remove alternates that came before singles.
    167     // [Kana], [Kana Hira] => [Kana]
    168     // This is relatively infrequent, so doesn't have to be optimized.
    169     // We also compute any commonalities among the alternates.
    170     if (uhash_count(fScriptSetSet) > 0) {
    171         fCommonAmongAlternates->setAll();
    172         for (int32_t it = -1;;) {
    173             const UHashElement *nextHashEl = uhash_nextElement(fScriptSetSet, &it);
    174             if (nextHashEl == NULL) {
    175                 break;
    176             }
    177             ScriptSet *next = static_cast<ScriptSet *>(nextHashEl->key.pointer);
    178             // [Kana], [Kana Hira] => [Kana]
    179             if (fRequiredScripts->intersects(*next)) {
    180                 uhash_removeElement(fScriptSetSet, nextHashEl);
    181             } else {
    182                 fCommonAmongAlternates->intersect(*next);
    183                 // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]]
    184                 for (int32_t otherIt = -1;;) {
    185                     const UHashElement *otherHashEl = uhash_nextElement(fScriptSetSet, &otherIt);
    186                     if (otherHashEl == NULL) {
    187                         break;
    188                     }
    189                     ScriptSet *other = static_cast<ScriptSet *>(otherHashEl->key.pointer);
    190                     if (next != other && next->contains(*other)) {
    191                         uhash_removeElement(fScriptSetSet, nextHashEl);
    192                         break;
    193                     }
    194                 }
    195             }
    196         }
    197     }
    198     if (uhash_count(fScriptSetSet) == 0) {
    199         fCommonAmongAlternates->resetAll();
    200     }
    201     return *this;
    202 }
    203 
    204 
    205 const UnicodeString *IdentifierInfo::getIdentifier() const {
    206     return fIdentifier;
    207 }
    208 
    209 const ScriptSet *IdentifierInfo::getScripts() const {
    210     return fRequiredScripts;
    211 }
    212 
    213 const UHashtable *IdentifierInfo::getAlternates() const {
    214     return fScriptSetSet;
    215 }
    216 
    217 
    218 const UnicodeSet *IdentifierInfo::getNumerics() const {
    219     return fNumerics;
    220 }
    221 
    222 const ScriptSet *IdentifierInfo::getCommonAmongAlternates() const {
    223     return fCommonAmongAlternates;
    224 }
    225 
    226 #if !UCONFIG_NO_NORMALIZATION
    227 
    228 URestrictionLevel IdentifierInfo::getRestrictionLevel(UErrorCode &status) const {
    229     if (!fIdentifierProfile->containsAll(*fIdentifier) || getNumerics()->size() > 1) {
    230         return USPOOF_UNRESTRICTIVE;
    231     }
    232     if (ASCII->containsAll(*fIdentifier)) {
    233         return USPOOF_ASCII;
    234     }
    235     // This is a bit tricky. We look at a number of factors.
    236     // The number of scripts in the text.
    237     // Plus 1 if there is some commonality among the alternates (eg [Arab Thaa]; [Arab Syrc])
    238     // Plus number of alternates otherwise (this only works because we only test cardinality up to 2.)
    239 
    240     // Note: the requiredScripts set omits COMMON and INHERITED; they are taken out at the
    241     //       time it is created, in setIdentifier().
    242     int32_t cardinalityPlus = fRequiredScripts->countMembers() +
    243             (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1);
    244     if (cardinalityPlus < 2) {
    245         return USPOOF_SINGLE_SCRIPT_RESTRICTIVE;
    246     }
    247     if (containsWithAlternates(*JAPANESE, *fRequiredScripts) || containsWithAlternates(*CHINESE, *fRequiredScripts)
    248             || containsWithAlternates(*KOREAN, *fRequiredScripts)) {
    249         return USPOOF_HIGHLY_RESTRICTIVE;
    250     }
    251     if (cardinalityPlus == 2 &&
    252             fRequiredScripts->test(USCRIPT_LATIN, status) &&
    253             !fRequiredScripts->intersects(*CONFUSABLE_WITH_LATIN)) {
    254         return USPOOF_MODERATELY_RESTRICTIVE;
    255     }
    256     return USPOOF_MINIMALLY_RESTRICTIVE;
    257 }
    258 
    259 #endif /* !UCONFIG_NO_NORMALIZATION */
    260 
    261 int32_t IdentifierInfo::getScriptCount() const {
    262     // Note: Common and Inherited scripts were removed by setIdentifier(), and do not appear in fRequiredScripts.
    263     int32_t count = fRequiredScripts->countMembers() +
    264             (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1);
    265     return count;
    266 }
    267 
    268 
    269 
    270 UBool IdentifierInfo::containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const {
    271     if (!container.contains(containee)) {
    272         return FALSE;
    273     }
    274     for (int32_t iter = -1; ;) {
    275         const UHashElement *hashEl = uhash_nextElement(fScriptSetSet, &iter);
    276         if (hashEl == NULL) {
    277             break;
    278         }
    279         ScriptSet *alternatives = static_cast<ScriptSet *>(hashEl->key.pointer);
    280         if (!container.intersects(*alternatives)) {
    281             return false;
    282         }
    283     }
    284     return true;
    285 }
    286 
    287 UnicodeString &IdentifierInfo::displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status) {
    288     UVector sorted(status);
    289     if (U_FAILURE(status)) {
    290         return dest;
    291     }
    292     for (int32_t pos = -1; ;) {
    293         const UHashElement *el = uhash_nextElement(alternates, &pos);
    294         if (el == NULL) {
    295             break;
    296         }
    297         ScriptSet *ss = static_cast<ScriptSet *>(el->key.pointer);
    298         sorted.addElement(ss, status);
    299     }
    300     sorted.sort(uhash_compareScriptSet, status);
    301     UnicodeString separator = UNICODE_STRING_SIMPLE("; ");
    302     for (int32_t i=0; i<sorted.size(); i++) {
    303         if (i>0) {
    304             dest.append(separator);
    305         }
    306         ScriptSet *ss = static_cast<ScriptSet *>(sorted.elementAt(i));
    307         ss->displayScripts(dest);
    308     }
    309     return dest;
    310 }
    311 
    312 U_NAMESPACE_END
    313 
    314