Home | History | Annotate | Download | only in i18n
      1 /*
      2 **********************************************************************
      3 *   Copyright (C) 2012-2013, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 **********************************************************************
      6 */
      7 
      8 #include "unicode/utypes.h"
      9 
     10 #include "unicode/uchar.h"
     11 #include "unicode/utf16.h"
     12 
     13 #include "identifier_info.h"
     14 #include "mutex.h"
     15 #include "scriptset.h"
     16 #include "ucln_in.h"
     17 #include "uvector.h"
     18 
     19 U_NAMESPACE_BEGIN
     20 
     21 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
     22 
     23 static UMutex gInitMutex = U_MUTEX_INITIALIZER;
     24 static UBool gStaticsAreInitialized = FALSE;
     25 
     26 UnicodeSet *IdentifierInfo::ASCII;
     27 ScriptSet *IdentifierInfo::JAPANESE;
     28 ScriptSet *IdentifierInfo::CHINESE;
     29 ScriptSet *IdentifierInfo::KOREAN;
     30 ScriptSet *IdentifierInfo::CONFUSABLE_WITH_LATIN;
     31 
     32 UBool IdentifierInfo::cleanup() {
     33     delete ASCII;
     34     ASCII = NULL;
     35     delete JAPANESE;
     36     JAPANESE = NULL;
     37     delete CHINESE;
     38     CHINESE = NULL;
     39     delete KOREAN;
     40     KOREAN = NULL;
     41     delete CONFUSABLE_WITH_LATIN;
     42     CONFUSABLE_WITH_LATIN = NULL;
     43     gStaticsAreInitialized = FALSE;
     44     return TRUE;
     45 }
     46 
     47 U_CDECL_BEGIN
     48 static UBool U_CALLCONV
     49 IdentifierInfo_cleanup(void) {
     50     return IdentifierInfo::cleanup();
     51 }
     52 U_CDECL_END
     53 
     54 
     55 IdentifierInfo::IdentifierInfo(UErrorCode &status):
     56          fIdentifier(NULL), fRequiredScripts(NULL), fScriptSetSet(NULL),
     57          fCommonAmongAlternates(NULL), fNumerics(NULL), fIdentifierProfile(NULL) {
     58     if (U_FAILURE(status)) {
     59         return;
     60     }
     61     {
     62         Mutex lock(&gInitMutex);
     63         if (!gStaticsAreInitialized) {
     64             ASCII    = new UnicodeSet(0, 0x7f);
     65             JAPANESE = new ScriptSet();
     66             CHINESE  = new ScriptSet();
     67             KOREAN   = new ScriptSet();
     68             CONFUSABLE_WITH_LATIN = new ScriptSet();
     69             if (ASCII == NULL || JAPANESE == NULL || CHINESE == NULL || KOREAN == NULL
     70                     || CONFUSABLE_WITH_LATIN == NULL) {
     71                 status = U_MEMORY_ALLOCATION_ERROR;
     72                 return;
     73             }
     74             ASCII->freeze();
     75             JAPANESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HIRAGANA, status)
     76                      .set(USCRIPT_KATAKANA, status);
     77             CHINESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_BOPOMOFO, status);
     78             KOREAN->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HANGUL, status);
     79             CONFUSABLE_WITH_LATIN->set(USCRIPT_CYRILLIC, status).set(USCRIPT_GREEK, status)
     80                       .set(USCRIPT_CHEROKEE, status);
     81             ucln_i18n_registerCleanup(UCLN_I18N_IDENTIFIER_INFO, IdentifierInfo_cleanup);
     82             gStaticsAreInitialized = TRUE;
     83         }
     84     }
     85     fIdentifier = new UnicodeString();
     86     fRequiredScripts = new ScriptSet();
     87     fScriptSetSet = uhash_open(uhash_hashScriptSet, uhash_compareScriptSet, NULL, &status);
     88     uhash_setKeyDeleter(fScriptSetSet, uhash_deleteScriptSet);
     89     fCommonAmongAlternates = new ScriptSet();
     90     fNumerics = new UnicodeSet();
     91     fIdentifierProfile = new UnicodeSet(0, 0x10FFFF);
     92 
     93     if (U_SUCCESS(status) && (fIdentifier == NULL || fRequiredScripts == NULL || fScriptSetSet == NULL ||
     94                               fCommonAmongAlternates == NULL || fNumerics == NULL || fIdentifierProfile == NULL)) {
     95         status = U_MEMORY_ALLOCATION_ERROR;
     96     }
     97 }
     98 
     99 IdentifierInfo::~IdentifierInfo() {
    100     delete fIdentifier;
    101     delete fRequiredScripts;
    102     uhash_close(fScriptSetSet);
    103     delete fCommonAmongAlternates;
    104     delete fNumerics;
    105     delete fIdentifierProfile;
    106 }
    107 
    108 
    109 IdentifierInfo &IdentifierInfo::clear() {
    110     fRequiredScripts->resetAll();
    111     uhash_removeAll(fScriptSetSet);
    112     fNumerics->clear();
    113     fCommonAmongAlternates->resetAll();
    114     return *this;
    115 }
    116 
    117 
    118 IdentifierInfo &IdentifierInfo::setIdentifierProfile(const UnicodeSet &identifierProfile) {
    119     *fIdentifierProfile = identifierProfile;
    120     return *this;
    121 }
    122 
    123 
    124 const UnicodeSet &IdentifierInfo::getIdentifierProfile() const {
    125     return *fIdentifierProfile;
    126 }
    127 
    128 
    129 IdentifierInfo &IdentifierInfo::setIdentifier(const UnicodeString &identifier, UErrorCode &status) {
    130     if (U_FAILURE(status)) {
    131         return *this;
    132     }
    133     *fIdentifier = identifier;
    134     clear();
    135     ScriptSet scriptsForCP;
    136     UChar32 cp;
    137     for (int32_t i = 0; i < identifier.length(); i += U16_LENGTH(cp)) {
    138         cp = identifier.char32At(i);
    139         // Store a representative character for each kind of decimal digit
    140         if (u_charType(cp) == U_DECIMAL_DIGIT_NUMBER) {
    141             // Just store the zero character as a representative for comparison. Unicode guarantees it is cp - value
    142             fNumerics->add(cp - (UChar32)u_getNumericValue(cp));
    143         }
    144         UScriptCode extensions[500];
    145         int32_t extensionsCount = uscript_getScriptExtensions(cp, extensions, LENGTHOF(extensions), &status);
    146         if (U_FAILURE(status)) {
    147             return *this;
    148         }
    149         scriptsForCP.resetAll();
    150         for (int32_t j=0; j<extensionsCount; j++) {
    151             scriptsForCP.set(extensions[j], status);
    152         }
    153         scriptsForCP.reset(USCRIPT_COMMON, status);
    154         scriptsForCP.reset(USCRIPT_INHERITED, status);
    155         switch (scriptsForCP.countMembers()) {
    156           case 0: break;
    157           case 1:
    158             // Single script, record it.
    159             fRequiredScripts->Union(scriptsForCP);
    160             break;
    161           default:
    162             if (!fRequiredScripts->intersects(scriptsForCP)
    163                     && !uhash_geti(fScriptSetSet, &scriptsForCP)) {
    164                 // If the set hasn't been added already, add it
    165                 //    (Add a copy, fScriptSetSet takes ownership of the copy.)
    166                 uhash_puti(fScriptSetSet, new ScriptSet(scriptsForCP), 1, &status);
    167             }
    168             break;
    169         }
    170     }
    171     // Now make a final pass through ScriptSetSet to remove alternates that came before singles.
    172     // [Kana], [Kana Hira] => [Kana]
    173     // This is relatively infrequent, so doesn't have to be optimized.
    174     // We also compute any commonalities among the alternates.
    175     if (uhash_count(fScriptSetSet) > 0) {
    176         fCommonAmongAlternates->setAll();
    177         for (int32_t it = -1;;) {
    178             const UHashElement *nextHashEl = uhash_nextElement(fScriptSetSet, &it);
    179             if (nextHashEl == NULL) {
    180                 break;
    181             }
    182             ScriptSet *next = static_cast<ScriptSet *>(nextHashEl->key.pointer);
    183             // [Kana], [Kana Hira] => [Kana]
    184             if (fRequiredScripts->intersects(*next)) {
    185                 uhash_removeElement(fScriptSetSet, nextHashEl);
    186             } else {
    187                 fCommonAmongAlternates->intersect(*next);
    188                 // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]]
    189                 for (int32_t otherIt = -1;;) {
    190                     const UHashElement *otherHashEl = uhash_nextElement(fScriptSetSet, &otherIt);
    191                     if (otherHashEl == NULL) {
    192                         break;
    193                     }
    194                     ScriptSet *other = static_cast<ScriptSet *>(otherHashEl->key.pointer);
    195                     if (next != other && next->contains(*other)) {
    196                         uhash_removeElement(fScriptSetSet, nextHashEl);
    197                         break;
    198                     }
    199                 }
    200             }
    201         }
    202     }
    203     if (uhash_count(fScriptSetSet) == 0) {
    204         fCommonAmongAlternates->resetAll();
    205     }
    206     return *this;
    207 }
    208 
    209 
    210 const UnicodeString *IdentifierInfo::getIdentifier() const {
    211     return fIdentifier;
    212 }
    213 
    214 const ScriptSet *IdentifierInfo::getScripts() const {
    215     return fRequiredScripts;
    216 }
    217 
    218 const UHashtable *IdentifierInfo::getAlternates() const {
    219     return fScriptSetSet;
    220 }
    221 
    222 
    223 const UnicodeSet *IdentifierInfo::getNumerics() const {
    224     return fNumerics;
    225 }
    226 
    227 const ScriptSet *IdentifierInfo::getCommonAmongAlternates() const {
    228     return fCommonAmongAlternates;
    229 }
    230 
    231 #if !UCONFIG_NO_NORMALIZATION
    232 
    233 URestrictionLevel IdentifierInfo::getRestrictionLevel(UErrorCode &status) const {
    234     if (!fIdentifierProfile->containsAll(*fIdentifier) || getNumerics()->size() > 1) {
    235         return USPOOF_UNRESTRICTIVE;
    236     }
    237     if (ASCII->containsAll(*fIdentifier)) {
    238         return USPOOF_ASCII;
    239     }
    240     // This is a bit tricky. We look at a number of factors.
    241     // The number of scripts in the text.
    242     // Plus 1 if there is some commonality among the alternates (eg [Arab Thaa]; [Arab Syrc])
    243     // Plus number of alternates otherwise (this only works because we only test cardinality up to 2.)
    244 
    245     // Note: the requiredScripts set omits COMMON and INHERITED; they are taken out at the
    246     //       time it is created, in setIdentifier().
    247     int32_t cardinalityPlus = fRequiredScripts->countMembers() +
    248             (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1);
    249     if (cardinalityPlus < 2) {
    250         return USPOOF_HIGHLY_RESTRICTIVE;
    251     }
    252     if (containsWithAlternates(*JAPANESE, *fRequiredScripts) || containsWithAlternates(*CHINESE, *fRequiredScripts)
    253             || containsWithAlternates(*KOREAN, *fRequiredScripts)) {
    254         return USPOOF_HIGHLY_RESTRICTIVE;
    255     }
    256     if (cardinalityPlus == 2 &&
    257             fRequiredScripts->test(USCRIPT_LATIN, status) &&
    258             !fRequiredScripts->intersects(*CONFUSABLE_WITH_LATIN)) {
    259         return USPOOF_MODERATELY_RESTRICTIVE;
    260     }
    261     return USPOOF_MINIMALLY_RESTRICTIVE;
    262 }
    263 
    264 #endif /* !UCONFIG_NO_NORMALIZATION */
    265 
    266 int32_t IdentifierInfo::getScriptCount() const {
    267     // Note: Common and Inherited scripts were removed by setIdentifier(), and do not appear in fRequiredScripts.
    268     int32_t count = fRequiredScripts->countMembers() +
    269             (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1);
    270     return count;
    271 }
    272 
    273 
    274 
    275 UBool IdentifierInfo::containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const {
    276     if (!container.contains(containee)) {
    277         return FALSE;
    278     }
    279     for (int32_t iter = -1; ;) {
    280         const UHashElement *hashEl = uhash_nextElement(fScriptSetSet, &iter);
    281         if (hashEl == NULL) {
    282             break;
    283         }
    284         ScriptSet *alternatives = static_cast<ScriptSet *>(hashEl->key.pointer);
    285         if (!container.intersects(*alternatives)) {
    286             return false;
    287         }
    288     }
    289     return true;
    290 }
    291 
    292 UnicodeString &IdentifierInfo::displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status) {
    293     UVector sorted(status);
    294     if (U_FAILURE(status)) {
    295         return dest;
    296     }
    297     for (int32_t pos = -1; ;) {
    298         const UHashElement *el = uhash_nextElement(alternates, &pos);
    299         if (el == NULL) {
    300             break;
    301         }
    302         ScriptSet *ss = static_cast<ScriptSet *>(el->key.pointer);
    303         sorted.addElement(ss, status);
    304     }
    305     sorted.sort(uhash_compareScriptSet, status);
    306     UnicodeString separator = UNICODE_STRING_SIMPLE("; ");
    307     for (int32_t i=0; i<sorted.size(); i++) {
    308         if (i>0) {
    309             dest.append(separator);
    310         }
    311         ScriptSet *ss = static_cast<ScriptSet *>(sorted.elementAt(i));
    312         ss->displayScripts(dest);
    313     }
    314     return dest;
    315 }
    316 
    317 U_NAMESPACE_END
    318 
    319