1 /* 2 ********************************************************************** 3 * Copyright (C) 2012-2014, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 */ 7 8 #include "unicode/utypes.h" 9 10 #include "unicode/uchar.h" 11 #include "unicode/utf16.h" 12 13 #include "identifier_info.h" 14 #include "mutex.h" 15 #include "scriptset.h" 16 #include "ucln_in.h" 17 #include "uvector.h" 18 19 U_NAMESPACE_BEGIN 20 21 static UnicodeSet *ASCII; 22 static ScriptSet *JAPANESE; 23 static ScriptSet *CHINESE; 24 static ScriptSet *KOREAN; 25 static ScriptSet *CONFUSABLE_WITH_LATIN; 26 static UInitOnce gIdentifierInfoInitOnce = U_INITONCE_INITIALIZER; 27 28 29 U_CDECL_BEGIN 30 static UBool U_CALLCONV 31 IdentifierInfo_cleanup(void) { 32 delete ASCII; 33 ASCII = NULL; 34 delete JAPANESE; 35 JAPANESE = NULL; 36 delete CHINESE; 37 CHINESE = NULL; 38 delete KOREAN; 39 KOREAN = NULL; 40 delete CONFUSABLE_WITH_LATIN; 41 CONFUSABLE_WITH_LATIN = NULL; 42 gIdentifierInfoInitOnce.reset(); 43 return TRUE; 44 } 45 46 static void U_CALLCONV 47 IdentifierInfo_init(UErrorCode &status) { 48 ASCII = new UnicodeSet(0, 0x7f); 49 JAPANESE = new ScriptSet(); 50 CHINESE = new ScriptSet(); 51 KOREAN = new ScriptSet(); 52 CONFUSABLE_WITH_LATIN = new ScriptSet(); 53 if (ASCII == NULL || JAPANESE == NULL || CHINESE == NULL || KOREAN == NULL 54 || CONFUSABLE_WITH_LATIN == NULL) { 55 status = U_MEMORY_ALLOCATION_ERROR; 56 return; 57 } 58 ASCII->freeze(); 59 JAPANESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HIRAGANA, status) 60 .set(USCRIPT_KATAKANA, status); 61 CHINESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_BOPOMOFO, status); 62 KOREAN->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HANGUL, status); 63 CONFUSABLE_WITH_LATIN->set(USCRIPT_CYRILLIC, status).set(USCRIPT_GREEK, status) 64 .set(USCRIPT_CHEROKEE, status); 65 ucln_i18n_registerCleanup(UCLN_I18N_IDENTIFIER_INFO, IdentifierInfo_cleanup); 66 } 67 U_CDECL_END 68 69 70 IdentifierInfo::IdentifierInfo(UErrorCode &status): 71 fIdentifier(NULL), fRequiredScripts(NULL), fScriptSetSet(NULL), 72 fCommonAmongAlternates(NULL), fNumerics(NULL), fIdentifierProfile(NULL) { 73 umtx_initOnce(gIdentifierInfoInitOnce, &IdentifierInfo_init, status); 74 if (U_FAILURE(status)) { 75 return; 76 } 77 78 fIdentifier = new UnicodeString(); 79 fRequiredScripts = new ScriptSet(); 80 fScriptSetSet = uhash_open(uhash_hashScriptSet, uhash_compareScriptSet, NULL, &status); 81 uhash_setKeyDeleter(fScriptSetSet, uhash_deleteScriptSet); 82 fCommonAmongAlternates = new ScriptSet(); 83 fNumerics = new UnicodeSet(); 84 fIdentifierProfile = new UnicodeSet(0, 0x10FFFF); 85 86 if (U_SUCCESS(status) && (fIdentifier == NULL || fRequiredScripts == NULL || fScriptSetSet == NULL || 87 fCommonAmongAlternates == NULL || fNumerics == NULL || fIdentifierProfile == NULL)) { 88 status = U_MEMORY_ALLOCATION_ERROR; 89 } 90 } 91 92 IdentifierInfo::~IdentifierInfo() { 93 delete fIdentifier; 94 delete fRequiredScripts; 95 uhash_close(fScriptSetSet); 96 delete fCommonAmongAlternates; 97 delete fNumerics; 98 delete fIdentifierProfile; 99 } 100 101 102 IdentifierInfo &IdentifierInfo::clear() { 103 fRequiredScripts->resetAll(); 104 uhash_removeAll(fScriptSetSet); 105 fNumerics->clear(); 106 fCommonAmongAlternates->resetAll(); 107 return *this; 108 } 109 110 111 IdentifierInfo &IdentifierInfo::setIdentifierProfile(const UnicodeSet &identifierProfile) { 112 *fIdentifierProfile = identifierProfile; 113 return *this; 114 } 115 116 117 const UnicodeSet &IdentifierInfo::getIdentifierProfile() const { 118 return *fIdentifierProfile; 119 } 120 121 122 IdentifierInfo &IdentifierInfo::setIdentifier(const UnicodeString &identifier, UErrorCode &status) { 123 if (U_FAILURE(status)) { 124 return *this; 125 } 126 *fIdentifier = identifier; 127 clear(); 128 ScriptSet scriptsForCP; 129 UChar32 cp; 130 for (int32_t i = 0; i < identifier.length(); i += U16_LENGTH(cp)) { 131 cp = identifier.char32At(i); 132 // Store a representative character for each kind of decimal digit 133 if (u_charType(cp) == U_DECIMAL_DIGIT_NUMBER) { 134 // Just store the zero character as a representative for comparison. Unicode guarantees it is cp - value 135 fNumerics->add(cp - (UChar32)u_getNumericValue(cp)); 136 } 137 UScriptCode extensions[500]; 138 int32_t extensionsCount = uscript_getScriptExtensions(cp, extensions, UPRV_LENGTHOF(extensions), &status); 139 if (U_FAILURE(status)) { 140 return *this; 141 } 142 scriptsForCP.resetAll(); 143 for (int32_t j=0; j<extensionsCount; j++) { 144 scriptsForCP.set(extensions[j], status); 145 } 146 scriptsForCP.reset(USCRIPT_COMMON, status); 147 scriptsForCP.reset(USCRIPT_INHERITED, status); 148 switch (scriptsForCP.countMembers()) { 149 case 0: break; 150 case 1: 151 // Single script, record it. 152 fRequiredScripts->Union(scriptsForCP); 153 break; 154 default: 155 if (!fRequiredScripts->intersects(scriptsForCP) 156 && !uhash_geti(fScriptSetSet, &scriptsForCP)) { 157 // If the set hasn't been added already, add it 158 // (Add a copy, fScriptSetSet takes ownership of the copy.) 159 uhash_puti(fScriptSetSet, new ScriptSet(scriptsForCP), 1, &status); 160 } 161 break; 162 } 163 } 164 // Now make a final pass through ScriptSetSet to remove alternates that came before singles. 165 // [Kana], [Kana Hira] => [Kana] 166 // This is relatively infrequent, so doesn't have to be optimized. 167 // We also compute any commonalities among the alternates. 168 if (uhash_count(fScriptSetSet) > 0) { 169 fCommonAmongAlternates->setAll(); 170 for (int32_t it = UHASH_FIRST;;) { 171 const UHashElement *nextHashEl = uhash_nextElement(fScriptSetSet, &it); 172 if (nextHashEl == NULL) { 173 break; 174 } 175 ScriptSet *next = static_cast<ScriptSet *>(nextHashEl->key.pointer); 176 // [Kana], [Kana Hira] => [Kana] 177 if (fRequiredScripts->intersects(*next)) { 178 uhash_removeElement(fScriptSetSet, nextHashEl); 179 } else { 180 fCommonAmongAlternates->intersect(*next); 181 // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]] 182 for (int32_t otherIt = UHASH_FIRST;;) { 183 const UHashElement *otherHashEl = uhash_nextElement(fScriptSetSet, &otherIt); 184 if (otherHashEl == NULL) { 185 break; 186 } 187 ScriptSet *other = static_cast<ScriptSet *>(otherHashEl->key.pointer); 188 if (next != other && next->contains(*other)) { 189 uhash_removeElement(fScriptSetSet, nextHashEl); 190 break; 191 } 192 } 193 } 194 } 195 } 196 if (uhash_count(fScriptSetSet) == 0) { 197 fCommonAmongAlternates->resetAll(); 198 } 199 return *this; 200 } 201 202 203 const UnicodeString *IdentifierInfo::getIdentifier() const { 204 return fIdentifier; 205 } 206 207 const ScriptSet *IdentifierInfo::getScripts() const { 208 return fRequiredScripts; 209 } 210 211 const UHashtable *IdentifierInfo::getAlternates() const { 212 return fScriptSetSet; 213 } 214 215 216 const UnicodeSet *IdentifierInfo::getNumerics() const { 217 return fNumerics; 218 } 219 220 const ScriptSet *IdentifierInfo::getCommonAmongAlternates() const { 221 return fCommonAmongAlternates; 222 } 223 224 #if !UCONFIG_NO_NORMALIZATION 225 226 URestrictionLevel IdentifierInfo::getRestrictionLevel(UErrorCode &status) const { 227 if (!fIdentifierProfile->containsAll(*fIdentifier) || getNumerics()->size() > 1) { 228 return USPOOF_UNRESTRICTIVE; 229 } 230 if (ASCII->containsAll(*fIdentifier)) { 231 return USPOOF_ASCII; 232 } 233 // This is a bit tricky. We look at a number of factors. 234 // The number of scripts in the text. 235 // Plus 1 if there is some commonality among the alternates (eg [Arab Thaa]; [Arab Syrc]) 236 // Plus number of alternates otherwise (this only works because we only test cardinality up to 2.) 237 238 // Note: the requiredScripts set omits COMMON and INHERITED; they are taken out at the 239 // time it is created, in setIdentifier(). 240 int32_t cardinalityPlus = fRequiredScripts->countMembers() + 241 (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1); 242 if (cardinalityPlus < 2) { 243 return USPOOF_SINGLE_SCRIPT_RESTRICTIVE; 244 } 245 if (containsWithAlternates(*JAPANESE, *fRequiredScripts) || containsWithAlternates(*CHINESE, *fRequiredScripts) 246 || containsWithAlternates(*KOREAN, *fRequiredScripts)) { 247 return USPOOF_HIGHLY_RESTRICTIVE; 248 } 249 if (cardinalityPlus == 2 && 250 fRequiredScripts->test(USCRIPT_LATIN, status) && 251 !fRequiredScripts->intersects(*CONFUSABLE_WITH_LATIN)) { 252 return USPOOF_MODERATELY_RESTRICTIVE; 253 } 254 return USPOOF_MINIMALLY_RESTRICTIVE; 255 } 256 257 #endif /* !UCONFIG_NO_NORMALIZATION */ 258 259 int32_t IdentifierInfo::getScriptCount() const { 260 // Note: Common and Inherited scripts were removed by setIdentifier(), and do not appear in fRequiredScripts. 261 int32_t count = fRequiredScripts->countMembers() + 262 (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1); 263 return count; 264 } 265 266 267 268 UBool IdentifierInfo::containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const { 269 if (!container.contains(containee)) { 270 return FALSE; 271 } 272 for (int32_t iter = UHASH_FIRST; ;) { 273 const UHashElement *hashEl = uhash_nextElement(fScriptSetSet, &iter); 274 if (hashEl == NULL) { 275 break; 276 } 277 ScriptSet *alternatives = static_cast<ScriptSet *>(hashEl->key.pointer); 278 if (!container.intersects(*alternatives)) { 279 return false; 280 } 281 } 282 return true; 283 } 284 285 UnicodeString &IdentifierInfo::displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status) { 286 UVector sorted(status); 287 if (U_FAILURE(status)) { 288 return dest; 289 } 290 for (int32_t pos = UHASH_FIRST; ;) { 291 const UHashElement *el = uhash_nextElement(alternates, &pos); 292 if (el == NULL) { 293 break; 294 } 295 ScriptSet *ss = static_cast<ScriptSet *>(el->key.pointer); 296 sorted.addElement(ss, status); 297 } 298 sorted.sort(uhash_compareScriptSet, status); 299 UnicodeString separator = UNICODE_STRING_SIMPLE("; "); 300 for (int32_t i=0; i<sorted.size(); i++) { 301 if (i>0) { 302 dest.append(separator); 303 } 304 ScriptSet *ss = static_cast<ScriptSet *>(sorted.elementAt(i)); 305 ss->displayScripts(dest); 306 } 307 return dest; 308 } 309 310 U_NAMESPACE_END 311 312