1 /* 2 ********************************************************************** 3 * Copyright (C) 2012-2014, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 */ 7 8 #include "unicode/utypes.h" 9 10 #include "unicode/uchar.h" 11 #include "unicode/utf16.h" 12 13 #include "identifier_info.h" 14 #include "mutex.h" 15 #include "scriptset.h" 16 #include "ucln_in.h" 17 #include "uvector.h" 18 19 U_NAMESPACE_BEGIN 20 21 #define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 22 23 static UnicodeSet *ASCII; 24 static ScriptSet *JAPANESE; 25 static ScriptSet *CHINESE; 26 static ScriptSet *KOREAN; 27 static ScriptSet *CONFUSABLE_WITH_LATIN; 28 static UInitOnce gIdentifierInfoInitOnce = U_INITONCE_INITIALIZER; 29 30 31 U_CDECL_BEGIN 32 static UBool U_CALLCONV 33 IdentifierInfo_cleanup(void) { 34 delete ASCII; 35 ASCII = NULL; 36 delete JAPANESE; 37 JAPANESE = NULL; 38 delete CHINESE; 39 CHINESE = NULL; 40 delete KOREAN; 41 KOREAN = NULL; 42 delete CONFUSABLE_WITH_LATIN; 43 CONFUSABLE_WITH_LATIN = NULL; 44 gIdentifierInfoInitOnce.reset(); 45 return TRUE; 46 } 47 48 static void U_CALLCONV 49 IdentifierInfo_init(UErrorCode &status) { 50 ASCII = new UnicodeSet(0, 0x7f); 51 JAPANESE = new ScriptSet(); 52 CHINESE = new ScriptSet(); 53 KOREAN = new ScriptSet(); 54 CONFUSABLE_WITH_LATIN = new ScriptSet(); 55 if (ASCII == NULL || JAPANESE == NULL || CHINESE == NULL || KOREAN == NULL 56 || CONFUSABLE_WITH_LATIN == NULL) { 57 status = U_MEMORY_ALLOCATION_ERROR; 58 return; 59 } 60 ASCII->freeze(); 61 JAPANESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HIRAGANA, status) 62 .set(USCRIPT_KATAKANA, status); 63 CHINESE->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_BOPOMOFO, status); 64 KOREAN->set(USCRIPT_LATIN, status).set(USCRIPT_HAN, status).set(USCRIPT_HANGUL, status); 65 CONFUSABLE_WITH_LATIN->set(USCRIPT_CYRILLIC, status).set(USCRIPT_GREEK, status) 66 .set(USCRIPT_CHEROKEE, status); 67 ucln_i18n_registerCleanup(UCLN_I18N_IDENTIFIER_INFO, IdentifierInfo_cleanup); 68 } 69 U_CDECL_END 70 71 72 IdentifierInfo::IdentifierInfo(UErrorCode &status): 73 fIdentifier(NULL), fRequiredScripts(NULL), fScriptSetSet(NULL), 74 fCommonAmongAlternates(NULL), fNumerics(NULL), fIdentifierProfile(NULL) { 75 umtx_initOnce(gIdentifierInfoInitOnce, &IdentifierInfo_init, status); 76 if (U_FAILURE(status)) { 77 return; 78 } 79 80 fIdentifier = new UnicodeString(); 81 fRequiredScripts = new ScriptSet(); 82 fScriptSetSet = uhash_open(uhash_hashScriptSet, uhash_compareScriptSet, NULL, &status); 83 uhash_setKeyDeleter(fScriptSetSet, uhash_deleteScriptSet); 84 fCommonAmongAlternates = new ScriptSet(); 85 fNumerics = new UnicodeSet(); 86 fIdentifierProfile = new UnicodeSet(0, 0x10FFFF); 87 88 if (U_SUCCESS(status) && (fIdentifier == NULL || fRequiredScripts == NULL || fScriptSetSet == NULL || 89 fCommonAmongAlternates == NULL || fNumerics == NULL || fIdentifierProfile == NULL)) { 90 status = U_MEMORY_ALLOCATION_ERROR; 91 } 92 } 93 94 IdentifierInfo::~IdentifierInfo() { 95 delete fIdentifier; 96 delete fRequiredScripts; 97 uhash_close(fScriptSetSet); 98 delete fCommonAmongAlternates; 99 delete fNumerics; 100 delete fIdentifierProfile; 101 } 102 103 104 IdentifierInfo &IdentifierInfo::clear() { 105 fRequiredScripts->resetAll(); 106 uhash_removeAll(fScriptSetSet); 107 fNumerics->clear(); 108 fCommonAmongAlternates->resetAll(); 109 return *this; 110 } 111 112 113 IdentifierInfo &IdentifierInfo::setIdentifierProfile(const UnicodeSet &identifierProfile) { 114 *fIdentifierProfile = identifierProfile; 115 return *this; 116 } 117 118 119 const UnicodeSet &IdentifierInfo::getIdentifierProfile() const { 120 return *fIdentifierProfile; 121 } 122 123 124 IdentifierInfo &IdentifierInfo::setIdentifier(const UnicodeString &identifier, UErrorCode &status) { 125 if (U_FAILURE(status)) { 126 return *this; 127 } 128 *fIdentifier = identifier; 129 clear(); 130 ScriptSet scriptsForCP; 131 UChar32 cp; 132 for (int32_t i = 0; i < identifier.length(); i += U16_LENGTH(cp)) { 133 cp = identifier.char32At(i); 134 // Store a representative character for each kind of decimal digit 135 if (u_charType(cp) == U_DECIMAL_DIGIT_NUMBER) { 136 // Just store the zero character as a representative for comparison. Unicode guarantees it is cp - value 137 fNumerics->add(cp - (UChar32)u_getNumericValue(cp)); 138 } 139 UScriptCode extensions[500]; 140 int32_t extensionsCount = uscript_getScriptExtensions(cp, extensions, LENGTHOF(extensions), &status); 141 if (U_FAILURE(status)) { 142 return *this; 143 } 144 scriptsForCP.resetAll(); 145 for (int32_t j=0; j<extensionsCount; j++) { 146 scriptsForCP.set(extensions[j], status); 147 } 148 scriptsForCP.reset(USCRIPT_COMMON, status); 149 scriptsForCP.reset(USCRIPT_INHERITED, status); 150 switch (scriptsForCP.countMembers()) { 151 case 0: break; 152 case 1: 153 // Single script, record it. 154 fRequiredScripts->Union(scriptsForCP); 155 break; 156 default: 157 if (!fRequiredScripts->intersects(scriptsForCP) 158 && !uhash_geti(fScriptSetSet, &scriptsForCP)) { 159 // If the set hasn't been added already, add it 160 // (Add a copy, fScriptSetSet takes ownership of the copy.) 161 uhash_puti(fScriptSetSet, new ScriptSet(scriptsForCP), 1, &status); 162 } 163 break; 164 } 165 } 166 // Now make a final pass through ScriptSetSet to remove alternates that came before singles. 167 // [Kana], [Kana Hira] => [Kana] 168 // This is relatively infrequent, so doesn't have to be optimized. 169 // We also compute any commonalities among the alternates. 170 if (uhash_count(fScriptSetSet) > 0) { 171 fCommonAmongAlternates->setAll(); 172 for (int32_t it = -1;;) { 173 const UHashElement *nextHashEl = uhash_nextElement(fScriptSetSet, &it); 174 if (nextHashEl == NULL) { 175 break; 176 } 177 ScriptSet *next = static_cast<ScriptSet *>(nextHashEl->key.pointer); 178 // [Kana], [Kana Hira] => [Kana] 179 if (fRequiredScripts->intersects(*next)) { 180 uhash_removeElement(fScriptSetSet, nextHashEl); 181 } else { 182 fCommonAmongAlternates->intersect(*next); 183 // [[Arab Syrc Thaa]; [Arab Syrc]] => [[Arab Syrc]] 184 for (int32_t otherIt = -1;;) { 185 const UHashElement *otherHashEl = uhash_nextElement(fScriptSetSet, &otherIt); 186 if (otherHashEl == NULL) { 187 break; 188 } 189 ScriptSet *other = static_cast<ScriptSet *>(otherHashEl->key.pointer); 190 if (next != other && next->contains(*other)) { 191 uhash_removeElement(fScriptSetSet, nextHashEl); 192 break; 193 } 194 } 195 } 196 } 197 } 198 if (uhash_count(fScriptSetSet) == 0) { 199 fCommonAmongAlternates->resetAll(); 200 } 201 return *this; 202 } 203 204 205 const UnicodeString *IdentifierInfo::getIdentifier() const { 206 return fIdentifier; 207 } 208 209 const ScriptSet *IdentifierInfo::getScripts() const { 210 return fRequiredScripts; 211 } 212 213 const UHashtable *IdentifierInfo::getAlternates() const { 214 return fScriptSetSet; 215 } 216 217 218 const UnicodeSet *IdentifierInfo::getNumerics() const { 219 return fNumerics; 220 } 221 222 const ScriptSet *IdentifierInfo::getCommonAmongAlternates() const { 223 return fCommonAmongAlternates; 224 } 225 226 #if !UCONFIG_NO_NORMALIZATION 227 228 URestrictionLevel IdentifierInfo::getRestrictionLevel(UErrorCode &status) const { 229 if (!fIdentifierProfile->containsAll(*fIdentifier) || getNumerics()->size() > 1) { 230 return USPOOF_UNRESTRICTIVE; 231 } 232 if (ASCII->containsAll(*fIdentifier)) { 233 return USPOOF_ASCII; 234 } 235 // This is a bit tricky. We look at a number of factors. 236 // The number of scripts in the text. 237 // Plus 1 if there is some commonality among the alternates (eg [Arab Thaa]; [Arab Syrc]) 238 // Plus number of alternates otherwise (this only works because we only test cardinality up to 2.) 239 240 // Note: the requiredScripts set omits COMMON and INHERITED; they are taken out at the 241 // time it is created, in setIdentifier(). 242 int32_t cardinalityPlus = fRequiredScripts->countMembers() + 243 (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1); 244 if (cardinalityPlus < 2) { 245 return USPOOF_SINGLE_SCRIPT_RESTRICTIVE; 246 } 247 if (containsWithAlternates(*JAPANESE, *fRequiredScripts) || containsWithAlternates(*CHINESE, *fRequiredScripts) 248 || containsWithAlternates(*KOREAN, *fRequiredScripts)) { 249 return USPOOF_HIGHLY_RESTRICTIVE; 250 } 251 if (cardinalityPlus == 2 && 252 fRequiredScripts->test(USCRIPT_LATIN, status) && 253 !fRequiredScripts->intersects(*CONFUSABLE_WITH_LATIN)) { 254 return USPOOF_MODERATELY_RESTRICTIVE; 255 } 256 return USPOOF_MINIMALLY_RESTRICTIVE; 257 } 258 259 #endif /* !UCONFIG_NO_NORMALIZATION */ 260 261 int32_t IdentifierInfo::getScriptCount() const { 262 // Note: Common and Inherited scripts were removed by setIdentifier(), and do not appear in fRequiredScripts. 263 int32_t count = fRequiredScripts->countMembers() + 264 (fCommonAmongAlternates->countMembers() == 0 ? uhash_count(fScriptSetSet) : 1); 265 return count; 266 } 267 268 269 270 UBool IdentifierInfo::containsWithAlternates(const ScriptSet &container, const ScriptSet &containee) const { 271 if (!container.contains(containee)) { 272 return FALSE; 273 } 274 for (int32_t iter = -1; ;) { 275 const UHashElement *hashEl = uhash_nextElement(fScriptSetSet, &iter); 276 if (hashEl == NULL) { 277 break; 278 } 279 ScriptSet *alternatives = static_cast<ScriptSet *>(hashEl->key.pointer); 280 if (!container.intersects(*alternatives)) { 281 return false; 282 } 283 } 284 return true; 285 } 286 287 UnicodeString &IdentifierInfo::displayAlternates(UnicodeString &dest, const UHashtable *alternates, UErrorCode &status) { 288 UVector sorted(status); 289 if (U_FAILURE(status)) { 290 return dest; 291 } 292 for (int32_t pos = -1; ;) { 293 const UHashElement *el = uhash_nextElement(alternates, &pos); 294 if (el == NULL) { 295 break; 296 } 297 ScriptSet *ss = static_cast<ScriptSet *>(el->key.pointer); 298 sorted.addElement(ss, status); 299 } 300 sorted.sort(uhash_compareScriptSet, status); 301 UnicodeString separator = UNICODE_STRING_SIMPLE("; "); 302 for (int32_t i=0; i<sorted.size(); i++) { 303 if (i>0) { 304 dest.append(separator); 305 } 306 ScriptSet *ss = static_cast<ScriptSet *>(sorted.elementAt(i)); 307 ss->displayScripts(dest); 308 } 309 return dest; 310 } 311 312 U_NAMESPACE_END 313 314