1 // Copyright (c) 2009 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // This file extends lang_enc.cc with additional languages and extended routines 6 // It is current with Unicode 5.1 (beta Jan 2008) 7 // 8 9 #include <stdlib.h> 10 #include <stdio.h> 11 #include <string.h> 12 13 #include "encodings/compact_lang_det/ext_lang_enc.h" 14 #include "encodings/compact_lang_det/win/cld_macros.h" 15 #include "encodings/compact_lang_det/win/cld_strtoint.h" 16 17 // Language names above NUM_LANGUAGES 18 // These are also the C enum declared names 19 static const char* const kExtLanguageName[] = { 20 "X_BORK_BORK_BORK", "X_PIG_LATIN", "X_HACKER", "X_KLINGON", "X_ELMER_FUDD", 21 22 // Pseudo-languages for Unicode scripts that express a single language 23 "X_OGHAM", "X_RUNIC", "X_YI", "X_OLD_ITALIC", "X_GOTHIC", 24 "X_DESERET", "X_HANUNOO", "X_BUHID", "X_TAGBANWA", "X_TAI_LE", 25 "X_LINEAR_B", "X_UGARITIC", "X_SHAVIAN", "X_OSMANYA", "X_CYPRIOT", 26 "X_BUGINESE", "X_COPTIC", "X_NEW_TAI_LUE", "X_GLAGOLITIC", "X_TIFINAGH", 27 "X_SYLOTI_NAGRI", "X_OLD_PERSIAN", "X_KHAROSHTHI", "X_BALINESE", "X_CUNEIFORM", 28 "X_PHOENICIAN", "X_PHAGS_PA", "X_NKO", 29 30 // Unicode 5.1 31 "X_SUDANESE", "X_LEPCHA", "X_OL_CHIKI", "X_VAI", "X_SAURASHTRA", 32 "X_KAYAH_LI", "X_REJANG", "X_LYCIAN", "X_CARIAN", "X_LYDIAN", 33 "X_CHAM", 34 }; 35 36 37 // These are the C enum declared names, for programs creating C code 38 static const char* const kExtLangDeclaredName[] = { 39 "ENGLISH", /* 0 */ 40 "DANISH", /* 1 */ 41 "DUTCH", /* 2 */ 42 "FINNISH", /* 3 */ 43 "FRENCH", /* 4 */ 44 "GERMAN", /* 5 */ 45 "HEBREW", /* 6 */ 46 "ITALIAN", /* 7 */ 47 "JAPANESE", /* 8 */ 48 "KOREAN", /* 9 */ 49 "NORWEGIAN", /* 10 */ 50 "POLISH", /* 11 */ 51 "PORTUGUESE", /* 12 */ 52 "RUSSIAN", /* 13 */ 53 "SPANISH", /* 14 */ 54 "SWEDISH", /* 15 */ 55 "CHINESE", /* 16 */ 56 "CZECH", /* 17 */ 57 "GREEK", /* 18 */ 58 "ICELANDIC", /* 19 */ 59 "LATVIAN", /* 20 */ 60 "LITHUANIAN", /* 21 */ 61 "ROMANIAN", /* 22 */ 62 "HUNGARIAN", /* 23 */ 63 "ESTONIAN", /* 24 */ 64 "TG_UNKNOWN_LANGUAGE", /* 25 */ 65 "UNKNOWN_LANGUAGE", /* 26 */ 66 "BULGARIAN", /* 27 */ 67 "CROATIAN", /* 28 */ 68 "SERBIAN", /* 29 */ 69 "IRISH", /* 30 */ 70 "GALICIAN", /* 31 */ 71 "TAGALOG", /* 32 */ 72 "TURKISH", /* 33 */ 73 "UKRAINIAN", /* 34 */ 74 "HINDI", /* 35 */ 75 "MACEDONIAN", /* 36 */ 76 "BENGALI", /* 37 */ 77 "INDONESIAN", /* 38 */ 78 "LATIN", /* 39 */ 79 "MALAY", /* 40 */ 80 "MALAYALAM", /* 41 */ 81 "WELSH", /* 42 */ 82 "NEPALI", /* 43 */ 83 "TELUGU", /* 44 */ 84 "ALBANIAN", /* 45 */ 85 "TAMIL", /* 46 */ 86 "BELARUSIAN", /* 47 */ 87 "JAVANESE", /* 48 */ 88 "OCCITAN", /* 49 */ 89 "URDU", /* 50 */ 90 "BIHARI", /* 51 */ 91 "GUJARATI", /* 52 */ 92 "THAI", /* 53 */ 93 "ARABIC", /* 54 */ 94 "CATALAN", /* 55 */ 95 "ESPERANTO", /* 56 */ 96 "BASQUE", /* 57 */ 97 "INTERLINGUA", /* 58 */ 98 "KANNADA", /* 59 */ 99 "PUNJABI", /* 60 */ 100 "SCOTS_GAELIC", /* 61 */ 101 "SWAHILI", /* 62 */ 102 "SLOVENIAN", /* 63 */ 103 "MARATHI", /* 64 */ 104 "MALTESE", /* 65 */ 105 "VIETNAMESE", /* 66 */ 106 "FRISIAN", /* 67 */ 107 "SLOVAK", /* 68 */ 108 "CHINESE_T", /* 69 */ 109 "FAROESE", /* 70 */ 110 "SUNDANESE", /* 71 */ 111 "UZBEK", /* 72 */ 112 "AMHARIC", /* 73 */ 113 "AZERBAIJANI", /* 74 */ 114 "GEORGIAN", /* 75 */ 115 "TIGRINYA", /* 76 */ 116 "PERSIAN", /* 77 */ 117 "BOSNIAN", /* 78 */ 118 "SINHALESE", /* 79 */ 119 "NORWEGIAN_N", /* 80 */ 120 "PORTUGUESE_P", /* 81 */ 121 "PORTUGUESE_B", /* 82 */ 122 "XHOSA", /* 83 */ 123 "ZULU", /* 84 */ 124 "GUARANI", /* 85 */ 125 "SESOTHO", /* 86 */ 126 "TURKMEN", /* 87 */ 127 "KYRGYZ", /* 88 */ 128 "BRETON", /* 89 */ 129 "TWI", /* 90 */ 130 "YIDDISH", /* 91 */ 131 "SERBO_CROATIAN", /* 92 */ 132 "SOMALI", /* 93 */ 133 "UIGHUR", /* 94 */ 134 "KURDISH", /* 95 */ 135 "MONGOLIAN", /* 96 */ 136 "ARMENIAN", /* 97 */ 137 "LAOTHIAN", /* 98 */ 138 "SINDHI", /* 99 */ 139 "RHAETO_ROMANCE", /* 100 */ 140 "AFRIKAANS", /* 101 */ 141 "LUXEMBOURGISH", /* 102 */ 142 "BURMESE", /* 103 */ 143 "KHMER", /* 104 */ 144 "TIBETAN", /* 105 */ 145 "DHIVEHI", /* 106 */ // sometimes spelled Divehi; lang of Maldives 146 "CHEROKEE", /* 107 */ 147 "SYRIAC", /* 108 */ 148 "LIMBU", /* 109 */ 149 "ORIYA", /* 110 */ 150 "ASSAMESE", /* 111 */ 151 "CORSICAN", /* 112 */ 152 "INTERLINGUE", /* 113 */ 153 "KAZAKH", /* 114 */ 154 "LINGALA", /* 115 */ 155 "MOLDAVIAN", /* 116 */ 156 "PASHTO", /* 117 */ 157 "QUECHUA", /* 118 */ 158 "SHONA", /* 119 */ 159 "TAJIK", /* 120 */ 160 "TATAR", /* 121 */ 161 "TONGA", /* 122 */ 162 "YORUBA", /* 123 */ 163 "CREOLES_AND_PIDGINS_ENGLISH_BASED", /* 124 */ 164 "CREOLES_AND_PIDGINS_FRENCH_BASED", /* 125 */ 165 "CREOLES_AND_PIDGINS_PORTUGUESE_BASED", /* 126 */ 166 "CREOLES_AND_PIDGINS_OTHER", /* 127 */ 167 "MAORI", /* 128 */ 168 "WOLOF", /* 129 */ 169 "ABKHAZIAN", /* 130 */ 170 "AFAR", /* 131 */ 171 "AYMARA", /* 132 */ 172 "BASHKIR", /* 133 */ 173 "BISLAMA", /* 134 */ 174 "DZONGKHA", /* 135 */ 175 "FIJIAN", /* 136 */ 176 "GREENLANDIC", /* 137 */ 177 "HAUSA", /* 138 */ 178 "HAITIAN_CREOLE", /* 139 */ 179 "INUPIAK", /* 140 */ 180 "INUKTITUT", /* 141 */ 181 "KASHMIRI", /* 142 */ 182 "KINYARWANDA", /* 143 */ 183 "MALAGASY", /* 144 */ 184 "NAURU", /* 145 */ 185 "OROMO", /* 146 */ 186 "RUNDI", /* 147 */ 187 "SAMOAN", /* 148 */ 188 "SANGO", /* 149 */ 189 "SANSKRIT", /* 150 */ 190 "SISWANT", /* 151 */ 191 "TSONGA", /* 152 */ 192 "TSWANA", /* 153 */ 193 "VOLAPUK", /* 154 */ 194 "ZHUANG", /* 155 */ 195 "KHASI", /* 156 */ 196 "SCOTS", /* 157 */ 197 "GANDA", /* 158 */ 198 "MANX", /* 159 */ 199 "MONTENEGRIN", /* 160 */ 200 // Add new language declared names just before here 201 }; 202 203 COMPILE_ASSERT(arraysize(kExtLangDeclaredName) == NUM_LANGUAGES, 204 kExtLangDeclaredName_has_incorrect_length); 205 206 207 // Language codes above NUM_LANGUAGES 208 // I made all these up, except Klingon from ISO-639-2 (dsites) 209 // NOTE: zza is a standard name 210 static const char* const kExtLanguageCode[] = { 211 // "X_BORK_BORK_BORK", "X_PIG_LATIN", "X_HACKER", "X_KLINGON", "X_ELMER_FUDD", 212 // All Latin script 213 "zzb", "zzp", "zzh", "tlh", "zze", 214 215 // Pseudo-languages for Unicode scripts that express a single language 216 "xx-Ogam", "xx-Runr", "xx-Yiii", "xx-Ital", "xx-Goth", 217 "xx-Dsrt", "xx-Hano", "xx-Buhd", "xx-Tagb", "xx-Tale", 218 "xx-Linb", "xx-Ugar", "xx-Shaw", "xx-Osma", "xx-Cprt", 219 "xx-Bugi", "xx-Copt", "xx-Talu", "xx-Glag", "xx-Tfng", 220 "xx-Sylo", "xx-Xpeo", "xx-Khar", "xx-Bali", "xx-Xsux", 221 "xx-Phnx", "xx-Phag", "xx-Nkoo", 222 223 // Unicode 5.1 224 "xx-Sund", "xx-Lepc", "xx-Olck", "xx-Vaii", "xx-Saur", 225 "xx-Kali", "xx-Rjng", "xx-Lyci", "xx-Cari", "xx-Lydi", 226 "xx-Cham", 227 }; 228 229 230 // Given the Language, returns its string name used as the output by 231 // the lang/enc identifier, e.g. "Korean" 232 // "invalid_language" if the input is invalid. 233 // TG_UNKNOWN_LANGUAGE is used as a placeholder for the "ignore me" language, 234 // used to subtract out HTML, link farms, DNA strings, and alittle English porn 235 const char* ExtLanguageName(const Language lang) { 236 if (lang < 0) { 237 // No-text-at-all result from a Tote 238 return ""; 239 } 240 // CompactLanguageDetect extension 241 if (lang == TG_UNKNOWN_LANGUAGE) { 242 return "Ignore"; 243 } 244 if ((0 <= lang) && (lang < NUM_LANGUAGES)) { 245 return LanguageName(lang); 246 } 247 if ((EXT_LANGUAGE_BASE <= lang) && (lang < EXT_NUM_LANGUAGES)) { 248 return kExtLanguageName[lang - EXT_LANGUAGE_BASE]; 249 } 250 return invalid_language_name(); 251 } 252 253 254 // Given the Language, returns its Language enum spelling, for use by 255 // programs that create C declarations, e.g. "KOREAN" 256 // "UNKNOWN_LANGUAGE" if the input is invalid. 257 const char* ExtLanguageDeclaredName(const Language lang) { 258 if ((0 <= lang) && (lang < NUM_LANGUAGES)) { 259 return kExtLangDeclaredName[lang]; 260 } 261 if ((EXT_LANGUAGE_BASE <= lang) && (lang < EXT_NUM_LANGUAGES)) { 262 return kExtLanguageName[lang - EXT_LANGUAGE_BASE]; 263 } 264 return "UNKNOWN_LANGUAGE"; 265 } 266 267 // Given the Language, return the language code, e.g. "ko" 268 const char* ExtLanguageCode(const Language lang) { 269 // Hack for ignore/porn pseudo-language 270 if (lang == TG_UNKNOWN_LANGUAGE) { 271 return "xxx"; 272 } 273 if ((0 <= lang) && (lang < NUM_LANGUAGES)) { 274 return LanguageCode(lang); 275 } 276 if ((EXT_LANGUAGE_BASE <= lang) && (lang < EXT_NUM_LANGUAGES)) { 277 return kExtLanguageCode[lang - EXT_LANGUAGE_BASE]; 278 } 279 return "??"; 280 } 281 282 283 // Convert "en-Latn-GB" to ENGLISH 284 // Normalize to PORTUGUESE, not PORTUGUESE_B nor PORTUGUESE_P 285 // Consider for later: NORWEGIAN, NORWEGIAN_N 286 // Consider for later: SCOTS, SCOTS_GAELIC 287 // Consider for later: SERBO_CROATIAN, SERBIAN, CROATIAN, BOSNIAN 288 // 289 Language GetLanguageFromNumberOrName(const char* src) { 290 if (strspn(src, "0123456789") == strlen(src)) { 291 // All digits 292 return static_cast<Language>(strto32(src, NULL, 10)); 293 } 294 295 Language retlang = UNKNOWN_LANGUAGE; 296 size_t len = strlen(src); 297 298 if (true /*FLAGS_mergepairs*/) { 299 // Merge sets of langauges pt-xx en-xx fr-xx, NOT bs/hr/sr 300 if (memcmp(src, "pt-", 3) == 0) {return PORTUGUESE;} 301 if (memcmp(src, "en-", 3) == 0) {return ENGLISH;} 302 if (memcmp(src, "fr-", 3) == 0) {return FRENCH;} 303 // Use NormalizeLanguage instead 304 if (memcmp(src, "bs-", 3) == 0) {return CROATIAN;} 305 if (memcmp(src, "hr-", 3) == 0) {return CROATIAN;} 306 if (memcmp(src, "sr-Latn", 7) == 0) {return CROATIAN;} 307 if (memcmp(src, "sh-Latn", 7) == 0) {return CROATIAN;} 308 if (memcmp(src, "sr-Cyrl", 7) == 0) {return SERBIAN;} 309 if (memcmp(src, "sh-Cyrl", 7) == 0) {return SERBIAN;} 310 } 311 312 // Extensions 313 if (len >= 3) { 314 // Standin for ignore/porn "language" 315 if (memcmp(src, "xxx", 3) == 0) {return TG_UNKNOWN_LANGUAGE;} 316 317 if (memcmp(src, "zzb", 3) == 0) {return X_BORK_BORK_BORK;} 318 if (memcmp(src, "zzp", 3) == 0) {return X_PIG_LATIN;} 319 if (memcmp(src, "zzh", 3) == 0) {return X_HACKER;} 320 if (memcmp(src, "tlh", 3) == 0) {return X_KLINGON;} 321 if (memcmp(src, "zze", 3) == 0) {return X_ELMER_FUDD;} 322 } 323 324 // We have a name like en-Latn-GB or pt-BR 325 // First, get rid of some special cases 326 if (len <= 3) { 327 LanguageFromCode(src, &retlang); 328 } else if (len == 7) { 329 // More Extensions 330 if (memcmp(src, "xx-", 3) == 0) { 331 if (memcmp(src, "xx-Ogam", 7) == 0) {return X_OGHAM;} 332 if (memcmp(src, "xx-Runr", 7) == 0) {return X_RUNIC;} 333 if (memcmp(src, "xx-Yiii", 7) == 0) {return X_YI;} 334 if (memcmp(src, "xx-Ital", 7) == 0) {return X_OLD_ITALIC;} 335 if (memcmp(src, "xx-Goth", 7) == 0) {return X_GOTHIC;} 336 if (memcmp(src, "xx-Dsrt", 7) == 0) {return X_DESERET;} 337 if (memcmp(src, "xx-Hano", 7) == 0) {return X_HANUNOO;} 338 if (memcmp(src, "xx-Buhd", 7) == 0) {return X_BUHID;} 339 if (memcmp(src, "xx-Tagb", 7) == 0) {return X_TAGBANWA;} 340 if (memcmp(src, "xx-Tale", 7) == 0) {return X_TAI_LE;} 341 if (memcmp(src, "xx-Linb", 7) == 0) {return X_LINEAR_B;} 342 if (memcmp(src, "xx-Ugar", 7) == 0) {return X_UGARITIC;} 343 if (memcmp(src, "xx-Shaw", 7) == 0) {return X_SHAVIAN;} 344 if (memcmp(src, "xx-Osma", 7) == 0) {return X_OSMANYA;} 345 if (memcmp(src, "xx-Cprt", 7) == 0) {return X_CYPRIOT;} 346 if (memcmp(src, "xx-Bugi", 7) == 0) {return X_BUGINESE;} 347 if (memcmp(src, "xx-Copt", 7) == 0) {return X_COPTIC;} 348 if (memcmp(src, "xx-Talu", 7) == 0) {return X_NEW_TAI_LUE;} 349 if (memcmp(src, "xx-Glag", 7) == 0) {return X_GLAGOLITIC;} 350 if (memcmp(src, "xx-Tfng", 7) == 0) {return X_TIFINAGH;} 351 if (memcmp(src, "xx-Sylo", 7) == 0) {return X_SYLOTI_NAGRI;} 352 if (memcmp(src, "xx-Xpeo", 7) == 0) {return X_OLD_PERSIAN;} 353 if (memcmp(src, "xx-Khar", 7) == 0) {return X_KHAROSHTHI;} 354 if (memcmp(src, "xx-Bali", 7) == 0) {return X_BALINESE;} 355 if (memcmp(src, "xx-Xsux", 7) == 0) {return X_CUNEIFORM;} 356 if (memcmp(src, "xx-Phnx", 7) == 0) {return X_PHOENICIAN;} 357 if (memcmp(src, "xx-Phag", 7) == 0) {return X_PHAGS_PA;} 358 if (memcmp(src, "xx-Nkoo", 7) == 0) {return X_NKO;} 359 360 // Unicode 5.1 361 if (memcmp(src, "xx-Sund", 7) == 0) {return X_SUDANESE;} 362 if (memcmp(src, "xx-Lepc", 7) == 0) {return X_LEPCHA;} 363 if (memcmp(src, "xx-Olck", 7) == 0) {return X_OL_CHIKI;} 364 if (memcmp(src, "xx-Vaii", 7) == 0) {return X_VAI;} 365 if (memcmp(src, "xx-Saur", 7) == 0) {return X_SAURASHTRA;} 366 if (memcmp(src, "xx-Kali", 7) == 0) {return X_KAYAH_LI;} 367 if (memcmp(src, "xx-Rjng", 7) == 0) {return X_REJANG;} 368 if (memcmp(src, "xx-Lyci", 7) == 0) {return X_LYCIAN;} 369 if (memcmp(src, "xx-Cari", 7) == 0) {return X_CARIAN;} 370 if (memcmp(src, "xx-Lydi", 7) == 0) {return X_LYDIAN;} 371 if (memcmp(src, "xx-Cham", 7) == 0) {return X_CHAM;} 372 } 373 } 374 // Some other weird ones 375 // Could be Latn or Limb; all our current training data is Latn 376 if (strcmp(src, "sit-NP") == 0) {return LIMBU;} 377 if (strcmp(src, "un-Latn") == 0) {return UNKNOWN_LANGUAGE;} 378 379 // Multi-country langauges 380 if (memcmp(src, "zh", 2) == 0) { 381 if (memcmp(&src[len - 2], "TW", 2) == 0) {return CHINESE_T;} 382 if (memcmp(&src[len - 2], "HK", 2) == 0) {return CHINESE_T;} 383 return CHINESE; 384 } 385 if (memcmp(src, "pt", 2) == 0) { 386 if (memcmp(&src[len - 2], "BR", 2) == 0) {return PORTUGUESE;} 387 return PORTUGUESE; 388 } 389 if (memcmp(src, "fr", 2) == 0) { 390 if (memcmp(&src[len -2], "CA", 2) == 0) {return FRENCH;} 391 return FRENCH; 392 } 393 394 // None of the special cases matched 395 if (src[2] == '-') { 396 char temp[4]; 397 memcpy(temp, src, 4); 398 temp[2] = '\0'; 399 LanguageFromCode(temp, &retlang); 400 } 401 if (src[3] == '-') { 402 char temp[4]; 403 memcpy(temp, src, 4); 404 temp[3] = '\0'; 405 LanguageFromCode(temp, &retlang); 406 } 407 if (retlang != UNKNOWN_LANGUAGE) { 408 return retlang; 409 } 410 411 return retlang; 412 } 413 414 typedef struct { 415 const char* name; 416 UnicodeLScript lscript; 417 } NameScriptPair; 418 419 // In alphabetic order for binary search 420 static const NameScriptPair kNameScriptPair[] = { 421 // Unicode 5.1 additional scripts 422 {"Arab", ULScript_Arabic}, 423 {"Armn", ULScript_Armenian}, 424 {"Bali", ULScript_Balinese}, 425 {"Beng", ULScript_Bengali}, 426 {"Bugi", ULScript_Buginese}, 427 {"Buhd", ULScript_Buhid}, 428 {"Cans", ULScript_Canadian_Aboriginal}, 429 {"Cari", ULScript_Carian}, // Unicode 5.1 430 {"Cham", ULScript_Cham}, // Unicode 5.1 431 {"Cher", ULScript_Cherokee}, 432 {"Copt", ULScript_Coptic}, 433 {"Cprt", ULScript_Cypriot}, 434 {"Cyrl", ULScript_Cyrillic}, 435 {"Deva", ULScript_Devanagari}, 436 {"Dsrt", ULScript_Deseret}, 437 {"Ethi", ULScript_Ethiopic}, 438 {"Geor", ULScript_Georgian}, 439 {"Glag", ULScript_Glagolitic}, 440 {"Goth", ULScript_Gothic}, 441 {"Grek", ULScript_Greek}, 442 {"Gujr", ULScript_Gujarati}, 443 {"Guru", ULScript_Gurmukhi}, 444 {"Hani", ULScript_HanCJK}, 445 {"Hano", ULScript_Hanunoo}, 446 {"Hebr", ULScript_Hebrew}, 447 {"Ital", ULScript_Old_Italic}, 448 {"Kali", ULScript_Kayah_Li}, // Unicode 5.1 449 {"Khar", ULScript_Kharoshthi}, 450 {"Khmr", ULScript_Khmer}, 451 {"Knda", ULScript_Kannada}, 452 {"Laoo", ULScript_Lao}, 453 {"Latn", ULScript_Latin}, 454 {"Lepc", ULScript_Lepcha}, // Unicode 5.1 455 {"Limb", ULScript_Limbu}, 456 {"Linb", ULScript_Linear_B}, 457 {"Lyci", ULScript_Lycian}, // Unicode 5.1 458 {"Lydi", ULScript_Lydian}, // Unicode 5.1 459 {"Mlym", ULScript_Malayalam}, 460 {"Mong", ULScript_Mongolian}, 461 {"Mymr", ULScript_Myanmar}, 462 {"Nkoo", ULScript_Nko}, 463 {"Ogam", ULScript_Ogham}, 464 {"Olck", ULScript_Ol_Chiki}, // Unicode 5.1 465 {"Orya", ULScript_Oriya}, 466 {"Osma", ULScript_Osmanya}, 467 {"Phag", ULScript_Phags_Pa}, 468 {"Phnx", ULScript_Phoenician}, 469 {"Rjng", ULScript_Rejang}, // Unicode 5.1 470 {"Runr", ULScript_Runic}, 471 {"Saur", ULScript_Saurashtra}, // Unicode 5.1 472 {"Shaw", ULScript_Shavian}, 473 {"Sinh", ULScript_Sinhala}, 474 {"Sund", ULScript_Sundanese}, // Unicode 5.1 475 {"Sylo", ULScript_Syloti_Nagri}, 476 {"Syrc", ULScript_Syriac}, 477 {"Tagb", ULScript_Tagbanwa}, 478 {"Tale", ULScript_Tai_Le}, 479 {"Talu", ULScript_New_Tai_Lue}, 480 {"Taml", ULScript_Tamil}, 481 {"Telu", ULScript_Telugu}, 482 {"Tfng", ULScript_Tifinagh}, 483 {"Tglg", ULScript_Tagalog}, 484 {"Thaa", ULScript_Thaana}, 485 {"Thai", ULScript_Thai}, 486 {"Tibt", ULScript_Tibetan}, 487 {"Ugar", ULScript_Ugaritic}, 488 {"Vaii", ULScript_Vai}, // Unicode 5.1 // NOTE: apparently 'Vai ' 489 {"Xpeo", ULScript_Old_Persian}, 490 {"Xsux", ULScript_Cuneiform}, 491 {"Yiii", ULScript_Yi}, 492 {"Zyyy", ULScript_Common}, 493 {"Zzzz", ULScript_Inherited}, 494 }; 495 496 // Convert "en-Latn-GB" to ULScript_Latin 497 UnicodeLScript GetLScriptFromNumberOrName(const char* src) { 498 if (strspn(src, "0123456789") == strlen(src)) { 499 // All digits 500 return static_cast<UnicodeLScript>(strto32(src, NULL, 10)); 501 } 502 503 if (strcmp(src, "zh-TW") == 0) {return ULScript_HanCJK;} 504 if (strcmp(src, "zh-CN") == 0) {return ULScript_HanCJK;} 505 if (strcmp(src, "pt-BR") == 0) {return ULScript_Latin;} 506 if (strcmp(src, "pt-PT") == 0) {return ULScript_Latin;} 507 // Could be Latn or Limb; all our current training data is Latn 508 if (strcmp(src, "sit-NP") == 0) {return ULScript_Latin;} 509 510 // Isolate just the script field 511 char temp[5]; 512 const char* src2 = strchr(src, '-'); 513 if (src2 == NULL) {return ULScript_Latin;} 514 src2 += 1; // over the - 515 memcpy(temp, src2, 4); 516 temp[4] = '\0'; 517 518 int lo = 0; 519 int hi = ULScript_NUM_SCRIPTS; 520 while (lo < hi) { 521 int mid = (lo + hi) >> 1; 522 if (strcmp(temp, kNameScriptPair[mid].name) < 0) { 523 hi = mid; 524 } else if (strcmp(temp, kNameScriptPair[mid].name) > 0) { 525 lo = mid + 1; 526 } else { 527 return kNameScriptPair[mid].lscript; 528 } 529 } 530 return ULScript_Latin; 531 } 532 533 534 // Merge together some languages, such as bo/hr/sr 535 // Croatian Latin and Serbian Cyrillic now. 536 Language NormalizeLanguage(Language lang) { 537 if (lang == BOSNIAN) {return CROATIAN;} 538 if (lang == SERBO_CROATIAN) {return SERBIAN;} 539 540 if (lang == PORTUGUESE_P) {return PORTUGUESE;} 541 if (lang == PORTUGUESE_B) {return PORTUGUESE;} 542 543 return lang; 544 } 545 546