1 // Copyright (c) 2009 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "languages/public/languages.h" 6 7 #include "base/string_util.h" 8 #include "encodings/compact_lang_det/win/cld_basictypes.h" 9 10 11 Language default_language() {return ENGLISH;} 12 13 14 // Language names and codes 15 16 struct LanguageInfo { 17 const char * language_name_; 18 const char * language_code_639_1_; // the ISO-639-1 code for the language 19 const char * language_code_639_2_; // the ISO-639-2 code for the language 20 const char * language_code_other_; // some nonstandard code for the language 21 }; 22 23 static const LanguageInfo kLanguageInfoTable[] = { 24 { "ENGLISH", "en", "eng", NULL}, 25 { "DANISH", "da", "dan", NULL}, 26 { "DUTCH", "nl", "dut", NULL}, 27 { "FINNISH", "fi", "fin", NULL}, 28 { "FRENCH", "fr", "fre", NULL}, 29 { "GERMAN", "de", "ger", NULL}, 30 { "HEBREW", "he", "heb", NULL}, 31 { "ITALIAN", "it", "ita", NULL}, 32 { "Japanese", "ja", "jpn", NULL}, 33 { "Korean", "ko", "kor", NULL}, 34 { "NORWEGIAN", "nb", "nor", NULL}, 35 { "POLISH", "pl", "pol", NULL}, 36 { "PORTUGUESE", "pt", "por", NULL}, 37 { "RUSSIAN", "ru", "rus", NULL}, 38 { "SPANISH", "es", "spa", NULL}, 39 { "SWEDISH", "sv", "swe", NULL}, 40 { "Chinese", "zh", "chi", "zh-CN"}, 41 { "CZECH", "cs", "cze", NULL}, 42 { "GREEK", "el", "gre", NULL}, 43 { "ICELANDIC", "is", "ice", NULL}, 44 { "LATVIAN", "lv", "lav", NULL}, 45 { "LITHUANIAN", "lt", "lit", NULL}, 46 { "ROMANIAN", "ro", "rum", NULL}, 47 { "HUNGARIAN", "hu", "hun", NULL}, 48 { "ESTONIAN", "et", "est", NULL}, 49 // TODO: Although Teragram has two output names "TG_UNKNOWN_LANGUAGE" 50 // and "Unknown", they are essentially the same. Need to unify them. 51 // "un" and "ut" are invented by us, not from ISO-639. 52 // 53 { "TG_UNKNOWN_LANGUAGE", NULL, NULL, "ut"}, 54 { "Unknown", NULL, NULL, "un"}, 55 { "BULGARIAN", "bg", "bul", NULL}, 56 { "CROATIAN", "hr", "scr", NULL}, 57 { "SERBIAN", "sr", "scc", NULL}, 58 { "IRISH", "ga", "gle", NULL}, 59 { "GALICIAN", "gl", "glg", NULL}, 60 // Impossible to tell Tagalog from Filipino at the moment. 61 // Use ISO 639-2 code for Filipino here. 62 { "TAGALOG", NULL, "fil", NULL}, 63 { "TURKISH", "tr", "tur", NULL}, 64 { "UKRAINIAN", "uk", "ukr", NULL}, 65 { "HINDI", "hi", "hin", NULL}, 66 { "MACEDONIAN", "mk", "mac", NULL}, 67 { "BENGALI", "bn", "ben", NULL}, 68 { "INDONESIAN", "id", "ind", NULL}, 69 { "LATIN", "la", "lat", NULL}, 70 { "MALAY", "ms", "may", NULL}, 71 { "MALAYALAM", "ml", "mal", NULL}, 72 { "WELSH", "cy", "wel", NULL}, 73 { "NEPALI", "ne", "nep", NULL}, 74 { "TELUGU", "te", "tel", NULL}, 75 { "ALBANIAN", "sq", "alb", NULL}, 76 { "TAMIL", "ta", "tam", NULL}, 77 { "BELARUSIAN", "be", "bel", NULL}, 78 { "JAVANESE", "jw", "jav", NULL}, 79 { "OCCITAN", "oc", "oci", NULL}, 80 { "URDU", "ur", "urd", NULL}, 81 { "BIHARI", "bh", "bih", NULL}, 82 { "GUJARATI", "gu", "guj", NULL}, 83 { "THAI", "th", "tha", NULL}, 84 { "ARABIC", "ar", "ara", NULL}, 85 { "CATALAN", "ca", "cat", NULL}, 86 { "ESPERANTO", "eo", "epo", NULL}, 87 { "BASQUE", "eu", "baq", NULL}, 88 { "INTERLINGUA", "ia", "ina", NULL}, 89 { "KANNADA", "kn", "kan", NULL}, 90 { "PUNJABI", "pa", "pan", NULL}, 91 { "SCOTS_GAELIC", "gd", "gla", NULL}, 92 { "SWAHILI", "sw", "swa", NULL}, 93 { "SLOVENIAN", "sl", "slv", NULL}, 94 { "MARATHI", "mr", "mar", NULL}, 95 { "MALTESE", "mt", "mlt", NULL}, 96 { "VIETNAMESE", "vi", "vie", NULL}, 97 { "FRISIAN", "fy", "fry", NULL}, 98 { "SLOVAK", "sk", "slo", NULL}, 99 { "ChineseT", 100 NULL, NULL, // We intentionally set these 2 fields to NULL to avoid 101 // confusion between CHINESE_T and CHINESE. 102 "zh-TW"}, 103 { "FAROESE", "fo", "fao", NULL}, 104 { "SUNDANESE", "su", "sun", NULL}, 105 { "UZBEK", "uz", "uzb", NULL}, 106 { "AMHARIC", "am", "amh", NULL}, 107 { "AZERBAIJANI", "az", "aze", NULL}, 108 { "GEORGIAN", "ka", "geo", NULL}, 109 { "TIGRINYA", "ti", "tir", NULL}, 110 { "PERSIAN", "fa", "per", NULL}, 111 { "BOSNIAN", "bs", "bos", NULL}, 112 { "SINHALESE", "si", "sin", NULL}, 113 { "NORWEGIAN_N", "nn", "nno", NULL}, 114 { "PORTUGUESE_P", NULL, NULL, "pt-PT"}, 115 { "PORTUGUESE_B", NULL, NULL, "pt-BR"}, 116 { "XHOSA", "xh", "xho", NULL}, 117 { "ZULU", "zu", "zul", NULL}, 118 { "GUARANI", "gn", "grn", NULL}, 119 { "SESOTHO", "st", "sot", NULL}, 120 { "TURKMEN", "tk", "tuk", NULL}, 121 { "KYRGYZ", "ky", "kir", NULL}, 122 { "BRETON", "br", "bre", NULL}, 123 { "TWI", "tw", "twi", NULL}, 124 { "YIDDISH", "yi", "yid", NULL}, 125 { "SERBO_CROATIAN", "sh", NULL, NULL}, 126 { "SOMALI", "so", "som", NULL}, 127 { "UIGHUR", "ug", "uig", NULL}, 128 { "KURDISH", "ku", "kur", NULL}, 129 { "MONGOLIAN", "mn", "mon", NULL}, 130 { "ARMENIAN", "hy", "arm", NULL}, 131 { "LAOTHIAN", "lo", "lao", NULL}, 132 { "SINDHI", "sd", "snd", NULL}, 133 { "RHAETO_ROMANCE", "rm", "roh", NULL}, 134 { "AFRIKAANS", "af", "afr", NULL}, 135 { "LUXEMBOURGISH", "lb", "ltz", NULL}, 136 { "BURMESE", "my", "bur", NULL}, 137 // KHMER is known as Cambodian for Google user interfaces. 138 { "KHMER", "km", "khm", NULL}, 139 { "TIBETAN", "bo", "tib", NULL}, 140 { "DHIVEHI", "dv", "div", NULL}, 141 { "CHEROKEE", NULL, "chr", NULL}, 142 { "SYRIAC", NULL, "syr", NULL}, 143 { "LIMBU", NULL, NULL, "sit-NP"}, 144 { "ORIYA", "or", "ori", NULL}, 145 { "ASSAMESE", "as", "asm", NULL}, 146 { "CORSICAN", "co", "cos", NULL}, 147 { "INTERLINGUE", "ie", "ine", NULL}, 148 { "KAZAKH", "kk", "kaz", NULL}, 149 { "LINGALA", "ln", "lin", NULL}, 150 { "MOLDAVIAN", "mo", "mol", NULL}, 151 { "PASHTO", "ps", "pus", NULL}, 152 { "QUECHUA", "qu", "que", NULL}, 153 { "SHONA", "sn", "sna", NULL}, 154 { "TAJIK", "tg", "tgk", NULL}, 155 { "TATAR", "tt", "tat", NULL}, 156 { "TONGA", "to", "tog", NULL}, 157 { "YORUBA", "yo", "yor", NULL}, 158 { "CREOLES_AND_PIDGINS_ENGLISH_BASED", NULL, "cpe", NULL}, 159 { "CREOLES_AND_PIDGINS_FRENCH_BASED", NULL, "cpf", NULL}, 160 { "CREOLES_AND_PIDGINS_PORTUGUESE_BASED", NULL, "cpp", NULL}, 161 { "CREOLES_AND_PIDGINS_OTHER", NULL, "crp", NULL}, 162 { "MAORI", "mi", "mao", NULL}, 163 { "WOLOF", "wo", "wol", NULL}, 164 { "ABKHAZIAN", "ab", "abk", NULL}, 165 { "AFAR", "aa", "aar", NULL}, 166 { "AYMARA", "ay", "aym", NULL}, 167 { "BASHKIR", "ba", "bak", NULL}, 168 { "BISLAMA", "bi", "bis", NULL}, 169 { "DZONGKHA", "dz", "dzo", NULL}, 170 { "FIJIAN", "fj", "fij", NULL}, 171 { "GREENLANDIC", "kl", "kal", NULL}, 172 { "HAUSA", "ha", "hau", NULL}, 173 { "HAITIAN_CREOLE", "ht", NULL, NULL}, 174 { "INUPIAK", "ik", "ipk", NULL}, 175 { "INUKTITUT", "iu", "iku", NULL}, 176 { "KASHMIRI", "ks", "kas", NULL}, 177 { "KINYARWANDA", "rw", "kin", NULL}, 178 { "MALAGASY", "mg", "mlg", NULL}, 179 { "NAURU", "na", "nau", NULL}, 180 { "OROMO", "om", "orm", NULL}, 181 { "RUNDI", "rn", "run", NULL}, 182 { "SAMOAN", "sm", "smo", NULL}, 183 { "SANGO", "sg", "sag", NULL}, 184 { "SANSKRIT", "sa", "san", NULL}, 185 { "SISWANT", "ss", "ssw", NULL}, 186 { "TSONGA", "ts", "tso", NULL}, 187 { "TSWANA", "tn", "tsn", NULL}, 188 { "VOLAPUK", "vo", "vol", NULL}, 189 { "ZHUANG", "za", "zha", NULL}, 190 { "KHASI", NULL, "kha", NULL}, 191 { "SCOTS", NULL, "sco", NULL}, 192 { "GANDA", "lg", "lug", NULL}, 193 { "MANX", "gv", "glv", NULL}, 194 { "MONTENEGRIN", NULL, NULL, "sr-ME"}, 195 { "XX", NULL, NULL, "XX"}, 196 }; 197 198 COMPILE_ASSERT(arraysize(kLanguageInfoTable) == NUM_LANGUAGES + 1, 199 kLanguageInfoTable_has_incorrect_length); 200 201 202 // LANGUAGE NAMES 203 204 const char* default_language_name() { 205 return kLanguageInfoTable[ENGLISH].language_name_; 206 } 207 208 static const char* const kInvalidLanguageName = "invalid_language"; 209 210 const char *invalid_language_name() { 211 return kInvalidLanguageName; 212 } 213 214 const char* LanguageName(Language lang) { 215 return IsValidLanguage(lang) 216 ? kLanguageInfoTable[lang].language_name_ 217 : kInvalidLanguageName; 218 } 219 220 221 222 // LANGUAGE CODES 223 224 225 // The space before invalid_language_code is intentional. It is used 226 // to prevent it matching any two letter language code. 227 // 228 static const char* const kInvalidLanguageCode = " invalid_language_code"; 229 230 const char *invalid_language_code() { 231 return kInvalidLanguageCode; 232 } 233 234 const char * LanguageCode(Language lang) { 235 if (! IsValidLanguage(lang)) 236 return kInvalidLanguageCode; 237 const LanguageInfo& info = kLanguageInfoTable[lang]; 238 if (info.language_code_639_1_) { 239 return info.language_code_639_1_; 240 } else if (info.language_code_639_2_) { 241 return info.language_code_639_2_; 242 } else if (info.language_code_other_) { 243 return info.language_code_other_; 244 } else { 245 return kInvalidLanguageCode; 246 } 247 } 248 249 const char* default_language_code() { 250 return kLanguageInfoTable[ENGLISH].language_code_639_1_; 251 } 252 253 const char* LanguageCodeISO639_1(Language lang) { 254 if (! IsValidLanguage(lang)) 255 return kInvalidLanguageCode; 256 if (const char* code = kLanguageInfoTable[lang].language_code_639_1_) 257 return code; 258 return kInvalidLanguageCode; 259 } 260 261 const char* LanguageCodeISO639_2(Language lang) { 262 if (! IsValidLanguage(lang)) 263 return kInvalidLanguageCode; 264 if (const char* code = kLanguageInfoTable[lang].language_code_639_2_) 265 return code; 266 return kInvalidLanguageCode; 267 } 268 269 const char* LanguageCodeWithDialects(Language lang) { 270 if (lang == CHINESE) 271 return "zh-CN"; 272 return LanguageCode(lang); 273 } 274 275 276 277 bool LanguageFromCode(const char* lang_code, Language *language) { 278 *language = UNKNOWN_LANGUAGE; 279 if ( lang_code == NULL ) return false; 280 281 for ( int i = 0 ; i < kNumLanguages ; i++ ) { 282 const LanguageInfo& info = kLanguageInfoTable[i]; 283 if ((info.language_code_639_1_ && 284 !base::strcasecmp(lang_code, info.language_code_639_1_)) || 285 (info.language_code_639_2_ && 286 !base::strcasecmp(lang_code, info.language_code_639_2_)) || 287 (info.language_code_other_ && 288 !base::strcasecmp(lang_code, info.language_code_other_))) { 289 *language = static_cast<Language>(i); 290 return true; 291 } 292 } 293 294 // For convenience, this function can also parse the non-standard 295 // five-letter language codes "zh-cn" and "zh-tw" which are used by 296 // front-ends such as GWS to distinguish Simplified from Traditional 297 // Chinese. 298 if (!base::strcasecmp(lang_code, "zh-cn") || 299 !base::strcasecmp(lang_code, "zh_cn")) { 300 *language = CHINESE; 301 return true; 302 } 303 if (!base::strcasecmp(lang_code, "zh-tw") || 304 !base::strcasecmp(lang_code, "zh_tw")) { 305 *language = CHINESE_T; 306 return true; 307 } 308 if (!base::strcasecmp(lang_code, "sr-me") || 309 !base::strcasecmp(lang_code, "sr_me")) { 310 *language = MONTENEGRIN; 311 return true; 312 } 313 314 // Process language-code synonyms. 315 if (!base::strcasecmp(lang_code, "he")) { 316 *language = HEBREW; // Use "iw". 317 return true; 318 } 319 if (!base::strcasecmp(lang_code, "in")) { 320 *language = INDONESIAN; // Use "id". 321 return true; 322 } 323 if (!base::strcasecmp(lang_code, "ji")) { 324 *language = YIDDISH; // Use "yi". 325 return true; 326 } 327 328 // Process language-detection synonyms. 329 // These distinct languages cannot be differentiated by our current 330 // language-detection algorithms. 331 if (!base::strcasecmp(lang_code, "fil")) { 332 *language = TAGALOG; 333 return true; 334 } 335 336 return false; 337 } 338