1 // Copyright (c) 2009 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef LANGUAGES_PUBLIC_LANGUAGES_H_ 6 #define LANGUAGES_PUBLIC_LANGUAGES_H_ 7 8 // This interface defines the Language enum and functions that depend 9 // only on Language values. 10 11 // A hash-function for Language, hash<Language>, is defined in 12 // i18n/languages/public/languages-hash.h 13 14 #ifndef SWIG 15 // Language enum defined in languages.proto 16 // Also description on how to add languages. 17 #include "languages/proto/languages.pb.h" 18 19 // We need this for compatibility: 20 // - The Language enum in the default namespace. 21 // COMMENTED OUT TO REDUCE DEPENDENCIES ON GOOGLE3 CODE 22 //using namespace i18n::languages; 23 24 #else 25 // And we must have a swig-compatible enum. 26 // This one is a simple cleaned up version of language.proto, making the enum 27 // compatible with C++. 28 #include "i18n/languages/internal/languages_proto_wrapper.h" 29 30 #endif 31 32 // COMMENTED OUT TO REDUCE DEPENDENCIES ON GOOGLE3 CODE 33 //#include "util/utf8/proptables/script_enum.h" 34 35 const int kNumLanguages = NUM_LANGUAGES; 36 37 // Return the default language (ENGLISH). 38 Language default_language(); 39 40 41 // ******************************************* 42 // Language predicates 43 // IsValidLanguage() 44 // IS_LANGUAGE_UNKNOWN() 45 // IsCJKLanguage() 46 // IsChineseLanguage() 47 // IsNorwegianLanguage() 48 // IsPortugueseLanguage() 49 // IsRightToLeftLanguage() 50 // IsMaybeRightToLeftLanguage() 51 // IsSameLanguage() 52 // IsScriptRequiringLongerSnippets() 53 // ******************************************* 54 55 // IsValidLanguage 56 // =============== 57 // 58 // Function to check if the input is within range of the Language enum. If 59 // IsValidLanguage(lang) returns true, it is safe to call 60 // static_cast<Language>(lang). 61 // 62 inline bool IsValidLanguage(int lang) { 63 return ((lang >= 0) && (lang < kNumLanguages)); 64 } 65 66 // Return true if the language is "unknown". (This function was 67 // previously a macro, hence the spelling in all caps.) 68 // 69 inline bool IS_LANGUAGE_UNKNOWN(Language lang) { 70 return lang == TG_UNKNOWN_LANGUAGE || lang == UNKNOWN_LANGUAGE; 71 } 72 73 // IsCJKLanguage 74 // ------------- 75 // 76 // This function returns true if the language is either Chinese 77 // (simplified or traditional), Japanese, or Korean. 78 bool IsCJKLanguage(Language lang); 79 80 // IsChineseLanguage 81 // ----------------- 82 // 83 // This function returns true if the language is either Chinese 84 // (simplified or traditional) 85 bool IsChineseLanguage(Language lang); 86 87 // IsNorwegianLanguage 88 // -------------------- 89 // 90 // This function returns true if the language is any of the Norwegian 91 // (regular or Nynorsk). 92 bool IsNorwegianLanguage(Language lang); 93 94 // IsPortugueseLanguage 95 // -------------------- 96 // 97 // This function returns true if the language is any of the Portuguese 98 // languages (regular, Portugal or Brazil) 99 bool IsPortugueseLanguage(Language lang); 100 101 // IsSameLanguage 102 // -------------- 103 // 104 // WARNING: This function provides only a simple test on the values of 105 // the two Language arguments. It returns false if either language is 106 // invalid. It returns true if the language arguments are equal, or 107 // if they are both Chinese languages, both Norwegian languages, or 108 // both Portuguese languages, as defined by IsChineseLanguage, 109 // IsNorwegianLanguage, and IsPortugueseLanguage. Otherwise it returns 110 // false. 111 bool IsSameLanguage(Language lang1, Language lang2); 112 113 114 // IsRightToLeftLanguage 115 // --------------------- 116 // 117 // This function returns true if the language is only written right-to-left 118 // (E.g., Hebrew, Arabic, Persian etc.) 119 // 120 // IMPORTANT NOTE: Technically we're talking about scripts, not languages. 121 // There are languages that can be written in more than one script. 122 // Examples: 123 // - Kurdish and Azeri ('AZERBAIJANI') can be written left-to-right in 124 // Latin or Cyrillic script, and right-to-left in Arabic script. 125 // - Sindhi and Punjabi are written in different scripts, depending on 126 // region and dialect. 127 // - Turkmen used an Arabic script historically, but not any more. 128 // - Pashto and Uyghur can use Arabic script, but use a Roman script 129 // on the Internet. 130 // - Kashmiri and Urdu are written either with Arabic or Devanagari script. 131 // 132 // This function only returns true for languages that are always, unequivocally 133 // written in right-to-left script. 134 // 135 // TODO(benjy): If we want to do anything special with multi-script languages 136 // we should create new 'languages' for each language+script, as we do for 137 // traditional vs. simplified Chinese. However most such languages are rare in 138 // use and even rarer on the web, so this is unlikely to be something we'll 139 // be concerned with for a while. 140 bool IsRightToLeftLanguage(Language lang); 141 142 // IsMaybeRightToLeftLanguage 143 // -------------------------- 144 // 145 // This function returns true if the language may appear on the web in a 146 // right-to-left script (E.g., Hebrew, Arabic, Persian, Urdu, Kurdish, etc.) 147 // 148 // NOTE: See important notes under IsRightToLeftLanguage(...). 149 // 150 // This function returns true for languages that *may* appear on the web in a 151 // right-to-left script, even if they may also appear in a left-to-right 152 // script. 153 // 154 // This function should typically be used in cases where doing some work on 155 // left-to-right text would be OK (usually a no-op), and this function is used 156 // just to cut down on unnecessary work on regular, LTR text. 157 bool IsMaybeRightToLeftLanguage(Language lang); 158 159 // IsScriptRequiringLongerSnippets 160 // -------------------- 161 // 162 // This function returns true if the script chracteristics require longer 163 // snippet length (Devanagari, Bengali, Gurmukhi, 164 // Gujarati, Oriya, Tamil, Telugu, Kannada, Malayalam). 165 // COMMENTED OUT TO REDUCE DEPENDENCIES ON GOOGLE3 CODE 166 // bool IsScriptRequiringLongerSnippets(UnicodeScript script); 167 168 169 // ******************************************* 170 // LANGUAGE NAMES 171 // 172 // This interface defines a standard name for each valid Language, 173 // and a standard name for invalid languages. Some language names use all 174 // uppercase letters, but others use mixed case. 175 // LanguageName() [Language to name] 176 // LanguageEnumName() [language to enum name] 177 // LanguageFromName() [name to Language] 178 // default_language_name() 179 // invalid_language_name() 180 // ******************************************* 181 182 // Given a Language, returns its standard name. 183 // Return invalid_language_name() if the language is invalid. 184 const char* LanguageName(Language lang); 185 186 // Given a Language, return the name of the enum constant for that 187 // language. In all but a few cases, this is the same as its standard 188 // name. For example, LanguageName(CHINESE) returns "Chinese", but 189 // LanguageEnumName(CHINESE) returns "CHINESE". This is intended for 190 // code that is generating C++ code, where the enum constant is more 191 // useful than its integer value. Return "NUM_LANGUAGES" if 192 // the language is invalid. 193 const char* LanguageEnumName(Language lang); 194 195 // The maximum length of a standard language name. 196 const int kMaxLanguageNameSize = 50; 197 198 // The standard name for the default language. 199 const char* default_language_name(); 200 201 // The standard name for all invalid languages. 202 const char* invalid_language_name(); 203 204 // If lang_name matches the standard name of a Language, using a 205 // case-insensitive comparison, set *language to that Language and 206 // return true. 207 // Otherwise, set *language to UNKNOWN_LANGUAGE and return false. 208 // 209 // For backwards compatibility, "HATIAN_CREOLE" is allowed as a name 210 // for HAITIAN_CREOLE, and "QUECHAU" is allowed as a name for QUECHUA. 211 // For compatibility with LanguageEnumName, "UNKNOWN_LANGUAGE" is allowed 212 // as a name for UNKNOWN_LANGUAGE (the return value is true in this case, 213 // as it is for "Unknown"), and "CHINESE_T" is allowed as a name for 214 // CHINESE_T (i.e., a synonym for "ChineseT"). 215 // 216 // REQUIRES: language must not be NULL. 217 // 218 bool LanguageFromName(const char* lang_name, Language *language); 219 220 221 222 // ******************************************* 223 // LANGUAGE CODES 224 // 225 // This interface defines a standard code for each valid language, and 226 // a standard code for invalid languages. These are derived from ISO codes, 227 // with some Google additions. 228 // LanguageCode() 229 // default_language_code() 230 // invalid_language_code() 231 // LanguageCodeWithDialects() 232 // LanguageCodeISO639_1() 233 // LanguageCodeISO639_2() 234 // ******************************************* 235 236 // Given a Language, return its standard code. There are Google-specific codes: 237 // For CHINESE_T, return "zh-TW". 238 // For TG_UNKNOWN_LANGUAGE, return "ut". 239 // For UNKNOWN_LANGUAGE, return "un". 240 // For PORTUGUESE_P, return "pt-PT". 241 // For PORTUGUESE_B, return "pt-BR". 242 // For LIMBU, return "sit-NP". 243 // For CHEROKEE, return "chr". 244 // For SYRIAC, return "syr". 245 // Otherwise return the ISO 639-1 two-letter language code for lang. 246 // If lang is invalid, return invalid_language_code(). 247 // 248 // NOTE: See the note below about the codes for Chinese languages. 249 // 250 const char* LanguageCode(Language lang); 251 252 // The maximum length of a language code. 253 const int kMaxLanguageCodeSize = 50; 254 255 // The standard code for the default language. 256 const char* default_language_code(); 257 258 // The standard code for all invalid languages. 259 const char* invalid_language_code(); 260 261 262 // -------------------------------------------- 263 // NOTE: CHINESE LANGUAGE CODES 264 // 265 // There are three functions that return codes for Chinese languages. 266 // LanguageCode(lang) and LanguageCodeWithDialects(lang) are defined here. 267 // LanguageCode(lang, encoding) is defined in i18n/encodings.lang_enc.h. 268 // The following list shows the different results. 269 // 270 // LanguageCode(CHINESE) returns "zh" 271 // LanguageCode(CHINESE_T) returns "zh-TW". 272 // 273 // LanguageCodeWithDialects(CHINESE) returns "zh-CN". 274 // LanguageCodeWithDialects(CHINESE_T) returns "zh-TW". 275 // 276 // LanguageCode(CHINESE_T, <any encoding>) returns "zh-TW". 277 // LanguageCode(CHINESE, CHINESE_BIG5) returns "zh-TW". 278 // LanguageCode(CHINESE, <any other encoding>) returns "zh-CN". 279 // 280 // -------------------------------------------- 281 282 // LanguageCodeWithDialects 283 // ------------------------ 284 // 285 // If lang is CHINESE, return "zh-CN". Otherwise return LanguageCode(lang). 286 const char* LanguageCodeWithDialects(Language lang); 287 288 // LanguageCodeISO639_1 289 // -------------------- 290 // 291 // Return the ISO 639-1 two-letter language code for lang. 292 // Return invalid_language_code() if lang is invalid or does not have 293 // an ISO 639-1 two-letter language code. 294 const char* LanguageCodeISO639_1(Language lang); 295 296 // LanguageCodeISO639_2 297 // -------------------- 298 // 299 // Return the ISO 639-2 three-letter language for lang. 300 // Return invalid_language_code() if lang is invalid or does not have 301 // an ISO 639-2 three-letter language code. 302 const char* LanguageCodeISO639_2(Language lang); 303 304 // LanguageFromCode 305 // ---------------- 306 // 307 // If lang_code matches the code for a Language, using a case-insensitive 308 // comparison, set *lang to that Language and return true. 309 // Otherwise, set *lang to UNKNOWN_LANGUAGE and return false. 310 // 311 // lang_code can be an ISO 639-1 (two-letter) code, an ISO 639-2 312 // (three-letter) code, or a Google-specific code (see LanguageCode). 313 // 314 // Certain language-code aliases are also allowed: 315 // For "zh-cn" and "zh_cn", set *lang to CHINESE. 316 // For "zh-tw" and "zh_tw", set *lang to CHINESE_T. 317 // For "he", set *lang to HEBREW. 318 // For "in", set *lang to INDONESIAN. 319 // For "ji", set *lang to YIDDISH. 320 // For "fil", set *lang to TAGALOG. 321 // 322 // REQUIRES: 'lang' must not be NULL. 323 bool LanguageFromCode(const char* lang_code, Language *language); 324 325 326 // LanguageFromCodeOrName 327 // ---------------------- 328 // 329 // If lang_code_or_name is a language code or a language name. 330 // set *language to the corresponding Language and return true. 331 // Otherwise set *language to UNKNOWN_LANGUAGE and return false. 332 // 333 bool LanguageFromCodeOrName(const char* lang_code_or_name, 334 Language* language); 335 336 // LanguageNameFromCode 337 // -------------------- 338 // 339 // If language_code is the code for a Language (see LanguageFromCode), 340 // return the standard name of that language (see LanguageName). 341 // Otherwise return invalid_language_name(). 342 // 343 const char* LanguageNameFromCode(const char* language_code); 344 345 346 // Miscellany 347 348 // LanguageCodeToUnderscoreForm 349 // ---------------------------- 350 // 351 // Given a language code, convert the dash "-" to underscore "_". 352 // 353 // Specifically, if result_length <= strlen(lang_code), set result[0] 354 // to '\0' and return false. Otherwise, copy lang_code to result, 355 // converting every dash to an underscore, converting every character 356 // before the first dash or underscore to lower case, and converting 357 // every character after the first dash or underscore to upper 358 // case. If there is no dash or underscore, convert the entire string 359 // to lower case. 360 // 361 // REQUIRES: 'lang_code' must not be NULL. 'result' must not be NULL. 362 363 bool LanguageCodeToUnderscoreForm(const char* lang_code, 364 char* result, 365 int result_length); 366 367 // 368 // AlwaysPutInExpectedRestrict 369 // --------------------------- 370 // 371 // For Web pages in certain top-level domains, Web Search always 372 // applies a "country restrict". If 'tld' matches one of those, using 373 // a case-SENSITIVE comparison, set *expected_language to the Language 374 // most commonly found in that top-level domain and return true. 375 // Otherwise, set *expected_language to UNKNOWN_LANGUAGE and return false. 376 bool AlwaysPutInExpectedRestrict(const char *tld, Language *expected_language); 377 378 379 #endif // LANGUAGES_PUBLIC_LANGUAGES_H_ 380