Home | History | Annotate | Download | only in public
      1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifndef LANGUAGES_PUBLIC_LANGUAGES_H_
      6 #define LANGUAGES_PUBLIC_LANGUAGES_H_
      7 
      8 // This interface defines the Language enum and functions that depend
      9 // only on Language values.
     10 
     11 // A hash-function for Language, hash<Language>, is defined in
     12 // i18n/languages/public/languages-hash.h
     13 
     14 #ifndef SWIG
// The Language enum is defined in languages.proto, which also
// describes how to add languages.
     17 #include "languages/proto/languages.pb.h"
     18 
     19 // We need this for compatibility:
     20 // - The Language enum in the default namespace.
     21 // COMMENTED OUT TO REDUCE DEPENDENCIES ON GOOGLE3 CODE
     22 //using namespace i18n::languages;
     23 
     24 #else
     25 // And we must have a swig-compatible enum.
     26 // This one is a simple cleaned up version of language.proto, making the enum
     27 // compatible with C++.
     28 #include "i18n/languages/internal/languages_proto_wrapper.h"
     29 
     30 #endif
     31 
     32 // COMMENTED OUT TO REDUCE DEPENDENCIES ON GOOGLE3 CODE
     33 //#include "util/utf8/proptables/script_enum.h"
     34 
     35 const int kNumLanguages = NUM_LANGUAGES;
     36 
     37 // Return the default language (ENGLISH).
     38 Language default_language();
     39 
     40 
     41 // *******************************************
     42 // Language predicates
     43 //   IsValidLanguage()
     44 //   IS_LANGUAGE_UNKNOWN()
     45 //   IsCJKLanguage()
     46 //   IsChineseLanguage()
     47 //   IsNorwegianLanguage()
     48 //   IsPortugueseLanguage()
     49 //   IsRightToLeftLanguage()
     50 //   IsMaybeRightToLeftLanguage()
     51 //   IsSameLanguage()
     52 //   IsScriptRequiringLongerSnippets()
     53 // *******************************************
     54 
     55 // IsValidLanguage
     56 // ===============
     57 //
     58 // Function to check if the input is within range of the Language enum. If
     59 // IsValidLanguage(lang) returns true, it is safe to call
     60 // static_cast<Language>(lang).
     61 //
     62 inline bool IsValidLanguage(int lang) {
     63   return ((lang >= 0) && (lang < kNumLanguages));
     64 }
     65 
     66 // Return true if the language is "unknown". (This function was
     67 // previously a macro, hence the spelling in all caps.)
     68 //
     69 inline bool IS_LANGUAGE_UNKNOWN(Language lang) {
     70   return lang == TG_UNKNOWN_LANGUAGE || lang == UNKNOWN_LANGUAGE;
     71 }
     72 
     73 // IsCJKLanguage
     74 // -------------
     75 //
     76 // This function returns true if the language is either Chinese
     77 // (simplified or traditional), Japanese, or Korean.
     78 bool IsCJKLanguage(Language lang);
     79 
     80 // IsChineseLanguage
     81 // -----------------
     82 //
     83 // This function returns true if the language is either Chinese
     84 // (simplified or traditional)
     85 bool IsChineseLanguage(Language lang);
     86 
     87 // IsNorwegianLanguage
     88 // --------------------
     89 //
// This function returns true if the language is any of the Norwegian
// languages (regular or Nynorsk).
     92 bool IsNorwegianLanguage(Language lang);
     93 
     94 // IsPortugueseLanguage
     95 // --------------------
     96 //
     97 // This function returns true if the language is any of the Portuguese
     98 // languages (regular, Portugal or Brazil)
     99 bool IsPortugueseLanguage(Language lang);
    100 
    101 // IsSameLanguage
    102 // --------------
    103 //
    104 // WARNING: This function provides only a simple test on the values of
    105 // the two Language arguments. It returns false if either language is
    106 // invalid. It returns true if the language arguments are equal, or
    107 // if they are both Chinese languages, both Norwegian languages, or
    108 // both Portuguese languages, as defined by IsChineseLanguage,
    109 // IsNorwegianLanguage, and IsPortugueseLanguage. Otherwise it returns
    110 // false.
    111 bool IsSameLanguage(Language lang1, Language lang2);
    112 
    113 
    114 // IsRightToLeftLanguage
    115 // ---------------------
    116 //
    117 // This function returns true if the language is only written right-to-left
    118 // (E.g., Hebrew, Arabic, Persian etc.)
    119 //
    120 // IMPORTANT NOTE: Technically we're talking about scripts, not languages.
    121 // There are languages that can be written in more than one script.
    122 // Examples:
    123 //   - Kurdish and Azeri ('AZERBAIJANI') can be written left-to-right in
    124 //     Latin or Cyrillic script, and right-to-left in Arabic script.
    125 //   - Sindhi and Punjabi are written in different scripts, depending on
    126 //     region and dialect.
    127 //   - Turkmen used an Arabic script historically, but not any more.
    128 //   - Pashto and Uyghur can use Arabic script, but use a Roman script
    129 //     on the Internet.
    130 //   - Kashmiri and Urdu are written either with Arabic or Devanagari script.
    131 //
    132 // This function only returns true for languages that are always, unequivocally
    133 // written in right-to-left script.
    134 //
    135 // TODO(benjy): If we want to do anything special with multi-script languages
    136 // we should create new 'languages' for each language+script, as we do for
    137 // traditional vs. simplified Chinese. However most such languages are rare in
    138 // use and even rarer on the web, so this is unlikely to be something we'll
    139 // be concerned with for a while.
    140 bool IsRightToLeftLanguage(Language lang);
    141 
    142 // IsMaybeRightToLeftLanguage
    143 // --------------------------
    144 //
    145 // This function returns true if the language may appear on the web in a
    146 // right-to-left script (E.g., Hebrew, Arabic, Persian, Urdu, Kurdish, etc.)
    147 //
    148 // NOTE: See important notes under IsRightToLeftLanguage(...).
    149 //
    150 // This function returns true for languages that *may* appear on the web in a
    151 // right-to-left script, even if they may also appear in a left-to-right
    152 // script.
    153 //
    154 // This function should typically be used in cases where doing some work on
    155 // left-to-right text would be OK (usually a no-op), and this function is used
    156 // just to cut down on unnecessary work on regular, LTR text.
    157 bool IsMaybeRightToLeftLanguage(Language lang);
    158 
    159 // IsScriptRequiringLongerSnippets
    160 // --------------------
    161 //
// This function returns true if the script characteristics require longer
    163 // snippet length (Devanagari, Bengali, Gurmukhi,
    164 // Gujarati, Oriya, Tamil, Telugu, Kannada, Malayalam).
    165 // COMMENTED OUT TO REDUCE DEPENDENCIES ON GOOGLE3 CODE
    166 // bool IsScriptRequiringLongerSnippets(UnicodeScript script);
    167 
    168 
    169 // *******************************************
    170 // LANGUAGE NAMES
    171 //
    172 // This interface defines a standard name for each valid Language,
    173 // and a standard name for invalid languages. Some language names use all
    174 // uppercase letters, but others use mixed case.
    175 //   LanguageName() [Language to name]
    176 //   LanguageEnumName() [language to enum name]
    177 //   LanguageFromName() [name to Language]
    178 //   default_language_name()
    179 //   invalid_language_name()
    180 // *******************************************
    181 
    182 // Given a Language, returns its standard name.
    183 // Return invalid_language_name() if the language is invalid.
    184 const char* LanguageName(Language lang);
    185 
    186 // Given a Language, return the name of the enum constant for that
    187 // language. In all but a few cases, this is the same as its standard
    188 // name. For example, LanguageName(CHINESE) returns "Chinese", but
    189 // LanguageEnumName(CHINESE) returns "CHINESE". This is intended for
    190 // code that is generating C++ code, where the enum constant is more
    191 // useful than its integer value.  Return "NUM_LANGUAGES" if
    192 // the language is invalid.
    193 const char* LanguageEnumName(Language lang);
    194 
    195 // The maximum length of a standard language name.
    196 const int kMaxLanguageNameSize = 50;
    197 
    198 // The standard name for the default language.
    199 const char* default_language_name();
    200 
    201 // The standard name for all invalid languages.
    202 const char* invalid_language_name();
    203 
    204 // If lang_name matches the standard name of a Language, using a
    205 // case-insensitive comparison, set *language to that Language and
    206 // return true.
    207 // Otherwise, set *language to UNKNOWN_LANGUAGE and return false.
    208 //
    209 // For backwards compatibility, "HATIAN_CREOLE" is allowed as a name
    210 // for HAITIAN_CREOLE, and "QUECHAU" is allowed as a name for QUECHUA.
    211 // For compatibility with LanguageEnumName, "UNKNOWN_LANGUAGE" is allowed
    212 // as a name for UNKNOWN_LANGUAGE (the return value is true in this case,
    213 // as it is for "Unknown"), and "CHINESE_T" is allowed as a name for
    214 // CHINESE_T (i.e., a synonym for "ChineseT").
    215 //
    216 // REQUIRES: language must not be NULL.
    217 //
    218 bool LanguageFromName(const char* lang_name, Language *language);
    219 
    220 
    221 
    222 // *******************************************
    223 // LANGUAGE CODES
    224 //
    225 // This interface defines a standard code for each valid language, and
    226 // a standard code for invalid languages. These are derived from ISO codes,
    227 // with some Google additions.
//   LanguageCode()
//   default_language_code()
//   invalid_language_code()
//   LanguageCodeWithDialects()
//   LanguageCodeISO639_1()
//   LanguageCodeISO639_2()
//   LanguageFromCode()
//   LanguageFromCodeOrName()
//   LanguageNameFromCode()
    234 // *******************************************
    235 
    236 // Given a Language, return its standard code. There are Google-specific codes:
    237 //     For CHINESE_T, return "zh-TW".
    238 //     For TG_UNKNOWN_LANGUAGE, return "ut".
    239 //     For UNKNOWN_LANGUAGE, return "un".
    240 //     For PORTUGUESE_P, return "pt-PT".
    241 //     For PORTUGUESE_B, return "pt-BR".
    242 //     For LIMBU, return "sit-NP".
    243 //     For CHEROKEE, return "chr".
    244 //     For SYRIAC, return "syr".
    245 // Otherwise return the ISO 639-1 two-letter language code for lang.
    246 // If lang is invalid, return invalid_language_code().
    247 //
    248 // NOTE: See the note below about the codes for Chinese languages.
    249 //
    250 const char* LanguageCode(Language lang);
    251 
    252 // The maximum length of a language code.
    253 const int kMaxLanguageCodeSize = 50;
    254 
    255 // The standard code for the default language.
    256 const char* default_language_code();
    257 
    258 // The standard code for all invalid languages.
    259 const char* invalid_language_code();
    260 
    261 
    262 // --------------------------------------------
    263 // NOTE: CHINESE LANGUAGE CODES
    264 //
    265 // There are three functions that return codes for Chinese languages.
    266 // LanguageCode(lang) and LanguageCodeWithDialects(lang) are defined here.
    267 // LanguageCode(lang, encoding) is defined in i18n/encodings.lang_enc.h.
    268 // The following list shows the different results.
    269 //
    270 // LanguageCode(CHINESE) returns "zh"
    271 // LanguageCode(CHINESE_T) returns "zh-TW".
    272 //
    273 // LanguageCodeWithDialects(CHINESE) returns "zh-CN".
    274 // LanguageCodeWithDialects(CHINESE_T) returns "zh-TW".
    275 //
    276 // LanguageCode(CHINESE_T, <any encoding>) returns "zh-TW".
    277 // LanguageCode(CHINESE, CHINESE_BIG5) returns "zh-TW".
    278 // LanguageCode(CHINESE, <any other encoding>) returns "zh-CN".
    279 //
    280 // --------------------------------------------
    281 
    282 // LanguageCodeWithDialects
    283 // ------------------------
    284 //
    285 // If lang is CHINESE, return "zh-CN". Otherwise return LanguageCode(lang).
    286 const char* LanguageCodeWithDialects(Language lang);
    287 
    288 // LanguageCodeISO639_1
    289 // --------------------
    290 //
    291 // Return the ISO 639-1 two-letter language code for lang.
    292 // Return invalid_language_code() if lang is invalid or does not have
    293 // an ISO 639-1 two-letter language code.
    294 const char* LanguageCodeISO639_1(Language lang);
    295 
    296 // LanguageCodeISO639_2
    297 // --------------------
    298 //
// Return the ISO 639-2 three-letter language code for lang.
    300 // Return invalid_language_code() if lang is invalid or does not have
    301 // an ISO 639-2 three-letter language code.
    302 const char* LanguageCodeISO639_2(Language lang);
    303 
// LanguageFromCode
// ----------------
//
// If lang_code matches the code for a Language, using a case-insensitive
// comparison, set *language to that Language and return true.
// Otherwise, set *language to UNKNOWN_LANGUAGE and return false.
//
// lang_code can be an ISO 639-1 (two-letter) code, an ISO 639-2
// (three-letter) code, or a Google-specific code (see LanguageCode).
//
// Certain language-code aliases are also allowed:
//   For "zh-cn" and "zh_cn", set *language to CHINESE.
//   For "zh-tw" and "zh_tw", set *language to CHINESE_T.
//   For "he", set *language to HEBREW.
//   For "in", set *language to INDONESIAN.
//   For "ji", set *language to YIDDISH.
//   For "fil", set *language to TAGALOG.
//
// REQUIRES: 'language' must not be NULL.
    323 bool LanguageFromCode(const char* lang_code, Language *language);
    324 
    325 
    326 // LanguageFromCodeOrName
    327 // ----------------------
    328 //
// If lang_code_or_name is a language code or a language name,
    330 // set *language to the corresponding Language and return true.
    331 // Otherwise set *language to UNKNOWN_LANGUAGE and return false.
    332 //
    333 bool LanguageFromCodeOrName(const char* lang_code_or_name,
    334                             Language* language);
    335 
    336 // LanguageNameFromCode
    337 // --------------------
    338 //
    339 // If language_code is the code for a Language (see LanguageFromCode),
    340 // return the standard name of that language (see LanguageName).
    341 // Otherwise return invalid_language_name().
    342 //
    343 const char* LanguageNameFromCode(const char* language_code);
    344 
    345 
    346 // Miscellany
    347 
    348 // LanguageCodeToUnderscoreForm
    349 // ----------------------------
    350 //
    351 // Given a language code, convert the dash "-" to underscore "_".
    352 //
    353 // Specifically, if result_length <= strlen(lang_code), set result[0]
    354 // to '\0' and return false. Otherwise, copy lang_code to result,
    355 // converting every dash to an underscore, converting every character
    356 // before the first dash or underscore to lower case, and converting
    357 // every character after the first dash or underscore to upper
    358 // case. If there is no dash or underscore, convert the entire string
    359 // to lower case.
    360 //
    361 // REQUIRES: 'lang_code' must not be NULL. 'result' must not be NULL.
    362 
    363 bool LanguageCodeToUnderscoreForm(const char* lang_code,
    364                                   char* result,
    365                                   int result_length);
    366 
    367 //
    368 // AlwaysPutInExpectedRestrict
    369 // ---------------------------
    370 //
    371 // For Web pages in certain top-level domains, Web Search always
    372 // applies a "country restrict". If 'tld' matches one of those, using
    373 // a case-SENSITIVE comparison, set *expected_language to the Language
    374 // most commonly found in that top-level domain and return true.
    375 // Otherwise, set *expected_language to UNKNOWN_LANGUAGE and return false.
    376 bool AlwaysPutInExpectedRestrict(const char *tld, Language *expected_language);
    377 
    378 
    379 #endif  // LANGUAGES_PUBLIC_LANGUAGES_H_
    380