// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// This file is for i18n. It contains two enums, namely Language and
// Encoding, where Language is the linguistic convention, and Encoding
// contains information on both language encoding and character set.
//
// The language and encoding are both based on Teragram's conventions,
// except for some common ISO-8859 encodings that are not detected by
// Teragram but might be in the future.
//
// This file also includes functions that map among Language/Encoding
// enums, language/encoding string names (typically the output of the
// Language Encoding identifier), language codes (ISO 639), and
// two-letter country codes (ISO 3166).
//
// NOTE: Both the Language and Encoding enums should always start at
// zero. This assumption has been made and is relied upon.
//

#ifndef ENCODINGS_LANG_ENC_H__
#define ENCODINGS_LANG_ENC_H__

#include "languages/public/languages.h"
#include "encodings/public/encodings.h"


// EncodingsForLanguage
// --------------------
//
// Given the language, returns a pointer to an array of the encodings
// this language supports. Typically, the array has at least one
// element: UNKNOWN_ENCODING, which is always the last element of the
// array. The first encoding is the default encoding of the language.
// Returns NULL if the input is invalid.
//
// Note: The output encoding array does not include ASCII_7BIT, UTF8
// or UNICODE, which are good for all languages. TODO: Find out whether
// it is better to include ASCII_7BIT, UTF8 and UNICODE or leave them
// as special cases.
//
const Encoding* EncodingsForLanguage(Language lang);


// DefaultEncodingForLanguage
// --------------------------
//
// Given the language, returns the default encoding for the language
// via the argument encoding.
//
// The function returns true if the input lang is valid. Otherwise,
// false is returned, and encoding is set to UNKNOWN_ENCODING.
//
bool DefaultEncodingForLanguage(Language lang,
                                Encoding *encoding);

// LanguagesForEncoding
// --------------------
//
// Given the encoding, returns a pointer to an array of the languages
// this encoding supports. Typically, the array has at least one
// element: UNKNOWN_LANGUAGE, which is always the last element of the
// array. The first language in the array is the most popular
// language for that encoding. NULL is returned if the input is
// invalid.
//
// Note: For ASCII_7BIT, UNICODE and UTF8, only ENGLISH and
// UNKNOWN_LANGUAGE are returned. TODO: Find out whether to return all
// the languages or to treat these encodings as special cases.
//
// For other known encodings, ENGLISH is always included. This is
// because English (Latin) characters are included in each encoding.
//
const Language* LanguagesForEncoding(Encoding enc);

// DefaultLanguageForEncoding
// --------------------------
//
// Given the encoding, returns the default language for that encoding
// via the argument language.
//
// The function returns true if the input enc is valid. Otherwise,
// false is returned, and language is set to UNKNOWN_LANGUAGE.
//
// Note: this function is most useful for encodings that have only
// one corresponding language, e.g. shift_jis => Japanese. In some
// cases multiple languages share the same encoding; for those, the
// default language is an arbitrary choice among them.
//
bool DefaultLanguageForEncoding(Encoding enc, Language* language);

//
// IsLangEncCompatible
// -------------------
//
// Determines whether the input language and encoding are compatible.
// For example, FRENCH and LATIN1 are compatible, but FRENCH and GB
// are not.
//
// If either lang or enc is invalid, return false.
// If lang is unknown, return true.
//   (e.g. we can detect a page's encoding as Latin1 from metatag info,
//    but cannot derive its language, since more than one language is
//    encoded in Latin1)
// If the language is known but the encoding is unknown, return false.
//   (returning true would do us no good, since we cannot convert to
//    UTF8 anyway)
// If enc is unicode or utf8, return true.
// Otherwise, check whether lang is supported by enc and enc is
// supported by lang.
//
bool IsLangEncCompatible(Language lang, Encoding enc);

//
// DominantLanguageFromEncoding
// ----------------------------
//
// Determines whether there exists a dominant language for the input
// encoding. For example, the encoding GB has a dominant language
// (Chinese), but Latin1 does not.
//
// The word "dominant" is used here because English characters are
// included in each encoding.
//
// If there is no dominant language for the encoding, such as Latin1,
// UNKNOWN_LANGUAGE is returned.
//
Language DominantLanguageFromEncoding(Encoding enc);

// LanguageCode
// ------------------------
// Given the Language and Encoding, returns the language code with
// dialects (>= 2 letters). The encoding is necessary to disambiguate
// between Simplified and Traditional Chinese.
//
// See the note on Chinese Language Codes in
// i18n/languages/public/languages.h
// for the details.

const char* LanguageCode(Language lang, Encoding enc);

//
// IsEncodingWithSupportedLanguage()
// ---------------------------------
//
// Some encodings are listed here just because they are commonly
// used. There is no interface language for them yet. They are not
// detected by Teragram, but can be detected from the meta info of the
// HTML page.
//
// For example, we list ARABIC_ENCODING but there is no Arabic in
// the Language enum. If the user inputs an Arabic query from the
// Google main page, Netscape will just send the raw bytes to GWS, and
// GWS will treat them as Latin1. Therefore, there is no use in
// detecting ARABIC_ENCODING for indexing, since such pages will never
// match the queries, which are treated as Latin1 by GWS. On the
// contrary, if we treat a page with ARABIC_ENCODING as
// UNKNOWN_ENCODING, Google will let it fall through as Latin1 at
// indexing time, and there might be a match for some Arabic queries,
// which are also treated as Latin1 by GWS. In fact, some people are
// relying on this feature to do Arabic searches.
//
// Thus, for these types of encodings, until we have UI support for
// their language and a fairly comprehensive language/encoding
// identification quality, it is better to revert them to
// UNKNOWN_ENCODING.
//
// This function checks whether the input encoding is one with
// an interface language.
bool IsEncodingWithSupportedLanguage(Encoding enc);


//
// LangsFromCountryCode and EncFromCountryCode
// -------------------------------------------
//
// These two functions return the possible languages and encodings,
// respectively, according to the input country code, which is a
// 2-letter string. The country code is usually specified in the URL
// of a document.
//
//

// LangsFromCountryCode
// --------------------
//
// This function takes a string of arbitrary length. It treats the
// first 2 bytes of the string as the country code, as defined in ISO
// 3166-1993 (E). It returns, via arguments, an array of the
// languages that are popular in that country, roughly in order of
// popularity, together with the size of the array.
//
// This function returns true if we have language information for
// country_code. Otherwise, it returns false.
//
bool LangsFromCountryCode(const char* country_code,
                          const Language** lang_arry,
                          int* num_langs);


//
// EncFromCountryCode
// ------------------
//
// This function takes a string of arbitrary length. It treats the
// first 2 bytes of that string as the country code, as defined in ISO
// 3166-1993 (E). It sets *enc to the encoding that is
// most often used for the languages spoken in that country.
//
// This function returns true if we have encoding information for
// country_code. Otherwise, it returns false, and *enc is set to
// UNKNOWN_ENCODING.
//
bool EncFromCountryCode(const char* country_code, Encoding* enc);



// VisualType
// ----------
//
// Right-to-left documents may be in logical or visual order. When they
// are in visual order we convert them to logical order before processing.
// This enum lists the types of visual document we can encounter.
// Some, but not all, documents in Hebrew/Arabic/Persian etc. will be visual.
// The other documents in those languages, and all documents in non-RTL
// languages, will be NOT_VISUAL_DOCUMENT.
enum VisualType {
  NOT_VISUAL_DOCUMENT = 0,
  VISUAL_HEBREW_HTML,  // HTML documents in the legacy visual order.
  CONVERTED_RTL_PDF,   // Converted RTL PDFs, which are always visual.
};

// default_visualtype
// ------------------
//
// Returns the default VisualType.
// NOTE(review): the actual default value is not visible in this header --
// presumably NOT_VISUAL_DOCUMENT; confirm against the implementation.
VisualType default_visualtype();

// VisualTypeName
// --------------
//
// Given the visual type, returns a string name useful for debug output.
const char* VisualTypeName(VisualType visualtype);



// InitLangEnc
// -----------
//
// Ensures the LangEnc module has been initialized. Normally this
// happens during InitGoogle, but this allows access for scripts that
// don't support InitGoogle. InitLangEnc calls InitEncodings (see
// i18n/encodings/public/encodings.h) and also initializes data
// structures used in lang_enc.cc.
//
void InitLangEnc();

#endif  // ENCODINGS_LANG_ENC_H__