Home | History | Annotate | Download | only in encodings
      1 // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // This file is for i18n. It contains two enums, namely Language and
      6 // Encoding, where Language is the linguistic convention, and Encoding
      7 // contains information on both language encoding and character set.
      8 //
      9 // The language and encoding are both based on Teragram's conventions,
     10 // except for some common ISO-8859 encodings that are not detected by
     11 // Teragram but might be in the future.
     12 //
     13 // This file also includes functions that do mappings among
     14 // Language/Encoding enums, language/encoding string names (typically
     15 // the output from Language Encoding identifier), and language codes
     16 // (iso 639), and two-letter country codes (iso 3166)
     17 //
     18 // NOTE: Both Language and Encoding enums should always start from
     19 // zero value. This assumption has been made and used.
     20 //
     21 
     22 #ifndef ENCODINGS_LANG_ENC_H__
     23 #define ENCODINGS_LANG_ENC_H__
     24 
     25 #include "languages/public/languages.h"
     26 #include "encodings/public/encodings.h"
     27 
     28 
     29 // EncodingsForLanguage
     30 // --------------------
     31 //
     32 // Given the language, returns a pointer to an array of encodings this
     33 // language supports. Typically, the encs array has at least one
     34 // element: UNKNOWN_ENCODING, which is always the last element of the
     35 // array. The first encoding is the default encoding of the language.
     36 // Return NULL if the input is invalid.
     37 //
     38 // Note: The output encoding array does not include ASCII_7BIT, UTF8
     39 // or UNICODE which are good for all languages. TODO: Find out whether
     40 // it is better to include ASCII_7BIT, UTF8 and UNICODE or leave them
     41 // as special cases.
     42 //
     43 const Encoding* EncodingsForLanguage(Language lang);
     44 
     45 
     46 // DefaultEncodingForLanguage
     47 // --------------------------
     48 //
     49 // Given the language, returns the default encoding for the language
     50 // via the argument encoding.
     51 //
     52 // The function returns true if the input lang is valid. Otherwise,
     53 // false is returned, and encoding is set to UNKNOWN_ENCODING.
     54 //
     55 bool DefaultEncodingForLanguage(Language lang,
     56                                 Encoding *encoding);
     57 
     58 // LanguagesForEncoding
     59 // --------------------
     60 //
     61 // Given the encoding, returns a pointer to an array of languages this
     62 // encoding supports. Typically, the langs array has at least one
     63 // element: UNKNOWN_LANGUAGE, which is always the last element of the
     64 // array. The first language in the array is the most popular
     65 // language for that encoding. NULL is returned if the input is
     66 // invalid.
     67 //
     68 // Note: For ASCII_7BIT, UNICODE and UTF8, only ENGLISH and
     69 // UNKNOWN_LANGUAGE are returned. TODO: Find out whether to return all
     70 // the languages or to treat these encodings as special cases.
     71 //
     72 // For other known encodings, ENGLISH is always included. This is
     73 // because English (Latin) characters are included in each encoding.
     74 //
     75 const Language* LanguagesForEncoding(Encoding enc);
     76 
     77 // DefaultLanguageForEncoding
     78 // --------------------------
     79 //
     80 // Given the encoding, returns the default language for that encoding
     81 // via the argument language.
     82 //
     83 // The function returns true if the input enc is valid. Otherwise,
     84 // false is returned, and language is set to UNKNOWN_LANGUAGE.
     85 //
     86 // Note, this function is more useful for the encodings that have only
     87 // one corresponding language, e.g. shift_jis => Japanese. There are
     88 // cases where multiple languages share the same encoding, for which the
     89 // default language is an arbitrary choice from them.
     90 //
     91 bool DefaultLanguageForEncoding(Encoding enc, Language* language);
     92 
     93 //
     94 // IsLangEncCompatible
     95 // -------------------
     96 //
     97 // This function is to determine whether the input language and
     98 // encoding are compatible. For example, FRENCH and LATIN1 are
     99 // compatible, but FRENCH and GB are not.
    100 //
    101 // If either lang or enc is invalid return false.
    102 // If lang is unknown, return true.
    103 //    (e.g. we can detect a page's encoding as Latin1 from metatag info, but
    104 //     cannot derive its language since more than one language is
    105 //     encoded in Latin1)
    106 // If language is known, but encoding is unknown, return false.
    107 //    (return true will do us no good since we cannot convert to UTF8 anyway)
    108 // If enc is unicode or utf8, return true.
    109 // Otherwise check if lang is supported by enc and enc supported by
    110 // lang.
    111 //
    112 bool IsLangEncCompatible(Language lang, Encoding enc);
    113 
    114 //
    115 // DominantLanguageFromEncoding
    116 // ----------------------------
    117 //
    118 // This function determines whether there exists a dominant language for the
    119 // input encoding. For example, the encoding GB has a dominant
    120 // language (Chinese), but Latin1 does not.
    121 //
    122 // The word "dominant" is used here because English characters are
    123 // included in each encoding.
    124 //
    125 // If there is no dominant language for the encoding, such as Latin1,
    126 // UNKNOWN_LANGUAGE is returned.
    127 //
    128 Language DominantLanguageFromEncoding(Encoding enc);
    129 
    130 // LanguageCode
    131 // ------------------------
    132 // Given the Language and Encoding, return language code with dialects
    133 // (>= 2 letters).  Encoding is necessary to disambiguate between
    134 // Simplified and Traditional Chinese.
    135 //
    136 // See the note on Chinese Language Codes in
    137 // i18n/languages/public/languages.h
    138 // for the details.
    139 
    140 const char* LanguageCode(Language lang, Encoding enc);
    141 
    142 //
    143 // IsEncodingWithSupportedLanguage()
    144 // ---------------------------------
    145 //
    146 // There are some encodings listed here just because they are commonly
    147 // used.  There is no interface language for them yet. They are not
    148 // detected by Teragram, but can be detected from the meta info of the
    149 // HTML page.
    150 //
    151 // For example, we have listed ARABIC_ENCODING but there is no Arabic in
    152 // the Language enum. If the user inputs an Arabic query from the Google
    153 // main page, Netscape will just send the raw bytes to GWS, and GWS
    154 // will treat them as Latin1.  Therefore, there is no use to detect
    155 // ARABIC_ENCODING for indexing, since they will never match the
    156 // queries which are treated as Latin1 by GWS. On the contrary, if we
    157 // treat pages with ARABIC_ENCODING as UNKNOWN_ENCODING, Google will
    158 // let them fall through as Latin1 at indexing time. And there might be a
    159 // match for some ARABIC queries which are also treated as Latin1 by
    160 // GWS. In fact, some people are relying on this feature to do Arabic
    161 // searches.
    162 //
    163 // Thus for these types of encodings, until we have UI support for
    164 // their languages and fairly comprehensive language/encoding
    165 // identification quality, it is better to revert them to
    166 // UNKNOWN_ENCODING.
    167 //
    168 // This function checks whether the input encoding is one with
    169 // an interface language.
    170 bool IsEncodingWithSupportedLanguage(Encoding enc);
    171 
    172 
    173 //
    174 // LangsFromCountryCode and EncFromCountryCode
    175 // -------------------------------------------
    176 //
    177 // These two functions return the possible languages and encodings,
    178 // respectively, according to the input country code, which is a
    179 // 2-letter string. The country code is usually specified in the url
    180 // of a document.
    181 //
    182 //
    183 
    184 // LangsFromCountryCode
    185 // --------------------
    186 //
    187 // This function takes a string of arbitrary length. It treats the
    188 // first 2 bytes of the string as the country code, as defined in iso
    189 // 3166-1993 (E).  It returns, via arguments, an array of the
    190 // languages that are popular in that country, roughly in order of
    191 // popularity, together with the size of the array.
    192 //
    193 // This function returns true if we have language information for
    194 // country_code.  Otherwise, it returns false.
    195 //
    196 bool LangsFromCountryCode(const char* country_code,
    197                           const Language** lang_arry,
    198                           int* num_langs);
    199 
    200 
    201 //
    202 // EncFromCountryCode
    203 // ------------------
    204 //
    205 // This function takes a string of arbitrary length. It treats the
    206 // first 2 bytes of that string as the country code, as defined in iso
    207 // 3166-1993 (E). It sets *enc to the encoding that is
    208 // most often used for the languages spoken in that country.
    209 //
    210 // This function returns true if we have encoding information for
    211 // country_code.  Otherwise, it returns false, and *enc is set to
    212 // UNKNOWN_ENCODING.
    213 //
    214 bool EncFromCountryCode(const char* country_code, Encoding* enc);
    215 
    216 
    217 
    218 // VisualType
    219 // ----------
    220 //
    221 // Right-to-left documents may be in logical or visual order. When they
    222 // are in visual order we convert them to logical order before processing.
    223 // This enum lists the types of visual document we can encounter.
    224 // Some, but not all, documents in Hebrew/Arabic/Persian etc. will be visual.
    225 // The other documents in those languages, and all documents in non-RTL
    226 // languages, will be NOT_VISUAL_DOCUMENT.
    227 enum VisualType {
    228   NOT_VISUAL_DOCUMENT = 0,  // RTL documents not in visual order, and all non-RTL documents.
    229   VISUAL_HEBREW_HTML,  // HTML documents in the legacy visual order.
    230   CONVERTED_RTL_PDF,   // Converted RTL PDFs, which are always visual.
    231 };
    232 
    233 VisualType default_visualtype();
    234 
    235 // VisualTypeName
    236 // --------------
    237 //
    238 // Given the visual type, returns a string name useful for debug output.
    239 const char* VisualTypeName(VisualType visualtype);
    240 
    241 
    242 
    243 // InitLangEnc
    244 // -----------
    245 //
    246 // Ensures the LangEnc module has been initialized.  Normally this
    247 // happens during InitGoogle, but this allows access for scripts that
    248 // don't support InitGoogle. InitLangEnc calls InitEncodings (see
    249 // i18n/encodings/public/encodings.h) and also initializes data
    250 // structures used in lang_enc.cc.
    251 //
    252 void InitLangEnc();
    253 
    254 #endif  // ENCODINGS_LANG_ENC_H__
    255