Home | History | Annotate | Download | only in public
      1 // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #ifndef ENCODINGS_PUBLIC_ENCODINGS_H_
      6 #define ENCODINGS_PUBLIC_ENCODINGS_H_
      7 
      8 // This interface defines the Encoding enum and various functions that
      9 // depend only on Encoding values.
     10 
     11 // A hash-function for Encoding, hash<Encoding>, is defined in
     12 // i18n/encodings/public/encodings-hash.h
     13 
     14 // On some Windows projects, UNICODE may be defined, which would prevent the
     15 // Encoding enum below from compiling. Note that this is a quick fix that does
     16 // not break any existing projects. The UNICODE enum may someday be changed
     17 // to something more specific and non-colliding, but this involves careful
     18 // testing of changes in many other projects.
     19 #undef UNICODE
     20 
     21 // NOTE: The Encoding enum must always start at 0. This assumption has
     22 // been made and used.
     23 
     24 #ifndef SWIG
     25 
     26 #include "encodings/proto/encodings.pb.h"
     27 
     28 // We must have this for compatibility.
     29 // COMMENTED OUT TO REDUCE DEPENDENCIES ON GOOGLE3 CODE
     30 //using namespace i18n::encodings;
     31 
     32 #else
     33 
     34 // Special proto SWIG workaround header file.
     35 #include "i18n/encodings/internal/encodings_proto_wrapper.h"
     36 
     37 #endif
     38 
     39 const int kNumEncodings = NUM_ENCODINGS;
     40 
     41 // some of the popular encoding aliases
     42 // TODO(jrm) Make these static const Encoding values instead of macros.
     43 #define LATIN1           ISO_8859_1
     44 #define LATIN2           ISO_8859_2
     45 #define LATIN3           ISO_8859_3
     46 #define LATIN4           ISO_8859_4
     47 #define CYRILLIC         ISO_8859_5
     48 #define ARABIC_ENCODING  ISO_8859_6     // avoiding the same name as language
     49 #define GREEK_ENCODING   ISO_8859_7     // avoiding the same name as language
     50 #define HEBREW_ENCODING  ISO_8859_8     // avoiding the same name as language
     51 #define LATIN5           ISO_8859_9
     52 #define LATIN6           ISO_8859_10
     53 #define KOREAN_HANGUL    KOREAN_EUC_KR
     54 
     55 // The default Encoding (LATIN1).
     56 Encoding default_encoding();
     57 
     58 
     59 
     60 // *************************************************************
     61 // Encoding predicates
     62 //   IsValidEncoding()
     63 //   IsEncEncCompatible
     64 //   IsSupersetOfAscii7Bit
     65 //   Is8BitEncoding
     66 //   IsCJKEncoding
     67 //   IsHebrewEncoding
     68 //   IsRightToLeftEncoding
     69 //   IsLogicalRightToLeftEncoding
     70 //   IsVisualRightToLeftEncoding
     71 //   IsIso2022Encoding
     72 //   IsIso2022JpOrVariant
     73 //   IsShiftJisOrVariant
     74 //   IsJapaneseCellPhoneCarrierSpecificEncoding
     75 // *************************************************************
     76 
     77 // IsValidEncoding
     78 // ---------------
     79 //
     80 // Checks whether the input encoding enum value is within range.
     81 //
     82 
     83 bool IsValidEncoding(Encoding enc);
     84 
     85 //
     86 // IsEncEncCompatible
     87 // ------------------
     88 //
     89 // This function is to determine whether or not converting from the
     90 // first encoding to the second requires any changes to the underlying
     91 // text (e.g.  ASCII_7BIT is a subset of UTF8).
     92 //
     93 // TODO(someone more familiar with i18n): the current implementation
     94 // is likely incomplete.  It would be good to consider the full matrix
     95 // of all pairs of encodings and to fish out all compatible pairs.
     96 //
     97 bool IsEncEncCompatible(const Encoding from, const Encoding to);
     98 
     99 // To be a superset of 7-bit Ascii means that bytes 0...127 in the given
    100 // encoding represent the same characters as they do in ISO_8859_1.
    101 
    102 // WARNING: This function does not currently return true for all encodings that
    103 // are supersets of Ascii 7-bit.
    104 bool IsSupersetOfAscii7Bit(Encoding e);
    105 
    106 // To be an 8-bit encoding means that there are at most 256 symbols.
    107 // Each byte determines a new character; there are no multi-byte sequences.
    108 
    109 // WARNING: This function does not currently return true for all encodings that
    110 // are 8-bit encodings.
    111 bool Is8BitEncoding(Encoding e);
    112 
    113 // IsCJKEncoding
    114 // -------------
    115 //
    116 // This function returns true if the encoding is either Chinese
    117 // (simplified or traditional), Japanese, or Korean. Note: UTF8 is not
    118 // considered a CJK encoding.
    119 bool IsCJKEncoding(Encoding e);
    120 
    121 // IsHebrewEncoding
    122 // ----------------
    123 //
    124 // This function returns true if the encoding is a Hebrew-specific
    125 // encoding (not UTF8, etc).
    126 bool IsHebrewEncoding(Encoding e);
    127 
    128 // IsRightToLeftEncoding
    129 // ---------------------
    130 //
    131 // Returns true if the encoding is a right-to-left encoding.
    132 //
    133 // Note that the name of this function is somewhat misleading. There is nothing
    134 // "right to left" about these encodings. They merely contain code points for
    135 // characters in RTL languages such as Hebrew and Arabic. But this is also
    136 // true for UTF-8.
    137 //
    138 // TODO(benjy): Get rid of this function. The only special case we
    139 // should need to worry about is visual encodings. Anything we
    140 // need to do for all 'RTL' encodings we need to do for UTF-8 as well.
    141 bool IsRightToLeftEncoding(Encoding enc);
    142 
    143 // IsLogicalRightToLeftEncoding
    144 // ----------------------------
    145 //
    146 // Returns true if the encoding is a logical right-to-left encoding.
    147 // Logical right-to-left encodings are those that the browser renders
    148 // right-to-left and applies the BiDi algorithm to. Therefore the characters
    149 // appear in reading order in the file, and indexing, snippet generation etc.
    150 // should all just work with no special processing.
    151 //
    152 // TODO(benjy): Get rid of this function. The only special case we
    153 // should need to worry about is visual encodings.
    154 bool IsLogicalRightToLeftEncoding(Encoding enc);
    155 
    156 // IsVisualRightToLeftEncoding
    157 // ---------------------------
    158 //
    159 // Returns true if the encoding is a visual right-to-left encoding.
    160 // Visual right-to-left encodings are those that the browser renders
    161 // left-to-right and does not apply the BiDi algorithm to. Therefore each
    162 // line appears in reverse order in the file, lines are manually wrapped
    163 // by abusing <br> or <p> tags, etc. Visual RTL encoding is a relic of
    164 // the prehistoric days when browsers couldn't render right-to-left, but
    165 // unfortunately some visual pages persist to this day. These documents require
    166 // special processing so that we don't index or snippet them with each line
    167 // reversed.
    168 bool IsVisualRightToLeftEncoding(Encoding enc);
    169 
    170 // IsIso2022Encoding
    171 // -----------------
    172 //
    173 // Returns true if the encoding is a kind of ISO 2022 such as
    174 // ISO-2022-JP.
    175 bool IsIso2022Encoding(Encoding enc);
    176 
    177 // IsIso2022JpOrVariant
    178 // --------------------
    179 //
    180 // Returns true if the encoding is ISO-2022-JP or a variant such as
    181 // KDDI's ISO-2022-JP.
    182 bool IsIso2022JpOrVariant(Encoding enc);
    183 
    184 // IsShiftJisOrVariant
    185 // --------------------
    186 //
    187 // Returns true if the encoding is Shift_JIS or a variant such as
    188 // KDDI's Shift_JIS.
    189 bool IsShiftJisOrVariant(Encoding enc);
    190 
    191 // IsJapaneseCellPhoneCarrierSpecificEncoding
    192 // ------------------------------------------
    193 //
    194 // Returns true if it's a Japanese cell phone carrier-specific encoding
    195 // such as KDDI_SHIFT_JIS.
    196 bool IsJapaneseCellPhoneCarrierSpecificEncoding(Encoding enc);
    197 
    198 
    199 
    200 // *************************************************************
    201 // ENCODING NAMES
    202 //
    203 // This interface defines a standard name for each valid encoding, and
    204 // a standard name for invalid encodings. (Some names use all upper
    205 // case, but others use mixed case.)
    206 //
    207 //   EncodingName() [Encoding to name]
    208 //   MimeEncodingName() [Encoding to name]
    209 //   EncodingFromName() [name to Encoding]
    210 //   EncodingNameAliasToEncoding() [name to Encoding]
    211 //   default_encoding_name()
    212 //   invalid_encoding_name()
    213 // *************************************************************
    214 
    215 // EncodingName
    216 // ------------
    217 //
    218 // Given the encoding, returns its standard name.
    219 // Return invalid_encoding_name() if the encoding is invalid.
    220 //
    221 const char* EncodingName(Encoding enc);
    222 
    223 //
    224 // MimeEncodingName
    225 // ----------------
    226 //
    227 // Return the "preferred MIME name" of an encoding.
    228 //
    229 // This name is suitable for using in HTTP headers, HTML tags,
    230 // and as the "charset" parameter of a MIME Content-Type.
    231 const char* MimeEncodingName(Encoding enc);
    232 
    233 
    234 // The maximum length of an encoding name
    235 const int kMaxEncodingNameSize = 50;
    236 
    237 // The standard name of the default encoding.
    238 const char* default_encoding_name();
    239 
    240 // The name used for an invalid encoding.
    241 const char* invalid_encoding_name();
    242 
    243 // EncodingFromName
    244 // ----------------
    245 //
    246 // If enc_name matches the standard name of an Encoding, using a
    247 // case-insensitive comparison, set *encoding to that Encoding and
    248 // return true.  Otherwise set *encoding to UNKNOWN_ENCODING and
    249 // return false.
    250 //
    251 // REQUIRES: encoding must not be NULL.
    252 //
    253 bool EncodingFromName(const char* enc_name, Encoding *encoding);
    254 
    255 //
    256 // EncodingNameAliasToEncoding
    257 // ---------------------------
    258 //
    259 // If enc_name matches the standard name or an alias of an Encoding,
    260 // using a case-insensitive comparison, return that
    261 // Encoding. Otherwise, return UNKNOWN_ENCODING.
    262 //
    263 // Aliases include most MIME encoding names (e.g., "ISO-8859-7" for
    264 // GREEK_ENCODING), alternate names (e.g., "cyrillic" for ISO_8859_5) and
    265 // common variations with hyphens and underscores (e.g., "koi8-u" and
    266 // "koi8u" for RUSSIAN_KOI8_R).
    267 
    268 Encoding EncodingNameAliasToEncoding(const char *enc_name);
    269 
    270 
    271 // *************************************************************
    272 // Miscellany
    273 // *************************************************************
    274 
    275 // PreferredWebOutputEncoding
    276 // --------------------------
    277 //
    278 // Some multi-byte encodings use byte values that coincide with the
    279 // ASCII codes for HTML syntax characters <>"&' and browsers like MSIE
    280 // can misinterpret these, as indicated in an external XSS report from
    281 // 2007-02-15. Here, we map these dangerous encodings to safer ones. We
    282 // also use UTF8 instead of encodings that we don't support in our
    283 // output, and we generally try to be conservative in what we send out.
    284 // Where the client asks for single- or double-byte encodings that are
    285 // not as common, we substitute a more common single- or double-byte
    286 // encoding, if there is one, thereby preserving the client's intent
    287 // to use less space than UTF-8. This also means that characters
    288 // outside the destination set will be converted to HTML NCRs (&#NNN;)
    289 // if requested.
    290 Encoding PreferredWebOutputEncoding(Encoding enc);
    291 
    292 
    293 // InitEncodings
    294 // -------------
    295 //
    296 // Ensures the encodings module has been initialized.  Normally this happens
    297 // during InitGoogle, but this allows access for scripts that don't
    298 // support InitGoogle.
    299 void InitEncodings();
    300 
    301 #endif  // ENCODINGS_PUBLIC_ENCODINGS_H_
    302