// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

      5 #ifndef ENCODINGS_PROTO_ENCODINGS_PB_H_
      6 #define ENCODINGS_PROTO_ENCODINGS_PB_H_
      7 
      8 enum Encoding {
      9   ISO_8859_1           =  0,  // Teragram ASCII
     10   ISO_8859_2           =  1,  // Teragram Latin2
     11   ISO_8859_3           =  2,  // in BasisTech but not in Teragram
     12   ISO_8859_4           =  3,  // Teragram Latin4
     13   ISO_8859_5           =  4,  // Teragram ISO-8859-5
     14   ISO_8859_6           =  5,  // Teragram Arabic
     15   ISO_8859_7           =  6,  // Teragram Greek
     16   ISO_8859_8           =  7,  // Teragram Hebrew
     17   ISO_8859_9           =  8,  // in BasisTech but not in Teragram
     18   ISO_8859_10          =  9,  // in BasisTech but not in Teragram
     19   JAPANESE_EUC_JP      = 10,  // Teragram EUC_JP
     20   JAPANESE_SHIFT_JIS   = 11,  // Teragram SJS
     21   JAPANESE_JIS         = 12,  // Teragram JIS
     22   CHINESE_BIG5         = 13,  // Teragram BIG5
     23   CHINESE_GB           = 14,  // Teragram GB
     24   CHINESE_EUC_CN       = 15,  // Misnamed. Should be EUC_TW. Was Basis Tech
     25                               // CNS11643EUC, before that Teragram EUC-CN(!)
     26                               // See //i18n/basistech/basistech_encodings.h
     27   KOREAN_EUC_KR        = 16,  // Teragram KSC
     28   UNICODE              = 17,  // Teragram Unicode
     29   CHINESE_EUC_DEC      = 18,  // Misnamed. Should be EUC_TW. Was Basis Tech
     30                               // CNS11643EUC, before that Teragram EUC.
     31   CHINESE_CNS          = 19,  // Misnamed. Should be EUC_TW. Was Basis Tech
     32                               // CNS11643EUC, before that Teragram CNS.
     33   CHINESE_BIG5_CP950   = 20,  // Teragram BIG5_CP950
     34   JAPANESE_CP932       = 21,  // Teragram CP932
     35   UTF8                 = 22,
     36   UNKNOWN_ENCODING     = 23,
     37   ASCII_7BIT           = 24,  // ISO_8859_1 with all characters <= 127.
     38                               // Should be present only in the crawler
     39                               // and in the repository,
     40                               // *never* as a result of Document::encoding().
     41   RUSSIAN_KOI8_R       = 25,  // Teragram KOI8R
     42   RUSSIAN_CP1251       = 26,  // Teragram CP1251
     43 
     44   //----------------------------------------------------------
     45   // These are _not_ output from teragram. Instead, they are as
     46   // detected in the headers of usenet articles.
     47   MSFT_CP1252          = 27,  // 27: CP1252 aka MSFT euro ascii
     48   RUSSIAN_KOI8_RU      = 28,  // CP21866 aka KOI8-U, used for Ukrainian.
     49                               // Misnamed, this is _not_ KOI8-RU but KOI8-U.
     50                               // KOI8-U is used much more often than KOI8-RU.
     51   MSFT_CP1250          = 29,  // CP1250 aka MSFT eastern european
     52   ISO_8859_15          = 30,  // aka ISO_8859_0 aka ISO_8859_1 euroized
     53   //----------------------------------------------------------
     54 
     55   //----------------------------------------------------------
     56   // These are in BasisTech but not in Teragram. They are
     57   // needed for new interface languages. Now detected by
     58   // research langid
     59   MSFT_CP1254          = 31,  // used for Turkish
     60   MSFT_CP1257          = 32,  // used in Baltic countries
     61   //----------------------------------------------------------
     62 
     63   //----------------------------------------------------------
     64   //----------------------------------------------------------
     65   // New encodings detected by Teragram
     66   ISO_8859_11          = 33,  // aka TIS-620, used for Thai
     67   MSFT_CP874           = 34,  // used for Thai
     68   MSFT_CP1256          = 35,  // used for Arabic
     69 
     70   //----------------------------------------------------------
     71   // Detected as ISO_8859_8 by Teragram, but can be found in META tags
     72   MSFT_CP1255          = 36,  // Logical Hebrew Microsoft
     73   ISO_8859_8_I         = 37,  // Iso Hebrew Logical
     74   HEBREW_VISUAL        = 38,  // Iso Hebrew Visual
     75   //----------------------------------------------------------
     76 
     77   //----------------------------------------------------------
     78   // Detected by research langid
     79   CZECH_CP852          = 39,
     80   CZECH_CSN_369103     = 40,  // aka ISO_IR_139 aka KOI8_CS
     81   MSFT_CP1253          = 41,  // used for Greek
     82   RUSSIAN_CP866        = 42,
     83   //----------------------------------------------------------
     84 
     85   //----------------------------------------------------------
     86   // Handled by iconv in glibc
     87   ISO_8859_13          = 43,
     88   ISO_2022_KR          = 44,
     89   GBK                  = 45,
     90   GB18030              = 46,
     91   BIG5_HKSCS           = 47,
     92   ISO_2022_CN          = 48,
     93 
     94   //-----------------------------------------------------------
     95   // Detected by xin liu's detector
     96   // Handled by transcoder
     97   // (Indic encodings)
     98 
     99   TSCII                = 49,
    100   TAMIL_MONO           = 50,
    101   TAMIL_BI             = 51,
    102   JAGRAN               = 52,
    103 
    104 
    105   MACINTOSH_ROMAN      = 53,
    106   UTF7                 = 54,
    107   BHASKAR              = 55,  // Indic encoding - Devanagari
    108   HTCHANAKYA           = 56,  // 56 Indic encoding - Devanagari
    109 
    110   //-----------------------------------------------------------
    111   // These allow a single place (inputconverter and outputconverter)
    112   // to do UTF-16 <==> UTF-8 bulk conversions and UTF-32 <==> UTF-8
    113   // bulk conversions, with interchange-valid checking on input and
    114   // fallback if needed on ouput.
    115   UTF16BE              = 57,  // big-endian UTF-16
    116   UTF16LE              = 58,  // little-endian UTF-16
    117   UTF32BE              = 59,  // big-endian UTF-32
    118   UTF32LE              = 60,  // little-endian UTF-32
    119   //-----------------------------------------------------------
    120 
    121   //-----------------------------------------------------------
    122   // An encoding that means "This is not text, but it may have some
    123   // simple ASCII text embedded". Intended input conversion (not yet
    124   // implemented) is to keep strings of >=4 seven-bit ASCII characters
    125   // (follow each kept string with an ASCII space), delete the rest of
    126   // the bytes. This will pick up and allow indexing of e.g. captions
    127   // in JPEGs. No output conversion needed.
    128   BINARYENC            = 61,
    129   //-----------------------------------------------------------
    130 
    131   //-----------------------------------------------------------
    132   // Some Web pages allow a mixture of HZ-GB and GB-2312 by using
    133   // ~{ ... ~} for 2-byte pairs, and the browsers support this.
    134   HZ_GB_2312           = 62,
    135   //-----------------------------------------------------------
    136 
    137   //-----------------------------------------------------------
    138   // Some external vendors make the common input error of
    139   // converting MSFT_CP1252 to UTF8 *twice*. No output conversion needed.
    140   UTF8UTF8             = 63,
    141   //-----------------------------------------------------------
    142 
    143   //-----------------------------------------------------------
    144   // Handled by transcoder for tamil language specific font
    145   // encodings without the support for detection at present.
    146   TAM_ELANGO           = 64,  // Elango - Tamil
    147   TAM_LTTMBARANI       = 65,  // Barani - Tamil
    148   TAM_SHREE            = 66,  // Shree - Tamil
    149   TAM_TBOOMIS          = 67,  // TBoomis - Tamil
    150   TAM_TMNEWS           = 68,  // TMNews - Tamil
    151   TAM_WEBTAMIL         = 69,  // Webtamil - Tamil
    152   //-----------------------------------------------------------
    153 
    154   //-----------------------------------------------------------
    155   // Shift_JIS variants used by Japanese cell phone carriers.
    156   KDDI_SHIFT_JIS       = 70,
    157   DOCOMO_SHIFT_JIS     = 71,
    158   SOFTBANK_SHIFT_JIS   = 72,
    159   // ISO-2022-JP variants used by KDDI and SoftBank.
    160   KDDI_ISO_2022_JP     = 73,
    161   SOFTBANK_ISO_2022_JP = 74,
    162   //-----------------------------------------------------------
    163 
    164   NUM_ENCODINGS        = 75,  // Always keep this at the end. It is not a
    165                               // valid Encoding enum, it is only used to
    166                               // indicate the total number of Encodings.
    167 };
    168 
    169 #endif  // ENCODINGS_PROTO_ENCODINGS_PB_H_
    170