Home | History | Annotate | Download | only in text
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4  ****************************************************************************
      5  * Copyright (C) 2005-2012, International Business Machines Corporation and *
      6  * others. All Rights Reserved.                                             *
      7  ****************************************************************************
      8  *
      9  */
     10 package com.ibm.icu.text;
     11 
     12 import java.util.Arrays;
     13 
/**
 * CharsetRecognizer implementation for Asian - double or multi-byte - charsets.
 *                   Match is determined mostly by the input data adhering to the
 *                   encoding scheme for the charset, and, optionally,
 *                   frequency-of-occurrence of characters.
 * <p>
 *                   Instances of this class are singletons, one per encoding
 *                   being recognized.  They are created in the main
 *                   CharsetDetector class and kept in the global list of available
 *                   encodings to be checked.  The specific encoding being recognized
 *                   is determined by subclass.
 */
     26 abstract class CharsetRecog_mbcs extends CharsetRecognizer {
     27 
     28    /**
     29      * Get the IANA name of this charset.
     30      * @return the charset name.
     31      */
     32     @Override
     33     abstract String      getName() ;
     34 
     35 
     36     /**
     37      * Test the match of this charset with the input text data
     38      *      which is obtained via the CharsetDetector object.
     39      *
     40      * @param det  The CharsetDetector, which contains the input text
     41      *             to be checked for being in this charset.
     42      * @return     Two values packed into one int  (Damn java, anyhow)
     43      *             <br/>
     44      *             bits 0-7:  the match confidence, ranging from 0-100
     45      *             <br/>
     46      *             bits 8-15: The match reason, an enum-like value.
     47      */
     48     int match(CharsetDetector det, int [] commonChars) {
     49         @SuppressWarnings("unused")
     50         int   singleByteCharCount = 0;  //TODO Do we really need this?
     51         int   doubleByteCharCount = 0;
     52         int   commonCharCount     = 0;
     53         int   badCharCount        = 0;
     54         int   totalCharCount      = 0;
     55         int   confidence          = 0;
     56         iteratedChar   iter       = new iteratedChar();
     57 
     58         detectBlock: {
     59             for (iter.reset(); nextChar(iter, det);) {
     60                 totalCharCount++;
     61                 if (iter.error) {
     62                     badCharCount++;
     63                 } else {
     64                     long cv = iter.charValue & 0xFFFFFFFFL;
     65 
     66                     if (cv <= 0xff) {
     67                         singleByteCharCount++;
     68                     } else {
     69                         doubleByteCharCount++;
     70                         if (commonChars != null) {
     71                             // NOTE: This assumes that there are no 4-byte common chars.
     72                             if (Arrays.binarySearch(commonChars, (int) cv) >= 0) {
     73                                 commonCharCount++;
     74                             }
     75                         }
     76                     }
     77                 }
     78                 if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
     79                     // Bail out early if the byte data is not matching the encoding scheme.
     80                     break detectBlock;
     81                 }
     82             }
     83 
     84             if (doubleByteCharCount <= 10 && badCharCount== 0) {
     85                 // Not many multi-byte chars.
     86                 if (doubleByteCharCount == 0 && totalCharCount < 10) {
     87                     // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
     88                     // We don't have enough data to have any confidence.
     89                     // Statistical analysis of single byte non-ASCII charcters would probably help here.
     90                     confidence = 0;
     91                 }
     92                 else {
     93                     //   ASCII or ISO file?  It's probably not our encoding,
     94                     //   but is not incompatible with our encoding, so don't give it a zero.
     95                     confidence = 10;
     96                 }
     97 
     98                 break detectBlock;
     99             }
    100 
    101             //
    102             //  No match if there are too many characters that don't fit the encoding scheme.
    103             //    (should we have zero tolerance for these?)
    104             //
    105             if (doubleByteCharCount < 20*badCharCount) {
    106                 confidence = 0;
    107                 break detectBlock;
    108             }
    109 
    110             if (commonChars == null) {
    111                 // We have no statistics on frequently occuring characters.
    112                 //  Assess confidence purely on having a reasonable number of
    113                 //  multi-byte characters (the more the better
    114                 confidence = 30 + doubleByteCharCount - 20*badCharCount;
    115                 if (confidence > 100) {
    116                     confidence = 100;
    117                 }
    118             }else {
    119                 //
    120                 // Frequency of occurence statistics exist.
    121                 //
    122                 double maxVal = Math.log((float)doubleByteCharCount / 4);
    123                 double scaleFactor = 90.0 / maxVal;
    124                 confidence = (int)(Math.log(commonCharCount+1) * scaleFactor + 10);
    125                 confidence = Math.min(confidence, 100);
    126             }
    127         }   // end of detectBlock:
    128 
    129         return confidence;
    130     }
    131 
    132      // "Character"  iterated character class.
    133      //    Recognizers for specific mbcs encodings make their "characters" available
    134      //    by providing a nextChar() function that fills in an instance of iteratedChar
    135      //    with the next char from the input.
    136      //    The returned characters are not converted to Unicode, but remain as the raw
    137      //    bytes (concatenated into an int) from the codepage data.
    138      //
    139      //  For Asian charsets, use the raw input rather than the input that has been
    140      //   stripped of markup.  Detection only considers multi-byte chars, effectively
    141      //   stripping markup anyway, and double byte chars do occur in markup too.
    142      //
    143      static class iteratedChar {
    144          int             charValue = 0;             // 1-4 bytes from the raw input data
    145          int             nextIndex = 0;
    146          boolean         error     = false;
    147          boolean         done      = false;
    148 
    149          void reset() {
    150              charValue = 0;
    151              nextIndex = 0;
    152              error     = false;
    153              done      = false;
    154          }
    155 
    156          int nextByte(CharsetDetector det) {
    157              if (nextIndex >= det.fRawLength) {
    158                  done = true;
    159                  return -1;
    160              }
    161              int byteValue = det.fRawInput[nextIndex++] & 0x00ff;
    162              return byteValue;
    163          }
    164      }
    165 
    166      /**
    167       * Get the next character (however many bytes it is) from the input data
    168       *    Subclasses for specific charset encodings must implement this function
    169       *    to get characters according to the rules of their encoding scheme.
    170       *
    171       *  This function is not a method of class iteratedChar only because
    172       *   that would require a lot of extra derived classes, which is awkward.
    173       * @param it  The iteratedChar "struct" into which the returned char is placed.
    174       * @param det The charset detector, which is needed to get at the input byte data
    175       *            being iterated over.
    176       * @return    True if a character was returned, false at end of input.
    177       */
    178      abstract boolean nextChar(iteratedChar it, CharsetDetector det);
    179 
    180 
    181 
    182 
    183 
    184      /**
    185       *   Shift-JIS charset recognizer.
    186       *
    187       */
    188      static class CharsetRecog_sjis extends CharsetRecog_mbcs {
    189          static int [] commonChars =
    190              // TODO:  This set of data comes from the character frequency-
    191              //        of-occurence analysis tool.  The data needs to be moved
    192              //        into a resource and loaded from there.
    193             {0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
    194              0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
    195              0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
    196              0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
    197              0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
    198              0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
    199 
    200          @Override
    201         boolean nextChar(iteratedChar it, CharsetDetector det) {
    202              it.error = false;
    203              int firstByte;
    204              firstByte = it.charValue = it.nextByte(det);
    205              if (firstByte < 0) {
    206                  return false;
    207              }
    208 
    209              if (firstByte <= 0x7f || (firstByte>0xa0 && firstByte<=0xdf)) {
    210                  return true;
    211              }
    212 
    213              int secondByte = it.nextByte(det);
    214              if (secondByte < 0)  {
    215                  return false;
    216              }
    217              it.charValue = (firstByte << 8) | secondByte;
    218              if (! ((secondByte>=0x40 && secondByte<=0x7f) || (secondByte>=0x80 && secondByte<=0xff))) {
    219                  // Illegal second byte value.
    220                  it.error = true;
    221              }
    222              return true;
    223          }
    224 
    225          @Override
    226         CharsetMatch match(CharsetDetector det) {
    227              int confidence = match(det, commonChars);
    228              return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
    229          }
    230 
    231          @Override
    232         String getName() {
    233              return "Shift_JIS";
    234          }
    235 
    236          @Override
    237         public String getLanguage()
    238          {
    239              return "ja";
    240          }
    241 
    242 
    243      }
    244 
    245 
    246      /**
    247       *   Big5 charset recognizer.
    248       *
    249       */
    250      static class CharsetRecog_big5 extends CharsetRecog_mbcs {
    251          static int [] commonChars =
    252              // TODO:  This set of data comes from the character frequency-
    253              //        of-occurence analysis tool.  The data needs to be moved
    254              //        into a resource and loaded from there.
    255             {0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
    256              0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
    257              0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
    258              0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
    259              0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
    260              0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
    261              0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
    262              0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
    263              0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
    264              0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
    265 
    266          @Override
    267         boolean nextChar(iteratedChar it, CharsetDetector det) {
    268              it.error = false;
    269              int firstByte;
    270              firstByte = it.charValue = it.nextByte(det);
    271              if (firstByte < 0) {
    272                  return false;
    273              }
    274 
    275              if (firstByte <= 0x7f || firstByte==0xff) {
    276                  // single byte character.
    277                  return true;
    278              }
    279 
    280              int secondByte = it.nextByte(det);
    281              if (secondByte < 0)  {
    282                  return false;
    283              }
    284              it.charValue = (it.charValue << 8) | secondByte;
    285 
    286              if (secondByte < 0x40 ||
    287                  secondByte ==0x7f ||
    288                  secondByte == 0xff) {
    289                      it.error = true;
    290              }
    291              return true;
    292          }
    293 
    294          @Override
    295         CharsetMatch match(CharsetDetector det) {
    296              int confidence = match(det, commonChars);
    297              return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
    298          }
    299 
    300          @Override
    301         String getName() {
    302              return "Big5";
    303          }
    304 
    305 
    306          @Override
    307         public String getLanguage()
    308          {
    309              return "zh";
    310          }
    311      }
    312 
    313 
    314      /**
    315       *   EUC charset recognizers.  One abstract class that provides the common function
    316       *             for getting the next character according to the EUC encoding scheme,
    317       *             and nested derived classes for EUC_KR, EUC_JP, EUC_CN.
    318       *
    319       */
    320      abstract static class CharsetRecog_euc extends CharsetRecog_mbcs {
    321 
    322          /*
    323           *  (non-Javadoc)
    324           *  Get the next character value for EUC based encodings.
    325           *  Character "value" is simply the raw bytes that make up the character
    326           *     packed into an int.
    327           */
    328          @Override
    329         boolean nextChar(iteratedChar it, CharsetDetector det) {
    330              it.error = false;
    331              int firstByte  = 0;
    332              int secondByte = 0;
    333              int thirdByte  = 0;
    334              //int fourthByte = 0;
    335 
    336              buildChar: {
    337                  firstByte = it.charValue = it.nextByte(det);
    338                  if (firstByte < 0) {
    339                      // Ran off the end of the input data
    340                      it.done = true;
    341                      break buildChar;
    342                  }
    343                  if (firstByte <= 0x8d) {
    344                      // single byte char
    345                      break buildChar;
    346                  }
    347 
    348                  secondByte = it.nextByte(det);
    349                  it.charValue = (it.charValue << 8) | secondByte;
    350 
    351                  if (firstByte >= 0xA1 && firstByte <= 0xfe) {
    352                      // Two byte Char
    353                      if (secondByte < 0xa1) {
    354                          it.error = true;
    355                      }
    356                      break buildChar;
    357                  }
    358                  if (firstByte == 0x8e) {
    359                      // Code Set 2.
    360                      //   In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
    361                      //   In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
    362                      // We don't know which we've got.
    363                      // Treat it like EUC-JP.  If the data really was EUC-TW, the following two
    364                      //   bytes will look like a well formed 2 byte char.
    365                      if (secondByte < 0xa1) {
    366                          it.error = true;
    367                      }
    368                      break buildChar;
    369                  }
    370 
    371                  if (firstByte == 0x8f) {
    372                      // Code set 3.
    373                      // Three byte total char size, two bytes of actual char value.
    374                      thirdByte    = it.nextByte(det);
    375                      it.charValue = (it.charValue << 8) | thirdByte;
    376                      if (thirdByte < 0xa1) {
    377                          it.error = true;
    378                      }
    379                  }
    380               }
    381 
    382              return (it.done == false);
    383          }
    384 
    385          /**
    386           * The charset recognize for EUC-JP.  A singleton instance of this class
    387           *    is created and kept by the public CharsetDetector class
    388           */
    389          static class CharsetRecog_euc_jp extends CharsetRecog_euc {
    390              static int [] commonChars =
    391                  // TODO:  This set of data comes from the character frequency-
    392                  //        of-occurence analysis tool.  The data needs to be moved
    393                  //        into a resource and loaded from there.
    394                 {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
    395                  0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
    396                  0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
    397                  0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
    398                  0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
    399                  0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
    400                  0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
    401                  0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
    402                  0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
    403                  0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
    404              @Override
    405             String getName() {
    406                  return "EUC-JP";
    407              }
    408 
    409              @Override
    410             CharsetMatch match(CharsetDetector det) {
    411                  int confidence = match(det, commonChars);
    412                  return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
    413              }
    414 
    415              @Override
    416             public String getLanguage()
    417              {
    418                  return "ja";
    419              }
    420          }
    421 
    422          /**
    423           * The charset recognize for EUC-KR.  A singleton instance of this class
    424           *    is created and kept by the public CharsetDetector class
    425           */
    426          static class CharsetRecog_euc_kr extends CharsetRecog_euc {
    427              static int [] commonChars =
    428                  // TODO:  This set of data comes from the character frequency-
    429                  //        of-occurence analysis tool.  The data needs to be moved
    430                  //        into a resource and loaded from there.
    431                 {0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
    432                  0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
    433                  0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
    434                  0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
    435                  0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
    436                  0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
    437                  0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
    438                  0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
    439                  0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
    440                  0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
    441 
    442              @Override
    443             String getName() {
    444                  return "EUC-KR";
    445              }
    446 
    447              @Override
    448             CharsetMatch match(CharsetDetector det) {
    449                  int confidence = match(det, commonChars);
    450                  return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
    451              }
    452 
    453              @Override
    454             public String getLanguage()
    455              {
    456                  return "ko";
    457              }
    458          }
    459      }
    460 
    461      /**
    462       *
    463       *   GB-18030 recognizer. Uses simplified Chinese statistics.
    464       *
    465       */
    466      static class CharsetRecog_gb_18030 extends CharsetRecog_mbcs {
    467 
    468          /*
    469           *  (non-Javadoc)
    470           *  Get the next character value for EUC based encodings.
    471           *  Character "value" is simply the raw bytes that make up the character
    472           *     packed into an int.
    473           */
    474          @Override
    475         boolean nextChar(iteratedChar it, CharsetDetector det) {
    476              it.error = false;
    477              int firstByte  = 0;
    478              int secondByte = 0;
    479              int thirdByte  = 0;
    480              int fourthByte = 0;
    481 
    482              buildChar: {
    483                  firstByte = it.charValue = it.nextByte(det);
    484 
    485                  if (firstByte < 0) {
    486                      // Ran off the end of the input data
    487                      it.done = true;
    488                      break buildChar;
    489                  }
    490 
    491                  if (firstByte <= 0x80) {
    492                      // single byte char
    493                      break buildChar;
    494                  }
    495 
    496                  secondByte = it.nextByte(det);
    497                  it.charValue = (it.charValue << 8) | secondByte;
    498 
    499                  if (firstByte >= 0x81 && firstByte <= 0xFE) {
    500                      // Two byte Char
    501                      if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <=0xFE)) {
    502                          break buildChar;
    503                      }
    504 
    505                      // Four byte char
    506                      if (secondByte >= 0x30 && secondByte <= 0x39) {
    507                          thirdByte = it.nextByte(det);
    508 
    509                          if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
    510                              fourthByte = it.nextByte(det);
    511 
    512                              if (fourthByte >= 0x30 && fourthByte <= 0x39) {
    513                                  it.charValue = (it.charValue << 16) | (thirdByte << 8) | fourthByte;
    514                                  break buildChar;
    515                              }
    516                          }
    517                      }
    518 
    519                      it.error = true;
    520                      break buildChar;
    521                  }
    522              }
    523 
    524              return (it.done == false);
    525          }
    526 
    527          static int [] commonChars =
    528              // TODO:  This set of data comes from the character frequency-
    529              //        of-occurence analysis tool.  The data needs to be moved
    530              //        into a resource and loaded from there.
    531             {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
    532              0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
    533              0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
    534              0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
    535              0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
    536              0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
    537              0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
    538              0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
    539              0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
    540              0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
    541 
    542 
    543          @Override
    544         String getName() {
    545              return "GB18030";
    546          }
    547 
    548          @Override
    549         CharsetMatch match(CharsetDetector det) {
    550              int confidence = match(det, commonChars);
    551              return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
    552          }
    553 
    554          @Override
    555         public String getLanguage()
    556          {
    557              return "zh";
    558          }
    559      }
    560 
    561 
    562 }
    563