Home | History | Annotate | Download | only in text
      1 /* GENERATED SOURCE. DO NOT MODIFY. */
      2 //  2016 and later: Unicode, Inc. and others.
      3 // License & terms of use: http://www.unicode.org/copyright.html#License
      4 /*
      5  ****************************************************************************
      6  * Copyright (C) 2005-2012, International Business Machines Corporation and *
      7  * others. All Rights Reserved.                                             *
      8  ****************************************************************************
      9  *
     10  */
     11 package android.icu.text;
     12 
     13 import java.util.Arrays;
     14 
     15 /**
      16  * CharsetRecognizer implementation for Asian - double or multi-byte - charsets.
      17  *                   Match is determined mostly by the input data adhering to the
      18  *                   encoding scheme for the charset, and, optionally,
      19  *                   frequency-of-occurrence of characters.
     20  * <p/>
     21  *                   Instances of this class are singletons, one per encoding
     22  *                   being recognized.  They are created in the main
     23  *                   CharsetDetector class and kept in the global list of available
     24  *                   encodings to be checked.  The specific encoding being recognized
     25  *                   is determined by subclass.
     26  */
     27 abstract class CharsetRecog_mbcs extends CharsetRecognizer {
     28 
     29    /**
     30      * Get the IANA name of this charset.
     31      * @return the charset name.
     32      */
     33     @Override
     34     abstract String      getName() ;
     35 
     36 
     37     /**
     38      * Test the match of this charset with the input text data
     39      *      which is obtained via the CharsetDetector object.
     40      *
     41      * @param det  The CharsetDetector, which contains the input text
     42      *             to be checked for being in this charset.
     43      * @return     Two values packed into one int  (Damn java, anyhow)
     44      *             <br/>
     45      *             bits 0-7:  the match confidence, ranging from 0-100
     46      *             <br/>
     47      *             bits 8-15: The match reason, an enum-like value.
     48      */
     49     int match(CharsetDetector det, int [] commonChars) {
     50         @SuppressWarnings("unused")
     51         int   singleByteCharCount = 0;  //TODO Do we really need this?
     52         int   doubleByteCharCount = 0;
     53         int   commonCharCount     = 0;
     54         int   badCharCount        = 0;
     55         int   totalCharCount      = 0;
     56         int   confidence          = 0;
     57         iteratedChar   iter       = new iteratedChar();
     58 
     59         detectBlock: {
     60             for (iter.reset(); nextChar(iter, det);) {
     61                 totalCharCount++;
     62                 if (iter.error) {
     63                     badCharCount++;
     64                 } else {
     65                     long cv = iter.charValue & 0xFFFFFFFFL;
     66 
     67                     if (cv <= 0xff) {
     68                         singleByteCharCount++;
     69                     } else {
     70                         doubleByteCharCount++;
     71                         if (commonChars != null) {
     72                             // NOTE: This assumes that there are no 4-byte common chars.
     73                             if (Arrays.binarySearch(commonChars, (int) cv) >= 0) {
     74                                 commonCharCount++;
     75                             }
     76                         }
     77                     }
     78                 }
     79                 if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
     80                     // Bail out early if the byte data is not matching the encoding scheme.
     81                     break detectBlock;
     82                 }
     83             }
     84 
     85             if (doubleByteCharCount <= 10 && badCharCount== 0) {
     86                 // Not many multi-byte chars.
     87                 if (doubleByteCharCount == 0 && totalCharCount < 10) {
     88                     // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
     89                     // We don't have enough data to have any confidence.
     90                     // Statistical analysis of single byte non-ASCII charcters would probably help here.
     91                     confidence = 0;
     92                 }
     93                 else {
     94                     //   ASCII or ISO file?  It's probably not our encoding,
     95                     //   but is not incompatible with our encoding, so don't give it a zero.
     96                     confidence = 10;
     97                 }
     98 
     99                 break detectBlock;
    100             }
    101 
    102             //
    103             //  No match if there are too many characters that don't fit the encoding scheme.
    104             //    (should we have zero tolerance for these?)
    105             //
    106             if (doubleByteCharCount < 20*badCharCount) {
    107                 confidence = 0;
    108                 break detectBlock;
    109             }
    110 
    111             if (commonChars == null) {
    112                 // We have no statistics on frequently occuring characters.
    113                 //  Assess confidence purely on having a reasonable number of
    114                 //  multi-byte characters (the more the better
    115                 confidence = 30 + doubleByteCharCount - 20*badCharCount;
    116                 if (confidence > 100) {
    117                     confidence = 100;
    118                 }
    119             }else {
    120                 //
    121                 // Frequency of occurence statistics exist.
    122                 //
    123                 double maxVal = Math.log((float)doubleByteCharCount / 4);
    124                 double scaleFactor = 90.0 / maxVal;
    125                 confidence = (int)(Math.log(commonCharCount+1) * scaleFactor + 10);
    126                 confidence = Math.min(confidence, 100);
    127             }
    128         }   // end of detectBlock:
    129 
    130         return confidence;
    131     }
    132 
    133      // "Character"  iterated character class.
    134      //    Recognizers for specific mbcs encodings make their "characters" available
    135      //    by providing a nextChar() function that fills in an instance of iteratedChar
    136      //    with the next char from the input.
    137      //    The returned characters are not converted to Unicode, but remain as the raw
    138      //    bytes (concatenated into an int) from the codepage data.
    139      //
    140      //  For Asian charsets, use the raw input rather than the input that has been
    141      //   stripped of markup.  Detection only considers multi-byte chars, effectively
    142      //   stripping markup anyway, and double byte chars do occur in markup too.
    143      //
    144      static class iteratedChar {
    145          int             charValue = 0;             // 1-4 bytes from the raw input data
    146          int             nextIndex = 0;
    147          boolean         error     = false;
    148          boolean         done      = false;
    149 
    150          void reset() {
    151              charValue = 0;
    152              nextIndex = 0;
    153              error     = false;
    154              done      = false;
    155          }
    156 
    157          int nextByte(CharsetDetector det) {
    158              if (nextIndex >= det.fRawLength) {
    159                  done = true;
    160                  return -1;
    161              }
    162              int byteValue = det.fRawInput[nextIndex++] & 0x00ff;
    163              return byteValue;
    164          }
    165      }
    166 
    167      /**
    168       * Get the next character (however many bytes it is) from the input data
    169       *    Subclasses for specific charset encodings must implement this function
    170       *    to get characters according to the rules of their encoding scheme.
    171       *
    172       *  This function is not a method of class iteratedChar only because
    173       *   that would require a lot of extra derived classes, which is awkward.
    174       * @param it  The iteratedChar "struct" into which the returned char is placed.
    175       * @param det The charset detector, which is needed to get at the input byte data
    176       *            being iterated over.
    177       * @return    True if a character was returned, false at end of input.
    178       */
    179      abstract boolean nextChar(iteratedChar it, CharsetDetector det);
    180 
    181 
    182 
    183 
    184 
    185      /**
    186       *   Shift-JIS charset recognizer.
    187       *
    188       */
    189      static class CharsetRecog_sjis extends CharsetRecog_mbcs {
    190          static int [] commonChars =
    191              // TODO:  This set of data comes from the character frequency-
    192              //        of-occurence analysis tool.  The data needs to be moved
    193              //        into a resource and loaded from there.
    194             {0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
    195              0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
    196              0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
    197              0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
    198              0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
    199              0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
    200 
    201          @Override
    202         boolean nextChar(iteratedChar it, CharsetDetector det) {
    203              it.error = false;
    204              int firstByte;
    205              firstByte = it.charValue = it.nextByte(det);
    206              if (firstByte < 0) {
    207                  return false;
    208              }
    209 
    210              if (firstByte <= 0x7f || (firstByte>0xa0 && firstByte<=0xdf)) {
    211                  return true;
    212              }
    213 
    214              int secondByte = it.nextByte(det);
    215              if (secondByte < 0)  {
    216                  return false;
    217              }
    218              it.charValue = (firstByte << 8) | secondByte;
    219              if (! ((secondByte>=0x40 && secondByte<=0x7f) || (secondByte>=0x80 && secondByte<=0xff))) {
    220                  // Illegal second byte value.
    221                  it.error = true;
    222              }
    223              return true;
    224          }
    225 
    226          @Override
    227         CharsetMatch match(CharsetDetector det) {
    228              int confidence = match(det, commonChars);
    229              return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
    230          }
    231 
    232          @Override
    233         String getName() {
    234              return "Shift_JIS";
    235          }
    236 
    237          @Override
    238         public String getLanguage()
    239          {
    240              return "ja";
    241          }
    242 
    243 
    244      }
    245 
    246 
    247      /**
    248       *   Big5 charset recognizer.
    249       *
    250       */
    251      static class CharsetRecog_big5 extends CharsetRecog_mbcs {
    252          static int [] commonChars =
    253              // TODO:  This set of data comes from the character frequency-
    254              //        of-occurence analysis tool.  The data needs to be moved
    255              //        into a resource and loaded from there.
    256             {0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
    257              0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
    258              0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
    259              0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
    260              0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
    261              0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
    262              0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
    263              0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
    264              0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
    265              0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
    266 
    267          @Override
    268         boolean nextChar(iteratedChar it, CharsetDetector det) {
    269              it.error = false;
    270              int firstByte;
    271              firstByte = it.charValue = it.nextByte(det);
    272              if (firstByte < 0) {
    273                  return false;
    274              }
    275 
    276              if (firstByte <= 0x7f || firstByte==0xff) {
    277                  // single byte character.
    278                  return true;
    279              }
    280 
    281              int secondByte = it.nextByte(det);
    282              if (secondByte < 0)  {
    283                  return false;
    284              }
    285              it.charValue = (it.charValue << 8) | secondByte;
    286 
    287              if (secondByte < 0x40 ||
    288                  secondByte ==0x7f ||
    289                  secondByte == 0xff) {
    290                      it.error = true;
    291              }
    292              return true;
    293          }
    294 
    295          @Override
    296         CharsetMatch match(CharsetDetector det) {
    297              int confidence = match(det, commonChars);
    298              return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
    299          }
    300 
    301          @Override
    302         String getName() {
    303              return "Big5";
    304          }
    305 
    306 
    307          @Override
    308         public String getLanguage()
    309          {
    310              return "zh";
    311          }
    312      }
    313 
    314 
    315      /**
    316       *   EUC charset recognizers.  One abstract class that provides the common function
    317       *             for getting the next character according to the EUC encoding scheme,
    318       *             and nested derived classes for EUC_KR, EUC_JP, EUC_CN.
    319       *
    320       */
    321      abstract static class CharsetRecog_euc extends CharsetRecog_mbcs {
    322 
    323          /*
    324           *  (non-Javadoc)
    325           *  Get the next character value for EUC based encodings.
    326           *  Character "value" is simply the raw bytes that make up the character
    327           *     packed into an int.
    328           */
    329          @Override
    330         boolean nextChar(iteratedChar it, CharsetDetector det) {
    331              it.error = false;
    332              int firstByte  = 0;
    333              int secondByte = 0;
    334              int thirdByte  = 0;
    335              //int fourthByte = 0;
    336 
    337              buildChar: {
    338                  firstByte = it.charValue = it.nextByte(det);
    339                  if (firstByte < 0) {
    340                      // Ran off the end of the input data
    341                      it.done = true;
    342                      break buildChar;
    343                  }
    344                  if (firstByte <= 0x8d) {
    345                      // single byte char
    346                      break buildChar;
    347                  }
    348 
    349                  secondByte = it.nextByte(det);
    350                  it.charValue = (it.charValue << 8) | secondByte;
    351 
    352                  if (firstByte >= 0xA1 && firstByte <= 0xfe) {
    353                      // Two byte Char
    354                      if (secondByte < 0xa1) {
    355                          it.error = true;
    356                      }
    357                      break buildChar;
    358                  }
    359                  if (firstByte == 0x8e) {
    360                      // Code Set 2.
    361                      //   In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
    362                      //   In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
    363                      // We don't know which we've got.
    364                      // Treat it like EUC-JP.  If the data really was EUC-TW, the following two
    365                      //   bytes will look like a well formed 2 byte char.
    366                      if (secondByte < 0xa1) {
    367                          it.error = true;
    368                      }
    369                      break buildChar;
    370                  }
    371 
    372                  if (firstByte == 0x8f) {
    373                      // Code set 3.
    374                      // Three byte total char size, two bytes of actual char value.
    375                      thirdByte    = it.nextByte(det);
    376                      it.charValue = (it.charValue << 8) | thirdByte;
    377                      if (thirdByte < 0xa1) {
    378                          it.error = true;
    379                      }
    380                  }
    381               }
    382 
    383              return (it.done == false);
    384          }
    385 
    386          /**
    387           * The charset recognize for EUC-JP.  A singleton instance of this class
    388           *    is created and kept by the public CharsetDetector class
    389           */
    390          static class CharsetRecog_euc_jp extends CharsetRecog_euc {
    391              static int [] commonChars =
    392                  // TODO:  This set of data comes from the character frequency-
    393                  //        of-occurence analysis tool.  The data needs to be moved
    394                  //        into a resource and loaded from there.
    395                 {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
    396                  0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
    397                  0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
    398                  0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
    399                  0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
    400                  0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
    401                  0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
    402                  0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
    403                  0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
    404                  0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
    405              @Override
    406             String getName() {
    407                  return "EUC-JP";
    408              }
    409 
    410              @Override
    411             CharsetMatch match(CharsetDetector det) {
    412                  int confidence = match(det, commonChars);
    413                  return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
    414              }
    415 
    416              @Override
    417             public String getLanguage()
    418              {
    419                  return "ja";
    420              }
    421          }
    422 
    423          /**
    424           * The charset recognize for EUC-KR.  A singleton instance of this class
    425           *    is created and kept by the public CharsetDetector class
    426           */
    427          static class CharsetRecog_euc_kr extends CharsetRecog_euc {
    428              static int [] commonChars =
    429                  // TODO:  This set of data comes from the character frequency-
    430                  //        of-occurence analysis tool.  The data needs to be moved
    431                  //        into a resource and loaded from there.
    432                 {0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
    433                  0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
    434                  0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
    435                  0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
    436                  0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
    437                  0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
    438                  0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
    439                  0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
    440                  0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
    441                  0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
    442 
    443              @Override
    444             String getName() {
    445                  return "EUC-KR";
    446              }
    447 
    448              @Override
    449             CharsetMatch match(CharsetDetector det) {
    450                  int confidence = match(det, commonChars);
    451                  return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
    452              }
    453 
    454              @Override
    455             public String getLanguage()
    456              {
    457                  return "ko";
    458              }
    459          }
    460      }
    461 
    462      /**
    463       *
    464       *   GB-18030 recognizer. Uses simplified Chinese statistics.
    465       *
    466       */
    467      static class CharsetRecog_gb_18030 extends CharsetRecog_mbcs {
    468 
    469          /*
    470           *  (non-Javadoc)
    471           *  Get the next character value for EUC based encodings.
    472           *  Character "value" is simply the raw bytes that make up the character
    473           *     packed into an int.
    474           */
    475          @Override
    476         boolean nextChar(iteratedChar it, CharsetDetector det) {
    477              it.error = false;
    478              int firstByte  = 0;
    479              int secondByte = 0;
    480              int thirdByte  = 0;
    481              int fourthByte = 0;
    482 
    483              buildChar: {
    484                  firstByte = it.charValue = it.nextByte(det);
    485 
    486                  if (firstByte < 0) {
    487                      // Ran off the end of the input data
    488                      it.done = true;
    489                      break buildChar;
    490                  }
    491 
    492                  if (firstByte <= 0x80) {
    493                      // single byte char
    494                      break buildChar;
    495                  }
    496 
    497                  secondByte = it.nextByte(det);
    498                  it.charValue = (it.charValue << 8) | secondByte;
    499 
    500                  if (firstByte >= 0x81 && firstByte <= 0xFE) {
    501                      // Two byte Char
    502                      if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <=0xFE)) {
    503                          break buildChar;
    504                      }
    505 
    506                      // Four byte char
    507                      if (secondByte >= 0x30 && secondByte <= 0x39) {
    508                          thirdByte = it.nextByte(det);
    509 
    510                          if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
    511                              fourthByte = it.nextByte(det);
    512 
    513                              if (fourthByte >= 0x30 && fourthByte <= 0x39) {
    514                                  it.charValue = (it.charValue << 16) | (thirdByte << 8) | fourthByte;
    515                                  break buildChar;
    516                              }
    517                          }
    518                      }
    519 
    520                      it.error = true;
    521                      break buildChar;
    522                  }
    523              }
    524 
    525              return (it.done == false);
    526          }
    527 
    528          static int [] commonChars =
    529              // TODO:  This set of data comes from the character frequency-
    530              //        of-occurence analysis tool.  The data needs to be moved
    531              //        into a resource and loaded from there.
    532             {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
    533              0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
    534              0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
    535              0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
    536              0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
    537              0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
    538              0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
    539              0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
    540              0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
    541              0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
    542 
    543 
    544          @Override
    545         String getName() {
    546              return "GB18030";
    547          }
    548 
    549          @Override
    550         CharsetMatch match(CharsetDetector det) {
    551              int confidence = match(det, commonChars);
    552              return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
    553          }
    554 
    555          @Override
    556         public String getLanguage()
    557          {
    558              return "zh";
    559          }
    560      }
    561 
    562 
    563 }
    564