Home | History | Annotate | Download | only in mbcs
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4  ***********************************************************************
      5  *
      6  * Copyright (C) 2005-2012, International Business Machines Corporation and
      7  * others. All Rights Reserved.
      8  *
      9  ***********************************************************************
     10  *
     11  * euc_tool
     12  *
     13  *    This tool produces the character usage frequency statistics for the EUC family
     14  *    of charsets, for use by the ICU charset detectors.
     15  *
     16  *    usage:  java euc_tool [-d] [directory path]
     17  *
     18  *        -d:   Produce the data in a form to be exported to the ICU implementation
     19  *              Default is to produce an informative dump.
     20  *
     21  *        directory path
     22  *              Source directory for the files to be analyzed.
     23  *              Default is the current directory.
     24  *              There should be three subdirectories under the specified directory, one
     25  *              each for EUC_JP, EUC_CN and EUC_KR.  Within each of these subdirectories
     26  *              should be text files in the specified encoding.
     27  *
     28  */
     29 
     30 package com.ibm.icu.dev.tool.charsetdet.mbcs;
     31 
     32 import java.io.File;
     33 import java.io.FileInputStream;
     34 import java.util.ArrayList;
     35 import java.util.Arrays;
     36 import java.util.HashMap;
     37 import java.util.List;
     38 
     39 public class EUCTool {
     40 
     41     // The file buffer and file data length need to be out in class member variables
     42     //  so that the code lifted from charSet detection for scanning the multi-byte chars
     43     //  can see them conveniently.
     44     byte []    buf = new byte[1000000];
     45     int        fileSize;
     46 
     47     boolean    option_d = false;    // data option.  Produce exportable data
     48     boolean    option_v = true;     // verbose informaional output.
     49 
     50 
     51 
     52     public static void main(String[] args) {
     53         EUCTool  This = new EUCTool();
     54         This.Main(args);
     55     }
     56 
     57 
     58 
     59     void Main(String[] args) {
     60         int i;
     61 
     62         //
     63         //   Command Line Option Handling
     64         //
     65         String     dirName  = ".";
     66         for (i=0; i<args.length; i++) {
     67             if (args[i].equals("-d")) {
     68                 option_d = true;
     69                 option_v = false;
     70                 continue;
     71             }
     72             if (args[i].startsWith("-")) {
     73                 System.err.println("Unrecongized option: " + args[i]);
     74                 System.exit(-1);
     75             }
     76             dirName = args[i];
     77         }
     78 
     79         //
     80         //  Verify that the specified directory exists.
     81         //
     82         File dir = new File(dirName);
     83         if (dir.isDirectory() == false) {
     84             System.err.println("\"" + dirName + "\" is not a directory");
     85             System.exit(-1);
     86         }
     87 
     88         //
     89         //  Do each subdirectory of the specified directory.  There should be
     90         //    one per each encoding - euc-kr, euc-cn, euc-jp
     91         //
     92         File[] dirs  = dir.listFiles();
     93         for (i=0; i<dirs.length; i++) {
     94             if (dirs[i].isDirectory()) {
     95                 String nam = dirs[i].getName();
     96                 if (nam.equalsIgnoreCase("CVS")) {
     97                     continue;
     98                 }
     99                 processDir(dirs[i]);
    100             }
    101         }
    102     }
    103 
    104     //
    105     // Collect statistics from all ordinary files in a specified directory.
    106     //
    107     void processDir(File dir) {
    108         int      totalMbcsChars  = 0;
    109         HashMap  m = new HashMap(10000);
    110         int      i;
    111 
    112         System.out.println(dir.getName());
    113         File[] files = dir.listFiles();
    114         for (i=0; i<files.length; i++) {
    115             FileInputStream is = null;
    116             try {
    117                 if (files[i].isFile()) {
    118                     is = new FileInputStream(files[i]);
    119                     fileSize = is.read(buf);
    120                     if (option_v) {
    121                         System.out.println(files[i].getPath());
    122                         System.out.println("  " + fileSize + " bytes.");
    123                     }
    124                     iteratedChar ichar = new iteratedChar();
    125                     int fileChars     = 0;
    126                     int fileMbcsChars = 0;
    127                     int errs          = 0;
    128 
    129                     while (nextChar(ichar)) {
    130                         if (ichar.error == true) {
    131                             errs++;
    132                             continue;
    133                         }
    134                         fileChars++;
    135                         if (ichar.charValue > 255) {
    136                             fileMbcsChars++;
    137                             totalMbcsChars++;
    138                         }
    139                         if (ichar.charValue <= 255) {
    140                             // Don't keep occurence statistics for the single byte range
    141                             continue;
    142                         }
    143 
    144                         //
    145                         //  Frequency of occurence statistics are accumulated in a map.
    146                         //
    147                         ChEl  keyEl = new ChEl(ichar.charValue, 0);
    148                         ChEl  valEl = (ChEl)m.get(keyEl);
    149                         if (valEl == null) {
    150                             m.put(keyEl, keyEl);
    151                             valEl = keyEl;
    152                         }
    153                         valEl.occurences++;
    154                     }
    155                     if (option_v) {
    156                         System.out.println("  " + fileChars     + " Chars");
    157                         System.out.println("  " + fileMbcsChars + " mbcs Chars");
    158                         System.out.println("  " + errs          + " errors");
    159                         System.out.println("\n");
    160                     }
    161                 }
    162             }
    163             catch (Exception e) {
    164                 System.err.println("Exception:" + e);
    165 
    166             }
    167             finally {
    168                 if (is != null) {
    169                     try {
    170                         is.close();
    171                     } catch (Exception e) {
    172                         // ignore
    173                     }
    174                 }
    175             }
    176         }
    177 
    178         //
    179         //  We've processed through all of the files.
    180         //     sort and dump out the frequency statistics.
    181         //
    182         Object [] encounteredChars = m.values().toArray();
    183         Arrays.sort(encounteredChars);
    184         int cumulativeChars = 0;
    185         int cumulativePercent = 0;
    186         if (option_v) {
    187             System.out.println("# <char code> <occurences>  <Cumulative %>");
    188             for (i=0; i<encounteredChars.length; i++) {
    189                 ChEl c = (ChEl)encounteredChars[i];
    190                 cumulativeChars += c.occurences;
    191                 cumulativePercent = cumulativeChars*100/totalMbcsChars;
    192                 System.out.println(i + "   " + Integer.toHexString(c.charCode) + "        "
    193                         + c.occurences + "         " + cumulativePercent);
    194             }
    195         }
    196         if (option_d) {
    197             //
    198             //   Output the list of characters formatted for pasting into a
    199             //     Java source code array initializer.
    200             //     Resort into order based on the character code value, not
    201             //      on frequency of occurence.
    202             //
    203             List  charList = new ArrayList();
    204 
    205             for (i=0; i<100 && cumulativePercent<50; i++) {
    206                 ChEl c = (ChEl)encounteredChars[i];
    207                 cumulativeChars += c.occurences;
    208                 cumulativePercent = cumulativeChars*100/totalMbcsChars;
    209                 charList.add(new Integer(c.charCode));
    210             }
    211             Object [] sortedChars = charList.toArray();
    212             Arrays.sort(sortedChars);
    213 
    214             System.out.print("          {");
    215             for (i=0; i<sortedChars.length; i++) {
    216                 if (i != 0) {
    217                     System.out.print(", ");
    218                     if ((i)%10 == 0) {
    219                         System.out.print("\n           ");
    220                     }
    221                 }
    222                 int cp = ((Integer)sortedChars[i]).intValue();
    223                 System.out.print("0x" + Integer.toHexString(cp));
    224             }
    225             System.out.println("};");
    226         }
    227     }
    228 
    229     //
    230     //  This is a little class containing a
    231     //    multi-byte character value and an occurence count for that char.
    232     //  Instances of this class are kept in the collection that accumulates statistics
    233     //
    234     //  WARNING:  this class's natural ordering (from Comparable) and equals()
    235     //            are inconsistent.
    236 
    237     static class ChEl implements Comparable {
    238         int charCode;
    239         int occurences;
    240 
    241         ChEl(int c, int o) {
    242             charCode = c;
    243             occurences = o;
    244         }
    245 
    246         // Equals needs to work with a map, with the charCode as the key.
    247         //   For insertion/lookup, we care about the char code only, not the occurence count.
    248         public boolean equals(Object other) {
    249             ChEl o = (ChEl)other;
    250             return o.charCode == this.charCode;
    251         }
    252 
    253         // Hashcode needs to be compatible with equals
    254         //   We're using this in a hashMap!
    255         public int hashCode() {
    256             return charCode;
    257         }
    258 
    259         // We want to be able to sort the results by frequency of occurence
    260         //   Compare backwards.  We want most frequent chars first.
    261         public int compareTo(Object other) {
    262             ChEl o = (ChEl)other;
    263             return (this.occurences> o.occurences? -1 :
    264                    (this.occurences==o.occurences?  0 : 1));
    265         }
    266 
    267     }
    268 
    269     //
    270     // iteratedChar is copied and slightly hacked from the similar calss in CharsetRecog_mbcs
    271     //              Pulls out one logical char according to the rules of EUC encoding.
    272     //
    273     class iteratedChar {
    274         int             charValue = 0;             // The char value is a value from the encoding.
    275                                                    //   It's meaning is not well defined, other than
    276                                                    //   different encodings
    277         int             index     = 0;
    278         int             nextIndex = 0;
    279         boolean         error     = false;
    280         boolean         done      = false;
    281 
    282         void reset() {
    283             charValue = 0;
    284             index     = -1;
    285             nextIndex = 0;
    286             error     = false;
    287             done      = false;
    288         }
    289 
    290         int nextByte() {
    291             if (nextIndex >= fileSize) {
    292                 done = true;
    293                 return -1;
    294             }
    295             int byteValue = (int)buf[nextIndex++] & 0x00ff;
    296             return byteValue;
    297         }
    298     }
    299 
    300 
    301     boolean nextChar(iteratedChar it) {
    302         it.index = it.nextIndex;
    303         it.error = false;
    304         int firstByte  = 0;
    305         int secondByte = 0;
    306         int thirdByte  = 0;
    307         int fourthByte = 0;
    308 
    309         buildChar: {
    310             firstByte = it.charValue = it.nextByte();
    311             if (firstByte < 0) {
    312                 // Ran off the end of the input data
    313                 it.done = true;
    314                 break buildChar;
    315             }
    316             if (firstByte <= 0x8d) {
    317                 // single byte char
    318                 break buildChar;
    319             }
    320 
    321             secondByte = it.nextByte();
    322             it.charValue = (it.charValue << 8) | secondByte;
    323 
    324             if (firstByte >= 0xA1 && firstByte <= 0xfe) {
    325                 // Two byte Char
    326                 if (secondByte < 0xa1) {
    327                     it.error = true;
    328                 }
    329                 break buildChar;
    330             }
    331             if (firstByte == 0x8e) {
    332                 // Code Set 2.
    333                 //   In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
    334                 //   In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
    335                 // We don't know which we've got.
    336                 // Treat it like EUC-JP.  If the data really was EUC-TW, the following two
    337                 //   bytes will look like a well formed 2 byte char.
    338                 if (secondByte < 0xa1) {
    339                     it.error = true;
    340                 }
    341                 break buildChar;
    342             }
    343 
    344             if (firstByte == 0x8f) {
    345                 // Code set 3.
    346                 // Three byte total char size, two bytes of actual char value.
    347                 thirdByte    = it.nextByte();
    348                 it.charValue = (it.charValue << 8) | thirdByte;
    349                 if (thirdByte < 0xa1) {
    350                     it.error = true;
    351                 }
    352             }
    353 
    354         }
    355         if (it.error) {
    356             System.out.println("Error " + Integer.toHexString(firstByte) + " " + Integer.toHexString(secondByte)
    357                     + " " +  Integer.toHexString(thirdByte) + " " + Integer.toHexString(fourthByte));
    358         }
    359         return (it.done == false);
    360     }
    361 }
    362 
    363 
    364 
    365 
    366