Home | History | Annotate | Download | only in mbcs
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4  ***********************************************************************
      5  *
      6  * Copyright (C) 2006-2012, International Business Machines Corporation and
      7  * others. All Rights Reserved.
      8  *
      9  ***********************************************************************
     10  *
     11  * BIG5Tool
     12  *
     13  *    This tool produces the character usage frequency statistics for the Big5
     14  *    Chinese charset, for use by the ICU charset detectors.
     15  *
     16  *    usage:  java BIG5Tool [-d] [-sjis] [directory path]
     17  *
     18  *        -d:   Produce the data in a form to be exported to the ICU implementation
     19  *              Default is to produce an informative dump.
     20  *
     21  *        -sjis Do Shift_JIS.  The structure of sjis is very similar to Big5.
     22  *
     23  *        directory path
     24  *              Source directory for the text files to be analyzed.
     25  *              All files in the specified directory must be in the Big5 encoding.
     26  *
     27  */
     28 
     29 package com.ibm.icu.dev.tool.charsetdet.mbcs;
     30 
     31 import java.io.File;
     32 import java.io.FileInputStream;
     33 import java.util.ArrayList;
     34 import java.util.Arrays;
     35 import java.util.HashMap;
     36 import java.util.List;
     37 
     38 
     39 public class BIG5Tool {
     40 
     41     // The file buffer and file data length need to be out in class member variables
     42     //  so that the code lifted from charSet detection for scanning the multi-byte chars
     43     //  can see them conveniently.
     44     byte []    buf = new byte[1000000];
     45     int        fileSize;
     46 
     47     boolean    option_d = false;    // data option.  Produce exportable data
     48     boolean    option_v = true;     // verbose informaional output.
     49     boolean    sjis     = false;    // True if input text files are Shift_JIS encoded.
     50 
     51 
     52 
     53     public static void main(String[] args) {
     54         BIG5Tool  This = new BIG5Tool();
     55         This.Main(args);
     56     }
     57 
     58 
     59 
     60     void Main(String[] args) {
     61         int i;
     62 
     63         //
     64         //   Command Line Option Handling
     65         //
     66         String     dirName  = null;
     67         for (i=0; i<args.length; i++) {
     68             if (args[i].equals("-d")) {
     69                 option_d = true;
     70                 option_v = false;
     71                 continue;
     72             }
     73             if (args[i].equals("-sjis")) {
     74                 sjis = true;
     75                 continue;
     76             }
     77             if (args[i].startsWith("-")) {
     78                 System.err.println("Unrecognized option: " + args[i]);
     79                 System.exit(-1);
     80             }
     81             if (dirName == null) {
     82                 dirName = args[i];
     83             } else {
     84                 System.err.println("Unrecognized option: " + dirName);
     85                 System.exit(-1);
     86             }
     87         }
     88         if (dirName == null) {
     89             dirName = ".";
     90         }
     91 
     92         //
     93         //  Verify that the specified directory exists.
     94         //
     95         File dir = new File(dirName);
     96         if (dir.isDirectory() == false) {
     97             System.err.println("\"" + dirName + "\" is not a directory");
     98             System.exit(-1);
     99         }
    100         processDir(dir);
    101 
    102     }
    103 
    104     //
    105     // Collect statistics from all ordinary files in a specified directory.
    106     //
    107     void processDir(File dir) {
    108         int      totalMbcsChars  = 0;
    109         HashMap  m = new HashMap(10000);
    110         int      i;
    111 
    112         System.out.println(dir.getName());
    113         File[] files = dir.listFiles();
    114         for (i=0; i<files.length; i++) {
    115             FileInputStream is = null;
    116             try {
    117                 if (files[i].isFile()) {
    118                     is = new FileInputStream(files[i]);
    119                     fileSize = is.read(buf);
    120                     if (option_v) {
    121                         System.out.println(files[i].getPath());
    122                         System.out.println("  " + fileSize + " bytes.");
    123                     }
    124                     iteratedChar ichar = new iteratedChar();
    125                     int fileChars     = 0;
    126                     int fileMbcsChars = 0;
    127                     int errs          = 0;
    128 
    129                     while (nextChar(ichar)) {
    130                         if (ichar.error == true) {
    131                             errs++;
    132                             continue;
    133                         }
    134                         fileChars++;
    135                         if (ichar.charValue > 255) {
    136                             fileMbcsChars++;
    137                             totalMbcsChars++;
    138                         }
    139                         if (ichar.charValue <= 255) {
    140                             // Don't keep occurence statistics for the single byte range
    141                             continue;
    142                         }
    143 
    144                         //
    145                         //  Frequency of occurence statistics are accumulated in a map.
    146                         //
    147                         ChEl  keyEl = new ChEl(ichar.charValue, 0);
    148                         ChEl  valEl = (ChEl)m.get(keyEl);
    149                         if (valEl == null) {
    150                             m.put(keyEl, keyEl);
    151                             valEl = keyEl;
    152                         }
    153                         valEl.occurences++;
    154                     }
    155                     if (option_v) {
    156                         System.out.println("  " + fileChars     + " Chars");
    157                         System.out.println("  " + fileMbcsChars + " mbcs Chars");
    158                         System.out.println("  " + errs          + " errors");
    159                         System.out.println("\n");
    160                     }
    161                 }
    162             }
    163             catch (Exception e) {
    164                 System.err.println("Exception:" + e);
    165 
    166             }
    167             finally {
    168                 if (is != null) {
    169                     try {
    170                         is.close();
    171                     } catch (Exception e) {
    172                         // ignore
    173                     }
    174                 }
    175             }
    176         }
    177 
    178         //
    179         //  We've processed through all of the files.
    180         //     sort and dump out the frequency statistics.
    181         //
    182         Object [] encounteredChars = m.values().toArray();
    183         Arrays.sort(encounteredChars);
    184         int cumulativeChars = 0;
    185         int cumulativePercent = 0;
    186         if (option_v) {
    187             System.out.println("# <char code> <occurences>  <Cumulative %>");
    188             for (i=0; i<encounteredChars.length; i++) {
    189                 ChEl c = (ChEl)encounteredChars[i];
    190                 cumulativeChars += c.occurences;
    191                 cumulativePercent = cumulativeChars*100/totalMbcsChars;
    192                 System.out.println(i + "   " + Integer.toHexString(c.charCode) + "        "
    193                         + c.occurences + "         " + cumulativePercent);
    194             }
    195         }
    196         if (option_d) {
    197             //
    198             //   Output the list of characters formatted for pasting into a
    199             //     Java source code array initializer.
    200             //     Resort into order based on the character code value, not
    201             //      on frequency of occurence.
    202             //
    203             List  charList = new ArrayList();
    204 
    205             for (i=0; i<100 && cumulativePercent<50; i++) {
    206                 ChEl c = (ChEl)encounteredChars[i];
    207                 cumulativeChars += c.occurences;
    208                 cumulativePercent = cumulativeChars*100/totalMbcsChars;
    209                 charList.add(new Integer(c.charCode));
    210             }
    211             Object [] sortedChars = charList.toArray();
    212             Arrays.sort(sortedChars);
    213 
    214             System.out.print("          {");
    215             for (i=0; i<sortedChars.length; i++) {
    216                 if (i != 0) {
    217                     System.out.print(", ");
    218                     if ((i)%10 == 0) {
    219                         System.out.print("\n           ");
    220                     }
    221                 }
    222                 int cp = ((Integer)sortedChars[i]).intValue();
    223                 System.out.print("0x" + Integer.toHexString(cp));
    224             }
    225             System.out.println("};");
    226         }
    227     }
    228 
    229     //
    230     //  This is a little class containing a
    231     //    multi-byte character value and an occurence count for that char.
    232     //  Instances of this class are kept in the collection that accumulates statistics
    233     //
    234     //  WARNING:  this class's natural ordering (from Comparable) and equals()
    235     //            are inconsistent.
    236 
    237     static class ChEl implements Comparable {
    238         int charCode;
    239         int occurences;
    240 
    241         ChEl(int c, int o) {
    242             charCode = c;
    243             occurences = o;
    244         }
    245 
    246         // Equals needs to work with a map, with the charCode as the key.
    247         //   For insertion/lookup, we care about the char code only, not the occurence count.
    248         public boolean equals(Object other) {
    249             ChEl o = (ChEl)other;
    250             return o.charCode == this.charCode;
    251         }
    252 
    253         // Hashcode needs to be compatible with equals
    254         //   We're using this in a hashMap!
    255         public int hashCode() {
    256             return charCode;
    257         }
    258 
    259         // We want to be able to sort the results by frequency of occurence
    260         //   Compare backwards.  We want most frequent chars first.
    261         public int compareTo(Object other) {
    262             ChEl o = (ChEl)other;
    263             return (this.occurences> o.occurences? -1 :
    264                    (this.occurences==o.occurences?  0 : 1));
    265         }
    266 
    267     }
    268 
    269     //
    270     // iteratedChar is copied and slightly hacked from the similar calss in CharsetRecog_mbcs
    271     //              Pulls out one logical char according to the rules of EUC encoding.
    272     //
    273     class iteratedChar {
    274         int             charValue = 0;             // The char value is a value from the encoding.
    275                                                    //   It's meaning is not well defined, other than
    276                                                    //   different encodings
    277         int             index     = 0;
    278         int             nextIndex = 0;
    279         boolean         error     = false;
    280         boolean         done      = false;
    281 
    282         void reset() {
    283             charValue = 0;
    284             index     = -1;
    285             nextIndex = 0;
    286             error     = false;
    287             done      = false;
    288         }
    289 
    290         int nextByte() {
    291             if (nextIndex >= fileSize) {
    292                 done = true;
    293                 return -1;
    294             }
    295             int byteValue = (int)buf[nextIndex++] & 0x00ff;
    296             return byteValue;
    297         }
    298     }
    299 
    300 
    301     boolean nextChar(iteratedChar it) {
    302         it.index = it.nextIndex;
    303         it.error = false;
    304         int firstByte  = 0;
    305         int secondByte = 0;
    306 
    307         buildChar: {
    308             firstByte = it.charValue = it.nextByte();
    309             if (firstByte < 0) {
    310                 // Ran off the end of the input data
    311                 it.done = true;
    312                 break buildChar;
    313             }
    314             if (firstByte <= 0x0080 ||
    315                     (sjis && firstByte>=0x00a0 && firstByte< 0x00e0) ||
    316                     (sjis && firstByte>=0x00fd && firstByte<=0x00ff)) {
    317                 // single byte char
    318                 break buildChar;
    319             }
    320 
    321             secondByte = it.nextByte();
    322             it.charValue = (it.charValue << 8) | secondByte;
    323 
    324             if (secondByte <  0x40 ||
    325                 secondByte == 0x007f ||
    326                 secondByte == 0x00ff ||
    327                 sjis && secondByte >= 0x00fd) {
    328                     it.error = true;
    329             }
    330 
    331             if (it.error) {
    332                 System.out.println("Error " + Integer.toHexString(firstByte) + " " + Integer.toHexString(secondByte));
    333             }
    334        }
    335 
    336         return (it.done == false);
    337     }
    338 
    339 }
    340