// 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
 ***********************************************************************
 *
 * Copyright (C) 2005-2012, International Business Machines Corporation and
 * others. All Rights Reserved.
 *
 ***********************************************************************
 *
 * euc_tool
 *
 *    This tool produces the character usage frequency statistics for the EUC family
 *    of charsets, for use by the ICU charset detectors.
 *
 *    usage:  java euc_tool [-d] [directory path]
 *
 *        -d:   Produce the data in a form to be exported to the ICU implementation
 *              Default is to produce an informative dump.
 *
 *        directory path
 *              Source directory for the files to be analyzed.
 *              Default is the current directory.
 *              There should be three subdirectories under the specified directory, one
 *              each for EUC_JP, EUC_CN and EUC_KR.  Within each of these subdirectories
 *              should be text files in the specified encoding.
 *
 */

package com.ibm.icu.dev.tool.charsetdet.mbcs;

import java.io.File;
import java.io.FileInputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;

//
//  Command line tool described in the file header.  All state lives in instance
//  fields; main() creates one instance and hands the arguments to Main().
//
public class EUCTool {

    // The file buffer and file data length need to be out in class member variables
    //  so that the code lifted from charSet detection for scanning the multi-byte chars
    //  can see them conveniently.
    //
    //  buf       holds the bytes of the file currently being scanned.  Its fixed
    //            size (1,000,000 bytes) is the most that is read from any one file.
    //  fileSize  is the number of bytes of buf that belong to the current file;
    //            the byte iterator stops when it reaches this index.
    byte [] buf = new byte[1000000];
    int fileSize;

    // Output mode flags.  The defaults give the informative dump; the -d command
    //  line option switches option_d on and option_v off.
    boolean option_d = false; // data option. Produce exportable data
    boolean option_v = true; // verbose informational output.
49 50 51 52 public static void main(String[] args) { 53 EUCTool This = new EUCTool(); 54 This.Main(args); 55 } 56 57 58 59 void Main(String[] args) { 60 int i; 61 62 // 63 // Command Line Option Handling 64 // 65 String dirName = "."; 66 for (i=0; i<args.length; i++) { 67 if (args[i].equals("-d")) { 68 option_d = true; 69 option_v = false; 70 continue; 71 } 72 if (args[i].startsWith("-")) { 73 System.err.println("Unrecongized option: " + args[i]); 74 System.exit(-1); 75 } 76 dirName = args[i]; 77 } 78 79 // 80 // Verify that the specified directory exists. 81 // 82 File dir = new File(dirName); 83 if (dir.isDirectory() == false) { 84 System.err.println("\"" + dirName + "\" is not a directory"); 85 System.exit(-1); 86 } 87 88 // 89 // Do each subdirectory of the specified directory. There should be 90 // one per each encoding - euc-kr, euc-cn, euc-jp 91 // 92 File[] dirs = dir.listFiles(); 93 for (i=0; i<dirs.length; i++) { 94 if (dirs[i].isDirectory()) { 95 String nam = dirs[i].getName(); 96 if (nam.equalsIgnoreCase("CVS")) { 97 continue; 98 } 99 processDir(dirs[i]); 100 } 101 } 102 } 103 104 // 105 // Collect statistics from all ordinary files in a specified directory. 
106 // 107 void processDir(File dir) { 108 int totalMbcsChars = 0; 109 HashMap m = new HashMap(10000); 110 int i; 111 112 System.out.println(dir.getName()); 113 File[] files = dir.listFiles(); 114 for (i=0; i<files.length; i++) { 115 FileInputStream is = null; 116 try { 117 if (files[i].isFile()) { 118 is = new FileInputStream(files[i]); 119 fileSize = is.read(buf); 120 if (option_v) { 121 System.out.println(files[i].getPath()); 122 System.out.println(" " + fileSize + " bytes."); 123 } 124 iteratedChar ichar = new iteratedChar(); 125 int fileChars = 0; 126 int fileMbcsChars = 0; 127 int errs = 0; 128 129 while (nextChar(ichar)) { 130 if (ichar.error == true) { 131 errs++; 132 continue; 133 } 134 fileChars++; 135 if (ichar.charValue > 255) { 136 fileMbcsChars++; 137 totalMbcsChars++; 138 } 139 if (ichar.charValue <= 255) { 140 // Don't keep occurence statistics for the single byte range 141 continue; 142 } 143 144 // 145 // Frequency of occurence statistics are accumulated in a map. 146 // 147 ChEl keyEl = new ChEl(ichar.charValue, 0); 148 ChEl valEl = (ChEl)m.get(keyEl); 149 if (valEl == null) { 150 m.put(keyEl, keyEl); 151 valEl = keyEl; 152 } 153 valEl.occurences++; 154 } 155 if (option_v) { 156 System.out.println(" " + fileChars + " Chars"); 157 System.out.println(" " + fileMbcsChars + " mbcs Chars"); 158 System.out.println(" " + errs + " errors"); 159 System.out.println("\n"); 160 } 161 } 162 } 163 catch (Exception e) { 164 System.err.println("Exception:" + e); 165 166 } 167 finally { 168 if (is != null) { 169 try { 170 is.close(); 171 } catch (Exception e) { 172 // ignore 173 } 174 } 175 } 176 } 177 178 // 179 // We've processed through all of the files. 180 // sort and dump out the frequency statistics. 
181 // 182 Object [] encounteredChars = m.values().toArray(); 183 Arrays.sort(encounteredChars); 184 int cumulativeChars = 0; 185 int cumulativePercent = 0; 186 if (option_v) { 187 System.out.println("# <char code> <occurences> <Cumulative %>"); 188 for (i=0; i<encounteredChars.length; i++) { 189 ChEl c = (ChEl)encounteredChars[i]; 190 cumulativeChars += c.occurences; 191 cumulativePercent = cumulativeChars*100/totalMbcsChars; 192 System.out.println(i + " " + Integer.toHexString(c.charCode) + " " 193 + c.occurences + " " + cumulativePercent); 194 } 195 } 196 if (option_d) { 197 // 198 // Output the list of characters formatted for pasting into a 199 // Java source code array initializer. 200 // Resort into order based on the character code value, not 201 // on frequency of occurence. 202 // 203 List charList = new ArrayList(); 204 205 for (i=0; i<100 && cumulativePercent<50; i++) { 206 ChEl c = (ChEl)encounteredChars[i]; 207 cumulativeChars += c.occurences; 208 cumulativePercent = cumulativeChars*100/totalMbcsChars; 209 charList.add(new Integer(c.charCode)); 210 } 211 Object [] sortedChars = charList.toArray(); 212 Arrays.sort(sortedChars); 213 214 System.out.print(" {"); 215 for (i=0; i<sortedChars.length; i++) { 216 if (i != 0) { 217 System.out.print(", "); 218 if ((i)%10 == 0) { 219 System.out.print("\n "); 220 } 221 } 222 int cp = ((Integer)sortedChars[i]).intValue(); 223 System.out.print("0x" + Integer.toHexString(cp)); 224 } 225 System.out.println("};"); 226 } 227 } 228 229 // 230 // This is a little class containing a 231 // multi-byte character value and an occurence count for that char. 232 // Instances of this class are kept in the collection that accumulates statistics 233 // 234 // WARNING: this class's natural ordering (from Comparable) and equals() 235 // are inconsistent. 
236 237 static class ChEl implements Comparable { 238 int charCode; 239 int occurences; 240 241 ChEl(int c, int o) { 242 charCode = c; 243 occurences = o; 244 } 245 246 // Equals needs to work with a map, with the charCode as the key. 247 // For insertion/lookup, we care about the char code only, not the occurence count. 248 public boolean equals(Object other) { 249 ChEl o = (ChEl)other; 250 return o.charCode == this.charCode; 251 } 252 253 // Hashcode needs to be compatible with equals 254 // We're using this in a hashMap! 255 public int hashCode() { 256 return charCode; 257 } 258 259 // We want to be able to sort the results by frequency of occurence 260 // Compare backwards. We want most frequent chars first. 261 public int compareTo(Object other) { 262 ChEl o = (ChEl)other; 263 return (this.occurences> o.occurences? -1 : 264 (this.occurences==o.occurences? 0 : 1)); 265 } 266 267 } 268 269 // 270 // iteratedChar is copied and slightly hacked from the similar calss in CharsetRecog_mbcs 271 // Pulls out one logical char according to the rules of EUC encoding. 272 // 273 class iteratedChar { 274 int charValue = 0; // The char value is a value from the encoding. 
275 // It's meaning is not well defined, other than 276 // different encodings 277 int index = 0; 278 int nextIndex = 0; 279 boolean error = false; 280 boolean done = false; 281 282 void reset() { 283 charValue = 0; 284 index = -1; 285 nextIndex = 0; 286 error = false; 287 done = false; 288 } 289 290 int nextByte() { 291 if (nextIndex >= fileSize) { 292 done = true; 293 return -1; 294 } 295 int byteValue = (int)buf[nextIndex++] & 0x00ff; 296 return byteValue; 297 } 298 } 299 300 301 boolean nextChar(iteratedChar it) { 302 it.index = it.nextIndex; 303 it.error = false; 304 int firstByte = 0; 305 int secondByte = 0; 306 int thirdByte = 0; 307 int fourthByte = 0; 308 309 buildChar: { 310 firstByte = it.charValue = it.nextByte(); 311 if (firstByte < 0) { 312 // Ran off the end of the input data 313 it.done = true; 314 break buildChar; 315 } 316 if (firstByte <= 0x8d) { 317 // single byte char 318 break buildChar; 319 } 320 321 secondByte = it.nextByte(); 322 it.charValue = (it.charValue << 8) | secondByte; 323 324 if (firstByte >= 0xA1 && firstByte <= 0xfe) { 325 // Two byte Char 326 if (secondByte < 0xa1) { 327 it.error = true; 328 } 329 break buildChar; 330 } 331 if (firstByte == 0x8e) { 332 // Code Set 2. 333 // In EUC-JP, total char size is 2 bytes, only one byte of actual char value. 334 // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value. 335 // We don't know which we've got. 336 // Treat it like EUC-JP. If the data really was EUC-TW, the following two 337 // bytes will look like a well formed 2 byte char. 338 if (secondByte < 0xa1) { 339 it.error = true; 340 } 341 break buildChar; 342 } 343 344 if (firstByte == 0x8f) { 345 // Code set 3. 346 // Three byte total char size, two bytes of actual char value. 
347 thirdByte = it.nextByte(); 348 it.charValue = (it.charValue << 8) | thirdByte; 349 if (thirdByte < 0xa1) { 350 it.error = true; 351 } 352 } 353 354 } 355 if (it.error) { 356 System.out.println("Error " + Integer.toHexString(firstByte) + " " + Integer.toHexString(secondByte) 357 + " " + Integer.toHexString(thirdByte) + " " + Integer.toHexString(fourthByte)); 358 } 359 return (it.done == false); 360 } 361 } 362 363 364 365 366