// 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
 ****************************************************************************
 * Copyright (C) 2005-2012, International Business Machines Corporation and *
 * others. All Rights Reserved.                                             *
 ****************************************************************************
 *
 */
package com.ibm.icu.text;

import java.util.Arrays;

/**
 * CharsetRecognizer implementation for Asian - double or multi-byte - charsets.
 * Match is determined mostly by the input data adhering to the
 * encoding scheme for the charset, and, optionally,
 * frequency-of-occurrence of characters.
 * <p>
 * Instances of this class are singletons, one per encoding
 * being recognized. They are created in the main
 * CharsetDetector class and kept in the global list of available
 * encodings to be checked. The specific encoding being recognized
 * is determined by subclass.
 */
abstract class CharsetRecog_mbcs extends CharsetRecognizer {

    /**
     * Get the IANA name of this charset.
     * @return the charset name.
     */
    @Override
    abstract String getName();


    /**
     * Test the match of this charset with the input text data
     * which is obtained via the CharsetDetector object.
     * <p>
     * The raw input is walked one "character" at a time using
     * {@link #nextChar(iteratedChar, CharsetDetector)}. Characters whose packed
     * value is greater than 0xFF are counted as multi-byte characters; characters
     * flagged with an error are counted as bad.
     *
     * @param det         The CharsetDetector, which contains the input text
     *                    to be checked for being in this charset.
     * @param commonChars Raw (un-decoded) values of frequently occurring multi-byte
     *                    characters, or {@code null} if no statistics are available.
     *                    The array is searched with {@link Arrays#binarySearch(int[], int)}
     *                    and therefore must be sorted in ascending order.
     * @return the match confidence, ranging from 0 (no match) to 100.
     */
    int match(CharsetDetector det, int [] commonChars) {
        int doubleByteCharCount = 0;
        int commonCharCount     = 0;
        int badCharCount        = 0;
        int totalCharCount      = 0;
        iteratedChar iter       = new iteratedChar();

        for (iter.reset(); nextChar(iter, det);) {
            totalCharCount++;
            if (iter.error) {
                badCharCount++;
            } else {
                // Treat the packed bytes as an unsigned 32 bit quantity.
                long cv = iter.charValue & 0xFFFFFFFFL;

                if (cv > 0xff) {
                    doubleByteCharCount++;
                    // NOTE: This assumes that there are no 4-byte common chars.
                    if (commonChars != null && Arrays.binarySearch(commonChars, (int) cv) >= 0) {
                        commonCharCount++;
                    }
                }
            }
            if (badCharCount >= 2 && badCharCount * 5 >= doubleByteCharCount) {
                // Bail out early if the byte data is not matching the encoding scheme.
                return 0;
            }
        }

        if (doubleByteCharCount <= 10 && badCharCount == 0) {
            // Not many multi-byte chars.
            if (doubleByteCharCount == 0 && totalCharCount < 10) {
                // There weren't any multi-byte sequences, and the input was very short.
                // We don't have enough data to have any confidence.
                // Statistical analysis of single byte non-ASCII characters would probably help here.
                return 0;
            }
            // ASCII or ISO file? It's probably not our encoding,
            // but is not incompatible with our encoding, so don't give it a zero.
            return 10;
        }

        //
        // No match if there are too many characters that don't fit the encoding scheme.
        // (should we have zero tolerance for these?)
        //
        if (doubleByteCharCount < 20 * badCharCount) {
            return 0;
        }

        if (commonChars == null) {
            // We have no statistics on frequently occurring characters.
            // Assess confidence purely on having a reasonable number of
            // multi-byte characters (the more the better).
            return Math.min(30 + doubleByteCharCount - 20 * badCharCount, 100);
        }

        //
        // Frequency of occurrence statistics exist.
        // Scale the count of common characters logarithmically, so that having
        // a quarter of the multi-byte characters be "common" yields the maximum.
        //
        double maxVal      = Math.log((float) doubleByteCharCount / 4);
        double scaleFactor = 90.0 / maxVal;
        int confidence     = (int) (Math.log(commonCharCount + 1) * scaleFactor + 10);
        return Math.min(confidence, 100);
    }

    // "Character" iterated character class.
    //    Recognizers for specific mbcs encodings make their "characters" available
    //    by providing a nextChar() function that fills in an instance of iteratedChar
    //    with the next char from the input.
    //    The returned characters are not converted to Unicode, but remain as the raw
    //    bytes (concatenated into an int) from the codepage data.
    //
    //  For Asian charsets, use the raw input rather than the input that has been
    //   stripped of markup.  Detection only considers multi-byte chars, effectively
    //   stripping markup anyway, and double byte chars do occur in markup too.
    //
    static class iteratedChar {
        int     charValue = 0;      // 1-4 bytes from the raw input data
        int     nextIndex = 0;      // index into the raw input of the next byte to read
        boolean error     = false;  // set by nextChar() when the byte sequence is ill-formed
        boolean done      = false;  // set once the end of the raw input has been reached

        /** Restore this iterator to its initial state, positioned at the start of the input. */
        void reset() {
            charValue = 0;
            nextIndex = 0;
            error     = false;
            done      = false;
        }

        /**
         * Fetch the next raw input byte and advance past it.
         *
         * @param det The charset detector holding the raw input bytes.
         * @return the byte as an unsigned value (0-255), or -1 at end of input,
         *         in which case {@code done} is also set.
         */
        int nextByte(CharsetDetector det) {
            if (nextIndex >= det.fRawLength) {
                done = true;
                return -1;
            }
            return det.fRawInput[nextIndex++] & 0x00ff;
        }
    }

    /**
     * Get the next character (however many bytes it is) from the input data
     *    Subclasses for specific charset encodings must implement this function
     *    to get characters according to the rules of their encoding scheme.
     *
     *  This function is not a method of class iteratedChar only because
     *   that would require a lot of extra derived classes, which is awkward.
     * @param it  The iteratedChar "struct" into which the returned char is placed.
     * @param det The charset detector, which is needed to get at the input byte data
     *            being iterated over.
     * @return    True if a character was returned, false at end of input.
     */
    abstract boolean nextChar(iteratedChar it, CharsetDetector det);


    /**
     *   Shift-JIS charset recognizer.
     *
     */
    static class CharsetRecog_sjis extends CharsetRecog_mbcs {
        static int [] commonChars =
            // TODO:  This set of data comes from the character frequency-
            //        of-occurrence analysis tool.  The data needs to be moved
            //        into a resource and loaded from there.
            {0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
             0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
             0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
             0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
             0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
             0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};

        /**
         * Read the next Shift-JIS character: a single byte for values up to 0x7F
         * and for 0xA1-0xDF, otherwise a two byte sequence.
         */
        @Override
        boolean nextChar(iteratedChar it, CharsetDetector det) {
            it.error = false;
            int firstByte = it.charValue = it.nextByte(det);
            if (firstByte < 0) {
                return false;
            }

            if (firstByte <= 0x7f || (firstByte > 0xa0 && firstByte <= 0xdf)) {
                // Single byte character.
                return true;
            }

            int secondByte = it.nextByte(det);
            if (secondByte < 0) {
                return false;
            }
            it.charValue = (firstByte << 8) | secondByte;
            if (secondByte < 0x40) {
                // Illegal second byte value. Every trail byte from 0x40 up to 0xFF is accepted.
                it.error = true;
            }
            return true;
        }

        @Override
        CharsetMatch match(CharsetDetector det) {
            int confidence = match(det, commonChars);
            return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
        }

        @Override
        String getName() {
            return "Shift_JIS";
        }

        @Override
        public String getLanguage()
        {
            return "ja";
        }
    }


    /**
     *   Big5 charset recognizer.
     *
     */
    static class CharsetRecog_big5 extends CharsetRecog_mbcs {
        static int [] commonChars =
            // TODO:  This set of data comes from the character frequency-
            //        of-occurrence analysis tool.  The data needs to be moved
            //        into a resource and loaded from there.
            {0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
             0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
             0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
             0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
             0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
             0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
             0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
             0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
             0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
             0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};

        /**
         * Read the next Big5 character: a single byte for values up to 0x7F
         * and for 0xFF, otherwise a two byte sequence.
         */
        @Override
        boolean nextChar(iteratedChar it, CharsetDetector det) {
            it.error = false;
            int firstByte = it.charValue = it.nextByte(det);
            if (firstByte < 0) {
                return false;
            }

            if (firstByte <= 0x7f || firstByte == 0xff) {
                // single byte character.
                return true;
            }

            int secondByte = it.nextByte(det);
            if (secondByte < 0) {
                return false;
            }
            it.charValue = (it.charValue << 8) | secondByte;

            if (secondByte < 0x40 || secondByte == 0x7f || secondByte == 0xff) {
                // Illegal second byte value.
                it.error = true;
            }
            return true;
        }

        @Override
        CharsetMatch match(CharsetDetector det) {
            int confidence = match(det, commonChars);
            return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
        }

        @Override
        String getName() {
            return "Big5";
        }

        @Override
        public String getLanguage()
        {
            return "zh";
        }
    }


    /**
     *   EUC charset recognizers.  One abstract class that provides the common function
     *             for getting the next character according to the EUC encoding scheme,
     *             and nested derived classes for EUC_KR, EUC_JP, EUC_CN.
     *
     */
    abstract static class CharsetRecog_euc extends CharsetRecog_mbcs {

        /*
         *  (non-Javadoc)
         *  Get the next character value for EUC based encodings.
         *  Character "value" is simply the raw bytes that make up the character
         *     packed into an int.
         */
        @Override
        boolean nextChar(iteratedChar it, CharsetDetector det) {
            it.error = false;

            int firstByte = it.charValue = it.nextByte(det);
            if (firstByte < 0) {
                // Ran off the end of the input data
                it.done = true;
                return false;
            }
            if (firstByte <= 0x8d) {
                // single byte char
                return !it.done;
            }

            // If the input ends here nextByte() returns -1 and sets it.done,
            // so the truncated character is not reported to the caller.
            int secondByte = it.nextByte(det);
            it.charValue = (it.charValue << 8) | secondByte;

            if (firstByte >= 0xA1 && firstByte <= 0xfe) {
                // Two byte Char
                if (secondByte < 0xa1) {
                    it.error = true;
                }
                return !it.done;
            }
            if (firstByte == 0x8e) {
                // Code Set 2.
                //   In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
                //   In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
                // We don't know which we've got.
                // Treat it like EUC-JP.  If the data really was EUC-TW, the following two
                //   bytes will look like a well formed 2 byte char.
                if (secondByte < 0xa1) {
                    it.error = true;
                }
                return !it.done;
            }

            if (firstByte == 0x8f) {
                // Code set 3.
                // Three byte total char size, two bytes of actual char value.
                int thirdByte = it.nextByte(det);
                it.charValue = (it.charValue << 8) | thirdByte;
                if (thirdByte < 0xa1) {
                    it.error = true;
                }
            }

            return !it.done;
        }

        /**
         * The charset recognizer for EUC-JP.  A singleton instance of this class
         *    is created and kept by the public CharsetDetector class
         */
        static class CharsetRecog_euc_jp extends CharsetRecog_euc {
            static int [] commonChars =
                // TODO:  This set of data comes from the character frequency-
                //        of-occurrence analysis tool.  The data needs to be moved
                //        into a resource and loaded from there.
                {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
                 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
                 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
                 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
                 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
                 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
                 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
                 0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
                 0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
                 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};

            @Override
            String getName() {
                return "EUC-JP";
            }

            @Override
            CharsetMatch match(CharsetDetector det) {
                int confidence = match(det, commonChars);
                return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
            }

            @Override
            public String getLanguage()
            {
                return "ja";
            }
        }

        /**
         * The charset recognizer for EUC-KR.  A singleton instance of this class
         *    is created and kept by the public CharsetDetector class
         */
        static class CharsetRecog_euc_kr extends CharsetRecog_euc {
            static int [] commonChars =
                // TODO:  This set of data comes from the character frequency-
                //        of-occurrence analysis tool.  The data needs to be moved
                //        into a resource and loaded from there.
                {0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
                 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
                 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
                 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
                 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
                 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
                 0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
                 0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
                 0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
                 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};

            @Override
            String getName() {
                return "EUC-KR";
            }

            @Override
            CharsetMatch match(CharsetDetector det) {
                int confidence = match(det, commonChars);
                return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
            }

            @Override
            public String getLanguage()
            {
                return "ko";
            }
        }
    }

    /**
     *
     *   GB-18030 recognizer. Uses simplified Chinese statistics.
     *
     */
    static class CharsetRecog_gb_18030 extends CharsetRecog_mbcs {

        /*
         *  (non-Javadoc)
         *  Get the next character value for the GB-18030 encoding.
         *  Character "value" is simply the raw bytes that make up the character
         *     packed into an int.
         */
        @Override
        boolean nextChar(iteratedChar it, CharsetDetector det) {
            it.error = false;

            int firstByte = it.charValue = it.nextByte(det);
            if (firstByte < 0) {
                // Ran off the end of the input data
                it.done = true;
                return false;
            }

            if (firstByte <= 0x80) {
                // single byte char
                return !it.done;
            }

            // If the input ends here nextByte() returns -1 and sets it.done,
            // so the truncated character is not reported to the caller.
            int secondByte = it.nextByte(det);
            it.charValue = (it.charValue << 8) | secondByte;

            if (firstByte >= 0x81 && firstByte <= 0xFE) {
                // Two byte Char.
                // The trail byte ranges are 0x40-0x7E and 0x80-0xFE; 0x7F is not a
                // legal trail byte and must fall through to the error case below.
                if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >= 0x80 && secondByte <= 0xFE)) {
                    return !it.done;
                }

                // Four byte char
                if (secondByte >= 0x30 && secondByte <= 0x39) {
                    int thirdByte = it.nextByte(det);

                    if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
                        int fourthByte = it.nextByte(det);

                        if (fourthByte >= 0x30 && fourthByte <= 0x39) {
                            it.charValue = (it.charValue << 16) | (thirdByte << 8) | fourthByte;
                            return !it.done;
                        }
                    }
                }

                // Neither a well formed two byte nor four byte sequence.
                it.error = true;
            }

            return !it.done;
        }

        static int [] commonChars =
            // TODO:  This set of data comes from the character frequency-
            //        of-occurrence analysis tool.  The data needs to be moved
            //        into a resource and loaded from there.
            {0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
             0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
             0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
             0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
             0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
             0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
             0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
             0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
             0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
             0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};


        @Override
        String getName() {
            return "GB18030";
        }

        @Override
        CharsetMatch match(CharsetDetector det) {
            int confidence = match(det, commonChars);
            return confidence == 0 ? null : new CharsetMatch(det, this, confidence);
        }

        @Override
        public String getLanguage()
        {
            return "zh";
        }
    }


}