1 /* 2 ********************************************************************** 3 * Copyright (C) 2005-2012, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 */ 7 8 #include "unicode/utypes.h" 9 10 #if !UCONFIG_NO_CONVERSION 11 12 #include "csmatch.h" 13 #include "csrmbcs.h" 14 15 #include <math.h> 16 17 U_NAMESPACE_BEGIN 18 19 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) 20 21 #define min(x,y) (((x)<(y))?(x):(y)) 22 23 static const uint16_t commonChars_sjis [] = { 24 // TODO: This set of data comes from the character frequency- 25 // of-occurence analysis tool. The data needs to be moved 26 // into a resource and loaded from there. 27 0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0, 28 0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5, 29 0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc, 30 0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341, 31 0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389, 32 0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa}; 33 34 static const uint16_t commonChars_euc_jp[] = { 35 // TODO: This set of data comes from the character frequency- 36 // of-occurence analysis tool. The data needs to be moved 37 // into a resource and loaded from there. 38 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2, 39 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3, 40 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4, 41 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de, 42 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef, 43 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af, 44 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7, 45 0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1, 46 0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee, 47 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1}; 48 49 static const uint16_t commonChars_euc_kr[] = { 50 // TODO: This set of data comes from the character frequency- 51 // of-occurence analysis tool. The data needs to be moved 52 // into a resource and loaded from there. 53 0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc, 54 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9, 55 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce, 56 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce, 57 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba, 58 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee, 59 0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7, 60 0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6, 61 0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6, 62 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad}; 63 64 static const uint16_t commonChars_big5[] = { 65 // TODO: This set of data comes from the character frequency- 66 // of-occurence analysis tool. The data needs to be moved 67 // into a resource and loaded from there. 68 0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446, 69 0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3, 70 0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548, 71 0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8, 72 0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da, 73 0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3, 74 0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59, 75 0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c, 76 0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44, 77 0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f}; 78 79 static const uint16_t commonChars_gb_18030[] = { 80 // TODO: This set of data comes from the character frequency- 81 // of-occurence analysis tool. The data needs to be moved 82 // into a resource and loaded from there. 83 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac, 84 0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4, 85 0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4, 86 0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6, 87 0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6, 88 0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7, 89 0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7, 90 0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5, 91 0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2, 92 0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0}; 93 94 static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value) 95 { 96 int32_t start = 0, end = len-1; 97 int32_t mid = (start+end)/2; 98 99 while(start <= end) { 100 if(array[mid] == value) { 101 return mid; 102 } 103 104 if(array[mid] < value){ 105 start = mid+1; 106 } else { 107 end = mid-1; 108 } 109 110 mid = (start+end)/2; 111 } 112 113 return -1; 114 } 115 116 IteratedChar::IteratedChar() : 117 charValue(0), index(-1), nextIndex(0), error(FALSE), done(FALSE) 118 { 119 // nothing else to do. 120 } 121 122 /*void IteratedChar::reset() 123 { 124 charValue = 0; 125 index = -1; 126 nextIndex = 0; 127 error = FALSE; 128 done = FALSE; 129 }*/ 130 131 int32_t IteratedChar::nextByte(InputText *det) 132 { 133 if (nextIndex >= det->fRawLength) { 134 done = TRUE; 135 136 return -1; 137 } 138 139 return det->fRawInput[nextIndex++]; 140 } 141 142 CharsetRecog_mbcs::~CharsetRecog_mbcs() 143 { 144 // nothing to do. 145 } 146 147 int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) const { 148 int32_t singleByteCharCount = 0; 149 int32_t doubleByteCharCount = 0; 150 int32_t commonCharCount = 0; 151 int32_t badCharCount = 0; 152 int32_t totalCharCount = 0; 153 int32_t confidence = 0; 154 IteratedChar iter; 155 156 while (nextChar(&iter, det)) { 157 totalCharCount++; 158 159 if (iter.error) { 160 badCharCount++; 161 } else { 162 if (iter.charValue <= 0xFF) { 163 singleByteCharCount++; 164 } else { 165 doubleByteCharCount++; 166 167 if (commonChars != 0) { 168 if (binarySearch(commonChars, commonCharsLen, iter.charValue) >= 0){ 169 commonCharCount += 1; 170 } 171 } 172 } 173 } 174 175 176 if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) { 177 // Bail out early if the byte data is not matching the encoding scheme. 178 // break detectBlock; 179 return confidence; 180 } 181 } 182 183 if (doubleByteCharCount <= 10 && badCharCount == 0) { 184 // Not many multi-byte chars. 185 if (doubleByteCharCount == 0 && totalCharCount < 10) { 186 // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes. 187 // We don't have enough data to have any confidence. 188 // Statistical analysis of single byte non-ASCII charcters would probably help here. 189 confidence = 0; 190 } 191 else { 192 // ASCII or ISO file? It's probably not our encoding, 193 // but is not incompatible with our encoding, so don't give it a zero. 194 confidence = 10; 195 } 196 197 return confidence; 198 } 199 200 // 201 // No match if there are too many characters that don't fit the encoding scheme. 202 // (should we have zero tolerance for these?) 203 // 204 if (doubleByteCharCount < 20*badCharCount) { 205 confidence = 0; 206 207 return confidence; 208 } 209 210 if (commonChars == 0) { 211 // We have no statistics on frequently occuring characters. 212 // Assess confidence purely on having a reasonable number of 213 // multi-byte characters (the more the better) 214 confidence = 30 + doubleByteCharCount - 20*badCharCount; 215 216 if (confidence > 100) { 217 confidence = 100; 218 } 219 } else { 220 // 221 // Frequency of occurence statistics exist. 222 // 223 224 double maxVal = log((double)doubleByteCharCount / 4); /*(float)?*/ 225 double scaleFactor = 90.0 / maxVal; 226 confidence = (int32_t)(log((double)commonCharCount+1) * scaleFactor + 10.0); 227 228 confidence = min(confidence, 100); 229 } 230 231 if (confidence < 0) { 232 confidence = 0; 233 } 234 235 return confidence; 236 } 237 238 CharsetRecog_sjis::~CharsetRecog_sjis() 239 { 240 // nothing to do 241 } 242 243 UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) const { 244 it->index = it->nextIndex; 245 it->error = FALSE; 246 247 int32_t firstByte = it->charValue = it->nextByte(det); 248 249 if (firstByte < 0) { 250 return FALSE; 251 } 252 253 if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) { 254 return TRUE; 255 } 256 257 int32_t secondByte = it->nextByte(det); 258 if (secondByte >= 0) { 259 it->charValue = (firstByte << 8) | secondByte; 260 } 261 // else we'll handle the error later. 262 263 if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) { 264 // Illegal second byte value. 265 it->error = TRUE; 266 } 267 268 return TRUE; 269 } 270 271 UBool CharsetRecog_sjis::match(InputText* det, CharsetMatch *results) const { 272 int32_t confidence = match_mbcs(det, commonChars_sjis, ARRAY_SIZE(commonChars_sjis)); 273 results->set(det, this, confidence); 274 return (confidence > 0); 275 } 276 277 const char *CharsetRecog_sjis::getName() const 278 { 279 return "Shift_JIS"; 280 } 281 282 const char *CharsetRecog_sjis::getLanguage() const 283 { 284 return "ja"; 285 } 286 287 CharsetRecog_euc::~CharsetRecog_euc() 288 { 289 // nothing to do 290 } 291 292 UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) const { 293 int32_t firstByte = 0; 294 int32_t secondByte = 0; 295 int32_t thirdByte = 0; 296 297 it->index = it->nextIndex; 298 it->error = FALSE; 299 firstByte = it->charValue = it->nextByte(det); 300 301 if (firstByte < 0) { 302 // Ran off the end of the input data 303 return FALSE; 304 } 305 306 if (firstByte <= 0x8D) { 307 // single byte char 308 return TRUE; 309 } 310 311 secondByte = it->nextByte(det); 312 if (secondByte >= 0) { 313 it->charValue = (it->charValue << 8) | secondByte; 314 } 315 // else we'll handle the error later. 316 317 if (firstByte >= 0xA1 && firstByte <= 0xFE) { 318 // Two byte Char 319 if (secondByte < 0xA1) { 320 it->error = TRUE; 321 } 322 323 return TRUE; 324 } 325 326 if (firstByte == 0x8E) { 327 // Code Set 2. 328 // In EUC-JP, total char size is 2 bytes, only one byte of actual char value. 329 // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value. 330 // We don't know which we've got. 331 // Treat it like EUC-JP. If the data really was EUC-TW, the following two 332 // bytes will look like a well formed 2 byte char. 333 if (secondByte < 0xA1) { 334 it->error = TRUE; 335 } 336 337 return TRUE; 338 } 339 340 if (firstByte == 0x8F) { 341 // Code set 3. 342 // Three byte total char size, two bytes of actual char value. 343 thirdByte = it->nextByte(det); 344 it->charValue = (it->charValue << 8) | thirdByte; 345 346 if (thirdByte < 0xa1) { 347 // Bad second byte or ran off the end of the input data with a non-ASCII first byte. 348 it->error = TRUE; 349 } 350 } 351 352 return TRUE; 353 354 } 355 356 CharsetRecog_euc_jp::~CharsetRecog_euc_jp() 357 { 358 // nothing to do 359 } 360 361 const char *CharsetRecog_euc_jp::getName() const 362 { 363 return "EUC-JP"; 364 } 365 366 const char *CharsetRecog_euc_jp::getLanguage() const 367 { 368 return "ja"; 369 } 370 371 UBool CharsetRecog_euc_jp::match(InputText *det, CharsetMatch *results) const 372 { 373 int32_t confidence = match_mbcs(det, commonChars_euc_jp, ARRAY_SIZE(commonChars_euc_jp)); 374 results->set(det, this, confidence); 375 return (confidence > 0); 376 } 377 378 CharsetRecog_euc_kr::~CharsetRecog_euc_kr() 379 { 380 // nothing to do 381 } 382 383 const char *CharsetRecog_euc_kr::getName() const 384 { 385 return "EUC-KR"; 386 } 387 388 const char *CharsetRecog_euc_kr::getLanguage() const 389 { 390 return "ko"; 391 } 392 393 UBool CharsetRecog_euc_kr::match(InputText *det, CharsetMatch *results) const 394 { 395 int32_t confidence = match_mbcs(det, commonChars_euc_kr, ARRAY_SIZE(commonChars_euc_kr)); 396 results->set(det, this, confidence); 397 return (confidence > 0); 398 } 399 400 CharsetRecog_big5::~CharsetRecog_big5() 401 { 402 // nothing to do 403 } 404 405 UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) const 406 { 407 int32_t firstByte; 408 409 it->index = it->nextIndex; 410 it->error = FALSE; 411 firstByte = it->charValue = it->nextByte(det); 412 413 if (firstByte < 0) { 414 return FALSE; 415 } 416 417 if (firstByte <= 0x7F || firstByte == 0xFF) { 418 // single byte character. 419 return TRUE; 420 } 421 422 int32_t secondByte = it->nextByte(det); 423 if (secondByte >= 0) { 424 it->charValue = (it->charValue << 8) | secondByte; 425 } 426 // else we'll handle the error later. 427 428 if (secondByte < 0x40 || secondByte == 0x7F || secondByte == 0xFF) { 429 it->error = TRUE; 430 } 431 432 return TRUE; 433 } 434 435 const char *CharsetRecog_big5::getName() const 436 { 437 return "Big5"; 438 } 439 440 const char *CharsetRecog_big5::getLanguage() const 441 { 442 return "zh"; 443 } 444 445 UBool CharsetRecog_big5::match(InputText *det, CharsetMatch *results) const 446 { 447 int32_t confidence = match_mbcs(det, commonChars_big5, ARRAY_SIZE(commonChars_big5)); 448 results->set(det, this, confidence); 449 return (confidence > 0); 450 } 451 452 CharsetRecog_gb_18030::~CharsetRecog_gb_18030() 453 { 454 // nothing to do 455 } 456 457 UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) const { 458 int32_t firstByte = 0; 459 int32_t secondByte = 0; 460 int32_t thirdByte = 0; 461 int32_t fourthByte = 0; 462 463 it->index = it->nextIndex; 464 it->error = FALSE; 465 firstByte = it->charValue = it->nextByte(det); 466 467 if (firstByte < 0) { 468 // Ran off the end of the input data 469 return FALSE; 470 } 471 472 if (firstByte <= 0x80) { 473 // single byte char 474 return TRUE; 475 } 476 477 secondByte = it->nextByte(det); 478 if (secondByte >= 0) { 479 it->charValue = (it->charValue << 8) | secondByte; 480 } 481 // else we'll handle the error later. 482 483 if (firstByte >= 0x81 && firstByte <= 0xFE) { 484 // Two byte Char 485 if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) { 486 return TRUE; 487 } 488 489 // Four byte char 490 if (secondByte >= 0x30 && secondByte <= 0x39) { 491 thirdByte = it->nextByte(det); 492 493 if (thirdByte >= 0x81 && thirdByte <= 0xFE) { 494 fourthByte = it->nextByte(det); 495 496 if (fourthByte >= 0x30 && fourthByte <= 0x39) { 497 it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte; 498 499 return TRUE; 500 } 501 } 502 } 503 504 // Something wasn't valid, or we ran out of data (-1). 505 it->error = TRUE; 506 } 507 508 return TRUE; 509 } 510 511 const char *CharsetRecog_gb_18030::getName() const 512 { 513 return "GB18030"; 514 } 515 516 const char *CharsetRecog_gb_18030::getLanguage() const 517 { 518 return "zh"; 519 } 520 521 UBool CharsetRecog_gb_18030::match(InputText *det, CharsetMatch *results) const 522 { 523 int32_t confidence = match_mbcs(det, commonChars_gb_18030, ARRAY_SIZE(commonChars_gb_18030)); 524 results->set(det, this, confidence); 525 return (confidence > 0); 526 } 527 528 U_NAMESPACE_END 529 #endif 530