1 /* 2 ********************************************************************** 3 * Copyright (C) 2005-2008, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 */ 7 8 #include "unicode/utypes.h" 9 10 #if !UCONFIG_NO_CONVERSION 11 12 #include "csrmbcs.h" 13 14 #include <math.h> 15 16 U_NAMESPACE_BEGIN 17 18 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) 19 20 #define min(x,y) (((x)<(y))?(x):(y)) 21 22 static const uint16_t commonChars_sjis [] = { 23 // TODO: This set of data comes from the character frequency- 24 // of-occurence analysis tool. The data needs to be moved 25 // into a resource and loaded from there. 26 0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0, 27 0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5, 28 0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc, 29 0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341, 30 0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389, 31 0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa}; 32 33 static const uint16_t commonChars_euc_jp[] = { 34 // TODO: This set of data comes from the character frequency- 35 // of-occurence analysis tool. The data needs to be moved 36 // into a resource and loaded from there. 37 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2, 38 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3, 39 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4, 40 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de, 41 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef, 42 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af, 43 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7, 44 0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1, 45 0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee, 46 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1}; 47 48 static const uint16_t commonChars_euc_kr[] = { 49 // TODO: This set of data comes from the character frequency- 50 // of-occurence analysis tool. The data needs to be moved 51 // into a resource and loaded from there. 52 0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc, 53 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9, 54 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce, 55 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce, 56 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba, 57 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee, 58 0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7, 59 0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6, 60 0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6, 61 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad}; 62 63 static const uint16_t commonChars_big5[] = { 64 // TODO: This set of data comes from the character frequency- 65 // of-occurence analysis tool. The data needs to be moved 66 // into a resource and loaded from there. 67 0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446, 68 0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3, 69 0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548, 70 0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8, 71 0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da, 72 0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3, 73 0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59, 74 0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c, 75 0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44, 76 0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f}; 77 78 static const uint16_t commonChars_gb_18030[] = { 79 // TODO: This set of data comes from the character frequency- 80 // of-occurence analysis tool. The data needs to be moved 81 // into a resource and loaded from there. 82 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac, 83 0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4, 84 0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4, 85 0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6, 86 0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6, 87 0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7, 88 0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7, 89 0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5, 90 0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2, 91 0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0}; 92 93 static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value) 94 { 95 int32_t start = 0, end = len-1; 96 int32_t mid = (start+end)/2; 97 98 while(start <= end) { 99 if(array[mid] == value) { 100 return mid; 101 } 102 103 if(array[mid] < value){ 104 start = mid+1; 105 } else { 106 end = mid-1; 107 } 108 109 mid = (start+end)/2; 110 } 111 112 return -1; 113 } 114 115 IteratedChar::IteratedChar() : 116 charValue(0), index(-1), nextIndex(0), error(FALSE), done(FALSE) 117 { 118 // nothing else to do. 119 } 120 121 /*void IteratedChar::reset() 122 { 123 charValue = 0; 124 index = -1; 125 nextIndex = 0; 126 error = FALSE; 127 done = FALSE; 128 }*/ 129 130 int32_t IteratedChar::nextByte(InputText *det) 131 { 132 if (nextIndex >= det->fRawLength) { 133 done = TRUE; 134 135 return -1; 136 } 137 138 return det->fRawInput[nextIndex++]; 139 } 140 141 CharsetRecog_mbcs::~CharsetRecog_mbcs() 142 { 143 // nothing to do. 144 } 145 146 int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) { 147 int32_t singleByteCharCount = 0; 148 int32_t doubleByteCharCount = 0; 149 int32_t commonCharCount = 0; 150 int32_t badCharCount = 0; 151 int32_t totalCharCount = 0; 152 int32_t confidence = 0; 153 IteratedChar iter; 154 155 while (nextChar(&iter, det)) { 156 totalCharCount++; 157 158 if (iter.error) { 159 badCharCount++; 160 } else { 161 if (iter.charValue <= 0xFF) { 162 singleByteCharCount++; 163 } else { 164 doubleByteCharCount++; 165 166 if (commonChars != 0) { 167 if (binarySearch(commonChars, commonCharsLen, iter.charValue) >= 0){ 168 commonCharCount += 1; 169 } 170 } 171 } 172 } 173 174 175 if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) { 176 // Bail out early if the byte data is not matching the encoding scheme. 177 // break detectBlock; 178 return confidence; 179 } 180 } 181 182 if (doubleByteCharCount <= 10 && badCharCount == 0) { 183 // Not many multi-byte chars. 184 if (doubleByteCharCount == 0 && totalCharCount < 10) { 185 // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes. 186 // We don't have enough data to have any confidence. 187 // Statistical analysis of single byte non-ASCII charcters would probably help here. 188 confidence = 0; 189 } 190 else { 191 // ASCII or ISO file? It's probably not our encoding, 192 // but is not incompatible with our encoding, so don't give it a zero. 193 confidence = 10; 194 } 195 196 return confidence; 197 } 198 199 // 200 // No match if there are too many characters that don't fit the encoding scheme. 201 // (should we have zero tolerance for these?) 202 // 203 if (doubleByteCharCount < 20*badCharCount) { 204 confidence = 0; 205 206 return confidence; 207 } 208 209 if (commonChars == 0) { 210 // We have no statistics on frequently occuring characters. 211 // Assess confidence purely on having a reasonable number of 212 // multi-byte characters (the more the better) 213 confidence = 30 + doubleByteCharCount - 20*badCharCount; 214 215 if (confidence > 100) { 216 confidence = 100; 217 } 218 } else { 219 // 220 // Frequency of occurence statistics exist. 221 // 222 223 double maxVal = log10((double)doubleByteCharCount / 4); /*(float)?*/ 224 double scaleFactor = 90.0 / maxVal; 225 confidence = (int32_t)(log10((double)commonCharCount+1) * scaleFactor + 10.0); 226 227 confidence = min(confidence, 100); 228 } 229 230 if (confidence < 0) { 231 confidence = 0; 232 } 233 234 return confidence; 235 } 236 237 CharsetRecog_sjis::~CharsetRecog_sjis() 238 { 239 // nothing to do 240 } 241 242 UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) { 243 it->index = it->nextIndex; 244 it->error = FALSE; 245 246 int32_t firstByte = it->charValue = it->nextByte(det); 247 248 if (firstByte < 0) { 249 return FALSE; 250 } 251 252 if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) { 253 return TRUE; 254 } 255 256 int32_t secondByte = it->nextByte(det); 257 if (secondByte >= 0) { 258 it->charValue = (firstByte << 8) | secondByte; 259 } 260 // else we'll handle the error later. 261 262 if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) { 263 // Illegal second byte value. 264 it->error = TRUE; 265 } 266 267 return TRUE; 268 } 269 270 int32_t CharsetRecog_sjis::match(InputText* det) 271 { 272 return match_mbcs(det, commonChars_sjis, ARRAY_SIZE(commonChars_sjis)); 273 } 274 275 const char *CharsetRecog_sjis::getName() const 276 { 277 return "Shift_JIS"; 278 } 279 280 const char *CharsetRecog_sjis::getLanguage() const 281 { 282 return "ja"; 283 } 284 285 CharsetRecog_euc::~CharsetRecog_euc() 286 { 287 // nothing to do 288 } 289 290 UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) { 291 int32_t firstByte = 0; 292 int32_t secondByte = 0; 293 int32_t thirdByte = 0; 294 295 it->index = it->nextIndex; 296 it->error = FALSE; 297 firstByte = it->charValue = it->nextByte(det); 298 299 if (firstByte < 0) { 300 // Ran off the end of the input data 301 return FALSE; 302 } 303 304 if (firstByte <= 0x8D) { 305 // single byte char 306 return TRUE; 307 } 308 309 secondByte = it->nextByte(det); 310 if (secondByte >= 0) { 311 it->charValue = (it->charValue << 8) | secondByte; 312 } 313 // else we'll handle the error later. 314 315 if (firstByte >= 0xA1 && firstByte <= 0xFE) { 316 // Two byte Char 317 if (secondByte < 0xA1) { 318 it->error = TRUE; 319 } 320 321 return TRUE; 322 } 323 324 if (firstByte == 0x8E) { 325 // Code Set 2. 326 // In EUC-JP, total char size is 2 bytes, only one byte of actual char value. 327 // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value. 328 // We don't know which we've got. 329 // Treat it like EUC-JP. If the data really was EUC-TW, the following two 330 // bytes will look like a well formed 2 byte char. 331 if (secondByte < 0xA1) { 332 it->error = TRUE; 333 } 334 335 return TRUE; 336 } 337 338 if (firstByte == 0x8F) { 339 // Code set 3. 340 // Three byte total char size, two bytes of actual char value. 341 thirdByte = it->nextByte(det); 342 it->charValue = (it->charValue << 8) | thirdByte; 343 344 if (thirdByte < 0xa1) { 345 // Bad second byte or ran off the end of the input data with a non-ASCII first byte. 346 it->error = TRUE; 347 } 348 } 349 350 return TRUE; 351 352 } 353 354 CharsetRecog_euc_jp::~CharsetRecog_euc_jp() 355 { 356 // nothing to do 357 } 358 359 const char *CharsetRecog_euc_jp::getName() const 360 { 361 return "EUC-JP"; 362 } 363 364 const char *CharsetRecog_euc_jp::getLanguage() const 365 { 366 return "ja"; 367 } 368 369 int32_t CharsetRecog_euc_jp::match(InputText *det) 370 { 371 return match_mbcs(det, commonChars_euc_jp, ARRAY_SIZE(commonChars_euc_jp)); 372 } 373 374 CharsetRecog_euc_kr::~CharsetRecog_euc_kr() 375 { 376 // nothing to do 377 } 378 379 const char *CharsetRecog_euc_kr::getName() const 380 { 381 return "EUC-KR"; 382 } 383 384 const char *CharsetRecog_euc_kr::getLanguage() const 385 { 386 return "ko"; 387 } 388 389 int32_t CharsetRecog_euc_kr::match(InputText *det) 390 { 391 return match_mbcs(det, commonChars_euc_kr, ARRAY_SIZE(commonChars_euc_kr)); 392 } 393 394 CharsetRecog_big5::~CharsetRecog_big5() 395 { 396 // nothing to do 397 } 398 399 UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) 400 { 401 int32_t firstByte; 402 403 it->index = it->nextIndex; 404 it->error = FALSE; 405 firstByte = it->charValue = it->nextByte(det); 406 407 if (firstByte < 0) { 408 return FALSE; 409 } 410 411 if (firstByte <= 0x7F || firstByte == 0xFF) { 412 // single byte character. 413 return TRUE; 414 } 415 416 int32_t secondByte = it->nextByte(det); 417 if (secondByte >= 0) { 418 it->charValue = (it->charValue << 8) | secondByte; 419 } 420 // else we'll handle the error later. 421 422 if (secondByte < 0x40 || secondByte == 0x7F || secondByte == 0xFF) { 423 it->error = TRUE; 424 } 425 426 return TRUE; 427 } 428 429 const char *CharsetRecog_big5::getName() const 430 { 431 return "Big5"; 432 } 433 434 const char *CharsetRecog_big5::getLanguage() const 435 { 436 return "zh"; 437 } 438 439 int32_t CharsetRecog_big5::match(InputText *det) 440 { 441 return match_mbcs(det, commonChars_big5, ARRAY_SIZE(commonChars_big5)); 442 } 443 444 CharsetRecog_gb_18030::~CharsetRecog_gb_18030() 445 { 446 // nothing to do 447 } 448 449 UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) { 450 int32_t firstByte = 0; 451 int32_t secondByte = 0; 452 int32_t thirdByte = 0; 453 int32_t fourthByte = 0; 454 455 it->index = it->nextIndex; 456 it->error = FALSE; 457 firstByte = it->charValue = it->nextByte(det); 458 459 if (firstByte < 0) { 460 // Ran off the end of the input data 461 return FALSE; 462 } 463 464 if (firstByte <= 0x80) { 465 // single byte char 466 return TRUE; 467 } 468 469 secondByte = it->nextByte(det); 470 if (secondByte >= 0) { 471 it->charValue = (it->charValue << 8) | secondByte; 472 } 473 // else we'll handle the error later. 474 475 if (firstByte >= 0x81 && firstByte <= 0xFE) { 476 // Two byte Char 477 if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) { 478 return TRUE; 479 } 480 481 // Four byte char 482 if (secondByte >= 0x30 && secondByte <= 0x39) { 483 thirdByte = it->nextByte(det); 484 485 if (thirdByte >= 0x81 && thirdByte <= 0xFE) { 486 fourthByte = it->nextByte(det); 487 488 if (fourthByte >= 0x30 && fourthByte <= 0x39) { 489 it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte; 490 491 return TRUE; 492 } 493 } 494 } 495 496 // Something wasn't valid, or we ran out of data (-1). 497 it->error = TRUE; 498 } 499 500 return TRUE; 501 } 502 503 const char *CharsetRecog_gb_18030::getName() const 504 { 505 return "GB18030"; 506 } 507 508 const char *CharsetRecog_gb_18030::getLanguage() const 509 { 510 return "zh"; 511 } 512 513 int32_t CharsetRecog_gb_18030::match(InputText *det) 514 { 515 return match_mbcs(det, commonChars_gb_18030, ARRAY_SIZE(commonChars_gb_18030)); 516 } 517 518 U_NAMESPACE_END 519 #endif 520