1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (C) 2005-2016, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ********************************************************************** 8 */ 9 10 #include "unicode/utypes.h" 11 12 #if !UCONFIG_NO_CONVERSION 13 14 #include "cmemory.h" 15 #include "csmatch.h" 16 #include "csrmbcs.h" 17 18 #include <math.h> 19 20 U_NAMESPACE_BEGIN 21 22 #define min(x,y) (((x)<(y))?(x):(y)) 23 24 static const uint16_t commonChars_sjis [] = { 25 // TODO: This set of data comes from the character frequency- 26 // of-occurence analysis tool. The data needs to be moved 27 // into a resource and loaded from there. 28 0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0, 29 0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5, 30 0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc, 31 0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341, 32 0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389, 33 0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa}; 34 35 static const uint16_t commonChars_euc_jp[] = { 36 // TODO: This set of data comes from the character frequency- 37 // of-occurence analysis tool. The data needs to be moved 38 // into a resource and loaded from there. 39 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2, 40 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3, 41 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4, 42 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de, 43 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef, 44 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af, 45 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7, 46 0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1, 47 0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee, 48 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1}; 49 50 static const uint16_t commonChars_euc_kr[] = { 51 // TODO: This set of data comes from the character frequency- 52 // of-occurence analysis tool. The data needs to be moved 53 // into a resource and loaded from there. 54 0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc, 55 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9, 56 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce, 57 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce, 58 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba, 59 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee, 60 0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7, 61 0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6, 62 0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6, 63 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad}; 64 65 static const uint16_t commonChars_big5[] = { 66 // TODO: This set of data comes from the character frequency- 67 // of-occurence analysis tool. The data needs to be moved 68 // into a resource and loaded from there. 69 0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446, 70 0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3, 71 0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548, 72 0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8, 73 0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da, 74 0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3, 75 0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59, 76 0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c, 77 0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44, 78 0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f}; 79 80 static const uint16_t commonChars_gb_18030[] = { 81 // TODO: This set of data comes from the character frequency- 82 // of-occurence analysis tool. The data needs to be moved 83 // into a resource and loaded from there. 84 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac, 85 0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4, 86 0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4, 87 0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6, 88 0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6, 89 0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7, 90 0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7, 91 0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5, 92 0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2, 93 0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0}; 94 95 static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value) 96 { 97 int32_t start = 0, end = len-1; 98 int32_t mid = (start+end)/2; 99 100 while(start <= end) { 101 if(array[mid] == value) { 102 return mid; 103 } 104 105 if(array[mid] < value){ 106 start = mid+1; 107 } else { 108 end = mid-1; 109 } 110 111 mid = (start+end)/2; 112 } 113 114 return -1; 115 } 116 117 IteratedChar::IteratedChar() : 118 charValue(0), index(-1), nextIndex(0), error(FALSE), done(FALSE) 119 { 120 // nothing else to do. 121 } 122 123 /*void IteratedChar::reset() 124 { 125 charValue = 0; 126 index = -1; 127 nextIndex = 0; 128 error = FALSE; 129 done = FALSE; 130 }*/ 131 132 int32_t IteratedChar::nextByte(InputText *det) 133 { 134 if (nextIndex >= det->fRawLength) { 135 done = TRUE; 136 137 return -1; 138 } 139 140 return det->fRawInput[nextIndex++]; 141 } 142 143 CharsetRecog_mbcs::~CharsetRecog_mbcs() 144 { 145 // nothing to do. 146 } 147 148 int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) const { 149 int32_t singleByteCharCount = 0; 150 int32_t doubleByteCharCount = 0; 151 int32_t commonCharCount = 0; 152 int32_t badCharCount = 0; 153 int32_t totalCharCount = 0; 154 int32_t confidence = 0; 155 IteratedChar iter; 156 157 while (nextChar(&iter, det)) { 158 totalCharCount++; 159 160 if (iter.error) { 161 badCharCount++; 162 } else { 163 if (iter.charValue <= 0xFF) { 164 singleByteCharCount++; 165 } else { 166 doubleByteCharCount++; 167 168 if (commonChars != 0) { 169 if (binarySearch(commonChars, commonCharsLen, iter.charValue) >= 0){ 170 commonCharCount += 1; 171 } 172 } 173 } 174 } 175 176 177 if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) { 178 // Bail out early if the byte data is not matching the encoding scheme. 179 // break detectBlock; 180 return confidence; 181 } 182 } 183 184 if (doubleByteCharCount <= 10 && badCharCount == 0) { 185 // Not many multi-byte chars. 186 if (doubleByteCharCount == 0 && totalCharCount < 10) { 187 // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes. 188 // We don't have enough data to have any confidence. 189 // Statistical analysis of single byte non-ASCII charcters would probably help here. 190 confidence = 0; 191 } 192 else { 193 // ASCII or ISO file? It's probably not our encoding, 194 // but is not incompatible with our encoding, so don't give it a zero. 195 confidence = 10; 196 } 197 198 return confidence; 199 } 200 201 // 202 // No match if there are too many characters that don't fit the encoding scheme. 203 // (should we have zero tolerance for these?) 204 // 205 if (doubleByteCharCount < 20*badCharCount) { 206 confidence = 0; 207 208 return confidence; 209 } 210 211 if (commonChars == 0) { 212 // We have no statistics on frequently occuring characters. 213 // Assess confidence purely on having a reasonable number of 214 // multi-byte characters (the more the better) 215 confidence = 30 + doubleByteCharCount - 20*badCharCount; 216 217 if (confidence > 100) { 218 confidence = 100; 219 } 220 } else { 221 // 222 // Frequency of occurence statistics exist. 223 // 224 225 double maxVal = log((double)doubleByteCharCount / 4); /*(float)?*/ 226 double scaleFactor = 90.0 / maxVal; 227 confidence = (int32_t)(log((double)commonCharCount+1) * scaleFactor + 10.0); 228 229 confidence = min(confidence, 100); 230 } 231 232 if (confidence < 0) { 233 confidence = 0; 234 } 235 236 return confidence; 237 } 238 239 CharsetRecog_sjis::~CharsetRecog_sjis() 240 { 241 // nothing to do 242 } 243 244 UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) const { 245 it->index = it->nextIndex; 246 it->error = FALSE; 247 248 int32_t firstByte = it->charValue = it->nextByte(det); 249 250 if (firstByte < 0) { 251 return FALSE; 252 } 253 254 if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) { 255 return TRUE; 256 } 257 258 int32_t secondByte = it->nextByte(det); 259 if (secondByte >= 0) { 260 it->charValue = (firstByte << 8) | secondByte; 261 } 262 // else we'll handle the error later. 263 264 if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) { 265 // Illegal second byte value. 266 it->error = TRUE; 267 } 268 269 return TRUE; 270 } 271 272 UBool CharsetRecog_sjis::match(InputText* det, CharsetMatch *results) const { 273 int32_t confidence = match_mbcs(det, commonChars_sjis, UPRV_LENGTHOF(commonChars_sjis)); 274 results->set(det, this, confidence); 275 return (confidence > 0); 276 } 277 278 const char *CharsetRecog_sjis::getName() const 279 { 280 return "Shift_JIS"; 281 } 282 283 const char *CharsetRecog_sjis::getLanguage() const 284 { 285 return "ja"; 286 } 287 288 CharsetRecog_euc::~CharsetRecog_euc() 289 { 290 // nothing to do 291 } 292 293 UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) const { 294 int32_t firstByte = 0; 295 int32_t secondByte = 0; 296 int32_t thirdByte = 0; 297 298 it->index = it->nextIndex; 299 it->error = FALSE; 300 firstByte = it->charValue = it->nextByte(det); 301 302 if (firstByte < 0) { 303 // Ran off the end of the input data 304 return FALSE; 305 } 306 307 if (firstByte <= 0x8D) { 308 // single byte char 309 return TRUE; 310 } 311 312 secondByte = it->nextByte(det); 313 if (secondByte >= 0) { 314 it->charValue = (it->charValue << 8) | secondByte; 315 } 316 // else we'll handle the error later. 317 318 if (firstByte >= 0xA1 && firstByte <= 0xFE) { 319 // Two byte Char 320 if (secondByte < 0xA1) { 321 it->error = TRUE; 322 } 323 324 return TRUE; 325 } 326 327 if (firstByte == 0x8E) { 328 // Code Set 2. 329 // In EUC-JP, total char size is 2 bytes, only one byte of actual char value. 330 // In EUC-TW, total char size is 4 bytes, three bytes contribute to char value. 331 // We don't know which we've got. 332 // Treat it like EUC-JP. If the data really was EUC-TW, the following two 333 // bytes will look like a well formed 2 byte char. 334 if (secondByte < 0xA1) { 335 it->error = TRUE; 336 } 337 338 return TRUE; 339 } 340 341 if (firstByte == 0x8F) { 342 // Code set 3. 343 // Three byte total char size, two bytes of actual char value. 344 thirdByte = it->nextByte(det); 345 it->charValue = (it->charValue << 8) | thirdByte; 346 347 if (thirdByte < 0xa1) { 348 // Bad second byte or ran off the end of the input data with a non-ASCII first byte. 349 it->error = TRUE; 350 } 351 } 352 353 return TRUE; 354 355 } 356 357 CharsetRecog_euc_jp::~CharsetRecog_euc_jp() 358 { 359 // nothing to do 360 } 361 362 const char *CharsetRecog_euc_jp::getName() const 363 { 364 return "EUC-JP"; 365 } 366 367 const char *CharsetRecog_euc_jp::getLanguage() const 368 { 369 return "ja"; 370 } 371 372 UBool CharsetRecog_euc_jp::match(InputText *det, CharsetMatch *results) const 373 { 374 int32_t confidence = match_mbcs(det, commonChars_euc_jp, UPRV_LENGTHOF(commonChars_euc_jp)); 375 results->set(det, this, confidence); 376 return (confidence > 0); 377 } 378 379 CharsetRecog_euc_kr::~CharsetRecog_euc_kr() 380 { 381 // nothing to do 382 } 383 384 const char *CharsetRecog_euc_kr::getName() const 385 { 386 return "EUC-KR"; 387 } 388 389 const char *CharsetRecog_euc_kr::getLanguage() const 390 { 391 return "ko"; 392 } 393 394 UBool CharsetRecog_euc_kr::match(InputText *det, CharsetMatch *results) const 395 { 396 int32_t confidence = match_mbcs(det, commonChars_euc_kr, UPRV_LENGTHOF(commonChars_euc_kr)); 397 results->set(det, this, confidence); 398 return (confidence > 0); 399 } 400 401 CharsetRecog_big5::~CharsetRecog_big5() 402 { 403 // nothing to do 404 } 405 406 UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) const 407 { 408 int32_t firstByte; 409 410 it->index = it->nextIndex; 411 it->error = FALSE; 412 firstByte = it->charValue = it->nextByte(det); 413 414 if (firstByte < 0) { 415 return FALSE; 416 } 417 418 if (firstByte <= 0x7F || firstByte == 0xFF) { 419 // single byte character. 420 return TRUE; 421 } 422 423 int32_t secondByte = it->nextByte(det); 424 if (secondByte >= 0) { 425 it->charValue = (it->charValue << 8) | secondByte; 426 } 427 // else we'll handle the error later. 428 429 if (secondByte < 0x40 || secondByte == 0x7F || secondByte == 0xFF) { 430 it->error = TRUE; 431 } 432 433 return TRUE; 434 } 435 436 const char *CharsetRecog_big5::getName() const 437 { 438 return "Big5"; 439 } 440 441 const char *CharsetRecog_big5::getLanguage() const 442 { 443 return "zh"; 444 } 445 446 UBool CharsetRecog_big5::match(InputText *det, CharsetMatch *results) const 447 { 448 int32_t confidence = match_mbcs(det, commonChars_big5, UPRV_LENGTHOF(commonChars_big5)); 449 results->set(det, this, confidence); 450 return (confidence > 0); 451 } 452 453 CharsetRecog_gb_18030::~CharsetRecog_gb_18030() 454 { 455 // nothing to do 456 } 457 458 UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) const { 459 int32_t firstByte = 0; 460 int32_t secondByte = 0; 461 int32_t thirdByte = 0; 462 int32_t fourthByte = 0; 463 464 it->index = it->nextIndex; 465 it->error = FALSE; 466 firstByte = it->charValue = it->nextByte(det); 467 468 if (firstByte < 0) { 469 // Ran off the end of the input data 470 return FALSE; 471 } 472 473 if (firstByte <= 0x80) { 474 // single byte char 475 return TRUE; 476 } 477 478 secondByte = it->nextByte(det); 479 if (secondByte >= 0) { 480 it->charValue = (it->charValue << 8) | secondByte; 481 } 482 // else we'll handle the error later. 483 484 if (firstByte >= 0x81 && firstByte <= 0xFE) { 485 // Two byte Char 486 if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) { 487 return TRUE; 488 } 489 490 // Four byte char 491 if (secondByte >= 0x30 && secondByte <= 0x39) { 492 thirdByte = it->nextByte(det); 493 494 if (thirdByte >= 0x81 && thirdByte <= 0xFE) { 495 fourthByte = it->nextByte(det); 496 497 if (fourthByte >= 0x30 && fourthByte <= 0x39) { 498 it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte; 499 500 return TRUE; 501 } 502 } 503 } 504 505 // Something wasn't valid, or we ran out of data (-1). 506 it->error = TRUE; 507 } 508 509 return TRUE; 510 } 511 512 const char *CharsetRecog_gb_18030::getName() const 513 { 514 return "GB18030"; 515 } 516 517 const char *CharsetRecog_gb_18030::getLanguage() const 518 { 519 return "zh"; 520 } 521 522 UBool CharsetRecog_gb_18030::match(InputText *det, CharsetMatch *results) const 523 { 524 int32_t confidence = match_mbcs(det, commonChars_gb_18030, UPRV_LENGTHOF(commonChars_gb_18030)); 525 results->set(det, this, confidence); 526 return (confidence > 0); 527 } 528 529 U_NAMESPACE_END 530 #endif 531