Home | History | Annotate | Download | only in i18n
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4  **********************************************************************
      5  *   Copyright (C) 2005-2016, International Business Machines
      6  *   Corporation and others.  All Rights Reserved.
      7  **********************************************************************
      8  */
      9 
     10 #include "unicode/utypes.h"
     11 
     12 #if !UCONFIG_NO_CONVERSION
     13 
     14 #include "cmemory.h"
     15 #include "csmatch.h"
     16 #include "csrmbcs.h"
     17 
     18 #include <math.h>
     19 
     20 U_NAMESPACE_BEGIN
     21 
// Smaller of two values.  This is a function-like macro, so the selected
// argument is evaluated twice: pass only side-effect-free expressions.
#define min(x,y) (((x)<(y))?(x):(y))
     23 
// Two-byte values counted as "common" characters when scoring Shift_JIS input.
// Entries are in ascending order; CharsetRecog_sjis::match() hands this table
// to match_mbcs(), which looks values up with binarySearch().
static const uint16_t commonChars_sjis [] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
     34 
// Two-byte values counted as "common" characters when scoring EUC-JP input.
// Entries are in ascending order; CharsetRecog_euc_jp::match() hands this
// table to match_mbcs(), which looks values up with binarySearch().
static const uint16_t commonChars_euc_jp[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
     49 
// Two-byte values counted as "common" characters when scoring EUC-KR input.
// Entries are in ascending order; CharsetRecog_euc_kr::match() hands this
// table to match_mbcs(), which looks values up with binarySearch().
static const uint16_t commonChars_euc_kr[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
     64 
// Two-byte values counted as "common" characters when scoring Big5 input.
// Entries are in ascending order; CharsetRecog_big5::match() hands this
// table to match_mbcs(), which looks values up with binarySearch().
static const uint16_t commonChars_big5[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
     79 
// Two-byte values counted as "common" characters when scoring GB18030 input.
// Entries are in ascending order; CharsetRecog_gb_18030::match() hands this
// table to match_mbcs(), which looks values up with binarySearch().
static const uint16_t commonChars_gb_18030[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
     94 
     95 static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
     96 {
     97     int32_t start = 0, end = len-1;
     98     int32_t mid = (start+end)/2;
     99 
    100     while(start <= end) {
    101         if(array[mid] == value) {
    102             return mid;
    103         }
    104 
    105         if(array[mid] < value){
    106             start = mid+1;
    107         } else {
    108             end = mid-1;
    109         }
    110 
    111         mid = (start+end)/2;
    112     }
    113 
    114     return -1;
    115 }
    116 
    117 IteratedChar::IteratedChar() :
    118 charValue(0), index(-1), nextIndex(0), error(FALSE), done(FALSE)
    119 {
    120     // nothing else to do.
    121 }
    122 
    123 /*void IteratedChar::reset()
    124 {
    125     charValue = 0;
    126     index     = -1;
    127     nextIndex = 0;
    128     error     = FALSE;
    129     done      = FALSE;
    130 }*/
    131 
    132 int32_t IteratedChar::nextByte(InputText *det)
    133 {
    134     if (nextIndex >= det->fRawLength) {
    135         done = TRUE;
    136 
    137         return -1;
    138     }
    139 
    140     return det->fRawInput[nextIndex++];
    141 }
    142 
// Destructor.  The body is intentionally empty.
CharsetRecog_mbcs::~CharsetRecog_mbcs()
{
    // nothing to do.
}
    147 
    148 int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) const {
    149     int32_t singleByteCharCount = 0;
    150     int32_t doubleByteCharCount = 0;
    151     int32_t commonCharCount     = 0;
    152     int32_t badCharCount        = 0;
    153     int32_t totalCharCount      = 0;
    154     int32_t confidence          = 0;
    155     IteratedChar iter;
    156 
    157     while (nextChar(&iter, det)) {
    158         totalCharCount++;
    159 
    160         if (iter.error) {
    161             badCharCount++;
    162         } else {
    163             if (iter.charValue <= 0xFF) {
    164                 singleByteCharCount++;
    165             } else {
    166                 doubleByteCharCount++;
    167 
    168                 if (commonChars != 0) {
    169                     if (binarySearch(commonChars, commonCharsLen, iter.charValue) >= 0){
    170                         commonCharCount += 1;
    171                     }
    172                 }
    173             }
    174         }
    175 
    176 
    177         if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
    178             // Bail out early if the byte data is not matching the encoding scheme.
    179             // break detectBlock;
    180             return confidence;
    181         }
    182     }
    183 
    184     if (doubleByteCharCount <= 10 && badCharCount == 0) {
    185         // Not many multi-byte chars.
    186         if (doubleByteCharCount == 0 && totalCharCount < 10) {
    187             // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
    188             // We don't have enough data to have any confidence.
    189             // Statistical analysis of single byte non-ASCII charcters would probably help here.
    190             confidence = 0;
    191         }
    192         else {
    193             //   ASCII or ISO file?  It's probably not our encoding,
    194             //   but is not incompatible with our encoding, so don't give it a zero.
    195             confidence = 10;
    196         }
    197 
    198         return confidence;
    199     }
    200 
    201     //
    202     //  No match if there are too many characters that don't fit the encoding scheme.
    203     //    (should we have zero tolerance for these?)
    204     //
    205     if (doubleByteCharCount < 20*badCharCount) {
    206         confidence = 0;
    207 
    208         return confidence;
    209     }
    210 
    211     if (commonChars == 0) {
    212         // We have no statistics on frequently occuring characters.
    213         //  Assess confidence purely on having a reasonable number of
    214         //  multi-byte characters (the more the better)
    215         confidence = 30 + doubleByteCharCount - 20*badCharCount;
    216 
    217         if (confidence > 100) {
    218             confidence = 100;
    219         }
    220     } else {
    221         //
    222         // Frequency of occurence statistics exist.
    223         //
    224 
    225         double maxVal = log((double)doubleByteCharCount / 4); /*(float)?*/
    226         double scaleFactor = 90.0 / maxVal;
    227         confidence = (int32_t)(log((double)commonCharCount+1) * scaleFactor + 10.0);
    228 
    229         confidence = min(confidence, 100);
    230     }
    231 
    232     if (confidence < 0) {
    233         confidence = 0;
    234     }
    235 
    236     return confidence;
    237 }
    238 
// Destructor.  The body is intentionally empty.
CharsetRecog_sjis::~CharsetRecog_sjis()
{
    // nothing to do
}
    243 
    244 UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) const {
    245     it->index = it->nextIndex;
    246     it->error = FALSE;
    247 
    248     int32_t firstByte = it->charValue = it->nextByte(det);
    249 
    250     if (firstByte < 0) {
    251         return FALSE;
    252     }
    253 
    254     if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) {
    255         return TRUE;
    256     }
    257 
    258     int32_t secondByte = it->nextByte(det);
    259     if (secondByte >= 0) {
    260         it->charValue = (firstByte << 8) | secondByte;
    261     }
    262     // else we'll handle the error later.
    263 
    264     if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) {
    265         // Illegal second byte value.
    266         it->error = TRUE;
    267     }
    268 
    269     return TRUE;
    270 }
    271 
    272 UBool CharsetRecog_sjis::match(InputText* det, CharsetMatch *results) const {
    273     int32_t confidence = match_mbcs(det, commonChars_sjis, UPRV_LENGTHOF(commonChars_sjis));
    274     results->set(det, this, confidence);
    275     return (confidence > 0);
    276 }
    277 
// Returns the fixed string "Shift_JIS" naming this recognizer's charset.
const char *CharsetRecog_sjis::getName() const
{
    return "Shift_JIS";
}
    282 
// Returns the fixed language string "ja" for this recognizer.
const char *CharsetRecog_sjis::getLanguage() const
{
    return "ja";
}
    287 
// Destructor.  The body is intentionally empty.
CharsetRecog_euc::~CharsetRecog_euc()
{
    // nothing to do
}
    292 
    293 UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) const {
    294     int32_t firstByte  = 0;
    295     int32_t secondByte = 0;
    296     int32_t thirdByte  = 0;
    297 
    298     it->index = it->nextIndex;
    299     it->error = FALSE;
    300     firstByte = it->charValue = it->nextByte(det);
    301 
    302     if (firstByte < 0) {
    303         // Ran off the end of the input data
    304         return FALSE;
    305     }
    306 
    307     if (firstByte <= 0x8D) {
    308         // single byte char
    309         return TRUE;
    310     }
    311 
    312     secondByte = it->nextByte(det);
    313     if (secondByte >= 0) {
    314         it->charValue = (it->charValue << 8) | secondByte;
    315     }
    316     // else we'll handle the error later.
    317 
    318     if (firstByte >= 0xA1 && firstByte <= 0xFE) {
    319         // Two byte Char
    320         if (secondByte < 0xA1) {
    321             it->error = TRUE;
    322         }
    323 
    324         return TRUE;
    325     }
    326 
    327     if (firstByte == 0x8E) {
    328         // Code Set 2.
    329         //   In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
    330         //   In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
    331         // We don't know which we've got.
    332         // Treat it like EUC-JP.  If the data really was EUC-TW, the following two
    333         //   bytes will look like a well formed 2 byte char.
    334         if (secondByte < 0xA1) {
    335             it->error = TRUE;
    336         }
    337 
    338         return TRUE;
    339     }
    340 
    341     if (firstByte == 0x8F) {
    342         // Code set 3.
    343         // Three byte total char size, two bytes of actual char value.
    344         thirdByte    = it->nextByte(det);
    345         it->charValue = (it->charValue << 8) | thirdByte;
    346 
    347         if (thirdByte < 0xa1) {
    348             // Bad second byte or ran off the end of the input data with a non-ASCII first byte.
    349             it->error = TRUE;
    350         }
    351     }
    352 
    353     return TRUE;
    354 
    355 }
    356 
// Destructor.  The body is intentionally empty.
CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
{
    // nothing to do
}
    361 
// Returns the fixed string "EUC-JP" naming this recognizer's charset.
const char *CharsetRecog_euc_jp::getName() const
{
    return "EUC-JP";
}
    366 
// Returns the fixed language string "ja" for this recognizer.
const char *CharsetRecog_euc_jp::getLanguage() const
{
    return "ja";
}
    371 
    372 UBool CharsetRecog_euc_jp::match(InputText *det, CharsetMatch *results) const
    373 {
    374     int32_t confidence = match_mbcs(det, commonChars_euc_jp, UPRV_LENGTHOF(commonChars_euc_jp));
    375     results->set(det, this, confidence);
    376     return (confidence > 0);
    377 }
    378 
// Destructor.  The body is intentionally empty.
CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
{
    // nothing to do
}
    383 
// Returns the fixed string "EUC-KR" naming this recognizer's charset.
const char *CharsetRecog_euc_kr::getName() const
{
    return "EUC-KR";
}
    388 
// Returns the fixed language string "ko" for this recognizer.
const char *CharsetRecog_euc_kr::getLanguage() const
{
    return "ko";
}
    393 
    394 UBool CharsetRecog_euc_kr::match(InputText *det, CharsetMatch *results) const
    395 {
    396     int32_t confidence =  match_mbcs(det, commonChars_euc_kr, UPRV_LENGTHOF(commonChars_euc_kr));
    397     results->set(det, this, confidence);
    398     return (confidence > 0);
    399 }
    400 
// Destructor.  The body is intentionally empty.
CharsetRecog_big5::~CharsetRecog_big5()
{
    // nothing to do
}
    405 
    406 UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) const
    407 {
    408     int32_t firstByte;
    409 
    410     it->index = it->nextIndex;
    411     it->error = FALSE;
    412     firstByte = it->charValue = it->nextByte(det);
    413 
    414     if (firstByte < 0) {
    415         return FALSE;
    416     }
    417 
    418     if (firstByte <= 0x7F || firstByte == 0xFF) {
    419         // single byte character.
    420         return TRUE;
    421     }
    422 
    423     int32_t secondByte = it->nextByte(det);
    424     if (secondByte >= 0)  {
    425         it->charValue = (it->charValue << 8) | secondByte;
    426     }
    427     // else we'll handle the error later.
    428 
    429     if (secondByte < 0x40 || secondByte == 0x7F || secondByte == 0xFF) {
    430         it->error = TRUE;
    431     }
    432 
    433     return TRUE;
    434 }
    435 
// Returns the fixed string "Big5" naming this recognizer's charset.
const char *CharsetRecog_big5::getName() const
{
    return "Big5";
}
    440 
// Returns the fixed language string "zh" for this recognizer.
const char *CharsetRecog_big5::getLanguage() const
{
    return "zh";
}
    445 
    446 UBool CharsetRecog_big5::match(InputText *det, CharsetMatch *results) const
    447 {
    448     int32_t confidence = match_mbcs(det, commonChars_big5, UPRV_LENGTHOF(commonChars_big5));
    449     results->set(det, this, confidence);
    450     return (confidence > 0);
    451 }
    452 
// Destructor.  The body is intentionally empty.
CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
{
    // nothing to do
}
    457 
    458 UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) const {
    459     int32_t firstByte  = 0;
    460     int32_t secondByte = 0;
    461     int32_t thirdByte  = 0;
    462     int32_t fourthByte = 0;
    463 
    464     it->index = it->nextIndex;
    465     it->error = FALSE;
    466     firstByte = it->charValue = it->nextByte(det);
    467 
    468     if (firstByte < 0) {
    469         // Ran off the end of the input data
    470         return FALSE;
    471     }
    472 
    473     if (firstByte <= 0x80) {
    474         // single byte char
    475         return TRUE;
    476     }
    477 
    478     secondByte = it->nextByte(det);
    479     if (secondByte >= 0) {
    480         it->charValue = (it->charValue << 8) | secondByte;
    481     }
    482     // else we'll handle the error later.
    483 
    484     if (firstByte >= 0x81 && firstByte <= 0xFE) {
    485         // Two byte Char
    486         if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) {
    487             return TRUE;
    488         }
    489 
    490         // Four byte char
    491         if (secondByte >= 0x30 && secondByte <= 0x39) {
    492             thirdByte = it->nextByte(det);
    493 
    494             if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
    495                 fourthByte = it->nextByte(det);
    496 
    497                 if (fourthByte >= 0x30 && fourthByte <= 0x39) {
    498                     it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte;
    499 
    500                     return TRUE;
    501                 }
    502             }
    503         }
    504 
    505         // Something wasn't valid, or we ran out of data (-1).
    506         it->error = TRUE;
    507     }
    508 
    509     return TRUE;
    510 }
    511 
// Returns the fixed string "GB18030" naming this recognizer's charset.
const char *CharsetRecog_gb_18030::getName() const
{
    return "GB18030";
}
    516 
// Returns the fixed language string "zh" for this recognizer.
const char *CharsetRecog_gb_18030::getLanguage() const
{
    return "zh";
}
    521 
    522 UBool CharsetRecog_gb_18030::match(InputText *det, CharsetMatch *results) const
    523 {
    524     int32_t confidence = match_mbcs(det, commonChars_gb_18030, UPRV_LENGTHOF(commonChars_gb_18030));
    525     results->set(det, this, confidence);
    526     return (confidence > 0);
    527 }
    528 
    529 U_NAMESPACE_END
    530 #endif
    531