Home | History | Annotate | Download | only in i18n
      1 /*
      2  **********************************************************************
      3  *   Copyright (C) 2005-2011, International Business Machines
      4  *   Corporation and others.  All Rights Reserved.
      5  **********************************************************************
      6  */
      7 
      8 #include "unicode/utypes.h"
      9 
     10 #if !UCONFIG_NO_CONVERSION
     11 
     12 #include "csrmbcs.h"
     13 
     14 #include <math.h>
     15 
     16 U_NAMESPACE_BEGIN
     17 
     18 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
     19 
     20 #define min(x,y) (((x)<(y))?(x):(y))
     21 
     22 static const uint16_t commonChars_sjis [] = {
     23 // TODO:  This set of data comes from the character frequency-
     24 //        of-occurence analysis tool.  The data needs to be moved
     25 //        into a resource and loaded from there.
     26 0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
     27 0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
     28 0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
     29 0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
     30 0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
     31 0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
     32 
     33 static const uint16_t commonChars_euc_jp[] = {
     34 // TODO:  This set of data comes from the character frequency-
     35 //        of-occurence analysis tool.  The data needs to be moved
     36 //        into a resource and loaded from there.
     37 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
     38 0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
     39 0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
     40 0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
     41 0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
     42 0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
     43 0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
     44 0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
     45 0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
     46 0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
     47 
     48 static const uint16_t commonChars_euc_kr[] = {
     49 // TODO:  This set of data comes from the character frequency-
     50 //        of-occurence analysis tool.  The data needs to be moved
     51 //        into a resource and loaded from there.
     52 0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
     53 0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
     54 0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
     55 0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
     56 0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
     57 0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
     58 0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
     59 0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
     60 0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
     61 0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
     62 
     63 static const uint16_t commonChars_big5[] = {
     64 // TODO:  This set of data comes from the character frequency-
     65 //        of-occurence analysis tool.  The data needs to be moved
     66 //        into a resource and loaded from there.
     67 0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
     68 0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
     69 0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
     70 0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
     71 0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
     72 0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
     73 0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
     74 0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
     75 0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
     76 0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
     77 
     78 static const uint16_t commonChars_gb_18030[] = {
     79 // TODO:  This set of data comes from the character frequency-
     80 //        of-occurence analysis tool.  The data needs to be moved
     81 //        into a resource and loaded from there.
     82 0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
     83 0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
     84 0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
     85 0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
     86 0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
     87 0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
     88 0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
     89 0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
     90 0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
     91 0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
     92 
     93 static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
     94 {
     95     int32_t start = 0, end = len-1;
     96     int32_t mid = (start+end)/2;
     97 
     98     while(start <= end) {
     99         if(array[mid] == value) {
    100             return mid;
    101         }
    102 
    103         if(array[mid] < value){
    104             start = mid+1;
    105         } else {
    106             end = mid-1;
    107         }
    108 
    109         mid = (start+end)/2;
    110     }
    111 
    112     return -1;
    113 }
    114 
    115 IteratedChar::IteratedChar() :
    116 charValue(0), index(-1), nextIndex(0), error(FALSE), done(FALSE)
    117 {
    118     // nothing else to do.
    119 }
    120 
    121 /*void IteratedChar::reset()
    122 {
    123     charValue = 0;
    124     index     = -1;
    125     nextIndex = 0;
    126     error     = FALSE;
    127     done      = FALSE;
    128 }*/
    129 
    130 int32_t IteratedChar::nextByte(InputText *det)
    131 {
    132     if (nextIndex >= det->fRawLength) {
    133         done = TRUE;
    134 
    135         return -1;
    136     }
    137 
    138     return det->fRawInput[nextIndex++];
    139 }
    140 
    141 CharsetRecog_mbcs::~CharsetRecog_mbcs()
    142 {
    143     // nothing to do.
    144 }
    145 
    146 int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) {
    147     int32_t singleByteCharCount = 0;
    148     int32_t doubleByteCharCount = 0;
    149     int32_t commonCharCount     = 0;
    150     int32_t badCharCount        = 0;
    151     int32_t totalCharCount      = 0;
    152     int32_t confidence          = 0;
    153     IteratedChar iter;
    154 
    155     while (nextChar(&iter, det)) {
    156         totalCharCount++;
    157 
    158         if (iter.error) {
    159             badCharCount++;
    160         } else {
    161             if (iter.charValue <= 0xFF) {
    162                 singleByteCharCount++;
    163             } else {
    164                 doubleByteCharCount++;
    165 
    166                 if (commonChars != 0) {
    167                     if (binarySearch(commonChars, commonCharsLen, iter.charValue) >= 0){
    168                         commonCharCount += 1;
    169                     }
    170                 }
    171             }
    172         }
    173 
    174 
    175         if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
    176             // Bail out early if the byte data is not matching the encoding scheme.
    177             // break detectBlock;
    178             return confidence;
    179         }
    180     }
    181 
    182     if (doubleByteCharCount <= 10 && badCharCount == 0) {
    183         // Not many multi-byte chars.
    184         if (doubleByteCharCount == 0 && totalCharCount < 10) {
    185             // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
    186             // We don't have enough data to have any confidence.
    187             // Statistical analysis of single byte non-ASCII charcters would probably help here.
    188             confidence = 0;
    189         }
    190         else {
    191             //   ASCII or ISO file?  It's probably not our encoding,
    192             //   but is not incompatible with our encoding, so don't give it a zero.
    193             confidence = 10;
    194         }
    195 
    196         return confidence;
    197     }
    198 
    199     //
    200     //  No match if there are too many characters that don't fit the encoding scheme.
    201     //    (should we have zero tolerance for these?)
    202     //
    203     if (doubleByteCharCount < 20*badCharCount) {
    204         confidence = 0;
    205 
    206         return confidence;
    207     }
    208 
    209     if (commonChars == 0) {
    210         // We have no statistics on frequently occuring characters.
    211         //  Assess confidence purely on having a reasonable number of
    212         //  multi-byte characters (the more the better)
    213         confidence = 30 + doubleByteCharCount - 20*badCharCount;
    214 
    215         if (confidence > 100) {
    216             confidence = 100;
    217         }
    218     } else {
    219         //
    220         // Frequency of occurence statistics exist.
    221         //
    222 
    223         double maxVal = log((double)doubleByteCharCount / 4); /*(float)?*/
    224         double scaleFactor = 90.0 / maxVal;
    225         confidence = (int32_t)(log((double)commonCharCount+1) * scaleFactor + 10.0);
    226 
    227         confidence = min(confidence, 100);
    228     }
    229 
    230     if (confidence < 0) {
    231         confidence = 0;
    232     }
    233 
    234     return confidence;
    235 }
    236 
    237 CharsetRecog_sjis::~CharsetRecog_sjis()
    238 {
    239     // nothing to do
    240 }
    241 
    242 UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) {
    243     it->index = it->nextIndex;
    244     it->error = FALSE;
    245 
    246     int32_t firstByte = it->charValue = it->nextByte(det);
    247 
    248     if (firstByte < 0) {
    249         return FALSE;
    250     }
    251 
    252     if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) {
    253         return TRUE;
    254     }
    255 
    256     int32_t secondByte = it->nextByte(det);
    257     if (secondByte >= 0) {
    258         it->charValue = (firstByte << 8) | secondByte;
    259     }
    260     // else we'll handle the error later.
    261 
    262     if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) {
    263         // Illegal second byte value.
    264         it->error = TRUE;
    265     }
    266 
    267     return TRUE;
    268 }
    269 
    270 int32_t CharsetRecog_sjis::match(InputText* det)
    271 {
    272     return match_mbcs(det, commonChars_sjis, ARRAY_SIZE(commonChars_sjis));
    273 }
    274 
    275 const char *CharsetRecog_sjis::getName() const
    276 {
    277     return "Shift_JIS";
    278 }
    279 
    280 const char *CharsetRecog_sjis::getLanguage() const
    281 {
    282     return "ja";
    283 }
    284 
    285 CharsetRecog_euc::~CharsetRecog_euc()
    286 {
    287     // nothing to do
    288 }
    289 
    290 UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) {
    291     int32_t firstByte  = 0;
    292     int32_t secondByte = 0;
    293     int32_t thirdByte  = 0;
    294 
    295     it->index = it->nextIndex;
    296     it->error = FALSE;
    297     firstByte = it->charValue = it->nextByte(det);
    298 
    299     if (firstByte < 0) {
    300         // Ran off the end of the input data
    301         return FALSE;
    302     }
    303 
    304     if (firstByte <= 0x8D) {
    305         // single byte char
    306         return TRUE;
    307     }
    308 
    309     secondByte = it->nextByte(det);
    310     if (secondByte >= 0) {
    311         it->charValue = (it->charValue << 8) | secondByte;
    312     }
    313     // else we'll handle the error later.
    314 
    315     if (firstByte >= 0xA1 && firstByte <= 0xFE) {
    316         // Two byte Char
    317         if (secondByte < 0xA1) {
    318             it->error = TRUE;
    319         }
    320 
    321         return TRUE;
    322     }
    323 
    324     if (firstByte == 0x8E) {
    325         // Code Set 2.
    326         //   In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
    327         //   In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
    328         // We don't know which we've got.
    329         // Treat it like EUC-JP.  If the data really was EUC-TW, the following two
    330         //   bytes will look like a well formed 2 byte char.
    331         if (secondByte < 0xA1) {
    332             it->error = TRUE;
    333         }
    334 
    335         return TRUE;
    336     }
    337 
    338     if (firstByte == 0x8F) {
    339         // Code set 3.
    340         // Three byte total char size, two bytes of actual char value.
    341         thirdByte    = it->nextByte(det);
    342         it->charValue = (it->charValue << 8) | thirdByte;
    343 
    344         if (thirdByte < 0xa1) {
    345             // Bad second byte or ran off the end of the input data with a non-ASCII first byte.
    346             it->error = TRUE;
    347         }
    348     }
    349 
    350     return TRUE;
    351 
    352 }
    353 
    354 CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
    355 {
    356     // nothing to do
    357 }
    358 
    359 const char *CharsetRecog_euc_jp::getName() const
    360 {
    361     return "EUC-JP";
    362 }
    363 
    364 const char *CharsetRecog_euc_jp::getLanguage() const
    365 {
    366     return "ja";
    367 }
    368 
    369 int32_t CharsetRecog_euc_jp::match(InputText *det)
    370 {
    371     return match_mbcs(det, commonChars_euc_jp, ARRAY_SIZE(commonChars_euc_jp));
    372 }
    373 
    374 CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
    375 {
    376     // nothing to do
    377 }
    378 
    379 const char *CharsetRecog_euc_kr::getName() const
    380 {
    381     return "EUC-KR";
    382 }
    383 
    384 const char *CharsetRecog_euc_kr::getLanguage() const
    385 {
    386     return "ko";
    387 }
    388 
    389 int32_t CharsetRecog_euc_kr::match(InputText *det)
    390 {
    391     return match_mbcs(det, commonChars_euc_kr, ARRAY_SIZE(commonChars_euc_kr));
    392 }
    393 
    394 CharsetRecog_big5::~CharsetRecog_big5()
    395 {
    396     // nothing to do
    397 }
    398 
    399 UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det)
    400 {
    401     int32_t firstByte;
    402 
    403     it->index = it->nextIndex;
    404     it->error = FALSE;
    405     firstByte = it->charValue = it->nextByte(det);
    406 
    407     if (firstByte < 0) {
    408         return FALSE;
    409     }
    410 
    411     if (firstByte <= 0x7F || firstByte == 0xFF) {
    412         // single byte character.
    413         return TRUE;
    414     }
    415 
    416     int32_t secondByte = it->nextByte(det);
    417     if (secondByte >= 0)  {
    418         it->charValue = (it->charValue << 8) | secondByte;
    419     }
    420     // else we'll handle the error later.
    421 
    422     if (secondByte < 0x40 || secondByte == 0x7F || secondByte == 0xFF) {
    423         it->error = TRUE;
    424     }
    425 
    426     return TRUE;
    427 }
    428 
    429 const char *CharsetRecog_big5::getName() const
    430 {
    431     return "Big5";
    432 }
    433 
    434 const char *CharsetRecog_big5::getLanguage() const
    435 {
    436     return "zh";
    437 }
    438 
    439 int32_t CharsetRecog_big5::match(InputText *det)
    440 {
    441     return match_mbcs(det, commonChars_big5, ARRAY_SIZE(commonChars_big5));
    442 }
    443 
    444 CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
    445 {
    446     // nothing to do
    447 }
    448 
    449 UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) {
    450     int32_t firstByte  = 0;
    451     int32_t secondByte = 0;
    452     int32_t thirdByte  = 0;
    453     int32_t fourthByte = 0;
    454 
    455     it->index = it->nextIndex;
    456     it->error = FALSE;
    457     firstByte = it->charValue = it->nextByte(det);
    458 
    459     if (firstByte < 0) {
    460         // Ran off the end of the input data
    461         return FALSE;
    462     }
    463 
    464     if (firstByte <= 0x80) {
    465         // single byte char
    466         return TRUE;
    467     }
    468 
    469     secondByte = it->nextByte(det);
    470     if (secondByte >= 0) {
    471         it->charValue = (it->charValue << 8) | secondByte;
    472     }
    473     // else we'll handle the error later.
    474 
    475     if (firstByte >= 0x81 && firstByte <= 0xFE) {
    476         // Two byte Char
    477         if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) {
    478             return TRUE;
    479         }
    480 
    481         // Four byte char
    482         if (secondByte >= 0x30 && secondByte <= 0x39) {
    483             thirdByte = it->nextByte(det);
    484 
    485             if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
    486                 fourthByte = it->nextByte(det);
    487 
    488                 if (fourthByte >= 0x30 && fourthByte <= 0x39) {
    489                     it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte;
    490 
    491                     return TRUE;
    492                 }
    493             }
    494         }
    495 
    496         // Something wasn't valid, or we ran out of data (-1).
    497         it->error = TRUE;
    498     }
    499 
    500     return TRUE;
    501 }
    502 
    503 const char *CharsetRecog_gb_18030::getName() const
    504 {
    505     return "GB18030";
    506 }
    507 
    508 const char *CharsetRecog_gb_18030::getLanguage() const
    509 {
    510     return "zh";
    511 }
    512 
    513 int32_t CharsetRecog_gb_18030::match(InputText *det)
    514 {
    515     return match_mbcs(det, commonChars_gb_18030, ARRAY_SIZE(commonChars_gb_18030));
    516 }
    517 
    518 U_NAMESPACE_END
    519 #endif
    520