Home | History | Annotate | Download | only in i18n
      1 /*
      2  **********************************************************************
      3  *   Copyright (C) 2005-2012, International Business Machines
      4  *   Corporation and others.  All Rights Reserved.
      5  **********************************************************************
      6  */
      7 
      8 #include "unicode/utypes.h"
      9 
     10 #if !UCONFIG_NO_CONVERSION
     11 
     12 #include "csmatch.h"
     13 #include "csrmbcs.h"
     14 
     15 #include <math.h>
     16 
     17 U_NAMESPACE_BEGIN
     18 
     19 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
     20 
     21 #define min(x,y) (((x)<(y))?(x):(y))
     22 
// Frequently occurring two-byte Shift_JIS code values, each stored as
// (lead byte << 8) | trail byte. The entries are in ascending order, which
// binarySearch() relies on when this table is searched.
static const uint16_t commonChars_sjis [] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0x8140, 0x8141, 0x8142, 0x8145, 0x815b, 0x8169, 0x816a, 0x8175, 0x8176, 0x82a0,
0x82a2, 0x82a4, 0x82a9, 0x82aa, 0x82ab, 0x82ad, 0x82af, 0x82b1, 0x82b3, 0x82b5,
0x82b7, 0x82bd, 0x82be, 0x82c1, 0x82c4, 0x82c5, 0x82c6, 0x82c8, 0x82c9, 0x82cc,
0x82cd, 0x82dc, 0x82e0, 0x82e7, 0x82e8, 0x82e9, 0x82ea, 0x82f0, 0x82f1, 0x8341,
0x8343, 0x834e, 0x834f, 0x8358, 0x835e, 0x8362, 0x8367, 0x8375, 0x8376, 0x8389,
0x838a, 0x838b, 0x838d, 0x8393, 0x8e96, 0x93fa, 0x95aa};
     33 
// Frequently occurring two-byte EUC-JP code values, each stored as
// (lead byte << 8) | trail byte. The entries are in ascending order, which
// binarySearch() relies on when this table is searched.
static const uint16_t commonChars_euc_jp[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xa1a1, 0xa1a2, 0xa1a3, 0xa1a6, 0xa1bc, 0xa1ca, 0xa1cb, 0xa1d6, 0xa1d7, 0xa4a2,
0xa4a4, 0xa4a6, 0xa4a8, 0xa4aa, 0xa4ab, 0xa4ac, 0xa4ad, 0xa4af, 0xa4b1, 0xa4b3,
0xa4b5, 0xa4b7, 0xa4b9, 0xa4bb, 0xa4bd, 0xa4bf, 0xa4c0, 0xa4c1, 0xa4c3, 0xa4c4,
0xa4c6, 0xa4c7, 0xa4c8, 0xa4c9, 0xa4ca, 0xa4cb, 0xa4ce, 0xa4cf, 0xa4d0, 0xa4de,
0xa4df, 0xa4e1, 0xa4e2, 0xa4e4, 0xa4e8, 0xa4e9, 0xa4ea, 0xa4eb, 0xa4ec, 0xa4ef,
0xa4f2, 0xa4f3, 0xa5a2, 0xa5a3, 0xa5a4, 0xa5a6, 0xa5a7, 0xa5aa, 0xa5ad, 0xa5af,
0xa5b0, 0xa5b3, 0xa5b5, 0xa5b7, 0xa5b8, 0xa5b9, 0xa5bf, 0xa5c3, 0xa5c6, 0xa5c7,
0xa5c8, 0xa5c9, 0xa5cb, 0xa5d0, 0xa5d5, 0xa5d6, 0xa5d7, 0xa5de, 0xa5e0, 0xa5e1,
0xa5e5, 0xa5e9, 0xa5ea, 0xa5eb, 0xa5ec, 0xa5ed, 0xa5f3, 0xb8a9, 0xb9d4, 0xbaee,
0xbbc8, 0xbef0, 0xbfb7, 0xc4ea, 0xc6fc, 0xc7bd, 0xcab8, 0xcaf3, 0xcbdc, 0xcdd1};
     48 
// Frequently occurring two-byte EUC-KR code values, each stored as
// (lead byte << 8) | trail byte. The entries are in ascending order, which
// binarySearch() relies on when this table is searched.
static const uint16_t commonChars_euc_kr[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xb0a1, 0xb0b3, 0xb0c5, 0xb0cd, 0xb0d4, 0xb0e6, 0xb0ed, 0xb0f8, 0xb0fa, 0xb0fc,
0xb1b8, 0xb1b9, 0xb1c7, 0xb1d7, 0xb1e2, 0xb3aa, 0xb3bb, 0xb4c2, 0xb4cf, 0xb4d9,
0xb4eb, 0xb5a5, 0xb5b5, 0xb5bf, 0xb5c7, 0xb5e9, 0xb6f3, 0xb7af, 0xb7c2, 0xb7ce,
0xb8a6, 0xb8ae, 0xb8b6, 0xb8b8, 0xb8bb, 0xb8e9, 0xb9ab, 0xb9ae, 0xb9cc, 0xb9ce,
0xb9fd, 0xbab8, 0xbace, 0xbad0, 0xbaf1, 0xbbe7, 0xbbf3, 0xbbfd, 0xbcad, 0xbcba,
0xbcd2, 0xbcf6, 0xbdba, 0xbdc0, 0xbdc3, 0xbdc5, 0xbec6, 0xbec8, 0xbedf, 0xbeee,
0xbef8, 0xbefa, 0xbfa1, 0xbfa9, 0xbfc0, 0xbfe4, 0xbfeb, 0xbfec, 0xbff8, 0xc0a7,
0xc0af, 0xc0b8, 0xc0ba, 0xc0bb, 0xc0bd, 0xc0c7, 0xc0cc, 0xc0ce, 0xc0cf, 0xc0d6,
0xc0da, 0xc0e5, 0xc0fb, 0xc0fc, 0xc1a4, 0xc1a6, 0xc1b6, 0xc1d6, 0xc1df, 0xc1f6,
0xc1f8, 0xc4a1, 0xc5cd, 0xc6ae, 0xc7cf, 0xc7d1, 0xc7d2, 0xc7d8, 0xc7e5, 0xc8ad};
     63 
// Frequently occurring two-byte Big5 code values, each stored as
// (lead byte << 8) | trail byte. The entries are in ascending order, which
// binarySearch() relies on when this table is searched.
static const uint16_t commonChars_big5[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xa140, 0xa141, 0xa142, 0xa143, 0xa147, 0xa149, 0xa175, 0xa176, 0xa440, 0xa446,
0xa447, 0xa448, 0xa451, 0xa454, 0xa457, 0xa464, 0xa46a, 0xa46c, 0xa477, 0xa4a3,
0xa4a4, 0xa4a7, 0xa4c1, 0xa4ce, 0xa4d1, 0xa4df, 0xa4e8, 0xa4fd, 0xa540, 0xa548,
0xa558, 0xa569, 0xa5cd, 0xa5e7, 0xa657, 0xa661, 0xa662, 0xa668, 0xa670, 0xa6a8,
0xa6b3, 0xa6b9, 0xa6d3, 0xa6db, 0xa6e6, 0xa6f2, 0xa740, 0xa751, 0xa759, 0xa7da,
0xa8a3, 0xa8a5, 0xa8ad, 0xa8d1, 0xa8d3, 0xa8e4, 0xa8fc, 0xa9c0, 0xa9d2, 0xa9f3,
0xaa6b, 0xaaba, 0xaabe, 0xaacc, 0xaafc, 0xac47, 0xac4f, 0xacb0, 0xacd2, 0xad59,
0xaec9, 0xafe0, 0xb0ea, 0xb16f, 0xb2b3, 0xb2c4, 0xb36f, 0xb44c, 0xb44e, 0xb54c,
0xb5a5, 0xb5bd, 0xb5d0, 0xb5d8, 0xb671, 0xb7ed, 0xb867, 0xb944, 0xbad8, 0xbb44,
0xbba1, 0xbdd1, 0xc2c4, 0xc3b9, 0xc440, 0xc45f};
     78 
// Frequently occurring two-byte GB18030 code values, each stored as
// (lead byte << 8) | trail byte. The entries are in ascending order, which
// binarySearch() relies on when this table is searched.
static const uint16_t commonChars_gb_18030[] = {
// TODO:  This set of data comes from the character frequency-
//        of-occurrence analysis tool.  The data needs to be moved
//        into a resource and loaded from there.
0xa1a1, 0xa1a2, 0xa1a3, 0xa1a4, 0xa1b0, 0xa1b1, 0xa1f1, 0xa1f3, 0xa3a1, 0xa3ac,
0xa3ba, 0xb1a8, 0xb1b8, 0xb1be, 0xb2bb, 0xb3c9, 0xb3f6, 0xb4f3, 0xb5bd, 0xb5c4,
0xb5e3, 0xb6af, 0xb6d4, 0xb6e0, 0xb7a2, 0xb7a8, 0xb7bd, 0xb7d6, 0xb7dd, 0xb8b4,
0xb8df, 0xb8f6, 0xb9ab, 0xb9c9, 0xb9d8, 0xb9fa, 0xb9fd, 0xbacd, 0xbba7, 0xbbd6,
0xbbe1, 0xbbfa, 0xbcbc, 0xbcdb, 0xbcfe, 0xbdcc, 0xbecd, 0xbedd, 0xbfb4, 0xbfc6,
0xbfc9, 0xc0b4, 0xc0ed, 0xc1cb, 0xc2db, 0xc3c7, 0xc4dc, 0xc4ea, 0xc5cc, 0xc6f7,
0xc7f8, 0xc8ab, 0xc8cb, 0xc8d5, 0xc8e7, 0xc9cf, 0xc9fa, 0xcab1, 0xcab5, 0xcac7,
0xcad0, 0xcad6, 0xcaf5, 0xcafd, 0xccec, 0xcdf8, 0xceaa, 0xcec4, 0xced2, 0xcee5,
0xcfb5, 0xcfc2, 0xcfd6, 0xd0c2, 0xd0c5, 0xd0d0, 0xd0d4, 0xd1a7, 0xd2aa, 0xd2b2,
0xd2b5, 0xd2bb, 0xd2d4, 0xd3c3, 0xd3d0, 0xd3fd, 0xd4c2, 0xd4da, 0xd5e2, 0xd6d0};
     93 
     94 static int32_t binarySearch(const uint16_t *array, int32_t len, uint16_t value)
     95 {
     96     int32_t start = 0, end = len-1;
     97     int32_t mid = (start+end)/2;
     98 
     99     while(start <= end) {
    100         if(array[mid] == value) {
    101             return mid;
    102         }
    103 
    104         if(array[mid] < value){
    105             start = mid+1;
    106         } else {
    107             end = mid-1;
    108         }
    109 
    110         mid = (start+end)/2;
    111     }
    112 
    113     return -1;
    114 }
    115 
    116 IteratedChar::IteratedChar() :
    117 charValue(0), index(-1), nextIndex(0), error(FALSE), done(FALSE)
    118 {
    119     // nothing else to do.
    120 }
    121 
    122 /*void IteratedChar::reset()
    123 {
    124     charValue = 0;
    125     index     = -1;
    126     nextIndex = 0;
    127     error     = FALSE;
    128     done      = FALSE;
    129 }*/
    130 
    131 int32_t IteratedChar::nextByte(InputText *det)
    132 {
    133     if (nextIndex >= det->fRawLength) {
    134         done = TRUE;
    135 
    136         return -1;
    137     }
    138 
    139     return det->fRawInput[nextIndex++];
    140 }
    141 
    142 CharsetRecog_mbcs::~CharsetRecog_mbcs()
    143 {
    144     // nothing to do.
    145 }
    146 
    147 int32_t CharsetRecog_mbcs::match_mbcs(InputText *det, const uint16_t commonChars[], int32_t commonCharsLen) const {
    148     int32_t singleByteCharCount = 0;
    149     int32_t doubleByteCharCount = 0;
    150     int32_t commonCharCount     = 0;
    151     int32_t badCharCount        = 0;
    152     int32_t totalCharCount      = 0;
    153     int32_t confidence          = 0;
    154     IteratedChar iter;
    155 
    156     while (nextChar(&iter, det)) {
    157         totalCharCount++;
    158 
    159         if (iter.error) {
    160             badCharCount++;
    161         } else {
    162             if (iter.charValue <= 0xFF) {
    163                 singleByteCharCount++;
    164             } else {
    165                 doubleByteCharCount++;
    166 
    167                 if (commonChars != 0) {
    168                     if (binarySearch(commonChars, commonCharsLen, iter.charValue) >= 0){
    169                         commonCharCount += 1;
    170                     }
    171                 }
    172             }
    173         }
    174 
    175 
    176         if (badCharCount >= 2 && badCharCount*5 >= doubleByteCharCount) {
    177             // Bail out early if the byte data is not matching the encoding scheme.
    178             // break detectBlock;
    179             return confidence;
    180         }
    181     }
    182 
    183     if (doubleByteCharCount <= 10 && badCharCount == 0) {
    184         // Not many multi-byte chars.
    185         if (doubleByteCharCount == 0 && totalCharCount < 10) {
    186             // There weren't any multibyte sequences, and there was a low density of non-ASCII single bytes.
    187             // We don't have enough data to have any confidence.
    188             // Statistical analysis of single byte non-ASCII charcters would probably help here.
    189             confidence = 0;
    190         }
    191         else {
    192             //   ASCII or ISO file?  It's probably not our encoding,
    193             //   but is not incompatible with our encoding, so don't give it a zero.
    194             confidence = 10;
    195         }
    196 
    197         return confidence;
    198     }
    199 
    200     //
    201     //  No match if there are too many characters that don't fit the encoding scheme.
    202     //    (should we have zero tolerance for these?)
    203     //
    204     if (doubleByteCharCount < 20*badCharCount) {
    205         confidence = 0;
    206 
    207         return confidence;
    208     }
    209 
    210     if (commonChars == 0) {
    211         // We have no statistics on frequently occuring characters.
    212         //  Assess confidence purely on having a reasonable number of
    213         //  multi-byte characters (the more the better)
    214         confidence = 30 + doubleByteCharCount - 20*badCharCount;
    215 
    216         if (confidence > 100) {
    217             confidence = 100;
    218         }
    219     } else {
    220         //
    221         // Frequency of occurence statistics exist.
    222         //
    223 
    224         double maxVal = log((double)doubleByteCharCount / 4); /*(float)?*/
    225         double scaleFactor = 90.0 / maxVal;
    226         confidence = (int32_t)(log((double)commonCharCount+1) * scaleFactor + 10.0);
    227 
    228         confidence = min(confidence, 100);
    229     }
    230 
    231     if (confidence < 0) {
    232         confidence = 0;
    233     }
    234 
    235     return confidence;
    236 }
    237 
    238 CharsetRecog_sjis::~CharsetRecog_sjis()
    239 {
    240     // nothing to do
    241 }
    242 
    243 UBool CharsetRecog_sjis::nextChar(IteratedChar* it, InputText* det) const {
    244     it->index = it->nextIndex;
    245     it->error = FALSE;
    246 
    247     int32_t firstByte = it->charValue = it->nextByte(det);
    248 
    249     if (firstByte < 0) {
    250         return FALSE;
    251     }
    252 
    253     if (firstByte <= 0x7F || (firstByte > 0xA0 && firstByte <= 0xDF)) {
    254         return TRUE;
    255     }
    256 
    257     int32_t secondByte = it->nextByte(det);
    258     if (secondByte >= 0) {
    259         it->charValue = (firstByte << 8) | secondByte;
    260     }
    261     // else we'll handle the error later.
    262 
    263     if (! ((secondByte >= 0x40 && secondByte <= 0x7F) || (secondByte >= 0x80 && secondByte <= 0xFE))) {
    264         // Illegal second byte value.
    265         it->error = TRUE;
    266     }
    267 
    268     return TRUE;
    269 }
    270 
    271 UBool CharsetRecog_sjis::match(InputText* det, CharsetMatch *results) const {
    272     int32_t confidence = match_mbcs(det, commonChars_sjis, ARRAY_SIZE(commonChars_sjis));
    273     results->set(det, this, confidence);
    274     return (confidence > 0);
    275 }
    276 
    277 const char *CharsetRecog_sjis::getName() const
    278 {
    279     return "Shift_JIS";
    280 }
    281 
    282 const char *CharsetRecog_sjis::getLanguage() const
    283 {
    284     return "ja";
    285 }
    286 
    287 CharsetRecog_euc::~CharsetRecog_euc()
    288 {
    289     // nothing to do
    290 }
    291 
    292 UBool CharsetRecog_euc::nextChar(IteratedChar* it, InputText* det) const {
    293     int32_t firstByte  = 0;
    294     int32_t secondByte = 0;
    295     int32_t thirdByte  = 0;
    296 
    297     it->index = it->nextIndex;
    298     it->error = FALSE;
    299     firstByte = it->charValue = it->nextByte(det);
    300 
    301     if (firstByte < 0) {
    302         // Ran off the end of the input data
    303         return FALSE;
    304     }
    305 
    306     if (firstByte <= 0x8D) {
    307         // single byte char
    308         return TRUE;
    309     }
    310 
    311     secondByte = it->nextByte(det);
    312     if (secondByte >= 0) {
    313         it->charValue = (it->charValue << 8) | secondByte;
    314     }
    315     // else we'll handle the error later.
    316 
    317     if (firstByte >= 0xA1 && firstByte <= 0xFE) {
    318         // Two byte Char
    319         if (secondByte < 0xA1) {
    320             it->error = TRUE;
    321         }
    322 
    323         return TRUE;
    324     }
    325 
    326     if (firstByte == 0x8E) {
    327         // Code Set 2.
    328         //   In EUC-JP, total char size is 2 bytes, only one byte of actual char value.
    329         //   In EUC-TW, total char size is 4 bytes, three bytes contribute to char value.
    330         // We don't know which we've got.
    331         // Treat it like EUC-JP.  If the data really was EUC-TW, the following two
    332         //   bytes will look like a well formed 2 byte char.
    333         if (secondByte < 0xA1) {
    334             it->error = TRUE;
    335         }
    336 
    337         return TRUE;
    338     }
    339 
    340     if (firstByte == 0x8F) {
    341         // Code set 3.
    342         // Three byte total char size, two bytes of actual char value.
    343         thirdByte    = it->nextByte(det);
    344         it->charValue = (it->charValue << 8) | thirdByte;
    345 
    346         if (thirdByte < 0xa1) {
    347             // Bad second byte or ran off the end of the input data with a non-ASCII first byte.
    348             it->error = TRUE;
    349         }
    350     }
    351 
    352     return TRUE;
    353 
    354 }
    355 
    356 CharsetRecog_euc_jp::~CharsetRecog_euc_jp()
    357 {
    358     // nothing to do
    359 }
    360 
    361 const char *CharsetRecog_euc_jp::getName() const
    362 {
    363     return "EUC-JP";
    364 }
    365 
    366 const char *CharsetRecog_euc_jp::getLanguage() const
    367 {
    368     return "ja";
    369 }
    370 
    371 UBool CharsetRecog_euc_jp::match(InputText *det, CharsetMatch *results) const
    372 {
    373     int32_t confidence = match_mbcs(det, commonChars_euc_jp, ARRAY_SIZE(commonChars_euc_jp));
    374     results->set(det, this, confidence);
    375     return (confidence > 0);
    376 }
    377 
    378 CharsetRecog_euc_kr::~CharsetRecog_euc_kr()
    379 {
    380     // nothing to do
    381 }
    382 
    383 const char *CharsetRecog_euc_kr::getName() const
    384 {
    385     return "EUC-KR";
    386 }
    387 
    388 const char *CharsetRecog_euc_kr::getLanguage() const
    389 {
    390     return "ko";
    391 }
    392 
    393 UBool CharsetRecog_euc_kr::match(InputText *det, CharsetMatch *results) const
    394 {
    395     int32_t confidence =  match_mbcs(det, commonChars_euc_kr, ARRAY_SIZE(commonChars_euc_kr));
    396     results->set(det, this, confidence);
    397     return (confidence > 0);
    398 }
    399 
    400 CharsetRecog_big5::~CharsetRecog_big5()
    401 {
    402     // nothing to do
    403 }
    404 
    405 UBool CharsetRecog_big5::nextChar(IteratedChar* it, InputText* det) const
    406 {
    407     int32_t firstByte;
    408 
    409     it->index = it->nextIndex;
    410     it->error = FALSE;
    411     firstByte = it->charValue = it->nextByte(det);
    412 
    413     if (firstByte < 0) {
    414         return FALSE;
    415     }
    416 
    417     if (firstByte <= 0x7F || firstByte == 0xFF) {
    418         // single byte character.
    419         return TRUE;
    420     }
    421 
    422     int32_t secondByte = it->nextByte(det);
    423     if (secondByte >= 0)  {
    424         it->charValue = (it->charValue << 8) | secondByte;
    425     }
    426     // else we'll handle the error later.
    427 
    428     if (secondByte < 0x40 || secondByte == 0x7F || secondByte == 0xFF) {
    429         it->error = TRUE;
    430     }
    431 
    432     return TRUE;
    433 }
    434 
    435 const char *CharsetRecog_big5::getName() const
    436 {
    437     return "Big5";
    438 }
    439 
    440 const char *CharsetRecog_big5::getLanguage() const
    441 {
    442     return "zh";
    443 }
    444 
    445 UBool CharsetRecog_big5::match(InputText *det, CharsetMatch *results) const
    446 {
    447     int32_t confidence = match_mbcs(det, commonChars_big5, ARRAY_SIZE(commonChars_big5));
    448     results->set(det, this, confidence);
    449     return (confidence > 0);
    450 }
    451 
    452 CharsetRecog_gb_18030::~CharsetRecog_gb_18030()
    453 {
    454     // nothing to do
    455 }
    456 
    457 UBool CharsetRecog_gb_18030::nextChar(IteratedChar* it, InputText* det) const {
    458     int32_t firstByte  = 0;
    459     int32_t secondByte = 0;
    460     int32_t thirdByte  = 0;
    461     int32_t fourthByte = 0;
    462 
    463     it->index = it->nextIndex;
    464     it->error = FALSE;
    465     firstByte = it->charValue = it->nextByte(det);
    466 
    467     if (firstByte < 0) {
    468         // Ran off the end of the input data
    469         return FALSE;
    470     }
    471 
    472     if (firstByte <= 0x80) {
    473         // single byte char
    474         return TRUE;
    475     }
    476 
    477     secondByte = it->nextByte(det);
    478     if (secondByte >= 0) {
    479         it->charValue = (it->charValue << 8) | secondByte;
    480     }
    481     // else we'll handle the error later.
    482 
    483     if (firstByte >= 0x81 && firstByte <= 0xFE) {
    484         // Two byte Char
    485         if ((secondByte >= 0x40 && secondByte <= 0x7E) || (secondByte >=80 && secondByte <= 0xFE)) {
    486             return TRUE;
    487         }
    488 
    489         // Four byte char
    490         if (secondByte >= 0x30 && secondByte <= 0x39) {
    491             thirdByte = it->nextByte(det);
    492 
    493             if (thirdByte >= 0x81 && thirdByte <= 0xFE) {
    494                 fourthByte = it->nextByte(det);
    495 
    496                 if (fourthByte >= 0x30 && fourthByte <= 0x39) {
    497                     it->charValue = (it->charValue << 16) | (thirdByte << 8) | fourthByte;
    498 
    499                     return TRUE;
    500                 }
    501             }
    502         }
    503 
    504         // Something wasn't valid, or we ran out of data (-1).
    505         it->error = TRUE;
    506     }
    507 
    508     return TRUE;
    509 }
    510 
    511 const char *CharsetRecog_gb_18030::getName() const
    512 {
    513     return "GB18030";
    514 }
    515 
    516 const char *CharsetRecog_gb_18030::getLanguage() const
    517 {
    518     return "zh";
    519 }
    520 
    521 UBool CharsetRecog_gb_18030::match(InputText *det, CharsetMatch *results) const
    522 {
    523     int32_t confidence = match_mbcs(det, commonChars_gb_18030, ARRAY_SIZE(commonChars_gb_18030));
    524     results->set(det, this, confidence);
    525     return (confidence > 0);
    526 }
    527 
    528 U_NAMESPACE_END
    529 #endif
    530