Home | History | Annotate | Download | only in i18n
      1 /*
      2  **********************************************************************
      3  *   Copyright (C) 2005-2013, International Business Machines
      4  *   Corporation and others.  All Rights Reserved.
      5  **********************************************************************
      6  */
      7 
      8 #include "unicode/utypes.h"
      9 
     10 #if !UCONFIG_NO_CONVERSION
     11 
     12 #include "unicode/ucsdet.h"
     13 
     14 #include "csdetect.h"
     15 #include "csmatch.h"
     16 #include "uenumimp.h"
     17 
     18 #include "cmemory.h"
     19 #include "cstring.h"
     20 #include "umutex.h"
     21 #include "ucln_in.h"
     22 #include "uarrsort.h"
     23 #include "inputext.h"
     24 #include "csrsbcs.h"
     25 #include "csrmbcs.h"
     26 #include "csrutf8.h"
     27 #include "csrucode.h"
     28 #include "csr2022.h"
     29 
// Element count of a true array. Only meaningful when `array` is an actual
// array (not a pointer), because it divides the total size by the element size.
#define ARRAY_SIZE(array) (sizeof array / sizeof array[0])

// Raw heap helpers layered on uprv_malloc()/uprv_free().
// NEW_ARRAY returns uninitialized storage for `count` objects of `type`
// (or NULL on failure); no constructors run. DELETE_ARRAY releases storage
// obtained that way; no destructors run.
#define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
#define DELETE_ARRAY(array) uprv_free((void *) (array))
     34 
     35 U_NAMESPACE_BEGIN
     36 
     37 struct CSRecognizerInfo : public UMemory {
     38     CSRecognizerInfo(CharsetRecognizer *recognizer, UBool isDefaultEnabled)
     39         : recognizer(recognizer), isDefaultEnabled(isDefaultEnabled) {};
     40 
     41     ~CSRecognizerInfo() {delete recognizer;};
     42 
     43     CharsetRecognizer *recognizer;
     44     UBool isDefaultEnabled;
     45 };
     46 
     47 U_NAMESPACE_END
     48 
// Process-wide recognizer table. It is filled once by initRecognizers() and
// torn down by csdet_cleanup(); every CharsetDetector reads from it.
static icu::CSRecognizerInfo **fCSRecognizers = NULL;
// Init-once guard used by CharsetDetector::setRecognizers() to run initRecognizers().
static icu::UInitOnce gCSRecognizersInitOnce;
// Number of entries in fCSRecognizers (0 while the table does not exist).
static int32_t fCSRecognizers_size = 0;
     52 
     53 U_CDECL_BEGIN
     54 static UBool U_CALLCONV csdet_cleanup(void)
     55 {
     56     U_NAMESPACE_USE
     57     if (fCSRecognizers != NULL) {
     58         for(int32_t r = 0; r < fCSRecognizers_size; r += 1) {
     59             delete fCSRecognizers[r];
     60             fCSRecognizers[r] = NULL;
     61         }
     62 
     63         DELETE_ARRAY(fCSRecognizers);
     64         fCSRecognizers = NULL;
     65         fCSRecognizers_size = 0;
     66     }
     67     gCSRecognizersInitOnce.reset();
     68 
     69     return TRUE;
     70 }
     71 
     72 static int32_t U_CALLCONV
     73 charsetMatchComparator(const void * /*context*/, const void *left, const void *right)
     74 {
     75     U_NAMESPACE_USE
     76 
     77     const CharsetMatch **csm_l = (const CharsetMatch **) left;
     78     const CharsetMatch **csm_r = (const CharsetMatch **) right;
     79 
     80     // NOTE: compare is backwards to sort from highest to lowest.
     81     return (*csm_r)->getConfidence() - (*csm_l)->getConfidence();
     82 }
     83 
     84 static void U_CALLCONV initRecognizers(UErrorCode &status) {
     85     U_NAMESPACE_USE
     86     ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup);
     87     CSRecognizerInfo *tempArray[] = {
     88         new CSRecognizerInfo(new CharsetRecog_UTF8(), TRUE),
     89 
     90         new CSRecognizerInfo(new CharsetRecog_UTF_16_BE(), TRUE),
     91         new CSRecognizerInfo(new CharsetRecog_UTF_16_LE(), TRUE),
     92         new CSRecognizerInfo(new CharsetRecog_UTF_32_BE(), TRUE),
     93         new CSRecognizerInfo(new CharsetRecog_UTF_32_LE(), TRUE),
     94 
     95         new CSRecognizerInfo(new CharsetRecog_8859_1(), TRUE),
     96         new CSRecognizerInfo(new CharsetRecog_8859_2(), TRUE),
     97         new CSRecognizerInfo(new CharsetRecog_8859_5_ru(), TRUE),
     98         new CSRecognizerInfo(new CharsetRecog_8859_6_ar(), TRUE),
     99         new CSRecognizerInfo(new CharsetRecog_8859_7_el(), TRUE),
    100         new CSRecognizerInfo(new CharsetRecog_8859_8_I_he(), TRUE),
    101         new CSRecognizerInfo(new CharsetRecog_8859_8_he(), TRUE),
    102         new CSRecognizerInfo(new CharsetRecog_windows_1251(), TRUE),
    103         new CSRecognizerInfo(new CharsetRecog_windows_1256(), TRUE),
    104         new CSRecognizerInfo(new CharsetRecog_KOI8_R(), TRUE),
    105         new CSRecognizerInfo(new CharsetRecog_8859_9_tr(), TRUE),
    106         new CSRecognizerInfo(new CharsetRecog_sjis(), TRUE),
    107         new CSRecognizerInfo(new CharsetRecog_gb_18030(), TRUE),
    108         new CSRecognizerInfo(new CharsetRecog_euc_jp(), TRUE),
    109         new CSRecognizerInfo(new CharsetRecog_euc_kr(), TRUE),
    110         new CSRecognizerInfo(new CharsetRecog_big5(), TRUE),
    111 
    112         new CSRecognizerInfo(new CharsetRecog_2022JP(), TRUE),
    113         new CSRecognizerInfo(new CharsetRecog_2022KR(), TRUE),
    114         new CSRecognizerInfo(new CharsetRecog_2022CN(), TRUE),
    115 
    116         new CSRecognizerInfo(new CharsetRecog_IBM424_he_rtl(), FALSE),
    117         new CSRecognizerInfo(new CharsetRecog_IBM424_he_ltr(), FALSE),
    118         new CSRecognizerInfo(new CharsetRecog_IBM420_ar_rtl(), FALSE),
    119         new CSRecognizerInfo(new CharsetRecog_IBM420_ar_ltr(), FALSE)
    120     };
    121     int32_t rCount = ARRAY_SIZE(tempArray);
    122 
    123     fCSRecognizers = NEW_ARRAY(CSRecognizerInfo *, rCount);
    124 
    125     if (fCSRecognizers == NULL) {
    126         status = U_MEMORY_ALLOCATION_ERROR;
    127     }
    128     else {
    129         fCSRecognizers_size = rCount;
    130         for (int32_t r = 0; r < rCount; r += 1) {
    131             fCSRecognizers[r] = tempArray[r];
    132             if (fCSRecognizers[r] == NULL) {
    133                 status = U_MEMORY_ALLOCATION_ERROR;
    134             }
    135         }
    136     }
    137 }
    138 
    139 U_CDECL_END
    140 
    141 U_NAMESPACE_BEGIN
    142 
    143 void CharsetDetector::setRecognizers(UErrorCode &status)
    144 {
    145     umtx_initOnce(gCSRecognizersInitOnce, &initRecognizers, status);
    146 }
    147 
    148 CharsetDetector::CharsetDetector(UErrorCode &status)
    149   : textIn(new InputText(status)), resultArray(NULL),
    150     resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE),
    151     fEnabledRecognizers(NULL)
    152 {
    153     if (U_FAILURE(status)) {
    154         return;
    155     }
    156 
    157     setRecognizers(status);
    158 
    159     if (U_FAILURE(status)) {
    160         return;
    161     }
    162 
    163     resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size);
    164 
    165     if (resultArray == NULL) {
    166         status = U_MEMORY_ALLOCATION_ERROR;
    167         return;
    168     }
    169 
    170     for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
    171         resultArray[i] = new CharsetMatch();
    172 
    173         if (resultArray[i] == NULL) {
    174             status = U_MEMORY_ALLOCATION_ERROR;
    175             break;
    176         }
    177     }
    178 }
    179 
    180 CharsetDetector::~CharsetDetector()
    181 {
    182     delete textIn;
    183 
    184     for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
    185         delete resultArray[i];
    186     }
    187 
    188     uprv_free(resultArray);
    189 
    190     if (fEnabledRecognizers) {
    191         uprv_free(fEnabledRecognizers);
    192     }
    193 }
    194 
/**
 * Supplies the bytes to be examined.
 *
 * @param in  input buffer, forwarded to the InputText wrapper.
 * @param len length argument, forwarded unchanged.
 */
void CharsetDetector::setText(const char *in, int32_t len)
{
    textIn->setText(in, len);
    // Tells detectAll() that its cached results are stale and must be recomputed.
    fFreshTextSet = TRUE;
}
    200 
    201 UBool CharsetDetector::setStripTagsFlag(UBool flag)
    202 {
    203     UBool temp = fStripTags;
    204     fStripTags = flag;
    205     fFreshTextSet = TRUE;
    206     return temp;
    207 }
    208 
/**
 * @return the current strip-tags flag, as last set by setStripTagsFlag()
 *         (FALSE for a freshly constructed detector).
 */
UBool CharsetDetector::getStripTagsFlag() const
{
    return fStripTags;
}
    213 
/**
 * Forwards a declared encoding name to the InputText wrapper.
 *
 * The method is const because it only touches the object `textIn` points to,
 * not the detector's own members.
 *
 * @param encoding encoding name, forwarded unchanged.
 * @param len      length argument, forwarded unchanged.
 */
void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const
{
    textIn->setDeclaredEncoding(encoding,len);
}
    218 
    219 int32_t CharsetDetector::getDetectableCount()
    220 {
    221     UErrorCode status = U_ZERO_ERROR;
    222 
    223     setRecognizers(status);
    224 
    225     return fCSRecognizers_size;
    226 }
    227 
    228 const CharsetMatch *CharsetDetector::detect(UErrorCode &status)
    229 {
    230     int32_t maxMatchesFound = 0;
    231 
    232     detectAll(maxMatchesFound, status);
    233 
    234     if(maxMatchesFound > 0) {
    235         return resultArray[0];
    236     } else {
    237         return NULL;
    238     }
    239 }
    240 
    241 const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status)
    242 {
    243     if(!textIn->isSet()) {
    244         status = U_MISSING_RESOURCE_ERROR;// TODO:  Need to set proper status code for input text not set
    245 
    246         return NULL;
    247     } else if (fFreshTextSet) {
    248         CharsetRecognizer *csr;
    249         int32_t            i;
    250 
    251         textIn->MungeInput(fStripTags);
    252 
    253         // Iterate over all possible charsets, remember all that
    254         // give a match quality > 0.
    255         resultCount = 0;
    256         for (i = 0; i < fCSRecognizers_size; i += 1) {
    257             csr = fCSRecognizers[i]->recognizer;
    258             if (csr->match(textIn, resultArray[resultCount])) {
    259                 resultCount++;
    260             }
    261         }
    262 
    263         if (resultCount > 1) {
    264             uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status);
    265         }
    266         fFreshTextSet = FALSE;
    267     }
    268 
    269     maxMatchesFound = resultCount;
    270 
    271     return resultArray;
    272 }
    273 
    274 void CharsetDetector::setDetectableCharset(const char *encoding, UBool enabled, UErrorCode &status)
    275 {
    276     if (U_FAILURE(status)) {
    277         return;
    278     }
    279 
    280     int32_t modIdx = -1;
    281     UBool isDefaultVal = FALSE;
    282     for (int32_t i = 0; i < fCSRecognizers_size; i++) {
    283         CSRecognizerInfo *csrinfo = fCSRecognizers[i];
    284         if (uprv_strcmp(csrinfo->recognizer->getName(), encoding) == 0) {
    285             modIdx = i;
    286             isDefaultVal = (csrinfo->isDefaultEnabled == enabled);
    287             break;
    288         }
    289     }
    290     if (modIdx < 0) {
    291         // No matching encoding found
    292         status = U_ILLEGAL_ARGUMENT_ERROR;
    293         return;
    294     }
    295 
    296     if (fEnabledRecognizers == NULL && !isDefaultVal) {
    297         // Create an array storing the non default setting
    298         fEnabledRecognizers = NEW_ARRAY(UBool, fCSRecognizers_size);
    299         if (fEnabledRecognizers == NULL) {
    300             status = U_MEMORY_ALLOCATION_ERROR;
    301             return;
    302         }
    303         // Initialize the array with default info
    304         for (int32_t i = 0; i < fCSRecognizers_size; i++) {
    305             fEnabledRecognizers[i] = fCSRecognizers[i]->isDefaultEnabled;
    306         }
    307     }
    308 
    309     if (fEnabledRecognizers != NULL) {
    310         fEnabledRecognizers[modIdx] = enabled;
    311     }
    312 }
    313 
    314 /*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
    315 {
    316     if( index > fCSRecognizers_size-1 || index < 0) {
    317         status = U_INDEX_OUTOFBOUNDS_ERROR;
    318 
    319         return 0;
    320     } else {
    321         return fCSRecognizers[index]->getName();
    322     }
    323 }*/
    324 
    325 U_NAMESPACE_END
    326 
    327 U_CDECL_BEGIN
// Per-enumeration state stored behind UEnumeration::context.
typedef struct {
    // Index into the global recognizer table of the next entry to examine.
    int32_t currIndex;
    // TRUE: enumerate every recognizer. FALSE: enumerate only enabled ones.
    UBool all;
    // Per-detector enabled flags, or NULL to use each recognizer's default.
    // Not freed by enumClose(); the pointer is copied from the detector in
    // getDetectableCharsets().
    UBool *enabledRecognizers;
} Context;
    333 
    334 
    335 
    336 static void U_CALLCONV
    337 enumClose(UEnumeration *en) {
    338     if(en->context != NULL) {
    339         DELETE_ARRAY(en->context);
    340     }
    341 
    342     DELETE_ARRAY(en);
    343 }
    344 
    345 static int32_t U_CALLCONV
    346 enumCount(UEnumeration *en, UErrorCode *) {
    347     if (((Context *)en->context)->all) {
    348         // ucsdet_getAllDetectableCharsets, all charset detector names
    349         return fCSRecognizers_size;
    350     }
    351 
    352     // Otherwise, ucsdet_getDetectableCharsets - only enabled ones
    353     int32_t count = 0;
    354     UBool *enabledArray = ((Context *)en->context)->enabledRecognizers;
    355     if (enabledArray != NULL) {
    356         // custom set
    357         for (int32_t i = 0; i < fCSRecognizers_size; i++) {
    358             if (enabledArray[i]) {
    359                 count++;
    360             }
    361         }
    362     } else {
    363         // default set
    364         for (int32_t i = 0; i < fCSRecognizers_size; i++) {
    365             if (fCSRecognizers[i]->isDefaultEnabled) {
    366                 count++;
    367             }
    368         }
    369     }
    370     return count;
    371 }
    372 
    373 static const char* U_CALLCONV
    374 enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) {
    375     const char *currName = NULL;
    376 
    377     if (((Context *)en->context)->currIndex < fCSRecognizers_size) {
    378         if (((Context *)en->context)->all) {
    379             // ucsdet_getAllDetectableCharsets, all charset detector names
    380             currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
    381             ((Context *)en->context)->currIndex++;
    382         } else {
    383             // ucsdet_getDetectableCharsets
    384             UBool *enabledArray = ((Context *)en->context)->enabledRecognizers;
    385             if (enabledArray != NULL) {
    386                 // custome set
    387                 while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
    388                     if (enabledArray[((Context *)en->context)->currIndex]) {
    389                         currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
    390                     }
    391                     ((Context *)en->context)->currIndex++;
    392                 }
    393             } else {
    394                 // default set
    395                 while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
    396                     if (fCSRecognizers[((Context *)en->context)->currIndex]->isDefaultEnabled) {
    397                         currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
    398                     }
    399                     ((Context *)en->context)->currIndex++;
    400                 }
    401             }
    402         }
    403     }
    404 
    405     if(resultLength != NULL) {
    406         *resultLength = currName == NULL ? 0 : (int32_t)uprv_strlen(currName);
    407     }
    408 
    409     return currName;
    410 }
    411 
    412 
    413 static void U_CALLCONV
    414 enumReset(UEnumeration *en, UErrorCode *) {
    415     ((Context *)en->context)->currIndex = 0;
    416 }
    417 
// Template for charset-name enumerations. getAllDetectableCharsets() and
// getDetectableCharsets() copy it into a freshly allocated UEnumeration and
// then fill in `context`.
// NOTE(review): the two leading NULLs are presumably the baseContext and
// context members of UEnumeration - confirm against uenumimp.h.
static const UEnumeration gCSDetEnumeration = {
    NULL,
    NULL,
    enumClose,
    enumCount,
    uenum_unextDefault,
    enumNext,
    enumReset
};
    427 
    428 U_CDECL_END
    429 
    430 U_NAMESPACE_BEGIN
    431 
    432 UEnumeration * CharsetDetector::getAllDetectableCharsets(UErrorCode &status)
    433 {
    434 
    435     /* Initialize recognized charsets. */
    436     setRecognizers(status);
    437 
    438     if(U_FAILURE(status)) {
    439         return 0;
    440     }
    441 
    442     UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
    443     if (en == NULL) {
    444         status = U_MEMORY_ALLOCATION_ERROR;
    445         return 0;
    446     }
    447     memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
    448     en->context = (void*)NEW_ARRAY(Context, 1);
    449     if (en->context == NULL) {
    450         status = U_MEMORY_ALLOCATION_ERROR;
    451         DELETE_ARRAY(en);
    452         return 0;
    453     }
    454     uprv_memset(en->context, 0, sizeof(Context));
    455     ((Context*)en->context)->all = TRUE;
    456     return en;
    457 }
    458 
    459 UEnumeration * CharsetDetector::getDetectableCharsets(UErrorCode &status) const
    460 {
    461     if(U_FAILURE(status)) {
    462         return 0;
    463     }
    464 
    465     UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
    466     if (en == NULL) {
    467         status = U_MEMORY_ALLOCATION_ERROR;
    468         return 0;
    469     }
    470     memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
    471     en->context = (void*)NEW_ARRAY(Context, 1);
    472     if (en->context == NULL) {
    473         status = U_MEMORY_ALLOCATION_ERROR;
    474         DELETE_ARRAY(en);
    475         return 0;
    476     }
    477     uprv_memset(en->context, 0, sizeof(Context));
    478     ((Context*)en->context)->all = FALSE;
    479     ((Context*)en->context)->enabledRecognizers = fEnabledRecognizers;
    480     return en;
    481 }
    482 
    483 U_NAMESPACE_END
    484 
    485 #endif
    486