Home | History | Annotate | Download | only in i18n
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4  **********************************************************************
      5  *   Copyright (C) 2005-2016, International Business Machines
      6  *   Corporation and others.  All Rights Reserved.
      7  **********************************************************************
      8  */
      9 
     10 #include "unicode/utypes.h"
     11 
     12 #if !UCONFIG_NO_CONVERSION
     13 
     14 #include "unicode/ucsdet.h"
     15 
     16 #include "csdetect.h"
     17 #include "csmatch.h"
     18 #include "uenumimp.h"
     19 
     20 #include "cmemory.h"
     21 #include "cstring.h"
     22 #include "umutex.h"
     23 #include "ucln_in.h"
     24 #include "uarrsort.h"
     25 #include "inputext.h"
     26 #include "csrsbcs.h"
     27 #include "csrmbcs.h"
     28 #include "csrutf8.h"
     29 #include "csrucode.h"
     30 #include "csr2022.h"
     31 
     32 #define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
     33 #define DELETE_ARRAY(array) uprv_free((void *) (array))
     34 
     35 U_NAMESPACE_BEGIN
     36 
     37 struct CSRecognizerInfo : public UMemory {
     38     CSRecognizerInfo(CharsetRecognizer *recognizer, UBool isDefaultEnabled)
     39         : recognizer(recognizer), isDefaultEnabled(isDefaultEnabled) {};
     40 
     41     ~CSRecognizerInfo() {delete recognizer;};
     42 
     43     CharsetRecognizer *recognizer;
     44     UBool isDefaultEnabled;
     45 };
     46 
     47 U_NAMESPACE_END
     48 
     49 static icu::CSRecognizerInfo **fCSRecognizers = NULL;
     50 static icu::UInitOnce gCSRecognizersInitOnce;
     51 static int32_t fCSRecognizers_size = 0;
     52 
     53 U_CDECL_BEGIN
     54 static UBool U_CALLCONV csdet_cleanup(void)
     55 {
     56     U_NAMESPACE_USE
     57     if (fCSRecognizers != NULL) {
     58         for(int32_t r = 0; r < fCSRecognizers_size; r += 1) {
     59             delete fCSRecognizers[r];
     60             fCSRecognizers[r] = NULL;
     61         }
     62 
     63         DELETE_ARRAY(fCSRecognizers);
     64         fCSRecognizers = NULL;
     65         fCSRecognizers_size = 0;
     66     }
     67     gCSRecognizersInitOnce.reset();
     68 
     69     return TRUE;
     70 }
     71 
     72 static int32_t U_CALLCONV
     73 charsetMatchComparator(const void * /*context*/, const void *left, const void *right)
     74 {
     75     U_NAMESPACE_USE
     76 
     77     const CharsetMatch **csm_l = (const CharsetMatch **) left;
     78     const CharsetMatch **csm_r = (const CharsetMatch **) right;
     79 
     80     // NOTE: compare is backwards to sort from highest to lowest.
     81     return (*csm_r)->getConfidence() - (*csm_l)->getConfidence();
     82 }
     83 
     84 static void U_CALLCONV initRecognizers(UErrorCode &status) {
     85     U_NAMESPACE_USE
     86     ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup);
     87     CSRecognizerInfo *tempArray[] = {
     88         new CSRecognizerInfo(new CharsetRecog_UTF8(), TRUE),
     89 
     90         new CSRecognizerInfo(new CharsetRecog_UTF_16_BE(), TRUE),
     91         new CSRecognizerInfo(new CharsetRecog_UTF_16_LE(), TRUE),
     92         new CSRecognizerInfo(new CharsetRecog_UTF_32_BE(), TRUE),
     93         new CSRecognizerInfo(new CharsetRecog_UTF_32_LE(), TRUE),
     94 
     95         new CSRecognizerInfo(new CharsetRecog_8859_1(), TRUE),
     96         new CSRecognizerInfo(new CharsetRecog_8859_2(), TRUE),
     97         new CSRecognizerInfo(new CharsetRecog_8859_5_ru(), TRUE),
     98         new CSRecognizerInfo(new CharsetRecog_8859_6_ar(), TRUE),
     99         new CSRecognizerInfo(new CharsetRecog_8859_7_el(), TRUE),
    100         new CSRecognizerInfo(new CharsetRecog_8859_8_I_he(), TRUE),
    101         new CSRecognizerInfo(new CharsetRecog_8859_8_he(), TRUE),
    102         new CSRecognizerInfo(new CharsetRecog_windows_1251(), TRUE),
    103         new CSRecognizerInfo(new CharsetRecog_windows_1256(), TRUE),
    104         new CSRecognizerInfo(new CharsetRecog_KOI8_R(), TRUE),
    105         new CSRecognizerInfo(new CharsetRecog_8859_9_tr(), TRUE),
    106         new CSRecognizerInfo(new CharsetRecog_sjis(), TRUE),
    107         new CSRecognizerInfo(new CharsetRecog_gb_18030(), TRUE),
    108         new CSRecognizerInfo(new CharsetRecog_euc_jp(), TRUE),
    109         new CSRecognizerInfo(new CharsetRecog_euc_kr(), TRUE),
    110         new CSRecognizerInfo(new CharsetRecog_big5(), TRUE),
    111 
    112         new CSRecognizerInfo(new CharsetRecog_2022JP(), TRUE),
    113 #if !UCONFIG_ONLY_HTML_CONVERSION
    114         new CSRecognizerInfo(new CharsetRecog_2022KR(), TRUE),
    115         new CSRecognizerInfo(new CharsetRecog_2022CN(), TRUE),
    116 
    117         new CSRecognizerInfo(new CharsetRecog_IBM424_he_rtl(), FALSE),
    118         new CSRecognizerInfo(new CharsetRecog_IBM424_he_ltr(), FALSE),
    119         new CSRecognizerInfo(new CharsetRecog_IBM420_ar_rtl(), FALSE),
    120         new CSRecognizerInfo(new CharsetRecog_IBM420_ar_ltr(), FALSE)
    121 #endif
    122     };
    123     int32_t rCount = UPRV_LENGTHOF(tempArray);
    124 
    125     fCSRecognizers = NEW_ARRAY(CSRecognizerInfo *, rCount);
    126 
    127     if (fCSRecognizers == NULL) {
    128         status = U_MEMORY_ALLOCATION_ERROR;
    129     }
    130     else {
    131         fCSRecognizers_size = rCount;
    132         for (int32_t r = 0; r < rCount; r += 1) {
    133             fCSRecognizers[r] = tempArray[r];
    134             if (fCSRecognizers[r] == NULL) {
    135                 status = U_MEMORY_ALLOCATION_ERROR;
    136             }
    137         }
    138     }
    139 }
    140 
    141 U_CDECL_END
    142 
    143 U_NAMESPACE_BEGIN
    144 
/**
 * Ensures the shared recognizer table has been built by running
 * initRecognizers() through the gCSRecognizersInitOnce guard.
 *
 * @param status  passed to umtx_initOnce(), which reports the outcome of the
 *                initialization through it.
 */
void CharsetDetector::setRecognizers(UErrorCode &status)
{
    umtx_initOnce(gCSRecognizersInitOnce, &initRecognizers, status);
}
    149 
    150 CharsetDetector::CharsetDetector(UErrorCode &status)
    151   : textIn(new InputText(status)), resultArray(NULL),
    152     resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE),
    153     fEnabledRecognizers(NULL)
    154 {
    155     if (U_FAILURE(status)) {
    156         return;
    157     }
    158 
    159     setRecognizers(status);
    160 
    161     if (U_FAILURE(status)) {
    162         return;
    163     }
    164 
    165     resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size);
    166 
    167     if (resultArray == NULL) {
    168         status = U_MEMORY_ALLOCATION_ERROR;
    169         return;
    170     }
    171 
    172     for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
    173         resultArray[i] = new CharsetMatch();
    174 
    175         if (resultArray[i] == NULL) {
    176             status = U_MEMORY_ALLOCATION_ERROR;
    177             break;
    178         }
    179     }
    180 }
    181 
    182 CharsetDetector::~CharsetDetector()
    183 {
    184     delete textIn;
    185 
    186     for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
    187         delete resultArray[i];
    188     }
    189 
    190     uprv_free(resultArray);
    191 
    192     if (fEnabledRecognizers) {
    193         uprv_free(fEnabledRecognizers);
    194     }
    195 }
    196 
/**
 * Hands the input bytes to the InputText object and marks the text as fresh
 * so that the next detectAll() call re-runs the recognizers.
 *
 * @param in   input buffer, forwarded unchanged to InputText::setText().
 * @param len  length argument, forwarded unchanged.
 */
void CharsetDetector::setText(const char *in, int32_t len)
{
    textIn->setText(in, len);
    fFreshTextSet = TRUE;
}
    202 
    203 UBool CharsetDetector::setStripTagsFlag(UBool flag)
    204 {
    205     UBool temp = fStripTags;
    206     fStripTags = flag;
    207     fFreshTextSet = TRUE;
    208     return temp;
    209 }
    210 
/**
 * @return the current strip-tags setting, which detectAll() passes to
 *         InputText::MungeInput().
 */
UBool CharsetDetector::getStripTagsFlag() const
{
    return fStripTags;
}
    215 
/**
 * Forwards a declared encoding name to the InputText object.
 *
 * The method is const because only the object behind the textIn pointer is
 * modified, not the detector's own members.
 *
 * @param encoding  encoding name, forwarded unchanged.
 * @param len       length argument, forwarded unchanged.
 */
void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const
{
    textIn->setDeclaredEncoding(encoding,len);
}
    220 
    221 int32_t CharsetDetector::getDetectableCount()
    222 {
    223     UErrorCode status = U_ZERO_ERROR;
    224 
    225     setRecognizers(status);
    226 
    227     return fCSRecognizers_size;
    228 }
    229 
    230 const CharsetMatch *CharsetDetector::detect(UErrorCode &status)
    231 {
    232     int32_t maxMatchesFound = 0;
    233 
    234     detectAll(maxMatchesFound, status);
    235 
    236     if(maxMatchesFound > 0) {
    237         return resultArray[0];
    238     } else {
    239         return NULL;
    240     }
    241 }
    242 
    243 const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status)
    244 {
    245     if(!textIn->isSet()) {
    246         status = U_MISSING_RESOURCE_ERROR;// TODO:  Need to set proper status code for input text not set
    247 
    248         return NULL;
    249     } else if (fFreshTextSet) {
    250         CharsetRecognizer *csr;
    251         int32_t            i;
    252 
    253         textIn->MungeInput(fStripTags);
    254 
    255         // Iterate over all possible charsets, remember all that
    256         // give a match quality > 0.
    257         resultCount = 0;
    258         for (i = 0; i < fCSRecognizers_size; i += 1) {
    259             csr = fCSRecognizers[i]->recognizer;
    260             if (csr->match(textIn, resultArray[resultCount])) {
    261                 resultCount++;
    262             }
    263         }
    264 
    265         if (resultCount > 1) {
    266             uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status);
    267         }
    268         fFreshTextSet = FALSE;
    269     }
    270 
    271     maxMatchesFound = resultCount;
    272 
    273     return resultArray;
    274 }
    275 
    276 void CharsetDetector::setDetectableCharset(const char *encoding, UBool enabled, UErrorCode &status)
    277 {
    278     if (U_FAILURE(status)) {
    279         return;
    280     }
    281 
    282     int32_t modIdx = -1;
    283     UBool isDefaultVal = FALSE;
    284     for (int32_t i = 0; i < fCSRecognizers_size; i++) {
    285         CSRecognizerInfo *csrinfo = fCSRecognizers[i];
    286         if (uprv_strcmp(csrinfo->recognizer->getName(), encoding) == 0) {
    287             modIdx = i;
    288             isDefaultVal = (csrinfo->isDefaultEnabled == enabled);
    289             break;
    290         }
    291     }
    292     if (modIdx < 0) {
    293         // No matching encoding found
    294         status = U_ILLEGAL_ARGUMENT_ERROR;
    295         return;
    296     }
    297 
    298     if (fEnabledRecognizers == NULL && !isDefaultVal) {
    299         // Create an array storing the non default setting
    300         fEnabledRecognizers = NEW_ARRAY(UBool, fCSRecognizers_size);
    301         if (fEnabledRecognizers == NULL) {
    302             status = U_MEMORY_ALLOCATION_ERROR;
    303             return;
    304         }
    305         // Initialize the array with default info
    306         for (int32_t i = 0; i < fCSRecognizers_size; i++) {
    307             fEnabledRecognizers[i] = fCSRecognizers[i]->isDefaultEnabled;
    308         }
    309     }
    310 
    311     if (fEnabledRecognizers != NULL) {
    312         fEnabledRecognizers[modIdx] = enabled;
    313     }
    314 }
    315 
    316 /*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
    317 {
    318     if( index > fCSRecognizers_size-1 || index < 0) {
    319         status = U_INDEX_OUTOFBOUNDS_ERROR;
    320 
    321         return 0;
    322     } else {
    323         return fCSRecognizers[index]->getName();
    324     }
    325 }*/
    326 
    327 U_NAMESPACE_END
    328 
    329 U_CDECL_BEGIN
/* Per-enumeration state stored in UEnumeration::context. */
typedef struct {
    int32_t currIndex;          /* index into fCSRecognizers of the next entry to examine */
    UBool all;                  /* TRUE: enumerate every recognizer; FALSE: enabled ones only */
    UBool *enabledRecognizers;  /* per-detector override array, or NULL to use the defaults; */
                                /* not freed by enumClose() */
} Context;
    335 
    336 
    337 
    338 static void U_CALLCONV
    339 enumClose(UEnumeration *en) {
    340     if(en->context != NULL) {
    341         DELETE_ARRAY(en->context);
    342     }
    343 
    344     DELETE_ARRAY(en);
    345 }
    346 
    347 static int32_t U_CALLCONV
    348 enumCount(UEnumeration *en, UErrorCode *) {
    349     if (((Context *)en->context)->all) {
    350         // ucsdet_getAllDetectableCharsets, all charset detector names
    351         return fCSRecognizers_size;
    352     }
    353 
    354     // Otherwise, ucsdet_getDetectableCharsets - only enabled ones
    355     int32_t count = 0;
    356     UBool *enabledArray = ((Context *)en->context)->enabledRecognizers;
    357     if (enabledArray != NULL) {
    358         // custom set
    359         for (int32_t i = 0; i < fCSRecognizers_size; i++) {
    360             if (enabledArray[i]) {
    361                 count++;
    362             }
    363         }
    364     } else {
    365         // default set
    366         for (int32_t i = 0; i < fCSRecognizers_size; i++) {
    367             if (fCSRecognizers[i]->isDefaultEnabled) {
    368                 count++;
    369             }
    370         }
    371     }
    372     return count;
    373 }
    374 
    375 static const char* U_CALLCONV
    376 enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) {
    377     const char *currName = NULL;
    378 
    379     if (((Context *)en->context)->currIndex < fCSRecognizers_size) {
    380         if (((Context *)en->context)->all) {
    381             // ucsdet_getAllDetectableCharsets, all charset detector names
    382             currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
    383             ((Context *)en->context)->currIndex++;
    384         } else {
    385             // ucsdet_getDetectableCharsets
    386             UBool *enabledArray = ((Context *)en->context)->enabledRecognizers;
    387             if (enabledArray != NULL) {
    388                 // custome set
    389                 while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
    390                     if (enabledArray[((Context *)en->context)->currIndex]) {
    391                         currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
    392                     }
    393                     ((Context *)en->context)->currIndex++;
    394                 }
    395             } else {
    396                 // default set
    397                 while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) {
    398                     if (fCSRecognizers[((Context *)en->context)->currIndex]->isDefaultEnabled) {
    399                         currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName();
    400                     }
    401                     ((Context *)en->context)->currIndex++;
    402                 }
    403             }
    404         }
    405     }
    406 
    407     if(resultLength != NULL) {
    408         *resultLength = currName == NULL ? 0 : (int32_t)uprv_strlen(currName);
    409     }
    410 
    411     return currName;
    412 }
    413 
    414 
    415 static void U_CALLCONV
    416 enumReset(UEnumeration *en, UErrorCode *) {
    417     ((Context *)en->context)->currIndex = 0;
    418 }
    419 
/*
 * Template UEnumeration. getAllDetectableCharsets() and
 * getDetectableCharsets() copy it into each newly allocated enumeration and
 * then fill in the context pointer.
 * NOTE(review): the two leading NULLs are presumably the baseContext and
 * context fields declared in uenumimp.h - confirm against that header.
 */
static const UEnumeration gCSDetEnumeration = {
    NULL,
    NULL,
    enumClose,           /* frees the context and the enumeration */
    enumCount,           /* number of names this enumeration covers */
    uenum_unextDefault,  /* library-provided default, not defined in this file */
    enumNext,            /* next name as a char string */
    enumReset            /* rewinds to the first entry */
};
    429 
    430 U_CDECL_END
    431 
    432 U_NAMESPACE_BEGIN
    433 
    434 UEnumeration * CharsetDetector::getAllDetectableCharsets(UErrorCode &status)
    435 {
    436 
    437     /* Initialize recognized charsets. */
    438     setRecognizers(status);
    439 
    440     if(U_FAILURE(status)) {
    441         return 0;
    442     }
    443 
    444     UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
    445     if (en == NULL) {
    446         status = U_MEMORY_ALLOCATION_ERROR;
    447         return 0;
    448     }
    449     memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
    450     en->context = (void*)NEW_ARRAY(Context, 1);
    451     if (en->context == NULL) {
    452         status = U_MEMORY_ALLOCATION_ERROR;
    453         DELETE_ARRAY(en);
    454         return 0;
    455     }
    456     uprv_memset(en->context, 0, sizeof(Context));
    457     ((Context*)en->context)->all = TRUE;
    458     return en;
    459 }
    460 
    461 UEnumeration * CharsetDetector::getDetectableCharsets(UErrorCode &status) const
    462 {
    463     if(U_FAILURE(status)) {
    464         return 0;
    465     }
    466 
    467     UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
    468     if (en == NULL) {
    469         status = U_MEMORY_ALLOCATION_ERROR;
    470         return 0;
    471     }
    472     memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
    473     en->context = (void*)NEW_ARRAY(Context, 1);
    474     if (en->context == NULL) {
    475         status = U_MEMORY_ALLOCATION_ERROR;
    476         DELETE_ARRAY(en);
    477         return 0;
    478     }
    479     uprv_memset(en->context, 0, sizeof(Context));
    480     ((Context*)en->context)->all = FALSE;
    481     ((Context*)en->context)->enabledRecognizers = fEnabledRecognizers;
    482     return en;
    483 }
    484 
    485 U_NAMESPACE_END
    486 
    487 #endif
    488