Home | History | Annotate | Download | only in i18n
      1 /*
      2  **********************************************************************
      3  *   Copyright (C) 2005-2012, International Business Machines
      4  *   Corporation and others.  All Rights Reserved.
      5  **********************************************************************
      6  */
      7 
      8 #include "unicode/utypes.h"
      9 
     10 #if !UCONFIG_NO_CONVERSION
     11 
     12 #include "unicode/ucsdet.h"
     13 
     14 #include "csdetect.h"
     15 #include "csmatch.h"
     16 #include "uenumimp.h"
     17 
     18 #include "cmemory.h"
     19 #include "cstring.h"
     20 #include "umutex.h"
     21 #include "ucln_in.h"
     22 #include "uarrsort.h"
     23 #include "inputext.h"
     24 #include "csrsbcs.h"
     25 #include "csrmbcs.h"
     26 #include "csrutf8.h"
     27 #include "csrucode.h"
     28 #include "csr2022.h"
     29 
     30 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
     31 
     32 #define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
     33 #define DELETE_ARRAY(array) uprv_free((void *) (array))
     34 
     35 U_CDECL_BEGIN
     36 static icu::CharsetRecognizer **fCSRecognizers = NULL;
     37 
     38 static int32_t fCSRecognizers_size = 0;
     39 
     40 static UBool U_CALLCONV csdet_cleanup(void)
     41 {
     42     if (fCSRecognizers != NULL) {
     43         for(int32_t r = 0; r < fCSRecognizers_size; r += 1) {
     44             delete fCSRecognizers[r];
     45             fCSRecognizers[r] = NULL;
     46         }
     47 
     48         DELETE_ARRAY(fCSRecognizers);
     49         fCSRecognizers = NULL;
     50         fCSRecognizers_size = 0;
     51     }
     52 
     53     return TRUE;
     54 }
     55 
     56 static int32_t U_CALLCONV
     57 charsetMatchComparator(const void * /*context*/, const void *left, const void *right)
     58 {
     59     U_NAMESPACE_USE
     60 
     61     const CharsetMatch **csm_l = (const CharsetMatch **) left;
     62     const CharsetMatch **csm_r = (const CharsetMatch **) right;
     63 
     64     // NOTE: compare is backwards to sort from highest to lowest.
     65     return (*csm_r)->getConfidence() - (*csm_l)->getConfidence();
     66 }
     67 
     68 U_CDECL_END
     69 
     70 U_NAMESPACE_BEGIN
     71 
     72 void CharsetDetector::setRecognizers(UErrorCode &status)
     73 {
     74     UBool needsInit;
     75     CharsetRecognizer **recognizers;
     76 
     77     if (U_FAILURE(status)) {
     78         return;
     79     }
     80 
     81     UMTX_CHECK(NULL, (UBool) (fCSRecognizers == NULL), needsInit);
     82 
     83     if (needsInit) {
     84         CharsetRecognizer *tempArray[] = {
     85             new CharsetRecog_UTF8(),
     86 
     87             new CharsetRecog_UTF_16_BE(),
     88             new CharsetRecog_UTF_16_LE(),
     89             new CharsetRecog_UTF_32_BE(),
     90             new CharsetRecog_UTF_32_LE(),
     91 
     92             new CharsetRecog_8859_1(),
     93             new CharsetRecog_8859_2(),
     94             new CharsetRecog_8859_5_ru(),
     95             new CharsetRecog_8859_6_ar(),
     96             new CharsetRecog_8859_7_el(),
     97             new CharsetRecog_8859_8_I_he(),
     98             new CharsetRecog_8859_8_he(),
     99             new CharsetRecog_windows_1251(),
    100             new CharsetRecog_windows_1256(),
    101             new CharsetRecog_KOI8_R(),
    102             new CharsetRecog_8859_9_tr(),
    103             new CharsetRecog_sjis(),
    104             new CharsetRecog_gb_18030(),
    105             new CharsetRecog_euc_jp(),
    106             new CharsetRecog_euc_kr(),
    107             new CharsetRecog_big5(),
    108 
    109             new CharsetRecog_2022JP(),
    110             new CharsetRecog_2022KR(),
    111             new CharsetRecog_2022CN(),
    112 
    113             new CharsetRecog_IBM424_he_rtl(),
    114             new CharsetRecog_IBM424_he_ltr(),
    115             new CharsetRecog_IBM420_ar_rtl(),
    116             new CharsetRecog_IBM420_ar_ltr()
    117         };
    118         int32_t rCount = ARRAY_SIZE(tempArray);
    119         int32_t r;
    120 
    121         recognizers = NEW_ARRAY(CharsetRecognizer *, rCount);
    122 
    123         if (recognizers == NULL) {
    124             status = U_MEMORY_ALLOCATION_ERROR;
    125             return;
    126         } else {
    127             for (r = 0; r < rCount; r += 1) {
    128                 recognizers[r] = tempArray[r];
    129 
    130                 if (recognizers[r] == NULL) {
    131                     status = U_MEMORY_ALLOCATION_ERROR;
    132                     break;
    133                 }
    134             }
    135         }
    136 
    137         if (U_SUCCESS(status)) {
    138             umtx_lock(NULL);
    139             if (fCSRecognizers == NULL) {
    140                 fCSRecognizers_size = rCount;
    141                 fCSRecognizers = recognizers;
    142             }
    143             umtx_unlock(NULL);
    144         }
    145 
    146         if (fCSRecognizers != recognizers) {
    147             for (r = 0; r < rCount; r += 1) {
    148                 delete recognizers[r];
    149                 recognizers[r] = NULL;
    150             }
    151 
    152             DELETE_ARRAY(recognizers);
    153         }
    154 
    155         recognizers = NULL;
    156         ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup);
    157     }
    158 }
    159 
    160 CharsetDetector::CharsetDetector(UErrorCode &status)
    161   : textIn(new InputText(status)), resultArray(NULL),
    162     resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE)
    163 {
    164     if (U_FAILURE(status)) {
    165         return;
    166     }
    167 
    168     setRecognizers(status);
    169 
    170     if (U_FAILURE(status)) {
    171         return;
    172     }
    173 
    174     resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size);
    175 
    176     if (resultArray == NULL) {
    177         status = U_MEMORY_ALLOCATION_ERROR;
    178         return;
    179     }
    180 
    181     for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
    182         resultArray[i] = new CharsetMatch();
    183 
    184         if (resultArray[i] == NULL) {
    185             status = U_MEMORY_ALLOCATION_ERROR;
    186             break;
    187         }
    188     }
    189 }
    190 
    191 CharsetDetector::~CharsetDetector()
    192 {
    193     delete textIn;
    194 
    195     for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
    196         delete resultArray[i];
    197     }
    198 
    199     uprv_free(resultArray);
    200 }
    201 
    202 void CharsetDetector::setText(const char *in, int32_t len)
    203 {
    204     textIn->setText(in, len);
    205     fFreshTextSet = TRUE;
    206 }
    207 
    208 UBool CharsetDetector::setStripTagsFlag(UBool flag)
    209 {
    210     UBool temp = fStripTags;
    211     fStripTags = flag;
    212     fFreshTextSet = TRUE;
    213     return temp;
    214 }
    215 
    216 UBool CharsetDetector::getStripTagsFlag() const
    217 {
    218     return fStripTags;
    219 }
    220 
    221 void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const
    222 {
    223     textIn->setDeclaredEncoding(encoding,len);
    224 }
    225 
    226 int32_t CharsetDetector::getDetectableCount()
    227 {
    228     UErrorCode status = U_ZERO_ERROR;
    229 
    230     setRecognizers(status);
    231 
    232     return fCSRecognizers_size;
    233 }
    234 
    235 const CharsetMatch *CharsetDetector::detect(UErrorCode &status)
    236 {
    237     int32_t maxMatchesFound = 0;
    238 
    239     detectAll(maxMatchesFound, status);
    240 
    241     if(maxMatchesFound > 0) {
    242         return resultArray[0];
    243     } else {
    244         return NULL;
    245     }
    246 }
    247 
    248 const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status)
    249 {
    250     if(!textIn->isSet()) {
    251         status = U_MISSING_RESOURCE_ERROR;// TODO:  Need to set proper status code for input text not set
    252 
    253         return NULL;
    254     } else if (fFreshTextSet) {
    255         CharsetRecognizer *csr;
    256         int32_t            i;
    257 
    258         textIn->MungeInput(fStripTags);
    259 
    260         // Iterate over all possible charsets, remember all that
    261         // give a match quality > 0.
    262         resultCount = 0;
    263         for (i = 0; i < fCSRecognizers_size; i += 1) {
    264             csr = fCSRecognizers[i];
    265             if (csr->match(textIn, resultArray[resultCount])) {
    266                 resultCount++;
    267             }
    268         }
    269 
    270         if (resultCount > 1) {
    271             uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status);
    272         }
    273         fFreshTextSet = FALSE;
    274     }
    275 
    276     maxMatchesFound = resultCount;
    277 
    278     return resultArray;
    279 }
    280 
    281 /*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
    282 {
    283     if( index > fCSRecognizers_size-1 || index < 0) {
    284         status = U_INDEX_OUTOFBOUNDS_ERROR;
    285 
    286         return 0;
    287     } else {
    288         return fCSRecognizers[index]->getName();
    289     }
    290 }*/
    291 
    292 U_NAMESPACE_END
    293 
    294 U_CDECL_BEGIN
    295 typedef struct {
    296     int32_t currIndex;
    297 } Context;
    298 
    299 
    300 
    301 static void U_CALLCONV
    302 enumClose(UEnumeration *en) {
    303     if(en->context != NULL) {
    304         DELETE_ARRAY(en->context);
    305     }
    306 
    307     DELETE_ARRAY(en);
    308 }
    309 
    310 static int32_t U_CALLCONV
    311 enumCount(UEnumeration *, UErrorCode *) {
    312     return fCSRecognizers_size;
    313 }
    314 
    315 static const char* U_CALLCONV
    316 enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) {
    317     if(((Context *)en->context)->currIndex >= fCSRecognizers_size) {
    318         if(resultLength != NULL) {
    319             *resultLength = 0;
    320         }
    321         return NULL;
    322     }
    323     const char *currName = fCSRecognizers[((Context *)en->context)->currIndex]->getName();
    324     if(resultLength != NULL) {
    325         *resultLength = (int32_t)uprv_strlen(currName);
    326     }
    327     ((Context *)en->context)->currIndex++;
    328 
    329     return currName;
    330 }
    331 
    332 static void U_CALLCONV
    333 enumReset(UEnumeration *en, UErrorCode *) {
    334     ((Context *)en->context)->currIndex = 0;
    335 }
    336 
    337 static const UEnumeration gCSDetEnumeration = {
    338     NULL,
    339     NULL,
    340     enumClose,
    341     enumCount,
    342     uenum_unextDefault,
    343     enumNext,
    344     enumReset
    345 };
    346 
    347 U_CAPI  UEnumeration * U_EXPORT2
    348 ucsdet_getAllDetectableCharsets(const UCharsetDetector * /*ucsd*/, UErrorCode *status)
    349 {
    350     U_NAMESPACE_USE
    351 
    352     if(U_FAILURE(*status)) {
    353         return 0;
    354     }
    355 
    356     /* Initialize recognized charsets. */
    357     CharsetDetector::getDetectableCount();
    358 
    359     UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
    360     memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
    361     en->context = (void*)NEW_ARRAY(Context, 1);
    362     uprv_memset(en->context, 0, sizeof(Context));
    363     return en;
    364 }
    365 U_CDECL_END
    366 
    367 #endif
    368 
    369