Home | History | Annotate | Download | only in i18n
      1 /*
      2  **********************************************************************
      3  *   Copyright (C) 2005-2009, International Business Machines
      4  *   Corporation and others.  All Rights Reserved.
      5  **********************************************************************
      6  */
      7 
      8 #include "unicode/utypes.h"
      9 
     10 #if !UCONFIG_NO_CONVERSION
     11 
     12 #include "unicode/ucsdet.h"
     13 
     14 #include "csdetect.h"
     15 #include "csmatch.h"
     16 #include "uenumimp.h"
     17 
     18 #include "cmemory.h"
     19 #include "cstring.h"
     20 #include "umutex.h"
     21 #include "ucln_in.h"
     22 #include "uarrsort.h"
     23 #include "inputext.h"
     24 #include "csrsbcs.h"
     25 #include "csrmbcs.h"
     26 #include "csrutf8.h"
     27 #include "csrucode.h"
     28 #include "csr2022.h"
     29 
     30 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0])
     31 
     32 #define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type))
     33 #define DELETE_ARRAY(array) uprv_free((void *) (array))
     34 
     35 U_CDECL_BEGIN
// Table of recognizer instances shared by every detector. Stays NULL until
// CharsetDetector::setRecognizers() installs it; released by csdet_cleanup().
static U_NAMESPACE_QUALIFIER CharsetRecognizer **fCSRecognizers = NULL;

// Number of entries in fCSRecognizers (0 while the table is not installed).
static int32_t fCSRecognizers_size = 0;
     39 
     40 static UBool U_CALLCONV csdet_cleanup(void)
     41 {
     42     if (fCSRecognizers != NULL) {
     43         for(int32_t r = 0; r < fCSRecognizers_size; r += 1) {
     44             delete fCSRecognizers[r];
     45             fCSRecognizers[r] = NULL;
     46         }
     47 
     48         DELETE_ARRAY(fCSRecognizers);
     49         fCSRecognizers = NULL;
     50         fCSRecognizers_size = 0;
     51     }
     52 
     53     return TRUE;
     54 }
     55 
     56 static int32_t U_CALLCONV
     57 charsetMatchComparator(const void * /*context*/, const void *left, const void *right)
     58 {
     59     U_NAMESPACE_USE
     60 
     61     const CharsetMatch **csm_l = (const CharsetMatch **) left;
     62     const CharsetMatch **csm_r = (const CharsetMatch **) right;
     63 
     64     // NOTE: compare is backwards to sort from highest to lowest.
     65     return (*csm_r)->getConfidence() - (*csm_l)->getConfidence();
     66 }
     67 
     68 U_CDECL_END
     69 
     70 U_NAMESPACE_BEGIN
     71 
     72 void CharsetDetector::setRecognizers(UErrorCode &status)
     73 {
     74     UBool needsInit;
     75     CharsetRecognizer **recognizers;
     76 
     77     if (U_FAILURE(status)) {
     78         return;
     79     }
     80 
     81     UMTX_CHECK(NULL, (UBool) (fCSRecognizers == NULL), needsInit);
     82 
     83     if (needsInit) {
     84         CharsetRecognizer *tempArray[] = {
     85             new CharsetRecog_UTF8(),
     86 
     87             new CharsetRecog_UTF_16_BE(),
     88             new CharsetRecog_UTF_16_LE(),
     89             new CharsetRecog_UTF_32_BE(),
     90             new CharsetRecog_UTF_32_LE(),
     91 
     92             new CharsetRecog_8859_1_en(),
     93             new CharsetRecog_8859_1_da(),
     94             new CharsetRecog_8859_1_de(),
     95             new CharsetRecog_8859_1_es(),
     96             new CharsetRecog_8859_1_fr(),
     97             new CharsetRecog_8859_1_it(),
     98             new CharsetRecog_8859_1_nl(),
     99             new CharsetRecog_8859_1_no(),
    100             new CharsetRecog_8859_1_pt(),
    101             new CharsetRecog_8859_1_sv(),
    102             new CharsetRecog_8859_2_cs(),
    103             new CharsetRecog_8859_2_hu(),
    104             new CharsetRecog_8859_2_pl(),
    105             new CharsetRecog_8859_2_ro(),
    106             new CharsetRecog_8859_5_ru(),
    107             new CharsetRecog_8859_6_ar(),
    108             new CharsetRecog_8859_7_el(),
    109             new CharsetRecog_8859_8_I_he(),
    110             new CharsetRecog_8859_8_he(),
    111             new CharsetRecog_windows_1251(),
    112             new CharsetRecog_windows_1256(),
    113             new CharsetRecog_KOI8_R(),
    114             new CharsetRecog_8859_9_tr(),
    115             new CharsetRecog_sjis(),
    116             new CharsetRecog_gb_18030(),
    117             new CharsetRecog_euc_jp(),
    118             new CharsetRecog_euc_kr(),
    119             new CharsetRecog_big5(),
    120 
    121             new CharsetRecog_2022JP(),
    122             new CharsetRecog_2022KR(),
    123             new CharsetRecog_2022CN(),
    124 
    125             new CharsetRecog_IBM424_he_rtl(),
    126             new CharsetRecog_IBM424_he_ltr(),
    127             new CharsetRecog_IBM420_ar_rtl(),
    128             new CharsetRecog_IBM420_ar_ltr()
    129         };
    130         int32_t rCount = ARRAY_SIZE(tempArray);
    131         int32_t r;
    132 
    133         recognizers = NEW_ARRAY(CharsetRecognizer *, rCount);
    134 
    135         if (recognizers == NULL) {
    136             status = U_MEMORY_ALLOCATION_ERROR;
    137             return;
    138         } else {
    139             for (r = 0; r < rCount; r += 1) {
    140                 recognizers[r] = tempArray[r];
    141 
    142                 if (recognizers[r] == NULL) {
    143                     status = U_MEMORY_ALLOCATION_ERROR;
    144                     break;
    145                 }
    146             }
    147         }
    148 
    149         if (U_SUCCESS(status)) {
    150             umtx_lock(NULL);
    151             if (fCSRecognizers == NULL) {
    152                 fCSRecognizers_size = rCount;
    153                 fCSRecognizers = recognizers;
    154             }
    155             umtx_unlock(NULL);
    156         }
    157 
    158         if (fCSRecognizers != recognizers) {
    159             for (r = 0; r < rCount; r += 1) {
    160                 delete recognizers[r];
    161                 recognizers[r] = NULL;
    162             }
    163 
    164             DELETE_ARRAY(recognizers);
    165         }
    166 
    167         recognizers = NULL;
    168         ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup);
    169     }
    170 }
    171 
    172 CharsetDetector::CharsetDetector(UErrorCode &status)
    173   : textIn(new InputText(status)), resultArray(NULL),
    174     resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE)
    175 {
    176     if (U_FAILURE(status)) {
    177         return;
    178     }
    179 
    180     setRecognizers(status);
    181 
    182     if (U_FAILURE(status)) {
    183         return;
    184     }
    185 
    186     resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size);
    187 
    188     if (resultArray == NULL) {
    189         status = U_MEMORY_ALLOCATION_ERROR;
    190         return;
    191     }
    192 
    193     for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
    194         resultArray[i] = new CharsetMatch();
    195 
    196         if (resultArray[i] == NULL) {
    197             status = U_MEMORY_ALLOCATION_ERROR;
    198             break;
    199         }
    200     }
    201 }
    202 
    203 CharsetDetector::~CharsetDetector()
    204 {
    205     delete textIn;
    206 
    207     for(int32_t i = 0; i < fCSRecognizers_size; i += 1) {
    208         delete resultArray[i];
    209     }
    210 
    211     uprv_free(resultArray);
    212 }
    213 
    214 void CharsetDetector::setText(const char *in, int32_t len)
    215 {
    216     textIn->setText(in, len);
    217     fFreshTextSet = TRUE;
    218 }
    219 
    220 UBool CharsetDetector::setStripTagsFlag(UBool flag)
    221 {
    222     UBool temp = fStripTags;
    223     fStripTags = flag;
    224     fFreshTextSet = TRUE;
    225     return temp;
    226 }
    227 
    228 UBool CharsetDetector::getStripTagsFlag() const
    229 {
    230     return fStripTags;
    231 }
    232 
    233 void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const
    234 {
    235     textIn->setDeclaredEncoding(encoding,len);
    236 }
    237 
    238 int32_t CharsetDetector::getDetectableCount()
    239 {
    240     UErrorCode status = U_ZERO_ERROR;
    241 
    242     setRecognizers(status);
    243 
    244     return fCSRecognizers_size;
    245 }
    246 
    247 const CharsetMatch *CharsetDetector::detect(UErrorCode &status)
    248 {
    249     int32_t maxMatchesFound = 0;
    250 
    251     detectAll(maxMatchesFound, status);
    252 
    253     if(maxMatchesFound > 0) {
    254         return resultArray[0];
    255     } else {
    256         return NULL;
    257     }
    258 }
    259 
    260 const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status)
    261 {
    262     if(!textIn->isSet()) {
    263         status = U_MISSING_RESOURCE_ERROR;// TODO:  Need to set proper status code for input text not set
    264 
    265         return NULL;
    266     } else if(fFreshTextSet) {
    267         CharsetRecognizer *csr;
    268         int32_t            detectResults;
    269         int32_t            confidence;
    270         int32_t            i;
    271 
    272         textIn->MungeInput(fStripTags);
    273 
    274         // Iterate over all possible charsets, remember all that
    275         // give a match quality > 0.
    276         resultCount = 0;
    277         for (i = 0; i < fCSRecognizers_size; i += 1) {
    278             csr = fCSRecognizers[i];
    279             detectResults = csr->match(textIn);
    280             confidence = detectResults;
    281 
    282             if (confidence > 0)  {
    283                 resultArray[resultCount++]->set(textIn, csr, confidence);
    284             }
    285         }
    286 
    287         for(i = resultCount; i < fCSRecognizers_size; i += 1) {
    288             resultArray[i]->set(textIn, 0, 0);
    289         }
    290 
    291         uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status);
    292 
    293         // Remove duplicate charsets from the results.
    294         // Simple minded, brute force approach - check each entry against all that follow.
    295         // The first entry of any duplicated set is the one that should be kept because it will
    296         // be the one with the highest confidence rating.
    297         //   (Duplicate matches have different languages, only the charset is the same)
    298         // Because the resultArray contains preallocated CharsetMatch objects, they aren't actually
    299         // deleted, just reordered, with the unwanted duplicates placed after the good results.
    300         int32_t j, k;
    301         for (i=0; i<resultCount; i++) {
    302             const char *charSetName = resultArray[i]->getName();
    303             for (j=i+1; j<resultCount; ) {
    304                 if (uprv_strcmp(charSetName, resultArray[j]->getName()) != 0) {
    305                     // Not a duplicate.
    306                     j++;
    307                 } else {
    308                     // Duplicate entry at index j.
    309                     CharsetMatch *duplicate = resultArray[j];
    310                     for (k=j; k<resultCount-1; k++) {
    311                         resultArray[k] = resultArray[k+1];
    312                     }
    313                     resultCount--;
    314                     resultArray[resultCount] = duplicate;
    315                 }
    316             }
    317         }
    318 
    319         fFreshTextSet = FALSE;
    320     }
    321 
    322     maxMatchesFound = resultCount;
    323 
    324     return resultArray;
    325 }
    326 
    327 /*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const
    328 {
    329     if( index > fCSRecognizers_size-1 || index < 0) {
    330         status = U_INDEX_OUTOFBOUNDS_ERROR;
    331 
    332         return 0;
    333     } else {
    334         return fCSRecognizers[index]->getName();
    335     }
    336 }*/
    337 
    338 U_NAMESPACE_END
    339 
    340 U_CDECL_BEGIN
/* Per-enumeration state stored in UEnumeration::context. */
typedef struct {
    int32_t currIndex;  /* index into fCSRecognizers of the next name to return */
} Context;
    344 
    345 
    346 
    347 static void U_CALLCONV
    348 enumClose(UEnumeration *en) {
    349     if(en->context != NULL) {
    350         DELETE_ARRAY(en->context);
    351     }
    352 
    353     DELETE_ARRAY(en);
    354 }
    355 
    356 static int32_t U_CALLCONV
    357 enumCount(UEnumeration *, UErrorCode *) {
    358     return fCSRecognizers_size;
    359 }
    360 
    361 static const char* U_CALLCONV
    362 enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) {
    363     if(((Context *)en->context)->currIndex >= fCSRecognizers_size) {
    364         if(resultLength != NULL) {
    365             *resultLength = 0;
    366         }
    367         return NULL;
    368     }
    369     const char *currName = fCSRecognizers[((Context *)en->context)->currIndex]->getName();
    370     if(resultLength != NULL) {
    371         *resultLength = (int32_t)uprv_strlen(currName);
    372     }
    373     ((Context *)en->context)->currIndex++;
    374 
    375     return currName;
    376 }
    377 
    378 static void U_CALLCONV
    379 enumReset(UEnumeration *en, UErrorCode *) {
    380     ((Context *)en->context)->currIndex = 0;
    381 }
    382 
/*
 * Template copied into every enumeration created by
 * ucsdet_getAllDetectableCharsets(), which then installs its own context.
 * Wires the callbacks defined above; the UChar variant of "next" uses the
 * library default. The two leading NULLs are data slots, not callbacks.
 */
static const UEnumeration gCSDetEnumeration = {
    NULL,
    NULL,
    enumClose,
    enumCount,
    uenum_unextDefault,
    enumNext,
    enumReset
};
    392 
    393 U_CAPI  UEnumeration * U_EXPORT2
    394 ucsdet_getAllDetectableCharsets(const UCharsetDetector * /*ucsd*/, UErrorCode *status)
    395 {
    396     U_NAMESPACE_USE
    397 
    398     if(U_FAILURE(*status)) {
    399         return 0;
    400     }
    401 
    402     /* Initialize recognized charsets. */
    403     CharsetDetector::getDetectableCount();
    404 
    405     UEnumeration *en = NEW_ARRAY(UEnumeration, 1);
    406     memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration));
    407     en->context = (void*)NEW_ARRAY(Context, 1);
    408     uprv_memset(en->context, 0, sizeof(Context));
    409     return en;
    410 }
    411 U_CDECL_END
    412 
    413 #endif
    414 
    415