1 /* 2 ********************************************************************** 3 * Copyright (C) 2005-2009, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 */ 7 8 #include "unicode/utypes.h" 9 10 #if !UCONFIG_NO_CONVERSION 11 12 #include "unicode/ucsdet.h" 13 14 #include "csdetect.h" 15 #include "csmatch.h" 16 #include "uenumimp.h" 17 18 #include "cmemory.h" 19 #include "cstring.h" 20 #include "umutex.h" 21 #include "ucln_in.h" 22 #include "uarrsort.h" 23 #include "inputext.h" 24 #include "csrsbcs.h" 25 #include "csrmbcs.h" 26 #include "csrutf8.h" 27 #include "csrucode.h" 28 #include "csr2022.h" 29 30 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) 31 32 #define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type)) 33 #define DELETE_ARRAY(array) uprv_free((void *) (array)) 34 35 U_CDECL_BEGIN 36 static U_NAMESPACE_QUALIFIER CharsetRecognizer **fCSRecognizers = NULL; 37 38 static int32_t fCSRecognizers_size = 0; 39 40 static UBool U_CALLCONV csdet_cleanup(void) 41 { 42 if (fCSRecognizers != NULL) { 43 for(int32_t r = 0; r < fCSRecognizers_size; r += 1) { 44 delete fCSRecognizers[r]; 45 fCSRecognizers[r] = NULL; 46 } 47 48 DELETE_ARRAY(fCSRecognizers); 49 fCSRecognizers = NULL; 50 fCSRecognizers_size = 0; 51 } 52 53 return TRUE; 54 } 55 56 static int32_t U_CALLCONV 57 charsetMatchComparator(const void * /*context*/, const void *left, const void *right) 58 { 59 U_NAMESPACE_USE 60 61 const CharsetMatch **csm_l = (const CharsetMatch **) left; 62 const CharsetMatch **csm_r = (const CharsetMatch **) right; 63 64 // NOTE: compare is backwards to sort from highest to lowest. 65 return (*csm_r)->getConfidence() - (*csm_l)->getConfidence(); 66 } 67 68 U_CDECL_END 69 70 U_NAMESPACE_BEGIN 71 72 void CharsetDetector::setRecognizers(UErrorCode &status) 73 { 74 UBool needsInit; 75 CharsetRecognizer **recognizers; 76 77 if (U_FAILURE(status)) { 78 return; 79 } 80 81 UMTX_CHECK(NULL, (UBool) (fCSRecognizers == NULL), needsInit); 82 83 if (needsInit) { 84 CharsetRecognizer *tempArray[] = { 85 new CharsetRecog_UTF8(), 86 87 new CharsetRecog_UTF_16_BE(), 88 new CharsetRecog_UTF_16_LE(), 89 new CharsetRecog_UTF_32_BE(), 90 new CharsetRecog_UTF_32_LE(), 91 92 new CharsetRecog_8859_1_en(), 93 new CharsetRecog_8859_1_da(), 94 new CharsetRecog_8859_1_de(), 95 new CharsetRecog_8859_1_es(), 96 new CharsetRecog_8859_1_fr(), 97 new CharsetRecog_8859_1_it(), 98 new CharsetRecog_8859_1_nl(), 99 new CharsetRecog_8859_1_no(), 100 new CharsetRecog_8859_1_pt(), 101 new CharsetRecog_8859_1_sv(), 102 new CharsetRecog_8859_2_cs(), 103 new CharsetRecog_8859_2_hu(), 104 new CharsetRecog_8859_2_pl(), 105 new CharsetRecog_8859_2_ro(), 106 new CharsetRecog_8859_5_ru(), 107 new CharsetRecog_8859_6_ar(), 108 new CharsetRecog_8859_7_el(), 109 new CharsetRecog_8859_8_I_he(), 110 new CharsetRecog_8859_8_he(), 111 new CharsetRecog_windows_1251(), 112 new CharsetRecog_windows_1256(), 113 new CharsetRecog_KOI8_R(), 114 new CharsetRecog_8859_9_tr(), 115 new CharsetRecog_sjis(), 116 new CharsetRecog_gb_18030(), 117 new CharsetRecog_euc_jp(), 118 new CharsetRecog_euc_kr(), 119 new CharsetRecog_big5(), 120 121 new CharsetRecog_2022JP(), 122 new CharsetRecog_2022KR(), 123 new CharsetRecog_2022CN(), 124 125 new CharsetRecog_IBM424_he_rtl(), 126 new CharsetRecog_IBM424_he_ltr(), 127 new CharsetRecog_IBM420_ar_rtl(), 128 new CharsetRecog_IBM420_ar_ltr() 129 }; 130 int32_t rCount = ARRAY_SIZE(tempArray); 131 int32_t r; 132 133 recognizers = NEW_ARRAY(CharsetRecognizer *, rCount); 134 135 if (recognizers == NULL) { 136 status = U_MEMORY_ALLOCATION_ERROR; 137 return; 138 } else { 139 for (r = 0; r < rCount; r += 1) { 140 recognizers[r] = tempArray[r]; 141 142 if (recognizers[r] == NULL) { 143 status = U_MEMORY_ALLOCATION_ERROR; 144 break; 145 } 146 } 147 } 148 149 if (U_SUCCESS(status)) { 150 umtx_lock(NULL); 151 if (fCSRecognizers == NULL) { 152 fCSRecognizers_size = rCount; 153 fCSRecognizers = recognizers; 154 } 155 umtx_unlock(NULL); 156 } 157 158 if (fCSRecognizers != recognizers) { 159 for (r = 0; r < rCount; r += 1) { 160 delete recognizers[r]; 161 recognizers[r] = NULL; 162 } 163 164 DELETE_ARRAY(recognizers); 165 } 166 167 recognizers = NULL; 168 ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup); 169 } 170 } 171 172 CharsetDetector::CharsetDetector(UErrorCode &status) 173 : textIn(new InputText(status)), resultArray(NULL), 174 resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE) 175 { 176 if (U_FAILURE(status)) { 177 return; 178 } 179 180 setRecognizers(status); 181 182 if (U_FAILURE(status)) { 183 return; 184 } 185 186 resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size); 187 188 if (resultArray == NULL) { 189 status = U_MEMORY_ALLOCATION_ERROR; 190 return; 191 } 192 193 for(int32_t i = 0; i < fCSRecognizers_size; i += 1) { 194 resultArray[i] = new CharsetMatch(); 195 196 if (resultArray[i] == NULL) { 197 status = U_MEMORY_ALLOCATION_ERROR; 198 break; 199 } 200 } 201 } 202 203 CharsetDetector::~CharsetDetector() 204 { 205 delete textIn; 206 207 for(int32_t i = 0; i < fCSRecognizers_size; i += 1) { 208 delete resultArray[i]; 209 } 210 211 uprv_free(resultArray); 212 } 213 214 void CharsetDetector::setText(const char *in, int32_t len) 215 { 216 textIn->setText(in, len); 217 fFreshTextSet = TRUE; 218 } 219 220 UBool CharsetDetector::setStripTagsFlag(UBool flag) 221 { 222 UBool temp = fStripTags; 223 fStripTags = flag; 224 fFreshTextSet = TRUE; 225 return temp; 226 } 227 228 UBool CharsetDetector::getStripTagsFlag() const 229 { 230 return fStripTags; 231 } 232 233 void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const 234 { 235 textIn->setDeclaredEncoding(encoding,len); 236 } 237 238 int32_t CharsetDetector::getDetectableCount() 239 { 240 UErrorCode status = U_ZERO_ERROR; 241 242 setRecognizers(status); 243 244 return fCSRecognizers_size; 245 } 246 247 const CharsetMatch *CharsetDetector::detect(UErrorCode &status) 248 { 249 int32_t maxMatchesFound = 0; 250 251 detectAll(maxMatchesFound, status); 252 253 if(maxMatchesFound > 0) { 254 return resultArray[0]; 255 } else { 256 return NULL; 257 } 258 } 259 260 const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status) 261 { 262 if(!textIn->isSet()) { 263 status = U_MISSING_RESOURCE_ERROR;// TODO: Need to set proper status code for input text not set 264 265 return NULL; 266 } else if(fFreshTextSet) { 267 CharsetRecognizer *csr; 268 int32_t detectResults; 269 int32_t confidence; 270 int32_t i; 271 272 textIn->MungeInput(fStripTags); 273 274 // Iterate over all possible charsets, remember all that 275 // give a match quality > 0. 276 resultCount = 0; 277 for (i = 0; i < fCSRecognizers_size; i += 1) { 278 csr = fCSRecognizers[i]; 279 detectResults = csr->match(textIn); 280 confidence = detectResults; 281 282 if (confidence > 0) { 283 resultArray[resultCount++]->set(textIn, csr, confidence); 284 } 285 } 286 287 for(i = resultCount; i < fCSRecognizers_size; i += 1) { 288 resultArray[i]->set(textIn, 0, 0); 289 } 290 291 uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status); 292 293 // Remove duplicate charsets from the results. 294 // Simple minded, brute force approach - check each entry against all that follow. 295 // The first entry of any duplicated set is the one that should be kept because it will 296 // be the one with the highest confidence rating. 297 // (Duplicate matches have different languages, only the charset is the same) 298 // Because the resultArray contains preallocated CharsetMatch objects, they aren't actually 299 // deleted, just reordered, with the unwanted duplicates placed after the good results. 300 int32_t j, k; 301 for (i=0; i<resultCount; i++) { 302 const char *charSetName = resultArray[i]->getName(); 303 for (j=i+1; j<resultCount; ) { 304 if (uprv_strcmp(charSetName, resultArray[j]->getName()) != 0) { 305 // Not a duplicate. 306 j++; 307 } else { 308 // Duplicate entry at index j. 309 CharsetMatch *duplicate = resultArray[j]; 310 for (k=j; k<resultCount-1; k++) { 311 resultArray[k] = resultArray[k+1]; 312 } 313 resultCount--; 314 resultArray[resultCount] = duplicate; 315 } 316 } 317 } 318 319 fFreshTextSet = FALSE; 320 } 321 322 maxMatchesFound = resultCount; 323 324 return resultArray; 325 } 326 327 /*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const 328 { 329 if( index > fCSRecognizers_size-1 || index < 0) { 330 status = U_INDEX_OUTOFBOUNDS_ERROR; 331 332 return 0; 333 } else { 334 return fCSRecognizers[index]->getName(); 335 } 336 }*/ 337 338 U_NAMESPACE_END 339 340 U_CDECL_BEGIN 341 typedef struct { 342 int32_t currIndex; 343 } Context; 344 345 346 347 static void U_CALLCONV 348 enumClose(UEnumeration *en) { 349 if(en->context != NULL) { 350 DELETE_ARRAY(en->context); 351 } 352 353 DELETE_ARRAY(en); 354 } 355 356 static int32_t U_CALLCONV 357 enumCount(UEnumeration *, UErrorCode *) { 358 return fCSRecognizers_size; 359 } 360 361 static const char* U_CALLCONV 362 enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) { 363 if(((Context *)en->context)->currIndex >= fCSRecognizers_size) { 364 if(resultLength != NULL) { 365 *resultLength = 0; 366 } 367 return NULL; 368 } 369 const char *currName = fCSRecognizers[((Context *)en->context)->currIndex]->getName(); 370 if(resultLength != NULL) { 371 *resultLength = (int32_t)uprv_strlen(currName); 372 } 373 ((Context *)en->context)->currIndex++; 374 375 return currName; 376 } 377 378 static void U_CALLCONV 379 enumReset(UEnumeration *en, UErrorCode *) { 380 ((Context *)en->context)->currIndex = 0; 381 } 382 383 static const UEnumeration gCSDetEnumeration = { 384 NULL, 385 NULL, 386 enumClose, 387 enumCount, 388 uenum_unextDefault, 389 enumNext, 390 enumReset 391 }; 392 393 U_CAPI UEnumeration * U_EXPORT2 394 ucsdet_getAllDetectableCharsets(const UCharsetDetector * /*ucsd*/, UErrorCode *status) 395 { 396 U_NAMESPACE_USE 397 398 if(U_FAILURE(*status)) { 399 return 0; 400 } 401 402 /* Initialize recognized charsets. */ 403 CharsetDetector::getDetectableCount(); 404 405 UEnumeration *en = NEW_ARRAY(UEnumeration, 1); 406 memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration)); 407 en->context = (void*)NEW_ARRAY(Context, 1); 408 uprv_memset(en->context, 0, sizeof(Context)); 409 return en; 410 } 411 U_CDECL_END 412 413 #endif 414 415