1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (C) 2005-2016, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ********************************************************************** 8 */ 9 10 #include "unicode/utypes.h" 11 12 #if !UCONFIG_NO_CONVERSION 13 14 #include "unicode/ucsdet.h" 15 16 #include "csdetect.h" 17 #include "csmatch.h" 18 #include "uenumimp.h" 19 20 #include "cmemory.h" 21 #include "cstring.h" 22 #include "umutex.h" 23 #include "ucln_in.h" 24 #include "uarrsort.h" 25 #include "inputext.h" 26 #include "csrsbcs.h" 27 #include "csrmbcs.h" 28 #include "csrutf8.h" 29 #include "csrucode.h" 30 #include "csr2022.h" 31 32 #define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type)) 33 #define DELETE_ARRAY(array) uprv_free((void *) (array)) 34 35 U_NAMESPACE_BEGIN 36 37 struct CSRecognizerInfo : public UMemory { 38 CSRecognizerInfo(CharsetRecognizer *recognizer, UBool isDefaultEnabled) 39 : recognizer(recognizer), isDefaultEnabled(isDefaultEnabled) {}; 40 41 ~CSRecognizerInfo() {delete recognizer;}; 42 43 CharsetRecognizer *recognizer; 44 UBool isDefaultEnabled; 45 }; 46 47 U_NAMESPACE_END 48 49 static icu::CSRecognizerInfo **fCSRecognizers = NULL; 50 static icu::UInitOnce gCSRecognizersInitOnce; 51 static int32_t fCSRecognizers_size = 0; 52 53 U_CDECL_BEGIN 54 static UBool U_CALLCONV csdet_cleanup(void) 55 { 56 U_NAMESPACE_USE 57 if (fCSRecognizers != NULL) { 58 for(int32_t r = 0; r < fCSRecognizers_size; r += 1) { 59 delete fCSRecognizers[r]; 60 fCSRecognizers[r] = NULL; 61 } 62 63 DELETE_ARRAY(fCSRecognizers); 64 fCSRecognizers = NULL; 65 fCSRecognizers_size = 0; 66 } 67 gCSRecognizersInitOnce.reset(); 68 69 return TRUE; 70 } 71 72 static int32_t U_CALLCONV 73 charsetMatchComparator(const void * /*context*/, const void *left, const void *right) 74 { 75 U_NAMESPACE_USE 76 77 const CharsetMatch **csm_l = (const CharsetMatch **) left; 78 const CharsetMatch **csm_r = (const CharsetMatch **) right; 79 80 // NOTE: compare is backwards to sort from highest to lowest. 81 return (*csm_r)->getConfidence() - (*csm_l)->getConfidence(); 82 } 83 84 static void U_CALLCONV initRecognizers(UErrorCode &status) { 85 U_NAMESPACE_USE 86 ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup); 87 CSRecognizerInfo *tempArray[] = { 88 new CSRecognizerInfo(new CharsetRecog_UTF8(), TRUE), 89 90 new CSRecognizerInfo(new CharsetRecog_UTF_16_BE(), TRUE), 91 new CSRecognizerInfo(new CharsetRecog_UTF_16_LE(), TRUE), 92 new CSRecognizerInfo(new CharsetRecog_UTF_32_BE(), TRUE), 93 new CSRecognizerInfo(new CharsetRecog_UTF_32_LE(), TRUE), 94 95 new CSRecognizerInfo(new CharsetRecog_8859_1(), TRUE), 96 new CSRecognizerInfo(new CharsetRecog_8859_2(), TRUE), 97 new CSRecognizerInfo(new CharsetRecog_8859_5_ru(), TRUE), 98 new CSRecognizerInfo(new CharsetRecog_8859_6_ar(), TRUE), 99 new CSRecognizerInfo(new CharsetRecog_8859_7_el(), TRUE), 100 new CSRecognizerInfo(new CharsetRecog_8859_8_I_he(), TRUE), 101 new CSRecognizerInfo(new CharsetRecog_8859_8_he(), TRUE), 102 new CSRecognizerInfo(new CharsetRecog_windows_1251(), TRUE), 103 new CSRecognizerInfo(new CharsetRecog_windows_1256(), TRUE), 104 new CSRecognizerInfo(new CharsetRecog_KOI8_R(), TRUE), 105 new CSRecognizerInfo(new CharsetRecog_8859_9_tr(), TRUE), 106 new CSRecognizerInfo(new CharsetRecog_sjis(), TRUE), 107 new CSRecognizerInfo(new CharsetRecog_gb_18030(), TRUE), 108 new CSRecognizerInfo(new CharsetRecog_euc_jp(), TRUE), 109 new CSRecognizerInfo(new CharsetRecog_euc_kr(), TRUE), 110 new CSRecognizerInfo(new CharsetRecog_big5(), TRUE), 111 112 new CSRecognizerInfo(new CharsetRecog_2022JP(), TRUE), 113 #if !UCONFIG_ONLY_HTML_CONVERSION 114 new CSRecognizerInfo(new CharsetRecog_2022KR(), TRUE), 115 new CSRecognizerInfo(new CharsetRecog_2022CN(), TRUE), 116 117 new CSRecognizerInfo(new CharsetRecog_IBM424_he_rtl(), FALSE), 118 new CSRecognizerInfo(new CharsetRecog_IBM424_he_ltr(), FALSE), 119 new CSRecognizerInfo(new CharsetRecog_IBM420_ar_rtl(), FALSE), 120 new CSRecognizerInfo(new CharsetRecog_IBM420_ar_ltr(), FALSE) 121 #endif 122 }; 123 int32_t rCount = UPRV_LENGTHOF(tempArray); 124 125 fCSRecognizers = NEW_ARRAY(CSRecognizerInfo *, rCount); 126 127 if (fCSRecognizers == NULL) { 128 status = U_MEMORY_ALLOCATION_ERROR; 129 } 130 else { 131 fCSRecognizers_size = rCount; 132 for (int32_t r = 0; r < rCount; r += 1) { 133 fCSRecognizers[r] = tempArray[r]; 134 if (fCSRecognizers[r] == NULL) { 135 status = U_MEMORY_ALLOCATION_ERROR; 136 } 137 } 138 } 139 } 140 141 U_CDECL_END 142 143 U_NAMESPACE_BEGIN 144 145 void CharsetDetector::setRecognizers(UErrorCode &status) 146 { 147 umtx_initOnce(gCSRecognizersInitOnce, &initRecognizers, status); 148 } 149 150 CharsetDetector::CharsetDetector(UErrorCode &status) 151 : textIn(new InputText(status)), resultArray(NULL), 152 resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE), 153 fEnabledRecognizers(NULL) 154 { 155 if (U_FAILURE(status)) { 156 return; 157 } 158 159 setRecognizers(status); 160 161 if (U_FAILURE(status)) { 162 return; 163 } 164 165 resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size); 166 167 if (resultArray == NULL) { 168 status = U_MEMORY_ALLOCATION_ERROR; 169 return; 170 } 171 172 for(int32_t i = 0; i < fCSRecognizers_size; i += 1) { 173 resultArray[i] = new CharsetMatch(); 174 175 if (resultArray[i] == NULL) { 176 status = U_MEMORY_ALLOCATION_ERROR; 177 break; 178 } 179 } 180 } 181 182 CharsetDetector::~CharsetDetector() 183 { 184 delete textIn; 185 186 for(int32_t i = 0; i < fCSRecognizers_size; i += 1) { 187 delete resultArray[i]; 188 } 189 190 uprv_free(resultArray); 191 192 if (fEnabledRecognizers) { 193 uprv_free(fEnabledRecognizers); 194 } 195 } 196 197 void CharsetDetector::setText(const char *in, int32_t len) 198 { 199 textIn->setText(in, len); 200 fFreshTextSet = TRUE; 201 } 202 203 UBool CharsetDetector::setStripTagsFlag(UBool flag) 204 { 205 UBool temp = fStripTags; 206 fStripTags = flag; 207 fFreshTextSet = TRUE; 208 return temp; 209 } 210 211 UBool CharsetDetector::getStripTagsFlag() const 212 { 213 return fStripTags; 214 } 215 216 void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const 217 { 218 textIn->setDeclaredEncoding(encoding,len); 219 } 220 221 int32_t CharsetDetector::getDetectableCount() 222 { 223 UErrorCode status = U_ZERO_ERROR; 224 225 setRecognizers(status); 226 227 return fCSRecognizers_size; 228 } 229 230 const CharsetMatch *CharsetDetector::detect(UErrorCode &status) 231 { 232 int32_t maxMatchesFound = 0; 233 234 detectAll(maxMatchesFound, status); 235 236 if(maxMatchesFound > 0) { 237 return resultArray[0]; 238 } else { 239 return NULL; 240 } 241 } 242 243 const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status) 244 { 245 if(!textIn->isSet()) { 246 status = U_MISSING_RESOURCE_ERROR;// TODO: Need to set proper status code for input text not set 247 248 return NULL; 249 } else if (fFreshTextSet) { 250 CharsetRecognizer *csr; 251 int32_t i; 252 253 textIn->MungeInput(fStripTags); 254 255 // Iterate over all possible charsets, remember all that 256 // give a match quality > 0. 257 resultCount = 0; 258 for (i = 0; i < fCSRecognizers_size; i += 1) { 259 csr = fCSRecognizers[i]->recognizer; 260 if (csr->match(textIn, resultArray[resultCount])) { 261 resultCount++; 262 } 263 } 264 265 if (resultCount > 1) { 266 uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status); 267 } 268 fFreshTextSet = FALSE; 269 } 270 271 maxMatchesFound = resultCount; 272 273 return resultArray; 274 } 275 276 void CharsetDetector::setDetectableCharset(const char *encoding, UBool enabled, UErrorCode &status) 277 { 278 if (U_FAILURE(status)) { 279 return; 280 } 281 282 int32_t modIdx = -1; 283 UBool isDefaultVal = FALSE; 284 for (int32_t i = 0; i < fCSRecognizers_size; i++) { 285 CSRecognizerInfo *csrinfo = fCSRecognizers[i]; 286 if (uprv_strcmp(csrinfo->recognizer->getName(), encoding) == 0) { 287 modIdx = i; 288 isDefaultVal = (csrinfo->isDefaultEnabled == enabled); 289 break; 290 } 291 } 292 if (modIdx < 0) { 293 // No matching encoding found 294 status = U_ILLEGAL_ARGUMENT_ERROR; 295 return; 296 } 297 298 if (fEnabledRecognizers == NULL && !isDefaultVal) { 299 // Create an array storing the non default setting 300 fEnabledRecognizers = NEW_ARRAY(UBool, fCSRecognizers_size); 301 if (fEnabledRecognizers == NULL) { 302 status = U_MEMORY_ALLOCATION_ERROR; 303 return; 304 } 305 // Initialize the array with default info 306 for (int32_t i = 0; i < fCSRecognizers_size; i++) { 307 fEnabledRecognizers[i] = fCSRecognizers[i]->isDefaultEnabled; 308 } 309 } 310 311 if (fEnabledRecognizers != NULL) { 312 fEnabledRecognizers[modIdx] = enabled; 313 } 314 } 315 316 /*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const 317 { 318 if( index > fCSRecognizers_size-1 || index < 0) { 319 status = U_INDEX_OUTOFBOUNDS_ERROR; 320 321 return 0; 322 } else { 323 return fCSRecognizers[index]->getName(); 324 } 325 }*/ 326 327 U_NAMESPACE_END 328 329 U_CDECL_BEGIN 330 typedef struct { 331 int32_t currIndex; 332 UBool all; 333 UBool *enabledRecognizers; 334 } Context; 335 336 337 338 static void U_CALLCONV 339 enumClose(UEnumeration *en) { 340 if(en->context != NULL) { 341 DELETE_ARRAY(en->context); 342 } 343 344 DELETE_ARRAY(en); 345 } 346 347 static int32_t U_CALLCONV 348 enumCount(UEnumeration *en, UErrorCode *) { 349 if (((Context *)en->context)->all) { 350 // ucsdet_getAllDetectableCharsets, all charset detector names 351 return fCSRecognizers_size; 352 } 353 354 // Otherwise, ucsdet_getDetectableCharsets - only enabled ones 355 int32_t count = 0; 356 UBool *enabledArray = ((Context *)en->context)->enabledRecognizers; 357 if (enabledArray != NULL) { 358 // custom set 359 for (int32_t i = 0; i < fCSRecognizers_size; i++) { 360 if (enabledArray[i]) { 361 count++; 362 } 363 } 364 } else { 365 // default set 366 for (int32_t i = 0; i < fCSRecognizers_size; i++) { 367 if (fCSRecognizers[i]->isDefaultEnabled) { 368 count++; 369 } 370 } 371 } 372 return count; 373 } 374 375 static const char* U_CALLCONV 376 enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) { 377 const char *currName = NULL; 378 379 if (((Context *)en->context)->currIndex < fCSRecognizers_size) { 380 if (((Context *)en->context)->all) { 381 // ucsdet_getAllDetectableCharsets, all charset detector names 382 currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName(); 383 ((Context *)en->context)->currIndex++; 384 } else { 385 // ucsdet_getDetectableCharsets 386 UBool *enabledArray = ((Context *)en->context)->enabledRecognizers; 387 if (enabledArray != NULL) { 388 // custome set 389 while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) { 390 if (enabledArray[((Context *)en->context)->currIndex]) { 391 currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName(); 392 } 393 ((Context *)en->context)->currIndex++; 394 } 395 } else { 396 // default set 397 while (currName == NULL && ((Context *)en->context)->currIndex < fCSRecognizers_size) { 398 if (fCSRecognizers[((Context *)en->context)->currIndex]->isDefaultEnabled) { 399 currName = fCSRecognizers[((Context *)en->context)->currIndex]->recognizer->getName(); 400 } 401 ((Context *)en->context)->currIndex++; 402 } 403 } 404 } 405 } 406 407 if(resultLength != NULL) { 408 *resultLength = currName == NULL ? 0 : (int32_t)uprv_strlen(currName); 409 } 410 411 return currName; 412 } 413 414 415 static void U_CALLCONV 416 enumReset(UEnumeration *en, UErrorCode *) { 417 ((Context *)en->context)->currIndex = 0; 418 } 419 420 static const UEnumeration gCSDetEnumeration = { 421 NULL, 422 NULL, 423 enumClose, 424 enumCount, 425 uenum_unextDefault, 426 enumNext, 427 enumReset 428 }; 429 430 U_CDECL_END 431 432 U_NAMESPACE_BEGIN 433 434 UEnumeration * CharsetDetector::getAllDetectableCharsets(UErrorCode &status) 435 { 436 437 /* Initialize recognized charsets. */ 438 setRecognizers(status); 439 440 if(U_FAILURE(status)) { 441 return 0; 442 } 443 444 UEnumeration *en = NEW_ARRAY(UEnumeration, 1); 445 if (en == NULL) { 446 status = U_MEMORY_ALLOCATION_ERROR; 447 return 0; 448 } 449 memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration)); 450 en->context = (void*)NEW_ARRAY(Context, 1); 451 if (en->context == NULL) { 452 status = U_MEMORY_ALLOCATION_ERROR; 453 DELETE_ARRAY(en); 454 return 0; 455 } 456 uprv_memset(en->context, 0, sizeof(Context)); 457 ((Context*)en->context)->all = TRUE; 458 return en; 459 } 460 461 UEnumeration * CharsetDetector::getDetectableCharsets(UErrorCode &status) const 462 { 463 if(U_FAILURE(status)) { 464 return 0; 465 } 466 467 UEnumeration *en = NEW_ARRAY(UEnumeration, 1); 468 if (en == NULL) { 469 status = U_MEMORY_ALLOCATION_ERROR; 470 return 0; 471 } 472 memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration)); 473 en->context = (void*)NEW_ARRAY(Context, 1); 474 if (en->context == NULL) { 475 status = U_MEMORY_ALLOCATION_ERROR; 476 DELETE_ARRAY(en); 477 return 0; 478 } 479 uprv_memset(en->context, 0, sizeof(Context)); 480 ((Context*)en->context)->all = FALSE; 481 ((Context*)en->context)->enabledRecognizers = fEnabledRecognizers; 482 return en; 483 } 484 485 U_NAMESPACE_END 486 487 #endif 488