1 /* 2 ********************************************************************** 3 * Copyright (C) 2005-2012, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 */ 7 8 #include "unicode/utypes.h" 9 10 #if !UCONFIG_NO_CONVERSION 11 12 #include "unicode/ucsdet.h" 13 14 #include "csdetect.h" 15 #include "csmatch.h" 16 #include "uenumimp.h" 17 18 #include "cmemory.h" 19 #include "cstring.h" 20 #include "umutex.h" 21 #include "ucln_in.h" 22 #include "uarrsort.h" 23 #include "inputext.h" 24 #include "csrsbcs.h" 25 #include "csrmbcs.h" 26 #include "csrutf8.h" 27 #include "csrucode.h" 28 #include "csr2022.h" 29 30 #define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) 31 32 #define NEW_ARRAY(type,count) (type *) uprv_malloc((count) * sizeof(type)) 33 #define DELETE_ARRAY(array) uprv_free((void *) (array)) 34 35 U_CDECL_BEGIN 36 static icu::CharsetRecognizer **fCSRecognizers = NULL; 37 38 static int32_t fCSRecognizers_size = 0; 39 40 static UBool U_CALLCONV csdet_cleanup(void) 41 { 42 if (fCSRecognizers != NULL) { 43 for(int32_t r = 0; r < fCSRecognizers_size; r += 1) { 44 delete fCSRecognizers[r]; 45 fCSRecognizers[r] = NULL; 46 } 47 48 DELETE_ARRAY(fCSRecognizers); 49 fCSRecognizers = NULL; 50 fCSRecognizers_size = 0; 51 } 52 53 return TRUE; 54 } 55 56 static int32_t U_CALLCONV 57 charsetMatchComparator(const void * /*context*/, const void *left, const void *right) 58 { 59 U_NAMESPACE_USE 60 61 const CharsetMatch **csm_l = (const CharsetMatch **) left; 62 const CharsetMatch **csm_r = (const CharsetMatch **) right; 63 64 // NOTE: compare is backwards to sort from highest to lowest. 65 return (*csm_r)->getConfidence() - (*csm_l)->getConfidence(); 66 } 67 68 U_CDECL_END 69 70 U_NAMESPACE_BEGIN 71 72 void CharsetDetector::setRecognizers(UErrorCode &status) 73 { 74 UBool needsInit; 75 CharsetRecognizer **recognizers; 76 77 if (U_FAILURE(status)) { 78 return; 79 } 80 81 UMTX_CHECK(NULL, (UBool) (fCSRecognizers == NULL), needsInit); 82 83 if (needsInit) { 84 CharsetRecognizer *tempArray[] = { 85 new CharsetRecog_UTF8(), 86 87 new CharsetRecog_UTF_16_BE(), 88 new CharsetRecog_UTF_16_LE(), 89 new CharsetRecog_UTF_32_BE(), 90 new CharsetRecog_UTF_32_LE(), 91 92 new CharsetRecog_8859_1(), 93 new CharsetRecog_8859_2(), 94 new CharsetRecog_8859_5_ru(), 95 new CharsetRecog_8859_6_ar(), 96 new CharsetRecog_8859_7_el(), 97 new CharsetRecog_8859_8_I_he(), 98 new CharsetRecog_8859_8_he(), 99 new CharsetRecog_windows_1251(), 100 new CharsetRecog_windows_1256(), 101 new CharsetRecog_KOI8_R(), 102 new CharsetRecog_8859_9_tr(), 103 new CharsetRecog_sjis(), 104 new CharsetRecog_gb_18030(), 105 new CharsetRecog_euc_jp(), 106 new CharsetRecog_euc_kr(), 107 new CharsetRecog_big5(), 108 109 new CharsetRecog_2022JP(), 110 new CharsetRecog_2022KR(), 111 new CharsetRecog_2022CN(), 112 113 new CharsetRecog_IBM424_he_rtl(), 114 new CharsetRecog_IBM424_he_ltr(), 115 new CharsetRecog_IBM420_ar_rtl(), 116 new CharsetRecog_IBM420_ar_ltr() 117 }; 118 int32_t rCount = ARRAY_SIZE(tempArray); 119 int32_t r; 120 121 recognizers = NEW_ARRAY(CharsetRecognizer *, rCount); 122 123 if (recognizers == NULL) { 124 status = U_MEMORY_ALLOCATION_ERROR; 125 return; 126 } else { 127 for (r = 0; r < rCount; r += 1) { 128 recognizers[r] = tempArray[r]; 129 130 if (recognizers[r] == NULL) { 131 status = U_MEMORY_ALLOCATION_ERROR; 132 break; 133 } 134 } 135 } 136 137 if (U_SUCCESS(status)) { 138 umtx_lock(NULL); 139 if (fCSRecognizers == NULL) { 140 fCSRecognizers_size = rCount; 141 fCSRecognizers = recognizers; 142 } 143 umtx_unlock(NULL); 144 } 145 146 if (fCSRecognizers != recognizers) { 147 for (r = 0; r < rCount; r += 1) { 148 delete recognizers[r]; 149 recognizers[r] = NULL; 150 } 151 152 DELETE_ARRAY(recognizers); 153 } 154 155 recognizers = NULL; 156 ucln_i18n_registerCleanup(UCLN_I18N_CSDET, csdet_cleanup); 157 } 158 } 159 160 CharsetDetector::CharsetDetector(UErrorCode &status) 161 : textIn(new InputText(status)), resultArray(NULL), 162 resultCount(0), fStripTags(FALSE), fFreshTextSet(FALSE) 163 { 164 if (U_FAILURE(status)) { 165 return; 166 } 167 168 setRecognizers(status); 169 170 if (U_FAILURE(status)) { 171 return; 172 } 173 174 resultArray = (CharsetMatch **)uprv_malloc(sizeof(CharsetMatch *)*fCSRecognizers_size); 175 176 if (resultArray == NULL) { 177 status = U_MEMORY_ALLOCATION_ERROR; 178 return; 179 } 180 181 for(int32_t i = 0; i < fCSRecognizers_size; i += 1) { 182 resultArray[i] = new CharsetMatch(); 183 184 if (resultArray[i] == NULL) { 185 status = U_MEMORY_ALLOCATION_ERROR; 186 break; 187 } 188 } 189 } 190 191 CharsetDetector::~CharsetDetector() 192 { 193 delete textIn; 194 195 for(int32_t i = 0; i < fCSRecognizers_size; i += 1) { 196 delete resultArray[i]; 197 } 198 199 uprv_free(resultArray); 200 } 201 202 void CharsetDetector::setText(const char *in, int32_t len) 203 { 204 textIn->setText(in, len); 205 fFreshTextSet = TRUE; 206 } 207 208 UBool CharsetDetector::setStripTagsFlag(UBool flag) 209 { 210 UBool temp = fStripTags; 211 fStripTags = flag; 212 fFreshTextSet = TRUE; 213 return temp; 214 } 215 216 UBool CharsetDetector::getStripTagsFlag() const 217 { 218 return fStripTags; 219 } 220 221 void CharsetDetector::setDeclaredEncoding(const char *encoding, int32_t len) const 222 { 223 textIn->setDeclaredEncoding(encoding,len); 224 } 225 226 int32_t CharsetDetector::getDetectableCount() 227 { 228 UErrorCode status = U_ZERO_ERROR; 229 230 setRecognizers(status); 231 232 return fCSRecognizers_size; 233 } 234 235 const CharsetMatch *CharsetDetector::detect(UErrorCode &status) 236 { 237 int32_t maxMatchesFound = 0; 238 239 detectAll(maxMatchesFound, status); 240 241 if(maxMatchesFound > 0) { 242 return resultArray[0]; 243 } else { 244 return NULL; 245 } 246 } 247 248 const CharsetMatch * const *CharsetDetector::detectAll(int32_t &maxMatchesFound, UErrorCode &status) 249 { 250 if(!textIn->isSet()) { 251 status = U_MISSING_RESOURCE_ERROR;// TODO: Need to set proper status code for input text not set 252 253 return NULL; 254 } else if (fFreshTextSet) { 255 CharsetRecognizer *csr; 256 int32_t i; 257 258 textIn->MungeInput(fStripTags); 259 260 // Iterate over all possible charsets, remember all that 261 // give a match quality > 0. 262 resultCount = 0; 263 for (i = 0; i < fCSRecognizers_size; i += 1) { 264 csr = fCSRecognizers[i]; 265 if (csr->match(textIn, resultArray[resultCount])) { 266 resultCount++; 267 } 268 } 269 270 if (resultCount > 1) { 271 uprv_sortArray(resultArray, resultCount, sizeof resultArray[0], charsetMatchComparator, NULL, TRUE, &status); 272 } 273 fFreshTextSet = FALSE; 274 } 275 276 maxMatchesFound = resultCount; 277 278 return resultArray; 279 } 280 281 /*const char *CharsetDetector::getCharsetName(int32_t index, UErrorCode &status) const 282 { 283 if( index > fCSRecognizers_size-1 || index < 0) { 284 status = U_INDEX_OUTOFBOUNDS_ERROR; 285 286 return 0; 287 } else { 288 return fCSRecognizers[index]->getName(); 289 } 290 }*/ 291 292 U_NAMESPACE_END 293 294 U_CDECL_BEGIN 295 typedef struct { 296 int32_t currIndex; 297 } Context; 298 299 300 301 static void U_CALLCONV 302 enumClose(UEnumeration *en) { 303 if(en->context != NULL) { 304 DELETE_ARRAY(en->context); 305 } 306 307 DELETE_ARRAY(en); 308 } 309 310 static int32_t U_CALLCONV 311 enumCount(UEnumeration *, UErrorCode *) { 312 return fCSRecognizers_size; 313 } 314 315 static const char* U_CALLCONV 316 enumNext(UEnumeration *en, int32_t *resultLength, UErrorCode * /*status*/) { 317 if(((Context *)en->context)->currIndex >= fCSRecognizers_size) { 318 if(resultLength != NULL) { 319 *resultLength = 0; 320 } 321 return NULL; 322 } 323 const char *currName = fCSRecognizers[((Context *)en->context)->currIndex]->getName(); 324 if(resultLength != NULL) { 325 *resultLength = (int32_t)uprv_strlen(currName); 326 } 327 ((Context *)en->context)->currIndex++; 328 329 return currName; 330 } 331 332 static void U_CALLCONV 333 enumReset(UEnumeration *en, UErrorCode *) { 334 ((Context *)en->context)->currIndex = 0; 335 } 336 337 static const UEnumeration gCSDetEnumeration = { 338 NULL, 339 NULL, 340 enumClose, 341 enumCount, 342 uenum_unextDefault, 343 enumNext, 344 enumReset 345 }; 346 347 U_CAPI UEnumeration * U_EXPORT2 348 ucsdet_getAllDetectableCharsets(const UCharsetDetector * /*ucsd*/, UErrorCode *status) 349 { 350 U_NAMESPACE_USE 351 352 if(U_FAILURE(*status)) { 353 return 0; 354 } 355 356 /* Initialize recognized charsets. */ 357 CharsetDetector::getDetectableCount(); 358 359 UEnumeration *en = NEW_ARRAY(UEnumeration, 1); 360 memcpy(en, &gCSDetEnumeration, sizeof(UEnumeration)); 361 en->context = (void*)NEW_ARRAY(Context, 1); 362 uprv_memset(en->context, 0, sizeof(Context)); 363 return en; 364 } 365 U_CDECL_END 366 367 #endif 368 369