1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (C) 2005-2013, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ********************************************************************** 8 * file name: ucsdet.h 9 * encoding: UTF-8 10 * indentation:4 11 * 12 * created on: 2005Aug04 13 * created by: Andy Heninger 14 * 15 * ICU Character Set Detection, API for C 16 * 17 * Draft version 18 Oct 2005 18 * 19 */ 20 21 #ifndef __UCSDET_H 22 #define __UCSDET_H 23 24 #include "unicode/utypes.h" 25 26 #if !UCONFIG_NO_CONVERSION 27 28 #include "unicode/localpointer.h" 29 #include "unicode/uenum.h" 30 31 /** 32 * \file 33 * \brief C API: Charset Detection API 34 * 35 * This API provides a facility for detecting the 36 * charset or encoding of character data in an unknown text format. 37 * The input data can be from an array of bytes. 38 * <p> 39 * Character set detection is at best an imprecise operation. The detection 40 * process will attempt to identify the charset that best matches the characteristics 41 * of the byte data, but the process is partly statistical in nature, and 42 * the results can not be guaranteed to always be correct. 43 * <p> 44 * For best accuracy in charset detection, the input data should be primarily 45 * in a single language, and a minimum of a few hundred bytes worth of plain text 46 * in the language are needed. The detection process will attempt to 47 * ignore html or xml style markup that could otherwise obscure the content. 48 * <p> 49 * An alternative to the ICU Charset Detector is the 50 * Compact Encoding Detector, https://github.com/google/compact_enc_det. 51 * It often gives more accurate results, especially with short input samples. 52 */ 53 54 55 struct UCharsetDetector; 56 /** 57 * Structure representing a charset detector 58 * @stable ICU 3.6 59 */ 60 typedef struct UCharsetDetector UCharsetDetector; 61 62 struct UCharsetMatch; 63 /** 64 * Opaque structure representing a match that was identified 65 * from a charset detection operation. 66 * @stable ICU 3.6 67 */ 68 typedef struct UCharsetMatch UCharsetMatch; 69 70 /** 71 * Open a charset detector. 72 * 73 * @param status Any error conditions occurring during the open 74 * operation are reported back in this variable. 75 * @return the newly opened charset detector. 76 * @stable ICU 3.6 77 */ 78 U_STABLE UCharsetDetector * U_EXPORT2 79 ucsdet_open(UErrorCode *status); 80 81 /** 82 * Close a charset detector. All storage and any other resources 83 * owned by this charset detector will be released. Failure to 84 * close a charset detector when finished with it can result in 85 * memory leaks in the application. 86 * 87 * @param ucsd The charset detector to be closed. 88 * @stable ICU 3.6 89 */ 90 U_STABLE void U_EXPORT2 91 ucsdet_close(UCharsetDetector *ucsd); 92 93 #if U_SHOW_CPLUSPLUS_API 94 95 U_NAMESPACE_BEGIN 96 97 /** 98 * \class LocalUCharsetDetectorPointer 99 * "Smart pointer" class, closes a UCharsetDetector via ucsdet_close(). 100 * For most methods see the LocalPointerBase base class. 101 * 102 * @see LocalPointerBase 103 * @see LocalPointer 104 * @stable ICU 4.4 105 */ 106 U_DEFINE_LOCAL_OPEN_POINTER(LocalUCharsetDetectorPointer, UCharsetDetector, ucsdet_close); 107 108 U_NAMESPACE_END 109 110 #endif 111 112 /** 113 * Set the input byte data whose charset is to detected. 114 * 115 * Ownership of the input text byte array remains with the caller. 116 * The input string must not be altered or deleted until the charset 117 * detector is either closed or reset to refer to different input text. 118 * 119 * @param ucsd the charset detector to be used. 120 * @param textIn the input text of unknown encoding. . 121 * @param len the length of the input text, or -1 if the text 122 * is NUL terminated. 123 * @param status any error conditions are reported back in this variable. 124 * 125 * @stable ICU 3.6 126 */ 127 U_STABLE void U_EXPORT2 128 ucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status); 129 130 131 /** Set the declared encoding for charset detection. 132 * The declared encoding of an input text is an encoding obtained 133 * by the user from an http header or xml declaration or similar source that 134 * can be provided as an additional hint to the charset detector. 135 * 136 * How and whether the declared encoding will be used during the 137 * detection process is TBD. 138 * 139 * @param ucsd the charset detector to be used. 140 * @param encoding an encoding for the current data obtained from 141 * a header or declaration or other source outside 142 * of the byte data itself. 143 * @param length the length of the encoding name, or -1 if the name string 144 * is NUL terminated. 145 * @param status any error conditions are reported back in this variable. 146 * 147 * @stable ICU 3.6 148 */ 149 U_STABLE void U_EXPORT2 150 ucsdet_setDeclaredEncoding(UCharsetDetector *ucsd, const char *encoding, int32_t length, UErrorCode *status); 151 152 153 /** 154 * Return the charset that best matches the supplied input data. 155 * 156 * Note though, that because the detection 157 * only looks at the start of the input data, 158 * there is a possibility that the returned charset will fail to handle 159 * the full set of input data. 160 * <p> 161 * The returned UCharsetMatch object is owned by the UCharsetDetector. 162 * It will remain valid until the detector input is reset, or until 163 * the detector is closed. 164 * <p> 165 * The function will fail if 166 * <ul> 167 * <li>no charset appears to match the data.</li> 168 * <li>no input text has been provided</li> 169 * </ul> 170 * 171 * @param ucsd the charset detector to be used. 172 * @param status any error conditions are reported back in this variable. 173 * @return a UCharsetMatch representing the best matching charset, 174 * or NULL if no charset matches the byte data. 175 * 176 * @stable ICU 3.6 177 */ 178 U_STABLE const UCharsetMatch * U_EXPORT2 179 ucsdet_detect(UCharsetDetector *ucsd, UErrorCode *status); 180 181 182 /** 183 * Find all charset matches that appear to be consistent with the input, 184 * returning an array of results. The results are ordered with the 185 * best quality match first. 186 * 187 * Because the detection only looks at a limited amount of the 188 * input byte data, some of the returned charsets may fail to handle 189 * the all of input data. 190 * <p> 191 * The returned UCharsetMatch objects are owned by the UCharsetDetector. 192 * They will remain valid until the detector is closed or modified 193 * 194 * <p> 195 * Return an error if 196 * <ul> 197 * <li>no charsets appear to match the input data.</li> 198 * <li>no input text has been provided</li> 199 * </ul> 200 * 201 * @param ucsd the charset detector to be used. 202 * @param matchesFound pointer to a variable that will be set to the 203 * number of charsets identified that are consistent with 204 * the input data. Output only. 205 * @param status any error conditions are reported back in this variable. 206 * @return A pointer to an array of pointers to UCharSetMatch objects. 207 * This array, and the UCharSetMatch instances to which it refers, 208 * are owned by the UCharsetDetector, and will remain valid until 209 * the detector is closed or modified. 210 * @stable ICU 3.6 211 */ 212 U_STABLE const UCharsetMatch ** U_EXPORT2 213 ucsdet_detectAll(UCharsetDetector *ucsd, int32_t *matchesFound, UErrorCode *status); 214 215 216 217 /** 218 * Get the name of the charset represented by a UCharsetMatch. 219 * 220 * The storage for the returned name string is owned by the 221 * UCharsetMatch, and will remain valid while the UCharsetMatch 222 * is valid. 223 * 224 * The name returned is suitable for use with the ICU conversion APIs. 225 * 226 * @param ucsm The charset match object. 227 * @param status Any error conditions are reported back in this variable. 228 * @return The name of the matching charset. 229 * 230 * @stable ICU 3.6 231 */ 232 U_STABLE const char * U_EXPORT2 233 ucsdet_getName(const UCharsetMatch *ucsm, UErrorCode *status); 234 235 /** 236 * Get a confidence number for the quality of the match of the byte 237 * data with the charset. Confidence numbers range from zero to 100, 238 * with 100 representing complete confidence and zero representing 239 * no confidence. 240 * 241 * The confidence values are somewhat arbitrary. They define an 242 * an ordering within the results for any single detection operation 243 * but are not generally comparable between the results for different input. 244 * 245 * A confidence value of ten does have a general meaning - it is used 246 * for charsets that can represent the input data, but for which there 247 * is no other indication that suggests that the charset is the correct one. 248 * Pure 7 bit ASCII data, for example, is compatible with a 249 * great many charsets, most of which will appear as possible matches 250 * with a confidence of 10. 251 * 252 * @param ucsm The charset match object. 253 * @param status Any error conditions are reported back in this variable. 254 * @return A confidence number for the charset match. 255 * 256 * @stable ICU 3.6 257 */ 258 U_STABLE int32_t U_EXPORT2 259 ucsdet_getConfidence(const UCharsetMatch *ucsm, UErrorCode *status); 260 261 /** 262 * Get the RFC 3066 code for the language of the input data. 263 * 264 * The Charset Detection service is intended primarily for detecting 265 * charsets, not language. For some, but not all, charsets, a language is 266 * identified as a byproduct of the detection process, and that is what 267 * is returned by this function. 268 * 269 * CAUTION: 270 * 1. Language information is not available for input data encoded in 271 * all charsets. In particular, no language is identified 272 * for UTF-8 input data. 273 * 274 * 2. Closely related languages may sometimes be confused. 275 * 276 * If more accurate language detection is required, a linguistic 277 * analysis package should be used. 278 * 279 * The storage for the returned name string is owned by the 280 * UCharsetMatch, and will remain valid while the UCharsetMatch 281 * is valid. 282 * 283 * @param ucsm The charset match object. 284 * @param status Any error conditions are reported back in this variable. 285 * @return The RFC 3066 code for the language of the input data, or 286 * an empty string if the language could not be determined. 287 * 288 * @stable ICU 3.6 289 */ 290 U_STABLE const char * U_EXPORT2 291 ucsdet_getLanguage(const UCharsetMatch *ucsm, UErrorCode *status); 292 293 294 /** 295 * Get the entire input text as a UChar string, placing it into 296 * a caller-supplied buffer. A terminating 297 * NUL character will be appended to the buffer if space is available. 298 * 299 * The number of UChars in the output string, not including the terminating 300 * NUL, is returned. 301 * 302 * If the supplied buffer is smaller than required to hold the output, 303 * the contents of the buffer are undefined. The full output string length 304 * (in UChars) is returned as always, and can be used to allocate a buffer 305 * of the correct size. 306 * 307 * 308 * @param ucsm The charset match object. 309 * @param buf A UChar buffer to be filled with the converted text data. 310 * @param cap The capacity of the buffer in UChars. 311 * @param status Any error conditions are reported back in this variable. 312 * @return The number of UChars in the output string. 313 * 314 * @stable ICU 3.6 315 */ 316 U_STABLE int32_t U_EXPORT2 317 ucsdet_getUChars(const UCharsetMatch *ucsm, 318 UChar *buf, int32_t cap, UErrorCode *status); 319 320 321 322 /** 323 * Get an iterator over the set of all detectable charsets - 324 * over the charsets that are known to the charset detection 325 * service. 326 * 327 * The returned UEnumeration provides access to the names of 328 * the charsets. 329 * 330 * <p> 331 * The state of the Charset detector that is passed in does not 332 * affect the result of this function, but requiring a valid, open 333 * charset detector as a parameter insures that the charset detection 334 * service has been safely initialized and that the required detection 335 * data is available. 336 * 337 * <p> 338 * <b>Note:</b> Multiple different charset encodings in a same family may use 339 * a single shared name in this implementation. For example, this method returns 340 * an array including "ISO-8859-1" (ISO Latin 1), but not including "windows-1252" 341 * (Windows Latin 1). However, actual detection result could be "windows-1252" 342 * when the input data matches Latin 1 code points with any points only available 343 * in "windows-1252". 344 * 345 * @param ucsd a Charset detector. 346 * @param status Any error conditions are reported back in this variable. 347 * @return an iterator providing access to the detectable charset names. 348 * @stable ICU 3.6 349 */ 350 U_STABLE UEnumeration * U_EXPORT2 351 ucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status); 352 353 /** 354 * Test whether input filtering is enabled for this charset detector. 355 * Input filtering removes text that appears to be HTML or xml 356 * markup from the input before applying the code page detection 357 * heuristics. 358 * 359 * @param ucsd The charset detector to check. 360 * @return TRUE if filtering is enabled. 361 * @stable ICU 3.6 362 */ 363 364 U_STABLE UBool U_EXPORT2 365 ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd); 366 367 368 /** 369 * Enable filtering of input text. If filtering is enabled, 370 * text within angle brackets ("<" and ">") will be removed 371 * before detection, which will remove most HTML or xml markup. 372 * 373 * @param ucsd the charset detector to be modified. 374 * @param filter <code>true</code> to enable input text filtering. 375 * @return The previous setting. 376 * 377 * @stable ICU 3.6 378 */ 379 U_STABLE UBool U_EXPORT2 380 ucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter); 381 382 #ifndef U_HIDE_INTERNAL_API 383 /** 384 * Get an iterator over the set of detectable charsets - 385 * over the charsets that are enabled by the specified charset detector. 386 * 387 * The returned UEnumeration provides access to the names of 388 * the charsets. 389 * 390 * @param ucsd a Charset detector. 391 * @param status Any error conditions are reported back in this variable. 392 * @return an iterator providing access to the detectable charset names by 393 * the specified charset detector. 394 * @internal 395 */ 396 U_INTERNAL UEnumeration * U_EXPORT2 397 ucsdet_getDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status); 398 399 /** 400 * Enable or disable individual charset encoding. 401 * A name of charset encoding must be included in the names returned by 402 * {@link #ucsdet_getAllDetectableCharsets()}. 403 * 404 * @param ucsd a Charset detector. 405 * @param encoding encoding the name of charset encoding. 406 * @param enabled <code>TRUE</code> to enable, or <code>FALSE</code> to disable the 407 * charset encoding. 408 * @param status receives the return status. When the name of charset encoding 409 * is not supported, U_ILLEGAL_ARGUMENT_ERROR is set. 410 * @internal 411 */ 412 U_INTERNAL void U_EXPORT2 413 ucsdet_setDetectableCharset(UCharsetDetector *ucsd, const char *encoding, UBool enabled, UErrorCode *status); 414 #endif /* U_HIDE_INTERNAL_API */ 415 416 #endif 417 #endif /* __UCSDET_H */ 418 419 420