1 /* 2 ********************************************************************** 3 * Copyright (C) 2005-2013, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ********************************************************************** 6 * file name: ucsdet.h 7 * encoding: US-ASCII 8 * indentation:4 9 * 10 * created on: 2005Aug04 11 * created by: Andy Heninger 12 * 13 * ICU Character Set Detection, API for C 14 * 15 * Draft version 18 Oct 2005 16 * 17 */ 18 19 #ifndef __UCSDET_H 20 #define __UCSDET_H 21 22 #include "unicode/utypes.h" 23 24 #if !UCONFIG_NO_CONVERSION 25 26 #include "unicode/localpointer.h" 27 #include "unicode/uenum.h" 28 29 /** 30 * \file 31 * \brief C API: Charset Detection API 32 * 33 * This API provides a facility for detecting the 34 * charset or encoding of character data in an unknown text format. 35 * The input data can be from an array of bytes. 36 * <p> 37 * Character set detection is at best an imprecise operation. The detection 38 * process will attempt to identify the charset that best matches the characteristics 39 * of the byte data, but the process is partly statistical in nature, and 40 * the results cannot be guaranteed to always be correct. 41 * <p> 42 * For best accuracy in charset detection, the input data should be primarily 43 * in a single language, and a minimum of a few hundred bytes worth of plain text 44 * in the language are needed. The detection process will attempt to 45 * ignore HTML or XML style markup that could otherwise obscure the content. 46 */ 47 48 49 struct UCharsetDetector; 50 /** 51 * Opaque structure representing a charset detector. 52 * @stable ICU 3.6 53 */ 54 typedef struct UCharsetDetector UCharsetDetector; 55 56 struct UCharsetMatch; 57 /** 58 * Opaque structure representing a match that was identified 59 * from a charset detection operation. 60 * @stable ICU 3.6 61 */ 62 typedef struct UCharsetMatch UCharsetMatch; 63 64 /** 65 * Open a charset detector. 
66 * 67 * @param status Any error conditions occurring during the open 68 * operation are reported back in this variable. 69 * @return the newly opened charset detector. 70 * @stable ICU 3.6 71 */ 72 U_STABLE UCharsetDetector * U_EXPORT2 73 ucsdet_open(UErrorCode *status); 74 75 /** 76 * Close a charset detector. All storage and any other resources 77 * owned by this charset detector will be released. Failure to 78 * close a charset detector when finished with it can result in 79 * memory leaks in the application. 80 * 81 * @param ucsd The charset detector to be closed. 82 * @stable ICU 3.6 83 */ 84 U_STABLE void U_EXPORT2 85 ucsdet_close(UCharsetDetector *ucsd); 86 87 #if U_SHOW_CPLUSPLUS_API 88 89 U_NAMESPACE_BEGIN 90 91 /** 92 * \class LocalUCharsetDetectorPointer 93 * "Smart pointer" class, closes a UCharsetDetector via ucsdet_close(). 94 * For most methods see the LocalPointerBase base class. 95 * 96 * @see LocalPointerBase 97 * @see LocalPointer 98 * @stable ICU 4.4 99 */ 100 U_DEFINE_LOCAL_OPEN_POINTER(LocalUCharsetDetectorPointer, UCharsetDetector, ucsdet_close); 101 102 U_NAMESPACE_END 103 104 #endif /* U_SHOW_CPLUSPLUS_API */ 105 106 /** 107 * Set the input byte data whose charset is to be detected. 108 * 109 * Ownership of the input text byte array remains with the caller. 110 * The input string must not be altered or deleted until the charset 111 * detector is either closed or reset to refer to different input text. 112 * 113 * @param ucsd the charset detector to be used. 114 * @param textIn the input text of unknown encoding. 115 * @param len the length of the input text, or -1 if the text 116 * is NUL terminated. 117 * @param status any error conditions are reported back in this variable. 118 * 119 * @stable ICU 3.6 120 */ 121 U_STABLE void U_EXPORT2 122 ucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status); 123 124 125 /** Set the declared encoding for charset detection. 
126 * The declared encoding of an input text is an encoding obtained 127 * by the user from an HTTP header or XML declaration or similar source that 128 * can be provided as an additional hint to the charset detector. 129 * 130 * How and whether the declared encoding will be used during the 131 * detection process is TBD. 132 * 133 * @param ucsd the charset detector to be used. 134 * @param encoding an encoding for the current data obtained from 135 * a header or declaration or other source outside 136 * of the byte data itself. 137 * @param length the length of the encoding name, or -1 if the name string 138 * is NUL terminated. 139 * @param status any error conditions are reported back in this variable. 140 * 141 * @stable ICU 3.6 142 */ 143 U_STABLE void U_EXPORT2 144 ucsdet_setDeclaredEncoding(UCharsetDetector *ucsd, const char *encoding, int32_t length, UErrorCode *status); 145 146 147 /** 148 * Return the charset that best matches the supplied input data. 149 * 150 * Note though, that because the detection 151 * only looks at the start of the input data, 152 * there is a possibility that the returned charset will fail to handle 153 * the full set of input data. 154 * <p> 155 * The returned UCharsetMatch object is owned by the UCharsetDetector. 156 * It will remain valid until the detector input is reset, or until 157 * the detector is closed. 158 * <p> 159 * The function will fail if 160 * <ul> 161 * <li>no charset appears to match the data.</li> 162 * <li>no input text has been provided.</li> 163 * </ul> 164 * 165 * @param ucsd the charset detector to be used. 166 * @param status any error conditions are reported back in this variable. 167 * @return a UCharsetMatch representing the best matching charset, 168 * or NULL if no charset matches the byte data. 
169 * 170 * @stable ICU 3.6 171 */ 172 U_STABLE const UCharsetMatch * U_EXPORT2 173 ucsdet_detect(UCharsetDetector *ucsd, UErrorCode *status); 174 175 176 /** 177 * Find all charset matches that appear to be consistent with the input, 178 * returning an array of results. The results are ordered with the 179 * best quality match first. 180 * 181 * Because the detection only looks at a limited amount of the 182 * input byte data, some of the returned charsets may fail to handle 183 * all of the input data. 184 * <p> 185 * The returned UCharsetMatch objects are owned by the UCharsetDetector. 186 * They will remain valid until the detector is closed or modified. 187 * 188 * <p> 189 * Returns an error if 190 * <ul> 191 * <li>no charsets appear to match the input data.</li> 192 * <li>no input text has been provided.</li> 193 * </ul> 194 * 195 * @param ucsd the charset detector to be used. 196 * @param matchesFound pointer to a variable that will be set to the 197 * number of charsets identified that are consistent with 198 * the input data. Output only. 199 * @param status any error conditions are reported back in this variable. 200 * @return A pointer to an array of pointers to UCharsetMatch objects. 201 * This array, and the UCharsetMatch instances to which it refers, 202 * are owned by the UCharsetDetector, and will remain valid until 203 * the detector is closed or modified. 204 * @stable ICU 3.6 205 */ 206 U_STABLE const UCharsetMatch ** U_EXPORT2 207 ucsdet_detectAll(UCharsetDetector *ucsd, int32_t *matchesFound, UErrorCode *status); 208 209 210 211 /** 212 * Get the name of the charset represented by a UCharsetMatch. 213 * 214 * The storage for the returned name string is owned by the 215 * UCharsetMatch, and will remain valid while the UCharsetMatch 216 * is valid. 217 * 218 * The name returned is suitable for use with the ICU conversion APIs. 219 * 220 * @param ucsm The charset match object. 
221 * @param status Any error conditions are reported back in this variable. 222 * @return The name of the matching charset. 223 * 224 * @stable ICU 3.6 225 */ 226 U_STABLE const char * U_EXPORT2 227 ucsdet_getName(const UCharsetMatch *ucsm, UErrorCode *status); 228 229 /** 230 * Get a confidence number for the quality of the match of the byte 231 * data with the charset. Confidence numbers range from zero to 100, 232 * with 100 representing complete confidence and zero representing 233 * no confidence. 234 * 235 * The confidence values are somewhat arbitrary. They define 236 * an ordering within the results for any single detection operation 237 * but are not generally comparable between the results for different input. 238 * 239 * A confidence value of ten does have a general meaning - it is used 240 * for charsets that can represent the input data, but for which there 241 * is no other indication that suggests that the charset is the correct one. 242 * Pure 7-bit ASCII data, for example, is compatible with a 243 * great many charsets, most of which will appear as possible matches 244 * with a confidence of 10. 245 * 246 * @param ucsm The charset match object. 247 * @param status Any error conditions are reported back in this variable. 248 * @return A confidence number for the charset match. 249 * 250 * @stable ICU 3.6 251 */ 252 U_STABLE int32_t U_EXPORT2 253 ucsdet_getConfidence(const UCharsetMatch *ucsm, UErrorCode *status); 254 255 /** 256 * Get the RFC 3066 code for the language of the input data. 257 * 258 * The Charset Detection service is intended primarily for detecting 259 * charsets, not language. For some, but not all, charsets, a language is 260 * identified as a byproduct of the detection process, and that is what 261 * is returned by this function. 262 * 263 * CAUTION: 264 * 1. Language information is not available for input data encoded in 265 * all charsets. In particular, no language is identified 266 * for UTF-8 input data. 267 * 268 * 2. 
Closely related languages may sometimes be confused. 269 * 270 * If more accurate language detection is required, a linguistic 271 * analysis package should be used. 272 * 273 * The storage for the returned language string is owned by the 274 * UCharsetMatch, and will remain valid while the UCharsetMatch 275 * is valid. 276 * 277 * @param ucsm The charset match object. 278 * @param status Any error conditions are reported back in this variable. 279 * @return The RFC 3066 code for the language of the input data, or 280 * an empty string if the language could not be determined. 281 * 282 * @stable ICU 3.6 283 */ 284 U_STABLE const char * U_EXPORT2 285 ucsdet_getLanguage(const UCharsetMatch *ucsm, UErrorCode *status); 286 287 288 /** 289 * Get the entire input text as a UChar string, placing it into 290 * a caller-supplied buffer. A terminating 291 * NUL character will be appended to the buffer if space is available. 292 * 293 * The number of UChars in the output string, not including the terminating 294 * NUL, is returned. 295 * 296 * If the supplied buffer is smaller than required to hold the output, 297 * the contents of the buffer are undefined. The full output string length 298 * (in UChars) is returned as always, and can be used to allocate a buffer 299 * of the correct size. 300 * 301 * 302 * @param ucsm The charset match object. 303 * @param buf A UChar buffer to be filled with the converted text data. 304 * @param cap The capacity of the buffer in UChars. 305 * @param status Any error conditions are reported back in this variable. 306 * @return The number of UChars in the output string, not including the terminating NUL. 307 * 308 * @stable ICU 3.6 309 */ 310 U_STABLE int32_t U_EXPORT2 311 ucsdet_getUChars(const UCharsetMatch *ucsm, 312 UChar *buf, int32_t cap, UErrorCode *status); 313 314 315 316 /** 317 * Get an iterator over the set of all detectable charsets - 318 * over the charsets that are known to the charset detection 319 * service. 
320 * 321 * The returned UEnumeration provides access to the names of 322 * the charsets. 323 * 324 * <p> 325 * The state of the Charset detector that is passed in does not 326 * affect the result of this function, but requiring a valid, open 327 * charset detector as a parameter ensures that the charset detection 328 * service has been safely initialized and that the required detection 329 * data is available. 330 * 331 * <p> 332 * <b>Note:</b> Multiple different charset encodings in the same family may use 333 * a single shared name in this implementation. For example, this function returns 334 * an enumeration including "ISO-8859-1" (ISO Latin 1), but not including "windows-1252" 335 * (Windows Latin 1). However, the actual detection result could be "windows-1252" 336 * when the input data matches Latin 1 code points with any points only available 337 * in "windows-1252". 338 * 339 * @param ucsd a Charset detector. 340 * @param status Any error conditions are reported back in this variable. 341 * @return an iterator providing access to the detectable charset names. 342 * @stable ICU 3.6 343 */ 344 U_STABLE UEnumeration * U_EXPORT2 345 ucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status); 346 347 /** 348 * Test whether input filtering is enabled for this charset detector. 349 * Input filtering removes text that appears to be HTML or XML 350 * markup from the input before applying the code page detection 351 * heuristics. 352 * 353 * @param ucsd The charset detector to check. 354 * @return TRUE if filtering is enabled. 355 * @stable ICU 3.6 356 */ 357 358 U_STABLE UBool U_EXPORT2 359 ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd); 360 361 362 /** 363 * Enable filtering of input text. If filtering is enabled, 364 * text within angle brackets ("<" and ">") will be removed 365 * before detection, which will remove most HTML or XML markup. 366 * 367 * @param ucsd the charset detector to be modified. 
368 * @param filter <code>TRUE</code> to enable input text filtering. 369 * @return The previous setting. 370 * 371 * @stable ICU 3.6 372 */ 373 U_STABLE UBool U_EXPORT2 374 ucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter); 375 376 #ifndef U_HIDE_INTERNAL_API 377 /** 378 * Get an iterator over the set of detectable charsets - 379 * over the charsets that are enabled by the specified charset detector. 380 * 381 * The returned UEnumeration provides access to the names of 382 * the charsets. 383 * 384 * @param ucsd a Charset detector. 385 * @param status Any error conditions are reported back in this variable. 386 * @return an iterator providing access to the detectable charset names by 387 * the specified charset detector. 388 * @internal 389 */ 390 U_INTERNAL UEnumeration * U_EXPORT2 391 ucsdet_getDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status); 392 393 /** 394 * Enable or disable an individual charset encoding. 395 * The name of the charset encoding must be included in the names returned by 396 * ucsdet_getAllDetectableCharsets(). 397 * 398 * @param ucsd a Charset detector. 399 * @param encoding the name of the charset encoding. 400 * @param enabled <code>TRUE</code> to enable, or <code>FALSE</code> to disable the 401 * charset encoding. 402 * @param status receives the return status. When the name of charset encoding 403 * is not supported, U_ILLEGAL_ARGUMENT_ERROR is set. 404 * @internal 405 */ 406 U_INTERNAL void U_EXPORT2 407 ucsdet_setDetectableCharset(UCharsetDetector *ucsd, const char *encoding, UBool enabled, UErrorCode *status); 408 #endif /* U_HIDE_INTERNAL_API */ 409 410 #endif /* !UCONFIG_NO_CONVERSION */ 411 #endif /* __UCSDET_H */ 412 413 414