Home | History | Annotate | Download | only in unicode
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4  **********************************************************************
      5  *   Copyright (C) 2005-2013, International Business Machines
      6  *   Corporation and others.  All Rights Reserved.
      7  **********************************************************************
      8  *   file name:  ucsdet.h
      9  *   encoding:   UTF-8
     10  *   indentation:4
     11  *
     12  *   created on: 2005Aug04
     13  *   created by: Andy Heninger
     14  *
     15  *   ICU Character Set Detection, API for C
     16  *
     17  *   Draft version 18 Oct 2005
     18  *
     19  */
     20 
     21 #ifndef __UCSDET_H
     22 #define __UCSDET_H
     23 
     24 #include "unicode/utypes.h"
     25 
     26 #if !UCONFIG_NO_CONVERSION
     27 
     28 #include "unicode/localpointer.h"
     29 #include "unicode/uenum.h"
     30 
     31 /**
     32  * \file
     33  * \brief C API: Charset Detection API
     34  *
     35  * This API provides a facility for detecting the
     36  * charset or encoding of character data in an unknown text format.
     37  * The input data can be from an array of bytes.
     38  * <p>
     39  * Character set detection is at best an imprecise operation.  The detection
     40  * process will attempt to identify the charset that best matches the characteristics
     41  * of the byte data, but the process is partly statistical in nature, and
     42  * the results can not be guaranteed to always be correct.
     43  * <p>
     44  * For best accuracy in charset detection, the input data should be primarily
     45  * in a single language, and a minimum of a few hundred bytes worth of plain text
     46  * in the language are needed.  The detection process will attempt to
     47  * ignore html or xml style markup that could otherwise obscure the content.
     48  * <p>
     49  * An alternative to the ICU Charset Detector is the
     50  * Compact Encoding Detector, https://github.com/google/compact_enc_det.
     51  * It often gives more accurate results, especially with short input samples.
     52  */
     53 
     54 
     55 struct UCharsetDetector;
     56 /**
     57   * Structure representing a charset detector
     58   * @stable ICU 3.6
     59   */
     60 typedef struct UCharsetDetector UCharsetDetector;
     61 
     62 struct UCharsetMatch;
     63 /**
     64   *  Opaque structure representing a match that was identified
     65   *  from a charset detection operation.
     66   *  @stable ICU 3.6
     67   */
     68 typedef struct UCharsetMatch UCharsetMatch;
     69 
     70 /**
     71   *  Open a charset detector.
     72   *
     73   *  @param status Any error conditions occurring during the open
     74   *                operation are reported back in this variable.
     75   *  @return the newly opened charset detector.
     76   *  @stable ICU 3.6
     77   */
     78 U_STABLE UCharsetDetector * U_EXPORT2
     79 ucsdet_open(UErrorCode   *status);
     80 
     81 /**
     82   * Close a charset detector.  All storage and any other resources
     83   *   owned by this charset detector will be released.  Failure to
     84   *   close a charset detector when finished with it can result in
     85   *   memory leaks in the application.
     86   *
     87   *  @param ucsd  The charset detector to be closed.
     88   *  @stable ICU 3.6
     89   */
     90 U_STABLE void U_EXPORT2
     91 ucsdet_close(UCharsetDetector *ucsd);
     92 
     93 #if U_SHOW_CPLUSPLUS_API
     94 
     95 U_NAMESPACE_BEGIN
     96 
     97 /**
     98  * \class LocalUCharsetDetectorPointer
     99  * "Smart pointer" class, closes a UCharsetDetector via ucsdet_close().
    100  * For most methods see the LocalPointerBase base class.
    101  *
    102  * @see LocalPointerBase
    103  * @see LocalPointer
    104  * @stable ICU 4.4
    105  */
    106 U_DEFINE_LOCAL_OPEN_POINTER(LocalUCharsetDetectorPointer, UCharsetDetector, ucsdet_close);
    107 
    108 U_NAMESPACE_END
    109 
    110 #endif
    111 
    112 /**
    113   * Set the input byte data whose charset is to be detected.
    114   *
    115   * Ownership of the input  text byte array remains with the caller.
    116   * The input string must not be altered or deleted until the charset
    117   * detector is either closed or reset to refer to different input text.
    118   *
    119   * @param ucsd   the charset detector to be used.
    120   * @param textIn the input text of unknown encoding.
    121   * @param len    the length of the input text, or -1 if the text
    122   *               is NUL terminated.
    123   * @param status any error conditions are reported back in this variable.
    124   *
    125   * @stable ICU 3.6
    126   */
    127 U_STABLE void U_EXPORT2
    128 ucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status);
    129 
    130 
    131 /** Set the declared encoding for charset detection.
    132  *  The declared encoding of an input text is an encoding obtained
    133  *  by the user from an http header or xml declaration or similar source that
    134  *  can be provided as an additional hint to the charset detector.
    135  *
    136  *  How and whether the declared encoding will be used during the
    137  *  detection process is TBD.
    138  *
    139  * @param ucsd      the charset detector to be used.
    140  * @param encoding  an encoding for the current data obtained from
    141  *                  a header or declaration or other source outside
    142  *                  of the byte data itself.
    143  * @param length    the length of the encoding name, or -1 if the name string
    144  *                  is NUL terminated.
    145  * @param status    any error conditions are reported back in this variable.
    146  *
    147  * @stable ICU 3.6
    148  */
    149 U_STABLE void U_EXPORT2
    150 ucsdet_setDeclaredEncoding(UCharsetDetector *ucsd, const char *encoding, int32_t length, UErrorCode *status);
    151 
    152 
    153 /**
    154  * Return the charset that best matches the supplied input data.
    155  *
    156  * Note though, that because the detection
    157  * only looks at the start of the input data,
    158  * there is a possibility that the returned charset will fail to handle
    159  * the full set of input data.
    160  * <p>
    161  * The returned UCharsetMatch object is owned by the UCharsetDetector.
    162  * It will remain valid until the detector input is reset, or until
    163  * the detector is closed.
    164  * <p>
    165  * The function will fail if
    166  *  <ul>
    167  *    <li>no charset appears to match the data.</li>
    168  *    <li>no input text has been provided</li>
    169  *  </ul>
    170  *
    171  * @param ucsd      the charset detector to be used.
    172  * @param status    any error conditions are reported back in this variable.
    173  * @return          a UCharsetMatch  representing the best matching charset,
    174  *                  or NULL if no charset matches the byte data.
    175  *
    176  * @stable ICU 3.6
    177  */
    178 U_STABLE const UCharsetMatch * U_EXPORT2
    179 ucsdet_detect(UCharsetDetector *ucsd, UErrorCode *status);
    180 
    181 
    182 /**
    183  *  Find all charset matches that appear to be consistent with the input,
    184  *  returning an array of results.  The results are ordered with the
    185  *  best quality match first.
    186  *
    187  *  Because the detection only looks at a limited amount of the
    188  *  input byte data, some of the returned charsets may fail to handle
    189  *  all of the input data.
    190  *  <p>
    191  *  The returned UCharsetMatch objects are owned by the UCharsetDetector.
    192  *  They will remain valid until the detector is closed or modified.
    193  *
    194  * <p>
    195  * Return an error if
    196  *  <ul>
    197  *    <li>no charsets appear to match the input data.</li>
    198  *    <li>no input text has been provided</li>
    199  *  </ul>
    200  *
    201  * @param ucsd          the charset detector to be used.
    202  * @param matchesFound  pointer to a variable that will be set to the
    203  *                      number of charsets identified that are consistent with
    204  *                      the input data.  Output only.
    205  * @param status        any error conditions are reported back in this variable.
    206  * @return              A pointer to an array of pointers to UCharsetMatch objects.
    207  *                      This array, and the UCharsetMatch instances to which it refers,
    208  *                      are owned by the UCharsetDetector, and will remain valid until
    209  *                      the detector is closed or modified.
    210  * @stable ICU 3.6
    211  */
    212 U_STABLE const UCharsetMatch ** U_EXPORT2
    213 ucsdet_detectAll(UCharsetDetector *ucsd, int32_t *matchesFound, UErrorCode *status);
    214 
    215 
    216 
    217 /**
    218  *  Get the name of the charset represented by a UCharsetMatch.
    219  *
    220  *  The storage for the returned name string is owned by the
    221  *  UCharsetMatch, and will remain valid while the UCharsetMatch
    222  *  is valid.
    223  *
    224  *  The name returned is suitable for use with the ICU conversion APIs.
    225  *
    226  *  @param ucsm    The charset match object.
    227  *  @param status  Any error conditions are reported back in this variable.
    228  *  @return        The name of the matching charset.
    229  *
    230  *  @stable ICU 3.6
    231  */
    232 U_STABLE const char * U_EXPORT2
    233 ucsdet_getName(const UCharsetMatch *ucsm, UErrorCode *status);
    234 
    235 /**
    236  *  Get a confidence number for the quality of the match of the byte
    237  *  data with the charset.  Confidence numbers range from zero to 100,
    238  *  with 100 representing complete confidence and zero representing
    239  *  no confidence.
    240  *
    241  *  The confidence values are somewhat arbitrary.  They define an
    242  *  ordering within the results for any single detection operation
    243  *  but are not generally comparable between the results for different input.
    244  *
    245  *  A confidence value of ten does have a general meaning - it is used
    246  *  for charsets that can represent the input data, but for which there
    247  *  is no other indication that suggests that the charset is the correct one.
    248  *  Pure 7 bit ASCII data, for example, is compatible with a
    249  *  great many charsets, most of which will appear as possible matches
    250  *  with a confidence of 10.
    251  *
    252  *  @param ucsm    The charset match object.
    253  *  @param status  Any error conditions are reported back in this variable.
    254  *  @return        A confidence number for the charset match.
    255  *
    256  *  @stable ICU 3.6
    257  */
    258 U_STABLE int32_t U_EXPORT2
    259 ucsdet_getConfidence(const UCharsetMatch *ucsm, UErrorCode *status);
    260 
    261 /**
    262  *  Get the RFC 3066 code for the language of the input data.
    263  *
    264  *  The Charset Detection service is intended primarily for detecting
    265  *  charsets, not language.  For some, but not all, charsets, a language is
    266  *  identified as a byproduct of the detection process, and that is what
    267  *  is returned by this function.
    268  *
    269  *  CAUTION:
    270  *    1.  Language information is not available for input data encoded in
    271  *        all charsets. In particular, no language is identified
    272  *        for UTF-8 input data.
    273  *
    274  *    2.  Closely related languages may sometimes be confused.
    275  *
    276  *  If more accurate language detection is required, a linguistic
    277  *  analysis package should be used.
    278  *
    279  *  The storage for the returned name string is owned by the
    280  *  UCharsetMatch, and will remain valid while the UCharsetMatch
    281  *  is valid.
    282  *
    283  *  @param ucsm    The charset match object.
    284  *  @param status  Any error conditions are reported back in this variable.
    285  *  @return        The RFC 3066 code for the language of the input data, or
    286  *                 an empty string if the language could not be determined.
    287  *
    288  *  @stable ICU 3.6
    289  */
    290 U_STABLE const char * U_EXPORT2
    291 ucsdet_getLanguage(const UCharsetMatch *ucsm, UErrorCode *status);
    292 
    293 
    294 /**
    295   *  Get the entire input text as a UChar string, placing it into
    296   *  a caller-supplied buffer.  A terminating
    297   *  NUL character will be appended to the buffer if space is available.
    298   *
    299   *  The number of UChars in the output string, not including the terminating
    300   *  NUL, is returned.
    301   *
    302   *  If the supplied buffer is smaller than required to hold the output,
    303   *  the contents of the buffer are undefined.  The full output string length
    304   *  (in UChars) is returned as always, and can be used to allocate a buffer
    305   *  of the correct size.
    306   *
    307   *
    308   * @param ucsm    The charset match object.
    309   * @param buf     A UChar buffer to be filled with the converted text data.
    310   * @param cap     The capacity of the buffer in UChars.
    311   * @param status  Any error conditions are reported back in this variable.
    312   * @return        The number of UChars in the output string.
    313   *
    314   * @stable ICU 3.6
    315   */
    316 U_STABLE  int32_t U_EXPORT2
    317 ucsdet_getUChars(const UCharsetMatch *ucsm,
    318                  UChar *buf, int32_t cap, UErrorCode *status);
    319 
    320 
    321 
    322 /**
    323   *  Get an iterator over the set of all detectable charsets -
    324   *  over the charsets that are known to the charset detection
    325   *  service.
    326   *
    327   *  The returned UEnumeration provides access to the names of
    328   *  the charsets.
    329   *
    330   *  <p>
    331   *  The state of the Charset detector that is passed in does not
    332   *  affect the result of this function, but requiring a valid, open
    333   *  charset detector as a parameter ensures that the charset detection
    334   *  service has been safely initialized and that the required detection
    335   *  data is available.
    336   *
    337   *  <p>
    338   *  <b>Note:</b> Multiple different charset encodings in the same family may use
    339   *  a single shared name in this implementation. For example, this method returns
    340   *  an array including "ISO-8859-1" (ISO Latin 1), but not including "windows-1252"
    341   *  (Windows Latin 1). However, actual detection result could be "windows-1252"
    342   *  when the input data matches Latin 1 code points with any points only available
    343   *  in "windows-1252".
    344   *
    345   *  @param ucsd a Charset detector.
    346   *  @param status  Any error conditions are reported back in this variable.
    347   *  @return an iterator providing access to the detectable charset names.
    348   *  @stable ICU 3.6
    349   */
    350 U_STABLE  UEnumeration * U_EXPORT2
    351 ucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd,  UErrorCode *status);
    352 
    353 /**
    354   *  Test whether input filtering is enabled for this charset detector.
    355   *  Input filtering removes text that appears to be HTML or xml
    356   *  markup from the input before applying the code page detection
    357   *  heuristics.
    358   *
    359   *  @param ucsd  The charset detector to check.
    360   *  @return TRUE if filtering is enabled.
    361   *  @stable ICU 3.6
    362   */
    363 
    364 U_STABLE  UBool U_EXPORT2
    365 ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd);
    366 
    367 
    368 /**
    369  * Enable filtering of input text. If filtering is enabled,
    370  * text within angle brackets ("<" and ">") will be removed
    371  * before detection, which will remove most HTML or xml markup.
    372  *
    373  * @param ucsd   the charset detector to be modified.
    374  * @param filter <code>true</code> to enable input text filtering.
    375  * @return The previous setting.
    376  *
    377  * @stable ICU 3.6
    378  */
    379 U_STABLE  UBool U_EXPORT2
    380 ucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter);
    381 
    382 #ifndef U_HIDE_INTERNAL_API
    383 /**
    384   *  Get an iterator over the set of detectable charsets -
    385   *  over the charsets that are enabled by the specified charset detector.
    386   *
    387   *  The returned UEnumeration provides access to the names of
    388   *  the charsets.
    389   *
    390   *  @param ucsd a Charset detector.
    391   *  @param status  Any error conditions are reported back in this variable.
    392   *  @return an iterator providing access to the names of the charsets
    393   *  detectable by the specified charset detector.
    394   *  @internal
    395   */
    396 U_INTERNAL UEnumeration * U_EXPORT2
    397 ucsdet_getDetectableCharsets(const UCharsetDetector *ucsd,  UErrorCode *status);
    398 
    399 /**
    400   * Enable or disable an individual charset encoding.
    401   * The name of the charset encoding must be one of the names returned by
    402   * {@link #ucsdet_getAllDetectableCharsets()}.
    403   *
    404   * @param ucsd a Charset detector.
    405   * @param encoding the name of the charset encoding.
    406   * @param enabled <code>TRUE</code> to enable, or <code>FALSE</code> to disable the
    407   *   charset encoding.
    408   * @param status receives the return status. When the name of charset encoding
    409   *   is not supported, U_ILLEGAL_ARGUMENT_ERROR is set.
    410   * @internal
    411   */
    412 U_INTERNAL void U_EXPORT2
    413 ucsdet_setDetectableCharset(UCharsetDetector *ucsd, const char *encoding, UBool enabled, UErrorCode *status);
    414 #endif  /* U_HIDE_INTERNAL_API */
    415 
    416 #endif
    417 #endif   /* __UCSDET_H */
    418 
    419 
    420