Home | History | Annotate | Download | only in unicode
      1 /*
      2 **********************************************************************
      3 *   Copyright (C) 1999-2009, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 **********************************************************************
      6  *
      7  *
      8  *   ucnv_err.h:
      9  */
     10 
     11 /**
     12  * \file
     13  * \brief C UConverter predefined error callbacks
     14  *
     15  *  <h2>Error Behaviour Functions</h2>
     16  *  Defines some error behaviour functions called by ucnv_{from,to}Unicode
     17  *  These are provided as part of ICU and many are stable, but they
     18  *  can also be considered only as an example of what can be done with
     19  *  callbacks.  You may of course write your own.
     20  *
     21  *  If you write your own callbacks, you may also find the functions
     22  *  from ucnv_cb.h useful.
     23  *
     24  *  These functions, although public, should NEVER be called directly.
     25  *  They should be used as parameters to the ucnv_setFromUCallBack
     26  *  and ucnv_setToUCallBack functions, to set the behaviour of a converter
     27  *  when it encounters ILLEGAL/UNMAPPED/INVALID sequences.
     28  *
     29  *  usage example:  'STOP' doesn't need any context, but newContext
     30  *    could be set to something other than 'NULL' if needed. The available
     31  *    contexts in this header can modify the default behavior of the callback.
     32  *
     33  *  \code
     34  *  UErrorCode err = U_ZERO_ERROR;
     35  *  UConverter *myConverter = ucnv_open("ibm-949", &err);
     36  *  const void *oldContext;
     37  *  UConverterFromUCallback oldAction;
     38  *
     39  *
     40  *  if (U_SUCCESS(err))
     41  *  {
     42  *      ucnv_setFromUCallBack(myConverter,
     43  *                       UCNV_FROM_U_CALLBACK_STOP,
     44  *                       NULL,
     45  *                       &oldAction,
     46  *                       &oldContext,
     47  *                       &err);
     48  *  }
     49  *  \endcode
     50  *
     51  *  The code above tells "myConverter" to stop when it encounters an
     52  *  ILLEGAL/TRUNCATED/INVALID sequence while converting from
     53  *  Unicode -> Codepage. The behavior from Codepage to Unicode is not changed,
     54  *  and ucnv_setToUCallBack would need to be called in order to change
     55  *  that behavior too.
     56  *
     57  *  Here is an example with a context:
     58  *
     59  *  \code
     60  *  UErrorCode err = U_ZERO_ERROR;
     61  *  UConverter *myConverter = ucnv_open("ibm-949", &err);
     62  *  const void *oldContext;
     63  *  UConverterToUCallback oldAction;
     64  *
     65  *
     66  *  if (U_SUCCESS(err))
     67  *  {
     68  *      ucnv_setToUCallBack(myConverter,
     69  *                       UCNV_TO_U_CALLBACK_SUBSTITUTE,
     70  *                       UCNV_SUB_STOP_ON_ILLEGAL,
     71  *                       &oldAction,
     72  *                       &oldContext,
     73  *                       &err);
     74  *  }
     75  *  \endcode
     76  *
     77  *  The code above tells "myConverter" to stop when it encounters an
     78  *  ILLEGAL/TRUNCATED/INVALID sequence while converting from
     79  *  Codepage -> Unicode. Any unmapped but legal characters will be
     80  *  replaced with the default substitution character.
     81  */
     82 
     83 #ifndef UCNV_ERR_H
     84 #define UCNV_ERR_H
     85 
     86 #include "unicode/utypes.h"
     87 
     88 #if !UCONFIG_NO_CONVERSION
     89 
     90 /** Forward declaration of the UConverter structure; this header only uses pointers to it. @stable ICU 2.0 */
     91 struct UConverter;
     92 
     93 /** Convenience type name for struct UConverter. @stable ICU 2.0 */
     94 typedef struct UConverter UConverter;
     95 
     96 /**
     97  * Context option for the FROM_U and TO_U SUBSTITUTE callbacks: stop at an ILLEGAL_SEQUENCE instead of substituting it.
     98  * @stable ICU 2.0
     99  */
    100 #define UCNV_SUB_STOP_ON_ILLEGAL "i"
    101 
    102 /**
    103  * Context option for the FROM_U and TO_U SKIP callbacks: stop at an ILLEGAL_SEQUENCE instead of skipping it.
    104  * @stable ICU 2.0
    105  */
    106 #define UCNV_SKIP_STOP_ON_ILLEGAL "i"
    107 
    108 /**
    109  * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to ICU (%UXXXX)
    110  * @stable ICU 2.0
    111  */
    112 #define UCNV_ESCAPE_ICU       NULL
    113 /**
    114  * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to JAVA (\\uXXXX)
    115  * @stable ICU 2.0
    116  */
    117 #define UCNV_ESCAPE_JAVA      "J"
    118 /**
    119  * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to C (\\uXXXX \\UXXXXXXXX)
    120  * TO_U_CALLBACK_ESCAPE context option to escape the character value according to C (\\xXXXX)
    121  * @stable ICU 2.0
    122  */
    123 #define UCNV_ESCAPE_C         "C"
    124 /**
    125  * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to XML Decimal escape \htmlonly(&amp;#DDDD;)\endhtmlonly
    126  * TO_U_CALLBACK_ESCAPE context option to escape the character value according to XML Decimal escape \htmlonly(&amp;#DDDD;)\endhtmlonly
    127  * @stable ICU 2.0
    128  */
    129 #define UCNV_ESCAPE_XML_DEC   "D"
    130 /**
    131  * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to XML Hex escape \htmlonly(&amp;#xXXXX;)\endhtmlonly
    132  * TO_U_CALLBACK_ESCAPE context option to escape the character value according to XML Hex escape \htmlonly(&amp;#xXXXX;)\endhtmlonly
    133  * @stable ICU 2.0
    134  */
    135 #define UCNV_ESCAPE_XML_HEX   "X"
    136 /**
    137  * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to Unicode (U+XXXXX)
    138  * @stable ICU 2.0
    139  */
    140 #define UCNV_ESCAPE_UNICODE   "U"
    141 
    142 /**
    143  * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to CSS2 conventions (\\HH..H<space>, that is,
    144  * a backslash, 1..6 hex digits, and a space)
    145  * @stable ICU 4.0
    146  */
    147 #define UCNV_ESCAPE_CSS2   "S"
    148 
    149 /**
    150  * The process condition code to be used with the callbacks.
    151  * Codes which are greater than UCNV_IRREGULAR should be
    152  * passed on to any chained callbacks.
    153  * @stable ICU 2.0
    154  */
    155 typedef enum {
    156     UCNV_UNASSIGNED = 0,  /**< The code point is unassigned.
    157                              The error code U_INVALID_CHAR_FOUND will be set. */
    158     UCNV_ILLEGAL = 1,     /**< The code point is illegal. For example,
    159                              \\x81\\x2E is illegal in SJIS because \\x2E
    160                              is not a valid trail byte for the \\x81
    161                              lead byte.
    162                              Also, starting with Unicode 3.0.1, non-shortest byte sequences
    163                              in UTF-8 (like \\xC1\\xA1 instead of \\x61 for U+0061)
    164                              are also illegal, not just irregular.
    165                              The error code U_ILLEGAL_CHAR_FOUND will be set. */
    166     UCNV_IRREGULAR = 2,   /**< The codepoint is not a regular sequence in
    167                              the encoding. For example, \\xED\\xA0\\x80..\\xED\\xBF\\xBF
    168                              are irregular UTF-8 byte sequences for single surrogate
    169                              code points.
    170                              The error code U_INVALID_CHAR_FOUND will be set. */
    171     UCNV_RESET = 3,       /**< The callback is called with this reason when a
    172                              'reset' has occurred. The callback should reset all
    173                              state. */
    174     UCNV_CLOSE = 4,        /**< Called when the converter is closed. The
    175                              callback should release any allocated memory.*/
    176     UCNV_CLONE = 5         /**< Called when ucnv_safeClone() is called on the
    177                               converter. The pointer available as the
    178                               'context' is an alias to the original converter's
    179                               context pointer. If the context must be owned
    180                               by the new converter, the callback must clone
    181                               the data and call ucnv_setFromUCallBack
    182                               (or ucnv_setToUCallBack) with the correct pointer.
    183                               @stable ICU 2.2
    184                            */
    185 } UConverterCallbackReason;
    186 
    187 
    188 /**
    189  * The structure for the fromUnicode callback function parameter.
    190  * @stable ICU 2.0
    191  */
    192 typedef struct {
    193     uint16_t size;              /**< The size of this struct. @stable ICU 2.0 */
    194     UBool flush;                /**< The internal state of converter will be reset and data flushed if set to TRUE. @stable ICU 2.0    */
    195     UConverter *converter;      /**< Pointer to the converter that is opened and to which this struct is passed as an argument. @stable ICU 2.0  */
    196     const UChar *source;        /**< Pointer to the source buffer. @stable ICU 2.0    */
    197     const UChar *sourceLimit;   /**< Pointer to the limit (end + 1) of source buffer. @stable ICU 2.0    */
    198     char *target;               /**< Pointer to the target buffer. @stable ICU 2.0    */
    199     const char *targetLimit;    /**< Pointer to the limit (end + 1) of target buffer. @stable ICU 2.0     */
    200     int32_t *offsets;           /**< Pointer to the buffer that receives the offsets, written sequentially (*offsets = value; offsets++). @stable ICU 2.0  */
    201 } UConverterFromUnicodeArgs;
    202 
    203 
    204 /**
    205  * The structure for the toUnicode callback function parameter.
    206  * @stable ICU 2.0
    207  */
    208 typedef struct {
    209     uint16_t size;              /**< The size of this struct. @stable ICU 2.0 */
    210     UBool flush;                /**< The internal state of converter will be reset and data flushed if set to TRUE. @stable ICU 2.0   */
    211     UConverter *converter;      /**< Pointer to the converter that is opened and to which this struct is passed as an argument. @stable ICU 2.0 */
    212     const char *source;         /**< Pointer to the source buffer. @stable ICU 2.0    */
    213     const char *sourceLimit;    /**< Pointer to the limit (end + 1) of source buffer. @stable ICU 2.0    */
    214     UChar *target;              /**< Pointer to the target buffer. @stable ICU 2.0    */
    215     const UChar *targetLimit;   /**< Pointer to the limit (end + 1) of target buffer. @stable ICU 2.0     */
    216     int32_t *offsets;           /**< Pointer to the buffer that receives the offsets, written sequentially (*offsets = value; offsets++). @stable ICU 2.0  */
    217 } UConverterToUnicodeArgs;
    218 
    219 
    220 /**
    221  * DO NOT CALL THIS FUNCTION DIRECTLY!
    222  * This From Unicode callback STOPS at the ILLEGAL_SEQUENCE,
    223  * returning the error code back to the caller immediately.
    224  *
    225  * @param context Pointer to the callback's private data (the STOP callback does not need any context)
    226  * @param fromUArgs Information about the conversion in progress
    227  * @param codeUnits Points to 'length' UChars of the concerned Unicode sequence
    228  * @param length Number of UChars of the concerned Unicode sequence that codeUnits points to
    229  * @param codePoint Single UChar32 (UTF-32) containing the concerned Unicode codepoint.
    230  * @param reason Defines the reason the callback was invoked
    231  * @param err This should always be set to a failure status prior to calling.
    232  * @stable ICU 2.0
    233  */
    234 U_STABLE void U_EXPORT2 UCNV_FROM_U_CALLBACK_STOP (
    235                   const void *context,
    236                   UConverterFromUnicodeArgs *fromUArgs,
    237                   const UChar* codeUnits,
    238                   int32_t length,
    239                   UChar32 codePoint,
    240                   UConverterCallbackReason reason,
    241                   UErrorCode * err);
    242 
    243 
    244 
    245 /**
    246  * DO NOT CALL THIS FUNCTION DIRECTLY!
    247  * This To Unicode callback STOPS at the ILLEGAL_SEQUENCE,
    248  * returning the error code back to the caller immediately.
    249  *
    250  * @param context Pointer to the callback's private data (the STOP callback does not need any context)
    251  * @param toUArgs Information about the conversion in progress
    252  * @param codeUnits Points to 'length' bytes of the concerned codepage sequence
    253  * @param length Size (in bytes) of the concerned codepage sequence that codeUnits points to
    254  * @param reason Defines the reason the callback was invoked
    255  * @param err This should always be set to a failure status prior to calling.
    256  * @stable ICU 2.0
    257  */
    258 U_STABLE void U_EXPORT2 UCNV_TO_U_CALLBACK_STOP (
    259                   const void *context,
    260                   UConverterToUnicodeArgs *toUArgs,
    261                   const char* codeUnits,
    262                   int32_t length,
    263                   UConverterCallbackReason reason,
    264                   UErrorCode * err);
    265 
    266 /**
    267  * DO NOT CALL THIS FUNCTION DIRECTLY!
    268  * This From Unicode callback skips any ILLEGAL_SEQUENCE, or
    269  * skips only UNASSIGNED_SEQUENCE depending on the context parameter,
    270  * simply ignoring those characters.
    271  *
    272  * @param context  The function currently recognizes the callback options:
    273  *                 UCNV_SKIP_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE,
    274  *                      returning the error code back to the caller immediately.
    275  *                 NULL: Skips any ILLEGAL_SEQUENCE
    276  * @param fromUArgs Information about the conversion in progress
    277  * @param codeUnits Points to 'length' UChars of the concerned Unicode sequence
    278  * @param length Number of UChars of the concerned Unicode sequence that codeUnits points to
    279  * @param codePoint Single UChar32 (UTF-32) containing the concerned Unicode codepoint.
    280  * @param reason Defines the reason the callback was invoked
    281  * @param err Return value will be set to success if the callback was handled,
    282  *      otherwise this value will be set to a failure status.
    283  * @stable ICU 2.0
    284  */
    285 U_STABLE void U_EXPORT2 UCNV_FROM_U_CALLBACK_SKIP (
    286                   const void *context,
    287                   UConverterFromUnicodeArgs *fromUArgs,
    288                   const UChar* codeUnits,
    289                   int32_t length,
    290                   UChar32 codePoint,
    291                   UConverterCallbackReason reason,
    292                   UErrorCode * err);
    293 
    294 /**
    295  * DO NOT CALL THIS FUNCTION DIRECTLY!
    296  * This From Unicode callback will substitute the ILLEGAL_SEQUENCE, or only the
    297  * UNASSIGNED_SEQUENCE depending on the context parameter, with the
    298  * current substitution string for the converter. This is the default
    299  * callback.
    300  *
    301  * @param context The function currently recognizes the callback options:
    302  *                 UCNV_SUB_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE,
    303  *                      returning the error code back to the caller immediately.
    304  *                 NULL: Substitutes any ILLEGAL_SEQUENCE
    305  * @param fromUArgs Information about the conversion in progress
    306  * @param codeUnits Points to 'length' UChars of the concerned Unicode sequence
    307  * @param length Number of UChars of the concerned Unicode sequence that codeUnits points to
    308  * @param codePoint Single UChar32 (UTF-32) containing the concerned Unicode codepoint.
    309  * @param reason Defines the reason the callback was invoked
    310  * @param err Return value will be set to success if the callback was handled,
    311  *      otherwise this value will be set to a failure status.
    312  * @see ucnv_setSubstChars
    313  * @stable ICU 2.0
    314  */
    315 U_STABLE void U_EXPORT2 UCNV_FROM_U_CALLBACK_SUBSTITUTE (
    316                   const void *context,
    317                   UConverterFromUnicodeArgs *fromUArgs,
    318                   const UChar* codeUnits,
    319                   int32_t length,
    320                   UChar32 codePoint,
    321                   UConverterCallbackReason reason,
    322                   UErrorCode * err);
    323 
    324 /**
    325  * DO NOT CALL THIS FUNCTION DIRECTLY!
    326  * This From Unicode callback will substitute the ILLEGAL SEQUENCE with an
    327  * escaped (hexadecimal or decimal) representation of the illegal codepoints.
    328  *
    329  * @param context The function currently recognizes the callback options:
    330  *        <ul>
    331  *        <li>UCNV_ESCAPE_ICU: Substitutes the ILLEGAL SEQUENCE with the hexadecimal
    332  *          representation in the format %UXXXX (e.g. "%UFFFE%U00AC%UC8FE").
    333  *          In the event the converter doesn't support the characters {%,U}[A-F][0-9],
    334  *          it will substitute the illegal sequence with the substitution characters.
    335  *          Note that a supplementary code point (a surrogate pair in UTF-16) is represented as
    336  *          %UD84D%UDC56</li>
    337  *        <li>UCNV_ESCAPE_JAVA: Substitutes the ILLEGAL SEQUENCE with the hexadecimal
    338  *          representation in the format \\uXXXX (e.g. "\\uFFFE\\u00AC\\uC8FE").
    339  *          In the event the converter doesn't support the characters {\,u}[A-F][0-9],
    340  *          it will substitute the illegal sequence with the substitution characters.
    341  *          Note that a supplementary code point (a surrogate pair in UTF-16) is represented as
    342  *          \\uD84D\\uDC56</li>
    343  *        <li>UCNV_ESCAPE_C: Substitutes the ILLEGAL SEQUENCE with the hexadecimal
    344  *          representation in the format \\uXXXX (e.g. "\\uFFFE\\u00AC\\uC8FE").
    345  *          In the event the converter doesn't support the characters {\,u,U}[A-F][0-9],
    346  *          it will substitute the illegal sequence with the substitution characters.
    347  *          Note that a supplementary code point (a surrogate pair in UTF-16) is represented as
    348  *          \\U00023456</li>
    349  *        <li>UCNV_ESCAPE_XML_DEC: Substitutes the ILLEGAL SEQUENCE with the decimal
    350  *          representation in the format \htmlonly&amp;#DDDDDDDD; (e.g. "&amp;#65534;&amp;#172;&amp;#51454;")\endhtmlonly.
    351  *          In the event the converter doesn't support the characters {&amp;,#}[0-9],
    352  *          it will substitute the illegal sequence with the substitution characters.
    353  *          Note that a supplementary code point (a surrogate pair in UTF-16) is represented as
    354  *          &amp;#144470; and zero padding is ignored.</li>
    355  *        <li>UCNV_ESCAPE_XML_HEX: Substitutes the ILLEGAL SEQUENCE with the hexadecimal
    356  *          representation in the format \htmlonly&amp;#xXXXX; (e.g. "&amp;#xFFFE;&amp;#x00AC;&amp;#xC8FE;")\endhtmlonly.
    357  *          In the event the converter doesn't support the characters {&,#,x}[0-9],
    358  *          it will substitute the illegal sequence with the substitution characters.
    359  *          Note that a supplementary code point (a surrogate pair in UTF-16) is represented as
    360  *          \htmlonly&amp;#x23456;\endhtmlonly</li>
    361  *        </ul>
    362  * @param fromUArgs Information about the conversion in progress
    363  * @param codeUnits Points to 'length' UChars of the concerned Unicode sequence
    364  * @param length Number of UChars of the concerned Unicode sequence that codeUnits points to
    365  * @param codePoint Single UChar32 (UTF-32) containing the concerned Unicode codepoint.
    366  * @param reason Defines the reason the callback was invoked
    367  * @param err Return value will be set to success if the callback was handled,
    368  *      otherwise this value will be set to a failure status.
    369  * @stable ICU 2.0
    370  */
    371 U_STABLE void U_EXPORT2 UCNV_FROM_U_CALLBACK_ESCAPE (
    372                   const void *context,
    373                   UConverterFromUnicodeArgs *fromUArgs,
    374                   const UChar* codeUnits,
    375                   int32_t length,
    376                   UChar32 codePoint,
    377                   UConverterCallbackReason reason,
    378                   UErrorCode * err);
    379 
    380 
    381 /**
    382  * DO NOT CALL THIS FUNCTION DIRECTLY!
    383  * This To Unicode callback skips any ILLEGAL_SEQUENCE, or
    384  * skips only UNASSIGNED_SEQUENCE depending on the context parameter,
    385  * simply ignoring those characters.
    386  *
    387  * @param context  The function currently recognizes the callback options:
    388  *                 UCNV_SKIP_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE,
    389  *                      returning the error code back to the caller immediately.
    390  *                 NULL: Skips any ILLEGAL_SEQUENCE
    391  * @param toUArgs Information about the conversion in progress
    392  * @param codeUnits Points to 'length' bytes of the concerned codepage sequence
    393  * @param length Size (in bytes) of the concerned codepage sequence
    394  * @param reason Defines the reason the callback was invoked
    395  * @param err Return value will be set to success if the callback was handled,
    396  *      otherwise this value will be set to a failure status.
    397  * @stable ICU 2.0
    398  */
    399 U_STABLE void U_EXPORT2 UCNV_TO_U_CALLBACK_SKIP (
    400                   const void *context,
    401                   UConverterToUnicodeArgs *toUArgs,
    402                   const char* codeUnits,
    403                   int32_t length,
    404                   UConverterCallbackReason reason,
    405                   UErrorCode * err);
    406 
    407 /**
    408  * DO NOT CALL THIS FUNCTION DIRECTLY!
    409  * This To Unicode callback will substitute the ILLEGAL_SEQUENCE, or only the
    410  * UNASSIGNED_SEQUENCE depending on the context parameter, with the
    411  * Unicode substitution character, U+FFFD.
    412  *
    413  * @param context  The function currently recognizes the callback options:
    414  *                 UCNV_SUB_STOP_ON_ILLEGAL: STOPS at the ILLEGAL_SEQUENCE,
    415  *                      returning the error code back to the caller immediately.
    416  *                 NULL: Substitutes any ILLEGAL_SEQUENCE
    417  * @param toUArgs Information about the conversion in progress
    418  * @param codeUnits Points to 'length' bytes of the concerned codepage sequence
    419  * @param length Size (in bytes) of the concerned codepage sequence
    420  * @param reason Defines the reason the callback was invoked
    421  * @param err Return value will be set to success if the callback was handled,
    422  *      otherwise this value will be set to a failure status.
    423  * @stable ICU 2.0
    424  */
    425 U_STABLE void U_EXPORT2 UCNV_TO_U_CALLBACK_SUBSTITUTE (
    426                   const void *context,
    427                   UConverterToUnicodeArgs *toUArgs,
    428                   const char* codeUnits,
    429                   int32_t length,
    430                   UConverterCallbackReason reason,
    431                   UErrorCode * err);
    432 
    433 /**
    434  * DO NOT CALL THIS FUNCTION DIRECTLY!
    435  * This To Unicode callback will substitute the ILLEGAL SEQUENCE with an
    436  * escaped representation of the illegal bytes, e.g. in the format %XNN ("%XFF%X0A%XC8%X03");
    437  * the context parameter can select a different escape format (see the UCNV_ESCAPE_ options).
    438  *
    439  * @param context This function currently recognizes the callback options:
    440  *      UCNV_ESCAPE_ICU, UCNV_ESCAPE_JAVA, UCNV_ESCAPE_C, UCNV_ESCAPE_XML_DEC,
    441  *      UCNV_ESCAPE_XML_HEX and UCNV_ESCAPE_UNICODE.
    442  * @param toUArgs Information about the conversion in progress
    443  * @param codeUnits Points to 'length' bytes of the concerned codepage sequence
    444  * @param length Size (in bytes) of the concerned codepage sequence
    445  * @param reason Defines the reason the callback was invoked
    446  * @param err Return value will be set to success if the callback was handled,
    447  *      otherwise this value will be set to a failure status.
    448  * @stable ICU 2.0
    449  */
    450 
    451 U_STABLE void U_EXPORT2 UCNV_TO_U_CALLBACK_ESCAPE (
    452                   const void *context,
    453                   UConverterToUnicodeArgs *toUArgs,
    454                   const char* codeUnits,
    455                   int32_t length,
    456                   UConverterCallbackReason reason,
    457                   UErrorCode * err);
    458 
    459 #endif
    460 
    461 #endif
    462 
    463 /*UCNV_ERR_H*/
    464