Home | History | Annotate | Download | only in common
      1 /*
      2  *****************************************************************************
      3  *
      4  *   Copyright (C) 1998-2014, International Business Machines
      5  *   Corporation and others.  All Rights Reserved.
      6  *
      7  *****************************************************************************
      8  *
      9  *  ucnv_err.c
     10  *  Implements error behaviour functions called by T_UConverter_{from,to}Unicode
     11  *
     12  *
     13 *   Change history:
     14 *
     15 *   06/29/2000  helena      Major rewrite of the callback APIs.
     16 */
     17 
     18 #include "unicode/utypes.h"
     19 
     20 #if !UCONFIG_NO_CONVERSION
     21 
     22 #include "unicode/ucnv_err.h"
     23 #include "unicode/ucnv_cb.h"
     24 #include "ucnv_cnv.h"
     25 #include "cmemory.h"
     26 #include "unicode/ucnv.h"
     27 #include "ustrfmt.h"
     28 
     29 #define VALUE_STRING_LENGTH 32
     30 /*Magic # 32 = 4(number of char in value string) * 8(max number of bytes per char for any converter) */
     31 #define UNICODE_PERCENT_SIGN_CODEPOINT  0x0025
     32 #define UNICODE_U_CODEPOINT             0x0055
     33 #define UNICODE_X_CODEPOINT             0x0058
     34 #define UNICODE_RS_CODEPOINT            0x005C
     35 #define UNICODE_U_LOW_CODEPOINT         0x0075
     36 #define UNICODE_X_LOW_CODEPOINT         0x0078
     37 #define UNICODE_AMP_CODEPOINT           0x0026
     38 #define UNICODE_HASH_CODEPOINT          0x0023
     39 #define UNICODE_SEMICOLON_CODEPOINT     0x003B
     40 #define UNICODE_PLUS_CODEPOINT          0x002B
     41 #define UNICODE_LEFT_CURLY_CODEPOINT    0x007B
     42 #define UNICODE_RIGHT_CURLY_CODEPOINT   0x007D
     43 #define UNICODE_SPACE_CODEPOINT         0x0020
     44 #define UCNV_PRV_ESCAPE_ICU         0
     45 #define UCNV_PRV_ESCAPE_C           'C'
     46 #define UCNV_PRV_ESCAPE_XML_DEC     'D'
     47 #define UCNV_PRV_ESCAPE_XML_HEX     'X'
     48 #define UCNV_PRV_ESCAPE_JAVA        'J'
     49 #define UCNV_PRV_ESCAPE_UNICODE     'U'
     50 #define UCNV_PRV_ESCAPE_CSS2        'S'
     51 #define UCNV_PRV_STOP_ON_ILLEGAL    'i'
     52 
     53 /*
     54  * IS_DEFAULT_IGNORABLE_CODE_POINT
     55  * This is to check if a code point has the default ignorable unicode property.
     56  * As such, this list needs to be updated if the ignorable code point list ever
     57  * changes.
     58  * To avoid dependency on other code, this list is hard coded here.
     59  * When an ignorable code point is found and is unmappable, the default callbacks
     60  * will ignore them.
     61  * For a list of the default ignorable code points, use this link: http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[%3ADI%3A]&g=
     62  *
     63  * This list should be sync with the one in CharsetCallback.java
     64  */
     65 #define IS_DEFAULT_IGNORABLE_CODE_POINT(c) (\
     66     (c == 0x00AD) || \
     67     (c == 0x034F) || \
     68     (c == 0x061C) || \
     69     (c == 0x115F) || \
     70     (c == 0x1160) || \
     71     (0x17B4 <= c && c <= 0x17B5) || \
     72     (0x180B <= c && c <= 0x180E) || \
     73     (0x200B <= c && c <= 0x200F) || \
     74     (0x202A <= c && c <= 0x202E) || \
     75     (c == 0x2060) || \
     76     (0x2066 <= c && c <= 0x2069) || \
     77     (0x2061 <= c && c <= 0x2064) || \
     78     (0x206A <= c && c <= 0x206F) || \
     79     (c == 0x3164) || \
     80     (0x0FE00 <= c && c <= 0x0FE0F) || \
     81     (c == 0x0FEFF) || \
     82     (c == 0x0FFA0) || \
     83     (0x01BCA0  <= c && c <= 0x01BCA3) || \
     84     (0x01D173 <= c && c <= 0x01D17A) || \
     85     (c == 0x0E0001) || \
     86     (0x0E0020 <= c && c <= 0x0E007F) || \
     87     (0x0E0100 <= c && c <= 0x0E01EF) || \
     88     (c == 0x2065) || \
     89     (0x0FFF0 <= c && c <= 0x0FFF8) || \
     90     (c == 0x0E0000) || \
     91     (0x0E0002 <= c && c <= 0x0E001F) || \
     92     (0x0E0080 <= c && c <= 0x0E00FF) || \
     93     (0x0E01F0 <= c && c <= 0x0E0FFF) \
     94     )
     95 
     96 
     97 /*Function Pointer STOPS at the ILLEGAL_SEQUENCE */
     98 U_CAPI void    U_EXPORT2
     99 UCNV_FROM_U_CALLBACK_STOP (
    100                   const void *context,
    101                   UConverterFromUnicodeArgs *fromUArgs,
    102                   const UChar* codeUnits,
    103                   int32_t length,
    104                   UChar32 codePoint,
    105                   UConverterCallbackReason reason,
    106                   UErrorCode * err)
    107 {
    108     if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
    109     {
    110         /*
    111          * Skip if the codepoint has unicode property of default ignorable.
    112          */
    113         *err = U_ZERO_ERROR;
    114     }
    115     /* the caller must have set the error code accordingly */
    116     return;
    117 }
    118 
    119 
    120 /*Function Pointer STOPS at the ILLEGAL_SEQUENCE */
    121 U_CAPI void    U_EXPORT2
    122 UCNV_TO_U_CALLBACK_STOP (
    123                    const void *context,
    124                    UConverterToUnicodeArgs *toUArgs,
    125                    const char* codePoints,
    126                    int32_t length,
    127                    UConverterCallbackReason reason,
    128                    UErrorCode * err)
    129 {
    130     /* the caller must have set the error code accordingly */
    131     return;
    132 }
    133 
    134 U_CAPI void    U_EXPORT2
    135 UCNV_FROM_U_CALLBACK_SKIP (
    136                   const void *context,
    137                   UConverterFromUnicodeArgs *fromUArgs,
    138                   const UChar* codeUnits,
    139                   int32_t length,
    140                   UChar32 codePoint,
    141                   UConverterCallbackReason reason,
    142                   UErrorCode * err)
    143 {
    144     if (reason <= UCNV_IRREGULAR)
    145     {
    146         if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
    147         {
    148             /*
    149              * Skip if the codepoint has unicode property of default ignorable.
    150              */
    151             *err = U_ZERO_ERROR;
    152         }
    153         else if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
    154         {
    155             *err = U_ZERO_ERROR;
    156         }
    157         /* else the caller must have set the error code accordingly. */
    158     }
    159     /* else ignore the reset, close and clone calls. */
    160 }
    161 
    162 U_CAPI void    U_EXPORT2
    163 UCNV_FROM_U_CALLBACK_SUBSTITUTE (
    164                   const void *context,
    165                   UConverterFromUnicodeArgs *fromArgs,
    166                   const UChar* codeUnits,
    167                   int32_t length,
    168                   UChar32 codePoint,
    169                   UConverterCallbackReason reason,
    170                   UErrorCode * err)
    171 {
    172     if (reason <= UCNV_IRREGULAR)
    173     {
    174         if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
    175         {
    176             /*
    177              * Skip if the codepoint has unicode property of default ignorable.
    178              */
    179             *err = U_ZERO_ERROR;
    180         }
    181         else if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
    182         {
    183             *err = U_ZERO_ERROR;
    184             ucnv_cbFromUWriteSub(fromArgs, 0, err);
    185         }
    186         /* else the caller must have set the error code accordingly. */
    187     }
    188     /* else ignore the reset, close and clone calls. */
    189 }
    190 
    191 /*uses uprv_itou to get a unicode escape sequence of the offensive sequence,
    192  *uses a clean copy (resetted) of the converter, to convert that unicode
    193  *escape sequence to the target codepage (if conversion failure happens then
    194  *we revert to substituting with subchar)
    195  */
    196 U_CAPI void    U_EXPORT2
    197 UCNV_FROM_U_CALLBACK_ESCAPE (
    198                          const void *context,
    199                          UConverterFromUnicodeArgs *fromArgs,
    200                          const UChar *codeUnits,
    201                          int32_t length,
    202                          UChar32 codePoint,
    203                          UConverterCallbackReason reason,
    204                          UErrorCode * err)
    205 {
    206 
    207   UChar valueString[VALUE_STRING_LENGTH];
    208   int32_t valueStringLength = 0;
    209   int32_t i = 0;
    210 
    211   const UChar *myValueSource = NULL;
    212   UErrorCode err2 = U_ZERO_ERROR;
    213   UConverterFromUCallback original = NULL;
    214   const void *originalContext;
    215 
    216   UConverterFromUCallback ignoredCallback = NULL;
    217   const void *ignoredContext;
    218 
    219   if (reason > UCNV_IRREGULAR)
    220   {
    221       return;
    222   }
    223   else if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
    224   {
    225       /*
    226        * Skip if the codepoint has unicode property of default ignorable.
    227        */
    228       *err = U_ZERO_ERROR;
    229       return;
    230   }
    231 
    232   ucnv_setFromUCallBack (fromArgs->converter,
    233                      (UConverterFromUCallback) UCNV_FROM_U_CALLBACK_SUBSTITUTE,
    234                      NULL,
    235                      &original,
    236                      &originalContext,
    237                      &err2);
    238 
    239   if (U_FAILURE (err2))
    240   {
    241     *err = err2;
    242     return;
    243   }
    244   if(context==NULL)
    245   {
    246       while (i < length)
    247       {
    248         valueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT;  /* adding % */
    249         valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT; /* adding U */
    250         valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
    251       }
    252   }
    253   else
    254   {
    255       switch(*((char*)context))
    256       {
    257       case UCNV_PRV_ESCAPE_JAVA:
    258           while (i < length)
    259           {
    260               valueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT;    /* adding \ */
    261               valueString[valueStringLength++] = (UChar) UNICODE_U_LOW_CODEPOINT; /* adding u */
    262               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
    263           }
    264           break;
    265 
    266       case UCNV_PRV_ESCAPE_C:
    267           valueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT;    /* adding \ */
    268 
    269           if(length==2){
    270               valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT; /* adding U */
    271               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 8);
    272 
    273           }
    274           else{
    275               valueString[valueStringLength++] = (UChar) UNICODE_U_LOW_CODEPOINT; /* adding u */
    276               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 4);
    277           }
    278           break;
    279 
    280       case UCNV_PRV_ESCAPE_XML_DEC:
    281 
    282           valueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT;   /* adding & */
    283           valueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT;  /* adding # */
    284           if(length==2){
    285               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 10, 0);
    286           }
    287           else{
    288               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 10, 0);
    289           }
    290           valueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
    291           break;
    292 
    293       case UCNV_PRV_ESCAPE_XML_HEX:
    294 
    295           valueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT;   /* adding & */
    296           valueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT;  /* adding # */
    297           valueString[valueStringLength++] = (UChar) UNICODE_X_LOW_CODEPOINT; /* adding x */
    298           if(length==2){
    299               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 0);
    300           }
    301           else{
    302               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 0);
    303           }
    304           valueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
    305           break;
    306 
    307       case UCNV_PRV_ESCAPE_UNICODE:
    308           valueString[valueStringLength++] = (UChar) UNICODE_LEFT_CURLY_CODEPOINT;    /* adding { */
    309           valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT;    /* adding U */
    310           valueString[valueStringLength++] = (UChar) UNICODE_PLUS_CODEPOINT; /* adding + */
    311           if (length == 2) {
    312               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 4);
    313           } else {
    314               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 4);
    315           }
    316           valueString[valueStringLength++] = (UChar) UNICODE_RIGHT_CURLY_CODEPOINT;    /* adding } */
    317           break;
    318 
    319       case UCNV_PRV_ESCAPE_CSS2:
    320           valueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT;    /* adding \ */
    321           valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 0);
    322           /* Always add space character, becase the next character might be whitespace,
    323              which would erroneously be considered the termination of the escape sequence. */
    324           valueString[valueStringLength++] = (UChar) UNICODE_SPACE_CODEPOINT;
    325           break;
    326 
    327       default:
    328           while (i < length)
    329           {
    330               valueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT;  /* adding % */
    331               valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT;             /* adding U */
    332               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
    333           }
    334       }
    335   }
    336   myValueSource = valueString;
    337 
    338   /* reset the error */
    339   *err = U_ZERO_ERROR;
    340 
    341   ucnv_cbFromUWriteUChars(fromArgs, &myValueSource, myValueSource+valueStringLength, 0, err);
    342 
    343   ucnv_setFromUCallBack (fromArgs->converter,
    344                          original,
    345                          originalContext,
    346                          &ignoredCallback,
    347                          &ignoredContext,
    348                          &err2);
    349   if (U_FAILURE (err2))
    350   {
    351       *err = err2;
    352       return;
    353   }
    354 
    355   return;
    356 }
    357 
    358 
    359 
    360 U_CAPI void  U_EXPORT2
    361 UCNV_TO_U_CALLBACK_SKIP (
    362                  const void *context,
    363                  UConverterToUnicodeArgs *toArgs,
    364                  const char* codeUnits,
    365                  int32_t length,
    366                  UConverterCallbackReason reason,
    367                  UErrorCode * err)
    368 {
    369     if (reason <= UCNV_IRREGULAR)
    370     {
    371         if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
    372         {
    373             *err = U_ZERO_ERROR;
    374         }
    375         /* else the caller must have set the error code accordingly. */
    376     }
    377     /* else ignore the reset, close and clone calls. */
    378 }
    379 
    380 U_CAPI void    U_EXPORT2
    381 UCNV_TO_U_CALLBACK_SUBSTITUTE (
    382                  const void *context,
    383                  UConverterToUnicodeArgs *toArgs,
    384                  const char* codeUnits,
    385                  int32_t length,
    386                  UConverterCallbackReason reason,
    387                  UErrorCode * err)
    388 {
    389     if (reason <= UCNV_IRREGULAR)
    390     {
    391         if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
    392         {
    393             *err = U_ZERO_ERROR;
    394             ucnv_cbToUWriteSub(toArgs,0,err);
    395         }
    396         /* else the caller must have set the error code accordingly. */
    397     }
    398     /* else ignore the reset, close and clone calls. */
    399 }
    400 
    401 /*uses uprv_itou to get a unicode escape sequence of the offensive sequence,
    402  *and uses that as the substitution sequence
    403  */
    404 U_CAPI void   U_EXPORT2
    405 UCNV_TO_U_CALLBACK_ESCAPE (
    406                  const void *context,
    407                  UConverterToUnicodeArgs *toArgs,
    408                  const char* codeUnits,
    409                  int32_t length,
    410                  UConverterCallbackReason reason,
    411                  UErrorCode * err)
    412 {
    413     UChar uniValueString[VALUE_STRING_LENGTH];
    414     int32_t valueStringLength = 0;
    415     int32_t i = 0;
    416 
    417     if (reason > UCNV_IRREGULAR)
    418     {
    419         return;
    420     }
    421 
    422     if(context==NULL)
    423     {
    424         while (i < length)
    425         {
    426             uniValueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
    427             uniValueString[valueStringLength++] = (UChar) UNICODE_X_CODEPOINT;    /* adding X */
    428             valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t) codeUnits[i++], 16, 2);
    429         }
    430     }
    431     else
    432     {
    433         switch(*((char*)context))
    434         {
    435         case UCNV_PRV_ESCAPE_XML_DEC:
    436             while (i < length)
    437             {
    438                 uniValueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT;   /* adding & */
    439                 uniValueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT;  /* adding # */
    440                 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 10, 0);
    441                 uniValueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
    442             }
    443             break;
    444 
    445         case UCNV_PRV_ESCAPE_XML_HEX:
    446             while (i < length)
    447             {
    448                 uniValueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT;   /* adding & */
    449                 uniValueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT;  /* adding # */
    450                 uniValueString[valueStringLength++] = (UChar) UNICODE_X_LOW_CODEPOINT; /* adding x */
    451                 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 16, 0);
    452                 uniValueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
    453             }
    454             break;
    455         case UCNV_PRV_ESCAPE_C:
    456             while (i < length)
    457             {
    458                 uniValueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT;    /* adding \ */
    459                 uniValueString[valueStringLength++] = (UChar) UNICODE_X_LOW_CODEPOINT; /* adding x */
    460                 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 16, 2);
    461             }
    462             break;
    463         default:
    464             while (i < length)
    465             {
    466                 uniValueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
    467                 uniValueString[valueStringLength++] = (UChar) UNICODE_X_CODEPOINT;    /* adding X */
    468                 uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t) codeUnits[i++], 16, 2);
    469                 valueStringLength += 2;
    470             }
    471         }
    472     }
    473     /* reset the error */
    474     *err = U_ZERO_ERROR;
    475 
    476     ucnv_cbToUWriteUChars(toArgs, uniValueString, valueStringLength, 0, err);
    477 }
    478 
    479 #endif
    480