Home | History | Annotate | Download | only in common
      1 // Copyright (C) 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4  *****************************************************************************
      5  *
      6  *   Copyright (C) 1998-2016, International Business Machines
      7  *   Corporation and others.  All Rights Reserved.
      8  *
      9  *****************************************************************************
     10  *
     11  *  ucnv_err.c
     12  *  Implements error behaviour functions called by T_UConverter_{from,to}Unicode
     13  *
     14  *
     15 *   Change history:
     16 *
     17 *   06/29/2000  helena      Major rewrite of the callback APIs.
     18 */
     19 
     20 #include "unicode/utypes.h"
     21 
     22 #if !UCONFIG_NO_CONVERSION
     23 
     24 #include "unicode/ucnv_err.h"
     25 #include "unicode/ucnv_cb.h"
     26 #include "ucnv_cnv.h"
     27 #include "cmemory.h"
     28 #include "unicode/ucnv.h"
     29 #include "ustrfmt.h"
     30 
     31 #define VALUE_STRING_LENGTH 48
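/* VALUE_STRING_LENGTH is the capacity, in UChars, of the escape-sequence buffers used by the ESCAPE callbacks below.
 * NOTE: the historical sizing note "32 = 4 (number of chars in value string) * 8 (max number of bytes per char for any converter)"
 * does not match the current value of 48. */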
     33 #define UNICODE_PERCENT_SIGN_CODEPOINT  0x0025
     34 #define UNICODE_U_CODEPOINT             0x0055
     35 #define UNICODE_X_CODEPOINT             0x0058
     36 #define UNICODE_RS_CODEPOINT            0x005C
     37 #define UNICODE_U_LOW_CODEPOINT         0x0075
     38 #define UNICODE_X_LOW_CODEPOINT         0x0078
     39 #define UNICODE_AMP_CODEPOINT           0x0026
     40 #define UNICODE_HASH_CODEPOINT          0x0023
     41 #define UNICODE_SEMICOLON_CODEPOINT     0x003B
     42 #define UNICODE_PLUS_CODEPOINT          0x002B
     43 #define UNICODE_LEFT_CURLY_CODEPOINT    0x007B
     44 #define UNICODE_RIGHT_CURLY_CODEPOINT   0x007D
     45 #define UNICODE_SPACE_CODEPOINT         0x0020
     46 #define UCNV_PRV_ESCAPE_ICU         0
     47 #define UCNV_PRV_ESCAPE_C           'C'
     48 #define UCNV_PRV_ESCAPE_XML_DEC     'D'
     49 #define UCNV_PRV_ESCAPE_XML_HEX     'X'
     50 #define UCNV_PRV_ESCAPE_JAVA        'J'
     51 #define UCNV_PRV_ESCAPE_UNICODE     'U'
     52 #define UCNV_PRV_ESCAPE_CSS2        'S'
     53 #define UCNV_PRV_STOP_ON_ILLEGAL    'i'
     54 
     55 /*
     56  * IS_DEFAULT_IGNORABLE_CODE_POINT
     57  * This is to check if a code point has the default ignorable unicode property.
     58  * As such, this list needs to be updated if the ignorable code point list ever
     59  * changes.
     60  * To avoid dependency on other code, this list is hard coded here.
     61  * When an ignorable code point is found and is unmappable, the default callbacks
     62  * will ignore them.
     63  * For a list of the default ignorable code points, use this link: http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[%3ADI%3A]&g=
     64  *
     65  * This list should be sync with the one in CharsetCallback.java
     66  */
     67 #define IS_DEFAULT_IGNORABLE_CODE_POINT(c) (\
     68     (c == 0x00AD) || \
     69     (c == 0x034F) || \
     70     (c == 0x061C) || \
     71     (c == 0x115F) || \
     72     (c == 0x1160) || \
     73     (0x17B4 <= c && c <= 0x17B5) || \
     74     (0x180B <= c && c <= 0x180E) || \
     75     (0x200B <= c && c <= 0x200F) || \
     76     (0x202A <= c && c <= 0x202E) || \
     77     (c == 0x2060) || \
     78     (0x2066 <= c && c <= 0x2069) || \
     79     (0x2061 <= c && c <= 0x2064) || \
     80     (0x206A <= c && c <= 0x206F) || \
     81     (c == 0x3164) || \
     82     (0x0FE00 <= c && c <= 0x0FE0F) || \
     83     (c == 0x0FEFF) || \
     84     (c == 0x0FFA0) || \
     85     (0x01BCA0  <= c && c <= 0x01BCA3) || \
     86     (0x01D173 <= c && c <= 0x01D17A) || \
     87     (c == 0x0E0001) || \
     88     (0x0E0020 <= c && c <= 0x0E007F) || \
     89     (0x0E0100 <= c && c <= 0x0E01EF) || \
     90     (c == 0x2065) || \
     91     (0x0FFF0 <= c && c <= 0x0FFF8) || \
     92     (c == 0x0E0000) || \
     93     (0x0E0002 <= c && c <= 0x0E001F) || \
     94     (0x0E0080 <= c && c <= 0x0E00FF) || \
     95     (0x0E01F0 <= c && c <= 0x0E0FFF) \
     96     )
     97 
     98 
     99 /*Function Pointer STOPS at the ILLEGAL_SEQUENCE */
    100 U_CAPI void    U_EXPORT2
    101 UCNV_FROM_U_CALLBACK_STOP (
    102                   const void *context,
    103                   UConverterFromUnicodeArgs *fromUArgs,
    104                   const UChar* codeUnits,
    105                   int32_t length,
    106                   UChar32 codePoint,
    107                   UConverterCallbackReason reason,
    108                   UErrorCode * err)
    109 {
    110     if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
    111     {
    112         /*
    113          * Skip if the codepoint has unicode property of default ignorable.
    114          */
    115         *err = U_ZERO_ERROR;
    116     }
    117     /* the caller must have set the error code accordingly */
    118     return;
    119 }
    120 
    121 
    122 /*Function Pointer STOPS at the ILLEGAL_SEQUENCE */
    123 U_CAPI void    U_EXPORT2
    124 UCNV_TO_U_CALLBACK_STOP (
    125                    const void *context,
    126                    UConverterToUnicodeArgs *toUArgs,
    127                    const char* codePoints,
    128                    int32_t length,
    129                    UConverterCallbackReason reason,
    130                    UErrorCode * err)
    131 {
    132     /* the caller must have set the error code accordingly */
    133     return;
    134 }
    135 
    136 U_CAPI void    U_EXPORT2
    137 UCNV_FROM_U_CALLBACK_SKIP (
    138                   const void *context,
    139                   UConverterFromUnicodeArgs *fromUArgs,
    140                   const UChar* codeUnits,
    141                   int32_t length,
    142                   UChar32 codePoint,
    143                   UConverterCallbackReason reason,
    144                   UErrorCode * err)
    145 {
    146     if (reason <= UCNV_IRREGULAR)
    147     {
    148         if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
    149         {
    150             /*
    151              * Skip if the codepoint has unicode property of default ignorable.
    152              */
    153             *err = U_ZERO_ERROR;
    154         }
    155         else if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
    156         {
    157             *err = U_ZERO_ERROR;
    158         }
    159         /* else the caller must have set the error code accordingly. */
    160     }
    161     /* else ignore the reset, close and clone calls. */
    162 }
    163 
    164 U_CAPI void    U_EXPORT2
    165 UCNV_FROM_U_CALLBACK_SUBSTITUTE (
    166                   const void *context,
    167                   UConverterFromUnicodeArgs *fromArgs,
    168                   const UChar* codeUnits,
    169                   int32_t length,
    170                   UChar32 codePoint,
    171                   UConverterCallbackReason reason,
    172                   UErrorCode * err)
    173 {
    174     if (reason <= UCNV_IRREGULAR)
    175     {
    176         if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
    177         {
    178             /*
    179              * Skip if the codepoint has unicode property of default ignorable.
    180              */
    181             *err = U_ZERO_ERROR;
    182         }
    183         else if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
    184         {
    185             *err = U_ZERO_ERROR;
    186             ucnv_cbFromUWriteSub(fromArgs, 0, err);
    187         }
    188         /* else the caller must have set the error code accordingly. */
    189     }
    190     /* else ignore the reset, close and clone calls. */
    191 }
    192 
    193 /*uses uprv_itou to get a unicode escape sequence of the offensive sequence,
    194  *uses a clean copy (resetted) of the converter, to convert that unicode
    195  *escape sequence to the target codepage (if conversion failure happens then
    196  *we revert to substituting with subchar)
    197  */
    198 U_CAPI void    U_EXPORT2
    199 UCNV_FROM_U_CALLBACK_ESCAPE (
    200                          const void *context,
    201                          UConverterFromUnicodeArgs *fromArgs,
    202                          const UChar *codeUnits,
    203                          int32_t length,
    204                          UChar32 codePoint,
    205                          UConverterCallbackReason reason,
    206                          UErrorCode * err)
    207 {
    208 
    209   UChar valueString[VALUE_STRING_LENGTH];
    210   int32_t valueStringLength = 0;
    211   int32_t i = 0;
    212 
    213   const UChar *myValueSource = NULL;
    214   UErrorCode err2 = U_ZERO_ERROR;
    215   UConverterFromUCallback original = NULL;
    216   const void *originalContext;
    217 
    218   UConverterFromUCallback ignoredCallback = NULL;
    219   const void *ignoredContext;
    220 
    221   if (reason > UCNV_IRREGULAR)
    222   {
    223       return;
    224   }
    225   else if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint))
    226   {
    227       /*
    228        * Skip if the codepoint has unicode property of default ignorable.
    229        */
    230       *err = U_ZERO_ERROR;
    231       return;
    232   }
    233 
    234   ucnv_setFromUCallBack (fromArgs->converter,
    235                      (UConverterFromUCallback) UCNV_FROM_U_CALLBACK_SUBSTITUTE,
    236                      NULL,
    237                      &original,
    238                      &originalContext,
    239                      &err2);
    240 
    241   if (U_FAILURE (err2))
    242   {
    243     *err = err2;
    244     return;
    245   }
    246   if(context==NULL)
    247   {
    248       while (i < length)
    249       {
    250         valueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT;  /* adding % */
    251         valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT; /* adding U */
    252         valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
    253       }
    254   }
    255   else
    256   {
    257       switch(*((char*)context))
    258       {
    259       case UCNV_PRV_ESCAPE_JAVA:
    260           while (i < length)
    261           {
    262               valueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT;    /* adding \ */
    263               valueString[valueStringLength++] = (UChar) UNICODE_U_LOW_CODEPOINT; /* adding u */
    264               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
    265           }
    266           break;
    267 
    268       case UCNV_PRV_ESCAPE_C:
    269           valueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT;    /* adding \ */
    270 
    271           if(length==2){
    272               valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT; /* adding U */
    273               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 8);
    274 
    275           }
    276           else{
    277               valueString[valueStringLength++] = (UChar) UNICODE_U_LOW_CODEPOINT; /* adding u */
    278               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 4);
    279           }
    280           break;
    281 
    282       case UCNV_PRV_ESCAPE_XML_DEC:
    283 
    284           valueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT;   /* adding & */
    285           valueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT;  /* adding # */
    286           if(length==2){
    287               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 10, 0);
    288           }
    289           else{
    290               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 10, 0);
    291           }
    292           valueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
    293           break;
    294 
    295       case UCNV_PRV_ESCAPE_XML_HEX:
    296 
    297           valueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT;   /* adding & */
    298           valueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT;  /* adding # */
    299           valueString[valueStringLength++] = (UChar) UNICODE_X_LOW_CODEPOINT; /* adding x */
    300           if(length==2){
    301               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 0);
    302           }
    303           else{
    304               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 0);
    305           }
    306           valueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
    307           break;
    308 
    309       case UCNV_PRV_ESCAPE_UNICODE:
    310           valueString[valueStringLength++] = (UChar) UNICODE_LEFT_CURLY_CODEPOINT;    /* adding { */
    311           valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT;    /* adding U */
    312           valueString[valueStringLength++] = (UChar) UNICODE_PLUS_CODEPOINT; /* adding + */
    313           if (length == 2) {
    314               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 4);
    315           } else {
    316               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 4);
    317           }
    318           valueString[valueStringLength++] = (UChar) UNICODE_RIGHT_CURLY_CODEPOINT;    /* adding } */
    319           break;
    320 
    321       case UCNV_PRV_ESCAPE_CSS2:
    322           valueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT;    /* adding \ */
    323           valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 0);
    324           /* Always add space character, becase the next character might be whitespace,
    325              which would erroneously be considered the termination of the escape sequence. */
    326           valueString[valueStringLength++] = (UChar) UNICODE_SPACE_CODEPOINT;
    327           break;
    328 
    329       default:
    330           while (i < length)
    331           {
    332               valueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT;  /* adding % */
    333               valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT;             /* adding U */
    334               valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4);
    335           }
    336       }
    337   }
    338   myValueSource = valueString;
    339 
    340   /* reset the error */
    341   *err = U_ZERO_ERROR;
    342 
    343   ucnv_cbFromUWriteUChars(fromArgs, &myValueSource, myValueSource+valueStringLength, 0, err);
    344 
    345   ucnv_setFromUCallBack (fromArgs->converter,
    346                          original,
    347                          originalContext,
    348                          &ignoredCallback,
    349                          &ignoredContext,
    350                          &err2);
    351   if (U_FAILURE (err2))
    352   {
    353       *err = err2;
    354       return;
    355   }
    356 
    357   return;
    358 }
    359 
    360 
    361 
    362 U_CAPI void  U_EXPORT2
    363 UCNV_TO_U_CALLBACK_SKIP (
    364                  const void *context,
    365                  UConverterToUnicodeArgs *toArgs,
    366                  const char* codeUnits,
    367                  int32_t length,
    368                  UConverterCallbackReason reason,
    369                  UErrorCode * err)
    370 {
    371     if (reason <= UCNV_IRREGULAR)
    372     {
    373         if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
    374         {
    375             *err = U_ZERO_ERROR;
    376         }
    377         /* else the caller must have set the error code accordingly. */
    378     }
    379     /* else ignore the reset, close and clone calls. */
    380 }
    381 
    382 U_CAPI void    U_EXPORT2
    383 UCNV_TO_U_CALLBACK_SUBSTITUTE (
    384                  const void *context,
    385                  UConverterToUnicodeArgs *toArgs,
    386                  const char* codeUnits,
    387                  int32_t length,
    388                  UConverterCallbackReason reason,
    389                  UErrorCode * err)
    390 {
    391     if (reason <= UCNV_IRREGULAR)
    392     {
    393         if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED))
    394         {
    395             *err = U_ZERO_ERROR;
    396             ucnv_cbToUWriteSub(toArgs,0,err);
    397         }
    398         /* else the caller must have set the error code accordingly. */
    399     }
    400     /* else ignore the reset, close and clone calls. */
    401 }
    402 
    403 /*uses uprv_itou to get a unicode escape sequence of the offensive sequence,
    404  *and uses that as the substitution sequence
    405  */
    406 U_CAPI void   U_EXPORT2
    407 UCNV_TO_U_CALLBACK_ESCAPE (
    408                  const void *context,
    409                  UConverterToUnicodeArgs *toArgs,
    410                  const char* codeUnits,
    411                  int32_t length,
    412                  UConverterCallbackReason reason,
    413                  UErrorCode * err)
    414 {
    415     UChar uniValueString[VALUE_STRING_LENGTH];
    416     int32_t valueStringLength = 0;
    417     int32_t i = 0;
    418 
    419     if (reason > UCNV_IRREGULAR)
    420     {
    421         return;
    422     }
    423 
    424     if(context==NULL)
    425     {
    426         while (i < length)
    427         {
    428             uniValueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
    429             uniValueString[valueStringLength++] = (UChar) UNICODE_X_CODEPOINT;    /* adding X */
    430             valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t) codeUnits[i++], 16, 2);
    431         }
    432     }
    433     else
    434     {
    435         switch(*((char*)context))
    436         {
    437         case UCNV_PRV_ESCAPE_XML_DEC:
    438             while (i < length)
    439             {
    440                 uniValueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT;   /* adding & */
    441                 uniValueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT;  /* adding # */
    442                 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 10, 0);
    443                 uniValueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
    444             }
    445             break;
    446 
    447         case UCNV_PRV_ESCAPE_XML_HEX:
    448             while (i < length)
    449             {
    450                 uniValueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT;   /* adding & */
    451                 uniValueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT;  /* adding # */
    452                 uniValueString[valueStringLength++] = (UChar) UNICODE_X_LOW_CODEPOINT; /* adding x */
    453                 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 16, 0);
    454                 uniValueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
    455             }
    456             break;
    457         case UCNV_PRV_ESCAPE_C:
    458             while (i < length)
    459             {
    460                 uniValueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT;    /* adding \ */
    461                 uniValueString[valueStringLength++] = (UChar) UNICODE_X_LOW_CODEPOINT; /* adding x */
    462                 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 16, 2);
    463             }
    464             break;
    465         default:
    466             while (i < length)
    467             {
    468                 uniValueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
    469                 uniValueString[valueStringLength++] = (UChar) UNICODE_X_CODEPOINT;    /* adding X */
    470                 uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t) codeUnits[i++], 16, 2);
    471                 valueStringLength += 2;
    472             }
    473         }
    474     }
    475     /* reset the error */
    476     *err = U_ZERO_ERROR;
    477 
    478     ucnv_cbToUWriteUChars(toArgs, uniValueString, valueStringLength, 0, err);
    479 }
    480 
    481 #endif
    482