Home | History | Annotate | Download | only in charset
      1 //  2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /**
      4 *******************************************************************************
      5 * Copyright (C) 2006-2014, International Business Machines Corporation and
      6 * others. All Rights Reserved.
      7 *******************************************************************************
      8 */
      9 
     10 package com.ibm.icu.charset;
     11 
     12 import java.nio.ByteBuffer;
     13 import java.nio.CharBuffer;
     14 import java.nio.IntBuffer;
     15 import java.nio.charset.CoderResult;
     16 
     17 /**
     18  * <h2> Callback API for CharsetICU API </h2>
     19  *
     20  *  CharsetCallback class defines some error behaviour functions called
     21  *  by CharsetDecoderICU and CharsetEncoderICU. The class also provides
     22  *  the facility by which clients can write their own callbacks.
     23  *
     24  *  These functions, although public, should NEVER be called directly.
 *  They should be used as parameters to the onUnmappableCharacter() and
     26  *  onMalformedInput() methods, to set the behaviour of a converter
     27  *  when it encounters UNMAPPED/INVALID sequences.
     28  *  Currently the only way to set callbacks is by using CodingErrorAction.
     29  *  In the future we will provide set methods on CharsetEncoder and CharsetDecoder
     30  *  that will accept CharsetCallback fields.
     31  *
     32  * @stable ICU 3.6
     33  */
     34 
     35 public class CharsetCallback {
    /*
     * FROM_U, TO_U context option for the skip and substitute callbacks.
     * When a callback receives this value as its context it only handles
     * unmappable input; any other error result is handed back unchanged.
     */
    private static final String SUB_STOP_ON_ILLEGAL = "i";

//    /*
//     * FROM_U, TO_U context options for skip callback
//     */
//    private static final String SKIP_STOP_ON_ILLEGAL = "i";

//    /*
//     * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to ICU (%UXXXX)
//     */
//    private static final String ESCAPE_ICU  = null;

    /*
     * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to Java (\\uXXXX)
     */
    private static final String ESCAPE_JAVA     =  "J";

    /*
     * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to C (\\uXXXX \\UXXXXXXXX)
     * TO_U_CALLBACK_ESCAPE context option to escape each byte value according to C (\\xXX)
     */
    private static final String ESCAPE_C        = "C";

    /*
     * FROM_U_CALLBACK_ESCAPE context option to escape the code point as an XML decimal escape (&#DDDD;)
     * TO_U_CALLBACK_ESCAPE context option to escape each byte value as an XML decimal escape (&#DDDD;)
     */
    private static final String ESCAPE_XML_DEC  = "D";

    /*
     * FROM_U_CALLBACK_ESCAPE context option to escape the code point as an XML hex escape (&#xXXXX;)
     * TO_U_CALLBACK_ESCAPE context option to escape each byte value as an XML hex escape (&#xXXXX;)
     */
    private static final String ESCAPE_XML_HEX  = "X";

    /*
     * FROM_U_CALLBACK_ESCAPE context option to escape the code point in Unicode notation ({U+XXXX})
     */
    private static final String ESCAPE_UNICODE  = "U";

    /*
     * FROM_U_CALLBACK_ESCAPE context option to escape the code point according to CSS2:
     * a backslash, the hex digits of the code point, and a terminating space.
     */
    private static final String ESCAPE_CSS2  = "S";
     83 
     84     /*
     85      * IS_DEFAULT_IGNORABLE_CODE_POINT
     86      * This is to check if a code point has the default ignorable unicode property.
     87      * As such, this list needs to be updated if the ignorable code point list ever
     88      * changes.
     89      * To avoid dependency on other code, this list is hard coded here.
     90      * When an ignorable code point is found and is unmappable, the default callbacks
     91      * will ignore them.
     92      * For a list of the default ignorable code points, use this link: http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[%3ADI%3A]&g=
     93      *
     94      * This list should be sync with the one in ucnv_err.c
     95      *
     96      */
     97     private static boolean IS_DEFAULT_IGNORABLE_CODE_POINT(int c) {
     98         return ((c == 0x00AD) ||
     99                 (c == 0x034F) ||
    100                 (c == 0x061C) ||
    101                 (c == 0x115F) ||
    102                 (c == 0x1160) ||
    103                 (0x17B4 <= c && c <= 0x17B5) ||
    104                 (0x180B <= c && c <= 0x180E) ||
    105                 (0x200B <= c && c <= 0x200F) ||
    106                 (0x202A <= c && c <= 0x202E) ||
    107                 (c == 0x2060) ||
    108                 (0x2066 <= c && c <= 0x2069) ||
    109                 (0x2061 <= c && c <= 0x2064) ||
    110                 (0x206A <= c && c <= 0x206F) ||
    111                 (c == 0x3164) ||
    112                 (0x0FE00 <= c && c <= 0x0FE0F) ||
    113                 (c == 0x0FEFF) ||
    114                 (c == 0x0FFA0) ||
    115                 (0x01BCA0  <= c && c <= 0x01BCA3) ||
    116                 (0x01D173 <= c && c <= 0x01D17A) ||
    117                 (c == 0x0E0001) ||
    118                 (0x0E0020 <= c && c <= 0x0E007F) ||
    119                 (0x0E0100 <= c && c <= 0x0E01EF) ||
    120                 (c == 0x2065) ||
    121                 (0x0FFF0 <= c && c <= 0x0FFF8) ||
    122                 (c == 0x0E0000) ||
    123                 (0x0E0002 <= c && c <= 0x0E001F) ||
    124                 (0x0E0080 <= c && c <= 0x0E00FF) ||
    125                 (0x0E01F0 <= c && c <= 0x0E0FFF)
    126                 );
    127     }
    128     /**
    129      * Decoder Callback interface
    130      * @stable ICU 3.6
    131      */
    132     public interface Decoder {
    133         /**
    134          * This function is called when the bytes in the source cannot be handled,
    135          * and this function is meant to handle or fix the error if possible.
    136          *
    137          * @return Result of decoding action. This returned object is set to an error
    138          *  if this function could not handle the conversion.
    139          * @stable ICU 3.6
    140          */
    141         public CoderResult call(CharsetDecoderICU decoder, Object context,
    142                                 ByteBuffer source, CharBuffer target, IntBuffer offsets,
    143                                 char[] buffer, int length, CoderResult cr);
    144     }
    145     /**
    146      * Encoder Callback interface
    147      * @stable ICU 3.6
    148      */
    149     public interface Encoder {
    150         /**
    151          * This function is called when the Unicode characters in the source cannot be handled,
    152          * and this function is meant to handle or fix the error if possible.
    153          * @return Result of decoding action. This returned object is set to an error
    154          *  if this function could not handle the conversion.
    155          * @stable ICU 3.6
    156          */
    157         public CoderResult call(CharsetEncoderICU encoder, Object context,
    158                                 CharBuffer source, ByteBuffer target, IntBuffer offsets,
    159                                 char[] buffer, int length, int cp, CoderResult cr);
    160     }
    161     /**
    162      * Skip callback
    163      * @stable ICU 3.6
    164      */
    165     public static final Encoder FROM_U_CALLBACK_SKIP = new Encoder() {
    166         @Override
    167         public CoderResult call(CharsetEncoderICU encoder, Object context,
    168                 CharBuffer source, ByteBuffer target, IntBuffer offsets,
    169                 char[] buffer, int length, int cp, CoderResult cr){
    170             if(context==null){
    171                 return CoderResult.UNDERFLOW;
    172             }else if(((String)context).equals(SUB_STOP_ON_ILLEGAL)){
    173                 if(!cr.isUnmappable()){
    174                     return cr;
    175                 }else{
    176                     return CoderResult.UNDERFLOW;
    177                 }
    178             }
    179             return cr;
    180         }
    181     };
    182     /**
    183      * Skip callback
    184      * @stable ICU 3.6
    185      */
    186     public static final Decoder TO_U_CALLBACK_SKIP = new Decoder() {
    187         @Override
    188         public CoderResult call(CharsetDecoderICU decoder, Object context,
    189                 ByteBuffer source, CharBuffer target, IntBuffer offsets,
    190                 char[] buffer, int length, CoderResult cr){
    191             if(context==null){
    192                 return CoderResult.UNDERFLOW;
    193             }else if(((String)context).equals(SUB_STOP_ON_ILLEGAL)){
    194                 if(!cr.isUnmappable()){
    195                     return cr;
    196                 }else{
    197                     return CoderResult.UNDERFLOW;
    198                 }
    199             }
    200             return cr;
    201         }
    202     };
    203     /**
    204      * Write substitute callback
    205      * @stable ICU 3.6
    206      */
    207     public static final Encoder FROM_U_CALLBACK_SUBSTITUTE = new Encoder(){
    208         @Override
    209         public CoderResult call(CharsetEncoderICU encoder, Object context,
    210                 CharBuffer source, ByteBuffer target, IntBuffer offsets,
    211                 char[] buffer, int length, int cp, CoderResult cr){
    212             if (cr.isUnmappable() && IS_DEFAULT_IGNORABLE_CODE_POINT(cp)) {
    213                 return CoderResult.UNDERFLOW;
    214             }else if(context==null){
    215                 return encoder.cbFromUWriteSub(encoder, source, target, offsets);
    216             }else if(((String)context).equals(SUB_STOP_ON_ILLEGAL)){
    217                 if(!cr.isUnmappable()){
    218                     return cr;
    219                 }else{
    220                    return encoder.cbFromUWriteSub(encoder, source, target, offsets);
    221                 }
    222             }
    223             return cr;
    224         }
    225     };
    226     private static final char[] kSubstituteChar1 = new char[]{0x1A};
    227     private static final char[] kSubstituteChar = new char[] {0xFFFD};
    228     /**
    229      * Write substitute callback
    230      * @stable ICU 3.6
    231      */
    232     public static final Decoder TO_U_CALLBACK_SUBSTITUTE  = new Decoder() {
    233         @Override
    234         public CoderResult call(CharsetDecoderICU decoder, Object context,
    235                 ByteBuffer source, CharBuffer target, IntBuffer offsets,
    236                 char[] buffer, int length, CoderResult cr){
    237 
    238             CharsetICU cs = (CharsetICU) decoder.charset();
    239             /* Use the specified replacement character if it is different than the default one. */
    240             boolean useReplacement = true;
    241             char [] replacementChar = decoder.replacement().toCharArray();
    242             if (replacementChar.length == 1 && (replacementChar[0] == kSubstituteChar1[0] || replacementChar[0] == kSubstituteChar[0])) {
    243                 useReplacement = false;
    244             }
    245 
    246             /* could optimize this case, just one uchar */
    247             if(decoder.invalidCharLength == 1 && cs.subChar1 != 0) {
    248                 return CharsetDecoderICU.toUWriteUChars(decoder, useReplacement ? replacementChar : kSubstituteChar1, 0, useReplacement ? replacementChar.length : 1, target, offsets, source.position());
    249             } else {
    250                 return CharsetDecoderICU.toUWriteUChars(decoder, useReplacement ? replacementChar : kSubstituteChar, 0, useReplacement ? replacementChar.length : 1, target, offsets, source.position());
    251             }
    252         }
    253     };
    254     /**
    255      * Stop callback
    256      * @stable ICU 3.6
    257      */
    258     public static final Encoder FROM_U_CALLBACK_STOP = new Encoder() {
    259         @Override
    260         public CoderResult call(CharsetEncoderICU encoder, Object context,
    261                 CharBuffer source, ByteBuffer target, IntBuffer offsets,
    262                 char[] buffer, int length, int cp, CoderResult cr){
    263             if (cr.isUnmappable() && IS_DEFAULT_IGNORABLE_CODE_POINT(cp)) {
    264                 return CoderResult.UNDERFLOW;
    265             }
    266             return cr;
    267         }
    268     };
    269     /**
    270      * Stop callback
    271      * @stable ICU 3.6
    272      */
    273     public static final Decoder TO_U_CALLBACK_STOP = new Decoder() {
    274         @Override
    275         public CoderResult call(CharsetDecoderICU decoder, Object context,
    276                 ByteBuffer source, CharBuffer target, IntBuffer offsets,
    277                 char[] buffer, int length, CoderResult cr){
    278             return cr;
    279         }
    280     };
    281     private static final int VALUE_STRING_LENGTH = 32;
    282     private static final char UNICODE_PERCENT_SIGN_CODEPOINT    = 0x0025;
    283     private static final char UNICODE_U_CODEPOINT               = 0x0055;
    284     private static final char UNICODE_X_CODEPOINT               = 0x0058;
    285     private static final char UNICODE_RS_CODEPOINT              = 0x005C;
    286     private static final char UNICODE_U_LOW_CODEPOINT           = 0x0075;
    287     private static final char UNICODE_X_LOW_CODEPOINT           = 0x0078;
    288     private static final char UNICODE_AMP_CODEPOINT             = 0x0026;
    289     private static final char UNICODE_HASH_CODEPOINT            = 0x0023;
    290     private static final char UNICODE_SEMICOLON_CODEPOINT       = 0x003B;
    291     private static final char UNICODE_PLUS_CODEPOINT            = 0x002B;
    292     private static final char UNICODE_LEFT_CURLY_CODEPOINT      = 0x007B;
    293     private static final char UNICODE_RIGHT_CURLY_CODEPOINT     = 0x007D;
    294     private static final char UNICODE_SPACE_CODEPOINT           = 0x0020;
    295     /**
    296      * Write escape callback
    297      * @stable ICU 4.0
    298      */
    299     public static final Encoder FROM_U_CALLBACK_ESCAPE = new Encoder() {
    300         @Override
    301         public CoderResult call(CharsetEncoderICU encoder, Object context,
    302                 CharBuffer source, ByteBuffer target, IntBuffer offsets,
    303                 char[] buffer, int length, int cp, CoderResult cr){
    304             char[] valueString = new char[VALUE_STRING_LENGTH];
    305             int valueStringLength = 0;
    306             int i = 0;
    307 
    308             if (cr.isUnmappable() && IS_DEFAULT_IGNORABLE_CODE_POINT(cp)) {
    309                 return CoderResult.UNDERFLOW;
    310             }
    311 
    312             if (context == null || !(context instanceof String)) {
    313                 while (i < length) {
    314                     valueString[valueStringLength++] = UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
    315                     valueString[valueStringLength++] = UNICODE_U_CODEPOINT; /* adding U */
    316                     valueStringLength += itou(valueString, valueStringLength, buffer[i++], 16, 4);
    317                 }
    318             } else {
    319                 if (((String)context).equals(ESCAPE_JAVA)) {
    320                     while (i < length) {
    321                         valueString[valueStringLength++] = UNICODE_RS_CODEPOINT;    /* adding \ */
    322                         valueString[valueStringLength++] = UNICODE_U_LOW_CODEPOINT; /* adding u */
    323                         valueStringLength += itou(valueString, valueStringLength, buffer[i++], 16, 4);
    324                     }
    325                 } else if (((String)context).equals(ESCAPE_C)) {
    326                     valueString[valueStringLength++] = UNICODE_RS_CODEPOINT;    /* adding \ */
    327 
    328                     if (length == 2) {
    329                         valueString[valueStringLength++] = UNICODE_U_CODEPOINT; /* adding U */
    330                         valueStringLength = itou(valueString, valueStringLength, cp, 16, 8);
    331                     } else {
    332                         valueString[valueStringLength++] = UNICODE_U_LOW_CODEPOINT; /* adding u */
    333                         valueStringLength += itou(valueString, valueStringLength, buffer[0], 16, 4);
    334                     }
    335                 } else if (((String)context).equals(ESCAPE_XML_DEC)) {
    336                     valueString[valueStringLength++] = UNICODE_AMP_CODEPOINT;   /* adding & */
    337                     valueString[valueStringLength++] = UNICODE_HASH_CODEPOINT;  /* adding # */
    338                     if (length == 2) {
    339                         valueStringLength += itou(valueString, valueStringLength, cp, 10, 0);
    340                     } else {
    341                         valueStringLength += itou(valueString, valueStringLength, buffer[0], 10, 0);
    342                     }
    343                     valueString[valueStringLength++] = UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
    344                 } else if (((String)context).equals(ESCAPE_XML_HEX)) {
    345                     valueString[valueStringLength++] = UNICODE_AMP_CODEPOINT;   /* adding & */
    346                     valueString[valueStringLength++] = UNICODE_HASH_CODEPOINT;  /* adding # */
    347                     valueString[valueStringLength++] = UNICODE_X_LOW_CODEPOINT; /* adding x */
    348                     if (length == 2) {
    349                         valueStringLength += itou(valueString, valueStringLength, cp, 16, 0);
    350                     } else {
    351                         valueStringLength += itou(valueString, valueStringLength, buffer[0], 16, 0);
    352                     }
    353                     valueString[valueStringLength++] = UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
    354                 } else if (((String)context).equals(ESCAPE_UNICODE)) {
    355                     valueString[valueStringLength++] = UNICODE_LEFT_CURLY_CODEPOINT;    /* adding { */
    356                     valueString[valueStringLength++] = UNICODE_U_CODEPOINT;             /* adding U */
    357                     valueString[valueStringLength++] = UNICODE_PLUS_CODEPOINT;          /* adding + */
    358                     if (length == 2) {
    359                         valueStringLength += itou(valueString, valueStringLength,cp, 16, 4);
    360                     } else {
    361                         valueStringLength += itou(valueString, valueStringLength, buffer[0], 16, 4);
    362                     }
    363                     valueString[valueStringLength++] = UNICODE_RIGHT_CURLY_CODEPOINT;   /* adding } */
    364                 } else if (((String)context).equals(ESCAPE_CSS2)) {
    365                     valueString[valueStringLength++] = UNICODE_RS_CODEPOINT;    /* adding \ */
    366                     valueStringLength += itou(valueString, valueStringLength, cp, 16, 0);
    367                     /* Always add space character, because the next character might be whitespace,
    368                        which would erroneously be considered the termination of the escape sequence. */
    369                     valueString[valueStringLength++] = UNICODE_SPACE_CODEPOINT;
    370                 } else {
    371                     while (i < length) {
    372                         valueString[valueStringLength++] = UNICODE_PERCENT_SIGN_CODEPOINT;  /* adding % */
    373                         valueString[valueStringLength++] = UNICODE_U_CODEPOINT;             /* adding U */
    374                         valueStringLength += itou(valueString, valueStringLength, buffer[i++], 16, 4);
    375                     }
    376                 }
    377             }
    378             return encoder.cbFromUWriteUChars(encoder, CharBuffer.wrap(valueString, 0, valueStringLength), target, offsets);
    379         }
    380     };
    381     /**
    382      * Write escape callback
    383      * @stable ICU 4.0
    384      */
    385     public static final Decoder TO_U_CALLBACK_ESCAPE = new Decoder() {
    386         @Override
    387         public CoderResult call(CharsetDecoderICU decoder, Object context,
    388                 ByteBuffer source, CharBuffer target, IntBuffer offsets,
    389                 char[] buffer, int length, CoderResult cr){
    390             char[] uniValueString = new char[VALUE_STRING_LENGTH];
    391             int valueStringLength = 0;
    392             int i = 0;
    393 
    394             if (context == null || !(context instanceof String)) {
    395                 while (i < length) {
    396                     uniValueString[valueStringLength++] = UNICODE_PERCENT_SIGN_CODEPOINT;   /* adding % */
    397                     uniValueString[valueStringLength++] = UNICODE_X_CODEPOINT;              /* adding U */
    398                     valueStringLength += itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 16, 2);
    399                 }
    400             } else {
    401                 if (((String)context).equals(ESCAPE_XML_DEC)) {
    402                     while (i < length) {
    403                         uniValueString[valueStringLength++] = UNICODE_AMP_CODEPOINT;    /* adding & */
    404                         uniValueString[valueStringLength++] = UNICODE_HASH_CODEPOINT;   /* adding # */
    405                         valueStringLength += itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 10, 0);
    406                         uniValueString[valueStringLength++] = UNICODE_SEMICOLON_CODEPOINT;  /* adding ; */
    407                     }
    408                 } else if (((String)context).equals(ESCAPE_XML_HEX)) {
    409                     while (i < length) {
    410                         uniValueString[valueStringLength++] = UNICODE_AMP_CODEPOINT;    /* adding & */
    411                         uniValueString[valueStringLength++] = UNICODE_HASH_CODEPOINT;   /* adding # */
    412                         uniValueString[valueStringLength++] = UNICODE_X_LOW_CODEPOINT;  /* adding x */
    413                         valueStringLength += itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 16, 0);
    414                         uniValueString[valueStringLength++] = UNICODE_SEMICOLON_CODEPOINT;  /* adding ; */
    415                     }
    416                 } else if (((String)context).equals(ESCAPE_C)) {
    417                     while (i < length) {
    418                         uniValueString[valueStringLength++] = UNICODE_RS_CODEPOINT;         /* adding \ */
    419                         uniValueString[valueStringLength++] = UNICODE_X_LOW_CODEPOINT;      /* adding x */
    420                         valueStringLength += itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 16, 2);
    421                     }
    422                 } else {
    423                     while (i < length) {
    424                         uniValueString[valueStringLength++] = UNICODE_PERCENT_SIGN_CODEPOINT;   /* adding % */
    425                         uniValueString[valueStringLength++] = UNICODE_X_CODEPOINT;              /* adding X */
    426                         itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 16, 2);
    427                         valueStringLength += 2;
    428                     }
    429                 }
    430             }
    431 
    432             cr = CharsetDecoderICU.toUWriteUChars(decoder, uniValueString, 0, valueStringLength, target, offsets, 0);
    433 
    434             return cr;
    435         }
    436     };
    437     /***
    438      * Java port of uprv_itou() in ICU4C used by TO_U_CALLBACK_ESCAPE and FROM_U_CALLBACK_ESCAPE.
    439      * Fills in a char string with the radix-based representation of a number padded with zeroes
    440      * to minwidth.
    441      */
    442     private static final int itou(char[] buffer, int sourceIndex, int i, int radix, int minwidth) {
    443         int length = 0;
    444         int digit;
    445         int j;
    446         char temp;
    447 
    448         do {
    449             digit = i % radix;
    450             buffer[sourceIndex + length++] = (char)(digit <= 9 ? (0x0030+digit) : (0x0030+digit+7));
    451             i = i/radix;
    452         } while (i != 0 && (sourceIndex + length) < buffer.length);
    453 
    454         while (length < minwidth) {
    455             buffer[sourceIndex + length++] = (char)0x0030; /* zero padding */
    456         }
    457         /* reverses the string */
    458         for (j = 0; j < (length / 2); j++) {
    459             temp = buffer[(sourceIndex + length - 1) - j];
    460             buffer[(sourceIndex + length-1) -j] = buffer[sourceIndex + j];
    461             buffer[sourceIndex + j] = temp;
    462         }
    463 
    464         return length;
    465     }
    466 
    /*
     * This class only holds static members, so instantiation is disabled.
     */
    private CharsetCallback() {}
    472 }
    473