Home | History | Annotate | Download | only in charset
      1 /**
      2 *******************************************************************************
      3 * Copyright (C) 2006-2014, International Business Machines Corporation and
      4 * others. All Rights Reserved.
      5 *******************************************************************************
      6 */
      7 
      8 package com.ibm.icu.charset;
      9 
     10 import java.nio.ByteBuffer;
     11 import java.nio.CharBuffer;
     12 import java.nio.IntBuffer;
     13 import java.nio.charset.CoderResult;
     14 
     15 /**
     16  * <h2> Callback API for CharsetICU API </h2>
     17  *
     18  *  CharsetCallback class defines some error behaviour functions called
     19  *  by CharsetDecoderICU and CharsetEncoderICU. The class also provides
     20  *  the facility by which clients can write their own callbacks.
     21  *
     22  *  These functions, although public, should NEVER be called directly.
 *  They should be used as parameters to the onUnmappableCharacter() and
     24  *  onMalformedInput() methods, to set the behaviour of a converter
     25  *  when it encounters UNMAPPED/INVALID sequences.
     26  *  Currently the only way to set callbacks is by using CodingErrorAction.
     27  *  In the future we will provide set methods on CharsetEncoder and CharsetDecoder
     28  *  that will accept CharsetCallback fields.
     29  *
     30  * @stable ICU 3.6
     31  */
     32 
     33 public class CharsetCallback {
     34     /*
     35      * FROM_U, TO_U context options for sub callback
     36      */
     37     private static final String SUB_STOP_ON_ILLEGAL = "i";
     38 
     39 //    /*
     40 //     * FROM_U, TO_U context options for skip callback
     41 //     */
     42 //    private static final String SKIP_STOP_ON_ILLEGAL = "i";
     43 
     44 //    /*
     45 //     * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to ICU (%UXXXX)
     46 //     */
     47 //    private static final String ESCAPE_ICU  = null;
     48 
     49     /*
     50      * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to JAVA (\\uXXXX)
     51      */
     52     private static final String ESCAPE_JAVA     =  "J";
     53 
     54     /*
     55      * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to C (\\uXXXX \\UXXXXXXXX)
     56      * TO_U_CALLBACK_ESCAPE option to escape the character value accoding to C (\\xXXXX)
     57      */
     58     private static final String ESCAPE_C        = "C";
     59 
     60     /*
     61      * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to XML Decimal escape \htmlonly(&amp;#DDDD;)\endhtmlonly
     62      * TO_U_CALLBACK_ESCAPE context option to escape the character value accoding to XML Decimal escape \htmlonly(&amp;#DDDD;)\endhtmlonly
     63      */
     64     private static final String ESCAPE_XML_DEC  = "D";
     65 
     66     /*
     67      * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to XML Hex escape \htmlonly(&amp;#xXXXX;)\endhtmlonly
     68      * TO_U_CALLBACK_ESCAPE context option to escape the character value according to XML Hex escape \htmlonly(&amp;#xXXXX;)\endhtmlonly
     69      */
     70     private static final String ESCAPE_XML_HEX  = "X";
     71 
     72     /*
     73      * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to Unicode (U+XXXXX)
     74      */
     75     private static final String ESCAPE_UNICODE  = "U";
     76 
     77     /*
     78      * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to Unicode (U+XXXXX)
     79      */
     80     private static final String ESCAPE_CSS2  = "S";
     81 
     82     /*
     83      * IS_DEFAULT_IGNORABLE_CODE_POINT
     84      * This is to check if a code point has the default ignorable unicode property.
     85      * As such, this list needs to be updated if the ignorable code point list ever
     86      * changes.
     87      * To avoid dependency on other code, this list is hard coded here.
     88      * When an ignorable code point is found and is unmappable, the default callbacks
     89      * will ignore them.
     90      * For a list of the default ignorable code points, use this link: http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[%3ADI%3A]&g=
     91      *
     92      * This list should be sync with the one in ucnv_err.c
     93      *
     94      */
     95     private static boolean IS_DEFAULT_IGNORABLE_CODE_POINT(int c) {
     96         return ((c == 0x00AD) ||
     97                 (c == 0x034F) ||
     98                 (c == 0x061C) ||
     99                 (c == 0x115F) ||
    100                 (c == 0x1160) ||
    101                 (0x17B4 <= c && c <= 0x17B5) ||
    102                 (0x180B <= c && c <= 0x180E) ||
    103                 (0x200B <= c && c <= 0x200F) ||
    104                 (0x202A <= c && c <= 0x202E) ||
    105                 (c == 0x2060) ||
    106                 (0x2066 <= c && c <= 0x2069) ||
    107                 (0x2061 <= c && c <= 0x2064) ||
    108                 (0x206A <= c && c <= 0x206F) ||
    109                 (c == 0x3164) ||
    110                 (0x0FE00 <= c && c <= 0x0FE0F) ||
    111                 (c == 0x0FEFF) ||
    112                 (c == 0x0FFA0) ||
    113                 (0x01BCA0  <= c && c <= 0x01BCA3) ||
    114                 (0x01D173 <= c && c <= 0x01D17A) ||
    115                 (c == 0x0E0001) ||
    116                 (0x0E0020 <= c && c <= 0x0E007F) ||
    117                 (0x0E0100 <= c && c <= 0x0E01EF) ||
    118                 (c == 0x2065) ||
    119                 (0x0FFF0 <= c && c <= 0x0FFF8) ||
    120                 (c == 0x0E0000) ||
    121                 (0x0E0002 <= c && c <= 0x0E001F) ||
    122                 (0x0E0080 <= c && c <= 0x0E00FF) ||
    123                 (0x0E01F0 <= c && c <= 0x0E0FFF)
    124                 );
    125     }
    126     /**
    127      * Decoder Callback interface
    128      * @stable ICU 3.6
    129      */
    130     public interface Decoder {
    131         /**
    132          * This function is called when the bytes in the source cannot be handled,
    133          * and this function is meant to handle or fix the error if possible.
    134          *
    135          * @return Result of decoding action. This returned object is set to an error
    136          *  if this function could not handle the conversion.
    137          * @stable ICU 3.6
    138          */
    139         public CoderResult call(CharsetDecoderICU decoder, Object context,
    140                                 ByteBuffer source, CharBuffer target, IntBuffer offsets,
    141                                 char[] buffer, int length, CoderResult cr);
    142     }
    143     /**
    144      * Encoder Callback interface
    145      * @stable ICU 3.6
    146      */
    147     public interface Encoder {
    148         /**
    149          * This function is called when the Unicode characters in the source cannot be handled,
    150          * and this function is meant to handle or fix the error if possible.
    151          * @return Result of decoding action. This returned object is set to an error
    152          *  if this function could not handle the conversion.
    153          * @stable ICU 3.6
    154          */
    155         public CoderResult call(CharsetEncoderICU encoder, Object context,
    156                                 CharBuffer source, ByteBuffer target, IntBuffer offsets,
    157                                 char[] buffer, int length, int cp, CoderResult cr);
    158     }
    159     /**
    160      * Skip callback
    161      * @stable ICU 3.6
    162      */
    163     public static final Encoder FROM_U_CALLBACK_SKIP = new Encoder() {
    164         public CoderResult call(CharsetEncoderICU encoder, Object context,
    165                 CharBuffer source, ByteBuffer target, IntBuffer offsets,
    166                 char[] buffer, int length, int cp, CoderResult cr){
    167             if(context==null){
    168                 return CoderResult.UNDERFLOW;
    169             }else if(((String)context).equals(SUB_STOP_ON_ILLEGAL)){
    170                 if(!cr.isUnmappable()){
    171                     return cr;
    172                 }else{
    173                     return CoderResult.UNDERFLOW;
    174                 }
    175             }
    176             return cr;
    177         }
    178     };
    179     /**
    180      * Skip callback
    181      * @stable ICU 3.6
    182      */
    183     public static final Decoder TO_U_CALLBACK_SKIP = new Decoder() {
    184         public CoderResult call(CharsetDecoderICU decoder, Object context,
    185                 ByteBuffer source, CharBuffer target, IntBuffer offsets,
    186                 char[] buffer, int length, CoderResult cr){
    187             if(context==null){
    188                 return CoderResult.UNDERFLOW;
    189             }else if(((String)context).equals(SUB_STOP_ON_ILLEGAL)){
    190                 if(!cr.isUnmappable()){
    191                     return cr;
    192                 }else{
    193                     return CoderResult.UNDERFLOW;
    194                 }
    195             }
    196             return cr;
    197         }
    198     };
    199     /**
    200      * Write substitute callback
    201      * @stable ICU 3.6
    202      */
    203     public static final Encoder FROM_U_CALLBACK_SUBSTITUTE = new Encoder(){
    204         public CoderResult call(CharsetEncoderICU encoder, Object context,
    205                 CharBuffer source, ByteBuffer target, IntBuffer offsets,
    206                 char[] buffer, int length, int cp, CoderResult cr){
    207             if (cr.isUnmappable() && IS_DEFAULT_IGNORABLE_CODE_POINT(cp)) {
    208                 return CoderResult.UNDERFLOW;
    209             }else if(context==null){
    210                 return encoder.cbFromUWriteSub(encoder, source, target, offsets);
    211             }else if(((String)context).equals(SUB_STOP_ON_ILLEGAL)){
    212                 if(!cr.isUnmappable()){
    213                     return cr;
    214                 }else{
    215                    return encoder.cbFromUWriteSub(encoder, source, target, offsets);
    216                 }
    217             }
    218             return cr;
    219         }
    220     };
    221     private static final char[] kSubstituteChar1 = new char[]{0x1A};
    222     private static final char[] kSubstituteChar = new char[] {0xFFFD};
    223     /**
    224      * Write substitute callback
    225      * @stable ICU 3.6
    226      */
    227     public static final Decoder TO_U_CALLBACK_SUBSTITUTE  = new Decoder() {
    228         public CoderResult call(CharsetDecoderICU decoder, Object context,
    229                 ByteBuffer source, CharBuffer target, IntBuffer offsets,
    230                 char[] buffer, int length, CoderResult cr){
    231 
    232             CharsetICU cs = (CharsetICU) decoder.charset();
    233             /* Use the specified replacement character if it is different than the default one. */
    234             boolean useReplacement = true;
    235             char [] replacementChar = decoder.replacement().toCharArray();
    236             if (replacementChar.length == 1 && (replacementChar[0] == kSubstituteChar1[0] || replacementChar[0] == kSubstituteChar[0])) {
    237                 useReplacement = false;
    238             }
    239 
    240             /* could optimize this case, just one uchar */
    241             if(decoder.invalidCharLength == 1 && cs.subChar1 != 0) {
    242                 return CharsetDecoderICU.toUWriteUChars(decoder, useReplacement ? replacementChar : kSubstituteChar1, 0, useReplacement ? replacementChar.length : 1, target, offsets, source.position());
    243             } else {
    244                 return CharsetDecoderICU.toUWriteUChars(decoder, useReplacement ? replacementChar : kSubstituteChar, 0, useReplacement ? replacementChar.length : 1, target, offsets, source.position());
    245             }
    246         }
    247     };
    248     /**
    249      * Stop callback
    250      * @stable ICU 3.6
    251      */
    252     public static final Encoder FROM_U_CALLBACK_STOP = new Encoder() {
    253         public CoderResult call(CharsetEncoderICU encoder, Object context,
    254                 CharBuffer source, ByteBuffer target, IntBuffer offsets,
    255                 char[] buffer, int length, int cp, CoderResult cr){
    256             if (cr.isUnmappable() && IS_DEFAULT_IGNORABLE_CODE_POINT(cp)) {
    257                 return CoderResult.UNDERFLOW;
    258             }
    259             return cr;
    260         }
    261     };
    262     /**
    263      * Stop callback
    264      * @stable ICU 3.6
    265      */
    266     public static final Decoder TO_U_CALLBACK_STOP = new Decoder() {
    267         public CoderResult call(CharsetDecoderICU decoder, Object context,
    268                 ByteBuffer source, CharBuffer target, IntBuffer offsets,
    269                 char[] buffer, int length, CoderResult cr){
    270             return cr;
    271         }
    272     };
    273     private static final int VALUE_STRING_LENGTH = 32;
    274     private static final char UNICODE_PERCENT_SIGN_CODEPOINT    = 0x0025;
    275     private static final char UNICODE_U_CODEPOINT               = 0x0055;
    276     private static final char UNICODE_X_CODEPOINT               = 0x0058;
    277     private static final char UNICODE_RS_CODEPOINT              = 0x005C;
    278     private static final char UNICODE_U_LOW_CODEPOINT           = 0x0075;
    279     private static final char UNICODE_X_LOW_CODEPOINT           = 0x0078;
    280     private static final char UNICODE_AMP_CODEPOINT             = 0x0026;
    281     private static final char UNICODE_HASH_CODEPOINT            = 0x0023;
    282     private static final char UNICODE_SEMICOLON_CODEPOINT       = 0x003B;
    283     private static final char UNICODE_PLUS_CODEPOINT            = 0x002B;
    284     private static final char UNICODE_LEFT_CURLY_CODEPOINT      = 0x007B;
    285     private static final char UNICODE_RIGHT_CURLY_CODEPOINT     = 0x007D;
    286     private static final char UNICODE_SPACE_CODEPOINT           = 0x0020;
    287     /**
    288      * Write escape callback
    289      * @stable ICU 4.0
    290      */
    291     public static final Encoder FROM_U_CALLBACK_ESCAPE = new Encoder() {
    292         public CoderResult call(CharsetEncoderICU encoder, Object context,
    293                 CharBuffer source, ByteBuffer target, IntBuffer offsets,
    294                 char[] buffer, int length, int cp, CoderResult cr){
    295             char[] valueString = new char[VALUE_STRING_LENGTH];
    296             int valueStringLength = 0;
    297             int i = 0;
    298 
    299             if (cr.isUnmappable() && IS_DEFAULT_IGNORABLE_CODE_POINT(cp)) {
    300                 return CoderResult.UNDERFLOW;
    301             }
    302 
    303             if (context == null || !(context instanceof String)) {
    304                 while (i < length) {
    305                     valueString[valueStringLength++] = UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
    306                     valueString[valueStringLength++] = UNICODE_U_CODEPOINT; /* adding U */
    307                     valueStringLength += itou(valueString, valueStringLength, buffer[i++], 16, 4);
    308                 }
    309             } else {
    310                 if (((String)context).equals(ESCAPE_JAVA)) {
    311                     while (i < length) {
    312                         valueString[valueStringLength++] = UNICODE_RS_CODEPOINT;    /* adding \ */
    313                         valueString[valueStringLength++] = UNICODE_U_LOW_CODEPOINT; /* adding u */
    314                         valueStringLength += itou(valueString, valueStringLength, buffer[i++], 16, 4);
    315                     }
    316                 } else if (((String)context).equals(ESCAPE_C)) {
    317                     valueString[valueStringLength++] = UNICODE_RS_CODEPOINT;    /* adding \ */
    318 
    319                     if (length == 2) {
    320                         valueString[valueStringLength++] = UNICODE_U_CODEPOINT; /* adding U */
    321                         valueStringLength = itou(valueString, valueStringLength, cp, 16, 8);
    322                     } else {
    323                         valueString[valueStringLength++] = UNICODE_U_LOW_CODEPOINT; /* adding u */
    324                         valueStringLength += itou(valueString, valueStringLength, buffer[0], 16, 4);
    325                     }
    326                 } else if (((String)context).equals(ESCAPE_XML_DEC)) {
    327                     valueString[valueStringLength++] = UNICODE_AMP_CODEPOINT;   /* adding & */
    328                     valueString[valueStringLength++] = UNICODE_HASH_CODEPOINT;  /* adding # */
    329                     if (length == 2) {
    330                         valueStringLength += itou(valueString, valueStringLength, cp, 10, 0);
    331                     } else {
    332                         valueStringLength += itou(valueString, valueStringLength, buffer[0], 10, 0);
    333                     }
    334                     valueString[valueStringLength++] = UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
    335                 } else if (((String)context).equals(ESCAPE_XML_HEX)) {
    336                     valueString[valueStringLength++] = UNICODE_AMP_CODEPOINT;   /* adding & */
    337                     valueString[valueStringLength++] = UNICODE_HASH_CODEPOINT;  /* adding # */
    338                     valueString[valueStringLength++] = UNICODE_X_LOW_CODEPOINT; /* adding x */
    339                     if (length == 2) {
    340                         valueStringLength += itou(valueString, valueStringLength, cp, 16, 0);
    341                     } else {
    342                         valueStringLength += itou(valueString, valueStringLength, buffer[0], 16, 0);
    343                     }
    344                     valueString[valueStringLength++] = UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
    345                 } else if (((String)context).equals(ESCAPE_UNICODE)) {
    346                     valueString[valueStringLength++] = UNICODE_LEFT_CURLY_CODEPOINT;    /* adding { */
    347                     valueString[valueStringLength++] = UNICODE_U_CODEPOINT;             /* adding U */
    348                     valueString[valueStringLength++] = UNICODE_PLUS_CODEPOINT;          /* adding + */
    349                     if (length == 2) {
    350                         valueStringLength += itou(valueString, valueStringLength,cp, 16, 4);
    351                     } else {
    352                         valueStringLength += itou(valueString, valueStringLength, buffer[0], 16, 4);
    353                     }
    354                     valueString[valueStringLength++] = UNICODE_RIGHT_CURLY_CODEPOINT;   /* adding } */
    355                 } else if (((String)context).equals(ESCAPE_CSS2)) {
    356                     valueString[valueStringLength++] = UNICODE_RS_CODEPOINT;    /* adding \ */
    357                     valueStringLength += itou(valueString, valueStringLength, cp, 16, 0);
    358                     /* Always add space character, because the next character might be whitespace,
    359                        which would erroneously be considered the termination of the escape sequence. */
    360                     valueString[valueStringLength++] = UNICODE_SPACE_CODEPOINT;
    361                 } else {
    362                     while (i < length) {
    363                         valueString[valueStringLength++] = UNICODE_PERCENT_SIGN_CODEPOINT;  /* adding % */
    364                         valueString[valueStringLength++] = UNICODE_U_CODEPOINT;             /* adding U */
    365                         valueStringLength += itou(valueString, valueStringLength, buffer[i++], 16, 4);
    366                     }
    367                 }
    368             }
    369             return encoder.cbFromUWriteUChars(encoder, CharBuffer.wrap(valueString, 0, valueStringLength), target, offsets);
    370         }
    371     };
    372     /**
    373      * Write escape callback
    374      * @stable ICU 4.0
    375      */
    376     public static final Decoder TO_U_CALLBACK_ESCAPE = new Decoder() {
    377         public CoderResult call(CharsetDecoderICU decoder, Object context,
    378                 ByteBuffer source, CharBuffer target, IntBuffer offsets,
    379                 char[] buffer, int length, CoderResult cr){
    380             char[] uniValueString = new char[VALUE_STRING_LENGTH];
    381             int valueStringLength = 0;
    382             int i = 0;
    383 
    384             if (context == null || !(context instanceof String)) {
    385                 while (i < length) {
    386                     uniValueString[valueStringLength++] = UNICODE_PERCENT_SIGN_CODEPOINT;   /* adding % */
    387                     uniValueString[valueStringLength++] = UNICODE_X_CODEPOINT;              /* adding U */
    388                     valueStringLength += itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 16, 2);
    389                 }
    390             } else {
    391                 if (((String)context).equals(ESCAPE_XML_DEC)) {
    392                     while (i < length) {
    393                         uniValueString[valueStringLength++] = UNICODE_AMP_CODEPOINT;    /* adding & */
    394                         uniValueString[valueStringLength++] = UNICODE_HASH_CODEPOINT;   /* adding # */
    395                         valueStringLength += itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 10, 0);
    396                         uniValueString[valueStringLength++] = UNICODE_SEMICOLON_CODEPOINT;  /* adding ; */
    397                     }
    398                 } else if (((String)context).equals(ESCAPE_XML_HEX)) {
    399                     while (i < length) {
    400                         uniValueString[valueStringLength++] = UNICODE_AMP_CODEPOINT;    /* adding & */
    401                         uniValueString[valueStringLength++] = UNICODE_HASH_CODEPOINT;   /* adding # */
    402                         uniValueString[valueStringLength++] = UNICODE_X_LOW_CODEPOINT;  /* adding x */
    403                         valueStringLength += itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 16, 0);
    404                         uniValueString[valueStringLength++] = UNICODE_SEMICOLON_CODEPOINT;  /* adding ; */
    405                     }
    406                 } else if (((String)context).equals(ESCAPE_C)) {
    407                     while (i < length) {
    408                         uniValueString[valueStringLength++] = UNICODE_RS_CODEPOINT;         /* adding \ */
    409                         uniValueString[valueStringLength++] = UNICODE_X_LOW_CODEPOINT;      /* adding x */
    410                         valueStringLength += itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 16, 2);
    411                     }
    412                 } else {
    413                     while (i < length) {
    414                         uniValueString[valueStringLength++] = UNICODE_PERCENT_SIGN_CODEPOINT;   /* adding % */
    415                         uniValueString[valueStringLength++] = UNICODE_X_CODEPOINT;              /* adding X */
    416                         itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 16, 2);
    417                         valueStringLength += 2;
    418                     }
    419                 }
    420             }
    421 
    422             cr = CharsetDecoderICU.toUWriteUChars(decoder, uniValueString, 0, valueStringLength, target, offsets, 0);
    423 
    424             return cr;
    425         }
    426     };
    427     /***
    428      * Java port of uprv_itou() in ICU4C used by TO_U_CALLBACK_ESCAPE and FROM_U_CALLBACK_ESCAPE.
    429      * Fills in a char string with the radix-based representation of a number padded with zeroes
    430      * to minwidth.
    431      */
    432     private static final int itou(char[] buffer, int sourceIndex, int i, int radix, int minwidth) {
    433         int length = 0;
    434         int digit;
    435         int j;
    436         char temp;
    437 
    438         do {
    439             digit = i % radix;
    440             buffer[sourceIndex + length++] = (char)(digit <= 9 ? (0x0030+digit) : (0x0030+digit+7));
    441             i = i/radix;
    442         } while (i != 0 && (sourceIndex + length) < buffer.length);
    443 
    444         while (length < minwidth) {
    445             buffer[sourceIndex + length++] = (char)0x0030; /* zero padding */
    446         }
    447         /* reverses the string */
    448         for (j = 0; j < (length / 2); j++) {
    449             temp = buffer[(sourceIndex + length - 1) - j];
    450             buffer[(sourceIndex + length-1) -j] = buffer[sourceIndex + j];
    451             buffer[sourceIndex + j] = temp;
    452         }
    453 
    454         return length;
    455     }
    456 
    457     /*
    458      * No need to create an instance
    459      */
    460     private CharsetCallback() {
    461     }
    462 }
    463