1 /** 2 ******************************************************************************* 3 * Copyright (C) 2006-2014, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ******************************************************************************* 6 */ 7 8 package com.ibm.icu.charset; 9 10 import java.nio.ByteBuffer; 11 import java.nio.CharBuffer; 12 import java.nio.IntBuffer; 13 import java.nio.charset.CoderResult; 14 15 /** 16 * <h2> Callback API for CharsetICU API </h2> 17 * 18 * CharsetCallback class defines some error behaviour functions called 19 * by CharsetDecoderICU and CharsetEncoderICU. The class also provides 20 * the facility by which clients can write their own callbacks. 21 * 22 * These functions, although public, should NEVER be called directly. 23 * They should be used as parameters to the onUmappableCharacter() and 24 * onMalformedInput() methods, to set the behaviour of a converter 25 * when it encounters UNMAPPED/INVALID sequences. 26 * Currently the only way to set callbacks is by using CodingErrorAction. 27 * In the future we will provide set methods on CharsetEncoder and CharsetDecoder 28 * that will accept CharsetCallback fields. 29 * 30 * @stable ICU 3.6 31 */ 32 33 public class CharsetCallback { 34 /* 35 * FROM_U, TO_U context options for sub callback 36 */ 37 private static final String SUB_STOP_ON_ILLEGAL = "i"; 38 39 // /* 40 // * FROM_U, TO_U context options for skip callback 41 // */ 42 // private static final String SKIP_STOP_ON_ILLEGAL = "i"; 43 44 // /* 45 // * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to ICU (%UXXXX) 46 // */ 47 // private static final String ESCAPE_ICU = null; 48 49 /* 50 * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to JAVA (\\uXXXX) 51 */ 52 private static final String ESCAPE_JAVA = "J"; 53 54 /* 55 * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to C (\\uXXXX \\UXXXXXXXX) 56 * TO_U_CALLBACK_ESCAPE option to escape the character value accoding to C (\\xXXXX) 57 */ 58 private static final String ESCAPE_C = "C"; 59 60 /* 61 * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to XML Decimal escape \htmlonly(&#DDDD;)\endhtmlonly 62 * TO_U_CALLBACK_ESCAPE context option to escape the character value accoding to XML Decimal escape \htmlonly(&#DDDD;)\endhtmlonly 63 */ 64 private static final String ESCAPE_XML_DEC = "D"; 65 66 /* 67 * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to XML Hex escape \htmlonly(&#xXXXX;)\endhtmlonly 68 * TO_U_CALLBACK_ESCAPE context option to escape the character value according to XML Hex escape \htmlonly(&#xXXXX;)\endhtmlonly 69 */ 70 private static final String ESCAPE_XML_HEX = "X"; 71 72 /* 73 * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to Unicode (U+XXXXX) 74 */ 75 private static final String ESCAPE_UNICODE = "U"; 76 77 /* 78 * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to Unicode (U+XXXXX) 79 */ 80 private static final String ESCAPE_CSS2 = "S"; 81 82 /* 83 * IS_DEFAULT_IGNORABLE_CODE_POINT 84 * This is to check if a code point has the default ignorable unicode property. 85 * As such, this list needs to be updated if the ignorable code point list ever 86 * changes. 87 * To avoid dependency on other code, this list is hard coded here. 88 * When an ignorable code point is found and is unmappable, the default callbacks 89 * will ignore them. 90 * For a list of the default ignorable code points, use this link: http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[%3ADI%3A]&g= 91 * 92 * This list should be sync with the one in ucnv_err.c 93 * 94 */ 95 private static boolean IS_DEFAULT_IGNORABLE_CODE_POINT(int c) { 96 return ((c == 0x00AD) || 97 (c == 0x034F) || 98 (c == 0x061C) || 99 (c == 0x115F) || 100 (c == 0x1160) || 101 (0x17B4 <= c && c <= 0x17B5) || 102 (0x180B <= c && c <= 0x180E) || 103 (0x200B <= c && c <= 0x200F) || 104 (0x202A <= c && c <= 0x202E) || 105 (c == 0x2060) || 106 (0x2066 <= c && c <= 0x2069) || 107 (0x2061 <= c && c <= 0x2064) || 108 (0x206A <= c && c <= 0x206F) || 109 (c == 0x3164) || 110 (0x0FE00 <= c && c <= 0x0FE0F) || 111 (c == 0x0FEFF) || 112 (c == 0x0FFA0) || 113 (0x01BCA0 <= c && c <= 0x01BCA3) || 114 (0x01D173 <= c && c <= 0x01D17A) || 115 (c == 0x0E0001) || 116 (0x0E0020 <= c && c <= 0x0E007F) || 117 (0x0E0100 <= c && c <= 0x0E01EF) || 118 (c == 0x2065) || 119 (0x0FFF0 <= c && c <= 0x0FFF8) || 120 (c == 0x0E0000) || 121 (0x0E0002 <= c && c <= 0x0E001F) || 122 (0x0E0080 <= c && c <= 0x0E00FF) || 123 (0x0E01F0 <= c && c <= 0x0E0FFF) 124 ); 125 } 126 /** 127 * Decoder Callback interface 128 * @stable ICU 3.6 129 */ 130 public interface Decoder { 131 /** 132 * This function is called when the bytes in the source cannot be handled, 133 * and this function is meant to handle or fix the error if possible. 134 * 135 * @return Result of decoding action. This returned object is set to an error 136 * if this function could not handle the conversion. 137 * @stable ICU 3.6 138 */ 139 public CoderResult call(CharsetDecoderICU decoder, Object context, 140 ByteBuffer source, CharBuffer target, IntBuffer offsets, 141 char[] buffer, int length, CoderResult cr); 142 } 143 /** 144 * Encoder Callback interface 145 * @stable ICU 3.6 146 */ 147 public interface Encoder { 148 /** 149 * This function is called when the Unicode characters in the source cannot be handled, 150 * and this function is meant to handle or fix the error if possible. 151 * @return Result of decoding action. This returned object is set to an error 152 * if this function could not handle the conversion. 153 * @stable ICU 3.6 154 */ 155 public CoderResult call(CharsetEncoderICU encoder, Object context, 156 CharBuffer source, ByteBuffer target, IntBuffer offsets, 157 char[] buffer, int length, int cp, CoderResult cr); 158 } 159 /** 160 * Skip callback 161 * @stable ICU 3.6 162 */ 163 public static final Encoder FROM_U_CALLBACK_SKIP = new Encoder() { 164 public CoderResult call(CharsetEncoderICU encoder, Object context, 165 CharBuffer source, ByteBuffer target, IntBuffer offsets, 166 char[] buffer, int length, int cp, CoderResult cr){ 167 if(context==null){ 168 return CoderResult.UNDERFLOW; 169 }else if(((String)context).equals(SUB_STOP_ON_ILLEGAL)){ 170 if(!cr.isUnmappable()){ 171 return cr; 172 }else{ 173 return CoderResult.UNDERFLOW; 174 } 175 } 176 return cr; 177 } 178 }; 179 /** 180 * Skip callback 181 * @stable ICU 3.6 182 */ 183 public static final Decoder TO_U_CALLBACK_SKIP = new Decoder() { 184 public CoderResult call(CharsetDecoderICU decoder, Object context, 185 ByteBuffer source, CharBuffer target, IntBuffer offsets, 186 char[] buffer, int length, CoderResult cr){ 187 if(context==null){ 188 return CoderResult.UNDERFLOW; 189 }else if(((String)context).equals(SUB_STOP_ON_ILLEGAL)){ 190 if(!cr.isUnmappable()){ 191 return cr; 192 }else{ 193 return CoderResult.UNDERFLOW; 194 } 195 } 196 return cr; 197 } 198 }; 199 /** 200 * Write substitute callback 201 * @stable ICU 3.6 202 */ 203 public static final Encoder FROM_U_CALLBACK_SUBSTITUTE = new Encoder(){ 204 public CoderResult call(CharsetEncoderICU encoder, Object context, 205 CharBuffer source, ByteBuffer target, IntBuffer offsets, 206 char[] buffer, int length, int cp, CoderResult cr){ 207 if (cr.isUnmappable() && IS_DEFAULT_IGNORABLE_CODE_POINT(cp)) { 208 return CoderResult.UNDERFLOW; 209 }else if(context==null){ 210 return encoder.cbFromUWriteSub(encoder, source, target, offsets); 211 }else if(((String)context).equals(SUB_STOP_ON_ILLEGAL)){ 212 if(!cr.isUnmappable()){ 213 return cr; 214 }else{ 215 return encoder.cbFromUWriteSub(encoder, source, target, offsets); 216 } 217 } 218 return cr; 219 } 220 }; 221 private static final char[] kSubstituteChar1 = new char[]{0x1A}; 222 private static final char[] kSubstituteChar = new char[] {0xFFFD}; 223 /** 224 * Write substitute callback 225 * @stable ICU 3.6 226 */ 227 public static final Decoder TO_U_CALLBACK_SUBSTITUTE = new Decoder() { 228 public CoderResult call(CharsetDecoderICU decoder, Object context, 229 ByteBuffer source, CharBuffer target, IntBuffer offsets, 230 char[] buffer, int length, CoderResult cr){ 231 232 CharsetICU cs = (CharsetICU) decoder.charset(); 233 /* Use the specified replacement character if it is different than the default one. */ 234 boolean useReplacement = true; 235 char [] replacementChar = decoder.replacement().toCharArray(); 236 if (replacementChar.length == 1 && (replacementChar[0] == kSubstituteChar1[0] || replacementChar[0] == kSubstituteChar[0])) { 237 useReplacement = false; 238 } 239 240 /* could optimize this case, just one uchar */ 241 if(decoder.invalidCharLength == 1 && cs.subChar1 != 0) { 242 return CharsetDecoderICU.toUWriteUChars(decoder, useReplacement ? replacementChar : kSubstituteChar1, 0, useReplacement ? replacementChar.length : 1, target, offsets, source.position()); 243 } else { 244 return CharsetDecoderICU.toUWriteUChars(decoder, useReplacement ? replacementChar : kSubstituteChar, 0, useReplacement ? replacementChar.length : 1, target, offsets, source.position()); 245 } 246 } 247 }; 248 /** 249 * Stop callback 250 * @stable ICU 3.6 251 */ 252 public static final Encoder FROM_U_CALLBACK_STOP = new Encoder() { 253 public CoderResult call(CharsetEncoderICU encoder, Object context, 254 CharBuffer source, ByteBuffer target, IntBuffer offsets, 255 char[] buffer, int length, int cp, CoderResult cr){ 256 if (cr.isUnmappable() && IS_DEFAULT_IGNORABLE_CODE_POINT(cp)) { 257 return CoderResult.UNDERFLOW; 258 } 259 return cr; 260 } 261 }; 262 /** 263 * Stop callback 264 * @stable ICU 3.6 265 */ 266 public static final Decoder TO_U_CALLBACK_STOP = new Decoder() { 267 public CoderResult call(CharsetDecoderICU decoder, Object context, 268 ByteBuffer source, CharBuffer target, IntBuffer offsets, 269 char[] buffer, int length, CoderResult cr){ 270 return cr; 271 } 272 }; 273 private static final int VALUE_STRING_LENGTH = 32; 274 private static final char UNICODE_PERCENT_SIGN_CODEPOINT = 0x0025; 275 private static final char UNICODE_U_CODEPOINT = 0x0055; 276 private static final char UNICODE_X_CODEPOINT = 0x0058; 277 private static final char UNICODE_RS_CODEPOINT = 0x005C; 278 private static final char UNICODE_U_LOW_CODEPOINT = 0x0075; 279 private static final char UNICODE_X_LOW_CODEPOINT = 0x0078; 280 private static final char UNICODE_AMP_CODEPOINT = 0x0026; 281 private static final char UNICODE_HASH_CODEPOINT = 0x0023; 282 private static final char UNICODE_SEMICOLON_CODEPOINT = 0x003B; 283 private static final char UNICODE_PLUS_CODEPOINT = 0x002B; 284 private static final char UNICODE_LEFT_CURLY_CODEPOINT = 0x007B; 285 private static final char UNICODE_RIGHT_CURLY_CODEPOINT = 0x007D; 286 private static final char UNICODE_SPACE_CODEPOINT = 0x0020; 287 /** 288 * Write escape callback 289 * @stable ICU 4.0 290 */ 291 public static final Encoder FROM_U_CALLBACK_ESCAPE = new Encoder() { 292 public CoderResult call(CharsetEncoderICU encoder, Object context, 293 CharBuffer source, ByteBuffer target, IntBuffer offsets, 294 char[] buffer, int length, int cp, CoderResult cr){ 295 char[] valueString = new char[VALUE_STRING_LENGTH]; 296 int valueStringLength = 0; 297 int i = 0; 298 299 if (cr.isUnmappable() && IS_DEFAULT_IGNORABLE_CODE_POINT(cp)) { 300 return CoderResult.UNDERFLOW; 301 } 302 303 if (context == null || !(context instanceof String)) { 304 while (i < length) { 305 valueString[valueStringLength++] = UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */ 306 valueString[valueStringLength++] = UNICODE_U_CODEPOINT; /* adding U */ 307 valueStringLength += itou(valueString, valueStringLength, buffer[i++], 16, 4); 308 } 309 } else { 310 if (((String)context).equals(ESCAPE_JAVA)) { 311 while (i < length) { 312 valueString[valueStringLength++] = UNICODE_RS_CODEPOINT; /* adding \ */ 313 valueString[valueStringLength++] = UNICODE_U_LOW_CODEPOINT; /* adding u */ 314 valueStringLength += itou(valueString, valueStringLength, buffer[i++], 16, 4); 315 } 316 } else if (((String)context).equals(ESCAPE_C)) { 317 valueString[valueStringLength++] = UNICODE_RS_CODEPOINT; /* adding \ */ 318 319 if (length == 2) { 320 valueString[valueStringLength++] = UNICODE_U_CODEPOINT; /* adding U */ 321 valueStringLength = itou(valueString, valueStringLength, cp, 16, 8); 322 } else { 323 valueString[valueStringLength++] = UNICODE_U_LOW_CODEPOINT; /* adding u */ 324 valueStringLength += itou(valueString, valueStringLength, buffer[0], 16, 4); 325 } 326 } else if (((String)context).equals(ESCAPE_XML_DEC)) { 327 valueString[valueStringLength++] = UNICODE_AMP_CODEPOINT; /* adding & */ 328 valueString[valueStringLength++] = UNICODE_HASH_CODEPOINT; /* adding # */ 329 if (length == 2) { 330 valueStringLength += itou(valueString, valueStringLength, cp, 10, 0); 331 } else { 332 valueStringLength += itou(valueString, valueStringLength, buffer[0], 10, 0); 333 } 334 valueString[valueStringLength++] = UNICODE_SEMICOLON_CODEPOINT; /* adding ; */ 335 } else if (((String)context).equals(ESCAPE_XML_HEX)) { 336 valueString[valueStringLength++] = UNICODE_AMP_CODEPOINT; /* adding & */ 337 valueString[valueStringLength++] = UNICODE_HASH_CODEPOINT; /* adding # */ 338 valueString[valueStringLength++] = UNICODE_X_LOW_CODEPOINT; /* adding x */ 339 if (length == 2) { 340 valueStringLength += itou(valueString, valueStringLength, cp, 16, 0); 341 } else { 342 valueStringLength += itou(valueString, valueStringLength, buffer[0], 16, 0); 343 } 344 valueString[valueStringLength++] = UNICODE_SEMICOLON_CODEPOINT; /* adding ; */ 345 } else if (((String)context).equals(ESCAPE_UNICODE)) { 346 valueString[valueStringLength++] = UNICODE_LEFT_CURLY_CODEPOINT; /* adding { */ 347 valueString[valueStringLength++] = UNICODE_U_CODEPOINT; /* adding U */ 348 valueString[valueStringLength++] = UNICODE_PLUS_CODEPOINT; /* adding + */ 349 if (length == 2) { 350 valueStringLength += itou(valueString, valueStringLength,cp, 16, 4); 351 } else { 352 valueStringLength += itou(valueString, valueStringLength, buffer[0], 16, 4); 353 } 354 valueString[valueStringLength++] = UNICODE_RIGHT_CURLY_CODEPOINT; /* adding } */ 355 } else if (((String)context).equals(ESCAPE_CSS2)) { 356 valueString[valueStringLength++] = UNICODE_RS_CODEPOINT; /* adding \ */ 357 valueStringLength += itou(valueString, valueStringLength, cp, 16, 0); 358 /* Always add space character, because the next character might be whitespace, 359 which would erroneously be considered the termination of the escape sequence. */ 360 valueString[valueStringLength++] = UNICODE_SPACE_CODEPOINT; 361 } else { 362 while (i < length) { 363 valueString[valueStringLength++] = UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */ 364 valueString[valueStringLength++] = UNICODE_U_CODEPOINT; /* adding U */ 365 valueStringLength += itou(valueString, valueStringLength, buffer[i++], 16, 4); 366 } 367 } 368 } 369 return encoder.cbFromUWriteUChars(encoder, CharBuffer.wrap(valueString, 0, valueStringLength), target, offsets); 370 } 371 }; 372 /** 373 * Write escape callback 374 * @stable ICU 4.0 375 */ 376 public static final Decoder TO_U_CALLBACK_ESCAPE = new Decoder() { 377 public CoderResult call(CharsetDecoderICU decoder, Object context, 378 ByteBuffer source, CharBuffer target, IntBuffer offsets, 379 char[] buffer, int length, CoderResult cr){ 380 char[] uniValueString = new char[VALUE_STRING_LENGTH]; 381 int valueStringLength = 0; 382 int i = 0; 383 384 if (context == null || !(context instanceof String)) { 385 while (i < length) { 386 uniValueString[valueStringLength++] = UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */ 387 uniValueString[valueStringLength++] = UNICODE_X_CODEPOINT; /* adding U */ 388 valueStringLength += itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 16, 2); 389 } 390 } else { 391 if (((String)context).equals(ESCAPE_XML_DEC)) { 392 while (i < length) { 393 uniValueString[valueStringLength++] = UNICODE_AMP_CODEPOINT; /* adding & */ 394 uniValueString[valueStringLength++] = UNICODE_HASH_CODEPOINT; /* adding # */ 395 valueStringLength += itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 10, 0); 396 uniValueString[valueStringLength++] = UNICODE_SEMICOLON_CODEPOINT; /* adding ; */ 397 } 398 } else if (((String)context).equals(ESCAPE_XML_HEX)) { 399 while (i < length) { 400 uniValueString[valueStringLength++] = UNICODE_AMP_CODEPOINT; /* adding & */ 401 uniValueString[valueStringLength++] = UNICODE_HASH_CODEPOINT; /* adding # */ 402 uniValueString[valueStringLength++] = UNICODE_X_LOW_CODEPOINT; /* adding x */ 403 valueStringLength += itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 16, 0); 404 uniValueString[valueStringLength++] = UNICODE_SEMICOLON_CODEPOINT; /* adding ; */ 405 } 406 } else if (((String)context).equals(ESCAPE_C)) { 407 while (i < length) { 408 uniValueString[valueStringLength++] = UNICODE_RS_CODEPOINT; /* adding \ */ 409 uniValueString[valueStringLength++] = UNICODE_X_LOW_CODEPOINT; /* adding x */ 410 valueStringLength += itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 16, 2); 411 } 412 } else { 413 while (i < length) { 414 uniValueString[valueStringLength++] = UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */ 415 uniValueString[valueStringLength++] = UNICODE_X_CODEPOINT; /* adding X */ 416 itou(uniValueString, valueStringLength, buffer[i++] & UConverterConstants.UNSIGNED_BYTE_MASK, 16, 2); 417 valueStringLength += 2; 418 } 419 } 420 } 421 422 cr = CharsetDecoderICU.toUWriteUChars(decoder, uniValueString, 0, valueStringLength, target, offsets, 0); 423 424 return cr; 425 } 426 }; 427 /*** 428 * Java port of uprv_itou() in ICU4C used by TO_U_CALLBACK_ESCAPE and FROM_U_CALLBACK_ESCAPE. 429 * Fills in a char string with the radix-based representation of a number padded with zeroes 430 * to minwidth. 431 */ 432 private static final int itou(char[] buffer, int sourceIndex, int i, int radix, int minwidth) { 433 int length = 0; 434 int digit; 435 int j; 436 char temp; 437 438 do { 439 digit = i % radix; 440 buffer[sourceIndex + length++] = (char)(digit <= 9 ? (0x0030+digit) : (0x0030+digit+7)); 441 i = i/radix; 442 } while (i != 0 && (sourceIndex + length) < buffer.length); 443 444 while (length < minwidth) { 445 buffer[sourceIndex + length++] = (char)0x0030; /* zero padding */ 446 } 447 /* reverses the string */ 448 for (j = 0; j < (length / 2); j++) { 449 temp = buffer[(sourceIndex + length - 1) - j]; 450 buffer[(sourceIndex + length-1) -j] = buffer[sourceIndex + j]; 451 buffer[sourceIndex + j] = temp; 452 } 453 454 return length; 455 } 456 457 /* 458 * No need to create an instance 459 */ 460 private CharsetCallback() { 461 } 462 } 463