// 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/**
 *******************************************************************************
 * Copyright (C) 2006-2014, International Business Machines Corporation and
 * others. All Rights Reserved.
 *******************************************************************************
 */

package com.ibm.icu.charset;

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.IntBuffer;
import java.nio.charset.CoderResult;

/**
 * <h2> Callback API for CharsetICU API </h2>
 *
 * CharsetCallback class defines some error behaviour functions called
 * by CharsetDecoderICU and CharsetEncoderICU. The class also provides
 * the facility by which clients can write their own callbacks.
 *
 * These functions, although public, should NEVER be called directly.
 * They should be used as parameters to the onUnmappableCharacter() and
 * onMalformedInput() methods, to set the behaviour of a converter
 * when it encounters UNMAPPED/INVALID sequences.
 * Currently the only way to set callbacks is by using CodingErrorAction.
 * In the future we will provide set methods on CharsetEncoder and CharsetDecoder
 * that will accept CharsetCallback fields.
 *
 * @stable ICU 3.6
 */

public class CharsetCallback {
    /*
     * FROM_U, TO_U context options for sub callback
     */
    private static final String SUB_STOP_ON_ILLEGAL = "i";

//    /*
//     * FROM_U, TO_U context options for skip callback
//     */
//    private static final String SKIP_STOP_ON_ILLEGAL = "i";

//    /*
//     * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to ICU (%UXXXX)
//     */
//    private static final String ESCAPE_ICU = null;

    /*
     * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to JAVA (\\uXXXX)
     */
    private static final String ESCAPE_JAVA = "J";

    /*
     * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to C (\\uXXXX \\UXXXXXXXX)
     * TO_U_CALLBACK_ESCAPE option to escape each byte value according to C (\\xXX)
     */
    private static final String ESCAPE_C = "C";

    /*
     * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to XML Decimal escape \htmlonly(&#DDDD;)\endhtmlonly
     * TO_U_CALLBACK_ESCAPE context option to escape the character value according to XML Decimal escape \htmlonly(&#DDDD;)\endhtmlonly
     */
    private static final String ESCAPE_XML_DEC = "D";

    /*
     * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to XML Hex escape \htmlonly(&#xXXXX;)\endhtmlonly
     * TO_U_CALLBACK_ESCAPE context option to escape the character value according to XML Hex escape \htmlonly(&#xXXXX;)\endhtmlonly
     */
    private static final String ESCAPE_XML_HEX = "X";

    /*
     * FROM_U_CALLBACK_ESCAPE context option to escape the code unit according to Unicode ({U+XXXX})
     */
    private static final String ESCAPE_UNICODE = "U";

    /*
     * FROM_U_CALLBACK_ESCAPE context option to escape the code point according to CSS2
     * (a backslash, the hex digits of the code point, then a terminating space)
     */
    private static final String ESCAPE_CSS2 = "S";

    /*
     * IS_DEFAULT_IGNORABLE_CODE_POINT
     * This is to check if a code point has the default ignorable unicode property.
     * As such, this list needs to be updated if the ignorable code point list ever
     * changes.
     * To avoid dependency on other code, this list is hard coded here.
     * When an ignorable code point is found and is unmappable, the default callbacks
     * will ignore them.
     * For a list of the default ignorable code points, use this link: http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[%3ADI%3A]&g=
     *
     * This list should be kept in sync with the one in ucnv_err.c
     *
     */
    private static boolean IS_DEFAULT_IGNORABLE_CODE_POINT(int c) {
        return ((c == 0x00AD) ||
                (c == 0x034F) ||
                (c == 0x061C) ||
                (c == 0x115F) ||
                (c == 0x1160) ||
                (0x17B4 <= c && c <= 0x17B5) ||
                (0x180B <= c && c <= 0x180E) ||
                (0x200B <= c && c <= 0x200F) ||
                (0x202A <= c && c <= 0x202E) ||
                (c == 0x2060) ||
                (0x2066 <= c && c <= 0x2069) ||
                (0x2061 <= c && c <= 0x2064) ||
                (0x206A <= c && c <= 0x206F) ||
                (c == 0x3164) ||
                (0x0FE00 <= c && c <= 0x0FE0F) ||
                (c == 0x0FEFF) ||
                (c == 0x0FFA0) ||
                (0x01BCA0 <= c && c <= 0x01BCA3) ||
                (0x01D173 <= c && c <= 0x01D17A) ||
                (c == 0x0E0001) ||
                (0x0E0020 <= c && c <= 0x0E007F) ||
                (0x0E0100 <= c && c <= 0x0E01EF) ||
                (c == 0x2065) ||
                (0x0FFF0 <= c && c <= 0x0FFF8) ||
                (c == 0x0E0000) ||
                (0x0E0002 <= c && c <= 0x0E001F) ||
                (0x0E0080 <= c && c <= 0x0E00FF) ||
                (0x0E01F0 <= c && c <= 0x0E0FFF)
                );
    }

    /**
     * Decoder Callback interface
     * @stable ICU 3.6
     */
    public interface Decoder {
        /**
         * This function is called when the bytes in the source cannot be handled,
         * and this function is meant to handle or fix the error if possible.
         *
         * @return Result of decoding action. This returned object is set to an error
         *  if this function could not handle the conversion.
         * @stable ICU 3.6
         */
        public CoderResult call(CharsetDecoderICU decoder, Object context,
                ByteBuffer source, CharBuffer target, IntBuffer offsets,
                char[] buffer, int length, CoderResult cr);
    }

    /**
     * Encoder Callback interface
     * @stable ICU 3.6
     */
    public interface Encoder {
        /**
         * This function is called when the Unicode characters in the source cannot be handled,
         * and this function is meant to handle or fix the error if possible.
         * @return Result of encoding action. This returned object is set to an error
         *  if this function could not handle the conversion.
         * @stable ICU 3.6
         */
        public CoderResult call(CharsetEncoderICU encoder, Object context,
                CharBuffer source, ByteBuffer target, IntBuffer offsets,
                char[] buffer, int length, int cp, CoderResult cr);
    }

    /**
     * Skip callback.
     * With a null context every error is skipped (UNDERFLOW is returned). With the
     * "i" context only unmappable errors are skipped and any other error is returned
     * unchanged. Any other context string returns the incoming result unchanged.
     * @stable ICU 3.6
     */
    public static final Encoder FROM_U_CALLBACK_SKIP = new Encoder() {
        @Override
        public CoderResult call(CharsetEncoderICU encoder, Object context,
                CharBuffer source, ByteBuffer target, IntBuffer offsets,
                char[] buffer, int length, int cp, CoderResult cr) {
            if (context == null) {
                return CoderResult.UNDERFLOW;
            }
            if (((String) context).equals(SUB_STOP_ON_ILLEGAL)) {
                /* "i": skip unmappable input only, stop on everything else */
                return cr.isUnmappable() ? CoderResult.UNDERFLOW : cr;
            }
            return cr;
        }
    };

    /**
     * Skip callback.
     * With a null context every error is skipped (UNDERFLOW is returned). With the
     * "i" context only unmappable errors are skipped and any other error is returned
     * unchanged. Any other context string returns the incoming result unchanged.
     * @stable ICU 3.6
     */
    public static final Decoder TO_U_CALLBACK_SKIP = new Decoder() {
        @Override
        public CoderResult call(CharsetDecoderICU decoder, Object context,
                ByteBuffer source, CharBuffer target, IntBuffer offsets,
                char[] buffer, int length, CoderResult cr) {
            if (context == null) {
                return CoderResult.UNDERFLOW;
            }
            if (((String) context).equals(SUB_STOP_ON_ILLEGAL)) {
                /* "i": skip unmappable input only, stop on everything else */
                return cr.isUnmappable() ? CoderResult.UNDERFLOW : cr;
            }
            return cr;
        }
    };

    /**
     * Write substitute callback.
     * An unmappable default-ignorable code point is dropped (UNDERFLOW is returned).
     * Otherwise, with a null context the encoder's substitution is written; with the
     * "i" context it is written only for unmappable errors and other errors are
     * returned unchanged. Any other context string returns the incoming result unchanged.
     * @stable ICU 3.6
     */
    public static final Encoder FROM_U_CALLBACK_SUBSTITUTE = new Encoder() {
        @Override
        public CoderResult call(CharsetEncoderICU encoder, Object context,
                CharBuffer source, ByteBuffer target, IntBuffer offsets,
                char[] buffer, int length, int cp, CoderResult cr) {
            if (cr.isUnmappable() && IS_DEFAULT_IGNORABLE_CODE_POINT(cp)) {
                return CoderResult.UNDERFLOW;
            }
            if (context == null) {
                return encoder.cbFromUWriteSub(encoder, source, target, offsets);
            }
            if (((String) context).equals(SUB_STOP_ON_ILLEGAL)) {
                if (!cr.isUnmappable()) {
                    return cr;
                }
                return encoder.cbFromUWriteSub(encoder, source, target, offsets);
            }
            return cr;
        }
    };

    private static final char[] kSubstituteChar1 = new char[]{0x1A};
    private static final char[] kSubstituteChar = new char[] {0xFFFD};

    /**
     * Write substitute callback.
     * Writes the decoder's replacement string, unless that replacement is a single
     * U+001A or U+FFFD (treated as "default"); in that case U+001A is written when
     * exactly one invalid byte was seen and the charset defines a subChar1, and
     * U+FFFD otherwise. The context argument is not consulted.
     * @stable ICU 3.6
     */
    public static final Decoder TO_U_CALLBACK_SUBSTITUTE = new Decoder() {
        @Override
        public CoderResult call(CharsetDecoderICU decoder, Object context,
                ByteBuffer source, CharBuffer target, IntBuffer offsets,
                char[] buffer, int length, CoderResult cr) {

            CharsetICU cs = (CharsetICU) decoder.charset();
            /* Use the specified replacement character if it is different than the default one. */
            char[] replacementChar = decoder.replacement().toCharArray();
            boolean useReplacement = !(replacementChar.length == 1
                    && (replacementChar[0] == kSubstituteChar1[0] || replacementChar[0] == kSubstituteChar[0]));

            char[] substitute;
            if (useReplacement) {
                substitute = replacementChar;
            } else if (decoder.invalidCharLength == 1 && cs.subChar1 != 0) {
                /* could optimize this case, just one uchar */
                substitute = kSubstituteChar1;
            } else {
                substitute = kSubstituteChar;
            }
            /* both built-in substitutes have length 1, so substitute.length is correct in every case */
            return CharsetDecoderICU.toUWriteUChars(decoder, substitute, 0, substitute.length,
                    target, offsets, source.position());
        }
    };

    /**
     * Stop callback.
     * Returns the incoming result unchanged, except that an unmappable
     * default-ignorable code point is dropped (UNDERFLOW is returned).
     * @stable ICU 3.6
     */
    public static final Encoder FROM_U_CALLBACK_STOP = new Encoder() {
        @Override
        public CoderResult call(CharsetEncoderICU encoder, Object context,
                CharBuffer source, ByteBuffer target, IntBuffer offsets,
                char[] buffer, int length, int cp, CoderResult cr) {
            if (cr.isUnmappable() && IS_DEFAULT_IGNORABLE_CODE_POINT(cp)) {
                return CoderResult.UNDERFLOW;
            }
            return cr;
        }
    };

    /**
     * Stop callback.
     * Always returns the incoming result unchanged.
     * @stable ICU 3.6
     */
    public static final Decoder TO_U_CALLBACK_STOP = new Decoder() {
        @Override
        public CoderResult call(CharsetDecoderICU decoder, Object context,
                ByteBuffer source, CharBuffer target, IntBuffer offsets,
                char[] buffer, int length, CoderResult cr) {
            return cr;
        }
    };

    private static final int VALUE_STRING_LENGTH = 32;
    private static final char UNICODE_PERCENT_SIGN_CODEPOINT = 0x0025;
    private static final char UNICODE_U_CODEPOINT = 0x0055;
    private static final char UNICODE_X_CODEPOINT = 0x0058;
    private static final char UNICODE_RS_CODEPOINT = 0x005C;
    private static final char UNICODE_U_LOW_CODEPOINT = 0x0075;
    private static final char UNICODE_X_LOW_CODEPOINT = 0x0078;
    private static final char UNICODE_AMP_CODEPOINT = 0x0026;
    private static final char UNICODE_HASH_CODEPOINT = 0x0023;
    private static final char UNICODE_SEMICOLON_CODEPOINT = 0x003B;
    private static final char UNICODE_PLUS_CODEPOINT = 0x002B;
    private static final char UNICODE_LEFT_CURLY_CODEPOINT = 0x007B;
    private static final char UNICODE_RIGHT_CURLY_CODEPOINT = 0x007D;
    private static final char UNICODE_SPACE_CODEPOINT = 0x0020;

    /**
     * Write escape callback.
     * Replaces the offending input with a textual escape chosen by the context string:
     * "J" (Java), "C", "D" (XML decimal), "X" (XML hex), "U" (Unicode), "S" (CSS2).
     * A null, non-String or unrecognized context yields the ICU form (%UXXXX per code unit).
     * An unmappable default-ignorable code point is dropped (UNDERFLOW is returned).
     * @stable ICU 4.0
     */
    public static final Encoder FROM_U_CALLBACK_ESCAPE = new Encoder() {
        @Override
        public CoderResult call(CharsetEncoderICU encoder, Object context,
                CharBuffer source, ByteBuffer target, IntBuffer offsets,
                char[] buffer, int length, int cp, CoderResult cr) {
            if (cr.isUnmappable() && IS_DEFAULT_IGNORABLE_CODE_POINT(cp)) {
                return CoderResult.UNDERFLOW;
            }

            char[] valueString = new char[VALUE_STRING_LENGTH];
            int valueStringLength = 0;
            /* null when the context is absent or not a String; falls through to the ICU default below */
            String option = (context instanceof String) ? (String) context : null;

            if (ESCAPE_JAVA.equals(option)) {
                for (int i = 0; i < length; i++) {
                    valueString[valueStringLength++] = UNICODE_RS_CODEPOINT;    /* adding \ */
                    valueString[valueStringLength++] = UNICODE_U_LOW_CODEPOINT; /* adding u */
                    valueStringLength += itou(valueString, valueStringLength, buffer[i], 16, 4);
                }
            } else if (ESCAPE_C.equals(option)) {
                valueString[valueStringLength++] = UNICODE_RS_CODEPOINT;    /* adding \ */

                if (length == 2) {
                    valueString[valueStringLength++] = UNICODE_U_CODEPOINT; /* adding U */
                    /* Must accumulate: assigning here would drop the two prefix chars from the
                       running length and truncate the last two hex digits of the escape. */
                    valueStringLength += itou(valueString, valueStringLength, cp, 16, 8);
                } else {
                    valueString[valueStringLength++] = UNICODE_U_LOW_CODEPOINT; /* adding u */
                    valueStringLength += itou(valueString, valueStringLength, buffer[0], 16, 4);
                }
            } else if (ESCAPE_XML_DEC.equals(option)) {
                valueString[valueStringLength++] = UNICODE_AMP_CODEPOINT;   /* adding & */
                valueString[valueStringLength++] = UNICODE_HASH_CODEPOINT;  /* adding # */
                if (length == 2) {
                    valueStringLength += itou(valueString, valueStringLength, cp, 10, 0);
                } else {
                    valueStringLength += itou(valueString, valueStringLength, buffer[0], 10, 0);
                }
                valueString[valueStringLength++] = UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
            } else if (ESCAPE_XML_HEX.equals(option)) {
                valueString[valueStringLength++] = UNICODE_AMP_CODEPOINT;   /* adding & */
                valueString[valueStringLength++] = UNICODE_HASH_CODEPOINT;  /* adding # */
                valueString[valueStringLength++] = UNICODE_X_LOW_CODEPOINT; /* adding x */
                if (length == 2) {
                    valueStringLength += itou(valueString, valueStringLength, cp, 16, 0);
                } else {
                    valueStringLength += itou(valueString, valueStringLength, buffer[0], 16, 0);
                }
                valueString[valueStringLength++] = UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
            } else if (ESCAPE_UNICODE.equals(option)) {
                valueString[valueStringLength++] = UNICODE_LEFT_CURLY_CODEPOINT;    /* adding { */
                valueString[valueStringLength++] = UNICODE_U_CODEPOINT;             /* adding U */
                valueString[valueStringLength++] = UNICODE_PLUS_CODEPOINT;          /* adding + */
                if (length == 2) {
                    valueStringLength += itou(valueString, valueStringLength, cp, 16, 4);
                } else {
                    valueStringLength += itou(valueString, valueStringLength, buffer[0], 16, 4);
                }
                valueString[valueStringLength++] = UNICODE_RIGHT_CURLY_CODEPOINT;   /* adding } */
            } else if (ESCAPE_CSS2.equals(option)) {
                valueString[valueStringLength++] = UNICODE_RS_CODEPOINT;    /* adding \ */
                valueStringLength += itou(valueString, valueStringLength, cp, 16, 0);
                /* Always add space character, because the next character might be whitespace,
                   which would erroneously be considered the termination of the escape sequence. */
                valueString[valueStringLength++] = UNICODE_SPACE_CODEPOINT;
            } else {
                /* default (ICU) form: %UXXXX for every code unit */
                for (int i = 0; i < length; i++) {
                    valueString[valueStringLength++] = UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
                    valueString[valueStringLength++] = UNICODE_U_CODEPOINT;            /* adding U */
                    valueStringLength += itou(valueString, valueStringLength, buffer[i], 16, 4);
                }
            }
            return encoder.cbFromUWriteUChars(encoder, CharBuffer.wrap(valueString, 0, valueStringLength), target, offsets);
        }
    };

    /**
     * Write escape callback.
     * Replaces each offending byte with a textual escape chosen by the context string:
     * "D" (XML decimal), "X" (XML hex) or "C" (\\xXX). A null, non-String or
     * unrecognized context yields the default form (%XXX: a percent sign, 'X', and
     * two hex digits per byte).
     * @stable ICU 4.0
     */
    public static final Decoder TO_U_CALLBACK_ESCAPE = new Decoder() {
        @Override
        public CoderResult call(CharsetDecoderICU decoder, Object context,
                ByteBuffer source, CharBuffer target, IntBuffer offsets,
                char[] buffer, int length, CoderResult cr) {
            char[] uniValueString = new char[VALUE_STRING_LENGTH];
            int valueStringLength = 0;
            /* null when the context is absent or not a String; falls through to the default below */
            String option = (context instanceof String) ? (String) context : null;

            if (ESCAPE_XML_DEC.equals(option)) {
                for (int i = 0; i < length; i++) {
                    uniValueString[valueStringLength++] = UNICODE_AMP_CODEPOINT;   /* adding & */
                    uniValueString[valueStringLength++] = UNICODE_HASH_CODEPOINT;  /* adding # */
                    valueStringLength += itou(uniValueString, valueStringLength, buffer[i] & UConverterConstants.UNSIGNED_BYTE_MASK, 10, 0);
                    uniValueString[valueStringLength++] = UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
                }
            } else if (ESCAPE_XML_HEX.equals(option)) {
                for (int i = 0; i < length; i++) {
                    uniValueString[valueStringLength++] = UNICODE_AMP_CODEPOINT;   /* adding & */
                    uniValueString[valueStringLength++] = UNICODE_HASH_CODEPOINT;  /* adding # */
                    uniValueString[valueStringLength++] = UNICODE_X_LOW_CODEPOINT; /* adding x */
                    valueStringLength += itou(uniValueString, valueStringLength, buffer[i] & UConverterConstants.UNSIGNED_BYTE_MASK, 16, 0);
                    uniValueString[valueStringLength++] = UNICODE_SEMICOLON_CODEPOINT; /* adding ; */
                }
            } else if (ESCAPE_C.equals(option)) {
                for (int i = 0; i < length; i++) {
                    uniValueString[valueStringLength++] = UNICODE_RS_CODEPOINT;    /* adding \ */
                    uniValueString[valueStringLength++] = UNICODE_X_LOW_CODEPOINT; /* adding x */
                    valueStringLength += itou(uniValueString, valueStringLength, buffer[i] & UConverterConstants.UNSIGNED_BYTE_MASK, 16, 2);
                }
            } else {
                /* default form: %X followed by two hex digits for every byte */
                for (int i = 0; i < length; i++) {
                    uniValueString[valueStringLength++] = UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */
                    uniValueString[valueStringLength++] = UNICODE_X_CODEPOINT;            /* adding X */
                    valueStringLength += itou(uniValueString, valueStringLength, buffer[i] & UConverterConstants.UNSIGNED_BYTE_MASK, 16, 2);
                }
            }

            cr = CharsetDecoderICU.toUWriteUChars(decoder, uniValueString, 0, valueStringLength, target, offsets, 0);

            return cr;
        }
    };

    /**
     * Java port of uprv_itou() in ICU4C used by TO_U_CALLBACK_ESCAPE and FROM_U_CALLBACK_ESCAPE.
     * Fills in a char string with the radix-based representation of a number padded with zeroes
     * to minwidth.
     *
     * @param buffer      destination array; digits are written starting at sourceIndex
     * @param sourceIndex index in buffer of the first (most significant) digit
     * @param i           value to format; callers in this class only pass non-negative values
     * @param radix       numeric base (10 and 16 are the values used in this class)
     * @param minwidth    minimum number of digits; shorter results are left-padded with '0'
     * @return the number of chars written
     */
    private static final int itou(char[] buffer, int sourceIndex, int i, int radix, int minwidth) {
        int length = 0;
        int digit;
        int j;
        char temp;

        /* emit digits least-significant first; 0x30 is '0', and +7 skips from ':' to 'A' */
        do {
            digit = i % radix;
            buffer[sourceIndex + length++] = (char)(digit <= 9 ? (0x0030+digit) : (0x0030+digit+7));
            i = i/radix;
        } while (i != 0 && (sourceIndex + length) < buffer.length);

        while (length < minwidth) {
            buffer[sourceIndex + length++] = (char)0x0030; /* zero padding */
        }
        /* reverses the string */
        for (j = 0; j < (length / 2); j++) {
            temp = buffer[(sourceIndex + length - 1) - j];
            buffer[(sourceIndex + length-1) -j] = buffer[sourceIndex + j];
            buffer[sourceIndex + j] = temp;
        }

        return length;
    }

    /*
     * No need to create an instance
     */
    private CharsetCallback() {
    }
}