1 /* 2 ***************************************************************************** 3 * 4 * Copyright (C) 1998-2014, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ***************************************************************************** 8 * 9 * ucnv_err.c 10 * Implements error behaviour functions called by T_UConverter_{from,to}Unicode 11 * 12 * 13 * Change history: 14 * 15 * 06/29/2000 helena Major rewrite of the callback APIs. 16 */ 17 18 #include "unicode/utypes.h" 19 20 #if !UCONFIG_NO_CONVERSION 21 22 #include "unicode/ucnv_err.h" 23 #include "unicode/ucnv_cb.h" 24 #include "ucnv_cnv.h" 25 #include "cmemory.h" 26 #include "unicode/ucnv.h" 27 #include "ustrfmt.h" 28 29 #define VALUE_STRING_LENGTH 32 30 /*Magic # 32 = 4(number of char in value string) * 8(max number of bytes per char for any converter) */ 31 #define UNICODE_PERCENT_SIGN_CODEPOINT 0x0025 32 #define UNICODE_U_CODEPOINT 0x0055 33 #define UNICODE_X_CODEPOINT 0x0058 34 #define UNICODE_RS_CODEPOINT 0x005C 35 #define UNICODE_U_LOW_CODEPOINT 0x0075 36 #define UNICODE_X_LOW_CODEPOINT 0x0078 37 #define UNICODE_AMP_CODEPOINT 0x0026 38 #define UNICODE_HASH_CODEPOINT 0x0023 39 #define UNICODE_SEMICOLON_CODEPOINT 0x003B 40 #define UNICODE_PLUS_CODEPOINT 0x002B 41 #define UNICODE_LEFT_CURLY_CODEPOINT 0x007B 42 #define UNICODE_RIGHT_CURLY_CODEPOINT 0x007D 43 #define UNICODE_SPACE_CODEPOINT 0x0020 44 #define UCNV_PRV_ESCAPE_ICU 0 45 #define UCNV_PRV_ESCAPE_C 'C' 46 #define UCNV_PRV_ESCAPE_XML_DEC 'D' 47 #define UCNV_PRV_ESCAPE_XML_HEX 'X' 48 #define UCNV_PRV_ESCAPE_JAVA 'J' 49 #define UCNV_PRV_ESCAPE_UNICODE 'U' 50 #define UCNV_PRV_ESCAPE_CSS2 'S' 51 #define UCNV_PRV_STOP_ON_ILLEGAL 'i' 52 53 /* 54 * IS_DEFAULT_IGNORABLE_CODE_POINT 55 * This is to check if a code point has the default ignorable unicode property. 56 * As such, this list needs to be updated if the ignorable code point list ever 57 * changes. 58 * To avoid dependency on other code, this list is hard coded here. 59 * When an ignorable code point is found and is unmappable, the default callbacks 60 * will ignore them. 61 * For a list of the default ignorable code points, use this link: http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[%3ADI%3A]&g= 62 * 63 * This list should be sync with the one in CharsetCallback.java 64 */ 65 #define IS_DEFAULT_IGNORABLE_CODE_POINT(c) (\ 66 (c == 0x00AD) || \ 67 (c == 0x034F) || \ 68 (c == 0x061C) || \ 69 (c == 0x115F) || \ 70 (c == 0x1160) || \ 71 (0x17B4 <= c && c <= 0x17B5) || \ 72 (0x180B <= c && c <= 0x180E) || \ 73 (0x200B <= c && c <= 0x200F) || \ 74 (0x202A <= c && c <= 0x202E) || \ 75 (c == 0x2060) || \ 76 (0x2066 <= c && c <= 0x2069) || \ 77 (0x2061 <= c && c <= 0x2064) || \ 78 (0x206A <= c && c <= 0x206F) || \ 79 (c == 0x3164) || \ 80 (0x0FE00 <= c && c <= 0x0FE0F) || \ 81 (c == 0x0FEFF) || \ 82 (c == 0x0FFA0) || \ 83 (0x01BCA0 <= c && c <= 0x01BCA3) || \ 84 (0x01D173 <= c && c <= 0x01D17A) || \ 85 (c == 0x0E0001) || \ 86 (0x0E0020 <= c && c <= 0x0E007F) || \ 87 (0x0E0100 <= c && c <= 0x0E01EF) || \ 88 (c == 0x2065) || \ 89 (0x0FFF0 <= c && c <= 0x0FFF8) || \ 90 (c == 0x0E0000) || \ 91 (0x0E0002 <= c && c <= 0x0E001F) || \ 92 (0x0E0080 <= c && c <= 0x0E00FF) || \ 93 (0x0E01F0 <= c && c <= 0x0E0FFF) \ 94 ) 95 96 97 /*Function Pointer STOPS at the ILLEGAL_SEQUENCE */ 98 U_CAPI void U_EXPORT2 99 UCNV_FROM_U_CALLBACK_STOP ( 100 const void *context, 101 UConverterFromUnicodeArgs *fromUArgs, 102 const UChar* codeUnits, 103 int32_t length, 104 UChar32 codePoint, 105 UConverterCallbackReason reason, 106 UErrorCode * err) 107 { 108 if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint)) 109 { 110 /* 111 * Skip if the codepoint has unicode property of default ignorable. 112 */ 113 *err = U_ZERO_ERROR; 114 } 115 /* the caller must have set the error code accordingly */ 116 return; 117 } 118 119 120 /*Function Pointer STOPS at the ILLEGAL_SEQUENCE */ 121 U_CAPI void U_EXPORT2 122 UCNV_TO_U_CALLBACK_STOP ( 123 const void *context, 124 UConverterToUnicodeArgs *toUArgs, 125 const char* codePoints, 126 int32_t length, 127 UConverterCallbackReason reason, 128 UErrorCode * err) 129 { 130 /* the caller must have set the error code accordingly */ 131 return; 132 } 133 134 U_CAPI void U_EXPORT2 135 UCNV_FROM_U_CALLBACK_SKIP ( 136 const void *context, 137 UConverterFromUnicodeArgs *fromUArgs, 138 const UChar* codeUnits, 139 int32_t length, 140 UChar32 codePoint, 141 UConverterCallbackReason reason, 142 UErrorCode * err) 143 { 144 if (reason <= UCNV_IRREGULAR) 145 { 146 if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint)) 147 { 148 /* 149 * Skip if the codepoint has unicode property of default ignorable. 150 */ 151 *err = U_ZERO_ERROR; 152 } 153 else if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED)) 154 { 155 *err = U_ZERO_ERROR; 156 } 157 /* else the caller must have set the error code accordingly. */ 158 } 159 /* else ignore the reset, close and clone calls. */ 160 } 161 162 U_CAPI void U_EXPORT2 163 UCNV_FROM_U_CALLBACK_SUBSTITUTE ( 164 const void *context, 165 UConverterFromUnicodeArgs *fromArgs, 166 const UChar* codeUnits, 167 int32_t length, 168 UChar32 codePoint, 169 UConverterCallbackReason reason, 170 UErrorCode * err) 171 { 172 if (reason <= UCNV_IRREGULAR) 173 { 174 if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint)) 175 { 176 /* 177 * Skip if the codepoint has unicode property of default ignorable. 178 */ 179 *err = U_ZERO_ERROR; 180 } 181 else if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED)) 182 { 183 *err = U_ZERO_ERROR; 184 ucnv_cbFromUWriteSub(fromArgs, 0, err); 185 } 186 /* else the caller must have set the error code accordingly. */ 187 } 188 /* else ignore the reset, close and clone calls. */ 189 } 190 191 /*uses uprv_itou to get a unicode escape sequence of the offensive sequence, 192 *uses a clean copy (resetted) of the converter, to convert that unicode 193 *escape sequence to the target codepage (if conversion failure happens then 194 *we revert to substituting with subchar) 195 */ 196 U_CAPI void U_EXPORT2 197 UCNV_FROM_U_CALLBACK_ESCAPE ( 198 const void *context, 199 UConverterFromUnicodeArgs *fromArgs, 200 const UChar *codeUnits, 201 int32_t length, 202 UChar32 codePoint, 203 UConverterCallbackReason reason, 204 UErrorCode * err) 205 { 206 207 UChar valueString[VALUE_STRING_LENGTH]; 208 int32_t valueStringLength = 0; 209 int32_t i = 0; 210 211 const UChar *myValueSource = NULL; 212 UErrorCode err2 = U_ZERO_ERROR; 213 UConverterFromUCallback original = NULL; 214 const void *originalContext; 215 216 UConverterFromUCallback ignoredCallback = NULL; 217 const void *ignoredContext; 218 219 if (reason > UCNV_IRREGULAR) 220 { 221 return; 222 } 223 else if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint)) 224 { 225 /* 226 * Skip if the codepoint has unicode property of default ignorable. 227 */ 228 *err = U_ZERO_ERROR; 229 return; 230 } 231 232 ucnv_setFromUCallBack (fromArgs->converter, 233 (UConverterFromUCallback) UCNV_FROM_U_CALLBACK_SUBSTITUTE, 234 NULL, 235 &original, 236 &originalContext, 237 &err2); 238 239 if (U_FAILURE (err2)) 240 { 241 *err = err2; 242 return; 243 } 244 if(context==NULL) 245 { 246 while (i < length) 247 { 248 valueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */ 249 valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT; /* adding U */ 250 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4); 251 } 252 } 253 else 254 { 255 switch(*((char*)context)) 256 { 257 case UCNV_PRV_ESCAPE_JAVA: 258 while (i < length) 259 { 260 valueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT; /* adding \ */ 261 valueString[valueStringLength++] = (UChar) UNICODE_U_LOW_CODEPOINT; /* adding u */ 262 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4); 263 } 264 break; 265 266 case UCNV_PRV_ESCAPE_C: 267 valueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT; /* adding \ */ 268 269 if(length==2){ 270 valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT; /* adding U */ 271 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 8); 272 273 } 274 else{ 275 valueString[valueStringLength++] = (UChar) UNICODE_U_LOW_CODEPOINT; /* adding u */ 276 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 4); 277 } 278 break; 279 280 case UCNV_PRV_ESCAPE_XML_DEC: 281 282 valueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT; /* adding & */ 283 valueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT; /* adding # */ 284 if(length==2){ 285 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 10, 0); 286 } 287 else{ 288 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 10, 0); 289 } 290 valueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */ 291 break; 292 293 case UCNV_PRV_ESCAPE_XML_HEX: 294 295 valueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT; /* adding & */ 296 valueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT; /* adding # */ 297 valueString[valueStringLength++] = (UChar) UNICODE_X_LOW_CODEPOINT; /* adding x */ 298 if(length==2){ 299 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 0); 300 } 301 else{ 302 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 0); 303 } 304 valueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */ 305 break; 306 307 case UCNV_PRV_ESCAPE_UNICODE: 308 valueString[valueStringLength++] = (UChar) UNICODE_LEFT_CURLY_CODEPOINT; /* adding { */ 309 valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT; /* adding U */ 310 valueString[valueStringLength++] = (UChar) UNICODE_PLUS_CODEPOINT; /* adding + */ 311 if (length == 2) { 312 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 4); 313 } else { 314 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 4); 315 } 316 valueString[valueStringLength++] = (UChar) UNICODE_RIGHT_CURLY_CODEPOINT; /* adding } */ 317 break; 318 319 case UCNV_PRV_ESCAPE_CSS2: 320 valueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT; /* adding \ */ 321 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 0); 322 /* Always add space character, becase the next character might be whitespace, 323 which would erroneously be considered the termination of the escape sequence. */ 324 valueString[valueStringLength++] = (UChar) UNICODE_SPACE_CODEPOINT; 325 break; 326 327 default: 328 while (i < length) 329 { 330 valueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */ 331 valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT; /* adding U */ 332 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4); 333 } 334 } 335 } 336 myValueSource = valueString; 337 338 /* reset the error */ 339 *err = U_ZERO_ERROR; 340 341 ucnv_cbFromUWriteUChars(fromArgs, &myValueSource, myValueSource+valueStringLength, 0, err); 342 343 ucnv_setFromUCallBack (fromArgs->converter, 344 original, 345 originalContext, 346 &ignoredCallback, 347 &ignoredContext, 348 &err2); 349 if (U_FAILURE (err2)) 350 { 351 *err = err2; 352 return; 353 } 354 355 return; 356 } 357 358 359 360 U_CAPI void U_EXPORT2 361 UCNV_TO_U_CALLBACK_SKIP ( 362 const void *context, 363 UConverterToUnicodeArgs *toArgs, 364 const char* codeUnits, 365 int32_t length, 366 UConverterCallbackReason reason, 367 UErrorCode * err) 368 { 369 if (reason <= UCNV_IRREGULAR) 370 { 371 if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED)) 372 { 373 *err = U_ZERO_ERROR; 374 } 375 /* else the caller must have set the error code accordingly. */ 376 } 377 /* else ignore the reset, close and clone calls. */ 378 } 379 380 U_CAPI void U_EXPORT2 381 UCNV_TO_U_CALLBACK_SUBSTITUTE ( 382 const void *context, 383 UConverterToUnicodeArgs *toArgs, 384 const char* codeUnits, 385 int32_t length, 386 UConverterCallbackReason reason, 387 UErrorCode * err) 388 { 389 if (reason <= UCNV_IRREGULAR) 390 { 391 if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED)) 392 { 393 *err = U_ZERO_ERROR; 394 ucnv_cbToUWriteSub(toArgs,0,err); 395 } 396 /* else the caller must have set the error code accordingly. */ 397 } 398 /* else ignore the reset, close and clone calls. */ 399 } 400 401 /*uses uprv_itou to get a unicode escape sequence of the offensive sequence, 402 *and uses that as the substitution sequence 403 */ 404 U_CAPI void U_EXPORT2 405 UCNV_TO_U_CALLBACK_ESCAPE ( 406 const void *context, 407 UConverterToUnicodeArgs *toArgs, 408 const char* codeUnits, 409 int32_t length, 410 UConverterCallbackReason reason, 411 UErrorCode * err) 412 { 413 UChar uniValueString[VALUE_STRING_LENGTH]; 414 int32_t valueStringLength = 0; 415 int32_t i = 0; 416 417 if (reason > UCNV_IRREGULAR) 418 { 419 return; 420 } 421 422 if(context==NULL) 423 { 424 while (i < length) 425 { 426 uniValueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */ 427 uniValueString[valueStringLength++] = (UChar) UNICODE_X_CODEPOINT; /* adding X */ 428 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t) codeUnits[i++], 16, 2); 429 } 430 } 431 else 432 { 433 switch(*((char*)context)) 434 { 435 case UCNV_PRV_ESCAPE_XML_DEC: 436 while (i < length) 437 { 438 uniValueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT; /* adding & */ 439 uniValueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT; /* adding # */ 440 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 10, 0); 441 uniValueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */ 442 } 443 break; 444 445 case UCNV_PRV_ESCAPE_XML_HEX: 446 while (i < length) 447 { 448 uniValueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT; /* adding & */ 449 uniValueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT; /* adding # */ 450 uniValueString[valueStringLength++] = (UChar) UNICODE_X_LOW_CODEPOINT; /* adding x */ 451 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 16, 0); 452 uniValueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */ 453 } 454 break; 455 case UCNV_PRV_ESCAPE_C: 456 while (i < length) 457 { 458 uniValueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT; /* adding \ */ 459 uniValueString[valueStringLength++] = (UChar) UNICODE_X_LOW_CODEPOINT; /* adding x */ 460 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 16, 2); 461 } 462 break; 463 default: 464 while (i < length) 465 { 466 uniValueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */ 467 uniValueString[valueStringLength++] = (UChar) UNICODE_X_CODEPOINT; /* adding X */ 468 uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t) codeUnits[i++], 16, 2); 469 valueStringLength += 2; 470 } 471 } 472 } 473 /* reset the error */ 474 *err = U_ZERO_ERROR; 475 476 ucnv_cbToUWriteUChars(toArgs, uniValueString, valueStringLength, 0, err); 477 } 478 479 #endif 480