1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ***************************************************************************** 5 * 6 * Copyright (C) 1998-2016, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ***************************************************************************** 10 * 11 * ucnv_err.c 12 * Implements error behaviour functions called by T_UConverter_{from,to}Unicode 13 * 14 * 15 * Change history: 16 * 17 * 06/29/2000 helena Major rewrite of the callback APIs. 18 */ 19 20 #include "unicode/utypes.h" 21 22 #if !UCONFIG_NO_CONVERSION 23 24 #include "unicode/ucnv_err.h" 25 #include "unicode/ucnv_cb.h" 26 #include "ucnv_cnv.h" 27 #include "cmemory.h" 28 #include "unicode/ucnv.h" 29 #include "ustrfmt.h" 30 31 #define VALUE_STRING_LENGTH 48 32 /*Magic # 32 = 4(number of char in value string) * 8(max number of bytes per char for any converter) */ 33 #define UNICODE_PERCENT_SIGN_CODEPOINT 0x0025 34 #define UNICODE_U_CODEPOINT 0x0055 35 #define UNICODE_X_CODEPOINT 0x0058 36 #define UNICODE_RS_CODEPOINT 0x005C 37 #define UNICODE_U_LOW_CODEPOINT 0x0075 38 #define UNICODE_X_LOW_CODEPOINT 0x0078 39 #define UNICODE_AMP_CODEPOINT 0x0026 40 #define UNICODE_HASH_CODEPOINT 0x0023 41 #define UNICODE_SEMICOLON_CODEPOINT 0x003B 42 #define UNICODE_PLUS_CODEPOINT 0x002B 43 #define UNICODE_LEFT_CURLY_CODEPOINT 0x007B 44 #define UNICODE_RIGHT_CURLY_CODEPOINT 0x007D 45 #define UNICODE_SPACE_CODEPOINT 0x0020 46 #define UCNV_PRV_ESCAPE_ICU 0 47 #define UCNV_PRV_ESCAPE_C 'C' 48 #define UCNV_PRV_ESCAPE_XML_DEC 'D' 49 #define UCNV_PRV_ESCAPE_XML_HEX 'X' 50 #define UCNV_PRV_ESCAPE_JAVA 'J' 51 #define UCNV_PRV_ESCAPE_UNICODE 'U' 52 #define UCNV_PRV_ESCAPE_CSS2 'S' 53 #define UCNV_PRV_STOP_ON_ILLEGAL 'i' 54 55 /* 56 * IS_DEFAULT_IGNORABLE_CODE_POINT 57 * This is to check if a code point has the default ignorable unicode property. 58 * As such, this list needs to be updated if the ignorable code point list ever 59 * changes. 60 * To avoid dependency on other code, this list is hard coded here. 61 * When an ignorable code point is found and is unmappable, the default callbacks 62 * will ignore them. 63 * For a list of the default ignorable code points, use this link: http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[%3ADI%3A]&g= 64 * 65 * This list should be sync with the one in CharsetCallback.java 66 */ 67 #define IS_DEFAULT_IGNORABLE_CODE_POINT(c) (\ 68 (c == 0x00AD) || \ 69 (c == 0x034F) || \ 70 (c == 0x061C) || \ 71 (c == 0x115F) || \ 72 (c == 0x1160) || \ 73 (0x17B4 <= c && c <= 0x17B5) || \ 74 (0x180B <= c && c <= 0x180E) || \ 75 (0x200B <= c && c <= 0x200F) || \ 76 (0x202A <= c && c <= 0x202E) || \ 77 (c == 0x2060) || \ 78 (0x2066 <= c && c <= 0x2069) || \ 79 (0x2061 <= c && c <= 0x2064) || \ 80 (0x206A <= c && c <= 0x206F) || \ 81 (c == 0x3164) || \ 82 (0x0FE00 <= c && c <= 0x0FE0F) || \ 83 (c == 0x0FEFF) || \ 84 (c == 0x0FFA0) || \ 85 (0x01BCA0 <= c && c <= 0x01BCA3) || \ 86 (0x01D173 <= c && c <= 0x01D17A) || \ 87 (c == 0x0E0001) || \ 88 (0x0E0020 <= c && c <= 0x0E007F) || \ 89 (0x0E0100 <= c && c <= 0x0E01EF) || \ 90 (c == 0x2065) || \ 91 (0x0FFF0 <= c && c <= 0x0FFF8) || \ 92 (c == 0x0E0000) || \ 93 (0x0E0002 <= c && c <= 0x0E001F) || \ 94 (0x0E0080 <= c && c <= 0x0E00FF) || \ 95 (0x0E01F0 <= c && c <= 0x0E0FFF) \ 96 ) 97 98 99 /*Function Pointer STOPS at the ILLEGAL_SEQUENCE */ 100 U_CAPI void U_EXPORT2 101 UCNV_FROM_U_CALLBACK_STOP ( 102 const void *context, 103 UConverterFromUnicodeArgs *fromUArgs, 104 const UChar* codeUnits, 105 int32_t length, 106 UChar32 codePoint, 107 UConverterCallbackReason reason, 108 UErrorCode * err) 109 { 110 if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint)) 111 { 112 /* 113 * Skip if the codepoint has unicode property of default ignorable. 114 */ 115 *err = U_ZERO_ERROR; 116 } 117 /* the caller must have set the error code accordingly */ 118 return; 119 } 120 121 122 /*Function Pointer STOPS at the ILLEGAL_SEQUENCE */ 123 U_CAPI void U_EXPORT2 124 UCNV_TO_U_CALLBACK_STOP ( 125 const void *context, 126 UConverterToUnicodeArgs *toUArgs, 127 const char* codePoints, 128 int32_t length, 129 UConverterCallbackReason reason, 130 UErrorCode * err) 131 { 132 /* the caller must have set the error code accordingly */ 133 return; 134 } 135 136 U_CAPI void U_EXPORT2 137 UCNV_FROM_U_CALLBACK_SKIP ( 138 const void *context, 139 UConverterFromUnicodeArgs *fromUArgs, 140 const UChar* codeUnits, 141 int32_t length, 142 UChar32 codePoint, 143 UConverterCallbackReason reason, 144 UErrorCode * err) 145 { 146 if (reason <= UCNV_IRREGULAR) 147 { 148 if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint)) 149 { 150 /* 151 * Skip if the codepoint has unicode property of default ignorable. 152 */ 153 *err = U_ZERO_ERROR; 154 } 155 else if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED)) 156 { 157 *err = U_ZERO_ERROR; 158 } 159 /* else the caller must have set the error code accordingly. */ 160 } 161 /* else ignore the reset, close and clone calls. */ 162 } 163 164 U_CAPI void U_EXPORT2 165 UCNV_FROM_U_CALLBACK_SUBSTITUTE ( 166 const void *context, 167 UConverterFromUnicodeArgs *fromArgs, 168 const UChar* codeUnits, 169 int32_t length, 170 UChar32 codePoint, 171 UConverterCallbackReason reason, 172 UErrorCode * err) 173 { 174 if (reason <= UCNV_IRREGULAR) 175 { 176 if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint)) 177 { 178 /* 179 * Skip if the codepoint has unicode property of default ignorable. 180 */ 181 *err = U_ZERO_ERROR; 182 } 183 else if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED)) 184 { 185 *err = U_ZERO_ERROR; 186 ucnv_cbFromUWriteSub(fromArgs, 0, err); 187 } 188 /* else the caller must have set the error code accordingly. */ 189 } 190 /* else ignore the reset, close and clone calls. */ 191 } 192 193 /*uses uprv_itou to get a unicode escape sequence of the offensive sequence, 194 *uses a clean copy (resetted) of the converter, to convert that unicode 195 *escape sequence to the target codepage (if conversion failure happens then 196 *we revert to substituting with subchar) 197 */ 198 U_CAPI void U_EXPORT2 199 UCNV_FROM_U_CALLBACK_ESCAPE ( 200 const void *context, 201 UConverterFromUnicodeArgs *fromArgs, 202 const UChar *codeUnits, 203 int32_t length, 204 UChar32 codePoint, 205 UConverterCallbackReason reason, 206 UErrorCode * err) 207 { 208 209 UChar valueString[VALUE_STRING_LENGTH]; 210 int32_t valueStringLength = 0; 211 int32_t i = 0; 212 213 const UChar *myValueSource = NULL; 214 UErrorCode err2 = U_ZERO_ERROR; 215 UConverterFromUCallback original = NULL; 216 const void *originalContext; 217 218 UConverterFromUCallback ignoredCallback = NULL; 219 const void *ignoredContext; 220 221 if (reason > UCNV_IRREGULAR) 222 { 223 return; 224 } 225 else if (reason == UCNV_UNASSIGNED && IS_DEFAULT_IGNORABLE_CODE_POINT(codePoint)) 226 { 227 /* 228 * Skip if the codepoint has unicode property of default ignorable. 229 */ 230 *err = U_ZERO_ERROR; 231 return; 232 } 233 234 ucnv_setFromUCallBack (fromArgs->converter, 235 (UConverterFromUCallback) UCNV_FROM_U_CALLBACK_SUBSTITUTE, 236 NULL, 237 &original, 238 &originalContext, 239 &err2); 240 241 if (U_FAILURE (err2)) 242 { 243 *err = err2; 244 return; 245 } 246 if(context==NULL) 247 { 248 while (i < length) 249 { 250 valueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */ 251 valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT; /* adding U */ 252 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4); 253 } 254 } 255 else 256 { 257 switch(*((char*)context)) 258 { 259 case UCNV_PRV_ESCAPE_JAVA: 260 while (i < length) 261 { 262 valueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT; /* adding \ */ 263 valueString[valueStringLength++] = (UChar) UNICODE_U_LOW_CODEPOINT; /* adding u */ 264 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4); 265 } 266 break; 267 268 case UCNV_PRV_ESCAPE_C: 269 valueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT; /* adding \ */ 270 271 if(length==2){ 272 valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT; /* adding U */ 273 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 8); 274 275 } 276 else{ 277 valueString[valueStringLength++] = (UChar) UNICODE_U_LOW_CODEPOINT; /* adding u */ 278 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 4); 279 } 280 break; 281 282 case UCNV_PRV_ESCAPE_XML_DEC: 283 284 valueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT; /* adding & */ 285 valueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT; /* adding # */ 286 if(length==2){ 287 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 10, 0); 288 } 289 else{ 290 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 10, 0); 291 } 292 valueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */ 293 break; 294 295 case UCNV_PRV_ESCAPE_XML_HEX: 296 297 valueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT; /* adding & */ 298 valueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT; /* adding # */ 299 valueString[valueStringLength++] = (UChar) UNICODE_X_LOW_CODEPOINT; /* adding x */ 300 if(length==2){ 301 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 0); 302 } 303 else{ 304 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 0); 305 } 306 valueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */ 307 break; 308 309 case UCNV_PRV_ESCAPE_UNICODE: 310 valueString[valueStringLength++] = (UChar) UNICODE_LEFT_CURLY_CODEPOINT; /* adding { */ 311 valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT; /* adding U */ 312 valueString[valueStringLength++] = (UChar) UNICODE_PLUS_CODEPOINT; /* adding + */ 313 if (length == 2) { 314 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 4); 315 } else { 316 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[0], 16, 4); 317 } 318 valueString[valueStringLength++] = (UChar) UNICODE_RIGHT_CURLY_CODEPOINT; /* adding } */ 319 break; 320 321 case UCNV_PRV_ESCAPE_CSS2: 322 valueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT; /* adding \ */ 323 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, codePoint, 16, 0); 324 /* Always add space character, becase the next character might be whitespace, 325 which would erroneously be considered the termination of the escape sequence. */ 326 valueString[valueStringLength++] = (UChar) UNICODE_SPACE_CODEPOINT; 327 break; 328 329 default: 330 while (i < length) 331 { 332 valueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */ 333 valueString[valueStringLength++] = (UChar) UNICODE_U_CODEPOINT; /* adding U */ 334 valueStringLength += uprv_itou (valueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint16_t)codeUnits[i++], 16, 4); 335 } 336 } 337 } 338 myValueSource = valueString; 339 340 /* reset the error */ 341 *err = U_ZERO_ERROR; 342 343 ucnv_cbFromUWriteUChars(fromArgs, &myValueSource, myValueSource+valueStringLength, 0, err); 344 345 ucnv_setFromUCallBack (fromArgs->converter, 346 original, 347 originalContext, 348 &ignoredCallback, 349 &ignoredContext, 350 &err2); 351 if (U_FAILURE (err2)) 352 { 353 *err = err2; 354 return; 355 } 356 357 return; 358 } 359 360 361 362 U_CAPI void U_EXPORT2 363 UCNV_TO_U_CALLBACK_SKIP ( 364 const void *context, 365 UConverterToUnicodeArgs *toArgs, 366 const char* codeUnits, 367 int32_t length, 368 UConverterCallbackReason reason, 369 UErrorCode * err) 370 { 371 if (reason <= UCNV_IRREGULAR) 372 { 373 if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED)) 374 { 375 *err = U_ZERO_ERROR; 376 } 377 /* else the caller must have set the error code accordingly. */ 378 } 379 /* else ignore the reset, close and clone calls. */ 380 } 381 382 U_CAPI void U_EXPORT2 383 UCNV_TO_U_CALLBACK_SUBSTITUTE ( 384 const void *context, 385 UConverterToUnicodeArgs *toArgs, 386 const char* codeUnits, 387 int32_t length, 388 UConverterCallbackReason reason, 389 UErrorCode * err) 390 { 391 if (reason <= UCNV_IRREGULAR) 392 { 393 if (context == NULL || (*((char*)context) == UCNV_PRV_STOP_ON_ILLEGAL && reason == UCNV_UNASSIGNED)) 394 { 395 *err = U_ZERO_ERROR; 396 ucnv_cbToUWriteSub(toArgs,0,err); 397 } 398 /* else the caller must have set the error code accordingly. */ 399 } 400 /* else ignore the reset, close and clone calls. */ 401 } 402 403 /*uses uprv_itou to get a unicode escape sequence of the offensive sequence, 404 *and uses that as the substitution sequence 405 */ 406 U_CAPI void U_EXPORT2 407 UCNV_TO_U_CALLBACK_ESCAPE ( 408 const void *context, 409 UConverterToUnicodeArgs *toArgs, 410 const char* codeUnits, 411 int32_t length, 412 UConverterCallbackReason reason, 413 UErrorCode * err) 414 { 415 UChar uniValueString[VALUE_STRING_LENGTH]; 416 int32_t valueStringLength = 0; 417 int32_t i = 0; 418 419 if (reason > UCNV_IRREGULAR) 420 { 421 return; 422 } 423 424 if(context==NULL) 425 { 426 while (i < length) 427 { 428 uniValueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */ 429 uniValueString[valueStringLength++] = (UChar) UNICODE_X_CODEPOINT; /* adding X */ 430 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t) codeUnits[i++], 16, 2); 431 } 432 } 433 else 434 { 435 switch(*((char*)context)) 436 { 437 case UCNV_PRV_ESCAPE_XML_DEC: 438 while (i < length) 439 { 440 uniValueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT; /* adding & */ 441 uniValueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT; /* adding # */ 442 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 10, 0); 443 uniValueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */ 444 } 445 break; 446 447 case UCNV_PRV_ESCAPE_XML_HEX: 448 while (i < length) 449 { 450 uniValueString[valueStringLength++] = (UChar) UNICODE_AMP_CODEPOINT; /* adding & */ 451 uniValueString[valueStringLength++] = (UChar) UNICODE_HASH_CODEPOINT; /* adding # */ 452 uniValueString[valueStringLength++] = (UChar) UNICODE_X_LOW_CODEPOINT; /* adding x */ 453 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 16, 0); 454 uniValueString[valueStringLength++] = (UChar) UNICODE_SEMICOLON_CODEPOINT; /* adding ; */ 455 } 456 break; 457 case UCNV_PRV_ESCAPE_C: 458 while (i < length) 459 { 460 uniValueString[valueStringLength++] = (UChar) UNICODE_RS_CODEPOINT; /* adding \ */ 461 uniValueString[valueStringLength++] = (UChar) UNICODE_X_LOW_CODEPOINT; /* adding x */ 462 valueStringLength += uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t)codeUnits[i++], 16, 2); 463 } 464 break; 465 default: 466 while (i < length) 467 { 468 uniValueString[valueStringLength++] = (UChar) UNICODE_PERCENT_SIGN_CODEPOINT; /* adding % */ 469 uniValueString[valueStringLength++] = (UChar) UNICODE_X_CODEPOINT; /* adding X */ 470 uprv_itou (uniValueString + valueStringLength, VALUE_STRING_LENGTH - valueStringLength, (uint8_t) codeUnits[i++], 16, 2); 471 valueStringLength += 2; 472 } 473 } 474 } 475 /* reset the error */ 476 *err = U_ZERO_ERROR; 477 478 ucnv_cbToUWriteUChars(toArgs, uniValueString, valueStringLength, 0, err); 479 } 480 481 #endif 482