1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2003-2014, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: convtest.cpp 11 * encoding: US-ASCII 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2003jul15 16 * created by: Markus W. Scherer 17 * 18 * Test file for data-driven conversion tests. 19 */ 20 21 #include "unicode/utypes.h" 22 23 #if !UCONFIG_NO_LEGACY_CONVERSION 24 /* 25 * Note: Turning off all of convtest.cpp if !UCONFIG_NO_LEGACY_CONVERSION 26 * is slightly unnecessary - it removes tests for Unicode charsets 27 * like UTF-8 that should work. 28 * However, there is no easy way for the test to detect whether a test case 29 * is for a Unicode charset, so it would be difficult to only exclude those. 30 * Also, regular testing of ICU is done with all modules on, therefore 31 * not testing conversion for a custom configuration like this should be ok. 32 */ 33 34 #include "unicode/ucnv.h" 35 #include "unicode/unistr.h" 36 #include "unicode/parsepos.h" 37 #include "unicode/uniset.h" 38 #include "unicode/ustring.h" 39 #include "unicode/ures.h" 40 #include "convtest.h" 41 #include "cmemory.h" 42 #include "unicode/tstdtmod.h" 43 #include <string.h> 44 #include <stdlib.h> 45 46 enum { 47 // characters used in test data for callbacks 48 SUB_CB='?', 49 SKIP_CB='0', 50 STOP_CB='.', 51 ESC_CB='&' 52 }; 53 54 ConversionTest::ConversionTest() { 55 UErrorCode errorCode=U_ZERO_ERROR; 56 utf8Cnv=ucnv_open("UTF-8", &errorCode); 57 ucnv_setToUCallBack(utf8Cnv, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, &errorCode); 58 if(U_FAILURE(errorCode)) { 59 errln("unable to open UTF-8 converter"); 60 } 61 } 62 63 ConversionTest::~ConversionTest() { 64 ucnv_close(utf8Cnv); 65 } 66 67 void 68 ConversionTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) { 69 if (exec) logln("TestSuite ConversionTest: "); 70 switch (index) { 71 #if !UCONFIG_NO_FILE_IO 72 case 0: name="TestToUnicode"; if (exec) TestToUnicode(); break; 73 case 1: name="TestFromUnicode"; if (exec) TestFromUnicode(); break; 74 case 2: name="TestGetUnicodeSet"; if (exec) TestGetUnicodeSet(); break; 75 case 3: name="TestDefaultIgnorableCallback"; if (exec) TestDefaultIgnorableCallback(); break; 76 #else 77 case 0: 78 case 1: 79 case 2: 80 case 3: name="skip"; break; 81 #endif 82 case 4: name="TestGetUnicodeSet2"; if (exec) TestGetUnicodeSet2(); break; 83 default: name=""; break; //needed to end loop 84 } 85 } 86 87 // test data interface ----------------------------------------------------- *** 88 89 void 90 ConversionTest::TestToUnicode() { 91 ConversionCase cc; 92 char charset[100], cbopt[4]; 93 const char *option; 94 UnicodeString s, unicode; 95 int32_t offsetsLength; 96 UConverterToUCallback callback; 97 98 TestDataModule *dataModule; 99 TestData *testData; 100 const DataMap *testCase; 101 UErrorCode errorCode; 102 int32_t i; 103 104 errorCode=U_ZERO_ERROR; 105 dataModule=TestDataModule::getTestDataModule("conversion", *this, errorCode); 106 if(U_SUCCESS(errorCode)) { 107 testData=dataModule->createTestData("toUnicode", errorCode); 108 if(U_SUCCESS(errorCode)) { 109 for(i=0; testData->nextCase(testCase, errorCode); ++i) { 110 if(U_FAILURE(errorCode)) { 111 errln("error retrieving conversion/toUnicode test case %d - %s", 112 i, u_errorName(errorCode)); 113 errorCode=U_ZERO_ERROR; 114 continue; 115 } 116 117 cc.caseNr=i; 118 119 s=testCase->getString("charset", errorCode); 120 s.extract(0, 0x7fffffff, charset, sizeof(charset), ""); 121 cc.charset=charset; 122 123 // BEGIN android-added 124 // To save space, Android does not build full ISO-2022-CN tables. 125 // We skip the TestGetKeywordValuesForLocale for counting available collations. 126 if (strlen(charset) >= 8 && 127 strncmp(charset+4, "2022-CN", 4) == 0) { 128 continue; 129 } 130 // END android-added 131 132 cc.bytes=testCase->getBinary(cc.bytesLength, "bytes", errorCode); 133 unicode=testCase->getString("unicode", errorCode); 134 cc.unicode=unicode.getBuffer(); 135 cc.unicodeLength=unicode.length(); 136 137 offsetsLength=0; 138 cc.offsets=testCase->getIntVector(offsetsLength, "offsets", errorCode); 139 if(offsetsLength==0) { 140 cc.offsets=NULL; 141 } else if(offsetsLength!=unicode.length()) { 142 errln("toUnicode[%d] unicode[%d] and offsets[%d] must have the same length", 143 i, unicode.length(), offsetsLength); 144 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 145 } 146 147 cc.finalFlush= 0!=testCase->getInt28("flush", errorCode); 148 cc.fallbacks= 0!=testCase->getInt28("fallbacks", errorCode); 149 150 s=testCase->getString("errorCode", errorCode); 151 if(s==UNICODE_STRING("invalid", 7)) { 152 cc.outErrorCode=U_INVALID_CHAR_FOUND; 153 } else if(s==UNICODE_STRING("illegal", 7)) { 154 cc.outErrorCode=U_ILLEGAL_CHAR_FOUND; 155 } else if(s==UNICODE_STRING("truncated", 9)) { 156 cc.outErrorCode=U_TRUNCATED_CHAR_FOUND; 157 } else if(s==UNICODE_STRING("illesc", 6)) { 158 cc.outErrorCode=U_ILLEGAL_ESCAPE_SEQUENCE; 159 } else if(s==UNICODE_STRING("unsuppesc", 9)) { 160 cc.outErrorCode=U_UNSUPPORTED_ESCAPE_SEQUENCE; 161 } else { 162 cc.outErrorCode=U_ZERO_ERROR; 163 } 164 165 s=testCase->getString("callback", errorCode); 166 s.extract(0, 0x7fffffff, cbopt, sizeof(cbopt), ""); 167 cc.cbopt=cbopt; 168 switch(cbopt[0]) { 169 case SUB_CB: 170 callback=UCNV_TO_U_CALLBACK_SUBSTITUTE; 171 break; 172 case SKIP_CB: 173 callback=UCNV_TO_U_CALLBACK_SKIP; 174 break; 175 case STOP_CB: 176 callback=UCNV_TO_U_CALLBACK_STOP; 177 break; 178 case ESC_CB: 179 callback=UCNV_TO_U_CALLBACK_ESCAPE; 180 break; 181 default: 182 callback=NULL; 183 break; 184 } 185 option=callback==NULL ? cbopt : cbopt+1; 186 if(*option==0) { 187 option=NULL; 188 } 189 190 cc.invalidChars=testCase->getBinary(cc.invalidLength, "invalidChars", errorCode); 191 192 if(U_FAILURE(errorCode)) { 193 errln("error parsing conversion/toUnicode test case %d - %s", 194 i, u_errorName(errorCode)); 195 errorCode=U_ZERO_ERROR; 196 } else { 197 logln("TestToUnicode[%d] %s", i, charset); 198 ToUnicodeCase(cc, callback, option); 199 } 200 } 201 delete testData; 202 } 203 delete dataModule; 204 } 205 else { 206 dataerrln("Could not load test conversion data"); 207 } 208 } 209 210 void 211 ConversionTest::TestFromUnicode() { 212 ConversionCase cc; 213 char charset[100], cbopt[4]; 214 const char *option; 215 UnicodeString s, unicode, invalidUChars; 216 int32_t offsetsLength, index; 217 UConverterFromUCallback callback; 218 219 TestDataModule *dataModule; 220 TestData *testData; 221 const DataMap *testCase; 222 const UChar *p; 223 UErrorCode errorCode; 224 int32_t i, length; 225 226 errorCode=U_ZERO_ERROR; 227 dataModule=TestDataModule::getTestDataModule("conversion", *this, errorCode); 228 if(U_SUCCESS(errorCode)) { 229 testData=dataModule->createTestData("fromUnicode", errorCode); 230 if(U_SUCCESS(errorCode)) { 231 for(i=0; testData->nextCase(testCase, errorCode); ++i) { 232 if(U_FAILURE(errorCode)) { 233 errln("error retrieving conversion/fromUnicode test case %d - %s", 234 i, u_errorName(errorCode)); 235 errorCode=U_ZERO_ERROR; 236 continue; 237 } 238 239 cc.caseNr=i; 240 241 s=testCase->getString("charset", errorCode); 242 s.extract(0, 0x7fffffff, charset, sizeof(charset), ""); 243 cc.charset=charset; 244 245 // BEGIN android-added 246 // To save space, Android does not build full ISO-2022-CN tables. 247 // We skip the TestGetKeywordValuesForLocale for counting available collations. 248 if (strlen(charset) >= 8 && 249 strncmp(charset+4, "2022-CN", 4) == 0) { 250 continue; 251 } 252 // END android-added 253 254 unicode=testCase->getString("unicode", errorCode); 255 cc.unicode=unicode.getBuffer(); 256 cc.unicodeLength=unicode.length(); 257 cc.bytes=testCase->getBinary(cc.bytesLength, "bytes", errorCode); 258 259 offsetsLength=0; 260 cc.offsets=testCase->getIntVector(offsetsLength, "offsets", errorCode); 261 if(offsetsLength==0) { 262 cc.offsets=NULL; 263 } else if(offsetsLength!=cc.bytesLength) { 264 errln("fromUnicode[%d] bytes[%d] and offsets[%d] must have the same length", 265 i, cc.bytesLength, offsetsLength); 266 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 267 } 268 269 cc.finalFlush= 0!=testCase->getInt28("flush", errorCode); 270 cc.fallbacks= 0!=testCase->getInt28("fallbacks", errorCode); 271 272 s=testCase->getString("errorCode", errorCode); 273 if(s==UNICODE_STRING("invalid", 7)) { 274 cc.outErrorCode=U_INVALID_CHAR_FOUND; 275 } else if(s==UNICODE_STRING("illegal", 7)) { 276 cc.outErrorCode=U_ILLEGAL_CHAR_FOUND; 277 } else if(s==UNICODE_STRING("truncated", 9)) { 278 cc.outErrorCode=U_TRUNCATED_CHAR_FOUND; 279 } else { 280 cc.outErrorCode=U_ZERO_ERROR; 281 } 282 283 s=testCase->getString("callback", errorCode); 284 cc.setSub=0; // default: no subchar 285 286 if((index=s.indexOf((UChar)0))>0) { 287 // read NUL-separated subchar first, if any 288 // copy the subchar from Latin-1 characters 289 // start after the NUL 290 p=s.getTerminatedBuffer(); 291 length=index+1; 292 p+=length; 293 length=s.length()-length; 294 if(length<=0 || length>=(int32_t)sizeof(cc.subchar)) { 295 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 296 } else { 297 int32_t j; 298 299 for(j=0; j<length; ++j) { 300 cc.subchar[j]=(char)p[j]; 301 } 302 // NUL-terminate the subchar 303 cc.subchar[j]=0; 304 cc.setSub=1; 305 } 306 307 // remove the NUL and subchar from s 308 s.truncate(index); 309 } else if((index=s.indexOf((UChar)0x3d))>0) /* '=' */ { 310 // read a substitution string, separated by an equal sign 311 p=s.getBuffer()+index+1; 312 length=s.length()-(index+1); 313 if(length<0 || length>=UPRV_LENGTHOF(cc.subString)) { 314 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 315 } else { 316 u_memcpy(cc.subString, p, length); 317 // NUL-terminate the subString 318 cc.subString[length]=0; 319 cc.setSub=-1; 320 } 321 322 // remove the equal sign and subString from s 323 s.truncate(index); 324 } 325 326 s.extract(0, 0x7fffffff, cbopt, sizeof(cbopt), ""); 327 cc.cbopt=cbopt; 328 switch(cbopt[0]) { 329 case SUB_CB: 330 callback=UCNV_FROM_U_CALLBACK_SUBSTITUTE; 331 break; 332 case SKIP_CB: 333 callback=UCNV_FROM_U_CALLBACK_SKIP; 334 break; 335 case STOP_CB: 336 callback=UCNV_FROM_U_CALLBACK_STOP; 337 break; 338 case ESC_CB: 339 callback=UCNV_FROM_U_CALLBACK_ESCAPE; 340 break; 341 default: 342 callback=NULL; 343 break; 344 } 345 option=callback==NULL ? cbopt : cbopt+1; 346 if(*option==0) { 347 option=NULL; 348 } 349 350 invalidUChars=testCase->getString("invalidUChars", errorCode); 351 cc.invalidUChars=invalidUChars.getBuffer(); 352 cc.invalidLength=invalidUChars.length(); 353 354 if(U_FAILURE(errorCode)) { 355 errln("error parsing conversion/fromUnicode test case %d - %s", 356 i, u_errorName(errorCode)); 357 errorCode=U_ZERO_ERROR; 358 } else { 359 logln("TestFromUnicode[%d] %s", i, charset); 360 FromUnicodeCase(cc, callback, option); 361 } 362 } 363 delete testData; 364 } 365 delete dataModule; 366 } 367 else { 368 dataerrln("Could not load test conversion data"); 369 } 370 } 371 372 static const UChar ellipsis[]={ 0x2e, 0x2e, 0x2e }; 373 374 void 375 ConversionTest::TestGetUnicodeSet() { 376 char charset[100]; 377 UnicodeString s, map, mapnot; 378 int32_t which; 379 380 ParsePosition pos; 381 UnicodeSet cnvSet, mapSet, mapnotSet, diffSet; 382 UnicodeSet *cnvSetPtr = &cnvSet; 383 LocalUConverterPointer cnv; 384 385 TestDataModule *dataModule; 386 TestData *testData; 387 const DataMap *testCase; 388 UErrorCode errorCode; 389 int32_t i; 390 391 errorCode=U_ZERO_ERROR; 392 dataModule=TestDataModule::getTestDataModule("conversion", *this, errorCode); 393 if(U_SUCCESS(errorCode)) { 394 testData=dataModule->createTestData("getUnicodeSet", errorCode); 395 if(U_SUCCESS(errorCode)) { 396 for(i=0; testData->nextCase(testCase, errorCode); ++i) { 397 if(U_FAILURE(errorCode)) { 398 errln("error retrieving conversion/getUnicodeSet test case %d - %s", 399 i, u_errorName(errorCode)); 400 errorCode=U_ZERO_ERROR; 401 continue; 402 } 403 404 s=testCase->getString("charset", errorCode); 405 s.extract(0, 0x7fffffff, charset, sizeof(charset), ""); 406 407 // BEGIN android-added 408 // To save space, Android does not build full ISO-2022-CN tables. 409 // We skip the TestGetKeywordValuesForLocale for counting available collations. 410 if (strlen(charset) >= 8 && 411 strncmp(charset+4, "2022-CN", 4) == 0) { 412 continue; 413 } 414 // END android-added 415 416 map=testCase->getString("map", errorCode); 417 mapnot=testCase->getString("mapnot", errorCode); 418 419 which=testCase->getInt28("which", errorCode); 420 421 if(U_FAILURE(errorCode)) { 422 errln("error parsing conversion/getUnicodeSet test case %d - %s", 423 i, u_errorName(errorCode)); 424 errorCode=U_ZERO_ERROR; 425 continue; 426 } 427 428 // test this test case 429 mapSet.clear(); 430 mapnotSet.clear(); 431 432 pos.setIndex(0); 433 mapSet.applyPattern(map, pos, 0, NULL, errorCode); 434 if(U_FAILURE(errorCode) || pos.getIndex()!=map.length()) { 435 errln("error creating the map set for conversion/getUnicodeSet test case %d - %s\n" 436 " error index %d index %d U+%04x", 437 i, u_errorName(errorCode), pos.getErrorIndex(), pos.getIndex(), map.char32At(pos.getIndex())); 438 errorCode=U_ZERO_ERROR; 439 continue; 440 } 441 442 pos.setIndex(0); 443 mapnotSet.applyPattern(mapnot, pos, 0, NULL, errorCode); 444 if(U_FAILURE(errorCode) || pos.getIndex()!=mapnot.length()) { 445 errln("error creating the mapnot set for conversion/getUnicodeSet test case %d - %s\n" 446 " error index %d index %d U+%04x", 447 i, u_errorName(errorCode), pos.getErrorIndex(), pos.getIndex(), mapnot.char32At(pos.getIndex())); 448 errorCode=U_ZERO_ERROR; 449 continue; 450 } 451 452 logln("TestGetUnicodeSet[%d] %s", i, charset); 453 454 cnv.adoptInstead(cnv_open(charset, errorCode)); 455 if(U_FAILURE(errorCode)) { 456 errcheckln(errorCode, "error opening \"%s\" for conversion/getUnicodeSet test case %d - %s", 457 charset, i, u_errorName(errorCode)); 458 errorCode=U_ZERO_ERROR; 459 continue; 460 } 461 462 ucnv_getUnicodeSet(cnv.getAlias(), cnvSetPtr->toUSet(), (UConverterUnicodeSet)which, &errorCode); 463 464 if(U_FAILURE(errorCode)) { 465 errln("error in ucnv_getUnicodeSet(\"%s\") for conversion/getUnicodeSet test case %d - %s", 466 charset, i, u_errorName(errorCode)); 467 errorCode=U_ZERO_ERROR; 468 continue; 469 } 470 471 // are there items that must be in cnvSet but are not? 472 (diffSet=mapSet).removeAll(cnvSet); 473 if(!diffSet.isEmpty()) { 474 diffSet.toPattern(s, TRUE); 475 if(s.length()>100) { 476 s.replace(100, 0x7fffffff, ellipsis, UPRV_LENGTHOF(ellipsis)); 477 } 478 errln("error: ucnv_getUnicodeSet(\"%s\") is missing items - conversion/getUnicodeSet test case %d", 479 charset, i); 480 errln(s); 481 } 482 483 // are there items that must not be in cnvSet but are? 484 (diffSet=mapnotSet).retainAll(cnvSet); 485 if(!diffSet.isEmpty()) { 486 diffSet.toPattern(s, TRUE); 487 if(s.length()>100) { 488 s.replace(100, 0x7fffffff, ellipsis, UPRV_LENGTHOF(ellipsis)); 489 } 490 errln("error: ucnv_getUnicodeSet(\"%s\") contains unexpected items - conversion/getUnicodeSet test case %d", 491 charset, i); 492 errln(s); 493 } 494 } 495 delete testData; 496 } 497 delete dataModule; 498 } 499 else { 500 dataerrln("Could not load test conversion data"); 501 } 502 } 503 504 U_CDECL_BEGIN 505 static void U_CALLCONV 506 getUnicodeSetCallback(const void *context, 507 UConverterFromUnicodeArgs * /*fromUArgs*/, 508 const UChar* /*codeUnits*/, 509 int32_t /*length*/, 510 UChar32 codePoint, 511 UConverterCallbackReason reason, 512 UErrorCode *pErrorCode) { 513 if(reason<=UCNV_IRREGULAR) { 514 ((UnicodeSet *)context)->remove(codePoint); // the converter cannot convert this code point 515 *pErrorCode=U_ZERO_ERROR; // skip 516 } // else ignore the reset, close and clone calls. 517 } 518 U_CDECL_END 519 520 // Compare ucnv_getUnicodeSet() with the set of characters that can be converted. 521 void 522 ConversionTest::TestGetUnicodeSet2() { 523 // Build a string with all code points. 524 UChar32 cpLimit; 525 int32_t s0Length; 526 if(quick) { 527 cpLimit=s0Length=0x10000; // BMP only 528 } else { 529 cpLimit=0x110000; 530 s0Length=0x10000+0x200000; // BMP + surrogate pairs 531 } 532 UChar *s0=new UChar[s0Length]; 533 if(s0==NULL) { 534 return; 535 } 536 UChar *s=s0; 537 UChar32 c; 538 UChar c2; 539 // low BMP 540 for(c=0; c<=0xd7ff; ++c) { 541 *s++=(UChar)c; 542 } 543 // trail surrogates 544 for(c=0xdc00; c<=0xdfff; ++c) { 545 *s++=(UChar)c; 546 } 547 // lead surrogates 548 // (after trails so that there is not even one surrogate pair in between) 549 for(c=0xd800; c<=0xdbff; ++c) { 550 *s++=(UChar)c; 551 } 552 // high BMP 553 for(c=0xe000; c<=0xffff; ++c) { 554 *s++=(UChar)c; 555 } 556 // supplementary code points = surrogate pairs 557 if(cpLimit==0x110000) { 558 for(c=0xd800; c<=0xdbff; ++c) { 559 for(c2=0xdc00; c2<=0xdfff; ++c2) { 560 *s++=(UChar)c; 561 *s++=c2; 562 } 563 } 564 } 565 566 static const char *const cnvNames[]={ 567 "UTF-8", 568 "UTF-7", 569 "UTF-16", 570 "US-ASCII", 571 "ISO-8859-1", 572 "windows-1252", 573 "Shift-JIS", 574 "ibm-1390", // EBCDIC_STATEFUL table 575 "ibm-16684", // DBCS-only extension table based on EBCDIC_STATEFUL table 576 "HZ", 577 "ISO-2022-JP", 578 "JIS7", 579 "ISO-2022-CN", 580 "ISO-2022-CN-EXT", 581 "LMBCS" 582 }; 583 LocalUConverterPointer cnv; 584 char buffer[1024]; 585 int32_t i; 586 for(i=0; i<UPRV_LENGTHOF(cnvNames); ++i) { 587 UErrorCode errorCode=U_ZERO_ERROR; 588 cnv.adoptInstead(cnv_open(cnvNames[i], errorCode)); 589 if(U_FAILURE(errorCode)) { 590 errcheckln(errorCode, "failed to open converter %s - %s", cnvNames[i], u_errorName(errorCode)); 591 continue; 592 } 593 UnicodeSet expected; 594 ucnv_setFromUCallBack(cnv.getAlias(), getUnicodeSetCallback, &expected, NULL, NULL, &errorCode); 595 if(U_FAILURE(errorCode)) { 596 errln("failed to set the callback on converter %s - %s", cnvNames[i], u_errorName(errorCode)); 597 continue; 598 } 599 UConverterUnicodeSet which; 600 for(which=UCNV_ROUNDTRIP_SET; which<UCNV_SET_COUNT; which=(UConverterUnicodeSet)((int)which+1)) { 601 if(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) { 602 ucnv_setFallback(cnv.getAlias(), TRUE); 603 } 604 expected.add(0, cpLimit-1); 605 s=s0; 606 UBool flush; 607 do { 608 char *t=buffer; 609 flush=(UBool)(s==s0+s0Length); 610 ucnv_fromUnicode(cnv.getAlias(), &t, buffer+sizeof(buffer), (const UChar **)&s, s0+s0Length, NULL, flush, &errorCode); 611 if(U_FAILURE(errorCode)) { 612 if(errorCode==U_BUFFER_OVERFLOW_ERROR) { 613 errorCode=U_ZERO_ERROR; 614 continue; 615 } else { 616 break; // unexpected error, should not occur 617 } 618 } 619 } while(!flush); 620 UnicodeSet set; 621 ucnv_getUnicodeSet(cnv.getAlias(), set.toUSet(), which, &errorCode); 622 if(cpLimit<0x110000) { 623 set.remove(cpLimit, 0x10ffff); 624 } 625 if(which==UCNV_ROUNDTRIP_SET) { 626 // ignore PUA code points because they will be converted even if they 627 // are fallbacks and when other fallbacks are turned off, 628 // but ucnv_getUnicodeSet(UCNV_ROUNDTRIP_SET) delivers true roundtrips 629 expected.remove(0xe000, 0xf8ff); 630 expected.remove(0xf0000, 0xffffd); 631 expected.remove(0x100000, 0x10fffd); 632 set.remove(0xe000, 0xf8ff); 633 set.remove(0xf0000, 0xffffd); 634 set.remove(0x100000, 0x10fffd); 635 } 636 if(set!=expected) { 637 // First try to see if we have different sets because ucnv_getUnicodeSet() 638 // added strings: The above conversion method does not tell us what strings might be convertible. 639 // Remove strings from the set and compare again. 640 // Unfortunately, there are no good, direct set methods for finding out whether there are strings 641 // in the set, nor for enumerating or removing just them. 642 // Intersect all code points with the set. The intersection will not contain strings. 643 UnicodeSet temp(0, 0x10ffff); 644 temp.retainAll(set); 645 set=temp; 646 } 647 if(set!=expected) { 648 UnicodeSet diffSet; 649 UnicodeString out; 650 651 // are there items that must be in the set but are not? 652 (diffSet=expected).removeAll(set); 653 if(!diffSet.isEmpty()) { 654 diffSet.toPattern(out, TRUE); 655 if(out.length()>100) { 656 out.replace(100, 0x7fffffff, ellipsis, UPRV_LENGTHOF(ellipsis)); 657 } 658 errln("error: ucnv_getUnicodeSet(\"%s\") is missing items - which set: %d", 659 cnvNames[i], which); 660 errln(out); 661 } 662 663 // are there items that must not be in the set but are? 664 (diffSet=set).removeAll(expected); 665 if(!diffSet.isEmpty()) { 666 diffSet.toPattern(out, TRUE); 667 if(out.length()>100) { 668 out.replace(100, 0x7fffffff, ellipsis, UPRV_LENGTHOF(ellipsis)); 669 } 670 errln("error: ucnv_getUnicodeSet(\"%s\") contains unexpected items - which set: %d", 671 cnvNames[i], which); 672 errln(out); 673 } 674 } 675 } 676 } 677 678 delete [] s0; 679 } 680 681 // Test all codepoints which has the default ignorable Unicode property are ignored if they have no mapping 682 // If there are any failures, the hard coded list (IS_DEFAULT_IGNORABLE_CODE_POINT) in ucnv_err.c should be updated 683 void 684 ConversionTest::TestDefaultIgnorableCallback() { 685 UErrorCode status = U_ZERO_ERROR; 686 const char *cnv_name = "euc-jp-2007"; 687 const char *pattern_ignorable = "[:Default_Ignorable_Code_Point:]"; 688 const char *pattern_not_ignorable = "[:^Default_Ignorable_Code_Point:]"; 689 690 UnicodeSet *set_ignorable = new UnicodeSet(pattern_ignorable, status); 691 if (U_FAILURE(status)) { 692 dataerrln("Unable to create Unicodeset: %s - %s\n", pattern_ignorable, u_errorName(status)); 693 return; 694 } 695 696 UnicodeSet *set_not_ignorable = new UnicodeSet(pattern_not_ignorable, status); 697 if (U_FAILURE(status)) { 698 dataerrln("Unable to create Unicodeset: %s - %s\n", pattern_not_ignorable, u_errorName(status)); 699 return; 700 } 701 702 UConverter *cnv = cnv_open(cnv_name, status); 703 if (U_FAILURE(status)) { 704 dataerrln("Unable to open converter: %s - %s\n", cnv_name, u_errorName(status)); 705 return; 706 } 707 708 // set callback for the converter 709 ucnv_setFromUCallBack(cnv, UCNV_FROM_U_CALLBACK_SUBSTITUTE, NULL, NULL, NULL, &status); 710 711 UChar32 input[1]; 712 char output[10]; 713 int32_t outputLength; 714 715 // test default ignorables are ignored 716 int size = set_ignorable->size(); 717 for (int i = 0; i < size; i++) { 718 status = U_ZERO_ERROR; 719 outputLength= 0; 720 721 input[0] = set_ignorable->charAt(i); 722 723 outputLength = ucnv_fromUChars(cnv, output, 10, UnicodeString::fromUTF32(input, 1).getTerminatedBuffer(), -1, &status); 724 if (U_FAILURE(status) || outputLength != 0) { 725 errln("Ignorable code point: U+%04X not skipped as expected - %s", input[0], u_errorName(status)); 726 } 727 } 728 729 // test non-ignorables are not ignored 730 size = set_not_ignorable->size(); 731 for (int i = 0; i < size; i++) { 732 status = U_ZERO_ERROR; 733 outputLength= 0; 734 735 input[0] = set_not_ignorable->charAt(i); 736 737 if (input[0] == 0) { 738 continue; 739 } 740 741 outputLength = ucnv_fromUChars(cnv, output, 10, UnicodeString::fromUTF32(input, 1).getTerminatedBuffer(), -1, &status); 742 if (U_FAILURE(status) || outputLength <= 0) { 743 errln("Non-ignorable code point: U+%04X skipped unexpectedly - %s", input[0], u_errorName(status)); 744 } 745 } 746 747 ucnv_close(cnv); 748 delete set_not_ignorable; 749 delete set_ignorable; 750 } 751 752 // open testdata or ICU data converter ------------------------------------- *** 753 754 UConverter * 755 ConversionTest::cnv_open(const char *name, UErrorCode &errorCode) { 756 if(name!=NULL && *name=='+') { 757 // Converter names that start with '+' are ignored in ICU4J tests. 758 ++name; 759 } 760 if(name!=NULL && *name=='*') { 761 /* loadTestData(): set the data directory */ 762 return ucnv_openPackage(loadTestData(errorCode), name+1, &errorCode); 763 } else { 764 return ucnv_open(name, &errorCode); 765 } 766 } 767 768 // output helpers ---------------------------------------------------------- *** 769 770 static inline char 771 hexDigit(uint8_t digit) { 772 return digit<=9 ? (char)('0'+digit) : (char)('a'-10+digit); 773 } 774 775 static char * 776 printBytes(const uint8_t *bytes, int32_t length, char *out) { 777 uint8_t b; 778 779 if(length>0) { 780 b=*bytes++; 781 --length; 782 *out++=hexDigit((uint8_t)(b>>4)); 783 *out++=hexDigit((uint8_t)(b&0xf)); 784 } 785 786 while(length>0) { 787 b=*bytes++; 788 --length; 789 *out++=' '; 790 *out++=hexDigit((uint8_t)(b>>4)); 791 *out++=hexDigit((uint8_t)(b&0xf)); 792 } 793 *out++=0; 794 return out; 795 } 796 797 static char * 798 printUnicode(const UChar *unicode, int32_t length, char *out) { 799 UChar32 c; 800 int32_t i; 801 802 for(i=0; i<length;) { 803 if(i>0) { 804 *out++=' '; 805 } 806 U16_NEXT(unicode, i, length, c); 807 // write 4..6 digits 808 if(c>=0x100000) { 809 *out++='1'; 810 } 811 if(c>=0x10000) { 812 *out++=hexDigit((uint8_t)((c>>16)&0xf)); 813 } 814 *out++=hexDigit((uint8_t)((c>>12)&0xf)); 815 *out++=hexDigit((uint8_t)((c>>8)&0xf)); 816 *out++=hexDigit((uint8_t)((c>>4)&0xf)); 817 *out++=hexDigit((uint8_t)(c&0xf)); 818 } 819 *out++=0; 820 return out; 821 } 822 823 static char * 824 printOffsets(const int32_t *offsets, int32_t length, char *out) { 825 int32_t i, o, d; 826 827 if(offsets==NULL) { 828 length=0; 829 } 830 831 for(i=0; i<length; ++i) { 832 if(i>0) { 833 *out++=' '; 834 } 835 o=offsets[i]; 836 837 // print all offsets with 2 characters each (-x, -9..99, xx) 838 if(o<-9) { 839 *out++='-'; 840 *out++='x'; 841 } else if(o<0) { 842 *out++='-'; 843 *out++=(char)('0'-o); 844 } else if(o<=99) { 845 *out++=(d=o/10)==0 ? ' ' : (char)('0'+d); 846 *out++=(char)('0'+o%10); 847 } else /* o>99 */ { 848 *out++='x'; 849 *out++='x'; 850 } 851 } 852 *out++=0; 853 return out; 854 } 855 856 // toUnicode test worker functions ----------------------------------------- *** 857 858 static int32_t 859 stepToUnicode(ConversionCase &cc, UConverter *cnv, 860 UChar *result, int32_t resultCapacity, 861 int32_t *resultOffsets, /* also resultCapacity */ 862 int32_t step, 863 UErrorCode *pErrorCode) { 864 const char *source, *sourceLimit, *bytesLimit; 865 UChar *target, *targetLimit, *resultLimit; 866 UBool flush; 867 868 source=(const char *)cc.bytes; 869 target=result; 870 bytesLimit=source+cc.bytesLength; 871 resultLimit=result+resultCapacity; 872 873 if(step>=0) { 874 // call ucnv_toUnicode() with in/out buffers no larger than (step) at a time 875 // move only one buffer (in vs. out) at a time to be extra mean 876 // step==0 performs bulk conversion and generates offsets 877 878 // initialize the partial limits for the loop 879 if(step==0) { 880 // use the entire buffers 881 sourceLimit=bytesLimit; 882 targetLimit=resultLimit; 883 flush=cc.finalFlush; 884 } else { 885 // start with empty partial buffers 886 sourceLimit=source; 887 targetLimit=target; 888 flush=FALSE; 889 890 // output offsets only for bulk conversion 891 resultOffsets=NULL; 892 } 893 894 for(;;) { 895 // resetting the opposite conversion direction must not affect this one 896 ucnv_resetFromUnicode(cnv); 897 898 // convert 899 ucnv_toUnicode(cnv, 900 &target, targetLimit, 901 &source, sourceLimit, 902 resultOffsets, 903 flush, pErrorCode); 904 905 // check pointers and errors 906 if(source>sourceLimit || target>targetLimit) { 907 *pErrorCode=U_INTERNAL_PROGRAM_ERROR; 908 break; 909 } else if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) { 910 if(target!=targetLimit) { 911 // buffer overflow must only be set when the target is filled 912 *pErrorCode=U_INTERNAL_PROGRAM_ERROR; 913 break; 914 } else if(targetLimit==resultLimit) { 915 // not just a partial overflow 916 break; 917 } 918 919 // the partial target is filled, set a new limit, reset the error and continue 920 targetLimit=(resultLimit-target)>=step ? target+step : resultLimit; 921 *pErrorCode=U_ZERO_ERROR; 922 } else if(U_FAILURE(*pErrorCode)) { 923 // some other error occurred, done 924 break; 925 } else { 926 if(source!=sourceLimit) { 927 // when no error occurs, then the input must be consumed 928 *pErrorCode=U_INTERNAL_PROGRAM_ERROR; 929 break; 930 } 931 932 if(sourceLimit==bytesLimit) { 933 // we are done 934 break; 935 } 936 937 // the partial conversion succeeded, set a new limit and continue 938 sourceLimit=(bytesLimit-source)>=step ? source+step : bytesLimit; 939 flush=(UBool)(cc.finalFlush && sourceLimit==bytesLimit); 940 } 941 } 942 } else /* step<0 */ { 943 /* 944 * step==-1: call only ucnv_getNextUChar() 945 * otherwise alternate between ucnv_toUnicode() and ucnv_getNextUChar() 946 * if step==-2 or -3, then give ucnv_toUnicode() the whole remaining input, 947 * else give it at most (-step-2)/2 bytes 948 */ 949 UChar32 c; 950 951 // end the loop by getting an index out of bounds error 952 for(;;) { 953 // resetting the opposite conversion direction must not affect this one 954 ucnv_resetFromUnicode(cnv); 955 956 // convert 957 if((step&1)!=0 /* odd: -1, -3, -5, ... */) { 958 sourceLimit=source; // use sourceLimit not as a real limit 959 // but to remember the pre-getNextUChar source pointer 960 c=ucnv_getNextUChar(cnv, &source, bytesLimit, pErrorCode); 961 962 // check pointers and errors 963 if(*pErrorCode==U_INDEX_OUTOFBOUNDS_ERROR) { 964 if(source!=bytesLimit) { 965 *pErrorCode=U_INTERNAL_PROGRAM_ERROR; 966 } else { 967 *pErrorCode=U_ZERO_ERROR; 968 } 969 break; 970 } else if(U_FAILURE(*pErrorCode)) { 971 break; 972 } 973 // source may not move if c is from previous overflow 974 975 if(target==resultLimit) { 976 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 977 break; 978 } 979 if(c<=0xffff) { 980 *target++=(UChar)c; 981 } else { 982 *target++=U16_LEAD(c); 983 if(target==resultLimit) { 984 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 985 break; 986 } 987 *target++=U16_TRAIL(c); 988 } 989 990 // alternate between -n-1 and -n but leave -1 alone 991 if(step<-1) { 992 ++step; 993 } 994 } else /* step is even */ { 995 // allow only one UChar output 996 targetLimit=target<resultLimit ? target+1 : resultLimit; 997 998 // as with ucnv_getNextUChar(), we always flush (if we go to bytesLimit) 999 // and never output offsets 1000 if(step==-2) { 1001 sourceLimit=bytesLimit; 1002 } else { 1003 sourceLimit=source+(-step-2)/2; 1004 if(sourceLimit>bytesLimit) { 1005 sourceLimit=bytesLimit; 1006 } 1007 } 1008 1009 ucnv_toUnicode(cnv, 1010 &target, targetLimit, 1011 &source, sourceLimit, 1012 NULL, (UBool)(sourceLimit==bytesLimit), pErrorCode); 1013 1014 // check pointers and errors 1015 if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) { 1016 if(target!=targetLimit) { 1017 // buffer overflow must only be set when the target is filled 1018 *pErrorCode=U_INTERNAL_PROGRAM_ERROR; 1019 break; 1020 } else if(targetLimit==resultLimit) { 1021 // not just a partial overflow 1022 break; 1023 } 1024 1025 // the partial target is filled, set a new limit and continue 1026 *pErrorCode=U_ZERO_ERROR; 1027 } else if(U_FAILURE(*pErrorCode)) { 1028 // some other error occurred, done 1029 break; 1030 } else { 1031 if(source!=sourceLimit) { 1032 // when no error occurs, then the input must be consumed 1033 *pErrorCode=U_INTERNAL_PROGRAM_ERROR; 1034 break; 1035 } 1036 1037 // we are done (flush==TRUE) but we continue, to get the index out of bounds error above 1038 } 1039 1040 --step; 1041 } 1042 } 1043 } 1044 1045 return (int32_t)(target-result); 1046 } 1047 1048 UBool 1049 ConversionTest::ToUnicodeCase(ConversionCase &cc, UConverterToUCallback callback, const char *option) { 1050 // open the converter 1051 IcuTestErrorCode errorCode(*this, "ToUnicodeCase"); 1052 LocalUConverterPointer cnv(cnv_open(cc.charset, errorCode)); 1053 if(errorCode.isFailure()) { 1054 errcheckln(errorCode, "toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_open() failed - %s", 1055 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, errorCode.errorName()); 1056 errorCode.reset(); 1057 return FALSE; 1058 } 1059 1060 // set the callback 1061 if(callback!=NULL) { 1062 ucnv_setToUCallBack(cnv.getAlias(), callback, option, NULL, NULL, errorCode); 1063 if(U_FAILURE(errorCode)) { 1064 errln("toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_setToUCallBack() failed - %s", 1065 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode)); 1066 return FALSE; 1067 } 1068 } 1069 1070 int32_t resultOffsets[256]; 1071 UChar result[256]; 1072 int32_t resultLength; 1073 UBool ok; 1074 1075 static const struct { 1076 int32_t step; 1077 const char *name; 1078 } steps[]={ 1079 { 0, "bulk" }, // must be first for offsets to be checked 1080 { 1, "step=1" }, 1081 { 3, "step=3" }, 1082 { 7, "step=7" }, 1083 { -1, "getNext" }, 1084 { -2, "toU(bulk)+getNext" }, 1085 { -3, "getNext+toU(bulk)" }, 1086 { -4, "toU(1)+getNext" }, 1087 { -5, "getNext+toU(1)" }, 1088 { -12, "toU(5)+getNext" }, 1089 { -13, "getNext+toU(5)" }, 1090 }; 1091 int32_t i, step; 1092 1093 ok=TRUE; 1094 for(i=0; i<UPRV_LENGTHOF(steps) && ok; ++i) { 1095 step=steps[i].step; 1096 if(step<0 && !cc.finalFlush) { 1097 // skip ucnv_getNextUChar() if !finalFlush because 1098 // ucnv_getNextUChar() always implies flush 1099 continue; 1100 } 1101 if(step!=0) { 1102 // bulk test is first, then offsets are not checked any more 1103 cc.offsets=NULL; 1104 } 1105 else { 1106 memset(resultOffsets, -1, UPRV_LENGTHOF(resultOffsets)); 1107 } 1108 memset(result, -1, UPRV_LENGTHOF(result)); 1109 errorCode.reset(); 1110 resultLength=stepToUnicode(cc, cnv.getAlias(), 1111 result, UPRV_LENGTHOF(result), 1112 step==0 ? resultOffsets : NULL, 1113 step, errorCode); 1114 ok=checkToUnicode( 1115 cc, cnv.getAlias(), steps[i].name, 1116 result, resultLength, 1117 cc.offsets!=NULL ? resultOffsets : NULL, 1118 errorCode); 1119 if(errorCode.isFailure() || !cc.finalFlush) { 1120 // reset if an error occurred or we did not flush 1121 // otherwise do nothing to make sure that flushing resets 1122 ucnv_resetToUnicode(cnv.getAlias()); 1123 } 1124 if (cc.offsets != NULL && resultOffsets[resultLength] != -1) { 1125 errln("toUnicode[%d](%s) Conversion wrote too much to offsets at index %d", 1126 cc.caseNr, cc.charset, resultLength); 1127 } 1128 if (result[resultLength] != (UChar)-1) { 1129 errln("toUnicode[%d](%s) Conversion wrote too much to result at index %d", 1130 cc.caseNr, cc.charset, resultLength); 1131 } 1132 } 1133 1134 // not a real loop, just a convenience for breaking out of the block 1135 while(ok && cc.finalFlush) { 1136 // test ucnv_toUChars() 1137 memset(result, 0, sizeof(result)); 1138 1139 errorCode.reset(); 1140 resultLength=ucnv_toUChars(cnv.getAlias(), 1141 result, UPRV_LENGTHOF(result), 1142 (const char *)cc.bytes, cc.bytesLength, 1143 errorCode); 1144 ok=checkToUnicode( 1145 cc, cnv.getAlias(), "toUChars", 1146 result, resultLength, 1147 NULL, 1148 errorCode); 1149 if(!ok) { 1150 break; 1151 } 1152 1153 // test preflighting 1154 // keep the correct result for simple checking 1155 errorCode.reset(); 1156 resultLength=ucnv_toUChars(cnv.getAlias(), 1157 NULL, 0, 1158 (const char *)cc.bytes, cc.bytesLength, 1159 errorCode); 1160 if(errorCode.get()==U_STRING_NOT_TERMINATED_WARNING || errorCode.get()==U_BUFFER_OVERFLOW_ERROR) { 1161 errorCode.reset(); 1162 } 1163 ok=checkToUnicode( 1164 cc, cnv.getAlias(), "preflight toUChars", 1165 result, resultLength, 1166 NULL, 1167 errorCode); 1168 break; 1169 } 1170 1171 errorCode.reset(); // all errors have already been reported 1172 return ok; 1173 } 1174 1175 UBool 1176 ConversionTest::checkToUnicode(ConversionCase &cc, UConverter *cnv, const char *name, 1177 const UChar *result, int32_t resultLength, 1178 const int32_t *resultOffsets, 1179 UErrorCode resultErrorCode) { 1180 char resultInvalidChars[8]; 1181 int8_t resultInvalidLength; 1182 UErrorCode errorCode; 1183 1184 const char *msg; 1185 1186 // reset the message; NULL will mean "ok" 1187 msg=NULL; 1188 1189 errorCode=U_ZERO_ERROR; 1190 resultInvalidLength=sizeof(resultInvalidChars); 1191 ucnv_getInvalidChars(cnv, resultInvalidChars, &resultInvalidLength, &errorCode); 1192 if(U_FAILURE(errorCode)) { 1193 errln("toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) ucnv_getInvalidChars() failed - %s", 1194 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, u_errorName(errorCode)); 1195 return FALSE; 1196 } 1197 1198 // check everything that might have gone wrong 1199 if(cc.unicodeLength!=resultLength) { 1200 msg="wrong result length"; 1201 } else if(0!=u_memcmp(cc.unicode, result, cc.unicodeLength)) { 1202 msg="wrong result string"; 1203 } else if(cc.offsets!=NULL && 0!=memcmp(cc.offsets, resultOffsets, cc.unicodeLength*sizeof(*cc.offsets))) { 1204 msg="wrong offsets"; 1205 } else if(cc.outErrorCode!=resultErrorCode) { 1206 msg="wrong error code"; 1207 } else if(cc.invalidLength!=resultInvalidLength) { 1208 msg="wrong length of last invalid input"; 1209 } else if(0!=memcmp(cc.invalidChars, resultInvalidChars, cc.invalidLength)) { 1210 msg="wrong last invalid input"; 1211 } 1212 1213 if(msg==NULL) { 1214 return TRUE; 1215 } else { 1216 char buffer[2000]; // one buffer for all strings 1217 char *s, *bytesString, *unicodeString, *resultString, 1218 *offsetsString, *resultOffsetsString, 1219 *invalidCharsString, *resultInvalidCharsString; 1220 1221 bytesString=s=buffer; 1222 s=printBytes(cc.bytes, cc.bytesLength, bytesString); 1223 s=printUnicode(cc.unicode, cc.unicodeLength, unicodeString=s); 1224 s=printUnicode(result, resultLength, resultString=s); 1225 s=printOffsets(cc.offsets, cc.unicodeLength, offsetsString=s); 1226 s=printOffsets(resultOffsets, resultLength, resultOffsetsString=s); 1227 s=printBytes(cc.invalidChars, cc.invalidLength, invalidCharsString=s); 1228 s=printBytes((uint8_t *)resultInvalidChars, resultInvalidLength, resultInvalidCharsString=s); 1229 1230 if((s-buffer)>(int32_t)sizeof(buffer)) { 1231 errln("toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) fatal error: checkToUnicode() test output buffer overflow writing %d chars\n", 1232 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, (int)(s-buffer)); 1233 exit(1); 1234 } 1235 1236 errln("toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) failed: %s\n" 1237 " bytes <%s>[%d]\n" 1238 " expected <%s>[%d]\n" 1239 " result <%s>[%d]\n" 1240 " offsets <%s>\n" 1241 " result offsets <%s>\n" 1242 " error code expected %s got %s\n" 1243 " invalidChars expected <%s> got <%s>\n", 1244 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, msg, 1245 bytesString, cc.bytesLength, 1246 unicodeString, cc.unicodeLength, 1247 resultString, resultLength, 1248 offsetsString, 1249 resultOffsetsString, 1250 u_errorName(cc.outErrorCode), u_errorName(resultErrorCode), 1251 invalidCharsString, resultInvalidCharsString); 1252 1253 return FALSE; 1254 } 1255 } 1256 1257 // fromUnicode test worker functions --------------------------------------- *** 1258 1259 static int32_t 1260 stepFromUTF8(ConversionCase &cc, 1261 UConverter *utf8Cnv, UConverter *cnv, 1262 char *result, int32_t resultCapacity, 1263 int32_t step, 1264 UErrorCode *pErrorCode) { 1265 const char *source, *sourceLimit, *utf8Limit; 1266 UChar pivotBuffer[32]; 1267 UChar *pivotSource, *pivotTarget, *pivotLimit; 1268 char *target, *targetLimit, *resultLimit; 1269 UBool flush; 1270 1271 source=cc.utf8; 1272 pivotSource=pivotTarget=pivotBuffer; 1273 target=result; 1274 utf8Limit=source+cc.utf8Length; 1275 resultLimit=result+resultCapacity; 1276 1277 // call ucnv_convertEx() with in/out buffers no larger than (step) at a time 1278 // move only one buffer (in vs. out) at a time to be extra mean 1279 // step==0 performs bulk conversion 1280 1281 // initialize the partial limits for the loop 1282 if(step==0) { 1283 // use the entire buffers 1284 sourceLimit=utf8Limit; 1285 targetLimit=resultLimit; 1286 flush=cc.finalFlush; 1287 1288 pivotLimit=pivotBuffer+UPRV_LENGTHOF(pivotBuffer); 1289 } else { 1290 // start with empty partial buffers 1291 sourceLimit=source; 1292 targetLimit=target; 1293 flush=FALSE; 1294 1295 // empty pivot is not allowed, make it of length step 1296 pivotLimit=pivotBuffer+step; 1297 } 1298 1299 for(;;) { 1300 // resetting the opposite conversion direction must not affect this one 1301 ucnv_resetFromUnicode(utf8Cnv); 1302 ucnv_resetToUnicode(cnv); 1303 1304 // convert 1305 ucnv_convertEx(cnv, utf8Cnv, 1306 &target, targetLimit, 1307 &source, sourceLimit, 1308 pivotBuffer, &pivotSource, &pivotTarget, pivotLimit, 1309 FALSE, flush, pErrorCode); 1310 1311 // check pointers and errors 1312 if(source>sourceLimit || target>targetLimit) { 1313 *pErrorCode=U_INTERNAL_PROGRAM_ERROR; 1314 break; 1315 } else if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) { 1316 if(target!=targetLimit) { 1317 // buffer overflow must only be set when the target is filled 1318 *pErrorCode=U_INTERNAL_PROGRAM_ERROR; 1319 break; 1320 } else if(targetLimit==resultLimit) { 1321 // not just a partial overflow 1322 break; 1323 } 1324 1325 // the partial target is filled, set a new limit, reset the error and continue 1326 targetLimit=(resultLimit-target)>=step ? target+step : resultLimit; 1327 *pErrorCode=U_ZERO_ERROR; 1328 } else if(U_FAILURE(*pErrorCode)) { 1329 if(pivotSource==pivotBuffer) { 1330 // toUnicode error, should not occur 1331 // toUnicode errors are tested in cintltst TestConvertExFromUTF8() 1332 break; 1333 } else { 1334 // fromUnicode error 1335 // some other error occurred, done 1336 break; 1337 } 1338 } else { 1339 if(source!=sourceLimit) { 1340 // when no error occurs, then the input must be consumed 1341 *pErrorCode=U_INTERNAL_PROGRAM_ERROR; 1342 break; 1343 } 1344 1345 if(sourceLimit==utf8Limit) { 1346 // we are done 1347 if(*pErrorCode==U_STRING_NOT_TERMINATED_WARNING) { 1348 // ucnv_convertEx() warns about not terminating the output 1349 // but ucnv_fromUnicode() does not and so 1350 // checkFromUnicode() does not expect it 1351 *pErrorCode=U_ZERO_ERROR; 1352 } 1353 break; 1354 } 1355 1356 // the partial conversion succeeded, set a new limit and continue 1357 sourceLimit=(utf8Limit-source)>=step ? source+step : utf8Limit; 1358 flush=(UBool)(cc.finalFlush && sourceLimit==utf8Limit); 1359 } 1360 } 1361 1362 return (int32_t)(target-result); 1363 } 1364 1365 static int32_t 1366 stepFromUnicode(ConversionCase &cc, UConverter *cnv, 1367 char *result, int32_t resultCapacity, 1368 int32_t *resultOffsets, /* also resultCapacity */ 1369 int32_t step, 1370 UErrorCode *pErrorCode) { 1371 const UChar *source, *sourceLimit, *unicodeLimit; 1372 char *target, *targetLimit, *resultLimit; 1373 UBool flush; 1374 1375 source=cc.unicode; 1376 target=result; 1377 unicodeLimit=source+cc.unicodeLength; 1378 resultLimit=result+resultCapacity; 1379 1380 // call ucnv_fromUnicode() with in/out buffers no larger than (step) at a time 1381 // move only one buffer (in vs. out) at a time to be extra mean 1382 // step==0 performs bulk conversion and generates offsets 1383 1384 // initialize the partial limits for the loop 1385 if(step==0) { 1386 // use the entire buffers 1387 sourceLimit=unicodeLimit; 1388 targetLimit=resultLimit; 1389 flush=cc.finalFlush; 1390 } else { 1391 // start with empty partial buffers 1392 sourceLimit=source; 1393 targetLimit=target; 1394 flush=FALSE; 1395 1396 // output offsets only for bulk conversion 1397 resultOffsets=NULL; 1398 } 1399 1400 for(;;) { 1401 // resetting the opposite conversion direction must not affect this one 1402 ucnv_resetToUnicode(cnv); 1403 1404 // convert 1405 ucnv_fromUnicode(cnv, 1406 &target, targetLimit, 1407 &source, sourceLimit, 1408 resultOffsets, 1409 flush, pErrorCode); 1410 1411 // check pointers and errors 1412 if(source>sourceLimit || target>targetLimit) { 1413 *pErrorCode=U_INTERNAL_PROGRAM_ERROR; 1414 break; 1415 } else if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) { 1416 if(target!=targetLimit) { 1417 // buffer overflow must only be set when the target is filled 1418 *pErrorCode=U_INTERNAL_PROGRAM_ERROR; 1419 break; 1420 } else if(targetLimit==resultLimit) { 1421 // not just a partial overflow 1422 break; 1423 } 1424 1425 // the partial target is filled, set a new limit, reset the error and continue 1426 targetLimit=(resultLimit-target)>=step ? target+step : resultLimit; 1427 *pErrorCode=U_ZERO_ERROR; 1428 } else if(U_FAILURE(*pErrorCode)) { 1429 // some other error occurred, done 1430 break; 1431 } else { 1432 if(source!=sourceLimit) { 1433 // when no error occurs, then the input must be consumed 1434 *pErrorCode=U_INTERNAL_PROGRAM_ERROR; 1435 break; 1436 } 1437 1438 if(sourceLimit==unicodeLimit) { 1439 // we are done 1440 break; 1441 } 1442 1443 // the partial conversion succeeded, set a new limit and continue 1444 sourceLimit=(unicodeLimit-source)>=step ? source+step : unicodeLimit; 1445 flush=(UBool)(cc.finalFlush && sourceLimit==unicodeLimit); 1446 } 1447 } 1448 1449 return (int32_t)(target-result); 1450 } 1451 1452 UBool 1453 ConversionTest::FromUnicodeCase(ConversionCase &cc, UConverterFromUCallback callback, const char *option) { 1454 UConverter *cnv; 1455 UErrorCode errorCode; 1456 1457 // open the converter 1458 errorCode=U_ZERO_ERROR; 1459 cnv=cnv_open(cc.charset, errorCode); 1460 if(U_FAILURE(errorCode)) { 1461 errcheckln(errorCode, "fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_open() failed - %s", 1462 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode)); 1463 return FALSE; 1464 } 1465 ucnv_resetToUnicode(utf8Cnv); 1466 1467 // set the callback 1468 if(callback!=NULL) { 1469 ucnv_setFromUCallBack(cnv, callback, option, NULL, NULL, &errorCode); 1470 if(U_FAILURE(errorCode)) { 1471 errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_setFromUCallBack() failed - %s", 1472 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode)); 1473 ucnv_close(cnv); 1474 return FALSE; 1475 } 1476 } 1477 1478 // set the fallbacks flag 1479 // TODO change with Jitterbug 2401, then add a similar call for toUnicode too 1480 ucnv_setFallback(cnv, cc.fallbacks); 1481 1482 // set the subchar 1483 int32_t length; 1484 1485 if(cc.setSub>0) { 1486 length=(int32_t)strlen(cc.subchar); 1487 ucnv_setSubstChars(cnv, cc.subchar, (int8_t)length, &errorCode); 1488 if(U_FAILURE(errorCode)) { 1489 errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_setSubstChars() failed - %s", 1490 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode)); 1491 ucnv_close(cnv); 1492 return FALSE; 1493 } 1494 } else if(cc.setSub<0) { 1495 ucnv_setSubstString(cnv, cc.subString, -1, &errorCode); 1496 if(U_FAILURE(errorCode)) { 1497 errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_setSubstString() failed - %s", 1498 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode)); 1499 ucnv_close(cnv); 1500 return FALSE; 1501 } 1502 } 1503 1504 // convert unicode to utf8 1505 char utf8[256]; 1506 cc.utf8=utf8; 1507 u_strToUTF8(utf8, UPRV_LENGTHOF(utf8), &cc.utf8Length, 1508 cc.unicode, cc.unicodeLength, 1509 &errorCode); 1510 if(U_FAILURE(errorCode)) { 1511 // skip UTF-8 testing of a string with an unpaired surrogate, 1512 // or of one that's too long 1513 // toUnicode errors are tested in cintltst TestConvertExFromUTF8() 1514 cc.utf8Length=-1; 1515 } 1516 1517 int32_t resultOffsets[256]; 1518 char result[256]; 1519 int32_t resultLength; 1520 UBool ok; 1521 1522 static const struct { 1523 int32_t step; 1524 const char *name, *utf8Name; 1525 } steps[]={ 1526 { 0, "bulk", "utf8" }, // must be first for offsets to be checked 1527 { 1, "step=1", "utf8 step=1" }, 1528 { 3, "step=3", "utf8 step=3" }, 1529 { 7, "step=7", "utf8 step=7" } 1530 }; 1531 int32_t i, step; 1532 1533 ok=TRUE; 1534 for(i=0; i<UPRV_LENGTHOF(steps) && ok; ++i) { 1535 step=steps[i].step; 1536 memset(resultOffsets, -1, UPRV_LENGTHOF(resultOffsets)); 1537 memset(result, -1, UPRV_LENGTHOF(result)); 1538 errorCode=U_ZERO_ERROR; 1539 resultLength=stepFromUnicode(cc, cnv, 1540 result, UPRV_LENGTHOF(result), 1541 step==0 ? resultOffsets : NULL, 1542 step, &errorCode); 1543 ok=checkFromUnicode( 1544 cc, cnv, steps[i].name, 1545 (uint8_t *)result, resultLength, 1546 cc.offsets!=NULL ? resultOffsets : NULL, 1547 errorCode); 1548 if(U_FAILURE(errorCode) || !cc.finalFlush) { 1549 // reset if an error occurred or we did not flush 1550 // otherwise do nothing to make sure that flushing resets 1551 ucnv_resetFromUnicode(cnv); 1552 } 1553 if (resultOffsets[resultLength] != -1) { 1554 errln("fromUnicode[%d](%s) Conversion wrote too much to offsets at index %d", 1555 cc.caseNr, cc.charset, resultLength); 1556 } 1557 if (result[resultLength] != (char)-1) { 1558 errln("fromUnicode[%d](%s) Conversion wrote too much to result at index %d", 1559 cc.caseNr, cc.charset, resultLength); 1560 } 1561 1562 // bulk test is first, then offsets are not checked any more 1563 cc.offsets=NULL; 1564 1565 // test direct conversion from UTF-8 1566 if(cc.utf8Length>=0) { 1567 errorCode=U_ZERO_ERROR; 1568 resultLength=stepFromUTF8(cc, utf8Cnv, cnv, 1569 result, UPRV_LENGTHOF(result), 1570 step, &errorCode); 1571 ok=checkFromUnicode( 1572 cc, cnv, steps[i].utf8Name, 1573 (uint8_t *)result, resultLength, 1574 NULL, 1575 errorCode); 1576 if(U_FAILURE(errorCode) || !cc.finalFlush) { 1577 // reset if an error occurred or we did not flush 1578 // otherwise do nothing to make sure that flushing resets 1579 ucnv_resetToUnicode(utf8Cnv); 1580 ucnv_resetFromUnicode(cnv); 1581 } 1582 } 1583 } 1584 1585 // not a real loop, just a convenience for breaking out of the block 1586 while(ok && cc.finalFlush) { 1587 // test ucnv_fromUChars() 1588 memset(result, 0, sizeof(result)); 1589 1590 errorCode=U_ZERO_ERROR; 1591 resultLength=ucnv_fromUChars(cnv, 1592 result, UPRV_LENGTHOF(result), 1593 cc.unicode, cc.unicodeLength, 1594 &errorCode); 1595 ok=checkFromUnicode( 1596 cc, cnv, "fromUChars", 1597 (uint8_t *)result, resultLength, 1598 NULL, 1599 errorCode); 1600 if(!ok) { 1601 break; 1602 } 1603 1604 // test preflighting 1605 // keep the correct result for simple checking 1606 errorCode=U_ZERO_ERROR; 1607 resultLength=ucnv_fromUChars(cnv, 1608 NULL, 0, 1609 cc.unicode, cc.unicodeLength, 1610 &errorCode); 1611 if(errorCode==U_STRING_NOT_TERMINATED_WARNING || errorCode==U_BUFFER_OVERFLOW_ERROR) { 1612 errorCode=U_ZERO_ERROR; 1613 } 1614 ok=checkFromUnicode( 1615 cc, cnv, "preflight fromUChars", 1616 (uint8_t *)result, resultLength, 1617 NULL, 1618 errorCode); 1619 break; 1620 } 1621 1622 ucnv_close(cnv); 1623 return ok; 1624 } 1625 1626 UBool 1627 ConversionTest::checkFromUnicode(ConversionCase &cc, UConverter *cnv, const char *name, 1628 const uint8_t *result, int32_t resultLength, 1629 const int32_t *resultOffsets, 1630 UErrorCode resultErrorCode) { 1631 UChar resultInvalidUChars[8]; 1632 int8_t resultInvalidLength; 1633 UErrorCode errorCode; 1634 1635 const char *msg; 1636 1637 // reset the message; NULL will mean "ok" 1638 msg=NULL; 1639 1640 errorCode=U_ZERO_ERROR; 1641 resultInvalidLength=UPRV_LENGTHOF(resultInvalidUChars); 1642 ucnv_getInvalidUChars(cnv, resultInvalidUChars, &resultInvalidLength, &errorCode); 1643 if(U_FAILURE(errorCode)) { 1644 errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) ucnv_getInvalidUChars() failed - %s", 1645 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, u_errorName(errorCode)); 1646 return FALSE; 1647 } 1648 1649 // check everything that might have gone wrong 1650 if(cc.bytesLength!=resultLength) { 1651 msg="wrong result length"; 1652 } else if(0!=memcmp(cc.bytes, result, cc.bytesLength)) { 1653 msg="wrong result string"; 1654 } else if(cc.offsets!=NULL && 0!=memcmp(cc.offsets, resultOffsets, cc.bytesLength*sizeof(*cc.offsets))) { 1655 msg="wrong offsets"; 1656 } else if(cc.outErrorCode!=resultErrorCode) { 1657 msg="wrong error code"; 1658 } else if(cc.invalidLength!=resultInvalidLength) { 1659 msg="wrong length of last invalid input"; 1660 } else if(0!=u_memcmp(cc.invalidUChars, resultInvalidUChars, cc.invalidLength)) { 1661 msg="wrong last invalid input"; 1662 } 1663 1664 if(msg==NULL) { 1665 return TRUE; 1666 } else { 1667 char buffer[2000]; // one buffer for all strings 1668 char *s, *unicodeString, *bytesString, *resultString, 1669 *offsetsString, *resultOffsetsString, 1670 *invalidCharsString, *resultInvalidUCharsString; 1671 1672 unicodeString=s=buffer; 1673 s=printUnicode(cc.unicode, cc.unicodeLength, unicodeString); 1674 s=printBytes(cc.bytes, cc.bytesLength, bytesString=s); 1675 s=printBytes(result, resultLength, resultString=s); 1676 s=printOffsets(cc.offsets, cc.bytesLength, offsetsString=s); 1677 s=printOffsets(resultOffsets, resultLength, resultOffsetsString=s); 1678 s=printUnicode(cc.invalidUChars, cc.invalidLength, invalidCharsString=s); 1679 s=printUnicode(resultInvalidUChars, resultInvalidLength, resultInvalidUCharsString=s); 1680 1681 if((s-buffer)>(int32_t)sizeof(buffer)) { 1682 errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) fatal error: checkFromUnicode() test output buffer overflow writing %d chars\n", 1683 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, (int)(s-buffer)); 1684 exit(1); 1685 } 1686 1687 errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) failed: %s\n" 1688 " unicode <%s>[%d]\n" 1689 " expected <%s>[%d]\n" 1690 " result <%s>[%d]\n" 1691 " offsets <%s>\n" 1692 " result offsets <%s>\n" 1693 " error code expected %s got %s\n" 1694 " invalidChars expected <%s> got <%s>\n", 1695 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, msg, 1696 unicodeString, cc.unicodeLength, 1697 bytesString, cc.bytesLength, 1698 resultString, resultLength, 1699 offsetsString, 1700 resultOffsetsString, 1701 u_errorName(cc.outErrorCode), u_errorName(resultErrorCode), 1702 invalidCharsString, resultInvalidUCharsString); 1703 1704 return FALSE; 1705 } 1706 } 1707 1708 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */ 1709