1 /* 2 ******************************************************************************* 3 * 4 * Copyright (C) 2003-2014, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 * 7 ******************************************************************************* 8 * file name: convtest.cpp 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2003jul15 14 * created by: Markus W. Scherer 15 * 16 * Test file for data-driven conversion tests. 17 */ 18 19 #include "unicode/utypes.h" 20 21 #if !UCONFIG_NO_LEGACY_CONVERSION 22 /* 23 * Note: Turning off all of convtest.cpp if !UCONFIG_NO_LEGACY_CONVERSION 24 * is slightly unnecessary - it removes tests for Unicode charsets 25 * like UTF-8 that should work. 26 * However, there is no easy way for the test to detect whether a test case 27 * is for a Unicode charset, so it would be difficult to only exclude those. 28 * Also, regular testing of ICU is done with all modules on, therefore 29 * not testing conversion for a custom configuration like this should be ok. 30 */ 31 32 #include "unicode/ucnv.h" 33 #include "unicode/unistr.h" 34 #include "unicode/parsepos.h" 35 #include "unicode/uniset.h" 36 #include "unicode/ustring.h" 37 #include "unicode/ures.h" 38 #include "convtest.h" 39 #include "cmemory.h" 40 #include "unicode/tstdtmod.h" 41 #include <string.h> 42 #include <stdlib.h> 43 44 enum { 45 // characters used in test data for callbacks 46 SUB_CB='?', 47 SKIP_CB='0', 48 STOP_CB='.', 49 ESC_CB='&' 50 }; 51 52 ConversionTest::ConversionTest() { 53 UErrorCode errorCode=U_ZERO_ERROR; 54 utf8Cnv=ucnv_open("UTF-8", &errorCode); 55 ucnv_setToUCallBack(utf8Cnv, UCNV_TO_U_CALLBACK_STOP, NULL, NULL, NULL, &errorCode); 56 if(U_FAILURE(errorCode)) { 57 errln("unable to open UTF-8 converter"); 58 } 59 } 60 61 ConversionTest::~ConversionTest() { 62 ucnv_close(utf8Cnv); 63 } 64 65 void 66 ConversionTest::runIndexedTest(int32_t index, UBool exec, const char *&name, char * /*par*/) { 67 if (exec) logln("TestSuite ConversionTest: "); 68 switch (index) { 69 #if !UCONFIG_NO_FILE_IO 70 case 0: name="TestToUnicode"; if (exec) TestToUnicode(); break; 71 case 1: name="TestFromUnicode"; if (exec) TestFromUnicode(); break; 72 case 2: name="TestGetUnicodeSet"; if (exec) TestGetUnicodeSet(); break; 73 case 3: name="TestDefaultIgnorableCallback"; if (exec) TestDefaultIgnorableCallback(); break; 74 #else 75 case 0: 76 case 1: 77 case 2: 78 case 3: name="skip"; break; 79 #endif 80 case 4: name="TestGetUnicodeSet2"; if (exec) TestGetUnicodeSet2(); break; 81 default: name=""; break; //needed to end loop 82 } 83 } 84 85 // test data interface ----------------------------------------------------- *** 86 87 void 88 ConversionTest::TestToUnicode() { 89 ConversionCase cc; 90 char charset[100], cbopt[4]; 91 const char *option; 92 UnicodeString s, unicode; 93 int32_t offsetsLength; 94 UConverterToUCallback callback; 95 96 TestDataModule *dataModule; 97 TestData *testData; 98 const DataMap *testCase; 99 UErrorCode errorCode; 100 int32_t i; 101 102 errorCode=U_ZERO_ERROR; 103 dataModule=TestDataModule::getTestDataModule("conversion", *this, errorCode); 104 if(U_SUCCESS(errorCode)) { 105 testData=dataModule->createTestData("toUnicode", errorCode); 106 if(U_SUCCESS(errorCode)) { 107 for(i=0; testData->nextCase(testCase, errorCode); ++i) { 108 if(U_FAILURE(errorCode)) { 109 errln("error retrieving conversion/toUnicode test case %d - %s", 110 i, u_errorName(errorCode)); 111 errorCode=U_ZERO_ERROR; 112 continue; 113 } 114 115 cc.caseNr=i; 116 117 s=testCase->getString("charset", errorCode); 118 s.extract(0, 0x7fffffff, charset, sizeof(charset), ""); 119 cc.charset=charset; 120 121 // BEGIN android-added 122 // To save space, Android does not build full ISO-2022-CN tables. 123 // We skip the TestGetKeywordValuesForLocale for counting available collations. 124 if (strlen(charset) >= 8 && 125 strncmp(charset+4, "2022-CN", 4) == 0) { 126 continue; 127 } 128 // END android-added 129 130 cc.bytes=testCase->getBinary(cc.bytesLength, "bytes", errorCode); 131 unicode=testCase->getString("unicode", errorCode); 132 cc.unicode=unicode.getBuffer(); 133 cc.unicodeLength=unicode.length(); 134 135 offsetsLength=0; 136 cc.offsets=testCase->getIntVector(offsetsLength, "offsets", errorCode); 137 if(offsetsLength==0) { 138 cc.offsets=NULL; 139 } else if(offsetsLength!=unicode.length()) { 140 errln("toUnicode[%d] unicode[%d] and offsets[%d] must have the same length", 141 i, unicode.length(), offsetsLength); 142 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 143 } 144 145 cc.finalFlush= 0!=testCase->getInt28("flush", errorCode); 146 cc.fallbacks= 0!=testCase->getInt28("fallbacks", errorCode); 147 148 s=testCase->getString("errorCode", errorCode); 149 if(s==UNICODE_STRING("invalid", 7)) { 150 cc.outErrorCode=U_INVALID_CHAR_FOUND; 151 } else if(s==UNICODE_STRING("illegal", 7)) { 152 cc.outErrorCode=U_ILLEGAL_CHAR_FOUND; 153 } else if(s==UNICODE_STRING("truncated", 9)) { 154 cc.outErrorCode=U_TRUNCATED_CHAR_FOUND; 155 } else if(s==UNICODE_STRING("illesc", 6)) { 156 cc.outErrorCode=U_ILLEGAL_ESCAPE_SEQUENCE; 157 } else if(s==UNICODE_STRING("unsuppesc", 9)) { 158 cc.outErrorCode=U_UNSUPPORTED_ESCAPE_SEQUENCE; 159 } else { 160 cc.outErrorCode=U_ZERO_ERROR; 161 } 162 163 s=testCase->getString("callback", errorCode); 164 s.extract(0, 0x7fffffff, cbopt, sizeof(cbopt), ""); 165 cc.cbopt=cbopt; 166 switch(cbopt[0]) { 167 case SUB_CB: 168 callback=UCNV_TO_U_CALLBACK_SUBSTITUTE; 169 break; 170 case SKIP_CB: 171 callback=UCNV_TO_U_CALLBACK_SKIP; 172 break; 173 case STOP_CB: 174 callback=UCNV_TO_U_CALLBACK_STOP; 175 break; 176 case ESC_CB: 177 callback=UCNV_TO_U_CALLBACK_ESCAPE; 178 break; 179 default: 180 callback=NULL; 181 break; 182 } 183 option=callback==NULL ? cbopt : cbopt+1; 184 if(*option==0) { 185 option=NULL; 186 } 187 188 cc.invalidChars=testCase->getBinary(cc.invalidLength, "invalidChars", errorCode); 189 190 if(U_FAILURE(errorCode)) { 191 errln("error parsing conversion/toUnicode test case %d - %s", 192 i, u_errorName(errorCode)); 193 errorCode=U_ZERO_ERROR; 194 } else { 195 logln("TestToUnicode[%d] %s", i, charset); 196 ToUnicodeCase(cc, callback, option); 197 } 198 } 199 delete testData; 200 } 201 delete dataModule; 202 } 203 else { 204 dataerrln("Could not load test conversion data"); 205 } 206 } 207 208 void 209 ConversionTest::TestFromUnicode() { 210 ConversionCase cc; 211 char charset[100], cbopt[4]; 212 const char *option; 213 UnicodeString s, unicode, invalidUChars; 214 int32_t offsetsLength, index; 215 UConverterFromUCallback callback; 216 217 TestDataModule *dataModule; 218 TestData *testData; 219 const DataMap *testCase; 220 const UChar *p; 221 UErrorCode errorCode; 222 int32_t i, length; 223 224 errorCode=U_ZERO_ERROR; 225 dataModule=TestDataModule::getTestDataModule("conversion", *this, errorCode); 226 if(U_SUCCESS(errorCode)) { 227 testData=dataModule->createTestData("fromUnicode", errorCode); 228 if(U_SUCCESS(errorCode)) { 229 for(i=0; testData->nextCase(testCase, errorCode); ++i) { 230 if(U_FAILURE(errorCode)) { 231 errln("error retrieving conversion/fromUnicode test case %d - %s", 232 i, u_errorName(errorCode)); 233 errorCode=U_ZERO_ERROR; 234 continue; 235 } 236 237 cc.caseNr=i; 238 239 s=testCase->getString("charset", errorCode); 240 s.extract(0, 0x7fffffff, charset, sizeof(charset), ""); 241 cc.charset=charset; 242 243 // BEGIN android-added 244 // To save space, Android does not build full ISO-2022-CN tables. 245 // We skip the TestGetKeywordValuesForLocale for counting available collations. 246 if (strlen(charset) >= 8 && 247 strncmp(charset+4, "2022-CN", 4) == 0) { 248 continue; 249 } 250 // END android-added 251 252 unicode=testCase->getString("unicode", errorCode); 253 cc.unicode=unicode.getBuffer(); 254 cc.unicodeLength=unicode.length(); 255 cc.bytes=testCase->getBinary(cc.bytesLength, "bytes", errorCode); 256 257 offsetsLength=0; 258 cc.offsets=testCase->getIntVector(offsetsLength, "offsets", errorCode); 259 if(offsetsLength==0) { 260 cc.offsets=NULL; 261 } else if(offsetsLength!=cc.bytesLength) { 262 errln("fromUnicode[%d] bytes[%d] and offsets[%d] must have the same length", 263 i, cc.bytesLength, offsetsLength); 264 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 265 } 266 267 cc.finalFlush= 0!=testCase->getInt28("flush", errorCode); 268 cc.fallbacks= 0!=testCase->getInt28("fallbacks", errorCode); 269 270 s=testCase->getString("errorCode", errorCode); 271 if(s==UNICODE_STRING("invalid", 7)) { 272 cc.outErrorCode=U_INVALID_CHAR_FOUND; 273 } else if(s==UNICODE_STRING("illegal", 7)) { 274 cc.outErrorCode=U_ILLEGAL_CHAR_FOUND; 275 } else if(s==UNICODE_STRING("truncated", 9)) { 276 cc.outErrorCode=U_TRUNCATED_CHAR_FOUND; 277 } else { 278 cc.outErrorCode=U_ZERO_ERROR; 279 } 280 281 s=testCase->getString("callback", errorCode); 282 cc.setSub=0; // default: no subchar 283 284 if((index=s.indexOf((UChar)0))>0) { 285 // read NUL-separated subchar first, if any 286 // copy the subchar from Latin-1 characters 287 // start after the NUL 288 p=s.getTerminatedBuffer(); 289 length=index+1; 290 p+=length; 291 length=s.length()-length; 292 if(length<=0 || length>=(int32_t)sizeof(cc.subchar)) { 293 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 294 } else { 295 int32_t j; 296 297 for(j=0; j<length; ++j) { 298 cc.subchar[j]=(char)p[j]; 299 } 300 // NUL-terminate the subchar 301 cc.subchar[j]=0; 302 cc.setSub=1; 303 } 304 305 // remove the NUL and subchar from s 306 s.truncate(index); 307 } else if((index=s.indexOf((UChar)0x3d))>0) /* '=' */ { 308 // read a substitution string, separated by an equal sign 309 p=s.getBuffer()+index+1; 310 length=s.length()-(index+1); 311 if(length<0 || length>=UPRV_LENGTHOF(cc.subString)) { 312 errorCode=U_ILLEGAL_ARGUMENT_ERROR; 313 } else { 314 u_memcpy(cc.subString, p, length); 315 // NUL-terminate the subString 316 cc.subString[length]=0; 317 cc.setSub=-1; 318 } 319 320 // remove the equal sign and subString from s 321 s.truncate(index); 322 } 323 324 s.extract(0, 0x7fffffff, cbopt, sizeof(cbopt), ""); 325 cc.cbopt=cbopt; 326 switch(cbopt[0]) { 327 case SUB_CB: 328 callback=UCNV_FROM_U_CALLBACK_SUBSTITUTE; 329 break; 330 case SKIP_CB: 331 callback=UCNV_FROM_U_CALLBACK_SKIP; 332 break; 333 case STOP_CB: 334 callback=UCNV_FROM_U_CALLBACK_STOP; 335 break; 336 case ESC_CB: 337 callback=UCNV_FROM_U_CALLBACK_ESCAPE; 338 break; 339 default: 340 callback=NULL; 341 break; 342 } 343 option=callback==NULL ? cbopt : cbopt+1; 344 if(*option==0) { 345 option=NULL; 346 } 347 348 invalidUChars=testCase->getString("invalidUChars", errorCode); 349 cc.invalidUChars=invalidUChars.getBuffer(); 350 cc.invalidLength=invalidUChars.length(); 351 352 if(U_FAILURE(errorCode)) { 353 errln("error parsing conversion/fromUnicode test case %d - %s", 354 i, u_errorName(errorCode)); 355 errorCode=U_ZERO_ERROR; 356 } else { 357 logln("TestFromUnicode[%d] %s", i, charset); 358 FromUnicodeCase(cc, callback, option); 359 } 360 } 361 delete testData; 362 } 363 delete dataModule; 364 } 365 else { 366 dataerrln("Could not load test conversion data"); 367 } 368 } 369 370 static const UChar ellipsis[]={ 0x2e, 0x2e, 0x2e }; 371 372 void 373 ConversionTest::TestGetUnicodeSet() { 374 char charset[100]; 375 UnicodeString s, map, mapnot; 376 int32_t which; 377 378 ParsePosition pos; 379 UnicodeSet cnvSet, mapSet, mapnotSet, diffSet; 380 UnicodeSet *cnvSetPtr = &cnvSet; 381 LocalUConverterPointer cnv; 382 383 TestDataModule *dataModule; 384 TestData *testData; 385 const DataMap *testCase; 386 UErrorCode errorCode; 387 int32_t i; 388 389 errorCode=U_ZERO_ERROR; 390 dataModule=TestDataModule::getTestDataModule("conversion", *this, errorCode); 391 if(U_SUCCESS(errorCode)) { 392 testData=dataModule->createTestData("getUnicodeSet", errorCode); 393 if(U_SUCCESS(errorCode)) { 394 for(i=0; testData->nextCase(testCase, errorCode); ++i) { 395 if(U_FAILURE(errorCode)) { 396 errln("error retrieving conversion/getUnicodeSet test case %d - %s", 397 i, u_errorName(errorCode)); 398 errorCode=U_ZERO_ERROR; 399 continue; 400 } 401 402 s=testCase->getString("charset", errorCode); 403 s.extract(0, 0x7fffffff, charset, sizeof(charset), ""); 404 405 // BEGIN android-added 406 // To save space, Android does not build full ISO-2022-CN tables. 407 // We skip the TestGetKeywordValuesForLocale for counting available collations. 408 if (strlen(charset) >= 8 && 409 strncmp(charset+4, "2022-CN", 4) == 0) { 410 continue; 411 } 412 // END android-added 413 414 map=testCase->getString("map", errorCode); 415 mapnot=testCase->getString("mapnot", errorCode); 416 417 which=testCase->getInt28("which", errorCode); 418 419 if(U_FAILURE(errorCode)) { 420 errln("error parsing conversion/getUnicodeSet test case %d - %s", 421 i, u_errorName(errorCode)); 422 errorCode=U_ZERO_ERROR; 423 continue; 424 } 425 426 // test this test case 427 mapSet.clear(); 428 mapnotSet.clear(); 429 430 pos.setIndex(0); 431 mapSet.applyPattern(map, pos, 0, NULL, errorCode); 432 if(U_FAILURE(errorCode) || pos.getIndex()!=map.length()) { 433 errln("error creating the map set for conversion/getUnicodeSet test case %d - %s\n" 434 " error index %d index %d U+%04x", 435 i, u_errorName(errorCode), pos.getErrorIndex(), pos.getIndex(), map.char32At(pos.getIndex())); 436 errorCode=U_ZERO_ERROR; 437 continue; 438 } 439 440 pos.setIndex(0); 441 mapnotSet.applyPattern(mapnot, pos, 0, NULL, errorCode); 442 if(U_FAILURE(errorCode) || pos.getIndex()!=mapnot.length()) { 443 errln("error creating the mapnot set for conversion/getUnicodeSet test case %d - %s\n" 444 " error index %d index %d U+%04x", 445 i, u_errorName(errorCode), pos.getErrorIndex(), pos.getIndex(), mapnot.char32At(pos.getIndex())); 446 errorCode=U_ZERO_ERROR; 447 continue; 448 } 449 450 logln("TestGetUnicodeSet[%d] %s", i, charset); 451 452 cnv.adoptInstead(cnv_open(charset, errorCode)); 453 if(U_FAILURE(errorCode)) { 454 errcheckln(errorCode, "error opening \"%s\" for conversion/getUnicodeSet test case %d - %s", 455 charset, i, u_errorName(errorCode)); 456 errorCode=U_ZERO_ERROR; 457 continue; 458 } 459 460 ucnv_getUnicodeSet(cnv.getAlias(), cnvSetPtr->toUSet(), (UConverterUnicodeSet)which, &errorCode); 461 462 if(U_FAILURE(errorCode)) { 463 errln("error in ucnv_getUnicodeSet(\"%s\") for conversion/getUnicodeSet test case %d - %s", 464 charset, i, u_errorName(errorCode)); 465 errorCode=U_ZERO_ERROR; 466 continue; 467 } 468 469 // are there items that must be in cnvSet but are not? 470 (diffSet=mapSet).removeAll(cnvSet); 471 if(!diffSet.isEmpty()) { 472 diffSet.toPattern(s, TRUE); 473 if(s.length()>100) { 474 s.replace(100, 0x7fffffff, ellipsis, UPRV_LENGTHOF(ellipsis)); 475 } 476 errln("error: ucnv_getUnicodeSet(\"%s\") is missing items - conversion/getUnicodeSet test case %d", 477 charset, i); 478 errln(s); 479 } 480 481 // are there items that must not be in cnvSet but are? 482 (diffSet=mapnotSet).retainAll(cnvSet); 483 if(!diffSet.isEmpty()) { 484 diffSet.toPattern(s, TRUE); 485 if(s.length()>100) { 486 s.replace(100, 0x7fffffff, ellipsis, UPRV_LENGTHOF(ellipsis)); 487 } 488 errln("error: ucnv_getUnicodeSet(\"%s\") contains unexpected items - conversion/getUnicodeSet test case %d", 489 charset, i); 490 errln(s); 491 } 492 } 493 delete testData; 494 } 495 delete dataModule; 496 } 497 else { 498 dataerrln("Could not load test conversion data"); 499 } 500 } 501 502 U_CDECL_BEGIN 503 static void U_CALLCONV 504 getUnicodeSetCallback(const void *context, 505 UConverterFromUnicodeArgs * /*fromUArgs*/, 506 const UChar* /*codeUnits*/, 507 int32_t /*length*/, 508 UChar32 codePoint, 509 UConverterCallbackReason reason, 510 UErrorCode *pErrorCode) { 511 if(reason<=UCNV_IRREGULAR) { 512 ((UnicodeSet *)context)->remove(codePoint); // the converter cannot convert this code point 513 *pErrorCode=U_ZERO_ERROR; // skip 514 } // else ignore the reset, close and clone calls. 515 } 516 U_CDECL_END 517 518 // Compare ucnv_getUnicodeSet() with the set of characters that can be converted. 519 void 520 ConversionTest::TestGetUnicodeSet2() { 521 // Build a string with all code points. 522 UChar32 cpLimit; 523 int32_t s0Length; 524 if(quick) { 525 cpLimit=s0Length=0x10000; // BMP only 526 } else { 527 cpLimit=0x110000; 528 s0Length=0x10000+0x200000; // BMP + surrogate pairs 529 } 530 UChar *s0=new UChar[s0Length]; 531 if(s0==NULL) { 532 return; 533 } 534 UChar *s=s0; 535 UChar32 c; 536 UChar c2; 537 // low BMP 538 for(c=0; c<=0xd7ff; ++c) { 539 *s++=(UChar)c; 540 } 541 // trail surrogates 542 for(c=0xdc00; c<=0xdfff; ++c) { 543 *s++=(UChar)c; 544 } 545 // lead surrogates 546 // (after trails so that there is not even one surrogate pair in between) 547 for(c=0xd800; c<=0xdbff; ++c) { 548 *s++=(UChar)c; 549 } 550 // high BMP 551 for(c=0xe000; c<=0xffff; ++c) { 552 *s++=(UChar)c; 553 } 554 // supplementary code points = surrogate pairs 555 if(cpLimit==0x110000) { 556 for(c=0xd800; c<=0xdbff; ++c) { 557 for(c2=0xdc00; c2<=0xdfff; ++c2) { 558 *s++=(UChar)c; 559 *s++=c2; 560 } 561 } 562 } 563 564 static const char *const cnvNames[]={ 565 "UTF-8", 566 "UTF-7", 567 "UTF-16", 568 "US-ASCII", 569 "ISO-8859-1", 570 "windows-1252", 571 "Shift-JIS", 572 "ibm-1390", // EBCDIC_STATEFUL table 573 "ibm-16684", // DBCS-only extension table based on EBCDIC_STATEFUL table 574 "HZ", 575 "ISO-2022-JP", 576 "JIS7", 577 "ISO-2022-CN", 578 "ISO-2022-CN-EXT", 579 "LMBCS" 580 }; 581 LocalUConverterPointer cnv; 582 char buffer[1024]; 583 int32_t i; 584 for(i=0; i<UPRV_LENGTHOF(cnvNames); ++i) { 585 UErrorCode errorCode=U_ZERO_ERROR; 586 cnv.adoptInstead(cnv_open(cnvNames[i], errorCode)); 587 if(U_FAILURE(errorCode)) { 588 errcheckln(errorCode, "failed to open converter %s - %s", cnvNames[i], u_errorName(errorCode)); 589 continue; 590 } 591 UnicodeSet expected; 592 ucnv_setFromUCallBack(cnv.getAlias(), getUnicodeSetCallback, &expected, NULL, NULL, &errorCode); 593 if(U_FAILURE(errorCode)) { 594 errln("failed to set the callback on converter %s - %s", cnvNames[i], u_errorName(errorCode)); 595 continue; 596 } 597 UConverterUnicodeSet which; 598 for(which=UCNV_ROUNDTRIP_SET; which<UCNV_SET_COUNT; which=(UConverterUnicodeSet)((int)which+1)) { 599 if(which==UCNV_ROUNDTRIP_AND_FALLBACK_SET) { 600 ucnv_setFallback(cnv.getAlias(), TRUE); 601 } 602 expected.add(0, cpLimit-1); 603 s=s0; 604 UBool flush; 605 do { 606 char *t=buffer; 607 flush=(UBool)(s==s0+s0Length); 608 ucnv_fromUnicode(cnv.getAlias(), &t, buffer+sizeof(buffer), (const UChar **)&s, s0+s0Length, NULL, flush, &errorCode); 609 if(U_FAILURE(errorCode)) { 610 if(errorCode==U_BUFFER_OVERFLOW_ERROR) { 611 errorCode=U_ZERO_ERROR; 612 continue; 613 } else { 614 break; // unexpected error, should not occur 615 } 616 } 617 } while(!flush); 618 UnicodeSet set; 619 ucnv_getUnicodeSet(cnv.getAlias(), set.toUSet(), which, &errorCode); 620 if(cpLimit<0x110000) { 621 set.remove(cpLimit, 0x10ffff); 622 } 623 if(which==UCNV_ROUNDTRIP_SET) { 624 // ignore PUA code points because they will be converted even if they 625 // are fallbacks and when other fallbacks are turned off, 626 // but ucnv_getUnicodeSet(UCNV_ROUNDTRIP_SET) delivers true roundtrips 627 expected.remove(0xe000, 0xf8ff); 628 expected.remove(0xf0000, 0xffffd); 629 expected.remove(0x100000, 0x10fffd); 630 set.remove(0xe000, 0xf8ff); 631 set.remove(0xf0000, 0xffffd); 632 set.remove(0x100000, 0x10fffd); 633 } 634 if(set!=expected) { 635 // First try to see if we have different sets because ucnv_getUnicodeSet() 636 // added strings: The above conversion method does not tell us what strings might be convertible. 637 // Remove strings from the set and compare again. 638 // Unfortunately, there are no good, direct set methods for finding out whether there are strings 639 // in the set, nor for enumerating or removing just them. 640 // Intersect all code points with the set. The intersection will not contain strings. 641 UnicodeSet temp(0, 0x10ffff); 642 temp.retainAll(set); 643 set=temp; 644 } 645 if(set!=expected) { 646 UnicodeSet diffSet; 647 UnicodeString out; 648 649 // are there items that must be in the set but are not? 650 (diffSet=expected).removeAll(set); 651 if(!diffSet.isEmpty()) { 652 diffSet.toPattern(out, TRUE); 653 if(out.length()>100) { 654 out.replace(100, 0x7fffffff, ellipsis, UPRV_LENGTHOF(ellipsis)); 655 } 656 errln("error: ucnv_getUnicodeSet(\"%s\") is missing items - which set: %d", 657 cnvNames[i], which); 658 errln(out); 659 } 660 661 // are there items that must not be in the set but are? 662 (diffSet=set).removeAll(expected); 663 if(!diffSet.isEmpty()) { 664 diffSet.toPattern(out, TRUE); 665 if(out.length()>100) { 666 out.replace(100, 0x7fffffff, ellipsis, UPRV_LENGTHOF(ellipsis)); 667 } 668 errln("error: ucnv_getUnicodeSet(\"%s\") contains unexpected items - which set: %d", 669 cnvNames[i], which); 670 errln(out); 671 } 672 } 673 } 674 } 675 676 delete [] s0; 677 } 678 679 // Test all codepoints which has the default ignorable Unicode property are ignored if they have no mapping 680 // If there are any failures, the hard coded list (IS_DEFAULT_IGNORABLE_CODE_POINT) in ucnv_err.c should be updated 681 void 682 ConversionTest::TestDefaultIgnorableCallback() { 683 UErrorCode status = U_ZERO_ERROR; 684 const char *cnv_name = "euc-jp-2007"; 685 const char *pattern_ignorable = "[:Default_Ignorable_Code_Point:]"; 686 const char *pattern_not_ignorable = "[:^Default_Ignorable_Code_Point:]"; 687 688 UnicodeSet *set_ignorable = new UnicodeSet(pattern_ignorable, status); 689 if (U_FAILURE(status)) { 690 dataerrln("Unable to create Unicodeset: %s - %s\n", pattern_ignorable, u_errorName(status)); 691 return; 692 } 693 694 UnicodeSet *set_not_ignorable = new UnicodeSet(pattern_not_ignorable, status); 695 if (U_FAILURE(status)) { 696 dataerrln("Unable to create Unicodeset: %s - %s\n", pattern_not_ignorable, u_errorName(status)); 697 return; 698 } 699 700 UConverter *cnv = cnv_open(cnv_name, status); 701 if (U_FAILURE(status)) { 702 dataerrln("Unable to open converter: %s - %s\n", cnv_name, u_errorName(status)); 703 return; 704 } 705 706 // set callback for the converter 707 ucnv_setFromUCallBack(cnv, UCNV_FROM_U_CALLBACK_SUBSTITUTE, NULL, NULL, NULL, &status); 708 709 UChar32 input[1]; 710 char output[10]; 711 int32_t outputLength; 712 713 // test default ignorables are ignored 714 int size = set_ignorable->size(); 715 for (int i = 0; i < size; i++) { 716 status = U_ZERO_ERROR; 717 outputLength= 0; 718 719 input[0] = set_ignorable->charAt(i); 720 721 outputLength = ucnv_fromUChars(cnv, output, 10, UnicodeString::fromUTF32(input, 1).getTerminatedBuffer(), -1, &status); 722 if (U_FAILURE(status) || outputLength != 0) { 723 errln("Ignorable code point: U+%04X not skipped as expected - %s", input[0], u_errorName(status)); 724 } 725 } 726 727 // test non-ignorables are not ignored 728 size = set_not_ignorable->size(); 729 for (int i = 0; i < size; i++) { 730 status = U_ZERO_ERROR; 731 outputLength= 0; 732 733 input[0] = set_not_ignorable->charAt(i); 734 735 if (input[0] == 0) { 736 continue; 737 } 738 739 outputLength = ucnv_fromUChars(cnv, output, 10, UnicodeString::fromUTF32(input, 1).getTerminatedBuffer(), -1, &status); 740 if (U_FAILURE(status) || outputLength <= 0) { 741 errln("Non-ignorable code point: U+%04X skipped unexpectedly - %s", input[0], u_errorName(status)); 742 } 743 } 744 745 ucnv_close(cnv); 746 delete set_not_ignorable; 747 delete set_ignorable; 748 } 749 750 // open testdata or ICU data converter ------------------------------------- *** 751 752 UConverter * 753 ConversionTest::cnv_open(const char *name, UErrorCode &errorCode) { 754 if(name!=NULL && *name=='+') { 755 // Converter names that start with '+' are ignored in ICU4J tests. 756 ++name; 757 } 758 if(name!=NULL && *name=='*') { 759 /* loadTestData(): set the data directory */ 760 return ucnv_openPackage(loadTestData(errorCode), name+1, &errorCode); 761 } else { 762 return ucnv_open(name, &errorCode); 763 } 764 } 765 766 // output helpers ---------------------------------------------------------- *** 767 768 static inline char 769 hexDigit(uint8_t digit) { 770 return digit<=9 ? (char)('0'+digit) : (char)('a'-10+digit); 771 } 772 773 static char * 774 printBytes(const uint8_t *bytes, int32_t length, char *out) { 775 uint8_t b; 776 777 if(length>0) { 778 b=*bytes++; 779 --length; 780 *out++=hexDigit((uint8_t)(b>>4)); 781 *out++=hexDigit((uint8_t)(b&0xf)); 782 } 783 784 while(length>0) { 785 b=*bytes++; 786 --length; 787 *out++=' '; 788 *out++=hexDigit((uint8_t)(b>>4)); 789 *out++=hexDigit((uint8_t)(b&0xf)); 790 } 791 *out++=0; 792 return out; 793 } 794 795 static char * 796 printUnicode(const UChar *unicode, int32_t length, char *out) { 797 UChar32 c; 798 int32_t i; 799 800 for(i=0; i<length;) { 801 if(i>0) { 802 *out++=' '; 803 } 804 U16_NEXT(unicode, i, length, c); 805 // write 4..6 digits 806 if(c>=0x100000) { 807 *out++='1'; 808 } 809 if(c>=0x10000) { 810 *out++=hexDigit((uint8_t)((c>>16)&0xf)); 811 } 812 *out++=hexDigit((uint8_t)((c>>12)&0xf)); 813 *out++=hexDigit((uint8_t)((c>>8)&0xf)); 814 *out++=hexDigit((uint8_t)((c>>4)&0xf)); 815 *out++=hexDigit((uint8_t)(c&0xf)); 816 } 817 *out++=0; 818 return out; 819 } 820 821 static char * 822 printOffsets(const int32_t *offsets, int32_t length, char *out) { 823 int32_t i, o, d; 824 825 if(offsets==NULL) { 826 length=0; 827 } 828 829 for(i=0; i<length; ++i) { 830 if(i>0) { 831 *out++=' '; 832 } 833 o=offsets[i]; 834 835 // print all offsets with 2 characters each (-x, -9..99, xx) 836 if(o<-9) { 837 *out++='-'; 838 *out++='x'; 839 } else if(o<0) { 840 *out++='-'; 841 *out++=(char)('0'-o); 842 } else if(o<=99) { 843 *out++=(d=o/10)==0 ? ' ' : (char)('0'+d); 844 *out++=(char)('0'+o%10); 845 } else /* o>99 */ { 846 *out++='x'; 847 *out++='x'; 848 } 849 } 850 *out++=0; 851 return out; 852 } 853 854 // toUnicode test worker functions ----------------------------------------- *** 855 856 static int32_t 857 stepToUnicode(ConversionCase &cc, UConverter *cnv, 858 UChar *result, int32_t resultCapacity, 859 int32_t *resultOffsets, /* also resultCapacity */ 860 int32_t step, 861 UErrorCode *pErrorCode) { 862 const char *source, *sourceLimit, *bytesLimit; 863 UChar *target, *targetLimit, *resultLimit; 864 UBool flush; 865 866 source=(const char *)cc.bytes; 867 target=result; 868 bytesLimit=source+cc.bytesLength; 869 resultLimit=result+resultCapacity; 870 871 if(step>=0) { 872 // call ucnv_toUnicode() with in/out buffers no larger than (step) at a time 873 // move only one buffer (in vs. out) at a time to be extra mean 874 // step==0 performs bulk conversion and generates offsets 875 876 // initialize the partial limits for the loop 877 if(step==0) { 878 // use the entire buffers 879 sourceLimit=bytesLimit; 880 targetLimit=resultLimit; 881 flush=cc.finalFlush; 882 } else { 883 // start with empty partial buffers 884 sourceLimit=source; 885 targetLimit=target; 886 flush=FALSE; 887 888 // output offsets only for bulk conversion 889 resultOffsets=NULL; 890 } 891 892 for(;;) { 893 // resetting the opposite conversion direction must not affect this one 894 ucnv_resetFromUnicode(cnv); 895 896 // convert 897 ucnv_toUnicode(cnv, 898 &target, targetLimit, 899 &source, sourceLimit, 900 resultOffsets, 901 flush, pErrorCode); 902 903 // check pointers and errors 904 if(source>sourceLimit || target>targetLimit) { 905 *pErrorCode=U_INTERNAL_PROGRAM_ERROR; 906 break; 907 } else if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) { 908 if(target!=targetLimit) { 909 // buffer overflow must only be set when the target is filled 910 *pErrorCode=U_INTERNAL_PROGRAM_ERROR; 911 break; 912 } else if(targetLimit==resultLimit) { 913 // not just a partial overflow 914 break; 915 } 916 917 // the partial target is filled, set a new limit, reset the error and continue 918 targetLimit=(resultLimit-target)>=step ? target+step : resultLimit; 919 *pErrorCode=U_ZERO_ERROR; 920 } else if(U_FAILURE(*pErrorCode)) { 921 // some other error occurred, done 922 break; 923 } else { 924 if(source!=sourceLimit) { 925 // when no error occurs, then the input must be consumed 926 *pErrorCode=U_INTERNAL_PROGRAM_ERROR; 927 break; 928 } 929 930 if(sourceLimit==bytesLimit) { 931 // we are done 932 break; 933 } 934 935 // the partial conversion succeeded, set a new limit and continue 936 sourceLimit=(bytesLimit-source)>=step ? source+step : bytesLimit; 937 flush=(UBool)(cc.finalFlush && sourceLimit==bytesLimit); 938 } 939 } 940 } else /* step<0 */ { 941 /* 942 * step==-1: call only ucnv_getNextUChar() 943 * otherwise alternate between ucnv_toUnicode() and ucnv_getNextUChar() 944 * if step==-2 or -3, then give ucnv_toUnicode() the whole remaining input, 945 * else give it at most (-step-2)/2 bytes 946 */ 947 UChar32 c; 948 949 // end the loop by getting an index out of bounds error 950 for(;;) { 951 // resetting the opposite conversion direction must not affect this one 952 ucnv_resetFromUnicode(cnv); 953 954 // convert 955 if((step&1)!=0 /* odd: -1, -3, -5, ... */) { 956 sourceLimit=source; // use sourceLimit not as a real limit 957 // but to remember the pre-getNextUChar source pointer 958 c=ucnv_getNextUChar(cnv, &source, bytesLimit, pErrorCode); 959 960 // check pointers and errors 961 if(*pErrorCode==U_INDEX_OUTOFBOUNDS_ERROR) { 962 if(source!=bytesLimit) { 963 *pErrorCode=U_INTERNAL_PROGRAM_ERROR; 964 } else { 965 *pErrorCode=U_ZERO_ERROR; 966 } 967 break; 968 } else if(U_FAILURE(*pErrorCode)) { 969 break; 970 } 971 // source may not move if c is from previous overflow 972 973 if(target==resultLimit) { 974 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 975 break; 976 } 977 if(c<=0xffff) { 978 *target++=(UChar)c; 979 } else { 980 *target++=U16_LEAD(c); 981 if(target==resultLimit) { 982 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; 983 break; 984 } 985 *target++=U16_TRAIL(c); 986 } 987 988 // alternate between -n-1 and -n but leave -1 alone 989 if(step<-1) { 990 ++step; 991 } 992 } else /* step is even */ { 993 // allow only one UChar output 994 targetLimit=target<resultLimit ? target+1 : resultLimit; 995 996 // as with ucnv_getNextUChar(), we always flush (if we go to bytesLimit) 997 // and never output offsets 998 if(step==-2) { 999 sourceLimit=bytesLimit; 1000 } else { 1001 sourceLimit=source+(-step-2)/2; 1002 if(sourceLimit>bytesLimit) { 1003 sourceLimit=bytesLimit; 1004 } 1005 } 1006 1007 ucnv_toUnicode(cnv, 1008 &target, targetLimit, 1009 &source, sourceLimit, 1010 NULL, (UBool)(sourceLimit==bytesLimit), pErrorCode); 1011 1012 // check pointers and errors 1013 if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) { 1014 if(target!=targetLimit) { 1015 // buffer overflow must only be set when the target is filled 1016 *pErrorCode=U_INTERNAL_PROGRAM_ERROR; 1017 break; 1018 } else if(targetLimit==resultLimit) { 1019 // not just a partial overflow 1020 break; 1021 } 1022 1023 // the partial target is filled, set a new limit and continue 1024 *pErrorCode=U_ZERO_ERROR; 1025 } else if(U_FAILURE(*pErrorCode)) { 1026 // some other error occurred, done 1027 break; 1028 } else { 1029 if(source!=sourceLimit) { 1030 // when no error occurs, then the input must be consumed 1031 *pErrorCode=U_INTERNAL_PROGRAM_ERROR; 1032 break; 1033 } 1034 1035 // we are done (flush==TRUE) but we continue, to get the index out of bounds error above 1036 } 1037 1038 --step; 1039 } 1040 } 1041 } 1042 1043 return (int32_t)(target-result); 1044 } 1045 1046 UBool 1047 ConversionTest::ToUnicodeCase(ConversionCase &cc, UConverterToUCallback callback, const char *option) { 1048 // open the converter 1049 IcuTestErrorCode errorCode(*this, "ToUnicodeCase"); 1050 LocalUConverterPointer cnv(cnv_open(cc.charset, errorCode)); 1051 if(errorCode.isFailure()) { 1052 errcheckln(errorCode, "toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_open() failed - %s", 1053 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, errorCode.errorName()); 1054 errorCode.reset(); 1055 return FALSE; 1056 } 1057 1058 // set the callback 1059 if(callback!=NULL) { 1060 ucnv_setToUCallBack(cnv.getAlias(), callback, option, NULL, NULL, errorCode); 1061 if(U_FAILURE(errorCode)) { 1062 errln("toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_setToUCallBack() failed - %s", 1063 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode)); 1064 return FALSE; 1065 } 1066 } 1067 1068 int32_t resultOffsets[256]; 1069 UChar result[256]; 1070 int32_t resultLength; 1071 UBool ok; 1072 1073 static const struct { 1074 int32_t step; 1075 const char *name; 1076 } steps[]={ 1077 { 0, "bulk" }, // must be first for offsets to be checked 1078 { 1, "step=1" }, 1079 { 3, "step=3" }, 1080 { 7, "step=7" }, 1081 { -1, "getNext" }, 1082 { -2, "toU(bulk)+getNext" }, 1083 { -3, "getNext+toU(bulk)" }, 1084 { -4, "toU(1)+getNext" }, 1085 { -5, "getNext+toU(1)" }, 1086 { -12, "toU(5)+getNext" }, 1087 { -13, "getNext+toU(5)" }, 1088 }; 1089 int32_t i, step; 1090 1091 ok=TRUE; 1092 for(i=0; i<UPRV_LENGTHOF(steps) && ok; ++i) { 1093 step=steps[i].step; 1094 if(step<0 && !cc.finalFlush) { 1095 // skip ucnv_getNextUChar() if !finalFlush because 1096 // ucnv_getNextUChar() always implies flush 1097 continue; 1098 } 1099 if(step!=0) { 1100 // bulk test is first, then offsets are not checked any more 1101 cc.offsets=NULL; 1102 } 1103 else { 1104 memset(resultOffsets, -1, UPRV_LENGTHOF(resultOffsets)); 1105 } 1106 memset(result, -1, UPRV_LENGTHOF(result)); 1107 errorCode.reset(); 1108 resultLength=stepToUnicode(cc, cnv.getAlias(), 1109 result, UPRV_LENGTHOF(result), 1110 step==0 ? resultOffsets : NULL, 1111 step, errorCode); 1112 ok=checkToUnicode( 1113 cc, cnv.getAlias(), steps[i].name, 1114 result, resultLength, 1115 cc.offsets!=NULL ? resultOffsets : NULL, 1116 errorCode); 1117 if(errorCode.isFailure() || !cc.finalFlush) { 1118 // reset if an error occurred or we did not flush 1119 // otherwise do nothing to make sure that flushing resets 1120 ucnv_resetToUnicode(cnv.getAlias()); 1121 } 1122 if (cc.offsets != NULL && resultOffsets[resultLength] != -1) { 1123 errln("toUnicode[%d](%s) Conversion wrote too much to offsets at index %d", 1124 cc.caseNr, cc.charset, resultLength); 1125 } 1126 if (result[resultLength] != (UChar)-1) { 1127 errln("toUnicode[%d](%s) Conversion wrote too much to result at index %d", 1128 cc.caseNr, cc.charset, resultLength); 1129 } 1130 } 1131 1132 // not a real loop, just a convenience for breaking out of the block 1133 while(ok && cc.finalFlush) { 1134 // test ucnv_toUChars() 1135 memset(result, 0, sizeof(result)); 1136 1137 errorCode.reset(); 1138 resultLength=ucnv_toUChars(cnv.getAlias(), 1139 result, UPRV_LENGTHOF(result), 1140 (const char *)cc.bytes, cc.bytesLength, 1141 errorCode); 1142 ok=checkToUnicode( 1143 cc, cnv.getAlias(), "toUChars", 1144 result, resultLength, 1145 NULL, 1146 errorCode); 1147 if(!ok) { 1148 break; 1149 } 1150 1151 // test preflighting 1152 // keep the correct result for simple checking 1153 errorCode.reset(); 1154 resultLength=ucnv_toUChars(cnv.getAlias(), 1155 NULL, 0, 1156 (const char *)cc.bytes, cc.bytesLength, 1157 errorCode); 1158 if(errorCode.get()==U_STRING_NOT_TERMINATED_WARNING || errorCode.get()==U_BUFFER_OVERFLOW_ERROR) { 1159 errorCode.reset(); 1160 } 1161 ok=checkToUnicode( 1162 cc, cnv.getAlias(), "preflight toUChars", 1163 result, resultLength, 1164 NULL, 1165 errorCode); 1166 break; 1167 } 1168 1169 errorCode.reset(); // all errors have already been reported 1170 return ok; 1171 } 1172 1173 UBool 1174 ConversionTest::checkToUnicode(ConversionCase &cc, UConverter *cnv, const char *name, 1175 const UChar *result, int32_t resultLength, 1176 const int32_t *resultOffsets, 1177 UErrorCode resultErrorCode) { 1178 char resultInvalidChars[8]; 1179 int8_t resultInvalidLength; 1180 UErrorCode errorCode; 1181 1182 const char *msg; 1183 1184 // reset the message; NULL will mean "ok" 1185 msg=NULL; 1186 1187 errorCode=U_ZERO_ERROR; 1188 resultInvalidLength=sizeof(resultInvalidChars); 1189 ucnv_getInvalidChars(cnv, resultInvalidChars, &resultInvalidLength, &errorCode); 1190 if(U_FAILURE(errorCode)) { 1191 errln("toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) ucnv_getInvalidChars() failed - %s", 1192 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, u_errorName(errorCode)); 1193 return FALSE; 1194 } 1195 1196 // check everything that might have gone wrong 1197 if(cc.unicodeLength!=resultLength) { 1198 msg="wrong result length"; 1199 } else if(0!=u_memcmp(cc.unicode, result, cc.unicodeLength)) { 1200 msg="wrong result string"; 1201 } else if(cc.offsets!=NULL && 0!=memcmp(cc.offsets, resultOffsets, cc.unicodeLength*sizeof(*cc.offsets))) { 1202 msg="wrong offsets"; 1203 } else if(cc.outErrorCode!=resultErrorCode) { 1204 msg="wrong error code"; 1205 } else if(cc.invalidLength!=resultInvalidLength) { 1206 msg="wrong length of last invalid input"; 1207 } else if(0!=memcmp(cc.invalidChars, resultInvalidChars, cc.invalidLength)) { 1208 msg="wrong last invalid input"; 1209 } 1210 1211 if(msg==NULL) { 1212 return TRUE; 1213 } else { 1214 char buffer[2000]; // one buffer for all strings 1215 char *s, *bytesString, *unicodeString, *resultString, 1216 *offsetsString, *resultOffsetsString, 1217 *invalidCharsString, *resultInvalidCharsString; 1218 1219 bytesString=s=buffer; 1220 s=printBytes(cc.bytes, cc.bytesLength, bytesString); 1221 s=printUnicode(cc.unicode, cc.unicodeLength, unicodeString=s); 1222 s=printUnicode(result, resultLength, resultString=s); 1223 s=printOffsets(cc.offsets, cc.unicodeLength, offsetsString=s); 1224 s=printOffsets(resultOffsets, resultLength, resultOffsetsString=s); 1225 s=printBytes(cc.invalidChars, cc.invalidLength, invalidCharsString=s); 1226 s=printBytes((uint8_t *)resultInvalidChars, resultInvalidLength, resultInvalidCharsString=s); 1227 1228 if((s-buffer)>(int32_t)sizeof(buffer)) { 1229 errln("toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) fatal error: checkToUnicode() test output buffer overflow writing %d chars\n", 1230 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, (int)(s-buffer)); 1231 exit(1); 1232 } 1233 1234 errln("toUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) failed: %s\n" 1235 " bytes <%s>[%d]\n" 1236 " expected <%s>[%d]\n" 1237 " result <%s>[%d]\n" 1238 " offsets <%s>\n" 1239 " result offsets <%s>\n" 1240 " error code expected %s got %s\n" 1241 " invalidChars expected <%s> got <%s>\n", 1242 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, msg, 1243 bytesString, cc.bytesLength, 1244 unicodeString, cc.unicodeLength, 1245 resultString, resultLength, 1246 offsetsString, 1247 resultOffsetsString, 1248 u_errorName(cc.outErrorCode), u_errorName(resultErrorCode), 1249 invalidCharsString, resultInvalidCharsString); 1250 1251 return FALSE; 1252 } 1253 } 1254 1255 // fromUnicode test worker functions --------------------------------------- *** 1256 1257 static int32_t 1258 stepFromUTF8(ConversionCase &cc, 1259 UConverter *utf8Cnv, UConverter *cnv, 1260 char *result, int32_t resultCapacity, 1261 int32_t step, 1262 UErrorCode *pErrorCode) { 1263 const char *source, *sourceLimit, *utf8Limit; 1264 UChar pivotBuffer[32]; 1265 UChar *pivotSource, *pivotTarget, *pivotLimit; 1266 char *target, *targetLimit, *resultLimit; 1267 UBool flush; 1268 1269 source=cc.utf8; 1270 pivotSource=pivotTarget=pivotBuffer; 1271 target=result; 1272 utf8Limit=source+cc.utf8Length; 1273 resultLimit=result+resultCapacity; 1274 1275 // call ucnv_convertEx() with in/out buffers no larger than (step) at a time 1276 // move only one buffer (in vs. out) at a time to be extra mean 1277 // step==0 performs bulk conversion 1278 1279 // initialize the partial limits for the loop 1280 if(step==0) { 1281 // use the entire buffers 1282 sourceLimit=utf8Limit; 1283 targetLimit=resultLimit; 1284 flush=cc.finalFlush; 1285 1286 pivotLimit=pivotBuffer+UPRV_LENGTHOF(pivotBuffer); 1287 } else { 1288 // start with empty partial buffers 1289 sourceLimit=source; 1290 targetLimit=target; 1291 flush=FALSE; 1292 1293 // empty pivot is not allowed, make it of length step 1294 pivotLimit=pivotBuffer+step; 1295 } 1296 1297 for(;;) { 1298 // resetting the opposite conversion direction must not affect this one 1299 ucnv_resetFromUnicode(utf8Cnv); 1300 ucnv_resetToUnicode(cnv); 1301 1302 // convert 1303 ucnv_convertEx(cnv, utf8Cnv, 1304 &target, targetLimit, 1305 &source, sourceLimit, 1306 pivotBuffer, &pivotSource, &pivotTarget, pivotLimit, 1307 FALSE, flush, pErrorCode); 1308 1309 // check pointers and errors 1310 if(source>sourceLimit || target>targetLimit) { 1311 *pErrorCode=U_INTERNAL_PROGRAM_ERROR; 1312 break; 1313 } else if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) { 1314 if(target!=targetLimit) { 1315 // buffer overflow must only be set when the target is filled 1316 *pErrorCode=U_INTERNAL_PROGRAM_ERROR; 1317 break; 1318 } else if(targetLimit==resultLimit) { 1319 // not just a partial overflow 1320 break; 1321 } 1322 1323 // the partial target is filled, set a new limit, reset the error and continue 1324 targetLimit=(resultLimit-target)>=step ? target+step : resultLimit; 1325 *pErrorCode=U_ZERO_ERROR; 1326 } else if(U_FAILURE(*pErrorCode)) { 1327 if(pivotSource==pivotBuffer) { 1328 // toUnicode error, should not occur 1329 // toUnicode errors are tested in cintltst TestConvertExFromUTF8() 1330 break; 1331 } else { 1332 // fromUnicode error 1333 // some other error occurred, done 1334 break; 1335 } 1336 } else { 1337 if(source!=sourceLimit) { 1338 // when no error occurs, then the input must be consumed 1339 *pErrorCode=U_INTERNAL_PROGRAM_ERROR; 1340 break; 1341 } 1342 1343 if(sourceLimit==utf8Limit) { 1344 // we are done 1345 if(*pErrorCode==U_STRING_NOT_TERMINATED_WARNING) { 1346 // ucnv_convertEx() warns about not terminating the output 1347 // but ucnv_fromUnicode() does not and so 1348 // checkFromUnicode() does not expect it 1349 *pErrorCode=U_ZERO_ERROR; 1350 } 1351 break; 1352 } 1353 1354 // the partial conversion succeeded, set a new limit and continue 1355 sourceLimit=(utf8Limit-source)>=step ? source+step : utf8Limit; 1356 flush=(UBool)(cc.finalFlush && sourceLimit==utf8Limit); 1357 } 1358 } 1359 1360 return (int32_t)(target-result); 1361 } 1362 1363 static int32_t 1364 stepFromUnicode(ConversionCase &cc, UConverter *cnv, 1365 char *result, int32_t resultCapacity, 1366 int32_t *resultOffsets, /* also resultCapacity */ 1367 int32_t step, 1368 UErrorCode *pErrorCode) { 1369 const UChar *source, *sourceLimit, *unicodeLimit; 1370 char *target, *targetLimit, *resultLimit; 1371 UBool flush; 1372 1373 source=cc.unicode; 1374 target=result; 1375 unicodeLimit=source+cc.unicodeLength; 1376 resultLimit=result+resultCapacity; 1377 1378 // call ucnv_fromUnicode() with in/out buffers no larger than (step) at a time 1379 // move only one buffer (in vs. out) at a time to be extra mean 1380 // step==0 performs bulk conversion and generates offsets 1381 1382 // initialize the partial limits for the loop 1383 if(step==0) { 1384 // use the entire buffers 1385 sourceLimit=unicodeLimit; 1386 targetLimit=resultLimit; 1387 flush=cc.finalFlush; 1388 } else { 1389 // start with empty partial buffers 1390 sourceLimit=source; 1391 targetLimit=target; 1392 flush=FALSE; 1393 1394 // output offsets only for bulk conversion 1395 resultOffsets=NULL; 1396 } 1397 1398 for(;;) { 1399 // resetting the opposite conversion direction must not affect this one 1400 ucnv_resetToUnicode(cnv); 1401 1402 // convert 1403 ucnv_fromUnicode(cnv, 1404 &target, targetLimit, 1405 &source, sourceLimit, 1406 resultOffsets, 1407 flush, pErrorCode); 1408 1409 // check pointers and errors 1410 if(source>sourceLimit || target>targetLimit) { 1411 *pErrorCode=U_INTERNAL_PROGRAM_ERROR; 1412 break; 1413 } else if(*pErrorCode==U_BUFFER_OVERFLOW_ERROR) { 1414 if(target!=targetLimit) { 1415 // buffer overflow must only be set when the target is filled 1416 *pErrorCode=U_INTERNAL_PROGRAM_ERROR; 1417 break; 1418 } else if(targetLimit==resultLimit) { 1419 // not just a partial overflow 1420 break; 1421 } 1422 1423 // the partial target is filled, set a new limit, reset the error and continue 1424 targetLimit=(resultLimit-target)>=step ? target+step : resultLimit; 1425 *pErrorCode=U_ZERO_ERROR; 1426 } else if(U_FAILURE(*pErrorCode)) { 1427 // some other error occurred, done 1428 break; 1429 } else { 1430 if(source!=sourceLimit) { 1431 // when no error occurs, then the input must be consumed 1432 *pErrorCode=U_INTERNAL_PROGRAM_ERROR; 1433 break; 1434 } 1435 1436 if(sourceLimit==unicodeLimit) { 1437 // we are done 1438 break; 1439 } 1440 1441 // the partial conversion succeeded, set a new limit and continue 1442 sourceLimit=(unicodeLimit-source)>=step ? source+step : unicodeLimit; 1443 flush=(UBool)(cc.finalFlush && sourceLimit==unicodeLimit); 1444 } 1445 } 1446 1447 return (int32_t)(target-result); 1448 } 1449 1450 UBool 1451 ConversionTest::FromUnicodeCase(ConversionCase &cc, UConverterFromUCallback callback, const char *option) { 1452 UConverter *cnv; 1453 UErrorCode errorCode; 1454 1455 // open the converter 1456 errorCode=U_ZERO_ERROR; 1457 cnv=cnv_open(cc.charset, errorCode); 1458 if(U_FAILURE(errorCode)) { 1459 errcheckln(errorCode, "fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_open() failed - %s", 1460 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode)); 1461 return FALSE; 1462 } 1463 ucnv_resetToUnicode(utf8Cnv); 1464 1465 // set the callback 1466 if(callback!=NULL) { 1467 ucnv_setFromUCallBack(cnv, callback, option, NULL, NULL, &errorCode); 1468 if(U_FAILURE(errorCode)) { 1469 errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_setFromUCallBack() failed - %s", 1470 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode)); 1471 ucnv_close(cnv); 1472 return FALSE; 1473 } 1474 } 1475 1476 // set the fallbacks flag 1477 // TODO change with Jitterbug 2401, then add a similar call for toUnicode too 1478 ucnv_setFallback(cnv, cc.fallbacks); 1479 1480 // set the subchar 1481 int32_t length; 1482 1483 if(cc.setSub>0) { 1484 length=(int32_t)strlen(cc.subchar); 1485 ucnv_setSubstChars(cnv, cc.subchar, (int8_t)length, &errorCode); 1486 if(U_FAILURE(errorCode)) { 1487 errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_setSubstChars() failed - %s", 1488 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode)); 1489 ucnv_close(cnv); 1490 return FALSE; 1491 } 1492 } else if(cc.setSub<0) { 1493 ucnv_setSubstString(cnv, cc.subString, -1, &errorCode); 1494 if(U_FAILURE(errorCode)) { 1495 errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d) ucnv_setSubstString() failed - %s", 1496 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, u_errorName(errorCode)); 1497 ucnv_close(cnv); 1498 return FALSE; 1499 } 1500 } 1501 1502 // convert unicode to utf8 1503 char utf8[256]; 1504 cc.utf8=utf8; 1505 u_strToUTF8(utf8, UPRV_LENGTHOF(utf8), &cc.utf8Length, 1506 cc.unicode, cc.unicodeLength, 1507 &errorCode); 1508 if(U_FAILURE(errorCode)) { 1509 // skip UTF-8 testing of a string with an unpaired surrogate, 1510 // or of one that's too long 1511 // toUnicode errors are tested in cintltst TestConvertExFromUTF8() 1512 cc.utf8Length=-1; 1513 } 1514 1515 int32_t resultOffsets[256]; 1516 char result[256]; 1517 int32_t resultLength; 1518 UBool ok; 1519 1520 static const struct { 1521 int32_t step; 1522 const char *name, *utf8Name; 1523 } steps[]={ 1524 { 0, "bulk", "utf8" }, // must be first for offsets to be checked 1525 { 1, "step=1", "utf8 step=1" }, 1526 { 3, "step=3", "utf8 step=3" }, 1527 { 7, "step=7", "utf8 step=7" } 1528 }; 1529 int32_t i, step; 1530 1531 ok=TRUE; 1532 for(i=0; i<UPRV_LENGTHOF(steps) && ok; ++i) { 1533 step=steps[i].step; 1534 memset(resultOffsets, -1, UPRV_LENGTHOF(resultOffsets)); 1535 memset(result, -1, UPRV_LENGTHOF(result)); 1536 errorCode=U_ZERO_ERROR; 1537 resultLength=stepFromUnicode(cc, cnv, 1538 result, UPRV_LENGTHOF(result), 1539 step==0 ? resultOffsets : NULL, 1540 step, &errorCode); 1541 ok=checkFromUnicode( 1542 cc, cnv, steps[i].name, 1543 (uint8_t *)result, resultLength, 1544 cc.offsets!=NULL ? resultOffsets : NULL, 1545 errorCode); 1546 if(U_FAILURE(errorCode) || !cc.finalFlush) { 1547 // reset if an error occurred or we did not flush 1548 // otherwise do nothing to make sure that flushing resets 1549 ucnv_resetFromUnicode(cnv); 1550 } 1551 if (resultOffsets[resultLength] != -1) { 1552 errln("fromUnicode[%d](%s) Conversion wrote too much to offsets at index %d", 1553 cc.caseNr, cc.charset, resultLength); 1554 } 1555 if (result[resultLength] != (char)-1) { 1556 errln("fromUnicode[%d](%s) Conversion wrote too much to result at index %d", 1557 cc.caseNr, cc.charset, resultLength); 1558 } 1559 1560 // bulk test is first, then offsets are not checked any more 1561 cc.offsets=NULL; 1562 1563 // test direct conversion from UTF-8 1564 if(cc.utf8Length>=0) { 1565 errorCode=U_ZERO_ERROR; 1566 resultLength=stepFromUTF8(cc, utf8Cnv, cnv, 1567 result, UPRV_LENGTHOF(result), 1568 step, &errorCode); 1569 ok=checkFromUnicode( 1570 cc, cnv, steps[i].utf8Name, 1571 (uint8_t *)result, resultLength, 1572 NULL, 1573 errorCode); 1574 if(U_FAILURE(errorCode) || !cc.finalFlush) { 1575 // reset if an error occurred or we did not flush 1576 // otherwise do nothing to make sure that flushing resets 1577 ucnv_resetToUnicode(utf8Cnv); 1578 ucnv_resetFromUnicode(cnv); 1579 } 1580 } 1581 } 1582 1583 // not a real loop, just a convenience for breaking out of the block 1584 while(ok && cc.finalFlush) { 1585 // test ucnv_fromUChars() 1586 memset(result, 0, sizeof(result)); 1587 1588 errorCode=U_ZERO_ERROR; 1589 resultLength=ucnv_fromUChars(cnv, 1590 result, UPRV_LENGTHOF(result), 1591 cc.unicode, cc.unicodeLength, 1592 &errorCode); 1593 ok=checkFromUnicode( 1594 cc, cnv, "fromUChars", 1595 (uint8_t *)result, resultLength, 1596 NULL, 1597 errorCode); 1598 if(!ok) { 1599 break; 1600 } 1601 1602 // test preflighting 1603 // keep the correct result for simple checking 1604 errorCode=U_ZERO_ERROR; 1605 resultLength=ucnv_fromUChars(cnv, 1606 NULL, 0, 1607 cc.unicode, cc.unicodeLength, 1608 &errorCode); 1609 if(errorCode==U_STRING_NOT_TERMINATED_WARNING || errorCode==U_BUFFER_OVERFLOW_ERROR) { 1610 errorCode=U_ZERO_ERROR; 1611 } 1612 ok=checkFromUnicode( 1613 cc, cnv, "preflight fromUChars", 1614 (uint8_t *)result, resultLength, 1615 NULL, 1616 errorCode); 1617 break; 1618 } 1619 1620 ucnv_close(cnv); 1621 return ok; 1622 } 1623 1624 UBool 1625 ConversionTest::checkFromUnicode(ConversionCase &cc, UConverter *cnv, const char *name, 1626 const uint8_t *result, int32_t resultLength, 1627 const int32_t *resultOffsets, 1628 UErrorCode resultErrorCode) { 1629 UChar resultInvalidUChars[8]; 1630 int8_t resultInvalidLength; 1631 UErrorCode errorCode; 1632 1633 const char *msg; 1634 1635 // reset the message; NULL will mean "ok" 1636 msg=NULL; 1637 1638 errorCode=U_ZERO_ERROR; 1639 resultInvalidLength=UPRV_LENGTHOF(resultInvalidUChars); 1640 ucnv_getInvalidUChars(cnv, resultInvalidUChars, &resultInvalidLength, &errorCode); 1641 if(U_FAILURE(errorCode)) { 1642 errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) ucnv_getInvalidUChars() failed - %s", 1643 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, u_errorName(errorCode)); 1644 return FALSE; 1645 } 1646 1647 // check everything that might have gone wrong 1648 if(cc.bytesLength!=resultLength) { 1649 msg="wrong result length"; 1650 } else if(0!=memcmp(cc.bytes, result, cc.bytesLength)) { 1651 msg="wrong result string"; 1652 } else if(cc.offsets!=NULL && 0!=memcmp(cc.offsets, resultOffsets, cc.bytesLength*sizeof(*cc.offsets))) { 1653 msg="wrong offsets"; 1654 } else if(cc.outErrorCode!=resultErrorCode) { 1655 msg="wrong error code"; 1656 } else if(cc.invalidLength!=resultInvalidLength) { 1657 msg="wrong length of last invalid input"; 1658 } else if(0!=u_memcmp(cc.invalidUChars, resultInvalidUChars, cc.invalidLength)) { 1659 msg="wrong last invalid input"; 1660 } 1661 1662 if(msg==NULL) { 1663 return TRUE; 1664 } else { 1665 char buffer[2000]; // one buffer for all strings 1666 char *s, *unicodeString, *bytesString, *resultString, 1667 *offsetsString, *resultOffsetsString, 1668 *invalidCharsString, *resultInvalidUCharsString; 1669 1670 unicodeString=s=buffer; 1671 s=printUnicode(cc.unicode, cc.unicodeLength, unicodeString); 1672 s=printBytes(cc.bytes, cc.bytesLength, bytesString=s); 1673 s=printBytes(result, resultLength, resultString=s); 1674 s=printOffsets(cc.offsets, cc.bytesLength, offsetsString=s); 1675 s=printOffsets(resultOffsets, resultLength, resultOffsetsString=s); 1676 s=printUnicode(cc.invalidUChars, cc.invalidLength, invalidCharsString=s); 1677 s=printUnicode(resultInvalidUChars, resultInvalidLength, resultInvalidUCharsString=s); 1678 1679 if((s-buffer)>(int32_t)sizeof(buffer)) { 1680 errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) fatal error: checkFromUnicode() test output buffer overflow writing %d chars\n", 1681 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, (int)(s-buffer)); 1682 exit(1); 1683 } 1684 1685 errln("fromUnicode[%d](%s cb=\"%s\" fb=%d flush=%d %s) failed: %s\n" 1686 " unicode <%s>[%d]\n" 1687 " expected <%s>[%d]\n" 1688 " result <%s>[%d]\n" 1689 " offsets <%s>\n" 1690 " result offsets <%s>\n" 1691 " error code expected %s got %s\n" 1692 " invalidChars expected <%s> got <%s>\n", 1693 cc.caseNr, cc.charset, cc.cbopt, cc.fallbacks, cc.finalFlush, name, msg, 1694 unicodeString, cc.unicodeLength, 1695 bytesString, cc.bytesLength, 1696 resultString, resultLength, 1697 offsetsString, 1698 resultOffsetsString, 1699 u_errorName(cc.outErrorCode), u_errorName(resultErrorCode), 1700 invalidCharsString, resultInvalidUCharsString); 1701 1702 return FALSE; 1703 } 1704 } 1705 1706 #endif /* #if !UCONFIG_NO_LEGACY_CONVERSION */ 1707