1 /* 2 ******************************************************************************* 3 * Copyright (C) 2002-2014, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ******************************************************************************* 6 */ 7 8 package com.ibm.icu.dev.test.charset; 9 10 import java.nio.ByteBuffer; 11 import java.nio.CharBuffer; 12 import java.nio.charset.Charset; 13 import java.nio.charset.CharsetDecoder; 14 import java.nio.charset.CharsetEncoder; 15 import java.nio.charset.CoderResult; 16 import java.nio.charset.CodingErrorAction; 17 import java.util.Iterator; 18 19 import com.ibm.icu.charset.CharsetCallback; 20 import com.ibm.icu.charset.CharsetDecoderICU; 21 import com.ibm.icu.charset.CharsetEncoderICU; 22 import com.ibm.icu.charset.CharsetICU; 23 import com.ibm.icu.charset.CharsetProviderICU; 24 import com.ibm.icu.dev.test.ModuleTest; 25 import com.ibm.icu.dev.test.TestDataModule.DataMap; 26 import com.ibm.icu.impl.ICUResourceBundle; 27 import com.ibm.icu.text.UnicodeSet; 28 29 /** 30 * This maps to convtest.c which tests the test file for data-driven conversion tests. 31 * 32 */ 33 public class TestConversion extends ModuleTest { 34 /** 35 * This maps to the C struct of conversion case in convtest.h that stores the 36 * data for a conversion test 37 * 38 */ 39 private class ConversionCase { 40 int caseNr; // testcase index 41 String option = null; // callback options 42 CodingErrorAction cbErrorAction = null; // callback action type 43 CharBuffer toUnicodeResult = null; 44 ByteBuffer fromUnicodeResult = null; 45 46 // data retrieved from a test case conversion.txt 47 String charset; // charset 48 String unicode; // unicode string 49 ByteBuffer bytes; // byte 50 int[] offsets; // offsets 51 boolean finalFlush; // flush 52 boolean fallbacks; // fallback 53 String outErrorCode; // errorCode 54 String cbopt; // callback 55 56 // TestGetUnicodeSet variables 57 String map; 58 String mapnot; 59 int which; 60 61 // CharsetCallback encoder and decoder 62 CharsetCallback.Decoder cbDecoder = null; 63 CharsetCallback.Encoder cbEncoder = null; 64 65 String caseNrAsString() { 66 return "[" + caseNr + "]"; 67 } 68 } 69 70 /* In the data-driven conversion test, converters that are not available in 71 * ICU4J are marked with the following leading symbol. 72 */ 73 private static final char UNSUPPORTED_CHARSET_SYMBOL = '+'; 74 75 // public methods -------------------------------------------------------- 76 77 public static void main(String[] args) throws Exception { 78 new TestConversion().run(args); 79 } 80 81 public TestConversion() { 82 super("com/ibm/icu/dev/data/testdata/", "conversion"); 83 } 84 85 /* 86 * This method maps to the convtest.cpp runIndexedTest() method to run each 87 * type of conversion. 88 */ 89 public void processModules() { 90 try { 91 int testFromUnicode = 0; 92 int testToUnicode = 0; 93 String testName = t.getName().toString(); 94 95 // Iterate through and get each of the test case to process 96 for (Iterator iter = t.getDataIterator(); iter.hasNext();) { 97 DataMap testcase = (DataMap) iter.next(); 98 99 if (testName.equalsIgnoreCase("toUnicode")) { 100 TestToUnicode(testcase, testToUnicode); 101 testToUnicode++; 102 103 } else if (testName.equalsIgnoreCase("fromUnicode")) { 104 TestFromUnicode(testcase, testFromUnicode); 105 testFromUnicode++; 106 } else if (testName.equalsIgnoreCase("getUnicodeSet")) { 107 TestGetUnicodeSet(testcase); 108 } else { 109 warnln("Could not load the test cases for conversion"); 110 continue; 111 } 112 } 113 } catch (Exception e) { 114 e.printStackTrace(); 115 } 116 117 } 118 119 // private methods ------------------------------------------------------- 120 121 122 // fromUnicode test worker functions --------------------------------------- 123 private void TestFromUnicode(DataMap testcase, int caseNr) { 124 125 ConversionCase cc = new ConversionCase(); 126 127 try { 128 // retrieve test case data 129 cc.caseNr = caseNr; 130 cc.charset = ((ICUResourceBundle) testcase.getObject("charset")).getString(); 131 cc.unicode = ((ICUResourceBundle) testcase.getObject("unicode")).getString(); 132 cc.bytes = ((ICUResourceBundle) testcase.getObject("bytes")).getBinary(); 133 cc.offsets = ((ICUResourceBundle) testcase.getObject("offsets")).getIntVector(); 134 cc.finalFlush = ((ICUResourceBundle) testcase.getObject("flush")).getUInt() != 0; 135 cc.fallbacks = ((ICUResourceBundle) testcase.getObject("fallbacks")).getUInt() != 0; 136 cc.outErrorCode = ((ICUResourceBundle) testcase.getObject("errorCode")).getString(); 137 cc.cbopt = ((ICUResourceBundle) testcase.getObject("callback")).getString(); 138 139 } catch (Exception e) { 140 errln("Skipping test:"); 141 errln("error parsing conversion/toUnicode test case " + cc.caseNr); 142 return; 143 } 144 145 /* 146 * Skip the following data driven converter tests. 147 * These tests were added to the data driven conversion test in ICU 148 * to test direct-from-UTF-8 m:n Unicode:charset conversion. 149 * This feature is not in ICU4J. 150 * See #9601 151 */ 152 // Android patch: Skip tests that fail with customized data. 153 String [] testsToSkip = { 154 "*test2", 155 "EUC-TW", 156 "gb18030", 157 "HZ", 158 "ibm-1386", 159 "ibm-1390", 160 "ibm-1390,swaplfnl", 161 "ibm-1399", 162 "ibm-16684", 163 "ibm-25546", 164 "ibm-930", 165 "ibm-943", 166 "ibm-970", 167 "ibm-971", 168 "IBM-eucJP", 169 "iso-2022-cn", 170 "ISO-2022-CN", 171 "iso-2022-jp", 172 "ISO-2022-JP", 173 "ISO-2022-JP-2", 174 "iso-2022-kr", 175 "ISO-2022-KR", 176 "JIS", 177 "JIS7", 178 "JIS8", 179 "lmbcs", 180 "windows-936", 181 "x11-compound-text" 182 }; 183 // Android patch end. 184 for (int i = 0; i < testsToSkip.length; i++) { 185 if (cc.charset.equals(testsToSkip[i])) { 186 logln(""); 187 logln("Skipping: " + cc.charset); 188 logln("..............................................."); 189 return; 190 } 191 } 192 193 // ----for debugging only 194 logln(""); 195 logln("TestFromUnicode[" + caseNr + "] " + cc.charset + " "); 196 logln("Unicode: " + cc.unicode); 197 logln("Bytes: " + printbytes(cc.bytes, cc.bytes.limit())); 198 ByteBuffer c = ByteBuffer.wrap(cc.cbopt.getBytes()); 199 logln("Callback: " + printbytes(c, c.limit()) + " (" + cc.cbopt + ")"); 200 logln("..............................................."); 201 202 // process the retrieved test data case 203 if (cc.offsets.length == 0) { 204 cc.offsets = null; 205 } else if (cc.offsets.length != cc.bytes.limit()) { 206 errln("fromUnicode[" + cc.caseNr + "] bytes[" + cc.bytes 207 + "] and offsets[" + cc.offsets.length 208 + "] must have the same length"); 209 return; 210 } 211 212 // check the callback replacement value 213 if (cc.cbopt.length() > 0) { 214 215 switch ((cc.cbopt).charAt(0)) { 216 case '?': 217 cc.cbErrorAction = CodingErrorAction.REPLACE; 218 break; 219 case '0': 220 cc.cbErrorAction = CodingErrorAction.IGNORE; 221 break; 222 case '.': 223 cc.cbErrorAction = CodingErrorAction.REPORT; 224 break; 225 case '&': 226 cc.cbErrorAction = CodingErrorAction.REPLACE; 227 cc.cbEncoder = CharsetCallback.FROM_U_CALLBACK_ESCAPE; 228 break; 229 default: 230 cc.cbErrorAction = null; 231 break; 232 } 233 234 // check for any options for the callback value -- 235 cc.option = cc.cbErrorAction == null ? cc.cbopt : cc.cbopt 236 .substring(1); 237 if (cc.option == null) { 238 cc.option = null; 239 } 240 } 241 FromUnicodeCase(cc); 242 } 243 244 245 private void FromUnicodeCase(ConversionCase cc) { 246 // create charset encoder for conversion test 247 CharsetProviderICU provider = new CharsetProviderICU(); 248 CharsetEncoder encoder = null; 249 Charset charset = null; 250 try { 251 // if cc.charset starts with '*', obtain it from com/ibm/icu/dev/data/testdata 252 charset = (cc.charset != null && cc.charset.length() > 0 && cc.charset.charAt(0) == '*') 253 ? (Charset) provider.charsetForName(cc.charset.substring(1), 254 "com/ibm/icu/dev/data/testdata", this.getClass().getClassLoader()) 255 : (Charset) provider.charsetForName(cc.charset); 256 if (charset != null) { 257 encoder = (CharsetEncoder) charset.newEncoder(); 258 encoder.onMalformedInput(CodingErrorAction.REPLACE); 259 encoder.onUnmappableCharacter(CodingErrorAction.REPLACE); 260 if (encoder instanceof CharsetEncoderICU) { 261 ((CharsetEncoderICU)encoder).setFallbackUsed(cc.fallbacks); 262 if (((CharsetEncoderICU)encoder).isFallbackUsed() != cc.fallbacks) { 263 errln("Fallback could not be set for " + cc.charset); 264 } 265 } 266 } 267 } catch (Exception e) { 268 encoder = null; 269 } 270 if (encoder == null) { 271 if (cc.charset.charAt(0) == UNSUPPORTED_CHARSET_SYMBOL) { 272 logln("Skipping test:(" + cc.charset.substring(1) + ") due to ICU Charset not supported at this time"); 273 } else { 274 errln(cc.charset + " was not found"); 275 } 276 return; 277 } 278 279 // set the callback for the encoder 280 if (cc.cbErrorAction != null) { 281 if (cc.cbEncoder != null) { 282 ((CharsetEncoderICU)encoder).setFromUCallback(CoderResult.malformedForLength(1), cc.cbEncoder, cc.option); 283 ((CharsetEncoderICU)encoder).setFromUCallback(CoderResult.unmappableForLength(1), cc.cbEncoder, cc.option); 284 } else { 285 encoder.onUnmappableCharacter(cc.cbErrorAction); 286 encoder.onMalformedInput(cc.cbErrorAction); 287 } 288 289 // if action has an option, put in the option for the case 290 if (cc.option.equals("i")) { 291 encoder.onMalformedInput(CodingErrorAction.REPORT); 292 } 293 294 // if callback action is replace, 295 // and there is a subchar 296 // replace the decoder's default replacement value 297 // if substring, skip test due to current api not supporting 298 // substring 299 if (cc.cbErrorAction.equals(CodingErrorAction.REPLACE)) { 300 if (cc.cbopt.length() > 1) { 301 if (cc.cbopt.length() > 1 && cc.cbopt.charAt(1) == '=') { 302 logln("Skipping test due to limitation in Java API - substitution string not supported"); 303 return; 304 } else { 305 // // read NUL-separated subchar first, if any 306 // copy the subchar from Latin-1 characters 307 // start after the NUL 308 if (cc.cbopt.charAt(1) == 0x00) { 309 cc.cbopt = cc.cbopt.substring(2); 310 311 try { 312 encoder.replaceWith(toByteArray(cc.cbopt)); 313 } catch (Exception e) { 314 logln("Skipping test due to limitation in Java API - substitution character sequence size error"); 315 return; 316 } 317 } 318 } 319 } 320 } 321 } 322 323 // do charset encoding from unicode 324 325 // testing by steps using charset.encoder(in,out,flush) 326 int resultLength; 327 boolean ok; 328 String steps[][] = { { "0", "bulk" }, // must be first for offsets to be checked 329 { "1", "step=1" }, { "3", "step=3" }, { "7", "step=7" } }; 330 int i, step; 331 332 ok = true; 333 334 for (i = 0; i < steps.length && ok; ++i) { 335 step = Integer.parseInt(steps[i][0]); 336 337 logln("Testing step:[" + step + "]"); 338 try { 339 resultLength = stepFromUnicode(cc, encoder, step); 340 ok = checkFromUnicode(cc, resultLength); 341 } catch (Exception ex) { 342 errln("Test failed: " + ex.getClass().getName() + " thrown: " + cc.charset+ " [" + cc.caseNr + "]"); 343 ex.printStackTrace(System.out); 344 return; 345 } 346 347 } 348 // testing by whole buffer using out = charset.encoder(in) 349 while (ok && cc.finalFlush) { 350 logln("Testing java API charset.encoder(in):"); 351 cc.fromUnicodeResult = null; 352 ByteBuffer out = null; 353 354 try { 355 out = encoder.encode(CharBuffer.wrap(cc.unicode.toCharArray())); 356 out.position(out.limit()); 357 if (out.limit() != out.capacity() || cc.finalFlush) { 358 int pos = out.position(); 359 byte[] temp = out.array(); 360 out = ByteBuffer.allocate(temp.length * 4); 361 out.put(temp); 362 out.position(pos); 363 CoderResult cr = encoder.flush(out); 364 if (cr.isOverflow()) { 365 logln("Overflow error with flushing encoder"); 366 } 367 } 368 cc.fromUnicodeResult = out; 369 370 ok = checkFromUnicode(cc, out.limit()); 371 if (!ok) { 372 break; 373 } 374 } catch (Exception e) { 375 //check the error code to see if it matches cc.errorCode 376 logln("Encoder returned an error code"); 377 logln("ErrorCode expected is: " + cc.outErrorCode); 378 logln("Error Result is: " + e.toString()); 379 } 380 break; 381 } 382 } 383 384 private int stepFromUnicode(ConversionCase cc, CharsetEncoder encoder, int step) { 385 if (step < 0) { 386 errln("Negative step size, test internal error."); 387 return 0; 388 } 389 390 int sourceLen = cc.unicode.length(); 391 int targetLen = cc.bytes.capacity() + 20; // for BOM, and to let failures produce excess output 392 CharBuffer source = CharBuffer.wrap(cc.unicode.toCharArray()); 393 ByteBuffer target = ByteBuffer.allocate(targetLen); 394 cc.fromUnicodeResult = null; 395 encoder.reset(); 396 397 int currentSourceLimit; 398 int currentTargetLimit; 399 if (step > 0) { 400 currentSourceLimit = Math.min(step, sourceLen); 401 currentTargetLimit = Math.min(step, targetLen); 402 } else { 403 currentSourceLimit = sourceLen; 404 currentTargetLimit = targetLen; 405 } 406 407 CoderResult cr = null; 408 409 for (;;) { 410 source.limit(currentSourceLimit); 411 target.limit(currentTargetLimit); 412 413 cr = encoder.encode(source, target, currentSourceLimit == sourceLen); 414 415 if (cr.isUnderflow()) { 416 if (currentSourceLimit == sourceLen) { 417 if (target.position() == cc.bytes.limit()) { 418 // target contains the correct number of bytes 419 break; 420 } 421 // Do a final flush for cleanup, then break out 422 // Encode loop, exits with cr==underflow in normal operation. 423 //target.limit(targetLen); 424 target.limit(targetLen); 425 cr = encoder.flush(target); 426 if (cr.isUnderflow()) { 427 // good 428 } else if (cr.isOverflow()) { 429 errln(cc.caseNrAsString() + " Flush is producing excessive output"); 430 } else { 431 errln(cc.caseNrAsString() + " Flush operation failed. CoderResult = \"" 432 + cr.toString() + "\""); 433 } 434 break; 435 } 436 currentSourceLimit = Math.min(currentSourceLimit + step, sourceLen); 437 } else if (cr.isOverflow()) { 438 if (currentTargetLimit == targetLen) { 439 errln(cc.caseNrAsString() + " encode() is producing excessive output"); 440 break; 441 } 442 currentTargetLimit = Math.min(currentTargetLimit + step, targetLen); 443 } else { 444 // check the error code to see if it matches cc.errorCode 445 logln("Encoder returned an error code"); 446 logln("ErrorCode expected is: " + cc.outErrorCode); 447 logln("Error Result is: " + cr.toString()); 448 break; 449 } 450 451 } 452 453 cc.fromUnicodeResult = target; 454 return target.position(); 455 } 456 457 private boolean checkFromUnicode(ConversionCase cc, int resultLength) { 458 return checkResultsFromUnicode(cc, cc.bytes, cc.fromUnicodeResult); 459 } 460 461 // toUnicode test worker functions ----------------------------------------- *** 462 463 private void TestToUnicode(DataMap testcase, int caseNr) { 464 // create Conversion case to store the test case data 465 ConversionCase cc = new ConversionCase(); 466 467 try { 468 // retrieve test case data 469 cc.caseNr = caseNr; 470 cc.charset = ((ICUResourceBundle) testcase.getObject("charset")).getString(); 471 cc.bytes = ((ICUResourceBundle) testcase.getObject("bytes")).getBinary(); 472 cc.unicode = ((ICUResourceBundle) testcase.getObject("unicode")).getString(); 473 cc.offsets = ((ICUResourceBundle) testcase.getObject("offsets")).getIntVector(); 474 cc.finalFlush = ((ICUResourceBundle) testcase.getObject("flush")).getUInt() != 0; 475 cc.fallbacks = ((ICUResourceBundle) testcase.getObject("fallbacks")).getUInt() != 0; 476 cc.outErrorCode = ((ICUResourceBundle) testcase.getObject("errorCode")).getString(); 477 cc.cbopt = ((ICUResourceBundle) testcase.getObject("callback")).getString(); 478 479 } catch (Exception e) { 480 errln("Skipping test: error parsing conversion/toUnicode test case " + cc.caseNr); 481 return; 482 } 483 484 // Android patch: Skip tests that fail with customized data. 485 String [] testsToSkip = { 486 "HZ", 487 "ibm-1390", 488 "ibm-1390,swaplfnl", 489 "ibm-16684", 490 "ibm-25546", 491 "ibm-971", 492 "ISO-2022-CN", 493 "ISO-2022-JP", 494 "ISO-2022-JP-2", 495 "ISO-2022-KR", 496 "JIS7" 497 }; 498 for (int i = 0; i < testsToSkip.length; i++) { 499 if (cc.charset.equals(testsToSkip[i])) { 500 logln(""); 501 logln("Skipping: " + cc.charset); 502 logln("..............................................."); 503 return; 504 } 505 } 506 // Android patch end. 507 508 // ----for debugging only 509 logln(""); 510 logln("TestToUnicode[" + caseNr + "] " + cc.charset + " "); 511 logln("Unicode: " + hex(cc.unicode)); 512 logln("Bytes: " + printbytes(cc.bytes, cc.bytes.limit())); 513 ByteBuffer c = ByteBuffer.wrap(cc.cbopt.getBytes()); 514 logln("Callback: " + printbytes(c, c.limit()) + " (" + cc.cbopt + ")"); 515 logln("..............................................."); 516 517 // process the retrieved test data case 518 if (cc.offsets.length == 0) { 519 cc.offsets = null; 520 } else if (cc.offsets.length != cc.unicode.length()) { 521 errln("Skipping test: toUnicode[" + cc.caseNr + "] unicode[" 522 + cc.unicode.length() + "] and offsets[" 523 + cc.offsets.length + "] must have the same length"); 524 return; 525 } 526 // check for the callback replacement value for unmappable 527 // characters or malformed errors 528 if (cc.cbopt.length() > 0) { 529 switch ((cc.cbopt).charAt(0)) { 530 case '?': // CALLBACK_SUBSTITUTE 531 cc.cbErrorAction = CodingErrorAction.REPLACE; 532 break; 533 case '0': // CALLBACK_SKIP 534 cc.cbErrorAction = CodingErrorAction.IGNORE; 535 break; 536 case '.': // CALLBACK_STOP 537 cc.cbErrorAction = CodingErrorAction.REPORT; 538 break; 539 case '&': // CALLBACK_ESCAPE 540 cc.cbErrorAction = CodingErrorAction.REPORT; 541 cc.cbDecoder = CharsetCallback.TO_U_CALLBACK_ESCAPE; 542 break; 543 default: 544 cc.cbErrorAction = null; 545 break; 546 } 547 } 548 // check for any options for the callback value 549 cc.option = cc.cbErrorAction == null ? null : cc.cbopt.substring(1); 550 if (cc.option == null) { 551 cc.option = null; 552 } 553 554 ToUnicodeCase(cc); 555 556 } 557 558 private void ToUnicodeCase(ConversionCase cc) { 559 560 // create converter for charset and decoder for each test case 561 CharsetProviderICU provider = new CharsetProviderICU(); 562 CharsetDecoder decoder = null; 563 Charset charset = null; 564 565 try { 566 // if cc.charset starts with '*', obtain it from com/ibm/icu/dev/data/testdata 567 charset = (cc.charset != null && cc.charset.length() > 0 && cc.charset.charAt(0) == '*') 568 ? (Charset) provider.charsetForName(cc.charset.substring(1), 569 "com/ibm/icu/dev/data/testdata", this.getClass().getClassLoader()) 570 : (Charset) provider.charsetForName(cc.charset); 571 if (charset != null) { 572 decoder = (CharsetDecoder) charset.newDecoder(); 573 decoder.onMalformedInput(CodingErrorAction.REPLACE); 574 decoder.onUnmappableCharacter(CodingErrorAction.REPLACE); 575 } 576 } catch (Exception e) { 577 // TODO implement loading of test data. 578 decoder = null; 579 } 580 if (decoder == null) { 581 if (cc.charset.charAt(0) == UNSUPPORTED_CHARSET_SYMBOL) { 582 logln("Skipping test:(" + cc.charset.substring(1) + ") due to ICU Charset not supported at this time"); 583 } else { 584 errln(cc.charset + " was not found"); 585 } 586 return; 587 } 588 589 // set the callback for the decoder 590 if (cc.cbErrorAction != null) { 591 if (cc.cbDecoder != null) { 592 ((CharsetDecoderICU)decoder).setToUCallback(CoderResult.malformedForLength(1), cc.cbDecoder, cc.option); 593 ((CharsetDecoderICU)decoder).setToUCallback(CoderResult.unmappableForLength(1), cc.cbDecoder, cc.option); 594 } else { 595 decoder.onMalformedInput(cc.cbErrorAction); 596 decoder.onUnmappableCharacter(cc.cbErrorAction); 597 } 598 599 // set the options (if any: SKIP_STOP_ON_ILLEGAL) for callback 600 if (cc.option.equals("i")) { 601 decoder.onMalformedInput(CodingErrorAction.REPORT); 602 } 603 604 // if callback action is replace, and there is a subchar 605 // replace the decoder's default replacement value 606 // if substring, skip test due to current api not supporting 607 // substring replacement 608 if (cc.cbErrorAction.equals(CodingErrorAction.REPLACE)) { 609 if (cc.cbopt.length() > 1) { 610 if (cc.cbopt.charAt(1) == '=') { 611 logln("Skipping test due to limitation in Java API - substitution string not supported"); 612 613 } else { 614 // // read NUL-separated subchar first, if any 615 // copy the subchar from Latin-1 characters 616 // start after the NUL 617 if (cc.cbopt.charAt(1) == 0x00) { 618 cc.cbopt = cc.cbopt.substring(2); 619 620 try { 621 decoder.replaceWith(cc.cbopt); 622 } catch (Exception e) { 623 logln("Skipping test due to limitation in Java API - substitution character sequence size error"); 624 } 625 } 626 } 627 } 628 } 629 } 630 631 // Check the step to unicode 632 boolean ok; 633 int resultLength; 634 635 String steps[][] = { { "0", "bulk" }, // must be first for offsets to be checked 636 { "1", "step=1" }, { "3", "step=3" }, { "7", "step=7" } }; 637 /* TODO: currently not supported test steps, getNext API is not supported for now 638 { "-1", "getNext" }, 639 { "-2", "toU(bulk)+getNext" }, 640 { "-3", "getNext+toU(bulk)" }, 641 { "-4", "toU(1)+getNext" }, 642 { "-5", "getNext+toU(1)" }, 643 { "-12", "toU(5)+getNext" }, 644 { "-13", "getNext+toU(5)" }};*/ 645 646 ok = true; 647 int step; 648 // testing by steps using the CoderResult cr = charset.decoder(in,out,flush) api 649 for (int i = 0; i < steps.length && ok; ++i) { 650 step = Integer.parseInt(steps[i][0]); 651 652 if (step < 0 && !cc.finalFlush) { 653 continue; 654 } 655 logln("Testing step:[" + step + "]"); 656 657 try { 658 resultLength = stepToUnicode(cc, decoder, step); 659 ok = checkToUnicode(cc, resultLength); 660 } catch (Exception ex) { 661 errln("Test failed: " + ex.getClass().getName() + " thrown: " + cc.charset+ " [" + cc.caseNr + "]"); 662 ex.printStackTrace(System.out); 663 return; 664 } 665 } 666 667 //testing the java's out = charset.decoder(in) api 668 while (ok && cc.finalFlush) { 669 logln("Testing java charset.decoder(in):"); 670 cc.toUnicodeResult = null; 671 CharBuffer out = null; 672 673 try { 674 cc.bytes.rewind(); 675 out = decoder.decode(cc.bytes); 676 out.position(out.limit()); 677 if (out.limit() < cc.unicode.length()) { 678 int pos = out.position(); 679 char[] temp = out.array(); 680 out = CharBuffer.allocate(cc.bytes.limit()); 681 out.put(temp); 682 out.position(pos); 683 CoderResult cr = decoder.flush(out); 684 if (cr.isOverflow()) { 685 logln("Overflow error with flushing decodering"); 686 } 687 } 688 689 cc.toUnicodeResult = out; 690 691 ok = checkToUnicode(cc, out.limit()); 692 if (!ok) { 693 break; 694 } 695 } catch (Exception e) { 696 //check the error code to see if it matches cc.errorCode 697 logln("Decoder returned an error code"); 698 logln("ErrorCode expected is: " + cc.outErrorCode); 699 logln("Error Result is: " + e.toString()); 700 } 701 break; 702 } 703 704 return; 705 } 706 707 708 709 710 private int stepToUnicode(ConversionCase cc, CharsetDecoder decoder, 711 int step) 712 713 { 714 ByteBuffer source; 715 CharBuffer target; 716 boolean flush = false; 717 int sourceLen; 718 source = cc.bytes; 719 sourceLen = cc.bytes.limit(); 720 source.position(0); 721 target = CharBuffer.allocate(cc.unicode.length() + 4); 722 target.position(0); 723 cc.toUnicodeResult = null; 724 decoder.reset(); 725 726 if (step >= 0) { 727 728 int iStep = step; 729 int oStep = step; 730 731 for (;;) { 732 733 if (step != 0) { 734 source.limit((iStep <= sourceLen) ? iStep : sourceLen); 735 target.limit((oStep <= target.capacity()) ? oStep : target 736 .capacity()); 737 flush = (cc.finalFlush && source.limit() == sourceLen); 738 739 } else { 740 //bulk mode 741 source.limit(sourceLen); 742 target.limit(target.capacity()); 743 flush = cc.finalFlush; 744 } 745 // convert 746 CoderResult cr = null; 747 if (source.hasRemaining()) { 748 749 cr = decoder.decode(source, target, flush); 750 // check pointers and errors 751 if (cr.isOverflow()) { 752 // the partial target is filled, set a new limit, 753 oStep = (target.position() + step); 754 target.limit((oStep < target.capacity()) ? oStep 755 : target.capacity()); 756 if (target.limit() > target.capacity()) { 757 //target has reached its limit, an error occurred or test case has an error code 758 //check error code 759 logln("UnExpected error: Target Buffer is larger than capacity"); 760 break; 761 } 762 763 } else if (cr.isError()) { 764 //check the error code to see if it matches cc.errorCode 765 logln("Decoder returned an error code"); 766 logln("ErrorCode expected is: " + cc.outErrorCode); 767 logln("Error Result is: " + cr.toString()); 768 break; 769 } 770 771 } else { 772 if (source.limit() == sourceLen) { 773 774 cr = decoder.decode(source, target, true); 775 776 //due to limitation of the API we need to check for target limit for expected 777 if (target.position() != cc.unicode.length()) { 778 if (target.limit() != cc.unicode.length()) { 779 target.limit(cc.unicode.length()); 780 } 781 cr = decoder.flush(target); 782 if (cr.isError()) { 783 errln("Flush operation failed"); 784 } 785 } 786 break; 787 } 788 } 789 iStep += step; 790 791 } 792 793 }// if(step ==0) 794 795 //-------------------------------------------------------------------------- 796 else /* step<0 */{ 797 /* 798 * step==-1: call only ucnv_getNextUChar() 799 * otherwise alternate between ucnv_toUnicode() and ucnv_getNextUChar() 800 * if step==-2 or -3, then give ucnv_toUnicode() the whole remaining input, 801 * else give it at most (-step-2)/2 bytes 802 */ 803 804 for (;;) { 805 // convert 806 if ((step & 1) != 0 /* odd: -1, -3, -5, ... */) { 807 808 target.limit(target.position() < target.capacity() ? target 809 .position() + 1 : target.capacity()); 810 811 // decode behavior is return to output target 1 character 812 CoderResult cr = null; 813 814 //similar to getNextUChar() , input is the whole string, while outputs only 1 character 815 source.limit(sourceLen); 816 while (target.position() != target.limit() 817 && source.hasRemaining()) { 818 cr = decoder.decode(source, target, 819 source.limit() == sourceLen); 820 821 if (cr.isOverflow()) { 822 823 if (target.limit() >= target.capacity()) { 824 // target has reached its limit, an error occurred 825 logln("UnExpected error: Target Buffer is larger than capacity"); 826 break; 827 } else { 828 //1 character has been consumed 829 target.limit(target.position() + 1); 830 break; 831 } 832 } else if (cr.isError()) { 833 logln("Decoder returned an error code"); 834 logln("ErrorCode expected is: " + cc.outErrorCode); 835 logln("Error Result is: " + cr.toString()); 836 837 cc.toUnicodeResult = target; 838 return target.position(); 839 } 840 841 else { 842 // one character has been consumed 843 if (target.limit() == target.position()) { 844 target.limit(target.position() + 1); 845 break; 846 } 847 } 848 849 } 850 if (source.position() == sourceLen) { 851 852 // due to limitation of the API we need to check 853 // for target limit for expected 854 cr = decoder.decode(source, target, true); 855 if (target.position() != cc.unicode.length()) { 856 857 target.limit(cc.unicode.length()); 858 cr = decoder.flush(target); 859 if (cr.isError()) { 860 errln("Flush operation failed"); 861 } 862 } 863 break; 864 } 865 // alternate between -n-1 and -n but leave -1 alone 866 if (step < -1) { 867 ++step; 868 } 869 } else {/* step is even */ 870 // allow only one UChar output 871 872 target.limit(target.position() < target.capacity() ? target 873 .position() + 1 : target.capacity()); 874 if (step == -2) { 875 source.limit(sourceLen); 876 } else { 877 source.limit(source.position() + (-step - 2) / 2); 878 if (source.limit() > sourceLen) { 879 source.limit(sourceLen); 880 } 881 } 882 CoderResult cr = decoder.decode(source, target, source 883 .limit() == sourceLen); 884 // check pointers and errors 885 if (cr.isOverflow()) { 886 // one character has been consumed 887 if (target.limit() >= target.capacity()) { 888 // target has reached its limit, an error occurred 889 logln("Unexpected error: Target Buffer is larger than capacity"); 890 break; 891 } 892 } else if (cr.isError()) { 893 logln("Decoder returned an error code"); 894 logln("ErrorCode expected is: " + cc.outErrorCode); 895 logln("Error Result is: " + cr.toString()); 896 break; 897 } 898 899 --step; 900 } 901 } 902 } 903 904 //-------------------------------------------------------------------------- 905 906 cc.toUnicodeResult = target; 907 return target.position(); 908 } 909 910 911 912 private boolean checkToUnicode(ConversionCase cc, int resultLength) { 913 return checkResultsToUnicode(cc, cc.unicode, cc.toUnicodeResult); 914 } 915 916 917 private void TestGetUnicodeSet(DataMap testcase) { 918 /* 919 * charset - will be opened, and ucnv_getUnicodeSet() called on it // 920 * map - set of code points and strings that must be in the returned set // 921 * mapnot - set of code points and strings that must *not* be in the // 922 * returned set // which - numeric UConverterUnicodeSet value Headers { 923 * "charset", "map", "mapnot", "which" } 924 */ 925 926 927 // retrieve test case data 928 ConversionCase cc = new ConversionCase(); 929 CharsetProviderICU provider = new CharsetProviderICU(); 930 CharsetICU charset ; 931 932 933 UnicodeSet mapset = new UnicodeSet(); 934 UnicodeSet mapnotset = new UnicodeSet(); 935 UnicodeSet unicodeset = new UnicodeSet(); 936 String ellipsis = "0x2e"; 937 cc.charset = ((ICUResourceBundle) testcase.getObject("charset")) 938 .getString(); 939 cc.map = ((ICUResourceBundle) testcase.getObject("map")).getString(); 940 cc.mapnot = ((ICUResourceBundle) testcase.getObject("mapnot")) 941 .getString(); 942 943 944 cc.which = ((ICUResourceBundle) testcase.getObject("which")).getInt(); // only checking for ROUNDTRIP_SET 945 946 // Android patch: Skip tests that fail with customized data. 947 String [] testsToSkip = { 948 "HZ", 949 "ibm-1390", 950 "ibm-16684", 951 "ibm-25546", 952 "ibm-971", 953 "ISO-2022-CN", 954 "ISO-2022-JP", 955 "ISO-2022-JP-2", 956 "ISO-2022-KR", 957 "JIS7", 958 }; 959 for (int i = 0; i < testsToSkip.length; i++) { 960 if (cc.charset.equals(testsToSkip[i])) { 961 logln(""); 962 logln("Skipping: " + cc.charset); 963 logln("..............................................."); 964 return; 965 } 966 } 967 // Android patch end. 968 969 // ----for debugging only 970 logln(""); 971 logln("TestGetUnicodeSet[" + cc.charset + "] "); 972 logln("..............................................."); 973 974 try{ 975 // if cc.charset starts with '*', obtain it from com/ibm/icu/dev/data/testdata 976 charset = (cc.charset != null && cc.charset.length() > 0 && cc.charset.charAt(0) == '*') 977 ? (CharsetICU) provider.charsetForName(cc.charset.substring(1), 978 "com/ibm/icu/dev/data/testdata", this.getClass().getClassLoader()) 979 : (CharsetICU) provider.charsetForName(cc.charset); 980 981 //checking for converter that are not supported at this point 982 try{ 983 if(charset==null || 984 charset.name()=="BOCU-1" ||charset.name()== "SCSU"|| charset.name()=="lmbcs1" || charset.name()== "lmbcs2" || 985 charset.name()== "lmbcs3" || charset.name()== "lmbcs4" || charset.name()=="lmbcs5" || charset.name()=="lmbcs6" || 986 charset.name()== "lmbcs8" || charset.name()=="lmbcs11" || charset.name()=="lmbcs16" || charset.name()=="lmbcs17" || 987 charset.name()=="lmbcs18"|| charset.name()=="lmbcs19"){ 988 logln("Converter not supported at this point :" + cc.charset); 989 return; 990 } 991 992 if(cc.which==1){ 993 logln("Fallback set not supported at this point for converter : "+charset.displayName()); 994 return; 995 } 996 997 }catch(Exception e){ 998 return; 999 } 1000 1001 mapset.clear(); 1002 mapnotset.clear(); 1003 1004 mapset.applyPattern(cc.map,false); 1005 mapnotset.applyPattern(cc.mapnot,false); 1006 1007 charset.getUnicodeSet(unicodeset, cc.which); 1008 UnicodeSet diffset = new UnicodeSet(); 1009 1010 //are there items that must be in unicodeset but are not? 1011 (diffset = mapset).removeAll(unicodeset); 1012 if(!diffset.isEmpty()){ 1013 StringBuffer s = new StringBuffer(diffset.toPattern(true)); 1014 if(s.length()>100){ 1015 s.replace(0, 0x7fffffff, ellipsis); 1016 } 1017 errln("error in missing items - conversion/getUnicodeSet test case "+cc.charset + "\n" + s.toString()); 1018 } 1019 1020 //are the items that must not be in unicodeset but are? 1021 (diffset=mapnotset).retainAll(unicodeset); 1022 if(!diffset.isEmpty()){ 1023 StringBuffer s = new StringBuffer(diffset.toPattern(true)); 1024 if(s.length()>100){ 1025 s.replace(0, 0x7fffffff, ellipsis); 1026 } 1027 errln("contains unexpected items - conversion/getUnicodeSet test case "+cc.charset + "\n" + s.toString()); 1028 } 1029 } catch (Exception e) { 1030 errln("getUnicodeSet returned an error code"); 1031 errln("ErrorCode expected is: " + cc.outErrorCode); 1032 errln("Error Result is: " + e.toString()); 1033 return; 1034 } 1035 } 1036 1037 /** 1038 * This follows ucnv.c method ucnv_detectUnicodeSignature() to detect the 1039 * start of the stream for example U+FEFF (the Unicode BOM/signature 1040 * character) that can be ignored. 1041 * 1042 * Detects Unicode signature byte sequences at the start of the byte stream 1043 * and returns number of bytes of the BOM of the indicated Unicode charset. 1044 * 0 is returned when no Unicode signature is recognized. 1045 * 1046 */ 1047 1048 private String detectUnicodeSignature(ByteBuffer source) { 1049 int signatureLength = 0; // number of bytes of the signature 1050 final int SIG_MAX_LEN = 5; 1051 String sigUniCharset = null; // states what unicode charset is the BOM 1052 int i = 0; 1053 1054 /* 1055 * initial 0xa5 bytes: make sure that if we read <SIG_MAX_LEN bytes we 1056 * don't misdetect something 1057 */ 1058 byte start[] = { (byte) 0xa5, (byte) 0xa5, (byte) 0xa5, (byte) 0xa5, 1059 (byte) 0xa5 }; 1060 1061 while (i < source.limit() && i < SIG_MAX_LEN) { 1062 start[i] = source.get(i); 1063 i++; 1064 } 1065 1066 if (start[0] == (byte) 0xFE && start[1] == (byte) 0xFF) { 1067 signatureLength = 2; 1068 sigUniCharset = "UTF-16BE"; 1069 source.position(signatureLength); 1070 return sigUniCharset; 1071 } else if (start[0] == (byte) 0xFF && start[1] == (byte) 0xFE) { 1072 if (start[2] == (byte) 0x00 && start[3] == (byte) 0x00) { 1073 signatureLength = 4; 1074 sigUniCharset = "UTF-32LE"; 1075 source.position(signatureLength); 1076 return sigUniCharset; 1077 } else { 1078 signatureLength = 2; 1079 sigUniCharset = "UTF-16LE"; 1080 source.position(signatureLength); 1081 return sigUniCharset; 1082 } 1083 } else if (start[0] == (byte) 0xEF && start[1] == (byte) 0xBB 1084 && start[2] == (byte) 0xBF) { 1085 signatureLength = 3; 1086 sigUniCharset = "UTF-8"; 1087 source.position(signatureLength); 1088 return sigUniCharset; 1089 } else if (start[0] == (byte) 0x00 && start[1] == (byte) 0x00 1090 && start[2] == (byte) 0xFE && start[3] == (byte) 0xFF) { 1091 signatureLength = 4; 1092 sigUniCharset = "UTF-32BE"; 1093 source.position(signatureLength); 1094 return sigUniCharset; 1095 } else if (start[0] == (byte) 0x0E && start[1] == (byte) 0xFE 1096 && start[2] == (byte) 0xFF) { 1097 signatureLength = 3; 1098 sigUniCharset = "SCSU"; 1099 source.position(signatureLength); 1100 return sigUniCharset; 1101 } else if (start[0] == (byte) 0xFB && start[1] == (byte) 0xEE 1102 && start[2] == (byte) 0x28) { 1103 signatureLength = 3; 1104 sigUniCharset = "BOCU-1"; 1105 source.position(signatureLength); 1106 return sigUniCharset; 1107 } else if (start[0] == (byte) 0x2B && start[1] == (byte) 0x2F 1108 && start[2] == (byte) 0x76) { 1109 1110 if (start[3] == (byte) 0x38 && start[4] == (byte) 0x2D) { 1111 signatureLength = 5; 1112 sigUniCharset = "UTF-7"; 1113 source.position(signatureLength); 1114 return sigUniCharset; 1115 } else if (start[3] == (byte) 0x38 || start[3] == (byte) 0x39 1116 || start[3] == (byte) 0x2B || start[3] == (byte) 0x2F) { 1117 signatureLength = 4; 1118 sigUniCharset = "UTF-7"; 1119 source.position(signatureLength); 1120 return sigUniCharset; 1121 } 1122 } else if (start[0] == (byte) 0xDD && start[2] == (byte) 0x73 1123 && start[2] == (byte) 0x66 && start[3] == (byte) 0x73) { 1124 signatureLength = 4; 1125 sigUniCharset = "UTF-EBCDIC"; 1126 source.position(signatureLength); 1127 return sigUniCharset; 1128 } 1129 1130 /* no known Unicode signature byte sequence recognized */ 1131 return null; 1132 } 1133 1134 String printbytes(ByteBuffer buf, int pos) { 1135 int cur = buf.position(); 1136 String res = " (" + pos + ")==["; 1137 for (int i = 0; i < pos; i++) { 1138 res += "(" + i + ")" + hex(buf.get(i) & 0xff).substring(2) + " "; 1139 } 1140 buf.position(cur); 1141 return res + "]"; 1142 } 1143 1144 String printchars(CharBuffer buf, int pos) { 1145 int cur = buf.position(); 1146 String res = " (" + pos + ")==["; 1147 for (int i = 0; i < pos; i++) { 1148 res += "(" + i + ")" + hex(buf.get(i)) + " "; 1149 } 1150 buf.position(cur); 1151 return res + "]"; 1152 } 1153 1154 private boolean checkResultsFromUnicode(ConversionCase cc, ByteBuffer expected, 1155 ByteBuffer output) { 1156 1157 boolean res = true; 1158 expected.rewind(); 1159 output.limit(output.position()); 1160 output.rewind(); 1161 1162 // remove any BOM signature before checking 1163 if (!cc.charset.contains("UnicodeLittle") && !cc.charset.contains("UnicodeBig")) { 1164 detectUnicodeSignature(output); // sets the position to after the BOM 1165 output = output.slice(); // removes anything before the current position 1166 } 1167 1168 if (output.limit() != expected.limit()) { 1169 errln("Test failed: output length does not match expected for charset: " + cc.charset 1170 + " [" + cc.caseNr + "]"); 1171 res = false; 1172 } else { 1173 while (output.hasRemaining()) { 1174 if (output.get() != expected.get()) { 1175 errln("Test failed: output does not match expected for charset: " + cc.charset 1176 + " [" + cc.caseNr + "]"); 1177 res = false; 1178 break; 1179 } 1180 } 1181 } 1182 1183 if (res) { 1184 logln("[" + cc.caseNr + "]:" + cc.charset); 1185 logln("Input: " + printchars(CharBuffer.wrap(cc.unicode), cc.unicode.length())); 1186 logln("Output: " + printbytes(output, output.limit())); 1187 logln("Expected: " + printbytes(expected, expected.limit())); 1188 logln("Passed"); 1189 } 1190 else { 1191 errln("[" + cc.caseNr + "]:" + cc.charset); 1192 errln("Input: " + printchars(CharBuffer.wrap(cc.unicode), cc.unicode.length())); 1193 errln("Output: " + printbytes(output, output.limit())); 1194 errln("Expected: " + printbytes(expected, expected.limit())); 1195 errln("Failed"); 1196 } 1197 return res; 1198 } 1199 1200 private boolean checkResultsToUnicode(ConversionCase cc, String expected, CharBuffer output) { 1201 1202 boolean res = true; 1203 output.limit(output.position()); 1204 output.rewind(); 1205 1206 // test to see if the conversion matches actual results 1207 if (output.limit() != expected.length()) { 1208 errln("Test failed: output length does not match expected for charset: "+cc.charset+ " [" + cc.caseNr + "]"); 1209 res = false; 1210 } else { 1211 for (int i = 0; i < expected.length(); i++) { 1212 if (output.get(i) != expected.charAt(i)) { 1213 errln("Test failed: output does not match expected for charset: " + cc.charset 1214 + " [" + cc.caseNr + "]"); 1215 res = false; 1216 break; 1217 } 1218 } 1219 } 1220 1221 if (res) { 1222 logln("[" + cc.caseNr + "]:" + cc.charset); 1223 logln("Input: " + printbytes(cc.bytes, cc.bytes.limit())); 1224 logln("Output: " + printchars(output, output.limit())); 1225 logln("Expected: " + printchars(CharBuffer.wrap(expected), expected.length())); 1226 logln("Passed"); 1227 } else { 1228 errln("[" + cc.caseNr + "]:" + cc.charset); 1229 errln("Input: " + printbytes(cc.bytes, cc.bytes.limit())); 1230 errln("Output: " + printchars(output, output.limit())); 1231 errln("Expected: " + printchars(CharBuffer.wrap(expected), expected.length())); 1232 errln("Failed"); 1233 } 1234 return res; 1235 } 1236 1237 private byte[] toByteArray(String str) { 1238 byte[] ret = new byte[str.length()]; 1239 for (int i = 0; i < ret.length; i++) { 1240 char ch = str.charAt(i); 1241 if (ch <= 0xFF) { 1242 ret[i] = (byte) ch; 1243 } else { 1244 throw new IllegalArgumentException(" byte value out of range: " + ch); 1245 } 1246 } 1247 return ret; 1248 } 1249 } 1250