/**
*******************************************************************************
* Copyright (C) 2006-2014, International Business Machines Corporation and    *
* others. All Rights Reserved.                                                *
*******************************************************************************
*
*******************************************************************************
*/

package com.ibm.icu.charset;

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.IntBuffer;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;

import com.ibm.icu.impl.Assert;

/**
 * An abstract class that provides framework methods of decoding operations for concrete
 * subclasses.
 * In the future this class will contain API that will implement converter semantics of ICU4C.
 * @stable ICU 3.6
 */
public abstract class CharsetDecoderICU extends CharsetDecoder {

    int toUnicodeStatus;
    /* bytes of the current input sequence; copied to invalidCharBuffer when an error is reported */
    byte[] toUBytesArray = new byte[128];
    int toUBytesBegin = 0;
    /* number of valid bytes in toUBytesArray */
    int toULength;
    /* output that did not fit into the target buffer; flushed first on the next decode() call */
    char[] charErrorBufferArray = new char[128];
    int charErrorBufferLength;
    int charErrorBufferBegin;
    /* the offending input sequence (one char per byte) handed to the error callback */
    char[] invalidCharBuffer = new char[128];
    int invalidCharLength;

    /**
     * Maximum number of indexed bytes
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    protected static final int EXT_MAX_BYTES = 0x1f;

    /* store previous UChars/chars to continue partial matches */
    byte[] preToUArray = new byte[EXT_MAX_BYTES];
    int preToUBegin;
    int preToULength; /* negative: replay */
    int preToUFirstLength; /* length of first character */
    int mode;

    Object toUContext = null;
    private CharsetCallback.Decoder onUnmappableCharacter = CharsetCallback.TO_U_CALLBACK_STOP;
    private CharsetCallback.Decoder onMalformedInput = CharsetCallback.TO_U_CALLBACK_STOP;

    /*
     * Dispatches an error to the unmappable-character callback when the result is
     * "unmappable", and to the malformed-input callback for every other result.
     */
    CharsetCallback.Decoder toCharErrorBehaviour = new CharsetCallback.Decoder() {
        public CoderResult call(CharsetDecoderICU decoder, Object context, ByteBuffer source,
                CharBuffer target, IntBuffer offsets, char[] buffer, int length, CoderResult cr) {
            if (cr.isUnmappable()) {
                return onUnmappableCharacter.call(decoder, context, source, target, offsets, buffer,
                        length, cr);
            } else /* if (cr.isMalformed()) */ {
                return onMalformedInput.call(decoder, context, source, target, offsets, buffer,
                        length, cr);
            }
            // return CharsetCallback.TO_U_CALLBACK_STOP.call(decoder, context, source, target, offsets, buffer, length, cr);
        }
    };

    // exist to keep implOnMalformedInput and implOnUnmappableInput from being too recursive
    private boolean malformedInputCalled = false;
    private boolean unmappableCharacterCalled = false;

    /*
     * Construct a CharsetDecoderICU based on the information provided from a CharsetICU object.
     *
     * @param cs The CharsetICU object containing information about how to charset to decode.
     */
    CharsetDecoderICU(CharsetICU cs) {
        super(cs, (1 / cs.maxCharsPerByte), cs.maxCharsPerByte);
    }

    /*
     * Is this Decoder allowed to use fallbacks? A fallback mapping is a mapping
     * that will convert a byte sequence to a Unicode codepoint sequence, but
     * the encoded Unicode codepoint sequence will round trip convert to a different
     * byte sequence. In ICU, this can be called a reverse fallback.
     * @return A boolean
     */
    final boolean isFallbackUsed() {
        return true;
    }

    /**
     * Fallback is currently always used by icu4j decoders.
     */
    static final boolean isToUUseFallback() {
        return isToUUseFallback(true);
    }

    /**
     * Fallback is currently always used by icu4j decoders.
     */
    static final boolean isToUUseFallback(boolean iUseFallback) {
        return true;
    }

    /**
     * Sets the action to be taken if an illegal sequence is encountered
     *
     * @param newAction action to be taken
     * @exception IllegalArgumentException
     * @stable ICU 3.6
     */
    @Override
    protected final void implOnMalformedInput(CodingErrorAction newAction) {
        // don't run infinitely: super.onMalformedInput() below calls back into this method
        if (malformedInputCalled) {
            return;
        }

        // if we get a replace, do not let the nio replace
        if (newAction == CodingErrorAction.REPLACE) {
            malformedInputCalled = true;
            super.onMalformedInput(CodingErrorAction.IGNORE);
            malformedInputCalled = false;
        }

        onMalformedInput = getCallback(newAction);
    }

    /**
     * Sets the action to be taken if an unmappable character is encountered
     *
     * @param newAction action to be taken
     * @exception IllegalArgumentException
     * @stable ICU 3.6
     */
    @Override
    protected final void implOnUnmappableCharacter(CodingErrorAction newAction) {
        // don't run infinitely: super.onUnmappableCharacter() below calls back into this method
        if (unmappableCharacterCalled) {
            return;
        }

        // if we get a replace, do not let the nio replace
        if (newAction == CodingErrorAction.REPLACE) {
            unmappableCharacterCalled = true;
            super.onUnmappableCharacter(CodingErrorAction.IGNORE);
            unmappableCharacterCalled = false;
        }

        onUnmappableCharacter = getCallback(newAction);
    }

    /**
     * Sets the callback decoder method and context to be used if an illegal sequence is encountered.
     * You would normally call this twice to set both the malformed and unmappable error. In this case,
     * newContext should remain the same since using a different newContext each time will negate the last
     * one used.
     * @param err CoderResult selecting which callback to replace (malformed or unmappable);
     *            any other kind of result leaves both callbacks unchanged
     * @param newCallback CharsetCallback.Decoder
     * @param newContext Object
     * @stable ICU 4.0
     */
    public final void setToUCallback(CoderResult err, CharsetCallback.Decoder newCallback, Object newContext) {
        if (err.isMalformed()) {
            onMalformedInput = newCallback;
        } else if (err.isUnmappable()) {
            onUnmappableCharacter = newCallback;
        } else {
            /* Error: Only malformed and unmappable are handled. */
        }

        if (toUContext == null || !toUContext.equals(newContext)) {
            toUContext = newContext;
        }
    }

    /* Maps a java.nio error action onto the corresponding ICU callback; anything else means STOP. */
    private static CharsetCallback.Decoder getCallback(CodingErrorAction action) {
        if (action == CodingErrorAction.REPLACE) {
            return CharsetCallback.TO_U_CALLBACK_SUBSTITUTE;
        } else if (action == CodingErrorAction.IGNORE) {
            return CharsetCallback.TO_U_CALLBACK_SKIP;
        } else /* if(action==CodingErrorAction.REPORT) */ {
            return CharsetCallback.TO_U_CALLBACK_STOP;
        }
    }

    private final ByteBuffer EMPTY = ByteBuffer.allocate(0);

    /**
     * Flushes any characters saved in the converter's internal buffer and
     * resets the converter.
     * @param out action to be taken
     * @return result of flushing action and completes the decoding all input.
     *         Returns CoderResult.UNDERFLOW if the action succeeds.
     * @stable ICU 3.6
     */
    @Override
    protected final CoderResult implFlush(CharBuffer out) {
        return decode(EMPTY, out, null, true);
    }

    /**
     * Resets the to Unicode mode of converter
     * @stable ICU 3.6
     */
    @Override
    protected void implReset() {
        toUnicodeStatus = 0;
        toULength = 0;
        charErrorBufferLength = 0;
        charErrorBufferBegin = 0;

        /* store previous UChars/chars to continue partial matches */
        preToUBegin = 0;
        preToULength = 0; /* negative: replay */
        preToUFirstLength = 0;

        mode = 0;
    }

    /**
     * Decodes one or more bytes. The default behaviour of the converter
     * is stop and report if an error in input stream is encountered.
     * To set different behaviour use @see CharsetDecoder.onMalformedInput()
     * This method allows a buffer by buffer conversion of a data stream.
     * The state of the conversion is saved between calls to convert.
     * Among other things, this means multibyte input sequences can be
     * split between calls. If a call to convert results in an Error, the
     * conversion may be continued by calling convert again with suitably
     * modified parameters. All conversions should be finished with a call to
     * the flush method.
     * @param in buffer to decode
     * @param out buffer to populate with decoded result
     * @return Result of decoding action. Returns CoderResult.UNDERFLOW if the decoding
     *         action succeeds or more input is needed for completing the decoding action.
     * @stable ICU 3.6
     */
    @Override
    protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out) {
        if (in.remaining() < toUCountPending()) {
            return CoderResult.UNDERFLOW;
        }
        // if (!in.hasRemaining()) {
        //     toULength = 0;
        //     return CoderResult.UNDERFLOW;
        // }

        // skip the bytes that are already held in the converter state
        in.position(in.position() + toUCountPending());

        /* do the conversion */
        CoderResult ret = decode(in, out, null, false);

        // ok was there input held in the previous invocation of decodeLoop
        // that resulted in output in this invocation?
        // (toUCountPending() is re-evaluated on purpose: decode() may have changed the state)
        in.position(in.position() - toUCountPending());

        return ret;
    }

    /*
     * Implements the ICU semantic for decode operation
     * @param in The input byte buffer
     * @param out The output character buffer
     * @return Result of decoding action. Returns CoderResult.UNDERFLOW if the decoding
     *         action succeeds or more input is needed for completing the decoding action.
     */
    abstract CoderResult decodeLoop(ByteBuffer in, CharBuffer out, IntBuffer offsets, boolean flush);

    /*
     * Implements the ICU semantic for decode operation
     * @param source The input byte buffer
     * @param target The output character buffer
     * @param offsets
     * @param flush true if, and only if, the invoker can provide no
     *            additional input bytes beyond those in the given buffer.
     * @return Result of decoding action. Returns CoderResult.UNDERFLOW if the decoding
     *         action succeeds or more input is needed for completing the decoding action.
     * @throws IllegalArgumentException if source or target is null
     */
    final CoderResult decode(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) {

        /* check parameters */
        if (target == null || source == null) {
            throw new IllegalArgumentException();
        }

        /*
         * Make sure that the buffer sizes do not exceed the number range for
         * int32_t because some functions use the size (in units or bytes)
         * rather than comparing pointers, and because offsets are int32_t values.
         *
         * size_t is guaranteed to be unsigned and large enough for the job.
         *
         * Return with an error instead of adjusting the limits because we would
         * not be able to maintain the semantics that either the source must be
         * consumed or the target filled (unless an error occurs).
         * An adjustment would be sourceLimit=t+0x7fffffff; for example.
         */
        /*agljport:fix
        if(
            ((size_t)(sourceLimit-s)>(size_t)0x7fffffff && sourceLimit>s) ||
            ((size_t)(targetLimit-t)>(size_t)0x3fffffff && targetLimit>t)
        ) {
            *err=U_ILLEGAL_ARGUMENT_ERROR;
            return;
        }
        */

        /* flush the target overflow buffer */
        if (charErrorBufferLength > 0) {
            int i = 0;
            do {
                if (!target.hasRemaining()) {
                    /* the overflow buffer contains too much, keep the rest */
                    int remaining = charErrorBufferLength - i;
                    System.arraycopy(charErrorBufferArray, i, charErrorBufferArray, 0, remaining);

                    /* no narrowing cast: a full 128-char buffer must not wrap to a negative length */
                    charErrorBufferLength = remaining;
                    return CoderResult.OVERFLOW;
                }

                /* copy the overflow contents to the target */
                target.put(charErrorBufferArray[i++]);
                if (offsets != null) {
                    offsets.put(-1); /* no source index available for old output */
                }
            } while (i < charErrorBufferLength);

            /* the overflow buffer is completely copied to the target */
            charErrorBufferLength = 0;
        }

        if (!flush && !source.hasRemaining() && toULength == 0 && preToULength >= 0) {
            /* the overflow buffer is emptied and there is no new input: we are done */
            return CoderResult.UNDERFLOW;
        }

        /*
         * Do not simply return with a buffer overflow error if
         * !flush && t==targetLimit
         * because it is possible that the source will not generate any output.
         * For example, the skip callback may be called;
         * it does not output anything.
         */

        return toUnicodeWithCallback(source, target, offsets, flush);
    }

    /* Currently, we are not using offsets in ICU4J. */
    // private void updateOffsets(IntBuffer offsets,int length, int sourceIndex, int errorInputLength) {
    //     int limit;
    //     int delta, offset;
    //
    //     if(sourceIndex>=0) {
    //         // adjust each offset by adding the previous sourceIndex
    //         // minus the length of the input sequence that caused an
    //         // error, if any
    //         delta=sourceIndex-errorInputLength;
    //     } else {
    //         // set each offset to -1 because this conversion function
    //         // does not handle offsets
    //         delta=-1;
    //     }
    //     limit=offsets.position()+length;
    //     if(delta==0) {
    //         // most common case, nothing to do
    //     } else if(delta>0) {
    //         // add the delta to each offset (but not if the offset is <0)
    //         while(offsets.position()<limit) {
    //             offset=offsets.get(offsets.position());
    //             if(offset>=0) {
    //                 offsets.put(offset+delta);
    //             }
    //             //FIXME: ++offsets;
    //         }
    //     } else { // delta<0
    //         // set each offset to -1 because this conversion function
    //         // does not handle offsets
    //         // or the error input sequence started in a previous buffer
    //         while(offsets.position()<limit) {
    //             offsets.put(-1);
    //         }
    //     }
    // }

    /*
     * Copies the first count bytes of preToUArray into the replay buffer and prepares
     * the buffer for reading (position 0, limit count). The buffer is cleared first so
     * that it can be refilled when a replay happens more than once during one call of
     * toUnicodeWithCallback(); without the reset the put() would start at the old limit
     * and throw BufferOverflowException.
     */
    private ByteBuffer fillReplayBuffer(ByteBuffer replay, int count) {
        replay.clear();
        //UConverterUtility.uprv_memcpy(replayArray, replayBegin, preToUArray, preToUBegin, -preToULength);
        replay.put(preToUArray, 0, count);
        replay.flip();
        return replay;
    }

    /*
     * Runs the subclass conversion function (decodeLoop with offsets and flush) in a loop,
     * handing every malformed/unmappable result to the configured error callback and
     * replaying stored input from preToUArray when preToULength is negative.
     *
     * @param source The input byte buffer
     * @param target The output character buffer
     * @param offsets passed through to the conversion function and the callbacks; may be null
     * @param flush true if no further input will follow the given buffer
     * @return UNDERFLOW when the available input was processed, OVERFLOW when the target is
     *         full, or the error result that the callback did not resolve.
     */
    final CoderResult toUnicodeWithCallback(ByteBuffer source, CharBuffer target, IntBuffer offsets,
            boolean flush) {

        int sourceIndex = 0;
        int errorInputLength;
        boolean converterSawEndOfInput;
        boolean calledCallback;

        /* variables for m:n conversion */
        ByteBuffer replayArray = ByteBuffer.allocate(EXT_MAX_BYTES);
        ByteBuffer realSource = null;
        boolean realFlush = false;
        int realSourceIndex = 0;

        CoderResult cr;

        if (preToULength >= 0) {
            /* normal mode */
        } else {
            /*
             * Previous m:n conversion stored source units from a partial match
             * and failed to consume all of them.
             * We need to "replay" them from a temporary buffer and convert them first.
             */
            realSource = source;
            realFlush = flush;
            realSourceIndex = sourceIndex;
            source = fillReplayBuffer(replayArray, -preToULength);
            flush = false;
            sourceIndex = -1;
            preToULength = 0;
        }

        /*
         * loop for conversion and error handling
         *
         * loop {
         *   convert
         *   loop {
         *     update offsets
         *     handle end of input
         *     handle errors/call callback
         *   }
         * }
         */
        for (;;) {

            /* convert */
            cr = decodeLoop(source, target, offsets, flush);

            /*
             * set a flag for whether the converter
             * successfully processed the end of the input
             *
             * need not check cnv->preToULength==0 because a replay (<0) will cause
             * s<sourceLimit before converterSawEndOfInput is checked
             */
            converterSawEndOfInput =
                    cr.isUnderflow() && flush && source.remaining() == 0 && toULength == 0;

            /* no callback called yet for this iteration */
            calledCallback = false;

            /* no sourceIndex adjustment for conversion, only for callback output */
            errorInputLength = 0;

            /*
             * loop for offsets and error handling
             *
             * iterates at most 3 times:
             * 1. to clean up after the conversion function
             * 2. after the callback
             * 3. after the callback again if there was truncated input
             */
            for (;;) {
                /* update offsets if we write any */
                /* Currently offsets are not being used in ICU4J */
                // if(offsets!=null) {
                //
                //     int length=(target.position()-t);
                //     if(length>0) {
                //         updateOffsets(offsets, length, sourceIndex, errorInputLength);
                //
                //         // if a converter handles offsets and updates the offsets
                //         // pointer at the end, then pArgs->offset should not change
                //         // here;
                //         // however, some converters do not handle offsets at all
                //         // (sourceIndex<0) or may not update the offsets pointer
                //         //TODO: pArgs->offsets=offsets+=length;
                //     }
                //
                //     if(sourceIndex>=0) {
                //         sourceIndex+=(source.position()-s);
                //     }
                //
                // }

                if (preToULength < 0) {
                    /*
                     * switch the source to new replay units (cannot occur while replaying)
                     * after offset handling and before end-of-input and callback handling
                     */
                    if (realSource == null) {
                        realSource = source;
                        realFlush = flush;
                        realSourceIndex = sourceIndex;

                        source = fillReplayBuffer(replayArray, -preToULength);
                        flush = false;
                        if ((sourceIndex += preToULength) < 0) {
                            sourceIndex = -1;
                        }

                        preToULength = 0;
                    } else {
                        /* see implementation note before _fromUnicodeWithCallback() */
                        //agljport:todo U_ASSERT(realSource==NULL);
                        Assert.assrt(realSource == null);
                    }
                }

                if (cr.isUnderflow()) {
                    if (source.hasRemaining()) {
                        /*
                         * continue with the conversion loop while there is still input left
                         * (continue converting by breaking out of only the inner loop)
                         */
                        break;
                    } else if (realSource != null) {
                        /* switch back from replaying to the real source and continue */
                        source = realSource;
                        flush = realFlush;
                        sourceIndex = realSourceIndex;
                        realSource = null;
                        break;
                    } else if (flush && toULength > 0) {
                        /*
                         * the entire input stream is consumed
                         * and there is a partial, truncated input sequence left
                         */

                        /* inject an error and continue with callback handling */
                        cr = CoderResult.malformedForLength(toULength);
                        calledCallback = false; /* new error condition */
                    } else {
                        /* input consumed */
                        if (flush) {
                            /*
                             * return to the conversion loop once more if the flush
                             * flag is set and the conversion function has not
                             * successfully processed the end of the input yet
                             *
                             * (continue converting by breaking out of only the inner loop)
                             */
                            if (!converterSawEndOfInput) {
                                break;
                            }

                            /* reset the converter without calling the callback function */
                            implReset();
                        }

                        /* done successfully */
                        return cr;
                    }
                }

                /* U_FAILURE(*err) */
                if (calledCallback || cr.isOverflow() || !cr.isError()) {
                    /*
                     * the callback did not or cannot resolve the error:
                     * set output pointers and return
                     *
                     * the check for buffer overflow is redundant but it is
                     * a high-runner case and hopefully documents the intent
                     * well
                     *
                     * if we were replaying, then the replay buffer must be
                     * copied back into the UConverter
                     * and the real arguments must be restored
                     */
                    if (realSource != null) {
                        Assert.assrt(preToULength == 0);
                        int length = source.limit() - source.position();
                        if (length > 0) {
                            //UConverterUtility.uprv_memcpy(preToUArray, preToUBegin, pArgs.sourceArray, pArgs.sourceBegin, length);
                            source.get(preToUArray, preToUBegin, length);
                            preToULength = -length;
                        }
                    }
                    return cr;
                }

                /* copy toUBytes[] to invalidCharBuffer[] */
                errorInputLength = invalidCharLength = toULength;
                if (errorInputLength > 0) {
                    copy(toUBytesArray, 0, invalidCharBuffer, 0, errorInputLength);
                }

                /* set the converter state to deal with the next character */
                toULength = 0;

                /* call the callback function */
                cr = toCharErrorBehaviour.call(this, toUContext, source, target, offsets,
                        invalidCharBuffer, errorInputLength, cr);
                /*
                 * loop back to the offset handling
                 *
                 * this flag will indicate after offset handling
                 * that a callback was called;
                 * if the callback did not resolve the error, then we return
                 */
                calledCallback = true;
            }
        }
    }

    /*
     * Returns the number of bytes held in the converter's internal state
     * because more input is needed for completing the conversion. This function is
     * useful for mapping semantics of ICU's converter interface to those of iconv,
     * and this information is not needed for normal conversion.
     * @return The magnitude of preToULength if it is non-zero, otherwise toULength if it
     *         is positive, otherwise 0.
     */
    /*public*/ int toUCountPending() {
        if (preToULength > 0) {
            return preToULength;
        } else if (preToULength < 0) {
            return -preToULength;
        } else if (toULength > 0) {
            return toULength;
        } else {
            return 0;
        }
    }

    /*
     * Copies length bytes from src (starting at srcOffset) to dst (starting at dstOffset),
     * widening each byte to a char as an unsigned value.
     */
    private void copy(byte[] src, int srcOffset, char[] dst, int dstOffset, int length) {
        for (int i = 0; i < length; i++) {
            dst[dstOffset + i] = (char) (src[srcOffset + i] & UConverterConstants.UNSIGNED_BYTE_MASK);
        }
    }

    /*
     * ONLY used by ToU callback functions.
     * This function will write out the specified characters to the target
     * character buffer. Characters that do not fit into the target are stored in the
     * decoder's charErrorBufferArray and OVERFLOW is returned.
     * @param cnv the decoder whose overflow buffer receives the characters that do not fit
     * @param ucharsArray the characters to write
     * @param ucharsBegin index of the first character to write
     * @param length number of characters to write
     * @param target the output character buffer
     * @param offsets if not null, receives sourceIndex once per character written to target
     * @param sourceIndex the value written to offsets
     * @return A CoderResult object that contains the error result when an error occurs.
     */
    static final CoderResult toUWriteUChars(CharsetDecoderICU cnv,
            char[] ucharsArray, int ucharsBegin, int length,
            CharBuffer target, IntBuffer offsets, int sourceIndex) {

        CoderResult cr = CoderResult.UNDERFLOW;

        /* write UChars (with offsets when requested) */
        while (length > 0 && target.hasRemaining()) {
            target.put(ucharsArray[ucharsBegin++]);
            if (offsets != null) {
                offsets.put(sourceIndex);
            }
            --length;
        }

        /* write overflow */
        if (length > 0) {
            cnv.charErrorBufferLength = 0;
            cr = CoderResult.OVERFLOW;
            do {
                cnv.charErrorBufferArray[cnv.charErrorBufferLength++] = ucharsArray[ucharsBegin++];
            } while (--length > 0);
        }
        return cr;
    }

    /*
     * This function will write out the Unicode substitution character to the
     * target character buffer.
     * Sub classes to override this method if required
     * @param decoder
     * @param source
     * @param target
     * @param offsets
     * @return A CoderResult object that contains the error result when an error occurs.
     */
    /* Note: Currently, this method is not being used because the callback method calls toUWriteUChars with
     * the substitution characters. Will leave in here for the time being. To be removed later. (4.0)
     */
    // CoderResult cbToUWriteSub(CharsetDecoderICU decoder,
    //         ByteBuffer source, CharBuffer target,
    //         IntBuffer offsets){
    //     String sub = decoder.replacement();
    //     CharsetICU cs = (CharsetICU) decoder.charset();
    //     if (decoder.invalidCharLength==1 && cs.subChar1 != 0x00) {
    //         char[] subArr = new char[] { 0x1a };
    //         return CharsetDecoderICU.toUWriteUChars(decoder, subArr, 0, sub
    //                 .length(), target, offsets, source.position());
    //     } else {
    //         return CharsetDecoderICU.toUWriteUChars(decoder, sub.toCharArray(),
    //                 0, sub.length(), target, offsets, source.position());
    //
    //     }
    // }

    /**
     * Returns the maxBytesPerChar value for the Charset that created this decoder.
     * @return maxBytesPerChar
     * @stable ICU 4.8
     */
    public final float maxBytesPerChar() {
        return ((CharsetICU) (this.charset())).maxBytesPerChar;
    }
}