icu/charset/CharsetDecoderICU.java

//  2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/**
*******************************************************************************
* Copyright (C) 2006-2014, International Business Machines Corporation and    *
* others. All Rights Reserved.                                                *
*******************************************************************************
*
*******************************************************************************
*/

package com.ibm.icu.charset;

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.IntBuffer;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;

import com.ibm.icu.impl.Assert;

/**
 * An abstract class that provides framework methods of decoding operations for concrete
 * subclasses.
 * In the future this class will contain API that will implement converter sematics of ICU4C.
 * @stable ICU 3.6
 */
public abstract class CharsetDecoderICU extends CharsetDecoder{

    int    toUnicodeStatus;
    byte[] toUBytesArray = new byte[128];
    int    toUBytesBegin = 0;
    int    toULength;
    char[] charErrorBufferArray = new char[128];
    int    charErrorBufferLength;
    int    charErrorBufferBegin;
    char[] invalidCharBuffer = new char[128];
    int    invalidCharLength;

    /**
     * Maximum number of indexed bytes
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    protected static final int EXT_MAX_BYTES = 0x1f;

    /* store previous UChars/chars to continue partial matches */
    byte[] preToUArray = new byte[EXT_MAX_BYTES];
    int    preToUBegin;
    int    preToULength;       /* negative: replay */
    int    preToUFirstLength;  /* length of first character */
    int mode;

    Object toUContext = null;
    private CharsetCallback.Decoder onUnmappableCharacter = CharsetCallback.TO_U_CALLBACK_STOP;
    private CharsetCallback.Decoder onMalformedInput = CharsetCallback.TO_U_CALLBACK_STOP;
    CharsetCallback.Decoder toCharErrorBehaviour = new CharsetCallback.Decoder() {
        @Override
        public CoderResult call(CharsetDecoderICU decoder, Object context, ByteBuffer source,
                CharBuffer target, IntBuffer offsets, char[] buffer, int length, CoderResult cr) {
            if (cr.isUnmappable()) {
                return onUnmappableCharacter.call(decoder, context, source, target, offsets, buffer,
                        length, cr);
            } else /* if (cr.isMalformed()) */ {
                return onMalformedInput.call(decoder, context, source, target, offsets, buffer,
                        length, cr);
            }
            // return CharsetCallback.TO_U_CALLBACK_STOP.call(decoder, context, source, target, offsets, buffer, length, cr);
        }
    };

    // exist to keep implOnMalformedInput and implOnUnmappableInput from being too recursive
    private boolean malformedInputCalled = false;
    private boolean unmappableCharacterCalled = false;

    /*
     * Construct a CharsetDecorderICU based on the information provided from a CharsetICU object.
     *
     * @param cs The CharsetICU object containing information about how to charset to decode.
     */
    CharsetDecoderICU(CharsetICU cs) {
        super(cs, (1/cs.maxCharsPerByte), cs.maxCharsPerByte);
    }

    /*
     * Is this Decoder allowed to use fallbacks? A fallback mapping is a mapping
     * that will convert a byte sequence to a Unicode codepoint sequence, but
     * the encoded Unicode codepoint sequence will round trip convert to a different
     * byte sequence. In ICU, this is can be called a reverse fallback.
     * @return A boolean
     */
    final boolean isFallbackUsed() {
        return true;
    }

    /**
     * Fallback is currently always used by icu4j decoders.
     */
    static final boolean isToUUseFallback() {
        return isToUUseFallback(true);
    }

    /**
     * Fallback is currently always used by icu4j decoders.
     */
    static final boolean isToUUseFallback(boolean iUseFallback) {
        return true;
    }

    /**
     * Sets the action to be taken if an illegal sequence is encountered
     *
     * @param newAction action to be taken
     * @exception IllegalArgumentException
     * @stable ICU 3.6
     */
    @Override
    protected final void implOnMalformedInput(CodingErrorAction newAction) {
        // don't run infinitely
        if (malformedInputCalled)
            return;

        // if we get a replace, do not let the nio replace
        if (newAction == CodingErrorAction.REPLACE) {
            malformedInputCalled = true;
            super.onMalformedInput(CodingErrorAction.IGNORE);
            malformedInputCalled = false;
        }

        onMalformedInput = getCallback(newAction);
    }

    /**
     * Sets the action to be taken if an illegal sequence is encountered
     *
     * @param newAction action to be taken
     * @exception IllegalArgumentException
     * @stable ICU 3.6
     */
    @Override
    protected final void implOnUnmappableCharacter(CodingErrorAction newAction) {
        // dont run infinitely
        if (unmappableCharacterCalled)
            return;

        // if we get a replace, do not let the nio replace
        if (newAction == CodingErrorAction.REPLACE) {
            unmappableCharacterCalled = true;
            super.onUnmappableCharacter(CodingErrorAction.IGNORE);
            unmappableCharacterCalled = false;
        }

        onUnmappableCharacter = getCallback(newAction);
    }

    /**
     * Sets the callback encoder method and context to be used if an illegal sequence is encounterd.
     * You would normally call this twice to set both the malform and unmappable error. In this case,
     * newContext should remain the same since using a different newContext each time will negate the last
     * one used.
     * @param err CoderResult
     * @param newCallback CharsetCallback.Encoder
     * @param newContext Object
     * @stable ICU 4.0
     */
    public final void setToUCallback(CoderResult err, CharsetCallback.Decoder newCallback, Object newContext) {
        if (err.isMalformed()) {
            onMalformedInput = newCallback;
        } else if (err.isUnmappable()) {
            onUnmappableCharacter = newCallback;
        } else {
            /* Error: Only malformed and unmappable are handled. */
        }

        if (toUContext == null || !toUContext.equals(newContext)) {
            toUContext = newContext;
        }
    }

    private static CharsetCallback.Decoder getCallback(CodingErrorAction action){
        if(action==CodingErrorAction.REPLACE){
            return CharsetCallback.TO_U_CALLBACK_SUBSTITUTE;
        }else if(action==CodingErrorAction.IGNORE){
            return CharsetCallback.TO_U_CALLBACK_SKIP;
        }else /* if(action==CodingErrorAction.REPORT) */ {
            return CharsetCallback.TO_U_CALLBACK_STOP;
        }
    }
    private final ByteBuffer EMPTY = ByteBuffer.allocate(0);
    /**
     * Flushes any characters saved in the converter's internal buffer and
     * resets the converter.
     * @param out action to be taken
     * @return result of flushing action and completes the decoding all input.
     *         Returns CoderResult.UNDERFLOW if the action succeeds.
     * @stable ICU 3.6
     */
    @Override
    protected final CoderResult implFlush(CharBuffer out) {
        return decode(EMPTY, out, null, true);
    }

    /**
     * Resets the to Unicode mode of converter
     * @stable ICU 3.6
     */
    @Override
    protected void implReset() {
        toUnicodeStatus = 0 ;
        toULength = 0;
        charErrorBufferLength = 0;
        charErrorBufferBegin = 0;

        /* store previous UChars/chars to continue partial matches */
        preToUBegin = 0;
        preToULength = 0;       /* negative: replay */
        preToUFirstLength = 0;

        mode = 0;
    }

    /**
     * Decodes one or more bytes. The default behaviour of the converter
     * is stop and report if an error in input stream is encountered.
     * To set different behaviour use @see CharsetDecoder.onMalformedInput()
     * This  method allows a buffer by buffer conversion of a data stream.
     * The state of the conversion is saved between calls to convert.
     * Among other things, this means multibyte input sequences can be
     * split between calls. If a call to convert results in an Error, the
     * conversion may be continued by calling convert again with suitably
     * modified parameters.All conversions should be finished with a call to
     * the flush method.
     * @param in buffer to decode
     * @param out buffer to populate with decoded result
     * @return Result of decoding action. Returns CoderResult.UNDERFLOW if the decoding
     *         action succeeds or more input is needed for completing the decoding action.
     * @stable ICU 3.6
     */
    @Override
    protected CoderResult decodeLoop(ByteBuffer in,CharBuffer out){
        if(in.remaining() < toUCountPending()){
            return CoderResult.UNDERFLOW;
        }
//        if (!in.hasRemaining()) {
//            toULength = 0;
//            return CoderResult.UNDERFLOW;
//        }

        in.position(in.position() + toUCountPending());

        /* do the conversion */
        CoderResult ret = decode(in, out, null, false);

        // ok was there input held in the previous invocation of decodeLoop
        // that resulted in output in this invocation?
        in.position(in.position() - toUCountPending());

        return ret;
    }

    /*
     * Implements the ICU semantic for decode operation
     * @param in The input byte buffer
     * @param out The output character buffer
     * @return Result of decoding action. Returns CoderResult.UNDERFLOW if the decoding
     *         action succeeds or more input is needed for completing the decoding action.
     */
    abstract CoderResult decodeLoop(ByteBuffer in, CharBuffer out, IntBuffer offsets, boolean flush);

    /*
     * Implements the ICU semantic for decode operation
     * @param source The input byte buffer
     * @param target The output character buffer
     * @param offsets
     * @param flush true if, and only if, the invoker can provide no
     *  additional input bytes beyond those in the given buffer.
     * @return Result of decoding action. Returns CoderResult.UNDERFLOW if the decoding
     *         action succeeds or more input is needed for completing the decoding action.
     */
    final CoderResult decode(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) {

        /* check parameters */
        if (target == null || source == null) {
            throw new IllegalArgumentException();
        }

        /*
         * Make sure that the buffer sizes do not exceed the number range for
         * int32_t because some functions use the size (in units or bytes)
         * rather than comparing pointers, and because offsets are int32_t values.
         *
         * size_t is guaranteed to be unsigned and large enough for the job.
         *
         * Return with an error instead of adjusting the limits because we would
         * not be able to maintain the semantics that either the source must be
         * consumed or the target filled (unless an error occurs).
         * An adjustment would be sourceLimit=t+0x7fffffff; for example.
         */
            /*agljport:fix
        if(
            ((size_t)(sourceLimit-s)>(size_t)0x7fffffff && sourceLimit>s) ||
            ((size_t)(targetLimit-t)>(size_t)0x3fffffff && targetLimit>t)
        ) {
            *err=U_ILLEGAL_ARGUMENT_ERROR;
            return;
        }
            */

        /* flush the target overflow buffer */
        if (charErrorBufferLength > 0) {
            int i = 0;
            do {
                if (!target.hasRemaining()) {
                    /* the overflow buffer contains too much, keep the rest */
                    int j = 0;

                    do {
                        charErrorBufferArray[j++] = charErrorBufferArray[i++];
                    } while (i < charErrorBufferLength);

                    charErrorBufferLength = (byte) j;
                    return CoderResult.OVERFLOW;
                }

                /* copy the overflow contents to the target */
                target.put(charErrorBufferArray[i++]);
                if (offsets != null) {
                    offsets.put(-1); /* no source index available for old output */
                }
            } while (i < charErrorBufferLength);

            /* the overflow buffer is completely copied to the target */
            charErrorBufferLength = 0;
        }

        if (!flush && !source.hasRemaining() && toULength == 0 && preToULength >= 0) {
            /* the overflow buffer is emptied and there is no new input: we are done */
            return CoderResult.UNDERFLOW;
        }

        /*
         * Do not simply return with a buffer overflow error if
         * !flush && t==targetLimit
         * because it is possible that the source will not generate any output.
         * For example, the skip callback may be called;
         * it does not output anything.
         */

        return toUnicodeWithCallback(source, target, offsets, flush);
    }

    /* Currently, we are not using offsets in ICU4J. */
    /* private void updateOffsets(IntBuffer offsets,int length, int sourceIndex, int errorInputLength) {
        int limit;
        int delta, offset;

        if(sourceIndex>=0) {
            /*
             * adjust each offset by adding the previous sourceIndex
             * minus the length of the input sequence that caused an
             * error, if any
             */
       /*     delta=sourceIndex-errorInputLength;
        } else {
            /*
             * set each offset to -1 because this conversion function
             * does not handle offsets
             */
        /*    delta=-1;
        }
        limit=offsets.position()+length;
        if(delta==0) {
            /* most common case, nothing to do */
        /* } else if(delta>0) {
            /* add the delta to each offset (but not if the offset is <0) */
        /*    while(offsets.position()<limit) {
                offset=offsets.get(offsets.position());
                if(offset>=0) {
                    offsets.put(offset+delta);
                }
                //FIXME: ++offsets;
            }
        } else /* delta<0 */ /* {
            /*
             * set each offset to -1 because this conversion function
             * does not handle offsets
             * or the error input sequence started in a previous buffer
             */
        /*    while(offsets.position()<limit) {
                offsets.put(-1);
            }
        }
    } */
    final CoderResult toUnicodeWithCallback(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush){

        int sourceIndex;
        int errorInputLength;
        boolean converterSawEndOfInput, calledCallback;
        //int t=target.position();
        int s=source.position();
        /* variables for m:n conversion */
        ByteBuffer replayArray = ByteBuffer.allocate(EXT_MAX_BYTES);
        int replayArrayIndex = 0;

        ByteBuffer realSource=null;
        boolean realFlush=false;
        int realSourceIndex=0;


        CoderResult cr = CoderResult.UNDERFLOW;

        /* get the converter implementation function */
        sourceIndex=0;

        if(preToULength>=0) {
            /* normal mode */
        } else {
            /*
             * Previous m:n conversion stored source units from a partial match
             * and failed to consume all of them.
             * We need to "replay" them from a temporary buffer and convert them first.
             */
            realSource=source;
            realFlush=flush;
            realSourceIndex=sourceIndex;
            //UConverterUtility.uprv_memcpy(replayArray, replayBegin, preToUArray, preToUBegin, -preToULength);
            replayArray.put(preToUArray,0, -preToULength);
            source=replayArray;
            source.position(0);
            source.limit(replayArrayIndex-preToULength);
            flush=false;
            sourceIndex=-1;
            preToULength=0;
        }

        /*
         * loop for conversion and error handling
         *
         * loop {
         *   convert
         *   loop {
         *     update offsets
         *     handle end of input
         *     handle errors/call callback
         *   }
         * }
         */
        for(;;) {

            /* convert */
            cr = decodeLoop(source, target, offsets, flush);

            /*
             * set a flag for whether the converter
             * successfully processed the end of the input
             *
             * need not check cnv->preToULength==0 because a replay (<0) will cause
             * s<sourceLimit before converterSawEndOfInput is checked
             */
            converterSawEndOfInput= (cr.isUnderflow() && flush && source.remaining()==0 && toULength == 0);

            /* no callback called yet for this iteration */
            calledCallback=false;

            /* no sourceIndex adjustment for conversion, only for callback output */
            errorInputLength=0;

            /*
             * loop for offsets and error handling
             *
             * iterates at most 3 times:
             * 1. to clean up after the conversion function
             * 2. after the callback
             * 3. after the callback again if there was truncated input
             */
            for(;;) {
                /* update offsets if we write any */
                /* Currently offsets are not being used in ICU4J */
                /* if(offsets!=null) {

                    int length=(target.position()-t);
                    if(length>0) {
                        updateOffsets(offsets, length, sourceIndex, errorInputLength);


                        /*
                         * if a converter handles offsets and updates the offsets
                         * pointer at the end, then pArgs->offset should not change
                         * here;
                         * however, some converters do not handle offsets at all
                         * (sourceIndex<0) or may not update the offsets pointer
                         */
                        //TODO: pArgs->offsets=offsets+=length;
                  /*  }

                    if(sourceIndex>=0) {
                        sourceIndex+=(source.position()-s);
                    }

                } */

                if(preToULength<0) {
                    /*
                     * switch the source to new replay units (cannot occur while replaying)
                     * after offset handling and before end-of-input and callback handling
                     */
                    if(realSource==null)
                                    {
                        realSource=source;
                        realFlush=flush;
                        realSourceIndex=sourceIndex;

                        //UConverterUtility.uprv_memcpy(replayArray, replayBegin, preToUArray, preToUBegin, -preToULength);
                        replayArray.put(preToUArray,0, -preToULength);
                        // reset position
                        replayArray.position(0);

                        source=replayArray;
                        source.limit(replayArrayIndex-preToULength);
                        flush=false;
                        if((sourceIndex+=preToULength)<0) {
                            sourceIndex=-1;
                        }

                        preToULength=0;
                    } else {
                        /* see implementation note before _fromUnicodeWithCallback() */
                        //agljport:todo U_ASSERT(realSource==NULL);
                       Assert.assrt(realSource==null);
                    }
                }

                /* update pointers */
                s=source.position();
                //t=target.position();

                if(cr.isUnderflow()) {
                    if(s<source.limit())
                                    {
                        /*
                         * continue with the conversion loop while there is still input left
                         * (continue converting by breaking out of only the inner loop)
                         */
                        break;
                    } else if(realSource!=null) {
                        /* switch back from replaying to the real source and continue */
                        source = realSource;
                        flush=realFlush;
                        sourceIndex=realSourceIndex;
                        realSource=null;
                        break;
                    } else if(flush && toULength>0) {
                        /*
                         * the entire input stream is consumed
                         * and there is a partial, truncated input sequence left
                         */

                        /* inject an error and continue with callback handling */
                        cr = CoderResult.malformedForLength(toULength);
                        calledCallback=false; /* new error condition */
                    } else {
                        /* input consumed */
                        if(flush) {
                            /*
                             * return to the conversion loop once more if the flush
                             * flag is set and the conversion function has not
                             * successfully processed the end of the input yet
                             *
                             * (continue converting by breaking out of only the inner loop)
                             */
                            if(!converterSawEndOfInput) {
                                break;
                            }

                            /* reset the converter without calling the callback function */
                            implReset();
                        }

                        /* done successfully */
                        return cr;
                    }
                }

                /* U_FAILURE(*err) */
                {

                    if( calledCallback || cr.isOverflow() ||
                        (cr.isMalformed() && cr.isUnmappable())
                      ) {
                        /*
                         * the callback did not or cannot resolve the error:
                         * set output pointers and return
                         *
                         * the check for buffer overflow is redundant but it is
                         * a high-runner case and hopefully documents the intent
                         * well
                         *
                         * if we were replaying, then the replay buffer must be
                         * copied back into the UConverter
                         * and the real arguments must be restored
                         */
                        if(realSource!=null) {
                            int length;
                            Assert.assrt(preToULength==0);
                            length = source.limit() - source.position();
                            if(length>0) {
                                //UConverterUtility.uprv_memcpy(preToUArray, preToUBegin, pArgs.sourceArray, pArgs.sourceBegin, length);
                                source.get(preToUArray, preToUBegin, length);
                                preToULength=(byte)-length;
                            }
                        }
                        return cr;
                    }
                }

                /* copy toUBytes[] to invalidCharBuffer[] */
                errorInputLength=invalidCharLength=toULength;
                if(errorInputLength>0) {
                    copy(toUBytesArray, 0, invalidCharBuffer, 0, errorInputLength);
                }

                /* set the converter state to deal with the next character */
                toULength=0;

                /* call the callback function */
                cr = toCharErrorBehaviour.call(this, toUContext, source, target, offsets, invalidCharBuffer, errorInputLength, cr);
                /*
                 * loop back to the offset handling
                 *
                 * this flag will indicate after offset handling
                 * that a callback was called;
                 * if the callback did not resolve the error, then we return
                 */
                calledCallback=true;
            }
        }
    }

    /*
     * Returns the number of chars held in the converter's internal state
     * because more input is needed for completing the conversion. This function is
     * useful for mapping semantics of ICU's converter interface to those of iconv,
     * and this information is not needed for normal conversion.
     * @return The number of chars in the state. -1 if an error is encountered.
     */
    /*public*/ int toUCountPending()    {
        if(preToULength > 0){
            return preToULength ;
        } else if(preToULength < 0){
            return -preToULength;
        } else if(toULength > 0){
            return toULength;
        } else {
            return 0;
        }
    }


    private void copy(byte[] src, int srcOffset, char[] dst, int dstOffset, int length) {
        for(int i=srcOffset; i<length; i++){
            dst[dstOffset++]=(char)(src[srcOffset++] & UConverterConstants.UNSIGNED_BYTE_MASK);
        }
    }
    /*
     * ONLY used by ToU callback functions.
     * This function will write out the specified characters to the target
     * character buffer.
     * @return A CoderResult object that contains the error result when an error occurs.
     */
    static final CoderResult toUWriteUChars( CharsetDecoderICU cnv,
                                                char[] ucharsArray, int ucharsBegin, int length,
                                                CharBuffer target, IntBuffer offsets, int sourceIndex) {

        CoderResult cr = CoderResult.UNDERFLOW;

        /* write UChars */
        if(offsets==null) {
            while(length>0 && target.hasRemaining()) {
                target.put(ucharsArray[ucharsBegin++]);
                --length;
            }

        } else {
            /* output with offsets */
            while(length>0 && target.hasRemaining()) {
                target.put(ucharsArray[ucharsBegin++]);
                offsets.put(sourceIndex);
                --length;
            }
        }
        /* write overflow */
        if(length>0) {
            cnv.charErrorBufferLength= 0;
            cr = CoderResult.OVERFLOW;
            do {
                cnv.charErrorBufferArray[cnv.charErrorBufferLength++]=ucharsArray[ucharsBegin++];
            } while(--length>0);
        }
        return cr;
    }
    /*
     * This function will write out the Unicode substitution character to the
     * target character buffer.
     * Sub classes to override this method if required
     * @param decoder
     * @param source
     * @param target
     * @param offsets
     * @return A CoderResult object that contains the error result when an error occurs.
     */
    /* Note: Currently, this method is not being used because the callback method calls toUWriteUChars with
     * the substitution characters. Will leave in here for the time being. To be removed later. (4.0)
     */
     /*CoderResult cbToUWriteSub(CharsetDecoderICU decoder,
                                        ByteBuffer source, CharBuffer target,
                                        IntBuffer offsets){
        String sub = decoder.replacement();
        CharsetICU cs = (CharsetICU) decoder.charset();
        if (decoder.invalidCharLength==1 && cs.subChar1 != 0x00) {
            char[] subArr = new char[] { 0x1a };
            return CharsetDecoderICU.toUWriteUChars(decoder, subArr, 0, sub
                    .length(), target, offsets, source.position());
        } else {
            return CharsetDecoderICU.toUWriteUChars(decoder, sub.toCharArray(),
                    0, sub.length(), target, offsets, source.position());

        }
    }*/

    /**
     * Returns the maxBytesPerChar value for the Charset that created this decoder.
     * @return maxBytesPerChar
     * @stable ICU 4.8
     */
    public final float maxBytesPerChar() {
        return ((CharsetICU)(this.charset())).maxBytesPerChar;
    }
}