Home | History | Annotate | Download | only in charset
      1 /**
      2  *******************************************************************************
      3  * Copyright (C) 2006-2008, International Business Machines Corporation and    *
      4  * others. All Rights Reserved.                                                *
      5  *******************************************************************************
      6  */
      7 package com.ibm.icu.charset;
      8 
      9 import java.nio.ByteBuffer;
     10 import java.nio.CharBuffer;
     11 import java.nio.IntBuffer;
     12 import java.nio.charset.CharsetDecoder;
     13 import java.nio.charset.CharsetEncoder;
     14 import java.nio.charset.CoderResult;
     15 
     16 import com.ibm.icu.text.UTF16;
     17 import com.ibm.icu.text.UnicodeSet;
     18 
     19 /**
     20  * @author Niti Hantaweepant
     21  */
     22 class CharsetUTF32 extends CharsetICU {
     23 
     24     private static final int SIGNATURE_LENGTH = 4;
     25     private static final byte[] fromUSubstitution_BE = { (byte) 0, (byte) 0, (byte) 0xff, (byte) 0xfd };
     26     private static final byte[] fromUSubstitution_LE = { (byte) 0xfd, (byte) 0xff, (byte) 0, (byte) 0 };
     27     private static final byte[] BOM_BE = { 0, 0, (byte) 0xfe, (byte) 0xff };
     28     private static final byte[] BOM_LE = { (byte) 0xff, (byte) 0xfe, 0, 0 };
     29     private static final int ENDIAN_XOR_BE = 0;
     30     private static final int ENDIAN_XOR_LE = 3;
     31     private static final int NEED_TO_WRITE_BOM = 1;
     32 
     33     private boolean isEndianSpecified;
     34     private boolean isBigEndian;
     35     private int endianXOR;
     36     private byte[] bom;
     37     private byte[] fromUSubstitution;
     38 
     39     public CharsetUTF32(String icuCanonicalName, String javaCanonicalName, String[] aliases) {
     40         super(icuCanonicalName, javaCanonicalName, aliases);
     41 
     42         this.isEndianSpecified = (this instanceof CharsetUTF32BE || this instanceof CharsetUTF32LE);
     43         this.isBigEndian = !(this instanceof CharsetUTF32LE);
     44 
     45         if (isBigEndian) {
     46             this.bom = BOM_BE;
     47             this.fromUSubstitution = fromUSubstitution_BE;
     48             this.endianXOR = ENDIAN_XOR_BE;
     49         } else {
     50             this.bom = BOM_LE;
     51             this.fromUSubstitution = fromUSubstitution_LE;
     52             this.endianXOR = ENDIAN_XOR_LE;
     53         }
     54 
     55         maxBytesPerChar = 4;
     56         minBytesPerChar = 4;
     57         maxCharsPerByte = 1;
     58     }
     59 
     60     class CharsetDecoderUTF32 extends CharsetDecoderICU {
     61 
     62         private boolean isBOMReadYet;
     63         private int actualEndianXOR;
     64         private byte[] actualBOM;
     65 
     66         public CharsetDecoderUTF32(CharsetICU cs) {
     67             super(cs);
     68         }
     69 
     70         protected void implReset() {
     71             super.implReset();
     72             isBOMReadYet = false;
     73             actualBOM = null;
     74         }
     75 
     76         protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) {
     77             /*
     78              * If we detect a BOM in this buffer, then we must add the BOM size to the offsets because the actual
     79              * converter function will not see and count the BOM. offsetDelta will have the number of the BOM bytes that
     80              * are in the current buffer.
     81              */
     82             if (!isBOMReadYet) {
     83                 while (true) {
     84                     if (!source.hasRemaining())
     85                         return CoderResult.UNDERFLOW;
     86 
     87                     toUBytesArray[toULength++] = source.get();
     88 
     89                     if (toULength == 1) {
     90                         // on the first byte, we haven't decided whether or not it's bigEndian yet
     91                         if ((!isEndianSpecified || isBigEndian)
     92                                 && toUBytesArray[toULength - 1] == BOM_BE[toULength - 1]) {
     93                             actualBOM = BOM_BE;
     94                             actualEndianXOR = ENDIAN_XOR_BE;
     95                         } else if ((!isEndianSpecified || !isBigEndian)
     96                                 && toUBytesArray[toULength - 1] == BOM_LE[toULength - 1]) {
     97                             actualBOM = BOM_LE;
     98                             actualEndianXOR = ENDIAN_XOR_LE;
     99                         } else {
    100                             // we do not have a BOM (and we have toULength==1 bytes)
    101                             actualBOM = null;
    102                             actualEndianXOR = endianXOR;
    103                             break;
    104                         }
    105                     } else if (toUBytesArray[toULength - 1] != actualBOM[toULength - 1]) {
    106                         // we do not have a BOM (and we have toULength bytes)
    107                         actualBOM = null;
    108                         actualEndianXOR = endianXOR;
    109                         break;
    110                     } else if (toULength == SIGNATURE_LENGTH) {
    111                         // we found a BOM! at last!
    112                         // too bad we have to get ignore it now (like it was unwanted or something)
    113                         toULength = 0;
    114                         break;
    115                     }
    116                 }
    117 
    118                 isBOMReadYet = true;
    119             }
    120 
    121             // now that we no longer need to look for a BOM, let's do some work
    122             int char32;
    123 
    124             while (true) {
    125                 while (toULength < 4) {
    126                     if (!source.hasRemaining())
    127                         return CoderResult.UNDERFLOW;
    128                     toUBytesArray[toULength++] = source.get();
    129                 }
    130 
    131                 if (!target.hasRemaining())
    132                     return CoderResult.OVERFLOW;
    133 
    134                 char32 = 0;
    135                 for (int i = 0; i < 4; i++)
    136                     char32 = (char32 << 8)
    137                             | (toUBytesArray[i ^ actualEndianXOR] & UConverterConstants.UNSIGNED_BYTE_MASK);
    138 
    139                 if (0 <= char32 && char32 <= UConverterConstants.MAXIMUM_UTF && !isSurrogate(char32)) {
    140                     toULength = 0;
    141                     if (char32 <= UConverterConstants.MAXIMUM_UCS2) {
    142                         /* fits in 16 bits */
    143                         target.put((char) char32);
    144                     } else {
    145                         /* write out the surrogates */
    146                         target.put(UTF16.getLeadSurrogate(char32));
    147                         char32 = UTF16.getTrailSurrogate(char32);
    148                         if (target.hasRemaining()) {
    149                             target.put((char) char32);
    150                         } else {
    151                             /* Put in overflow buffer (not handled here) */
    152                             charErrorBufferArray[0] = (char) char32;
    153                             charErrorBufferLength = 1;
    154                             return CoderResult.OVERFLOW;
    155                         }
    156                     }
    157                 } else {
    158                     return CoderResult.malformedForLength(toULength);
    159                 }
    160             }
    161         }
    162     }
    163 
    164     class CharsetEncoderUTF32 extends CharsetEncoderICU {
    165         private final byte[] temp = new byte[4];
    166 
    167         public CharsetEncoderUTF32(CharsetICU cs) {
    168             super(cs, fromUSubstitution);
    169             fromUnicodeStatus = isEndianSpecified ? 0 : NEED_TO_WRITE_BOM;
    170         }
    171 
    172         protected void implReset() {
    173             super.implReset();
    174             fromUnicodeStatus = isEndianSpecified ? 0 : NEED_TO_WRITE_BOM;
    175         }
    176 
    177         protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) {
    178             CoderResult cr;
    179 
    180             /* write the BOM if necessary */
    181             if (fromUnicodeStatus == NEED_TO_WRITE_BOM) {
    182                 if (!target.hasRemaining())
    183                     return CoderResult.OVERFLOW;
    184 
    185                 fromUnicodeStatus = 0;
    186                 cr = fromUWriteBytes(this, bom, 0, bom.length, target, offsets, -1);
    187                 if (cr.isOverflow())
    188                     return cr;
    189             }
    190 
    191             if (fromUChar32 != 0) {
    192                 if (!target.hasRemaining())
    193                     return CoderResult.OVERFLOW;
    194 
    195                 // a note: fromUChar32 will either be 0 or a lead surrogate
    196                 cr = encodeChar(source, target, offsets, (char) fromUChar32);
    197                 if (cr != null)
    198                     return cr;
    199             }
    200 
    201             while (true) {
    202                 if (!source.hasRemaining())
    203                     return CoderResult.UNDERFLOW;
    204                 if (!target.hasRemaining())
    205                     return CoderResult.OVERFLOW;
    206 
    207                 cr = encodeChar(source, target, offsets, source.get());
    208                 if (cr != null)
    209                     return cr;
    210             }
    211         }
    212 
    213         private final CoderResult encodeChar(CharBuffer source, ByteBuffer target, IntBuffer offsets, char ch) {
    214             int sourceIndex = source.position() - 1;
    215             CoderResult cr;
    216             int char32;
    217 
    218             if (UTF16.isSurrogate(ch)) {
    219                 cr = handleSurrogates(source, ch);
    220                 if (cr != null)
    221                     return cr;
    222 
    223                 char32 = fromUChar32;
    224                 fromUChar32 = 0;
    225             } else {
    226                 char32 = ch;
    227             }
    228 
    229             /* We cannot get any larger than 10FFFF because we are coming from UTF-16 */
    230             // temp[0 ^ endianXOR] = (byte) (char32 >>> 24); // (always 0)
    231             temp[1 ^ endianXOR] = (byte) (char32 >>> 16); // same as (byte)((char32 >>> 16) & 0x1f)
    232             temp[2 ^ endianXOR] = (byte) (char32 >>> 8);
    233             temp[3 ^ endianXOR] = (byte) (char32);
    234             cr = fromUWriteBytes(this, temp, 0, 4, target, offsets, sourceIndex);
    235             return (cr.isUnderflow() ? null : cr);
    236         }
    237     }
    238 
    239     public CharsetDecoder newDecoder() {
    240         return new CharsetDecoderUTF32(this);
    241     }
    242 
    243     public CharsetEncoder newEncoder() {
    244         return new CharsetEncoderUTF32(this);
    245     }
    246 
    247 
    248     void getUnicodeSetImpl( UnicodeSet setFillIn, int which){
    249         getNonSurrogateUnicodeSet(setFillIn);
    250     }
    251 }
    252