/**
 *******************************************************************************
 * Copyright (C) 2006-2008, International Business Machines Corporation and    *
 * others. All Rights Reserved.                                                *
 *******************************************************************************
 */
package com.ibm.icu.charset;

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.IntBuffer;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;

import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;

/**
 * UTF-32 charset: every code point is stored in exactly four bytes.
 *
 * <p>The byte order is derived from the runtime class of the instance: a
 * {@code CharsetUTF32LE} is little-endian, anything else is big-endian. Only
 * {@code CharsetUTF32BE} and {@code CharsetUTF32LE} count as "endian specified";
 * for any other instance the encoder emits a byte order mark (BOM) and the
 * decoder uses a leading BOM to pick the byte order.
 *
 * @author Niti Hantaweepant
 */
class CharsetUTF32 extends CharsetICU {

    /** Number of bytes in a UTF-32 byte order mark. */
    private static final int SIGNATURE_LENGTH = 4;

    /** U+FFFD in big-endian byte order, used as the encoder substitution. */
    private static final byte[] fromUSubstitution_BE = { (byte) 0, (byte) 0, (byte) 0xff, (byte) 0xfd };
    /** U+FFFD in little-endian byte order, used as the encoder substitution. */
    private static final byte[] fromUSubstitution_LE = { (byte) 0xfd, (byte) 0xff, (byte) 0, (byte) 0 };

    /** U+FEFF in big-endian byte order. */
    private static final byte[] BOM_BE = { 0, 0, (byte) 0xfe, (byte) 0xff };
    /** U+FEFF in little-endian byte order. */
    private static final byte[] BOM_LE = { (byte) 0xff, (byte) 0xfe, 0, 0 };

    /*
     * A byte index i (0 = most significant byte of the code point) is mapped to
     * its position inside a four-byte unit with "i ^ xor": 0 keeps the order,
     * 3 reverses it.
     */
    private static final int ENDIAN_XOR_BE = 0;
    private static final int ENDIAN_XOR_LE = 3;

    /** Value of {@code fromUnicodeStatus} while the encoder still owes a BOM. */
    private static final int NEED_TO_WRITE_BOM = 1;

    /** True only for {@code CharsetUTF32BE} and {@code CharsetUTF32LE} instances. */
    private boolean isEndianSpecified;
    /** False only for {@code CharsetUTF32LE} instances. */
    private boolean isBigEndian;
    /** Index XOR matching {@link #isBigEndian}. */
    private int endianXOR;
    /** BOM matching {@link #isBigEndian}; written by the encoder when required. */
    private byte[] bom;
    /** Substitution bytes matching {@link #isBigEndian}. */
    private byte[] fromUSubstitution;

    /**
     * Creates the charset and selects the byte-order dependent tables.
     *
     * @param icuCanonicalName  passed through to the superclass constructor
     * @param javaCanonicalName passed through to the superclass constructor
     * @param aliases           passed through to the superclass constructor
     */
    public CharsetUTF32(String icuCanonicalName, String javaCanonicalName, String[] aliases) {
        super(icuCanonicalName, javaCanonicalName, aliases);

        this.isEndianSpecified = (this instanceof CharsetUTF32BE || this instanceof CharsetUTF32LE);
        this.isBigEndian = !(this instanceof CharsetUTF32LE);

        if (isBigEndian) {
            this.bom = BOM_BE;
            this.fromUSubstitution = fromUSubstitution_BE;
            this.endianXOR = ENDIAN_XOR_BE;
        } else {
            this.bom = BOM_LE;
            this.fromUSubstitution = fromUSubstitution_LE;
            this.endianXOR = ENDIAN_XOR_LE;
        }

        maxBytesPerChar = 4;
        minBytesPerChar = 4;
        maxCharsPerByte = 1;
    }

    /**
     * Decoder from UTF-32 bytes to UTF-16 chars.
     *
     * <p>Partially read units are kept in the inherited {@code toUBytesArray} /
     * {@code toULength} pair between calls, so a BOM or a code point may be split
     * across source buffers.
     */
    class CharsetDecoderUTF32 extends CharsetDecoderICU {

        /** True once the start of the input has been examined for a BOM. */
        private boolean isBOMReadYet;
        /** Index XOR actually used for decoding, fixed by the BOM check. */
        private int actualEndianXOR;
        /** BOM candidate still being matched, or null when no BOM is present. */
        private byte[] actualBOM;

        public CharsetDecoderUTF32(CharsetICU cs) {
            super(cs);
        }

        /** Forgets any BOM decision so the next input is examined again. */
        protected void implReset() {
            super.implReset();
            isBOMReadYet = false;
            actualBOM = null;
        }

        /**
         * Decodes four-byte units from {@code source} into {@code target}.
         *
         * <p>The first unit is compared byte by byte against the BOM(s) allowed
         * for this charset. When the byte order is not specified, a matching BOM
         * selects the byte order and is dropped. When the byte order is specified
         * (UTF-32BE / UTF-32LE), the matching bytes are not a signature and are
         * decoded as the character U+FEFF.
         *
         * @param source  bytes to decode
         * @param target  receives the decoded UTF-16 chars
         * @param offsets not used by this method
         * @param flush   not used by this method
         * @return {@code UNDERFLOW} when the source is exhausted, {@code OVERFLOW}
         *         when the target is full, or a malformed-input result of length
         *         four when a unit is negative, above
         *         {@code UConverterConstants.MAXIMUM_UTF}, or a surrogate
         */
        protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) {
            if (!isBOMReadYet) {
                while (true) {
                    if (!source.hasRemaining()) {
                        // The state collected so far survives until the next call.
                        return CoderResult.UNDERFLOW;
                    }

                    toUBytesArray[toULength++] = source.get();

                    if (toULength == 1) {
                        // On the first byte the candidate BOM (and byte order) is chosen.
                        if ((!isEndianSpecified || isBigEndian)
                                && toUBytesArray[toULength - 1] == BOM_BE[toULength - 1]) {
                            actualBOM = BOM_BE;
                            actualEndianXOR = ENDIAN_XOR_BE;
                        } else if ((!isEndianSpecified || !isBigEndian)
                                && toUBytesArray[toULength - 1] == BOM_LE[toULength - 1]) {
                            actualBOM = BOM_LE;
                            actualEndianXOR = ENDIAN_XOR_LE;
                        } else {
                            // No BOM; the single byte read stays buffered as data.
                            actualBOM = null;
                            actualEndianXOR = endianXOR;
                            break;
                        }
                    } else if (toUBytesArray[toULength - 1] != actualBOM[toULength - 1]) {
                        // No BOM after all; the toULength bytes read stay buffered as data.
                        actualBOM = null;
                        actualEndianXOR = endianXOR;
                        break;
                    } else if (toULength == SIGNATURE_LENGTH) {
                        // All four bytes matched the candidate BOM.
                        if (!isEndianSpecified) {
                            // Plain UTF-32: the BOM is only a signature, so discard it.
                            toULength = 0;
                        }
                        // UTF-32BE / UTF-32LE: keep the bytes buffered; the loop below
                        // decodes them as U+FEFF (actualEndianXOR equals endianXOR here).
                        break;
                    }
                }

                isBOMReadYet = true;
            }

            // BOM handling is finished; decode one four-byte unit per iteration.
            int char32;

            while (true) {
                while (toULength < 4) {
                    if (!source.hasRemaining()) {
                        return CoderResult.UNDERFLOW;
                    }
                    toUBytesArray[toULength++] = source.get();
                }

                if (!target.hasRemaining()) {
                    // The complete unit stays buffered and is decoded on the next call.
                    return CoderResult.OVERFLOW;
                }

                // Assemble the code point, most significant byte first.
                char32 = 0;
                for (int i = 0; i < 4; i++) {
                    char32 = (char32 << 8)
                            | (toUBytesArray[i ^ actualEndianXOR] & UConverterConstants.UNSIGNED_BYTE_MASK);
                }

                if (0 <= char32 && char32 <= UConverterConstants.MAXIMUM_UTF && !isSurrogate(char32)) {
                    toULength = 0;
                    if (char32 <= UConverterConstants.MAXIMUM_UCS2) {
                        /* fits in 16 bits */
                        target.put((char) char32);
                    } else {
                        /* write out the surrogates */
                        target.put(UTF16.getLeadSurrogate(char32));
                        char32 = UTF16.getTrailSurrogate(char32);
                        if (target.hasRemaining()) {
                            target.put((char) char32);
                        } else {
                            /* No room for the trail surrogate: park it in the error buffer. */
                            charErrorBufferArray[0] = (char) char32;
                            charErrorBufferLength = 1;
                            return CoderResult.OVERFLOW;
                        }
                    }
                } else {
                    // toULength is 4 here and is left untouched for the caller.
                    return CoderResult.malformedForLength(toULength);
                }
            }
        }
    }

    /**
     * Encoder from UTF-16 chars to UTF-32 bytes.
     *
     * <p>The inherited {@code fromUnicodeStatus} field records whether a BOM still
     * has to be written; the inherited {@code fromUChar32} field carries a pending
     * lead surrogate between calls.
     */
    class CharsetEncoderUTF32 extends CharsetEncoderICU {
        /**
         * Scratch space for one encoded code point. The slot for the most
         * significant byte is never written and therefore always stays zero.
         */
        private final byte[] temp = new byte[4];

        public CharsetEncoderUTF32(CharsetICU cs) {
            super(cs, fromUSubstitution);
            fromUnicodeStatus = isEndianSpecified ? 0 : NEED_TO_WRITE_BOM;
        }

        /** Re-arms BOM output for charsets without a specified byte order. */
        protected void implReset() {
            super.implReset();
            fromUnicodeStatus = isEndianSpecified ? 0 : NEED_TO_WRITE_BOM;
        }

        /**
         * Encodes chars from {@code source} into four-byte units in {@code target},
         * preceded by a BOM when one is still owed.
         *
         * @param source  chars to encode
         * @param target  receives the encoded bytes
         * @param offsets passed through to {@code fromUWriteBytes}
         * @param flush   not used by this method
         * @return {@code UNDERFLOW} when the source is exhausted, {@code OVERFLOW}
         *         when the target is full, or whatever non-underflow result the
         *         surrogate handling or byte writing produced
         */
        protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) {
            CoderResult cr;

            /* write the BOM if necessary */
            if (fromUnicodeStatus == NEED_TO_WRITE_BOM) {
                if (!target.hasRemaining()) {
                    return CoderResult.OVERFLOW;
                }

                // Cleared before writing, so the BOM is attempted only once.
                fromUnicodeStatus = 0;
                cr = fromUWriteBytes(this, bom, 0, bom.length, target, offsets, -1);
                if (cr.isOverflow()) {
                    return cr;
                }
            }

            if (fromUChar32 != 0) {
                if (!target.hasRemaining()) {
                    return CoderResult.OVERFLOW;
                }

                // a note: fromUChar32 will either be 0 or a lead surrogate
                cr = encodeChar(source, target, offsets, (char) fromUChar32);
                if (cr != null) {
                    return cr;
                }
            }

            while (true) {
                if (!source.hasRemaining()) {
                    return CoderResult.UNDERFLOW;
                }
                if (!target.hasRemaining()) {
                    return CoderResult.OVERFLOW;
                }

                cr = encodeChar(source, target, offsets, source.get());
                if (cr != null) {
                    return cr;
                }
            }
        }

        /**
         * Encodes a single char, combining it with its surrogate partner first
         * when it is a surrogate.
         *
         * @param source  remaining input, handed to {@code handleSurrogates}
         * @param target  receives the four encoded bytes
         * @param offsets passed through to {@code fromUWriteBytes}
         * @param ch      the char to encode
         * @return null when encoding may continue, otherwise the result to return
         *         from {@code encodeLoop}
         */
        private final CoderResult encodeChar(CharBuffer source, ByteBuffer target, IntBuffer offsets, char ch) {
            // Position of the char just consumed from source.
            int sourceIndex = source.position() - 1;
            CoderResult cr;
            int char32;

            if (UTF16.isSurrogate(ch)) {
                cr = handleSurrogates(source, ch);
                if (cr != null) {
                    return cr;
                }

                // The code point is picked up from fromUChar32 after surrogate handling.
                char32 = fromUChar32;
                fromUChar32 = 0;
            } else {
                char32 = ch;
            }

            /*
             * We cannot get any larger than 10FFFF because we are coming from UTF-16,
             * so the most significant byte (temp[0 ^ endianXOR]) is always 0 and is
             * simply left at its initial value.
             */
            temp[1 ^ endianXOR] = (byte) (char32 >>> 16); // same as (byte)((char32 >>> 16) & 0x1f)
            temp[2 ^ endianXOR] = (byte) (char32 >>> 8);
            temp[3 ^ endianXOR] = (byte) (char32);
            cr = fromUWriteBytes(this, temp, 0, 4, target, offsets, sourceIndex);
            return (cr.isUnderflow() ? null : cr);
        }
    }

    /** Returns a new decoder bound to this charset. */
    public CharsetDecoder newDecoder() {
        return new CharsetDecoderUTF32(this);
    }

    /** Returns a new encoder bound to this charset. */
    public CharsetEncoder newEncoder() {
        return new CharsetEncoderUTF32(this);
    }

    /**
     * Fills {@code setFillIn} by delegating to {@code getNonSurrogateUnicodeSet}.
     *
     * @param setFillIn the set to fill
     * @param which     ignored by this implementation
     */
    void getUnicodeSetImpl(UnicodeSet setFillIn, int which) {
        getNonSurrogateUnicodeSet(setFillIn);
    }
}