1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2008-2015, International Business Machines Corporation and 6 * others. All Rights Reserved. 7 ******************************************************************************* 8 */ 9 package com.ibm.icu.charset; 10 11 import java.nio.ByteBuffer; 12 import java.nio.CharBuffer; 13 import java.nio.IntBuffer; 14 import java.nio.charset.CharsetDecoder; 15 import java.nio.charset.CharsetEncoder; 16 import java.nio.charset.CoderResult; 17 import java.nio.charset.UnsupportedCharsetException; 18 19 import com.ibm.icu.text.UTF16; 20 import com.ibm.icu.text.UnicodeSet; 21 22 class CharsetHZ extends CharsetICU { 23 24 private static final int UCNV_TILDE = 0x7E; /* ~ */ 25 private static final int UCNV_OPEN_BRACE = 0x7B; /* { */ 26 private static final int UCNV_CLOSE_BRACE = 0x7D; /* } */ 27 private static final byte[] SB_ESCAPE = new byte[] { 0x7E, 0x7D }; 28 private static final byte[] DB_ESCAPE = new byte[] { 0x7E, 0x7B }; 29 private static final byte[] TILDE_ESCAPE = new byte[] { 0x7E, 0x7E }; 30 private static final byte[] fromUSubstitution = new byte[] { (byte) 0x1A }; 31 32 private CharsetMBCS gbCharset; 33 private boolean isEmptySegment; 34 35 public CharsetHZ(String icuCanonicalName, String canonicalName, String[] aliases) { 36 super(icuCanonicalName, canonicalName, aliases); 37 gbCharset = (CharsetMBCS) new CharsetProviderICU().charsetForName("GBK"); 38 if (gbCharset == null) { 39 throw new UnsupportedCharsetException("unable to open ICU GBK Charset, required for HZ"); 40 } 41 42 maxBytesPerChar = 4; 43 minBytesPerChar = 1; 44 maxCharsPerByte = 1; 45 46 isEmptySegment = false; 47 } 48 49 class CharsetDecoderHZ extends CharsetDecoderICU { 50 CharsetMBCS.CharsetDecoderMBCS gbDecoder; 51 boolean isStateDBCS = false; 52 53 public CharsetDecoderHZ(CharsetICU cs) { 54 super(cs); 55 gbDecoder = (CharsetMBCS.CharsetDecoderMBCS) gbCharset.newDecoder(); 56 } 57 58 @Override 59 protected void implReset() { 60 super.implReset(); 61 gbDecoder.implReset(); 62 63 isStateDBCS = false; 64 isEmptySegment = false; 65 } 66 67 @Override 68 protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) { 69 CoderResult err = CoderResult.UNDERFLOW; 70 byte[] tempBuf = new byte[2]; 71 int targetUniChar = 0; 72 int mySourceChar = 0; 73 74 if (!source.hasRemaining()) 75 return CoderResult.UNDERFLOW; 76 else if (!target.hasRemaining()) 77 return CoderResult.OVERFLOW; 78 79 while (source.hasRemaining()) { 80 81 if (target.hasRemaining()) { 82 83 // get the byte as unsigned 84 mySourceChar = source.get() & 0xff; 85 86 if (mode == UCNV_TILDE) { 87 /* second byte after ~ */ 88 mode = 0; 89 switch (mySourceChar) { 90 case 0x0A: 91 /* no output for ~\n (line-continuation marker) */ 92 continue; 93 case UCNV_TILDE: 94 if (offsets != null) { 95 offsets.put(source.position() - 2); 96 } 97 target.put((char) mySourceChar); 98 continue; 99 case UCNV_OPEN_BRACE: 100 case UCNV_CLOSE_BRACE: 101 isStateDBCS = (mySourceChar == UCNV_OPEN_BRACE); 102 if (isEmptySegment) { 103 isEmptySegment = false; /* we are handling it, reset to avoid future spurious errors */ 104 this.toUBytesArray[0] = UCNV_TILDE; 105 this.toUBytesArray[1] = (byte)mySourceChar; 106 this.toULength = 2; 107 return CoderResult.malformedForLength(1); 108 } 109 isEmptySegment = true; 110 continue; 111 default: 112 /* 113 * if the first byte is equal to TILDE and the trail byte is not a valid byte then it is an 114 * error condition 115 */ 116 /* 117 * Ticket 5691: consistent illegal sequences: 118 * - We include at least the first byte in the illegal sequence. 119 * - If any of the non-initial bytes could be the start of a character, 120 * we stop the illegal sequence before the first one of those. 121 */ 122 isEmptySegment = false; /* different error here, reset this to avoid spurious furture error */ 123 err = CoderResult.malformedForLength(1); 124 toUBytesArray[0] = UCNV_TILDE; 125 if (isStateDBCS ? (0x21 <= mySourceChar && mySourceChar <= 0x7e) : mySourceChar <= 0x7f) { 126 /* The current byte could be the start of a character: Back it out. */ 127 toULength = 1; 128 source.position(source.position() - 1); 129 } else { 130 /* Include the current byte in the illegal sequence. */ 131 toUBytesArray[1] = (byte)mySourceChar; 132 toULength = 2; 133 } 134 return err; 135 } 136 } else if (isStateDBCS) { 137 if (toUnicodeStatus == 0) { 138 /* lead byte */ 139 if (mySourceChar == UCNV_TILDE) { 140 mode = UCNV_TILDE; 141 } else { 142 /* 143 * add another bit to distinguish a 0 byte from not having seen a lead byte 144 */ 145 toUnicodeStatus = mySourceChar | 0x100; 146 isEmptySegment = false; /* the segment has something, either valid or will produce a different error, so reset this */ 147 } 148 continue; 149 } else { 150 /* trail byte */ 151 boolean leadIsOk, trailIsOk; 152 int leadByte = toUnicodeStatus & 0xff; 153 targetUniChar = 0xffff; 154 /* 155 * Ticket 5691: consistent illegal sequence 156 * - We include at least the first byte in the illegal sequence. 157 * - If any of the non-initial bytes could be the start of a character, 158 * we stop the illegal sequence before the first one of those 159 * 160 * In HZ DBCS, if the second byte is in the 21..7e range, 161 * we report ony the first byte as the illegal sequence. 162 * Otherwise we convert of report the pair of bytes. 163 */ 164 leadIsOk = (short)(UConverterConstants.UNSIGNED_BYTE_MASK & (leadByte - 0x21)) <= (0x7d - 0x21); 165 trailIsOk = (short)(UConverterConstants.UNSIGNED_BYTE_MASK & (mySourceChar - 0x21)) <= (0x7e - 0x21); 166 if (leadIsOk && trailIsOk) { 167 tempBuf[0] = (byte)(leadByte + 0x80); 168 tempBuf[1] = (byte)(mySourceChar + 0x80); 169 targetUniChar = gbDecoder.simpleGetNextUChar(ByteBuffer.wrap(tempBuf), super.isFallbackUsed()); 170 mySourceChar = (leadByte << 8) | mySourceChar; 171 } else if (trailIsOk) { 172 /* report a single illegal byte and continue with the following DBCS starter byte */ 173 source.position(source.position() - 1); 174 mySourceChar = leadByte; 175 } else { 176 /* report a pair of illegal bytes if the second byte is not a DBCS starter */ 177 /* add another bit so that the code below writes 2 bytes in case of error */ 178 mySourceChar = 0x10000 | (leadByte << 8) | mySourceChar; 179 } 180 toUnicodeStatus = 0x00; 181 } 182 } else { 183 if (mySourceChar == UCNV_TILDE) { 184 mode = UCNV_TILDE; 185 continue; 186 } else if (mySourceChar <= 0x7f) { 187 targetUniChar = mySourceChar; /* ASCII */ 188 isEmptySegment = false; /* the segment has something valid */ 189 } else { 190 targetUniChar = 0xffff; 191 isEmptySegment = false; /* different error here, reset this to avoid spurious future error */ 192 } 193 } 194 195 if (targetUniChar < 0xfffe) { 196 if (offsets != null) { 197 offsets.put(source.position() - 1 - (isStateDBCS ? 1 : 0)); 198 } 199 200 target.put((char) targetUniChar); 201 } else /* targetUniChar >= 0xfffe */{ 202 if (mySourceChar > 0xff) { 203 toUBytesArray[toUBytesBegin + 0] = (byte) (mySourceChar >> 8); 204 toUBytesArray[toUBytesBegin + 1] = (byte) mySourceChar; 205 toULength = 2; 206 } else { 207 toUBytesArray[toUBytesBegin + 0] = (byte) mySourceChar; 208 toULength = 1; 209 } 210 if (targetUniChar == 0xfffe) { 211 return CoderResult.unmappableForLength(toULength); 212 } else { 213 return CoderResult.malformedForLength(toULength); 214 } 215 } 216 } else { 217 return CoderResult.OVERFLOW; 218 } 219 } 220 221 return err; 222 } 223 } 224 225 class CharsetEncoderHZ extends CharsetEncoderICU { 226 CharsetMBCS.CharsetEncoderMBCS gbEncoder; 227 boolean isEscapeAppended = false; 228 boolean isTargetUCharDBCS = false; 229 230 public CharsetEncoderHZ(CharsetICU cs) { 231 super(cs, fromUSubstitution); 232 gbEncoder = (CharsetMBCS.CharsetEncoderMBCS) gbCharset.newEncoder(); 233 } 234 235 @Override 236 protected void implReset() { 237 super.implReset(); 238 gbEncoder.implReset(); 239 240 isEscapeAppended = false; 241 isTargetUCharDBCS = false; 242 } 243 244 @Override 245 protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) { 246 int length = 0; 247 int[] targetUniChar = new int[] { 0 }; 248 int mySourceChar = 0; 249 boolean oldIsTargetUCharDBCS = isTargetUCharDBCS; 250 251 if (!source.hasRemaining()) 252 return CoderResult.UNDERFLOW; 253 else if (!target.hasRemaining()) 254 return CoderResult.OVERFLOW; 255 256 if (fromUChar32 != 0 && target.hasRemaining()) { 257 CoderResult cr = handleSurrogates(source, (char) fromUChar32); 258 return (cr != null) ? cr : CoderResult.unmappableForLength(2); 259 } 260 /* writing the char to the output stream */ 261 while (source.hasRemaining()) { 262 targetUniChar[0] = MISSING_CHAR_MARKER; 263 if (target.hasRemaining()) { 264 265 mySourceChar = source.get(); 266 267 oldIsTargetUCharDBCS = isTargetUCharDBCS; 268 if (mySourceChar == UCNV_TILDE) { 269 /* 270 * concatEscape(args, &myTargetIndex, &targetLength,"\x7E\x7E",err,2,&mySourceIndex); 271 */ 272 concatEscape(source, target, offsets, TILDE_ESCAPE); 273 continue; 274 } else if (mySourceChar <= 0x7f) { 275 length = 1; 276 targetUniChar[0] = mySourceChar; 277 } else { 278 length = gbEncoder.fromUChar32(mySourceChar, targetUniChar, super.isFallbackUsed()); 279 280 /* 281 * we can only use lead bytes 21..7D and trail bytes 21..7E 282 */ 283 if (length == 2 && 0xa1a1 <= targetUniChar[0] && targetUniChar[0] <= 0xfdfe 284 && 0xa1 <= (targetUniChar[0] & 0xff) && (targetUniChar[0] & 0xff) <= 0xfe) { 285 targetUniChar[0] -= 0x8080; 286 } else { 287 targetUniChar[0] = MISSING_CHAR_MARKER; 288 } 289 } 290 if (targetUniChar[0] != MISSING_CHAR_MARKER) { 291 isTargetUCharDBCS = (targetUniChar[0] > 0x00FF); 292 if (oldIsTargetUCharDBCS != isTargetUCharDBCS || !isEscapeAppended) { 293 /* Shifting from a double byte to single byte mode */ 294 if (!isTargetUCharDBCS) { 295 concatEscape(source, target, offsets, SB_ESCAPE); 296 isEscapeAppended = true; 297 } else { /* 298 * Shifting from a single byte to double byte mode 299 */ 300 concatEscape(source, target, offsets, DB_ESCAPE); 301 isEscapeAppended = true; 302 303 } 304 } 305 306 if (isTargetUCharDBCS) { 307 if (target.hasRemaining()) { 308 target.put((byte) (targetUniChar[0] >> 8)); 309 if (offsets != null) { 310 offsets.put(source.position() - 1); 311 } 312 if (target.hasRemaining()) { 313 target.put((byte) targetUniChar[0]); 314 if (offsets != null) { 315 offsets.put(source.position() - 1); 316 } 317 } else { 318 errorBuffer[errorBufferLength++] = (byte) targetUniChar[0]; 319 // *err = U_BUFFER_OVERFLOW_ERROR; 320 } 321 } else { 322 errorBuffer[errorBufferLength++] = (byte) (targetUniChar[0] >> 8); 323 errorBuffer[errorBufferLength++] = (byte) targetUniChar[0]; 324 // *err = U_BUFFER_OVERFLOW_ERROR; 325 } 326 327 } else { 328 if (target.hasRemaining()) { 329 target.put((byte) targetUniChar[0]); 330 if (offsets != null) { 331 offsets.put(source.position() - 1); 332 } 333 334 } else { 335 errorBuffer[errorBufferLength++] = (byte) targetUniChar[0]; 336 // *err = U_BUFFER_OVERFLOW_ERROR; 337 } 338 } 339 340 } else { 341 /* oops.. the code point is unassigned */ 342 /* Handle surrogates */ 343 /* check if the char is a First surrogate */ 344 345 if (UTF16.isSurrogate((char) mySourceChar)) { 346 // use that handy handleSurrogates method everyone's been talking about! 347 CoderResult cr = handleSurrogates(source, (char) mySourceChar); 348 return (cr != null) ? cr : CoderResult.unmappableForLength(2); 349 } else { 350 /* callback(unassigned) for a BMP code point */ 351 // *err = U_INVALID_CHAR_FOUND; 352 fromUChar32 = mySourceChar; 353 return CoderResult.unmappableForLength(1); 354 } 355 } 356 } else { 357 // *err = U_BUFFER_OVERFLOW_ERROR; 358 return CoderResult.OVERFLOW; 359 } 360 } 361 362 return CoderResult.UNDERFLOW; 363 } 364 365 private CoderResult concatEscape(CharBuffer source, ByteBuffer target, IntBuffer offsets, byte[] strToAppend) { 366 CoderResult cr = null; 367 for (int i=0; i<strToAppend.length; i++) { 368 byte b = strToAppend[i]; 369 if (target.hasRemaining()) { 370 target.put(b); 371 if (offsets != null) 372 offsets.put(source.position() - 1); 373 } else { 374 errorBuffer[errorBufferLength++] = b; 375 cr = CoderResult.OVERFLOW; 376 } 377 } 378 return cr; 379 } 380 } 381 382 @Override 383 public CharsetDecoder newDecoder() { 384 return new CharsetDecoderHZ(this); 385 } 386 387 @Override 388 public CharsetEncoder newEncoder() { 389 return new CharsetEncoderHZ(this); 390 } 391 392 @Override 393 void getUnicodeSetImpl( UnicodeSet setFillIn, int which){ 394 setFillIn.add(0,0x7f); 395 // CharsetMBCS mbcshz = (CharsetMBCS)CharsetICU.forNameICU("icu-internal-25546"); 396 gbCharset.MBCSGetFilteredUnicodeSetForUnicode(gbCharset.sharedData, setFillIn, which, CharsetMBCS.UCNV_SET_FILTER_HZ); 397 } 398 } 399