1 /* 2 ******************************************************************************* 3 * Copyright (C) 2006-2015, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ******************************************************************************* 6 * 7 ******************************************************************************* 8 */ 9 10 package com.ibm.icu.charset; 11 12 import java.io.IOException; 13 import java.io.InputStream; 14 import java.nio.Buffer; 15 import java.nio.BufferOverflowException; 16 import java.nio.ByteBuffer; 17 import java.nio.CharBuffer; 18 import java.nio.IntBuffer; 19 import java.nio.charset.CharsetDecoder; 20 import java.nio.charset.CharsetEncoder; 21 import java.nio.charset.CoderResult; 22 import java.util.Locale; 23 24 import com.ibm.icu.charset.UConverterSharedData.UConverterType; 25 import com.ibm.icu.impl.ICUBinary; 26 import com.ibm.icu.impl.ICUData; 27 import com.ibm.icu.impl.ICUResourceBundle; 28 import com.ibm.icu.impl.InvalidFormatException; 29 import com.ibm.icu.lang.UCharacter; 30 import com.ibm.icu.text.UTF16; 31 import com.ibm.icu.text.UnicodeSet; 32 33 class CharsetMBCS extends CharsetICU { 34 35 private byte[] fromUSubstitution = null; 36 UConverterSharedData sharedData = null; 37 private static final int MAX_VERSION_LENGTH = 4; 38 39 // these variables are used in getUnicodeSet() and may be changed in future 40 // typedef enum UConverterSetFilter { 41 static final int UCNV_SET_FILTER_NONE = 1; 42 static final int UCNV_SET_FILTER_DBCS_ONLY = 2; 43 static final int UCNV_SET_FILTER_2022_CN = 3; 44 static final int UCNV_SET_FILTER_SJIS= 4 ; 45 static final int UCNV_SET_FILTER_GR94DBCS = 5; 46 static final int UCNV_SET_FILTER_HZ = 6; 47 static final int UCNV_SET_FILTER_COUNT = 7; 48 // } UConverterSetFilter; 49 50 /** 51 * Fallbacks to Unicode are stored outside the normal state table and code point structures in a vector of items of 52 * this type. They are sorted by offset. 
53 */ 54 final static class MBCSToUFallback { 55 int offset; 56 int codePoint; 57 58 MBCSToUFallback(int off, int cp) { 59 offset = off; 60 codePoint = cp; 61 } 62 } 63 64 /** 65 * This is the MBCS part of the UConverterTable union (a runtime data structure). It keeps all the per-converter 66 * data and points into the loaded mapping tables. 67 */ 68 static final class UConverterMBCSTable { 69 /* toUnicode */ 70 short countStates; 71 byte dbcsOnlyState; 72 boolean stateTableOwned; 73 int countToUFallbacks; 74 75 int stateTable[/* countStates */][/* 256 */]; 76 int swapLFNLStateTable[/* countStates */][/* 256 */]; /* for swaplfnl */ 77 char unicodeCodeUnits[/* countUnicodeResults */]; 78 MBCSToUFallback toUFallbacks[/* countToUFallbacks */]; 79 80 /* fromUnicode */ 81 char fromUnicodeTable[]; // stage1, and for MBCS_OUTPUT_1 also contains stage2 82 int fromUnicodeTableInts[]; // stage1 and stage2 together as int[] 83 // Exactly one of the fromUnicode(Type) tables is not null, 84 // depending on the outputType. 85 byte fromUnicodeBytes[]; 86 char fromUnicodeChars[]; 87 int fromUnicodeInts[]; 88 char swapLFNLFromUnicodeChars[]; /* for swaplfnl */ 89 int fromUBytesLength; 90 short outputType, unicodeMask; 91 92 /* converter name for swaplfnl */ 93 String swapLFNLName; 94 95 /* extension data */ 96 UConverterSharedData baseSharedData; 97 // int extIndexes[]; 98 ByteBuffer extIndexes; // create int[] view etc. 
as needed 99 100 CharBuffer mbcsIndex; /* for fast conversion from most of BMP to MBCS (utf8Friendly data) */ 101 // char sbcsIndex[/* SBCS_FAST_LIMIT>>6 */]; /* for fast conversion from low BMP to SBCS (utf8Friendly data) */ 102 boolean utf8Friendly; /* for utf8Friendly data */ 103 char maxFastUChar; /* for utf8Friendly data */ 104 105 /* roundtrips */ 106 int asciiRoundtrips; 107 108 UConverterMBCSTable() { 109 utf8Friendly = false; 110 mbcsIndex = null; 111 } 112 113 boolean hasSupplementary() { 114 return (unicodeMask & UConverterConstants.HAS_SUPPLEMENTARY) != 0; 115 } 116 117 /* 118 * UConverterMBCSTable(UConverterMBCSTable t) { countStates = t.countStates; dbcsOnlyState = t.dbcsOnlyState; 119 * stateTableOwned = t.stateTableOwned; countToUFallbacks = t.countToUFallbacks; stateTable = t.stateTable; 120 * swapLFNLStateTable = t.swapLFNLStateTable; unicodeCodeUnits = t.unicodeCodeUnits; toUFallbacks = 121 * t.toUFallbacks; fromUnicodeTable = t.fromUnicodeTable; fromUnicodeBytes = t.fromUnicodeBytes; 122 * swapLFNLFromUnicodeChars = t.swapLFNLFromUnicodeChars; fromUBytesLength = t.fromUBytesLength; outputType = 123 * t.outputType; unicodeMask = t.unicodeMask; swapLFNLName = t.swapLFNLName; baseSharedData = t.baseSharedData; 124 * extIndexes = t.extIndexes; } 125 */ 126 } 127 128 /* Constants used in MBCS data header */ 129 // enum { 130 static final int MBCS_OPT_LENGTH_MASK=0x3f; 131 static final int MBCS_OPT_NO_FROM_U=0x40; 132 /* 133 * If any of the following options bits are set, 134 * then the file must be rejected. 135 */ 136 static final int MBCS_OPT_INCOMPATIBLE_MASK=0xffc0; 137 /* 138 * Remove bits from this mask as more options are recognized 139 * by all implementations that use this constant. 140 */ 141 static final int MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK=0xff80; 142 // }; 143 /* Constants for fast and UTF-8-friendly conversion. 
*/ 144 // enum { 145 static final int SBCS_FAST_MAX=0x0fff; /* maximum code point with UTF-8-friendly SBCS runtime code, see makeconv SBCS_UTF8_MAX */ 146 static final int SBCS_FAST_LIMIT=SBCS_FAST_MAX+1; /* =0x1000 */ 147 static final int MBCS_FAST_MAX=0xd7ff; /* maximum code point with UTF-8-friendly MBCS runtime code, see makeconv MBCS_UTF8_MAX */ 148 static final int MBCS_FAST_LIMIT=MBCS_FAST_MAX+1; /* =0xd800 */ 149 // }; 150 /** 151 * MBCS data header. See data format description above. 152 */ 153 final static class MBCSHeader { 154 byte version[/* U_MAX_VERSION_LENGTH */]; 155 int countStates, countToUFallbacks, offsetToUCodeUnits, offsetFromUTable, offsetFromUBytes; 156 int flags; 157 int fromUBytesLength; 158 159 /* new and required in version 5 */ 160 int options; 161 162 /* new and optional in version 5; used if options&MBCS_OPT_NO_FROM_U */ 163 int fullStage2Length; /* number of 32-bit units */ 164 165 MBCSHeader() { 166 version = new byte[MAX_VERSION_LENGTH]; 167 } 168 } 169 170 public CharsetMBCS(String icuCanonicalName, String javaCanonicalName, String[] aliases, String classPath, 171 ClassLoader loader) throws InvalidFormatException { 172 super(icuCanonicalName, javaCanonicalName, aliases); 173 174 /* See if the icuCanonicalName contains certain option information. 
*/ 175 if (icuCanonicalName.indexOf(UConverterConstants.OPTION_SWAP_LFNL_STRING) > -1) { 176 options = UConverterConstants.OPTION_SWAP_LFNL; 177 icuCanonicalName = icuCanonicalName.substring(0, icuCanonicalName.indexOf(UConverterConstants.OPTION_SWAP_LFNL_STRING)); 178 super.icuCanonicalName = icuCanonicalName; 179 } 180 181 // now try to load the data 182 sharedData = loadConverter(1, icuCanonicalName, classPath, loader); 183 184 maxBytesPerChar = sharedData.staticData.maxBytesPerChar; 185 minBytesPerChar = sharedData.staticData.minBytesPerChar; 186 maxCharsPerByte = 1; 187 fromUSubstitution = sharedData.staticData.subChar; 188 subChar = sharedData.staticData.subChar; 189 subCharLen = sharedData.staticData.subCharLen; 190 subChar1 = sharedData.staticData.subChar1; 191 fromUSubstitution = new byte[sharedData.staticData.subCharLen]; 192 System.arraycopy(sharedData.staticData.subChar, 0, fromUSubstitution, 0, sharedData.staticData.subCharLen); 193 194 initializeConverter(options); 195 } 196 197 public CharsetMBCS(String icuCanonicalName, String javaCanonicalName, String[] aliases) 198 throws InvalidFormatException { 199 this(icuCanonicalName, javaCanonicalName, aliases, ICUResourceBundle.ICU_BUNDLE, null); 200 } 201 202 private UConverterSharedData loadConverter(int nestedLoads, String myName, String classPath, ClassLoader loader) 203 throws InvalidFormatException { 204 boolean noFromU = false; 205 // Read converter data from file 206 UConverterStaticData staticData = new UConverterStaticData(); 207 UConverterDataReader reader = null; 208 try { 209 String itemName = myName + '.' + UConverterSharedData.DATA_TYPE; 210 String resourceName = classPath + '/' + itemName; 211 ByteBuffer b; 212 213 if (loader != null) { 214 @SuppressWarnings("resource") // Closed by getByteBufferFromInputStreamAndCloseStream(). 
215 InputStream i = ICUData.getRequiredStream(loader, resourceName); 216 b = ICUBinary.getByteBufferFromInputStreamAndCloseStream(i); 217 } else if (!classPath.equals(ICUData.ICU_BUNDLE)) { 218 @SuppressWarnings("resource") // Closed by getByteBufferFromInputStreamAndCloseStream(). 219 InputStream i = ICUData.getRequiredStream(resourceName); 220 b = ICUBinary.getByteBufferFromInputStreamAndCloseStream(i); 221 } else { 222 b = ICUBinary.getRequiredData(itemName); 223 } 224 reader = new UConverterDataReader(b); 225 reader.readStaticData(staticData); 226 } catch (IOException e) { 227 throw new InvalidFormatException(e); 228 } catch (Exception e) { 229 throw new InvalidFormatException(e); 230 } 231 232 UConverterSharedData data = null; 233 int type = staticData.conversionType; 234 235 if (type != UConverterSharedData.UConverterType.MBCS 236 || staticData.structSize != UConverterStaticData.SIZE_OF_UCONVERTER_STATIC_DATA) { 237 throw new InvalidFormatException(); 238 } 239 240 data = new UConverterSharedData(1, null, false, 0); 241 data.dataReader = reader; 242 data.staticData = staticData; 243 data.sharedDataCached = false; 244 245 // Load data 246 UConverterMBCSTable mbcsTable = data.mbcs; 247 MBCSHeader header = new MBCSHeader(); 248 try { 249 reader.readMBCSHeader(header); 250 } catch (IOException e) { 251 throw new InvalidFormatException(); 252 } 253 254 int offset; 255 // int[] extIndexesArray = null; 256 String baseNameString = null; 257 258 if (header.version[0] == 5 && header.version[1] >= 3 && (header.options & MBCS_OPT_UNKNOWN_INCOMPATIBLE_MASK) == 0) { 259 noFromU = ((header.options & MBCS_OPT_NO_FROM_U) != 0); 260 } else if (header.version[0] != 4) { 261 throw new InvalidFormatException(); 262 } 263 264 mbcsTable.outputType = (byte) header.flags; 265 266 /* extension data, header version 4.2 and higher */ 267 offset = header.flags >>> 8; 268 // if(offset!=0 && mbcsTable.outputType == MBCS_OUTPUT_EXT_ONLY) { 269 if (mbcsTable.outputType == 
MBCS_OUTPUT_EXT_ONLY) { 270 try { 271 baseNameString = reader.readBaseTableName(); 272 if (offset != 0) { 273 // agljport:commment subtract 32 for sizeof(_MBCSHeader) and length of baseNameString and 1 null 274 // terminator byte all already read; 275 mbcsTable.extIndexes = reader.readExtIndexes(offset - reader.bytesReadAfterStaticData()); 276 } 277 } catch (IOException e) { 278 throw new InvalidFormatException(); 279 } 280 } 281 282 // agljport:add this would be unnecessary if extIndexes were memory mapped 283 /* 284 * if(mbcsTable.extIndexes != null) { 285 * 286 * try { //int nbytes = mbcsTable.extIndexes[UConverterExt.UCNV_EXT_TO_U_LENGTH]*4 + 287 * mbcsTable.extIndexes[UConverterExt.UCNV_EXT_TO_U_UCHARS_LENGTH]*2 + 288 * mbcsTable.extIndexes[UConverterExt.UCNV_EXT_FROM_U_LENGTH]*6 + 289 * mbcsTable.extIndexes[UConverterExt.UCNV_EXT_FROM_U_BYTES_LENGTH] + 290 * mbcsTable.extIndexes[UConverterExt.UCNV_EXT_FROM_U_STAGE_12_LENGTH]*2 + 291 * mbcsTable.extIndexes[UConverterExt.UCNV_EXT_FROM_U_STAGE_3_LENGTH]*2 + 292 * mbcsTable.extIndexes[UConverterExt.UCNV_EXT_FROM_U_STAGE_3B_LENGTH]*4; //int nbytes = 293 * mbcsTable.extIndexes[UConverterExt.UCNV_EXT_SIZE] //byte[] extTables = dataReader.readExtTables(nbytes); 294 * //mbcsTable.extTables = ByteBuffer.wrap(extTables); } catch(IOException e) { System.err.println("Caught 295 * IOException: " + e.getMessage()); pErrorCode[0] = UErrorCode.U_INVALID_FORMAT_ERROR; return; } } 296 */ 297 if (mbcsTable.outputType == MBCS_OUTPUT_EXT_ONLY) { 298 UConverterSharedData baseSharedData = null; 299 ByteBuffer extIndexes; 300 String baseName; 301 302 /* extension-only file, load the base table and set values appropriately */ 303 extIndexes = mbcsTable.extIndexes; 304 if (extIndexes == null) { 305 /* extension-only file without extension */ 306 throw new InvalidFormatException(); 307 } 308 309 if (nestedLoads != 1) { 310 /* an extension table must not be loaded as a base table */ 311 throw new InvalidFormatException(); 312 } 313 314 
/* load the base table */ 315 baseName = baseNameString; 316 if (baseName.equals(staticData.name)) { 317 /* forbid loading this same extension-only file */ 318 throw new InvalidFormatException(); 319 } 320 321 // agljport:fix args.size=sizeof(UConverterLoadArgs); 322 baseSharedData = loadConverter(2, baseName, classPath, loader); 323 324 if (baseSharedData.staticData.conversionType != UConverterType.MBCS 325 || baseSharedData.mbcs.baseSharedData != null) { 326 // agljport:fix ucnv_unload(baseSharedData); 327 throw new InvalidFormatException(); 328 } 329 330 /* copy the base table data */ 331 // agljport:comment deep copy in C changes mbcs through local reference mbcsTable; in java we probably don't 332 // need the deep copy so can just make sure mbcs and its local reference both refer to the same new object 333 mbcsTable = data.mbcs = baseSharedData.mbcs; 334 335 /* overwrite values with relevant ones for the extension converter */ 336 mbcsTable.baseSharedData = baseSharedData; 337 mbcsTable.extIndexes = extIndexes; 338 339 /* 340 * It would be possible to share the swapLFNL data with a base converter, but the generated name would have 341 * to be different, and the memory would have to be free'd only once. It is easier to just create the data 342 * for the extension converter separately when it is requested. 343 */ 344 mbcsTable.swapLFNLStateTable = null; 345 mbcsTable.swapLFNLFromUnicodeChars = null; 346 mbcsTable.swapLFNLName = null; 347 348 /* 349 * Set a special, runtime-only outputType if the extension converter is a DBCS version of a base converter 350 * that also maps single bytes. 
351 */ 352 if (staticData.conversionType == UConverterType.DBCS 353 || (staticData.conversionType == UConverterType.MBCS && staticData.minBytesPerChar >= 2)) { 354 355 if (baseSharedData.mbcs.outputType == MBCS_OUTPUT_2_SISO) { 356 /* the base converter is SI/SO-stateful */ 357 int entry; 358 359 /* get the dbcs state from the state table entry for SO=0x0e */ 360 entry = mbcsTable.stateTable[0][0xe]; 361 if (MBCS_ENTRY_IS_FINAL(entry) && MBCS_ENTRY_FINAL_ACTION(entry) == MBCS_STATE_CHANGE_ONLY 362 && MBCS_ENTRY_FINAL_STATE(entry) != 0) { 363 mbcsTable.dbcsOnlyState = (byte) MBCS_ENTRY_FINAL_STATE(entry); 364 365 mbcsTable.outputType = MBCS_OUTPUT_DBCS_ONLY; 366 } 367 } else if (baseSharedData.staticData.conversionType == UConverterType.MBCS 368 && baseSharedData.staticData.minBytesPerChar == 1 369 && baseSharedData.staticData.maxBytesPerChar == 2 && mbcsTable.countStates <= 127) { 370 371 /* non-stateful base converter, need to modify the state table */ 372 int newStateTable[][/* 256 */]; 373 int state[]; // this works because java 2-D array is array of references and we can have state = 374 // newStateTable[i]; 375 int i, count; 376 377 /* allocate a new state table and copy the base state table contents */ 378 count = mbcsTable.countStates; 379 newStateTable = new int[(count + 1) * 1024][256]; 380 381 for (i = 0; i < mbcsTable.stateTable.length; ++i) 382 System.arraycopy(mbcsTable.stateTable[i], 0, newStateTable[i], 0, 383 mbcsTable.stateTable[i].length); 384 385 /* change all final single-byte entries to go to a new all-illegal state */ 386 state = newStateTable[0]; 387 for (i = 0; i < 256; ++i) { 388 if (MBCS_ENTRY_IS_FINAL(state[i])) { 389 state[i] = MBCS_ENTRY_TRANSITION(count, 0); 390 } 391 } 392 393 /* build the new all-illegal state */ 394 state = newStateTable[count]; 395 for (i = 0; i < 256; ++i) { 396 state[i] = MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0); 397 } 398 mbcsTable.stateTable = newStateTable; 399 mbcsTable.countStates = (byte) (count + 1); 400 
mbcsTable.stateTableOwned = true; 401 402 mbcsTable.outputType = MBCS_OUTPUT_DBCS_ONLY; 403 } 404 } 405 406 /* 407 * unlike below for files with base tables, do not get the unicodeMask from the sharedData; instead, use the 408 * base table's unicodeMask, which we copied in the memcpy above; this is necessary because the static data 409 * unicodeMask, especially the UCNV_HAS_SUPPLEMENTARY flag, is part of the base table data 410 */ 411 } else { 412 /* conversion file with a base table; an additional extension table is optional */ 413 /* make sure that the output type is known */ 414 switch (mbcsTable.outputType) { 415 case MBCS_OUTPUT_1: 416 case MBCS_OUTPUT_2: 417 case MBCS_OUTPUT_3: 418 case MBCS_OUTPUT_4: 419 case MBCS_OUTPUT_3_EUC: 420 case MBCS_OUTPUT_4_EUC: 421 case MBCS_OUTPUT_2_SISO: 422 /* OK */ 423 break; 424 default: 425 throw new InvalidFormatException(); 426 } 427 428 /* 429 * converter versions 6.1 and up contain a unicodeMask that is used here to select the most efficient 430 * function implementations 431 */ 432 // agljport:fix info.size=sizeof(UDataInfo); 433 // agljport:fix udata_getInfo((UDataMemory *)sharedData->dataMemory, &info); 434 if (reader.dataFormatHasUnicodeMask()) { 435 /* mask off possible future extensions to be safe */ 436 mbcsTable.unicodeMask = (short) (staticData.unicodeMask & 3); 437 } else { 438 /* for older versions, assume worst case: contains anything possible (prevent over-optimizations) */ 439 mbcsTable.unicodeMask = UConverterConstants.HAS_SUPPLEMENTARY | UConverterConstants.HAS_SURROGATES; 440 } 441 try { 442 reader.readMBCSTable(header, mbcsTable); 443 } catch (IOException e) { 444 throw new InvalidFormatException(); 445 } 446 447 if (offset != 0) { 448 try { 449 // agljport:commment subtract 32 for sizeof(_MBCSHeader) and length of baseNameString and 1 null 450 // terminator byte all already read; 451 // int namelen = baseNameString != null? 
baseNameString.length() + 1: 0; 452 mbcsTable.extIndexes = reader.readExtIndexes(offset - reader.bytesReadAfterStaticData()); 453 } catch (IOException e) { 454 throw new InvalidFormatException(); 455 } 456 } 457 458 if (header.version[1] >= 3 && (mbcsTable.unicodeMask & UConverterConstants.HAS_SURROGATES) == 0 && 459 (mbcsTable.countStates == 1 ? ((char)header.version[2] >= (SBCS_FAST_MAX>>8)) : ((char)header.version[2] >= (MBCS_FAST_MAX>>8)))) { 460 mbcsTable.utf8Friendly = true; 461 462 if (mbcsTable.countStates == 1) { 463 /* 464 * SBCS: Stage 3 is allocated in 64-entry blocks for U+0000..SBCS_FAST_MAX or higher. 465 * Build a table with indexes to each block, to be used instead of 466 * the regular stage 1/2 table. 467 */ 468 // sbcsIndex = new char[SBCS_FAST_LIMIT>>6]; 469 // for (int i = 0; i < (SBCS_FAST_LIMIT>>6); ++i) { 470 // mbcsTable.sbcsIndex[i] = mbcsTable.fromUnicodeTable[mbcsTable.fromUnicodeTable[i>>4]+((i<<2)&0x3c)]; 471 // } 472 /* set SBCS_FAST_MAX to reflect the reach of sbcsIndex[] even if header.version[2]>(SBCS_FAST_MAX>>8) */ 473 mbcsTable.maxFastUChar = SBCS_FAST_MAX; 474 } else { 475 /* 476 * MBCS: Stage 3 is allocated in 64-entry blocks for U+0000..MBCS_FAST_MAX or higher. 477 * The .cnv file is prebuilt with an additional stage table with indexes to each block. 478 */ 479 mbcsTable.maxFastUChar = (char)((header.version[2]<<8) | 0xff); 480 } 481 } 482 /* calculate a bit set of 4 ASCII characters per bit that round-trip to ASCII bytes */ 483 { 484 int asciiRoundtrips = 0xffffffff; 485 for (int i = 0; i < 0x80; ++i) { 486 if (mbcsTable.stateTable[0][i] != MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, i)) { 487 asciiRoundtrips &= ~(1 << (i >> 2)); 488 } 489 } 490 mbcsTable.asciiRoundtrips = asciiRoundtrips; 491 } 492 // TODO: Use asciiRoundtrips to speed up conversion, like in ICU4C. 493 494 if (noFromU) { 495 int stage1Length = (mbcsTable.unicodeMask&UConverterConstants.HAS_SUPPLEMENTARY) != 0 ? 
0x440 : 0x40; 496 int stage2Length = (header.offsetFromUBytes - header.offsetFromUTable)/4 - stage1Length/2; 497 reconstituteData(mbcsTable, stage1Length, stage2Length, header.fullStage2Length); 498 } 499 if (mbcsTable.outputType == MBCS_OUTPUT_DBCS_ONLY || mbcsTable.outputType == MBCS_OUTPUT_2_SISO) { 500 /* 501 * MBCS_OUTPUT_DBCS_ONLY: No SBCS mappings, therefore ASCII does not roundtrip. 502 * MBCS_OUTPUT_2_SISO: Bypass the ASCII fastpath to handle prevLength correctly. 503 */ 504 mbcsTable.asciiRoundtrips = 0; 505 } 506 } 507 // TODO: Use mbcsIndex to speed up UTF-16 conversion, like in ICU4C. 508 mbcsTable.mbcsIndex = null; 509 return data; 510 } 511 512 private static boolean writeStage3Roundtrip(UConverterMBCSTable mbcsTable, long value, int codePoints[]) { 513 char[] table; 514 byte[] bytes; 515 int stage2; 516 int p; 517 int c; 518 int i, st3; 519 long temp; 520 521 table = mbcsTable.fromUnicodeTable; 522 int[] tableInts = mbcsTable.fromUnicodeTableInts; 523 bytes = mbcsTable.fromUnicodeBytes; 524 char[] chars = mbcsTable.fromUnicodeChars; 525 int[] ints = mbcsTable.fromUnicodeInts; 526 527 /* for EUC outputTypes, modify the value like genmbcs.c's transformEUC() */ 528 switch(mbcsTable.outputType) { 529 case MBCS_OUTPUT_3_EUC: 530 if(value<=0xffff) { 531 /* short sequences are stored directly */ 532 /* code set 0 or 1 */ 533 } else if(value<=0x8effff) { 534 /* code set 2 */ 535 value&=0x7fff; 536 } else /* first byte is 0x8f */ { 537 /* code set 3 */ 538 value&=0xff7f; 539 } 540 break; 541 case MBCS_OUTPUT_4_EUC: 542 if(value<=0xffffff) { 543 /* short sequences are stored directly */ 544 /* code set 0 or 1 */ 545 } else if(value<=0x8effffff) { 546 /* code set 2 */ 547 value&=0x7fffff; 548 } else /* first byte is 0x8f */ { 549 /* code set 3 */ 550 value&=0xff7fff; 551 } 552 break; 553 default: 554 break; 555 } 556 557 for(i=0; i<=0x1f; ++value, ++i) { 558 c=codePoints[i]; 559 if(c<0) { 560 continue; 561 } 562 563 /* locate the stage 2 & 3 data */ 564 stage2 
= table[c>>10] + ((c>>4)&0x3f); 565 st3 = tableInts[stage2]; 566 st3 = (int)(char)(st3 * 16 + (c&0xf)); 567 568 /* write the codepage bytes into stage 3 */ 569 switch(mbcsTable.outputType) { 570 case MBCS_OUTPUT_3: 571 case MBCS_OUTPUT_4_EUC: 572 p = st3*3; 573 bytes[p] = (byte)(value>>16); 574 bytes[p+1] = (byte)(value>>8); 575 bytes[p+2] = (byte)value; 576 break; 577 case MBCS_OUTPUT_4: 578 ints[st3] = (int)value; 579 break; 580 default: 581 /* 2 bytes per character */ 582 chars[st3] = (char)value; 583 break; 584 } 585 586 /* set the roundtrip flag */ 587 temp = (1L<<(16+(c&0xf))); 588 tableInts[stage2] |= temp; 589 } 590 return true; 591 } 592 593 private static void reconstituteData(UConverterMBCSTable mbcsTable, 594 int stage1Length, int stage2Length, int fullStage2Length) { 595 char[] stage1 = mbcsTable.fromUnicodeTable; 596 597 // stage2 starts with unused stage1 space. 598 // Indexes into stage 2 count from the bottom of the fromUnicodeTable. 599 int numStage1Ints = stage1Length / 2; // 2 chars = 1 int 600 int[] stage2 = new int[numStage1Ints + fullStage2Length]; 601 System.arraycopy(mbcsTable.fromUnicodeTableInts, numStage1Ints, 602 stage2, (fullStage2Length - stage2Length) + numStage1Ints, 603 stage2Length); 604 mbcsTable.fromUnicodeTableInts = stage2; 605 606 /* reconstitute the initial part of stage 2 from the mbcsIndex */ 607 { 608 int stageUTF8Length=(mbcsTable.maxFastUChar+1)>>6; 609 int stageUTF8Index=0; 610 int st1, st2, st3, i; 611 612 for (st1 = 0; stageUTF8Index < stageUTF8Length; ++st1) { 613 st2 = stage1[st1]; 614 if (st2 != stage1Length/2) { 615 /* each stage 2 block has 64 entries corresponding to 16 entries in the mbcsIndex */ 616 for (i = 0; i < 16; ++i) { 617 st3 = mbcsTable.mbcsIndex.get(stageUTF8Index++); 618 if (st3 != 0) { 619 /* a stage 2 entry's index is per stage 3 16-block, not per stage 3 entry */ 620 st3>>=4; 621 /* 622 * 4 stage 2 entries point to 4 consecutive stage 3 16-blocks which are 623 * allocated together as a single 
64-block for access from the mbcsIndex 624 */ 625 stage2[st2++] = st3++; 626 stage2[st2++] = st3++; 627 stage2[st2++] = st3++; 628 stage2[st2++] = st3; 629 } else { 630 /* no stage 3 block, skip */ 631 st2+=4; 632 } 633 } 634 } else { 635 /* no stage 2 block, skip */ 636 stageUTF8Index+=16; 637 } 638 } 639 } 640 641 switch (mbcsTable.outputType) { 642 case CharsetMBCS.MBCS_OUTPUT_2: 643 case CharsetMBCS.MBCS_OUTPUT_2_SISO: 644 case CharsetMBCS.MBCS_OUTPUT_3_EUC: 645 mbcsTable.fromUnicodeChars = new char[mbcsTable.fromUBytesLength / 2]; 646 break; 647 case CharsetMBCS.MBCS_OUTPUT_3: 648 case CharsetMBCS.MBCS_OUTPUT_4_EUC: 649 mbcsTable.fromUnicodeBytes = new byte[mbcsTable.fromUBytesLength]; 650 break; 651 case CharsetMBCS.MBCS_OUTPUT_4: 652 mbcsTable.fromUnicodeInts = new int[mbcsTable.fromUBytesLength / 4]; 653 break; 654 default: 655 // Cannot occur, caller checked already. 656 assert false; 657 } 658 659 /* reconstitute fromUnicodeBytes with roundtrips from toUnicode data */ 660 MBCSEnumToUnicode(mbcsTable); 661 } 662 663 /* 664 * Internal function enumerating the toUnicode data of an MBCS converter. 665 * Currently only used for reconstituting data for a MBCS_OPT_NO_FROM_U 666 * table, but could also be used for a future getUnicodeSet() option 667 * that includes reverse fallbacks (after updating this function's implementation). 668 * Currently only handles roundtrip mappings. 669 * Does not currently handle extensions. 670 */ 671 private static void MBCSEnumToUnicode(UConverterMBCSTable mbcsTable) { 672 /* 673 * Properties for each state, to speed up the enumeration. 674 * Ignorable actions are unassigned/illegal/state-change-only: 675 * They do not lead to mappings. 
676 * 677 * Bits 7..6 678 * 1 direct/initial state (stateful converters have mulitple) 679 * 0 non-initial state with transitions or with nonignorable result actions 680 * -1 final state with only ignorable actions 681 * 682 * Bits 5..3 683 * The lowest byte value with non-ignorable actions is 684 * value<<5 (rounded down). 685 * 686 * Bits 2..0: 687 * The highest byte value with non-ignorable actions is 688 * (value<<5)&0x1f (rounded up). 689 */ 690 byte stateProps[] = new byte[MBCS_MAX_STATE_COUNT]; 691 int state; 692 693 /* recurse from state 0 and set all stateProps */ 694 getStateProp(mbcsTable.stateTable, stateProps, 0); 695 696 for (state = 0; state < mbcsTable.countStates; ++state) { 697 if (stateProps[state] >= 0x40) { 698 /* start from each direct state */ 699 enumToU(mbcsTable, stateProps, state, 0, 0); 700 } 701 } 702 703 704 } 705 706 private static boolean enumToU(UConverterMBCSTable mbcsTable, byte stateProps[], int state, int offset, int value) { 707 int[] codePoints = new int[32]; 708 int[] row; 709 char[] unicodeCodeUnits; 710 int anyCodePoints; 711 int b, limit; 712 713 row = mbcsTable.stateTable[state]; 714 unicodeCodeUnits = mbcsTable.unicodeCodeUnits; 715 716 value<<=8; 717 anyCodePoints = -1; /* becomes non-negative if there is a mapping */ 718 719 b = (stateProps[state]&0x38)<<2; 720 if (b == 0 && stateProps[state] >= 0x40) { 721 /* skip byte sequences with leading zeros because they are note stored in the fromUnicode table */ 722 codePoints[0] = UConverterConstants.U_SENTINEL; 723 b = 1; 724 } 725 limit = ((stateProps[state]&7)+1)<<5; 726 while (b < limit) { 727 int entry = row[b]; 728 if (MBCS_ENTRY_IS_TRANSITION(entry)) { 729 int nextState = MBCS_ENTRY_TRANSITION_STATE(entry); 730 if (stateProps[nextState] >= 0) { 731 /* recurse to a state with non-ignorable actions */ 732 if (!enumToU(mbcsTable, stateProps, nextState, offset+MBCS_ENTRY_TRANSITION_OFFSET(entry), value|b)) { 733 return false; 734 } 735 } 736 codePoints[b&0x1f] = 
UConverterConstants.U_SENTINEL; 737 } else { 738 int c; 739 int action; 740 741 /* 742 * An if-else-if chain provides more reliable performance for 743 * the most common cases compared to a switch. 744 */ 745 action = MBCS_ENTRY_FINAL_ACTION(entry); 746 if (action == MBCS_STATE_VALID_DIRECT_16) { 747 /* output BMP code point */ 748 c = MBCS_ENTRY_FINAL_VALUE_16(entry); 749 } else if (action == MBCS_STATE_VALID_16) { 750 int finalOffset = offset+MBCS_ENTRY_FINAL_VALUE_16(entry); 751 c = unicodeCodeUnits[finalOffset]; 752 if (c < 0xfffe) { 753 /* output BMP code point */ 754 } else { 755 c = UConverterConstants.U_SENTINEL; 756 } 757 } else if (action == MBCS_STATE_VALID_16_PAIR) { 758 int finalOffset = offset+MBCS_ENTRY_FINAL_VALUE_16(entry); 759 c = unicodeCodeUnits[finalOffset++]; 760 if (c < 0xd800) { 761 /* output BMP code point below 0xd800 */ 762 } else if (c <= 0xdbff) { 763 /* output roundtrip or fallback supplementary code point */ 764 c = ((c&0x3ff)<<10)+unicodeCodeUnits[finalOffset]+(0x10000-0xdc00); 765 } else if (c == 0xe000) { 766 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ 767 c = unicodeCodeUnits[finalOffset]; 768 } else { 769 c = UConverterConstants.U_SENTINEL; 770 } 771 } else if (action == MBCS_STATE_VALID_DIRECT_20) { 772 /* output supplementary code point */ 773 c = MBCS_ENTRY_FINAL_VALUE(entry)+0x10000; 774 } else { 775 c = UConverterConstants.U_SENTINEL; 776 } 777 778 codePoints[b&0x1f] = c; 779 anyCodePoints&=c; 780 } 781 if (((++b)&0x1f) == 0) { 782 if(anyCodePoints>=0) { 783 if(!writeStage3Roundtrip(mbcsTable, value|(b-0x20), codePoints)) { 784 return false; 785 } 786 anyCodePoints=-1; 787 } 788 } 789 } 790 791 return true; 792 } 793 794 /* 795 * Only called if stateProps[state]==-1. 796 * A recursive call may do stateProps[state]|=0x40 if this state is the target of an 797 * MBCS_STATE_CHANGE_ONLY. 
798 */ 799 private static byte getStateProp(int stateTable[][], byte stateProps[], int state) { 800 int[] row; 801 int min, max, entry, nextState; 802 803 row = stateTable[state]; 804 stateProps[state] = 0; 805 806 /* find first non-ignorable state */ 807 for (min = 0;;++min) { 808 entry = row[min]; 809 nextState = MBCS_ENTRY_STATE(entry); 810 if (stateProps[nextState] == -1) { 811 getStateProp(stateTable, stateProps, nextState); 812 } 813 if (MBCS_ENTRY_IS_TRANSITION(entry)) { 814 if (stateProps[nextState] >- 0) { 815 break; 816 } 817 } else if (MBCS_ENTRY_FINAL_ACTION(entry) < MBCS_STATE_UNASSIGNED) { 818 break; 819 } 820 if (min == 0xff) { 821 stateProps[state] = -0x40; /* (byte)0xc0 */ 822 return stateProps[state]; 823 } 824 } 825 stateProps[state]|=(byte)((min>>5)<<3); 826 827 /* find last non-ignorable state */ 828 for (max = 0xff; min < max; --max) { 829 entry = row[max]; 830 nextState = MBCS_ENTRY_STATE(entry); 831 if (stateProps[nextState] == -1) { 832 getStateProp(stateTable, stateProps, nextState); 833 } 834 if (MBCS_ENTRY_IS_TRANSITION(entry)) { 835 if (stateProps[nextState] >- 0) { 836 break; 837 } 838 } else if (MBCS_ENTRY_FINAL_ACTION(entry) < MBCS_STATE_UNASSIGNED) { 839 break; 840 } 841 } 842 stateProps[state]|=(byte)(max>>5); 843 844 /* recurse further and collect direct-state information */ 845 while (min <= max) { 846 entry = row[min]; 847 nextState = MBCS_ENTRY_STATE(entry); 848 if (stateProps[nextState] == -1) { 849 getStateProp(stateTable, stateProps, nextState); 850 } 851 if (MBCS_ENTRY_IS_TRANSITION(entry)) { 852 stateProps[nextState]|=0x40; 853 if (MBCS_ENTRY_FINAL_ACTION(entry) <= MBCS_STATE_FALLBACK_DIRECT_20) { 854 stateProps[state]|=0x40; 855 } 856 } 857 ++min; 858 } 859 return stateProps[state]; 860 } 861 862 protected void initializeConverter(int myOptions) { 863 UConverterMBCSTable mbcsTable; 864 ByteBuffer extIndexes; 865 short outputType; 866 byte maxBytesPerUChar; 867 868 mbcsTable = sharedData.mbcs; 869 outputType = 
mbcsTable.outputType; 870 871 if (outputType == MBCS_OUTPUT_DBCS_ONLY) { 872 /* the swaplfnl option does not apply, remove it */ 873 this.options = myOptions &= ~UConverterConstants.OPTION_SWAP_LFNL; 874 } 875 876 if ((myOptions & UConverterConstants.OPTION_SWAP_LFNL) != 0) { 877 /* do this because double-checked locking is broken */ 878 boolean isCached; 879 880 // agljport:todo umtx_lock(NULL); 881 isCached = mbcsTable.swapLFNLStateTable != null; 882 // agljport:todo umtx_unlock(NULL); 883 884 if (!isCached) { 885 try { 886 if (!EBCDICSwapLFNL()) { 887 /* this option does not apply, remove it */ 888 this.options = myOptions & ~UConverterConstants.OPTION_SWAP_LFNL; 889 } 890 } catch (Exception e) { 891 /* something went wrong. */ 892 return; 893 } 894 } 895 } 896 897 String lowerCaseName = icuCanonicalName.toLowerCase(Locale.ENGLISH); 898 if (lowerCaseName.indexOf("gb18030") >= 0) { 899 /* set a flag for GB 18030 mode, which changes the callback behavior */ 900 this.options |= MBCS_OPTION_GB18030; 901 } else if (lowerCaseName.indexOf("keis") >= 0) { 902 this.options |= MBCS_OPTION_KEIS; 903 } else if (lowerCaseName.indexOf("jef") >= 0) { 904 this.options |= MBCS_OPTION_JEF; 905 } else if (lowerCaseName.indexOf("jips") >= 0) { 906 this.options |= MBCS_OPTION_JIPS; 907 } 908 909 /* fix maxBytesPerUChar depending on outputType and options etc. */ 910 if (outputType == MBCS_OUTPUT_2_SISO) { 911 /* changed from 3 to 4 in ICU4J only. 
#9205 */ 912 maxBytesPerChar = 4; /* SO+DBCS+SI*/ 913 } 914 915 extIndexes = mbcsTable.extIndexes; 916 if (extIndexes != null) { 917 maxBytesPerUChar = (byte) GET_MAX_BYTES_PER_UCHAR(extIndexes); 918 if (outputType == MBCS_OUTPUT_2_SISO) { 919 ++maxBytesPerUChar; /* SO + multiple DBCS */ 920 } 921 922 if (maxBytesPerUChar > maxBytesPerChar) { 923 maxBytesPerChar = maxBytesPerUChar; 924 } 925 } 926 } 927 /* EBCDIC swap LF<->NL--------------------------------------------------------------------------------*/ 928 /* 929 * This code modifies a standard EBCDIC<->Unicode mappling table for 930 * OS/390 (z/OS) Unix System Services (Open Edition). 931 * The difference is in the mapping of Line Feed and New Line control codes: 932 * Standard EBDIC maps 933 * 934 * <U000A> \x25 |0 935 * <U0085> \x15 |0 936 * 937 * but OS/390 USS EBCDIC swaps the control codes for LF and NL, 938 * mapping 939 * 940 * <U000A> \x15 |0 941 * <U0085> \x25 |0 942 * 943 * This code modifies a loaded standard EBCDIC<->Unicode mapping table 944 * by copying it into allocated memory and swapping the LF and NL values. 945 * It allows to support the same EBCDIC charset in both version without 946 * duplicating the entire installed table. 
947 */ 948 /* standard EBCDIC codes */ 949 private static final short EBCDIC_LF = 0x0025; 950 private static final short EBCDIC_NL = 0x0015; 951 952 /* standard EBCDIC codes with roundtrip flag as stored in Unicode-to-single-byte tables */ 953 private static final short EBCDIC_RT_LF = 0x0f25; 954 private static final short EBCDIC_RT_NL = 0x0f15; 955 956 /* Unicode code points */ 957 private static final short U_LF = 0x000A; 958 private static final short U_NL = 0x0085; 959 960 private boolean EBCDICSwapLFNL() throws Exception { 961 UConverterMBCSTable mbcsTable; 962 963 char[] table; 964 965 int[][] newStateTable; 966 String newName; 967 968 int stage2Entry; 969 970 mbcsTable = sharedData.mbcs; 971 972 table = mbcsTable.fromUnicodeTable; 973 int[] tableInts = sharedData.mbcs.fromUnicodeTableInts; 974 char[] chars = mbcsTable.fromUnicodeChars; 975 char[] results = chars; 976 977 /* 978 * Check that this is an EBCDIC table with SBCS portion - 979 * SBCS or EBCDIC with standard EBCDIC LF and NL mappings. 980 * 981 * If not, ignore the option. Options are always ignored if they do not apply. 
982 */ 983 if (!((mbcsTable.outputType == MBCS_OUTPUT_1 || mbcsTable.outputType == MBCS_OUTPUT_2_SISO) && 984 mbcsTable.stateTable[0][EBCDIC_LF] == MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF) && 985 mbcsTable.stateTable[0][EBCDIC_NL] == MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL))) { 986 return false; 987 } 988 989 if (mbcsTable.outputType == MBCS_OUTPUT_1) { 990 if (!(EBCDIC_RT_LF == MBCS_SINGLE_RESULT_FROM_U(table, results, U_LF) && 991 EBCDIC_RT_NL == MBCS_SINGLE_RESULT_FROM_U(table, results, U_NL))) { 992 return false; 993 } 994 } else /* MBCS_OUTPUT_2_SISO */ { 995 stage2Entry = MBCS_STAGE_2_FROM_U(table, tableInts, U_LF); 996 if (!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_LF) && 997 EBCDIC_LF == MBCS_VALUE_2_FROM_STAGE_2(chars, stage2Entry, U_LF))) { 998 return false; 999 } 1000 1001 stage2Entry = MBCS_STAGE_2_FROM_U(table, tableInts, U_NL); 1002 if (!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, U_NL) && 1003 EBCDIC_NL == MBCS_VALUE_2_FROM_STAGE_2(chars, stage2Entry, U_NL))) { 1004 return false; 1005 } 1006 } 1007 1008 if (mbcsTable.fromUBytesLength > 0) { 1009 /* 1010 * We _know_ the number of bytes in the fromUnicodeBytes array 1011 * starting with header.version 4.1. 1012 */ 1013 // sizeofFromUBytes = mbcsTable.fromUBytesLength; 1014 } else { 1015 /* 1016 * Otherwise: 1017 * There used to be code to enumerate the fromUnicode 1018 * trie and find the highest entry, but it was removed in ICU 3.2 1019 * because it was not tested and caused a low code coverage number. 1020 */ 1021 throw new Exception("U_INVALID_FORMAT_ERROR"); 1022 } 1023 1024 /* 1025 * The table has an appropriate format. 
1026 * Allocate and build 1027 * - a modified to-Unicode state table 1028 * - a modified from-Unicode output array 1029 * - a converter name string with the swap option appended 1030 */ 1031 // size = mbcsTable.countStates * 1024 + sizeofFromUBytes + UConverterConstants.MAX_CONVERTER_NAME_LENGTH + 20; 1032 1033 /* copy and modify the to-Unicode state table */ 1034 newStateTable = new int[mbcsTable.stateTable.length][mbcsTable.stateTable[0].length]; 1035 for (int i = 0; i < newStateTable.length; i++) { 1036 System.arraycopy(mbcsTable.stateTable[i], 0, newStateTable[i], 0, newStateTable[i].length); 1037 } 1038 1039 newStateTable[0][EBCDIC_LF] = MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_NL); 1040 newStateTable[0][EBCDIC_NL] = MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, U_LF); 1041 1042 /* copy and modify the from-Unicode result table */ 1043 char[] newResults = new char[chars.length]; 1044 System.arraycopy(chars, 0, newResults, 0, chars.length); 1045 /* conveniently, the table access macros work on the left side of expressions */ 1046 if (mbcsTable.outputType == MBCS_OUTPUT_1) { 1047 MBCS_SINGLE_RESULT_FROM_U_SET(table, newResults, U_LF, EBCDIC_RT_NL); 1048 MBCS_SINGLE_RESULT_FROM_U_SET(table, newResults, U_NL, EBCDIC_RT_LF); 1049 } else /* MBCS_OUTPUT_2_SISO */ { 1050 stage2Entry = MBCS_STAGE_2_FROM_U(table, tableInts, U_LF); 1051 MBCS_VALUE_2_FROM_STAGE_2_SET(newResults, stage2Entry, U_LF, EBCDIC_NL); 1052 1053 stage2Entry = MBCS_STAGE_2_FROM_U(table, tableInts, U_NL); 1054 MBCS_VALUE_2_FROM_STAGE_2_SET(newResults, stage2Entry, U_NL, EBCDIC_LF); 1055 } 1056 1057 /* set the canonical converter name */ 1058 newName = icuCanonicalName.concat(UConverterConstants.OPTION_SWAP_LFNL_STRING); 1059 1060 if (mbcsTable.swapLFNLStateTable == null) { 1061 mbcsTable.swapLFNLStateTable = newStateTable; 1062 mbcsTable.swapLFNLFromUnicodeChars = newResults; 1063 mbcsTable.swapLFNLName = newName; 1064 } 1065 return true; 1066 } 1067 1068 /** 1069 * MBCS output types for 
conversions from Unicode. These per-converter types determine the storage method in stage 3 1070 * of the lookup table, mostly how many bytes are stored per entry. 1071 */ 1072 static final int MBCS_OUTPUT_1 = 0; /* 0 */ 1073 static final int MBCS_OUTPUT_2 = MBCS_OUTPUT_1 + 1; /* 1 */ 1074 static final int MBCS_OUTPUT_3 = MBCS_OUTPUT_2 + 1; /* 2 */ 1075 static final int MBCS_OUTPUT_4 = MBCS_OUTPUT_3 + 1; /* 3 */ 1076 static final int MBCS_OUTPUT_3_EUC = 8; /* 8 */ 1077 static final int MBCS_OUTPUT_4_EUC = MBCS_OUTPUT_3_EUC + 1; /* 9 */ 1078 static final int MBCS_OUTPUT_2_SISO = 12; /* c */ 1079 static final int MBCS_OUTPUT_2_HZ = MBCS_OUTPUT_2_SISO + 1; /* d */ 1080 static final int MBCS_OUTPUT_EXT_ONLY = MBCS_OUTPUT_2_HZ + 1; /* e */ 1081 // static final int MBCS_OUTPUT_COUNT = MBCS_OUTPUT_EXT_ONLY + 1; 1082 static final int MBCS_OUTPUT_DBCS_ONLY = 0xdb; /* runtime-only type for DBCS-only handling of SISO tables */ 1083 1084 /* GB 18030 data ------------------------------------------------------------ */ 1085 1086 /* helper macros for linear values for GB 18030 four-byte sequences */ 1087 private static int LINEAR_18030(int a, int b, int c, int d) { 1088 return ((((a & 0xff) * 10 + (b & 0xff)) * 126 + (c & 0xff)) * 10 + (d & 0xff)); 1089 } 1090 1091 private static int LINEAR_18030_BASE = LINEAR_18030(0x81, 0x30, 0x81, 0x30); 1092 1093 private static int LINEAR(int x) { 1094 return LINEAR_18030(x >>> 24, (x >>> 16) & 0xff, (x >>> 8) & 0xff, x & 0xff); 1095 } 1096 1097 /* 1098 * Some ranges of GB 18030 where both the Unicode code points and the GB four-byte sequences are contiguous and are 1099 * handled algorithmically by the special callback functions below. The values are start & end of Unicode & GB 1100 * codes. 1101 * 1102 * Note that single surrogates are not mapped by GB 18030 as of the re-released mapping tables from 2000-nov-30. 
1103 */ 1104 private static final int gb18030Ranges[][] = new int[/* 14 */][/* 4 */] { 1105 { 0x10000, 0x10FFFF, LINEAR(0x90308130), LINEAR(0xE3329A35) }, 1106 { 0x9FA6, 0xD7FF, LINEAR(0x82358F33), LINEAR(0x8336C738) }, 1107 { 0x0452, 0x1E3E, LINEAR(0x8130D330), LINEAR(0x8135F436) }, 1108 { 0x1E40, 0x200F, LINEAR(0x8135F438), LINEAR(0x8136A531) }, 1109 { 0xE865, 0xF92B, LINEAR(0x8336D030), LINEAR(0x84308534) }, 1110 { 0x2643, 0x2E80, LINEAR(0x8137A839), LINEAR(0x8138FD38) }, 1111 { 0xFA2A, 0xFE2F, LINEAR(0x84309C38), LINEAR(0x84318537) }, 1112 { 0x3CE1, 0x4055, LINEAR(0x8231D438), LINEAR(0x8232AF32) }, 1113 { 0x361B, 0x3917, LINEAR(0x8230A633), LINEAR(0x8230F237) }, 1114 { 0x49B8, 0x4C76, LINEAR(0x8234A131), LINEAR(0x8234E733) }, 1115 { 0x4160, 0x4336, LINEAR(0x8232C937), LINEAR(0x8232F837) }, 1116 { 0x478E, 0x4946, LINEAR(0x8233E838), LINEAR(0x82349638) }, 1117 { 0x44D7, 0x464B, LINEAR(0x8233A339), LINEAR(0x8233C931) }, 1118 { 0xFFE6, 0xFFFF, LINEAR(0x8431A234), LINEAR(0x8431A439) } }; 1119 1120 /* bit flag for UConverter.options indicating GB 18030 special handling */ 1121 private static final int MBCS_OPTION_GB18030 = 0x8000; 1122 1123 /* bit flag for UConverter.options indicating KEIS,JEF,JIF special handling */ 1124 private static final int MBCS_OPTION_KEIS = 0x01000; 1125 private static final int MBCS_OPTION_JEF = 0x02000; 1126 private static final int MBCS_OPTION_JIPS = 0x04000; 1127 1128 private static enum SISO_Option { 1129 SI, 1130 SO 1131 } 1132 1133 private static final byte[] KEIS_SO_CHAR = { 0x0A, 0x42 }; 1134 private static final byte[] KEIS_SI_CHAR = { 0x0A, 0x41 }; 1135 private static final byte JEF_SO_CHAR = 0x28; 1136 private static final byte JEF_SI_CHAR = 0x29; 1137 private static final byte[] JIPS_SO_CHAR = { 0x1A, 0x70 }; 1138 private static final byte[] JIPS_SI_CHAR = { 0x1A, 0x71 }; 1139 1140 private static int getSISOBytes(SISO_Option option, int cnvOption, byte[] value) { 1141 int SISOLength = 0; 1142 1143 switch (option) { 1144 case SI: 
1145 if ((cnvOption&MBCS_OPTION_KEIS)!=0) { 1146 value[0] = KEIS_SI_CHAR[0]; 1147 value[1] = KEIS_SI_CHAR[1]; 1148 SISOLength = 2; 1149 } else if ((cnvOption&MBCS_OPTION_JEF)!=0) { 1150 value[0] = JEF_SI_CHAR; 1151 SISOLength = 1; 1152 } else if ((cnvOption&MBCS_OPTION_JIPS)!=0) { 1153 value[0] = JIPS_SI_CHAR[0]; 1154 value[1] = JIPS_SI_CHAR[1]; 1155 SISOLength = 2; 1156 } else { 1157 value[0] = UConverterConstants.SI; 1158 SISOLength = 1; 1159 } 1160 break; 1161 case SO: 1162 if ((cnvOption&MBCS_OPTION_KEIS)!=0) { 1163 value[0] = KEIS_SO_CHAR[0]; 1164 value[1] = KEIS_SO_CHAR[1]; 1165 SISOLength = 2; 1166 } else if ((cnvOption&MBCS_OPTION_JEF)!=0) { 1167 value[0] = JEF_SO_CHAR; 1168 SISOLength = 1; 1169 } else if ((cnvOption&MBCS_OPTION_JIPS)!=0) { 1170 value[0] = JIPS_SO_CHAR[0]; 1171 value[1] = JIPS_SO_CHAR[1]; 1172 SISOLength = 2; 1173 } else { 1174 value[0] = UConverterConstants.SO; 1175 SISOLength = 1; 1176 } 1177 break; 1178 default: 1179 /* Should never happen. */ 1180 break; 1181 } 1182 1183 return SISOLength; 1184 } 1185 // enum { 1186 static final int MBCS_MAX_STATE_COUNT = 128; 1187 // }; 1188 /** 1189 * MBCS action codes for conversions to Unicode. These values are in bits 23..20 of the state table entries. 
1190 */ 1191 static final int MBCS_STATE_VALID_DIRECT_16 = 0; 1192 static final int MBCS_STATE_VALID_DIRECT_20 = MBCS_STATE_VALID_DIRECT_16 + 1; 1193 static final int MBCS_STATE_FALLBACK_DIRECT_16 = MBCS_STATE_VALID_DIRECT_20 + 1; 1194 static final int MBCS_STATE_FALLBACK_DIRECT_20 = MBCS_STATE_FALLBACK_DIRECT_16 + 1; 1195 static final int MBCS_STATE_VALID_16 = MBCS_STATE_FALLBACK_DIRECT_20 + 1; 1196 static final int MBCS_STATE_VALID_16_PAIR = MBCS_STATE_VALID_16 + 1; 1197 static final int MBCS_STATE_UNASSIGNED = MBCS_STATE_VALID_16_PAIR + 1; 1198 static final int MBCS_STATE_ILLEGAL = MBCS_STATE_UNASSIGNED + 1; 1199 static final int MBCS_STATE_CHANGE_ONLY = MBCS_STATE_ILLEGAL + 1; 1200 1201 static int MBCS_ENTRY_SET_STATE(int entry, int state) { 1202 return (entry&0x80ffffff)|(state<<24L); 1203 } 1204 1205 static int MBCS_ENTRY_STATE(int entry) { 1206 return (((entry)>>24)&0x7f); 1207 } 1208 1209 /* Methods for state table entries */ 1210 static int MBCS_ENTRY_TRANSITION(int state, int offset) { 1211 return (state << 24L) | offset; 1212 } 1213 1214 static int MBCS_ENTRY_FINAL(int state, int action, int value) { 1215 return 0x80000000 | (state << 24L) | (action << 20L) | value; 1216 } 1217 1218 static boolean MBCS_ENTRY_IS_TRANSITION(int entry) { 1219 return (entry) >= 0; 1220 } 1221 1222 static boolean MBCS_ENTRY_IS_FINAL(int entry) { 1223 return (entry) < 0; 1224 } 1225 1226 static int MBCS_ENTRY_TRANSITION_STATE(int entry) { 1227 return ((entry) >>> 24); 1228 } 1229 1230 static int MBCS_ENTRY_TRANSITION_OFFSET(int entry) { 1231 return ((entry) & 0xffffff); 1232 } 1233 1234 static int MBCS_ENTRY_FINAL_STATE(int entry) { 1235 return ((entry) >>> 24) & 0x7f; 1236 } 1237 1238 static boolean MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(int entry) { 1239 return ((entry) < 0x80100000); 1240 } 1241 1242 static int MBCS_ENTRY_FINAL_ACTION(int entry) { 1243 return ((entry) >>> 20) & 0xf; 1244 } 1245 1246 static int MBCS_ENTRY_FINAL_VALUE(int entry) { 1247 return ((entry) & 
0xfffff); 1248 } 1249 1250 static char MBCS_ENTRY_FINAL_VALUE_16(int entry) { 1251 return (char) (entry); 1252 } 1253 1254 static boolean MBCS_IS_ASCII_ROUNDTRIP(int b, long asciiRoundtrips) { 1255 return (((asciiRoundtrips) & (1<<((b)>>2)))!=0); 1256 } 1257 1258 /** 1259 * This macro version of _MBCSSingleSimpleGetNextUChar() gets a code point from a byte. It works for single-byte, 1260 * single-state codepages that only map to and from BMP code points, and it always returns fallback values. 1261 */ 1262 static char MBCS_SINGLE_SIMPLE_GET_NEXT_BMP(UConverterMBCSTable mbcs, final int b) { 1263 assert 0 <= b && b <= 0xff; 1264 return MBCS_ENTRY_FINAL_VALUE_16(mbcs.stateTable[0][b]); 1265 } 1266 1267 /* single-byte fromUnicode: get the 16-bit result word */ 1268 static char MBCS_SINGLE_RESULT_FROM_U(char[] table, char[] results, int c) { 1269 int i1 = table[c >>> 10] + ((c >>> 4) & 0x3f); 1270 int i = table[i1] + (c & 0xf); 1271 return results[i]; 1272 } 1273 1274 /* single-byte fromUnicode: set the 16-bit result word with newValue*/ 1275 static void MBCS_SINGLE_RESULT_FROM_U_SET(char[] table, char[] results, int c, int newValue) { 1276 int i1 = table[c >>> 10] + ((c >>> 4) & 0x3f); 1277 int i = table[i1] + (c & 0xf); 1278 results[i] = (char) newValue; 1279 } 1280 1281 /* multi-byte fromUnicode: get the 32-bit stage 2 entry */ 1282 static int MBCS_STAGE_2_FROM_U(char[] table, int[] tableInts, int c) { 1283 int i = table[(c) >>> 10] + ((c >>> 4) & 0x3f); 1284 return tableInts[i]; 1285 } 1286 1287 private static boolean MBCS_FROM_U_IS_ROUNDTRIP(int stage2Entry, int c) { 1288 return (((stage2Entry) & (1 << (16 + ((c) & 0xf)))) != 0); 1289 } 1290 1291 static char MBCS_VALUE_2_FROM_STAGE_2(char[] chars, int stage2Entry, int c) { 1292 int i = 16 * (stage2Entry & UConverterConstants.UNSIGNED_SHORT_MASK) + (c & 0xf); 1293 return chars[i]; 1294 } 1295 1296 static void MBCS_VALUE_2_FROM_STAGE_2_SET(char[] chars, int stage2Entry, int c, int newValue) { 1297 int i = 16 * 
(stage2Entry & UConverterConstants.UNSIGNED_SHORT_MASK) + (c & 0xf); 1298 chars[i] = (char) newValue; 1299 } 1300 1301 private static int MBCS_VALUE_4_FROM_STAGE_2(int[] ints, int stage2Entry, int c) { 1302 int i = 16 * (stage2Entry & UConverterConstants.UNSIGNED_SHORT_MASK) + (c & 0xf); 1303 return ints[i]; 1304 } 1305 1306 static int MBCS_POINTER_3_FROM_STAGE_2(byte[] bytes, int stage2Entry, int c) { 1307 return ((16 * (stage2Entry & UConverterConstants.UNSIGNED_SHORT_MASK) + ((c) & 0xf)) * 3); 1308 } 1309 1310 // ------------UConverterExt------------------------------------------------------- 1311 1312 static final int EXT_INDEXES_LENGTH = 0; /* 0 */ 1313 1314 static final int EXT_TO_U_INDEX = EXT_INDEXES_LENGTH + 1; /* 1 */ 1315 static final int EXT_TO_U_LENGTH = EXT_TO_U_INDEX + 1; 1316 static final int EXT_TO_U_UCHARS_INDEX = EXT_TO_U_LENGTH + 1; 1317 static final int EXT_TO_U_UCHARS_LENGTH = EXT_TO_U_UCHARS_INDEX + 1; 1318 1319 static final int EXT_FROM_U_UCHARS_INDEX = EXT_TO_U_UCHARS_LENGTH + 1; /* 5 */ 1320 static final int EXT_FROM_U_VALUES_INDEX = EXT_FROM_U_UCHARS_INDEX + 1; 1321 static final int EXT_FROM_U_LENGTH = EXT_FROM_U_VALUES_INDEX + 1; 1322 static final int EXT_FROM_U_BYTES_INDEX = EXT_FROM_U_LENGTH + 1; 1323 static final int EXT_FROM_U_BYTES_LENGTH = EXT_FROM_U_BYTES_INDEX + 1; 1324 1325 static final int EXT_FROM_U_STAGE_12_INDEX = EXT_FROM_U_BYTES_LENGTH + 1; /* 10 */ 1326 static final int EXT_FROM_U_STAGE_1_LENGTH = EXT_FROM_U_STAGE_12_INDEX + 1; 1327 static final int EXT_FROM_U_STAGE_12_LENGTH = EXT_FROM_U_STAGE_1_LENGTH + 1; 1328 static final int EXT_FROM_U_STAGE_3_INDEX = EXT_FROM_U_STAGE_12_LENGTH + 1; 1329 static final int EXT_FROM_U_STAGE_3_LENGTH = EXT_FROM_U_STAGE_3_INDEX + 1; 1330 static final int EXT_FROM_U_STAGE_3B_INDEX = EXT_FROM_U_STAGE_3_LENGTH + 1; 1331 static final int EXT_FROM_U_STAGE_3B_LENGTH = EXT_FROM_U_STAGE_3B_INDEX + 1; 1332 1333 private static final int EXT_COUNT_BYTES = EXT_FROM_U_STAGE_3B_LENGTH + 1; /* 17 */ 
1334 // private static final int EXT_COUNT_UCHARS = EXT_COUNT_BYTES + 1; 1335 // private static final int EXT_FLAGS = EXT_COUNT_UCHARS + 1; 1336 // 1337 // private static final int EXT_RESERVED_INDEX = EXT_FLAGS + 1; /* 20, moves with additional indexes */ 1338 // 1339 // private static final int EXT_SIZE=31; 1340 // private static final int EXT_INDEXES_MIN_LENGTH=32; 1341 1342 static final int EXT_FROM_U_MAX_DIRECT_LENGTH = 3; 1343 1344 /* toUnicode helpers -------------------------------------------------------- */ 1345 1346 private static final int TO_U_BYTE_SHIFT = 24; 1347 private static final int TO_U_VALUE_MASK = 0xffffff; 1348 private static final int TO_U_MIN_CODE_POINT = 0x1f0000; 1349 private static final int TO_U_MAX_CODE_POINT = 0x2fffff; 1350 private static final int TO_U_ROUNDTRIP_FLAG = (1 << 23); 1351 private static final int TO_U_INDEX_MASK = 0x3ffff; 1352 private static final int TO_U_LENGTH_SHIFT = 18; 1353 private static final int TO_U_LENGTH_OFFSET = 12; 1354 1355 /* maximum number of indexed UChars */ 1356 static final int MAX_UCHARS = 19; 1357 1358 static int TO_U_GET_BYTE(int word) { 1359 return word >>> TO_U_BYTE_SHIFT; 1360 } 1361 1362 static int TO_U_GET_VALUE(int word) { 1363 return word & TO_U_VALUE_MASK; 1364 } 1365 1366 static boolean TO_U_IS_ROUNDTRIP(int value) { 1367 return (value & TO_U_ROUNDTRIP_FLAG) != 0; 1368 } 1369 1370 static boolean TO_U_IS_PARTIAL(int value) { 1371 return 0 <= value && value < TO_U_MIN_CODE_POINT; 1372 } 1373 1374 static int TO_U_GET_PARTIAL_INDEX(int value) { 1375 return value; 1376 } 1377 1378 static int TO_U_MASK_ROUNDTRIP(int value) { 1379 return value & ~TO_U_ROUNDTRIP_FLAG; 1380 } 1381 1382 private static int TO_U_MAKE_WORD(byte b, int value) { 1383 // TO_U_BYTE_SHIFT == 24: safe to just shift the signed byte-as-int. 
1384 return (b << TO_U_BYTE_SHIFT) | value; 1385 } 1386 1387 /* use after masking off the roundtrip flag */ 1388 static boolean TO_U_IS_CODE_POINT(int value) { 1389 assert value >= 0; 1390 return value <= TO_U_MAX_CODE_POINT; 1391 } 1392 1393 static int TO_U_GET_CODE_POINT(int value) { 1394 assert value >= 0; 1395 return value - TO_U_MIN_CODE_POINT; 1396 } 1397 1398 private static int TO_U_GET_INDEX(int value) { 1399 return value & TO_U_INDEX_MASK; 1400 } 1401 1402 private static int TO_U_GET_LENGTH(int value) { 1403 return (value >>> TO_U_LENGTH_SHIFT) - TO_U_LENGTH_OFFSET; 1404 } 1405 1406 /* fromUnicode helpers ------------------------------------------------------ */ 1407 1408 /* most trie constants are shared with ucnvmbcs.h */ 1409 private static final int STAGE_2_LEFT_SHIFT = 2; 1410 1411 // private static final int STAGE_3_GRANULARITY = 4; 1412 1413 /* trie access, returns the stage 3 value=index to stage 3b; s1Index=c>>10 */ 1414 static int FROM_U(CharBuffer stage12, CharBuffer stage3, int s1Index, int c) { 1415 return stage3.get(((int) stage12.get((stage12.get(s1Index) + ((c >>> 4) & 0x3f))) << STAGE_2_LEFT_SHIFT) 1416 + (c & 0xf)); 1417 } 1418 1419 private static final int FROM_U_LENGTH_SHIFT = 24; 1420 private static final int FROM_U_ROUNDTRIP_FLAG = 1 << 31; 1421 static final int FROM_U_RESERVED_MASK = 0x60000000; 1422 private static final int FROM_U_DATA_MASK = 0xffffff; 1423 1424 /* special value for "no mapping" to <subchar1> (impossible roundtrip to 0 bytes, value 01) */ 1425 static final int FROM_U_SUBCHAR1 = 0x80000001; 1426 1427 /* at most 3 bytes in the lower part of the value */ 1428 private static final int FROM_U_MAX_DIRECT_LENGTH = 3; 1429 1430 /* maximum number of indexed bytes */ 1431 static final int MAX_BYTES = 0x1f; 1432 1433 static boolean FROM_U_IS_PARTIAL(int value) { 1434 return (value >>> FROM_U_LENGTH_SHIFT) == 0; 1435 } 1436 1437 static int FROM_U_GET_PARTIAL_INDEX(int value) { 1438 return value; 1439 } 1440 1441 static boolean 
FROM_U_IS_ROUNDTRIP(int value) { 1442 return (value & FROM_U_ROUNDTRIP_FLAG) != 0; 1443 } 1444 1445 private static int FROM_U_MASK_ROUNDTRIP(int value) { 1446 return value & ~FROM_U_ROUNDTRIP_FLAG; 1447 } 1448 1449 /* use after masking off the roundtrip flag */ 1450 static int FROM_U_GET_LENGTH(int value) { 1451 return (value >>> FROM_U_LENGTH_SHIFT) & MAX_BYTES; 1452 } 1453 1454 /* get bytes or bytes index */ 1455 static int FROM_U_GET_DATA(int value) { 1456 return value & FROM_U_DATA_MASK; 1457 } 1458 1459 /* get the pointer to an extension array from indexes[index] */ 1460 static Buffer ARRAY(ByteBuffer indexes, int index, Class<?> itemType) { 1461 int oldpos = indexes.position(); 1462 Buffer b; 1463 1464 // TODO: It is very inefficient to create Buffer objects for each array access. 1465 // We should create an inner class Extensions (or sibling class CharsetMBCSExtensions) 1466 // which has buffers for the arrays, together with the code that works with them. 1467 indexes.position(indexes.getInt(index << 2)); 1468 if (itemType == int.class) 1469 b = indexes.asIntBuffer(); 1470 else if (itemType == char.class) 1471 b = indexes.asCharBuffer(); 1472 else if (itemType == short.class) 1473 b = indexes.asShortBuffer(); 1474 else 1475 // default or (itemType == byte.class) 1476 b = indexes.slice(); 1477 indexes.position(oldpos); 1478 return b; 1479 } 1480 1481 private static int GET_MAX_BYTES_PER_UCHAR(ByteBuffer indexes) { 1482 indexes.position(0); 1483 return indexes.getInt(EXT_COUNT_BYTES) & 0xff; 1484 } 1485 1486 /* 1487 * @return index of the UChar, if found; else <0 1488 */ 1489 static int findFromU(CharBuffer fromUSection, int length, char u) { 1490 int i, start, limit; 1491 1492 /* binary search */ 1493 start = 0; 1494 limit = length; 1495 for (;;) { 1496 i = limit - start; 1497 if (i <= 1) { 1498 break; /* done */ 1499 } 1500 /* start<limit-1 */ 1501 1502 if (i <= 4) { 1503 /* linear search for the last part */ 1504 if (u <= 
fromUSection.get(fromUSection.position() + start)) { 1505 break; 1506 } 1507 if (++start < limit && u <= fromUSection.get(fromUSection.position() + start)) { 1508 break; 1509 } 1510 if (++start < limit && u <= fromUSection.get(fromUSection.position() + start)) { 1511 break; 1512 } 1513 /* always break at start==limit-1 */ 1514 ++start; 1515 break; 1516 } 1517 1518 i = (start + limit) / 2; 1519 if (u < fromUSection.get(fromUSection.position() + i)) { 1520 limit = i; 1521 } else { 1522 start = i; 1523 } 1524 } 1525 1526 /* did we really find it? */ 1527 if (start < limit && u == fromUSection.get(fromUSection.position() + start)) { 1528 return start; 1529 } else { 1530 return -1; /* not found */ 1531 } 1532 } 1533 1534 /* 1535 * @return lookup value for the byte, if found; else 0 1536 */ 1537 static int findToU(IntBuffer toUSection, int length, short byt) { 1538 long word0, word; 1539 int i, start, limit; 1540 1541 /* check the input byte against the lowest and highest section bytes */ 1542 // agljport:comment instead of receiving a start position parameter for toUSection we'll rely on its position 1543 // property 1544 start = TO_U_GET_BYTE(toUSection.get(toUSection.position())); 1545 limit = TO_U_GET_BYTE(toUSection.get(toUSection.position() + length - 1)); 1546 if (byt < start || limit < byt) { 1547 return 0; /* the byte is out of range */ 1548 } 1549 1550 if (length == ((limit - start) + 1)) { 1551 /* direct access on a linear array */ 1552 return TO_U_GET_VALUE(toUSection.get(toUSection.position() + byt - start)); /* could be 0 */ 1553 } 1554 1555 /* word0 is suitable for <=toUSection[] comparison, word for <toUSection[] */ 1556 word0 = TO_U_MAKE_WORD((byte) byt, 0) & UConverterConstants.UNSIGNED_INT_MASK; 1557 1558 /* 1559 * Shift byte once instead of each section word and add 0xffffff. We will compare the shifted/added byte 1560 * (bbffffff) against section words which have byte values in the same bit position. 
If and only if byte bb < 1561 * section byte ss then bbffffff<ssvvvvvv for all v=0..f so we need not mask off the lower 24 bits of each 1562 * section word. 1563 */ 1564 word = word0 | TO_U_VALUE_MASK; 1565 1566 /* binary search */ 1567 start = 0; 1568 limit = length; 1569 for (;;) { 1570 i = limit - start; 1571 if (i <= 1) { 1572 break; /* done */ 1573 } 1574 /* start<limit-1 */ 1575 1576 if (i <= 4) { 1577 /* linear search for the last part */ 1578 if (word0 <= (toUSection.get(toUSection.position() + start) & UConverterConstants.UNSIGNED_INT_MASK)) { 1579 break; 1580 } 1581 if (++start < limit 1582 && word0 <= (toUSection.get(toUSection.position() + start) & UConverterConstants.UNSIGNED_INT_MASK)) { 1583 break; 1584 } 1585 if (++start < limit 1586 && word0 <= (toUSection.get(toUSection.position() + start) & UConverterConstants.UNSIGNED_INT_MASK)) { 1587 break; 1588 } 1589 /* always break at start==limit-1 */ 1590 ++start; 1591 break; 1592 } 1593 1594 i = (start + limit) / 2; 1595 if (word < (toUSection.get(toUSection.position() + i) & UConverterConstants.UNSIGNED_INT_MASK)) { 1596 limit = i; 1597 } else { 1598 start = i; 1599 } 1600 } 1601 1602 /* did we really find it? 
*/ 1603 if (start < limit) { 1604 word = (toUSection.get(toUSection.position() + start) & UConverterConstants.UNSIGNED_INT_MASK); 1605 if (byt == TO_U_GET_BYTE((int)word)) { 1606 return TO_U_GET_VALUE((int) word); /* never 0 */ 1607 } 1608 } 1609 return 0; /* not found */ 1610 } 1611 1612 /* 1613 * TRUE if not an SI/SO stateful converter, or if the match length fits with the current converter state 1614 */ 1615 static boolean TO_U_VERIFY_SISO_MATCH(byte sisoState, int match) { 1616 return sisoState < 0 || (sisoState == 0) == (match == 1); 1617 } 1618 1619 /* 1620 * get the SI/SO toU state (state 0 is for SBCS, 1 for DBCS), or 1 for DBCS-only, or -1 if the converter is not 1621 * SI/SO stateful 1622 * 1623 * Note: For SI/SO stateful converters getting here, cnv->mode==0 is equivalent to firstLength==1. 1624 */ 1625 private static int SISO_STATE(UConverterSharedData sharedData, int mode) { 1626 return sharedData.mbcs.outputType == MBCS_OUTPUT_2_SISO ? (byte) mode 1627 : sharedData.mbcs.outputType == MBCS_OUTPUT_DBCS_ONLY ? 1 : -1; 1628 } 1629 1630 class CharsetDecoderMBCS extends CharsetDecoderICU { 1631 1632 CharsetDecoderMBCS(CharsetICU cs) { 1633 super(cs); 1634 } 1635 1636 protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) { 1637 /* Just call cnvMBCSToUnicodeWithOffsets() to remove duplicate code. 
*/ 1638 return cnvMBCSToUnicodeWithOffsets(source, target, offsets, flush); 1639 } 1640 1641 /* 1642 * continue partial match with new input never called for simple, single-character conversion 1643 */ 1644 private CoderResult continueMatchToU(ByteBuffer source, CharBuffer target, IntBuffer offsets, int srcIndex, 1645 boolean flush) { 1646 CoderResult cr = CoderResult.UNDERFLOW; 1647 1648 int[] value = new int[1]; 1649 int match, length; 1650 1651 match = matchToU((byte) SISO_STATE(sharedData, mode), preToUArray, preToUBegin, preToULength, source, 1652 value, isToUUseFallback(), flush); 1653 1654 if (match > 0) { 1655 if (match >= preToULength) { 1656 /* advance src pointer for the consumed input */ 1657 source.position(source.position() + match - preToULength); 1658 preToULength = 0; 1659 } else { 1660 /* the match did not use all of preToU[] - keep the rest for replay */ 1661 length = preToULength - match; 1662 System.arraycopy(preToUArray, preToUBegin + match, preToUArray, preToUBegin, length); 1663 preToULength = (byte) -length; 1664 } 1665 1666 /* write result */ 1667 cr = writeToU(value[0], target, offsets, srcIndex); 1668 } else if (match < 0) { 1669 /* save state for partial match */ 1670 int j, sArrayIndex; 1671 1672 /* just _append_ the newly consumed input to preToU[] */ 1673 sArrayIndex = source.position(); 1674 match = -match; 1675 for (j = preToULength; j < match; ++j) { 1676 preToUArray[j] = source.get(sArrayIndex++); 1677 } 1678 source.position(sArrayIndex); /* same as *src=srcLimit; because we reached the end of input */ 1679 preToULength = (byte) match; 1680 } else /* match==0 */{ 1681 /* 1682 * no match 1683 * 1684 * We need to split the previous input into two parts: 1685 * 1686 * 1. The first codepage character is unmappable - that's how we got into trying the extension data in 1687 * the first place. We need to move it from the preToU buffer to the error buffer, set an error code, 1688 * and prepare the rest of the previous input for 2. 
1689 * 1690 * 2. The rest of the previous input must be converted once we come back from the callback for the first 1691 * character. At that time, we have to try again from scratch to convert these input characters. The 1692 * replay will be handled by the ucnv.c conversion code. 1693 */ 1694 1695 /* move the first codepage character to the error field */ 1696 System.arraycopy(preToUArray, preToUBegin, toUBytesArray, toUBytesBegin, preToUFirstLength); 1697 toULength = preToUFirstLength; 1698 1699 /* move the rest up inside the buffer */ 1700 length = preToULength - preToUFirstLength; 1701 if (length > 0) { 1702 System.arraycopy(preToUArray, preToUBegin + preToUFirstLength, preToUArray, preToUBegin, length); 1703 } 1704 1705 /* mark preToU for replay */ 1706 preToULength = (byte) -length; 1707 1708 /* set the error code for unassigned */ 1709 cr = CoderResult.unmappableForLength(preToUFirstLength); 1710 } 1711 return cr; 1712 } 1713 1714 /* 1715 * this works like matchFromU() except - the first character is in pre - no trie is used - the returned 1716 * matchLength is not offset by 2 1717 */ 1718 private int matchToU(byte sisoState, byte[] preArray, int preArrayBegin, int preLength, ByteBuffer source, 1719 int[] pMatchValue, boolean isUseFallback, boolean flush) { 1720 ByteBuffer cx = sharedData.mbcs.extIndexes; 1721 IntBuffer toUTable, toUSection; 1722 1723 int value, matchValue, srcLength = 0; 1724 int i, j, index, length, matchLength; 1725 short b; 1726 1727 if (cx == null || cx.asIntBuffer().get(EXT_TO_U_LENGTH) <= 0) { 1728 return 0; /* no extension data, no match */ 1729 } 1730 1731 /* initialize */ 1732 toUTable = (IntBuffer) ARRAY(cx, EXT_TO_U_INDEX, int.class); 1733 index = 0; 1734 1735 matchValue = 0; 1736 i = j = matchLength = 0; 1737 if (source != null) { 1738 srcLength = source.remaining(); 1739 } 1740 1741 if (sisoState == 0) { 1742 /* SBCS state of an SI/SO stateful converter, look at only exactly 1 byte */ 1743 if (preLength > 1) { 1744 return 0; /* 
no match of a DBCS sequence in SBCS mode */ 1745 } else if (preLength == 1) { 1746 srcLength = 0; 1747 } else /* preLength==0 */{ 1748 if (srcLength > 1) { 1749 srcLength = 1; 1750 } 1751 } 1752 flush = true; 1753 } 1754 1755 /* we must not remember fallback matches when not using fallbacks */ 1756 1757 /* match input units until there is a full match or the input is consumed */ 1758 for (;;) { 1759 /* go to the next section */ 1760 int oldpos = toUTable.position(); 1761 toUSection = ((IntBuffer) toUTable.position(index)).slice(); 1762 toUTable.position(oldpos); 1763 1764 /* read first pair of the section */ 1765 value = toUSection.get(); 1766 length = TO_U_GET_BYTE(value); 1767 value = TO_U_GET_VALUE(value); 1768 if (value != 0 && (TO_U_IS_ROUNDTRIP(value) || isToUUseFallback(isUseFallback)) 1769 && TO_U_VERIFY_SISO_MATCH(sisoState, i + j)) { 1770 /* remember longest match so far */ 1771 matchValue = value; 1772 matchLength = i + j; 1773 } 1774 1775 /* match pre[] then src[] */ 1776 if (i < preLength) { 1777 b = (short) (preArray[preArrayBegin + i++] & UConverterConstants.UNSIGNED_BYTE_MASK); 1778 } else if (j < srcLength) { 1779 b = (short) (source.get(source.position() + j++) & UConverterConstants.UNSIGNED_BYTE_MASK); 1780 } else { 1781 /* all input consumed, partial match */ 1782 if (flush || (length = (i + j)) > MAX_BYTES) { 1783 /* 1784 * end of the entire input stream, stop with the longest match so far or: partial match must not 1785 * be longer than UCNV_EXT_MAX_BYTES because it must fit into state buffers 1786 */ 1787 break; 1788 } else { 1789 /* continue with more input next time */ 1790 return -length; 1791 } 1792 } 1793 1794 /* search for the current UChar */ 1795 value = findToU(toUSection, length, b); 1796 if (value == 0) { 1797 /* no match here, stop with the longest match so far */ 1798 break; 1799 } else { 1800 if (TO_U_IS_PARTIAL(value)) { 1801 /* partial match, continue */ 1802 index = TO_U_GET_PARTIAL_INDEX(value); 1803 } else { 1804 if 
((TO_U_IS_ROUNDTRIP(value) || isToUUseFallback(isUseFallback)) && TO_U_VERIFY_SISO_MATCH(sisoState, i + j)) { 1805 /* full match, stop with result */ 1806 matchValue = value; 1807 matchLength = i + j; 1808 } else { 1809 /* full match on fallback not taken, stop with the longest match so far */ 1810 } 1811 break; 1812 } 1813 } 1814 } 1815 1816 if (matchLength == 0) { 1817 /* no match at all */ 1818 return 0; 1819 } 1820 1821 /* return result */ 1822 pMatchValue[0] = TO_U_MASK_ROUNDTRIP(matchValue); 1823 return matchLength; 1824 } 1825 1826 private CoderResult writeToU(int value, CharBuffer target, IntBuffer offsets, int srcIndex) { 1827 ByteBuffer cx = sharedData.mbcs.extIndexes; 1828 /* output the result */ 1829 if (TO_U_IS_CODE_POINT(value)) { 1830 /* output a single code point */ 1831 return toUWriteCodePoint(TO_U_GET_CODE_POINT(value), target, offsets, srcIndex); 1832 } else { 1833 /* output a string - with correct data we have resultLength>0 */ 1834 1835 char[] a = new char[TO_U_GET_LENGTH(value)]; 1836 CharBuffer cb = ((CharBuffer) ARRAY(cx, EXT_TO_U_UCHARS_INDEX, char.class)); 1837 cb.position(TO_U_GET_INDEX(value)); 1838 cb.get(a, 0, a.length); 1839 return toUWriteUChars(this, a, 0, a.length, target, offsets, srcIndex); 1840 } 1841 } 1842 1843 private CoderResult toUWriteCodePoint(int c, CharBuffer target, IntBuffer offsets, int sourceIndex) { 1844 CoderResult cr = CoderResult.UNDERFLOW; 1845 int tBeginIndex = target.position(); 1846 1847 if (target.hasRemaining()) { 1848 if (c <= 0xffff) { 1849 target.put((char) c); 1850 c = UConverterConstants.U_SENTINEL; 1851 } else /* c is a supplementary code point */{ 1852 target.put(UTF16.getLeadSurrogate(c)); 1853 c = UTF16.getTrailSurrogate(c); 1854 if (target.hasRemaining()) { 1855 target.put((char) c); 1856 c = UConverterConstants.U_SENTINEL; 1857 } 1858 } 1859 1860 /* write offsets */ 1861 if (offsets != null) { 1862 offsets.put(sourceIndex); 1863 if ((tBeginIndex + 1) < target.position()) { 1864 
offsets.put(sourceIndex); 1865 } 1866 } 1867 } 1868 1869 /* write overflow from c */ 1870 if (c >= 0) { 1871 charErrorBufferLength = UTF16.append(charErrorBufferArray, 0, c); 1872 cr = CoderResult.OVERFLOW; 1873 } 1874 1875 return cr; 1876 } 1877 1878 /* 1879 * Input sequence: cnv->toUBytes[0..length[ @return if(U_FAILURE) return the length (toULength, byteIndex) for 1880 * the input else return 0 after output has been written to the target 1881 */ 1882 private int toU(int length, ByteBuffer source, CharBuffer target, IntBuffer offsets, int sourceIndex, 1883 boolean flush, CoderResult[] cr) { 1884 // ByteBuffer cx; 1885 1886 if (sharedData.mbcs.extIndexes != null 1887 && initialMatchToU(length, source, target, offsets, sourceIndex, flush, cr)) { 1888 return 0; /* an extension mapping handled the input */ 1889 } 1890 1891 /* GB 18030 */ 1892 if (length == 4 && (options & MBCS_OPTION_GB18030) != 0) { 1893 int[] range; 1894 int linear; 1895 int i; 1896 1897 linear = LINEAR_18030(toUBytesArray[0], toUBytesArray[1], toUBytesArray[2], toUBytesArray[3]); 1898 for (i = 0; i < gb18030Ranges.length; ++i) { 1899 range = gb18030Ranges[i]; 1900 if (range[2] <= linear && linear <= range[3]) { 1901 /* found the sequence, output the Unicode code point for it */ 1902 cr[0] = CoderResult.UNDERFLOW; 1903 1904 /* add the linear difference between the input and start sequences to the start code point */ 1905 linear = range[0] + (linear - range[2]); 1906 1907 /* output this code point */ 1908 cr[0] = toUWriteCodePoint(linear, target, offsets, sourceIndex); 1909 1910 return 0; 1911 } 1912 } 1913 } 1914 1915 /* no mapping */ 1916 cr[0] = CoderResult.unmappableForLength(length); 1917 return length; 1918 } 1919 1920 /* 1921 * target<targetLimit; set error code for overflow 1922 */ 1923 private boolean initialMatchToU(int firstLength, ByteBuffer source, CharBuffer target, IntBuffer offsets, 1924 int srcIndex, boolean flush, CoderResult[] cr) { 1925 int[] value = new int[1]; 1926 int match = 
0; 1927 1928 /* try to match */ 1929 match = matchToU((byte) SISO_STATE(sharedData, mode), toUBytesArray, toUBytesBegin, firstLength, source, 1930 value, isToUUseFallback(), flush); 1931 if (match > 0) { 1932 /* advance src pointer for the consumed input */ 1933 source.position(source.position() + match - firstLength); 1934 1935 /* write result to target */ 1936 cr[0] = writeToU(value[0], target, offsets, srcIndex); 1937 return true; 1938 } else if (match < 0) { 1939 /* save state for partial match */ 1940 byte[] sArray; 1941 int sArrayIndex; 1942 int j; 1943 1944 /* copy the first code point */ 1945 sArray = toUBytesArray; 1946 sArrayIndex = toUBytesBegin; 1947 preToUFirstLength = (byte) firstLength; 1948 for (j = 0; j < firstLength; ++j) { 1949 preToUArray[j] = sArray[sArrayIndex++]; 1950 } 1951 1952 /* now copy the newly consumed input */ 1953 sArrayIndex = source.position(); 1954 match = -match; 1955 for (; j < match; ++j) { 1956 preToUArray[j] = source.get(sArrayIndex++); 1957 } 1958 source.position(sArrayIndex); 1959 preToULength = (byte) match; 1960 return true; 1961 } else /* match==0 no match */{ 1962 return false; 1963 } 1964 } 1965 1966 private int simpleMatchToU(ByteBuffer source, boolean useFallback) { 1967 int[] value = new int[1]; 1968 int match; 1969 1970 if (source.remaining() <= 0) { 1971 return 0xffff; 1972 } 1973 1974 /* try to match */ 1975 byte[] sourceArray; 1976 int sourcePosition, sourceLimit; 1977 if (source.isReadOnly()) { 1978 // source.array() would throw an exception 1979 sourcePosition = source.position(); // relative to source.array() 1980 sourceArray = new byte[Math.min(source.remaining(), EXT_MAX_BYTES)]; 1981 source.get(sourceArray).position(sourcePosition); 1982 sourcePosition = 0; // relative to sourceArray 1983 sourceLimit = sourceArray.length; 1984 } else { 1985 sourceArray = source.array(); 1986 sourcePosition = source.position(); 1987 sourceLimit = source.limit(); 1988 } 1989 match = matchToU((byte) -1, sourceArray, 
sourcePosition, sourceLimit, null, value, useFallback, true); 1990 1991 if (match == source.remaining()) { 1992 /* write result for simple, single-character conversion */ 1993 if (TO_U_IS_CODE_POINT(value[0])) { 1994 return TO_U_GET_CODE_POINT(value[0]); 1995 } 1996 } 1997 1998 /* 1999 * return no match because - match>0 && value points to string: simple conversion cannot handle multiple 2000 * code points - match>0 && match!=length: not all input consumed, forbidden for this function - match==0: 2001 * no match found in the first place - match<0: partial match, not supported for simple conversion (and 2002 * flush==TRUE) 2003 */ 2004 return 0xfffe; 2005 } 2006 2007 CoderResult cnvMBCSToUnicodeWithOffsets(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) { 2008 CoderResult[] cr = { CoderResult.UNDERFLOW }; 2009 2010 int sourceArrayIndex, sourceArrayIndexStart; 2011 int stateTable[][/* 256 */]; 2012 char[] unicodeCodeUnits; 2013 2014 int offset; 2015 byte state; 2016 int byteIndex; 2017 byte[] bytes; 2018 2019 int sourceIndex, nextSourceIndex; 2020 2021 int entry = 0; 2022 char c; 2023 byte action; 2024 2025 if (preToULength > 0) { 2026 /* 2027 * pass sourceIndex=-1 because we continue from an earlier buffer in the future, this may change with 2028 * continuous offsets 2029 */ 2030 cr[0] = continueMatchToU(source, target, offsets, -1, flush); 2031 2032 if (cr[0].isError() || preToULength < 0) { 2033 return cr[0]; 2034 } 2035 } 2036 2037 if (sharedData.mbcs.countStates == 1) { 2038 if (!sharedData.mbcs.hasSupplementary()) { 2039 cr[0] = cnvMBCSSingleToBMPWithOffsets(source, target, offsets, flush); 2040 } else { 2041 cr[0] = cnvMBCSSingleToUnicodeWithOffsets(source, target, offsets, flush); 2042 } 2043 return cr[0]; 2044 } 2045 2046 /* set up the local pointers */ 2047 sourceArrayIndex = sourceArrayIndexStart = source.position(); 2048 2049 if ((options & UConverterConstants.OPTION_SWAP_LFNL) != 0) { 2050 stateTable = 
sharedData.mbcs.swapLFNLStateTable; 2051 } else { 2052 stateTable = sharedData.mbcs.stateTable; 2053 } 2054 unicodeCodeUnits = sharedData.mbcs.unicodeCodeUnits; 2055 2056 /* get the converter state from UConverter */ 2057 offset = toUnicodeStatus; 2058 byteIndex = toULength; 2059 bytes = toUBytesArray; 2060 2061 /* 2062 * if we are in the SBCS state for a DBCS-only converter, then load the DBCS state from the MBCS data 2063 * (dbcsOnlyState==0 if it is not a DBCS-only converter) 2064 */ 2065 state = (byte)mode; 2066 if (state == 0) { 2067 state = sharedData.mbcs.dbcsOnlyState; 2068 } 2069 2070 /* sourceIndex=-1 if the current character began in the previous buffer */ 2071 sourceIndex = byteIndex == 0 ? 0 : -1; 2072 nextSourceIndex = 0; 2073 2074 /* conversion loop */ 2075 while (sourceArrayIndex < source.limit()) { 2076 /* 2077 * This following test is to see if available input would overflow the output. It does not catch output 2078 * of more than one code unit that overflows as a result of a surrogate pair or callback output from the 2079 * last source byte. Therefore, those situations also test for overflows and will then break the loop, 2080 * too. 
2081 */ 2082 if (!target.hasRemaining()) { 2083 /* target is full */ 2084 cr[0] = CoderResult.OVERFLOW; 2085 break; 2086 } 2087 2088 if (byteIndex == 0) { 2089 /* optimized loop for 1/2-byte input and BMP output */ 2090 // agljport:todo see ucnvmbcs.c for deleted block 2091 do { 2092 entry = stateTable[state][source.get(sourceArrayIndex)&UConverterConstants.UNSIGNED_BYTE_MASK]; 2093 if (MBCS_ENTRY_IS_TRANSITION(entry)) { 2094 state = (byte)MBCS_ENTRY_TRANSITION_STATE(entry); 2095 offset = MBCS_ENTRY_TRANSITION_OFFSET(entry); 2096 ++sourceArrayIndex; 2097 if (sourceArrayIndex < source.limit() 2098 && MBCS_ENTRY_IS_FINAL(entry = stateTable[state][source.get(sourceArrayIndex)&UConverterConstants.UNSIGNED_BYTE_MASK]) 2099 && MBCS_ENTRY_FINAL_ACTION(entry) == MBCS_STATE_VALID_16 2100 && (c = unicodeCodeUnits[offset + MBCS_ENTRY_FINAL_VALUE_16(entry)]) < 0xfffe) { 2101 ++sourceArrayIndex; 2102 target.put(c); 2103 if (offsets != null) { 2104 offsets.put(sourceIndex); 2105 sourceIndex = (nextSourceIndex += 2); 2106 } 2107 state = (byte)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 2108 offset = 0; 2109 } else { 2110 /* set the state and leave the optimized loop */ 2111 ++nextSourceIndex; 2112 bytes[0] = source.get(sourceArrayIndex - 1); 2113 byteIndex = 1; 2114 break; 2115 } 2116 } else { 2117 if (MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) { 2118 /* output BMP code point */ 2119 ++sourceArrayIndex; 2120 target.put(MBCS_ENTRY_FINAL_VALUE_16(entry)); 2121 if (offsets != null) { 2122 offsets.put(sourceIndex); 2123 sourceIndex = ++nextSourceIndex; 2124 } 2125 state = (byte)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 2126 } else { 2127 /* leave the optimized loop */ 2128 break; 2129 } 2130 } 2131 } while (sourceArrayIndex < source.limit() && target.hasRemaining()); 2132 /* 2133 * these tests and break statements could be put inside the loop if C had "break outerLoop" like 2134 * Java 2135 */ 2136 if (sourceArrayIndex >= source.limit()) { 2137 break; 2138 } 2139 if 
(!target.hasRemaining()) { 2140 /* target is full */ 2141 cr[0] = CoderResult.OVERFLOW; 2142 break; 2143 } 2144 2145 ++nextSourceIndex; 2146 bytes[byteIndex++] = source.get(sourceArrayIndex++); 2147 } else /* byteIndex>0 */{ 2148 ++nextSourceIndex; 2149 entry = stateTable[state][(bytes[byteIndex++] = source.get(sourceArrayIndex++)) 2150 & UConverterConstants.UNSIGNED_BYTE_MASK]; 2151 } 2152 2153 if (MBCS_ENTRY_IS_TRANSITION(entry)) { 2154 state = (byte)MBCS_ENTRY_TRANSITION_STATE(entry); 2155 offset += MBCS_ENTRY_TRANSITION_OFFSET(entry); 2156 continue; 2157 } 2158 2159 /* save the previous state for proper extension mapping with SI/SO-stateful converters */ 2160 mode = state; 2161 2162 /* set the next state early so that we can reuse the entry variable */ 2163 state = (byte)MBCS_ENTRY_FINAL_STATE(entry); /* typically 0 */ 2164 2165 /* 2166 * An if-else-if chain provides more reliable performance for the most common cases compared to a 2167 * switch. 2168 */ 2169 action = (byte)MBCS_ENTRY_FINAL_ACTION(entry); 2170 if (action == MBCS_STATE_VALID_16) { 2171 offset += MBCS_ENTRY_FINAL_VALUE_16(entry); 2172 c = unicodeCodeUnits[offset]; 2173 if (c < 0xfffe) { 2174 /* output BMP code point */ 2175 target.put(c); 2176 if (offsets != null) { 2177 offsets.put(sourceIndex); 2178 } 2179 byteIndex = 0; 2180 } else if (c == 0xfffe) { 2181 if (isFallbackUsed() && (entry = getFallback(sharedData.mbcs, offset)) != 0xfffe) { 2182 /* output fallback BMP code point */ 2183 target.put((char)entry); 2184 if (offsets != null) { 2185 offsets.put(sourceIndex); 2186 } 2187 byteIndex = 0; 2188 } 2189 } else { 2190 /* callback(illegal) */ 2191 cr[0] = CoderResult.malformedForLength(byteIndex); 2192 } 2193 } else if (action == MBCS_STATE_VALID_DIRECT_16) { 2194 /* output BMP code point */ 2195 target.put(MBCS_ENTRY_FINAL_VALUE_16(entry)); 2196 if (offsets != null) { 2197 offsets.put(sourceIndex); 2198 } 2199 byteIndex = 0; 2200 } else if (action == MBCS_STATE_VALID_16_PAIR) { 2201 offset += 
MBCS_ENTRY_FINAL_VALUE_16(entry); 2202 c = unicodeCodeUnits[offset++]; 2203 if (c < 0xd800) { 2204 /* output BMP code point below 0xd800 */ 2205 target.put(c); 2206 if (offsets != null) { 2207 offsets.put(sourceIndex); 2208 } 2209 byteIndex = 0; 2210 } else if (isFallbackUsed() ? c <= 0xdfff : c <= 0xdbff) { 2211 /* output roundtrip or fallback surrogate pair */ 2212 target.put((char)(c & 0xdbff)); 2213 if (offsets != null) { 2214 offsets.put(sourceIndex); 2215 } 2216 byteIndex = 0; 2217 if (target.hasRemaining()) { 2218 target.put(unicodeCodeUnits[offset]); 2219 if (offsets != null) { 2220 offsets.put(sourceIndex); 2221 } 2222 } else { 2223 /* target overflow */ 2224 charErrorBufferArray[0] = unicodeCodeUnits[offset]; 2225 charErrorBufferLength = 1; 2226 cr[0] = CoderResult.OVERFLOW; 2227 2228 offset = 0; 2229 break; 2230 } 2231 } else if (isFallbackUsed() ? (c & 0xfffe) == 0xe000 : c == 0xe000) { 2232 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ 2233 target.put(unicodeCodeUnits[offset]); 2234 if (offsets != null) { 2235 offsets.put(sourceIndex); 2236 } 2237 byteIndex = 0; 2238 } else if (c == 0xffff) { 2239 /* callback(illegal) */ 2240 cr[0] = CoderResult.malformedForLength(byteIndex); 2241 } 2242 } else if (action == MBCS_STATE_VALID_DIRECT_20 2243 || (action == MBCS_STATE_FALLBACK_DIRECT_20 && isFallbackUsed())) { 2244 entry = MBCS_ENTRY_FINAL_VALUE(entry); 2245 /* output surrogate pair */ 2246 target.put((char)(0xd800 | (char)(entry >> 10))); 2247 if (offsets != null) { 2248 offsets.put(sourceIndex); 2249 } 2250 byteIndex = 0; 2251 c = (char)(0xdc00 | (char)(entry & 0x3ff)); 2252 if (target.hasRemaining()) { 2253 target.put(c); 2254 if (offsets != null) { 2255 offsets.put(sourceIndex); 2256 } 2257 } else { 2258 /* target overflow */ 2259 charErrorBufferArray[0] = c; 2260 charErrorBufferLength = 1; 2261 cr[0] = CoderResult.OVERFLOW; 2262 2263 offset = 0; 2264 break; 2265 } 2266 } else if (action == MBCS_STATE_CHANGE_ONLY) { 2267 
/* 2268 * This serves as a state change without any output. It is useful for reading simple stateful 2269 * encodings, for example using just Shift-In/Shift-Out codes. The 21 unused bits may later be used 2270 * for more sophisticated state transitions. 2271 */ 2272 if (sharedData.mbcs.dbcsOnlyState == 0) { 2273 byteIndex = 0; 2274 } else { 2275 /* SI/SO are illegal for DBCS-only conversion */ 2276 state = (byte)(mode); /* restore the previous state */ 2277 2278 /* callback(illegal) */ 2279 cr[0] = CoderResult.malformedForLength(byteIndex); 2280 } 2281 } else if (action == MBCS_STATE_FALLBACK_DIRECT_16) { 2282 if (isFallbackUsed()) { 2283 /* output BMP code point */ 2284 target.put(MBCS_ENTRY_FINAL_VALUE_16(entry)); 2285 if (offsets != null) { 2286 offsets.put(sourceIndex); 2287 } 2288 byteIndex = 0; 2289 } 2290 } else if (action == MBCS_STATE_UNASSIGNED) { 2291 /* just fall through */ 2292 } else if (action == MBCS_STATE_ILLEGAL) { 2293 /* callback(illegal) */ 2294 cr[0] = CoderResult.malformedForLength(byteIndex); 2295 } else { 2296 /* reserved, must never occur */ 2297 byteIndex = 0; 2298 } 2299 2300 /* end of action codes: prepare for a new character */ 2301 offset = 0; 2302 2303 if (byteIndex == 0) { 2304 sourceIndex = nextSourceIndex; 2305 } else if (cr[0].isError()) { 2306 /* callback(illegal) */ 2307 if (byteIndex > 1) { 2308 /* 2309 * Ticket 5691: consistent illegal sequences: 2310 * - We include at least the first byte in the illegal sequence. 2311 * - If any of the non-initial bytes could be the start of a character, 2312 * we stop the illegal sequence before the first one of those. 
2313 */ 2314 boolean isDBCSOnly = (sharedData.mbcs.dbcsOnlyState != 0); 2315 byte i; 2316 for (i = 1; i < byteIndex && !isSingleOrLead(stateTable, state, isDBCSOnly, (short)(bytes[i] & UConverterConstants.UNSIGNED_BYTE_MASK)); i++) {} 2317 if (i < byteIndex) { 2318 byte backOutDistance = (byte)(byteIndex - i); 2319 int bytesFromThisBuffer = sourceArrayIndex - sourceArrayIndexStart; 2320 byteIndex = i; /* length of reported illegal byte sequence */ 2321 if (backOutDistance <= bytesFromThisBuffer) { 2322 sourceArrayIndex -= backOutDistance; 2323 } else { 2324 /* Back out bytes from the previous buffer: Need to replay them. */ 2325 this.preToULength = (byte)(bytesFromThisBuffer - backOutDistance); 2326 /* preToULength is negative! */ 2327 for (int n = 0; n < -this.preToULength; n++) { 2328 this.preToUArray[n] = bytes[i+n]; 2329 } 2330 sourceArrayIndex = sourceArrayIndexStart; 2331 } 2332 } 2333 } 2334 break; 2335 } else /* unassigned sequences indicated with byteIndex>0 */{ 2336 /* try an extension mapping */ 2337 int sourceBeginIndex = sourceArrayIndex; 2338 source.position(sourceArrayIndex); 2339 byteIndex = toU(byteIndex, source, target, offsets, sourceIndex, flush, cr); 2340 sourceArrayIndex = source.position(); 2341 sourceIndex = nextSourceIndex += (sourceArrayIndex - sourceBeginIndex); 2342 2343 if (cr[0].isError() || cr[0].isOverflow()) { 2344 /* not mappable or buffer overflow */ 2345 break; 2346 } 2347 } 2348 } 2349 2350 /* set the converter state back into UConverter */ 2351 toUnicodeStatus = offset; 2352 mode = state; 2353 toULength = byteIndex; 2354 2355 /* write back the updated pointers */ 2356 source.position(sourceArrayIndex); 2357 2358 return cr[0]; 2359 } 2360 /* 2361 * This version of cnvMBCSSingleToUnicodeWithOffsets() is optimized for single-byte, single-state codepages that 2362 * only map to and from the BMP. In addition to single-byte optimizations, the offset calculations become much 2363 * easier. 
*/
        /*
         * Reads bytes from source starting at its position, looks each one up in row 0 of the state table and writes
         * the resulting char to target. Offsets are not written per character on the fast path; they are written
         * in batches before an extension lookup / error and once more at the end. On return the source position is
         * set to the first unconsumed byte and the CoderResult describes how the loop ended.
         */
        private CoderResult cnvMBCSSingleToBMPWithOffsets(ByteBuffer source, CharBuffer target, IntBuffer offsets,
                boolean flush) {
            CoderResult[] cr = { CoderResult.UNDERFLOW };

            int sourceArrayIndex, lastSource;
            int targetCapacity, length;
            int[][] stateTable;

            int sourceIndex;

            int entry;
            byte action;

            /* set up the local pointers */
            sourceArrayIndex = source.position();
            targetCapacity = target.remaining();

            if ((options & UConverterConstants.OPTION_SWAP_LFNL) != 0) {
                stateTable = sharedData.mbcs.swapLFNLStateTable;
            } else {
                stateTable = sharedData.mbcs.stateTable;
            }

            /* sourceIndex=-1 if the current character began in the previous buffer */
            /* (in this single-byte variant it always starts at 0) */
            sourceIndex = 0;
            /* lastSource: source index from which offsets have not been written yet */
            lastSource = sourceArrayIndex;

            /*
             * since the conversion here is 1:1 UChar:uint8_t, we need only one counter for the minimum of the
             * sourceLength and targetCapacity
             */
            length = source.remaining();
            if (length < targetCapacity) {
                targetCapacity = length;
            }

            /* conversion loop */
            while (targetCapacity > 0 && sourceArrayIndex < source.limit()) {
                entry = stateTable[0][source.get(sourceArrayIndex++) & UConverterConstants.UNSIGNED_BYTE_MASK];
                /* MBCS_ENTRY_IS_FINAL(entry) */

                /* test the most common case first */
                if (MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
                    /* output BMP code point */
                    target.put(MBCS_ENTRY_FINAL_VALUE_16(entry));
                    --targetCapacity;
                    continue;
                }

                /*
                 * An if-else-if chain provides more reliable performance for the most common cases compared to a
                 * switch.
                 */
                action = (byte) (MBCS_ENTRY_FINAL_ACTION(entry));
                if (action == MBCS_STATE_FALLBACK_DIRECT_16) {
                    if (isFallbackUsed()) {
                        /* output BMP code point */
                        target.put(MBCS_ENTRY_FINAL_VALUE_16(entry));
                        --targetCapacity;
                        continue;
                    }
                    /* fallback not used: falls through to the extension lookup below */
                } else if (action == MBCS_STATE_UNASSIGNED) {
                    /* just fall through */
                } else if (action == MBCS_STATE_ILLEGAL) {
                    /* callback(illegal) */
                    /*
                     * NOTE(review): the reported length is the distance from lastSource, whereas
                     * cnvMBCSSingleToUnicodeWithOffsets() reports malformedForLength(1) - confirm this is intended.
                     */
                    cr[0] = CoderResult.malformedForLength(sourceArrayIndex - lastSource);
                } else {
                    /* reserved, must never occur */
                    continue;
                }

                /* set offsets since the start or the last extension */
                if (offsets != null) {
                    int count = sourceArrayIndex - lastSource;

                    /* predecrement: do not set the offset for the callback-causing character */
                    while (--count > 0) {
                        offsets.put(sourceIndex++);
                    }
                    /* offset and sourceIndex are now set for the current character */
                }

                if (cr[0].isError()) {
                    /* callback(illegal) */
                    break;
                } else /* unassigned sequences indicated with byteIndex>0 */{
                    /* try an extension mapping */
                    lastSource = sourceArrayIndex;
                    /* hand the current byte to toU() via toUBytesArray; toU() may advance the source position */
                    toUBytesArray[0] = source.get(sourceArrayIndex - 1);
                    source.position(sourceArrayIndex);
                    toULength = toU((byte) 1, source, target, offsets, sourceIndex, flush, cr);
                    sourceArrayIndex = source.position();
                    sourceIndex += 1 + (sourceArrayIndex - lastSource);

                    if (cr[0].isError()) {
                        /* not mappable or buffer overflow */
                        break;
                    }

                    /* recalculate the targetCapacity after an extension mapping */
                    targetCapacity = target.remaining();
                    length = source.remaining();
                    if (length < targetCapacity) {
                        targetCapacity = length;
                    }
                }
            }

            if (!cr[0].isError() && sourceArrayIndex < source.limit() && !target.hasRemaining()) {
                /* target is full */
                cr[0] = CoderResult.OVERFLOW;
            }

            /* set offsets since the start or the last callback */
            if (offsets != null) {
                int count = sourceArrayIndex - lastSource;
                while (count > 0) {
                    offsets.put(sourceIndex++);
                    --count;
                }
            }

            /* write back the updated pointers */
            source.position(sourceArrayIndex);

            return cr[0];
        }

        /* This version of cnvMBCSToUnicodeWithOffsets() is optimized for single-byte, single-state codepages. */
        /*
         * Unlike cnvMBCSSingleToBMPWithOffsets() it also handles entries that produce a surrogate pair, and it
         * writes one offset per output code unit as it goes. On return the source position is set to the first
         * unconsumed byte.
         */
        private CoderResult cnvMBCSSingleToUnicodeWithOffsets(ByteBuffer source, CharBuffer target, IntBuffer offsets,
                boolean flush) {
            CoderResult[] cr = { CoderResult.UNDERFLOW };

            int sourceArrayIndex;
            int[][] stateTable;

            int sourceIndex;

            int entry;
            char c;
            byte action;

            /* set up the local pointers */
            sourceArrayIndex = source.position();

            if ((options & UConverterConstants.OPTION_SWAP_LFNL) != 0) {
                stateTable = sharedData.mbcs.swapLFNLStateTable;
            } else {
                stateTable = sharedData.mbcs.stateTable;
            }

            /* sourceIndex=-1 if the current character began in the previous buffer */
            /* (in this single-byte variant it always starts at 0) */
            sourceIndex = 0;

            /* conversion loop */
            while (sourceArrayIndex < source.limit()) {
                /*
                 * This following test is to see if available input would overflow the output. It does not catch output
                 * of more than one code unit that overflows as a result of a surrogate pair or callback output from the
                 * last source byte. Therefore, those situations also test for overflows and will then break the loop,
                 * too.
                 */
                if (!target.hasRemaining()) {
                    /* target is full */
                    cr[0] = CoderResult.OVERFLOW;
                    break;
                }

                entry = stateTable[0][source.get(sourceArrayIndex++) & UConverterConstants.UNSIGNED_BYTE_MASK];
                /* MBCS_ENTRY_IS_FINAL(entry) */

                /* test the most common case first */
                if (MBCS_ENTRY_FINAL_IS_VALID_DIRECT_16(entry)) {
                    /* output BMP code point */
                    target.put(MBCS_ENTRY_FINAL_VALUE_16(entry));
                    if (offsets != null) {
                        offsets.put(sourceIndex);
                    }

                    /* normal end of action codes: prepare for a new character */
                    ++sourceIndex;
                    continue;
                }

                /*
                 * An if-else-if chain provides more reliable performance for the most common cases compared to a
                 * switch.
                 */
                action = (byte) (MBCS_ENTRY_FINAL_ACTION(entry));
                if (action == MBCS_STATE_VALID_DIRECT_20
                        || (action == MBCS_STATE_FALLBACK_DIRECT_20 && isFallbackUsed())) {

                    entry = MBCS_ENTRY_FINAL_VALUE(entry);
                    /* output surrogate pair */
                    target.put((char) (0xd800 | (char) (entry >>> 10)));
                    if (offsets != null) {
                        offsets.put(sourceIndex);
                    }
                    c = (char) (0xdc00 | (char) (entry & 0x3ff));
                    if (target.hasRemaining()) {
                        target.put(c);
                        if (offsets != null) {
                            offsets.put(sourceIndex);
                        }
                    } else {
                        /* target overflow */
                        /* keep the trail surrogate in the error buffer so it is not lost */
                        charErrorBufferArray[0] = c;
                        charErrorBufferLength = 1;
                        cr[0] = CoderResult.OVERFLOW;
                        break;
                    }

                    ++sourceIndex;
                    continue;
                } else if (action == MBCS_STATE_FALLBACK_DIRECT_16) {
                    if (isFallbackUsed()) {
                        /* output BMP code point */
                        target.put(MBCS_ENTRY_FINAL_VALUE_16(entry));
                        if (offsets != null) {
                            offsets.put(sourceIndex);
                        }

                        ++sourceIndex;
                        continue;
                    }
                    /* fallback not used: falls through to the extension lookup below */
                } else if (action == MBCS_STATE_UNASSIGNED) {
                    /* just fall through */
                } else if (action == MBCS_STATE_ILLEGAL) {
                    /* callback(illegal) */
                    cr[0] = CoderResult.malformedForLength(1);
                } else {
                    /* reserved, must never occur */
                    ++sourceIndex;
                    continue;
                }

                if (cr[0].isError()) {
                    /* callback(illegal) */
                    break;
                } else /* unassigned sequences indicated with byteIndex>0 */{
                    /* try an extension mapping */
                    int sourceBeginIndex = sourceArrayIndex;
                    /* hand the current byte to toU() via toUBytesArray; toU() may advance the source position */
                    toUBytesArray[0] = source.get(sourceArrayIndex - 1);
                    source.position(sourceArrayIndex);
                    toULength = toU((byte) 1, source, target, offsets, sourceIndex, flush, cr);
                    sourceArrayIndex = source.position();
                    sourceIndex += 1 + (sourceArrayIndex - sourceBeginIndex);

                    if (cr[0].isError()) {
                        /* not mappable or buffer overflow */
                        break;
                    }
                }
            }

            /* write back the updated pointers */
            source.position(sourceArrayIndex);

            return cr[0];
        }

        /*
         * Looks up offset in mbcsTable.toUFallbacks (binary search over the first countToUFallbacks entries, which
         * must be sorted by offset). Returns the fallback code point, or 0xfffe if there is no entry for offset.
         */
        private int getFallback(UConverterMBCSTable mbcsTable, int offset) {
            MBCSToUFallback[] toUFallbacks;
            int i, start, limit;

            limit = mbcsTable.countToUFallbacks;
            if (limit > 0) {
                /* do a binary search for the fallback mapping */
                toUFallbacks = mbcsTable.toUFallbacks;
                start = 0;
                /* invariant: a matching entry, if any, is in [start, limit) */
                while (start < limit - 1) {
                    i = (start + limit) >>> 1;
                    if (offset < toUFallbacks[i].offset) {
                        limit = i;
                    } else {
                        start = i;
                    }
                }

                /* did we really find it? */
                if (offset == toUFallbacks[start].offset) {
                    return toUFallbacks[start].codePoint;
                }
            }

            return 0xfffe;
        }

        /**
         * This is a simple version of _MBCSGetNextUChar() that is used by other converter implementations. It only
         * returns an "assigned" result if it consumes the entire input. It does not use state from the converter, nor
         * error codes. It does not handle the EBCDIC swaplfnl option (set in UConverter). It handles conversion
         * extensions but not GB 18030.
2658 * 2659 * @return U+fffe unassigned U+ffff illegal otherwise the Unicode code point 2660 */ 2661 int simpleGetNextUChar(ByteBuffer source, boolean useFallback) { 2662 2663 // #if 0 2664 // /* 2665 // * Code disabled 2002dec09 (ICU 2.4) because it is not currently used in ICU. markus 2666 // * TODO In future releases, verify that this function is never called for SBCS 2667 // * conversions, i.e., that sharedData->mbcs.countStates==1 is still true. 2668 // * Removal improves code coverage. 2669 // */ 2670 // /* use optimized function if possible */ 2671 // if(sharedData->mbcs.countStates==1) { 2672 // if(length==1) { 2673 // return ucnv_MBCSSingleSimpleGetNextUChar(sharedData, (uint8_t)*source, useFallback); 2674 // } else { 2675 // return 0xffff; /* illegal: more than a single byte for an SBCS converter */ 2676 // } 2677 // } 2678 // #endif 2679 2680 /* set up the local pointers */ 2681 int[][] stateTable = sharedData.mbcs.stateTable; 2682 char[] unicodeCodeUnits = sharedData.mbcs.unicodeCodeUnits; 2683 2684 /* converter state */ 2685 int offset = 0; 2686 int state = sharedData.mbcs.dbcsOnlyState; 2687 2688 int action; 2689 int entry; 2690 int c; 2691 int i = source.position(); 2692 int length = source.limit() - i; 2693 2694 /* conversion loop */ 2695 while (true) { 2696 // entry=stateTable[state][(uint8_t)source[i++]]; 2697 entry = stateTable[state][source.get(i++) & UConverterConstants.UNSIGNED_BYTE_MASK]; 2698 2699 if (MBCS_ENTRY_IS_TRANSITION(entry)) { 2700 state = MBCS_ENTRY_TRANSITION_STATE(entry); 2701 offset += MBCS_ENTRY_TRANSITION_OFFSET(entry); 2702 2703 if (i == source.limit()) { 2704 return 0xffff; /* truncated character */ 2705 } 2706 } else { 2707 /* 2708 * An if-else-if chain provides more reliable performance for the most common cases compared to a 2709 * switch. 
2710 */ 2711 action = MBCS_ENTRY_FINAL_ACTION(entry); 2712 if (action == MBCS_STATE_VALID_16) { 2713 offset += MBCS_ENTRY_FINAL_VALUE_16(entry); 2714 c = unicodeCodeUnits[offset]; 2715 if (c != 0xfffe) { 2716 /* done */ 2717 } else if (isToUUseFallback()) { 2718 c = getFallback(sharedData.mbcs, offset); 2719 } 2720 /* else done with 0xfffe */ 2721 } else if (action == MBCS_STATE_VALID_DIRECT_16) { 2722 // /* output BMP code point */ 2723 c = MBCS_ENTRY_FINAL_VALUE_16(entry); 2724 } else if (action == MBCS_STATE_VALID_16_PAIR) { 2725 offset += MBCS_ENTRY_FINAL_VALUE_16(entry); 2726 c = unicodeCodeUnits[offset++]; 2727 if (c < 0xd800) { 2728 /* output BMP code point below 0xd800 */ 2729 } else if (isToUUseFallback() ? c <= 0xdfff : c <= 0xdbff) { 2730 /* output roundtrip or fallback supplementary code point */ 2731 c = (((c & 0x3ff) << 10) + unicodeCodeUnits[offset] + (0x10000 - 0xdc00)); 2732 } else if (isToUUseFallback() ? (c & 0xfffe) == 0xe000 : c == 0xe000) { 2733 /* output roundtrip BMP code point above 0xd800 or fallback BMP code point */ 2734 c = unicodeCodeUnits[offset]; 2735 } else if (c == 0xffff) { 2736 return 0xffff; 2737 } else { 2738 c = 0xfffe; 2739 } 2740 } else if (action == MBCS_STATE_VALID_DIRECT_20) { 2741 /* output supplementary code point */ 2742 c = 0x10000 + MBCS_ENTRY_FINAL_VALUE(entry); 2743 } else if (action == MBCS_STATE_FALLBACK_DIRECT_16) { 2744 if (!isToUUseFallback(useFallback)) { 2745 c = 0xfffe; 2746 } else { 2747 /* output BMP code point */ 2748 c = MBCS_ENTRY_FINAL_VALUE_16(entry); 2749 } 2750 } else if (action == MBCS_STATE_FALLBACK_DIRECT_20) { 2751 if (!isToUUseFallback(useFallback)) { 2752 c = 0xfffe; 2753 } else { 2754 /* output supplementary code point */ 2755 c = 0x10000 + MBCS_ENTRY_FINAL_VALUE(entry); 2756 } 2757 } else if (action == MBCS_STATE_UNASSIGNED) { 2758 c = 0xfffe; 2759 } else { 2760 /* 2761 * forbid MBCS_STATE_CHANGE_ONLY for this function, and MBCS_STATE_ILLEGAL and reserved action 2762 * codes 2763 */ 2764 
return 0xffff; 2765 } 2766 break; 2767 } 2768 } 2769 2770 if (i != source.limit()) { 2771 /* illegal for this function: not all input consumed */ 2772 return 0xffff; 2773 } 2774 2775 if (c == 0xfffe) { 2776 /* try an extension mapping */ 2777 if (sharedData.mbcs.extIndexes != null) { 2778 /* Increase the limit for proper handling. Used in LMBCS. */ 2779 if (source.limit() > i + length) { 2780 source.limit(i + length); 2781 } 2782 return simpleMatchToU(source, useFallback); 2783 } 2784 } 2785 2786 return c; 2787 } 2788 private boolean hasValidTrailBytes(int[][] stateTable, short state) { 2789 int[] row = stateTable[state]; 2790 int b, entry; 2791 /* First test for final entries in this state for some commonly valid byte values. */ 2792 entry = row[0xa1]; 2793 if (!MBCS_ENTRY_IS_TRANSITION(entry) && MBCS_ENTRY_FINAL_ACTION(entry) != MBCS_STATE_ILLEGAL) { 2794 return true; 2795 } 2796 entry = row[0x41]; 2797 if (!MBCS_ENTRY_IS_TRANSITION(entry) && MBCS_ENTRY_FINAL_ACTION(entry) != MBCS_STATE_ILLEGAL) { 2798 return true; 2799 } 2800 /* Then test for final entries in this state. */ 2801 for (b = 0; b <= 0xff; b++) { 2802 entry = row[b]; 2803 if (!MBCS_ENTRY_IS_TRANSITION(entry) && MBCS_ENTRY_FINAL_ACTION(entry) != MBCS_STATE_ILLEGAL) { 2804 return true; 2805 } 2806 } 2807 /* Then recurse for transition entries. 
*/ 2808 for (b = 0; b <= 0xff; b++) { 2809 entry = row[b]; 2810 if (MBCS_ENTRY_IS_TRANSITION(entry) && 2811 hasValidTrailBytes(stateTable, (short)MBCS_ENTRY_TRANSITION_STATE(entry))) { 2812 return true; 2813 } 2814 } 2815 return false; 2816 } 2817 2818 private boolean isSingleOrLead(int[][] stateTable, int state, boolean isDBCSOnly, int b) { 2819 int[] row = stateTable[state]; 2820 int entry = row[b]; 2821 if (MBCS_ENTRY_IS_TRANSITION(entry)) { /* lead byte */ 2822 return hasValidTrailBytes(stateTable, (short)MBCS_ENTRY_TRANSITION_STATE(entry)); 2823 } else { 2824 int action = MBCS_ENTRY_FINAL_ACTION(entry); 2825 if (action == MBCS_STATE_CHANGE_ONLY && isDBCSOnly) { 2826 return false; /* SI/SO are illegal for DBCS-only conversion */ 2827 } else { 2828 return (action != MBCS_STATE_ILLEGAL); 2829 } 2830 } 2831 } 2832 2833 2834 } 2835 2836 class CharsetEncoderMBCS extends CharsetEncoderICU { 2837 private boolean allowReplacementChanges = false; 2838 2839 CharsetEncoderMBCS(CharsetICU cs) { 2840 super(cs, fromUSubstitution); 2841 allowReplacementChanges = true; // allow changes in implReplaceWith 2842 implReset(); 2843 } 2844 2845 protected void implReset() { 2846 super.implReset(); 2847 preFromUFirstCP = UConverterConstants.U_SENTINEL; 2848 } 2849 2850 @SuppressWarnings("fallthrough") 2851 protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) { 2852 CoderResult[] cr = { CoderResult.UNDERFLOW }; 2853 // if (!source.hasRemaining() && fromUChar32 == 0) 2854 // return cr[0]; 2855 2856 int sourceArrayIndex; 2857 char[] table; 2858 byte[] pArray, bytes; 2859 char[] chars; 2860 int[] ints; 2861 int pArrayIndex, outputType, c; 2862 int prevSourceIndex, sourceIndex, nextSourceIndex; 2863 int stage2Entry = 0, value = 0, length = 0, prevLength; 2864 short uniMask; 2865 // long asciiRoundtrips; 2866 2867 byte[] si_value = new byte[2]; 2868 byte[] so_value = new byte[2]; 2869 int si_value_length = 0, so_value_length = 0; 2870 2871 
boolean gotoUnassigned = false;

            try {

                /* first finish a partial extension match that was left over from an earlier call */
                if (!flush && preFromUFirstCP >= 0) {
                    /*
                     * pass sourceIndex=-1 because we continue from an earlier buffer in the future, this may change
                     * with continuous offsets
                     */
                    cr[0] = continueMatchFromU(source, target, offsets, flush, -1);

                    if (cr[0].isError() || preFromULength < 0) {
                        return cr[0];
                    }
                }

                /* use optimized function if possible */
                outputType = sharedData.mbcs.outputType;
                uniMask = sharedData.mbcs.unicodeMask;
                if (outputType == MBCS_OUTPUT_1 && (uniMask & UConverterConstants.HAS_SURROGATES) == 0) {
                    if ((uniMask & UConverterConstants.HAS_SUPPLEMENTARY) == 0) {
                        cr[0] = cnvMBCSSingleFromBMPWithOffsets(source, target, offsets, flush);
                    } else {
                        cr[0] = cnvMBCSSingleFromUnicodeWithOffsets(source, target, offsets, flush);
                    }
                    return cr[0];
                } else if (outputType == MBCS_OUTPUT_2) {
                    cr[0] = cnvMBCSDoubleFromUnicodeWithOffsets(source, target, offsets, flush);
                    return cr[0];
                }

                table = sharedData.mbcs.fromUnicodeTable;
                int[] tableInts = sharedData.mbcs.fromUnicodeTableInts;
                sourceArrayIndex = source.position();

                bytes = sharedData.mbcs.fromUnicodeBytes;
                ints = sharedData.mbcs.fromUnicodeInts;
                if ((options & UConverterConstants.OPTION_SWAP_LFNL) != 0) {
                    chars = sharedData.mbcs.swapLFNLFromUnicodeChars;
                } else {
                    chars = sharedData.mbcs.fromUnicodeChars;
                }

                // asciiRoundtrips = sharedData.mbcs.asciiRoundtrips;

                /* get the converter state from UConverter */
                c = fromUChar32;

                if (outputType == MBCS_OUTPUT_2_SISO) {
                    prevLength = fromUnicodeStatus;
                    if (prevLength == 0) {
                        /* set the real value */
                        prevLength = 1;
                    }
                } else {
                    /* prevent fromUnicodeStatus from being set to something non-0 */
                    prevLength = 0;
                }

                /* sourceIndex=-1 if the current character began in the previous buffer */
                prevSourceIndex = -1;
                sourceIndex = c == 0 ? 0 : -1;
                nextSourceIndex = 0;

                /* Get the SI/SO character for the converter */
                si_value_length = getSISOBytes(SISO_Option.SI, options, si_value);
                so_value_length = getSISOBytes(SISO_Option.SO, options, so_value);

                /* conversion loop */
                /*
                 * This is another piece of ugly code: A goto into the loop if the converter state contains a first
                 * surrogate from the previous function call. It saves checking if(c==0) in each loop iteration and
                 * duplicating the trail-surrogate-handling code in the else branch of that check. I could not find
                 * any other way to get around this other than using a function call for the conversion and
                 * callback, which would be even more inefficient.
                 *
                 * Markus Scherer 2000-jul-19
                 *
                 * In this Java version the goto is emulated with the two flags below: doloop says whether the loop
                 * is entered/continued at all, doread says whether the next iteration reads a new code unit or
                 * converts the code point that is already in c.
                 */
                boolean doloop = true;
                boolean doread = true;
                if (c != 0 && target.hasRemaining()) {
                    if (UTF16.isLeadSurrogate((char) c) && (uniMask & UConverterConstants.HAS_SURROGATES) == 0) {
                        // c is a lead surrogate, read another input
                        SideEffects x = new SideEffects(c, sourceArrayIndex, sourceIndex, nextSourceIndex,
                                prevSourceIndex, prevLength);
                        doloop = getTrail(source, target, uniMask, x, flush, cr);
                        /* copy the values that getTrail may have changed back into the locals */
                        doread = x.doread;
                        c = x.c;
                        sourceArrayIndex = x.sourceArrayIndex;
                        sourceIndex = x.sourceIndex;
                        nextSourceIndex = x.nextSourceIndex;
                        prevSourceIndex = x.prevSourceIndex;
                        prevLength = x.prevLength;
                    } else {
                        // c is not a lead surrogate, do not read another input
                        doread = false;
                    }
                }

                if (doloop) {
                    while (!doread || sourceArrayIndex < source.limit()) {
                        /*
                         * This following test is to see if available input would overflow the output. It does not catch
                         * output of more than one byte that overflows as a result of a multi-byte character or callback
                         * output from the last source character. Therefore, those situations also test for overflows
                         * and will then break the loop, too.
                         */
                        if (target.hasRemaining()) {
                            /*
                             * Get a correct Unicode code point: a single UChar for a BMP code point or a matched
                             * surrogate pair for a "supplementary code point".
                             */

                            if (doread) {
                                // doread might be false only on the first looping

                                c = source.get(sourceArrayIndex++);
                                ++nextSourceIndex;

                                /*
                                 * This also tests if the codepage maps single surrogates. If it does, then surrogates
                                 * are not paired but mapped separately. Note that in this case unmatched surrogates are
                                 * not detected.
                                 */
                                if (UTF16.isSurrogate((char) c)
                                        && (uniMask & UConverterConstants.HAS_SURROGATES) == 0) {
                                    if (UTF16.isLeadSurrogate((char) c)) {
                                        // getTrail:
                                        SideEffects x = new SideEffects(c, sourceArrayIndex, sourceIndex,
                                                nextSourceIndex, prevSourceIndex, prevLength);
                                        doloop = getTrail(source, target, uniMask, x, flush, cr);
                                        /*
                                         * NOTE(review): unlike the call before the loop, x.prevLength is not copied
                                         * back here - confirm that getTrail never changes prevLength.
                                         */
                                        c = x.c;
                                        sourceArrayIndex = x.sourceArrayIndex;
                                        sourceIndex = x.sourceIndex;
                                        nextSourceIndex = x.nextSourceIndex;
                                        prevSourceIndex = x.prevSourceIndex;

                                        if (x.doread) {
                                            if (doloop)
                                                continue;
                                            else
                                                break;
                                        }
                                    } else {
                                        /* this is an unmatched trail code unit (2nd surrogate) */
                                        /* callback(illegal) */
                                        cr[0] = CoderResult.malformedForLength(1);
                                        break;
                                    }
                                }
                            } else {
                                /* c was carried over; from the next iteration on, read from the source again */
                                doread = true;
                            }
                            /* convert the Unicode code point in c into codepage bytes */

                            /*
                             * The basic lookup is a triple-stage compact array (trie) lookup. For details see the
                             * beginning of this file.
                             *
                             * Single-byte codepages are handled with a different data structure by _MBCSSingle...
                             * functions.
                             *
                             * The result consists of a 32-bit value from stage 2 and a pointer to as many bytes as are
                             * stored per character. The pointer points to the character's bytes in stage 3. Bits 15..0
                             * of the stage 2 entry contain the stage 3 index for that pointer, while bits 31..16 are
                             * flags for which of the 16 characters in the block are roundtrip-assigned.
                             *
                             * For 2-byte and 4-byte codepages, the bytes are stored as uint16_t respectively as
                             * uint32_t, in the platform encoding. For 3-byte codepages, the bytes are always stored in
                             * big-endian order.
                             *
                             * For EUC encodings that use only either 0x8e or 0x8f as the first byte of their longest
                             * byte sequences, the first two bytes in this third stage indicate with their 7th bits
                             * whether these bytes are to be written directly or actually need to be preceded by one of
                             * the two Single-Shift codes. With this, the third stage stores one byte fewer per
                             * character than the actual maximum length of EUC byte sequences.
                             *
                             * Other than that, leading zero bytes are removed and the other bytes output. A single zero
                             * byte may be output if the "assigned" bit in stage 2 was on. The data structure does not
                             * support zero byte output as a fallback, and also does not allow output of leading zeros.
                             */
                            stage2Entry = MBCS_STAGE_2_FROM_U(table, tableInts, c);

                            /* get the bytes and the length for the output */
                            switch (outputType) {
                            /* MBCS_OUTPUT_2 is handled above with the method cnvMBCSDoubleFromUnicodeWithOffsets() */
                            case MBCS_OUTPUT_2_SISO:
                                /* 1/2-byte stateful with Shift-In/Shift-Out */
                                /*
                                 * Save the old state in the converter object right here, then change the local
                                 * prevLength state variable if necessary. Then, if this character turns out to be
                                 * unassigned or a fallback that is not taken, the callback code must not save the new
                                 * state in the converter because the new state is for a character that is not output.
                                 * However, the callback must still restore the state from the converter in case the
                                 * callback function changed it for its output.
                                 */
                                /*
                                 * NOTE(review): si_value/so_value are signed bytes and are shifted without masking;
                                 * a byte >= 0x80 would sign-extend into the higher output bytes - confirm that
                                 * getSISOBytes never produces one. Also, if the SI/SO length is neither 1 nor 2,
                                 * length keeps its value from the previous iteration - confirm that cannot happen.
                                 */
                                fromUnicodeStatus = prevLength; /* save the old state */
                                value = MBCS_VALUE_2_FROM_STAGE_2(chars, stage2Entry, c);
                                if (value <= 0xff) {
                                    if (value == 0 && MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) == false) {
                                        /* no mapping, leave value==0 */
                                        length = 0;
                                    } else if (prevLength <= 1) {
                                        length = 1;
                                    } else {
                                        /* change from double-byte mode to single-byte: prefix the SI byte(s) */
                                        if (si_value_length == 1) {
                                            value|=si_value[0]<<8;
                                            length = 2;
                                        } else if (si_value_length == 2) {
                                            value|=si_value[1]<<8;
                                            value|=si_value[0]<<16;
                                            length = 3;
                                        }
                                        prevLength = 1;
                                    }
                                } else {
                                    if (prevLength == 2) {
                                        length = 2;
                                    } else {
                                        /* change from single-byte mode to double-byte: prefix the SO byte(s) */
                                        if (so_value_length == 1) {
                                            value|=so_value[0]<<16;
                                            length = 3;
                                        } else if (so_value_length == 2) {
                                            value|=so_value[1]<<16;
                                            value|=so_value[0]<<24;
                                            length = 4;
                                        }
                                        prevLength = 2;
                                    }
                                }
                                break;
                            case MBCS_OUTPUT_DBCS_ONLY:
                                /* table with single-byte results, but only DBCS mappings used */
                                value = MBCS_VALUE_2_FROM_STAGE_2(chars, stage2Entry, c);
                                if (value <= 0xff) {
                                    /* no mapping or SBCS result, not taken for DBCS-only */
                                    value = stage2Entry = 0; /* stage2Entry=0 to reset roundtrip flags */
                                    length = 0;
                                } else {
                                    length = 2;
                                }
                                break;
                            case MBCS_OUTPUT_3:
                                /* three bytes per character in stage 3, assembled big-endian */
                                pArray = bytes;
                                pArrayIndex = MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
                                value = ((pArray[pArrayIndex] & UConverterConstants.UNSIGNED_BYTE_MASK) << 16)
                                        | ((pArray[pArrayIndex + 1] & UConverterConstants.UNSIGNED_BYTE_MASK) << 8)
                                        | (pArray[pArrayIndex + 2] & UConverterConstants.UNSIGNED_BYTE_MASK);
                                if (value <= 0xff) {
                                    length = 1;
                                } else if (value <= 0xffff) {
                                    length = 2;
                                } else {
                                    length = 3;
                                }
                                break;
                            case MBCS_OUTPUT_4:
                                value = MBCS_VALUE_4_FROM_STAGE_2(ints, stage2Entry, c);
                                if (value < 0) {
                                    // Half of the 4-byte values look negative in a signed int.
                                    length = 4;
                                } else if (value <= 0xff) {
                                    length = 1;
                                } else if (value <= 0xffff) {
                                    length = 2;
                                } else if (value <= 0xffffff) {
                                    length = 3;
                                } else {
                                    length = 4;
                                }
                                break;
                            case MBCS_OUTPUT_3_EUC:
                                value = MBCS_VALUE_2_FROM_STAGE_2(chars, stage2Entry, c);
                                /* EUC 16-bit fixed-length representation */
                                if (value <= 0xff) {
                                    length = 1;
                                } else if ((value & 0x8000) == 0) {
                                    value |= 0x8e8000;
                                    length = 3;
                                } else if ((value & 0x80) == 0) {
                                    value |= 0x8f0080;
                                    length = 3;
                                } else {
                                    length = 2;
                                }
                                break;
                            case MBCS_OUTPUT_4_EUC:
                                pArray = bytes;
                                pArrayIndex = MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c);
                                value = ((pArray[pArrayIndex] & UConverterConstants.UNSIGNED_BYTE_MASK) << 16)
                                        | ((pArray[pArrayIndex + 1] & UConverterConstants.UNSIGNED_BYTE_MASK) << 8)
                                        | (pArray[pArrayIndex + 2] & UConverterConstants.UNSIGNED_BYTE_MASK);
                                /* EUC 16-bit fixed-length representation applied to the first two bytes */
                                if (value <= 0xff) {
                                    length = 1;
                                } else if (value <= 0xffff) {
                                    length = 2;
                                } else if ((value & 0x800000) == 0) {
                                    value |= 0x8e800000;
                                    length = 4;
                                } else if ((value & 0x8000) == 0) {
                                    value |= 0x8f008000;
                                    length = 4;
                                } else {
                                    length = 3;
                                }
                                break;
                            default:
                                /* must not occur */
                                /*
                                 * To avoid compiler warnings that value & length may be used without having been
                                 * initialized, we set them here. In reality, this is unreachable code. Not having a
                                 * default branch also causes warnings with some compilers.
                                 */
                                value = stage2Entry = 0; /* stage2Entry=0 to reset roundtrip flags */
                                length = 0;
                                break;
                            }

                            /* is this code point assigned, or do we use fallbacks? */
                            if (gotoUnassigned || (!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) || (isFromUUseFallback(c) && value != 0)))) {
                                gotoUnassigned = false;
                                /*
                                 * We allow a 0 byte output if the "assigned" bit is set for this entry. There is no way
                                 * with this data structure for fallback output to be a zero byte.
                                 */

                                // unassigned:
                                SideEffects x = new SideEffects(c, sourceArrayIndex, sourceIndex, nextSourceIndex,
                                        prevSourceIndex, prevLength);
                                doloop = unassigned(source, target, offsets, x, flush, cr);
                                c = x.c;
                                sourceArrayIndex = x.sourceArrayIndex;
                                sourceIndex = x.sourceIndex;
                                nextSourceIndex = x.nextSourceIndex;
                                prevSourceIndex = x.prevSourceIndex;
                                prevLength = x.prevLength;
                                if (doloop)
                                    continue;
                                else
                                    break;
                            }

                            /* write the output character bytes from value and length */
                            /* from the first if in the loop we know that targetCapacity>0 */
                            if (length <= target.remaining()) {
                                switch (length) {
                                /* each branch falls through to the next one */
                                case 4:
                                    target.put((byte) (value >>> 24));
                                    if (offsets != null) {
                                        offsets.put(sourceIndex);
                                    }
                                case 3:
                                    target.put((byte) (value >>> 16));
                                    if (offsets != null) {
                                        offsets.put(sourceIndex);
                                    }
                                case 2:
                                    target.put((byte) (value >>> 8));
                                    if (offsets != null) {
                                        offsets.put(sourceIndex);
                                    }
                                case 1:
                                    target.put((byte) value);
                                    if (offsets != null) {
                                        offsets.put(sourceIndex);
                                    }
                                default:
                                    /* will never occur */
                                    break;
                                }
                            } else {
                                int errorBufferArrayIndex;

                                /*
                                 * We actually do this backwards here: In order to save an intermediate variable, we
                                 * output first to the overflow buffer what does not fit into the regular target.
                                 */
                                /* we know that 1<=targetCapacity<length<=4 */
                                length -= target.remaining();

                                errorBufferArrayIndex = 0;
                                switch (length) {
                                /* each branch falls through to the next one */
                                case 3:
                                    errorBuffer[errorBufferArrayIndex++] = (byte) (value >>> 16);
                                case 2:
                                    errorBuffer[errorBufferArrayIndex++] = (byte) (value >>> 8);
                                case 1:
                                    errorBuffer[errorBufferArrayIndex] = (byte) value;
                                default:
                                    /* will never occur */
                                    break;
                                }
                                errorBufferLength = (byte) length;

                                /* now output what fits into the regular target */
                                value >>>= 8 * length; /* length was reduced by targetCapacity */
                                switch (target.remaining()) {
                                /* each branch falls through to the next one */
                                case 3:
                                    target.put((byte) (value >>> 16));
                                    if (offsets != null) {
                                        offsets.put(sourceIndex);
                                    }
                                case 2:
                                    target.put((byte) (value >>> 8));
                                    if (offsets != null) {
                                        offsets.put(sourceIndex);
                                    }
                                case 1:
                                    target.put((byte) value);
                                    if (offsets != null) {
                                        offsets.put(sourceIndex);
                                    }
                                default:
                                    /* will never occur */
                                    break;
                                }

                                /* target overflow */
                                cr[0] = CoderResult.OVERFLOW;
                                c = 0;
                                break;
                            }

                            /* normal end of conversion: prepare for a new character */
                            c = 0;
                            if (offsets != null) {
                                prevSourceIndex = sourceIndex;
                                sourceIndex = nextSourceIndex;
                            }
                            continue;
                        } else {
                            /* target is full */
                            cr[0] = CoderResult.OVERFLOW;
                            break;
                        }
                    }
                }

                /*
                 * the end of the input stream and detection of truncated input are handled by the framework, but for
                 * EBCDIC_STATEFUL conversion we need to emit an SI at the very end
                 *
                 * conditions: successful EBCDIC_STATEFUL in DBCS mode, end of input and no truncated input
                 */
                if (outputType == MBCS_OUTPUT_2_SISO && prevLength == 2 && flush && sourceArrayIndex >= source.limit()
                        && c == 0) {

                    /* EBCDIC_STATEFUL ending with DBCS: emit an SI to return the output stream to SBCS */
                    if (target.hasRemaining()) {
                        target.put(si_value[0]);
                        if (si_value_length == 2) {
                            if (target.remaining() > 0) {
                                target.put(si_value[1]);
                            } else {
                                /* the second SI byte does not fit: park it in the overflow buffer */
                                errorBuffer[0] = si_value[1];
                                errorBufferLength = 1;
                                cr[0] = CoderResult.OVERFLOW;
                            }
                        }
                        if (offsets != null) {
                            /* set the last source character's index (sourceIndex points at sourceLimit now) */
                            offsets.put(prevSourceIndex);
                        }
                    } else {
                        /* target is full */
                        errorBuffer[0] = si_value[0];
                        if (si_value_length == 2) {
                            errorBuffer[1] = si_value[1];
                        }
                        errorBufferLength = si_value_length;
                        cr[0] = CoderResult.OVERFLOW;
                    }
                    prevLength = 1; /* we switched into SBCS */
                }

                /* set the converter state back into UConverter */
                fromUChar32 = c;
                fromUnicodeStatus = prevLength;

                source.position(sourceArrayIndex);
            } catch (BufferOverflowException ex) {
                /*
                 * NOTE(review): on this path fromUChar32, fromUnicodeStatus and the source position are not
                 * written back - confirm that callers can cope with that.
                 */
                cr[0] = CoderResult.OVERFLOW;
            }

            return cr[0];
        }

        /*
         * This is another simple conversion function for internal use by other conversion implementations. It does not
         * use the converter state nor call callbacks. It does not handle the EBCDIC swaplfnl option (set in
         * UConverter). It handles conversion extensions but not GB 18030.
         *
         * It converts one single Unicode code point into codepage bytes, encoded as one 32-bit value.
The function 3378 * returns the number of bytes in *pValue: 1..4 the number of bytes in *pValue 0 unassigned (*pValue undefined) 3379 * -1 illegal (currently not used, *pValue undefined) 3380 * 3381 * *pValue will contain the resulting bytes with the last byte in bits 7..0, the second to last byte in bits 3382 * 15..8, etc. Currently, the function assumes but does not check that 0<=c<=0x10ffff. 3383 */ 3384 int fromUChar32(int c, int[] pValue, boolean isUseFallback) { 3385 // #if 0 3386 // /* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */ 3387 // const uint8_t *p; 3388 // #endif 3389 3390 char[] table; 3391 int stage2Entry; 3392 int value; 3393 int length; 3394 int p; 3395 3396 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 3397 if (c <= 0xffff || sharedData.mbcs.hasSupplementary()) { 3398 table = sharedData.mbcs.fromUnicodeTable; 3399 3400 /* convert the Unicode code point in c into codepage bytes (same as in _MBCSFromUnicodeWithOffsets) */ 3401 if (sharedData.mbcs.outputType == MBCS_OUTPUT_1) { 3402 value = MBCS_SINGLE_RESULT_FROM_U(table, sharedData.mbcs.fromUnicodeChars, c); 3403 /* is this code point assigned, or do we use fallbacks? */ 3404 if (isUseFallback ? 
value >= 0x800 : value >= 0xc00) { 3405 pValue[0] = value & 0xff; 3406 return 1; 3407 } 3408 } else /* outputType!=MBCS_OUTPUT_1 */{ 3409 int[] tableInts = sharedData.mbcs.fromUnicodeTableInts; 3410 stage2Entry = MBCS_STAGE_2_FROM_U(table, tableInts, c); 3411 3412 /* get the bytes and the length for the output */ 3413 switch (sharedData.mbcs.outputType) { 3414 case MBCS_OUTPUT_2: 3415 value = MBCS_VALUE_2_FROM_STAGE_2(sharedData.mbcs.fromUnicodeChars, stage2Entry, c); 3416 if (value <= 0xff) { 3417 length = 1; 3418 } else { 3419 length = 2; 3420 } 3421 break; 3422 // #if 0 3423 // /* #if 0 because this is not currently used in ICU - reduce code, increase code coverage */ 3424 // case MBCS_OUTPUT_DBCS_ONLY: 3425 // /* table with single-byte results, but only DBCS mappings used */ 3426 // value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 3427 // if(value<=0xff) { 3428 // /* no mapping or SBCS result, not taken for DBCS-only */ 3429 // value=stage2Entry=0; /* stage2Entry=0 to reset roundtrip flags */ 3430 // length=0; 3431 // } else { 3432 // length=2; 3433 // } 3434 // break; 3435 case MBCS_OUTPUT_3: 3436 byte[] bytes = sharedData.mbcs.fromUnicodeBytes; 3437 p = CharsetMBCS.MBCS_POINTER_3_FROM_STAGE_2(bytes, stage2Entry, c); 3438 value = ((bytes[p] & UConverterConstants.UNSIGNED_BYTE_MASK)<<16) | 3439 ((bytes[p+1] & UConverterConstants.UNSIGNED_BYTE_MASK)<<8) | 3440 (bytes[p+2] & UConverterConstants.UNSIGNED_BYTE_MASK); 3441 if (value <= 0xff) { 3442 length = 1; 3443 } else if (value <= 0xffff) { 3444 length = 2; 3445 } else { 3446 length = 3; 3447 } 3448 break; 3449 // case MBCS_OUTPUT_4: 3450 // value=MBCS_VALUE_4_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 3451 // if(value<=0xff) { 3452 // length=1; 3453 // } else if(value<=0xffff) { 3454 // length=2; 3455 // } else if(value<=0xffffff) { 3456 // length=3; 3457 // } else { 3458 // length=4; 3459 // } 3460 // break; 3461 // case MBCS_OUTPUT_3_EUC: 3462 // 
value=MBCS_VALUE_2_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 3463 // /* EUC 16-bit fixed-length representation */ 3464 // if(value<=0xff) { 3465 // length=1; 3466 // } else if((value&0x8000)==0) { 3467 // value|=0x8e8000; 3468 // length=3; 3469 // } else if((value&0x80)==0) { 3470 // value|=0x8f0080; 3471 // length=3; 3472 // } else { 3473 // length=2; 3474 // } 3475 // break; 3476 // case MBCS_OUTPUT_4_EUC: 3477 // p=MBCS_POINTER_3_FROM_STAGE_2(sharedData->mbcs.fromUnicodeBytes, stage2Entry, c); 3478 // value=((uint32_t)*p<<16)|((uint32_t)p[1]<<8)|p[2]; 3479 // /* EUC 16-bit fixed-length representation applied to the first two bytes */ 3480 // if(value<=0xff) { 3481 // length=1; 3482 // } else if(value<=0xffff) { 3483 // length=2; 3484 // } else if((value&0x800000)==0) { 3485 // value|=0x8e800000; 3486 // length=4; 3487 // } else if((value&0x8000)==0) { 3488 // value|=0x8f008000; 3489 // length=4; 3490 // } else { 3491 // length=3; 3492 // } 3493 // break; 3494 // #endif 3495 default: 3496 /* must not occur */ 3497 return -1; 3498 } 3499 3500 /* is this code point assigned, or do we use fallbacks? */ 3501 if (MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) 3502 || (CharsetEncoderICU.isFromUUseFallback(isUseFallback, c) && value != 0)) { 3503 /* 3504 * We allow a 0 byte output if the "assigned" bit is set for this entry. There is no way with 3505 * this data structure for fallback output to be a zero byte. 3506 */ 3507 /* assigned */ 3508 pValue[0] = value; 3509 return length; 3510 } 3511 } 3512 } 3513 3514 if (sharedData.mbcs.extIndexes != null) { 3515 length = simpleMatchFromU(c, pValue, isUseFallback); 3516 return length >= 0 ? 
length : -length; /* return abs(length); */ 3517 } 3518 3519 /* unassigned */ 3520 return 0; 3521 } 3522 3523 /* 3524 * continue partial match with new input, requires cnv->preFromUFirstCP>=0 never called for simple, 3525 * single-character conversion 3526 */ 3527 private CoderResult continueMatchFromU(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush, 3528 int srcIndex) { 3529 CoderResult cr = CoderResult.UNDERFLOW; 3530 int[] value = new int[1]; 3531 int match; 3532 3533 match = matchFromU(preFromUFirstCP, preFromUArray, preFromUBegin, preFromULength, source, value, useFallback, flush); 3534 if (match >= 2) { 3535 match -= 2; /* remove 2 for the initial code point */ 3536 3537 if (match >= preFromULength) { 3538 /* advance src pointer for the consumed input */ 3539 source.position(source.position() + match - preFromULength); 3540 preFromULength = 0; 3541 } else { 3542 /* the match did not use all of preFromU[] - keep the rest for replay */ 3543 int length = preFromULength - match; 3544 System.arraycopy(preFromUArray, preFromUBegin + match, preFromUArray, preFromUBegin, length); 3545 preFromULength = (byte) -length; 3546 } 3547 3548 /* finish the partial match */ 3549 preFromUFirstCP = UConverterConstants.U_SENTINEL; 3550 3551 /* write result */ 3552 writeFromU(value[0], target, offsets, srcIndex); 3553 } else if (match < 0) { 3554 /* save state for partial match */ 3555 int sArrayIndex; 3556 int j; 3557 3558 /* just _append_ the newly consumed input to preFromU[] */ 3559 sArrayIndex = source.position(); 3560 match = -match - 2; /* remove 2 for the initial code point */ 3561 for (j = preFromULength; j < match; ++j) { 3562 preFromUArray[j] = source.get(sArrayIndex++); 3563 } 3564 source.position(sArrayIndex); /* same as *src=srcLimit; because we reached the end of input */ 3565 preFromULength = (byte) match; 3566 } else { /* match==0 or 1 */ 3567 /* 3568 * no match 3569 * 3570 * We need to split the previous input into two parts: 3571 * 3572 * 
1. The first code point is unmappable - that's how we got into trying the extension data in the first
                 * place. We need to move it from the preFromU buffer to the error buffer, set an error code, and
                 * prepare the rest of the previous input for 2.
                 *
                 * 2. The rest of the previous input must be converted once we come back from the callback for the first
                 * code point. At that time, we have to try again from scratch to convert these input characters. The
                 * replay will be handled by the ucnv.c conversion code.
                 */

                if (match == 1) {
                    /* matched, no mapping but request for <subchar1> */
                    useSubChar1 = true;
                }

                /* move the first code point to the error field */
                fromUChar32 = preFromUFirstCP;
                preFromUFirstCP = UConverterConstants.U_SENTINEL;

                /* mark preFromU for replay (a negative length flags the buffered units for replay) */
                preFromULength = (byte) -preFromULength;

                /* set the error code for unassigned */
                // TODO: figure out what the unmappable length really should be
                cr = CoderResult.unmappableForLength(1);
            }
            return cr;
        }

        /**
         * Matches firstCP, followed by the UChars in preArray and then in source, against the fromUnicode
         * extension data of this converter. Java counterpart of ICU4C ucnv_extMatchFromU; the extension data
         * (cx in the C code) is taken from sharedData.mbcs.extIndexes, and 0 is returned if there is none.
         *
         * @param firstCP
         *            the first code point before all the other UChars
         * @param preArray
         *            UChars that must match; read starting at index preArrayBegin
         * @param preArrayBegin
         *            index in preArray of the first UChar to match
         * @param preLength
         *            number of UChars to match from preArray, >=0
         * @param source
         *            UChars that can be used to complete a match; they are read relative to the current position
         *            without moving it; may be null
         * @param pMatchValue
         *            [out] pMatchValue[0] receives the result value for the match from the data structure
         * @param isUseFallback
         *            "use fallback" flag
         * @param flush
         *            true if the end of the input stream is reached
         * @return >1: matched, return value=total match length (number of input units matched) 1: matched, no mapping
         *         but request for <subchar1> (only for the first code point) 0: no match <0: partial match, return
         *         value=negative total match length (partial matches are never returned for flush==true) (partial
         *         matches are never returned as being longer than MAX_UCHARS) the matchLength is 2 if only
         *         firstCP matched, and >2 if firstCP and further code units matched
         */
        private int matchFromU(int firstCP, char[] preArray, int preArrayBegin, int preLength, CharBuffer source,
                int[] pMatchValue, boolean isUseFallback, boolean flush) {
            ByteBuffer cx = sharedData.mbcs.extIndexes;

            CharBuffer stage12, stage3;
            IntBuffer stage3b;

            CharBuffer fromUTableUChars, fromUSectionUChars;
            IntBuffer fromUTableValues, fromUSectionValues;

            int value, matchValue;
            int i, j, index, length, matchLength;
            char c;

            if (cx == null) {
                return 0; /* no extension data, no match */
            }

            /* trie lookup of firstCP */
            index = firstCP >>> 10; /* stage 1 index */
            if (index >= cx.asIntBuffer().get(EXT_FROM_U_STAGE_1_LENGTH)) {
                return 0; /* the first code point is outside the trie */
            }

            stage12 = (CharBuffer) ARRAY(cx, EXT_FROM_U_STAGE_12_INDEX, char.class);
            stage3 = (CharBuffer) ARRAY(cx, EXT_FROM_U_STAGE_3_INDEX, char.class);
            index = FROM_U(stage12, stage3, index, firstCP);

            stage3b = (IntBuffer) ARRAY(cx, EXT_FROM_U_STAGE_3B_INDEX, int.class);
            value = stage3b.get(stage3b.position() + index);
            if (value == 0) {
                return 0;
            }

            /*
             * NOTE(review): this tests a fromUnicode value with TO_U_IS_PARTIAL, while the loop below uses
             * FROM_U_IS_PARTIAL - confirm that both tests agree for the values stored in stage 3b.
             */
            if (TO_U_IS_PARTIAL(value)) {
                /* partial match, enter the loop below */
                index = FROM_U_GET_PARTIAL_INDEX(value);

                /* initialize */
                fromUTableUChars = (CharBuffer) ARRAY(cx, EXT_FROM_U_UCHARS_INDEX, char.class);
                fromUTableValues = (IntBuffer) ARRAY(cx, EXT_FROM_U_VALUES_INDEX, int.class);

                matchValue = 0;
                /* i counts the units taken from preArray, j the units taken from source */
                i = j = matchLength = 0;

                /* we must not remember fallback matches when not using fallbacks */

                /* match input units until there is a full match or the input is consumed */
                for (;;) {
                    /* go to the next section: slice both tables at index, then restore their positions */
                    int oldpos = fromUTableUChars.position();
                    fromUSectionUChars = ((CharBuffer) fromUTableUChars.position(index)).slice();
                    fromUTableUChars.position(oldpos);
                    oldpos = fromUTableValues.position();
                    fromUSectionValues = ((IntBuffer) fromUTableValues.position(index)).slice();
                    fromUTableValues.position(oldpos);

                    /* read first pair of the section */
                    length = fromUSectionUChars.get();
                    value = fromUSectionValues.get();
                    if (value != 0 && (FROM_U_IS_ROUNDTRIP(value) || isFromUUseFallback(isUseFallback, firstCP))) {
                        /* remember longest match so far */
                        matchValue = value;
                        matchLength = 2 + i + j;
                    }

                    /* match pre[] then src[] */
                    if (i < preLength) {
                        c = preArray[preArrayBegin + i++];
                    } else if (source != null && j < source.remaining()) {
                        c = source.get(source.position() + j++);
                    } else {
                        /* all input consumed, partial match */
                        if (flush || (length = (i + j)) > MAX_UCHARS) {
                            /*
                             * end of the entire input stream, stop with the longest match so far or: partial match must
                             * not be longer than MAX_UCHARS because it must fit into state buffers
                             */
                            break;
                        } else {
                            /* continue with more input next time */
                            return -(2 + length);
                        }
                    }

                    /* search for the current UChar */
                    index = findFromU(fromUSectionUChars, length, c);
                    if (index < 0) {
                        /* no match here, stop with the longest match so far */
                        break;
                    } else {
                        value = fromUSectionValues.get(fromUSectionValues.position() + index);
                        if (FROM_U_IS_PARTIAL(value)) {
                            /* partial match, continue */
                            index = FROM_U_GET_PARTIAL_INDEX(value);
                        } else {
                            if (FROM_U_IS_ROUNDTRIP(value) || isFromUUseFallback(isUseFallback, firstCP)) {
                                /* full match, stop with result */
                                matchValue = value;
                                matchLength = 2 + i + j;
                            } else {
                                /* full match on fallback not taken, stop with the longest match so far */
                            }
                            break;
                        }
                    }
                }

                if (matchLength == 0) {
                    /* no match at all */
                    return 0;
                }
            } else /* result from firstCP trie lookup */{
                if (FROM_U_IS_ROUNDTRIP(value) || isFromUUseFallback(isUseFallback, firstCP)) {
                    /* full match, stop with result */
                    matchValue = value;
                    matchLength = 2;
                } else {
                    /* fallback not taken */
                    return 0;
                }
            }

            if ((matchValue & FROM_U_RESERVED_MASK) != 0) {
                /* do not interpret values with reserved bits used, for forward compatibility */
                return 0;
            }

            /* return result */
            if (matchValue == FROM_U_SUBCHAR1) {
                return 1; /* assert matchLength==2 */
            }

            pMatchValue[0] = FROM_U_MASK_ROUNDTRIP(matchValue);
            return matchLength;
        }

        /*
         * Simple, single-code-point extension lookup: matches cp alone (no further input, flush=true) and, for
         * results of at most EXT_FROM_U_MAX_DIRECT_LENGTH bytes, stores the bytes in pValue[0].
         *
         * Returns the byte length for a roundtrip mapping, the negative byte length otherwise, and 0 if nothing
         * usable was found.
         */
        private int simpleMatchFromU(int cp, int[] pValue, boolean isUseFallback) {
            int[] value = new int[1];
            int match; // signed

            /* try to match */
            match = matchFromU(cp, null, 0, 0, null, value, isUseFallback, true);
            if (match >= 2) {
                /* write result for simple, single-character conversion */
                int length;
                boolean isRoundtrip;

                /*
                 * NOTE(review): matchFromU stores FROM_U_MASK_ROUNDTRIP(matchValue) in value[0]; if that clears the
                 * roundtrip flag, isRoundtrip is always false here and the result is always negative - confirm.
                 */
                isRoundtrip = FROM_U_IS_ROUNDTRIP(value[0]);
                length = FROM_U_GET_LENGTH(value[0]);
                value[0] = FROM_U_GET_DATA(value[0]);

                if (length <= EXT_FROM_U_MAX_DIRECT_LENGTH) {
                    pValue[0] = value[0];
                    return isRoundtrip ? length : -length;
                    /* longer results (stored indirectly in the extension bytes table) are not handled here */
                }
            }

            /*
             * return no match because - match>1 && resultLength>4: result too long for simple conversion - match==1: no
             * match found, <subchar1> preferred - match==0: no match found in the first place - match<0: partial
             * match, not supported for simple conversion (and flush==TRUE)
             */
            return 0;
        }

        @SuppressWarnings("fallthrough")
        private CoderResult writeFromU(int value, ByteBuffer target, IntBuffer offsets, int srcIndex) {
            ByteBuffer cx = sharedData.mbcs.extIndexes;

            byte bufferArray[] = new byte[1 + MAX_BYTES];
            int bufferArrayIndex = 0;
            byte[] resultArray;
            int resultArrayIndex;
            int length, prevLength;

            length = FROM_U_GET_LENGTH(value);
            value = FROM_U_GET_DATA(value);

            /* output the result */
            if (length <= FROM_U_MAX_DIRECT_LENGTH) {
                /*
                 * Generate a byte array and then write it below. This is not the fastest possible way, but it should be
                 * ok for extension mappings, and it is much simpler. Offset and overflow handling are only done once
                 * this way.
3823 */ 3824 int p = bufferArrayIndex + 1; /* reserve buffer[0] for shiftByte below */ 3825 switch (length) { 3826 case 3: 3827 bufferArray[p++] = (byte) (value >>> 16); 3828 case 2: 3829 bufferArray[p++] = (byte) (value >>> 8); 3830 case 1: 3831 bufferArray[p++] = (byte) value; 3832 default: 3833 break; /* will never occur */ 3834 } 3835 resultArray = bufferArray; 3836 resultArrayIndex = bufferArrayIndex + 1; 3837 } else { 3838 byte[] slice = new byte[length]; 3839 3840 ByteBuffer bb = ((ByteBuffer) ARRAY(cx, EXT_FROM_U_BYTES_INDEX, byte.class)); 3841 bb.position(value); 3842 bb.get(slice, 0, slice.length); 3843 3844 resultArray = slice; 3845 resultArrayIndex = 0; 3846 } 3847 3848 /* with correct data we have length>0 */ 3849 3850 if ((prevLength = fromUnicodeStatus) != 0) { 3851 /* handle SI/SO stateful output */ 3852 byte shiftByte; 3853 3854 if (prevLength > 1 && length == 1) { 3855 /* change from double-byte mode to single-byte */ 3856 shiftByte = (byte) UConverterConstants.SI; 3857 fromUnicodeStatus = 1; 3858 } else if (prevLength == 1 && length > 1) { 3859 /* change from single-byte mode to double-byte */ 3860 shiftByte = (byte) UConverterConstants.SO; 3861 fromUnicodeStatus = 2; 3862 } else { 3863 shiftByte = 0; 3864 } 3865 3866 if (shiftByte != 0) { 3867 /* prepend the shift byte to the result bytes */ 3868 bufferArray[0] = shiftByte; 3869 if (resultArray != bufferArray || resultArrayIndex != bufferArrayIndex + 1) { 3870 System.arraycopy(resultArray, resultArrayIndex, bufferArray, bufferArrayIndex + 1, length); 3871 } 3872 resultArray = bufferArray; 3873 resultArrayIndex = bufferArrayIndex; 3874 ++length; 3875 } 3876 } 3877 3878 return fromUWriteBytes(this, resultArray, resultArrayIndex, length, target, offsets, srcIndex); 3879 } 3880 3881 /* 3882 * @return if(U_FAILURE) return the code point for cnv->fromUChar32 else return 0 after output has been written 3883 * to the target 3884 */ 3885 private int fromU(int cp, CharBuffer source, ByteBuffer target, 
IntBuffer offsets, int sourceIndex, 3886 int length, boolean flush, CoderResult[] cr) { 3887 // ByteBuffer cx; 3888 3889 useSubChar1 = false; 3890 3891 if (sharedData.mbcs.extIndexes != null 3892 && initialMatchFromU(cp, source, target, offsets, sourceIndex, flush, cr)) { 3893 return 0; /* an extension mapping handled the input */ 3894 } 3895 3896 /* GB 18030 */ 3897 if ((options & MBCS_OPTION_GB18030) != 0) { 3898 int[] range; 3899 int i; 3900 3901 for (i = 0; i < gb18030Ranges.length; ++i) { 3902 range = gb18030Ranges[i]; 3903 if (range[0] <= cp && cp <= range[1]) { 3904 /* found the Unicode code point, output the four-byte sequence for it */ 3905 int linear; 3906 byte bytes[] = new byte[4]; 3907 3908 /* get the linear value of the first GB 18030 code in this range */ 3909 linear = range[2] - LINEAR_18030_BASE; 3910 3911 /* add the offset from the beginning of the range */ 3912 linear += (cp - range[0]); 3913 3914 bytes[3] = (byte) (0x30 + linear % 10); 3915 linear /= 10; 3916 bytes[2] = (byte) (0x81 + linear % 126); 3917 linear /= 126; 3918 bytes[1] = (byte) (0x30 + linear % 10); 3919 linear /= 10; 3920 bytes[0] = (byte) (0x81 + linear); 3921 3922 /* output this sequence */ 3923 cr[0] = fromUWriteBytes(this, bytes, 0, 4, target, offsets, sourceIndex); 3924 return 0; 3925 } 3926 } 3927 } 3928 3929 /* no mapping */ 3930 cr[0] = CoderResult.unmappableForLength(length); 3931 return cp; 3932 } 3933 3934 /* 3935 * target<targetLimit; set error code for overflow 3936 */ 3937 private boolean initialMatchFromU(int cp, CharBuffer source, ByteBuffer target, IntBuffer offsets, 3938 int srcIndex, boolean flush, CoderResult[] cr) { 3939 int[] value = new int[1]; 3940 int match; 3941 3942 /* try to match */ 3943 match = matchFromU(cp, null, 0, 0, source, value, useFallback, flush); 3944 3945 /* reject a match if the result is a single byte for DBCS-only */ 3946 if (match >= 2 3947 && !(FROM_U_GET_LENGTH(value[0]) == 1 && sharedData.mbcs.outputType == MBCS_OUTPUT_DBCS_ONLY)) { 
3948 /* advance src pointer for the consumed input */ 3949 source.position(source.position() + match - 2); /* remove 2 for the initial code point */ 3950 3951 /* write result to target */ 3952 cr[0] = writeFromU(value[0], target, offsets, srcIndex); 3953 return true; 3954 } else if (match < 0) { 3955 /* save state for partial match */ 3956 int sArrayIndex; 3957 int j; 3958 3959 /* copy the first code point */ 3960 preFromUFirstCP = cp; 3961 3962 /* now copy the newly consumed input */ 3963 sArrayIndex = source.position(); 3964 match = -match - 2; /* remove 2 for the initial code point */ 3965 for (j = 0; j < match; ++j) { 3966 preFromUArray[j] = source.get(sArrayIndex++); 3967 } 3968 source.position(sArrayIndex); /* same as *src=srcLimit; because we reached the end of input */ 3969 preFromULength = (byte) match; 3970 return true; 3971 } else if (match == 1) { 3972 /* matched, no mapping but request for <subchar1> */ 3973 useSubChar1 = true; 3974 return false; 3975 } else /* match==0 no match */{ 3976 return false; 3977 } 3978 } 3979 3980 CoderResult cnvMBCSFromUnicodeWithOffsets(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) { 3981 // Just call encodeLoop to remove duplicate code. 3982 return encodeLoop(source, target, offsets, flush); 3983 } 3984 3985 /* 3986 * This version of ucnv_MBCSFromUnicode() is optimized for single-byte codepages that map only to and from the 3987 * BMP. In addition to single-byte/state optimizations, the offset calculations become much easier. 
*/
/*
 * Converts UTF-16 units from source to single bytes in target, starting at source.position().
 *
 * Units whose table result is at least minValue are written directly; everything else goes through
 * fromU() (extension tables). Offsets, when requested, are written in batches: once before each extension
 * attempt and once at the end. On return the source position is advanced to the last unit read and
 * fromUChar32 holds the pending code point (0 when none).
 *
 * Returns UNDERFLOW, OVERFLOW, or a malformed/unmappable result.
 */
private CoderResult cnvMBCSSingleFromBMPWithOffsets(CharBuffer source, ByteBuffer target, IntBuffer offsets,
        boolean flush) {

    /* one-element array so that helper methods can report a result */
    CoderResult[] cr = { CoderResult.UNDERFLOW };

    int sourceArrayIndex, lastSource;
    int targetCapacity, length;
    char[] table;
    char[] results;

    int c, sourceIndex;
    char value, minValue;

    /* set up the local pointers */
    sourceArrayIndex = source.position();
    targetCapacity = target.remaining();
    table = sharedData.mbcs.fromUnicodeTable;

    if ((options & UConverterConstants.OPTION_SWAP_LFNL) != 0) {
        results = sharedData.mbcs.swapLFNLFromUnicodeChars;
    } else {
        results = sharedData.mbcs.fromUnicodeChars;
    }

    if (useFallback) {
        /* use all roundtrip and fallback results */
        minValue = 0x800;
    } else {
        /* use only roundtrips and fallbacks from private-use characters */
        minValue = 0xc00;
    }

    /* get the converter state from UConverter */
    c = fromUChar32;

    /* sourceIndex=-1 if the current character began in the previous buffer */
    sourceIndex = c == 0 ? 0 : -1;
    lastSource = sourceArrayIndex;

    /*
     * since the conversion here is 1:1 UChar:uint8_t, we need only one counter for the minimum of the
     * sourceLength and targetCapacity
     */
    length = source.limit() - sourceArrayIndex;
    if (length < targetCapacity) {
        targetCapacity = length;
    }

    boolean doloop = true;
    if (c != 0 && targetCapacity > 0) {
        /* a code unit is pending from the previous call: try to complete it as a surrogate pair */
        SideEffectsSingleBMP x = new SideEffectsSingleBMP(c, sourceArrayIndex);
        doloop = getTrailSingleBMP(source, x, cr);
        c = x.c;
        sourceArrayIndex = x.sourceArrayIndex;
    }

    if (doloop) {
        while (targetCapacity > 0) {
            /*
             * Get a correct Unicode code point: a single UChar for a BMP code point or a matched surrogate pair
             * for a "supplementary code point".
             */
            c = source.get(sourceArrayIndex++);
            /*
             * Do not immediately check for single surrogates: Assume that they are unassigned and check for
             * them in that case. This speeds up the conversion of assigned characters.
             */
            /* convert the Unicode code point in c into codepage bytes */
            value = MBCS_SINGLE_RESULT_FROM_U(table, results, c);

            /* is this code point assigned, or do we use fallbacks? */
            if (value >= minValue) {
                /* assigned, write the output character bytes from value and length */
                /* length==1 */
                /* this is easy because we know that there is enough space */
                target.put((byte) value);
                --targetCapacity;

                /* normal end of conversion: prepare for a new character */
                c = 0;
                continue;
            } else if (!UTF16.isSurrogate((char) c)) {
                /* normal, unassigned BMP character */
            } else if (UTF16.isLeadSurrogate((char) c)) {
                // getTrail:
                SideEffectsSingleBMP x = new SideEffectsSingleBMP(c, sourceArrayIndex);
                doloop = getTrailSingleBMP(source, x, cr);
                c = x.c;
                sourceArrayIndex = x.sourceArrayIndex;
                if (!doloop)
                    break;
            } else {
                /* this is an unmatched trail code unit (2nd surrogate) */
                /* callback(illegal) */
                cr[0] = CoderResult.malformedForLength(1);
                break;
            }

            /* c does not have a mapping */

            /* get the number of code units for c to correctly advance sourceIndex */
            length = UTF16.getCharCount(c);

            /* set offsets since the start or the last extension */
            if (offsets != null) {
                int count = sourceArrayIndex - lastSource;

                /* do not set the offset for this character */
                count -= length;

                while (count > 0) {
                    offsets.put(sourceIndex++);
                    --count;
                }
                /* offsets and sourceIndex are now set for the current character */
            }

            /* try an extension mapping */
            lastSource = sourceArrayIndex;
            source.position(sourceArrayIndex);
            c = fromU(c, source, target, offsets, sourceIndex, length, flush, cr);
            /* fromU may have consumed further units; resynchronize the local index with the buffer */
            sourceArrayIndex = source.position();
            sourceIndex += length + (sourceArrayIndex - lastSource);
            lastSource = sourceArrayIndex;

            /*
             * CoderResult.isError() is true only for malformed/unmappable results; an OVERFLOW returned by
             * fromU takes the else branch, where the recalculated targetCapacity ends the loop.
             */
            if (cr[0].isError()) {
                /* not mappable */
                break;
            } else {
                /* a mapping was written to the target, continue */

                /* recalculate the targetCapacity after an extension mapping */
                targetCapacity = target.remaining();
                length = source.limit() - sourceArrayIndex;
                if (length < targetCapacity) {
                    targetCapacity = length;
                }
            }
        }
    }

    if (sourceArrayIndex < source.limit() && !target.hasRemaining()) {
        /* target is full */
        cr[0] = CoderResult.OVERFLOW;
    }

    /* set offsets since the start or the last callback */
    if (offsets != null) {
        int count = sourceArrayIndex - lastSource;
        while (count > 0) {
            offsets.put(sourceIndex++);
            --count;
        }
    }

    /* set the converter state back into UConverter */
    fromUChar32 = c;

    /* write back the updated pointers */
    source.position(sourceArrayIndex);

    return cr[0];
}

/* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for single-byte codepages.
*/
/*
 * Converts UTF-16 units from source to single bytes in target, one offset per output byte.
 *
 * Two flags drive the loop:
 *   doread - false when c already holds a code point that still has to be converted (pending state from a
 *            previous call, or a supplementary code point assembled by getTrailDouble)
 *   doloop - false when a helper reported that conversion must stop
 *
 * On return the source position is advanced to the last unit read and fromUChar32 holds the pending code
 * point (0 when none). Returns UNDERFLOW, OVERFLOW, or a malformed/unmappable result.
 */
private CoderResult cnvMBCSSingleFromUnicodeWithOffsets(CharBuffer source, ByteBuffer target,
        IntBuffer offsets, boolean flush) {

    /* one-element array so that helper methods can report a result */
    CoderResult[] cr = { CoderResult.UNDERFLOW };

    int sourceArrayIndex;

    char[] table;
    char[] results;

    int c;
    int sourceIndex, nextSourceIndex;

    char value, minValue;

    /* set up the local pointers */
    short uniMask;
    sourceArrayIndex = source.position();

    table = sharedData.mbcs.fromUnicodeTable;

    if ((options & UConverterConstants.OPTION_SWAP_LFNL) != 0) {
        results = sharedData.mbcs.swapLFNLFromUnicodeChars;
    } else {
        results = sharedData.mbcs.fromUnicodeChars;
    }

    if (useFallback) {
        /* use all roundtrip and fallback results */
        minValue = 0x800;
    } else {
        /* use only roundtrips and fallbacks from private-use characters */
        minValue = 0xc00;
    }
    // agljport:comment hasSupplementary only used in getTrail block which now simply repeats the mask operation
    uniMask = sharedData.mbcs.unicodeMask;

    /* get the converter state from UConverter */
    c = fromUChar32;

    /* sourceIndex=-1 if the current character began in the previous buffer */
    sourceIndex = c == 0 ? 0 : -1;
    nextSourceIndex = 0;

    boolean doloop = true;
    boolean doread = true;
    if (c != 0 && target.hasRemaining()) {
        if (UTF16.isLeadSurrogate((char) c)) {
            /* a lead surrogate is pending from the previous call: look for its trail unit */
            SideEffectsDouble x = new SideEffectsDouble(c, sourceArrayIndex, sourceIndex, nextSourceIndex);
            doloop = getTrailDouble(source, target, uniMask, x, flush, cr);
            doread = x.doread;
            c = x.c;
            sourceArrayIndex = x.sourceArrayIndex;
            sourceIndex = x.sourceIndex;
            nextSourceIndex = x.nextSourceIndex;
        } else {
            /* a complete code point is pending: convert it before reading more input */
            doread = false;
        }
    }

    if (doloop) {
        while (!doread || sourceArrayIndex < source.limit()) {
            /*
             * This following test is to see if available input would overflow the output. It does not catch
             * output of more than one byte that overflows as a result of a multi-byte character or callback
             * output from the last source character. Therefore, those situations also test for overflows and
             * will then break the loop, too.
             */
            if (target.hasRemaining()) {
                /*
                 * Get a correct Unicode code point: a single UChar for a BMP code point or a matched surrogate
                 * pair for a "supplementary code point".
                 */

                if (doread) {
                    c = source.get(sourceArrayIndex++);
                    ++nextSourceIndex;
                    if (UTF16.isSurrogate((char) c)) {
                        if (UTF16.isLeadSurrogate((char) c)) {
                            // getTrail:
                            SideEffectsDouble x = new SideEffectsDouble(c, sourceArrayIndex, sourceIndex,
                                    nextSourceIndex);
                            doloop = getTrailDouble(source, target, uniMask, x, flush, cr);
                            c = x.c;
                            sourceArrayIndex = x.sourceArrayIndex;
                            sourceIndex = x.sourceIndex;
                            nextSourceIndex = x.nextSourceIndex;
                            /*
                             * x.doread stays true unless getTrailDouble assembled a code point that still has
                             * to be looked up in the tables; only in that case fall through to the lookup
                             */
                            if (x.doread) {
                                if (doloop)
                                    continue;
                                else
                                    break;
                            }
                        } else {
                            /* this is an unmatched trail code unit (2nd surrogate) */
                            /* callback(illegal) */
                            cr[0] = CoderResult.malformedForLength(1);
                            break;
                        }
                    }
                } else {
                    /* c was already available; resume reading from the source on the next iteration */
                    doread = true;
                }

                /* convert the Unicode code point in c into codepage bytes */
                value = MBCS_SINGLE_RESULT_FROM_U(table, results, c);

                /* is this code point assigned, or do we use fallbacks? */
                if (value >= minValue) {
                    /* assigned, write the output character bytes from value and length */
                    /* length==1 */
                    /* this is easy because we know that there is enough space */
                    target.put((byte) value);
                    if (offsets != null) {
                        offsets.put(sourceIndex);
                    }

                    /* normal end of conversion: prepare for a new character */
                    c = 0;
                    sourceIndex = nextSourceIndex;
                } else { /* unassigned */
                    /* try an extension mapping */
                    SideEffectsDouble x = new SideEffectsDouble(c, sourceArrayIndex, sourceIndex,
                            nextSourceIndex);
                    doloop = unassignedDouble(source, target, x, flush, cr);
                    c = x.c;
                    sourceArrayIndex = x.sourceArrayIndex;
                    sourceIndex = x.sourceIndex;
                    nextSourceIndex = x.nextSourceIndex;
                    if (!doloop)
                        break;
                }
            } else {
                /* target is full */
                cr[0] = CoderResult.OVERFLOW;
                break;
            }
        }
    }

    /* set the converter state back into UConverter */
    fromUChar32 = c;

    /* write back the updated pointers */
    source.position(sourceArrayIndex);

    return cr[0];
}

/* This version of ucnv_MBCSFromUnicodeWithOffsets() is optimized for double-byte codepages.
*/ 4304 private CoderResult cnvMBCSDoubleFromUnicodeWithOffsets(CharBuffer source, ByteBuffer target, 4305 IntBuffer offsets, boolean flush) { 4306 CoderResult[] cr = { CoderResult.UNDERFLOW }; 4307 4308 int sourceArrayIndex; 4309 4310 char[] table; 4311 char[] chars; 4312 4313 int c, sourceIndex, nextSourceIndex; 4314 4315 int stage2Entry; 4316 int value; 4317 int length; 4318 short uniMask; 4319 4320 /* use optimized function if possible */ 4321 uniMask = sharedData.mbcs.unicodeMask; 4322 4323 /* set up the local pointers */ 4324 sourceArrayIndex = source.position(); 4325 4326 table = sharedData.mbcs.fromUnicodeTable; 4327 int[] tableInts = sharedData.mbcs.fromUnicodeTableInts; 4328 4329 if ((options & UConverterConstants.OPTION_SWAP_LFNL) != 0) { 4330 chars = sharedData.mbcs.swapLFNLFromUnicodeChars; 4331 } else { 4332 chars = sharedData.mbcs.fromUnicodeChars; 4333 } 4334 4335 /* get the converter state from UConverter */ 4336 c = fromUChar32; 4337 4338 /* sourceIndex=-1 if the current character began in the previous buffer */ 4339 sourceIndex = c == 0 ? 0 : -1; 4340 nextSourceIndex = 0; 4341 4342 /* conversion loop */ 4343 boolean doloop = true; 4344 boolean doread = true; 4345 if (c != 0 && target.hasRemaining()) { 4346 if (UTF16.isLeadSurrogate((char) c)) { 4347 SideEffectsDouble x = new SideEffectsDouble(c, sourceArrayIndex, sourceIndex, nextSourceIndex); 4348 doloop = getTrailDouble(source, target, uniMask, x, flush, cr); 4349 doread = x.doread; 4350 c = x.c; 4351 sourceArrayIndex = x.sourceArrayIndex; 4352 sourceIndex = x.sourceIndex; 4353 nextSourceIndex = x.nextSourceIndex; 4354 } else { 4355 doread = false; 4356 } 4357 } 4358 4359 if (doloop) { 4360 while (!doread || sourceArrayIndex < source.limit()) { 4361 /* 4362 * This following test is to see if available input would overflow the output. 
It does not catch 4363 * output of more than one byte that overflows as a result of a multi-byte character or callback 4364 * output from the last source character. Therefore, those situations also test for overflows and 4365 * will then break the loop, too. 4366 */ 4367 if (target.hasRemaining()) { 4368 if (doread) { 4369 /* 4370 * Get a correct Unicode code point: a single UChar for a BMP code point or a matched 4371 * surrogate pair for a "supplementary code point". 4372 */ 4373 c = source.get(sourceArrayIndex++); 4374 ++nextSourceIndex; 4375 /* 4376 * This also tests if the codepage maps single surrogates. If it does, then surrogates are 4377 * not paired but mapped separately. Note that in this case unmatched surrogates are not 4378 * detected. 4379 */ 4380 if (UTF16.isSurrogate((char) c) && (uniMask & UConverterConstants.HAS_SURROGATES) == 0) { 4381 if (UTF16.isLeadSurrogate((char) c)) { 4382 // getTrail: 4383 SideEffectsDouble x = new SideEffectsDouble(c, sourceArrayIndex, sourceIndex, 4384 nextSourceIndex); 4385 doloop = getTrailDouble(source, target, uniMask, x, flush, cr); 4386 c = x.c; 4387 sourceArrayIndex = x.sourceArrayIndex; 4388 sourceIndex = x.sourceIndex; 4389 nextSourceIndex = x.nextSourceIndex; 4390 4391 if (x.doread) { 4392 if (doloop) 4393 continue; 4394 else 4395 break; 4396 } 4397 } else { 4398 /* this is an unmatched trail code unit (2nd surrogate) */ 4399 /* callback(illegal) */ 4400 cr[0] = CoderResult.malformedForLength(1); 4401 break; 4402 } 4403 } 4404 } else { 4405 doread = true; 4406 } 4407 4408 /* convert the Unicode code point in c into codepage bytes */ 4409 stage2Entry = MBCS_STAGE_2_FROM_U(table, tableInts, c); 4410 4411 /* get the bytes and the length for the output */ 4412 /* MBCS_OUTPUT_2 */ 4413 value = MBCS_VALUE_2_FROM_STAGE_2(chars, stage2Entry, c); 4414 if (value <= 0xff) { 4415 length = 1; 4416 } else { 4417 length = 2; 4418 } 4419 4420 /* is this code point assigned, or do we use fallbacks? 
*/ 4421 if (!(MBCS_FROM_U_IS_ROUNDTRIP(stage2Entry, c) || (isFromUUseFallback(c) && value != 0))) { 4422 /* 4423 * We allow a 0 byte output if the "assigned" bit is set for this entry. There is no way 4424 * with this data structure for fallback output to be a zero byte. 4425 */ 4426 4427 // unassigned: 4428 SideEffectsDouble x = new SideEffectsDouble(c, sourceArrayIndex, sourceIndex, 4429 nextSourceIndex); 4430 4431 doloop = unassignedDouble(source, target, x, flush, cr); 4432 c = x.c; 4433 sourceArrayIndex = x.sourceArrayIndex; 4434 sourceIndex = x.sourceIndex; 4435 nextSourceIndex = x.nextSourceIndex; 4436 if (doloop) 4437 continue; 4438 else 4439 break; 4440 } 4441 4442 /* write the output character bytes from value and length */ 4443 /* from the first if in the loop we know that targetCapacity>0 */ 4444 if (length == 1) { 4445 /* this is easy because we know that there is enough space */ 4446 target.put((byte) value); 4447 if (offsets != null) { 4448 offsets.put(sourceIndex); 4449 } 4450 } else /* length==2 */{ 4451 target.put((byte) (value >>> 8)); 4452 if (2 <= target.remaining()) { 4453 target.put((byte) value); 4454 if (offsets != null) { 4455 offsets.put(sourceIndex); 4456 offsets.put(sourceIndex); 4457 } 4458 } else { 4459 if (offsets != null) { 4460 offsets.put(sourceIndex); 4461 } 4462 errorBuffer[0] = (byte) value; 4463 errorBufferLength = 1; 4464 4465 /* target overflow */ 4466 cr[0] = CoderResult.OVERFLOW; 4467 c = 0; 4468 break; 4469 } 4470 } 4471 4472 /* normal end of conversion: prepare for a new character */ 4473 c = 0; 4474 sourceIndex = nextSourceIndex; 4475 continue; 4476 } else { 4477 /* target is full */ 4478 cr[0] = CoderResult.OVERFLOW; 4479 break; 4480 } 4481 } 4482 } 4483 4484 /* set the converter state back into UConverter */ 4485 fromUChar32 = c; 4486 4487 /* write back the updated pointers */ 4488 source.position(sourceArrayIndex); 4489 4490 return cr[0]; 4491 } 4492 4493 private final class SideEffectsSingleBMP { 4494 int c, 
sourceArrayIndex; 4495 4496 SideEffectsSingleBMP(int c_, int sourceArrayIndex_) { 4497 c = c_; 4498 sourceArrayIndex = sourceArrayIndex_; 4499 } 4500 } 4501 4502 // function made out of block labeled getTrail in ucnv_MBCSSingleFromUnicodeWithOffsets 4503 // assumes input c is lead surrogate 4504 private final boolean getTrailSingleBMP(CharBuffer source, SideEffectsSingleBMP x, CoderResult[] cr) { 4505 if (x.sourceArrayIndex < source.limit()) { 4506 /* test the following code unit */ 4507 char trail = source.get(x.sourceArrayIndex); 4508 if (UTF16.isTrailSurrogate(trail)) { 4509 ++x.sourceArrayIndex; 4510 x.c = UCharacter.getCodePoint((char) x.c, trail); 4511 /* this codepage does not map supplementary code points */ 4512 /* callback(unassigned) */ 4513 cr[0] = CoderResult.unmappableForLength(2); 4514 return false; 4515 } else { 4516 /* this is an unmatched lead code unit (1st surrogate) */ 4517 /* callback(illegal) */ 4518 cr[0] = CoderResult.malformedForLength(1); 4519 return false; 4520 } 4521 } else { 4522 /* no more input */ 4523 return false; 4524 } 4525 // return true; 4526 } 4527 4528 private final class SideEffects { 4529 int c, sourceArrayIndex, sourceIndex, nextSourceIndex, prevSourceIndex, prevLength; 4530 boolean doread = true; 4531 4532 SideEffects(int c_, int sourceArrayIndex_, int sourceIndex_, int nextSourceIndex_, int prevSourceIndex_, 4533 int prevLength_) { 4534 c = c_; 4535 sourceArrayIndex = sourceArrayIndex_; 4536 sourceIndex = sourceIndex_; 4537 nextSourceIndex = nextSourceIndex_; 4538 prevSourceIndex = prevSourceIndex_; 4539 prevLength = prevLength_; 4540 } 4541 } 4542 4543 // function made out of block labeled getTrail in ucnv_MBCSFromUnicodeWithOffsets 4544 // assumes input c is lead surrogate 4545 private final boolean getTrail(CharBuffer source, ByteBuffer target, int uniMask, SideEffects x, 4546 boolean flush, CoderResult[] cr) { 4547 if (x.sourceArrayIndex < source.limit()) { 4548 /* test the following code unit */ 4549 char trail = 
source.get(x.sourceArrayIndex); 4550 if (UTF16.isTrailSurrogate(trail)) { 4551 ++x.sourceArrayIndex; 4552 ++x.nextSourceIndex; 4553 /* convert this supplementary code point */ 4554 x.c = UCharacter.getCodePoint((char) x.c, trail); 4555 if ((uniMask & UConverterConstants.HAS_SUPPLEMENTARY) == 0) { 4556 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 4557 fromUnicodeStatus = x.prevLength; /* save the old state */ 4558 /* callback(unassigned) */ 4559 x.doread = true; 4560 return unassigned(source, target, null, x, flush, cr); 4561 } else { 4562 x.doread = false; 4563 return true; 4564 } 4565 } else { 4566 /* this is an unmatched lead code unit (1st surrogate) */ 4567 /* callback(illegal) */ 4568 cr[0] = CoderResult.malformedForLength(1); 4569 return false; 4570 } 4571 } else { 4572 /* no more input */ 4573 return false; 4574 } 4575 } 4576 4577 // function made out of block labeled unassigned in ucnv_MBCSFromUnicodeWithOffsets 4578 private final boolean unassigned(CharBuffer source, ByteBuffer target, IntBuffer offsets, SideEffects x, 4579 boolean flush, CoderResult[] cr) { 4580 /* try an extension mapping */ 4581 int sourceBegin = x.sourceArrayIndex; 4582 source.position(x.sourceArrayIndex); 4583 x.c = fromU(x.c, source, target, null, x.sourceIndex, x.nextSourceIndex, flush, cr); 4584 x.sourceArrayIndex = source.position(); 4585 x.nextSourceIndex += x.sourceArrayIndex - sourceBegin; 4586 x.prevLength = fromUnicodeStatus; 4587 4588 if (cr[0].isError()) { 4589 /* not mappable or buffer overflow */ 4590 return false; 4591 } else { 4592 /* a mapping was written to the target, continue */ 4593 4594 /* recalculate the targetCapacity after an extension mapping */ 4595 // x.targetCapacity=pArgs.targetLimit-x.targetArrayIndex; 4596 /* normal end of conversion: prepare for a new character */ 4597 if (offsets != null) { 4598 x.prevSourceIndex = x.sourceIndex; 4599 x.sourceIndex = x.nextSourceIndex; 4600 } 4601 return true; 4602 } 4603 } 
4604 4605 private final class SideEffectsDouble { 4606 int c, sourceArrayIndex, sourceIndex, nextSourceIndex; 4607 boolean doread = true; 4608 4609 SideEffectsDouble(int c_, int sourceArrayIndex_, int sourceIndex_, int nextSourceIndex_) { 4610 c = c_; 4611 sourceArrayIndex = sourceArrayIndex_; 4612 sourceIndex = sourceIndex_; 4613 nextSourceIndex = nextSourceIndex_; 4614 } 4615 } 4616 4617 // function made out of block labeled getTrail in ucnv_MBCSDoubleFromUnicodeWithOffsets 4618 // assumes input c is lead surrogate 4619 private final boolean getTrailDouble(CharBuffer source, ByteBuffer target, int uniMask, 4620 SideEffectsDouble x, boolean flush, CoderResult[] cr) { 4621 if (x.sourceArrayIndex < source.limit()) { 4622 /* test the following code unit */ 4623 char trail = source.get(x.sourceArrayIndex); 4624 if (UTF16.isTrailSurrogate(trail)) { 4625 ++x.sourceArrayIndex; 4626 ++x.nextSourceIndex; 4627 /* convert this supplementary code point */ 4628 x.c = UCharacter.getCodePoint((char) x.c, trail); 4629 if ((uniMask & UConverterConstants.HAS_SUPPLEMENTARY) == 0) { 4630 /* BMP-only codepages are stored without stage 1 entries for supplementary code points */ 4631 /* callback(unassigned) */ 4632 x.doread = true; 4633 return unassignedDouble(source, target, x, flush, cr); 4634 } else { 4635 x.doread = false; 4636 return true; 4637 } 4638 } else { 4639 /* this is an unmatched lead code unit (1st surrogate) */ 4640 /* callback(illegal) */ 4641 cr[0] = CoderResult.malformedForLength(1); 4642 return false; 4643 } 4644 } else { 4645 /* no more input */ 4646 return false; 4647 } 4648 } 4649 4650 // function made out of block labeled unassigned in ucnv_MBCSDoubleFromUnicodeWithOffsets 4651 private final boolean unassignedDouble(CharBuffer source, ByteBuffer target, SideEffectsDouble x, 4652 boolean flush, CoderResult[] cr) { 4653 /* try an extension mapping */ 4654 int sourceBegin = x.sourceArrayIndex; 4655 source.position(x.sourceArrayIndex); 4656 x.c = fromU(x.c, source, 
target, null, x.sourceIndex, x.nextSourceIndex, flush, cr); 4657 x.sourceArrayIndex = source.position(); 4658 x.nextSourceIndex += x.sourceArrayIndex - sourceBegin; 4659 4660 if (cr[0].isError()) { 4661 /* not mappable or buffer overflow */ 4662 return false; 4663 } else { 4664 /* a mapping was written to the target, continue */ 4665 4666 /* recalculate the targetCapacity after an extension mapping */ 4667 // x.targetCapacity=pArgs.targetLimit-x.targetArrayIndex; 4668 /* normal end of conversion: prepare for a new character */ 4669 x.sourceIndex = x.nextSourceIndex; 4670 return true; 4671 } 4672 } 4673 4674 /** 4675 * Overrides super class method 4676 * 4677 * @param encoder 4678 * @param source 4679 * @param target 4680 * @param offsets 4681 * @return 4682 */ 4683 protected CoderResult cbFromUWriteSub(CharsetEncoderICU encoder, CharBuffer source, ByteBuffer target, 4684 IntBuffer offsets) { 4685 CharsetMBCS cs = (CharsetMBCS) encoder.charset(); 4686 byte[] subchar; 4687 int length; 4688 4689 if (cs.subChar1 != 0 4690 && (cs.sharedData.mbcs.extIndexes != null ? 
encoder.useSubChar1 4691 : (encoder.invalidUCharBuffer[0] <= 0xff))) { 4692 /* 4693 * select subChar1 if it is set (not 0) and the unmappable Unicode code point is up to U+00ff (IBM MBCS 4694 * behavior) 4695 */ 4696 subchar = new byte[] { cs.subChar1 }; 4697 length = 1; 4698 } else { 4699 /* select subChar in all other cases */ 4700 subchar = cs.subChar; 4701 length = cs.subCharLen; 4702 } 4703 4704 /* reset the selector for the next code point */ 4705 encoder.useSubChar1 = false; 4706 4707 if (cs.sharedData.mbcs.outputType == MBCS_OUTPUT_2_SISO) { 4708 byte[] buffer = new byte[4]; 4709 int i = 0; 4710 4711 /* fromUnicodeStatus contains prevLength */ 4712 switch (length) { 4713 case 1: 4714 if (encoder.fromUnicodeStatus == 2) { 4715 /* DBCS mode and SBCS sub char: change to SBCS */ 4716 encoder.fromUnicodeStatus = 1; 4717 buffer[i++] = UConverterConstants.SI; 4718 } 4719 buffer[i++] = subchar[0]; 4720 break; 4721 case 2: 4722 if (encoder.fromUnicodeStatus <= 1) { 4723 /* SBCS mode and DBCS sub char: change to DBCS */ 4724 encoder.fromUnicodeStatus = 2; 4725 buffer[i++] = UConverterConstants.SO; 4726 } 4727 buffer[i++] = subchar[0]; 4728 buffer[i++] = subchar[1]; 4729 break; 4730 default: 4731 throw new IllegalArgumentException(); 4732 } 4733 4734 subchar = buffer; 4735 length = i; 4736 } 4737 return CharsetEncoderICU.fromUWriteBytes(encoder, subchar, 0, length, target, offsets, source.position()); 4738 } 4739 4740 /** 4741 * Gets called whenever CharsetEncoder.replaceWith gets called. allowReplacementChanges only allows subChar and 4742 * subChar1 to be modified outside construction (since replaceWith is called once during construction). 4743 * 4744 * @param replacement 4745 * The replacement for subchar. 
4746 */ 4747 protected void implReplaceWith(byte[] replacement) { 4748 if (allowReplacementChanges) { 4749 CharsetMBCS cs = (CharsetMBCS) this.charset(); 4750 4751 System.arraycopy(replacement, 0, cs.subChar, 0, replacement.length); 4752 cs.subCharLen = (byte) replacement.length; 4753 cs.subChar1 = 0; 4754 } 4755 } 4756 } 4757 4758 public CharsetDecoder newDecoder() { 4759 return new CharsetDecoderMBCS(this); 4760 } 4761 4762 public CharsetEncoder newEncoder() { 4763 return new CharsetEncoderMBCS(this); 4764 } 4765 4766 @SuppressWarnings("fallthrough") 4767 void MBCSGetFilteredUnicodeSetForUnicode(UConverterSharedData data, UnicodeSet setFillIn, int which, int filter){ 4768 UConverterMBCSTable mbcsTable; 4769 char[] table; 4770 char st1,maxStage1, st2; 4771 int st3; 4772 int c ; 4773 4774 mbcsTable = data.mbcs; 4775 table = mbcsTable.fromUnicodeTable; 4776 if(mbcsTable.hasSupplementary()){ 4777 maxStage1 = 0x440; 4778 } 4779 else{ 4780 maxStage1 = 0x40; 4781 } 4782 c=0; /* keep track of current code point while enumerating */ 4783 4784 if(mbcsTable.outputType==MBCS_OUTPUT_1){ 4785 char stage2, stage3; 4786 char minValue; 4787 char[] results = mbcsTable.fromUnicodeChars; 4788 4789 if(which==ROUNDTRIP_SET) { 4790 /* use only roundtrips */ 4791 minValue=0xf00; 4792 } else { 4793 /* use all roundtrip and fallback results */ 4794 minValue=0x800; 4795 } 4796 for(st1=0;st1<maxStage1;++st1){ 4797 st2 = table[st1]; 4798 if(st2>maxStage1){ 4799 stage2 = st2; 4800 for(st2=0; st2<64; ++st2){ 4801 st3 = table[stage2 + st2]; 4802 if(st3!=0){ 4803 /*read the stage 3 block */ 4804 stage3 = (char)st3; 4805 do { 4806 if(results[stage3++]>=minValue){ 4807 setFillIn.add(c); 4808 } 4809 }while((++c&0xf) !=0); 4810 } else { 4811 c+= 16; /*empty stage 2 block */ 4812 } 4813 } 4814 } else { 4815 c+=1024; /* empty stage 2 block */ 4816 } 4817 } 4818 } else { 4819 int[] tableInts = mbcsTable.fromUnicodeTableInts; 4820 int stage2,stage3; 4821 byte[] bytes; 4822 int st3Multiplier; 4823 int 
value; 4824 boolean useFallBack; 4825 bytes = mbcsTable.fromUnicodeBytes; 4826 char[] chars = mbcsTable.fromUnicodeChars; 4827 int[] ints = mbcsTable.fromUnicodeInts; 4828 useFallBack = (which == ROUNDTRIP_AND_FALLBACK_SET); 4829 switch(mbcsTable.outputType) { 4830 case MBCS_OUTPUT_3: 4831 case MBCS_OUTPUT_4_EUC: 4832 st3Multiplier = 3; 4833 break; 4834 case MBCS_OUTPUT_4: 4835 st3Multiplier =4; 4836 break; 4837 default: 4838 st3Multiplier =2; 4839 break; 4840 } 4841 4842 for(st1=0;st1<maxStage1;++st1){ 4843 st2 = table[st1]; 4844 if(st2>(maxStage1>>1)){ 4845 stage2 = st2 ; 4846 for(st2=0;st2<64;++st2){ 4847 /*read the stage 3 block */ 4848 st3 = tableInts[stage2 + st2]; 4849 if(st3!=0){ 4850 //if((st3=table[stage2+st2])!=0){ 4851 stage3 = st3Multiplier*16*(st3&UConverterConstants.UNSIGNED_SHORT_MASK); 4852 4853 /* get the roundtrip flags for the stage 3 block */ 4854 st3>>>=16; 4855 switch(filter) { 4856 case UCNV_SET_FILTER_NONE: 4857 do { 4858 if((st3&1)!=0){ 4859 setFillIn.add(c); 4860 }else if (useFallBack) { 4861 int b =0; 4862 switch(st3Multiplier) { 4863 case 4: 4864 b = ints[stage3 / 4]; 4865 break; 4866 case 3: 4867 b |= bytes[stage3] | bytes[stage3 + 1] | bytes[stage3 + 2]; 4868 break; 4869 case 2: 4870 b = chars[stage3 / 2]; 4871 break; 4872 default: 4873 break; 4874 } 4875 stage3+=st3Multiplier; 4876 if(b!=0) { 4877 setFillIn.add(c); 4878 } 4879 } 4880 st3>>=1; 4881 }while((++c&0xf)!=0); 4882 break; 4883 case UCNV_SET_FILTER_DBCS_ONLY: 4884 /* Ignore single bytes results (<0x100). */ 4885 do { 4886 if(((st3&1) != 0 || useFallBack) && chars[stage3 / 2] >= 0x100){ 4887 setFillIn.add(c); 4888 } 4889 st3>>=1; 4890 stage3+=2; 4891 }while((++c&0xf) != 0); 4892 break; 4893 case UCNV_SET_FILTER_2022_CN : 4894 /* only add code points that map to CNS 11643 planes 1&2 for non-EXT ISO-2202-CN. 
*/ 4895 do { 4896 if(((st3&1) != 0 || useFallBack) && 4897 ((value= (UConverterConstants.UNSIGNED_BYTE_MASK & bytes[stage3]))==0x81 || value==0x82) ){ 4898 setFillIn.add(c); 4899 } 4900 st3>>=1; 4901 stage3+=3; 4902 }while((++c&0xf)!=0); 4903 break; 4904 case UCNV_SET_FILTER_SJIS: 4905 /* only add code points that map tp Shift-JIS codes corrosponding to JIS X 0280. */ 4906 do{ 4907 if(((st3&1) != 0 || useFallBack) && (value=chars[stage3 / 2])>=0x8140 && value<=0xeffc){ 4908 setFillIn.add(c); 4909 } 4910 st3>>=1; 4911 stage3+=2; 4912 }while((++c&0xf)!=0); 4913 break; 4914 case UCNV_SET_FILTER_GR94DBCS: 4915 /* only add code points that maps to ISO 2022 GR 94 DBCS codes*/ 4916 do { 4917 if(((st3&1) != 0 || useFallBack) && 4918 (UConverterConstants.UNSIGNED_SHORT_MASK & ((value=chars[stage3 / 2])- 0xa1a1))<=(0xfefe - 0xa1a1) && 4919 (UConverterConstants.UNSIGNED_BYTE_MASK & (value - 0xa1)) <= (0xfe - 0xa1)){ 4920 setFillIn.add(c); 4921 } 4922 st3>>=1; 4923 stage3+=2; 4924 }while((++c&0xf)!=0); 4925 break; 4926 case UCNV_SET_FILTER_HZ: 4927 /*Only add code points that are suitable for HZ DBCS*/ 4928 do { 4929 if( ((st3&1) != 0 || useFallBack) && 4930 (UConverterConstants.UNSIGNED_SHORT_MASK & ((value=chars[stage3 / 2])-0xa1a1))<=(0xfdfe - 0xa1a1) && 4931 (UConverterConstants.UNSIGNED_BYTE_MASK & (value - 0xa1)) <= (0xfe - 0xa1)){ 4932 setFillIn.add(c); 4933 } 4934 st3>>=1; 4935 stage3+=2; 4936 }while((++c&0xf) != 0); 4937 break; 4938 default: 4939 return; 4940 } 4941 } else { 4942 c+=16; /* empty stage 3 block */ 4943 } 4944 } 4945 } else { 4946 c+=1024; /*empty stage2 block */ 4947 } 4948 } 4949 } 4950 extGetUnicodeSet(setFillIn, which, filter, data); 4951 } 4952 4953 static void extGetUnicodeSetString(ByteBuffer cx,UnicodeSet setFillIn, boolean useFallback, 4954 int minLength, int c, char s[],int length,int sectionIndex){ 4955 CharBuffer fromUSectionUChar; 4956 IntBuffer fromUSectionValues; 4957 fromUSectionUChar = (CharBuffer)ARRAY(cx, 
EXT_FROM_U_UCHARS_INDEX,char.class ); 4958 fromUSectionValues = (IntBuffer)ARRAY(cx, EXT_FROM_U_VALUES_INDEX,int.class ); 4959 int fromUSectionUCharIndex = fromUSectionUChar.position()+sectionIndex; 4960 int fromUSectionValuesIndex = fromUSectionValues.position()+sectionIndex; 4961 int value, i, count; 4962 4963 /* read first pair of the section */ 4964 count = fromUSectionUChar.get(fromUSectionUCharIndex++); 4965 value = fromUSectionValues.get(fromUSectionValuesIndex++); 4966 if(value!=0 && (FROM_U_IS_ROUNDTRIP(value) || useFallback) && FROM_U_GET_LENGTH(value)>=minLength) { 4967 if(c>=0){ 4968 setFillIn.add(c); 4969 } else { 4970 StringBuilder normalizedStringBuilder = new StringBuilder(); 4971 for(int j=0; j<length;j++){ 4972 normalizedStringBuilder.append(s[j]); 4973 } 4974 String normalizedString = normalizedStringBuilder.toString(); 4975 for(int j=0;j<length;j++){ 4976 setFillIn.add(normalizedString); 4977 } 4978 } 4979 } 4980 4981 for(i=0; i<count; ++i){ 4982 s[length] = fromUSectionUChar.get(fromUSectionUCharIndex + i); 4983 value = fromUSectionValues.get(fromUSectionValuesIndex + i); 4984 4985 if(value==0) { 4986 /* no mapping, do nothing */ 4987 } else if (FROM_U_IS_PARTIAL(value)) { 4988 extGetUnicodeSetString( cx, setFillIn, useFallback, minLength, UConverterConstants.U_SENTINEL, s, length+1, 4989 FROM_U_GET_PARTIAL_INDEX(value)); 4990 } else if ((useFallback ? 
(value&FROM_U_RESERVED_MASK)==0:((value&(FROM_U_ROUNDTRIP_FLAG|FROM_U_RESERVED_MASK))==FROM_U_ROUNDTRIP_FLAG)) 4991 && FROM_U_GET_LENGTH(value)>=minLength) { 4992 StringBuilder normalizedStringBuilder = new StringBuilder(); // String for composite characters 4993 for(int j=0; j<(length+1);j++){ 4994 normalizedStringBuilder.append(s[j]); 4995 } 4996 setFillIn.add(normalizedStringBuilder.toString()); 4997 } 4998 } 4999 5000 } 5001 5002 5003 static void extGetUnicodeSet(UnicodeSet setFillIn, int which, int filter, UConverterSharedData Data){ 5004 int st1, stage1Length, st2, st3, minLength; 5005 int ps2, ps3; 5006 5007 CharBuffer stage12, stage3; 5008 int value, length; 5009 IntBuffer stage3b; 5010 boolean useFallback; 5011 char s[] = new char[MAX_UCHARS]; 5012 int c; 5013 ByteBuffer cx = Data.mbcs.extIndexes; 5014 if(cx == null){ 5015 return; 5016 } 5017 stage12 = (CharBuffer)ARRAY(cx, EXT_FROM_U_STAGE_12_INDEX,char.class ); 5018 stage3 = (CharBuffer)ARRAY(cx, EXT_FROM_U_STAGE_3_INDEX,char.class ); 5019 stage3b = (IntBuffer)ARRAY(cx, EXT_FROM_U_STAGE_3B_INDEX,int.class ); 5020 5021 stage1Length = cx.asIntBuffer().get(EXT_FROM_U_STAGE_1_LENGTH); 5022 useFallback = (which==ROUNDTRIP_AND_FALLBACK_SET); 5023 5024 c = 0; 5025 if(filter == UCNV_SET_FILTER_2022_CN) { 5026 minLength = 3; 5027 } else if (Data.mbcs.outputType == MBCS_OUTPUT_DBCS_ONLY || filter != UCNV_SET_FILTER_NONE) { 5028 /* DBCS-only, ignore single-byte results */ 5029 minLength = 2; 5030 } else { 5031 minLength = 1; 5032 } 5033 5034 for(st1=0; st1< stage1Length; ++st1){ 5035 st2 = stage12.get(st1); 5036 if(st2>stage1Length) { 5037 ps2 = st2; 5038 for(st2=0;st2<64;++st2){ 5039 st3=((int) stage12.get(ps2+st2))<<STAGE_2_LEFT_SHIFT; 5040 if(st3!= 0){ 5041 ps3 = st3; 5042 do { 5043 value = stage3b.get(stage3.get(ps3++)); 5044 if(value==0){ 5045 /* no mapping do nothing */ 5046 }else if (FROM_U_IS_PARTIAL(value)){ 5047 length = 0; 5048 length=UTF16.append(s, length, c); 5049 
extGetUnicodeSetString(cx,setFillIn,useFallback,minLength,c,s,length,FROM_U_GET_PARTIAL_INDEX(value)); 5050 } else if ((useFallback ? (value&FROM_U_RESERVED_MASK)==0 :((value&(FROM_U_ROUNDTRIP_FLAG|FROM_U_RESERVED_MASK))== FROM_U_ROUNDTRIP_FLAG)) && 5051 FROM_U_GET_LENGTH(value)>=minLength){ 5052 5053 switch(filter) { 5054 case UCNV_SET_FILTER_2022_CN: 5055 if(!(FROM_U_GET_LENGTH(value)==3 && FROM_U_GET_DATA(value)<=0x82ffff)){ 5056 continue; 5057 } 5058 break; 5059 case UCNV_SET_FILTER_SJIS: 5060 if(!(FROM_U_GET_LENGTH(value)==2 && (value=FROM_U_GET_DATA(value))>=0x8140 && value<=0xeffc)){ 5061 continue; 5062 } 5063 break; 5064 case UCNV_SET_FILTER_GR94DBCS: 5065 if(!(FROM_U_GET_LENGTH(value)==2 && ((value=FROM_U_GET_DATA(value)) - 0xa1a1)<=(0xfefe - 0xa1a1) 5066 && (UConverterConstants.UNSIGNED_BYTE_MASK & (value - 0xa1))<= (0xfe - 0xa1))){ 5067 continue; 5068 } 5069 break; 5070 case UCNV_SET_FILTER_HZ: 5071 if(!(FROM_U_GET_LENGTH(value)==2 && ((value=FROM_U_GET_DATA(value)) - 0xa1a1)<=(0xfdfe - 0xa1a1) 5072 && (UConverterConstants.UNSIGNED_BYTE_MASK & (value - 0xa1))<= (0xfe - 0xa1))){ 5073 continue; 5074 } 5075 break; 5076 default: 5077 /* 5078 * UCNV_SET_FILTER_NONE, 5079 * or UCNV_SET_FILTER_DBCS_ONLY which is handled via minLength 5080 */ 5081 break; 5082 } 5083 setFillIn.add(c); 5084 5085 } 5086 }while((++c&0xf) != 0); 5087 5088 } else { 5089 c+=16; /* emplty stage3 block */ 5090 } 5091 } 5092 } else { 5093 c+=1024; /* empty stage 2 block*/ 5094 } 5095 } 5096 } 5097 5098 void MBCSGetUnicodeSetForUnicode(UConverterSharedData data, UnicodeSet setFillIn, int which){ 5099 MBCSGetFilteredUnicodeSetForUnicode(data, setFillIn, which, 5100 this.sharedData.mbcs.outputType==MBCS_OUTPUT_DBCS_ONLY ? 
UCNV_SET_FILTER_DBCS_ONLY : UCNV_SET_FILTER_NONE ); 5101 } 5102 5103 void getUnicodeSetImpl( UnicodeSet setFillIn, int which){ 5104 if((options & MBCS_OPTION_GB18030)!=0){ 5105 setFillIn.add(0, 0xd7ff); 5106 setFillIn.add(0xe000, 0x10ffff); 5107 } 5108 else { 5109 this.MBCSGetUnicodeSetForUnicode(sharedData, setFillIn, which); 5110 } 5111 } 5112 5113 } 5114