Home | History | Annotate | Download | only in charset
// © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4  *******************************************************************************
      5  * Copyright (C) 2008-2015, International Business Machines Corporation and
      6  * others. All Rights Reserved.
      7  *******************************************************************************
      8  */
      9 package com.ibm.icu.charset;
     10 
     11 import java.nio.ByteBuffer;
     12 import java.nio.CharBuffer;
     13 import java.nio.IntBuffer;
     14 import java.nio.charset.CharsetDecoder;
     15 import java.nio.charset.CharsetEncoder;
     16 import java.nio.charset.CoderResult;
     17 import java.nio.charset.UnsupportedCharsetException;
     18 
     19 import com.ibm.icu.text.UTF16;
     20 import com.ibm.icu.text.UnicodeSet;
     21 
     22 class CharsetHZ extends CharsetICU {
     23 
     24     private static final int UCNV_TILDE = 0x7E; /* ~ */
     25     private static final int UCNV_OPEN_BRACE = 0x7B; /* { */
     26     private static final int UCNV_CLOSE_BRACE = 0x7D; /* } */
     27     private static final byte[] SB_ESCAPE = new byte[] { 0x7E, 0x7D };
     28     private static final byte[] DB_ESCAPE = new byte[] { 0x7E, 0x7B };
     29     private static final byte[] TILDE_ESCAPE = new byte[] { 0x7E, 0x7E };
     30     private static final byte[] fromUSubstitution = new byte[] { (byte) 0x1A };
     31 
     32     private CharsetMBCS gbCharset;
     33     private boolean isEmptySegment;
     34 
     35     public CharsetHZ(String icuCanonicalName, String canonicalName, String[] aliases) {
     36         super(icuCanonicalName, canonicalName, aliases);
     37         gbCharset = (CharsetMBCS) new CharsetProviderICU().charsetForName("GBK");
     38         if (gbCharset == null) {
     39             throw new UnsupportedCharsetException("unable to open ICU GBK Charset, required for HZ");
     40         }
     41 
     42         maxBytesPerChar = 4;
     43         minBytesPerChar = 1;
     44         maxCharsPerByte = 1;
     45 
     46         isEmptySegment = false;
     47     }
     48 
     49     class CharsetDecoderHZ extends CharsetDecoderICU {
     50         CharsetMBCS.CharsetDecoderMBCS gbDecoder;
     51         boolean isStateDBCS = false;
     52 
     53         public CharsetDecoderHZ(CharsetICU cs) {
     54             super(cs);
     55             gbDecoder = (CharsetMBCS.CharsetDecoderMBCS) gbCharset.newDecoder();
     56         }
     57 
     58         @Override
     59         protected void implReset() {
     60             super.implReset();
     61             gbDecoder.implReset();
     62 
     63             isStateDBCS = false;
     64             isEmptySegment = false;
     65         }
     66 
     67         @Override
     68         protected CoderResult decodeLoop(ByteBuffer source, CharBuffer target, IntBuffer offsets, boolean flush) {
     69             CoderResult err = CoderResult.UNDERFLOW;
     70             byte[] tempBuf = new byte[2];
     71             int targetUniChar = 0;
     72             int mySourceChar = 0;
     73 
     74             if (!source.hasRemaining())
     75                 return CoderResult.UNDERFLOW;
     76             else if (!target.hasRemaining())
     77                 return CoderResult.OVERFLOW;
     78 
     79             while (source.hasRemaining()) {
     80 
     81                 if (target.hasRemaining()) {
     82 
     83                     // get the byte as unsigned
     84                     mySourceChar = source.get() & 0xff;
     85 
     86                     if (mode == UCNV_TILDE) {
     87                         /* second byte after ~ */
     88                         mode = 0;
     89                         switch (mySourceChar) {
     90                         case 0x0A:
     91                             /* no output for ~\n (line-continuation marker) */
     92                             continue;
     93                         case UCNV_TILDE:
     94                             if (offsets != null) {
     95                                 offsets.put(source.position() - 2);
     96                             }
     97                             target.put((char) mySourceChar);
     98                             continue;
     99                         case UCNV_OPEN_BRACE:
    100                         case UCNV_CLOSE_BRACE:
    101                             isStateDBCS = (mySourceChar == UCNV_OPEN_BRACE);
    102                             if (isEmptySegment) {
    103                                 isEmptySegment = false; /* we are handling it, reset to avoid future spurious errors */
    104                                 this.toUBytesArray[0] = UCNV_TILDE;
    105                                 this.toUBytesArray[1] = (byte)mySourceChar;
    106                                 this.toULength = 2;
    107                                 return CoderResult.malformedForLength(1);
    108                             }
    109                             isEmptySegment = true;
    110                             continue;
    111                         default:
    112                             /*
    113                              * if the first byte is equal to TILDE and the trail byte is not a valid byte then it is an
    114                              * error condition
    115                              */
    116                             /*
    117                              * Ticket 5691: consistent illegal sequences:
    118                              * - We include at least the first byte in the illegal sequence.
    119                              * - If any of the non-initial bytes could be the start of a character,
    120                              *   we stop the illegal sequence before the first one of those.
    121                              */
    122                             isEmptySegment = false; /* different error here, reset this to avoid spurious furture error */
    123                             err = CoderResult.malformedForLength(1);
    124                             toUBytesArray[0] = UCNV_TILDE;
    125                             if (isStateDBCS ? (0x21 <= mySourceChar && mySourceChar <= 0x7e) : mySourceChar <= 0x7f) {
    126                                 /* The current byte could be the start of a character: Back it out. */
    127                                 toULength = 1;
    128                                 source.position(source.position() - 1);
    129                             } else {
    130                                 /* Include the current byte in the illegal sequence. */
    131                                 toUBytesArray[1] = (byte)mySourceChar;
    132                                 toULength = 2;
    133                             }
    134                             return err;
    135                         }
    136                     } else if (isStateDBCS) {
    137                         if (toUnicodeStatus == 0) {
    138                             /* lead byte */
    139                             if (mySourceChar == UCNV_TILDE) {
    140                                 mode = UCNV_TILDE;
    141                             } else {
    142                                 /*
    143                                  * add another bit to distinguish a 0 byte from not having seen a lead byte
    144                                  */
    145                                 toUnicodeStatus = mySourceChar | 0x100;
    146                                 isEmptySegment = false; /* the segment has something, either valid or will produce a different error, so reset this */
    147                             }
    148                             continue;
    149                         } else {
    150                             /* trail byte */
    151                             boolean leadIsOk, trailIsOk;
    152                             int leadByte = toUnicodeStatus & 0xff;
    153                             targetUniChar = 0xffff;
    154                             /*
    155                              * Ticket 5691: consistent illegal sequence
    156                              * - We include at least the first byte in the illegal sequence.
    157                              * - If any of the non-initial bytes could be the start of a character,
    158                              *   we stop the illegal sequence before the first one of those
    159                              *
    160                              * In HZ DBCS, if the second byte is in the 21..7e range,
    161                              * we report ony the first byte as the illegal sequence.
    162                              * Otherwise we convert of report the pair of bytes.
    163                              */
    164                             leadIsOk = (short)(UConverterConstants.UNSIGNED_BYTE_MASK & (leadByte - 0x21)) <= (0x7d - 0x21);
    165                             trailIsOk = (short)(UConverterConstants.UNSIGNED_BYTE_MASK & (mySourceChar - 0x21)) <= (0x7e - 0x21);
    166                             if (leadIsOk && trailIsOk) {
    167                                 tempBuf[0] = (byte)(leadByte + 0x80);
    168                                 tempBuf[1] = (byte)(mySourceChar + 0x80);
    169                                 targetUniChar = gbDecoder.simpleGetNextUChar(ByteBuffer.wrap(tempBuf), super.isFallbackUsed());
    170                                 mySourceChar = (leadByte << 8) | mySourceChar;
    171                             } else if (trailIsOk) {
    172                                 /* report a single illegal byte and continue with the following DBCS starter byte */
    173                                 source.position(source.position() - 1);
    174                                 mySourceChar = leadByte;
    175                             } else {
    176                                 /* report a pair of illegal bytes if the second byte is not a DBCS starter */
    177                                 /* add another bit so that the code below writes 2 bytes in case of error */
    178                                 mySourceChar = 0x10000 | (leadByte << 8) | mySourceChar;
    179                             }
    180                             toUnicodeStatus = 0x00;
    181                         }
    182                     } else {
    183                         if (mySourceChar == UCNV_TILDE) {
    184                             mode = UCNV_TILDE;
    185                             continue;
    186                         } else if (mySourceChar <= 0x7f) {
    187                             targetUniChar = mySourceChar; /* ASCII */
    188                             isEmptySegment = false; /* the segment has something valid */
    189                         } else {
    190                             targetUniChar = 0xffff;
    191                             isEmptySegment = false; /* different error here, reset this to avoid spurious future error */
    192                         }
    193                     }
    194 
    195                     if (targetUniChar < 0xfffe) {
    196                         if (offsets != null) {
    197                             offsets.put(source.position() - 1 - (isStateDBCS ? 1 : 0));
    198                         }
    199 
    200                         target.put((char) targetUniChar);
    201                     } else /* targetUniChar >= 0xfffe */{
    202                         if (mySourceChar > 0xff) {
    203                             toUBytesArray[toUBytesBegin + 0] = (byte) (mySourceChar >> 8);
    204                             toUBytesArray[toUBytesBegin + 1] = (byte) mySourceChar;
    205                             toULength = 2;
    206                         } else {
    207                             toUBytesArray[toUBytesBegin + 0] = (byte) mySourceChar;
    208                             toULength = 1;
    209                         }
    210                         if (targetUniChar == 0xfffe) {
    211                             return CoderResult.unmappableForLength(toULength);
    212                         } else {
    213                             return CoderResult.malformedForLength(toULength);
    214                         }
    215                     }
    216                 } else {
    217                     return CoderResult.OVERFLOW;
    218                 }
    219             }
    220 
    221             return err;
    222         }
    223     }
    224 
    225     class CharsetEncoderHZ extends CharsetEncoderICU {
    226         CharsetMBCS.CharsetEncoderMBCS gbEncoder;
    227         boolean isEscapeAppended = false;
    228         boolean isTargetUCharDBCS = false;
    229 
    230         public CharsetEncoderHZ(CharsetICU cs) {
    231             super(cs, fromUSubstitution);
    232             gbEncoder = (CharsetMBCS.CharsetEncoderMBCS) gbCharset.newEncoder();
    233         }
    234 
    235         @Override
    236         protected void implReset() {
    237             super.implReset();
    238             gbEncoder.implReset();
    239 
    240             isEscapeAppended = false;
    241             isTargetUCharDBCS = false;
    242         }
    243 
    244         @Override
    245         protected CoderResult encodeLoop(CharBuffer source, ByteBuffer target, IntBuffer offsets, boolean flush) {
    246             int length = 0;
    247             int[] targetUniChar = new int[] { 0 };
    248             int mySourceChar = 0;
    249             boolean oldIsTargetUCharDBCS = isTargetUCharDBCS;
    250 
    251             if (!source.hasRemaining())
    252                 return CoderResult.UNDERFLOW;
    253             else if (!target.hasRemaining())
    254                 return CoderResult.OVERFLOW;
    255 
    256             if (fromUChar32 != 0 && target.hasRemaining()) {
    257                 CoderResult cr = handleSurrogates(source, (char) fromUChar32);
    258                 return (cr != null) ? cr : CoderResult.unmappableForLength(2);
    259             }
    260             /* writing the char to the output stream */
    261             while (source.hasRemaining()) {
    262                 targetUniChar[0] = MISSING_CHAR_MARKER;
    263                 if (target.hasRemaining()) {
    264 
    265                     mySourceChar = source.get();
    266 
    267                     oldIsTargetUCharDBCS = isTargetUCharDBCS;
    268                     if (mySourceChar == UCNV_TILDE) {
    269                         /*
    270                          * concatEscape(args, &myTargetIndex, &targetLength,"\x7E\x7E",err,2,&mySourceIndex);
    271                          */
    272                         concatEscape(source, target, offsets, TILDE_ESCAPE);
    273                         continue;
    274                     } else if (mySourceChar <= 0x7f) {
    275                         length = 1;
    276                         targetUniChar[0] = mySourceChar;
    277                     } else {
    278                         length = gbEncoder.fromUChar32(mySourceChar, targetUniChar, super.isFallbackUsed());
    279 
    280                         /*
    281                          * we can only use lead bytes 21..7D and trail bytes 21..7E
    282                          */
    283                         if (length == 2 && 0xa1a1 <= targetUniChar[0] && targetUniChar[0] <= 0xfdfe
    284                                 && 0xa1 <= (targetUniChar[0] & 0xff) && (targetUniChar[0] & 0xff) <= 0xfe) {
    285                             targetUniChar[0] -= 0x8080;
    286                         } else {
    287                             targetUniChar[0] = MISSING_CHAR_MARKER;
    288                         }
    289                     }
    290                     if (targetUniChar[0] != MISSING_CHAR_MARKER) {
    291                         isTargetUCharDBCS = (targetUniChar[0] > 0x00FF);
    292                         if (oldIsTargetUCharDBCS != isTargetUCharDBCS || !isEscapeAppended) {
    293                             /* Shifting from a double byte to single byte mode */
    294                             if (!isTargetUCharDBCS) {
    295                                 concatEscape(source, target, offsets, SB_ESCAPE);
    296                                 isEscapeAppended = true;
    297                             } else { /*
    298                                          * Shifting from a single byte to double byte mode
    299                                          */
    300                                 concatEscape(source, target, offsets, DB_ESCAPE);
    301                                 isEscapeAppended = true;
    302 
    303                             }
    304                         }
    305 
    306                         if (isTargetUCharDBCS) {
    307                             if (target.hasRemaining()) {
    308                                 target.put((byte) (targetUniChar[0] >> 8));
    309                                 if (offsets != null) {
    310                                     offsets.put(source.position() - 1);
    311                                 }
    312                                 if (target.hasRemaining()) {
    313                                     target.put((byte) targetUniChar[0]);
    314                                     if (offsets != null) {
    315                                         offsets.put(source.position() - 1);
    316                                     }
    317                                 } else {
    318                                     errorBuffer[errorBufferLength++] = (byte) targetUniChar[0];
    319                                     // *err = U_BUFFER_OVERFLOW_ERROR;
    320                                 }
    321                             } else {
    322                                 errorBuffer[errorBufferLength++] = (byte) (targetUniChar[0] >> 8);
    323                                 errorBuffer[errorBufferLength++] = (byte) targetUniChar[0];
    324                                 // *err = U_BUFFER_OVERFLOW_ERROR;
    325                             }
    326 
    327                         } else {
    328                             if (target.hasRemaining()) {
    329                                 target.put((byte) targetUniChar[0]);
    330                                 if (offsets != null) {
    331                                     offsets.put(source.position() - 1);
    332                                 }
    333 
    334                             } else {
    335                                 errorBuffer[errorBufferLength++] = (byte) targetUniChar[0];
    336                                 // *err = U_BUFFER_OVERFLOW_ERROR;
    337                             }
    338                         }
    339 
    340                     } else {
    341                         /* oops.. the code point is unassigned */
    342                         /* Handle surrogates */
    343                         /* check if the char is a First surrogate */
    344 
    345                         if (UTF16.isSurrogate((char) mySourceChar)) {
    346                             // use that handy handleSurrogates method everyone's been talking about!
    347                             CoderResult cr = handleSurrogates(source, (char) mySourceChar);
    348                             return (cr != null) ? cr : CoderResult.unmappableForLength(2);
    349                         } else {
    350                             /* callback(unassigned) for a BMP code point */
    351                             // *err = U_INVALID_CHAR_FOUND;
    352                             fromUChar32 = mySourceChar;
    353                             return CoderResult.unmappableForLength(1);
    354                         }
    355                     }
    356                 } else {
    357                     // *err = U_BUFFER_OVERFLOW_ERROR;
    358                     return CoderResult.OVERFLOW;
    359                 }
    360             }
    361 
    362             return CoderResult.UNDERFLOW;
    363         }
    364 
    365         private CoderResult concatEscape(CharBuffer source, ByteBuffer target, IntBuffer offsets, byte[] strToAppend) {
    366             CoderResult cr = null;
    367             for (int i=0; i<strToAppend.length; i++) {
    368                 byte b = strToAppend[i];
    369                 if (target.hasRemaining()) {
    370                     target.put(b);
    371                     if (offsets != null)
    372                         offsets.put(source.position() - 1);
    373                 } else {
    374                     errorBuffer[errorBufferLength++] = b;
    375                     cr = CoderResult.OVERFLOW;
    376                 }
    377             }
    378             return cr;
    379         }
    380     }
    381 
    382     @Override
    383     public CharsetDecoder newDecoder() {
    384         return new CharsetDecoderHZ(this);
    385     }
    386 
    387     @Override
    388     public CharsetEncoder newEncoder() {
    389         return new CharsetEncoderHZ(this);
    390     }
    391 
    392     @Override
    393     void getUnicodeSetImpl( UnicodeSet setFillIn, int which){
    394         setFillIn.add(0,0x7f);
    395        // CharsetMBCS mbcshz = (CharsetMBCS)CharsetICU.forNameICU("icu-internal-25546");
    396         gbCharset.MBCSGetFilteredUnicodeSetForUnicode(gbCharset.sharedData, setFillIn, which, CharsetMBCS.UCNV_SET_FILTER_HZ);
    397     }
    398 }
    399