Home | History | Annotate | Download | only in text
      1 /*
      2  *******************************************************************************
      3  * Copyright (C) 1996-2016, International Business Machines Corporation and    *
      4  * others. All Rights Reserved.                                                *
      5  *******************************************************************************
      6  */
      7 
      8 package com.ibm.icu.text;
      9 
     10 /**
     11 * A decompression engine implementing the Standard Compression Scheme
     12 * for Unicode (SCSU) as outlined in <A
     13 * HREF="http://www.unicode.org/unicode/reports/tr6">Unicode Technical
     14 * Report #6</A>.
     15 *
     16 * <P><STRONG>USAGE</STRONG></P>
     17 *
     18 * <P>The static methods on <TT>UnicodeDecompressor</TT> may be used in a
     19 * straightforward manner to decompress simple strings:</P>
     20 *
     21 * <PRE>
     22 *  byte [] compressed = ... ; // get compressed bytes from somewhere
     23 *  String result = UnicodeDecompressor.decompress(compressed);
     24 * </PRE>
     25 *
     26 * <P>The static methods have a fairly large memory footprint.
     27 * For finer-grained control over memory usage,
     28 * <TT>UnicodeDecompressor</TT> offers more powerful APIs allowing
     29 * iterative decompression:</P>
     30 *
     31 * <PRE>
     32 *  // Decompress an array "bytes" of length "len" using a buffer of 512 chars
     33 *  // to the Writer "out"
     34 *
     35 *  UnicodeDecompressor myDecompressor         = new UnicodeDecompressor();
     36 *  final static int    BUFSIZE                = 512;
     37 *  char []             charBuffer             = new char [ BUFSIZE ];
     38 *  int                 charsWritten           = 0;
     39 *  int []              bytesRead              = new int [1];
     40 *  int                 totalBytesDecompressed = 0;
     41 *  int                 totalCharsWritten      = 0;
     42 *
     43 *  do {
     44 *    // do the decompression
     45 *    charsWritten = myDecompressor.decompress(bytes, totalBytesDecompressed,
     46 *                                             len, bytesRead,
     47 *                                             charBuffer, 0, BUFSIZE);
     48 *
     49 *    // do something with the current set of chars
     50 *    out.write(charBuffer, 0, charsWritten);
     51 *
     52 *    // update the no. of bytes decompressed
     53 *    totalBytesDecompressed += bytesRead[0];
     54 *
     55 *    // update the no. of chars written
     56 *    totalCharsWritten += charsWritten;
     57 *
     58 *  } while(totalBytesDecompressed &lt; len);
     59 *
     60 *  myDecompressor.reset(); // reuse decompressor
     61 * </PRE>
     62 *
     63 * <P>Decompression is performed according to the standard set forth in
     64 * <A HREF="http://www.unicode.org/unicode/reports/tr6">Unicode Technical
     65 * Report #6</A></P>
     66 *
     67 * @see UnicodeCompressor
     68 *
     69 * @author Stephen F. Booth
     70 * @stable ICU 2.4
     71 */
     72 public final class UnicodeDecompressor implements SCSU
     73 {
     74     //==========================
     75     // Instance variables
     76     //==========================
     77 
     78     /** Alias to current dynamic window */
     79     private int       fCurrentWindow   = 0;
     80 
     81     /** Dynamic compression window offsets */
     82     private int []    fOffsets         = new int [ NUMWINDOWS ];
     83 
     84     /** Current compression mode */
     85     private int       fMode            = SINGLEBYTEMODE;
     86 
     87     /** Size of our internal buffer */
     88     private final static int BUFSIZE   = 3;
     89 
     90     /** Internal buffer for saving state */
     91     private byte []   fBuffer          = new byte [BUFSIZE];
     92 
     93     /** Number of characters in our internal buffer */
     94     private int       fBufferLength    = 0;
     95 
     96 
     97     /**
     98      * Create a UnicodeDecompressor.
     99      * Sets all windows to their default values.
    100      * @see #reset
    101      * @stable ICU 2.4
    102      */
    103     public UnicodeDecompressor(){
    104         reset();              // initialize to defaults
    105     }
    106 
    107     /**
    108      * Decompress a byte array into a String.
    109      * @param buffer The byte array to decompress.
    110      * @return A String containing the decompressed characters.
    111      * @see #decompress(byte [], int, int)
    112      * @stable ICU 2.4
    113      */
    114     public static String decompress(byte [] buffer){
    115         char [] buf = decompress(buffer, 0, buffer.length);
    116         return new String(buf);
    117     }
    118 
    119     /**
    120      * Decompress a byte array into a Unicode character array.
    121      * @param buffer The byte array to decompress.
    122      * @param start The start of the byte run to decompress.
    123      * @param limit The limit of the byte run to decompress.
    124      * @return A character array containing the decompressed bytes.
    125      * @see #decompress(byte [])
    126      * @stable ICU 2.4
    127      */
    128     public static char [] decompress(byte [] buffer, int start, int limit) {
    129         UnicodeDecompressor comp = new UnicodeDecompressor();
    130 
    131         // use a buffer we know will never overflow
    132         // in the worst case, each byte will decompress
    133         // to a surrogate pair (buffer must be at least 2 chars)
    134         int len = Math.max(2, 2 * (limit - start));
    135         char [] temp = new char [len];
    136 
    137         int charCount = comp.decompress(buffer, start, limit, null,
    138                         temp, 0, len);
    139 
    140         char [] result = new char [charCount];
    141         System.arraycopy(temp, 0, result, 0, charCount);
    142         return result;
    143     }
    144 
    145     /**
    146      * Decompress a byte array into a Unicode character array.
    147      *
    148      * This function will either completely fill the output buffer,
    149      * or consume the entire input.
    150      *
    151      * @param byteBuffer The byte buffer to decompress.
    152      * @param byteBufferStart The start of the byte run to decompress.
    153      * @param byteBufferLimit The limit of the byte run to decompress.
    154      * @param bytesRead A one-element array.  If not null, on return
    155      * the number of bytes read from byteBuffer.
    156      * @param charBuffer A buffer to receive the decompressed data.
    157      * This buffer must be at minimum two characters in size.
    158      * @param charBufferStart The starting offset to which to write
    159      * decompressed data.
    160      * @param charBufferLimit The limiting offset for writing
    161      * decompressed data.
    162      * @return The number of Unicode characters written to charBuffer.
    163      * @stable ICU 2.4
    164      */
    165     public int decompress(byte []    byteBuffer,
    166               int        byteBufferStart,
    167               int        byteBufferLimit,
    168               int []     bytesRead,
    169               char []    charBuffer,
    170               int        charBufferStart,
    171               int        charBufferLimit)
    172     {
    173     // the current position in the source byte buffer
    174     int bytePos      = byteBufferStart;
    175 
    176     // the current position in the target char buffer
    177     int ucPos        = charBufferStart;
    178 
    179         // the current byte from the source buffer
    180     int aByte        = 0x00;
    181 
    182 
    183     // charBuffer must be at least 2 chars in size
    184     if(charBuffer.length < 2 || (charBufferLimit - charBufferStart) < 2)
    185         throw new IllegalArgumentException("charBuffer.length < 2");
    186 
    187     // if our internal buffer isn't empty, flush its contents
    188     // to the output buffer before doing any more decompression
    189     if(fBufferLength > 0) {
    190 
    191         int newBytes = 0;
    192 
    193         // fill the buffer completely, to guarantee one full character
    194         if(fBufferLength != BUFSIZE) {
    195         newBytes = fBuffer.length - fBufferLength;
    196 
    197         // verify there are newBytes bytes in byteBuffer
    198         if(byteBufferLimit - byteBufferStart < newBytes)
    199             newBytes = byteBufferLimit - byteBufferStart;
    200 
    201         System.arraycopy(byteBuffer, byteBufferStart,
    202                  fBuffer, fBufferLength, newBytes);
    203         }
    204 
    205         // reset buffer length to 0 before recursive call
    206         fBufferLength = 0;
    207 
    208         // call self recursively to decompress the buffer
    209         int count = decompress(fBuffer, 0, fBuffer.length, null,
    210                    charBuffer, charBufferStart,
    211                    charBufferLimit);
    212 
    213         // update the positions into the arrays
    214         ucPos += count;
    215         bytePos += newBytes;
    216     }
    217 
    218         // the main decompression loop
    219     mainLoop:
    220     while(bytePos < byteBufferLimit && ucPos < charBufferLimit) {
    221         switch(fMode) {
    222         case SINGLEBYTEMODE:
    223         // single-byte mode decompression loop
    224         singleByteModeLoop:
    225         while(bytePos < byteBufferLimit && ucPos < charBufferLimit) {
    226         aByte = byteBuffer[bytePos++] & 0xFF;
    227         switch(aByte) {
    228             // All bytes from 0x80 through 0xFF are remapped
    229             // to chars or surrogate pairs according to the
    230             // currently active window
    231         case 0x80: case 0x81: case 0x82: case 0x83: case 0x84:
    232         case 0x85: case 0x86: case 0x87: case 0x88: case 0x89:
    233         case 0x8A: case 0x8B: case 0x8C: case 0x8D: case 0x8E:
    234         case 0x8F: case 0x90: case 0x91: case 0x92: case 0x93:
    235         case 0x94: case 0x95: case 0x96: case 0x97: case 0x98:
    236         case 0x99: case 0x9A: case 0x9B: case 0x9C: case 0x9D:
    237         case 0x9E: case 0x9F: case 0xA0: case 0xA1: case 0xA2:
    238         case 0xA3: case 0xA4: case 0xA5: case 0xA6: case 0xA7:
    239         case 0xA8: case 0xA9: case 0xAA: case 0xAB: case 0xAC:
    240         case 0xAD: case 0xAE: case 0xAF: case 0xB0: case 0xB1:
    241         case 0xB2: case 0xB3: case 0xB4: case 0xB5: case 0xB6:
    242         case 0xB7: case 0xB8: case 0xB9: case 0xBA: case 0xBB:
    243         case 0xBC: case 0xBD: case 0xBE: case 0xBF: case 0xC0:
    244         case 0xC1: case 0xC2: case 0xC3: case 0xC4: case 0xC5:
    245         case 0xC6: case 0xC7: case 0xC8: case 0xC9: case 0xCA:
    246         case 0xCB: case 0xCC: case 0xCD: case 0xCE: case 0xCF:
    247         case 0xD0: case 0xD1: case 0xD2: case 0xD3: case 0xD4:
    248         case 0xD5: case 0xD6: case 0xD7: case 0xD8: case 0xD9:
    249         case 0xDA: case 0xDB: case 0xDC: case 0xDD: case 0xDE:
    250         case 0xDF: case 0xE0: case 0xE1: case 0xE2: case 0xE3:
    251         case 0xE4: case 0xE5: case 0xE6: case 0xE7: case 0xE8:
    252         case 0xE9: case 0xEA: case 0xEB: case 0xEC: case 0xED:
    253         case 0xEE: case 0xEF: case 0xF0: case 0xF1: case 0xF2:
    254         case 0xF3: case 0xF4: case 0xF5: case 0xF6: case 0xF7:
    255         case 0xF8: case 0xF9: case 0xFA: case 0xFB: case 0xFC:
    256         case 0xFD: case 0xFE: case 0xFF:
    257             // For offsets <= 0xFFFF, convert to a single char
    258             // by adding the window's offset and subtracting
    259             // the generic compression offset
    260             if(fOffsets[ fCurrentWindow ] <= 0xFFFF) {
    261             charBuffer[ucPos++] = (char)
    262                 (aByte + fOffsets[ fCurrentWindow ]
    263                  - COMPRESSIONOFFSET);
    264             }
    265             // For offsets > 0x10000, convert to a surrogate pair by
    266             // normBase = window's offset - 0x10000
    267             // high surr. = 0xD800 + (normBase >> 10)
    268             // low  surr. = 0xDC00 + (normBase & 0x3FF) + (byte & 0x7F)
    269             else {
    270             // make sure there is enough room to write
    271             // both characters
    272             // if not, save state and break out
    273             if((ucPos + 1) >= charBufferLimit) {
    274                 --bytePos;
    275                 System.arraycopy(byteBuffer, bytePos,
    276                          fBuffer, 0,
    277                          byteBufferLimit - bytePos);
    278                 fBufferLength = byteBufferLimit - bytePos;
    279                 bytePos += fBufferLength;
    280                 break mainLoop;
    281             }
    282 
    283             int normalizedBase = fOffsets[ fCurrentWindow ]
    284                 - 0x10000;
    285             charBuffer[ucPos++] = (char)
    286                 (0xD800 + (normalizedBase >> 10));
    287             charBuffer[ucPos++] = (char)
    288                 (0xDC00 + (normalizedBase & 0x3FF)+(aByte & 0x7F));
    289             }
    290             break;
    291 
    292             // bytes from 0x20 through 0x7F are treated as ASCII and
    293             // are remapped to chars by padding the high byte
    294             // (this is the same as quoting from static window 0)
    295             // NUL (0x00), HT (0x09), CR (0x0A), LF (0x0D)
    296             // are treated as ASCII as well
    297         case 0x00: case 0x09: case 0x0A: case 0x0D:
    298         case 0x20: case 0x21: case 0x22: case 0x23: case 0x24:
    299         case 0x25: case 0x26: case 0x27: case 0x28: case 0x29:
    300         case 0x2A: case 0x2B: case 0x2C: case 0x2D: case 0x2E:
    301         case 0x2F: case 0x30: case 0x31: case 0x32: case 0x33:
    302         case 0x34: case 0x35: case 0x36: case 0x37: case 0x38:
    303         case 0x39: case 0x3A: case 0x3B: case 0x3C: case 0x3D:
    304         case 0x3E: case 0x3F: case 0x40: case 0x41: case 0x42:
    305         case 0x43: case 0x44: case 0x45: case 0x46: case 0x47:
    306         case 0x48: case 0x49: case 0x4A: case 0x4B: case 0x4C:
    307         case 0x4D: case 0x4E: case 0x4F: case 0x50: case 0x51:
    308         case 0x52: case 0x53: case 0x54: case 0x55: case 0x56:
    309         case 0x57: case 0x58: case 0x59: case 0x5A: case 0x5B:
    310         case 0x5C: case 0x5D: case 0x5E: case 0x5F: case 0x60:
    311         case 0x61: case 0x62: case 0x63: case 0x64: case 0x65:
    312         case 0x66: case 0x67: case 0x68: case 0x69: case 0x6A:
    313         case 0x6B: case 0x6C: case 0x6D: case 0x6E: case 0x6F:
    314         case 0x70: case 0x71: case 0x72: case 0x73: case 0x74:
    315         case 0x75: case 0x76: case 0x77: case 0x78: case 0x79:
    316         case 0x7A: case 0x7B: case 0x7C: case 0x7D: case 0x7E:
    317         case 0x7F:
    318             charBuffer[ucPos++] = (char) aByte;
    319             break;
    320 
    321             // quote unicode
    322         case SQUOTEU:
    323             // verify we have two bytes following tag
    324             // if not, save state and break out
    325             if( (bytePos + 1) >= byteBufferLimit ) {
    326             --bytePos;
    327             System.arraycopy(byteBuffer, bytePos,
    328                      fBuffer, 0,
    329                      byteBufferLimit - bytePos);
    330             fBufferLength = byteBufferLimit - bytePos;
    331             bytePos += fBufferLength;
    332             break mainLoop;
    333             }
    334 
    335             aByte = byteBuffer[bytePos++];
    336             charBuffer[ucPos++] = (char)
    337             (aByte << 8 | (byteBuffer[bytePos++] & 0xFF));
    338             break;
    339 
    340             // switch to Unicode mode
    341         case SCHANGEU:
    342             fMode = UNICODEMODE;
    343             break singleByteModeLoop;
    344             //break;
    345 
    346             // handle all quote tags
    347         case SQUOTE0: case SQUOTE1: case SQUOTE2: case SQUOTE3:
    348         case SQUOTE4: case SQUOTE5: case SQUOTE6: case SQUOTE7:
    349             // verify there is a byte following the tag
    350             // if not, save state and break out
    351             if(bytePos >= byteBufferLimit) {
    352             --bytePos;
    353             System.arraycopy(byteBuffer, bytePos,
    354                      fBuffer, 0,
    355                      byteBufferLimit - bytePos);
    356             fBufferLength = byteBufferLimit - bytePos;
    357             bytePos += fBufferLength;
    358             break mainLoop;
    359             }
    360 
    361             // if the byte is in the range 0x00 - 0x7F, use
    362             // static window n otherwise, use dynamic window n
    363             int dByte = byteBuffer[bytePos++] & 0xFF;
    364             charBuffer[ucPos++] = (char)
    365             (dByte+ (dByte >= 0x00 && dByte < 0x80
    366                  ? sOffsets[aByte - SQUOTE0]
    367                  : (fOffsets[aByte - SQUOTE0]
    368                     - COMPRESSIONOFFSET)));
    369             break;
    370 
    371             // handle all change tags
    372         case SCHANGE0: case SCHANGE1: case SCHANGE2: case SCHANGE3:
    373         case SCHANGE4: case SCHANGE5: case SCHANGE6: case SCHANGE7:
    374             fCurrentWindow = aByte - SCHANGE0;
    375             break;
    376 
    377             // handle all define tags
    378         case SDEFINE0: case SDEFINE1: case SDEFINE2: case SDEFINE3:
    379         case SDEFINE4: case SDEFINE5: case SDEFINE6: case SDEFINE7:
    380             // verify there is a byte following the tag
    381             // if not, save state and break out
    382             if(bytePos >= byteBufferLimit) {
    383             --bytePos;
    384             System.arraycopy(byteBuffer, bytePos,
    385                      fBuffer, 0,
    386                      byteBufferLimit - bytePos);
    387             fBufferLength = byteBufferLimit - bytePos;
    388             bytePos += fBufferLength;
    389             break mainLoop;
    390             }
    391 
    392             fCurrentWindow = aByte - SDEFINE0;
    393             fOffsets[fCurrentWindow] =
    394             sOffsetTable[byteBuffer[bytePos++] & 0xFF];
    395             break;
    396 
    397             // handle define extended tag
    398         case SDEFINEX:
    399             // verify we have two bytes following tag
    400             // if not, save state and break out
    401             if((bytePos + 1) >= byteBufferLimit ) {
    402             --bytePos;
    403             System.arraycopy(byteBuffer, bytePos,
    404                      fBuffer, 0,
    405                      byteBufferLimit - bytePos);
    406             fBufferLength = byteBufferLimit - bytePos;
    407             bytePos += fBufferLength;
    408             break mainLoop;
    409             }
    410 
    411             aByte = byteBuffer[bytePos++] & 0xFF;
    412             fCurrentWindow = (aByte & 0xE0) >> 5;
    413             fOffsets[fCurrentWindow] = 0x10000 +
    414             (0x80 * (((aByte & 0x1F) << 8)
    415                  | (byteBuffer[bytePos++] & 0xFF)));
    416             break;
    417 
    418             // reserved, shouldn't happen
    419         case SRESERVED:
    420             break;
    421 
    422         } // end switch
    423         } // end while
    424         break;
    425 
    426         case UNICODEMODE:
    427         // unicode mode decompression loop
    428         unicodeModeLoop:
    429         while(bytePos < byteBufferLimit && ucPos < charBufferLimit) {
    430         aByte = byteBuffer[bytePos++] & 0xFF;
    431         switch(aByte) {
    432             // handle all define tags
    433         case UDEFINE0: case UDEFINE1: case UDEFINE2: case UDEFINE3:
    434         case UDEFINE4: case UDEFINE5: case UDEFINE6: case UDEFINE7:
    435             // verify there is a byte following tag
    436             // if not, save state and break out
    437             if(bytePos >= byteBufferLimit ) {
    438             --bytePos;
    439             System.arraycopy(byteBuffer, bytePos,
    440                      fBuffer, 0,
    441                      byteBufferLimit - bytePos);
    442             fBufferLength = byteBufferLimit - bytePos;
    443             bytePos += fBufferLength;
    444             break mainLoop;
    445             }
    446 
    447             fCurrentWindow = aByte - UDEFINE0;
    448             fOffsets[fCurrentWindow] =
    449             sOffsetTable[byteBuffer[bytePos++] & 0xFF];
    450             fMode = SINGLEBYTEMODE;
    451             break unicodeModeLoop;
    452             //break;
    453 
    454             // handle define extended tag
    455         case UDEFINEX:
    456             // verify we have two bytes following tag
    457             // if not, save state and break out
    458             if((bytePos + 1) >= byteBufferLimit ) {
    459             --bytePos;
    460             System.arraycopy(byteBuffer, bytePos,
    461                      fBuffer, 0,
    462                      byteBufferLimit - bytePos);
    463             fBufferLength = byteBufferLimit - bytePos;
    464             bytePos += fBufferLength;
    465             break mainLoop;
    466             }
    467 
    468             aByte = byteBuffer[bytePos++] & 0xFF;
    469             fCurrentWindow = (aByte & 0xE0) >> 5;
    470             fOffsets[fCurrentWindow] = 0x10000 +
    471             (0x80 * (((aByte & 0x1F) << 8)
    472                  | (byteBuffer[bytePos++] & 0xFF)));
    473             fMode = SINGLEBYTEMODE;
    474             break unicodeModeLoop;
    475             //break;
    476 
    477             // handle all change tags
    478         case UCHANGE0: case UCHANGE1: case UCHANGE2: case UCHANGE3:
    479         case UCHANGE4: case UCHANGE5: case UCHANGE6: case UCHANGE7:
    480             fCurrentWindow = aByte - UCHANGE0;
    481             fMode = SINGLEBYTEMODE;
    482             break unicodeModeLoop;
    483             //break;
    484 
    485             // quote unicode
    486         case UQUOTEU:
    487             // verify we have two bytes following tag
    488             // if not, save state and break out
    489             if(bytePos >= byteBufferLimit  - 1) {
    490             --bytePos;
    491             System.arraycopy(byteBuffer, bytePos,
    492                      fBuffer, 0,
    493                      byteBufferLimit - bytePos);
    494             fBufferLength = byteBufferLimit - bytePos;
    495             bytePos += fBufferLength;
    496             break mainLoop;
    497             }
    498 
    499             aByte = byteBuffer[bytePos++];
    500             charBuffer[ucPos++] = (char)
    501             (aByte << 8 | (byteBuffer[bytePos++] & 0xFF));
    502             break;
    503 
    504         default:
    505             // verify there is a byte following tag
    506             // if not, save state and break out
    507             if(bytePos >= byteBufferLimit ) {
    508             --bytePos;
    509             System.arraycopy(byteBuffer, bytePos,
    510                      fBuffer, 0,
    511                      byteBufferLimit - bytePos);
    512             fBufferLength = byteBufferLimit - bytePos;
    513             bytePos += fBufferLength;
    514             break mainLoop;
    515             }
    516 
    517             charBuffer[ucPos++] = (char)
    518             (aByte << 8 | (byteBuffer[bytePos++] & 0xFF));
    519             break;
    520 
    521         } // end switch
    522         } // end while
    523         break;
    524 
    525         } // end switch( fMode )
    526     } // end while
    527 
    528         // fill in output parameter
    529     if(bytesRead != null)
    530         bytesRead [0] = (bytePos - byteBufferStart);
    531 
    532         // return # of chars written
    533     return (ucPos - charBufferStart);
    534     }
    535 
    536     /**
    537      * Reset the decompressor to its initial state.
    538      * @stable ICU 2.4
    539      */
    540     public void reset()
    541     {
    542         // reset dynamic windows
    543         fOffsets[0] = 0x0080;    // Latin-1
    544         fOffsets[1] = 0x00C0;    // Latin-1 Supplement + Latin Extended-A
    545         fOffsets[2] = 0x0400;    // Cyrillic
    546         fOffsets[3] = 0x0600;    // Arabic
    547         fOffsets[4] = 0x0900;    // Devanagari
    548         fOffsets[5] = 0x3040;    // Hiragana
    549         fOffsets[6] = 0x30A0;    // Katakana
    550         fOffsets[7] = 0xFF00;    // Fullwidth ASCII
    551 
    552 
    553         fCurrentWindow  = 0;                // Make current window Latin-1
    554         fMode           = SINGLEBYTEMODE;   // Always start in single-byte mode
    555         fBufferLength   = 0;                // Empty buffer
    556     }
    557 }
    558