Home | History | Annotate | Download | only in serializer
      1 /*
      2  * Licensed to the Apache Software Foundation (ASF) under one
      3  * or more contributor license agreements. See the NOTICE file
      4  * distributed with this work for additional information
      5  * regarding copyright ownership. The ASF licenses this file
      6  * to you under the Apache License, Version 2.0 (the  "License");
      7  * you may not use this file except in compliance with the License.
      8  * You may obtain a copy of the License at
      9  *
     10  *     http://www.apache.org/licenses/LICENSE-2.0
     11  *
     12  * Unless required by applicable law or agreed to in writing, software
     13  * distributed under the License is distributed on an "AS IS" BASIS,
     14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     15  * See the License for the specific language governing permissions and
     16  * limitations under the License.
     17  */
     18 /*
     19  * $Id: WriterToUTF8Buffered.java 469356 2006-10-31 03:20:34Z minchau $
     20  */
     21 package org.apache.xml.serializer;
     22 
     23 import java.io.IOException;
     24 import java.io.OutputStream;
     25 import java.io.UnsupportedEncodingException;
     26 import java.io.Writer;
     27 
     28 
     29 /**
     30  * This class writes unicode characters to a byte stream (java.io.OutputStream)
     31  * as quickly as possible. It buffers the output in an internal
     32  * buffer which must be flushed to the OutputStream when done. This flushing
     33  * is done via the close() flush() or flushBuffer() method.
     34  *
     35  * This class is only used internally within Xalan.
     36  *
     37  * @xsl.usage internal
     38  */
     39 final class WriterToUTF8Buffered extends Writer implements WriterChain
     40 {
     41 
     42   /** number of bytes that the byte buffer can hold.
     43    * This is a fixed constant is used rather than m_outputBytes.lenght for performance.
     44    */
     45   private static final int BYTES_MAX=16*1024;
     46   /** number of characters that the character buffer can hold.
     47    * This is 1/3 of the number of bytes because UTF-8 encoding
     48    * can expand one unicode character by up to 3 bytes.
     49    */
     50   private static final int CHARS_MAX=(BYTES_MAX/3);
     51 
     52  // private static final int
     53 
     54   /** The byte stream to write to. (sc & sb remove final to compile in JDK 1.1.8) */
     55   private final OutputStream m_os;
     56 
     57   /**
     58    * The internal buffer where data is stored.
     59    * (sc & sb remove final to compile in JDK 1.1.8)
     60    */
     61   private final byte m_outputBytes[];
     62 
     63   private final char m_inputChars[];
     64 
     65   /**
     66    * The number of valid bytes in the buffer. This value is always
     67    * in the range <tt>0</tt> through <tt>m_outputBytes.length</tt>; elements
     68    * <tt>m_outputBytes[0]</tt> through <tt>m_outputBytes[count-1]</tt> contain valid
     69    * byte data.
     70    */
     71   private int count;
     72 
     73   /**
     74    * Create an buffered UTF-8 writer.
     75    *
     76    *
     77    * @param   out    the underlying output stream.
     78    *
     79    * @throws UnsupportedEncodingException
     80    */
     81   public WriterToUTF8Buffered(OutputStream out)
     82   {
     83       m_os = out;
     84       // get 3 extra bytes to make buffer overflow checking simpler and faster
     85       // we won't have to keep checking for a few extra characters
     86       m_outputBytes = new byte[BYTES_MAX + 3];
     87 
     88       // Big enough to hold the input chars that will be transformed
     89       // into output bytes in m_ouputBytes.
     90       m_inputChars = new char[CHARS_MAX + 2];
     91       count = 0;
     92 
     93 //      the old body of this constructor, before the buffersize was changed to a constant
     94 //      this(out, 8*1024);
     95   }
     96 
     97   /**
     98    * Create an buffered UTF-8 writer to write data to the
     99    * specified underlying output stream with the specified buffer
    100    * size.
    101    *
    102    * @param   out    the underlying output stream.
    103    * @param   size   the buffer size.
    104    * @exception IllegalArgumentException if size <= 0.
    105    */
    106 //  public WriterToUTF8Buffered(final OutputStream out, final int size)
    107 //  {
    108 //
    109 //    m_os = out;
    110 //
    111 //    if (size <= 0)
    112 //    {
    113 //      throw new IllegalArgumentException(
    114 //        SerializerMessages.createMessage(SerializerErrorResources.ER_BUFFER_SIZE_LESSTHAN_ZERO, null)); //"Buffer size <= 0");
    115 //    }
    116 //
    117 //    m_outputBytes = new byte[size];
    118 //    count = 0;
    119 //  }
    120 
    121   /**
    122    * Write a single character.  The character to be written is contained in
    123    * the 16 low-order bits of the given integer value; the 16 high-order bits
    124    * are ignored.
    125    *
    126    * <p> Subclasses that intend to support efficient single-character output
    127    * should override this method.
    128    *
    129    * @param c  int specifying a character to be written.
    130    * @exception  IOException  If an I/O error occurs
    131    */
    132   public void write(final int c) throws IOException
    133   {
    134 
    135     /* If we are close to the end of the buffer then flush it.
    136      * Remember the buffer can hold a few more bytes than BYTES_MAX
    137      */
    138     if (count >= BYTES_MAX)
    139         flushBuffer();
    140 
    141     if (c < 0x80)
    142     {
    143        m_outputBytes[count++] = (byte) (c);
    144     }
    145     else if (c < 0x800)
    146     {
    147       m_outputBytes[count++] = (byte) (0xc0 + (c >> 6));
    148       m_outputBytes[count++] = (byte) (0x80 + (c & 0x3f));
    149     }
    150     else if (c < 0x10000)
    151     {
    152       m_outputBytes[count++] = (byte) (0xe0 + (c >> 12));
    153       m_outputBytes[count++] = (byte) (0x80 + ((c >> 6) & 0x3f));
    154       m_outputBytes[count++] = (byte) (0x80 + (c & 0x3f));
    155     }
    156 	else
    157 	{
    158 	  m_outputBytes[count++] = (byte) (0xf0 + (c >> 18));
    159 	  m_outputBytes[count++] = (byte) (0x80 + ((c >> 12) & 0x3f));
    160 	  m_outputBytes[count++] = (byte) (0x80 + ((c >> 6) & 0x3f));
    161 	  m_outputBytes[count++] = (byte) (0x80 + (c & 0x3f));
    162 	}
    163 
    164   }
    165 
    166 
    167   /**
    168    * Write a portion of an array of characters.
    169    *
    170    * @param  chars  Array of characters
    171    * @param  start   Offset from which to start writing characters
    172    * @param  length   Number of characters to write
    173    *
    174    * @exception  IOException  If an I/O error occurs
    175    *
    176    * @throws java.io.IOException
    177    */
    178   public void write(final char chars[], final int start, final int length)
    179           throws java.io.IOException
    180   {
    181 
    182     // We multiply the length by three since this is the maximum length
    183     // of the characters that we can put into the buffer.  It is possible
    184     // for each Unicode character to expand to three bytes.
    185 
    186     int lengthx3 = 3*length;
    187 
    188     if (lengthx3 >= BYTES_MAX - count)
    189     {
    190       // The requested length is greater than the unused part of the buffer
    191       flushBuffer();
    192 
    193       if (lengthx3 > BYTES_MAX)
    194       {
    195         /*
    196          * The requested length exceeds the size of the buffer.
    197          * Cut the buffer up into chunks, each of which will
    198          * not cause an overflow to the output buffer m_outputBytes,
    199          * and make multiple recursive calls.
    200          * Be careful about integer overflows in multiplication.
    201          */
    202         int split = length/CHARS_MAX;
    203         final int chunks;
    204         if (length % CHARS_MAX > 0)
    205             chunks = split + 1;
    206         else
    207             chunks = split;
    208         int end_chunk = start;
    209         for (int chunk = 1; chunk <= chunks; chunk++)
    210         {
    211             int start_chunk = end_chunk;
    212             end_chunk = start + (int) ((((long) length) * chunk) / chunks);
    213 
    214             // Adjust the end of the chunk if it ends on a high char
    215             // of a Unicode surrogate pair and low char of the pair
    216             // is not going to be in the same chunk
    217             final char c = chars[end_chunk - 1];
    218             int ic = chars[end_chunk - 1];
    219             if (c >= 0xD800 && c <= 0xDBFF) {
    220                 // The last Java char that we were going
    221                 // to process is the first of a
    222                 // Java surrogate char pair that
    223                 // represent a Unicode character.
    224 
    225                 if (end_chunk < start + length) {
    226                     // Avoid spanning by including the low
    227                     // char in the current chunk of chars.
    228                     end_chunk++;
    229                 } else {
    230                     /* This is the last char of the last chunk,
    231                      * and it is the high char of a high/low pair with
    232                      * no low char provided.
    233                      * TODO: error message needed.
    234                      * The char array incorrectly ends in a high char
    235                      * of a high/low surrogate pair, but there is
    236                      * no corresponding low as the high is the last char
    237                      */
    238                     end_chunk--;
    239                 }
    240             }
    241 
    242 
    243             int len_chunk = (end_chunk - start_chunk);
    244             this.write(chars,start_chunk, len_chunk);
    245         }
    246         return;
    247       }
    248     }
    249 
    250 
    251 
    252     final int n = length+start;
    253     final byte[] buf_loc = m_outputBytes; // local reference for faster access
    254     int count_loc = count;      // local integer for faster access
    255     int i = start;
    256     {
    257         /* This block could be omitted and the code would produce
    258          * the same result. But this block exists to give the JIT
    259          * a better chance of optimizing a tight and common loop which
    260          * occurs when writing out ASCII characters.
    261          */
    262         char c;
    263         for(; i < n && (c = chars[i])< 0x80 ; i++ )
    264             buf_loc[count_loc++] = (byte)c;
    265     }
    266     for (; i < n; i++)
    267     {
    268 
    269       final char c = chars[i];
    270 
    271       if (c < 0x80)
    272         buf_loc[count_loc++] = (byte) (c);
    273       else if (c < 0x800)
    274       {
    275         buf_loc[count_loc++] = (byte) (0xc0 + (c >> 6));
    276         buf_loc[count_loc++] = (byte) (0x80 + (c & 0x3f));
    277       }
    278       /**
    279         * The following else if condition is added to support XML 1.1 Characters for
    280         * UTF-8:   [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
    281         * Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
    282         *          [1101 11yy] [yyxx xxxx] (low surrogate)
    283         *          * uuuuu = wwww + 1
    284         */
    285       else if (c >= 0xD800 && c <= 0xDBFF)
    286       {
    287           char high, low;
    288           high = c;
    289           i++;
    290           low = chars[i];
    291 
    292           buf_loc[count_loc++] = (byte) (0xF0 | (((high + 0x40) >> 8) & 0xf0));
    293           buf_loc[count_loc++] = (byte) (0x80 | (((high + 0x40) >> 2) & 0x3f));
    294           buf_loc[count_loc++] = (byte) (0x80 | ((low >> 6) & 0x0f) + ((high << 4) & 0x30));
    295           buf_loc[count_loc++] = (byte) (0x80 | (low & 0x3f));
    296       }
    297       else
    298       {
    299         buf_loc[count_loc++] = (byte) (0xe0 + (c >> 12));
    300         buf_loc[count_loc++] = (byte) (0x80 + ((c >> 6) & 0x3f));
    301         buf_loc[count_loc++] = (byte) (0x80 + (c & 0x3f));
    302       }
    303     }
    304     // Store the local integer back into the instance variable
    305     count = count_loc;
    306 
    307   }
    308 
    309   /**
    310    * Write a string.
    311    *
    312    * @param  s  String to be written
    313    *
    314    * @exception  IOException  If an I/O error occurs
    315    */
    316   public void write(final String s) throws IOException
    317   {
    318 
    319     // We multiply the length by three since this is the maximum length
    320     // of the characters that we can put into the buffer.  It is possible
    321     // for each Unicode character to expand to three bytes.
    322     final int length = s.length();
    323     int lengthx3 = 3*length;
    324 
    325     if (lengthx3 >= BYTES_MAX - count)
    326     {
    327       // The requested length is greater than the unused part of the buffer
    328       flushBuffer();
    329 
    330       if (lengthx3 > BYTES_MAX)
    331       {
    332         /*
    333          * The requested length exceeds the size of the buffer,
    334          * so break it up in chunks that don't exceed the buffer size.
    335          */
    336          final int start = 0;
    337          int split = length/CHARS_MAX;
    338          final int chunks;
    339          if (length % CHARS_MAX > 0)
    340              chunks = split + 1;
    341          else
    342              chunks = split;
    343          int end_chunk = 0;
    344          for (int chunk = 1; chunk <= chunks; chunk++)
    345          {
    346              int start_chunk = end_chunk;
    347              end_chunk = start + (int) ((((long) length) * chunk) / chunks);
    348              s.getChars(start_chunk,end_chunk, m_inputChars,0);
    349              int len_chunk = (end_chunk - start_chunk);
    350 
    351              // Adjust the end of the chunk if it ends on a high char
    352              // of a Unicode surrogate pair and low char of the pair
    353              // is not going to be in the same chunk
    354              final char c = m_inputChars[len_chunk - 1];
    355              if (c >= 0xD800 && c <= 0xDBFF) {
    356                  // Exclude char in this chunk,
    357                  // to avoid spanning a Unicode character
    358                  // that is in two Java chars as a high/low surrogate
    359                  end_chunk--;
    360                  len_chunk--;
    361                  if (chunk == chunks) {
    362                      /* TODO: error message needed.
    363                       * The String incorrectly ends in a high char
    364                       * of a high/low surrogate pair, but there is
    365                       * no corresponding low as the high is the last char
    366                       * Recover by ignoring this last char.
    367                       */
    368                  }
    369              }
    370 
    371              this.write(m_inputChars,0, len_chunk);
    372          }
    373          return;
    374       }
    375     }
    376 
    377 
    378     s.getChars(0, length , m_inputChars, 0);
    379     final char[] chars = m_inputChars;
    380     final int n = length;
    381     final byte[] buf_loc = m_outputBytes; // local reference for faster access
    382     int count_loc = count;      // local integer for faster access
    383     int i = 0;
    384     {
    385         /* This block could be omitted and the code would produce
    386          * the same result. But this block exists to give the JIT
    387          * a better chance of optimizing a tight and common loop which
    388          * occurs when writing out ASCII characters.
    389          */
    390         char c;
    391         for(; i < n && (c = chars[i])< 0x80 ; i++ )
    392             buf_loc[count_loc++] = (byte)c;
    393     }
    394     for (; i < n; i++)
    395     {
    396 
    397       final char c = chars[i];
    398 
    399       if (c < 0x80)
    400         buf_loc[count_loc++] = (byte) (c);
    401       else if (c < 0x800)
    402       {
    403         buf_loc[count_loc++] = (byte) (0xc0 + (c >> 6));
    404         buf_loc[count_loc++] = (byte) (0x80 + (c & 0x3f));
    405       }
    406     /**
    407       * The following else if condition is added to support XML 1.1 Characters for
    408       * UTF-8:   [1111 0uuu] [10uu zzzz] [10yy yyyy] [10xx xxxx]*
    409       * Unicode: [1101 10ww] [wwzz zzyy] (high surrogate)
    410       *          [1101 11yy] [yyxx xxxx] (low surrogate)
    411       *          * uuuuu = wwww + 1
    412       */
    413     else if (c >= 0xD800 && c <= 0xDBFF)
    414     {
    415         char high, low;
    416         high = c;
    417         i++;
    418         low = chars[i];
    419 
    420         buf_loc[count_loc++] = (byte) (0xF0 | (((high + 0x40) >> 8) & 0xf0));
    421         buf_loc[count_loc++] = (byte) (0x80 | (((high + 0x40) >> 2) & 0x3f));
    422         buf_loc[count_loc++] = (byte) (0x80 | ((low >> 6) & 0x0f) + ((high << 4) & 0x30));
    423         buf_loc[count_loc++] = (byte) (0x80 | (low & 0x3f));
    424     }
    425       else
    426       {
    427         buf_loc[count_loc++] = (byte) (0xe0 + (c >> 12));
    428         buf_loc[count_loc++] = (byte) (0x80 + ((c >> 6) & 0x3f));
    429         buf_loc[count_loc++] = (byte) (0x80 + (c & 0x3f));
    430       }
    431     }
    432     // Store the local integer back into the instance variable
    433     count = count_loc;
    434 
    435   }
    436 
    437   /**
    438    * Flush the internal buffer
    439    *
    440    * @throws IOException
    441    */
    442   public void flushBuffer() throws IOException
    443   {
    444 
    445     if (count > 0)
    446     {
    447       m_os.write(m_outputBytes, 0, count);
    448 
    449       count = 0;
    450     }
    451   }
    452 
    453   /**
    454    * Flush the stream.  If the stream has saved any characters from the
    455    * various write() methods in a buffer, write them immediately to their
    456    * intended destination.  Then, if that destination is another character or
    457    * byte stream, flush it.  Thus one flush() invocation will flush all the
    458    * buffers in a chain of Writers and OutputStreams.
    459    *
    460    * @exception  IOException  If an I/O error occurs
    461    *
    462    * @throws java.io.IOException
    463    */
    464   public void flush() throws java.io.IOException
    465   {
    466     flushBuffer();
    467     m_os.flush();
    468   }
    469 
    470   /**
    471    * Close the stream, flushing it first.  Once a stream has been closed,
    472    * further write() or flush() invocations will cause an IOException to be
    473    * thrown.  Closing a previously-closed stream, however, has no effect.
    474    *
    475    * @exception  IOException  If an I/O error occurs
    476    *
    477    * @throws java.io.IOException
    478    */
    479   public void close() throws java.io.IOException
    480   {
    481     flushBuffer();
    482     m_os.close();
    483   }
    484 
    485   /**
    486    * Get the output stream where the events will be serialized to.
    487    *
    488    * @return reference to the result stream, or null of only a writer was
    489    * set.
    490    */
    491   public OutputStream getOutputStream()
    492   {
    493     return m_os;
    494   }
    495 
    496   public Writer getWriter()
    497   {
    498     // Only one of getWriter() or getOutputStream() can return null
    499     // This type of writer wraps an OutputStream, not a Writer.
    500     return null;
    501   }
    502 }
    503