Home | History | Annotate | Download | only in charset
      1 /*
      2  * Copyright (C) 2015 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License
     15  */
     16 
     17 package java.nio.charset;
     18 
     19 import java.io.UTFDataFormatException;
     20 
     21 /**
     22  * Encoding and decoding methods for Modified UTF-8
     23  *
     24  * <p>Modified UTF-8 is a simple variation of UTF-8 in which {@code \u0000} is encoded as
     25  * 0xc0 0x80 . This avoids the presence of bytes 0 in the output.
     26  *
     27  * @hide
     28  */
     29 public class ModifiedUtf8 {
     30 
     31     /**
     32      * Count the number of bytes in the modified UTF-8 representation of {@code s}.
     33      *
     34      * <p>Additionally, if {@code shortLength} is true, throw a {@code UTFDataFormatException} if
     35      * the size cannot be presented in an (unsigned) java short.
     36      */
     37     public static long countBytes(String s, boolean shortLength) throws UTFDataFormatException {
     38         long counter = 0;
     39         int strLen = s.length();
     40         for (int i = 0; i < strLen; i++) {
     41             char c = s.charAt(i);
     42             if (c < '\u0080') {
     43                 counter++;
     44                 if (c == '\u0000') {
     45                     counter++;
     46                 }
     47             } else if (c < '\u0800') {
     48                 counter += 2;
     49             } else {
     50                 counter += 3;
     51             }
     52         }
     53         // Allow up to the maximum value of an unsigned short (as the value is known to be
     54         // unsigned.
     55         if (shortLength && counter > 0xffff) {
     56             throw new UTFDataFormatException(
     57                     "Size of the encoded string doesn't fit in two bytes");
     58         }
     59         return counter;
     60     }
     61 
     62     /**
     63      * Encode {@code s} into {@code dst} starting at offset {@code offset}.
     64      *
     65      * <p>The output buffer is guaranteed to have enough space.
     66      */
     67     public static void encode(byte[] dst, int offset, String s) {
     68         int strLen = s.length();
     69         for (int i = 0; i < strLen; i++) {
     70             char c = s.charAt(i);
     71             if (c < '\u0080') {
     72                 if (c == 0) {
     73                     dst[offset++] = (byte) 0xc0;
     74                     dst[offset++] = (byte) 0x80;
     75                 } else {
     76                     dst[offset++] = (byte) c;
     77                 }
     78             } else if (c < '\u0800') {
     79                 dst[offset++] = (byte) ((c >>> 6) | 0xc0);
     80                 dst[offset++] = (byte) ((c & 0x3f) | 0x80);
     81             } else {
     82                 dst[offset++] = (byte) ((c >>> 12) | 0xe0);
     83                 dst[offset++] = (byte) (((c >>> 6) & 0x3f) | 0x80);
     84                 dst[offset++] = (byte) ((c & 0x3f) | 0x80);
     85             }
     86         }
     87     }
     88 
     89     /**
     90      * Encodes {@code s} into a buffer with the following format:
     91      *
     92      * <p>- the first two bytes of the buffer are the length of the modified-utf8 output
     93      * (as a big endian short. A UTFDataFormatException is thrown if the encoded size cannot be
     94      * represented as a short.
     95      *
     96      * <p>- the remainder of the buffer contains the modified-utf8 output (equivalent to
     97      * {@code encode(buf, 2, s)}).
     98      */
     99     public static byte[] encode(String s) throws UTFDataFormatException {
    100         long size = countBytes(s, true);
    101         byte[] output = new byte[(int) size + 2];
    102         encode(output, 2, s);
    103         output[0] = (byte) (size >>> 8);
    104         output[1] = (byte) size;
    105         return output;
    106     }
    107 
    108     /**
    109      * Decodes {@code length} utf-8 bytes from {@code in} starting at offset {@code offset} to
    110      * {@code out},
    111      *
    112      * <p>A maximum of {@code length} chars are written to the output starting at offset 0.
    113      * {@code out} is assumed to have enough space for the output (a standard
    114      * {@code ArrayIndexOutOfBoundsException} is thrown otherwise).
    115      *
    116      * <p>If a 0 byte is encountered, it is converted to U+0000.
    117      */
    118     public static String decode(byte[] in, char[] out, int offset, int length)
    119             throws UTFDataFormatException {
    120         if (offset < 0 || length < 0) {
    121             throw new IllegalArgumentException("Illegal arguments: offset " + offset
    122                     + ". Length: " + length);
    123         }
    124         int outputIndex = 0;
    125         int limitIndex = offset + length;
    126         while (offset < limitIndex) {
    127             int i = in[offset] & 0xff;
    128             offset++;
    129             if (i < 0x80) {
    130                 out[outputIndex] = (char) i;
    131                 outputIndex++;
    132                 continue;
    133             }
    134             if (0xc0 <= i && i < 0xe0) {
    135                 // This branch covers the case 0 = 0xc080.
    136 
    137                 // The result is: 5 least-significant bits of i + 6 l-s bits of next input byte.
    138                 i = (i & 0x1f) << 6;
    139                 if(offset == limitIndex) {
    140                     throw new UTFDataFormatException("unexpected end of input");
    141                 }
    142                 // Include 6 least-significant bits of the input byte.
    143                 if ((in[offset] & 0xc0) != 0x80) {
    144                     throw new UTFDataFormatException("bad second byte at " + offset);
    145                 }
    146                 out[outputIndex] = (char) (i | (in[offset] & 0x3f));
    147                 offset++;
    148                 outputIndex++;
    149             } else if(i < 0xf0) {
    150                 // The result is: 5 least-significant bits of i + 6 l-s bits of next input byte
    151                 // + 6 l-s of next to next input byte.
    152                 i = (i & 0x1f) << 12;
    153                 // Make sure there are are at least two bytes left.
    154                 if (offset + 1 >= limitIndex) {
    155                     throw new UTFDataFormatException("unexpected end of input");
    156                 }
    157                 // Include 6 least-significant bits of the input byte, with 6 bits of room
    158                 // for the next byte.
    159                 if ((in[offset] & 0xc0) != 0x80) {
    160                     throw new UTFDataFormatException("bad second byte at " + offset);
    161                 }
    162                 i = i | (in[offset] & 0x3f) << 6;
    163                 offset++;
    164                 // Include 6 least-significant bits of the input byte.
    165                 if ((in[offset] & 0xc0) != 0x80) {
    166                     throw new UTFDataFormatException("bad third byte at " + offset);
    167                 }
    168                 out[outputIndex] = (char) (i | (in[offset] & 0x3f));
    169                 offset++;
    170                 outputIndex++;
    171             } else {
    172                 throw new UTFDataFormatException("Invalid UTF8 byte "
    173                         + (int) i + " at position " + (offset - 1));
    174             }
    175         }
    176         return String.valueOf(out, 0, outputIndex);
    177     }
    178 }
    179