Home | History | Annotate | Download | only in util
      1 /*
      2  * Copyright (C) 2011 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.android.dx.util;
     18 
     19 import java.io.UTFDataFormatException;
     20 
     21 /**
     22  * Modified UTF-8 as described in the dex file format spec.
     23  *
     24  * <p>Derived from libcore's MUTF-8 encoder at java.nio.charset.ModifiedUtf8.
     25  */
     26 public final class Mutf8 {
     27     private Mutf8() {}
     28 
     29     /**
     30      * Decodes bytes from {@code in} into {@code out} until a delimiter 0x00 is
     31      * encountered. Returns a new string containing the decoded characters.
     32      */
     33     public static String decode(ByteInput in, char[] out) throws UTFDataFormatException {
     34         int s = 0;
     35         while (true) {
     36             char a = (char) (in.readByte() & 0xff);
     37             if (a == 0) {
     38                 return new String(out, 0, s);
     39             }
     40             out[s] = a;
     41             if (a < '\u0080') {
     42                 s++;
     43             } else if ((a & 0xe0) == 0xc0) {
     44                 int b = in.readByte() & 0xff;
     45                 if ((b & 0xC0) != 0x80) {
     46                     throw new UTFDataFormatException("bad second byte");
     47                 }
     48                 out[s++] = (char) (((a & 0x1F) << 6) | (b & 0x3F));
     49             } else if ((a & 0xf0) == 0xe0) {
     50                 int b = in.readByte() & 0xff;
     51                 int c = in.readByte() & 0xff;
     52                 if (((b & 0xC0) != 0x80) || ((c & 0xC0) != 0x80)) {
     53                     throw new UTFDataFormatException("bad second or third byte");
     54                 }
     55                 out[s++] = (char) (((a & 0x0F) << 12) | ((b & 0x3F) << 6) | (c & 0x3F));
     56             } else {
     57                 throw new UTFDataFormatException("bad byte");
     58             }
     59         }
     60     }
     61 
     62     /**
     63      * Returns the number of bytes the modified UTF8 representation of 's' would take.
     64      */
     65     private static long countBytes(String s, boolean shortLength) throws UTFDataFormatException {
     66         long result = 0;
     67         final int length = s.length();
     68         for (int i = 0; i < length; ++i) {
     69             char ch = s.charAt(i);
     70             if (ch != 0 && ch <= 127) { // U+0000 uses two bytes.
     71                 ++result;
     72             } else if (ch <= 2047) {
     73                 result += 2;
     74             } else {
     75                 result += 3;
     76             }
     77             if (shortLength && result > 65535) {
     78                 throw new UTFDataFormatException("String more than 65535 UTF bytes long");
     79             }
     80         }
     81         return result;
     82     }
     83 
     84     /**
     85      * Encodes the modified UTF-8 bytes corresponding to {@code s} into  {@code
     86      * dst}, starting at {@code offset}.
     87      */
     88     public static void encode(byte[] dst, int offset, String s) {
     89         final int length = s.length();
     90         for (int i = 0; i < length; i++) {
     91             char ch = s.charAt(i);
     92             if (ch != 0 && ch <= 127) { // U+0000 uses two bytes.
     93                 dst[offset++] = (byte) ch;
     94             } else if (ch <= 2047) {
     95                 dst[offset++] = (byte) (0xc0 | (0x1f & (ch >> 6)));
     96                 dst[offset++] = (byte) (0x80 | (0x3f & ch));
     97             } else {
     98                 dst[offset++] = (byte) (0xe0 | (0x0f & (ch >> 12)));
     99                 dst[offset++] = (byte) (0x80 | (0x3f & (ch >> 6)));
    100                 dst[offset++] = (byte) (0x80 | (0x3f & ch));
    101             }
    102         }
    103     }
    104 
    105     /**
    106      * Returns an array containing the <i>modified UTF-8</i> form of {@code s}.
    107      */
    108     public static byte[] encode(String s) throws UTFDataFormatException {
    109         int utfCount = (int) countBytes(s, true);
    110         byte[] result = new byte[utfCount];
    111         encode(result, 0, s);
    112         return result;
    113     }
    114 }
    115