Home | History | Annotate | Download | only in util
      1 /*
      2  * Copyright (C) 2007 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 /*
     18  * As per the Apache license requirements, this file has been modified
     19  * from its original state.
     20  *
     21  * Such modifications are Copyright (C) 2010 Ben Gruver, and are released
     22  * under the original license
     23  */
     24 
     25 package org.jf.util;
     26 
     27 import javax.annotation.Nonnull;
     28 import javax.annotation.Nullable;
     29 
     30 /**
     31  * Constants of type <code>CONSTANT_Utf8_info</code>.
     32  */
     33 public final class Utf8Utils {
     34     /**
     35      * Converts a string into its Java-style UTF-8 form. Java-style UTF-8
     36      * differs from normal UTF-8 in the handling of character '\0' and
     37      * surrogate pairs.
     38      *
     39      * @param string non-null; the string to convert
     40      * @return non-null; the UTF-8 bytes for it
     41      */
     42     public static byte[] stringToUtf8Bytes(String string) {
     43         int len = string.length();
     44         byte[] bytes = new byte[len * 3]; // Avoid having to reallocate.
     45         int outAt = 0;
     46 
     47         for (int i = 0; i < len; i++) {
     48             char c = string.charAt(i);
     49             if ((c != 0) && (c < 0x80)) {
     50                 bytes[outAt] = (byte) c;
     51                 outAt++;
     52             } else if (c < 0x800) {
     53                 bytes[outAt] = (byte) (((c >> 6) & 0x1f) | 0xc0);
     54                 bytes[outAt + 1] = (byte) ((c & 0x3f) | 0x80);
     55                 outAt += 2;
     56             } else {
     57                 bytes[outAt] = (byte) (((c >> 12) & 0x0f) | 0xe0);
     58                 bytes[outAt + 1] = (byte) (((c >> 6) & 0x3f) | 0x80);
     59                 bytes[outAt + 2] = (byte) ((c & 0x3f) | 0x80);
     60                 outAt += 3;
     61             }
     62         }
     63 
     64         byte[] result = new byte[outAt];
     65         System.arraycopy(bytes, 0, result, 0, outAt);
     66         return result;
     67     }
     68 
     69     private static final ThreadLocal<char[]> localBuffer =
     70             new ThreadLocal<char[]> () {
     71                 @Override protected char[] initialValue() {
     72                     // A reasonably sized initial value
     73                     return new char[256];
     74                 }
     75             };
     76 
     77     /**
     78      * Converts an array of UTF-8 bytes into a string.
     79      *
     80      * @param bytes non-null; the bytes to convert
     81      * @param start the start index of the utf8 string to convert
     82      * @param length the length of the utf8 string to convert, not including any null-terminator that might be present
     83      * @return non-null; the converted string
     84      */
     85     public static String utf8BytesToString(byte[] bytes, int start, int length) {
     86         char[] chars = localBuffer.get();
     87         if (chars == null || chars.length < length) {
     88             chars = new char[length];
     89             localBuffer.set(chars);
     90         }
     91         int outAt = 0;
     92 
     93         for (int at = start; length > 0; /*at*/) {
     94             int v0 = bytes[at] & 0xFF;
     95             char out;
     96             switch (v0 >> 4) {
     97                 case 0x00: case 0x01: case 0x02: case 0x03:
     98                 case 0x04: case 0x05: case 0x06: case 0x07: {
     99                     // 0XXXXXXX -- single-byte encoding
    100                     length--;
    101                     if (v0 == 0) {
    102                         // A single zero byte is illegal.
    103                         return throwBadUtf8(v0, at);
    104                     }
    105                     out = (char) v0;
    106                     at++;
    107                     break;
    108                 }
    109                 case 0x0c: case 0x0d: {
    110                     // 110XXXXX -- two-byte encoding
    111                     length -= 2;
    112                     if (length < 0) {
    113                         return throwBadUtf8(v0, at);
    114                     }
    115                     int v1 = bytes[at + 1] & 0xFF;
    116                     if ((v1 & 0xc0) != 0x80) {
    117                         return throwBadUtf8(v1, at + 1);
    118                     }
    119                     int value = ((v0 & 0x1f) << 6) | (v1 & 0x3f);
    120                     if ((value != 0) && (value < 0x80)) {
    121                         /*
    122                          * This should have been represented with
    123                          * one-byte encoding.
    124                          */
    125                         return throwBadUtf8(v1, at + 1);
    126                     }
    127                     out = (char) value;
    128                     at += 2;
    129                     break;
    130                 }
    131                 case 0x0e: {
    132                     // 1110XXXX -- three-byte encoding
    133                     length -= 3;
    134                     if (length < 0) {
    135                         return throwBadUtf8(v0, at);
    136                     }
    137                     int v1 = bytes[at + 1] & 0xFF;
    138                     if ((v1 & 0xc0) != 0x80) {
    139                         return throwBadUtf8(v1, at + 1);
    140                     }
    141                     int v2 = bytes[at + 2] & 0xFF;
    142                     if ((v2 & 0xc0) != 0x80) {
    143                         return throwBadUtf8(v2, at + 2);
    144                     }
    145                     int value = ((v0 & 0x0f) << 12) | ((v1 & 0x3f) << 6) |
    146                             (v2 & 0x3f);
    147                     if (value < 0x800) {
    148                         /*
    149                          * This should have been represented with one- or
    150                          * two-byte encoding.
    151                          */
    152                         return throwBadUtf8(v2, at + 2);
    153                     }
    154                     out = (char) value;
    155                     at += 3;
    156                     break;
    157                 }
    158                 default: {
    159                     // 10XXXXXX, 1111XXXX -- illegal
    160                     return throwBadUtf8(v0, at);
    161                 }
    162             }
    163             chars[outAt] = out;
    164             outAt++;
    165         }
    166 
    167         return new String(chars, 0, outAt);
    168     }
    169 
    170     /**
    171      * Converts an array of UTF-8 bytes into a string.
    172      *
    173      * @param bytes non-null; the bytes to convert
    174      * @param start the start index of the utf8 string to convert
    175      * @param utf16Length the number of utf16 characters in the string to decode
    176      * @return non-null; the converted string
    177      */
    178     public static String utf8BytesWithUtf16LengthToString(@Nonnull byte[] bytes, int start, int utf16Length) {
    179         return utf8BytesWithUtf16LengthToString(bytes, start, utf16Length, null);
    180     }
    181 
    182     /**
    183      * Converts an array of UTF-8 bytes into a string.
    184      *
    185      * @param bytes non-null; the bytes to convert
    186      * @param start the start index of the utf8 string to convert
    187      * @param utf16Length the number of utf16 characters in the string to decode
    188      * @param readLength If non-null, the first element will contain the number of bytes read after the method exits
    189      * @return non-null; the converted string
    190      */
    191     public static String utf8BytesWithUtf16LengthToString(@Nonnull byte[] bytes, int start, int utf16Length,
    192                                                           @Nullable int[] readLength) {
    193         char[] chars = localBuffer.get();
    194         if (chars == null || chars.length < utf16Length) {
    195             chars = new char[utf16Length];
    196             localBuffer.set(chars);
    197         }
    198         int outAt = 0;
    199 
    200         int at = 0;
    201         for (at = start; utf16Length > 0; utf16Length--) {
    202             int v0 = bytes[at] & 0xFF;
    203             char out;
    204             switch (v0 >> 4) {
    205                 case 0x00: case 0x01: case 0x02: case 0x03:
    206                 case 0x04: case 0x05: case 0x06: case 0x07: {
    207                     // 0XXXXXXX -- single-byte encoding
    208                     if (v0 == 0) {
    209                         // A single zero byte is illegal.
    210                         return throwBadUtf8(v0, at);
    211                     }
    212                     out = (char) v0;
    213                     at++;
    214                     break;
    215                 }
    216                 case 0x0c: case 0x0d: {
    217                     // 110XXXXX -- two-byte encoding
    218                     int v1 = bytes[at + 1] & 0xFF;
    219                     if ((v1 & 0xc0) != 0x80) {
    220                         return throwBadUtf8(v1, at + 1);
    221                     }
    222                     int value = ((v0 & 0x1f) << 6) | (v1 & 0x3f);
    223                     if ((value != 0) && (value < 0x80)) {
    224                         /*
    225                          * This should have been represented with
    226                          * one-byte encoding.
    227                          */
    228                         return throwBadUtf8(v1, at + 1);
    229                     }
    230                     out = (char) value;
    231                     at += 2;
    232                     break;
    233                 }
    234                 case 0x0e: {
    235                     // 1110XXXX -- three-byte encoding
    236                     int v1 = bytes[at + 1] & 0xFF;
    237                     if ((v1 & 0xc0) != 0x80) {
    238                         return throwBadUtf8(v1, at + 1);
    239                     }
    240                     int v2 = bytes[at + 2] & 0xFF;
    241                     if ((v2 & 0xc0) != 0x80) {
    242                         return throwBadUtf8(v2, at + 2);
    243                     }
    244                     int value = ((v0 & 0x0f) << 12) | ((v1 & 0x3f) << 6) |
    245                         (v2 & 0x3f);
    246                     if (value < 0x800) {
    247                         /*
    248                          * This should have been represented with one- or
    249                          * two-byte encoding.
    250                          */
    251                         return throwBadUtf8(v2, at + 2);
    252                     }
    253                     out = (char) value;
    254                     at += 3;
    255                     break;
    256                 }
    257                 default: {
    258                     // 10XXXXXX, 1111XXXX -- illegal
    259                     return throwBadUtf8(v0, at);
    260                 }
    261             }
    262             chars[outAt] = out;
    263             outAt++;
    264         }
    265 
    266         if (readLength != null && readLength.length > 0) {
    267             readLength[0] = at - start;
    268             readLength[0] = at - start;
    269         }
    270         return new String(chars, 0, outAt);
    271     }
    272 
    273     /**
    274      * Helper for {@link #utf8BytesToString}, which throws the right
    275      * exception for a bogus utf-8 byte.
    276      *
    277      * @param value the byte value
    278      * @param offset the file offset
    279      * @return never
    280      * @throws IllegalArgumentException always thrown
    281      */
    282     private static String throwBadUtf8(int value, int offset) {
    283         throw new IllegalArgumentException("bad utf-8 byte " + Hex.u1(value) +
    284                                            " at offset " + Hex.u4(offset));
    285     }
    286 }
    287