Home | History | Annotate | Download | only in cst
      1 /*
      2  * Copyright (C) 2007 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.android.dexgen.rop.cst;
     18 
     19 import com.android.dexgen.util.ByteArray;
     20 import com.android.dexgen.util.Hex;
     21 
     22 /**
     23  * Constants of type {@code CONSTANT_Utf8_info}.
     24  */
     25 public final class CstUtf8 extends Constant {
     26     /**
     27      * {@code non-null;} instance representing {@code ""}, that is, the
     28      * empty string
     29      */
     30     public static final CstUtf8 EMPTY_STRING = new CstUtf8("");
     31 
     32     /** {@code non-null;} the UTF-8 value as a string */
     33     private final String string;
     34 
     35     /** {@code non-null;} the UTF-8 value as bytes */
     36     private final ByteArray bytes;
     37 
     38     /**
     39      * Converts a string into its Java-style UTF-8 form. Java-style UTF-8
     40      * differs from normal UTF-8 in the handling of character '\0' and
     41      * surrogate pairs.
     42      *
     43      * @param string {@code non-null;} the string to convert
     44      * @return {@code non-null;} the UTF-8 bytes for it
     45      */
     46     public static byte[] stringToUtf8Bytes(String string) {
     47         int len = string.length();
     48         byte[] bytes = new byte[len * 3]; // Avoid having to reallocate.
     49         int outAt = 0;
     50 
     51         for (int i = 0; i < len; i++) {
     52             char c = string.charAt(i);
     53             if ((c != 0) && (c < 0x80)) {
     54                 bytes[outAt] = (byte) c;
     55                 outAt++;
     56             } else if (c < 0x800) {
     57                 bytes[outAt] = (byte) (((c >> 6) & 0x1f) | 0xc0);
     58                 bytes[outAt + 1] = (byte) ((c & 0x3f) | 0x80);
     59                 outAt += 2;
     60             } else {
     61                 bytes[outAt] = (byte) (((c >> 12) & 0x0f) | 0xe0);
     62                 bytes[outAt + 1] = (byte) (((c >> 6) & 0x3f) | 0x80);
     63                 bytes[outAt + 2] = (byte) ((c & 0x3f) | 0x80);
     64                 outAt += 3;
     65             }
     66         }
     67 
     68         byte[] result = new byte[outAt];
     69         System.arraycopy(bytes, 0, result, 0, outAt);
     70         return result;
     71     }
     72 
     73     /**
     74      * Converts an array of UTF-8 bytes into a string.
     75      *
     76      * @param bytes {@code non-null;} the bytes to convert
     77      * @return {@code non-null;} the converted string
     78      */
     79     public static String utf8BytesToString(ByteArray bytes) {
     80         int length = bytes.size();
     81         char[] chars = new char[length]; // This is sized to avoid a realloc.
     82         int outAt = 0;
     83 
     84         for (int at = 0; length > 0; /*at*/) {
     85             int v0 = bytes.getUnsignedByte(at);
     86             char out;
     87             switch (v0 >> 4) {
     88                 case 0x00: case 0x01: case 0x02: case 0x03:
     89                 case 0x04: case 0x05: case 0x06: case 0x07: {
     90                     // 0XXXXXXX -- single-byte encoding
     91                     length--;
     92                     if (v0 == 0) {
     93                         // A single zero byte is illegal.
     94                         return throwBadUtf8(v0, at);
     95                     }
     96                     out = (char) v0;
     97                     at++;
     98                     break;
     99                 }
    100                 case 0x0c: case 0x0d: {
    101                     // 110XXXXX -- two-byte encoding
    102                     length -= 2;
    103                     if (length < 0) {
    104                         return throwBadUtf8(v0, at);
    105                     }
    106                     int v1 = bytes.getUnsignedByte(at + 1);
    107                     if ((v1 & 0xc0) != 0x80) {
    108                         return throwBadUtf8(v1, at + 1);
    109                     }
    110                     int value = ((v0 & 0x1f) << 6) | (v1 & 0x3f);
    111                     if ((value != 0) && (value < 0x80)) {
    112                         /*
    113                          * This should have been represented with
    114                          * one-byte encoding.
    115                          */
    116                         return throwBadUtf8(v1, at + 1);
    117                     }
    118                     out = (char) value;
    119                     at += 2;
    120                     break;
    121                 }
    122                 case 0x0e: {
    123                     // 1110XXXX -- three-byte encoding
    124                     length -= 3;
    125                     if (length < 0) {
    126                         return throwBadUtf8(v0, at);
    127                     }
    128                     int v1 = bytes.getUnsignedByte(at + 1);
    129                     if ((v1 & 0xc0) != 0x80) {
    130                         return throwBadUtf8(v1, at + 1);
    131                     }
    132                     int v2 = bytes.getUnsignedByte(at + 2);
    133                     if ((v1 & 0xc0) != 0x80) {
    134                         return throwBadUtf8(v2, at + 2);
    135                     }
    136                     int value = ((v0 & 0x0f) << 12) | ((v1 & 0x3f) << 6) |
    137                         (v2 & 0x3f);
    138                     if (value < 0x800) {
    139                         /*
    140                          * This should have been represented with one- or
    141                          * two-byte encoding.
    142                          */
    143                         return throwBadUtf8(v2, at + 2);
    144                     }
    145                     out = (char) value;
    146                     at += 3;
    147                     break;
    148                 }
    149                 default: {
    150                     // 10XXXXXX, 1111XXXX -- illegal
    151                     return throwBadUtf8(v0, at);
    152                 }
    153             }
    154             chars[outAt] = out;
    155             outAt++;
    156         }
    157 
    158         return new String(chars, 0, outAt);
    159     }
    160 
    161     /**
    162      * Helper for {@link #utf8BytesToString}, which throws the right
    163      * exception for a bogus utf-8 byte.
    164      *
    165      * @param value the byte value
    166      * @param offset the file offset
    167      * @return never
    168      * @throws IllegalArgumentException always thrown
    169      */
    170     private static String throwBadUtf8(int value, int offset) {
    171         throw new IllegalArgumentException("bad utf-8 byte " + Hex.u1(value) +
    172                                            " at offset " + Hex.u4(offset));
    173     }
    174 
    175     /**
    176      * Constructs an instance from a {@code String}.
    177      *
    178      * @param string {@code non-null;} the UTF-8 value as a string
    179      */
    180     public CstUtf8(String string) {
    181         if (string == null) {
    182             throw new NullPointerException("string == null");
    183         }
    184 
    185         this.string = string.intern();
    186         this.bytes = new ByteArray(stringToUtf8Bytes(string));
    187     }
    188 
    189     /**
    190      * Constructs an instance from some UTF-8 bytes.
    191      *
    192      * @param bytes {@code non-null;} array of the UTF-8 bytes
    193      */
    194     public CstUtf8(ByteArray bytes) {
    195         if (bytes == null) {
    196             throw new NullPointerException("bytes == null");
    197         }
    198 
    199         this.bytes = bytes;
    200         this.string = utf8BytesToString(bytes).intern();
    201     }
    202 
    203     /** {@inheritDoc} */
    204     @Override
    205     public boolean equals(Object other) {
    206         if (!(other instanceof CstUtf8)) {
    207             return false;
    208         }
    209 
    210         return string.equals(((CstUtf8) other).string);
    211     }
    212 
    213     /** {@inheritDoc} */
    214     @Override
    215     public int hashCode() {
    216         return string.hashCode();
    217     }
    218 
    219     /** {@inheritDoc} */
    220     @Override
    221     protected int compareTo0(Constant other) {
    222         return string.compareTo(((CstUtf8) other).string);
    223     }
    224 
    225     /** {@inheritDoc} */
    226     @Override
    227     public String toString() {
    228         return "utf8{\"" + toHuman() + "\"}";
    229     }
    230 
    231     /** {@inheritDoc} */
    232     @Override
    233     public String typeName() {
    234         return "utf8";
    235     }
    236 
    237     /** {@inheritDoc} */
    238     @Override
    239     public boolean isCategory2() {
    240         return false;
    241     }
    242 
    243     /** {@inheritDoc} */
    244     public String toHuman() {
    245         int len = string.length();
    246         StringBuilder sb = new StringBuilder(len * 3 / 2);
    247 
    248         for (int i = 0; i < len; i++) {
    249             char c = string.charAt(i);
    250             if ((c >= ' ') && (c < 0x7f)) {
    251                 if ((c == '\'') || (c == '\"') || (c == '\\')) {
    252                     sb.append('\\');
    253                 }
    254                 sb.append(c);
    255             } else if (c <= 0x7f) {
    256                 switch (c) {
    257                     case '\n': sb.append("\\n"); break;
    258                     case '\r': sb.append("\\r"); break;
    259                     case '\t': sb.append("\\t"); break;
    260                     default: {
    261                         /*
    262                          * Represent the character as an octal escape.
    263                          * If the next character is a valid octal
    264                          * digit, disambiguate by using the
    265                          * three-digit form.
    266                          */
    267                         char nextChar =
    268                             (i < (len - 1)) ? string.charAt(i + 1) : 0;
    269                         boolean displayZero =
    270                             (nextChar >= '0') && (nextChar <= '7');
    271                         sb.append('\\');
    272                         for (int shift = 6; shift >= 0; shift -= 3) {
    273                             char outChar = (char) (((c >> shift) & 7) + '0');
    274                             if ((outChar != '0') || displayZero) {
    275                                 sb.append(outChar);
    276                                 displayZero = true;
    277                             }
    278                         }
    279                         if (! displayZero) {
    280                             // Ironic edge case: The original value was 0.
    281                             sb.append('0');
    282                         }
    283                         break;
    284                     }
    285                 }
    286             } else {
    287                 sb.append("\\u");
    288                 sb.append(Character.forDigit(c >> 12, 16));
    289                 sb.append(Character.forDigit((c >> 8) & 0x0f, 16));
    290                 sb.append(Character.forDigit((c >> 4) & 0x0f, 16));
    291                 sb.append(Character.forDigit(c & 0x0f, 16));
    292             }
    293         }
    294 
    295         return sb.toString();
    296     }
    297 
    298     /**
    299      * Gets the value as a human-oriented string, surrounded by double
    300      * quotes.
    301      *
    302      * @return {@code non-null;} the quoted string
    303      */
    304     public String toQuoted() {
    305         return '\"' + toHuman() + '\"';
    306     }
    307 
    308     /**
    309      * Gets the value as a human-oriented string, surrounded by double
    310      * quotes, but ellipsizes the result if it is longer than the given
    311      * maximum length
    312      *
    313      * @param maxLength {@code >= 5;} the maximum length of the string to return
    314      * @return {@code non-null;} the quoted string
    315      */
    316     public String toQuoted(int maxLength) {
    317         String string = toHuman();
    318         int length = string.length();
    319         String ellipses;
    320 
    321         if (length <= (maxLength - 2)) {
    322             ellipses = "";
    323         } else {
    324             string = string.substring(0, maxLength - 5);
    325             ellipses = "...";
    326         }
    327 
    328         return '\"' + string + ellipses + '\"';
    329     }
    330 
    331     /**
    332      * Gets the UTF-8 value as a string.
    333      * The returned string is always already interned.
    334      *
    335      * @return {@code non-null;} the UTF-8 value as a string
    336      */
    337     public String getString() {
    338         return string;
    339     }
    340 
    341     /**
    342      * Gets the UTF-8 value as UTF-8 encoded bytes.
    343      *
    344      * @return {@code non-null;} an array of the UTF-8 bytes
    345      */
    346     public ByteArray getBytes() {
    347         return bytes;
    348     }
    349 
    350     /**
    351      * Gets the size of this instance as UTF-8 code points. That is,
    352      * get the number of bytes in the UTF-8 encoding of this instance.
    353      *
    354      * @return {@code >= 0;} the UTF-8 size
    355      */
    356     public int getUtf8Size() {
    357         return bytes.size();
    358     }
    359 
    360     /**
    361      * Gets the size of this instance as UTF-16 code points. That is,
    362      * get the number of 16-bit chars in the UTF-16 encoding of this
    363      * instance. This is the same as the {@code length} of the
    364      * Java {@code String} representation of this instance.
    365      *
    366      * @return {@code >= 0;} the UTF-16 size
    367      */
    368     public int getUtf16Size() {
    369         return string.length();
    370     }
    371 }
    372