1 /* 2 * Copyright (C) 2007 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.android.dexgen.rop.cst; 18 19 import com.android.dexgen.util.ByteArray; 20 import com.android.dexgen.util.Hex; 21 22 /** 23 * Constants of type {@code CONSTANT_Utf8_info}. 24 */ 25 public final class CstUtf8 extends Constant { 26 /** 27 * {@code non-null;} instance representing {@code ""}, that is, the 28 * empty string 29 */ 30 public static final CstUtf8 EMPTY_STRING = new CstUtf8(""); 31 32 /** {@code non-null;} the UTF-8 value as a string */ 33 private final String string; 34 35 /** {@code non-null;} the UTF-8 value as bytes */ 36 private final ByteArray bytes; 37 38 /** 39 * Converts a string into its Java-style UTF-8 form. Java-style UTF-8 40 * differs from normal UTF-8 in the handling of character '\0' and 41 * surrogate pairs. 42 * 43 * @param string {@code non-null;} the string to convert 44 * @return {@code non-null;} the UTF-8 bytes for it 45 */ 46 public static byte[] stringToUtf8Bytes(String string) { 47 int len = string.length(); 48 byte[] bytes = new byte[len * 3]; // Avoid having to reallocate. 49 int outAt = 0; 50 51 for (int i = 0; i < len; i++) { 52 char c = string.charAt(i); 53 if ((c != 0) && (c < 0x80)) { 54 bytes[outAt] = (byte) c; 55 outAt++; 56 } else if (c < 0x800) { 57 bytes[outAt] = (byte) (((c >> 6) & 0x1f) | 0xc0); 58 bytes[outAt + 1] = (byte) ((c & 0x3f) | 0x80); 59 outAt += 2; 60 } else { 61 bytes[outAt] = (byte) (((c >> 12) & 0x0f) | 0xe0); 62 bytes[outAt + 1] = (byte) (((c >> 6) & 0x3f) | 0x80); 63 bytes[outAt + 2] = (byte) ((c & 0x3f) | 0x80); 64 outAt += 3; 65 } 66 } 67 68 byte[] result = new byte[outAt]; 69 System.arraycopy(bytes, 0, result, 0, outAt); 70 return result; 71 } 72 73 /** 74 * Converts an array of UTF-8 bytes into a string. 75 * 76 * @param bytes {@code non-null;} the bytes to convert 77 * @return {@code non-null;} the converted string 78 */ 79 public static String utf8BytesToString(ByteArray bytes) { 80 int length = bytes.size(); 81 char[] chars = new char[length]; // This is sized to avoid a realloc. 82 int outAt = 0; 83 84 for (int at = 0; length > 0; /*at*/) { 85 int v0 = bytes.getUnsignedByte(at); 86 char out; 87 switch (v0 >> 4) { 88 case 0x00: case 0x01: case 0x02: case 0x03: 89 case 0x04: case 0x05: case 0x06: case 0x07: { 90 // 0XXXXXXX -- single-byte encoding 91 length--; 92 if (v0 == 0) { 93 // A single zero byte is illegal. 94 return throwBadUtf8(v0, at); 95 } 96 out = (char) v0; 97 at++; 98 break; 99 } 100 case 0x0c: case 0x0d: { 101 // 110XXXXX -- two-byte encoding 102 length -= 2; 103 if (length < 0) { 104 return throwBadUtf8(v0, at); 105 } 106 int v1 = bytes.getUnsignedByte(at + 1); 107 if ((v1 & 0xc0) != 0x80) { 108 return throwBadUtf8(v1, at + 1); 109 } 110 int value = ((v0 & 0x1f) << 6) | (v1 & 0x3f); 111 if ((value != 0) && (value < 0x80)) { 112 /* 113 * This should have been represented with 114 * one-byte encoding. 115 */ 116 return throwBadUtf8(v1, at + 1); 117 } 118 out = (char) value; 119 at += 2; 120 break; 121 } 122 case 0x0e: { 123 // 1110XXXX -- three-byte encoding 124 length -= 3; 125 if (length < 0) { 126 return throwBadUtf8(v0, at); 127 } 128 int v1 = bytes.getUnsignedByte(at + 1); 129 if ((v1 & 0xc0) != 0x80) { 130 return throwBadUtf8(v1, at + 1); 131 } 132 int v2 = bytes.getUnsignedByte(at + 2); 133 if ((v1 & 0xc0) != 0x80) { 134 return throwBadUtf8(v2, at + 2); 135 } 136 int value = ((v0 & 0x0f) << 12) | ((v1 & 0x3f) << 6) | 137 (v2 & 0x3f); 138 if (value < 0x800) { 139 /* 140 * This should have been represented with one- or 141 * two-byte encoding. 142 */ 143 return throwBadUtf8(v2, at + 2); 144 } 145 out = (char) value; 146 at += 3; 147 break; 148 } 149 default: { 150 // 10XXXXXX, 1111XXXX -- illegal 151 return throwBadUtf8(v0, at); 152 } 153 } 154 chars[outAt] = out; 155 outAt++; 156 } 157 158 return new String(chars, 0, outAt); 159 } 160 161 /** 162 * Helper for {@link #utf8BytesToString}, which throws the right 163 * exception for a bogus utf-8 byte. 164 * 165 * @param value the byte value 166 * @param offset the file offset 167 * @return never 168 * @throws IllegalArgumentException always thrown 169 */ 170 private static String throwBadUtf8(int value, int offset) { 171 throw new IllegalArgumentException("bad utf-8 byte " + Hex.u1(value) + 172 " at offset " + Hex.u4(offset)); 173 } 174 175 /** 176 * Constructs an instance from a {@code String}. 177 * 178 * @param string {@code non-null;} the UTF-8 value as a string 179 */ 180 public CstUtf8(String string) { 181 if (string == null) { 182 throw new NullPointerException("string == null"); 183 } 184 185 this.string = string.intern(); 186 this.bytes = new ByteArray(stringToUtf8Bytes(string)); 187 } 188 189 /** 190 * Constructs an instance from some UTF-8 bytes. 191 * 192 * @param bytes {@code non-null;} array of the UTF-8 bytes 193 */ 194 public CstUtf8(ByteArray bytes) { 195 if (bytes == null) { 196 throw new NullPointerException("bytes == null"); 197 } 198 199 this.bytes = bytes; 200 this.string = utf8BytesToString(bytes).intern(); 201 } 202 203 /** {@inheritDoc} */ 204 @Override 205 public boolean equals(Object other) { 206 if (!(other instanceof CstUtf8)) { 207 return false; 208 } 209 210 return string.equals(((CstUtf8) other).string); 211 } 212 213 /** {@inheritDoc} */ 214 @Override 215 public int hashCode() { 216 return string.hashCode(); 217 } 218 219 /** {@inheritDoc} */ 220 @Override 221 protected int compareTo0(Constant other) { 222 return string.compareTo(((CstUtf8) other).string); 223 } 224 225 /** {@inheritDoc} */ 226 @Override 227 public String toString() { 228 return "utf8{\"" + toHuman() + "\"}"; 229 } 230 231 /** {@inheritDoc} */ 232 @Override 233 public String typeName() { 234 return "utf8"; 235 } 236 237 /** {@inheritDoc} */ 238 @Override 239 public boolean isCategory2() { 240 return false; 241 } 242 243 /** {@inheritDoc} */ 244 public String toHuman() { 245 int len = string.length(); 246 StringBuilder sb = new StringBuilder(len * 3 / 2); 247 248 for (int i = 0; i < len; i++) { 249 char c = string.charAt(i); 250 if ((c >= ' ') && (c < 0x7f)) { 251 if ((c == '\'') || (c == '\"') || (c == '\\')) { 252 sb.append('\\'); 253 } 254 sb.append(c); 255 } else if (c <= 0x7f) { 256 switch (c) { 257 case '\n': sb.append("\\n"); break; 258 case '\r': sb.append("\\r"); break; 259 case '\t': sb.append("\\t"); break; 260 default: { 261 /* 262 * Represent the character as an octal escape. 263 * If the next character is a valid octal 264 * digit, disambiguate by using the 265 * three-digit form. 266 */ 267 char nextChar = 268 (i < (len - 1)) ? string.charAt(i + 1) : 0; 269 boolean displayZero = 270 (nextChar >= '0') && (nextChar <= '7'); 271 sb.append('\\'); 272 for (int shift = 6; shift >= 0; shift -= 3) { 273 char outChar = (char) (((c >> shift) & 7) + '0'); 274 if ((outChar != '0') || displayZero) { 275 sb.append(outChar); 276 displayZero = true; 277 } 278 } 279 if (! displayZero) { 280 // Ironic edge case: The original value was 0. 281 sb.append('0'); 282 } 283 break; 284 } 285 } 286 } else { 287 sb.append("\\u"); 288 sb.append(Character.forDigit(c >> 12, 16)); 289 sb.append(Character.forDigit((c >> 8) & 0x0f, 16)); 290 sb.append(Character.forDigit((c >> 4) & 0x0f, 16)); 291 sb.append(Character.forDigit(c & 0x0f, 16)); 292 } 293 } 294 295 return sb.toString(); 296 } 297 298 /** 299 * Gets the value as a human-oriented string, surrounded by double 300 * quotes. 301 * 302 * @return {@code non-null;} the quoted string 303 */ 304 public String toQuoted() { 305 return '\"' + toHuman() + '\"'; 306 } 307 308 /** 309 * Gets the value as a human-oriented string, surrounded by double 310 * quotes, but ellipsizes the result if it is longer than the given 311 * maximum length 312 * 313 * @param maxLength {@code >= 5;} the maximum length of the string to return 314 * @return {@code non-null;} the quoted string 315 */ 316 public String toQuoted(int maxLength) { 317 String string = toHuman(); 318 int length = string.length(); 319 String ellipses; 320 321 if (length <= (maxLength - 2)) { 322 ellipses = ""; 323 } else { 324 string = string.substring(0, maxLength - 5); 325 ellipses = "..."; 326 } 327 328 return '\"' + string + ellipses + '\"'; 329 } 330 331 /** 332 * Gets the UTF-8 value as a string. 333 * The returned string is always already interned. 334 * 335 * @return {@code non-null;} the UTF-8 value as a string 336 */ 337 public String getString() { 338 return string; 339 } 340 341 /** 342 * Gets the UTF-8 value as UTF-8 encoded bytes. 343 * 344 * @return {@code non-null;} an array of the UTF-8 bytes 345 */ 346 public ByteArray getBytes() { 347 return bytes; 348 } 349 350 /** 351 * Gets the size of this instance as UTF-8 code points. That is, 352 * get the number of bytes in the UTF-8 encoding of this instance. 353 * 354 * @return {@code >= 0;} the UTF-8 size 355 */ 356 public int getUtf8Size() { 357 return bytes.size(); 358 } 359 360 /** 361 * Gets the size of this instance as UTF-16 code points. That is, 362 * get the number of 16-bit chars in the UTF-16 encoding of this 363 * instance. This is the same as the {@code length} of the 364 * Java {@code String} representation of this instance. 365 * 366 * @return {@code >= 0;} the UTF-16 size 367 */ 368 public int getUtf16Size() { 369 return string.length(); 370 } 371 } 372