1 /* 2 * Copyright (C) 2008 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 /* 18 * UTF-8 and Unicode string manipulation, plus java/lang/String convenience 19 * functions. 20 * 21 * In most cases we populate the fields in the String object directly, 22 * rather than going through an instance field lookup. 23 */ 24 #include "Dalvik.h" 25 #include <stdlib.h> 26 27 /* 28 * Allocate a new instance of the class String, performing first-use 29 * initialization of the class if necessary. Upon success, the 30 * returned value will have all its fields except hashCode already 31 * filled in, including a reference to a newly-allocated char[] for 32 * the contents, sized as given. Additionally, a reference to the 33 * chars array is stored to the pChars pointer. Callers must 34 * subsequently call dvmReleaseTrackedAlloc() on the result pointer. 35 * This function returns NULL on failure. 36 */ 37 static StringObject* makeStringObject(u4 charsLength, ArrayObject** pChars) 38 { 39 /* 40 * The String class should have already gotten found (but not 41 * necessarily initialized) before making it here. We assert it 42 * explicitly, since historically speaking, we have had bugs with 43 * regard to when the class String gets set up. The assert helps 44 * make any regressions easier to diagnose. 45 */ 46 assert(gDvm.classJavaLangString != NULL); 47 48 if (!dvmIsClassInitialized(gDvm.classJavaLangString)) { 49 /* Perform first-time use initialization of the class. */ 50 if (!dvmInitClass(gDvm.classJavaLangString)) { 51 LOGE("FATAL: Could not initialize class String"); 52 dvmAbort(); 53 } 54 } 55 56 Object* result = dvmAllocObject(gDvm.classJavaLangString, ALLOC_DEFAULT); 57 if (result == NULL) { 58 return NULL; 59 } 60 61 ArrayObject* chars = dvmAllocPrimitiveArray('C', charsLength, ALLOC_DEFAULT); 62 if (chars == NULL) { 63 dvmReleaseTrackedAlloc(result, NULL); 64 return NULL; 65 } 66 67 dvmSetFieldInt(result, STRING_FIELDOFF_COUNT, charsLength); 68 dvmSetFieldObject(result, STRING_FIELDOFF_VALUE, (Object*) chars); 69 dvmReleaseTrackedAlloc((Object*) chars, NULL); 70 /* Leave offset and hashCode set to zero. */ 71 72 *pChars = chars; 73 return (StringObject*) result; 74 } 75 76 /* 77 * Compute a hash code on a UTF-8 string, for use with internal hash tables. 78 * 79 * This may or may not yield the same results as the java/lang/String 80 * computeHashCode() function. (To make sure this doesn't get abused, 81 * I'm initializing the hash code to 1 so they *don't* match up.) 82 * 83 * It would be more correct to invoke dexGetUtf16FromUtf8() here and compute 84 * the hash with the result. That way, if something encoded the same 85 * character in two different ways, the hash value would be the same. For 86 * our purposes that isn't necessary. 87 */ 88 u4 dvmComputeUtf8Hash(const char* utf8Str) 89 { 90 u4 hash = 1; 91 92 while (*utf8Str != '\0') 93 hash = hash * 31 + *utf8Str++; 94 95 return hash; 96 } 97 98 /* 99 * Like "strlen", but for strings encoded with "modified" UTF-8. 100 * 101 * The value returned is the number of characters, which may or may not 102 * be the same as the number of bytes. 103 * 104 * (If this needs optimizing, try: mask against 0xa0, shift right 5, 105 * get increment {1-3} from table of 8 values.) 106 */ 107 size_t dvmUtf8Len(const char* utf8Str) 108 { 109 size_t len = 0; 110 int ic; 111 112 while ((ic = *utf8Str++) != '\0') { 113 len++; 114 if ((ic & 0x80) != 0) { 115 /* two- or three-byte encoding */ 116 utf8Str++; 117 if ((ic & 0x20) != 0) { 118 /* three-byte encoding */ 119 utf8Str++; 120 } 121 } 122 } 123 124 return len; 125 } 126 127 /* 128 * Convert a "modified" UTF-8 string to UTF-16. 129 */ 130 void dvmConvertUtf8ToUtf16(u2* utf16Str, const char* utf8Str) 131 { 132 while (*utf8Str != '\0') 133 *utf16Str++ = dexGetUtf16FromUtf8(&utf8Str); 134 } 135 136 /* 137 * Given a UTF-16 string, compute the length of the corresponding UTF-8 138 * string in bytes. 139 */ 140 static int utf16_utf8ByteLen(const u2* utf16Str, int len) 141 { 142 int utf8Len = 0; 143 144 while (len--) { 145 unsigned int uic = *utf16Str++; 146 147 /* 148 * The most common case is (uic > 0 && uic <= 0x7f). 149 */ 150 if (uic == 0 || uic > 0x7f) { 151 if (uic > 0x07ff) 152 utf8Len += 3; 153 else /*(uic > 0x7f || uic == 0) */ 154 utf8Len += 2; 155 } else 156 utf8Len++; 157 } 158 return utf8Len; 159 } 160 161 /* 162 * Convert a UTF-16 string to UTF-8. 163 * 164 * Make sure you allocate "utf8Str" with the result of utf16_utf8ByteLen(), 165 * not just "len". 166 */ 167 static void convertUtf16ToUtf8(char* utf8Str, const u2* utf16Str, int len) 168 { 169 assert(len >= 0); 170 171 while (len--) { 172 unsigned int uic = *utf16Str++; 173 174 /* 175 * The most common case is (uic > 0 && uic <= 0x7f). 176 */ 177 if (uic == 0 || uic > 0x7f) { 178 if (uic > 0x07ff) { 179 *utf8Str++ = (uic >> 12) | 0xe0; 180 *utf8Str++ = ((uic >> 6) & 0x3f) | 0x80; 181 *utf8Str++ = (uic & 0x3f) | 0x80; 182 } else /*(uic > 0x7f || uic == 0)*/ { 183 *utf8Str++ = (uic >> 6) | 0xc0; 184 *utf8Str++ = (uic & 0x3f) | 0x80; 185 } 186 } else { 187 *utf8Str++ = uic; 188 } 189 } 190 191 *utf8Str = '\0'; 192 } 193 194 /* 195 * Use the java/lang/String.computeHashCode() algorithm. 196 */ 197 static inline u4 computeUtf16Hash(const u2* utf16Str, size_t len) 198 { 199 u4 hash = 0; 200 201 while (len--) 202 hash = hash * 31 + *utf16Str++; 203 204 return hash; 205 } 206 207 u4 dvmComputeStringHash(StringObject* strObj) { 208 int hashCode = dvmGetFieldInt(strObj, STRING_FIELDOFF_HASHCODE); 209 if (hashCode != 0) { 210 return hashCode; 211 } 212 int len = dvmGetFieldInt(strObj, STRING_FIELDOFF_COUNT); 213 int offset = dvmGetFieldInt(strObj, STRING_FIELDOFF_OFFSET); 214 ArrayObject* chars = 215 (ArrayObject*) dvmGetFieldObject(strObj, STRING_FIELDOFF_VALUE); 216 hashCode = computeUtf16Hash((u2*)(void*)chars->contents + offset, len); 217 dvmSetFieldInt(strObj, STRING_FIELDOFF_HASHCODE, hashCode); 218 return hashCode; 219 } 220 221 StringObject* dvmCreateStringFromCstr(const char* utf8Str) { 222 assert(utf8Str != NULL); 223 return dvmCreateStringFromCstrAndLength(utf8Str, dvmUtf8Len(utf8Str)); 224 } 225 226 StringObject* dvmCreateStringFromCstr(const std::string& utf8Str) { 227 return dvmCreateStringFromCstr(utf8Str.c_str()); 228 } 229 230 /* 231 * Create a java/lang/String from a C string, given its UTF-16 length 232 * (number of UTF-16 code points). 233 * 234 * The caller must call dvmReleaseTrackedAlloc() on the return value. 235 * 236 * Returns NULL and throws an exception on failure. 237 */ 238 StringObject* dvmCreateStringFromCstrAndLength(const char* utf8Str, 239 size_t utf16Length) 240 { 241 assert(utf8Str != NULL); 242 243 ArrayObject* chars; 244 StringObject* newObj = makeStringObject(utf16Length, &chars); 245 if (newObj == NULL) { 246 return NULL; 247 } 248 249 dvmConvertUtf8ToUtf16((u2*)(void*)chars->contents, utf8Str); 250 251 u4 hashCode = computeUtf16Hash((u2*)(void*)chars->contents, utf16Length); 252 dvmSetFieldInt((Object*) newObj, STRING_FIELDOFF_HASHCODE, hashCode); 253 254 return newObj; 255 } 256 257 /* 258 * Create a new java/lang/String object, using the given Unicode data. 259 */ 260 StringObject* dvmCreateStringFromUnicode(const u2* unichars, int len) 261 { 262 /* We allow a NULL pointer if the length is zero. */ 263 assert(len == 0 || unichars != NULL); 264 265 ArrayObject* chars; 266 StringObject* newObj = makeStringObject(len, &chars); 267 if (newObj == NULL) { 268 return NULL; 269 } 270 271 if (len > 0) memcpy(chars->contents, unichars, len * sizeof(u2)); 272 273 u4 hashCode = computeUtf16Hash((u2*)(void*)chars->contents, len); 274 dvmSetFieldInt((Object*)newObj, STRING_FIELDOFF_HASHCODE, hashCode); 275 276 return newObj; 277 } 278 279 /* 280 * Create a new C string from a java/lang/String object. 281 * 282 * Returns NULL if the object is NULL. 283 */ 284 char* dvmCreateCstrFromString(const StringObject* jstr) 285 { 286 assert(gDvm.classJavaLangString != NULL); 287 if (jstr == NULL) { 288 return NULL; 289 } 290 291 int len = dvmGetFieldInt(jstr, STRING_FIELDOFF_COUNT); 292 int offset = dvmGetFieldInt(jstr, STRING_FIELDOFF_OFFSET); 293 ArrayObject* chars = 294 (ArrayObject*) dvmGetFieldObject(jstr, STRING_FIELDOFF_VALUE); 295 const u2* data = (const u2*)(void*)chars->contents + offset; 296 assert(offset + len <= (int) chars->length); 297 298 int byteLen = utf16_utf8ByteLen(data, len); 299 char* newStr = (char*) malloc(byteLen+1); 300 if (newStr == NULL) { 301 return NULL; 302 } 303 convertUtf16ToUtf8(newStr, data, len); 304 305 return newStr; 306 } 307 308 void dvmGetStringUtfRegion(const StringObject* jstr, 309 int start, int len, char* buf) 310 { 311 const u2* data = jstr->chars() + start; 312 convertUtf16ToUtf8(buf, data, len); 313 } 314 315 int StringObject::utfLength() const 316 { 317 assert(gDvm.classJavaLangString != NULL); 318 319 int len = dvmGetFieldInt(this, STRING_FIELDOFF_COUNT); 320 int offset = dvmGetFieldInt(this, STRING_FIELDOFF_OFFSET); 321 ArrayObject* chars = 322 (ArrayObject*) dvmGetFieldObject(this, STRING_FIELDOFF_VALUE); 323 const u2* data = (const u2*)(void*)chars->contents + offset; 324 assert(offset + len <= (int) chars->length); 325 326 return utf16_utf8ByteLen(data, len); 327 } 328 329 int StringObject::length() const 330 { 331 return dvmGetFieldInt(this, STRING_FIELDOFF_COUNT); 332 } 333 334 ArrayObject* StringObject::array() const 335 { 336 return (ArrayObject*) dvmGetFieldObject(this, STRING_FIELDOFF_VALUE); 337 } 338 339 const u2* StringObject::chars() const 340 { 341 int offset = dvmGetFieldInt(this, STRING_FIELDOFF_OFFSET); 342 ArrayObject* chars = 343 (ArrayObject*) dvmGetFieldObject(this, STRING_FIELDOFF_VALUE); 344 return (const u2*)(void*)chars->contents + offset; 345 } 346 347 348 /* 349 * Compare two String objects. 350 * 351 * This is a dvmHashTableLookup() callback. The function has already 352 * compared their hash values; we need to do a full compare to ensure 353 * that the strings really match. 354 */ 355 int dvmHashcmpStrings(const void* vstrObj1, const void* vstrObj2) 356 { 357 const StringObject* strObj1 = (const StringObject*) vstrObj1; 358 const StringObject* strObj2 = (const StringObject*) vstrObj2; 359 360 assert(gDvm.classJavaLangString != NULL); 361 362 /* get offset and length into char array; all values are in 16-bit units */ 363 int len1 = dvmGetFieldInt(strObj1, STRING_FIELDOFF_COUNT); 364 int offset1 = dvmGetFieldInt(strObj1, STRING_FIELDOFF_OFFSET); 365 int len2 = dvmGetFieldInt(strObj2, STRING_FIELDOFF_COUNT); 366 int offset2 = dvmGetFieldInt(strObj2, STRING_FIELDOFF_OFFSET); 367 if (len1 != len2) { 368 return len1 - len2; 369 } 370 371 ArrayObject* chars1 = 372 (ArrayObject*) dvmGetFieldObject(strObj1, STRING_FIELDOFF_VALUE); 373 ArrayObject* chars2 = 374 (ArrayObject*) dvmGetFieldObject(strObj2, STRING_FIELDOFF_VALUE); 375 376 /* damage here actually indicates a broken java/lang/String */ 377 assert(offset1 + len1 <= (int) chars1->length); 378 assert(offset2 + len2 <= (int) chars2->length); 379 380 return memcmp((const u2*)(void*)chars1->contents + offset1, 381 (const u2*)(void*)chars2->contents + offset2, 382 len1 * sizeof(u2)); 383 } 384 385 ArrayObject* dvmCreateStringArray(const std::vector<std::string>& strings) { 386 Thread* self = dvmThreadSelf(); 387 388 // Allocate an array to hold the String objects. 389 ClassObject* elementClass = dvmFindArrayClassForElement(gDvm.classJavaLangString); 390 ArrayObject* stringArray = dvmAllocArrayByClass(elementClass, strings.size(), ALLOC_DEFAULT); 391 if (stringArray == NULL) { 392 // Probably OOM. 393 assert(dvmCheckException(self)); 394 return NULL; 395 } 396 397 // Create the individual String objects and add them to the array. 398 for (size_t i = 0; i < strings.size(); i++) { 399 Object* str = (Object*) dvmCreateStringFromCstr(strings[i]); 400 if (str == NULL) { 401 // Probably OOM; drop out now. 402 assert(dvmCheckException(self)); 403 dvmReleaseTrackedAlloc((Object*) stringArray, self); 404 return NULL; 405 } 406 dvmSetObjectArrayElement(stringArray, i, str); 407 /* stored in tracked array, okay to release */ 408 dvmReleaseTrackedAlloc(str, self); 409 } 410 411 return stringArray; 412 } 413