Home | History | Annotate | Download | only in vm
      1 /*
      2  * Copyright (C) 2008 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 /*
     18  * UTF-8 and Unicode string manipulation, plus java/lang/String convenience
     19  * functions.
     20  *
     21  * In most cases we populate the fields in the String object directly,
     22  * rather than going through an instance field lookup.
     23  */
     24 #include "Dalvik.h"
     25 #include <stdlib.h>
     26 
     27 /*
     28  * Allocate a new instance of the class String, performing first-use
     29  * initialization of the class if necessary. Upon success, the
     30  * returned value will have all its fields except hashCode already
     31  * filled in, including a reference to a newly-allocated char[] for
     32  * the contents, sized as given. Additionally, a reference to the
     33  * chars array is stored to the pChars pointer. Callers must
     34  * subsequently call dvmReleaseTrackedAlloc() on the result pointer.
     35  * This function returns NULL on failure.
     36  */
     37 static StringObject* makeStringObject(u4 charsLength, ArrayObject** pChars)
     38 {
     39     /*
     40      * The String class should have already gotten found (but not
     41      * necessarily initialized) before making it here. We assert it
     42      * explicitly, since historically speaking, we have had bugs with
     43      * regard to when the class String gets set up. The assert helps
     44      * make any regressions easier to diagnose.
     45      */
     46     assert(gDvm.classJavaLangString != NULL);
     47 
     48     if (!dvmIsClassInitialized(gDvm.classJavaLangString)) {
     49         /* Perform first-time use initialization of the class. */
     50         if (!dvmInitClass(gDvm.classJavaLangString)) {
     51             LOGE("FATAL: Could not initialize class String");
     52             dvmAbort();
     53         }
     54     }
     55 
     56     Object* result = dvmAllocObject(gDvm.classJavaLangString, ALLOC_DEFAULT);
     57     if (result == NULL) {
     58         return NULL;
     59     }
     60 
     61     ArrayObject* chars = dvmAllocPrimitiveArray('C', charsLength, ALLOC_DEFAULT);
     62     if (chars == NULL) {
     63         dvmReleaseTrackedAlloc(result, NULL);
     64         return NULL;
     65     }
     66 
     67     dvmSetFieldInt(result, STRING_FIELDOFF_COUNT, charsLength);
     68     dvmSetFieldObject(result, STRING_FIELDOFF_VALUE, (Object*) chars);
     69     dvmReleaseTrackedAlloc((Object*) chars, NULL);
     70     /* Leave offset and hashCode set to zero. */
     71 
     72     *pChars = chars;
     73     return (StringObject*) result;
     74 }
     75 
     76 /*
     77  * Compute a hash code on a UTF-8 string, for use with internal hash tables.
     78  *
     79  * This may or may not yield the same results as the java/lang/String
     80  * computeHashCode() function.  (To make sure this doesn't get abused,
     81  * I'm initializing the hash code to 1 so they *don't* match up.)
     82  *
     83  * It would be more correct to invoke dexGetUtf16FromUtf8() here and compute
     84  * the hash with the result.  That way, if something encoded the same
     85  * character in two different ways, the hash value would be the same.  For
     86  * our purposes that isn't necessary.
     87  */
     88 u4 dvmComputeUtf8Hash(const char* utf8Str)
     89 {
     90     u4 hash = 1;
     91 
     92     while (*utf8Str != '\0')
     93         hash = hash * 31 + *utf8Str++;
     94 
     95     return hash;
     96 }
     97 
     98 /*
     99  * Like "strlen", but for strings encoded with "modified" UTF-8.
    100  *
    101  * The value returned is the number of characters, which may or may not
    102  * be the same as the number of bytes.
    103  *
    104  * (If this needs optimizing, try: mask against 0xa0, shift right 5,
    105  * get increment {1-3} from table of 8 values.)
    106  */
    107 size_t dvmUtf8Len(const char* utf8Str)
    108 {
    109     size_t len = 0;
    110     int ic;
    111 
    112     while ((ic = *utf8Str++) != '\0') {
    113         len++;
    114         if ((ic & 0x80) != 0) {
    115             /* two- or three-byte encoding */
    116             utf8Str++;
    117             if ((ic & 0x20) != 0) {
    118                 /* three-byte encoding */
    119                 utf8Str++;
    120             }
    121         }
    122     }
    123 
    124     return len;
    125 }
    126 
    127 /*
    128  * Convert a "modified" UTF-8 string to UTF-16.
    129  */
    130 void dvmConvertUtf8ToUtf16(u2* utf16Str, const char* utf8Str)
    131 {
    132     while (*utf8Str != '\0')
    133         *utf16Str++ = dexGetUtf16FromUtf8(&utf8Str);
    134 }
    135 
    136 /*
    137  * Given a UTF-16 string, compute the length of the corresponding UTF-8
    138  * string in bytes.
    139  */
    140 static int utf16_utf8ByteLen(const u2* utf16Str, int len)
    141 {
    142     int utf8Len = 0;
    143 
    144     while (len--) {
    145         unsigned int uic = *utf16Str++;
    146 
    147         /*
    148          * The most common case is (uic > 0 && uic <= 0x7f).
    149          */
    150         if (uic == 0 || uic > 0x7f) {
    151             if (uic > 0x07ff)
    152                 utf8Len += 3;
    153             else /*(uic > 0x7f || uic == 0) */
    154                 utf8Len += 2;
    155         } else
    156             utf8Len++;
    157     }
    158     return utf8Len;
    159 }
    160 
    161 /*
    162  * Convert a UTF-16 string to UTF-8.
    163  *
    164  * Make sure you allocate "utf8Str" with the result of utf16_utf8ByteLen(),
    165  * not just "len".
    166  */
    167 static void convertUtf16ToUtf8(char* utf8Str, const u2* utf16Str, int len)
    168 {
    169     assert(len >= 0);
    170 
    171     while (len--) {
    172         unsigned int uic = *utf16Str++;
    173 
    174         /*
    175          * The most common case is (uic > 0 && uic <= 0x7f).
    176          */
    177         if (uic == 0 || uic > 0x7f) {
    178             if (uic > 0x07ff) {
    179                 *utf8Str++ = (uic >> 12) | 0xe0;
    180                 *utf8Str++ = ((uic >> 6) & 0x3f) | 0x80;
    181                 *utf8Str++ = (uic & 0x3f) | 0x80;
    182             } else /*(uic > 0x7f || uic == 0)*/ {
    183                 *utf8Str++ = (uic >> 6) | 0xc0;
    184                 *utf8Str++ = (uic & 0x3f) | 0x80;
    185             }
    186         } else {
    187             *utf8Str++ = uic;
    188         }
    189     }
    190 
    191     *utf8Str = '\0';
    192 }
    193 
    194 /*
    195  * Use the java/lang/String.computeHashCode() algorithm.
    196  */
    197 static inline u4 computeUtf16Hash(const u2* utf16Str, size_t len)
    198 {
    199     u4 hash = 0;
    200 
    201     while (len--)
    202         hash = hash * 31 + *utf16Str++;
    203 
    204     return hash;
    205 }
    206 
    207 u4 dvmComputeStringHash(StringObject* strObj) {
    208     int hashCode = dvmGetFieldInt(strObj, STRING_FIELDOFF_HASHCODE);
    209     if (hashCode != 0) {
    210       return hashCode;
    211     }
    212     int len = dvmGetFieldInt(strObj, STRING_FIELDOFF_COUNT);
    213     int offset = dvmGetFieldInt(strObj, STRING_FIELDOFF_OFFSET);
    214     ArrayObject* chars =
    215             (ArrayObject*) dvmGetFieldObject(strObj, STRING_FIELDOFF_VALUE);
    216     hashCode = computeUtf16Hash((u2*)(void*)chars->contents + offset, len);
    217     dvmSetFieldInt(strObj, STRING_FIELDOFF_HASHCODE, hashCode);
    218     return hashCode;
    219 }
    220 
    221 StringObject* dvmCreateStringFromCstr(const char* utf8Str) {
    222     assert(utf8Str != NULL);
    223     return dvmCreateStringFromCstrAndLength(utf8Str, dvmUtf8Len(utf8Str));
    224 }
    225 
    226 StringObject* dvmCreateStringFromCstr(const std::string& utf8Str) {
    227     return dvmCreateStringFromCstr(utf8Str.c_str());
    228 }
    229 
    230 /*
    231  * Create a java/lang/String from a C string, given its UTF-16 length
    232  * (number of UTF-16 code points).
    233  *
    234  * The caller must call dvmReleaseTrackedAlloc() on the return value.
    235  *
    236  * Returns NULL and throws an exception on failure.
    237  */
    238 StringObject* dvmCreateStringFromCstrAndLength(const char* utf8Str,
    239     size_t utf16Length)
    240 {
    241     assert(utf8Str != NULL);
    242 
    243     ArrayObject* chars;
    244     StringObject* newObj = makeStringObject(utf16Length, &chars);
    245     if (newObj == NULL) {
    246         return NULL;
    247     }
    248 
    249     dvmConvertUtf8ToUtf16((u2*)(void*)chars->contents, utf8Str);
    250 
    251     u4 hashCode = computeUtf16Hash((u2*)(void*)chars->contents, utf16Length);
    252     dvmSetFieldInt((Object*) newObj, STRING_FIELDOFF_HASHCODE, hashCode);
    253 
    254     return newObj;
    255 }
    256 
    257 /*
    258  * Create a new java/lang/String object, using the given Unicode data.
    259  */
    260 StringObject* dvmCreateStringFromUnicode(const u2* unichars, int len)
    261 {
    262     /* We allow a NULL pointer if the length is zero. */
    263     assert(len == 0 || unichars != NULL);
    264 
    265     ArrayObject* chars;
    266     StringObject* newObj = makeStringObject(len, &chars);
    267     if (newObj == NULL) {
    268         return NULL;
    269     }
    270 
    271     if (len > 0) memcpy(chars->contents, unichars, len * sizeof(u2));
    272 
    273     u4 hashCode = computeUtf16Hash((u2*)(void*)chars->contents, len);
    274     dvmSetFieldInt((Object*)newObj, STRING_FIELDOFF_HASHCODE, hashCode);
    275 
    276     return newObj;
    277 }
    278 
    279 /*
    280  * Create a new C string from a java/lang/String object.
    281  *
    282  * Returns NULL if the object is NULL.
    283  */
    284 char* dvmCreateCstrFromString(const StringObject* jstr)
    285 {
    286     assert(gDvm.classJavaLangString != NULL);
    287     if (jstr == NULL) {
    288         return NULL;
    289     }
    290 
    291     int len = dvmGetFieldInt(jstr, STRING_FIELDOFF_COUNT);
    292     int offset = dvmGetFieldInt(jstr, STRING_FIELDOFF_OFFSET);
    293     ArrayObject* chars =
    294             (ArrayObject*) dvmGetFieldObject(jstr, STRING_FIELDOFF_VALUE);
    295     const u2* data = (const u2*)(void*)chars->contents + offset;
    296     assert(offset + len <= (int) chars->length);
    297 
    298     int byteLen = utf16_utf8ByteLen(data, len);
    299     char* newStr = (char*) malloc(byteLen+1);
    300     if (newStr == NULL) {
    301         return NULL;
    302     }
    303     convertUtf16ToUtf8(newStr, data, len);
    304 
    305     return newStr;
    306 }
    307 
    308 void dvmGetStringUtfRegion(const StringObject* jstr,
    309         int start, int len, char* buf)
    310 {
    311     const u2* data = jstr->chars() + start;
    312     convertUtf16ToUtf8(buf, data, len);
    313 }
    314 
    315 int StringObject::utfLength() const
    316 {
    317     assert(gDvm.classJavaLangString != NULL);
    318 
    319     int len = dvmGetFieldInt(this, STRING_FIELDOFF_COUNT);
    320     int offset = dvmGetFieldInt(this, STRING_FIELDOFF_OFFSET);
    321     ArrayObject* chars =
    322             (ArrayObject*) dvmGetFieldObject(this, STRING_FIELDOFF_VALUE);
    323     const u2* data = (const u2*)(void*)chars->contents + offset;
    324     assert(offset + len <= (int) chars->length);
    325 
    326     return utf16_utf8ByteLen(data, len);
    327 }
    328 
    329 int StringObject::length() const
    330 {
    331     return dvmGetFieldInt(this, STRING_FIELDOFF_COUNT);
    332 }
    333 
    334 ArrayObject* StringObject::array() const
    335 {
    336     return (ArrayObject*) dvmGetFieldObject(this, STRING_FIELDOFF_VALUE);
    337 }
    338 
    339 const u2* StringObject::chars() const
    340 {
    341     int offset = dvmGetFieldInt(this, STRING_FIELDOFF_OFFSET);
    342     ArrayObject* chars =
    343             (ArrayObject*) dvmGetFieldObject(this, STRING_FIELDOFF_VALUE);
    344     return (const u2*)(void*)chars->contents + offset;
    345 }
    346 
    347 
    348 /*
    349  * Compare two String objects.
    350  *
    351  * This is a dvmHashTableLookup() callback.  The function has already
    352  * compared their hash values; we need to do a full compare to ensure
    353  * that the strings really match.
    354  */
    355 int dvmHashcmpStrings(const void* vstrObj1, const void* vstrObj2)
    356 {
    357     const StringObject* strObj1 = (const StringObject*) vstrObj1;
    358     const StringObject* strObj2 = (const StringObject*) vstrObj2;
    359 
    360     assert(gDvm.classJavaLangString != NULL);
    361 
    362     /* get offset and length into char array; all values are in 16-bit units */
    363     int len1 = dvmGetFieldInt(strObj1, STRING_FIELDOFF_COUNT);
    364     int offset1 = dvmGetFieldInt(strObj1, STRING_FIELDOFF_OFFSET);
    365     int len2 = dvmGetFieldInt(strObj2, STRING_FIELDOFF_COUNT);
    366     int offset2 = dvmGetFieldInt(strObj2, STRING_FIELDOFF_OFFSET);
    367     if (len1 != len2) {
    368         return len1 - len2;
    369     }
    370 
    371     ArrayObject* chars1 =
    372             (ArrayObject*) dvmGetFieldObject(strObj1, STRING_FIELDOFF_VALUE);
    373     ArrayObject* chars2 =
    374             (ArrayObject*) dvmGetFieldObject(strObj2, STRING_FIELDOFF_VALUE);
    375 
    376     /* damage here actually indicates a broken java/lang/String */
    377     assert(offset1 + len1 <= (int) chars1->length);
    378     assert(offset2 + len2 <= (int) chars2->length);
    379 
    380     return memcmp((const u2*)(void*)chars1->contents + offset1,
    381                   (const u2*)(void*)chars2->contents + offset2,
    382                   len1 * sizeof(u2));
    383 }
    384 
    385 ArrayObject* dvmCreateStringArray(const std::vector<std::string>& strings) {
    386     Thread* self = dvmThreadSelf();
    387 
    388     // Allocate an array to hold the String objects.
    389     ClassObject* elementClass = dvmFindArrayClassForElement(gDvm.classJavaLangString);
    390     ArrayObject* stringArray = dvmAllocArrayByClass(elementClass, strings.size(), ALLOC_DEFAULT);
    391     if (stringArray == NULL) {
    392         // Probably OOM.
    393         assert(dvmCheckException(self));
    394         return NULL;
    395     }
    396 
    397     // Create the individual String objects and add them to the array.
    398     for (size_t i = 0; i < strings.size(); i++) {
    399         Object* str = (Object*) dvmCreateStringFromCstr(strings[i]);
    400         if (str == NULL) {
    401             // Probably OOM; drop out now.
    402             assert(dvmCheckException(self));
    403             dvmReleaseTrackedAlloc((Object*) stringArray, self);
    404             return NULL;
    405         }
    406         dvmSetObjectArrayElement(stringArray, i, str);
    407         /* stored in tracked array, okay to release */
    408         dvmReleaseTrackedAlloc(str, self);
    409     }
    410 
    411     return stringArray;
    412 }
    413