Home | History | Annotate | Download | only in vm
      1 /*
      2  * Copyright (C) 2008 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 /*
     18  * UTF-8 and Unicode string manipulation, plus java/lang/String convenience
     19  * functions.
     20  *
     21  * In most cases we populate the fields in the String object directly,
     22  * rather than going through an instance field lookup.
     23  */
     24 #include "Dalvik.h"
     25 #include <stdlib.h>
     26 
     27 /*
     28  * Initialize string globals.
     29  *
     30  * This isn't part of the VM init sequence because it's hard to get the
     31  * timing right -- we need it to happen after java/lang/String has been
     32  * loaded, but before anybody wants to use a string.  It's easiest to
     33  * just initialize it on first use.
     34  *
     35  * In some unusual circumstances (e.g. trying to throw an exception because
     36  * String implements java/lang/CharSequence, but CharSequence doesn't exist)
     37  * we can try to create an exception string internally before anything has
     38  * really tried to use String.  In that case we basically self-destruct.
     39  */
     40 static bool stringStartup()
     41 {
     42     if (gDvm.javaLangStringReady < 0) {
     43         LOGE("ERROR: reentrant string initialization\n");
     44         assert(false);
     45         return false;
     46     }
     47     assert(gDvm.javaLangStringReady == 0);
     48 
     49     gDvm.javaLangStringReady = -1;
     50 
     51     if (gDvm.classJavaLangString == NULL)
     52         gDvm.classJavaLangString =
     53             dvmFindSystemClassNoInit("Ljava/lang/String;");
     54 
     55     gDvm.offJavaLangString_value =
     56         dvmFindFieldOffset(gDvm.classJavaLangString, "value", "[C");
     57     gDvm.offJavaLangString_count =
     58         dvmFindFieldOffset(gDvm.classJavaLangString, "count", "I");
     59     gDvm.offJavaLangString_offset =
     60         dvmFindFieldOffset(gDvm.classJavaLangString, "offset", "I");
     61     gDvm.offJavaLangString_hashCode =
     62         dvmFindFieldOffset(gDvm.classJavaLangString, "hashCode", "I");
     63 
     64     if (gDvm.offJavaLangString_value < 0 ||
     65         gDvm.offJavaLangString_count < 0 ||
     66         gDvm.offJavaLangString_offset < 0 ||
     67         gDvm.offJavaLangString_hashCode < 0)
     68     {
     69         LOGE("VM-required field missing from java/lang/String\n");
     70         return false;
     71     }
     72 
     73     bool badValue = false;
     74     if (gDvm.offJavaLangString_value != STRING_FIELDOFF_VALUE) {
     75         LOGE("InlineNative: String.value offset = %d, expected %d\n",
     76             gDvm.offJavaLangString_value, STRING_FIELDOFF_VALUE);
     77         badValue = true;
     78     }
     79     if (gDvm.offJavaLangString_count != STRING_FIELDOFF_COUNT) {
     80         LOGE("InlineNative: String.count offset = %d, expected %d\n",
     81             gDvm.offJavaLangString_count, STRING_FIELDOFF_COUNT);
     82         badValue = true;
     83     }
     84     if (gDvm.offJavaLangString_offset != STRING_FIELDOFF_OFFSET) {
     85         LOGE("InlineNative: String.offset offset = %d, expected %d\n",
     86             gDvm.offJavaLangString_offset, STRING_FIELDOFF_OFFSET);
     87         badValue = true;
     88     }
     89     if (gDvm.offJavaLangString_hashCode != STRING_FIELDOFF_HASHCODE) {
     90         LOGE("InlineNative: String.hashCode offset = %d, expected %d\n",
     91             gDvm.offJavaLangString_hashCode, STRING_FIELDOFF_HASHCODE);
     92         badValue = true;
     93     }
     94     if (badValue)
     95         return false;
     96 
     97     gDvm.javaLangStringReady = 1;
     98 
     99     return true;
    100 }
    101 
    102 /*
    103  * Discard heap-allocated storage.
    104  */
    105 void dvmStringShutdown()
    106 {
    107     // currently unused
    108 }
    109 
    110 /*
    111  * Compute a hash code on a UTF-8 string, for use with internal hash tables.
    112  *
    113  * This may or may not yield the same results as the java/lang/String
    114  * computeHashCode() function.  (To make sure this doesn't get abused,
    115  * I'm initializing the hash code to 1 so they *don't* match up.)
    116  *
    117  * It would be more correct to invoke dexGetUtf16FromUtf8() here and compute
    118  * the hash with the result.  That way, if something encoded the same
    119  * character in two different ways, the hash value would be the same.  For
    120  * our purposes that isn't necessary.
    121  */
    122 u4 dvmComputeUtf8Hash(const char* utf8Str)
    123 {
    124     u4 hash = 1;
    125 
    126     while (*utf8Str != '\0')
    127         hash = hash * 31 + *utf8Str++;
    128 
    129     return hash;
    130 }
    131 
    132 /*
    133  * Like "strlen", but for strings encoded with "modified" UTF-8.
    134  *
    135  * The value returned is the number of characters, which may or may not
    136  * be the same as the number of bytes.
    137  *
    138  * (If this needs optimizing, try: mask against 0xa0, shift right 5,
    139  * get increment {1-3} from table of 8 values.)
    140  */
    141 int dvmUtf8Len(const char* utf8Str)
    142 {
    143     int ic, len = 0;
    144 
    145     while ((ic = *utf8Str++) != '\0') {
    146         len++;
    147         if ((ic & 0x80) != 0) {
    148             /* two- or three-byte encoding */
    149             utf8Str++;
    150             if ((ic & 0x20) != 0) {
    151                 /* three-byte encoding */
    152                 utf8Str++;
    153             }
    154         }
    155     }
    156 
    157     return len;
    158 }
    159 
    160 /*
    161  * Convert a "modified" UTF-8 string to UTF-16.
    162  */
    163 void dvmConvertUtf8ToUtf16(u2* utf16Str, const char* utf8Str)
    164 {
    165     while (*utf8Str != '\0')
    166         *utf16Str++ = dexGetUtf16FromUtf8(&utf8Str);
    167 }
    168 
    169 /*
    170  * Given a UTF-16 string, compute the length of the corresponding UTF-8
    171  * string in bytes.
    172  */
    173 static int utf16_utf8ByteLen(const u2* utf16Str, int len)
    174 {
    175     int utf8Len = 0;
    176 
    177     while (len--) {
    178         unsigned int uic = *utf16Str++;
    179 
    180         /*
    181          * The most common case is (uic > 0 && uic <= 0x7f).
    182          */
    183         if (uic == 0 || uic > 0x7f) {
    184             if (uic > 0x07ff)
    185                 utf8Len += 3;
    186             else /*(uic > 0x7f || uic == 0) */
    187                 utf8Len += 2;
    188         } else
    189             utf8Len++;
    190     }
    191     return utf8Len;
    192 }
    193 
    194 /*
    195  * Convert a UTF-16 string to UTF-8.
    196  *
    197  * Make sure you allocate "utf8Str" with the result of utf16_utf8ByteLen(),
    198  * not just "len".
    199  */
    200 static void convertUtf16ToUtf8(char* utf8Str, const u2* utf16Str, int len)
    201 {
    202     assert(len >= 0);
    203 
    204     while (len--) {
    205         unsigned int uic = *utf16Str++;
    206 
    207         /*
    208          * The most common case is (uic > 0 && uic <= 0x7f).
    209          */
    210         if (uic == 0 || uic > 0x7f) {
    211             if (uic > 0x07ff) {
    212                 *utf8Str++ = (uic >> 12) | 0xe0;
    213                 *utf8Str++ = ((uic >> 6) & 0x3f) | 0x80;
    214                 *utf8Str++ = (uic & 0x3f) | 0x80;
    215             } else /*(uic > 0x7f || uic == 0)*/ {
    216                 *utf8Str++ = (uic >> 6) | 0xc0;
    217                 *utf8Str++ = (uic & 0x3f) | 0x80;
    218             }
    219         } else {
    220             *utf8Str++ = uic;
    221         }
    222     }
    223 
    224     *utf8Str = '\0';
    225 }
    226 
    227 /*
    228  * Use the java/lang/String.computeHashCode() algorithm.
    229  */
    230 static inline u4 dvmComputeUtf16Hash(const u2* utf16Str, int len)
    231 {
    232     u4 hash = 0;
    233 
    234     while (len--)
    235         hash = hash * 31 + *utf16Str++;
    236 
    237     return hash;
    238 }
    239 u4 dvmComputeStringHash(StringObject* strObj) {
    240     ArrayObject* chars = (ArrayObject*) dvmGetFieldObject((Object*) strObj,
    241                                 STRING_FIELDOFF_VALUE);
    242     int offset, len;
    243 
    244     len = dvmGetFieldInt((Object*) strObj, STRING_FIELDOFF_COUNT);
    245     offset = dvmGetFieldInt((Object*) strObj, STRING_FIELDOFF_OFFSET);
    246 
    247     return dvmComputeUtf16Hash((u2*) chars->contents + offset, len);
    248 }
    249 
    250 /*
    251  * Create a new java/lang/String object, using the string data in "utf8Str".
    252  *
    253  * Note that "allocFlags" affects both of the allocations here.  If you
    254  * use ALLOC_DONT_TRACK in a context where a GC could happen between the
    255  * two allocations, you could lose the array reference.
    256  *
    257  * Returns NULL and throws an exception on failure.
    258  */
    259 StringObject* dvmCreateStringFromCstr(const char* utf8Str, int allocFlags)
    260 {
    261     assert(utf8Str != NULL);
    262 
    263     return dvmCreateStringFromCstrAndLength(utf8Str, dvmUtf8Len(utf8Str),
    264             allocFlags);
    265 }
    266 
    267 /*
    268  * Create a java/lang/String from a C string, given its UTF-16 length
    269  * (number of UTF-16 code points).
    270  *
    271  * The caller must call dvmReleaseTrackedAlloc() on the return value or
    272  * use a non-default value for "allocFlags".  It is never appropriate
    273  * to use ALLOC_DONT_TRACK with this function.
    274  *
    275  * Returns NULL and throws an exception on failure.
    276  */
    277 StringObject* dvmCreateStringFromCstrAndLength(const char* utf8Str,
    278     u4 utf16Length, int allocFlags)
    279 {
    280     StringObject* newObj;
    281     ArrayObject* chars;
    282     u4 hashCode = 0;
    283 
    284     //LOGV("Creating String from '%s'\n", utf8Str);
    285     assert(allocFlags != ALLOC_DONT_TRACK);     /* don't currently need */
    286     assert(utf8Str != NULL);
    287 
    288     if (gDvm.javaLangStringReady <= 0) {
    289         if (!stringStartup())
    290             return NULL;
    291     }
    292 
    293     /* init before alloc */
    294     if (!dvmIsClassInitialized(gDvm.classJavaLangString) &&
    295         !dvmInitClass(gDvm.classJavaLangString))
    296     {
    297         return NULL;
    298     }
    299 
    300     newObj = (StringObject*) dvmAllocObject(gDvm.classJavaLangString,
    301                 allocFlags);
    302     if (newObj == NULL)
    303         return NULL;
    304 
    305     chars = dvmAllocPrimitiveArray('C', utf16Length, allocFlags);
    306     if (chars == NULL) {
    307         dvmReleaseTrackedAllocIFN((Object*) newObj, NULL, allocFlags);
    308         return NULL;
    309     }
    310     dvmConvertUtf8ToUtf16((u2*)chars->contents, utf8Str);
    311     hashCode = dvmComputeUtf16Hash((u2*) chars->contents, utf16Length);
    312 
    313     dvmSetFieldObject((Object*)newObj, STRING_FIELDOFF_VALUE,
    314         (Object*)chars);
    315     dvmReleaseTrackedAllocIFN((Object*) chars, NULL, allocFlags);
    316     dvmSetFieldInt((Object*)newObj, STRING_FIELDOFF_COUNT, utf16Length);
    317     dvmSetFieldInt((Object*)newObj, STRING_FIELDOFF_HASHCODE, hashCode);
    318     /* leave offset set to zero */
    319 
    320     /* debugging stuff */
    321     //dvmDumpObject((Object*)newObj);
    322     //printHexDumpEx(ANDROID_LOG_DEBUG, chars->contents, utf16Length * 2,
    323     //    kHexDumpMem);
    324 
    325     /* caller may need to dvmReleaseTrackedAlloc(newObj) */
    326     return newObj;
    327 }
    328 
    329 /*
    330  * Create a new java/lang/String object, using the Unicode data.
    331  */
    332 StringObject* dvmCreateStringFromUnicode(const u2* unichars, int len)
    333 {
    334     StringObject* newObj;
    335     ArrayObject* chars;
    336     u4 hashCode = 0;
    337 
    338     /* we allow a null pointer if the length is zero */
    339     assert(len == 0 || unichars != NULL);
    340 
    341     if (gDvm.javaLangStringReady <= 0) {
    342         if (!stringStartup())
    343             return NULL;
    344     }
    345 
    346     /* init before alloc */
    347     if (!dvmIsClassInitialized(gDvm.classJavaLangString) &&
    348         !dvmInitClass(gDvm.classJavaLangString))
    349     {
    350         return NULL;
    351     }
    352 
    353     newObj = (StringObject*) dvmAllocObject(gDvm.classJavaLangString,
    354         ALLOC_DEFAULT);
    355     if (newObj == NULL)
    356         return NULL;
    357 
    358     chars = dvmAllocPrimitiveArray('C', len, ALLOC_DEFAULT);
    359     if (chars == NULL) {
    360         dvmReleaseTrackedAlloc((Object*) newObj, NULL);
    361         return NULL;
    362     }
    363     if (len > 0)
    364         memcpy(chars->contents, unichars, len * sizeof(u2));
    365     hashCode = dvmComputeUtf16Hash((u2*) chars->contents, len);
    366 
    367     dvmSetFieldObject((Object*)newObj, STRING_FIELDOFF_VALUE,
    368         (Object*)chars);
    369     dvmReleaseTrackedAlloc((Object*) chars, NULL);
    370     dvmSetFieldInt((Object*)newObj, STRING_FIELDOFF_COUNT, len);
    371     dvmSetFieldInt((Object*)newObj, STRING_FIELDOFF_HASHCODE, hashCode);
    372     /* leave offset set to zero */
    373 
    374     /* debugging stuff */
    375     //dvmDumpObject((Object*)newObj);
    376     //printHexDumpEx(ANDROID_LOG_DEBUG, chars->contents, len*2, kHexDumpMem);
    377 
    378     /* caller must dvmReleaseTrackedAlloc(newObj) */
    379     return newObj;
    380 }
    381 
    382 /*
    383  * Create a new C string from a java/lang/String object.
    384  *
    385  * Returns NULL if the object is NULL.
    386  */
    387 char* dvmCreateCstrFromString(StringObject* jstr)
    388 {
    389     char* newStr;
    390     ArrayObject* chars;
    391     int len, byteLen, offset;
    392     const u2* data;
    393 
    394     assert(gDvm.javaLangStringReady > 0);
    395 
    396     if (jstr == NULL)
    397         return NULL;
    398 
    399     len = dvmGetFieldInt((Object*) jstr, STRING_FIELDOFF_COUNT);
    400     offset = dvmGetFieldInt((Object*) jstr, STRING_FIELDOFF_OFFSET);
    401     chars = (ArrayObject*) dvmGetFieldObject((Object*) jstr,
    402                                 STRING_FIELDOFF_VALUE);
    403     data = (const u2*) chars->contents + offset;
    404     assert(offset + len <= (int) chars->length);
    405 
    406     byteLen = utf16_utf8ByteLen(data, len);
    407     newStr = (char*) malloc(byteLen+1);
    408     if (newStr == NULL)
    409         return NULL;
    410     convertUtf16ToUtf8(newStr, data, len);
    411 
    412     return newStr;
    413 }
    414 
    415 /*
    416  * Create a UTF-8 C string from a region of a java/lang/String.  (Used by
    417  * the JNI GetStringUTFRegion call.)
    418  */
    419 void dvmCreateCstrFromStringRegion(StringObject* jstr, int start, int len,
    420     char* buf)
    421 {
    422     const u2* data;
    423 
    424     data = dvmStringChars(jstr) + start;
    425     convertUtf16ToUtf8(buf, data, len);
    426 }
    427 
    428 /*
    429  * Compute the length, in modified UTF-8, of a java/lang/String object.
    430  *
    431  * Does not include the terminating null byte.
    432  */
    433 int dvmStringUtf8ByteLen(StringObject* jstr)
    434 {
    435     ArrayObject* chars;
    436     int len, offset;
    437     const u2* data;
    438 
    439     assert(gDvm.javaLangStringReady > 0);
    440 
    441     if (jstr == NULL)
    442         return 0;       // should we throw something?  assert?
    443 
    444     len = dvmGetFieldInt((Object*) jstr, STRING_FIELDOFF_COUNT);
    445     offset = dvmGetFieldInt((Object*) jstr, STRING_FIELDOFF_OFFSET);
    446     chars = (ArrayObject*) dvmGetFieldObject((Object*) jstr,
    447                                 STRING_FIELDOFF_VALUE);
    448     data = (const u2*) chars->contents + offset;
    449     assert(offset + len <= (int) chars->length);
    450 
    451     return utf16_utf8ByteLen(data, len);
    452 }
    453 
    454 /*
    455  * Get the string's length.
    456  */
    457 int dvmStringLen(StringObject* jstr)
    458 {
    459     return dvmGetFieldInt((Object*) jstr, STRING_FIELDOFF_COUNT);
    460 }
    461 
    462 /*
    463  * Get the char[] object from the String.
    464  */
    465 ArrayObject* dvmStringCharArray(StringObject* jstr)
    466 {
    467     return (ArrayObject*) dvmGetFieldObject((Object*) jstr,
    468                                 STRING_FIELDOFF_VALUE);
    469 }
    470 
    471 /*
    472  * Get the string's data.
    473  */
    474 const u2* dvmStringChars(StringObject* jstr)
    475 {
    476     ArrayObject* chars;
    477     int offset;
    478 
    479     offset = dvmGetFieldInt((Object*) jstr, STRING_FIELDOFF_OFFSET);
    480     chars = (ArrayObject*) dvmGetFieldObject((Object*) jstr,
    481                                 STRING_FIELDOFF_VALUE);
    482     return (const u2*) chars->contents + offset;
    483 }
    484 
    485 
    486 /*
    487  * Compare two String objects.
    488  *
    489  * This is a dvmHashTableLookup() callback.  The function has already
    490  * compared their hash values; we need to do a full compare to ensure
    491  * that the strings really match.
    492  */
    493 int dvmHashcmpStrings(const void* vstrObj1, const void* vstrObj2)
    494 {
    495     const StringObject* strObj1 = (const StringObject*) vstrObj1;
    496     const StringObject* strObj2 = (const StringObject*) vstrObj2;
    497     ArrayObject* chars1;
    498     ArrayObject* chars2;
    499     int len1, len2, offset1, offset2;
    500 
    501     assert(gDvm.javaLangStringReady > 0);
    502 
    503     /* get offset and length into char array; all values are in 16-bit units */
    504     len1 = dvmGetFieldInt((Object*) strObj1, STRING_FIELDOFF_COUNT);
    505     offset1 = dvmGetFieldInt((Object*) strObj1, STRING_FIELDOFF_OFFSET);
    506     len2 = dvmGetFieldInt((Object*) strObj2, STRING_FIELDOFF_COUNT);
    507     offset2 = dvmGetFieldInt((Object*) strObj2, STRING_FIELDOFF_OFFSET);
    508     if (len1 != len2)
    509         return len1 - len2;
    510 
    511     chars1 = (ArrayObject*) dvmGetFieldObject((Object*) strObj1,
    512                                 STRING_FIELDOFF_VALUE);
    513     chars2 = (ArrayObject*) dvmGetFieldObject((Object*) strObj2,
    514                                 STRING_FIELDOFF_VALUE);
    515 
    516     /* damage here actually indicates a broken java/lang/String */
    517     assert(offset1 + len1 <= (int) chars1->length);
    518     assert(offset2 + len2 <= (int) chars2->length);
    519 
    520     return memcmp((const u2*) chars1->contents + offset1,
    521                   (const u2*) chars2->contents + offset2,
    522                   len1 * sizeof(u2));
    523 }
    524 
    525