Home | History | Annotate | Download | only in libdex
      1 /*
      2  * Copyright (C) 2011 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 /*
     18  * Validate and manipulate MUTF-8 (modified UTF-8) encoded string data.
     19  */
     20 
     21 #ifndef LIBDEX_DEXUTF_H_
     22 #define LIBDEX_DEXUTF_H_
     23 
     24 #include "DexFile.h"
     25 
     26 /*
     27  * Retrieve the next UTF-16 character from a UTF-8 string.
     28  *
     29  * Advances "*pUtf8Ptr" to the start of the next character.
     30  *
     31  * WARNING: If a string is corrupted by dropping a '\0' in the middle
     32  * of a 3-byte sequence, you can end up overrunning the buffer with
     33  * reads (and possibly with the writes if the length was computed and
     34  * cached before the damage). For performance reasons, this function
     35  * assumes that the string being parsed is known to be valid (e.g., by
     36  * already being verified). Most strings we process here are coming
     37  * out of dex files or other internal translations, so the only real
     38  * risk comes from the JNI NewStringUTF call.
     39  */
     40 DEX_INLINE u2 dexGetUtf16FromUtf8(const char** pUtf8Ptr)
     41 {
     42     unsigned int one, two, three;
     43 
     44     one = *(*pUtf8Ptr)++;
     45     if ((one & 0x80) != 0) {
     46         /* two- or three-byte encoding */
     47         two = *(*pUtf8Ptr)++;
     48         if ((one & 0x20) != 0) {
     49             /* three-byte encoding */
     50             three = *(*pUtf8Ptr)++;
     51             return ((one & 0x0f) << 12) |
     52                    ((two & 0x3f) << 6) |
     53                    (three & 0x3f);
     54         } else {
     55             /* two-byte encoding */
     56             return ((one & 0x1f) << 6) |
     57                    (two & 0x3f);
     58         }
     59     } else {
     60         /* one-byte encoding */
     61         return one;
     62     }
     63 }
     64 
     65 /* Compare two '\0'-terminated modified UTF-8 strings, using Unicode
     66  * code point values for comparison. This treats different encodings
     67  * for the same code point as equivalent, except that only a real '\0'
     68  * byte is considered the string terminator. The return value is as
     69  * for strcmp(). */
     70 int dexUtf8Cmp(const char* s1, const char* s2);
     71 
     72 /* for dexIsValidMemberNameUtf8(), a bit vector indicating valid low ascii */
     73 extern u4 DEX_MEMBER_VALID_LOW_ASCII[4];
     74 
     75 /* Helper for dexIsValidMemberNameUtf8(); do not call directly. */
     76 bool dexIsValidMemberNameUtf8_0(const char** pUtf8Ptr);
     77 
     78 /* Return whether the pointed-at modified-UTF-8 encoded character is
     79  * valid as part of a member name, updating the pointer to point past
     80  * the consumed character. This will consume two encoded UTF-16 code
     81  * points if the character is encoded as a surrogate pair. Also, if
     82  * this function returns false, then the given pointer may only have
     83  * been partially advanced. */
     84 DEX_INLINE bool dexIsValidMemberNameUtf8(const char** pUtf8Ptr) {
     85     u1 c = (u1) **pUtf8Ptr;
     86     if (c <= 0x7f) {
     87         // It's low-ascii, so check the table.
     88         u4 wordIdx = c >> 5;
     89         u4 bitIdx = c & 0x1f;
     90         (*pUtf8Ptr)++;
     91         return (DEX_MEMBER_VALID_LOW_ASCII[wordIdx] & (1 << bitIdx)) != 0;
     92     }
     93 
     94     /*
     95      * It's a multibyte encoded character. Call a non-inline function
     96      * for the heavy lifting.
     97      */
     98     return dexIsValidMemberNameUtf8_0(pUtf8Ptr);
     99 }
    100 
    101 /* Return whether the given string is a valid field or method name. */
    102 bool dexIsValidMemberName(const char* s);
    103 
    104 /* Return whether the given string is a valid type descriptor. */
    105 bool dexIsValidTypeDescriptor(const char* s);
    106 
    107 /* Return whether the given string is a valid internal-form class
    108  * name, with components separated either by dots or slashes as
    109  * specified. A class name is like a type descriptor, except that it
    110  * can't name a primitive type (including void). In terms of syntax,
    111  * the form is either (a) the name of the class without adornment
    112  * (that is, not bracketed by "L" and ";"); or (b) identical to the
    113  * type descriptor syntax for array types. */
    114 bool dexIsValidClassName(const char* s, bool dotSeparator);
    115 
    116 /* Return whether the given string is a valid reference descriptor. This
    117  * is true if dexIsValidTypeDescriptor() returns true and the descriptor
    118  * is for a class or array and not a primitive type. */
    119 bool dexIsReferenceDescriptor(const char* s);
    120 
    121 /* Return whether the given string is a valid class descriptor. This
    122  * is true if dexIsValidTypeDescriptor() returns true and the descriptor
    123  * is for a class and not an array or primitive type. */
    124 bool dexIsClassDescriptor(const char* s);
    125 
    126 /* Return whether the given string is a valid field type descriptor. This
    127  * is true if dexIsValidTypeDescriptor() returns true and the descriptor
    128  * is for anything but "void". */
    129 bool dexIsFieldDescriptor(const char* s);
    130 
    131 #endif  // LIBDEX_DEXUTF_H_
    132