Home | History | Annotate | Download | only in libdex
      1 /*
      2  * Copyright (C) 2011 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 /*
     18  * Validate and manipulate MUTF-8 encoded string data.
     19  */
     20 
     21 #include "DexUtf.h"
     22 
     23 /* Compare two '\0'-terminated modified UTF-8 strings, using Unicode
     24  * code point values for comparison. This treats different encodings
     25  * for the same code point as equivalent, except that only a real '\0'
     26  * byte is considered the string terminator. The return value is as
     27  * for strcmp(). */
     28 int dexUtf8Cmp(const char* s1, const char* s2) {
     29     for (;;) {
     30         if (*s1 == '\0') {
     31             if (*s2 == '\0') {
     32                 return 0;
     33             }
     34             return -1;
     35         } else if (*s2 == '\0') {
     36             return 1;
     37         }
     38 
     39         int utf1 = dexGetUtf16FromUtf8(&s1);
     40         int utf2 = dexGetUtf16FromUtf8(&s2);
     41         int diff = utf1 - utf2;
     42 
     43         if (diff != 0) {
     44             return diff;
     45         }
     46     }
     47 }
     48 
     49 /* for dexIsValidMemberNameUtf8(), a bit vector indicating valid low ascii */
     50 u4 DEX_MEMBER_VALID_LOW_ASCII[4] = {
     51     0x00000000, // 00..1f low control characters; nothing valid
     52     0x03ff2010, // 20..3f digits and symbols; valid: '0'..'9', '$', '-'
     53     0x87fffffe, // 40..5f uppercase etc.; valid: 'A'..'Z', '_'
     54     0x07fffffe  // 60..7f lowercase etc.; valid: 'a'..'z'
     55 };
     56 
     57 /* Helper for dexIsValidMemberNameUtf8(); do not call directly. */
     58 bool dexIsValidMemberNameUtf8_0(const char** pUtf8Ptr) {
     59     /*
     60      * It's a multibyte encoded character. Decode it and analyze. We
     61      * accept anything that isn't (a) an improperly encoded low value,
     62      * (b) an improper surrogate pair, (c) an encoded '\0', (d) a high
     63      * control character, or (e) a high space, layout, or special
     64      * character (U+00a0, U+2000..U+200f, U+2028..U+202f,
     65      * U+fff0..U+ffff). This is all specified in the dex format
     66      * document.
     67      */
     68 
     69     u2 utf16 = dexGetUtf16FromUtf8(pUtf8Ptr);
     70 
     71     // Perform follow-up tests based on the high 8 bits.
     72     switch (utf16 >> 8) {
     73         case 0x00: {
     74             // It's only valid if it's above the ISO-8859-1 high space (0xa0).
     75             return (utf16 > 0x00a0);
     76         }
     77         case 0xd8:
     78         case 0xd9:
     79         case 0xda:
     80         case 0xdb: {
     81             /*
     82              * It's a leading surrogate. Check to see that a trailing
     83              * surrogate follows.
     84              */
     85             utf16 = dexGetUtf16FromUtf8(pUtf8Ptr);
     86             return (utf16 >= 0xdc00) && (utf16 <= 0xdfff);
     87         }
     88         case 0xdc:
     89         case 0xdd:
     90         case 0xde:
     91         case 0xdf: {
     92             // It's a trailing surrogate, which is not valid at this point.
     93             return false;
     94         }
     95         case 0x20:
     96         case 0xff: {
     97             // It's in the range that has spaces, controls, and specials.
     98             switch (utf16 & 0xfff8) {
     99                 case 0x2000:
    100                 case 0x2008:
    101                 case 0x2028:
    102                 case 0xfff0:
    103                 case 0xfff8: {
    104                     return false;
    105                 }
    106             }
    107             break;
    108         }
    109     }
    110 
    111     return true;
    112 }
    113 
    114 /* Return whether the given string is a valid field or method name. */
    115 bool dexIsValidMemberName(const char* s) {
    116     bool angleName = false;
    117 
    118     switch (*s) {
    119         case '\0': {
    120             // The empty string is not a valid name.
    121             return false;
    122         }
    123         case '<': {
    124             /*
    125              * '<' is allowed only at the start of a name, and if present,
    126              * means that the name must end with '>'.
    127              */
    128             angleName = true;
    129             s++;
    130             break;
    131         }
    132     }
    133 
    134     for (;;) {
    135         switch (*s) {
    136             case '\0': {
    137                 return !angleName;
    138             }
    139             case '>': {
    140                 return angleName && s[1] == '\0';
    141             }
    142         }
    143         if (!dexIsValidMemberNameUtf8(&s)) {
    144             return false;
    145         }
    146     }
    147 }
    148 
    149 /* Helper for validating type descriptors and class names, which is parametric
    150  * with respect to type vs. class and dot vs. slash. */
    151 static bool isValidTypeDescriptorOrClassName(const char* s, bool isClassName,
    152         bool dotSeparator) {
    153     int arrayCount = 0;
    154 
    155     while (*s == '[') {
    156         arrayCount++;
    157         s++;
    158     }
    159 
    160     if (arrayCount > 255) {
    161         // Arrays may have no more than 255 dimensions.
    162         return false;
    163     }
    164 
    165     if (arrayCount != 0) {
    166         /*
    167          * If we're looking at an array of some sort, then it doesn't
    168          * matter if what is being asked for is a class name; the
    169          * format looks the same as a type descriptor in that case, so
    170          * treat it as such.
    171          */
    172         isClassName = false;
    173     }
    174 
    175     if (!isClassName) {
    176         /*
    177          * We are looking for a descriptor. Either validate it as a
    178          * single-character primitive type, or continue on to check the
    179          * embedded class name (bracketed by "L" and ";").
    180          */
    181         switch (*(s++)) {
    182             case 'B':
    183             case 'C':
    184             case 'D':
    185             case 'F':
    186             case 'I':
    187             case 'J':
    188             case 'S':
    189             case 'Z': {
    190                 // These are all single-character descriptors for primitive types.
    191                 return (*s == '\0');
    192             }
    193             case 'V': {
    194                 // Non-array void is valid, but you can't have an array of void.
    195                 return (arrayCount == 0) && (*s == '\0');
    196             }
    197             case 'L': {
    198                 // Class name: Break out and continue below.
    199                 break;
    200             }
    201             default: {
    202                 // Oddball descriptor character.
    203                 return false;
    204             }
    205         }
    206     }
    207 
    208     /*
    209      * We just consumed the 'L' that introduces a class name as part
    210      * of a type descriptor, or we are looking for an unadorned class
    211      * name.
    212      */
    213 
    214     bool sepOrFirst = true; // first character or just encountered a separator.
    215     for (;;) {
    216         u1 c = (u1) *s;
    217         switch (c) {
    218             case '\0': {
    219                 /*
    220                  * Premature end for a type descriptor, but valid for
    221                  * a class name as long as we haven't encountered an
    222                  * empty component (including the degenerate case of
    223                  * the empty string "").
    224                  */
    225                 return isClassName && !sepOrFirst;
    226             }
    227             case ';': {
    228                 /*
    229                  * Invalid character for a class name, but the
    230                  * legitimate end of a type descriptor. In the latter
    231                  * case, make sure that this is the end of the string
    232                  * and that it doesn't end with an empty component
    233                  * (including the degenerate case of "L;").
    234                  */
    235                 return !isClassName && !sepOrFirst && (s[1] == '\0');
    236             }
    237             case '/':
    238             case '.': {
    239                 if (dotSeparator != (c == '.')) {
    240                     // The wrong separator character.
    241                     return false;
    242                 }
    243                 if (sepOrFirst) {
    244                     // Separator at start or two separators in a row.
    245                     return false;
    246                 }
    247                 sepOrFirst = true;
    248                 s++;
    249                 break;
    250             }
    251             default: {
    252                 if (!dexIsValidMemberNameUtf8(&s)) {
    253                     return false;
    254                 }
    255                 sepOrFirst = false;
    256                 break;
    257             }
    258         }
    259     }
    260 }
    261 
    262 /* Return whether the given string is a valid type descriptor. */
    263 bool dexIsValidTypeDescriptor(const char* s) {
    264     return isValidTypeDescriptorOrClassName(s, false, false);
    265 }
    266 
    267 /* (documented in header) */
    268 bool dexIsValidClassName(const char* s, bool dotSeparator) {
    269     return isValidTypeDescriptorOrClassName(s, true, dotSeparator);
    270 }
    271 
    272 /* Return whether the given string is a valid reference descriptor. This
    273  * is true if dexIsValidTypeDescriptor() returns true and the descriptor
    274  * is for a class or array and not a primitive type. */
    275 bool dexIsReferenceDescriptor(const char* s) {
    276     if (!dexIsValidTypeDescriptor(s)) {
    277         return false;
    278     }
    279 
    280     return (s[0] == 'L') || (s[0] == '[');
    281 }
    282 
    283 /* Return whether the given string is a valid class descriptor. This
    284  * is true if dexIsValidTypeDescriptor() returns true and the descriptor
    285  * is for a class and not an array or primitive type. */
    286 bool dexIsClassDescriptor(const char* s) {
    287     if (!dexIsValidTypeDescriptor(s)) {
    288         return false;
    289     }
    290 
    291     return s[0] == 'L';
    292 }
    293 
    294 /* Return whether the given string is a valid field type descriptor. This
    295  * is true if dexIsValidTypeDescriptor() returns true and the descriptor
    296  * is for anything but "void". */
    297 bool dexIsFieldDescriptor(const char* s) {
    298     if (!dexIsValidTypeDescriptor(s)) {
    299         return false;
    300     }
    301 
    302     return s[0] != 'V';
    303 }
    304 
    305