1 /* 2 * Copyright (C) 2011 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 /* 18 * Validate and manipulate MUTF-8 encoded string data. 19 */ 20 21 #include "DexUtf.h" 22 23 /* Compare two '\0'-terminated modified UTF-8 strings, using Unicode 24 * code point values for comparison. This treats different encodings 25 * for the same code point as equivalent, except that only a real '\0' 26 * byte is considered the string terminator. The return value is as 27 * for strcmp(). */ 28 int dexUtf8Cmp(const char* s1, const char* s2) { 29 for (;;) { 30 if (*s1 == '\0') { 31 if (*s2 == '\0') { 32 return 0; 33 } 34 return -1; 35 } else if (*s2 == '\0') { 36 return 1; 37 } 38 39 int utf1 = dexGetUtf16FromUtf8(&s1); 40 int utf2 = dexGetUtf16FromUtf8(&s2); 41 int diff = utf1 - utf2; 42 43 if (diff != 0) { 44 return diff; 45 } 46 } 47 } 48 49 /* for dexIsValidMemberNameUtf8(), a bit vector indicating valid low ascii */ 50 u4 DEX_MEMBER_VALID_LOW_ASCII[4] = { 51 0x00000000, // 00..1f low control characters; nothing valid 52 0x03ff2010, // 20..3f digits and symbols; valid: '0'..'9', '$', '-' 53 0x87fffffe, // 40..5f uppercase etc.; valid: 'A'..'Z', '_' 54 0x07fffffe // 60..7f lowercase etc.; valid: 'a'..'z' 55 }; 56 57 /* Helper for dexIsValidMemberNameUtf8(); do not call directly. */ 58 bool dexIsValidMemberNameUtf8_0(const char** pUtf8Ptr) { 59 /* 60 * It's a multibyte encoded character. Decode it and analyze. We 61 * accept anything that isn't (a) an improperly encoded low value, 62 * (b) an improper surrogate pair, (c) an encoded '\0', (d) a high 63 * control character, or (e) a high space, layout, or special 64 * character (U+00a0, U+2000..U+200f, U+2028..U+202f, 65 * U+fff0..U+ffff). This is all specified in the dex format 66 * document. 67 */ 68 69 u2 utf16 = dexGetUtf16FromUtf8(pUtf8Ptr); 70 71 // Perform follow-up tests based on the high 8 bits. 72 switch (utf16 >> 8) { 73 case 0x00: { 74 // It's only valid if it's above the ISO-8859-1 high space (0xa0). 75 return (utf16 > 0x00a0); 76 } 77 case 0xd8: 78 case 0xd9: 79 case 0xda: 80 case 0xdb: { 81 /* 82 * It's a leading surrogate. Check to see that a trailing 83 * surrogate follows. 84 */ 85 utf16 = dexGetUtf16FromUtf8(pUtf8Ptr); 86 return (utf16 >= 0xdc00) && (utf16 <= 0xdfff); 87 } 88 case 0xdc: 89 case 0xdd: 90 case 0xde: 91 case 0xdf: { 92 // It's a trailing surrogate, which is not valid at this point. 93 return false; 94 } 95 case 0x20: 96 case 0xff: { 97 // It's in the range that has spaces, controls, and specials. 98 switch (utf16 & 0xfff8) { 99 case 0x2000: 100 case 0x2008: 101 case 0x2028: 102 case 0xfff0: 103 case 0xfff8: { 104 return false; 105 } 106 } 107 break; 108 } 109 } 110 111 return true; 112 } 113 114 /* Return whether the given string is a valid field or method name. */ 115 bool dexIsValidMemberName(const char* s) { 116 bool angleName = false; 117 118 switch (*s) { 119 case '\0': { 120 // The empty string is not a valid name. 121 return false; 122 } 123 case '<': { 124 /* 125 * '<' is allowed only at the start of a name, and if present, 126 * means that the name must end with '>'. 127 */ 128 angleName = true; 129 s++; 130 break; 131 } 132 } 133 134 for (;;) { 135 switch (*s) { 136 case '\0': { 137 return !angleName; 138 } 139 case '>': { 140 return angleName && s[1] == '\0'; 141 } 142 } 143 if (!dexIsValidMemberNameUtf8(&s)) { 144 return false; 145 } 146 } 147 } 148 149 /* Helper for validating type descriptors and class names, which is parametric 150 * with respect to type vs. class and dot vs. slash. */ 151 static bool isValidTypeDescriptorOrClassName(const char* s, bool isClassName, 152 bool dotSeparator) { 153 int arrayCount = 0; 154 155 while (*s == '[') { 156 arrayCount++; 157 s++; 158 } 159 160 if (arrayCount > 255) { 161 // Arrays may have no more than 255 dimensions. 162 return false; 163 } 164 165 if (arrayCount != 0) { 166 /* 167 * If we're looking at an array of some sort, then it doesn't 168 * matter if what is being asked for is a class name; the 169 * format looks the same as a type descriptor in that case, so 170 * treat it as such. 171 */ 172 isClassName = false; 173 } 174 175 if (!isClassName) { 176 /* 177 * We are looking for a descriptor. Either validate it as a 178 * single-character primitive type, or continue on to check the 179 * embedded class name (bracketed by "L" and ";"). 180 */ 181 switch (*(s++)) { 182 case 'B': 183 case 'C': 184 case 'D': 185 case 'F': 186 case 'I': 187 case 'J': 188 case 'S': 189 case 'Z': { 190 // These are all single-character descriptors for primitive types. 191 return (*s == '\0'); 192 } 193 case 'V': { 194 // Non-array void is valid, but you can't have an array of void. 195 return (arrayCount == 0) && (*s == '\0'); 196 } 197 case 'L': { 198 // Class name: Break out and continue below. 199 break; 200 } 201 default: { 202 // Oddball descriptor character. 203 return false; 204 } 205 } 206 } 207 208 /* 209 * We just consumed the 'L' that introduces a class name as part 210 * of a type descriptor, or we are looking for an unadorned class 211 * name. 212 */ 213 214 bool sepOrFirst = true; // first character or just encountered a separator. 215 for (;;) { 216 u1 c = (u1) *s; 217 switch (c) { 218 case '\0': { 219 /* 220 * Premature end for a type descriptor, but valid for 221 * a class name as long as we haven't encountered an 222 * empty component (including the degenerate case of 223 * the empty string ""). 224 */ 225 return isClassName && !sepOrFirst; 226 } 227 case ';': { 228 /* 229 * Invalid character for a class name, but the 230 * legitimate end of a type descriptor. In the latter 231 * case, make sure that this is the end of the string 232 * and that it doesn't end with an empty component 233 * (including the degenerate case of "L;"). 234 */ 235 return !isClassName && !sepOrFirst && (s[1] == '\0'); 236 } 237 case '/': 238 case '.': { 239 if (dotSeparator != (c == '.')) { 240 // The wrong separator character. 241 return false; 242 } 243 if (sepOrFirst) { 244 // Separator at start or two separators in a row. 245 return false; 246 } 247 sepOrFirst = true; 248 s++; 249 break; 250 } 251 default: { 252 if (!dexIsValidMemberNameUtf8(&s)) { 253 return false; 254 } 255 sepOrFirst = false; 256 break; 257 } 258 } 259 } 260 } 261 262 /* Return whether the given string is a valid type descriptor. */ 263 bool dexIsValidTypeDescriptor(const char* s) { 264 return isValidTypeDescriptorOrClassName(s, false, false); 265 } 266 267 /* (documented in header) */ 268 bool dexIsValidClassName(const char* s, bool dotSeparator) { 269 return isValidTypeDescriptorOrClassName(s, true, dotSeparator); 270 } 271 272 /* Return whether the given string is a valid reference descriptor. This 273 * is true if dexIsValidTypeDescriptor() returns true and the descriptor 274 * is for a class or array and not a primitive type. */ 275 bool dexIsReferenceDescriptor(const char* s) { 276 if (!dexIsValidTypeDescriptor(s)) { 277 return false; 278 } 279 280 return (s[0] == 'L') || (s[0] == '['); 281 } 282 283 /* Return whether the given string is a valid class descriptor. This 284 * is true if dexIsValidTypeDescriptor() returns true and the descriptor 285 * is for a class and not an array or primitive type. */ 286 bool dexIsClassDescriptor(const char* s) { 287 if (!dexIsValidTypeDescriptor(s)) { 288 return false; 289 } 290 291 return s[0] == 'L'; 292 } 293 294 /* Return whether the given string is a valid field type descriptor. This 295 * is true if dexIsValidTypeDescriptor() returns true and the descriptor 296 * is for anything but "void". */ 297 bool dexIsFieldDescriptor(const char* s) { 298 if (!dexIsValidTypeDescriptor(s)) { 299 return false; 300 } 301 302 return s[0] != 'V'; 303 } 304 305