1 /* 2 * Copyright (C) 2011 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 /* 18 * Validate and manipulate MUTF-8 (modified UTF-8) encoded string data. 19 */ 20 21 #ifndef LIBDEX_DEXUTF_H_ 22 #define LIBDEX_DEXUTF_H_ 23 24 #include "DexFile.h" 25 26 /* 27 * Retrieve the next UTF-16 character from a UTF-8 string. 28 * 29 * Advances "*pUtf8Ptr" to the start of the next character. 30 * 31 * WARNING: If a string is corrupted by dropping a '\0' in the middle 32 * of a 3-byte sequence, you can end up overrunning the buffer with 33 * reads (and possibly with the writes if the length was computed and 34 * cached before the damage). For performance reasons, this function 35 * assumes that the string being parsed is known to be valid (e.g., by 36 * already being verified). Most strings we process here are coming 37 * out of dex files or other internal translations, so the only real 38 * risk comes from the JNI NewStringUTF call. 39 */ 40 DEX_INLINE u2 dexGetUtf16FromUtf8(const char** pUtf8Ptr) 41 { 42 unsigned int one, two, three; 43 44 one = *(*pUtf8Ptr)++; 45 if ((one & 0x80) != 0) { 46 /* two- or three-byte encoding */ 47 two = *(*pUtf8Ptr)++; 48 if ((one & 0x20) != 0) { 49 /* three-byte encoding */ 50 three = *(*pUtf8Ptr)++; 51 return ((one & 0x0f) << 12) | 52 ((two & 0x3f) << 6) | 53 (three & 0x3f); 54 } else { 55 /* two-byte encoding */ 56 return ((one & 0x1f) << 6) | 57 (two & 0x3f); 58 } 59 } else { 60 /* one-byte encoding */ 61 return one; 62 } 63 } 64 65 /* Compare two '\0'-terminated modified UTF-8 strings, using Unicode 66 * code point values for comparison. This treats different encodings 67 * for the same code point as equivalent, except that only a real '\0' 68 * byte is considered the string terminator. The return value is as 69 * for strcmp(). */ 70 int dexUtf8Cmp(const char* s1, const char* s2); 71 72 /* for dexIsValidMemberNameUtf8(), a bit vector indicating valid low ascii */ 73 extern u4 DEX_MEMBER_VALID_LOW_ASCII[4]; 74 75 /* Helper for dexIsValidMemberUtf8(); do not call directly. */ 76 bool dexIsValidMemberNameUtf8_0(const char** pUtf8Ptr); 77 78 /* Return whether the pointed-at modified-UTF-8 encoded character is 79 * valid as part of a member name, updating the pointer to point past 80 * the consumed character. This will consume two encoded UTF-16 code 81 * points if the character is encoded as a surrogate pair. Also, if 82 * this function returns false, then the given pointer may only have 83 * been partially advanced. */ 84 DEX_INLINE bool dexIsValidMemberNameUtf8(const char** pUtf8Ptr) { 85 u1 c = (u1) **pUtf8Ptr; 86 if (c <= 0x7f) { 87 // It's low-ascii, so check the table. 88 u4 wordIdx = c >> 5; 89 u4 bitIdx = c & 0x1f; 90 (*pUtf8Ptr)++; 91 return (DEX_MEMBER_VALID_LOW_ASCII[wordIdx] & (1 << bitIdx)) != 0; 92 } 93 94 /* 95 * It's a multibyte encoded character. Call a non-inline function 96 * for the heavy lifting. 97 */ 98 return dexIsValidMemberNameUtf8_0(pUtf8Ptr); 99 } 100 101 /* Return whether the given string is a valid field or method name. */ 102 bool dexIsValidMemberName(const char* s); 103 104 /* Return whether the given string is a valid type descriptor. */ 105 bool dexIsValidTypeDescriptor(const char* s); 106 107 /* Return whether the given string is a valid internal-form class 108 * name, with components separated either by dots or slashes as 109 * specified. A class name is like a type descriptor, except that it 110 * can't name a primitive type (including void). In terms of syntax, 111 * the form is either (a) the name of the class without adornment 112 * (that is, not bracketed by "L" and ";"); or (b) identical to the 113 * type descriptor syntax for array types. */ 114 bool dexIsValidClassName(const char* s, bool dotSeparator); 115 116 /* Return whether the given string is a valid reference descriptor. This 117 * is true if dexIsValidTypeDescriptor() returns true and the descriptor 118 * is for a class or array and not a primitive type. */ 119 bool dexIsReferenceDescriptor(const char* s); 120 121 /* Return whether the given string is a valid class descriptor. This 122 * is true if dexIsValidTypeDescriptor() returns true and the descriptor 123 * is for a class and not an array or primitive type. */ 124 bool dexIsClassDescriptor(const char* s); 125 126 /* Return whether the given string is a valid field type descriptor. This 127 * is true if dexIsValidTypeDescriptor() returns true and the descriptor 128 * is for anything but "void". */ 129 bool dexIsFieldDescriptor(const char* s); 130 131 #endif // LIBDEX_DEXUTF_H_ 132