1 /* 2 * Copyright (C) 2011 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include "descriptors_names.h" 18 19 #include "android-base/stringprintf.h" 20 #include "android-base/strings.h" 21 22 #include "base/macros.h" 23 #include "dex/utf-inl.h" 24 25 namespace art { 26 27 using android::base::StringAppendF; 28 29 void AppendPrettyDescriptor(const char* descriptor, std::string* result) { 30 // Count the number of '['s to get the dimensionality. 31 const char* c = descriptor; 32 size_t dim = 0; 33 while (*c == '[') { 34 dim++; 35 c++; 36 } 37 38 // Reference or primitive? 39 if (*c == 'L') { 40 // "[[La/b/C;" -> "a.b.C[][]". 41 c++; // Skip the 'L'. 42 } else { 43 // "[[B" -> "byte[][]". 44 // To make life easier, we make primitives look like unqualified 45 // reference types. 46 switch (*c) { 47 case 'B': c = "byte;"; break; 48 case 'C': c = "char;"; break; 49 case 'D': c = "double;"; break; 50 case 'F': c = "float;"; break; 51 case 'I': c = "int;"; break; 52 case 'J': c = "long;"; break; 53 case 'S': c = "short;"; break; 54 case 'Z': c = "boolean;"; break; 55 case 'V': c = "void;"; break; // Used when decoding return types. 56 default: result->append(descriptor); return; 57 } 58 } 59 60 // At this point, 'c' is a string of the form "fully/qualified/Type;" 61 // or "primitive;". Rewrite the type with '.' instead of '/': 62 const char* p = c; 63 while (*p != ';') { 64 char ch = *p++; 65 if (ch == '/') { 66 ch = '.'; 67 } 68 result->push_back(ch); 69 } 70 // ...and replace the semicolon with 'dim' "[]" pairs: 71 for (size_t i = 0; i < dim; ++i) { 72 result->append("[]"); 73 } 74 } 75 76 std::string PrettyDescriptor(const char* descriptor) { 77 std::string result; 78 AppendPrettyDescriptor(descriptor, &result); 79 return result; 80 } 81 82 std::string GetJniShortName(const std::string& class_descriptor, const std::string& method) { 83 // Remove the leading 'L' and trailing ';'... 84 std::string class_name(class_descriptor); 85 CHECK_EQ(class_name[0], 'L') << class_name; 86 CHECK_EQ(class_name[class_name.size() - 1], ';') << class_name; 87 class_name.erase(0, 1); 88 class_name.erase(class_name.size() - 1, 1); 89 90 std::string short_name; 91 short_name += "Java_"; 92 short_name += MangleForJni(class_name); 93 short_name += "_"; 94 short_name += MangleForJni(method); 95 return short_name; 96 } 97 98 // See http://java.sun.com/j2se/1.5.0/docs/guide/jni/spec/design.html#wp615 for the full rules. 99 std::string MangleForJni(const std::string& s) { 100 std::string result; 101 size_t char_count = CountModifiedUtf8Chars(s.c_str()); 102 const char* cp = &s[0]; 103 for (size_t i = 0; i < char_count; ++i) { 104 uint32_t ch = GetUtf16FromUtf8(&cp); 105 if ((ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || (ch >= '0' && ch <= '9')) { 106 result.push_back(ch); 107 } else if (ch == '.' || ch == '/') { 108 result += "_"; 109 } else if (ch == '_') { 110 result += "_1"; 111 } else if (ch == ';') { 112 result += "_2"; 113 } else if (ch == '[') { 114 result += "_3"; 115 } else { 116 const uint16_t leading = GetLeadingUtf16Char(ch); 117 const uint32_t trailing = GetTrailingUtf16Char(ch); 118 119 StringAppendF(&result, "_0%04x", leading); 120 if (trailing != 0) { 121 StringAppendF(&result, "_0%04x", trailing); 122 } 123 } 124 } 125 return result; 126 } 127 128 std::string DotToDescriptor(const char* class_name) { 129 std::string descriptor(class_name); 130 std::replace(descriptor.begin(), descriptor.end(), '.', '/'); 131 if (descriptor.length() > 0 && descriptor[0] != '[') { 132 descriptor = "L" + descriptor + ";"; 133 } 134 return descriptor; 135 } 136 137 std::string DescriptorToDot(const char* descriptor) { 138 size_t length = strlen(descriptor); 139 if (length > 1) { 140 if (descriptor[0] == 'L' && descriptor[length - 1] == ';') { 141 // Descriptors have the leading 'L' and trailing ';' stripped. 142 std::string result(descriptor + 1, length - 2); 143 std::replace(result.begin(), result.end(), '/', '.'); 144 return result; 145 } else { 146 // For arrays the 'L' and ';' remain intact. 147 std::string result(descriptor); 148 std::replace(result.begin(), result.end(), '/', '.'); 149 return result; 150 } 151 } 152 // Do nothing for non-class/array descriptors. 153 return descriptor; 154 } 155 156 std::string DescriptorToName(const char* descriptor) { 157 size_t length = strlen(descriptor); 158 if (descriptor[0] == 'L' && descriptor[length - 1] == ';') { 159 std::string result(descriptor + 1, length - 2); 160 return result; 161 } 162 return descriptor; 163 } 164 165 // Helper for IsValidPartOfMemberNameUtf8(), a bit vector indicating valid low ascii. 166 static constexpr uint32_t DEX_MEMBER_VALID_LOW_ASCII[4] = { 167 0x00000000, // 00..1f low control characters; nothing valid 168 0x03ff2010, // 20..3f digits and symbols; valid: '0'..'9', '$', '-' 169 0x87fffffe, // 40..5f uppercase etc.; valid: 'A'..'Z', '_' 170 0x07fffffe // 60..7f lowercase etc.; valid: 'a'..'z' 171 }; 172 173 // Helper for IsValidPartOfMemberNameUtf8(); do not call directly. 174 COLD_ATTR 175 static bool IsValidPartOfMemberNameUtf8Slow(const char** pUtf8Ptr) { 176 /* 177 * It's a multibyte encoded character. Decode it and analyze. We 178 * accept anything that isn't (a) an improperly encoded low value, 179 * (b) an improper surrogate pair, (c) an encoded '\0', (d) a high 180 * control character, or (e) a high space, layout, or special 181 * character (U+00a0, U+2000..U+200f, U+2028..U+202f, 182 * U+fff0..U+ffff). This is all specified in the dex format 183 * document. 184 */ 185 186 const uint32_t pair = GetUtf16FromUtf8(pUtf8Ptr); 187 const uint16_t leading = GetLeadingUtf16Char(pair); 188 189 // We have a surrogate pair resulting from a valid 4 byte UTF sequence. 190 // No further checks are necessary because 4 byte sequences span code 191 // points [U+10000, U+1FFFFF], which are valid codepoints in a dex 192 // identifier. Furthermore, GetUtf16FromUtf8 guarantees that each of 193 // the surrogate halves are valid and well formed in this instance. 194 if (GetTrailingUtf16Char(pair) != 0) { 195 return true; 196 } 197 198 199 // We've encountered a one, two or three byte UTF-8 sequence. The 200 // three byte UTF-8 sequence could be one half of a surrogate pair. 201 switch (leading >> 8) { 202 case 0x00: 203 // It's only valid if it's above the ISO-8859-1 high space (0xa0). 204 return (leading > 0x00a0); 205 case 0xd8: 206 case 0xd9: 207 case 0xda: 208 case 0xdb: 209 { 210 // We found a three byte sequence encoding one half of a surrogate. 211 // Look for the other half. 212 const uint32_t pair2 = GetUtf16FromUtf8(pUtf8Ptr); 213 const uint16_t trailing = GetLeadingUtf16Char(pair2); 214 215 return (GetTrailingUtf16Char(pair2) == 0) && (0xdc00 <= trailing && trailing <= 0xdfff); 216 } 217 case 0xdc: 218 case 0xdd: 219 case 0xde: 220 case 0xdf: 221 // It's a trailing surrogate, which is not valid at this point. 222 return false; 223 case 0x20: 224 case 0xff: 225 // It's in the range that has spaces, controls, and specials. 226 switch (leading & 0xfff8) { 227 case 0x2000: 228 case 0x2008: 229 case 0x2028: 230 case 0xfff0: 231 case 0xfff8: 232 return false; 233 } 234 return true; 235 default: 236 return true; 237 } 238 239 UNREACHABLE(); 240 } 241 242 /* Return whether the pointed-at modified-UTF-8 encoded character is 243 * valid as part of a member name, updating the pointer to point past 244 * the consumed character. This will consume two encoded UTF-16 code 245 * points if the character is encoded as a surrogate pair. Also, if 246 * this function returns false, then the given pointer may only have 247 * been partially advanced. 248 */ 249 ALWAYS_INLINE 250 static bool IsValidPartOfMemberNameUtf8(const char** pUtf8Ptr) { 251 uint8_t c = (uint8_t) **pUtf8Ptr; 252 if (LIKELY(c <= 0x7f)) { 253 // It's low-ascii, so check the table. 254 uint32_t wordIdx = c >> 5; 255 uint32_t bitIdx = c & 0x1f; 256 (*pUtf8Ptr)++; 257 return (DEX_MEMBER_VALID_LOW_ASCII[wordIdx] & (1 << bitIdx)) != 0; 258 } 259 260 // It's a multibyte encoded character. Call a non-inline function 261 // for the heavy lifting. 262 return IsValidPartOfMemberNameUtf8Slow(pUtf8Ptr); 263 } 264 265 bool IsValidMemberName(const char* s) { 266 bool angle_name = false; 267 268 switch (*s) { 269 case '\0': 270 // The empty string is not a valid name. 271 return false; 272 case '<': 273 angle_name = true; 274 s++; 275 break; 276 } 277 278 while (true) { 279 switch (*s) { 280 case '\0': 281 return !angle_name; 282 case '>': 283 return angle_name && s[1] == '\0'; 284 } 285 286 if (!IsValidPartOfMemberNameUtf8(&s)) { 287 return false; 288 } 289 } 290 } 291 292 enum ClassNameType { kName, kDescriptor }; 293 template<ClassNameType kType, char kSeparator> 294 static bool IsValidClassName(const char* s) { 295 int arrayCount = 0; 296 while (*s == '[') { 297 arrayCount++; 298 s++; 299 } 300 301 if (arrayCount > 255) { 302 // Arrays may have no more than 255 dimensions. 303 return false; 304 } 305 306 ClassNameType type = kType; 307 if (type != kDescriptor && arrayCount != 0) { 308 /* 309 * If we're looking at an array of some sort, then it doesn't 310 * matter if what is being asked for is a class name; the 311 * format looks the same as a type descriptor in that case, so 312 * treat it as such. 313 */ 314 type = kDescriptor; 315 } 316 317 if (type == kDescriptor) { 318 /* 319 * We are looking for a descriptor. Either validate it as a 320 * single-character primitive type, or continue on to check the 321 * embedded class name (bracketed by "L" and ";"). 322 */ 323 switch (*(s++)) { 324 case 'B': 325 case 'C': 326 case 'D': 327 case 'F': 328 case 'I': 329 case 'J': 330 case 'S': 331 case 'Z': 332 // These are all single-character descriptors for primitive types. 333 return (*s == '\0'); 334 case 'V': 335 // Non-array void is valid, but you can't have an array of void. 336 return (arrayCount == 0) && (*s == '\0'); 337 case 'L': 338 // Class name: Break out and continue below. 339 break; 340 default: 341 // Oddball descriptor character. 342 return false; 343 } 344 } 345 346 /* 347 * We just consumed the 'L' that introduces a class name as part 348 * of a type descriptor, or we are looking for an unadorned class 349 * name. 350 */ 351 352 bool sepOrFirst = true; // first character or just encountered a separator. 353 for (;;) { 354 uint8_t c = (uint8_t) *s; 355 switch (c) { 356 case '\0': 357 /* 358 * Premature end for a type descriptor, but valid for 359 * a class name as long as we haven't encountered an 360 * empty component (including the degenerate case of 361 * the empty string ""). 362 */ 363 return (type == kName) && !sepOrFirst; 364 case ';': 365 /* 366 * Invalid character for a class name, but the 367 * legitimate end of a type descriptor. In the latter 368 * case, make sure that this is the end of the string 369 * and that it doesn't end with an empty component 370 * (including the degenerate case of "L;"). 371 */ 372 return (type == kDescriptor) && !sepOrFirst && (s[1] == '\0'); 373 case '/': 374 case '.': 375 if (c != kSeparator) { 376 // The wrong separator character. 377 return false; 378 } 379 if (sepOrFirst) { 380 // Separator at start or two separators in a row. 381 return false; 382 } 383 sepOrFirst = true; 384 s++; 385 break; 386 default: 387 if (!IsValidPartOfMemberNameUtf8(&s)) { 388 return false; 389 } 390 sepOrFirst = false; 391 break; 392 } 393 } 394 } 395 396 bool IsValidBinaryClassName(const char* s) { 397 return IsValidClassName<kName, '.'>(s); 398 } 399 400 bool IsValidJniClassName(const char* s) { 401 return IsValidClassName<kName, '/'>(s); 402 } 403 404 bool IsValidDescriptor(const char* s) { 405 return IsValidClassName<kDescriptor, '/'>(s); 406 } 407 408 std::string PrettyDescriptor(Primitive::Type type) { 409 return PrettyDescriptor(Primitive::Descriptor(type)); 410 } 411 412 } // namespace art 413