1 /* 2 * Copyright (C) 2015 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include "util/Util.h" 18 19 #include <algorithm> 20 #include <ostream> 21 #include <string> 22 #include <vector> 23 24 #include "android-base/stringprintf.h" 25 #include "androidfw/StringPiece.h" 26 #include "build/version.h" 27 28 #include "text/Unicode.h" 29 #include "text/Utf8Iterator.h" 30 #include "util/BigBuffer.h" 31 #include "util/Maybe.h" 32 #include "utils/Unicode.h" 33 34 using ::aapt::text::Utf8Iterator; 35 using ::android::StringPiece; 36 using ::android::StringPiece16; 37 38 namespace aapt { 39 namespace util { 40 41 static std::vector<std::string> SplitAndTransform( 42 const StringPiece& str, char sep, const std::function<char(char)>& f) { 43 std::vector<std::string> parts; 44 const StringPiece::const_iterator end = std::end(str); 45 StringPiece::const_iterator start = std::begin(str); 46 StringPiece::const_iterator current; 47 do { 48 current = std::find(start, end, sep); 49 parts.emplace_back(str.substr(start, current).to_string()); 50 if (f) { 51 std::string& part = parts.back(); 52 std::transform(part.begin(), part.end(), part.begin(), f); 53 } 54 start = current + 1; 55 } while (current != end); 56 return parts; 57 } 58 59 std::vector<std::string> Split(const StringPiece& str, char sep) { 60 return SplitAndTransform(str, sep, nullptr); 61 } 62 63 std::vector<std::string> SplitAndLowercase(const StringPiece& str, char sep) { 64 return SplitAndTransform(str, sep, ::tolower); 65 } 66 67 bool StartsWith(const StringPiece& str, const StringPiece& prefix) { 68 if (str.size() < prefix.size()) { 69 return false; 70 } 71 return str.substr(0, prefix.size()) == prefix; 72 } 73 74 bool EndsWith(const StringPiece& str, const StringPiece& suffix) { 75 if (str.size() < suffix.size()) { 76 return false; 77 } 78 return str.substr(str.size() - suffix.size(), suffix.size()) == suffix; 79 } 80 81 StringPiece TrimLeadingWhitespace(const StringPiece& str) { 82 if (str.size() == 0 || str.data() == nullptr) { 83 return str; 84 } 85 86 const char* start = str.data(); 87 const char* end = start + str.length(); 88 89 while (start != end && isspace(*start)) { 90 start++; 91 } 92 return StringPiece(start, end - start); 93 } 94 95 StringPiece TrimTrailingWhitespace(const StringPiece& str) { 96 if (str.size() == 0 || str.data() == nullptr) { 97 return str; 98 } 99 100 const char* start = str.data(); 101 const char* end = start + str.length(); 102 103 while (end != start && isspace(*(end - 1))) { 104 end--; 105 } 106 return StringPiece(start, end - start); 107 } 108 109 StringPiece TrimWhitespace(const StringPiece& str) { 110 if (str.size() == 0 || str.data() == nullptr) { 111 return str; 112 } 113 114 const char* start = str.data(); 115 const char* end = str.data() + str.length(); 116 117 while (start != end && isspace(*start)) { 118 start++; 119 } 120 121 while (end != start && isspace(*(end - 1))) { 122 end--; 123 } 124 125 return StringPiece(start, end - start); 126 } 127 128 static int IsJavaNameImpl(const StringPiece& str) { 129 int pieces = 0; 130 for (const StringPiece& piece : Tokenize(str, '.')) { 131 pieces++; 132 if (!text::IsJavaIdentifier(piece)) { 133 return -1; 134 } 135 } 136 return pieces; 137 } 138 139 bool IsJavaClassName(const StringPiece& str) { 140 return IsJavaNameImpl(str) >= 2; 141 } 142 143 bool IsJavaPackageName(const StringPiece& str) { 144 return IsJavaNameImpl(str) >= 1; 145 } 146 147 static int IsAndroidNameImpl(const StringPiece& str) { 148 int pieces = 0; 149 for (const StringPiece& piece : Tokenize(str, '.')) { 150 if (piece.empty()) { 151 return -1; 152 } 153 154 const char first_character = piece.data()[0]; 155 if (!::isalpha(first_character)) { 156 return -1; 157 } 158 159 bool valid = std::all_of(piece.begin() + 1, piece.end(), [](const char c) -> bool { 160 return ::isalnum(c) || c == '_'; 161 }); 162 163 if (!valid) { 164 return -1; 165 } 166 pieces++; 167 } 168 return pieces; 169 } 170 171 bool IsAndroidPackageName(const StringPiece& str) { 172 return IsAndroidNameImpl(str) > 1 || str == "android"; 173 } 174 175 bool IsAndroidSplitName(const StringPiece& str) { 176 return IsAndroidNameImpl(str) > 0; 177 } 178 179 Maybe<std::string> GetFullyQualifiedClassName(const StringPiece& package, 180 const StringPiece& classname) { 181 if (classname.empty()) { 182 return {}; 183 } 184 185 if (util::IsJavaClassName(classname)) { 186 return classname.to_string(); 187 } 188 189 if (package.empty()) { 190 return {}; 191 } 192 193 std::string result = package.to_string(); 194 if (classname.data()[0] != '.') { 195 result += '.'; 196 } 197 198 result.append(classname.data(), classname.size()); 199 if (!IsJavaClassName(result)) { 200 return {}; 201 } 202 return result; 203 } 204 205 const char* GetToolName() { 206 static const char* const sToolName = "Android Asset Packaging Tool (aapt)"; 207 return sToolName; 208 } 209 210 std::string GetToolFingerprint() { 211 // DO NOT UPDATE, this is more of a marketing version. 212 static const char* const sMajorVersion = "2"; 213 214 // Update minor version whenever a feature or flag is added. 215 static const char* const sMinorVersion = "19"; 216 217 // The build id of aapt2 binary. 218 static const std::string sBuildId = android::build::GetBuildNumber(); 219 220 return android::base::StringPrintf("%s.%s-%s", sMajorVersion, sMinorVersion, sBuildId.c_str()); 221 } 222 223 static size_t ConsumeDigits(const char* start, const char* end) { 224 const char* c = start; 225 for (; c != end && *c >= '0' && *c <= '9'; c++) { 226 } 227 return static_cast<size_t>(c - start); 228 } 229 230 bool VerifyJavaStringFormat(const StringPiece& str) { 231 const char* c = str.begin(); 232 const char* const end = str.end(); 233 234 size_t arg_count = 0; 235 bool nonpositional = false; 236 while (c != end) { 237 if (*c == '%' && c + 1 < end) { 238 c++; 239 240 if (*c == '%' || *c == 'n') { 241 c++; 242 continue; 243 } 244 245 arg_count++; 246 247 size_t num_digits = ConsumeDigits(c, end); 248 if (num_digits > 0) { 249 c += num_digits; 250 if (c != end && *c != '$') { 251 // The digits were a size, but not a positional argument. 252 nonpositional = true; 253 } 254 } else if (*c == '<') { 255 // Reusing last argument, bad idea since positions can be moved around 256 // during translation. 257 nonpositional = true; 258 259 c++; 260 261 // Optionally we can have a $ after 262 if (c != end && *c == '$') { 263 c++; 264 } 265 } else { 266 nonpositional = true; 267 } 268 269 // Ignore size, width, flags, etc. 270 while (c != end && (*c == '-' || *c == '#' || *c == '+' || *c == ' ' || 271 *c == ',' || *c == '(' || (*c >= '0' && *c <= '9'))) { 272 c++; 273 } 274 275 /* 276 * This is a shortcut to detect strings that are going to Time.format() 277 * instead of String.format() 278 * 279 * Comparison of String.format() and Time.format() args: 280 * 281 * String: ABC E GH ST X abcdefgh nost x 282 * Time: DEFGHKMS W Za d hkm s w yz 283 * 284 * Therefore we know it's definitely Time if we have: 285 * DFKMWZkmwyz 286 */ 287 if (c != end) { 288 switch (*c) { 289 case 'D': 290 case 'F': 291 case 'K': 292 case 'M': 293 case 'W': 294 case 'Z': 295 case 'k': 296 case 'm': 297 case 'w': 298 case 'y': 299 case 'z': 300 return true; 301 } 302 } 303 } 304 305 if (c != end) { 306 c++; 307 } 308 } 309 310 if (arg_count > 1 && nonpositional) { 311 // Multiple arguments were specified, but some or all were non positional. 312 // Translated 313 // strings may rearrange the order of the arguments, which will break the 314 // string. 315 return false; 316 } 317 return true; 318 } 319 320 std::string Utf8ToModifiedUtf8(const std::string& utf8) { 321 // Java uses Modified UTF-8 which only supports the 1, 2, and 3 byte formats of UTF-8. To encode 322 // 4 byte UTF-8 codepoints, Modified UTF-8 allows the use of surrogate pairs in the same format 323 // of CESU-8 surrogate pairs. Calculate the size of the utf8 string with all 4 byte UTF-8 324 // codepoints replaced with 2 3 byte surrogate pairs 325 size_t modified_size = 0; 326 const size_t size = utf8.size(); 327 for (size_t i = 0; i < size; i++) { 328 if (((uint8_t) utf8[i] >> 4) == 0xF) { 329 modified_size += 6; 330 i += 3; 331 } else { 332 modified_size++; 333 } 334 } 335 336 // Early out if no 4 byte codepoints are found 337 if (size == modified_size) { 338 return utf8; 339 } 340 341 std::string output; 342 output.reserve(modified_size); 343 for (size_t i = 0; i < size; i++) { 344 if (((uint8_t) utf8[i] >> 4) == 0xF) { 345 int32_t codepoint = utf32_from_utf8_at(utf8.data(), size, i, nullptr); 346 347 // Calculate the high and low surrogates as UTF-16 would 348 int32_t high = ((codepoint - 0x10000) / 0x400) + 0xD800; 349 int32_t low = ((codepoint - 0x10000) % 0x400) + 0xDC00; 350 351 // Encode each surrogate in UTF-8 352 output.push_back((char) (0xE4 | ((high >> 12) & 0xF))); 353 output.push_back((char) (0x80 | ((high >> 6) & 0x3F))); 354 output.push_back((char) (0x80 | (high & 0x3F))); 355 output.push_back((char) (0xE4 | ((low >> 12) & 0xF))); 356 output.push_back((char) (0x80 | ((low >> 6) & 0x3F))); 357 output.push_back((char) (0x80 | (low & 0x3F))); 358 i += 3; 359 } else { 360 output.push_back(utf8[i]); 361 } 362 } 363 364 return output; 365 } 366 367 std::string ModifiedUtf8ToUtf8(const std::string& modified_utf8) { 368 // The UTF-8 representation will have a byte length less than or equal to the Modified UTF-8 369 // representation. 370 std::string output; 371 output.reserve(modified_utf8.size()); 372 373 size_t index = 0; 374 const size_t modified_size = modified_utf8.size(); 375 while (index < modified_size) { 376 size_t next_index; 377 int32_t high_surrogate = utf32_from_utf8_at(modified_utf8.data(), modified_size, index, 378 &next_index); 379 if (high_surrogate < 0) { 380 return {}; 381 } 382 383 // Check that the first codepoint is within the high surrogate range 384 if (high_surrogate >= 0xD800 && high_surrogate <= 0xDB7F) { 385 int32_t low_surrogate = utf32_from_utf8_at(modified_utf8.data(), modified_size, next_index, 386 &next_index); 387 if (low_surrogate < 0) { 388 return {}; 389 } 390 391 // Check that the second codepoint is within the low surrogate range 392 if (low_surrogate >= 0xDC00 && low_surrogate <= 0xDFFF) { 393 const char32_t codepoint = (char32_t) (((high_surrogate - 0xD800) * 0x400) 394 + (low_surrogate - 0xDC00) + 0x10000); 395 396 // The decoded codepoint should represent a 4 byte, UTF-8 character 397 const size_t utf8_length = (size_t) utf32_to_utf8_length(&codepoint, 1); 398 if (utf8_length != 4) { 399 return {}; 400 } 401 402 // Encode the UTF-8 representation of the codepoint into the string 403 char* start = &output[output.size()]; 404 output.resize(output.size() + utf8_length); 405 utf32_to_utf8((char32_t*) &codepoint, 1, start, utf8_length + 1); 406 407 index = next_index; 408 continue; 409 } 410 } 411 412 // Append non-surrogate pairs to the output string 413 for (size_t i = index; i < next_index; i++) { 414 output.push_back(modified_utf8[i]); 415 } 416 index = next_index; 417 } 418 return output; 419 } 420 421 std::u16string Utf8ToUtf16(const StringPiece& utf8) { 422 ssize_t utf16_length = utf8_to_utf16_length( 423 reinterpret_cast<const uint8_t*>(utf8.data()), utf8.length()); 424 if (utf16_length <= 0) { 425 return {}; 426 } 427 428 std::u16string utf16; 429 utf16.resize(utf16_length); 430 utf8_to_utf16(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.length(), 431 &*utf16.begin(), utf16_length + 1); 432 return utf16; 433 } 434 435 std::string Utf16ToUtf8(const StringPiece16& utf16) { 436 ssize_t utf8_length = utf16_to_utf8_length(utf16.data(), utf16.length()); 437 if (utf8_length <= 0) { 438 return {}; 439 } 440 441 std::string utf8; 442 utf8.resize(utf8_length); 443 utf16_to_utf8(utf16.data(), utf16.length(), &*utf8.begin(), utf8_length + 1); 444 return utf8; 445 } 446 447 bool WriteAll(std::ostream& out, const BigBuffer& buffer) { 448 for (const auto& b : buffer) { 449 if (!out.write(reinterpret_cast<const char*>(b.buffer.get()), b.size)) { 450 return false; 451 } 452 } 453 return true; 454 } 455 456 std::unique_ptr<uint8_t[]> Copy(const BigBuffer& buffer) { 457 std::unique_ptr<uint8_t[]> data = 458 std::unique_ptr<uint8_t[]>(new uint8_t[buffer.size()]); 459 uint8_t* p = data.get(); 460 for (const auto& block : buffer) { 461 memcpy(p, block.buffer.get(), block.size); 462 p += block.size; 463 } 464 return data; 465 } 466 467 typename Tokenizer::iterator& Tokenizer::iterator::operator++() { 468 const char* start = token_.end(); 469 const char* end = str_.end(); 470 if (start == end) { 471 end_ = true; 472 token_.assign(token_.end(), 0); 473 return *this; 474 } 475 476 start += 1; 477 const char* current = start; 478 while (current != end) { 479 if (*current == separator_) { 480 token_.assign(start, current - start); 481 return *this; 482 } 483 ++current; 484 } 485 token_.assign(start, end - start); 486 return *this; 487 } 488 489 bool Tokenizer::iterator::operator==(const iterator& rhs) const { 490 // We check equality here a bit differently. 491 // We need to know that the addresses are the same. 492 return token_.begin() == rhs.token_.begin() && 493 token_.end() == rhs.token_.end() && end_ == rhs.end_; 494 } 495 496 bool Tokenizer::iterator::operator!=(const iterator& rhs) const { 497 return !(*this == rhs); 498 } 499 500 Tokenizer::iterator::iterator(const StringPiece& s, char sep, const StringPiece& tok, bool end) 501 : str_(s), separator_(sep), token_(tok), end_(end) {} 502 503 Tokenizer::Tokenizer(const StringPiece& str, char sep) 504 : begin_(++iterator(str, sep, StringPiece(str.begin() - 1, 0), false)), 505 end_(str, sep, StringPiece(str.end(), 0), true) {} 506 507 bool ExtractResFilePathParts(const StringPiece& path, StringPiece* out_prefix, 508 StringPiece* out_entry, StringPiece* out_suffix) { 509 const StringPiece res_prefix("res/"); 510 if (!StartsWith(path, res_prefix)) { 511 return false; 512 } 513 514 StringPiece::const_iterator last_occurence = path.end(); 515 for (auto iter = path.begin() + res_prefix.size(); iter != path.end(); 516 ++iter) { 517 if (*iter == '/') { 518 last_occurence = iter; 519 } 520 } 521 522 if (last_occurence == path.end()) { 523 return false; 524 } 525 526 auto iter = std::find(last_occurence, path.end(), '.'); 527 *out_suffix = StringPiece(iter, path.end() - iter); 528 *out_entry = StringPiece(last_occurence + 1, iter - last_occurence - 1); 529 *out_prefix = StringPiece(path.begin(), last_occurence - path.begin() + 1); 530 return true; 531 } 532 533 StringPiece16 GetString16(const android::ResStringPool& pool, size_t idx) { 534 size_t len; 535 const char16_t* str = pool.stringAt(idx, &len); 536 if (str != nullptr) { 537 return StringPiece16(str, len); 538 } 539 return StringPiece16(); 540 } 541 542 std::string GetString(const android::ResStringPool& pool, size_t idx) { 543 size_t len; 544 const char* str = pool.string8At(idx, &len); 545 if (str != nullptr) { 546 return ModifiedUtf8ToUtf8(std::string(str, len)); 547 } 548 return Utf16ToUtf8(GetString16(pool, idx)); 549 } 550 551 } // namespace util 552 } // namespace aapt 553