1 /* 2 * Copyright (C) 2015 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include "util/Util.h" 18 19 #include <utils/Unicode.h> 20 #include <algorithm> 21 #include <ostream> 22 #include <string> 23 #include <vector> 24 25 #include "androidfw/StringPiece.h" 26 27 #include "util/BigBuffer.h" 28 #include "util/Maybe.h" 29 30 using android::StringPiece; 31 using android::StringPiece16; 32 33 namespace aapt { 34 namespace util { 35 36 static std::vector<std::string> SplitAndTransform( 37 const StringPiece& str, char sep, const std::function<char(char)>& f) { 38 std::vector<std::string> parts; 39 const StringPiece::const_iterator end = std::end(str); 40 StringPiece::const_iterator start = std::begin(str); 41 StringPiece::const_iterator current; 42 do { 43 current = std::find(start, end, sep); 44 parts.emplace_back(str.substr(start, current).to_string()); 45 if (f) { 46 std::string& part = parts.back(); 47 std::transform(part.begin(), part.end(), part.begin(), f); 48 } 49 start = current + 1; 50 } while (current != end); 51 return parts; 52 } 53 54 std::vector<std::string> Split(const StringPiece& str, char sep) { 55 return SplitAndTransform(str, sep, nullptr); 56 } 57 58 std::vector<std::string> SplitAndLowercase(const StringPiece& str, char sep) { 59 return SplitAndTransform(str, sep, ::tolower); 60 } 61 62 bool StartsWith(const StringPiece& str, const StringPiece& prefix) { 63 if (str.size() < prefix.size()) { 64 return false; 65 } 66 return str.substr(0, prefix.size()) == prefix; 67 } 68 69 bool EndsWith(const StringPiece& str, const StringPiece& suffix) { 70 if (str.size() < suffix.size()) { 71 return false; 72 } 73 return str.substr(str.size() - suffix.size(), suffix.size()) == suffix; 74 } 75 76 StringPiece TrimWhitespace(const StringPiece& str) { 77 if (str.size() == 0 || str.data() == nullptr) { 78 return str; 79 } 80 81 const char* start = str.data(); 82 const char* end = str.data() + str.length(); 83 84 while (start != end && isspace(*start)) { 85 start++; 86 } 87 88 while (end != start && isspace(*(end - 1))) { 89 end--; 90 } 91 92 return StringPiece(start, end - start); 93 } 94 95 StringPiece::const_iterator FindNonAlphaNumericAndNotInSet( 96 const StringPiece& str, const StringPiece& allowed_chars) { 97 const auto end_iter = str.end(); 98 for (auto iter = str.begin(); iter != end_iter; ++iter) { 99 char c = *iter; 100 if ((c >= u'a' && c <= u'z') || (c >= u'A' && c <= u'Z') || 101 (c >= u'0' && c <= u'9')) { 102 continue; 103 } 104 105 bool match = false; 106 for (char i : allowed_chars) { 107 if (c == i) { 108 match = true; 109 break; 110 } 111 } 112 113 if (!match) { 114 return iter; 115 } 116 } 117 return end_iter; 118 } 119 120 bool IsJavaClassName(const StringPiece& str) { 121 size_t pieces = 0; 122 for (const StringPiece& piece : Tokenize(str, '.')) { 123 pieces++; 124 if (piece.empty()) { 125 return false; 126 } 127 128 // Can't have starting or trailing $ character. 129 if (piece.data()[0] == '$' || piece.data()[piece.size() - 1] == '$') { 130 return false; 131 } 132 133 if (FindNonAlphaNumericAndNotInSet(piece, "$_") != piece.end()) { 134 return false; 135 } 136 } 137 return pieces >= 2; 138 } 139 140 bool IsJavaPackageName(const StringPiece& str) { 141 if (str.empty()) { 142 return false; 143 } 144 145 size_t pieces = 0; 146 for (const StringPiece& piece : Tokenize(str, '.')) { 147 pieces++; 148 if (piece.empty()) { 149 return false; 150 } 151 152 if (piece.data()[0] == '_' || piece.data()[piece.size() - 1] == '_') { 153 return false; 154 } 155 156 if (FindNonAlphaNumericAndNotInSet(piece, "_") != piece.end()) { 157 return false; 158 } 159 } 160 return pieces >= 1; 161 } 162 163 Maybe<std::string> GetFullyQualifiedClassName(const StringPiece& package, 164 const StringPiece& classname) { 165 if (classname.empty()) { 166 return {}; 167 } 168 169 if (util::IsJavaClassName(classname)) { 170 return classname.to_string(); 171 } 172 173 if (package.empty()) { 174 return {}; 175 } 176 177 std::string result(package.data(), package.size()); 178 if (classname.data()[0] != '.') { 179 result += '.'; 180 } 181 182 result.append(classname.data(), classname.size()); 183 if (!IsJavaClassName(result)) { 184 return {}; 185 } 186 return result; 187 } 188 189 static size_t ConsumeDigits(const char* start, const char* end) { 190 const char* c = start; 191 for (; c != end && *c >= '0' && *c <= '9'; c++) { 192 } 193 return static_cast<size_t>(c - start); 194 } 195 196 bool VerifyJavaStringFormat(const StringPiece& str) { 197 const char* c = str.begin(); 198 const char* const end = str.end(); 199 200 size_t arg_count = 0; 201 bool nonpositional = false; 202 while (c != end) { 203 if (*c == '%' && c + 1 < end) { 204 c++; 205 206 if (*c == '%') { 207 c++; 208 continue; 209 } 210 211 arg_count++; 212 213 size_t num_digits = ConsumeDigits(c, end); 214 if (num_digits > 0) { 215 c += num_digits; 216 if (c != end && *c != '$') { 217 // The digits were a size, but not a positional argument. 218 nonpositional = true; 219 } 220 } else if (*c == '<') { 221 // Reusing last argument, bad idea since positions can be moved around 222 // during translation. 223 nonpositional = true; 224 225 c++; 226 227 // Optionally we can have a $ after 228 if (c != end && *c == '$') { 229 c++; 230 } 231 } else { 232 nonpositional = true; 233 } 234 235 // Ignore size, width, flags, etc. 236 while (c != end && (*c == '-' || *c == '#' || *c == '+' || *c == ' ' || 237 *c == ',' || *c == '(' || (*c >= '0' && *c <= '9'))) { 238 c++; 239 } 240 241 /* 242 * This is a shortcut to detect strings that are going to Time.format() 243 * instead of String.format() 244 * 245 * Comparison of String.format() and Time.format() args: 246 * 247 * String: ABC E GH ST X abcdefgh nost x 248 * Time: DEFGHKMS W Za d hkm s w yz 249 * 250 * Therefore we know it's definitely Time if we have: 251 * DFKMWZkmwyz 252 */ 253 if (c != end) { 254 switch (*c) { 255 case 'D': 256 case 'F': 257 case 'K': 258 case 'M': 259 case 'W': 260 case 'Z': 261 case 'k': 262 case 'm': 263 case 'w': 264 case 'y': 265 case 'z': 266 return true; 267 } 268 } 269 } 270 271 if (c != end) { 272 c++; 273 } 274 } 275 276 if (arg_count > 1 && nonpositional) { 277 // Multiple arguments were specified, but some or all were non positional. 278 // Translated 279 // strings may rearrange the order of the arguments, which will break the 280 // string. 281 return false; 282 } 283 return true; 284 } 285 286 static Maybe<std::string> ParseUnicodeCodepoint(const char** start, 287 const char* end) { 288 char32_t code = 0; 289 for (size_t i = 0; i < 4 && *start != end; i++, (*start)++) { 290 char c = **start; 291 char32_t a; 292 if (c >= '0' && c <= '9') { 293 a = c - '0'; 294 } else if (c >= 'a' && c <= 'f') { 295 a = c - 'a' + 10; 296 } else if (c >= 'A' && c <= 'F') { 297 a = c - 'A' + 10; 298 } else { 299 return {}; 300 } 301 code = (code << 4) | a; 302 } 303 304 ssize_t len = utf32_to_utf8_length(&code, 1); 305 if (len < 0) { 306 return {}; 307 } 308 309 std::string result_utf8; 310 result_utf8.resize(len); 311 utf32_to_utf8(&code, 1, &*result_utf8.begin(), len + 1); 312 return result_utf8; 313 } 314 315 StringBuilder& StringBuilder::Append(const StringPiece& str) { 316 if (!error_.empty()) { 317 return *this; 318 } 319 320 // Where the new data will be appended to. 321 size_t new_data_index = str_.size(); 322 323 const char* const end = str.end(); 324 const char* start = str.begin(); 325 const char* current = start; 326 while (current != end) { 327 if (last_char_was_escape_) { 328 switch (*current) { 329 case 't': 330 str_ += '\t'; 331 break; 332 case 'n': 333 str_ += '\n'; 334 break; 335 case '#': 336 str_ += '#'; 337 break; 338 case '@': 339 str_ += '@'; 340 break; 341 case '?': 342 str_ += '?'; 343 break; 344 case '"': 345 str_ += '"'; 346 break; 347 case '\'': 348 str_ += '\''; 349 break; 350 case '\\': 351 str_ += '\\'; 352 break; 353 case 'u': { 354 current++; 355 Maybe<std::string> c = ParseUnicodeCodepoint(¤t, end); 356 if (!c) { 357 error_ = "invalid unicode escape sequence"; 358 return *this; 359 } 360 str_ += c.value(); 361 current -= 1; 362 break; 363 } 364 365 default: 366 // Ignore. 367 break; 368 } 369 last_char_was_escape_ = false; 370 start = current + 1; 371 } else if (*current == '"') { 372 if (!quote_ && trailing_space_) { 373 // We found an opening quote, and we have 374 // trailing space, so we should append that 375 // space now. 376 if (trailing_space_) { 377 // We had trailing whitespace, so 378 // replace with a single space. 379 if (!str_.empty()) { 380 str_ += ' '; 381 } 382 trailing_space_ = false; 383 } 384 } 385 quote_ = !quote_; 386 str_.append(start, current - start); 387 start = current + 1; 388 } else if (*current == '\'' && !quote_) { 389 // This should be escaped. 390 error_ = "unescaped apostrophe"; 391 return *this; 392 } else if (*current == '\\') { 393 // This is an escape sequence, convert to the real value. 394 if (!quote_ && trailing_space_) { 395 // We had trailing whitespace, so 396 // replace with a single space. 397 if (!str_.empty()) { 398 str_ += ' '; 399 } 400 trailing_space_ = false; 401 } 402 str_.append(start, current - start); 403 start = current + 1; 404 last_char_was_escape_ = true; 405 } else if (!quote_) { 406 // This is not quoted text, so look for whitespace. 407 if (isspace(*current)) { 408 // We found whitespace, see if we have seen some 409 // before. 410 if (!trailing_space_) { 411 // We didn't see a previous adjacent space, 412 // so mark that we did. 413 trailing_space_ = true; 414 str_.append(start, current - start); 415 } 416 417 // Keep skipping whitespace. 418 start = current + 1; 419 } else if (trailing_space_) { 420 // We saw trailing space before, so replace all 421 // that trailing space with one space. 422 if (!str_.empty()) { 423 str_ += ' '; 424 } 425 trailing_space_ = false; 426 } 427 } 428 current++; 429 } 430 str_.append(start, end - start); 431 432 // Accumulate the added string's UTF-16 length. 433 ssize_t len = utf8_to_utf16_length( 434 reinterpret_cast<const uint8_t*>(str_.data()) + new_data_index, 435 str_.size() - new_data_index); 436 if (len < 0) { 437 error_ = "invalid unicode code point"; 438 return *this; 439 } 440 utf16_len_ += len; 441 return *this; 442 } 443 444 std::u16string Utf8ToUtf16(const StringPiece& utf8) { 445 ssize_t utf16_length = utf8_to_utf16_length( 446 reinterpret_cast<const uint8_t*>(utf8.data()), utf8.length()); 447 if (utf16_length <= 0) { 448 return {}; 449 } 450 451 std::u16string utf16; 452 utf16.resize(utf16_length); 453 utf8_to_utf16(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.length(), 454 &*utf16.begin(), utf16_length + 1); 455 return utf16; 456 } 457 458 std::string Utf16ToUtf8(const StringPiece16& utf16) { 459 ssize_t utf8_length = utf16_to_utf8_length(utf16.data(), utf16.length()); 460 if (utf8_length <= 0) { 461 return {}; 462 } 463 464 std::string utf8; 465 utf8.resize(utf8_length); 466 utf16_to_utf8(utf16.data(), utf16.length(), &*utf8.begin(), utf8_length + 1); 467 return utf8; 468 } 469 470 bool WriteAll(std::ostream& out, const BigBuffer& buffer) { 471 for (const auto& b : buffer) { 472 if (!out.write(reinterpret_cast<const char*>(b.buffer.get()), b.size)) { 473 return false; 474 } 475 } 476 return true; 477 } 478 479 std::unique_ptr<uint8_t[]> Copy(const BigBuffer& buffer) { 480 std::unique_ptr<uint8_t[]> data = 481 std::unique_ptr<uint8_t[]>(new uint8_t[buffer.size()]); 482 uint8_t* p = data.get(); 483 for (const auto& block : buffer) { 484 memcpy(p, block.buffer.get(), block.size); 485 p += block.size; 486 } 487 return data; 488 } 489 490 typename Tokenizer::iterator& Tokenizer::iterator::operator++() { 491 const char* start = token_.end(); 492 const char* end = str_.end(); 493 if (start == end) { 494 end_ = true; 495 token_.assign(token_.end(), 0); 496 return *this; 497 } 498 499 start += 1; 500 const char* current = start; 501 while (current != end) { 502 if (*current == separator_) { 503 token_.assign(start, current - start); 504 return *this; 505 } 506 ++current; 507 } 508 token_.assign(start, end - start); 509 return *this; 510 } 511 512 bool Tokenizer::iterator::operator==(const iterator& rhs) const { 513 // We check equality here a bit differently. 514 // We need to know that the addresses are the same. 515 return token_.begin() == rhs.token_.begin() && 516 token_.end() == rhs.token_.end() && end_ == rhs.end_; 517 } 518 519 bool Tokenizer::iterator::operator!=(const iterator& rhs) const { 520 return !(*this == rhs); 521 } 522 523 Tokenizer::iterator::iterator(StringPiece s, char sep, StringPiece tok, 524 bool end) 525 : str_(s), separator_(sep), token_(tok), end_(end) {} 526 527 Tokenizer::Tokenizer(StringPiece str, char sep) 528 : begin_(++iterator(str, sep, StringPiece(str.begin() - 1, 0), false)), 529 end_(str, sep, StringPiece(str.end(), 0), true) {} 530 531 bool ExtractResFilePathParts(const StringPiece& path, StringPiece* out_prefix, 532 StringPiece* out_entry, StringPiece* out_suffix) { 533 const StringPiece res_prefix("res/"); 534 if (!StartsWith(path, res_prefix)) { 535 return false; 536 } 537 538 StringPiece::const_iterator last_occurence = path.end(); 539 for (auto iter = path.begin() + res_prefix.size(); iter != path.end(); 540 ++iter) { 541 if (*iter == '/') { 542 last_occurence = iter; 543 } 544 } 545 546 if (last_occurence == path.end()) { 547 return false; 548 } 549 550 auto iter = std::find(last_occurence, path.end(), '.'); 551 *out_suffix = StringPiece(iter, path.end() - iter); 552 *out_entry = StringPiece(last_occurence + 1, iter - last_occurence - 1); 553 *out_prefix = StringPiece(path.begin(), last_occurence - path.begin() + 1); 554 return true; 555 } 556 557 StringPiece16 GetString16(const android::ResStringPool& pool, size_t idx) { 558 size_t len; 559 const char16_t* str = pool.stringAt(idx, &len); 560 if (str != nullptr) { 561 return StringPiece16(str, len); 562 } 563 return StringPiece16(); 564 } 565 566 std::string GetString(const android::ResStringPool& pool, size_t idx) { 567 size_t len; 568 const char* str = pool.string8At(idx, &len); 569 if (str != nullptr) { 570 return std::string(str, len); 571 } 572 return Utf16ToUtf8(GetString16(pool, idx)); 573 } 574 575 } // namespace util 576 } // namespace aapt 577