1 // Copyright 2017 The Chromium OS Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "puffin/src/include/puffin/utils.h" 6 7 #include <inttypes.h> 8 9 #include <algorithm> 10 #include <set> 11 #include <string> 12 #include <vector> 13 14 #include "puffin/src/bit_reader.h" 15 #include "puffin/src/file_stream.h" 16 #include "puffin/src/include/puffin/common.h" 17 #include "puffin/src/include/puffin/puffer.h" 18 #include "puffin/src/logging.h" 19 #include "puffin/src/memory_stream.h" 20 #include "puffin/src/puff_writer.h" 21 22 using std::set; 23 using std::string; 24 using std::vector; 25 26 namespace { 27 // Use memcpy to access the unaligned data of type |T|. 28 template <typename T> 29 inline T get_unaligned(const void* address) { 30 T result; 31 memcpy(&result, address, sizeof(T)); 32 return result; 33 } 34 35 struct ExtentData { 36 puffin::BitExtent extent; 37 uint64_t byte_offset; 38 uint64_t byte_length; 39 const puffin::Buffer& data; 40 41 ExtentData(const puffin::BitExtent& in_extent, const puffin::Buffer& in_data) 42 : extent(in_extent), data(in_data) { 43 // Round start offset up and end offset down to exclude bits not in this 44 // extent. We simply ignore the bits at start and end that's not on byte 45 // boundary because as long as the majority of the bytes are the same, 46 // bsdiff will be able to reference it. 47 byte_offset = (extent.offset + 7) / 8; 48 uint64_t byte_end_offset = (extent.offset + extent.length) / 8; 49 CHECK(byte_end_offset <= data.size()); 50 if (byte_end_offset > byte_offset) { 51 byte_length = byte_end_offset - byte_offset; 52 } else { 53 byte_length = 0; 54 } 55 } 56 57 int Compare(const ExtentData& other) const { 58 if (extent.length != other.extent.length) { 59 return extent.length < other.extent.length ? -1 : 1; 60 } 61 return memcmp(data.data() + byte_offset, 62 other.data.data() + other.byte_offset, 63 std::min(byte_length, other.byte_length)); 64 } 65 bool operator<(const ExtentData& other) const { return Compare(other) < 0; } 66 bool operator==(const ExtentData& other) const { return Compare(other) == 0; } 67 }; 68 69 } // namespace 70 71 namespace puffin { 72 73 bool LocateDeflatesInDeflateStream(const uint8_t* data, 74 uint64_t size, 75 uint64_t virtual_offset, 76 vector<BitExtent>* deflates, 77 uint64_t* compressed_size) { 78 Puffer puffer; 79 BufferBitReader bit_reader(data, size); 80 BufferPuffWriter puff_writer(nullptr, 0); 81 vector<BitExtent> sub_deflates; 82 TEST_AND_RETURN_FALSE( 83 puffer.PuffDeflate(&bit_reader, &puff_writer, &sub_deflates)); 84 for (const auto& deflate : sub_deflates) { 85 deflates->emplace_back(deflate.offset + virtual_offset * 8, deflate.length); 86 } 87 if (compressed_size) { 88 *compressed_size = bit_reader.Offset(); 89 } 90 return true; 91 } 92 93 // This function uses RFC1950 (https://www.ietf.org/rfc/rfc1950.txt) for the 94 // definition of a zlib stream. For finding the deflate blocks, we relying on 95 // the proper size of the zlib stream in |data|. Basically the size of the zlib 96 // stream should be known before hand. Otherwise we need to parse the stream and 97 // find the location of compressed blocks using CalculateSizeOfDeflateBlock(). 98 bool LocateDeflatesInZlib(const Buffer& data, vector<BitExtent>* deflates) { 99 // A zlib stream has the following format: 100 // 0 1 compression method and flag 101 // 1 1 flag 102 // 2 4 preset dictionary (optional) 103 // 2 or 6 n compressed data 104 // n+(2 or 6) 4 Adler-32 checksum 105 TEST_AND_RETURN_FALSE(data.size() >= 6 + 4); // Header + Footer 106 uint16_t cmf = data[0]; 107 auto compression_method = cmf & 0x0F; 108 // For deflate compression_method should be 8. 109 TEST_AND_RETURN_FALSE(compression_method == 8); 110 111 auto cinfo = (cmf & 0xF0) >> 4; 112 // Value greater than 7 is not allowed in deflate. 113 TEST_AND_RETURN_FALSE(cinfo <= 7); 114 115 auto flag = data[1]; 116 TEST_AND_RETURN_FALSE(((cmf << 8) + flag) % 31 == 0); 117 118 uint64_t header_len = 2; 119 if (flag & 0x20) { 120 header_len += 4; // 4 bytes for the preset dictionary. 121 } 122 123 // 4 is for ADLER32. 124 TEST_AND_RETURN_FALSE(LocateDeflatesInDeflateStream( 125 data.data() + header_len, data.size() - header_len - 4, header_len, 126 deflates, nullptr)); 127 return true; 128 } 129 130 bool FindDeflateSubBlocks(const UniqueStreamPtr& src, 131 const vector<ByteExtent>& deflates, 132 vector<BitExtent>* subblock_deflates) { 133 Puffer puffer; 134 Buffer deflate_buffer; 135 for (const auto& deflate : deflates) { 136 TEST_AND_RETURN_FALSE(src->Seek(deflate.offset)); 137 // Read from src into deflate_buffer. 138 deflate_buffer.resize(deflate.length); 139 TEST_AND_RETURN_FALSE(src->Read(deflate_buffer.data(), deflate.length)); 140 141 // Find all the subblocks. 142 BufferBitReader bit_reader(deflate_buffer.data(), deflate.length); 143 // The uncompressed blocks will be ignored since we are passing a null 144 // buffered puff writer and a valid deflate locations output array. This 145 // should not happen in the puffdiff or anywhere else by default. 146 BufferPuffWriter puff_writer(nullptr, 0); 147 vector<BitExtent> subblocks; 148 TEST_AND_RETURN_FALSE( 149 puffer.PuffDeflate(&bit_reader, &puff_writer, &subblocks)); 150 TEST_AND_RETURN_FALSE(deflate.length == bit_reader.Offset()); 151 for (const auto& subblock : subblocks) { 152 subblock_deflates->emplace_back(subblock.offset + deflate.offset * 8, 153 subblock.length); 154 } 155 } 156 return true; 157 } 158 159 bool LocateDeflatesInZlibBlocks(const string& file_path, 160 const vector<ByteExtent>& zlibs, 161 vector<BitExtent>* deflates) { 162 auto src = FileStream::Open(file_path, true, false); 163 TEST_AND_RETURN_FALSE(src); 164 165 Buffer buffer; 166 for (const auto& zlib : zlibs) { 167 buffer.resize(zlib.length); 168 TEST_AND_RETURN_FALSE(src->Seek(zlib.offset)); 169 TEST_AND_RETURN_FALSE(src->Read(buffer.data(), buffer.size())); 170 vector<BitExtent> tmp_deflates; 171 TEST_AND_RETURN_FALSE(LocateDeflatesInZlib(buffer, &tmp_deflates)); 172 for (const auto& deflate : tmp_deflates) { 173 deflates->emplace_back(deflate.offset + zlib.offset * 8, deflate.length); 174 } 175 } 176 return true; 177 } 178 179 // For more information about gzip format, refer to RFC 1952 located at: 180 // https://www.ietf.org/rfc/rfc1952.txt 181 bool LocateDeflatesInGzip(const Buffer& data, vector<BitExtent>* deflates) { 182 uint64_t member_start = 0; 183 while (member_start + 10 <= data.size() && data[member_start + 0] == 0x1F && 184 data[member_start + 1] == 0x8B && data[member_start + 2] == 8) { 185 // Each member entry has the following format 186 // 0 1 0x1F 187 // 1 1 0x8B 188 // 2 1 compression method (8 denotes deflate) 189 // 3 1 set of flags 190 // 4 4 modification time 191 // 8 1 extra flags 192 // 9 1 operating system 193 194 uint64_t offset = member_start + 10; 195 int flag = data[member_start + 3]; 196 // Extra field 197 if (flag & 4) { 198 TEST_AND_RETURN_FALSE(offset + 2 <= data.size()); 199 uint16_t extra_length = data[offset++]; 200 extra_length |= static_cast<uint16_t>(data[offset++]) << 8; 201 TEST_AND_RETURN_FALSE(offset + extra_length <= data.size()); 202 offset += extra_length; 203 } 204 // File name field 205 if (flag & 8) { 206 while (true) { 207 TEST_AND_RETURN_FALSE(offset + 1 <= data.size()); 208 if (data[offset++] == 0) { 209 break; 210 } 211 } 212 } 213 // File comment field 214 if (flag & 16) { 215 while (true) { 216 TEST_AND_RETURN_FALSE(offset + 1 <= data.size()); 217 if (data[offset++] == 0) { 218 break; 219 } 220 } 221 } 222 // CRC16 field 223 if (flag & 2) { 224 offset += 2; 225 } 226 227 uint64_t compressed_size = 0; 228 TEST_AND_RETURN_FALSE(LocateDeflatesInDeflateStream( 229 data.data() + offset, data.size() - offset, offset, deflates, 230 &compressed_size)); 231 offset += compressed_size; 232 233 // Ignore CRC32 and uncompressed size. 234 TEST_AND_RETURN_FALSE(offset + 8 <= data.size()); 235 offset += 8; 236 member_start = offset; 237 } 238 // Return true if we've successfully parsed at least one gzip. 239 return member_start != 0; 240 } 241 242 // For more information about the zip format, refer to 243 // https://support.pkware.com/display/PKZIP/APPNOTE 244 bool LocateDeflatesInZipArchive(const Buffer& data, 245 vector<BitExtent>* deflates) { 246 uint64_t pos = 0; 247 while (pos <= data.size() - 30) { 248 // TODO(xunchang) add support for big endian system when searching for 249 // magic numbers. 250 if (get_unaligned<uint32_t>(data.data() + pos) != 0x04034b50) { 251 pos++; 252 continue; 253 } 254 255 // local file header format 256 // 0 4 0x04034b50 257 // 4 2 minimum version needed to extract 258 // 6 2 general purpose bit flag 259 // 8 2 compression method 260 // 10 4 file last modification date & time 261 // 14 4 CRC-32 262 // 18 4 compressed size 263 // 22 4 uncompressed size 264 // 26 2 file name length 265 // 28 2 extra field length 266 // 30 n file name 267 // 30+n m extra field 268 auto compression_method = get_unaligned<uint16_t>(data.data() + pos + 8); 269 if (compression_method != 8) { // non-deflate type 270 pos += 4; 271 continue; 272 } 273 274 auto compressed_size = get_unaligned<uint32_t>(data.data() + pos + 18); 275 auto file_name_length = get_unaligned<uint16_t>(data.data() + pos + 26); 276 auto extra_field_length = get_unaligned<uint16_t>(data.data() + pos + 28); 277 uint64_t header_size = 30 + file_name_length + extra_field_length; 278 279 // sanity check 280 if (static_cast<uint64_t>(header_size) + compressed_size > data.size() || 281 pos > data.size() - header_size - compressed_size) { 282 pos += 4; 283 continue; 284 } 285 286 vector<BitExtent> tmp_deflates; 287 uint64_t offset = pos + header_size; 288 uint64_t calculated_compressed_size = 0; 289 if (!LocateDeflatesInDeflateStream( 290 data.data() + offset, data.size() - offset, offset, &tmp_deflates, 291 &calculated_compressed_size)) { 292 LOG(ERROR) << "Failed to decompress the zip entry starting from: " << pos 293 << ", skip adding deflates for this entry."; 294 pos += 4; 295 continue; 296 } 297 298 // Double check the compressed size if it is available in the file header. 299 if (compressed_size > 0 && compressed_size != calculated_compressed_size) { 300 LOG(WARNING) << "Compressed size in the file header: " << compressed_size 301 << " doesn't equal the real size: " 302 << calculated_compressed_size; 303 } 304 305 deflates->insert(deflates->end(), tmp_deflates.begin(), tmp_deflates.end()); 306 pos += header_size + calculated_compressed_size; 307 } 308 309 return true; 310 } 311 312 bool FindPuffLocations(const UniqueStreamPtr& src, 313 const vector<BitExtent>& deflates, 314 vector<ByteExtent>* puffs, 315 uint64_t* out_puff_size) { 316 Puffer puffer; 317 Buffer deflate_buffer; 318 319 // Here accumulate the size difference between each corresponding deflate and 320 // puff. At the end we add this cummulative size difference to the size of the 321 // deflate stream to get the size of the puff stream. We use signed size 322 // because puff size could be smaller than deflate size. 323 int64_t total_size_difference = 0; 324 for (auto deflate = deflates.begin(); deflate != deflates.end(); ++deflate) { 325 // Read from src into deflate_buffer. 326 auto start_byte = deflate->offset / 8; 327 auto end_byte = (deflate->offset + deflate->length + 7) / 8; 328 deflate_buffer.resize(end_byte - start_byte); 329 TEST_AND_RETURN_FALSE(src->Seek(start_byte)); 330 TEST_AND_RETURN_FALSE( 331 src->Read(deflate_buffer.data(), deflate_buffer.size())); 332 // Find the size of the puff. 333 BufferBitReader bit_reader(deflate_buffer.data(), deflate_buffer.size()); 334 uint64_t bits_to_skip = deflate->offset % 8; 335 TEST_AND_RETURN_FALSE(bit_reader.CacheBits(bits_to_skip)); 336 bit_reader.DropBits(bits_to_skip); 337 338 BufferPuffWriter puff_writer(nullptr, 0); 339 TEST_AND_RETURN_FALSE( 340 puffer.PuffDeflate(&bit_reader, &puff_writer, nullptr)); 341 TEST_AND_RETURN_FALSE(deflate_buffer.size() == bit_reader.Offset()); 342 343 // 1 if a deflate ends at the same byte that the next deflate starts and 344 // there is a few bits gap between them. In practice this may never happen, 345 // but it is a good idea to support it anyways. If there is a gap, the value 346 // of the gap will be saved as an integer byte to the puff stream. The parts 347 // of the byte that belogs to the deflates are shifted out. 348 int gap = 0; 349 if (deflate != deflates.begin()) { 350 auto prev_deflate = std::prev(deflate); 351 if ((prev_deflate->offset + prev_deflate->length == deflate->offset) 352 // If deflates are on byte boundary the gap will not be counted later, 353 // so we won't worry about it. 354 && (deflate->offset % 8 != 0)) { 355 gap = 1; 356 } 357 } 358 359 start_byte = ((deflate->offset + 7) / 8); 360 end_byte = (deflate->offset + deflate->length) / 8; 361 int64_t deflate_length_in_bytes = end_byte - start_byte; 362 363 // If there was no gap bits between the current and previous deflates, there 364 // will be no extra gap byte, so the offset will be shifted one byte back. 365 auto puff_offset = start_byte - gap + total_size_difference; 366 auto puff_size = puff_writer.Size(); 367 // Add the location into puff. 368 puffs->emplace_back(puff_offset, puff_size); 369 total_size_difference += 370 static_cast<int64_t>(puff_size) - deflate_length_in_bytes - gap; 371 } 372 373 uint64_t src_size; 374 TEST_AND_RETURN_FALSE(src->GetSize(&src_size)); 375 auto final_size = static_cast<int64_t>(src_size) + total_size_difference; 376 TEST_AND_RETURN_FALSE(final_size >= 0); 377 *out_puff_size = final_size; 378 return true; 379 } 380 381 void RemoveEqualBitExtents(const Buffer& data1, 382 const Buffer& data2, 383 vector<BitExtent>* extents1, 384 vector<BitExtent>* extents2) { 385 set<ExtentData> extent1_set, equal_extents; 386 for (const BitExtent& ext : *extents1) { 387 extent1_set.emplace(ext, data1); 388 } 389 390 auto new_extents2_end = extents2->begin(); 391 for (const BitExtent& ext : *extents2) { 392 ExtentData extent_data(ext, data2); 393 if (extent1_set.find(extent_data) != extent1_set.end()) { 394 equal_extents.insert(extent_data); 395 } else { 396 *new_extents2_end++ = ext; 397 } 398 } 399 extents2->erase(new_extents2_end, extents2->end()); 400 extents1->erase( 401 std::remove_if(extents1->begin(), extents1->end(), 402 [&equal_extents, &data1](const BitExtent& ext) { 403 return equal_extents.find(ExtentData(ext, data1)) != 404 equal_extents.end(); 405 }), 406 extents1->end()); 407 } 408 409 bool RemoveDeflatesWithBadDistanceCaches(const Buffer& data, 410 vector<BitExtent>* deflates) { 411 Puffer puffer(true /* exclude_bad_distance_caches */); 412 for (auto def = deflates->begin(); def != deflates->end();) { 413 uint64_t offset = def->offset / 8; 414 uint64_t length = (def->offset + def->length + 7) / 8 - offset; 415 BufferBitReader br(&data[offset], length); 416 BufferPuffWriter pw(nullptr, 0); 417 418 // Drop the first few bits in the buffer so we start exactly where the 419 // deflate starts. 420 uint64_t bits_to_drop = def->offset % 8; 421 TEST_AND_RETURN_FALSE(br.CacheBits(bits_to_drop)); 422 br.DropBits(bits_to_drop); 423 424 vector<BitExtent> defs_out; 425 TEST_AND_RETURN_FALSE(puffer.PuffDeflate(&br, &pw, &defs_out)); 426 427 TEST_AND_RETURN_FALSE(defs_out.size() <= 1); 428 if (defs_out.size() == 0) { 429 // This is a deflate we were looking for, remove it. 430 def = deflates->erase(def); 431 } else { 432 ++def; 433 } 434 } 435 return true; 436 } 437 438 } // namespace puffin 439