Home | History | Annotate | Download | only in src
      1 // Copyright 2017 The Chromium OS Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "puffin/src/include/puffin/utils.h"
      6 
      7 #include <inttypes.h>
      8 
      9 #include <algorithm>
     10 #include <set>
     11 #include <string>
     12 #include <vector>
     13 
     14 #include "puffin/src/bit_reader.h"
     15 #include "puffin/src/file_stream.h"
     16 #include "puffin/src/include/puffin/common.h"
     17 #include "puffin/src/include/puffin/puffer.h"
     18 #include "puffin/src/logging.h"
     19 #include "puffin/src/memory_stream.h"
     20 #include "puffin/src/puff_writer.h"
     21 
     22 using std::set;
     23 using std::string;
     24 using std::vector;
     25 
     26 namespace {
     27 // Use memcpy to access the unaligned data of type |T|.
     28 template <typename T>
     29 inline T get_unaligned(const void* address) {
     30   T result;
     31   memcpy(&result, address, sizeof(T));
     32   return result;
     33 }
     34 
     35 struct ExtentData {
     36   puffin::BitExtent extent;
     37   uint64_t byte_offset;
     38   uint64_t byte_length;
     39   const puffin::Buffer& data;
     40 
     41   ExtentData(const puffin::BitExtent& in_extent, const puffin::Buffer& in_data)
     42       : extent(in_extent), data(in_data) {
     43     // Round start offset up and end offset down to exclude bits not in this
     44     // extent. We simply ignore the bits at start and end that's not on byte
     45     // boundary because as long as the majority of the bytes are the same,
     46     // bsdiff will be able to reference it.
     47     byte_offset = (extent.offset + 7) / 8;
     48     uint64_t byte_end_offset = (extent.offset + extent.length) / 8;
     49     CHECK(byte_end_offset <= data.size());
     50     if (byte_end_offset > byte_offset) {
     51       byte_length = byte_end_offset - byte_offset;
     52     } else {
     53       byte_length = 0;
     54     }
     55   }
     56 
     57   int Compare(const ExtentData& other) const {
     58     if (extent.length != other.extent.length) {
     59       return extent.length < other.extent.length ? -1 : 1;
     60     }
     61     return memcmp(data.data() + byte_offset,
     62                   other.data.data() + other.byte_offset,
     63                   std::min(byte_length, other.byte_length));
     64   }
     65   bool operator<(const ExtentData& other) const { return Compare(other) < 0; }
     66   bool operator==(const ExtentData& other) const { return Compare(other) == 0; }
     67 };
     68 
     69 }  // namespace
     70 
     71 namespace puffin {
     72 
     73 bool LocateDeflatesInDeflateStream(const uint8_t* data,
     74                                    uint64_t size,
     75                                    uint64_t virtual_offset,
     76                                    vector<BitExtent>* deflates,
     77                                    uint64_t* compressed_size) {
     78   Puffer puffer;
     79   BufferBitReader bit_reader(data, size);
     80   BufferPuffWriter puff_writer(nullptr, 0);
     81   vector<BitExtent> sub_deflates;
     82   TEST_AND_RETURN_FALSE(
     83       puffer.PuffDeflate(&bit_reader, &puff_writer, &sub_deflates));
     84   for (const auto& deflate : sub_deflates) {
     85     deflates->emplace_back(deflate.offset + virtual_offset * 8, deflate.length);
     86   }
     87   if (compressed_size) {
     88     *compressed_size = bit_reader.Offset();
     89   }
     90   return true;
     91 }
     92 
     93 // This function uses RFC1950 (https://www.ietf.org/rfc/rfc1950.txt) for the
     94 // definition of a zlib stream.  For finding the deflate blocks, we relying on
     95 // the proper size of the zlib stream in |data|. Basically the size of the zlib
     96 // stream should be known before hand. Otherwise we need to parse the stream and
     97 // find the location of compressed blocks using CalculateSizeOfDeflateBlock().
     98 bool LocateDeflatesInZlib(const Buffer& data, vector<BitExtent>* deflates) {
     99   // A zlib stream has the following format:
    100   // 0           1     compression method and flag
    101   // 1           1     flag
    102   // 2           4     preset dictionary (optional)
    103   // 2 or 6      n     compressed data
    104   // n+(2 or 6)  4     Adler-32 checksum
    105   TEST_AND_RETURN_FALSE(data.size() >= 6 + 4);  // Header + Footer
    106   uint16_t cmf = data[0];
    107   auto compression_method = cmf & 0x0F;
    108   // For deflate compression_method should be 8.
    109   TEST_AND_RETURN_FALSE(compression_method == 8);
    110 
    111   auto cinfo = (cmf & 0xF0) >> 4;
    112   // Value greater than 7 is not allowed in deflate.
    113   TEST_AND_RETURN_FALSE(cinfo <= 7);
    114 
    115   auto flag = data[1];
    116   TEST_AND_RETURN_FALSE(((cmf << 8) + flag) % 31 == 0);
    117 
    118   uint64_t header_len = 2;
    119   if (flag & 0x20) {
    120     header_len += 4;  // 4 bytes for the preset dictionary.
    121   }
    122 
    123   // 4 is for ADLER32.
    124   TEST_AND_RETURN_FALSE(LocateDeflatesInDeflateStream(
    125       data.data() + header_len, data.size() - header_len - 4, header_len,
    126       deflates, nullptr));
    127   return true;
    128 }
    129 
    130 bool FindDeflateSubBlocks(const UniqueStreamPtr& src,
    131                           const vector<ByteExtent>& deflates,
    132                           vector<BitExtent>* subblock_deflates) {
    133   Puffer puffer;
    134   Buffer deflate_buffer;
    135   for (const auto& deflate : deflates) {
    136     TEST_AND_RETURN_FALSE(src->Seek(deflate.offset));
    137     // Read from src into deflate_buffer.
    138     deflate_buffer.resize(deflate.length);
    139     TEST_AND_RETURN_FALSE(src->Read(deflate_buffer.data(), deflate.length));
    140 
    141     // Find all the subblocks.
    142     BufferBitReader bit_reader(deflate_buffer.data(), deflate.length);
    143     // The uncompressed blocks will be ignored since we are passing a null
    144     // buffered puff writer and a valid deflate locations output array. This
    145     // should not happen in the puffdiff or anywhere else by default.
    146     BufferPuffWriter puff_writer(nullptr, 0);
    147     vector<BitExtent> subblocks;
    148     TEST_AND_RETURN_FALSE(
    149         puffer.PuffDeflate(&bit_reader, &puff_writer, &subblocks));
    150     TEST_AND_RETURN_FALSE(deflate.length == bit_reader.Offset());
    151     for (const auto& subblock : subblocks) {
    152       subblock_deflates->emplace_back(subblock.offset + deflate.offset * 8,
    153                                       subblock.length);
    154     }
    155   }
    156   return true;
    157 }
    158 
    159 bool LocateDeflatesInZlibBlocks(const string& file_path,
    160                                 const vector<ByteExtent>& zlibs,
    161                                 vector<BitExtent>* deflates) {
    162   auto src = FileStream::Open(file_path, true, false);
    163   TEST_AND_RETURN_FALSE(src);
    164 
    165   Buffer buffer;
    166   for (const auto& zlib : zlibs) {
    167     buffer.resize(zlib.length);
    168     TEST_AND_RETURN_FALSE(src->Seek(zlib.offset));
    169     TEST_AND_RETURN_FALSE(src->Read(buffer.data(), buffer.size()));
    170     vector<BitExtent> tmp_deflates;
    171     TEST_AND_RETURN_FALSE(LocateDeflatesInZlib(buffer, &tmp_deflates));
    172     for (const auto& deflate : tmp_deflates) {
    173       deflates->emplace_back(deflate.offset + zlib.offset * 8, deflate.length);
    174     }
    175   }
    176   return true;
    177 }
    178 
    179 // For more information about gzip format, refer to RFC 1952 located at:
    180 // https://www.ietf.org/rfc/rfc1952.txt
    181 bool LocateDeflatesInGzip(const Buffer& data, vector<BitExtent>* deflates) {
    182   uint64_t member_start = 0;
    183   while (member_start + 10 <= data.size() && data[member_start + 0] == 0x1F &&
    184          data[member_start + 1] == 0x8B && data[member_start + 2] == 8) {
    185     // Each member entry has the following format
    186     // 0      1     0x1F
    187     // 1      1     0x8B
    188     // 2      1     compression method (8 denotes deflate)
    189     // 3      1     set of flags
    190     // 4      4     modification time
    191     // 8      1     extra flags
    192     // 9      1     operating system
    193 
    194     uint64_t offset = member_start + 10;
    195     int flag = data[member_start + 3];
    196     // Extra field
    197     if (flag & 4) {
    198       TEST_AND_RETURN_FALSE(offset + 2 <= data.size());
    199       uint16_t extra_length = data[offset++];
    200       extra_length |= static_cast<uint16_t>(data[offset++]) << 8;
    201       TEST_AND_RETURN_FALSE(offset + extra_length <= data.size());
    202       offset += extra_length;
    203     }
    204     // File name field
    205     if (flag & 8) {
    206       while (true) {
    207         TEST_AND_RETURN_FALSE(offset + 1 <= data.size());
    208         if (data[offset++] == 0) {
    209           break;
    210         }
    211       }
    212     }
    213     // File comment field
    214     if (flag & 16) {
    215       while (true) {
    216         TEST_AND_RETURN_FALSE(offset + 1 <= data.size());
    217         if (data[offset++] == 0) {
    218           break;
    219         }
    220       }
    221     }
    222     // CRC16 field
    223     if (flag & 2) {
    224       offset += 2;
    225     }
    226 
    227     uint64_t compressed_size = 0;
    228     TEST_AND_RETURN_FALSE(LocateDeflatesInDeflateStream(
    229         data.data() + offset, data.size() - offset, offset, deflates,
    230         &compressed_size));
    231     offset += compressed_size;
    232 
    233     // Ignore CRC32 and uncompressed size.
    234     TEST_AND_RETURN_FALSE(offset + 8 <= data.size());
    235     offset += 8;
    236     member_start = offset;
    237   }
    238   // Return true if we've successfully parsed at least one gzip.
    239   return member_start != 0;
    240 }
    241 
    242 // For more information about the zip format, refer to
    243 // https://support.pkware.com/display/PKZIP/APPNOTE
    244 bool LocateDeflatesInZipArchive(const Buffer& data,
    245                                 vector<BitExtent>* deflates) {
    246   uint64_t pos = 0;
    247   while (pos <= data.size() - 30) {
    248     // TODO(xunchang) add support for big endian system when searching for
    249     // magic numbers.
    250     if (get_unaligned<uint32_t>(data.data() + pos) != 0x04034b50) {
    251       pos++;
    252       continue;
    253     }
    254 
    255     // local file header format
    256     // 0      4     0x04034b50
    257     // 4      2     minimum version needed to extract
    258     // 6      2     general purpose bit flag
    259     // 8      2     compression method
    260     // 10     4     file last modification date & time
    261     // 14     4     CRC-32
    262     // 18     4     compressed size
    263     // 22     4     uncompressed size
    264     // 26     2     file name length
    265     // 28     2     extra field length
    266     // 30     n     file name
    267     // 30+n   m     extra field
    268     auto compression_method = get_unaligned<uint16_t>(data.data() + pos + 8);
    269     if (compression_method != 8) {  // non-deflate type
    270       pos += 4;
    271       continue;
    272     }
    273 
    274     auto compressed_size = get_unaligned<uint32_t>(data.data() + pos + 18);
    275     auto file_name_length = get_unaligned<uint16_t>(data.data() + pos + 26);
    276     auto extra_field_length = get_unaligned<uint16_t>(data.data() + pos + 28);
    277     uint64_t header_size = 30 + file_name_length + extra_field_length;
    278 
    279     // sanity check
    280     if (static_cast<uint64_t>(header_size) + compressed_size > data.size() ||
    281         pos > data.size() - header_size - compressed_size) {
    282       pos += 4;
    283       continue;
    284     }
    285 
    286     vector<BitExtent> tmp_deflates;
    287     uint64_t offset = pos + header_size;
    288     uint64_t calculated_compressed_size = 0;
    289     if (!LocateDeflatesInDeflateStream(
    290             data.data() + offset, data.size() - offset, offset, &tmp_deflates,
    291             &calculated_compressed_size)) {
    292       LOG(ERROR) << "Failed to decompress the zip entry starting from: " << pos
    293                  << ", skip adding deflates for this entry.";
    294       pos += 4;
    295       continue;
    296     }
    297 
    298     // Double check the compressed size if it is available in the file header.
    299     if (compressed_size > 0 && compressed_size != calculated_compressed_size) {
    300       LOG(WARNING) << "Compressed size in the file header: " << compressed_size
    301                    << " doesn't equal the real size: "
    302                    << calculated_compressed_size;
    303     }
    304 
    305     deflates->insert(deflates->end(), tmp_deflates.begin(), tmp_deflates.end());
    306     pos += header_size + calculated_compressed_size;
    307   }
    308 
    309   return true;
    310 }
    311 
    312 bool FindPuffLocations(const UniqueStreamPtr& src,
    313                        const vector<BitExtent>& deflates,
    314                        vector<ByteExtent>* puffs,
    315                        uint64_t* out_puff_size) {
    316   Puffer puffer;
    317   Buffer deflate_buffer;
    318 
    319   // Here accumulate the size difference between each corresponding deflate and
    320   // puff. At the end we add this cummulative size difference to the size of the
    321   // deflate stream to get the size of the puff stream. We use signed size
    322   // because puff size could be smaller than deflate size.
    323   int64_t total_size_difference = 0;
    324   for (auto deflate = deflates.begin(); deflate != deflates.end(); ++deflate) {
    325     // Read from src into deflate_buffer.
    326     auto start_byte = deflate->offset / 8;
    327     auto end_byte = (deflate->offset + deflate->length + 7) / 8;
    328     deflate_buffer.resize(end_byte - start_byte);
    329     TEST_AND_RETURN_FALSE(src->Seek(start_byte));
    330     TEST_AND_RETURN_FALSE(
    331         src->Read(deflate_buffer.data(), deflate_buffer.size()));
    332     // Find the size of the puff.
    333     BufferBitReader bit_reader(deflate_buffer.data(), deflate_buffer.size());
    334     uint64_t bits_to_skip = deflate->offset % 8;
    335     TEST_AND_RETURN_FALSE(bit_reader.CacheBits(bits_to_skip));
    336     bit_reader.DropBits(bits_to_skip);
    337 
    338     BufferPuffWriter puff_writer(nullptr, 0);
    339     TEST_AND_RETURN_FALSE(
    340         puffer.PuffDeflate(&bit_reader, &puff_writer, nullptr));
    341     TEST_AND_RETURN_FALSE(deflate_buffer.size() == bit_reader.Offset());
    342 
    343     // 1 if a deflate ends at the same byte that the next deflate starts and
    344     // there is a few bits gap between them. In practice this may never happen,
    345     // but it is a good idea to support it anyways. If there is a gap, the value
    346     // of the gap will be saved as an integer byte to the puff stream. The parts
    347     // of the byte that belogs to the deflates are shifted out.
    348     int gap = 0;
    349     if (deflate != deflates.begin()) {
    350       auto prev_deflate = std::prev(deflate);
    351       if ((prev_deflate->offset + prev_deflate->length == deflate->offset)
    352           // If deflates are on byte boundary the gap will not be counted later,
    353           // so we won't worry about it.
    354           && (deflate->offset % 8 != 0)) {
    355         gap = 1;
    356       }
    357     }
    358 
    359     start_byte = ((deflate->offset + 7) / 8);
    360     end_byte = (deflate->offset + deflate->length) / 8;
    361     int64_t deflate_length_in_bytes = end_byte - start_byte;
    362 
    363     // If there was no gap bits between the current and previous deflates, there
    364     // will be no extra gap byte, so the offset will be shifted one byte back.
    365     auto puff_offset = start_byte - gap + total_size_difference;
    366     auto puff_size = puff_writer.Size();
    367     // Add the location into puff.
    368     puffs->emplace_back(puff_offset, puff_size);
    369     total_size_difference +=
    370         static_cast<int64_t>(puff_size) - deflate_length_in_bytes - gap;
    371   }
    372 
    373   uint64_t src_size;
    374   TEST_AND_RETURN_FALSE(src->GetSize(&src_size));
    375   auto final_size = static_cast<int64_t>(src_size) + total_size_difference;
    376   TEST_AND_RETURN_FALSE(final_size >= 0);
    377   *out_puff_size = final_size;
    378   return true;
    379 }
    380 
    381 void RemoveEqualBitExtents(const Buffer& data1,
    382                            const Buffer& data2,
    383                            vector<BitExtent>* extents1,
    384                            vector<BitExtent>* extents2) {
    385   set<ExtentData> extent1_set, equal_extents;
    386   for (const BitExtent& ext : *extents1) {
    387     extent1_set.emplace(ext, data1);
    388   }
    389 
    390   auto new_extents2_end = extents2->begin();
    391   for (const BitExtent& ext : *extents2) {
    392     ExtentData extent_data(ext, data2);
    393     if (extent1_set.find(extent_data) != extent1_set.end()) {
    394       equal_extents.insert(extent_data);
    395     } else {
    396       *new_extents2_end++ = ext;
    397     }
    398   }
    399   extents2->erase(new_extents2_end, extents2->end());
    400   extents1->erase(
    401       std::remove_if(extents1->begin(), extents1->end(),
    402                      [&equal_extents, &data1](const BitExtent& ext) {
    403                        return equal_extents.find(ExtentData(ext, data1)) !=
    404                               equal_extents.end();
    405                      }),
    406       extents1->end());
    407 }
    408 
    409 bool RemoveDeflatesWithBadDistanceCaches(const Buffer& data,
    410                                          vector<BitExtent>* deflates) {
    411   Puffer puffer(true /* exclude_bad_distance_caches */);
    412   for (auto def = deflates->begin(); def != deflates->end();) {
    413     uint64_t offset = def->offset / 8;
    414     uint64_t length = (def->offset + def->length + 7) / 8 - offset;
    415     BufferBitReader br(&data[offset], length);
    416     BufferPuffWriter pw(nullptr, 0);
    417 
    418     // Drop the first few bits in the buffer so we start exactly where the
    419     // deflate starts.
    420     uint64_t bits_to_drop = def->offset % 8;
    421     TEST_AND_RETURN_FALSE(br.CacheBits(bits_to_drop));
    422     br.DropBits(bits_to_drop);
    423 
    424     vector<BitExtent> defs_out;
    425     TEST_AND_RETURN_FALSE(puffer.PuffDeflate(&br, &pw, &defs_out));
    426 
    427     TEST_AND_RETURN_FALSE(defs_out.size() <= 1);
    428     if (defs_out.size() == 0) {
    429       // This is a deflate we were looking for, remove it.
    430       def = deflates->erase(def);
    431     } else {
    432       ++def;
    433     }
    434   }
    435   return true;
    436 }
    437 
    438 }  // namespace puffin
    439