Home | History | Annotate | Download | only in src
      1 // Copyright 2007 Google Inc.
      2 // Author: Lincoln Smith
      3 //
      4 // Licensed under the Apache License, Version 2.0 (the "License");
      5 // you may not use this file except in compliance with the License.
      6 // You may obtain a copy of the License at
      7 //
      8 //      http://www.apache.org/licenses/LICENSE-2.0
      9 //
     10 // Unless required by applicable law or agreed to in writing, software
     11 // distributed under the License is distributed on an "AS IS" BASIS,
     12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 // See the License for the specific language governing permissions and
     14 // limitations under the License.
     15 //
     16 // Implementation of the Address Cache and Address Encoding
     17 // algorithms described in sections 5.1 - 5.4 of RFC 3284 -
     18 // The VCDIFF Generic Differencing and Compression Data Format.
     19 // The RFC text can be found at http://www.faqs.org/rfcs/rfc3284.html
     20 //
     21 // Assumptions:
     22 //   * The VCDAddress type is large enough to hold any offset within
     23 //     the source and target windows.  The limit (for int32_t) is 2^31-1 bytes.
     24 //     The source (dictionary) should not approach this size limit;
     25 //     to compress a target file that is larger than
     26 //     INT_MAX - (dictionary size) bytes, the encoder must
     27 //     break it up into multiple target windows.
     28 
     29 #include <config.h>
     30 #include "addrcache.h"
     31 #include "logging.h"
     32 #include "varint_bigendian.h"
     33 #include "vcdiff_defs.h"  // RESULT_ERROR
     34 
     35 namespace open_vcdiff {
     36 
     37 // The constructor does not initialize near_addresses_ and same_addresses_.
     38 // Therefore, Init() must be called before any other method can be used.
     39 //
     40 // Arguments:
     41 //   near_cache_size: Size of the NEAR cache (number of 4-byte integers)
     42 //   same_cache_size: Size of the SAME cache (number of blocks of
     43 //                                            256 4-byte integers per block)
     44 //     Because the mode is expressed as a byte value,
     45 //     near_cache_size + same_cache_size should not exceed 254.
     46 //
     47 VCDiffAddressCache::VCDiffAddressCache(int near_cache_size,
     48                                        int same_cache_size)
     49     : near_cache_size_(near_cache_size),
     50       same_cache_size_(same_cache_size),
     51       next_slot_(0) { }
     52 
     53 VCDiffAddressCache::VCDiffAddressCache()
     54     : near_cache_size_(kDefaultNearCacheSize),
     55       same_cache_size_(kDefaultSameCacheSize),
     56       next_slot_(0) { }
     57 
     58 // Sets up data structures needed to call other methods.  Operations that may
     59 // fail at runtime (for example, validating the provided near_cache_size_ and
     60 // same_cache_size_ parameters against their maximum allowed values) are
     61 // confined to this routine in order to guarantee that the class constructor
     62 // will never fail.  Other methods (except the destructor) cannot be invoked
     63 // until this method has been called successfully.  After the object has been
     64 // initialized and used, Init() can be called again to reset it to its initial
     65 // state.
     66 //
     67 // Return value: "true" if initialization succeeded, "false" if it failed.
     68 //     No other method except the destructor may be invoked if this function
     69 //     returns false.  The caller is responsible for checking the return value
     70 //     and providing an exit path in case of error.
     71 //
     72 bool VCDiffAddressCache::Init() {
     73   // The mode is expressed as a byte value, so there is only room for 256 modes,
     74   // including the two non-cached modes (SELF and HERE).  Do not allow a larger
     75   // number of modes to be defined.  We do a separate sanity check for
     76   // near_cache_size_ and same_cache_size_ because adding them together can
     77   // cause an integer overflow if each is set to, say, INT_MAX.
     78   if ((near_cache_size_ > (VCD_MAX_MODES - 2)) || (near_cache_size_ < 0)) {
     79     LOG(ERROR) << "Near cache size " << near_cache_size_ << " is invalid"
     80                << LOG_ENDL;
     81     return false;
     82   }
     83   if ((same_cache_size_ > (VCD_MAX_MODES - 2)) || (same_cache_size_ < 0)) {
     84     LOG(ERROR) << "Same cache size " << same_cache_size_ << " is invalid"
     85                << LOG_ENDL;
     86     return false;
     87   }
     88   if ((near_cache_size_ + same_cache_size_) > VCD_MAX_MODES - 2) {
     89     LOG(ERROR) << "Using near cache size " << near_cache_size_
     90                << " and same cache size " << same_cache_size_
     91                << " would exceed maximum number of COPY modes ("
     92                << VCD_MAX_MODES << ")" << LOG_ENDL;
     93     return false;
     94   }
     95   if (near_cache_size_ > 0) {
     96     near_addresses_.assign(near_cache_size_, 0);
     97   }
     98   if (same_cache_size_ > 0) {
     99     same_addresses_.assign(same_cache_size_ * 256, 0);
    100   }
    101   next_slot_ = 0;  // in case Init() is called a second time to reinit
    102   return true;
    103 }
    104 
    105 // This method will be called whenever an address is calculated for an
    106 // encoded or decoded COPY instruction, and will update the contents
    107 // of the SAME and NEAR caches.  It is vital that the use of
    108 // UpdateCache (called cache_update in the RFC examples) exactly match
    109 // the RFC standard, and that the same caching logic be used in the
    110 // decoder as in the encoder, in order for the decoded addresses to
    111 // match.
    112 //
    113 // Argument:
    114 //   address: This must be a valid address between 0 and
    115 //       (source window size + target window size).  It is assumed that
    116 //       these bounds have been checked before calling UpdateCache.
    117 //
    118 void VCDiffAddressCache::UpdateCache(VCDAddress address) {
    119   if (near_cache_size_ > 0) {
    120     near_addresses_[next_slot_] = address;
    121     next_slot_ = (next_slot_ + 1) % near_cache_size_;
    122   }
    123   if (same_cache_size_ > 0) {
    124     same_addresses_[address % (same_cache_size_ * 256)] = address;
    125   }
    126 }
    127 
    128 // Determines the address mode that yields the most compact encoding
    129 // of the given address value, writes the encoded address into the
    130 // address stream, and returns the mode used.  The most compact encoding
    131 // is found by looking for the numerically lowest encoded address.
    132 // The Init() function must already have been called.
    133 //
    134 // Arguments:
    135 //   address: The address to be encoded.  Must be a non-negative integer
    136 //       between 0 and (here_address - 1).
    137 //   here_address: The current location in the target data (i.e., the
    138 //       position just after the last encoded value.)  Must be non-negative.
    139 //   encoded_addr: Points to an VCDAddress that will be replaced
    140 //       with the encoded representation of address.
    141 //       If WriteAddressAsVarintForMode returns true when passed
    142 //       the return value, then encoded_addr should be written
    143 //       into the delta file as a variable-length integer (Varint);
    144 //       otherwise, it should be written as a byte (unsigned char).
    145 //
    146 // Return value: A mode value between 0 and 255.  The mode will tell
    147 //       how to interpret the next value in the address stream.
    148 //       The values 0 and 1 correspond to SELF and HERE addressing.
    149 //
    150 // The function is guaranteed to succeed unless the conditions on the arguments
    151 // have not been met, in which case a LOG(DFATAL) message will be produced,
    152 // 0 will be returned, and *encoded_addr will be replaced with 0.
    153 //
    154 unsigned char VCDiffAddressCache::EncodeAddress(VCDAddress address,
    155                                                 VCDAddress here_address,
    156                                                 VCDAddress* encoded_addr) {
    157   if (address < 0) {
    158     LOG(DFATAL) << "EncodeAddress was passed a negative address: "
    159                 << address << LOG_ENDL;
    160     *encoded_addr = 0;
    161     return 0;
    162   }
    163   if (address >= here_address) {
    164     LOG(DFATAL) << "EncodeAddress was called with address (" << address
    165                 << ") < here_address (" << here_address << ")" << LOG_ENDL;
    166     *encoded_addr = 0;
    167     return 0;
    168   }
    169   // Try using the SAME cache.  This method, if available, always
    170   // results in the smallest encoding and takes priority over other modes.
    171   if (same_cache_size() > 0) {
    172     const VCDAddress same_cache_pos =
    173         address % (same_cache_size() * 256);
    174     if (SameAddress(same_cache_pos) == address) {
    175       // This is the only mode for which an single byte will be written
    176       // to the address stream instead of a variable-length integer.
    177       UpdateCache(address);
    178       *encoded_addr = same_cache_pos % 256;
    179       return FirstSameMode() + (same_cache_pos / 256);  // SAME mode
    180     }
    181   }
    182 
    183   // Try SELF mode
    184   unsigned char best_mode = VCD_SELF_MODE;
    185   VCDAddress best_encoded_address = address;
    186 
    187   // Try HERE mode
    188   {
    189     const VCDAddress here_encoded_address = here_address - address;
    190     if (here_encoded_address < best_encoded_address) {
    191       best_mode = VCD_HERE_MODE;
    192       best_encoded_address = here_encoded_address;
    193     }
    194   }
    195 
    196   // Try using the NEAR cache
    197   for (int i = 0; i < near_cache_size(); ++i) {
    198     const VCDAddress near_encoded_address = address - NearAddress(i);
    199     if ((near_encoded_address >= 0) &&
    200         (near_encoded_address < best_encoded_address)) {
    201       best_mode = FirstNearMode() + i;
    202       best_encoded_address = near_encoded_address;
    203     }
    204   }
    205 
    206   UpdateCache(address);
    207   *encoded_addr = best_encoded_address;
    208   return best_mode;
    209 }
    210 
    211 // Increments *byte_pointer and returns the byte it pointed to before the
    212 // increment.  The caller must check bounds to ensure that *byte_pointer
    213 // points to a valid address in memory.
    214 static unsigned char ParseByte(const char** byte_pointer) {
    215   unsigned char byte_value = static_cast<unsigned char>(**byte_pointer);
    216   ++(*byte_pointer);
    217   return byte_value;
    218 }
    219 
    220 // Checks the given decoded address for validity.  Returns true if the
    221 // address is valid; otherwise, prints an error message to the log and
    222 // returns false.
    223 static bool IsDecodedAddressValid(VCDAddress decoded_address,
    224                                   VCDAddress here_address) {
    225   if (decoded_address < 0) {
    226     LOG(ERROR) << "Decoded address " << decoded_address << " is invalid"
    227                << LOG_ENDL;
    228     return false;
    229   } else if (decoded_address >= here_address) {
    230     LOG(ERROR) << "Decoded address (" << decoded_address
    231                << ") is beyond location in target file (" << here_address
    232                << ")" << LOG_ENDL;
    233     return false;
    234   }
    235   return true;
    236 }
    237 
    238 // Interprets the next value in the address_stream using the provided mode,
    239 // which may need to access the SAME or NEAR address cache.  Returns the
    240 // decoded address.
    241 // The Init() function must already have been called.
    242 //
    243 // Arguments:
    244 //   here_address: The current location in the source + target data (i.e., the
    245 //       location into which the COPY instruction will copy.)  By definition,
    246 //       all addresses between 0 and (here_address - 1) are valid, and
    247 //       any other address is invalid.
    248 //   mode: A byte value between 0 and (near_cache_size_ + same_cache_size_ + 1)
    249 //       which tells how to interpret the next value in the address stream.
    250 //       The values 0 and 1 correspond to SELF and HERE addressing.
    251 //       The validity of "mode" should already have been checked before
    252 //       calling this function.
    253 //   address_stream: Points to a pointer holding the position
    254 //       in the "Addresses section for COPYs" part of the input data.
    255 //       That section must already have been uncompressed
    256 //       using a secondary decompressor (if necessary.)
    257 //       This is an IN/OUT argument; the value of *address_stream will be
    258 //       incremented by the size of an integer, or (if the SAME cache
    259 //       was used) by the size of a byte (1).
    260 //   address_stream_end: Points to the position just after the end of
    261 //       the address stream buffer.  All addresses between *address_stream
    262 //       and address_stream_end should contain valid address data.
    263 //
    264 // Return value: If the input conditions were met, and the address section
    265 //     of the input data contains properly encoded addresses that match
    266 //     the instructions section, then an integer between 0 and here_address - 1
    267 //     will be returned, representing the address from which data should
    268 //     be copied from the source or target window into the output stream.
    269 //     If an invalid address value is found in address_stream, then
    270 //     RESULT_ERROR will be returned.  If the limit address_stream_end
    271 //     is reached before the address can be decoded, then
    272 //     RESULT_END_OF_DATA will be returned.  If more streamed data
    273 //     is expected, this means that the consumer should block and wait
    274 //     for more data before continuing to decode.  If no more data is expected,
    275 //     this return value signals an error condition.
    276 //
    277 VCDAddress VCDiffAddressCache::DecodeAddress(VCDAddress here_address,
    278                                              unsigned char mode,
    279                                              const char** address_stream,
    280                                              const char* address_stream_end) {
    281   if (here_address < 0) {
    282     LOG(DFATAL) << "DecodeAddress was passed a negative value"
    283                    " for here_address: " << here_address << LOG_ENDL;
    284     return RESULT_ERROR;
    285   }
    286   const char* new_address_pos = *address_stream;
    287   if (new_address_pos >= address_stream_end) {
    288     return RESULT_END_OF_DATA;
    289   }
    290   VCDAddress decoded_address;
    291   if (IsSameMode(mode)) {
    292     // SAME mode expects a byte value as the encoded address
    293     unsigned char encoded_address = ParseByte(&new_address_pos);
    294     decoded_address = DecodeSameAddress(mode, encoded_address);
    295   } else {
    296     // All modes except SAME mode expect a VarintBE as the encoded address
    297     int32_t encoded_address = VarintBE<int32_t>::Parse(address_stream_end,
    298                                                        &new_address_pos);
    299     switch (encoded_address) {
    300       case RESULT_ERROR:
    301         LOG(ERROR) << "Found invalid variable-length integer "
    302                       "as encoded address value" << LOG_ENDL;
    303         return RESULT_ERROR;
    304       case RESULT_END_OF_DATA:
    305         return RESULT_END_OF_DATA;
    306       default:
    307         break;
    308     }
    309     if (IsSelfMode(mode)) {
    310       decoded_address = DecodeSelfAddress(encoded_address);
    311     } else if (IsHereMode(mode)) {
    312       decoded_address = DecodeHereAddress(encoded_address, here_address);
    313     } else if (IsNearMode(mode)) {
    314       decoded_address = DecodeNearAddress(mode, encoded_address);
    315     } else {
    316       LOG(DFATAL) << "Invalid mode value (" << static_cast<int>(mode)
    317                   << ") passed to DecodeAddress; maximum mode value = "
    318                   << static_cast<int>(LastMode()) << LOG_ENDL;
    319       return RESULT_ERROR;
    320     }
    321   }
    322   // Check for an out-of-bounds address (corrupt/malicious data)
    323   if (!IsDecodedAddressValid(decoded_address, here_address)) {
    324     return RESULT_ERROR;
    325   }
    326   *address_stream = new_address_pos;
    327   UpdateCache(decoded_address);
    328   return decoded_address;
    329 }
    330 
    331 }  // namespace open_vcdiff
    332