1 // Copyright 2007 Google Inc. 2 // Author: Lincoln Smith 3 // 4 // Licensed under the Apache License, Version 2.0 (the "License"); 5 // you may not use this file except in compliance with the License. 6 // You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 // 16 // Implementation of the Address Cache and Address Encoding 17 // algorithms described in sections 5.1 - 5.4 of RFC 3284 - 18 // The VCDIFF Generic Differencing and Compression Data Format. 19 // The RFC text can be found at http://www.faqs.org/rfcs/rfc3284.html 20 // 21 // Assumptions: 22 // * The VCDAddress type is large enough to hold any offset within 23 // the source and target windows. The limit (for int32_t) is 2^31-1 bytes. 24 // The source (dictionary) should not approach this size limit; 25 // to compress a target file that is larger than 26 // INT_MAX - (dictionary size) bytes, the encoder must 27 // break it up into multiple target windows. 28 29 #include <config.h> 30 #include "addrcache.h" 31 #include "logging.h" 32 #include "varint_bigendian.h" 33 #include "vcdiff_defs.h" // RESULT_ERROR 34 35 namespace open_vcdiff { 36 37 // The constructor does not initialize near_addresses_ and same_addresses_. 38 // Therefore, Init() must be called before any other method can be used. 39 // 40 // Arguments: 41 // near_cache_size: Size of the NEAR cache (number of 4-byte integers) 42 // same_cache_size: Size of the SAME cache (number of blocks of 43 // 256 4-byte integers per block) 44 // Because the mode is expressed as a byte value, 45 // near_cache_size + same_cache_size should not exceed 254. 46 // 47 VCDiffAddressCache::VCDiffAddressCache(int near_cache_size, 48 int same_cache_size) 49 : near_cache_size_(near_cache_size), 50 same_cache_size_(same_cache_size), 51 next_slot_(0) { } 52 53 VCDiffAddressCache::VCDiffAddressCache() 54 : near_cache_size_(kDefaultNearCacheSize), 55 same_cache_size_(kDefaultSameCacheSize), 56 next_slot_(0) { } 57 58 // Sets up data structures needed to call other methods. Operations that may 59 // fail at runtime (for example, validating the provided near_cache_size_ and 60 // same_cache_size_ parameters against their maximum allowed values) are 61 // confined to this routine in order to guarantee that the class constructor 62 // will never fail. Other methods (except the destructor) cannot be invoked 63 // until this method has been called successfully. After the object has been 64 // initialized and used, Init() can be called again to reset it to its initial 65 // state. 66 // 67 // Return value: "true" if initialization succeeded, "false" if it failed. 68 // No other method except the destructor may be invoked if this function 69 // returns false. The caller is responsible for checking the return value 70 // and providing an exit path in case of error. 71 // 72 bool VCDiffAddressCache::Init() { 73 // The mode is expressed as a byte value, so there is only room for 256 modes, 74 // including the two non-cached modes (SELF and HERE). Do not allow a larger 75 // number of modes to be defined. We do a separate sanity check for 76 // near_cache_size_ and same_cache_size_ because adding them together can 77 // cause an integer overflow if each is set to, say, INT_MAX. 78 if ((near_cache_size_ > (VCD_MAX_MODES - 2)) || (near_cache_size_ < 0)) { 79 LOG(ERROR) << "Near cache size " << near_cache_size_ << " is invalid" 80 << LOG_ENDL; 81 return false; 82 } 83 if ((same_cache_size_ > (VCD_MAX_MODES - 2)) || (same_cache_size_ < 0)) { 84 LOG(ERROR) << "Same cache size " << same_cache_size_ << " is invalid" 85 << LOG_ENDL; 86 return false; 87 } 88 if ((near_cache_size_ + same_cache_size_) > VCD_MAX_MODES - 2) { 89 LOG(ERROR) << "Using near cache size " << near_cache_size_ 90 << " and same cache size " << same_cache_size_ 91 << " would exceed maximum number of COPY modes (" 92 << VCD_MAX_MODES << ")" << LOG_ENDL; 93 return false; 94 } 95 if (near_cache_size_ > 0) { 96 near_addresses_.assign(near_cache_size_, 0); 97 } 98 if (same_cache_size_ > 0) { 99 same_addresses_.assign(same_cache_size_ * 256, 0); 100 } 101 next_slot_ = 0; // in case Init() is called a second time to reinit 102 return true; 103 } 104 105 // This method will be called whenever an address is calculated for an 106 // encoded or decoded COPY instruction, and will update the contents 107 // of the SAME and NEAR caches. It is vital that the use of 108 // UpdateCache (called cache_update in the RFC examples) exactly match 109 // the RFC standard, and that the same caching logic be used in the 110 // decoder as in the encoder, in order for the decoded addresses to 111 // match. 112 // 113 // Argument: 114 // address: This must be a valid address between 0 and 115 // (source window size + target window size). It is assumed that 116 // these bounds have been checked before calling UpdateCache. 117 // 118 void VCDiffAddressCache::UpdateCache(VCDAddress address) { 119 if (near_cache_size_ > 0) { 120 near_addresses_[next_slot_] = address; 121 next_slot_ = (next_slot_ + 1) % near_cache_size_; 122 } 123 if (same_cache_size_ > 0) { 124 same_addresses_[address % (same_cache_size_ * 256)] = address; 125 } 126 } 127 128 // Determines the address mode that yields the most compact encoding 129 // of the given address value, writes the encoded address into the 130 // address stream, and returns the mode used. The most compact encoding 131 // is found by looking for the numerically lowest encoded address. 132 // The Init() function must already have been called. 133 // 134 // Arguments: 135 // address: The address to be encoded. Must be a non-negative integer 136 // between 0 and (here_address - 1). 137 // here_address: The current location in the target data (i.e., the 138 // position just after the last encoded value.) Must be non-negative. 139 // encoded_addr: Points to an VCDAddress that will be replaced 140 // with the encoded representation of address. 141 // If WriteAddressAsVarintForMode returns true when passed 142 // the return value, then encoded_addr should be written 143 // into the delta file as a variable-length integer (Varint); 144 // otherwise, it should be written as a byte (unsigned char). 145 // 146 // Return value: A mode value between 0 and 255. The mode will tell 147 // how to interpret the next value in the address stream. 148 // The values 0 and 1 correspond to SELF and HERE addressing. 149 // 150 // The function is guaranteed to succeed unless the conditions on the arguments 151 // have not been met, in which case a LOG(DFATAL) message will be produced, 152 // 0 will be returned, and *encoded_addr will be replaced with 0. 153 // 154 unsigned char VCDiffAddressCache::EncodeAddress(VCDAddress address, 155 VCDAddress here_address, 156 VCDAddress* encoded_addr) { 157 if (address < 0) { 158 LOG(DFATAL) << "EncodeAddress was passed a negative address: " 159 << address << LOG_ENDL; 160 *encoded_addr = 0; 161 return 0; 162 } 163 if (address >= here_address) { 164 LOG(DFATAL) << "EncodeAddress was called with address (" << address 165 << ") < here_address (" << here_address << ")" << LOG_ENDL; 166 *encoded_addr = 0; 167 return 0; 168 } 169 // Try using the SAME cache. This method, if available, always 170 // results in the smallest encoding and takes priority over other modes. 171 if (same_cache_size() > 0) { 172 const VCDAddress same_cache_pos = 173 address % (same_cache_size() * 256); 174 if (SameAddress(same_cache_pos) == address) { 175 // This is the only mode for which an single byte will be written 176 // to the address stream instead of a variable-length integer. 177 UpdateCache(address); 178 *encoded_addr = same_cache_pos % 256; 179 return FirstSameMode() + (same_cache_pos / 256); // SAME mode 180 } 181 } 182 183 // Try SELF mode 184 unsigned char best_mode = VCD_SELF_MODE; 185 VCDAddress best_encoded_address = address; 186 187 // Try HERE mode 188 { 189 const VCDAddress here_encoded_address = here_address - address; 190 if (here_encoded_address < best_encoded_address) { 191 best_mode = VCD_HERE_MODE; 192 best_encoded_address = here_encoded_address; 193 } 194 } 195 196 // Try using the NEAR cache 197 for (int i = 0; i < near_cache_size(); ++i) { 198 const VCDAddress near_encoded_address = address - NearAddress(i); 199 if ((near_encoded_address >= 0) && 200 (near_encoded_address < best_encoded_address)) { 201 best_mode = FirstNearMode() + i; 202 best_encoded_address = near_encoded_address; 203 } 204 } 205 206 UpdateCache(address); 207 *encoded_addr = best_encoded_address; 208 return best_mode; 209 } 210 211 // Increments *byte_pointer and returns the byte it pointed to before the 212 // increment. The caller must check bounds to ensure that *byte_pointer 213 // points to a valid address in memory. 214 static unsigned char ParseByte(const char** byte_pointer) { 215 unsigned char byte_value = static_cast<unsigned char>(**byte_pointer); 216 ++(*byte_pointer); 217 return byte_value; 218 } 219 220 // Checks the given decoded address for validity. Returns true if the 221 // address is valid; otherwise, prints an error message to the log and 222 // returns false. 223 static bool IsDecodedAddressValid(VCDAddress decoded_address, 224 VCDAddress here_address) { 225 if (decoded_address < 0) { 226 LOG(ERROR) << "Decoded address " << decoded_address << " is invalid" 227 << LOG_ENDL; 228 return false; 229 } else if (decoded_address >= here_address) { 230 LOG(ERROR) << "Decoded address (" << decoded_address 231 << ") is beyond location in target file (" << here_address 232 << ")" << LOG_ENDL; 233 return false; 234 } 235 return true; 236 } 237 238 // Interprets the next value in the address_stream using the provided mode, 239 // which may need to access the SAME or NEAR address cache. Returns the 240 // decoded address. 241 // The Init() function must already have been called. 242 // 243 // Arguments: 244 // here_address: The current location in the source + target data (i.e., the 245 // location into which the COPY instruction will copy.) By definition, 246 // all addresses between 0 and (here_address - 1) are valid, and 247 // any other address is invalid. 248 // mode: A byte value between 0 and (near_cache_size_ + same_cache_size_ + 1) 249 // which tells how to interpret the next value in the address stream. 250 // The values 0 and 1 correspond to SELF and HERE addressing. 251 // The validity of "mode" should already have been checked before 252 // calling this function. 253 // address_stream: Points to a pointer holding the position 254 // in the "Addresses section for COPYs" part of the input data. 255 // That section must already have been uncompressed 256 // using a secondary decompressor (if necessary.) 257 // This is an IN/OUT argument; the value of *address_stream will be 258 // incremented by the size of an integer, or (if the SAME cache 259 // was used) by the size of a byte (1). 260 // address_stream_end: Points to the position just after the end of 261 // the address stream buffer. All addresses between *address_stream 262 // and address_stream_end should contain valid address data. 263 // 264 // Return value: If the input conditions were met, and the address section 265 // of the input data contains properly encoded addresses that match 266 // the instructions section, then an integer between 0 and here_address - 1 267 // will be returned, representing the address from which data should 268 // be copied from the source or target window into the output stream. 269 // If an invalid address value is found in address_stream, then 270 // RESULT_ERROR will be returned. If the limit address_stream_end 271 // is reached before the address can be decoded, then 272 // RESULT_END_OF_DATA will be returned. If more streamed data 273 // is expected, this means that the consumer should block and wait 274 // for more data before continuing to decode. If no more data is expected, 275 // this return value signals an error condition. 276 // 277 VCDAddress VCDiffAddressCache::DecodeAddress(VCDAddress here_address, 278 unsigned char mode, 279 const char** address_stream, 280 const char* address_stream_end) { 281 if (here_address < 0) { 282 LOG(DFATAL) << "DecodeAddress was passed a negative value" 283 " for here_address: " << here_address << LOG_ENDL; 284 return RESULT_ERROR; 285 } 286 const char* new_address_pos = *address_stream; 287 if (new_address_pos >= address_stream_end) { 288 return RESULT_END_OF_DATA; 289 } 290 VCDAddress decoded_address; 291 if (IsSameMode(mode)) { 292 // SAME mode expects a byte value as the encoded address 293 unsigned char encoded_address = ParseByte(&new_address_pos); 294 decoded_address = DecodeSameAddress(mode, encoded_address); 295 } else { 296 // All modes except SAME mode expect a VarintBE as the encoded address 297 int32_t encoded_address = VarintBE<int32_t>::Parse(address_stream_end, 298 &new_address_pos); 299 switch (encoded_address) { 300 case RESULT_ERROR: 301 LOG(ERROR) << "Found invalid variable-length integer " 302 "as encoded address value" << LOG_ENDL; 303 return RESULT_ERROR; 304 case RESULT_END_OF_DATA: 305 return RESULT_END_OF_DATA; 306 default: 307 break; 308 } 309 if (IsSelfMode(mode)) { 310 decoded_address = DecodeSelfAddress(encoded_address); 311 } else if (IsHereMode(mode)) { 312 decoded_address = DecodeHereAddress(encoded_address, here_address); 313 } else if (IsNearMode(mode)) { 314 decoded_address = DecodeNearAddress(mode, encoded_address); 315 } else { 316 LOG(DFATAL) << "Invalid mode value (" << static_cast<int>(mode) 317 << ") passed to DecodeAddress; maximum mode value = " 318 << static_cast<int>(LastMode()) << LOG_ENDL; 319 return RESULT_ERROR; 320 } 321 } 322 // Check for an out-of-bounds address (corrupt/malicious data) 323 if (!IsDecodedAddressValid(decoded_address, here_address)) { 324 return RESULT_ERROR; 325 } 326 *address_stream = new_address_pos; 327 UpdateCache(decoded_address); 328 return decoded_address; 329 } 330 331 } // namespace open_vcdiff 332