1 // Copyright 2008 Google Inc. 2 // Author: Lincoln Smith 3 // 4 // Licensed under the Apache License, Version 2.0 (the "License"); 5 // you may not use this file except in compliance with the License. 6 // You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 // 16 // A command-line interface to the open-vcdiff library. 17 18 #include <config.h> 19 #include <assert.h> 20 #include <errno.h> 21 #ifdef WIN32 22 #include <fcntl.h> 23 #include <io.h> 24 #endif // WIN32 25 #include <stdio.h> 26 #include <string.h> // strerror 27 #include <iostream> 28 #include <memory> 29 #include <string> 30 #include <vector> 31 #include "gflags/gflags.h" 32 #include "google/vcdecoder.h" 33 #include "google/vcencoder.h" 34 35 #ifndef HAS_GLOBAL_STRING 36 using std::string; 37 #endif // !HAS_GLOBAL_STRING 38 using google::GetCommandLineFlagInfoOrDie; 39 using google::ShowUsageWithFlagsRestrict; 40 41 static const size_t kDefaultMaxTargetSize = 1 << 26; // 64 MB 42 43 // Definitions of command-line flags 44 DEFINE_string(dictionary, "", 45 "File containing dictionary data (required)"); 46 DEFINE_string(target, "", 47 "Target file (default is stdin for encode, stdout for decode"); 48 DEFINE_string(delta, "", 49 "Encoded delta file (default is stdout for encode, " 50 "stdin for decode"); 51 // --buffersize is the maximum allowable size of a target window. 52 // This value may be increased if there is sufficient memory available. 53 DEFINE_uint64(buffersize, 1 << 20, // 1 MB 54 "Buffer size for reading input file"); 55 DEFINE_bool(allow_vcd_target, true, 56 "If false, the decoder issues an error when the VCD_TARGET flag " 57 "is encountered"); 58 DEFINE_bool(checksum, false, 59 "Include an Adler32 checksum of the target data when encoding"); 60 DEFINE_bool(interleaved, false, "Use interleaved format"); 61 DEFINE_bool(stats, false, "Report compression percentage"); 62 DEFINE_bool(target_matches, false, "Find duplicate strings in target data" 63 " as well as dictionary data"); 64 DEFINE_uint64(max_target_file_size, kDefaultMaxTargetSize, 65 "Maximum target file size allowed by decoder"); 66 DEFINE_uint64(max_target_window_size, kDefaultMaxTargetSize, 67 "Maximum target window size allowed by decoder"); 68 69 static const char* const kUsageString = 70 " {encode | delta | decode | patch }[ <options> ]\n" 71 "encode or delta: create delta file from dictionary and target file\n" 72 "decode or patch: reconstruct target file from dictionary and delta file"; 73 74 namespace open_vcdiff { 75 76 class VCDiffFileBasedCoder { 77 public: 78 VCDiffFileBasedCoder(); 79 ~VCDiffFileBasedCoder(); 80 81 // Once the command-line arguments have been parsed, these functions 82 // will use the supplied options to carry out a file-based encode 83 // or decode operation. 84 bool Encode(); 85 bool Decode(); 86 bool DecodeAndCompare(); // for "vcdiff test"; compare target with original 87 88 private: 89 // Determines the size of the file. The given file must be an input file 90 // opened for reading only, not an input stream such as stdin. The function 91 // returns true and populates file_size if successful; otherwise, it returns 92 // false. 93 static bool FileSize(FILE* file, size_t* file_size); 94 95 // Opens a file for incremental reading. file_name is the name of the file 96 // to be opened. file_type should be a descriptive name (like "target") for 97 // use in log messages. If successful, returns true and sets *file to a 98 // valid input file, *buffer to a region of memory allocated using malloc() 99 // (so the caller must release it using free()), and buffer_size to the size 100 // of the buffer, which will not be larger than the size of the file, and 101 // will not be smaller than the --buffersize option. If the function fails, 102 // it outputs a log message and returns false. 103 bool OpenFileForReading(const string& file_name, 104 const char* file_type, 105 FILE** file, 106 std::vector<char>* buffer); 107 108 // Opens the dictionary file and reads it into a newly allocated buffer. 109 // If successful, returns true and populates dictionary_ with the dictionary 110 // contents; otherwise, returns false. 111 bool OpenDictionary(); 112 113 // Opens the input file (the delta or target file) for reading. 114 // Allocates space for the input buffer. If successful, 115 // input_file_ will be valid and input_buffer_ will be allocated. 116 bool OpenInputFile() { 117 return OpenFileForReading(input_file_name_, 118 input_file_type_, 119 &input_file_, 120 &input_buffer_); 121 } 122 123 // Opens the output file (the target or delta file) for writing. 124 // If successful, output_file_ will be valid. 125 bool OpenOutputFile(); 126 127 // Opens the output file (the target file) for comparison against the decoded 128 // output when using "vcdiff test". 129 bool OpenOutputFileForCompare() { 130 return OpenFileForReading(output_file_name_, 131 output_file_type_, 132 &output_file_, 133 &compare_buffer_); 134 } 135 136 // Reads as much input data as possible from the input file 137 // into input_buffer_. If successful, returns true and sets *bytes_read 138 // to the number of bytes read into input_buffer_. If an error occurs, 139 // writes an error log message and returns false. 140 bool ReadInput(size_t* bytes_read); 141 142 // Writes the contents of output to output_file_. If successful, returns 143 // true. If an error occurs, writes an error log message and returns false. 144 bool WriteOutput(const string& output); 145 146 // Reads a number of bytes from output_file_ equal to the size of output, 147 // and compares to make sure they match the contents of output. If the bytes 148 // do not match, or if end of file is reached before the expected number of 149 // bytes have been read, or a read error occurs, the function returns false; 150 // otherwise, returns true. 151 bool CompareOutput(const string& output); 152 153 // Dictionary contents. The entire dictionary file will be read into memory. 154 std::vector<char> dictionary_; 155 156 std::auto_ptr<open_vcdiff::HashedDictionary> hashed_dictionary_; 157 158 // These should be set to either "delta" or "target". They are only 159 // used in log messages such as "Error opening delta file..." 160 const char* input_file_type_; 161 const char* output_file_type_; 162 163 // The filenames used for input and output. Will be empty if stdin 164 // or stdout is being used. 165 string input_file_name_; 166 string output_file_name_; 167 168 // stdio-style file handles for the input and output files and the dictionary. 169 // When encoding, input_file_ is the target file and output_file_ is the delta 170 // file; when decoding, the reverse is true. The dictionary is always read 171 // from a file rather than from standard input. 172 FILE* input_file_; 173 FILE* output_file_; 174 175 // A memory buffer used to load the input file into memory. If the input 176 // comes from stdin because no input file was specified, then the size of 177 // input_buffer_ will be the value specified by the --buffersize option. 178 // If the input comes from a file, then the buffer will be allocated to match 179 // the file size, if possible. However, the buffer will not exceed 180 // --buffersize bytes in length. 181 std::vector<char> input_buffer_; 182 183 // A memory buffer used to load the output file into memory for comparison 184 // if "vcdiff test" is specified. 185 std::vector<char> compare_buffer_; 186 187 // Making these private avoids implicit copy constructor & assignment operator 188 VCDiffFileBasedCoder(const VCDiffFileBasedCoder&); // NOLINT 189 void operator=(const VCDiffFileBasedCoder&); 190 }; 191 192 inline VCDiffFileBasedCoder::VCDiffFileBasedCoder() 193 : input_file_type_(""), 194 output_file_type_(""), 195 input_file_(NULL), 196 output_file_(NULL) { } 197 198 VCDiffFileBasedCoder::~VCDiffFileBasedCoder() { 199 if (input_file_ && (input_file_ != stdin)) { 200 fclose(input_file_); 201 input_file_ = NULL; 202 } 203 if (output_file_ && (output_file_ != stdout)) { 204 fclose(output_file_); 205 output_file_ = NULL; 206 } 207 } 208 209 bool VCDiffFileBasedCoder::FileSize(FILE* file, size_t* file_size) { 210 long initial_position = ftell(file); 211 if (fseek(file, 0, SEEK_END) != 0) { 212 return false; 213 } 214 *file_size = static_cast<size_t>(ftell(file)); 215 if (fseek(file, initial_position, SEEK_SET) != 0) { 216 return false; 217 } 218 return true; 219 } 220 221 bool VCDiffFileBasedCoder::OpenDictionary() { 222 assert(dictionary_.empty()); 223 assert(!FLAGS_dictionary.empty()); 224 FILE* dictionary_file = fopen(FLAGS_dictionary.c_str(), "rb"); 225 if (!dictionary_file) { 226 std::cerr << "Error opening dictionary file '" << FLAGS_dictionary 227 << "': " << strerror(errno) << std::endl; 228 return false; 229 } 230 size_t dictionary_size = 0U; 231 if (!FileSize(dictionary_file, &dictionary_size)) { 232 std::cerr << "Error finding size of dictionary file '" << FLAGS_dictionary 233 << "': " << strerror(errno) << std::endl; 234 return false; 235 } 236 dictionary_.resize(dictionary_size); 237 if (dictionary_size > 0) { 238 if (fread(&dictionary_[0], 1, dictionary_size, dictionary_file) 239 != dictionary_size) { 240 std::cerr << "Unable to read dictionary file '" << FLAGS_dictionary 241 << "': " << strerror(errno) << std::endl; 242 fclose(dictionary_file); 243 dictionary_.clear(); 244 return false; 245 } 246 } 247 fclose(dictionary_file); 248 return true; 249 } 250 251 bool VCDiffFileBasedCoder::OpenFileForReading(const string& file_name, 252 const char* file_type, 253 FILE** file, 254 std::vector<char>* buffer) { 255 assert(buffer->empty()); 256 size_t buffer_size = 0U; 257 if (!*file && file_name.empty()) { 258 #ifdef WIN32 259 _setmode(_fileno(stdin), _O_BINARY); 260 #endif 261 *file = stdin; 262 buffer_size = static_cast<size_t>(FLAGS_buffersize); 263 } else { 264 if (!*file) { 265 *file = fopen(file_name.c_str(), "rb"); 266 if (!*file) { 267 std::cerr << "Error opening " << file_type << " file '" 268 << file_name << "': " << strerror(errno) << std::endl; 269 return false; 270 } 271 } 272 size_t file_size = 0U; 273 if (!FileSize(*file, &file_size)) { 274 std::cerr << "Error finding size of " << file_type << " file '" 275 << file_name << "': " << strerror(errno) << std::endl; 276 return false; 277 } 278 buffer_size = static_cast<size_t>(FLAGS_buffersize); 279 if (file_size < buffer_size) { 280 // Allocate just enough memory to store the entire file 281 buffer_size = file_size; 282 } 283 } 284 buffer->resize(buffer_size); 285 return true; 286 } 287 288 // Opens the output file for streamed read operations using the 289 // standard C I/O library, i.e., fopen(), fwrite(), fclose(). 290 // No output buffer is allocated because the encoded/decoded output 291 // is constructed progressively using a std::string object 292 // whose buffer is resized as needed. 293 bool VCDiffFileBasedCoder::OpenOutputFile() { 294 if (output_file_name_.empty()) { 295 #ifdef WIN32 296 _setmode(_fileno(stdout), _O_BINARY); 297 #endif 298 output_file_ = stdout; 299 } else { 300 output_file_ = fopen(output_file_name_.c_str(), "wb"); 301 if (!output_file_) { 302 std::cerr << "Error opening " << output_file_type_ << " file '" 303 << output_file_name_ 304 << "': " << strerror(errno) << std::endl; 305 return false; 306 } 307 } 308 return true; 309 } 310 311 bool VCDiffFileBasedCoder::ReadInput(size_t* bytes_read) { 312 // Read from file or stdin 313 *bytes_read = fread(&input_buffer_[0], 1, input_buffer_.size(), input_file_); 314 if (ferror(input_file_)) { 315 std::cerr << "Error reading from " << input_file_type_ << " file '" 316 << input_file_name_ 317 << "': " << strerror(errno) << std::endl; 318 return false; 319 } 320 return true; 321 } 322 323 bool VCDiffFileBasedCoder::WriteOutput(const string& output) { 324 if (!output.empty()) { 325 // Some new output has been generated and is ready to be written 326 // to the output file or to stdout. 327 fwrite(output.data(), 1, output.size(), output_file_); 328 if (ferror(output_file_)) { 329 std::cerr << "Error writing " << output.size() << " bytes to " 330 << output_file_type_ << " file '" << output_file_name_ 331 << "': " << strerror(errno) << std::endl; 332 return false; 333 } 334 } 335 return true; 336 } 337 338 bool VCDiffFileBasedCoder::CompareOutput(const string& output) { 339 if (!output.empty()) { 340 size_t output_size = output.size(); 341 // Some new output has been generated and is ready to be compared against 342 // the output file. 343 if (output_size > compare_buffer_.size()) { 344 compare_buffer_.resize(output_size); 345 } 346 size_t bytes_read = fread(&compare_buffer_[0], 347 1, 348 output_size, 349 output_file_); 350 if (ferror(output_file_)) { 351 std::cerr << "Error reading from " << output_file_type_ << " file '" 352 << output_file_name_ << "': " << strerror(errno) << std::endl; 353 return false; 354 } 355 if (bytes_read < output_size) { 356 std::cerr << "Decoded target is longer than original target file" 357 << std::endl; 358 return false; 359 } 360 if (output.compare(0, output_size, &compare_buffer_[0], bytes_read) != 0) { 361 std::cerr << "Original target file does not match decoded target" 362 << std::endl; 363 return false; 364 } 365 } 366 return true; 367 } 368 369 bool VCDiffFileBasedCoder::Encode() { 370 input_file_type_ = "target"; 371 input_file_name_ = FLAGS_target; 372 output_file_type_ = "delta"; 373 output_file_name_ = FLAGS_delta; 374 if (!OpenDictionary() || !OpenInputFile() || !OpenOutputFile()) { 375 return false; 376 } 377 // Issue 6: Visual Studio STL produces a runtime exception 378 // if &dictionary_[0] is attempted for an empty dictionary. 379 if (dictionary_.empty()) { 380 hashed_dictionary_.reset(new open_vcdiff::HashedDictionary("", 0)); 381 } else { 382 hashed_dictionary_.reset( 383 new open_vcdiff::HashedDictionary(&dictionary_[0], 384 dictionary_.size())); 385 } 386 if (!hashed_dictionary_->Init()) { 387 std::cerr << "Error initializing hashed dictionary" << std::endl; 388 return false; 389 } 390 VCDiffFormatExtensionFlags format_flags = open_vcdiff::VCD_STANDARD_FORMAT; 391 if (FLAGS_interleaved) { 392 format_flags |= open_vcdiff::VCD_FORMAT_INTERLEAVED; 393 } 394 if (FLAGS_checksum) { 395 format_flags |= open_vcdiff::VCD_FORMAT_CHECKSUM; 396 } 397 open_vcdiff::VCDiffStreamingEncoder encoder(hashed_dictionary_.get(), 398 format_flags, 399 FLAGS_target_matches); 400 string output; 401 size_t input_size = 0; 402 size_t output_size = 0; 403 { 404 if (!encoder.StartEncoding(&output)) { 405 std::cerr << "Error during encoder initialization" << std::endl; 406 return false; 407 } 408 } 409 do { 410 size_t bytes_read = 0; 411 if (!WriteOutput(output) || !ReadInput(&bytes_read)) { 412 return false; 413 } 414 output_size += output.size(); 415 output.clear(); 416 if (bytes_read > 0) { 417 input_size += bytes_read; 418 if (!encoder.EncodeChunk(&input_buffer_[0], bytes_read, &output)) { 419 std::cerr << "Error trying to encode data chunk of length " 420 << bytes_read << std::endl; 421 return false; 422 } 423 } 424 } while (!feof(input_file_)); 425 encoder.FinishEncoding(&output); 426 if (!WriteOutput(output)) { 427 return false; 428 } 429 output_size += output.size(); 430 output.clear(); 431 if (FLAGS_stats && (input_size > 0)) { 432 std::cerr << "Original size: " << input_size 433 << "\tCompressed size: " << output_size << " (" 434 << ((static_cast<double>(output_size) / input_size) * 100) 435 << "% of original)" << std::endl; 436 } 437 return true; 438 } 439 440 bool VCDiffFileBasedCoder::Decode() { 441 input_file_type_ = "delta"; 442 input_file_name_ = FLAGS_delta; 443 output_file_type_ = "target"; 444 output_file_name_ = FLAGS_target; 445 if (!OpenDictionary() || !OpenInputFile() || !OpenOutputFile()) { 446 return false; 447 } 448 449 open_vcdiff::VCDiffStreamingDecoder decoder; 450 decoder.SetMaximumTargetFileSize( 451 static_cast<size_t>(FLAGS_max_target_file_size)); 452 decoder.SetMaximumTargetWindowSize( 453 static_cast<size_t>(FLAGS_max_target_window_size)); 454 decoder.SetAllowVcdTarget(FLAGS_allow_vcd_target); 455 string output; 456 size_t input_size = 0; 457 size_t output_size = 0; 458 // Issue 6: Visual Studio STL produces a runtime exception 459 // if &dictionary_[0] is attempted for an empty dictionary. 460 if (dictionary_.empty()) { 461 decoder.StartDecoding("", 0); 462 } else { 463 decoder.StartDecoding(&dictionary_[0], dictionary_.size()); 464 } 465 466 do { 467 size_t bytes_read = 0; 468 if (!ReadInput(&bytes_read)) { 469 return false; 470 } 471 if (bytes_read > 0) { 472 input_size += bytes_read; 473 if (!decoder.DecodeChunk(&input_buffer_[0], bytes_read, &output)) { 474 std::cerr << "Error trying to decode data chunk of length " 475 << bytes_read << std::endl; 476 return false; 477 } 478 } 479 if (!WriteOutput(output)) { 480 return false; 481 } 482 output_size += output.size(); 483 output.clear(); 484 } while (!feof(input_file_)); 485 if (!decoder.FinishDecoding()) { 486 std::cerr << "Decode error; '" << FLAGS_delta 487 << " may not be a valid VCDIFF delta file" << std::endl; 488 return false; 489 } 490 if (!WriteOutput(output)) { 491 return false; 492 } 493 output_size += output.size(); 494 output.clear(); 495 if (FLAGS_stats && (output_size > 0)) { 496 std::cerr << "Decompressed size: " << output_size 497 << "\tCompressed size: " << input_size << " (" 498 << ((static_cast<double>(input_size) / output_size) * 100) 499 << "% of original)" << std::endl; 500 } 501 return true; 502 } 503 504 bool VCDiffFileBasedCoder::DecodeAndCompare() { 505 input_file_type_ = "delta"; 506 input_file_name_ = FLAGS_delta; 507 output_file_type_ = "target"; 508 output_file_name_ = FLAGS_target; 509 if (!OpenDictionary() || !OpenInputFile() || !OpenOutputFileForCompare()) { 510 return false; 511 } 512 513 open_vcdiff::VCDiffStreamingDecoder decoder; 514 decoder.SetMaximumTargetFileSize( 515 static_cast<size_t>(FLAGS_max_target_file_size)); 516 decoder.SetMaximumTargetWindowSize( 517 static_cast<size_t>(FLAGS_max_target_window_size)); 518 decoder.SetAllowVcdTarget(FLAGS_allow_vcd_target); 519 string output; 520 size_t input_size = 0; 521 size_t output_size = 0; 522 // Issue 6: Visual Studio STL produces a runtime exception 523 // if &dictionary_[0] is attempted for an empty dictionary. 524 if (dictionary_.empty()) { 525 decoder.StartDecoding("", 0); 526 } else { 527 decoder.StartDecoding(&dictionary_[0], dictionary_.size()); 528 } 529 530 do { 531 size_t bytes_read = 0; 532 if (!ReadInput(&bytes_read)) { 533 return false; 534 } 535 if (bytes_read > 0) { 536 input_size += bytes_read; 537 if (!decoder.DecodeChunk(&input_buffer_[0], bytes_read, &output)) { 538 std::cerr << "Error trying to decode data chunk of length " 539 << bytes_read << std::endl; 540 return false; 541 } 542 } 543 if (!CompareOutput(output)) { 544 return false; 545 } 546 output_size += output.size(); 547 output.clear(); 548 } while (!feof(input_file_)); 549 if (!decoder.FinishDecoding()) { 550 std::cerr << "Decode error; '" << FLAGS_delta 551 << " may not be a valid VCDIFF delta file" << std::endl; 552 return false; 553 } 554 if (!CompareOutput(output)) { 555 return false; 556 } 557 output_size += output.size(); 558 output.clear(); 559 if (fgetc(output_file_) != EOF) { 560 std::cerr << "Decoded target is shorter than original target file" 561 << std::endl; 562 return false; 563 } 564 if (ferror(output_file_)) { 565 std::cerr << "Error reading end-of-file indicator from target file" 566 << std::endl; 567 return false; 568 } 569 if (FLAGS_stats && (output_size > 0)) { 570 std::cerr << "Decompressed size: " << output_size 571 << "\tCompressed size: " << input_size << " (" 572 << ((static_cast<double>(input_size) / output_size) * 100) 573 << "% of original)" << std::endl; 574 } 575 return true; 576 } 577 578 } // namespace open_vcdiff 579 580 int main(int argc, char** argv) { 581 const char* const command_name = argv[0]; 582 google::SetUsageMessage(kUsageString); 583 google::ParseCommandLineFlags(&argc, &argv, true); 584 if (argc != 2) { 585 std::cerr << command_name << ": Must specify exactly one command option" 586 << std::endl; 587 ShowUsageWithFlagsRestrict(command_name, "vcdiff"); 588 return 1; 589 } 590 const char* const command_option = argv[1]; 591 if (FLAGS_dictionary.empty()) { 592 std::cerr << command_name << " " << command_option 593 << ": Must specify --dictionary <file-name>" << std::endl; 594 ShowUsageWithFlagsRestrict(command_name, "vcdiff"); 595 return 1; 596 } 597 if (!GetCommandLineFlagInfoOrDie("buffersize").is_default && 598 (FLAGS_buffersize == 0)) { 599 std::cerr << command_name << ": Option --buffersize cannot be 0" 600 << std::endl; 601 ShowUsageWithFlagsRestrict(command_name, "vcdiff"); 602 return 1; 603 } 604 if ((strcmp(command_option, "encode") == 0) || 605 (strcmp(command_option, "delta") == 0)) { 606 open_vcdiff::VCDiffFileBasedCoder coder; 607 if (!coder.Encode()) { 608 return 1; 609 } 610 // The destructor for VCDiffFileBasedCoder will clean up the open files 611 // and allocated memory. 612 } else if ((strcmp(command_option, "decode") == 0) || 613 (strcmp(command_option, "patch") == 0)) { 614 open_vcdiff::VCDiffFileBasedCoder coder; 615 if (!coder.Decode()) { 616 return 1; 617 } 618 } else if ((strcmp(command_option, "test") == 0)) { 619 // "vcdiff test" does not appear in the usage string, but can be 620 // used for debugging. It encodes, then decodes, then compares the result 621 // with the original target. It expects the same arguments as 622 // "vcdiff encode", with the additional requirement that the --target 623 // and --delta file arguments must be specified, rather than using stdin 624 // or stdout. It produces a delta file just as for "vcdiff encode". 625 if (FLAGS_target.empty() || FLAGS_delta.empty()) { 626 std::cerr << command_name 627 << " test: Must specify both --target <file-name>" 628 " and --delta <file-name>" << std::endl; 629 return 1; 630 } 631 const string original_target(FLAGS_target); 632 // Put coder into a separate scope. 633 { 634 open_vcdiff::VCDiffFileBasedCoder coder; 635 if (!coder.Encode()) { 636 return 1; 637 } 638 } 639 { 640 open_vcdiff::VCDiffFileBasedCoder coder; 641 if (!coder.DecodeAndCompare()) { 642 return 1; 643 } 644 } 645 } else { 646 std::cerr << command_name << ": Unrecognized command option " 647 << command_option << std::endl; 648 ShowUsageWithFlagsRestrict(command_name, "vcdiff"); 649 return 1; 650 } 651 return 0; 652 } 653