1 // Copyright 2008 Google Inc. 2 // Author: Lincoln Smith 3 // 4 // Licensed under the Apache License, Version 2.0 (the "License"); 5 // you may not use this file except in compliance with the License. 6 // You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 // 16 // A command-line interface to the open-vcdiff library. 17 18 #include <config.h> 19 #include <assert.h> 20 #include <errno.h> 21 #ifdef WIN32 22 #include <fcntl.h> 23 #include <io.h> 24 #endif // WIN32 25 #include <stdio.h> 26 #include <string.h> // strerror 27 #include <iostream> 28 #include <memory> 29 #include <string> 30 #include <vector> 31 #include "gflags/gflags.h" 32 #include "google/vcdecoder.h" 33 #include "google/vcencoder.h" 34 35 #ifndef HAS_GLOBAL_STRING 36 using std::string; 37 #endif // !HAS_GLOBAL_STRING 38 using google::GetCommandLineFlagInfoOrDie; 39 using google::ShowUsageWithFlagsRestrict; 40 41 static const size_t kDefaultMaxTargetSize = 1 << 26; // 64 MB 42 43 // Definitions of command-line flags 44 DEFINE_string(dictionary, "", 45 "File containing dictionary data (required)"); 46 DEFINE_string(target, "", 47 "Target file (default is stdin for encode, stdout for decode"); 48 DEFINE_string(delta, "", 49 "Encoded delta file (default is stdout for encode, " 50 "stdin for decode"); 51 // --buffersize is the maximum allowable size of a target window. 52 // This value may be increased if there is sufficient memory available. 53 DEFINE_uint64(buffersize, 1 << 20, // 1 MB 54 "Buffer size for reading input file"); 55 DEFINE_bool(allow_vcd_target, true, 56 "If false, the decoder issues an error when the VCD_TARGET flag " 57 "is encountered"); 58 DEFINE_bool(checksum, false, 59 "Include an Adler32 checksum of the target data when encoding"); 60 DEFINE_bool(interleaved, false, "Use interleaved format"); 61 DEFINE_bool(json, false, "Output diff in the JSON format when encoding"); 62 DEFINE_bool(stats, false, "Report compression percentage"); 63 DEFINE_bool(target_matches, false, "Find duplicate strings in target data" 64 " as well as dictionary data"); 65 DEFINE_uint64(max_target_file_size, kDefaultMaxTargetSize, 66 "Maximum target file size allowed by decoder"); 67 DEFINE_uint64(max_target_window_size, kDefaultMaxTargetSize, 68 "Maximum target window size allowed by decoder"); 69 70 static const char* const kUsageString = 71 " {encode | delta | decode | patch }[ <options> ]\n" 72 "encode or delta: create delta file from dictionary and target file\n" 73 "decode or patch: reconstruct target file from dictionary and delta file"; 74 75 namespace open_vcdiff { 76 77 class VCDiffFileBasedCoder { 78 public: 79 VCDiffFileBasedCoder(); 80 ~VCDiffFileBasedCoder(); 81 82 // Once the command-line arguments have been parsed, these functions 83 // will use the supplied options to carry out a file-based encode 84 // or decode operation. 85 bool Encode(); 86 bool Decode(); 87 bool DecodeAndCompare(); // for "vcdiff test"; compare target with original 88 89 private: 90 // Determines the size of the file. The given file must be an input file 91 // opened for reading only, not an input stream such as stdin. The function 92 // returns true and populates file_size if successful; otherwise, it returns 93 // false. 94 static bool FileSize(FILE* file, size_t* file_size); 95 96 // Opens a file for incremental reading. file_name is the name of the file 97 // to be opened. file_type should be a descriptive name (like "target") for 98 // use in log messages. If successful, returns true and sets *file to a 99 // valid input file, *buffer to a region of memory allocated using malloc() 100 // (so the caller must release it using free()), and buffer_size to the size 101 // of the buffer, which will not be larger than the size of the file, and 102 // will not be smaller than the --buffersize option. If the function fails, 103 // it outputs a log message and returns false. 104 bool OpenFileForReading(const string& file_name, 105 const char* file_type, 106 FILE** file, 107 std::vector<char>* buffer); 108 109 // Opens the dictionary file and reads it into a newly allocated buffer. 110 // If successful, returns true and populates dictionary_ with the dictionary 111 // contents; otherwise, returns false. 112 bool OpenDictionary(); 113 114 // Opens the input file (the delta or target file) for reading. 115 // Allocates space for the input buffer. If successful, 116 // input_file_ will be valid and input_buffer_ will be allocated. 117 bool OpenInputFile() { 118 return OpenFileForReading(input_file_name_, 119 input_file_type_, 120 &input_file_, 121 &input_buffer_); 122 } 123 124 // Opens the output file (the target or delta file) for writing. 125 // If successful, output_file_ will be valid. 126 bool OpenOutputFile(); 127 128 // Opens the output file (the target file) for comparison against the decoded 129 // output when using "vcdiff test". 130 bool OpenOutputFileForCompare() { 131 return OpenFileForReading(output_file_name_, 132 output_file_type_, 133 &output_file_, 134 &compare_buffer_); 135 } 136 137 // Reads as much input data as possible from the input file 138 // into input_buffer_. If successful, returns true and sets *bytes_read 139 // to the number of bytes read into input_buffer_. If an error occurs, 140 // writes an error log message and returns false. 141 bool ReadInput(size_t* bytes_read); 142 143 // Writes the contents of output to output_file_. If successful, returns 144 // true. If an error occurs, writes an error log message and returns false. 145 bool WriteOutput(const string& output); 146 147 // Reads a number of bytes from output_file_ equal to the size of output, 148 // and compares to make sure they match the contents of output. If the bytes 149 // do not match, or if end of file is reached before the expected number of 150 // bytes have been read, or a read error occurs, the function returns false; 151 // otherwise, returns true. 152 bool CompareOutput(const string& output); 153 154 // Dictionary contents. The entire dictionary file will be read into memory. 155 std::vector<char> dictionary_; 156 157 std::auto_ptr<open_vcdiff::HashedDictionary> hashed_dictionary_; 158 159 // These should be set to either "delta" or "target". They are only 160 // used in log messages such as "Error opening delta file..." 161 const char* input_file_type_; 162 const char* output_file_type_; 163 164 // The filenames used for input and output. Will be empty if stdin 165 // or stdout is being used. 166 string input_file_name_; 167 string output_file_name_; 168 169 // stdio-style file handles for the input and output files and the dictionary. 170 // When encoding, input_file_ is the target file and output_file_ is the delta 171 // file; when decoding, the reverse is true. The dictionary is always read 172 // from a file rather than from standard input. 173 FILE* input_file_; 174 FILE* output_file_; 175 176 // A memory buffer used to load the input file into memory. If the input 177 // comes from stdin because no input file was specified, then the size of 178 // input_buffer_ will be the value specified by the --buffersize option. 179 // If the input comes from a file, then the buffer will be allocated to match 180 // the file size, if possible. However, the buffer will not exceed 181 // --buffersize bytes in length. 182 std::vector<char> input_buffer_; 183 184 // A memory buffer used to load the output file into memory for comparison 185 // if "vcdiff test" is specified. 186 std::vector<char> compare_buffer_; 187 188 // Making these private avoids implicit copy constructor & assignment operator 189 VCDiffFileBasedCoder(const VCDiffFileBasedCoder&); // NOLINT 190 void operator=(const VCDiffFileBasedCoder&); 191 }; 192 193 inline VCDiffFileBasedCoder::VCDiffFileBasedCoder() 194 : input_file_type_(""), 195 output_file_type_(""), 196 input_file_(NULL), 197 output_file_(NULL) { } 198 199 VCDiffFileBasedCoder::~VCDiffFileBasedCoder() { 200 if (input_file_ && (input_file_ != stdin)) { 201 fclose(input_file_); 202 input_file_ = NULL; 203 } 204 if (output_file_ && (output_file_ != stdout)) { 205 fclose(output_file_); 206 output_file_ = NULL; 207 } 208 } 209 210 bool VCDiffFileBasedCoder::FileSize(FILE* file, size_t* file_size) { 211 long initial_position = ftell(file); 212 if (fseek(file, 0, SEEK_END) != 0) { 213 return false; 214 } 215 *file_size = static_cast<size_t>(ftell(file)); 216 if (fseek(file, initial_position, SEEK_SET) != 0) { 217 return false; 218 } 219 return true; 220 } 221 222 bool VCDiffFileBasedCoder::OpenDictionary() { 223 assert(dictionary_.empty()); 224 assert(!FLAGS_dictionary.empty()); 225 FILE* dictionary_file = fopen(FLAGS_dictionary.c_str(), "rb"); 226 if (!dictionary_file) { 227 std::cerr << "Error opening dictionary file '" << FLAGS_dictionary 228 << "': " << strerror(errno) << std::endl; 229 return false; 230 } 231 size_t dictionary_size = 0U; 232 if (!FileSize(dictionary_file, &dictionary_size)) { 233 std::cerr << "Error finding size of dictionary file '" << FLAGS_dictionary 234 << "': " << strerror(errno) << std::endl; 235 return false; 236 } 237 dictionary_.resize(dictionary_size); 238 if (dictionary_size > 0) { 239 if (fread(&dictionary_[0], 1, dictionary_size, dictionary_file) 240 != dictionary_size) { 241 std::cerr << "Unable to read dictionary file '" << FLAGS_dictionary 242 << "': " << strerror(errno) << std::endl; 243 fclose(dictionary_file); 244 dictionary_.clear(); 245 return false; 246 } 247 } 248 fclose(dictionary_file); 249 return true; 250 } 251 252 bool VCDiffFileBasedCoder::OpenFileForReading(const string& file_name, 253 const char* file_type, 254 FILE** file, 255 std::vector<char>* buffer) { 256 assert(buffer->empty()); 257 size_t buffer_size = 0U; 258 if (!*file && file_name.empty()) { 259 #ifdef WIN32 260 _setmode(_fileno(stdin), _O_BINARY); 261 #endif 262 *file = stdin; 263 buffer_size = static_cast<size_t>(FLAGS_buffersize); 264 } else { 265 if (!*file) { 266 *file = fopen(file_name.c_str(), "rb"); 267 if (!*file) { 268 std::cerr << "Error opening " << file_type << " file '" 269 << file_name << "': " << strerror(errno) << std::endl; 270 return false; 271 } 272 } 273 size_t file_size = 0U; 274 if (!FileSize(*file, &file_size)) { 275 std::cerr << "Error finding size of " << file_type << " file '" 276 << file_name << "': " << strerror(errno) << std::endl; 277 return false; 278 } 279 buffer_size = static_cast<size_t>(FLAGS_buffersize); 280 if (file_size < buffer_size) { 281 // Allocate just enough memory to store the entire file 282 buffer_size = file_size; 283 } 284 } 285 buffer->resize(buffer_size); 286 return true; 287 } 288 289 // Opens the output file for streamed read operations using the 290 // standard C I/O library, i.e., fopen(), fwrite(), fclose(). 291 // No output buffer is allocated because the encoded/decoded output 292 // is constructed progressively using a std::string object 293 // whose buffer is resized as needed. 294 bool VCDiffFileBasedCoder::OpenOutputFile() { 295 if (output_file_name_.empty()) { 296 #ifdef WIN32 297 _setmode(_fileno(stdout), _O_BINARY); 298 #endif 299 output_file_ = stdout; 300 } else { 301 output_file_ = fopen(output_file_name_.c_str(), "wb"); 302 if (!output_file_) { 303 std::cerr << "Error opening " << output_file_type_ << " file '" 304 << output_file_name_ 305 << "': " << strerror(errno) << std::endl; 306 return false; 307 } 308 } 309 return true; 310 } 311 312 bool VCDiffFileBasedCoder::ReadInput(size_t* bytes_read) { 313 // Read from file or stdin 314 *bytes_read = fread(&input_buffer_[0], 1, input_buffer_.size(), input_file_); 315 if (ferror(input_file_)) { 316 std::cerr << "Error reading from " << input_file_type_ << " file '" 317 << input_file_name_ 318 << "': " << strerror(errno) << std::endl; 319 return false; 320 } 321 return true; 322 } 323 324 bool VCDiffFileBasedCoder::WriteOutput(const string& output) { 325 if (!output.empty()) { 326 // Some new output has been generated and is ready to be written 327 // to the output file or to stdout. 328 fwrite(output.data(), 1, output.size(), output_file_); 329 if (ferror(output_file_)) { 330 std::cerr << "Error writing " << output.size() << " bytes to " 331 << output_file_type_ << " file '" << output_file_name_ 332 << "': " << strerror(errno) << std::endl; 333 return false; 334 } 335 } 336 return true; 337 } 338 339 bool VCDiffFileBasedCoder::CompareOutput(const string& output) { 340 if (!output.empty()) { 341 size_t output_size = output.size(); 342 // Some new output has been generated and is ready to be compared against 343 // the output file. 344 if (output_size > compare_buffer_.size()) { 345 compare_buffer_.resize(output_size); 346 } 347 size_t bytes_read = fread(&compare_buffer_[0], 348 1, 349 output_size, 350 output_file_); 351 if (ferror(output_file_)) { 352 std::cerr << "Error reading from " << output_file_type_ << " file '" 353 << output_file_name_ << "': " << strerror(errno) << std::endl; 354 return false; 355 } 356 if (bytes_read < output_size) { 357 std::cerr << "Decoded target is longer than original target file" 358 << std::endl; 359 return false; 360 } 361 if (output.compare(0, output_size, &compare_buffer_[0], bytes_read) != 0) { 362 std::cerr << "Original target file does not match decoded target" 363 << std::endl; 364 return false; 365 } 366 } 367 return true; 368 } 369 370 bool VCDiffFileBasedCoder::Encode() { 371 input_file_type_ = "target"; 372 input_file_name_ = FLAGS_target; 373 output_file_type_ = "delta"; 374 output_file_name_ = FLAGS_delta; 375 if (!OpenDictionary() || !OpenInputFile() || !OpenOutputFile()) { 376 return false; 377 } 378 // Issue 6: Visual Studio STL produces a runtime exception 379 // if &dictionary_[0] is attempted for an empty dictionary. 380 if (dictionary_.empty()) { 381 hashed_dictionary_.reset(new open_vcdiff::HashedDictionary("", 0)); 382 } else { 383 hashed_dictionary_.reset( 384 new open_vcdiff::HashedDictionary(&dictionary_[0], 385 dictionary_.size())); 386 } 387 if (!hashed_dictionary_->Init()) { 388 std::cerr << "Error initializing hashed dictionary" << std::endl; 389 return false; 390 } 391 VCDiffFormatExtensionFlags format_flags = open_vcdiff::VCD_STANDARD_FORMAT; 392 if (FLAGS_interleaved) { 393 format_flags |= open_vcdiff::VCD_FORMAT_INTERLEAVED; 394 } 395 if (FLAGS_checksum) { 396 format_flags |= open_vcdiff::VCD_FORMAT_CHECKSUM; 397 } 398 if (FLAGS_json) { 399 format_flags |= open_vcdiff::VCD_FORMAT_JSON; 400 } 401 open_vcdiff::VCDiffStreamingEncoder encoder(hashed_dictionary_.get(), 402 format_flags, 403 FLAGS_target_matches); 404 string output; 405 size_t input_size = 0; 406 size_t output_size = 0; 407 { 408 if (!encoder.StartEncoding(&output)) { 409 std::cerr << "Error during encoder initialization" << std::endl; 410 return false; 411 } 412 } 413 do { 414 size_t bytes_read = 0; 415 if (!WriteOutput(output) || !ReadInput(&bytes_read)) { 416 return false; 417 } 418 output_size += output.size(); 419 output.clear(); 420 if (bytes_read > 0) { 421 input_size += bytes_read; 422 if (!encoder.EncodeChunk(&input_buffer_[0], bytes_read, &output)) { 423 std::cerr << "Error trying to encode data chunk of length " 424 << bytes_read << std::endl; 425 return false; 426 } 427 } 428 } while (!feof(input_file_)); 429 encoder.FinishEncoding(&output); 430 if (!WriteOutput(output)) { 431 return false; 432 } 433 output_size += output.size(); 434 output.clear(); 435 if (FLAGS_stats && (input_size > 0)) { 436 std::cerr << "Original size: " << input_size 437 << "\tCompressed size: " << output_size << " (" 438 << ((static_cast<double>(output_size) / input_size) * 100) 439 << "% of original)" << std::endl; 440 } 441 return true; 442 } 443 444 bool VCDiffFileBasedCoder::Decode() { 445 input_file_type_ = "delta"; 446 input_file_name_ = FLAGS_delta; 447 output_file_type_ = "target"; 448 output_file_name_ = FLAGS_target; 449 if (!OpenDictionary() || !OpenInputFile() || !OpenOutputFile()) { 450 return false; 451 } 452 453 open_vcdiff::VCDiffStreamingDecoder decoder; 454 decoder.SetMaximumTargetFileSize( 455 static_cast<size_t>(FLAGS_max_target_file_size)); 456 decoder.SetMaximumTargetWindowSize( 457 static_cast<size_t>(FLAGS_max_target_window_size)); 458 decoder.SetAllowVcdTarget(FLAGS_allow_vcd_target); 459 string output; 460 size_t input_size = 0; 461 size_t output_size = 0; 462 // Issue 6: Visual Studio STL produces a runtime exception 463 // if &dictionary_[0] is attempted for an empty dictionary. 464 if (dictionary_.empty()) { 465 decoder.StartDecoding("", 0); 466 } else { 467 decoder.StartDecoding(&dictionary_[0], dictionary_.size()); 468 } 469 470 do { 471 size_t bytes_read = 0; 472 if (!ReadInput(&bytes_read)) { 473 return false; 474 } 475 if (bytes_read > 0) { 476 input_size += bytes_read; 477 if (!decoder.DecodeChunk(&input_buffer_[0], bytes_read, &output)) { 478 std::cerr << "Error trying to decode data chunk of length " 479 << bytes_read << std::endl; 480 return false; 481 } 482 } 483 if (!WriteOutput(output)) { 484 return false; 485 } 486 output_size += output.size(); 487 output.clear(); 488 } while (!feof(input_file_)); 489 if (!decoder.FinishDecoding()) { 490 std::cerr << "Decode error; '" << FLAGS_delta 491 << " may not be a valid VCDIFF delta file" << std::endl; 492 return false; 493 } 494 if (!WriteOutput(output)) { 495 return false; 496 } 497 output_size += output.size(); 498 output.clear(); 499 if (FLAGS_stats && (output_size > 0)) { 500 std::cerr << "Decompressed size: " << output_size 501 << "\tCompressed size: " << input_size << " (" 502 << ((static_cast<double>(input_size) / output_size) * 100) 503 << "% of original)" << std::endl; 504 } 505 return true; 506 } 507 508 bool VCDiffFileBasedCoder::DecodeAndCompare() { 509 input_file_type_ = "delta"; 510 input_file_name_ = FLAGS_delta; 511 output_file_type_ = "target"; 512 output_file_name_ = FLAGS_target; 513 if (!OpenDictionary() || !OpenInputFile() || !OpenOutputFileForCompare()) { 514 return false; 515 } 516 517 open_vcdiff::VCDiffStreamingDecoder decoder; 518 decoder.SetMaximumTargetFileSize( 519 static_cast<size_t>(FLAGS_max_target_file_size)); 520 decoder.SetMaximumTargetWindowSize( 521 static_cast<size_t>(FLAGS_max_target_window_size)); 522 decoder.SetAllowVcdTarget(FLAGS_allow_vcd_target); 523 string output; 524 size_t input_size = 0; 525 size_t output_size = 0; 526 // Issue 6: Visual Studio STL produces a runtime exception 527 // if &dictionary_[0] is attempted for an empty dictionary. 528 if (dictionary_.empty()) { 529 decoder.StartDecoding("", 0); 530 } else { 531 decoder.StartDecoding(&dictionary_[0], dictionary_.size()); 532 } 533 534 do { 535 size_t bytes_read = 0; 536 if (!ReadInput(&bytes_read)) { 537 return false; 538 } 539 if (bytes_read > 0) { 540 input_size += bytes_read; 541 if (!decoder.DecodeChunk(&input_buffer_[0], bytes_read, &output)) { 542 std::cerr << "Error trying to decode data chunk of length " 543 << bytes_read << std::endl; 544 return false; 545 } 546 } 547 if (!CompareOutput(output)) { 548 return false; 549 } 550 output_size += output.size(); 551 output.clear(); 552 } while (!feof(input_file_)); 553 if (!decoder.FinishDecoding()) { 554 std::cerr << "Decode error; '" << FLAGS_delta 555 << " may not be a valid VCDIFF delta file" << std::endl; 556 return false; 557 } 558 if (!CompareOutput(output)) { 559 return false; 560 } 561 output_size += output.size(); 562 output.clear(); 563 if (fgetc(output_file_) != EOF) { 564 std::cerr << "Decoded target is shorter than original target file" 565 << std::endl; 566 return false; 567 } 568 if (ferror(output_file_)) { 569 std::cerr << "Error reading end-of-file indicator from target file" 570 << std::endl; 571 return false; 572 } 573 if (FLAGS_stats && (output_size > 0)) { 574 std::cerr << "Decompressed size: " << output_size 575 << "\tCompressed size: " << input_size << " (" 576 << ((static_cast<double>(input_size) / output_size) * 100) 577 << "% of original)" << std::endl; 578 } 579 return true; 580 } 581 582 } // namespace open_vcdiff 583 584 int main(int argc, char** argv) { 585 const char* const command_name = argv[0]; 586 google::SetUsageMessage(kUsageString); 587 google::ParseCommandLineFlags(&argc, &argv, true); 588 if (argc != 2) { 589 std::cerr << command_name << ": Must specify exactly one command option" 590 << std::endl; 591 ShowUsageWithFlagsRestrict(command_name, "vcdiff"); 592 return 1; 593 } 594 const char* const command_option = argv[1]; 595 if (FLAGS_dictionary.empty()) { 596 std::cerr << command_name << " " << command_option 597 << ": Must specify --dictionary <file-name>" << std::endl; 598 ShowUsageWithFlagsRestrict(command_name, "vcdiff"); 599 return 1; 600 } 601 if (!GetCommandLineFlagInfoOrDie("buffersize").is_default && 602 (FLAGS_buffersize == 0)) { 603 std::cerr << command_name << ": Option --buffersize cannot be 0" 604 << std::endl; 605 ShowUsageWithFlagsRestrict(command_name, "vcdiff"); 606 return 1; 607 } 608 if ((strcmp(command_option, "encode") == 0) || 609 (strcmp(command_option, "delta") == 0)) { 610 open_vcdiff::VCDiffFileBasedCoder coder; 611 if (!coder.Encode()) { 612 return 1; 613 } 614 // The destructor for VCDiffFileBasedCoder will clean up the open files 615 // and allocated memory. 616 } else if ((strcmp(command_option, "decode") == 0) || 617 (strcmp(command_option, "patch") == 0)) { 618 open_vcdiff::VCDiffFileBasedCoder coder; 619 if (!coder.Decode()) { 620 return 1; 621 } 622 } else if ((strcmp(command_option, "test") == 0)) { 623 // "vcdiff test" does not appear in the usage string, but can be 624 // used for debugging. It encodes, then decodes, then compares the result 625 // with the original target. It expects the same arguments as 626 // "vcdiff encode", with the additional requirement that the --target 627 // and --delta file arguments must be specified, rather than using stdin 628 // or stdout. It produces a delta file just as for "vcdiff encode". 629 if (FLAGS_target.empty() || FLAGS_delta.empty()) { 630 std::cerr << command_name 631 << " test: Must specify both --target <file-name>" 632 " and --delta <file-name>" << std::endl; 633 return 1; 634 } 635 const string original_target(FLAGS_target); 636 // Put coder into a separate scope. 637 { 638 open_vcdiff::VCDiffFileBasedCoder coder; 639 if (!coder.Encode()) { 640 return 1; 641 } 642 } 643 { 644 open_vcdiff::VCDiffFileBasedCoder coder; 645 if (!coder.DecodeAndCompare()) { 646 return 1; 647 } 648 } 649 } else { 650 std::cerr << command_name << ": Unrecognized command option " 651 << command_option << std::endl; 652 ShowUsageWithFlagsRestrict(command_name, "vcdiff"); 653 return 1; 654 } 655 return 0; 656 } 657