// Copyright 2008 Google Inc. // Author: Lincoln Smith // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // // A command-line interface to the open-vcdiff library. #include <config.h> #include <assert.h> #include <errno.h> #ifdef WIN32 #include <fcntl.h> #include <io.h> #endif // WIN32 #include <stdio.h> #include <string.h> // strerror #include <iostream> #include <memory> #include <string> #include <vector> #include "gflags/gflags.h" #include "google/vcdecoder.h" #include "google/vcencoder.h" #ifndef HAS_GLOBAL_STRING using std::string; #endif // !HAS_GLOBAL_STRING using google::GetCommandLineFlagInfoOrDie; using google::ShowUsageWithFlagsRestrict; static const size_t kDefaultMaxTargetSize = 1 << 26; // 64 MB // Definitions of command-line flags DEFINE_string(dictionary, "", "File containing dictionary data (required)"); DEFINE_string(target, "", "Target file (default is stdin for encode, stdout for decode"); DEFINE_string(delta, "", "Encoded delta file (default is stdout for encode, " "stdin for decode"); // --buffersize is the maximum allowable size of a target window. // This value may be increased if there is sufficient memory available. DEFINE_uint64(buffersize, 1 << 20, // 1 MB "Buffer size for reading input file"); DEFINE_bool(allow_vcd_target, true, "If false, the decoder issues an error when the VCD_TARGET flag " "is encountered"); DEFINE_bool(checksum, false, "Include an Adler32 checksum of the target data when encoding"); DEFINE_bool(interleaved, false, "Use interleaved format"); DEFINE_bool(json, false, "Output diff in the JSON format when encoding"); DEFINE_bool(stats, false, "Report compression percentage"); DEFINE_bool(target_matches, false, "Find duplicate strings in target data" " as well as dictionary data"); DEFINE_uint64(max_target_file_size, kDefaultMaxTargetSize, "Maximum target file size allowed by decoder"); DEFINE_uint64(max_target_window_size, kDefaultMaxTargetSize, "Maximum target window size allowed by decoder"); static const char* const kUsageString = " {encode | delta | decode | patch }[ <options> ]\n" "encode or delta: create delta file from dictionary and target file\n" "decode or patch: reconstruct target file from dictionary and delta file"; namespace open_vcdiff { class VCDiffFileBasedCoder { public: VCDiffFileBasedCoder(); ~VCDiffFileBasedCoder(); // Once the command-line arguments have been parsed, these functions // will use the supplied options to carry out a file-based encode // or decode operation. bool Encode(); bool Decode(); bool DecodeAndCompare(); // for "vcdiff test"; compare target with original private: // Determines the size of the file. The given file must be an input file // opened for reading only, not an input stream such as stdin. The function // returns true and populates file_size if successful; otherwise, it returns // false. static bool FileSize(FILE* file, size_t* file_size); // Opens a file for incremental reading. file_name is the name of the file // to be opened. file_type should be a descriptive name (like "target") for // use in log messages. If successful, returns true and sets *file to a // valid input file, *buffer to a region of memory allocated using malloc() // (so the caller must release it using free()), and buffer_size to the size // of the buffer, which will not be larger than the size of the file, and // will not be smaller than the --buffersize option. If the function fails, // it outputs a log message and returns false. bool OpenFileForReading(const string& file_name, const char* file_type, FILE** file, std::vector<char>* buffer); // Opens the dictionary file and reads it into a newly allocated buffer. // If successful, returns true and populates dictionary_ with the dictionary // contents; otherwise, returns false. bool OpenDictionary(); // Opens the input file (the delta or target file) for reading. // Allocates space for the input buffer. If successful, // input_file_ will be valid and input_buffer_ will be allocated. bool OpenInputFile() { return OpenFileForReading(input_file_name_, input_file_type_, &input_file_, &input_buffer_); } // Opens the output file (the target or delta file) for writing. // If successful, output_file_ will be valid. bool OpenOutputFile(); // Opens the output file (the target file) for comparison against the decoded // output when using "vcdiff test". bool OpenOutputFileForCompare() { return OpenFileForReading(output_file_name_, output_file_type_, &output_file_, &compare_buffer_); } // Reads as much input data as possible from the input file // into input_buffer_. If successful, returns true and sets *bytes_read // to the number of bytes read into input_buffer_. If an error occurs, // writes an error log message and returns false. bool ReadInput(size_t* bytes_read); // Writes the contents of output to output_file_. If successful, returns // true. If an error occurs, writes an error log message and returns false. bool WriteOutput(const string& output); // Reads a number of bytes from output_file_ equal to the size of output, // and compares to make sure they match the contents of output. If the bytes // do not match, or if end of file is reached before the expected number of // bytes have been read, or a read error occurs, the function returns false; // otherwise, returns true. bool CompareOutput(const string& output); // Dictionary contents. The entire dictionary file will be read into memory. std::vector<char> dictionary_; std::auto_ptr<open_vcdiff::HashedDictionary> hashed_dictionary_; // These should be set to either "delta" or "target". They are only // used in log messages such as "Error opening delta file..." const char* input_file_type_; const char* output_file_type_; // The filenames used for input and output. Will be empty if stdin // or stdout is being used. string input_file_name_; string output_file_name_; // stdio-style file handles for the input and output files and the dictionary. // When encoding, input_file_ is the target file and output_file_ is the delta // file; when decoding, the reverse is true. The dictionary is always read // from a file rather than from standard input. FILE* input_file_; FILE* output_file_; // A memory buffer used to load the input file into memory. If the input // comes from stdin because no input file was specified, then the size of // input_buffer_ will be the value specified by the --buffersize option. // If the input comes from a file, then the buffer will be allocated to match // the file size, if possible. However, the buffer will not exceed // --buffersize bytes in length. std::vector<char> input_buffer_; // A memory buffer used to load the output file into memory for comparison // if "vcdiff test" is specified. std::vector<char> compare_buffer_; // Making these private avoids implicit copy constructor & assignment operator VCDiffFileBasedCoder(const VCDiffFileBasedCoder&); // NOLINT void operator=(const VCDiffFileBasedCoder&); }; inline VCDiffFileBasedCoder::VCDiffFileBasedCoder() : input_file_type_(""), output_file_type_(""), input_file_(NULL), output_file_(NULL) { } VCDiffFileBasedCoder::~VCDiffFileBasedCoder() { if (input_file_ && (input_file_ != stdin)) { fclose(input_file_); input_file_ = NULL; } if (output_file_ && (output_file_ != stdout)) { fclose(output_file_); output_file_ = NULL; } } bool VCDiffFileBasedCoder::FileSize(FILE* file, size_t* file_size) { long initial_position = ftell(file); if (fseek(file, 0, SEEK_END) != 0) { return false; } *file_size = static_cast<size_t>(ftell(file)); if (fseek(file, initial_position, SEEK_SET) != 0) { return false; } return true; } bool VCDiffFileBasedCoder::OpenDictionary() { assert(dictionary_.empty()); assert(!FLAGS_dictionary.empty()); FILE* dictionary_file = fopen(FLAGS_dictionary.c_str(), "rb"); if (!dictionary_file) { std::cerr << "Error opening dictionary file '" << FLAGS_dictionary << "': " << strerror(errno) << std::endl; return false; } size_t dictionary_size = 0U; if (!FileSize(dictionary_file, &dictionary_size)) { std::cerr << "Error finding size of dictionary file '" << FLAGS_dictionary << "': " << strerror(errno) << std::endl; return false; } dictionary_.resize(dictionary_size); if (dictionary_size > 0) { if (fread(&dictionary_[0], 1, dictionary_size, dictionary_file) != dictionary_size) { std::cerr << "Unable to read dictionary file '" << FLAGS_dictionary << "': " << strerror(errno) << std::endl; fclose(dictionary_file); dictionary_.clear(); return false; } } fclose(dictionary_file); return true; } bool VCDiffFileBasedCoder::OpenFileForReading(const string& file_name, const char* file_type, FILE** file, std::vector<char>* buffer) { assert(buffer->empty()); size_t buffer_size = 0U; if (!*file && file_name.empty()) { #ifdef WIN32 _setmode(_fileno(stdin), _O_BINARY); #endif *file = stdin; buffer_size = static_cast<size_t>(FLAGS_buffersize); } else { if (!*file) { *file = fopen(file_name.c_str(), "rb"); if (!*file) { std::cerr << "Error opening " << file_type << " file '" << file_name << "': " << strerror(errno) << std::endl; return false; } } size_t file_size = 0U; if (!FileSize(*file, &file_size)) { std::cerr << "Error finding size of " << file_type << " file '" << file_name << "': " << strerror(errno) << std::endl; return false; } buffer_size = static_cast<size_t>(FLAGS_buffersize); if (file_size < buffer_size) { // Allocate just enough memory to store the entire file buffer_size = file_size; } } buffer->resize(buffer_size); return true; } // Opens the output file for streamed read operations using the // standard C I/O library, i.e., fopen(), fwrite(), fclose(). // No output buffer is allocated because the encoded/decoded output // is constructed progressively using a std::string object // whose buffer is resized as needed. bool VCDiffFileBasedCoder::OpenOutputFile() { if (output_file_name_.empty()) { #ifdef WIN32 _setmode(_fileno(stdout), _O_BINARY); #endif output_file_ = stdout; } else { output_file_ = fopen(output_file_name_.c_str(), "wb"); if (!output_file_) { std::cerr << "Error opening " << output_file_type_ << " file '" << output_file_name_ << "': " << strerror(errno) << std::endl; return false; } } return true; } bool VCDiffFileBasedCoder::ReadInput(size_t* bytes_read) { // Read from file or stdin *bytes_read = fread(&input_buffer_[0], 1, input_buffer_.size(), input_file_); if (ferror(input_file_)) { std::cerr << "Error reading from " << input_file_type_ << " file '" << input_file_name_ << "': " << strerror(errno) << std::endl; return false; } return true; } bool VCDiffFileBasedCoder::WriteOutput(const string& output) { if (!output.empty()) { // Some new output has been generated and is ready to be written // to the output file or to stdout. fwrite(output.data(), 1, output.size(), output_file_); if (ferror(output_file_)) { std::cerr << "Error writing " << output.size() << " bytes to " << output_file_type_ << " file '" << output_file_name_ << "': " << strerror(errno) << std::endl; return false; } } return true; } bool VCDiffFileBasedCoder::CompareOutput(const string& output) { if (!output.empty()) { size_t output_size = output.size(); // Some new output has been generated and is ready to be compared against // the output file. if (output_size > compare_buffer_.size()) { compare_buffer_.resize(output_size); } size_t bytes_read = fread(&compare_buffer_[0], 1, output_size, output_file_); if (ferror(output_file_)) { std::cerr << "Error reading from " << output_file_type_ << " file '" << output_file_name_ << "': " << strerror(errno) << std::endl; return false; } if (bytes_read < output_size) { std::cerr << "Decoded target is longer than original target file" << std::endl; return false; } if (output.compare(0, output_size, &compare_buffer_[0], bytes_read) != 0) { std::cerr << "Original target file does not match decoded target" << std::endl; return false; } } return true; } bool VCDiffFileBasedCoder::Encode() { input_file_type_ = "target"; input_file_name_ = FLAGS_target; output_file_type_ = "delta"; output_file_name_ = FLAGS_delta; if (!OpenDictionary() || !OpenInputFile() || !OpenOutputFile()) { return false; } // Issue 6: Visual Studio STL produces a runtime exception // if &dictionary_[0] is attempted for an empty dictionary. if (dictionary_.empty()) { hashed_dictionary_.reset(new open_vcdiff::HashedDictionary("", 0)); } else { hashed_dictionary_.reset( new open_vcdiff::HashedDictionary(&dictionary_[0], dictionary_.size())); } if (!hashed_dictionary_->Init()) { std::cerr << "Error initializing hashed dictionary" << std::endl; return false; } VCDiffFormatExtensionFlags format_flags = open_vcdiff::VCD_STANDARD_FORMAT; if (FLAGS_interleaved) { format_flags |= open_vcdiff::VCD_FORMAT_INTERLEAVED; } if (FLAGS_checksum) { format_flags |= open_vcdiff::VCD_FORMAT_CHECKSUM; } if (FLAGS_json) { format_flags |= open_vcdiff::VCD_FORMAT_JSON; } open_vcdiff::VCDiffStreamingEncoder encoder(hashed_dictionary_.get(), format_flags, FLAGS_target_matches); string output; size_t input_size = 0; size_t output_size = 0; { if (!encoder.StartEncoding(&output)) { std::cerr << "Error during encoder initialization" << std::endl; return false; } } do { size_t bytes_read = 0; if (!WriteOutput(output) || !ReadInput(&bytes_read)) { return false; } output_size += output.size(); output.clear(); if (bytes_read > 0) { input_size += bytes_read; if (!encoder.EncodeChunk(&input_buffer_[0], bytes_read, &output)) { std::cerr << "Error trying to encode data chunk of length " << bytes_read << std::endl; return false; } } } while (!feof(input_file_)); encoder.FinishEncoding(&output); if (!WriteOutput(output)) { return false; } output_size += output.size(); output.clear(); if (FLAGS_stats && (input_size > 0)) { std::cerr << "Original size: " << input_size << "\tCompressed size: " << output_size << " (" << ((static_cast<double>(output_size) / input_size) * 100) << "% of original)" << std::endl; } return true; } bool VCDiffFileBasedCoder::Decode() { input_file_type_ = "delta"; input_file_name_ = FLAGS_delta; output_file_type_ = "target"; output_file_name_ = FLAGS_target; if (!OpenDictionary() || !OpenInputFile() || !OpenOutputFile()) { return false; } open_vcdiff::VCDiffStreamingDecoder decoder; decoder.SetMaximumTargetFileSize( static_cast<size_t>(FLAGS_max_target_file_size)); decoder.SetMaximumTargetWindowSize( static_cast<size_t>(FLAGS_max_target_window_size)); decoder.SetAllowVcdTarget(FLAGS_allow_vcd_target); string output; size_t input_size = 0; size_t output_size = 0; // Issue 6: Visual Studio STL produces a runtime exception // if &dictionary_[0] is attempted for an empty dictionary. if (dictionary_.empty()) { decoder.StartDecoding("", 0); } else { decoder.StartDecoding(&dictionary_[0], dictionary_.size()); } do { size_t bytes_read = 0; if (!ReadInput(&bytes_read)) { return false; } if (bytes_read > 0) { input_size += bytes_read; if (!decoder.DecodeChunk(&input_buffer_[0], bytes_read, &output)) { std::cerr << "Error trying to decode data chunk of length " << bytes_read << std::endl; return false; } } if (!WriteOutput(output)) { return false; } output_size += output.size(); output.clear(); } while (!feof(input_file_)); if (!decoder.FinishDecoding()) { std::cerr << "Decode error; '" << FLAGS_delta << " may not be a valid VCDIFF delta file" << std::endl; return false; } if (!WriteOutput(output)) { return false; } output_size += output.size(); output.clear(); if (FLAGS_stats && (output_size > 0)) { std::cerr << "Decompressed size: " << output_size << "\tCompressed size: " << input_size << " (" << ((static_cast<double>(input_size) / output_size) * 100) << "% of original)" << std::endl; } return true; } bool VCDiffFileBasedCoder::DecodeAndCompare() { input_file_type_ = "delta"; input_file_name_ = FLAGS_delta; output_file_type_ = "target"; output_file_name_ = FLAGS_target; if (!OpenDictionary() || !OpenInputFile() || !OpenOutputFileForCompare()) { return false; } open_vcdiff::VCDiffStreamingDecoder decoder; decoder.SetMaximumTargetFileSize( static_cast<size_t>(FLAGS_max_target_file_size)); decoder.SetMaximumTargetWindowSize( static_cast<size_t>(FLAGS_max_target_window_size)); decoder.SetAllowVcdTarget(FLAGS_allow_vcd_target); string output; size_t input_size = 0; size_t output_size = 0; // Issue 6: Visual Studio STL produces a runtime exception // if &dictionary_[0] is attempted for an empty dictionary. if (dictionary_.empty()) { decoder.StartDecoding("", 0); } else { decoder.StartDecoding(&dictionary_[0], dictionary_.size()); } do { size_t bytes_read = 0; if (!ReadInput(&bytes_read)) { return false; } if (bytes_read > 0) { input_size += bytes_read; if (!decoder.DecodeChunk(&input_buffer_[0], bytes_read, &output)) { std::cerr << "Error trying to decode data chunk of length " << bytes_read << std::endl; return false; } } if (!CompareOutput(output)) { return false; } output_size += output.size(); output.clear(); } while (!feof(input_file_)); if (!decoder.FinishDecoding()) { std::cerr << "Decode error; '" << FLAGS_delta << " may not be a valid VCDIFF delta file" << std::endl; return false; } if (!CompareOutput(output)) { return false; } output_size += output.size(); output.clear(); if (fgetc(output_file_) != EOF) { std::cerr << "Decoded target is shorter than original target file" << std::endl; return false; } if (ferror(output_file_)) { std::cerr << "Error reading end-of-file indicator from target file" << std::endl; return false; } if (FLAGS_stats && (output_size > 0)) { std::cerr << "Decompressed size: " << output_size << "\tCompressed size: " << input_size << " (" << ((static_cast<double>(input_size) / output_size) * 100) << "% of original)" << std::endl; } return true; } } // namespace open_vcdiff int main(int argc, char** argv) { const char* const command_name = argv[0]; google::SetUsageMessage(kUsageString); google::ParseCommandLineFlags(&argc, &argv, true); if (argc != 2) { std::cerr << command_name << ": Must specify exactly one command option" << std::endl; ShowUsageWithFlagsRestrict(command_name, "vcdiff"); return 1; } const char* const command_option = argv[1]; if (FLAGS_dictionary.empty()) { std::cerr << command_name << " " << command_option << ": Must specify --dictionary <file-name>" << std::endl; ShowUsageWithFlagsRestrict(command_name, "vcdiff"); return 1; } if (!GetCommandLineFlagInfoOrDie("buffersize").is_default && (FLAGS_buffersize == 0)) { std::cerr << command_name << ": Option --buffersize cannot be 0" << std::endl; ShowUsageWithFlagsRestrict(command_name, "vcdiff"); return 1; } if ((strcmp(command_option, "encode") == 0) || (strcmp(command_option, "delta") == 0)) { open_vcdiff::VCDiffFileBasedCoder coder; if (!coder.Encode()) { return 1; } // The destructor for VCDiffFileBasedCoder will clean up the open files // and allocated memory. } else if ((strcmp(command_option, "decode") == 0) || (strcmp(command_option, "patch") == 0)) { open_vcdiff::VCDiffFileBasedCoder coder; if (!coder.Decode()) { return 1; } } else if ((strcmp(command_option, "test") == 0)) { // "vcdiff test" does not appear in the usage string, but can be // used for debugging. It encodes, then decodes, then compares the result // with the original target. It expects the same arguments as // "vcdiff encode", with the additional requirement that the --target // and --delta file arguments must be specified, rather than using stdin // or stdout. It produces a delta file just as for "vcdiff encode". if (FLAGS_target.empty() || FLAGS_delta.empty()) { std::cerr << command_name << " test: Must specify both --target <file-name>" " and --delta <file-name>" << std::endl; return 1; } const string original_target(FLAGS_target); // Put coder into a separate scope. { open_vcdiff::VCDiffFileBasedCoder coder; if (!coder.Encode()) { return 1; } } { open_vcdiff::VCDiffFileBasedCoder coder; if (!coder.DecodeAndCompare()) { return 1; } } } else { std::cerr << command_name << ": Unrecognized command option " << command_option << std::endl; ShowUsageWithFlagsRestrict(command_name, "vcdiff"); return 1; } return 0; }