// Copyright 2008 Google Inc.
// Author: Lincoln Smith
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// A command-line interface to the open-vcdiff library.
#include <config.h>
#include <assert.h>
#include <errno.h>
#ifdef WIN32
#include <fcntl.h>
#include <io.h>
#endif // WIN32
#include <stdio.h>
#include <string.h> // strerror
#include <iostream>
#include <memory>
#include <string>
#include <vector>
#include "gflags/gflags.h"
#include "google/vcdecoder.h"
#include "google/vcencoder.h"
#ifndef HAS_GLOBAL_STRING
using std::string;
#endif // !HAS_GLOBAL_STRING
using google::GetCommandLineFlagInfoOrDie;
using google::ShowUsageWithFlagsRestrict;
// Default upper bound used for both --max_target_file_size and
// --max_target_window_size.
static const size_t kDefaultMaxTargetSize = 1 << 26;  // 64 MB

// Definitions of command-line flags
DEFINE_string(dictionary, "",
              "File containing dictionary data (required)");
DEFINE_string(target, "",
              "Target file (default is stdin for encode, stdout for decode)");
DEFINE_string(delta, "",
              "Encoded delta file (default is stdout for encode, "
              "stdin for decode)");
// --buffersize is the maximum allowable size of a target window.
// It is also the upper bound on the size of the buffer used to read the
// input file.  This value may be increased if there is sufficient memory
// available.
DEFINE_uint64(buffersize, 1 << 20,  // 1 MB
              "Buffer size for reading input file");
DEFINE_bool(allow_vcd_target, true,
            "If false, the decoder issues an error when the VCD_TARGET flag "
            "is encountered");
DEFINE_bool(checksum, false,
            "Include an Adler32 checksum of the target data when encoding");
DEFINE_bool(interleaved, false, "Use interleaved format");
DEFINE_bool(json, false, "Output diff in the JSON format when encoding");
DEFINE_bool(stats, false, "Report compression percentage");
DEFINE_bool(target_matches, false, "Find duplicate strings in target data"
                                   " as well as dictionary data");
DEFINE_uint64(max_target_file_size, kDefaultMaxTargetSize,
              "Maximum target file size allowed by decoder");
DEFINE_uint64(max_target_window_size, kDefaultMaxTargetSize,
              "Maximum target window size allowed by decoder");

// Usage text registered with gflags in main().  The "test" command is
// deliberately not listed here; it is a debugging aid.
static const char* const kUsageString =
    " {encode | delta | decode | patch }[ <options> ]\n"
    "encode or delta: create delta file from dictionary and target file\n"
    "decode or patch: reconstruct target file from dictionary and delta file";
namespace open_vcdiff {
// Carries out a file-based VCDIFF encode, decode, or decode-and-compare
// operation.  File names and options are taken from the command-line flags
// (FLAGS_dictionary, FLAGS_target, FLAGS_delta, FLAGS_buffersize, ...), so
// the flags must have been parsed before any of the public methods is called.
class VCDiffFileBasedCoder {
public:
VCDiffFileBasedCoder();
// Closes the input and output files unless they are stdin / stdout.
~VCDiffFileBasedCoder();
// Once the command-line arguments have been parsed, these functions
// will use the supplied options to carry out a file-based encode
// or decode operation.  Each returns true on success; on failure an
// error message is written to std::cerr and false is returned.
bool Encode();
bool Decode();
bool DecodeAndCompare(); // for "vcdiff test"; compare target with original
private:
// Determines the size of the file. The given file must be an input file
// opened for reading only, not an input stream such as stdin. The function
// returns true and populates file_size if successful; otherwise, it returns
// false.  The file position in effect on entry is restored before a
// successful return.
static bool FileSize(FILE* file, size_t* file_size);
// Opens a file for incremental reading. file_name is the name of the file
// to be opened. file_type should be a descriptive name (like "target") for
// use in log messages.  If *file is NULL and file_name is empty, stdin is
// used; if *file is already non-NULL, that handle is used as is and no file
// is opened.  If successful, returns true, sets *file to a valid input file,
// and resizes *buffer (which must be empty on entry) to at most --buffersize
// bytes; a smaller buffer is used when the whole file fits in less.  If the
// function fails, it outputs a log message and returns false.
bool OpenFileForReading(const string& file_name,
const char* file_type,
FILE** file,
std::vector<char>* buffer);
// Opens the dictionary file and reads it into a newly allocated buffer.
// If successful, returns true and populates dictionary_ with the dictionary
// contents; otherwise, returns false.  The file named by --dictionary is
// closed again once its contents have been read.
bool OpenDictionary();
// Opens the input file (the delta or target file) for reading.
// Allocates space for the input buffer. If successful,
// input_file_ will be valid and input_buffer_ will be allocated.
bool OpenInputFile() {
return OpenFileForReading(input_file_name_,
input_file_type_,
&input_file_,
&input_buffer_);
}
// Opens the output file (the target or delta file) for writing.
// If successful, output_file_ will be valid.  stdout is used when
// output_file_name_ is empty.
bool OpenOutputFile();
// Opens the output file (the target file) for comparison against the decoded
// output when using "vcdiff test".  Note that in this mode output_file_ is
// opened for reading, not writing, and compare_buffer_ is sized by
// OpenFileForReading().
bool OpenOutputFileForCompare() {
return OpenFileForReading(output_file_name_,
output_file_type_,
&output_file_,
&compare_buffer_);
}
// Reads as much input data as possible from the input file
// into input_buffer_. If successful, returns true and sets *bytes_read
// to the number of bytes read into input_buffer_. If an error occurs,
// writes an error log message and returns false.  Reaching end of file is
// not an error; callers detect it with feof(input_file_).
bool ReadInput(size_t* bytes_read);
// Writes the contents of output to output_file_. If successful, returns
// true. If an error occurs, writes an error log message and returns false.
// An empty output string is a successful no-op.
bool WriteOutput(const string& output);
// Reads a number of bytes from output_file_ equal to the size of output,
// and compares to make sure they match the contents of output. If the bytes
// do not match, or if end of file is reached before the expected number of
// bytes have been read, or a read error occurs, the function returns false;
// otherwise, returns true.  compare_buffer_ is grown if it is smaller than
// output.
bool CompareOutput(const string& output);
// Dictionary contents. The entire dictionary file will be read into memory.
std::vector<char> dictionary_;
// Hashed form of dictionary_, created by Encode(); not used when decoding.
// NOTE(review): std::auto_ptr was removed from the standard in C++17;
// replacing it with std::unique_ptr would require a C++11 toolchain.
std::auto_ptr<open_vcdiff::HashedDictionary> hashed_dictionary_;
// These should be set to either "delta" or "target". They are only
// used in log messages such as "Error opening delta file..."
const char* input_file_type_;
const char* output_file_type_;
// The filenames used for input and output. Will be empty if stdin
// or stdout is being used.
string input_file_name_;
string output_file_name_;
// stdio-style file handles for the input and output files and the dictionary.
// When encoding, input_file_ is the target file and output_file_ is the delta
// file; when decoding, the reverse is true. The dictionary is always read
// from a file rather than from standard input.
FILE* input_file_;
FILE* output_file_;
// A memory buffer used to load the input file into memory. If the input
// comes from stdin because no input file was specified, then the size of
// input_buffer_ will be the value specified by the --buffersize option.
// If the input comes from a file, then the buffer will be allocated to match
// the file size, if possible. However, the buffer will not exceed
// --buffersize bytes in length.
std::vector<char> input_buffer_;
// A memory buffer used to load the output file into memory for comparison
// if "vcdiff test" is specified.
std::vector<char> compare_buffer_;
// Making these private avoids implicit copy constructor & assignment operator
// (they are declared but never defined, so the class is non-copyable).
VCDiffFileBasedCoder(const VCDiffFileBasedCoder&); // NOLINT
void operator=(const VCDiffFileBasedCoder&);
};
// Creates a coder that has no files open yet.  The file-type labels start
// out as empty strings and both file handles start out as NULL; the
// containers and strings are default-constructed (empty).
inline VCDiffFileBasedCoder::VCDiffFileBasedCoder()
    : input_file_type_(""),
      output_file_type_(""),
      input_file_(NULL),
      output_file_(NULL) {
}
// Releases the file handles held by this object.  The process-wide standard
// streams are never closed here: stdin is skipped on the input side and
// stdout on the output side.
VCDiffFileBasedCoder::~VCDiffFileBasedCoder() {
  const bool close_input = (input_file_ != NULL) && (input_file_ != stdin);
  if (close_input) {
    fclose(input_file_);
    input_file_ = NULL;
  }

  const bool close_output = (output_file_ != NULL) && (output_file_ != stdout);
  if (close_output) {
    fclose(output_file_);
    output_file_ = NULL;
  }
}
// Determines the size in bytes of an already-open, seekable input file.
//
// file:      stream opened for reading; must support fseek()/ftell()
//            (i.e. not a pipe or terminal such as stdin).
// file_size: receives the size of the file on success; left untouched on
//            failure.
//
// Returns true on success, in which case the stream is positioned where it
// was on entry.  Returns false if any ftell() or fseek() call fails; errno
// is left as set by the failing call so callers can report it.
bool VCDiffFileBasedCoder::FileSize(FILE* file, size_t* file_size) {
  // ftell() reports failure as -1L; that must not be mistaken for a position.
  const long initial_position = ftell(file);
  if (initial_position < 0) {
    return false;
  }
  if (fseek(file, 0, SEEK_END) != 0) {
    return false;
  }
  // Check the end offset before converting it: casting -1L to size_t would
  // otherwise report an enormous bogus file size as success.
  const long end_position = ftell(file);
  if (end_position < 0) {
    return false;
  }
  if (fseek(file, initial_position, SEEK_SET) != 0) {
    return false;
  }
  *file_size = static_cast<size_t>(end_position);
  return true;
}
// Reads the whole file named by --dictionary into dictionary_.
//
// Preconditions (asserted): dictionary_ is empty and --dictionary is set.
//
// Returns true on success; dictionary_ then holds the file contents (and is
// empty for a zero-length file).  On failure, writes a message to std::cerr
// and returns false.  The dictionary file handle is always closed before
// returning, whether or not the read succeeded.
bool VCDiffFileBasedCoder::OpenDictionary() {
  assert(dictionary_.empty());
  assert(!FLAGS_dictionary.empty());
  FILE* dictionary_file = fopen(FLAGS_dictionary.c_str(), "rb");
  if (!dictionary_file) {
    std::cerr << "Error opening dictionary file '" << FLAGS_dictionary
              << "': " << strerror(errno) << std::endl;
    return false;
  }

  bool success = true;
  size_t dictionary_size = 0U;
  if (!FileSize(dictionary_file, &dictionary_size)) {
    // Report before closing the file so that errno still describes the
    // failure rather than the outcome of fclose().
    std::cerr << "Error finding size of dictionary file '" << FLAGS_dictionary
              << "': " << strerror(errno) << std::endl;
    success = false;
  } else {
    dictionary_.resize(dictionary_size);
    // Skip the read for an empty dictionary: &dictionary_[0] must not be
    // evaluated on an empty vector.
    if ((dictionary_size > 0) &&
        (fread(&dictionary_[0], 1, dictionary_size, dictionary_file)
             != dictionary_size)) {
      std::cerr << "Unable to read dictionary file '" << FLAGS_dictionary
                << "': " << strerror(errno) << std::endl;
      dictionary_.clear();
      success = false;
    }
  }
  // Single exit point for the handle: it is released on every path.
  fclose(dictionary_file);
  return success;
}
// Prepares a file (or stdin) for incremental reading and sizes its buffer.
//
// file_name: path of the file to open; when empty and *file is NULL, stdin
//            is used (switched to binary mode on WIN32).
// file_type: descriptive label such as "target", used only in messages.
// file:      in/out.  If *file is already non-NULL it is used as is and no
//            file is opened; otherwise it receives the opened handle.
// buffer:    must be empty on entry (asserted).  On success it is resized
//            to --buffersize bytes for stdin, or to the smaller of
//            --buffersize and the file size for a regular file, but never
//            to fewer than one byte.
//
// Returns true on success.  On failure, writes a message to std::cerr and
// returns false; a handle already stored in *file is left for the owner to
// close.
bool VCDiffFileBasedCoder::OpenFileForReading(const string& file_name,
                                              const char* file_type,
                                              FILE** file,
                                              std::vector<char>* buffer) {
  assert(buffer->empty());
  size_t buffer_size = static_cast<size_t>(FLAGS_buffersize);
  if (!*file && file_name.empty()) {
#ifdef WIN32
    _setmode(_fileno(stdin), _O_BINARY);
#endif
    *file = stdin;
  } else {
    if (!*file) {
      *file = fopen(file_name.c_str(), "rb");
      if (!*file) {
        std::cerr << "Error opening " << file_type << " file '"
                  << file_name << "': " << strerror(errno) << std::endl;
        return false;
      }
    }
    size_t file_size = 0U;
    if (!FileSize(*file, &file_size)) {
      std::cerr << "Error finding size of " << file_type << " file '"
                << file_name << "': " << strerror(errno) << std::endl;
      return false;
    }
    if (file_size < buffer_size) {
      // Allocate just enough memory to store the entire file
      buffer_size = file_size;
    }
  }
  // Never hand back a zero-length buffer.  Readers take &buffer[0], which is
  // undefined for an empty vector, and fread() with a count of zero returns
  // without setting the end-of-file indicator, so a loop that runs until
  // feof() would never terminate on an empty file.
  if (buffer_size == 0) {
    buffer_size = 1;
  }
  buffer->resize(buffer_size);
  return true;
}
// Selects the destination for streamed output written with the standard C
// I/O library (fopen(), fwrite(), fclose()).  A non-empty output_file_name_
// is opened in binary write mode; otherwise stdout is used (switched to
// binary mode on WIN32).  No output buffer is allocated here because the
// encoded/decoded output is accumulated in a std::string that grows as
// needed.  Returns true on success; on failure logs to std::cerr and
// returns false.
bool VCDiffFileBasedCoder::OpenOutputFile() {
  if (!output_file_name_.empty()) {
    output_file_ = fopen(output_file_name_.c_str(), "wb");
    if (output_file_) {
      return true;
    }
    std::cerr << "Error opening " << output_file_type_ << " file '"
              << output_file_name_
              << "': " << strerror(errno) << std::endl;
    return false;
  }

  // No file name was given: write to standard output.
#ifdef WIN32
  _setmode(_fileno(stdout), _O_BINARY);
#endif
  output_file_ = stdout;
  return true;
}
// Fills input_buffer_ with the next chunk from input_file_ (a regular file
// or stdin).  *bytes_read receives the number of bytes actually read, which
// may be less than the buffer size.  Returns false, after logging to
// std::cerr, only if the stream's error indicator is set; callers test for
// end of file separately.
bool VCDiffFileBasedCoder::ReadInput(size_t* bytes_read) {
  char* const destination = &input_buffer_[0];
  const size_t capacity = input_buffer_.size();
  *bytes_read = fread(destination, 1, capacity, input_file_);
  if (!ferror(input_file_)) {
    return true;
  }
  std::cerr << "Error reading from " << input_file_type_ << " file '"
            << input_file_name_
            << "': " << strerror(errno) << std::endl;
  return false;
}
// Sends the bytes of output to output_file_ (a regular file or stdout).
// An empty string is a successful no-op.  Returns false, after logging to
// std::cerr, if the stream's error indicator is set after the write.
bool VCDiffFileBasedCoder::WriteOutput(const string& output) {
  if (output.empty()) {
    // Nothing new has been generated since the last call.
    return true;
  }
  fwrite(output.data(), 1, output.size(), output_file_);
  if (!ferror(output_file_)) {
    return true;
  }
  std::cerr << "Error writing " << output.size() << " bytes to "
            << output_file_type_ << " file '" << output_file_name_
            << "': " << strerror(errno) << std::endl;
  return false;
}
bool VCDiffFileBasedCoder::CompareOutput(const string& output) {
if (!output.empty()) {
size_t output_size = output.size();
// Some new output has been generated and is ready to be compared against
// the output file.
if (output_size > compare_buffer_.size()) {
compare_buffer_.resize(output_size);
}
size_t bytes_read = fread(&compare_buffer_[0],
1,
output_size,
output_file_);
if (ferror(output_file_)) {
std::cerr << "Error reading from " << output_file_type_ << " file '"
<< output_file_name_ << "': " << strerror(errno) << std::endl;
return false;
}
if (bytes_read < output_size) {
std::cerr << "Decoded target is longer than original target file"
<< std::endl;
return false;
}
if (output.compare(0, output_size, &compare_buffer_[0], bytes_read) != 0) {
std::cerr << "Original target file does not match decoded target"
<< std::endl;
return false;
}
}
return true;
}
// Encodes the target (--target, or stdin) against the dictionary
// (--dictionary) and writes the delta to --delta (or stdout).
//
// The format flags are assembled from --interleaved, --checksum and --json;
// --target_matches is passed through to the streaming encoder.  The target
// is read and encoded one buffer at a time, and the output produced for each
// chunk is written before the next chunk is read.
//
// Returns true on success.  On any failure (opening files, initializing the
// hashed dictionary, encoding, or I/O) a message is written to std::cerr and
// false is returned.  If --stats is set and at least one input byte was
// read, the size summary is printed to std::cerr.
bool VCDiffFileBasedCoder::Encode() {
  input_file_type_ = "target";
  input_file_name_ = FLAGS_target;
  output_file_type_ = "delta";
  output_file_name_ = FLAGS_delta;
  if (!OpenDictionary() || !OpenInputFile() || !OpenOutputFile()) {
    return false;
  }
  // Issue 6: Visual Studio STL produces a runtime exception
  // if &dictionary_[0] is attempted for an empty dictionary.
  if (dictionary_.empty()) {
    hashed_dictionary_.reset(new open_vcdiff::HashedDictionary("", 0));
  } else {
    hashed_dictionary_.reset(
        new open_vcdiff::HashedDictionary(&dictionary_[0],
                                          dictionary_.size()));
  }
  if (!hashed_dictionary_->Init()) {
    std::cerr << "Error initializing hashed dictionary" << std::endl;
    return false;
  }
  VCDiffFormatExtensionFlags format_flags = open_vcdiff::VCD_STANDARD_FORMAT;
  if (FLAGS_interleaved) {
    format_flags |= open_vcdiff::VCD_FORMAT_INTERLEAVED;
  }
  if (FLAGS_checksum) {
    format_flags |= open_vcdiff::VCD_FORMAT_CHECKSUM;
  }
  if (FLAGS_json) {
    format_flags |= open_vcdiff::VCD_FORMAT_JSON;
  }
  open_vcdiff::VCDiffStreamingEncoder encoder(hashed_dictionary_.get(),
                                              format_flags,
                                              FLAGS_target_matches);
  string output;
  size_t input_size = 0;
  size_t output_size = 0;
  if (!encoder.StartEncoding(&output)) {
    std::cerr << "Error during encoder initialization" << std::endl;
    return false;
  }
  do {
    size_t bytes_read = 0;
    // Flush whatever the previous step produced (the header on the first
    // pass) before reading more input.
    if (!WriteOutput(output) || !ReadInput(&bytes_read)) {
      return false;
    }
    output_size += output.size();
    output.clear();
    if (bytes_read > 0) {
      input_size += bytes_read;
      if (!encoder.EncodeChunk(&input_buffer_[0], bytes_read, &output)) {
        std::cerr << "Error trying to encode data chunk of length "
                  << bytes_read << std::endl;
        return false;
      }
    }
  } while (!feof(input_file_));
  // A failed finish must not be reported as a successful encode.
  if (!encoder.FinishEncoding(&output)) {
    std::cerr << "Error finishing encoding" << std::endl;
    return false;
  }
  if (!WriteOutput(output)) {
    return false;
  }
  output_size += output.size();
  output.clear();
  if (FLAGS_stats && (input_size > 0)) {
    std::cerr << "Original size: " << input_size
              << "\tCompressed size: " << output_size << " ("
              << ((static_cast<double>(output_size) / input_size) * 100)
              << "% of original)" << std::endl;
  }
  return true;
}
// Decodes the delta (--delta, or stdin) against the dictionary
// (--dictionary) and writes the reconstructed target to --target (or
// stdout).
//
// The decoder limits are taken from --max_target_file_size,
// --max_target_window_size and --allow_vcd_target.  The delta is read and
// decoded one buffer at a time, and decoded output is written after each
// chunk.
//
// Returns true on success.  On any failure (opening files, decoding, or
// I/O) a message is written to std::cerr and false is returned.  If --stats
// is set and at least one output byte was produced, the size summary is
// printed to std::cerr.
bool VCDiffFileBasedCoder::Decode() {
  input_file_type_ = "delta";
  input_file_name_ = FLAGS_delta;
  output_file_type_ = "target";
  output_file_name_ = FLAGS_target;
  if (!OpenDictionary() || !OpenInputFile() || !OpenOutputFile()) {
    return false;
  }
  open_vcdiff::VCDiffStreamingDecoder decoder;
  decoder.SetMaximumTargetFileSize(
      static_cast<size_t>(FLAGS_max_target_file_size));
  decoder.SetMaximumTargetWindowSize(
      static_cast<size_t>(FLAGS_max_target_window_size));
  decoder.SetAllowVcdTarget(FLAGS_allow_vcd_target);
  string output;
  size_t input_size = 0;
  size_t output_size = 0;
  // Issue 6: Visual Studio STL produces a runtime exception
  // if &dictionary_[0] is attempted for an empty dictionary.
  if (dictionary_.empty()) {
    decoder.StartDecoding("", 0);
  } else {
    decoder.StartDecoding(&dictionary_[0], dictionary_.size());
  }
  do {
    size_t bytes_read = 0;
    if (!ReadInput(&bytes_read)) {
      return false;
    }
    if (bytes_read > 0) {
      input_size += bytes_read;
      if (!decoder.DecodeChunk(&input_buffer_[0], bytes_read, &output)) {
        std::cerr << "Error trying to decode data chunk of length "
                  << bytes_read << std::endl;
        return false;
      }
    }
    if (!WriteOutput(output)) {
      return false;
    }
    output_size += output.size();
    output.clear();
  } while (!feof(input_file_));
  if (!decoder.FinishDecoding()) {
    // The file name is quoted on both sides in the message.
    std::cerr << "Decode error; '" << FLAGS_delta
              << "' may not be a valid VCDIFF delta file" << std::endl;
    return false;
  }
  if (!WriteOutput(output)) {
    return false;
  }
  output_size += output.size();
  output.clear();
  if (FLAGS_stats && (output_size > 0)) {
    std::cerr << "Decompressed size: " << output_size
              << "\tCompressed size: " << input_size << " ("
              << ((static_cast<double>(input_size) / output_size) * 100)
              << "% of original)" << std::endl;
  }
  return true;
}
// Implements the second half of "vcdiff test": decodes the delta (--delta)
// against the dictionary (--dictionary) and, instead of writing the result,
// compares it with the existing target file (--target), which is opened for
// reading.
//
// The decoder limits are taken from --max_target_file_size,
// --max_target_window_size and --allow_vcd_target.  After decoding
// finishes, the target file must be at end of file; leftover bytes mean the
// decoded output was shorter than the original.
//
// Returns true if the decoded data matches the target file exactly.  On a
// mismatch, a length difference, a decode error, or an I/O error, a message
// is written to std::cerr and false is returned.  If --stats is set and at
// least one output byte was produced, the size summary is printed to
// std::cerr.
bool VCDiffFileBasedCoder::DecodeAndCompare() {
  input_file_type_ = "delta";
  input_file_name_ = FLAGS_delta;
  output_file_type_ = "target";
  output_file_name_ = FLAGS_target;
  if (!OpenDictionary() || !OpenInputFile() || !OpenOutputFileForCompare()) {
    return false;
  }
  open_vcdiff::VCDiffStreamingDecoder decoder;
  decoder.SetMaximumTargetFileSize(
      static_cast<size_t>(FLAGS_max_target_file_size));
  decoder.SetMaximumTargetWindowSize(
      static_cast<size_t>(FLAGS_max_target_window_size));
  decoder.SetAllowVcdTarget(FLAGS_allow_vcd_target);
  string output;
  size_t input_size = 0;
  size_t output_size = 0;
  // Issue 6: Visual Studio STL produces a runtime exception
  // if &dictionary_[0] is attempted for an empty dictionary.
  if (dictionary_.empty()) {
    decoder.StartDecoding("", 0);
  } else {
    decoder.StartDecoding(&dictionary_[0], dictionary_.size());
  }
  do {
    size_t bytes_read = 0;
    if (!ReadInput(&bytes_read)) {
      return false;
    }
    if (bytes_read > 0) {
      input_size += bytes_read;
      if (!decoder.DecodeChunk(&input_buffer_[0], bytes_read, &output)) {
        std::cerr << "Error trying to decode data chunk of length "
                  << bytes_read << std::endl;
        return false;
      }
    }
    if (!CompareOutput(output)) {
      return false;
    }
    output_size += output.size();
    output.clear();
  } while (!feof(input_file_));
  if (!decoder.FinishDecoding()) {
    // The file name is quoted on both sides in the message.
    std::cerr << "Decode error; '" << FLAGS_delta
              << "' may not be a valid VCDIFF delta file" << std::endl;
    return false;
  }
  if (!CompareOutput(output)) {
    return false;
  }
  output_size += output.size();
  output.clear();
  // Any byte still available in the original target was not produced by
  // the decoder.
  if (fgetc(output_file_) != EOF) {
    std::cerr << "Decoded target is shorter than original target file"
              << std::endl;
    return false;
  }
  // fgetc() also returns EOF on a read error, so distinguish that case.
  if (ferror(output_file_)) {
    std::cerr << "Error reading end-of-file indicator from target file"
              << std::endl;
    return false;
  }
  if (FLAGS_stats && (output_size > 0)) {
    std::cerr << "Decompressed size: " << output_size
              << "\tCompressed size: " << input_size << " ("
              << ((static_cast<double>(input_size) / output_size) * 100)
              << "% of original)" << std::endl;
  }
  return true;
}
} // namespace open_vcdiff
// Entry point for the vcdiff command-line tool.
//
// Parses the gflags options, then expects exactly one remaining argument
// naming the command:
//   encode | delta : create a delta file from the dictionary and target
//   decode | patch : reconstruct the target from the dictionary and delta
//   test           : encode, then decode and compare with the target
//
// Returns 0 on success and 1 on a usage error or if the operation fails.
int main(int argc, char** argv) {
  const char* const command_name = argv[0];
  google::SetUsageMessage(kUsageString);
  google::ParseCommandLineFlags(&argc, &argv, true);
  if (argc != 2) {
    std::cerr << command_name << ": Must specify exactly one command option"
              << std::endl;
    ShowUsageWithFlagsRestrict(command_name, "vcdiff");
    return 1;
  }
  const char* const command_option = argv[1];
  if (FLAGS_dictionary.empty()) {
    std::cerr << command_name << " " << command_option
              << ": Must specify --dictionary <file-name>" << std::endl;
    ShowUsageWithFlagsRestrict(command_name, "vcdiff");
    return 1;
  }
  if (!GetCommandLineFlagInfoOrDie("buffersize").is_default &&
      (FLAGS_buffersize == 0)) {
    std::cerr << command_name << ": Option --buffersize cannot be 0"
              << std::endl;
    ShowUsageWithFlagsRestrict(command_name, "vcdiff");
    return 1;
  }
  if ((strcmp(command_option, "encode") == 0) ||
      (strcmp(command_option, "delta") == 0)) {
    open_vcdiff::VCDiffFileBasedCoder coder;
    if (!coder.Encode()) {
      return 1;
    }
    // The destructor for VCDiffFileBasedCoder will clean up the open files
    // and allocated memory.
  } else if ((strcmp(command_option, "decode") == 0) ||
             (strcmp(command_option, "patch") == 0)) {
    open_vcdiff::VCDiffFileBasedCoder coder;
    if (!coder.Decode()) {
      return 1;
    }
  } else if ((strcmp(command_option, "test") == 0)) {
    // "vcdiff test" does not appear in the usage string, but can be
    // used for debugging.  It encodes, then decodes, then compares the result
    // with the original target.  It expects the same arguments as
    // "vcdiff encode", with the additional requirement that the --target
    // and --delta file arguments must be specified, rather than using stdin
    // or stdout.  It produces a delta file just as for "vcdiff encode".
    if (FLAGS_target.empty() || FLAGS_delta.empty()) {
      std::cerr << command_name
                << " test: Must specify both --target <file-name>"
                   " and --delta <file-name>" << std::endl;
      return 1;
    }
    // Each coder lives in its own scope so that the encoder's destructor
    // closes the delta file before the decoder reopens it for reading.
    {
      open_vcdiff::VCDiffFileBasedCoder coder;
      if (!coder.Encode()) {
        return 1;
      }
    }
    {
      open_vcdiff::VCDiffFileBasedCoder coder;
      if (!coder.DecodeAndCompare()) {
        return 1;
      }
    }
  } else {
    std::cerr << command_name << ": Unrecognized command option "
              << command_option << std::endl;
    ShowUsageWithFlagsRestrict(command_name, "vcdiff");
    return 1;
  }
  return 0;
}