普通文本  |  657行  |  22.82 KB

// Copyright 2008 Google Inc.
// Author: Lincoln Smith
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// A command-line interface to the open-vcdiff library.

#include <config.h>
#include <assert.h>
#include <errno.h>
#ifdef WIN32
#include <fcntl.h>
#include <io.h>
#endif  // WIN32
#include <stdio.h>
#include <string.h>  // strerror
#include <iostream>
#include <memory>
#include <string>
#include <vector>
#include "gflags/gflags.h"
#include "google/vcdecoder.h"
#include "google/vcencoder.h"

#ifndef HAS_GLOBAL_STRING
using std::string;
#endif  // !HAS_GLOBAL_STRING
using google::GetCommandLineFlagInfoOrDie;
using google::ShowUsageWithFlagsRestrict;

static const size_t kDefaultMaxTargetSize = 1 << 26;      // 64 MB

// Definitions of command-line flags
DEFINE_string(dictionary, "",
              "File containing dictionary data (required)");
DEFINE_string(target, "",
              "Target file (default is stdin for encode, stdout for decode");
DEFINE_string(delta, "",
              "Encoded delta file (default is stdout for encode, "
              "stdin for decode");
// --buffersize is the maximum allowable size of a target window.
// This value may be increased if there is sufficient memory available.
DEFINE_uint64(buffersize, 1 << 20,  // 1 MB
              "Buffer size for reading input file");
DEFINE_bool(allow_vcd_target, true,
            "If false, the decoder issues an error when the VCD_TARGET flag "
            "is encountered");
DEFINE_bool(checksum, false,
            "Include an Adler32 checksum of the target data when encoding");
DEFINE_bool(interleaved, false, "Use interleaved format");
DEFINE_bool(json, false, "Output diff in the JSON format when encoding");
DEFINE_bool(stats, false, "Report compression percentage");
DEFINE_bool(target_matches, false, "Find duplicate strings in target data"
                                   " as well as dictionary data");
DEFINE_uint64(max_target_file_size, kDefaultMaxTargetSize,
              "Maximum target file size allowed by decoder");
DEFINE_uint64(max_target_window_size, kDefaultMaxTargetSize,
              "Maximum target window size allowed by decoder");

static const char* const kUsageString =
    " {encode | delta | decode | patch }[ <options> ]\n"
    "encode or delta: create delta file from dictionary and target file\n"
    "decode or patch: reconstruct target file from dictionary and delta file";

namespace open_vcdiff {

class VCDiffFileBasedCoder {
 public:
  VCDiffFileBasedCoder();
  ~VCDiffFileBasedCoder();

  // Once the command-line arguments have been parsed, these functions
  // will use the supplied options to carry out a file-based encode
  // or decode operation.
  bool Encode();
  bool Decode();
  bool DecodeAndCompare();  // for "vcdiff test"; compare target with original

 private:
  // Determines the size of the file.  The given file must be an input file
  // opened for reading only, not an input stream such as stdin.  The function
  // returns true and populates file_size if successful; otherwise, it returns
  // false.
  static bool FileSize(FILE* file, size_t* file_size);

  // Opens a file for incremental reading.  file_name is the name of the file
  // to be opened.  file_type should be a descriptive name (like "target") for
  // use in log messages.  If successful, returns true and sets *file to a
  // valid input file, *buffer to a region of memory allocated using malloc()
  // (so the caller must release it using free()), and buffer_size to the size
  // of the buffer, which will not be larger than the size of the file, and
  // will not be smaller than the --buffersize option.  If the function fails,
  // it outputs a log message and returns false.
  bool OpenFileForReading(const string& file_name,
                          const char* file_type,
                          FILE** file,
                          std::vector<char>* buffer);

  // Opens the dictionary file and reads it into a newly allocated buffer.
  // If successful, returns true and populates dictionary_ with the dictionary
  // contents; otherwise, returns false.
  bool OpenDictionary();

  // Opens the input file (the delta or target file) for reading.
  // Allocates space for the input buffer.  If successful,
  // input_file_ will be valid and input_buffer_ will be allocated.
  bool OpenInputFile() {
    return OpenFileForReading(input_file_name_,
                              input_file_type_,
                              &input_file_,
                              &input_buffer_);
  }

  // Opens the output file (the target or delta file) for writing.
  // If successful, output_file_ will be valid.
  bool OpenOutputFile();

  // Opens the output file (the target file) for comparison against the decoded
  // output when using "vcdiff test".
  bool OpenOutputFileForCompare() {
    return OpenFileForReading(output_file_name_,
                              output_file_type_,
                              &output_file_,
                              &compare_buffer_);
  }

  // Reads as much input data as possible from the input file
  // into input_buffer_.  If successful, returns true and sets *bytes_read
  // to the number of bytes read into input_buffer_.  If an error occurs,
  // writes an error log message and returns false.
  bool ReadInput(size_t* bytes_read);

  // Writes the contents of output to output_file_.  If successful, returns
  // true.  If an error occurs, writes an error log message and returns false.
  bool WriteOutput(const string& output);

  // Reads a number of bytes from output_file_ equal to the size of output,
  // and compares to make sure they match the contents of output.  If the bytes
  // do not match, or if end of file is reached before the expected number of
  // bytes have been read, or a read error occurs, the function returns false;
  // otherwise, returns true.
  bool CompareOutput(const string& output);

  // Dictionary contents.  The entire dictionary file will be read into memory.
  std::vector<char> dictionary_;

  std::auto_ptr<open_vcdiff::HashedDictionary> hashed_dictionary_;

  // These should be set to either "delta" or "target".  They are only
  // used in log messages such as "Error opening delta file..."
  const char* input_file_type_;
  const char* output_file_type_;

  // The filenames used for input and output.  Will be empty if stdin
  // or stdout is being used.
  string input_file_name_;
  string output_file_name_;

  // stdio-style file handles for the input and output files and the dictionary.
  // When encoding, input_file_ is the target file and output_file_ is the delta
  // file; when decoding, the reverse is true.  The dictionary is always read
  // from a file rather than from standard input.
  FILE* input_file_;
  FILE* output_file_;

  // A memory buffer used to load the input file into memory.  If the input
  // comes from stdin because no input file was specified, then the size of
  // input_buffer_ will be the value specified by the --buffersize option.
  // If the input comes from a file, then the buffer will be allocated to match
  // the file size, if possible.  However, the buffer will not exceed
  // --buffersize bytes in length.
  std::vector<char> input_buffer_;

  // A memory buffer used to load the output file into memory for comparison
  // if "vcdiff test" is specified.
  std::vector<char> compare_buffer_;

  // Making these private avoids implicit copy constructor & assignment operator
  VCDiffFileBasedCoder(const VCDiffFileBasedCoder&);  // NOLINT
  void operator=(const VCDiffFileBasedCoder&);
};

inline VCDiffFileBasedCoder::VCDiffFileBasedCoder()
    : input_file_type_(""),
      output_file_type_(""),
      input_file_(NULL),
      output_file_(NULL) { }

VCDiffFileBasedCoder::~VCDiffFileBasedCoder() {
  if (input_file_ && (input_file_ != stdin)) {
    fclose(input_file_);
    input_file_ = NULL;
  }
  if (output_file_ && (output_file_ != stdout)) {
    fclose(output_file_);
    output_file_ = NULL;
  }
}

bool VCDiffFileBasedCoder::FileSize(FILE* file, size_t* file_size) {
  long initial_position = ftell(file);
  if (fseek(file, 0, SEEK_END) != 0) {
    return false;
  }
  *file_size = static_cast<size_t>(ftell(file));
  if (fseek(file, initial_position, SEEK_SET) != 0) {
    return false;
  }
  return true;
}

bool VCDiffFileBasedCoder::OpenDictionary() {
  assert(dictionary_.empty());
  assert(!FLAGS_dictionary.empty());
  FILE* dictionary_file = fopen(FLAGS_dictionary.c_str(), "rb");
  if (!dictionary_file) {
    std::cerr << "Error opening dictionary file '" << FLAGS_dictionary
              << "': " << strerror(errno) << std::endl;
    return false;
  }
  size_t dictionary_size = 0U;
  if (!FileSize(dictionary_file, &dictionary_size)) {
    std::cerr << "Error finding size of dictionary file '" << FLAGS_dictionary
              << "': " << strerror(errno) << std::endl;
    return false;
  }
  dictionary_.resize(dictionary_size);
  if (dictionary_size > 0) {
    if (fread(&dictionary_[0], 1, dictionary_size, dictionary_file)
            != dictionary_size) {
      std::cerr << "Unable to read dictionary file '" << FLAGS_dictionary
                << "': " << strerror(errno) << std::endl;
      fclose(dictionary_file);
      dictionary_.clear();
      return false;
    }
  }
  fclose(dictionary_file);
  return true;
}

bool VCDiffFileBasedCoder::OpenFileForReading(const string& file_name,
                                              const char* file_type,
                                              FILE** file,
                                              std::vector<char>* buffer) {
  assert(buffer->empty());
  size_t buffer_size = 0U;
  if (!*file && file_name.empty()) {
#ifdef WIN32
    _setmode(_fileno(stdin), _O_BINARY);
#endif
    *file = stdin;
    buffer_size = static_cast<size_t>(FLAGS_buffersize);
  } else {
    if (!*file) {
      *file = fopen(file_name.c_str(), "rb");
      if (!*file) {
        std::cerr << "Error opening " << file_type << " file '"
                  << file_name << "': " << strerror(errno) << std::endl;
        return false;
      }
    }
    size_t file_size = 0U;
    if (!FileSize(*file, &file_size)) {
      std::cerr << "Error finding size of " << file_type << " file '"
                << file_name << "': " << strerror(errno) << std::endl;
      return false;
    }
    buffer_size = static_cast<size_t>(FLAGS_buffersize);
    if (file_size < buffer_size) {
      // Allocate just enough memory to store the entire file
      buffer_size = file_size;
    }
  }
  buffer->resize(buffer_size);
  return true;
}

// Opens the output file for streamed read operations using the
// standard C I/O library, i.e., fopen(), fwrite(), fclose().
// No output buffer is allocated because the encoded/decoded output
// is constructed progressively using a std::string object
// whose buffer is resized as needed.
bool VCDiffFileBasedCoder::OpenOutputFile() {
  if (output_file_name_.empty()) {
#ifdef WIN32
    _setmode(_fileno(stdout), _O_BINARY);
#endif
    output_file_ = stdout;
  } else {
    output_file_ = fopen(output_file_name_.c_str(), "wb");
    if (!output_file_) {
      std::cerr << "Error opening " << output_file_type_ << " file '"
                << output_file_name_
                << "': " << strerror(errno) << std::endl;
      return false;
    }
  }
  return true;
}

bool VCDiffFileBasedCoder::ReadInput(size_t* bytes_read) {
  // Read from file or stdin
  *bytes_read = fread(&input_buffer_[0], 1, input_buffer_.size(), input_file_);
  if (ferror(input_file_)) {
    std::cerr << "Error reading from " << input_file_type_ << " file '"
              << input_file_name_
              << "': " << strerror(errno) << std::endl;
    return false;
  }
  return true;
}

bool VCDiffFileBasedCoder::WriteOutput(const string& output) {
  if (!output.empty()) {
    // Some new output has been generated and is ready to be written
    // to the output file or to stdout.
    fwrite(output.data(), 1, output.size(), output_file_);
    if (ferror(output_file_)) {
      std::cerr << "Error writing " << output.size() << " bytes to "
                << output_file_type_ << " file '" << output_file_name_
                << "': " << strerror(errno) << std::endl;
      return false;
    }
  }
  return true;
}

bool VCDiffFileBasedCoder::CompareOutput(const string& output) {
  if (!output.empty()) {
    size_t output_size = output.size();
    // Some new output has been generated and is ready to be compared against
    // the output file.
    if (output_size > compare_buffer_.size()) {
      compare_buffer_.resize(output_size);
    }
    size_t bytes_read = fread(&compare_buffer_[0],
                              1,
                              output_size,
                              output_file_);
    if (ferror(output_file_)) {
      std::cerr << "Error reading from " << output_file_type_ << " file '"
                << output_file_name_ << "': " << strerror(errno) << std::endl;
      return false;
    }
    if (bytes_read < output_size) {
      std::cerr << "Decoded target is longer than original target file"
                << std::endl;
      return false;
    }
    if (output.compare(0, output_size, &compare_buffer_[0], bytes_read) != 0) {
      std::cerr << "Original target file does not match decoded target"
                << std::endl;
      return false;
    }
  }
  return true;
}

bool VCDiffFileBasedCoder::Encode() {
  input_file_type_ = "target";
  input_file_name_ = FLAGS_target;
  output_file_type_ = "delta";
  output_file_name_ = FLAGS_delta;
  if (!OpenDictionary() || !OpenInputFile() || !OpenOutputFile()) {
    return false;
  }
  // Issue 6: Visual Studio STL produces a runtime exception
  // if &dictionary_[0] is attempted for an empty dictionary.
  if (dictionary_.empty()) {
    hashed_dictionary_.reset(new open_vcdiff::HashedDictionary("", 0));
  } else {
    hashed_dictionary_.reset(
        new open_vcdiff::HashedDictionary(&dictionary_[0],
                                          dictionary_.size()));
  }
  if (!hashed_dictionary_->Init()) {
    std::cerr << "Error initializing hashed dictionary" << std::endl;
    return false;
  }
  VCDiffFormatExtensionFlags format_flags = open_vcdiff::VCD_STANDARD_FORMAT;
  if (FLAGS_interleaved) {
    format_flags |= open_vcdiff::VCD_FORMAT_INTERLEAVED;
  }
  if (FLAGS_checksum) {
    format_flags |= open_vcdiff::VCD_FORMAT_CHECKSUM;
  }
  if (FLAGS_json) {
    format_flags |= open_vcdiff::VCD_FORMAT_JSON;
  }
  open_vcdiff::VCDiffStreamingEncoder encoder(hashed_dictionary_.get(),
                                              format_flags,
                                              FLAGS_target_matches);
  string output;
  size_t input_size = 0;
  size_t output_size = 0;
  {
    if (!encoder.StartEncoding(&output)) {
      std::cerr << "Error during encoder initialization" << std::endl;
      return false;
    }
  }
  do {
    size_t bytes_read = 0;
    if (!WriteOutput(output) || !ReadInput(&bytes_read)) {
      return false;
    }
    output_size += output.size();
    output.clear();
    if (bytes_read > 0) {
      input_size += bytes_read;
      if (!encoder.EncodeChunk(&input_buffer_[0], bytes_read, &output)) {
        std::cerr << "Error trying to encode data chunk of length "
                  << bytes_read << std::endl;
        return false;
      }
    }
  } while (!feof(input_file_));
  encoder.FinishEncoding(&output);
  if (!WriteOutput(output)) {
    return false;
  }
  output_size += output.size();
  output.clear();
  if (FLAGS_stats && (input_size > 0)) {
    std::cerr << "Original size: " << input_size
              << "\tCompressed size: " << output_size << " ("
              << ((static_cast<double>(output_size) / input_size) * 100)
              << "% of original)" << std::endl;
  }
  return true;
}

bool VCDiffFileBasedCoder::Decode() {
  input_file_type_ = "delta";
  input_file_name_ = FLAGS_delta;
  output_file_type_ = "target";
  output_file_name_ = FLAGS_target;
  if (!OpenDictionary() || !OpenInputFile() || !OpenOutputFile()) {
    return false;
  }

  open_vcdiff::VCDiffStreamingDecoder decoder;
  decoder.SetMaximumTargetFileSize(
      static_cast<size_t>(FLAGS_max_target_file_size));
  decoder.SetMaximumTargetWindowSize(
      static_cast<size_t>(FLAGS_max_target_window_size));
  decoder.SetAllowVcdTarget(FLAGS_allow_vcd_target);
  string output;
  size_t input_size = 0;
  size_t output_size = 0;
  // Issue 6: Visual Studio STL produces a runtime exception
  // if &dictionary_[0] is attempted for an empty dictionary.
  if (dictionary_.empty()) {
    decoder.StartDecoding("", 0);
  } else {
    decoder.StartDecoding(&dictionary_[0], dictionary_.size());
  }

  do {
    size_t bytes_read = 0;
    if (!ReadInput(&bytes_read)) {
      return false;
    }
    if (bytes_read > 0) {
      input_size += bytes_read;
      if (!decoder.DecodeChunk(&input_buffer_[0], bytes_read, &output)) {
        std::cerr << "Error trying to decode data chunk of length "
                  << bytes_read << std::endl;
        return false;
      }
    }
    if (!WriteOutput(output)) {
      return false;
    }
    output_size += output.size();
    output.clear();
  } while (!feof(input_file_));
  if (!decoder.FinishDecoding()) {
    std::cerr << "Decode error; '" << FLAGS_delta
              << " may not be a valid VCDIFF delta file" << std::endl;
    return false;
  }
  if (!WriteOutput(output)) {
    return false;
  }
  output_size += output.size();
  output.clear();
  if (FLAGS_stats && (output_size > 0)) {
    std::cerr << "Decompressed size: " << output_size
              << "\tCompressed size: " << input_size << " ("
              << ((static_cast<double>(input_size) / output_size) * 100)
              << "% of original)" << std::endl;
  }
  return true;
}

bool VCDiffFileBasedCoder::DecodeAndCompare() {
  input_file_type_ = "delta";
  input_file_name_ = FLAGS_delta;
  output_file_type_ = "target";
  output_file_name_ = FLAGS_target;
  if (!OpenDictionary() || !OpenInputFile() || !OpenOutputFileForCompare()) {
    return false;
  }

  open_vcdiff::VCDiffStreamingDecoder decoder;
  decoder.SetMaximumTargetFileSize(
      static_cast<size_t>(FLAGS_max_target_file_size));
  decoder.SetMaximumTargetWindowSize(
      static_cast<size_t>(FLAGS_max_target_window_size));
  decoder.SetAllowVcdTarget(FLAGS_allow_vcd_target);
  string output;
  size_t input_size = 0;
  size_t output_size = 0;
  // Issue 6: Visual Studio STL produces a runtime exception
  // if &dictionary_[0] is attempted for an empty dictionary.
  if (dictionary_.empty()) {
    decoder.StartDecoding("", 0);
  } else {
    decoder.StartDecoding(&dictionary_[0], dictionary_.size());
  }

  do {
    size_t bytes_read = 0;
    if (!ReadInput(&bytes_read)) {
      return false;
    }
    if (bytes_read > 0) {
      input_size += bytes_read;
      if (!decoder.DecodeChunk(&input_buffer_[0], bytes_read, &output)) {
        std::cerr << "Error trying to decode data chunk of length "
                  << bytes_read << std::endl;
        return false;
      }
    }
    if (!CompareOutput(output)) {
      return false;
    }
    output_size += output.size();
    output.clear();
  } while (!feof(input_file_));
  if (!decoder.FinishDecoding()) {
    std::cerr << "Decode error; '" << FLAGS_delta
              << " may not be a valid VCDIFF delta file" << std::endl;
    return false;
  }
  if (!CompareOutput(output)) {
    return false;
  }
  output_size += output.size();
  output.clear();
  if (fgetc(output_file_) != EOF) {
    std::cerr << "Decoded target is shorter than original target file"
              << std::endl;
    return false;
  }
  if (ferror(output_file_)) {
    std::cerr << "Error reading end-of-file indicator from target file"
              << std::endl;
    return false;
  }
  if (FLAGS_stats && (output_size > 0)) {
    std::cerr << "Decompressed size: " << output_size
              << "\tCompressed size: " << input_size << " ("
              << ((static_cast<double>(input_size) / output_size) * 100)
              << "% of original)" << std::endl;
  }
  return true;
}

}  // namespace open_vcdiff

int main(int argc, char** argv) {
  const char* const command_name = argv[0];
  google::SetUsageMessage(kUsageString);
  google::ParseCommandLineFlags(&argc, &argv, true);
  if (argc != 2) {
    std::cerr << command_name << ": Must specify exactly one command option"
              << std::endl;
    ShowUsageWithFlagsRestrict(command_name, "vcdiff");
    return 1;
  }
  const char* const command_option = argv[1];
  if (FLAGS_dictionary.empty()) {
    std::cerr << command_name << " " << command_option
              << ": Must specify --dictionary <file-name>" << std::endl;
    ShowUsageWithFlagsRestrict(command_name, "vcdiff");
    return 1;
  }
  if (!GetCommandLineFlagInfoOrDie("buffersize").is_default &&
       (FLAGS_buffersize == 0)) {
    std::cerr << command_name << ": Option --buffersize cannot be 0"
              << std::endl;
    ShowUsageWithFlagsRestrict(command_name, "vcdiff");
    return 1;
  }
  if ((strcmp(command_option, "encode") == 0) ||
      (strcmp(command_option, "delta") == 0)) {
    open_vcdiff::VCDiffFileBasedCoder coder;
    if (!coder.Encode()) {
      return 1;
    }
    // The destructor for VCDiffFileBasedCoder will clean up the open files
    // and allocated memory.
  } else if ((strcmp(command_option, "decode") == 0) ||
             (strcmp(command_option, "patch") == 0)) {
    open_vcdiff::VCDiffFileBasedCoder coder;
    if (!coder.Decode()) {
      return 1;
    }
  } else if ((strcmp(command_option, "test") == 0)) {
    // "vcdiff test" does not appear in the usage string, but can be
    // used for debugging.  It encodes, then decodes, then compares the result
    // with the original target. It expects the same arguments as
    // "vcdiff encode", with the additional requirement that the --target
    // and --delta file arguments must be specified, rather than using stdin
    // or stdout.  It produces a delta file just as for "vcdiff encode".
    if (FLAGS_target.empty() || FLAGS_delta.empty()) {
      std::cerr << command_name
                << " test: Must specify both --target <file-name>"
                   " and --delta <file-name>" << std::endl;
      return 1;
    }
    const string original_target(FLAGS_target);
    // Put coder into a separate scope.
    {
      open_vcdiff::VCDiffFileBasedCoder coder;
      if (!coder.Encode()) {
        return 1;
      }
    }
    {
      open_vcdiff::VCDiffFileBasedCoder coder;
      if (!coder.DecodeAndCompare()) {
        return 1;
      }
    }
  } else {
    std::cerr << command_name << ": Unrecognized command option "
              << command_option << std::endl;
    ShowUsageWithFlagsRestrict(command_name, "vcdiff");
    return 1;
  }
  return 0;
}