// C++ source  |  241 lines  |  9.76 KB

// Copyright 2008 Google Inc.
// Author: Lincoln Smith
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef OPEN_VCDIFF_ENCODETABLE_H_
#define OPEN_VCDIFF_ENCODETABLE_H_

#include <config.h>
#include <stddef.h>  // size_t
#include <stdint.h>  // int32_t
#include <string>
#include <vector>
#include "addrcache.h"
#include "checksum.h"
#include "codetable.h"
#include "codetablewriter_interface.h"

namespace open_vcdiff {

class OutputStringInterface;
class VCDiffInstructionMap;

// Accumulates ADD, COPY, and RUN instructions and appends them to an output
// string as encoded delta windows.
//
// Init() must be called (and must succeed) before any other method.  After
// that, the method calls *must* conform to the following pattern:
//    {{Add|Copy|Run}* [AddChecksum] Output}*
// AddChecksum() is shown last only for brevity: it may be called at any point
// before Output(), either before or after the calls to Add(), Copy(), and
// Run() that describe the window.
//
// When Output has been called in this sequence, a complete target window
// (as defined in RFC 3284 section 4.3) will have been appended to
// out (unless no calls to Add, Run, or Copy were made, in which
// case Output will do nothing.)  The output will not be available for use
// until after each call to Output().
//
// NOT threadsafe.
//
class VCDiffCodeTableWriter : public CodeTableWriterInterface {
 public:
  // This constructor uses the default code table.
  // If interleaved is true, the encoder writes each delta file window
  // by interleaving instructions and sizes with their corresponding
  // addresses and data, rather than placing these elements into three
  // separate sections.  This facilitates providing partially
  // decoded results when only a portion of a delta file window
  // is received (e.g. when HTTP over TCP is used as the
  // transmission protocol.)  The interleaved format is
  // not consistent with the VCDIFF draft standard.
  //
  explicit VCDiffCodeTableWriter(bool interleaved);

  // Uses a non-standard code table and non-standard cache sizes.  The caller
  // must guarantee that code_table_data remains allocated for the lifetime of
  // the VCDiffCodeTableWriter object.  Note that this is different from how
  // VCDiffCodeTableReader::UseCodeTable works.  It is assumed that a given
  // encoder will use either the default code table or a statically-defined
  // non-standard code table, whereas the decoder must have the ability to read
  // an arbitrary non-standard code table from a delta file and discard it once
  // the file has been decoded.
  //
  // max_mode is the maximum value for the mode of a COPY instruction.
  //
  VCDiffCodeTableWriter(bool interleaved,
                        int near_cache_size,
                        int same_cache_size,
                        const VCDiffCodeTableData& code_table_data,
                        unsigned char max_mode);

  // Virtual so that the object can be destroyed through a base-class pointer.
  virtual ~VCDiffCodeTableWriter();

  // Initializes the constructed object for use.
  // This method must be called after a VCDiffCodeTableWriter is constructed
  // and before any of its other methods can be called.  It will return
  // false if there was an error initializing the object, or true if it
  // was successful.  After the object has been initialized and used,
  // Init() can be called again to restore the initial state of the object.
  //
  bool Init(size_t dictionary_size);

  // Returns the number of bytes of target data that has been encoded so far.
  virtual size_t target_length() const { return target_length_; }

  // Encode an ADD opcode with the "size" bytes starting at data
  virtual void Add(const char* data, size_t size);

  // Encode a COPY opcode with args "offset" (into dictionary) and "size" bytes.
  virtual void Copy(int32_t offset, size_t size);

  // Encode a RUN opcode for "size" copies of the value "byte".
  virtual void Run(size_t size, unsigned char byte);

  // Records the checksum to be written to the current target window and
  // marks the window as having one.  The checksum is not computed here; the
  // caller is expected to calculate it over the whole target window and
  // pass the result in.  May be called either before or after the calls to
  // Add(), Run(), and Copy(), but must be called before Output().
  //
  void AddChecksum(VCDChecksum checksum) {
    add_checksum_ = true;
    checksum_ = checksum;
  }

  // Finishes encoding and appends the encoded delta window to the output
  // string.  The output string is not null-terminated and may contain embedded
  // '\0' characters.
  virtual void Output(OutputStringInterface* out);

  // Returns the match counters: element n is the number of matches of
  // length n (that is, COPY instructions of size n) found so far.
  const std::vector<int>& match_counts() const { return match_counts_; }

 private:
  typedef std::string string;

  // This is an estimate of the longest match size the encoder expects to find.
  // It is used to determine the initial size of the vector match_counts_.
  // If it is too large, then some space will be wasted on vector elements
  // that are not used.  If it is too small, then some time will be wasted
  // expanding match_counts_ to accommodate larger match sizes.
  static const size_t kMaxMatchSize = 2000;

  // The maximum value for the mode of a COPY instruction.
  const unsigned char max_mode_;

  // If interleaved is true, sets data_for_add_and_run_ and
  // addresses_for_copy_ to point at instructions_and_sizes_,
  // so that instructions, sizes, addresses and data will be
  // combined into a single interleaved stream.
  // If interleaved is false, sets data_for_add_and_run_ and
  // addresses_for_copy_ to point at their corresponding
  // separate_... strings, so that the three sections will
  // be generated separately from one another.
  //
  void InitSectionPointers(bool interleaved);

  // Determines the best opcode to encode an instruction, and appends
  // or substitutes that opcode and its size into the
  // instructions_and_sizes_ string.
  //
  void EncodeInstruction(VCDiffInstructionType inst,
                         size_t size,
                         unsigned char mode);

  // Convenience overload for instructions that carry no mode: forwards to
  // the three-argument version with a mode of 0.
  void EncodeInstruction(VCDiffInstructionType inst, size_t size) {
    EncodeInstruction(inst, size, 0);
  }

  // Calculates the number of bytes needed to store the given size value as a
  // variable-length integer (VarintBE).
  static size_t CalculateLengthOfSizeAsVarint(size_t size);

  // Appends the size value to the string as a variable-length integer.
  static void AppendSizeToString(size_t size, string* out);

  // Appends the size value to the output string as a variable-length integer.
  static void AppendSizeToOutputString(size_t size, OutputStringInterface* out);

  // Calculates the "Length of the delta encoding" field for the delta window
  // header, based on the sizes of the sections and of the other header
  // elements.
  size_t CalculateLengthOfTheDeltaEncoding() const;

  // None of the following 'string' objects are null-terminated.

  // A series of instruction opcodes, each of which may be followed
  // by one or two Varint values representing the size parameters
  // of the first and second instruction in the opcode.
  string instructions_and_sizes_;

  // A series of data arguments (byte values) used for ADD and RUN
  // instructions.  Depending on whether interleaved output is used
  // for streaming or not, the pointer may point to
  // separate_data_for_add_and_run_ or to instructions_and_sizes_.
  string *data_for_add_and_run_;
  string separate_data_for_add_and_run_;

  // A series of Varint addresses used for COPY instructions.
  // For the SAME mode, a byte value is stored instead of a Varint.
  // Depending on whether interleaved output is used
  // for streaming or not, the pointer may point to
  // separate_addresses_for_copy_ or to instructions_and_sizes_.
  string *addresses_for_copy_;
  string separate_addresses_for_copy_;

  // NOTE(review): presumably used to choose the address mode and encoded
  // address for COPY instructions; see addrcache.h to confirm.
  VCDiffAddressCache address_cache_;

  // NOTE(review): presumably the dictionary_size value passed to Init();
  // confirm against the implementation file.
  size_t dictionary_size_;

  // The number of bytes of target data that has been encoded so far.
  // Each time Add(), Copy(), or Run() is called, this will be incremented.
  // The target length is used to compute HERE mode addresses
  // for COPY instructions, and is also written into the header
  // of the delta window when Output() is called.
  //
  size_t target_length_;

  // The code table data that instruction_map_ is an alternate representation
  // of.  Held by pointer: when a non-standard table is passed to the
  // constructor, the caller must keep it allocated for the lifetime of this
  // object.
  const VCDiffCodeTableData* code_table_data_;

  // The instruction map facilitates finding an opcode quickly given an
  // instruction inst, size, and mode.  This is an alternate representation
  // of the same information that is found in code_table_data_.
  //
  const VCDiffInstructionMap* instruction_map_;

  // The zero-based index within instructions_and_sizes_ of the byte
  // that contains the last single-instruction opcode generated by
  // EncodeInstruction().  (See that function for exhaustive details.)
  // It is necessary to use an index rather than a pointer for this value
  // because instructions_and_sizes_ may be resized, which would invalidate
  // any pointers into its data buffer.  The value -1 is reserved to mean that
  // either no opcodes have been generated yet, or else the last opcode
  // generated was a double-instruction opcode.
  //
  int last_opcode_index_;

  // If true, an Adler32 checksum of the target window data will be written as
  // a variable-length integer, just after the size of the addresses section.
  //
  bool add_checksum_;

  // The checksum to be written to the current target window,
  // if add_checksum_ is true.
  // This will not be calculated based on the individual calls to Add(), Run(),
  // and Copy(), which would be unnecessarily expensive.  Instead, the code
  // that uses the VCDiffCodeTableWriter object is expected to calculate
  // the checksum all at once and to call AddChecksum() with that value.
  // AddChecksum() must be called sometime before calling Output(), though it
  // can be called either before or after the calls to Add(), Run(), and Copy().
  //
  VCDChecksum checksum_;

  // The value of match_counts_[n] is equal to the number of matches
  // of length n (that is, COPY instructions of size n) found so far.
  std::vector<int> match_counts_;

  // Declared private so that outside code cannot copy or assign a
  // VCDiffCodeTableWriter, and so that the compiler does not generate
  // these operations implicitly.
  VCDiffCodeTableWriter(const VCDiffCodeTableWriter&);  // NOLINT
  void operator=(const VCDiffCodeTableWriter&);
};

}  // namespace open_vcdiff

#endif  // OPEN_VCDIFF_ENCODETABLE_H_