C++程序  |  390行  |  12.24 KB

// Copyright (c) 2017 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Contains utils for reading, writing and debug printing bit streams.

#ifndef SOURCE_COMP_HUFFMAN_CODEC_H_
#define SOURCE_COMP_HUFFMAN_CODEC_H_

#include <algorithm>
#include <cassert>
#include <functional>
#include <iomanip>
#include <map>
#include <memory>
#include <ostream>
#include <queue>
#include <sstream>
#include <stack>
#include <string>
#include <tuple>
#include <unordered_map>
#include <utility>
#include <vector>

namespace spvtools {
namespace comp {

// Used to generate and apply a Huffman coding scheme.
// |Val| is the type of variable being encoded (for example a string or a
// literal).
template <class Val>
class HuffmanCodec {
 public:
  // Huffman tree node.
  struct Node {
    Node() {}

    // Creates Node from serialization leaving weight and id undefined.
    Node(const Val& in_value, uint32_t in_left, uint32_t in_right)
        : value(in_value), left(in_left), right(in_right) {}

    Val value = Val();
    uint32_t weight = 0;
    // Ids are issued sequentially starting from 1. Ids are used as an ordering
    // tie-breaker, to make sure that the ordering (and resulting coding scheme)
    // are consistent accross multiple platforms.
    uint32_t id = 0;
    // Handles of children.
    uint32_t left = 0;
    uint32_t right = 0;
  };

  // Creates Huffman codec from a histogramm.
  // Histogramm counts must not be zero.
  explicit HuffmanCodec(const std::map<Val, uint32_t>& hist) {
    if (hist.empty()) return;

    // Heuristic estimate.
    nodes_.reserve(3 * hist.size());

    // Create NIL.
    CreateNode();

    // The queue is sorted in ascending order by weight (or by node id if
    // weights are equal).
    std::vector<uint32_t> queue_vector;
    queue_vector.reserve(hist.size());
    std::priority_queue<uint32_t, std::vector<uint32_t>,
                        std::function<bool(uint32_t, uint32_t)>>
        queue(std::bind(&HuffmanCodec::LeftIsBigger, this,
                        std::placeholders::_1, std::placeholders::_2),
              std::move(queue_vector));

    // Put all leaves in the queue.
    for (const auto& pair : hist) {
      const uint32_t node = CreateNode();
      MutableValueOf(node) = pair.first;
      MutableWeightOf(node) = pair.second;
      assert(WeightOf(node));
      queue.push(node);
    }

    // Form the tree by combining two subtrees with the least weight,
    // and pushing the root of the new tree in the queue.
    while (true) {
      // We push a node at the end of each iteration, so the queue is never
      // supposed to be empty at this point, unless there are no leaves, but
      // that case was already handled.
      assert(!queue.empty());
      const uint32_t right = queue.top();
      queue.pop();

      // If the queue is empty at this point, then the last node is
      // the root of the complete Huffman tree.
      if (queue.empty()) {
        root_ = right;
        break;
      }

      const uint32_t left = queue.top();
      queue.pop();

      // Combine left and right into a new tree and push it into the queue.
      const uint32_t parent = CreateNode();
      MutableWeightOf(parent) = WeightOf(right) + WeightOf(left);
      MutableLeftOf(parent) = left;
      MutableRightOf(parent) = right;
      queue.push(parent);
    }

    // Traverse the tree and form encoding table.
    CreateEncodingTable();
  }

  // Creates Huffman codec from saved tree structure.
  // |nodes| is the list of nodes of the tree, nodes[0] being NIL.
  // |root_handle| is the index of the root node.
  HuffmanCodec(uint32_t root_handle, std::vector<Node>&& nodes) {
    nodes_ = std::move(nodes);
    assert(!nodes_.empty());
    assert(root_handle > 0 && root_handle < nodes_.size());
    assert(!LeftOf(0) && !RightOf(0));

    root_ = root_handle;

    // Traverse the tree and form encoding table.
    CreateEncodingTable();
  }

  // Serializes the codec in the following text format:
  // (<root_handle>, {
  //   {0, 0, 0},
  //   {val1, left1, right1},
  //   {val2, left2, right2},
  //   ...
  // })
  std::string SerializeToText(int indent_num_whitespaces) const {
    const bool value_is_text = std::is_same<Val, std::string>::value;

    const std::string indent1 = std::string(indent_num_whitespaces, ' ');
    const std::string indent2 = std::string(indent_num_whitespaces + 2, ' ');

    std::stringstream code;
    code << "(" << root_ << ", {\n";

    for (const Node& node : nodes_) {
      code << indent2 << "{";
      if (value_is_text) code << "\"";
      code << node.value;
      if (value_is_text) code << "\"";
      code << ", " << node.left << ", " << node.right << "},\n";
    }

    code << indent1 << "})";

    return code.str();
  }

  // Prints the Huffman tree in the following format:
  // w------w------'x'
  //        w------'y'
  // Where w stands for the weight of the node.
  // Right tree branches appear above left branches. Taking the right path
  // adds 1 to the code, taking the left adds 0.
  void PrintTree(std::ostream& out) const { PrintTreeInternal(out, root_, 0); }

  // Traverses the tree and prints the Huffman table: value, code
  // and optionally node weight for every leaf.
  void PrintTable(std::ostream& out, bool print_weights = true) {
    std::queue<std::pair<uint32_t, std::string>> queue;
    queue.emplace(root_, "");

    while (!queue.empty()) {
      const uint32_t node = queue.front().first;
      const std::string code = queue.front().second;
      queue.pop();
      if (!RightOf(node) && !LeftOf(node)) {
        out << ValueOf(node);
        if (print_weights) out << " " << WeightOf(node);
        out << " " << code << std::endl;
      } else {
        if (LeftOf(node)) queue.emplace(LeftOf(node), code + "0");

        if (RightOf(node)) queue.emplace(RightOf(node), code + "1");
      }
    }
  }

  // Returns the Huffman table. The table was built at at construction time,
  // this function just returns a const reference.
  const std::unordered_map<Val, std::pair<uint64_t, size_t>>& GetEncodingTable()
      const {
    return encoding_table_;
  }

  // Encodes |val| and stores its Huffman code in the lower |num_bits| of
  // |bits|. Returns false of |val| is not in the Huffman table.
  bool Encode(const Val& val, uint64_t* bits, size_t* num_bits) const {
    auto it = encoding_table_.find(val);
    if (it == encoding_table_.end()) return false;
    *bits = it->second.first;
    *num_bits = it->second.second;
    return true;
  }

  // Reads bits one-by-one using callback |read_bit| until a match is found.
  // Matching value is stored in |val|. Returns false if |read_bit| terminates
  // before a code was mathced.
  // |read_bit| has type bool func(bool* bit). When called, the next bit is
  // stored in |bit|. |read_bit| returns false if the stream terminates
  // prematurely.
  bool DecodeFromStream(const std::function<bool(bool*)>& read_bit,
                        Val* val) const {
    uint32_t node = root_;
    while (true) {
      assert(node);

      if (!RightOf(node) && !LeftOf(node)) {
        *val = ValueOf(node);
        return true;
      }

      bool go_right;
      if (!read_bit(&go_right)) return false;

      if (go_right)
        node = RightOf(node);
      else
        node = LeftOf(node);
    }

    assert(0);
    return false;
  }

 private:
  // Returns value of the node referenced by |handle|.
  Val ValueOf(uint32_t node) const { return nodes_.at(node).value; }

  // Returns left child of |node|.
  uint32_t LeftOf(uint32_t node) const { return nodes_.at(node).left; }

  // Returns right child of |node|.
  uint32_t RightOf(uint32_t node) const { return nodes_.at(node).right; }

  // Returns weight of |node|.
  uint32_t WeightOf(uint32_t node) const { return nodes_.at(node).weight; }

  // Returns id of |node|.
  uint32_t IdOf(uint32_t node) const { return nodes_.at(node).id; }

  // Returns mutable reference to value of |node|.
  Val& MutableValueOf(uint32_t node) {
    assert(node);
    return nodes_.at(node).value;
  }

  // Returns mutable reference to handle of left child of |node|.
  uint32_t& MutableLeftOf(uint32_t node) {
    assert(node);
    return nodes_.at(node).left;
  }

  // Returns mutable reference to handle of right child of |node|.
  uint32_t& MutableRightOf(uint32_t node) {
    assert(node);
    return nodes_.at(node).right;
  }

  // Returns mutable reference to weight of |node|.
  uint32_t& MutableWeightOf(uint32_t node) { return nodes_.at(node).weight; }

  // Returns mutable reference to id of |node|.
  uint32_t& MutableIdOf(uint32_t node) { return nodes_.at(node).id; }

  // Returns true if |left| has bigger weight than |right|. Node ids are
  // used as tie-breaker.
  bool LeftIsBigger(uint32_t left, uint32_t right) const {
    if (WeightOf(left) == WeightOf(right)) {
      assert(IdOf(left) != IdOf(right));
      return IdOf(left) > IdOf(right);
    }
    return WeightOf(left) > WeightOf(right);
  }

  // Prints subtree (helper function used by PrintTree).
  void PrintTreeInternal(std::ostream& out, uint32_t node, size_t depth) const {
    if (!node) return;

    const size_t kTextFieldWidth = 7;

    if (!RightOf(node) && !LeftOf(node)) {
      out << ValueOf(node) << std::endl;
    } else {
      if (RightOf(node)) {
        std::stringstream label;
        label << std::setfill('-') << std::left << std::setw(kTextFieldWidth)
              << WeightOf(RightOf(node));
        out << label.str();
        PrintTreeInternal(out, RightOf(node), depth + 1);
      }

      if (LeftOf(node)) {
        out << std::string(depth * kTextFieldWidth, ' ');
        std::stringstream label;
        label << std::setfill('-') << std::left << std::setw(kTextFieldWidth)
              << WeightOf(LeftOf(node));
        out << label.str();
        PrintTreeInternal(out, LeftOf(node), depth + 1);
      }
    }
  }

  // Traverses the Huffman tree and saves paths to the leaves as bit
  // sequences to encoding_table_.
  void CreateEncodingTable() {
    struct Context {
      Context(uint32_t in_node, uint64_t in_bits, size_t in_depth)
          : node(in_node), bits(in_bits), depth(in_depth) {}
      uint32_t node;
      // Huffman tree depth cannot exceed 64 as histogramm counts are expected
      // to be positive and limited by numeric_limits<uint32_t>::max().
      // For practical applications tree depth would be much smaller than 64.
      uint64_t bits;
      size_t depth;
    };

    std::queue<Context> queue;
    queue.emplace(root_, 0, 0);

    while (!queue.empty()) {
      const Context& context = queue.front();
      const uint32_t node = context.node;
      const uint64_t bits = context.bits;
      const size_t depth = context.depth;
      queue.pop();

      if (!RightOf(node) && !LeftOf(node)) {
        auto insertion_result = encoding_table_.emplace(
            ValueOf(node), std::pair<uint64_t, size_t>(bits, depth));
        assert(insertion_result.second);
        (void)insertion_result;
      } else {
        if (LeftOf(node)) queue.emplace(LeftOf(node), bits, depth + 1);

        if (RightOf(node))
          queue.emplace(RightOf(node), bits | (1ULL << depth), depth + 1);
      }
    }
  }

  // Creates new Huffman tree node and stores it in the deleter array.
  uint32_t CreateNode() {
    const uint32_t handle = static_cast<uint32_t>(nodes_.size());
    nodes_.emplace_back(Node());
    nodes_.back().id = next_node_id_++;
    return handle;
  }

  // Huffman tree root handle.
  uint32_t root_ = 0;

  // Huffman tree deleter.
  std::vector<Node> nodes_;

  // Encoding table value -> {bits, num_bits}.
  // Huffman codes are expected to never exceed 64 bit length (this is in fact
  // impossible if frequencies are stored as uint32_t).
  std::unordered_map<Val, std::pair<uint64_t, size_t>> encoding_table_;

  // Next node id issued by CreateNode();
  uint32_t next_node_id_ = 1;
};

}  // namespace comp
}  // namespace spvtools

#endif  // SOURCE_COMP_HUFFMAN_CODEC_H_