C++程序  |  217行  |  6.61 KB

// compile.h

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Copyright 2005-2010 Google, Inc.
// Author: riley@google.com (Michael Riley)
//
// \file
// Class to to compile a binary Fst from textual input.

#ifndef FST_SCRIPT_COMPILE_IMPL_H_
#define FST_SCRIPT_COMPILE_IMPL_H_

#include <tr1/unordered_map>
using std::tr1::unordered_map;
using std::tr1::unordered_multimap;
#include <sstream>
#include <string>
#include <vector>
using std::vector;

#include <iostream>
#include <fstream>
#include <sstream>
#include <fst/fst.h>
#include <fst/util.h>
#include <fst/vector-fst.h>

DECLARE_string(fst_field_separator);

namespace fst {

// Compile a binary Fst from textual input, helper class for fstcompile.cc
// WARNING: Stand-alone use of this class not recommended, most code should
// read/write using the binary format which is much more efficient.
template <class A> class FstCompiler {
 public:
  typedef A Arc;
  typedef typename A::StateId StateId;
  typedef typename A::Label Label;
  typedef typename A::Weight Weight;

  // WARNING: use of 'allow_negative_labels = true' not recommended; may
  // cause conflicts
  FstCompiler(istream &istrm, const string &source,
            const SymbolTable *isyms, const SymbolTable *osyms,
            const SymbolTable *ssyms, bool accep, bool ikeep,
              bool okeep, bool nkeep, bool allow_negative_labels = false)
      : nline_(0), source_(source),
        isyms_(isyms), osyms_(osyms), ssyms_(ssyms),
        nstates_(0), keep_state_numbering_(nkeep),
        allow_negative_labels_(allow_negative_labels) {
    char line[kLineLen];
    while (istrm.getline(line, kLineLen)) {
      ++nline_;
      vector<char *> col;
      string separator = FLAGS_fst_field_separator + "\n";
      SplitToVector(line, separator.c_str(), &col, true);
      if (col.size() == 0 || col[0][0] == '\0')  // empty line
        continue;
      if (col.size() > 5 ||
          (col.size() > 4 && accep) ||
          (col.size() == 3 && !accep)) {
        FSTERROR() << "FstCompiler: Bad number of columns, source = "
                   << source_
                   << ", line = " << nline_;
        fst_.SetProperties(kError, kError);
        return;
      }
      StateId s = StrToStateId(col[0]);
      while (s >= fst_.NumStates())
        fst_.AddState();
      if (nline_ == 1)
        fst_.SetStart(s);

      Arc arc;
      StateId d = s;
      switch (col.size()) {
      case 1:
        fst_.SetFinal(s, Weight::One());
        break;
      case 2:
        fst_.SetFinal(s, StrToWeight(col[1], true));
        break;
      case 3:
        arc.nextstate = d = StrToStateId(col[1]);
        arc.ilabel = StrToILabel(col[2]);
        arc.olabel = arc.ilabel;
        arc.weight = Weight::One();
        fst_.AddArc(s, arc);
        break;
      case 4:
        arc.nextstate = d = StrToStateId(col[1]);
        arc.ilabel = StrToILabel(col[2]);
        if (accep) {
          arc.olabel = arc.ilabel;
          arc.weight = StrToWeight(col[3], false);
        } else {
          arc.olabel = StrToOLabel(col[3]);
          arc.weight = Weight::One();
        }
        fst_.AddArc(s, arc);
        break;
      case 5:
        arc.nextstate = d = StrToStateId(col[1]);
        arc.ilabel = StrToILabel(col[2]);
        arc.olabel = StrToOLabel(col[3]);
        arc.weight = StrToWeight(col[4], false);
        fst_.AddArc(s, arc);
      }
      while (d >= fst_.NumStates())
        fst_.AddState();
    }
    if (ikeep)
      fst_.SetInputSymbols(isyms);
    if (okeep)
      fst_.SetOutputSymbols(osyms);
  }

  const VectorFst<A> &Fst() const {
    return fst_;
  }

 private:
  // Maximum line length in text file.
  static const int kLineLen = 8096;

  int64 StrToId(const char *s, const SymbolTable *syms,
                const char *name, bool allow_negative = false) const {
    int64 n = 0;

    if (syms) {
      n = syms->Find(s);
      if (n == -1 || (!allow_negative && n < 0)) {
        FSTERROR() << "FstCompiler: Symbol \"" << s
                   << "\" is not mapped to any integer " << name
                   << ", symbol table = " << syms->Name()
                   << ", source = " << source_ << ", line = " << nline_;
        fst_.SetProperties(kError, kError);
      }
    } else {
      char *p;
      n = strtoll(s, &p, 10);
      if (p < s + strlen(s) || (!allow_negative && n < 0)) {
        FSTERROR() << "FstCompiler: Bad " << name << " integer = \"" << s
                   << "\", source = " << source_ << ", line = " << nline_;
        fst_.SetProperties(kError, kError);
      }
    }
    return n;
  }

  StateId StrToStateId(const char *s) {
    StateId n = StrToId(s, ssyms_, "state ID");

    if (keep_state_numbering_)
      return n;

    // remap state IDs to make dense set
    typename unordered_map<StateId, StateId>::const_iterator it = states_.find(n);
    if (it == states_.end()) {
      states_[n] = nstates_;
      return nstates_++;
    } else {
      return it->second;
    }
  }

  StateId StrToILabel(const char *s) const {
    return StrToId(s, isyms_, "arc ilabel", allow_negative_labels_);
  }

  StateId StrToOLabel(const char *s) const {
    return StrToId(s, osyms_, "arc olabel", allow_negative_labels_);
  }

  Weight StrToWeight(const char *s, bool allow_zero) const {
    Weight w;
    istringstream strm(s);
    strm >> w;
    if (!strm || (!allow_zero && w == Weight::Zero())) {
      FSTERROR() << "FstCompiler: Bad weight = \"" << s
                 << "\", source = " << source_ << ", line = " << nline_;
      fst_.SetProperties(kError, kError);
      w = Weight::NoWeight();
    }
    return w;
  }

  mutable VectorFst<A> fst_;
  size_t nline_;
  string source_;                      // text FST source name
  const SymbolTable *isyms_;           // ilabel symbol table
  const SymbolTable *osyms_;           // olabel symbol table
  const SymbolTable *ssyms_;           // slabel symbol table
  unordered_map<StateId, StateId> states_;  // state ID map
  StateId nstates_;                    // number of seen states
  bool keep_state_numbering_;
  bool allow_negative_labels_;         // not recommended; may cause conflicts

  DISALLOW_COPY_AND_ASSIGN(FstCompiler);
};

}  // namespace fst

#endif  // FST_SCRIPT_COMPILE_IMPL_H_