// compile.h // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // // Copyright 2005-2010 Google, Inc. // Author: riley@google.com (Michael Riley) // // \file // Class to to compile a binary Fst from textual input. #ifndef FST_SCRIPT_COMPILE_IMPL_H_ #define FST_SCRIPT_COMPILE_IMPL_H_ #include <tr1/unordered_map> using std::tr1::unordered_map; using std::tr1::unordered_multimap; #include <sstream> #include <string> #include <vector> using std::vector; #include <iostream> #include <fstream> #include <sstream> #include <fst/fst.h> #include <fst/util.h> #include <fst/vector-fst.h> DECLARE_string(fst_field_separator); namespace fst { // Compile a binary Fst from textual input, helper class for fstcompile.cc // WARNING: Stand-alone use of this class not recommended, most code should // read/write using the binary format which is much more efficient. template <class A> class FstCompiler { public: typedef A Arc; typedef typename A::StateId StateId; typedef typename A::Label Label; typedef typename A::Weight Weight; // WARNING: use of 'allow_negative_labels = true' not recommended; may // cause conflicts FstCompiler(istream &istrm, const string &source, const SymbolTable *isyms, const SymbolTable *osyms, const SymbolTable *ssyms, bool accep, bool ikeep, bool okeep, bool nkeep, bool allow_negative_labels = false) : nline_(0), source_(source), isyms_(isyms), osyms_(osyms), ssyms_(ssyms), nstates_(0), keep_state_numbering_(nkeep), allow_negative_labels_(allow_negative_labels) { char line[kLineLen]; while (istrm.getline(line, kLineLen)) { ++nline_; vector<char *> col; string separator = FLAGS_fst_field_separator + "\n"; SplitToVector(line, separator.c_str(), &col, true); if (col.size() == 0 || col[0][0] == '\0') // empty line continue; if (col.size() > 5 || (col.size() > 4 && accep) || (col.size() == 3 && !accep)) { FSTERROR() << "FstCompiler: Bad number of columns, source = " << source_ << ", line = " << nline_; fst_.SetProperties(kError, kError); return; } StateId s = StrToStateId(col[0]); while (s >= fst_.NumStates()) fst_.AddState(); if (nline_ == 1) fst_.SetStart(s); Arc arc; StateId d = s; switch (col.size()) { case 1: fst_.SetFinal(s, Weight::One()); break; case 2: fst_.SetFinal(s, StrToWeight(col[1], true)); break; case 3: arc.nextstate = d = StrToStateId(col[1]); arc.ilabel = StrToILabel(col[2]); arc.olabel = arc.ilabel; arc.weight = Weight::One(); fst_.AddArc(s, arc); break; case 4: arc.nextstate = d = StrToStateId(col[1]); arc.ilabel = StrToILabel(col[2]); if (accep) { arc.olabel = arc.ilabel; arc.weight = StrToWeight(col[3], false); } else { arc.olabel = StrToOLabel(col[3]); arc.weight = Weight::One(); } fst_.AddArc(s, arc); break; case 5: arc.nextstate = d = StrToStateId(col[1]); arc.ilabel = StrToILabel(col[2]); arc.olabel = StrToOLabel(col[3]); arc.weight = StrToWeight(col[4], false); fst_.AddArc(s, arc); } while (d >= fst_.NumStates()) fst_.AddState(); } if (ikeep) fst_.SetInputSymbols(isyms); if (okeep) fst_.SetOutputSymbols(osyms); } const VectorFst<A> &Fst() const { return fst_; } private: // Maximum line length in text file. static const int kLineLen = 8096; int64 StrToId(const char *s, const SymbolTable *syms, const char *name, bool allow_negative = false) const { int64 n = 0; if (syms) { n = syms->Find(s); if (n == -1 || (!allow_negative && n < 0)) { FSTERROR() << "FstCompiler: Symbol \"" << s << "\" is not mapped to any integer " << name << ", symbol table = " << syms->Name() << ", source = " << source_ << ", line = " << nline_; fst_.SetProperties(kError, kError); } } else { char *p; n = strtoll(s, &p, 10); if (p < s + strlen(s) || (!allow_negative && n < 0)) { FSTERROR() << "FstCompiler: Bad " << name << " integer = \"" << s << "\", source = " << source_ << ", line = " << nline_; fst_.SetProperties(kError, kError); } } return n; } StateId StrToStateId(const char *s) { StateId n = StrToId(s, ssyms_, "state ID"); if (keep_state_numbering_) return n; // remap state IDs to make dense set typename unordered_map<StateId, StateId>::const_iterator it = states_.find(n); if (it == states_.end()) { states_[n] = nstates_; return nstates_++; } else { return it->second; } } StateId StrToILabel(const char *s) const { return StrToId(s, isyms_, "arc ilabel", allow_negative_labels_); } StateId StrToOLabel(const char *s) const { return StrToId(s, osyms_, "arc olabel", allow_negative_labels_); } Weight StrToWeight(const char *s, bool allow_zero) const { Weight w; istringstream strm(s); strm >> w; if (!strm || (!allow_zero && w == Weight::Zero())) { FSTERROR() << "FstCompiler: Bad weight = \"" << s << "\", source = " << source_ << ", line = " << nline_; fst_.SetProperties(kError, kError); w = Weight::NoWeight(); } return w; } mutable VectorFst<A> fst_; size_t nline_; string source_; // text FST source name const SymbolTable *isyms_; // ilabel symbol table const SymbolTable *osyms_; // olabel symbol table const SymbolTable *ssyms_; // slabel symbol table unordered_map<StateId, StateId> states_; // state ID map StateId nstates_; // number of seen states bool keep_state_numbering_; bool allow_negative_labels_; // not recommended; may cause conflicts DISALLOW_COPY_AND_ASSIGN(FstCompiler); }; } // namespace fst #endif // FST_SCRIPT_COMPILE_IMPL_H_