// determinize.h // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // // Copyright 2005-2010 Google, Inc. // Author: riley@google.com (Michael Riley) // // \file // Functions and classes to determinize an FST. #ifndef FST_LIB_DETERMINIZE_H__ #define FST_LIB_DETERMINIZE_H__ #include <algorithm> #include <climits> #include <tr1/unordered_map> using std::tr1::unordered_map; using std::tr1::unordered_multimap; #include <map> #include <fst/slist.h> #include <string> #include <vector> using std::vector; #include <fst/arc-map.h> #include <fst/cache.h> #include <fst/bi-table.h> #include <fst/factor-weight.h> #include <fst/prune.h> #include <fst/test-properties.h> namespace fst { // // COMMON DIVISORS - these are used in determinization to compute // the transition weights. In the simplest case, it is just the same // as the semiring Plus(). However, other choices permit more efficient // determinization when the output contains strings. // // The default common divisor uses the semiring Plus. template <class W> class DefaultCommonDivisor { public: typedef W Weight; W operator()(const W &w1, const W &w2) const { return Plus(w1, w2); } }; // The label common divisor for a (left) string semiring selects a // single letter common prefix or the empty string. This is used in // the determinization of output strings so that at most a single // letter will appear in the output of a transtion. template <typename L, StringType S> class LabelCommonDivisor { public: typedef StringWeight<L, S> Weight; Weight operator()(const Weight &w1, const Weight &w2) const { StringWeightIterator<L, S> iter1(w1); StringWeightIterator<L, S> iter2(w2); if (!(StringWeight<L, S>::Properties() & kLeftSemiring)) { FSTERROR() << "LabelCommonDivisor: Weight needs to be left semiring"; return Weight::NoWeight(); } else if (w1.Size() == 0 || w2.Size() == 0) { return Weight::One(); } else if (w1 == Weight::Zero()) { return Weight(iter2.Value()); } else if (w2 == Weight::Zero()) { return Weight(iter1.Value()); } else if (iter1.Value() == iter2.Value()) { return Weight(iter1.Value()); } else { return Weight::One(); } } }; // The gallic common divisor uses the label common divisor on the // string component and the template argument D common divisor on the // weight component, which defaults to the default common divisor. template <class L, class W, StringType S, class D = DefaultCommonDivisor<W> > class GallicCommonDivisor { public: typedef GallicWeight<L, W, S> Weight; Weight operator()(const Weight &w1, const Weight &w2) const { return Weight(label_common_divisor_(w1.Value1(), w2.Value1()), weight_common_divisor_(w1.Value2(), w2.Value2())); } private: LabelCommonDivisor<L, S> label_common_divisor_; D weight_common_divisor_; }; // Represents an element in a subset template <class A> struct DeterminizeElement { typedef typename A::StateId StateId; typedef typename A::Weight Weight; DeterminizeElement() {} DeterminizeElement(StateId s, Weight w) : state_id(s), weight(w) {} bool operator==(const DeterminizeElement<A> & element) const { return state_id == element.state_id && weight == element.weight; } bool operator<(const DeterminizeElement<A> & element) const { return state_id < element.state_id || (state_id == element.state_id && weight == element.weight); } StateId state_id; // Input state Id Weight weight; // Residual weight }; // // DETERMINIZE FILTERS - these can be used in determinization to compute // transformations on the subsets prior to their being added as destination // states. The filter operates on a map between a label and the // corresponding destination subsets. The possibly modified map is // then used to construct the destination states for arcs exiting state 's'. // It must define the ordered map type LabelMap and have a default // and copy constructor. // A determinize filter that does not modify its input. template <class Arc> struct IdentityDeterminizeFilter { typedef typename Arc::StateId StateId; typedef typename Arc::Label Label; typedef slist< DeterminizeElement<Arc> > Subset; typedef map<Label, Subset*> LabelMap; static uint64 Properties(uint64 props) { return props; } void operator()(StateId s, LabelMap *label_map) {} }; // // DETERMINIZATION STATE TABLES // // The determiziation state table has the form: // // template <class Arc> // class DeterminizeStateTable { // public: // typedef typename Arc::StateId StateId; // typedef DeterminizeElement<Arc> Element; // typedef slist<Element> Subset; // // // Required constuctor // DeterminizeStateTable(); // // // Required copy constructor that does not copy state // DeterminizeStateTable(const DeterminizeStateTable<A,P> &table); // // // Lookup state ID by subset (not depending of the element order). // // If it doesn't exist, then add it. FindState takes // // ownership of the subset argument (so that it doesn't have to // // copy it if it creates a new state). // StateId FindState(Subset *subset); // // // Lookup subset by ID. // const Subset *FindSubset(StateId id) const; // }; // // The default determinization state table based on the // compact hash bi-table. template <class Arc> class DefaultDeterminizeStateTable { public: typedef typename Arc::StateId StateId; typedef typename Arc::Label Label; typedef typename Arc::Weight Weight; typedef DeterminizeElement<Arc> Element; typedef slist<Element> Subset; explicit DefaultDeterminizeStateTable(size_t table_size = 0) : table_size_(table_size), subsets_(table_size_, new SubsetKey(), new SubsetEqual(&elements_)) { } DefaultDeterminizeStateTable(const DefaultDeterminizeStateTable<Arc> &table) : table_size_(table.table_size_), subsets_(table_size_, new SubsetKey(), new SubsetEqual(&elements_)) { } ~DefaultDeterminizeStateTable() { for (StateId s = 0; s < subsets_.Size(); ++s) delete subsets_.FindEntry(s); } // Finds the state corresponding to a subset. Only creates a new // state if the subset is not found. FindState takes ownership of // the subset argument (so that it doesn't have to copy it if it // creates a new state). StateId FindState(Subset *subset) { StateId ns = subsets_.Size(); StateId s = subsets_.FindId(subset); if (s != ns) delete subset; // subset found return s; } const Subset* FindSubset(StateId s) { return subsets_.FindEntry(s); } private: // Comparison object for hashing Subset(s). Subsets are not sorted in this // implementation, so ordering must not be assumed in the equivalence // test. class SubsetEqual { public: SubsetEqual() { // needed for compilation but should never be called FSTERROR() << "SubsetEqual: default constructor not implemented"; } // Constructor takes vector needed to check equality. See immediately // below for constraints on it. explicit SubsetEqual(vector<Element *> *elements) : elements_(elements) {} // At each call to operator(), the elements_ vector should contain // only NULLs. When this operator returns, elements_ will still // have this property. bool operator()(Subset* subset1, Subset* subset2) const { if (!subset1 && !subset2) return true; if ((subset1 && !subset2) || (!subset1 && subset2)) return false; if (subset1->size() != subset2->size()) return false; // Loads first subset elements in element vector. for (typename Subset::iterator iter1 = subset1->begin(); iter1 != subset1->end(); ++iter1) { Element &element1 = *iter1; while (elements_->size() <= element1.state_id) elements_->push_back(0); (*elements_)[element1.state_id] = &element1; } // Checks second subset matches first via element vector. for (typename Subset::iterator iter2 = subset2->begin(); iter2 != subset2->end(); ++iter2) { Element &element2 = *iter2; while (elements_->size() <= element2.state_id) elements_->push_back(0); Element *element1 = (*elements_)[element2.state_id]; if (!element1 || element1->weight != element2.weight) { // Mismatch found. Resets element vector before returning false. for (typename Subset::iterator iter1 = subset1->begin(); iter1 != subset1->end(); ++iter1) (*elements_)[iter1->state_id] = 0; return false; } else { (*elements_)[element2.state_id] = 0; // Clears entry } } return true; } private: vector<Element *> *elements_; }; // Hash function for Subset to Fst states. Subset elements are not // sorted in this implementation, so the hash must be invariant // under subset reordering. class SubsetKey { public: size_t operator()(const Subset* subset) const { size_t hash = 0; if (subset) { for (typename Subset::const_iterator iter = subset->begin(); iter != subset->end(); ++iter) { const Element &element = *iter; int lshift = element.state_id % (CHAR_BIT * sizeof(size_t) - 1) + 1; int rshift = CHAR_BIT * sizeof(size_t) - lshift; size_t n = element.state_id; hash ^= n << lshift ^ n >> rshift ^ element.weight.Hash(); } } return hash; } }; size_t table_size_; typedef CompactHashBiTable<StateId, Subset *, SubsetKey, SubsetEqual, HS_STL> SubsetTable; SubsetTable subsets_; vector<Element *> elements_; void operator=(const DefaultDeterminizeStateTable<Arc> &); // disallow }; // Options for finite-state transducer determinization templated on // the arc type, common divisor, the determinization filter and the // state table. DeterminizeFst takes ownership of the determinization // filter and state table if provided. template <class Arc, class D = DefaultCommonDivisor<typename Arc::Weight>, class F = IdentityDeterminizeFilter<Arc>, class T = DefaultDeterminizeStateTable<Arc> > struct DeterminizeFstOptions : CacheOptions { typedef typename Arc::Label Label; float delta; // Quantization delta for subset weights Label subsequential_label; // Label used for residual final output // when producing subsequential transducers. F *filter; // Determinization filter T *state_table; // Determinization state table explicit DeterminizeFstOptions(const CacheOptions &opts, float del = kDelta, Label lab = 0, F *filt = 0, T *table = 0) : CacheOptions(opts), delta(del), subsequential_label(lab), filter(filt), state_table(table) {} explicit DeterminizeFstOptions(float del = kDelta, Label lab = 0, F *filt = 0, T *table = 0) : delta(del), subsequential_label(lab), filter(filt), state_table(table) {} }; // Implementation of delayed DeterminizeFst. This base class is // common to the variants that implement acceptor and transducer // determinization. template <class A> class DeterminizeFstImplBase : public CacheImpl<A> { public: using FstImpl<A>::SetType; using FstImpl<A>::SetProperties; using FstImpl<A>::Properties; using FstImpl<A>::SetInputSymbols; using FstImpl<A>::SetOutputSymbols; using CacheBaseImpl< CacheState<A> >::HasStart; using CacheBaseImpl< CacheState<A> >::HasFinal; using CacheBaseImpl< CacheState<A> >::HasArcs; using CacheBaseImpl< CacheState<A> >::SetFinal; using CacheBaseImpl< CacheState<A> >::SetStart; typedef typename A::Label Label; typedef typename A::Weight Weight; typedef typename A::StateId StateId; typedef CacheState<A> State; template <class D, class F, class T> DeterminizeFstImplBase(const Fst<A> &fst, const DeterminizeFstOptions<A, D, F, T> &opts) : CacheImpl<A>(opts), fst_(fst.Copy()) { SetType("determinize"); uint64 iprops = fst.Properties(kFstProperties, false); uint64 dprops = DeterminizeProperties(iprops, opts.subsequential_label != 0); SetProperties(F::Properties(dprops), kCopyProperties); SetInputSymbols(fst.InputSymbols()); SetOutputSymbols(fst.OutputSymbols()); } DeterminizeFstImplBase(const DeterminizeFstImplBase<A> &impl) : CacheImpl<A>(impl), fst_(impl.fst_->Copy(true)) { SetType("determinize"); SetProperties(impl.Properties(), kCopyProperties); SetInputSymbols(impl.InputSymbols()); SetOutputSymbols(impl.OutputSymbols()); } virtual ~DeterminizeFstImplBase() { delete fst_; } virtual DeterminizeFstImplBase<A> *Copy() = 0; StateId Start() { if (!HasStart()) { StateId start = ComputeStart(); if (start != kNoStateId) { SetStart(start); } } return CacheImpl<A>::Start(); } Weight Final(StateId s) { if (!HasFinal(s)) { Weight final = ComputeFinal(s); SetFinal(s, final); } return CacheImpl<A>::Final(s); } virtual void Expand(StateId s) = 0; size_t NumArcs(StateId s) { if (!HasArcs(s)) Expand(s); return CacheImpl<A>::NumArcs(s); } size_t NumInputEpsilons(StateId s) { if (!HasArcs(s)) Expand(s); return CacheImpl<A>::NumInputEpsilons(s); } size_t NumOutputEpsilons(StateId s) { if (!HasArcs(s)) Expand(s); return CacheImpl<A>::NumOutputEpsilons(s); } void InitArcIterator(StateId s, ArcIteratorData<A> *data) { if (!HasArcs(s)) Expand(s); CacheImpl<A>::InitArcIterator(s, data); } virtual StateId ComputeStart() = 0; virtual Weight ComputeFinal(StateId s) = 0; const Fst<A> &GetFst() const { return *fst_; } private: const Fst<A> *fst_; // Input Fst void operator=(const DeterminizeFstImplBase<A> &); // disallow }; // Implementation of delayed determinization for weighted acceptors. // It is templated on the arc type A and the common divisor D. template <class A, class D, class F, class T> class DeterminizeFsaImpl : public DeterminizeFstImplBase<A> { public: using FstImpl<A>::SetProperties; using DeterminizeFstImplBase<A>::GetFst; using DeterminizeFstImplBase<A>::SetArcs; typedef typename A::Label Label; typedef typename A::Weight Weight; typedef typename A::StateId StateId; typedef DeterminizeElement<A> Element; typedef slist<Element> Subset; typedef typename F::LabelMap LabelMap; DeterminizeFsaImpl(const Fst<A> &fst, const vector<Weight> *in_dist, vector<Weight> *out_dist, const DeterminizeFstOptions<A, D, F, T> &opts) : DeterminizeFstImplBase<A>(fst, opts), delta_(opts.delta), in_dist_(in_dist), out_dist_(out_dist), filter_(opts.filter ? opts.filter : new F()), state_table_(opts.state_table ? opts.state_table : new T()) { if (!fst.Properties(kAcceptor, true)) { FSTERROR() << "DeterminizeFst: argument not an acceptor"; SetProperties(kError, kError); } if (!(Weight::Properties() & kLeftSemiring)) { FSTERROR() << "DeterminizeFst: Weight needs to be left distributive: " << Weight::Type(); SetProperties(kError, kError); } if (out_dist_) out_dist_->clear(); } DeterminizeFsaImpl(const DeterminizeFsaImpl<A, D, F, T> &impl) : DeterminizeFstImplBase<A>(impl), delta_(impl.delta_), in_dist_(0), out_dist_(0), filter_(new F(*impl.filter_)), state_table_(new T(*impl.state_table_)) { if (impl.out_dist_) { FSTERROR() << "DeterminizeFsaImpl: cannot copy with out_dist vector"; SetProperties(kError, kError); } } virtual ~DeterminizeFsaImpl() { delete filter_; delete state_table_; } virtual DeterminizeFsaImpl<A, D, F, T> *Copy() { return new DeterminizeFsaImpl<A, D, F, T>(*this); } uint64 Properties() const { return Properties(kFstProperties); } // Set error if found; return FST impl properties. uint64 Properties(uint64 mask) const { if ((mask & kError) && (GetFst().Properties(kError, false))) SetProperties(kError, kError); return FstImpl<A>::Properties(mask); } virtual StateId ComputeStart() { StateId s = GetFst().Start(); if (s == kNoStateId) return kNoStateId; Element element(s, Weight::One()); Subset *subset = new Subset; subset->push_front(element); return FindState(subset); } virtual Weight ComputeFinal(StateId s) { const Subset *subset = state_table_->FindSubset(s); Weight final = Weight::Zero(); for (typename Subset::const_iterator siter = subset->begin(); siter != subset->end(); ++siter) { const Element &element = *siter; final = Plus(final, Times(element.weight, GetFst().Final(element.state_id))); if (!final.Member()) SetProperties(kError, kError); } return final; } StateId FindState(Subset *subset) { StateId s = state_table_->FindState(subset); if (in_dist_ && out_dist_->size() <= s) out_dist_->push_back(ComputeDistance(subset)); return s; } // Compute distance from a state to the final states in the DFA // given the distances in the NFA. Weight ComputeDistance(const Subset *subset) { Weight outd = Weight::Zero(); for (typename Subset::const_iterator siter = subset->begin(); siter != subset->end(); ++siter) { const Element &element = *siter; Weight ind = element.state_id < in_dist_->size() ? (*in_dist_)[element.state_id] : Weight::Zero(); outd = Plus(outd, Times(element.weight, ind)); } return outd; } // Computes the outgoing transitions from a state, creating new destination // states as needed. virtual void Expand(StateId s) { LabelMap label_map; LabelSubsets(s, &label_map); for (typename LabelMap::iterator liter = label_map.begin(); liter != label_map.end(); ++liter) AddArc(s, liter->first, liter->second); SetArcs(s); } private: // Constructs destination subsets per label. At return, subset // element weights include the input automaton label weights and the // subsets may contain duplicate states. void LabelSubsets(StateId s, LabelMap *label_map) { const Subset *src_subset = state_table_->FindSubset(s); for (typename Subset::const_iterator siter = src_subset->begin(); siter != src_subset->end(); ++siter) { const Element &src_element = *siter; for (ArcIterator< Fst<A> > aiter(GetFst(), src_element.state_id); !aiter.Done(); aiter.Next()) { const A &arc = aiter.Value(); Element dest_element(arc.nextstate, Times(src_element.weight, arc.weight)); // The LabelMap may be a e.g. multimap with more complex // determinization filters, so we insert efficiently w/o using []. typename LabelMap::iterator liter = label_map->lower_bound(arc.ilabel); Subset* dest_subset; if (liter == label_map->end() || liter->first != arc.ilabel) { dest_subset = new Subset; label_map->insert(liter, make_pair(arc.ilabel, dest_subset)); } else { dest_subset = liter->second; } dest_subset->push_front(dest_element); } } // Applies the determinization filter (*filter_)(s, label_map); } // Adds an arc from state S to the destination state associated // with subset DEST_SUBSET (as created by LabelSubsets). void AddArc(StateId s, Label label, Subset *dest_subset) { A arc; arc.ilabel = label; arc.olabel = label; arc.weight = Weight::Zero(); typename Subset::iterator oiter; for (typename Subset::iterator diter = dest_subset->begin(); diter != dest_subset->end();) { Element &dest_element = *diter; // Computes label weight. arc.weight = common_divisor_(arc.weight, dest_element.weight); while (elements_.size() <= dest_element.state_id) elements_.push_back(0); Element *matching_element = elements_[dest_element.state_id]; if (matching_element) { // Found duplicate state: sums state weight and deletes dup. matching_element->weight = Plus(matching_element->weight, dest_element.weight); if (!matching_element->weight.Member()) SetProperties(kError, kError); ++diter; dest_subset->erase_after(oiter); } else { // Saves element so we can check for duplicate for this state. elements_[dest_element.state_id] = &dest_element; oiter = diter; ++diter; } } // Divides out label weight from destination subset elements. // Quantizes to ensure comparisons are effective. // Clears element vector. for (typename Subset::iterator diter = dest_subset->begin(); diter != dest_subset->end(); ++diter) { Element &dest_element = *diter; dest_element.weight = Divide(dest_element.weight, arc.weight, DIVIDE_LEFT); dest_element.weight = dest_element.weight.Quantize(delta_); elements_[dest_element.state_id] = 0; } arc.nextstate = FindState(dest_subset); CacheImpl<A>::PushArc(s, arc); } float delta_; // Quantization delta for subset weights const vector<Weight> *in_dist_; // Distance to final NFA states vector<Weight> *out_dist_; // Distance to final DFA states D common_divisor_; F *filter_; T *state_table_; vector<Element *> elements_; void operator=(const DeterminizeFsaImpl<A, D, F, T> &); // disallow }; // Implementation of delayed determinization for transducers. // Transducer determinization is implemented by mapping the input to // the Gallic semiring as an acceptor whose weights contain the output // strings and using acceptor determinization above to determinize // that acceptor. template <class A, StringType S, class D, class F, class T> class DeterminizeFstImpl : public DeterminizeFstImplBase<A> { public: using FstImpl<A>::SetProperties; using DeterminizeFstImplBase<A>::GetFst; using CacheBaseImpl< CacheState<A> >::GetCacheGc; using CacheBaseImpl< CacheState<A> >::GetCacheLimit; typedef typename A::Label Label; typedef typename A::Weight Weight; typedef typename A::StateId StateId; typedef ToGallicMapper<A, S> ToMapper; typedef FromGallicMapper<A, S> FromMapper; typedef typename ToMapper::ToArc ToArc; typedef ArcMapFst<A, ToArc, ToMapper> ToFst; typedef ArcMapFst<ToArc, A, FromMapper> FromFst; typedef GallicCommonDivisor<Label, Weight, S, D> CommonDivisor; typedef GallicFactor<Label, Weight, S> FactorIterator; DeterminizeFstImpl(const Fst<A> &fst, const DeterminizeFstOptions<A, D, F, T> &opts) : DeterminizeFstImplBase<A>(fst, opts), delta_(opts.delta), subsequential_label_(opts.subsequential_label) { Init(GetFst()); } DeterminizeFstImpl(const DeterminizeFstImpl<A, S, D, F, T> &impl) : DeterminizeFstImplBase<A>(impl), delta_(impl.delta_), subsequential_label_(impl.subsequential_label_) { Init(GetFst()); } ~DeterminizeFstImpl() { delete from_fst_; } virtual DeterminizeFstImpl<A, S, D, F, T> *Copy() { return new DeterminizeFstImpl<A, S, D, F, T>(*this); } uint64 Properties() const { return Properties(kFstProperties); } // Set error if found; return FST impl properties. uint64 Properties(uint64 mask) const { if ((mask & kError) && (GetFst().Properties(kError, false) || from_fst_->Properties(kError, false))) SetProperties(kError, kError); return FstImpl<A>::Properties(mask); } virtual StateId ComputeStart() { return from_fst_->Start(); } virtual Weight ComputeFinal(StateId s) { return from_fst_->Final(s); } virtual void Expand(StateId s) { for (ArcIterator<FromFst> aiter(*from_fst_, s); !aiter.Done(); aiter.Next()) CacheImpl<A>::PushArc(s, aiter.Value()); CacheImpl<A>::SetArcs(s); } private: // Initialization of transducer determinization implementation, which // is defined after DeterminizeFst since it calls it. void Init(const Fst<A> &fst); float delta_; Label subsequential_label_; FromFst *from_fst_; void operator=(const DeterminizeFstImpl<A, S, D, F, T> &); // disallow }; // Determinizes a weighted transducer. This version is a delayed // Fst. The result will be an equivalent FST that has the property // that no state has two transitions with the same input label. // For this algorithm, epsilon transitions are treated as regular // symbols (cf. RmEpsilon). // // The transducer must be functional. The weights must be (weakly) // left divisible (valid for TropicalWeight and LogWeight for instance) // and be zero-sum-free if for all a,b: (Plus(a, b) = 0 => a = b = 0. // // Complexity: // - Determinizable: exponential (polynomial in the size of the output) // - Non-determinizable) does not terminate // // The determinizable automata include all unweighted and all acyclic input. // // References: // - Mehryar Mohri, "Finite-State Transducers in Language and Speech // Processing". Computational Linguistics, 23:2, 1997. // // This class attaches interface to implementation and handles // reference counting, delegating most methods to ImplToFst. template <class A> class DeterminizeFst : public ImplToFst< DeterminizeFstImplBase<A> > { public: friend class ArcIterator< DeterminizeFst<A> >; friend class StateIterator< DeterminizeFst<A> >; template <class B, StringType S, class D, class F, class T> friend class DeterminizeFstImpl; typedef A Arc; typedef typename A::Weight Weight; typedef typename A::StateId StateId; typedef typename A::Label Label; typedef CacheState<A> State; typedef DeterminizeFstImplBase<A> Impl; using ImplToFst<Impl>::SetImpl; explicit DeterminizeFst(const Fst<A> &fst) { typedef DefaultCommonDivisor<Weight> D; typedef IdentityDeterminizeFilter<A> F; typedef DefaultDeterminizeStateTable<A> T; DeterminizeFstOptions<A, D, F, T> opts; if (fst.Properties(kAcceptor, true)) { // Calls implementation for acceptors. SetImpl(new DeterminizeFsaImpl<A, D, F, T>(fst, 0, 0, opts)); } else { // Calls implementation for transducers. SetImpl(new DeterminizeFstImpl<A, STRING_LEFT_RESTRICT, D, F, T>(fst, opts)); } } template <class D, class F, class T> DeterminizeFst(const Fst<A> &fst, const DeterminizeFstOptions<A, D, F, T> &opts) { if (fst.Properties(kAcceptor, true)) { // Calls implementation for acceptors. SetImpl(new DeterminizeFsaImpl<A, D, F, T>(fst, 0, 0, opts)); } else { // Calls implementation for transducers. SetImpl(new DeterminizeFstImpl<A, STRING_LEFT_RESTRICT, D, F, T>(fst, opts)); } } // This acceptor-only version additionally computes the distance to // final states in the output if provided with those distances for the // input. Useful for e.g. unique N-shortest paths. template <class D, class F, class T> DeterminizeFst(const Fst<A> &fst, const vector<Weight> *in_dist, vector<Weight> *out_dist, const DeterminizeFstOptions<A, D, F, T> &opts) { if (!fst.Properties(kAcceptor, true)) { FSTERROR() << "DeterminizeFst:" << " distance to final states computed for acceptors only"; GetImpl()->SetProperties(kError, kError); } SetImpl(new DeterminizeFsaImpl<A, D, F, T>(fst, in_dist, out_dist, opts)); } // See Fst<>::Copy() for doc. DeterminizeFst(const DeterminizeFst<A> &fst, bool safe = false) { if (safe) SetImpl(fst.GetImpl()->Copy()); else SetImpl(fst.GetImpl(), false); } // Get a copy of this DeterminizeFst. See Fst<>::Copy() for further doc. virtual DeterminizeFst<A> *Copy(bool safe = false) const { return new DeterminizeFst<A>(*this, safe); } virtual inline void InitStateIterator(StateIteratorData<A> *data) const; virtual void InitArcIterator(StateId s, ArcIteratorData<A> *data) const { GetImpl()->InitArcIterator(s, data); } private: // Makes visible to friends. Impl *GetImpl() const { return ImplToFst<Impl>::GetImpl(); } void operator=(const DeterminizeFst<A> &fst); // Disallow }; // Initialization of transducer determinization implementation. which // is defined after DeterminizeFst since it calls it. template <class A, StringType S, class D, class F, class T> void DeterminizeFstImpl<A, S, D, F, T>::Init(const Fst<A> &fst) { // Mapper to an acceptor. ToFst to_fst(fst, ToMapper()); // Determinizes acceptor. // This recursive call terminates since it passes the common divisor // to a private constructor. CacheOptions copts(GetCacheGc(), GetCacheLimit()); DeterminizeFstOptions<ToArc, CommonDivisor> dopts(copts, delta_); // Uses acceptor-only constructor to avoid template recursion DeterminizeFst<ToArc> det_fsa(to_fst, 0, 0, dopts); // Mapper back to transducer. FactorWeightOptions<ToArc> fopts(CacheOptions(true, 0), delta_, kFactorFinalWeights, subsequential_label_, subsequential_label_); FactorWeightFst<ToArc, FactorIterator> factored_fst(det_fsa, fopts); from_fst_ = new FromFst(factored_fst, FromMapper(subsequential_label_)); } // Specialization for DeterminizeFst. template <class A> class StateIterator< DeterminizeFst<A> > : public CacheStateIterator< DeterminizeFst<A> > { public: explicit StateIterator(const DeterminizeFst<A> &fst) : CacheStateIterator< DeterminizeFst<A> >(fst, fst.GetImpl()) {} }; // Specialization for DeterminizeFst. template <class A> class ArcIterator< DeterminizeFst<A> > : public CacheArcIterator< DeterminizeFst<A> > { public: typedef typename A::StateId StateId; ArcIterator(const DeterminizeFst<A> &fst, StateId s) : CacheArcIterator< DeterminizeFst<A> >(fst.GetImpl(), s) { if (!fst.GetImpl()->HasArcs(s)) fst.GetImpl()->Expand(s); } private: DISALLOW_COPY_AND_ASSIGN(ArcIterator); }; template <class A> inline void DeterminizeFst<A>::InitStateIterator(StateIteratorData<A> *data) const { data->base = new StateIterator< DeterminizeFst<A> >(*this); } // Useful aliases when using StdArc. typedef DeterminizeFst<StdArc> StdDeterminizeFst; template <class Arc> struct DeterminizeOptions { typedef typename Arc::StateId StateId; typedef typename Arc::Weight Weight; typedef typename Arc::Label Label; float delta; // Quantization delta for subset weights. Weight weight_threshold; // Pruning weight threshold. StateId state_threshold; // Pruning state threshold. Label subsequential_label; // Label used for residual final output // when producing subsequential transducers. explicit DeterminizeOptions(float d = kDelta, Weight w = Weight::Zero(), StateId n = kNoStateId, Label l = 0) : delta(d), weight_threshold(w), state_threshold(n), subsequential_label(l) {} }; // Determinizes a weighted transducer. This version writes the // determinized Fst to an output MutableFst. The result will be an // equivalent FST that has the property that no state has two // transitions with the same input label. For this algorithm, epsilon // transitions are treated as regular symbols (cf. RmEpsilon). // // The transducer must be functional. The weights must be (weakly) // left divisible (valid for TropicalWeight and LogWeight). // // Complexity: // - Determinizable: exponential (polynomial in the size of the output) // - Non-determinizable: does not terminate // // The determinizable automata include all unweighted and all acyclic input. // // References: // - Mehryar Mohri, "Finite-State Transducers in Language and Speech // Processing". Computational Linguistics, 23:2, 1997. template <class Arc> void Determinize(const Fst<Arc> &ifst, MutableFst<Arc> *ofst, const DeterminizeOptions<Arc> &opts = DeterminizeOptions<Arc>()) { typedef typename Arc::StateId StateId; typedef typename Arc::Weight Weight; DeterminizeFstOptions<Arc> nopts; nopts.delta = opts.delta; nopts.subsequential_label = opts.subsequential_label; nopts.gc_limit = 0; // Cache only the last state for fastest copy. if (opts.weight_threshold != Weight::Zero() || opts.state_threshold != kNoStateId) { if (ifst.Properties(kAcceptor, false)) { vector<Weight> idistance, odistance; ShortestDistance(ifst, &idistance, true); DeterminizeFst<Arc> dfst(ifst, &idistance, &odistance, nopts); PruneOptions< Arc, AnyArcFilter<Arc> > popts(opts.weight_threshold, opts.state_threshold, AnyArcFilter<Arc>(), &odistance); Prune(dfst, ofst, popts); } else { *ofst = DeterminizeFst<Arc>(ifst, nopts); Prune(ofst, opts.weight_threshold, opts.state_threshold); } } else { *ofst = DeterminizeFst<Arc>(ifst, nopts); } } } // namespace fst #endif // FST_LIB_DETERMINIZE_H__