Home | History | Annotate | Download | only in fst
      1 // determinize.h
      2 
      3 
      4 // Licensed under the Apache License, Version 2.0 (the "License");
      5 // you may not use this file except in compliance with the License.
      6 // You may obtain a copy of the License at
      7 //
      8 //     http://www.apache.org/licenses/LICENSE-2.0
      9 //
     10 // Unless required by applicable law or agreed to in writing, software
     11 // distributed under the License is distributed on an "AS IS" BASIS,
     12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 // See the License for the specific language governing permissions and
     14 // limitations under the License.
     15 //
     16 // Copyright 2005-2010 Google, Inc.
     17 // Author: riley (at) google.com (Michael Riley)
     18 //
     19 // \file
     20 // Functions and classes to determinize an FST.
     21 
     22 #ifndef FST_LIB_DETERMINIZE_H__
     23 #define FST_LIB_DETERMINIZE_H__
     24 
     25 #include <algorithm>
     26 #include <climits>
     27 #include <tr1/unordered_map>
     28 using std::tr1::unordered_map;
     29 using std::tr1::unordered_multimap;
     30 #include <map>
     31 #include <fst/slist.h>
     32 #include <string>
     33 #include <vector>
     34 using std::vector;
     35 
     36 #include <fst/arc-map.h>
     37 #include <fst/cache.h>
     38 #include <fst/bi-table.h>
     39 #include <fst/factor-weight.h>
     40 #include <fst/prune.h>
     41 #include <fst/test-properties.h>
     42 
     43 
     44 namespace fst {
     45 
     46 //
     47 // COMMON DIVISORS - these are used in determinization to compute
     48 // the transition weights. In the simplest case, it is just the same
     49 // as the semiring Plus(). However, other choices permit more efficient
     50 // determinization when the output contains strings.
     51 //
     52 
     53 // The default common divisor uses the semiring Plus.
     54 template <class W>
     55 class DefaultCommonDivisor {
     56  public:
     57   typedef W Weight;
     58 
     59   W operator()(const W &w1, const W &w2) const { return Plus(w1, w2); }
     60 };
     61 
     62 
     63 // The label common divisor for a (left) string semiring selects a
     64 // single letter common prefix or the empty string. This is used in
     65 // the determinization of output strings so that at most a single
     66 // letter will appear in the output of a transtion.
     67 template <typename L, StringType S>
     68 class LabelCommonDivisor {
     69  public:
     70   typedef StringWeight<L, S> Weight;
     71 
     72   Weight operator()(const Weight &w1, const Weight &w2) const {
     73     StringWeightIterator<L, S> iter1(w1);
     74     StringWeightIterator<L, S> iter2(w2);
     75 
     76     if (!(StringWeight<L, S>::Properties() & kLeftSemiring)) {
     77       FSTERROR() << "LabelCommonDivisor: Weight needs to be left semiring";
     78       return Weight::NoWeight();
     79     } else if (w1.Size() == 0 || w2.Size() == 0) {
     80       return Weight::One();
     81     } else if (w1 == Weight::Zero()) {
     82       return Weight(iter2.Value());
     83     } else if (w2 == Weight::Zero()) {
     84       return Weight(iter1.Value());
     85     } else if (iter1.Value() == iter2.Value()) {
     86       return Weight(iter1.Value());
     87     } else {
     88       return Weight::One();
     89     }
     90   }
     91 };
     92 
     93 
     94 // The gallic common divisor uses the label common divisor on the
     95 // string component and the template argument D common divisor on the
     96 // weight component, which defaults to the default common divisor.
     97 template <class L, class W, StringType S, class D = DefaultCommonDivisor<W> >
     98 class GallicCommonDivisor {
     99  public:
    100   typedef GallicWeight<L, W, S> Weight;
    101 
    102   Weight operator()(const Weight &w1, const Weight &w2) const {
    103     return Weight(label_common_divisor_(w1.Value1(), w2.Value1()),
    104                   weight_common_divisor_(w1.Value2(), w2.Value2()));
    105   }
    106 
    107  private:
    108   LabelCommonDivisor<L, S> label_common_divisor_;
    109   D weight_common_divisor_;
    110 };
    111 
    112 
    113 // Represents an element in a subset
    114 template <class A>
    115 struct DeterminizeElement {
    116   typedef typename A::StateId StateId;
    117   typedef typename A::Weight Weight;
    118 
    119   DeterminizeElement() {}
    120 
    121   DeterminizeElement(StateId s, Weight w) : state_id(s), weight(w) {}
    122 
    123   bool operator==(const DeterminizeElement<A> & element) const {
    124     return state_id == element.state_id && weight == element.weight;
    125   }
    126 
    127   bool operator<(const DeterminizeElement<A> & element) const {
    128     return state_id < element.state_id ||
    129         (state_id == element.state_id && weight == element.weight);
    130   }
    131 
    132   StateId state_id;  // Input state Id
    133   Weight weight;     // Residual weight
    134 };
    135 
    136 
    137 //
    138 // DETERMINIZE FILTERS - these can be used in determinization to compute
    139 // transformations on the subsets prior to their being added as destination
    140 // states. The filter operates on a map between a label and the
    141 // corresponding destination subsets. The possibly modified map is
    142 // then used to construct the destination states for arcs exiting state 's'.
    143 // It must define the ordered map type LabelMap and have a default
    144 // and copy constructor.
    145 
    146 // A determinize filter that does not modify its input.
    147 template <class Arc>
    148 struct IdentityDeterminizeFilter {
    149   typedef typename Arc::StateId StateId;
    150   typedef typename Arc::Label Label;
    151   typedef slist< DeterminizeElement<Arc> > Subset;
    152   typedef map<Label, Subset*> LabelMap;
    153 
    154   static uint64 Properties(uint64 props) { return props; }
    155 
    156   void operator()(StateId s, LabelMap *label_map) {}
    157 };
    158 
    159 
    160 //
    161 // DETERMINIZATION STATE TABLES
    162 //
    163 // The determiziation state table has the form:
    164 //
    165 // template <class Arc>
    166 // class DeterminizeStateTable {
    167 //  public:
    168 //   typedef typename Arc::StateId StateId;
    169 //   typedef DeterminizeElement<Arc> Element;
    170 //   typedef slist<Element> Subset;
    171 //
    172 //   // Required constuctor
    173 //   DeterminizeStateTable();
    174 //
    175 //   // Required copy constructor that does not copy state
    176 //   DeterminizeStateTable(const DeterminizeStateTable<A,P> &table);
    177 //
    178 //   // Lookup state ID by subset (not depending of the element order).
    179 //   // If it doesn't exist, then add it.  FindState takes
    180 //   // ownership of the subset argument (so that it doesn't have to
    181 //   // copy it if it creates a new state).
    182 //   StateId FindState(Subset *subset);
    183 //
    184 //   // Lookup subset by ID.
    185 //   const Subset *FindSubset(StateId id) const;
    186 // };
    187 //
    188 
    189 // The default determinization state table based on the
    190 // compact hash bi-table.
    191 template <class Arc>
    192 class DefaultDeterminizeStateTable {
    193  public:
    194   typedef typename Arc::StateId StateId;
    195   typedef typename Arc::Label Label;
    196   typedef typename Arc::Weight Weight;
    197   typedef DeterminizeElement<Arc> Element;
    198   typedef slist<Element> Subset;
    199 
    200   explicit DefaultDeterminizeStateTable(size_t table_size = 0)
    201       : table_size_(table_size),
    202         subsets_(table_size_, new SubsetKey(), new SubsetEqual(&elements_)) { }
    203 
    204   DefaultDeterminizeStateTable(const DefaultDeterminizeStateTable<Arc> &table)
    205       : table_size_(table.table_size_),
    206         subsets_(table_size_, new SubsetKey(), new SubsetEqual(&elements_)) { }
    207 
    208   ~DefaultDeterminizeStateTable() {
    209     for (StateId s = 0; s < subsets_.Size(); ++s)
    210       delete subsets_.FindEntry(s);
    211   }
    212 
    213   // Finds the state corresponding to a subset. Only creates a new
    214   // state if the subset is not found. FindState takes ownership of
    215   // the subset argument (so that it doesn't have to copy it if it
    216   // creates a new state).
    217   StateId FindState(Subset *subset) {
    218     StateId ns = subsets_.Size();
    219     StateId s = subsets_.FindId(subset);
    220     if (s != ns) delete subset;  // subset found
    221     return s;
    222   }
    223 
    224   const Subset* FindSubset(StateId s) { return subsets_.FindEntry(s); }
    225 
    226  private:
    227   // Comparison object for hashing Subset(s). Subsets are not sorted in this
    228   // implementation, so ordering must not be assumed in the equivalence
    229   // test.
    230   class SubsetEqual {
    231    public:
    232     SubsetEqual() {  // needed for compilation but should never be called
    233       FSTERROR() << "SubsetEqual: default constructor not implemented";
    234     }
    235 
    236     // Constructor takes vector needed to check equality. See immediately
    237     // below for constraints on it.
    238     explicit SubsetEqual(vector<Element *> *elements)
    239         : elements_(elements) {}
    240 
    241     // At each call to operator(), the elements_ vector should contain
    242     // only NULLs. When this operator returns, elements_ will still
    243     // have this property.
    244     bool operator()(Subset* subset1, Subset* subset2) const {
    245       if (!subset1 && !subset2)
    246         return true;
    247       if ((subset1 && !subset2) || (!subset1 && subset2))
    248         return false;
    249 
    250       if (subset1->size() != subset2->size())
    251         return false;
    252 
    253       // Loads first subset elements in element vector.
    254       for (typename Subset::iterator iter1 = subset1->begin();
    255            iter1 != subset1->end();
    256            ++iter1) {
    257         Element &element1 = *iter1;
    258         while (elements_->size() <= element1.state_id)
    259           elements_->push_back(0);
    260         (*elements_)[element1.state_id] = &element1;
    261       }
    262 
    263       // Checks second subset matches first via element vector.
    264       for (typename Subset::iterator iter2 = subset2->begin();
    265            iter2 != subset2->end();
    266            ++iter2) {
    267         Element &element2 = *iter2;
    268         while (elements_->size() <= element2.state_id)
    269           elements_->push_back(0);
    270         Element *element1 = (*elements_)[element2.state_id];
    271         if (!element1 || element1->weight != element2.weight) {
    272           // Mismatch found. Resets element vector before returning false.
    273           for (typename Subset::iterator iter1 = subset1->begin();
    274                iter1 != subset1->end();
    275                ++iter1)
    276             (*elements_)[iter1->state_id] = 0;
    277           return false;
    278         } else {
    279           (*elements_)[element2.state_id] = 0;  // Clears entry
    280         }
    281       }
    282       return true;
    283     }
    284    private:
    285     vector<Element *> *elements_;
    286   };
    287 
    288   // Hash function for Subset to Fst states. Subset elements are not
    289   // sorted in this implementation, so the hash must be invariant
    290   // under subset reordering.
    291   class SubsetKey {
    292    public:
    293     size_t operator()(const Subset* subset) const {
    294       size_t hash = 0;
    295       if (subset) {
    296         for (typename Subset::const_iterator iter = subset->begin();
    297              iter != subset->end();
    298              ++iter) {
    299           const Element &element = *iter;
    300           int lshift = element.state_id % (CHAR_BIT * sizeof(size_t) - 1) + 1;
    301           int rshift = CHAR_BIT * sizeof(size_t) - lshift;
    302           size_t n = element.state_id;
    303           hash ^= n << lshift ^ n >> rshift ^ element.weight.Hash();
    304         }
    305       }
    306       return hash;
    307     }
    308   };
    309 
    310   size_t table_size_;
    311 
    312   typedef CompactHashBiTable<StateId, Subset *,
    313                              SubsetKey, SubsetEqual, HS_STL>  SubsetTable;
    314 
    315   SubsetTable subsets_;
    316   vector<Element *> elements_;
    317 
    318   void operator=(const DefaultDeterminizeStateTable<Arc> &);  // disallow
    319 };
    320 
    321 // Options for finite-state transducer determinization templated on
    322 // the arc type, common divisor, the determinization filter and the
    323 // state table.  DeterminizeFst takes ownership of the determinization
    324 // filter and state table if provided.
    325 template <class Arc,
    326           class D = DefaultCommonDivisor<typename Arc::Weight>,
    327           class F = IdentityDeterminizeFilter<Arc>,
    328           class T = DefaultDeterminizeStateTable<Arc> >
    329 struct DeterminizeFstOptions : CacheOptions {
    330   typedef typename Arc::Label Label;
    331   float delta;                // Quantization delta for subset weights
    332   Label subsequential_label;  // Label used for residual final output
    333                               // when producing subsequential transducers.
    334   F *filter;                  // Determinization filter
    335   T *state_table;             // Determinization state table
    336 
    337   explicit DeterminizeFstOptions(const CacheOptions &opts,
    338                                  float del = kDelta, Label lab = 0,
    339                                  F *filt = 0,
    340                                  T *table = 0)
    341       : CacheOptions(opts), delta(del), subsequential_label(lab),
    342         filter(filt), state_table(table) {}
    343 
    344   explicit DeterminizeFstOptions(float del = kDelta, Label lab = 0,
    345                                  F *filt = 0, T *table = 0)
    346       : delta(del), subsequential_label(lab), filter(filt),
    347         state_table(table) {}
    348 };
    349 
    350 // Implementation of delayed DeterminizeFst. This base class is
    351 // common to the variants that implement acceptor and transducer
    352 // determinization.
    353 template <class A>
    354 class DeterminizeFstImplBase : public CacheImpl<A> {
    355  public:
    356   using FstImpl<A>::SetType;
    357   using FstImpl<A>::SetProperties;
    358   using FstImpl<A>::Properties;
    359   using FstImpl<A>::SetInputSymbols;
    360   using FstImpl<A>::SetOutputSymbols;
    361 
    362   using CacheBaseImpl< CacheState<A> >::HasStart;
    363   using CacheBaseImpl< CacheState<A> >::HasFinal;
    364   using CacheBaseImpl< CacheState<A> >::HasArcs;
    365   using CacheBaseImpl< CacheState<A> >::SetFinal;
    366   using CacheBaseImpl< CacheState<A> >::SetStart;
    367 
    368   typedef typename A::Label Label;
    369   typedef typename A::Weight Weight;
    370   typedef typename A::StateId StateId;
    371   typedef CacheState<A> State;
    372 
    373   template <class D, class F, class T>
    374   DeterminizeFstImplBase(const Fst<A> &fst,
    375                          const DeterminizeFstOptions<A, D, F, T> &opts)
    376       : CacheImpl<A>(opts), fst_(fst.Copy()) {
    377     SetType("determinize");
    378     uint64 iprops = fst.Properties(kFstProperties, false);
    379     uint64 dprops = DeterminizeProperties(iprops,
    380                                           opts.subsequential_label != 0);
    381     SetProperties(F::Properties(dprops), kCopyProperties);
    382     SetInputSymbols(fst.InputSymbols());
    383     SetOutputSymbols(fst.OutputSymbols());
    384   }
    385 
    386   DeterminizeFstImplBase(const DeterminizeFstImplBase<A> &impl)
    387       : CacheImpl<A>(impl),
    388         fst_(impl.fst_->Copy(true)) {
    389     SetType("determinize");
    390     SetProperties(impl.Properties(), kCopyProperties);
    391     SetInputSymbols(impl.InputSymbols());
    392     SetOutputSymbols(impl.OutputSymbols());
    393   }
    394 
    395   virtual ~DeterminizeFstImplBase() { delete fst_; }
    396 
    397   virtual DeterminizeFstImplBase<A> *Copy() = 0;
    398 
    399   StateId Start() {
    400     if (!HasStart()) {
    401       StateId start = ComputeStart();
    402       if (start != kNoStateId) {
    403         SetStart(start);
    404       }
    405     }
    406     return CacheImpl<A>::Start();
    407   }
    408 
    409   Weight Final(StateId s) {
    410     if (!HasFinal(s)) {
    411       Weight final = ComputeFinal(s);
    412       SetFinal(s, final);
    413     }
    414     return CacheImpl<A>::Final(s);
    415   }
    416 
    417   virtual void Expand(StateId s) = 0;
    418 
    419   size_t NumArcs(StateId s) {
    420     if (!HasArcs(s))
    421       Expand(s);
    422     return CacheImpl<A>::NumArcs(s);
    423   }
    424 
    425   size_t NumInputEpsilons(StateId s) {
    426     if (!HasArcs(s))
    427       Expand(s);
    428     return CacheImpl<A>::NumInputEpsilons(s);
    429   }
    430 
    431   size_t NumOutputEpsilons(StateId s) {
    432     if (!HasArcs(s))
    433       Expand(s);
    434     return CacheImpl<A>::NumOutputEpsilons(s);
    435   }
    436 
    437   void InitArcIterator(StateId s, ArcIteratorData<A> *data) {
    438     if (!HasArcs(s))
    439       Expand(s);
    440     CacheImpl<A>::InitArcIterator(s, data);
    441   }
    442 
    443   virtual StateId ComputeStart() = 0;
    444 
    445   virtual Weight ComputeFinal(StateId s) = 0;
    446 
    447   const Fst<A> &GetFst() const { return *fst_; }
    448 
    449  private:
    450   const Fst<A> *fst_;            // Input Fst
    451 
    452   void operator=(const DeterminizeFstImplBase<A> &);  // disallow
    453 };
    454 
    455 
    456 // Implementation of delayed determinization for weighted acceptors.
    457 // It is templated on the arc type A and the common divisor D.
    458 template <class A, class D, class F, class T>
    459 class DeterminizeFsaImpl : public DeterminizeFstImplBase<A> {
    460  public:
    461   using FstImpl<A>::SetProperties;
    462   using DeterminizeFstImplBase<A>::GetFst;
    463   using DeterminizeFstImplBase<A>::SetArcs;
    464 
    465   typedef typename A::Label Label;
    466   typedef typename A::Weight Weight;
    467   typedef typename A::StateId StateId;
    468   typedef DeterminizeElement<A> Element;
    469   typedef slist<Element> Subset;
    470   typedef typename F::LabelMap LabelMap;
    471 
    472   DeterminizeFsaImpl(const Fst<A> &fst,
    473                      const vector<Weight> *in_dist, vector<Weight> *out_dist,
    474                      const DeterminizeFstOptions<A, D, F, T> &opts)
    475       : DeterminizeFstImplBase<A>(fst, opts),
    476         delta_(opts.delta),
    477         in_dist_(in_dist),
    478         out_dist_(out_dist),
    479         filter_(opts.filter ? opts.filter : new F()),
    480         state_table_(opts.state_table ? opts.state_table : new T()) {
    481     if (!fst.Properties(kAcceptor, true)) {
    482       FSTERROR()  << "DeterminizeFst: argument not an acceptor";
    483       SetProperties(kError, kError);
    484     }
    485     if (!(Weight::Properties() & kLeftSemiring)) {
    486       FSTERROR() << "DeterminizeFst: Weight needs to be left distributive: "
    487                  << Weight::Type();
    488       SetProperties(kError, kError);
    489     }
    490     if (out_dist_)
    491       out_dist_->clear();
    492   }
    493 
    494   DeterminizeFsaImpl(const DeterminizeFsaImpl<A, D, F, T> &impl)
    495       : DeterminizeFstImplBase<A>(impl),
    496         delta_(impl.delta_),
    497         in_dist_(0),
    498         out_dist_(0),
    499         filter_(new F(*impl.filter_)),
    500         state_table_(new T(*impl.state_table_)) {
    501     if (impl.out_dist_) {
    502       FSTERROR() << "DeterminizeFsaImpl: cannot copy with out_dist vector";
    503       SetProperties(kError, kError);
    504     }
    505   }
    506 
    507   virtual ~DeterminizeFsaImpl() {
    508     delete filter_;
    509     delete state_table_;
    510   }
    511 
    512   virtual DeterminizeFsaImpl<A, D, F, T> *Copy() {
    513     return new DeterminizeFsaImpl<A, D, F, T>(*this);
    514   }
    515 
    516   uint64 Properties() const { return Properties(kFstProperties); }
    517 
    518   // Set error if found; return FST impl properties.
    519   uint64 Properties(uint64 mask) const {
    520     if ((mask & kError) && (GetFst().Properties(kError, false)))
    521       SetProperties(kError, kError);
    522     return FstImpl<A>::Properties(mask);
    523   }
    524 
    525   virtual StateId ComputeStart() {
    526     StateId s = GetFst().Start();
    527     if (s == kNoStateId)
    528       return kNoStateId;
    529     Element element(s, Weight::One());
    530     Subset *subset = new Subset;
    531     subset->push_front(element);
    532     return FindState(subset);
    533   }
    534 
    535   virtual Weight ComputeFinal(StateId s) {
    536     const Subset *subset = state_table_->FindSubset(s);
    537     Weight final = Weight::Zero();
    538     for (typename Subset::const_iterator siter = subset->begin();
    539          siter != subset->end();
    540          ++siter) {
    541       const Element &element = *siter;
    542       final = Plus(final, Times(element.weight,
    543                                 GetFst().Final(element.state_id)));
    544       if (!final.Member())
    545         SetProperties(kError, kError);
    546     }
    547     return final;
    548   }
    549 
    550   StateId FindState(Subset *subset) {
    551     StateId s = state_table_->FindState(subset);
    552     if (in_dist_ && out_dist_->size() <= s)
    553       out_dist_->push_back(ComputeDistance(subset));
    554     return s;
    555   }
    556 
    557   // Compute distance from a state to the final states in the DFA
    558   // given the distances in the NFA.
    559   Weight ComputeDistance(const Subset *subset) {
    560     Weight outd = Weight::Zero();
    561     for (typename Subset::const_iterator siter = subset->begin();
    562          siter != subset->end(); ++siter) {
    563       const Element &element = *siter;
    564       Weight ind = element.state_id < in_dist_->size() ?
    565           (*in_dist_)[element.state_id] : Weight::Zero();
    566       outd = Plus(outd, Times(element.weight, ind));
    567     }
    568     return outd;
    569   }
    570 
    571   // Computes the outgoing transitions from a state, creating new destination
    572   // states as needed.
    573   virtual void Expand(StateId s) {
    574 
    575     LabelMap label_map;
    576     LabelSubsets(s, &label_map);
    577 
    578     for (typename LabelMap::iterator liter = label_map.begin();
    579          liter != label_map.end();
    580          ++liter)
    581       AddArc(s, liter->first, liter->second);
    582     SetArcs(s);
    583   }
    584 
    585  private:
    586   // Constructs destination subsets per label. At return, subset
    587   // element weights include the input automaton label weights and the
    588   // subsets may contain duplicate states.
    589   void LabelSubsets(StateId s, LabelMap *label_map) {
    590     const Subset *src_subset = state_table_->FindSubset(s);
    591 
    592     for (typename Subset::const_iterator siter = src_subset->begin();
    593          siter != src_subset->end();
    594          ++siter) {
    595       const Element &src_element = *siter;
    596       for (ArcIterator< Fst<A> > aiter(GetFst(), src_element.state_id);
    597            !aiter.Done();
    598            aiter.Next()) {
    599         const A &arc = aiter.Value();
    600         Element dest_element(arc.nextstate,
    601                              Times(src_element.weight, arc.weight));
    602 
    603         // The LabelMap may be a e.g. multimap with more complex
    604         // determinization filters, so we insert efficiently w/o using [].
    605         typename LabelMap::iterator liter = label_map->lower_bound(arc.ilabel);
    606         Subset* dest_subset;
    607         if (liter == label_map->end() || liter->first != arc.ilabel) {
    608           dest_subset = new Subset;
    609           label_map->insert(liter, make_pair(arc.ilabel, dest_subset));
    610         } else {
    611           dest_subset = liter->second;
    612         }
    613 
    614         dest_subset->push_front(dest_element);
    615       }
    616     }
    617     // Applies the determinization filter
    618     (*filter_)(s, label_map);
    619   }
    620 
    621   // Adds an arc from state S to the destination state associated
    622   // with subset DEST_SUBSET (as created by LabelSubsets).
    623   void AddArc(StateId s, Label label, Subset *dest_subset) {
    624     A arc;
    625     arc.ilabel = label;
    626     arc.olabel = label;
    627     arc.weight = Weight::Zero();
    628 
    629     typename Subset::iterator oiter;
    630     for (typename Subset::iterator diter = dest_subset->begin();
    631          diter != dest_subset->end();) {
    632       Element &dest_element = *diter;
    633       // Computes label weight.
    634       arc.weight = common_divisor_(arc.weight, dest_element.weight);
    635 
    636       while (elements_.size() <= dest_element.state_id)
    637         elements_.push_back(0);
    638       Element *matching_element = elements_[dest_element.state_id];
    639       if (matching_element) {
    640         // Found duplicate state: sums state weight and deletes dup.
    641         matching_element->weight = Plus(matching_element->weight,
    642                                         dest_element.weight);
    643         if (!matching_element->weight.Member())
    644           SetProperties(kError, kError);
    645         ++diter;
    646         dest_subset->erase_after(oiter);
    647       } else {
    648         // Saves element so we can check for duplicate for this state.
    649         elements_[dest_element.state_id] = &dest_element;
    650         oiter = diter;
    651         ++diter;
    652       }
    653     }
    654 
    655     // Divides out label weight from destination subset elements.
    656     // Quantizes to ensure comparisons are effective.
    657     // Clears element vector.
    658     for (typename Subset::iterator diter = dest_subset->begin();
    659          diter != dest_subset->end();
    660          ++diter) {
    661       Element &dest_element = *diter;
    662       dest_element.weight = Divide(dest_element.weight, arc.weight,
    663                                    DIVIDE_LEFT);
    664       dest_element.weight = dest_element.weight.Quantize(delta_);
    665       elements_[dest_element.state_id] = 0;
    666     }
    667 
    668     arc.nextstate = FindState(dest_subset);
    669     CacheImpl<A>::PushArc(s, arc);
    670   }
    671 
    672   float delta_;                    // Quantization delta for subset weights
    673   const vector<Weight> *in_dist_;  // Distance to final NFA states
    674   vector<Weight> *out_dist_;       // Distance to final DFA states
    675 
    676   D common_divisor_;
    677   F *filter_;
    678   T *state_table_;
    679 
    680   vector<Element *> elements_;
    681 
    682   void operator=(const DeterminizeFsaImpl<A, D, F, T> &);  // disallow
    683 };
    684 
    685 
    686 // Implementation of delayed determinization for transducers.
    687 // Transducer determinization is implemented by mapping the input to
    688 // the Gallic semiring as an acceptor whose weights contain the output
    689 // strings and using acceptor determinization above to determinize
    690 // that acceptor.
    691 template <class A, StringType S, class D, class F, class T>
    692 class DeterminizeFstImpl : public DeterminizeFstImplBase<A> {
    693  public:
    694   using FstImpl<A>::SetProperties;
    695   using DeterminizeFstImplBase<A>::GetFst;
    696   using CacheBaseImpl< CacheState<A> >::GetCacheGc;
    697   using CacheBaseImpl< CacheState<A> >::GetCacheLimit;
    698 
    699   typedef typename A::Label Label;
    700   typedef typename A::Weight Weight;
    701   typedef typename A::StateId StateId;
    702 
    703   typedef ToGallicMapper<A, S> ToMapper;
    704   typedef FromGallicMapper<A, S> FromMapper;
    705 
    706   typedef typename ToMapper::ToArc ToArc;
    707   typedef ArcMapFst<A, ToArc, ToMapper> ToFst;
    708   typedef ArcMapFst<ToArc, A, FromMapper> FromFst;
    709 
    710   typedef GallicCommonDivisor<Label, Weight, S, D> CommonDivisor;
    711   typedef GallicFactor<Label, Weight, S> FactorIterator;
    712 
    713   DeterminizeFstImpl(const Fst<A> &fst,
    714                      const DeterminizeFstOptions<A, D, F, T> &opts)
    715       : DeterminizeFstImplBase<A>(fst, opts),
    716         delta_(opts.delta),
    717         subsequential_label_(opts.subsequential_label) {
    718      Init(GetFst());
    719   }
    720 
    721   DeterminizeFstImpl(const DeterminizeFstImpl<A, S, D, F, T> &impl)
    722       : DeterminizeFstImplBase<A>(impl),
    723         delta_(impl.delta_),
    724         subsequential_label_(impl.subsequential_label_) {
    725     Init(GetFst());
    726   }
    727 
    728   ~DeterminizeFstImpl() { delete from_fst_; }
    729 
    730   virtual DeterminizeFstImpl<A, S, D, F, T> *Copy() {
    731     return new DeterminizeFstImpl<A, S, D, F, T>(*this);
    732   }
    733 
    734   uint64 Properties() const { return Properties(kFstProperties); }
    735 
    736   // Set error if found; return FST impl properties.
    737   uint64 Properties(uint64 mask) const {
    738     if ((mask & kError) && (GetFst().Properties(kError, false) ||
    739                             from_fst_->Properties(kError, false)))
    740       SetProperties(kError, kError);
    741     return FstImpl<A>::Properties(mask);
    742   }
    743 
    744   virtual StateId ComputeStart() { return from_fst_->Start(); }
    745 
    746   virtual Weight ComputeFinal(StateId s) { return from_fst_->Final(s); }
    747 
    748   virtual void Expand(StateId s) {
    749     for (ArcIterator<FromFst> aiter(*from_fst_, s);
    750          !aiter.Done();
    751          aiter.Next())
    752       CacheImpl<A>::PushArc(s, aiter.Value());
    753     CacheImpl<A>::SetArcs(s);
    754   }
    755 
    756  private:
    757   // Initialization of transducer determinization implementation, which
    758   // is defined after DeterminizeFst since it calls it.
    759   void Init(const Fst<A> &fst);
    760 
    761   float delta_;
    762   Label subsequential_label_;
    763   FromFst *from_fst_;
    764 
    765   void operator=(const DeterminizeFstImpl<A, S, D, F, T> &);  // disallow
    766 };
    767 
    768 
    769 // Determinizes a weighted transducer. This version is a delayed
    770 // Fst. The result will be an equivalent FST that has the property
    771 // that no state has two transitions with the same input label.
    772 // For this algorithm, epsilon transitions are treated as regular
    773 // symbols (cf. RmEpsilon).
    774 //
    775 // The transducer must be functional. The weights must be (weakly)
    776 // left divisible (valid for TropicalWeight and LogWeight for instance)
    777 // and be zero-sum-free if for all a,b: (Plus(a, b) = 0 => a = b = 0.
    778 //
    779 // Complexity:
    780 // - Determinizable: exponential (polynomial in the size of the output)
    781 // - Non-determinizable) does not terminate
    782 //
    783 // The determinizable automata include all unweighted and all acyclic input.
    784 //
    785 // References:
    786 // - Mehryar Mohri, "Finite-State Transducers in Language and Speech
    787 //   Processing". Computational Linguistics, 23:2, 1997.
    788 //
    789 // This class attaches interface to implementation and handles
    790 // reference counting, delegating most methods to ImplToFst.
    791 template <class A>
    792 class DeterminizeFst : public ImplToFst< DeterminizeFstImplBase<A> >  {
    793  public:
    794   friend class ArcIterator< DeterminizeFst<A> >;
    795   friend class StateIterator< DeterminizeFst<A> >;
    796   template <class B, StringType S, class D, class F, class T>
    797   friend class DeterminizeFstImpl;
    798 
    799   typedef A Arc;
    800   typedef typename A::Weight Weight;
    801   typedef typename A::StateId StateId;
    802   typedef typename A::Label Label;
    803   typedef CacheState<A> State;
    804   typedef DeterminizeFstImplBase<A> Impl;
    805 
    806   using ImplToFst<Impl>::SetImpl;
    807 
    808   explicit DeterminizeFst(const Fst<A> &fst) {
    809     typedef DefaultCommonDivisor<Weight> D;
    810     typedef IdentityDeterminizeFilter<A> F;
    811     typedef DefaultDeterminizeStateTable<A> T;
    812     DeterminizeFstOptions<A, D, F, T> opts;
    813     if (fst.Properties(kAcceptor, true)) {
    814       // Calls implementation for acceptors.
    815       SetImpl(new DeterminizeFsaImpl<A, D, F, T>(fst, 0, 0, opts));
    816     } else {
    817       // Calls implementation for transducers.
    818       SetImpl(new
    819               DeterminizeFstImpl<A, STRING_LEFT_RESTRICT, D, F, T>(fst, opts));
    820     }
    821   }
    822 
    823   template <class D, class F, class T>
    824   DeterminizeFst(const Fst<A> &fst,
    825                  const DeterminizeFstOptions<A, D, F, T> &opts) {
    826     if (fst.Properties(kAcceptor, true)) {
    827       // Calls implementation for acceptors.
    828       SetImpl(new DeterminizeFsaImpl<A, D, F, T>(fst, 0, 0, opts));
    829     } else {
    830       // Calls implementation for transducers.
    831       SetImpl(new
    832               DeterminizeFstImpl<A, STRING_LEFT_RESTRICT, D, F, T>(fst, opts));
    833     }
    834   }
    835 
    836   // This acceptor-only version additionally computes the distance to
    837   // final states in the output if provided with those distances for the
    838   // input. Useful for e.g. unique N-shortest paths.
    839   template <class D, class F, class T>
    840   DeterminizeFst(const Fst<A> &fst,
    841                  const vector<Weight> *in_dist, vector<Weight> *out_dist,
    842                  const DeterminizeFstOptions<A, D, F, T> &opts) {
    843     if (!fst.Properties(kAcceptor, true)) {
    844       FSTERROR() << "DeterminizeFst:"
    845                  << " distance to final states computed for acceptors only";
    846       GetImpl()->SetProperties(kError, kError);
    847     }
    848     SetImpl(new DeterminizeFsaImpl<A, D, F, T>(fst, in_dist, out_dist, opts));
    849   }
    850 
    851   // See Fst<>::Copy() for doc.
    852   DeterminizeFst(const DeterminizeFst<A> &fst, bool safe = false) {
    853     if (safe)
    854       SetImpl(fst.GetImpl()->Copy());
    855     else
    856       SetImpl(fst.GetImpl(), false);
    857   }
    858 
    859   // Get a copy of this DeterminizeFst. See Fst<>::Copy() for further doc.
    860   virtual DeterminizeFst<A> *Copy(bool safe = false) const {
    861     return new DeterminizeFst<A>(*this, safe);
    862   }
    863 
    864   virtual inline void InitStateIterator(StateIteratorData<A> *data) const;
    865 
    866   virtual void InitArcIterator(StateId s, ArcIteratorData<A> *data) const {
    867     GetImpl()->InitArcIterator(s, data);
    868   }
    869 
    870  private:
    871   // Makes visible to friends.
    872   Impl *GetImpl() const { return ImplToFst<Impl>::GetImpl(); }
    873 
    874   void operator=(const DeterminizeFst<A> &fst);  // Disallow
    875 };
    876 
    877 
    878 // Initialization of transducer determinization implementation. which
    879 // is defined after DeterminizeFst since it calls it.
    880 template <class A, StringType S, class D, class F, class T>
    881 void DeterminizeFstImpl<A, S, D, F, T>::Init(const Fst<A> &fst) {
    882   // Mapper to an acceptor.
    883   ToFst to_fst(fst, ToMapper());
    884 
    885   // Determinizes acceptor.
    886   // This recursive call terminates since it passes the common divisor
    887   // to a private constructor.
    888   CacheOptions copts(GetCacheGc(), GetCacheLimit());
    889   DeterminizeFstOptions<ToArc, CommonDivisor> dopts(copts, delta_);
    890   // Uses acceptor-only constructor to avoid template recursion
    891   DeterminizeFst<ToArc> det_fsa(to_fst, 0, 0, dopts);
    892 
    893   // Mapper back to transducer.
    894   FactorWeightOptions<ToArc> fopts(CacheOptions(true, 0), delta_,
    895                                    kFactorFinalWeights,
    896                                    subsequential_label_,
    897                                    subsequential_label_);
    898   FactorWeightFst<ToArc, FactorIterator> factored_fst(det_fsa, fopts);
    899   from_fst_ = new FromFst(factored_fst, FromMapper(subsequential_label_));
    900 }
    901 
    902 
    903 // Specialization for DeterminizeFst.
    904 template <class A>
    905 class StateIterator< DeterminizeFst<A> >
    906     : public CacheStateIterator< DeterminizeFst<A> > {
    907  public:
    908   explicit StateIterator(const DeterminizeFst<A> &fst)
    909       : CacheStateIterator< DeterminizeFst<A> >(fst, fst.GetImpl()) {}
    910 };
    911 
    912 
    913 // Specialization for DeterminizeFst.
    914 template <class A>
    915 class ArcIterator< DeterminizeFst<A> >
    916     : public CacheArcIterator< DeterminizeFst<A> > {
    917  public:
    918   typedef typename A::StateId StateId;
    919 
    920   ArcIterator(const DeterminizeFst<A> &fst, StateId s)
    921       : CacheArcIterator< DeterminizeFst<A> >(fst.GetImpl(), s) {
    922     if (!fst.GetImpl()->HasArcs(s))
    923       fst.GetImpl()->Expand(s);
    924   }
    925 
    926  private:
    927   DISALLOW_COPY_AND_ASSIGN(ArcIterator);
    928 };
    929 
    930 
    931 template <class A> inline
    932 void DeterminizeFst<A>::InitStateIterator(StateIteratorData<A> *data) const
    933 {
    934   data->base = new StateIterator< DeterminizeFst<A> >(*this);
    935 }
    936 
    937 
    938 // Useful aliases when using StdArc.
    939 typedef DeterminizeFst<StdArc> StdDeterminizeFst;
    940 
    941 
    942 template <class Arc>
    943 struct DeterminizeOptions {
    944   typedef typename Arc::StateId StateId;
    945   typedef typename Arc::Weight Weight;
    946   typedef typename Arc::Label Label;
    947 
    948   float delta;                // Quantization delta for subset weights.
    949   Weight weight_threshold;    // Pruning weight threshold.
    950   StateId state_threshold;    // Pruning state threshold.
    951   Label subsequential_label;  // Label used for residual final output
    952   // when producing subsequential transducers.
    953 
    954   explicit DeterminizeOptions(float d = kDelta, Weight w = Weight::Zero(),
    955                               StateId n = kNoStateId, Label l = 0)
    956       : delta(d), weight_threshold(w), state_threshold(n),
    957         subsequential_label(l) {}
    958 };
    959 
    960 
    961 // Determinizes a weighted transducer.  This version writes the
    962 // determinized Fst to an output MutableFst.  The result will be an
    963 // equivalent FST that has the property that no state has two
    964 // transitions with the same input label.  For this algorithm, epsilon
    965 // transitions are treated as regular symbols (cf. RmEpsilon).
    966 //
    967 // The transducer must be functional. The weights must be (weakly)
    968 // left divisible (valid for TropicalWeight and LogWeight).
    969 //
    970 // Complexity:
    971 // - Determinizable: exponential (polynomial in the size of the output)
    972 // - Non-determinizable: does not terminate
    973 //
    974 // The determinizable automata include all unweighted and all acyclic input.
    975 //
    976 // References:
    977 // - Mehryar Mohri, "Finite-State Transducers in Language and Speech
    978 //   Processing". Computational Linguistics, 23:2, 1997.
    979 template <class Arc>
    980 void Determinize(const Fst<Arc> &ifst, MutableFst<Arc> *ofst,
    981              const DeterminizeOptions<Arc> &opts
    982                  = DeterminizeOptions<Arc>()) {
    983   typedef typename Arc::StateId StateId;
    984   typedef typename Arc::Weight Weight;
    985 
    986   DeterminizeFstOptions<Arc> nopts;
    987   nopts.delta = opts.delta;
    988   nopts.subsequential_label = opts.subsequential_label;
    989 
    990   nopts.gc_limit = 0;  // Cache only the last state for fastest copy.
    991 
    992   if (opts.weight_threshold != Weight::Zero() ||
    993       opts.state_threshold != kNoStateId) {
    994     if (ifst.Properties(kAcceptor, false)) {
    995       vector<Weight> idistance, odistance;
    996       ShortestDistance(ifst, &idistance, true);
    997       DeterminizeFst<Arc> dfst(ifst, &idistance, &odistance, nopts);
    998       PruneOptions< Arc, AnyArcFilter<Arc> > popts(opts.weight_threshold,
    999                                                    opts.state_threshold,
   1000                                                    AnyArcFilter<Arc>(),
   1001                                                    &odistance);
   1002       Prune(dfst, ofst, popts);
   1003     } else {
   1004       *ofst = DeterminizeFst<Arc>(ifst, nopts);
   1005       Prune(ofst, opts.weight_threshold, opts.state_threshold);
   1006     }
   1007   } else {
   1008     *ofst = DeterminizeFst<Arc>(ifst, nopts);
   1009   }
   1010 }
   1011 
   1012 
   1013 }  // namespace fst
   1014 
   1015 #endif  // FST_LIB_DETERMINIZE_H__
   1016