Home | History | Annotate | Download | only in fst
      1 // determinize.h
      2 
      3 
      4 // Licensed under the Apache License, Version 2.0 (the "License");
      5 // you may not use this file except in compliance with the License.
      6 // You may obtain a copy of the License at
      7 //
      8 //     http://www.apache.org/licenses/LICENSE-2.0
      9 //
     10 // Unless required by applicable law or agreed to in writing, software
     11 // distributed under the License is distributed on an "AS IS" BASIS,
     12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 // See the License for the specific language governing permissions and
     14 // limitations under the License.
     15 //
     16 // Copyright 2005-2010 Google, Inc.
     17 // Author: riley (at) google.com (Michael Riley)
     18 //
     19 // \file
     20 // Functions and classes to determinize an FST.
     21 
     22 #ifndef FST_LIB_DETERMINIZE_H__
     23 #define FST_LIB_DETERMINIZE_H__
     24 
     25 #include <algorithm>
     26 #include <climits>
     27 #include <unordered_map>
     28 using std::tr1::unordered_map;
     29 using std::tr1::unordered_multimap;
     30 #include <map>
     31 #include <fst/slist.h>
     32 #include <string>
     33 #include <vector>
     34 using std::vector;
     35 
     36 #include <fst/cache.h>
     37 #include <fst/factor-weight.h>
     38 #include <fst/arc-map.h>
     39 #include <fst/prune.h>
     40 #include <fst/test-properties.h>
     41 
     42 
     43 namespace fst {
     44 
     45 //
     46 // COMMON DIVISORS - these are used in determinization to compute
     47 // the transition weights. In the simplest case, it is just the same
     48 // as the semiring Plus(). However, other choices permit more efficient
     49 // determinization when the output contains strings.
     50 //
     51 
     52 // The default common divisor uses the semiring Plus.
     53 template <class W>
     54 class DefaultCommonDivisor {
     55  public:
     56   typedef W Weight;
     57 
     58   W operator()(const W &w1, const W &w2) const { return Plus(w1, w2); }
     59 };
     60 
     61 
     62 // The label common divisor for a (left) string semiring selects a
     63 // single letter common prefix or the empty string. This is used in
     64 // the determinization of output strings so that at most a single
     65 // letter will appear in the output of a transtion.
     66 template <typename L, StringType S>
     67 class LabelCommonDivisor {
     68  public:
     69   typedef StringWeight<L, S> Weight;
     70 
     71   Weight operator()(const Weight &w1, const Weight &w2) const {
     72     StringWeightIterator<L, S> iter1(w1);
     73     StringWeightIterator<L, S> iter2(w2);
     74 
     75     if (!(StringWeight<L, S>::Properties() & kLeftSemiring)) {
     76       FSTERROR() << "LabelCommonDivisor: Weight needs to be left semiring";
     77       return Weight::NoWeight();
     78     } else if (w1.Size() == 0 || w2.Size() == 0) {
     79       return Weight::One();
     80     } else if (w1 == Weight::Zero()) {
     81       return Weight(iter2.Value());
     82     } else if (w2 == Weight::Zero()) {
     83       return Weight(iter1.Value());
     84     } else if (iter1.Value() == iter2.Value()) {
     85       return Weight(iter1.Value());
     86     } else {
     87       return Weight::One();
     88     }
     89   }
     90 };
     91 
     92 
     93 // The gallic common divisor uses the label common divisor on the
     94 // string component and the template argument D common divisor on the
     95 // weight component, which defaults to the default common divisor.
     96 template <class L, class W, StringType S, class D = DefaultCommonDivisor<W> >
     97 class GallicCommonDivisor {
     98  public:
     99   typedef GallicWeight<L, W, S> Weight;
    100 
    101   Weight operator()(const Weight &w1, const Weight &w2) const {
    102     return Weight(label_common_divisor_(w1.Value1(), w2.Value1()),
    103                   weight_common_divisor_(w1.Value2(), w2.Value2()));
    104   }
    105 
    106  private:
    107   LabelCommonDivisor<L, S> label_common_divisor_;
    108   D weight_common_divisor_;
    109 };
    110 
    111 // Options for finite-state transducer determinization.
    112 template <class Arc>
    113 struct DeterminizeFstOptions : CacheOptions {
    114   typedef typename Arc::Label Label;
    115   float delta;                // Quantization delta for subset weights
    116   Label subsequential_label;  // Label used for residual final output
    117                               // when producing subsequential transducers.
    118 
    119   explicit DeterminizeFstOptions(const CacheOptions &opts,
    120                                  float del = kDelta,
    121                                  Label lab = 0)
    122       : CacheOptions(opts), delta(del), subsequential_label(lab) {}
    123 
    124   explicit DeterminizeFstOptions(float del = kDelta, Label lab = 0)
    125       : delta(del), subsequential_label(lab) {}
    126 };
    127 
    128 
    129 // Implementation of delayed DeterminizeFst. This base class is
    130 // common to the variants that implement acceptor and transducer
    131 // determinization.
    132 template <class A>
    133 class DeterminizeFstImplBase : public CacheImpl<A> {
    134  public:
    135   using FstImpl<A>::SetType;
    136   using FstImpl<A>::SetProperties;
    137   using FstImpl<A>::Properties;
    138   using FstImpl<A>::SetInputSymbols;
    139   using FstImpl<A>::SetOutputSymbols;
    140 
    141   using CacheBaseImpl< CacheState<A> >::HasStart;
    142   using CacheBaseImpl< CacheState<A> >::HasFinal;
    143   using CacheBaseImpl< CacheState<A> >::HasArcs;
    144   using CacheBaseImpl< CacheState<A> >::SetFinal;
    145   using CacheBaseImpl< CacheState<A> >::SetStart;
    146 
    147   typedef typename A::Label Label;
    148   typedef typename A::Weight Weight;
    149   typedef typename A::StateId StateId;
    150   typedef CacheState<A> State;
    151 
    152   DeterminizeFstImplBase(const Fst<A> &fst,
    153                          const DeterminizeFstOptions<A> &opts)
    154       : CacheImpl<A>(opts), fst_(fst.Copy()) {
    155     SetType("determinize");
    156     uint64 props = fst.Properties(kFstProperties, false);
    157     SetProperties(DeterminizeProperties(props,
    158                                         opts.subsequential_label != 0),
    159                   kCopyProperties);
    160 
    161     SetInputSymbols(fst.InputSymbols());
    162     SetOutputSymbols(fst.OutputSymbols());
    163   }
    164 
    165   DeterminizeFstImplBase(const DeterminizeFstImplBase<A> &impl)
    166       : CacheImpl<A>(impl),
    167         fst_(impl.fst_->Copy(true)) {
    168     SetType("determinize");
    169     SetProperties(impl.Properties(), kCopyProperties);
    170     SetInputSymbols(impl.InputSymbols());
    171     SetOutputSymbols(impl.OutputSymbols());
    172   }
    173 
    174   virtual ~DeterminizeFstImplBase() { delete fst_; }
    175 
    176   virtual DeterminizeFstImplBase<A> *Copy() = 0;
    177 
    178   StateId Start() {
    179     if (!HasStart()) {
    180       StateId start = ComputeStart();
    181       if (start != kNoStateId) {
    182         SetStart(start);
    183       }
    184     }
    185     return CacheImpl<A>::Start();
    186   }
    187 
    188   Weight Final(StateId s) {
    189     if (!HasFinal(s)) {
    190       Weight final = ComputeFinal(s);
    191       SetFinal(s, final);
    192     }
    193     return CacheImpl<A>::Final(s);
    194   }
    195 
    196   virtual void Expand(StateId s) = 0;
    197 
    198   size_t NumArcs(StateId s) {
    199     if (!HasArcs(s))
    200       Expand(s);
    201     return CacheImpl<A>::NumArcs(s);
    202   }
    203 
    204   size_t NumInputEpsilons(StateId s) {
    205     if (!HasArcs(s))
    206       Expand(s);
    207     return CacheImpl<A>::NumInputEpsilons(s);
    208   }
    209 
    210   size_t NumOutputEpsilons(StateId s) {
    211     if (!HasArcs(s))
    212       Expand(s);
    213     return CacheImpl<A>::NumOutputEpsilons(s);
    214   }
    215 
    216   void InitArcIterator(StateId s, ArcIteratorData<A> *data) {
    217     if (!HasArcs(s))
    218       Expand(s);
    219     CacheImpl<A>::InitArcIterator(s, data);
    220   }
    221 
    222   virtual StateId ComputeStart() = 0;
    223 
    224   virtual Weight ComputeFinal(StateId s) = 0;
    225 
    226   const Fst<A> &GetFst() const { return *fst_; }
    227 
    228  private:
    229   const Fst<A> *fst_;            // Input Fst
    230 
    231   void operator=(const DeterminizeFstImplBase<A> &);  // disallow
    232 };
    233 
    234 
    235 // Implementation of delayed determinization for weighted acceptors.
    236 // It is templated on the arc type A and the common divisor D.
    237 template <class A, class D>
    238 class DeterminizeFsaImpl : public DeterminizeFstImplBase<A> {
    239  public:
    240   using FstImpl<A>::SetProperties;
    241   using DeterminizeFstImplBase<A>::GetFst;
    242   using DeterminizeFstImplBase<A>::SetArcs;
    243 
    244   typedef typename A::Label Label;
    245   typedef typename A::Weight Weight;
    246   typedef typename A::StateId StateId;
    247 
    248   struct Element {
    249     Element() {}
    250 
    251     Element(StateId s, Weight w) : state_id(s), weight(w) {}
    252 
    253     StateId state_id;  // Input state Id
    254     Weight weight;     // Residual weight
    255   };
    256   typedef slist<Element> Subset;
    257   typedef map<Label, Subset*> LabelMap;
    258 
    259   DeterminizeFsaImpl(const Fst<A> &fst, D common_divisor,
    260                      const vector<Weight> *in_dist, vector<Weight> *out_dist,
    261                      const DeterminizeFstOptions<A> &opts)
    262       : DeterminizeFstImplBase<A>(fst, opts),
    263         delta_(opts.delta),
    264         in_dist_(in_dist),
    265         out_dist_(out_dist),
    266         common_divisor_(common_divisor),
    267         subset_hash_(0, SubsetKey(), SubsetEqual(&elements_)) {
    268     if (!fst.Properties(kAcceptor, true)) {
    269       FSTERROR()  << "DeterminizeFst: argument not an acceptor";
    270       SetProperties(kError, kError);
    271     }
    272     if (!(Weight::Properties() & kLeftSemiring)) {
    273       FSTERROR() << "DeterminizeFst: Weight needs to be left distributive: "
    274                  << Weight::Type();
    275       SetProperties(kError, kError);
    276     }
    277     if (out_dist_)
    278       out_dist_->clear();
    279   }
    280 
    281   DeterminizeFsaImpl(const DeterminizeFsaImpl<A, D> &impl)
    282       : DeterminizeFstImplBase<A>(impl),
    283         delta_(impl.delta_),
    284         in_dist_(0),
    285         out_dist_(0),
    286         common_divisor_(impl.common_divisor_),
    287         subset_hash_(0, SubsetKey(), SubsetEqual(&elements_)) {
    288     if (impl.out_dist_) {
    289       FSTERROR() << "DeterminizeFsaImpl: cannot copy with out_dist vector";
    290       SetProperties(kError, kError);
    291     }
    292   }
    293 
    294   virtual ~DeterminizeFsaImpl() {
    295     for (int i = 0; i < subsets_.size(); ++i)
    296       delete subsets_[i];
    297   }
    298 
    299   virtual DeterminizeFsaImpl<A, D> *Copy() {
    300     return new DeterminizeFsaImpl<A, D>(*this);
    301   }
    302 
    303   uint64 Properties() const { return Properties(kFstProperties); }
    304 
    305   // Set error if found; return FST impl properties.
    306   uint64 Properties(uint64 mask) const {
    307     if ((mask & kError) && (GetFst().Properties(kError, false)))
    308       SetProperties(kError, kError);
    309     return FstImpl<A>::Properties(mask);
    310   }
    311 
    312   virtual StateId ComputeStart() {
    313     StateId s = GetFst().Start();
    314     if (s == kNoStateId)
    315       return kNoStateId;
    316     Element element(s, Weight::One());
    317     Subset *subset = new Subset;
    318     subset->push_front(element);
    319     return FindState(subset);
    320   }
    321 
    322   virtual Weight ComputeFinal(StateId s) {
    323     Subset *subset = subsets_[s];
    324     Weight final = Weight::Zero();
    325     for (typename Subset::iterator siter = subset->begin();
    326          siter != subset->end();
    327          ++siter) {
    328       Element &element = *siter;
    329       final = Plus(final, Times(element.weight,
    330                                 GetFst().Final(element.state_id)));
    331       if (!final.Member())
    332         SetProperties(kError, kError);
    333     }
    334     return final;
    335   }
    336 
    337   // Finds the state corresponding to a subset. Only creates a new state
    338   // if the subset is not found in the subset hash. FindState takes
    339   // ownership of the subset argument (so that it doesn't have to copy it
    340   // if it creates a new state).
    341   //
    342   // The method exploits the following device: all pairs stored in the
    343   // associative container subset_hash_ are of the form (subset,
    344   // id(subset) + 1), i.e. subset_hash_[subset] > 0 if subset has been
    345   // stored previously. For unassigned subsets, the call to
    346   // subset_hash_[subset] creates a new pair (subset, 0). As a result,
    347   // subset_hash_[subset] == 0 iff subset is new.
    348   StateId FindState(Subset *subset) {
    349     StateId &assoc_value = subset_hash_[subset];
    350     if (assoc_value == 0) {    // subset wasn't present; create new state
    351       StateId s = CreateState(subset);
    352       assoc_value = s + 1;
    353       return  s;
    354     } else {
    355       delete subset;
    356       return assoc_value - 1;  // NB: assoc_value = ID + 1
    357     }
    358   }
    359 
    360   StateId CreateState(Subset *subset) {
    361     StateId s = subsets_.size();
    362     subsets_.push_back(subset);
    363     if (in_dist_)
    364       out_dist_->push_back(ComputeDistance(subset));
    365     return s;
    366   }
    367 
    368   // Compute distance from a state to the final states in the DFA
    369   // given the distances in the NFA.
    370   Weight ComputeDistance(const Subset *subset) {
    371     Weight outd = Weight::Zero();
    372     for (typename Subset::const_iterator siter = subset->begin();
    373          siter != subset->end(); ++siter) {
    374       const Element &element = *siter;
    375       Weight ind = element.state_id < in_dist_->size() ?
    376           (*in_dist_)[element.state_id] : Weight::Zero();
    377       outd = Plus(outd, Times(element.weight, ind));
    378     }
    379     return outd;
    380   }
    381 
    382   // Computes the outgoing transitions from a state, creating new destination
    383   // states as needed.
    384   virtual void Expand(StateId s) {
    385 
    386     LabelMap label_map;
    387     LabelSubsets(s, &label_map);
    388 
    389     for (typename LabelMap::iterator liter = label_map.begin();
    390          liter != label_map.end();
    391          ++liter)
    392       AddArc(s, liter->first, liter->second);
    393     SetArcs(s);
    394   }
    395 
    396  private:
    397   // Constructs destination subsets per label. At return, subset
    398   // element weights include the input automaton label weights and the
    399   // subsets may contain duplicate states.
    400   void LabelSubsets(StateId s, LabelMap *label_map) {
    401     Subset *src_subset = subsets_[s];
    402 
    403     for (typename Subset::iterator siter = src_subset->begin();
    404          siter != src_subset->end();
    405          ++siter) {
    406       Element &src_element = *siter;
    407       for (ArcIterator< Fst<A> > aiter(GetFst(), src_element.state_id);
    408            !aiter.Done();
    409            aiter.Next()) {
    410         const A &arc = aiter.Value();
    411         Element dest_element(arc.nextstate,
    412                              Times(src_element.weight, arc.weight));
    413         Subset* &dest_subset = (*label_map)[arc.ilabel];
    414         if (dest_subset == 0)
    415           dest_subset = new Subset;
    416         dest_subset->push_front(dest_element);
    417       }
    418     }
    419   }
    420 
    421   // Adds an arc from state S to the destination state associated
    422   // with subset DEST_SUBSET (as created by LabelSubsets).
    423   void AddArc(StateId s, Label label, Subset *dest_subset) {
    424     A arc;
    425     arc.ilabel = label;
    426     arc.olabel = label;
    427     arc.weight = Weight::Zero();
    428 
    429     typename Subset::iterator oiter;
    430     for (typename Subset::iterator diter = dest_subset->begin();
    431          diter != dest_subset->end();) {
    432       Element &dest_element = *diter;
    433       // Computes label weight.
    434       arc.weight = common_divisor_(arc.weight, dest_element.weight);
    435 
    436       while (elements_.size() <= dest_element.state_id)
    437         elements_.push_back(0);
    438       Element *matching_element = elements_[dest_element.state_id];
    439       if (matching_element) {
    440         // Found duplicate state: sums state weight and deletes dup.
    441         matching_element->weight = Plus(matching_element->weight,
    442                                         dest_element.weight);
    443         if (!matching_element->weight.Member())
    444           SetProperties(kError, kError);
    445         ++diter;
    446         dest_subset->erase_after(oiter);
    447       } else {
    448         // Saves element so we can check for duplicate for this state.
    449         elements_[dest_element.state_id] = &dest_element;
    450         oiter = diter;
    451         ++diter;
    452       }
    453     }
    454 
    455     // Divides out label weight from destination subset elements.
    456     // Quantizes to ensure comparisons are effective.
    457     // Clears element vector.
    458     for (typename Subset::iterator diter = dest_subset->begin();
    459          diter != dest_subset->end();
    460          ++diter) {
    461       Element &dest_element = *diter;
    462       dest_element.weight = Divide(dest_element.weight, arc.weight,
    463                                    DIVIDE_LEFT);
    464       dest_element.weight = dest_element.weight.Quantize(delta_);
    465       elements_[dest_element.state_id] = 0;
    466     }
    467 
    468     arc.nextstate = FindState(dest_subset);
    469     CacheImpl<A>::PushArc(s, arc);
    470   }
    471 
    472   // Comparison object for hashing Subset(s). Subsets are not sorted in this
    473   // implementation, so ordering must not be assumed in the equivalence
    474   // test.
    475   class SubsetEqual {
    476    public:
    477     // Constructor takes vector needed to check equality. See immediately
    478     // below for constraints on it.
    479     explicit SubsetEqual(vector<Element *> *elements)
    480         : elements_(elements) {}
    481 
    482     // At each call to operator(), the elements_ vector should contain
    483     // only NULLs. When this operator returns, elements_ will still
    484     // have this property.
    485     bool operator()(Subset* subset1, Subset* subset2) const {
    486         if (subset1->size() != subset2->size())
    487           return false;
    488 
    489       // Loads first subset elements in element vector.
    490       for (typename Subset::iterator iter1 = subset1->begin();
    491            iter1 != subset1->end();
    492            ++iter1) {
    493         Element &element1 = *iter1;
    494         while (elements_->size() <= element1.state_id)
    495           elements_->push_back(0);
    496         (*elements_)[element1.state_id] = &element1;
    497       }
    498 
    499       // Checks second subset matches first via element vector.
    500       for (typename Subset::iterator iter2 = subset2->begin();
    501            iter2 != subset2->end();
    502            ++iter2) {
    503         Element &element2 = *iter2;
    504         while (elements_->size() <= element2.state_id)
    505           elements_->push_back(0);
    506         Element *element1 = (*elements_)[element2.state_id];
    507         if (!element1 || element1->weight != element2.weight) {
    508           // Mismatch found. Resets element vector before returning false.
    509           for (typename Subset::iterator iter1 = subset1->begin();
    510                iter1 != subset1->end();
    511                ++iter1)
    512             (*elements_)[iter1->state_id] = 0;
    513           return false;
    514         } else {
    515           (*elements_)[element2.state_id] = 0;  // Clears entry
    516         }
    517       }
    518       return true;
    519     }
    520    private:
    521     vector<Element *> *elements_;
    522   };
    523 
    524   // Hash function for Subset to Fst states. Subset elements are not
    525   // sorted in this implementation, so the hash must be invariant
    526   // under subset reordering.
    527   class SubsetKey {
    528    public:
    529     size_t operator()(const Subset* subset) const {
    530       size_t hash = 0;
    531       for (typename Subset::const_iterator iter = subset->begin();
    532            iter != subset->end();
    533            ++iter) {
    534         const Element &element = *iter;
    535         int lshift = element.state_id % (CHAR_BIT * sizeof(size_t) - 1) + 1;
    536         int rshift = CHAR_BIT * sizeof(size_t) - lshift;
    537         size_t n = element.state_id;
    538         hash ^= n << lshift ^ n >> rshift ^ element.weight.Hash();
    539       }
    540       return hash;
    541     }
    542   };
    543 
    544   float delta_;                    // Quantization delta for subset weights
    545   const vector<Weight> *in_dist_;  // Distance to final NFA states
    546   vector<Weight> *out_dist_;       // Distance to final DFA states
    547 
    548   D common_divisor_;
    549 
    550   // Used to test equivalence of subsets.
    551   vector<Element *> elements_;
    552 
    553   // Maps from StateId to Subset.
    554   vector<Subset *> subsets_;
    555 
    556   // Hashes from Subset to its StateId in the output automaton.
    557   typedef unordered_map<Subset *, StateId, SubsetKey, SubsetEqual>
    558   SubsetHash;
    559 
    560   // Hashes from Label to Subsets corr. to destination states of current state.
    561   SubsetHash subset_hash_;
    562 
    563   void operator=(const DeterminizeFsaImpl<A, D> &);  // disallow
    564 };
    565 
    566 
    567 // Implementation of delayed determinization for transducers.
    568 // Transducer determinization is implemented by mapping the input to
    569 // the Gallic semiring as an acceptor whose weights contain the output
    570 // strings and using acceptor determinization above to determinize
    571 // that acceptor.
    572 template <class A, StringType S>
    573 class DeterminizeFstImpl : public DeterminizeFstImplBase<A> {
    574  public:
    575   using FstImpl<A>::SetProperties;
    576   using DeterminizeFstImplBase<A>::GetFst;
    577   using CacheBaseImpl< CacheState<A> >::GetCacheGc;
    578   using CacheBaseImpl< CacheState<A> >::GetCacheLimit;
    579 
    580   typedef typename A::Label Label;
    581   typedef typename A::Weight Weight;
    582   typedef typename A::StateId StateId;
    583 
    584   typedef ToGallicMapper<A, S> ToMapper;
    585   typedef FromGallicMapper<A, S> FromMapper;
    586 
    587   typedef typename ToMapper::ToArc ToArc;
    588   typedef ArcMapFst<A, ToArc, ToMapper> ToFst;
    589   typedef ArcMapFst<ToArc, A, FromMapper> FromFst;
    590 
    591   typedef GallicCommonDivisor<Label, Weight, S> CommonDivisor;
    592   typedef GallicFactor<Label, Weight, S> FactorIterator;
    593 
    594   DeterminizeFstImpl(const Fst<A> &fst, const DeterminizeFstOptions<A> &opts)
    595       : DeterminizeFstImplBase<A>(fst, opts),
    596         delta_(opts.delta),
    597         subsequential_label_(opts.subsequential_label) {
    598      Init(GetFst());
    599   }
    600 
    601   DeterminizeFstImpl(const DeterminizeFstImpl<A, S> &impl)
    602       : DeterminizeFstImplBase<A>(impl),
    603         delta_(impl.delta_),
    604         subsequential_label_(impl.subsequential_label_) {
    605     Init(GetFst());
    606   }
    607 
    608   ~DeterminizeFstImpl() { delete from_fst_; }
    609 
    610   virtual DeterminizeFstImpl<A, S> *Copy() {
    611     return new DeterminizeFstImpl<A, S>(*this);
    612   }
    613 
    614   uint64 Properties() const { return Properties(kFstProperties); }
    615 
    616   // Set error if found; return FST impl properties.
    617   uint64 Properties(uint64 mask) const {
    618     if ((mask & kError) && (GetFst().Properties(kError, false) ||
    619                             from_fst_->Properties(kError, false)))
    620       SetProperties(kError, kError);
    621     return FstImpl<A>::Properties(mask);
    622   }
    623 
    624   virtual StateId ComputeStart() { return from_fst_->Start(); }
    625 
    626   virtual Weight ComputeFinal(StateId s) { return from_fst_->Final(s); }
    627 
    628   virtual void Expand(StateId s) {
    629     for (ArcIterator<FromFst> aiter(*from_fst_, s);
    630          !aiter.Done();
    631          aiter.Next())
    632       CacheImpl<A>::PushArc(s, aiter.Value());
    633     CacheImpl<A>::SetArcs(s);
    634   }
    635 
    636  private:
    637   // Initialization of transducer determinization implementation, which
    638   // is defined after DeterminizeFst since it calls it.
    639   void Init(const Fst<A> &fst);
    640 
    641   float delta_;
    642   Label subsequential_label_;
    643   FromFst *from_fst_;
    644 
    645   void operator=(const DeterminizeFstImpl<A, S> &);  // disallow
    646 };
    647 
    648 
    649 // Determinizes a weighted transducer. This version is a delayed
    650 // Fst. The result will be an equivalent FST that has the property
    651 // that no state has two transitions with the same input label.
    652 // For this algorithm, epsilon transitions are treated as regular
    653 // symbols (cf. RmEpsilon).
    654 //
    655 // The transducer must be functional. The weights must be (weakly)
    656 // left divisible (valid for TropicalWeight and LogWeight for instance)
    657 // and be zero-sum-free if for all a,b: (Plus(a, b) = 0 => a = b = 0.
    658 //
    659 // Complexity:
    660 // - Determinizable: exponential (polynomial in the size of the output)
    661 // - Non-determinizable) does not terminate
    662 //
    663 // The determinizable automata include all unweighted and all acyclic input.
    664 //
    665 // References:
    666 // - Mehryar Mohri, "Finite-State Transducers in Language and Speech
    667 //   Processing". Computational Linguistics, 23:2, 1997.
    668 //
    669 // This class attaches interface to implementation and handles
    670 // reference counting, delegating most methods to ImplToFst.
    671 template <class A>
    672 class DeterminizeFst : public ImplToFst< DeterminizeFstImplBase<A> >  {
    673  public:
    674   friend class ArcIterator< DeterminizeFst<A> >;
    675   friend class StateIterator< DeterminizeFst<A> >;
    676   template <class B, StringType S> friend class DeterminizeFstImpl;
    677 
    678   typedef A Arc;
    679   typedef typename A::Weight Weight;
    680   typedef typename A::StateId StateId;
    681   typedef typename A::Label Label;
    682   typedef CacheState<A> State;
    683   typedef DeterminizeFstImplBase<A> Impl;
    684 
    685   using ImplToFst<Impl>::SetImpl;
    686 
    687   explicit DeterminizeFst(
    688       const Fst<A> &fst,
    689       const DeterminizeFstOptions<A> &opts = DeterminizeFstOptions<A>()) {
    690     if (fst.Properties(kAcceptor, true)) {
    691       // Calls implementation for acceptors.
    692       typedef DefaultCommonDivisor<Weight> D;
    693       SetImpl(new DeterminizeFsaImpl<A, D>(fst, D(), 0, 0, opts));
    694     } else {
    695       // Calls implementation for transducers.
    696       SetImpl(new DeterminizeFstImpl<A, STRING_LEFT_RESTRICT>(fst, opts));
    697     }
    698   }
    699 
    700   // This acceptor-only version additionally computes the distance to
    701   // final states in the output if provided with those distances for the
    702   // input. Useful for e.g. unique N-shortest paths.
    703   DeterminizeFst(
    704       const Fst<A> &fst,
    705       const vector<Weight> &in_dist, vector<Weight> *out_dist,
    706       const DeterminizeFstOptions<A> &opts = DeterminizeFstOptions<A>()) {
    707     if (!fst.Properties(kAcceptor, true)) {
    708       FSTERROR() << "DeterminizeFst:"
    709                  << " distance to final states computed for acceptors only";
    710       GetImpl()->SetProperties(kError, kError);
    711     }
    712     typedef DefaultCommonDivisor<Weight> D;
    713     SetImpl(new DeterminizeFsaImpl<A, D>(fst, D(), &in_dist, out_dist, opts));
    714   }
    715 
    716   // See Fst<>::Copy() for doc.
    717   DeterminizeFst(const DeterminizeFst<A> &fst, bool safe = false) {
    718     if (safe)
    719       SetImpl(fst.GetImpl()->Copy());
    720     else
    721       SetImpl(fst.GetImpl(), false);
    722   }
    723 
    724   // Get a copy of this DeterminizeFst. See Fst<>::Copy() for further doc.
    725   virtual DeterminizeFst<A> *Copy(bool safe = false) const {
    726     return new DeterminizeFst<A>(*this, safe);
    727   }
    728 
    729   virtual inline void InitStateIterator(StateIteratorData<A> *data) const;
    730 
    731   virtual void InitArcIterator(StateId s, ArcIteratorData<A> *data) const {
    732     GetImpl()->InitArcIterator(s, data);
    733   }
    734 
    735  private:
    736   // This private version is for passing the common divisor to
    737   // FSA determinization.
    738   template <class D>
    739   DeterminizeFst(const Fst<A> &fst, const D &common_div,
    740                  const DeterminizeFstOptions<A> &opts)
    741       :  ImplToFst<Impl>(
    742           new DeterminizeFsaImpl<A, D>(fst, common_div, 0, 0, opts)) {}
    743 
    744   // Makes visible to friends.
    745   Impl *GetImpl() const { return ImplToFst<Impl>::GetImpl(); }
    746 
    747   void operator=(const DeterminizeFst<A> &fst);  // Disallow
    748 };
    749 
    750 
    751 // Initialization of transducer determinization implementation. which
    752 // is defined after DeterminizeFst since it calls it.
    753 template <class A, StringType S>
    754 void DeterminizeFstImpl<A, S>::Init(const Fst<A> &fst) {
    755   // Mapper to an acceptor.
    756   ToFst to_fst(fst, ToMapper());
    757 
    758   // Determinize acceptor.
    759   // This recursive call terminates since it passes the common divisor
    760   // to a private constructor.
    761   CacheOptions copts(GetCacheGc(), GetCacheLimit());
    762   DeterminizeFstOptions<ToArc> dopts(copts, delta_);
    763   DeterminizeFst<ToArc> det_fsa(to_fst, CommonDivisor(), dopts);
    764 
    765   // Mapper back to transducer.
    766   FactorWeightOptions<ToArc> fopts(CacheOptions(true, 0), delta_,
    767                                    kFactorFinalWeights,
    768                                    subsequential_label_,
    769                                    subsequential_label_);
    770   FactorWeightFst<ToArc, FactorIterator> factored_fst(det_fsa, fopts);
    771   from_fst_ = new FromFst(factored_fst, FromMapper(subsequential_label_));
    772 }
    773 
    774 
    775 // Specialization for DeterminizeFst.
    776 template <class A>
    777 class StateIterator< DeterminizeFst<A> >
    778     : public CacheStateIterator< DeterminizeFst<A> > {
    779  public:
    780   explicit StateIterator(const DeterminizeFst<A> &fst)
    781       : CacheStateIterator< DeterminizeFst<A> >(fst, fst.GetImpl()) {}
    782 };
    783 
    784 
    785 // Specialization for DeterminizeFst.
    786 template <class A>
    787 class ArcIterator< DeterminizeFst<A> >
    788     : public CacheArcIterator< DeterminizeFst<A> > {
    789  public:
    790   typedef typename A::StateId StateId;
    791 
    792   ArcIterator(const DeterminizeFst<A> &fst, StateId s)
    793       : CacheArcIterator< DeterminizeFst<A> >(fst.GetImpl(), s) {
    794     if (!fst.GetImpl()->HasArcs(s))
    795       fst.GetImpl()->Expand(s);
    796   }
    797 
    798  private:
    799   DISALLOW_COPY_AND_ASSIGN(ArcIterator);
    800 };
    801 
    802 
    803 template <class A> inline
    804 void DeterminizeFst<A>::InitStateIterator(StateIteratorData<A> *data) const
    805 {
    806   data->base = new StateIterator< DeterminizeFst<A> >(*this);
    807 }
    808 
    809 
    810 // Useful aliases when using StdArc.
    811 typedef DeterminizeFst<StdArc> StdDeterminizeFst;
    812 
    813 
    814 template <class Arc>
    815 struct DeterminizeOptions {
    816   typedef typename Arc::StateId StateId;
    817   typedef typename Arc::Weight Weight;
    818   typedef typename Arc::Label Label;
    819 
    820   float delta;                // Quantization delta for subset weights.
    821   Weight weight_threshold;    // Pruning weight threshold.
    822   StateId state_threshold;    // Pruning state threshold.
    823   Label subsequential_label;  // Label used for residual final output
    824   // when producing subsequential transducers.
    825 
    826   explicit DeterminizeOptions(float d = kDelta, Weight w = Weight::Zero(),
    827                               StateId n = kNoStateId, Label l = 0)
    828       : delta(d), weight_threshold(w), state_threshold(n),
    829         subsequential_label(l) {}
    830 };
    831 
    832 
    833 // Determinizes a weighted transducer.  This version writes the
    834 // determinized Fst to an output MutableFst.  The result will be an
    835 // equivalent FSt that has the property that no state has two
    836 // transitions with the same input label.  For this algorithm, epsilon
    837 // transitions are treated as regular symbols (cf. RmEpsilon).
    838 //
    839 // The transducer must be functional. The weights must be (weakly)
    840 // left divisible (valid for TropicalWeight and LogWeight).
    841 //
    842 // Complexity:
    843 // - Determinizable: exponential (polynomial in the size of the output)
    844 // - Non-determinizable: does not terminate
    845 //
    846 // The determinizable automata include all unweighted and all acyclic input.
    847 //
    848 // References:
    849 // - Mehryar Mohri, "Finite-State Transducers in Language and Speech
    850 //   Processing". Computational Linguistics, 23:2, 1997.
    851 template <class Arc>
    852 void Determinize(const Fst<Arc> &ifst, MutableFst<Arc> *ofst,
    853              const DeterminizeOptions<Arc> &opts
    854                  = DeterminizeOptions<Arc>()) {
    855   typedef typename Arc::StateId StateId;
    856   typedef typename Arc::Weight Weight;
    857 
    858   DeterminizeFstOptions<Arc> nopts;
    859   nopts.delta = opts.delta;
    860   nopts.subsequential_label = opts.subsequential_label;
    861 
    862   nopts.gc_limit = 0;  // Cache only the last state for fastest copy.
    863 
    864   if (opts.weight_threshold != Weight::Zero() ||
    865       opts.state_threshold != kNoStateId) {
    866     if (ifst.Properties(kAcceptor, false)) {
    867       vector<Weight> idistance, odistance;
    868       ShortestDistance(ifst, &idistance, true);
    869       DeterminizeFst<Arc> dfst(ifst, idistance, &odistance, nopts);
    870       PruneOptions< Arc, AnyArcFilter<Arc> > popts(opts.weight_threshold,
    871                                                    opts.state_threshold,
    872                                                    AnyArcFilter<Arc>(),
    873                                                    &odistance);
    874       Prune(dfst, ofst, popts);
    875     } else {
    876       *ofst = DeterminizeFst<Arc>(ifst, nopts);
    877       Prune(ofst, opts.weight_threshold, opts.state_threshold);
    878     }
    879   } else {
    880     *ofst = DeterminizeFst<Arc>(ifst, nopts);
    881   }
    882 }
    883 
    884 
    885 }  // namespace fst
    886 
    887 #endif  // FST_LIB_DETERMINIZE_H__
    888