1 // determinize.h 2 3 4 // Licensed under the Apache License, Version 2.0 (the "License"); 5 // you may not use this file except in compliance with the License. 6 // You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 // 16 // Copyright 2005-2010 Google, Inc. 17 // Author: riley (at) google.com (Michael Riley) 18 // 19 // \file 20 // Functions and classes to determinize an FST. 21 22 #ifndef FST_LIB_DETERMINIZE_H__ 23 #define FST_LIB_DETERMINIZE_H__ 24 25 #include <algorithm> 26 #include <climits> 27 #include <tr1/unordered_map> 28 using std::tr1::unordered_map; 29 using std::tr1::unordered_multimap; 30 #include <map> 31 #include <fst/slist.h> 32 #include <string> 33 #include <vector> 34 using std::vector; 35 36 #include <fst/arc-map.h> 37 #include <fst/cache.h> 38 #include <fst/bi-table.h> 39 #include <fst/factor-weight.h> 40 #include <fst/prune.h> 41 #include <fst/test-properties.h> 42 43 44 namespace fst { 45 46 // 47 // COMMON DIVISORS - these are used in determinization to compute 48 // the transition weights. In the simplest case, it is just the same 49 // as the semiring Plus(). However, other choices permit more efficient 50 // determinization when the output contains strings. 51 // 52 53 // The default common divisor uses the semiring Plus. 54 template <class W> 55 class DefaultCommonDivisor { 56 public: 57 typedef W Weight; 58 59 W operator()(const W &w1, const W &w2) const { return Plus(w1, w2); } 60 }; 61 62 63 // The label common divisor for a (left) string semiring selects a 64 // single letter common prefix or the empty string. This is used in 65 // the determinization of output strings so that at most a single 66 // letter will appear in the output of a transtion. 67 template <typename L, StringType S> 68 class LabelCommonDivisor { 69 public: 70 typedef StringWeight<L, S> Weight; 71 72 Weight operator()(const Weight &w1, const Weight &w2) const { 73 StringWeightIterator<L, S> iter1(w1); 74 StringWeightIterator<L, S> iter2(w2); 75 76 if (!(StringWeight<L, S>::Properties() & kLeftSemiring)) { 77 FSTERROR() << "LabelCommonDivisor: Weight needs to be left semiring"; 78 return Weight::NoWeight(); 79 } else if (w1.Size() == 0 || w2.Size() == 0) { 80 return Weight::One(); 81 } else if (w1 == Weight::Zero()) { 82 return Weight(iter2.Value()); 83 } else if (w2 == Weight::Zero()) { 84 return Weight(iter1.Value()); 85 } else if (iter1.Value() == iter2.Value()) { 86 return Weight(iter1.Value()); 87 } else { 88 return Weight::One(); 89 } 90 } 91 }; 92 93 94 // The gallic common divisor uses the label common divisor on the 95 // string component and the template argument D common divisor on the 96 // weight component, which defaults to the default common divisor. 97 template <class L, class W, StringType S, class D = DefaultCommonDivisor<W> > 98 class GallicCommonDivisor { 99 public: 100 typedef GallicWeight<L, W, S> Weight; 101 102 Weight operator()(const Weight &w1, const Weight &w2) const { 103 return Weight(label_common_divisor_(w1.Value1(), w2.Value1()), 104 weight_common_divisor_(w1.Value2(), w2.Value2())); 105 } 106 107 private: 108 LabelCommonDivisor<L, S> label_common_divisor_; 109 D weight_common_divisor_; 110 }; 111 112 113 // Represents an element in a subset 114 template <class A> 115 struct DeterminizeElement { 116 typedef typename A::StateId StateId; 117 typedef typename A::Weight Weight; 118 119 DeterminizeElement() {} 120 121 DeterminizeElement(StateId s, Weight w) : state_id(s), weight(w) {} 122 123 bool operator==(const DeterminizeElement<A> & element) const { 124 return state_id == element.state_id && weight == element.weight; 125 } 126 127 bool operator<(const DeterminizeElement<A> & element) const { 128 return state_id < element.state_id || 129 (state_id == element.state_id && weight == element.weight); 130 } 131 132 StateId state_id; // Input state Id 133 Weight weight; // Residual weight 134 }; 135 136 137 // 138 // DETERMINIZE FILTERS - these can be used in determinization to compute 139 // transformations on the subsets prior to their being added as destination 140 // states. The filter operates on a map between a label and the 141 // corresponding destination subsets. The possibly modified map is 142 // then used to construct the destination states for arcs exiting state 's'. 143 // It must define the ordered map type LabelMap and have a default 144 // and copy constructor. 145 146 // A determinize filter that does not modify its input. 147 template <class Arc> 148 struct IdentityDeterminizeFilter { 149 typedef typename Arc::StateId StateId; 150 typedef typename Arc::Label Label; 151 typedef slist< DeterminizeElement<Arc> > Subset; 152 typedef map<Label, Subset*> LabelMap; 153 154 static uint64 Properties(uint64 props) { return props; } 155 156 void operator()(StateId s, LabelMap *label_map) {} 157 }; 158 159 160 // 161 // DETERMINIZATION STATE TABLES 162 // 163 // The determiziation state table has the form: 164 // 165 // template <class Arc> 166 // class DeterminizeStateTable { 167 // public: 168 // typedef typename Arc::StateId StateId; 169 // typedef DeterminizeElement<Arc> Element; 170 // typedef slist<Element> Subset; 171 // 172 // // Required constuctor 173 // DeterminizeStateTable(); 174 // 175 // // Required copy constructor that does not copy state 176 // DeterminizeStateTable(const DeterminizeStateTable<A,P> &table); 177 // 178 // // Lookup state ID by subset (not depending of the element order). 179 // // If it doesn't exist, then add it. FindState takes 180 // // ownership of the subset argument (so that it doesn't have to 181 // // copy it if it creates a new state). 182 // StateId FindState(Subset *subset); 183 // 184 // // Lookup subset by ID. 185 // const Subset *FindSubset(StateId id) const; 186 // }; 187 // 188 189 // The default determinization state table based on the 190 // compact hash bi-table. 191 template <class Arc> 192 class DefaultDeterminizeStateTable { 193 public: 194 typedef typename Arc::StateId StateId; 195 typedef typename Arc::Label Label; 196 typedef typename Arc::Weight Weight; 197 typedef DeterminizeElement<Arc> Element; 198 typedef slist<Element> Subset; 199 200 explicit DefaultDeterminizeStateTable(size_t table_size = 0) 201 : table_size_(table_size), 202 subsets_(table_size_, new SubsetKey(), new SubsetEqual(&elements_)) { } 203 204 DefaultDeterminizeStateTable(const DefaultDeterminizeStateTable<Arc> &table) 205 : table_size_(table.table_size_), 206 subsets_(table_size_, new SubsetKey(), new SubsetEqual(&elements_)) { } 207 208 ~DefaultDeterminizeStateTable() { 209 for (StateId s = 0; s < subsets_.Size(); ++s) 210 delete subsets_.FindEntry(s); 211 } 212 213 // Finds the state corresponding to a subset. Only creates a new 214 // state if the subset is not found. FindState takes ownership of 215 // the subset argument (so that it doesn't have to copy it if it 216 // creates a new state). 217 StateId FindState(Subset *subset) { 218 StateId ns = subsets_.Size(); 219 StateId s = subsets_.FindId(subset); 220 if (s != ns) delete subset; // subset found 221 return s; 222 } 223 224 const Subset* FindSubset(StateId s) { return subsets_.FindEntry(s); } 225 226 private: 227 // Comparison object for hashing Subset(s). Subsets are not sorted in this 228 // implementation, so ordering must not be assumed in the equivalence 229 // test. 230 class SubsetEqual { 231 public: 232 SubsetEqual() { // needed for compilation but should never be called 233 FSTERROR() << "SubsetEqual: default constructor not implemented"; 234 } 235 236 // Constructor takes vector needed to check equality. See immediately 237 // below for constraints on it. 238 explicit SubsetEqual(vector<Element *> *elements) 239 : elements_(elements) {} 240 241 // At each call to operator(), the elements_ vector should contain 242 // only NULLs. When this operator returns, elements_ will still 243 // have this property. 244 bool operator()(Subset* subset1, Subset* subset2) const { 245 if (!subset1 && !subset2) 246 return true; 247 if ((subset1 && !subset2) || (!subset1 && subset2)) 248 return false; 249 250 if (subset1->size() != subset2->size()) 251 return false; 252 253 // Loads first subset elements in element vector. 254 for (typename Subset::iterator iter1 = subset1->begin(); 255 iter1 != subset1->end(); 256 ++iter1) { 257 Element &element1 = *iter1; 258 while (elements_->size() <= element1.state_id) 259 elements_->push_back(0); 260 (*elements_)[element1.state_id] = &element1; 261 } 262 263 // Checks second subset matches first via element vector. 264 for (typename Subset::iterator iter2 = subset2->begin(); 265 iter2 != subset2->end(); 266 ++iter2) { 267 Element &element2 = *iter2; 268 while (elements_->size() <= element2.state_id) 269 elements_->push_back(0); 270 Element *element1 = (*elements_)[element2.state_id]; 271 if (!element1 || element1->weight != element2.weight) { 272 // Mismatch found. Resets element vector before returning false. 273 for (typename Subset::iterator iter1 = subset1->begin(); 274 iter1 != subset1->end(); 275 ++iter1) 276 (*elements_)[iter1->state_id] = 0; 277 return false; 278 } else { 279 (*elements_)[element2.state_id] = 0; // Clears entry 280 } 281 } 282 return true; 283 } 284 private: 285 vector<Element *> *elements_; 286 }; 287 288 // Hash function for Subset to Fst states. Subset elements are not 289 // sorted in this implementation, so the hash must be invariant 290 // under subset reordering. 291 class SubsetKey { 292 public: 293 size_t operator()(const Subset* subset) const { 294 size_t hash = 0; 295 if (subset) { 296 for (typename Subset::const_iterator iter = subset->begin(); 297 iter != subset->end(); 298 ++iter) { 299 const Element &element = *iter; 300 int lshift = element.state_id % (CHAR_BIT * sizeof(size_t) - 1) + 1; 301 int rshift = CHAR_BIT * sizeof(size_t) - lshift; 302 size_t n = element.state_id; 303 hash ^= n << lshift ^ n >> rshift ^ element.weight.Hash(); 304 } 305 } 306 return hash; 307 } 308 }; 309 310 size_t table_size_; 311 312 typedef CompactHashBiTable<StateId, Subset *, 313 SubsetKey, SubsetEqual, HS_STL> SubsetTable; 314 315 SubsetTable subsets_; 316 vector<Element *> elements_; 317 318 void operator=(const DefaultDeterminizeStateTable<Arc> &); // disallow 319 }; 320 321 // Options for finite-state transducer determinization templated on 322 // the arc type, common divisor, the determinization filter and the 323 // state table. DeterminizeFst takes ownership of the determinization 324 // filter and state table if provided. 325 template <class Arc, 326 class D = DefaultCommonDivisor<typename Arc::Weight>, 327 class F = IdentityDeterminizeFilter<Arc>, 328 class T = DefaultDeterminizeStateTable<Arc> > 329 struct DeterminizeFstOptions : CacheOptions { 330 typedef typename Arc::Label Label; 331 float delta; // Quantization delta for subset weights 332 Label subsequential_label; // Label used for residual final output 333 // when producing subsequential transducers. 334 F *filter; // Determinization filter 335 T *state_table; // Determinization state table 336 337 explicit DeterminizeFstOptions(const CacheOptions &opts, 338 float del = kDelta, Label lab = 0, 339 F *filt = 0, 340 T *table = 0) 341 : CacheOptions(opts), delta(del), subsequential_label(lab), 342 filter(filt), state_table(table) {} 343 344 explicit DeterminizeFstOptions(float del = kDelta, Label lab = 0, 345 F *filt = 0, T *table = 0) 346 : delta(del), subsequential_label(lab), filter(filt), 347 state_table(table) {} 348 }; 349 350 // Implementation of delayed DeterminizeFst. This base class is 351 // common to the variants that implement acceptor and transducer 352 // determinization. 353 template <class A> 354 class DeterminizeFstImplBase : public CacheImpl<A> { 355 public: 356 using FstImpl<A>::SetType; 357 using FstImpl<A>::SetProperties; 358 using FstImpl<A>::Properties; 359 using FstImpl<A>::SetInputSymbols; 360 using FstImpl<A>::SetOutputSymbols; 361 362 using CacheBaseImpl< CacheState<A> >::HasStart; 363 using CacheBaseImpl< CacheState<A> >::HasFinal; 364 using CacheBaseImpl< CacheState<A> >::HasArcs; 365 using CacheBaseImpl< CacheState<A> >::SetFinal; 366 using CacheBaseImpl< CacheState<A> >::SetStart; 367 368 typedef typename A::Label Label; 369 typedef typename A::Weight Weight; 370 typedef typename A::StateId StateId; 371 typedef CacheState<A> State; 372 373 template <class D, class F, class T> 374 DeterminizeFstImplBase(const Fst<A> &fst, 375 const DeterminizeFstOptions<A, D, F, T> &opts) 376 : CacheImpl<A>(opts), fst_(fst.Copy()) { 377 SetType("determinize"); 378 uint64 iprops = fst.Properties(kFstProperties, false); 379 uint64 dprops = DeterminizeProperties(iprops, 380 opts.subsequential_label != 0); 381 SetProperties(F::Properties(dprops), kCopyProperties); 382 SetInputSymbols(fst.InputSymbols()); 383 SetOutputSymbols(fst.OutputSymbols()); 384 } 385 386 DeterminizeFstImplBase(const DeterminizeFstImplBase<A> &impl) 387 : CacheImpl<A>(impl), 388 fst_(impl.fst_->Copy(true)) { 389 SetType("determinize"); 390 SetProperties(impl.Properties(), kCopyProperties); 391 SetInputSymbols(impl.InputSymbols()); 392 SetOutputSymbols(impl.OutputSymbols()); 393 } 394 395 virtual ~DeterminizeFstImplBase() { delete fst_; } 396 397 virtual DeterminizeFstImplBase<A> *Copy() = 0; 398 399 StateId Start() { 400 if (!HasStart()) { 401 StateId start = ComputeStart(); 402 if (start != kNoStateId) { 403 SetStart(start); 404 } 405 } 406 return CacheImpl<A>::Start(); 407 } 408 409 Weight Final(StateId s) { 410 if (!HasFinal(s)) { 411 Weight final = ComputeFinal(s); 412 SetFinal(s, final); 413 } 414 return CacheImpl<A>::Final(s); 415 } 416 417 virtual void Expand(StateId s) = 0; 418 419 size_t NumArcs(StateId s) { 420 if (!HasArcs(s)) 421 Expand(s); 422 return CacheImpl<A>::NumArcs(s); 423 } 424 425 size_t NumInputEpsilons(StateId s) { 426 if (!HasArcs(s)) 427 Expand(s); 428 return CacheImpl<A>::NumInputEpsilons(s); 429 } 430 431 size_t NumOutputEpsilons(StateId s) { 432 if (!HasArcs(s)) 433 Expand(s); 434 return CacheImpl<A>::NumOutputEpsilons(s); 435 } 436 437 void InitArcIterator(StateId s, ArcIteratorData<A> *data) { 438 if (!HasArcs(s)) 439 Expand(s); 440 CacheImpl<A>::InitArcIterator(s, data); 441 } 442 443 virtual StateId ComputeStart() = 0; 444 445 virtual Weight ComputeFinal(StateId s) = 0; 446 447 const Fst<A> &GetFst() const { return *fst_; } 448 449 private: 450 const Fst<A> *fst_; // Input Fst 451 452 void operator=(const DeterminizeFstImplBase<A> &); // disallow 453 }; 454 455 456 // Implementation of delayed determinization for weighted acceptors. 457 // It is templated on the arc type A and the common divisor D. 458 template <class A, class D, class F, class T> 459 class DeterminizeFsaImpl : public DeterminizeFstImplBase<A> { 460 public: 461 using FstImpl<A>::SetProperties; 462 using DeterminizeFstImplBase<A>::GetFst; 463 using DeterminizeFstImplBase<A>::SetArcs; 464 465 typedef typename A::Label Label; 466 typedef typename A::Weight Weight; 467 typedef typename A::StateId StateId; 468 typedef DeterminizeElement<A> Element; 469 typedef slist<Element> Subset; 470 typedef typename F::LabelMap LabelMap; 471 472 DeterminizeFsaImpl(const Fst<A> &fst, 473 const vector<Weight> *in_dist, vector<Weight> *out_dist, 474 const DeterminizeFstOptions<A, D, F, T> &opts) 475 : DeterminizeFstImplBase<A>(fst, opts), 476 delta_(opts.delta), 477 in_dist_(in_dist), 478 out_dist_(out_dist), 479 filter_(opts.filter ? opts.filter : new F()), 480 state_table_(opts.state_table ? opts.state_table : new T()) { 481 if (!fst.Properties(kAcceptor, true)) { 482 FSTERROR() << "DeterminizeFst: argument not an acceptor"; 483 SetProperties(kError, kError); 484 } 485 if (!(Weight::Properties() & kLeftSemiring)) { 486 FSTERROR() << "DeterminizeFst: Weight needs to be left distributive: " 487 << Weight::Type(); 488 SetProperties(kError, kError); 489 } 490 if (out_dist_) 491 out_dist_->clear(); 492 } 493 494 DeterminizeFsaImpl(const DeterminizeFsaImpl<A, D, F, T> &impl) 495 : DeterminizeFstImplBase<A>(impl), 496 delta_(impl.delta_), 497 in_dist_(0), 498 out_dist_(0), 499 filter_(new F(*impl.filter_)), 500 state_table_(new T(*impl.state_table_)) { 501 if (impl.out_dist_) { 502 FSTERROR() << "DeterminizeFsaImpl: cannot copy with out_dist vector"; 503 SetProperties(kError, kError); 504 } 505 } 506 507 virtual ~DeterminizeFsaImpl() { 508 delete filter_; 509 delete state_table_; 510 } 511 512 virtual DeterminizeFsaImpl<A, D, F, T> *Copy() { 513 return new DeterminizeFsaImpl<A, D, F, T>(*this); 514 } 515 516 uint64 Properties() const { return Properties(kFstProperties); } 517 518 // Set error if found; return FST impl properties. 519 uint64 Properties(uint64 mask) const { 520 if ((mask & kError) && (GetFst().Properties(kError, false))) 521 SetProperties(kError, kError); 522 return FstImpl<A>::Properties(mask); 523 } 524 525 virtual StateId ComputeStart() { 526 StateId s = GetFst().Start(); 527 if (s == kNoStateId) 528 return kNoStateId; 529 Element element(s, Weight::One()); 530 Subset *subset = new Subset; 531 subset->push_front(element); 532 return FindState(subset); 533 } 534 535 virtual Weight ComputeFinal(StateId s) { 536 const Subset *subset = state_table_->FindSubset(s); 537 Weight final = Weight::Zero(); 538 for (typename Subset::const_iterator siter = subset->begin(); 539 siter != subset->end(); 540 ++siter) { 541 const Element &element = *siter; 542 final = Plus(final, Times(element.weight, 543 GetFst().Final(element.state_id))); 544 if (!final.Member()) 545 SetProperties(kError, kError); 546 } 547 return final; 548 } 549 550 StateId FindState(Subset *subset) { 551 StateId s = state_table_->FindState(subset); 552 if (in_dist_ && out_dist_->size() <= s) 553 out_dist_->push_back(ComputeDistance(subset)); 554 return s; 555 } 556 557 // Compute distance from a state to the final states in the DFA 558 // given the distances in the NFA. 559 Weight ComputeDistance(const Subset *subset) { 560 Weight outd = Weight::Zero(); 561 for (typename Subset::const_iterator siter = subset->begin(); 562 siter != subset->end(); ++siter) { 563 const Element &element = *siter; 564 Weight ind = element.state_id < in_dist_->size() ? 565 (*in_dist_)[element.state_id] : Weight::Zero(); 566 outd = Plus(outd, Times(element.weight, ind)); 567 } 568 return outd; 569 } 570 571 // Computes the outgoing transitions from a state, creating new destination 572 // states as needed. 573 virtual void Expand(StateId s) { 574 575 LabelMap label_map; 576 LabelSubsets(s, &label_map); 577 578 for (typename LabelMap::iterator liter = label_map.begin(); 579 liter != label_map.end(); 580 ++liter) 581 AddArc(s, liter->first, liter->second); 582 SetArcs(s); 583 } 584 585 private: 586 // Constructs destination subsets per label. At return, subset 587 // element weights include the input automaton label weights and the 588 // subsets may contain duplicate states. 589 void LabelSubsets(StateId s, LabelMap *label_map) { 590 const Subset *src_subset = state_table_->FindSubset(s); 591 592 for (typename Subset::const_iterator siter = src_subset->begin(); 593 siter != src_subset->end(); 594 ++siter) { 595 const Element &src_element = *siter; 596 for (ArcIterator< Fst<A> > aiter(GetFst(), src_element.state_id); 597 !aiter.Done(); 598 aiter.Next()) { 599 const A &arc = aiter.Value(); 600 Element dest_element(arc.nextstate, 601 Times(src_element.weight, arc.weight)); 602 603 // The LabelMap may be a e.g. multimap with more complex 604 // determinization filters, so we insert efficiently w/o using []. 605 typename LabelMap::iterator liter = label_map->lower_bound(arc.ilabel); 606 Subset* dest_subset; 607 if (liter == label_map->end() || liter->first != arc.ilabel) { 608 dest_subset = new Subset; 609 label_map->insert(liter, make_pair(arc.ilabel, dest_subset)); 610 } else { 611 dest_subset = liter->second; 612 } 613 614 dest_subset->push_front(dest_element); 615 } 616 } 617 // Applies the determinization filter 618 (*filter_)(s, label_map); 619 } 620 621 // Adds an arc from state S to the destination state associated 622 // with subset DEST_SUBSET (as created by LabelSubsets). 623 void AddArc(StateId s, Label label, Subset *dest_subset) { 624 A arc; 625 arc.ilabel = label; 626 arc.olabel = label; 627 arc.weight = Weight::Zero(); 628 629 typename Subset::iterator oiter; 630 for (typename Subset::iterator diter = dest_subset->begin(); 631 diter != dest_subset->end();) { 632 Element &dest_element = *diter; 633 // Computes label weight. 634 arc.weight = common_divisor_(arc.weight, dest_element.weight); 635 636 while (elements_.size() <= dest_element.state_id) 637 elements_.push_back(0); 638 Element *matching_element = elements_[dest_element.state_id]; 639 if (matching_element) { 640 // Found duplicate state: sums state weight and deletes dup. 641 matching_element->weight = Plus(matching_element->weight, 642 dest_element.weight); 643 if (!matching_element->weight.Member()) 644 SetProperties(kError, kError); 645 ++diter; 646 dest_subset->erase_after(oiter); 647 } else { 648 // Saves element so we can check for duplicate for this state. 649 elements_[dest_element.state_id] = &dest_element; 650 oiter = diter; 651 ++diter; 652 } 653 } 654 655 // Divides out label weight from destination subset elements. 656 // Quantizes to ensure comparisons are effective. 657 // Clears element vector. 658 for (typename Subset::iterator diter = dest_subset->begin(); 659 diter != dest_subset->end(); 660 ++diter) { 661 Element &dest_element = *diter; 662 dest_element.weight = Divide(dest_element.weight, arc.weight, 663 DIVIDE_LEFT); 664 dest_element.weight = dest_element.weight.Quantize(delta_); 665 elements_[dest_element.state_id] = 0; 666 } 667 668 arc.nextstate = FindState(dest_subset); 669 CacheImpl<A>::PushArc(s, arc); 670 } 671 672 float delta_; // Quantization delta for subset weights 673 const vector<Weight> *in_dist_; // Distance to final NFA states 674 vector<Weight> *out_dist_; // Distance to final DFA states 675 676 D common_divisor_; 677 F *filter_; 678 T *state_table_; 679 680 vector<Element *> elements_; 681 682 void operator=(const DeterminizeFsaImpl<A, D, F, T> &); // disallow 683 }; 684 685 686 // Implementation of delayed determinization for transducers. 687 // Transducer determinization is implemented by mapping the input to 688 // the Gallic semiring as an acceptor whose weights contain the output 689 // strings and using acceptor determinization above to determinize 690 // that acceptor. 691 template <class A, StringType S, class D, class F, class T> 692 class DeterminizeFstImpl : public DeterminizeFstImplBase<A> { 693 public: 694 using FstImpl<A>::SetProperties; 695 using DeterminizeFstImplBase<A>::GetFst; 696 using CacheBaseImpl< CacheState<A> >::GetCacheGc; 697 using CacheBaseImpl< CacheState<A> >::GetCacheLimit; 698 699 typedef typename A::Label Label; 700 typedef typename A::Weight Weight; 701 typedef typename A::StateId StateId; 702 703 typedef ToGallicMapper<A, S> ToMapper; 704 typedef FromGallicMapper<A, S> FromMapper; 705 706 typedef typename ToMapper::ToArc ToArc; 707 typedef ArcMapFst<A, ToArc, ToMapper> ToFst; 708 typedef ArcMapFst<ToArc, A, FromMapper> FromFst; 709 710 typedef GallicCommonDivisor<Label, Weight, S, D> CommonDivisor; 711 typedef GallicFactor<Label, Weight, S> FactorIterator; 712 713 DeterminizeFstImpl(const Fst<A> &fst, 714 const DeterminizeFstOptions<A, D, F, T> &opts) 715 : DeterminizeFstImplBase<A>(fst, opts), 716 delta_(opts.delta), 717 subsequential_label_(opts.subsequential_label) { 718 Init(GetFst()); 719 } 720 721 DeterminizeFstImpl(const DeterminizeFstImpl<A, S, D, F, T> &impl) 722 : DeterminizeFstImplBase<A>(impl), 723 delta_(impl.delta_), 724 subsequential_label_(impl.subsequential_label_) { 725 Init(GetFst()); 726 } 727 728 ~DeterminizeFstImpl() { delete from_fst_; } 729 730 virtual DeterminizeFstImpl<A, S, D, F, T> *Copy() { 731 return new DeterminizeFstImpl<A, S, D, F, T>(*this); 732 } 733 734 uint64 Properties() const { return Properties(kFstProperties); } 735 736 // Set error if found; return FST impl properties. 737 uint64 Properties(uint64 mask) const { 738 if ((mask & kError) && (GetFst().Properties(kError, false) || 739 from_fst_->Properties(kError, false))) 740 SetProperties(kError, kError); 741 return FstImpl<A>::Properties(mask); 742 } 743 744 virtual StateId ComputeStart() { return from_fst_->Start(); } 745 746 virtual Weight ComputeFinal(StateId s) { return from_fst_->Final(s); } 747 748 virtual void Expand(StateId s) { 749 for (ArcIterator<FromFst> aiter(*from_fst_, s); 750 !aiter.Done(); 751 aiter.Next()) 752 CacheImpl<A>::PushArc(s, aiter.Value()); 753 CacheImpl<A>::SetArcs(s); 754 } 755 756 private: 757 // Initialization of transducer determinization implementation, which 758 // is defined after DeterminizeFst since it calls it. 759 void Init(const Fst<A> &fst); 760 761 float delta_; 762 Label subsequential_label_; 763 FromFst *from_fst_; 764 765 void operator=(const DeterminizeFstImpl<A, S, D, F, T> &); // disallow 766 }; 767 768 769 // Determinizes a weighted transducer. This version is a delayed 770 // Fst. The result will be an equivalent FST that has the property 771 // that no state has two transitions with the same input label. 772 // For this algorithm, epsilon transitions are treated as regular 773 // symbols (cf. RmEpsilon). 774 // 775 // The transducer must be functional. The weights must be (weakly) 776 // left divisible (valid for TropicalWeight and LogWeight for instance) 777 // and be zero-sum-free if for all a,b: (Plus(a, b) = 0 => a = b = 0. 778 // 779 // Complexity: 780 // - Determinizable: exponential (polynomial in the size of the output) 781 // - Non-determinizable) does not terminate 782 // 783 // The determinizable automata include all unweighted and all acyclic input. 784 // 785 // References: 786 // - Mehryar Mohri, "Finite-State Transducers in Language and Speech 787 // Processing". Computational Linguistics, 23:2, 1997. 788 // 789 // This class attaches interface to implementation and handles 790 // reference counting, delegating most methods to ImplToFst. 791 template <class A> 792 class DeterminizeFst : public ImplToFst< DeterminizeFstImplBase<A> > { 793 public: 794 friend class ArcIterator< DeterminizeFst<A> >; 795 friend class StateIterator< DeterminizeFst<A> >; 796 template <class B, StringType S, class D, class F, class T> 797 friend class DeterminizeFstImpl; 798 799 typedef A Arc; 800 typedef typename A::Weight Weight; 801 typedef typename A::StateId StateId; 802 typedef typename A::Label Label; 803 typedef CacheState<A> State; 804 typedef DeterminizeFstImplBase<A> Impl; 805 806 using ImplToFst<Impl>::SetImpl; 807 808 explicit DeterminizeFst(const Fst<A> &fst) { 809 typedef DefaultCommonDivisor<Weight> D; 810 typedef IdentityDeterminizeFilter<A> F; 811 typedef DefaultDeterminizeStateTable<A> T; 812 DeterminizeFstOptions<A, D, F, T> opts; 813 if (fst.Properties(kAcceptor, true)) { 814 // Calls implementation for acceptors. 815 SetImpl(new DeterminizeFsaImpl<A, D, F, T>(fst, 0, 0, opts)); 816 } else { 817 // Calls implementation for transducers. 818 SetImpl(new 819 DeterminizeFstImpl<A, STRING_LEFT_RESTRICT, D, F, T>(fst, opts)); 820 } 821 } 822 823 template <class D, class F, class T> 824 DeterminizeFst(const Fst<A> &fst, 825 const DeterminizeFstOptions<A, D, F, T> &opts) { 826 if (fst.Properties(kAcceptor, true)) { 827 // Calls implementation for acceptors. 828 SetImpl(new DeterminizeFsaImpl<A, D, F, T>(fst, 0, 0, opts)); 829 } else { 830 // Calls implementation for transducers. 831 SetImpl(new 832 DeterminizeFstImpl<A, STRING_LEFT_RESTRICT, D, F, T>(fst, opts)); 833 } 834 } 835 836 // This acceptor-only version additionally computes the distance to 837 // final states in the output if provided with those distances for the 838 // input. Useful for e.g. unique N-shortest paths. 839 template <class D, class F, class T> 840 DeterminizeFst(const Fst<A> &fst, 841 const vector<Weight> *in_dist, vector<Weight> *out_dist, 842 const DeterminizeFstOptions<A, D, F, T> &opts) { 843 if (!fst.Properties(kAcceptor, true)) { 844 FSTERROR() << "DeterminizeFst:" 845 << " distance to final states computed for acceptors only"; 846 GetImpl()->SetProperties(kError, kError); 847 } 848 SetImpl(new DeterminizeFsaImpl<A, D, F, T>(fst, in_dist, out_dist, opts)); 849 } 850 851 // See Fst<>::Copy() for doc. 852 DeterminizeFst(const DeterminizeFst<A> &fst, bool safe = false) { 853 if (safe) 854 SetImpl(fst.GetImpl()->Copy()); 855 else 856 SetImpl(fst.GetImpl(), false); 857 } 858 859 // Get a copy of this DeterminizeFst. See Fst<>::Copy() for further doc. 860 virtual DeterminizeFst<A> *Copy(bool safe = false) const { 861 return new DeterminizeFst<A>(*this, safe); 862 } 863 864 virtual inline void InitStateIterator(StateIteratorData<A> *data) const; 865 866 virtual void InitArcIterator(StateId s, ArcIteratorData<A> *data) const { 867 GetImpl()->InitArcIterator(s, data); 868 } 869 870 private: 871 // Makes visible to friends. 872 Impl *GetImpl() const { return ImplToFst<Impl>::GetImpl(); } 873 874 void operator=(const DeterminizeFst<A> &fst); // Disallow 875 }; 876 877 878 // Initialization of transducer determinization implementation. which 879 // is defined after DeterminizeFst since it calls it. 880 template <class A, StringType S, class D, class F, class T> 881 void DeterminizeFstImpl<A, S, D, F, T>::Init(const Fst<A> &fst) { 882 // Mapper to an acceptor. 883 ToFst to_fst(fst, ToMapper()); 884 885 // Determinizes acceptor. 886 // This recursive call terminates since it passes the common divisor 887 // to a private constructor. 888 CacheOptions copts(GetCacheGc(), GetCacheLimit()); 889 DeterminizeFstOptions<ToArc, CommonDivisor> dopts(copts, delta_); 890 // Uses acceptor-only constructor to avoid template recursion 891 DeterminizeFst<ToArc> det_fsa(to_fst, 0, 0, dopts); 892 893 // Mapper back to transducer. 894 FactorWeightOptions<ToArc> fopts(CacheOptions(true, 0), delta_, 895 kFactorFinalWeights, 896 subsequential_label_, 897 subsequential_label_); 898 FactorWeightFst<ToArc, FactorIterator> factored_fst(det_fsa, fopts); 899 from_fst_ = new FromFst(factored_fst, FromMapper(subsequential_label_)); 900 } 901 902 903 // Specialization for DeterminizeFst. 904 template <class A> 905 class StateIterator< DeterminizeFst<A> > 906 : public CacheStateIterator< DeterminizeFst<A> > { 907 public: 908 explicit StateIterator(const DeterminizeFst<A> &fst) 909 : CacheStateIterator< DeterminizeFst<A> >(fst, fst.GetImpl()) {} 910 }; 911 912 913 // Specialization for DeterminizeFst. 914 template <class A> 915 class ArcIterator< DeterminizeFst<A> > 916 : public CacheArcIterator< DeterminizeFst<A> > { 917 public: 918 typedef typename A::StateId StateId; 919 920 ArcIterator(const DeterminizeFst<A> &fst, StateId s) 921 : CacheArcIterator< DeterminizeFst<A> >(fst.GetImpl(), s) { 922 if (!fst.GetImpl()->HasArcs(s)) 923 fst.GetImpl()->Expand(s); 924 } 925 926 private: 927 DISALLOW_COPY_AND_ASSIGN(ArcIterator); 928 }; 929 930 931 template <class A> inline 932 void DeterminizeFst<A>::InitStateIterator(StateIteratorData<A> *data) const 933 { 934 data->base = new StateIterator< DeterminizeFst<A> >(*this); 935 } 936 937 938 // Useful aliases when using StdArc. 939 typedef DeterminizeFst<StdArc> StdDeterminizeFst; 940 941 942 template <class Arc> 943 struct DeterminizeOptions { 944 typedef typename Arc::StateId StateId; 945 typedef typename Arc::Weight Weight; 946 typedef typename Arc::Label Label; 947 948 float delta; // Quantization delta for subset weights. 949 Weight weight_threshold; // Pruning weight threshold. 950 StateId state_threshold; // Pruning state threshold. 951 Label subsequential_label; // Label used for residual final output 952 // when producing subsequential transducers. 953 954 explicit DeterminizeOptions(float d = kDelta, Weight w = Weight::Zero(), 955 StateId n = kNoStateId, Label l = 0) 956 : delta(d), weight_threshold(w), state_threshold(n), 957 subsequential_label(l) {} 958 }; 959 960 961 // Determinizes a weighted transducer. This version writes the 962 // determinized Fst to an output MutableFst. The result will be an 963 // equivalent FST that has the property that no state has two 964 // transitions with the same input label. For this algorithm, epsilon 965 // transitions are treated as regular symbols (cf. RmEpsilon). 966 // 967 // The transducer must be functional. The weights must be (weakly) 968 // left divisible (valid for TropicalWeight and LogWeight). 969 // 970 // Complexity: 971 // - Determinizable: exponential (polynomial in the size of the output) 972 // - Non-determinizable: does not terminate 973 // 974 // The determinizable automata include all unweighted and all acyclic input. 975 // 976 // References: 977 // - Mehryar Mohri, "Finite-State Transducers in Language and Speech 978 // Processing". Computational Linguistics, 23:2, 1997. 979 template <class Arc> 980 void Determinize(const Fst<Arc> &ifst, MutableFst<Arc> *ofst, 981 const DeterminizeOptions<Arc> &opts 982 = DeterminizeOptions<Arc>()) { 983 typedef typename Arc::StateId StateId; 984 typedef typename Arc::Weight Weight; 985 986 DeterminizeFstOptions<Arc> nopts; 987 nopts.delta = opts.delta; 988 nopts.subsequential_label = opts.subsequential_label; 989 990 nopts.gc_limit = 0; // Cache only the last state for fastest copy. 991 992 if (opts.weight_threshold != Weight::Zero() || 993 opts.state_threshold != kNoStateId) { 994 if (ifst.Properties(kAcceptor, false)) { 995 vector<Weight> idistance, odistance; 996 ShortestDistance(ifst, &idistance, true); 997 DeterminizeFst<Arc> dfst(ifst, &idistance, &odistance, nopts); 998 PruneOptions< Arc, AnyArcFilter<Arc> > popts(opts.weight_threshold, 999 opts.state_threshold, 1000 AnyArcFilter<Arc>(), 1001 &odistance); 1002 Prune(dfst, ofst, popts); 1003 } else { 1004 *ofst = DeterminizeFst<Arc>(ifst, nopts); 1005 Prune(ofst, opts.weight_threshold, opts.state_threshold); 1006 } 1007 } else { 1008 *ofst = DeterminizeFst<Arc>(ifst, nopts); 1009 } 1010 } 1011 1012 1013 } // namespace fst 1014 1015 #endif // FST_LIB_DETERMINIZE_H__ 1016