Home | History | Annotate | Download | only in lib
      1 // symbol-table.h
      2 //
      3 // Licensed under the Apache License, Version 2.0 (the "License");
      4 // you may not use this file except in compliance with the License.
      5 // You may obtain a copy of the License at
      6 //
      7 //      http://www.apache.org/licenses/LICENSE-2.0
      8 //
      9 // Unless required by applicable law or agreed to in writing, software
     10 // distributed under the License is distributed on an "AS IS" BASIS,
     11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 // See the License for the specific language governing permissions and
     13 // limitations under the License.
     14 //
     15 //
     16 // \file
     17 // Classes to provide symbol-to-integer and integer-to-symbol mappings.
     18 
     19 #ifndef FST_LIB_SYMBOL_TABLE_H__
     20 #define FST_LIB_SYMBOL_TABLE_H__
     21 
     22 #include <ext/hash_map>
     23 using __gnu_cxx::hash_map;
     24 #include <fstream>
     25 #include <iostream>
     26 #include <string>
     27 #include <vector>
     28 
     29 #include "fst/lib/compat.h"
     30 
     31 
     32 
     33 DECLARE_bool(fst_compat_symbols);
     34 
     35 namespace fst {
     36 
     37 class SymbolTableImpl {
     38   friend class SymbolTableIterator;
     39  public:
     40   SymbolTableImpl(const string &name)
     41       : name_(name), available_key_(0), ref_count_(1),
     42         check_sum_finalized_(false) {}
     43   ~SymbolTableImpl() {
     44     for (size_t i = 0; i < symbols_.size(); ++i)
     45       delete[] symbols_[i];
     46   }
     47 
     48   int64 AddSymbol(const string& symbol, int64 key);
     49 
     50   int64 AddSymbol(const string& symbol) {
     51     int64 key = Find(symbol);
     52     return (key == -1) ? AddSymbol(symbol, available_key_++) : key;
     53   }
     54 
     55   void AddTable(SymbolTableImpl* table) {
     56     for (size_t i = 0; i < table->symbols_.size(); ++i) {
     57       AddSymbol(table->symbols_[i]);
     58     }
     59   }
     60 
     61   static SymbolTableImpl* ReadText(const string& filename);
     62 
     63   static SymbolTableImpl* Read(istream &strm, const string& source);
     64 
     65   bool Write(ostream &strm) const;
     66 
     67   bool WriteText(ostream &strm) const;
     68 
     69   //
     70   // Return the string associated with the key. If the key is out of
     71   // range (<0, >max), return an empty string.
     72   string Find(int64 key) const {
     73     hash_map<int64, string>::const_iterator it =
     74       key_map_.find(key);
     75     if (it == key_map_.end()) {
     76       return "";
     77     }
     78     return it->second;
     79   }
     80 
     81   //
     82   // Return the key associated with the symbol. If the symbol
     83   // does not exists, return -1.
     84   int64 Find(const string& symbol) const {
     85     return Find(symbol.c_str());
     86   }
     87 
     88   //
     89   // Return the key associated with the symbol. If the symbol
     90   // does not exists, return -1.
     91   int64 Find(const char* symbol) const {
     92     hash_map<string, int64>::const_iterator it =
     93       symbol_map_.find(symbol);
     94     if (it == symbol_map_.end()) {
     95       return -1;
     96     }
     97     return it->second;
     98   }
     99 
    100   const string& Name() const { return name_; }
    101 
    102   int IncrRefCount() const {
    103     return ++ref_count_;
    104   }
    105   int DecrRefCount() const {
    106     return --ref_count_;
    107   }
    108 
    109   string CheckSum() const {
    110     if (!check_sum_finalized_) {
    111       RecomputeCheckSum();
    112       check_sum_string_ = check_sum_.Digest();
    113     }
    114     return check_sum_string_;
    115   }
    116 
    117   int64 AvailableKey() const {
    118     return available_key_;
    119   }
    120 
    121   // private support methods
    122  private:
    123   void RecomputeCheckSum() const;
    124   static SymbolTableImpl* Read1(istream &, const string &);
    125 
    126   string name_;
    127   int64 available_key_;
    128   vector<const char *> symbols_;
    129   hash_map<int64, string> key_map_;
    130   hash_map<string, int64> symbol_map_;
    131 
    132   mutable int ref_count_;
    133   mutable bool check_sum_finalized_;
    134   mutable MD5 check_sum_;
    135   mutable string check_sum_string_;
    136 
    137   DISALLOW_EVIL_CONSTRUCTORS(SymbolTableImpl);
    138 };
    139 
    140 
    141 class SymbolTableIterator;
    142 
    143 //
    144 // \class SymbolTable
    145 // \brief Symbol (string) to int and reverse mapping
    146 //
    147 // The SymbolTable implements the mappings of labels to strings and reverse.
    148 // SymbolTables are used to describe the alphabet of the input and output
    149 // labels for arcs in a Finite State Transducer.
    150 //
    151 // SymbolTables are reference counted and can therefore be shared across
    152 // multiple machines. For example a language model grammar G, with a
    153 // SymbolTable for the words in the language model can share this symbol
    154 // table with the lexical representation L o G.
    155 //
    156 class SymbolTable {
    157   friend class SymbolTableIterator;
    158  public:
    159   static const int64 kNoSymbol = -1;
    160 
    161   // Construct symbol table with a unique name.
    162   SymbolTable(const string& name) : impl_(new SymbolTableImpl(name)) {}
    163 
    164   // Create a reference counted copy.
    165   SymbolTable(const SymbolTable& table) : impl_(table.impl_) {
    166     impl_->IncrRefCount();
    167   }
    168 
    169   // Derefence implentation object. When reference count hits 0, delete
    170   // implementation.
    171   ~SymbolTable() {
    172     if (!impl_->DecrRefCount()) delete impl_;
    173   }
    174 
    175   // create a reference counted copy
    176   SymbolTable* Copy() const {
    177     return new SymbolTable(*this);
    178   }
    179 
    180   // Add a symbol with given key to table. A symbol table also
    181   // keeps track of the last available key (highest key value in
    182   // the symbol table).
    183   //
    184   // \param symbol string symbol to add
    185   // \param key associated key for string symbol
    186   // \return the key created by the symbol table. Symbols allready added to
    187   //         the symbol table will not get a different key.
    188   int64 AddSymbol(const string& symbol, int64 key) {
    189     return impl_->AddSymbol(symbol, key);
    190   }
    191 
    192   // Add a symbol to the table. The associated value key is automatically
    193   // assigned by the symbol table.
    194   //
    195   // \param symbol string to add to the table
    196   // \return the value key assigned to the associated string symbol
    197   int64 AddSymbol(const string& symbol) {
    198     return impl_->AddSymbol(symbol);
    199   }
    200 
    201   // Add another symbol table to this table. All key values will be offset
    202   // by the current available key (highest key value in the symbol table).
    203   // Note string symbols with the same key value with still have the same
    204   // key value after the symbol table has been merged, but a different
    205   // value. Adding symbol tables do not result in changes in the base table.
    206   //
    207   // Merging N symbol tables is often useful when combining the various
    208   // name spaces of transducers to a unified representation.
    209   //
    210   // \param table the symbol table to add to this table
    211   void AddTable(const SymbolTable& table) {
    212     return impl_->AddTable(table.impl_);
    213   }
    214 
    215   // return the name of the symbol table
    216   const string& Name() const {
    217     return impl_->Name();
    218   }
    219 
    220   // return the MD5 check-sum for this table. All new symbols added to
    221   // the table will result in an updated checksum.
    222   string CheckSum() const {
    223     return impl_->CheckSum();
    224   }
    225 
    226   // read an ascii representation of the symbol table
    227   static SymbolTable* ReadText(const string& filename) {
    228     SymbolTableImpl* impl = SymbolTableImpl::ReadText(filename);
    229     if (!impl)
    230       return 0;
    231     else
    232       return new SymbolTable(impl);
    233   }
    234 
    235   // read a binary dump of the symbol table
    236   static SymbolTable* Read(istream &strm, const string& source) {
    237     SymbolTableImpl* impl = SymbolTableImpl::Read(strm, source);
    238     if (!impl)
    239       return 0;
    240     else
    241       return new SymbolTable(impl);
    242   }
    243 
    244   // read a binary dump of the symbol table
    245   static SymbolTable* Read(const string& filename) {
    246     ifstream strm(filename.c_str());
    247     if (!strm) {
    248       LOG(ERROR) << "SymbolTable::Read: Can't open file " << filename;
    249       return 0;
    250     }
    251     return Read(strm, filename);
    252   }
    253 
    254   bool Write(ostream  &strm) const {
    255     return impl_->Write(strm);
    256   }
    257 
    258   bool Write(const string& filename) const {
    259     ofstream strm(filename.c_str());
    260     if (!strm) {
    261       LOG(ERROR) << "SymbolTable::Write: Can't open file " << filename;
    262       return false;
    263     }
    264     return Write(strm);
    265   }
    266 
    267   // Dump an ascii text representation of the symbol table
    268   bool WriteText(ostream &strm) const {
    269     return impl_->WriteText(strm);
    270   }
    271 
    272   // Dump an ascii text representation of the symbol table
    273   bool WriteText(const string& filename) const {
    274     ofstream strm(filename.c_str());
    275     if (!strm) {
    276       LOG(ERROR) << "SymbolTable::WriteText: Can't open file " << filename;
    277       return false;
    278     }
    279     return WriteText(strm);
    280   }
    281 
    282   // Return the string associated with the key. If the key is out of
    283   // range (<0, >max), log error and return an empty string.
    284   string Find(int64 key) const {
    285     return impl_->Find(key);
    286   }
    287 
    288   // Return the key associated with the symbol. If the symbol
    289   // does not exists, log error and  return -1
    290   int64 Find(const string& symbol) const {
    291     return impl_->Find(symbol);
    292   }
    293 
    294   // Return the key associated with the symbol. If the symbol
    295   // does not exists, log error and  return -1
    296   int64 Find(const char* symbol) const {
    297     return impl_->Find(symbol);
    298   }
    299 
    300   // return the current available key (i.e highest key number) in
    301   // the symbol table
    302   int64 AvailableKey(void) const {
    303     return impl_->AvailableKey();
    304   }
    305 
    306  protected:
    307   explicit SymbolTable(SymbolTableImpl* impl) : impl_(impl) {}
    308 
    309   const SymbolTableImpl* Impl() const {
    310     return impl_;
    311   }
    312 
    313  private:
    314   SymbolTableImpl* impl_;
    315 
    316 
    317   void operator=(const SymbolTable &table);  // disallow
    318 };
    319 
    320 
    321 //
    322 // \class SymbolTableIterator
    323 // \brief Iterator class for symbols in a symbol table
    324 class SymbolTableIterator {
    325  public:
    326   // Constructor creates a refcounted copy of underlying implementation
    327   SymbolTableIterator(const SymbolTable& symbol_table) {
    328     impl_ = symbol_table.Impl();
    329     impl_->IncrRefCount();
    330     pos_ = 0;
    331     size_ = impl_->symbols_.size();
    332   }
    333 
    334   // decrement implementation refcount, and delete if 0
    335   ~SymbolTableIterator() {
    336     if (!impl_->DecrRefCount()) delete impl_;
    337   }
    338 
    339   // is iterator done
    340   bool Done(void) {
    341     return (pos_ == size_);
    342   }
    343 
    344   // return the Value() of the current symbol (in64 key)
    345   int64 Value(void) {
    346     return impl_->Find(impl_->symbols_[pos_]);
    347   }
    348 
    349   // return the string of the current symbol
    350   const char* Symbol(void) {
    351     return impl_->symbols_[pos_];
    352   }
    353 
    354   // advance iterator forward
    355   void Next(void) {
    356     if (Done()) return;
    357     ++pos_;
    358   }
    359 
    360   // reset iterator
    361   void Reset(void) {
    362     pos_ = 0;
    363   }
    364 
    365  private:
    366   const SymbolTableImpl* impl_;
    367   size_t pos_;
    368   size_t size_;
    369 };
    370 
    371 
    372 // Tests compatibilty between two sets of symbol tables
    373 inline bool CompatSymbols(const SymbolTable *syms1,
    374                           const SymbolTable *syms2) {
    375   if (!FLAGS_fst_compat_symbols)
    376     return true;
    377   else if (!syms1 && !syms2)
    378     return true;
    379   else if (syms1 && !syms2 || !syms1 && syms2)
    380     return false;
    381   else
    382     return syms1->CheckSum() == syms2->CheckSum();
    383 }
    384 
    385 }  // namespace fst
    386 
    387 #endif  // FST_LIB_SYMBOL_TABLE_H__
    388