1 // symbol-table.h 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 // 15 // 16 // \file 17 // Classes to provide symbol-to-integer and integer-to-symbol mappings. 18 19 #ifndef FST_LIB_SYMBOL_TABLE_H__ 20 #define FST_LIB_SYMBOL_TABLE_H__ 21 22 #include <ext/hash_map> 23 using __gnu_cxx::hash_map; 24 #include <fstream> 25 #include <iostream> 26 #include <string> 27 #include <vector> 28 29 #include "fst/lib/compat.h" 30 31 32 33 DECLARE_bool(fst_compat_symbols); 34 35 namespace fst { 36 37 class SymbolTableImpl { 38 friend class SymbolTableIterator; 39 public: 40 SymbolTableImpl(const string &name) 41 : name_(name), available_key_(0), ref_count_(1), 42 check_sum_finalized_(false) {} 43 ~SymbolTableImpl() { 44 for (size_t i = 0; i < symbols_.size(); ++i) 45 delete[] symbols_[i]; 46 } 47 48 int64 AddSymbol(const string& symbol, int64 key); 49 50 int64 AddSymbol(const string& symbol) { 51 int64 key = Find(symbol); 52 return (key == -1) ? AddSymbol(symbol, available_key_++) : key; 53 } 54 55 void AddTable(SymbolTableImpl* table) { 56 for (size_t i = 0; i < table->symbols_.size(); ++i) { 57 AddSymbol(table->symbols_[i]); 58 } 59 } 60 61 static SymbolTableImpl* ReadText(const string& filename); 62 63 static SymbolTableImpl* Read(istream &strm, const string& source); 64 65 bool Write(ostream &strm) const; 66 67 bool WriteText(ostream &strm) const; 68 69 // 70 // Return the string associated with the key. If the key is out of 71 // range (<0, >max), return an empty string. 72 string Find(int64 key) const { 73 hash_map<int64, string>::const_iterator it = 74 key_map_.find(key); 75 if (it == key_map_.end()) { 76 return ""; 77 } 78 return it->second; 79 } 80 81 // 82 // Return the key associated with the symbol. If the symbol 83 // does not exists, return -1. 84 int64 Find(const string& symbol) const { 85 return Find(symbol.c_str()); 86 } 87 88 // 89 // Return the key associated with the symbol. If the symbol 90 // does not exists, return -1. 91 int64 Find(const char* symbol) const { 92 hash_map<string, int64>::const_iterator it = 93 symbol_map_.find(symbol); 94 if (it == symbol_map_.end()) { 95 return -1; 96 } 97 return it->second; 98 } 99 100 const string& Name() const { return name_; } 101 102 int IncrRefCount() const { 103 return ++ref_count_; 104 } 105 int DecrRefCount() const { 106 return --ref_count_; 107 } 108 109 string CheckSum() const { 110 if (!check_sum_finalized_) { 111 RecomputeCheckSum(); 112 check_sum_string_ = check_sum_.Digest(); 113 } 114 return check_sum_string_; 115 } 116 117 int64 AvailableKey() const { 118 return available_key_; 119 } 120 121 // private support methods 122 private: 123 void RecomputeCheckSum() const; 124 static SymbolTableImpl* Read1(istream &, const string &); 125 126 string name_; 127 int64 available_key_; 128 vector<const char *> symbols_; 129 hash_map<int64, string> key_map_; 130 hash_map<string, int64> symbol_map_; 131 132 mutable int ref_count_; 133 mutable bool check_sum_finalized_; 134 mutable MD5 check_sum_; 135 mutable string check_sum_string_; 136 137 DISALLOW_EVIL_CONSTRUCTORS(SymbolTableImpl); 138 }; 139 140 141 class SymbolTableIterator; 142 143 // 144 // \class SymbolTable 145 // \brief Symbol (string) to int and reverse mapping 146 // 147 // The SymbolTable implements the mappings of labels to strings and reverse. 148 // SymbolTables are used to describe the alphabet of the input and output 149 // labels for arcs in a Finite State Transducer. 150 // 151 // SymbolTables are reference counted and can therefore be shared across 152 // multiple machines. For example a language model grammar G, with a 153 // SymbolTable for the words in the language model can share this symbol 154 // table with the lexical representation L o G. 155 // 156 class SymbolTable { 157 friend class SymbolTableIterator; 158 public: 159 static const int64 kNoSymbol = -1; 160 161 // Construct symbol table with a unique name. 162 SymbolTable(const string& name) : impl_(new SymbolTableImpl(name)) {} 163 164 // Create a reference counted copy. 165 SymbolTable(const SymbolTable& table) : impl_(table.impl_) { 166 impl_->IncrRefCount(); 167 } 168 169 // Derefence implentation object. When reference count hits 0, delete 170 // implementation. 171 ~SymbolTable() { 172 if (!impl_->DecrRefCount()) delete impl_; 173 } 174 175 // create a reference counted copy 176 SymbolTable* Copy() const { 177 return new SymbolTable(*this); 178 } 179 180 // Add a symbol with given key to table. A symbol table also 181 // keeps track of the last available key (highest key value in 182 // the symbol table). 183 // 184 // \param symbol string symbol to add 185 // \param key associated key for string symbol 186 // \return the key created by the symbol table. Symbols allready added to 187 // the symbol table will not get a different key. 188 int64 AddSymbol(const string& symbol, int64 key) { 189 return impl_->AddSymbol(symbol, key); 190 } 191 192 // Add a symbol to the table. The associated value key is automatically 193 // assigned by the symbol table. 194 // 195 // \param symbol string to add to the table 196 // \return the value key assigned to the associated string symbol 197 int64 AddSymbol(const string& symbol) { 198 return impl_->AddSymbol(symbol); 199 } 200 201 // Add another symbol table to this table. All key values will be offset 202 // by the current available key (highest key value in the symbol table). 203 // Note string symbols with the same key value with still have the same 204 // key value after the symbol table has been merged, but a different 205 // value. Adding symbol tables do not result in changes in the base table. 206 // 207 // Merging N symbol tables is often useful when combining the various 208 // name spaces of transducers to a unified representation. 209 // 210 // \param table the symbol table to add to this table 211 void AddTable(const SymbolTable& table) { 212 return impl_->AddTable(table.impl_); 213 } 214 215 // return the name of the symbol table 216 const string& Name() const { 217 return impl_->Name(); 218 } 219 220 // return the MD5 check-sum for this table. All new symbols added to 221 // the table will result in an updated checksum. 222 string CheckSum() const { 223 return impl_->CheckSum(); 224 } 225 226 // read an ascii representation of the symbol table 227 static SymbolTable* ReadText(const string& filename) { 228 SymbolTableImpl* impl = SymbolTableImpl::ReadText(filename); 229 if (!impl) 230 return 0; 231 else 232 return new SymbolTable(impl); 233 } 234 235 // read a binary dump of the symbol table 236 static SymbolTable* Read(istream &strm, const string& source) { 237 SymbolTableImpl* impl = SymbolTableImpl::Read(strm, source); 238 if (!impl) 239 return 0; 240 else 241 return new SymbolTable(impl); 242 } 243 244 // read a binary dump of the symbol table 245 static SymbolTable* Read(const string& filename) { 246 ifstream strm(filename.c_str()); 247 if (!strm) { 248 LOG(ERROR) << "SymbolTable::Read: Can't open file " << filename; 249 return 0; 250 } 251 return Read(strm, filename); 252 } 253 254 bool Write(ostream &strm) const { 255 return impl_->Write(strm); 256 } 257 258 bool Write(const string& filename) const { 259 ofstream strm(filename.c_str()); 260 if (!strm) { 261 LOG(ERROR) << "SymbolTable::Write: Can't open file " << filename; 262 return false; 263 } 264 return Write(strm); 265 } 266 267 // Dump an ascii text representation of the symbol table 268 bool WriteText(ostream &strm) const { 269 return impl_->WriteText(strm); 270 } 271 272 // Dump an ascii text representation of the symbol table 273 bool WriteText(const string& filename) const { 274 ofstream strm(filename.c_str()); 275 if (!strm) { 276 LOG(ERROR) << "SymbolTable::WriteText: Can't open file " << filename; 277 return false; 278 } 279 return WriteText(strm); 280 } 281 282 // Return the string associated with the key. If the key is out of 283 // range (<0, >max), log error and return an empty string. 284 string Find(int64 key) const { 285 return impl_->Find(key); 286 } 287 288 // Return the key associated with the symbol. If the symbol 289 // does not exists, log error and return -1 290 int64 Find(const string& symbol) const { 291 return impl_->Find(symbol); 292 } 293 294 // Return the key associated with the symbol. If the symbol 295 // does not exists, log error and return -1 296 int64 Find(const char* symbol) const { 297 return impl_->Find(symbol); 298 } 299 300 // return the current available key (i.e highest key number) in 301 // the symbol table 302 int64 AvailableKey(void) const { 303 return impl_->AvailableKey(); 304 } 305 306 protected: 307 explicit SymbolTable(SymbolTableImpl* impl) : impl_(impl) {} 308 309 const SymbolTableImpl* Impl() const { 310 return impl_; 311 } 312 313 private: 314 SymbolTableImpl* impl_; 315 316 317 void operator=(const SymbolTable &table); // disallow 318 }; 319 320 321 // 322 // \class SymbolTableIterator 323 // \brief Iterator class for symbols in a symbol table 324 class SymbolTableIterator { 325 public: 326 // Constructor creates a refcounted copy of underlying implementation 327 SymbolTableIterator(const SymbolTable& symbol_table) { 328 impl_ = symbol_table.Impl(); 329 impl_->IncrRefCount(); 330 pos_ = 0; 331 size_ = impl_->symbols_.size(); 332 } 333 334 // decrement implementation refcount, and delete if 0 335 ~SymbolTableIterator() { 336 if (!impl_->DecrRefCount()) delete impl_; 337 } 338 339 // is iterator done 340 bool Done(void) { 341 return (pos_ == size_); 342 } 343 344 // return the Value() of the current symbol (in64 key) 345 int64 Value(void) { 346 return impl_->Find(impl_->symbols_[pos_]); 347 } 348 349 // return the string of the current symbol 350 const char* Symbol(void) { 351 return impl_->symbols_[pos_]; 352 } 353 354 // advance iterator forward 355 void Next(void) { 356 if (Done()) return; 357 ++pos_; 358 } 359 360 // reset iterator 361 void Reset(void) { 362 pos_ = 0; 363 } 364 365 private: 366 const SymbolTableImpl* impl_; 367 size_t pos_; 368 size_t size_; 369 }; 370 371 372 // Tests compatibilty between two sets of symbol tables 373 inline bool CompatSymbols(const SymbolTable *syms1, 374 const SymbolTable *syms2) { 375 if (!FLAGS_fst_compat_symbols) 376 return true; 377 else if (!syms1 && !syms2) 378 return true; 379 else if (syms1 && !syms2 || !syms1 && syms2) 380 return false; 381 else 382 return syms1->CheckSum() == syms2->CheckSum(); 383 } 384 385 } // namespace fst 386 387 #endif // FST_LIB_SYMBOL_TABLE_H__ 388