Home | History | Annotate | Download | only in lib
      1 
      2 // Licensed under the Apache License, Version 2.0 (the "License");
      3 // you may not use this file except in compliance with the License.
      4 // You may obtain a copy of the License at
      5 //
      6 //     http://www.apache.org/licenses/LICENSE-2.0
      7 //
      8 // Unless required by applicable law or agreed to in writing, software
      9 // distributed under the License is distributed on an "AS IS" BASIS,
     10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     11 // See the License for the specific language governing permissions and
     12 // limitations under the License.
     13 //
     14 // Copyright 2005-2010 Google, Inc.
     15 // All Rights Reserved.
     16 //
     17 // Author : Johan Schalkwyk
     18 //
     19 // \file
     20 // Classes to provide symbol-to-integer and integer-to-symbol mappings.
     21 
     22 #include <fst/symbol-table.h>
     23 #include <fst/util.h>
     24 
// Boolean command-line flag, true by default. Its help text states the
// intent; it is not read by any function in this part of the file.
DEFINE_bool(fst_compat_symbols, true,
            "Require symbol tables to match when appropriate");
// String command-line flag holding the set of field-separator characters
// (tab and space by default). In this file, SymbolTableImpl::ReadText splits
// on any character of the set (plus newline), while SymbolTable::WriteText
// emits only the first character of the set between fields.
DEFINE_string(fst_field_separator, "\t ",
              "Set of characters used as a separator between printed fields");
     29 
     30 namespace fst {
     31 
// Size in bytes of the line buffer used when reading a textual symbols file.
// The buffer is filled with istream::getline(), which stores at most
// kLineLen - 1 characters plus a terminating NUL.
const int kLineLen = 8096;

// Identifies stream data as a symbol table (and its endianness).
// Written as the first field by SymbolTableImpl::Write.
static const int32 kSymbolTableMagicNumber = 2125658996;
     37 
     38 SymbolTableImpl* SymbolTableImpl::ReadText(istream &strm,
     39                                            const string &filename,
     40                                            bool allow_negative) {
     41   SymbolTableImpl* impl = new SymbolTableImpl(filename);
     42 
     43   int64 nline = 0;
     44   char line[kLineLen];
     45   while (strm.getline(line, kLineLen)) {
     46     ++nline;
     47     vector<char *> col;
     48     string separator = FLAGS_fst_field_separator + "\n";
     49     SplitToVector(line, separator.c_str(), &col, true);
     50     if (col.size() == 0)  // empty line
     51       continue;
     52     if (col.size() != 2) {
     53       LOG(ERROR) << "SymbolTable::ReadText: Bad number of columns ("
     54                  << col.size() << " skipping), "
     55                  << "file = " << filename << ", line = " << nline
     56                  << ":<" << line << ">";
     57       continue;
     58     }
     59     const char *symbol = col[0];
     60     const char *value = col[1];
     61     char *p;
     62     int64 key = strtoll(value, &p, 10);
     63     if (p < value + strlen(value) ||
     64         (!allow_negative && key < 0) || key == -1) {
     65       LOG(ERROR) << "SymbolTable::ReadText: Bad non-negative integer \""
     66                  << value << "\" (skipping), "
     67                  << "file = " << filename << ", line = " << nline;
     68       continue;
     69     }
     70     impl->AddSymbol(symbol, key);
     71   }
     72 
     73   return impl;
     74 }
     75 
     76 void SymbolTableImpl::MaybeRecomputeCheckSum() const {
     77   if (check_sum_finalized_)
     78     return;
     79 
     80   // Calculate the original label-agnostic check sum.
     81   check_sum_.Reset();
     82   for (int64 i = 0; i < symbols_.size(); ++i)
     83     check_sum_.Update(symbols_[i], strlen(symbols_[i]) + 1);
     84   check_sum_string_ = check_sum_.Digest();
     85 
     86   // Calculate the safer, label-dependent check sum.
     87   labeled_check_sum_.Reset();
     88   for (int64 key = 0; key < dense_key_limit_; ++key) {
     89     ostringstream line;
     90     line << symbols_[key] << '\t' << key;
     91     labeled_check_sum_.Update(line.str()); }
     92   for (map<int64, const char*>::const_iterator it =
     93        key_map_.begin();
     94        it != key_map_.end();
     95        ++it) {
     96     if (it->first >= dense_key_limit_) {
     97       ostringstream line;
     98       line << it->second << '\t' << it->first;
     99       labeled_check_sum_.Update(line.str());
    100     }
    101   }
    102   labeled_check_sum_string_ = labeled_check_sum_.Digest();
    103 
    104   check_sum_finalized_ = true;
    105 }
    106 
    107 int64 SymbolTableImpl::AddSymbol(const string& symbol, int64 key) {
    108   map<const char *, int64, StrCmp>::const_iterator it =
    109       symbol_map_.find(symbol.c_str());
    110   if (it == symbol_map_.end()) {  // only add if not in table
    111     check_sum_finalized_ = false;
    112 
    113     char *csymbol = new char[symbol.size() + 1];
    114     strcpy(csymbol, symbol.c_str());
    115     symbols_.push_back(csymbol);
    116     key_map_[key] = csymbol;
    117     symbol_map_[csymbol] = key;
    118 
    119     if (key >= available_key_) {
    120       available_key_ = key + 1;
    121     }
    122   } else {
    123     // Log if symbol already in table with different key
    124     if (it->second != key) {
    125       VLOG(1) << "SymbolTable::AddSymbol: symbol = " << symbol
    126               << " already in symbol_map_ with key = "
    127               << it->second
    128               << " but supplied new key = " << key
    129               << " (ignoring new key)";
    130     }
    131   }
    132   return key;
    133 }
    134 
    135 static bool IsInRange(const vector<pair<int64, int64> >& ranges,
    136                       int64 key) {
    137   if (ranges.size() == 0) return true;
    138   for (size_t i = 0; i < ranges.size(); ++i) {
    139     if (key >= ranges[i].first && key <= ranges[i].second)
    140       return true;
    141   }
    142   return false;
    143 }
    144 
    145 SymbolTableImpl* SymbolTableImpl::Read(istream &strm,
    146                                        const SymbolTableReadOptions& opts) {
    147   int32 magic_number = 0;
    148   ReadType(strm, &magic_number);
    149   if (!strm) {
    150     LOG(ERROR) << "SymbolTable::Read: read failed";
    151     return 0;
    152   }
    153   string name;
    154   ReadType(strm, &name);
    155   SymbolTableImpl* impl = new SymbolTableImpl(name);
    156   ReadType(strm, &impl->available_key_);
    157   int64 size;
    158   ReadType(strm, &size);
    159   if (!strm) {
    160     LOG(ERROR) << "SymbolTable::Read: read failed";
    161     delete impl;
    162     return 0;
    163   }
    164 
    165   string symbol;
    166   int64 key;
    167   impl->check_sum_finalized_ = false;
    168   for (size_t i = 0; i < size; ++i) {
    169     ReadType(strm, &symbol);
    170     ReadType(strm, &key);
    171     if (!strm) {
    172       LOG(ERROR) << "SymbolTable::Read: read failed";
    173       delete impl;
    174       return 0;
    175     }
    176 
    177     char *csymbol = new char[symbol.size() + 1];
    178     strcpy(csymbol, symbol.c_str());
    179     impl->symbols_.push_back(csymbol);
    180     if (key == impl->dense_key_limit_ &&
    181         key == impl->symbols_.size() - 1)
    182       impl->dense_key_limit_ = impl->symbols_.size();
    183     else
    184       impl->key_map_[key] = csymbol;
    185 
    186     if (IsInRange(opts.string_hash_ranges, key)) {
    187       impl->symbol_map_[csymbol] = key;
    188     }
    189   }
    190   return impl;
    191 }
    192 
    193 bool SymbolTableImpl::Write(ostream &strm) const {
    194   WriteType(strm, kSymbolTableMagicNumber);
    195   WriteType(strm, name_);
    196   WriteType(strm, available_key_);
    197   int64 size = symbols_.size();
    198   WriteType(strm, size);
    199   // first write out dense keys
    200   int64 i = 0;
    201   for (; i < dense_key_limit_; ++i) {
    202     WriteType(strm, string(symbols_[i]));
    203     WriteType(strm, i);
    204   }
    205   // next write out the remaining non densely packed keys
    206   for (map<const char *, int64, StrCmp>::const_iterator it =
    207            symbol_map_.begin(); it != symbol_map_.end(); ++it) {
    208     if ((it->second >= 0) && (it->second < dense_key_limit_))
    209       continue;
    210     WriteType(strm, string(it->first));
    211     WriteType(strm, it->second);
    212     ++i;
    213   }
    214   if (i != size) {
    215     LOG(ERROR) << "SymbolTable::Write:  write failed";
    216     return false;
    217   }
    218   strm.flush();
    219   if (!strm) {
    220     LOG(ERROR) << "SymbolTable::Write: write failed";
    221     return false;
    222   }
    223   return true;
    224 }
    225 
// Out-of-class definition of the static constant SymbolTable::kNoSymbol.
// No initializer is given here, so its value comes from the declaration
// inside the class.
const int64 SymbolTable::kNoSymbol;
    227 
    228 
    229 void SymbolTable::AddTable(const SymbolTable& table) {
    230   for (SymbolTableIterator iter(table); !iter.Done(); iter.Next())
    231     impl_->AddSymbol(iter.Symbol());
    232 }
    233 
    234 bool SymbolTable::WriteText(ostream &strm) const {
    235   for (SymbolTableIterator iter(*this); !iter.Done(); iter.Next()) {
    236     ostringstream line;
    237     line << iter.Symbol() << FLAGS_fst_field_separator[0] << iter.Value()
    238          << '\n';
    239     strm.write(line.str().c_str(), line.str().length());
    240   }
    241   return true;
    242 }
    243 }  // namespace fst
    244