Home | History | Annotate | Download | only in ngram
      1 
      2 // Licensed under the Apache License, Version 2.0 (the "License");
      3 // you may not use this file except in compliance with the License.
      4 // You may obtain a copy of the License at
      5 //
      6 //     http://www.apache.org/licenses/LICENSE-2.0
      7 //
      8 // Unless required by applicable law or agreed to in writing, software
      9 // distributed under the License is distributed on an "AS IS" BASIS,
     10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     11 // See the License for the specific language governing permissions and
     12 // limitations under the License.
     13 //
     14 // Copyright 2005-2010 Google, Inc.
     15 // Author: Jeffrey Soresnen (sorenj (at) google.com)
     16 
     17 #include <fst/extensions/ngram/bitmap-index.h>
     18 
     19 #include <algorithm>
     20 #include <iterator>
     21 
     22 #include <fst/extensions/ngram/nthbit.h>
     23 
     24 namespace fst {
     25 
     26 // These two internal classes implemented inverted views of the
     27 // primary and secondary indexes.  That is, they provide iterators
     28 // that have operator*'s that return the number zeros rather than
     29 // the number of ones.
     30 
     31 class primary_index_inverted : public vector<uint32>::const_iterator {
     32  public:
     33   primary_index_inverted() {}
     34   primary_index_inverted(vector<uint32>::const_iterator loc,
     35                          vector<uint32>::const_iterator begin) :
     36     vector<uint32>::const_iterator(loc), begin_(begin) {}
     37   uint32 operator*() {
     38     return BitmapIndex::kStorageBitSize * BitmapIndex::kSecondaryBlockSize *
     39            (1 + std::distance<vector<uint32>::const_iterator>(begin_, *this)) -
     40            vector<uint32>::const_iterator::operator*();
     41   }
     42  private:
     43   vector<uint32>::const_iterator begin_;
     44 };
     45 
     46 class secondary_index_inverted : public vector<uint16>::const_iterator {
     47  public:
     48   secondary_index_inverted() : vector<uint16>::const_iterator() {}
     49   secondary_index_inverted(vector<uint16>::const_iterator loc,
     50                            vector<uint16>::const_iterator block_begin) :
     51     vector<uint16>::const_iterator(loc), block_begin_(block_begin) {}
     52   uint16 operator*() {
     53     return ((1 + std::distance<vector<uint16>::const_iterator>(
     54         block_begin_, *this)) << BitmapIndex::kStorageLogBitSize) -
     55         vector<uint16>::const_iterator::operator*();
     56   }
     57  private:
     58   vector<uint16>::const_iterator block_begin_;
     59 };
     60 
     61 size_t BitmapIndex::Rank1(size_t end) const {
     62   if (end == 0) return 0;
     63   CHECK_LE(end, Bits());
     64   const uint32 end_word = (end - 1) >> BitmapIndex::kStorageLogBitSize;
     65   const uint32 sum = get_index_ones_count(end_word);
     66   const uint64 zero = 0;
     67   const uint64 ones = ~zero;
     68   return sum + __builtin_popcountll(bits_[end_word] &
     69       (ones >> (kStorageBitSize - (end & kStorageBlockMask))));
     70 }
     71 
     72 size_t BitmapIndex::Select1(size_t bit_index) const {
     73   if (bit_index >= GetOnesCount()) return Bits();
     74   // search primary index for the relevant block
     75   uint32 rembits = bit_index + 1;
     76   const uint32 block = find_primary_block(bit_index + 1);
     77   uint32 offset = 0;
     78   if (block > 0) {
     79     rembits -= primary_index_[block - 1];
     80     offset += block * kSecondaryBlockSize;
     81   }
     82   // search the secondary index
     83   uint32 word = find_secondary_block(offset, rembits);
     84   if (word > 0) {
     85     rembits -= secondary_index_[offset + word - 1];
     86     offset += word;
     87   }
     88   int nth = nth_bit(bits_[offset], rembits);
     89   return (offset << BitmapIndex::kStorageLogBitSize) + nth;
     90 }
     91 
     92 size_t BitmapIndex::Select0(size_t bit_index) const {
     93   if (bit_index >= Bits() - GetOnesCount()) return Bits();
     94   // search inverted primary index for relevant block
     95   uint32 remzeros = bit_index + 1;
     96   uint32 offset = 0;
     97   const uint32 block = find_inverted_primary_block(bit_index + 1);
     98   if (block > 0) {
     99     remzeros -= *primary_index_inverted(primary_index_.begin() + block - 1,
    100                                         primary_index_.begin());
    101     offset += block * kSecondaryBlockSize;
    102   }
    103   // search the inverted secondary index
    104   uint32 word = find_inverted_secondary_block(offset, remzeros);
    105   if (word > 0) {
    106     vector<uint16>::const_iterator block_begin =
    107         secondary_index_.begin() + offset;
    108     remzeros -= *secondary_index_inverted(block_begin + word - 1, block_begin);
    109     offset += word;
    110   }
    111   int nth = nth_bit(~bits_[offset], remzeros);
    112   return (offset << BitmapIndex::kStorageLogBitSize) + nth;
    113 }
    114 
    115 size_t BitmapIndex::get_index_ones_count(size_t array_index) const {
    116   uint32 sum = 0;
    117   if (array_index > 0) {
    118     sum += secondary_index_[array_index-1];
    119     uint32 end_block = (array_index - 1) / kSecondaryBlockSize;
    120     if (end_block > 0) sum += primary_index_[end_block-1];
    121   }
    122   return sum;
    123 }
    124 
    125 void BitmapIndex::BuildIndex(const uint64 *bits, size_t size) {
    126   bits_ = bits;
    127   size_ = size;
    128   secondary_index_.clear();
    129   secondary_index_.reserve(ArraySize());
    130   primary_index_.clear();
    131   primary_index_.reserve(primary_index_size());
    132   const uint64 zero = 0;
    133   const uint64 ones = ~zero;
    134   uint32 popcount = 0;
    135   for (uint32 block_begin = 0; block_begin < ArraySize();
    136        block_begin += kSecondaryBlockSize) {
    137     uint32 block_popcount = 0;
    138     uint32 block_end = block_begin + kSecondaryBlockSize;
    139     if (block_end > ArraySize()) block_end = ArraySize();
    140     for (uint32 j = block_begin; j < block_end; ++j) {
    141       uint64 mask = ones;
    142       if (j == ArraySize() - 1) {
    143         mask = ones >> (-size_ & BitmapIndex::kStorageBlockMask);
    144       }
    145       block_popcount += __builtin_popcountll(bits_[j] & mask);
    146       secondary_index_.push_back(block_popcount);
    147     }
    148     popcount += block_popcount;
    149     primary_index_.push_back(popcount);
    150   }
    151 }
    152 
    153 size_t BitmapIndex::find_secondary_block(
    154     size_t block_begin, size_t rem_bit_index) const {
    155   size_t block_end = block_begin + kSecondaryBlockSize;
    156   if (block_end > secondary_index_.size()) block_end = secondary_index_.size();
    157   return std::distance(secondary_index_.begin() + block_begin,
    158                        std::lower_bound(secondary_index_.begin() + block_begin,
    159                                         secondary_index_.begin() + block_end,
    160                                         rem_bit_index));
    161 }
    162 
    163 size_t BitmapIndex::find_inverted_secondary_block(
    164     size_t block_begin, size_t rem_bit_index) const {
    165   size_t block_end = block_begin + kSecondaryBlockSize;
    166   if (block_end > secondary_index_.size()) block_end = secondary_index_.size();
    167   secondary_index_inverted start(secondary_index_.begin() + block_begin,
    168                                  secondary_index_.begin() + block_begin);
    169   secondary_index_inverted end(secondary_index_.begin() + block_end,
    170                                secondary_index_.begin() + block_begin);
    171   return std::distance(start,
    172                        std::lower_bound(start, end, rem_bit_index));
    173 }
    174 
    175 inline size_t BitmapIndex::find_primary_block(size_t bit_index) const {
    176   return std::distance(primary_index_.begin(),
    177                        std::lower_bound(primary_index_.begin(),
    178                                         primary_index_.end(), bit_index));
    179 }
    180 
    181 size_t BitmapIndex::find_inverted_primary_block(size_t bit_index) const {
    182   primary_index_inverted start(primary_index_.begin(), primary_index_.begin());
    183   primary_index_inverted end(primary_index_.end(), primary_index_.begin());
    184   return std::distance(start, std::lower_bound(start, end, bit_index));
    185 }
    186 
    187 }  // end namespace fst
    188