Home | History | Annotate | Download | only in ngram
      1 
      2 // Licensed under the Apache License, Version 2.0 (the "License");
      3 // you may not use this file except in compliance with the License.
      4 // You may obtain a copy of the License at
      5 //
      6 //     http://www.apache.org/licenses/LICENSE-2.0
      7 //
      8 // Unless required by applicable law or agreed to in writing, software
      9 // distributed under the License is distributed on an "AS IS" BASIS,
     10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     11 // See the License for the specific language governing permissions and
     12 // limitations under the License.
     13 //
     14 // Copyright 2005-2010 Google, Inc.
     15 // Author: sorenj (at) google.com (Jeffrey Sorensen)
     16 
     17 #ifndef FST_EXTENSIONS_NGRAM_BITMAP_INDEX_H_
     18 #define FST_EXTENSIONS_NGRAM_BITMAP_INDEX_H_
     19 
     20 #include <vector>
     21 using std::vector;
     22 
     23 #include <fst/compat.h>
     24 
     25 // This class is a bitstring storage class with an index that allows
     26 // seeking to the Nth set or clear bit in time O(Log(N)) where N is
     27 // the length of the bit vector.  In addition, it allows counting set or
     28 // clear bits over ranges in constant time.
     29 //
     30 // This is accomplished by maintaining an "secondary" index of limited
     31 // size in bits that maintains a running count of the number of bits set
     32 // in each block of bitmap data.  A block is defined as the number of
     33 // uint64 values that can fit in the secondary index before an overflow
     34 // occurs.
     35 //
     36 // To handle overflows, a "primary" index containing a running count of
     37 // bits set in each block is created using the type uint64.
     38 
     39 namespace fst {
     40 
     41 class BitmapIndex {
     42  public:
     43   static size_t StorageSize(size_t size) {
     44     return ((size + kStorageBlockMask) >> kStorageLogBitSize);
     45   }
     46 
     47   BitmapIndex() : bits_(NULL), size_(0) { }
     48 
     49   bool Get(size_t index) const {
     50     return (bits_[index >> kStorageLogBitSize] &
     51             (kOne << (index & kStorageBlockMask))) != 0;
     52   }
     53 
     54   static void Set(uint64* bits, size_t index) {
     55     bits[index >> kStorageLogBitSize] |= (kOne << (index & kStorageBlockMask));
     56   }
     57 
     58   static void Clear(uint64* bits, size_t index) {
     59     bits[index >> kStorageLogBitSize] &= ~(kOne << (index & kStorageBlockMask));
     60   }
     61 
     62   size_t Bits() const {
     63     return size_;
     64   }
     65 
     66   size_t ArraySize() const {
     67     return StorageSize(size_);
     68   }
     69 
     70   // Returns the number of one bits in the bitmap
     71   size_t GetOnesCount() const {
     72     return primary_index_[primary_index_size() - 1];
     73   }
     74 
     75   // Returns the number of one bits in positions 0 to limit - 1.
     76   // REQUIRES: limit <= Bits()
     77   size_t Rank1(size_t end) const;
     78 
     79   // Returns the number of one bits in the range start to end - 1.
     80   // REQUIRES: limit <= Bits()
     81   size_t GetOnesCountInRange(size_t start, size_t end) const {
     82     return Rank1(end) - Rank1(start);
     83   }
     84 
     85   // Returns the number of zero bits in positions 0 to limit - 1.
     86   // REQUIRES: limit <= Bits()
     87   size_t Rank0(size_t end) const {
     88     return end - Rank1(end);
     89   }
     90 
     91   // Returns the number of zero bits in the range start to end - 1.
     92   // REQUIRES: limit <= Bits()
     93   size_t GetZeroesCountInRange(size_t start, size_t end) const {
     94     return end - start - GetOnesCountInRange(start, end);
     95   }
     96 
     97   // Return true if any bit between begin inclusive and end exclusive
     98   // is set.  0 <= begin <= end <= Bits() is required.
     99   //
    100   bool TestRange(size_t start, size_t end) const {
    101     return Rank1(end) > Rank1(start);
    102   }
    103 
    104   // Returns the offset to the nth set bit (zero based)
    105   // or Bits() if index >= number of ones
    106   size_t Select1(size_t bit_index) const;
    107 
    108   // Returns the offset to the nth clear bit (zero based)
    109   // or Bits() if index > number of
    110   size_t Select0(size_t bit_index) const;
    111 
    112   // Rebuilds from index for the associated Bitmap, should be called
    113   // whenever changes have been made to the Bitmap or else behavior
    114   // of the indexed bitmap methods will be undefined.
    115   void BuildIndex(const uint64 *bits, size_t size);
    116 
    117   // the secondary index accumulates counts until it can possibly overflow
    118   // this constant computes the number of uint64 units that can fit into
    119   // units the size of uint16.
    120   static const uint64 kOne = 1;
    121   static const uint32 kStorageBitSize = 64;
    122   static const uint32 kStorageLogBitSize = 6;
    123   static const uint32 kSecondaryBlockSize = ((1 << 16) - 1)
    124       >> kStorageLogBitSize;
    125 
    126  private:
    127   static const uint32 kStorageBlockMask = kStorageBitSize - 1;
    128 
    129   // returns, from the index, the count of ones up to array_index
    130   size_t get_index_ones_count(size_t array_index) const;
    131 
    132   // because the indexes, both primary and secondary, contain a running
    133   // count of the population of one bits contained in [0,i), there is
    134   // no reason to have an element in the zeroth position as this value would
    135   // necessarily be zero.  (The bits are indexed in a zero based way.)  Thus
    136   // we don't store the 0th element in either index.  Both of the following
    137   // functions, if greater than 0, must be decremented by one before retreiving
    138   // the value from the corresponding array.
    139   // returns the 1 + the block that contains the bitindex in question
    140   // the inverted version works the same but looks for zeros using an inverted
    141   // view of the index
    142   size_t find_primary_block(size_t bit_index) const;
    143 
    144   size_t find_inverted_primary_block(size_t bit_index) const;
    145 
    146   // similarly, the secondary index (which resets its count to zero at
    147   // the end of every kSecondaryBlockSize entries) does not store the element
    148   // at 0.  Note that the rem_bit_index parameter is the number of bits
    149   // within the secondary block, after the bits accounted for by the primary
    150   // block have been removed (i.e. the remaining bits)  And, because we
    151   // reset to zero with each new block, there is no need to store those
    152   // actual zeros.
    153   // returns 1 + the secondary block that contains the bitindex in question
    154   size_t find_secondary_block(size_t block, size_t rem_bit_index) const;
    155 
    156   size_t find_inverted_secondary_block(size_t block, size_t rem_bit_index)
    157       const;
    158 
    159   // We create a primary index based upon the number of secondary index
    160   // blocks.  The primary index uses fields wide enough to accomodate any
    161   // index of the bitarray so cannot overflow
    162   // The primary index is the actual running
    163   // count of one bits set for all blocks (and, thus, all uint64s).
    164   size_t primary_index_size() const {
    165     return (ArraySize() + kSecondaryBlockSize - 1) / kSecondaryBlockSize;
    166   }
    167 
    168   const uint64* bits_;
    169   size_t size_;
    170 
    171   // The primary index contains the running popcount of all blocks
    172   // which means the nth value contains the popcounts of
    173   // [0,n*kSecondaryBlockSize], however, the 0th element is omitted.
    174   vector<uint32> primary_index_;
    175   // The secondary index contains the running popcount of the associated
    176   // bitmap.  It is the same length (in units of uint16) as the
    177   // bitmap's map is in units of uint64s.
    178   vector<uint16> secondary_index_;
    179 };
    180 
    181 }  // end namespace fst
    182 
    183 #endif  // FST_EXTENSIONS_NGRAM_BITMAP_INDEX_H_
    184