Home | History | Annotate | Download | only in flann
      1 /***********************************************************************
      2  * Software License Agreement (BSD License)
      3  *
      4  * Copyright 2008-2009  Marius Muja (mariusm (at) cs.ubc.ca). All rights reserved.
      5  * Copyright 2008-2009  David G. Lowe (lowe (at) cs.ubc.ca). All rights reserved.
      6  *
      7  * THE BSD LICENSE
      8  *
      9  * Redistribution and use in source and binary forms, with or without
     10  * modification, are permitted provided that the following conditions
     11  * are met:
     12  *
     13  * 1. Redistributions of source code must retain the above copyright
     14  *    notice, this list of conditions and the following disclaimer.
     15  * 2. Redistributions in binary form must reproduce the above copyright
     16  *    notice, this list of conditions and the following disclaimer in the
     17  *    documentation and/or other materials provided with the distribution.
     18  *
     19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
     20  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
     21  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
     22  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
     23  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
     24  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
     28  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29  *************************************************************************/
     30 
     31 /***********************************************************************
     32  * Author: Vincent Rabaud
     33  *************************************************************************/
     34 
     35 #ifndef OPENCV_FLANN_LSH_TABLE_H_
     36 #define OPENCV_FLANN_LSH_TABLE_H_
     37 
     38 #include <algorithm>
     39 #include <iostream>
     40 #include <iomanip>
     41 #include <limits.h>
     42 // TODO as soon as we use C++0x, use the code in USE_UNORDERED_MAP
     43 #ifdef __GXX_EXPERIMENTAL_CXX0X__
     44 #  define USE_UNORDERED_MAP 1
     45 #else
     46 #  define USE_UNORDERED_MAP 0
     47 #endif
     48 #if USE_UNORDERED_MAP
     49 #include <unordered_map>
     50 #else
     51 #include <map>
     52 #endif
     53 #include <math.h>
     54 #include <stddef.h>
     55 
     56 #include "dynamic_bitset.h"
     57 #include "matrix.h"
     58 
     59 namespace cvflann
     60 {
     61 
     62 namespace lsh
     63 {
     64 
     65 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     66 
     67 /** What is stored in an LSH bucket
     68  */
     69 typedef uint32_t FeatureIndex;
     70 /** The id from which we can get a bucket back in an LSH table
     71  */
     72 typedef unsigned int BucketKey;
     73 
     74 /** A bucket in an LSH table
     75  */
     76 typedef std::vector<FeatureIndex> Bucket;
     77 
     78 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
     79 
     80 /** POD for stats about an LSH table
     81  */
     82 struct LshStats
     83 {
     84     std::vector<unsigned int> bucket_sizes_;
     85     size_t n_buckets_;
     86     size_t bucket_size_mean_;
     87     size_t bucket_size_median_;
     88     size_t bucket_size_min_;
     89     size_t bucket_size_max_;
     90     size_t bucket_size_std_dev;
     91     /** Each contained vector contains three value: beginning/end for interval, number of elements in the bin
     92      */
     93     std::vector<std::vector<unsigned int> > size_histogram_;
     94 };
     95 
     96 /** Overload the << operator for LshStats
     97  * @param out the streams
     98  * @param stats the stats to display
     99  * @return the streams
    100  */
    101 inline std::ostream& operator <<(std::ostream& out, const LshStats& stats)
    102 {
    103     int w = 20;
    104     out << "Lsh Table Stats:\n" << std::setw(w) << std::setiosflags(std::ios::right) << "N buckets : "
    105     << stats.n_buckets_ << "\n" << std::setw(w) << std::setiosflags(std::ios::right) << "mean size : "
    106     << std::setiosflags(std::ios::left) << stats.bucket_size_mean_ << "\n" << std::setw(w)
    107     << std::setiosflags(std::ios::right) << "median size : " << stats.bucket_size_median_ << "\n" << std::setw(w)
    108     << std::setiosflags(std::ios::right) << "min size : " << std::setiosflags(std::ios::left)
    109     << stats.bucket_size_min_ << "\n" << std::setw(w) << std::setiosflags(std::ios::right) << "max size : "
    110     << std::setiosflags(std::ios::left) << stats.bucket_size_max_;
    111 
    112     // Display the histogram
    113     out << std::endl << std::setw(w) << std::setiosflags(std::ios::right) << "histogram : "
    114     << std::setiosflags(std::ios::left);
    115     for (std::vector<std::vector<unsigned int> >::const_iterator iterator = stats.size_histogram_.begin(), end =
    116              stats.size_histogram_.end(); iterator != end; ++iterator) out << (*iterator)[0] << "-" << (*iterator)[1] << ": " << (*iterator)[2] << ",  ";
    117 
    118     return out;
    119 }
    120 
    121 
    122 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    123 
    124 /** Lsh hash table. As its key is a sub-feature, and as usually
    125  * the size of it is pretty small, we keep it as a continuous memory array.
    126  * The value is an index in the corpus of features (we keep it as an unsigned
    127  * int for pure memory reasons, it could be a size_t)
    128  */
    129 template<typename ElementType>
    130 class LshTable
    131 {
    132 public:
    133     /** A container of all the feature indices. Optimized for space
    134      */
    135 #if USE_UNORDERED_MAP
    136     typedef std::unordered_map<BucketKey, Bucket> BucketsSpace;
    137 #else
    138     typedef std::map<BucketKey, Bucket> BucketsSpace;
    139 #endif
    140 
    141     /** A container of all the feature indices. Optimized for speed
    142      */
    143     typedef std::vector<Bucket> BucketsSpeed;
    144 
    145     /** Default constructor
    146      */
    147     LshTable()
    148     {
    149     }
    150 
    151     /** Default constructor
    152      * Create the mask and allocate the memory
    153      * @param feature_size is the size of the feature (considered as a ElementType[])
    154      * @param key_size is the number of bits that are turned on in the feature
    155      */
    156     LshTable(unsigned int feature_size, unsigned int key_size)
    157     {
    158         (void)feature_size;
    159         (void)key_size;
    160         std::cerr << "LSH is not implemented for that type" << std::endl;
    161         assert(0);
    162     }
    163 
    164     /** Add a feature to the table
    165      * @param value the value to store for that feature
    166      * @param feature the feature itself
    167      */
    168     void add(unsigned int value, const ElementType* feature)
    169     {
    170         // Add the value to the corresponding bucket
    171         BucketKey key = (lsh::BucketKey)getKey(feature);
    172 
    173         switch (speed_level_) {
    174         case kArray:
    175             // That means we get the buckets from an array
    176             buckets_speed_[key].push_back(value);
    177             break;
    178         case kBitsetHash:
    179             // That means we can check the bitset for the presence of a key
    180             key_bitset_.set(key);
    181             buckets_space_[key].push_back(value);
    182             break;
    183         case kHash:
    184         {
    185             // That means we have to check for the hash table for the presence of a key
    186             buckets_space_[key].push_back(value);
    187             break;
    188         }
    189         }
    190     }
    191 
    192     /** Add a set of features to the table
    193      * @param dataset the values to store
    194      */
    195     void add(Matrix<ElementType> dataset)
    196     {
    197 #if USE_UNORDERED_MAP
    198         buckets_space_.rehash((buckets_space_.size() + dataset.rows) * 1.2);
    199 #endif
    200         // Add the features to the table
    201         for (unsigned int i = 0; i < dataset.rows; ++i) add(i, dataset[i]);
    202         // Now that the table is full, optimize it for speed/space
    203         optimize();
    204     }
    205 
    206     /** Get a bucket given the key
    207      * @param key
    208      * @return
    209      */
    210     inline const Bucket* getBucketFromKey(BucketKey key) const
    211     {
    212         // Generate other buckets
    213         switch (speed_level_) {
    214         case kArray:
    215             // That means we get the buckets from an array
    216             return &buckets_speed_[key];
    217             break;
    218         case kBitsetHash:
    219             // That means we can check the bitset for the presence of a key
    220             if (key_bitset_.test(key)) return &buckets_space_.find(key)->second;
    221             else return 0;
    222             break;
    223         case kHash:
    224         {
    225             // That means we have to check for the hash table for the presence of a key
    226             BucketsSpace::const_iterator bucket_it, bucket_end = buckets_space_.end();
    227             bucket_it = buckets_space_.find(key);
    228             // Stop here if that bucket does not exist
    229             if (bucket_it == bucket_end) return 0;
    230             else return &bucket_it->second;
    231             break;
    232         }
    233         }
    234         return 0;
    235     }
    236 
    237     /** Compute the sub-signature of a feature
    238      */
    239     size_t getKey(const ElementType* /*feature*/) const
    240     {
    241         std::cerr << "LSH is not implemented for that type" << std::endl;
    242         assert(0);
    243         return 1;
    244     }
    245 
    246     /** Get statistics about the table
    247      * @return
    248      */
    249     LshStats getStats() const;
    250 
    251 private:
    252     /** defines the speed fo the implementation
    253      * kArray uses a vector for storing data
    254      * kBitsetHash uses a hash map but checks for the validity of a key with a bitset
    255      * kHash uses a hash map only
    256      */
    257     enum SpeedLevel
    258     {
    259         kArray, kBitsetHash, kHash
    260     };
    261 
    262     /** Initialize some variables
    263      */
    264     void initialize(size_t key_size)
    265     {
    266         const size_t key_size_lower_bound = 1;
    267         //a value (size_t(1) << key_size) must fit the size_t type so key_size has to be strictly less than size of size_t
    268         const size_t key_size_upper_bound = std::min(sizeof(BucketKey) * CHAR_BIT + 1, sizeof(size_t) * CHAR_BIT);
    269         if (key_size < key_size_lower_bound || key_size >= key_size_upper_bound)
    270         {
    271             CV_Error(cv::Error::StsBadArg, cv::format("Invalid key_size (=%d). Valid values for your system are %d <= key_size < %d.", (int)key_size, (int)key_size_lower_bound, (int)key_size_upper_bound));
    272         }
    273 
    274         speed_level_ = kHash;
    275         key_size_ = (unsigned)key_size;
    276     }
    277 
    278     /** Optimize the table for speed/space
    279      */
    280     void optimize()
    281     {
    282         // If we are already using the fast storage, no need to do anything
    283         if (speed_level_ == kArray) return;
    284 
    285         // Use an array if it will be more than half full
    286         if (buckets_space_.size() > ((size_t(1) << key_size_) / 2)) {
    287             speed_level_ = kArray;
    288             // Fill the array version of it
    289             buckets_speed_.resize(size_t(1) << key_size_);
    290             for (BucketsSpace::const_iterator key_bucket = buckets_space_.begin(); key_bucket != buckets_space_.end(); ++key_bucket) buckets_speed_[key_bucket->first] = key_bucket->second;
    291 
    292             // Empty the hash table
    293             buckets_space_.clear();
    294             return;
    295         }
    296 
    297         // If the bitset is going to use less than 10% of the RAM of the hash map (at least 1 size_t for the key and two
    298         // for the vector) or less than 512MB (key_size_ <= 30)
    299         if (((std::max(buckets_space_.size(), buckets_speed_.size()) * CHAR_BIT * 3 * sizeof(BucketKey)) / 10
    300              >= (size_t(1) << key_size_)) || (key_size_ <= 32)) {
    301             speed_level_ = kBitsetHash;
    302             key_bitset_.resize(size_t(1) << key_size_);
    303             key_bitset_.reset();
    304             // Try with the BucketsSpace
    305             for (BucketsSpace::const_iterator key_bucket = buckets_space_.begin(); key_bucket != buckets_space_.end(); ++key_bucket) key_bitset_.set(key_bucket->first);
    306         }
    307         else {
    308             speed_level_ = kHash;
    309             key_bitset_.clear();
    310         }
    311     }
    312 
    313     /** The vector of all the buckets if they are held for speed
    314      */
    315     BucketsSpeed buckets_speed_;
    316 
    317     /** The hash table of all the buckets in case we cannot use the speed version
    318      */
    319     BucketsSpace buckets_space_;
    320 
    321     /** What is used to store the data */
    322     SpeedLevel speed_level_;
    323 
    324     /** If the subkey is small enough, it will keep track of which subkeys are set through that bitset
    325      * That is just a speedup so that we don't look in the hash table (which can be mush slower that checking a bitset)
    326      */
    327     DynamicBitset key_bitset_;
    328 
    329     /** The size of the sub-signature in bits
    330      */
    331     unsigned int key_size_;
    332 
    333     // Members only used for the unsigned char specialization
    334     /** The mask to apply to a feature to get the hash key
    335      * Only used in the unsigned char case
    336      */
    337     std::vector<size_t> mask_;
    338 };
    339 
    340 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    341 // Specialization for unsigned char
    342 
    343 template<>
    344 inline LshTable<unsigned char>::LshTable(unsigned int feature_size, unsigned int subsignature_size)
    345 {
    346     initialize(subsignature_size);
    347     // Allocate the mask
    348     mask_ = std::vector<size_t>((size_t)ceil((float)(feature_size * sizeof(char)) / (float)sizeof(size_t)), 0);
    349 
    350     // A bit brutal but fast to code
    351     std::vector<size_t> indices(feature_size * CHAR_BIT);
    352     for (size_t i = 0; i < feature_size * CHAR_BIT; ++i) indices[i] = i;
    353     std::random_shuffle(indices.begin(), indices.end());
    354 
    355     // Generate a random set of order of subsignature_size_ bits
    356     for (unsigned int i = 0; i < key_size_; ++i) {
    357         size_t index = indices[i];
    358 
    359         // Set that bit in the mask
    360         size_t divisor = CHAR_BIT * sizeof(size_t);
    361         size_t idx = index / divisor; //pick the right size_t index
    362         mask_[idx] |= size_t(1) << (index % divisor); //use modulo to find the bit offset
    363     }
    364 
    365     // Set to 1 if you want to display the mask for debug
    366 #if 0
    367     {
    368         size_t bcount = 0;
    369         BOOST_FOREACH(size_t mask_block, mask_){
    370             out << std::setw(sizeof(size_t) * CHAR_BIT / 4) << std::setfill('0') << std::hex << mask_block
    371                 << std::endl;
    372             bcount += __builtin_popcountll(mask_block);
    373         }
    374         out << "bit count : " << std::dec << bcount << std::endl;
    375         out << "mask size : " << mask_.size() << std::endl;
    376         return out;
    377     }
    378 #endif
    379 }
    380 
    381 /** Return the Subsignature of a feature
    382  * @param feature the feature to analyze
    383  */
    384 template<>
    385 inline size_t LshTable<unsigned char>::getKey(const unsigned char* feature) const
    386 {
    387     // no need to check if T is dividable by sizeof(size_t) like in the Hamming
    388     // distance computation as we have a mask
    389     const size_t* feature_block_ptr = reinterpret_cast<const size_t*> ((const void*)feature);
    390 
    391     // Figure out the subsignature of the feature
    392     // Given the feature ABCDEF, and the mask 001011, the output will be
    393     // 000CEF
    394     size_t subsignature = 0;
    395     size_t bit_index = 1;
    396 
    397     for (std::vector<size_t>::const_iterator pmask_block = mask_.begin(); pmask_block != mask_.end(); ++pmask_block) {
    398         // get the mask and signature blocks
    399         size_t feature_block = *feature_block_ptr;
    400         size_t mask_block = *pmask_block;
    401         while (mask_block) {
    402             // Get the lowest set bit in the mask block
    403             size_t lowest_bit = mask_block & (-(ptrdiff_t)mask_block);
    404             // Add it to the current subsignature if necessary
    405             subsignature += (feature_block & lowest_bit) ? bit_index : 0;
    406             // Reset the bit in the mask block
    407             mask_block ^= lowest_bit;
    408             // increment the bit index for the subsignature
    409             bit_index <<= 1;
    410         }
    411         // Check the next feature block
    412         ++feature_block_ptr;
    413     }
    414     return subsignature;
    415 }
    416 
    417 template<>
    418 inline LshStats LshTable<unsigned char>::getStats() const
    419 {
    420     LshStats stats;
    421     stats.bucket_size_mean_ = 0;
    422     if ((buckets_speed_.empty()) && (buckets_space_.empty())) {
    423         stats.n_buckets_ = 0;
    424         stats.bucket_size_median_ = 0;
    425         stats.bucket_size_min_ = 0;
    426         stats.bucket_size_max_ = 0;
    427         return stats;
    428     }
    429 
    430     if (!buckets_speed_.empty()) {
    431         for (BucketsSpeed::const_iterator pbucket = buckets_speed_.begin(); pbucket != buckets_speed_.end(); ++pbucket) {
    432             stats.bucket_sizes_.push_back((lsh::FeatureIndex)pbucket->size());
    433             stats.bucket_size_mean_ += pbucket->size();
    434         }
    435         stats.bucket_size_mean_ /= buckets_speed_.size();
    436         stats.n_buckets_ = buckets_speed_.size();
    437     }
    438     else {
    439         for (BucketsSpace::const_iterator x = buckets_space_.begin(); x != buckets_space_.end(); ++x) {
    440             stats.bucket_sizes_.push_back((lsh::FeatureIndex)x->second.size());
    441             stats.bucket_size_mean_ += x->second.size();
    442         }
    443         stats.bucket_size_mean_ /= buckets_space_.size();
    444         stats.n_buckets_ = buckets_space_.size();
    445     }
    446 
    447     std::sort(stats.bucket_sizes_.begin(), stats.bucket_sizes_.end());
    448 
    449     //  BOOST_FOREACH(int size, stats.bucket_sizes_)
    450     //          std::cout << size << " ";
    451     //  std::cout << std::endl;
    452     stats.bucket_size_median_ = stats.bucket_sizes_[stats.bucket_sizes_.size() / 2];
    453     stats.bucket_size_min_ = stats.bucket_sizes_.front();
    454     stats.bucket_size_max_ = stats.bucket_sizes_.back();
    455 
    456     // TODO compute mean and std
    457     /*float mean, stddev;
    458        stats.bucket_size_mean_ = mean;
    459        stats.bucket_size_std_dev = stddev;*/
    460 
    461     // Include a histogram of the buckets
    462     unsigned int bin_start = 0;
    463     unsigned int bin_end = 20;
    464     bool is_new_bin = true;
    465     for (std::vector<unsigned int>::iterator iterator = stats.bucket_sizes_.begin(), end = stats.bucket_sizes_.end(); iterator
    466          != end; )
    467         if (*iterator < bin_end) {
    468             if (is_new_bin) {
    469                 stats.size_histogram_.push_back(std::vector<unsigned int>(3, 0));
    470                 stats.size_histogram_.back()[0] = bin_start;
    471                 stats.size_histogram_.back()[1] = bin_end - 1;
    472                 is_new_bin = false;
    473             }
    474             ++stats.size_histogram_.back()[2];
    475             ++iterator;
    476         }
    477         else {
    478             bin_start += 20;
    479             bin_end += 20;
    480             is_new_bin = true;
    481         }
    482 
    483     return stats;
    484 }
    485 
    486 // End the two namespaces
    487 }
    488 }
    489 
    490 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    491 
    492 #endif /* OPENCV_FLANN_LSH_TABLE_H_ */
    493