1 /*********************************************************************** 2 * Software License Agreement (BSD License) 3 * 4 * Copyright 2008-2009 Marius Muja (mariusm (at) cs.ubc.ca). All rights reserved. 5 * Copyright 2008-2009 David G. Lowe (lowe (at) cs.ubc.ca). All rights reserved. 6 * 7 * THE BSD LICENSE 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 13 * 1. Redistributions of source code must retain the above copyright 14 * notice, this list of conditions and the following disclaimer. 15 * 2. Redistributions in binary form must reproduce the above copyright 16 * notice, this list of conditions and the following disclaimer in the 17 * documentation and/or other materials provided with the distribution. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 20 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 21 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 22 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 23 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 24 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 28 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 *************************************************************************/ 30 31 /*********************************************************************** 32 * Author: Vincent Rabaud 33 *************************************************************************/ 34 35 #ifndef OPENCV_FLANN_LSH_TABLE_H_ 36 #define OPENCV_FLANN_LSH_TABLE_H_ 37 38 #include <algorithm> 39 #include <iostream> 40 #include <iomanip> 41 #include <limits.h> 42 // TODO as soon as we use C++0x, use the code in USE_UNORDERED_MAP 43 #ifdef __GXX_EXPERIMENTAL_CXX0X__ 44 # define USE_UNORDERED_MAP 1 45 #else 46 # define USE_UNORDERED_MAP 0 47 #endif 48 #if USE_UNORDERED_MAP 49 #include <unordered_map> 50 #else 51 #include <map> 52 #endif 53 #include <math.h> 54 #include <stddef.h> 55 56 #include "dynamic_bitset.h" 57 #include "matrix.h" 58 59 namespace cvflann 60 { 61 62 namespace lsh 63 { 64 65 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 66 67 /** What is stored in an LSH bucket 68 */ 69 typedef uint32_t FeatureIndex; 70 /** The id from which we can get a bucket back in an LSH table 71 */ 72 typedef unsigned int BucketKey; 73 74 /** A bucket in an LSH table 75 */ 76 typedef std::vector<FeatureIndex> Bucket; 77 78 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 79 80 /** POD for stats about an LSH table 81 */ 82 struct LshStats 83 { 84 std::vector<unsigned int> bucket_sizes_; 85 size_t n_buckets_; 86 size_t bucket_size_mean_; 87 size_t bucket_size_median_; 88 size_t bucket_size_min_; 89 size_t bucket_size_max_; 90 size_t bucket_size_std_dev; 91 /** Each contained vector contains three value: beginning/end for interval, number of elements in the bin 92 */ 93 std::vector<std::vector<unsigned int> > size_histogram_; 94 }; 95 96 /** Overload the << operator for LshStats 97 * @param out the streams 98 * @param stats the stats to display 99 * @return the streams 100 */ 101 inline std::ostream& operator <<(std::ostream& out, const LshStats& stats) 102 { 103 int w = 20; 104 out << "Lsh Table Stats:\n" << std::setw(w) << std::setiosflags(std::ios::right) << "N buckets : " 105 << stats.n_buckets_ << "\n" << std::setw(w) << std::setiosflags(std::ios::right) << "mean size : " 106 << std::setiosflags(std::ios::left) << stats.bucket_size_mean_ << "\n" << std::setw(w) 107 << std::setiosflags(std::ios::right) << "median size : " << stats.bucket_size_median_ << "\n" << std::setw(w) 108 << std::setiosflags(std::ios::right) << "min size : " << std::setiosflags(std::ios::left) 109 << stats.bucket_size_min_ << "\n" << std::setw(w) << std::setiosflags(std::ios::right) << "max size : " 110 << std::setiosflags(std::ios::left) << stats.bucket_size_max_; 111 112 // Display the histogram 113 out << std::endl << std::setw(w) << std::setiosflags(std::ios::right) << "histogram : " 114 << std::setiosflags(std::ios::left); 115 for (std::vector<std::vector<unsigned int> >::const_iterator iterator = stats.size_histogram_.begin(), end = 116 stats.size_histogram_.end(); iterator != end; ++iterator) out << (*iterator)[0] << "-" << (*iterator)[1] << ": " << (*iterator)[2] << ", "; 117 118 return out; 119 } 120 121 122 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 123 124 /** Lsh hash table. As its key is a sub-feature, and as usually 125 * the size of it is pretty small, we keep it as a continuous memory array. 126 * The value is an index in the corpus of features (we keep it as an unsigned 127 * int for pure memory reasons, it could be a size_t) 128 */ 129 template<typename ElementType> 130 class LshTable 131 { 132 public: 133 /** A container of all the feature indices. Optimized for space 134 */ 135 #if USE_UNORDERED_MAP 136 typedef std::unordered_map<BucketKey, Bucket> BucketsSpace; 137 #else 138 typedef std::map<BucketKey, Bucket> BucketsSpace; 139 #endif 140 141 /** A container of all the feature indices. Optimized for speed 142 */ 143 typedef std::vector<Bucket> BucketsSpeed; 144 145 /** Default constructor 146 */ 147 LshTable() 148 { 149 } 150 151 /** Default constructor 152 * Create the mask and allocate the memory 153 * @param feature_size is the size of the feature (considered as a ElementType[]) 154 * @param key_size is the number of bits that are turned on in the feature 155 */ 156 LshTable(unsigned int feature_size, unsigned int key_size) 157 { 158 (void)feature_size; 159 (void)key_size; 160 std::cerr << "LSH is not implemented for that type" << std::endl; 161 assert(0); 162 } 163 164 /** Add a feature to the table 165 * @param value the value to store for that feature 166 * @param feature the feature itself 167 */ 168 void add(unsigned int value, const ElementType* feature) 169 { 170 // Add the value to the corresponding bucket 171 BucketKey key = (lsh::BucketKey)getKey(feature); 172 173 switch (speed_level_) { 174 case kArray: 175 // That means we get the buckets from an array 176 buckets_speed_[key].push_back(value); 177 break; 178 case kBitsetHash: 179 // That means we can check the bitset for the presence of a key 180 key_bitset_.set(key); 181 buckets_space_[key].push_back(value); 182 break; 183 case kHash: 184 { 185 // That means we have to check for the hash table for the presence of a key 186 buckets_space_[key].push_back(value); 187 break; 188 } 189 } 190 } 191 192 /** Add a set of features to the table 193 * @param dataset the values to store 194 */ 195 void add(Matrix<ElementType> dataset) 196 { 197 #if USE_UNORDERED_MAP 198 buckets_space_.rehash((buckets_space_.size() + dataset.rows) * 1.2); 199 #endif 200 // Add the features to the table 201 for (unsigned int i = 0; i < dataset.rows; ++i) add(i, dataset[i]); 202 // Now that the table is full, optimize it for speed/space 203 optimize(); 204 } 205 206 /** Get a bucket given the key 207 * @param key 208 * @return 209 */ 210 inline const Bucket* getBucketFromKey(BucketKey key) const 211 { 212 // Generate other buckets 213 switch (speed_level_) { 214 case kArray: 215 // That means we get the buckets from an array 216 return &buckets_speed_[key]; 217 break; 218 case kBitsetHash: 219 // That means we can check the bitset for the presence of a key 220 if (key_bitset_.test(key)) return &buckets_space_.find(key)->second; 221 else return 0; 222 break; 223 case kHash: 224 { 225 // That means we have to check for the hash table for the presence of a key 226 BucketsSpace::const_iterator bucket_it, bucket_end = buckets_space_.end(); 227 bucket_it = buckets_space_.find(key); 228 // Stop here if that bucket does not exist 229 if (bucket_it == bucket_end) return 0; 230 else return &bucket_it->second; 231 break; 232 } 233 } 234 return 0; 235 } 236 237 /** Compute the sub-signature of a feature 238 */ 239 size_t getKey(const ElementType* /*feature*/) const 240 { 241 std::cerr << "LSH is not implemented for that type" << std::endl; 242 assert(0); 243 return 1; 244 } 245 246 /** Get statistics about the table 247 * @return 248 */ 249 LshStats getStats() const; 250 251 private: 252 /** defines the speed fo the implementation 253 * kArray uses a vector for storing data 254 * kBitsetHash uses a hash map but checks for the validity of a key with a bitset 255 * kHash uses a hash map only 256 */ 257 enum SpeedLevel 258 { 259 kArray, kBitsetHash, kHash 260 }; 261 262 /** Initialize some variables 263 */ 264 void initialize(size_t key_size) 265 { 266 const size_t key_size_lower_bound = 1; 267 //a value (size_t(1) << key_size) must fit the size_t type so key_size has to be strictly less than size of size_t 268 const size_t key_size_upper_bound = std::min(sizeof(BucketKey) * CHAR_BIT + 1, sizeof(size_t) * CHAR_BIT); 269 if (key_size < key_size_lower_bound || key_size >= key_size_upper_bound) 270 { 271 CV_Error(cv::Error::StsBadArg, cv::format("Invalid key_size (=%d). Valid values for your system are %d <= key_size < %d.", (int)key_size, (int)key_size_lower_bound, (int)key_size_upper_bound)); 272 } 273 274 speed_level_ = kHash; 275 key_size_ = (unsigned)key_size; 276 } 277 278 /** Optimize the table for speed/space 279 */ 280 void optimize() 281 { 282 // If we are already using the fast storage, no need to do anything 283 if (speed_level_ == kArray) return; 284 285 // Use an array if it will be more than half full 286 if (buckets_space_.size() > ((size_t(1) << key_size_) / 2)) { 287 speed_level_ = kArray; 288 // Fill the array version of it 289 buckets_speed_.resize(size_t(1) << key_size_); 290 for (BucketsSpace::const_iterator key_bucket = buckets_space_.begin(); key_bucket != buckets_space_.end(); ++key_bucket) buckets_speed_[key_bucket->first] = key_bucket->second; 291 292 // Empty the hash table 293 buckets_space_.clear(); 294 return; 295 } 296 297 // If the bitset is going to use less than 10% of the RAM of the hash map (at least 1 size_t for the key and two 298 // for the vector) or less than 512MB (key_size_ <= 30) 299 if (((std::max(buckets_space_.size(), buckets_speed_.size()) * CHAR_BIT * 3 * sizeof(BucketKey)) / 10 300 >= (size_t(1) << key_size_)) || (key_size_ <= 32)) { 301 speed_level_ = kBitsetHash; 302 key_bitset_.resize(size_t(1) << key_size_); 303 key_bitset_.reset(); 304 // Try with the BucketsSpace 305 for (BucketsSpace::const_iterator key_bucket = buckets_space_.begin(); key_bucket != buckets_space_.end(); ++key_bucket) key_bitset_.set(key_bucket->first); 306 } 307 else { 308 speed_level_ = kHash; 309 key_bitset_.clear(); 310 } 311 } 312 313 /** The vector of all the buckets if they are held for speed 314 */ 315 BucketsSpeed buckets_speed_; 316 317 /** The hash table of all the buckets in case we cannot use the speed version 318 */ 319 BucketsSpace buckets_space_; 320 321 /** What is used to store the data */ 322 SpeedLevel speed_level_; 323 324 /** If the subkey is small enough, it will keep track of which subkeys are set through that bitset 325 * That is just a speedup so that we don't look in the hash table (which can be mush slower that checking a bitset) 326 */ 327 DynamicBitset key_bitset_; 328 329 /** The size of the sub-signature in bits 330 */ 331 unsigned int key_size_; 332 333 // Members only used for the unsigned char specialization 334 /** The mask to apply to a feature to get the hash key 335 * Only used in the unsigned char case 336 */ 337 std::vector<size_t> mask_; 338 }; 339 340 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 341 // Specialization for unsigned char 342 343 template<> 344 inline LshTable<unsigned char>::LshTable(unsigned int feature_size, unsigned int subsignature_size) 345 { 346 initialize(subsignature_size); 347 // Allocate the mask 348 mask_ = std::vector<size_t>((size_t)ceil((float)(feature_size * sizeof(char)) / (float)sizeof(size_t)), 0); 349 350 // A bit brutal but fast to code 351 std::vector<size_t> indices(feature_size * CHAR_BIT); 352 for (size_t i = 0; i < feature_size * CHAR_BIT; ++i) indices[i] = i; 353 std::random_shuffle(indices.begin(), indices.end()); 354 355 // Generate a random set of order of subsignature_size_ bits 356 for (unsigned int i = 0; i < key_size_; ++i) { 357 size_t index = indices[i]; 358 359 // Set that bit in the mask 360 size_t divisor = CHAR_BIT * sizeof(size_t); 361 size_t idx = index / divisor; //pick the right size_t index 362 mask_[idx] |= size_t(1) << (index % divisor); //use modulo to find the bit offset 363 } 364 365 // Set to 1 if you want to display the mask for debug 366 #if 0 367 { 368 size_t bcount = 0; 369 BOOST_FOREACH(size_t mask_block, mask_){ 370 out << std::setw(sizeof(size_t) * CHAR_BIT / 4) << std::setfill('0') << std::hex << mask_block 371 << std::endl; 372 bcount += __builtin_popcountll(mask_block); 373 } 374 out << "bit count : " << std::dec << bcount << std::endl; 375 out << "mask size : " << mask_.size() << std::endl; 376 return out; 377 } 378 #endif 379 } 380 381 /** Return the Subsignature of a feature 382 * @param feature the feature to analyze 383 */ 384 template<> 385 inline size_t LshTable<unsigned char>::getKey(const unsigned char* feature) const 386 { 387 // no need to check if T is dividable by sizeof(size_t) like in the Hamming 388 // distance computation as we have a mask 389 const size_t* feature_block_ptr = reinterpret_cast<const size_t*> ((const void*)feature); 390 391 // Figure out the subsignature of the feature 392 // Given the feature ABCDEF, and the mask 001011, the output will be 393 // 000CEF 394 size_t subsignature = 0; 395 size_t bit_index = 1; 396 397 for (std::vector<size_t>::const_iterator pmask_block = mask_.begin(); pmask_block != mask_.end(); ++pmask_block) { 398 // get the mask and signature blocks 399 size_t feature_block = *feature_block_ptr; 400 size_t mask_block = *pmask_block; 401 while (mask_block) { 402 // Get the lowest set bit in the mask block 403 size_t lowest_bit = mask_block & (-(ptrdiff_t)mask_block); 404 // Add it to the current subsignature if necessary 405 subsignature += (feature_block & lowest_bit) ? bit_index : 0; 406 // Reset the bit in the mask block 407 mask_block ^= lowest_bit; 408 // increment the bit index for the subsignature 409 bit_index <<= 1; 410 } 411 // Check the next feature block 412 ++feature_block_ptr; 413 } 414 return subsignature; 415 } 416 417 template<> 418 inline LshStats LshTable<unsigned char>::getStats() const 419 { 420 LshStats stats; 421 stats.bucket_size_mean_ = 0; 422 if ((buckets_speed_.empty()) && (buckets_space_.empty())) { 423 stats.n_buckets_ = 0; 424 stats.bucket_size_median_ = 0; 425 stats.bucket_size_min_ = 0; 426 stats.bucket_size_max_ = 0; 427 return stats; 428 } 429 430 if (!buckets_speed_.empty()) { 431 for (BucketsSpeed::const_iterator pbucket = buckets_speed_.begin(); pbucket != buckets_speed_.end(); ++pbucket) { 432 stats.bucket_sizes_.push_back((lsh::FeatureIndex)pbucket->size()); 433 stats.bucket_size_mean_ += pbucket->size(); 434 } 435 stats.bucket_size_mean_ /= buckets_speed_.size(); 436 stats.n_buckets_ = buckets_speed_.size(); 437 } 438 else { 439 for (BucketsSpace::const_iterator x = buckets_space_.begin(); x != buckets_space_.end(); ++x) { 440 stats.bucket_sizes_.push_back((lsh::FeatureIndex)x->second.size()); 441 stats.bucket_size_mean_ += x->second.size(); 442 } 443 stats.bucket_size_mean_ /= buckets_space_.size(); 444 stats.n_buckets_ = buckets_space_.size(); 445 } 446 447 std::sort(stats.bucket_sizes_.begin(), stats.bucket_sizes_.end()); 448 449 // BOOST_FOREACH(int size, stats.bucket_sizes_) 450 // std::cout << size << " "; 451 // std::cout << std::endl; 452 stats.bucket_size_median_ = stats.bucket_sizes_[stats.bucket_sizes_.size() / 2]; 453 stats.bucket_size_min_ = stats.bucket_sizes_.front(); 454 stats.bucket_size_max_ = stats.bucket_sizes_.back(); 455 456 // TODO compute mean and std 457 /*float mean, stddev; 458 stats.bucket_size_mean_ = mean; 459 stats.bucket_size_std_dev = stddev;*/ 460 461 // Include a histogram of the buckets 462 unsigned int bin_start = 0; 463 unsigned int bin_end = 20; 464 bool is_new_bin = true; 465 for (std::vector<unsigned int>::iterator iterator = stats.bucket_sizes_.begin(), end = stats.bucket_sizes_.end(); iterator 466 != end; ) 467 if (*iterator < bin_end) { 468 if (is_new_bin) { 469 stats.size_histogram_.push_back(std::vector<unsigned int>(3, 0)); 470 stats.size_histogram_.back()[0] = bin_start; 471 stats.size_histogram_.back()[1] = bin_end - 1; 472 is_new_bin = false; 473 } 474 ++stats.size_histogram_.back()[2]; 475 ++iterator; 476 } 477 else { 478 bin_start += 20; 479 bin_end += 20; 480 is_new_bin = true; 481 } 482 483 return stats; 484 } 485 486 // End the two namespaces 487 } 488 } 489 490 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// 491 492 #endif /* OPENCV_FLANN_LSH_TABLE_H_ */ 493