/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_UTIL_PRESIZED_CUCKOO_MAP_H_
#define TENSORFLOW_UTIL_PRESIZED_CUCKOO_MAP_H_

#include <algorithm>
#include <array>   // std::array (CuckooPath typedef below)
#include <memory>  // std::unique_ptr (cpq_ member below)
#include <vector>

#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/platform/macros.h"

namespace tensorflow {

// Class for efficiently storing key->value mappings when the size is
// known in advance and the keys are pre-hashed into uint64s.
// Keys should have "good enough" randomness (be spread across the
// entire 64 bit space).
//
// Important: Clients wishing to use deterministic keys must
// ensure that their keys fall in the range 0 .. (uint64max-1);
// the table uses 2^64-1 as the "not occupied" flag.
//
// Inserted keys must be unique, and there are no update
// or delete functions (until some subsequent use of this table
// requires them).
//
// Threads must synchronize their access to a PresizedCuckooMap.
//
// The cuckoo hash table is 4-way associative (each "bucket" has 4
// "slots" for key/value entries).
// Uses breadth-first-search to find
// a good cuckoo path with less data movement (see
// http://www.cs.cmu.edu/~dga/papers/cuckoo-eurosys14.pdf )

namespace presized_cuckoo_map {
// Utility function to compute (x * y) >> 64, or "multiply high".
// On x86-64, this is a single instruction, but not all platforms
// support the __uint128_t type, so we provide a generic
// implementation as well.
inline uint64 multiply_high_u64(uint64 x, uint64 y) {
#if defined(__SIZEOF_INT128__)
  return (uint64)(((__uint128_t)x * (__uint128_t)y) >> 64);
#else
  // For platforms without int128 support, do it the long way.
  // Standard 32x32->64 schoolbook multiplication of the four half-word
  // products; only the high 64 bits of the 128-bit product are kept.
  uint64 x_lo = x & 0xffffffff;
  uint64 x_hi = x >> 32;
  uint64 buckets_lo = y & 0xffffffff;
  uint64 buckets_hi = y >> 32;
  uint64 prod_hi = x_hi * buckets_hi;
  uint64 prod_lo = x_lo * buckets_lo;
  uint64 prod_mid1 = x_hi * buckets_lo;
  uint64 prod_mid2 = x_lo * buckets_hi;
  // Carry out of the low 64 bits: sum of the low halves of the two middle
  // products plus the high half of the low product, shifted down.
  uint64 carry =
      ((prod_mid1 & 0xffffffff) + (prod_mid2 & 0xffffffff) + (prod_lo >> 32)) >>
      32;
  return prod_hi + (prod_mid1 >> 32) + (prod_mid2 >> 32) + carry;
#endif
}
}  // namespace presized_cuckoo_map

template <class value>
class PresizedCuckooMap {
 public:
  // The key type is fixed as a pre-hashed key for this specialized use.
  typedef uint64 key_type;

  explicit PresizedCuckooMap(uint64 num_entries) { Clear(num_entries); }

  // Resets the table and resizes it so that num_entries insertions fit at
  // kLoadFactor occupancy. Any previously inserted entries are discarded.
  void Clear(uint64 num_entries) {
    cpq_.reset(new CuckooPathQueue());
    double n(num_entries);
    n /= kLoadFactor;
    num_buckets_ = (static_cast<uint64>(n) / kSlotsPerBucket);
    // Very small cuckoo tables don't work, because the probability
    // of having same-bucket hashes is large. We compromise for those
    // uses by having a larger static starting size.
    num_buckets_ += 32;
    Bucket empty_bucket;
    for (int i = 0; i < kSlotsPerBucket; i++) {
      empty_bucket.keys[i] = kUnusedSlot;
    }
    buckets_.clear();
    buckets_.resize(num_buckets_, empty_bucket);
  }

  // Returns false if k is already in table or if the table
  // is full; true otherwise.
  bool InsertUnique(const key_type k, const value& v) {
    uint64 tk = key_transform(k);
    uint64 b1 = fast_map_to_buckets(tk);
    uint64 b2 = fast_map_to_buckets(h2(tk));

    // Merged find and duplicate checking.
    uint64 target_bucket = 0;
    int target_slot = kNoSpace;

    for (auto bucket : {b1, b2}) {
      Bucket* bptr = &buckets_[bucket];
      for (int slot = 0; slot < kSlotsPerBucket; slot++) {
        // NOTE(review): the duplicate check compares the raw key k, while
        // InsertInternal below stores the transformed key tk. The two differ
        // only when k == kUnusedSlot, which the class contract already asks
        // clients to avoid (or to tolerate as a collision) — confirm intended.
        if (bptr->keys[slot] == k) {  // Duplicates are not allowed.
          return false;
        } else if (target_slot == kNoSpace && bptr->keys[slot] == kUnusedSlot) {
          // Remember the first free slot seen, but keep scanning both
          // candidate buckets for a duplicate before committing.
          target_bucket = bucket;
          target_slot = slot;
        }
      }
    }

    if (target_slot != kNoSpace) {
      InsertInternal(tk, v, target_bucket, target_slot);
      return true;
    }

    // Both candidate buckets are full: try to make room by moving existing
    // entries along a BFS-discovered cuckoo path.
    return CuckooInsert(tk, v, b1, b2);
  }

  // Returns true if found. Sets *out = value.
  bool Find(const key_type k, value* out) const {
    uint64 tk = key_transform(k);
    // NOTE(review): as in InsertUnique, buckets are derived from tk but the
    // in-bucket comparison uses the raw k; equivalent for all keys except
    // k == kUnusedSlot.
    return FindInBucket(k, fast_map_to_buckets(tk), out) ||
           FindInBucket(k, fast_map_to_buckets(h2(tk)), out);
  }

  // NOTE(review): this counts only the object and the path queue; the heap
  // storage held by buckets_ (num_buckets_ * sizeof(Bucket)) is not included
  // — verify whether callers account for that separately.
  int64 MemoryUsed() const {
    return sizeof(PresizedCuckooMap<value>) + sizeof(CuckooPathQueue);
  }

 private:
  static constexpr int kSlotsPerBucket = 4;

  // The load factor is chosen slightly conservatively for speed and
  // to avoid the need for a table rebuild on insertion failure.
  // 0.94 is achievable, but 0.85 is faster and keeps the code simple
  // at the cost of a small amount of memory.
  // NOTE: 0 < kLoadFactor <= 1.0
  static constexpr double kLoadFactor = 0.85;

  // Cuckoo insert: The maximum number of entries to scan should be ~400
  // (Source: Personal communication with Michael Mitzenmacher; empirical
  // experiments validate.). After trying 400 candidate locations, declare
  // the table full - it's probably full of unresolvable cycles. Less than
  // 400 reduces max occupancy; much more results in very poor performance
  // around the full point. For (2,4) a max BFS path len of 5 results in ~682
  // nodes to visit, calculated below, and is a good value.

  static constexpr uint8 kMaxBFSPathLen = 5;

  // Constants for BFS cuckoo path search:
  // The visited list must be maintained for all but the last level of search
  // in order to trace back the path. The BFS search has two roots
  // and each can go to a total depth (including the root) of 5.
  // The queue must be sized for 2 * \sum_{k=0...4}{kSlotsPerBucket^k} = 682.
  // The visited queue, however, does not need to hold the deepest level,
  // and so it is sized 2 * \sum{k=0...3}{kSlotsPerBucket^k} = 170
  static constexpr int kMaxQueueSize = 682;
  static constexpr int kVisitedListSize = 170;

  static constexpr int kNoSpace = -1;  // SpaceAvailable return
  static constexpr uint64 kUnusedSlot = ~(0ULL);

  // Buckets are organized with key_types clustered for access speed
  // and for compactness while remaining aligned.
  struct Bucket {
    key_type keys[kSlotsPerBucket];
    value values[kSlotsPerBucket];
  };

  // Insert uses the BFS optimization (search before moving) to reduce
  // the number of cache lines dirtied during search.

  struct CuckooPathEntry {
    uint64 bucket;    // Candidate bucket at this node of the path.
    int depth;        // 1 for the BFS roots; grows toward kMaxBFSPathLen.
    int parent;       // To index in the visited array.
    int parent_slot;  // Which slot in our parent did we come from? -1 == root.
                      // NOTE(review): CuckooInsert actually pushes roots with
                      // parent_slot 0, not -1 — roots are instead identified
                      // by depth == 1; confirm the comment vs. the code.
  };

  // CuckooPathQueue is a trivial circular queue for path entries.
  // The caller is responsible for not inserting more than kMaxQueueSize
  // entries. Each PresizedCuckooMap has one (heap-allocated) CuckooPathQueue
  // that it reuses across inserts.
  class CuckooPathQueue {
   public:
    CuckooPathQueue() : head_(0), tail_(0) {}

    // No overflow check by design (see class comment): the BFS node count is
    // bounded by kMaxQueueSize.
    void push_back(CuckooPathEntry e) {
      queue_[tail_] = e;
      tail_ = (tail_ + 1) % kMaxQueueSize;
    }

    CuckooPathEntry pop_front() {
      CuckooPathEntry& e = queue_[head_];
      head_ = (head_ + 1) % kMaxQueueSize;
      return e;
    }

    bool empty() const { return head_ == tail_; }

    bool full() const { return ((tail_ + 1) % kMaxQueueSize) == head_; }

    void reset() { head_ = tail_ = 0; }

   private:
    CuckooPathEntry queue_[kMaxQueueSize];
    int head_;
    int tail_;
  };

  typedef std::array<CuckooPathEntry, kMaxBFSPathLen> CuckooPath;

  // Callers are expected to have pre-hashed the keys into a uint64
  // and are expected to be able to handle (a very low rate) of
  // collisions, OR must ensure that their keys are always in
  // the range 0 - (uint64max - 1). This transforms 'not found flag'
  // keys into something else.
  // (Identity for every key except kUnusedSlot, which maps to 0.)
  inline uint64 key_transform(const key_type k) const {
    return k + (k == kUnusedSlot);
  }

  // h2 performs a very quick mix of h to generate the second bucket hash.
  // Assumes there is plenty of remaining entropy in the initial h.
  inline uint64 h2(uint64 h) const {
    const uint64 m = 0xc6a4a7935bd1e995;  // Multiplicative mixing constant.
    return m * ((h >> 32) | (h << 32));   // Swap halves, then multiply.
  }

  // alt_bucket identifies the "other" bucket for key k, where
  // other is "the one that isn't bucket b"
  inline uint64 alt_bucket(key_type k, uint64 b) const {
    if (fast_map_to_buckets(k) != b) {
      return fast_map_to_buckets(k);
    }
    return fast_map_to_buckets(h2(k));
  }

  // Writes (k, v) into slot 'slot' of bucket 'b'. Caller guarantees the
  // slot is free (or being deliberately overwritten on a cuckoo path).
  inline void InsertInternal(key_type k, const value& v, uint64 b, int slot) {
    Bucket* bptr = &buckets_[b];
    bptr->keys[slot] = k;
    bptr->values[slot] = v;
  }

  // For the associative cuckoo table, check all of the slots in
  // the bucket to see if the key is present.
  bool FindInBucket(key_type k, uint64 b, value* out) const {
    const Bucket& bref = buckets_[b];
    for (int i = 0; i < kSlotsPerBucket; i++) {
      if (bref.keys[i] == k) {
        *out = bref.values[i];
        return true;
      }
    }
    return false;
  }

  // returns either kNoSpace or the index of an
  // available slot (0 <= slot < kSlotsPerBucket)
  inline int SpaceAvailable(uint64 bucket) const {
    const Bucket& bref = buckets_[bucket];
    for (int i = 0; i < kSlotsPerBucket; i++) {
      if (bref.keys[i] == kUnusedSlot) {
        return i;
      }
    }
    return kNoSpace;
  }

  // Copies one key/value pair between slots; used while unwinding a cuckoo
  // path ("copy" rather than "swap": the destination slot is always free).
  inline void CopyItem(uint64 src_bucket, int src_slot, uint64 dst_bucket,
                       int dst_slot) {
    Bucket& src_ref = buckets_[src_bucket];
    Bucket& dst_ref = buckets_[dst_bucket];
    dst_ref.keys[dst_slot] = src_ref.keys[src_slot];
    dst_ref.values[dst_slot] = src_ref.values[src_slot];
  }

  // BFS for a chain of moves that frees a slot in one of k's two candidate
  // buckets; on success, replays the chain backwards via visited_ and inserts
  // (k, v). Returns false only if no path of depth <= kMaxBFSPathLen exists
  // (table effectively full).
  bool CuckooInsert(key_type k, const value& v, uint64 b1, uint64 b2) {
    int visited_end = 0;
    cpq_->reset();

    cpq_->push_back({b1, 1, 0, 0});  // Note depth starts at 1.
    cpq_->push_back({b2, 1, 0, 0});

    while (!cpq_->empty()) {
      CuckooPathEntry e = cpq_->pop_front();
      int free_slot;
      free_slot = SpaceAvailable(e.bucket);
      if (free_slot != kNoSpace) {
        // Found a bucket with room: walk the path back to a root, shifting
        // each entry one step toward the free slot.
        while (e.depth > 1) {
          // "copy" instead of "swap" because one entry is always zero.
          // After, write target key/value over top of last copied entry.
          CuckooPathEntry parent = visited_[e.parent];
          CopyItem(parent.bucket, e.parent_slot, e.bucket, free_slot);
          free_slot = e.parent_slot;
          e = parent;
        }
        InsertInternal(k, v, e.bucket, free_slot);
        return true;
      } else {
        if (e.depth < (kMaxBFSPathLen)) {
          auto parent_index = visited_end;
          visited_[visited_end] = e;
          visited_end++;
          // Don't always start with the same slot, to even out the path depth.
          int start_slot = (k + e.bucket) % kSlotsPerBucket;
          const Bucket& bref = buckets_[e.bucket];
          for (int i = 0; i < kSlotsPerBucket; i++) {
            int slot = (start_slot + i) % kSlotsPerBucket;
            // The keys here are already transformed (only transformed keys
            // are ever stored), so alt_bucket sees the same hashes as insert.
            uint64 next_bucket = alt_bucket(bref.keys[slot], e.bucket);
            // Optimization: Avoid single-step cycles (from e, don't
            // add a child node that is actually e's parent).
            // NOTE(review): for depth-1 roots (parent == 0) visited_[0] has
            // not been written yet this call, so this comparison reads a
            // stale/indeterminate bucket id; it only affects whether a
            // redundant child is enqueued, not correctness of the path —
            // confirm this is the intended trade-off.
            uint64 e_parent_bucket = visited_[e.parent].bucket;
            if (next_bucket != e_parent_bucket) {
              cpq_->push_back({next_bucket, e.depth + 1, parent_index, slot});
            }
          }
        }
      }
    }

    LOG(WARNING) << "Cuckoo path finding failed: Table too small?";
    return false;
  }

  inline uint64 fast_map_to_buckets(uint64 x) const {
    // Map x (uniform in 2^64) to the range [0, num_buckets_ -1]
    // using Lemire's alternative to modulo reduction:
    // http://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
    // Instead of x % N, use (x * N) >> 64.
    return presized_cuckoo_map::multiply_high_u64(x, num_buckets_);
  }

  // Set upon initialization: num_entries / kLoadFactor / kSlotsPerBucket.
  uint64 num_buckets_;
  std::vector<Bucket> buckets_;

  // Scratch state for CuckooInsert, reused across calls (class is not
  // thread-safe; see file header).
  std::unique_ptr<CuckooPathQueue> cpq_;
  CuckooPathEntry visited_[kVisitedListSize];

  TF_DISALLOW_COPY_AND_ASSIGN(PresizedCuckooMap);
};

}  // namespace tensorflow

#endif  // TENSORFLOW_UTIL_PRESIZED_CUCKOO_MAP_H_