/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_UTIL_PRESIZED_CUCKOO_MAP_H_
#define TENSORFLOW_UTIL_PRESIZED_CUCKOO_MAP_H_

#include <algorithm>
#include <array>
#include <memory>
#include <vector>

#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h"

namespace tensorflow {

// Class for efficiently storing key->value mappings when the size is
// known in advance and the keys are pre-hashed into uint64s.
// Keys should have "good enough" randomness (be spread across the
// entire 64 bit space).
//
// Important:  Clients wishing to use deterministic keys must
// ensure that their keys fall in the range 0 .. (uint64max-1);
// the table uses 2^64-1 as the "not occupied" flag.
//
// Inserted keys must be unique, and there are no update
// or delete functions (until some subsequent use of this table
// requires them).
//
// Threads must synchronize their access to a PresizedCuckooMap.
//
// The cuckoo hash table is 4-way associative (each "bucket" has 4
// "slots" for key/value entries).  It uses breadth-first search to find
// a good cuckoo path with less data movement (see
// http://www.cs.cmu.edu/~dga/papers/cuckoo-eurosys14.pdf).

namespace presized_cuckoo_map {
// Utility function to compute (x * y) >> 64, or "multiply high".
// On x86-64, this is a single instruction, but not all platforms
// support the __uint128_t type, so we provide a generic
// implementation as well.
inline uint64 multiply_high_u64(uint64 x, uint64 y) {
#if defined(__SIZEOF_INT128__)
  return static_cast<uint64>((static_cast<__uint128_t>(x) * y) >> 64);
#else
  // For platforms without int128 support, do it the long way.
  uint64 x_lo = x & 0xffffffff;
  uint64 x_hi = x >> 32;
  uint64 buckets_lo = y & 0xffffffff;
  uint64 buckets_hi = y >> 32;
  uint64 prod_hi = x_hi * buckets_hi;
  uint64 prod_lo = x_lo * buckets_lo;
  uint64 prod_mid1 = x_hi * buckets_lo;
  uint64 prod_mid2 = x_lo * buckets_hi;
  uint64 carry =
      ((prod_mid1 & 0xffffffff) + (prod_mid2 & 0xffffffff) + (prod_lo >> 32)) >>
      32;
  return prod_hi + (prod_mid1 >> 32) + (prod_mid2 >> 32) + carry;
#endif
}
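
// A worked check of the identity (a sketch, not part of the API):
// multiply_high_u64 computes floor((x * y) / 2^64), so for example
//   multiply_high_u64(1ULL << 63, 10) == 5   // (2^63 * 10) >> 64 == 5
// i.e. x is treated as the fraction x / 2^64 of the range [0, y).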
}  // namespace presized_cuckoo_map

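// A minimal usage sketch (illustrative, with hypothetical values; keys are
// assumed to be pre-hashed with a well-mixed 64-bit hash such as
// tensorflow::Fingerprint64):
//
//   PresizedCuckooMap<int> map(1000);           // expect up to ~1000 inserts
//   uint64 key = Fingerprint64("some_string");  // pre-hashed key
//   if (!map.InsertUnique(key, 42)) { /* duplicate key, or table full */ }
//   int found;
//   if (map.Find(key, &found)) { /* found == 42 */ }
//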
template <class value>
class PresizedCuckooMap {
 public:
  // The key type is fixed as a pre-hashed key for this specialized use.
  typedef uint64 key_type;

  explicit PresizedCuckooMap(uint64 num_entries) { Clear(num_entries); }

  void Clear(uint64 num_entries) {
    cpq_.reset(new CuckooPathQueue());
    double n(num_entries);
    n /= kLoadFactor;
    num_buckets_ = (static_cast<uint64>(n) / kSlotsPerBucket);
    // Very small cuckoo tables don't work, because the probability
    // of having same-bucket hashes is large.  We compromise for those
    // uses by having a larger static starting size.
    num_buckets_ += 32;
    Bucket empty_bucket;
    for (int i = 0; i < kSlotsPerBucket; i++) {
      empty_bucket.keys[i] = kUnusedSlot;
    }
    buckets_.clear();
    buckets_.resize(num_buckets_, empty_bucket);
  }
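
  // Worked sizing example (illustrative numbers only): for
  // num_entries == 1000, Clear computes floor(1000 / 0.85) = 1176,
  // then 1176 / 4 = 294 buckets, plus the 32-bucket floor: 326 buckets
  // and 1304 slots, so a full table runs at roughly 77% slot occupancy.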

  // Returns false if k is already in table or if the table
  // is full; true otherwise.
  bool InsertUnique(const key_type k, const value& v) {
    uint64 tk = key_transform(k);
    uint64 b1 = fast_map_to_buckets(tk);
    uint64 b2 = fast_map_to_buckets(h2(tk));

    // Merged find and duplicate checking.
    uint64 target_bucket = 0;
    int target_slot = kNoSpace;

    for (auto bucket : {b1, b2}) {
      Bucket* bptr = &buckets_[bucket];
      for (int slot = 0; slot < kSlotsPerBucket; slot++) {
        if (bptr->keys[slot] == k) {  // Duplicates are not allowed.
          return false;
        } else if (target_slot == kNoSpace && bptr->keys[slot] == kUnusedSlot) {
          target_bucket = bucket;
          target_slot = slot;
        }
      }
    }

    if (target_slot != kNoSpace) {
      InsertInternal(tk, v, target_bucket, target_slot);
      return true;
    }

    return CuckooInsert(tk, v, b1, b2);
  }

  // Returns true if found.  Sets *out = value.
  bool Find(const key_type k, value* out) const {
    uint64 tk = key_transform(k);
    return FindInBucket(k, fast_map_to_buckets(tk), out) ||
           FindInBucket(k, fast_map_to_buckets(h2(tk)), out);
  }

  // Approximate memory usage, including the heap-allocated bucket array.
  int64 MemoryUsed() const {
    return sizeof(PresizedCuckooMap<value>) + sizeof(CuckooPathQueue) +
           buckets_.capacity() * sizeof(Bucket);
  }

 private:
  static constexpr int kSlotsPerBucket = 4;

  // The load factor is chosen slightly conservatively for speed and
  // to avoid the need for a table rebuild on insertion failure.
  // 0.94 is achievable, but 0.85 is faster and keeps the code simple
  // at the cost of a small amount of memory.
  // NOTE:  0 < kLoadFactor <= 1.0
  static constexpr double kLoadFactor = 0.85;

  // Cuckoo insert:  The maximum number of entries to scan should be ~400
  // (source: personal communication with Michael Mitzenmacher; empirical
  // experiments validate this).  After trying 400 candidate locations,
  // declare the table full - it's probably full of unresolvable cycles.
  // Fewer than 400 reduces max occupancy; many more gives very poor
  // performance around the full point.  For a (2,4)-cuckoo table (2 hash
  // functions, 4 slots per bucket), a max BFS path length of 5 results in
  // ~682 nodes to visit, calculated below, and is a good value.
  static constexpr uint8 kMaxBFSPathLen = 5;

  // Constants for BFS cuckoo path search:
  // The visited list must be maintained for all but the last level of search
  // in order to trace back the path.  The BFS search has two roots
  // and each can go to a total depth (including the root) of 5.
  // The queue must be sized for 2 * \sum_{k=0...4}{kSlotsPerBucket^k} = 682.
  // The visited queue, however, does not need to hold the deepest level,
  // and so it is sized 2 * \sum_{k=0...3}{kSlotsPerBucket^k} = 170.
  static constexpr int kMaxQueueSize = 682;
  static constexpr int kVisitedListSize = 170;
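
  // Spelled out for kSlotsPerBucket == 4:
  //   kMaxQueueSize    = 2 * (1 + 4 + 16 + 64 + 256) = 2 * 341 = 682
  //   kVisitedListSize = 2 * (1 + 4 + 16 + 64)       = 2 * 85  = 170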

  static constexpr int kNoSpace = -1;  // Return value of SpaceAvailable().
  static constexpr uint64 kUnusedSlot = ~(0ULL);

  // Buckets are organized with key_types clustered for access speed
  // and for compactness while remaining aligned.
  struct Bucket {
    key_type keys[kSlotsPerBucket];
    value values[kSlotsPerBucket];
  };

  // Insert uses the BFS optimization (search before moving) to reduce
  // the number of cache lines dirtied during search.
  struct CuckooPathEntry {
    uint64 bucket;
    int depth;
    int parent;       // Index in the visited array.
    int parent_slot;  // Which slot in our parent did we come from?  -1 == root.
  };

  // CuckooPathQueue is a trivial circular queue for path entries.
  // The caller is responsible for not inserting more than kMaxQueueSize
  // entries.  Each PresizedCuckooMap has one (heap-allocated) CuckooPathQueue
  // that it reuses across inserts.
  class CuckooPathQueue {
   public:
    CuckooPathQueue() : head_(0), tail_(0) {}

    void push_back(CuckooPathEntry e) {
      queue_[tail_] = e;
      tail_ = (tail_ + 1) % kMaxQueueSize;
    }

    CuckooPathEntry pop_front() {
      CuckooPathEntry& e = queue_[head_];
      head_ = (head_ + 1) % kMaxQueueSize;
      return e;
    }

    bool empty() const { return head_ == tail_; }

    bool full() const { return ((tail_ + 1) % kMaxQueueSize) == head_; }

    void reset() { head_ = tail_ = 0; }

   private:
    CuckooPathEntry queue_[kMaxQueueSize];
    int head_;
    int tail_;
  };

  typedef std::array<CuckooPathEntry, kMaxBFSPathLen> CuckooPath;

  // Callers are expected to have pre-hashed the keys into a uint64
  // and are expected to be able to handle (a very low rate of)
  // collisions, OR must ensure that their keys are always in
  // the range 0 - (uint64max - 1).  This transforms 'not found flag'
  // keys into something else.
  inline uint64 key_transform(const key_type k) const {
    return k + (k == kUnusedSlot);
  }
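
  // For example, key_transform(5) == 5, while key_transform(kUnusedSlot)
  // wraps around to 0, so the "not occupied" sentinel can never be stored
  // as a key (at the cost of colliding with a genuine key of 0).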

  // h2 performs a very quick mix of h to generate the second bucket hash.
  // Assumes there is plenty of remaining entropy in the initial h.
  inline uint64 h2(uint64 h) const {
    const uint64 m = 0xc6a4a7935bd1e995;
    return m * ((h >> 32) | (h << 32));
  }

  // alt_bucket identifies the "other" bucket for key k, where
  // "other" is "the one that isn't bucket b".
  inline uint64 alt_bucket(key_type k, uint64 b) const {
    if (fast_map_to_buckets(k) != b) {
      return fast_map_to_buckets(k);
    }
    return fast_map_to_buckets(h2(k));
  }

  inline void InsertInternal(key_type k, const value& v, uint64 b, int slot) {
    Bucket* bptr = &buckets_[b];
    bptr->keys[slot] = k;
    bptr->values[slot] = v;
  }

  // For the associative cuckoo table, check all of the slots in
  // the bucket to see if the key is present.
  bool FindInBucket(key_type k, uint64 b, value* out) const {
    const Bucket& bref = buckets_[b];
    for (int i = 0; i < kSlotsPerBucket; i++) {
      if (bref.keys[i] == k) {
        *out = bref.values[i];
        return true;
      }
    }
    return false;
  }

  // Returns either kNoSpace or the index of an
  // available slot (0 <= slot < kSlotsPerBucket).
  inline int SpaceAvailable(uint64 bucket) const {
    const Bucket& bref = buckets_[bucket];
    for (int i = 0; i < kSlotsPerBucket; i++) {
      if (bref.keys[i] == kUnusedSlot) {
        return i;
      }
    }
    return kNoSpace;
  }

  inline void CopyItem(uint64 src_bucket, int src_slot, uint64 dst_bucket,
                       int dst_slot) {
    Bucket& src_ref = buckets_[src_bucket];
    Bucket& dst_ref = buckets_[dst_bucket];
    dst_ref.keys[dst_slot] = src_ref.keys[src_slot];
    dst_ref.values[dst_slot] = src_ref.values[src_slot];
  }

  bool CuckooInsert(key_type k, const value& v, uint64 b1, uint64 b2) {
    int visited_end = 0;
    cpq_->reset();

    cpq_->push_back({b1, 1, 0, 0});  // Note depth starts at 1.
    cpq_->push_back({b2, 1, 0, 0});

    while (!cpq_->empty()) {
      CuckooPathEntry e = cpq_->pop_front();
      int free_slot = SpaceAvailable(e.bucket);
      if (free_slot != kNoSpace) {
        while (e.depth > 1) {
          // "copy" instead of "swap" because one of the two slots is always
          // free.  Afterwards, write the target key/value over the last
          // copied entry.
          CuckooPathEntry parent = visited_[e.parent];
          CopyItem(parent.bucket, e.parent_slot, e.bucket, free_slot);
          free_slot = e.parent_slot;
          e = parent;
        }
        InsertInternal(k, v, e.bucket, free_slot);
        return true;
      } else {
        if (e.depth < kMaxBFSPathLen) {
          auto parent_index = visited_end;
          visited_[visited_end] = e;
          visited_end++;
          // Don't always start with the same slot, to even out the path depth.
          int start_slot = (k + e.bucket) % kSlotsPerBucket;
          const Bucket& bref = buckets_[e.bucket];
          for (int i = 0; i < kSlotsPerBucket; i++) {
            int slot = (start_slot + i) % kSlotsPerBucket;
            uint64 next_bucket = alt_bucket(bref.keys[slot], e.bucket);
            // Optimization:  Avoid single-step cycles (from e, don't
            // add a child node that is actually e's parent).
            uint64 e_parent_bucket = visited_[e.parent].bucket;
            if (next_bucket != e_parent_bucket) {
              cpq_->push_back({next_bucket, e.depth + 1, parent_index, slot});
            }
          }
        }
      }
    }

    LOG(WARNING) << "Cuckoo path finding failed: Table too small?";
    return false;
  }

  inline uint64 fast_map_to_buckets(uint64 x) const {
    // Map x (uniform in 2^64) to the range [0, num_buckets_ - 1]
    // using Lemire's alternative to modulo reduction:
    // http://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/
    // Instead of x % N, use (x * N) >> 64.
    return presized_cuckoo_map::multiply_high_u64(x, num_buckets_);
  }
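
  // For instance (illustrative numbers only), with num_buckets_ == 100,
  // fast_map_to_buckets(1ULL << 63) sits halfway through the 64-bit range
  // and maps to bucket (2^63 * 100) >> 64 == 50, without any division.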

  // Set upon initialization: num_entries / kLoadFactor / kSlotsPerBucket.
  uint64 num_buckets_;
  std::vector<Bucket> buckets_;

  std::unique_ptr<CuckooPathQueue> cpq_;
  CuckooPathEntry visited_[kVisitedListSize];

  TF_DISALLOW_COPY_AND_ASSIGN(PresizedCuckooMap);
};

}  // namespace tensorflow

#endif  // TENSORFLOW_UTIL_PRESIZED_CUCKOO_MAP_H_