Home | History | Annotate | Download | only in base
      1 /*
      2  * Copyright (C) 2014 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 #ifndef ART_RUNTIME_BASE_HASH_SET_H_
     18 #define ART_RUNTIME_BASE_HASH_SET_H_
     19 
     20 #include <functional>
     21 #include <memory>
     22 #include <stdint.h>
     23 #include <utility>
     24 
     25 #include "bit_utils.h"
     26 #include "logging.h"
     27 
     28 namespace art {
     29 
     30 // Returns true if an item is empty.
     31 template <class T>
     32 class DefaultEmptyFn {
     33  public:
     34   void MakeEmpty(T& item) const {
     35     item = T();
     36   }
     37   bool IsEmpty(const T& item) const {
     38     return item == T();
     39   }
     40 };
     41 
     42 template <class T>
     43 class DefaultEmptyFn<T*> {
     44  public:
     45   void MakeEmpty(T*& item) const {
     46     item = nullptr;
     47   }
     48   bool IsEmpty(const T*& item) const {
     49     return item == nullptr;
     50   }
     51 };
     52 
     53 // Low memory version of a hash set, uses less memory than std::unordered_set since elements aren't
     54 // boxed. Uses linear probing to resolve collisions.
     55 // EmptyFn needs to implement two functions MakeEmpty(T& item) and IsEmpty(const T& item).
     56 // TODO: We could get rid of this requirement by using a bitmap, though maybe this would be slower
     57 // and more complicated.
     58 template <class T, class EmptyFn = DefaultEmptyFn<T>, class HashFn = std::hash<T>,
     59     class Pred = std::equal_to<T>, class Alloc = std::allocator<T>>
     60 class HashSet {
     61   template <class Elem, class HashSetType>
     62   class BaseIterator {
     63    public:
     64     BaseIterator(const BaseIterator&) = default;
     65     BaseIterator(BaseIterator&&) = default;
     66     BaseIterator(HashSetType* hash_set, size_t index) : index_(index), hash_set_(hash_set) {
     67     }
     68     BaseIterator& operator=(const BaseIterator&) = default;
     69     BaseIterator& operator=(BaseIterator&&) = default;
     70 
     71     bool operator==(const BaseIterator& other) const {
     72       return hash_set_ == other.hash_set_ && this->index_ == other.index_;
     73     }
     74 
     75     bool operator!=(const BaseIterator& other) const {
     76       return !(*this == other);
     77     }
     78 
     79     BaseIterator operator++() {  // Value after modification.
     80       this->index_ = this->NextNonEmptySlot(this->index_, hash_set_);
     81       return *this;
     82     }
     83 
     84     BaseIterator operator++(int) {
     85       Iterator temp = *this;
     86       this->index_ = this->NextNonEmptySlot(this->index_, hash_set_);
     87       return temp;
     88     }
     89 
     90     Elem& operator*() const {
     91       DCHECK(!hash_set_->IsFreeSlot(this->index_));
     92       return hash_set_->ElementForIndex(this->index_);
     93     }
     94 
     95     Elem* operator->() const {
     96       return &**this;
     97     }
     98 
     99     // TODO: Operator -- --(int)
    100 
    101    private:
    102     size_t index_;
    103     HashSetType* hash_set_;
    104 
    105     size_t NextNonEmptySlot(size_t index, const HashSet* hash_set) const {
    106       const size_t num_buckets = hash_set->NumBuckets();
    107       DCHECK_LT(index, num_buckets);
    108       do {
    109         ++index;
    110       } while (index < num_buckets && hash_set->IsFreeSlot(index));
    111       return index;
    112     }
    113 
    114     friend class HashSet;
    115   };
    116 
    117  public:
    118   static constexpr double kDefaultMinLoadFactor = 0.5;
    119   static constexpr double kDefaultMaxLoadFactor = 0.9;
    120   static constexpr size_t kMinBuckets = 1000;
    121 
    122   typedef BaseIterator<T, HashSet> Iterator;
    123   typedef BaseIterator<const T, const HashSet> ConstIterator;
    124 
    125   // If we don't own the data, this will create a new array which owns the data.
    126   void Clear() {
    127     DeallocateStorage();
    128     AllocateStorage(1);
    129     num_elements_ = 0;
    130     elements_until_expand_ = 0;
    131   }
    132 
    133   HashSet() : num_elements_(0), num_buckets_(0), owns_data_(false), data_(nullptr),
    134       min_load_factor_(kDefaultMinLoadFactor), max_load_factor_(kDefaultMaxLoadFactor) {
    135     Clear();
    136   }
    137 
    138   HashSet(const HashSet& other) : num_elements_(0), num_buckets_(0), owns_data_(false),
    139       data_(nullptr) {
    140     *this = other;
    141   }
    142 
    143   HashSet(HashSet&& other) : num_elements_(0), num_buckets_(0), owns_data_(false),
    144       data_(nullptr) {
    145     *this = std::move(other);
    146   }
    147 
    148   // Construct from existing data.
    149   // Read from a block of memory, if make_copy_of_data is false, then data_ points to within the
    150   // passed in ptr_.
    151   HashSet(const uint8_t* ptr, bool make_copy_of_data, size_t* read_count) {
    152     uint64_t temp;
    153     size_t offset = 0;
    154     offset = ReadFromBytes(ptr, offset, &temp);
    155     num_elements_ = static_cast<uint64_t>(temp);
    156     offset = ReadFromBytes(ptr, offset, &temp);
    157     num_buckets_ = static_cast<uint64_t>(temp);
    158     CHECK_LE(num_elements_, num_buckets_);
    159     offset = ReadFromBytes(ptr, offset, &temp);
    160     elements_until_expand_ = static_cast<uint64_t>(temp);
    161     offset = ReadFromBytes(ptr, offset, &min_load_factor_);
    162     offset = ReadFromBytes(ptr, offset, &max_load_factor_);
    163     if (!make_copy_of_data) {
    164       owns_data_ = false;
    165       data_ = const_cast<T*>(reinterpret_cast<const T*>(ptr + offset));
    166       offset += sizeof(*data_) * num_buckets_;
    167     } else {
    168       AllocateStorage(num_buckets_);
    169       // Write elements, not that this may not be safe for cross compilation if the elements are
    170       // pointer sized.
    171       for (size_t i = 0; i < num_buckets_; ++i) {
    172         offset = ReadFromBytes(ptr, offset, &data_[i]);
    173       }
    174     }
    175     // Caller responsible for aligning.
    176     *read_count = offset;
    177   }
    178 
    179   // Returns how large the table is after being written. If target is null, then no writing happens
    180   // but the size is still returned. Target must be 8 byte aligned.
    181   size_t WriteToMemory(uint8_t* ptr) {
    182     size_t offset = 0;
    183     offset = WriteToBytes(ptr, offset, static_cast<uint64_t>(num_elements_));
    184     offset = WriteToBytes(ptr, offset, static_cast<uint64_t>(num_buckets_));
    185     offset = WriteToBytes(ptr, offset, static_cast<uint64_t>(elements_until_expand_));
    186     offset = WriteToBytes(ptr, offset, min_load_factor_);
    187     offset = WriteToBytes(ptr, offset, max_load_factor_);
    188     // Write elements, not that this may not be safe for cross compilation if the elements are
    189     // pointer sized.
    190     for (size_t i = 0; i < num_buckets_; ++i) {
    191       offset = WriteToBytes(ptr, offset, data_[i]);
    192     }
    193     // Caller responsible for aligning.
    194     return offset;
    195   }
    196 
    197   ~HashSet() {
    198     DeallocateStorage();
    199   }
    200 
    201   HashSet& operator=(HashSet&& other) {
    202     std::swap(data_, other.data_);
    203     std::swap(num_buckets_, other.num_buckets_);
    204     std::swap(num_elements_, other.num_elements_);
    205     std::swap(elements_until_expand_, other.elements_until_expand_);
    206     std::swap(min_load_factor_, other.min_load_factor_);
    207     std::swap(max_load_factor_, other.max_load_factor_);
    208     std::swap(owns_data_, other.owns_data_);
    209     return *this;
    210   }
    211 
    212   HashSet& operator=(const HashSet& other) {
    213     DeallocateStorage();
    214     AllocateStorage(other.NumBuckets());
    215     for (size_t i = 0; i < num_buckets_; ++i) {
    216       ElementForIndex(i) = other.data_[i];
    217     }
    218     num_elements_ = other.num_elements_;
    219     elements_until_expand_ = other.elements_until_expand_;
    220     min_load_factor_ = other.min_load_factor_;
    221     max_load_factor_ = other.max_load_factor_;
    222     return *this;
    223   }
    224 
    225   // Lower case for c++11 for each.
    226   Iterator begin() {
    227     Iterator ret(this, 0);
    228     if (num_buckets_ != 0 && IsFreeSlot(ret.index_)) {
    229       ++ret;  // Skip all the empty slots.
    230     }
    231     return ret;
    232   }
    233 
    234   // Lower case for c++11 for each.
    235   Iterator end() {
    236     return Iterator(this, NumBuckets());
    237   }
    238 
    239   bool Empty() {
    240     return Size() == 0;
    241   }
    242 
    243   // Erase algorithm:
    244   // Make an empty slot where the iterator is pointing.
    245   // Scan fowards until we hit another empty slot.
    246   // If an element inbetween doesn't rehash to the range from the current empty slot to the
    247   // iterator. It must be before the empty slot, in that case we can move it to the empty slot
    248   // and set the empty slot to be the location we just moved from.
    249   // Relies on maintaining the invariant that there's no empty slots from the 'ideal' index of an
    250   // element to its actual location/index.
    251   Iterator Erase(Iterator it) {
    252     // empty_index is the index that will become empty.
    253     size_t empty_index = it.index_;
    254     DCHECK(!IsFreeSlot(empty_index));
    255     size_t next_index = empty_index;
    256     bool filled = false;  // True if we filled the empty index.
    257     while (true) {
    258       next_index = NextIndex(next_index);
    259       T& next_element = ElementForIndex(next_index);
    260       // If the next element is empty, we are done. Make sure to clear the current empty index.
    261       if (emptyfn_.IsEmpty(next_element)) {
    262         emptyfn_.MakeEmpty(ElementForIndex(empty_index));
    263         break;
    264       }
    265       // Otherwise try to see if the next element can fill the current empty index.
    266       const size_t next_hash = hashfn_(next_element);
    267       // Calculate the ideal index, if it is within empty_index + 1 to next_index then there is
    268       // nothing we can do.
    269       size_t next_ideal_index = IndexForHash(next_hash);
    270       // Loop around if needed for our check.
    271       size_t unwrapped_next_index = next_index;
    272       if (unwrapped_next_index < empty_index) {
    273         unwrapped_next_index += NumBuckets();
    274       }
    275       // Loop around if needed for our check.
    276       size_t unwrapped_next_ideal_index = next_ideal_index;
    277       if (unwrapped_next_ideal_index < empty_index) {
    278         unwrapped_next_ideal_index += NumBuckets();
    279       }
    280       if (unwrapped_next_ideal_index <= empty_index ||
    281           unwrapped_next_ideal_index > unwrapped_next_index) {
    282         // If the target index isn't within our current range it must have been probed from before
    283         // the empty index.
    284         ElementForIndex(empty_index) = std::move(next_element);
    285         filled = true;  // TODO: Optimize
    286         empty_index = next_index;
    287       }
    288     }
    289     --num_elements_;
    290     // If we didn't fill the slot then we need go to the next non free slot.
    291     if (!filled) {
    292       ++it;
    293     }
    294     return it;
    295   }
    296 
    297   // Find an element, returns end() if not found.
    298   // Allows custom key (K) types, example of when this is useful:
    299   // Set of Class* sorted by name, want to find a class with a name but can't allocate a dummy
    300   // object in the heap for performance solution.
    301   template <typename K>
    302   Iterator Find(const K& element) {
    303     return FindWithHash(element, hashfn_(element));
    304   }
    305 
    306   template <typename K>
    307   ConstIterator Find(const K& element) const {
    308     return FindWithHash(element, hashfn_(element));
    309   }
    310 
    311   template <typename K>
    312   Iterator FindWithHash(const K& element, size_t hash) {
    313     return Iterator(this, FindIndex(element, hash));
    314   }
    315 
    316   template <typename K>
    317   ConstIterator FindWithHash(const K& element, size_t hash) const {
    318     return ConstIterator(this, FindIndex(element, hash));
    319   }
    320 
    321   // Insert an element, allows duplicates.
    322   void Insert(const T& element) {
    323     InsertWithHash(element, hashfn_(element));
    324   }
    325 
    326   void InsertWithHash(const T& element, size_t hash) {
    327     DCHECK_EQ(hash, hashfn_(element));
    328     if (num_elements_ >= elements_until_expand_) {
    329       Expand();
    330       DCHECK_LT(num_elements_, elements_until_expand_);
    331     }
    332     const size_t index = FirstAvailableSlot(IndexForHash(hash));
    333     data_[index] = element;
    334     ++num_elements_;
    335   }
    336 
    337   size_t Size() const {
    338     return num_elements_;
    339   }
    340 
    341   void ShrinkToMaximumLoad() {
    342     Resize(Size() / max_load_factor_);
    343   }
    344 
    345   // To distance that inserted elements were probed. Used for measuring how good hash functions
    346   // are.
    347   size_t TotalProbeDistance() const {
    348     size_t total = 0;
    349     for (size_t i = 0; i < NumBuckets(); ++i) {
    350       const T& element = ElementForIndex(i);
    351       if (!emptyfn_.IsEmpty(element)) {
    352         size_t ideal_location = IndexForHash(hashfn_(element));
    353         if (ideal_location > i) {
    354           total += i + NumBuckets() - ideal_location;
    355         } else {
    356           total += i - ideal_location;
    357         }
    358       }
    359     }
    360     return total;
    361   }
    362 
    363   // Calculate the current load factor and return it.
    364   double CalculateLoadFactor() const {
    365     return static_cast<double>(Size()) / static_cast<double>(NumBuckets());
    366   }
    367 
    368   // Make sure that everything reinserts in the right spot. Returns the number of errors.
    369   size_t Verify() {
    370     size_t errors = 0;
    371     for (size_t i = 0; i < num_buckets_; ++i) {
    372       T& element = data_[i];
    373       if (!emptyfn_.IsEmpty(element)) {
    374         T temp;
    375         emptyfn_.MakeEmpty(temp);
    376         std::swap(temp, element);
    377         size_t first_slot = FirstAvailableSlot(IndexForHash(hashfn_(temp)));
    378         if (i != first_slot) {
    379           LOG(ERROR) << "Element " << i << " should be in slot " << first_slot;
    380           ++errors;
    381         }
    382         std::swap(temp, element);
    383       }
    384     }
    385     return errors;
    386   }
    387 
    388  private:
    389   T& ElementForIndex(size_t index) {
    390     DCHECK_LT(index, NumBuckets());
    391     DCHECK(data_ != nullptr);
    392     return data_[index];
    393   }
    394 
    395   const T& ElementForIndex(size_t index) const {
    396     DCHECK_LT(index, NumBuckets());
    397     DCHECK(data_ != nullptr);
    398     return data_[index];
    399   }
    400 
    401   size_t IndexForHash(size_t hash) const {
    402     return hash % num_buckets_;
    403   }
    404 
    405   size_t NextIndex(size_t index) const {
    406     if (UNLIKELY(++index >= num_buckets_)) {
    407       DCHECK_EQ(index, NumBuckets());
    408       return 0;
    409     }
    410     return index;
    411   }
    412 
    413   // Find the hash table slot for an element, or return NumBuckets() if not found.
    414   // This value for not found is important so that Iterator(this, FindIndex(...)) == end().
    415   template <typename K>
    416   size_t FindIndex(const K& element, size_t hash) const {
    417     DCHECK_EQ(hashfn_(element), hash);
    418     size_t index = IndexForHash(hash);
    419     while (true) {
    420       const T& slot = ElementForIndex(index);
    421       if (emptyfn_.IsEmpty(slot)) {
    422         return NumBuckets();
    423       }
    424       if (pred_(slot, element)) {
    425         return index;
    426       }
    427       index = NextIndex(index);
    428     }
    429   }
    430 
    431   bool IsFreeSlot(size_t index) const {
    432     return emptyfn_.IsEmpty(ElementForIndex(index));
    433   }
    434 
    435   size_t NumBuckets() const {
    436     return num_buckets_;
    437   }
    438 
    439   // Allocate a number of buckets.
    440   void AllocateStorage(size_t num_buckets) {
    441     num_buckets_ = num_buckets;
    442     data_ = allocfn_.allocate(num_buckets_);
    443     owns_data_ = true;
    444     for (size_t i = 0; i < num_buckets_; ++i) {
    445       allocfn_.construct(allocfn_.address(data_[i]));
    446       emptyfn_.MakeEmpty(data_[i]);
    447     }
    448   }
    449 
    450   void DeallocateStorage() {
    451     if (num_buckets_ != 0) {
    452       if (owns_data_) {
    453         for (size_t i = 0; i < NumBuckets(); ++i) {
    454           allocfn_.destroy(allocfn_.address(data_[i]));
    455         }
    456         allocfn_.deallocate(data_, NumBuckets());
    457         owns_data_ = false;
    458       }
    459       data_ = nullptr;
    460       num_buckets_ = 0;
    461     }
    462   }
    463 
    464   // Expand the set based on the load factors.
    465   void Expand() {
    466     size_t min_index = static_cast<size_t>(Size() / min_load_factor_);
    467     if (min_index < kMinBuckets) {
    468       min_index = kMinBuckets;
    469     }
    470     // Resize based on the minimum load factor.
    471     Resize(min_index);
    472     // When we hit elements_until_expand_, we are at the max load factor and must expand again.
    473     elements_until_expand_ = NumBuckets() * max_load_factor_;
    474   }
    475 
    476   // Expand / shrink the table to the new specified size.
    477   void Resize(size_t new_size) {
    478     DCHECK_GE(new_size, Size());
    479     T* const old_data = data_;
    480     size_t old_num_buckets = num_buckets_;
    481     // Reinsert all of the old elements.
    482     const bool owned_data = owns_data_;
    483     AllocateStorage(new_size);
    484     for (size_t i = 0; i < old_num_buckets; ++i) {
    485       T& element = old_data[i];
    486       if (!emptyfn_.IsEmpty(element)) {
    487         data_[FirstAvailableSlot(IndexForHash(hashfn_(element)))] = std::move(element);
    488       }
    489       if (owned_data) {
    490         allocfn_.destroy(allocfn_.address(element));
    491       }
    492     }
    493     if (owned_data) {
    494       allocfn_.deallocate(old_data, old_num_buckets);
    495     }
    496   }
    497 
    498   ALWAYS_INLINE size_t FirstAvailableSlot(size_t index) const {
    499     while (!emptyfn_.IsEmpty(data_[index])) {
    500       index = NextIndex(index);
    501     }
    502     return index;
    503   }
    504 
    505   // Return new offset.
    506   template <typename Elem>
    507   static size_t WriteToBytes(uint8_t* ptr, size_t offset, Elem n) {
    508     DCHECK_ALIGNED(ptr + offset, sizeof(n));
    509     if (ptr != nullptr) {
    510       *reinterpret_cast<Elem*>(ptr + offset) = n;
    511     }
    512     return offset + sizeof(n);
    513   }
    514 
    515   template <typename Elem>
    516   static size_t ReadFromBytes(const uint8_t* ptr, size_t offset, Elem* out) {
    517     DCHECK(ptr != nullptr);
    518     DCHECK_ALIGNED(ptr + offset, sizeof(*out));
    519     *out = *reinterpret_cast<const Elem*>(ptr + offset);
    520     return offset + sizeof(*out);
    521   }
    522 
    523   Alloc allocfn_;  // Allocator function.
    524   HashFn hashfn_;  // Hashing function.
    525   EmptyFn emptyfn_;  // IsEmpty/SetEmpty function.
    526   Pred pred_;  // Equals function.
    527   size_t num_elements_;  // Number of inserted elements.
    528   size_t num_buckets_;  // Number of hash table buckets.
    529   size_t elements_until_expand_;  // Maxmimum number of elements until we expand the table.
    530   bool owns_data_;  // If we own data_ and are responsible for freeing it.
    531   T* data_;  // Backing storage.
    532   double min_load_factor_;
    533   double max_load_factor_;
    534 };
    535 
    536 }  // namespace art
    537 
    538 #endif  // ART_RUNTIME_BASE_HASH_SET_H_
    539