Home | History | Annotate | Download | only in Support
      1 //===--- OnDiskHashTable.h - On-Disk Hash Table Implementation --*- C++ -*-===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 ///
     10 /// \file
     11 /// \brief Defines facilities for reading and writing on-disk hash tables.
     12 ///
     13 //===----------------------------------------------------------------------===//
     14 #ifndef LLVM_SUPPORT_ON_DISK_HASH_TABLE_H
     15 #define LLVM_SUPPORT_ON_DISK_HASH_TABLE_H
     16 
     17 #include "llvm/Support/Allocator.h"
     18 #include "llvm/Support/AlignOf.h"
     19 #include "llvm/Support/DataTypes.h"
     20 #include "llvm/Support/EndianStream.h"
     21 #include "llvm/Support/Host.h"
     22 #include "llvm/Support/MathExtras.h"
     23 #include "llvm/Support/raw_ostream.h"
     24 #include <cassert>
     25 #include <cstdlib>
     26 
     27 namespace llvm {
     28 
     29 /// \brief Generates an on disk hash table.
     30 ///
     31 /// This needs an \c Info that handles storing values into the hash table's
     32 /// payload and computes the hash for a given key. This should provide the
     33 /// following interface:
     34 ///
     35 /// \code
     36 /// class ExampleInfo {
     37 /// public:
     38 ///   typedef ExampleKey key_type;   // Must be copy constructible
     39 ///   typedef ExampleKey &key_type_ref;
     40 ///   typedef ExampleData data_type; // Must be copy constructible
     41 ///   typedef ExampleData &data_type_ref;
     42 ///   typedef uint32_t hash_value_type; // The type the hash function returns.
     43 ///   typedef uint32_t offset_type; // The type for offsets into the table.
     44 ///
     45 ///   /// Calculate the hash for Key
     46 ///   static hash_value_type ComputeHash(key_type_ref Key);
     47 ///   /// Return the lengths, in bytes, of the given Key/Data pair.
     48 ///   static std::pair<offset_type, offset_type>
     49 ///   EmitKeyDataLength(raw_ostream &Out, key_type_ref Key, data_type_ref Data);
     50 ///   /// Write Key to Out.  KeyLen is the length from EmitKeyDataLength.
     51 ///   static void EmitKey(raw_ostream &Out, key_type_ref Key,
     52 ///                       offset_type KeyLen);
     53 ///   /// Write Data to Out.  DataLen is the length from EmitKeyDataLength.
     54 ///   static void EmitData(raw_ostream &Out, key_type_ref Key,
     55 ///                        data_type_ref Data, offset_type DataLen);
     56 /// };
     57 /// \endcode
     58 template <typename Info> class OnDiskChainedHashTableGenerator {
     59   /// \brief A single item in the hash table.
     60   class Item {
     61   public:
     62     typename Info::key_type Key;
     63     typename Info::data_type Data;
     64     Item *Next;
     65     const typename Info::hash_value_type Hash;
     66 
     67     Item(typename Info::key_type_ref Key, typename Info::data_type_ref Data,
     68          Info &InfoObj)
     69         : Key(Key), Data(Data), Next(nullptr), Hash(InfoObj.ComputeHash(Key)) {}
     70   };
     71 
     72   typedef typename Info::offset_type offset_type;
     73   offset_type NumBuckets;
     74   offset_type NumEntries;
     75   llvm::SpecificBumpPtrAllocator<Item> BA;
     76 
     77   /// \brief A linked list of values in a particular hash bucket.
     78   class Bucket {
     79   public:
     80     offset_type Off;
     81     Item *Head;
     82     unsigned Length;
     83 
     84     Bucket() {}
     85   };
     86 
     87   Bucket *Buckets;
     88 
     89 private:
     90   /// \brief Insert an item into the appropriate hash bucket.
     91   void insert(Bucket *Buckets, size_t Size, Item *E) {
     92     Bucket &B = Buckets[E->Hash & (Size - 1)];
     93     E->Next = B.Head;
     94     ++B.Length;
     95     B.Head = E;
     96   }
     97 
     98   /// \brief Resize the hash table, moving the old entries into the new buckets.
     99   void resize(size_t NewSize) {
    100     Bucket *NewBuckets = (Bucket *)std::calloc(NewSize, sizeof(Bucket));
    101     // Populate NewBuckets with the old entries.
    102     for (size_t I = 0; I < NumBuckets; ++I)
    103       for (Item *E = Buckets[I].Head; E;) {
    104         Item *N = E->Next;
    105         E->Next = nullptr;
    106         insert(NewBuckets, NewSize, E);
    107         E = N;
    108       }
    109 
    110     free(Buckets);
    111     NumBuckets = NewSize;
    112     Buckets = NewBuckets;
    113   }
    114 
    115 public:
    116   /// \brief Insert an entry into the table.
    117   void insert(typename Info::key_type_ref Key,
    118               typename Info::data_type_ref Data) {
    119     Info InfoObj;
    120     insert(Key, Data, InfoObj);
    121   }
    122 
    123   /// \brief Insert an entry into the table.
    124   ///
    125   /// Uses the provided Info instead of a stack allocated one.
    126   void insert(typename Info::key_type_ref Key,
    127               typename Info::data_type_ref Data, Info &InfoObj) {
    128 
    129     ++NumEntries;
    130     if (4 * NumEntries >= 3 * NumBuckets)
    131       resize(NumBuckets * 2);
    132     insert(Buckets, NumBuckets, new (BA.Allocate()) Item(Key, Data, InfoObj));
    133   }
    134 
    135   /// \brief Emit the table to Out, which must not be at offset 0.
    136   offset_type Emit(raw_ostream &Out) {
    137     Info InfoObj;
    138     return Emit(Out, InfoObj);
    139   }
    140 
    141   /// \brief Emit the table to Out, which must not be at offset 0.
    142   ///
    143   /// Uses the provided Info instead of a stack allocated one.
    144   offset_type Emit(raw_ostream &Out, Info &InfoObj) {
    145     using namespace llvm::support;
    146     endian::Writer<little> LE(Out);
    147 
    148     // Emit the payload of the table.
    149     for (offset_type I = 0; I < NumBuckets; ++I) {
    150       Bucket &B = Buckets[I];
    151       if (!B.Head)
    152         continue;
    153 
    154       // Store the offset for the data of this bucket.
    155       B.Off = Out.tell();
    156       assert(B.Off && "Cannot write a bucket at offset 0. Please add padding.");
    157 
    158       // Write out the number of items in the bucket.
    159       LE.write<uint16_t>(B.Length);
    160       assert(B.Length != 0 && "Bucket has a head but zero length?");
    161 
    162       // Write out the entries in the bucket.
    163       for (Item *I = B.Head; I; I = I->Next) {
    164         LE.write<typename Info::hash_value_type>(I->Hash);
    165         const std::pair<offset_type, offset_type> &Len =
    166             InfoObj.EmitKeyDataLength(Out, I->Key, I->Data);
    167         InfoObj.EmitKey(Out, I->Key, Len.first);
    168         InfoObj.EmitData(Out, I->Key, I->Data, Len.second);
    169       }
    170     }
    171 
    172     // Pad with zeros so that we can start the hashtable at an aligned address.
    173     offset_type TableOff = Out.tell();
    174     uint64_t N = llvm::OffsetToAlignment(TableOff, alignOf<offset_type>());
    175     TableOff += N;
    176     while (N--)
    177       LE.write<uint8_t>(0);
    178 
    179     // Emit the hashtable itself.
    180     LE.write<offset_type>(NumBuckets);
    181     LE.write<offset_type>(NumEntries);
    182     for (offset_type I = 0; I < NumBuckets; ++I)
    183       LE.write<offset_type>(Buckets[I].Off);
    184 
    185     return TableOff;
    186   }
    187 
    188   OnDiskChainedHashTableGenerator() {
    189     NumEntries = 0;
    190     NumBuckets = 64;
    191     // Note that we do not need to run the constructors of the individual
    192     // Bucket objects since 'calloc' returns bytes that are all 0.
    193     Buckets = (Bucket *)std::calloc(NumBuckets, sizeof(Bucket));
    194   }
    195 
    196   ~OnDiskChainedHashTableGenerator() { std::free(Buckets); }
    197 };
    198 
    199 /// \brief Provides lookup on an on disk hash table.
    200 ///
    201 /// This needs an \c Info that handles reading values from the hash table's
    202 /// payload and computes the hash for a given key. This should provide the
    203 /// following interface:
    204 ///
    205 /// \code
    206 /// class ExampleLookupInfo {
    207 /// public:
    208 ///   typedef ExampleData data_type;
    209 ///   typedef ExampleInternalKey internal_key_type; // The stored key type.
    210 ///   typedef ExampleKey external_key_type; // The type to pass to find().
    211 ///   typedef uint32_t hash_value_type; // The type the hash function returns.
    212 ///   typedef uint32_t offset_type; // The type for offsets into the table.
    213 ///
    214 ///   /// Compare two keys for equality.
    215 ///   static bool EqualKey(internal_key_type &Key1, internal_key_type &Key2);
    216 ///   /// Calculate the hash for the given key.
    217 ///   static hash_value_type ComputeHash(internal_key_type &IKey);
    218 ///   /// Translate from the semantic type of a key in the hash table to the
    219 ///   /// type that is actually stored and used for hashing and comparisons.
    220 ///   /// The internal and external types are often the same, in which case this
    221 ///   /// can simply return the passed in value.
    222 ///   static const internal_key_type &GetInternalKey(external_key_type &EKey);
    223 ///   /// Read the key and data length from Buffer, leaving it pointing at the
    224 ///   /// following byte.
    225 ///   static std::pair<offset_type, offset_type>
    226 ///   ReadKeyDataLength(const unsigned char *&Buffer);
    227 ///   /// Read the key from Buffer, given the KeyLen as reported from
    228 ///   /// ReadKeyDataLength.
    229 ///   const internal_key_type &ReadKey(const unsigned char *Buffer,
    230 ///                                    offset_type KeyLen);
    231 ///   /// Read the data for Key from Buffer, given the DataLen as reported from
    232 ///   /// ReadKeyDataLength.
    233 ///   data_type ReadData(StringRef Key, const unsigned char *Buffer,
    234 ///                      offset_type DataLen);
    235 /// };
    236 /// \endcode
    237 template <typename Info> class OnDiskChainedHashTable {
    238   const typename Info::offset_type NumBuckets;
    239   const typename Info::offset_type NumEntries;
    240   const unsigned char *const Buckets;
    241   const unsigned char *const Base;
    242   Info InfoObj;
    243 
    244 public:
    245   typedef typename Info::internal_key_type internal_key_type;
    246   typedef typename Info::external_key_type external_key_type;
    247   typedef typename Info::data_type         data_type;
    248   typedef typename Info::hash_value_type   hash_value_type;
    249   typedef typename Info::offset_type       offset_type;
    250 
    251   OnDiskChainedHashTable(offset_type NumBuckets, offset_type NumEntries,
    252                          const unsigned char *Buckets,
    253                          const unsigned char *Base,
    254                          const Info &InfoObj = Info())
    255       : NumBuckets(NumBuckets), NumEntries(NumEntries), Buckets(Buckets),
    256         Base(Base), InfoObj(InfoObj) {
    257     assert((reinterpret_cast<uintptr_t>(Buckets) & 0x3) == 0 &&
    258            "'buckets' must have a 4-byte alignment");
    259   }
    260 
    261   offset_type getNumBuckets() const { return NumBuckets; }
    262   offset_type getNumEntries() const { return NumEntries; }
    263   const unsigned char *getBase() const { return Base; }
    264   const unsigned char *getBuckets() const { return Buckets; }
    265 
    266   bool isEmpty() const { return NumEntries == 0; }
    267 
    268   class iterator {
    269     internal_key_type Key;
    270     const unsigned char *const Data;
    271     const offset_type Len;
    272     Info *InfoObj;
    273 
    274   public:
    275     iterator() : Data(nullptr), Len(0) {}
    276     iterator(const internal_key_type K, const unsigned char *D, offset_type L,
    277              Info *InfoObj)
    278         : Key(K), Data(D), Len(L), InfoObj(InfoObj) {}
    279 
    280     data_type operator*() const { return InfoObj->ReadData(Key, Data, Len); }
    281     bool operator==(const iterator &X) const { return X.Data == Data; }
    282     bool operator!=(const iterator &X) const { return X.Data != Data; }
    283   };
    284 
    285   /// \brief Look up the stored data for a particular key.
    286   iterator find(const external_key_type &EKey, Info *InfoPtr = 0) {
    287     if (!InfoPtr)
    288       InfoPtr = &InfoObj;
    289 
    290     using namespace llvm::support;
    291     const internal_key_type &IKey = InfoObj.GetInternalKey(EKey);
    292     hash_value_type KeyHash = InfoObj.ComputeHash(IKey);
    293 
    294     // Each bucket is just an offset into the hash table file.
    295     offset_type Idx = KeyHash & (NumBuckets - 1);
    296     const unsigned char *Bucket = Buckets + sizeof(offset_type) * Idx;
    297 
    298     offset_type Offset = endian::readNext<offset_type, little, aligned>(Bucket);
    299     if (Offset == 0)
    300       return iterator(); // Empty bucket.
    301     const unsigned char *Items = Base + Offset;
    302 
    303     // 'Items' starts with a 16-bit unsigned integer representing the
    304     // number of items in this bucket.
    305     unsigned Len = endian::readNext<uint16_t, little, unaligned>(Items);
    306 
    307     for (unsigned i = 0; i < Len; ++i) {
    308       // Read the hash.
    309       hash_value_type ItemHash =
    310           endian::readNext<hash_value_type, little, unaligned>(Items);
    311 
    312       // Determine the length of the key and the data.
    313       const std::pair<offset_type, offset_type> &L =
    314           Info::ReadKeyDataLength(Items);
    315       offset_type ItemLen = L.first + L.second;
    316 
    317       // Compare the hashes.  If they are not the same, skip the entry entirely.
    318       if (ItemHash != KeyHash) {
    319         Items += ItemLen;
    320         continue;
    321       }
    322 
    323       // Read the key.
    324       const internal_key_type &X =
    325           InfoPtr->ReadKey((const unsigned char *const)Items, L.first);
    326 
    327       // If the key doesn't match just skip reading the value.
    328       if (!InfoPtr->EqualKey(X, IKey)) {
    329         Items += ItemLen;
    330         continue;
    331       }
    332 
    333       // The key matches!
    334       return iterator(X, Items + L.first, L.second, InfoPtr);
    335     }
    336 
    337     return iterator();
    338   }
    339 
    340   iterator end() const { return iterator(); }
    341 
    342   Info &getInfoObj() { return InfoObj; }
    343 
    344   /// \brief Create the hash table.
    345   ///
    346   /// \param Buckets is the beginning of the hash table itself, which follows
    347   /// the payload of entire structure. This is the value returned by
    348   /// OnDiskHashTableGenerator::Emit.
    349   ///
    350   /// \param Base is the point from which all offsets into the structure are
    351   /// based. This is offset 0 in the stream that was used when Emitting the
    352   /// table.
    353   static OnDiskChainedHashTable *Create(const unsigned char *Buckets,
    354                                         const unsigned char *const Base,
    355                                         const Info &InfoObj = Info()) {
    356     using namespace llvm::support;
    357     assert(Buckets > Base);
    358     assert((reinterpret_cast<uintptr_t>(Buckets) & 0x3) == 0 &&
    359            "buckets should be 4-byte aligned.");
    360 
    361     offset_type NumBuckets =
    362         endian::readNext<offset_type, little, aligned>(Buckets);
    363     offset_type NumEntries =
    364         endian::readNext<offset_type, little, aligned>(Buckets);
    365     return new OnDiskChainedHashTable<Info>(NumBuckets, NumEntries, Buckets,
    366                                             Base, InfoObj);
    367   }
    368 };
    369 
    370 /// \brief Provides lookup and iteration over an on disk hash table.
    371 ///
    372 /// \copydetails llvm::OnDiskChainedHashTable
    373 template <typename Info>
    374 class OnDiskIterableChainedHashTable : public OnDiskChainedHashTable<Info> {
    375   const unsigned char *Payload;
    376 
    377 public:
    378   typedef OnDiskChainedHashTable<Info>          base_type;
    379   typedef typename base_type::internal_key_type internal_key_type;
    380   typedef typename base_type::external_key_type external_key_type;
    381   typedef typename base_type::data_type         data_type;
    382   typedef typename base_type::hash_value_type   hash_value_type;
    383   typedef typename base_type::offset_type       offset_type;
    384 
    385   OnDiskIterableChainedHashTable(offset_type NumBuckets, offset_type NumEntries,
    386                                  const unsigned char *Buckets,
    387                                  const unsigned char *Payload,
    388                                  const unsigned char *Base,
    389                                  const Info &InfoObj = Info())
    390       : base_type(NumBuckets, NumEntries, Buckets, Base, InfoObj),
    391         Payload(Payload) {}
    392 
    393   /// \brief Iterates over all of the keys in the table.
    394   class key_iterator {
    395     const unsigned char *Ptr;
    396     offset_type NumItemsInBucketLeft;
    397     offset_type NumEntriesLeft;
    398     Info *InfoObj;
    399 
    400   public:
    401     typedef external_key_type value_type;
    402 
    403     key_iterator(const unsigned char *const Ptr, offset_type NumEntries,
    404                  Info *InfoObj)
    405         : Ptr(Ptr), NumItemsInBucketLeft(0), NumEntriesLeft(NumEntries),
    406           InfoObj(InfoObj) {}
    407     key_iterator()
    408         : Ptr(nullptr), NumItemsInBucketLeft(0), NumEntriesLeft(0),
    409           InfoObj(0) {}
    410 
    411     friend bool operator==(const key_iterator &X, const key_iterator &Y) {
    412       return X.NumEntriesLeft == Y.NumEntriesLeft;
    413     }
    414     friend bool operator!=(const key_iterator &X, const key_iterator &Y) {
    415       return X.NumEntriesLeft != Y.NumEntriesLeft;
    416     }
    417 
    418     key_iterator &operator++() { // Preincrement
    419       using namespace llvm::support;
    420       if (!NumItemsInBucketLeft) {
    421         // 'Items' starts with a 16-bit unsigned integer representing the
    422         // number of items in this bucket.
    423         NumItemsInBucketLeft =
    424             endian::readNext<uint16_t, little, unaligned>(Ptr);
    425       }
    426       Ptr += sizeof(hash_value_type); // Skip the hash.
    427       // Determine the length of the key and the data.
    428       const std::pair<offset_type, offset_type> &L =
    429           Info::ReadKeyDataLength(Ptr);
    430       Ptr += L.first + L.second;
    431       assert(NumItemsInBucketLeft);
    432       --NumItemsInBucketLeft;
    433       assert(NumEntriesLeft);
    434       --NumEntriesLeft;
    435       return *this;
    436     }
    437     key_iterator operator++(int) { // Postincrement
    438       key_iterator tmp = *this; ++*this; return tmp;
    439     }
    440 
    441     value_type operator*() const {
    442       const unsigned char *LocalPtr = Ptr;
    443       if (!NumItemsInBucketLeft)
    444         LocalPtr += 2; // number of items in bucket
    445       LocalPtr += sizeof(hash_value_type); // Skip the hash.
    446 
    447       // Determine the length of the key and the data.
    448       const std::pair<offset_type, offset_type> &L =
    449           Info::ReadKeyDataLength(LocalPtr);
    450 
    451       // Read the key.
    452       const internal_key_type &Key = InfoObj->ReadKey(LocalPtr, L.first);
    453       return InfoObj->GetExternalKey(Key);
    454     }
    455   };
    456 
    457   key_iterator key_begin() {
    458     return key_iterator(Payload, this->getNumEntries(), &this->getInfoObj());
    459   }
    460   key_iterator key_end() { return key_iterator(); }
    461 
    462   iterator_range<key_iterator> keys() {
    463     return make_range(key_begin(), key_end());
    464   }
    465 
    466   /// \brief Iterates over all the entries in the table, returning the data.
    467   class data_iterator {
    468     const unsigned char *Ptr;
    469     offset_type NumItemsInBucketLeft;
    470     offset_type NumEntriesLeft;
    471     Info *InfoObj;
    472 
    473   public:
    474     typedef data_type value_type;
    475 
    476     data_iterator(const unsigned char *const Ptr, offset_type NumEntries,
    477                   Info *InfoObj)
    478         : Ptr(Ptr), NumItemsInBucketLeft(0), NumEntriesLeft(NumEntries),
    479           InfoObj(InfoObj) {}
    480     data_iterator()
    481         : Ptr(nullptr), NumItemsInBucketLeft(0), NumEntriesLeft(0),
    482           InfoObj(nullptr) {}
    483 
    484     bool operator==(const data_iterator &X) const {
    485       return X.NumEntriesLeft == NumEntriesLeft;
    486     }
    487     bool operator!=(const data_iterator &X) const {
    488       return X.NumEntriesLeft != NumEntriesLeft;
    489     }
    490 
    491     data_iterator &operator++() { // Preincrement
    492       using namespace llvm::support;
    493       if (!NumItemsInBucketLeft) {
    494         // 'Items' starts with a 16-bit unsigned integer representing the
    495         // number of items in this bucket.
    496         NumItemsInBucketLeft =
    497             endian::readNext<uint16_t, little, unaligned>(Ptr);
    498       }
    499       Ptr += sizeof(hash_value_type); // Skip the hash.
    500       // Determine the length of the key and the data.
    501       const std::pair<offset_type, offset_type> &L =
    502           Info::ReadKeyDataLength(Ptr);
    503       Ptr += L.first + L.second;
    504       assert(NumItemsInBucketLeft);
    505       --NumItemsInBucketLeft;
    506       assert(NumEntriesLeft);
    507       --NumEntriesLeft;
    508       return *this;
    509     }
    510     data_iterator operator++(int) { // Postincrement
    511       data_iterator tmp = *this; ++*this; return tmp;
    512     }
    513 
    514     value_type operator*() const {
    515       const unsigned char *LocalPtr = Ptr;
    516       if (!NumItemsInBucketLeft)
    517         LocalPtr += 2; // number of items in bucket
    518       LocalPtr += sizeof(hash_value_type); // Skip the hash.
    519 
    520       // Determine the length of the key and the data.
    521       const std::pair<offset_type, offset_type> &L =
    522           Info::ReadKeyDataLength(LocalPtr);
    523 
    524       // Read the key.
    525       const internal_key_type &Key = InfoObj->ReadKey(LocalPtr, L.first);
    526       return InfoObj->ReadData(Key, LocalPtr + L.first, L.second);
    527     }
    528   };
    529 
    530   data_iterator data_begin() {
    531     return data_iterator(Payload, this->getNumEntries(), &this->getInfoObj());
    532   }
    533   data_iterator data_end() { return data_iterator(); }
    534 
    535   iterator_range<data_iterator> data() {
    536     return make_range(data_begin(), data_end());
    537   }
    538 
    539   /// \brief Create the hash table.
    540   ///
    541   /// \param Buckets is the beginning of the hash table itself, which follows
    542   /// the payload of entire structure. This is the value returned by
    543   /// OnDiskHashTableGenerator::Emit.
    544   ///
    545   /// \param Payload is the beginning of the data contained in the table.  This
    546   /// is Base plus any padding or header data that was stored, ie, the offset
    547   /// that the stream was at when calling Emit.
    548   ///
    549   /// \param Base is the point from which all offsets into the structure are
    550   /// based. This is offset 0 in the stream that was used when Emitting the
    551   /// table.
    552   static OnDiskIterableChainedHashTable *
    553   Create(const unsigned char *Buckets, const unsigned char *const Payload,
    554          const unsigned char *const Base, const Info &InfoObj = Info()) {
    555     using namespace llvm::support;
    556     assert(Buckets > Base);
    557     assert((reinterpret_cast<uintptr_t>(Buckets) & 0x3) == 0 &&
    558            "buckets should be 4-byte aligned.");
    559 
    560     offset_type NumBuckets =
    561         endian::readNext<offset_type, little, aligned>(Buckets);
    562     offset_type NumEntries =
    563         endian::readNext<offset_type, little, aligned>(Buckets);
    564     return new OnDiskIterableChainedHashTable<Info>(
    565         NumBuckets, NumEntries, Buckets, Payload, Base, InfoObj);
    566   }
    567 };
    568 
    569 } // end namespace llvm
    570 
    571 #endif // LLVM_SUPPORT_ON_DISK_HASH_TABLE_H
    572