Home | History | Annotate | Download | only in Basic
      1 //===--- OnDiskHashTable.h - On-Disk Hash Table Implementation --*- C++ -*-===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 ///
     10 /// \file
     11 /// \brief Defines facilities for reading and writing on-disk hash tables.
     12 ///
     13 //===----------------------------------------------------------------------===//
     14 #ifndef LLVM_CLANG_BASIC_ON_DISK_HASH_TABLE_H
     15 #define LLVM_CLANG_BASIC_ON_DISK_HASH_TABLE_H
     16 
     17 #include "llvm/Support/Allocator.h"
     18 #include "llvm/Support/DataTypes.h"
     19 #include "llvm/Support/MathExtras.h"
     20 #include "llvm/Support/raw_ostream.h"
     21 #include "llvm/Support/Host.h"
     22 #include <cassert>
     23 #include <cstdlib>
     24 
     25 namespace clang {
     26 
     27 namespace io {
     28 
     29 typedef uint32_t Offset;
     30 
     31 inline void Emit8(raw_ostream& Out, uint32_t V) {
     32   Out << (unsigned char)(V);
     33 }
     34 
     35 inline void Emit16(raw_ostream& Out, uint32_t V) {
     36   Out << (unsigned char)(V);
     37   Out << (unsigned char)(V >>  8);
     38   assert((V >> 16) == 0);
     39 }
     40 
     41 inline void Emit24(raw_ostream& Out, uint32_t V) {
     42   Out << (unsigned char)(V);
     43   Out << (unsigned char)(V >>  8);
     44   Out << (unsigned char)(V >> 16);
     45   assert((V >> 24) == 0);
     46 }
     47 
     48 inline void Emit32(raw_ostream& Out, uint32_t V) {
     49   Out << (unsigned char)(V);
     50   Out << (unsigned char)(V >>  8);
     51   Out << (unsigned char)(V >> 16);
     52   Out << (unsigned char)(V >> 24);
     53 }
     54 
     55 inline void Emit64(raw_ostream& Out, uint64_t V) {
     56   Out << (unsigned char)(V);
     57   Out << (unsigned char)(V >>  8);
     58   Out << (unsigned char)(V >> 16);
     59   Out << (unsigned char)(V >> 24);
     60   Out << (unsigned char)(V >> 32);
     61   Out << (unsigned char)(V >> 40);
     62   Out << (unsigned char)(V >> 48);
     63   Out << (unsigned char)(V >> 56);
     64 }
     65 
     66 inline void Pad(raw_ostream& Out, unsigned A) {
     67   Offset off = (Offset) Out.tell();
     68   for (uint32_t n = llvm::OffsetToAlignment(off, A); n; --n)
     69     Emit8(Out, 0);
     70 }
     71 
     72 inline uint16_t ReadUnalignedLE16(const unsigned char *&Data) {
     73   uint16_t V = ((uint16_t)Data[0]) |
     74                ((uint16_t)Data[1] <<  8);
     75   Data += 2;
     76   return V;
     77 }
     78 
     79 inline uint32_t ReadUnalignedLE32(const unsigned char *&Data) {
     80   uint32_t V = ((uint32_t)Data[0])  |
     81                ((uint32_t)Data[1] << 8)  |
     82                ((uint32_t)Data[2] << 16) |
     83                ((uint32_t)Data[3] << 24);
     84   Data += 4;
     85   return V;
     86 }
     87 
     88 inline uint64_t ReadUnalignedLE64(const unsigned char *&Data) {
     89   uint64_t V = ((uint64_t)Data[0])  |
     90     ((uint64_t)Data[1] << 8)  |
     91     ((uint64_t)Data[2] << 16) |
     92     ((uint64_t)Data[3] << 24) |
     93     ((uint64_t)Data[4] << 32) |
     94     ((uint64_t)Data[5] << 40) |
     95     ((uint64_t)Data[6] << 48) |
     96     ((uint64_t)Data[7] << 56);
     97   Data += 8;
     98   return V;
     99 }
    100 
    101 inline uint32_t ReadLE32(const unsigned char *&Data) {
    102   // Hosts that directly support little-endian 32-bit loads can just
    103   // use them.  Big-endian hosts need a bswap.
    104   uint32_t V = *((const uint32_t*)Data);
    105   if (llvm::sys::isBigEndianHost())
    106     V = llvm::ByteSwap_32(V);
    107   Data += 4;
    108   return V;
    109 }
    110 
    111 } // end namespace io
    112 
    113 template<typename Info>
    114 class OnDiskChainedHashTableGenerator {
    115   unsigned NumBuckets;
    116   unsigned NumEntries;
    117   llvm::BumpPtrAllocator BA;
    118 
    119   class Item {
    120   public:
    121     typename Info::key_type key;
    122     typename Info::data_type data;
    123     Item *next;
    124     const uint32_t hash;
    125 
    126     Item(typename Info::key_type_ref k, typename Info::data_type_ref d,
    127          Info &InfoObj)
    128     : key(k), data(d), next(0), hash(InfoObj.ComputeHash(k)) {}
    129   };
    130 
    131   class Bucket {
    132   public:
    133     io::Offset off;
    134     Item* head;
    135     unsigned length;
    136 
    137     Bucket() {}
    138   };
    139 
    140   Bucket* Buckets;
    141 
    142 private:
    143   void insert(Bucket* b, size_t size, Item* E) {
    144     unsigned idx = E->hash & (size - 1);
    145     Bucket& B = b[idx];
    146     E->next = B.head;
    147     ++B.length;
    148     B.head = E;
    149   }
    150 
    151   void resize(size_t newsize) {
    152     Bucket* newBuckets = (Bucket*) std::calloc(newsize, sizeof(Bucket));
    153     // Populate newBuckets with the old entries.
    154     for (unsigned i = 0; i < NumBuckets; ++i)
    155       for (Item* E = Buckets[i].head; E ; ) {
    156         Item* N = E->next;
    157         E->next = 0;
    158         insert(newBuckets, newsize, E);
    159         E = N;
    160       }
    161 
    162     free(Buckets);
    163     NumBuckets = newsize;
    164     Buckets = newBuckets;
    165   }
    166 
    167 public:
    168 
    169   void insert(typename Info::key_type_ref key,
    170               typename Info::data_type_ref data) {
    171     Info InfoObj;
    172     insert(key, data, InfoObj);
    173   }
    174 
    175   void insert(typename Info::key_type_ref key,
    176               typename Info::data_type_ref data, Info &InfoObj) {
    177 
    178     ++NumEntries;
    179     if (4*NumEntries >= 3*NumBuckets) resize(NumBuckets*2);
    180     insert(Buckets, NumBuckets, new (BA.Allocate<Item>()) Item(key, data,
    181                                                                InfoObj));
    182   }
    183 
    184   io::Offset Emit(raw_ostream &out) {
    185     Info InfoObj;
    186     return Emit(out, InfoObj);
    187   }
    188 
    189   io::Offset Emit(raw_ostream &out, Info &InfoObj) {
    190     using namespace clang::io;
    191 
    192     // Emit the payload of the table.
    193     for (unsigned i = 0; i < NumBuckets; ++i) {
    194       Bucket& B = Buckets[i];
    195       if (!B.head) continue;
    196 
    197       // Store the offset for the data of this bucket.
    198       B.off = out.tell();
    199       assert(B.off && "Cannot write a bucket at offset 0. Please add padding.");
    200 
    201       // Write out the number of items in the bucket.
    202       Emit16(out, B.length);
    203       assert(B.length != 0  && "Bucket has a head but zero length?");
    204 
    205       // Write out the entries in the bucket.
    206       for (Item *I = B.head; I ; I = I->next) {
    207         Emit32(out, I->hash);
    208         const std::pair<unsigned, unsigned>& Len =
    209           InfoObj.EmitKeyDataLength(out, I->key, I->data);
    210         InfoObj.EmitKey(out, I->key, Len.first);
    211         InfoObj.EmitData(out, I->key, I->data, Len.second);
    212       }
    213     }
    214 
    215     // Emit the hashtable itself.
    216     Pad(out, 4);
    217     io::Offset TableOff = out.tell();
    218     Emit32(out, NumBuckets);
    219     Emit32(out, NumEntries);
    220     for (unsigned i = 0; i < NumBuckets; ++i) Emit32(out, Buckets[i].off);
    221 
    222     return TableOff;
    223   }
    224 
    225   OnDiskChainedHashTableGenerator() {
    226     NumEntries = 0;
    227     NumBuckets = 64;
    228     // Note that we do not need to run the constructors of the individual
    229     // Bucket objects since 'calloc' returns bytes that are all 0.
    230     Buckets = (Bucket*) std::calloc(NumBuckets, sizeof(Bucket));
    231   }
    232 
    233   ~OnDiskChainedHashTableGenerator() {
    234     std::free(Buckets);
    235   }
    236 };
    237 
    238 template<typename Info>
    239 class OnDiskChainedHashTable {
    240   const unsigned NumBuckets;
    241   const unsigned NumEntries;
    242   const unsigned char* const Buckets;
    243   const unsigned char* const Base;
    244   Info InfoObj;
    245 
    246 public:
    247   typedef typename Info::internal_key_type internal_key_type;
    248   typedef typename Info::external_key_type external_key_type;
    249   typedef typename Info::data_type         data_type;
    250 
    251   OnDiskChainedHashTable(unsigned numBuckets, unsigned numEntries,
    252                          const unsigned char* buckets,
    253                          const unsigned char* base,
    254                          const Info &InfoObj = Info())
    255     : NumBuckets(numBuckets), NumEntries(numEntries),
    256       Buckets(buckets), Base(base), InfoObj(InfoObj) {
    257         assert((reinterpret_cast<uintptr_t>(buckets) & 0x3) == 0 &&
    258                "'buckets' must have a 4-byte alignment");
    259       }
    260 
    261   unsigned getNumBuckets() const { return NumBuckets; }
    262   unsigned getNumEntries() const { return NumEntries; }
    263   const unsigned char* getBase() const { return Base; }
    264   const unsigned char* getBuckets() const { return Buckets; }
    265 
    266   bool isEmpty() const { return NumEntries == 0; }
    267 
    268   class iterator {
    269     internal_key_type key;
    270     const unsigned char* const data;
    271     const unsigned len;
    272     Info *InfoObj;
    273   public:
    274     iterator() : data(0), len(0) {}
    275     iterator(const internal_key_type k, const unsigned char* d, unsigned l,
    276              Info *InfoObj)
    277       : key(k), data(d), len(l), InfoObj(InfoObj) {}
    278 
    279     data_type operator*() const { return InfoObj->ReadData(key, data, len); }
    280     bool operator==(const iterator& X) const { return X.data == data; }
    281     bool operator!=(const iterator& X) const { return X.data != data; }
    282   };
    283 
    284   iterator find(const external_key_type& eKey, Info *InfoPtr = 0) {
    285     if (!InfoPtr)
    286       InfoPtr = &InfoObj;
    287 
    288     using namespace io;
    289     const internal_key_type& iKey = InfoObj.GetInternalKey(eKey);
    290     unsigned key_hash = InfoObj.ComputeHash(iKey);
    291 
    292     // Each bucket is just a 32-bit offset into the hash table file.
    293     unsigned idx = key_hash & (NumBuckets - 1);
    294     const unsigned char* Bucket = Buckets + sizeof(uint32_t)*idx;
    295 
    296     unsigned offset = ReadLE32(Bucket);
    297     if (offset == 0) return iterator(); // Empty bucket.
    298     const unsigned char* Items = Base + offset;
    299 
    300     // 'Items' starts with a 16-bit unsigned integer representing the
    301     // number of items in this bucket.
    302     unsigned len = ReadUnalignedLE16(Items);
    303 
    304     for (unsigned i = 0; i < len; ++i) {
    305       // Read the hash.
    306       uint32_t item_hash = ReadUnalignedLE32(Items);
    307 
    308       // Determine the length of the key and the data.
    309       const std::pair<unsigned, unsigned>& L = Info::ReadKeyDataLength(Items);
    310       unsigned item_len = L.first + L.second;
    311 
    312       // Compare the hashes.  If they are not the same, skip the entry entirely.
    313       if (item_hash != key_hash) {
    314         Items += item_len;
    315         continue;
    316       }
    317 
    318       // Read the key.
    319       const internal_key_type& X =
    320         InfoPtr->ReadKey((const unsigned char* const) Items, L.first);
    321 
    322       // If the key doesn't match just skip reading the value.
    323       if (!InfoPtr->EqualKey(X, iKey)) {
    324         Items += item_len;
    325         continue;
    326       }
    327 
    328       // The key matches!
    329       return iterator(X, Items + L.first, L.second, InfoPtr);
    330     }
    331 
    332     return iterator();
    333   }
    334 
    335   iterator end() const { return iterator(); }
    336 
    337   /// \brief Iterates over all of the keys in the table.
    338   class key_iterator {
    339     const unsigned char* Ptr;
    340     unsigned NumItemsInBucketLeft;
    341     unsigned NumEntriesLeft;
    342     Info *InfoObj;
    343   public:
    344     typedef external_key_type value_type;
    345 
    346     key_iterator(const unsigned char* const Ptr, unsigned NumEntries,
    347                   Info *InfoObj)
    348       : Ptr(Ptr), NumItemsInBucketLeft(0), NumEntriesLeft(NumEntries),
    349         InfoObj(InfoObj) { }
    350     key_iterator()
    351       : Ptr(0), NumItemsInBucketLeft(0), NumEntriesLeft(0), InfoObj(0) { }
    352 
    353     friend bool operator==(const key_iterator &X, const key_iterator &Y) {
    354       return X.NumEntriesLeft == Y.NumEntriesLeft;
    355     }
    356     friend bool operator!=(const key_iterator& X, const key_iterator &Y) {
    357       return X.NumEntriesLeft != Y.NumEntriesLeft;
    358     }
    359 
    360     key_iterator& operator++() {  // Preincrement
    361       if (!NumItemsInBucketLeft) {
    362         // 'Items' starts with a 16-bit unsigned integer representing the
    363         // number of items in this bucket.
    364         NumItemsInBucketLeft = io::ReadUnalignedLE16(Ptr);
    365       }
    366       Ptr += 4; // Skip the hash.
    367       // Determine the length of the key and the data.
    368       const std::pair<unsigned, unsigned>& L = Info::ReadKeyDataLength(Ptr);
    369       Ptr += L.first + L.second;
    370       assert(NumItemsInBucketLeft);
    371       --NumItemsInBucketLeft;
    372       assert(NumEntriesLeft);
    373       --NumEntriesLeft;
    374       return *this;
    375     }
    376     key_iterator operator++(int) {  // Postincrement
    377       key_iterator tmp = *this; ++*this; return tmp;
    378     }
    379 
    380     value_type operator*() const {
    381       const unsigned char* LocalPtr = Ptr;
    382       if (!NumItemsInBucketLeft)
    383         LocalPtr += 2; // number of items in bucket
    384       LocalPtr += 4; // Skip the hash.
    385 
    386       // Determine the length of the key and the data.
    387       const std::pair<unsigned, unsigned>& L
    388         = Info::ReadKeyDataLength(LocalPtr);
    389 
    390       // Read the key.
    391       const internal_key_type& Key = InfoObj->ReadKey(LocalPtr, L.first);
    392       return InfoObj->GetExternalKey(Key);
    393     }
    394   };
    395 
    396   key_iterator key_begin() {
    397     return key_iterator(Base + 4, getNumEntries(), &InfoObj);
    398   }
    399   key_iterator key_end() { return key_iterator(); }
    400 
    401   /// \brief Iterates over all the entries in the table, returning the data.
    402   class data_iterator {
    403     const unsigned char* Ptr;
    404     unsigned NumItemsInBucketLeft;
    405     unsigned NumEntriesLeft;
    406     Info *InfoObj;
    407   public:
    408     typedef data_type value_type;
    409 
    410     data_iterator(const unsigned char* const Ptr, unsigned NumEntries,
    411                   Info *InfoObj)
    412       : Ptr(Ptr), NumItemsInBucketLeft(0), NumEntriesLeft(NumEntries),
    413         InfoObj(InfoObj) { }
    414     data_iterator()
    415       : Ptr(0), NumItemsInBucketLeft(0), NumEntriesLeft(0), InfoObj(0) { }
    416 
    417     bool operator==(const data_iterator& X) const {
    418       return X.NumEntriesLeft == NumEntriesLeft;
    419     }
    420     bool operator!=(const data_iterator& X) const {
    421       return X.NumEntriesLeft != NumEntriesLeft;
    422     }
    423 
    424     data_iterator& operator++() {  // Preincrement
    425       if (!NumItemsInBucketLeft) {
    426         // 'Items' starts with a 16-bit unsigned integer representing the
    427         // number of items in this bucket.
    428         NumItemsInBucketLeft = io::ReadUnalignedLE16(Ptr);
    429       }
    430       Ptr += 4; // Skip the hash.
    431       // Determine the length of the key and the data.
    432       const std::pair<unsigned, unsigned>& L = Info::ReadKeyDataLength(Ptr);
    433       Ptr += L.first + L.second;
    434       assert(NumItemsInBucketLeft);
    435       --NumItemsInBucketLeft;
    436       assert(NumEntriesLeft);
    437       --NumEntriesLeft;
    438       return *this;
    439     }
    440     data_iterator operator++(int) {  // Postincrement
    441       data_iterator tmp = *this; ++*this; return tmp;
    442     }
    443 
    444     value_type operator*() const {
    445       const unsigned char* LocalPtr = Ptr;
    446       if (!NumItemsInBucketLeft)
    447         LocalPtr += 2; // number of items in bucket
    448       LocalPtr += 4; // Skip the hash.
    449 
    450       // Determine the length of the key and the data.
    451       const std::pair<unsigned, unsigned>& L =Info::ReadKeyDataLength(LocalPtr);
    452 
    453       // Read the key.
    454       const internal_key_type& Key =
    455         InfoObj->ReadKey(LocalPtr, L.first);
    456       return InfoObj->ReadData(Key, LocalPtr + L.first, L.second);
    457     }
    458   };
    459 
    460   data_iterator data_begin() {
    461     return data_iterator(Base + 4, getNumEntries(), &InfoObj);
    462   }
    463   data_iterator data_end() { return data_iterator(); }
    464 
    465   Info &getInfoObj() { return InfoObj; }
    466 
    467   static OnDiskChainedHashTable* Create(const unsigned char* buckets,
    468                                         const unsigned char* const base,
    469                                         const Info &InfoObj = Info()) {
    470     using namespace io;
    471     assert(buckets > base);
    472     assert((reinterpret_cast<uintptr_t>(buckets) & 0x3) == 0 &&
    473            "buckets should be 4-byte aligned.");
    474 
    475     unsigned numBuckets = ReadLE32(buckets);
    476     unsigned numEntries = ReadLE32(buckets);
    477     return new OnDiskChainedHashTable<Info>(numBuckets, numEntries, buckets,
    478                                             base, InfoObj);
    479   }
    480 };
    481 
    482 } // end namespace clang
    483 
    484 #endif
    485