Home | History | Annotate | Download | only in Basic
      1 //===--- OnDiskHashTable.h - On-Disk Hash Table Implementation --*- C++ -*-===//
      2 //
      3 //                     The LLVM Compiler Infrastructure
      4 //
      5 // This file is distributed under the University of Illinois Open Source
      6 // License. See LICENSE.TXT for details.
      7 //
      8 //===----------------------------------------------------------------------===//
      9 ///
     10 /// \file
     11 /// \brief Defines facilities for reading and writing on-disk hash tables.
     12 ///
     13 //===----------------------------------------------------------------------===//
     14 #ifndef LLVM_CLANG_BASIC_ON_DISK_HASH_TABLE_H
     15 #define LLVM_CLANG_BASIC_ON_DISK_HASH_TABLE_H
     16 
     17 #include "clang/Basic/LLVM.h"
     18 #include "llvm/Support/Allocator.h"
     19 #include "llvm/Support/DataTypes.h"
     20 #include "llvm/Support/Host.h"
     21 #include "llvm/Support/MathExtras.h"
     22 #include "llvm/Support/raw_ostream.h"
     23 #include <cassert>
     24 #include <cstdlib>
     25 
     26 namespace clang {
     27 
     28 namespace io {
     29 
     30 typedef uint32_t Offset;
     31 
     32 inline void Emit8(raw_ostream& Out, uint32_t V) {
     33   Out << (unsigned char)(V);
     34 }
     35 
     36 inline void Emit16(raw_ostream& Out, uint32_t V) {
     37   Out << (unsigned char)(V);
     38   Out << (unsigned char)(V >>  8);
     39   assert((V >> 16) == 0);
     40 }
     41 
     42 inline void Emit24(raw_ostream& Out, uint32_t V) {
     43   Out << (unsigned char)(V);
     44   Out << (unsigned char)(V >>  8);
     45   Out << (unsigned char)(V >> 16);
     46   assert((V >> 24) == 0);
     47 }
     48 
     49 inline void Emit32(raw_ostream& Out, uint32_t V) {
     50   Out << (unsigned char)(V);
     51   Out << (unsigned char)(V >>  8);
     52   Out << (unsigned char)(V >> 16);
     53   Out << (unsigned char)(V >> 24);
     54 }
     55 
     56 inline void Emit64(raw_ostream& Out, uint64_t V) {
     57   Out << (unsigned char)(V);
     58   Out << (unsigned char)(V >>  8);
     59   Out << (unsigned char)(V >> 16);
     60   Out << (unsigned char)(V >> 24);
     61   Out << (unsigned char)(V >> 32);
     62   Out << (unsigned char)(V >> 40);
     63   Out << (unsigned char)(V >> 48);
     64   Out << (unsigned char)(V >> 56);
     65 }
     66 
     67 inline void Pad(raw_ostream& Out, unsigned A) {
     68   Offset off = (Offset) Out.tell();
     69   for (uint32_t n = llvm::OffsetToAlignment(off, A); n; --n)
     70     Emit8(Out, 0);
     71 }
     72 
     73 inline uint16_t ReadUnalignedLE16(const unsigned char *&Data) {
     74   uint16_t V = ((uint16_t)Data[0]) |
     75                ((uint16_t)Data[1] <<  8);
     76   Data += 2;
     77   return V;
     78 }
     79 
     80 inline uint32_t ReadUnalignedLE32(const unsigned char *&Data) {
     81   uint32_t V = ((uint32_t)Data[0])  |
     82                ((uint32_t)Data[1] << 8)  |
     83                ((uint32_t)Data[2] << 16) |
     84                ((uint32_t)Data[3] << 24);
     85   Data += 4;
     86   return V;
     87 }
     88 
     89 inline uint64_t ReadUnalignedLE64(const unsigned char *&Data) {
     90   uint64_t V = ((uint64_t)Data[0])  |
     91     ((uint64_t)Data[1] << 8)  |
     92     ((uint64_t)Data[2] << 16) |
     93     ((uint64_t)Data[3] << 24) |
     94     ((uint64_t)Data[4] << 32) |
     95     ((uint64_t)Data[5] << 40) |
     96     ((uint64_t)Data[6] << 48) |
     97     ((uint64_t)Data[7] << 56);
     98   Data += 8;
     99   return V;
    100 }
    101 
    102 inline uint32_t ReadLE32(const unsigned char *&Data) {
    103   // Hosts that directly support little-endian 32-bit loads can just
    104   // use them.  Big-endian hosts need a bswap.
    105   uint32_t V = *((const uint32_t*)Data);
    106   if (llvm::sys::isBigEndianHost())
    107     V = llvm::ByteSwap_32(V);
    108   Data += 4;
    109   return V;
    110 }
    111 
    112 } // end namespace io
    113 
    114 template<typename Info>
    115 class OnDiskChainedHashTableGenerator {
    116   unsigned NumBuckets;
    117   unsigned NumEntries;
    118   llvm::BumpPtrAllocator BA;
    119 
    120   class Item {
    121   public:
    122     typename Info::key_type key;
    123     typename Info::data_type data;
    124     Item *next;
    125     const uint32_t hash;
    126 
    127     Item(typename Info::key_type_ref k, typename Info::data_type_ref d,
    128          Info &InfoObj)
    129     : key(k), data(d), next(0), hash(InfoObj.ComputeHash(k)) {}
    130   };
    131 
    132   class Bucket {
    133   public:
    134     io::Offset off;
    135     Item* head;
    136     unsigned length;
    137 
    138     Bucket() {}
    139   };
    140 
    141   Bucket* Buckets;
    142 
    143 private:
    144   void insert(Bucket* b, size_t size, Item* E) {
    145     unsigned idx = E->hash & (size - 1);
    146     Bucket& B = b[idx];
    147     E->next = B.head;
    148     ++B.length;
    149     B.head = E;
    150   }
    151 
    152   void resize(size_t newsize) {
    153     Bucket* newBuckets = (Bucket*) std::calloc(newsize, sizeof(Bucket));
    154     // Populate newBuckets with the old entries.
    155     for (unsigned i = 0; i < NumBuckets; ++i)
    156       for (Item* E = Buckets[i].head; E ; ) {
    157         Item* N = E->next;
    158         E->next = 0;
    159         insert(newBuckets, newsize, E);
    160         E = N;
    161       }
    162 
    163     free(Buckets);
    164     NumBuckets = newsize;
    165     Buckets = newBuckets;
    166   }
    167 
    168 public:
    169 
    170   void insert(typename Info::key_type_ref key,
    171               typename Info::data_type_ref data) {
    172     Info InfoObj;
    173     insert(key, data, InfoObj);
    174   }
    175 
    176   void insert(typename Info::key_type_ref key,
    177               typename Info::data_type_ref data, Info &InfoObj) {
    178 
    179     ++NumEntries;
    180     if (4*NumEntries >= 3*NumBuckets) resize(NumBuckets*2);
    181     insert(Buckets, NumBuckets, new (BA.Allocate<Item>()) Item(key, data,
    182                                                                InfoObj));
    183   }
    184 
    185   io::Offset Emit(raw_ostream &out) {
    186     Info InfoObj;
    187     return Emit(out, InfoObj);
    188   }
    189 
    190   io::Offset Emit(raw_ostream &out, Info &InfoObj) {
    191     using namespace clang::io;
    192 
    193     // Emit the payload of the table.
    194     for (unsigned i = 0; i < NumBuckets; ++i) {
    195       Bucket& B = Buckets[i];
    196       if (!B.head) continue;
    197 
    198       // Store the offset for the data of this bucket.
    199       B.off = out.tell();
    200       assert(B.off && "Cannot write a bucket at offset 0. Please add padding.");
    201 
    202       // Write out the number of items in the bucket.
    203       Emit16(out, B.length);
    204       assert(B.length != 0  && "Bucket has a head but zero length?");
    205 
    206       // Write out the entries in the bucket.
    207       for (Item *I = B.head; I ; I = I->next) {
    208         Emit32(out, I->hash);
    209         const std::pair<unsigned, unsigned>& Len =
    210           InfoObj.EmitKeyDataLength(out, I->key, I->data);
    211         InfoObj.EmitKey(out, I->key, Len.first);
    212         InfoObj.EmitData(out, I->key, I->data, Len.second);
    213       }
    214     }
    215 
    216     // Emit the hashtable itself.
    217     Pad(out, 4);
    218     io::Offset TableOff = out.tell();
    219     Emit32(out, NumBuckets);
    220     Emit32(out, NumEntries);
    221     for (unsigned i = 0; i < NumBuckets; ++i) Emit32(out, Buckets[i].off);
    222 
    223     return TableOff;
    224   }
    225 
    226   OnDiskChainedHashTableGenerator() {
    227     NumEntries = 0;
    228     NumBuckets = 64;
    229     // Note that we do not need to run the constructors of the individual
    230     // Bucket objects since 'calloc' returns bytes that are all 0.
    231     Buckets = (Bucket*) std::calloc(NumBuckets, sizeof(Bucket));
    232   }
    233 
    234   ~OnDiskChainedHashTableGenerator() {
    235     std::free(Buckets);
    236   }
    237 };
    238 
    239 template<typename Info>
    240 class OnDiskChainedHashTable {
    241   const unsigned NumBuckets;
    242   const unsigned NumEntries;
    243   const unsigned char* const Buckets;
    244   const unsigned char* const Base;
    245   Info InfoObj;
    246 
    247 public:
    248   typedef typename Info::internal_key_type internal_key_type;
    249   typedef typename Info::external_key_type external_key_type;
    250   typedef typename Info::data_type         data_type;
    251 
    252   OnDiskChainedHashTable(unsigned numBuckets, unsigned numEntries,
    253                          const unsigned char* buckets,
    254                          const unsigned char* base,
    255                          const Info &InfoObj = Info())
    256     : NumBuckets(numBuckets), NumEntries(numEntries),
    257       Buckets(buckets), Base(base), InfoObj(InfoObj) {
    258         assert((reinterpret_cast<uintptr_t>(buckets) & 0x3) == 0 &&
    259                "'buckets' must have a 4-byte alignment");
    260       }
    261 
    262   unsigned getNumBuckets() const { return NumBuckets; }
    263   unsigned getNumEntries() const { return NumEntries; }
    264   const unsigned char* getBase() const { return Base; }
    265   const unsigned char* getBuckets() const { return Buckets; }
    266 
    267   bool isEmpty() const { return NumEntries == 0; }
    268 
    269   class iterator {
    270     internal_key_type key;
    271     const unsigned char* const data;
    272     const unsigned len;
    273     Info *InfoObj;
    274   public:
    275     iterator() : data(0), len(0) {}
    276     iterator(const internal_key_type k, const unsigned char* d, unsigned l,
    277              Info *InfoObj)
    278       : key(k), data(d), len(l), InfoObj(InfoObj) {}
    279 
    280     data_type operator*() const { return InfoObj->ReadData(key, data, len); }
    281     bool operator==(const iterator& X) const { return X.data == data; }
    282     bool operator!=(const iterator& X) const { return X.data != data; }
    283   };
    284 
    285   iterator find(const external_key_type& eKey, Info *InfoPtr = 0) {
    286     if (!InfoPtr)
    287       InfoPtr = &InfoObj;
    288 
    289     using namespace io;
    290     const internal_key_type& iKey = InfoObj.GetInternalKey(eKey);
    291     unsigned key_hash = InfoObj.ComputeHash(iKey);
    292 
    293     // Each bucket is just a 32-bit offset into the hash table file.
    294     unsigned idx = key_hash & (NumBuckets - 1);
    295     const unsigned char* Bucket = Buckets + sizeof(uint32_t)*idx;
    296 
    297     unsigned offset = ReadLE32(Bucket);
    298     if (offset == 0) return iterator(); // Empty bucket.
    299     const unsigned char* Items = Base + offset;
    300 
    301     // 'Items' starts with a 16-bit unsigned integer representing the
    302     // number of items in this bucket.
    303     unsigned len = ReadUnalignedLE16(Items);
    304 
    305     for (unsigned i = 0; i < len; ++i) {
    306       // Read the hash.
    307       uint32_t item_hash = ReadUnalignedLE32(Items);
    308 
    309       // Determine the length of the key and the data.
    310       const std::pair<unsigned, unsigned>& L = Info::ReadKeyDataLength(Items);
    311       unsigned item_len = L.first + L.second;
    312 
    313       // Compare the hashes.  If they are not the same, skip the entry entirely.
    314       if (item_hash != key_hash) {
    315         Items += item_len;
    316         continue;
    317       }
    318 
    319       // Read the key.
    320       const internal_key_type& X =
    321         InfoPtr->ReadKey((const unsigned char* const) Items, L.first);
    322 
    323       // If the key doesn't match just skip reading the value.
    324       if (!InfoPtr->EqualKey(X, iKey)) {
    325         Items += item_len;
    326         continue;
    327       }
    328 
    329       // The key matches!
    330       return iterator(X, Items + L.first, L.second, InfoPtr);
    331     }
    332 
    333     return iterator();
    334   }
    335 
    336   iterator end() const { return iterator(); }
    337 
    338   /// \brief Iterates over all of the keys in the table.
    339   class key_iterator {
    340     const unsigned char* Ptr;
    341     unsigned NumItemsInBucketLeft;
    342     unsigned NumEntriesLeft;
    343     Info *InfoObj;
    344   public:
    345     typedef external_key_type value_type;
    346 
    347     key_iterator(const unsigned char* const Ptr, unsigned NumEntries,
    348                   Info *InfoObj)
    349       : Ptr(Ptr), NumItemsInBucketLeft(0), NumEntriesLeft(NumEntries),
    350         InfoObj(InfoObj) { }
    351     key_iterator()
    352       : Ptr(0), NumItemsInBucketLeft(0), NumEntriesLeft(0), InfoObj(0) { }
    353 
    354     friend bool operator==(const key_iterator &X, const key_iterator &Y) {
    355       return X.NumEntriesLeft == Y.NumEntriesLeft;
    356     }
    357     friend bool operator!=(const key_iterator& X, const key_iterator &Y) {
    358       return X.NumEntriesLeft != Y.NumEntriesLeft;
    359     }
    360 
    361     key_iterator& operator++() {  // Preincrement
    362       if (!NumItemsInBucketLeft) {
    363         // 'Items' starts with a 16-bit unsigned integer representing the
    364         // number of items in this bucket.
    365         NumItemsInBucketLeft = io::ReadUnalignedLE16(Ptr);
    366       }
    367       Ptr += 4; // Skip the hash.
    368       // Determine the length of the key and the data.
    369       const std::pair<unsigned, unsigned>& L = Info::ReadKeyDataLength(Ptr);
    370       Ptr += L.first + L.second;
    371       assert(NumItemsInBucketLeft);
    372       --NumItemsInBucketLeft;
    373       assert(NumEntriesLeft);
    374       --NumEntriesLeft;
    375       return *this;
    376     }
    377     key_iterator operator++(int) {  // Postincrement
    378       key_iterator tmp = *this; ++*this; return tmp;
    379     }
    380 
    381     value_type operator*() const {
    382       const unsigned char* LocalPtr = Ptr;
    383       if (!NumItemsInBucketLeft)
    384         LocalPtr += 2; // number of items in bucket
    385       LocalPtr += 4; // Skip the hash.
    386 
    387       // Determine the length of the key and the data.
    388       const std::pair<unsigned, unsigned>& L
    389         = Info::ReadKeyDataLength(LocalPtr);
    390 
    391       // Read the key.
    392       const internal_key_type& Key = InfoObj->ReadKey(LocalPtr, L.first);
    393       return InfoObj->GetExternalKey(Key);
    394     }
    395   };
    396 
    397   key_iterator key_begin() {
    398     return key_iterator(Base + 4, getNumEntries(), &InfoObj);
    399   }
    400   key_iterator key_end() { return key_iterator(); }
    401 
    402   /// \brief Iterates over all the entries in the table, returning the data.
    403   class data_iterator {
    404     const unsigned char* Ptr;
    405     unsigned NumItemsInBucketLeft;
    406     unsigned NumEntriesLeft;
    407     Info *InfoObj;
    408   public:
    409     typedef data_type value_type;
    410 
    411     data_iterator(const unsigned char* const Ptr, unsigned NumEntries,
    412                   Info *InfoObj)
    413       : Ptr(Ptr), NumItemsInBucketLeft(0), NumEntriesLeft(NumEntries),
    414         InfoObj(InfoObj) { }
    415     data_iterator()
    416       : Ptr(0), NumItemsInBucketLeft(0), NumEntriesLeft(0), InfoObj(0) { }
    417 
    418     bool operator==(const data_iterator& X) const {
    419       return X.NumEntriesLeft == NumEntriesLeft;
    420     }
    421     bool operator!=(const data_iterator& X) const {
    422       return X.NumEntriesLeft != NumEntriesLeft;
    423     }
    424 
    425     data_iterator& operator++() {  // Preincrement
    426       if (!NumItemsInBucketLeft) {
    427         // 'Items' starts with a 16-bit unsigned integer representing the
    428         // number of items in this bucket.
    429         NumItemsInBucketLeft = io::ReadUnalignedLE16(Ptr);
    430       }
    431       Ptr += 4; // Skip the hash.
    432       // Determine the length of the key and the data.
    433       const std::pair<unsigned, unsigned>& L = Info::ReadKeyDataLength(Ptr);
    434       Ptr += L.first + L.second;
    435       assert(NumItemsInBucketLeft);
    436       --NumItemsInBucketLeft;
    437       assert(NumEntriesLeft);
    438       --NumEntriesLeft;
    439       return *this;
    440     }
    441     data_iterator operator++(int) {  // Postincrement
    442       data_iterator tmp = *this; ++*this; return tmp;
    443     }
    444 
    445     value_type operator*() const {
    446       const unsigned char* LocalPtr = Ptr;
    447       if (!NumItemsInBucketLeft)
    448         LocalPtr += 2; // number of items in bucket
    449       LocalPtr += 4; // Skip the hash.
    450 
    451       // Determine the length of the key and the data.
    452       const std::pair<unsigned, unsigned>& L =Info::ReadKeyDataLength(LocalPtr);
    453 
    454       // Read the key.
    455       const internal_key_type& Key =
    456         InfoObj->ReadKey(LocalPtr, L.first);
    457       return InfoObj->ReadData(Key, LocalPtr + L.first, L.second);
    458     }
    459   };
    460 
    461   data_iterator data_begin() {
    462     return data_iterator(Base + 4, getNumEntries(), &InfoObj);
    463   }
    464   data_iterator data_end() { return data_iterator(); }
    465 
    466   Info &getInfoObj() { return InfoObj; }
    467 
    468   static OnDiskChainedHashTable* Create(const unsigned char* buckets,
    469                                         const unsigned char* const base,
    470                                         const Info &InfoObj = Info()) {
    471     using namespace io;
    472     assert(buckets > base);
    473     assert((reinterpret_cast<uintptr_t>(buckets) & 0x3) == 0 &&
    474            "buckets should be 4-byte aligned.");
    475 
    476     unsigned numBuckets = ReadLE32(buckets);
    477     unsigned numEntries = ReadLE32(buckets);
    478     return new OnDiskChainedHashTable<Info>(numBuckets, numEntries, buckets,
    479                                             base, InfoObj);
    480   }
    481 };
    482 
    483 } // end namespace clang
    484 
    485 #endif
    486