1 //===--- OnDiskHashTable.h - On-Disk Hash Table Implementation --*- C++ -*-===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 /// 10 /// \file 11 /// \brief Defines facilities for reading and writing on-disk hash tables. 12 /// 13 //===----------------------------------------------------------------------===// 14 #ifndef LLVM_CLANG_BASIC_ON_DISK_HASH_TABLE_H 15 #define LLVM_CLANG_BASIC_ON_DISK_HASH_TABLE_H 16 17 #include "clang/Basic/LLVM.h" 18 #include "llvm/Support/Allocator.h" 19 #include "llvm/Support/DataTypes.h" 20 #include "llvm/Support/Host.h" 21 #include "llvm/Support/MathExtras.h" 22 #include "llvm/Support/raw_ostream.h" 23 #include <cassert> 24 #include <cstdlib> 25 26 namespace clang { 27 28 namespace io { 29 30 typedef uint32_t Offset; 31 32 inline void Emit8(raw_ostream& Out, uint32_t V) { 33 Out << (unsigned char)(V); 34 } 35 36 inline void Emit16(raw_ostream& Out, uint32_t V) { 37 Out << (unsigned char)(V); 38 Out << (unsigned char)(V >> 8); 39 assert((V >> 16) == 0); 40 } 41 42 inline void Emit24(raw_ostream& Out, uint32_t V) { 43 Out << (unsigned char)(V); 44 Out << (unsigned char)(V >> 8); 45 Out << (unsigned char)(V >> 16); 46 assert((V >> 24) == 0); 47 } 48 49 inline void Emit32(raw_ostream& Out, uint32_t V) { 50 Out << (unsigned char)(V); 51 Out << (unsigned char)(V >> 8); 52 Out << (unsigned char)(V >> 16); 53 Out << (unsigned char)(V >> 24); 54 } 55 56 inline void Emit64(raw_ostream& Out, uint64_t V) { 57 Out << (unsigned char)(V); 58 Out << (unsigned char)(V >> 8); 59 Out << (unsigned char)(V >> 16); 60 Out << (unsigned char)(V >> 24); 61 Out << (unsigned char)(V >> 32); 62 Out << (unsigned char)(V >> 40); 63 Out << (unsigned char)(V >> 48); 64 Out << (unsigned char)(V >> 56); 65 } 66 67 inline void Pad(raw_ostream& Out, unsigned A) { 68 Offset off = (Offset) Out.tell(); 69 for (uint32_t n = llvm::OffsetToAlignment(off, A); n; --n) 70 Emit8(Out, 0); 71 } 72 73 inline uint16_t ReadUnalignedLE16(const unsigned char *&Data) { 74 uint16_t V = ((uint16_t)Data[0]) | 75 ((uint16_t)Data[1] << 8); 76 Data += 2; 77 return V; 78 } 79 80 inline uint32_t ReadUnalignedLE32(const unsigned char *&Data) { 81 uint32_t V = ((uint32_t)Data[0]) | 82 ((uint32_t)Data[1] << 8) | 83 ((uint32_t)Data[2] << 16) | 84 ((uint32_t)Data[3] << 24); 85 Data += 4; 86 return V; 87 } 88 89 inline uint64_t ReadUnalignedLE64(const unsigned char *&Data) { 90 uint64_t V = ((uint64_t)Data[0]) | 91 ((uint64_t)Data[1] << 8) | 92 ((uint64_t)Data[2] << 16) | 93 ((uint64_t)Data[3] << 24) | 94 ((uint64_t)Data[4] << 32) | 95 ((uint64_t)Data[5] << 40) | 96 ((uint64_t)Data[6] << 48) | 97 ((uint64_t)Data[7] << 56); 98 Data += 8; 99 return V; 100 } 101 102 inline uint32_t ReadLE32(const unsigned char *&Data) { 103 // Hosts that directly support little-endian 32-bit loads can just 104 // use them. Big-endian hosts need a bswap. 105 uint32_t V = *((const uint32_t*)Data); 106 if (llvm::sys::isBigEndianHost()) 107 V = llvm::ByteSwap_32(V); 108 Data += 4; 109 return V; 110 } 111 112 } // end namespace io 113 114 template<typename Info> 115 class OnDiskChainedHashTableGenerator { 116 unsigned NumBuckets; 117 unsigned NumEntries; 118 llvm::BumpPtrAllocator BA; 119 120 class Item { 121 public: 122 typename Info::key_type key; 123 typename Info::data_type data; 124 Item *next; 125 const uint32_t hash; 126 127 Item(typename Info::key_type_ref k, typename Info::data_type_ref d, 128 Info &InfoObj) 129 : key(k), data(d), next(0), hash(InfoObj.ComputeHash(k)) {} 130 }; 131 132 class Bucket { 133 public: 134 io::Offset off; 135 Item* head; 136 unsigned length; 137 138 Bucket() {} 139 }; 140 141 Bucket* Buckets; 142 143 private: 144 void insert(Bucket* b, size_t size, Item* E) { 145 unsigned idx = E->hash & (size - 1); 146 Bucket& B = b[idx]; 147 E->next = B.head; 148 ++B.length; 149 B.head = E; 150 } 151 152 void resize(size_t newsize) { 153 Bucket* newBuckets = (Bucket*) std::calloc(newsize, sizeof(Bucket)); 154 // Populate newBuckets with the old entries. 155 for (unsigned i = 0; i < NumBuckets; ++i) 156 for (Item* E = Buckets[i].head; E ; ) { 157 Item* N = E->next; 158 E->next = 0; 159 insert(newBuckets, newsize, E); 160 E = N; 161 } 162 163 free(Buckets); 164 NumBuckets = newsize; 165 Buckets = newBuckets; 166 } 167 168 public: 169 170 void insert(typename Info::key_type_ref key, 171 typename Info::data_type_ref data) { 172 Info InfoObj; 173 insert(key, data, InfoObj); 174 } 175 176 void insert(typename Info::key_type_ref key, 177 typename Info::data_type_ref data, Info &InfoObj) { 178 179 ++NumEntries; 180 if (4*NumEntries >= 3*NumBuckets) resize(NumBuckets*2); 181 insert(Buckets, NumBuckets, new (BA.Allocate<Item>()) Item(key, data, 182 InfoObj)); 183 } 184 185 io::Offset Emit(raw_ostream &out) { 186 Info InfoObj; 187 return Emit(out, InfoObj); 188 } 189 190 io::Offset Emit(raw_ostream &out, Info &InfoObj) { 191 using namespace clang::io; 192 193 // Emit the payload of the table. 194 for (unsigned i = 0; i < NumBuckets; ++i) { 195 Bucket& B = Buckets[i]; 196 if (!B.head) continue; 197 198 // Store the offset for the data of this bucket. 199 B.off = out.tell(); 200 assert(B.off && "Cannot write a bucket at offset 0. Please add padding."); 201 202 // Write out the number of items in the bucket. 203 Emit16(out, B.length); 204 assert(B.length != 0 && "Bucket has a head but zero length?"); 205 206 // Write out the entries in the bucket. 207 for (Item *I = B.head; I ; I = I->next) { 208 Emit32(out, I->hash); 209 const std::pair<unsigned, unsigned>& Len = 210 InfoObj.EmitKeyDataLength(out, I->key, I->data); 211 InfoObj.EmitKey(out, I->key, Len.first); 212 InfoObj.EmitData(out, I->key, I->data, Len.second); 213 } 214 } 215 216 // Emit the hashtable itself. 217 Pad(out, 4); 218 io::Offset TableOff = out.tell(); 219 Emit32(out, NumBuckets); 220 Emit32(out, NumEntries); 221 for (unsigned i = 0; i < NumBuckets; ++i) Emit32(out, Buckets[i].off); 222 223 return TableOff; 224 } 225 226 OnDiskChainedHashTableGenerator() { 227 NumEntries = 0; 228 NumBuckets = 64; 229 // Note that we do not need to run the constructors of the individual 230 // Bucket objects since 'calloc' returns bytes that are all 0. 231 Buckets = (Bucket*) std::calloc(NumBuckets, sizeof(Bucket)); 232 } 233 234 ~OnDiskChainedHashTableGenerator() { 235 std::free(Buckets); 236 } 237 }; 238 239 template<typename Info> 240 class OnDiskChainedHashTable { 241 const unsigned NumBuckets; 242 const unsigned NumEntries; 243 const unsigned char* const Buckets; 244 const unsigned char* const Base; 245 Info InfoObj; 246 247 public: 248 typedef typename Info::internal_key_type internal_key_type; 249 typedef typename Info::external_key_type external_key_type; 250 typedef typename Info::data_type data_type; 251 252 OnDiskChainedHashTable(unsigned numBuckets, unsigned numEntries, 253 const unsigned char* buckets, 254 const unsigned char* base, 255 const Info &InfoObj = Info()) 256 : NumBuckets(numBuckets), NumEntries(numEntries), 257 Buckets(buckets), Base(base), InfoObj(InfoObj) { 258 assert((reinterpret_cast<uintptr_t>(buckets) & 0x3) == 0 && 259 "'buckets' must have a 4-byte alignment"); 260 } 261 262 unsigned getNumBuckets() const { return NumBuckets; } 263 unsigned getNumEntries() const { return NumEntries; } 264 const unsigned char* getBase() const { return Base; } 265 const unsigned char* getBuckets() const { return Buckets; } 266 267 bool isEmpty() const { return NumEntries == 0; } 268 269 class iterator { 270 internal_key_type key; 271 const unsigned char* const data; 272 const unsigned len; 273 Info *InfoObj; 274 public: 275 iterator() : data(0), len(0) {} 276 iterator(const internal_key_type k, const unsigned char* d, unsigned l, 277 Info *InfoObj) 278 : key(k), data(d), len(l), InfoObj(InfoObj) {} 279 280 data_type operator*() const { return InfoObj->ReadData(key, data, len); } 281 bool operator==(const iterator& X) const { return X.data == data; } 282 bool operator!=(const iterator& X) const { return X.data != data; } 283 }; 284 285 iterator find(const external_key_type& eKey, Info *InfoPtr = 0) { 286 if (!InfoPtr) 287 InfoPtr = &InfoObj; 288 289 using namespace io; 290 const internal_key_type& iKey = InfoObj.GetInternalKey(eKey); 291 unsigned key_hash = InfoObj.ComputeHash(iKey); 292 293 // Each bucket is just a 32-bit offset into the hash table file. 294 unsigned idx = key_hash & (NumBuckets - 1); 295 const unsigned char* Bucket = Buckets + sizeof(uint32_t)*idx; 296 297 unsigned offset = ReadLE32(Bucket); 298 if (offset == 0) return iterator(); // Empty bucket. 299 const unsigned char* Items = Base + offset; 300 301 // 'Items' starts with a 16-bit unsigned integer representing the 302 // number of items in this bucket. 303 unsigned len = ReadUnalignedLE16(Items); 304 305 for (unsigned i = 0; i < len; ++i) { 306 // Read the hash. 307 uint32_t item_hash = ReadUnalignedLE32(Items); 308 309 // Determine the length of the key and the data. 310 const std::pair<unsigned, unsigned>& L = Info::ReadKeyDataLength(Items); 311 unsigned item_len = L.first + L.second; 312 313 // Compare the hashes. If they are not the same, skip the entry entirely. 314 if (item_hash != key_hash) { 315 Items += item_len; 316 continue; 317 } 318 319 // Read the key. 320 const internal_key_type& X = 321 InfoPtr->ReadKey((const unsigned char* const) Items, L.first); 322 323 // If the key doesn't match just skip reading the value. 324 if (!InfoPtr->EqualKey(X, iKey)) { 325 Items += item_len; 326 continue; 327 } 328 329 // The key matches! 330 return iterator(X, Items + L.first, L.second, InfoPtr); 331 } 332 333 return iterator(); 334 } 335 336 iterator end() const { return iterator(); } 337 338 /// \brief Iterates over all of the keys in the table. 339 class key_iterator { 340 const unsigned char* Ptr; 341 unsigned NumItemsInBucketLeft; 342 unsigned NumEntriesLeft; 343 Info *InfoObj; 344 public: 345 typedef external_key_type value_type; 346 347 key_iterator(const unsigned char* const Ptr, unsigned NumEntries, 348 Info *InfoObj) 349 : Ptr(Ptr), NumItemsInBucketLeft(0), NumEntriesLeft(NumEntries), 350 InfoObj(InfoObj) { } 351 key_iterator() 352 : Ptr(0), NumItemsInBucketLeft(0), NumEntriesLeft(0), InfoObj(0) { } 353 354 friend bool operator==(const key_iterator &X, const key_iterator &Y) { 355 return X.NumEntriesLeft == Y.NumEntriesLeft; 356 } 357 friend bool operator!=(const key_iterator& X, const key_iterator &Y) { 358 return X.NumEntriesLeft != Y.NumEntriesLeft; 359 } 360 361 key_iterator& operator++() { // Preincrement 362 if (!NumItemsInBucketLeft) { 363 // 'Items' starts with a 16-bit unsigned integer representing the 364 // number of items in this bucket. 365 NumItemsInBucketLeft = io::ReadUnalignedLE16(Ptr); 366 } 367 Ptr += 4; // Skip the hash. 368 // Determine the length of the key and the data. 369 const std::pair<unsigned, unsigned>& L = Info::ReadKeyDataLength(Ptr); 370 Ptr += L.first + L.second; 371 assert(NumItemsInBucketLeft); 372 --NumItemsInBucketLeft; 373 assert(NumEntriesLeft); 374 --NumEntriesLeft; 375 return *this; 376 } 377 key_iterator operator++(int) { // Postincrement 378 key_iterator tmp = *this; ++*this; return tmp; 379 } 380 381 value_type operator*() const { 382 const unsigned char* LocalPtr = Ptr; 383 if (!NumItemsInBucketLeft) 384 LocalPtr += 2; // number of items in bucket 385 LocalPtr += 4; // Skip the hash. 386 387 // Determine the length of the key and the data. 388 const std::pair<unsigned, unsigned>& L 389 = Info::ReadKeyDataLength(LocalPtr); 390 391 // Read the key. 392 const internal_key_type& Key = InfoObj->ReadKey(LocalPtr, L.first); 393 return InfoObj->GetExternalKey(Key); 394 } 395 }; 396 397 key_iterator key_begin() { 398 return key_iterator(Base + 4, getNumEntries(), &InfoObj); 399 } 400 key_iterator key_end() { return key_iterator(); } 401 402 /// \brief Iterates over all the entries in the table, returning the data. 403 class data_iterator { 404 const unsigned char* Ptr; 405 unsigned NumItemsInBucketLeft; 406 unsigned NumEntriesLeft; 407 Info *InfoObj; 408 public: 409 typedef data_type value_type; 410 411 data_iterator(const unsigned char* const Ptr, unsigned NumEntries, 412 Info *InfoObj) 413 : Ptr(Ptr), NumItemsInBucketLeft(0), NumEntriesLeft(NumEntries), 414 InfoObj(InfoObj) { } 415 data_iterator() 416 : Ptr(0), NumItemsInBucketLeft(0), NumEntriesLeft(0), InfoObj(0) { } 417 418 bool operator==(const data_iterator& X) const { 419 return X.NumEntriesLeft == NumEntriesLeft; 420 } 421 bool operator!=(const data_iterator& X) const { 422 return X.NumEntriesLeft != NumEntriesLeft; 423 } 424 425 data_iterator& operator++() { // Preincrement 426 if (!NumItemsInBucketLeft) { 427 // 'Items' starts with a 16-bit unsigned integer representing the 428 // number of items in this bucket. 429 NumItemsInBucketLeft = io::ReadUnalignedLE16(Ptr); 430 } 431 Ptr += 4; // Skip the hash. 432 // Determine the length of the key and the data. 433 const std::pair<unsigned, unsigned>& L = Info::ReadKeyDataLength(Ptr); 434 Ptr += L.first + L.second; 435 assert(NumItemsInBucketLeft); 436 --NumItemsInBucketLeft; 437 assert(NumEntriesLeft); 438 --NumEntriesLeft; 439 return *this; 440 } 441 data_iterator operator++(int) { // Postincrement 442 data_iterator tmp = *this; ++*this; return tmp; 443 } 444 445 value_type operator*() const { 446 const unsigned char* LocalPtr = Ptr; 447 if (!NumItemsInBucketLeft) 448 LocalPtr += 2; // number of items in bucket 449 LocalPtr += 4; // Skip the hash. 450 451 // Determine the length of the key and the data. 452 const std::pair<unsigned, unsigned>& L =Info::ReadKeyDataLength(LocalPtr); 453 454 // Read the key. 455 const internal_key_type& Key = 456 InfoObj->ReadKey(LocalPtr, L.first); 457 return InfoObj->ReadData(Key, LocalPtr + L.first, L.second); 458 } 459 }; 460 461 data_iterator data_begin() { 462 return data_iterator(Base + 4, getNumEntries(), &InfoObj); 463 } 464 data_iterator data_end() { return data_iterator(); } 465 466 Info &getInfoObj() { return InfoObj; } 467 468 static OnDiskChainedHashTable* Create(const unsigned char* buckets, 469 const unsigned char* const base, 470 const Info &InfoObj = Info()) { 471 using namespace io; 472 assert(buckets > base); 473 assert((reinterpret_cast<uintptr_t>(buckets) & 0x3) == 0 && 474 "buckets should be 4-byte aligned."); 475 476 unsigned numBuckets = ReadLE32(buckets); 477 unsigned numEntries = ReadLE32(buckets); 478 return new OnDiskChainedHashTable<Info>(numBuckets, numEntries, buckets, 479 base, InfoObj); 480 } 481 }; 482 483 } // end namespace clang 484 485 #endif 486