1 //===--- OnDiskHashTable.h - On-Disk Hash Table Implementation --*- C++ -*-===// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //===----------------------------------------------------------------------===// 9 // 10 // This file defines facilities for reading and writing on-disk hash 11 // tables. 12 // 13 //===----------------------------------------------------------------------===// 14 #ifndef LLVM_CLANG_BASIC_ON_DISK_HASH_TABLE_H 15 #define LLVM_CLANG_BASIC_ON_DISK_HASH_TABLE_H 16 17 #include "llvm/Support/Allocator.h" 18 #include "llvm/Support/DataTypes.h" 19 #include "llvm/Support/MathExtras.h" 20 #include "llvm/Support/raw_ostream.h" 21 #include "llvm/Support/Host.h" 22 #include <cassert> 23 #include <cstdlib> 24 25 namespace clang { 26 27 namespace io { 28 29 typedef uint32_t Offset; 30 31 inline void Emit8(raw_ostream& Out, uint32_t V) { 32 Out << (unsigned char)(V); 33 } 34 35 inline void Emit16(raw_ostream& Out, uint32_t V) { 36 Out << (unsigned char)(V); 37 Out << (unsigned char)(V >> 8); 38 assert((V >> 16) == 0); 39 } 40 41 inline void Emit24(raw_ostream& Out, uint32_t V) { 42 Out << (unsigned char)(V); 43 Out << (unsigned char)(V >> 8); 44 Out << (unsigned char)(V >> 16); 45 assert((V >> 24) == 0); 46 } 47 48 inline void Emit32(raw_ostream& Out, uint32_t V) { 49 Out << (unsigned char)(V); 50 Out << (unsigned char)(V >> 8); 51 Out << (unsigned char)(V >> 16); 52 Out << (unsigned char)(V >> 24); 53 } 54 55 inline void Emit64(raw_ostream& Out, uint64_t V) { 56 Out << (unsigned char)(V); 57 Out << (unsigned char)(V >> 8); 58 Out << (unsigned char)(V >> 16); 59 Out << (unsigned char)(V >> 24); 60 Out << (unsigned char)(V >> 32); 61 Out << (unsigned char)(V >> 40); 62 Out << (unsigned char)(V >> 48); 63 Out << (unsigned char)(V >> 56); 64 } 65 66 inline void Pad(raw_ostream& Out, unsigned A) { 67 Offset off = (Offset) Out.tell(); 68 uint32_t n = ((uintptr_t)(off+A-1) & ~(uintptr_t)(A-1)) - off; 69 for (; n ; --n) 70 Emit8(Out, 0); 71 } 72 73 inline uint16_t ReadUnalignedLE16(const unsigned char *&Data) { 74 uint16_t V = ((uint16_t)Data[0]) | 75 ((uint16_t)Data[1] << 8); 76 Data += 2; 77 return V; 78 } 79 80 inline uint32_t ReadUnalignedLE32(const unsigned char *&Data) { 81 uint32_t V = ((uint32_t)Data[0]) | 82 ((uint32_t)Data[1] << 8) | 83 ((uint32_t)Data[2] << 16) | 84 ((uint32_t)Data[3] << 24); 85 Data += 4; 86 return V; 87 } 88 89 inline uint64_t ReadUnalignedLE64(const unsigned char *&Data) { 90 uint64_t V = ((uint64_t)Data[0]) | 91 ((uint64_t)Data[1] << 8) | 92 ((uint64_t)Data[2] << 16) | 93 ((uint64_t)Data[3] << 24) | 94 ((uint64_t)Data[4] << 32) | 95 ((uint64_t)Data[5] << 40) | 96 ((uint64_t)Data[6] << 48) | 97 ((uint64_t)Data[7] << 56); 98 Data += 8; 99 return V; 100 } 101 102 inline uint32_t ReadLE32(const unsigned char *&Data) { 103 // Hosts that directly support little-endian 32-bit loads can just 104 // use them. Big-endian hosts need a bswap. 105 uint32_t V = *((uint32_t*)Data); 106 if (llvm::sys::isBigEndianHost()) 107 V = llvm::ByteSwap_32(V); 108 Data += 4; 109 return V; 110 } 111 112 } // end namespace io 113 114 template<typename Info> 115 class OnDiskChainedHashTableGenerator { 116 unsigned NumBuckets; 117 unsigned NumEntries; 118 llvm::BumpPtrAllocator BA; 119 120 class Item { 121 public: 122 typename Info::key_type key; 123 typename Info::data_type data; 124 Item *next; 125 const uint32_t hash; 126 127 Item(typename Info::key_type_ref k, typename Info::data_type_ref d, 128 Info &InfoObj) 129 : key(k), data(d), next(0), hash(InfoObj.ComputeHash(k)) {} 130 }; 131 132 class Bucket { 133 public: 134 io::Offset off; 135 Item* head; 136 unsigned length; 137 138 Bucket() {} 139 }; 140 141 Bucket* Buckets; 142 143 private: 144 void insert(Bucket* b, size_t size, Item* E) { 145 unsigned idx = E->hash & (size - 1); 146 Bucket& B = b[idx]; 147 E->next = B.head; 148 ++B.length; 149 B.head = E; 150 } 151 152 void resize(size_t newsize) { 153 Bucket* newBuckets = (Bucket*) std::calloc(newsize, sizeof(Bucket)); 154 // Populate newBuckets with the old entries. 155 for (unsigned i = 0; i < NumBuckets; ++i) 156 for (Item* E = Buckets[i].head; E ; ) { 157 Item* N = E->next; 158 E->next = 0; 159 insert(newBuckets, newsize, E); 160 E = N; 161 } 162 163 free(Buckets); 164 NumBuckets = newsize; 165 Buckets = newBuckets; 166 } 167 168 public: 169 170 void insert(typename Info::key_type_ref key, 171 typename Info::data_type_ref data) { 172 Info InfoObj; 173 insert(key, data, InfoObj); 174 } 175 176 void insert(typename Info::key_type_ref key, 177 typename Info::data_type_ref data, Info &InfoObj) { 178 179 ++NumEntries; 180 if (4*NumEntries >= 3*NumBuckets) resize(NumBuckets*2); 181 insert(Buckets, NumBuckets, new (BA.Allocate<Item>()) Item(key, data, 182 InfoObj)); 183 } 184 185 io::Offset Emit(raw_ostream &out) { 186 Info InfoObj; 187 return Emit(out, InfoObj); 188 } 189 190 io::Offset Emit(raw_ostream &out, Info &InfoObj) { 191 using namespace clang::io; 192 193 // Emit the payload of the table. 194 for (unsigned i = 0; i < NumBuckets; ++i) { 195 Bucket& B = Buckets[i]; 196 if (!B.head) continue; 197 198 // Store the offset for the data of this bucket. 199 B.off = out.tell(); 200 assert(B.off && "Cannot write a bucket at offset 0. Please add padding."); 201 202 // Write out the number of items in the bucket. 203 Emit16(out, B.length); 204 205 // Write out the entries in the bucket. 206 for (Item *I = B.head; I ; I = I->next) { 207 Emit32(out, I->hash); 208 const std::pair<unsigned, unsigned>& Len = 209 InfoObj.EmitKeyDataLength(out, I->key, I->data); 210 InfoObj.EmitKey(out, I->key, Len.first); 211 InfoObj.EmitData(out, I->key, I->data, Len.second); 212 } 213 } 214 215 // Emit the hashtable itself. 216 Pad(out, 4); 217 io::Offset TableOff = out.tell(); 218 Emit32(out, NumBuckets); 219 Emit32(out, NumEntries); 220 for (unsigned i = 0; i < NumBuckets; ++i) Emit32(out, Buckets[i].off); 221 222 return TableOff; 223 } 224 225 OnDiskChainedHashTableGenerator() { 226 NumEntries = 0; 227 NumBuckets = 64; 228 // Note that we do not need to run the constructors of the individual 229 // Bucket objects since 'calloc' returns bytes that are all 0. 230 Buckets = (Bucket*) std::calloc(NumBuckets, sizeof(Bucket)); 231 } 232 233 ~OnDiskChainedHashTableGenerator() { 234 std::free(Buckets); 235 } 236 }; 237 238 template<typename Info> 239 class OnDiskChainedHashTable { 240 const unsigned NumBuckets; 241 const unsigned NumEntries; 242 const unsigned char* const Buckets; 243 const unsigned char* const Base; 244 Info InfoObj; 245 246 public: 247 typedef typename Info::internal_key_type internal_key_type; 248 typedef typename Info::external_key_type external_key_type; 249 typedef typename Info::data_type data_type; 250 251 OnDiskChainedHashTable(unsigned numBuckets, unsigned numEntries, 252 const unsigned char* buckets, 253 const unsigned char* base, 254 const Info &InfoObj = Info()) 255 : NumBuckets(numBuckets), NumEntries(numEntries), 256 Buckets(buckets), Base(base), InfoObj(InfoObj) { 257 assert((reinterpret_cast<uintptr_t>(buckets) & 0x3) == 0 && 258 "'buckets' must have a 4-byte alignment"); 259 } 260 261 unsigned getNumBuckets() const { return NumBuckets; } 262 unsigned getNumEntries() const { return NumEntries; } 263 const unsigned char* getBase() const { return Base; } 264 const unsigned char* getBuckets() const { return Buckets; } 265 266 bool isEmpty() const { return NumEntries == 0; } 267 268 class iterator { 269 internal_key_type key; 270 const unsigned char* const data; 271 const unsigned len; 272 Info *InfoObj; 273 public: 274 iterator() : data(0), len(0) {} 275 iterator(const internal_key_type k, const unsigned char* d, unsigned l, 276 Info *InfoObj) 277 : key(k), data(d), len(l), InfoObj(InfoObj) {} 278 279 data_type operator*() const { return InfoObj->ReadData(key, data, len); } 280 bool operator==(const iterator& X) const { return X.data == data; } 281 bool operator!=(const iterator& X) const { return X.data != data; } 282 }; 283 284 iterator find(const external_key_type& eKey, Info *InfoPtr = 0) { 285 if (!InfoPtr) 286 InfoPtr = &InfoObj; 287 288 using namespace io; 289 const internal_key_type& iKey = InfoObj.GetInternalKey(eKey); 290 unsigned key_hash = InfoObj.ComputeHash(iKey); 291 292 // Each bucket is just a 32-bit offset into the hash table file. 293 unsigned idx = key_hash & (NumBuckets - 1); 294 const unsigned char* Bucket = Buckets + sizeof(uint32_t)*idx; 295 296 unsigned offset = ReadLE32(Bucket); 297 if (offset == 0) return iterator(); // Empty bucket. 298 const unsigned char* Items = Base + offset; 299 300 // 'Items' starts with a 16-bit unsigned integer representing the 301 // number of items in this bucket. 302 unsigned len = ReadUnalignedLE16(Items); 303 304 for (unsigned i = 0; i < len; ++i) { 305 // Read the hash. 306 uint32_t item_hash = ReadUnalignedLE32(Items); 307 308 // Determine the length of the key and the data. 309 const std::pair<unsigned, unsigned>& L = Info::ReadKeyDataLength(Items); 310 unsigned item_len = L.first + L.second; 311 312 // Compare the hashes. If they are not the same, skip the entry entirely. 313 if (item_hash != key_hash) { 314 Items += item_len; 315 continue; 316 } 317 318 // Read the key. 319 const internal_key_type& X = 320 InfoPtr->ReadKey((const unsigned char* const) Items, L.first); 321 322 // If the key doesn't match just skip reading the value. 323 if (!InfoPtr->EqualKey(X, iKey)) { 324 Items += item_len; 325 continue; 326 } 327 328 // The key matches! 329 return iterator(X, Items + L.first, L.second, InfoPtr); 330 } 331 332 return iterator(); 333 } 334 335 iterator end() const { return iterator(); } 336 337 /// \brief Iterates over all of the keys in the table. 338 class key_iterator { 339 const unsigned char* Ptr; 340 unsigned NumItemsInBucketLeft; 341 unsigned NumEntriesLeft; 342 Info *InfoObj; 343 public: 344 typedef external_key_type value_type; 345 346 key_iterator(const unsigned char* const Ptr, unsigned NumEntries, 347 Info *InfoObj) 348 : Ptr(Ptr), NumItemsInBucketLeft(0), NumEntriesLeft(NumEntries), 349 InfoObj(InfoObj) { } 350 key_iterator() 351 : Ptr(0), NumItemsInBucketLeft(0), NumEntriesLeft(0), InfoObj(0) { } 352 353 friend bool operator==(const key_iterator &X, const key_iterator &Y) { 354 return X.NumEntriesLeft == Y.NumEntriesLeft; 355 } 356 friend bool operator!=(const key_iterator& X, const key_iterator &Y) { 357 return X.NumEntriesLeft != Y.NumEntriesLeft; 358 } 359 360 key_iterator& operator++() { // Preincrement 361 if (!NumItemsInBucketLeft) { 362 // 'Items' starts with a 16-bit unsigned integer representing the 363 // number of items in this bucket. 364 NumItemsInBucketLeft = io::ReadUnalignedLE16(Ptr); 365 } 366 Ptr += 4; // Skip the hash. 367 // Determine the length of the key and the data. 368 const std::pair<unsigned, unsigned>& L = Info::ReadKeyDataLength(Ptr); 369 Ptr += L.first + L.second; 370 assert(NumItemsInBucketLeft); 371 --NumItemsInBucketLeft; 372 assert(NumEntriesLeft); 373 --NumEntriesLeft; 374 return *this; 375 } 376 key_iterator operator++(int) { // Postincrement 377 key_iterator tmp = *this; ++*this; return tmp; 378 } 379 380 value_type operator*() const { 381 const unsigned char* LocalPtr = Ptr; 382 if (!NumItemsInBucketLeft) 383 LocalPtr += 2; // number of items in bucket 384 LocalPtr += 4; // Skip the hash. 385 386 // Determine the length of the key and the data. 387 const std::pair<unsigned, unsigned>& L 388 = Info::ReadKeyDataLength(LocalPtr); 389 390 // Read the key. 391 const internal_key_type& Key = InfoObj->ReadKey(LocalPtr, L.first); 392 return InfoObj->GetExternalKey(Key); 393 } 394 }; 395 396 key_iterator key_begin() { 397 return key_iterator(Base + 4, getNumEntries(), &InfoObj); 398 } 399 key_iterator key_end() { return key_iterator(); } 400 401 /// \brief Iterates over all the entries in the table, returning 402 /// a key/data pair. 403 class item_iterator { 404 const unsigned char* Ptr; 405 unsigned NumItemsInBucketLeft; 406 unsigned NumEntriesLeft; 407 Info *InfoObj; 408 public: 409 typedef std::pair<external_key_type, data_type> value_type; 410 411 item_iterator(const unsigned char* const Ptr, unsigned NumEntries, 412 Info *InfoObj) 413 : Ptr(Ptr), NumItemsInBucketLeft(0), NumEntriesLeft(NumEntries), 414 InfoObj(InfoObj) { } 415 item_iterator() 416 : Ptr(0), NumItemsInBucketLeft(0), NumEntriesLeft(0), InfoObj(0) { } 417 418 bool operator==(const item_iterator& X) const { 419 return X.NumEntriesLeft == NumEntriesLeft; 420 } 421 bool operator!=(const item_iterator& X) const { 422 return X.NumEntriesLeft != NumEntriesLeft; 423 } 424 425 item_iterator& operator++() { // Preincrement 426 if (!NumItemsInBucketLeft) { 427 // 'Items' starts with a 16-bit unsigned integer representing the 428 // number of items in this bucket. 429 NumItemsInBucketLeft = io::ReadUnalignedLE16(Ptr); 430 } 431 Ptr += 4; // Skip the hash. 432 // Determine the length of the key and the data. 433 const std::pair<unsigned, unsigned>& L = Info::ReadKeyDataLength(Ptr); 434 Ptr += L.first + L.second; 435 assert(NumItemsInBucketLeft); 436 --NumItemsInBucketLeft; 437 assert(NumEntriesLeft); 438 --NumEntriesLeft; 439 return *this; 440 } 441 item_iterator operator++(int) { // Postincrement 442 item_iterator tmp = *this; ++*this; return tmp; 443 } 444 445 value_type operator*() const { 446 const unsigned char* LocalPtr = Ptr; 447 if (!NumItemsInBucketLeft) 448 LocalPtr += 2; // number of items in bucket 449 LocalPtr += 4; // Skip the hash. 450 451 // Determine the length of the key and the data. 452 const std::pair<unsigned, unsigned>& L = Info::ReadKeyDataLength(LocalPtr); 453 454 // Read the key. 455 const internal_key_type& Key = 456 InfoObj->ReadKey(LocalPtr, L.first); 457 return std::make_pair(InfoObj->GetExternalKey(Key), 458 InfoObj->ReadData(Key, LocalPtr + L.first, L.second)); 459 } 460 }; 461 462 item_iterator item_begin() { 463 return item_iterator(Base + 4, getNumEntries(), &InfoObj); 464 } 465 item_iterator item_end() { return item_iterator(); } 466 467 Info &getInfoObj() { return InfoObj; } 468 469 static OnDiskChainedHashTable* Create(const unsigned char* buckets, 470 const unsigned char* const base, 471 const Info &InfoObj = Info()) { 472 using namespace io; 473 assert(buckets > base); 474 assert((reinterpret_cast<uintptr_t>(buckets) & 0x3) == 0 && 475 "buckets should be 4-byte aligned."); 476 477 unsigned numBuckets = ReadLE32(buckets); 478 unsigned numEntries = ReadLE32(buckets); 479 return new OnDiskChainedHashTable<Info>(numBuckets, numEntries, buckets, 480 base, InfoObj); 481 } 482 }; 483 484 } // end namespace clang 485 486 #endif 487