1 // Copyright (c) 2011 The LevelDB Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. See the AUTHORS file for names of contributors. 4 // 5 // The representation of a DBImpl consists of a set of Versions. The 6 // newest version is called "current". Older versions may be kept 7 // around to provide a consistent view to live iterators. 8 // 9 // Each Version keeps track of a set of Table files per level. The 10 // entire set of versions is maintained in a VersionSet. 11 // 12 // Version,VersionSet are thread-compatible, but require external 13 // synchronization on all accesses. 14 15 #ifndef STORAGE_LEVELDB_DB_VERSION_SET_H_ 16 #define STORAGE_LEVELDB_DB_VERSION_SET_H_ 17 18 #include <map> 19 #include <set> 20 #include <vector> 21 #include "db/dbformat.h" 22 #include "db/version_edit.h" 23 #include "port/port.h" 24 #include "port/thread_annotations.h" 25 26 namespace leveldb { 27 28 namespace log { class Writer; } 29 30 class Compaction; 31 class Iterator; 32 class MemTable; 33 class TableBuilder; 34 class TableCache; 35 class Version; 36 class VersionSet; 37 class WritableFile; 38 39 // Return the smallest index i such that files[i]->largest >= key. 40 // Return files.size() if there is no such file. 41 // REQUIRES: "files" contains a sorted list of non-overlapping files. 42 extern int FindFile(const InternalKeyComparator& icmp, 43 const std::vector<FileMetaData*>& files, 44 const Slice& key); 45 46 // Returns true iff some file in "files" overlaps the user key range 47 // [*smallest,*largest]. 48 // smallest==NULL represents a key smaller than all keys in the DB. 49 // largest==NULL represents a key largest than all keys in the DB. 50 // REQUIRES: If disjoint_sorted_files, files[] contains disjoint ranges 51 // in sorted order. 52 extern bool SomeFileOverlapsRange( 53 const InternalKeyComparator& icmp, 54 bool disjoint_sorted_files, 55 const std::vector<FileMetaData*>& files, 56 const Slice* smallest_user_key, 57 const Slice* largest_user_key); 58 59 class Version { 60 public: 61 // Append to *iters a sequence of iterators that will 62 // yield the contents of this Version when merged together. 63 // REQUIRES: This version has been saved (see VersionSet::SaveTo) 64 void AddIterators(const ReadOptions&, std::vector<Iterator*>* iters); 65 66 // Lookup the value for key. If found, store it in *val and 67 // return OK. Else return a non-OK status. Fills *stats. 68 // REQUIRES: lock is not held 69 struct GetStats { 70 FileMetaData* seek_file; 71 int seek_file_level; 72 }; 73 Status Get(const ReadOptions&, const LookupKey& key, std::string* val, 74 GetStats* stats); 75 76 // Adds "stats" into the current state. Returns true if a new 77 // compaction may need to be triggered, false otherwise. 78 // REQUIRES: lock is held 79 bool UpdateStats(const GetStats& stats); 80 81 // Record a sample of bytes read at the specified internal key. 82 // Samples are taken approximately once every config::kReadBytesPeriod 83 // bytes. Returns true if a new compaction may need to be triggered. 84 // REQUIRES: lock is held 85 bool RecordReadSample(Slice key); 86 87 // Reference count management (so Versions do not disappear out from 88 // under live iterators) 89 void Ref(); 90 void Unref(); 91 92 void GetOverlappingInputs( 93 int level, 94 const InternalKey* begin, // NULL means before all keys 95 const InternalKey* end, // NULL means after all keys 96 std::vector<FileMetaData*>* inputs); 97 98 // Returns true iff some file in the specified level overlaps 99 // some part of [*smallest_user_key,*largest_user_key]. 100 // smallest_user_key==NULL represents a key smaller than all keys in the DB. 101 // largest_user_key==NULL represents a key largest than all keys in the DB. 102 bool OverlapInLevel(int level, 103 const Slice* smallest_user_key, 104 const Slice* largest_user_key); 105 106 // Return the level at which we should place a new memtable compaction 107 // result that covers the range [smallest_user_key,largest_user_key]. 108 int PickLevelForMemTableOutput(const Slice& smallest_user_key, 109 const Slice& largest_user_key); 110 111 int NumFiles(int level) const { return files_[level].size(); } 112 113 // Return a human readable string that describes this version's contents. 114 std::string DebugString() const; 115 116 private: 117 friend class Compaction; 118 friend class VersionSet; 119 120 class LevelFileNumIterator; 121 Iterator* NewConcatenatingIterator(const ReadOptions&, int level) const; 122 123 // Call func(arg, level, f) for every file that overlaps user_key in 124 // order from newest to oldest. If an invocation of func returns 125 // false, makes no more calls. 126 // 127 // REQUIRES: user portion of internal_key == user_key. 128 void ForEachOverlapping(Slice user_key, Slice internal_key, 129 void* arg, 130 bool (*func)(void*, int, FileMetaData*)); 131 132 VersionSet* vset_; // VersionSet to which this Version belongs 133 Version* next_; // Next version in linked list 134 Version* prev_; // Previous version in linked list 135 int refs_; // Number of live refs to this version 136 137 // List of files per level 138 std::vector<FileMetaData*> files_[config::kNumLevels]; 139 140 // Next file to compact based on seek stats. 141 FileMetaData* file_to_compact_; 142 int file_to_compact_level_; 143 144 // Level that should be compacted next and its compaction score. 145 // Score < 1 means compaction is not strictly needed. These fields 146 // are initialized by Finalize(). 147 double compaction_score_; 148 int compaction_level_; 149 150 explicit Version(VersionSet* vset) 151 : vset_(vset), next_(this), prev_(this), refs_(0), 152 file_to_compact_(NULL), 153 file_to_compact_level_(-1), 154 compaction_score_(-1), 155 compaction_level_(-1) { 156 } 157 158 ~Version(); 159 160 // No copying allowed 161 Version(const Version&); 162 void operator=(const Version&); 163 }; 164 165 class VersionSet { 166 public: 167 VersionSet(const std::string& dbname, 168 const Options* options, 169 TableCache* table_cache, 170 const InternalKeyComparator*); 171 ~VersionSet(); 172 173 // Apply *edit to the current version to form a new descriptor that 174 // is both saved to persistent state and installed as the new 175 // current version. Will release *mu while actually writing to the file. 176 // REQUIRES: *mu is held on entry. 177 // REQUIRES: no other thread concurrently calls LogAndApply() 178 Status LogAndApply(VersionEdit* edit, port::Mutex* mu) 179 EXCLUSIVE_LOCKS_REQUIRED(mu); 180 181 // Recover the last saved descriptor from persistent storage. 182 Status Recover(); 183 184 // Return the current version. 185 Version* current() const { return current_; } 186 187 // Return the current manifest file number 188 uint64_t ManifestFileNumber() const { return manifest_file_number_; } 189 190 // Allocate and return a new file number 191 uint64_t NewFileNumber() { return next_file_number_++; } 192 193 // Arrange to reuse "file_number" unless a newer file number has 194 // already been allocated. 195 // REQUIRES: "file_number" was returned by a call to NewFileNumber(). 196 void ReuseFileNumber(uint64_t file_number) { 197 if (next_file_number_ == file_number + 1) { 198 next_file_number_ = file_number; 199 } 200 } 201 202 // Return the number of Table files at the specified level. 203 int NumLevelFiles(int level) const; 204 205 // Return the combined file size of all files at the specified level. 206 int64_t NumLevelBytes(int level) const; 207 208 // Return the last sequence number. 209 uint64_t LastSequence() const { return last_sequence_; } 210 211 // Set the last sequence number to s. 212 void SetLastSequence(uint64_t s) { 213 assert(s >= last_sequence_); 214 last_sequence_ = s; 215 } 216 217 // Mark the specified file number as used. 218 void MarkFileNumberUsed(uint64_t number); 219 220 // Return the current log file number. 221 uint64_t LogNumber() const { return log_number_; } 222 223 // Return the log file number for the log file that is currently 224 // being compacted, or zero if there is no such log file. 225 uint64_t PrevLogNumber() const { return prev_log_number_; } 226 227 // Pick level and inputs for a new compaction. 228 // Returns NULL if there is no compaction to be done. 229 // Otherwise returns a pointer to a heap-allocated object that 230 // describes the compaction. Caller should delete the result. 231 Compaction* PickCompaction(); 232 233 // Return a compaction object for compacting the range [begin,end] in 234 // the specified level. Returns NULL if there is nothing in that 235 // level that overlaps the specified range. Caller should delete 236 // the result. 237 Compaction* CompactRange( 238 int level, 239 const InternalKey* begin, 240 const InternalKey* end); 241 242 // Return the maximum overlapping data (in bytes) at next level for any 243 // file at a level >= 1. 244 int64_t MaxNextLevelOverlappingBytes(); 245 246 // Create an iterator that reads over the compaction inputs for "*c". 247 // The caller should delete the iterator when no longer needed. 248 Iterator* MakeInputIterator(Compaction* c); 249 250 // Returns true iff some level needs a compaction. 251 bool NeedsCompaction() const { 252 Version* v = current_; 253 return (v->compaction_score_ >= 1) || (v->file_to_compact_ != NULL); 254 } 255 256 // Add all files listed in any live version to *live. 257 // May also mutate some internal state. 258 void AddLiveFiles(std::set<uint64_t>* live); 259 260 // Return the approximate offset in the database of the data for 261 // "key" as of version "v". 262 uint64_t ApproximateOffsetOf(Version* v, const InternalKey& key); 263 264 // Return a human-readable short (single-line) summary of the number 265 // of files per level. Uses *scratch as backing store. 266 struct LevelSummaryStorage { 267 char buffer[100]; 268 }; 269 const char* LevelSummary(LevelSummaryStorage* scratch) const; 270 271 private: 272 class Builder; 273 274 friend class Compaction; 275 friend class Version; 276 277 void Finalize(Version* v); 278 279 void GetRange(const std::vector<FileMetaData*>& inputs, 280 InternalKey* smallest, 281 InternalKey* largest); 282 283 void GetRange2(const std::vector<FileMetaData*>& inputs1, 284 const std::vector<FileMetaData*>& inputs2, 285 InternalKey* smallest, 286 InternalKey* largest); 287 288 void SetupOtherInputs(Compaction* c); 289 290 // Save current contents to *log 291 Status WriteSnapshot(log::Writer* log); 292 293 void AppendVersion(Version* v); 294 295 Env* const env_; 296 const std::string dbname_; 297 const Options* const options_; 298 TableCache* const table_cache_; 299 const InternalKeyComparator icmp_; 300 uint64_t next_file_number_; 301 uint64_t manifest_file_number_; 302 uint64_t last_sequence_; 303 uint64_t log_number_; 304 uint64_t prev_log_number_; // 0 or backing store for memtable being compacted 305 306 // Opened lazily 307 WritableFile* descriptor_file_; 308 log::Writer* descriptor_log_; 309 Version dummy_versions_; // Head of circular doubly-linked list of versions. 310 Version* current_; // == dummy_versions_.prev_ 311 312 // Per-level key at which the next compaction at that level should start. 313 // Either an empty string, or a valid InternalKey. 314 std::string compact_pointer_[config::kNumLevels]; 315 316 // No copying allowed 317 VersionSet(const VersionSet&); 318 void operator=(const VersionSet&); 319 }; 320 321 // A Compaction encapsulates information about a compaction. 322 class Compaction { 323 public: 324 ~Compaction(); 325 326 // Return the level that is being compacted. Inputs from "level" 327 // and "level+1" will be merged to produce a set of "level+1" files. 328 int level() const { return level_; } 329 330 // Return the object that holds the edits to the descriptor done 331 // by this compaction. 332 VersionEdit* edit() { return &edit_; } 333 334 // "which" must be either 0 or 1 335 int num_input_files(int which) const { return inputs_[which].size(); } 336 337 // Return the ith input file at "level()+which" ("which" must be 0 or 1). 338 FileMetaData* input(int which, int i) const { return inputs_[which][i]; } 339 340 // Maximum size of files to build during this compaction. 341 uint64_t MaxOutputFileSize() const { return max_output_file_size_; } 342 343 // Is this a trivial compaction that can be implemented by just 344 // moving a single input file to the next level (no merging or splitting) 345 bool IsTrivialMove() const; 346 347 // Add all inputs to this compaction as delete operations to *edit. 348 void AddInputDeletions(VersionEdit* edit); 349 350 // Returns true if the information we have available guarantees that 351 // the compaction is producing data in "level+1" for which no data exists 352 // in levels greater than "level+1". 353 bool IsBaseLevelForKey(const Slice& user_key); 354 355 // Returns true iff we should stop building the current output 356 // before processing "internal_key". 357 bool ShouldStopBefore(const Slice& internal_key); 358 359 // Release the input version for the compaction, once the compaction 360 // is successful. 361 void ReleaseInputs(); 362 363 private: 364 friend class Version; 365 friend class VersionSet; 366 367 explicit Compaction(int level); 368 369 int level_; 370 uint64_t max_output_file_size_; 371 Version* input_version_; 372 VersionEdit edit_; 373 374 // Each compaction reads inputs from "level_" and "level_+1" 375 std::vector<FileMetaData*> inputs_[2]; // The two sets of inputs 376 377 // State used to check for number of of overlapping grandparent files 378 // (parent == level_ + 1, grandparent == level_ + 2) 379 std::vector<FileMetaData*> grandparents_; 380 size_t grandparent_index_; // Index in grandparent_starts_ 381 bool seen_key_; // Some output key has been seen 382 int64_t overlapped_bytes_; // Bytes of overlap between current output 383 // and grandparent files 384 385 // State for implementing IsBaseLevelForKey 386 387 // level_ptrs_ holds indices into input_version_->levels_: our state 388 // is that we are positioned at one of the file ranges for each 389 // higher level than the ones involved in this compaction (i.e. for 390 // all L >= level_ + 2). 391 size_t level_ptrs_[config::kNumLevels]; 392 }; 393 394 } // namespace leveldb 395 396 #endif // STORAGE_LEVELDB_DB_VERSION_SET_H_ 397