1 // Copyright (c) 2011 The LevelDB Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. See the AUTHORS file for names of contributors. 4 5 #include "leveldb/table.h" 6 7 #include "leveldb/cache.h" 8 #include "leveldb/comparator.h" 9 #include "leveldb/env.h" 10 #include "leveldb/filter_policy.h" 11 #include "leveldb/options.h" 12 #include "table/block.h" 13 #include "table/filter_block.h" 14 #include "table/format.h" 15 #include "table/two_level_iterator.h" 16 #include "util/coding.h" 17 18 namespace leveldb { 19 20 struct Table::Rep { 21 ~Rep() { 22 delete filter; 23 delete [] filter_data; 24 delete index_block; 25 } 26 27 Options options; 28 Status status; 29 RandomAccessFile* file; 30 uint64_t cache_id; 31 FilterBlockReader* filter; 32 const char* filter_data; 33 34 BlockHandle metaindex_handle; // Handle to metaindex_block: saved from footer 35 Block* index_block; 36 }; 37 38 Status Table::Open(const Options& options, 39 RandomAccessFile* file, 40 uint64_t size, 41 Table** table) { 42 *table = NULL; 43 if (size < Footer::kEncodedLength) { 44 return Status::InvalidArgument("file is too short to be an sstable"); 45 } 46 47 char footer_space[Footer::kEncodedLength]; 48 Slice footer_input; 49 Status s = file->Read(size - Footer::kEncodedLength, Footer::kEncodedLength, 50 &footer_input, footer_space); 51 if (!s.ok()) return s; 52 53 Footer footer; 54 s = footer.DecodeFrom(&footer_input); 55 if (!s.ok()) return s; 56 57 // Read the index block 58 BlockContents contents; 59 Block* index_block = NULL; 60 if (s.ok()) { 61 s = ReadBlock(file, ReadOptions(), footer.index_handle(), &contents); 62 if (s.ok()) { 63 index_block = new Block(contents); 64 } 65 } 66 67 if (s.ok()) { 68 // We've successfully read the footer and the index block: we're 69 // ready to serve requests. 70 Rep* rep = new Table::Rep; 71 rep->options = options; 72 rep->file = file; 73 rep->metaindex_handle = footer.metaindex_handle(); 74 rep->index_block = index_block; 75 rep->cache_id = (options.block_cache ? options.block_cache->NewId() : 0); 76 rep->filter_data = NULL; 77 rep->filter = NULL; 78 *table = new Table(rep); 79 (*table)->ReadMeta(footer); 80 } else { 81 if (index_block) delete index_block; 82 } 83 84 return s; 85 } 86 87 void Table::ReadMeta(const Footer& footer) { 88 if (rep_->options.filter_policy == NULL) { 89 return; // Do not need any metadata 90 } 91 92 // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates 93 // it is an empty block. 94 ReadOptions opt; 95 BlockContents contents; 96 if (!ReadBlock(rep_->file, opt, footer.metaindex_handle(), &contents).ok()) { 97 // Do not propagate errors since meta info is not needed for operation 98 return; 99 } 100 Block* meta = new Block(contents); 101 102 Iterator* iter = meta->NewIterator(BytewiseComparator()); 103 std::string key = "filter."; 104 key.append(rep_->options.filter_policy->Name()); 105 iter->Seek(key); 106 if (iter->Valid() && iter->key() == Slice(key)) { 107 ReadFilter(iter->value()); 108 } 109 delete iter; 110 delete meta; 111 } 112 113 void Table::ReadFilter(const Slice& filter_handle_value) { 114 Slice v = filter_handle_value; 115 BlockHandle filter_handle; 116 if (!filter_handle.DecodeFrom(&v).ok()) { 117 return; 118 } 119 120 // We might want to unify with ReadBlock() if we start 121 // requiring checksum verification in Table::Open. 122 ReadOptions opt; 123 BlockContents block; 124 if (!ReadBlock(rep_->file, opt, filter_handle, &block).ok()) { 125 return; 126 } 127 if (block.heap_allocated) { 128 rep_->filter_data = block.data.data(); // Will need to delete later 129 } 130 rep_->filter = new FilterBlockReader(rep_->options.filter_policy, block.data); 131 } 132 133 Table::~Table() { 134 delete rep_; 135 } 136 137 static void DeleteBlock(void* arg, void* ignored) { 138 delete reinterpret_cast<Block*>(arg); 139 } 140 141 static void DeleteCachedBlock(const Slice& key, void* value) { 142 Block* block = reinterpret_cast<Block*>(value); 143 delete block; 144 } 145 146 static void ReleaseBlock(void* arg, void* h) { 147 Cache* cache = reinterpret_cast<Cache*>(arg); 148 Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h); 149 cache->Release(handle); 150 } 151 152 // Convert an index iterator value (i.e., an encoded BlockHandle) 153 // into an iterator over the contents of the corresponding block. 154 Iterator* Table::BlockReader(void* arg, 155 const ReadOptions& options, 156 const Slice& index_value) { 157 Table* table = reinterpret_cast<Table*>(arg); 158 Cache* block_cache = table->rep_->options.block_cache; 159 Block* block = NULL; 160 Cache::Handle* cache_handle = NULL; 161 162 BlockHandle handle; 163 Slice input = index_value; 164 Status s = handle.DecodeFrom(&input); 165 // We intentionally allow extra stuff in index_value so that we 166 // can add more features in the future. 167 168 if (s.ok()) { 169 BlockContents contents; 170 if (block_cache != NULL) { 171 char cache_key_buffer[16]; 172 EncodeFixed64(cache_key_buffer, table->rep_->cache_id); 173 EncodeFixed64(cache_key_buffer+8, handle.offset()); 174 Slice key(cache_key_buffer, sizeof(cache_key_buffer)); 175 cache_handle = block_cache->Lookup(key); 176 if (cache_handle != NULL) { 177 block = reinterpret_cast<Block*>(block_cache->Value(cache_handle)); 178 } else { 179 s = ReadBlock(table->rep_->file, options, handle, &contents); 180 if (s.ok()) { 181 block = new Block(contents); 182 if (contents.cachable && options.fill_cache) { 183 cache_handle = block_cache->Insert( 184 key, block, block->size(), &DeleteCachedBlock); 185 } 186 } 187 } 188 } else { 189 s = ReadBlock(table->rep_->file, options, handle, &contents); 190 if (s.ok()) { 191 block = new Block(contents); 192 } 193 } 194 } 195 196 Iterator* iter; 197 if (block != NULL) { 198 iter = block->NewIterator(table->rep_->options.comparator); 199 if (cache_handle == NULL) { 200 iter->RegisterCleanup(&DeleteBlock, block, NULL); 201 } else { 202 iter->RegisterCleanup(&ReleaseBlock, block_cache, cache_handle); 203 } 204 } else { 205 iter = NewErrorIterator(s); 206 } 207 return iter; 208 } 209 210 Iterator* Table::NewIterator(const ReadOptions& options) const { 211 return NewTwoLevelIterator( 212 rep_->index_block->NewIterator(rep_->options.comparator), 213 &Table::BlockReader, const_cast<Table*>(this), options); 214 } 215 216 Status Table::InternalGet(const ReadOptions& options, const Slice& k, 217 void* arg, 218 void (*saver)(void*, const Slice&, const Slice&)) { 219 Status s; 220 Iterator* iiter = rep_->index_block->NewIterator(rep_->options.comparator); 221 iiter->Seek(k); 222 if (iiter->Valid()) { 223 Slice handle_value = iiter->value(); 224 FilterBlockReader* filter = rep_->filter; 225 BlockHandle handle; 226 if (filter != NULL && 227 handle.DecodeFrom(&handle_value).ok() && 228 !filter->KeyMayMatch(handle.offset(), k)) { 229 // Not found 230 } else { 231 Iterator* block_iter = BlockReader(this, options, iiter->value()); 232 block_iter->Seek(k); 233 if (block_iter->Valid()) { 234 (*saver)(arg, block_iter->key(), block_iter->value()); 235 } 236 s = block_iter->status(); 237 delete block_iter; 238 } 239 } 240 if (s.ok()) { 241 s = iiter->status(); 242 } 243 delete iiter; 244 return s; 245 } 246 247 248 uint64_t Table::ApproximateOffsetOf(const Slice& key) const { 249 Iterator* index_iter = 250 rep_->index_block->NewIterator(rep_->options.comparator); 251 index_iter->Seek(key); 252 uint64_t result; 253 if (index_iter->Valid()) { 254 BlockHandle handle; 255 Slice input = index_iter->value(); 256 Status s = handle.DecodeFrom(&input); 257 if (s.ok()) { 258 result = handle.offset(); 259 } else { 260 // Strange: we can't decode the block handle in the index block. 261 // We'll just return the offset of the metaindex block, which is 262 // close to the whole file size for this case. 263 result = rep_->metaindex_handle.offset(); 264 } 265 } else { 266 // key is past the last key in the file. Approximate the offset 267 // by returning the offset of the metaindex block (which is 268 // right near the end of the file). 269 result = rep_->metaindex_handle.offset(); 270 } 271 delete index_iter; 272 return result; 273 } 274 275 } // namespace leveldb 276