// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// We recover the contents of the descriptor from the other files we find.
// (1) Any log files are first converted to tables
// (2) We scan every table to compute
//     (a) smallest/largest for the table
//     (b) largest sequence number in the table
// (3) We generate descriptor contents:
//      - log number is set to zero
//      - next-file-number is set to 1 + largest file number we found
//      - last-sequence-number is set to largest sequence# found across
//        all tables (see 2b)
//      - compaction pointers are cleared
//      - every table file is added at level 0
//
// Possible optimization 1:
//   (a) Compute total size and use to pick appropriate max-level M
//   (b) Sort tables by largest sequence# in the table
//   (c) For each table: if it overlaps earlier table, place in level-0,
//       else place in level-M.
// Possible optimization 2:
//   Store per-table metadata (smallest, largest, largest-seq#, ...)
//   in the table's meta section to speed up ScanTable.
26 27 #include "db/builder.h" 28 #include "db/db_impl.h" 29 #include "db/dbformat.h" 30 #include "db/filename.h" 31 #include "db/log_reader.h" 32 #include "db/log_writer.h" 33 #include "db/memtable.h" 34 #include "db/table_cache.h" 35 #include "db/version_edit.h" 36 #include "db/write_batch_internal.h" 37 #include "leveldb/comparator.h" 38 #include "leveldb/db.h" 39 #include "leveldb/env.h" 40 41 namespace leveldb { 42 43 namespace { 44 45 class Repairer { 46 public: 47 Repairer(const std::string& dbname, const Options& options) 48 : dbname_(dbname), 49 env_(options.env), 50 icmp_(options.comparator), 51 ipolicy_(options.filter_policy), 52 options_(SanitizeOptions(dbname, &icmp_, &ipolicy_, options)), 53 owns_info_log_(options_.info_log != options.info_log), 54 owns_cache_(options_.block_cache != options.block_cache), 55 next_file_number_(1) { 56 // TableCache can be small since we expect each table to be opened once. 57 table_cache_ = new TableCache(dbname_, &options_, 10); 58 } 59 60 ~Repairer() { 61 delete table_cache_; 62 if (owns_info_log_) { 63 delete options_.info_log; 64 } 65 if (owns_cache_) { 66 delete options_.block_cache; 67 } 68 } 69 70 Status Run() { 71 Status status = FindFiles(); 72 if (status.ok()) { 73 ConvertLogFilesToTables(); 74 ExtractMetaData(); 75 status = WriteDescriptor(); 76 } 77 if (status.ok()) { 78 unsigned long long bytes = 0; 79 for (size_t i = 0; i < tables_.size(); i++) { 80 bytes += tables_[i].meta.file_size; 81 } 82 Log(options_.info_log, 83 "**** Repaired leveldb %s; " 84 "recovered %d files; %llu bytes. " 85 "Some data may have been lost. 
" 86 "****", 87 dbname_.c_str(), 88 static_cast<int>(tables_.size()), 89 bytes); 90 } 91 return status; 92 } 93 94 private: 95 struct TableInfo { 96 FileMetaData meta; 97 SequenceNumber max_sequence; 98 }; 99 100 std::string const dbname_; 101 Env* const env_; 102 InternalKeyComparator const icmp_; 103 InternalFilterPolicy const ipolicy_; 104 Options const options_; 105 bool owns_info_log_; 106 bool owns_cache_; 107 TableCache* table_cache_; 108 VersionEdit edit_; 109 110 std::vector<std::string> manifests_; 111 std::vector<uint64_t> table_numbers_; 112 std::vector<uint64_t> logs_; 113 std::vector<TableInfo> tables_; 114 uint64_t next_file_number_; 115 116 Status FindFiles() { 117 std::vector<std::string> filenames; 118 Status status = env_->GetChildren(dbname_, &filenames); 119 if (!status.ok()) { 120 return status; 121 } 122 if (filenames.empty()) { 123 return Status::IOError(dbname_, "repair found no files"); 124 } 125 126 uint64_t number; 127 FileType type; 128 for (size_t i = 0; i < filenames.size(); i++) { 129 if (ParseFileName(filenames[i], &number, &type)) { 130 if (type == kDescriptorFile) { 131 manifests_.push_back(filenames[i]); 132 } else { 133 if (number + 1 > next_file_number_) { 134 next_file_number_ = number + 1; 135 } 136 if (type == kLogFile) { 137 logs_.push_back(number); 138 } else if (type == kTableFile) { 139 table_numbers_.push_back(number); 140 } else { 141 // Ignore other files 142 } 143 } 144 } 145 } 146 return status; 147 } 148 149 void ConvertLogFilesToTables() { 150 for (size_t i = 0; i < logs_.size(); i++) { 151 std::string logname = LogFileName(dbname_, logs_[i]); 152 Status status = ConvertLogToTable(logs_[i]); 153 if (!status.ok()) { 154 Log(options_.info_log, "Log #%llu: ignoring conversion error: %s", 155 (unsigned long long) logs_[i], 156 status.ToString().c_str()); 157 } 158 ArchiveFile(logname); 159 } 160 } 161 162 Status ConvertLogToTable(uint64_t log) { 163 struct LogReporter : public log::Reader::Reporter { 164 Env* env; 165 
Logger* info_log; 166 uint64_t lognum; 167 virtual void Corruption(size_t bytes, const Status& s) { 168 // We print error messages for corruption, but continue repairing. 169 Log(info_log, "Log #%llu: dropping %d bytes; %s", 170 (unsigned long long) lognum, 171 static_cast<int>(bytes), 172 s.ToString().c_str()); 173 } 174 }; 175 176 // Open the log file 177 std::string logname = LogFileName(dbname_, log); 178 SequentialFile* lfile; 179 Status status = env_->NewSequentialFile(logname, &lfile); 180 if (!status.ok()) { 181 return status; 182 } 183 184 // Create the log reader. 185 LogReporter reporter; 186 reporter.env = env_; 187 reporter.info_log = options_.info_log; 188 reporter.lognum = log; 189 // We intentially make log::Reader do checksumming so that 190 // corruptions cause entire commits to be skipped instead of 191 // propagating bad information (like overly large sequence 192 // numbers). 193 log::Reader reader(lfile, &reporter, false/*do not checksum*/, 194 0/*initial_offset*/); 195 196 // Read all the records and add to a memtable 197 std::string scratch; 198 Slice record; 199 WriteBatch batch; 200 MemTable* mem = new MemTable(icmp_); 201 mem->Ref(); 202 int counter = 0; 203 while (reader.ReadRecord(&record, &scratch)) { 204 if (record.size() < 12) { 205 reporter.Corruption( 206 record.size(), Status::Corruption("log record too small")); 207 continue; 208 } 209 WriteBatchInternal::SetContents(&batch, record); 210 status = WriteBatchInternal::InsertInto(&batch, mem); 211 if (status.ok()) { 212 counter += WriteBatchInternal::Count(&batch); 213 } else { 214 Log(options_.info_log, "Log #%llu: ignoring %s", 215 (unsigned long long) log, 216 status.ToString().c_str()); 217 status = Status::OK(); // Keep going with rest of file 218 } 219 } 220 delete lfile; 221 222 // Do not record a version edit for this conversion to a Table 223 // since ExtractMetaData() will also generate edits. 
224 FileMetaData meta; 225 meta.number = next_file_number_++; 226 Iterator* iter = mem->NewIterator(); 227 status = BuildTable(dbname_, env_, options_, table_cache_, iter, &meta); 228 delete iter; 229 mem->Unref(); 230 mem = NULL; 231 if (status.ok()) { 232 if (meta.file_size > 0) { 233 table_numbers_.push_back(meta.number); 234 } 235 } 236 Log(options_.info_log, "Log #%llu: %d ops saved to Table #%llu %s", 237 (unsigned long long) log, 238 counter, 239 (unsigned long long) meta.number, 240 status.ToString().c_str()); 241 return status; 242 } 243 244 void ExtractMetaData() { 245 std::vector<TableInfo> kept; 246 for (size_t i = 0; i < table_numbers_.size(); i++) { 247 ScanTable(table_numbers_[i]); 248 } 249 } 250 251 Iterator* NewTableIterator(const FileMetaData& meta) { 252 // Same as compaction iterators: if paranoid_checks are on, turn 253 // on checksum verification. 254 ReadOptions r; 255 r.verify_checksums = options_.paranoid_checks; 256 return table_cache_->NewIterator(r, meta.number, meta.file_size); 257 } 258 259 void ScanTable(uint64_t number) { 260 TableInfo t; 261 t.meta.number = number; 262 std::string fname = TableFileName(dbname_, number); 263 Status status = env_->GetFileSize(fname, &t.meta.file_size); 264 if (!status.ok()) { 265 // Try alternate file name. 266 fname = SSTTableFileName(dbname_, number); 267 Status s2 = env_->GetFileSize(fname, &t.meta.file_size); 268 if (s2.ok()) { 269 status = Status::OK(); 270 } 271 } 272 if (!status.ok()) { 273 ArchiveFile(TableFileName(dbname_, number)); 274 ArchiveFile(SSTTableFileName(dbname_, number)); 275 Log(options_.info_log, "Table #%llu: dropped: %s", 276 (unsigned long long) t.meta.number, 277 status.ToString().c_str()); 278 return; 279 } 280 281 // Extract metadata by scanning through table. 
282 int counter = 0; 283 Iterator* iter = NewTableIterator(t.meta); 284 bool empty = true; 285 ParsedInternalKey parsed; 286 t.max_sequence = 0; 287 for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { 288 Slice key = iter->key(); 289 if (!ParseInternalKey(key, &parsed)) { 290 Log(options_.info_log, "Table #%llu: unparsable key %s", 291 (unsigned long long) t.meta.number, 292 EscapeString(key).c_str()); 293 continue; 294 } 295 296 counter++; 297 if (empty) { 298 empty = false; 299 t.meta.smallest.DecodeFrom(key); 300 } 301 t.meta.largest.DecodeFrom(key); 302 if (parsed.sequence > t.max_sequence) { 303 t.max_sequence = parsed.sequence; 304 } 305 } 306 if (!iter->status().ok()) { 307 status = iter->status(); 308 } 309 delete iter; 310 Log(options_.info_log, "Table #%llu: %d entries %s", 311 (unsigned long long) t.meta.number, 312 counter, 313 status.ToString().c_str()); 314 315 if (status.ok()) { 316 tables_.push_back(t); 317 } else { 318 RepairTable(fname, t); // RepairTable archives input file. 319 } 320 } 321 322 void RepairTable(const std::string& src, TableInfo t) { 323 // We will copy src contents to a new table and then rename the 324 // new table over the source. 325 326 // Create builder. 327 std::string copy = TableFileName(dbname_, next_file_number_++); 328 WritableFile* file; 329 Status s = env_->NewWritableFile(copy, &file); 330 if (!s.ok()) { 331 return; 332 } 333 TableBuilder* builder = new TableBuilder(options_, file); 334 335 // Copy data. 
336 Iterator* iter = NewTableIterator(t.meta); 337 int counter = 0; 338 for (iter->SeekToFirst(); iter->Valid(); iter->Next()) { 339 builder->Add(iter->key(), iter->value()); 340 counter++; 341 } 342 delete iter; 343 344 ArchiveFile(src); 345 if (counter == 0) { 346 builder->Abandon(); // Nothing to save 347 } else { 348 s = builder->Finish(); 349 if (s.ok()) { 350 t.meta.file_size = builder->FileSize(); 351 } 352 } 353 delete builder; 354 builder = NULL; 355 356 if (s.ok()) { 357 s = file->Close(); 358 } 359 delete file; 360 file = NULL; 361 362 if (counter > 0 && s.ok()) { 363 std::string orig = TableFileName(dbname_, t.meta.number); 364 s = env_->RenameFile(copy, orig); 365 if (s.ok()) { 366 Log(options_.info_log, "Table #%llu: %d entries repaired", 367 (unsigned long long) t.meta.number, counter); 368 tables_.push_back(t); 369 } 370 } 371 if (!s.ok()) { 372 env_->DeleteFile(copy); 373 } 374 } 375 376 Status WriteDescriptor() { 377 std::string tmp = TempFileName(dbname_, 1); 378 WritableFile* file; 379 Status status = env_->NewWritableFile(tmp, &file); 380 if (!status.ok()) { 381 return status; 382 } 383 384 SequenceNumber max_sequence = 0; 385 for (size_t i = 0; i < tables_.size(); i++) { 386 if (max_sequence < tables_[i].max_sequence) { 387 max_sequence = tables_[i].max_sequence; 388 } 389 } 390 391 edit_.SetComparatorName(icmp_.user_comparator()->Name()); 392 edit_.SetLogNumber(0); 393 edit_.SetNextFile(next_file_number_); 394 edit_.SetLastSequence(max_sequence); 395 396 for (size_t i = 0; i < tables_.size(); i++) { 397 // TODO(opt): separate out into multiple levels 398 const TableInfo& t = tables_[i]; 399 edit_.AddFile(0, t.meta.number, t.meta.file_size, 400 t.meta.smallest, t.meta.largest); 401 } 402 403 //fprintf(stderr, "NewDescriptor:\n%s\n", edit_.DebugString().c_str()); 404 { 405 log::Writer log(file); 406 std::string record; 407 edit_.EncodeTo(&record); 408 status = log.AddRecord(record); 409 } 410 if (status.ok()) { 411 status = file->Close(); 412 } 
413 delete file; 414 file = NULL; 415 416 if (!status.ok()) { 417 env_->DeleteFile(tmp); 418 } else { 419 // Discard older manifests 420 for (size_t i = 0; i < manifests_.size(); i++) { 421 ArchiveFile(dbname_ + "/" + manifests_[i]); 422 } 423 424 // Install new manifest 425 status = env_->RenameFile(tmp, DescriptorFileName(dbname_, 1)); 426 if (status.ok()) { 427 status = SetCurrentFile(env_, dbname_, 1); 428 } else { 429 env_->DeleteFile(tmp); 430 } 431 } 432 return status; 433 } 434 435 void ArchiveFile(const std::string& fname) { 436 // Move into another directory. E.g., for 437 // dir/foo 438 // rename to 439 // dir/lost/foo 440 const char* slash = strrchr(fname.c_str(), '/'); 441 std::string new_dir; 442 if (slash != NULL) { 443 new_dir.assign(fname.data(), slash - fname.data()); 444 } 445 new_dir.append("/lost"); 446 env_->CreateDir(new_dir); // Ignore error 447 std::string new_file = new_dir; 448 new_file.append("/"); 449 new_file.append((slash == NULL) ? fname.c_str() : slash + 1); 450 Status s = env_->RenameFile(fname, new_file); 451 Log(options_.info_log, "Archiving %s: %s\n", 452 fname.c_str(), s.ToString().c_str()); 453 } 454 }; 455 } // namespace 456 457 Status RepairDB(const std::string& dbname, const Options& options) { 458 Repairer repairer(dbname, options); 459 return repairer.Run(); 460 } 461 462 } // namespace leveldb 463