1 /* 2 * linux/fs/recovery.c 3 * 4 * Written by Stephen C. Tweedie <sct (at) redhat.com>, 1999 5 * 6 * Copyright 1999-2000 Red Hat Software --- All Rights Reserved 7 * 8 * This file is part of the Linux kernel and is made available under 9 * the terms of the GNU General Public License, version 2, or at your 10 * option, any later version, incorporated herein by reference. 11 * 12 * Journal recovery routines for the generic filesystem journaling code; 13 * part of the ext2fs journaling system. 14 */ 15 16 #ifndef __KERNEL__ 17 #include "jfs_user.h" 18 #else 19 #include <linux/time.h> 20 #include <linux/fs.h> 21 #include <linux/jbd.h> 22 #include <linux/errno.h> 23 #include <linux/slab.h> 24 #endif 25 26 /* 27 * Maintain information about the progress of the recovery job, so that 28 * the different passes can carry information between them. 29 */ 30 struct recovery_info 31 { 32 tid_t start_transaction; 33 tid_t end_transaction; 34 35 int nr_replays; 36 int nr_revokes; 37 int nr_revoke_hits; 38 }; 39 40 enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY}; 41 static int do_one_pass(journal_t *journal, 42 struct recovery_info *info, enum passtype pass); 43 static int scan_revoke_records(journal_t *, struct buffer_head *, 44 tid_t, struct recovery_info *); 45 46 #ifdef __KERNEL__ 47 48 /* Release readahead buffers after use */ 49 void journal_brelse_array(struct buffer_head *b[], int n) 50 { 51 while (--n >= 0) 52 brelse (b[n]); 53 } 54 55 56 /* 57 * When reading from the journal, we are going through the block device 58 * layer directly and so there is no readahead being done for us. We 59 * need to implement any readahead ourselves if we want it to happen at 60 * all. Recovery is basically one long sequential read, so make sure we 61 * do the IO in reasonably large chunks. 62 * 63 * This is not so critical that we need to be enormously clever about 64 * the readahead size, though. 128K is a purely arbitrary, good-enough 65 * fixed value. 66 */ 67 68 #define MAXBUF 8 69 static int do_readahead(journal_t *journal, unsigned int start) 70 { 71 int err; 72 unsigned int max, nbufs, next; 73 unsigned long blocknr; 74 struct buffer_head *bh; 75 76 struct buffer_head * bufs[MAXBUF]; 77 78 /* Do up to 128K of readahead */ 79 max = start + (128 * 1024 / journal->j_blocksize); 80 if (max > journal->j_maxlen) 81 max = journal->j_maxlen; 82 83 /* Do the readahead itself. We'll submit MAXBUF buffer_heads at 84 * a time to the block device IO layer. */ 85 86 nbufs = 0; 87 88 for (next = start; next < max; next++) { 89 err = journal_bmap(journal, next, &blocknr); 90 91 if (err) { 92 printk (KERN_ERR "JBD: bad block at offset %u\n", 93 next); 94 goto failed; 95 } 96 97 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); 98 if (!bh) { 99 err = -ENOMEM; 100 goto failed; 101 } 102 103 if (!buffer_uptodate(bh) && !buffer_locked(bh)) { 104 bufs[nbufs++] = bh; 105 if (nbufs == MAXBUF) { 106 ll_rw_block(READ, nbufs, bufs); 107 journal_brelse_array(bufs, nbufs); 108 nbufs = 0; 109 } 110 } else 111 brelse(bh); 112 } 113 114 if (nbufs) 115 ll_rw_block(READ, nbufs, bufs); 116 err = 0; 117 118 failed: 119 if (nbufs) 120 journal_brelse_array(bufs, nbufs); 121 return err; 122 } 123 124 #endif /* __KERNEL__ */ 125 126 127 /* 128 * Read a block from the journal 129 */ 130 131 static int jread(struct buffer_head **bhp, journal_t *journal, 132 unsigned int offset) 133 { 134 int err; 135 unsigned long blocknr; 136 struct buffer_head *bh; 137 138 *bhp = NULL; 139 140 J_ASSERT (offset < journal->j_maxlen); 141 142 err = journal_bmap(journal, offset, &blocknr); 143 144 if (err) { 145 printk (KERN_ERR "JBD: bad block at offset %u\n", 146 offset); 147 return err; 148 } 149 150 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); 151 if (!bh) 152 return -ENOMEM; 153 154 if (!buffer_uptodate(bh)) { 155 /* If this is a brand new buffer, start readahead. 156 Otherwise, we assume we are already reading it. */ 157 if (!buffer_req(bh)) 158 do_readahead(journal, offset); 159 wait_on_buffer(bh); 160 } 161 162 if (!buffer_uptodate(bh)) { 163 printk (KERN_ERR "JBD: Failed to read block at offset %u\n", 164 offset); 165 brelse(bh); 166 return -EIO; 167 } 168 169 *bhp = bh; 170 return 0; 171 } 172 173 174 /* 175 * Count the number of in-use tags in a journal descriptor block. 176 */ 177 178 static int count_tags(struct buffer_head *bh, int size) 179 { 180 char * tagp; 181 journal_block_tag_t * tag; 182 int nr = 0; 183 184 tagp = &bh->b_data[sizeof(journal_header_t)]; 185 186 while ((tagp - bh->b_data + sizeof(journal_block_tag_t)) <= size) { 187 tag = (journal_block_tag_t *) tagp; 188 189 nr++; 190 tagp += sizeof(journal_block_tag_t); 191 if (!(tag->t_flags & htonl(JFS_FLAG_SAME_UUID))) 192 tagp += 16; 193 194 if (tag->t_flags & htonl(JFS_FLAG_LAST_TAG)) 195 break; 196 } 197 198 return nr; 199 } 200 201 202 /* Make sure we wrap around the log correctly! */ 203 #define wrap(journal, var) \ 204 do { \ 205 if (var >= (journal)->j_last) \ 206 var -= ((journal)->j_last - (journal)->j_first); \ 207 } while (0) 208 209 /** 210 * int journal_recover(journal_t *journal) - recovers a on-disk journal 211 * @journal: the journal to recover 212 * 213 * The primary function for recovering the log contents when mounting a 214 * journaled device. 215 * 216 * Recovery is done in three passes. In the first pass, we look for the 217 * end of the log. In the second, we assemble the list of revoke 218 * blocks. In the third and final pass, we replay any un-revoked blocks 219 * in the log. 220 */ 221 int journal_recover(journal_t *journal) 222 { 223 int err; 224 journal_superblock_t * sb; 225 226 struct recovery_info info; 227 228 memset(&info, 0, sizeof(info)); 229 sb = journal->j_superblock; 230 231 /* 232 * The journal superblock's s_start field (the current log head) 233 * is always zero if, and only if, the journal was cleanly 234 * unmounted. 235 */ 236 237 if (!sb->s_start) { 238 jbd_debug(1, "No recovery required, last transaction %d\n", 239 (int)ntohl(sb->s_sequence)); 240 journal->j_transaction_sequence = ntohl(sb->s_sequence) + 1; 241 return 0; 242 } 243 244 err = do_one_pass(journal, &info, PASS_SCAN); 245 if (!err) 246 err = do_one_pass(journal, &info, PASS_REVOKE); 247 if (!err) 248 err = do_one_pass(journal, &info, PASS_REPLAY); 249 250 jbd_debug(0, "JBD: recovery, exit status %d, " 251 "recovered transactions %u to %u\n", 252 err, info.start_transaction, info.end_transaction); 253 jbd_debug(0, "JBD: Replayed %d and revoked %d/%d blocks\n", 254 info.nr_replays, info.nr_revoke_hits, info.nr_revokes); 255 256 /* Restart the log at the next transaction ID, thus invalidating 257 * any existing commit records in the log. */ 258 journal->j_transaction_sequence = ++info.end_transaction; 259 260 journal_clear_revoke(journal); 261 sync_blockdev(journal->j_fs_dev); 262 return err; 263 } 264 265 /** 266 * int journal_skip_recovery() - Start journal and wipe exiting records 267 * @journal: journal to startup 268 * 269 * Locate any valid recovery information from the journal and set up the 270 * journal structures in memory to ignore it (presumably because the 271 * caller has evidence that it is out of date). 272 * This function does'nt appear to be exorted.. 273 * 274 * We perform one pass over the journal to allow us to tell the user how 275 * much recovery information is being erased, and to let us initialise 276 * the journal transaction sequence numbers to the next unused ID. 277 */ 278 int journal_skip_recovery(journal_t *journal) 279 { 280 int err; 281 journal_superblock_t * sb; 282 283 struct recovery_info info; 284 285 memset (&info, 0, sizeof(info)); 286 sb = journal->j_superblock; 287 288 err = do_one_pass(journal, &info, PASS_SCAN); 289 290 if (err) { 291 printk(KERN_ERR "JBD: error %d scanning journal\n", err); 292 ++journal->j_transaction_sequence; 293 } else { 294 #ifdef CONFIG_JBD_DEBUG 295 int dropped = info.end_transaction - ntohl(sb->s_sequence); 296 #endif 297 jbd_debug(0, 298 "JBD: ignoring %d transaction%s from the journal.\n", 299 dropped, (dropped == 1) ? "" : "s"); 300 journal->j_transaction_sequence = ++info.end_transaction; 301 } 302 303 journal->j_tail = 0; 304 return err; 305 } 306 307 static int do_one_pass(journal_t *journal, 308 struct recovery_info *info, enum passtype pass) 309 { 310 unsigned int first_commit_ID, next_commit_ID; 311 unsigned long next_log_block; 312 int err, success = 0; 313 journal_superblock_t * sb; 314 journal_header_t * tmp; 315 struct buffer_head * bh; 316 unsigned int sequence; 317 int blocktype; 318 319 /* Precompute the maximum metadata descriptors in a descriptor block */ 320 int MAX_BLOCKS_PER_DESC; 321 MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t)) 322 / sizeof(journal_block_tag_t)); 323 324 /* 325 * First thing is to establish what we expect to find in the log 326 * (in terms of transaction IDs), and where (in terms of log 327 * block offsets): query the superblock. 328 */ 329 330 sb = journal->j_superblock; 331 next_commit_ID = ntohl(sb->s_sequence); 332 next_log_block = ntohl(sb->s_start); 333 334 first_commit_ID = next_commit_ID; 335 if (pass == PASS_SCAN) 336 info->start_transaction = first_commit_ID; 337 338 jbd_debug(1, "Starting recovery pass %d\n", pass); 339 340 /* 341 * Now we walk through the log, transaction by transaction, 342 * making sure that each transaction has a commit block in the 343 * expected place. Each complete transaction gets replayed back 344 * into the main filesystem. 345 */ 346 347 while (1) { 348 int flags; 349 char * tagp; 350 journal_block_tag_t * tag; 351 struct buffer_head * obh; 352 struct buffer_head * nbh; 353 354 /* If we already know where to stop the log traversal, 355 * check right now that we haven't gone past the end of 356 * the log. */ 357 358 if (pass != PASS_SCAN) 359 if (tid_geq(next_commit_ID, info->end_transaction)) 360 break; 361 362 jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n", 363 next_commit_ID, next_log_block, journal->j_last); 364 365 /* Skip over each chunk of the transaction looking 366 * either the next descriptor block or the final commit 367 * record. */ 368 369 jbd_debug(3, "JBD: checking block %ld\n", next_log_block); 370 err = jread(&bh, journal, next_log_block); 371 if (err) 372 goto failed; 373 374 next_log_block++; 375 wrap(journal, next_log_block); 376 377 /* What kind of buffer is it? 378 * 379 * If it is a descriptor block, check that it has the 380 * expected sequence number. Otherwise, we're all done 381 * here. */ 382 383 tmp = (journal_header_t *)bh->b_data; 384 385 if (tmp->h_magic != htonl(JFS_MAGIC_NUMBER)) { 386 brelse(bh); 387 break; 388 } 389 390 blocktype = ntohl(tmp->h_blocktype); 391 sequence = ntohl(tmp->h_sequence); 392 jbd_debug(3, "Found magic %d, sequence %d\n", 393 blocktype, sequence); 394 395 if (sequence != next_commit_ID) { 396 brelse(bh); 397 break; 398 } 399 400 /* OK, we have a valid descriptor block which matches 401 * all of the sequence number checks. What are we going 402 * to do with it? That depends on the pass... */ 403 404 switch(blocktype) { 405 case JFS_DESCRIPTOR_BLOCK: 406 /* If it is a valid descriptor block, replay it 407 * in pass REPLAY; otherwise, just skip over the 408 * blocks it describes. */ 409 if (pass != PASS_REPLAY) { 410 next_log_block += 411 count_tags(bh, journal->j_blocksize); 412 wrap(journal, next_log_block); 413 brelse(bh); 414 continue; 415 } 416 417 /* A descriptor block: we can now write all of 418 * the data blocks. Yay, useful work is finally 419 * getting done here! */ 420 421 tagp = &bh->b_data[sizeof(journal_header_t)]; 422 while ((tagp - bh->b_data +sizeof(journal_block_tag_t)) 423 <= journal->j_blocksize) { 424 unsigned long io_block; 425 426 tag = (journal_block_tag_t *) tagp; 427 flags = ntohl(tag->t_flags); 428 429 io_block = next_log_block++; 430 wrap(journal, next_log_block); 431 err = jread(&obh, journal, io_block); 432 if (err) { 433 /* Recover what we can, but 434 * report failure at the end. */ 435 success = err; 436 printk (KERN_ERR 437 "JBD: IO error %d recovering " 438 "block %lu in log\n", 439 err, io_block); 440 } else { 441 unsigned long blocknr; 442 443 J_ASSERT(obh != NULL); 444 blocknr = ntohl(tag->t_blocknr); 445 446 /* If the block has been 447 * revoked, then we're all done 448 * here. */ 449 if (journal_test_revoke 450 (journal, blocknr, 451 next_commit_ID)) { 452 brelse(obh); 453 ++info->nr_revoke_hits; 454 goto skip_write; 455 } 456 457 /* Find a buffer for the new 458 * data being restored */ 459 nbh = __getblk(journal->j_fs_dev, 460 blocknr, 461 journal->j_blocksize); 462 if (nbh == NULL) { 463 printk(KERN_ERR 464 "JBD: Out of memory " 465 "during recovery.\n"); 466 err = -ENOMEM; 467 brelse(bh); 468 brelse(obh); 469 goto failed; 470 } 471 472 lock_buffer(nbh); 473 memcpy(nbh->b_data, obh->b_data, 474 journal->j_blocksize); 475 if (flags & JFS_FLAG_ESCAPE) { 476 *((unsigned int *)bh->b_data) = 477 htonl(JFS_MAGIC_NUMBER); 478 } 479 480 BUFFER_TRACE(nbh, "marking dirty"); 481 set_buffer_uptodate(nbh); 482 mark_buffer_dirty(nbh); 483 BUFFER_TRACE(nbh, "marking uptodate"); 484 ++info->nr_replays; 485 /* ll_rw_block(WRITE, 1, &nbh); */ 486 unlock_buffer(nbh); 487 brelse(obh); 488 brelse(nbh); 489 } 490 491 skip_write: 492 tagp += sizeof(journal_block_tag_t); 493 if (!(flags & JFS_FLAG_SAME_UUID)) 494 tagp += 16; 495 496 if (flags & JFS_FLAG_LAST_TAG) 497 break; 498 } 499 500 brelse(bh); 501 continue; 502 503 case JFS_COMMIT_BLOCK: 504 /* Found an expected commit block: not much to 505 * do other than move on to the next sequence 506 * number. */ 507 brelse(bh); 508 next_commit_ID++; 509 continue; 510 511 case JFS_REVOKE_BLOCK: 512 /* If we aren't in the REVOKE pass, then we can 513 * just skip over this block. */ 514 if (pass != PASS_REVOKE) { 515 brelse(bh); 516 continue; 517 } 518 519 err = scan_revoke_records(journal, bh, 520 next_commit_ID, info); 521 brelse(bh); 522 if (err) 523 goto failed; 524 continue; 525 526 default: 527 jbd_debug(3, "Unrecognised magic %d, end of scan.\n", 528 blocktype); 529 brelse(bh); 530 goto done; 531 } 532 } 533 534 done: 535 /* 536 * We broke out of the log scan loop: either we came to the 537 * known end of the log or we found an unexpected block in the 538 * log. If the latter happened, then we know that the "current" 539 * transaction marks the end of the valid log. 540 */ 541 542 if (pass == PASS_SCAN) 543 info->end_transaction = next_commit_ID; 544 else { 545 /* It's really bad news if different passes end up at 546 * different places (but possible due to IO errors). */ 547 if (info->end_transaction != next_commit_ID) { 548 printk (KERN_ERR "JBD: recovery pass %d ended at " 549 "transaction %u, expected %u\n", 550 pass, next_commit_ID, info->end_transaction); 551 if (!success) 552 success = -EIO; 553 } 554 } 555 556 return success; 557 558 failed: 559 return err; 560 } 561 562 563 /* Scan a revoke record, marking all blocks mentioned as revoked. */ 564 565 static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, 566 tid_t sequence, struct recovery_info *info) 567 { 568 journal_revoke_header_t *header; 569 int offset, max; 570 571 header = (journal_revoke_header_t *) bh->b_data; 572 offset = sizeof(journal_revoke_header_t); 573 max = ntohl(header->r_count); 574 575 while (offset < max) { 576 unsigned long blocknr; 577 int err; 578 579 blocknr = ntohl(* ((unsigned int *) (bh->b_data+offset))); 580 offset += 4; 581 err = journal_set_revoke(journal, blocknr, sequence); 582 if (err) 583 return err; 584 ++info->nr_revokes; 585 } 586 return 0; 587 } 588