1 /* 2 * linux/fs/jbd/recovery.c 3 * 4 * Written by Stephen C. Tweedie <sct (at) redhat.com>, 1999 5 * 6 * Copyright 1999-2000 Red Hat Software --- All Rights Reserved 7 * 8 * This file is part of the Linux kernel and is made available under 9 * the terms of the GNU General Public License, version 2, or at your 10 * option, any later version, incorporated herein by reference. 11 * 12 * Journal recovery routines for the generic filesystem journaling code; 13 * part of the ext2fs journaling system. 14 */ 15 16 #ifndef __KERNEL__ 17 #include "jfs_user.h" 18 #else 19 #include <linux/time.h> 20 #include <linux/fs.h> 21 #include <linux/jbd.h> 22 #include <linux/errno.h> 23 #include <linux/slab.h> 24 #endif 25 26 /* 27 * Maintain information about the progress of the recovery job, so that 28 * the different passes can carry information between them. 29 */ 30 struct recovery_info 31 { 32 tid_t start_transaction; 33 tid_t end_transaction; 34 35 int nr_replays; 36 int nr_revokes; 37 int nr_revoke_hits; 38 }; 39 40 enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY}; 41 static int do_one_pass(journal_t *journal, 42 struct recovery_info *info, enum passtype pass); 43 static int scan_revoke_records(journal_t *, struct buffer_head *, 44 tid_t, struct recovery_info *); 45 46 #ifdef __KERNEL__ 47 48 /* Release readahead buffers after use */ 49 static void journal_brelse_array(struct buffer_head *b[], int n) 50 { 51 while (--n >= 0) 52 brelse (b[n]); 53 } 54 55 56 /* 57 * When reading from the journal, we are going through the block device 58 * layer directly and so there is no readahead being done for us. We 59 * need to implement any readahead ourselves if we want it to happen at 60 * all. Recovery is basically one long sequential read, so make sure we 61 * do the IO in reasonably large chunks. 62 * 63 * This is not so critical that we need to be enormously clever about 64 * the readahead size, though. 128K is a purely arbitrary, good-enough 65 * fixed value. 66 */ 67 68 #define MAXBUF 8 69 static int do_readahead(journal_t *journal, unsigned int start) 70 { 71 int err; 72 unsigned int max, nbufs, next; 73 unsigned long long blocknr; 74 struct buffer_head *bh; 75 76 struct buffer_head * bufs[MAXBUF]; 77 78 /* Do up to 128K of readahead */ 79 max = start + (128 * 1024 / journal->j_blocksize); 80 if (max > journal->j_maxlen) 81 max = journal->j_maxlen; 82 83 /* Do the readahead itself. We'll submit MAXBUF buffer_heads at 84 * a time to the block device IO layer. */ 85 86 nbufs = 0; 87 88 for (next = start; next < max; next++) { 89 err = journal_bmap(journal, next, &blocknr); 90 91 if (err) { 92 printk (KERN_ERR "JBD: bad block at offset %u\n", 93 next); 94 goto failed; 95 } 96 97 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); 98 if (!bh) { 99 err = -ENOMEM; 100 goto failed; 101 } 102 103 if (!buffer_uptodate(bh) && !buffer_locked(bh)) { 104 bufs[nbufs++] = bh; 105 if (nbufs == MAXBUF) { 106 ll_rw_block(READ, nbufs, bufs); 107 journal_brelse_array(bufs, nbufs); 108 nbufs = 0; 109 } 110 } else 111 brelse(bh); 112 } 113 114 if (nbufs) 115 ll_rw_block(READ, nbufs, bufs); 116 err = 0; 117 118 failed: 119 if (nbufs) 120 journal_brelse_array(bufs, nbufs); 121 return err; 122 } 123 124 #endif /* __KERNEL__ */ 125 126 127 /* 128 * Read a block from the journal 129 */ 130 131 static int jread(struct buffer_head **bhp, journal_t *journal, 132 unsigned int offset) 133 { 134 int err; 135 unsigned long long blocknr; 136 struct buffer_head *bh; 137 138 *bhp = NULL; 139 140 if (offset >= journal->j_maxlen) { 141 printk(KERN_ERR "JBD: corrupted journal superblock\n"); 142 return -EIO; 143 } 144 145 err = journal_bmap(journal, offset, &blocknr); 146 147 if (err) { 148 printk (KERN_ERR "JBD: bad block at offset %u\n", 149 offset); 150 return err; 151 } 152 153 bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); 154 if (!bh) 155 return -ENOMEM; 156 157 if (!buffer_uptodate(bh)) { 158 /* If this is a brand new buffer, start readahead. 159 Otherwise, we assume we are already reading it. */ 160 if (!buffer_req(bh)) 161 do_readahead(journal, offset); 162 wait_on_buffer(bh); 163 } 164 165 if (!buffer_uptodate(bh)) { 166 printk (KERN_ERR "JBD: Failed to read block at offset %u\n", 167 offset); 168 brelse(bh); 169 return -EIO; 170 } 171 172 *bhp = bh; 173 return 0; 174 } 175 176 177 /* 178 * Count the number of in-use tags in a journal descriptor block. 179 */ 180 181 static int count_tags(journal_t *journal, struct buffer_head *bh) 182 { 183 char * tagp; 184 journal_block_tag_t * tag; 185 int nr = 0, size = journal->j_blocksize; 186 int tag_bytes = journal_tag_bytes(journal); 187 188 tagp = &bh->b_data[sizeof(journal_header_t)]; 189 190 while ((tagp - bh->b_data + tag_bytes) <= size) { 191 tag = (journal_block_tag_t *) tagp; 192 193 nr++; 194 tagp += tag_bytes; 195 if (!(tag->t_flags & cpu_to_be32(JFS_FLAG_SAME_UUID))) 196 tagp += 16; 197 198 if (tag->t_flags & cpu_to_be32(JFS_FLAG_LAST_TAG)) 199 break; 200 } 201 202 return nr; 203 } 204 205 206 /* Make sure we wrap around the log correctly! */ 207 #define wrap(journal, var) \ 208 do { \ 209 if (var >= (journal)->j_last) \ 210 var -= ((journal)->j_last - (journal)->j_first); \ 211 } while (0) 212 213 /** 214 * journal_recover - recovers a on-disk journal 215 * @journal: the journal to recover 216 * 217 * The primary function for recovering the log contents when mounting a 218 * journaled device. 219 * 220 * Recovery is done in three passes. In the first pass, we look for the 221 * end of the log. In the second, we assemble the list of revoke 222 * blocks. In the third and final pass, we replay any un-revoked blocks 223 * in the log. 224 */ 225 int journal_recover(journal_t *journal) 226 { 227 int err; 228 journal_superblock_t * sb; 229 230 struct recovery_info info; 231 232 memset(&info, 0, sizeof(info)); 233 sb = journal->j_superblock; 234 235 /* 236 * The journal superblock's s_start field (the current log head) 237 * is always zero if, and only if, the journal was cleanly 238 * unmounted. 239 */ 240 241 if (!sb->s_start) { 242 jbd_debug(1, "No recovery required, last transaction %d\n", 243 be32_to_cpu(sb->s_sequence)); 244 journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1; 245 return 0; 246 } 247 248 err = do_one_pass(journal, &info, PASS_SCAN); 249 if (!err) 250 err = do_one_pass(journal, &info, PASS_REVOKE); 251 if (!err) 252 err = do_one_pass(journal, &info, PASS_REPLAY); 253 254 jbd_debug(1, "JBD: recovery, exit status %d, " 255 "recovered transactions %u to %u\n", 256 err, info.start_transaction, info.end_transaction); 257 jbd_debug(1, "JBD: Replayed %d and revoked %d/%d blocks\n", 258 info.nr_replays, info.nr_revoke_hits, info.nr_revokes); 259 260 /* Restart the log at the next transaction ID, thus invalidating 261 * any existing commit records in the log. */ 262 journal->j_transaction_sequence = ++info.end_transaction; 263 264 journal_clear_revoke(journal); 265 sync_blockdev(journal->j_fs_dev); 266 return err; 267 } 268 269 /** 270 * journal_skip_recovery - Start journal and wipe exiting records 271 * @journal: journal to startup 272 * 273 * Locate any valid recovery information from the journal and set up the 274 * journal structures in memory to ignore it (presumably because the 275 * caller has evidence that it is out of date). 276 * This function does'nt appear to be exorted.. 277 * 278 * We perform one pass over the journal to allow us to tell the user how 279 * much recovery information is being erased, and to let us initialise 280 * the journal transaction sequence numbers to the next unused ID. 281 */ 282 int journal_skip_recovery(journal_t *journal) 283 { 284 int err; 285 struct recovery_info info; 286 287 memset (&info, 0, sizeof(info)); 288 289 err = do_one_pass(journal, &info, PASS_SCAN); 290 291 if (err) { 292 printk(KERN_ERR "JBD: error %d scanning journal\n", err); 293 ++journal->j_transaction_sequence; 294 } else { 295 #ifdef CONFIG_JBD_DEBUG 296 journal_superblock_t *sb = journal->j_superblock; 297 298 int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence); 299 #endif 300 jbd_debug(1, 301 "JBD: ignoring %d transaction%s from the journal.\n", 302 dropped, (dropped == 1) ? "" : "s"); 303 journal->j_transaction_sequence = ++info.end_transaction; 304 } 305 306 journal->j_tail = 0; 307 return err; 308 } 309 310 static inline unsigned long long read_tag_block(int tag_bytes, journal_block_tag_t *tag) 311 { 312 unsigned long long block = be32_to_cpu(tag->t_blocknr); 313 if (tag_bytes > JBD_TAG_SIZE32) 314 block |= (__u64)be32_to_cpu(tag->t_blocknr_high) << 32; 315 return block; 316 } 317 318 /* 319 * calc_chksums calculates the checksums for the blocks described in the 320 * descriptor block. 321 */ 322 static int calc_chksums(journal_t *journal, struct buffer_head *bh, 323 unsigned long long *next_log_block, __u32 *crc32_sum) 324 { 325 int i, num_blks, err; 326 unsigned long long io_block; 327 struct buffer_head *obh; 328 329 num_blks = count_tags(journal, bh); 330 /* Calculate checksum of the descriptor block. */ 331 *crc32_sum = crc32_be(*crc32_sum, (void *)bh->b_data, bh->b_size); 332 333 for (i = 0; i < num_blks; i++) { 334 io_block = (*next_log_block)++; 335 wrap(journal, *next_log_block); 336 err = jread(&obh, journal, io_block); 337 if (err) { 338 printk(KERN_ERR "JBD: IO error %d recovering block " 339 "%llu in log\n", err, io_block); 340 return 1; 341 } else { 342 *crc32_sum = crc32_be(*crc32_sum, (void *)obh->b_data, 343 obh->b_size); 344 } 345 brelse(obh); 346 } 347 return 0; 348 } 349 350 static int do_one_pass(journal_t *journal, 351 struct recovery_info *info, enum passtype pass) 352 { 353 unsigned int first_commit_ID, next_commit_ID; 354 unsigned long long next_log_block; 355 int err, success = 0; 356 journal_superblock_t * sb; 357 journal_header_t * tmp; 358 struct buffer_head * bh; 359 unsigned int sequence; 360 int blocktype; 361 int tag_bytes = journal_tag_bytes(journal); 362 __u32 crc32_sum = ~0; /* Transactional Checksums */ 363 364 /* 365 * First thing is to establish what we expect to find in the log 366 * (in terms of transaction IDs), and where (in terms of log 367 * block offsets): query the superblock. 368 */ 369 370 sb = journal->j_superblock; 371 next_commit_ID = be32_to_cpu(sb->s_sequence); 372 next_log_block = be32_to_cpu(sb->s_start); 373 374 first_commit_ID = next_commit_ID; 375 if (pass == PASS_SCAN) 376 info->start_transaction = first_commit_ID; 377 378 jbd_debug(1, "Starting recovery pass %d\n", pass); 379 380 /* 381 * Now we walk through the log, transaction by transaction, 382 * making sure that each transaction has a commit block in the 383 * expected place. Each complete transaction gets replayed back 384 * into the main filesystem. 385 */ 386 387 while (1) { 388 int flags; 389 char * tagp; 390 journal_block_tag_t * tag; 391 struct buffer_head * obh; 392 struct buffer_head * nbh; 393 394 cond_resched(); 395 396 /* If we already know where to stop the log traversal, 397 * check right now that we haven't gone past the end of 398 * the log. */ 399 400 if (pass != PASS_SCAN) 401 if (tid_geq(next_commit_ID, info->end_transaction)) 402 break; 403 404 jbd_debug(2, "Scanning for sequence ID %u at %llu/%lu\n", 405 next_commit_ID, next_log_block, journal->j_last); 406 407 /* Skip over each chunk of the transaction looking 408 * either the next descriptor block or the final commit 409 * record. */ 410 411 jbd_debug(3, "JBD: checking block %llu\n", next_log_block); 412 err = jread(&bh, journal, next_log_block); 413 if (err) 414 goto failed; 415 416 next_log_block++; 417 wrap(journal, next_log_block); 418 419 /* What kind of buffer is it? 420 * 421 * If it is a descriptor block, check that it has the 422 * expected sequence number. Otherwise, we're all done 423 * here. */ 424 425 tmp = (journal_header_t *)bh->b_data; 426 427 if (tmp->h_magic != cpu_to_be32(JFS_MAGIC_NUMBER)) { 428 brelse(bh); 429 break; 430 } 431 432 blocktype = be32_to_cpu(tmp->h_blocktype); 433 sequence = be32_to_cpu(tmp->h_sequence); 434 jbd_debug(3, "Found magic %d, sequence %d\n", 435 blocktype, sequence); 436 437 if (sequence != next_commit_ID) { 438 brelse(bh); 439 break; 440 } 441 442 /* OK, we have a valid descriptor block which matches 443 * all of the sequence number checks. What are we going 444 * to do with it? That depends on the pass... */ 445 446 switch(blocktype) { 447 case JFS_DESCRIPTOR_BLOCK: 448 /* If it is a valid descriptor block, replay it 449 * in pass REPLAY; if journal_checksums enabled, then 450 * calculate checksums in PASS_SCAN, otherwise, 451 * just skip over the blocks it describes. */ 452 if (pass != PASS_REPLAY) { 453 if (pass == PASS_SCAN && 454 JFS_HAS_COMPAT_FEATURE(journal, 455 JFS_FEATURE_COMPAT_CHECKSUM) && 456 !info->end_transaction) { 457 if (calc_chksums(journal, bh, 458 &next_log_block, 459 &crc32_sum)) { 460 brelse(bh); 461 break; 462 } 463 brelse(bh); 464 continue; 465 } 466 next_log_block += count_tags(journal, bh); 467 wrap(journal, next_log_block); 468 brelse(bh); 469 continue; 470 } 471 472 /* A descriptor block: we can now write all of 473 * the data blocks. Yay, useful work is finally 474 * getting done here! */ 475 476 tagp = &bh->b_data[sizeof(journal_header_t)]; 477 while ((tagp - bh->b_data + tag_bytes) 478 <= journal->j_blocksize) { 479 unsigned long long io_block; 480 481 tag = (journal_block_tag_t *) tagp; 482 flags = be32_to_cpu(tag->t_flags); 483 484 io_block = next_log_block++; 485 wrap(journal, next_log_block); 486 err = jread(&obh, journal, io_block); 487 if (err) { 488 /* Recover what we can, but 489 * report failure at the end. */ 490 success = err; 491 printk (KERN_ERR 492 "JBD: IO error %d recovering " 493 "block %llu in log\n", 494 err, io_block); 495 } else { 496 unsigned long long blocknr; 497 498 J_ASSERT(obh != NULL); 499 blocknr = read_tag_block(tag_bytes, 500 tag); 501 502 /* If the block has been 503 * revoked, then we're all done 504 * here. */ 505 if (journal_test_revoke 506 (journal, blocknr, 507 next_commit_ID)) { 508 brelse(obh); 509 ++info->nr_revoke_hits; 510 goto skip_write; 511 } 512 513 /* Find a buffer for the new 514 * data being restored */ 515 nbh = __getblk(journal->j_fs_dev, 516 blocknr, 517 journal->j_blocksize); 518 if (nbh == NULL) { 519 printk(KERN_ERR 520 "JBD: Out of memory " 521 "during recovery.\n"); 522 err = -ENOMEM; 523 brelse(bh); 524 brelse(obh); 525 goto failed; 526 } 527 528 lock_buffer(nbh); 529 memcpy(nbh->b_data, obh->b_data, 530 journal->j_blocksize); 531 if (flags & JFS_FLAG_ESCAPE) { 532 journal_header_t *header; 533 534 header = (journal_header_t *) &nbh->b_data[0]; 535 header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER); 536 } 537 538 BUFFER_TRACE(nbh, "marking dirty"); 539 set_buffer_uptodate(nbh); 540 mark_buffer_dirty(nbh); 541 BUFFER_TRACE(nbh, "marking uptodate"); 542 ++info->nr_replays; 543 /* ll_rw_block(WRITE, 1, &nbh); */ 544 unlock_buffer(nbh); 545 brelse(obh); 546 brelse(nbh); 547 } 548 549 skip_write: 550 tagp += tag_bytes; 551 if (!(flags & JFS_FLAG_SAME_UUID)) 552 tagp += 16; 553 554 if (flags & JFS_FLAG_LAST_TAG) 555 break; 556 } 557 558 brelse(bh); 559 continue; 560 561 case JFS_COMMIT_BLOCK: 562 jbd_debug(3, "Commit block for #%u found\n", 563 next_commit_ID); 564 /* How to differentiate between interrupted commit 565 * and journal corruption ? 566 * 567 * {nth transaction} 568 * Checksum Verification Failed 569 * | 570 * ____________________ 571 * | | 572 * async_commit sync_commit 573 * | | 574 * | GO TO NEXT "Journal Corruption" 575 * | TRANSACTION 576 * | 577 * {(n+1)th transanction} 578 * | 579 * _______|______________ 580 * | | 581 * Commit block found Commit block not found 582 * | | 583 * "Journal Corruption" | 584 * _____________|_________ 585 * | | 586 * nth trans corrupt OR nth trans 587 * and (n+1)th interrupted interrupted 588 * before commit block 589 * could reach the disk. 590 * (Cannot find the difference in above 591 * mentioned conditions. Hence assume 592 * "Interrupted Commit".) 593 */ 594 595 /* Found an expected commit block: if checksums 596 * are present verify them in PASS_SCAN; else not 597 * much to do other than move on to the next sequence 598 * number. */ 599 if (pass == PASS_SCAN && 600 JFS_HAS_COMPAT_FEATURE(journal, 601 JFS_FEATURE_COMPAT_CHECKSUM)) { 602 int chksum_err, chksum_seen; 603 struct commit_header *cbh = 604 (struct commit_header *)bh->b_data; 605 unsigned found_chksum = 606 be32_to_cpu(cbh->h_chksum[0]); 607 608 chksum_err = chksum_seen = 0; 609 610 jbd_debug(3, "Checksums %x %x\n", 611 crc32_sum, found_chksum); 612 if (info->end_transaction) { 613 journal->j_failed_commit = 614 info->end_transaction; 615 brelse(bh); 616 break; 617 } 618 619 if (crc32_sum == found_chksum && 620 cbh->h_chksum_type == JBD2_CRC32_CHKSUM && 621 cbh->h_chksum_size == 622 JBD2_CRC32_CHKSUM_SIZE) 623 chksum_seen = 1; 624 else if (!(cbh->h_chksum_type == 0 && 625 cbh->h_chksum_size == 0 && 626 found_chksum == 0 && 627 !chksum_seen)) 628 /* 629 * If fs is mounted using an old kernel and then 630 * kernel with journal_chksum is used then we 631 * get a situation where the journal flag has 632 * checksum flag set but checksums are not 633 * present i.e chksum = 0, in the individual 634 * commit blocks. 635 * Hence to avoid checksum failures, in this 636 * situation, this extra check is added. 637 */ 638 chksum_err = 1; 639 640 if (chksum_err) { 641 info->end_transaction = next_commit_ID; 642 jbd_debug(1, "Checksum_err %x %x\n", 643 crc32_sum, found_chksum); 644 if (!JFS_HAS_INCOMPAT_FEATURE(journal, 645 JFS_FEATURE_INCOMPAT_ASYNC_COMMIT)){ 646 journal->j_failed_commit = 647 next_commit_ID; 648 brelse(bh); 649 break; 650 } 651 } 652 crc32_sum = ~0; 653 } 654 brelse(bh); 655 next_commit_ID++; 656 continue; 657 658 case JFS_REVOKE_BLOCK: 659 /* If we aren't in the REVOKE pass, then we can 660 * just skip over this block. */ 661 if (pass != PASS_REVOKE) { 662 brelse(bh); 663 continue; 664 } 665 666 err = scan_revoke_records(journal, bh, 667 next_commit_ID, info); 668 brelse(bh); 669 if (err) 670 goto failed; 671 continue; 672 673 default: 674 jbd_debug(3, "Unrecognised magic %d, end of scan.\n", 675 blocktype); 676 brelse(bh); 677 goto done; 678 } 679 } 680 681 done: 682 /* 683 * We broke out of the log scan loop: either we came to the 684 * known end of the log or we found an unexpected block in the 685 * log. If the latter happened, then we know that the "current" 686 * transaction marks the end of the valid log. 687 */ 688 689 if (pass == PASS_SCAN) { 690 if (!info->end_transaction) 691 info->end_transaction = next_commit_ID; 692 } else { 693 /* It's really bad news if different passes end up at 694 * different places (but possible due to IO errors). */ 695 if (info->end_transaction != next_commit_ID) { 696 printk (KERN_ERR "JBD: recovery pass %d ended at " 697 "transaction %u, expected %u\n", 698 pass, next_commit_ID, info->end_transaction); 699 if (!success) 700 success = -EIO; 701 } 702 } 703 704 return success; 705 706 failed: 707 return err; 708 } 709 710 711 /* Scan a revoke record, marking all blocks mentioned as revoked. */ 712 713 static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, 714 tid_t sequence, struct recovery_info *info) 715 { 716 journal_revoke_header_t *header; 717 int offset, max; 718 int record_len = 4; 719 720 header = (journal_revoke_header_t *) bh->b_data; 721 offset = sizeof(journal_revoke_header_t); 722 max = be32_to_cpu(header->r_count); 723 724 if (JFS_HAS_INCOMPAT_FEATURE(journal, JFS_FEATURE_INCOMPAT_64BIT)) 725 record_len = 8; 726 727 while (offset < max) { 728 unsigned long long blocknr; 729 int err; 730 731 if (record_len == 4) { 732 __be32 b; 733 memcpy(&b, bh->b_data + offset, sizeof(__be32)); 734 blocknr = ext2fs_be32_to_cpu(b); 735 } else { 736 __be64 b; 737 memcpy(&b, bh->b_data + offset, sizeof(__be64)); 738 blocknr = ext2fs_be64_to_cpu(b); 739 } 740 741 offset += record_len; 742 err = journal_set_revoke(journal, blocknr, sequence); 743 if (err) 744 return err; 745 ++info->nr_revokes; 746 } 747 return 0; 748 } 749