1 /* 2 * Block driver for the QCOW version 2 format 3 * 4 * Copyright (c) 2004-2006 Fabrice Bellard 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a copy 7 * of this software and associated documentation files (the "Software"), to deal 8 * in the Software without restriction, including without limitation the rights 9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 * copies of the Software, and to permit persons to whom the Software is 11 * furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice shall be included in 14 * all copies or substantial portions of the Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 22 * THE SOFTWARE. 23 */ 24 #include "qemu-common.h" 25 #include "block_int.h" 26 #include "module.h" 27 #include <zlib.h> 28 #include "aes.h" 29 #include "block/qcow2.h" 30 31 /* 32 Differences with QCOW: 33 34 - Support for multiple incremental snapshots. 35 - Memory management by reference counts. 36 - Clusters which have a reference count of one have the bit 37 QCOW_OFLAG_COPIED to optimize write performance. 38 - Size of compressed clusters is stored in sectors to reduce bit usage 39 in the cluster offsets. 40 - Support for storing additional data (such as the VM state) in the 41 snapshots. 42 - If a backing store is used, the cluster size is not constrained 43 (could be backported to QCOW). 44 - L2 tables have always a size of one cluster. 45 */ 46 47 //#define DEBUG_ALLOC 48 //#define DEBUG_ALLOC2 49 //#define DEBUG_EXT 50 51 52 typedef struct { 53 uint32_t magic; 54 uint32_t len; 55 } QCowExtension; 56 #define QCOW_EXT_MAGIC_END 0 57 #define QCOW_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA 58 59 60 61 static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename) 62 { 63 const QCowHeader *cow_header = (const void *)buf; 64 65 if (buf_size >= sizeof(QCowHeader) && 66 be32_to_cpu(cow_header->magic) == QCOW_MAGIC && 67 be32_to_cpu(cow_header->version) == QCOW_VERSION) 68 return 100; 69 else 70 return 0; 71 } 72 73 74 /* 75 * read qcow2 extension and fill bs 76 * start reading from start_offset 77 * finish reading upon magic of value 0 or when end_offset reached 78 * unknown magic is skipped (future extension this version knows nothing about) 79 * return 0 upon success, non-0 otherwise 80 */ 81 static int qcow_read_extensions(BlockDriverState *bs, uint64_t start_offset, 82 uint64_t end_offset) 83 { 84 BDRVQcowState *s = bs->opaque; 85 QCowExtension ext; 86 uint64_t offset; 87 88 #ifdef DEBUG_EXT 89 printf("qcow_read_extensions: start=%ld end=%ld\n", start_offset, end_offset); 90 #endif 91 offset = start_offset; 92 while (offset < end_offset) { 93 94 #ifdef DEBUG_EXT 95 /* Sanity check */ 96 if (offset > s->cluster_size) 97 printf("qcow_handle_extension: suspicious offset %lu\n", offset); 98 99 printf("attemting to read extended header in offset %lu\n", offset); 100 #endif 101 102 if (bdrv_pread(s->hd, offset, &ext, sizeof(ext)) != sizeof(ext)) { 103 fprintf(stderr, "qcow_handle_extension: ERROR: pread fail from offset %llu\n", 104 (unsigned long long)offset); 105 return 1; 106 } 107 be32_to_cpus(&ext.magic); 108 be32_to_cpus(&ext.len); 109 offset += sizeof(ext); 110 #ifdef DEBUG_EXT 111 printf("ext.magic = 0x%x\n", ext.magic); 112 #endif 113 switch (ext.magic) { 114 case QCOW_EXT_MAGIC_END: 115 return 0; 116 117 case QCOW_EXT_MAGIC_BACKING_FORMAT: 118 if (ext.len >= sizeof(bs->backing_format)) { 119 fprintf(stderr, "ERROR: ext_backing_format: len=%u too large" 120 " (>=%zu)\n", 121 ext.len, sizeof(bs->backing_format)); 122 return 2; 123 } 124 if (bdrv_pread(s->hd, offset , bs->backing_format, 125 ext.len) != ext.len) 126 return 3; 127 bs->backing_format[ext.len] = '\0'; 128 #ifdef DEBUG_EXT 129 printf("Qcow2: Got format extension %s\n", bs->backing_format); 130 #endif 131 offset += ((ext.len + 7) & ~7); 132 break; 133 134 default: 135 /* unknown magic -- just skip it */ 136 offset += ((ext.len + 7) & ~7); 137 break; 138 } 139 } 140 141 return 0; 142 } 143 144 145 static int qcow_open(BlockDriverState *bs, const char *filename, int flags) 146 { 147 BDRVQcowState *s = bs->opaque; 148 int len, i, shift, ret; 149 QCowHeader header; 150 uint64_t ext_end; 151 152 /* Performance is terrible right now with cache=writethrough due mainly 153 * to reference count updates. If the user does not explicitly specify 154 * a caching type, force to writeback caching. 155 */ 156 if ((flags & BDRV_O_CACHE_DEF)) { 157 flags |= BDRV_O_CACHE_WB; 158 flags &= ~BDRV_O_CACHE_DEF; 159 } 160 ret = bdrv_file_open(&s->hd, filename, flags); 161 if (ret < 0) 162 return ret; 163 if (bdrv_pread(s->hd, 0, &header, sizeof(header)) != sizeof(header)) 164 goto fail; 165 be32_to_cpus(&header.magic); 166 be32_to_cpus(&header.version); 167 be64_to_cpus(&header.backing_file_offset); 168 be32_to_cpus(&header.backing_file_size); 169 be64_to_cpus(&header.size); 170 be32_to_cpus(&header.cluster_bits); 171 be32_to_cpus(&header.crypt_method); 172 be64_to_cpus(&header.l1_table_offset); 173 be32_to_cpus(&header.l1_size); 174 be64_to_cpus(&header.refcount_table_offset); 175 be32_to_cpus(&header.refcount_table_clusters); 176 be64_to_cpus(&header.snapshots_offset); 177 be32_to_cpus(&header.nb_snapshots); 178 179 if (header.magic != QCOW_MAGIC || header.version != QCOW_VERSION) 180 goto fail; 181 if (header.size <= 1 || 182 header.cluster_bits < MIN_CLUSTER_BITS || 183 header.cluster_bits > MAX_CLUSTER_BITS) 184 goto fail; 185 if (header.crypt_method > QCOW_CRYPT_AES) 186 goto fail; 187 s->crypt_method_header = header.crypt_method; 188 if (s->crypt_method_header) 189 bs->encrypted = 1; 190 s->cluster_bits = header.cluster_bits; 191 s->cluster_size = 1 << s->cluster_bits; 192 s->cluster_sectors = 1 << (s->cluster_bits - 9); 193 s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */ 194 s->l2_size = 1 << s->l2_bits; 195 bs->total_sectors = header.size / 512; 196 s->csize_shift = (62 - (s->cluster_bits - 8)); 197 s->csize_mask = (1 << (s->cluster_bits - 8)) - 1; 198 s->cluster_offset_mask = (1LL << s->csize_shift) - 1; 199 s->refcount_table_offset = header.refcount_table_offset; 200 s->refcount_table_size = 201 header.refcount_table_clusters << (s->cluster_bits - 3); 202 203 s->snapshots_offset = header.snapshots_offset; 204 s->nb_snapshots = header.nb_snapshots; 205 206 /* read the level 1 table */ 207 s->l1_size = header.l1_size; 208 shift = s->cluster_bits + s->l2_bits; 209 s->l1_vm_state_index = (header.size + (1LL << shift) - 1) >> shift; 210 /* the L1 table must contain at least enough entries to put 211 header.size bytes */ 212 if (s->l1_size < s->l1_vm_state_index) 213 goto fail; 214 s->l1_table_offset = header.l1_table_offset; 215 s->l1_table = qemu_malloc(s->l1_size * sizeof(uint64_t)); 216 if (bdrv_pread(s->hd, s->l1_table_offset, s->l1_table, s->l1_size * sizeof(uint64_t)) != 217 s->l1_size * sizeof(uint64_t)) 218 goto fail; 219 for(i = 0;i < s->l1_size; i++) { 220 be64_to_cpus(&s->l1_table[i]); 221 } 222 /* alloc L2 cache */ 223 s->l2_cache = qemu_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t)); 224 s->cluster_cache = qemu_malloc(s->cluster_size); 225 /* one more sector for decompressed data alignment */ 226 s->cluster_data = qemu_malloc(QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size 227 + 512); 228 s->cluster_cache_offset = -1; 229 230 if (qcow2_refcount_init(bs) < 0) 231 goto fail; 232 233 /* read qcow2 extensions */ 234 if (header.backing_file_offset) 235 ext_end = header.backing_file_offset; 236 else 237 ext_end = s->cluster_size; 238 if (qcow_read_extensions(bs, sizeof(header), ext_end)) 239 goto fail; 240 241 /* read the backing file name */ 242 if (header.backing_file_offset != 0) { 243 len = header.backing_file_size; 244 if (len > 1023) 245 len = 1023; 246 if (bdrv_pread(s->hd, header.backing_file_offset, bs->backing_file, len) != len) 247 goto fail; 248 bs->backing_file[len] = '\0'; 249 } 250 if (qcow2_read_snapshots(bs) < 0) 251 goto fail; 252 253 #ifdef DEBUG_ALLOC 254 check_refcounts(bs); 255 #endif 256 return 0; 257 258 fail: 259 qcow2_free_snapshots(bs); 260 qcow2_refcount_close(bs); 261 qemu_free(s->l1_table); 262 qemu_free(s->l2_cache); 263 qemu_free(s->cluster_cache); 264 qemu_free(s->cluster_data); 265 bdrv_delete(s->hd); 266 return -1; 267 } 268 269 static int qcow_set_key(BlockDriverState *bs, const char *key) 270 { 271 BDRVQcowState *s = bs->opaque; 272 uint8_t keybuf[16]; 273 int len, i; 274 275 memset(keybuf, 0, 16); 276 len = strlen(key); 277 if (len > 16) 278 len = 16; 279 /* XXX: we could compress the chars to 7 bits to increase 280 entropy */ 281 for(i = 0;i < len;i++) { 282 keybuf[i] = key[i]; 283 } 284 s->crypt_method = s->crypt_method_header; 285 286 if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0) 287 return -1; 288 if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0) 289 return -1; 290 #if 0 291 /* test */ 292 { 293 uint8_t in[16]; 294 uint8_t out[16]; 295 uint8_t tmp[16]; 296 for(i=0;i<16;i++) 297 in[i] = i; 298 AES_encrypt(in, tmp, &s->aes_encrypt_key); 299 AES_decrypt(tmp, out, &s->aes_decrypt_key); 300 for(i = 0; i < 16; i++) 301 printf(" %02x", tmp[i]); 302 printf("\n"); 303 for(i = 0; i < 16; i++) 304 printf(" %02x", out[i]); 305 printf("\n"); 306 } 307 #endif 308 return 0; 309 } 310 311 static int qcow_is_allocated(BlockDriverState *bs, int64_t sector_num, 312 int nb_sectors, int *pnum) 313 { 314 uint64_t cluster_offset; 315 316 *pnum = nb_sectors; 317 cluster_offset = qcow2_get_cluster_offset(bs, sector_num << 9, pnum); 318 319 return (cluster_offset != 0); 320 } 321 322 /* handle reading after the end of the backing file */ 323 int qcow2_backing_read1(BlockDriverState *bs, 324 int64_t sector_num, uint8_t *buf, int nb_sectors) 325 { 326 int n1; 327 if ((sector_num + nb_sectors) <= bs->total_sectors) 328 return nb_sectors; 329 if (sector_num >= bs->total_sectors) 330 n1 = 0; 331 else 332 n1 = bs->total_sectors - sector_num; 333 memset(buf + n1 * 512, 0, 512 * (nb_sectors - n1)); 334 return n1; 335 } 336 337 typedef struct QCowAIOCB { 338 BlockDriverAIOCB common; 339 int64_t sector_num; 340 QEMUIOVector *qiov; 341 uint8_t *buf; 342 void *orig_buf; 343 int nb_sectors; 344 int n; 345 uint64_t cluster_offset; 346 uint8_t *cluster_data; 347 BlockDriverAIOCB *hd_aiocb; 348 struct iovec hd_iov; 349 QEMUIOVector hd_qiov; 350 QEMUBH *bh; 351 QCowL2Meta l2meta; 352 } QCowAIOCB; 353 354 static void qcow_aio_cancel(BlockDriverAIOCB *blockacb) 355 { 356 QCowAIOCB *acb = (QCowAIOCB *)blockacb; 357 if (acb->hd_aiocb) 358 bdrv_aio_cancel(acb->hd_aiocb); 359 qemu_aio_release(acb); 360 } 361 362 static AIOPool qcow_aio_pool = { 363 .aiocb_size = sizeof(QCowAIOCB), 364 .cancel = qcow_aio_cancel, 365 }; 366 367 static void qcow_aio_read_cb(void *opaque, int ret); 368 static void qcow_aio_read_bh(void *opaque) 369 { 370 QCowAIOCB *acb = opaque; 371 qemu_bh_delete(acb->bh); 372 acb->bh = NULL; 373 qcow_aio_read_cb(opaque, 0); 374 } 375 376 static int qcow_schedule_bh(QEMUBHFunc *cb, QCowAIOCB *acb) 377 { 378 if (acb->bh) 379 return -EIO; 380 381 acb->bh = qemu_bh_new(cb, acb); 382 if (!acb->bh) 383 return -EIO; 384 385 qemu_bh_schedule(acb->bh); 386 387 return 0; 388 } 389 390 static void qcow_aio_read_cb(void *opaque, int ret) 391 { 392 QCowAIOCB *acb = opaque; 393 BlockDriverState *bs = acb->common.bs; 394 BDRVQcowState *s = bs->opaque; 395 int index_in_cluster, n1; 396 397 acb->hd_aiocb = NULL; 398 if (ret < 0) 399 goto done; 400 401 /* post process the read buffer */ 402 if (!acb->cluster_offset) { 403 /* nothing to do */ 404 } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) { 405 /* nothing to do */ 406 } else { 407 if (s->crypt_method) { 408 qcow2_encrypt_sectors(s, acb->sector_num, acb->buf, acb->buf, 409 acb->n, 0, 410 &s->aes_decrypt_key); 411 } 412 } 413 414 acb->nb_sectors -= acb->n; 415 acb->sector_num += acb->n; 416 acb->buf += acb->n * 512; 417 418 if (acb->nb_sectors == 0) { 419 /* request completed */ 420 ret = 0; 421 goto done; 422 } 423 424 /* prepare next AIO request */ 425 acb->n = acb->nb_sectors; 426 acb->cluster_offset = 427 qcow2_get_cluster_offset(bs, acb->sector_num << 9, &acb->n); 428 index_in_cluster = acb->sector_num & (s->cluster_sectors - 1); 429 430 if (!acb->cluster_offset) { 431 if (bs->backing_hd) { 432 /* read from the base image */ 433 n1 = qcow2_backing_read1(bs->backing_hd, acb->sector_num, 434 acb->buf, acb->n); 435 if (n1 > 0) { 436 acb->hd_iov.iov_base = (void *)acb->buf; 437 acb->hd_iov.iov_len = acb->n * 512; 438 qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1); 439 acb->hd_aiocb = bdrv_aio_readv(bs->backing_hd, acb->sector_num, 440 &acb->hd_qiov, acb->n, 441 qcow_aio_read_cb, acb); 442 if (acb->hd_aiocb == NULL) 443 goto done; 444 } else { 445 ret = qcow_schedule_bh(qcow_aio_read_bh, acb); 446 if (ret < 0) 447 goto done; 448 } 449 } else { 450 /* Note: in this case, no need to wait */ 451 memset(acb->buf, 0, 512 * acb->n); 452 ret = qcow_schedule_bh(qcow_aio_read_bh, acb); 453 if (ret < 0) 454 goto done; 455 } 456 } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) { 457 /* add AIO support for compressed blocks ? */ 458 if (qcow2_decompress_cluster(s, acb->cluster_offset) < 0) 459 goto done; 460 memcpy(acb->buf, 461 s->cluster_cache + index_in_cluster * 512, 512 * acb->n); 462 ret = qcow_schedule_bh(qcow_aio_read_bh, acb); 463 if (ret < 0) 464 goto done; 465 } else { 466 if ((acb->cluster_offset & 511) != 0) { 467 ret = -EIO; 468 goto done; 469 } 470 471 acb->hd_iov.iov_base = (void *)acb->buf; 472 acb->hd_iov.iov_len = acb->n * 512; 473 qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1); 474 acb->hd_aiocb = bdrv_aio_readv(s->hd, 475 (acb->cluster_offset >> 9) + index_in_cluster, 476 &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb); 477 if (acb->hd_aiocb == NULL) 478 goto done; 479 } 480 481 return; 482 done: 483 if (acb->qiov->niov > 1) { 484 qemu_iovec_from_buffer(acb->qiov, acb->orig_buf, acb->qiov->size); 485 qemu_vfree(acb->orig_buf); 486 } 487 acb->common.cb(acb->common.opaque, ret); 488 qemu_aio_release(acb); 489 } 490 491 static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs, 492 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, 493 BlockDriverCompletionFunc *cb, void *opaque, int is_write) 494 { 495 QCowAIOCB *acb; 496 497 acb = qemu_aio_get(&qcow_aio_pool, bs, cb, opaque); 498 if (!acb) 499 return NULL; 500 acb->hd_aiocb = NULL; 501 acb->sector_num = sector_num; 502 acb->qiov = qiov; 503 if (qiov->niov > 1) { 504 acb->buf = acb->orig_buf = qemu_blockalign(bs, qiov->size); 505 if (is_write) 506 qemu_iovec_to_buffer(qiov, acb->buf); 507 } else { 508 acb->buf = (uint8_t *)qiov->iov->iov_base; 509 } 510 acb->nb_sectors = nb_sectors; 511 acb->n = 0; 512 acb->cluster_offset = 0; 513 acb->l2meta.nb_clusters = 0; 514 return acb; 515 } 516 517 static BlockDriverAIOCB *qcow_aio_readv(BlockDriverState *bs, 518 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, 519 BlockDriverCompletionFunc *cb, void *opaque) 520 { 521 QCowAIOCB *acb; 522 523 acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); 524 if (!acb) 525 return NULL; 526 527 qcow_aio_read_cb(acb, 0); 528 return &acb->common; 529 } 530 531 static void qcow_aio_write_cb(void *opaque, int ret) 532 { 533 QCowAIOCB *acb = opaque; 534 BlockDriverState *bs = acb->common.bs; 535 BDRVQcowState *s = bs->opaque; 536 int index_in_cluster; 537 const uint8_t *src_buf; 538 int n_end; 539 540 acb->hd_aiocb = NULL; 541 542 if (ret < 0) 543 goto done; 544 545 if (qcow2_alloc_cluster_link_l2(bs, acb->cluster_offset, &acb->l2meta) < 0) { 546 qcow2_free_any_clusters(bs, acb->cluster_offset, acb->l2meta.nb_clusters); 547 goto done; 548 } 549 550 acb->nb_sectors -= acb->n; 551 acb->sector_num += acb->n; 552 acb->buf += acb->n * 512; 553 554 if (acb->nb_sectors == 0) { 555 /* request completed */ 556 ret = 0; 557 goto done; 558 } 559 560 index_in_cluster = acb->sector_num & (s->cluster_sectors - 1); 561 n_end = index_in_cluster + acb->nb_sectors; 562 if (s->crypt_method && 563 n_end > QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors) 564 n_end = QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors; 565 566 acb->cluster_offset = qcow2_alloc_cluster_offset(bs, acb->sector_num << 9, 567 index_in_cluster, 568 n_end, &acb->n, &acb->l2meta); 569 if (!acb->cluster_offset || (acb->cluster_offset & 511) != 0) { 570 ret = -EIO; 571 goto done; 572 } 573 if (s->crypt_method) { 574 if (!acb->cluster_data) { 575 acb->cluster_data = qemu_mallocz(QCOW_MAX_CRYPT_CLUSTERS * 576 s->cluster_size); 577 } 578 qcow2_encrypt_sectors(s, acb->sector_num, acb->cluster_data, acb->buf, 579 acb->n, 1, &s->aes_encrypt_key); 580 src_buf = acb->cluster_data; 581 } else { 582 src_buf = acb->buf; 583 } 584 acb->hd_iov.iov_base = (void *)src_buf; 585 acb->hd_iov.iov_len = acb->n * 512; 586 qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1); 587 acb->hd_aiocb = bdrv_aio_writev(s->hd, 588 (acb->cluster_offset >> 9) + index_in_cluster, 589 &acb->hd_qiov, acb->n, 590 qcow_aio_write_cb, acb); 591 if (acb->hd_aiocb == NULL) 592 goto done; 593 594 return; 595 596 done: 597 if (acb->qiov->niov > 1) 598 qemu_vfree(acb->orig_buf); 599 acb->common.cb(acb->common.opaque, ret); 600 qemu_aio_release(acb); 601 } 602 603 static BlockDriverAIOCB *qcow_aio_writev(BlockDriverState *bs, 604 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, 605 BlockDriverCompletionFunc *cb, void *opaque) 606 { 607 BDRVQcowState *s = bs->opaque; 608 QCowAIOCB *acb; 609 610 s->cluster_cache_offset = -1; /* disable compressed cache */ 611 612 acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); 613 if (!acb) 614 return NULL; 615 616 qcow_aio_write_cb(acb, 0); 617 return &acb->common; 618 } 619 620 static void qcow_close(BlockDriverState *bs) 621 { 622 BDRVQcowState *s = bs->opaque; 623 qemu_free(s->l1_table); 624 qemu_free(s->l2_cache); 625 qemu_free(s->cluster_cache); 626 qemu_free(s->cluster_data); 627 qcow2_refcount_close(bs); 628 bdrv_delete(s->hd); 629 } 630 631 static int get_bits_from_size(size_t size) 632 { 633 int res = 0; 634 635 if (size == 0) { 636 return -1; 637 } 638 639 while (size != 1) { 640 /* Not a power of two */ 641 if (size & 1) { 642 return -1; 643 } 644 645 size >>= 1; 646 res++; 647 } 648 649 return res; 650 } 651 652 static int qcow_create2(const char *filename, int64_t total_size, 653 const char *backing_file, const char *backing_format, 654 int flags, size_t cluster_size) 655 { 656 657 int fd, header_size, backing_filename_len, l1_size, i, shift, l2_bits; 658 int ref_clusters, backing_format_len = 0; 659 QCowHeader header; 660 uint64_t tmp, offset; 661 QCowCreateState s1, *s = &s1; 662 QCowExtension ext_bf = {0, 0}; 663 664 665 memset(s, 0, sizeof(*s)); 666 667 fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644); 668 if (fd < 0) 669 return -1; 670 memset(&header, 0, sizeof(header)); 671 header.magic = cpu_to_be32(QCOW_MAGIC); 672 header.version = cpu_to_be32(QCOW_VERSION); 673 header.size = cpu_to_be64(total_size * 512); 674 header_size = sizeof(header); 675 backing_filename_len = 0; 676 if (backing_file) { 677 if (backing_format) { 678 ext_bf.magic = QCOW_EXT_MAGIC_BACKING_FORMAT; 679 backing_format_len = strlen(backing_format); 680 ext_bf.len = (backing_format_len + 7) & ~7; 681 header_size += ((sizeof(ext_bf) + ext_bf.len + 7) & ~7); 682 } 683 header.backing_file_offset = cpu_to_be64(header_size); 684 backing_filename_len = strlen(backing_file); 685 header.backing_file_size = cpu_to_be32(backing_filename_len); 686 header_size += backing_filename_len; 687 } 688 689 /* Cluster size */ 690 s->cluster_bits = get_bits_from_size(cluster_size); 691 if (s->cluster_bits < MIN_CLUSTER_BITS || 692 s->cluster_bits > MAX_CLUSTER_BITS) 693 { 694 fprintf(stderr, "Cluster size must be a power of two between " 695 "%d and %dk\n", 696 1 << MIN_CLUSTER_BITS, 697 1 << (MAX_CLUSTER_BITS - 10)); 698 return -EINVAL; 699 } 700 s->cluster_size = 1 << s->cluster_bits; 701 702 header.cluster_bits = cpu_to_be32(s->cluster_bits); 703 header_size = (header_size + 7) & ~7; 704 if (flags & BLOCK_FLAG_ENCRYPT) { 705 header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES); 706 } else { 707 header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE); 708 } 709 l2_bits = s->cluster_bits - 3; 710 shift = s->cluster_bits + l2_bits; 711 l1_size = (((total_size * 512) + (1LL << shift) - 1) >> shift); 712 offset = align_offset(header_size, s->cluster_size); 713 s->l1_table_offset = offset; 714 header.l1_table_offset = cpu_to_be64(s->l1_table_offset); 715 header.l1_size = cpu_to_be32(l1_size); 716 offset += align_offset(l1_size * sizeof(uint64_t), s->cluster_size); 717 718 s->refcount_table = qemu_mallocz(s->cluster_size); 719 720 s->refcount_table_offset = offset; 721 header.refcount_table_offset = cpu_to_be64(offset); 722 header.refcount_table_clusters = cpu_to_be32(1); 723 offset += s->cluster_size; 724 s->refcount_block_offset = offset; 725 726 /* count how many refcount blocks needed */ 727 tmp = offset >> s->cluster_bits; 728 ref_clusters = (tmp >> (s->cluster_bits - REFCOUNT_SHIFT)) + 1; 729 for (i=0; i < ref_clusters; i++) { 730 s->refcount_table[i] = cpu_to_be64(offset); 731 offset += s->cluster_size; 732 } 733 734 s->refcount_block = qemu_mallocz(ref_clusters * s->cluster_size); 735 736 /* update refcounts */ 737 qcow2_create_refcount_update(s, 0, header_size); 738 qcow2_create_refcount_update(s, s->l1_table_offset, 739 l1_size * sizeof(uint64_t)); 740 qcow2_create_refcount_update(s, s->refcount_table_offset, s->cluster_size); 741 qcow2_create_refcount_update(s, s->refcount_block_offset, 742 ref_clusters * s->cluster_size); 743 744 /* write all the data */ 745 write(fd, &header, sizeof(header)); 746 if (backing_file) { 747 if (backing_format_len) { 748 char zero[16]; 749 int d = ext_bf.len - backing_format_len; 750 751 memset(zero, 0, sizeof(zero)); 752 cpu_to_be32s(&ext_bf.magic); 753 cpu_to_be32s(&ext_bf.len); 754 write(fd, &ext_bf, sizeof(ext_bf)); 755 write(fd, backing_format, backing_format_len); 756 if (d>0) { 757 write(fd, zero, d); 758 } 759 } 760 write(fd, backing_file, backing_filename_len); 761 } 762 lseek(fd, s->l1_table_offset, SEEK_SET); 763 tmp = 0; 764 for(i = 0;i < l1_size; i++) { 765 write(fd, &tmp, sizeof(tmp)); 766 } 767 lseek(fd, s->refcount_table_offset, SEEK_SET); 768 write(fd, s->refcount_table, s->cluster_size); 769 770 lseek(fd, s->refcount_block_offset, SEEK_SET); 771 write(fd, s->refcount_block, ref_clusters * s->cluster_size); 772 773 qemu_free(s->refcount_table); 774 qemu_free(s->refcount_block); 775 close(fd); 776 return 0; 777 } 778 779 static int qcow_create(const char *filename, QEMUOptionParameter *options) 780 { 781 const char *backing_file = NULL; 782 const char *backing_fmt = NULL; 783 uint64_t sectors = 0; 784 int flags = 0; 785 size_t cluster_size = 65536; 786 787 /* Read out options */ 788 while (options && options->name) { 789 if (!strcmp(options->name, BLOCK_OPT_SIZE)) { 790 sectors = options->value.n / 512; 791 } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) { 792 backing_file = options->value.s; 793 } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FMT)) { 794 backing_fmt = options->value.s; 795 } else if (!strcmp(options->name, BLOCK_OPT_ENCRYPT)) { 796 flags |= options->value.n ? BLOCK_FLAG_ENCRYPT : 0; 797 } else if (!strcmp(options->name, BLOCK_OPT_CLUSTER_SIZE)) { 798 if (options->value.n) { 799 cluster_size = options->value.n; 800 } 801 } 802 options++; 803 } 804 805 return qcow_create2(filename, sectors, backing_file, backing_fmt, flags, 806 cluster_size); 807 } 808 809 static int qcow_make_empty(BlockDriverState *bs) 810 { 811 #if 0 812 /* XXX: not correct */ 813 BDRVQcowState *s = bs->opaque; 814 uint32_t l1_length = s->l1_size * sizeof(uint64_t); 815 int ret; 816 817 memset(s->l1_table, 0, l1_length); 818 if (bdrv_pwrite(s->hd, s->l1_table_offset, s->l1_table, l1_length) < 0) 819 return -1; 820 ret = bdrv_truncate(s->hd, s->l1_table_offset + l1_length); 821 if (ret < 0) 822 return ret; 823 824 l2_cache_reset(bs); 825 #endif 826 return 0; 827 } 828 829 /* XXX: put compressed sectors first, then all the cluster aligned 830 tables to avoid losing bytes in alignment */ 831 static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num, 832 const uint8_t *buf, int nb_sectors) 833 { 834 BDRVQcowState *s = bs->opaque; 835 z_stream strm; 836 int ret, out_len; 837 uint8_t *out_buf; 838 uint64_t cluster_offset; 839 840 if (nb_sectors == 0) { 841 /* align end of file to a sector boundary to ease reading with 842 sector based I/Os */ 843 cluster_offset = bdrv_getlength(s->hd); 844 cluster_offset = (cluster_offset + 511) & ~511; 845 bdrv_truncate(s->hd, cluster_offset); 846 return 0; 847 } 848 849 if (nb_sectors != s->cluster_sectors) 850 return -EINVAL; 851 852 out_buf = qemu_malloc(s->cluster_size + (s->cluster_size / 1000) + 128); 853 854 /* best compression, small window, no zlib header */ 855 memset(&strm, 0, sizeof(strm)); 856 ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION, 857 Z_DEFLATED, -12, 858 9, Z_DEFAULT_STRATEGY); 859 if (ret != 0) { 860 qemu_free(out_buf); 861 return -1; 862 } 863 864 strm.avail_in = s->cluster_size; 865 strm.next_in = (uint8_t *)buf; 866 strm.avail_out = s->cluster_size; 867 strm.next_out = out_buf; 868 869 ret = deflate(&strm, Z_FINISH); 870 if (ret != Z_STREAM_END && ret != Z_OK) { 871 qemu_free(out_buf); 872 deflateEnd(&strm); 873 return -1; 874 } 875 out_len = strm.next_out - out_buf; 876 877 deflateEnd(&strm); 878 879 if (ret != Z_STREAM_END || out_len >= s->cluster_size) { 880 /* could not compress: write normal cluster */ 881 bdrv_write(bs, sector_num, buf, s->cluster_sectors); 882 } else { 883 cluster_offset = qcow2_alloc_compressed_cluster_offset(bs, 884 sector_num << 9, out_len); 885 if (!cluster_offset) 886 return -1; 887 cluster_offset &= s->cluster_offset_mask; 888 if (bdrv_pwrite(s->hd, cluster_offset, out_buf, out_len) != out_len) { 889 qemu_free(out_buf); 890 return -1; 891 } 892 } 893 894 qemu_free(out_buf); 895 return 0; 896 } 897 898 static void qcow_flush(BlockDriverState *bs) 899 { 900 BDRVQcowState *s = bs->opaque; 901 bdrv_flush(s->hd); 902 } 903 904 static int qcow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) 905 { 906 BDRVQcowState *s = bs->opaque; 907 bdi->cluster_size = s->cluster_size; 908 bdi->vm_state_offset = (int64_t)s->l1_vm_state_index << 909 (s->cluster_bits + s->l2_bits); 910 return 0; 911 } 912 913 914 static int qcow_check(BlockDriverState *bs) 915 { 916 return qcow2_check_refcounts(bs); 917 } 918 919 #if 0 920 static void dump_refcounts(BlockDriverState *bs) 921 { 922 BDRVQcowState *s = bs->opaque; 923 int64_t nb_clusters, k, k1, size; 924 int refcount; 925 926 size = bdrv_getlength(s->hd); 927 nb_clusters = size_to_clusters(s, size); 928 for(k = 0; k < nb_clusters;) { 929 k1 = k; 930 refcount = get_refcount(bs, k); 931 k++; 932 while (k < nb_clusters && get_refcount(bs, k) == refcount) 933 k++; 934 printf("%lld: refcount=%d nb=%lld\n", k, refcount, k - k1); 935 } 936 } 937 #endif 938 939 static int qcow_put_buffer(BlockDriverState *bs, const uint8_t *buf, 940 int64_t pos, int size) 941 { 942 int growable = bs->growable; 943 944 bs->growable = 1; 945 bdrv_pwrite(bs, pos, buf, size); 946 bs->growable = growable; 947 948 return size; 949 } 950 951 static int qcow_get_buffer(BlockDriverState *bs, uint8_t *buf, 952 int64_t pos, int size) 953 { 954 int growable = bs->growable; 955 int ret; 956 957 bs->growable = 1; 958 ret = bdrv_pread(bs, pos, buf, size); 959 bs->growable = growable; 960 961 return ret; 962 } 963 964 static QEMUOptionParameter qcow_create_options[] = { 965 { 966 .name = BLOCK_OPT_SIZE, 967 .type = OPT_SIZE, 968 .help = "Virtual disk size" 969 }, 970 { 971 .name = BLOCK_OPT_BACKING_FILE, 972 .type = OPT_STRING, 973 .help = "File name of a base image" 974 }, 975 { 976 .name = BLOCK_OPT_BACKING_FMT, 977 .type = OPT_STRING, 978 .help = "Image format of the base image" 979 }, 980 { 981 .name = BLOCK_OPT_ENCRYPT, 982 .type = OPT_FLAG, 983 .help = "Encrypt the image" 984 }, 985 { 986 .name = BLOCK_OPT_CLUSTER_SIZE, 987 .type = OPT_SIZE, 988 .help = "qcow2 cluster size" 989 }, 990 { NULL } 991 }; 992 993 static BlockDriver bdrv_qcow2 = { 994 .format_name = "qcow2", 995 .instance_size = sizeof(BDRVQcowState), 996 .bdrv_probe = qcow_probe, 997 .bdrv_open = qcow_open, 998 .bdrv_close = qcow_close, 999 .bdrv_create = qcow_create, 1000 .bdrv_flush = qcow_flush, 1001 .bdrv_is_allocated = qcow_is_allocated, 1002 .bdrv_set_key = qcow_set_key, 1003 .bdrv_make_empty = qcow_make_empty, 1004 1005 .bdrv_aio_readv = qcow_aio_readv, 1006 .bdrv_aio_writev = qcow_aio_writev, 1007 .bdrv_write_compressed = qcow_write_compressed, 1008 1009 .bdrv_snapshot_create = qcow2_snapshot_create, 1010 .bdrv_snapshot_goto = qcow2_snapshot_goto, 1011 .bdrv_snapshot_delete = qcow2_snapshot_delete, 1012 .bdrv_snapshot_list = qcow2_snapshot_list, 1013 .bdrv_get_info = qcow_get_info, 1014 1015 .bdrv_put_buffer = qcow_put_buffer, 1016 .bdrv_get_buffer = qcow_get_buffer, 1017 1018 .create_options = qcow_create_options, 1019 .bdrv_check = qcow_check, 1020 }; 1021 1022 static void bdrv_qcow2_init(void) 1023 { 1024 bdrv_register(&bdrv_qcow2); 1025 } 1026 1027 block_init(bdrv_qcow2_init); 1028