Home | History | Annotate | Download | only in block
      1 /*
      2  * Block driver for the QCOW version 2 format
      3  *
      4  * Copyright (c) 2004-2006 Fabrice Bellard
      5  *
      6  * Permission is hereby granted, free of charge, to any person obtaining a copy
      7  * of this software and associated documentation files (the "Software"), to deal
      8  * in the Software without restriction, including without limitation the rights
      9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     10  * copies of the Software, and to permit persons to whom the Software is
     11  * furnished to do so, subject to the following conditions:
     12  *
     13  * The above copyright notice and this permission notice shall be included in
     14  * all copies or substantial portions of the Software.
     15  *
     16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
     19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     22  * THE SOFTWARE.
     23  */
     24 
     25 #include "qemu-common.h"
     26 #include "block_int.h"
     27 #include "block/qcow2.h"
     28 
     29 static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size);
     30 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
     31                             int64_t offset, int64_t length,
     32                             int addend);
     33 
     34 
     35 static int cache_refcount_updates = 0;
     36 
     37 static int write_refcount_block(BlockDriverState *bs)
     38 {
     39     BDRVQcowState *s = bs->opaque;
     40     size_t size = s->cluster_size;
     41 
     42     if (s->refcount_block_cache_offset == 0) {
     43         return 0;
     44     }
     45 
     46     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_UPDATE);
     47     if (bdrv_pwrite_sync(bs->file, s->refcount_block_cache_offset,
     48             s->refcount_block_cache, size) < 0)
     49     {
     50         return -EIO;
     51     }
     52 
     53     return 0;
     54 }
     55 
     56 /*********************************************************/
     57 /* refcount handling */
     58 
     59 int qcow2_refcount_init(BlockDriverState *bs)
     60 {
     61     BDRVQcowState *s = bs->opaque;
     62     int ret, refcount_table_size2, i;
     63 
     64     s->refcount_block_cache = qemu_malloc(s->cluster_size);
     65     refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t);
     66     s->refcount_table = qemu_malloc(refcount_table_size2);
     67     if (s->refcount_table_size > 0) {
     68         BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD);
     69         ret = bdrv_pread(bs->file, s->refcount_table_offset,
     70                          s->refcount_table, refcount_table_size2);
     71         if (ret != refcount_table_size2)
     72             goto fail;
     73         for(i = 0; i < s->refcount_table_size; i++)
     74             be64_to_cpus(&s->refcount_table[i]);
     75     }
     76     return 0;
     77  fail:
     78     return -ENOMEM;
     79 }
     80 
     81 void qcow2_refcount_close(BlockDriverState *bs)
     82 {
     83     BDRVQcowState *s = bs->opaque;
     84     qemu_free(s->refcount_block_cache);
     85     qemu_free(s->refcount_table);
     86 }
     87 
     88 
     89 static int load_refcount_block(BlockDriverState *bs,
     90                                int64_t refcount_block_offset)
     91 {
     92     BDRVQcowState *s = bs->opaque;
     93     int ret;
     94 
     95     if (cache_refcount_updates) {
     96         ret = write_refcount_block(bs);
     97         if (ret < 0) {
     98             return ret;
     99         }
    100     }
    101 
    102     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD);
    103     ret = bdrv_pread(bs->file, refcount_block_offset, s->refcount_block_cache,
    104                      s->cluster_size);
    105     if (ret < 0) {
    106         return ret;
    107     }
    108 
    109     s->refcount_block_cache_offset = refcount_block_offset;
    110     return 0;
    111 }
    112 
    113 /*
    114  * Returns the refcount of the cluster given by its index. Any non-negative
    115  * return value is the refcount of the cluster, negative values are -errno
    116  * and indicate an error.
    117  */
    118 static int get_refcount(BlockDriverState *bs, int64_t cluster_index)
    119 {
    120     BDRVQcowState *s = bs->opaque;
    121     int refcount_table_index, block_index;
    122     int64_t refcount_block_offset;
    123     int ret;
    124 
    125     refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
    126     if (refcount_table_index >= s->refcount_table_size)
    127         return 0;
    128     refcount_block_offset = s->refcount_table[refcount_table_index];
    129     if (!refcount_block_offset)
    130         return 0;
    131     if (refcount_block_offset != s->refcount_block_cache_offset) {
    132         /* better than nothing: return allocated if read error */
    133         ret = load_refcount_block(bs, refcount_block_offset);
    134         if (ret < 0) {
    135             return ret;
    136         }
    137     }
    138     block_index = cluster_index &
    139         ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
    140     return be16_to_cpu(s->refcount_block_cache[block_index]);
    141 }
    142 
    143 /*
    144  * Rounds the refcount table size up to avoid growing the table for each single
    145  * refcount block that is allocated.
    146  */
    147 static unsigned int next_refcount_table_size(BDRVQcowState *s,
    148     unsigned int min_size)
    149 {
    150     unsigned int min_clusters = (min_size >> (s->cluster_bits - 3)) + 1;
    151     unsigned int refcount_table_clusters =
    152         MAX(1, s->refcount_table_size >> (s->cluster_bits - 3));
    153 
    154     while (min_clusters > refcount_table_clusters) {
    155         refcount_table_clusters = (refcount_table_clusters * 3 + 1) / 2;
    156     }
    157 
    158     return refcount_table_clusters << (s->cluster_bits - 3);
    159 }
    160 
    161 
    162 /* Checks if two offsets are described by the same refcount block */
    163 static int in_same_refcount_block(BDRVQcowState *s, uint64_t offset_a,
    164     uint64_t offset_b)
    165 {
    166     uint64_t block_a = offset_a >> (2 * s->cluster_bits - REFCOUNT_SHIFT);
    167     uint64_t block_b = offset_b >> (2 * s->cluster_bits - REFCOUNT_SHIFT);
    168 
    169     return (block_a == block_b);
    170 }
    171 
/*
 * Loads a refcount block. If it doesn't exist yet, it is allocated first
 * (including growing the refcount table if needed).
 *
 * Returns the offset of the refcount block on success or -errno in error case
 */
static int64_t alloc_refcount_block(BlockDriverState *bs, int64_t cluster_index)
{
    BDRVQcowState *s = bs->opaque;
    unsigned int refcount_table_index;
    int ret;

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);

    /* Find the refcount block for the given cluster */
    refcount_table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);

    if (refcount_table_index < s->refcount_table_size) {

        uint64_t refcount_block_offset =
            s->refcount_table[refcount_table_index];

        /* If it's already there, we're done */
        if (refcount_block_offset) {
            if (refcount_block_offset != s->refcount_block_cache_offset) {
                ret = load_refcount_block(bs, refcount_block_offset);
                if (ret < 0) {
                    return ret;
                }
            }
            return refcount_block_offset;
        }
    }

    /*
     * If we came here, we need to allocate something. Something is at least
     * a cluster for the new refcount block. It may also include a new refcount
     * table if the old refcount table is too small.
     *
     * Note that allocating clusters here needs some special care:
     *
     * - We can't use the normal qcow2_alloc_clusters(), it would try to
     *   increase the refcount and very likely we would end up with an endless
     *   recursion. Instead we must place the refcount blocks in a way that
     *   they can describe them themselves.
     *
     * - We need to consider that at this point we are inside update_refcounts
     *   and doing the initial refcount increase. This means that some clusters
     *   have already been allocated by the caller, but their refcount isn't
     *   accurate yet. free_cluster_index tells us where this allocation ends
     *   as long as we don't overwrite it by freeing clusters.
     *
     * - alloc_clusters_noref and qcow2_free_clusters may load a different
     *   refcount block into the cache
     */

    /* Flush pending cached updates before the cache buffer is reused below */
    if (cache_refcount_updates) {
        ret = write_refcount_block(bs);
        if (ret < 0) {
            return ret;
        }
    }

    /* Allocate the refcount block itself and mark it as used */
    int64_t new_block = alloc_clusters_noref(bs, s->cluster_size);
    if (new_block < 0) {
        return new_block;
    }

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64
        " at %" PRIx64 "\n",
        refcount_table_index, cluster_index << s->cluster_bits, new_block);
#endif

    if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) {
        /* Zero the new refcount block before updating it */
        memset(s->refcount_block_cache, 0, s->cluster_size);
        s->refcount_block_cache_offset = new_block;

        /* The block describes itself, need to update the cache */
        int block_index = (new_block >> s->cluster_bits) &
            ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
        s->refcount_block_cache[block_index] = cpu_to_be16(1);
    } else {
        /* Described somewhere else. This can recurse at most twice before we
         * arrive at a block that describes itself. */
        ret = update_refcount(bs, new_block, s->cluster_size, 1);
        if (ret < 0) {
            goto fail_block;
        }

        /* Initialize the new refcount block only after updating its refcount,
         * update_refcount uses the refcount cache itself */
        memset(s->refcount_block_cache, 0, s->cluster_size);
        s->refcount_block_cache_offset = new_block;
    }

    /* Now the new refcount block needs to be written to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE);
    ret = bdrv_pwrite_sync(bs->file, new_block, s->refcount_block_cache,
        s->cluster_size);
    if (ret < 0) {
        goto fail_block;
    }

    /* If the refcount table is big enough, just hook the block up there */
    if (refcount_table_index < s->refcount_table_size) {
        uint64_t data64 = cpu_to_be64(new_block);
        BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP);
        ret = bdrv_pwrite_sync(bs->file,
            s->refcount_table_offset + refcount_table_index * sizeof(uint64_t),
            &data64, sizeof(data64));
        if (ret < 0) {
            goto fail_block;
        }

        s->refcount_table[refcount_table_index] = new_block;
        return new_block;
    }

    /*
     * If we come here, we need to grow the refcount table. Again, a new
     * refcount table needs some space and we can't simply allocate to avoid
     * endless recursion.
     *
     * Therefore let's grab new refcount blocks at the end of the image, which
     * will describe themselves and the new refcount table. This way we can
     * reference them only in the new table and do the switch to the new
     * refcount table at once without producing an inconsistent state in
     * between.
     */
    BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW);

    /* Calculate the number of refcount blocks needed so far, i.e. enough
     * blocks to cover all clusters allocated up to free_cluster_index */
    uint64_t refcount_block_clusters = 1 << (s->cluster_bits - REFCOUNT_SHIFT);
    uint64_t blocks_used = (s->free_cluster_index +
        refcount_block_clusters - 1) / refcount_block_clusters;

    /* And now we need at least one block more for the new metadata */
    uint64_t table_size = next_refcount_table_size(s, blocks_used + 1);
    uint64_t last_table_size;
    uint64_t blocks_clusters;
    /* Fixpoint iteration: a bigger table needs more refcount blocks to
     * describe it, which in turn may need a bigger table. Iterate until the
     * size is stable. */
    do {
        uint64_t table_clusters = size_to_clusters(s, table_size);
        blocks_clusters = 1 +
            ((table_clusters + refcount_block_clusters - 1)
            / refcount_block_clusters);
        uint64_t meta_clusters = table_clusters + blocks_clusters;

        last_table_size = table_size;
        table_size = next_refcount_table_size(s, blocks_used +
            ((meta_clusters + refcount_block_clusters - 1)
            / refcount_block_clusters));

    } while (last_table_size != table_size);

#ifdef DEBUG_ALLOC2
    fprintf(stderr, "qcow2: Grow refcount table %" PRId32 " => %" PRId64 "\n",
        s->refcount_table_size, table_size);
#endif

    /* Create the new refcount table and blocks; layout is the refcount
     * blocks first (at meta_offset), immediately followed by the table */
    uint64_t meta_offset = (blocks_used * refcount_block_clusters) *
        s->cluster_size;
    uint64_t table_offset = meta_offset + blocks_clusters * s->cluster_size;
    uint16_t *new_blocks = qemu_mallocz(blocks_clusters * s->cluster_size);
    uint64_t *new_table = qemu_mallocz(table_size * sizeof(uint64_t));

    /* The new metadata must not overlap clusters already handed out */
    assert(meta_offset >= (s->free_cluster_index * s->cluster_size));

    /* Fill the new refcount table */
    memcpy(new_table, s->refcount_table,
        s->refcount_table_size * sizeof(uint64_t));
    new_table[refcount_table_index] = new_block;

    int i;
    for (i = 0; i < blocks_clusters; i++) {
        new_table[blocks_used + i] = meta_offset + (i * s->cluster_size);
    }

    /* Fill the refcount blocks: mark every new metadata cluster (table and
     * blocks) with refcount 1 */
    uint64_t table_clusters = size_to_clusters(s, table_size * sizeof(uint64_t));
    int block = 0;
    for (i = 0; i < table_clusters + blocks_clusters; i++) {
        new_blocks[block++] = cpu_to_be16(1);
    }

    /* Write refcount blocks to disk */
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS);
    ret = bdrv_pwrite_sync(bs->file, meta_offset, new_blocks,
        blocks_clusters * s->cluster_size);
    qemu_free(new_blocks);
    if (ret < 0) {
        goto fail_table;
    }

    /* Write refcount table to disk (entries are big-endian on disk) */
    for(i = 0; i < table_size; i++) {
        cpu_to_be64s(&new_table[i]);
    }

    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE);
    ret = bdrv_pwrite_sync(bs->file, table_offset, new_table,
        table_size * sizeof(uint64_t));
    if (ret < 0) {
        goto fail_table;
    }

    /* Convert back to host byte order (the 64-bit byteswap is an
     * involution, so applying it again restores the values) */
    for(i = 0; i < table_size; i++) {
        cpu_to_be64s(&new_table[i]);
    }

    /* Hook up the new refcount table in the qcow2 header: 8-byte table
     * offset followed by the 4-byte cluster count, written in one update */
    uint8_t data[12];
    cpu_to_be64w((uint64_t*)data, table_offset);
    cpu_to_be32w((uint32_t*)(data + 8), table_clusters);
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, refcount_table_offset),
        data, sizeof(data));
    if (ret < 0) {
        goto fail_table;
    }

    /* And switch it in memory */
    uint64_t old_table_offset = s->refcount_table_offset;
    uint64_t old_table_size = s->refcount_table_size;

    qemu_free(s->refcount_table);
    s->refcount_table = new_table;
    s->refcount_table_size = table_size;
    s->refcount_table_offset = table_offset;

    /* Free old table. Remember, we must not change free_cluster_index */
    uint64_t old_free_cluster_index = s->free_cluster_index;
    qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t));
    s->free_cluster_index = old_free_cluster_index;

    ret = load_refcount_block(bs, new_block);
    if (ret < 0) {
        goto fail_block;
    }

    return new_block;

fail_table:
    qemu_free(new_table);
fail_block:
    /* Invalidate the cache; its contents may not match any on-disk block */
    s->refcount_block_cache_offset = 0;
    return ret;
}
    423 
    424 #define REFCOUNTS_PER_SECTOR (512 >> REFCOUNT_SHIFT)
    425 static int write_refcount_block_entries(BlockDriverState *bs,
    426     int64_t refcount_block_offset, int first_index, int last_index)
    427 {
    428     BDRVQcowState *s = bs->opaque;
    429     size_t size;
    430     int ret;
    431 
    432     if (cache_refcount_updates) {
    433         return 0;
    434     }
    435 
    436     if (first_index < 0) {
    437         return 0;
    438     }
    439 
    440     first_index &= ~(REFCOUNTS_PER_SECTOR - 1);
    441     last_index = (last_index + REFCOUNTS_PER_SECTOR)
    442         & ~(REFCOUNTS_PER_SECTOR - 1);
    443 
    444     size = (last_index - first_index) << REFCOUNT_SHIFT;
    445 
    446     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_UPDATE_PART);
    447     ret = bdrv_pwrite_sync(bs->file,
    448         refcount_block_offset + (first_index << REFCOUNT_SHIFT),
    449         &s->refcount_block_cache[first_index], size);
    450     if (ret < 0) {
    451         return ret;
    452     }
    453 
    454     return 0;
    455 }
    456 
/* XXX: cache several refcount block clusters ? */
/*
 * Adds addend to the refcount of every cluster covering the byte range
 * [offset, offset + length). Refcount block writes are batched per block and
 * flushed when the loop moves on to a different block. On error, already
 * applied updates are rolled back on a best-effort basis.
 *
 * Returns 0 on success, -errno on failure.
 */
static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
    int64_t offset, int64_t length, int addend)
{
    BDRVQcowState *s = bs->opaque;
    int64_t start, last, cluster_offset;
    int64_t refcount_block_offset = 0;
    int64_t table_index = -1, old_table_index;
    /* Dirty range within the current refcount block; -1 means "nothing
     * modified yet" */
    int first_index = -1, last_index = -1;
    int ret;

#ifdef DEBUG_ALLOC2
    printf("update_refcount: offset=%" PRId64 " size=%" PRId64 " addend=%d\n",
           offset, length, addend);
#endif
    if (length < 0) {
        return -EINVAL;
    } else if (length == 0) {
        return 0;
    }

    /* Align the range to cluster boundaries */
    start = offset & ~(s->cluster_size - 1);
    last = (offset + length - 1) & ~(s->cluster_size - 1);
    for(cluster_offset = start; cluster_offset <= last;
        cluster_offset += s->cluster_size)
    {
        int block_index, refcount;
        int64_t cluster_index = cluster_offset >> s->cluster_bits;
        int64_t new_block;

        /* Only write refcount block to disk when we are done with it */
        old_table_index = table_index;
        table_index = cluster_index >> (s->cluster_bits - REFCOUNT_SHIFT);
        if ((old_table_index >= 0) && (table_index != old_table_index)) {

            ret = write_refcount_block_entries(bs, refcount_block_offset,
                first_index, last_index);
            if (ret < 0) {
                return ret;
            }

            first_index = -1;
            last_index = -1;
        }

        /* Load the refcount block and allocate it if needed */
        new_block = alloc_refcount_block(bs, cluster_index);
        if (new_block < 0) {
            ret = new_block;
            goto fail;
        }
        refcount_block_offset = new_block;

        /* we can update the count and save it */
        block_index = cluster_index &
            ((1 << (s->cluster_bits - REFCOUNT_SHIFT)) - 1);
        /* Widen the dirty range to include this entry */
        if (first_index == -1 || block_index < first_index) {
            first_index = block_index;
        }
        if (block_index > last_index) {
            last_index = block_index;
        }

        refcount = be16_to_cpu(s->refcount_block_cache[block_index]);
        refcount += addend;
        /* Refcounts are 16-bit on disk; under/overflow is a hard error */
        if (refcount < 0 || refcount > 0xffff) {
            ret = -EINVAL;
            goto fail;
        }
        /* Track the lowest known-free cluster for the allocator */
        if (refcount == 0 && cluster_index < s->free_cluster_index) {
            s->free_cluster_index = cluster_index;
        }
        s->refcount_block_cache[block_index] = cpu_to_be16(refcount);
    }

    ret = 0;
fail:

    /* Write last changed block to disk */
    if (refcount_block_offset != 0) {
        int wret;
        wret = write_refcount_block_entries(bs, refcount_block_offset,
            first_index, last_index);
        if (wret < 0) {
            /* Preserve the original error if we already had one */
            return ret < 0 ? ret : wret;
        }
    }

    /*
     * Try do undo any updates if an error is returned (This may succeed in
     * some cases like ENOSPC for allocating a new refcount block)
     */
    if (ret < 0) {
        int dummy;
        /* Undo only the clusters processed so far; result intentionally
         * ignored — the original error is what gets reported */
        dummy = update_refcount(bs, offset, cluster_offset - offset, -addend);
    }

    return ret;
}
    556 
    557 /*
    558  * Increases or decreases the refcount of a given cluster by one.
    559  * addend must be 1 or -1.
    560  *
    561  * If the return value is non-negative, it is the new refcount of the cluster.
    562  * If it is negative, it is -errno and indicates an error.
    563  */
    564 static int update_cluster_refcount(BlockDriverState *bs,
    565                                    int64_t cluster_index,
    566                                    int addend)
    567 {
    568     BDRVQcowState *s = bs->opaque;
    569     int ret;
    570 
    571     ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend);
    572     if (ret < 0) {
    573         return ret;
    574     }
    575 
    576     return get_refcount(bs, cluster_index);
    577 }
    578 
    579 
    580 
    581 /*********************************************************/
    582 /* cluster allocation functions */
    583 
    584 
    585 
    586 /* return < 0 if error */
    587 static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size)
    588 {
    589     BDRVQcowState *s = bs->opaque;
    590     int i, nb_clusters, refcount;
    591 
    592     nb_clusters = size_to_clusters(s, size);
    593 retry:
    594     for(i = 0; i < nb_clusters; i++) {
    595         int64_t next_cluster_index = s->free_cluster_index++;
    596         refcount = get_refcount(bs, next_cluster_index);
    597 
    598         if (refcount < 0) {
    599             return refcount;
    600         } else if (refcount != 0) {
    601             goto retry;
    602         }
    603     }
    604 #ifdef DEBUG_ALLOC2
    605     printf("alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n",
    606             size,
    607             (s->free_cluster_index - nb_clusters) << s->cluster_bits);
    608 #endif
    609     return (s->free_cluster_index - nb_clusters) << s->cluster_bits;
    610 }
    611 
    612 int64_t qcow2_alloc_clusters(BlockDriverState *bs, int64_t size)
    613 {
    614     int64_t offset;
    615     int ret;
    616 
    617     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC);
    618     offset = alloc_clusters_noref(bs, size);
    619     if (offset < 0) {
    620         return offset;
    621     }
    622 
    623     ret = update_refcount(bs, offset, size, 1);
    624     if (ret < 0) {
    625         return ret;
    626     }
    627     return offset;
    628 }
    629 
    630 /* only used to allocate compressed sectors. We try to allocate
    631    contiguous sectors. size must be <= cluster_size */
    632 int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size)
    633 {
    634     BDRVQcowState *s = bs->opaque;
    635     int64_t offset, cluster_offset;
    636     int free_in_cluster;
    637 
    638     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES);
    639     assert(size > 0 && size <= s->cluster_size);
    640     if (s->free_byte_offset == 0) {
    641         s->free_byte_offset = qcow2_alloc_clusters(bs, s->cluster_size);
    642         if (s->free_byte_offset < 0) {
    643             return s->free_byte_offset;
    644         }
    645     }
    646  redo:
    647     free_in_cluster = s->cluster_size -
    648         (s->free_byte_offset & (s->cluster_size - 1));
    649     if (size <= free_in_cluster) {
    650         /* enough space in current cluster */
    651         offset = s->free_byte_offset;
    652         s->free_byte_offset += size;
    653         free_in_cluster -= size;
    654         if (free_in_cluster == 0)
    655             s->free_byte_offset = 0;
    656         if ((offset & (s->cluster_size - 1)) != 0)
    657             update_cluster_refcount(bs, offset >> s->cluster_bits, 1);
    658     } else {
    659         offset = qcow2_alloc_clusters(bs, s->cluster_size);
    660         if (offset < 0) {
    661             return offset;
    662         }
    663         cluster_offset = s->free_byte_offset & ~(s->cluster_size - 1);
    664         if ((cluster_offset + s->cluster_size) == offset) {
    665             /* we are lucky: contiguous data */
    666             offset = s->free_byte_offset;
    667             update_cluster_refcount(bs, offset >> s->cluster_bits, 1);
    668             s->free_byte_offset += size;
    669         } else {
    670             s->free_byte_offset = offset;
    671             goto redo;
    672         }
    673     }
    674     return offset;
    675 }
    676 
    677 void qcow2_free_clusters(BlockDriverState *bs,
    678                           int64_t offset, int64_t size)
    679 {
    680     int ret;
    681 
    682     BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE);
    683     ret = update_refcount(bs, offset, size, -1);
    684     if (ret < 0) {
    685         fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret));
    686         /* TODO Remember the clusters to free them later and avoid leaking */
    687     }
    688 }
    689 
    690 /*
    691  * free_any_clusters
    692  *
    693  * free clusters according to its type: compressed or not
    694  *
    695  */
    696 
    697 void qcow2_free_any_clusters(BlockDriverState *bs,
    698     uint64_t cluster_offset, int nb_clusters)
    699 {
    700     BDRVQcowState *s = bs->opaque;
    701 
    702     /* free the cluster */
    703 
    704     if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
    705         int nb_csectors;
    706         nb_csectors = ((cluster_offset >> s->csize_shift) &
    707                        s->csize_mask) + 1;
    708         qcow2_free_clusters(bs,
    709             (cluster_offset & s->cluster_offset_mask) & ~511,
    710             nb_csectors * 512);
    711         return;
    712     }
    713 
    714     qcow2_free_clusters(bs, cluster_offset, nb_clusters << s->cluster_bits);
    715 
    716     return;
    717 }
    718 
    719 
    720 
    721 /*********************************************************/
    722 /* snapshots and image creation */
    723 
    724 
    725 
    726 void qcow2_create_refcount_update(QCowCreateState *s, int64_t offset,
    727     int64_t size)
    728 {
    729     int refcount;
    730     int64_t start, last, cluster_offset;
    731     uint16_t *p;
    732 
    733     start = offset & ~(s->cluster_size - 1);
    734     last = (offset + size - 1)  & ~(s->cluster_size - 1);
    735     for(cluster_offset = start; cluster_offset <= last;
    736         cluster_offset += s->cluster_size) {
    737         p = &s->refcount_block[cluster_offset >> s->cluster_bits];
    738         refcount = be16_to_cpu(*p);
    739         refcount++;
    740         *p = cpu_to_be16(refcount);
    741     }
    742 }
    743 
/*
 * Update the refcounts of snapshots and the copied flag.
 *
 * Adjusts by 'addend' the refcount of every cluster reachable from the L1
 * table at l1_table_offset (data clusters, compressed clusters and the L2
 * tables themselves), and recomputes QCOW_OFLAG_COPIED on each visited
 * L1/L2 entry (the flag must be set iff the refcount is exactly 1).
 *
 * addend == 0 only refreshes the copied flags without changing refcounts;
 * a non-zero addend (typically +1/-1 when creating/deleting a snapshot —
 * callers outside this view determine the exact values) is applied to every
 * referenced cluster.
 *
 * Returns 0 on success, -EIO on any read/write or refcount error.
 */
int qcow2_update_snapshot_refcount(BlockDriverState *bs,
    int64_t l1_table_offset, int l1_size, int addend)
{
    BDRVQcowState *s = bs->opaque;
    uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, l1_allocated;
    int64_t old_offset, old_l2_offset;
    int l2_size, i, j, l1_modified, l2_modified, nb_csectors, refcount;

    /* L2 entries may be rewritten below; drop the cached copies and batch
     * refcount block writeback until the end of this function. */
    qcow2_l2_cache_reset(bs);
    cache_refcount_updates = 1;

    l2_table = NULL;
    l1_table = NULL;
    l1_size2 = l1_size * sizeof(uint64_t);
    if (l1_table_offset != s->l1_table_offset) {
        /* Inactive (snapshot) L1 table: read it from disk. On-disk entries
         * are big-endian, so convert in place after the read. */
        if (l1_size2 != 0) {
            l1_table = qemu_mallocz(align_offset(l1_size2, 512));
        } else {
            l1_table = NULL;
        }
        l1_allocated = 1;
        if (bdrv_pread(bs->file, l1_table_offset,
                       l1_table, l1_size2) != l1_size2)
            goto fail;
        for(i = 0;i < l1_size; i++)
            be64_to_cpus(&l1_table[i]);
    } else {
        /* Active L1 table: operate directly on the in-memory
         * (host-endian) copy; do not free it on exit. */
        assert(l1_size == s->l1_size);
        l1_table = s->l1_table;
        l1_allocated = 0;
    }

    l2_size = s->l2_size * sizeof(uint64_t);
    l2_table = qemu_malloc(l2_size);
    l1_modified = 0;
    for(i = 0; i < l1_size; i++) {
        l2_offset = l1_table[i];
        if (l2_offset) {
            /* Remember the raw entry so we only write back on change */
            old_l2_offset = l2_offset;
            l2_offset &= ~QCOW_OFLAG_COPIED;
            l2_modified = 0;
            if (bdrv_pread(bs->file, l2_offset, l2_table, l2_size) != l2_size)
                goto fail;
            for(j = 0; j < s->l2_size; j++) {
                offset = be64_to_cpu(l2_table[j]);
                if (offset != 0) {
                    old_offset = offset;
                    offset &= ~QCOW_OFLAG_COPIED;
                    if (offset & QCOW_OFLAG_COMPRESSED) {
                        /* Compressed cluster: its size is encoded in the L2
                         * entry as a sector count; update the refcount of
                         * the 512-byte-aligned range it occupies. */
                        nb_csectors = ((offset >> s->csize_shift) &
                                       s->csize_mask) + 1;
                        if (addend != 0) {
                            int ret;
                            ret = update_refcount(bs,
                                (offset & s->cluster_offset_mask) & ~511,
                                nb_csectors * 512, addend);
                            if (ret < 0) {
                                goto fail;
                            }
                        }
                        /* compressed clusters are never modified */
                        refcount = 2;
                    } else {
                        if (addend != 0) {
                            refcount = update_cluster_refcount(bs, offset >> s->cluster_bits, addend);
                        } else {
                            refcount = get_refcount(bs, offset >> s->cluster_bits);
                        }

                        if (refcount < 0) {
                            goto fail;
                        }
                    }

                    /* COPIED is set iff exactly one reference remains */
                    if (refcount == 1) {
                        offset |= QCOW_OFLAG_COPIED;
                    }
                    if (offset != old_offset) {
                        l2_table[j] = cpu_to_be64(offset);
                        l2_modified = 1;
                    }
                }
            }
            /* Persist the L2 table if any entry's flags changed */
            if (l2_modified) {
                if (bdrv_pwrite_sync(bs->file,
                                l2_offset, l2_table, l2_size) < 0)
                    goto fail;
            }

            /* The L2 table cluster itself is also referenced; update its
             * refcount and recompute its COPIED flag in the L1 entry. */
            if (addend != 0) {
                refcount = update_cluster_refcount(bs, l2_offset >> s->cluster_bits, addend);
            } else {
                refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
            }
            if (refcount < 0) {
                goto fail;
            } else if (refcount == 1) {
                l2_offset |= QCOW_OFLAG_COPIED;
            }
            if (l2_offset != old_l2_offset) {
                l1_table[i] = l2_offset;
                l1_modified = 1;
            }
        }
    }
    if (l1_modified) {
        /* Write the L1 table back big-endian, then restore the in-memory
         * copy to host order (it may be the live s->l1_table). */
        for(i = 0; i < l1_size; i++)
            cpu_to_be64s(&l1_table[i]);
        if (bdrv_pwrite_sync(bs->file, l1_table_offset, l1_table,
                        l1_size2) < 0)
            goto fail;
        for(i = 0; i < l1_size; i++)
            be64_to_cpus(&l1_table[i]);
    }
    if (l1_allocated)
        qemu_free(l1_table);
    qemu_free(l2_table);
    /* Flush the batched refcount block updates */
    cache_refcount_updates = 0;
    write_refcount_block(bs);
    return 0;
 fail:
    if (l1_allocated)
        qemu_free(l1_table);
    qemu_free(l2_table);
    cache_refcount_updates = 0;
    write_refcount_block(bs);
    return -EIO;
}
    873 
    874 
    875 
    876 
    877 /*********************************************************/
    878 /* refcount checking functions */
    879 
    880 
    881 
    882 /*
    883  * Increases the refcount for a range of clusters in a given refcount table.
    884  * This is used to construct a temporary refcount table out of L1 and L2 tables
    885  * which can be compared the the refcount table saved in the image.
    886  *
    887  * Modifies the number of errors in res.
    888  */
    889 static void inc_refcounts(BlockDriverState *bs,
    890                           BdrvCheckResult *res,
    891                           uint16_t *refcount_table,
    892                           int refcount_table_size,
    893                           int64_t offset, int64_t size)
    894 {
    895     BDRVQcowState *s = bs->opaque;
    896     int64_t start, last, cluster_offset;
    897     int k;
    898 
    899     if (size <= 0)
    900         return;
    901 
    902     start = offset & ~(s->cluster_size - 1);
    903     last = (offset + size - 1) & ~(s->cluster_size - 1);
    904     for(cluster_offset = start; cluster_offset <= last;
    905         cluster_offset += s->cluster_size) {
    906         k = cluster_offset >> s->cluster_bits;
    907         if (k < 0) {
    908             fprintf(stderr, "ERROR: invalid cluster offset=0x%" PRIx64 "\n",
    909                 cluster_offset);
    910             res->corruptions++;
    911         } else if (k >= refcount_table_size) {
    912             fprintf(stderr, "Warning: cluster offset=0x%" PRIx64 " is after "
    913                 "the end of the image file, can't properly check refcounts.\n",
    914                 cluster_offset);
    915             res->check_errors++;
    916         } else {
    917             if (++refcount_table[k] == 0) {
    918                 fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64
    919                     "\n", cluster_offset);
    920                 res->corruptions++;
    921             }
    922         }
    923     }
    924 }
    925 
    926 /*
    927  * Increases the refcount in the given refcount table for the all clusters
    928  * referenced in the L2 table. While doing so, performs some checks on L2
    929  * entries.
    930  *
    931  * Returns the number of errors found by the checks or -errno if an internal
    932  * error occurred.
    933  */
    934 static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
    935     uint16_t *refcount_table, int refcount_table_size, int64_t l2_offset,
    936     int check_copied)
    937 {
    938     BDRVQcowState *s = bs->opaque;
    939     uint64_t *l2_table, offset;
    940     int i, l2_size, nb_csectors, refcount;
    941 
    942     /* Read L2 table from disk */
    943     l2_size = s->l2_size * sizeof(uint64_t);
    944     l2_table = qemu_malloc(l2_size);
    945 
    946     if (bdrv_pread(bs->file, l2_offset, l2_table, l2_size) != l2_size)
    947         goto fail;
    948 
    949     /* Do the actual checks */
    950     for(i = 0; i < s->l2_size; i++) {
    951         offset = be64_to_cpu(l2_table[i]);
    952         if (offset != 0) {
    953             if (offset & QCOW_OFLAG_COMPRESSED) {
    954                 /* Compressed clusters don't have QCOW_OFLAG_COPIED */
    955                 if (offset & QCOW_OFLAG_COPIED) {
    956                     fprintf(stderr, "ERROR: cluster %" PRId64 ": "
    957                         "copied flag must never be set for compressed "
    958                         "clusters\n", offset >> s->cluster_bits);
    959                     offset &= ~QCOW_OFLAG_COPIED;
    960                     res->corruptions++;
    961                 }
    962 
    963                 /* Mark cluster as used */
    964                 nb_csectors = ((offset >> s->csize_shift) &
    965                                s->csize_mask) + 1;
    966                 offset &= s->cluster_offset_mask;
    967                 inc_refcounts(bs, res, refcount_table, refcount_table_size,
    968                     offset & ~511, nb_csectors * 512);
    969             } else {
    970                 /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */
    971                 if (check_copied) {
    972                     uint64_t entry = offset;
    973                     offset &= ~QCOW_OFLAG_COPIED;
    974                     refcount = get_refcount(bs, offset >> s->cluster_bits);
    975                     if (refcount < 0) {
    976                         fprintf(stderr, "Can't get refcount for offset %"
    977                             PRIx64 ": %s\n", entry, strerror(-refcount));
    978                         goto fail;
    979                     }
    980                     if ((refcount == 1) != ((entry & QCOW_OFLAG_COPIED) != 0)) {
    981                         fprintf(stderr, "ERROR OFLAG_COPIED: offset=%"
    982                             PRIx64 " refcount=%d\n", entry, refcount);
    983                         res->corruptions++;
    984                     }
    985                 }
    986 
    987                 /* Mark cluster as used */
    988                 offset &= ~QCOW_OFLAG_COPIED;
    989                 inc_refcounts(bs, res, refcount_table,refcount_table_size,
    990                     offset, s->cluster_size);
    991 
    992                 /* Correct offsets are cluster aligned */
    993                 if (offset & (s->cluster_size - 1)) {
    994                     fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not "
    995                         "properly aligned; L2 entry corrupted.\n", offset);
    996                     res->corruptions++;
    997                 }
    998             }
    999         }
   1000     }
   1001 
   1002     qemu_free(l2_table);
   1003     return 0;
   1004 
   1005 fail:
   1006     fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n");
   1007     qemu_free(l2_table);
   1008     return -EIO;
   1009 }
   1010 
   1011 /*
   1012  * Increases the refcount for the L1 table, its L2 tables and all referenced
   1013  * clusters in the given refcount table. While doing so, performs some checks
   1014  * on L1 and L2 entries.
   1015  *
   1016  * Returns the number of errors found by the checks or -errno if an internal
   1017  * error occurred.
   1018  */
   1019 static int check_refcounts_l1(BlockDriverState *bs,
   1020                               BdrvCheckResult *res,
   1021                               uint16_t *refcount_table,
   1022                               int refcount_table_size,
   1023                               int64_t l1_table_offset, int l1_size,
   1024                               int check_copied)
   1025 {
   1026     BDRVQcowState *s = bs->opaque;
   1027     uint64_t *l1_table, l2_offset, l1_size2;
   1028     int i, refcount, ret;
   1029 
   1030     l1_size2 = l1_size * sizeof(uint64_t);
   1031 
   1032     /* Mark L1 table as used */
   1033     inc_refcounts(bs, res, refcount_table, refcount_table_size,
   1034         l1_table_offset, l1_size2);
   1035 
   1036     /* Read L1 table entries from disk */
   1037     if (l1_size2 == 0) {
   1038         l1_table = NULL;
   1039     } else {
   1040         l1_table = qemu_malloc(l1_size2);
   1041         if (bdrv_pread(bs->file, l1_table_offset,
   1042                        l1_table, l1_size2) != l1_size2)
   1043             goto fail;
   1044         for(i = 0;i < l1_size; i++)
   1045             be64_to_cpus(&l1_table[i]);
   1046     }
   1047 
   1048     /* Do the actual checks */
   1049     for(i = 0; i < l1_size; i++) {
   1050         l2_offset = l1_table[i];
   1051         if (l2_offset) {
   1052             /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */
   1053             if (check_copied) {
   1054                 refcount = get_refcount(bs, (l2_offset & ~QCOW_OFLAG_COPIED)
   1055                     >> s->cluster_bits);
   1056                 if (refcount < 0) {
   1057                     fprintf(stderr, "Can't get refcount for l2_offset %"
   1058                         PRIx64 ": %s\n", l2_offset, strerror(-refcount));
   1059                     goto fail;
   1060                 }
   1061                 if ((refcount == 1) != ((l2_offset & QCOW_OFLAG_COPIED) != 0)) {
   1062                     fprintf(stderr, "ERROR OFLAG_COPIED: l2_offset=%" PRIx64
   1063                         " refcount=%d\n", l2_offset, refcount);
   1064                     res->corruptions++;
   1065                 }
   1066             }
   1067 
   1068             /* Mark L2 table as used */
   1069             l2_offset &= ~QCOW_OFLAG_COPIED;
   1070             inc_refcounts(bs, res, refcount_table, refcount_table_size,
   1071                 l2_offset, s->cluster_size);
   1072 
   1073             /* L2 tables are cluster aligned */
   1074             if (l2_offset & (s->cluster_size - 1)) {
   1075                 fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not "
   1076                     "cluster aligned; L1 entry corrupted\n", l2_offset);
   1077                 res->corruptions++;
   1078             }
   1079 
   1080             /* Process and check L2 entries */
   1081             ret = check_refcounts_l2(bs, res, refcount_table,
   1082                 refcount_table_size, l2_offset, check_copied);
   1083             if (ret < 0) {
   1084                 goto fail;
   1085             }
   1086         }
   1087     }
   1088     qemu_free(l1_table);
   1089     return 0;
   1090 
   1091 fail:
   1092     fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n");
   1093     res->check_errors++;
   1094     qemu_free(l1_table);
   1095     return -EIO;
   1096 }
   1097 
   1098 /*
   1099  * Checks an image for refcount consistency.
   1100  *
   1101  * Returns 0 if no errors are found, the number of errors in case the image is
   1102  * detected as corrupted, and -errno when an internal error occured.
   1103  */
   1104 int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res)
   1105 {
   1106     BDRVQcowState *s = bs->opaque;
   1107     int64_t size;
   1108     int nb_clusters, refcount1, refcount2, i;
   1109     QCowSnapshot *sn;
   1110     uint16_t *refcount_table;
   1111     int ret;
   1112 
   1113     size = bdrv_getlength(bs->file);
   1114     nb_clusters = size_to_clusters(s, size);
   1115     refcount_table = qemu_mallocz(nb_clusters * sizeof(uint16_t));
   1116 
   1117     /* header */
   1118     inc_refcounts(bs, res, refcount_table, nb_clusters,
   1119         0, s->cluster_size);
   1120 
   1121     /* current L1 table */
   1122     ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
   1123                        s->l1_table_offset, s->l1_size, 1);
   1124     if (ret < 0) {
   1125         return ret;
   1126     }
   1127 
   1128     /* snapshots */
   1129     for(i = 0; i < s->nb_snapshots; i++) {
   1130         sn = s->snapshots + i;
   1131         ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
   1132             sn->l1_table_offset, sn->l1_size, 0);
   1133         if (ret < 0) {
   1134             return ret;
   1135         }
   1136     }
   1137     inc_refcounts(bs, res, refcount_table, nb_clusters,
   1138         s->snapshots_offset, s->snapshots_size);
   1139 
   1140     /* refcount data */
   1141     inc_refcounts(bs, res, refcount_table, nb_clusters,
   1142         s->refcount_table_offset,
   1143         s->refcount_table_size * sizeof(uint64_t));
   1144 
   1145     for(i = 0; i < s->refcount_table_size; i++) {
   1146         uint64_t offset, cluster;
   1147         offset = s->refcount_table[i];
   1148         cluster = offset >> s->cluster_bits;
   1149 
   1150         /* Refcount blocks are cluster aligned */
   1151         if (offset & (s->cluster_size - 1)) {
   1152             fprintf(stderr, "ERROR refcount block %d is not "
   1153                 "cluster aligned; refcount table entry corrupted\n", i);
   1154             res->corruptions++;
   1155             continue;
   1156         }
   1157 
   1158         if (cluster >= nb_clusters) {
   1159             fprintf(stderr, "ERROR refcount block %d is outside image\n", i);
   1160             res->corruptions++;
   1161             continue;
   1162         }
   1163 
   1164         if (offset != 0) {
   1165             inc_refcounts(bs, res, refcount_table, nb_clusters,
   1166                 offset, s->cluster_size);
   1167             if (refcount_table[cluster] != 1) {
   1168                 fprintf(stderr, "ERROR refcount block %d refcount=%d\n",
   1169                     i, refcount_table[cluster]);
   1170                 res->corruptions++;
   1171             }
   1172         }
   1173     }
   1174 
   1175     /* compare ref counts */
   1176     for(i = 0; i < nb_clusters; i++) {
   1177         refcount1 = get_refcount(bs, i);
   1178         if (refcount1 < 0) {
   1179             fprintf(stderr, "Can't get refcount for cluster %d: %s\n",
   1180                 i, strerror(-refcount1));
   1181             res->check_errors++;
   1182             continue;
   1183         }
   1184 
   1185         refcount2 = refcount_table[i];
   1186         if (refcount1 != refcount2) {
   1187             fprintf(stderr, "%s cluster %d refcount=%d reference=%d\n",
   1188                    refcount1 < refcount2 ? "ERROR" : "Leaked",
   1189                    i, refcount1, refcount2);
   1190             if (refcount1 < refcount2) {
   1191                 res->corruptions++;
   1192             } else {
   1193                 res->leaks++;
   1194             }
   1195         }
   1196     }
   1197 
   1198     qemu_free(refcount_table);
   1199 
   1200     return 0;
   1201 }
   1202 
   1203