Home | History | Annotate | Download | only in block
      1 /*
      2  * Block driver for the QCOW format
      3  *
      4  * Copyright (c) 2004-2006 Fabrice Bellard
      5  *
      6  * Permission is hereby granted, free of charge, to any person obtaining a copy
      7  * of this software and associated documentation files (the "Software"), to deal
      8  * in the Software without restriction, including without limitation the rights
      9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
     10  * copies of the Software, and to permit persons to whom the Software is
     11  * furnished to do so, subject to the following conditions:
     12  *
     13  * The above copyright notice and this permission notice shall be included in
     14  * all copies or substantial portions of the Software.
     15  *
     16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
     19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
     21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
     22  * THE SOFTWARE.
     23  */
     24 #include "qemu-common.h"
     25 #include "block_int.h"
     26 #include "module.h"
     27 #include <zlib.h>
     28 #include "aes.h"
     29 
     30 /**************************************************************/
     31 /* QEMU COW block driver with compression and encryption support */
     32 
     33 #define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb)
     34 #define QCOW_VERSION 1
     35 
     36 #define QCOW_CRYPT_NONE 0
     37 #define QCOW_CRYPT_AES  1
     38 
     39 #define QCOW_OFLAG_COMPRESSED (1LL << 63)
     40 
/* Image header, read from and written to offset 0 of the image file.
   Multi-byte fields are stored big-endian on disk: qcow_open() converts
   them with be32_to_cpus()/be64_to_cpus() and qcow_create() with
   cpu_to_be32()/cpu_to_be64().
   NOTE(review): the struct is read and written as raw memory, so the file
   layout includes whatever padding the compiler inserts between l2_bits
   and crypt_method. */
typedef struct QCowHeader {
    uint32_t magic;               /* must equal QCOW_MAGIC */
    uint32_t version;             /* must equal QCOW_VERSION */
    uint64_t backing_file_offset; /* file offset of the backing file name,
                                     0 when there is none */
    uint32_t backing_file_size;   /* length of that name in bytes; the name
                                     is stored without a terminating NUL */
    uint32_t mtime;               /* byte-swapped on open, not otherwise
                                     used in the code visible here */
    uint64_t size; /* in bytes */
    uint8_t cluster_bits;         /* log2 of the cluster size in bytes */
    uint8_t l2_bits;              /* log2 of the entry count of one L2 table */
    uint32_t crypt_method;        /* QCOW_CRYPT_NONE or QCOW_CRYPT_AES */
    uint64_t l1_table_offset;     /* file offset of the L1 table */
} QCowHeader;
     53 
/* Number of L2 tables kept in memory at once. */
#define L2_CACHE_SIZE 16

/* Per-image driver state, reached through BlockDriverState.opaque. */
typedef struct BDRVQcowState {
    BlockDriverState *hd;         /* not referenced in the code visible here */
    int cluster_bits;             /* log2 of cluster_size */
    int cluster_size;             /* bytes per cluster */
    int cluster_sectors;          /* 512-byte sectors per cluster */
    int l2_bits;                  /* log2 of l2_size */
    int l2_size;                  /* entries per L2 table */
    int l1_size;                  /* entries in the L1 table */
    uint64_t cluster_offset_mask; /* extracts the file offset from a
                                     compressed L2 entry */
    uint64_t l1_table_offset;     /* file offset of the L1 table */
    uint64_t *l1_table;           /* L1 table in host byte order */
    uint64_t *l2_cache;           /* L2_CACHE_SIZE tables of l2_size entries,
                                     kept in on-disk (big-endian) order */
    uint64_t l2_cache_offsets[L2_CACHE_SIZE]; /* file offset of the table in
                                                 each cache slot */
    uint32_t l2_cache_counts[L2_CACHE_SIZE];  /* hit counters used to pick the
                                                 slot to evict */
    uint8_t *cluster_cache;       /* decompressed data of one cluster */
    uint8_t *cluster_data;        /* scratch buffer of cluster_size bytes */
    uint64_t cluster_cache_offset; /* file offset of the cluster held in
                                      cluster_cache, or -1 when invalid */
    uint32_t crypt_method; /* current crypt method, 0 if no key yet */
    uint32_t crypt_method_header; /* crypt method recorded in the header */
    AES_KEY aes_encrypt_key;
    AES_KEY aes_decrypt_key;
} BDRVQcowState;
     78 
     79 static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset);
     80 
     81 static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename)
     82 {
     83     const QCowHeader *cow_header = (const void *)buf;
     84 
     85     if (buf_size >= sizeof(QCowHeader) &&
     86         be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
     87         be32_to_cpu(cow_header->version) == QCOW_VERSION)
     88         return 100;
     89     else
     90         return 0;
     91 }
     92 
     93 static int qcow_open(BlockDriverState *bs, int flags)
     94 {
     95     BDRVQcowState *s = bs->opaque;
     96     int len, i, shift;
     97     QCowHeader header;
     98 
     99     if (bdrv_pread(bs->file, 0, &header, sizeof(header)) != sizeof(header))
    100         goto fail;
    101     be32_to_cpus(&header.magic);
    102     be32_to_cpus(&header.version);
    103     be64_to_cpus(&header.backing_file_offset);
    104     be32_to_cpus(&header.backing_file_size);
    105     be32_to_cpus(&header.mtime);
    106     be64_to_cpus(&header.size);
    107     be32_to_cpus(&header.crypt_method);
    108     be64_to_cpus(&header.l1_table_offset);
    109 
    110     if (header.magic != QCOW_MAGIC || header.version != QCOW_VERSION)
    111         goto fail;
    112     if (header.size <= 1 || header.cluster_bits < 9)
    113         goto fail;
    114     if (header.crypt_method > QCOW_CRYPT_AES)
    115         goto fail;
    116     s->crypt_method_header = header.crypt_method;
    117     if (s->crypt_method_header)
    118         bs->encrypted = 1;
    119     s->cluster_bits = header.cluster_bits;
    120     s->cluster_size = 1 << s->cluster_bits;
    121     s->cluster_sectors = 1 << (s->cluster_bits - 9);
    122     s->l2_bits = header.l2_bits;
    123     s->l2_size = 1 << s->l2_bits;
    124     bs->total_sectors = header.size / 512;
    125     s->cluster_offset_mask = (1LL << (63 - s->cluster_bits)) - 1;
    126 
    127     /* read the level 1 table */
    128     shift = s->cluster_bits + s->l2_bits;
    129     s->l1_size = (header.size + (1LL << shift) - 1) >> shift;
    130 
    131     s->l1_table_offset = header.l1_table_offset;
    132     s->l1_table = qemu_malloc(s->l1_size * sizeof(uint64_t));
    133     if (!s->l1_table)
    134         goto fail;
    135     if (bdrv_pread(bs->file, s->l1_table_offset, s->l1_table, s->l1_size * sizeof(uint64_t)) !=
    136         s->l1_size * sizeof(uint64_t))
    137         goto fail;
    138     for(i = 0;i < s->l1_size; i++) {
    139         be64_to_cpus(&s->l1_table[i]);
    140     }
    141     /* alloc L2 cache */
    142     s->l2_cache = qemu_malloc(s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
    143     if (!s->l2_cache)
    144         goto fail;
    145     s->cluster_cache = qemu_malloc(s->cluster_size);
    146     if (!s->cluster_cache)
    147         goto fail;
    148     s->cluster_data = qemu_malloc(s->cluster_size);
    149     if (!s->cluster_data)
    150         goto fail;
    151     s->cluster_cache_offset = -1;
    152 
    153     /* read the backing file name */
    154     if (header.backing_file_offset != 0) {
    155         len = header.backing_file_size;
    156         if (len > 1023)
    157             len = 1023;
    158         if (bdrv_pread(bs->file, header.backing_file_offset, bs->backing_file, len) != len)
    159             goto fail;
    160         bs->backing_file[len] = '\0';
    161     }
    162     return 0;
    163 
    164  fail:
    165     qemu_free(s->l1_table);
    166     qemu_free(s->l2_cache);
    167     qemu_free(s->cluster_cache);
    168     qemu_free(s->cluster_data);
    169     return -1;
    170 }
    171 
    172 static int qcow_set_key(BlockDriverState *bs, const char *key)
    173 {
    174     BDRVQcowState *s = bs->opaque;
    175     uint8_t keybuf[16];
    176     int len, i;
    177 
    178     memset(keybuf, 0, 16);
    179     len = strlen(key);
    180     if (len > 16)
    181         len = 16;
    182     /* XXX: we could compress the chars to 7 bits to increase
    183        entropy */
    184     for(i = 0;i < len;i++) {
    185         keybuf[i] = key[i];
    186     }
    187     s->crypt_method = s->crypt_method_header;
    188 
    189     if (AES_set_encrypt_key(keybuf, 128, &s->aes_encrypt_key) != 0)
    190         return -1;
    191     if (AES_set_decrypt_key(keybuf, 128, &s->aes_decrypt_key) != 0)
    192         return -1;
    193 #if 0
    194     /* test */
    195     {
    196         uint8_t in[16];
    197         uint8_t out[16];
    198         uint8_t tmp[16];
    199         for(i=0;i<16;i++)
    200             in[i] = i;
    201         AES_encrypt(in, tmp, &s->aes_encrypt_key);
    202         AES_decrypt(tmp, out, &s->aes_decrypt_key);
    203         for(i = 0; i < 16; i++)
    204             printf(" %02x", tmp[i]);
    205         printf("\n");
    206         for(i = 0; i < 16; i++)
    207             printf(" %02x", out[i]);
    208         printf("\n");
    209     }
    210 #endif
    211     return 0;
    212 }
    213 
    214 /* The crypt function is compatible with the linux cryptoloop
    215    algorithm for < 4 GB images. NOTE: out_buf == in_buf is
    216    supported */
    217 static void encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
    218                             uint8_t *out_buf, const uint8_t *in_buf,
    219                             int nb_sectors, int enc,
    220                             const AES_KEY *key)
    221 {
    222     union {
    223         uint64_t ll[2];
    224         uint8_t b[16];
    225     } ivec;
    226     int i;
    227 
    228     for(i = 0; i < nb_sectors; i++) {
    229         ivec.ll[0] = cpu_to_le64(sector_num);
    230         ivec.ll[1] = 0;
    231         AES_cbc_encrypt(in_buf, out_buf, 512, key,
    232                         ivec.b, enc);
    233         sector_num++;
    234         in_buf += 512;
    235         out_buf += 512;
    236     }
    237 }
    238 
    239 /* 'allocate' is:
    240  *
    241  * 0 to not allocate.
    242  *
    243  * 1 to allocate a normal cluster (for sector indexes 'n_start' to
    244  * 'n_end')
    245  *
    246  * 2 to allocate a compressed cluster of size
    247  * 'compressed_size'. 'compressed_size' must be > 0 and <
    248  * cluster_size
    249  *
    250  * return 0 if not allocated.
    251  */
    252 static uint64_t get_cluster_offset(BlockDriverState *bs,
    253                                    uint64_t offset, int allocate,
    254                                    int compressed_size,
    255                                    int n_start, int n_end)
    256 {
    257     BDRVQcowState *s = bs->opaque;
    258     int min_index, i, j, l1_index, l2_index;
    259     uint64_t l2_offset, *l2_table, cluster_offset, tmp;
    260     uint32_t min_count;
    261     int new_l2_table;
    262 
    263     l1_index = offset >> (s->l2_bits + s->cluster_bits);
    264     l2_offset = s->l1_table[l1_index];
    265     new_l2_table = 0;
    266     if (!l2_offset) {
    267         if (!allocate)
    268             return 0;
    269         /* allocate a new l2 entry */
    270         l2_offset = bdrv_getlength(bs->file);
    271         /* round to cluster size */
    272         l2_offset = (l2_offset + s->cluster_size - 1) & ~(s->cluster_size - 1);
    273         /* update the L1 entry */
    274         s->l1_table[l1_index] = l2_offset;
    275         tmp = cpu_to_be64(l2_offset);
    276         if (bdrv_pwrite_sync(bs->file,
    277                 s->l1_table_offset + l1_index * sizeof(tmp),
    278                 &tmp, sizeof(tmp)) < 0)
    279             return 0;
    280         new_l2_table = 1;
    281     }
    282     for(i = 0; i < L2_CACHE_SIZE; i++) {
    283         if (l2_offset == s->l2_cache_offsets[i]) {
    284             /* increment the hit count */
    285             if (++s->l2_cache_counts[i] == 0xffffffff) {
    286                 for(j = 0; j < L2_CACHE_SIZE; j++) {
    287                     s->l2_cache_counts[j] >>= 1;
    288                 }
    289             }
    290             l2_table = s->l2_cache + (i << s->l2_bits);
    291             goto found;
    292         }
    293     }
    294     /* not found: load a new entry in the least used one */
    295     min_index = 0;
    296     min_count = 0xffffffff;
    297     for(i = 0; i < L2_CACHE_SIZE; i++) {
    298         if (s->l2_cache_counts[i] < min_count) {
    299             min_count = s->l2_cache_counts[i];
    300             min_index = i;
    301         }
    302     }
    303     l2_table = s->l2_cache + (min_index << s->l2_bits);
    304     if (new_l2_table) {
    305         memset(l2_table, 0, s->l2_size * sizeof(uint64_t));
    306         if (bdrv_pwrite_sync(bs->file, l2_offset, l2_table,
    307                 s->l2_size * sizeof(uint64_t)) < 0)
    308             return 0;
    309     } else {
    310         if (bdrv_pread(bs->file, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) !=
    311             s->l2_size * sizeof(uint64_t))
    312             return 0;
    313     }
    314     s->l2_cache_offsets[min_index] = l2_offset;
    315     s->l2_cache_counts[min_index] = 1;
    316  found:
    317     l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1);
    318     cluster_offset = be64_to_cpu(l2_table[l2_index]);
    319     if (!cluster_offset ||
    320         ((cluster_offset & QCOW_OFLAG_COMPRESSED) && allocate == 1)) {
    321         if (!allocate)
    322             return 0;
    323         /* allocate a new cluster */
    324         if ((cluster_offset & QCOW_OFLAG_COMPRESSED) &&
    325             (n_end - n_start) < s->cluster_sectors) {
    326             /* if the cluster is already compressed, we must
    327                decompress it in the case it is not completely
    328                overwritten */
    329             if (decompress_cluster(bs, cluster_offset) < 0)
    330                 return 0;
    331             cluster_offset = bdrv_getlength(bs->file);
    332             cluster_offset = (cluster_offset + s->cluster_size - 1) &
    333                 ~(s->cluster_size - 1);
    334             /* write the cluster content */
    335             if (bdrv_pwrite(bs->file, cluster_offset, s->cluster_cache, s->cluster_size) !=
    336                 s->cluster_size)
    337                 return -1;
    338         } else {
    339             cluster_offset = bdrv_getlength(bs->file);
    340             if (allocate == 1) {
    341                 /* round to cluster size */
    342                 cluster_offset = (cluster_offset + s->cluster_size - 1) &
    343                     ~(s->cluster_size - 1);
    344                 bdrv_truncate(bs->file, cluster_offset + s->cluster_size);
    345                 /* if encrypted, we must initialize the cluster
    346                    content which won't be written */
    347                 if (s->crypt_method &&
    348                     (n_end - n_start) < s->cluster_sectors) {
    349                     uint64_t start_sect;
    350                     start_sect = (offset & ~(s->cluster_size - 1)) >> 9;
    351                     memset(s->cluster_data + 512, 0x00, 512);
    352                     for(i = 0; i < s->cluster_sectors; i++) {
    353                         if (i < n_start || i >= n_end) {
    354                             encrypt_sectors(s, start_sect + i,
    355                                             s->cluster_data,
    356                                             s->cluster_data + 512, 1, 1,
    357                                             &s->aes_encrypt_key);
    358                             if (bdrv_pwrite(bs->file, cluster_offset + i * 512,
    359                                             s->cluster_data, 512) != 512)
    360                                 return -1;
    361                         }
    362                     }
    363                 }
    364             } else if (allocate == 2) {
    365                 cluster_offset |= QCOW_OFLAG_COMPRESSED |
    366                     (uint64_t)compressed_size << (63 - s->cluster_bits);
    367             }
    368         }
    369         /* update L2 table */
    370         tmp = cpu_to_be64(cluster_offset);
    371         l2_table[l2_index] = tmp;
    372         if (bdrv_pwrite_sync(bs->file, l2_offset + l2_index * sizeof(tmp),
    373                 &tmp, sizeof(tmp)) < 0)
    374             return 0;
    375     }
    376     return cluster_offset;
    377 }
    378 
    379 static int qcow_is_allocated(BlockDriverState *bs, int64_t sector_num,
    380                              int nb_sectors, int *pnum)
    381 {
    382     BDRVQcowState *s = bs->opaque;
    383     int index_in_cluster, n;
    384     uint64_t cluster_offset;
    385 
    386     cluster_offset = get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0);
    387     index_in_cluster = sector_num & (s->cluster_sectors - 1);
    388     n = s->cluster_sectors - index_in_cluster;
    389     if (n > nb_sectors)
    390         n = nb_sectors;
    391     *pnum = n;
    392     return (cluster_offset != 0);
    393 }
    394 
    395 static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
    396                              const uint8_t *buf, int buf_size)
    397 {
    398     z_stream strm1, *strm = &strm1;
    399     int ret, out_len;
    400 
    401     memset(strm, 0, sizeof(*strm));
    402 
    403     strm->next_in = (uint8_t *)buf;
    404     strm->avail_in = buf_size;
    405     strm->next_out = out_buf;
    406     strm->avail_out = out_buf_size;
    407 
    408     ret = inflateInit2(strm, -12);
    409     if (ret != Z_OK)
    410         return -1;
    411     ret = inflate(strm, Z_FINISH);
    412     out_len = strm->next_out - out_buf;
    413     if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
    414         out_len != out_buf_size) {
    415         inflateEnd(strm);
    416         return -1;
    417     }
    418     inflateEnd(strm);
    419     return 0;
    420 }
    421 
    422 static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset)
    423 {
    424     BDRVQcowState *s = bs->opaque;
    425     int ret, csize;
    426     uint64_t coffset;
    427 
    428     coffset = cluster_offset & s->cluster_offset_mask;
    429     if (s->cluster_cache_offset != coffset) {
    430         csize = cluster_offset >> (63 - s->cluster_bits);
    431         csize &= (s->cluster_size - 1);
    432         ret = bdrv_pread(bs->file, coffset, s->cluster_data, csize);
    433         if (ret != csize)
    434             return -1;
    435         if (decompress_buffer(s->cluster_cache, s->cluster_size,
    436                               s->cluster_data, csize) < 0) {
    437             return -1;
    438         }
    439         s->cluster_cache_offset = coffset;
    440     }
    441     return 0;
    442 }
    443 
#if 0
/* Disabled synchronous read path; not compiled. */

/* Read 'nb_sectors' sectors starting at 'sector_num' into 'buf', one
   cluster-bounded piece at a time.  Unallocated clusters come from the
   backing image when there is one and read as zeroes otherwise; compressed
   clusters go through the decompression cache; plain clusters are read
   from the file and decrypted when a crypt method is active.
   Returns 0 on success, -1 on error. */
static int qcow_read(BlockDriverState *bs, int64_t sector_num,
                     uint8_t *buf, int nb_sectors)
{
    BDRVQcowState *s = bs->opaque;
    int ret, index_in_cluster, n;
    uint64_t cluster_offset;

    while (nb_sectors > 0) {
        cluster_offset = get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0);
        /* never cross a cluster boundary in one step */
        index_in_cluster = sector_num & (s->cluster_sectors - 1);
        n = s->cluster_sectors - index_in_cluster;
        if (n > nb_sectors)
            n = nb_sectors;
        if (!cluster_offset) {
            if (bs->backing_hd) {
                /* read from the base image */
                ret = bdrv_read(bs->backing_hd, sector_num, buf, n);
                if (ret < 0)
                    return -1;
            } else {
                memset(buf, 0, 512 * n);
            }
        } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) {
            if (decompress_cluster(bs, cluster_offset) < 0)
                return -1;
            memcpy(buf, s->cluster_cache + index_in_cluster * 512, 512 * n);
        } else {
            ret = bdrv_pread(bs->file, cluster_offset + index_in_cluster * 512, buf, n * 512);
            if (ret != n * 512)
                return -1;
            if (s->crypt_method) {
                /* decrypt in place */
                encrypt_sectors(s, sector_num, buf, buf, n, 0,
                                &s->aes_decrypt_key);
            }
        }
        nb_sectors -= n;
        sector_num += n;
        buf += n * 512;
    }
    return 0;
}
#endif
    488 
/* State of one asynchronous read or write request.  A request is split
   into cluster-bounded pieces that are processed one after the other by
   qcow_aio_read_cb() / qcow_aio_write_cb(). */
typedef struct QCowAIOCB {
    BlockDriverAIOCB common;   /* generic part: bs, completion cb, opaque */
    int64_t sector_num;        /* first sector of the current piece */
    QEMUIOVector *qiov;        /* caller's I/O vector */
    uint8_t *buf;              /* position in the linear data buffer */
    void *orig_buf;            /* bounce buffer, set only when qiov has more
                                  than one element */
    int nb_sectors;            /* sectors still to be processed */
    int n;                     /* sectors in the current piece */
    uint64_t cluster_offset;   /* L2 entry of the current piece (read path) */
    uint8_t *cluster_data;     /* ciphertext buffer for encrypted writes */
    struct iovec hd_iov;       /* single-element vector for the piece ... */
    QEMUIOVector hd_qiov;      /* ... wrapped for the underlying request */
    BlockDriverAIOCB *hd_aiocb; /* underlying request in flight, or NULL */
} QCowAIOCB;
    503 
    504 static void qcow_aio_cancel(BlockDriverAIOCB *blockacb)
    505 {
    506     QCowAIOCB *acb = container_of(blockacb, QCowAIOCB, common);
    507     if (acb->hd_aiocb)
    508         bdrv_aio_cancel(acb->hd_aiocb);
    509     qemu_aio_release(acb);
    510 }
    511 
/* Pool from which qcow_aio_setup() obtains QCowAIOCB control blocks. */
static AIOPool qcow_aio_pool = {
    .aiocb_size         = sizeof(QCowAIOCB),
    .cancel             = qcow_aio_cancel,
};
    516 
    517 static QCowAIOCB *qcow_aio_setup(BlockDriverState *bs,
    518         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
    519         BlockDriverCompletionFunc *cb, void *opaque, int is_write)
    520 {
    521     QCowAIOCB *acb;
    522 
    523     acb = qemu_aio_get(&qcow_aio_pool, bs, cb, opaque);
    524     if (!acb)
    525         return NULL;
    526     acb->hd_aiocb = NULL;
    527     acb->sector_num = sector_num;
    528     acb->qiov = qiov;
    529     if (qiov->niov > 1) {
    530         acb->buf = acb->orig_buf = qemu_blockalign(bs, qiov->size);
    531         if (is_write)
    532             qemu_iovec_to_buffer(qiov, acb->buf);
    533     } else {
    534         acb->buf = (uint8_t *)qiov->iov->iov_base;
    535     }
    536     acb->nb_sectors = nb_sectors;
    537     acb->n = 0;
    538     acb->cluster_offset = 0;
    539     return acb;
    540 }
    541 
    542 static void qcow_aio_read_cb(void *opaque, int ret)
    543 {
    544     QCowAIOCB *acb = opaque;
    545     BlockDriverState *bs = acb->common.bs;
    546     BDRVQcowState *s = bs->opaque;
    547     int index_in_cluster;
    548 
    549     acb->hd_aiocb = NULL;
    550     if (ret < 0)
    551         goto done;
    552 
    553  redo:
    554     /* post process the read buffer */
    555     if (!acb->cluster_offset) {
    556         /* nothing to do */
    557     } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
    558         /* nothing to do */
    559     } else {
    560         if (s->crypt_method) {
    561             encrypt_sectors(s, acb->sector_num, acb->buf, acb->buf,
    562                             acb->n, 0,
    563                             &s->aes_decrypt_key);
    564         }
    565     }
    566 
    567     acb->nb_sectors -= acb->n;
    568     acb->sector_num += acb->n;
    569     acb->buf += acb->n * 512;
    570 
    571     if (acb->nb_sectors == 0) {
    572         /* request completed */
    573         ret = 0;
    574         goto done;
    575     }
    576 
    577     /* prepare next AIO request */
    578     acb->cluster_offset = get_cluster_offset(bs, acb->sector_num << 9,
    579                                              0, 0, 0, 0);
    580     index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
    581     acb->n = s->cluster_sectors - index_in_cluster;
    582     if (acb->n > acb->nb_sectors)
    583         acb->n = acb->nb_sectors;
    584 
    585     if (!acb->cluster_offset) {
    586         if (bs->backing_hd) {
    587             /* read from the base image */
    588             acb->hd_iov.iov_base = (void *)acb->buf;
    589             acb->hd_iov.iov_len = acb->n * 512;
    590             qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
    591             acb->hd_aiocb = bdrv_aio_readv(bs->backing_hd, acb->sector_num,
    592                 &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb);
    593             if (acb->hd_aiocb == NULL)
    594                 goto done;
    595         } else {
    596             /* Note: in this case, no need to wait */
    597             memset(acb->buf, 0, 512 * acb->n);
    598             goto redo;
    599         }
    600     } else if (acb->cluster_offset & QCOW_OFLAG_COMPRESSED) {
    601         /* add AIO support for compressed blocks ? */
    602         if (decompress_cluster(bs, acb->cluster_offset) < 0)
    603             goto done;
    604         memcpy(acb->buf,
    605                s->cluster_cache + index_in_cluster * 512, 512 * acb->n);
    606         goto redo;
    607     } else {
    608         if ((acb->cluster_offset & 511) != 0) {
    609             ret = -EIO;
    610             goto done;
    611         }
    612         acb->hd_iov.iov_base = (void *)acb->buf;
    613         acb->hd_iov.iov_len = acb->n * 512;
    614         qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
    615         acb->hd_aiocb = bdrv_aio_readv(bs->file,
    616                             (acb->cluster_offset >> 9) + index_in_cluster,
    617                             &acb->hd_qiov, acb->n, qcow_aio_read_cb, acb);
    618         if (acb->hd_aiocb == NULL)
    619             goto done;
    620     }
    621 
    622     return;
    623 
    624 done:
    625     if (acb->qiov->niov > 1) {
    626         qemu_iovec_from_buffer(acb->qiov, acb->orig_buf, acb->qiov->size);
    627         qemu_vfree(acb->orig_buf);
    628     }
    629     acb->common.cb(acb->common.opaque, ret);
    630     qemu_aio_release(acb);
    631 }
    632 
    633 static BlockDriverAIOCB *qcow_aio_readv(BlockDriverState *bs,
    634         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
    635         BlockDriverCompletionFunc *cb, void *opaque)
    636 {
    637     QCowAIOCB *acb;
    638 
    639     acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 0);
    640     if (!acb)
    641         return NULL;
    642 
    643     qcow_aio_read_cb(acb, 0);
    644     return &acb->common;
    645 }
    646 
    647 static void qcow_aio_write_cb(void *opaque, int ret)
    648 {
    649     QCowAIOCB *acb = opaque;
    650     BlockDriverState *bs = acb->common.bs;
    651     BDRVQcowState *s = bs->opaque;
    652     int index_in_cluster;
    653     uint64_t cluster_offset;
    654     const uint8_t *src_buf;
    655 
    656     acb->hd_aiocb = NULL;
    657 
    658     if (ret < 0)
    659         goto done;
    660 
    661     acb->nb_sectors -= acb->n;
    662     acb->sector_num += acb->n;
    663     acb->buf += acb->n * 512;
    664 
    665     if (acb->nb_sectors == 0) {
    666         /* request completed */
    667         ret = 0;
    668         goto done;
    669     }
    670 
    671     index_in_cluster = acb->sector_num & (s->cluster_sectors - 1);
    672     acb->n = s->cluster_sectors - index_in_cluster;
    673     if (acb->n > acb->nb_sectors)
    674         acb->n = acb->nb_sectors;
    675     cluster_offset = get_cluster_offset(bs, acb->sector_num << 9, 1, 0,
    676                                         index_in_cluster,
    677                                         index_in_cluster + acb->n);
    678     if (!cluster_offset || (cluster_offset & 511) != 0) {
    679         ret = -EIO;
    680         goto done;
    681     }
    682     if (s->crypt_method) {
    683         if (!acb->cluster_data) {
    684             acb->cluster_data = qemu_mallocz(s->cluster_size);
    685             if (!acb->cluster_data) {
    686                 ret = -ENOMEM;
    687                 goto done;
    688             }
    689         }
    690         encrypt_sectors(s, acb->sector_num, acb->cluster_data, acb->buf,
    691                         acb->n, 1, &s->aes_encrypt_key);
    692         src_buf = acb->cluster_data;
    693     } else {
    694         src_buf = acb->buf;
    695     }
    696 
    697     acb->hd_iov.iov_base = (void *)src_buf;
    698     acb->hd_iov.iov_len = acb->n * 512;
    699     qemu_iovec_init_external(&acb->hd_qiov, &acb->hd_iov, 1);
    700     acb->hd_aiocb = bdrv_aio_writev(bs->file,
    701                                     (cluster_offset >> 9) + index_in_cluster,
    702                                     &acb->hd_qiov, acb->n,
    703                                     qcow_aio_write_cb, acb);
    704     if (acb->hd_aiocb == NULL)
    705         goto done;
    706     return;
    707 
    708 done:
    709     if (acb->qiov->niov > 1)
    710         qemu_vfree(acb->orig_buf);
    711     acb->common.cb(acb->common.opaque, ret);
    712     qemu_aio_release(acb);
    713 }
    714 
    715 static BlockDriverAIOCB *qcow_aio_writev(BlockDriverState *bs,
    716         int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
    717         BlockDriverCompletionFunc *cb, void *opaque)
    718 {
    719     BDRVQcowState *s = bs->opaque;
    720     QCowAIOCB *acb;
    721 
    722     s->cluster_cache_offset = -1; /* disable compressed cache */
    723 
    724     acb = qcow_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 1);
    725     if (!acb)
    726         return NULL;
    727 
    728 
    729     qcow_aio_write_cb(acb, 0);
    730     return &acb->common;
    731 }
    732 
    733 static void qcow_close(BlockDriverState *bs)
    734 {
    735     BDRVQcowState *s = bs->opaque;
    736     qemu_free(s->l1_table);
    737     qemu_free(s->l2_cache);
    738     qemu_free(s->cluster_cache);
    739     qemu_free(s->cluster_data);
    740 }
    741 
    742 static int qcow_create(const char *filename, QEMUOptionParameter *options)
    743 {
    744     int fd, header_size, backing_filename_len, l1_size, i, shift;
    745     QCowHeader header;
    746     uint64_t tmp;
    747     int64_t total_size = 0;
    748     const char *backing_file = NULL;
    749     int flags = 0;
    750     int ret;
    751 
    752     /* Read out options */
    753     while (options && options->name) {
    754         if (!strcmp(options->name, BLOCK_OPT_SIZE)) {
    755             total_size = options->value.n / 512;
    756         } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
    757             backing_file = options->value.s;
    758         } else if (!strcmp(options->name, BLOCK_OPT_ENCRYPT)) {
    759             flags |= options->value.n ? BLOCK_FLAG_ENCRYPT : 0;
    760         }
    761         options++;
    762     }
    763 
    764     fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, 0644);
    765     if (fd < 0)
    766         return -errno;
    767     memset(&header, 0, sizeof(header));
    768     header.magic = cpu_to_be32(QCOW_MAGIC);
    769     header.version = cpu_to_be32(QCOW_VERSION);
    770     header.size = cpu_to_be64(total_size * 512);
    771     header_size = sizeof(header);
    772     backing_filename_len = 0;
    773     if (backing_file) {
    774         if (strcmp(backing_file, "fat:")) {
    775             header.backing_file_offset = cpu_to_be64(header_size);
    776             backing_filename_len = strlen(backing_file);
    777             header.backing_file_size = cpu_to_be32(backing_filename_len);
    778             header_size += backing_filename_len;
    779         } else {
    780             /* special backing file for vvfat */
    781             backing_file = NULL;
    782         }
    783         header.cluster_bits = 9; /* 512 byte cluster to avoid copying
    784                                     unmodifyed sectors */
    785         header.l2_bits = 12; /* 32 KB L2 tables */
    786     } else {
    787         header.cluster_bits = 12; /* 4 KB clusters */
    788         header.l2_bits = 9; /* 4 KB L2 tables */
    789     }
    790     header_size = (header_size + 7) & ~7;
    791     shift = header.cluster_bits + header.l2_bits;
    792     l1_size = ((total_size * 512) + (1LL << shift) - 1) >> shift;
    793 
    794     header.l1_table_offset = cpu_to_be64(header_size);
    795     if (flags & BLOCK_FLAG_ENCRYPT) {
    796         header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES);
    797     } else {
    798         header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
    799     }
    800 
    801     /* write all the data */
    802     ret = qemu_write_full(fd, &header, sizeof(header));
    803     if (ret != sizeof(header)) {
    804         ret = -errno;
    805         goto exit;
    806     }
    807 
    808     if (backing_file) {
    809         ret = qemu_write_full(fd, backing_file, backing_filename_len);
    810         if (ret != backing_filename_len) {
    811             ret = -errno;
    812             goto exit;
    813         }
    814 
    815     }
    816     lseek(fd, header_size, SEEK_SET);
    817     tmp = 0;
    818     for(i = 0;i < l1_size; i++) {
    819         ret = qemu_write_full(fd, &tmp, sizeof(tmp));
    820         if (ret != sizeof(tmp)) {
    821             ret = -errno;
    822             goto exit;
    823         }
    824     }
    825 
    826     ret = 0;
    827 exit:
    828     close(fd);
    829     return ret;
    830 }
    831 
    832 static int qcow_make_empty(BlockDriverState *bs)
    833 {
    834     BDRVQcowState *s = bs->opaque;
    835     uint32_t l1_length = s->l1_size * sizeof(uint64_t);
    836     int ret;
    837 
    838     memset(s->l1_table, 0, l1_length);
    839     if (bdrv_pwrite_sync(bs->file, s->l1_table_offset, s->l1_table,
    840             l1_length) < 0)
    841         return -1;
    842     ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length);
    843     if (ret < 0)
    844         return ret;
    845 
    846     memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t));
    847     memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t));
    848     memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t));
    849 
    850     return 0;
    851 }
    852 
    853 /* XXX: put compressed sectors first, then all the cluster aligned
    854    tables to avoid losing bytes in alignment */
    855 static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num,
    856                                  const uint8_t *buf, int nb_sectors)
    857 {
    858     BDRVQcowState *s = bs->opaque;
    859     z_stream strm;
    860     int ret, out_len;
    861     uint8_t *out_buf;
    862     uint64_t cluster_offset;
    863 
    864     if (nb_sectors != s->cluster_sectors)
    865         return -EINVAL;
    866 
    867     out_buf = qemu_malloc(s->cluster_size + (s->cluster_size / 1000) + 128);
    868     if (!out_buf)
    869         return -1;
    870 
    871     /* best compression, small window, no zlib header */
    872     memset(&strm, 0, sizeof(strm));
    873     ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
    874                        Z_DEFLATED, -12,
    875                        9, Z_DEFAULT_STRATEGY);
    876     if (ret != 0) {
    877         qemu_free(out_buf);
    878         return -1;
    879     }
    880 
    881     strm.avail_in = s->cluster_size;
    882     strm.next_in = (uint8_t *)buf;
    883     strm.avail_out = s->cluster_size;
    884     strm.next_out = out_buf;
    885 
    886     ret = deflate(&strm, Z_FINISH);
    887     if (ret != Z_STREAM_END && ret != Z_OK) {
    888         qemu_free(out_buf);
    889         deflateEnd(&strm);
    890         return -1;
    891     }
    892     out_len = strm.next_out - out_buf;
    893 
    894     deflateEnd(&strm);
    895 
    896     if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
    897         /* could not compress: write normal cluster */
    898         bdrv_write(bs, sector_num, buf, s->cluster_sectors);
    899     } else {
    900         cluster_offset = get_cluster_offset(bs, sector_num << 9, 2,
    901                                             out_len, 0, 0);
    902         cluster_offset &= s->cluster_offset_mask;
    903         if (bdrv_pwrite(bs->file, cluster_offset, out_buf, out_len) != out_len) {
    904             qemu_free(out_buf);
    905             return -1;
    906         }
    907     }
    908 
    909     qemu_free(out_buf);
    910     return 0;
    911 }
    912 
    913 static void qcow_flush(BlockDriverState *bs)
    914 {
    915     bdrv_flush(bs->file);
    916 }
    917 
    918 static BlockDriverAIOCB *qcow_aio_flush(BlockDriverState *bs,
    919         BlockDriverCompletionFunc *cb, void *opaque)
    920 {
    921     return bdrv_aio_flush(bs->file, cb, opaque);
    922 }
    923 
    924 static int qcow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
    925 {
    926     BDRVQcowState *s = bs->opaque;
    927     bdi->cluster_size = s->cluster_size;
    928     return 0;
    929 }
    930 
    931 
    932 static QEMUOptionParameter qcow_create_options[] = {
    933     {
    934         .name = BLOCK_OPT_SIZE,
    935         .type = OPT_SIZE,
    936         .help = "Virtual disk size"
    937     },
    938     {
    939         .name = BLOCK_OPT_BACKING_FILE,
    940         .type = OPT_STRING,
    941         .help = "File name of a base image"
    942     },
    943     {
    944         .name = BLOCK_OPT_ENCRYPT,
    945         .type = OPT_FLAG,
    946         .help = "Encrypt the image"
    947     },
    948     { NULL }
    949 };
    950 
    951 static BlockDriver bdrv_qcow = {
    952     .format_name	= "qcow",
    953     .instance_size	= sizeof(BDRVQcowState),
    954     .bdrv_probe		= qcow_probe,
    955     .bdrv_open		= qcow_open,
    956     .bdrv_close		= qcow_close,
    957     .bdrv_create	= qcow_create,
    958     .bdrv_flush		= qcow_flush,
    959     .bdrv_is_allocated	= qcow_is_allocated,
    960     .bdrv_set_key	= qcow_set_key,
    961     .bdrv_make_empty	= qcow_make_empty,
    962     .bdrv_aio_readv	= qcow_aio_readv,
    963     .bdrv_aio_writev	= qcow_aio_writev,
    964     .bdrv_aio_flush	= qcow_aio_flush,
    965     .bdrv_write_compressed = qcow_write_compressed,
    966     .bdrv_get_info	= qcow_get_info,
    967 
    968     .create_options = qcow_create_options,
    969 };
    970 
    971 static void bdrv_qcow_init(void)
    972 {
    973     bdrv_register(&bdrv_qcow);
    974 }
    975 
    976 block_init(bdrv_qcow_init);
    977