qemu

Форк
0
/
qcow2.c 
6214 строк · 207.8 Кб
1
/*
2
 * Block driver for the QCOW version 2 format
3
 *
4
 * Copyright (c) 2004-2006 Fabrice Bellard
5
 *
6
 * Permission is hereby granted, free of charge, to any person obtaining a copy
7
 * of this software and associated documentation files (the "Software"), to deal
8
 * in the Software without restriction, including without limitation the rights
9
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
 * copies of the Software, and to permit persons to whom the Software is
11
 * furnished to do so, subject to the following conditions:
12
 *
13
 * The above copyright notice and this permission notice shall be included in
14
 * all copies or substantial portions of the Software.
15
 *
16
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22
 * THE SOFTWARE.
23
 */
24

25
#include "qemu/osdep.h"
26

27
#include "block/qdict.h"
28
#include "sysemu/block-backend.h"
29
#include "qemu/main-loop.h"
30
#include "qemu/module.h"
31
#include "qcow2.h"
32
#include "qemu/error-report.h"
33
#include "qapi/error.h"
34
#include "qapi/qapi-events-block-core.h"
35
#include "qapi/qmp/qdict.h"
36
#include "qapi/qmp/qstring.h"
37
#include "trace.h"
38
#include "qemu/option_int.h"
39
#include "qemu/cutils.h"
40
#include "qemu/bswap.h"
41
#include "qemu/memalign.h"
42
#include "qapi/qobject-input-visitor.h"
43
#include "qapi/qapi-visit-block-core.h"
44
#include "crypto.h"
45
#include "block/aio_task.h"
46
#include "block/dirty-bitmap.h"
47

48
/*
49
  Differences with QCOW:
50

51
  - Support for multiple incremental snapshots.
52
  - Memory management by reference counts.
53
  - Clusters which have a reference count of one have the bit
54
    QCOW_OFLAG_COPIED to optimize write performance.
55
  - Size of compressed clusters is stored in sectors to reduce bit usage
56
    in the cluster offsets.
57
  - Support for storing additional data (such as the VM state) in the
58
    snapshots.
59
  - If a backing store is used, the cluster size is not constrained
60
    (could be backported to QCOW).
61
  - L2 tables always have a size of one cluster.
62
*/
63

64

65
/*
 * Fixed-size header that precedes every header extension.  Both fields are
 * read from the image file and converted with be32_to_cpu() before use.
 */
typedef struct {
    uint32_t magic; /* extension type, matched against QCOW2_EXT_MAGIC_* */
    uint32_t len;   /* payload length in bytes, excluding this header */
} QEMU_PACKED QCowExtension;
69

70
/* Magic values identifying the header extension types this file handles */
#define QCOW2_EXT_MAGIC_END             0
#define QCOW2_EXT_MAGIC_BACKING_FORMAT  0xe2792aca
#define QCOW2_EXT_MAGIC_FEATURE_TABLE   0x6803f857
#define QCOW2_EXT_MAGIC_CRYPTO_HEADER   0x0537be77
#define QCOW2_EXT_MAGIC_BITMAPS         0x23852875
#define QCOW2_EXT_MAGIC_DATA_FILE       0x44415441
76

77
static int coroutine_fn
78
qcow2_co_preadv_compressed(BlockDriverState *bs,
79
                           uint64_t l2_entry,
80
                           uint64_t offset,
81
                           uint64_t bytes,
82
                           QEMUIOVector *qiov,
83
                           size_t qiov_offset);
84

85
static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename)
86
{
87
    const QCowHeader *cow_header = (const void *)buf;
88

89
    if (buf_size >= sizeof(QCowHeader) &&
90
        be32_to_cpu(cow_header->magic) == QCOW_MAGIC &&
91
        be32_to_cpu(cow_header->version) >= 2)
92
        return 100;
93
    else
94
        return 0;
95
}
96

97

98
static int GRAPH_RDLOCK
99
qcow2_crypto_hdr_read_func(QCryptoBlock *block, size_t offset,
100
                           uint8_t *buf, size_t buflen,
101
                           void *opaque, Error **errp)
102
{
103
    BlockDriverState *bs = opaque;
104
    BDRVQcow2State *s = bs->opaque;
105
    ssize_t ret;
106

107
    if ((offset + buflen) > s->crypto_header.length) {
108
        error_setg(errp, "Request for data outside of extension header");
109
        return -1;
110
    }
111

112
    ret = bdrv_pread(bs->file, s->crypto_header.offset + offset, buflen, buf,
113
                     0);
114
    if (ret < 0) {
115
        error_setg_errno(errp, -ret, "Could not read encryption header");
116
        return -1;
117
    }
118
    return 0;
119
}
120

121

122
static int coroutine_fn GRAPH_RDLOCK
123
qcow2_crypto_hdr_init_func(QCryptoBlock *block, size_t headerlen, void *opaque,
124
                           Error **errp)
125
{
126
    BlockDriverState *bs = opaque;
127
    BDRVQcow2State *s = bs->opaque;
128
    int64_t ret;
129
    int64_t clusterlen;
130

131
    ret = qcow2_alloc_clusters(bs, headerlen);
132
    if (ret < 0) {
133
        error_setg_errno(errp, -ret,
134
                         "Cannot allocate cluster for LUKS header size %zu",
135
                         headerlen);
136
        return -1;
137
    }
138

139
    s->crypto_header.length = headerlen;
140
    s->crypto_header.offset = ret;
141

142
    /*
143
     * Zero fill all space in cluster so it has predictable
144
     * content, as we may not initialize some regions of the
145
     * header (eg only 1 out of 8 key slots will be initialized)
146
     */
147
    clusterlen = size_to_clusters(s, headerlen) * s->cluster_size;
148
    assert(qcow2_pre_write_overlap_check(bs, 0, ret, clusterlen, false) == 0);
149
    ret = bdrv_co_pwrite_zeroes(bs->file, ret, clusterlen, 0);
150
    if (ret < 0) {
151
        error_setg_errno(errp, -ret, "Could not zero fill encryption header");
152
        return -1;
153
    }
154

155
    return 0;
156
}
157

158

159
/* The graph lock must be held when called in coroutine context */
160
static int coroutine_mixed_fn GRAPH_RDLOCK
161
qcow2_crypto_hdr_write_func(QCryptoBlock *block, size_t offset,
162
                            const uint8_t *buf, size_t buflen,
163
                            void *opaque, Error **errp)
164
{
165
    BlockDriverState *bs = opaque;
166
    BDRVQcow2State *s = bs->opaque;
167
    ssize_t ret;
168

169
    if ((offset + buflen) > s->crypto_header.length) {
170
        error_setg(errp, "Request for data outside of extension header");
171
        return -1;
172
    }
173

174
    ret = bdrv_pwrite(bs->file, s->crypto_header.offset + offset, buflen, buf,
175
                      0);
176
    if (ret < 0) {
177
        error_setg_errno(errp, -ret, "Could not read encryption header");
178
        return -1;
179
    }
180
    return 0;
181
}
182

183
static QDict*
184
qcow2_extract_crypto_opts(QemuOpts *opts, const char *fmt, Error **errp)
185
{
186
    QDict *cryptoopts_qdict;
187
    QDict *opts_qdict;
188

189
    /* Extract "encrypt." options into a qdict */
190
    opts_qdict = qemu_opts_to_qdict(opts, NULL);
191
    qdict_extract_subqdict(opts_qdict, &cryptoopts_qdict, "encrypt.");
192
    qobject_unref(opts_qdict);
193
    qdict_put_str(cryptoopts_qdict, "format", fmt);
194
    return cryptoopts_qdict;
195
}
196

197
/*
198
 * read qcow2 extension and fill bs
199
 * start reading from start_offset
200
 * finish reading upon magic of value 0 or when end_offset reached
201
 * unknown magic is skipped (future extension this version knows nothing about)
202
 * return 0 upon success, non-0 otherwise
203
 */
204
static int coroutine_fn GRAPH_RDLOCK
205
qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset,
206
                      uint64_t end_offset, void **p_feature_table,
207
                      int flags, bool *need_update_header, Error **errp)
208
{
209
    BDRVQcow2State *s = bs->opaque;
210
    QCowExtension ext;
211
    uint64_t offset;
212
    int ret;
213
    Qcow2BitmapHeaderExt bitmaps_ext;
214

215
    if (need_update_header != NULL) {
216
        *need_update_header = false;
217
    }
218

219
#ifdef DEBUG_EXT
220
    printf("qcow2_read_extensions: start=%ld end=%ld\n", start_offset, end_offset);
221
#endif
222
    offset = start_offset;
223
    while (offset < end_offset) {
224

225
#ifdef DEBUG_EXT
226
        /* Sanity check */
227
        if (offset > s->cluster_size)
228
            printf("qcow2_read_extension: suspicious offset %lu\n", offset);
229

230
        printf("attempting to read extended header in offset %lu\n", offset);
231
#endif
232

233
        ret = bdrv_co_pread(bs->file, offset, sizeof(ext), &ext, 0);
234
        if (ret < 0) {
235
            error_setg_errno(errp, -ret, "qcow2_read_extension: ERROR: "
236
                             "pread fail from offset %" PRIu64, offset);
237
            return 1;
238
        }
239
        ext.magic = be32_to_cpu(ext.magic);
240
        ext.len = be32_to_cpu(ext.len);
241
        offset += sizeof(ext);
242
#ifdef DEBUG_EXT
243
        printf("ext.magic = 0x%x\n", ext.magic);
244
#endif
245
        if (offset > end_offset || ext.len > end_offset - offset) {
246
            error_setg(errp, "Header extension too large");
247
            return -EINVAL;
248
        }
249

250
        switch (ext.magic) {
251
        case QCOW2_EXT_MAGIC_END:
252
            return 0;
253

254
        case QCOW2_EXT_MAGIC_BACKING_FORMAT:
255
            if (ext.len >= sizeof(bs->backing_format)) {
256
                error_setg(errp, "ERROR: ext_backing_format: len=%" PRIu32
257
                           " too large (>=%zu)", ext.len,
258
                           sizeof(bs->backing_format));
259
                return 2;
260
            }
261
            ret = bdrv_co_pread(bs->file, offset, ext.len, bs->backing_format, 0);
262
            if (ret < 0) {
263
                error_setg_errno(errp, -ret, "ERROR: ext_backing_format: "
264
                                 "Could not read format name");
265
                return 3;
266
            }
267
            bs->backing_format[ext.len] = '\0';
268
            s->image_backing_format = g_strdup(bs->backing_format);
269
#ifdef DEBUG_EXT
270
            printf("Qcow2: Got format extension %s\n", bs->backing_format);
271
#endif
272
            break;
273

274
        case QCOW2_EXT_MAGIC_FEATURE_TABLE:
275
            if (p_feature_table != NULL) {
276
                void *feature_table = g_malloc0(ext.len + 2 * sizeof(Qcow2Feature));
277
                ret = bdrv_co_pread(bs->file, offset, ext.len, feature_table, 0);
278
                if (ret < 0) {
279
                    error_setg_errno(errp, -ret, "ERROR: ext_feature_table: "
280
                                     "Could not read table");
281
                    g_free(feature_table);
282
                    return ret;
283
                }
284

285
                *p_feature_table = feature_table;
286
            }
287
            break;
288

289
        case QCOW2_EXT_MAGIC_CRYPTO_HEADER: {
290
            unsigned int cflags = 0;
291
            if (s->crypt_method_header != QCOW_CRYPT_LUKS) {
292
                error_setg(errp, "CRYPTO header extension only "
293
                           "expected with LUKS encryption method");
294
                return -EINVAL;
295
            }
296
            if (ext.len != sizeof(Qcow2CryptoHeaderExtension)) {
297
                error_setg(errp, "CRYPTO header extension size %u, "
298
                           "but expected size %zu", ext.len,
299
                           sizeof(Qcow2CryptoHeaderExtension));
300
                return -EINVAL;
301
            }
302

303
            ret = bdrv_co_pread(bs->file, offset, ext.len, &s->crypto_header, 0);
304
            if (ret < 0) {
305
                error_setg_errno(errp, -ret,
306
                                 "Unable to read CRYPTO header extension");
307
                return ret;
308
            }
309
            s->crypto_header.offset = be64_to_cpu(s->crypto_header.offset);
310
            s->crypto_header.length = be64_to_cpu(s->crypto_header.length);
311

312
            if ((s->crypto_header.offset % s->cluster_size) != 0) {
313
                error_setg(errp, "Encryption header offset '%" PRIu64 "' is "
314
                           "not a multiple of cluster size '%u'",
315
                           s->crypto_header.offset, s->cluster_size);
316
                return -EINVAL;
317
            }
318

319
            if (flags & BDRV_O_NO_IO) {
320
                cflags |= QCRYPTO_BLOCK_OPEN_NO_IO;
321
            }
322
            s->crypto = qcrypto_block_open(s->crypto_opts, "encrypt.",
323
                                           qcow2_crypto_hdr_read_func,
324
                                           bs, cflags, errp);
325
            if (!s->crypto) {
326
                return -EINVAL;
327
            }
328
        }   break;
329

330
        case QCOW2_EXT_MAGIC_BITMAPS:
331
            if (ext.len != sizeof(bitmaps_ext)) {
332
                error_setg_errno(errp, -ret, "bitmaps_ext: "
333
                                 "Invalid extension length");
334
                return -EINVAL;
335
            }
336

337
            if (!(s->autoclear_features & QCOW2_AUTOCLEAR_BITMAPS)) {
338
                if (s->qcow_version < 3) {
339
                    /* Let's be a bit more specific */
340
                    warn_report("This qcow2 v2 image contains bitmaps, but "
341
                                "they may have been modified by a program "
342
                                "without persistent bitmap support; so now "
343
                                "they must all be considered inconsistent");
344
                } else {
345
                    warn_report("a program lacking bitmap support "
346
                                "modified this file, so all bitmaps are now "
347
                                "considered inconsistent");
348
                }
349
                error_printf("Some clusters may be leaked, "
350
                             "run 'qemu-img check -r' on the image "
351
                             "file to fix.");
352
                if (need_update_header != NULL) {
353
                    /* Updating is needed to drop invalid bitmap extension. */
354
                    *need_update_header = true;
355
                }
356
                break;
357
            }
358

359
            ret = bdrv_co_pread(bs->file, offset, ext.len, &bitmaps_ext, 0);
360
            if (ret < 0) {
361
                error_setg_errno(errp, -ret, "bitmaps_ext: "
362
                                 "Could not read ext header");
363
                return ret;
364
            }
365

366
            if (bitmaps_ext.reserved32 != 0) {
367
                error_setg_errno(errp, -ret, "bitmaps_ext: "
368
                                 "Reserved field is not zero");
369
                return -EINVAL;
370
            }
371

372
            bitmaps_ext.nb_bitmaps = be32_to_cpu(bitmaps_ext.nb_bitmaps);
373
            bitmaps_ext.bitmap_directory_size =
374
                be64_to_cpu(bitmaps_ext.bitmap_directory_size);
375
            bitmaps_ext.bitmap_directory_offset =
376
                be64_to_cpu(bitmaps_ext.bitmap_directory_offset);
377

378
            if (bitmaps_ext.nb_bitmaps > QCOW2_MAX_BITMAPS) {
379
                error_setg(errp,
380
                           "bitmaps_ext: Image has %" PRIu32 " bitmaps, "
381
                           "exceeding the QEMU supported maximum of %d",
382
                           bitmaps_ext.nb_bitmaps, QCOW2_MAX_BITMAPS);
383
                return -EINVAL;
384
            }
385

386
            if (bitmaps_ext.nb_bitmaps == 0) {
387
                error_setg(errp, "found bitmaps extension with zero bitmaps");
388
                return -EINVAL;
389
            }
390

391
            if (offset_into_cluster(s, bitmaps_ext.bitmap_directory_offset)) {
392
                error_setg(errp, "bitmaps_ext: "
393
                                 "invalid bitmap directory offset");
394
                return -EINVAL;
395
            }
396

397
            if (bitmaps_ext.bitmap_directory_size >
398
                QCOW2_MAX_BITMAP_DIRECTORY_SIZE) {
399
                error_setg(errp, "bitmaps_ext: "
400
                                 "bitmap directory size (%" PRIu64 ") exceeds "
401
                                 "the maximum supported size (%d)",
402
                                 bitmaps_ext.bitmap_directory_size,
403
                                 QCOW2_MAX_BITMAP_DIRECTORY_SIZE);
404
                return -EINVAL;
405
            }
406

407
            s->nb_bitmaps = bitmaps_ext.nb_bitmaps;
408
            s->bitmap_directory_offset =
409
                    bitmaps_ext.bitmap_directory_offset;
410
            s->bitmap_directory_size =
411
                    bitmaps_ext.bitmap_directory_size;
412

413
#ifdef DEBUG_EXT
414
            printf("Qcow2: Got bitmaps extension: "
415
                   "offset=%" PRIu64 " nb_bitmaps=%" PRIu32 "\n",
416
                   s->bitmap_directory_offset, s->nb_bitmaps);
417
#endif
418
            break;
419

420
        case QCOW2_EXT_MAGIC_DATA_FILE:
421
        {
422
            s->image_data_file = g_malloc0(ext.len + 1);
423
            ret = bdrv_co_pread(bs->file, offset, ext.len, s->image_data_file, 0);
424
            if (ret < 0) {
425
                error_setg_errno(errp, -ret,
426
                                 "ERROR: Could not read data file name");
427
                return ret;
428
            }
429
#ifdef DEBUG_EXT
430
            printf("Qcow2: Got external data file %s\n", s->image_data_file);
431
#endif
432
            break;
433
        }
434

435
        default:
436
            /* unknown magic - save it in case we need to rewrite the header */
437
            /* If you add a new feature, make sure to also update the fast
438
             * path of qcow2_make_empty() to deal with it. */
439
            {
440
                Qcow2UnknownHeaderExtension *uext;
441

442
                uext = g_malloc0(sizeof(*uext)  + ext.len);
443
                uext->magic = ext.magic;
444
                uext->len = ext.len;
445
                QLIST_INSERT_HEAD(&s->unknown_header_ext, uext, next);
446

447
                ret = bdrv_co_pread(bs->file, offset, uext->len, uext->data, 0);
448
                if (ret < 0) {
449
                    error_setg_errno(errp, -ret, "ERROR: unknown extension: "
450
                                     "Could not read data");
451
                    return ret;
452
                }
453
            }
454
            break;
455
        }
456

457
        offset += ((ext.len + 7) & ~7);
458
    }
459

460
    return 0;
461
}
462

463
/* Unlink and free every entry of s->unknown_header_ext. */
static void cleanup_unknown_header_ext(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    Qcow2UnknownHeaderExtension *entry;
    Qcow2UnknownHeaderExtension *following;

    /* The _SAFE variant is required because entries are freed while walking */
    QLIST_FOREACH_SAFE(entry, &s->unknown_header_ext, next, following) {
        QLIST_REMOVE(entry, next);
        g_free(entry);
    }
}
473

474
/*
 * Set @errp to a message listing the unsupported incompatible features whose
 * bits are set in @mask.
 *
 * @table, if non-NULL, is walked until an entry with an empty name is found.
 * Entries of type QCOW2_FEAT_TYPE_INCOMPATIBLE whose bit is set in @mask
 * contribute their name (at most 46 characters).  Any bits left over are
 * reported in hex as an unknown incompatible feature.
 */
static void report_unsupported_feature(Error **errp, Qcow2Feature *table,
                                       uint64_t mask)
{
    g_autoptr(GString) features = g_string_sized_new(60);

    while (table && table->name[0] != '\0') {
        /*
         * Skip bit numbers that do not fit a 64-bit mask: shifting by 64 or
         * more is undefined behaviour and such an entry cannot match anyway.
         * NOTE(review): the table presumably comes straight from the image
         * file, so the bit number is not trustworthy -- confirm with caller.
         */
        if (table->type == QCOW2_FEAT_TYPE_INCOMPATIBLE && table->bit < 64) {
            uint64_t feature_bit = 1ULL << table->bit;

            if (mask & feature_bit) {
                if (features->len > 0) {
                    g_string_append(features, ", ");
                }
                g_string_append_printf(features, "%.46s", table->name);
                /* Named now; keep it out of the "unknown" remainder */
                mask &= ~feature_bit;
            }
        }
        table++;
    }

    if (mask) {
        if (features->len > 0) {
            g_string_append(features, ", ");
        }
        g_string_append_printf(features,
                               "Unknown incompatible feature: %" PRIx64, mask);
    }

    error_setg(errp, "Unsupported qcow2 feature(s): %s", features->str);
}
502

503
/*
504
 * Sets the dirty bit and flushes afterwards if necessary.
505
 *
506
 * The incompatible_features bit is only set if the image file header was
507
 * updated successfully.  Therefore it is not required to check the return
508
 * value of this function.
509
 */
510
int qcow2_mark_dirty(BlockDriverState *bs)
511
{
512
    BDRVQcow2State *s = bs->opaque;
513
    uint64_t val;
514
    int ret;
515

516
    assert(s->qcow_version >= 3);
517

518
    if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) {
519
        return 0; /* already dirty */
520
    }
521

522
    val = cpu_to_be64(s->incompatible_features | QCOW2_INCOMPAT_DIRTY);
523
    ret = bdrv_pwrite_sync(bs->file,
524
                           offsetof(QCowHeader, incompatible_features),
525
                           sizeof(val), &val, 0);
526
    if (ret < 0) {
527
        return ret;
528
    }
529

530
    /* Only treat image as dirty if the header was updated successfully */
531
    s->incompatible_features |= QCOW2_INCOMPAT_DIRTY;
532
    return 0;
533
}
534

535
/*
536
 * Clears the dirty bit and flushes before if necessary.  Only call this
537
 * function when there are no pending requests, it does not guard against
538
 * concurrent requests dirtying the image.
539
 */
540
static int GRAPH_RDLOCK qcow2_mark_clean(BlockDriverState *bs)
541
{
542
    BDRVQcow2State *s = bs->opaque;
543

544
    if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) {
545
        int ret;
546

547
        s->incompatible_features &= ~QCOW2_INCOMPAT_DIRTY;
548

549
        ret = qcow2_flush_caches(bs);
550
        if (ret < 0) {
551
            return ret;
552
        }
553

554
        return qcow2_update_header(bs);
555
    }
556
    return 0;
557
}
558

559
/*
560
 * Marks the image as corrupt.
561
 */
562
int qcow2_mark_corrupt(BlockDriverState *bs)
563
{
564
    BDRVQcow2State *s = bs->opaque;
565

566
    s->incompatible_features |= QCOW2_INCOMPAT_CORRUPT;
567
    return qcow2_update_header(bs);
568
}
569

570
/*
571
 * Marks the image as consistent, i.e., unsets the corrupt bit, and flushes
572
 * before if necessary.
573
 */
574
static int coroutine_fn GRAPH_RDLOCK
575
qcow2_mark_consistent(BlockDriverState *bs)
576
{
577
    BDRVQcow2State *s = bs->opaque;
578

579
    if (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT) {
580
        int ret = qcow2_flush_caches(bs);
581
        if (ret < 0) {
582
            return ret;
583
        }
584

585
        s->incompatible_features &= ~QCOW2_INCOMPAT_CORRUPT;
586
        return qcow2_update_header(bs);
587
    }
588
    return 0;
589
}
590

591
/*
 * Accumulate the counters of @src into @out.
 *
 * The corruption, leak and check-error counters (and their "fixed"
 * counterparts) are added.  When @set_allocation_info is true,
 * image_end_offset and bfi are additionally copied from @src, overwriting
 * the values in @out.
 */
static void qcow2_add_check_result(BdrvCheckResult *out,
                                   const BdrvCheckResult *src,
                                   bool set_allocation_info)
{
    /* Problems found */
    out->corruptions += src->corruptions;
    out->leaks += src->leaks;
    out->check_errors += src->check_errors;

    /* Problems repaired */
    out->corruptions_fixed += src->corruptions_fixed;
    out->leaks_fixed += src->leaks_fixed;

    if (!set_allocation_info) {
        return;
    }

    out->image_end_offset = src->image_end_offset;
    out->bfi = src->bfi;
}
606

607
static int coroutine_fn GRAPH_RDLOCK
608
qcow2_co_check_locked(BlockDriverState *bs, BdrvCheckResult *result,
609
                      BdrvCheckMode fix)
610
{
611
    BdrvCheckResult snapshot_res = {};
612
    BdrvCheckResult refcount_res = {};
613
    int ret;
614

615
    memset(result, 0, sizeof(*result));
616

617
    ret = qcow2_check_read_snapshot_table(bs, &snapshot_res, fix);
618
    if (ret < 0) {
619
        qcow2_add_check_result(result, &snapshot_res, false);
620
        return ret;
621
    }
622

623
    ret = qcow2_check_refcounts(bs, &refcount_res, fix);
624
    qcow2_add_check_result(result, &refcount_res, true);
625
    if (ret < 0) {
626
        qcow2_add_check_result(result, &snapshot_res, false);
627
        return ret;
628
    }
629

630
    ret = qcow2_check_fix_snapshot_table(bs, &snapshot_res, fix);
631
    qcow2_add_check_result(result, &snapshot_res, false);
632
    if (ret < 0) {
633
        return ret;
634
    }
635

636
    if (fix && result->check_errors == 0 && result->corruptions == 0) {
637
        ret = qcow2_mark_clean(bs);
638
        if (ret < 0) {
639
            return ret;
640
        }
641
        return qcow2_mark_consistent(bs);
642
    }
643
    return ret;
644
}
645

646
static int coroutine_fn GRAPH_RDLOCK
647
qcow2_co_check(BlockDriverState *bs, BdrvCheckResult *result,
648
               BdrvCheckMode fix)
649
{
650
    BDRVQcow2State *s = bs->opaque;
651
    int ret;
652

653
    qemu_co_mutex_lock(&s->lock);
654
    ret = qcow2_co_check_locked(bs, result, fix);
655
    qemu_co_mutex_unlock(&s->lock);
656
    return ret;
657
}
658

659
int qcow2_validate_table(BlockDriverState *bs, uint64_t offset,
660
                         uint64_t entries, size_t entry_len,
661
                         int64_t max_size_bytes, const char *table_name,
662
                         Error **errp)
663
{
664
    BDRVQcow2State *s = bs->opaque;
665

666
    if (entries > max_size_bytes / entry_len) {
667
        error_setg(errp, "%s too large", table_name);
668
        return -EFBIG;
669
    }
670

671
    /* Use signed INT64_MAX as the maximum even for uint64_t header fields,
672
     * because values will be passed to qemu functions taking int64_t. */
673
    if ((INT64_MAX - entries * entry_len < offset) ||
674
        (offset_into_cluster(s, offset) != 0)) {
675
        error_setg(errp, "%s offset invalid", table_name);
676
        return -EINVAL;
677
    }
678

679
    return 0;
680
}
681

682
static const char *const mutable_opts[] = {
683
    QCOW2_OPT_LAZY_REFCOUNTS,
684
    QCOW2_OPT_DISCARD_REQUEST,
685
    QCOW2_OPT_DISCARD_SNAPSHOT,
686
    QCOW2_OPT_DISCARD_OTHER,
687
    QCOW2_OPT_DISCARD_NO_UNREF,
688
    QCOW2_OPT_OVERLAP,
689
    QCOW2_OPT_OVERLAP_TEMPLATE,
690
    QCOW2_OPT_OVERLAP_MAIN_HEADER,
691
    QCOW2_OPT_OVERLAP_ACTIVE_L1,
692
    QCOW2_OPT_OVERLAP_ACTIVE_L2,
693
    QCOW2_OPT_OVERLAP_REFCOUNT_TABLE,
694
    QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK,
695
    QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE,
696
    QCOW2_OPT_OVERLAP_INACTIVE_L1,
697
    QCOW2_OPT_OVERLAP_INACTIVE_L2,
698
    QCOW2_OPT_OVERLAP_BITMAP_DIRECTORY,
699
    QCOW2_OPT_CACHE_SIZE,
700
    QCOW2_OPT_L2_CACHE_SIZE,
701
    QCOW2_OPT_L2_CACHE_ENTRY_SIZE,
702
    QCOW2_OPT_REFCOUNT_CACHE_SIZE,
703
    QCOW2_OPT_CACHE_CLEAN_INTERVAL,
704
    NULL
705
};
706

707
static QemuOptsList qcow2_runtime_opts = {
708
    .name = "qcow2",
709
    .head = QTAILQ_HEAD_INITIALIZER(qcow2_runtime_opts.head),
710
    .desc = {
711
        {
712
            .name = QCOW2_OPT_LAZY_REFCOUNTS,
713
            .type = QEMU_OPT_BOOL,
714
            .help = "Postpone refcount updates",
715
        },
716
        {
717
            .name = QCOW2_OPT_DISCARD_REQUEST,
718
            .type = QEMU_OPT_BOOL,
719
            .help = "Pass guest discard requests to the layer below",
720
        },
721
        {
722
            .name = QCOW2_OPT_DISCARD_SNAPSHOT,
723
            .type = QEMU_OPT_BOOL,
724
            .help = "Generate discard requests when snapshot related space "
725
                    "is freed",
726
        },
727
        {
728
            .name = QCOW2_OPT_DISCARD_OTHER,
729
            .type = QEMU_OPT_BOOL,
730
            .help = "Generate discard requests when other clusters are freed",
731
        },
732
        {
733
            .name = QCOW2_OPT_DISCARD_NO_UNREF,
734
            .type = QEMU_OPT_BOOL,
735
            .help = "Do not unreference discarded clusters",
736
        },
737
        {
738
            .name = QCOW2_OPT_OVERLAP,
739
            .type = QEMU_OPT_STRING,
740
            .help = "Selects which overlap checks to perform from a range of "
741
                    "templates (none, constant, cached, all)",
742
        },
743
        {
744
            .name = QCOW2_OPT_OVERLAP_TEMPLATE,
745
            .type = QEMU_OPT_STRING,
746
            .help = "Selects which overlap checks to perform from a range of "
747
                    "templates (none, constant, cached, all)",
748
        },
749
        {
750
            .name = QCOW2_OPT_OVERLAP_MAIN_HEADER,
751
            .type = QEMU_OPT_BOOL,
752
            .help = "Check for unintended writes into the main qcow2 header",
753
        },
754
        {
755
            .name = QCOW2_OPT_OVERLAP_ACTIVE_L1,
756
            .type = QEMU_OPT_BOOL,
757
            .help = "Check for unintended writes into the active L1 table",
758
        },
759
        {
760
            .name = QCOW2_OPT_OVERLAP_ACTIVE_L2,
761
            .type = QEMU_OPT_BOOL,
762
            .help = "Check for unintended writes into an active L2 table",
763
        },
764
        {
765
            .name = QCOW2_OPT_OVERLAP_REFCOUNT_TABLE,
766
            .type = QEMU_OPT_BOOL,
767
            .help = "Check for unintended writes into the refcount table",
768
        },
769
        {
770
            .name = QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK,
771
            .type = QEMU_OPT_BOOL,
772
            .help = "Check for unintended writes into a refcount block",
773
        },
774
        {
775
            .name = QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE,
776
            .type = QEMU_OPT_BOOL,
777
            .help = "Check for unintended writes into the snapshot table",
778
        },
779
        {
780
            .name = QCOW2_OPT_OVERLAP_INACTIVE_L1,
781
            .type = QEMU_OPT_BOOL,
782
            .help = "Check for unintended writes into an inactive L1 table",
783
        },
784
        {
785
            .name = QCOW2_OPT_OVERLAP_INACTIVE_L2,
786
            .type = QEMU_OPT_BOOL,
787
            .help = "Check for unintended writes into an inactive L2 table",
788
        },
789
        {
790
            .name = QCOW2_OPT_OVERLAP_BITMAP_DIRECTORY,
791
            .type = QEMU_OPT_BOOL,
792
            .help = "Check for unintended writes into the bitmap directory",
793
        },
794
        {
795
            .name = QCOW2_OPT_CACHE_SIZE,
796
            .type = QEMU_OPT_SIZE,
797
            .help = "Maximum combined metadata (L2 tables and refcount blocks) "
798
                    "cache size",
799
        },
800
        {
801
            .name = QCOW2_OPT_L2_CACHE_SIZE,
802
            .type = QEMU_OPT_SIZE,
803
            .help = "Maximum L2 table cache size",
804
        },
805
        {
806
            .name = QCOW2_OPT_L2_CACHE_ENTRY_SIZE,
807
            .type = QEMU_OPT_SIZE,
808
            .help = "Size of each entry in the L2 cache",
809
        },
810
        {
811
            .name = QCOW2_OPT_REFCOUNT_CACHE_SIZE,
812
            .type = QEMU_OPT_SIZE,
813
            .help = "Maximum refcount block cache size",
814
        },
815
        {
816
            .name = QCOW2_OPT_CACHE_CLEAN_INTERVAL,
817
            .type = QEMU_OPT_NUMBER,
818
            .help = "Clean unused cache entries after this time (in seconds)",
819
        },
820
        BLOCK_CRYPTO_OPT_DEF_KEY_SECRET("encrypt.",
821
            "ID of secret providing qcow2 AES key or LUKS passphrase"),
822
        { /* end of list */ }
823
    },
824
};
825

826
/*
 * Maps each overlap-check bit number (QCOW2_OL_*_BITNR) to the name of the
 * boolean runtime option controlling that check.  The table is read-only,
 * so the pointers are const as well, matching mutable_opts.
 */
static const char *const overlap_bool_option_names[QCOW2_OL_MAX_BITNR] = {
    [QCOW2_OL_MAIN_HEADER_BITNR]      = QCOW2_OPT_OVERLAP_MAIN_HEADER,
    [QCOW2_OL_ACTIVE_L1_BITNR]        = QCOW2_OPT_OVERLAP_ACTIVE_L1,
    [QCOW2_OL_ACTIVE_L2_BITNR]        = QCOW2_OPT_OVERLAP_ACTIVE_L2,
    [QCOW2_OL_REFCOUNT_TABLE_BITNR]   = QCOW2_OPT_OVERLAP_REFCOUNT_TABLE,
    [QCOW2_OL_REFCOUNT_BLOCK_BITNR]   = QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK,
    [QCOW2_OL_SNAPSHOT_TABLE_BITNR]   = QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE,
    [QCOW2_OL_INACTIVE_L1_BITNR]      = QCOW2_OPT_OVERLAP_INACTIVE_L1,
    [QCOW2_OL_INACTIVE_L2_BITNR]      = QCOW2_OPT_OVERLAP_INACTIVE_L2,
    [QCOW2_OL_BITMAP_DIRECTORY_BITNR] = QCOW2_OPT_OVERLAP_BITMAP_DIRECTORY,
};
837

838
static void cache_clean_timer_cb(void *opaque)
839
{
840
    BlockDriverState *bs = opaque;
841
    BDRVQcow2State *s = bs->opaque;
842
    qcow2_cache_clean_unused(s->l2_table_cache);
843
    qcow2_cache_clean_unused(s->refcount_block_cache);
844
    timer_mod(s->cache_clean_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
845
              (int64_t) s->cache_clean_interval * 1000);
846
}
847

848
/*
 * Create the cache clean timer in @context and schedule its first run one
 * cache clean interval from now.  Does nothing when the configured interval
 * is not greater than zero.
 */
static void cache_clean_timer_init(BlockDriverState *bs, AioContext *context)
{
    BDRVQcow2State *state = bs->opaque;
    int64_t first_run_ms;

    if (state->cache_clean_interval <= 0) {
        return;
    }

    state->cache_clean_timer =
        aio_timer_new_with_attrs(context, QEMU_CLOCK_VIRTUAL, SCALE_MS,
                                 QEMU_TIMER_ATTR_EXTERNAL,
                                 cache_clean_timer_cb, bs);

    /* The interval is multiplied by 1000 to match the millisecond clock */
    first_run_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) +
                   (int64_t) state->cache_clean_interval * 1000;
    timer_mod(state->cache_clean_timer, first_run_ms);
}
860

861
/*
 * Free the cache clean timer, if one exists, and clear the pointer so that
 * a later call is a no-op.
 */
static void cache_clean_timer_del(BlockDriverState *bs)
{
    BDRVQcow2State *state = bs->opaque;

    if (!state->cache_clean_timer) {
        return;
    }

    timer_free(state->cache_clean_timer);
    state->cache_clean_timer = NULL;
}
869

870
/*
 * Tear down the state tied to the current AioContext of @bs: the cache
 * clean timer is removed.
 */
static void qcow2_detach_aio_context(BlockDriverState *bs)
{
    cache_clean_timer_del(bs);
}
874

875
static void qcow2_attach_aio_context(BlockDriverState *bs,
876
                                     AioContext *new_context)
877
{
878
    cache_clean_timer_init(bs, new_context);
879
}
880

881
static bool read_cache_sizes(BlockDriverState *bs, QemuOpts *opts,
882
                             uint64_t *l2_cache_size,
883
                             uint64_t *l2_cache_entry_size,
884
                             uint64_t *refcount_cache_size, Error **errp)
885
{
886
    BDRVQcow2State *s = bs->opaque;
887
    uint64_t combined_cache_size, l2_cache_max_setting;
888
    bool l2_cache_size_set, refcount_cache_size_set, combined_cache_size_set;
889
    bool l2_cache_entry_size_set;
890
    int min_refcount_cache = MIN_REFCOUNT_CACHE_SIZE * s->cluster_size;
891
    uint64_t virtual_disk_size = bs->total_sectors * BDRV_SECTOR_SIZE;
892
    uint64_t max_l2_entries = DIV_ROUND_UP(virtual_disk_size, s->cluster_size);
893
    /* An L2 table is always one cluster in size so the max cache size
894
     * should be a multiple of the cluster size. */
895
    uint64_t max_l2_cache = ROUND_UP(max_l2_entries * l2_entry_size(s),
896
                                     s->cluster_size);
897

898
    combined_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_CACHE_SIZE);
899
    l2_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_L2_CACHE_SIZE);
900
    refcount_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_REFCOUNT_CACHE_SIZE);
901
    l2_cache_entry_size_set = qemu_opt_get(opts, QCOW2_OPT_L2_CACHE_ENTRY_SIZE);
902

903
    combined_cache_size = qemu_opt_get_size(opts, QCOW2_OPT_CACHE_SIZE, 0);
904
    l2_cache_max_setting = qemu_opt_get_size(opts, QCOW2_OPT_L2_CACHE_SIZE,
905
                                             DEFAULT_L2_CACHE_MAX_SIZE);
906
    *refcount_cache_size = qemu_opt_get_size(opts,
907
                                             QCOW2_OPT_REFCOUNT_CACHE_SIZE, 0);
908

909
    *l2_cache_entry_size = qemu_opt_get_size(
910
        opts, QCOW2_OPT_L2_CACHE_ENTRY_SIZE, s->cluster_size);
911

912
    *l2_cache_size = MIN(max_l2_cache, l2_cache_max_setting);
913

914
    if (combined_cache_size_set) {
915
        if (l2_cache_size_set && refcount_cache_size_set) {
916
            error_setg(errp, QCOW2_OPT_CACHE_SIZE ", " QCOW2_OPT_L2_CACHE_SIZE
917
                       " and " QCOW2_OPT_REFCOUNT_CACHE_SIZE " may not be set "
918
                       "at the same time");
919
            return false;
920
        } else if (l2_cache_size_set &&
921
                   (l2_cache_max_setting > combined_cache_size)) {
922
            error_setg(errp, QCOW2_OPT_L2_CACHE_SIZE " may not exceed "
923
                       QCOW2_OPT_CACHE_SIZE);
924
            return false;
925
        } else if (*refcount_cache_size > combined_cache_size) {
926
            error_setg(errp, QCOW2_OPT_REFCOUNT_CACHE_SIZE " may not exceed "
927
                       QCOW2_OPT_CACHE_SIZE);
928
            return false;
929
        }
930

931
        if (l2_cache_size_set) {
932
            *refcount_cache_size = combined_cache_size - *l2_cache_size;
933
        } else if (refcount_cache_size_set) {
934
            *l2_cache_size = combined_cache_size - *refcount_cache_size;
935
        } else {
936
            /* Assign as much memory as possible to the L2 cache, and
937
             * use the remainder for the refcount cache */
938
            if (combined_cache_size >= max_l2_cache + min_refcount_cache) {
939
                *l2_cache_size = max_l2_cache;
940
                *refcount_cache_size = combined_cache_size - *l2_cache_size;
941
            } else {
942
                *refcount_cache_size =
943
                    MIN(combined_cache_size, min_refcount_cache);
944
                *l2_cache_size = combined_cache_size - *refcount_cache_size;
945
            }
946
        }
947
    }
948

949
    /*
950
     * If the L2 cache is not enough to cover the whole disk then
951
     * default to 4KB entries. Smaller entries reduce the cost of
952
     * loads and evictions and increase I/O performance.
953
     */
954
    if (*l2_cache_size < max_l2_cache && !l2_cache_entry_size_set) {
955
        *l2_cache_entry_size = MIN(s->cluster_size, 4096);
956
    }
957

958
    /* l2_cache_size and refcount_cache_size are ensured to have at least
959
     * their minimum values in qcow2_update_options_prepare() */
960

961
    if (*l2_cache_entry_size < (1 << MIN_CLUSTER_BITS) ||
962
        *l2_cache_entry_size > s->cluster_size ||
963
        !is_power_of_2(*l2_cache_entry_size)) {
964
        error_setg(errp, "L2 cache entry size must be a power of two "
965
                   "between %d and the cluster size (%d)",
966
                   1 << MIN_CLUSTER_BITS, s->cluster_size);
967
        return false;
968
    }
969

970
    return true;
971
}
972

973
typedef struct Qcow2ReopenState {
974
    Qcow2Cache *l2_table_cache;
975
    Qcow2Cache *refcount_block_cache;
976
    int l2_slice_size; /* Number of entries in a slice of the L2 table */
977
    bool use_lazy_refcounts;
978
    int overlap_check;
979
    bool discard_passthrough[QCOW2_DISCARD_MAX];
980
    bool discard_no_unref;
981
    uint64_t cache_clean_interval;
982
    QCryptoBlockOpenOptions *crypto_opts; /* Disk encryption runtime options */
983
} Qcow2ReopenState;
984

985
static int GRAPH_RDLOCK
986
qcow2_update_options_prepare(BlockDriverState *bs, Qcow2ReopenState *r,
987
                             QDict *options, int flags, Error **errp)
988
{
989
    BDRVQcow2State *s = bs->opaque;
990
    QemuOpts *opts = NULL;
991
    const char *opt_overlap_check, *opt_overlap_check_template;
992
    int overlap_check_template = 0;
993
    uint64_t l2_cache_size, l2_cache_entry_size, refcount_cache_size;
994
    int i;
995
    const char *encryptfmt;
996
    QDict *encryptopts = NULL;
997
    int ret;
998

999
    qdict_extract_subqdict(options, &encryptopts, "encrypt.");
1000
    encryptfmt = qdict_get_try_str(encryptopts, "format");
1001

1002
    opts = qemu_opts_create(&qcow2_runtime_opts, NULL, 0, &error_abort);
1003
    if (!qemu_opts_absorb_qdict(opts, options, errp)) {
1004
        ret = -EINVAL;
1005
        goto fail;
1006
    }
1007

1008
    /* get L2 table/refcount block cache size from command line options */
1009
    if (!read_cache_sizes(bs, opts, &l2_cache_size, &l2_cache_entry_size,
1010
                          &refcount_cache_size, errp)) {
1011
        ret = -EINVAL;
1012
        goto fail;
1013
    }
1014

1015
    l2_cache_size /= l2_cache_entry_size;
1016
    if (l2_cache_size < MIN_L2_CACHE_SIZE) {
1017
        l2_cache_size = MIN_L2_CACHE_SIZE;
1018
    }
1019
    if (l2_cache_size > INT_MAX) {
1020
        error_setg(errp, "L2 cache size too big");
1021
        ret = -EINVAL;
1022
        goto fail;
1023
    }
1024

1025
    refcount_cache_size /= s->cluster_size;
1026
    if (refcount_cache_size < MIN_REFCOUNT_CACHE_SIZE) {
1027
        refcount_cache_size = MIN_REFCOUNT_CACHE_SIZE;
1028
    }
1029
    if (refcount_cache_size > INT_MAX) {
1030
        error_setg(errp, "Refcount cache size too big");
1031
        ret = -EINVAL;
1032
        goto fail;
1033
    }
1034

1035
    /* alloc new L2 table/refcount block cache, flush old one */
1036
    if (s->l2_table_cache) {
1037
        ret = qcow2_cache_flush(bs, s->l2_table_cache);
1038
        if (ret) {
1039
            error_setg_errno(errp, -ret, "Failed to flush the L2 table cache");
1040
            goto fail;
1041
        }
1042
    }
1043

1044
    if (s->refcount_block_cache) {
1045
        ret = qcow2_cache_flush(bs, s->refcount_block_cache);
1046
        if (ret) {
1047
            error_setg_errno(errp, -ret,
1048
                             "Failed to flush the refcount block cache");
1049
            goto fail;
1050
        }
1051
    }
1052

1053
    r->l2_slice_size = l2_cache_entry_size / l2_entry_size(s);
1054
    r->l2_table_cache = qcow2_cache_create(bs, l2_cache_size,
1055
                                           l2_cache_entry_size);
1056
    r->refcount_block_cache = qcow2_cache_create(bs, refcount_cache_size,
1057
                                                 s->cluster_size);
1058
    if (r->l2_table_cache == NULL || r->refcount_block_cache == NULL) {
1059
        error_setg(errp, "Could not allocate metadata caches");
1060
        ret = -ENOMEM;
1061
        goto fail;
1062
    }
1063

1064
    /* New interval for cache cleanup timer */
1065
    r->cache_clean_interval =
1066
        qemu_opt_get_number(opts, QCOW2_OPT_CACHE_CLEAN_INTERVAL,
1067
                            DEFAULT_CACHE_CLEAN_INTERVAL);
1068
#ifndef CONFIG_LINUX
1069
    if (r->cache_clean_interval != 0) {
1070
        error_setg(errp, QCOW2_OPT_CACHE_CLEAN_INTERVAL
1071
                   " not supported on this host");
1072
        ret = -EINVAL;
1073
        goto fail;
1074
    }
1075
#endif
1076
    if (r->cache_clean_interval > UINT_MAX) {
1077
        error_setg(errp, "Cache clean interval too big");
1078
        ret = -EINVAL;
1079
        goto fail;
1080
    }
1081

1082
    /* lazy-refcounts; flush if going from enabled to disabled */
1083
    r->use_lazy_refcounts = qemu_opt_get_bool(opts, QCOW2_OPT_LAZY_REFCOUNTS,
1084
        (s->compatible_features & QCOW2_COMPAT_LAZY_REFCOUNTS));
1085
    if (r->use_lazy_refcounts && s->qcow_version < 3) {
1086
        error_setg(errp, "Lazy refcounts require a qcow2 image with at least "
1087
                   "qemu 1.1 compatibility level");
1088
        ret = -EINVAL;
1089
        goto fail;
1090
    }
1091

1092
    if (s->use_lazy_refcounts && !r->use_lazy_refcounts) {
1093
        ret = qcow2_mark_clean(bs);
1094
        if (ret < 0) {
1095
            error_setg_errno(errp, -ret, "Failed to disable lazy refcounts");
1096
            goto fail;
1097
        }
1098
    }
1099

1100
    /* Overlap check options */
1101
    opt_overlap_check = qemu_opt_get(opts, QCOW2_OPT_OVERLAP);
1102
    opt_overlap_check_template = qemu_opt_get(opts, QCOW2_OPT_OVERLAP_TEMPLATE);
1103
    if (opt_overlap_check_template && opt_overlap_check &&
1104
        strcmp(opt_overlap_check_template, opt_overlap_check))
1105
    {
1106
        error_setg(errp, "Conflicting values for qcow2 options '"
1107
                   QCOW2_OPT_OVERLAP "' ('%s') and '" QCOW2_OPT_OVERLAP_TEMPLATE
1108
                   "' ('%s')", opt_overlap_check, opt_overlap_check_template);
1109
        ret = -EINVAL;
1110
        goto fail;
1111
    }
1112
    if (!opt_overlap_check) {
1113
        opt_overlap_check = opt_overlap_check_template ?: "cached";
1114
    }
1115

1116
    if (!strcmp(opt_overlap_check, "none")) {
1117
        overlap_check_template = 0;
1118
    } else if (!strcmp(opt_overlap_check, "constant")) {
1119
        overlap_check_template = QCOW2_OL_CONSTANT;
1120
    } else if (!strcmp(opt_overlap_check, "cached")) {
1121
        overlap_check_template = QCOW2_OL_CACHED;
1122
    } else if (!strcmp(opt_overlap_check, "all")) {
1123
        overlap_check_template = QCOW2_OL_ALL;
1124
    } else {
1125
        error_setg(errp, "Unsupported value '%s' for qcow2 option "
1126
                   "'overlap-check'. Allowed are any of the following: "
1127
                   "none, constant, cached, all", opt_overlap_check);
1128
        ret = -EINVAL;
1129
        goto fail;
1130
    }
1131

1132
    r->overlap_check = 0;
1133
    for (i = 0; i < QCOW2_OL_MAX_BITNR; i++) {
1134
        /* overlap-check defines a template bitmask, but every flag may be
1135
         * overwritten through the associated boolean option */
1136
        r->overlap_check |=
1137
            qemu_opt_get_bool(opts, overlap_bool_option_names[i],
1138
                              overlap_check_template & (1 << i)) << i;
1139
    }
1140

1141
    r->discard_passthrough[QCOW2_DISCARD_NEVER] = false;
1142
    r->discard_passthrough[QCOW2_DISCARD_ALWAYS] = true;
1143
    r->discard_passthrough[QCOW2_DISCARD_REQUEST] =
1144
        qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_REQUEST,
1145
                          flags & BDRV_O_UNMAP);
1146
    r->discard_passthrough[QCOW2_DISCARD_SNAPSHOT] =
1147
        qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_SNAPSHOT, true);
1148
    r->discard_passthrough[QCOW2_DISCARD_OTHER] =
1149
        qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_OTHER, false);
1150

1151
    r->discard_no_unref = qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_NO_UNREF,
1152
                                            false);
1153
    if (r->discard_no_unref && s->qcow_version < 3) {
1154
        error_setg(errp,
1155
                   "discard-no-unref is only supported since qcow2 version 3");
1156
        ret = -EINVAL;
1157
        goto fail;
1158
    }
1159

1160
    switch (s->crypt_method_header) {
1161
    case QCOW_CRYPT_NONE:
1162
        if (encryptfmt) {
1163
            error_setg(errp, "No encryption in image header, but options "
1164
                       "specified format '%s'", encryptfmt);
1165
            ret = -EINVAL;
1166
            goto fail;
1167
        }
1168
        break;
1169

1170
    case QCOW_CRYPT_AES:
1171
        if (encryptfmt && !g_str_equal(encryptfmt, "aes")) {
1172
            error_setg(errp,
1173
                       "Header reported 'aes' encryption format but "
1174
                       "options specify '%s'", encryptfmt);
1175
            ret = -EINVAL;
1176
            goto fail;
1177
        }
1178
        qdict_put_str(encryptopts, "format", "qcow");
1179
        r->crypto_opts = block_crypto_open_opts_init(encryptopts, errp);
1180
        if (!r->crypto_opts) {
1181
            ret = -EINVAL;
1182
            goto fail;
1183
        }
1184
        break;
1185

1186
    case QCOW_CRYPT_LUKS:
1187
        if (encryptfmt && !g_str_equal(encryptfmt, "luks")) {
1188
            error_setg(errp,
1189
                       "Header reported 'luks' encryption format but "
1190
                       "options specify '%s'", encryptfmt);
1191
            ret = -EINVAL;
1192
            goto fail;
1193
        }
1194
        qdict_put_str(encryptopts, "format", "luks");
1195
        r->crypto_opts = block_crypto_open_opts_init(encryptopts, errp);
1196
        if (!r->crypto_opts) {
1197
            ret = -EINVAL;
1198
            goto fail;
1199
        }
1200
        break;
1201

1202
    default:
1203
        error_setg(errp, "Unsupported encryption method %d",
1204
                   s->crypt_method_header);
1205
        ret = -EINVAL;
1206
        goto fail;
1207
    }
1208

1209
    ret = 0;
1210
fail:
1211
    qobject_unref(encryptopts);
1212
    qemu_opts_del(opts);
1213
    opts = NULL;
1214
    return ret;
1215
}
1216

1217
static void qcow2_update_options_commit(BlockDriverState *bs,
1218
                                        Qcow2ReopenState *r)
1219
{
1220
    BDRVQcow2State *s = bs->opaque;
1221
    int i;
1222

1223
    if (s->l2_table_cache) {
1224
        qcow2_cache_destroy(s->l2_table_cache);
1225
    }
1226
    if (s->refcount_block_cache) {
1227
        qcow2_cache_destroy(s->refcount_block_cache);
1228
    }
1229
    s->l2_table_cache = r->l2_table_cache;
1230
    s->refcount_block_cache = r->refcount_block_cache;
1231
    s->l2_slice_size = r->l2_slice_size;
1232

1233
    s->overlap_check = r->overlap_check;
1234
    s->use_lazy_refcounts = r->use_lazy_refcounts;
1235

1236
    for (i = 0; i < QCOW2_DISCARD_MAX; i++) {
1237
        s->discard_passthrough[i] = r->discard_passthrough[i];
1238
    }
1239

1240
    s->discard_no_unref = r->discard_no_unref;
1241

1242
    if (s->cache_clean_interval != r->cache_clean_interval) {
1243
        cache_clean_timer_del(bs);
1244
        s->cache_clean_interval = r->cache_clean_interval;
1245
        cache_clean_timer_init(bs, bdrv_get_aio_context(bs));
1246
    }
1247

1248
    qapi_free_QCryptoBlockOpenOptions(s->crypto_opts);
1249
    s->crypto_opts = r->crypto_opts;
1250
}
1251

1252
static void qcow2_update_options_abort(BlockDriverState *bs,
1253
                                       Qcow2ReopenState *r)
1254
{
1255
    if (r->l2_table_cache) {
1256
        qcow2_cache_destroy(r->l2_table_cache);
1257
    }
1258
    if (r->refcount_block_cache) {
1259
        qcow2_cache_destroy(r->refcount_block_cache);
1260
    }
1261
    qapi_free_QCryptoBlockOpenOptions(r->crypto_opts);
1262
}
1263

1264
static int coroutine_fn GRAPH_RDLOCK
1265
qcow2_update_options(BlockDriverState *bs, QDict *options, int flags,
1266
                     Error **errp)
1267
{
1268
    Qcow2ReopenState r = {};
1269
    int ret;
1270

1271
    ret = qcow2_update_options_prepare(bs, &r, options, flags, errp);
1272
    if (ret >= 0) {
1273
        qcow2_update_options_commit(bs, &r);
1274
    } else {
1275
        qcow2_update_options_abort(bs, &r);
1276
    }
1277

1278
    return ret;
1279
}
1280

1281
/*
 * Check that the compression type stored in @s is supported by this build
 * and consistent with the compression-type incompatible feature bit.
 *
 * Returns 0 if everything is fine, -ENOTSUP for an unknown compression
 * type, and -EINVAL if the feature bit does not match the compression
 * type.  @errp is set on failure.
 */
static int validate_compression_type(BDRVQcow2State *s, Error **errp)
{
    bool is_zlib = s->compression_type == QCOW2_COMPRESSION_TYPE_ZLIB;
    bool feature_bit_set =
        (s->incompatible_features & QCOW2_INCOMPAT_COMPRESSION) != 0;
    bool supported = is_zlib;

#ifdef CONFIG_ZSTD
    supported = supported ||
                s->compression_type == QCOW2_COMPRESSION_TYPE_ZSTD;
#endif

    if (!supported) {
        error_setg(errp, "qcow2: unknown compression type: %u",
                   s->compression_type);
        return -ENOTSUP;
    }

    /*
     * The incompatible feature bit has to be set exactly when the
     * compression type is something other than zlib.
     */
    if (is_zlib && feature_bit_set) {
        error_setg(errp, "qcow2: Compression type incompatible feature "
                         "bit must not be set");
        return -EINVAL;
    }
    if (!is_zlib && !feature_bit_set) {
        error_setg(errp, "qcow2: Compression type incompatible feature "
                         "bit must be set");
        return -EINVAL;
    }

    return 0;
}
1316

1317
/* Called with s->lock held.  */
1318
static int coroutine_fn GRAPH_RDLOCK
1319
qcow2_do_open(BlockDriverState *bs, QDict *options, int flags,
1320
              bool open_data_file, Error **errp)
1321
{
1322
    ERRP_GUARD();
1323
    BDRVQcow2State *s = bs->opaque;
1324
    unsigned int len, i;
1325
    int ret = 0;
1326
    QCowHeader header;
1327
    uint64_t ext_end;
1328
    uint64_t l1_vm_state_index;
1329
    bool update_header = false;
1330

1331
    ret = bdrv_co_pread(bs->file, 0, sizeof(header), &header, 0);
1332
    if (ret < 0) {
1333
        error_setg_errno(errp, -ret, "Could not read qcow2 header");
1334
        goto fail;
1335
    }
1336
    header.magic = be32_to_cpu(header.magic);
1337
    header.version = be32_to_cpu(header.version);
1338
    header.backing_file_offset = be64_to_cpu(header.backing_file_offset);
1339
    header.backing_file_size = be32_to_cpu(header.backing_file_size);
1340
    header.size = be64_to_cpu(header.size);
1341
    header.cluster_bits = be32_to_cpu(header.cluster_bits);
1342
    header.crypt_method = be32_to_cpu(header.crypt_method);
1343
    header.l1_table_offset = be64_to_cpu(header.l1_table_offset);
1344
    header.l1_size = be32_to_cpu(header.l1_size);
1345
    header.refcount_table_offset = be64_to_cpu(header.refcount_table_offset);
1346
    header.refcount_table_clusters =
1347
        be32_to_cpu(header.refcount_table_clusters);
1348
    header.snapshots_offset = be64_to_cpu(header.snapshots_offset);
1349
    header.nb_snapshots = be32_to_cpu(header.nb_snapshots);
1350

1351
    if (header.magic != QCOW_MAGIC) {
1352
        error_setg(errp, "Image is not in qcow2 format");
1353
        ret = -EINVAL;
1354
        goto fail;
1355
    }
1356
    if (header.version < 2 || header.version > 3) {
1357
        error_setg(errp, "Unsupported qcow2 version %" PRIu32, header.version);
1358
        ret = -ENOTSUP;
1359
        goto fail;
1360
    }
1361

1362
    s->qcow_version = header.version;
1363

1364
    /* Initialise cluster size */
1365
    if (header.cluster_bits < MIN_CLUSTER_BITS ||
1366
        header.cluster_bits > MAX_CLUSTER_BITS) {
1367
        error_setg(errp, "Unsupported cluster size: 2^%" PRIu32,
1368
                   header.cluster_bits);
1369
        ret = -EINVAL;
1370
        goto fail;
1371
    }
1372

1373
    s->cluster_bits = header.cluster_bits;
1374
    s->cluster_size = 1 << s->cluster_bits;
1375

1376
    /* Initialise version 3 header fields */
1377
    if (header.version == 2) {
1378
        header.incompatible_features    = 0;
1379
        header.compatible_features      = 0;
1380
        header.autoclear_features       = 0;
1381
        header.refcount_order           = 4;
1382
        header.header_length            = 72;
1383
    } else {
1384
        header.incompatible_features =
1385
            be64_to_cpu(header.incompatible_features);
1386
        header.compatible_features = be64_to_cpu(header.compatible_features);
1387
        header.autoclear_features = be64_to_cpu(header.autoclear_features);
1388
        header.refcount_order = be32_to_cpu(header.refcount_order);
1389
        header.header_length = be32_to_cpu(header.header_length);
1390

1391
        if (header.header_length < 104) {
1392
            error_setg(errp, "qcow2 header too short");
1393
            ret = -EINVAL;
1394
            goto fail;
1395
        }
1396
    }
1397

1398
    if (header.header_length > s->cluster_size) {
1399
        error_setg(errp, "qcow2 header exceeds cluster size");
1400
        ret = -EINVAL;
1401
        goto fail;
1402
    }
1403

1404
    if (header.header_length > sizeof(header)) {
1405
        s->unknown_header_fields_size = header.header_length - sizeof(header);
1406
        s->unknown_header_fields = g_malloc(s->unknown_header_fields_size);
1407
        ret = bdrv_co_pread(bs->file, sizeof(header),
1408
                            s->unknown_header_fields_size,
1409
                            s->unknown_header_fields, 0);
1410
        if (ret < 0) {
1411
            error_setg_errno(errp, -ret, "Could not read unknown qcow2 header "
1412
                             "fields");
1413
            goto fail;
1414
        }
1415
    }
1416

1417
    if (header.backing_file_offset > s->cluster_size) {
1418
        error_setg(errp, "Invalid backing file offset");
1419
        ret = -EINVAL;
1420
        goto fail;
1421
    }
1422

1423
    if (header.backing_file_offset) {
1424
        ext_end = header.backing_file_offset;
1425
    } else {
1426
        ext_end = 1 << header.cluster_bits;
1427
    }
1428

1429
    /* Handle feature bits */
1430
    s->incompatible_features    = header.incompatible_features;
1431
    s->compatible_features      = header.compatible_features;
1432
    s->autoclear_features       = header.autoclear_features;
1433

1434
    /*
1435
     * Handle compression type
1436
     * Older qcow2 images don't contain the compression type header.
1437
     * Distinguish them by the header length and use
1438
     * the only valid (default) compression type in that case
1439
     */
1440
    if (header.header_length > offsetof(QCowHeader, compression_type)) {
1441
        s->compression_type = header.compression_type;
1442
    } else {
1443
        s->compression_type = QCOW2_COMPRESSION_TYPE_ZLIB;
1444
    }
1445

1446
    ret = validate_compression_type(s, errp);
1447
    if (ret) {
1448
        goto fail;
1449
    }
1450

1451
    if (s->incompatible_features & ~QCOW2_INCOMPAT_MASK) {
1452
        void *feature_table = NULL;
1453
        qcow2_read_extensions(bs, header.header_length, ext_end,
1454
                              &feature_table, flags, NULL, NULL);
1455
        report_unsupported_feature(errp, feature_table,
1456
                                   s->incompatible_features &
1457
                                   ~QCOW2_INCOMPAT_MASK);
1458
        ret = -ENOTSUP;
1459
        g_free(feature_table);
1460
        goto fail;
1461
    }
1462

1463
    if (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT) {
1464
        /* Corrupt images may not be written to unless they are being repaired
1465
         */
1466
        if ((flags & BDRV_O_RDWR) && !(flags & BDRV_O_CHECK)) {
1467
            error_setg(errp, "qcow2: Image is corrupt; cannot be opened "
1468
                       "read/write");
1469
            ret = -EACCES;
1470
            goto fail;
1471
        }
1472
    }
1473

1474
    s->subclusters_per_cluster =
1475
        has_subclusters(s) ? QCOW_EXTL2_SUBCLUSTERS_PER_CLUSTER : 1;
1476
    s->subcluster_size = s->cluster_size / s->subclusters_per_cluster;
1477
    s->subcluster_bits = ctz32(s->subcluster_size);
1478

1479
    if (s->subcluster_size < (1 << MIN_CLUSTER_BITS)) {
1480
        error_setg(errp, "Unsupported subcluster size: %d", s->subcluster_size);
1481
        ret = -EINVAL;
1482
        goto fail;
1483
    }
1484

1485
    /* Check support for various header values */
1486
    if (header.refcount_order > 6) {
1487
        error_setg(errp, "Reference count entry width too large; may not "
1488
                   "exceed 64 bits");
1489
        ret = -EINVAL;
1490
        goto fail;
1491
    }
1492
    s->refcount_order = header.refcount_order;
1493
    s->refcount_bits = 1 << s->refcount_order;
1494
    s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1);
1495
    s->refcount_max += s->refcount_max - 1;
1496

1497
    s->crypt_method_header = header.crypt_method;
1498
    if (s->crypt_method_header) {
1499
        if (bdrv_uses_whitelist() &&
1500
            s->crypt_method_header == QCOW_CRYPT_AES) {
1501
            error_setg(errp,
1502
                       "Use of AES-CBC encrypted qcow2 images is no longer "
1503
                       "supported in system emulators");
1504
            error_append_hint(errp,
1505
                              "You can use 'qemu-img convert' to convert your "
1506
                              "image to an alternative supported format, such "
1507
                              "as unencrypted qcow2, or raw with the LUKS "
1508
                              "format instead.\n");
1509
            ret = -ENOSYS;
1510
            goto fail;
1511
        }
1512

1513
        if (s->crypt_method_header == QCOW_CRYPT_AES) {
1514
            s->crypt_physical_offset = false;
1515
        } else {
1516
            /* Assuming LUKS and any future crypt methods we
1517
             * add will all use physical offsets, due to the
1518
             * fact that the alternative is insecure...  */
1519
            s->crypt_physical_offset = true;
1520
        }
1521

1522
        bs->encrypted = true;
1523
    }
1524

1525
    s->l2_bits = s->cluster_bits - ctz32(l2_entry_size(s));
1526
    s->l2_size = 1 << s->l2_bits;
1527
    /* 2^(s->refcount_order - 3) is the refcount width in bytes */
1528
    s->refcount_block_bits = s->cluster_bits - (s->refcount_order - 3);
1529
    s->refcount_block_size = 1 << s->refcount_block_bits;
1530
    bs->total_sectors = header.size / BDRV_SECTOR_SIZE;
1531
    s->csize_shift = (62 - (s->cluster_bits - 8));
1532
    s->csize_mask = (1 << (s->cluster_bits - 8)) - 1;
1533
    s->cluster_offset_mask = (1LL << s->csize_shift) - 1;
1534

1535
    s->refcount_table_offset = header.refcount_table_offset;
1536
    s->refcount_table_size =
1537
        header.refcount_table_clusters << (s->cluster_bits - 3);
1538

1539
    if (header.refcount_table_clusters == 0 && !(flags & BDRV_O_CHECK)) {
1540
        error_setg(errp, "Image does not contain a reference count table");
1541
        ret = -EINVAL;
1542
        goto fail;
1543
    }
1544

1545
    ret = qcow2_validate_table(bs, s->refcount_table_offset,
1546
                               header.refcount_table_clusters,
1547
                               s->cluster_size, QCOW_MAX_REFTABLE_SIZE,
1548
                               "Reference count table", errp);
1549
    if (ret < 0) {
1550
        goto fail;
1551
    }
1552

1553
    if (!(flags & BDRV_O_CHECK)) {
1554
        /*
1555
         * The total size in bytes of the snapshot table is checked in
1556
         * qcow2_read_snapshots() because the size of each snapshot is
1557
         * variable and we don't know it yet.
1558
         * Here we only check the offset and number of snapshots.
1559
         */
1560
        ret = qcow2_validate_table(bs, header.snapshots_offset,
1561
                                   header.nb_snapshots,
1562
                                   sizeof(QCowSnapshotHeader),
1563
                                   sizeof(QCowSnapshotHeader) *
1564
                                       QCOW_MAX_SNAPSHOTS,
1565
                                   "Snapshot table", errp);
1566
        if (ret < 0) {
1567
            goto fail;
1568
        }
1569
    }
1570

1571
    /* read the level 1 table */
1572
    ret = qcow2_validate_table(bs, header.l1_table_offset,
1573
                               header.l1_size, L1E_SIZE,
1574
                               QCOW_MAX_L1_SIZE, "Active L1 table", errp);
1575
    if (ret < 0) {
1576
        goto fail;
1577
    }
1578
    s->l1_size = header.l1_size;
1579
    s->l1_table_offset = header.l1_table_offset;
1580

1581
    l1_vm_state_index = size_to_l1(s, header.size);
1582
    if (l1_vm_state_index > INT_MAX) {
1583
        error_setg(errp, "Image is too big");
1584
        ret = -EFBIG;
1585
        goto fail;
1586
    }
1587
    s->l1_vm_state_index = l1_vm_state_index;
1588

1589
    /* the L1 table must contain at least enough entries to put
1590
       header.size bytes */
1591
    if (s->l1_size < s->l1_vm_state_index) {
1592
        error_setg(errp, "L1 table is too small");
1593
        ret = -EINVAL;
1594
        goto fail;
1595
    }
1596

1597
    if (s->l1_size > 0) {
1598
        s->l1_table = qemu_try_blockalign(bs->file->bs, s->l1_size * L1E_SIZE);
1599
        if (s->l1_table == NULL) {
1600
            error_setg(errp, "Could not allocate L1 table");
1601
            ret = -ENOMEM;
1602
            goto fail;
1603
        }
1604
        ret = bdrv_co_pread(bs->file, s->l1_table_offset, s->l1_size * L1E_SIZE,
1605
                            s->l1_table, 0);
1606
        if (ret < 0) {
1607
            error_setg_errno(errp, -ret, "Could not read L1 table");
1608
            goto fail;
1609
        }
1610
        for(i = 0;i < s->l1_size; i++) {
1611
            s->l1_table[i] = be64_to_cpu(s->l1_table[i]);
1612
        }
1613
    }
1614

1615
    /* Parse driver-specific options */
1616
    ret = qcow2_update_options(bs, options, flags, errp);
1617
    if (ret < 0) {
1618
        goto fail;
1619
    }
1620

1621
    s->flags = flags;
1622

1623
    ret = qcow2_refcount_init(bs);
1624
    if (ret != 0) {
1625
        error_setg_errno(errp, -ret, "Could not initialize refcount handling");
1626
        goto fail;
1627
    }
1628

1629
    QLIST_INIT(&s->cluster_allocs);
1630
    QTAILQ_INIT(&s->discards);
1631

1632
    /* read qcow2 extensions */
1633
    if (qcow2_read_extensions(bs, header.header_length, ext_end, NULL,
1634
                              flags, &update_header, errp)) {
1635
        ret = -EINVAL;
1636
        goto fail;
1637
    }
1638

1639
    if (open_data_file && (flags & BDRV_O_NO_IO)) {
1640
        /*
1641
         * Don't open the data file for 'qemu-img info' so that it can be used
1642
         * to verify that an untrusted qcow2 image doesn't refer to external
1643
         * files.
1644
         *
1645
         * Note: This still makes has_data_file() return true.
1646
         */
1647
        if (s->incompatible_features & QCOW2_INCOMPAT_DATA_FILE) {
1648
            s->data_file = NULL;
1649
        } else {
1650
            s->data_file = bs->file;
1651
        }
1652
        qdict_extract_subqdict(options, NULL, "data-file.");
1653
        qdict_del(options, "data-file");
1654
    } else if (open_data_file) {
1655
        /* Open external data file */
1656
        bdrv_graph_co_rdunlock();
1657
        s->data_file = bdrv_co_open_child(NULL, options, "data-file", bs,
1658
                                          &child_of_bds, BDRV_CHILD_DATA,
1659
                                          true, errp);
1660
        bdrv_graph_co_rdlock();
1661
        if (*errp) {
1662
            ret = -EINVAL;
1663
            goto fail;
1664
        }
1665

1666
        if (s->incompatible_features & QCOW2_INCOMPAT_DATA_FILE) {
1667
            if (!s->data_file && s->image_data_file) {
1668
                bdrv_graph_co_rdunlock();
1669
                s->data_file = bdrv_co_open_child(s->image_data_file, options,
1670
                                                  "data-file", bs,
1671
                                                  &child_of_bds,
1672
                                                  BDRV_CHILD_DATA, false, errp);
1673
                bdrv_graph_co_rdlock();
1674
                if (!s->data_file) {
1675
                    ret = -EINVAL;
1676
                    goto fail;
1677
                }
1678
            }
1679
            if (!s->data_file) {
1680
                error_setg(errp, "'data-file' is required for this image");
1681
                ret = -EINVAL;
1682
                goto fail;
1683
            }
1684

1685
            /* No data here */
1686
            bs->file->role &= ~BDRV_CHILD_DATA;
1687

1688
            /* Must succeed because we have given up permissions if anything */
1689
            bdrv_child_refresh_perms(bs, bs->file, &error_abort);
1690
        } else {
1691
            if (s->data_file) {
1692
                error_setg(errp, "'data-file' can only be set for images with "
1693
                                 "an external data file");
1694
                ret = -EINVAL;
1695
                goto fail;
1696
            }
1697

1698
            s->data_file = bs->file;
1699

1700
            if (data_file_is_raw(bs)) {
1701
                error_setg(errp, "data-file-raw requires a data file");
1702
                ret = -EINVAL;
1703
                goto fail;
1704
            }
1705
        }
1706
    }
1707

1708
    /* qcow2_read_extension may have set up the crypto context
1709
     * if the crypt method needs a header region, some methods
1710
     * don't need header extensions, so must check here
1711
     */
1712
    if (s->crypt_method_header && !s->crypto) {
1713
        if (s->crypt_method_header == QCOW_CRYPT_AES) {
1714
            unsigned int cflags = 0;
1715
            if (flags & BDRV_O_NO_IO) {
1716
                cflags |= QCRYPTO_BLOCK_OPEN_NO_IO;
1717
            }
1718
            s->crypto = qcrypto_block_open(s->crypto_opts, "encrypt.",
1719
                                           NULL, NULL, cflags, errp);
1720
            if (!s->crypto) {
1721
                ret = -EINVAL;
1722
                goto fail;
1723
            }
1724
        } else if (!(flags & BDRV_O_NO_IO)) {
1725
            error_setg(errp, "Missing CRYPTO header for crypt method %d",
1726
                       s->crypt_method_header);
1727
            ret = -EINVAL;
1728
            goto fail;
1729
        }
1730
    }
1731

1732
    /* read the backing file name */
1733
    if (header.backing_file_offset != 0) {
1734
        len = header.backing_file_size;
1735
        if (len > MIN(1023, s->cluster_size - header.backing_file_offset) ||
1736
            len >= sizeof(bs->backing_file)) {
1737
            error_setg(errp, "Backing file name too long");
1738
            ret = -EINVAL;
1739
            goto fail;
1740
        }
1741

1742
        s->image_backing_file = g_malloc(len + 1);
1743
        ret = bdrv_co_pread(bs->file, header.backing_file_offset, len,
1744
                            s->image_backing_file, 0);
1745
        if (ret < 0) {
1746
            error_setg_errno(errp, -ret, "Could not read backing file name");
1747
            goto fail;
1748
        }
1749
        s->image_backing_file[len] = '\0';
1750

1751
        /*
1752
         * Update only when something has changed.  This function is called by
1753
         * qcow2_co_invalidate_cache(), and we do not want to reset
1754
         * auto_backing_file unless necessary.
1755
         */
1756
        if (!g_str_equal(s->image_backing_file, bs->backing_file)) {
1757
            pstrcpy(bs->backing_file, sizeof(bs->backing_file),
1758
                    s->image_backing_file);
1759
            pstrcpy(bs->auto_backing_file, sizeof(bs->auto_backing_file),
1760
                    s->image_backing_file);
1761
        }
1762
    }
1763

1764
    /*
1765
     * Internal snapshots; skip reading them in check mode, because
1766
     * we do not need them then, and we do not want to abort because
1767
     * of a broken table.
1768
     */
1769
    if (!(flags & BDRV_O_CHECK)) {
1770
        s->snapshots_offset = header.snapshots_offset;
1771
        s->nb_snapshots = header.nb_snapshots;
1772

1773
        ret = qcow2_read_snapshots(bs, errp);
1774
        if (ret < 0) {
1775
            goto fail;
1776
        }
1777
    }
1778

1779
    /* Clear unknown autoclear feature bits */
1780
    update_header |= s->autoclear_features & ~QCOW2_AUTOCLEAR_MASK;
1781
    update_header = update_header && bdrv_is_writable(bs);
1782
    if (update_header) {
1783
        s->autoclear_features &= QCOW2_AUTOCLEAR_MASK;
1784
    }
1785

1786
    /* == Handle persistent dirty bitmaps ==
1787
     *
1788
     * We want to load dirty bitmaps in three cases:
1789
     *
1790
     * 1. Normal open of the disk in active mode, not related to invalidation
1791
     *    after migration.
1792
     *
1793
     * 2. Invalidation of the target vm after pre-copy phase of migration, if
1794
     *    bitmaps are _not_ migrating through migration channel, i.e.
1795
     *    'dirty-bitmaps' capability is disabled.
1796
     *
1797
     * 3. Invalidation of source vm after failed or canceled migration.
1798
     *    This is a very interesting case. There are two possible types of
1799
     *    bitmaps:
1800
     *
1801
     *    A. Stored on inactivation and removed. They should be loaded from the
1802
     *       image.
1803
     *
1804
     *    B. Not stored: not-persistent bitmaps and bitmaps, migrated through
1805
     *       the migration channel (with dirty-bitmaps capability).
1806
     *
1807
     *    On the other hand, there are two possible sub-cases:
1808
     *
1809
     *    3.1 disk was changed by somebody else while we were inactive. In this
1810
     *        case all in-RAM dirty bitmaps (both persistent and not) are
1811
     *        definitely invalid. And we don't have any method to determine
1812
     *        this.
1813
     *
1814
     *        Simple and safe thing is to just drop all the bitmaps of type B on
1815
     *        inactivation. But in this case we lose bitmaps in valid 3.2 case.
1816
     *
1817
     *        On the other hand, resuming source vm, if disk was already changed
1818
     *        is a bad thing anyway: not only bitmaps, the whole vm state is
1819
     *        out of sync with disk.
1820
     *
1821
     *        This means, that user or management tool, who for some reason
1822
     *        decided to resume source vm, after disk was already changed by
1823
     *        target vm, should at least drop all dirty bitmaps by hand.
1824
     *
1825
     *        So, we can ignore this case for now, but TODO: "generation"
1826
     *        extension for qcow2, to determine, that image was changed after
1827
     *        last inactivation. And if it is changed, we will drop (or at least
1828
     *        mark as 'invalid' all the bitmaps of type B, both persistent
1829
     *        and not).
1830
     *
1831
     *    3.2 disk was _not_ changed while were inactive. Bitmaps may be saved
1832
     *        to disk ('dirty-bitmaps' capability disabled), or not saved
1833
     *        ('dirty-bitmaps' capability enabled), but we don't need to care
1834
     *        of: let's load bitmaps as always: stored bitmaps will be loaded,
1835
     *        and not stored has flag IN_USE=1 in the image and will be skipped
1836
     *        on loading.
1837
     *
1838
     * One remaining possible case when we don't want to load bitmaps:
1839
     *
1840
     * 4. Open disk in inactive mode in target vm (bitmaps are migrating or
1841
     *    will be loaded on invalidation, no need to try loading them before)
1842
     */
1843

1844
    if (!(bdrv_get_flags(bs) & BDRV_O_INACTIVE)) {
1845
        /* It's case 1, 2 or 3.2. Or 3.1 which is BUG in management layer. */
1846
        bool header_updated;
1847
        if (!qcow2_load_dirty_bitmaps(bs, &header_updated, errp)) {
1848
            ret = -EINVAL;
1849
            goto fail;
1850
        }
1851

1852
        update_header = update_header && !header_updated;
1853
    }
1854

1855
    if (update_header) {
1856
        ret = qcow2_update_header(bs);
1857
        if (ret < 0) {
1858
            error_setg_errno(errp, -ret, "Could not update qcow2 header");
1859
            goto fail;
1860
        }
1861
    }
1862

1863
    bs->supported_zero_flags = header.version >= 3 ?
1864
                               BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK : 0;
1865
    bs->supported_truncate_flags = BDRV_REQ_ZERO_WRITE;
1866

1867
    /* Repair image if dirty */
1868
    if (!(flags & BDRV_O_CHECK) && bdrv_is_writable(bs) &&
1869
        (s->incompatible_features & QCOW2_INCOMPAT_DIRTY)) {
1870
        BdrvCheckResult result = {0};
1871

1872
        ret = qcow2_co_check_locked(bs, &result,
1873
                                    BDRV_FIX_ERRORS | BDRV_FIX_LEAKS);
1874
        if (ret < 0 || result.check_errors) {
1875
            if (ret >= 0) {
1876
                ret = -EIO;
1877
            }
1878
            error_setg_errno(errp, -ret, "Could not repair dirty image");
1879
            goto fail;
1880
        }
1881
    }
1882

1883
#ifdef DEBUG_ALLOC
1884
    {
1885
        BdrvCheckResult result = {0};
1886
        qcow2_check_refcounts(bs, &result, 0);
1887
    }
1888
#endif
1889

1890
    qemu_co_queue_init(&s->thread_task_queue);
1891

1892
    return ret;
1893

1894
 fail:
1895
    g_free(s->image_data_file);
1896
    if (open_data_file && has_data_file(bs)) {
1897
        bdrv_graph_co_rdunlock();
1898
        bdrv_co_unref_child(bs, s->data_file);
1899
        bdrv_graph_co_rdlock();
1900
        s->data_file = NULL;
1901
    }
1902
    g_free(s->unknown_header_fields);
1903
    cleanup_unknown_header_ext(bs);
1904
    qcow2_free_snapshots(bs);
1905
    qcow2_refcount_close(bs);
1906
    qemu_vfree(s->l1_table);
1907
    /* else pre-write overlap checks in cache_destroy may crash */
1908
    s->l1_table = NULL;
1909
    cache_clean_timer_del(bs);
1910
    if (s->l2_table_cache) {
1911
        qcow2_cache_destroy(s->l2_table_cache);
1912
    }
1913
    if (s->refcount_block_cache) {
1914
        qcow2_cache_destroy(s->refcount_block_cache);
1915
    }
1916
    qcrypto_block_free(s->crypto);
1917
    qapi_free_QCryptoBlockOpenOptions(s->crypto_opts);
1918
    return ret;
1919
}
1920

1921
typedef struct QCow2OpenCo {
1922
    BlockDriverState *bs;
1923
    QDict *options;
1924
    int flags;
1925
    Error **errp;
1926
    int ret;
1927
} QCow2OpenCo;
1928

1929
/*
 * Coroutine entry point used by qcow2_open().
 *
 * @opaque is a QCow2OpenCo. The open itself is done by qcow2_do_open() (with
 * open_data_file set to true) while holding the graph read lock and s->lock.
 * The result is stored in the QCow2OpenCo before the lock is dropped, and
 * waiters are then woken with aio_wait_kick().
 */
static void coroutine_fn qcow2_open_entry(void *opaque)
{
    QCow2OpenCo *open_co = opaque;
    BlockDriverState *bs = open_co->bs;
    BDRVQcow2State *s = bs->opaque;

    GRAPH_RDLOCK_GUARD();

    qemu_co_mutex_lock(&s->lock);
    open_co->ret = qcow2_do_open(bs, open_co->options, open_co->flags, true,
                                 open_co->errp);
    qemu_co_mutex_unlock(&s->lock);

    /* Let the AIO_WAIT_WHILE_UNLOCKED() in qcow2_open() re-check ->ret */
    aio_wait_kick();
}
1943

1944
static int qcow2_open(BlockDriverState *bs, QDict *options, int flags,
1945
                      Error **errp)
1946
{
1947
    BDRVQcow2State *s = bs->opaque;
1948
    QCow2OpenCo qoc = {
1949
        .bs = bs,
1950
        .options = options,
1951
        .flags = flags,
1952
        .errp = errp,
1953
        .ret = -EINPROGRESS
1954
    };
1955
    int ret;
1956

1957
    ret = bdrv_open_file_child(NULL, options, "file", bs, errp);
1958
    if (ret < 0) {
1959
        return ret;
1960
    }
1961

1962
    /* Initialise locks */
1963
    qemu_co_mutex_init(&s->lock);
1964

1965
    assert(!qemu_in_coroutine());
1966
    assert(qemu_get_current_aio_context() == qemu_get_aio_context());
1967

1968
    aio_co_enter(bdrv_get_aio_context(bs),
1969
                 qemu_coroutine_create(qcow2_open_entry, &qoc));
1970
    AIO_WAIT_WHILE_UNLOCKED(NULL, qoc.ret == -EINPROGRESS);
1971

1972
    return qoc.ret;
1973
}
1974

1975
/*
 * Fill in the block limits of @bs that depend on the qcow2 geometry:
 * zero writes are aligned to subclusters and discards to clusters. For
 * encrypted images the request alignment is the crypto sector size.
 *
 * @errp is not used.
 */
static void qcow2_refresh_limits(BlockDriverState *bs, Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    BlockLimits *limits = &bs->bl;

    limits->pdiscard_alignment = s->cluster_size;
    limits->pwrite_zeroes_alignment = s->subcluster_size;

    if (bs->encrypted) {
        /* Encryption works on a sector granularity */
        limits->request_alignment = qcrypto_block_get_sector_size(s->crypto);
    }
}
1986

1987
static int GRAPH_UNLOCKED
1988
qcow2_reopen_prepare(BDRVReopenState *state,BlockReopenQueue *queue,
1989
                     Error **errp)
1990
{
1991
    BDRVQcow2State *s = state->bs->opaque;
1992
    Qcow2ReopenState *r;
1993
    int ret;
1994

1995
    GLOBAL_STATE_CODE();
1996
    GRAPH_RDLOCK_GUARD_MAINLOOP();
1997

1998
    r = g_new0(Qcow2ReopenState, 1);
1999
    state->opaque = r;
2000

2001
    ret = qcow2_update_options_prepare(state->bs, r, state->options,
2002
                                       state->flags, errp);
2003
    if (ret < 0) {
2004
        goto fail;
2005
    }
2006

2007
    /* We need to write out any unwritten data if we reopen read-only. */
2008
    if ((state->flags & BDRV_O_RDWR) == 0) {
2009
        ret = qcow2_reopen_bitmaps_ro(state->bs, errp);
2010
        if (ret < 0) {
2011
            goto fail;
2012
        }
2013

2014
        ret = bdrv_flush(state->bs);
2015
        if (ret < 0) {
2016
            goto fail;
2017
        }
2018

2019
        ret = qcow2_mark_clean(state->bs);
2020
        if (ret < 0) {
2021
            goto fail;
2022
        }
2023
    }
2024

2025
    /*
2026
     * Without an external data file, s->data_file points to the same BdrvChild
2027
     * as bs->file. It needs to be resynced after reopen because bs->file may
2028
     * be changed. We can't use it in the meantime.
2029
     */
2030
    if (!has_data_file(state->bs)) {
2031
        assert(s->data_file == state->bs->file);
2032
        s->data_file = NULL;
2033
    }
2034

2035
    return 0;
2036

2037
fail:
2038
    qcow2_update_options_abort(state->bs, r);
2039
    g_free(r);
2040
    return ret;
2041
}
2042

2043
/*
 * Commit stage of a qcow2 reopen: apply the options prepared in
 * state->opaque, re-establish s->data_file if it is currently NULL, and
 * release the reopen state.
 */
static void qcow2_reopen_commit(BDRVReopenState *state)
{
    BlockDriverState *bs = state->bs;
    BDRVQcow2State *s = bs->opaque;
    Qcow2ReopenState *r = state->opaque;

    GRAPH_RDLOCK_GUARD_MAINLOOP();

    qcow2_update_options_commit(bs, r);

    /*
     * If we don't have an external data file, s->data_file was cleared by
     * qcow2_reopen_prepare() and needs to be updated.
     */
    if (s->data_file == NULL) {
        s->data_file = bs->file;
    }

    g_free(r);
}
2059

2060
/*
 * Post-commit stage of a qcow2 reopen: when the node was reopened with
 * BDRV_O_RDWR, try to make the dirty bitmaps writable again. A failure is
 * only reported, it does not fail the reopen.
 */
static void qcow2_reopen_commit_post(BDRVReopenState *state)
{
    Error *local_err = NULL;

    GRAPH_RDLOCK_GUARD_MAINLOOP();

    if (!(state->flags & BDRV_O_RDWR)) {
        return;
    }

    if (qcow2_reopen_bitmaps_rw(state->bs, &local_err) >= 0) {
        return;
    }

    /*
     * This is not fatal, bitmaps just left read-only, so all following
     * writes will fail. User can remove read-only bitmaps to unblock
     * writes or retry reopen.
     */
    error_reportf_err(local_err,
                      "%s: Failed to make dirty bitmaps writable: ",
                      bdrv_get_node_name(state->bs));
}
2079

2080
/*
 * Abort stage of a qcow2 reopen: restore s->data_file if it is currently
 * NULL, roll back the prepared options and release the reopen state.
 */
static void qcow2_reopen_abort(BDRVReopenState *state)
{
    BlockDriverState *bs = state->bs;
    BDRVQcow2State *s = bs->opaque;
    Qcow2ReopenState *r = state->opaque;

    GRAPH_RDLOCK_GUARD_MAINLOOP();

    /*
     * If we don't have an external data file, s->data_file was cleared by
     * qcow2_reopen_prepare() and needs to be restored.
     */
    if (s->data_file == NULL) {
        s->data_file = bs->file;
    }

    qcow2_update_options_abort(bs, r);
    g_free(r);
}
2096

2097
/*
 * Merge @old_options into @options for a reopen that keeps old options.
 *
 * Keys already present in @options win. Before merging, old keys that a new
 * option is meant to override as a group are removed from @old_options:
 *  - a new overlap-check option or template drops every old overlap option;
 *  - a new total cache size drops the old L2 and refcount cache sizes.
 *
 * After merging, if the total, L2 and refcount cache sizes are all present
 * and the total did not come from the new options, the (old) total is
 * removed. If all three were given as new options they are kept, so that the
 * resulting error is reported.
 */
static void qcow2_join_options(QDict *options, QDict *old_options)
{
    bool has_new_overlap_template =
        qdict_haskey(options, QCOW2_OPT_OVERLAP) ||
        qdict_haskey(options, QCOW2_OPT_OVERLAP_TEMPLATE);
    bool has_new_total_cache_size =
        qdict_haskey(options, QCOW2_OPT_CACHE_SIZE);
    bool has_all_cache_options;

    /* New overlap template overrides all old overlap options */
    if (has_new_overlap_template) {
        qdict_del(old_options, QCOW2_OPT_OVERLAP);
        qdict_del(old_options, QCOW2_OPT_OVERLAP_TEMPLATE);
        qdict_del(old_options, QCOW2_OPT_OVERLAP_MAIN_HEADER);
        qdict_del(old_options, QCOW2_OPT_OVERLAP_ACTIVE_L1);
        qdict_del(old_options, QCOW2_OPT_OVERLAP_ACTIVE_L2);
        qdict_del(old_options, QCOW2_OPT_OVERLAP_REFCOUNT_TABLE);
        qdict_del(old_options, QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK);
        qdict_del(old_options, QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE);
        qdict_del(old_options, QCOW2_OPT_OVERLAP_INACTIVE_L1);
        qdict_del(old_options, QCOW2_OPT_OVERLAP_INACTIVE_L2);
    }

    /* New total cache size overrides all old options */
    if (has_new_total_cache_size) {
        qdict_del(old_options, QCOW2_OPT_L2_CACHE_SIZE);
        qdict_del(old_options, QCOW2_OPT_REFCOUNT_CACHE_SIZE);
    }

    qdict_join(options, old_options, false);

    /*
     * If after merging all cache size options are set, an old total size is
     * overwritten. Do keep all options, however, if all three are new. The
     * resulting error message is what we want to happen.
     *
     * All three keys must be present (&&): with || an inherited total cache
     * size would be dropped as soon as any single cache option exists.
     */
    has_all_cache_options =
        qdict_haskey(options, QCOW2_OPT_CACHE_SIZE) &&
        qdict_haskey(options, QCOW2_OPT_L2_CACHE_SIZE) &&
        qdict_haskey(options, QCOW2_OPT_REFCOUNT_CACHE_SIZE);

    if (has_all_cache_options && !has_new_total_cache_size) {
        qdict_del(options, QCOW2_OPT_CACHE_SIZE);
    }
}
2142

2143
static int coroutine_fn GRAPH_RDLOCK
2144
qcow2_co_block_status(BlockDriverState *bs, bool want_zero, int64_t offset,
2145
                      int64_t count, int64_t *pnum, int64_t *map,
2146
                      BlockDriverState **file)
2147
{
2148
    BDRVQcow2State *s = bs->opaque;
2149
    uint64_t host_offset;
2150
    unsigned int bytes;
2151
    QCow2SubclusterType type;
2152
    int ret, status = 0;
2153

2154
    qemu_co_mutex_lock(&s->lock);
2155

2156
    if (!s->metadata_preallocation_checked) {
2157
        ret = qcow2_detect_metadata_preallocation(bs);
2158
        s->metadata_preallocation = (ret == 1);
2159
        s->metadata_preallocation_checked = true;
2160
    }
2161

2162
    bytes = MIN(INT_MAX, count);
2163
    ret = qcow2_get_host_offset(bs, offset, &bytes, &host_offset, &type);
2164
    qemu_co_mutex_unlock(&s->lock);
2165
    if (ret < 0) {
2166
        return ret;
2167
    }
2168

2169
    *pnum = bytes;
2170

2171
    if ((type == QCOW2_SUBCLUSTER_NORMAL ||
2172
         type == QCOW2_SUBCLUSTER_ZERO_ALLOC ||
2173
         type == QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC) && !s->crypto) {
2174
        *map = host_offset;
2175
        *file = s->data_file->bs;
2176
        status |= BDRV_BLOCK_OFFSET_VALID;
2177
    }
2178
    if (type == QCOW2_SUBCLUSTER_ZERO_PLAIN ||
2179
        type == QCOW2_SUBCLUSTER_ZERO_ALLOC) {
2180
        status |= BDRV_BLOCK_ZERO;
2181
    } else if (type != QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN &&
2182
               type != QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC) {
2183
        status |= BDRV_BLOCK_DATA;
2184
    }
2185
    if (s->metadata_preallocation && (status & BDRV_BLOCK_DATA) &&
2186
        (status & BDRV_BLOCK_OFFSET_VALID))
2187
    {
2188
        status |= BDRV_BLOCK_RECURSE;
2189
    }
2190
    if (type == QCOW2_SUBCLUSTER_COMPRESSED) {
2191
        status |= BDRV_BLOCK_COMPRESSED;
2192
    }
2193
    return status;
2194
}
2195

2196
static int coroutine_fn GRAPH_RDLOCK
2197
qcow2_handle_l2meta(BlockDriverState *bs, QCowL2Meta **pl2meta, bool link_l2)
2198
{
2199
    int ret = 0;
2200
    QCowL2Meta *l2meta = *pl2meta;
2201

2202
    while (l2meta != NULL) {
2203
        QCowL2Meta *next;
2204

2205
        if (link_l2) {
2206
            ret = qcow2_alloc_cluster_link_l2(bs, l2meta);
2207
            if (ret) {
2208
                goto out;
2209
            }
2210
        } else {
2211
            qcow2_alloc_cluster_abort(bs, l2meta);
2212
        }
2213

2214
        /* Take the request off the list of running requests */
2215
        QLIST_REMOVE(l2meta, next_in_flight);
2216

2217
        qemu_co_queue_restart_all(&l2meta->dependent_requests);
2218

2219
        next = l2meta->next;
2220
        g_free(l2meta);
2221
        l2meta = next;
2222
    }
2223
out:
2224
    *pl2meta = l2meta;
2225
    return ret;
2226
}
2227

2228
static int coroutine_fn GRAPH_RDLOCK
2229
qcow2_co_preadv_encrypted(BlockDriverState *bs,
2230
                           uint64_t host_offset,
2231
                           uint64_t offset,
2232
                           uint64_t bytes,
2233
                           QEMUIOVector *qiov,
2234
                           uint64_t qiov_offset)
2235
{
2236
    int ret;
2237
    BDRVQcow2State *s = bs->opaque;
2238
    uint8_t *buf;
2239

2240
    assert(bs->encrypted && s->crypto);
2241
    assert(bytes <= QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
2242

2243
    /*
2244
     * For encrypted images, read everything into a temporary
2245
     * contiguous buffer on which the AES functions can work.
2246
     * Also, decryption in a separate buffer is better as it
2247
     * prevents the guest from learning information about the
2248
     * encrypted nature of the virtual disk.
2249
     */
2250

2251
    buf = qemu_try_blockalign(s->data_file->bs, bytes);
2252
    if (buf == NULL) {
2253
        return -ENOMEM;
2254
    }
2255

2256
    BLKDBG_CO_EVENT(bs->file, BLKDBG_READ_AIO);
2257
    ret = bdrv_co_pread(s->data_file, host_offset, bytes, buf, 0);
2258
    if (ret < 0) {
2259
        goto fail;
2260
    }
2261

2262
    if (qcow2_co_decrypt(bs, host_offset, offset, buf, bytes) < 0)
2263
    {
2264
        ret = -EIO;
2265
        goto fail;
2266
    }
2267
    qemu_iovec_from_buf(qiov, qiov_offset, buf, bytes);
2268

2269
fail:
2270
    qemu_vfree(buf);
2271

2272
    return ret;
2273
}
2274

2275
typedef struct Qcow2AioTask {
2276
    AioTask task;
2277

2278
    BlockDriverState *bs;
2279
    QCow2SubclusterType subcluster_type; /* only for read */
2280
    uint64_t host_offset; /* or l2_entry for compressed read */
2281
    uint64_t offset;
2282
    uint64_t bytes;
2283
    QEMUIOVector *qiov;
2284
    uint64_t qiov_offset;
2285
    QCowL2Meta *l2meta; /* only for write */
2286
} Qcow2AioTask;
2287

2288
static coroutine_fn int qcow2_co_preadv_task_entry(AioTask *task);
2289
static coroutine_fn int qcow2_add_task(BlockDriverState *bs,
2290
                                       AioTaskPool *pool,
2291
                                       AioTaskFunc func,
2292
                                       QCow2SubclusterType subcluster_type,
2293
                                       uint64_t host_offset,
2294
                                       uint64_t offset,
2295
                                       uint64_t bytes,
2296
                                       QEMUIOVector *qiov,
2297
                                       size_t qiov_offset,
2298
                                       QCowL2Meta *l2meta)
2299
{
2300
    Qcow2AioTask local_task;
2301
    Qcow2AioTask *task = pool ? g_new(Qcow2AioTask, 1) : &local_task;
2302

2303
    *task = (Qcow2AioTask) {
2304
        .task.func = func,
2305
        .bs = bs,
2306
        .subcluster_type = subcluster_type,
2307
        .qiov = qiov,
2308
        .host_offset = host_offset,
2309
        .offset = offset,
2310
        .bytes = bytes,
2311
        .qiov_offset = qiov_offset,
2312
        .l2meta = l2meta,
2313
    };
2314

2315
    trace_qcow2_add_task(qemu_coroutine_self(), bs, pool,
2316
                         func == qcow2_co_preadv_task_entry ? "read" : "write",
2317
                         subcluster_type, host_offset, offset, bytes,
2318
                         qiov, qiov_offset);
2319

2320
    if (!pool) {
2321
        return func(&task->task);
2322
    }
2323

2324
    aio_task_pool_start_task(pool, &task->task);
2325

2326
    return 0;
2327
}
2328

2329
static int coroutine_fn GRAPH_RDLOCK
2330
qcow2_co_preadv_task(BlockDriverState *bs, QCow2SubclusterType subc_type,
2331
                     uint64_t host_offset, uint64_t offset, uint64_t bytes,
2332
                     QEMUIOVector *qiov, size_t qiov_offset)
2333
{
2334
    BDRVQcow2State *s = bs->opaque;
2335

2336
    switch (subc_type) {
2337
    case QCOW2_SUBCLUSTER_ZERO_PLAIN:
2338
    case QCOW2_SUBCLUSTER_ZERO_ALLOC:
2339
        /* Both zero types are handled in qcow2_co_preadv_part */
2340
        g_assert_not_reached();
2341

2342
    case QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN:
2343
    case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC:
2344
        assert(bs->backing); /* otherwise handled in qcow2_co_preadv_part */
2345

2346
        BLKDBG_CO_EVENT(bs->file, BLKDBG_READ_BACKING_AIO);
2347
        return bdrv_co_preadv_part(bs->backing, offset, bytes,
2348
                                   qiov, qiov_offset, 0);
2349

2350
    case QCOW2_SUBCLUSTER_COMPRESSED:
2351
        return qcow2_co_preadv_compressed(bs, host_offset,
2352
                                          offset, bytes, qiov, qiov_offset);
2353

2354
    case QCOW2_SUBCLUSTER_NORMAL:
2355
        if (bs->encrypted) {
2356
            return qcow2_co_preadv_encrypted(bs, host_offset,
2357
                                             offset, bytes, qiov, qiov_offset);
2358
        }
2359

2360
        BLKDBG_CO_EVENT(bs->file, BLKDBG_READ_AIO);
2361
        return bdrv_co_preadv_part(s->data_file, host_offset,
2362
                                   bytes, qiov, qiov_offset, 0);
2363

2364
    default:
2365
        g_assert_not_reached();
2366
    }
2367

2368
    g_assert_not_reached();
2369
}
2370

2371
/*
2372
 * This function can count as GRAPH_RDLOCK because qcow2_co_preadv_part() holds
2373
 * the graph lock and keeps it until this coroutine has terminated.
2374
 */
2375
static int coroutine_fn GRAPH_RDLOCK qcow2_co_preadv_task_entry(AioTask *task)
2376
{
2377
    Qcow2AioTask *t = container_of(task, Qcow2AioTask, task);
2378

2379
    assert(!t->l2meta);
2380

2381
    return qcow2_co_preadv_task(t->bs, t->subcluster_type,
2382
                                t->host_offset, t->offset, t->bytes,
2383
                                t->qiov, t->qiov_offset);
2384
}
2385

2386
static int coroutine_fn GRAPH_RDLOCK
2387
qcow2_co_preadv_part(BlockDriverState *bs, int64_t offset, int64_t bytes,
2388
                     QEMUIOVector *qiov, size_t qiov_offset,
2389
                     BdrvRequestFlags flags)
2390
{
2391
    BDRVQcow2State *s = bs->opaque;
2392
    int ret = 0;
2393
    unsigned int cur_bytes; /* number of bytes in current iteration */
2394
    uint64_t host_offset = 0;
2395
    QCow2SubclusterType type;
2396
    AioTaskPool *aio = NULL;
2397

2398
    while (bytes != 0 && aio_task_pool_status(aio) == 0) {
2399
        /* prepare next request */
2400
        cur_bytes = MIN(bytes, INT_MAX);
2401
        if (s->crypto) {
2402
            cur_bytes = MIN(cur_bytes,
2403
                            QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
2404
        }
2405

2406
        qemu_co_mutex_lock(&s->lock);
2407
        ret = qcow2_get_host_offset(bs, offset, &cur_bytes,
2408
                                    &host_offset, &type);
2409
        qemu_co_mutex_unlock(&s->lock);
2410
        if (ret < 0) {
2411
            goto out;
2412
        }
2413

2414
        if (type == QCOW2_SUBCLUSTER_ZERO_PLAIN ||
2415
            type == QCOW2_SUBCLUSTER_ZERO_ALLOC ||
2416
            (type == QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN && !bs->backing) ||
2417
            (type == QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC && !bs->backing))
2418
        {
2419
            qemu_iovec_memset(qiov, qiov_offset, 0, cur_bytes);
2420
        } else {
2421
            if (!aio && cur_bytes != bytes) {
2422
                aio = aio_task_pool_new(QCOW2_MAX_WORKERS);
2423
            }
2424
            ret = qcow2_add_task(bs, aio, qcow2_co_preadv_task_entry, type,
2425
                                 host_offset, offset, cur_bytes,
2426
                                 qiov, qiov_offset, NULL);
2427
            if (ret < 0) {
2428
                goto out;
2429
            }
2430
        }
2431

2432
        bytes -= cur_bytes;
2433
        offset += cur_bytes;
2434
        qiov_offset += cur_bytes;
2435
    }
2436

2437
out:
2438
    if (aio) {
2439
        aio_task_pool_wait_all(aio);
2440
        if (ret == 0) {
2441
            ret = aio_task_pool_status(aio);
2442
        }
2443
        g_free(aio);
2444
    }
2445

2446
    return ret;
2447
}
2448

2449
/* Check if it's possible to merge a write request with the writing of
2450
 * the data from the COW regions */
2451
static bool merge_cow(uint64_t offset, unsigned bytes,
2452
                      QEMUIOVector *qiov, size_t qiov_offset,
2453
                      QCowL2Meta *l2meta)
2454
{
2455
    QCowL2Meta *m;
2456

2457
    for (m = l2meta; m != NULL; m = m->next) {
2458
        /* If both COW regions are empty then there's nothing to merge */
2459
        if (m->cow_start.nb_bytes == 0 && m->cow_end.nb_bytes == 0) {
2460
            continue;
2461
        }
2462

2463
        /* If COW regions are handled already, skip this too */
2464
        if (m->skip_cow) {
2465
            continue;
2466
        }
2467

2468
        /*
2469
         * The write request should start immediately after the first
2470
         * COW region. This does not always happen because the area
2471
         * touched by the request can be larger than the one defined
2472
         * by @m (a single request can span an area consisting of a
2473
         * mix of previously unallocated and allocated clusters, that
2474
         * is why @l2meta is a list).
2475
         */
2476
        if (l2meta_cow_start(m) + m->cow_start.nb_bytes != offset) {
2477
            /* In this case the request starts before this region */
2478
            assert(offset < l2meta_cow_start(m));
2479
            assert(m->cow_start.nb_bytes == 0);
2480
            continue;
2481
        }
2482

2483
        /* The write request should end immediately before the second
2484
         * COW region (see above for why it does not always happen) */
2485
        if (m->offset + m->cow_end.offset != offset + bytes) {
2486
            assert(offset + bytes > m->offset + m->cow_end.offset);
2487
            assert(m->cow_end.nb_bytes == 0);
2488
            continue;
2489
        }
2490

2491
        /* Make sure that adding both COW regions to the QEMUIOVector
2492
         * does not exceed IOV_MAX */
2493
        if (qemu_iovec_subvec_niov(qiov, qiov_offset, bytes) > IOV_MAX - 2) {
2494
            continue;
2495
        }
2496

2497
        m->data_qiov = qiov;
2498
        m->data_qiov_offset = qiov_offset;
2499
        return true;
2500
    }
2501

2502
    return false;
2503
}
2504

2505
/*
2506
 * Return 1 if the COW regions read as zeroes, 0 if not, < 0 on error.
2507
 * Note that returning 0 does not guarantee non-zero data.
2508
 */
2509
static int coroutine_fn GRAPH_RDLOCK
2510
is_zero_cow(BlockDriverState *bs, QCowL2Meta *m)
2511
{
2512
    /*
2513
     * This check is designed for optimization shortcut so it must be
2514
     * efficient.
2515
     * Instead of is_zero(), use bdrv_co_is_zero_fast() as it is
2516
     * faster (but not as accurate and can result in false negatives).
2517
     */
2518
    int ret = bdrv_co_is_zero_fast(bs, m->offset + m->cow_start.offset,
2519
                                   m->cow_start.nb_bytes);
2520
    if (ret <= 0) {
2521
        return ret;
2522
    }
2523

2524
    return bdrv_co_is_zero_fast(bs, m->offset + m->cow_end.offset,
2525
                                m->cow_end.nb_bytes);
2526
}
2527

2528
static int coroutine_fn GRAPH_RDLOCK
2529
handle_alloc_space(BlockDriverState *bs, QCowL2Meta *l2meta)
2530
{
2531
    BDRVQcow2State *s = bs->opaque;
2532
    QCowL2Meta *m;
2533

2534
    if (!(s->data_file->bs->supported_zero_flags & BDRV_REQ_NO_FALLBACK)) {
2535
        return 0;
2536
    }
2537

2538
    if (bs->encrypted) {
2539
        return 0;
2540
    }
2541

2542
    for (m = l2meta; m != NULL; m = m->next) {
2543
        int ret;
2544
        uint64_t start_offset = m->alloc_offset + m->cow_start.offset;
2545
        unsigned nb_bytes = m->cow_end.offset + m->cow_end.nb_bytes -
2546
            m->cow_start.offset;
2547

2548
        if (!m->cow_start.nb_bytes && !m->cow_end.nb_bytes) {
2549
            continue;
2550
        }
2551

2552
        ret = is_zero_cow(bs, m);
2553
        if (ret < 0) {
2554
            return ret;
2555
        } else if (ret == 0) {
2556
            continue;
2557
        }
2558

2559
        /*
2560
         * instead of writing zero COW buffers,
2561
         * efficiently zero out the whole clusters
2562
         */
2563

2564
        ret = qcow2_pre_write_overlap_check(bs, 0, start_offset, nb_bytes,
2565
                                            true);
2566
        if (ret < 0) {
2567
            return ret;
2568
        }
2569

2570
        BLKDBG_CO_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_SPACE);
2571
        ret = bdrv_co_pwrite_zeroes(s->data_file, start_offset, nb_bytes,
2572
                                    BDRV_REQ_NO_FALLBACK);
2573
        if (ret < 0) {
2574
            if (ret != -ENOTSUP && ret != -EAGAIN) {
2575
                return ret;
2576
            }
2577
            continue;
2578
        }
2579

2580
        trace_qcow2_skip_cow(qemu_coroutine_self(), m->offset, m->nb_clusters);
2581
        m->skip_cow = true;
2582
    }
2583
    return 0;
2584
}
2585

2586
/*
2587
 * qcow2_co_pwritev_task
2588
 * Called with s->lock unlocked
2589
 * l2meta  - if not NULL, qcow2_co_pwritev_task() will consume it. Caller must
2590
 *           not use it somehow after qcow2_co_pwritev_task() call
2591
 */
2592
static coroutine_fn GRAPH_RDLOCK
2593
int qcow2_co_pwritev_task(BlockDriverState *bs, uint64_t host_offset,
2594
                          uint64_t offset, uint64_t bytes, QEMUIOVector *qiov,
2595
                          uint64_t qiov_offset, QCowL2Meta *l2meta)
2596
{
2597
    int ret;
2598
    BDRVQcow2State *s = bs->opaque;
2599
    void *crypt_buf = NULL;
2600
    QEMUIOVector encrypted_qiov;
2601

2602
    if (bs->encrypted) {
2603
        assert(s->crypto);
2604
        assert(bytes <= QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
2605
        crypt_buf = qemu_try_blockalign(bs->file->bs, bytes);
2606
        if (crypt_buf == NULL) {
2607
            ret = -ENOMEM;
2608
            goto out_unlocked;
2609
        }
2610
        qemu_iovec_to_buf(qiov, qiov_offset, crypt_buf, bytes);
2611

2612
        if (qcow2_co_encrypt(bs, host_offset, offset, crypt_buf, bytes) < 0) {
2613
            ret = -EIO;
2614
            goto out_unlocked;
2615
        }
2616

2617
        qemu_iovec_init_buf(&encrypted_qiov, crypt_buf, bytes);
2618
        qiov = &encrypted_qiov;
2619
        qiov_offset = 0;
2620
    }
2621

2622
    /* Try to efficiently initialize the physical space with zeroes */
2623
    ret = handle_alloc_space(bs, l2meta);
2624
    if (ret < 0) {
2625
        goto out_unlocked;
2626
    }
2627

2628
    /*
2629
     * If we need to do COW, check if it's possible to merge the
2630
     * writing of the guest data together with that of the COW regions.
2631
     * If it's not possible (or not necessary) then write the
2632
     * guest data now.
2633
     */
2634
    if (!merge_cow(offset, bytes, qiov, qiov_offset, l2meta)) {
2635
        BLKDBG_CO_EVENT(bs->file, BLKDBG_WRITE_AIO);
2636
        trace_qcow2_writev_data(qemu_coroutine_self(), host_offset);
2637
        ret = bdrv_co_pwritev_part(s->data_file, host_offset,
2638
                                   bytes, qiov, qiov_offset, 0);
2639
        if (ret < 0) {
2640
            goto out_unlocked;
2641
        }
2642
    }
2643

2644
    qemu_co_mutex_lock(&s->lock);
2645

2646
    ret = qcow2_handle_l2meta(bs, &l2meta, true);
2647
    goto out_locked;
2648

2649
out_unlocked:
2650
    qemu_co_mutex_lock(&s->lock);
2651

2652
out_locked:
2653
    qcow2_handle_l2meta(bs, &l2meta, false);
2654
    qemu_co_mutex_unlock(&s->lock);
2655

2656
    qemu_vfree(crypt_buf);
2657

2658
    return ret;
2659
}
2660

2661
/*
2662
 * This function can count as GRAPH_RDLOCK because qcow2_co_pwritev_part() holds
2663
 * the graph lock and keeps it until this coroutine has terminated.
2664
 */
2665
static coroutine_fn GRAPH_RDLOCK int qcow2_co_pwritev_task_entry(AioTask *task)
2666
{
2667
    Qcow2AioTask *t = container_of(task, Qcow2AioTask, task);
2668

2669
    assert(!t->subcluster_type);
2670

2671
    return qcow2_co_pwritev_task(t->bs, t->host_offset,
2672
                                 t->offset, t->bytes, t->qiov, t->qiov_offset,
2673
                                 t->l2meta);
2674
}
2675

2676
static int coroutine_fn GRAPH_RDLOCK
2677
qcow2_co_pwritev_part(BlockDriverState *bs, int64_t offset, int64_t bytes,
2678
                      QEMUIOVector *qiov, size_t qiov_offset,
2679
                      BdrvRequestFlags flags)
2680
{
2681
    BDRVQcow2State *s = bs->opaque;
2682
    int offset_in_cluster;
2683
    int ret;
2684
    unsigned int cur_bytes; /* number of sectors in current iteration */
2685
    uint64_t host_offset;
2686
    QCowL2Meta *l2meta = NULL;
2687
    AioTaskPool *aio = NULL;
2688

2689
    trace_qcow2_writev_start_req(qemu_coroutine_self(), offset, bytes);
2690

2691
    while (bytes != 0 && aio_task_pool_status(aio) == 0) {
2692

2693
        l2meta = NULL;
2694

2695
        trace_qcow2_writev_start_part(qemu_coroutine_self());
2696
        offset_in_cluster = offset_into_cluster(s, offset);
2697
        cur_bytes = MIN(bytes, INT_MAX);
2698
        if (bs->encrypted) {
2699
            cur_bytes = MIN(cur_bytes,
2700
                            QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size
2701
                            - offset_in_cluster);
2702
        }
2703

2704
        qemu_co_mutex_lock(&s->lock);
2705

2706
        ret = qcow2_alloc_host_offset(bs, offset, &cur_bytes,
2707
                                      &host_offset, &l2meta);
2708
        if (ret < 0) {
2709
            goto out_locked;
2710
        }
2711

2712
        ret = qcow2_pre_write_overlap_check(bs, 0, host_offset,
2713
                                            cur_bytes, true);
2714
        if (ret < 0) {
2715
            goto out_locked;
2716
        }
2717

2718
        qemu_co_mutex_unlock(&s->lock);
2719

2720
        if (!aio && cur_bytes != bytes) {
2721
            aio = aio_task_pool_new(QCOW2_MAX_WORKERS);
2722
        }
2723
        ret = qcow2_add_task(bs, aio, qcow2_co_pwritev_task_entry, 0,
2724
                             host_offset, offset,
2725
                             cur_bytes, qiov, qiov_offset, l2meta);
2726
        l2meta = NULL; /* l2meta is consumed by qcow2_co_pwritev_task() */
2727
        if (ret < 0) {
2728
            goto fail_nometa;
2729
        }
2730

2731
        bytes -= cur_bytes;
2732
        offset += cur_bytes;
2733
        qiov_offset += cur_bytes;
2734
        trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_bytes);
2735
    }
2736
    ret = 0;
2737

2738
    qemu_co_mutex_lock(&s->lock);
2739

2740
out_locked:
2741
    qcow2_handle_l2meta(bs, &l2meta, false);
2742

2743
    qemu_co_mutex_unlock(&s->lock);
2744

2745
fail_nometa:
2746
    if (aio) {
2747
        aio_task_pool_wait_all(aio);
2748
        if (ret == 0) {
2749
            ret = aio_task_pool_status(aio);
2750
        }
2751
        g_free(aio);
2752
    }
2753

2754
    trace_qcow2_writev_done_req(qemu_coroutine_self(), ret);
2755

2756
    return ret;
2757
}
2758

2759
/*
 * Store persistent dirty bitmaps and flush the L2 table and refcount block
 * caches. All three steps are always attempted; each failure is reported
 * with error_report*(). The image is marked clean only if every step
 * succeeded.
 *
 * Returns 0 on success, otherwise the negative error code of the last
 * step that failed.
 */
static int GRAPH_RDLOCK qcow2_inactivate(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    Error *local_err = NULL;
    int result = 0;
    int ret;

    qcow2_store_persistent_dirty_bitmaps(bs, true, &local_err);
    if (local_err) {
        result = -EINVAL;
        error_reportf_err(local_err, "Lost persistent bitmaps during "
                          "inactivation of node '%s': ",
                          bdrv_get_device_or_node_name(bs));
    }

    ret = qcow2_cache_flush(bs, s->l2_table_cache);
    if (ret != 0) {
        result = ret;
        error_report("Failed to flush the L2 table cache: %s",
                     strerror(-ret));
    }

    ret = qcow2_cache_flush(bs, s->refcount_block_cache);
    if (ret != 0) {
        result = ret;
        error_report("Failed to flush the refcount block cache: %s",
                     strerror(-ret));
    }

    if (!result) {
        qcow2_mark_clean(bs);
    }

    return result;
}
2793

2794
static void coroutine_mixed_fn GRAPH_RDLOCK
2795
qcow2_do_close(BlockDriverState *bs, bool close_data_file)
2796
{
2797
    BDRVQcow2State *s = bs->opaque;
2798
    qemu_vfree(s->l1_table);
2799
    /* else pre-write overlap checks in cache_destroy may crash */
2800
    s->l1_table = NULL;
2801

2802
    if (!(s->flags & BDRV_O_INACTIVE)) {
2803
        qcow2_inactivate(bs);
2804
    }
2805

2806
    cache_clean_timer_del(bs);
2807
    qcow2_cache_destroy(s->l2_table_cache);
2808
    qcow2_cache_destroy(s->refcount_block_cache);
2809

2810
    qcrypto_block_free(s->crypto);
2811
    s->crypto = NULL;
2812
    qapi_free_QCryptoBlockOpenOptions(s->crypto_opts);
2813

2814
    g_free(s->unknown_header_fields);
2815
    cleanup_unknown_header_ext(bs);
2816

2817
    g_free(s->image_data_file);
2818
    g_free(s->image_backing_file);
2819
    g_free(s->image_backing_format);
2820

2821
    if (close_data_file && has_data_file(bs)) {
2822
        GLOBAL_STATE_CODE();
2823
        bdrv_graph_rdunlock_main_loop();
2824
        bdrv_graph_wrlock();
2825
        bdrv_unref_child(bs, s->data_file);
2826
        bdrv_graph_wrunlock();
2827
        s->data_file = NULL;
2828
        bdrv_graph_rdlock_main_loop();
2829
    }
2830

2831
    qcow2_refcount_close(bs);
2832
    qcow2_free_snapshots(bs);
2833
}
2834

2835
/*
 * Close callback: global-state code that takes the graph read lock for the
 * main loop and closes the image, including its external data file.
 */
static void GRAPH_UNLOCKED qcow2_close(BlockDriverState *bs)
{
    GLOBAL_STATE_CODE();
    GRAPH_RDLOCK_GUARD_MAINLOOP();

    qcow2_do_close(bs, /* close_data_file */ true);
}
2842

2843
static void coroutine_fn GRAPH_RDLOCK
2844
qcow2_co_invalidate_cache(BlockDriverState *bs, Error **errp)
2845
{
2846
    ERRP_GUARD();
2847
    BDRVQcow2State *s = bs->opaque;
2848
    BdrvChild *data_file;
2849
    int flags = s->flags;
2850
    QCryptoBlock *crypto = NULL;
2851
    QDict *options;
2852
    int ret;
2853

2854
    /*
2855
     * Backing files are read-only which makes all of their metadata immutable,
2856
     * that means we don't have to worry about reopening them here.
2857
     */
2858

2859
    crypto = s->crypto;
2860
    s->crypto = NULL;
2861

2862
    /*
2863
     * Do not reopen s->data_file (i.e., have qcow2_do_close() not close it,
2864
     * and then prevent qcow2_do_open() from opening it), because this function
2865
     * runs in the I/O path and as such we must not invoke global-state
2866
     * functions like bdrv_unref_child() and bdrv_open_child().
2867
     */
2868

2869
    qcow2_do_close(bs, false);
2870

2871
    data_file = s->data_file;
2872
    memset(s, 0, sizeof(BDRVQcow2State));
2873
    s->data_file = data_file;
2874

2875
    options = qdict_clone_shallow(bs->options);
2876

2877
    flags &= ~BDRV_O_INACTIVE;
2878
    qemu_co_mutex_lock(&s->lock);
2879
    ret = qcow2_do_open(bs, options, flags, false, errp);
2880
    qemu_co_mutex_unlock(&s->lock);
2881
    qobject_unref(options);
2882
    if (ret < 0) {
2883
        error_prepend(errp, "Could not reopen qcow2 layer: ");
2884
        bs->drv = NULL;
2885
        return;
2886
    }
2887

2888
    s->crypto = crypto;
2889
}
2890

2891
static size_t header_ext_add(char *buf, uint32_t magic, const void *s,
2892
    size_t len, size_t buflen)
2893
{
2894
    QCowExtension *ext_backing_fmt = (QCowExtension*) buf;
2895
    size_t ext_len = sizeof(QCowExtension) + ((len + 7) & ~7);
2896

2897
    if (buflen < ext_len) {
2898
        return -ENOSPC;
2899
    }
2900

2901
    *ext_backing_fmt = (QCowExtension) {
2902
        .magic  = cpu_to_be32(magic),
2903
        .len    = cpu_to_be32(len),
2904
    };
2905

2906
    if (len) {
2907
        memcpy(buf + sizeof(QCowExtension), s, len);
2908
    }
2909

2910
    return ext_len;
2911
}
2912

2913
/*
2914
 * Updates the qcow2 header, including the variable length parts of it, i.e.
2915
 * the backing file name and all extensions. qcow2 was not designed to allow
2916
 * such changes, so if we run out of space (we can only use the first cluster)
2917
 * this function may fail.
2918
 *
2919
 * Returns 0 on success, -errno in error cases.
2920
 */
2921
int qcow2_update_header(BlockDriverState *bs)
2922
{
2923
    BDRVQcow2State *s = bs->opaque;
2924
    QCowHeader *header;
2925
    char *buf;
2926
    size_t buflen = s->cluster_size;
2927
    int ret;
2928
    uint64_t total_size;
2929
    uint32_t refcount_table_clusters;
2930
    size_t header_length;
2931
    Qcow2UnknownHeaderExtension *uext;
2932

2933
    buf = qemu_blockalign(bs, buflen);
2934

2935
    /* Header structure */
2936
    header = (QCowHeader*) buf;
2937

2938
    if (buflen < sizeof(*header)) {
2939
        ret = -ENOSPC;
2940
        goto fail;
2941
    }
2942

2943
    header_length = sizeof(*header) + s->unknown_header_fields_size;
2944
    total_size = bs->total_sectors * BDRV_SECTOR_SIZE;
2945
    refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3);
2946

2947
    ret = validate_compression_type(s, NULL);
2948
    if (ret) {
2949
        goto fail;
2950
    }
2951

2952
    *header = (QCowHeader) {
2953
        /* Version 2 fields */
2954
        .magic                  = cpu_to_be32(QCOW_MAGIC),
2955
        .version                = cpu_to_be32(s->qcow_version),
2956
        .backing_file_offset    = 0,
2957
        .backing_file_size      = 0,
2958
        .cluster_bits           = cpu_to_be32(s->cluster_bits),
2959
        .size                   = cpu_to_be64(total_size),
2960
        .crypt_method           = cpu_to_be32(s->crypt_method_header),
2961
        .l1_size                = cpu_to_be32(s->l1_size),
2962
        .l1_table_offset        = cpu_to_be64(s->l1_table_offset),
2963
        .refcount_table_offset  = cpu_to_be64(s->refcount_table_offset),
2964
        .refcount_table_clusters = cpu_to_be32(refcount_table_clusters),
2965
        .nb_snapshots           = cpu_to_be32(s->nb_snapshots),
2966
        .snapshots_offset       = cpu_to_be64(s->snapshots_offset),
2967

2968
        /* Version 3 fields */
2969
        .incompatible_features  = cpu_to_be64(s->incompatible_features),
2970
        .compatible_features    = cpu_to_be64(s->compatible_features),
2971
        .autoclear_features     = cpu_to_be64(s->autoclear_features),
2972
        .refcount_order         = cpu_to_be32(s->refcount_order),
2973
        .header_length          = cpu_to_be32(header_length),
2974
        .compression_type       = s->compression_type,
2975
    };
2976

2977
    /* For older versions, write a shorter header */
2978
    switch (s->qcow_version) {
2979
    case 2:
2980
        ret = offsetof(QCowHeader, incompatible_features);
2981
        break;
2982
    case 3:
2983
        ret = sizeof(*header);
2984
        break;
2985
    default:
2986
        ret = -EINVAL;
2987
        goto fail;
2988
    }
2989

2990
    buf += ret;
2991
    buflen -= ret;
2992
    memset(buf, 0, buflen);
2993

2994
    /* Preserve any unknown field in the header */
2995
    if (s->unknown_header_fields_size) {
2996
        if (buflen < s->unknown_header_fields_size) {
2997
            ret = -ENOSPC;
2998
            goto fail;
2999
        }
3000

3001
        memcpy(buf, s->unknown_header_fields, s->unknown_header_fields_size);
3002
        buf += s->unknown_header_fields_size;
3003
        buflen -= s->unknown_header_fields_size;
3004
    }
3005

3006
    /* Backing file format header extension */
3007
    if (s->image_backing_format) {
3008
        ret = header_ext_add(buf, QCOW2_EXT_MAGIC_BACKING_FORMAT,
3009
                             s->image_backing_format,
3010
                             strlen(s->image_backing_format),
3011
                             buflen);
3012
        if (ret < 0) {
3013
            goto fail;
3014
        }
3015

3016
        buf += ret;
3017
        buflen -= ret;
3018
    }
3019

3020
    /* External data file header extension */
3021
    if (has_data_file(bs) && s->image_data_file) {
3022
        ret = header_ext_add(buf, QCOW2_EXT_MAGIC_DATA_FILE,
3023
                             s->image_data_file, strlen(s->image_data_file),
3024
                             buflen);
3025
        if (ret < 0) {
3026
            goto fail;
3027
        }
3028

3029
        buf += ret;
3030
        buflen -= ret;
3031
    }
3032

3033
    /* Full disk encryption header pointer extension */
3034
    if (s->crypto_header.offset != 0) {
3035
        s->crypto_header.offset = cpu_to_be64(s->crypto_header.offset);
3036
        s->crypto_header.length = cpu_to_be64(s->crypto_header.length);
3037
        ret = header_ext_add(buf, QCOW2_EXT_MAGIC_CRYPTO_HEADER,
3038
                             &s->crypto_header, sizeof(s->crypto_header),
3039
                             buflen);
3040
        s->crypto_header.offset = be64_to_cpu(s->crypto_header.offset);
3041
        s->crypto_header.length = be64_to_cpu(s->crypto_header.length);
3042
        if (ret < 0) {
3043
            goto fail;
3044
        }
3045
        buf += ret;
3046
        buflen -= ret;
3047
    }
3048

3049
    /*
3050
     * Feature table.  A mere 8 feature names occupies 392 bytes, and
3051
     * when coupled with the v3 minimum header of 104 bytes plus the
3052
     * 8-byte end-of-extension marker, that would leave only 8 bytes
3053
     * for a backing file name in an image with 512-byte clusters.
3054
     * Thus, we choose to omit this header for cluster sizes 4k and
3055
     * smaller.
3056
     */
3057
    if (s->qcow_version >= 3 && s->cluster_size > 4096) {
3058
        static const Qcow2Feature features[] = {
3059
            {
3060
                .type = QCOW2_FEAT_TYPE_INCOMPATIBLE,
3061
                .bit  = QCOW2_INCOMPAT_DIRTY_BITNR,
3062
                .name = "dirty bit",
3063
            },
3064
            {
3065
                .type = QCOW2_FEAT_TYPE_INCOMPATIBLE,
3066
                .bit  = QCOW2_INCOMPAT_CORRUPT_BITNR,
3067
                .name = "corrupt bit",
3068
            },
3069
            {
3070
                .type = QCOW2_FEAT_TYPE_INCOMPATIBLE,
3071
                .bit  = QCOW2_INCOMPAT_DATA_FILE_BITNR,
3072
                .name = "external data file",
3073
            },
3074
            {
3075
                .type = QCOW2_FEAT_TYPE_INCOMPATIBLE,
3076
                .bit  = QCOW2_INCOMPAT_COMPRESSION_BITNR,
3077
                .name = "compression type",
3078
            },
3079
            {
3080
                .type = QCOW2_FEAT_TYPE_INCOMPATIBLE,
3081
                .bit  = QCOW2_INCOMPAT_EXTL2_BITNR,
3082
                .name = "extended L2 entries",
3083
            },
3084
            {
3085
                .type = QCOW2_FEAT_TYPE_COMPATIBLE,
3086
                .bit  = QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR,
3087
                .name = "lazy refcounts",
3088
            },
3089
            {
3090
                .type = QCOW2_FEAT_TYPE_AUTOCLEAR,
3091
                .bit  = QCOW2_AUTOCLEAR_BITMAPS_BITNR,
3092
                .name = "bitmaps",
3093
            },
3094
            {
3095
                .type = QCOW2_FEAT_TYPE_AUTOCLEAR,
3096
                .bit  = QCOW2_AUTOCLEAR_DATA_FILE_RAW_BITNR,
3097
                .name = "raw external data",
3098
            },
3099
        };
3100

3101
        ret = header_ext_add(buf, QCOW2_EXT_MAGIC_FEATURE_TABLE,
3102
                             features, sizeof(features), buflen);
3103
        if (ret < 0) {
3104
            goto fail;
3105
        }
3106
        buf += ret;
3107
        buflen -= ret;
3108
    }
3109

3110
    /* Bitmap extension */
3111
    if (s->nb_bitmaps > 0) {
3112
        Qcow2BitmapHeaderExt bitmaps_header = {
3113
            .nb_bitmaps = cpu_to_be32(s->nb_bitmaps),
3114
            .bitmap_directory_size =
3115
                    cpu_to_be64(s->bitmap_directory_size),
3116
            .bitmap_directory_offset =
3117
                    cpu_to_be64(s->bitmap_directory_offset)
3118
        };
3119
        ret = header_ext_add(buf, QCOW2_EXT_MAGIC_BITMAPS,
3120
                             &bitmaps_header, sizeof(bitmaps_header),
3121
                             buflen);
3122
        if (ret < 0) {
3123
            goto fail;
3124
        }
3125
        buf += ret;
3126
        buflen -= ret;
3127
    }
3128

3129
    /* Keep unknown header extensions */
3130
    QLIST_FOREACH(uext, &s->unknown_header_ext, next) {
3131
        ret = header_ext_add(buf, uext->magic, uext->data, uext->len, buflen);
3132
        if (ret < 0) {
3133
            goto fail;
3134
        }
3135

3136
        buf += ret;
3137
        buflen -= ret;
3138
    }
3139

3140
    /* End of header extensions */
3141
    ret = header_ext_add(buf, QCOW2_EXT_MAGIC_END, NULL, 0, buflen);
3142
    if (ret < 0) {
3143
        goto fail;
3144
    }
3145

3146
    buf += ret;
3147
    buflen -= ret;
3148

3149
    /* Backing file name */
3150
    if (s->image_backing_file) {
3151
        size_t backing_file_len = strlen(s->image_backing_file);
3152

3153
        if (buflen < backing_file_len) {
3154
            ret = -ENOSPC;
3155
            goto fail;
3156
        }
3157

3158
        /* Using strncpy is ok here, since buf is not NUL-terminated. */
3159
        strncpy(buf, s->image_backing_file, buflen);
3160

3161
        header->backing_file_offset = cpu_to_be64(buf - ((char*) header));
3162
        header->backing_file_size   = cpu_to_be32(backing_file_len);
3163
    }
3164

3165
    /* Write the new header */
3166
    ret = bdrv_pwrite(bs->file, 0, s->cluster_size, header, 0);
3167
    if (ret < 0) {
3168
        goto fail;
3169
    }
3170

3171
    ret = 0;
3172
fail:
3173
    qemu_vfree(header);
3174
    return ret;
3175
}
3176

3177
static int coroutine_fn GRAPH_RDLOCK
3178
qcow2_co_change_backing_file(BlockDriverState *bs, const char *backing_file,
3179
                             const char *backing_fmt)
3180
{
3181
    BDRVQcow2State *s = bs->opaque;
3182

3183
    /* Adding a backing file means that the external data file alone won't be
3184
     * enough to make sense of the content */
3185
    if (backing_file && data_file_is_raw(bs)) {
3186
        return -EINVAL;
3187
    }
3188

3189
    if (backing_file && strlen(backing_file) > 1023) {
3190
        return -EINVAL;
3191
    }
3192

3193
    pstrcpy(bs->auto_backing_file, sizeof(bs->auto_backing_file),
3194
            backing_file ?: "");
3195
    pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: "");
3196
    pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: "");
3197

3198
    g_free(s->image_backing_file);
3199
    g_free(s->image_backing_format);
3200

3201
    s->image_backing_file = backing_file ? g_strdup(bs->backing_file) : NULL;
3202
    s->image_backing_format = backing_fmt ? g_strdup(bs->backing_format) : NULL;
3203

3204
    return qcow2_update_header(bs);
3205
}
3206

3207
static int coroutine_fn GRAPH_RDLOCK
3208
qcow2_set_up_encryption(BlockDriverState *bs,
3209
                        QCryptoBlockCreateOptions *cryptoopts,
3210
                        Error **errp)
3211
{
3212
    BDRVQcow2State *s = bs->opaque;
3213
    QCryptoBlock *crypto = NULL;
3214
    int fmt, ret;
3215

3216
    switch (cryptoopts->format) {
3217
    case Q_CRYPTO_BLOCK_FORMAT_LUKS:
3218
        fmt = QCOW_CRYPT_LUKS;
3219
        break;
3220
    case Q_CRYPTO_BLOCK_FORMAT_QCOW:
3221
        fmt = QCOW_CRYPT_AES;
3222
        break;
3223
    default:
3224
        error_setg(errp, "Crypto format not supported in qcow2");
3225
        return -EINVAL;
3226
    }
3227

3228
    s->crypt_method_header = fmt;
3229

3230
    crypto = qcrypto_block_create(cryptoopts, "encrypt.",
3231
                                  qcow2_crypto_hdr_init_func,
3232
                                  qcow2_crypto_hdr_write_func,
3233
                                  bs, 0, errp);
3234
    if (!crypto) {
3235
        return -EINVAL;
3236
    }
3237

3238
    ret = qcow2_update_header(bs);
3239
    if (ret < 0) {
3240
        error_setg_errno(errp, -ret, "Could not write encryption header");
3241
        goto out;
3242
    }
3243

3244
    ret = 0;
3245
 out:
3246
    qcrypto_block_free(crypto);
3247
    return ret;
3248
}
3249

3250
/**
3251
 * Preallocates metadata structures for data clusters between @offset (in the
3252
 * guest disk) and @new_length (which is thus generally the new guest disk
3253
 * size).
3254
 *
3255
 * Returns: 0 on success, -errno on failure.
3256
 */
3257
static int coroutine_fn GRAPH_RDLOCK
3258
preallocate_co(BlockDriverState *bs, uint64_t offset, uint64_t new_length,
3259
               PreallocMode mode, Error **errp)
3260
{
3261
    BDRVQcow2State *s = bs->opaque;
3262
    uint64_t bytes;
3263
    uint64_t host_offset = 0;
3264
    int64_t file_length;
3265
    unsigned int cur_bytes;
3266
    int ret;
3267
    QCowL2Meta *meta = NULL, *m;
3268

3269
    assert(offset <= new_length);
3270
    bytes = new_length - offset;
3271

3272
    while (bytes) {
3273
        cur_bytes = MIN(bytes, QEMU_ALIGN_DOWN(INT_MAX, s->cluster_size));
3274
        ret = qcow2_alloc_host_offset(bs, offset, &cur_bytes,
3275
                                      &host_offset, &meta);
3276
        if (ret < 0) {
3277
            error_setg_errno(errp, -ret, "Allocating clusters failed");
3278
            goto out;
3279
        }
3280

3281
        for (m = meta; m != NULL; m = m->next) {
3282
            m->prealloc = true;
3283
        }
3284

3285
        ret = qcow2_handle_l2meta(bs, &meta, true);
3286
        if (ret < 0) {
3287
            error_setg_errno(errp, -ret, "Mapping clusters failed");
3288
            goto out;
3289
        }
3290

3291
        /* TODO Preallocate data if requested */
3292

3293
        bytes -= cur_bytes;
3294
        offset += cur_bytes;
3295
    }
3296

3297
    /*
3298
     * It is expected that the image file is large enough to actually contain
3299
     * all of the allocated clusters (otherwise we get failing reads after
3300
     * EOF). Extend the image to the last allocated sector.
3301
     */
3302
    file_length = bdrv_co_getlength(s->data_file->bs);
3303
    if (file_length < 0) {
3304
        error_setg_errno(errp, -file_length, "Could not get file size");
3305
        ret = file_length;
3306
        goto out;
3307
    }
3308

3309
    if (host_offset + cur_bytes > file_length) {
3310
        if (mode == PREALLOC_MODE_METADATA) {
3311
            mode = PREALLOC_MODE_OFF;
3312
        }
3313
        ret = bdrv_co_truncate(s->data_file, host_offset + cur_bytes, false,
3314
                               mode, 0, errp);
3315
        if (ret < 0) {
3316
            goto out;
3317
        }
3318
    }
3319

3320
    ret = 0;
3321

3322
out:
3323
    qcow2_handle_l2meta(bs, &meta, false);
3324
    return ret;
3325
}
3326

3327
/* qcow2_refcount_metadata_size:
3328
 * @clusters: number of clusters to refcount (including data and L1/L2 tables)
3329
 * @cluster_size: size of a cluster, in bytes
3330
 * @refcount_order: refcount bits power-of-2 exponent
3331
 * @generous_increase: allow for the refcount table to be 1.5x as large as it
3332
 *                     needs to be
3333
 *
3334
 * Returns: Number of bytes required for refcount blocks and table metadata.
3335
 */
3336
int64_t qcow2_refcount_metadata_size(int64_t clusters, size_t cluster_size,
3337
                                     int refcount_order, bool generous_increase,
3338
                                     uint64_t *refblock_count)
3339
{
3340
    /*
3341
     * Every host cluster is reference-counted, including metadata (even
3342
     * refcount metadata is recursively included).
3343
     *
3344
     * An accurate formula for the size of refcount metadata size is difficult
3345
     * to derive.  An easier method of calculation is finding the fixed point
3346
     * where no further refcount blocks or table clusters are required to
3347
     * reference count every cluster.
3348
     */
3349
    int64_t blocks_per_table_cluster = cluster_size / REFTABLE_ENTRY_SIZE;
3350
    int64_t refcounts_per_block = cluster_size * 8 / (1 << refcount_order);
3351
    int64_t table = 0;  /* number of refcount table clusters */
3352
    int64_t blocks = 0; /* number of refcount block clusters */
3353
    int64_t last;
3354
    int64_t n = 0;
3355

3356
    do {
3357
        last = n;
3358
        blocks = DIV_ROUND_UP(clusters + table + blocks, refcounts_per_block);
3359
        table = DIV_ROUND_UP(blocks, blocks_per_table_cluster);
3360
        n = clusters + blocks + table;
3361

3362
        if (n == last && generous_increase) {
3363
            clusters += DIV_ROUND_UP(table, 2);
3364
            n = 0; /* force another loop */
3365
            generous_increase = false;
3366
        }
3367
    } while (n != last);
3368

3369
    if (refblock_count) {
3370
        *refblock_count = blocks;
3371
    }
3372

3373
    return (blocks + table) * cluster_size;
3374
}
3375

3376
/**
3377
 * qcow2_calc_prealloc_size:
3378
 * @total_size: virtual disk size in bytes
3379
 * @cluster_size: cluster size in bytes
3380
 * @refcount_order: refcount bits power-of-2 exponent
3381
 * @extended_l2: true if the image has extended L2 entries
3382
 *
3383
 * Returns: Total number of bytes required for the fully allocated image
3384
 * (including metadata).
3385
 */
3386
static int64_t qcow2_calc_prealloc_size(int64_t total_size,
3387
                                        size_t cluster_size,
3388
                                        int refcount_order,
3389
                                        bool extended_l2)
3390
{
3391
    int64_t meta_size = 0;
3392
    uint64_t nl1e, nl2e;
3393
    int64_t aligned_total_size = ROUND_UP(total_size, cluster_size);
3394
    size_t l2e_size = extended_l2 ? L2E_SIZE_EXTENDED : L2E_SIZE_NORMAL;
3395

3396
    /* header: 1 cluster */
3397
    meta_size += cluster_size;
3398

3399
    /* total size of L2 tables */
3400
    nl2e = aligned_total_size / cluster_size;
3401
    nl2e = ROUND_UP(nl2e, cluster_size / l2e_size);
3402
    meta_size += nl2e * l2e_size;
3403

3404
    /* total size of L1 tables */
3405
    nl1e = nl2e * l2e_size / cluster_size;
3406
    nl1e = ROUND_UP(nl1e, cluster_size / L1E_SIZE);
3407
    meta_size += nl1e * L1E_SIZE;
3408

3409
    /* total size of refcount table and blocks */
3410
    meta_size += qcow2_refcount_metadata_size(
3411
            (meta_size + aligned_total_size) / cluster_size,
3412
            cluster_size, refcount_order, false, NULL);
3413

3414
    return meta_size + aligned_total_size;
3415
}
3416

3417
static bool validate_cluster_size(size_t cluster_size, bool extended_l2,
3418
                                  Error **errp)
3419
{
3420
    int cluster_bits = ctz32(cluster_size);
3421
    if (cluster_bits < MIN_CLUSTER_BITS || cluster_bits > MAX_CLUSTER_BITS ||
3422
        (1 << cluster_bits) != cluster_size)
3423
    {
3424
        error_setg(errp, "Cluster size must be a power of two between %d and "
3425
                   "%dk", 1 << MIN_CLUSTER_BITS, 1 << (MAX_CLUSTER_BITS - 10));
3426
        return false;
3427
    }
3428

3429
    if (extended_l2) {
3430
        unsigned min_cluster_size =
3431
            (1 << MIN_CLUSTER_BITS) * QCOW_EXTL2_SUBCLUSTERS_PER_CLUSTER;
3432
        if (cluster_size < min_cluster_size) {
3433
            error_setg(errp, "Extended L2 entries are only supported with "
3434
                       "cluster sizes of at least %u bytes", min_cluster_size);
3435
            return false;
3436
        }
3437
    }
3438

3439
    return true;
3440
}
3441

3442
static size_t qcow2_opt_get_cluster_size_del(QemuOpts *opts, bool extended_l2,
3443
                                             Error **errp)
3444
{
3445
    size_t cluster_size;
3446

3447
    cluster_size = qemu_opt_get_size_del(opts, BLOCK_OPT_CLUSTER_SIZE,
3448
                                         DEFAULT_CLUSTER_SIZE);
3449
    if (!validate_cluster_size(cluster_size, extended_l2, errp)) {
3450
        return 0;
3451
    }
3452
    return cluster_size;
3453
}
3454

3455
/*
 * Fetch the compatibility level option from @opts via qemu_opt_get_del()
 * and translate it into a qcow2 version number.
 *
 * "0.10" maps to version 2, "1.1" maps to version 3, and an absent option
 * defaults to version 3.  Any other value sets @errp and yields -EINVAL.
 */
static int qcow2_opt_get_version_del(QemuOpts *opts, Error **errp)
{
    char *compat = qemu_opt_get_del(opts, BLOCK_OPT_COMPAT_LEVEL);
    int version = 3; /* default, also the result for "1.1" */

    if (compat) {
        if (strcmp(compat, "0.10") == 0) {
            version = 2;
        } else if (strcmp(compat, "1.1") != 0) {
            error_setg(errp, "Invalid compatibility level: '%s'", compat);
            version = -EINVAL;
        }
    }

    g_free(compat);
    return version;
}
3474

3475
static uint64_t qcow2_opt_get_refcount_bits_del(QemuOpts *opts, int version,
3476
                                                Error **errp)
3477
{
3478
    uint64_t refcount_bits;
3479

3480
    refcount_bits = qemu_opt_get_number_del(opts, BLOCK_OPT_REFCOUNT_BITS, 16);
3481
    if (refcount_bits > 64 || !is_power_of_2(refcount_bits)) {
3482
        error_setg(errp, "Refcount width must be a power of two and may not "
3483
                   "exceed 64 bits");
3484
        return 0;
3485
    }
3486

3487
    if (version < 3 && refcount_bits != 16) {
3488
        error_setg(errp, "Different refcount widths than 16 bits require "
3489
                   "compatibility level 1.1 or above (use compat=1.1 or "
3490
                   "greater)");
3491
        return 0;
3492
    }
3493

3494
    return refcount_bits;
3495
}
3496

3497
static int coroutine_fn GRAPH_UNLOCKED
3498
qcow2_co_create(BlockdevCreateOptions *create_options, Error **errp)
3499
{
3500
    ERRP_GUARD();
3501
    BlockdevCreateOptionsQcow2 *qcow2_opts;
3502
    QDict *options;
3503

3504
    /*
3505
     * Open the image file and write a minimal qcow2 header.
3506
     *
3507
     * We keep things simple and start with a zero-sized image. We also
3508
     * do without refcount blocks or a L1 table for now. We'll fix the
3509
     * inconsistency later.
3510
     *
3511
     * We do need a refcount table because growing the refcount table means
3512
     * allocating two new refcount blocks - the second of which would be at
3513
     * 2 GB for 64k clusters, and we don't want to have a 2 GB initial file
3514
     * size for any qcow2 image.
3515
     */
3516
    BlockBackend *blk = NULL;
3517
    BlockDriverState *bs = NULL;
3518
    BlockDriverState *data_bs = NULL;
3519
    QCowHeader *header;
3520
    size_t cluster_size;
3521
    int version;
3522
    int refcount_order;
3523
    uint64_t *refcount_table;
3524
    int ret;
3525
    uint8_t compression_type = QCOW2_COMPRESSION_TYPE_ZLIB;
3526

3527
    assert(create_options->driver == BLOCKDEV_DRIVER_QCOW2);
3528
    qcow2_opts = &create_options->u.qcow2;
3529

3530
    bs = bdrv_co_open_blockdev_ref(qcow2_opts->file, errp);
3531
    if (bs == NULL) {
3532
        return -EIO;
3533
    }
3534

3535
    /* Validate options and set default values */
3536
    if (!QEMU_IS_ALIGNED(qcow2_opts->size, BDRV_SECTOR_SIZE)) {
3537
        error_setg(errp, "Image size must be a multiple of %u bytes",
3538
                   (unsigned) BDRV_SECTOR_SIZE);
3539
        ret = -EINVAL;
3540
        goto out;
3541
    }
3542

3543
    if (qcow2_opts->has_version) {
3544
        switch (qcow2_opts->version) {
3545
        case BLOCKDEV_QCOW2_VERSION_V2:
3546
            version = 2;
3547
            break;
3548
        case BLOCKDEV_QCOW2_VERSION_V3:
3549
            version = 3;
3550
            break;
3551
        default:
3552
            g_assert_not_reached();
3553
        }
3554
    } else {
3555
        version = 3;
3556
    }
3557

3558
    if (qcow2_opts->has_cluster_size) {
3559
        cluster_size = qcow2_opts->cluster_size;
3560
    } else {
3561
        cluster_size = DEFAULT_CLUSTER_SIZE;
3562
    }
3563

3564
    if (!qcow2_opts->has_extended_l2) {
3565
        qcow2_opts->extended_l2 = false;
3566
    }
3567
    if (qcow2_opts->extended_l2) {
3568
        if (version < 3) {
3569
            error_setg(errp, "Extended L2 entries are only supported with "
3570
                       "compatibility level 1.1 and above (use version=v3 or "
3571
                       "greater)");
3572
            ret = -EINVAL;
3573
            goto out;
3574
        }
3575
    }
3576

3577
    if (!validate_cluster_size(cluster_size, qcow2_opts->extended_l2, errp)) {
3578
        ret = -EINVAL;
3579
        goto out;
3580
    }
3581

3582
    if (!qcow2_opts->has_preallocation) {
3583
        qcow2_opts->preallocation = PREALLOC_MODE_OFF;
3584
    }
3585
    if (qcow2_opts->backing_file &&
3586
        qcow2_opts->preallocation != PREALLOC_MODE_OFF &&
3587
        !qcow2_opts->extended_l2)
3588
    {
3589
        error_setg(errp, "Backing file and preallocation can only be used at "
3590
                   "the same time if extended_l2 is on");
3591
        ret = -EINVAL;
3592
        goto out;
3593
    }
3594
    if (qcow2_opts->has_backing_fmt && !qcow2_opts->backing_file) {
3595
        error_setg(errp, "Backing format cannot be used without backing file");
3596
        ret = -EINVAL;
3597
        goto out;
3598
    }
3599

3600
    if (!qcow2_opts->has_lazy_refcounts) {
3601
        qcow2_opts->lazy_refcounts = false;
3602
    }
3603
    if (version < 3 && qcow2_opts->lazy_refcounts) {
3604
        error_setg(errp, "Lazy refcounts only supported with compatibility "
3605
                   "level 1.1 and above (use version=v3 or greater)");
3606
        ret = -EINVAL;
3607
        goto out;
3608
    }
3609

3610
    if (!qcow2_opts->has_refcount_bits) {
3611
        qcow2_opts->refcount_bits = 16;
3612
    }
3613
    if (qcow2_opts->refcount_bits > 64 ||
3614
        !is_power_of_2(qcow2_opts->refcount_bits))
3615
    {
3616
        error_setg(errp, "Refcount width must be a power of two and may not "
3617
                   "exceed 64 bits");
3618
        ret = -EINVAL;
3619
        goto out;
3620
    }
3621
    if (version < 3 && qcow2_opts->refcount_bits != 16) {
3622
        error_setg(errp, "Different refcount widths than 16 bits require "
3623
                   "compatibility level 1.1 or above (use version=v3 or "
3624
                   "greater)");
3625
        ret = -EINVAL;
3626
        goto out;
3627
    }
3628
    refcount_order = ctz32(qcow2_opts->refcount_bits);
3629

3630
    if (qcow2_opts->data_file_raw && !qcow2_opts->data_file) {
3631
        error_setg(errp, "data-file-raw requires data-file");
3632
        ret = -EINVAL;
3633
        goto out;
3634
    }
3635
    if (qcow2_opts->data_file_raw && qcow2_opts->backing_file) {
3636
        error_setg(errp, "Backing file and data-file-raw cannot be used at "
3637
                   "the same time");
3638
        ret = -EINVAL;
3639
        goto out;
3640
    }
3641
    if (qcow2_opts->data_file_raw &&
3642
        qcow2_opts->preallocation == PREALLOC_MODE_OFF)
3643
    {
3644
        /*
3645
         * data-file-raw means that "the external data file can be
3646
         * read as a consistent standalone raw image without looking
3647
         * at the qcow2 metadata."  It does not say that the metadata
3648
         * must be ignored, though (and the qcow2 driver in fact does
3649
         * not ignore it), so the L1/L2 tables must be present and
3650
         * give a 1:1 mapping, so you get the same result regardless
3651
         * of whether you look at the metadata or whether you ignore
3652
         * it.
3653
         */
3654
        qcow2_opts->preallocation = PREALLOC_MODE_METADATA;
3655

3656
        /*
3657
         * Cannot use preallocation with backing files, but giving a
3658
         * backing file when specifying data_file_raw is an error
3659
         * anyway.
3660
         */
3661
        assert(!qcow2_opts->backing_file);
3662
    }
3663

3664
    if (qcow2_opts->data_file) {
3665
        if (version < 3) {
3666
            error_setg(errp, "External data files are only supported with "
3667
                       "compatibility level 1.1 and above (use version=v3 or "
3668
                       "greater)");
3669
            ret = -EINVAL;
3670
            goto out;
3671
        }
3672
        data_bs = bdrv_co_open_blockdev_ref(qcow2_opts->data_file, errp);
3673
        if (data_bs == NULL) {
3674
            ret = -EIO;
3675
            goto out;
3676
        }
3677
    }
3678

3679
    if (qcow2_opts->has_compression_type &&
3680
        qcow2_opts->compression_type != QCOW2_COMPRESSION_TYPE_ZLIB) {
3681

3682
        ret = -EINVAL;
3683

3684
        if (version < 3) {
3685
            error_setg(errp, "Non-zlib compression type is only supported with "
3686
                       "compatibility level 1.1 and above (use version=v3 or "
3687
                       "greater)");
3688
            goto out;
3689
        }
3690

3691
        switch (qcow2_opts->compression_type) {
3692
#ifdef CONFIG_ZSTD
3693
        case QCOW2_COMPRESSION_TYPE_ZSTD:
3694
            break;
3695
#endif
3696
        default:
3697
            error_setg(errp, "Unknown compression type");
3698
            goto out;
3699
        }
3700

3701
        compression_type = qcow2_opts->compression_type;
3702
    }
3703

3704
    /* Create BlockBackend to write to the image */
3705
    blk = blk_co_new_with_bs(bs, BLK_PERM_WRITE | BLK_PERM_RESIZE, BLK_PERM_ALL,
3706
                             errp);
3707
    if (!blk) {
3708
        ret = -EPERM;
3709
        goto out;
3710
    }
3711
    blk_set_allow_write_beyond_eof(blk, true);
3712

3713
    /* Write the header */
3714
    QEMU_BUILD_BUG_ON((1 << MIN_CLUSTER_BITS) < sizeof(*header));
3715
    header = g_malloc0(cluster_size);
3716
    *header = (QCowHeader) {
3717
        .magic                      = cpu_to_be32(QCOW_MAGIC),
3718
        .version                    = cpu_to_be32(version),
3719
        .cluster_bits               = cpu_to_be32(ctz32(cluster_size)),
3720
        .size                       = cpu_to_be64(0),
3721
        .l1_table_offset            = cpu_to_be64(0),
3722
        .l1_size                    = cpu_to_be32(0),
3723
        .refcount_table_offset      = cpu_to_be64(cluster_size),
3724
        .refcount_table_clusters    = cpu_to_be32(1),
3725
        .refcount_order             = cpu_to_be32(refcount_order),
3726
        /* don't deal with endianness since compression_type is 1 byte long */
3727
        .compression_type           = compression_type,
3728
        .header_length              = cpu_to_be32(sizeof(*header)),
3729
    };
3730

3731
    /* We'll update this to correct value later */
3732
    header->crypt_method = cpu_to_be32(QCOW_CRYPT_NONE);
3733

3734
    if (qcow2_opts->lazy_refcounts) {
3735
        header->compatible_features |=
3736
            cpu_to_be64(QCOW2_COMPAT_LAZY_REFCOUNTS);
3737
    }
3738
    if (data_bs) {
3739
        header->incompatible_features |=
3740
            cpu_to_be64(QCOW2_INCOMPAT_DATA_FILE);
3741
    }
3742
    if (qcow2_opts->data_file_raw) {
3743
        header->autoclear_features |=
3744
            cpu_to_be64(QCOW2_AUTOCLEAR_DATA_FILE_RAW);
3745
    }
3746
    if (compression_type != QCOW2_COMPRESSION_TYPE_ZLIB) {
3747
        header->incompatible_features |=
3748
            cpu_to_be64(QCOW2_INCOMPAT_COMPRESSION);
3749
    }
3750

3751
    if (qcow2_opts->extended_l2) {
3752
        header->incompatible_features |=
3753
            cpu_to_be64(QCOW2_INCOMPAT_EXTL2);
3754
    }
3755

3756
    ret = blk_co_pwrite(blk, 0, cluster_size, header, 0);
3757
    g_free(header);
3758
    if (ret < 0) {
3759
        error_setg_errno(errp, -ret, "Could not write qcow2 header");
3760
        goto out;
3761
    }
3762

3763
    /* Write a refcount table with one refcount block */
3764
    refcount_table = g_malloc0(2 * cluster_size);
3765
    refcount_table[0] = cpu_to_be64(2 * cluster_size);
3766
    ret = blk_co_pwrite(blk, cluster_size, 2 * cluster_size, refcount_table, 0);
3767
    g_free(refcount_table);
3768

3769
    if (ret < 0) {
3770
        error_setg_errno(errp, -ret, "Could not write refcount table");
3771
        goto out;
3772
    }
3773

3774
    blk_co_unref(blk);
3775
    blk = NULL;
3776

3777
    /*
3778
     * And now open the image and make it consistent first (i.e. increase the
3779
     * refcount of the cluster that is occupied by the header and the refcount
3780
     * table)
3781
     */
3782
    options = qdict_new();
3783
    qdict_put_str(options, "driver", "qcow2");
3784
    qdict_put_str(options, "file", bs->node_name);
3785
    if (data_bs) {
3786
        qdict_put_str(options, "data-file", data_bs->node_name);
3787
    }
3788
    blk = blk_co_new_open(NULL, NULL, options,
3789
                          BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_NO_FLUSH,
3790
                          errp);
3791
    if (blk == NULL) {
3792
        ret = -EIO;
3793
        goto out;
3794
    }
3795

3796
    bdrv_graph_co_rdlock();
3797
    ret = qcow2_alloc_clusters(blk_bs(blk), 3 * cluster_size);
3798
    if (ret < 0) {
3799
        bdrv_graph_co_rdunlock();
3800
        error_setg_errno(errp, -ret, "Could not allocate clusters for qcow2 "
3801
                         "header and refcount table");
3802
        goto out;
3803

3804
    } else if (ret != 0) {
3805
        error_report("Huh, first cluster in empty image is already in use?");
3806
        abort();
3807
    }
3808

3809
    /* Set the external data file if necessary */
3810
    if (data_bs) {
3811
        BDRVQcow2State *s = blk_bs(blk)->opaque;
3812
        s->image_data_file = g_strdup(data_bs->filename);
3813
    }
3814

3815
    /* Create a full header (including things like feature table) */
3816
    ret = qcow2_update_header(blk_bs(blk));
3817
    bdrv_graph_co_rdunlock();
3818

3819
    if (ret < 0) {
3820
        error_setg_errno(errp, -ret, "Could not update qcow2 header");
3821
        goto out;
3822
    }
3823

3824
    /* Okay, now that we have a valid image, let's give it the right size */
3825
    ret = blk_co_truncate(blk, qcow2_opts->size, false,
3826
                          qcow2_opts->preallocation, 0, errp);
3827
    if (ret < 0) {
3828
        error_prepend(errp, "Could not resize image: ");
3829
        goto out;
3830
    }
3831

3832
    /* Want a backing file? There you go. */
3833
    if (qcow2_opts->backing_file) {
3834
        const char *backing_format = NULL;
3835

3836
        if (qcow2_opts->has_backing_fmt) {
3837
            backing_format = BlockdevDriver_str(qcow2_opts->backing_fmt);
3838
        }
3839

3840
        bdrv_graph_co_rdlock();
3841
        ret = bdrv_co_change_backing_file(blk_bs(blk), qcow2_opts->backing_file,
3842
                                          backing_format, false);
3843
        bdrv_graph_co_rdunlock();
3844

3845
        if (ret < 0) {
3846
            error_setg_errno(errp, -ret, "Could not assign backing file '%s' "
3847
                             "with format '%s'", qcow2_opts->backing_file,
3848
                             backing_format);
3849
            goto out;
3850
        }
3851
    }
3852

3853
    /* Want encryption? There you go. */
3854
    if (qcow2_opts->encrypt) {
3855
        bdrv_graph_co_rdlock();
3856
        ret = qcow2_set_up_encryption(blk_bs(blk), qcow2_opts->encrypt, errp);
3857
        bdrv_graph_co_rdunlock();
3858

3859
        if (ret < 0) {
3860
            goto out;
3861
        }
3862
    }
3863

3864
    blk_co_unref(blk);
3865
    blk = NULL;
3866

3867
    /* Reopen the image without BDRV_O_NO_FLUSH to flush it before returning.
3868
     * Using BDRV_O_NO_IO, since encryption is now setup we don't want to
3869
     * have to setup decryption context. We're not doing any I/O on the top
3870
     * level BlockDriverState, only lower layers, where BDRV_O_NO_IO does
3871
     * not have effect.
3872
     */
3873
    options = qdict_new();
3874
    qdict_put_str(options, "driver", "qcow2");
3875
    qdict_put_str(options, "file", bs->node_name);
3876
    if (data_bs) {
3877
        qdict_put_str(options, "data-file", data_bs->node_name);
3878
    }
3879
    blk = blk_co_new_open(NULL, NULL, options,
3880
                          BDRV_O_RDWR | BDRV_O_NO_BACKING | BDRV_O_NO_IO,
3881
                          errp);
3882
    if (blk == NULL) {
3883
        ret = -EIO;
3884
        goto out;
3885
    }
3886

3887
    ret = 0;
3888
out:
3889
    blk_co_unref(blk);
3890
    bdrv_co_unref(bs);
3891
    bdrv_co_unref(data_bs);
3892
    return ret;
3893
}
3894

3895
static int coroutine_fn GRAPH_UNLOCKED
3896
qcow2_co_create_opts(BlockDriver *drv, const char *filename, QemuOpts *opts,
3897
                     Error **errp)
3898
{
3899
    BlockdevCreateOptions *create_options = NULL;
3900
    QDict *qdict;
3901
    Visitor *v;
3902
    BlockDriverState *bs = NULL;
3903
    BlockDriverState *data_bs = NULL;
3904
    const char *val;
3905
    int ret;
3906

3907
    /* Only the keyval visitor supports the dotted syntax needed for
3908
     * encryption, so go through a QDict before getting a QAPI type. Ignore
3909
     * options meant for the protocol layer so that the visitor doesn't
3910
     * complain. */
3911
    qdict = qemu_opts_to_qdict_filtered(opts, NULL, bdrv_qcow2.create_opts,
3912
                                        true);
3913

3914
    /* Handle encryption options */
3915
    val = qdict_get_try_str(qdict, BLOCK_OPT_ENCRYPT);
3916
    if (val && !strcmp(val, "on")) {
3917
        qdict_put_str(qdict, BLOCK_OPT_ENCRYPT, "qcow");
3918
    } else if (val && !strcmp(val, "off")) {
3919
        qdict_del(qdict, BLOCK_OPT_ENCRYPT);
3920
    }
3921

3922
    val = qdict_get_try_str(qdict, BLOCK_OPT_ENCRYPT_FORMAT);
3923
    if (val && !strcmp(val, "aes")) {
3924
        qdict_put_str(qdict, BLOCK_OPT_ENCRYPT_FORMAT, "qcow");
3925
    }
3926

3927
    /* Convert compat=0.10/1.1 into compat=v2/v3, to be renamed into
3928
     * version=v2/v3 below. */
3929
    val = qdict_get_try_str(qdict, BLOCK_OPT_COMPAT_LEVEL);
3930
    if (val && !strcmp(val, "0.10")) {
3931
        qdict_put_str(qdict, BLOCK_OPT_COMPAT_LEVEL, "v2");
3932
    } else if (val && !strcmp(val, "1.1")) {
3933
        qdict_put_str(qdict, BLOCK_OPT_COMPAT_LEVEL, "v3");
3934
    }
3935

3936
    /* Change legacy command line options into QMP ones */
3937
    static const QDictRenames opt_renames[] = {
3938
        { BLOCK_OPT_BACKING_FILE,       "backing-file" },
3939
        { BLOCK_OPT_BACKING_FMT,        "backing-fmt" },
3940
        { BLOCK_OPT_CLUSTER_SIZE,       "cluster-size" },
3941
        { BLOCK_OPT_LAZY_REFCOUNTS,     "lazy-refcounts" },
3942
        { BLOCK_OPT_EXTL2,              "extended-l2" },
3943
        { BLOCK_OPT_REFCOUNT_BITS,      "refcount-bits" },
3944
        { BLOCK_OPT_ENCRYPT,            BLOCK_OPT_ENCRYPT_FORMAT },
3945
        { BLOCK_OPT_COMPAT_LEVEL,       "version" },
3946
        { BLOCK_OPT_DATA_FILE_RAW,      "data-file-raw" },
3947
        { BLOCK_OPT_COMPRESSION_TYPE,   "compression-type" },
3948
        { NULL, NULL },
3949
    };
3950

3951
    if (!qdict_rename_keys(qdict, opt_renames, errp)) {
3952
        ret = -EINVAL;
3953
        goto finish;
3954
    }
3955

3956
    /* Create and open the file (protocol layer) */
3957
    ret = bdrv_co_create_file(filename, opts, errp);
3958
    if (ret < 0) {
3959
        goto finish;
3960
    }
3961

3962
    bs = bdrv_co_open(filename, NULL, NULL,
3963
                      BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL, errp);
3964
    if (bs == NULL) {
3965
        ret = -EIO;
3966
        goto finish;
3967
    }
3968

3969
    /* Create and open an external data file (protocol layer) */
3970
    val = qdict_get_try_str(qdict, BLOCK_OPT_DATA_FILE);
3971
    if (val) {
3972
        ret = bdrv_co_create_file(val, opts, errp);
3973
        if (ret < 0) {
3974
            goto finish;
3975
        }
3976

3977
        data_bs = bdrv_co_open(val, NULL, NULL,
3978
                               BDRV_O_RDWR | BDRV_O_RESIZE | BDRV_O_PROTOCOL,
3979
                               errp);
3980
        if (data_bs == NULL) {
3981
            ret = -EIO;
3982
            goto finish;
3983
        }
3984

3985
        qdict_del(qdict, BLOCK_OPT_DATA_FILE);
3986
        qdict_put_str(qdict, "data-file", data_bs->node_name);
3987
    }
3988

3989
    /* Set 'driver' and 'node' options */
3990
    qdict_put_str(qdict, "driver", "qcow2");
3991
    qdict_put_str(qdict, "file", bs->node_name);
3992

3993
    /* Now get the QAPI type BlockdevCreateOptions */
3994
    v = qobject_input_visitor_new_flat_confused(qdict, errp);
3995
    if (!v) {
3996
        ret = -EINVAL;
3997
        goto finish;
3998
    }
3999

4000
    visit_type_BlockdevCreateOptions(v, NULL, &create_options, errp);
4001
    visit_free(v);
4002
    if (!create_options) {
4003
        ret = -EINVAL;
4004
        goto finish;
4005
    }
4006

4007
    /* Silently round up size */
4008
    create_options->u.qcow2.size = ROUND_UP(create_options->u.qcow2.size,
4009
                                            BDRV_SECTOR_SIZE);
4010

4011
    /* Create the qcow2 image (format layer) */
4012
    ret = qcow2_co_create(create_options, errp);
4013
finish:
4014
    if (ret < 0) {
4015
        bdrv_graph_co_rdlock();
4016
        bdrv_co_delete_file_noerr(bs);
4017
        bdrv_co_delete_file_noerr(data_bs);
4018
        bdrv_graph_co_rdunlock();
4019
    } else {
4020
        ret = 0;
4021
    }
4022

4023
    qobject_unref(qdict);
4024
    bdrv_co_unref(bs);
4025
    bdrv_co_unref(data_bs);
4026
    qapi_free_BlockdevCreateOptions(create_options);
4027
    return ret;
4028
}
4029

4030

4031
static bool coroutine_fn GRAPH_RDLOCK
4032
is_zero(BlockDriverState *bs, int64_t offset, int64_t bytes)
4033
{
4034
    int64_t nr;
4035
    int res;
4036

4037
    /* Clamp to image length, before checking status of underlying sectors */
4038
    if (offset + bytes > bs->total_sectors * BDRV_SECTOR_SIZE) {
4039
        bytes = bs->total_sectors * BDRV_SECTOR_SIZE - offset;
4040
    }
4041

4042
    if (!bytes) {
4043
        return true;
4044
    }
4045

4046
    /*
4047
     * bdrv_block_status_above doesn't merge different types of zeros, for
4048
     * example, zeros which come from the region which is unallocated in
4049
     * the whole backing chain, and zeros which come because of a short
4050
     * backing file. So, we need a loop.
4051
     */
4052
    do {
4053
        res = bdrv_co_block_status_above(bs, NULL, offset, bytes, &nr, NULL, NULL);
4054
        offset += nr;
4055
        bytes -= nr;
4056
    } while (res >= 0 && (res & BDRV_BLOCK_ZERO) && nr && bytes);
4057

4058
    return res >= 0 && (res & BDRV_BLOCK_ZERO) && bytes == 0;
4059
}
4060

4061
static int coroutine_fn GRAPH_RDLOCK
4062
qcow2_co_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int64_t bytes,
4063
                       BdrvRequestFlags flags)
4064
{
4065
    int ret;
4066
    BDRVQcow2State *s = bs->opaque;
4067

4068
    uint32_t head = offset_into_subcluster(s, offset);
4069
    uint32_t tail = ROUND_UP(offset + bytes, s->subcluster_size) -
4070
        (offset + bytes);
4071

4072
    trace_qcow2_pwrite_zeroes_start_req(qemu_coroutine_self(), offset, bytes);
4073
    if (offset + bytes == bs->total_sectors * BDRV_SECTOR_SIZE) {
4074
        tail = 0;
4075
    }
4076

4077
    if (head || tail) {
4078
        uint64_t off;
4079
        unsigned int nr;
4080
        QCow2SubclusterType type;
4081

4082
        assert(head + bytes + tail <= s->subcluster_size);
4083

4084
        /* check whether remainder of cluster already reads as zero */
4085
        if (!(is_zero(bs, offset - head, head) &&
4086
              is_zero(bs, offset + bytes, tail))) {
4087
            return -ENOTSUP;
4088
        }
4089

4090
        qemu_co_mutex_lock(&s->lock);
4091
        /* We can have new write after previous check */
4092
        offset -= head;
4093
        bytes = s->subcluster_size;
4094
        nr = s->subcluster_size;
4095
        ret = qcow2_get_host_offset(bs, offset, &nr, &off, &type);
4096
        if (ret < 0 ||
4097
            (type != QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN &&
4098
             type != QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC &&
4099
             type != QCOW2_SUBCLUSTER_ZERO_PLAIN &&
4100
             type != QCOW2_SUBCLUSTER_ZERO_ALLOC)) {
4101
            qemu_co_mutex_unlock(&s->lock);
4102
            return ret < 0 ? ret : -ENOTSUP;
4103
        }
4104
    } else {
4105
        qemu_co_mutex_lock(&s->lock);
4106
    }
4107

4108
    trace_qcow2_pwrite_zeroes(qemu_coroutine_self(), offset, bytes);
4109

4110
    /* Whatever is left can use real zero subclusters */
4111
    ret = qcow2_subcluster_zeroize(bs, offset, bytes, flags);
4112
    qemu_co_mutex_unlock(&s->lock);
4113

4114
    return ret;
4115
}
4116

4117
static int coroutine_fn GRAPH_RDLOCK
4118
qcow2_co_pdiscard(BlockDriverState *bs, int64_t offset, int64_t bytes)
4119
{
4120
    int ret;
4121
    BDRVQcow2State *s = bs->opaque;
4122

4123
    /* If the image does not support QCOW_OFLAG_ZERO then discarding
4124
     * clusters could expose stale data from the backing file. */
4125
    if (s->qcow_version < 3 && bs->backing) {
4126
        return -ENOTSUP;
4127
    }
4128

4129
    if (!QEMU_IS_ALIGNED(offset | bytes, s->cluster_size)) {
4130
        assert(bytes < s->cluster_size);
4131
        /* Ignore partial clusters, except for the special case of the
4132
         * complete partial cluster at the end of an unaligned file */
4133
        if (!QEMU_IS_ALIGNED(offset, s->cluster_size) ||
4134
            offset + bytes != bs->total_sectors * BDRV_SECTOR_SIZE) {
4135
            return -ENOTSUP;
4136
        }
4137
    }
4138

4139
    qemu_co_mutex_lock(&s->lock);
4140
    ret = qcow2_cluster_discard(bs, offset, bytes, QCOW2_DISCARD_REQUEST,
4141
                                false);
4142
    qemu_co_mutex_unlock(&s->lock);
4143
    return ret;
4144
}
4145

4146
static int coroutine_fn GRAPH_RDLOCK
4147
qcow2_co_copy_range_from(BlockDriverState *bs,
4148
                         BdrvChild *src, int64_t src_offset,
4149
                         BdrvChild *dst, int64_t dst_offset,
4150
                         int64_t bytes, BdrvRequestFlags read_flags,
4151
                         BdrvRequestFlags write_flags)
4152
{
4153
    BDRVQcow2State *s = bs->opaque;
4154
    int ret;
4155
    unsigned int cur_bytes; /* number of bytes in current iteration */
4156
    BdrvChild *child = NULL;
4157
    BdrvRequestFlags cur_write_flags;
4158

4159
    assert(!bs->encrypted);
4160
    qemu_co_mutex_lock(&s->lock);
4161

4162
    while (bytes != 0) {
4163
        uint64_t copy_offset = 0;
4164
        QCow2SubclusterType type;
4165
        /* prepare next request */
4166
        cur_bytes = MIN(bytes, INT_MAX);
4167
        cur_write_flags = write_flags;
4168

4169
        ret = qcow2_get_host_offset(bs, src_offset, &cur_bytes,
4170
                                    &copy_offset, &type);
4171
        if (ret < 0) {
4172
            goto out;
4173
        }
4174

4175
        switch (type) {
4176
        case QCOW2_SUBCLUSTER_UNALLOCATED_PLAIN:
4177
        case QCOW2_SUBCLUSTER_UNALLOCATED_ALLOC:
4178
            if (bs->backing && bs->backing->bs) {
4179
                int64_t backing_length = bdrv_co_getlength(bs->backing->bs);
4180
                if (src_offset >= backing_length) {
4181
                    cur_write_flags |= BDRV_REQ_ZERO_WRITE;
4182
                } else {
4183
                    child = bs->backing;
4184
                    cur_bytes = MIN(cur_bytes, backing_length - src_offset);
4185
                    copy_offset = src_offset;
4186
                }
4187
            } else {
4188
                cur_write_flags |= BDRV_REQ_ZERO_WRITE;
4189
            }
4190
            break;
4191

4192
        case QCOW2_SUBCLUSTER_ZERO_PLAIN:
4193
        case QCOW2_SUBCLUSTER_ZERO_ALLOC:
4194
            cur_write_flags |= BDRV_REQ_ZERO_WRITE;
4195
            break;
4196

4197
        case QCOW2_SUBCLUSTER_COMPRESSED:
4198
            ret = -ENOTSUP;
4199
            goto out;
4200

4201
        case QCOW2_SUBCLUSTER_NORMAL:
4202
            child = s->data_file;
4203
            break;
4204

4205
        default:
4206
            abort();
4207
        }
4208
        qemu_co_mutex_unlock(&s->lock);
4209
        ret = bdrv_co_copy_range_from(child,
4210
                                      copy_offset,
4211
                                      dst, dst_offset,
4212
                                      cur_bytes, read_flags, cur_write_flags);
4213
        qemu_co_mutex_lock(&s->lock);
4214
        if (ret < 0) {
4215
            goto out;
4216
        }
4217

4218
        bytes -= cur_bytes;
4219
        src_offset += cur_bytes;
4220
        dst_offset += cur_bytes;
4221
    }
4222
    ret = 0;
4223

4224
out:
4225
    qemu_co_mutex_unlock(&s->lock);
4226
    return ret;
4227
}
4228

4229
/*
 * Copy @bytes bytes from @src (starting at @src_offset) into this image
 * at guest offset @dst_offset.
 *
 * The request is processed in chunks.  For each chunk, host space is
 * obtained with qcow2_alloc_host_offset() (which may shrink @cur_bytes),
 * the target range is checked with qcow2_pre_write_overlap_check(), the
 * copy is forwarded to s->data_file with s->lock dropped, and finally
 * qcow2_handle_l2meta() is called for the chunk's l2meta.
 *
 * @dst is not referenced in this function; the write target is always
 * s->data_file.  The image must not be encrypted (asserted).
 *
 * Returns 0 on success, or the negative value returned by the first
 * failing step.
 */
static int coroutine_fn GRAPH_RDLOCK
4230
qcow2_co_copy_range_to(BlockDriverState *bs,
4231
                       BdrvChild *src, int64_t src_offset,
4232
                       BdrvChild *dst, int64_t dst_offset,
4233
                       int64_t bytes, BdrvRequestFlags read_flags,
4234
                       BdrvRequestFlags write_flags)
4235
{
4236
    BDRVQcow2State *s = bs->opaque;
4237
    int ret;
4238
    unsigned int cur_bytes; /* number of bytes handled in current iteration */
4239
    uint64_t host_offset;
4240
    QCowL2Meta *l2meta = NULL;
4241

4242
    assert(!bs->encrypted);
4243

4244
    qemu_co_mutex_lock(&s->lock);
4245

4246
    while (bytes != 0) {
4247

4248
        l2meta = NULL;
4249

4250
        /* cur_bytes is an unsigned int, so cap each chunk at INT_MAX */
        cur_bytes = MIN(bytes, INT_MAX);
4251

4252
        /* TODO:
4253
         * If src->bs == dst->bs, we could simply copy by incrementing
4254
         * the refcnt, without copying user data.
4255
         * Or if src->bs == dst->bs->backing->bs, we could copy by discarding. */
4256
        ret = qcow2_alloc_host_offset(bs, dst_offset, &cur_bytes,
4257
                                      &host_offset, &l2meta);
4258
        if (ret < 0) {
4259
            goto fail;
4260
        }
4261

4262
        ret = qcow2_pre_write_overlap_check(bs, 0, host_offset, cur_bytes,
4263
                                            true);
4264
        if (ret < 0) {
4265
            goto fail;
4266
        }
4267

4268
        /* The actual data copy runs with s->lock released */
        qemu_co_mutex_unlock(&s->lock);
4269
        ret = bdrv_co_copy_range_to(src, src_offset, s->data_file, host_offset,
4270
                                    cur_bytes, read_flags, write_flags);
4271
        qemu_co_mutex_lock(&s->lock);
4272
        if (ret < 0) {
4273
            goto fail;
4274
        }
4275

4276
        /*
         * NOTE(review): the last argument is true here and false on the
         * failure path below; presumably it selects whether the new
         * clusters get linked into the L2 tables -- confirm against
         * qcow2_handle_l2meta().
         */
        ret = qcow2_handle_l2meta(bs, &l2meta, true);
4277
        if (ret) {
4278
            goto fail;
4279
        }
4280

4281
        bytes -= cur_bytes;
4282
        src_offset += cur_bytes;
4283
        dst_offset += cur_bytes;
4284
    }
4285
    ret = 0;
4286

4287
fail:
4288
    /* Also reached on success; the return value is deliberately ignored */
    qcow2_handle_l2meta(bs, &l2meta, false);
4289

4290
    qemu_co_mutex_unlock(&s->lock);
4291

4292
    trace_qcow2_writev_done_req(qemu_coroutine_self(), ret);
4293

4294
    return ret;
4295
}
4296

4297
/*
 * Resize the guest-visible size of the image to @offset bytes.
 *
 * @offset:   new virtual size; must be a multiple of BDRV_SECTOR_SIZE
 *            (otherwise -EINVAL).
 * @exact:    only forwarded to the external data file when
 *            @prealloc is PREALLOC_MODE_OFF; see the in-line comments.
 * @prealloc: one of OFF, METADATA, FALLOC or FULL; anything else fails
 *            with -ENOTSUP.  Must be OFF when shrinking.
 * @flags:    BDRV_REQ_ZERO_WRITE requests that the added area reads as
 *            zeroes.
 * @errp:     receives the error description on the failure paths.
 *
 * Overall sequence, all under s->lock (dropped only around the
 * qcow2_co_pwritev_part() call that zeroes an unaligned head):
 *  1. refuse v2 images with snapshots and images whose bitmaps forbid a
 *     resize;
 *  2. shrink (discard cropped clusters, shrink L1 and reftable, try to
 *     truncate the file tail) or grow the L1 table;
 *  3. perform the requested preallocation;
 *  4. zero the new area if still required;
 *  5. update bs->total_sectors, the size field of the image header,
 *     s->l1_vm_state_index and the runtime options.
 *
 * Returns 0 on success, a negative errno value on failure.
 */
static int coroutine_fn GRAPH_RDLOCK
4298
qcow2_co_truncate(BlockDriverState *bs, int64_t offset, bool exact,
4299
                  PreallocMode prealloc, BdrvRequestFlags flags, Error **errp)
4300
{
4301
    ERRP_GUARD();
4302
    BDRVQcow2State *s = bs->opaque;
4303
    uint64_t old_length;
4304
    int64_t new_l1_size;
4305
    int ret;
4306
    QDict *options;
4307

4308
    if (prealloc != PREALLOC_MODE_OFF && prealloc != PREALLOC_MODE_METADATA &&
4309
        prealloc != PREALLOC_MODE_FALLOC && prealloc != PREALLOC_MODE_FULL)
4310
    {
4311
        error_setg(errp, "Unsupported preallocation mode '%s'",
4312
                   PreallocMode_str(prealloc));
4313
        return -ENOTSUP;
4314
    }
4315

4316
    if (!QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE)) {
4317
        error_setg(errp, "The new size must be a multiple of %u",
4318
                   (unsigned) BDRV_SECTOR_SIZE);
4319
        return -EINVAL;
4320
    }
4321

4322
    /* From here on every exit goes through the fail label, which unlocks */
    qemu_co_mutex_lock(&s->lock);
4323

4324
    /*
4325
     * Even though we store snapshot size for all images, it was not
4326
     * required until v3, so it is not safe to proceed for v2.
4327
     */
4328
    if (s->nb_snapshots && s->qcow_version < 3) {
4329
        error_setg(errp, "Can't resize a v2 image which has snapshots");
4330
        ret = -ENOTSUP;
4331
        goto fail;
4332
    }
4333

4334
    /* See qcow2-bitmap.c for which bitmap scenarios prevent a resize. */
4335
    if (qcow2_truncate_bitmaps_check(bs, errp)) {
4336
        ret = -ENOTSUP;
4337
        goto fail;
4338
    }
4339

4340
    old_length = bs->total_sectors * BDRV_SECTOR_SIZE;
4341
    new_l1_size = size_to_l1(s, offset);
4342

4343
    if (offset < old_length) {
4344
        /* Shrinking the image */
        int64_t last_cluster, old_file_size;
4345
        if (prealloc != PREALLOC_MODE_OFF) {
4346
            error_setg(errp,
4347
                       "Preallocation can't be used for shrinking an image");
4348
            ret = -EINVAL;
4349
            goto fail;
4350
        }
4351

4352
        /* Discard every whole cluster that lies beyond the new end */
        ret = qcow2_cluster_discard(bs, ROUND_UP(offset, s->cluster_size),
4353
                                    old_length - ROUND_UP(offset,
4354
                                                          s->cluster_size),
4355
                                    QCOW2_DISCARD_ALWAYS, true);
4356
        if (ret < 0) {
4357
            error_setg_errno(errp, -ret, "Failed to discard cropped clusters");
4358
            goto fail;
4359
        }
4360

4361
        ret = qcow2_shrink_l1_table(bs, new_l1_size);
4362
        if (ret < 0) {
4363
            error_setg_errno(errp, -ret,
4364
                             "Failed to reduce the number of L2 tables");
4365
            goto fail;
4366
        }
4367

4368
        ret = qcow2_shrink_reftable(bs);
4369
        if (ret < 0) {
4370
            error_setg_errno(errp, -ret,
4371
                             "Failed to discard unused refblocks");
4372
            goto fail;
4373
        }
4374

4375
        old_file_size = bdrv_co_getlength(bs->file->bs);
4376
        if (old_file_size < 0) {
4377
            error_setg_errno(errp, -old_file_size,
4378
                             "Failed to inquire current file length");
4379
            ret = old_file_size;
4380
            goto fail;
4381
        }
4382
        last_cluster = qcow2_get_last_cluster(bs, old_file_size);
4383
        if (last_cluster < 0) {
4384
            error_setg_errno(errp, -last_cluster,
4385
                             "Failed to find the last cluster");
4386
            ret = last_cluster;
4387
            goto fail;
4388
        }
4389
        if ((last_cluster + 1) * s->cluster_size < old_file_size) {
4390
            Error *local_err = NULL;
4391

4392
            /*
4393
             * Do not pass @exact here: It will not help the user if
4394
             * we get an error here just because they wanted to shrink
4395
             * their qcow2 image (on a block device) with qemu-img.
4396
             * (And on the qcow2 layer, the @exact requirement is
4397
             * always fulfilled, so there is no need to pass it on.)
4398
             */
4399
            /* Best effort: a failure is only reported as a warning */
            bdrv_co_truncate(bs->file, (last_cluster + 1) * s->cluster_size,
4400
                             false, PREALLOC_MODE_OFF, 0, &local_err);
4401
            if (local_err) {
4402
                warn_reportf_err(local_err,
4403
                                 "Failed to truncate the tail of the image: ");
4404
            }
4405
        }
4406
    } else {
4407
        /* Growing the image (or keeping the size unchanged) */
        ret = qcow2_grow_l1_table(bs, new_l1_size, true);
4408
        if (ret < 0) {
4409
            error_setg_errno(errp, -ret, "Failed to grow the L1 table");
4410
            goto fail;
4411
        }
4412

4413
        if (data_file_is_raw(bs) && prealloc == PREALLOC_MODE_OFF) {
4414
            /*
4415
             * When creating a qcow2 image with data-file-raw, we enforce
4416
             * at least prealloc=metadata, so that the L1/L2 tables are
4417
             * fully allocated and reading from the data file will return
4418
             * the same data as reading from the qcow2 image.  When the
4419
             * image is grown, we must consequently preallocate the
4420
             * metadata structures to cover the added area.
4421
             */
4422
            prealloc = PREALLOC_MODE_METADATA;
4423
        }
4424
    }
4425

4426
    switch (prealloc) {
4427
    case PREALLOC_MODE_OFF:
4428
        if (has_data_file(bs)) {
4429
            /*
4430
             * If the caller wants an exact resize, the external data
4431
             * file should be resized to the exact target size, too,
4432
             * so we pass @exact here.
4433
             */
4434
            ret = bdrv_co_truncate(s->data_file, offset, exact, prealloc, 0,
4435
                                   errp);
4436
            if (ret < 0) {
4437
                goto fail;
4438
            }
4439
        }
4440
        break;
4441

4442
    case PREALLOC_MODE_METADATA:
4443
        ret = preallocate_co(bs, old_length, offset, prealloc, errp);
4444
        if (ret < 0) {
4445
            goto fail;
4446
        }
4447
        break;
4448

4449
    case PREALLOC_MODE_FALLOC:
4450
    case PREALLOC_MODE_FULL:
4451
    {
4452
        int64_t allocation_start, host_offset, guest_offset;
4453
        int64_t clusters_allocated;
4454
        int64_t old_file_size, last_cluster, new_file_size;
4455
        uint64_t nb_new_data_clusters, nb_new_l2_tables;
4456
        bool subclusters_need_allocation = false;
4457

4458
        /* With a data file, preallocation means just allocating the metadata
4459
         * and forwarding the truncate request to the data file */
4460
        if (has_data_file(bs)) {
4461
            ret = preallocate_co(bs, old_length, offset, prealloc, errp);
4462
            if (ret < 0) {
4463
                goto fail;
4464
            }
4465
            break;
4466
        }
4467

4468
        old_file_size = bdrv_co_getlength(bs->file->bs);
4469
        if (old_file_size < 0) {
4470
            error_setg_errno(errp, -old_file_size,
4471
                             "Failed to inquire current file length");
4472
            ret = old_file_size;
4473
            goto fail;
4474
        }
4475

4476
        /*
         * Unlike in the shrink path, a failure to find the last cluster is
         * not an error here: the current file length rounded up to a
         * cluster boundary is used instead.
         */
        last_cluster = qcow2_get_last_cluster(bs, old_file_size);
4477
        if (last_cluster >= 0) {
4478
            old_file_size = (last_cluster + 1) * s->cluster_size;
4479
        } else {
4480
            old_file_size = ROUND_UP(old_file_size, s->cluster_size);
4481
        }
4482

4483
        nb_new_data_clusters = (ROUND_UP(offset, s->cluster_size) -
4484
            start_of_cluster(s, old_length)) >> s->cluster_bits;
4485

4486
        /* This is an overestimation; we will not actually allocate space for
4487
         * these in the file but just make sure the new refcount structures are
4488
         * able to cover them so we will not have to allocate new refblocks
4489
         * while entering the data blocks in the potentially new L2 tables.
4490
         * (We do not actually care where the L2 tables are placed. Maybe they
4491
         *  are already allocated or they can be placed somewhere before
4492
         *  @old_file_size. It does not matter because they will be fully
4493
         *  allocated automatically, so they do not need to be covered by the
4494
         *  preallocation. All that matters is that we will not have to allocate
4495
         *  new refcount structures for them.) */
4496
        nb_new_l2_tables = DIV_ROUND_UP(nb_new_data_clusters,
4497
                                        s->cluster_size / l2_entry_size(s));
4498
        /* The cluster range may not be aligned to L2 boundaries, so add one L2
4499
         * table for a potential head/tail */
4500
        nb_new_l2_tables++;
4501

4502
        allocation_start = qcow2_refcount_area(bs, old_file_size,
4503
                                               nb_new_data_clusters +
4504
                                               nb_new_l2_tables,
4505
                                               true, 0, 0);
4506
        if (allocation_start < 0) {
4507
            error_setg_errno(errp, -allocation_start,
4508
                             "Failed to resize refcount structures");
4509
            ret = allocation_start;
4510
            goto fail;
4511
        }
4512

4513
        clusters_allocated = qcow2_alloc_clusters_at(bs, allocation_start,
4514
                                                     nb_new_data_clusters);
4515
        if (clusters_allocated < 0) {
4516
            error_setg_errno(errp, -clusters_allocated,
4517
                             "Failed to allocate data clusters");
4518
            ret = clusters_allocated;
4519
            goto fail;
4520
        }
4521

4522
        assert(clusters_allocated == nb_new_data_clusters);
4523

4524
        /* Allocate the data area */
4525
        new_file_size = allocation_start +
4526
                        nb_new_data_clusters * s->cluster_size;
4527
        /*
4528
         * Image file grows, so @exact does not matter.
4529
         *
4530
         * If we need to zero out the new area, try first whether the protocol
4531
         * driver can already take care of this.
4532
         */
4533
        if (flags & BDRV_REQ_ZERO_WRITE) {
4534
            /* errp is NULL: a failure just falls back to the plain truncate */
            ret = bdrv_co_truncate(bs->file, new_file_size, false, prealloc,
4535
                                   BDRV_REQ_ZERO_WRITE, NULL);
4536
            if (ret >= 0) {
4537
                flags &= ~BDRV_REQ_ZERO_WRITE;
4538
                /* Ensure that we read zeroes and not backing file data */
4539
                subclusters_need_allocation = true;
4540
            }
4541
        } else {
4542
            /* Nothing attempted yet: force the plain truncate below */
            ret = -1;
4543
        }
4544
        if (ret < 0) {
4545
            ret = bdrv_co_truncate(bs->file, new_file_size, false, prealloc, 0,
4546
                                   errp);
4547
        }
4548
        if (ret < 0) {
4549
            error_prepend(errp, "Failed to resize underlying file: ");
4550
            /* Give back the data clusters allocated above */
            qcow2_free_clusters(bs, allocation_start,
4551
                                nb_new_data_clusters * s->cluster_size,
4552
                                QCOW2_DISCARD_OTHER);
4553
            goto fail;
4554
        }
4555

4556
        /* Create the necessary L2 entries */
4557
        host_offset = allocation_start;
4558
        guest_offset = old_length;
4559
        /* Each iteration links at most the remainder of one L2 slice */
        while (nb_new_data_clusters) {
4560
            int64_t nb_clusters = MIN(
4561
                nb_new_data_clusters,
4562
                s->l2_slice_size - offset_to_l2_slice_index(s, guest_offset));
4563
            unsigned cow_start_length = offset_into_cluster(s, guest_offset);
4564
            QCowL2Meta allocation;
4565
            guest_offset = start_of_cluster(s, guest_offset);
4566
            allocation = (QCowL2Meta) {
4567
                .offset       = guest_offset,
4568
                .alloc_offset = host_offset,
4569
                .nb_clusters  = nb_clusters,
4570
                .cow_start    = {
4571
                    .offset       = 0,
4572
                    .nb_bytes     = cow_start_length,
4573
                },
4574
                .cow_end      = {
4575
                    .offset       = nb_clusters << s->cluster_bits,
4576
                    .nb_bytes     = 0,
4577
                },
4578
                .prealloc     = !subclusters_need_allocation,
4579
            };
4580
            qemu_co_queue_init(&allocation.dependent_requests);
4581

4582
            ret = qcow2_alloc_cluster_link_l2(bs, &allocation);
4583
            if (ret < 0) {
4584
                error_setg_errno(errp, -ret, "Failed to update L2 tables");
4585
                /* Free the clusters that have not been linked yet */
                qcow2_free_clusters(bs, host_offset,
4586
                                    nb_new_data_clusters * s->cluster_size,
4587
                                    QCOW2_DISCARD_OTHER);
4588
                goto fail;
4589
            }
4590

4591
            guest_offset += nb_clusters * s->cluster_size;
4592
            host_offset += nb_clusters * s->cluster_size;
4593
            nb_new_data_clusters -= nb_clusters;
4594
        }
4595
        break;
4596
    }
4597

4598
    default:
4599
        g_assert_not_reached();
4600
    }
4601

4602
    /*
     * BDRV_REQ_ZERO_WRITE is still set here unless the protocol driver
     * already zeroed the new area in the FALLOC/FULL case above.
     */
    if ((flags & BDRV_REQ_ZERO_WRITE) && offset > old_length) {
4603
        uint64_t zero_start = QEMU_ALIGN_UP(old_length, s->subcluster_size);
4604

4605
        /*
4606
         * Use zero clusters as much as we can. qcow2_subcluster_zeroize()
4607
         * requires a subcluster-aligned start. The end may be unaligned if
4608
         * it is at the end of the image (which it is here).
4609
         */
4610
        if (offset > zero_start) {
4611
            ret = qcow2_subcluster_zeroize(bs, zero_start, offset - zero_start,
4612
                                           0);
4613
            if (ret < 0) {
4614
                error_setg_errno(errp, -ret, "Failed to zero out new clusters");
4615
                goto fail;
4616
            }
4617
        }
4618

4619
        /* Write explicit zeros for the unaligned head */
4620
        if (zero_start > old_length) {
4621
            uint64_t len = MIN(zero_start, offset) - old_length;
4622
            uint8_t *buf = qemu_blockalign0(bs, len);
4623
            QEMUIOVector qiov;
4624
            qemu_iovec_init_buf(&qiov, buf, len);
4625

4626
            /* qcow2_co_pwritev_part() is called with s->lock released */
            qemu_co_mutex_unlock(&s->lock);
4627
            ret = qcow2_co_pwritev_part(bs, old_length, len, &qiov, 0, 0);
4628
            qemu_co_mutex_lock(&s->lock);
4629

4630
            qemu_vfree(buf);
4631
            if (ret < 0) {
4632
                error_setg_errno(errp, -ret, "Failed to zero out the new area");
4633
                goto fail;
4634
            }
4635
        }
4636
    }
4637

4638
    if (prealloc != PREALLOC_MODE_OFF) {
4639
        /* Flush metadata before actually changing the image size */
4640
        ret = qcow2_write_caches(bs);
4641
        if (ret < 0) {
4642
            error_setg_errno(errp, -ret,
4643
                             "Failed to flush the preallocated area to disk");
4644
            goto fail;
4645
        }
4646
    }
4647

4648
    bs->total_sectors = offset / BDRV_SECTOR_SIZE;
4649

4650
    /* write updated header.size */
4651
    /* From here on @offset holds the big-endian on-disk representation */
    offset = cpu_to_be64(offset);
4652
    ret = bdrv_co_pwrite_sync(bs->file, offsetof(QCowHeader, size),
4653
                              sizeof(offset), &offset, 0);
4654
    if (ret < 0) {
4655
        error_setg_errno(errp, -ret, "Failed to update the image size");
4656
        goto fail;
4657
    }
4658

4659
    s->l1_vm_state_index = new_l1_size;
4660

4661
    /* Update cache sizes */
4662
    options = qdict_clone_shallow(bs->options);
4663
    ret = qcow2_update_options(bs, options, s->flags, errp);
4664
    qobject_unref(options);
4665
    if (ret < 0) {
4666
        goto fail;
4667
    }
4668
    ret = 0;
4669
fail:
4670
    /* Common exit for success and failure: only the lock is released */
    qemu_co_mutex_unlock(&s->lock);
4671
    return ret;
4672
}
4673

4674
static int coroutine_fn GRAPH_RDLOCK
4675
qcow2_co_pwritev_compressed_task(BlockDriverState *bs,
4676
                                 uint64_t offset, uint64_t bytes,
4677
                                 QEMUIOVector *qiov, size_t qiov_offset)
4678
{
4679
    BDRVQcow2State *s = bs->opaque;
4680
    int ret;
4681
    ssize_t out_len;
4682
    uint8_t *buf, *out_buf;
4683
    uint64_t cluster_offset;
4684

4685
    assert(bytes == s->cluster_size || (bytes < s->cluster_size &&
4686
           (offset + bytes == bs->total_sectors << BDRV_SECTOR_BITS)));
4687

4688
    buf = qemu_blockalign(bs, s->cluster_size);
4689
    if (bytes < s->cluster_size) {
4690
        /* Zero-pad last write if image size is not cluster aligned */
4691
        memset(buf + bytes, 0, s->cluster_size - bytes);
4692
    }
4693
    qemu_iovec_to_buf(qiov, qiov_offset, buf, bytes);
4694

4695
    out_buf = g_malloc(s->cluster_size);
4696

4697
    out_len = qcow2_co_compress(bs, out_buf, s->cluster_size - 1,
4698
                                buf, s->cluster_size);
4699
    if (out_len == -ENOMEM) {
4700
        /* could not compress: write normal cluster */
4701
        ret = qcow2_co_pwritev_part(bs, offset, bytes, qiov, qiov_offset, 0);
4702
        if (ret < 0) {
4703
            goto fail;
4704
        }
4705
        goto success;
4706
    } else if (out_len < 0) {
4707
        ret = -EINVAL;
4708
        goto fail;
4709
    }
4710

4711
    qemu_co_mutex_lock(&s->lock);
4712
    ret = qcow2_alloc_compressed_cluster_offset(bs, offset, out_len,
4713
                                                &cluster_offset);
4714
    if (ret < 0) {
4715
        qemu_co_mutex_unlock(&s->lock);
4716
        goto fail;
4717
    }
4718

4719
    ret = qcow2_pre_write_overlap_check(bs, 0, cluster_offset, out_len, true);
4720
    qemu_co_mutex_unlock(&s->lock);
4721
    if (ret < 0) {
4722
        goto fail;
4723
    }
4724

4725
    BLKDBG_CO_EVENT(s->data_file, BLKDBG_WRITE_COMPRESSED);
4726
    ret = bdrv_co_pwrite(s->data_file, cluster_offset, out_len, out_buf, 0);
4727
    if (ret < 0) {
4728
        goto fail;
4729
    }
4730
success:
4731
    ret = 0;
4732
fail:
4733
    qemu_vfree(buf);
4734
    g_free(out_buf);
4735
    return ret;
4736
}
4737

4738
/*
4739
 * This function can count as GRAPH_RDLOCK because
4740
 * qcow2_co_pwritev_compressed_part() holds the graph lock and keeps it until
4741
 * this coroutine has terminated.
4742
 */
4743
static int coroutine_fn GRAPH_RDLOCK
4744
qcow2_co_pwritev_compressed_task_entry(AioTask *task)
4745
{
4746
    Qcow2AioTask *t = container_of(task, Qcow2AioTask, task);
4747

4748
    assert(!t->subcluster_type && !t->l2meta);
4749

4750
    return qcow2_co_pwritev_compressed_task(t->bs, t->offset, t->bytes, t->qiov,
4751
                                            t->qiov_offset);
4752
}
4753

4754
/*
4755
 * XXX: put compressed sectors first, then all the cluster aligned
4756
 * tables to avoid losing bytes in alignment
4757
 */
4758
static int coroutine_fn GRAPH_RDLOCK
4759
qcow2_co_pwritev_compressed_part(BlockDriverState *bs,
4760
                                 int64_t offset, int64_t bytes,
4761
                                 QEMUIOVector *qiov, size_t qiov_offset)
4762
{
4763
    BDRVQcow2State *s = bs->opaque;
4764
    AioTaskPool *aio = NULL;
4765
    int ret = 0;
4766

4767
    if (has_data_file(bs)) {
4768
        return -ENOTSUP;
4769
    }
4770

4771
    if (bytes == 0) {
4772
        /*
4773
         * align end of file to a sector boundary to ease reading with
4774
         * sector based I/Os
4775
         */
4776
        int64_t len = bdrv_co_getlength(bs->file->bs);
4777
        if (len < 0) {
4778
            return len;
4779
        }
4780
        return bdrv_co_truncate(bs->file, len, false, PREALLOC_MODE_OFF, 0,
4781
                                NULL);
4782
    }
4783

4784
    if (offset_into_cluster(s, offset)) {
4785
        return -EINVAL;
4786
    }
4787

4788
    if (offset_into_cluster(s, bytes) &&
4789
        (offset + bytes) != (bs->total_sectors << BDRV_SECTOR_BITS)) {
4790
        return -EINVAL;
4791
    }
4792

4793
    while (bytes && aio_task_pool_status(aio) == 0) {
4794
        uint64_t chunk_size = MIN(bytes, s->cluster_size);
4795

4796
        if (!aio && chunk_size != bytes) {
4797
            aio = aio_task_pool_new(QCOW2_MAX_WORKERS);
4798
        }
4799

4800
        ret = qcow2_add_task(bs, aio, qcow2_co_pwritev_compressed_task_entry,
4801
                             0, 0, offset, chunk_size, qiov, qiov_offset, NULL);
4802
        if (ret < 0) {
4803
            break;
4804
        }
4805
        qiov_offset += chunk_size;
4806
        offset += chunk_size;
4807
        bytes -= chunk_size;
4808
    }
4809

4810
    if (aio) {
4811
        aio_task_pool_wait_all(aio);
4812
        if (ret == 0) {
4813
            ret = aio_task_pool_status(aio);
4814
        }
4815
        g_free(aio);
4816
    }
4817

4818
    return ret;
4819
}
4820

4821
static int coroutine_fn GRAPH_RDLOCK
4822
qcow2_co_preadv_compressed(BlockDriverState *bs,
4823
                           uint64_t l2_entry,
4824
                           uint64_t offset,
4825
                           uint64_t bytes,
4826
                           QEMUIOVector *qiov,
4827
                           size_t qiov_offset)
4828
{
4829
    BDRVQcow2State *s = bs->opaque;
4830
    int ret = 0, csize;
4831
    uint64_t coffset;
4832
    uint8_t *buf, *out_buf;
4833
    int offset_in_cluster = offset_into_cluster(s, offset);
4834

4835
    qcow2_parse_compressed_l2_entry(bs, l2_entry, &coffset, &csize);
4836

4837
    buf = g_try_malloc(csize);
4838
    if (!buf) {
4839
        return -ENOMEM;
4840
    }
4841

4842
    out_buf = qemu_blockalign(bs, s->cluster_size);
4843

4844
    BLKDBG_CO_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
4845
    ret = bdrv_co_pread(bs->file, coffset, csize, buf, 0);
4846
    if (ret < 0) {
4847
        goto fail;
4848
    }
4849

4850
    if (qcow2_co_decompress(bs, out_buf, s->cluster_size, buf, csize) < 0) {
4851
        ret = -EIO;
4852
        goto fail;
4853
    }
4854

4855
    qemu_iovec_from_buf(qiov, qiov_offset, out_buf + offset_in_cluster, bytes);
4856

4857
fail:
4858
    qemu_vfree(out_buf);
4859
    g_free(buf);
4860

4861
    return ret;
4862
}
4863

4864
/*
 * Reset the image to an empty state by rewriting its metadata in place
 * rather than discarding clusters one by one.
 *
 * The steps, in order: empty the L2 and refcount block caches, mark the
 * image dirty, zero the current L1 table, zero the clusters that will
 * hold the new metadata, point the header at a new L1 table (cluster 3)
 * and a one-cluster refcount table (cluster 1), register the first
 * refblock (cluster 2), allocate refcounts for the header and the new
 * metadata, mark the image clean, and truncate the image file to
 * (3 + l1_clusters) clusters.
 *
 * Returns 0 on success or a negative errno value.  Failures that leave the
 * in-memory and on-disk refcount information inconsistent go through
 * fail_broken_refcounts, which sets bs->drv to NULL.
 */
static int GRAPH_RDLOCK make_completely_empty(BlockDriverState *bs)
4865
{
4866
    BDRVQcow2State *s = bs->opaque;
4867
    Error *local_err = NULL;
4868
    int ret, l1_clusters;
4869
    int64_t offset;
4870
    uint64_t *new_reftable = NULL;
4871
    uint64_t rt_entry, l1_size2;
4872
    /*
     * Packed so that a single write starting at
     * offsetof(QCowHeader, l1_table_offset) updates all three fields;
     * NOTE(review): this relies on the three fields being adjacent in
     * QCowHeader in this order -- confirm against the header definition.
     */
    struct {
4873
        uint64_t l1_offset;
4874
        uint64_t reftable_offset;
4875
        uint32_t reftable_clusters;
4876
    } QEMU_PACKED l1_ofs_rt_ofs_cls;
4877

4878
    ret = qcow2_cache_empty(bs, s->l2_table_cache);
4879
    if (ret < 0) {
4880
        goto fail;
4881
    }
4882

4883
    ret = qcow2_cache_empty(bs, s->refcount_block_cache);
4884
    if (ret < 0) {
4885
        goto fail;
4886
    }
4887

4888
    /* Refcounts will be broken utterly */
4889
    ret = qcow2_mark_dirty(bs);
4890
    if (ret < 0) {
4891
        goto fail;
4892
    }
4893

4894
    BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
4895

4896
    /* Size of the L1 table in clusters and in bytes */
    l1_clusters = DIV_ROUND_UP(s->l1_size, s->cluster_size / L1E_SIZE);
4897
    l1_size2 = (uint64_t)s->l1_size * L1E_SIZE;
4898

4899
    /* After this call, neither the in-memory nor the on-disk refcount
4900
     * information accurately describe the actual references */
4901

4902
    ret = bdrv_pwrite_zeroes(bs->file, s->l1_table_offset,
4903
                             l1_clusters * s->cluster_size, 0);
4904
    if (ret < 0) {
4905
        goto fail_broken_refcounts;
4906
    }
4907
    memset(s->l1_table, 0, l1_size2);
4908

4909
    BLKDBG_EVENT(bs->file, BLKDBG_EMPTY_IMAGE_PREPARE);
4910

4911
    /* Overwrite enough clusters at the beginning of the sectors to place
4912
     * the refcount table, a refcount block and the L1 table in; this may
4913
     * overwrite parts of the existing refcount and L1 table, which is not
4914
     * an issue because the dirty flag is set, complete data loss is in fact
4915
     * desired and partial data loss is consequently fine as well */
4916
    ret = bdrv_pwrite_zeroes(bs->file, s->cluster_size,
4917
                             (2 + l1_clusters) * s->cluster_size, 0);
4918
    /* This call (even if it failed overall) may have overwritten on-disk
4919
     * refcount structures; in that case, the in-memory refcount information
4920
     * will probably differ from the on-disk information which makes the BDS
4921
     * unusable */
4922
    if (ret < 0) {
4923
        goto fail_broken_refcounts;
4924
    }
4925

4926
    BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
4927
    BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_UPDATE);
4928

4929
    /* "Create" an empty reftable (one cluster) directly after the image
4930
     * header and an empty L1 table three clusters after the image header;
4931
     * the cluster between those two will be used as the first refblock */
4932
    l1_ofs_rt_ofs_cls.l1_offset = cpu_to_be64(3 * s->cluster_size);
4933
    l1_ofs_rt_ofs_cls.reftable_offset = cpu_to_be64(s->cluster_size);
4934
    l1_ofs_rt_ofs_cls.reftable_clusters = cpu_to_be32(1);
4935
    ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_table_offset),
4936
                           sizeof(l1_ofs_rt_ofs_cls), &l1_ofs_rt_ofs_cls, 0);
4937
    if (ret < 0) {
4938
        goto fail_broken_refcounts;
4939
    }
4940

4941
    s->l1_table_offset = 3 * s->cluster_size;
4942

4943
    new_reftable = g_try_new0(uint64_t, s->cluster_size / REFTABLE_ENTRY_SIZE);
4944
    if (!new_reftable) {
4945
        ret = -ENOMEM;
4946
        goto fail_broken_refcounts;
4947
    }
4948

4949
    s->refcount_table_offset = s->cluster_size;
4950
    s->refcount_table_size   = s->cluster_size / REFTABLE_ENTRY_SIZE;
4951
    s->max_refcount_table_index = 0;
4952

4953
    /* Ownership of new_reftable moves to s; clear it so fail: frees nothing */
    g_free(s->refcount_table);
4954
    s->refcount_table = new_reftable;
4955
    new_reftable = NULL;
4956

4957
    /* Now the in-memory refcount information again corresponds to the on-disk
4958
     * information (reftable is empty and no refblocks (the refblock cache is
4959
     * empty)); however, this means some clusters (e.g. the image header) are
4960
     * referenced, but not refcounted, but the normal qcow2 code assumes that
4961
     * the in-memory information is always correct */
4962

4963
    BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC);
4964

4965
    /* Enter the first refblock into the reftable */
4966
    rt_entry = cpu_to_be64(2 * s->cluster_size);
4967
    ret = bdrv_pwrite_sync(bs->file, s->cluster_size, sizeof(rt_entry),
4968
                           &rt_entry, 0);
4969
    if (ret < 0) {
4970
        goto fail_broken_refcounts;
4971
    }
4972
    s->refcount_table[0] = 2 * s->cluster_size;
4973

4974
    /*
     * Allocate the header, reftable, refblock and L1 table clusters in one
     * go; with free_cluster_index reset to 0 the allocation is expected to
     * start at offset 0, anything else aborts below.
     */
    s->free_cluster_index = 0;
4975
    assert(3 + l1_clusters <= s->refcount_block_size);
4976
    offset = qcow2_alloc_clusters(bs, 3 * s->cluster_size + l1_size2);
4977
    if (offset < 0) {
4978
        ret = offset;
4979
        goto fail_broken_refcounts;
4980
    } else if (offset > 0) {
4981
        error_report("First cluster in emptied image is in use");
4982
        abort();
4983
    }
4984

4985
    /* Now finally the in-memory information corresponds to the on-disk
4986
     * structures and is correct */
4987
    ret = qcow2_mark_clean(bs);
4988
    if (ret < 0) {
4989
        goto fail;
4990
    }
4991

4992
    /* Cut the image file down to the header plus the new metadata */
    ret = bdrv_truncate(bs->file, (3 + l1_clusters) * s->cluster_size, false,
4993
                        PREALLOC_MODE_OFF, 0, &local_err);
4994
    if (ret < 0) {
4995
        error_report_err(local_err);
4996
        goto fail;
4997
    }
4998

4999
    return 0;
5000

5001
fail_broken_refcounts:
5002
    /* The BDS is unusable at this point. If we wanted to make it usable, we
5003
     * would have to call qcow2_refcount_close(), qcow2_refcount_init(),
5004
     * qcow2_check_refcounts(), qcow2_refcount_close() and qcow2_refcount_init()
5005
     * again. However, because the functions which could have caused this error
5006
     * path to be taken are used by those functions as well, it's very likely
5007
     * that that sequence will fail as well. Therefore, just eject the BDS. */
5008
    bs->drv = NULL;
5009

5010
fail:
5011
    g_free(new_reftable);
5012
    return ret;
5013
}
5014

5015
/*
 * Drop all guest data from the image.
 *
 * Uses make_completely_empty() when the image qualifies for it (see the
 * comment on the condition below); otherwise every cluster of the virtual
 * disk is discarded in chunks of at most @chunk_max bytes.
 *
 * Returns 0 on success or the first negative error encountered.
 */
static int GRAPH_RDLOCK qcow2_make_empty(BlockDriverState *bs)
{
    BDRVQcow2State *s = bs->opaque;
    int chunk_max = QEMU_ALIGN_DOWN(INT_MAX, s->cluster_size);
    int l1_clusters = DIV_ROUND_UP(s->l1_size, s->cluster_size / L1E_SIZE);
    uint64_t pos, image_end;
    bool fast_path;
    int ret = 0;

    /* The following function only works for qcow2 v3 images (it
     * requires the dirty flag) and only as long as there are no
     * features that reserve extra clusters (such as snapshots,
     * LUKS header, or persistent bitmaps), because it completely
     * empties the image.  Furthermore, the L1 table and three
     * additional clusters (image header, refcount table, one
     * refcount block) have to fit inside one refcount block. It
     * only resets the image file, i.e. does not work with an
     * external data file. */
    fast_path = s->qcow_version >= 3 && !s->snapshots && !s->nb_bitmaps &&
                3 + l1_clusters <= s->refcount_block_size &&
                s->crypt_method_header != QCOW_CRYPT_LUKS &&
                !has_data_file(bs);
    if (fast_path) {
        return make_completely_empty(bs);
    }

    /* This fallback code simply discards every active cluster; this is slow,
     * but works in all cases */
    image_end = bs->total_sectors * BDRV_SECTOR_SIZE;
    pos = 0;
    while (pos < image_end) {
        /* As this function is generally used after committing an external
         * snapshot, QCOW2_DISCARD_SNAPSHOT seems appropriate. Also, the
         * default action for this kind of discard is to pass the discard,
         * which will ideally result in an actually smaller image file, as
         * is probably desired. */
        ret = qcow2_cluster_discard(bs, pos, MIN(chunk_max, image_end - pos),
                                    QCOW2_DISCARD_SNAPSHOT, true);
        if (ret < 0) {
            return ret;
        }
        pos += chunk_max;
    }

    return ret;
}
5058

5059
/*
 * Call qcow2_write_caches() for @bs while holding the driver state lock
 * and hand its return value back to the caller.
 */
static coroutine_fn GRAPH_RDLOCK int qcow2_co_flush_to_os(BlockDriverState *bs)
{
    BDRVQcow2State *state = bs->opaque;
    int rc;

    qemu_co_mutex_lock(&state->lock);
    rc = qcow2_write_caches(bs);
    qemu_co_mutex_unlock(&state->lock);

    return rc;
}
5070

5071
static BlockMeasureInfo *qcow2_measure(QemuOpts *opts, BlockDriverState *in_bs,
5072
                                       Error **errp)
5073
{
5074
    Error *local_err = NULL;
5075
    BlockMeasureInfo *info;
5076
    uint64_t required = 0; /* bytes that contribute to required size */
5077
    uint64_t virtual_size; /* disk size as seen by guest */
5078
    uint64_t refcount_bits;
5079
    uint64_t l2_tables;
5080
    uint64_t luks_payload_size = 0;
5081
    size_t cluster_size;
5082
    int version;
5083
    char *optstr;
5084
    PreallocMode prealloc;
5085
    bool has_backing_file;
5086
    bool has_luks;
5087
    bool extended_l2;
5088
    size_t l2e_size;
5089

5090
    /* Parse image creation options */
5091
    extended_l2 = qemu_opt_get_bool_del(opts, BLOCK_OPT_EXTL2, false);
5092

5093
    cluster_size = qcow2_opt_get_cluster_size_del(opts, extended_l2,
5094
                                                  &local_err);
5095
    if (local_err) {
5096
        goto err;
5097
    }
5098

5099
    version = qcow2_opt_get_version_del(opts, &local_err);
5100
    if (local_err) {
5101
        goto err;
5102
    }
5103

5104
    refcount_bits = qcow2_opt_get_refcount_bits_del(opts, version, &local_err);
5105
    if (local_err) {
5106
        goto err;
5107
    }
5108

5109
    optstr = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC);
5110
    prealloc = qapi_enum_parse(&PreallocMode_lookup, optstr,
5111
                               PREALLOC_MODE_OFF, &local_err);
5112
    g_free(optstr);
5113
    if (local_err) {
5114
        goto err;
5115
    }
5116

5117
    optstr = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE);
5118
    has_backing_file = !!optstr;
5119
    g_free(optstr);
5120

5121
    optstr = qemu_opt_get_del(opts, BLOCK_OPT_ENCRYPT_FORMAT);
5122
    has_luks = optstr && strcmp(optstr, "luks") == 0;
5123
    g_free(optstr);
5124

5125
    if (has_luks) {
5126
        g_autoptr(QCryptoBlockCreateOptions) create_opts = NULL;
5127
        QDict *cryptoopts = qcow2_extract_crypto_opts(opts, "luks", errp);
5128
        size_t headerlen;
5129

5130
        create_opts = block_crypto_create_opts_init(cryptoopts, errp);
5131
        qobject_unref(cryptoopts);
5132
        if (!create_opts) {
5133
            goto err;
5134
        }
5135

5136
        if (!qcrypto_block_calculate_payload_offset(create_opts,
5137
                                                    "encrypt.",
5138
                                                    &headerlen,
5139
                                                    &local_err)) {
5140
            goto err;
5141
        }
5142

5143
        luks_payload_size = ROUND_UP(headerlen, cluster_size);
5144
    }
5145

5146
    virtual_size = qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0);
5147
    virtual_size = ROUND_UP(virtual_size, cluster_size);
5148

5149
    /* Check that virtual disk size is valid */
5150
    l2e_size = extended_l2 ? L2E_SIZE_EXTENDED : L2E_SIZE_NORMAL;
5151
    l2_tables = DIV_ROUND_UP(virtual_size / cluster_size,
5152
                             cluster_size / l2e_size);
5153
    if (l2_tables * L1E_SIZE > QCOW_MAX_L1_SIZE) {
5154
        error_setg(&local_err, "The image size is too large "
5155
                               "(try using a larger cluster size)");
5156
        goto err;
5157
    }
5158

5159
    /* Account for input image */
5160
    if (in_bs) {
5161
        int64_t ssize = bdrv_getlength(in_bs);
5162
        if (ssize < 0) {
5163
            error_setg_errno(&local_err, -ssize,
5164
                             "Unable to get image virtual_size");
5165
            goto err;
5166
        }
5167

5168
        virtual_size = ROUND_UP(ssize, cluster_size);
5169

5170
        if (has_backing_file) {
5171
            /* We don't how much of the backing chain is shared by the input
5172
             * image and the new image file.  In the worst case the new image's
5173
             * backing file has nothing in common with the input image.  Be
5174
             * conservative and assume all clusters need to be written.
5175
             */
5176
            required = virtual_size;
5177
        } else {
5178
            int64_t offset;
5179
            int64_t pnum = 0;
5180

5181
            for (offset = 0; offset < ssize; offset += pnum) {
5182
                int ret;
5183

5184
                ret = bdrv_block_status_above(in_bs, NULL, offset,
5185
                                              ssize - offset, &pnum, NULL,
5186
                                              NULL);
5187
                if (ret < 0) {
5188
                    error_setg_errno(&local_err, -ret,
5189
                                     "Unable to get block status");
5190
                    goto err;
5191
                }
5192

5193
                if (ret & BDRV_BLOCK_ZERO) {
5194
                    /* Skip zero regions (safe with no backing file) */
5195
                } else if ((ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED)) ==
5196
                           (BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED)) {
5197
                    /* Extend pnum to end of cluster for next iteration */
5198
                    pnum = ROUND_UP(offset + pnum, cluster_size) - offset;
5199

5200
                    /* Count clusters we've seen */
5201
                    required += offset % cluster_size + pnum;
5202
                }
5203
            }
5204
        }
5205
    }
5206

5207
    /* Take into account preallocation.  Nothing special is needed for
5208
     * PREALLOC_MODE_METADATA since metadata is always counted.
5209
     */
5210
    if (prealloc == PREALLOC_MODE_FULL || prealloc == PREALLOC_MODE_FALLOC) {
5211
        required = virtual_size;
5212
    }
5213

5214
    info = g_new0(BlockMeasureInfo, 1);
5215
    info->fully_allocated = luks_payload_size +
5216
        qcow2_calc_prealloc_size(virtual_size, cluster_size,
5217
                                 ctz32(refcount_bits), extended_l2);
5218

5219
    /*
5220
     * Remove data clusters that are not required.  This overestimates the
5221
     * required size because metadata needed for the fully allocated file is
5222
     * still counted.  Show bitmaps only if both source and destination
5223
     * would support them.
5224
     */
5225
    info->required = info->fully_allocated - virtual_size + required;
5226
    info->has_bitmaps = version >= 3 && in_bs &&
5227
        bdrv_supports_persistent_dirty_bitmap(in_bs);
5228
    if (info->has_bitmaps) {
5229
        info->bitmaps = qcow2_get_persistent_dirty_bitmap_size(in_bs,
5230
                                                               cluster_size);
5231
    }
5232
    return info;
5233

5234
err:
5235
    error_propagate(errp, local_err);
5236
    return NULL;
5237
}
5238

5239
static int coroutine_fn
5240
qcow2_co_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
5241
{
5242
    BDRVQcow2State *s = bs->opaque;
5243
    bdi->cluster_size = s->cluster_size;
5244
    bdi->subcluster_size = s->subcluster_size;
5245
    bdi->vm_state_offset = qcow2_vm_state_offset(s);
5246
    bdi->is_dirty = s->incompatible_features & QCOW2_INCOMPAT_DIRTY;
5247
    return 0;
5248
}
5249

5250
static ImageInfoSpecific * GRAPH_RDLOCK
5251
qcow2_get_specific_info(BlockDriverState *bs, Error **errp)
5252
{
5253
    BDRVQcow2State *s = bs->opaque;
5254
    ImageInfoSpecific *spec_info;
5255
    QCryptoBlockInfo *encrypt_info = NULL;
5256

5257
    if (s->crypto != NULL) {
5258
        encrypt_info = qcrypto_block_get_info(s->crypto, errp);
5259
        if (!encrypt_info) {
5260
            return NULL;
5261
        }
5262
    }
5263

5264
    spec_info = g_new(ImageInfoSpecific, 1);
5265
    *spec_info = (ImageInfoSpecific){
5266
        .type  = IMAGE_INFO_SPECIFIC_KIND_QCOW2,
5267
        .u.qcow2.data = g_new0(ImageInfoSpecificQCow2, 1),
5268
    };
5269
    if (s->qcow_version == 2) {
5270
        *spec_info->u.qcow2.data = (ImageInfoSpecificQCow2){
5271
            .compat             = g_strdup("0.10"),
5272
            .refcount_bits      = s->refcount_bits,
5273
        };
5274
    } else if (s->qcow_version == 3) {
5275
        Qcow2BitmapInfoList *bitmaps;
5276
        if (!qcow2_get_bitmap_info_list(bs, &bitmaps, errp)) {
5277
            qapi_free_ImageInfoSpecific(spec_info);
5278
            qapi_free_QCryptoBlockInfo(encrypt_info);
5279
            return NULL;
5280
        }
5281
        *spec_info->u.qcow2.data = (ImageInfoSpecificQCow2){
5282
            .compat             = g_strdup("1.1"),
5283
            .lazy_refcounts     = s->compatible_features &
5284
                                  QCOW2_COMPAT_LAZY_REFCOUNTS,
5285
            .has_lazy_refcounts = true,
5286
            .corrupt            = s->incompatible_features &
5287
                                  QCOW2_INCOMPAT_CORRUPT,
5288
            .has_corrupt        = true,
5289
            .has_extended_l2    = true,
5290
            .extended_l2        = has_subclusters(s),
5291
            .refcount_bits      = s->refcount_bits,
5292
            .has_bitmaps        = !!bitmaps,
5293
            .bitmaps            = bitmaps,
5294
            .data_file          = g_strdup(s->image_data_file),
5295
            .has_data_file_raw  = has_data_file(bs),
5296
            .data_file_raw      = data_file_is_raw(bs),
5297
            .compression_type   = s->compression_type,
5298
        };
5299
    } else {
5300
        /* if this assertion fails, this probably means a new version was
5301
         * added without having it covered here */
5302
        assert(false);
5303
    }
5304

5305
    if (encrypt_info) {
5306
        ImageInfoSpecificQCow2Encryption *qencrypt =
5307
            g_new(ImageInfoSpecificQCow2Encryption, 1);
5308
        switch (encrypt_info->format) {
5309
        case Q_CRYPTO_BLOCK_FORMAT_QCOW:
5310
            qencrypt->format = BLOCKDEV_QCOW2_ENCRYPTION_FORMAT_AES;
5311
            break;
5312
        case Q_CRYPTO_BLOCK_FORMAT_LUKS:
5313
            qencrypt->format = BLOCKDEV_QCOW2_ENCRYPTION_FORMAT_LUKS;
5314
            qencrypt->u.luks = encrypt_info->u.luks;
5315
            break;
5316
        default:
5317
            abort();
5318
        }
5319
        /* Since we did shallow copy above, erase any pointers
5320
         * in the original info */
5321
        memset(&encrypt_info->u, 0, sizeof(encrypt_info->u));
5322
        qapi_free_QCryptoBlockInfo(encrypt_info);
5323

5324
        spec_info->u.qcow2.data->encrypt = qencrypt;
5325
    }
5326

5327
    return spec_info;
5328
}
5329

5330
static int coroutine_mixed_fn GRAPH_RDLOCK
5331
qcow2_has_zero_init(BlockDriverState *bs)
5332
{
5333
    BDRVQcow2State *s = bs->opaque;
5334
    bool preallocated;
5335

5336
    if (qemu_in_coroutine()) {
5337
        qemu_co_mutex_lock(&s->lock);
5338
    }
5339
    /*
5340
     * Check preallocation status: Preallocated images have all L2
5341
     * tables allocated, nonpreallocated images have none.  It is
5342
     * therefore enough to check the first one.
5343
     */
5344
    preallocated = s->l1_size > 0 && s->l1_table[0] != 0;
5345
    if (qemu_in_coroutine()) {
5346
        qemu_co_mutex_unlock(&s->lock);
5347
    }
5348

5349
    if (!preallocated) {
5350
        return 1;
5351
    } else if (bs->encrypted) {
5352
        return 0;
5353
    } else {
5354
        return bdrv_has_zero_init(s->data_file->bs);
5355
    }
5356
}
5357

5358
/*
5359
 * Check the request to vmstate. On success return
5360
 *      qcow2_vm_state_offset(bs) + @pos
5361
 */
5362
static int64_t qcow2_check_vmstate_request(BlockDriverState *bs,
5363
                                           QEMUIOVector *qiov, int64_t pos)
5364
{
5365
    BDRVQcow2State *s = bs->opaque;
5366
    int64_t vmstate_offset = qcow2_vm_state_offset(s);
5367
    int ret;
5368

5369
    /* Incoming requests must be OK */
5370
    bdrv_check_qiov_request(pos, qiov->size, qiov, 0, &error_abort);
5371

5372
    if (INT64_MAX - pos < vmstate_offset) {
5373
        return -EIO;
5374
    }
5375

5376
    pos += vmstate_offset;
5377
    ret = bdrv_check_qiov_request(pos, qiov->size, qiov, 0, NULL);
5378
    if (ret < 0) {
5379
        return ret;
5380
    }
5381

5382
    return pos;
5383
}
5384

5385
static int coroutine_fn GRAPH_RDLOCK
5386
qcow2_co_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
5387
{
5388
    int64_t offset = qcow2_check_vmstate_request(bs, qiov, pos);
5389
    if (offset < 0) {
5390
        return offset;
5391
    }
5392

5393
    BLKDBG_CO_EVENT(bs->file, BLKDBG_VMSTATE_SAVE);
5394
    return bs->drv->bdrv_co_pwritev_part(bs, offset, qiov->size, qiov, 0, 0);
5395
}
5396

5397
static int coroutine_fn GRAPH_RDLOCK
5398
qcow2_co_load_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
5399
{
5400
    int64_t offset = qcow2_check_vmstate_request(bs, qiov, pos);
5401
    if (offset < 0) {
5402
        return offset;
5403
    }
5404

5405
    BLKDBG_CO_EVENT(bs->file, BLKDBG_VMSTATE_LOAD);
5406
    return bs->drv->bdrv_co_preadv_part(bs, offset, qiov->size, qiov, 0, 0);
5407
}
5408

5409
/*
 * Walk the whole image looking for compressed clusters.
 *
 * Returns 1 as soon as qcow2_get_host_offset() reports a compressed
 * subcluster type, 0 if the end of the image is reached without finding
 * one, or a negative value if the image length or a lookup fails.
 */
static int GRAPH_RDLOCK qcow2_has_compressed_clusters(BlockDriverState *bs)
{
    int64_t remaining = bdrv_getlength(bs);
    int64_t guest_offset = 0;

    if (remaining < 0) {
        return remaining;
    }

    while (remaining != 0) {
        /* One lookup covers at most INT_MAX bytes */
        unsigned int chunk = MIN(INT_MAX, remaining);
        QCow2SubclusterType sc_type;
        uint64_t host_offset;
        int ret = qcow2_get_host_offset(bs, guest_offset, &chunk,
                                        &host_offset, &sc_type);

        if (ret < 0) {
            return ret;
        }
        if (sc_type == QCOW2_SUBCLUSTER_COMPRESSED) {
            return 1;
        }

        /* Advance by the number of bytes the lookup reported back */
        guest_offset += chunk;
        remaining -= chunk;
    }

    return 0;
}
5440

5441
/*
5442
 * Downgrades an image's version. To achieve this, any incompatible features
5443
 * have to be removed.
5444
 */
5445
static int GRAPH_RDLOCK
5446
qcow2_downgrade(BlockDriverState *bs, int target_version,
5447
                BlockDriverAmendStatusCB *status_cb, void *cb_opaque,
5448
                Error **errp)
5449
{
5450
    BDRVQcow2State *s = bs->opaque;
5451
    int current_version = s->qcow_version;
5452
    int ret;
5453
    int i;
5454

5455
    /* This is qcow2_downgrade(), not qcow2_upgrade() */
5456
    assert(target_version < current_version);
5457

5458
    /* There are no other versions (now) that you can downgrade to */
5459
    assert(target_version == 2);
5460

5461
    if (s->refcount_order != 4) {
5462
        error_setg(errp, "compat=0.10 requires refcount_bits=16");
5463
        return -ENOTSUP;
5464
    }
5465

5466
    if (has_data_file(bs)) {
5467
        error_setg(errp, "Cannot downgrade an image with a data file");
5468
        return -ENOTSUP;
5469
    }
5470

5471
    /*
5472
     * If any internal snapshot has a different size than the current
5473
     * image size, or VM state size that exceeds 32 bits, downgrading
5474
     * is unsafe.  Even though we would still use v3-compliant output
5475
     * to preserve that data, other v2 programs might not realize
5476
     * those optional fields are important.
5477
     */
5478
    for (i = 0; i < s->nb_snapshots; i++) {
5479
        if (s->snapshots[i].vm_state_size > UINT32_MAX ||
5480
            s->snapshots[i].disk_size != bs->total_sectors * BDRV_SECTOR_SIZE) {
5481
            error_setg(errp, "Internal snapshots prevent downgrade of image");
5482
            return -ENOTSUP;
5483
        }
5484
    }
5485

5486
    /* clear incompatible features */
5487
    if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) {
5488
        ret = qcow2_mark_clean(bs);
5489
        if (ret < 0) {
5490
            error_setg_errno(errp, -ret, "Failed to make the image clean");
5491
            return ret;
5492
        }
5493
    }
5494

5495
    /* with QCOW2_INCOMPAT_CORRUPT, it is pretty much impossible to get here in
5496
     * the first place; if that happens nonetheless, returning -ENOTSUP is the
5497
     * best thing to do anyway */
5498

5499
    if (s->incompatible_features & ~QCOW2_INCOMPAT_COMPRESSION) {
5500
        error_setg(errp, "Cannot downgrade an image with incompatible features "
5501
                   "0x%" PRIx64 " set",
5502
                   s->incompatible_features & ~QCOW2_INCOMPAT_COMPRESSION);
5503
        return -ENOTSUP;
5504
    }
5505

5506
    /* since we can ignore compatible features, we can set them to 0 as well */
5507
    s->compatible_features = 0;
5508
    /* if lazy refcounts have been used, they have already been fixed through
5509
     * clearing the dirty flag */
5510

5511
    /* clearing autoclear features is trivial */
5512
    s->autoclear_features = 0;
5513

5514
    ret = qcow2_expand_zero_clusters(bs, status_cb, cb_opaque);
5515
    if (ret < 0) {
5516
        error_setg_errno(errp, -ret, "Failed to turn zero into data clusters");
5517
        return ret;
5518
    }
5519

5520
    if (s->incompatible_features & QCOW2_INCOMPAT_COMPRESSION) {
5521
        ret = qcow2_has_compressed_clusters(bs);
5522
        if (ret < 0) {
5523
            error_setg(errp, "Failed to check block status");
5524
            return -EINVAL;
5525
        }
5526
        if (ret) {
5527
            error_setg(errp, "Cannot downgrade an image with zstd compression "
5528
                       "type and existing compressed clusters");
5529
            return -ENOTSUP;
5530
        }
5531
        /*
5532
         * No compressed clusters for now, so just chose default zlib
5533
         * compression.
5534
         */
5535
        s->incompatible_features &= ~QCOW2_INCOMPAT_COMPRESSION;
5536
        s->compression_type = QCOW2_COMPRESSION_TYPE_ZLIB;
5537
    }
5538

5539
    assert(s->incompatible_features == 0);
5540

5541
    s->qcow_version = target_version;
5542
    ret = qcow2_update_header(bs);
5543
    if (ret < 0) {
5544
        s->qcow_version = current_version;
5545
        error_setg_errno(errp, -ret, "Failed to update the image header");
5546
        return ret;
5547
    }
5548
    return 0;
5549
}
5550

5551
/*
5552
 * Upgrades an image's version.  While newer versions encompass all
5553
 * features of older versions, some things may have to be presented
5554
 * differently.
5555
 */
5556
static int GRAPH_RDLOCK
5557
qcow2_upgrade(BlockDriverState *bs, int target_version,
5558
              BlockDriverAmendStatusCB *status_cb, void *cb_opaque,
5559
              Error **errp)
5560
{
5561
    BDRVQcow2State *s = bs->opaque;
5562
    bool need_snapshot_update;
5563
    int current_version = s->qcow_version;
5564
    int i;
5565
    int ret;
5566

5567
    /* This is qcow2_upgrade(), not qcow2_downgrade() */
5568
    assert(target_version > current_version);
5569

5570
    /* There are no other versions (yet) that you can upgrade to */
5571
    assert(target_version == 3);
5572

5573
    status_cb(bs, 0, 2, cb_opaque);
5574

5575
    /*
5576
     * In v2, snapshots do not need to have extra data.  v3 requires
5577
     * the 64-bit VM state size and the virtual disk size to be
5578
     * present.
5579
     * qcow2_write_snapshots() will always write the list in the
5580
     * v3-compliant format.
5581
     */
5582
    need_snapshot_update = false;
5583
    for (i = 0; i < s->nb_snapshots; i++) {
5584
        if (s->snapshots[i].extra_data_size <
5585
            sizeof_field(QCowSnapshotExtraData, vm_state_size_large) +
5586
            sizeof_field(QCowSnapshotExtraData, disk_size))
5587
        {
5588
            need_snapshot_update = true;
5589
            break;
5590
        }
5591
    }
5592
    if (need_snapshot_update) {
5593
        ret = qcow2_write_snapshots(bs);
5594
        if (ret < 0) {
5595
            error_setg_errno(errp, -ret, "Failed to update the snapshot table");
5596
            return ret;
5597
        }
5598
    }
5599
    status_cb(bs, 1, 2, cb_opaque);
5600

5601
    s->qcow_version = target_version;
5602
    ret = qcow2_update_header(bs);
5603
    if (ret < 0) {
5604
        s->qcow_version = current_version;
5605
        error_setg_errno(errp, -ret, "Failed to update the image header");
5606
        return ret;
5607
    }
5608
    status_cb(bs, 2, 2, cb_opaque);
5609

5610
    return 0;
5611
}
5612

5613
/* Identifies the amend step that is currently reporting progress. */
typedef enum Qcow2AmendOperation {
    /*
     * Must stay 0: Qcow2AmendHelperCBInfo::last_operation starts out as
     * this value, which lets the helper CB tell its very first invocation
     * apart from a change of operation.
     */
    QCOW2_NO_OPERATION = 0,

    QCOW2_UPGRADING,
    QCOW2_UPDATING_ENCRYPTION,
    QCOW2_CHANGING_REFCOUNT_ORDER,
    QCOW2_DOWNGRADING,
} Qcow2AmendOperation;
5624

5625
typedef struct Qcow2AmendHelperCBInfo {
5626
    /* The code coordinating the amend operations should only modify
5627
     * these four fields; the rest will be managed by the CB */
5628
    BlockDriverAmendStatusCB *original_status_cb;
5629
    void *original_cb_opaque;
5630

5631
    Qcow2AmendOperation current_operation;
5632

5633
    /* Total number of operations to perform (only set once) */
5634
    int total_operations;
5635

5636
    /* The following fields are managed by the CB */
5637

5638
    /* Number of operations completed */
5639
    int operations_completed;
5640

5641
    /* Cumulative offset of all completed operations */
5642
    int64_t offset_completed;
5643

5644
    Qcow2AmendOperation last_operation;
5645
    int64_t last_work_size;
5646
} Qcow2AmendHelperCBInfo;
5647

5648
/*
 * Status callback wrapper used while amending: converts the progress of a
 * single operation into overall progress and forwards it to the original
 * callback stored in @opaque (a Qcow2AmendHelperCBInfo).
 */
static void qcow2_amend_helper_cb(BlockDriverState *bs,
                                  int64_t operation_offset,
                                  int64_t operation_work_size, void *opaque)
{
    Qcow2AmendHelperCBInfo *info = opaque;
    int64_t work_so_far;
    int64_t work_projected;
    int ops_covered;
    int ops_pending;

    if (info->current_operation != info->last_operation) {
        /* A new operation started; book the previous one as finished */
        if (info->last_operation != QCOW2_NO_OPERATION) {
            info->operations_completed++;
            info->offset_completed += info->last_work_size;
        }

        info->last_operation = info->current_operation;
    }

    assert(info->total_operations > 0);
    assert(info->operations_completed < info->total_operations);

    info->last_work_size = operation_work_size;

    work_so_far = info->offset_completed + operation_work_size;

    /*
     * work_so_far is the total work size for (operations_completed + 1)
     * operations (which includes this one), so multiply it by the number of
     * operations not covered and divide it by the number of operations
     * covered to get a projection for the operations not covered.
     */
    ops_covered = info->operations_completed + 1;
    ops_pending = info->total_operations - info->operations_completed - 1;
    work_projected = work_so_far * ops_pending / ops_covered;

    info->original_status_cb(bs, info->offset_completed + operation_offset,
                             work_so_far + work_projected,
                             info->original_cb_opaque);
}
5684

5685
static int GRAPH_RDLOCK
5686
qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts,
5687
                    BlockDriverAmendStatusCB *status_cb, void *cb_opaque,
5688
                    bool force, Error **errp)
5689
{
5690
    BDRVQcow2State *s = bs->opaque;
5691
    int old_version = s->qcow_version, new_version = old_version;
5692
    uint64_t new_size = 0;
5693
    const char *backing_file = NULL, *backing_format = NULL, *data_file = NULL;
5694
    bool lazy_refcounts = s->use_lazy_refcounts;
5695
    bool data_file_raw = data_file_is_raw(bs);
5696
    const char *compat = NULL;
5697
    int refcount_bits = s->refcount_bits;
5698
    int ret;
5699
    QemuOptDesc *desc = opts->list->desc;
5700
    Qcow2AmendHelperCBInfo helper_cb_info;
5701
    bool encryption_update = false;
5702

5703
    while (desc && desc->name) {
5704
        if (!qemu_opt_find(opts, desc->name)) {
5705
            /* only change explicitly defined options */
5706
            desc++;
5707
            continue;
5708
        }
5709

5710
        if (!strcmp(desc->name, BLOCK_OPT_COMPAT_LEVEL)) {
5711
            compat = qemu_opt_get(opts, BLOCK_OPT_COMPAT_LEVEL);
5712
            if (!compat) {
5713
                /* preserve default */
5714
            } else if (!strcmp(compat, "0.10") || !strcmp(compat, "v2")) {
5715
                new_version = 2;
5716
            } else if (!strcmp(compat, "1.1") || !strcmp(compat, "v3")) {
5717
                new_version = 3;
5718
            } else {
5719
                error_setg(errp, "Unknown compatibility level %s", compat);
5720
                return -EINVAL;
5721
            }
5722
        } else if (!strcmp(desc->name, BLOCK_OPT_SIZE)) {
5723
            new_size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0);
5724
        } else if (!strcmp(desc->name, BLOCK_OPT_BACKING_FILE)) {
5725
            backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE);
5726
        } else if (!strcmp(desc->name, BLOCK_OPT_BACKING_FMT)) {
5727
            backing_format = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT);
5728
        } else if (g_str_has_prefix(desc->name, "encrypt.")) {
5729
            if (!s->crypto) {
5730
                error_setg(errp,
5731
                           "Can't amend encryption options - encryption not present");
5732
                return -EINVAL;
5733
            }
5734
            if (s->crypt_method_header != QCOW_CRYPT_LUKS) {
5735
                error_setg(errp,
5736
                           "Only LUKS encryption options can be amended");
5737
                return -ENOTSUP;
5738
            }
5739
            encryption_update = true;
5740
        } else if (!strcmp(desc->name, BLOCK_OPT_LAZY_REFCOUNTS)) {
5741
            lazy_refcounts = qemu_opt_get_bool(opts, BLOCK_OPT_LAZY_REFCOUNTS,
5742
                                               lazy_refcounts);
5743
        } else if (!strcmp(desc->name, BLOCK_OPT_REFCOUNT_BITS)) {
5744
            refcount_bits = qemu_opt_get_number(opts, BLOCK_OPT_REFCOUNT_BITS,
5745
                                                refcount_bits);
5746

5747
            if (refcount_bits <= 0 || refcount_bits > 64 ||
5748
                !is_power_of_2(refcount_bits))
5749
            {
5750
                error_setg(errp, "Refcount width must be a power of two and "
5751
                           "may not exceed 64 bits");
5752
                return -EINVAL;
5753
            }
5754
        } else if (!strcmp(desc->name, BLOCK_OPT_DATA_FILE)) {
5755
            data_file = qemu_opt_get(opts, BLOCK_OPT_DATA_FILE);
5756
            if (data_file && !has_data_file(bs)) {
5757
                error_setg(errp, "data-file can only be set for images that "
5758
                                 "use an external data file");
5759
                return -EINVAL;
5760
            }
5761
        } else if (!strcmp(desc->name, BLOCK_OPT_DATA_FILE_RAW)) {
5762
            data_file_raw = qemu_opt_get_bool(opts, BLOCK_OPT_DATA_FILE_RAW,
5763
                                              data_file_raw);
5764
            if (data_file_raw && !data_file_is_raw(bs)) {
5765
                error_setg(errp, "data-file-raw cannot be set on existing "
5766
                                 "images");
5767
                return -EINVAL;
5768
            }
5769
        } else {
5770
            /* if this point is reached, this probably means a new option was
5771
             * added without having it covered here */
5772
            abort();
5773
        }
5774

5775
        desc++;
5776
    }
5777

5778
    helper_cb_info = (Qcow2AmendHelperCBInfo){
5779
        .original_status_cb = status_cb,
5780
        .original_cb_opaque = cb_opaque,
5781
        .total_operations = (new_version != old_version)
5782
                          + (s->refcount_bits != refcount_bits) +
5783
                            (encryption_update == true)
5784
    };
5785

5786
    /* Upgrade first (some features may require compat=1.1) */
5787
    if (new_version > old_version) {
5788
        helper_cb_info.current_operation = QCOW2_UPGRADING;
5789
        ret = qcow2_upgrade(bs, new_version, &qcow2_amend_helper_cb,
5790
                            &helper_cb_info, errp);
5791
        if (ret < 0) {
5792
            return ret;
5793
        }
5794
    }
5795

5796
    if (encryption_update) {
5797
        QDict *amend_opts_dict;
5798
        QCryptoBlockAmendOptions *amend_opts;
5799

5800
        helper_cb_info.current_operation = QCOW2_UPDATING_ENCRYPTION;
5801
        amend_opts_dict = qcow2_extract_crypto_opts(opts, "luks", errp);
5802
        if (!amend_opts_dict) {
5803
            return -EINVAL;
5804
        }
5805
        amend_opts = block_crypto_amend_opts_init(amend_opts_dict, errp);
5806
        qobject_unref(amend_opts_dict);
5807
        if (!amend_opts) {
5808
            return -EINVAL;
5809
        }
5810
        ret = qcrypto_block_amend_options(s->crypto,
5811
                                          qcow2_crypto_hdr_read_func,
5812
                                          qcow2_crypto_hdr_write_func,
5813
                                          bs,
5814
                                          amend_opts,
5815
                                          force,
5816
                                          errp);
5817
        qapi_free_QCryptoBlockAmendOptions(amend_opts);
5818
        if (ret < 0) {
5819
            return ret;
5820
        }
5821
    }
5822

5823
    if (s->refcount_bits != refcount_bits) {
5824
        int refcount_order = ctz32(refcount_bits);
5825

5826
        if (new_version < 3 && refcount_bits != 16) {
5827
            error_setg(errp, "Refcount widths other than 16 bits require "
5828
                       "compatibility level 1.1 or above (use compat=1.1 or "
5829
                       "greater)");
5830
            return -EINVAL;
5831
        }
5832

5833
        helper_cb_info.current_operation = QCOW2_CHANGING_REFCOUNT_ORDER;
5834
        ret = qcow2_change_refcount_order(bs, refcount_order,
5835
                                          &qcow2_amend_helper_cb,
5836
                                          &helper_cb_info, errp);
5837
        if (ret < 0) {
5838
            return ret;
5839
        }
5840
    }
5841

5842
    /* data-file-raw blocks backing files, so clear it first if requested */
5843
    if (data_file_raw) {
5844
        s->autoclear_features |= QCOW2_AUTOCLEAR_DATA_FILE_RAW;
5845
    } else {
5846
        s->autoclear_features &= ~QCOW2_AUTOCLEAR_DATA_FILE_RAW;
5847
    }
5848

5849
    if (data_file) {
5850
        g_free(s->image_data_file);
5851
        s->image_data_file = *data_file ? g_strdup(data_file) : NULL;
5852
    }
5853

5854
    ret = qcow2_update_header(bs);
5855
    if (ret < 0) {
5856
        error_setg_errno(errp, -ret, "Failed to update the image header");
5857
        return ret;
5858
    }
5859

5860
    if (backing_file || backing_format) {
5861
        if (g_strcmp0(backing_file, s->image_backing_file) ||
5862
            g_strcmp0(backing_format, s->image_backing_format)) {
5863
            error_setg(errp, "Cannot amend the backing file");
5864
            error_append_hint(errp,
5865
                              "You can use 'qemu-img rebase' instead.\n");
5866
            return -EINVAL;
5867
        }
5868
    }
5869

5870
    if (s->use_lazy_refcounts != lazy_refcounts) {
5871
        if (lazy_refcounts) {
5872
            if (new_version < 3) {
5873
                error_setg(errp, "Lazy refcounts only supported with "
5874
                           "compatibility level 1.1 and above (use compat=1.1 "
5875
                           "or greater)");
5876
                return -EINVAL;
5877
            }
5878
            s->compatible_features |= QCOW2_COMPAT_LAZY_REFCOUNTS;
5879
            ret = qcow2_update_header(bs);
5880
            if (ret < 0) {
5881
                s->compatible_features &= ~QCOW2_COMPAT_LAZY_REFCOUNTS;
5882
                error_setg_errno(errp, -ret, "Failed to update the image header");
5883
                return ret;
5884
            }
5885
            s->use_lazy_refcounts = true;
5886
        } else {
5887
            /* make image clean first */
5888
            ret = qcow2_mark_clean(bs);
5889
            if (ret < 0) {
5890
                error_setg_errno(errp, -ret, "Failed to make the image clean");
5891
                return ret;
5892
            }
5893
            /* now disallow lazy refcounts */
5894
            s->compatible_features &= ~QCOW2_COMPAT_LAZY_REFCOUNTS;
5895
            ret = qcow2_update_header(bs);
5896
            if (ret < 0) {
5897
                s->compatible_features |= QCOW2_COMPAT_LAZY_REFCOUNTS;
5898
                error_setg_errno(errp, -ret, "Failed to update the image header");
5899
                return ret;
5900
            }
5901
            s->use_lazy_refcounts = false;
5902
        }
5903
    }
5904

5905
    if (new_size) {
5906
        BlockBackend *blk = blk_new_with_bs(bs, BLK_PERM_RESIZE, BLK_PERM_ALL,
5907
                                            errp);
5908
        if (!blk) {
5909
            return -EPERM;
5910
        }
5911

5912
        /*
5913
         * Amending image options should ensure that the image has
5914
         * exactly the given new values, so pass exact=true here.
5915
         */
5916
        ret = blk_truncate(blk, new_size, true, PREALLOC_MODE_OFF, 0, errp);
5917
        blk_unref(blk);
5918
        if (ret < 0) {
5919
            return ret;
5920
        }
5921
    }
5922

5923
    /* Downgrade last (so unsupported features can be removed before) */
5924
    if (new_version < old_version) {
5925
        helper_cb_info.current_operation = QCOW2_DOWNGRADING;
5926
        ret = qcow2_downgrade(bs, new_version, &qcow2_amend_helper_cb,
5927
                              &helper_cb_info, errp);
5928
        if (ret < 0) {
5929
            return ret;
5930
        }
5931
    }
5932

5933
    return 0;
5934
}
5935

5936
/*
 * blockdev-amend handler for qcow2.
 *
 * Only the "encrypt" member of the qcow2 amend options is acted upon here.
 * When it is absent, nothing is done and 0 is returned.  When it is present,
 * the image must have an active crypto context (s->crypto), the requested
 * format must be LUKS and the image's crypt method must be LUKS as well;
 * otherwise @errp is set and -EOPNOTSUPP is returned.  A valid request is
 * handed over to qcrypto_block_amend_options(), whose result is returned.
 */
static int coroutine_fn qcow2_co_amend(BlockDriverState *bs,
                                       BlockdevAmendOptions *opts,
                                       bool force,
                                       Error **errp)
{
    BDRVQcow2State *s = bs->opaque;
    BlockdevAmendOptionsQcow2 *qopts = &opts->u.qcow2;

    if (!qopts->encrypt) {
        /* No encryption options supplied: nothing to amend */
        return 0;
    }

    if (!s->crypto) {
        error_setg(errp, "image is not encrypted, can't amend");
        return -EOPNOTSUPP;
    }

    if (qopts->encrypt->format != Q_CRYPTO_BLOCK_FORMAT_LUKS) {
        error_setg(errp,
                   "Amend can't be used to change the qcow2 encryption format");
        return -EOPNOTSUPP;
    }

    if (s->crypt_method_header != QCOW_CRYPT_LUKS) {
        error_setg(errp,
                   "Only LUKS encryption options can be amended for qcow2 with blockdev-amend");
        return -EOPNOTSUPP;
    }

    return qcrypto_block_amend_options(s->crypto,
                                       qcow2_crypto_hdr_read_func,
                                       qcow2_crypto_hdr_write_func,
                                       bs,
                                       qopts->encrypt,
                                       force,
                                       errp);
}
5973

5974
/*
5975
 * If offset or size are negative, respectively, they will not be included in
5976
 * the BLOCK_IMAGE_CORRUPTED event emitted.
5977
 * fatal will be ignored for read-only BDS; corruptions found there will always
5978
 * be considered non-fatal.
5979
 */
5980
void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset,
5981
                             int64_t size, const char *message_format, ...)
5982
{
5983
    BDRVQcow2State *s = bs->opaque;
5984
    const char *node_name;
5985
    char *message;
5986
    va_list ap;
5987

5988
    fatal = fatal && bdrv_is_writable(bs);
5989

5990
    if (s->signaled_corruption &&
5991
        (!fatal || (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT)))
5992
    {
5993
        return;
5994
    }
5995

5996
    va_start(ap, message_format);
5997
    message = g_strdup_vprintf(message_format, ap);
5998
    va_end(ap);
5999

6000
    if (fatal) {
6001
        fprintf(stderr, "qcow2: Marking image as corrupt: %s; further "
6002
                "corruption events will be suppressed\n", message);
6003
    } else {
6004
        fprintf(stderr, "qcow2: Image is corrupt: %s; further non-fatal "
6005
                "corruption events will be suppressed\n", message);
6006
    }
6007

6008
    node_name = bdrv_get_node_name(bs);
6009
    qapi_event_send_block_image_corrupted(bdrv_get_device_name(bs),
6010
                                          *node_name ? node_name : NULL,
6011
                                          message, offset >= 0, offset,
6012
                                          size >= 0, size,
6013
                                          fatal);
6014
    g_free(message);
6015

6016
    if (fatal) {
6017
        qcow2_mark_corrupt(bs);
6018
        bs->drv = NULL; /* make BDS unusable */
6019
    }
6020

6021
    s->signaled_corruption = true;
6022
}
6023

6024
/*
 * Option descriptors used by both qcow2_create_opts and qcow2_amend_opts.
 * Expands to a comma-separated list of initializers with no trailing comma,
 * so the user of the macro supplies the separator that follows it.
 */
#define QCOW_COMMON_OPTIONS \
    { \
        .name = BLOCK_OPT_SIZE, \
        .type = QEMU_OPT_SIZE, \
        .help = "Virtual disk size" \
    }, \
    { \
        .name = BLOCK_OPT_COMPAT_LEVEL, \
        .type = QEMU_OPT_STRING, \
        .help = "Compatibility level (v2 [0.10] or v3 [1.1])" \
    }, \
    { \
        .name = BLOCK_OPT_BACKING_FILE, \
        .type = QEMU_OPT_STRING, \
        .help = "File name of a base image" \
    }, \
    { \
        .name = BLOCK_OPT_BACKING_FMT, \
        .type = QEMU_OPT_STRING, \
        .help = "Image format of the base image" \
    }, \
    { \
        .name = BLOCK_OPT_DATA_FILE, \
        .type = QEMU_OPT_STRING, \
        .help = "File name of an external data file" \
    }, \
    { \
        .name = BLOCK_OPT_DATA_FILE_RAW, \
        .type = QEMU_OPT_BOOL, \
        .help = "The external data file must stay valid " \
                "as a raw image" \
    }, \
    { \
        .name = BLOCK_OPT_LAZY_REFCOUNTS, \
        .type = QEMU_OPT_BOOL, \
        .help = "Postpone refcount updates", \
        .def_value_str = "off" \
    }, \
    { \
        .name = BLOCK_OPT_REFCOUNT_BITS, \
        .type = QEMU_OPT_NUMBER, \
        .help = "Width of a reference count entry in bits", \
        .def_value_str = "16" \
    }
6068

6069
static QemuOptsList qcow2_create_opts = {
6070
    .name = "qcow2-create-opts",
6071
    .head = QTAILQ_HEAD_INITIALIZER(qcow2_create_opts.head),
6072
    .desc = {
6073
        {                                                               \
6074
            .name = BLOCK_OPT_ENCRYPT,                                  \
6075
            .type = QEMU_OPT_BOOL,                                      \
6076
            .help = "Encrypt the image with format 'aes'. (Deprecated " \
6077
                    "in favor of " BLOCK_OPT_ENCRYPT_FORMAT "=aes)",    \
6078
        },                                                              \
6079
        {                                                               \
6080
            .name = BLOCK_OPT_ENCRYPT_FORMAT,                           \
6081
            .type = QEMU_OPT_STRING,                                    \
6082
            .help = "Encrypt the image, format choices: 'aes', 'luks'", \
6083
        },                                                              \
6084
        BLOCK_CRYPTO_OPT_DEF_KEY_SECRET("encrypt.",                     \
6085
            "ID of secret providing qcow AES key or LUKS passphrase"),  \
6086
        BLOCK_CRYPTO_OPT_DEF_LUKS_CIPHER_ALG("encrypt."),               \
6087
        BLOCK_CRYPTO_OPT_DEF_LUKS_CIPHER_MODE("encrypt."),              \
6088
        BLOCK_CRYPTO_OPT_DEF_LUKS_IVGEN_ALG("encrypt."),                \
6089
        BLOCK_CRYPTO_OPT_DEF_LUKS_IVGEN_HASH_ALG("encrypt."),           \
6090
        BLOCK_CRYPTO_OPT_DEF_LUKS_HASH_ALG("encrypt."),                 \
6091
        BLOCK_CRYPTO_OPT_DEF_LUKS_ITER_TIME("encrypt."),                \
6092
        {                                                               \
6093
            .name = BLOCK_OPT_CLUSTER_SIZE,                             \
6094
            .type = QEMU_OPT_SIZE,                                      \
6095
            .help = "qcow2 cluster size",                               \
6096
            .def_value_str = stringify(DEFAULT_CLUSTER_SIZE)            \
6097
        },                                                              \
6098
        {                                                               \
6099
            .name = BLOCK_OPT_EXTL2,                                    \
6100
            .type = QEMU_OPT_BOOL,                                      \
6101
            .help = "Extended L2 tables",                               \
6102
            .def_value_str = "off"                                      \
6103
        },                                                              \
6104
        {                                                               \
6105
            .name = BLOCK_OPT_PREALLOC,                                 \
6106
            .type = QEMU_OPT_STRING,                                    \
6107
            .help = "Preallocation mode (allowed values: off, "         \
6108
                    "metadata, falloc, full)"                           \
6109
        },                                                              \
6110
        {                                                               \
6111
            .name = BLOCK_OPT_COMPRESSION_TYPE,                         \
6112
            .type = QEMU_OPT_STRING,                                    \
6113
            .help = "Compression method used for image cluster "        \
6114
                    "compression",                                      \
6115
            .def_value_str = "zlib"                                     \
6116
        },
6117
        QCOW_COMMON_OPTIONS,
6118
        { /* end of list */ }
6119
    }
6120
};
6121

6122
static QemuOptsList qcow2_amend_opts = {
6123
    .name = "qcow2-amend-opts",
6124
    .head = QTAILQ_HEAD_INITIALIZER(qcow2_amend_opts.head),
6125
    .desc = {
6126
        BLOCK_CRYPTO_OPT_DEF_LUKS_STATE("encrypt."),
6127
        BLOCK_CRYPTO_OPT_DEF_LUKS_KEYSLOT("encrypt."),
6128
        BLOCK_CRYPTO_OPT_DEF_LUKS_OLD_SECRET("encrypt."),
6129
        BLOCK_CRYPTO_OPT_DEF_LUKS_NEW_SECRET("encrypt."),
6130
        BLOCK_CRYPTO_OPT_DEF_LUKS_ITER_TIME("encrypt."),
6131
        QCOW_COMMON_OPTIONS,
6132
        { /* end of list */ }
6133
    }
6134
};
6135

6136
static const char *const qcow2_strong_runtime_opts[] = {
6137
    "encrypt." BLOCK_CRYPTO_OPT_QCOW_KEY_SECRET,
6138

6139
    NULL
6140
};
6141

6142
BlockDriver bdrv_qcow2 = {
6143
    .format_name                        = "qcow2",
6144
    .instance_size                      = sizeof(BDRVQcow2State),
6145
    .bdrv_probe                         = qcow2_probe,
6146
    .bdrv_open                          = qcow2_open,
6147
    .bdrv_close                         = qcow2_close,
6148
    .bdrv_reopen_prepare                = qcow2_reopen_prepare,
6149
    .bdrv_reopen_commit                 = qcow2_reopen_commit,
6150
    .bdrv_reopen_commit_post            = qcow2_reopen_commit_post,
6151
    .bdrv_reopen_abort                  = qcow2_reopen_abort,
6152
    .bdrv_join_options                  = qcow2_join_options,
6153
    .bdrv_child_perm                    = bdrv_default_perms,
6154
    .bdrv_co_create_opts                = qcow2_co_create_opts,
6155
    .bdrv_co_create                     = qcow2_co_create,
6156
    .bdrv_has_zero_init                 = qcow2_has_zero_init,
6157
    .bdrv_co_block_status               = qcow2_co_block_status,
6158

6159
    .bdrv_co_preadv_part                = qcow2_co_preadv_part,
6160
    .bdrv_co_pwritev_part               = qcow2_co_pwritev_part,
6161
    .bdrv_co_flush_to_os                = qcow2_co_flush_to_os,
6162

6163
    .bdrv_co_pwrite_zeroes              = qcow2_co_pwrite_zeroes,
6164
    .bdrv_co_pdiscard                   = qcow2_co_pdiscard,
6165
    .bdrv_co_copy_range_from            = qcow2_co_copy_range_from,
6166
    .bdrv_co_copy_range_to              = qcow2_co_copy_range_to,
6167
    .bdrv_co_truncate                   = qcow2_co_truncate,
6168
    .bdrv_co_pwritev_compressed_part    = qcow2_co_pwritev_compressed_part,
6169
    .bdrv_make_empty                    = qcow2_make_empty,
6170

6171
    .bdrv_snapshot_create               = qcow2_snapshot_create,
6172
    .bdrv_snapshot_goto                 = qcow2_snapshot_goto,
6173
    .bdrv_snapshot_delete               = qcow2_snapshot_delete,
6174
    .bdrv_snapshot_list                 = qcow2_snapshot_list,
6175
    .bdrv_snapshot_load_tmp             = qcow2_snapshot_load_tmp,
6176
    .bdrv_measure                       = qcow2_measure,
6177
    .bdrv_co_get_info                   = qcow2_co_get_info,
6178
    .bdrv_get_specific_info             = qcow2_get_specific_info,
6179

6180
    .bdrv_co_save_vmstate               = qcow2_co_save_vmstate,
6181
    .bdrv_co_load_vmstate               = qcow2_co_load_vmstate,
6182

6183
    .is_format                          = true,
6184
    .supports_backing                   = true,
6185
    .bdrv_co_change_backing_file        = qcow2_co_change_backing_file,
6186

6187
    .bdrv_refresh_limits                = qcow2_refresh_limits,
6188
    .bdrv_co_invalidate_cache           = qcow2_co_invalidate_cache,
6189
    .bdrv_inactivate                    = qcow2_inactivate,
6190

6191
    .create_opts                        = &qcow2_create_opts,
6192
    .amend_opts                         = &qcow2_amend_opts,
6193
    .strong_runtime_opts                = qcow2_strong_runtime_opts,
6194
    .mutable_opts                       = mutable_opts,
6195
    .bdrv_co_check                      = qcow2_co_check,
6196
    .bdrv_amend_options                 = qcow2_amend_options,
6197
    .bdrv_co_amend                      = qcow2_co_amend,
6198

6199
    .bdrv_detach_aio_context            = qcow2_detach_aio_context,
6200
    .bdrv_attach_aio_context            = qcow2_attach_aio_context,
6201

6202
    .bdrv_supports_persistent_dirty_bitmap =
6203
            qcow2_supports_persistent_dirty_bitmap,
6204
    .bdrv_co_can_store_new_dirty_bitmap = qcow2_co_can_store_new_dirty_bitmap,
6205
    .bdrv_co_remove_persistent_dirty_bitmap =
6206
            qcow2_co_remove_persistent_dirty_bitmap,
6207
};
6208

6209
static void bdrv_qcow2_init(void)
6210
{
6211
    bdrv_register(&bdrv_qcow2);
6212
}
6213

6214
block_init(bdrv_qcow2_init);
6215

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.